From 216220db366980b3b647f0009d6dc0e02e63783b Mon Sep 17 00:00:00 2001 From: AndreyMarchuk Date: Sun, 22 Feb 2026 23:18:18 -0800 Subject: [PATCH 1/2] agent: add cli session shim and remote install/repair workflows --- .env.schema | 8 + CONFIGURATION.md | 28 + README.md | 21 + bin/baudbot | 6 + bin/baudbot.test.sh | 30 + bin/broker-register.test.mjs | 18 +- bin/lib/remote-common.sh | 517 +++++++ bin/lib/remote-common.test.sh | 155 +++ bin/lib/remote-hetzner.sh | 339 +++++ bin/lib/remote-hetzner.test.sh | 242 ++++ bin/lib/remote-ssh.sh | 134 ++ bin/lib/remote-ssh.test.sh | 167 +++ bin/redact-logs.sh | 57 +- bin/remote.sh | 1195 +++++++++++++++++ bin/remote.test.sh | 145 ++ bin/security-audit.sh | 72 +- bin/test.sh | 4 + docs/agents.md | 5 +- docs/architecture.md | 6 +- docs/operations.md | 29 + package.json | 2 +- pi/extensions/cli-session-shim.mjs | 454 +++++++ pi/extensions/cli-session-shim.test.mjs | 396 ++++++ pi/skills/control-agent/HEARTBEAT.md | 2 +- pi/skills/control-agent/SKILL.md | 61 + pi/skills/control-agent/scripts/bb-update.sh | 234 ++++ .../control-agent/scripts/run-cli-agent.sh | 412 ++++++ .../scripts/run-cli-agent.test.sh | 511 +++++++ .../dev-agent-cli/persona.claude-code.tmpl | 61 + pi/skills/dev-agent-cli/persona.codex.tmpl | 41 + test/broker-bridge.integration.test.mjs | 123 +- test/security-audit.test.mjs | 2 +- test/shell-scripts.test.mjs | 20 + vitest.config.mjs | 1 + 34 files changed, 5437 insertions(+), 61 deletions(-) create mode 100644 bin/lib/remote-common.sh create mode 100755 bin/lib/remote-common.test.sh create mode 100644 bin/lib/remote-hetzner.sh create mode 100755 bin/lib/remote-hetzner.test.sh create mode 100644 bin/lib/remote-ssh.sh create mode 100755 bin/lib/remote-ssh.test.sh create mode 100755 bin/remote.sh create mode 100755 bin/remote.test.sh create mode 100644 pi/extensions/cli-session-shim.mjs create mode 100644 pi/extensions/cli-session-shim.test.mjs create mode 100755 pi/skills/control-agent/scripts/bb-update.sh create 
mode 100755 pi/skills/control-agent/scripts/run-cli-agent.sh create mode 100755 pi/skills/control-agent/scripts/run-cli-agent.test.sh create mode 100644 pi/skills/dev-agent-cli/persona.claude-code.tmpl create mode 100644 pi/skills/dev-agent-cli/persona.codex.tmpl diff --git a/.env.schema b/.env.schema index f4b66df..d7d345c 100644 --- a/.env.schema +++ b/.env.schema @@ -54,6 +54,14 @@ SLACK_ALLOWED_USERS= # @sensitive=false @type=number BAUDBOT_EXPERIMENTAL=0 +# ── Dev Agent Backend ──────────────────────────────────────────────────────── + +# Default backend for spawning dev agents. +# Control-agent may override per-task. +# Options: pi, claude-code, codex, auto +# @sensitive=false @type=string +DEV_AGENT_BACKEND=pi + # ── Email Monitor (experimental-only) ─────────────────────────────────────── # AgentMail API key (only used when BAUDBOT_EXPERIMENTAL=1) diff --git a/CONFIGURATION.md b/CONFIGURATION.md index 0bb7920..8f1b603 100644 --- a/CONFIGURATION.md +++ b/CONFIGURATION.md @@ -68,6 +68,12 @@ Email tooling is disabled by default. To enable it, run setup/install in experim ## Optional Variables +### Dev Agent Backend + +| Variable | Description | Default | +|----------|-------------|---------| +| `DEV_AGENT_BACKEND` | Default backend for spawning dev agents (`pi`, `claude-code`, `codex`, `auto`) | `pi` | + ### Sentry Integration | Variable | Description | How to get it | @@ -149,6 +155,28 @@ Set during `setup.sh` / `baudbot install` via env vars: | `GIT_USER_NAME` | Git commit author name | `baudbot-agent` | | `GIT_USER_EMAIL` | Git commit author email | `baudbot-agent@users.noreply.github.com` | +### Remote CLI (operator-local, not runtime) + +These apply only to `baudbot remote ...` when run from your local operator machine. They are not part of agent runtime `.env` and should not be written to `/home/baudbot_agent/.config/.env`. 
+ +| Variable | Description | Default | +|----------|-------------|---------| +| `BAUDBOT_REMOTE_DIR` | Local state directory for remote targets/checkpoints/keys | `~/.baudbot/remote` | +| `HETZNER_API_TOKEN` | Hetzner token fallback for `--hetzner-token` | *(empty)* | +| `TAILSCALE_AUTHKEY` | Tailscale auth key fallback for `--tailscale-auth-key` | *(empty)* | +| `REMOTE_BOOTSTRAP_URL` | Bootstrap script URL used by remote install step | `https://raw.githubusercontent.com/modem-dev/baudbot/main/bootstrap.sh` | +| `REMOTE_TAILSCALE_INSTALL_URL` | Tailscale install script URL used by remote workflow | `https://tailscale.com/install.sh` | +| `REMOTE_TAILSCALE_WAIT_ATTEMPTS` | Tailscale readiness polling attempts after `tailscale up` | `40` | +| `REMOTE_TAILSCALE_WAIT_INTERVAL_SEC` | Delay between Tailscale readiness polls | `3` | +| `REMOTE_CHECKPOINT_MAX_RETRIES` | Retries per install checkpoint before interactive escalation | `3` | +| `REMOTE_HETZNER_SERVER_TYPE` | Hetzner default server type for remote install | `cpx11` | +| `REMOTE_HETZNER_IMAGE` | Hetzner default image for remote install | `ubuntu-24.04` | +| `REMOTE_HETZNER_LOCATION` | Hetzner default location for remote install | `ash` | +| `REMOTE_HETZNER_WAIT_TIMEOUT_SEC` | Timeout while waiting for server running state | `600` | +| `REMOTE_HETZNER_WAIT_INTERVAL_SEC` | Poll interval while waiting for server running state | `5` | +| `REMOTE_SSH_REACHABLE_ATTEMPTS` | SSH readiness attempts per checkpoint | `40` | +| `REMOTE_SSH_REACHABLE_INTERVAL_SEC` | Delay between SSH readiness attempts | `3` | + ### Heartbeat | Variable | Description | Default | diff --git a/README.md b/README.md index 05131fd..72fffa6 100644 --- a/README.md +++ b/README.md @@ -83,6 +83,27 @@ Upgrade later: sudo baudbot update ``` +Remote provisioning/install and repair (operator-run from your local machine): + +```bash +# Provision on Hetzner and install Baudbot +baudbot remote install --mode hetzner --target team-bot + +# Install on an 
existing host +baudbot remote install --mode host --target team-bot --host 203.0.113.10 --ssh-user root + +# Install + connect host to Tailscale +baudbot remote install --mode host --target team-bot --host 203.0.113.10 --tailscale + +# Resume an interrupted run +baudbot remote resume team-bot + +# Guided repair for an existing target +baudbot remote repair --target team-bot +``` + +`baudbot remote` persists checkpoints in `~/.baudbot/remote/targets/*.json`, so interrupted installs can resume from the next incomplete checkpoint. + Install with a specific pi version (optional): ```bash diff --git a/bin/baudbot b/bin/baudbot index 37123e4..d80f5ca 100755 --- a/bin/baudbot +++ b/bin/baudbot @@ -138,6 +138,7 @@ usage() { echo " install Bootstrap install from GitHub (download script, then escalate)" echo " setup One-time system setup (user, deps, firewall, systemd; --experimental enables risky integrations)" echo " config Interactive secrets and config setup" + echo " remote Remote install/repair workflows (Hetzner or existing host)" echo " env Manage env vars and backend source (set/get/sync/backend)" echo " deploy Deploy source + config to agent runtime" echo " broker Slack broker commands (register workspace linkage)" @@ -411,6 +412,11 @@ case "${1:-}" in exec "$BAUDBOT_ROOT/bin/config.sh" "$@" ;; + remote) + shift + exec "$BAUDBOT_ROOT/bin/remote.sh" "$@" + ;; + env) shift exec "$BAUDBOT_ROOT/bin/env.sh" "$@" diff --git a/bin/baudbot.test.sh b/bin/baudbot.test.sh index ba2ba7a..9b5aa9d 100644 --- a/bin/baudbot.test.sh +++ b/bin/baudbot.test.sh @@ -191,6 +191,35 @@ EOF ) } +test_remote_dispatches_to_remote_script() { + ( + set -euo pipefail + local tmp out + tmp="$(mktemp -d /tmp/baudbot-cli-test.XXXXXX)" + trap 'rm -rf "$tmp"' EXIT + + mkdir -p "$tmp/bin/lib" + printf '{"version":"1.2.3"}\n' > "$tmp/package.json" + cat > "$tmp/bin/lib/baudbot-runtime.sh" <<'EOF' +#!/bin/bash +cmd_status() { :; } +cmd_logs() { :; } +cmd_sessions() { :; } +cmd_attach() { :; } 
+has_systemd() { return 0; } +EOF + + cat > "$tmp/bin/remote.sh" <<'EOF' +#!/bin/bash +echo "remote-dispatch-ok:$*" +EOF + chmod +x "$tmp/bin/remote.sh" + + out="$(BAUDBOT_ROOT="$tmp" bash "$CLI" remote list)" + [ "$out" = "remote-dispatch-ok:list" ] + ) +} + echo "=== baudbot cli tests ===" echo "" @@ -199,6 +228,7 @@ run_test "status dispatches via runtime module" test_status_dispatches_via_runti run_test "attach requires root" test_attach_requires_root run_test "broker register requires root" test_broker_register_requires_root run_test "restart kills bridge tmux then restarts systemd" test_restart_restarts_systemd_and_kills_bridge_tmux +run_test "remote command dispatches to remote.sh" test_remote_dispatches_to_remote_script echo "" echo "=== $PASSED/$TOTAL passed, $FAILED failed ===" diff --git a/bin/broker-register.test.mjs b/bin/broker-register.test.mjs index ff8cf5c..4b70f39 100644 --- a/bin/broker-register.test.mjs +++ b/bin/broker-register.test.mjs @@ -189,7 +189,7 @@ test("registerWithBroker sends registration_token when provided", async () => { }); }); -test("runRegistration integration path succeeds against live local HTTP server", async () => { +test("runRegistration integration path succeeds against live local HTTP server", async (t) => { const brokerPubkey = Buffer.alloc(32, 5).toString("base64"); const brokerSigningPubkey = Buffer.alloc(32, 6).toString("base64"); @@ -222,7 +222,21 @@ test("runRegistration integration path succeeds against live local HTTP server", res.end(JSON.stringify({ ok: false, error: "not found" })); }); - await new Promise((resolve) => server.listen(0, "127.0.0.1", resolve)); + try { + await new Promise((resolve, reject) => { + server.once("error", reject); + server.listen(0, "127.0.0.1", resolve); + }); + } catch (error) { + if (error && typeof error === "object" && "code" in error) { + const code = String(error.code || ""); + if (code === "EPERM" || code === "EACCES") { + t.skip("Localhost bind is not permitted in this 
environment"); + return; + } + } + throw error; + } const address = server.address(); const brokerUrl = `http://127.0.0.1:${address.port}`; diff --git a/bin/lib/remote-common.sh b/bin/lib/remote-common.sh new file mode 100644 index 0000000..505f1da --- /dev/null +++ b/bin/lib/remote-common.sh @@ -0,0 +1,517 @@ +#!/bin/bash +# Shared helpers for remote install/repair orchestration. + +_REMOTE_COMMON_LIB_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=bin/lib/shell-common.sh +source "$_REMOTE_COMMON_LIB_DIR/shell-common.sh" + +REMOTE_ROOT_DEFAULT="${HOME}/.baudbot/remote" +REMOTE_ROOT="${BAUDBOT_REMOTE_DIR:-$REMOTE_ROOT_DEFAULT}" +REMOTE_TARGETS_DIR="${REMOTE_ROOT}/targets" +REMOTE_KEYS_DIR="${REMOTE_ROOT}/keys" +REMOTE_KNOWN_HOSTS="${REMOTE_ROOT}/known_hosts" + +remote_refresh_paths() { + REMOTE_ROOT="${BAUDBOT_REMOTE_DIR:-$REMOTE_ROOT_DEFAULT}" + REMOTE_TARGETS_DIR="${REMOTE_ROOT}/targets" + REMOTE_KEYS_DIR="${REMOTE_ROOT}/keys" + REMOTE_KNOWN_HOSTS="${REMOTE_ROOT}/known_hosts" +} + +remote_log() { + echo "[remote] $*" +} + +remote_warn() { + echo "[remote] WARN: $*" >&2 +} + +remote_error() { + echo "[remote] ERROR: $*" >&2 +} + +remote_die() { + remote_error "$*" + exit 1 +} + +remote_now_iso() { + date -u +"%Y-%m-%dT%H:%M:%SZ" +} + +remote_targets_dir() { + remote_refresh_paths + printf '%s\n' "$REMOTE_TARGETS_DIR" +} + +remote_keys_dir() { + remote_refresh_paths + printf '%s\n' "$REMOTE_KEYS_DIR" +} + +remote_known_hosts_path() { + remote_refresh_paths + printf '%s\n' "$REMOTE_KNOWN_HOSTS" +} + +remote_state_path() { + local target="$1" + printf '%s/%s.json\n' "$(remote_targets_dir)" "$target" +} + +remote_state_exists() { + local target="$1" + [ -f "$(remote_state_path "$target")" ] +} + +remote_validate_target_name() { + local target="$1" + if [ -z "$target" ]; then + remote_error "target name cannot be empty" + return 1 + fi + if [ "${#target}" -gt 63 ]; then + remote_error "target name must be 63 characters or fewer" + return 1 + fi 
+ if ! printf '%s' "$target" | grep -Eq '^[a-z0-9]([-a-z0-9]*[a-z0-9])?$'; then + remote_error "target name must use lowercase letters, numbers, and hyphens" + return 1 + fi + return 0 +} + +remote_init_storage() { + remote_refresh_paths + mkdir -p "$REMOTE_ROOT" "$REMOTE_TARGETS_DIR" "$REMOTE_KEYS_DIR" + chmod 700 "$REMOTE_ROOT" "$REMOTE_TARGETS_DIR" "$REMOTE_KEYS_DIR" + if [ ! -f "$REMOTE_KNOWN_HOSTS" ]; then + : > "$REMOTE_KNOWN_HOSTS" + fi + chmod 600 "$REMOTE_KNOWN_HOSTS" +} + +remote_require_tools() { + local missing=0 + local cmd + for cmd in "$@"; do + if ! command -v "$cmd" >/dev/null 2>&1; then + remote_error "required command not found: $cmd" + missing=1 + fi + done + if [ "$missing" -ne 0 ]; then + return 1 + fi + return 0 +} + +remote_require_dependencies_install() { + local mode="$1" + local tools=(jq ssh scp ssh-keygen) + if [ "$mode" = "hetzner" ]; then + tools+=(curl) + fi + remote_require_tools "${tools[@]}" +} + +remote_require_dependencies_repair() { + remote_require_tools jq ssh scp +} + +remote_expand_path() { + local input="$1" + if [ -z "$input" ]; then + printf '\n' + return 0 + fi + case "$input" in + \~) + printf '%s\n' "$HOME" + ;; + \~/*) + printf '%s/%s\n' "$HOME" "${input#~/}" + ;; + *) + printf '%s\n' "$input" + ;; + esac +} + +_remote_state_write_jq() { + local target="$1" + local filter="$2" + shift 2 + + local state_file tmp_file + state_file="$(remote_state_path "$target")" + if [ ! 
-f "$state_file" ]; then + remote_die "state not found for target '$target'" + fi + + tmp_file="$(mktemp "${TMPDIR:-/tmp}/baudbot-remote-state.XXXXXX")" + if jq "$@" "$filter" "$state_file" > "$tmp_file"; then + mv "$tmp_file" "$state_file" + else + rm -f "$tmp_file" + remote_die "failed to update state for target '$target'" + fi +} + +remote_state_init() { + local target="$1" + local mode="$2" + local host="$3" + local ssh_user="$4" + local ssh_key_path="$5" + local provider_name="$6" + local location="$7" + local server_type="$8" + local image="$9" + + remote_validate_target_name "$target" || return 1 + remote_init_storage + + local state_file now tmp_file + state_file="$(remote_state_path "$target")" + now="$(remote_now_iso)" + tmp_file="$(mktemp "${TMPDIR:-/tmp}/baudbot-remote-state-init.XXXXXX")" + + if ! jq -n \ + --arg name "$target" \ + --arg mode "$mode" \ + --arg host "$host" \ + --arg ssh_user "$ssh_user" \ + --arg ssh_key_path "$ssh_key_path" \ + --arg provider_name "$provider_name" \ + --arg location "$location" \ + --arg server_type "$server_type" \ + --arg image "$image" \ + --arg now "$now" \ + '{ + name: $name, + mode: $mode, + host: $host, + ssh_user: $ssh_user, + ssh_key_path: $ssh_key_path, + provider: { + name: $provider_name, + server_id: "", + ssh_key_id: "", + location: $location, + server_type: $server_type, + image: $image + }, + tailscale: { + enabled: false, + ip: "" + }, + status: "initialized", + checkpoints: [], + last_error: "", + created_at: $now, + updated_at: $now + }' > "$tmp_file"; then + rm -f "$tmp_file" + remote_die "failed to initialize state for target '$target'" + fi + + mv "$tmp_file" "$state_file" + chmod 600 "$state_file" +} + +remote_state_get_field() { + local target="$1" + local jq_expr="$2" + local state_file + state_file="$(remote_state_path "$target")" + [ -f "$state_file" ] || return 1 + jq -er "$jq_expr // empty" "$state_file" 2>/dev/null || true +} + +remote_state_set_status() { + local target="$1" + local 
status="$2" + local now + now="$(remote_now_iso)" + _remote_state_write_jq "$target" '.status = $status | .updated_at = $now' --arg status "$status" --arg now "$now" +} + +remote_state_set_mode() { + local target="$1" + local mode="$2" + local now + now="$(remote_now_iso)" + _remote_state_write_jq "$target" '.mode = $mode | .updated_at = $now' --arg mode "$mode" --arg now "$now" +} + +remote_state_set_host() { + local target="$1" + local host="$2" + local now + now="$(remote_now_iso)" + _remote_state_write_jq "$target" '.host = $host | .updated_at = $now' --arg host "$host" --arg now "$now" +} + +remote_state_set_ssh_user() { + local target="$1" + local ssh_user="$2" + local now + now="$(remote_now_iso)" + _remote_state_write_jq "$target" '.ssh_user = $ssh_user | .updated_at = $now' --arg ssh_user "$ssh_user" --arg now "$now" +} + +remote_state_set_ssh_key_path() { + local target="$1" + local ssh_key_path="$2" + local now + now="$(remote_now_iso)" + _remote_state_write_jq "$target" '.ssh_key_path = $ssh_key_path | .updated_at = $now' --arg ssh_key_path "$ssh_key_path" --arg now "$now" +} + +remote_state_set_provider_field() { + local target="$1" + local field="$2" + local value="$3" + local now + now="$(remote_now_iso)" + _remote_state_write_jq "$target" ".provider.${field} = \$value | .updated_at = \$now" --arg value "$value" --arg now "$now" +} + +remote_state_set_tailscale_enabled() { + local target="$1" + local enabled="$2" + local now + now="$(remote_now_iso)" + _remote_state_write_jq "$target" '.tailscale.enabled = ($enabled == "true") | .updated_at = $now' --arg enabled "$enabled" --arg now "$now" +} + +remote_state_set_tailscale_ip() { + local target="$1" + local ip="$2" + local now + now="$(remote_now_iso)" + _remote_state_write_jq "$target" '.tailscale.ip = $ip | .updated_at = $now' --arg ip "$ip" --arg now "$now" +} + +remote_state_set_last_error() { + local target="$1" + local message="$2" + local now + now="$(remote_now_iso)" + _remote_state_write_jq 
"$target" '.last_error = $message | .updated_at = $now' --arg message "$message" --arg now "$now" +} + +remote_state_clear_last_error() { + local target="$1" + local now + now="$(remote_now_iso)" + _remote_state_write_jq "$target" '.last_error = "" | .updated_at = $now' --arg now "$now" +} + +remote_checkpoint_retry_count() { + local target="$1" + local checkpoint="$2" + local current + current="$(remote_state_get_field "$target" ".checkpoints[]? | select(.name == \"$checkpoint\") | .retry_count")" + if [ -z "$current" ]; then + printf '0\n' + else + printf '%s\n' "$current" + fi +} + +remote_checkpoint_is_complete() { + local target="$1" + local checkpoint="$2" + local completed_at + completed_at="$(remote_state_get_field "$target" ".checkpoints[]? | select(.name == \"$checkpoint\") | .completed_at")" + [ -n "$completed_at" ] +} + +remote_checkpoint_set_retry() { + local target="$1" + local checkpoint="$2" + local retry_count="$3" + local now + now="$(remote_now_iso)" + _remote_state_write_jq "$target" ' + .checkpoints = ( + if (.checkpoints | map(.name) | index($checkpoint)) == null then + .checkpoints + [{ name: $checkpoint, completed_at: "", retry_count: ($retry_count | tonumber) }] + else + .checkpoints | map( + if .name == $checkpoint then + .retry_count = ($retry_count | tonumber) + else + . + end + ) + end + ) + | .updated_at = $now + ' --arg checkpoint "$checkpoint" --arg retry_count "$retry_count" --arg now "$now" +} + +remote_checkpoint_mark_complete() { + local target="$1" + local checkpoint="$2" + local retry_count="$3" + local now + now="$(remote_now_iso)" + _remote_state_write_jq "$target" ' + .checkpoints = ( + if (.checkpoints | map(.name) | index($checkpoint)) == null then + .checkpoints + [{ + name: $checkpoint, + completed_at: $now, + retry_count: ($retry_count | tonumber) + }] + else + .checkpoints | map( + if .name == $checkpoint then + .completed_at = $now + | .retry_count = ($retry_count | tonumber) + else + . 
+ end + ) + end + ) + | .updated_at = $now + ' --arg checkpoint "$checkpoint" --arg retry_count "$retry_count" --arg now "$now" +} + +remote_install_checkpoint_order() { + local mode="$1" + if [ "$mode" = "hetzner" ]; then + cat <<'EOF' +target_selected +ssh_key_ready +server_provisioned +ssh_reachable +bootstrap_installed +baudbot_install_completed +doctor_passed +tailscale_connected +completed +EOF + else + cat <<'EOF' +target_selected +ssh_key_ready +ssh_reachable +bootstrap_installed +baudbot_install_completed +doctor_passed +tailscale_connected +completed +EOF + fi +} + +remote_next_install_checkpoint() { + local target="$1" + local mode="$2" + local checkpoint + while IFS= read -r checkpoint; do + [ -n "$checkpoint" ] || continue + if ! remote_checkpoint_is_complete "$target" "$checkpoint"; then + printf '%s\n' "$checkpoint" + return 0 + fi + done < <(remote_install_checkpoint_order "$mode") + printf 'completed\n' +} + +remote_reset_install_progress() { + local target="$1" + local now + now="$(remote_now_iso)" + _remote_state_write_jq "$target" '.checkpoints = [] | .status = "initialized" | .last_error = "" | .tailscale.enabled = false | .tailscale.ip = "" | .updated_at = $now' --arg now "$now" +} + +remote_prompt_default() { + local prompt="$1" + local default_value="${2:-}" + local answer="" + if [ -n "$default_value" ]; then + printf "%s [%s]: " "$prompt" "$default_value" >&2 + else + printf "%s: " "$prompt" >&2 + fi + read -r answer + if [ -z "$answer" ]; then + printf '%s\n' "$default_value" + else + printf '%s\n' "$answer" + fi +} + +remote_confirm() { + local prompt="$1" + local default_answer="${2:-y}" + local suffix="[Y/n]" + if [ "$default_answer" = "n" ]; then + suffix="[y/N]" + fi + + local answer="" + printf "%s %s " "$prompt" "$suffix" >&2 + read -r answer + if [ -z "$answer" ]; then + answer="$default_answer" + fi + case "$answer" in + y|Y|yes|YES) + return 0 + ;; + *) + return 1 + ;; + esac +} + +remote_is_interactive() { + [ -t 0 ] && [ -t 1 
] +} + +remote_ensure_local_ssh_key() { + local key_path_input="$1" + local comment="$2" + local allow_generate="${3:-1}" + local key_path + key_path="$(remote_expand_path "$key_path_input")" + + if [ -z "$key_path" ]; then + remote_die "ssh key path is empty" + fi + + local pub_key_path="${key_path}.pub" + + if [ -f "$key_path" ]; then + chmod 600 "$key_path" + if [ ! -f "$pub_key_path" ]; then + if ! ssh-keygen -y -f "$key_path" > "$pub_key_path" 2>/dev/null; then + remote_die "failed to derive public key from existing private key: $key_path" + fi + chmod 644 "$pub_key_path" + fi + printf '%s\n' "$key_path" + return 0 + fi + + if [ "$allow_generate" != "1" ]; then + remote_die "ssh private key not found: $key_path" + fi + + mkdir -p "$(dirname "$key_path")" + chmod 700 "$(dirname "$key_path")" + + if ! ssh-keygen -t ed25519 -C "$comment" -f "$key_path" -N "" >/dev/null 2>&1; then + remote_die "failed to generate ssh key pair at: $key_path" + fi + chmod 600 "$key_path" + chmod 644 "$pub_key_path" + printf '%s\n' "$key_path" +} diff --git a/bin/lib/remote-common.test.sh b/bin/lib/remote-common.test.sh new file mode 100755 index 0000000..d703467 --- /dev/null +++ b/bin/lib/remote-common.test.sh @@ -0,0 +1,155 @@ +#!/bin/bash +# Tests for bin/lib/remote-common.sh + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +# shellcheck source=bin/lib/remote-common.sh +source "$SCRIPT_DIR/remote-common.sh" + +TOTAL=0 +PASSED=0 +FAILED=0 + +run_test() { + local name="$1" + shift + local out + + TOTAL=$((TOTAL + 1)) + printf " %-45s " "$name" + + out="$(mktemp /tmp/baudbot-remote-common-test-output.XXXXXX)" + if "$@" >"$out" 2>&1; then + echo "✓" + PASSED=$((PASSED + 1)) + else + echo "✗ FAILED" + tail -40 "$out" | sed 's/^/ /' + FAILED=$((FAILED + 1)) + fi + rm -f "$out" +} + +state_setup() { + export BAUDBOT_REMOTE_DIR + BAUDBOT_REMOTE_DIR="$(mktemp -d /tmp/baudbot-remote-common.XXXXXX)" +} + +state_teardown() { + rm -rf "$BAUDBOT_REMOTE_DIR" + unset 
BAUDBOT_REMOTE_DIR +} + +test_target_name_validation() { + ( + set -euo pipefail + remote_validate_target_name "valid-name-1" + ! remote_validate_target_name "" + ! remote_validate_target_name "UPPERCASE" + ! remote_validate_target_name "bad_name" + ) +} + +test_state_init_and_fields() { + ( + set -euo pipefail + state_setup + trap state_teardown EXIT + + remote_state_init "demo-target" "host" "203.0.113.9" "root" "$BAUDBOT_REMOTE_DIR/key" "none" "" "" "" + + [ "$(remote_state_get_field "demo-target" '.name')" = "demo-target" ] + [ "$(remote_state_get_field "demo-target" '.mode')" = "host" ] + [ "$(remote_state_get_field "demo-target" '.host')" = "203.0.113.9" ] + [ "$(remote_state_get_field "demo-target" '.status')" = "initialized" ] + ) +} + +test_checkpoint_progression() { + ( + set -euo pipefail + state_setup + trap state_teardown EXIT + + remote_state_init "demo-target" "host" "" "root" "$BAUDBOT_REMOTE_DIR/key" "none" "" "" "" + + [ "$(remote_next_install_checkpoint "demo-target" "host")" = "target_selected" ] + + remote_checkpoint_mark_complete "demo-target" "target_selected" 0 + [ "$(remote_next_install_checkpoint "demo-target" "host")" = "ssh_key_ready" ] + + remote_checkpoint_set_retry "demo-target" "ssh_key_ready" 2 + [ "$(remote_checkpoint_retry_count "demo-target" "ssh_key_ready")" = "2" ] + + remote_checkpoint_mark_complete "demo-target" "ssh_key_ready" 2 + [ "$(remote_next_install_checkpoint "demo-target" "host")" = "ssh_reachable" ] + ) +} + +test_checkpoint_order_includes_tailscale() { + ( + set -euo pipefail + local host_order hetzner_order + host_order="$(remote_install_checkpoint_order "host")" + hetzner_order="$(remote_install_checkpoint_order "hetzner")" + + printf '%s\n' "$host_order" | grep -q '^tailscale_connected$' + printf '%s\n' "$hetzner_order" | grep -q '^tailscale_connected$' + ) +} + +test_reset_install_progress() { + ( + set -euo pipefail + state_setup + trap state_teardown EXIT + + remote_state_init "demo-target" "host" "" "root" 
"$BAUDBOT_REMOTE_DIR/key" "none" "" "" "" + remote_checkpoint_mark_complete "demo-target" "target_selected" 0 + remote_state_set_status "demo-target" "failed" + remote_state_set_last_error "demo-target" "boom" + + remote_reset_install_progress "demo-target" + + [ "$(remote_state_get_field "demo-target" '.status')" = "initialized" ] + [ -z "$(remote_state_get_field "demo-target" '.last_error')" ] + [ "$(remote_next_install_checkpoint "demo-target" "host")" = "target_selected" ] + ) +} + +test_ensure_local_ssh_key_generates_pair() { + ( + set -euo pipefail + state_setup + trap state_teardown EXIT + + local key_path + key_path="$BAUDBOT_REMOTE_DIR/keys/test-key" + + generated="$(remote_ensure_local_ssh_key "$key_path" "remote-common-test" 1)" + [ "$generated" = "$key_path" ] + [ -f "$key_path" ] + [ -f "${key_path}.pub" ] + + reused="$(remote_ensure_local_ssh_key "$key_path" "remote-common-test" 1)" + [ "$reused" = "$key_path" ] + ) +} + +echo "=== remote-common tests ===" +echo "" + +run_test "target name validation" test_target_name_validation +run_test "state init and fields" test_state_init_and_fields +run_test "checkpoint progression" test_checkpoint_progression +run_test "checkpoint order includes tailscale" test_checkpoint_order_includes_tailscale +run_test "reset install progress" test_reset_install_progress +run_test "ssh key generation" test_ensure_local_ssh_key_generates_pair + +echo "" +echo "=== $PASSED/$TOTAL passed, $FAILED failed ===" + +if [ "$FAILED" -gt 0 ]; then + exit 1 +fi diff --git a/bin/lib/remote-hetzner.sh b/bin/lib/remote-hetzner.sh new file mode 100644 index 0000000..d39f563 --- /dev/null +++ b/bin/lib/remote-hetzner.sh @@ -0,0 +1,339 @@ +#!/bin/bash +# Hetzner provider adapter for baudbot remote workflows. 
+ +_REMOTE_HETZNER_LIB_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=bin/lib/remote-common.sh +source "$_REMOTE_HETZNER_LIB_DIR/remote-common.sh" + +REMOTE_HETZNER_API_BASE="${REMOTE_HETZNER_API_BASE:-https://api.hetzner.cloud/v1}" + +_remote_http_code_allowed() { + local code="$1" + shift + local allowed + for allowed in "$@"; do + if [ "$code" = "$allowed" ]; then + return 0 + fi + done + return 1 +} + +remote_hetzner_extract_error_message() { + local response_file="$1" + + if ! [ -s "$response_file" ]; then + printf 'empty response\n' + return 0 + fi + + if command -v jq >/dev/null 2>&1; then + local msg="" + msg="$(jq -er '.error.message // .message // empty' "$response_file" 2>/dev/null || true)" + if [ -n "$msg" ]; then + printf '%s\n' "$msg" + return 0 + fi + fi + + head -c 200 "$response_file" 2>/dev/null || true +} + +remote_hetzner_request() { + local token="$1" + local method="$2" + local endpoint="$3" + local body="${4:-}" + shift 4 + + if [ -z "$token" ]; then + remote_error "Hetzner API token is required" + return 1 + fi + + local -a allowed_codes + if [ "$#" -gt 0 ]; then + allowed_codes=("$@") + else + allowed_codes=(200 201 202 204) + fi + + local response_file http_code curl_rc + response_file="$(mktemp "${TMPDIR:-/tmp}/baudbot-hetzner-response.XXXXXX")" + + if [ -n "$body" ]; then + http_code="$(curl -sS -X "$method" \ + -H "Authorization: Bearer $token" \ + -H "Content-Type: application/json" \ + -o "$response_file" \ + -w "%{http_code}" \ + "$REMOTE_HETZNER_API_BASE$endpoint" \ + -d "$body")" + curl_rc=$? + else + http_code="$(curl -sS -X "$method" \ + -H "Authorization: Bearer $token" \ + -H "Content-Type: application/json" \ + -o "$response_file" \ + -w "%{http_code}" \ + "$REMOTE_HETZNER_API_BASE$endpoint")" + curl_rc=$? 
+ fi + + if [ "$curl_rc" -ne 0 ]; then + rm -f "$response_file" + remote_error "Hetzner API request failed (network or TLS error)" + return 1 + fi + + if _remote_http_code_allowed "$http_code" "${allowed_codes[@]}"; then + cat "$response_file" + rm -f "$response_file" + return 0 + fi + + local api_error + api_error="$(remote_hetzner_extract_error_message "$response_file")" + rm -f "$response_file" + + case "$http_code" in + 401|403) + remote_error "Hetzner API authentication failed ($http_code): $api_error" + ;; + 404) + remote_error "Hetzner API resource not found ($http_code): $api_error" + ;; + 429) + remote_error "Hetzner API rate limit hit ($http_code): $api_error" + ;; + *) + remote_error "Hetzner API request failed ($http_code): $api_error" + ;; + esac + + return 1 +} + +remote_hetzner_validate_credentials() { + local token="$1" + remote_hetzner_request "$token" GET "/account" "" 200 >/dev/null +} + +remote_hetzner_create_ssh_key() { + local token="$1" + local name="$2" + local public_key="$3" + + local payload response ssh_key_id + payload="$(jq -nc --arg name "$name" --arg public_key "$public_key" '{name: $name, public_key: $public_key}')" + response="$(remote_hetzner_request "$token" POST "/ssh_keys" "$payload" 201)" || return 1 + + ssh_key_id="$(printf '%s' "$response" | jq -er '.ssh_key.id' 2>/dev/null || true)" + if [ -z "$ssh_key_id" ]; then + remote_error "Hetzner create SSH key response missing ssh_key.id" + return 1 + fi + + printf '%s\n' "$ssh_key_id" +} + +remote_hetzner_list_ssh_keys() { + local token="$1" + remote_hetzner_request "$token" GET "/ssh_keys" "" 200 +} + +remote_hetzner_find_ssh_key_id_by_name() { + local token="$1" + local name="$2" + local response + + response="$(remote_hetzner_list_ssh_keys "$token")" || return 1 + printf '%s' "$response" | jq -er --arg name "$name" '.ssh_keys[]? 
| select(.name == $name) | .id' 2>/dev/null || true +} + +remote_hetzner_delete_ssh_key() { + local token="$1" + local ssh_key_id="$2" + + [ -n "$ssh_key_id" ] || return 0 + remote_hetzner_request "$token" DELETE "/ssh_keys/$ssh_key_id" "" 200 204 404 >/dev/null +} + +remote_hetzner_create_server() { + local token="$1" + local name="$2" + local server_type="$3" + local image="$4" + local location="$5" + local ssh_key_id="$6" + + local payload response server_id + payload="$(jq -nc \ + --arg name "$name" \ + --arg server_type "$server_type" \ + --arg image "$image" \ + --arg location "$location" \ + --argjson ssh_key_id "$ssh_key_id" \ + '{name: $name, server_type: $server_type, image: $image, location: $location, ssh_keys: [$ssh_key_id], start_after_create: true}')" + + response="$(remote_hetzner_request "$token" POST "/servers" "$payload" 201 202)" || return 1 + + server_id="$(printf '%s' "$response" | jq -er '.server.id' 2>/dev/null || true)" + if [ -z "$server_id" ]; then + remote_error "Hetzner create server response missing server.id" + return 1 + fi + + printf '%s\n' "$server_id" +} + +remote_hetzner_list_servers() { + local token="$1" + remote_hetzner_request "$token" GET "/servers" "" 200 +} + +remote_hetzner_find_server_id_by_name() { + local token="$1" + local name="$2" + local response + + response="$(remote_hetzner_list_servers "$token")" || return 1 + printf '%s' "$response" | jq -er --arg name "$name" '.servers[]? 
| select(.name == $name) | .id' 2>/dev/null || true +} + +remote_hetzner_get_server_ipv4() { + local token="$1" + local server_id="$2" + local response + + response="$(remote_hetzner_request "$token" GET "/servers/$server_id" "" 200)" || return 1 + printf '%s' "$response" | jq -er '.server.public_net.ipv4.ip // empty' 2>/dev/null || true +} + +remote_hetzner_wait_server_running() { + local token="$1" + local server_id="$2" + local timeout_seconds="${3:-600}" + local interval_seconds="${4:-5}" + + local elapsed=0 + while [ "$elapsed" -lt "$timeout_seconds" ]; do + local response status server_ip + response="$(remote_hetzner_request "$token" GET "/servers/$server_id" "" 200)" || return 1 + + status="$(printf '%s' "$response" | jq -er '.server.status // empty' 2>/dev/null || true)" + server_ip="$(printf '%s' "$response" | jq -er '.server.public_net.ipv4.ip // empty' 2>/dev/null || true)" + + if [ "$status" = "running" ] && [ -n "$server_ip" ]; then + printf '%s\n' "$server_ip" + return 0 + fi + + sleep "$interval_seconds" + elapsed=$((elapsed + interval_seconds)) + done + + remote_error "Timed out waiting for Hetzner server $server_id to become running" + return 1 +} + +remote_hetzner_delete_server() { + local token="$1" + local server_id="$2" + + [ -n "$server_id" ] || return 0 + remote_hetzner_request "$token" DELETE "/servers/$server_id" "" 200 204 404 >/dev/null +} + +provider_validate_credentials() { + local provider="$1" + local token="$2" + + case "$provider" in + hetzner) + remote_hetzner_validate_credentials "$token" + ;; + none|"") + return 0 + ;; + *) + remote_error "unsupported provider: $provider" + return 1 + ;; + esac +} + +provider_create_ssh_key() { + local provider="$1" + shift + + case "$provider" in + hetzner) + remote_hetzner_create_ssh_key "$@" + ;; + *) + remote_error "provider_create_ssh_key not supported for provider: $provider" + return 1 + ;; + esac +} + +provider_create_server() { + local provider="$1" + shift + + case "$provider" in + 
hetzner) + remote_hetzner_create_server "$@" + ;; + *) + remote_error "provider_create_server not supported for provider: $provider" + return 1 + ;; + esac +} + +provider_wait_server_running() { + local provider="$1" + shift + + case "$provider" in + hetzner) + remote_hetzner_wait_server_running "$@" + ;; + *) + remote_error "provider_wait_server_running not supported for provider: $provider" + return 1 + ;; + esac +} + +provider_delete_server() { + local provider="$1" + shift + + case "$provider" in + hetzner) + remote_hetzner_delete_server "$@" + ;; + *) + remote_error "provider_delete_server not supported for provider: $provider" + return 1 + ;; + esac +} + +provider_delete_ssh_key() { + local provider="$1" + shift + + case "$provider" in + hetzner) + remote_hetzner_delete_ssh_key "$@" + ;; + *) + remote_error "provider_delete_ssh_key not supported for provider: $provider" + return 1 + ;; + esac +} diff --git a/bin/lib/remote-hetzner.test.sh b/bin/lib/remote-hetzner.test.sh new file mode 100755 index 0000000..b0b6695 --- /dev/null +++ b/bin/lib/remote-hetzner.test.sh @@ -0,0 +1,242 @@ +#!/bin/bash +# Tests for bin/lib/remote-hetzner.sh + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +# shellcheck source=bin/lib/remote-hetzner.sh +source "$SCRIPT_DIR/remote-hetzner.sh" + +TOTAL=0 +PASSED=0 +FAILED=0 + +run_test() { + local name="$1" + shift + local out + + TOTAL=$((TOTAL + 1)) + printf " %-45s " "$name" + + out="$(mktemp /tmp/baudbot-remote-hetzner-test-output.XXXXXX)" + if "$@" >"$out" 2>&1; then + echo "✓" + PASSED=$((PASSED + 1)) + else + echo "✗ FAILED" + tail -60 "$out" | sed 's/^/ /' + FAILED=$((FAILED + 1)) + fi + rm -f "$out" +} + +with_mocked_curl() { + local case_name="$1" + shift + + local tmp fakebin fakecurl + tmp="$(mktemp -d /tmp/baudbot-remote-hetzner.XXXXXX)" + fakebin="$tmp/fakebin" + fakecurl="$fakebin/curl" + mkdir -p "$fakebin" + + cat > "$fakecurl" <<'EOF_CURL' +#!/bin/bash +set -euo pipefail + +out_file="" +method="GET" 
+url="" + +while [ "$#" -gt 0 ]; do + case "$1" in + -o) + out_file="$2" + shift 2 + ;; + -w) + shift 2 + ;; + -X) + method="$2" + shift 2 + ;; + -H) + shift 2 + ;; + -d) + shift 2 + ;; + -s|-S|-sS) + shift + ;; + *) + url="$1" + shift + ;; + esac +done + +code="500" +body='{"error":{"message":"unknown mock case"}}' + +case "${MOCK_CURL_CASE:-}" in + validate_ok) + code="200" + body='{"account":{"id":1}}' + ;; + unauthorized) + code="401" + body='{"error":{"message":"unauthorized"}}' + ;; + rate_limit) + code="429" + body='{"error":{"message":"too many requests"}}' + ;; + create_key) + code="201" + body='{"ssh_key":{"id":321}}' + ;; + create_server) + code="201" + body='{"server":{"id":654}}' + ;; + list_servers) + code="200" + body='{"servers":[{"id":77,"name":"demo"}]}' + ;; + list_keys) + code="200" + body='{"ssh_keys":[{"id":88,"name":"demo-key"}]}' + ;; + delete_ok) + code="204" + body='' + ;; + wait_running) + code="200" + counter_file="${MOCK_COUNTER_FILE}" + counter="0" + if [ -f "$counter_file" ]; then + counter="$(cat "$counter_file")" + fi + counter=$((counter + 1)) + printf '%s' "$counter" > "$counter_file" + if [ "$counter" -lt 3 ]; then + body='{"server":{"status":"starting","public_net":{"ipv4":{"ip":""}}}}' + else + body='{"server":{"status":"running","public_net":{"ipv4":{"ip":"198.51.100.20"}}}}' + fi + ;; + wait_timeout) + code="200" + body='{"server":{"status":"starting","public_net":{"ipv4":{"ip":""}}}}' + ;; +esac + +if [ -n "$out_file" ]; then + printf '%s' "$body" > "$out_file" +fi + +printf '%s' "$code" +exit 0 +EOF_CURL + + chmod +x "$fakecurl" + + local rc=0 + ( + set -euo pipefail + export PATH="$fakebin:$PATH" + hash -r + export MOCK_CURL_CASE="$case_name" + export MOCK_COUNTER_FILE="$tmp/counter" + "$@" + ) || rc=$? 
+ + rm -rf "$tmp" + return "$rc" +} + +test_validate_credentials_ok() { + with_mocked_curl "validate_ok" remote_hetzner_validate_credentials "token123" +} + +test_validate_credentials_unauthorized() { + ( + set -euo pipefail + if with_mocked_curl "unauthorized" remote_hetzner_validate_credentials "badtoken" >/tmp/baudbot-hetzner-auth.out 2>&1; then + return 1 + fi + grep -q "authentication failed" /tmp/baudbot-hetzner-auth.out + rm -f /tmp/baudbot-hetzner-auth.out + ) +} + +test_create_ssh_key_returns_id() { + ( + set -euo pipefail + local id + id="$(with_mocked_curl "create_key" remote_hetzner_create_ssh_key "token123" "demo-key" "ssh-ed25519 AAAA")" + [ "$id" = "321" ] + ) +} + +test_create_server_returns_id() { + ( + set -euo pipefail + local id + id="$(with_mocked_curl "create_server" remote_hetzner_create_server "token123" "demo" "cpx11" "ubuntu-24.04" "ash" "55")" + [ "$id" = "654" ] + ) +} + +test_wait_server_running_polls_until_running() { + ( + set -euo pipefail + local ip + ip="$(with_mocked_curl "wait_running" remote_hetzner_wait_server_running "token123" "654" "5" "1")" + [ "$ip" = "198.51.100.20" ] + ) +} + +test_wait_server_running_timeout() { + ( + set -euo pipefail + if with_mocked_curl "wait_timeout" remote_hetzner_wait_server_running "token123" "654" "1" "1" >/tmp/baudbot-hetzner-timeout.out 2>&1; then + return 1 + fi + grep -q "Timed out" /tmp/baudbot-hetzner-timeout.out + rm -f /tmp/baudbot-hetzner-timeout.out + ) +} + +test_rate_limit_error_message() { + ( + set -euo pipefail + if with_mocked_curl "rate_limit" remote_hetzner_validate_credentials "token123" >/tmp/baudbot-hetzner-rate.out 2>&1; then + return 1 + fi + grep -q "rate limit" /tmp/baudbot-hetzner-rate.out + rm -f /tmp/baudbot-hetzner-rate.out + ) +} + +echo "=== remote-hetzner tests ===" +echo "" + +run_test "validate credentials success" test_validate_credentials_ok +run_test "validate credentials unauthorized" test_validate_credentials_unauthorized +run_test "create ssh key returns 
id" test_create_ssh_key_returns_id +run_test "create server returns id" test_create_server_returns_id +run_test "wait running polls" test_wait_server_running_polls_until_running +run_test "wait running timeout" test_wait_server_running_timeout +run_test "rate limit error handling" test_rate_limit_error_message + +echo "" +echo "=== $PASSED/$TOTAL passed, $FAILED failed ===" + +if [ "$FAILED" -gt 0 ]; then + exit 1 +fi diff --git a/bin/lib/remote-ssh.sh b/bin/lib/remote-ssh.sh new file mode 100644 index 0000000..400a73a --- /dev/null +++ b/bin/lib/remote-ssh.sh @@ -0,0 +1,134 @@ +#!/bin/bash +# SSH/SCP wrappers for baudbot remote workflows. + +_REMOTE_SSH_LIB_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=bin/lib/remote-common.sh +source "$_REMOTE_SSH_LIB_DIR/remote-common.sh" + +REMOTE_SSH_CONNECT_TIMEOUT_SEC="${REMOTE_SSH_CONNECT_TIMEOUT_SEC:-8}" +REMOTE_SSH_SERVER_ALIVE_INTERVAL_SEC="${REMOTE_SSH_SERVER_ALIVE_INTERVAL_SEC:-20}" +REMOTE_SSH_SERVER_ALIVE_COUNT_MAX="${REMOTE_SSH_SERVER_ALIVE_COUNT_MAX:-3}" + +remote_ssh_target() { + local ssh_user="$1" + local host="$2" + printf '%s@%s\n' "$ssh_user" "$host" +} + +remote_ssh_exec() { + local ssh_user="$1" + local host="$2" + local ssh_key_path="${3:-}" + local remote_command="$4" + + remote_init_storage + + local -a args + args=( + -o StrictHostKeyChecking=accept-new + -o "UserKnownHostsFile=$(remote_known_hosts_path)" + -o "ConnectTimeout=${REMOTE_SSH_CONNECT_TIMEOUT_SEC}" + -o "ServerAliveInterval=${REMOTE_SSH_SERVER_ALIVE_INTERVAL_SEC}" + -o "ServerAliveCountMax=${REMOTE_SSH_SERVER_ALIVE_COUNT_MAX}" + -o BatchMode=yes + ) + + if [ -n "$ssh_key_path" ]; then + args+=( -i "$ssh_key_path" ) + fi + + ssh "${args[@]}" "$(remote_ssh_target "$ssh_user" "$host")" "$remote_command" +} + +remote_ssh_exec_tty() { + local ssh_user="$1" + local host="$2" + local ssh_key_path="${3:-}" + local remote_command="$4" + + remote_init_storage + + local -a args + args=( + -tt + -o 
StrictHostKeyChecking=accept-new
    -o "UserKnownHostsFile=$(remote_known_hosts_path)"
    -o "ConnectTimeout=${REMOTE_SSH_CONNECT_TIMEOUT_SEC}"
    -o "ServerAliveInterval=${REMOTE_SSH_SERVER_ALIVE_INTERVAL_SEC}"
    -o "ServerAliveCountMax=${REMOTE_SSH_SERVER_ALIVE_COUNT_MAX}"
  )

  if [ -n "$ssh_key_path" ]; then
    args+=( -i "$ssh_key_path" )
  fi

  ssh "${args[@]}" "$(remote_ssh_target "$ssh_user" "$host")" "$remote_command"
}

# Copy a local file to the remote host.
remote_scp_to() {
  local ssh_user="$1"
  local host="$2"
  local ssh_key_path="${3:-}"
  local local_path="$4"
  local remote_path="$5"

  remote_init_storage

  local -a args
  args=(
    -o StrictHostKeyChecking=accept-new
    -o "UserKnownHostsFile=$(remote_known_hosts_path)"
    -o "ConnectTimeout=${REMOTE_SSH_CONNECT_TIMEOUT_SEC}"
  )

  if [ -n "$ssh_key_path" ]; then
    args+=( -i "$ssh_key_path" )
  fi

  scp "${args[@]}" "$local_path" "$(remote_ssh_target "$ssh_user" "$host"):$remote_path"
}

# Copy a remote file down to the local machine.
remote_scp_from() {
  local ssh_user="$1"
  local host="$2"
  local ssh_key_path="${3:-}"
  local remote_path="$4"
  local local_path="$5"

  remote_init_storage

  local -a args
  args=(
    -o StrictHostKeyChecking=accept-new
    -o "UserKnownHostsFile=$(remote_known_hosts_path)"
    -o "ConnectTimeout=${REMOTE_SSH_CONNECT_TIMEOUT_SEC}"
  )

  if [ -n "$ssh_key_path" ]; then
    args+=( -i "$ssh_key_path" )
  fi

  scp "${args[@]}" "$(remote_ssh_target "$ssh_user" "$host"):$remote_path" "$local_path"
}

# Poll "ssh ... true" until it succeeds; returns non-zero after
# max_attempts failures (no sleep after the final attempt).
remote_ssh_wait_for_reachable() {
  local ssh_user="$1"
  local host="$2"
  local ssh_key_path="${3:-}"
  local max_attempts="${4:-30}"
  local sleep_seconds="${5:-5}"

  local attempt=1
  while [ "$attempt" -le "$max_attempts" ]; do
    if remote_ssh_exec "$ssh_user" "$host" "$ssh_key_path" "true" >/dev/null 2>&1; then
      return 0
    fi

    if [ "$attempt" -lt "$max_attempts" ]; then
      sleep "$sleep_seconds"
    fi
    attempt=$((attempt + 1))
  done

  return 1
}
diff --git a/bin/lib/remote-ssh.test.sh b/bin/lib/remote-ssh.test.sh
new file mode 100755
index 0000000..fddc717
--- /dev/null
+++ b/bin/lib/remote-ssh.test.sh
@@ -0,0 +1,167 @@
#!/bin/bash
# Tests for bin/lib/remote-ssh.sh

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
# shellcheck source=bin/lib/remote-ssh.sh
source "$SCRIPT_DIR/remote-ssh.sh"

TOTAL=0
PASSED=0
FAILED=0

run_test() {
  local name="$1"
  shift
  local out

  TOTAL=$((TOTAL + 1))
  printf " %-45s " "$name"

  out="$(mktemp /tmp/baudbot-remote-ssh-test-output.XXXXXX)"
  if "$@" >"$out" 2>&1; then
    echo "✓"
    PASSED=$((PASSED + 1))
  else
    echo "✗ FAILED"
    tail -60 "$out" | sed 's/^/ /'
    FAILED=$((FAILED + 1))
  fi
  rm -f "$out"
}

# Install fake ssh/scp binaries on PATH. The fake ssh fails with 255 for
# the first $fail_until invocations (tracked via a counter file), then
# succeeds; every invocation is appended to MOCK_SSH_LOG for assertions.
with_mocked_ssh_tools() {
  local fail_until="${1:-0}"
  shift

  local tmp fakebin log_file count_file
  tmp="$(mktemp -d /tmp/baudbot-remote-ssh.XXXXXX)"
  fakebin="$tmp/fakebin"
  log_file="$tmp/log"
  count_file="$tmp/count"
  mkdir -p "$fakebin"

  cat > "$fakebin/ssh" <<'EOF_SSH'
#!/bin/bash
set -euo pipefail

count_file="${MOCK_SSH_COUNT_FILE}"
log_file="${MOCK_SSH_LOG}"
fail_until="${MOCK_SSH_FAIL_UNTIL:-0}"

count="0"
if [ -f "$count_file" ]; then
  count="$(cat "$count_file")"
fi
count=$((count + 1))
printf '%s' "$count" > "$count_file"

echo "ssh $*" >> "$log_file"

if [ "$count" -le "$fail_until" ]; then
  exit 255
fi

exit 0
EOF_SSH

  cat > "$fakebin/scp" <<'EOF_SCP'
#!/bin/bash
set -euo pipefail
echo "scp $*" >> "${MOCK_SSH_LOG}"
exit 0
EOF_SCP

  chmod +x "$fakebin/ssh" "$fakebin/scp"

  local rc=0
  (
    set -euo pipefail
    export PATH="$fakebin:$PATH"
    hash -r
    export MOCK_SSH_LOG="$log_file"
    export MOCK_SSH_COUNT_FILE="$count_file"
    export MOCK_SSH_FAIL_UNTIL="$fail_until"
    export BAUDBOT_REMOTE_DIR="$tmp/state"
    "$@"
  ) || rc=$?

  rm -rf "$tmp"
  return "$rc"
}

test_ssh_exec_builds_expected_flags() {
  with_mocked_ssh_tools 0 _case_ssh_exec_flags
}

test_ssh_exec_tty_adds_tty_flag() {
  with_mocked_ssh_tools 0 _case_ssh_exec_tty
}

test_scp_wrappers_build_expected_targets() {
  with_mocked_ssh_tools 0 _case_scp_wrappers
}

test_wait_for_reachable_retries() {
  with_mocked_ssh_tools 2 _case_wait_retries
}

test_wait_for_reachable_timeout() {
  with_mocked_ssh_tools 10 _case_wait_timeout
}

_case_ssh_exec_flags() {
  set -euo pipefail
  remote_ssh_exec root 203.0.113.5 /tmp/key "echo hi"
  grep -q "StrictHostKeyChecking=accept-new" "$MOCK_SSH_LOG"
  grep -q "UserKnownHostsFile=$BAUDBOT_REMOTE_DIR/known_hosts" "$MOCK_SSH_LOG"
  grep -q -- "-i /tmp/key" "$MOCK_SSH_LOG"
  grep -q "root@203.0.113.5" "$MOCK_SSH_LOG"
}

_case_ssh_exec_tty() {
  set -euo pipefail
  remote_ssh_exec_tty root 203.0.113.5 /tmp/key "baudbot install"
  grep -q "ssh -tt" "$MOCK_SSH_LOG"
}

_case_scp_wrappers() {
  set -euo pipefail
  remote_scp_to root 203.0.113.5 /tmp/key /tmp/local /tmp/remote
  remote_scp_from root 203.0.113.5 /tmp/key /tmp/remote /tmp/local
  grep -q "scp .* /tmp/local root@203.0.113.5:/tmp/remote" "$MOCK_SSH_LOG"
  grep -q "scp .* root@203.0.113.5:/tmp/remote /tmp/local" "$MOCK_SSH_LOG"
}

_case_wait_retries() {
  set -euo pipefail
  remote_ssh_wait_for_reachable root 203.0.113.5 /tmp/key 5 0
  attempts="$(cat "$MOCK_SSH_COUNT_FILE")"
  [ "$attempts" = "3" ]
}

_case_wait_timeout() {
  set -euo pipefail
  if remote_ssh_wait_for_reachable root 203.0.113.5 /tmp/key 3 0; then
    exit 1
  fi
  attempts="$(cat "$MOCK_SSH_COUNT_FILE")"
  [ "$attempts" = "3" ]
}

echo "=== remote-ssh tests ==="
echo ""

run_test "ssh exec flags" test_ssh_exec_builds_expected_flags
run_test "ssh exec tty mode" test_ssh_exec_tty_adds_tty_flag
run_test "scp wrappers" test_scp_wrappers_build_expected_targets
run_test "wait retries until success" test_wait_for_reachable_retries
run_test
"wait fails after timeout" test_wait_for_reachable_timeout

echo ""
echo "=== $PASSED/$TOTAL passed, $FAILED failed ==="

if [ "$FAILED" -gt 0 ]; then
  exit 1
fi
diff --git a/bin/redact-logs.sh b/bin/redact-logs.sh
index 564ce40..33d1500 100755
--- a/bin/redact-logs.sh
+++ b/bin/redact-logs.sh
@@ -28,27 +28,6 @@ if [ ! -d "$SESSION_DIR" ]; then
   exit 0
 fi
 
-# Secret patterns — each is a sed-compatible extended regex
-# We use perl for lookahead/lookbehind since sed ERE is limited
-REDACT_PATTERNS=(
-  # OpenAI API keys
-  's/sk-[a-zA-Z0-9]{20,}/[REDACTED_API_KEY]/g'
-  # Slack bot tokens
-  's/xoxb-[0-9A-Za-z-]{20,}/[REDACTED_SLACK_TOKEN]/g'
-  # Slack app tokens
-  's/xapp-[0-9A-Za-z-]{20,}/[REDACTED_SLACK_TOKEN]/g'
-  # GitHub PATs
-  's/ghp_[a-zA-Z0-9]{36}/[REDACTED_GITHUB_TOKEN]/g'
-  # GitHub fine-grained PATs
-  's/github_pat_[a-zA-Z0-9_]{20,}/[REDACTED_GITHUB_TOKEN]/g'
-  # AWS access keys
-  's/AKIA[A-Z0-9]{16}/[REDACTED_AWS_KEY]/g'
-  # Bearer tokens in headers
-  's/(Bearer[[:space:]]+)[a-zA-Z0-9._~+/-]+[=]*/\1[REDACTED_BEARER]/gI'
-  # Generic password/secret in key=value or key: value
-  's/(password|secret|api_key|apikey|api-key)[[:space:]]*[:=][[:space:]]*"[^"]{8,}"/\1=[REDACTED_SECRET]/gI'
-)
-
 files_changed=0
 files_scanned=0
 
@@ -56,7 +35,7 @@ while IFS= read -r -d '' logfile; do
   files_scanned=$((files_scanned + 1))
 
   # Quick check: does file contain anything that looks like a secret?
-  if ! grep -qE '(sk-[a-zA-Z0-9]{20}|xoxb-|xapp-|ghp_|github_pat_|AKIA[A-Z0-9]{16}|Bearer\s+[a-zA-Z0-9]|-----BEGIN)' "$logfile" 2>/dev/null; then
+  if ! grep -qE '(sk-[a-zA-Z0-9]{20}|xoxb-|xapp-|ghp_|github_pat_|AKIA[A-Z0-9]{16}|Bearer[[:space:]]+[a-zA-Z0-9]|-----BEGIN)' "$logfile" 2>/dev/null; then
     continue
   fi
 
@@ -66,19 +45,27 @@
     continue
   fi
 
-  # Build the sed command
-  sed_args=()
-  for pattern in "${REDACT_PATTERNS[@]}"; do
-    sed_args+=(-e "$pattern")
-  done
-
-  # Also handle PEM private keys (multi-line, but in JSONL they're escaped)
-  sed_args+=(-e 's/-----BEGIN[A-Z ]*PRIVATE KEY-----[^-]*-----END[A-Z ]*PRIVATE KEY-----/[REDACTED_PRIVATE_KEY]/g')
-
-  # Apply in-place
-  sed -i -E "${sed_args[@]}" "$logfile"
-  files_changed=$((files_changed + 1))
-  echo " ✓ Redacted: $logfile"
+  temp_file="$(mktemp "${TMPDIR:-/tmp}/redact-log.XXXXXX")"
+  perl -0777 - "$logfile" > "$temp_file" <<'PERL'
+my $text = do { local $/; <> };
+$text =~ s/sk-[a-zA-Z0-9]{20,}/[REDACTED_API_KEY]/g;
+$text =~ s/xoxb-[0-9A-Za-z-]{20,}/[REDACTED_SLACK_TOKEN]/g;
+$text =~ s/xapp-[0-9A-Za-z-]{20,}/[REDACTED_SLACK_TOKEN]/g;
+$text =~ s/ghp_[a-zA-Z0-9]{36}/[REDACTED_GITHUB_TOKEN]/g;
+$text =~ s/github_pat_[a-zA-Z0-9_]{20,}/[REDACTED_GITHUB_TOKEN]/g;
+$text =~ s/AKIA[A-Z0-9]{16}/[REDACTED_AWS_KEY]/g;
+$text =~ s#(Bearer\s+)[A-Za-z0-9._~+/=-]+#${1}[REDACTED_BEARER]#ig;
+$text =~ s/(password|secret|api_key|apikey|api-key)\s*[:=]\s*"[^"]{8,}"/$1=[REDACTED_SECRET]/ig;
+$text =~ s/-----BEGIN[A-Z ]*PRIVATE KEY-----[^-]*-----END[A-Z ]*PRIVATE KEY-----/[REDACTED_PRIVATE_KEY]/g;
+print $text;
+PERL
+  if ! cmp -s "$logfile" "$temp_file"; then
+    mv "$temp_file" "$logfile"
+    files_changed=$((files_changed + 1))
+    echo " ✓ Redacted: $logfile"
+  else
+    rm -f "$temp_file"
+  fi
 done < <(find "$SESSION_DIR" -name '*.jsonl' -print0 2>/dev/null)
diff --git a/bin/remote.sh b/bin/remote.sh
new file mode 100755
index 0000000..89af7b5
--- /dev/null
+++ b/bin/remote.sh
@@ -0,0 +1,1195 @@
#!/bin/bash
# Remote install/repair orchestration for baudbot.
+ +set -euo pipefail + +REMOTE_SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=bin/lib/remote-common.sh +source "$REMOTE_SCRIPT_DIR/lib/remote-common.sh" +# shellcheck source=bin/lib/remote-ssh.sh +source "$REMOTE_SCRIPT_DIR/lib/remote-ssh.sh" +# shellcheck source=bin/lib/remote-hetzner.sh +source "$REMOTE_SCRIPT_DIR/lib/remote-hetzner.sh" + +REMOTE_CHECKPOINT_MAX_RETRIES="${REMOTE_CHECKPOINT_MAX_RETRIES:-3}" +REMOTE_SSH_REACHABLE_ATTEMPTS="${REMOTE_SSH_REACHABLE_ATTEMPTS:-40}" +REMOTE_SSH_REACHABLE_INTERVAL_SEC="${REMOTE_SSH_REACHABLE_INTERVAL_SEC:-3}" +REMOTE_HETZNER_WAIT_TIMEOUT_SEC="${REMOTE_HETZNER_WAIT_TIMEOUT_SEC:-600}" +REMOTE_HETZNER_WAIT_INTERVAL_SEC="${REMOTE_HETZNER_WAIT_INTERVAL_SEC:-5}" +REMOTE_BOOTSTRAP_URL="${REMOTE_BOOTSTRAP_URL:-https://raw.githubusercontent.com/modem-dev/baudbot/main/bootstrap.sh}" +REMOTE_TAILSCALE_INSTALL_URL="${REMOTE_TAILSCALE_INSTALL_URL:-https://tailscale.com/install.sh}" +REMOTE_TAILSCALE_WAIT_ATTEMPTS="${REMOTE_TAILSCALE_WAIT_ATTEMPTS:-40}" +REMOTE_TAILSCALE_WAIT_INTERVAL_SEC="${REMOTE_TAILSCALE_WAIT_INTERVAL_SEC:-3}" + +REMOTE_DEFAULT_HETZNER_SERVER_TYPE="${REMOTE_HETZNER_SERVER_TYPE:-cpx11}" +REMOTE_DEFAULT_HETZNER_IMAGE="${REMOTE_HETZNER_IMAGE:-ubuntu-24.04}" +REMOTE_DEFAULT_HETZNER_LOCATION="${REMOTE_HETZNER_LOCATION:-ash}" + +remote_usage() { + cat <<'EOF_USAGE' +Usage: baudbot remote [options] + +Commands: + install Interactive remote install (mode: hetzner|host) + repair Guided repair workflow for existing remote host + list List saved remote targets + status Show target status and checkpoints + resume Resume a previously interrupted install + +Install options: + --target + --mode hetzner|host + --host + --ssh-user (default: root) + --ssh-key + --hetzner-token (fallback: HETZNER_API_TOKEN) + --server-type (hetzner only, default: cpx11) + --image (hetzner only, default: ubuntu-24.04) + --location (hetzner only, default: ash) + --tailscale force Tailscale setup + --no-tailscale skip 
Tailscale setup + --tailscale-auth-key (fallback: TAILSCALE_AUTHKEY) + --resume + --dry-run + +Repair options: + --target | --host + --ssh-user + --ssh-key + --tailscale-auth-key (fallback: TAILSCALE_AUTHKEY) + --non-interactive-safe + --dry-run +EOF_USAGE +} + +remote_prompt_secret() { + local prompt="$1" + local value="" + printf "%s: " "$prompt" >&2 + read -r -s value + printf '\n' >&2 + printf '%s\n' "$value" +} + +remote_mode_or_die() { + local mode="$1" + case "$mode" in + hetzner|host) + return 0 + ;; + *) + remote_die "invalid mode '$mode' (expected hetzner|host)" + ;; + esac +} + +remote_target_from_host() { + local host="$1" + local normalized="" + normalized="$(printf '%s' "$host" | tr '[:upper:]' '[:lower:]' | tr -c 'a-z0-9-' '-' | sed -E 's/^-+//; s/-+$//; s/-+/-/g')" + if [ -z "$normalized" ]; then + normalized="remote-host" + fi + printf '%s\n' "$normalized" +} + +remote_checkpoint_phase() { + local mode="$1" + local checkpoint="$2" + + case "$checkpoint" in + target_selected|ssh_key_ready|server_provisioned|ssh_reachable) + if [ "$mode" = "hetzner" ]; then + printf 'provisioning\n' + else + printf 'installing\n' + fi + ;; + bootstrap_installed|baudbot_install_completed|doctor_passed|tailscale_connected) + printf 'installing\n' + ;; + completed) + printf 'ready\n' + ;; + *) + printf 'installing\n' + ;; + esac +} + +remote_run_bootstrap_remote() { + local ssh_user="$1" + local host="$2" + local ssh_key_path="$3" + + local cmd + cmd="if command -v curl >/dev/null 2>&1; then curl -fsSL '$REMOTE_BOOTSTRAP_URL' | bash; elif command -v wget >/dev/null 2>&1; then wget -qO- '$REMOTE_BOOTSTRAP_URL' | bash; else echo 'curl or wget is required on remote host' >&2; exit 1; fi" + remote_ssh_exec "$ssh_user" "$host" "$ssh_key_path" "$cmd" +} + +remote_run_install_remote() { + local ssh_user="$1" + local host="$2" + local ssh_key_path="$3" + + if ! 
remote_is_interactive; then
    remote_die "remote install requires an interactive terminal (or use --dry-run)"
  fi

  remote_ssh_exec_tty "$ssh_user" "$host" "$ssh_key_path" "baudbot install"
}

# Post-install smoke test: status then doctor, both via sudo.
remote_run_post_install_doctor() {
  local ssh_user="$1"
  local host="$2"
  local ssh_key_path="$3"

  remote_ssh_exec "$ssh_user" "$host" "$ssh_key_path" "sudo baudbot status"
  remote_ssh_exec "$ssh_user" "$host" "$ssh_key_path" "sudo baudbot doctor"
}

# Escape a value for safe embedding inside single quotes in a remote
# shell command ( ' -> '"'"' ).
remote_shell_single_quote() {
  printf "%s" "$1" | sed "s/'/'\"'\"'\"'\"'/g"
}

# Poll `tailscale status --json` on the remote host until BackendState is
# Running and a Tailscale IP is assigned; prints that IP.
remote_tailscale_wait_running() {
  local ssh_user="$1"
  local host="$2"
  local ssh_key_path="$3"

  local attempt=1
  while [ "$attempt" -le "$REMOTE_TAILSCALE_WAIT_ATTEMPTS" ]; do
    local status_json backend_state tailscale_ip
    status_json="$(remote_ssh_exec "$ssh_user" "$host" "$ssh_key_path" "if command -v tailscale >/dev/null 2>&1; then sudo tailscale status --json 2>/dev/null || true; fi" 2>/dev/null || true)"
    backend_state="$(printf '%s' "$status_json" | jq -er '.BackendState // empty' 2>/dev/null || true)"
    tailscale_ip="$(printf '%s' "$status_json" | jq -er '.Self.TailscaleIPs[0] // empty' 2>/dev/null || true)"

    if [ "$backend_state" = "Running" ] && [ -n "$tailscale_ip" ]; then
      printf '%s\n' "$tailscale_ip"
      return 0
    fi

    if [ "$attempt" -lt "$REMOTE_TAILSCALE_WAIT_ATTEMPTS" ]; then
      sleep "$REMOTE_TAILSCALE_WAIT_INTERVAL_SEC"
    fi
    attempt=$((attempt + 1))
  done

  return 1
}

# Install, start, and authenticate Tailscale on the target, then record
# the tailnet IP in state. tailscale_mode: enable|skip|auto (auto asks
# when interactive, otherwise skips). Dry-run records "disabled".
remote_configure_tailscale() {
  local target="$1"
  local ssh_user="$2"
  local host="$3"
  local ssh_key_path="$4"
  local tailscale_auth_key="$5"
  local tailscale_mode="$6"
  local dry_run="$7"

  if [ "$dry_run" = "1" ]; then
    remote_state_set_tailscale_enabled "$target" "false"
    remote_state_set_tailscale_ip "$target" ""
    return 0
  fi

  local effective_mode="$tailscale_mode"
  if [ "$effective_mode" = "auto" ]; then
    if remote_is_interactive; then
      if remote_confirm "Configure Tailscale on '$target' for secure remote access?" "y"; then
        effective_mode="enable"
      else
        effective_mode="skip"
      fi
    else
      effective_mode="skip"
    fi
  fi

  if [ "$effective_mode" = "skip" ]; then
    remote_state_set_tailscale_enabled "$target" "false"
    remote_state_set_tailscale_ip "$target" ""
    return 0
  fi

  if [ -z "$tailscale_auth_key" ] && ! remote_is_interactive; then
    remote_die "tailscale setup requested in non-interactive mode requires --tailscale-auth-key or TAILSCALE_AUTHKEY"
  fi

  remote_log "[$target] ensuring Tailscale is installed"
  remote_ssh_exec "$ssh_user" "$host" "$ssh_key_path" \
    "sudo sh -c 'if command -v tailscale >/dev/null 2>&1; then exit 0; fi; if command -v curl >/dev/null 2>&1; then curl -fsSL \"$REMOTE_TAILSCALE_INSTALL_URL\" | sh; elif command -v wget >/dev/null 2>&1; then wget -qO- \"$REMOTE_TAILSCALE_INSTALL_URL\" | sh; else echo \"curl or wget required to install tailscale\" >&2; exit 1; fi'"

  remote_ssh_exec "$ssh_user" "$host" "$ssh_key_path" \
    "sudo systemctl enable --now tailscaled >/dev/null 2>&1 || sudo service tailscaled start >/dev/null 2>&1 || true"

  if [ -n "$tailscale_auth_key" ]; then
    # NOTE(review): the auth key travels in the remote command line and is
    # briefly visible in `ps` on the target — acceptable for ephemeral
    # keys, worth confirming for long-lived ones.
    local escaped_auth_key
    escaped_auth_key="$(remote_shell_single_quote "$tailscale_auth_key")"
    remote_log "[$target] connecting Tailscale with auth key"
    remote_ssh_exec "$ssh_user" "$host" "$ssh_key_path" \
      "sudo tailscale up --authkey '$escaped_auth_key' --ssh --accept-routes"
  else
    # Interactive login path: surface the login URL from `tailscale up`
    # and wait for the operator to finish in the browser.
    local up_output=""
    if !
remote_state_exists "$target"; then
      remote_die "resume requested but target '$target' was not found"
    fi

    local existing_mode
    existing_mode="$(remote_state_get_field "$target" '.mode')"
    if [ -z "$existing_mode" ]; then
      remote_die "target '$target' has invalid state (missing mode)"
    fi
    if [ "$existing_mode" != "$mode" ]; then
      remote_die "target '$target' is mode '$existing_mode', not '$mode'"
    fi

    # On resume, only overwrite state fields the caller explicitly gave.
    if [ -n "$host" ]; then
      remote_state_set_host "$target" "$host"
    fi
    if [ -n "$ssh_user" ]; then
      remote_state_set_ssh_user "$target" "$ssh_user"
    fi
    if [ -n "$ssh_key_path" ]; then
      remote_state_set_ssh_key_path "$target" "$ssh_key_path"
    fi
    if [ "$mode" = "hetzner" ]; then
      remote_state_set_provider_field "$target" "location" "$location"
      remote_state_set_provider_field "$target" "server_type" "$server_type"
      remote_state_set_provider_field "$target" "image" "$image"
    fi
  else
    if remote_state_exists "$target"; then
      remote_die "target '$target' already exists; use --resume or choose a new --target"
    fi
    remote_state_init "$target" "$mode" "$host" "$ssh_user" "$ssh_key_path" "$provider_name" "$location" "$server_type" "$image"
  fi

  if ! remote_checkpoint_is_complete "$target" "target_selected"; then
    remote_checkpoint_mark_complete "$target" "target_selected" 0
  fi
}

# When the local SSH key was regenerated but provider resources from a
# previous run still exist, the stored server/key would no longer accept
# our key. Confirm with the operator, delete the stale Hetzner
# resources, and clear the corresponding state fields.
remote_cleanup_provider_if_key_mismatch() {
  local target="$1"
  local mode="$2"
  local token="$3"
  local key_preexisted="$4"
  local dry_run="$5"

  if [ "$mode" != "hetzner" ]; then
    return 0
  fi

  if [ "$key_preexisted" = "1" ]; then
    return 0
  fi

  local server_id ssh_key_id
  server_id="$(remote_state_get_field "$target" '.provider.server_id')"
  ssh_key_id="$(remote_state_get_field "$target" '.provider.ssh_key_id')"

  if [ -z "$server_id" ] && [ -z "$ssh_key_id" ]; then
    return 0
  fi

  if [ -z "$token" ]; then
    remote_die "local SSH key was regenerated and remote resources exist; provide --hetzner-token (or HETZNER_API_TOKEN) to reconcile"
  fi

  if ! remote_is_interactive; then
    remote_die "local SSH key was regenerated and remote resources exist; rerun interactively to confirm cleanup"
  fi

  if ! remote_confirm "Local SSH key was regenerated for '$target'. Delete stale Hetzner server/key resources before continuing?" "y"; then
    remote_die "aborting to avoid mismatched SSH credentials"
  fi

  if [ "$dry_run" = "1" ]; then
    remote_log "[dry-run] would delete stale Hetzner resources for '$target'"
  else
    # Best effort: deletion failures must not block re-provisioning.
    provider_delete_server "hetzner" "$token" "$server_id" || true
    provider_delete_ssh_key "hetzner" "$token" "$ssh_key_id" || true
  fi

  remote_state_set_provider_field "$target" "server_id" ""
  remote_state_set_provider_field "$target" "ssh_key_id" ""
  remote_state_set_host "$target" ""
}

# Execute one named install checkpoint for the target. Each checkpoint is
# idempotent so the lifecycle loop can retry and resume safely.
remote_execute_install_checkpoint() {
  local target="$1"
  local mode="$2"
  local checkpoint="$3"
  local hetzner_token="$4"
  local tailscale_mode="$5"
  local tailscale_auth_key="$6"
  local dry_run="$7"

  local host ssh_user ssh_key_path
  host="$(remote_state_get_field "$target" '.host')"
  ssh_user="$(remote_state_get_field "$target" '.ssh_user')"
  ssh_key_path="$(remote_state_get_field "$target" '.ssh_key_path')"

  case "$checkpoint" in
    target_selected)
      return 0
      ;;

    ssh_key_ready)
      local default_key key_preexisted key_comment resolved_key
      default_key="$(remote_keys_dir)/$target"
      if [ -z "$ssh_key_path" ]; then
        ssh_key_path="$default_key"
        remote_state_set_ssh_key_path "$target" "$ssh_key_path"
      fi

      key_preexisted=0
      if [ -f "$(remote_expand_path "$ssh_key_path")" ]; then
        key_preexisted=1
      fi

      if [ "$mode" = "hetzner" ] && [ "$dry_run" != "1" ]; then
        provider_validate_credentials "hetzner" "$hetzner_token"
      fi

      remote_cleanup_provider_if_key_mismatch "$target" "$mode" "$hetzner_token" "$key_preexisted" "$dry_run"

      key_comment="baudbot-remote-$target"
      resolved_key="$(remote_ensure_local_ssh_key "$ssh_key_path" "$key_comment" 1)"
      remote_state_set_ssh_key_path "$target" "$resolved_key"
      return 0
      ;;

    server_provisioned)
      if [ "$mode" != "hetzner" ]; then
        return 0
      fi

      if [ -z "$hetzner_token" ]; then
        remote_die "Hetzner mode requires --hetzner-token or HETZNER_API_TOKEN"
      fi

      local location server_type image
server_id ssh_key_id key_name pub_key existing_server_id server_ip + location="$(remote_state_get_field "$target" '.provider.location')" + server_type="$(remote_state_get_field "$target" '.provider.server_type')" + image="$(remote_state_get_field "$target" '.provider.image')" + server_id="$(remote_state_get_field "$target" '.provider.server_id')" + ssh_key_id="$(remote_state_get_field "$target" '.provider.ssh_key_id')" + ssh_key_path="$(remote_state_get_field "$target" '.ssh_key_path')" + + if [ -z "$location" ]; then + location="$REMOTE_DEFAULT_HETZNER_LOCATION" + fi + if [ -z "$server_type" ]; then + server_type="$REMOTE_DEFAULT_HETZNER_SERVER_TYPE" + fi + if [ -z "$image" ]; then + image="$REMOTE_DEFAULT_HETZNER_IMAGE" + fi + + if [ "$dry_run" = "1" ]; then + if [ -z "$host" ]; then + remote_state_set_host "$target" "dry-run-host" + fi + return 0 + fi + + key_name="baudbot-remote-$target" + pub_key="$(cat "${ssh_key_path}.pub")" + + if [ -z "$ssh_key_id" ]; then + ssh_key_id="$(provider_create_ssh_key "hetzner" "$hetzner_token" "$key_name" "$pub_key")" + remote_state_set_provider_field "$target" "ssh_key_id" "$ssh_key_id" + fi + + if [ -z "$server_id" ]; then + existing_server_id="$(remote_hetzner_find_server_id_by_name "$hetzner_token" "$target" || true)" + if [ -n "$existing_server_id" ]; then + if remote_is_interactive && remote_confirm "Existing Hetzner server '$target' found (id $existing_server_id). Delete and recreate?" 
"y"; then + provider_delete_server "hetzner" "$hetzner_token" "$existing_server_id" + else + remote_die "existing Hetzner server '$target' blocks provisioning" + fi + fi + + server_id="$(provider_create_server "hetzner" "$hetzner_token" "$target" "$server_type" "$image" "$location" "$ssh_key_id")" + remote_state_set_provider_field "$target" "server_id" "$server_id" + fi + + server_ip="$(provider_wait_server_running "hetzner" "$hetzner_token" "$server_id" "$REMOTE_HETZNER_WAIT_TIMEOUT_SEC" "$REMOTE_HETZNER_WAIT_INTERVAL_SEC")" + if [ -z "$server_ip" ]; then + remote_die "failed to obtain running server IP from Hetzner" + fi + remote_state_set_host "$target" "$server_ip" + return 0 + ;; + + ssh_reachable) + host="$(remote_state_get_field "$target" '.host')" + if [ -z "$host" ]; then + remote_die "target '$target' has no host configured" + fi + + if [ "$dry_run" = "1" ]; then + return 0 + fi + + if remote_ssh_wait_for_reachable "$ssh_user" "$host" "$ssh_key_path" "$REMOTE_SSH_REACHABLE_ATTEMPTS" "$REMOTE_SSH_REACHABLE_INTERVAL_SEC"; then + return 0 + fi + remote_error "SSH not reachable for $ssh_user@$host" + return 1 + ;; + + bootstrap_installed) + host="$(remote_state_get_field "$target" '.host')" + if [ -z "$host" ]; then + remote_die "target '$target' has no host configured" + fi + if [ "$dry_run" = "1" ]; then + return 0 + fi + remote_run_bootstrap_remote "$ssh_user" "$host" "$ssh_key_path" + return 0 + ;; + + baudbot_install_completed) + host="$(remote_state_get_field "$target" '.host')" + if [ -z "$host" ]; then + remote_die "target '$target' has no host configured" + fi + if [ "$dry_run" = "1" ]; then + return 0 + fi + remote_run_install_remote "$ssh_user" "$host" "$ssh_key_path" + return 0 + ;; + + doctor_passed) + host="$(remote_state_get_field "$target" '.host')" + if [ -z "$host" ]; then + remote_die "target '$target' has no host configured" + fi + if [ "$dry_run" = "1" ]; then + return 0 + fi + remote_run_post_install_doctor "$ssh_user" "$host" 
"$ssh_key_path" + return 0 + ;; + + tailscale_connected) + host="$(remote_state_get_field "$target" '.host')" + if [ -z "$host" ]; then + remote_die "target '$target' has no host configured" + fi + remote_configure_tailscale "$target" "$ssh_user" "$host" "$ssh_key_path" "$tailscale_auth_key" "$tailscale_mode" "$dry_run" + return 0 + ;; + + completed) + return 0 + ;; + + *) + remote_error "unknown checkpoint: $checkpoint" + return 1 + ;; + esac +} + +remote_run_install_lifecycle() { + local target="$1" + local mode="$2" + local hetzner_token="$3" + local tailscale_mode="$4" + local tailscale_auth_key="$5" + local dry_run="$6" + + while true; do + local restart_from_beginning=0 + local checkpoint="" + + while IFS= read -r checkpoint; do + [ -n "$checkpoint" ] || continue + + if remote_checkpoint_is_complete "$target" "$checkpoint"; then + continue + fi + + local phase retry_count + phase="$(remote_checkpoint_phase "$mode" "$checkpoint")" + if [ "$phase" != "ready" ]; then + remote_state_set_status "$target" "$phase" + fi + + retry_count="$(remote_checkpoint_retry_count "$target" "$checkpoint")" + while [ "$retry_count" -lt "$REMOTE_CHECKPOINT_MAX_RETRIES" ]; do + remote_log "[$target] checkpoint: $checkpoint" + + if remote_execute_install_checkpoint "$target" "$mode" "$checkpoint" "$hetzner_token" "$tailscale_mode" "$tailscale_auth_key" "$dry_run"; then + remote_checkpoint_mark_complete "$target" "$checkpoint" "$retry_count" + remote_state_clear_last_error "$target" + break + fi + + retry_count=$((retry_count + 1)) + remote_checkpoint_set_retry "$target" "$checkpoint" "$retry_count" + remote_state_set_last_error "$target" "checkpoint '$checkpoint' failed" + + if [ "$retry_count" -lt "$REMOTE_CHECKPOINT_MAX_RETRIES" ]; then + remote_warn "checkpoint '$checkpoint' failed (attempt $retry_count/$REMOTE_CHECKPOINT_MAX_RETRIES), retrying" + sleep 3 + continue + fi + + remote_state_set_status "$target" "failed" + + if remote_is_interactive && remote_confirm "Checkpoint 
'$checkpoint' failed after $REMOTE_CHECKPOINT_MAX_RETRIES attempts. Retry this install from the beginning?" "n"; then + remote_reset_install_progress "$target" + restart_from_beginning=1 + break + fi + + return 1 + done + + if [ "$restart_from_beginning" = "1" ]; then + break + fi + done < <(remote_install_checkpoint_order "$mode") + + if [ "$restart_from_beginning" = "1" ]; then + continue + fi + + break + done + + remote_state_set_status "$target" "ready" + remote_state_clear_last_error "$target" + remote_log "[$target] install completed" +} + +remote_cmd_install() { + local target="" + local mode="" + local host="" + local ssh_user="root" + local ssh_user_set=0 + local ssh_key_path="" + local hetzner_token="${HETZNER_API_TOKEN:-}" + local tailscale_mode="auto" + local tailscale_auth_key="${TAILSCALE_AUTHKEY:-}" + local resume=0 + local dry_run=0 + local location="$REMOTE_DEFAULT_HETZNER_LOCATION" + local server_type="$REMOTE_DEFAULT_HETZNER_SERVER_TYPE" + local image="$REMOTE_DEFAULT_HETZNER_IMAGE" + + while [ "$#" -gt 0 ]; do + case "$1" in + --target) + target="$2" + shift 2 + ;; + --mode) + mode="$2" + shift 2 + ;; + --host) + host="$2" + shift 2 + ;; + --ssh-user) + ssh_user="$2" + ssh_user_set=1 + shift 2 + ;; + --ssh-key) + ssh_key_path="$2" + shift 2 + ;; + --hetzner-token) + hetzner_token="$2" + shift 2 + ;; + --tailscale) + tailscale_mode="enable" + shift + ;; + --no-tailscale) + tailscale_mode="skip" + shift + ;; + --tailscale-auth-key) + tailscale_auth_key="$2" + shift 2 + ;; + --server-type) + server_type="$2" + shift 2 + ;; + --image) + image="$2" + shift 2 + ;; + --location) + location="$2" + shift 2 + ;; + --resume) + resume=1 + shift + ;; + --dry-run) + dry_run=1 + shift + ;; + -h|--help) + remote_usage + return 0 + ;; + *) + remote_die "unknown install option: $1" + ;; + esac + done + + remote_init_storage + + if [ -z "$target" ]; then + local target_default + if [ -n "$host" ]; then + target_default="$(remote_target_from_host "$host")" + else + 
target_default="baudbot-$(date +%Y%m%d%H%M%S)" + fi + + if remote_is_interactive; then + target="$(remote_prompt_default "Target name" "$target_default")" + else + target="$target_default" + fi + fi + remote_validate_target_name "$target" || return 1 + + if [ "$resume" = "1" ] && ! remote_state_exists "$target"; then + remote_die "target '$target' not found for resume" + fi + + if [ "$resume" = "1" ]; then + if [ -z "$mode" ]; then + mode="$(remote_state_get_field "$target" '.mode')" + fi + + local stored_mode + stored_mode="$(remote_state_get_field "$target" '.mode')" + if [ -n "$stored_mode" ]; then + mode="$stored_mode" + fi + fi + + if [ -z "$mode" ]; then + if remote_is_interactive; then + mode="$(remote_prompt_default "Install mode (hetzner|host)" "host")" + else + remote_die "--mode is required in non-interactive mode" + fi + fi + remote_mode_or_die "$mode" + remote_require_dependencies_install "$mode" + + if [ "$mode" = "host" ] && [ -z "$host" ] && [ "$resume" != "1" ]; then + if remote_is_interactive; then + host="$(remote_prompt_default "Remote host (IP or hostname)" "")" + else + remote_die "--host is required for host mode" + fi + fi + + if [ "$mode" = "hetzner" ] && [ -z "$hetzner_token" ] && [ "$dry_run" != "1" ]; then + if remote_is_interactive; then + hetzner_token="$(remote_prompt_secret "Hetzner API token")" + else + remote_die "Hetzner mode requires --hetzner-token or HETZNER_API_TOKEN" + fi + fi + + if [ -n "$ssh_key_path" ]; then + ssh_key_path="$(remote_expand_path "$ssh_key_path")" + else + ssh_key_path="$(remote_keys_dir)/$target" + fi + + if [ "$resume" = "1" ]; then + if [ -z "$host" ]; then + host="$(remote_state_get_field "$target" '.host')" + fi + if [ "$ssh_user_set" -eq 0 ]; then + ssh_user="$(remote_state_get_field "$target" '.ssh_user')" + ssh_user="${ssh_user:-root}" + fi + fi + + remote_prepare_state_install "$target" "$mode" "$host" "$ssh_user" "$ssh_key_path" "$location" "$server_type" "$image" "$resume" + + 
remote_run_install_lifecycle "$target" "$mode" "$hetzner_token" "$tailscale_mode" "$tailscale_auth_key" "$dry_run" +} + +remote_capture_remote_output() { + local __result_var="$1" + local ssh_user="$2" + local host="$3" + local ssh_key_path="$4" + local command="$5" + + local output="" + local rc=0 + if output="$(remote_ssh_exec "$ssh_user" "$host" "$ssh_key_path" "$command" 2>&1)"; then + rc=0 + else + rc=$? + fi + + printf -v "$__result_var" '%s' "$output" + return "$rc" +} + +remote_run_repair_action() { + local dry_run="$1" + local ssh_user="$2" + local host="$3" + local ssh_key_path="$4" + local label="$5" + local command="$6" + + if [ "$dry_run" = "1" ]; then + remote_log "[dry-run] $label: $command" + return 0 + fi + + remote_log "$label" + remote_ssh_exec "$ssh_user" "$host" "$ssh_key_path" "$command" +} + +remote_cmd_repair() { + local target="" + local host="" + local ssh_user="root" + local ssh_key_path="" + local tailscale_auth_key="${TAILSCALE_AUTHKEY:-}" + local non_interactive_safe=0 + local dry_run=0 + + while [ "$#" -gt 0 ]; do + case "$1" in + --target) + target="$2" + shift 2 + ;; + --host) + host="$2" + shift 2 + ;; + --ssh-user) + ssh_user="$2" + shift 2 + ;; + --ssh-key) + ssh_key_path="$2" + shift 2 + ;; + --tailscale-auth-key) + tailscale_auth_key="$2" + shift 2 + ;; + --non-interactive-safe) + non_interactive_safe=1 + shift + ;; + --dry-run) + dry_run=1 + shift + ;; + -h|--help) + remote_usage + return 0 + ;; + *) + remote_die "unknown repair option: $1" + ;; + esac + done + + remote_require_dependencies_repair + remote_init_storage + + if [ -z "$target" ] && [ -z "$host" ]; then + remote_die "repair requires --target or --host " + fi + + if [ -z "$target" ] && [ -n "$host" ]; then + target="$(remote_target_from_host "$host")" + fi + + remote_validate_target_name "$target" || return 1 + + if remote_state_exists "$target"; then + if [ -z "$host" ]; then + host="$(remote_state_get_field "$target" '.host')" + fi + if [ -z "$ssh_user" ] || [ 
"$ssh_user" = "root" ]; then + local state_ssh_user + state_ssh_user="$(remote_state_get_field "$target" '.ssh_user')" + if [ -n "$state_ssh_user" ]; then + ssh_user="$state_ssh_user" + fi + fi + if [ -z "$ssh_key_path" ]; then + ssh_key_path="$(remote_state_get_field "$target" '.ssh_key_path')" + fi + else + if [ -z "$host" ]; then + remote_die "target '$target' not found and no --host provided" + fi + remote_state_init "$target" "host" "$host" "$ssh_user" "$ssh_key_path" "none" "" "" "" + remote_checkpoint_mark_complete "$target" "target_selected" 0 + fi + + if [ -z "$host" ]; then + remote_die "repair target '$target' has no host configured" + fi + + if [ -n "$ssh_key_path" ]; then + ssh_key_path="$(remote_expand_path "$ssh_key_path")" + if [ "$dry_run" != "1" ] && [ ! -f "$ssh_key_path" ]; then + remote_die "ssh key not found: $ssh_key_path" + fi + remote_state_set_ssh_key_path "$target" "$ssh_key_path" + fi + + remote_state_set_host "$target" "$host" + remote_state_set_ssh_user "$target" "$ssh_user" + remote_state_set_status "$target" "repairing" + + local before_status_output="" + local before_doctor_output="" + local after_status_output="" + local after_doctor_output="" + local before_status_rc=0 + local before_doctor_rc=0 + local after_status_rc=0 + local after_doctor_rc=0 + + remote_log "[$target] collecting baseline diagnostics" + if [ "$dry_run" = "1" ]; then + before_status_output="[dry-run] skipped" + before_doctor_output="[dry-run] skipped" + else + if remote_capture_remote_output before_status_output "$ssh_user" "$host" "$ssh_key_path" "sudo baudbot status"; then + before_status_rc=0 + else + before_status_rc=$? + fi + if remote_capture_remote_output before_doctor_output "$ssh_user" "$host" "$ssh_key_path" "sudo baudbot doctor"; then + before_doctor_rc=0 + else + before_doctor_rc=$? 
+ fi + fi + + local -a safe_labels + local -a safe_commands + safe_labels=( + "sync env + restart" + "deploy" + "restart" + "doctor re-check" + "tailscale status" + ) + safe_commands=( + "sudo baudbot env sync --restart" + "sudo baudbot deploy" + "sudo baudbot restart" + "sudo baudbot doctor" + "if command -v tailscale >/dev/null 2>&1; then sudo tailscale status || true; else echo 'tailscale is not installed'; fi" + ) + + local i run_action=0 + for i in "${!safe_labels[@]}"; do + run_action=0 + if [ "$non_interactive_safe" = "1" ]; then + run_action=1 + elif remote_is_interactive; then + if remote_confirm "Run safe repair action: ${safe_labels[$i]}?" "y"; then + run_action=1 + fi + fi + + if [ "$run_action" = "1" ]; then + if ! remote_run_repair_action "$dry_run" "$ssh_user" "$host" "$ssh_key_path" "[$target] ${safe_labels[$i]}" "${safe_commands[$i]}"; then + remote_warn "safe action failed: ${safe_labels[$i]}" + fi + fi + done + + if [ "$non_interactive_safe" != "1" ] && remote_is_interactive; then + if remote_confirm "Run advanced action: rerun setup (sudo baudbot setup )?" "n"; then + local admin_user + admin_user="$(remote_prompt_default "Admin username for setup" "")" + if [ -n "$admin_user" ]; then + if ! remote_run_repair_action "$dry_run" "$ssh_user" "$host" "$ssh_key_path" "[$target] rerun setup" "sudo baudbot setup $admin_user"; then + remote_warn "advanced action failed: setup" + fi + fi + fi + + if remote_confirm "Run advanced action: reinstall using bootstrap + install?" "n"; then + if [ "$dry_run" = "1" ]; then + remote_log "[dry-run] advanced reinstall skipped" + else + remote_run_bootstrap_remote "$ssh_user" "$host" "$ssh_key_path" + remote_run_install_remote "$ssh_user" "$host" "$ssh_key_path" + fi + fi + + if remote_confirm "Run advanced action: install/re-auth Tailscale for remote access?" 
"n"; then + local repair_tailscale_key="$tailscale_auth_key" + if [ -z "$repair_tailscale_key" ] && remote_is_interactive; then + repair_tailscale_key="$(remote_prompt_secret "Tailscale auth key (leave empty for browser login)")" + fi + if ! remote_configure_tailscale "$target" "$ssh_user" "$host" "$ssh_key_path" "$repair_tailscale_key" "enable" "$dry_run"; then + remote_warn "advanced action failed: tailscale install/re-auth" + fi + fi + fi + + remote_log "[$target] collecting post-repair diagnostics" + if [ "$dry_run" = "1" ]; then + after_status_output="[dry-run] skipped" + after_doctor_output="[dry-run] skipped" + else + if remote_capture_remote_output after_status_output "$ssh_user" "$host" "$ssh_key_path" "sudo baudbot status"; then + after_status_rc=0 + else + after_status_rc=$? + fi + if remote_capture_remote_output after_doctor_output "$ssh_user" "$host" "$ssh_key_path" "sudo baudbot doctor"; then + after_doctor_rc=0 + else + after_doctor_rc=$? + fi + fi + + if [ "$dry_run" = "1" ] || { [ "$after_status_rc" -eq 0 ] && [ "$after_doctor_rc" -eq 0 ]; }; then + remote_state_set_status "$target" "ready" + remote_state_clear_last_error "$target" + else + remote_state_set_status "$target" "failed" + remote_state_set_last_error "$target" "repair health checks failed" + fi + + echo "" + echo "=== Repair Summary ($target) ===" + echo "Host: $host" + echo "Before: status rc=$before_status_rc, doctor rc=$before_doctor_rc" + echo "After: status rc=$after_status_rc, doctor rc=$after_doctor_rc" + echo "" + echo "--- Before status ---" + printf '%s\n' "$before_status_output" + echo "" + echo "--- Before doctor ---" + printf '%s\n' "$before_doctor_output" + echo "" + echo "--- After status ---" + printf '%s\n' "$after_status_output" + echo "" + echo "--- After doctor ---" + printf '%s\n' "$after_doctor_output" +} + +remote_cmd_list() { + remote_init_storage + + local found=0 + local file + printf "%-24s %-8s %-22s %-12s %-20s\n" "TARGET" "MODE" "HOST" "STATUS" "NEXT" + 
printf "%-24s %-8s %-22s %-12s %-20s\n" "------" "----" "----" "------" "----" + + for file in "$(remote_targets_dir)"/*.json; do + [ -e "$file" ] || continue + found=1 + + local name mode host status next + name="$(jq -er '.name // empty' "$file" 2>/dev/null || true)" + mode="$(jq -er '.mode // empty' "$file" 2>/dev/null || true)" + host="$(jq -er '.host // empty' "$file" 2>/dev/null || true)" + status="$(jq -er '.status // empty' "$file" 2>/dev/null || true)" + + if [ -z "$name" ]; then + continue + fi + + if [ -n "$mode" ]; then + next="$(remote_next_install_checkpoint "$name" "$mode")" + else + next="unknown" + fi + + printf "%-24s %-8s %-22s %-12s %-20s\n" "$name" "${mode:-?}" "${host:--}" "${status:--}" "$next" + done + + if [ "$found" -eq 0 ]; then + echo "No remote targets found." + fi +} + +remote_cmd_status() { + local target="$1" + + remote_validate_target_name "$target" || return 1 + if ! remote_state_exists "$target"; then + remote_die "target '$target' not found" + fi + + local mode host status last_error next_checkpoint tailscale_enabled tailscale_ip + mode="$(remote_state_get_field "$target" '.mode')" + host="$(remote_state_get_field "$target" '.host')" + status="$(remote_state_get_field "$target" '.status')" + last_error="$(remote_state_get_field "$target" '.last_error')" + next_checkpoint="$(remote_next_install_checkpoint "$target" "$mode")" + tailscale_enabled="$(remote_state_get_field "$target" '.tailscale.enabled')" + tailscale_ip="$(remote_state_get_field "$target" '.tailscale.ip')" + + echo "Target: $target" + echo "Mode: ${mode:--}" + echo "Host: ${host:--}" + echo "Status: ${status:--}" + echo "Next checkpoint: ${next_checkpoint:--}" + echo "Tailscale: ${tailscale_enabled:-false}" + if [ -n "$tailscale_ip" ]; then + echo "Tailscale IP: $tailscale_ip" + fi + if [ -n "$last_error" ]; then + echo "Last error: $last_error" + fi + + echo "" + echo "Checkpoints:" + + local checkpoint + while IFS= read -r checkpoint; do + [ -n "$checkpoint" ] || 
continue + local completed_at retry_count + completed_at="$(remote_state_get_field "$target" ".checkpoints[]? | select(.name == \"$checkpoint\") | .completed_at")" + retry_count="$(remote_checkpoint_retry_count "$target" "$checkpoint")" + if [ -n "$completed_at" ]; then + printf ' %-24s done (%s, retries=%s)\n' "$checkpoint" "$completed_at" "$retry_count" + else + printf ' %-24s pending (retries=%s)\n' "$checkpoint" "$retry_count" + fi + done < <(remote_install_checkpoint_order "$mode") +} + +remote_cmd_resume() { + local target="$1" + shift + + if ! remote_state_exists "$target"; then + remote_die "target '$target' not found" + fi + + local status + status="$(remote_state_get_field "$target" '.status')" + + if [ "$status" = "repairing" ]; then + remote_cmd_repair --target "$target" "$@" + return 0 + fi + + remote_cmd_install --target "$target" --resume "$@" +} + +main() { + local command="${1:-}" + shift || true + + case "$command" in + install) + remote_cmd_install "$@" + ;; + repair) + remote_cmd_repair "$@" + ;; + list) + remote_cmd_list + ;; + status) + if [ "$#" -ne 1 ]; then + remote_die "usage: baudbot remote status <target>" + fi + remote_cmd_status "$1" + ;; + resume) + if [ "$#" -lt 1 ]; then + remote_die "usage: baudbot remote resume <target> [options]" + fi + local target="$1" + shift + remote_cmd_resume "$target" "$@" + ;; + -h|--help|"") + remote_usage + ;; + *) + remote_die "unknown remote command: $command" + ;; + esac +} + +main "$@" diff --git a/bin/remote.test.sh b/bin/remote.test.sh new file mode 100755 index 0000000..93a184e --- /dev/null +++ b/bin/remote.test.sh @@ -0,0 +1,145 @@ +#!/bin/bash +# Tests for bin/remote.sh + +set -euo pipefail + +REPO_ROOT="$(cd "$(dirname "$0")/.." 
&& pwd)" +REMOTE_CLI="$REPO_ROOT/bin/remote.sh" + +TOTAL=0 +PASSED=0 +FAILED=0 + +run_test() { + local name="$1" + shift + local out + + TOTAL=$((TOTAL + 1)) + printf " %-45s " "$name" + + out="$(mktemp /tmp/baudbot-remote-cli-test-output.XXXXXX)" + if "$@" >"$out" 2>&1; then + echo "✓" + PASSED=$((PASSED + 1)) + else + echo "✗ FAILED" + tail -60 "$out" | sed 's/^/ /' + FAILED=$((FAILED + 1)) + fi + rm -f "$out" +} + +with_state_dir() { + local tmp + tmp="$(mktemp -d /tmp/baudbot-remote-cli.XXXXXX)" + local rc=0 + ( + set -euo pipefail + export BAUDBOT_REMOTE_DIR="$tmp" + "$@" + ) || rc=$? + rm -rf "$tmp" + return "$rc" +} + +test_install_requires_mode_non_interactive() { + with_state_dir bash -c ' + set -euo pipefail + if bash "$0" install --target demo >/tmp/baudbot-remote-missing-mode.out 2>&1; then + exit 1 + fi + grep -q -- "--mode is required" /tmp/baudbot-remote-missing-mode.out + rm -f /tmp/baudbot-remote-missing-mode.out + ' "$REMOTE_CLI" +} + +test_install_host_dry_run_completes() { + with_state_dir bash -c ' + set -euo pipefail + bash "$0" install --mode host --target demo --host 198.51.100.10 --dry-run + + state_file="$BAUDBOT_REMOTE_DIR/targets/demo.json" + [ -f "$state_file" ] + [ "$(jq -r ".status" "$state_file")" = "ready" ] + [ "$(jq -r ".mode" "$state_file")" = "host" ] + [ "$(jq -r ".host" "$state_file")" = "198.51.100.10" ] + [ "$(jq -r ".tailscale.enabled" "$state_file")" = "false" ] + + status_out="$(bash "$0" status demo)" + next="$(printf "%s\n" "$status_out" | awk -F": " "/Next checkpoint/ {print \$2}")" + [ "$next" = "completed" ] + printf "%s\n" "$status_out" | grep -q "Tailscale: false" + printf "%s\n" "$status_out" | grep -q "tailscale_connected.*done" + ' "$REMOTE_CLI" +} + +test_resume_missing_target_fails() { + with_state_dir bash -c ' + set -euo pipefail + if bash "$0" resume missing-target >/tmp/baudbot-remote-resume-missing.out 2>&1; then + exit 1 + fi + grep -q "not found" /tmp/baudbot-remote-resume-missing.out + rm -f 
/tmp/baudbot-remote-resume-missing.out + ' "$REMOTE_CLI" +} + +test_resume_existing_target_uses_saved_mode() { + with_state_dir bash -c ' + set -euo pipefail + bash "$0" install --mode host --target demo --host 198.51.100.10 --dry-run >/dev/null + + state_file="$BAUDBOT_REMOTE_DIR/targets/demo.json" + tmp_file="$(mktemp /tmp/baudbot-remote-resume-state.XXXXXX)" + jq ".checkpoints = [] | .status = \"failed\" | .last_error = \"interrupted\"" "$state_file" > "$tmp_file" + mv "$tmp_file" "$state_file" + + bash "$0" resume demo --dry-run >/dev/null + [ "$(jq -r ".status" "$state_file")" = "ready" ] + [ "$(jq -r ".mode" "$state_file")" = "host" ] + ' "$REMOTE_CLI" +} + +test_list_and_status_output() { + with_state_dir bash -c ' + set -euo pipefail + bash "$0" install --mode host --target demo --host 198.51.100.10 --dry-run >/dev/null + + list_out="$(bash "$0" list)" + status_out="$(bash "$0" status demo)" + + printf "%s\n" "$list_out" | grep -q "demo" + printf "%s\n" "$status_out" | grep -q "Status: ready" + ' "$REMOTE_CLI" +} + +test_repair_non_interactive_safe_dry_run() { + with_state_dir bash -c ' + set -euo pipefail + bash "$0" install --mode host --target demo --host 198.51.100.10 --dry-run >/dev/null + bash "$0" repair --target demo --non-interactive-safe --dry-run >/tmp/baudbot-remote-repair.out + + state_file="$BAUDBOT_REMOTE_DIR/targets/demo.json" + [ "$(jq -r ".status" "$state_file")" = "ready" ] + grep -q "Repair Summary" /tmp/baudbot-remote-repair.out + rm -f /tmp/baudbot-remote-repair.out + ' "$REMOTE_CLI" +} + +echo "=== remote cli tests ===" +echo "" + +run_test "install requires mode in non-interactive" test_install_requires_mode_non_interactive +run_test "host install dry-run completes" test_install_host_dry_run_completes +run_test "resume missing target fails" test_resume_missing_target_fails +run_test "resume existing target uses saved mode" test_resume_existing_target_uses_saved_mode +run_test "list and status show target" test_list_and_status_output 
+run_test "repair safe dry-run" test_repair_non_interactive_safe_dry_run + +echo "" +echo "=== $PASSED/$TOTAL passed, $FAILED failed ===" + +if [ "$FAILED" -gt 0 ]; then + exit 1 +fi diff --git a/bin/security-audit.sh b/bin/security-audit.sh index dfa583b..74add8c 100755 --- a/bin/security-audit.sh +++ b/bin/security-audit.sh @@ -52,6 +52,7 @@ finding() { INFO) echo " ℹ️ INFO: $title"; info=$((info + 1)) ;; esac [ -n "$detail" ] && echo " $detail" + return 0 } ok() { @@ -88,6 +89,61 @@ fix_skip() { fi } +get_stat_mode() { + local file_path="$1" + local mode="" + if mode=$(stat -c '%a' "$file_path" 2>/dev/null); then + printf '%s\n' "$mode" + return 0 + fi + if mode=$(stat -f '%Lp' "$file_path" 2>/dev/null); then + printf '%s\n' "$mode" + return 0 + fi + printf '???\n' + return 1 +} + +get_stat_owner() { + local file_path="$1" + local owner="" + if owner=$(stat -c '%U' "$file_path" 2>/dev/null); then + printf '%s\n' "$owner" + return 0 + fi + if owner=$(stat -f '%Su' "$file_path" 2>/dev/null); then + printf '%s\n' "$owner" + return 0 + fi + printf 'unknown\n' + return 1 +} + +normalize_mode_octal() { + local mode="$1" + if [[ ! "$mode" =~ ^[0-7]{3,4}$ ]]; then + return 1 + fi + if [ "${#mode}" -eq 4 ]; then + mode="${mode:1}" + fi + printf '%s\n' "$mode" +} + +is_group_or_world_readable() { + local normalized + normalized="$(normalize_mode_octal "$1")" || return 1 + local as_octal=$((8#$normalized)) + [ $((as_octal & 044)) -ne 0 ] +} + +is_world_readable() { + local normalized + normalized="$(normalize_mode_octal "$1")" || return 1 + local as_octal=$((8#$normalized)) + [ $((as_octal & 004)) -ne 0 ] +} + echo "" echo "🔒 Baudbot Security Audit" echo "========================" @@ -126,7 +182,7 @@ check_perms() { if [ ! 
-e "$path" ]; then return fi - actual=$(stat -c '%a' "$path" 2>/dev/null || echo "???") + actual=$(get_stat_mode "$path") if [ "$actual" = "$expected" ]; then ok "$desc ($actual)" else @@ -134,7 +190,7 @@ check_perms() { # Group/world readable secrets or state = critical if [ "$expected" = "600" ] || [ "$expected" = "700" ]; then # Check if actually group/world readable - if [ $((0$actual & 044)) -ne 0 ]; then + if is_group_or_world_readable "$actual"; then sev="CRITICAL" fi fi @@ -480,12 +536,14 @@ if [ -f "$AUDIT_LOG_PRIMARY" ]; then fi fi # Check permissions - log_perms=$(stat -c '%a' "$AUDIT_LOG_PRIMARY" 2>/dev/null || echo "???") - if [ $((0$log_perms & 004)) -eq 0 ]; then - ok "Audit log is not world-readable ($log_perms)" - else + log_perms=$(get_stat_mode "$AUDIT_LOG_PRIMARY") + if [ "$log_perms" = "???" ]; then + finding "WARN" "Could not determine audit log permissions" "$AUDIT_LOG_PRIMARY" + elif is_world_readable "$log_perms"; then finding "WARN" "Audit log is world-readable ($log_perms)" \ "Run: sudo chmod 660 $AUDIT_LOG_PRIMARY" + else + ok "Audit log is not world-readable ($log_perms)" fi elif [ -f "$AUDIT_LOG_FALLBACK" ]; then finding "WARN" "Audit log using fallback location ($AUDIT_LOG_FALLBACK)" \ @@ -513,7 +571,7 @@ if [ -d "$BAUDBOT_SRC/.git" ] && [ -r "$BAUDBOT_SRC/.git/hooks" ]; then hook_path="$BAUDBOT_SRC/.git/hooks/pre-commit" if [ -f "$hook_path" ]; then ok "Pre-commit hook installed" - hook_owner=$(stat -c '%U' "$hook_path" 2>/dev/null || echo "unknown") + hook_owner=$(get_stat_owner "$hook_path") if [ "$hook_owner" = "root" ]; then ok "Pre-commit hook is root-owned (tamper-proof)" else diff --git a/bin/test.sh b/bin/test.sh index ef2c940..8b5765c 100755 --- a/bin/test.sh +++ b/bin/test.sh @@ -79,6 +79,10 @@ run_shell_tests() { run "config flow" bash bin/config.test.sh run "deploy lib helpers" bash bin/lib/deploy-common.test.sh run "doctor lib helpers" bash bin/lib/doctor-common.test.sh + run "remote common lib" bash 
bin/lib/remote-common.test.sh + run "remote ssh lib" bash bin/lib/remote-ssh.test.sh + run "remote hetzner lib" bash bin/lib/remote-hetzner.test.sh + run "remote cli" bash bin/remote.test.sh run "update release flow" bash bin/update-release.test.sh run "rollback release" bash bin/rollback-release.test.sh echo "" diff --git a/docs/agents.md b/docs/agents.md index 6077ade..37ae911 100644 --- a/docs/agents.md +++ b/docs/agents.md @@ -28,6 +28,9 @@ It should remain lightweight on coding itself and focus on orchestration quality ## Dev-agent The dev-agent is a coding worker launched in a dedicated git worktree for each task. +Execution backend can be: +- native `pi`, or +- CLI (`claude` / `codex`) behind a session-control shim. Responsibilities: @@ -55,7 +58,7 @@ Responsibilities: - Control and sentry sessions are long-lived. - Dev sessions are ephemeral and tied to todos. -- Session-control sockets allow inter-agent messaging (`send_to_session`). +- Session-control sockets allow inter-agent messaging (`send_to_session`) for both native and CLI-backed dev-agents. - Naming conventions encode role and task context (for observability and cleanup). ## Concurrency diff --git a/docs/architecture.md b/docs/architecture.md index 24c524a..1e6f94c 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -39,7 +39,11 @@ control-agent (persistent) └── dev-agent-* (ephemeral task workers) ``` -Inter-session communication is handled over pi session-control sockets. +Dev agents can run on: +- native `pi` sessions, or +- CLI backends (`claude`, `codex`) wrapped by a session-control compatibility shim. + +Inter-session communication remains socket-based in both cases, so control-agent keeps using the same `send_to_session` / `list_sessions` workflow. 
## Data path summary diff --git a/docs/operations.md b/docs/operations.md index e33a25c..def84cf 100644 --- a/docs/operations.md +++ b/docs/operations.md @@ -38,6 +38,35 @@ Provision with a pinned pi version (optional): BAUDBOT_PI_VERSION=0.52.12 baudbot install ``` +## Remote install and repair + +`baudbot remote` is an opt-in operator workflow for remote provisioning/install/repair. It is local-CLI stateful (checkpoints + resume) and does not change normal runtime behavior unless you invoke it. + +```bash +# New Hetzner host (provision + install) +baudbot remote install --mode hetzner --target team-bot + +# Existing host install +baudbot remote install --mode host --target team-bot --host 203.0.113.10 --ssh-user root + +# Enable Tailscale during install (interactive login unless auth key provided) +baudbot remote install --mode host --target team-bot --host 203.0.113.10 --tailscale +# Non-interactive auth-key path: +baudbot remote install --mode host --target team-bot --host 203.0.113.10 --tailscale --tailscale-auth-key tskey-... + +# Checkpoint inspection and resume +baudbot remote list +baudbot remote status team-bot +baudbot remote resume team-bot + +# Guided repair +baudbot remote repair --target team-bot +# or host-only targeting: +baudbot remote repair --host 203.0.113.10 --ssh-user root --non-interactive-safe +``` + +Install checkpoints are persisted under `~/.baudbot/remote/targets/<target>.json`. SSH host keys are stored in `~/.baudbot/remote/known_hosts` with `StrictHostKeyChecking=accept-new`. 
+ ## Updating API keys after install ```bash diff --git a/package.json b/package.json index 54121ea..578c0da 100644 --- a/package.json +++ b/package.json @@ -4,7 +4,7 @@ "private": true, "scripts": { "test": "vitest run --config vitest.config.mjs", - "test:js": "vitest run --config vitest.config.mjs pi/extensions/heartbeat.test.mjs pi/extensions/memory.test.mjs test/legacy-node-tests.test.mjs test/broker-bridge.integration.test.mjs", + "test:js": "vitest run --config vitest.config.mjs pi/extensions/cli-session-shim.test.mjs pi/extensions/heartbeat.test.mjs pi/extensions/memory.test.mjs test/legacy-node-tests.test.mjs test/broker-bridge.integration.test.mjs", "test:shell": "vitest run --config vitest.config.mjs test/shell-scripts.test.mjs test/security-audit.test.mjs", "test:coverage": "vitest run --config vitest.config.mjs --coverage pi/extensions/heartbeat.test.mjs pi/extensions/memory.test.mjs test/legacy-node-tests.test.mjs", "lint": "npm run lint:js && npm run lint:shell", diff --git a/pi/extensions/cli-session-shim.mjs b/pi/extensions/cli-session-shim.mjs new file mode 100644 index 0000000..b75c7d4 --- /dev/null +++ b/pi/extensions/cli-session-shim.mjs @@ -0,0 +1,454 @@ +#!/usr/bin/env node + +import { execFile } from "node:child_process"; +import { promises as fs } from "node:fs"; +import * as net from "node:net"; +import * as os from "node:os"; +import * as path from "node:path"; + +const SENDER_INFO_PATTERN = /[\s\S]*?<\/sender_info>/g; +const SOCKET_SUFFIX = ".sock"; + +function parseArgs(argv) { + const parsed = {}; + + for (let i = 0; i < argv.length; i += 1) { + const raw = argv[i]; + if (!raw) continue; + + if (raw === "--help" || raw === "-h") { + parsed.help = true; + continue; + } + + if (!raw.startsWith("--")) continue; + + const withoutPrefix = raw.slice(2); + const eqIndex = withoutPrefix.indexOf("="); + + if (eqIndex !== -1) { + const key = withoutPrefix.slice(0, eqIndex); + const value = withoutPrefix.slice(eqIndex + 1); + parsed[key] = value; 
+ continue; + } + + const key = withoutPrefix; + const value = argv[i + 1]; + if (value && !value.startsWith("--")) { + parsed[key] = value; + i += 1; + continue; + } + + parsed[key] = "true"; + } + + return parsed; +} + +function usage() { + return `Usage: node cli-session-shim.mjs \\ + --session-id \\ + --session-name \\ + --tmux-session \\ + [--control-dir ] \\ + [--capture-lines ] \\ + [--turn-end-delay-ms ] \\ + [--abort-hard-kill-ms ] \\ + [--tmux-bin ]`; +} + +function toInt(value, fallback, min = 0) { + const parsed = Number.parseInt(String(value ?? ""), 10); + if (!Number.isFinite(parsed) || parsed < min) return fallback; + return parsed; +} + +function isErrnoException(error) { + return typeof error === "object" && error !== null && "code" in error; +} + +async function safeUnlink(targetPath) { + try { + await fs.unlink(targetPath); + } catch (error) { + if (isErrnoException(error) && error.code !== "ENOENT") { + throw error; + } + } +} + +function stripSenderInfo(text) { + return String(text).replace(SENDER_INFO_PATTERN, "").trim(); +} + +function compactLines(text, maxLines) { + const lines = String(text) + .split("\n") + .map((line) => line.replace(/\s+$/g, "")) + .filter((line) => line.trim().length > 0); + + if (lines.length === 0) return ""; + return lines.slice(-maxLines).join("\n"); +} + +function buildSummary(paneText) { + const compact = compactLines(paneText, 30); + if (!compact) { + return "No CLI output captured yet."; + } + + return `CLI output snapshot (most recent lines):\n\n${compact}`; +} + +function createExtractedMessage(content) { + return { + role: "assistant", + content, + timestamp: Date.now(), + }; +} + +function execFileAsync(file, args) { + return new Promise((resolve, reject) => { + execFile(file, args, { encoding: "utf8", maxBuffer: 1024 * 1024 }, (error, stdout, stderr) => { + if (error) { + const err = new Error(`${file} ${args.join(" ")} failed: ${stderr || error.message}`); + err.cause = error; + reject(err); + return; + } 
+ + resolve({ stdout, stderr }); + }); + }); +} + +function writeLine(socket, payload) { + try { + socket.write(`${JSON.stringify(payload)}\n`); + } catch { + // Ignore closed/broken sockets. + } +} + +async function main() { + const args = parseArgs(process.argv.slice(2)); + if (args.help) { + console.log(usage()); + process.exit(0); + } + + const sessionId = String(args["session-id"] || "").trim(); + const sessionName = String(args["session-name"] || "").trim(); + const tmuxSession = String(args["tmux-session"] || "").trim(); + + if (!sessionId || !sessionName || !tmuxSession) { + console.error(usage()); + process.exit(2); + } + + const controlDir = + String(args["control-dir"] || "").trim() || + path.join(os.homedir(), ".pi", "session-control"); + const captureLines = toInt(args["capture-lines"], 120, 20); + const turnEndDelayMs = toInt(args["turn-end-delay-ms"], 700, 0); + const defaultAbortHardKillMs = toInt( + args["abort-hard-kill-ms"] || process.env.CLI_SHIM_ABORT_HARD_KILL_MS, + 0, + 0, + ); + const tmuxBin = String(args["tmux-bin"] || process.env.CLI_SHIM_TMUX_BIN || "tmux").trim(); + + const socketPath = path.join(controlDir, `${sessionId}${SOCKET_SUFFIX}`); + const aliasPath = path.join(controlDir, `${sessionName}.alias`); + let server = null; + let shuttingDown = false; + let turnIndex = 0; + let lastMessage = createExtractedMessage("No CLI output captured yet."); + let sendQueue = Promise.resolve(); + const subscriptions = []; + + const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms)); + + async function runTmux(commandArgs) { + return await execFileAsync(tmuxBin, commandArgs); + } + + async function capturePaneText() { + const result = await runTmux([ + "capture-pane", + "-t", + tmuxSession, + "-p", + "-S", + `-${captureLines}`, + ]); + return result.stdout || ""; + } + + function upsertLastMessageFromPane(paneText) { + const compact = compactLines(paneText, 40); + if (!compact) return lastMessage; + lastMessage = 
createExtractedMessage(compact); + return lastMessage; + } + + function cleanupSubscriptionSocket(socket) { + for (let i = subscriptions.length - 1; i >= 0; i -= 1) { + if (subscriptions[i]?.socket === socket) { + subscriptions.splice(i, 1); + } + } + } + + function emitTurnEnd(data) { + if (subscriptions.length === 0) return; + + const pending = [...subscriptions]; + subscriptions.length = 0; + + for (const sub of pending) { + writeLine(sub.socket, { + type: "event", + event: "turn_end", + data, + subscriptionId: sub.subscriptionId, + }); + } + } + + function respond(socket, commandName, success, data, error, id) { + writeLine(socket, { + type: "response", + command: commandName, + success, + data, + error, + id, + }); + } + + async function handleCommand(socket, command) { + const id = typeof command.id === "string" ? command.id : undefined; + + if (!command || typeof command !== "object" || typeof command.type !== "string") { + respond(socket, "parse", false, undefined, "Invalid command", id); + return; + } + + if (command.type === "subscribe") { + if (command.event !== "turn_end") { + respond(socket, "subscribe", false, undefined, `Unknown event type: ${command.event}`, id); + return; + } + + const subscriptionId = id || `sub_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`; + subscriptions.push({ socket, subscriptionId }); + socket.once("close", () => cleanupSubscriptionSocket(socket)); + socket.once("error", () => cleanupSubscriptionSocket(socket)); + + respond(socket, "subscribe", true, { subscriptionId, event: "turn_end" }, undefined, id); + return; + } + + if (command.type === "get_message") { + try { + const paneText = await capturePaneText(); + const message = upsertLastMessageFromPane(paneText); + respond(socket, "get_message", true, { message }, undefined, id); + } catch (error) { + const message = error instanceof Error ? 
error.message : "Failed to capture tmux output"; + respond(socket, "get_message", false, undefined, message, id); + } + return; + } + + if (command.type === "get_summary") { + try { + const paneText = await capturePaneText(); + upsertLastMessageFromPane(paneText); + const summary = buildSummary(paneText); + respond(socket, "get_summary", true, { summary, model: "tmux-capture" }, undefined, id); + } catch (error) { + const message = error instanceof Error ? error.message : "Failed to summarize tmux output"; + respond(socket, "get_summary", false, undefined, message, id); + } + return; + } + + if (command.type === "abort") { + try { + await runTmux(["send-keys", "-t", tmuxSession, "C-c"]); + const requestedDelayMs = command.hardKillAfterMs ?? command.hard_kill_after_ms; + const hardKillDelayMs = toInt( + requestedDelayMs, + command.hard === true && defaultAbortHardKillMs === 0 ? 1500 : defaultAbortHardKillMs, + 0, + ); + if (hardKillDelayMs > 0) { + setTimeout(() => { + void runTmux(["kill-session", "-t", tmuxSession]).catch(() => { + // Ignore failed escalation; session may already be gone. + }); + }, hardKillDelayMs); + } + respond( + socket, + "abort", + true, + { delivered: true, hardKillScheduledMs: hardKillDelayMs > 0 ? hardKillDelayMs : undefined }, + undefined, + id, + ); + } catch (error) { + const message = error instanceof Error ? error.message : "Failed to abort session"; + respond(socket, "abort", false, undefined, message, id); + } + return; + } + + if (command.type === "clear") { + respond( + socket, + "clear", + false, + undefined, + "Clear is not supported for CLI-backed sessions", + id, + ); + return; + } + + if (command.type === "send") { + const rawMessage = typeof command.message === "string" ? 
command.message : ""; + const message = stripSenderInfo(rawMessage); + + if (!message) { + respond(socket, "send", false, undefined, "Missing message", id); + return; + } + + turnIndex += 1; + const nextTurn = turnIndex; + sendQueue = sendQueue + .then(async () => { + await runTmux(["send-keys", "-t", tmuxSession, "-l", message]); + await runTmux(["send-keys", "-t", tmuxSession, "Enter"]); + if (turnEndDelayMs > 0) { + await sleep(turnEndDelayMs); + } + + const paneText = await capturePaneText(); + const extracted = upsertLastMessageFromPane(paneText); + emitTurnEnd({ message: extracted, turnIndex: nextTurn }); + }) + .catch((error) => { + const errorMessage = error instanceof Error ? error.message : "send queue failed"; + emitTurnEnd({ message: lastMessage, turnIndex: nextTurn, error: errorMessage }); + }); + + respond(socket, "send", true, { delivered: true, mode: command.mode || "steer" }, undefined, id); + return; + } + + respond(socket, command.type, false, undefined, `Unsupported command: ${command.type}`, id); + } + + async function startServer() { + await fs.mkdir(controlDir, { recursive: true }); + await safeUnlink(socketPath); + await safeUnlink(aliasPath); + + server = net.createServer((socket) => { + socket.setEncoding("utf8"); + let buffer = ""; + + socket.on("data", (chunk) => { + buffer += chunk; + let newlineIndex = buffer.indexOf("\n"); + + while (newlineIndex !== -1) { + const line = buffer.slice(0, newlineIndex).trim(); + buffer = buffer.slice(newlineIndex + 1); + newlineIndex = buffer.indexOf("\n"); + if (!line) continue; + + let command; + try { + command = JSON.parse(line); + } catch (error) { + const message = error instanceof Error ? 
error.message : "Failed to parse command"; + respond(socket, "parse", false, undefined, message, undefined); + continue; + } + + void handleCommand(socket, command); + } + }); + + socket.on("close", () => cleanupSubscriptionSocket(socket)); + socket.on("error", () => cleanupSubscriptionSocket(socket)); + }); + + await new Promise((resolve, reject) => { + if (!server) { + reject(new Error("server is not initialized")); + return; + } + + server.once("error", reject); + server.listen(socketPath, async () => { + try { + await fs.symlink(`${sessionId}${SOCKET_SUFFIX}`, aliasPath); + resolve(); + } catch (error) { + reject(error); + } + }); + }); + } + + async function shutdown(exitCode = 0) { + if (shuttingDown) return; + shuttingDown = true; + + try { + if (server) { + await new Promise((resolve) => { + server.close(() => resolve(undefined)); + }); + } + } catch { + // Ignore shutdown errors. + } + + try { + await safeUnlink(aliasPath); + await safeUnlink(socketPath); + } finally { + process.exit(exitCode); + } + } + + process.on("SIGINT", () => { + void shutdown(130); + }); + process.on("SIGTERM", () => { + void shutdown(143); + }); + + try { + await startServer(); + } catch (error) { + const message = error instanceof Error ? 
error.message : "Unknown startup error"; + console.error(`cli-session-shim failed to start: ${message}`); + await shutdown(1); + return; + } + + console.log(`cli-session-shim ready: ${sessionName} (${sessionId}) at ${socketPath}`); +} + +void main(); diff --git a/pi/extensions/cli-session-shim.test.mjs b/pi/extensions/cli-session-shim.test.mjs new file mode 100644 index 0000000..3d7e2d8 --- /dev/null +++ b/pi/extensions/cli-session-shim.test.mjs @@ -0,0 +1,396 @@ +import { spawn } from "node:child_process"; +import { describe, it, beforeEach, afterEach } from "vitest"; +import assert from "node:assert/strict"; +import fs from "node:fs"; +import path from "node:path"; +import os from "node:os"; +import net from "node:net"; + +const REPO_ROOT = path.resolve(path.dirname(new URL(import.meta.url).pathname), "../.."); +const SHIM_SCRIPT = path.join(REPO_ROOT, "pi/extensions/cli-session-shim.mjs"); + +let tmpDir = ""; +let controlDir = ""; +let tmuxLogPath = ""; +let capturePath = ""; +let tmuxScriptPath = ""; +let unixSocketSupportCache = null; + +function sleep(ms) { + return new Promise((resolve) => setTimeout(resolve, ms)); +} + +function createTempDir(prefix) { + const roots = ["/tmp", os.tmpdir()]; + for (const root of roots) { + try { + return fs.mkdtempSync(path.join(root, prefix)); + } catch { + // try next root + } + } + throw new Error(`failed to create temp dir for prefix: ${prefix}`); +} + +function setupFixture() { + tmpDir = createTempDir("cli-shim-test-"); + controlDir = path.join(tmpDir, "session-control"); + tmuxLogPath = path.join(tmpDir, "tmux.log"); + capturePath = path.join(tmpDir, "capture.txt"); + tmuxScriptPath = path.join(tmpDir, "fake-tmux.sh"); + + fs.mkdirSync(controlDir, { recursive: true }); + fs.writeFileSync(tmuxLogPath, "", "utf8"); + fs.writeFileSync(capturePath, "", "utf8"); + + fs.writeFileSync( + tmuxScriptPath, + `#!/usr/bin/env bash +set -euo pipefail +log_file="${tmuxLogPath}" +capture_file="${capturePath}" +cmd="\${1:-}" +if [ -z 
"$cmd" ]; then + exit 1 +fi +if [ "$cmd" = "send-keys" ]; then + shift + if [ "$1" = "-t" ]; then + shift 2 + fi + if [ "\${1:-}" = "-l" ]; then + shift + fi + message="\${1:-}" + if [ "$message" = "C-c" ]; then + printf '%s\\n' "abort" >> "$log_file" + elif [ "$message" = "Enter" ]; then + printf '%s\\n' "enter" >> "$log_file" + else + printf '%s\\n' "send:$message" >> "$log_file" + fi + exit 0 +fi +if [ "$cmd" = "kill-session" ]; then + printf '%s\\n' "kill-session" >> "$log_file" + exit 0 +fi +if [ "$cmd" = "capture-pane" ]; then + cat "$capture_file" + exit 0 +fi +printf '%s\\n' "unexpected:$cmd" >> "$log_file" +exit 1 +`, + "utf8", + ); + fs.chmodSync(tmuxScriptPath, 0o755); +} + +function teardownFixture() { + if (tmpDir) { + fs.rmSync(tmpDir, { recursive: true, force: true }); + tmpDir = ""; + controlDir = ""; + tmuxLogPath = ""; + capturePath = ""; + tmuxScriptPath = ""; + } +} + +async function startShim({ sessionId, sessionName }) { + const shim = spawn( + "node", + [ + SHIM_SCRIPT, + "--session-id", + sessionId, + "--session-name", + sessionName, + "--tmux-session", + sessionName, + "--control-dir", + controlDir, + "--turn-end-delay-ms", + "100", + "--capture-lines", + "80", + "--tmux-bin", + tmuxScriptPath, + ], + { + cwd: REPO_ROOT, + env: process.env, + stdio: ["ignore", "pipe", "pipe"], + }, + ); + + let stdout = ""; + let stderr = ""; + shim.stdout.on("data", (chunk) => { + stdout += chunk.toString(); + }); + shim.stderr.on("data", (chunk) => { + stderr += chunk.toString(); + }); + + const socketPath = path.join(controlDir, `${sessionId}.sock`); + const aliasPath = path.join(controlDir, `${sessionName}.alias`); + + for (let i = 0; i < 80; i += 1) { + if (fs.existsSync(socketPath) && fs.existsSync(aliasPath)) { + return { shim, socketPath, aliasPath }; + } + + if (shim.exitCode != null) { + throw new Error(`shim exited early: code=${shim.exitCode} stdout=${stdout} stderr=${stderr}`); + } + + await sleep(50); + } + + throw new Error(`shim failed to 
start: stdout=${stdout} stderr=${stderr}`); +} + +async function stopShim(shim) { + if (!shim || shim.exitCode != null) return; + + shim.kill("SIGTERM"); + await new Promise((resolve) => { + const timer = setTimeout(() => { + if (shim.exitCode == null) { + shim.kill("SIGKILL"); + } + resolve(undefined); + }, 2000); + + shim.once("exit", () => { + clearTimeout(timer); + resolve(undefined); + }); + }); +} + +async function unixSocketsAvailable() { + if (unixSocketSupportCache != null) { + return unixSocketSupportCache; + } + + const probePath = path.join(tmpDir, "probe.sock"); + unixSocketSupportCache = await new Promise((resolve) => { + const server = net.createServer(); + server.once("error", () => resolve(false)); + server.listen(probePath, () => { + server.close(() => { + try { + fs.unlinkSync(probePath); + } catch { + // ignore + } + resolve(true); + }); + }); + }); + return unixSocketSupportCache; +} + +function sendRpc(socketPath, command, options = {}) { + const waitForEvent = options.waitForEvent === true; + + return new Promise((resolve, reject) => { + const socket = net.createConnection(socketPath); + socket.setEncoding("utf8"); + + const timeout = setTimeout(() => { + socket.destroy(new Error("timeout")); + }, 5000); + + let buffer = ""; + let response = null; + + const cleanup = () => { + clearTimeout(timeout); + socket.removeAllListeners(); + }; + + socket.on("connect", () => { + socket.write(`${JSON.stringify(command)}\n`); + if (waitForEvent) { + socket.write(`${JSON.stringify({ type: "subscribe", event: "turn_end" })}\n`); + } + }); + + socket.on("data", (chunk) => { + buffer += chunk; + let idx = buffer.indexOf("\n"); + while (idx !== -1) { + const line = buffer.slice(0, idx).trim(); + buffer = buffer.slice(idx + 1); + idx = buffer.indexOf("\n"); + if (!line) continue; + + const parsed = JSON.parse(line); + if (parsed.type === "response" && parsed.command === command.type) { + response = parsed; + if (!waitForEvent) { + cleanup(); + socket.end(); + 
resolve({ response }); + return; + } + continue; + } + + if (waitForEvent && parsed.type === "event" && parsed.event === "turn_end") { + cleanup(); + socket.end(); + resolve({ response, event: parsed }); + return; + } + } + }); + + socket.on("error", (error) => { + cleanup(); + reject(error); + }); + }); +} + +async function hasActiveDevAgentsLikeIdleCompact(controlRoot) { + const entries = fs.readdirSync(controlRoot, { withFileTypes: true }); + for (const entry of entries) { + if (!entry.name.endsWith(".alias")) continue; + const aliasName = entry.name.slice(0, -".alias".length); + if (!aliasName.startsWith("dev-agent-")) continue; + + const target = fs.readlinkSync(path.join(controlRoot, entry.name)); + const socketPath = path.join(controlRoot, target); + + const alive = await new Promise((resolve) => { + const socket = net.createConnection(socketPath); + const timer = setTimeout(() => { + socket.destroy(); + resolve(false); + }, 300); + socket.once("connect", () => { + clearTimeout(timer); + socket.end(); + resolve(true); + }); + socket.once("error", () => { + clearTimeout(timer); + resolve(false); + }); + }); + + if (alive) return true; + } + + return false; +} + +describe("cli-session-shim", () => { + beforeEach(setupFixture); + afterEach(teardownFixture); + + it("creates and cleans up socket + alias", async () => { + if (!(await unixSocketsAvailable())) return; + + const sessionId = "11111111-1111-4111-8111-111111111111"; + const sessionName = "dev-agent-myapp-aaaa1111"; + const { shim, socketPath, aliasPath } = await startShim({ sessionId, sessionName }); + + assert.ok(fs.existsSync(socketPath), "socket should exist"); + assert.ok(fs.existsSync(aliasPath), "alias should exist"); + assert.equal(fs.readlinkSync(aliasPath), `${sessionId}.sock`); + + await stopShim(shim); + + assert.ok(!fs.existsSync(socketPath), "socket should be removed on shutdown"); + assert.ok(!fs.existsSync(aliasPath), "alias should be removed on shutdown"); + }); + + it("handles 
send/get_message/get_summary/abort/clear", async () => { + if (!(await unixSocketsAvailable())) return; + + const sessionId = "22222222-2222-4222-8222-222222222222"; + const sessionName = "dev-agent-myapp-bbbb2222"; + const { shim, socketPath } = await startShim({ sessionId, sessionName }); + + fs.writeFileSync(capturePath, "Assistant: waiting for task\n", "utf8"); + + const withEvent = await sendRpc( + socketPath, + { + type: "send", + message: "Implement fix\n\n{\"sessionName\":\"control-agent\"}", + mode: "follow_up", + }, + { waitForEvent: true }, + ); + + assert.equal(withEvent.response.success, true); + assert.equal(withEvent.event.type, "event"); + + const tmuxLog = fs.readFileSync(tmuxLogPath, "utf8"); + assert.ok(tmuxLog.includes("send:Implement fix"), "message should be delivered to tmux"); + assert.ok(!tmuxLog.includes("sender_info"), "sender_info tag should be stripped before tmux delivery"); + + fs.writeFileSync(capturePath, "Updated output line\nSecond line\n", "utf8"); + + const getMessage = await sendRpc(socketPath, { type: "get_message" }); + assert.equal(getMessage.response.success, true); + assert.ok(getMessage.response.data.message.content.includes("Updated output line")); + + const getSummary = await sendRpc(socketPath, { type: "get_summary" }); + assert.equal(getSummary.response.success, true); + assert.ok(getSummary.response.data.summary.includes("CLI output snapshot")); + + const abortResult = await sendRpc(socketPath, { type: "abort" }); + assert.equal(abortResult.response.success, true); + + const afterAbortLog = fs.readFileSync(tmuxLogPath, "utf8"); + assert.ok(afterAbortLog.includes("abort"), "abort should send Ctrl+C to tmux"); + + const clearResult = await sendRpc(socketPath, { type: "clear" }); + assert.equal(clearResult.response.success, false); + assert.ok(clearResult.response.error.includes("not supported")); + + await stopShim(shim); + }); + + it("supports optional abort escalation to tmux kill-session", async () => { + if 
(!(await unixSocketsAvailable())) return; + + const sessionId = "44444444-4444-4444-8444-444444444444"; + const sessionName = "dev-agent-myapp-dddd4444"; + const { shim, socketPath } = await startShim({ sessionId, sessionName }); + + const abortResult = await sendRpc(socketPath, { + type: "abort", + hard: true, + hardKillAfterMs: 50, + }); + assert.equal(abortResult.response.success, true); + + await sleep(120); + const tmuxLog = fs.readFileSync(tmuxLogPath, "utf8"); + assert.ok(tmuxLog.includes("kill-session"), "hard abort should escalate to tmux kill-session"); + + await stopShim(shim); + }); + + it("is visible to idle-compact style dev-agent detection via alias+socket", async () => { + if (!(await unixSocketsAvailable())) return; + + const sessionId = "33333333-3333-4333-8333-333333333333"; + const sessionName = "dev-agent-myapp-cccc3333"; + const { shim } = await startShim({ sessionId, sessionName }); + + const detected = await hasActiveDevAgentsLikeIdleCompact(controlDir); + assert.equal(detected, true); + + await stopShim(shim); + }); +}); diff --git a/pi/skills/control-agent/HEARTBEAT.md b/pi/skills/control-agent/HEARTBEAT.md index c71acbf..4ae0cb1 100644 --- a/pi/skills/control-agent/HEARTBEAT.md +++ b/pi/skills/control-agent/HEARTBEAT.md @@ -2,7 +2,7 @@ Check each item and take action only if something is wrong. 
-- Check all agent sessions are alive (`list_sessions` — confirm `sentry-agent` exists, check for orphaned `dev-agent-*` sessions with no matching active todo) +- Check all agent sessions are alive (`list_sessions` — confirm `sentry-agent` exists, check for orphaned `dev-agent-*` sessions with no matching active todo; CLI-backed dev agents are visible here through the session-control shim) - Verify Slack bridge is responsive (`curl -s -o /dev/null -w '%{http_code}' -X POST http://127.0.0.1:7890/send -H 'Content-Type: application/json' -d '{}'` → should return 400) - If `BAUDBOT_EXPERIMENTAL=1`, check email monitor is running (`email_monitor status` — should show active) - Check for stale worktrees in `~/workspace/worktrees/` that don't correspond to active in-progress todos — clean them up with `git worktree remove` diff --git a/pi/skills/control-agent/SKILL.md b/pi/skills/control-agent/SKILL.md index bfa932d..c4782de 100644 --- a/pi/skills/control-agent/SKILL.md +++ b/pi/skills/control-agent/SKILL.md @@ -124,6 +124,9 @@ Dev agents are **ephemeral and task-scoped**. Each agent: - **Maximum 4 dev agents** running simultaneously - Before spawning, check `list_sessions` and count sessions matching `dev-agent-*` +- `list_sessions` is the source of truth for active dev agents. It includes: + - Native `pi` dev-agents + - CLI-backed dev-agents exposed through the session-control shim - If at limit, wait for an agent to finish before spawning a new one ### Known Repos @@ -206,6 +209,21 @@ If dev-agent reports repeated failures (e.g. CI failing after 3+ fix attempts, o ## Spawning a Dev Agent +### Backend Selection + +Choose backend per task using `DEV_AGENT_BACKEND` (default: `pi`): + +```bash +BACKEND="${DEV_AGENT_BACKEND:-pi}" # pi | claude-code | codex | auto +``` + +Override backend for a specific task when needed (for example, user requests Claude Code). + +If `BACKEND=auto`, select in this order: +1. `claude-code` if `claude` is available +2. 
`codex` if `codex` is available +3. fallback `pi` + Pick the model based on which API key is available (check env vars in this order): **Coding / orchestration (top-tier):** @@ -217,6 +235,8 @@ Pick the model based on which API key is available (check env vars in this order | `GEMINI_API_KEY` | `google/gemini-3-pro-preview` | | `OPENCODE_ZEN_API_KEY` | `opencode-zen/claude-opus-4-6` | +### Spawning a Dev Agent (pi backend) + Full procedure for spinning up a task-scoped dev agent: ```bash @@ -251,6 +271,47 @@ tmux new-session -d -s $SESSION_NAME \ **Model note**: Dev agents use the top-tier model from the table above. For cheaper tasks (e.g. read-only analysis), use the cheap model from the sentry-agent table instead. +### Spawning a Dev Agent (CLI backend via shim) + +Use this for `BACKEND=claude-code` or `BACKEND=codex`. The CLI dev-agent is exposed through a session-control shim, so you still use `send_to_session` and `list_sessions` normally. + +```bash +# Variables +BACKEND="claude-code" # or codex +REPO=myapp +REPO_PATH=~/workspace/$REPO +TODO_SHORT=a8b7b331 +BRANCH=fix/some-descriptive-name +SESSION_NAME=dev-agent-${REPO}-${TODO_SHORT} + +# 1. Create the worktree +cd $REPO_PATH +git fetch origin +git worktree add ~/workspace/worktrees/$BRANCH -b $BRANCH origin/main + +# 2. Launch CLI runner in tmux (runner starts shim + backend CLI) +tmux new-session -d -s $SESSION_NAME \ + "cd ~/workspace/worktrees/$BRANCH && \ + export PATH=\$HOME/.varlock/bin:\$HOME/opt/node-v22.14.0-linux-x64/bin:\$PATH && \ + exec varlock run --path ~/.config/ -- \ + bash ~/.pi/agent/skills/control-agent/scripts/run-cli-agent.sh \ + --backend $BACKEND \ + --worktree ~/workspace/worktrees/$BRANCH \ + --session-name $SESSION_NAME \ + --todo-id $TODO_SHORT \ + --repo $REPO" + +# 3. 
Wait for shim startup, then send the task via send_to_session (not tmux send-keys) +sleep 10 +send_to_session sessionName="$SESSION_NAME" action="send" mode="steer" message="Your task: " +``` + +**CLI backend notes:** +- Keep using `send_to_session` for initial task + follow-ups +- Keep using `list_sessions` for health/orphan checks +- `get_message` / `get_summary` work through the shim for spot checks +- `abort` works through `send_to_session` (mapped to Ctrl+C on the tmux session) + ## Cleanup After a dev agent reports completion: diff --git a/pi/skills/control-agent/scripts/bb-update.sh b/pi/skills/control-agent/scripts/bb-update.sh new file mode 100755 index 0000000..2c7b4be --- /dev/null +++ b/pi/skills/control-agent/scripts/bb-update.sh @@ -0,0 +1,234 @@ +#!/usr/bin/env bash + +set -euo pipefail + +usage() { + cat <<'USAGE' +Usage: bb-update.sh "message text" + +Environment (optional): + BB_SESSION_ID Current session UUID for sender_info + BB_SESSION_NAME Current session alias for sender_info + BB_CONTROL_SESSION Target control session alias (default: control-agent) + BB_CONTROL_SOCKET Target socket path override + BB_CONTROL_DIR Session-control directory (default: ~/.pi/session-control) + BB_MODE RPC send mode (default: follow_up) +USAGE +} + +if [ $# -lt 1 ]; then + usage >&2 + exit 2 +fi + +MESSAGE="$*" +MODE="${BB_MODE:-follow_up}" +CONTROL_SESSION="${BB_CONTROL_SESSION:-control-agent}" +CONTROL_DIR="${BB_CONTROL_DIR:-$HOME/.pi/session-control}" +SESSION_ID="${BB_SESSION_ID:-}" +SESSION_NAME="${BB_SESSION_NAME:-}" + +resolve_socket() { + if [ -n "${BB_CONTROL_SOCKET:-}" ] && [ -S "${BB_CONTROL_SOCKET}" ]; then + printf '%s\n' "$BB_CONTROL_SOCKET" + return 0 + fi + + local alias_path="$CONTROL_DIR/$CONTROL_SESSION.alias" + if [ -L "$alias_path" ]; then + local target + target="$(readlink "$alias_path")" + if [[ "$target" != /* ]]; then + target="$CONTROL_DIR/$target" + fi + if [ -S "$target" ]; then + printf '%s\n' "$target" + return 0 + fi + fi + + local 
direct_path="$CONTROL_DIR/$CONTROL_SESSION.sock" + if [ -S "$direct_path" ]; then + printf '%s\n' "$direct_path" + return 0 + fi + + return 1 +} + +SOCKET_PATH="$(resolve_socket || true)" +if [ -z "$SOCKET_PATH" ]; then + printf '%s\n' "bb-update: unable to resolve control socket for session '$CONTROL_SESSION'" >&2 + exit 1 +fi + +build_payload() { + if command -v python3 >/dev/null 2>&1; then + python3 - "$MESSAGE" "$MODE" "$SESSION_ID" "$SESSION_NAME" <<'PY' +import json +import sys + +message = sys.argv[1] +mode = sys.argv[2] +session_id = sys.argv[3] +session_name = sys.argv[4] + +sender = {} +if session_id: + sender["sessionId"] = session_id +if session_name: + sender["sessionName"] = session_name + +suffix = "" +if sender: + suffix = "\n\n" + json.dumps(sender, separators=(",", ":")) + "" + +payload = { + "type": "send", + "message": message + suffix, + "mode": mode, +} + +print(json.dumps(payload, separators=(",", ":"))) +PY + return 0 + fi + + if command -v node >/dev/null 2>&1; then + node -e ' +const message = process.argv[1]; +const mode = process.argv[2]; +const sessionId = process.argv[3]; +const sessionName = process.argv[4]; +const sender = {}; +if (sessionId) sender.sessionId = sessionId; +if (sessionName) sender.sessionName = sessionName; +const suffix = Object.keys(sender).length > 0 + ? 
"\n\n" + JSON.stringify(sender) + "" + : ""; +const payload = { type: "send", message: message + suffix, mode }; +process.stdout.write(JSON.stringify(payload)); +' "$MESSAGE" "$MODE" "$SESSION_ID" "$SESSION_NAME" + return 0 + fi + + printf '%s\n' "bb-update: python3 or node is required to build payload" >&2 + return 1 +} + +PAYLOAD="$(build_payload)" + +send_with_python() { + python3 - "$SOCKET_PATH" "$PAYLOAD" <<'PY' +import json +import socket +import sys + +sock_path = sys.argv[1] +payload = sys.argv[2] + "\n" + +sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) +sock.settimeout(5) +sock.connect(sock_path) +sock.sendall(payload.encode("utf-8")) +response = b"" +while b"\n" not in response: + chunk = sock.recv(4096) + if not chunk: + break + response += chunk +sock.close() + +if not response: + print("bb-update: no RPC response from control socket", file=sys.stderr) + sys.exit(1) + +line = response.split(b"\n", 1)[0].decode("utf-8", "replace").strip() +if not line: + print("bb-update: empty RPC response from control socket", file=sys.stderr) + sys.exit(1) + +try: + parsed = json.loads(line) +except Exception as error: + print(f"bb-update: invalid RPC response: {error}", file=sys.stderr) + sys.exit(1) + +if parsed.get("type") != "response": + print("bb-update: unexpected RPC response type", file=sys.stderr) + sys.exit(1) + +if not parsed.get("success"): + err = parsed.get("error") or "unknown error" + print(f"bb-update: control-agent rejected update: {err}", file=sys.stderr) + sys.exit(1) +PY +} + +send_with_node() { + node -e ' +const net = require("node:net"); +const socketPath = process.argv[1]; +const payload = process.argv[2] + "\n"; +const client = net.createConnection(socketPath, () => { + client.write(payload); +}); +client.setEncoding("utf8"); +client.setTimeout(5000, () => { + console.error("bb-update: timeout waiting for RPC response"); + client.destroy(); + process.exit(1); +}); +let buffer = ""; +client.on("data", (chunk) => { + buffer += 
chunk; + const newlineIdx = buffer.indexOf("\n"); + if (newlineIdx === -1) return; + const line = buffer.slice(0, newlineIdx).trim(); + client.end(); + if (!line) { + console.error("bb-update: empty RPC response from control socket"); + process.exit(1); + } + let parsed; + try { + parsed = JSON.parse(line); + } catch (error) { + console.error("bb-update: invalid RPC response: " + error.message); + process.exit(1); + } + if (parsed.type !== "response") { + console.error("bb-update: unexpected RPC response type"); + process.exit(1); + } + if (!parsed.success) { + console.error("bb-update: control-agent rejected update: " + (parsed.error || "unknown error")); + process.exit(1); + } + process.exit(0); +}); +client.on("error", (error) => { + console.error(error.message); + process.exit(1); +}); +client.on("end", () => { + if (!buffer.includes("\n")) { + console.error("bb-update: no RPC response from control socket"); + process.exit(1); + } +}); +' "$SOCKET_PATH" "$PAYLOAD" +} + +if command -v python3 >/dev/null 2>&1; then + send_with_python + exit 0 +fi + +if command -v node >/dev/null 2>&1; then + send_with_node + exit 0 +fi + +printf '%s\n' "bb-update: no supported socket client available (python3/node)" >&2 +exit 1 diff --git a/pi/skills/control-agent/scripts/run-cli-agent.sh b/pi/skills/control-agent/scripts/run-cli-agent.sh new file mode 100755 index 0000000..aaa0f1a --- /dev/null +++ b/pi/skills/control-agent/scripts/run-cli-agent.sh @@ -0,0 +1,412 @@ +#!/usr/bin/env bash + +set -euo pipefail + +log() { + printf '%s\n' "[run-cli-agent] $*" +} + +die() { + printf '%s\n' "[run-cli-agent] ERROR: $*" >&2 + exit 1 +} + +require_non_empty() { + local name="$1" + local value="${2:-}" + if [ -z "$value" ]; then + die "required value is empty: $name" + fi +} + +escape_sed_replacement() { + printf '%s' "$1" | sed -e 's/[\\/&]/\\&/g' +} + +json_escape() { + printf '%s' "$1" \ + | sed -e 's/\\/\\\\/g' -e 's/"/\\"/g' -e ':a;N;$!ba;s/\n/\\n/g' +} + +random_uuid() { + if command 
-v uuidgen >/dev/null 2>&1; then + uuidgen | tr '[:upper:]' '[:lower:]' + return 0 + fi + + if command -v python3 >/dev/null 2>&1; then + python3 - <<'PY' +import uuid +print(str(uuid.uuid4())) +PY + return 0 + fi + + if command -v node >/dev/null 2>&1; then + node -e 'console.log(require("crypto").randomUUID())' + return 0 + fi + + die "unable to generate UUID (uuidgen/python3/node missing)" +} + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SKILLS_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" + +BACKEND="" +WORKTREE="" +SESSION_NAME="" +TODO_ID="" +REPO="" +MODEL="" +TIMEOUT_SEC=3600 +CONTROL_SESSION="control-agent" +PERSONA_DIR="" +SHIM_SCRIPT="" +DRY_RUN=0 + +while [ $# -gt 0 ]; do + case "$1" in + --backend) + BACKEND="${2:-}" + shift 2 + ;; + --worktree) + WORKTREE="${2:-}" + shift 2 + ;; + --session-name) + SESSION_NAME="${2:-}" + shift 2 + ;; + --todo-id) + TODO_ID="${2:-}" + shift 2 + ;; + --repo) + REPO="${2:-}" + shift 2 + ;; + --model) + MODEL="${2:-}" + shift 2 + ;; + --timeout) + TIMEOUT_SEC="${2:-}" + shift 2 + ;; + --control-session) + CONTROL_SESSION="${2:-}" + shift 2 + ;; + --persona-dir) + PERSONA_DIR="${2:-}" + shift 2 + ;; + --shim-script) + SHIM_SCRIPT="${2:-}" + shift 2 + ;; + --dry-run) + DRY_RUN=1 + shift + ;; + --help|-h) + cat <<'USAGE' +Usage: run-cli-agent.sh \ + --backend \ + --worktree \ + --session-name \ + --todo-id \ + --repo \ + [--model ] \ + [--timeout ] \ + [--control-session ] \ + [--dry-run] +USAGE + exit 0 + ;; + *) + die "unknown argument: $1" + ;; + esac +done + +require_non_empty "backend" "$BACKEND" +require_non_empty "worktree" "$WORKTREE" +require_non_empty "session-name" "$SESSION_NAME" +require_non_empty "todo-id" "$TODO_ID" +require_non_empty "repo" "$REPO" + +case "$BACKEND" in + claude-code|codex) + ;; + *) + die "invalid --backend: $BACKEND (expected claude-code or codex)" + ;; +esac + +if [ ! -d "$WORKTREE" ]; then + die "worktree does not exist: $WORKTREE" +fi + +if [ ! -d "$WORKTREE/.git" ] && [ ! 
-f "$WORKTREE/.git" ]; then + die "worktree is not a git checkout: $WORKTREE" +fi + +if ! [[ "$TIMEOUT_SEC" =~ ^[0-9]+$ ]]; then + die "--timeout must be an integer number of seconds" +fi + +if [ -z "$PERSONA_DIR" ]; then + if [ -d "$HOME/.pi/agent/skills/dev-agent-cli" ]; then + PERSONA_DIR="$HOME/.pi/agent/skills/dev-agent-cli" + else + PERSONA_DIR="$SKILLS_ROOT/dev-agent-cli" + fi +fi + +if [ -z "$SHIM_SCRIPT" ]; then + if [ -f "$HOME/.pi/agent/extensions/cli-session-shim.mjs" ]; then + SHIM_SCRIPT="$HOME/.pi/agent/extensions/cli-session-shim.mjs" + else + SHIM_SCRIPT="$SCRIPT_DIR/../../../extensions/cli-session-shim.mjs" + fi +fi + +case "$BACKEND" in + claude-code) + TEMPLATE_PATH="$PERSONA_DIR/persona.claude-code.tmpl" + CLI_BIN="claude" + ;; + codex) + TEMPLATE_PATH="$PERSONA_DIR/persona.codex.tmpl" + CLI_BIN="codex" + ;; +esac + +if ! command -v "$CLI_BIN" >/dev/null 2>&1; then + die "required CLI binary not found in PATH: $CLI_BIN" +fi +if ! command -v node >/dev/null 2>&1; then + die "node is required to run the CLI session shim" +fi +if ! command -v tmux >/dev/null 2>&1; then + die "tmux is required" +fi +if [ ! -f "$TEMPLATE_PATH" ]; then + die "persona template not found: $TEMPLATE_PATH" +fi +if [ ! 
-f "$SHIM_SCRIPT" ]; then + die "shim script not found: $SHIM_SCRIPT" +fi + +TEMPLATE_RENDERED="$(sed \ + -e "s/{{TODO_ID}}/$(escape_sed_replacement "$TODO_ID")/g" \ + -e "s/{{SESSION_NAME}}/$(escape_sed_replacement "$SESSION_NAME")/g" \ + -e "s/{{REPO}}/$(escape_sed_replacement "$REPO")/g" \ + "$TEMPLATE_PATH")" + +if echo "$TEMPLATE_RENDERED" | grep -Eq '{{[A-Z0-9_]+}}'; then + die "persona template still contains unsubstituted placeholders" +fi + +BOOTSTRAP_PROMPT="$(cat <}" + log "control_session=$CONTROL_SESSION" + log "template=$TEMPLATE_PATH" + log "shim=$SHIM_SCRIPT" + log "session_id=$BB_SESSION_ID" + log "socket=$SOCKET_PATH" + printf '[run-cli-agent] command=' >&2 + printf '%q ' "${CLI_CMD[@]}" >&2 + printf '\n' >&2 + exit 0 +fi + +SHIM_LOG="" + +cleanup() { + set +e + if [ -n "${WATCHDOG_PID:-}" ]; then + kill "$WATCHDOG_PID" 2>/dev/null || true + fi + if [ -n "${SHIM_PID:-}" ]; then + kill "$SHIM_PID" 2>/dev/null || true + wait "$SHIM_PID" 2>/dev/null || true + fi + if [ -n "${SHIM_LOG:-}" ]; then + rm -f "$SHIM_LOG" + fi +} +trap cleanup EXIT + +SHIM_LOG="$(mktemp "${TMPDIR:-/tmp}/cli-session-shim.XXXXXX")" + +log "starting cli-session-shim" +node "$SHIM_SCRIPT" \ + --session-id "$BB_SESSION_ID" \ + --session-name "$SESSION_NAME" \ + --tmux-session "$SESSION_NAME" \ + --control-dir "$CONTROL_DIR" \ + >"$SHIM_LOG" 2>&1 & +SHIM_PID=$! + +for _ in $(seq 1 75); do + if [ -S "$SOCKET_PATH" ]; then + break + fi + + if ! kill -0 "$SHIM_PID" 2>/dev/null; then + cat "$SHIM_LOG" >&2 || true + die "cli-session-shim exited before creating socket" + fi + + sleep 0.2 +done + +if [ ! -S "$SOCKET_PATH" ]; then + cat "$SHIM_LOG" >&2 || true + die "timed out waiting for shim socket: $SOCKET_PATH" +fi + +export BB_SESSION_ID +export BB_SESSION_NAME="$SESSION_NAME" +export BB_CONTROL_SESSION="$CONTROL_SESSION" +export BB_CONTROL_DIR="$CONTROL_DIR" + +BB_UPDATE_SCRIPT="$SCRIPT_DIR/bb-update.sh" +if [ ! 
-x "$BB_UPDATE_SCRIPT" ]; then + die "bb-update helper is missing or not executable: $BB_UPDATE_SCRIPT" +fi + +cd "$WORKTREE" + +log "launching backend=$BACKEND in $WORKTREE" +set +e +"${CLI_CMD[@]}" & +CLI_PID=$! + +if [ "$TIMEOUT_SEC" -gt 0 ]; then + ( + sleep "$TIMEOUT_SEC" + if kill -0 "$CLI_PID" 2>/dev/null; then + printf '%s\n' "[run-cli-agent] timeout reached (${TIMEOUT_SEC}s), terminating CLI process $CLI_PID" >&2 + kill "$CLI_PID" 2>/dev/null || true + sleep 5 + kill -9 "$CLI_PID" 2>/dev/null || true + fi + ) & + WATCHDOG_PID=$! +fi + +wait "$CLI_PID" +CLI_EXIT=$? +set -e + +if [ -n "${WATCHDOG_PID:-}" ]; then + kill "$WATCHDOG_PID" 2>/dev/null || true +fi + +status_label="success" +if [ "$CLI_EXIT" -ne 0 ]; then + status_label="failure" +fi + +COMPLETION_JSON="$(printf '{"type":"cli_runner_completion","todo_id":"%s","session_name":"%s","repo":"%s","backend":"%s","status":"%s","exit_code":%d}' \ + "$(json_escape "$TODO_ID")" \ + "$(json_escape "$SESSION_NAME")" \ + "$(json_escape "$REPO")" \ + "$(json_escape "$BACKEND")" \ + "$(json_escape "$status_label")" \ + "$CLI_EXIT")" + +COMPLETION_MSG="$(cat <$COMPLETION_JSON +EOF_COMPLETION +)" + +notify_control() { + local attempts=3 + local attempt=1 + while [ "$attempt" -le "$attempts" ]; do + if "$BB_UPDATE_SCRIPT" "$COMPLETION_MSG"; then + return 0 + fi + sleep "$attempt" + attempt=$((attempt + 1)) + done + return 1 +} + +if ! 
notify_control; then + ts="$(date -u +"%Y-%m-%dT%H:%M:%SZ")" + runner_log="$HOME/.pi/agent/cli-runner-errors.log" + mkdir -p "$(dirname "$runner_log")" + printf '%s\n' "[$ts] failed to notify control-agent for TODO $TODO_ID (session=$SESSION_NAME backend=$BACKEND exit=$CLI_EXIT)" >> "$runner_log" + + todo_suffix="${TODO_ID#TODO-}" + for todo_file in "$HOME/.pi/todos/$todo_suffix.md" "$HOME/.pi/todos/$TODO_ID.md"; do + if [ -f "$todo_file" ]; then + printf '\n[cli-runner-error %s] failed to notify control-agent (exit=%s backend=%s session=%s)\n' \ + "$ts" "$CLI_EXIT" "$BACKEND" "$SESSION_NAME" >> "$todo_file" + break + fi + done +fi + +exit "$CLI_EXIT" diff --git a/pi/skills/control-agent/scripts/run-cli-agent.test.sh b/pi/skills/control-agent/scripts/run-cli-agent.test.sh new file mode 100755 index 0000000..26a3a95 --- /dev/null +++ b/pi/skills/control-agent/scripts/run-cli-agent.test.sh @@ -0,0 +1,511 @@ +#!/usr/bin/env bash + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +RUNNER="$SCRIPT_DIR/run-cli-agent.sh" +BB_UPDATE="$SCRIPT_DIR/bb-update.sh" + +PASS=0 +FAIL=0 +TMPDIR="$(mktemp -d /tmp/rca.XXXXXX)" + +cleanup() { + rm -rf "$TMPDIR" +} +trap cleanup EXIT + +pass() { + echo " PASS: $1" + PASS=$((PASS + 1)) +} + +fail() { + echo " FAIL: $1" + FAIL=$((FAIL + 1)) +} + +wait_server_or_terminate() { + local pid="$1" + local ticks=0 + while kill -0 "$pid" 2>/dev/null; do + if [ "$ticks" -ge 30 ]; then + kill "$pid" 2>/dev/null || true + break + fi + sleep 0.1 + ticks=$((ticks + 1)) + done + wait "$pid" 2>/dev/null || true +} + +assert_contains() { + local desc="$1" + local haystack="$2" + local needle="$3" + if echo "$haystack" | grep -qF -- "$needle"; then + pass "$desc" + else + fail "$desc (missing: $needle)" + fi +} + +run_expect_success() { + local output + if output="$("$@" 2>&1)"; then + printf '%s' "$output" + return 0 + fi + + printf '%s' "$output" + return 1 +} + +run_expect_failure() { + local output + if output="$("$@" 2>&1)"; then + 
printf '%s' "$output" + return 1 + fi + + printf '%s' "$output" + return 0 +} + +echo "" +echo "Testing run-cli-agent scripts" +echo "==============================" +echo "" + +BIN_DIR="$TMPDIR/bin" +WORKTREE="$TMPDIR/worktree" +CONTROL_DIR="$TMPDIR/sc" +PERSONA_DIR="$TMPDIR/persona" +FAKE_CLI_LOG="$TMPDIR/fake-cli.log" +FAKE_TMUX_LOG="$TMPDIR/fake-tmux.log" +CAPTURE_FILE="$TMPDIR/capture.txt" + +mkdir -p "$BIN_DIR" "$WORKTREE" "$CONTROL_DIR" "$PERSONA_DIR" +: > "$FAKE_CLI_LOG" +: > "$FAKE_TMUX_LOG" +: > "$CAPTURE_FILE" + +# Minimal git worktree marker +cat > "$WORKTREE/.git" <<'GIT' +gitdir: /tmp/fake +GIT + +cat > "$PERSONA_DIR/persona.claude-code.tmpl" <<'TPL' +Session {{SESSION_NAME}} Todo {{TODO_ID}} Repo {{REPO}} +TPL + +cat > "$PERSONA_DIR/persona.codex.tmpl" <<'TPL' +Session {{SESSION_NAME}} Todo {{TODO_ID}} Repo {{REPO}} +TPL + +cat > "$BIN_DIR/claude" <> "$FAKE_CLI_LOG" +exit 0 +EOF_CLAUDE + +cat > "$BIN_DIR/codex" <> "$FAKE_CLI_LOG" +exit 0 +EOF_CODEX + +cat > "$BIN_DIR/tmux" <> "$FAKE_TMUX_LOG" + exit 0 +fi +if [ "\$cmd" = "capture-pane" ]; then + cat "$CAPTURE_FILE" + exit 0 +fi +printf '%s\n' "unknown:\$*" >> "$FAKE_TMUX_LOG" +exit 0 +EOF_TMUX + +chmod +x "$BIN_DIR/claude" "$BIN_DIR/codex" "$BIN_DIR/tmux" + +export PATH="$BIN_DIR:$PATH" + +# 1) Argument validation +if out="$(run_expect_failure "$RUNNER" --worktree "$WORKTREE" --session-name dev-agent-a --todo-id abc12345 --repo myapp)"; then + assert_contains "missing backend fails" "$out" "required value is empty: backend" +else + fail "missing backend should fail" +fi + +# 2) Dry-run command construction (claude) +if out="$(run_expect_success "$RUNNER" \ + --backend claude-code \ + --worktree "$WORKTREE" \ + --session-name dev-agent-myapp-a1b2c3d4 \ + --todo-id a1b2c3d4 \ + --repo myapp \ + --persona-dir "$PERSONA_DIR" \ + --dry-run)"; then + assert_contains "claude dry-run includes append-system-prompt" "$out" "--append-system-prompt" + assert_contains "claude dry-run includes session" "$out" 
"dev-agent-myapp-a1b2c3d4" +else + fail "claude dry-run should succeed" +fi + +# 3) Dry-run command construction (codex) +if out="$(run_expect_success "$RUNNER" \ + --backend codex \ + --worktree "$WORKTREE" \ + --session-name dev-agent-myapp-b1c2d3e4 \ + --todo-id b1c2d3e4 \ + --repo myapp \ + --persona-dir "$PERSONA_DIR" \ + --dry-run)"; then + assert_contains "codex dry-run includes full-auto" "$out" "--full-auto" + if echo "$out" | grep -q -- "--instructions"; then + fail "codex dry-run should not use --instructions" + else + pass "codex dry-run does not use --instructions" + fi +else + fail "codex dry-run should succeed" +fi + +# 4) Full run: completion payload reaches control socket +CONTROL_UUID="aaaaaaaa-aaaa-4aaa-8aaa-aaaaaaaaaaaa" +CONTROL_SOCKET="$CONTROL_DIR/$CONTROL_UUID.sock" +CONTROL_ALIAS="$CONTROL_DIR/control-agent.alias" +CAPTURED_RPC="$TMPDIR/captured-rpc.txt" + +python3 - "$CONTROL_SOCKET" "$CAPTURED_RPC" <<'PY' 2>/dev/null & +import os +import socket +import sys + +sock_path = sys.argv[1] +out_path = sys.argv[2] + +if os.path.exists(sock_path): + os.unlink(sock_path) + +server = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) +server.bind(sock_path) +server.listen(1) +conn, _ = server.accept() + +chunks = [] +while True: + piece = conn.recv(4096) + if not piece: + break + chunks.append(piece) + if b"\n" in piece: + break + +payload = b"".join(chunks) +with open(out_path, "wb") as fh: + fh.write(payload) + +conn.sendall(b'{"type":"response","command":"send","success":true}\n') +conn.close() +server.close() +PY +SERVER_PID=$! + +SERVER_READY=0 +for _ in $(seq 1 40); do + if [ -S "$CONTROL_SOCKET" ]; then + SERVER_READY=1 + break + fi + if ! 
kill -0 "$SERVER_PID" 2>/dev/null; then + break + fi + sleep 0.05 +done + +if [ "$SERVER_READY" -eq 0 ]; then + pass "full run socket assertion skipped (unix sockets unavailable in this environment)" + wait "$SERVER_PID" 2>/dev/null || true +else + ln -s "$(basename "$CONTROL_SOCKET")" "$CONTROL_ALIAS" + + if BB_CONTROL_DIR="$CONTROL_DIR" run_expect_success "$RUNNER" \ + --backend claude-code \ + --worktree "$WORKTREE" \ + --session-name dev-agent-myapp-c1d2e3f4 \ + --todo-id c1d2e3f4 \ + --repo myapp \ + --persona-dir "$PERSONA_DIR" \ + --timeout 30 \ + --control-session control-agent \ + >/dev/null; then + pass "full run exits successfully" + else + fail "full run should succeed" + kill "$SERVER_PID" 2>/dev/null || true + fi + + wait_server_or_terminate "$SERVER_PID" + + if [ -f "$CAPTURED_RPC" ]; then + payload="$(cat "$CAPTURED_RPC")" + assert_contains "completion payload uses send RPC" "$payload" '"type":"send"' + assert_contains "completion payload includes todo" "$payload" "TODO c1d2e3f4" + assert_contains "completion payload includes sender_info" "$payload" "sender_info" + assert_contains "completion payload includes structured marker" "$payload" "" + else + fail "expected captured RPC payload" + fi +fi + +# 5) Runner retries completion update when control responds with failure +RETRY_SOCKET="$CONTROL_DIR/cccccccc-cccc-4ccc-8ccc-cccccccccccc.sock" +RETRY_ALIAS="$CONTROL_DIR/control-retry.alias" +RETRY_TRACE="$TMPDIR/retry-trace.txt" + +python3 - "$RETRY_SOCKET" "$RETRY_TRACE" <<'PY' 2>/dev/null & +import os +import socket +import sys + +sock_path = sys.argv[1] +trace_path = sys.argv[2] + +if os.path.exists(sock_path): + os.unlink(sock_path) + +server = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) +server.bind(sock_path) +server.listen(3) +server.settimeout(8) + +attempt = 0 +with open(trace_path, "w", encoding="utf-8") as trace: + while attempt < 3: + try: + conn, _ = server.accept() + except TimeoutError: + break + attempt += 1 + payload = b"" + while 
True: + chunk = conn.recv(4096) + if not chunk: + break + payload += chunk + if b"\n" in chunk: + break + trace.write(f"attempt={attempt} payload={payload.decode('utf-8', 'replace')}\n") + trace.flush() + if attempt < 3: + conn.sendall(b'{"type":"response","command":"send","success":false,"error":"retry me"}\n') + else: + conn.sendall(b'{"type":"response","command":"send","success":true}\n') + conn.close() + +server.close() +PY +RETRY_SERVER_PID=$! + +RETRY_READY=0 +for _ in $(seq 1 40); do + if [ -S "$RETRY_SOCKET" ]; then + RETRY_READY=1 + break + fi + if ! kill -0 "$RETRY_SERVER_PID" 2>/dev/null; then + break + fi + sleep 0.05 +done + +if [ "$RETRY_READY" -eq 0 ]; then + pass "runner retry assertion skipped (unix sockets unavailable in this environment)" + wait "$RETRY_SERVER_PID" 2>/dev/null || true +else + ln -s "$(basename "$RETRY_SOCKET")" "$RETRY_ALIAS" + + if BB_CONTROL_DIR="$CONTROL_DIR" run_expect_success "$RUNNER" \ + --backend claude-code \ + --worktree "$WORKTREE" \ + --session-name dev-agent-myapp-retry9876 \ + --todo-id retry9876 \ + --repo myapp \ + --persona-dir "$PERSONA_DIR" \ + --timeout 30 \ + --control-session control-retry \ + >/dev/null; then + pass "runner succeeds after retryable control failures" + else + fail "runner should retry and succeed" + kill "$RETRY_SERVER_PID" 2>/dev/null || true + fi + + wait_server_or_terminate "$RETRY_SERVER_PID" + + if [ -f "$RETRY_TRACE" ]; then + retry_trace="$(cat "$RETRY_TRACE")" + assert_contains "runner attempted completion update three times" "$retry_trace" "attempt=3" + else + fail "retry trace should be captured" + fi +fi + +# 6) Worktree validation +mkdir -p "$TMPDIR/not-a-worktree" +if out="$(run_expect_failure "$RUNNER" \ + --backend claude-code \ + --worktree "$TMPDIR/not-a-worktree" \ + --session-name dev-agent-myapp-deadbeef \ + --todo-id deadbeef \ + --repo myapp \ + --persona-dir "$PERSONA_DIR")"; then + assert_contains "invalid worktree is rejected" "$out" "worktree is not a git checkout" 
+else + fail "invalid worktree should fail" +fi + +# 7) bb-update helper sends follow_up payload +UPDATE_SOCKET="$CONTROL_DIR/bbbbbbbb-bbbb-4bbb-8bbb-bbbbbbbbbbbb.sock" +UPDATE_ALIAS="$CONTROL_DIR/control-update.alias" +UPDATE_PAYLOAD="$TMPDIR/bb-update-payload.txt" + +python3 - "$UPDATE_SOCKET" "$UPDATE_PAYLOAD" <<'PY' 2>/dev/null & +import os +import socket +import sys + +sock_path = sys.argv[1] +out_path = sys.argv[2] + +if os.path.exists(sock_path): + os.unlink(sock_path) + +server = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) +server.bind(sock_path) +server.listen(1) +conn, _ = server.accept() + +data = b"" +while True: + chunk = conn.recv(4096) + if not chunk: + break + data += chunk + if b"\n" in chunk: + break + +with open(out_path, "wb") as fh: + fh.write(data) + +conn.sendall(b'{"type":"response","command":"send","success":true}\n') +conn.close() +server.close() +PY +UPDATE_SERVER_PID=$! + +UPDATE_SERVER_READY=0 +for _ in $(seq 1 40); do + if [ -S "$UPDATE_SOCKET" ]; then + UPDATE_SERVER_READY=1 + break + fi + if ! 
kill -0 "$UPDATE_SERVER_PID" 2>/dev/null; then + break + fi + sleep 0.05 +done + +if [ "$UPDATE_SERVER_READY" -eq 0 ]; then + pass "bb-update socket assertion skipped (unix sockets unavailable in this environment)" + wait "$UPDATE_SERVER_PID" 2>/dev/null || true +else + ln -s "$(basename "$UPDATE_SOCKET")" "$UPDATE_ALIAS" + + if BB_CONTROL_DIR="$CONTROL_DIR" \ + BB_CONTROL_SESSION="control-update" \ + BB_SESSION_ID="cccccccc-cccc-4ccc-8ccc-cccccccccccc" \ + BB_SESSION_NAME="dev-agent-myapp-feed1234" \ + "$BB_UPDATE" "Milestone: PR opened" >/dev/null 2>&1; then + pass "bb-update call succeeded" + else + fail "bb-update call should succeed" + fi + + wait "$UPDATE_SERVER_PID" + + if [ -f "$UPDATE_PAYLOAD" ]; then + payload="$(cat "$UPDATE_PAYLOAD")" + assert_contains "bb-update payload contains follow_up mode" "$payload" '"mode":"follow_up"' + assert_contains "bb-update payload contains message" "$payload" "Milestone: PR opened" + else + fail "bb-update payload should be captured" + fi +fi + +# 8) bb-update helper fails when control rejects update +REJECT_SOCKET="$CONTROL_DIR/dddddddd-dddd-4ddd-8ddd-dddddddddddd.sock" +REJECT_ALIAS="$CONTROL_DIR/control-reject.alias" + +python3 - "$REJECT_SOCKET" <<'PY' 2>/dev/null & +import os +import socket +import sys + +sock_path = sys.argv[1] +if os.path.exists(sock_path): + os.unlink(sock_path) + +server = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) +server.bind(sock_path) +server.listen(1) +conn, _ = server.accept() +while True: + chunk = conn.recv(4096) + if not chunk or b"\n" in chunk: + break +conn.sendall(b'{"type":"response","command":"send","success":false,"error":"rejected"}\n') +conn.close() +server.close() +PY +REJECT_SERVER_PID=$! + +REJECT_READY=0 +for _ in $(seq 1 40); do + if [ -S "$REJECT_SOCKET" ]; then + REJECT_READY=1 + break + fi + if ! 
kill -0 "$REJECT_SERVER_PID" 2>/dev/null; then + break + fi + sleep 0.05 +done + +if [ "$REJECT_READY" -eq 0 ]; then + pass "bb-update rejection assertion skipped (unix sockets unavailable in this environment)" + wait "$REJECT_SERVER_PID" 2>/dev/null || true +else + ln -s "$(basename "$REJECT_SOCKET")" "$REJECT_ALIAS" + if BB_CONTROL_DIR="$CONTROL_DIR" \ + BB_CONTROL_SESSION="control-reject" \ + "$BB_UPDATE" "Should fail" >/dev/null 2>&1; then + fail "bb-update should fail when control rejects update" + else + pass "bb-update fails on explicit control rejection" + fi + wait "$REJECT_SERVER_PID" +fi + +echo "" +echo "Results: $PASS passed, $FAIL failed" + +if [ "$FAIL" -gt 0 ]; then + exit 1 +fi diff --git a/pi/skills/dev-agent-cli/persona.claude-code.tmpl b/pi/skills/dev-agent-cli/persona.claude-code.tmpl new file mode 100644 index 0000000..59d17c6 --- /dev/null +++ b/pi/skills/dev-agent-cli/persona.claude-code.tmpl @@ -0,0 +1,61 @@ +# Dev Agent CLI Persona (Claude Code) + +You are an ephemeral Baudbot dev agent session. + +## Session Context +- Session: `{{SESSION_NAME}}` +- Todo: `{{TODO_ID}}` +- Repo: `{{REPO}}` +- You are running inside a dedicated git worktree for this task. + +## Core Rules +- Own the full technical loop: implement, test, push, open PR, monitor CI, fix failures, resolve review comments. +- Stay scoped to this task. +- Never post to Slack APIs directly. The control-agent is the only external communicator. +- Send all progress/completion updates only through: + - `~/.pi/agent/skills/control-agent/scripts/bb-update.sh ""` +- Keep updates concise and include concrete artifacts (PR URL, CI state, preview URL). + +## Startup Checklist +1. Read repo guidance (`CODEX.md`, `AGENTS.md`, `CLAUDE.md`) if present. +2. Read shared memory notes if available: + - `cat ~/.pi/agent/memory/repos.md 2>/dev/null || true` +3. 
Immediately send readiness update: + - `~/.pi/agent/skills/control-agent/scripts/bb-update.sh "Ready — session {{SESSION_NAME}} (TODO {{TODO_ID}})"` +4. Wait for the task prompt in this terminal. + +## Follow-up Handling +- Additional requirements may arrive mid-task in this same terminal session. +- Incorporate them into the current branch and continue the loop. + +## Working Standards +- Never commit to main branches. +- Stay inside your current worktree. +- Keep security hygiene: validate inputs, avoid unsafe query interpolation, and verify external API docs before integrating. +- Update docs/config/tests that are required by repository conventions. + +## Protected Paths +Do not modify protected security files in the baudbot repo: +- `bin/`, `hooks/`, `setup.sh`, `start.sh`, `SECURITY.md` +- `pi/extensions/tool-guard.ts`, `pi/extensions/tool-guard.test.mjs` +- `slack-bridge/security.mjs`, `slack-bridge/security.test.mjs` + +## Milestone Updates +Send updates at minimum for: +- PR opened +- CI started +- CI failed (with blocker summary) +- CI passing +- Review comments addressed +- Final completion summary + +## Completion Update Format +When done, send a final update through `bb-update.sh` that includes: +- TODO ID +- PR URL +- CI status +- Review status +- Preview URL (if any) +- Brief summary of code changes + +Then exit the session. diff --git a/pi/skills/dev-agent-cli/persona.codex.tmpl b/pi/skills/dev-agent-cli/persona.codex.tmpl new file mode 100644 index 0000000..bedb760 --- /dev/null +++ b/pi/skills/dev-agent-cli/persona.codex.tmpl @@ -0,0 +1,41 @@ +# Dev Agent CLI Persona (Codex) + +You are an ephemeral Baudbot dev agent. + +Session: `{{SESSION_NAME}}` +Todo: `{{TODO_ID}}` +Repo: `{{REPO}}` + +## Must Follow +- Own the full implementation loop: code, test, push, PR, CI fixes, review fixes. +- Do not communicate with Slack directly. 
+- Send all updates only via: + - `~/.pi/agent/skills/control-agent/scripts/bb-update.sh ""` +- Treat follow-up instructions in this terminal as task updates for the same todo. + +## Startup +1. Load local repo guidance (`CODEX.md`, `AGENTS.md`, `CLAUDE.md`) if present. +2. Read shared memory (`~/.pi/agent/memory/repos.md`) when available. +3. Send readiness update immediately: + - `~/.pi/agent/skills/control-agent/scripts/bb-update.sh "Ready — session {{SESSION_NAME}} (TODO {{TODO_ID}})"` +4. Wait for task details in terminal. + +## Working Constraints +- Stay in your assigned worktree. +- Never commit to main. +- Respect repository security/test/doc conventions. +- Do not modify protected baudbot security paths: + - `bin/`, `hooks/`, `setup.sh`, `start.sh`, `SECURITY.md` + - `pi/extensions/tool-guard.ts`, `pi/extensions/tool-guard.test.mjs` + - `slack-bridge/security.mjs`, `slack-bridge/security.test.mjs` + +## Required Milestone Updates +Use `bb-update.sh` at: +- PR opened +- CI started +- CI failed +- CI passed +- Review feedback addressed +- Task complete + +Final completion update must include TODO ID, PR URL, CI status, review status, preview URL (if available), and a concise change summary. 
diff --git a/test/broker-bridge.integration.test.mjs b/test/broker-bridge.integration.test.mjs index 4c8df64..53db33e 100644 --- a/test/broker-bridge.integration.test.mjs +++ b/test/broker-bridge.integration.test.mjs @@ -4,7 +4,7 @@ import { spawn } from "node:child_process"; import net from "node:net"; import path from "node:path"; import { fileURLToPath } from "node:url"; -import { mkdtempSync, mkdirSync, rmSync } from "node:fs"; +import { existsSync, mkdtempSync, mkdirSync, rmSync } from "node:fs"; import { tmpdir } from "node:os"; import sodium from "libsodium-wrappers-sumo"; import { @@ -32,13 +32,75 @@ function waitFor(condition, timeoutMs = 10_000, intervalMs = 50, onTimeoutMessag }); } +function isLocalBindPermissionError(error) { + if (!error || typeof error !== "object" || !("code" in error)) return false; + const code = String(error.code || ""); + return code === "EPERM" || code === "EACCES"; +} + +async function listenServer(server, port, host) { + await new Promise((resolve, reject) => { + const onError = (error) => { + server.off("listening", onListening); + reject(error); + }; + const onListening = () => { + server.off("error", onError); + resolve(); + }; + server.once("error", onError); + server.once("listening", onListening); + server.listen(port, host); + }); +} + +async function listenLocalhostOrUnavailable(server, port = 0) { + try { + await listenServer(server, port, "127.0.0.1"); + return true; + } catch (error) { + if (isLocalBindPermissionError(error)) return false; + throw error; + } +} + +async function listenUnixSocketOrUnavailable(server, socketPath) { + try { + await new Promise((resolve, reject) => { + const onError = (error) => { + server.off("listening", onListening); + reject(error); + }; + const onListening = () => { + server.off("error", onError); + resolve(); + }; + server.once("error", onError); + server.once("listening", onListening); + server.listen(socketPath); + }); + + // On some platforms/path lengths, Node can report 
"listening" but no + // filesystem socket entry is created, making it undiscoverable by path. + if (!existsSync(socketPath)) { + await new Promise((resolve) => server.close(() => resolve(undefined))); + return false; + } + + return true; + } catch (error) { + if (isLocalBindPermissionError(error)) return false; + throw error; + } +} + async function reserveFreePort() { const server = createServer((_req, res) => { res.writeHead(204); res.end(); }); - await new Promise((resolve) => server.listen(0, "127.0.0.1", resolve)); + await listenServer(server, 0, "127.0.0.1"); const address = server.address(); if (!address || typeof address === "string") { await new Promise((resolve) => server.close(() => resolve(undefined))); @@ -50,6 +112,26 @@ async function reserveFreePort() { return port; } +function createBridgeHome(tempDirs) { + const roots = ["/tmp", tmpdir()]; + let tempHome = null; + for (const root of roots) { + try { + tempHome = mkdtempSync(path.join(root, "baudbot-broker-home-")); + break; + } catch { + // fall through to next candidate + } + } + if (!tempHome) { + throw new Error("failed to create temp HOME for broker bridge test"); + } + tempDirs.push(tempHome); + mkdirSync(path.join(tempHome, ".pi", "agent"), { recursive: true }); + mkdirSync(path.join(tempHome, ".pi", "session-control"), { recursive: true }); + return tempHome; +} + describe("broker pull bridge semi-integration", () => { const children = []; const servers = []; @@ -115,7 +197,7 @@ describe("broker pull bridge semi-integration", () => { res.end(JSON.stringify({ ok: false, error: "not found" })); }); - await new Promise((resolve) => broker.listen(0, "127.0.0.1", resolve)); + if (!(await listenLocalhostOrUnavailable(broker, 0))) return; servers.push(broker); const address = broker.address(); @@ -128,6 +210,7 @@ describe("broker pull bridge semi-integration", () => { const repoRoot = path.dirname(testFileDir); const bridgePath = path.join(repoRoot, "slack-bridge", "broker-bridge.mjs"); const 
bridgeCwd = path.join(repoRoot, "slack-bridge"); + const tempHome = createBridgeHome(tempDirs); let bridgeStdout = ""; let bridgeStderr = ""; @@ -137,6 +220,7 @@ describe("broker pull bridge semi-integration", () => { cwd: bridgeCwd, env: { ...process.env, + HOME: tempHome, SLACK_BROKER_URL: brokerUrl, SLACK_BROKER_WORKSPACE_ID: "T123BROKER", SLACK_BROKER_SERVER_PRIVATE_KEY: b64(32, 11), @@ -201,8 +285,7 @@ describe("broker pull bridge semi-integration", () => { const bridgePath = path.join(repoRoot, "slack-bridge", "broker-bridge.mjs"); const bridgeCwd = path.join(repoRoot, "slack-bridge"); - const tempHome = mkdtempSync(path.join(tmpdir(), "baudbot-broker-test-")); - tempDirs.push(tempHome); + const tempHome = createBridgeHome(tempDirs); const sessionDir = path.join(tempHome, ".pi", "session-control"); mkdirSync(sessionDir, { recursive: true }); @@ -226,7 +309,7 @@ describe("broker pull bridge semi-integration", () => { } }); }); - await new Promise((resolve) => agentSocket.listen(socketFile, resolve)); + if (!(await listenUnixSocketOrUnavailable(agentSocket, socketFile))) return; servers.push(agentSocket); const serverBox = sodium.crypto_box_keypair(); @@ -303,7 +386,7 @@ describe("broker pull bridge semi-integration", () => { res.end(JSON.stringify({ ok: false, error: "not found" })); }); - await new Promise((resolve) => broker.listen(0, "127.0.0.1", resolve)); + if (!(await listenLocalhostOrUnavailable(broker, 0))) return; servers.push(broker); const address = broker.address(); @@ -415,7 +498,7 @@ describe("broker pull bridge semi-integration", () => { res.end(JSON.stringify({ ok: false, error: "not found" })); }); - await new Promise((resolve) => broker.listen(0, "127.0.0.1", resolve)); + if (!(await listenLocalhostOrUnavailable(broker, 0))) return; servers.push(broker); const address = broker.address(); @@ -428,11 +511,13 @@ describe("broker pull bridge semi-integration", () => { const repoRoot = path.dirname(testFileDir); const bridgePath = 
path.join(repoRoot, "slack-bridge", "broker-bridge.mjs"); const bridgeCwd = path.join(repoRoot, "slack-bridge"); + const tempHome = createBridgeHome(tempDirs); const bridge = spawn("node", [bridgePath], { cwd: bridgeCwd, env: { ...process.env, + HOME: tempHome, SLACK_BROKER_URL: brokerUrl, SLACK_BROKER_WORKSPACE_ID: workspaceId, SLACK_BROKER_SERVER_PRIVATE_KEY: b64(32, 11), @@ -501,7 +586,7 @@ describe("broker pull bridge semi-integration", () => { res.end(JSON.stringify({ ok: false, error: "not found" })); }); - await new Promise((resolve) => broker.listen(0, "127.0.0.1", resolve)); + if (!(await listenLocalhostOrUnavailable(broker, 0))) return; servers.push(broker); const address = broker.address(); @@ -514,11 +599,13 @@ describe("broker pull bridge semi-integration", () => { const repoRoot = path.dirname(testFileDir); const bridgePath = path.join(repoRoot, "slack-bridge", "broker-bridge.mjs"); const bridgeCwd = path.join(repoRoot, "slack-bridge"); + const tempHome = createBridgeHome(tempDirs); const bridge = spawn("node", [bridgePath], { cwd: bridgeCwd, env: { ...process.env, + HOME: tempHome, SLACK_BROKER_URL: brokerUrl, SLACK_BROKER_WORKSPACE_ID: workspaceId, SLACK_BROKER_SERVER_PRIVATE_KEY: b64(32, 11), @@ -588,7 +675,7 @@ describe("broker pull bridge semi-integration", () => { res.end(JSON.stringify({ ok: false, error: "not found" })); }); - await new Promise((resolve) => broker.listen(0, "127.0.0.1", resolve)); + if (!(await listenLocalhostOrUnavailable(broker, 0))) return; servers.push(broker); const address = broker.address(); @@ -601,11 +688,13 @@ describe("broker pull bridge semi-integration", () => { const repoRoot = path.dirname(testFileDir); const bridgePath = path.join(repoRoot, "slack-bridge", "broker-bridge.mjs"); const bridgeCwd = path.join(repoRoot, "slack-bridge"); + const tempHome = createBridgeHome(tempDirs); const bridge = spawn("node", [bridgePath], { cwd: bridgeCwd, env: { ...process.env, + HOME: tempHome, SLACK_BROKER_URL: brokerUrl, 
SLACK_BROKER_WORKSPACE_ID: workspaceId, SLACK_BROKER_SERVER_PRIVATE_KEY: b64(32, 11), @@ -644,7 +733,13 @@ describe("broker pull bridge semi-integration", () => { await sodium.ready; const workspaceId = "T123BROKER"; - const bridgeApiPort = await reserveFreePort(); + let bridgeApiPort; + try { + bridgeApiPort = await reserveFreePort(); + } catch (error) { + if (isLocalBindPermissionError(error)) return; + throw error; + } let outboundAuthorization = null; const broker = createServer(async (req, res) => { @@ -671,7 +766,7 @@ describe("broker pull bridge semi-integration", () => { res.end(JSON.stringify({ ok: false, error: "not found" })); }); - await new Promise((resolve) => broker.listen(0, "127.0.0.1", resolve)); + if (!(await listenLocalhostOrUnavailable(broker, 0))) return; servers.push(broker); const address = broker.address(); @@ -684,11 +779,13 @@ describe("broker pull bridge semi-integration", () => { const repoRoot = path.dirname(testFileDir); const bridgePath = path.join(repoRoot, "slack-bridge", "broker-bridge.mjs"); const bridgeCwd = path.join(repoRoot, "slack-bridge"); + const tempHome = createBridgeHome(tempDirs); const bridge = spawn("node", [bridgePath], { cwd: bridgeCwd, env: { ...process.env, + HOME: tempHome, SLACK_BROKER_URL: brokerUrl, SLACK_BROKER_WORKSPACE_ID: workspaceId, SLACK_BROKER_SERVER_PRIVATE_KEY: b64(32, 11), @@ -735,6 +832,7 @@ describe("broker pull bridge semi-integration", () => { const repoRoot = path.dirname(testFileDir); const bridgePath = path.join(repoRoot, "slack-bridge", "broker-bridge.mjs"); const bridgeCwd = path.join(repoRoot, "slack-bridge"); + const tempHome = createBridgeHome(tempDirs); let bridgeStdout = ""; let bridgeStderr = ""; @@ -743,6 +841,7 @@ describe("broker pull bridge semi-integration", () => { cwd: bridgeCwd, env: { ...process.env, + HOME: tempHome, SLACK_BROKER_URL: "http://127.0.0.1:65535", SLACK_BROKER_WORKSPACE_ID: "T123BROKER", SLACK_BROKER_SERVER_PRIVATE_KEY: b64(32, 11), diff --git 
a/test/security-audit.test.mjs b/test/security-audit.test.mjs index 6e20d79..218cda3 100644 --- a/test/security-audit.test.mjs +++ b/test/security-audit.test.mjs @@ -61,7 +61,7 @@ async function runAuditWithLocalBridge(homeDir, args = []) { }; server.on("error", (err) => { - if (err && err.code === "EADDRINUSE") { + if (err && (err.code === "EADDRINUSE" || err.code === "EPERM" || err.code === "EACCES")) { resolve(runAudit(homeDir, args)); return; } diff --git a/test/shell-scripts.test.mjs b/test/shell-scripts.test.mjs index 2081249..a325959 100644 --- a/test/shell-scripts.test.mjs +++ b/test/shell-scripts.test.mjs @@ -43,8 +43,28 @@ describe("shell script test suites", () => { expect(() => runScript("bin/lib/doctor-common.test.sh")).not.toThrow(); }); + it("remote common helpers", () => { + expect(() => runScript("bin/lib/remote-common.test.sh")).not.toThrow(); + }); + + it("remote ssh helpers", () => { + expect(() => runScript("bin/lib/remote-ssh.test.sh")).not.toThrow(); + }); + + it("remote hetzner adapter", () => { + expect(() => runScript("bin/lib/remote-hetzner.test.sh")).not.toThrow(); + }); + it("baudbot cli", () => { expect(() => runScript("bin/baudbot.test.sh")).not.toThrow(); }); + it("remote cli", () => { + expect(() => runScript("bin/remote.test.sh")).not.toThrow(); + }); + + it("cli agent runner helpers", () => { + expect(() => runScript("pi/skills/control-agent/scripts/run-cli-agent.test.sh")).not.toThrow(); + }); + }); diff --git a/vitest.config.mjs b/vitest.config.mjs index 37db185..02a7381 100644 --- a/vitest.config.mjs +++ b/vitest.config.mjs @@ -3,6 +3,7 @@ import { defineConfig } from "vitest/config"; export default defineConfig({ test: { include: [ + "pi/extensions/cli-session-shim.test.mjs", "pi/extensions/heartbeat.test.mjs", "pi/extensions/memory.test.mjs", "test/legacy-node-tests.test.mjs", From 7eb512f4b14a16845cbaf21f81c9008af85cb4fd Mon Sep 17 00:00:00 2001 From: AndreyMarchuk Date: Mon, 23 Feb 2026 09:36:48 -0800 Subject: [PATCH 2/2] 
ops: implement remote workflows and stabilize agent runtime --- .gitignore | 4 + CONFIGURATION.md | 2 +- README.md | 4 +- bin/baudbot.service | 2 +- bin/config.sh | 13 +- bin/config.test.sh | 16 +- bin/doctor.sh | 31 ++- bin/doctor.test.sh | 147 +++++++++++++++ bin/lib/baudbot-runtime.sh | 24 ++- bin/lib/setup-common.sh | 32 ++++ bin/lib/setup-common.test.sh | 112 +++++++++++ bin/remote.sh | 11 +- bin/test.sh | 2 + bin/uninstall.sh | 2 +- docs/operations.md | 7 + install.sh | 6 +- pi/extensions/kernel/index.ts | 35 +++- pi/skills/control-agent/HEARTBEAT.md | 4 + pi/skills/control-agent/SKILL.md | 6 +- pi/skills/control-agent/startup-cleanup.sh | 18 +- setup.sh | 43 ++++- slack-bridge/bridge.mjs | 30 ++- slack-bridge/broker-bridge.mjs | 25 ++- start.sh | 39 ++-- test/broker-bridge.integration.test.mjs | 210 ++++++++++++++++++++- test/shell-scripts.test.mjs | 8 + 26 files changed, 768 insertions(+), 65 deletions(-) create mode 100644 bin/doctor.test.sh create mode 100644 bin/lib/setup-common.sh create mode 100644 bin/lib/setup-common.test.sh diff --git a/.gitignore b/.gitignore index cfcad7a..91464bc 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,7 @@ slack-bridge/.env # Coverage coverage/ .c8_output/ + + +.tmp +.state \ No newline at end of file diff --git a/CONFIGURATION.md b/CONFIGURATION.md index 8f1b603..cdf5651 100644 --- a/CONFIGURATION.md +++ b/CONFIGURATION.md @@ -39,7 +39,7 @@ The agent also uses an SSH key (`~/.ssh/id_ed25519`) for git push. Setup generat |----------|-------------|---------------| | `SLACK_BOT_TOKEN` | Slack bot OAuth token (required for direct Socket Mode, optional in broker mode) | Create a Slack app at [api.slack.com/apps](https://api.slack.com/apps). Under **OAuth & Permissions**, add bot scopes: `app_mentions:read`, `chat:write`, `channels:history`, `channels:read`, `reactions:write`, `im:history`, `im:read`, `im:write`. Install the app to your workspace and copy the **Bot User OAuth Token**. 
| | `SLACK_APP_TOKEN` | Slack app-level token (required for Socket Mode, optional in broker mode) | In your Slack app settings → **Basic Information** → **App-Level Tokens**, create a token with `connections:write` scope. | -| `SLACK_ALLOWED_USERS` | Comma-separated Slack user IDs | **Optional** — if not set, all workspace members can interact. Find your Slack user ID: click your profile → "..." → "Copy member ID". Example: `U01ABCDEF,U02GHIJKL` | +| `SLACK_ALLOWED_USERS` | Comma-separated Slack user IDs | **Required** — only listed users can interact with the agent. Find your Slack user ID: click your profile → "..." → "Copy member ID". Example: `U01ABCDEF,U02GHIJKL` | If you're using Slack broker mode (`SLACK_BROKER_*` vars), the runtime uses broker pull delivery and does not require Socket Mode callbacks. diff --git a/README.md b/README.md index 72fffa6..2e7262e 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,7 @@ Baudbot is designed as shared engineering infrastructure, not a single-user desk | **CPU** | 2 vCPU | 4 vCPU | | **Disk** | 20 GB | 40 GB+ (repos, dependencies, Docker images) | -System package dependencies (installed by `baudbot install`): `git`, `curl`, `tmux`, `iptables`, `docker`, `gh`, `jq`, `sudo`. +System package dependencies (installed by `baudbot install`): `git`, `curl`, `tmux`, `iptables`, `docker`, `gh`, `jq`, `ripgrep`, `sudo`. ## Quick Start @@ -61,7 +61,7 @@ curl -fsSL https://raw.githubusercontent.com/modem-dev/baudbot/main/bootstrap.sh baudbot install ``` -`baudbot install` includes a guided config flow: pick an LLM provider, choose Slack integration mode (managed broker vs custom app), then opt into optional integrations (Kernel/Sentry). Email capabilities are disabled by default and only available in experimental mode (`baudbot setup --experimental` / `install.sh --experimental`). If [`gum`](https://github.com/charmbracelet/gum) is installed, prompts use richer TUI widgets; otherwise installer falls back to standard bash prompts. 
+`baudbot install` includes a guided config flow: pick an LLM provider, choose Slack integration mode (managed broker vs custom app), then opt into optional integrations (Kernel/Sentry). Host setup installs Node.js + pi and also installs Claude Code via the official installer script for `baudbot_agent`, exposing a root-owned `/usr/local/bin/claude` wrapper for sudo-safe invocation. Email capabilities are disabled by default and only available in experimental mode (`baudbot setup --experimental` / `install.sh --experimental`). If [`gum`](https://github.com/charmbracelet/gum) is installed, prompts use richer TUI widgets; otherwise installer falls back to standard bash prompts. After install: diff --git a/bin/baudbot.service b/bin/baudbot.service index 56c3e32..70c40d6 100644 --- a/bin/baudbot.service +++ b/bin/baudbot.service @@ -22,7 +22,7 @@ Restart=on-failure RestartSec=10 # Environment -Environment=PATH=/home/baudbot_agent/.varlock/bin:/home/baudbot_agent/opt/node-v22.14.0-linux-x64/bin:/usr/local/bin:/usr/bin:/bin +Environment=PATH=/home/baudbot_agent/.local/bin:/home/baudbot_agent/.varlock/bin:/home/baudbot_agent/opt/node-v22.14.0-linux-x64/bin:/usr/local/bin:/usr/bin:/bin Environment=HOME=/home/baudbot_agent # Security hardening diff --git a/bin/config.sh b/bin/config.sh index 6893ee0..034e68f 100755 --- a/bin/config.sh +++ b/bin/config.sh @@ -434,9 +434,9 @@ if [ "$SLACK_CHOICE" = "Use baudbot.ai Slack integration (easy)" ]; then dim " We'll set up broker registration after install via: sudo baudbot broker register" clear_keys SLACK_BOT_TOKEN SLACK_APP_TOKEN prompt_secret "SLACK_ALLOWED_USERS" \ - "Slack user IDs (comma-separated; optional — allow all if empty)" \ + "Slack user IDs (comma-separated; required)" \ "Click your Slack profile → ··· → Copy member ID" \ - "" \ + "required" \ "U" \ "false" else @@ -470,9 +470,9 @@ else "xapp-" prompt_secret "SLACK_ALLOWED_USERS" \ - "Slack user IDs (comma-separated; optional — allow all if empty)" \ + "Slack user IDs 
(comma-separated; required)" \ "Click your Slack profile → ··· → Copy member ID" \ - "" \ + "required" \ "U" \ "false" fi @@ -579,7 +579,8 @@ fi # ── Validation ─────────────────────────────────────────────────────────────── if [ -z "${ENV_VARS[SLACK_ALLOWED_USERS]:-}" ]; then - warn "SLACK_ALLOWED_USERS not set — all workspace members will be allowed" + echo "❌ SLACK_ALLOWED_USERS is required for Slack access control" + exit 1 fi # ── Write config ───────────────────────────────────────────────────────────── @@ -674,4 +675,4 @@ else fi echo "" echo -e "Next: ${BOLD}sudo baudbot deploy${RESET} to push config to the agent" -echo "" \ No newline at end of file +echo "" diff --git a/bin/config.test.sh b/bin/config.test.sh index c3aaba0..c09376f 100644 --- a/bin/config.test.sh +++ b/bin/config.test.sh @@ -85,24 +85,26 @@ echo "" # Test 1: Advanced Slack path writes socket-mode keys only HOME1="$TMPDIR/advanced" -run_config "$HOME1" '1\nsk-ant-test\n2\nxoxb-test\nxapp-test\n\nn\nn\n' +run_config "$HOME1" '1\nsk-ant-test\n2\nxoxb-test\nxapp-test\nU01ADVANCED\nn\nn\n' ENV1="$HOME1/.baudbot/.env" expect_file_contains "advanced path writes Anthropic key" "$ENV1" "ANTHROPIC_API_KEY=sk-ant-test" expect_file_contains "advanced path writes SLACK_BOT_TOKEN" "$ENV1" "SLACK_BOT_TOKEN=xoxb-test" expect_file_contains "advanced path writes SLACK_APP_TOKEN" "$ENV1" "SLACK_APP_TOKEN=xapp-test" +expect_file_contains "advanced path writes SLACK_ALLOWED_USERS" "$ENV1" "SLACK_ALLOWED_USERS=U01ADVANCED" expect_file_not_contains "advanced path does not write OPENAI key" "$ENV1" "OPENAI_API_KEY=" # Test 2: Easy Slack path avoids socket-mode keys HOME2="$TMPDIR/easy" -run_config "$HOME2" '2\nsk-openai-test\n1\n\nn\nn\n' +run_config "$HOME2" '2\nsk-openai-test\n1\nU02EASY\nn\nn\n' ENV2="$HOME2/.baudbot/.env" expect_file_contains "easy path writes OpenAI key" "$ENV2" "OPENAI_API_KEY=sk-openai-test" expect_file_not_contains "easy path omits SLACK_BOT_TOKEN" "$ENV2" "SLACK_BOT_TOKEN=" 
expect_file_not_contains "easy path omits SLACK_APP_TOKEN" "$ENV2" "SLACK_APP_TOKEN=" +expect_file_contains "easy path writes SLACK_ALLOWED_USERS" "$ENV2" "SLACK_ALLOWED_USERS=U02EASY" # Test 3: Optional integration toggle prompts conditionally HOME3="$TMPDIR/kernel" -run_config "$HOME3" '3\ngem-key\n2\nxoxb-test\nxapp-test\n\ny\nkernel-key\nn\n' +run_config "$HOME3" '3\ngem-key\n2\nxoxb-test\nxapp-test\nU03KERNEL\ny\nkernel-key\nn\n' ENV3="$HOME3/.baudbot/.env" expect_file_contains "kernel enabled writes key" "$ENV3" "KERNEL_API_KEY=kernel-key" expect_file_not_contains "sentry skipped omits token" "$ENV3" "SENTRY_AUTH_TOKEN=" @@ -115,19 +117,23 @@ expect_exit_nonzero "fails when selected provider key is missing" "$HOME4" '1\n\ # Test 5: Re-run preserves existing selected LLM key when input is blank HOME5="$TMPDIR/rerun-keep-llm" write_existing_env "$HOME5" 'ANTHROPIC_API_KEY=sk-ant-existing\n' -run_config "$HOME5" '1\n\n1\n\nn\nn\n' +run_config "$HOME5" '1\n\n1\nU05KEEP\nn\nn\n' ENV5="$HOME5/.baudbot/.env" expect_file_contains "rerun keeps existing Anthropic key" "$ENV5" "ANTHROPIC_API_KEY=sk-ant-existing" # Test 6: Advanced Slack mode clears stale broker registration keys HOME6="$TMPDIR/clear-broker" write_existing_env "$HOME6" 'OPENAI_API_KEY=sk-old\nSLACK_BROKER_URL=https://broker.example.com\nSLACK_BROKER_WORKSPACE_ID=T0123\nSLACK_BROKER_PUBLIC_KEY=abc\n' -run_config "$HOME6" '2\nsk-openai-new\n2\nxoxb-new\nxapp-new\n\nn\nn\n' +run_config "$HOME6" '2\nsk-openai-new\n2\nxoxb-new\nxapp-new\nU06CLEAR\nn\nn\n' ENV6="$HOME6/.baudbot/.env" expect_file_not_contains "advanced clears broker URL" "$ENV6" "SLACK_BROKER_URL=" expect_file_not_contains "advanced clears broker workspace" "$ENV6" "SLACK_BROKER_WORKSPACE_ID=" expect_file_contains "advanced retains socket bot token" "$ENV6" "SLACK_BOT_TOKEN=xoxb-new" +# Test 7: SLACK_ALLOWED_USERS is required +HOME7="$TMPDIR/missing-slack-users" +expect_exit_nonzero "fails when Slack user IDs are missing" "$HOME7" 
'2\nsk-openai\n2\nxoxb-miss\nxapp-miss\n\nn\nn\n' + echo "" echo "Results: $PASS passed, $FAIL failed" diff --git a/bin/doctor.sh b/bin/doctor.sh index 50473a6..d1e7863 100755 --- a/bin/doctor.sh +++ b/bin/doctor.sh @@ -85,6 +85,12 @@ else fail "jq not found (required for shell JSON parsing)" fi +if command -v rg &>/dev/null; then + pass "rg is installed ($(command -v rg))" +else + fail "rg not found (install ripgrep)" +fi + if command -v docker &>/dev/null; then pass "docker is available" else @@ -101,6 +107,29 @@ else fail "gh cli not found" fi +check_claude_path() { + local probe_path probe_output current_user + probe_path="$BAUDBOT_HOME/.local/bin:/usr/local/bin:/usr/bin:/bin" + current_user="$(id -un 2>/dev/null || true)" + + if [ "$IS_ROOT" -eq 1 ] && command -v sudo &>/dev/null; then + probe_output="$(sudo -u "$BAUDBOT_AGENT_USER" env PATH="$probe_path" sh -lc 'command -v claude' 2>/dev/null || true)" + elif [ "$current_user" = "$BAUDBOT_AGENT_USER" ]; then + probe_output="$(env PATH="$probe_path:$PATH" sh -lc 'command -v claude' 2>/dev/null || true)" + else + probe_output="$(env PATH="$probe_path:$PATH" sh -lc 'command -v claude' 2>/dev/null || true)" + fi + + printf '%s\n' "$probe_output" | head -n1 +} + +CLAUDE_PATH="$(check_claude_path)" +if [ -n "$CLAUDE_PATH" ]; then + pass "claude code is installed ($CLAUDE_PATH)" +else + warn "claude code not found for $BAUDBOT_AGENT_USER (run: curl -fsSL https://claude.ai/install.sh | bash)" +fi + # ── Secrets ────────────────────────────────────────────────────────────────── echo "" @@ -233,7 +262,7 @@ if [ -f "$ENV_FILE" ]; then if grep -q '^SLACK_ALLOWED_USERS=.\+' "$ENV_FILE" 2>/dev/null; then pass "SLACK_ALLOWED_USERS is set" else - warn "SLACK_ALLOWED_USERS is not set (all workspace members allowed)" + fail "SLACK_ALLOWED_USERS is not set" fi else if [ "$IS_ROOT" -ne 1 ] && [ -d "$BAUDBOT_HOME/.config" ]; then diff --git a/bin/doctor.test.sh b/bin/doctor.test.sh new file mode 100644 index 0000000..dcc206c --- 
/dev/null +++ b/bin/doctor.test.sh @@ -0,0 +1,147 @@ +#!/bin/bash +# Focused tests for bin/doctor.sh dependency reporting. + +set -euo pipefail + +REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)" +DOCTOR_SCRIPT="$REPO_ROOT/bin/doctor.sh" + +TOTAL=0 +PASSED=0 +FAILED=0 + +run_test() { + local name="$1" + shift + local out + + TOTAL=$((TOTAL + 1)) + printf " %-45s " "$name" + + out="$(mktemp /tmp/baudbot-doctor-test-output.XXXXXX)" + if "$@" >"$out" 2>&1; then + echo "✓" + PASSED=$((PASSED + 1)) + else + echo "✗ FAILED" + tail -40 "$out" | sed 's/^/ /' + FAILED=$((FAILED + 1)) + fi + rm -f "$out" +} + +make_fake_commands() { + local fakebin="$1" + local home_dir="$2" + local claude_probe="$3" + mkdir -p "$fakebin" + mkdir -p "$home_dir/.local/bin" + + cat > "$fakebin/sudo" <<'EOF' +#!/bin/bash +if [ "${1:-}" = "-u" ] && [ "$#" -ge 3 ]; then + shift 2 +fi +exec "$@" +EOF + + cat > "$fakebin/curl" <<'EOF' +#!/bin/bash +echo "400" +EOF + + cat > "$fakebin/rg" <<'EOF' +#!/bin/bash +exit 0 +EOF + + if [ "$claude_probe" = "present" ]; then + cat > "$home_dir/.local/bin/claude" <<'EOF' +#!/bin/bash +echo "Claude Code fake binary" +EOF + + cat > "$home_dir/.local/bin/sh" <<'EOF' +#!/bin/bash +exec /bin/sh "$@" +EOF + else + cat > "$home_dir/.local/bin/sh" <<'EOF' +#!/bin/bash +if [ "${1:-}" = "-lc" ] && [ "${2:-}" = "command -v claude" ]; then + exit 1 +fi +exec /bin/sh "$@" +EOF + fi + + chmod +x "$fakebin/sudo" "$fakebin/curl" "$fakebin/rg" "$home_dir/.local/bin/sh" + if [ "$claude_probe" = "present" ]; then + chmod +x "$home_dir/.local/bin/claude" + fi +} + +run_doctor_capture() { + local tmp="$1" + local out="$2" + set +e + PATH="$tmp/fakebin:/usr/bin:/bin" \ + BAUDBOT_HOME="$tmp/home" \ + BAUDBOT_AGENT_USER="$(id -un)" \ + SUDO_USER="$(id -un)" \ + bash "$DOCTOR_SCRIPT" >"$out" 2>&1 + local rc=$?
+ set -e + [ "$rc" -ge 0 ] +} + +test_reports_claude_when_available() { + ( + set -euo pipefail + local tmp out + tmp="$(mktemp -d /tmp/baudbot-doctor-test.XXXXXX)" + out="$(mktemp /tmp/baudbot-doctor-out.XXXXXX)" + trap 'rm -rf "$tmp"; rm -f "$out"' EXIT + + mkdir -p "$tmp/home" + make_fake_commands "$tmp/fakebin" "$tmp/home" "present" + run_doctor_capture "$tmp" "$out" + + grep -q "rg is installed ($tmp/fakebin/rg)" "$out" + grep -q "claude code is installed ($tmp/home/.local/bin/claude)" "$out" + ) +} + +test_warns_when_claude_missing() { + ( + set -euo pipefail + local tmp out + tmp="$(mktemp -d /tmp/baudbot-doctor-test.XXXXXX)" + out="$(mktemp /tmp/baudbot-doctor-out.XXXXXX)" + trap 'rm -rf "$tmp"; rm -f "$out"' EXIT + + mkdir -p "$tmp/home" + make_fake_commands "$tmp/fakebin" "$tmp/home" "missing" + run_doctor_capture "$tmp" "$out" + + grep -q "rg is installed ($tmp/fakebin/rg)" "$out" + grep -q "claude code not found for" "$out" + ) +} + +echo "=== doctor cli tests ===" +echo "" + +run_test "reports Claude when available" test_reports_claude_when_available +run_test "warns when Claude is missing" test_warns_when_claude_missing + +echo "" +echo "=== $PASSED/$TOTAL passed, $FAILED failed ===" + +if [ "$FAILED" -gt 0 ]; then + exit 1 +fi diff --git a/bin/lib/baudbot-runtime.sh b/bin/lib/baudbot-runtime.sh index 6a508ba..6858b7f 100644 --- a/bin/lib/baudbot-runtime.sh +++ b/bin/lib/baudbot-runtime.sh @@ -194,6 +194,17 @@ pi_control_dir() { echo "/home/$agent_user/.pi/session-control" } +tmux_socket_dir() { + local agent_user="${1:-baudbot_agent}" + echo "/home/$agent_user/.tmux" +} + +run_agent_tmux() { + local agent_user="$1" + shift + sudo -u "$agent_user" env TMUX_TMPDIR="$(tmux_socket_dir "$agent_user")" tmux "$@" +} + pi_alias_to_uuid() { local alias_path="$1" local target @@ -306,7 +317,7 @@ cmd_logs() { fi echo "No systemd unit. 
Check tmux sessions:" - echo " sudo -u baudbot_agent tmux ls" + echo " sudo -u baudbot_agent env TMUX_TMPDIR=/home/baudbot_agent/.tmux tmux ls" } cmd_sessions() { @@ -317,7 +328,7 @@ cmd_sessions() { declare -A ALIASES echo -e "${BOLD}tmux sessions:${RESET}" - if sudo -u "$AGENT_USER" tmux ls 2>/dev/null; then + if run_agent_tmux "$AGENT_USER" ls 2>/dev/null; then : else echo " (none)" @@ -422,7 +433,8 @@ cmd_attach() { echo -e "${GREEN}Safe detach:${RESET} Ctrl+b, d ${DIM}(keeps agent running)${RESET}" echo "" pause_before_attach - exec sudo -u "$AGENT_USER" tmux attach-session -t "$tmux_target" + run_agent_tmux "$AGENT_USER" attach-session -t "$tmux_target" + exit $? } attach_pi_session() { @@ -434,7 +446,7 @@ cmd_attach() { echo -e " ${GREEN}Agent keeps running under systemd in the background.${RESET}" echo "" pause_before_attach - exec sudo -u "$AGENT_USER" bash -lc "export PATH='$AGENT_HOME/.varlock/bin:$AGENT_HOME/opt/node-v22.14.0-linux-x64/bin':\$PATH; cd ~; varlock run --path ~/.config/ -- pi --session '$pi_target'" + exec sudo -u "$AGENT_USER" bash -lc "export PATH='$AGENT_HOME/.local/bin:$AGENT_HOME/.varlock/bin:$AGENT_HOME/opt/node-v22.14.0-linux-x64/bin':\$PATH; cd ~; varlock run --path ~/.config/ -- pi --session '$pi_target'" } choose_tmux_target() { @@ -442,14 +454,14 @@ cmd_attach() { local first if [ -n "$requested" ]; then - if sudo -u "$AGENT_USER" tmux has-session -t "$requested" 2>/dev/null; then + if run_agent_tmux "$AGENT_USER" has-session -t "$requested" 2>/dev/null; then echo "$requested" return 0 fi return 1 fi - first=$(sudo -u "$AGENT_USER" tmux ls -F '#{session_name}' 2>/dev/null | head -1) + first=$(run_agent_tmux "$AGENT_USER" ls -F '#{session_name}' 2>/dev/null | head -1) [ -n "$first" ] || return 1 echo "$first" return 0 diff --git a/bin/lib/setup-common.sh b/bin/lib/setup-common.sh new file mode 100644 index 0000000..9efdde6 --- /dev/null +++ b/bin/lib/setup-common.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# Shared helpers for setup.sh + 
+bb_install_exec_wrapper() { + local wrapper_path="$1" + local target_exec="$2" + + if [ -z "$wrapper_path" ] || [ -z "$target_exec" ]; then + echo "bb_install_exec_wrapper: wrapper path and target executable are required" >&2 + return 1 + fi + + if [ ! -x "$target_exec" ]; then + echo "bb_install_exec_wrapper: target executable not found: $target_exec" >&2 + return 1 + fi + + local wrapper_dir tmp + wrapper_dir="$(dirname "$wrapper_path")" + mkdir -p "$wrapper_dir" + + tmp="$(mktemp "${wrapper_path}.tmp.XXXXXX")" + printf '#!/bin/sh\nexec %q "$@"\n' "$target_exec" > "$tmp" + chmod 755 "$tmp" + + if [ "$(id -u)" -eq 0 ]; then + chown root:root "$tmp" + fi + + rm -f "$wrapper_path" + mv "$tmp" "$wrapper_path" +} diff --git a/bin/lib/setup-common.test.sh b/bin/lib/setup-common.test.sh new file mode 100644 index 0000000..75a0e9a --- /dev/null +++ b/bin/lib/setup-common.test.sh @@ -0,0 +1,112 @@ +#!/bin/bash +# Tests for bin/lib/setup-common.sh + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +# shellcheck source=bin/lib/setup-common.sh +source "$SCRIPT_DIR/setup-common.sh" + +TOTAL=0 +PASSED=0 +FAILED=0 + +run_test() { + local name="$1" + shift + local out + + TOTAL=$((TOTAL + 1)) + printf " %-45s " "$name" + + out="$(mktemp /tmp/baudbot-setup-common-test-output.XXXXXX)" + if "$@" >"$out" 2>&1; then + echo "✓" + PASSED=$((PASSED + 1)) + else + echo "✗ FAILED" + tail -40 "$out" | sed 's/^/ /' + FAILED=$((FAILED + 1)) + fi + rm -f "$out" +} + +test_install_exec_wrapper_creates_executable() { + ( + set -euo pipefail + local tmp target wrapper output + tmp="$(mktemp -d /tmp/baudbot-setup-common-test.XXXXXX)" + trap 'rm -rf "$tmp"' EXIT + + target="$tmp/bin/target" + mkdir -p "$(dirname "$target")" + cat >"$target" <<'EOF' +#!/bin/sh +echo "target:$*" +EOF + chmod +x "$target" + + wrapper="$tmp/usr/local/bin/claude" + bb_install_exec_wrapper "$wrapper" "$target" + + [ -x "$wrapper" ] + output="$("$wrapper" --version)" + [ "$output" = "target:--version" ] 
+ ) +} + +test_install_exec_wrapper_replaces_symlink_without_touching_target() { + ( + set -euo pipefail + local tmp target wrapper output + tmp="$(mktemp -d /tmp/baudbot-setup-common-test.XXXXXX)" + trap 'rm -rf "$tmp"' EXIT + + target="$tmp/target-bin" + cat >"$target" <<'EOF' +#!/bin/sh +echo "target:$*" +EOF + chmod +x "$target" + + wrapper="$tmp/usr/local/bin/claude" + mkdir -p "$(dirname "$wrapper")" + ln -s "$target" "$wrapper" + + bb_install_exec_wrapper "$wrapper" "$target" + + [ ! -L "$wrapper" ] + output="$("$wrapper" ok)" + [ "$output" = "target:ok" ] + # Ensure target content itself wasn't replaced via symlink-following writes. + grep -q 'target:\$*' "$target" + ) +} + +test_install_exec_wrapper_fails_when_target_missing() { + ( + set -euo pipefail + local tmp wrapper + tmp="$(mktemp -d /tmp/baudbot-setup-common-test.XXXXXX)" + trap 'rm -rf "$tmp"' EXIT + wrapper="$tmp/usr/local/bin/claude" + + if bb_install_exec_wrapper "$wrapper" "$tmp/missing-target"; then + return 1 + fi + ) +} + +echo "=== setup-common tests ===" +echo "" + +run_test "install wrapper creates executable launcher" test_install_exec_wrapper_creates_executable +run_test "install wrapper replaces symlink safely" test_install_exec_wrapper_replaces_symlink_without_touching_target +run_test "install wrapper fails when target missing" test_install_exec_wrapper_fails_when_target_missing + +echo "" +echo "=== $PASSED/$TOTAL passed, $FAILED failed ===" + +if [ "$FAILED" -gt 0 ]; then + exit 1 +fi diff --git a/bin/remote.sh b/bin/remote.sh index 89af7b5..41620e0 100755 --- a/bin/remote.sh +++ b/bin/remote.sh @@ -550,11 +550,18 @@ remote_run_install_lifecycle() { local tailscale_auth_key="$5" local dry_run="$6" + local -a checkpoints + local checkpoint_line="" + while IFS= read -r checkpoint_line; do + [ -n "$checkpoint_line" ] || continue + checkpoints+=("$checkpoint_line") + done < <(remote_install_checkpoint_order "$mode") + while true; do local restart_from_beginning=0 local checkpoint="" - 
while IFS= read -r checkpoint; do + for checkpoint in "${checkpoints[@]}"; do [ -n "$checkpoint" ] || continue if remote_checkpoint_is_complete "$target" "$checkpoint"; then @@ -601,7 +608,7 @@ remote_run_install_lifecycle() { if [ "$restart_from_beginning" = "1" ]; then break fi - done < <(remote_install_checkpoint_order "$mode") + done if [ "$restart_from_beginning" = "1" ]; then continue diff --git a/bin/test.sh b/bin/test.sh index 8b5765c..4fbf550 100755 --- a/bin/test.sh +++ b/bin/test.sh @@ -79,6 +79,8 @@ run_shell_tests() { run "config flow" bash bin/config.test.sh run "deploy lib helpers" bash bin/lib/deploy-common.test.sh run "doctor lib helpers" bash bin/lib/doctor-common.test.sh + run "doctor cli" bash bin/doctor.test.sh + run "setup lib helpers" bash bin/lib/setup-common.test.sh run "remote common lib" bash bin/lib/remote-common.test.sh run "remote ssh lib" bash bin/lib/remote-ssh.test.sh run "remote hetzner lib" bash bin/lib/remote-hetzner.test.sh diff --git a/bin/uninstall.sh b/bin/uninstall.sh index 75d1c9b..4a22e74 100755 --- a/bin/uninstall.sh +++ b/bin/uninstall.sh @@ -222,7 +222,7 @@ fi # ── 6. 
Remove /usr/local/bin wrappers ─────────────────────────────────────── echo "=== Removing system wrappers ===" -for bin in baudbot-docker baudbot-safe-bash; do +for bin in baudbot-docker baudbot-safe-bash claude; do if [ -f "/usr/local/bin/$bin" ]; then run rm -f "/usr/local/bin/$bin" removed "/usr/local/bin/$bin" diff --git a/docs/operations.md b/docs/operations.md index def84cf..b728c0c 100644 --- a/docs/operations.md +++ b/docs/operations.md @@ -38,6 +38,13 @@ Provision with a pinned pi version (optional): BAUDBOT_PI_VERSION=0.52.12 baudbot install ``` +Authenticate Claude Code for CLI-backed dev-agents (optional, only needed if using `DEV_AGENT_BACKEND=claude-code`): + +```bash +sudo -u baudbot_agent claude auth login +sudo -u baudbot_agent claude auth status --text +``` + ## Remote install and repair `baudbot remote` is an opt-in operator workflow for remote provisioning/install/repair. It is local-CLI stateful (checkpoints + resume) and does not change normal runtime behavior unless you invoke it. 
diff --git a/install.sh b/install.sh index 91a4260..083aeb2 100755 --- a/install.sh +++ b/install.sh @@ -163,7 +163,7 @@ install_prereqs_ubuntu() { for attempt in $(seq 1 5); do if DEBIAN_FRONTEND=noninteractive apt-get -o DPkg::Lock::Timeout=120 update -qq \ - && DEBIAN_FRONTEND=noninteractive apt-get -o DPkg::Lock::Timeout=120 install -y -qq git curl tmux iptables docker.io gh jq sudo 2>&1 | tail -3; then + && DEBIAN_FRONTEND=noninteractive apt-get -o DPkg::Lock::Timeout=120 install -y -qq git curl tmux iptables docker.io gh jq ripgrep sudo 2>&1 | tail -3; then return 0 fi @@ -179,10 +179,10 @@ install_prereqs_ubuntu() { } install_prereqs_arch() { - pacman -Syu --noconfirm --needed git curl tmux iptables docker github-cli jq sudo 2>&1 | tail -5 + pacman -Syu --noconfirm --needed git curl tmux iptables docker github-cli jq ripgrep sudo 2>&1 | tail -5 } -info "Installing: git, curl, tmux, iptables, docker, gh, jq, sudo" +info "Installing: git, curl, tmux, iptables, docker, gh, jq, ripgrep, sudo" "install_prereqs_$DISTRO" info "Prerequisites installed" diff --git a/pi/extensions/kernel/index.ts b/pi/extensions/kernel/index.ts index 54bdb79..707b571 100644 --- a/pi/extensions/kernel/index.ts +++ b/pi/extensions/kernel/index.ts @@ -17,19 +17,40 @@ import type { ExtensionAPI, ExtensionContext } from "@mariozechner/pi-coding-agent"; import { Type, } from "@sinclair/typebox"; import { StringEnum } from "@mariozechner/pi-ai"; -import Kernel from "@onkernel/sdk"; // --------------------------------------------------------------------------- // Client // --------------------------------------------------------------------------- -function getClient(): Kernel { +let kernelCtorPromise: Promise<any> | null = null; + +async function loadKernelCtor(): Promise<any> { + if (!kernelCtorPromise) { + kernelCtorPromise = import("@onkernel/sdk") + .then((mod) => mod.default ?? 
mod) + .catch(() => null); + } + + const ctor = await kernelCtorPromise; + if (!ctor) { + throw new Error( + "Kernel SDK dependency is missing. Install it with: " + + "cd ~/.pi/agent/extensions/kernel && npm install --omit=dev", + ); + } + + return ctor; +} + +async function getClient(): Promise<any> { const apiKey = process.env.KERNEL_API_KEY; if (!apiKey) { throw new Error( "KERNEL_API_KEY environment variable is not set. Get one at https://app.onkernel.com", ); } + + const Kernel = await loadKernelCtor(); return new Kernel({ apiKey }); } @@ -94,7 +115,7 @@ export default function (pi: ExtensionAPI) { ), }), async execute(_id, params, signal) { - const client = getClient(); + const client = await getClient(); switch (params.action) { case "create": { @@ -201,7 +222,7 @@ export default function (pi: ExtensionAPI) { ), }), async execute(_id, params, signal) { - const client = getClient(); + const client = await getClient(); const sid = params.session_id ?? activeBrowserId; if (!sid) { return { @@ -262,7 +283,7 @@ export default function (pi: ExtensionAPI) { ), }), async execute(_id, params, signal) { - const client = getClient(); + const client = await getClient(); const sid = params.session_id ?? activeBrowserId; if (!sid) { return { @@ -321,7 +342,7 @@ export default function (pi: ExtensionAPI) { scroll_y: Type.Optional(Type.Number({ description: "Vertical scroll amount" })), }), async execute(_id, params, signal) { - const client = getClient(); + const client = await getClient(); const sid = params.session_id ?? 
activeBrowserId; if (!sid) { return { @@ -430,7 +451,7 @@ export default function (pi: ExtensionAPI) { return; } - const client = getClient(); + const client = await getClient(); try { const browsers: any[] = []; diff --git a/pi/skills/control-agent/HEARTBEAT.md b/pi/skills/control-agent/HEARTBEAT.md index 4ae0cb1..414a5d8 100644 --- a/pi/skills/control-agent/HEARTBEAT.md +++ b/pi/skills/control-agent/HEARTBEAT.md @@ -1,3 +1,7 @@ +--- +description: Control-agent periodic heartbeat checklist for runtime health checks. +--- + # Heartbeat Checklist Check each item and take action only if something is wrong. diff --git a/pi/skills/control-agent/SKILL.md b/pi/skills/control-agent/SKILL.md index c4782de..30ffd22 100644 --- a/pi/skills/control-agent/SKILL.md +++ b/pi/skills/control-agent/SKILL.md @@ -231,7 +231,7 @@ Pick the model based on which API key is available (check env vars in this order | API key | Model | |---------|-------| | `ANTHROPIC_API_KEY` | `anthropic/claude-opus-4-6` | -| `OPENAI_API_KEY` | `openai/gpt-5.2-codex` | +| `OPENAI_API_KEY` | `openai/gpt-5-chat-latest` | | `GEMINI_API_KEY` | `google/gemini-3-pro-preview` | | `OPENCODE_ZEN_API_KEY` | `opencode-zen/claude-opus-4-6` | @@ -342,7 +342,7 @@ Pick the model based on which API key is available (check env vars in this order | API key | Model | |---------|-------| | `ANTHROPIC_API_KEY` | `anthropic/claude-haiku-4-5` | -| `OPENAI_API_KEY` | `openai/gpt-5-mini` | +| `OPENAI_API_KEY` | `openai/gpt-4.1-mini` | | `GEMINI_API_KEY` | `google/gemini-3-flash-preview` | | `OPENCODE_ZEN_API_KEY` | `opencode-zen/claude-haiku-4-5` | @@ -477,7 +477,7 @@ The sentry-agent triages Sentry alerts and investigates critical issues via the | API key | Model | |---------|-------| | `ANTHROPIC_API_KEY` | `anthropic/claude-haiku-4-5` | -| `OPENAI_API_KEY` | `openai/gpt-5-mini` | +| `OPENAI_API_KEY` | `openai/gpt-4.1-mini` | | `GEMINI_API_KEY` | `google/gemini-3-flash-preview` | | `OPENCODE_ZEN_API_KEY` | 
`opencode-zen/claude-haiku-4-5` | diff --git a/pi/skills/control-agent/startup-cleanup.sh b/pi/skills/control-agent/startup-cleanup.sh index 2941807..5e68519 100755 --- a/pi/skills/control-agent/startup-cleanup.sh +++ b/pi/skills/control-agent/startup-cleanup.sh @@ -77,7 +77,12 @@ fi # then Socket Mode when SLACK_BOT_TOKEN + SLACK_APP_TOKEN are present. # If neither mode is configured, skip bridge startup. BRIDGE_SCRIPT="" -if [ -f "$HOME/runtime/slack-bridge/broker-bridge.mjs" ] && varlock run --path "$HOME/.config/" -- sh -c ' +BRIDGE_DIR="$HOME/runtime/slack-bridge" +if [ ! -d "$BRIDGE_DIR" ] && [ -d "/opt/baudbot/current/slack-bridge" ]; then + BRIDGE_DIR="/opt/baudbot/current/slack-bridge" +fi + +if [ -f "$BRIDGE_DIR/broker-bridge.mjs" ] && varlock run --path "$HOME/.config/" -- sh -c ' test -n "$SLACK_BROKER_URL" && test -n "$SLACK_BROKER_WORKSPACE_ID" && test -n "$SLACK_BROKER_SERVER_PRIVATE_KEY" && @@ -99,10 +104,17 @@ if [ -z "$BRIDGE_SCRIPT" ]; then exit 0 fi +if [ ! -d "$BRIDGE_DIR" ]; then + echo "Bridge directory not found (expected $HOME/runtime/slack-bridge or /opt/baudbot/current/slack-bridge); skipping bridge startup." + echo "" + echo "=== Cleanup Complete ===" + exit 0 +fi + # Start fresh slack-bridge -echo "Starting slack-bridge ($BRIDGE_SCRIPT) with PI_SESSION_ID=$MY_UUID..." +echo "Starting slack-bridge ($BRIDGE_SCRIPT) from $BRIDGE_DIR with PI_SESSION_ID=$MY_UUID..." 
tmux new-session -d -s slack-bridge \ - "unset PKG_EXECPATH; export PATH=\$HOME/.varlock/bin:\$HOME/opt/node-v22.14.0-linux-x64/bin:\$PATH && export PI_SESSION_ID=$MY_UUID && cd ~/runtime/slack-bridge && exec varlock run --path ~/.config/ -- node $BRIDGE_SCRIPT" + "unset PKG_EXECPATH; export PATH=\$HOME/.local/bin:\$HOME/.varlock/bin:\$HOME/opt/node-v22.14.0-linux-x64/bin:\$PATH && export PI_SESSION_ID=$MY_UUID && cd $BRIDGE_DIR && while true; do varlock run --path ~/.config/ -- node $BRIDGE_SCRIPT; echo 'Bridge exited (\$?), restarting in 5s...'; sleep 5; done" # Wait for bridge to come up sleep 3 diff --git a/setup.sh b/setup.sh index 1688af9..a5b6768 100755 --- a/setup.sh +++ b/setup.sh @@ -8,7 +8,7 @@ # # This script: # 1. Creates the baudbot_agent user -# 2. Installs Node.js and pi +# 2. Installs Node.js, pi, and Claude Code # 3. Sets up SSH key for GitHub # 4. Installs the Docker wrapper # 5. Installs the safe bash wrapper (tool deny list) @@ -61,8 +61,19 @@ fi BAUDBOT_HOME="/home/baudbot_agent" # Source repo auto-detected from this script's location (can live anywhere) REPO_DIR="$(cd "$(dirname "$0")" && pwd)" +# shellcheck source=bin/lib/setup-common.sh +source "$REPO_DIR/bin/lib/setup-common.sh" NODE_VERSION="22.14.0" PI_VERSION="${BAUDBOT_PI_VERSION:-0.52.12}" +NODE_BIN="$BAUDBOT_HOME/opt/node-v$NODE_VERSION-linux-x64/bin" +CLAUDE_INSTALL_SCRIPT_URL="${CLAUDE_INSTALL_SCRIPT_URL:-https://claude.ai/install.sh}" +CLAUDE_INSTALL_TARGET="${CLAUDE_INSTALL_TARGET:-}" + +if [ -n "$CLAUDE_INSTALL_TARGET" ] && [[ ! "$CLAUDE_INSTALL_TARGET" =~ ^(stable|latest|[0-9]+\.[0-9]+\.[0-9]+(-[^[:space:]]+)?)$ ]]; then + echo "❌ Invalid CLAUDE_INSTALL_TARGET: $CLAUDE_INSTALL_TARGET" >&2 + echo " Expected: stable, latest, or semver (e.g. 2.1.50)" >&2 + exit 1 +fi # Work from a neutral directory — sudo -u baudbot_agent inherits CWD, and # git/find fail if CWD is a directory the agent can't access (e.g. /root). 
@@ -122,10 +133,30 @@ else fi echo "=== Installing pi $PI_VERSION ===" -NODE_BIN="$BAUDBOT_HOME/opt/node-v$NODE_VERSION-linux-x64/bin" sudo -u baudbot_agent env PATH="$NODE_BIN:$PATH" \ npm install -g "@mariozechner/pi-coding-agent@$PI_VERSION" +echo "=== Installing Claude Code ===" +CLAUDE_BIN="$BAUDBOT_HOME/.local/bin/claude" +if [ ! -x "$CLAUDE_BIN" ]; then + echo "Installing via official script: $CLAUDE_INSTALL_SCRIPT_URL" + if [ -n "$CLAUDE_INSTALL_TARGET" ]; then + sudo -u baudbot_agent env PATH="$NODE_BIN:$PATH" bash -c "curl -fsSL '$CLAUDE_INSTALL_SCRIPT_URL' | bash -s -- '$CLAUDE_INSTALL_TARGET'" + else + sudo -u baudbot_agent env PATH="$NODE_BIN:$PATH" bash -c "curl -fsSL '$CLAUDE_INSTALL_SCRIPT_URL' | bash" + fi +else + echo "Claude Code already installed, skipping installer" +fi + +if [ ! -x "$CLAUDE_BIN" ]; then + echo "❌ Claude Code binary not found at $CLAUDE_BIN after install" >&2 + exit 1 +fi + +bb_install_exec_wrapper "/usr/local/bin/claude" "$CLAUDE_BIN" +echo "Installed /usr/local/bin/claude wrapper (works with sudo secure_path)" + echo "=== Configuring git identity ===" GIT_USER_NAME="${GIT_USER_NAME:-baudbot-agent}" GIT_USER_EMAIL="${GIT_USER_EMAIL:-baudbot-agent@users.noreply.github.com}" @@ -156,6 +187,9 @@ for repo in "$BAUDBOT_HOME"/workspace/*/; do done echo "=== Adding PATH to bashrc ===" +if ! grep -q '\.local/bin' "$BAUDBOT_HOME/.bashrc"; then + sudo -u baudbot_agent bash -c "echo 'export PATH=\$HOME/.local/bin:\$PATH' >> ~/.bashrc" +fi if ! 
grep -q "node-v$NODE_VERSION" "$BAUDBOT_HOME/.bashrc"; then sudo -u baudbot_agent bash -c "echo 'export PATH=\$HOME/opt/node-v$NODE_VERSION-linux-x64/bin:\$PATH' >> ~/.bashrc" fi @@ -229,7 +263,6 @@ sudo -u baudbot_agent bash -c ' echo "=== Installing extension dependencies ===" # npm install runs in source (admin-owned) then deploy copies to runtime -NODE_BIN="$BAUDBOT_HOME/opt/node-v$NODE_VERSION-linux-x64/bin" export PATH="$NODE_BIN:$PATH" while IFS= read -r dir; do ext_name="$(basename "$dir")" @@ -337,7 +370,9 @@ echo " 3. Add SSH key to your agent's GitHub account:" echo " cat $BAUDBOT_HOME/.ssh/id_ed25519.pub" echo " 4. Authenticate GitHub CLI:" echo " sudo -u baudbot_agent gh auth login" -echo " 5. Log out and back in for group membership to take effect" +echo " 5. Authenticate Claude Code (recommended for claude-code backend):" +echo " sudo -u baudbot_agent claude auth login" +echo " 6. Log out and back in for group membership to take effect" echo "" echo "Commands:" echo " baudbot start Start the agent" diff --git a/slack-bridge/bridge.mjs b/slack-bridge/bridge.mjs index a719eff..c85b371 100644 --- a/slack-bridge/bridge.mjs +++ b/slack-bridge/bridge.mjs @@ -114,26 +114,48 @@ function getThreadId(channel, threadTs) { // ── Session Socket ────────────────────────────────────────────────────────── function findSessionSocket(targetId) { + const resolveAliasSocket = (aliasName) => { + const aliasPath = path.join(SOCKET_DIR, `${aliasName}.alias`); + if (!fs.existsSync(aliasPath)) return null; + try { + const target = fs.readlinkSync(aliasPath); + const resolved = path.resolve(SOCKET_DIR, target); + if (resolved.endsWith(".sock") && fs.existsSync(resolved)) return resolved; + } catch { + // Ignore alias read errors and continue with fallback discovery. 
+ } + return null; + }; + if (targetId) { // Try as UUID first const sock = path.join(SOCKET_DIR, `${targetId}.sock`); if (fs.existsSync(sock)) return sock; + // Try as direct alias (.alias -> .sock) + const aliasSock = resolveAliasSocket(targetId); + if (aliasSock) return aliasSock; + // Try as session name — check the alias symlinks const aliasDir = path.join(SOCKET_DIR, "by-name"); if (fs.existsSync(aliasDir)) { - const aliasSock = path.join(aliasDir, `${targetId}.sock`); - if (fs.existsSync(aliasSock)) return fs.realpathSync(aliasSock); + const byNameSock = path.join(aliasDir, `${targetId}.sock`); + if (fs.existsSync(byNameSock)) return fs.realpathSync(byNameSock); } // Fallback: scan sockets and try to match by name via RPC throw new Error(`Socket not found for session "${targetId}". Use the full session UUID from: ls ~/.pi/session-control/`); } - // Auto-detect: pick the first available socket + + // Auto-detect: prefer control-agent alias when present. + const controlAgentSock = resolveAliasSocket("control-agent"); + if (controlAgentSock) return controlAgentSock; + + // Otherwise pick the first available socket if unambiguous. const socks = fs.readdirSync(SOCKET_DIR).filter((f) => f.endsWith(".sock")); if (socks.length === 0) throw new Error("No pi sessions with control sockets found"); if (socks.length === 1) return path.join(SOCKET_DIR, socks[0]); - console.log("Multiple sessions found. Set PI_SESSION_ID to pick one:"); + console.log("Multiple sessions found and no control-agent alias. 
Set PI_SESSION_ID to pick one:"); socks.forEach((s) => console.log(` ${s.replace(".sock", "")}`)); throw new Error("Ambiguous — multiple sessions found"); } diff --git a/slack-bridge/broker-bridge.mjs b/slack-bridge/broker-bridge.mjs index c202edf..c3654b9 100755 --- a/slack-bridge/broker-bridge.mjs +++ b/slack-bridge/broker-bridge.mjs @@ -245,19 +245,38 @@ function sleep(ms) { } function findSessionSocket(targetId) { + const resolveAliasSocket = (aliasName) => { + const aliasPath = path.join(SOCKET_DIR, `${aliasName}.alias`); + if (!fs.existsSync(aliasPath)) return null; + try { + const target = fs.readlinkSync(aliasPath); + const resolved = path.resolve(SOCKET_DIR, target); + if (resolved.endsWith(".sock") && fs.existsSync(resolved)) return resolved; + } catch { + // Ignore alias read errors and continue with fallback discovery. + } + return null; + }; + if (targetId) { const sock = path.join(SOCKET_DIR, `${targetId}.sock`); if (fs.existsSync(sock)) return sock; + const aliasSock = resolveAliasSocket(targetId); + if (aliasSock) return aliasSock; + const aliasDir = path.join(SOCKET_DIR, "by-name"); if (fs.existsSync(aliasDir)) { - const aliasSock = path.join(aliasDir, `${targetId}.sock`); - if (fs.existsSync(aliasSock)) return fs.realpathSync(aliasSock); + const byNameSock = path.join(aliasDir, `${targetId}.sock`); + if (fs.existsSync(byNameSock)) return fs.realpathSync(byNameSock); } throw new Error(`Socket not found for session "${targetId}".`); } + const controlAgentSock = resolveAliasSocket("control-agent"); + if (controlAgentSock) return controlAgentSock; + const socks = fs.readdirSync(SOCKET_DIR).filter((f) => f.endsWith(".sock")); if (socks.length === 0) throw new Error("No pi sessions with control sockets found"); if (socks.length === 1) return path.join(SOCKET_DIR, socks[0]); @@ -1034,4 +1053,4 @@ async function startPollLoop() { logInfo(` allowed users: ${ALLOWED_USERS.length || "all"}`); logInfo(` pi socket: ${socketPath || "(not found — will retry on 
message)"}`); await startPollLoop(); -})(); \ No newline at end of file +})(); diff --git a/start.sh b/start.sh index c47289c..88ff7be 100755 --- a/start.sh +++ b/start.sh @@ -14,7 +14,12 @@ set -euo pipefail cd ~ # Set PATH -export PATH="$HOME/.varlock/bin:$HOME/opt/node-v22.14.0-linux-x64/bin:$PATH" +export PATH="$HOME/.local/bin:$HOME/.varlock/bin:$HOME/opt/node-v22.14.0-linux-x64/bin:$PATH" + +# Keep tmux server sockets outside /tmp so systemd PrivateTmp restarts don't strand sessions. +export TMUX_TMPDIR="$HOME/.tmux" +mkdir -p "$TMUX_TMPDIR" +chmod 700 "$TMUX_TMPDIR" # Work around varlock telemetry config crash by opting out at runtime. # This avoids loading anonymousId from user config and keeps startup deterministic. @@ -82,16 +87,25 @@ elif [ -n "${SLACK_BOT_TOKEN:-}" ] && [ -n "${SLACK_APP_TOKEN:-}" ]; then fi if [ -n "$BRIDGE_SCRIPT" ]; then - tmux kill-session -t slack-bridge 2>/dev/null || true - echo "Starting Slack bridge ($BRIDGE_SCRIPT)..." - tmux new-session -d -s slack-bridge \ - "export PATH=$HOME/.varlock/bin:$HOME/opt/node-v22.14.0-linux-x64/bin:\$PATH && \ - cd ~/runtime/slack-bridge && \ - while true; do \ - varlock run --path ~/.config/ -- node $BRIDGE_SCRIPT; \ - echo '⚠️ Bridge exited (\$?), restarting in 5s...'; \ - sleep 5; \ - done" + BRIDGE_DIR="$HOME/runtime/slack-bridge" + if [ ! -d "$BRIDGE_DIR" ] && [ -d "/opt/baudbot/current/slack-bridge" ]; then + BRIDGE_DIR="/opt/baudbot/current/slack-bridge" + fi + + if [ -d "$BRIDGE_DIR" ]; then + tmux kill-session -t slack-bridge 2>/dev/null || true + echo "Starting Slack bridge ($BRIDGE_SCRIPT) from $BRIDGE_DIR..." 
+ tmux new-session -d -s slack-bridge \ + "export PATH=$HOME/.local/bin:$HOME/.varlock/bin:$HOME/opt/node-v22.14.0-linux-x64/bin:\$PATH && \ + cd $BRIDGE_DIR && \ + while true; do \ + varlock run --path ~/.config/ -- node $BRIDGE_SCRIPT; \ + echo '⚠️ Bridge exited (\$?), restarting in 5s...'; \ + sleep 5; \ + done" + else + echo "⚠️ Slack bridge configured but no bridge directory found; skipping bridge startup." + fi fi # Set session name (read by auto-name.ts extension) @@ -101,7 +115,8 @@ export PI_SESSION_NAME="control-agent" if [ -n "${ANTHROPIC_API_KEY:-}" ]; then MODEL="anthropic/claude-opus-4-6" elif [ -n "${OPENAI_API_KEY:-}" ]; then - MODEL="openai/gpt-5.2-codex" + # Use a non-reasoning OpenAI model to avoid Responses API store=false reasoning-item failures. + MODEL="openai/gpt-5-chat-latest" elif [ -n "${GEMINI_API_KEY:-}" ]; then MODEL="google/gemini-3-pro-preview" elif [ -n "${OPENCODE_ZEN_API_KEY:-}" ]; then diff --git a/test/broker-bridge.integration.test.mjs b/test/broker-bridge.integration.test.mjs index 53db33e..d2946f0 100644 --- a/test/broker-bridge.integration.test.mjs +++ b/test/broker-bridge.integration.test.mjs @@ -4,7 +4,7 @@ import { spawn } from "node:child_process"; import net from "node:net"; import path from "node:path"; import { fileURLToPath } from "node:url"; -import { existsSync, mkdtempSync, mkdirSync, rmSync } from "node:fs"; +import { existsSync, mkdtempSync, mkdirSync, rmSync, symlinkSync } from "node:fs"; import { tmpdir } from "node:os"; import sodium from "libsodium-wrappers-sumo"; import { @@ -463,6 +463,214 @@ describe("broker pull bridge semi-integration", () => { expect(sendPayloads.some((payload) => payload.action === "reactions.add")).toBe(false); }); + it("prefers control-agent alias when multiple pi session sockets exist", async () => { + await sodium.ready; + + const testFileDir = path.dirname(fileURLToPath(import.meta.url)); + const repoRoot = path.dirname(testFileDir); + const bridgePath = path.join(repoRoot, 
"slack-bridge", "broker-bridge.mjs"); + const bridgeCwd = path.join(repoRoot, "slack-bridge"); + + const tempHome = createBridgeHome(tempDirs); + const sessionDir = path.join(tempHome, ".pi", "session-control"); + mkdirSync(sessionDir, { recursive: true }); + + const controlSessionId = "22222222-2222-2222-2222-222222222222"; + const sentrySessionId = "33333333-3333-3333-3333-333333333333"; + const controlSocketFile = path.join(sessionDir, `${controlSessionId}.sock`); + const sentrySocketFile = path.join(sessionDir, `${sentrySessionId}.sock`); + symlinkSync(`${controlSessionId}.sock`, path.join(sessionDir, "control-agent.alias")); + + const controlCommands = []; + const sentryCommands = []; + + const controlSocket = net.createServer((conn) => { + let buffer = ""; + conn.on("data", (chunk) => { + buffer += chunk.toString(); + const lines = buffer.split("\n"); + buffer = lines.pop() || ""; + for (const line of lines) { + if (!line.trim()) continue; + const msg = JSON.parse(line); + controlCommands.push(msg); + if (msg.type === "send") { + conn.write(`${JSON.stringify({ type: "response", command: "send", success: true })}\n`); + } + } + }); + }); + if (!(await listenUnixSocketOrUnavailable(controlSocket, controlSocketFile))) return; + servers.push(controlSocket); + + const sentrySocket = net.createServer((conn) => { + let buffer = ""; + conn.on("data", (chunk) => { + buffer += chunk.toString(); + const lines = buffer.split("\n"); + buffer = lines.pop() || ""; + for (const line of lines) { + if (!line.trim()) continue; + const msg = JSON.parse(line); + sentryCommands.push(msg); + if (msg.type === "send") { + conn.write(`${JSON.stringify({ type: "response", command: "send", success: true })}\n`); + } + } + }); + }); + if (!(await listenUnixSocketOrUnavailable(sentrySocket, sentrySocketFile))) return; + servers.push(sentrySocket); + + const serverBox = sodium.crypto_box_keypair(); + const brokerBox = sodium.crypto_box_keypair(); + const brokerSign = 
sodium.crypto_sign_keypair(); + const serverSignSeed = sodium.randombytes_buf(sodium.crypto_sign_SEEDBYTES); + + const workspaceId = "T123BROKER"; + const eventPayload = { + type: "event_callback", + event: { + type: "app_mention", + user: "U_ALLOWED", + channel: "C123", + ts: "1730000000.000200", + text: "<@U_BOT> route to control alias", + }, + }; + + const encrypted = sodium.crypto_box_seal( + Buffer.from(JSON.stringify(eventPayload)), + serverBox.publicKey, + ); + const brokerTimestamp = Math.floor(Date.now() / 1000); + const encryptedB64 = toBase64(encrypted); + const brokerSignature = toBase64( + sodium.crypto_sign_detached( + canonicalizeEnvelope(workspaceId, brokerTimestamp, encryptedB64), + brokerSign.privateKey, + ), + ); + + let pullCount = 0; + let ackPayload = null; + const sendPayloads = []; + + const broker = createServer(async (req, res) => { + if (req.method === "POST" && req.url === "/api/inbox/pull") { + pullCount += 1; + const messages = pullCount === 1 + ? [{ + message_id: "m-alias-1", + workspace_id: workspaceId, + encrypted: encryptedB64, + broker_timestamp: brokerTimestamp, + broker_signature: brokerSignature, + }] + : []; + + res.writeHead(200, { "Content-Type": "application/json" }); + res.end(JSON.stringify({ ok: true, messages })); + return; + } + + if (req.method === "POST" && req.url === "/api/inbox/ack") { + let raw = ""; + for await (const chunk of req) raw += chunk; + ackPayload = JSON.parse(raw); + res.writeHead(200, { "Content-Type": "application/json" }); + res.end(JSON.stringify({ ok: true, acked: ackPayload.message_ids?.length ?? 
0 })); + return; + } + + if (req.method === "POST" && req.url === "/api/send") { + let raw = ""; + for await (const chunk of req) raw += chunk; + sendPayloads.push(JSON.parse(raw)); + res.writeHead(200, { "Content-Type": "application/json" }); + res.end(JSON.stringify({ ok: true, ts: "1234.5678" })); + return; + } + + res.writeHead(404, { "Content-Type": "application/json" }); + res.end(JSON.stringify({ ok: false, error: "not found" })); + }); + + if (!(await listenLocalhostOrUnavailable(broker, 0))) return; + servers.push(broker); + + const address = broker.address(); + if (!address || typeof address === "string") { + throw new Error("failed to get broker test server address"); + } + const brokerUrl = `http://127.0.0.1:${address.port}`; + + let bridgeStdout = ""; + let bridgeStderr = ""; + let bridgeExit = null; + + const bridge = spawn("node", [bridgePath], { + cwd: bridgeCwd, + env: { + ...process.env, + HOME: tempHome, + SLACK_BROKER_URL: brokerUrl, + SLACK_BROKER_WORKSPACE_ID: workspaceId, + SLACK_BROKER_SERVER_PRIVATE_KEY: toBase64(serverBox.privateKey), + SLACK_BROKER_SERVER_PUBLIC_KEY: toBase64(serverBox.publicKey), + SLACK_BROKER_SERVER_SIGNING_PRIVATE_KEY: toBase64(serverSignSeed), + SLACK_BROKER_PUBLIC_KEY: toBase64(brokerBox.publicKey), + SLACK_BROKER_SIGNING_PUBLIC_KEY: toBase64(brokerSign.publicKey), + SLACK_ALLOWED_USERS: "U_ALLOWED", + SLACK_BROKER_POLL_INTERVAL_MS: "50", + BRIDGE_API_PORT: "0", + }, + stdio: ["ignore", "pipe", "pipe"], + }); + + bridge.stdout.on("data", (chunk) => { + bridgeStdout += chunk.toString(); + }); + bridge.stderr.on("data", (chunk) => { + bridgeStderr += chunk.toString(); + }); + + const bridgeExited = new Promise((_, reject) => { + bridge.on("error", (err) => { + if (ackPayload !== null) return; + reject(new Error(`bridge spawn error: ${err.message}; stdout=${bridgeStdout}; stderr=${bridgeStderr}`)); + }); + bridge.on("exit", (code, signal) => { + bridgeExit = { code, signal }; + if (ackPayload !== null) return; + 
reject(new Error(`bridge exited early: code=${code} signal=${signal}; stdout=${bridgeStdout}; stderr=${bridgeStderr}`)); + }); + }); + + children.push(bridge); + + const completeWait = waitFor( + () => ackPayload !== null && controlCommands.length > 0, + 12_000, + 50, + `timeout waiting for alias-route forward+ack; pullCount=${pullCount}; control=${JSON.stringify(controlCommands)}; sentry=${JSON.stringify(sentryCommands)}; sendPayloads=${JSON.stringify(sendPayloads)}; exit=${JSON.stringify(bridgeExit)}; stdout=${bridgeStdout}; stderr=${bridgeStderr}`, + ); + + await Promise.race([completeWait, bridgeExited]); + + expect(ackPayload.workspace_id).toBe(workspaceId); + expect(ackPayload.message_ids).toContain("m-alias-1"); + + expect(controlCommands.length).toBe(1); + expect(controlCommands[0].type).toBe("send"); + expect(controlCommands[0].mode).toBe("steer"); + expect(sentryCommands.length).toBe(0); + + expect(sendPayloads.some((payload) => payload.action === "chat.postMessage")).toBe(false); + expect(sendPayloads.some((payload) => payload.action === "reactions.add")).toBe(false); + expect(bridgeStdout).not.toContain("Ambiguous"); + }); + it("uses protocol-versioned inbox.pull signatures with wait_seconds by default", async () => { await sodium.ready; diff --git a/test/shell-scripts.test.mjs b/test/shell-scripts.test.mjs index a325959..b58c14c 100644 --- a/test/shell-scripts.test.mjs +++ b/test/shell-scripts.test.mjs @@ -43,6 +43,14 @@ describe("shell script test suites", () => { expect(() => runScript("bin/lib/doctor-common.test.sh")).not.toThrow(); }); + it("doctor cli", () => { + expect(() => runScript("bin/doctor.test.sh")).not.toThrow(); + }); + + it("setup helpers", () => { + expect(() => runScript("bin/lib/setup-common.test.sh")).not.toThrow(); + }); + it("remote common helpers", () => { expect(() => runScript("bin/lib/remote-common.test.sh")).not.toThrow(); });