From 09ea2f416adf9f02649587598c7a5c2ba0be554f Mon Sep 17 00:00:00 2001
From: Dmitry Kondratyev <kondratyev.d.95@gmail.com>
Date: Tue, 3 Feb 2026 23:20:52 -0500
Subject: [PATCH 01/25] Add CI campaign source-of-truth plan

---
 .codex/CI_PLAN.md | 106 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 106 insertions(+)
 create mode 100644 .codex/CI_PLAN.md

diff --git a/.codex/CI_PLAN.md b/.codex/CI_PLAN.md
new file mode 100644
index 00000000..76463f3f
--- /dev/null
+++ b/.codex/CI_PLAN.md
@@ -0,0 +1,106 @@
+# CI/CD Campaign Plan (Current State)
+
+## Mission and Success Criteria
+Deliver exactly one draft PR from `codex/ci` to `main` with minimal CI/CD hardening that:
+- converts formatter-based CI to check-only behavior,
+- adds advisory-first integrity/deploy/security coverage,
+- keeps one source of truth in this file,
+- preserves safe daily branch sync (`main` merged into `codex/ci`, no force-push).
+
+Success means:
+- `.github/workflows/lint-*.yml` workflows are check-only and run on every push + pull_request,
+- new workflows exist for integrity, GitOps deployability, and nightly advisory security,
+- optional repo-quality workflow is selected and included,
+- README shows A-E category badges,
+- no changes touch out-of-scope paths.
+
+## In-Scope / Out-of-Scope Paths
+In scope:
+- `.github/**`
+- `apps/**`
+- `deploy/**`
+- `docker/**` (except exclusions)
+- `README.md`
+- `.codex/CI_PLAN.md`
+
+Out of scope:
+- `docker/dask-gateway-server/**`
+- `docs/**`
+- `docs/source/demos/**`
+- `docker/kaniko-build-jobs/**`
+- `slurm/**`
+- `.cursor/**`
+
+## Target Check Architecture
+### A) CI System Integrity (advisory)
+- Workflow: `.github/workflows/ci-workflow-integrity.yml`
+- Checks: `actionlint` + workflow YAML parse.
+- Risk mapped: malformed workflows, invalid action definitions, skipped CI due syntax/runtime issues.
+
+### B) Repo-Owned Code Quality / Tests (advisory additions)
+- Workflows:
+  - `.github/workflows/lint-python.yml`
+  - `.github/workflows/lint-shell.yml`
+  - `.github/workflows/lint-json.yml`
+  - `.github/workflows/lint-yaml.yml`
+  - `.github/workflows/ci-repo-quality.yml` (selected)
+- Checks: black/isort check-only, py_compile, pytest (advisory), shellcheck/shfmt/bash -n, JSON/YAML parse checks.
+- Risk mapped: runtime and script regressions.
+
+### C) Container Reliability (advisory additions)
+- Workflow: `.github/workflows/lint-docker.yml`
+- Checks: hadolint (check-only), advisory docker build/smoke for maintained Dockerfiles via `.github/scripts/container-smoke.sh`.
+- Risk mapped: container build/runtime breakage.
+
+### D) GitOps/K8s Deployability (advisory)
+- Workflow: `.github/workflows/ci-gitops-deployability.yml`
+- Checks: `kustomize build --load-restrictor LoadRestrictionsNone` for all deploy overlays + `kubeconform` schema validation.
+- Risk mapped: Flux reconciliation failures from invalid manifests.
+
+### E) Nightly Advisory Security
+- Workflow: `.github/workflows/nightly-security-advisory.yml`
+- Checks: Trivy filesystem scan (HIGH/CRITICAL).
+- Risk mapped: security posture drift.
+
+## Advisory vs Future Blocking Milestones
+- M0 (this campaign): all newly introduced validations advisory.
+- M1: promote workflow integrity + repo-quality checks to blocking after stable baseline.
+- M2: promote container + GitOps checks to blocking after stable baseline.
+- M3: keep nightly security advisory unless explicitly promoted.
+
+## Agent Lane Ownership (File Level)
+- Coordinator: `.codex/CI_PLAN.md`, `README.md`, branch/PR/sync operations.
+- Agent A: `.github/workflows/ci-workflow-integrity.yml` (+ selection recommendation in chat).
+- Agent B: `.github/workflows/lint-python.yml`, `.github/workflows/lint-shell.yml`, `.github/workflows/ci-repo-quality.yml`, optional B helper scripts.
+- Agent C: `.github/workflows/lint-json.yml`, `.github/workflows/lint-yaml.yml`.
+- Agent D: `.github/workflows/lint-docker.yml`, `.github/scripts/container-smoke.sh`.
+- Agent E: `.github/workflows/ci-gitops-deployability.yml`, `.github/workflows/nightly-security-advisory.yml`.
+
+## Phased Rollout and Rollback
+Rollout:
+1. First commit creates this file.
+2. Add/convert workflows in lane-owned files only.
+3. Keep PR draft until baseline checks stabilize.
+4. Daily sync by merging `main` into `codex/ci`.
+
+Rollback:
+- Revert only unstable workflow files in small commits.
+- Keep advisory mode active during stabilization.
+
+## Reproducible Runbook (from clean main)
+1. `git fetch origin`
+2. `git switch main && git pull --ff-only origin main`
+3. `git switch -c codex/ci` (or `git switch codex/ci`)
+4. Commit #1: `.codex/CI_PLAN.md`
+5. Apply lane-scoped workflow changes
+6. `git push -u origin codex/ci`
+7. Open one draft PR `codex/ci -> main`
+8. Daily sync: `git fetch origin && git switch codex/ci && git merge --no-ff origin/main`
+
+## Constraint Challenge Protocol
+If a hard constraint appears to conflict with delivery, create an `EXCEPTION REQUEST` with:
+1) challenged constraint,
+2) concrete risk if unchanged,
+3) minimal exception,
+4) rollback path.
+Do not implement exception changes before explicit user approval.

From c76e026ee1cbd7b4db74686e059fbe4f857165da Mon Sep 17 00:00:00 2001
From: Dmitry Kondratyev <kondratyev.d.95@gmail.com>
Date: Tue, 3 Feb 2026 23:26:16 -0500
Subject: [PATCH 02/25] Add advisory CI integrity, quality, container, gitops,
 and security checks

---
 .github/scripts/container-smoke.sh            | 30 ++++++
 .github/workflows/ci-gitops-deployability.yml | 53 ++++++++++
 .github/workflows/ci-repo-quality.yml         | 71 ++++++++++++++
 .github/workflows/ci-workflow-integrity.yml   | 43 +++++++++
 .github/workflows/lint-docker.yml             | 96 +++++++++++++++++--
 .github/workflows/lint-json.yml               | 53 +++++-----
 .github/workflows/lint-python.yml             | 65 ++++++++-----
 .github/workflows/lint-shell.yml              | 62 +++++++-----
 .github/workflows/lint-yaml.yml               | 75 +++++++++------
 .../workflows/nightly-security-advisory.yml   | 27 ++++++
 README.md                                     | 11 +--
 11 files changed, 462 insertions(+), 124 deletions(-)
 create mode 100755 .github/scripts/container-smoke.sh
 create mode 100644 .github/workflows/ci-gitops-deployability.yml
 create mode 100644 .github/workflows/ci-repo-quality.yml
 create mode 100644 .github/workflows/ci-workflow-integrity.yml
 create mode 100644 .github/workflows/nightly-security-advisory.yml

diff --git a/.github/scripts/container-smoke.sh b/.github/scripts/container-smoke.sh
new file mode 100755
index 00000000..cf1bb967
--- /dev/null
+++ b/.github/scripts/container-smoke.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+if [ "$#" -ne 2 ]; then
+  echo "Usage: $0 <image> <profile>" >&2
+  exit 2
+fi
+
+image="$1"
+profile="$2"
+
+docker image inspect "$image" >/dev/null
+
+case "$profile" in
+  af-pod-monitor)
+    docker run --rm --entrypoint python "$image" -c "import prometheus_client"
+    ;;
+  interlink-slurm-plugin)
+    docker run --rm --entrypoint /bin/sh "$image" -lc 'test -x /sidecar/slurm-sidecar'
+    ;;
+  purdue-af)
+    docker run --rm --entrypoint /bin/bash "$image" -lc 'python --version && jupyter --version >/dev/null'
+    ;;
+  *)
+    echo "Unknown profile: $profile" >&2
+    exit 2
+    ;;
+esac
+
+echo "Smoke checks passed for profile: $profile"
diff --git a/.github/workflows/ci-gitops-deployability.yml b/.github/workflows/ci-gitops-deployability.yml
new file mode 100644
index 00000000..c8f70e70
--- /dev/null
+++ b/.github/workflows/ci-gitops-deployability.yml
@@ -0,0 +1,53 @@
+name: CI GitOps Deployability
+
+on:
+  push:
+  pull_request:
+
+permissions:
+  contents: read
+
+jobs:
+  gitops-validate:
+    runs-on: ubuntu-latest
+    continue-on-error: true
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up kustomize
+        uses: imranismail/setup-kustomize@v2
+        with:
+          kustomize-version: '5.4.2'
+
+      - name: Install kubeconform
+        run: |
+          set -euo pipefail
+          KUBECONFORM_VERSION=0.6.7
+          curl -fsSL "https://github.com/yannh/kubeconform/releases/download/v${KUBECONFORM_VERSION}/kubeconform-linux-amd64.tar.gz" -o /tmp/kubeconform.tar.gz
+          tar -xzf /tmp/kubeconform.tar.gz -C /tmp kubeconform
+          chmod +x /tmp/kubeconform
+          sudo mv /tmp/kubeconform /usr/local/bin/kubeconform
+
+      - name: Render overlays with kustomize (advisory)
+        run: |
+          set -euo pipefail
+          overlays=(
+            deploy/core-production
+            deploy/core-staging
+            deploy/core-geddes2
+            deploy/experimental
+          )
+
+          for overlay in "${overlays[@]}"; do
+            out="/tmp/$(echo "$overlay" | tr '/' '_').yaml"
+            echo "Rendering $overlay -> $out"
+            kustomize build --load-restrictor LoadRestrictionsNone "$overlay" > "$out"
+          done
+
+      - name: Validate rendered manifests with kubeconform (advisory)
+        run: |
+          set -euo pipefail
+          for rendered in /tmp/deploy_core-production.yaml /tmp/deploy_core-staging.yaml /tmp/deploy_core-geddes2.yaml /tmp/deploy_experimental.yaml; do
+            echo "Validating $rendered"
+            kubeconform -summary -strict -ignore-missing-schemas "$rendered"
+          done
diff --git a/.github/workflows/ci-repo-quality.yml b/.github/workflows/ci-repo-quality.yml
new file mode 100644
index 00000000..9a08b612
--- /dev/null
+++ b/.github/workflows/ci-repo-quality.yml
@@ -0,0 +1,71 @@
+name: CI Repo Quality
+
+on:
+  push:
+  pull_request:
+
+permissions:
+  contents: read
+
+jobs:
+  repo-quality:
+    runs-on: ubuntu-latest
+    continue-on-error: true
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Install quality tooling
+        run: |
+          set -euo pipefail
+          python -m pip install --upgrade pip
+          pip install pytest
+
+      - name: Python syntax smoke (advisory)
+        shell: bash
+        run: |
+          set -euo pipefail
+          mapfile -t py_files < <(find . -type f -name '*.py' \
+            -not -path './docker/dask-gateway-server/*' \
+            -not -path './docker/kaniko-build-jobs/*' \
+            -not -path './docs/*' \
+            -not -path './slurm/*' \
+            -not -path './.cursor/*' \
+            -not -path './.git/*' | sort)
+
+          if [ "${#py_files[@]}" -gt 0 ]; then
+            python -m py_compile "${py_files[@]}"
+          fi
+
+      - name: Shell syntax smoke (advisory)
+        shell: bash
+        run: |
+          set -euo pipefail
+          mapfile -t sh_files < <(find . -type f \
+            \( -name '*.sh' -o -name 'pixi-wrapper' -o -name 'fix-permissions' \) \
+            -not -path './docker/dask-gateway-server/*' \
+            -not -path './docker/kaniko-build-jobs/*' \
+            -not -path './docs/*' \
+            -not -path './slurm/*' \
+            -not -path './.cursor/*' \
+            -not -path './.git/*' | sort)
+
+          for f in "${sh_files[@]}"; do
+            bash -n "$f"
+          done
+
+      - name: Run pytest (advisory)
+        shell: bash
+        run: |
+          set +e
+          pytest -q
+          rc=$?
+          if [ "$rc" -eq 5 ]; then
+            echo 'pytest collected no tests; treating as informational.'
+            exit 0
+          fi
+          exit "$rc"
diff --git a/.github/workflows/ci-workflow-integrity.yml b/.github/workflows/ci-workflow-integrity.yml
new file mode 100644
index 00000000..035578ea
--- /dev/null
+++ b/.github/workflows/ci-workflow-integrity.yml
@@ -0,0 +1,43 @@
+name: CI Workflow Integrity
+
+on:
+  push:
+  pull_request:
+
+permissions:
+  contents: read
+
+jobs:
+  workflow-integrity:
+    runs-on: ubuntu-latest
+    continue-on-error: true
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install actionlint
+        run: |
+          set -euo pipefail
+          bash <(curl -sSfL https://raw.githubusercontent.com/rhysd/actionlint/main/scripts/download-actionlint.bash) 1.7.4
+          echo "$PWD/bin" >> "$GITHUB_PATH"
+
+      - name: Run actionlint (advisory)
+        run: actionlint -color
+
+      - name: Validate workflow YAML parse (advisory)
+        run: |
+          set -euo pipefail
+          python3 -m pip install --upgrade pip
+          python3 -m pip install pyyaml
+          python3 - <<'PY'
+          from pathlib import Path
+          import yaml
+
+          workflows = sorted(Path('.github/workflows').glob('*.y*ml'))
+          if not workflows:
+              raise SystemExit('No workflow files found in .github/workflows')
+
+          for wf in workflows:
+              with wf.open('r', encoding='utf-8') as f:
+                  list(yaml.safe_load_all(f))
+          print(f'Parsed {len(workflows)} workflow file(s).')
+          PY
diff --git a/.github/workflows/lint-docker.yml b/.github/workflows/lint-docker.yml
index f0733fdc..fce34cbc 100644
--- a/.github/workflows/lint-docker.yml
+++ b/.github/workflows/lint-docker.yml
@@ -1,22 +1,98 @@
-name: Lint Dockerfiles
+name: Container Reliability
 
 on:
   push:
-    paths:
-      - '**/Dockerfile*'
   pull_request:
-    paths:
-      - '**/Dockerfile*'
+
+permissions:
+  contents: read
 
 jobs:
-  lint:
+  detect-docker-changes:
     runs-on: ubuntu-latest
+    outputs:
+      af_pod_monitor: ${{ steps.filter.outputs.af_pod_monitor }}
+      interlink_slurm_plugin: ${{ steps.filter.outputs.interlink_slurm_plugin }}
+      purdue_af: ${{ steps.filter.outputs.purdue_af }}
     steps:
       - uses: actions/checkout@v4
+
+      - uses: dorny/paths-filter@v3
+        id: filter
+        with:
+          filters: |
+            af_pod_monitor:
+              - 'docker/af-pod-monitor/**'
+              - '.github/workflows/lint-docker.yml'
+              - '.github/scripts/container-smoke.sh'
+            interlink_slurm_plugin:
+              - 'docker/interlink-slurm-plugin/**'
+              - '.github/workflows/lint-docker.yml'
+              - '.github/scripts/container-smoke.sh'
+            purdue_af:
+              - 'docker/purdue-af/**'
+              - '.github/workflows/lint-docker.yml'
+              - '.github/scripts/container-smoke.sh'
+
+  lint-dockerfiles:
+    runs-on: ubuntu-latest
+    continue-on-error: true
+    steps:
+      - uses: actions/checkout@v4
+
       - name: Install hadolint
         run: |
-          sudo wget -O /bin/hadolint https://github.com/hadolint/hadolint/releases/latest/download/hadolint-$(uname -s)-$(uname -m)
-          sudo chmod +x /bin/hadolint
-      - name: Run hadolint
+          set -euo pipefail
+          HADOLINT_VERSION=v2.12.0
+          curl -fsSL "https://github.com/hadolint/hadolint/releases/download/${HADOLINT_VERSION}/hadolint-Linux-x86_64" -o /tmp/hadolint
+          chmod +x /tmp/hadolint
+          sudo mv /tmp/hadolint /usr/local/bin/hadolint
+
+      - name: Run hadolint (check-only, advisory)
         run: |
-          find . -type f -iname 'Dockerfile*' -not -path './docker/dask-gateway-server/*' -exec hadolint --ignore DL3041 --ignore DL3033 --failure-threshold warning {} + 
\ No newline at end of file
+          set -euo pipefail
+          hadolint --ignore DL3041 --ignore DL3033 --failure-threshold warning docker/af-pod-monitor/Dockerfile
+          hadolint --ignore DL3041 --ignore DL3033 --failure-threshold warning docker/interlink-slurm-plugin/Dockerfile.alma8
+          hadolint --ignore DL3041 --ignore DL3033 --failure-threshold warning docker/purdue-af/Dockerfile
+
+  build-af-pod-monitor:
+    needs: detect-docker-changes
+    if: needs.detect-docker-changes.outputs.af_pod_monitor == 'true'
+    runs-on: ubuntu-latest
+    continue-on-error: true
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Build af-pod-monitor image (advisory)
+        run: docker build -f docker/af-pod-monitor/Dockerfile -t local/af-pod-monitor:${{ github.sha }} .
+
+      - name: Smoke test af-pod-monitor image (advisory)
+        run: .github/scripts/container-smoke.sh local/af-pod-monitor:${{ github.sha }} af-pod-monitor
+
+  build-interlink-slurm-plugin:
+    needs: detect-docker-changes
+    if: needs.detect-docker-changes.outputs.interlink_slurm_plugin == 'true'
+    runs-on: ubuntu-latest
+    continue-on-error: true
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Build interlink-slurm-plugin image (advisory)
+        run: docker build -f docker/interlink-slurm-plugin/Dockerfile.alma8 -t local/interlink-slurm-plugin:${{ github.sha }} .
+
+      - name: Smoke test interlink-slurm-plugin image (advisory)
+        run: .github/scripts/container-smoke.sh local/interlink-slurm-plugin:${{ github.sha }} interlink-slurm-plugin
+
+  build-purdue-af:
+    needs: detect-docker-changes
+    if: needs.detect-docker-changes.outputs.purdue_af == 'true'
+    runs-on: ubuntu-latest
+    continue-on-error: true
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Build purdue-af image (advisory)
+        run: docker build -f docker/purdue-af/Dockerfile -t local/purdue-af:${{ github.sha }} .
+
+      - name: Smoke test purdue-af image (advisory)
+        run: .github/scripts/container-smoke.sh local/purdue-af:${{ github.sha }} purdue-af
diff --git a/.github/workflows/lint-json.yml b/.github/workflows/lint-json.yml
index 84c5e6b5..fcfe8bfb 100644
--- a/.github/workflows/lint-json.yml
+++ b/.github/workflows/lint-json.yml
@@ -2,40 +2,37 @@ name: Lint JSON
 
 on:
   push:
-    paths:
-      - '**.json'
   pull_request:
-    paths:
-      - '**.json'
 
 permissions:
-  contents: write
-  pull-requests: write
+  contents: read
 
 jobs:
-  lint:
+  json-parse:
     runs-on: ubuntu-latest
+    continue-on-error: true
     steps:
       - uses: actions/checkout@v4
-        with:
-          token: ${{ secrets.GITHUB_TOKEN }}
-      - name: Set up Node.js
-        uses: actions/setup-node@v4
-        with:
-          node-version: '20'
-      - name: Install prettier
-        run: npm install -g prettier
-      - name: Format JSON files
-        run: |
-          find . -name "*.json" | grep -v "docker/dask-gateway-server" | xargs prettier --write
-      - name: Commit changes
-        if: ${{ github.event_name == 'pull_request' || github.ref_type == 'branch' }}
+
+      - name: Validate JSON files (check-only, advisory)
+        shell: bash
         run: |
-          git config --local user.email "action@github.com"
-          git config --local user.name "GitHub Action"
-          REF_NAME="${GITHUB_HEAD_REF:-${GITHUB_REF_NAME}}"
-          git fetch origin "$REF_NAME"
-          git checkout -B "$REF_NAME" "origin/$REF_NAME" || git checkout -B "$REF_NAME"
-          git add -A
-          git diff --quiet && git diff --staged --quiet || git commit -m "Apply prettier JSON formatting"
-          git push origin "$REF_NAME"
\ No newline at end of file
+          set -euo pipefail
+          mapfile -t files < <(find . -type f -name '*.json' \
+            -not -path './docker/dask-gateway-server/*' \
+            -not -path './docker/kaniko-build-jobs/*' \
+            -not -path './docs/*' \
+            -not -path './slurm/*' \
+            -not -path './.cursor/*' \
+            -not -path './.git/*' | sort)
+
+          if [ "${#files[@]}" -eq 0 ]; then
+            echo 'No in-scope JSON files found.'
+            exit 0
+          fi
+
+          for f in "${files[@]}"; do
+            python3 -m json.tool "$f" >/dev/null
+          done
+
+          echo "Validated ${#files[@]} JSON file(s)."
diff --git a/.github/workflows/lint-python.yml b/.github/workflows/lint-python.yml
index 32e273ec..f9f16381 100644
--- a/.github/workflows/lint-python.yml
+++ b/.github/workflows/lint-python.yml
@@ -2,43 +2,58 @@ name: Lint Python
 
 on:
   push:
-    paths:
-      - '**.py'
   pull_request:
-    paths:
-      - '**.py'
 
 permissions:
-  contents: write
-  pull-requests: write
+  contents: read
 
 jobs:
-  lint:
+  lint-python:
     runs-on: ubuntu-latest
+    continue-on-error: true
     steps:
       - uses: actions/checkout@v4
-        with:
-          token: ${{ secrets.GITHUB_TOKEN }}
+
       - name: Set up Python
         uses: actions/setup-python@v5
         with:
           python-version: '3.11'
-      - name: Install dependencies
+
+      - name: Install tooling
         run: |
+          set -euo pipefail
           python -m pip install --upgrade pip
-          pip install black isort
-      - name: Run black
-        run: black . --exclude docker/dask-gateway-server
-      - name: Run isort
-        run: isort . --skip docker/dask-gateway-server
-      - name: Commit changes
-        if: ${{ github.event_name == 'pull_request' || github.ref_type == 'branch' }}
+          pip install black isort pytest
+
+      - name: Run black/isort/py_compile (check-only, advisory)
+        shell: bash
+        run: |
+          set -euo pipefail
+          mapfile -t files < <(find . -type f -name '*.py' \
+            -not -path './docker/dask-gateway-server/*' \
+            -not -path './docker/kaniko-build-jobs/*' \
+            -not -path './docs/*' \
+            -not -path './slurm/*' \
+            -not -path './.cursor/*' \
+            -not -path './.git/*' | sort)
+
+          if [ "${#files[@]}" -eq 0 ]; then
+            echo 'No in-scope Python files found.'
+            exit 0
+          fi
+
+          black --check "${files[@]}"
+          isort --check-only "${files[@]}"
+          python -m py_compile "${files[@]}"
+
+      - name: Run pytest (advisory)
+        shell: bash
         run: |
-          git config --local user.email "action@github.com"
-          git config --local user.name "GitHub Action"
-          REF_NAME="${GITHUB_HEAD_REF:-${GITHUB_REF_NAME}}"
-          git fetch origin "$REF_NAME"
-          git checkout -B "$REF_NAME" "origin/$REF_NAME" || git checkout -B "$REF_NAME"
-          git add -A
-          git diff --quiet && git diff --staged --quiet || git commit -m "Apply black and isort formatting"
-          git push origin "$REF_NAME"
\ No newline at end of file
+          set +e
+          pytest -q
+          rc=$?
+          if [ "$rc" -eq 5 ]; then
+            echo 'pytest collected no tests; treating as informational.'
+            exit 0
+          fi
+          exit "$rc"
diff --git a/.github/workflows/lint-shell.yml b/.github/workflows/lint-shell.yml
index c3d65b08..04a61734 100644
--- a/.github/workflows/lint-shell.yml
+++ b/.github/workflows/lint-shell.yml
@@ -2,39 +2,49 @@ name: Lint Shell Scripts
 
 on:
   push:
-    paths:
-      - '**.sh'
   pull_request:
-    paths:
-      - '**.sh'
 
 permissions:
-  contents: write
-  pull-requests: write
+  contents: read
 
 jobs:
-  lint:
+  lint-shell:
     runs-on: ubuntu-latest
+    continue-on-error: true
     steps:
       - uses: actions/checkout@v4
-        with:
-          token: ${{ secrets.GITHUB_TOKEN }}
-      - name: Install shfmt
-        run: |
-          curl -L https://github.com/mvdan/sh/releases/download/v3.7.0/shfmt_v3.7.0_linux_amd64 -o shfmt
-          chmod +x shfmt
-          sudo mv shfmt /usr/local/bin/
-      - name: Format shell scripts
+
+      - name: Install shell tooling
         run: |
-          find . -type f -name '*.sh' -not -path './docker/dask-gateway-server/*' -exec shfmt -w {} +
-      - name: Commit changes
-        if: ${{ github.event_name == 'pull_request' || github.ref_type == 'branch' }}
+          set -euo pipefail
+          sudo apt-get update
+          sudo apt-get install -y shellcheck
+          SHFMT_VERSION=3.10.0
+          curl -fsSL "https://github.com/mvdan/sh/releases/download/v${SHFMT_VERSION}/shfmt_v${SHFMT_VERSION}_linux_amd64" -o /tmp/shfmt
+          chmod +x /tmp/shfmt
+          sudo mv /tmp/shfmt /usr/local/bin/shfmt
+
+      - name: Run shellcheck/shfmt/bash -n (check-only, advisory)
+        shell: bash
         run: |
-          git config --local user.email "action@github.com"
-          git config --local user.name "GitHub Action"
-          REF_NAME="${GITHUB_HEAD_REF:-${GITHUB_REF_NAME}}"
-          git fetch origin "$REF_NAME"
-          git checkout -B "$REF_NAME" "origin/$REF_NAME" || git checkout -B "$REF_NAME"
-          git add -A
-          git diff --quiet && git diff --staged --quiet || git commit -m "Apply shfmt shell script formatting"
-          git push origin "$REF_NAME"
\ No newline at end of file
+          set -euo pipefail
+          mapfile -t files < <(find . -type f \
+            \( -name '*.sh' -o -name 'pixi-wrapper' -o -name 'fix-permissions' \) \
+            -not -path './docker/dask-gateway-server/*' \
+            -not -path './docker/kaniko-build-jobs/*' \
+            -not -path './docs/*' \
+            -not -path './slurm/*' \
+            -not -path './.cursor/*' \
+            -not -path './.git/*' | sort)
+
+          if [ "${#files[@]}" -eq 0 ]; then
+            echo 'No in-scope shell files found.'
+            exit 0
+          fi
+
+          shellcheck "${files[@]}"
+          shfmt -d "${files[@]}"
+
+          for f in "${files[@]}"; do
+            bash -n "$f"
+          done
diff --git a/.github/workflows/lint-yaml.yml b/.github/workflows/lint-yaml.yml
index 532884f1..e7cb7e92 100644
--- a/.github/workflows/lint-yaml.yml
+++ b/.github/workflows/lint-yaml.yml
@@ -2,42 +2,59 @@ name: Lint YAML
 
 on:
   push:
-    paths:
-      - '**.yml'
-      - '**.yaml'
   pull_request:
-    paths:
-      - '**.yml'
-      - '**.yaml'
 
 permissions:
-  contents: write
-  pull-requests: write
+  contents: read
 
 jobs:
-  lint:
+  yaml-parse:
     runs-on: ubuntu-latest
+    continue-on-error: true
     steps:
       - uses: actions/checkout@v4
-        with:
-          token: ${{ secrets.GITHUB_TOKEN }}
-      - name: Set up Node.js
-        uses: actions/setup-node@v4
-        with:
-          node-version: '20'
-      - name: Install prettier
-        run: npm install -g prettier
-      - name: Format YAML files
+
+      - name: Install parser dependency
         run: |
-          find . -name "*.yml" -o -name "*.yaml" | grep -v "docker/dask-gateway-server" | grep -v "templates" | grep -v ".github/workflows" | xargs prettier --write
-      - name: Commit changes
-        if: ${{ github.event_name == 'pull_request' || github.ref_type == 'branch' }}
+          set -euo pipefail
+          python3 -m pip install --upgrade pip
+          python3 -m pip install pyyaml
+
+      - name: Validate YAML files (check-only, advisory)
+        shell: bash
         run: |
-          git config --local user.email "action@github.com"
-          git config --local user.name "GitHub Action"
-          REF_NAME="${GITHUB_HEAD_REF:-${GITHUB_REF_NAME}}"
-          git fetch origin "$REF_NAME"
-          git checkout -B "$REF_NAME" "origin/$REF_NAME" || git checkout -B "$REF_NAME"
-          git add -A
-          git diff --quiet && git diff --staged --quiet || git commit -m "Apply prettier YAML formatting"
-          git push origin "$REF_NAME"
\ No newline at end of file
+          set -euo pipefail
+          python3 - <<'PY'
+          from pathlib import Path
+          import yaml
+
+          files = sorted(
+              p for p in Path('.').rglob('*')
+              if p.is_file() and p.suffix in {'.yml', '.yaml'}
+          )
+
+          excluded_prefixes = (
+              Path('docker/dask-gateway-server'),
+              Path('docker/kaniko-build-jobs'),
+              Path('docs'),
+              Path('slurm'),
+              Path('.cursor'),
+              Path('.git'),
+          )
+
+          filtered = []
+          for p in files:
+              if any(str(p).startswith(str(prefix) + '/') or p == prefix for prefix in excluded_prefixes):
+                  continue
+              filtered.append(p)
+
+          if not filtered:
+              print('No in-scope YAML files found.')
+              raise SystemExit(0)
+
+          for p in filtered:
+              with p.open('r', encoding='utf-8') as f:
+                  list(yaml.safe_load_all(f))
+
+          print(f'Validated {len(filtered)} YAML file(s).')
+          PY
diff --git a/.github/workflows/nightly-security-advisory.yml b/.github/workflows/nightly-security-advisory.yml
new file mode 100644
index 00000000..3204dc2b
--- /dev/null
+++ b/.github/workflows/nightly-security-advisory.yml
@@ -0,0 +1,27 @@
+name: Nightly Security Advisory
+
+on:
+  schedule:
+    - cron: '17 5 * * *'
+  workflow_dispatch:
+
+permissions:
+  contents: read
+
+jobs:
+  trivy-filesystem:
+    runs-on: ubuntu-latest
+    continue-on-error: true
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Run Trivy filesystem scan (advisory)
+        uses: aquasecurity/trivy-action@0.33.1
+        with:
+          scan-type: fs
+          scan-ref: .
+          severity: HIGH,CRITICAL
+          ignore-unfixed: true
+          exit-code: '1'
+          format: table
+          skip-dirs: docker/dask-gateway-server,docs,docs/source/demos,docker/kaniko-build-jobs,slurm,.cursor
diff --git a/README.md b/README.md
index 0faaac5d..3e7ec8f2 100644
--- a/README.md
+++ b/README.md
@@ -18,9 +18,8 @@ Each user is provided with a 25GB home directory at first login. These directori
 
 [![Documentation Status](https://readthedocs.org/projects/purdue-af/badge/?version=latest)](https://purdue-af.readthedocs.io/en/latest/?badge=latest)
 
-[![Python](https://github.com/PurdueAF/purdue-af/workflows/Lint%20Python/badge.svg)](https://github.com/PurdueAF/purdue-af/actions?query=workflow%3A%22Lint+Python%22)
-[![YAML](https://github.com/PurdueAF/purdue-af/workflows/Lint%20YAML/badge.svg)](https://github.com/PurdueAF/purdue-af/actions?query=workflow%3A%22Lint+YAML%22)
-[![JSON](https://github.com/PurdueAF/purdue-af/workflows/Lint%20JSON/badge.svg)](https://github.com/PurdueAF/purdue-af/actions?query=workflow%3A%22Lint+JSON%22)
-[![Shell](https://github.com/PurdueAF/purdue-af/workflows/Lint%20Shell%20Scripts/badge.svg)](https://github.com/PurdueAF/purdue-af/actions?query=workflow%3A%22Lint+Shell+Scripts%22)
-[![Docker](https://github.com/PurdueAF/purdue-af/workflows/Lint%20Dockerfiles/badge.svg)](https://github.com/PurdueAF/purdue-af/actions?query=workflow%3A%22Lint+Dockerfiles%22)
-
+[![Workflow Integrity](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-workflow-integrity.yml/badge.svg)](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-workflow-integrity.yml)
+[![Repo Quality](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-repo-quality.yml/badge.svg)](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-repo-quality.yml)
+[![Container Reliability](https://github.com/PurdueAF/purdue-af/actions/workflows/lint-docker.yml/badge.svg)](https://github.com/PurdueAF/purdue-af/actions/workflows/lint-docker.yml)
+[![GitOps Deployability](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-gitops-deployability.yml/badge.svg)](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-gitops-deployability.yml)
+[![Nightly Security](https://github.com/PurdueAF/purdue-af/actions/workflows/nightly-security-advisory.yml/badge.svg)](https://github.com/PurdueAF/purdue-af/actions/workflows/nightly-security-advisory.yml)

From 7049587766841272971a0b226b5226e54da7a648 Mon Sep 17 00:00:00 2001
From: Dmitry Kondratyev <kondratyev.d.95@gmail.com>
Date: Wed, 4 Feb 2026 08:59:16 -0500
Subject: [PATCH 03/25] Run CI checks on pull requests only

---
 .codex/CI_PLAN.md                             | 2 +-
 .github/workflows/ci-gitops-deployability.yml | 1 -
 .github/workflows/ci-repo-quality.yml         | 1 -
 .github/workflows/ci-workflow-integrity.yml   | 1 -
 .github/workflows/lint-docker.yml             | 1 -
 .github/workflows/lint-json.yml               | 1 -
 .github/workflows/lint-python.yml             | 1 -
 .github/workflows/lint-shell.yml              | 1 -
 .github/workflows/lint-yaml.yml               | 1 -
 9 files changed, 1 insertion(+), 9 deletions(-)

diff --git a/.codex/CI_PLAN.md b/.codex/CI_PLAN.md
index 76463f3f..3f1d2790 100644
--- a/.codex/CI_PLAN.md
+++ b/.codex/CI_PLAN.md
@@ -8,7 +8,7 @@ Deliver exactly one draft PR from `codex/ci` to `main` with minimal CI/CD harden
 - preserves safe daily branch sync (`main` merged into `codex/ci`, no force-push).
 
 Success means:
-- `.github/workflows/lint-*.yml` workflows are check-only and run on every push + pull_request,
+- `.github/workflows/lint-*.yml` workflows are check-only and run on `pull_request` (single run per change),
 - new workflows exist for integrity, GitOps deployability, and nightly advisory security,
 - optional repo-quality workflow is selected and included,
 - README shows A-E category badges,
diff --git a/.github/workflows/ci-gitops-deployability.yml b/.github/workflows/ci-gitops-deployability.yml
index c8f70e70..0942e2ab 100644
--- a/.github/workflows/ci-gitops-deployability.yml
+++ b/.github/workflows/ci-gitops-deployability.yml
@@ -1,7 +1,6 @@
 name: CI GitOps Deployability
 
 on:
-  push:
   pull_request:
 
 permissions:
diff --git a/.github/workflows/ci-repo-quality.yml b/.github/workflows/ci-repo-quality.yml
index 9a08b612..247108bd 100644
--- a/.github/workflows/ci-repo-quality.yml
+++ b/.github/workflows/ci-repo-quality.yml
@@ -1,7 +1,6 @@
 name: CI Repo Quality
 
 on:
-  push:
   pull_request:
 
 permissions:
diff --git a/.github/workflows/ci-workflow-integrity.yml b/.github/workflows/ci-workflow-integrity.yml
index 035578ea..b364d826 100644
--- a/.github/workflows/ci-workflow-integrity.yml
+++ b/.github/workflows/ci-workflow-integrity.yml
@@ -1,7 +1,6 @@
 name: CI Workflow Integrity
 
 on:
-  push:
   pull_request:
 
 permissions:
diff --git a/.github/workflows/lint-docker.yml b/.github/workflows/lint-docker.yml
index fce34cbc..8ce41627 100644
--- a/.github/workflows/lint-docker.yml
+++ b/.github/workflows/lint-docker.yml
@@ -1,7 +1,6 @@
 name: Container Reliability
 
 on:
-  push:
   pull_request:
 
 permissions:
diff --git a/.github/workflows/lint-json.yml b/.github/workflows/lint-json.yml
index fcfe8bfb..d223cfe7 100644
--- a/.github/workflows/lint-json.yml
+++ b/.github/workflows/lint-json.yml
@@ -1,7 +1,6 @@
 name: Lint JSON
 
 on:
-  push:
   pull_request:
 
 permissions:
diff --git a/.github/workflows/lint-python.yml b/.github/workflows/lint-python.yml
index f9f16381..0f4d8686 100644
--- a/.github/workflows/lint-python.yml
+++ b/.github/workflows/lint-python.yml
@@ -1,7 +1,6 @@
 name: Lint Python
 
 on:
-  push:
   pull_request:
 
 permissions:
diff --git a/.github/workflows/lint-shell.yml b/.github/workflows/lint-shell.yml
index 04a61734..7c099240 100644
--- a/.github/workflows/lint-shell.yml
+++ b/.github/workflows/lint-shell.yml
@@ -1,7 +1,6 @@
 name: Lint Shell Scripts
 
 on:
-  push:
   pull_request:
 
 permissions:
diff --git a/.github/workflows/lint-yaml.yml b/.github/workflows/lint-yaml.yml
index e7cb7e92..6158c887 100644
--- a/.github/workflows/lint-yaml.yml
+++ b/.github/workflows/lint-yaml.yml
@@ -1,7 +1,6 @@
 name: Lint YAML
 
 on:
-  push:
   pull_request:
 
 permissions:

From 73d21697e10eaab8d6362691d4021669985e346c Mon Sep 17 00:00:00 2001
From: Dmitry Kondratyev <kondratyev.d.95@gmail.com>
Date: Wed, 4 Feb 2026 09:30:22 -0500
Subject: [PATCH 04/25] Fix CI baseline failures and docker filter dependency
 trigger

---
 .codex/CI_PLAN.md                             |  3 +++
 .github/workflows/ci-gitops-deployability.yml |  2 +-
 .github/workflows/ci-workflow-integrity.yml   | 10 +++-------
 .github/workflows/lint-docker.yml             |  4 +++-
 .github/workflows/lint-shell.yml              |  2 +-
 docker/purdue-af/genaistudio/genaistudio.py   |  7 ++++---
 docker/purdue-af/scripts/eos-connect.sh       |  2 +-
 7 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/.codex/CI_PLAN.md b/.codex/CI_PLAN.md
index 3f1d2790..63e5d390 100644
--- a/.codex/CI_PLAN.md
+++ b/.codex/CI_PLAN.md
@@ -31,6 +31,9 @@ Out of scope:
 - `slurm/**`
 - `.cursor/**`
 
+Approved exception:
+- `slurm/**` is used as a dependency-only trigger for container reliability jobs because maintained Dockerfiles copy `slurm/` artifacts.
+
 ## Target Check Architecture
 ### A) CI System Integrity (advisory)
 - Workflow: `.github/workflows/ci-workflow-integrity.yml`
diff --git a/.github/workflows/ci-gitops-deployability.yml b/.github/workflows/ci-gitops-deployability.yml
index 0942e2ab..f3c5ae05 100644
--- a/.github/workflows/ci-gitops-deployability.yml
+++ b/.github/workflows/ci-gitops-deployability.yml
@@ -48,5 +48,5 @@ jobs:
           set -euo pipefail
           for rendered in /tmp/deploy_core-production.yaml /tmp/deploy_core-staging.yaml /tmp/deploy_core-geddes2.yaml /tmp/deploy_experimental.yaml; do
             echo "Validating $rendered"
-            kubeconform -summary -strict -ignore-missing-schemas "$rendered"
+            kubeconform -summary -strict -ignore-missing-schemas -skip Secret "$rendered"
           done
diff --git a/.github/workflows/ci-workflow-integrity.yml b/.github/workflows/ci-workflow-integrity.yml
index b364d826..8bb39d1c 100644
--- a/.github/workflows/ci-workflow-integrity.yml
+++ b/.github/workflows/ci-workflow-integrity.yml
@@ -13,14 +13,10 @@ jobs:
     steps:
       - uses: actions/checkout@v4
 
-      - name: Install actionlint
-        run: |
-          set -euo pipefail
-          bash <(curl -sSfL https://raw.githubusercontent.com/rhysd/actionlint/main/scripts/download-actionlint.bash) 1.7.4
-          echo "$PWD/bin" >> "$GITHUB_PATH"
-
       - name: Run actionlint (advisory)
-        run: actionlint -color
+        uses: rhysd/actionlint@v1
+        with:
+          args: -color
 
       - name: Validate workflow YAML parse (advisory)
         run: |
diff --git a/.github/workflows/lint-docker.yml b/.github/workflows/lint-docker.yml
index 8ce41627..ac83bec6 100644
--- a/.github/workflows/lint-docker.yml
+++ b/.github/workflows/lint-docker.yml
@@ -26,10 +26,12 @@ jobs:
               - '.github/scripts/container-smoke.sh'
             interlink_slurm_plugin:
               - 'docker/interlink-slurm-plugin/**'
+              - 'slurm/**'
               - '.github/workflows/lint-docker.yml'
               - '.github/scripts/container-smoke.sh'
             purdue_af:
               - 'docker/purdue-af/**'
+              - 'slurm/**'
               - '.github/workflows/lint-docker.yml'
               - '.github/scripts/container-smoke.sh'
 
@@ -63,7 +65,7 @@ jobs:
       - uses: actions/checkout@v4
 
       - name: Build af-pod-monitor image (advisory)
-        run: docker build -f docker/af-pod-monitor/Dockerfile -t local/af-pod-monitor:${{ github.sha }} .
+        run: docker build -f docker/af-pod-monitor/Dockerfile -t local/af-pod-monitor:${{ github.sha }} docker/af-pod-monitor
 
       - name: Smoke test af-pod-monitor image (advisory)
         run: .github/scripts/container-smoke.sh local/af-pod-monitor:${{ github.sha }} af-pod-monitor
diff --git a/.github/workflows/lint-shell.yml b/.github/workflows/lint-shell.yml
index 7c099240..edcac463 100644
--- a/.github/workflows/lint-shell.yml
+++ b/.github/workflows/lint-shell.yml
@@ -41,7 +41,7 @@ jobs:
             exit 0
           fi
 
-          shellcheck "${files[@]}"
+          shellcheck -S error "${files[@]}"
           shfmt -d "${files[@]}"
 
           for f in "${files[@]}"; do
diff --git a/docker/purdue-af/genaistudio/genaistudio.py b/docker/purdue-af/genaistudio/genaistudio.py
index 965a65a7..cbc768fe 100644
--- a/docker/purdue-af/genaistudio/genaistudio.py
+++ b/docker/purdue-af/genaistudio/genaistudio.py
@@ -1,5 +1,4 @@
-from jupyter_ai_magics.providers import (BaseProvider, EnvAuthStrategy,
-                                         TextField)
+from jupyter_ai_magics.providers import BaseProvider, EnvAuthStrategy, TextField
 from langchain_openai import ChatOpenAI
 
 
@@ -20,7 +19,9 @@ class PurdueGenAIStudioProvider(BaseProvider, ChatOpenAI):
     )
 
     def __init__(self, **kwargs):
-        super().__init__(openai_api_base="https://genai.rcac.purdue.edu/api", **kwargs)
+        super().__init__(
+            openai_api_base="https://genai.rcac.purdue.edu/api", **kwargs
+        )
 
     @classmethod
     def is_api_key_exc(cls, e: Exception):
diff --git a/docker/purdue-af/scripts/eos-connect.sh b/docker/purdue-af/scripts/eos-connect.sh
index 055e8984..33d51ec8 100644
--- a/docker/purdue-af/scripts/eos-connect.sh
+++ b/docker/purdue-af/scripts/eos-connect.sh
@@ -31,7 +31,7 @@ if [[ $krb_ticket = "" ]]; then
 			echo " > Kerberos authentication failed!"
 			echo ""
 			return 1
-			else:
+		else
 			echo " > Kerberos authentication complete!"
 			echo ""
 		fi

From 1b2c2ba30672c424c6796828e7207d4469c827a4 Mon Sep 17 00:00:00 2001
From: Dmitry Kondratyev <kondratyev.d.95@gmail.com>
Date: Wed, 4 Feb 2026 09:38:58 -0500
Subject: [PATCH 05/25] Fix remaining CI failures in integrity, shell, python,
 and af-pod-monitor

---
 .github/workflows/ci-workflow-integrity.yml | 11 ++++++++---
 .github/workflows/lint-shell.yml            |  2 +-
 docker/af-pod-monitor/Dockerfile            |  4 ++--
 docker/purdue-af/genaistudio/genaistudio.py |  4 +---
 4 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/ci-workflow-integrity.yml b/.github/workflows/ci-workflow-integrity.yml
index 8bb39d1c..deacfa94 100644
--- a/.github/workflows/ci-workflow-integrity.yml
+++ b/.github/workflows/ci-workflow-integrity.yml
@@ -13,10 +13,15 @@ jobs:
     steps:
       - uses: actions/checkout@v4
 
+      - name: Install actionlint
+        run: |
+          set -euo pipefail
+          mkdir -p "$HOME/.local/bin"
+          bash <(curl -sSfL https://raw.githubusercontent.com/rhysd/actionlint/main/scripts/download-actionlint.bash) 1.7.4 "$HOME/.local/bin"
+          echo "$HOME/.local/bin" >> "$GITHUB_PATH"
+
       - name: Run actionlint (advisory)
-        uses: rhysd/actionlint@v1
-        with:
-          args: -color
+        run: actionlint -color
 
       - name: Validate workflow YAML parse (advisory)
         run: |
diff --git a/.github/workflows/lint-shell.yml b/.github/workflows/lint-shell.yml
index edcac463..adfe868e 100644
--- a/.github/workflows/lint-shell.yml
+++ b/.github/workflows/lint-shell.yml
@@ -41,7 +41,7 @@ jobs:
             exit 0
           fi
 
-          shellcheck -S error "${files[@]}"
+          shellcheck -S error -s bash "${files[@]}"
           shfmt -d "${files[@]}"
 
           for f in "${files[@]}"; do
diff --git a/docker/af-pod-monitor/Dockerfile b/docker/af-pod-monitor/Dockerfile
index f03b2fba..c6f05b0b 100644
--- a/docker/af-pod-monitor/Dockerfile
+++ b/docker/af-pod-monitor/Dockerfile
@@ -2,10 +2,10 @@ FROM python:3.8-slim
 
 WORKDIR /etc/
 
-RUN pip install --no-cache-dir prometheus_client==0.22.1
+RUN pip install --no-cache-dir prometheus_client==0.21.1
 
 COPY pod-metrics-exporter.py /etc/
 
 RUN chmod +x  /etc/pod-metrics-exporter.py
 
-CMD ["python", "/etc/pod-metrics-exporter.py"]
\ No newline at end of file
+CMD ["python", "/etc/pod-metrics-exporter.py"]
diff --git a/docker/purdue-af/genaistudio/genaistudio.py b/docker/purdue-af/genaistudio/genaistudio.py
index cbc768fe..832d3be2 100644
--- a/docker/purdue-af/genaistudio/genaistudio.py
+++ b/docker/purdue-af/genaistudio/genaistudio.py
@@ -19,9 +19,7 @@ class PurdueGenAIStudioProvider(BaseProvider, ChatOpenAI):
     )
 
     def __init__(self, **kwargs):
-        super().__init__(
-            openai_api_base="https://genai.rcac.purdue.edu/api", **kwargs
-        )
+        super().__init__(openai_api_base="https://genai.rcac.purdue.edu/api", **kwargs)
 
     @classmethod
     def is_api_key_exc(cls, e: Exception):

From 9b769d0e9f9942ab372ae8940ac8c7853ddca2b2 Mon Sep 17 00:00:00 2001
From: Dmitry Kondratyev <kondratyev.d.95@gmail.com>
Date: Wed, 4 Feb 2026 09:47:57 -0500
Subject: [PATCH 06/25] Align python and shell lint checks with baseline style

---
 .github/scripts/container-smoke.sh | 30 +++++++++++++++---------------
 .github/workflows/lint-python.yml  |  2 +-
 .github/workflows/lint-shell.yml   |  5 ++++-
 3 files changed, 20 insertions(+), 17 deletions(-)

diff --git a/.github/scripts/container-smoke.sh b/.github/scripts/container-smoke.sh
index cf1bb967..6fca637d 100755
--- a/.github/scripts/container-smoke.sh
+++ b/.github/scripts/container-smoke.sh
@@ -2,8 +2,8 @@
 set -euo pipefail
 
 if [ "$#" -ne 2 ]; then
-  echo "Usage: $0 <image> <profile>" >&2
-  exit 2
+	echo "Usage: $0 <image> <profile>" >&2
+	exit 2
 fi
 
 image="$1"
@@ -12,19 +12,19 @@ profile="$2"
 docker image inspect "$image" >/dev/null
 
 case "$profile" in
-  af-pod-monitor)
-    docker run --rm --entrypoint python "$image" -c "import prometheus_client"
-    ;;
-  interlink-slurm-plugin)
-    docker run --rm --entrypoint /bin/sh "$image" -lc 'test -x /sidecar/slurm-sidecar'
-    ;;
-  purdue-af)
-    docker run --rm --entrypoint /bin/bash "$image" -lc 'python --version && jupyter --version >/dev/null'
-    ;;
-  *)
-    echo "Unknown profile: $profile" >&2
-    exit 2
-    ;;
+af-pod-monitor)
+	docker run --rm --entrypoint python "$image" -c "import prometheus_client"
+	;;
+interlink-slurm-plugin)
+	docker run --rm --entrypoint /bin/sh "$image" -lc 'test -x /sidecar/slurm-sidecar'
+	;;
+purdue-af)
+	docker run --rm --entrypoint /bin/bash "$image" -lc 'python --version && jupyter --version >/dev/null'
+	;;
+*)
+	echo "Unknown profile: $profile" >&2
+	exit 2
+	;;
 esac
 
 echo "Smoke checks passed for profile: $profile"
diff --git a/.github/workflows/lint-python.yml b/.github/workflows/lint-python.yml
index 0f4d8686..452b39d9 100644
--- a/.github/workflows/lint-python.yml
+++ b/.github/workflows/lint-python.yml
@@ -42,7 +42,7 @@ jobs:
           fi
 
           black --check "${files[@]}"
-          isort --check-only "${files[@]}"
+          isort --profile black --check-only "${files[@]}"
           python -m py_compile "${files[@]}"
 
       - name: Run pytest (advisory)
diff --git a/.github/workflows/lint-shell.yml b/.github/workflows/lint-shell.yml
index adfe868e..d12762ed 100644
--- a/.github/workflows/lint-shell.yml
+++ b/.github/workflows/lint-shell.yml
@@ -42,7 +42,10 @@ jobs:
           fi
 
           shellcheck -S error -s bash "${files[@]}"
-          shfmt -d "${files[@]}"
+          mapfile -t shfmt_files < <(printf '%s\n' "${files[@]}" | grep -E '\.sh$' || true)
+          if [ "${#shfmt_files[@]}" -gt 0 ]; then
+            shfmt -d "${shfmt_files[@]}"
+          fi
 
           for f in "${files[@]}"; do
             bash -n "$f"

From 05beb618a2da0ada3ef787a23f3c20d54b3fc77d Mon Sep 17 00:00:00 2001
From: Dmitry Kondratyev <kondratyev.d.95@gmail.com>
Date: Wed, 4 Feb 2026 10:31:04 -0500
Subject: [PATCH 07/25] Refresh CI plan to current optimization workstreams

---
 .codex/CI_PLAN.md | 172 ++++++++++++++++++++++++----------------------
 1 file changed, 90 insertions(+), 82 deletions(-)

diff --git a/.codex/CI_PLAN.md b/.codex/CI_PLAN.md
index 63e5d390..584661bd 100644
--- a/.codex/CI_PLAN.md
+++ b/.codex/CI_PLAN.md
@@ -1,18 +1,20 @@
 # CI/CD Campaign Plan (Current State)
 
-## Mission and Success Criteria
-Deliver exactly one draft PR from `codex/ci` to `main` with minimal CI/CD hardening that:
-- converts formatter-based CI to check-only behavior,
-- adds advisory-first integrity/deploy/security coverage,
-- keeps one source of truth in this file,
-- preserves safe daily branch sync (`main` merged into `codex/ci`, no force-push).
-
-Success means:
-- `.github/workflows/lint-*.yml` workflows are check-only and run on `pull_request` (single run per change),
-- new workflows exist for integrity, GitOps deployability, and nightly advisory security,
-- optional repo-quality workflow is selected and included,
-- README shows A-E category badges,
-- no changes touch out-of-scope paths.
+## Mission
+Deliver one draft PR from `codex/ci` to `main` with a stable advisory-first CI baseline, then optimize test depth, integration realism, and security signal without broad refactors.
+
+## Current Status
+- PR branch: `codex/ci`
+- Delivery model: single PR `codex/ci -> main`
+- Existing CI baseline is green on PR checks.
+- Formatter/linter workflows are check-only (no CI writeback commits).
+
+## Success Criteria
+- CI remains stable on `pull_request` runs for all configured workflows.
+- Optimization phase adds meaningful unit and integration coverage for repo-owned code.
+- Security checks include nightly advisory plus PR-time advisory signal.
+- `README.md` keeps A-E category badges aligned with active workflows.
+- `.codex/CI_PLAN.md` remains the single source of truth.
 
 ## In-Scope / Out-of-Scope Paths
 In scope:
@@ -32,78 +34,84 @@ Out of scope:
 - `.cursor/**`
 
 Approved exception:
-- `slurm/**` is used as a dependency-only trigger for container reliability jobs because maintained Dockerfiles copy `slurm/` artifacts.
-
-## Target Check Architecture
+- `slurm/**` is used as a dependency-only trigger in container reliability path filters because maintained Dockerfiles copy `slurm/` artifacts.
+
+## Active Workflow Surface
+- `.github/workflows/ci-workflow-integrity.yml`
+- `.github/workflows/lint-python.yml`
+- `.github/workflows/lint-shell.yml`
+- `.github/workflows/lint-json.yml`
+- `.github/workflows/lint-yaml.yml`
+- `.github/workflows/ci-repo-quality.yml`
+- `.github/workflows/lint-docker.yml`
+- `.github/workflows/ci-gitops-deployability.yml`
+- `.github/workflows/nightly-security-advisory.yml`
+
+## Check Architecture
 ### A) CI System Integrity (advisory)
-- Workflow: `.github/workflows/ci-workflow-integrity.yml`
-- Checks: `actionlint` + workflow YAML parse.
-- Risk mapped: malformed workflows, invalid action definitions, skipped CI due syntax/runtime issues.
-
-### B) Repo-Owned Code Quality / Tests (advisory additions)
-- Workflows:
-  - `.github/workflows/lint-python.yml`
-  - `.github/workflows/lint-shell.yml`
-  - `.github/workflows/lint-json.yml`
-  - `.github/workflows/lint-yaml.yml`
-  - `.github/workflows/ci-repo-quality.yml` (selected)
-- Checks: black/isort check-only, py_compile, pytest (advisory), shellcheck/shfmt/bash -n, JSON/YAML parse checks.
-- Risk mapped: runtime and script regressions.
-
-### C) Container Reliability (advisory additions)
-- Workflow: `.github/workflows/lint-docker.yml`
-- Checks: hadolint (check-only), advisory docker build/smoke for maintained Dockerfiles via `.github/scripts/container-smoke.sh`.
-- Risk mapped: container build/runtime breakage.
-
-### D) GitOps/K8s Deployability (advisory)
-- Workflow: `.github/workflows/ci-gitops-deployability.yml`
-- Checks: `kustomize build --load-restrictor LoadRestrictionsNone` for all deploy overlays + `kubeconform` schema validation.
-- Risk mapped: Flux reconciliation failures from invalid manifests.
-
-### E) Nightly Advisory Security
-- Workflow: `.github/workflows/nightly-security-advisory.yml`
-- Checks: Trivy filesystem scan (HIGH/CRITICAL).
-- Risk mapped: security posture drift.
-
-## Advisory vs Future Blocking Milestones
-- M0 (this campaign): all newly introduced validations advisory.
-- M1: promote workflow integrity + repo-quality checks to blocking after stable baseline.
-- M2: promote container + GitOps checks to blocking after stable baseline.
-- M3: keep nightly security advisory unless explicitly promoted.
-
-## Agent Lane Ownership (File Level)
-- Coordinator: `.codex/CI_PLAN.md`, `README.md`, branch/PR/sync operations.
-- Agent A: `.github/workflows/ci-workflow-integrity.yml` (+ selection recommendation in chat).
-- Agent B: `.github/workflows/lint-python.yml`, `.github/workflows/lint-shell.yml`, `.github/workflows/ci-repo-quality.yml`, optional B helper scripts.
-- Agent C: `.github/workflows/lint-json.yml`, `.github/workflows/lint-yaml.yml`.
-- Agent D: `.github/workflows/lint-docker.yml`, `.github/scripts/container-smoke.sh`.
-- Agent E: `.github/workflows/ci-gitops-deployability.yml`, `.github/workflows/nightly-security-advisory.yml`.
-
-## Phased Rollout and Rollback
-Rollout:
-1. First commit creates this file.
-2. Add/convert workflows in lane-owned files only.
-3. Keep PR draft until baseline checks stabilize.
-4. Daily sync by merging `main` into `codex/ci`.
-
-Rollback:
-- Revert only unstable workflow files in small commits.
-- Keep advisory mode active during stabilization.
-
-## Reproducible Runbook (from clean main)
-1. `git fetch origin`
-2. `git switch main && git pull --ff-only origin main`
-3. `git switch -c codex/ci` (or `git switch codex/ci`)
-4. Commit #1: `.codex/CI_PLAN.md`
-5. Apply lane-scoped workflow changes
-6. `git push -u origin codex/ci`
-7. Open one draft PR `codex/ci -> main`
-8. Daily sync: `git fetch origin && git switch codex/ci && git merge --no-ff origin/main`
+- Workflow: `ci-workflow-integrity.yml`
+- Checks: actionlint + workflow YAML parse.
+- Risk: broken workflow definitions and silent CI drift.
+
+### B) Repo Quality and Tests (advisory)
+- Workflows: `lint-python.yml`, `lint-shell.yml`, `lint-json.yml`, `lint-yaml.yml`, `ci-repo-quality.yml`
+- Checks: black/isort check-only, py_compile, pytest advisory, shellcheck/shfmt/bash -n, JSON/YAML parse.
+- Risk: script/runtime regressions.
+
+### C) Container Reliability (advisory)
+- Workflow: `lint-docker.yml`
+- Checks: hadolint, targeted docker build jobs, smoke checks via `.github/scripts/container-smoke.sh`.
+- Risk: image build/runtime regressions.
+
+### D) GitOps Deployability (advisory)
+- Workflow: `ci-gitops-deployability.yml`
+- Checks: kustomize render + kubeconform schema validation.
+- Risk: Flux reconciliation failures from invalid manifests.
+
+### E) Security Posture (advisory)
+- Workflow: `nightly-security-advisory.yml`
+- Checks: nightly Trivy filesystem scan.
+- Risk: security drift in dependencies/configuration.
+
+## Optimization Workstreams (Current)
+### Worker 1: Coverage Optimizer
+File lane:
+- `tests/unit/**`
+- `tests/conftest.py`
+- `.github/workflows/lint-python.yml`
+- `.github/workflows/ci-repo-quality.yml`
+Goal:
+- Increase meaningful Python test coverage and publish coverage in CI (advisory threshold first).
+
+### Worker 2: Integration Scenarios
+File lane:
+- `tests/integration/**`
+- `tests/fixtures/**`
+- `.github/workflows/ci-integration-scenarios.yml` (new)
+- `.github/scripts/integration/**`
+Goal:
+- Add realistic automated integration scenarios with deterministic mocks and PR advisory execution.
+
+### Worker 3: Security and Runtime Optimizer
+File lane:
+- `.github/workflows/nightly-security-advisory.yml`
+- `.github/workflows/ci-security-advisory.yml` (new)
+- `.github/workflows/lint-docker.yml`
+- `.github/workflows/ci-gitops-deployability.yml`
+Goal:
+- Add PR-time advisory security checks and reduce CI runtime/noise safely.
+
+## Branch and Sync Rules
+- No side branches.
+- No force-push on shared campaign work.
+- Daily sync: merge `main` into `codex/ci` (no rebase).
+- Keep PR draft until optimization baseline is stable.
 
 ## Constraint Challenge Protocol
-If a hard constraint appears to conflict with delivery, create an `EXCEPTION REQUEST` with:
+If any hard constraint must be challenged, submit an `EXCEPTION REQUEST` with:
 1) challenged constraint,
 2) concrete risk if unchanged,
-3) minimal exception,
+3) minimal exception requested,
 4) rollback path.
-Do not implement exception changes before explicit user approval.
+
+No exception is implemented without explicit user approval.

From 43e0d63bf1db3290b92c577e73b33bf7b9d21a7b Mon Sep 17 00:00:00 2001
From: Dmitry Kondratyev <kondratyev.d.95@gmail.com>
Date: Wed, 4 Feb 2026 11:08:50 -0500
Subject: [PATCH 08/25] ci(worker2): add scoped integration scenarios and
 fixture matrix

---
 .../scripts/integration/mock-docker-cli.sh    |  45 ++++++++
 .../integration/run-integration-scenarios.sh  |   7 ++
 .../workflows/ci-integration-scenarios.yml    |  31 ++++++
 tests/fixtures/container_smoke/matrix.json    | 102 ++++++++++++++++++
 .../fixtures/monitoring/event_rate_cases.json |  12 +++
 .../monitoring/event_rate_invalid.txt         |   1 +
 .../fixtures/monitoring/event_rate_valid.txt  |   1 +
 .../fixtures/node_health/checksum_cases.json  |  57 ++++++++++
 tests/integration/common.py                   |  90 ++++++++++++++++
 .../test_container_smoke_matrix.py            |  68 ++++++++++++
 .../test_monitoring_metric_update.py          |  59 ++++++++++
 .../test_node_healthcheck_integration.py      |  85 +++++++++++++++
 12 files changed, 558 insertions(+)
 create mode 100755 .github/scripts/integration/mock-docker-cli.sh
 create mode 100755 .github/scripts/integration/run-integration-scenarios.sh
 create mode 100644 .github/workflows/ci-integration-scenarios.yml
 create mode 100644 tests/fixtures/container_smoke/matrix.json
 create mode 100644 tests/fixtures/monitoring/event_rate_cases.json
 create mode 100644 tests/fixtures/monitoring/event_rate_invalid.txt
 create mode 100644 tests/fixtures/monitoring/event_rate_valid.txt
 create mode 100644 tests/fixtures/node_health/checksum_cases.json
 create mode 100644 tests/integration/common.py
 create mode 100644 tests/integration/test_container_smoke_matrix.py
 create mode 100644 tests/integration/test_monitoring_metric_update.py
 create mode 100644 tests/integration/test_node_healthcheck_integration.py

diff --git a/.github/scripts/integration/mock-docker-cli.sh b/.github/scripts/integration/mock-docker-cli.sh
new file mode 100755
index 00000000..04969cda
--- /dev/null
+++ b/.github/scripts/integration/mock-docker-cli.sh
@@ -0,0 +1,45 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+if [ -n "${MOCK_DOCKER_LOG:-}" ]; then
+  printf '%s\n' "$*" >> "$MOCK_DOCKER_LOG"
+fi
+
+cmd="${1:-}"
+shift || true
+
+case "$cmd" in
+  image)
+    subcmd="${1:-}"
+    shift || true
+    if [ "$subcmd" != "inspect" ]; then
+      echo "mock docker unsupported image subcommand: $subcmd" >&2
+      exit 64
+    fi
+
+    if [ -n "${MOCK_DOCKER_INSPECT_STDOUT:-}" ]; then
+      printf '%s\n' "$MOCK_DOCKER_INSPECT_STDOUT"
+    fi
+    if [ -n "${MOCK_DOCKER_INSPECT_STDERR:-}" ]; then
+      printf '%s\n' "$MOCK_DOCKER_INSPECT_STDERR" >&2
+    fi
+
+    exit "${MOCK_DOCKER_INSPECT_EXIT:-0}"
+    ;;
+
+  run)
+    if [ -n "${MOCK_DOCKER_RUN_STDOUT:-}" ]; then
+      printf '%s\n' "$MOCK_DOCKER_RUN_STDOUT"
+    fi
+    if [ -n "${MOCK_DOCKER_RUN_STDERR:-}" ]; then
+      printf '%s\n' "$MOCK_DOCKER_RUN_STDERR" >&2
+    fi
+
+    exit "${MOCK_DOCKER_RUN_EXIT:-0}"
+    ;;
+
+  *)
+    echo "mock docker unsupported command: $cmd" >&2
+    exit 64
+    ;;
+esac
diff --git a/.github/scripts/integration/run-integration-scenarios.sh b/.github/scripts/integration/run-integration-scenarios.sh
new file mode 100755
index 00000000..20190685
--- /dev/null
+++ b/.github/scripts/integration/run-integration-scenarios.sh
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." && pwd)"
+
+cd "$repo_root"
+python3 -m unittest discover -s tests/integration -p 'test_*.py' -v
diff --git a/.github/workflows/ci-integration-scenarios.yml b/.github/workflows/ci-integration-scenarios.yml
new file mode 100644
index 00000000..2908b38d
--- /dev/null
+++ b/.github/workflows/ci-integration-scenarios.yml
@@ -0,0 +1,31 @@
+name: CI Integration Scenarios
+
+on:
+  pull_request:
+    paths:
+      - tests/integration/**
+      - tests/fixtures/**
+      - .github/scripts/integration/**
+      - .github/scripts/container-smoke.sh
+      - apps/monitoring/af-monitoring/metrics_server.py
+      - apps/monitoring/af-monitoring/node_healthcheck.py
+      - .github/workflows/ci-integration-scenarios.yml
+
+permissions:
+  contents: read
+
+jobs:
+  integration-scenarios:
+    runs-on: ubuntu-latest
+    continue-on-error: true
+    timeout-minutes: 10
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Run integration scenarios (advisory)
+        run: bash .github/scripts/integration/run-integration-scenarios.sh
diff --git a/tests/fixtures/container_smoke/matrix.json b/tests/fixtures/container_smoke/matrix.json
new file mode 100644
index 00000000..78619afd
--- /dev/null
+++ b/tests/fixtures/container_smoke/matrix.json
@@ -0,0 +1,102 @@
+[
+  {
+    "name": "af_pod_monitor_success",
+    "image": "ghcr.io/purdue-af/af-pod-monitor:test",
+    "profile": "af-pod-monitor",
+    "mock": {
+      "inspect_exit": 0,
+      "run_exit": 0
+    },
+    "expected": {
+      "exit_code": 0,
+      "stdout_contains": [
+        "Smoke checks passed for profile: af-pod-monitor"
+      ],
+      "stderr_contains": [],
+      "log_lines": [
+        "image inspect ghcr.io/purdue-af/af-pod-monitor:test",
+        "run --rm --entrypoint python ghcr.io/purdue-af/af-pod-monitor:test -c import prometheus_client"
+      ]
+    }
+  },
+  {
+    "name": "interlink_slurm_plugin_failure",
+    "image": "ghcr.io/purdue-af/interlink-slurm-plugin:test",
+    "profile": "interlink-slurm-plugin",
+    "mock": {
+      "inspect_exit": 0,
+      "run_exit": 1,
+      "run_stderr": "missing /sidecar/slurm-sidecar"
+    },
+    "expected": {
+      "exit_code": 1,
+      "stdout_contains": [],
+      "stderr_contains": [
+        "missing /sidecar/slurm-sidecar"
+      ],
+      "log_lines": [
+        "image inspect ghcr.io/purdue-af/interlink-slurm-plugin:test",
+        "run --rm --entrypoint /bin/sh ghcr.io/purdue-af/interlink-slurm-plugin:test -lc test -x /sidecar/slurm-sidecar"
+      ]
+    }
+  },
+  {
+    "name": "unknown_profile_rejected",
+    "image": "ghcr.io/purdue-af/custom:test",
+    "profile": "unknown-profile",
+    "mock": {
+      "inspect_exit": 0,
+      "run_exit": 0
+    },
+    "expected": {
+      "exit_code": 2,
+      "stdout_contains": [],
+      "stderr_contains": [
+        "Unknown profile: unknown-profile"
+      ],
+      "log_lines": [
+        "image inspect ghcr.io/purdue-af/custom:test"
+      ]
+    }
+  },
+  {
+    "name": "inspect_failure_short_circuit",
+    "image": "ghcr.io/purdue-af/purdue-af:test",
+    "profile": "purdue-af",
+    "mock": {
+      "inspect_exit": 1,
+      "inspect_stderr": "image not found",
+      "run_exit": 0
+    },
+    "expected": {
+      "exit_code": 1,
+      "stdout_contains": [],
+      "stderr_contains": [
+        "image not found"
+      ],
+      "log_lines": [
+        "image inspect ghcr.io/purdue-af/purdue-af:test"
+      ]
+    }
+  },
+  {
+    "name": "purdue_af_success",
+    "image": "ghcr.io/purdue-af/purdue-af:test",
+    "profile": "purdue-af",
+    "mock": {
+      "inspect_exit": 0,
+      "run_exit": 0
+    },
+    "expected": {
+      "exit_code": 0,
+      "stdout_contains": [
+        "Smoke checks passed for profile: purdue-af"
+      ],
+      "stderr_contains": [],
+      "log_lines": [
+        "image inspect ghcr.io/purdue-af/purdue-af:test",
+        "run --rm --entrypoint /bin/bash ghcr.io/purdue-af/purdue-af:test -lc python --version && jupyter --version >/dev/null"
+      ]
+    }
+  }
+]
diff --git a/tests/fixtures/monitoring/event_rate_cases.json b/tests/fixtures/monitoring/event_rate_cases.json
new file mode 100644
index 00000000..785a3de0
--- /dev/null
+++ b/tests/fixtures/monitoring/event_rate_cases.json
@@ -0,0 +1,12 @@
+[
+  {
+    "name": "valid_event_rate",
+    "fixture_file": "event_rate_valid.txt",
+    "expected_gauge_value": 128.5
+  },
+  {
+    "name": "invalid_event_rate",
+    "fixture_file": "event_rate_invalid.txt",
+    "expected_gauge_value": 0
+  }
+]
diff --git a/tests/fixtures/monitoring/event_rate_invalid.txt b/tests/fixtures/monitoring/event_rate_invalid.txt
new file mode 100644
index 00000000..3fb64bad
--- /dev/null
+++ b/tests/fixtures/monitoring/event_rate_invalid.txt
@@ -0,0 +1 @@
+not-a-number
diff --git a/tests/fixtures/monitoring/event_rate_valid.txt b/tests/fixtures/monitoring/event_rate_valid.txt
new file mode 100644
index 00000000..ae500ed9
--- /dev/null
+++ b/tests/fixtures/monitoring/event_rate_valid.txt
@@ -0,0 +1 @@
+128.5
diff --git a/tests/fixtures/node_health/checksum_cases.json b/tests/fixtures/node_health/checksum_cases.json
new file mode 100644
index 00000000..954d99b8
--- /dev/null
+++ b/tests/fixtures/node_health/checksum_cases.json
@@ -0,0 +1,57 @@
+[
+  {
+    "name": "checksum_match",
+    "mode": "normal",
+    "filename": "/depot/cms/purdue-af/validate-mount.txt",
+    "expected_checksum": "13dede34ee8dc7e5b70c9cd06ac15467",
+    "md5_stdout": "13dede34ee8dc7e5b70c9cd06ac15467  /depot/cms/purdue-af/validate-mount.txt\n",
+    "md5_stderr": "",
+    "returncode": 0,
+    "start_time": 1000.0,
+    "end_time": 1000.123,
+    "expected_result": true,
+    "expected_ping_ms": 123.0,
+    "expect_killed": false
+  },
+  {
+    "name": "checksum_mismatch",
+    "mode": "normal",
+    "filename": "/work/projects/purdue-af/validate-mount.txt",
+    "expected_checksum": "f4cb7f2740ba3e87edfbda6c70fa94c2",
+    "md5_stdout": "00000000000000000000000000000000  /work/projects/purdue-af/validate-mount.txt\n",
+    "md5_stderr": "",
+    "returncode": 0,
+    "start_time": 2000.0,
+    "end_time": 2000.05,
+    "expected_result": false,
+    "expected_ping_ms": 50.0,
+    "expect_killed": false
+  },
+  {
+    "name": "md5_error_returncode",
+    "mode": "normal",
+    "filename": "/eos/purdue/store/user/dkondrat/test.root",
+    "expected_checksum": "18864b0de8ae5a6a8d3b459a7999b431",
+    "md5_stdout": "",
+    "md5_stderr": "No such file or directory",
+    "returncode": 1,
+    "start_time": 3000.0,
+    "end_time": 3000.08,
+    "expected_result": false,
+    "expected_ping_ms": 80.0,
+    "expect_killed": false
+  },
+  {
+    "name": "md5_timeout",
+    "mode": "timeout",
+    "filename": "/cvmfs/cms.cern.ch/SITECONF/T2_US_Purdue/Purdue-Hadoop/JobConfig/site-local-config.xml",
+    "expected_checksum": "3b570d80272b7188c13cef51e58b7151",
+    "md5_stdout": "",
+    "md5_stderr": "",
+    "returncode": 124,
+    "start_time": 4000.0,
+    "expected_result": false,
+    "expected_ping_ms": 3000,
+    "expect_killed": true
+  }
+]
diff --git a/tests/integration/common.py b/tests/integration/common.py
new file mode 100644
index 00000000..9ad1358a
--- /dev/null
+++ b/tests/integration/common.py
@@ -0,0 +1,90 @@
+from __future__ import annotations
+
+import importlib.util
+import json
+import sys
+import types
+from pathlib import Path
+from typing import Any
+
+REPO_ROOT = Path(__file__).resolve().parents[2]
+FIXTURES_ROOT = REPO_ROOT / "tests" / "fixtures"
+
+
+class FakeGaugeChild:
+    def __init__(self, labels: dict[str, str]):
+        self.labels = labels
+        self.value: float | int | None = None
+        self.history: list[float | int] = []
+
+    def set(self, value: float | int) -> None:
+        self.value = value
+        self.history.append(value)
+
+
+class FakeGauge:
+    def __init__(
+        self,
+        name: str,
+        description: str,
+        label_names: list[str] | tuple[str, ...] | None = None,
+    ):
+        self.name = name
+        self.description = description
+        self.label_names = tuple(label_names or ())
+        self.value: float | int | None = None
+        self.history: list[float | int] = []
+        self.children: dict[tuple[tuple[str, str], ...], FakeGaugeChild] = {}
+
+    def set(self, value: float | int) -> None:
+        self.value = value
+        self.history.append(value)
+
+    def labels(self, *args: str, **kwargs: str) -> FakeGaugeChild:
+        if args and kwargs:
+            raise ValueError("labels accepts positional or keyword labels, not both")
+
+        if args:
+            if len(args) != len(self.label_names):
+                raise ValueError("label count does not match")
+            label_values = dict(zip(self.label_names, args))
+        else:
+            label_values = {name: kwargs[name] for name in self.label_names}
+
+        key = tuple((name, label_values[name]) for name in self.label_names)
+        child = self.children.get(key)
+        if child is None:
+            child = FakeGaugeChild(label_values)
+            self.children[key] = child
+        return child
+
+
+def load_json_fixture(relative_path: str) -> Any:
+    fixture_path = FIXTURES_ROOT / relative_path
+    with fixture_path.open("r", encoding="utf-8") as f:
+        return json.load(f)
+
+
+def load_module_with_fake_prometheus(relative_path: str, module_name: str):
+    module_path = REPO_ROOT / relative_path
+    spec = importlib.util.spec_from_file_location(module_name, module_path)
+    if spec is None or spec.loader is None:
+        raise RuntimeError(f"Could not load module spec for {module_path}")
+
+    module = importlib.util.module_from_spec(spec)
+
+    fake_prometheus = types.ModuleType("prometheus_client")
+    fake_prometheus.Gauge = FakeGauge
+    fake_prometheus.start_http_server = lambda *_args, **_kwargs: None
+
+    original_prometheus = sys.modules.get("prometheus_client")
+    sys.modules["prometheus_client"] = fake_prometheus
+    try:
+        spec.loader.exec_module(module)
+    finally:
+        if original_prometheus is None:
+            del sys.modules["prometheus_client"]
+        else:
+            sys.modules["prometheus_client"] = original_prometheus
+
+    return module
diff --git a/tests/integration/test_container_smoke_matrix.py b/tests/integration/test_container_smoke_matrix.py
new file mode 100644
index 00000000..b3215c6f
--- /dev/null
+++ b/tests/integration/test_container_smoke_matrix.py
@@ -0,0 +1,68 @@
+from __future__ import annotations
+
+import os
+from pathlib import Path
+import subprocess
+import tempfile
+import unittest
+
+from common import REPO_ROOT, load_json_fixture
+
+CONTAINER_SMOKE_SCRIPT = REPO_ROOT / ".github/scripts/container-smoke.sh"
+MOCK_DOCKER_SCRIPT = REPO_ROOT / ".github/scripts/integration/mock-docker-cli.sh"
+
+
+class ContainerSmokeBehaviorMatrixIntegrationTest(unittest.TestCase):
+    def test_container_smoke_behavior_matrix(self) -> None:
+        cases = load_json_fixture("container_smoke/matrix.json")
+
+        for case in cases:
+            with self.subTest(case=case["name"]):
+                with tempfile.TemporaryDirectory() as temp_dir:
+                    temp_path = Path(temp_dir)
+                    docker_wrapper = temp_path / "docker"
+                    docker_wrapper.write_text(
+                        f"#!/usr/bin/env bash\nexec \"{MOCK_DOCKER_SCRIPT}\" \"$@\"\n",
+                        encoding="utf-8",
+                    )
+                    docker_wrapper.chmod(0o755)
+
+                    log_file = temp_path / "docker.log"
+                    env = os.environ.copy()
+                    env["PATH"] = f"{temp_path}:{env.get('PATH', '')}"
+                    env["MOCK_DOCKER_LOG"] = str(log_file)
+                    env["MOCK_DOCKER_INSPECT_EXIT"] = str(case["mock"]["inspect_exit"])
+                    env["MOCK_DOCKER_RUN_EXIT"] = str(case["mock"]["run_exit"])
+                    env["MOCK_DOCKER_INSPECT_STDERR"] = case["mock"].get(
+                        "inspect_stderr", ""
+                    )
+                    env["MOCK_DOCKER_RUN_STDERR"] = case["mock"].get("run_stderr", "")
+
+                    result = subprocess.run(
+                        [
+                            "bash",
+                            str(CONTAINER_SMOKE_SCRIPT),
+                            case["image"],
+                            case["profile"],
+                        ],
+                        capture_output=True,
+                        text=True,
+                        check=False,
+                        env=env,
+                    )
+
+                    expected = case["expected"]
+                    self.assertEqual(result.returncode, expected["exit_code"])
+                    for expected_text in expected["stdout_contains"]:
+                        self.assertIn(expected_text, result.stdout)
+                    for expected_text in expected["stderr_contains"]:
+                        self.assertIn(expected_text, result.stderr)
+
+                    logged_lines = []
+                    if log_file.exists():
+                        logged_lines = log_file.read_text(encoding="utf-8").splitlines()
+                    self.assertEqual(logged_lines, expected["log_lines"])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/integration/test_monitoring_metric_update.py b/tests/integration/test_monitoring_metric_update.py
new file mode 100644
index 00000000..05b3a6fe
--- /dev/null
+++ b/tests/integration/test_monitoring_metric_update.py
@@ -0,0 +1,59 @@
+from __future__ import annotations
+
+from pathlib import Path
+import unittest
+from unittest import mock
+from uuid import uuid4
+
+from common import FIXTURES_ROOT, load_json_fixture, load_module_with_fake_prometheus
+
+METRIC_FILE = "/work/projects/purdue-af/agc/metrics/event_rate.txt"
+MODULE_PATH = "apps/monitoring/af-monitoring/metrics_server.py"
+
+
+class MonitoringMetricUpdateFlowIntegrationTest(unittest.TestCase):
+    def setUp(self) -> None:
+        module_name = f"metrics_server_integration_{uuid4().hex}"
+        self.module = load_module_with_fake_prometheus(MODULE_PATH, module_name)
+
+    def _patched_open_for_fixture(self, fixture_path: Path):
+        real_open = open
+
+        def _patched_open(path, *args, **kwargs):
+            if str(path) == METRIC_FILE:
+                return real_open(fixture_path, *args, **kwargs)
+            return real_open(path, *args, **kwargs)
+
+        return _patched_open
+
+    def test_fixture_backed_metric_updates(self) -> None:
+        cases = load_json_fixture("monitoring/event_rate_cases.json")
+
+        for case in cases:
+            fixture_path = FIXTURES_ROOT / "monitoring" / case["fixture_file"]
+            with self.subTest(case=case["name"]), mock.patch(
+                "builtins.open",
+                side_effect=self._patched_open_for_fixture(fixture_path),
+            ):
+                self.module.update_metrics()
+                self.assertEqual(
+                    self.module.event_rate_per_worker.history[-1],
+                    case["expected_gauge_value"],
+                )
+
+    def test_missing_metric_file_falls_back_to_zero(self) -> None:
+        real_open = open
+
+        def _patched_open(path, *args, **kwargs):
+            if str(path) == METRIC_FILE:
+                raise FileNotFoundError("event rate fixture not found")
+            return real_open(path, *args, **kwargs)
+
+        with mock.patch("builtins.open", side_effect=_patched_open):
+            self.module.update_metrics()
+
+        self.assertEqual(self.module.event_rate_per_worker.history[-1], 0)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/integration/test_node_healthcheck_integration.py b/tests/integration/test_node_healthcheck_integration.py
new file mode 100644
index 00000000..17abc15d
--- /dev/null
+++ b/tests/integration/test_node_healthcheck_integration.py
@@ -0,0 +1,85 @@
+from __future__ import annotations
+
+import subprocess
+import unittest
+from unittest import mock
+from uuid import uuid4
+
+from common import load_json_fixture, load_module_with_fake_prometheus
+
+MODULE_PATH = "apps/monitoring/af-monitoring/node_healthcheck.py"
+
+
+class FakeMd5Process:
+    def __init__(self, case: dict):
+        self.mode = case["mode"]
+        self.stdout = case["md5_stdout"]
+        self.stderr = case["md5_stderr"]
+        self.returncode = case["returncode"]
+        self.killed = False
+        self.communicate_calls = 0
+        self.timeout_history: list[float | int | None] = []
+
+    def communicate(self, timeout=None):
+        self.communicate_calls += 1
+        self.timeout_history.append(timeout)
+        if self.mode == "timeout" and self.communicate_calls == 1:
+            raise subprocess.TimeoutExpired(cmd="/usr/bin/md5sum", timeout=timeout)
+        return self.stdout, self.stderr
+
+    def kill(self):
+        self.killed = True
+
+
+class NodeHealthChecksumTimeoutIntegrationTest(unittest.TestCase):
+    def setUp(self) -> None:
+        module_name = f"node_healthcheck_integration_{uuid4().hex}"
+        self.module = load_module_with_fake_prometheus(MODULE_PATH, module_name)
+
+    def test_checksum_and_timeout_matrix(self) -> None:
+        cases = load_json_fixture("node_health/checksum_cases.json")
+
+        for case in cases:
+            process = FakeMd5Process(case)
+            time_values = [case["start_time"]]
+            if case["mode"] != "timeout":
+                time_values.append(case["end_time"])
+
+            with self.subTest(case=case["name"]), mock.patch.object(
+                self.module.subprocess,
+                "Popen",
+                return_value=process,
+            ) as popen_mock, mock.patch.object(
+                self.module.time,
+                "time",
+                side_effect=time_values,
+            ):
+                result, ping_ms = self.module.check_if_directory_exists(
+                    (case["filename"], case["expected_checksum"])
+                )
+
+            self.assertEqual(result, case["expected_result"])
+            self.assertEqual(process.killed, case["expect_killed"])
+            self.assertEqual(
+                popen_mock.call_args[0][0],
+                ["/usr/bin/md5sum", case["filename"]],
+            )
+            if case["mode"] == "timeout":
+                self.assertEqual(process.timeout_history, [3, None])
+            else:
+                self.assertEqual(process.timeout_history, [3])
+
+            expected_ping_ms = case["expected_ping_ms"]
+            if isinstance(expected_ping_ms, float):
+                self.assertAlmostEqual(ping_ms, expected_ping_ms, delta=0.001)
+            else:
+                self.assertEqual(ping_ms, expected_ping_ms)
+
+            if case["mode"] == "timeout":
+                self.assertEqual(process.communicate_calls, 2)
+            else:
+                self.assertEqual(process.communicate_calls, 1)
+
+
+if __name__ == "__main__":
+    unittest.main()

From 4ca5aa81fbbbb3465b9cee824b818ad430ab5a8d Mon Sep 17 00:00:00 2001
From: Dmitry Kondratyev <kondratyev.d.95@gmail.com>
Date: Wed, 4 Feb 2026 11:15:26 -0500
Subject: [PATCH 09/25] ci(worker1): tighten unit coverage flow and clean
 transient test artifacts

---
 .github/workflows/ci-repo-quality.yml   |  10 ++-
 .github/workflows/lint-python.yml       |  14 +---
 tests/conftest.py                       |  75 +++++++++++++++++
 tests/unit/test_docker_healthcheck.py   |  62 ++++++++++++++
 tests/unit/test_metrics_server.py       |  37 ++++++++
 tests/unit/test_node_healthcheck.py     | 107 ++++++++++++++++++++++++
 tests/unit/test_pod_metrics_exporter.py |  87 +++++++++++++++++++
 7 files changed, 377 insertions(+), 15 deletions(-)
 create mode 100644 tests/conftest.py
 create mode 100644 tests/unit/test_docker_healthcheck.py
 create mode 100644 tests/unit/test_metrics_server.py
 create mode 100644 tests/unit/test_node_healthcheck.py
 create mode 100644 tests/unit/test_pod_metrics_exporter.py

diff --git a/.github/workflows/ci-repo-quality.yml b/.github/workflows/ci-repo-quality.yml
index 247108bd..2af6c26c 100644
--- a/.github/workflows/ci-repo-quality.yml
+++ b/.github/workflows/ci-repo-quality.yml
@@ -22,7 +22,7 @@ jobs:
         run: |
           set -euo pipefail
           python -m pip install --upgrade pip
-          pip install pytest
+          pip install pytest pytest-cov
 
       - name: Python syntax smoke (advisory)
         shell: bash
@@ -61,7 +61,13 @@ jobs:
         shell: bash
         run: |
           set +e
-          pytest -q
+          pytest -q tests/unit \
+            --cov=apps/monitoring/af-monitoring/metrics_server.py \
+            --cov=apps/monitoring/af-monitoring/node_healthcheck.py \
+            --cov=docker/af-pod-monitor/pod-metrics-exporter.py \
+            --cov=docker/purdue-af/jupyter/docker_healthcheck.py \
+            --cov-report=term-missing \
+            --cov-fail-under=70
           rc=$?
           if [ "$rc" -eq 5 ]; then
             echo 'pytest collected no tests; treating as informational.'
diff --git a/.github/workflows/lint-python.yml b/.github/workflows/lint-python.yml
index 452b39d9..80dd38aa 100644
--- a/.github/workflows/lint-python.yml
+++ b/.github/workflows/lint-python.yml
@@ -22,7 +22,7 @@ jobs:
         run: |
           set -euo pipefail
           python -m pip install --upgrade pip
-          pip install black isort pytest
+          pip install black isort
 
       - name: Run black/isort/py_compile (check-only, advisory)
         shell: bash
@@ -44,15 +44,3 @@ jobs:
           black --check "${files[@]}"
           isort --profile black --check-only "${files[@]}"
           python -m py_compile "${files[@]}"
-
-      - name: Run pytest (advisory)
-        shell: bash
-        run: |
-          set +e
-          pytest -q
-          rc=$?
-          if [ "$rc" -eq 5 ]; then
-            echo 'pytest collected no tests; treating as informational.'
-            exit 0
-          fi
-          exit "$rc"
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 00000000..2caf62bf
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,75 @@
+from __future__ import annotations
+
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+from typing import Callable
+
+import pytest
+
+REPO_ROOT = Path(__file__).resolve().parents[1]
+
+
+class RecordingGauge:
+    def __init__(self) -> None:
+        self.values: list[float] = []
+        self.label_children: dict[tuple[tuple[str, str], ...], "RecordingGauge"] = {}
+
+    def set(self, value: float) -> None:
+        self.values.append(value)
+
+    def labels(self, **labels: str) -> "RecordingGauge":
+        key = tuple(sorted(labels.items()))
+        child = self.label_children.get(key)
+        if child is None:
+            child = RecordingGauge()
+            self.label_children[key] = child
+        return child
+
+
+@pytest.fixture
+def recording_gauge_cls():
+    return RecordingGauge
+
+
+@pytest.fixture
+def prometheus_stub() -> ModuleType:
+    module = ModuleType("prometheus_client")
+
+    class Gauge:
+        def __init__(self, *_args, **_kwargs) -> None:
+            self.values = []
+
+        def set(self, value: float) -> None:
+            self.values.append(value)
+
+        def labels(self, **_labels: str) -> "Gauge":
+            return self
+
+    module.Gauge = Gauge
+    module.start_http_server = lambda *_args, **_kwargs: None
+    return module
+
+
+@pytest.fixture
+def module_loader(monkeypatch: pytest.MonkeyPatch) -> Callable[..., object]:
+    counter = 0
+
+    def _load(relative_path: str, *, extra_modules: dict[str, object] | None = None) -> object:
+        nonlocal counter
+        counter += 1
+        module_name = f"test_module_{counter}"
+        module_path = REPO_ROOT / relative_path
+
+        if extra_modules:
+            for name, module in extra_modules.items():
+                monkeypatch.setitem(sys.modules, name, module)
+
+        spec = importlib.util.spec_from_file_location(module_name, module_path)
+        assert spec is not None and spec.loader is not None
+        module = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(module)
+        return module
+
+    return _load
diff --git a/tests/unit/test_docker_healthcheck.py b/tests/unit/test_docker_healthcheck.py
new file mode 100644
index 00000000..bbd7fd1b
--- /dev/null
+++ b/tests/unit/test_docker_healthcheck.py
@@ -0,0 +1,62 @@
+from __future__ import annotations
+
+import json
+from types import ModuleType
+
+
+class _FakeJsonFile:
+    def __init__(self, payload: bytes) -> None:
+        self.payload = payload
+
+    def read_bytes(self) -> bytes:
+        return self.payload
+
+
+class _FakePath:
+    def __init__(self, payload: bytes) -> None:
+        self.payload = payload
+
+    def __truediv__(self, _part: str) -> "_FakePath":
+        return self
+
+    def glob(self, _pattern: str):
+        return iter([_FakeJsonFile(self.payload)])
+
+
+def test_healthcheck_queries_jupyter_api_and_prints_response(monkeypatch, module_loader) -> None:
+    captured = {}
+    payload = json.dumps({"url": "https://af.example/"}).encode("utf-8")
+
+    class _FakeResponse:
+        def __init__(self) -> None:
+            self.content = b"healthy"
+            self.raise_calls = 0
+
+        def raise_for_status(self) -> None:
+            self.raise_calls += 1
+
+    fake_response = _FakeResponse()
+    requests_stub = ModuleType("requests")
+
+    def _fake_get(url: str, verify: bool):
+        captured["url"] = url
+        captured["verify"] = verify
+        return fake_response
+
+    requests_stub.get = _fake_get
+
+    pathlib_stub = ModuleType("pathlib")
+    pathlib_stub.Path = lambda _value: _FakePath(payload)
+
+    printed = []
+    monkeypatch.setenv("NB_USER", "alice")
+    monkeypatch.setattr("builtins.print", lambda value: printed.append(value))
+
+    module_loader(
+        "docker/purdue-af/jupyter/docker_healthcheck.py",
+        extra_modules={"pathlib": pathlib_stub, "requests": requests_stub},
+    )
+
+    assert captured == {"url": "https://af.example/api", "verify": False}
+    assert fake_response.raise_calls == 1
+    assert printed == [b"healthy"]
diff --git a/tests/unit/test_metrics_server.py b/tests/unit/test_metrics_server.py
new file mode 100644
index 00000000..2967aa7c
--- /dev/null
+++ b/tests/unit/test_metrics_server.py
@@ -0,0 +1,37 @@
+from __future__ import annotations
+
+from unittest.mock import mock_open
+
+
+def test_update_metrics_sets_event_rate_from_file(
+    monkeypatch, module_loader, prometheus_stub, recording_gauge_cls
+) -> None:
+    module = module_loader(
+        "apps/monitoring/af-monitoring/metrics_server.py",
+        extra_modules={"prometheus_client": prometheus_stub},
+    )
+    module.event_rate_per_worker = recording_gauge_cls()
+    monkeypatch.setattr("builtins.open", mock_open(read_data="42.5\n"))
+
+    module.update_metrics()
+
+    assert module.event_rate_per_worker.values == [42.5]
+
+
+def test_update_metrics_sets_zero_when_read_fails(
+    monkeypatch, module_loader, prometheus_stub, recording_gauge_cls
+) -> None:
+    module = module_loader(
+        "apps/monitoring/af-monitoring/metrics_server.py",
+        extra_modules={"prometheus_client": prometheus_stub},
+    )
+    module.event_rate_per_worker = recording_gauge_cls()
+
+    def _raise(*_args, **_kwargs):
+        raise OSError("not found")
+
+    monkeypatch.setattr("builtins.open", _raise)
+
+    module.update_metrics()
+
+    assert module.event_rate_per_worker.values == [0]
diff --git a/tests/unit/test_node_healthcheck.py b/tests/unit/test_node_healthcheck.py
new file mode 100644
index 00000000..caf58bc4
--- /dev/null
+++ b/tests/unit/test_node_healthcheck.py
@@ -0,0 +1,107 @@
+from __future__ import annotations
+
+import subprocess
+
+import pytest
+
+
+def test_check_if_directory_exists_reports_success_for_matching_checksum(
+    monkeypatch, module_loader, prometheus_stub
+) -> None:
+    module = module_loader(
+        "apps/monitoring/af-monitoring/node_healthcheck.py",
+        extra_modules={"prometheus_client": prometheus_stub},
+    )
+
+    class FakeProc:
+        returncode = 0
+
+        def __init__(self) -> None:
+            self.killed = False
+
+        def communicate(self, timeout=None):
+            return ("abc123  /tmp/validate.txt\n", "")
+
+        def kill(self) -> None:
+            self.killed = True
+
+    proc = FakeProc()
+    popen_calls = []
+
+    def _fake_popen(args, **kwargs):
+        popen_calls.append((args, kwargs))
+        return proc
+
+    times = iter([100.0, 100.2])
+    monkeypatch.setattr(module.time, "time", lambda: next(times))
+    monkeypatch.setattr(module.subprocess, "Popen", _fake_popen)
+
+    valid, elapsed_ms = module.check_if_directory_exists(("/tmp/validate.txt", "abc123"))
+
+    assert valid is True
+    assert elapsed_ms == pytest.approx(200.0)
+    assert popen_calls[0][0] == ["/usr/bin/md5sum", "/tmp/validate.txt"]
+
+
+def test_check_if_directory_exists_returns_timeout_result(
+    monkeypatch, module_loader, prometheus_stub
+) -> None:
+    module = module_loader(
+        "apps/monitoring/af-monitoring/node_healthcheck.py",
+        extra_modules={"prometheus_client": prometheus_stub},
+    )
+
+    class FakeProc:
+        returncode = 0
+
+        def __init__(self) -> None:
+            self.killed = False
+            self.calls = 0
+
+        def communicate(self, timeout=None):
+            self.calls += 1
+            if self.calls == 1:
+                raise subprocess.TimeoutExpired(cmd="md5sum", timeout=timeout)
+            return ("", "")
+
+        def kill(self) -> None:
+            self.killed = True
+
+    proc = FakeProc()
+    monkeypatch.setattr(module.subprocess, "Popen", lambda *_args, **_kwargs: proc)
+
+    valid, elapsed_ms = module.check_if_directory_exists(("/tmp/validate.txt", "abc123"))
+
+    assert valid is False
+    assert elapsed_ms == 3000
+    assert proc.killed is True
+
+
+def test_update_metrics_writes_mount_health_and_ping(
+    monkeypatch, module_loader, prometheus_stub, recording_gauge_cls
+) -> None:
+    module = module_loader(
+        "apps/monitoring/af-monitoring/node_healthcheck.py",
+        extra_modules={"prometheus_client": prometheus_stub},
+    )
+    module.mount_valid = recording_gauge_cls()
+    module.mount_ping_ms = recording_gauge_cls()
+    module.mounts = {
+        "mount-a": ("/mnt/a", "sum-a"),
+        "mount-b": ("/mnt/b", "sum-b"),
+    }
+    responses = iter([(True, 12.5), (False, 22.5)])
+    monkeypatch.setattr(
+        module,
+        "check_if_directory_exists",
+        lambda _path_tuple: next(responses),
+    )
+
+    module.update_metrics()
+
+    key_a = (("mount_name", "mount-a"), ("mount_path", "/mnt/a"))
+    key_b = (("mount_name", "mount-b"), ("mount_path", "/mnt/b"))
+    assert module.mount_valid.label_children[key_a].values == [1]
+    assert module.mount_valid.label_children[key_b].values == [0]
+    assert module.mount_ping_ms.label_children[key_a].values == [12.5]
+    assert module.mount_ping_ms.label_children[key_b].values == [22.5]
diff --git a/tests/unit/test_pod_metrics_exporter.py b/tests/unit/test_pod_metrics_exporter.py
new file mode 100644
index 00000000..7ac617fd
--- /dev/null
+++ b/tests/unit/test_pod_metrics_exporter.py
@@ -0,0 +1,87 @@
+from __future__ import annotations
+
+import glob
+import os
+from types import SimpleNamespace
+
+
+def _load_exporter(monkeypatch, module_loader, prometheus_stub):
+    monkeypatch.setattr(os, "listdir", lambda _path: ["jovyan", "slurm", "alice"])
+    monkeypatch.setattr(glob, "glob", lambda _pattern: ["/home/alice"])
+    return module_loader(
+        "docker/af-pod-monitor/pod-metrics-exporter.py",
+        extra_modules={"prometheus_client": prometheus_stub},
+    )
+
+
+def test_module_initializes_directories_from_non_skipped_user(
+    monkeypatch, module_loader, prometheus_stub
+) -> None:
+    module = _load_exporter(monkeypatch, module_loader, prometheus_stub)
+
+    assert module.username == "alice"
+    assert module.directories == {
+        "home": "/home/alice",
+        "work": "/work/users/alice/",
+    }
+
+
+def test_update_metrics_work_branch_sets_usage_and_access_time(
+    monkeypatch, module_loader, prometheus_stub, recording_gauge_cls
+) -> None:
+    module = _load_exporter(monkeypatch, module_loader, prometheus_stub)
+    module.metrics = {
+        "work_dir_used": recording_gauge_cls(),
+        "work_dir_size": recording_gauge_cls(),
+        "work_dir_util": recording_gauge_cls(),
+        "work_dir_last_accessed": recording_gauge_cls(),
+    }
+    module.dl = "work"
+    monkeypatch.setattr(
+        module.subprocess,
+        "check_output",
+        lambda *_args, **_kwargs: b"2048 /work/users/alice/\n",
+    )
+    monkeypatch.setattr(
+        module.os,
+        "stat",
+        lambda _directory: SimpleNamespace(st_atime=1700000000.0),
+    )
+
+    module.update_metrics("work")
+
+    assert module.metrics["work_dir_used"].values == [2048]
+    assert module.metrics["work_dir_size"].values == [104857600]
+    assert module.metrics["work_dir_util"].values == [2048 / 104857600]
+    assert module.metrics["work_dir_last_accessed"].values == [1700000000.0]
+
+
+def test_update_metrics_home_branch_parses_df_and_ignores_stat_errors(
+    monkeypatch, module_loader, prometheus_stub, recording_gauge_cls
+) -> None:
+    module = _load_exporter(monkeypatch, module_loader, prometheus_stub)
+    module.metrics = {
+        "home_dir_used": recording_gauge_cls(),
+        "home_dir_size": recording_gauge_cls(),
+        "home_dir_util": recording_gauge_cls(),
+        "home_dir_last_accessed": recording_gauge_cls(),
+    }
+    module.dl = "home"
+
+    df_output = (
+        "Filesystem 1K-blocks Used Available Use% Mounted on\n"
+        "/dev/sda1 1000 250 750 25% /home\n"
+    ).encode("utf-8")
+    monkeypatch.setattr(module.subprocess, "check_output", lambda *_args, **_kwargs: df_output)
+
+    def _raise_stat(_directory):
+        raise OSError("stat unavailable")
+
+    monkeypatch.setattr(module.os, "stat", _raise_stat)
+
+    module.update_metrics("home")
+
+    assert module.metrics["home_dir_used"].values == [250]
+    assert module.metrics["home_dir_size"].values == [1000]
+    assert module.metrics["home_dir_util"].values == [0.25]
+    assert module.metrics["home_dir_last_accessed"].values == []

From 625737d60d825ea9f66d624cc5f9e55e3e9c907d Mon Sep 17 00:00:00 2001
From: Dmitry Kondratyev <kondratyev.d.95@gmail.com>
Date: Wed, 4 Feb 2026 11:16:41 -0500
Subject: [PATCH 10/25] ci(worker3): optimize advisory security and runtime
 workflow scope

---
 .github/workflows/ci-gitops-deployability.yml | 130 ++++++++++++--
 .github/workflows/ci-security-advisory.yml    | 166 ++++++++++++++++++
 .github/workflows/lint-docker.yml             |  22 ++-
 .../workflows/nightly-security-advisory.yml   |  78 +++++++-
 4 files changed, 375 insertions(+), 21 deletions(-)
 create mode 100644 .github/workflows/ci-security-advisory.yml

diff --git a/.github/workflows/ci-gitops-deployability.yml b/.github/workflows/ci-gitops-deployability.yml
index f3c5ae05..d83902b0 100644
--- a/.github/workflows/ci-gitops-deployability.yml
+++ b/.github/workflows/ci-gitops-deployability.yml
@@ -2,12 +2,63 @@ name: CI GitOps Deployability
 
 on:
   pull_request:
+    paths:
+      - 'deploy/**'
+      - '.github/workflows/ci-gitops-deployability.yml'
+
+concurrency:
+  group: ci-gitops-deployability-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
 
 permissions:
   contents: read
 
 jobs:
+  detect-gitops-changes:
+    runs-on: ubuntu-latest
+    outputs:
+      run_all: ${{ steps.scope.outputs.run_all }}
+      core_production: ${{ steps.filter.outputs.core_production }}
+      core_staging: ${{ steps.filter.outputs.core_staging }}
+      core_geddes2: ${{ steps.filter.outputs.core_geddes2 }}
+      experimental: ${{ steps.filter.outputs.experimental }}
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: dorny/paths-filter@v3
+        id: filter
+        with:
+          filters: |
+            core_production:
+              - 'deploy/core-production/**'
+            core_staging:
+              - 'deploy/core-staging/**'
+            core_geddes2:
+              - 'deploy/core-geddes2/**'
+            experimental:
+              - 'deploy/experimental/**'
+            deploy_shared:
+              - 'deploy/**'
+              - '!deploy/core-production/**'
+              - '!deploy/core-staging/**'
+              - '!deploy/core-geddes2/**'
+              - '!deploy/experimental/**'
+            workflow:
+              - '.github/workflows/ci-gitops-deployability.yml'
+
+      - id: scope
+        shell: bash
+        run: |
+          set -euo pipefail
+          if [ "${{ steps.filter.outputs.deploy_shared }}" = 'true' ] || [ "${{ steps.filter.outputs.workflow }}" = 'true' ]; then
+            echo "run_all=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "run_all=false" >> "$GITHUB_OUTPUT"
+          fi
+
   gitops-validate:
+    needs: detect-gitops-changes
+    if: needs.detect-gitops-changes.outputs.run_all == 'true' || needs.detect-gitops-changes.outputs.core_production == 'true' || needs.detect-gitops-changes.outputs.core_staging == 'true' || needs.detect-gitops-changes.outputs.core_geddes2 == 'true' || needs.detect-gitops-changes.outputs.experimental == 'true'
     runs-on: ubuntu-latest
     continue-on-error: true
     steps:
@@ -27,26 +78,71 @@ jobs:
           chmod +x /tmp/kubeconform
           sudo mv /tmp/kubeconform /usr/local/bin/kubeconform
 
-      - name: Render overlays with kustomize (advisory)
+      - name: Render and validate selected overlays (advisory)
+        shell: bash
+        env:
+          RUN_ALL: ${{ needs.detect-gitops-changes.outputs.run_all }}
+          CORE_PRODUCTION: ${{ needs.detect-gitops-changes.outputs.core_production }}
+          CORE_STAGING: ${{ needs.detect-gitops-changes.outputs.core_staging }}
+          CORE_GEDDES2: ${{ needs.detect-gitops-changes.outputs.core_geddes2 }}
+          EXPERIMENTAL: ${{ needs.detect-gitops-changes.outputs.experimental }}
         run: |
           set -euo pipefail
-          overlays=(
-            deploy/core-production
-            deploy/core-staging
-            deploy/core-geddes2
-            deploy/experimental
-          )
 
+          overlays=()
+          if [ "$RUN_ALL" = 'true' ]; then
+            overlays=(
+              deploy/core-production
+              deploy/core-staging
+              deploy/core-geddes2
+              deploy/experimental
+            )
+          else
+            [ "$CORE_PRODUCTION" = 'true' ] && overlays+=(deploy/core-production)
+            [ "$CORE_STAGING" = 'true' ] && overlays+=(deploy/core-staging)
+            [ "$CORE_GEDDES2" = 'true' ] && overlays+=(deploy/core-geddes2)
+            [ "$EXPERIMENTAL" = 'true' ] && overlays+=(deploy/experimental)
+          fi
+
+          if [ "${#overlays[@]}" -eq 0 ]; then
+            echo 'No in-scope overlay changes detected; skipping render/validation.'
+            {
+              echo '### GitOps Deployability Summary'
+              echo
+              echo '- No in-scope overlay changes detected.'
+            } >> "$GITHUB_STEP_SUMMARY"
+            exit 0
+          fi
+
+          {
+            echo '### GitOps Deployability Summary'
+            echo
+            echo '| Overlay | Render | Kubeconform |'
+            echo '|---|---|---|'
+          } >> "$GITHUB_STEP_SUMMARY"
+
+          status=0
           for overlay in "${overlays[@]}"; do
-            out="/tmp/$(echo "$overlay" | tr '/' '_').yaml"
-            echo "Rendering $overlay -> $out"
-            kustomize build --load-restrictor LoadRestrictionsNone "$overlay" > "$out"
-          done
+            rendered="/tmp/$(echo "$overlay" | tr '/' '_').yaml"
+            render_status='ok'
+            kubeconform_status='ok'
 
-      - name: Validate rendered manifests with kubeconform (advisory)
-        run: |
-          set -euo pipefail
-          for rendered in /tmp/deploy_core-production.yaml /tmp/deploy_core-staging.yaml /tmp/deploy_core-geddes2.yaml /tmp/deploy_experimental.yaml; do
-            echo "Validating $rendered"
-            kubeconform -summary -strict -ignore-missing-schemas -skip Secret "$rendered"
+            echo "Rendering $overlay -> $rendered"
+            if ! kustomize build --load-restrictor LoadRestrictionsNone "$overlay" > "$rendered"; then
+              render_status='failed'
+              kubeconform_status='skipped'
+              status=1
+            fi
+
+            if [ "$render_status" = 'ok' ]; then
+              echo "Validating $rendered"
+              if ! kubeconform -summary -strict -ignore-missing-schemas -skip Secret "$rendered"; then
+                kubeconform_status='failed'
+                status=1
+              fi
+            fi
+
+            echo "| \`$overlay\` | $render_status | $kubeconform_status |" >> "$GITHUB_STEP_SUMMARY"
           done
+
+          exit "$status"
diff --git a/.github/workflows/ci-security-advisory.yml b/.github/workflows/ci-security-advisory.yml
new file mode 100644
index 00000000..953a65c0
--- /dev/null
+++ b/.github/workflows/ci-security-advisory.yml
@@ -0,0 +1,166 @@
+name: CI Security Advisory
+
+on:
+  pull_request:
+    paths:
+      - 'deploy/**'
+      - 'docker/**'
+      - '.github/workflows/**'
+      - '**/requirements*.txt'
+      - '**/pyproject.toml'
+      - '**/poetry.lock'
+      - '**/Pipfile'
+      - '**/Pipfile.lock'
+      - '**/go.mod'
+      - '**/go.sum'
+  workflow_dispatch:
+
+concurrency:
+  group: ci-security-advisory-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read
+
+jobs:
+  detect-security-scope:
+    runs-on: ubuntu-latest
+    outputs:
+      vuln_surface: ${{ steps.filter.outputs.vuln_surface }}
+      config_surface: ${{ steps.filter.outputs.config_surface }}
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: dorny/paths-filter@v3
+        id: filter
+        with:
+          filters: |
+            vuln_surface:
+              - 'docker/**'
+              - '**/requirements*.txt'
+              - '**/pyproject.toml'
+              - '**/poetry.lock'
+              - '**/Pipfile'
+              - '**/Pipfile.lock'
+              - '**/go.mod'
+              - '**/go.sum'
+            config_surface:
+              - 'deploy/**'
+              - 'docker/**'
+              - '.github/workflows/**'
+
+  trivy-security-advisory:
+    needs: detect-security-scope
+    if: needs.detect-security-scope.outputs.vuln_surface == 'true' || needs.detect-security-scope.outputs.config_surface == 'true'
+    runs-on: ubuntu-latest
+    continue-on-error: true
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Run Trivy filesystem vulnerability scan (advisory)
+        if: needs.detect-security-scope.outputs.vuln_surface == 'true'
+        continue-on-error: true
+        uses: aquasecurity/trivy-action@0.33.1
+        with:
+          scan-type: fs
+          scan-ref: .
+          scanners: vuln
+          severity: HIGH,CRITICAL
+          ignore-unfixed: true
+          exit-code: '1'
+          format: json
+          output: trivy-pr-fs.json
+          skip-dirs: docker/dask-gateway-server,docs,docs/source/demos,docker/kaniko-build-jobs,slurm,.cursor
+
+      - name: Run Trivy configuration scan (advisory)
+        if: needs.detect-security-scope.outputs.config_surface == 'true'
+        continue-on-error: true
+        uses: aquasecurity/trivy-action@0.33.1
+        with:
+          scan-type: config
+          scan-ref: .
+          severity: HIGH,CRITICAL
+          exit-code: '1'
+          format: json
+          output: trivy-pr-config.json
+          skip-dirs: docker/dask-gateway-server,docs,docs/source/demos,docker/kaniko-build-jobs,slurm,.cursor
+
+      - name: Publish PR Trivy summary (advisory)
+        if: always()
+        shell: bash
+        run: |
+          set -euo pipefail
+          python3 - <<'PY'
+          import json
+          import os
+          from collections import Counter
+          from pathlib import Path
+
+          summary_path = Path(os.environ['GITHUB_STEP_SUMMARY'])
+          reports = (
+              ('Filesystem vulnerability scan', Path('trivy-pr-fs.json'), 'Vulnerabilities'),
+              ('Configuration misconfiguration scan', Path('trivy-pr-config.json'), 'Misconfigurations'),
+          )
+
+          total_high_critical = 0
+
+          with summary_path.open('a', encoding='utf-8') as summary:
+              summary.write('### PR Trivy Advisory Summary\n\n')
+
+              for label, report_path, finding_key in reports:
+                  if not report_path.exists():
+                      summary.write(f'- {label}: skipped (out of scope)\n')
+                      continue
+
+                  payload = json.loads(report_path.read_text(encoding='utf-8'))
+                  results = payload.get('Results', []) if isinstance(payload, dict) else payload
+
+                  severity_counts = Counter()
+                  target_counts = Counter()
+
+                  for result in results:
+                      target = result.get('Target', 'unknown-target')
+                      for finding in result.get(finding_key) or []:
+                          severity = (finding.get('Severity') or 'UNKNOWN').upper()
+                          severity_counts[severity] += 1
+                          target_counts[target] += 1
+
+                  high_critical = severity_counts.get('HIGH', 0) + severity_counts.get('CRITICAL', 0)
+                  total_high_critical += high_critical
+
+                  summary.write(f'\n#### {label}\n\n')
+                  summary.write(f'- HIGH/CRITICAL findings: **{high_critical}**\n')
+                  summary.write(f'- Targets with findings: **{len(target_counts)}**\n\n')
+
+                  if high_critical == 0:
+                      summary.write('No HIGH/CRITICAL findings detected.\n')
+                      continue
+
+                  summary.write('| Severity | Count |\n')
+                  summary.write('|---|---:|\n')
+                  for severity in ('CRITICAL', 'HIGH', 'MEDIUM', 'LOW', 'UNKNOWN'):
+                      count = severity_counts.get(severity, 0)
+                      if count:
+                          summary.write(f'| {severity} | {count} |\n')
+
+                  summary.write('\n| Top targets | Findings |\n')
+                  summary.write('|---|---:|\n')
+                  for target, count in target_counts.most_common(10):
+                      summary.write(f'| `{target}` | {count} |\n')
+
+          if total_high_critical > 0:
+              print(f'::warning::PR Trivy found {total_high_critical} HIGH/CRITICAL findings. See summary and artifacts.')
+          else:
+              print('::notice::PR Trivy found no HIGH/CRITICAL findings in scope.')
+          PY
+
+      - name: Upload PR Trivy artifacts
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: trivy-pr-security-${{ github.run_id }}
+          path: |
+            trivy-pr-fs.json
+            trivy-pr-config.json
+          if-no-files-found: ignore
+          retention-days: 7
diff --git a/.github/workflows/lint-docker.yml b/.github/workflows/lint-docker.yml
index ac83bec6..adcd8426 100644
--- a/.github/workflows/lint-docker.yml
+++ b/.github/workflows/lint-docker.yml
@@ -2,6 +2,17 @@ name: Container Reliability
 
 on:
   pull_request:
+    paths:
+      - 'docker/af-pod-monitor/**'
+      - 'docker/interlink-slurm-plugin/**'
+      - 'docker/purdue-af/**'
+      - 'slurm/**'
+      - '.github/scripts/container-smoke.sh'
+      - '.github/workflows/lint-docker.yml'
+
+concurrency:
+  group: lint-docker-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
 
 permissions:
   contents: read
@@ -10,6 +21,7 @@ jobs:
   detect-docker-changes:
     runs-on: ubuntu-latest
     outputs:
+      dockerfiles: ${{ steps.filter.outputs.dockerfiles }}
       af_pod_monitor: ${{ steps.filter.outputs.af_pod_monitor }}
       interlink_slurm_plugin: ${{ steps.filter.outputs.interlink_slurm_plugin }}
       purdue_af: ${{ steps.filter.outputs.purdue_af }}
@@ -20,22 +32,26 @@ jobs:
         id: filter
         with:
           filters: |
+            dockerfiles:
+              - 'docker/af-pod-monitor/Dockerfile'
+              - 'docker/interlink-slurm-plugin/Dockerfile.alma8'
+              - 'docker/purdue-af/Dockerfile'
+              - '.github/workflows/lint-docker.yml'
             af_pod_monitor:
               - 'docker/af-pod-monitor/**'
-              - '.github/workflows/lint-docker.yml'
               - '.github/scripts/container-smoke.sh'
             interlink_slurm_plugin:
               - 'docker/interlink-slurm-plugin/**'
               - 'slurm/**'
-              - '.github/workflows/lint-docker.yml'
               - '.github/scripts/container-smoke.sh'
             purdue_af:
               - 'docker/purdue-af/**'
               - 'slurm/**'
-              - '.github/workflows/lint-docker.yml'
               - '.github/scripts/container-smoke.sh'
 
   lint-dockerfiles:
+    needs: detect-docker-changes
+    if: needs.detect-docker-changes.outputs.dockerfiles == 'true'
     runs-on: ubuntu-latest
     continue-on-error: true
     steps:
diff --git a/.github/workflows/nightly-security-advisory.yml b/.github/workflows/nightly-security-advisory.yml
index 3204dc2b..9d7fa398 100644
--- a/.github/workflows/nightly-security-advisory.yml
+++ b/.github/workflows/nightly-security-advisory.yml
@@ -5,6 +5,10 @@ on:
     - cron: '17 5 * * *'
   workflow_dispatch:
 
+concurrency:
+  group: nightly-security-advisory-${{ github.ref }}
+  cancel-in-progress: true
+
 permissions:
   contents: read
 
@@ -20,8 +24,80 @@ jobs:
         with:
           scan-type: fs
           scan-ref: .
+          scanners: vuln
           severity: HIGH,CRITICAL
           ignore-unfixed: true
           exit-code: '1'
-          format: table
+          format: json
+          output: trivy-nightly-fs.json
           skip-dirs: docker/dask-gateway-server,docs,docs/source/demos,docker/kaniko-build-jobs,slurm,.cursor
+
+      - name: Publish nightly Trivy summary (advisory)
+        if: always()
+        shell: bash
+        run: |
+          set -euo pipefail
+          python3 - <<'PY'
+          import json
+          import os
+          from collections import Counter
+          from pathlib import Path
+
+          report_path = Path('trivy-nightly-fs.json')
+          summary_path = Path(os.environ['GITHUB_STEP_SUMMARY'])
+          title = 'Nightly Trivy Vulnerability Summary'
+
+          with summary_path.open('a', encoding='utf-8') as summary:
+              summary.write(f'### {title}\n\n')
+
+              if not report_path.exists():
+                  summary.write('- Trivy report was not generated.\n')
+                  print('::warning::Nightly Trivy report was not generated.')
+                  raise SystemExit(0)
+
+              payload = json.loads(report_path.read_text(encoding='utf-8'))
+              results = payload.get('Results', []) if isinstance(payload, dict) else payload
+
+              severity_counts = Counter()
+              target_counts = Counter()
+
+              for result in results:
+                  target = result.get('Target', 'unknown-target')
+                  for vuln in result.get('Vulnerabilities') or []:
+                      severity = (vuln.get('Severity') or 'UNKNOWN').upper()
+                      severity_counts[severity] += 1
+                      target_counts[target] += 1
+
+              high_critical = severity_counts.get('HIGH', 0) + severity_counts.get('CRITICAL', 0)
+
+              summary.write(f'- HIGH/CRITICAL findings: **{high_critical}**\n')
+              summary.write(f'- Targets with findings: **{len(target_counts)}**\n\n')
+
+              if high_critical == 0:
+                  summary.write('No HIGH/CRITICAL vulnerabilities found in scope.\n')
+                  print('::notice::Nightly Trivy found no HIGH/CRITICAL vulnerabilities.')
+                  raise SystemExit(0)
+
+              summary.write('| Severity | Count |\n')
+              summary.write('|---|---:|\n')
+              for severity in ('CRITICAL', 'HIGH', 'MEDIUM', 'LOW', 'UNKNOWN'):
+                  count = severity_counts.get(severity, 0)
+                  if count:
+                      summary.write(f'| {severity} | {count} |\n')
+
+              summary.write('\n| Top targets | Findings |\n')
+              summary.write('|---|---:|\n')
+              for target, count in target_counts.most_common(10):
+                  summary.write(f'| `{target}` | {count} |\n')
+
+          print(f'::warning::Nightly Trivy found {high_critical} HIGH/CRITICAL vulnerabilities. See summary and artifact.')
+          PY
+
+      - name: Upload nightly Trivy artifact
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: trivy-nightly-fs-${{ github.run_id }}
+          path: trivy-nightly-fs.json
+          if-no-files-found: ignore
+          retention-days: 14

From dd78608ef3cdedeb9f17d918d81e9a97affc549d Mon Sep 17 00:00:00 2001
From: Dmitry Kondratyev <kondratyev.d.95@gmail.com>
Date: Wed, 4 Feb 2026 11:19:30 -0500
Subject: [PATCH 11/25] docs(ci): align CI plan with active workflow surface

---
 .codex/CI_PLAN.md | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/.codex/CI_PLAN.md b/.codex/CI_PLAN.md
index 584661bd..faf1d8f2 100644
--- a/.codex/CI_PLAN.md
+++ b/.codex/CI_PLAN.md
@@ -43,8 +43,10 @@ Approved exception:
 - `.github/workflows/lint-json.yml`
 - `.github/workflows/lint-yaml.yml`
 - `.github/workflows/ci-repo-quality.yml`
+- `.github/workflows/ci-integration-scenarios.yml`
 - `.github/workflows/lint-docker.yml`
 - `.github/workflows/ci-gitops-deployability.yml`
+- `.github/workflows/ci-security-advisory.yml`
 - `.github/workflows/nightly-security-advisory.yml`
 
 ## Check Architecture
@@ -54,8 +56,8 @@ Approved exception:
 - Risk: broken workflow definitions and silent CI drift.
 
 ### B) Repo Quality and Tests (advisory)
-- Workflows: `lint-python.yml`, `lint-shell.yml`, `lint-json.yml`, `lint-yaml.yml`, `ci-repo-quality.yml`
-- Checks: black/isort check-only, py_compile, pytest advisory, shellcheck/shfmt/bash -n, JSON/YAML parse.
+- Workflows: `lint-python.yml`, `lint-shell.yml`, `lint-json.yml`, `lint-yaml.yml`, `ci-repo-quality.yml`, `ci-integration-scenarios.yml`
+- Checks: black/isort check-only, py_compile, pytest unit advisory with coverage threshold, shellcheck/shfmt/bash -n, JSON/YAML parse, integration scenario matrix tests via mocked container/monitoring flows.
 - Risk: script/runtime regressions.
 
 ### C) Container Reliability (advisory)
@@ -69,8 +71,8 @@ Approved exception:
 - Risk: Flux reconciliation failures from invalid manifests.
 
 ### E) Security Posture (advisory)
-- Workflow: `nightly-security-advisory.yml`
-- Checks: nightly Trivy filesystem scan.
+- Workflows: `nightly-security-advisory.yml`, `ci-security-advisory.yml`
+- Checks: nightly Trivy filesystem scan plus PR-time advisory Trivy vulnerability/config scans with run summaries and artifacts.
 - Risk: security drift in dependencies/configuration.
 
 ## Optimization Workstreams (Current)

From 52dab13425152b4172ba6328803ed1010fea1f29 Mon Sep 17 00:00:00 2001
From: Dmitry Kondratyev <kondratyev.d.95@gmail.com>
Date: Wed, 4 Feb 2026 11:22:37 -0500
Subject: [PATCH 12/25] ci(docker): enable gha cache for advisory image builds

---
 .codex/CI_PLAN.md                 |  2 +-
 .github/workflows/lint-docker.yml | 45 ++++++++++++++++++++++++++-----
 2 files changed, 40 insertions(+), 7 deletions(-)

diff --git a/.codex/CI_PLAN.md b/.codex/CI_PLAN.md
index faf1d8f2..a70e7d24 100644
--- a/.codex/CI_PLAN.md
+++ b/.codex/CI_PLAN.md
@@ -62,7 +62,7 @@ Approved exception:
 
 ### C) Container Reliability (advisory)
 - Workflow: `lint-docker.yml`
-- Checks: hadolint, targeted docker build jobs, smoke checks via `.github/scripts/container-smoke.sh`.
+- Checks: hadolint, targeted Docker Buildx jobs with GitHub Actions layer cache, smoke checks via `.github/scripts/container-smoke.sh`.
 - Risk: image build/runtime regressions.
 
 ### D) GitOps Deployability (advisory)
diff --git a/.github/workflows/lint-docker.yml b/.github/workflows/lint-docker.yml
index adcd8426..da7fe070 100644
--- a/.github/workflows/lint-docker.yml
+++ b/.github/workflows/lint-docker.yml
@@ -80,8 +80,19 @@ jobs:
     steps:
       - uses: actions/checkout@v4
 
-      - name: Build af-pod-monitor image (advisory)
-        run: docker build -f docker/af-pod-monitor/Dockerfile -t local/af-pod-monitor:${{ github.sha }} docker/af-pod-monitor
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Build af-pod-monitor image with cache (advisory)
+        uses: docker/build-push-action@v6
+        with:
+          context: docker/af-pod-monitor
+          file: docker/af-pod-monitor/Dockerfile
+          load: true
+          tags: local/af-pod-monitor:${{ github.sha }}
+          cache-from: type=gha,scope=af-pod-monitor
+          cache-to: type=gha,mode=max,scope=af-pod-monitor,ignore-error=true
+          provenance: false
 
       - name: Smoke test af-pod-monitor image (advisory)
         run: .github/scripts/container-smoke.sh local/af-pod-monitor:${{ github.sha }} af-pod-monitor
@@ -94,8 +105,19 @@ jobs:
     steps:
       - uses: actions/checkout@v4
 
-      - name: Build interlink-slurm-plugin image (advisory)
-        run: docker build -f docker/interlink-slurm-plugin/Dockerfile.alma8 -t local/interlink-slurm-plugin:${{ github.sha }} .
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Build interlink-slurm-plugin image with cache (advisory)
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          file: docker/interlink-slurm-plugin/Dockerfile.alma8
+          load: true
+          tags: local/interlink-slurm-plugin:${{ github.sha }}
+          cache-from: type=gha,scope=interlink-slurm-plugin
+          cache-to: type=gha,mode=max,scope=interlink-slurm-plugin,ignore-error=true
+          provenance: false
 
       - name: Smoke test interlink-slurm-plugin image (advisory)
         run: .github/scripts/container-smoke.sh local/interlink-slurm-plugin:${{ github.sha }} interlink-slurm-plugin
@@ -108,8 +130,19 @@ jobs:
     steps:
       - uses: actions/checkout@v4
 
-      - name: Build purdue-af image (advisory)
-        run: docker build -f docker/purdue-af/Dockerfile -t local/purdue-af:${{ github.sha }} .
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Build purdue-af image with cache (advisory)
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          file: docker/purdue-af/Dockerfile
+          load: true
+          tags: local/purdue-af:${{ github.sha }}
+          cache-from: type=gha,scope=purdue-af
+          cache-to: type=gha,mode=max,scope=purdue-af,ignore-error=true
+          provenance: false
 
       - name: Smoke test purdue-af image (advisory)
         run: .github/scripts/container-smoke.sh local/purdue-af:${{ github.sha }} purdue-af

From fd4b57a1b1f647d4cd0e79311ea7acc329904925 Mon Sep 17 00:00:00 2001
From: Dmitry Kondratyev <kondratyev.d.95@gmail.com>
Date: Wed, 4 Feb 2026 11:26:49 -0500
Subject: [PATCH 13/25] ci: fix lint formatting and unit coverage reporting

---
 .../scripts/integration/mock-docker-cli.sh    | 68 +++++++++----------
 .github/workflows/ci-repo-quality.yml         | 19 +++---
 tests/conftest.py                             |  4 +-
 .../test_container_smoke_matrix.py            |  2 +-
 tests/unit/test_docker_healthcheck.py         |  4 +-
 tests/unit/test_node_healthcheck.py           |  8 ++-
 tests/unit/test_pod_metrics_exporter.py       |  4 +-
 7 files changed, 60 insertions(+), 49 deletions(-)

diff --git a/.github/scripts/integration/mock-docker-cli.sh b/.github/scripts/integration/mock-docker-cli.sh
index 04969cda..c20a19aa 100755
--- a/.github/scripts/integration/mock-docker-cli.sh
+++ b/.github/scripts/integration/mock-docker-cli.sh
@@ -2,44 +2,44 @@
 set -euo pipefail
 
 if [ -n "${MOCK_DOCKER_LOG:-}" ]; then
-  printf '%s\n' "$*" >> "$MOCK_DOCKER_LOG"
+	printf '%s\n' "$*" >>"$MOCK_DOCKER_LOG"
 fi
 
 cmd="${1:-}"
 shift || true
 
 case "$cmd" in
-  image)
-    subcmd="${1:-}"
-    shift || true
-    if [ "$subcmd" != "inspect" ]; then
-      echo "mock docker unsupported image subcommand: $subcmd" >&2
-      exit 64
-    fi
-
-    if [ -n "${MOCK_DOCKER_INSPECT_STDOUT:-}" ]; then
-      printf '%s\n' "$MOCK_DOCKER_INSPECT_STDOUT"
-    fi
-    if [ -n "${MOCK_DOCKER_INSPECT_STDERR:-}" ]; then
-      printf '%s\n' "$MOCK_DOCKER_INSPECT_STDERR" >&2
-    fi
-
-    exit "${MOCK_DOCKER_INSPECT_EXIT:-0}"
-    ;;
-
-  run)
-    if [ -n "${MOCK_DOCKER_RUN_STDOUT:-}" ]; then
-      printf '%s\n' "$MOCK_DOCKER_RUN_STDOUT"
-    fi
-    if [ -n "${MOCK_DOCKER_RUN_STDERR:-}" ]; then
-      printf '%s\n' "$MOCK_DOCKER_RUN_STDERR" >&2
-    fi
-
-    exit "${MOCK_DOCKER_RUN_EXIT:-0}"
-    ;;
-
-  *)
-    echo "mock docker unsupported command: $cmd" >&2
-    exit 64
-    ;;
+image)
+	subcmd="${1:-}"
+	shift || true
+	if [ "$subcmd" != "inspect" ]; then
+		echo "mock docker unsupported image subcommand: $subcmd" >&2
+		exit 64
+	fi
+
+	if [ -n "${MOCK_DOCKER_INSPECT_STDOUT:-}" ]; then
+		printf '%s\n' "$MOCK_DOCKER_INSPECT_STDOUT"
+	fi
+	if [ -n "${MOCK_DOCKER_INSPECT_STDERR:-}" ]; then
+		printf '%s\n' "$MOCK_DOCKER_INSPECT_STDERR" >&2
+	fi
+
+	exit "${MOCK_DOCKER_INSPECT_EXIT:-0}"
+	;;
+
+run)
+	if [ -n "${MOCK_DOCKER_RUN_STDOUT:-}" ]; then
+		printf '%s\n' "$MOCK_DOCKER_RUN_STDOUT"
+	fi
+	if [ -n "${MOCK_DOCKER_RUN_STDERR:-}" ]; then
+		printf '%s\n' "$MOCK_DOCKER_RUN_STDERR" >&2
+	fi
+
+	exit "${MOCK_DOCKER_RUN_EXIT:-0}"
+	;;
+
+*)
+	echo "mock docker unsupported command: $cmd" >&2
+	exit 64
+	;;
 esac
diff --git a/.github/workflows/ci-repo-quality.yml b/.github/workflows/ci-repo-quality.yml
index 2af6c26c..4dea93df 100644
--- a/.github/workflows/ci-repo-quality.yml
+++ b/.github/workflows/ci-repo-quality.yml
@@ -22,7 +22,7 @@ jobs:
         run: |
           set -euo pipefail
           python -m pip install --upgrade pip
-          pip install pytest pytest-cov
+          pip install pytest coverage
 
       - name: Python syntax smoke (advisory)
         shell: bash
@@ -61,16 +61,17 @@ jobs:
         shell: bash
         run: |
           set +e
-          pytest -q tests/unit \
-            --cov=apps/monitoring/af-monitoring/metrics_server.py \
-            --cov=apps/monitoring/af-monitoring/node_healthcheck.py \
-            --cov=docker/af-pod-monitor/pod-metrics-exporter.py \
-            --cov=docker/purdue-af/jupyter/docker_healthcheck.py \
-            --cov-report=term-missing \
-            --cov-fail-under=70
+          python -m coverage run -m pytest -q tests/unit
           rc=$?
+          set -e
           if [ "$rc" -eq 5 ]; then
             echo 'pytest collected no tests; treating as informational.'
             exit 0
           fi
-          exit "$rc"
+          if [ "$rc" -ne 0 ]; then
+            exit "$rc"
+          fi
+          python -m coverage report \
+            --show-missing \
+            --fail-under=70 \
+            --include="apps/monitoring/af-monitoring/metrics_server.py,apps/monitoring/af-monitoring/node_healthcheck.py,docker/af-pod-monitor/pod-metrics-exporter.py,docker/purdue-af/jupyter/docker_healthcheck.py"
diff --git a/tests/conftest.py b/tests/conftest.py
index 2caf62bf..eac3d579 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -56,7 +56,9 @@ def labels(self, **_labels: str) -> "Gauge":
 def module_loader(monkeypatch: pytest.MonkeyPatch) -> Callable[..., object]:
     counter = 0
 
-    def _load(relative_path: str, *, extra_modules: dict[str, object] | None = None) -> object:
+    def _load(
+        relative_path: str, *, extra_modules: dict[str, object] | None = None
+    ) -> object:
         nonlocal counter
         counter += 1
         module_name = f"test_module_{counter}"
diff --git a/tests/integration/test_container_smoke_matrix.py b/tests/integration/test_container_smoke_matrix.py
index b3215c6f..5bec78a4 100644
--- a/tests/integration/test_container_smoke_matrix.py
+++ b/tests/integration/test_container_smoke_matrix.py
@@ -22,7 +22,7 @@ def test_container_smoke_behavior_matrix(self) -> None:
                     temp_path = Path(temp_dir)
                     docker_wrapper = temp_path / "docker"
                     docker_wrapper.write_text(
-                        f"#!/usr/bin/env bash\nexec \"{MOCK_DOCKER_SCRIPT}\" \"$@\"\n",
+                        f'#!/usr/bin/env bash\nexec "{MOCK_DOCKER_SCRIPT}" "$@"\n',
                         encoding="utf-8",
                     )
                     docker_wrapper.chmod(0o755)
diff --git a/tests/unit/test_docker_healthcheck.py b/tests/unit/test_docker_healthcheck.py
index bbd7fd1b..64102674 100644
--- a/tests/unit/test_docker_healthcheck.py
+++ b/tests/unit/test_docker_healthcheck.py
@@ -23,7 +23,9 @@ def glob(self, _pattern: str):
         return iter([_FakeJsonFile(self.payload)])
 
 
-def test_healthcheck_queries_jupyter_api_and_prints_response(monkeypatch, module_loader) -> None:
+def test_healthcheck_queries_jupyter_api_and_prints_response(
+    monkeypatch, module_loader
+) -> None:
     captured = {}
     payload = json.dumps({"url": "https://af.example/"}).encode("utf-8")
 
diff --git a/tests/unit/test_node_healthcheck.py b/tests/unit/test_node_healthcheck.py
index caf58bc4..e1510b5f 100644
--- a/tests/unit/test_node_healthcheck.py
+++ b/tests/unit/test_node_healthcheck.py
@@ -36,7 +36,9 @@ def _fake_popen(args, **kwargs):
     monkeypatch.setattr(module.time, "time", lambda: next(times))
     monkeypatch.setattr(module.subprocess, "Popen", _fake_popen)
 
-    valid, elapsed_ms = module.check_if_directory_exists(("/tmp/validate.txt", "abc123"))
+    valid, elapsed_ms = module.check_if_directory_exists(
+        ("/tmp/validate.txt", "abc123")
+    )
 
     assert valid is True
     assert elapsed_ms == pytest.approx(200.0)
@@ -70,7 +72,9 @@ def kill(self) -> None:
     proc = FakeProc()
     monkeypatch.setattr(module.subprocess, "Popen", lambda *_args, **_kwargs: proc)
 
-    valid, elapsed_ms = module.check_if_directory_exists(("/tmp/validate.txt", "abc123"))
+    valid, elapsed_ms = module.check_if_directory_exists(
+        ("/tmp/validate.txt", "abc123")
+    )
 
     assert valid is False
     assert elapsed_ms == 3000
diff --git a/tests/unit/test_pod_metrics_exporter.py b/tests/unit/test_pod_metrics_exporter.py
index 7ac617fd..90d2353e 100644
--- a/tests/unit/test_pod_metrics_exporter.py
+++ b/tests/unit/test_pod_metrics_exporter.py
@@ -72,7 +72,9 @@ def test_update_metrics_home_branch_parses_df_and_ignores_stat_errors(
         "Filesystem 1K-blocks Used Available Use% Mounted on\n"
         "/dev/sda1 1000 250 750 25% /home\n"
     ).encode("utf-8")
-    monkeypatch.setattr(module.subprocess, "check_output", lambda *_args, **_kwargs: df_output)
+    monkeypatch.setattr(
+        module.subprocess, "check_output", lambda *_args, **_kwargs: df_output
+    )
 
     def _raise_stat(_directory):
         raise OSError("stat unavailable")

From a592a4cd7fa4150a4ce80435368e8f1149d1a79b Mon Sep 17 00:00:00 2001
From: Dmitry Kondratyev <kondratyev.d.95@gmail.com>
Date: Wed, 4 Feb 2026 11:33:54 -0500
Subject: [PATCH 14/25] ci: add formatter autofix workflow and resolve
 lint-python import order

---
 .codex/CI_PLAN.md                             |   6 +-
 .github/workflows/ci-format-autofix.yml       | 173 ++++++++++++++++++
 .../test_container_smoke_matrix.py            |   2 +-
 .../test_monitoring_metric_update.py          |   2 +-
 4 files changed, 179 insertions(+), 4 deletions(-)
 create mode 100644 .github/workflows/ci-format-autofix.yml

diff --git a/.codex/CI_PLAN.md b/.codex/CI_PLAN.md
index a70e7d24..dbef85fc 100644
--- a/.codex/CI_PLAN.md
+++ b/.codex/CI_PLAN.md
@@ -35,6 +35,7 @@ Out of scope:
 
 Approved exception:
 - `slurm/**` is used as a dependency-only trigger in container reliability path filters because maintained Dockerfiles copy `slurm/` artifacts.
+- CI auto-commit is enabled for formatter-only fixes in `ci-format-autofix.yml` to reduce lint iteration noise.
 
 ## Active Workflow Surface
 - `.github/workflows/ci-workflow-integrity.yml`
@@ -42,6 +43,7 @@ Approved exception:
 - `.github/workflows/lint-shell.yml`
 - `.github/workflows/lint-json.yml`
 - `.github/workflows/lint-yaml.yml`
+- `.github/workflows/ci-format-autofix.yml`
 - `.github/workflows/ci-repo-quality.yml`
 - `.github/workflows/ci-integration-scenarios.yml`
 - `.github/workflows/lint-docker.yml`
@@ -56,8 +58,8 @@ Approved exception:
 - Risk: broken workflow definitions and silent CI drift.
 
 ### B) Repo Quality and Tests (advisory)
-- Workflows: `lint-python.yml`, `lint-shell.yml`, `lint-json.yml`, `lint-yaml.yml`, `ci-repo-quality.yml`, `ci-integration-scenarios.yml`
-- Checks: black/isort check-only, py_compile, pytest unit advisory with coverage threshold, shellcheck/shfmt/bash -n, JSON/YAML parse, integration scenario matrix tests via mocked container/monitoring flows.
+- Workflows: `lint-python.yml`, `lint-shell.yml`, `lint-json.yml`, `lint-yaml.yml`, `ci-format-autofix.yml`, `ci-repo-quality.yml`, `ci-integration-scenarios.yml`
+- Checks: black/isort check-only, py_compile, pytest unit advisory with coverage threshold, shellcheck/shfmt/bash -n, JSON/YAML parse, auto-format commits for changed Python/shell/JSON/YAML files, integration scenario matrix tests via mocked container/monitoring flows.
 - Risk: script/runtime regressions.
 
 ### C) Container Reliability (advisory)
diff --git a/.github/workflows/ci-format-autofix.yml b/.github/workflows/ci-format-autofix.yml
new file mode 100644
index 00000000..9a97a35c
--- /dev/null
+++ b/.github/workflows/ci-format-autofix.yml
@@ -0,0 +1,173 @@
+name: CI Format Autofix
+
+on:
+  pull_request:
+    types: [opened, synchronize, reopened]
+
+concurrency:
+  group: ci-format-autofix-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
+permissions:
+  contents: write
+
+jobs:
+  autofix-format:
+    if: github.event.pull_request.head.repo.full_name == github.repository
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ github.head_ref }}
+          fetch-depth: 0
+
+      - name: Detect changed Python files
+        id: py_changes
+        uses: tj-actions/changed-files@v45
+        with:
+          separator: "\n"
+          files: |
+            **/*.py
+          files_ignore: |
+            docker/dask-gateway-server/**
+            docs/**
+            docs/source/demos/**
+            docker/kaniko-build-jobs/**
+            slurm/**
+            .cursor/**
+            .git/**
+
+      - name: Detect changed shell files
+        id: sh_changes
+        uses: tj-actions/changed-files@v45
+        with:
+          separator: "\n"
+          files: |
+            **/*.sh
+            **/pixi-wrapper
+            **/fix-permissions
+          files_ignore: |
+            docker/dask-gateway-server/**
+            docs/**
+            docs/source/demos/**
+            docker/kaniko-build-jobs/**
+            slurm/**
+            .cursor/**
+            .git/**
+
+      - name: Detect changed JSON/YAML files
+        id: data_changes
+        uses: tj-actions/changed-files@v45
+        with:
+          separator: "\n"
+          files: |
+            **/*.json
+            **/*.yml
+            **/*.yaml
+          files_ignore: |
+            docker/dask-gateway-server/**
+            docs/**
+            docs/source/demos/**
+            docker/kaniko-build-jobs/**
+            slurm/**
+            .cursor/**
+            .git/**
+            .github/workflows/**
+
+      - name: Set up Python
+        if: steps.py_changes.outputs.any_changed == 'true'
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Auto-format Python files
+        if: steps.py_changes.outputs.any_changed == 'true'
+        shell: bash
+        run: |
+          set -euo pipefail
+          python -m pip install --upgrade pip
+          pip install black isort
+
+          mapfile -t py_files <<'EOF'
+          ${{ steps.py_changes.outputs.all_changed_files }}
+          EOF
+
+          files=()
+          for f in "${py_files[@]}"; do
+            [ -f "$f" ] && files+=("$f")
+          done
+
+          if [ "${#files[@]}" -gt 0 ]; then
+            black "${files[@]}"
+            isort --profile black "${files[@]}"
+          fi
+
+      - name: Install shell formatter
+        if: steps.sh_changes.outputs.any_changed == 'true'
+        shell: bash
+        run: |
+          set -euo pipefail
+          SHFMT_VERSION=3.10.0
+          curl -fsSL "https://github.com/mvdan/sh/releases/download/v${SHFMT_VERSION}/shfmt_v${SHFMT_VERSION}_linux_amd64" -o /tmp/shfmt
+          chmod +x /tmp/shfmt
+          sudo mv /tmp/shfmt /usr/local/bin/shfmt
+
+      - name: Auto-format shell files
+        if: steps.sh_changes.outputs.any_changed == 'true'
+        shell: bash
+        run: |
+          set -euo pipefail
+
+          mapfile -t sh_files <<'EOF'
+          ${{ steps.sh_changes.outputs.all_changed_files }}
+          EOF
+
+          files=()
+          for f in "${sh_files[@]}"; do
+            [ -f "$f" ] && files+=("$f")
+          done
+
+          if [ "${#files[@]}" -gt 0 ]; then
+            shfmt -w "${files[@]}"
+          fi
+
+      - name: Set up Node.js
+        if: steps.data_changes.outputs.any_changed == 'true'
+        uses: actions/setup-node@v4
+        with:
+          node-version: '20'
+
+      - name: Auto-format JSON/YAML files
+        if: steps.data_changes.outputs.any_changed == 'true'
+        shell: bash
+        run: |
+          set -euo pipefail
+          npm install --global prettier
+
+          mapfile -t data_files <<'EOF'
+          ${{ steps.data_changes.outputs.all_changed_files }}
+          EOF
+
+          files=()
+          for f in "${data_files[@]}"; do
+            [ -f "$f" ] && files+=("$f")
+          done
+
+          if [ "${#files[@]}" -gt 0 ]; then
+            prettier --write "${files[@]}"
+          fi
+
+      - name: Commit and push formatting fixes
+        shell: bash
+        run: |
+          set -euo pipefail
+          if git diff --quiet; then
+            echo "No formatter changes to commit."
+            exit 0
+          fi
+
+          git config user.name "github-actions[bot]"
+          git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
+          git add -A
+          git commit -m "ci: auto-format fixable lint issues"
+          git push origin "HEAD:${{ github.head_ref }}"
diff --git a/tests/integration/test_container_smoke_matrix.py b/tests/integration/test_container_smoke_matrix.py
index 5bec78a4..1ed3a879 100644
--- a/tests/integration/test_container_smoke_matrix.py
+++ b/tests/integration/test_container_smoke_matrix.py
@@ -1,10 +1,10 @@
 from __future__ import annotations
 
 import os
-from pathlib import Path
 import subprocess
 import tempfile
 import unittest
+from pathlib import Path
 
 from common import REPO_ROOT, load_json_fixture
 
diff --git a/tests/integration/test_monitoring_metric_update.py b/tests/integration/test_monitoring_metric_update.py
index 05b3a6fe..a647ffcb 100644
--- a/tests/integration/test_monitoring_metric_update.py
+++ b/tests/integration/test_monitoring_metric_update.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
-from pathlib import Path
 import unittest
+from pathlib import Path
 from unittest import mock
 from uuid import uuid4
 

From ef3821bd0d9326e2ba8d5ba292ba899fabd6cb5a Mon Sep 17 00:00:00 2001
From: Dmitry Kondratyev <kondratyev.d.95@gmail.com>
Date: Wed, 4 Feb 2026 11:35:35 -0500
Subject: [PATCH 15/25] ci: harden autofix workflow push ref handling

---
 .github/workflows/ci-format-autofix.yml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/ci-format-autofix.yml b/.github/workflows/ci-format-autofix.yml
index 9a97a35c..f97b9fee 100644
--- a/.github/workflows/ci-format-autofix.yml
+++ b/.github/workflows/ci-format-autofix.yml
@@ -158,6 +158,8 @@ jobs:
           fi
 
       - name: Commit and push formatting fixes
+        env:
+          PR_HEAD_REF: ${{ github.head_ref }}
         shell: bash
         run: |
           set -euo pipefail
@@ -170,4 +172,4 @@ jobs:
           git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
           git add -A
           git commit -m "ci: auto-format fixable lint issues"
-          git push origin "HEAD:${{ github.head_ref }}"
+          git push origin "HEAD:${PR_HEAD_REF}"

From 15fad76b3882ad630476284cdccb46796a4661d8 Mon Sep 17 00:00:00 2001
From: Dmitry Kondratyev <kondratyev.d.95@gmail.com>
Date: Wed, 4 Feb 2026 11:48:47 -0500
Subject: [PATCH 16/25] docs: modernize CI badges with status and policy signal

---
 README.md | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 3e7ec8f2..a063fd08 100644
--- a/README.md
+++ b/README.md
@@ -18,8 +18,17 @@ Each user is provided with a 25GB home directory at first login. These directori
 
 [![Documentation Status](https://readthedocs.org/projects/purdue-af/badge/?version=latest)](https://purdue-af.readthedocs.io/en/latest/?badge=latest)
 
-[![Workflow Integrity](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-workflow-integrity.yml/badge.svg)](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-workflow-integrity.yml)
-[![Repo Quality](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-repo-quality.yml/badge.svg)](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-repo-quality.yml)
-[![Container Reliability](https://github.com/PurdueAF/purdue-af/actions/workflows/lint-docker.yml/badge.svg)](https://github.com/PurdueAF/purdue-af/actions/workflows/lint-docker.yml)
-[![GitOps Deployability](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-gitops-deployability.yml/badge.svg)](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-gitops-deployability.yml)
-[![Nightly Security](https://github.com/PurdueAF/purdue-af/actions/workflows/nightly-security-advisory.yml/badge.svg)](https://github.com/PurdueAF/purdue-af/actions/workflows/nightly-security-advisory.yml)
+### Runtime Status
+
+[![Workflow Integrity](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-workflow-integrity.yml/badge.svg?branch=main)](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-workflow-integrity.yml)
+[![Repo Quality](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-repo-quality.yml/badge.svg?branch=main)](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-repo-quality.yml)
+[![Container Reliability](https://github.com/PurdueAF/purdue-af/actions/workflows/lint-docker.yml/badge.svg?branch=main)](https://github.com/PurdueAF/purdue-af/actions/workflows/lint-docker.yml)
+[![GitOps Deployability](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-gitops-deployability.yml/badge.svg?branch=main)](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-gitops-deployability.yml)
+[![Nightly Security](https://github.com/PurdueAF/purdue-af/actions/workflows/nightly-security-advisory.yml/badge.svg?branch=main)](https://github.com/PurdueAF/purdue-af/actions/workflows/nightly-security-advisory.yml)
+
+### Policy Badges
+
+[![Coverage Gate](https://img.shields.io/badge/Coverage%20Gate-%3E%3D70%25%20%28advisory%29-4c1)](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-repo-quality.yml)
+[![Security Scans](https://img.shields.io/badge/Security%20Scans-PR%20%2B%20Nightly-0366d6)](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-security-advisory.yml)
+[![Validation Mode](https://img.shields.io/badge/Validation%20Mode-Advisory--first-f59e0b)](https://github.com/PurdueAF/purdue-af/actions)
+[![Autofix](https://img.shields.io/badge/Autofix-Python%2FShell%2FJSON%2FYAML-7c3aed)](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-format-autofix.yml)

From 6304b85c26a5edd2dc8da93095e4ee44a3c88463 Mon Sep 17 00:00:00 2001
From: Dmitry Kondratyev <kondratyev.d.95@gmail.com>
Date: Wed, 4 Feb 2026 11:49:39 -0500
Subject: [PATCH 17/25] ci: trim duplicate checks and path-scope fast lint
 workflows

---
 .github/workflows/ci-repo-quality.yml       | 45 ++++++---------------
 .github/workflows/ci-workflow-integrity.yml |  6 +++
 .github/workflows/lint-json.yml             | 13 ++++++
 .github/workflows/lint-python.yml           | 13 ++++++
 .github/workflows/lint-shell.yml            | 15 +++++++
 .github/workflows/lint-yaml.yml             | 16 ++++++++
 6 files changed, 75 insertions(+), 33 deletions(-)

diff --git a/.github/workflows/ci-repo-quality.yml b/.github/workflows/ci-repo-quality.yml
index 4dea93df..06a96532 100644
--- a/.github/workflows/ci-repo-quality.yml
+++ b/.github/workflows/ci-repo-quality.yml
@@ -2,6 +2,18 @@ name: CI Repo Quality
 
 on:
   pull_request:
+    paths:
+      - 'tests/unit/**'
+      - 'tests/conftest.py'
+      - 'apps/monitoring/af-monitoring/metrics_server.py'
+      - 'apps/monitoring/af-monitoring/node_healthcheck.py'
+      - 'docker/af-pod-monitor/pod-metrics-exporter.py'
+      - 'docker/purdue-af/jupyter/docker_healthcheck.py'
+      - '.github/workflows/ci-repo-quality.yml'
+
+concurrency:
+  group: ci-repo-quality-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
 
 permissions:
   contents: read
@@ -24,39 +36,6 @@ jobs:
           python -m pip install --upgrade pip
           pip install pytest coverage
 
-      - name: Python syntax smoke (advisory)
-        shell: bash
-        run: |
-          set -euo pipefail
-          mapfile -t py_files < <(find . -type f -name '*.py' \
-            -not -path './docker/dask-gateway-server/*' \
-            -not -path './docker/kaniko-build-jobs/*' \
-            -not -path './docs/*' \
-            -not -path './slurm/*' \
-            -not -path './.cursor/*' \
-            -not -path './.git/*' | sort)
-
-          if [ "${#py_files[@]}" -gt 0 ]; then
-            python -m py_compile "${py_files[@]}"
-          fi
-
-      - name: Shell syntax smoke (advisory)
-        shell: bash
-        run: |
-          set -euo pipefail
-          mapfile -t sh_files < <(find . -type f \
-            \( -name '*.sh' -o -name 'pixi-wrapper' -o -name 'fix-permissions' \) \
-            -not -path './docker/dask-gateway-server/*' \
-            -not -path './docker/kaniko-build-jobs/*' \
-            -not -path './docs/*' \
-            -not -path './slurm/*' \
-            -not -path './.cursor/*' \
-            -not -path './.git/*' | sort)
-
-          for f in "${sh_files[@]}"; do
-            bash -n "$f"
-          done
-
       - name: Run pytest (advisory)
         shell: bash
         run: |
diff --git a/.github/workflows/ci-workflow-integrity.yml b/.github/workflows/ci-workflow-integrity.yml
index deacfa94..c3b30c66 100644
--- a/.github/workflows/ci-workflow-integrity.yml
+++ b/.github/workflows/ci-workflow-integrity.yml
@@ -2,6 +2,12 @@ name: CI Workflow Integrity
 
 on:
   pull_request:
+    paths:
+      - '.github/workflows/**'
+
+concurrency:
+  group: ci-workflow-integrity-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
 
 permissions:
   contents: read
diff --git a/.github/workflows/lint-json.yml b/.github/workflows/lint-json.yml
index d223cfe7..1c5d56f2 100644
--- a/.github/workflows/lint-json.yml
+++ b/.github/workflows/lint-json.yml
@@ -2,6 +2,19 @@ name: Lint JSON
 
 on:
   pull_request:
+    paths:
+      - '**/*.json'
+      - '.github/workflows/lint-json.yml'
+      - '!docker/dask-gateway-server/**'
+      - '!docker/kaniko-build-jobs/**'
+      - '!docs/**'
+      - '!slurm/**'
+      - '!.cursor/**'
+      - '!.git/**'
+
+concurrency:
+  group: lint-json-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
 
 permissions:
   contents: read
diff --git a/.github/workflows/lint-python.yml b/.github/workflows/lint-python.yml
index 80dd38aa..149a4c82 100644
--- a/.github/workflows/lint-python.yml
+++ b/.github/workflows/lint-python.yml
@@ -2,6 +2,19 @@ name: Lint Python
 
 on:
   pull_request:
+    paths:
+      - '**/*.py'
+      - '.github/workflows/lint-python.yml'
+      - '!docker/dask-gateway-server/**'
+      - '!docker/kaniko-build-jobs/**'
+      - '!docs/**'
+      - '!slurm/**'
+      - '!.cursor/**'
+      - '!.git/**'
+
+concurrency:
+  group: lint-python-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
 
 permissions:
   contents: read
diff --git a/.github/workflows/lint-shell.yml b/.github/workflows/lint-shell.yml
index d12762ed..030dcf60 100644
--- a/.github/workflows/lint-shell.yml
+++ b/.github/workflows/lint-shell.yml
@@ -2,6 +2,21 @@ name: Lint Shell Scripts
 
 on:
   pull_request:
+    paths:
+      - '**/*.sh'
+      - '**/pixi-wrapper'
+      - '**/fix-permissions'
+      - '.github/workflows/lint-shell.yml'
+      - '!docker/dask-gateway-server/**'
+      - '!docker/kaniko-build-jobs/**'
+      - '!docs/**'
+      - '!slurm/**'
+      - '!.cursor/**'
+      - '!.git/**'
+
+concurrency:
+  group: lint-shell-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
 
 permissions:
   contents: read
diff --git a/.github/workflows/lint-yaml.yml b/.github/workflows/lint-yaml.yml
index 6158c887..44edbd9b 100644
--- a/.github/workflows/lint-yaml.yml
+++ b/.github/workflows/lint-yaml.yml
@@ -2,6 +2,21 @@ name: Lint YAML
 
 on:
   pull_request:
+    paths:
+      - '**/*.yml'
+      - '**/*.yaml'
+      - '!docker/dask-gateway-server/**'
+      - '!docker/kaniko-build-jobs/**'
+      - '!docs/**'
+      - '!slurm/**'
+      - '!.cursor/**'
+      - '!.git/**'
+      - '!.github/workflows/**'
+      - '.github/workflows/lint-yaml.yml'
+
+concurrency:
+  group: lint-yaml-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
 
 permissions:
   contents: read
@@ -39,6 +54,7 @@ jobs:
               Path('slurm'),
               Path('.cursor'),
               Path('.git'),
+              Path('.github/workflows'),
           )
 
           filtered = []

From 237dc69183a4129c05480c784e33aa709ec4db08 Mon Sep 17 00:00:00 2001
From: Dmitry Kondratyev <kondratyev.d.95@gmail.com>
Date: Wed, 4 Feb 2026 11:51:34 -0500
Subject: [PATCH 18/25] ci: harden runtime and security workflow execution
 behavior

---
 .github/workflows/ci-gitops-deployability.yml | 52 +++++++++++
 .github/workflows/ci-security-advisory.yml    | 82 +++++++++++++++--
 .github/workflows/lint-docker.yml             | 89 +++++++++++++++++++
 .../workflows/nightly-security-advisory.yml   | 16 ++++
 4 files changed, 234 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/ci-gitops-deployability.yml b/.github/workflows/ci-gitops-deployability.yml
index d83902b0..f37b5a53 100644
--- a/.github/workflows/ci-gitops-deployability.yml
+++ b/.github/workflows/ci-gitops-deployability.yml
@@ -16,6 +16,7 @@ permissions:
 jobs:
   detect-gitops-changes:
     runs-on: ubuntu-latest
+    timeout-minutes: 5
     outputs:
       run_all: ${{ steps.scope.outputs.run_all }}
       core_production: ${{ steps.filter.outputs.core_production }}
@@ -56,10 +57,48 @@ jobs:
             echo "run_all=false" >> "$GITHUB_OUTPUT"
           fi
 
+      - name: Publish GitOps validation plan
+        if: always()
+        shell: bash
+        run: |
+          set -euo pipefail
+          run_all="${{ steps.scope.outputs.run_all }}"
+          {
+            echo '### GitOps Deployability Plan'
+            echo
+            echo "- Full overlay run: \`$run_all\`"
+            echo
+            echo '| Overlay | Decision |'
+            echo '|---|---|'
+            if [ "$run_all" = 'true' ] || [ "${{ steps.filter.outputs.core_production }}" = 'true' ]; then
+              echo '| `deploy/core-production` | run |'
+            else
+              echo '| `deploy/core-production` | skipped |'
+            fi
+            if [ "$run_all" = 'true' ] || [ "${{ steps.filter.outputs.core_staging }}" = 'true' ]; then
+              echo '| `deploy/core-staging` | run |'
+            else
+              echo '| `deploy/core-staging` | skipped |'
+            fi
+            if [ "$run_all" = 'true' ] || [ "${{ steps.filter.outputs.core_geddes2 }}" = 'true' ]; then
+              echo '| `deploy/core-geddes2` | run |'
+            else
+              echo '| `deploy/core-geddes2` | skipped |'
+            fi
+            if [ "$run_all" = 'true' ] || [ "${{ steps.filter.outputs.experimental }}" = 'true' ]; then
+              echo '| `deploy/experimental` | run |'
+            else
+              echo '| `deploy/experimental` | skipped |'
+            fi
+            echo
+            echo '- Mode: advisory (gitops-validate uses continue-on-error).'
+          } >> "$GITHUB_STEP_SUMMARY"
+
   gitops-validate:
     needs: detect-gitops-changes
     if: needs.detect-gitops-changes.outputs.run_all == 'true' || needs.detect-gitops-changes.outputs.core_production == 'true' || needs.detect-gitops-changes.outputs.core_staging == 'true' || needs.detect-gitops-changes.outputs.core_geddes2 == 'true' || needs.detect-gitops-changes.outputs.experimental == 'true'
     runs-on: ubuntu-latest
+    timeout-minutes: 25
     continue-on-error: true
     steps:
       - uses: actions/checkout@v4
@@ -110,6 +149,7 @@ jobs:
               echo '### GitOps Deployability Summary'
               echo
               echo '- No in-scope overlay changes detected.'
+              echo '- Mode: advisory (job continue-on-error=true).'
             } >> "$GITHUB_STEP_SUMMARY"
             exit 0
           fi
@@ -145,4 +185,16 @@ jobs:
             echo "| \`$overlay\` | $render_status | $kubeconform_status |" >> "$GITHUB_STEP_SUMMARY"
           done
 
+          if [ "$status" -eq 0 ]; then
+            overall_result='pass'
+          else
+            overall_result='issues-detected'
+          fi
+
+          {
+            echo
+            echo "- Overall result: **$overall_result**"
+            echo '- Mode: advisory (job continue-on-error=true).'
+          } >> "$GITHUB_STEP_SUMMARY"
+
           exit "$status"
diff --git a/.github/workflows/ci-security-advisory.yml b/.github/workflows/ci-security-advisory.yml
index 953a65c0..2aa7d146 100644
--- a/.github/workflows/ci-security-advisory.yml
+++ b/.github/workflows/ci-security-advisory.yml
@@ -25,6 +25,7 @@ permissions:
 jobs:
   detect-security-scope:
     runs-on: ubuntu-latest
+    timeout-minutes: 5
     outputs:
       vuln_surface: ${{ steps.filter.outputs.vuln_surface }}
       config_surface: ${{ steps.filter.outputs.config_surface }}
@@ -49,15 +50,41 @@ jobs:
               - 'docker/**'
               - '.github/workflows/**'
 
+      - name: Publish security scan plan
+        if: always()
+        shell: bash
+        run: |
+          set -euo pipefail
+          {
+            echo '### Security Advisory Scan Plan'
+            echo
+            echo '| Scan | Decision |'
+            echo '|---|---|'
+            if [ "${{ steps.filter.outputs.vuln_surface }}" = 'true' ]; then
+              echo '| Filesystem vulnerability scan | run |'
+            else
+              echo '| Filesystem vulnerability scan | skipped |'
+            fi
+            if [ "${{ steps.filter.outputs.config_surface }}" = 'true' ]; then
+              echo '| Configuration misconfiguration scan | run |'
+            else
+              echo '| Configuration misconfiguration scan | skipped |'
+            fi
+            echo
+            echo '- Workflow mode: advisory (scan job uses continue-on-error).'
+          } >> "$GITHUB_STEP_SUMMARY"
+
   trivy-security-advisory:
     needs: detect-security-scope
     if: needs.detect-security-scope.outputs.vuln_surface == 'true' || needs.detect-security-scope.outputs.config_surface == 'true'
     runs-on: ubuntu-latest
+    timeout-minutes: 30
     continue-on-error: true
     steps:
       - uses: actions/checkout@v4
 
       - name: Run Trivy filesystem vulnerability scan (advisory)
+        id: fs_scan
         if: needs.detect-security-scope.outputs.vuln_surface == 'true'
         continue-on-error: true
         uses: aquasecurity/trivy-action@0.33.1
@@ -73,6 +100,7 @@ jobs:
           skip-dirs: docker/dask-gateway-server,docs,docs/source/demos,docker/kaniko-build-jobs,slurm,.cursor
 
       - name: Run Trivy configuration scan (advisory)
+        id: config_scan
         if: needs.detect-security-scope.outputs.config_surface == 'true'
         continue-on-error: true
         uses: aquasecurity/trivy-action@0.33.1
@@ -88,6 +116,11 @@ jobs:
       - name: Publish PR Trivy summary (advisory)
         if: always()
         shell: bash
+        env:
+          VULN_SURFACE: ${{ needs.detect-security-scope.outputs.vuln_surface }}
+          CONFIG_SURFACE: ${{ needs.detect-security-scope.outputs.config_surface }}
+          FS_SCAN_OUTCOME: ${{ steps.fs_scan.outcome || 'skipped' }}
+          CONFIG_SCAN_OUTCOME: ${{ steps.config_scan.outcome || 'skipped' }}
         run: |
           set -euo pipefail
           python3 - <<'PY'
@@ -98,18 +131,44 @@ jobs:
 
           summary_path = Path(os.environ['GITHUB_STEP_SUMMARY'])
           reports = (
-              ('Filesystem vulnerability scan', Path('trivy-pr-fs.json'), 'Vulnerabilities'),
-              ('Configuration misconfiguration scan', Path('trivy-pr-config.json'), 'Misconfigurations'),
+              (
+                  'Filesystem vulnerability scan',
+                  Path('trivy-pr-fs.json'),
+                  'Vulnerabilities',
+                  os.environ.get('VULN_SURFACE', 'false') == 'true',
+                  os.environ.get('FS_SCAN_OUTCOME', 'skipped'),
+              ),
+              (
+                  'Configuration misconfiguration scan',
+                  Path('trivy-pr-config.json'),
+                  'Misconfigurations',
+                  os.environ.get('CONFIG_SURFACE', 'false') == 'true',
+                  os.environ.get('CONFIG_SCAN_OUTCOME', 'skipped'),
+              ),
           )
 
           total_high_critical = 0
+          missing_reports = 0
 
           with summary_path.open('a', encoding='utf-8') as summary:
               summary.write('### PR Trivy Advisory Summary\n\n')
+              summary.write('| Scan | Scope | Step outcome | Report |\n')
+              summary.write('|---|---|---|---|\n')
+
+              for label, report_path, _, in_scope, outcome in reports:
+                  scope_status = 'run' if in_scope else 'skipped'
+                  report_status = 'present' if report_path.exists() else 'missing'
+                  summary.write(f'| {label} | {scope_status} | `{outcome}` | {report_status} |\n')
+                  if in_scope and not report_path.exists():
+                      missing_reports += 1
+
+              for label, report_path, finding_key, in_scope, _ in reports:
+                  if not in_scope:
+                      summary.write(f'\n#### {label}\n\nSkipped by path scope.\n')
+                      continue
 
-              for label, report_path, finding_key in reports:
                   if not report_path.exists():
-                      summary.write(f'- {label}: skipped (out of scope)\n')
+                      summary.write(f'\n#### {label}\n\nReport missing (scan did not produce expected output).\n')
                       continue
 
                   payload = json.loads(report_path.read_text(encoding='utf-8'))
@@ -148,7 +207,20 @@ jobs:
                   for target, count in target_counts.most_common(10):
                       summary.write(f'| `{target}` | {count} |\n')
 
-          if total_high_critical > 0:
+              if missing_reports > 0:
+                  overall_result = 'report-missing'
+              elif total_high_critical > 0:
+                  overall_result = 'findings-detected'
+              else:
+                  overall_result = 'clear'
+
+              summary.write('\n')
+              summary.write(f'- Overall result: **{overall_result}**\n')
+              summary.write('- Mode: advisory (job continue-on-error=true).\n')
+
+          if missing_reports > 0:
+              print(f'::warning::PR Trivy missing report files: {missing_reports}. See summary and artifacts.')
+          elif total_high_critical > 0:
               print(f'::warning::PR Trivy found {total_high_critical} HIGH/CRITICAL findings. See summary and artifacts.')
           else:
               print('::notice::PR Trivy found no HIGH/CRITICAL findings in scope.')
diff --git a/.github/workflows/lint-docker.yml b/.github/workflows/lint-docker.yml
index da7fe070..37678f98 100644
--- a/.github/workflows/lint-docker.yml
+++ b/.github/workflows/lint-docker.yml
@@ -20,6 +20,7 @@ permissions:
 jobs:
   detect-docker-changes:
     runs-on: ubuntu-latest
+    timeout-minutes: 5
     outputs:
       dockerfiles: ${{ steps.filter.outputs.dockerfiles }}
       af_pod_monitor: ${{ steps.filter.outputs.af_pod_monitor }}
@@ -49,10 +50,45 @@ jobs:
               - 'slurm/**'
               - '.github/scripts/container-smoke.sh'
 
+      - name: Publish container reliability plan
+        if: always()
+        shell: bash
+        run: |
+          set -euo pipefail
+          {
+            echo '### Container Reliability Plan'
+            echo
+            echo '| Check | Decision |'
+            echo '|---|---|'
+            if [ "${{ steps.filter.outputs.dockerfiles }}" = 'true' ]; then
+              echo '| Dockerfile lint | run |'
+            else
+              echo '| Dockerfile lint | skipped |'
+            fi
+            if [ "${{ steps.filter.outputs.af_pod_monitor }}" = 'true' ]; then
+              echo '| af-pod-monitor build/smoke | run |'
+            else
+              echo '| af-pod-monitor build/smoke | skipped |'
+            fi
+            if [ "${{ steps.filter.outputs.interlink_slurm_plugin }}" = 'true' ]; then
+              echo '| interlink-slurm-plugin build/smoke | run |'
+            else
+              echo '| interlink-slurm-plugin build/smoke | skipped |'
+            fi
+            if [ "${{ steps.filter.outputs.purdue_af }}" = 'true' ]; then
+              echo '| purdue-af build/smoke | run |'
+            else
+              echo '| purdue-af build/smoke | skipped |'
+            fi
+            echo
+            echo '- Workflow mode: advisory (all jobs use continue-on-error).'
+          } >> "$GITHUB_STEP_SUMMARY"
+
   lint-dockerfiles:
     needs: detect-docker-changes
     if: needs.detect-docker-changes.outputs.dockerfiles == 'true'
     runs-on: ubuntu-latest
+    timeout-minutes: 12
     continue-on-error: true
     steps:
       - uses: actions/checkout@v4
@@ -66,16 +102,28 @@ jobs:
           sudo mv /tmp/hadolint /usr/local/bin/hadolint
 
       - name: Run hadolint (check-only, advisory)
+        id: hadolint
         run: |
           set -euo pipefail
           hadolint --ignore DL3041 --ignore DL3033 --failure-threshold warning docker/af-pod-monitor/Dockerfile
           hadolint --ignore DL3041 --ignore DL3033 --failure-threshold warning docker/interlink-slurm-plugin/Dockerfile.alma8
           hadolint --ignore DL3041 --ignore DL3033 --failure-threshold warning docker/purdue-af/Dockerfile
 
+      - name: Publish Dockerfile lint advisory summary
+        if: always()
+        run: |
+          {
+            echo '### Dockerfile Lint Advisory Summary'
+            echo
+            echo "- Hadolint outcome: `${{ steps.hadolint.outcome }}`"
+            echo '- Mode: advisory (job continue-on-error=true).'
+          } >> "$GITHUB_STEP_SUMMARY"
+
   build-af-pod-monitor:
     needs: detect-docker-changes
     if: needs.detect-docker-changes.outputs.af_pod_monitor == 'true'
     runs-on: ubuntu-latest
+    timeout-minutes: 35
     continue-on-error: true
     steps:
       - uses: actions/checkout@v4
@@ -84,6 +132,7 @@ jobs:
         uses: docker/setup-buildx-action@v3
 
       - name: Build af-pod-monitor image with cache (advisory)
+        id: build_image
         uses: docker/build-push-action@v6
         with:
           context: docker/af-pod-monitor
@@ -95,12 +144,25 @@ jobs:
           provenance: false
 
       - name: Smoke test af-pod-monitor image (advisory)
+        id: smoke_test
         run: .github/scripts/container-smoke.sh local/af-pod-monitor:${{ github.sha }} af-pod-monitor
 
+      - name: Publish af-pod-monitor advisory summary
+        if: always()
+        run: |
+          {
+            echo '### af-pod-monitor Container Advisory Summary'
+            echo
+            echo "- Build outcome: `${{ steps.build_image.outcome }}`"
+            echo "- Smoke test outcome: `${{ steps.smoke_test.outcome }}`"
+            echo '- Mode: advisory (job continue-on-error=true).'
+          } >> "$GITHUB_STEP_SUMMARY"
+
   build-interlink-slurm-plugin:
     needs: detect-docker-changes
     if: needs.detect-docker-changes.outputs.interlink_slurm_plugin == 'true'
     runs-on: ubuntu-latest
+    timeout-minutes: 35
     continue-on-error: true
     steps:
       - uses: actions/checkout@v4
@@ -109,6 +171,7 @@ jobs:
         uses: docker/setup-buildx-action@v3
 
       - name: Build interlink-slurm-plugin image with cache (advisory)
+        id: build_image
         uses: docker/build-push-action@v6
         with:
           context: .
@@ -120,12 +183,25 @@ jobs:
           provenance: false
 
       - name: Smoke test interlink-slurm-plugin image (advisory)
+        id: smoke_test
         run: .github/scripts/container-smoke.sh local/interlink-slurm-plugin:${{ github.sha }} interlink-slurm-plugin
 
+      - name: Publish interlink-slurm-plugin advisory summary
+        if: always()
+        run: |
+          {
+            echo '### interlink-slurm-plugin Container Advisory Summary'
+            echo
+            echo "- Build outcome: `${{ steps.build_image.outcome }}`"
+            echo "- Smoke test outcome: `${{ steps.smoke_test.outcome }}`"
+            echo '- Mode: advisory (job continue-on-error=true).'
+          } >> "$GITHUB_STEP_SUMMARY"
+
   build-purdue-af:
     needs: detect-docker-changes
     if: needs.detect-docker-changes.outputs.purdue_af == 'true'
     runs-on: ubuntu-latest
+    timeout-minutes: 35
     continue-on-error: true
     steps:
       - uses: actions/checkout@v4
@@ -134,6 +210,7 @@ jobs:
         uses: docker/setup-buildx-action@v3
 
       - name: Build purdue-af image with cache (advisory)
+        id: build_image
         uses: docker/build-push-action@v6
         with:
           context: .
@@ -145,4 +222,16 @@ jobs:
           provenance: false
 
       - name: Smoke test purdue-af image (advisory)
+        id: smoke_test
         run: .github/scripts/container-smoke.sh local/purdue-af:${{ github.sha }} purdue-af
+
+      - name: Publish purdue-af advisory summary
+        if: always()
+        run: |
+          {
+            echo '### purdue-af Container Advisory Summary'
+            echo
+            echo "- Build outcome: `${{ steps.build_image.outcome }}`"
+            echo "- Smoke test outcome: `${{ steps.smoke_test.outcome }}`"
+            echo '- Mode: advisory (job continue-on-error=true).'
+          } >> "$GITHUB_STEP_SUMMARY"
diff --git a/.github/workflows/nightly-security-advisory.yml b/.github/workflows/nightly-security-advisory.yml
index 9d7fa398..c61012e5 100644
--- a/.github/workflows/nightly-security-advisory.yml
+++ b/.github/workflows/nightly-security-advisory.yml
@@ -15,11 +15,13 @@ permissions:
 jobs:
   trivy-filesystem:
     runs-on: ubuntu-latest
+    timeout-minutes: 30
     continue-on-error: true
     steps:
       - uses: actions/checkout@v4
 
       - name: Run Trivy filesystem scan (advisory)
+        id: fs_scan
         uses: aquasecurity/trivy-action@0.33.1
         with:
           scan-type: fs
@@ -35,6 +37,8 @@ jobs:
       - name: Publish nightly Trivy summary (advisory)
         if: always()
         shell: bash
+        env:
+          FS_SCAN_OUTCOME: ${{ steps.fs_scan.outcome || 'unknown' }}
         run: |
           set -euo pipefail
           python3 - <<'PY'
@@ -46,12 +50,18 @@ jobs:
           report_path = Path('trivy-nightly-fs.json')
           summary_path = Path(os.environ['GITHUB_STEP_SUMMARY'])
           title = 'Nightly Trivy Vulnerability Summary'
+          scan_outcome = os.environ.get('FS_SCAN_OUTCOME', 'unknown')
 
           with summary_path.open('a', encoding='utf-8') as summary:
               summary.write(f'### {title}\n\n')
+              summary.write('| Scan | Step outcome | Report |\n')
+              summary.write('|---|---|---|\n')
+              summary.write(f'| Filesystem vulnerability scan | `{scan_outcome}` | {"present" if report_path.exists() else "missing"} |\n\n')
 
               if not report_path.exists():
                   summary.write('- Trivy report was not generated.\n')
+                  summary.write('- Overall result: **scan-step-failed**\n')
+                  summary.write('- Mode: advisory (job continue-on-error=true).\n')
                   print('::warning::Nightly Trivy report was not generated.')
                   raise SystemExit(0)
 
@@ -75,6 +85,8 @@ jobs:
 
               if high_critical == 0:
                   summary.write('No HIGH/CRITICAL vulnerabilities found in scope.\n')
+                  summary.write('- Overall result: **clear**\n')
+                  summary.write('- Mode: advisory (job continue-on-error=true).\n')
                   print('::notice::Nightly Trivy found no HIGH/CRITICAL vulnerabilities.')
                   raise SystemExit(0)
 
@@ -90,6 +102,10 @@ jobs:
               for target, count in target_counts.most_common(10):
                   summary.write(f'| `{target}` | {count} |\n')
 
+              summary.write('\n')
+              summary.write('- Overall result: **findings-detected**\n')
+              summary.write('- Mode: advisory (job continue-on-error=true).\n')
+
           print(f'::warning::Nightly Trivy found {high_critical} HIGH/CRITICAL vulnerabilities. See summary and artifact.')
           PY
 

From 5cb774d195199ef2a99fffbaae8b47e87aaad454 Mon Sep 17 00:00:00 2001
From: Dmitry Kondratyev <kondratyev.d.95@gmail.com>
Date: Wed, 4 Feb 2026 12:00:27 -0500
Subject: [PATCH 19/25] ci: path-scope and harden autofix workflow runtime
 behavior

---
 .github/workflows/ci-format-autofix.yml | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/.github/workflows/ci-format-autofix.yml b/.github/workflows/ci-format-autofix.yml
index f97b9fee..8902c6cb 100644
--- a/.github/workflows/ci-format-autofix.yml
+++ b/.github/workflows/ci-format-autofix.yml
@@ -3,6 +3,15 @@ name: CI Format Autofix
 on:
   pull_request:
     types: [opened, synchronize, reopened]
+    paths:
+      - '**/*.py'
+      - '**/*.sh'
+      - '**/*.json'
+      - '**/*.yml'
+      - '**/*.yaml'
+      - '**/pixi-wrapper'
+      - '**/fix-permissions'
+      - '.github/workflows/ci-format-autofix.yml'
 
 concurrency:
   group: ci-format-autofix-${{ github.event.pull_request.number || github.ref }}
@@ -15,6 +24,7 @@ jobs:
   autofix-format:
     if: github.event.pull_request.head.repo.full_name == github.repository
     runs-on: ubuntu-latest
+    timeout-minutes: 15
     steps:
       - uses: actions/checkout@v4
         with:

From 61f918baf22e92ed87c8801bbc557f0d589f3292 Mon Sep 17 00:00:00 2001
From: Dmitry Kondratyev <kondratyev.d.95@gmail.com>
Date: Wed, 4 Feb 2026 12:01:18 -0500
Subject: [PATCH 20/25] docs: elevate CI section with expanded runtime badges
 and CI profile table

---
 README.md | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/README.md b/README.md
index a063fd08..926e2e30 100644
--- a/README.md
+++ b/README.md
@@ -22,8 +22,11 @@ Each user is provided with a 25GB home directory at first login. These directori
 
 [![Workflow Integrity](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-workflow-integrity.yml/badge.svg?branch=main)](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-workflow-integrity.yml)
 [![Repo Quality](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-repo-quality.yml/badge.svg?branch=main)](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-repo-quality.yml)
+[![CI Format Autofix](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-format-autofix.yml/badge.svg?branch=main)](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-format-autofix.yml)
+[![CI Integration Scenarios](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-integration-scenarios.yml/badge.svg?branch=main)](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-integration-scenarios.yml)
 [![Container Reliability](https://github.com/PurdueAF/purdue-af/actions/workflows/lint-docker.yml/badge.svg?branch=main)](https://github.com/PurdueAF/purdue-af/actions/workflows/lint-docker.yml)
 [![GitOps Deployability](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-gitops-deployability.yml/badge.svg?branch=main)](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-gitops-deployability.yml)
+[![CI Security Advisory](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-security-advisory.yml/badge.svg?branch=main)](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-security-advisory.yml)
 [![Nightly Security](https://github.com/PurdueAF/purdue-af/actions/workflows/nightly-security-advisory.yml/badge.svg?branch=main)](https://github.com/PurdueAF/purdue-af/actions/workflows/nightly-security-advisory.yml)
 
 ### Policy Badges
@@ -32,3 +35,16 @@ Each user is provided with a 25GB home directory at first login. These directori
 [![Security Scans](https://img.shields.io/badge/Security%20Scans-PR%20%2B%20Nightly-0366d6)](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-security-advisory.yml)
 [![Validation Mode](https://img.shields.io/badge/Validation%20Mode-Advisory--first-f59e0b)](https://github.com/PurdueAF/purdue-af/actions)
 [![Autofix](https://img.shields.io/badge/Autofix-Python%2FShell%2FJSON%2FYAML-7c3aed)](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-format-autofix.yml)
+
+### CI Profile
+
+| Signal | Workflow | Trigger | Mode (advisory/blocking) | Notes |
+|---|---|---|---|---|
+| Workflow integrity | [CI Workflow Integrity](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-workflow-integrity.yml) | Pull request (`.github/workflows/**`) | advisory | Actionlint + workflow YAML parse |
+| Repo quality | [CI Repo Quality](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-repo-quality.yml) | Pull request (unit/runtime paths) | advisory | Unit tests + 70% coverage policy signal |
+| Format autofix | [CI Format Autofix](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-format-autofix.yml) | Pull request open/sync/reopen (format-targeted paths) | advisory | Auto-formats and pushes fix commits to PR branch |
+| Integration scenarios | [CI Integration Scenarios](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-integration-scenarios.yml) | Pull request (integration paths) | advisory | Scripted integration scenario run |
+| Container reliability | [Container Reliability](https://github.com/PurdueAF/purdue-af/actions/workflows/lint-docker.yml) | Pull request (container/slurm paths) | advisory | Hadolint + image build/smoke checks |
+| GitOps deployability | [CI GitOps Deployability](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-gitops-deployability.yml) | Pull request (`deploy/**`) | advisory | Kustomize render + kubeconform validation |
+| Security advisory (PR) | [CI Security Advisory](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-security-advisory.yml) | Pull request (security-relevant paths) + manual dispatch | advisory | Trivy vuln/config scans with summary + artifacts |
+| Security advisory (nightly) | [Nightly Security Advisory](https://github.com/PurdueAF/purdue-af/actions/workflows/nightly-security-advisory.yml) | Nightly schedule + manual dispatch | advisory | Trivy filesystem scan with nightly summary |

From dbe0c48f5873d306e1c2471cee6e3cbf9d514c3d Mon Sep 17 00:00:00 2001
From: Dmitry Kondratyev <kondratyev.d.95@gmail.com>
Date: Wed, 4 Feb 2026 12:02:52 -0500
Subject: [PATCH 21/25] ci: pin formatter and lint toolchain versions for
 deterministic runs

---
 .github/workflows/ci-format-autofix.yml | 32 ++++++++++++++++++++++---
 .github/workflows/ci-repo-quality.yml   | 11 ++++++++-
 .github/workflows/lint-python.yml       | 11 ++++++++-
 .github/workflows/lint-shell.yml        | 15 +++++++++---
 4 files changed, 61 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/ci-format-autofix.yml b/.github/workflows/ci-format-autofix.yml
index 8902c6cb..4ae301aa 100644
--- a/.github/workflows/ci-format-autofix.yml
+++ b/.github/workflows/ci-format-autofix.yml
@@ -25,6 +25,11 @@ jobs:
     if: github.event.pull_request.head.repo.full_name == github.repository
     runs-on: ubuntu-latest
     timeout-minutes: 15
+    env:
+      BLACK_VERSION: '24.10.0'
+      ISORT_VERSION: '5.13.2'
+      SHFMT_VERSION: '3.10.0'
+      PRETTIER_VERSION: '3.3.3'
     steps:
       - uses: actions/checkout@v4
         with:
@@ -96,7 +101,7 @@ jobs:
         run: |
           set -euo pipefail
           python -m pip install --upgrade pip
-          pip install black isort
+          pip install "black==${BLACK_VERSION}" "isort==${ISORT_VERSION}"
 
           mapfile -t py_files <<'EOF'
           ${{ steps.py_changes.outputs.all_changed_files }}
@@ -112,16 +117,30 @@ jobs:
             isort --profile black "${files[@]}"
           fi
 
+      - name: Tool versions (Python formatters)
+        if: steps.py_changes.outputs.any_changed == 'true'
+        shell: bash
+        run: |
+          set -euo pipefail
+          black --version
+          isort --version-number
+
       - name: Install shell formatter
         if: steps.sh_changes.outputs.any_changed == 'true'
         shell: bash
         run: |
           set -euo pipefail
-          SHFMT_VERSION=3.10.0
           curl -fsSL "https://github.com/mvdan/sh/releases/download/v${SHFMT_VERSION}/shfmt_v${SHFMT_VERSION}_linux_amd64" -o /tmp/shfmt
           chmod +x /tmp/shfmt
           sudo mv /tmp/shfmt /usr/local/bin/shfmt
 
+      - name: Tool versions (Shell formatter)
+        if: steps.sh_changes.outputs.any_changed == 'true'
+        shell: bash
+        run: |
+          set -euo pipefail
+          shfmt --version
+
       - name: Auto-format shell files
         if: steps.sh_changes.outputs.any_changed == 'true'
         shell: bash
@@ -152,7 +171,7 @@ jobs:
         shell: bash
         run: |
           set -euo pipefail
-          npm install --global prettier
+          npm install --global "prettier@${PRETTIER_VERSION}"
 
           mapfile -t data_files <<'EOF'
           ${{ steps.data_changes.outputs.all_changed_files }}
@@ -167,6 +186,13 @@ jobs:
             prettier --write "${files[@]}"
           fi
 
+      - name: Tool versions (Data formatter)
+        if: steps.data_changes.outputs.any_changed == 'true'
+        shell: bash
+        run: |
+          set -euo pipefail
+          prettier --version
+
       - name: Commit and push formatting fixes
         env:
           PR_HEAD_REF: ${{ github.head_ref }}
diff --git a/.github/workflows/ci-repo-quality.yml b/.github/workflows/ci-repo-quality.yml
index 06a96532..3189f564 100644
--- a/.github/workflows/ci-repo-quality.yml
+++ b/.github/workflows/ci-repo-quality.yml
@@ -22,6 +22,9 @@ jobs:
   repo-quality:
     runs-on: ubuntu-latest
     continue-on-error: true
+    env:
+      PYTEST_VERSION: '8.4.0'
+      COVERAGE_VERSION: '7.6.1'
     steps:
       - uses: actions/checkout@v4
 
@@ -34,7 +37,13 @@ jobs:
         run: |
           set -euo pipefail
           python -m pip install --upgrade pip
-          pip install pytest coverage
+          pip install "pytest==${PYTEST_VERSION}" "coverage==${COVERAGE_VERSION}"
+
+      - name: Tool versions
+        run: |
+          set -euo pipefail
+          pytest --version
+          python -m coverage --version
 
       - name: Run pytest (advisory)
         shell: bash
diff --git a/.github/workflows/lint-python.yml b/.github/workflows/lint-python.yml
index 149a4c82..1a3aae7f 100644
--- a/.github/workflows/lint-python.yml
+++ b/.github/workflows/lint-python.yml
@@ -23,6 +23,9 @@ jobs:
   lint-python:
     runs-on: ubuntu-latest
     continue-on-error: true
+    env:
+      BLACK_VERSION: '24.10.0'
+      ISORT_VERSION: '5.13.2'
     steps:
       - uses: actions/checkout@v4
 
@@ -35,7 +38,13 @@ jobs:
         run: |
           set -euo pipefail
           python -m pip install --upgrade pip
-          pip install black isort
+          pip install "black==${BLACK_VERSION}" "isort==${ISORT_VERSION}"
+
+      - name: Tool versions
+        run: |
+          set -euo pipefail
+          black --version
+          isort --version-number
 
       - name: Run black/isort/py_compile (check-only, advisory)
         shell: bash
diff --git a/.github/workflows/lint-shell.yml b/.github/workflows/lint-shell.yml
index 030dcf60..f0e076e5 100644
--- a/.github/workflows/lint-shell.yml
+++ b/.github/workflows/lint-shell.yml
@@ -25,19 +25,28 @@ jobs:
   lint-shell:
     runs-on: ubuntu-latest
     continue-on-error: true
+    env:
+      SHELLCHECK_VERSION: '0.10.0'
+      SHFMT_VERSION: '3.10.0'
     steps:
       - uses: actions/checkout@v4
 
       - name: Install shell tooling
         run: |
           set -euo pipefail
-          sudo apt-get update
-          sudo apt-get install -y shellcheck
-          SHFMT_VERSION=3.10.0
+          curl -fsSL "https://github.com/koalaman/shellcheck/releases/download/v${SHELLCHECK_VERSION}/shellcheck-v${SHELLCHECK_VERSION}.linux.x86_64.tar.xz" -o /tmp/shellcheck.tar.xz
+          tar -xJf /tmp/shellcheck.tar.xz -C /tmp
+          sudo mv "/tmp/shellcheck-v${SHELLCHECK_VERSION}/shellcheck" /usr/local/bin/shellcheck
           curl -fsSL "https://github.com/mvdan/sh/releases/download/v${SHFMT_VERSION}/shfmt_v${SHFMT_VERSION}_linux_amd64" -o /tmp/shfmt
           chmod +x /tmp/shfmt
           sudo mv /tmp/shfmt /usr/local/bin/shfmt
 
+      - name: Tool versions
+        run: |
+          set -euo pipefail
+          shellcheck --version
+          shfmt --version
+
       - name: Run shellcheck/shfmt/bash -n (check-only, advisory)
         shell: bash
         run: |

From 903233f324550c1069aee3e1c372d483ed8ad2b8 Mon Sep 17 00:00:00 2001
From: Dmitry Kondratyev <kondratyev.d.95@gmail.com>
Date: Wed, 4 Feb 2026 12:22:31 -0500
Subject: [PATCH 22/25] docs(ci): sync CI plan with current runtime strategy

---
 .codex/CI_PLAN.md | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/.codex/CI_PLAN.md b/.codex/CI_PLAN.md
index dbef85fc..12f6c491 100644
--- a/.codex/CI_PLAN.md
+++ b/.codex/CI_PLAN.md
@@ -6,8 +6,9 @@ Deliver one draft PR from `codex/ci` to `main` with a stable advisory-first CI b
 ## Current Status
 - PR branch: `codex/ci`
 - Delivery model: single PR `codex/ci -> main`
-- Existing CI baseline is green on PR checks.
-- Formatter/linter workflows are check-only (no CI writeback commits).
+- PR #21 is open against `main` (not draft).
+- Existing CI baseline is green on fast PR checks; container build jobs are the long pole.
+- Lint workflows are check-only; formatter autofix workflow can commit formatting-only fixes to PR branches.
 
 ## Success Criteria
 - CI remains stable on `pull_request` runs for all configured workflows.
@@ -60,21 +61,25 @@ Approved exception:
 ### B) Repo Quality and Tests (advisory)
 - Workflows: `lint-python.yml`, `lint-shell.yml`, `lint-json.yml`, `lint-yaml.yml`, `ci-format-autofix.yml`, `ci-repo-quality.yml`, `ci-integration-scenarios.yml`
 - Checks: black/isort check-only, py_compile, pytest unit advisory with coverage threshold, shellcheck/shfmt/bash -n, JSON/YAML parse, auto-format commits for changed Python/shell/JSON/YAML files, integration scenario matrix tests via mocked container/monitoring flows.
+- Execution model: fast workflows are path-scoped with PR concurrency cancellation; formatter/lint tool versions are pinned for deterministic behavior.
 - Risk: script/runtime regressions.
 
 ### C) Container Reliability (advisory)
 - Workflow: `lint-docker.yml`
 - Checks: hadolint, targeted Docker Buildx jobs with GitHub Actions layer cache, smoke checks via `.github/scripts/container-smoke.sh`.
+- Execution model: path-scoped change detection, explicit job timeouts, and advisory summaries in run output.
 - Risk: image build/runtime regressions.
 
 ### D) GitOps Deployability (advisory)
 - Workflow: `ci-gitops-deployability.yml`
 - Checks: kustomize render + kubeconform schema validation.
+- Execution model: overlay-scoped detection, explicit job timeouts, and advisory plan/result summaries in run output.
 - Risk: Flux reconciliation failures from invalid manifests.
 
 ### E) Security Posture (advisory)
 - Workflows: `nightly-security-advisory.yml`, `ci-security-advisory.yml`
 - Checks: nightly Trivy filesystem scan plus PR-time advisory Trivy vulnerability/config scans with run summaries and artifacts.
+- Execution model: path-scoped PR scans, explicit scan timeouts, and summary tables for scan scope/outcomes.
 - Risk: security drift in dependencies/configuration.
 
 ## Optimization Workstreams (Current)
@@ -109,7 +114,6 @@ Goal:
 - No side branches.
 - No force-push on shared campaign work.
 - Daily sync: merge `main` into `codex/ci` (no rebase).
-- Keep PR draft until optimization baseline is stable.
 
 ## Constraint Challenge Protocol
 If any hard constraint must be challenged, submit an `EXCEPTION REQUEST` with:

From 600da03f0a059996f5d509081d486bc1d772a78b Mon Sep 17 00:00:00 2001
From: Dmitry Kondratyev <kondratyev.d.95@gmail.com>
Date: Wed, 4 Feb 2026 12:25:04 -0500
Subject: [PATCH 23/25] ci: fix actionlint shell warnings in workflow summaries

---
 .github/workflows/ci-gitops-deployability.yml | 18 +++++++++---------
 .github/workflows/lint-docker.yml             | 14 +++++++-------
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/ci-gitops-deployability.yml b/.github/workflows/ci-gitops-deployability.yml
index f37b5a53..9167c264 100644
--- a/.github/workflows/ci-gitops-deployability.yml
+++ b/.github/workflows/ci-gitops-deployability.yml
@@ -66,29 +66,29 @@ jobs:
           {
             echo '### GitOps Deployability Plan'
             echo
-            echo "- Full overlay run: \`$run_all\`"
+            echo "- Full overlay run: $run_all"
             echo
             echo '| Overlay | Decision |'
             echo '|---|---|'
             if [ "$run_all" = 'true' ] || [ "${{ steps.filter.outputs.core_production }}" = 'true' ]; then
-              echo '| `deploy/core-production` | run |'
+              echo '| deploy/core-production | run |'
             else
-              echo '| `deploy/core-production` | skipped |'
+              echo '| deploy/core-production | skipped |'
             fi
             if [ "$run_all" = 'true' ] || [ "${{ steps.filter.outputs.core_staging }}" = 'true' ]; then
-              echo '| `deploy/core-staging` | run |'
+              echo '| deploy/core-staging | run |'
             else
-              echo '| `deploy/core-staging` | skipped |'
+              echo '| deploy/core-staging | skipped |'
             fi
             if [ "$run_all" = 'true' ] || [ "${{ steps.filter.outputs.core_geddes2 }}" = 'true' ]; then
-              echo '| `deploy/core-geddes2` | run |'
+              echo '| deploy/core-geddes2 | run |'
             else
-              echo '| `deploy/core-geddes2` | skipped |'
+              echo '| deploy/core-geddes2 | skipped |'
             fi
             if [ "$run_all" = 'true' ] || [ "${{ steps.filter.outputs.experimental }}" = 'true' ]; then
-              echo '| `deploy/experimental` | run |'
+              echo '| deploy/experimental | run |'
             else
-              echo '| `deploy/experimental` | skipped |'
+              echo '| deploy/experimental | skipped |'
             fi
             echo
             echo '- Mode: advisory (gitops-validate uses continue-on-error).'
diff --git a/.github/workflows/lint-docker.yml b/.github/workflows/lint-docker.yml
index 37678f98..6764fefa 100644
--- a/.github/workflows/lint-docker.yml
+++ b/.github/workflows/lint-docker.yml
@@ -115,7 +115,7 @@ jobs:
           {
             echo '### Dockerfile Lint Advisory Summary'
             echo
-            echo "- Hadolint outcome: `${{ steps.hadolint.outcome }}`"
+            echo "- Hadolint outcome: ${{ steps.hadolint.outcome }}"
             echo '- Mode: advisory (job continue-on-error=true).'
           } >> "$GITHUB_STEP_SUMMARY"
 
@@ -153,8 +153,8 @@ jobs:
           {
             echo '### af-pod-monitor Container Advisory Summary'
             echo
-            echo "- Build outcome: `${{ steps.build_image.outcome }}`"
-            echo "- Smoke test outcome: `${{ steps.smoke_test.outcome }}`"
+            echo "- Build outcome: ${{ steps.build_image.outcome }}"
+            echo "- Smoke test outcome: ${{ steps.smoke_test.outcome }}"
             echo '- Mode: advisory (job continue-on-error=true).'
           } >> "$GITHUB_STEP_SUMMARY"
 
@@ -192,8 +192,8 @@ jobs:
           {
             echo '### interlink-slurm-plugin Container Advisory Summary'
             echo
-            echo "- Build outcome: `${{ steps.build_image.outcome }}`"
-            echo "- Smoke test outcome: `${{ steps.smoke_test.outcome }}`"
+            echo "- Build outcome: ${{ steps.build_image.outcome }}"
+            echo "- Smoke test outcome: ${{ steps.smoke_test.outcome }}"
             echo '- Mode: advisory (job continue-on-error=true).'
           } >> "$GITHUB_STEP_SUMMARY"
 
@@ -231,7 +231,7 @@ jobs:
           {
             echo '### purdue-af Container Advisory Summary'
             echo
-            echo "- Build outcome: `${{ steps.build_image.outcome }}`"
-            echo "- Smoke test outcome: `${{ steps.smoke_test.outcome }}`"
+            echo "- Build outcome: ${{ steps.build_image.outcome }}"
+            echo "- Smoke test outcome: ${{ steps.smoke_test.outcome }}"
             echo '- Mode: advisory (job continue-on-error=true).'
           } >> "$GITHUB_STEP_SUMMARY"

From 6999e2f15b8f457b6fa8750ebe84ccbecd7bcf70 Mon Sep 17 00:00:00 2001
From: Dmitry Kondratyev <kondratyev.d.95@gmail.com>
Date: Wed, 4 Feb 2026 13:26:38 -0500
Subject: [PATCH 24/25] ci: remove docker build timeouts and trim build context

---
 .codex/CI_PLAN.md                 |  3 ++-
 .dockerignore                     | 14 ++++++++++++++
 .github/workflows/lint-docker.yml |  3 ---
 3 files changed, 16 insertions(+), 4 deletions(-)
 create mode 100644 .dockerignore

diff --git a/.codex/CI_PLAN.md b/.codex/CI_PLAN.md
index 12f6c491..384ff43c 100644
--- a/.codex/CI_PLAN.md
+++ b/.codex/CI_PLAN.md
@@ -8,6 +8,7 @@ Deliver one draft PR from `codex/ci` to `main` with a stable advisory-first CI b
 - Delivery model: single PR `codex/ci -> main`
 - PR #21 is open against `main` (not draft).
 - Existing CI baseline is green on fast PR checks; container build jobs are the long pole.
+- Root-context Docker builds are cache-enabled and use a repo-level `.dockerignore` to reduce context size.
 - Lint workflows are check-only; formatter autofix workflow can commit formatting-only fixes to PR branches.
 
 ## Success Criteria
@@ -67,7 +68,7 @@ Approved exception:
 ### C) Container Reliability (advisory)
 - Workflow: `lint-docker.yml`
 - Checks: hadolint, targeted Docker Buildx jobs with GitHub Actions layer cache, smoke checks via `.github/scripts/container-smoke.sh`.
-- Execution model: path-scoped change detection, explicit job timeouts, and advisory summaries in run output.
+- Execution model: path-scoped change detection, no per-job timeout cap on Docker build jobs, root-context `.dockerignore` optimization, and advisory summaries in run output.
 - Risk: image build/runtime regressions.
 
 ### D) GitOps Deployability (advisory)
diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 00000000..6f75458c
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,14 @@
+# Keep root-context Docker builds lean for CI.
+# This file affects builds that use `context: .` in GitHub Actions.
+**
+
+# Keep required sources for maintained root-context Dockerfiles.
+!docker/
+!docker/interlink-slurm-plugin/
+!docker/interlink-slurm-plugin/**
+!docker/purdue-af/
+!docker/purdue-af/**
+!slurm/
+!slurm/slurm-24.05.1-1.el8.x86_64.rpm
+!slurm/slurm-configs/
+!slurm/slurm-configs/**
diff --git a/.github/workflows/lint-docker.yml b/.github/workflows/lint-docker.yml
index 6764fefa..2489aee9 100644
--- a/.github/workflows/lint-docker.yml
+++ b/.github/workflows/lint-docker.yml
@@ -123,7 +123,6 @@ jobs:
     needs: detect-docker-changes
     if: needs.detect-docker-changes.outputs.af_pod_monitor == 'true'
     runs-on: ubuntu-latest
-    timeout-minutes: 35
     continue-on-error: true
     steps:
       - uses: actions/checkout@v4
@@ -162,7 +161,6 @@ jobs:
     needs: detect-docker-changes
     if: needs.detect-docker-changes.outputs.interlink_slurm_plugin == 'true'
     runs-on: ubuntu-latest
-    timeout-minutes: 35
     continue-on-error: true
     steps:
       - uses: actions/checkout@v4
@@ -201,7 +199,6 @@ jobs:
     needs: detect-docker-changes
     if: needs.detect-docker-changes.outputs.purdue_af == 'true'
     runs-on: ubuntu-latest
-    timeout-minutes: 35
     continue-on-error: true
     steps:
       - uses: actions/checkout@v4

From 7f675f192ebd0c8c529c1d154d909e0e8b792f31 Mon Sep 17 00:00:00 2001
From: Dmitry Kondratyev <kondratyev.d.95@gmail.com>
Date: Wed, 4 Feb 2026 18:37:21 -0500
Subject: [PATCH 25/25] ci: bound docker builds and improve build visibility

---
 .codex/CI_PLAN.md                 |  2 +-
 .github/workflows/lint-docker.yml | 12 ++++++++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/.codex/CI_PLAN.md b/.codex/CI_PLAN.md
index 384ff43c..7b82984b 100644
--- a/.codex/CI_PLAN.md
+++ b/.codex/CI_PLAN.md
@@ -68,7 +68,7 @@ Approved exception:
 ### C) Container Reliability (advisory)
 - Workflow: `lint-docker.yml`
 - Checks: hadolint, targeted Docker Buildx jobs with GitHub Actions layer cache, smoke checks via `.github/scripts/container-smoke.sh`.
-- Execution model: path-scoped change detection, no per-job timeout cap on Docker build jobs, root-context `.dockerignore` optimization, and advisory summaries in run output.
+- Execution model: path-scoped change detection, 120-minute per-job timeout cap for Docker build jobs, root-context `.dockerignore` optimization, BuildKit plain progress logging, and advisory summaries in run output.
 - Risk: image build/runtime regressions.
 
 ### D) GitOps Deployability (advisory)
diff --git a/.github/workflows/lint-docker.yml b/.github/workflows/lint-docker.yml
index 2489aee9..887e63db 100644
--- a/.github/workflows/lint-docker.yml
+++ b/.github/workflows/lint-docker.yml
@@ -123,7 +123,10 @@ jobs:
     needs: detect-docker-changes
     if: needs.detect-docker-changes.outputs.af_pod_monitor == 'true'
     runs-on: ubuntu-latest
+    timeout-minutes: 120
     continue-on-error: true
+    env:
+      BUILDKIT_PROGRESS: plain
     steps:
       - uses: actions/checkout@v4
 
@@ -137,6 +140,7 @@ jobs:
           context: docker/af-pod-monitor
           file: docker/af-pod-monitor/Dockerfile
           load: true
+          pull: true
           tags: local/af-pod-monitor:${{ github.sha }}
           cache-from: type=gha,scope=af-pod-monitor
           cache-to: type=gha,mode=max,scope=af-pod-monitor,ignore-error=true
@@ -161,7 +165,10 @@ jobs:
     needs: detect-docker-changes
     if: needs.detect-docker-changes.outputs.interlink_slurm_plugin == 'true'
     runs-on: ubuntu-latest
+    timeout-minutes: 120
     continue-on-error: true
+    env:
+      BUILDKIT_PROGRESS: plain
     steps:
       - uses: actions/checkout@v4
 
@@ -175,6 +182,7 @@ jobs:
           context: .
           file: docker/interlink-slurm-plugin/Dockerfile.alma8
           load: true
+          pull: true
           tags: local/interlink-slurm-plugin:${{ github.sha }}
           cache-from: type=gha,scope=interlink-slurm-plugin
           cache-to: type=gha,mode=max,scope=interlink-slurm-plugin,ignore-error=true
@@ -199,7 +207,10 @@ jobs:
     needs: detect-docker-changes
     if: needs.detect-docker-changes.outputs.purdue_af == 'true'
     runs-on: ubuntu-latest
+    timeout-minutes: 120
     continue-on-error: true
+    env:
+      BUILDKIT_PROGRESS: plain
     steps:
       - uses: actions/checkout@v4
 
@@ -213,6 +224,7 @@ jobs:
           context: .
           file: docker/purdue-af/Dockerfile
           load: true
+          pull: true
           tags: local/purdue-af:${{ github.sha }}
           cache-from: type=gha,scope=purdue-af
           cache-to: type=gha,mode=max,scope=purdue-af,ignore-error=true