From 09ea2f416adf9f02649587598c7a5c2ba0be554f Mon Sep 17 00:00:00 2001 From: Dmitry Kondratyev Date: Tue, 3 Feb 2026 23:20:52 -0500 Subject: [PATCH 01/25] Add CI campaign source-of-truth plan --- .codex/CI_PLAN.md | 106 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .codex/CI_PLAN.md diff --git a/.codex/CI_PLAN.md b/.codex/CI_PLAN.md new file mode 100644 index 00000000..76463f3f --- /dev/null +++ b/.codex/CI_PLAN.md @@ -0,0 +1,106 @@ +# CI/CD Campaign Plan (Current State) + +## Mission and Success Criteria +Deliver exactly one draft PR from `codex/ci` to `main` with minimal CI/CD hardening that: +- converts formatter-based CI to check-only behavior, +- adds advisory-first integrity/deploy/security coverage, +- keeps one source of truth in this file, +- preserves safe daily branch sync (`main` merged into `codex/ci`, no force-push). + +Success means: +- `.github/workflows/lint-*.yml` workflows are check-only and run on every push + pull_request, +- new workflows exist for integrity, GitOps deployability, and nightly advisory security, +- optional repo-quality workflow is selected and included, +- README shows A-E category badges, +- no changes touch out-of-scope paths. + +## In-Scope / Out-of-Scope Paths +In scope: +- `.github/**` +- `apps/**` +- `deploy/**` +- `docker/**` (except exclusions) +- `README.md` +- `.codex/CI_PLAN.md` + +Out of scope: +- `docker/dask-gateway-server/**` +- `docs/**` +- `docs/source/demos/**` +- `docker/kaniko-build-jobs/**` +- `slurm/**` +- `.cursor/**` + +## Target Check Architecture +### A) CI System Integrity (advisory) +- Workflow: `.github/workflows/ci-workflow-integrity.yml` +- Checks: `actionlint` + workflow YAML parse. +- Risk mapped: malformed workflows, invalid action definitions, skipped CI due syntax/runtime issues. + +### B) Repo-Owned Code Quality / Tests (advisory additions) +- Workflows: + - `.github/workflows/lint-python.yml` + - `.github/workflows/lint-shell.yml` + - `.github/workflows/lint-json.yml` + - `.github/workflows/lint-yaml.yml` + - `.github/workflows/ci-repo-quality.yml` (selected) +- Checks: black/isort check-only, py_compile, pytest (advisory), shellcheck/shfmt/bash -n, JSON/YAML parse checks. +- Risk mapped: runtime and script regressions. + +### C) Container Reliability (advisory additions) +- Workflow: `.github/workflows/lint-docker.yml` +- Checks: hadolint (check-only), advisory docker build/smoke for maintained Dockerfiles via `.github/scripts/container-smoke.sh`. +- Risk mapped: container build/runtime breakage. + +### D) GitOps/K8s Deployability (advisory) +- Workflow: `.github/workflows/ci-gitops-deployability.yml` +- Checks: `kustomize build --load-restrictor LoadRestrictionsNone` for all deploy overlays + `kubeconform` schema validation. +- Risk mapped: Flux reconciliation failures from invalid manifests. + +### E) Nightly Advisory Security +- Workflow: `.github/workflows/nightly-security-advisory.yml` +- Checks: Trivy filesystem scan (HIGH/CRITICAL). +- Risk mapped: security posture drift. + +## Advisory vs Future Blocking Milestones +- M0 (this campaign): all newly introduced validations advisory. +- M1: promote workflow integrity + repo-quality checks to blocking after stable baseline. +- M2: promote container + GitOps checks to blocking after stable baseline. +- M3: keep nightly security advisory unless explicitly promoted. + +## Agent Lane Ownership (File Level) +- Coordinator: `.codex/CI_PLAN.md`, `README.md`, branch/PR/sync operations. +- Agent A: `.github/workflows/ci-workflow-integrity.yml` (+ selection recommendation in chat). +- Agent B: `.github/workflows/lint-python.yml`, `.github/workflows/lint-shell.yml`, `.github/workflows/ci-repo-quality.yml`, optional B helper scripts. +- Agent C: `.github/workflows/lint-json.yml`, `.github/workflows/lint-yaml.yml`. +- Agent D: `.github/workflows/lint-docker.yml`, `.github/scripts/container-smoke.sh`. +- Agent E: `.github/workflows/ci-gitops-deployability.yml`, `.github/workflows/nightly-security-advisory.yml`. + +## Phased Rollout and Rollback +Rollout: +1. First commit creates this file. +2. Add/convert workflows in lane-owned files only. +3. Keep PR draft until baseline checks stabilize. +4. Daily sync by merging `main` into `codex/ci`. + +Rollback: +- Revert only unstable workflow files in small commits. +- Keep advisory mode active during stabilization. + +## Reproducible Runbook (from clean main) +1. `git fetch origin` +2. `git switch main && git pull --ff-only origin main` +3. `git switch -c codex/ci` (or `git switch codex/ci`) +4. Commit #1: `.codex/CI_PLAN.md` +5. Apply lane-scoped workflow changes +6. `git push -u origin codex/ci` +7. Open one draft PR `codex/ci -> main` +8. Daily sync: `git fetch origin && git switch codex/ci && git merge --no-ff origin/main` + +## Constraint Challenge Protocol +If a hard constraint appears to conflict with delivery, create an `EXCEPTION REQUEST` with: +1) challenged constraint, +2) concrete risk if unchanged, +3) minimal exception, +4) rollback path. +Do not implement exception changes before explicit user approval. From c76e026ee1cbd7b4db74686e059fbe4f857165da Mon Sep 17 00:00:00 2001 From: Dmitry Kondratyev Date: Tue, 3 Feb 2026 23:26:16 -0500 Subject: [PATCH 02/25] Add advisory CI integrity, quality, container, gitops, and security checks --- .github/scripts/container-smoke.sh | 30 ++++++ .github/workflows/ci-gitops-deployability.yml | 53 ++++++++++ .github/workflows/ci-repo-quality.yml | 71 ++++++++++++++ .github/workflows/ci-workflow-integrity.yml | 43 +++++++++ .github/workflows/lint-docker.yml | 96 +++++++++++++++++-- .github/workflows/lint-json.yml | 53 +++++----- .github/workflows/lint-python.yml | 65 ++++++++----- .github/workflows/lint-shell.yml | 62 +++++++----- .github/workflows/lint-yaml.yml | 75 +++++++++------ .../workflows/nightly-security-advisory.yml | 27 ++++++ README.md | 11 +-- 11 files changed, 462 insertions(+), 124 deletions(-) create mode 100755 .github/scripts/container-smoke.sh create mode 100644 .github/workflows/ci-gitops-deployability.yml create mode 100644 .github/workflows/ci-repo-quality.yml create mode 100644 .github/workflows/ci-workflow-integrity.yml create mode 100644 .github/workflows/nightly-security-advisory.yml diff --git a/.github/scripts/container-smoke.sh b/.github/scripts/container-smoke.sh new file mode 100755 index 00000000..cf1bb967 --- /dev/null +++ b/.github/scripts/container-smoke.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash +set -euo pipefail + +if [ "$#" -ne 2 ]; then + echo "Usage: $0 " >&2 + exit 2 +fi + +image="$1" +profile="$2" + +docker image inspect "$image" >/dev/null + +case "$profile" in + af-pod-monitor) + docker run --rm --entrypoint python "$image" -c "import prometheus_client" + ;; + interlink-slurm-plugin) + docker run --rm --entrypoint /bin/sh "$image" -lc 'test -x /sidecar/slurm-sidecar' + ;; + purdue-af) + docker run --rm --entrypoint /bin/bash "$image" -lc 'python --version && jupyter --version >/dev/null' + ;; + *) + echo "Unknown profile: $profile" >&2 + exit 2 + ;; +esac + +echo "Smoke checks passed for profile: $profile" diff --git a/.github/workflows/ci-gitops-deployability.yml b/.github/workflows/ci-gitops-deployability.yml new file mode 100644 index 00000000..c8f70e70 --- /dev/null +++ b/.github/workflows/ci-gitops-deployability.yml @@ -0,0 +1,53 @@ +name: CI GitOps Deployability + +on: + push: + pull_request: + +permissions: + contents: read + +jobs: + gitops-validate: + runs-on: ubuntu-latest + continue-on-error: true + steps: + - uses: actions/checkout@v4 + + - name: Set up kustomize + uses: imranismail/setup-kustomize@v2 + with: + kustomize-version: '5.4.2' + + - name: Install kubeconform + run: | + set -euo pipefail + KUBECONFORM_VERSION=0.6.7 + curl -fsSL "https://github.com/yannh/kubeconform/releases/download/v${KUBECONFORM_VERSION}/kubeconform-linux-amd64.tar.gz" -o /tmp/kubeconform.tar.gz + tar -xzf /tmp/kubeconform.tar.gz -C /tmp kubeconform + chmod +x /tmp/kubeconform + sudo mv /tmp/kubeconform /usr/local/bin/kubeconform + + - name: Render overlays with kustomize (advisory) + run: | + set -euo pipefail + overlays=( + deploy/core-production + deploy/core-staging + deploy/core-geddes2 + deploy/experimental + ) + + for overlay in "${overlays[@]}"; do + out="/tmp/$(echo "$overlay" | tr '/' '_').yaml" + echo "Rendering $overlay -> $out" + kustomize build --load-restrictor LoadRestrictionsNone "$overlay" > "$out" + done + + - name: Validate rendered manifests with kubeconform (advisory) + run: | + set -euo pipefail + for rendered in /tmp/deploy_core-production.yaml /tmp/deploy_core-staging.yaml /tmp/deploy_core-geddes2.yaml /tmp/deploy_experimental.yaml; do + echo "Validating $rendered" + kubeconform -summary -strict -ignore-missing-schemas "$rendered" + done diff --git a/.github/workflows/ci-repo-quality.yml b/.github/workflows/ci-repo-quality.yml new file mode 100644 index 00000000..9a08b612 --- /dev/null +++ b/.github/workflows/ci-repo-quality.yml @@ -0,0 +1,71 @@ +name: CI Repo Quality + +on: + push: + pull_request: + +permissions: + contents: read + +jobs: + repo-quality: + runs-on: ubuntu-latest + continue-on-error: true + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Install quality tooling + run: | + set -euo pipefail + python -m pip install --upgrade pip + pip install pytest + + - name: Python syntax smoke (advisory) + shell: bash + run: | + set -euo pipefail + mapfile -t py_files < <(find . -type f -name '*.py' \ + -not -path './docker/dask-gateway-server/*' \ + -not -path './docker/kaniko-build-jobs/*' \ + -not -path './docs/*' \ + -not -path './slurm/*' \ + -not -path './.cursor/*' \ + -not -path './.git/*' | sort) + + if [ "${#py_files[@]}" -gt 0 ]; then + python -m py_compile "${py_files[@]}" + fi + + - name: Shell syntax smoke (advisory) + shell: bash + run: | + set -euo pipefail + mapfile -t sh_files < <(find . -type f \ + \( -name '*.sh' -o -name 'pixi-wrapper' -o -name 'fix-permissions' \) \ + -not -path './docker/dask-gateway-server/*' \ + -not -path './docker/kaniko-build-jobs/*' \ + -not -path './docs/*' \ + -not -path './slurm/*' \ + -not -path './.cursor/*' \ + -not -path './.git/*' | sort) + + for f in "${sh_files[@]}"; do + bash -n "$f" + done + + - name: Run pytest (advisory) + shell: bash + run: | + set +e + pytest -q + rc=$? + if [ "$rc" -eq 5 ]; then + echo 'pytest collected no tests; treating as informational.' + exit 0 + fi + exit "$rc" diff --git a/.github/workflows/ci-workflow-integrity.yml b/.github/workflows/ci-workflow-integrity.yml new file mode 100644 index 00000000..035578ea --- /dev/null +++ b/.github/workflows/ci-workflow-integrity.yml @@ -0,0 +1,43 @@ +name: CI Workflow Integrity + +on: + push: + pull_request: + +permissions: + contents: read + +jobs: + workflow-integrity: + runs-on: ubuntu-latest + continue-on-error: true + steps: + - uses: actions/checkout@v4 + + - name: Install actionlint + run: | + set -euo pipefail + bash <(curl -sSfL https://raw.githubusercontent.com/rhysd/actionlint/main/scripts/download-actionlint.bash) 1.7.4 + echo "$PWD/bin" >> "$GITHUB_PATH" + + - name: Run actionlint (advisory) + run: actionlint -color + + - name: Validate workflow YAML parse (advisory) + run: | + set -euo pipefail + python3 -m pip install --upgrade pip + python3 -m pip install pyyaml + python3 - <<'PY' + from pathlib import Path + import yaml + + workflows = sorted(Path('.github/workflows').glob('*.y*ml')) + if not workflows: + raise SystemExit('No workflow files found in .github/workflows') + + for wf in workflows: + with wf.open('r', encoding='utf-8') as f: + list(yaml.safe_load_all(f)) + print(f'Parsed {len(workflows)} workflow file(s).') + PY diff --git a/.github/workflows/lint-docker.yml b/.github/workflows/lint-docker.yml index f0733fdc..fce34cbc 100644 --- a/.github/workflows/lint-docker.yml +++ b/.github/workflows/lint-docker.yml @@ -1,22 +1,98 @@ -name: Lint Dockerfiles +name: Container Reliability on: push: - paths: - - '**/Dockerfile*' pull_request: - paths: - - '**/Dockerfile*' + +permissions: + contents: read jobs: - lint: + detect-docker-changes: runs-on: ubuntu-latest + outputs: + af_pod_monitor: ${{ steps.filter.outputs.af_pod_monitor }} + interlink_slurm_plugin: ${{ steps.filter.outputs.interlink_slurm_plugin }} + purdue_af: ${{ steps.filter.outputs.purdue_af }} steps: - uses: actions/checkout@v4 + + - uses: dorny/paths-filter@v3 + id: filter + with: + filters: | + af_pod_monitor: + - 'docker/af-pod-monitor/**' + - '.github/workflows/lint-docker.yml' + - '.github/scripts/container-smoke.sh' + interlink_slurm_plugin: + - 'docker/interlink-slurm-plugin/**' + - '.github/workflows/lint-docker.yml' + - '.github/scripts/container-smoke.sh' + purdue_af: + - 'docker/purdue-af/**' + - '.github/workflows/lint-docker.yml' + - '.github/scripts/container-smoke.sh' + + lint-dockerfiles: + runs-on: ubuntu-latest + continue-on-error: true + steps: + - uses: actions/checkout@v4 + - name: Install hadolint run: | - sudo wget -O /bin/hadolint https://github.com/hadolint/hadolint/releases/latest/download/hadolint-$(uname -s)-$(uname -m) - sudo chmod +x /bin/hadolint - - name: Run hadolint + set -euo pipefail + HADOLINT_VERSION=v2.12.0 + curl -fsSL "https://github.com/hadolint/hadolint/releases/download/${HADOLINT_VERSION}/hadolint-Linux-x86_64" -o /tmp/hadolint + chmod +x /tmp/hadolint + sudo mv /tmp/hadolint /usr/local/bin/hadolint + + - name: Run hadolint (check-only, advisory) run: | - find . -type f -iname 'Dockerfile*' -not -path './docker/dask-gateway-server/*' -exec hadolint --ignore DL3041 --ignore DL3033 --failure-threshold warning {} + \ No newline at end of file + set -euo pipefail + hadolint --ignore DL3041 --ignore DL3033 --failure-threshold warning docker/af-pod-monitor/Dockerfile + hadolint --ignore DL3041 --ignore DL3033 --failure-threshold warning docker/interlink-slurm-plugin/Dockerfile.alma8 + hadolint --ignore DL3041 --ignore DL3033 --failure-threshold warning docker/purdue-af/Dockerfile + + build-af-pod-monitor: + needs: detect-docker-changes + if: needs.detect-docker-changes.outputs.af_pod_monitor == 'true' + runs-on: ubuntu-latest + continue-on-error: true + steps: + - uses: actions/checkout@v4 + + - name: Build af-pod-monitor image (advisory) + run: docker build -f docker/af-pod-monitor/Dockerfile -t local/af-pod-monitor:${{ github.sha }} . + + - name: Smoke test af-pod-monitor image (advisory) + run: .github/scripts/container-smoke.sh local/af-pod-monitor:${{ github.sha }} af-pod-monitor + + build-interlink-slurm-plugin: + needs: detect-docker-changes + if: needs.detect-docker-changes.outputs.interlink_slurm_plugin == 'true' + runs-on: ubuntu-latest + continue-on-error: true + steps: + - uses: actions/checkout@v4 + + - name: Build interlink-slurm-plugin image (advisory) + run: docker build -f docker/interlink-slurm-plugin/Dockerfile.alma8 -t local/interlink-slurm-plugin:${{ github.sha }} . + + - name: Smoke test interlink-slurm-plugin image (advisory) + run: .github/scripts/container-smoke.sh local/interlink-slurm-plugin:${{ github.sha }} interlink-slurm-plugin + + build-purdue-af: + needs: detect-docker-changes + if: needs.detect-docker-changes.outputs.purdue_af == 'true' + runs-on: ubuntu-latest + continue-on-error: true + steps: + - uses: actions/checkout@v4 + + - name: Build purdue-af image (advisory) + run: docker build -f docker/purdue-af/Dockerfile -t local/purdue-af:${{ github.sha }} . + + - name: Smoke test purdue-af image (advisory) + run: .github/scripts/container-smoke.sh local/purdue-af:${{ github.sha }} purdue-af diff --git a/.github/workflows/lint-json.yml b/.github/workflows/lint-json.yml index 84c5e6b5..fcfe8bfb 100644 --- a/.github/workflows/lint-json.yml +++ b/.github/workflows/lint-json.yml @@ -2,40 +2,37 @@ name: Lint JSON on: push: - paths: - - '**.json' pull_request: - paths: - - '**.json' permissions: - contents: write - pull-requests: write + contents: read jobs: - lint: + json-parse: runs-on: ubuntu-latest + continue-on-error: true steps: - uses: actions/checkout@v4 - with: - token: ${{ secrets.GITHUB_TOKEN }} - - name: Set up Node.js - uses: actions/setup-node@v4 - with: - node-version: '20' - - name: Install prettier - run: npm install -g prettier - - name: Format JSON files - run: | - find . -name "*.json" | grep -v "docker/dask-gateway-server" | xargs prettier --write - - name: Commit changes - if: ${{ github.event_name == 'pull_request' || github.ref_type == 'branch' }} + + - name: Validate JSON files (check-only, advisory) + shell: bash run: | - git config --local user.email "action@github.com" - git config --local user.name "GitHub Action" - REF_NAME="${GITHUB_HEAD_REF:-${GITHUB_REF_NAME}}" - git fetch origin "$REF_NAME" - git checkout -B "$REF_NAME" "origin/$REF_NAME" || git checkout -B "$REF_NAME" - git add -A - git diff --quiet && git diff --staged --quiet || git commit -m "Apply prettier JSON formatting" - git push origin "$REF_NAME" \ No newline at end of file + set -euo pipefail + mapfile -t files < <(find . -type f -name '*.json' \ + -not -path './docker/dask-gateway-server/*' \ + -not -path './docker/kaniko-build-jobs/*' \ + -not -path './docs/*' \ + -not -path './slurm/*' \ + -not -path './.cursor/*' \ + -not -path './.git/*' | sort) + + if [ "${#files[@]}" -eq 0 ]; then + echo 'No in-scope JSON files found.' + exit 0 + fi + + for f in "${files[@]}"; do + python3 -m json.tool "$f" >/dev/null + done + + echo "Validated ${#files[@]} JSON file(s)." diff --git a/.github/workflows/lint-python.yml b/.github/workflows/lint-python.yml index 32e273ec..f9f16381 100644 --- a/.github/workflows/lint-python.yml +++ b/.github/workflows/lint-python.yml @@ -2,43 +2,58 @@ name: Lint Python on: push: - paths: - - '**.py' pull_request: - paths: - - '**.py' permissions: - contents: write - pull-requests: write + contents: read jobs: - lint: + lint-python: runs-on: ubuntu-latest + continue-on-error: true steps: - uses: actions/checkout@v4 - with: - token: ${{ secrets.GITHUB_TOKEN }} + - name: Set up Python uses: actions/setup-python@v5 with: python-version: '3.11' - - name: Install dependencies + + - name: Install tooling run: | + set -euo pipefail python -m pip install --upgrade pip - pip install black isort - - name: Run black - run: black . --exclude docker/dask-gateway-server - - name: Run isort - run: isort . --skip docker/dask-gateway-server - - name: Commit changes - if: ${{ github.event_name == 'pull_request' || github.ref_type == 'branch' }} + pip install black isort pytest + + - name: Run black/isort/py_compile (check-only, advisory) + shell: bash + run: | + set -euo pipefail + mapfile -t files < <(find . -type f -name '*.py' \ + -not -path './docker/dask-gateway-server/*' \ + -not -path './docker/kaniko-build-jobs/*' \ + -not -path './docs/*' \ + -not -path './slurm/*' \ + -not -path './.cursor/*' \ + -not -path './.git/*' | sort) + + if [ "${#files[@]}" -eq 0 ]; then + echo 'No in-scope Python files found.' + exit 0 + fi + + black --check "${files[@]}" + isort --check-only "${files[@]}" + python -m py_compile "${files[@]}" + + - name: Run pytest (advisory) + shell: bash run: | - git config --local user.email "action@github.com" - git config --local user.name "GitHub Action" - REF_NAME="${GITHUB_HEAD_REF:-${GITHUB_REF_NAME}}" - git fetch origin "$REF_NAME" - git checkout -B "$REF_NAME" "origin/$REF_NAME" || git checkout -B "$REF_NAME" - git add -A - git diff --quiet && git diff --staged --quiet || git commit -m "Apply black and isort formatting" - git push origin "$REF_NAME" \ No newline at end of file + set +e + pytest -q + rc=$? + if [ "$rc" -eq 5 ]; then + echo 'pytest collected no tests; treating as informational.' + exit 0 + fi + exit "$rc" diff --git a/.github/workflows/lint-shell.yml b/.github/workflows/lint-shell.yml index c3d65b08..04a61734 100644 --- a/.github/workflows/lint-shell.yml +++ b/.github/workflows/lint-shell.yml @@ -2,39 +2,49 @@ name: Lint Shell Scripts on: push: - paths: - - '**.sh' pull_request: - paths: - - '**.sh' permissions: - contents: write - pull-requests: write + contents: read jobs: - lint: + lint-shell: runs-on: ubuntu-latest + continue-on-error: true steps: - uses: actions/checkout@v4 - with: - token: ${{ secrets.GITHUB_TOKEN }} - - name: Install shfmt - run: | - curl -L https://github.com/mvdan/sh/releases/download/v3.7.0/shfmt_v3.7.0_linux_amd64 -o shfmt - chmod +x shfmt - sudo mv shfmt /usr/local/bin/ - - name: Format shell scripts + + - name: Install shell tooling run: | - find . -type f -name '*.sh' -not -path './docker/dask-gateway-server/*' -exec shfmt -w {} + - - name: Commit changes - if: ${{ github.event_name == 'pull_request' || github.ref_type == 'branch' }} + set -euo pipefail + sudo apt-get update + sudo apt-get install -y shellcheck + SHFMT_VERSION=3.10.0 + curl -fsSL "https://github.com/mvdan/sh/releases/download/v${SHFMT_VERSION}/shfmt_v${SHFMT_VERSION}_linux_amd64" -o /tmp/shfmt + chmod +x /tmp/shfmt + sudo mv /tmp/shfmt /usr/local/bin/shfmt + + - name: Run shellcheck/shfmt/bash -n (check-only, advisory) + shell: bash run: | - git config --local user.email "action@github.com" - git config --local user.name "GitHub Action" - REF_NAME="${GITHUB_HEAD_REF:-${GITHUB_REF_NAME}}" - git fetch origin "$REF_NAME" - git checkout -B "$REF_NAME" "origin/$REF_NAME" || git checkout -B "$REF_NAME" - git add -A - git diff --quiet && git diff --staged --quiet || git commit -m "Apply shfmt shell script formatting" - git push origin "$REF_NAME" \ No newline at end of file + set -euo pipefail + mapfile -t files < <(find . -type f \ + \( -name '*.sh' -o -name 'pixi-wrapper' -o -name 'fix-permissions' \) \ + -not -path './docker/dask-gateway-server/*' \ + -not -path './docker/kaniko-build-jobs/*' \ + -not -path './docs/*' \ + -not -path './slurm/*' \ + -not -path './.cursor/*' \ + -not -path './.git/*' | sort) + + if [ "${#files[@]}" -eq 0 ]; then + echo 'No in-scope shell files found.' + exit 0 + fi + + shellcheck "${files[@]}" + shfmt -d "${files[@]}" + + for f in "${files[@]}"; do + bash -n "$f" + done diff --git a/.github/workflows/lint-yaml.yml b/.github/workflows/lint-yaml.yml index 532884f1..e7cb7e92 100644 --- a/.github/workflows/lint-yaml.yml +++ b/.github/workflows/lint-yaml.yml @@ -2,42 +2,59 @@ name: Lint YAML on: push: - paths: - - '**.yml' - - '**.yaml' pull_request: - paths: - - '**.yml' - - '**.yaml' permissions: - contents: write - pull-requests: write + contents: read jobs: - lint: + yaml-parse: runs-on: ubuntu-latest + continue-on-error: true steps: - uses: actions/checkout@v4 - with: - token: ${{ secrets.GITHUB_TOKEN }} - - name: Set up Node.js - uses: actions/setup-node@v4 - with: - node-version: '20' - - name: Install prettier - run: npm install -g prettier - - name: Format YAML files + + - name: Install parser dependency run: | - find . -name "*.yml" -o -name "*.yaml" | grep -v "docker/dask-gateway-server" | grep -v "templates" | grep -v ".github/workflows" | xargs prettier --write - - name: Commit changes - if: ${{ github.event_name == 'pull_request' || github.ref_type == 'branch' }} + set -euo pipefail + python3 -m pip install --upgrade pip + python3 -m pip install pyyaml + + - name: Validate YAML files (check-only, advisory) + shell: bash run: | - git config --local user.email "action@github.com" - git config --local user.name "GitHub Action" - REF_NAME="${GITHUB_HEAD_REF:-${GITHUB_REF_NAME}}" - git fetch origin "$REF_NAME" - git checkout -B "$REF_NAME" "origin/$REF_NAME" || git checkout -B "$REF_NAME" - git add -A - git diff --quiet && git diff --staged --quiet || git commit -m "Apply prettier YAML formatting" - git push origin "$REF_NAME" \ No newline at end of file + set -euo pipefail + python3 - <<'PY' + from pathlib import Path + import yaml + + files = sorted( + p for p in Path('.').rglob('*') + if p.is_file() and p.suffix in {'.yml', '.yaml'} + ) + + excluded_prefixes = ( + Path('docker/dask-gateway-server'), + Path('docker/kaniko-build-jobs'), + Path('docs'), + Path('slurm'), + Path('.cursor'), + Path('.git'), + ) + + filtered = [] + for p in files: + if any(str(p).startswith(str(prefix) + '/') or p == prefix for prefix in excluded_prefixes): + continue + filtered.append(p) + + if not filtered: + print('No in-scope YAML files found.') + raise SystemExit(0) + + for p in filtered: + with p.open('r', encoding='utf-8') as f: + list(yaml.safe_load_all(f)) + + print(f'Validated {len(filtered)} YAML file(s).') + PY diff --git a/.github/workflows/nightly-security-advisory.yml b/.github/workflows/nightly-security-advisory.yml new file mode 100644 index 00000000..3204dc2b --- /dev/null +++ b/.github/workflows/nightly-security-advisory.yml @@ -0,0 +1,27 @@ +name: Nightly Security Advisory + +on: + schedule: + - cron: '17 5 * * *' + workflow_dispatch: + +permissions: + contents: read + +jobs: + trivy-filesystem: + runs-on: ubuntu-latest + continue-on-error: true + steps: + - uses: actions/checkout@v4 + + - name: Run Trivy filesystem scan (advisory) + uses: aquasecurity/trivy-action@0.33.1 + with: + scan-type: fs + scan-ref: . + severity: HIGH,CRITICAL + ignore-unfixed: true + exit-code: '1' + format: table + skip-dirs: docker/dask-gateway-server,docs,docs/source/demos,docker/kaniko-build-jobs,slurm,.cursor diff --git a/README.md b/README.md index 0faaac5d..3e7ec8f2 100644 --- a/README.md +++ b/README.md @@ -18,9 +18,8 @@ Each user is provided with a 25GB home directory at first login. These directori [![Documentation Status](https://readthedocs.org/projects/purdue-af/badge/?version=latest)](https://purdue-af.readthedocs.io/en/latest/?badge=latest) -[![Python](https://github.com/PurdueAF/purdue-af/workflows/Lint%20Python/badge.svg)](https://github.com/PurdueAF/purdue-af/actions?query=workflow%3A%22Lint+Python%22) -[![YAML](https://github.com/PurdueAF/purdue-af/workflows/Lint%20YAML/badge.svg)](https://github.com/PurdueAF/purdue-af/actions?query=workflow%3A%22Lint+YAML%22) -[![JSON](https://github.com/PurdueAF/purdue-af/workflows/Lint%20JSON/badge.svg)](https://github.com/PurdueAF/purdue-af/actions?query=workflow%3A%22Lint+JSON%22) -[![Shell](https://github.com/PurdueAF/purdue-af/workflows/Lint%20Shell%20Scripts/badge.svg)](https://github.com/PurdueAF/purdue-af/actions?query=workflow%3A%22Lint+Shell+Scripts%22) -[![Docker](https://github.com/PurdueAF/purdue-af/workflows/Lint%20Dockerfiles/badge.svg)](https://github.com/PurdueAF/purdue-af/actions?query=workflow%3A%22Lint+Dockerfiles%22) - +[![Workflow Integrity](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-workflow-integrity.yml/badge.svg)](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-workflow-integrity.yml) +[![Repo Quality](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-repo-quality.yml/badge.svg)](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-repo-quality.yml) +[![Container Reliability](https://github.com/PurdueAF/purdue-af/actions/workflows/lint-docker.yml/badge.svg)](https://github.com/PurdueAF/purdue-af/actions/workflows/lint-docker.yml) +[![GitOps Deployability](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-gitops-deployability.yml/badge.svg)](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-gitops-deployability.yml) +[![Nightly Security](https://github.com/PurdueAF/purdue-af/actions/workflows/nightly-security-advisory.yml/badge.svg)](https://github.com/PurdueAF/purdue-af/actions/workflows/nightly-security-advisory.yml) From 7049587766841272971a0b226b5226e54da7a648 Mon Sep 17 00:00:00 2001 From: Dmitry Kondratyev Date: Wed, 4 Feb 2026 08:59:16 -0500 Subject: [PATCH 03/25] Run CI checks on pull requests only --- .codex/CI_PLAN.md | 2 +- .github/workflows/ci-gitops-deployability.yml | 1 - .github/workflows/ci-repo-quality.yml | 1 - .github/workflows/ci-workflow-integrity.yml | 1 - .github/workflows/lint-docker.yml | 1 - .github/workflows/lint-json.yml | 1 - .github/workflows/lint-python.yml | 1 - .github/workflows/lint-shell.yml | 1 - .github/workflows/lint-yaml.yml | 1 - 9 files changed, 1 insertion(+), 9 deletions(-) diff --git a/.codex/CI_PLAN.md b/.codex/CI_PLAN.md index 76463f3f..3f1d2790 100644 --- a/.codex/CI_PLAN.md +++ b/.codex/CI_PLAN.md @@ -8,7 +8,7 @@ Deliver exactly one draft PR from `codex/ci` to `main` with minimal CI/CD harden - preserves safe daily branch sync (`main` merged into `codex/ci`, no force-push). Success means: -- `.github/workflows/lint-*.yml` workflows are check-only and run on every push + pull_request, +- `.github/workflows/lint-*.yml` workflows are check-only and run on `pull_request` (single run per change), - new workflows exist for integrity, GitOps deployability, and nightly advisory security, - optional repo-quality workflow is selected and included, - README shows A-E category badges, diff --git a/.github/workflows/ci-gitops-deployability.yml b/.github/workflows/ci-gitops-deployability.yml index c8f70e70..0942e2ab 100644 --- a/.github/workflows/ci-gitops-deployability.yml +++ b/.github/workflows/ci-gitops-deployability.yml @@ -1,7 +1,6 @@ name: CI GitOps Deployability on: - push: pull_request: permissions: diff --git a/.github/workflows/ci-repo-quality.yml b/.github/workflows/ci-repo-quality.yml index 9a08b612..247108bd 100644 --- a/.github/workflows/ci-repo-quality.yml +++ b/.github/workflows/ci-repo-quality.yml @@ -1,7 +1,6 @@ name: CI Repo Quality on: - push: pull_request: permissions: diff --git a/.github/workflows/ci-workflow-integrity.yml b/.github/workflows/ci-workflow-integrity.yml index 035578ea..b364d826 100644 --- a/.github/workflows/ci-workflow-integrity.yml +++ b/.github/workflows/ci-workflow-integrity.yml @@ -1,7 +1,6 @@ name: CI Workflow Integrity on: - push: pull_request: permissions: diff --git a/.github/workflows/lint-docker.yml b/.github/workflows/lint-docker.yml index fce34cbc..8ce41627 100644 --- a/.github/workflows/lint-docker.yml +++ b/.github/workflows/lint-docker.yml @@ -1,7 +1,6 @@ name: Container Reliability on: - push: pull_request: permissions: diff --git a/.github/workflows/lint-json.yml b/.github/workflows/lint-json.yml index fcfe8bfb..d223cfe7 100644 --- a/.github/workflows/lint-json.yml +++ b/.github/workflows/lint-json.yml @@ -1,7 +1,6 @@ name: Lint JSON on: - push: pull_request: permissions: diff --git a/.github/workflows/lint-python.yml b/.github/workflows/lint-python.yml index f9f16381..0f4d8686 100644 --- a/.github/workflows/lint-python.yml +++ b/.github/workflows/lint-python.yml @@ -1,7 +1,6 @@ name: Lint Python on: - push: pull_request: permissions: diff --git a/.github/workflows/lint-shell.yml b/.github/workflows/lint-shell.yml index 04a61734..7c099240 100644 --- a/.github/workflows/lint-shell.yml +++ b/.github/workflows/lint-shell.yml @@ -1,7 +1,6 @@ name: Lint Shell Scripts on: - push: pull_request: permissions: diff --git a/.github/workflows/lint-yaml.yml b/.github/workflows/lint-yaml.yml index e7cb7e92..6158c887 100644 --- a/.github/workflows/lint-yaml.yml +++ b/.github/workflows/lint-yaml.yml @@ -1,7 +1,6 @@ name: Lint YAML on: - push: pull_request: permissions: From 73d21697e10eaab8d6362691d4021669985e346c Mon Sep 17 00:00:00 2001 From: Dmitry Kondratyev Date: Wed, 4 Feb 2026 09:30:22 -0500 Subject: [PATCH 04/25] Fix CI baseline failures and docker filter dependency trigger --- .codex/CI_PLAN.md | 3 +++ .github/workflows/ci-gitops-deployability.yml | 2 +- .github/workflows/ci-workflow-integrity.yml | 10 +++------- .github/workflows/lint-docker.yml | 4 +++- .github/workflows/lint-shell.yml | 2 +- docker/purdue-af/genaistudio/genaistudio.py | 7 ++++--- docker/purdue-af/scripts/eos-connect.sh | 2 +- 7 files changed, 16 insertions(+), 14 deletions(-) diff --git a/.codex/CI_PLAN.md b/.codex/CI_PLAN.md index 3f1d2790..63e5d390 100644 --- a/.codex/CI_PLAN.md +++ b/.codex/CI_PLAN.md @@ -31,6 +31,9 @@ Out of scope: - `slurm/**` - `.cursor/**` +Approved exception: +- `slurm/**` is used as a dependency-only trigger for container reliability jobs because maintained Dockerfiles copy `slurm/` artifacts. + ## Target Check Architecture ### A) CI System Integrity (advisory) - Workflow: `.github/workflows/ci-workflow-integrity.yml` diff --git a/.github/workflows/ci-gitops-deployability.yml b/.github/workflows/ci-gitops-deployability.yml index 0942e2ab..f3c5ae05 100644 --- a/.github/workflows/ci-gitops-deployability.yml +++ b/.github/workflows/ci-gitops-deployability.yml @@ -48,5 +48,5 @@ jobs: set -euo pipefail for rendered in /tmp/deploy_core-production.yaml /tmp/deploy_core-staging.yaml /tmp/deploy_core-geddes2.yaml /tmp/deploy_experimental.yaml; do echo "Validating $rendered" - kubeconform -summary -strict -ignore-missing-schemas "$rendered" + kubeconform -summary -strict -ignore-missing-schemas -skip Secret "$rendered" done diff --git a/.github/workflows/ci-workflow-integrity.yml b/.github/workflows/ci-workflow-integrity.yml index b364d826..8bb39d1c 100644 --- a/.github/workflows/ci-workflow-integrity.yml +++ b/.github/workflows/ci-workflow-integrity.yml @@ -13,14 +13,10 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Install actionlint - run: | - set -euo pipefail - bash <(curl -sSfL https://raw.githubusercontent.com/rhysd/actionlint/main/scripts/download-actionlint.bash) 1.7.4 - echo "$PWD/bin" >> "$GITHUB_PATH" - - name: Run actionlint (advisory) - run: actionlint -color + uses: rhysd/actionlint@v1 + with: + args: -color - name: Validate workflow YAML parse (advisory) run: | diff --git a/.github/workflows/lint-docker.yml b/.github/workflows/lint-docker.yml index 8ce41627..ac83bec6 100644 --- a/.github/workflows/lint-docker.yml +++ b/.github/workflows/lint-docker.yml @@ -26,10 +26,12 @@ jobs: - '.github/scripts/container-smoke.sh' interlink_slurm_plugin: - 'docker/interlink-slurm-plugin/**' + - 'slurm/**' - '.github/workflows/lint-docker.yml' - '.github/scripts/container-smoke.sh' purdue_af: - 'docker/purdue-af/**' + - 'slurm/**' - '.github/workflows/lint-docker.yml' - '.github/scripts/container-smoke.sh' @@ -63,7 +65,7 @@ jobs: - uses: actions/checkout@v4 - name: Build af-pod-monitor image (advisory) - run: docker build -f docker/af-pod-monitor/Dockerfile -t local/af-pod-monitor:${{ github.sha }} . + run: docker build -f docker/af-pod-monitor/Dockerfile -t local/af-pod-monitor:${{ github.sha }} docker/af-pod-monitor - name: Smoke test af-pod-monitor image (advisory) run: .github/scripts/container-smoke.sh local/af-pod-monitor:${{ github.sha }} af-pod-monitor diff --git a/.github/workflows/lint-shell.yml b/.github/workflows/lint-shell.yml index 7c099240..edcac463 100644 --- a/.github/workflows/lint-shell.yml +++ b/.github/workflows/lint-shell.yml @@ -41,7 +41,7 @@ jobs: exit 0 fi - shellcheck "${files[@]}" + shellcheck -S error "${files[@]}" shfmt -d "${files[@]}" for f in "${files[@]}"; do diff --git a/docker/purdue-af/genaistudio/genaistudio.py b/docker/purdue-af/genaistudio/genaistudio.py index 965a65a7..cbc768fe 100644 --- a/docker/purdue-af/genaistudio/genaistudio.py +++ b/docker/purdue-af/genaistudio/genaistudio.py @@ -1,5 +1,4 @@ -from jupyter_ai_magics.providers import (BaseProvider, EnvAuthStrategy, - TextField) +from jupyter_ai_magics.providers import BaseProvider, EnvAuthStrategy, TextField from langchain_openai import ChatOpenAI @@ -20,7 +19,9 @@ class PurdueGenAIStudioProvider(BaseProvider, ChatOpenAI): ) def __init__(self, **kwargs): - super().__init__(openai_api_base="https://genai.rcac.purdue.edu/api", **kwargs) + super().__init__( + openai_api_base="https://genai.rcac.purdue.edu/api", **kwargs + ) @classmethod def is_api_key_exc(cls, e: Exception): diff --git a/docker/purdue-af/scripts/eos-connect.sh b/docker/purdue-af/scripts/eos-connect.sh index 055e8984..33d51ec8 100644 --- a/docker/purdue-af/scripts/eos-connect.sh +++ b/docker/purdue-af/scripts/eos-connect.sh @@ -31,7 +31,7 @@ if [[ $krb_ticket = "" ]]; then echo " > Kerberos authentication failed!" echo "" return 1 - else: + else echo " > Kerberos authentication complete!" echo "" fi From 1b2c2ba30672c424c6796828e7207d4469c827a4 Mon Sep 17 00:00:00 2001 From: Dmitry Kondratyev Date: Wed, 4 Feb 2026 09:38:58 -0500 Subject: [PATCH 05/25] Fix remaining CI failures in integrity, shell, python, and af-pod-monitor --- .github/workflows/ci-workflow-integrity.yml | 11 ++++++++--- .github/workflows/lint-shell.yml | 2 +- docker/af-pod-monitor/Dockerfile | 4 ++-- docker/purdue-af/genaistudio/genaistudio.py | 4 +--- 4 files changed, 12 insertions(+), 9 deletions(-) diff --git a/.github/workflows/ci-workflow-integrity.yml b/.github/workflows/ci-workflow-integrity.yml index 8bb39d1c..deacfa94 100644 --- a/.github/workflows/ci-workflow-integrity.yml +++ b/.github/workflows/ci-workflow-integrity.yml @@ -13,10 +13,15 @@ jobs: steps: - uses: actions/checkout@v4 + - name: Install actionlint + run: | + set -euo pipefail + mkdir -p "$HOME/.local/bin" + bash <(curl -sSfL https://raw.githubusercontent.com/rhysd/actionlint/main/scripts/download-actionlint.bash) 1.7.4 "$HOME/.local/bin" + echo "$HOME/.local/bin" >> "$GITHUB_PATH" + - name: Run actionlint (advisory) - uses: rhysd/actionlint@v1 - with: - args: -color + run: actionlint -color - name: Validate workflow YAML parse (advisory) run: | diff --git a/.github/workflows/lint-shell.yml b/.github/workflows/lint-shell.yml index edcac463..adfe868e 100644 --- a/.github/workflows/lint-shell.yml +++ b/.github/workflows/lint-shell.yml @@ -41,7 +41,7 @@ jobs: exit 0 fi - shellcheck -S error "${files[@]}" + shellcheck -S error -s bash "${files[@]}" shfmt -d "${files[@]}" for f in "${files[@]}"; do diff --git a/docker/af-pod-monitor/Dockerfile b/docker/af-pod-monitor/Dockerfile index f03b2fba..c6f05b0b 100644 --- a/docker/af-pod-monitor/Dockerfile +++ b/docker/af-pod-monitor/Dockerfile @@ -2,10 +2,10 @@ FROM python:3.8-slim WORKDIR /etc/ -RUN pip install --no-cache-dir prometheus_client==0.22.1 +RUN pip install --no-cache-dir prometheus_client==0.21.1 COPY pod-metrics-exporter.py /etc/ RUN chmod +x /etc/pod-metrics-exporter.py -CMD ["python", "/etc/pod-metrics-exporter.py"] \ No newline at end of file +CMD ["python", "/etc/pod-metrics-exporter.py"] diff --git a/docker/purdue-af/genaistudio/genaistudio.py b/docker/purdue-af/genaistudio/genaistudio.py index cbc768fe..832d3be2 100644 --- a/docker/purdue-af/genaistudio/genaistudio.py +++ b/docker/purdue-af/genaistudio/genaistudio.py @@ -19,9 +19,7 @@ class PurdueGenAIStudioProvider(BaseProvider, ChatOpenAI): ) def __init__(self, **kwargs): - super().__init__( - openai_api_base="https://genai.rcac.purdue.edu/api", **kwargs - ) + super().__init__(openai_api_base="https://genai.rcac.purdue.edu/api", **kwargs) @classmethod def is_api_key_exc(cls, e: Exception): From 9b769d0e9f9942ab372ae8940ac8c7853ddca2b2 Mon Sep 17 00:00:00 2001 From: Dmitry Kondratyev Date: Wed, 4 Feb 2026 09:47:57 -0500 Subject: [PATCH 06/25] Align python and shell lint checks with baseline style --- .github/scripts/container-smoke.sh | 30 +++++++++++++++--------------- .github/workflows/lint-python.yml | 2 +- .github/workflows/lint-shell.yml | 5 ++++- 3 files changed, 20 insertions(+), 17 deletions(-) diff --git a/.github/scripts/container-smoke.sh b/.github/scripts/container-smoke.sh index cf1bb967..6fca637d 100755 --- a/.github/scripts/container-smoke.sh +++ b/.github/scripts/container-smoke.sh @@ -2,8 +2,8 @@ set -euo pipefail if [ "$#" -ne 2 ]; then - echo "Usage: $0 " >&2 - exit 2 + echo "Usage: $0 " >&2 + exit 2 fi image="$1" @@ -12,19 +12,19 @@ profile="$2" docker image inspect "$image" >/dev/null case "$profile" in - af-pod-monitor) - docker run --rm --entrypoint python "$image" -c "import prometheus_client" - ;; - interlink-slurm-plugin) - docker run --rm --entrypoint /bin/sh "$image" -lc 'test -x /sidecar/slurm-sidecar' - ;; - purdue-af) - docker run --rm --entrypoint /bin/bash "$image" -lc 'python --version && jupyter --version >/dev/null' - ;; - *) - echo "Unknown profile: $profile" >&2 - exit 2 - ;; +af-pod-monitor) + docker run --rm --entrypoint python "$image" -c "import prometheus_client" + ;; +interlink-slurm-plugin) + docker run --rm --entrypoint /bin/sh "$image" -lc 'test -x /sidecar/slurm-sidecar' + ;; +purdue-af) + docker run --rm --entrypoint /bin/bash "$image" -lc 'python --version && jupyter --version >/dev/null' + ;; +*) + echo "Unknown profile: $profile" >&2 + exit 2 + ;; esac echo "Smoke checks passed for profile: $profile" diff --git a/.github/workflows/lint-python.yml b/.github/workflows/lint-python.yml index 0f4d8686..452b39d9 100644 --- a/.github/workflows/lint-python.yml +++ b/.github/workflows/lint-python.yml @@ -42,7 +42,7 @@ jobs: fi black --check "${files[@]}" - isort --check-only "${files[@]}" + isort --profile black --check-only "${files[@]}" python -m py_compile "${files[@]}" - name: Run pytest (advisory) diff --git a/.github/workflows/lint-shell.yml b/.github/workflows/lint-shell.yml index adfe868e..d12762ed 100644 --- a/.github/workflows/lint-shell.yml +++ b/.github/workflows/lint-shell.yml @@ -42,7 +42,10 @@ jobs: fi shellcheck -S error -s bash "${files[@]}" - shfmt -d "${files[@]}" + mapfile -t shfmt_files < <(printf '%s\n' "${files[@]}" | grep -E '\.sh$' || true) + if [ "${#shfmt_files[@]}" -gt 0 ]; then + shfmt -d "${shfmt_files[@]}" + fi for f in "${files[@]}"; do bash -n "$f" From 05beb618a2da0ada3ef787a23f3c20d54b3fc77d Mon Sep 17 00:00:00 2001 From: Dmitry Kondratyev Date: Wed, 4 Feb 2026 10:31:04 -0500 Subject: [PATCH 07/25] Refresh CI plan to current optimization workstreams --- .codex/CI_PLAN.md | 172 ++++++++++++++++++++++++---------------------- 1 file changed, 90 insertions(+), 82 deletions(-) diff --git a/.codex/CI_PLAN.md b/.codex/CI_PLAN.md index 63e5d390..584661bd 100644 --- a/.codex/CI_PLAN.md +++ b/.codex/CI_PLAN.md @@ -1,18 +1,20 @@ # CI/CD Campaign Plan (Current State) -## Mission and Success Criteria -Deliver exactly one draft PR from `codex/ci` to `main` with minimal CI/CD hardening that: -- converts formatter-based CI to check-only behavior, -- adds advisory-first integrity/deploy/security coverage, -- keeps one source of truth in this file, -- preserves safe daily branch sync (`main` merged into `codex/ci`, no force-push). - -Success means: -- `.github/workflows/lint-*.yml` workflows are check-only and run on `pull_request` (single run per change), -- new workflows exist for integrity, GitOps deployability, and nightly advisory security, -- optional repo-quality workflow is selected and included, -- README shows A-E category badges, -- no changes touch out-of-scope paths. +## Mission +Deliver one draft PR from `codex/ci` to `main` with a stable advisory-first CI baseline, then optimize test depth, integration realism, and security signal without broad refactors. + +## Current Status +- PR branch: `codex/ci` +- Delivery model: single PR `codex/ci -> main` +- Existing CI baseline is green on PR checks. +- Formatter/linter workflows are check-only (no CI writeback commits). + +## Success Criteria +- CI remains stable on `pull_request` runs for all configured workflows. +- Optimization phase adds meaningful unit and integration coverage for repo-owned code. +- Security checks include nightly advisory plus PR-time advisory signal. +- `README.md` keeps A-E category badges aligned with active workflows. +- `.codex/CI_PLAN.md` remains the single source of truth. ## In-Scope / Out-of-Scope Paths In scope: @@ -32,78 +34,84 @@ Out of scope: - `.cursor/**` Approved exception: -- `slurm/**` is used as a dependency-only trigger for container reliability jobs because maintained Dockerfiles copy `slurm/` artifacts. - -## Target Check Architecture +- `slurm/**` is used as a dependency-only trigger in container reliability path filters because maintained Dockerfiles copy `slurm/` artifacts. + +## Active Workflow Surface +- `.github/workflows/ci-workflow-integrity.yml` +- `.github/workflows/lint-python.yml` +- `.github/workflows/lint-shell.yml` +- `.github/workflows/lint-json.yml` +- `.github/workflows/lint-yaml.yml` +- `.github/workflows/ci-repo-quality.yml` +- `.github/workflows/lint-docker.yml` +- `.github/workflows/ci-gitops-deployability.yml` +- `.github/workflows/nightly-security-advisory.yml` + +## Check Architecture ### A) CI System Integrity (advisory) -- Workflow: `.github/workflows/ci-workflow-integrity.yml` -- Checks: `actionlint` + workflow YAML parse. -- Risk mapped: malformed workflows, invalid action definitions, skipped CI due syntax/runtime issues. - -### B) Repo-Owned Code Quality / Tests (advisory additions) -- Workflows: - - `.github/workflows/lint-python.yml` - - `.github/workflows/lint-shell.yml` - - `.github/workflows/lint-json.yml` - - `.github/workflows/lint-yaml.yml` - - `.github/workflows/ci-repo-quality.yml` (selected) -- Checks: black/isort check-only, py_compile, pytest (advisory), shellcheck/shfmt/bash -n, JSON/YAML parse checks. -- Risk mapped: runtime and script regressions. - -### C) Container Reliability (advisory additions) -- Workflow: `.github/workflows/lint-docker.yml` -- Checks: hadolint (check-only), advisory docker build/smoke for maintained Dockerfiles via `.github/scripts/container-smoke.sh`. -- Risk mapped: container build/runtime breakage. - -### D) GitOps/K8s Deployability (advisory) -- Workflow: `.github/workflows/ci-gitops-deployability.yml` -- Checks: `kustomize build --load-restrictor LoadRestrictionsNone` for all deploy overlays + `kubeconform` schema validation. -- Risk mapped: Flux reconciliation failures from invalid manifests. - -### E) Nightly Advisory Security -- Workflow: `.github/workflows/nightly-security-advisory.yml` -- Checks: Trivy filesystem scan (HIGH/CRITICAL). -- Risk mapped: security posture drift. - -## Advisory vs Future Blocking Milestones -- M0 (this campaign): all newly introduced validations advisory. -- M1: promote workflow integrity + repo-quality checks to blocking after stable baseline. -- M2: promote container + GitOps checks to blocking after stable baseline. -- M3: keep nightly security advisory unless explicitly promoted. - -## Agent Lane Ownership (File Level) -- Coordinator: `.codex/CI_PLAN.md`, `README.md`, branch/PR/sync operations. -- Agent A: `.github/workflows/ci-workflow-integrity.yml` (+ selection recommendation in chat). -- Agent B: `.github/workflows/lint-python.yml`, `.github/workflows/lint-shell.yml`, `.github/workflows/ci-repo-quality.yml`, optional B helper scripts. -- Agent C: `.github/workflows/lint-json.yml`, `.github/workflows/lint-yaml.yml`. -- Agent D: `.github/workflows/lint-docker.yml`, `.github/scripts/container-smoke.sh`. -- Agent E: `.github/workflows/ci-gitops-deployability.yml`, `.github/workflows/nightly-security-advisory.yml`. - -## Phased Rollout and Rollback -Rollout: -1. First commit creates this file. -2. Add/convert workflows in lane-owned files only. -3. Keep PR draft until baseline checks stabilize. -4. Daily sync by merging `main` into `codex/ci`. - -Rollback: -- Revert only unstable workflow files in small commits. -- Keep advisory mode active during stabilization. - -## Reproducible Runbook (from clean main) -1. `git fetch origin` -2. `git switch main && git pull --ff-only origin main` -3. `git switch -c codex/ci` (or `git switch codex/ci`) -4. Commit #1: `.codex/CI_PLAN.md` -5. Apply lane-scoped workflow changes -6. `git push -u origin codex/ci` -7. Open one draft PR `codex/ci -> main` -8. Daily sync: `git fetch origin && git switch codex/ci && git merge --no-ff origin/main` +- Workflow: `ci-workflow-integrity.yml` +- Checks: actionlint + workflow YAML parse. +- Risk: broken workflow definitions and silent CI drift. + +### B) Repo Quality and Tests (advisory) +- Workflows: `lint-python.yml`, `lint-shell.yml`, `lint-json.yml`, `lint-yaml.yml`, `ci-repo-quality.yml` +- Checks: black/isort check-only, py_compile, pytest advisory, shellcheck/shfmt/bash -n, JSON/YAML parse. +- Risk: script/runtime regressions. + +### C) Container Reliability (advisory) +- Workflow: `lint-docker.yml` +- Checks: hadolint, targeted docker build jobs, smoke checks via `.github/scripts/container-smoke.sh`. +- Risk: image build/runtime regressions. + +### D) GitOps Deployability (advisory) +- Workflow: `ci-gitops-deployability.yml` +- Checks: kustomize render + kubeconform schema validation. +- Risk: Flux reconciliation failures from invalid manifests. + +### E) Security Posture (advisory) +- Workflow: `nightly-security-advisory.yml` +- Checks: nightly Trivy filesystem scan. +- Risk: security drift in dependencies/configuration. + +## Optimization Workstreams (Current) +### Worker 1: Coverage Optimizer +File lane: +- `tests/unit/**` +- `tests/conftest.py` +- `.github/workflows/lint-python.yml` +- `.github/workflows/ci-repo-quality.yml` +Goal: +- Increase meaningful Python test coverage and publish coverage in CI (advisory threshold first). + +### Worker 2: Integration Scenarios +File lane: +- `tests/integration/**` +- `tests/fixtures/**` +- `.github/workflows/ci-integration-scenarios.yml` (new) +- `.github/scripts/integration/**` +Goal: +- Add realistic automated integration scenarios with deterministic mocks and PR advisory execution. + +### Worker 3: Security and Runtime Optimizer +File lane: +- `.github/workflows/nightly-security-advisory.yml` +- `.github/workflows/ci-security-advisory.yml` (new) +- `.github/workflows/lint-docker.yml` +- `.github/workflows/ci-gitops-deployability.yml` +Goal: +- Add PR-time advisory security checks and reduce CI runtime/noise safely. + +## Branch and Sync Rules +- No side branches. +- No force-push on shared campaign work. +- Daily sync: merge `main` into `codex/ci` (no rebase). +- Keep PR draft until optimization baseline is stable. ## Constraint Challenge Protocol -If a hard constraint appears to conflict with delivery, create an `EXCEPTION REQUEST` with: +If any hard constraint must be challenged, submit an `EXCEPTION REQUEST` with: 1) challenged constraint, 2) concrete risk if unchanged, -3) minimal exception, +3) minimal exception requested, 4) rollback path. -Do not implement exception changes before explicit user approval. + +No exception is implemented without explicit user approval. From 43e0d63bf1db3290b92c577e73b33bf7b9d21a7b Mon Sep 17 00:00:00 2001 From: Dmitry Kondratyev Date: Wed, 4 Feb 2026 11:08:50 -0500 Subject: [PATCH 08/25] ci(worker2): add scoped integration scenarios and fixture matrix --- .../scripts/integration/mock-docker-cli.sh | 45 ++++++++ .../integration/run-integration-scenarios.sh | 7 ++ .../workflows/ci-integration-scenarios.yml | 31 ++++++ tests/fixtures/container_smoke/matrix.json | 102 ++++++++++++++++++ .../fixtures/monitoring/event_rate_cases.json | 12 +++ .../monitoring/event_rate_invalid.txt | 1 + .../fixtures/monitoring/event_rate_valid.txt | 1 + .../fixtures/node_health/checksum_cases.json | 57 ++++++++++ tests/integration/common.py | 90 ++++++++++++++++ .../test_container_smoke_matrix.py | 68 ++++++++++++ .../test_monitoring_metric_update.py | 59 ++++++++++ .../test_node_healthcheck_integration.py | 85 +++++++++++++++ 12 files changed, 558 insertions(+) create mode 100755 .github/scripts/integration/mock-docker-cli.sh create mode 100755 .github/scripts/integration/run-integration-scenarios.sh create mode 100644 .github/workflows/ci-integration-scenarios.yml create mode 100644 tests/fixtures/container_smoke/matrix.json create mode 100644 tests/fixtures/monitoring/event_rate_cases.json create mode 100644 tests/fixtures/monitoring/event_rate_invalid.txt create mode 100644 tests/fixtures/monitoring/event_rate_valid.txt create mode 100644 tests/fixtures/node_health/checksum_cases.json create mode 100644 tests/integration/common.py create mode 100644 tests/integration/test_container_smoke_matrix.py create mode 100644 tests/integration/test_monitoring_metric_update.py create mode 100644 tests/integration/test_node_healthcheck_integration.py diff --git a/.github/scripts/integration/mock-docker-cli.sh b/.github/scripts/integration/mock-docker-cli.sh new file mode 100755 index 00000000..04969cda --- /dev/null +++ b/.github/scripts/integration/mock-docker-cli.sh @@ -0,0 +1,45 @@ +#!/usr/bin/env bash +set -euo pipefail + +if [ -n "${MOCK_DOCKER_LOG:-}" ]; then + printf '%s\n' "$*" >> "$MOCK_DOCKER_LOG" +fi + +cmd="${1:-}" +shift || true + +case "$cmd" in + image) + subcmd="${1:-}" + shift || true + if [ "$subcmd" != "inspect" ]; then + echo "mock docker unsupported image subcommand: $subcmd" >&2 + exit 64 + fi + + if [ -n "${MOCK_DOCKER_INSPECT_STDOUT:-}" ]; then + printf '%s\n' "$MOCK_DOCKER_INSPECT_STDOUT" + fi + if [ -n "${MOCK_DOCKER_INSPECT_STDERR:-}" ]; then + printf '%s\n' "$MOCK_DOCKER_INSPECT_STDERR" >&2 + fi + + exit "${MOCK_DOCKER_INSPECT_EXIT:-0}" + ;; + + run) + if [ -n "${MOCK_DOCKER_RUN_STDOUT:-}" ]; then + printf '%s\n' "$MOCK_DOCKER_RUN_STDOUT" + fi + if [ -n "${MOCK_DOCKER_RUN_STDERR:-}" ]; then + printf '%s\n' "$MOCK_DOCKER_RUN_STDERR" >&2 + fi + + exit "${MOCK_DOCKER_RUN_EXIT:-0}" + ;; + + *) + echo "mock docker unsupported command: $cmd" >&2 + exit 64 + ;; +esac diff --git a/.github/scripts/integration/run-integration-scenarios.sh b/.github/scripts/integration/run-integration-scenarios.sh new file mode 100755 index 00000000..20190685 --- /dev/null +++ b/.github/scripts/integration/run-integration-scenarios.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +set -euo pipefail + +repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." && pwd)" + +cd "$repo_root" +python3 -m unittest discover -s tests/integration -p 'test_*.py' -v diff --git a/.github/workflows/ci-integration-scenarios.yml b/.github/workflows/ci-integration-scenarios.yml new file mode 100644 index 00000000..2908b38d --- /dev/null +++ b/.github/workflows/ci-integration-scenarios.yml @@ -0,0 +1,31 @@ +name: CI Integration Scenarios + +on: + pull_request: + paths: + - tests/integration/** + - tests/fixtures/** + - .github/scripts/integration/** + - .github/scripts/container-smoke.sh + - apps/monitoring/af-monitoring/metrics_server.py + - apps/monitoring/af-monitoring/node_healthcheck.py + - .github/workflows/ci-integration-scenarios.yml + +permissions: + contents: read + +jobs: + integration-scenarios: + runs-on: ubuntu-latest + continue-on-error: true + timeout-minutes: 10 + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Run integration scenarios (advisory) + run: bash .github/scripts/integration/run-integration-scenarios.sh diff --git a/tests/fixtures/container_smoke/matrix.json b/tests/fixtures/container_smoke/matrix.json new file mode 100644 index 00000000..78619afd --- /dev/null +++ b/tests/fixtures/container_smoke/matrix.json @@ -0,0 +1,102 @@ +[ + { + "name": "af_pod_monitor_success", + "image": "ghcr.io/purdue-af/af-pod-monitor:test", + "profile": "af-pod-monitor", + "mock": { + "inspect_exit": 0, + "run_exit": 0 + }, + "expected": { + "exit_code": 0, + "stdout_contains": [ + "Smoke checks passed for profile: af-pod-monitor" + ], + "stderr_contains": [], + "log_lines": [ + "image inspect ghcr.io/purdue-af/af-pod-monitor:test", + "run --rm --entrypoint python ghcr.io/purdue-af/af-pod-monitor:test -c import prometheus_client" + ] + } + }, + { + "name": "interlink_slurm_plugin_failure", + "image": "ghcr.io/purdue-af/interlink-slurm-plugin:test", + "profile": "interlink-slurm-plugin", + "mock": { + "inspect_exit": 0, + "run_exit": 1, + "run_stderr": "missing /sidecar/slurm-sidecar" + }, + "expected": { + "exit_code": 1, + "stdout_contains": [], + "stderr_contains": [ + "missing /sidecar/slurm-sidecar" + ], + "log_lines": [ + "image inspect ghcr.io/purdue-af/interlink-slurm-plugin:test", + "run --rm --entrypoint /bin/sh ghcr.io/purdue-af/interlink-slurm-plugin:test -lc test -x /sidecar/slurm-sidecar" + ] + } + }, + { + "name": "unknown_profile_rejected", + "image": "ghcr.io/purdue-af/custom:test", + "profile": "unknown-profile", + "mock": { + "inspect_exit": 0, + "run_exit": 0 + }, + "expected": { + "exit_code": 2, + "stdout_contains": [], + "stderr_contains": [ + "Unknown profile: unknown-profile" + ], + "log_lines": [ + "image inspect ghcr.io/purdue-af/custom:test" + ] + } + }, + { + "name": "inspect_failure_short_circuit", + "image": "ghcr.io/purdue-af/purdue-af:test", + "profile": "purdue-af", + "mock": { + "inspect_exit": 1, + "inspect_stderr": "image not found", + "run_exit": 0 + }, + "expected": { + "exit_code": 1, + "stdout_contains": [], + "stderr_contains": [ + "image not found" + ], + "log_lines": [ + "image inspect ghcr.io/purdue-af/purdue-af:test" + ] + } + }, + { + "name": "purdue_af_success", + "image": "ghcr.io/purdue-af/purdue-af:test", + "profile": "purdue-af", + "mock": { + "inspect_exit": 0, + "run_exit": 0 + }, + "expected": { + "exit_code": 0, + "stdout_contains": [ + "Smoke checks passed for profile: purdue-af" + ], + "stderr_contains": [], + "log_lines": [ + "image inspect ghcr.io/purdue-af/purdue-af:test", + "run --rm --entrypoint /bin/bash ghcr.io/purdue-af/purdue-af:test -lc python --version && jupyter --version >/dev/null" + ] + } + } +] diff --git a/tests/fixtures/monitoring/event_rate_cases.json b/tests/fixtures/monitoring/event_rate_cases.json new file mode 100644 index 00000000..785a3de0 --- /dev/null +++ b/tests/fixtures/monitoring/event_rate_cases.json @@ -0,0 +1,12 @@ +[ + { + "name": "valid_event_rate", + "fixture_file": "event_rate_valid.txt", + "expected_gauge_value": 128.5 + }, + { + "name": "invalid_event_rate", + "fixture_file": "event_rate_invalid.txt", + "expected_gauge_value": 0 + } +] diff --git a/tests/fixtures/monitoring/event_rate_invalid.txt b/tests/fixtures/monitoring/event_rate_invalid.txt new file mode 100644 index 00000000..3fb64bad --- /dev/null +++ b/tests/fixtures/monitoring/event_rate_invalid.txt @@ -0,0 +1 @@ +not-a-number diff --git a/tests/fixtures/monitoring/event_rate_valid.txt b/tests/fixtures/monitoring/event_rate_valid.txt new file mode 100644 index 00000000..ae500ed9 --- /dev/null +++ b/tests/fixtures/monitoring/event_rate_valid.txt @@ -0,0 +1 @@ +128.5 diff --git a/tests/fixtures/node_health/checksum_cases.json b/tests/fixtures/node_health/checksum_cases.json new file mode 100644 index 00000000..954d99b8 --- /dev/null +++ b/tests/fixtures/node_health/checksum_cases.json @@ -0,0 +1,57 @@ +[ + { + "name": "checksum_match", + "mode": "normal", + "filename": "/depot/cms/purdue-af/validate-mount.txt", + "expected_checksum": "13dede34ee8dc7e5b70c9cd06ac15467", + "md5_stdout": "13dede34ee8dc7e5b70c9cd06ac15467 /depot/cms/purdue-af/validate-mount.txt\n", + "md5_stderr": "", + "returncode": 0, + "start_time": 1000.0, + "end_time": 1000.123, + "expected_result": true, + "expected_ping_ms": 123.0, + "expect_killed": false + }, + { + "name": "checksum_mismatch", + "mode": "normal", + "filename": "/work/projects/purdue-af/validate-mount.txt", + "expected_checksum": "f4cb7f2740ba3e87edfbda6c70fa94c2", + "md5_stdout": "00000000000000000000000000000000 /work/projects/purdue-af/validate-mount.txt\n", + "md5_stderr": "", + "returncode": 0, + "start_time": 2000.0, + "end_time": 2000.05, + "expected_result": false, + "expected_ping_ms": 50.0, + "expect_killed": false + }, + { + "name": "md5_error_returncode", + "mode": "normal", + "filename": "/eos/purdue/store/user/dkondrat/test.root", + "expected_checksum": "18864b0de8ae5a6a8d3b459a7999b431", + "md5_stdout": "", + "md5_stderr": "No such file or directory", + "returncode": 1, + "start_time": 3000.0, + "end_time": 3000.08, + "expected_result": false, + "expected_ping_ms": 80.0, + "expect_killed": false + }, + { + "name": "md5_timeout", + "mode": "timeout", + "filename": "/cvmfs/cms.cern.ch/SITECONF/T2_US_Purdue/Purdue-Hadoop/JobConfig/site-local-config.xml", + "expected_checksum": "3b570d80272b7188c13cef51e58b7151", + "md5_stdout": "", + "md5_stderr": "", + "returncode": 124, + "start_time": 4000.0, + "expected_result": false, + "expected_ping_ms": 3000, + "expect_killed": true + } +] diff --git a/tests/integration/common.py b/tests/integration/common.py new file mode 100644 index 00000000..9ad1358a --- /dev/null +++ b/tests/integration/common.py @@ -0,0 +1,90 @@ +from __future__ import annotations + +import importlib.util +import json +import sys +import types +from pathlib import Path +from typing import Any + +REPO_ROOT = Path(__file__).resolve().parents[2] +FIXTURES_ROOT = REPO_ROOT / "tests" / "fixtures" + + +class FakeGaugeChild: + def __init__(self, labels: dict[str, str]): + self.labels = labels + self.value: float | int | None = None + self.history: list[float | int] = [] + + def set(self, value: float | int) -> None: + self.value = value + self.history.append(value) + + +class FakeGauge: + def __init__( + self, + name: str, + description: str, + label_names: list[str] | tuple[str, ...] | None = None, + ): + self.name = name + self.description = description + self.label_names = tuple(label_names or ()) + self.value: float | int | None = None + self.history: list[float | int] = [] + self.children: dict[tuple[tuple[str, str], ...], FakeGaugeChild] = {} + + def set(self, value: float | int) -> None: + self.value = value + self.history.append(value) + + def labels(self, *args: str, **kwargs: str) -> FakeGaugeChild: + if args and kwargs: + raise ValueError("labels accepts positional or keyword labels, not both") + + if args: + if len(args) != len(self.label_names): + raise ValueError("label count does not match") + label_values = dict(zip(self.label_names, args)) + else: + label_values = {name: kwargs[name] for name in self.label_names} + + key = tuple((name, label_values[name]) for name in self.label_names) + child = self.children.get(key) + if child is None: + child = FakeGaugeChild(label_values) + self.children[key] = child + return child + + +def load_json_fixture(relative_path: str) -> Any: + fixture_path = FIXTURES_ROOT / relative_path + with fixture_path.open("r", encoding="utf-8") as f: + return json.load(f) + + +def load_module_with_fake_prometheus(relative_path: str, module_name: str): + module_path = REPO_ROOT / relative_path + spec = importlib.util.spec_from_file_location(module_name, module_path) + if spec is None or spec.loader is None: + raise RuntimeError(f"Could not load module spec for {module_path}") + + module = importlib.util.module_from_spec(spec) + + fake_prometheus = types.ModuleType("prometheus_client") + fake_prometheus.Gauge = FakeGauge + fake_prometheus.start_http_server = lambda *_args, **_kwargs: None + + original_prometheus = sys.modules.get("prometheus_client") + sys.modules["prometheus_client"] = fake_prometheus + try: + spec.loader.exec_module(module) + finally: + if original_prometheus is None: + del sys.modules["prometheus_client"] + else: + sys.modules["prometheus_client"] = original_prometheus + + return module diff --git a/tests/integration/test_container_smoke_matrix.py b/tests/integration/test_container_smoke_matrix.py new file mode 100644 index 00000000..b3215c6f --- /dev/null +++ b/tests/integration/test_container_smoke_matrix.py @@ -0,0 +1,68 @@ +from __future__ import annotations + +import os +from pathlib import Path +import subprocess +import tempfile +import unittest + +from common import REPO_ROOT, load_json_fixture + +CONTAINER_SMOKE_SCRIPT = REPO_ROOT / ".github/scripts/container-smoke.sh" +MOCK_DOCKER_SCRIPT = REPO_ROOT / ".github/scripts/integration/mock-docker-cli.sh" + + +class ContainerSmokeBehaviorMatrixIntegrationTest(unittest.TestCase): + def test_container_smoke_behavior_matrix(self) -> None: + cases = load_json_fixture("container_smoke/matrix.json") + + for case in cases: + with self.subTest(case=case["name"]): + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + docker_wrapper = temp_path / "docker" + docker_wrapper.write_text( + f"#!/usr/bin/env bash\nexec \"{MOCK_DOCKER_SCRIPT}\" \"$@\"\n", + encoding="utf-8", + ) + docker_wrapper.chmod(0o755) + + log_file = temp_path / "docker.log" + env = os.environ.copy() + env["PATH"] = f"{temp_path}:{env.get('PATH', '')}" + env["MOCK_DOCKER_LOG"] = str(log_file) + env["MOCK_DOCKER_INSPECT_EXIT"] = str(case["mock"]["inspect_exit"]) + env["MOCK_DOCKER_RUN_EXIT"] = str(case["mock"]["run_exit"]) + env["MOCK_DOCKER_INSPECT_STDERR"] = case["mock"].get( + "inspect_stderr", "" + ) + env["MOCK_DOCKER_RUN_STDERR"] = case["mock"].get("run_stderr", "") + + result = subprocess.run( + [ + "bash", + str(CONTAINER_SMOKE_SCRIPT), + case["image"], + case["profile"], + ], + capture_output=True, + text=True, + check=False, + env=env, + ) + + expected = case["expected"] + self.assertEqual(result.returncode, expected["exit_code"]) + for expected_text in expected["stdout_contains"]: + self.assertIn(expected_text, result.stdout) + for expected_text in expected["stderr_contains"]: + self.assertIn(expected_text, result.stderr) + + logged_lines = [] + if log_file.exists(): + logged_lines = log_file.read_text(encoding="utf-8").splitlines() + self.assertEqual(logged_lines, expected["log_lines"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/integration/test_monitoring_metric_update.py b/tests/integration/test_monitoring_metric_update.py new file mode 100644 index 00000000..05b3a6fe --- /dev/null +++ b/tests/integration/test_monitoring_metric_update.py @@ -0,0 +1,59 @@ +from __future__ import annotations + +from pathlib import Path +import unittest +from unittest import mock +from uuid import uuid4 + +from common import FIXTURES_ROOT, load_json_fixture, load_module_with_fake_prometheus + +METRIC_FILE = "/work/projects/purdue-af/agc/metrics/event_rate.txt" +MODULE_PATH = "apps/monitoring/af-monitoring/metrics_server.py" + + +class MonitoringMetricUpdateFlowIntegrationTest(unittest.TestCase): + def setUp(self) -> None: + module_name = f"metrics_server_integration_{uuid4().hex}" + self.module = load_module_with_fake_prometheus(MODULE_PATH, module_name) + + def _patched_open_for_fixture(self, fixture_path: Path): + real_open = open + + def _patched_open(path, *args, **kwargs): + if str(path) == METRIC_FILE: + return real_open(fixture_path, *args, **kwargs) + return real_open(path, *args, **kwargs) + + return _patched_open + + def test_fixture_backed_metric_updates(self) -> None: + cases = load_json_fixture("monitoring/event_rate_cases.json") + + for case in cases: + fixture_path = FIXTURES_ROOT / "monitoring" / case["fixture_file"] + with self.subTest(case=case["name"]), mock.patch( + "builtins.open", + side_effect=self._patched_open_for_fixture(fixture_path), + ): + self.module.update_metrics() + self.assertEqual( + self.module.event_rate_per_worker.history[-1], + case["expected_gauge_value"], + ) + + def test_missing_metric_file_falls_back_to_zero(self) -> None: + real_open = open + + def _patched_open(path, *args, **kwargs): + if str(path) == METRIC_FILE: + raise FileNotFoundError("event rate fixture not found") + return real_open(path, *args, **kwargs) + + with mock.patch("builtins.open", side_effect=_patched_open): + self.module.update_metrics() + + self.assertEqual(self.module.event_rate_per_worker.history[-1], 0) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/integration/test_node_healthcheck_integration.py b/tests/integration/test_node_healthcheck_integration.py new file mode 100644 index 00000000..17abc15d --- /dev/null +++ b/tests/integration/test_node_healthcheck_integration.py @@ -0,0 +1,85 @@ +from __future__ import annotations + +import subprocess +import unittest +from unittest import mock +from uuid import uuid4 + +from common import load_json_fixture, load_module_with_fake_prometheus + +MODULE_PATH = "apps/monitoring/af-monitoring/node_healthcheck.py" + + +class FakeMd5Process: + def __init__(self, case: dict): + self.mode = case["mode"] + self.stdout = case["md5_stdout"] + self.stderr = case["md5_stderr"] + self.returncode = case["returncode"] + self.killed = False + self.communicate_calls = 0 + self.timeout_history: list[float | int | None] = [] + + def communicate(self, timeout=None): + self.communicate_calls += 1 + self.timeout_history.append(timeout) + if self.mode == "timeout" and self.communicate_calls == 1: + raise subprocess.TimeoutExpired(cmd="/usr/bin/md5sum", timeout=timeout) + return self.stdout, self.stderr + + def kill(self): + self.killed = True + + +class NodeHealthChecksumTimeoutIntegrationTest(unittest.TestCase): + def setUp(self) -> None: + module_name = f"node_healthcheck_integration_{uuid4().hex}" + self.module = load_module_with_fake_prometheus(MODULE_PATH, module_name) + + def test_checksum_and_timeout_matrix(self) -> None: + cases = load_json_fixture("node_health/checksum_cases.json") + + for case in cases: + process = FakeMd5Process(case) + time_values = [case["start_time"]] + if case["mode"] != "timeout": + time_values.append(case["end_time"]) + + with self.subTest(case=case["name"]), mock.patch.object( + self.module.subprocess, + "Popen", + return_value=process, + ) as popen_mock, mock.patch.object( + self.module.time, + "time", + side_effect=time_values, + ): + result, ping_ms = self.module.check_if_directory_exists( + (case["filename"], case["expected_checksum"]) + ) + + self.assertEqual(result, case["expected_result"]) + self.assertEqual(process.killed, case["expect_killed"]) + self.assertEqual( + popen_mock.call_args[0][0], + ["/usr/bin/md5sum", case["filename"]], + ) + if case["mode"] == "timeout": + self.assertEqual(process.timeout_history, [3, None]) + else: + self.assertEqual(process.timeout_history, [3]) + + expected_ping_ms = case["expected_ping_ms"] + if isinstance(expected_ping_ms, float): + self.assertAlmostEqual(ping_ms, expected_ping_ms, delta=0.001) + else: + self.assertEqual(ping_ms, expected_ping_ms) + + if case["mode"] == "timeout": + self.assertEqual(process.communicate_calls, 2) + else: + self.assertEqual(process.communicate_calls, 1) + + +if __name__ == "__main__": + unittest.main() From 4ca5aa81fbbbb3465b9cee824b818ad430ab5a8d Mon Sep 17 00:00:00 2001 From: Dmitry Kondratyev Date: Wed, 4 Feb 2026 11:15:26 -0500 Subject: [PATCH 09/25] ci(worker1): tighten unit coverage flow and clean transient test artifacts --- .github/workflows/ci-repo-quality.yml | 10 ++- .github/workflows/lint-python.yml | 14 +--- tests/conftest.py | 75 +++++++++++++++++ tests/unit/test_docker_healthcheck.py | 62 ++++++++++++++ tests/unit/test_metrics_server.py | 37 ++++++++ tests/unit/test_node_healthcheck.py | 107 ++++++++++++++++++++++++ tests/unit/test_pod_metrics_exporter.py | 87 +++++++++++++++++++ 7 files changed, 377 insertions(+), 15 deletions(-) create mode 100644 tests/conftest.py create mode 100644 tests/unit/test_docker_healthcheck.py create mode 100644 tests/unit/test_metrics_server.py create mode 100644 tests/unit/test_node_healthcheck.py create mode 100644 tests/unit/test_pod_metrics_exporter.py diff --git a/.github/workflows/ci-repo-quality.yml b/.github/workflows/ci-repo-quality.yml index 247108bd..2af6c26c 100644 --- a/.github/workflows/ci-repo-quality.yml +++ b/.github/workflows/ci-repo-quality.yml @@ -22,7 +22,7 @@ jobs: run: | set -euo pipefail python -m pip install --upgrade pip - pip install pytest + pip install pytest pytest-cov - name: Python syntax smoke (advisory) shell: bash @@ -61,7 +61,13 @@ jobs: shell: bash run: | set +e - pytest -q + pytest -q tests/unit \ + --cov=apps/monitoring/af-monitoring/metrics_server.py \ + --cov=apps/monitoring/af-monitoring/node_healthcheck.py \ + --cov=docker/af-pod-monitor/pod-metrics-exporter.py \ + --cov=docker/purdue-af/jupyter/docker_healthcheck.py \ + --cov-report=term-missing \ + --cov-fail-under=70 rc=$? if [ "$rc" -eq 5 ]; then echo 'pytest collected no tests; treating as informational.' diff --git a/.github/workflows/lint-python.yml b/.github/workflows/lint-python.yml index 452b39d9..80dd38aa 100644 --- a/.github/workflows/lint-python.yml +++ b/.github/workflows/lint-python.yml @@ -22,7 +22,7 @@ jobs: run: | set -euo pipefail python -m pip install --upgrade pip - pip install black isort pytest + pip install black isort - name: Run black/isort/py_compile (check-only, advisory) shell: bash @@ -44,15 +44,3 @@ jobs: black --check "${files[@]}" isort --profile black --check-only "${files[@]}" python -m py_compile "${files[@]}" - - - name: Run pytest (advisory) - shell: bash - run: | - set +e - pytest -q - rc=$? - if [ "$rc" -eq 5 ]; then - echo 'pytest collected no tests; treating as informational.' - exit 0 - fi - exit "$rc" diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 00000000..2caf62bf --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,75 @@ +from __future__ import annotations + +import importlib.util +import sys +from pathlib import Path +from types import ModuleType +from typing import Callable + +import pytest + +REPO_ROOT = Path(__file__).resolve().parents[1] + + +class RecordingGauge: + def __init__(self) -> None: + self.values: list[float] = [] + self.label_children: dict[tuple[tuple[str, str], ...], "RecordingGauge"] = {} + + def set(self, value: float) -> None: + self.values.append(value) + + def labels(self, **labels: str) -> "RecordingGauge": + key = tuple(sorted(labels.items())) + child = self.label_children.get(key) + if child is None: + child = RecordingGauge() + self.label_children[key] = child + return child + + +@pytest.fixture +def recording_gauge_cls(): + return RecordingGauge + + +@pytest.fixture +def prometheus_stub() -> ModuleType: + module = ModuleType("prometheus_client") + + class Gauge: + def __init__(self, *_args, **_kwargs) -> None: + self.values = [] + + def set(self, value: float) -> None: + self.values.append(value) + + def labels(self, **_labels: str) -> "Gauge": + return self + + module.Gauge = Gauge + module.start_http_server = lambda *_args, **_kwargs: None + return module + + +@pytest.fixture +def module_loader(monkeypatch: pytest.MonkeyPatch) -> Callable[..., object]: + counter = 0 + + def _load(relative_path: str, *, extra_modules: dict[str, object] | None = None) -> object: + nonlocal counter + counter += 1 + module_name = f"test_module_{counter}" + module_path = REPO_ROOT / relative_path + + if extra_modules: + for name, module in extra_modules.items(): + monkeypatch.setitem(sys.modules, name, module) + + spec = importlib.util.spec_from_file_location(module_name, module_path) + assert spec is not None and spec.loader is not None + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + return _load diff --git a/tests/unit/test_docker_healthcheck.py b/tests/unit/test_docker_healthcheck.py new file mode 100644 index 00000000..bbd7fd1b --- /dev/null +++ b/tests/unit/test_docker_healthcheck.py @@ -0,0 +1,62 @@ +from __future__ import annotations + +import json +from types import ModuleType + + +class _FakeJsonFile: + def __init__(self, payload: bytes) -> None: + self.payload = payload + + def read_bytes(self) -> bytes: + return self.payload + + +class _FakePath: + def __init__(self, payload: bytes) -> None: + self.payload = payload + + def __truediv__(self, _part: str) -> "_FakePath": + return self + + def glob(self, _pattern: str): + return iter([_FakeJsonFile(self.payload)]) + + +def test_healthcheck_queries_jupyter_api_and_prints_response(monkeypatch, module_loader) -> None: + captured = {} + payload = json.dumps({"url": "https://af.example/"}).encode("utf-8") + + class _FakeResponse: + def __init__(self) -> None: + self.content = b"healthy" + self.raise_calls = 0 + + def raise_for_status(self) -> None: + self.raise_calls += 1 + + fake_response = _FakeResponse() + requests_stub = ModuleType("requests") + + def _fake_get(url: str, verify: bool): + captured["url"] = url + captured["verify"] = verify + return fake_response + + requests_stub.get = _fake_get + + pathlib_stub = ModuleType("pathlib") + pathlib_stub.Path = lambda _value: _FakePath(payload) + + printed = [] + monkeypatch.setenv("NB_USER", "alice") + monkeypatch.setattr("builtins.print", lambda value: printed.append(value)) + + module_loader( + "docker/purdue-af/jupyter/docker_healthcheck.py", + extra_modules={"pathlib": pathlib_stub, "requests": requests_stub}, + ) + + assert captured == {"url": "https://af.example/api", "verify": False} + assert fake_response.raise_calls == 1 + assert printed == [b"healthy"] diff --git a/tests/unit/test_metrics_server.py b/tests/unit/test_metrics_server.py new file mode 100644 index 00000000..2967aa7c --- /dev/null +++ b/tests/unit/test_metrics_server.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +from unittest.mock import mock_open + + +def test_update_metrics_sets_event_rate_from_file( + monkeypatch, module_loader, prometheus_stub, recording_gauge_cls +) -> None: + module = module_loader( + "apps/monitoring/af-monitoring/metrics_server.py", + extra_modules={"prometheus_client": prometheus_stub}, + ) + module.event_rate_per_worker = recording_gauge_cls() + monkeypatch.setattr("builtins.open", mock_open(read_data="42.5\n")) + + module.update_metrics() + + assert module.event_rate_per_worker.values == [42.5] + + +def test_update_metrics_sets_zero_when_read_fails( + monkeypatch, module_loader, prometheus_stub, recording_gauge_cls +) -> None: + module = module_loader( + "apps/monitoring/af-monitoring/metrics_server.py", + extra_modules={"prometheus_client": prometheus_stub}, + ) + module.event_rate_per_worker = recording_gauge_cls() + + def _raise(*_args, **_kwargs): + raise OSError("not found") + + monkeypatch.setattr("builtins.open", _raise) + + module.update_metrics() + + assert module.event_rate_per_worker.values == [0] diff --git a/tests/unit/test_node_healthcheck.py b/tests/unit/test_node_healthcheck.py new file mode 100644 index 00000000..caf58bc4 --- /dev/null +++ b/tests/unit/test_node_healthcheck.py @@ -0,0 +1,107 @@ +from __future__ import annotations + +import subprocess + +import pytest + + +def test_check_if_directory_exists_reports_success_for_matching_checksum( + monkeypatch, module_loader, prometheus_stub +) -> None: + module = module_loader( + "apps/monitoring/af-monitoring/node_healthcheck.py", + extra_modules={"prometheus_client": prometheus_stub}, + ) + + class FakeProc: + returncode = 0 + + def __init__(self) -> None: + self.killed = False + + def communicate(self, timeout=None): + return ("abc123 /tmp/validate.txt\n", "") + + def kill(self) -> None: + self.killed = True + + proc = FakeProc() + popen_calls = [] + + def _fake_popen(args, **kwargs): + popen_calls.append((args, kwargs)) + return proc + + times = iter([100.0, 100.2]) + monkeypatch.setattr(module.time, "time", lambda: next(times)) + monkeypatch.setattr(module.subprocess, "Popen", _fake_popen) + + valid, elapsed_ms = module.check_if_directory_exists(("/tmp/validate.txt", "abc123")) + + assert valid is True + assert elapsed_ms == pytest.approx(200.0) + assert popen_calls[0][0] == ["/usr/bin/md5sum", "/tmp/validate.txt"] + + +def test_check_if_directory_exists_returns_timeout_result( + monkeypatch, module_loader, prometheus_stub +) -> None: + module = module_loader( + "apps/monitoring/af-monitoring/node_healthcheck.py", + extra_modules={"prometheus_client": prometheus_stub}, + ) + + class FakeProc: + returncode = 0 + + def __init__(self) -> None: + self.killed = False + self.calls = 0 + + def communicate(self, timeout=None): + self.calls += 1 + if self.calls == 1: + raise subprocess.TimeoutExpired(cmd="md5sum", timeout=timeout) + return ("", "") + + def kill(self) -> None: + self.killed = True + + proc = FakeProc() + monkeypatch.setattr(module.subprocess, "Popen", lambda *_args, **_kwargs: proc) + + valid, elapsed_ms = module.check_if_directory_exists(("/tmp/validate.txt", "abc123")) + + assert valid is False + assert elapsed_ms == 3000 + assert proc.killed is True + + +def test_update_metrics_writes_mount_health_and_ping( + monkeypatch, module_loader, prometheus_stub, recording_gauge_cls +) -> None: + module = module_loader( + "apps/monitoring/af-monitoring/node_healthcheck.py", + extra_modules={"prometheus_client": prometheus_stub}, + ) + module.mount_valid = recording_gauge_cls() + module.mount_ping_ms = recording_gauge_cls() + module.mounts = { + "mount-a": ("/mnt/a", "sum-a"), + "mount-b": ("/mnt/b", "sum-b"), + } + responses = iter([(True, 12.5), (False, 22.5)]) + monkeypatch.setattr( + module, + "check_if_directory_exists", + lambda _path_tuple: next(responses), + ) + + module.update_metrics() + + key_a = (("mount_name", "mount-a"), ("mount_path", "/mnt/a")) + key_b = (("mount_name", "mount-b"), ("mount_path", "/mnt/b")) + assert module.mount_valid.label_children[key_a].values == [1] + assert module.mount_valid.label_children[key_b].values == [0] + assert module.mount_ping_ms.label_children[key_a].values == [12.5] + assert module.mount_ping_ms.label_children[key_b].values == [22.5] diff --git a/tests/unit/test_pod_metrics_exporter.py b/tests/unit/test_pod_metrics_exporter.py new file mode 100644 index 00000000..7ac617fd --- /dev/null +++ b/tests/unit/test_pod_metrics_exporter.py @@ -0,0 +1,87 @@ +from __future__ import annotations + +import glob +import os +from types import SimpleNamespace + + +def _load_exporter(monkeypatch, module_loader, prometheus_stub): + monkeypatch.setattr(os, "listdir", lambda _path: ["jovyan", "slurm", "alice"]) + monkeypatch.setattr(glob, "glob", lambda _pattern: ["/home/alice"]) + return module_loader( + "docker/af-pod-monitor/pod-metrics-exporter.py", + extra_modules={"prometheus_client": prometheus_stub}, + ) + + +def test_module_initializes_directories_from_non_skipped_user( + monkeypatch, module_loader, prometheus_stub +) -> None: + module = _load_exporter(monkeypatch, module_loader, prometheus_stub) + + assert module.username == "alice" + assert module.directories == { + "home": "/home/alice", + "work": "/work/users/alice/", + } + + +def test_update_metrics_work_branch_sets_usage_and_access_time( + monkeypatch, module_loader, prometheus_stub, recording_gauge_cls +) -> None: + module = _load_exporter(monkeypatch, module_loader, prometheus_stub) + module.metrics = { + "work_dir_used": recording_gauge_cls(), + "work_dir_size": recording_gauge_cls(), + "work_dir_util": recording_gauge_cls(), + "work_dir_last_accessed": recording_gauge_cls(), + } + module.dl = "work" + monkeypatch.setattr( + module.subprocess, + "check_output", + lambda *_args, **_kwargs: b"2048 /work/users/alice/\n", + ) + monkeypatch.setattr( + module.os, + "stat", + lambda _directory: SimpleNamespace(st_atime=1700000000.0), + ) + + module.update_metrics("work") + + assert module.metrics["work_dir_used"].values == [2048] + assert module.metrics["work_dir_size"].values == [104857600] + assert module.metrics["work_dir_util"].values == [2048 / 104857600] + assert module.metrics["work_dir_last_accessed"].values == [1700000000.0] + + +def test_update_metrics_home_branch_parses_df_and_ignores_stat_errors( + monkeypatch, module_loader, prometheus_stub, recording_gauge_cls +) -> None: + module = _load_exporter(monkeypatch, module_loader, prometheus_stub) + module.metrics = { + "home_dir_used": recording_gauge_cls(), + "home_dir_size": recording_gauge_cls(), + "home_dir_util": recording_gauge_cls(), + "home_dir_last_accessed": recording_gauge_cls(), + } + module.dl = "home" + + df_output = ( + "Filesystem 1K-blocks Used Available Use% Mounted on\n" + "/dev/sda1 1000 250 750 25% /home\n" + ).encode("utf-8") + monkeypatch.setattr(module.subprocess, "check_output", lambda *_args, **_kwargs: df_output) + + def _raise_stat(_directory): + raise OSError("stat unavailable") + + monkeypatch.setattr(module.os, "stat", _raise_stat) + + module.update_metrics("home") + + assert module.metrics["home_dir_used"].values == [250] + assert module.metrics["home_dir_size"].values == [1000] + assert module.metrics["home_dir_util"].values == [0.25] + assert module.metrics["home_dir_last_accessed"].values == [] From 625737d60d825ea9f66d624cc5f9e55e3e9c907d Mon Sep 17 00:00:00 2001 From: Dmitry Kondratyev Date: Wed, 4 Feb 2026 11:16:41 -0500 Subject: [PATCH 10/25] ci(worker3): optimize advisory security and runtime workflow scope --- .github/workflows/ci-gitops-deployability.yml | 130 ++++++++++++-- .github/workflows/ci-security-advisory.yml | 166 ++++++++++++++++++ .github/workflows/lint-docker.yml | 22 ++- .../workflows/nightly-security-advisory.yml | 78 +++++++- 4 files changed, 375 insertions(+), 21 deletions(-) create mode 100644 .github/workflows/ci-security-advisory.yml diff --git a/.github/workflows/ci-gitops-deployability.yml b/.github/workflows/ci-gitops-deployability.yml index f3c5ae05..d83902b0 100644 --- a/.github/workflows/ci-gitops-deployability.yml +++ b/.github/workflows/ci-gitops-deployability.yml @@ -2,12 +2,63 @@ name: CI GitOps Deployability on: pull_request: + paths: + - 'deploy/**' + - '.github/workflows/ci-gitops-deployability.yml' + +concurrency: + group: ci-gitops-deployability-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true permissions: contents: read jobs: + detect-gitops-changes: + runs-on: ubuntu-latest + outputs: + run_all: ${{ steps.scope.outputs.run_all }} + core_production: ${{ steps.filter.outputs.core_production }} + core_staging: ${{ steps.filter.outputs.core_staging }} + core_geddes2: ${{ steps.filter.outputs.core_geddes2 }} + experimental: ${{ steps.filter.outputs.experimental }} + steps: + - uses: actions/checkout@v4 + + - uses: dorny/paths-filter@v3 + id: filter + with: + filters: | + core_production: + - 'deploy/core-production/**' + core_staging: + - 'deploy/core-staging/**' + core_geddes2: + - 'deploy/core-geddes2/**' + experimental: + - 'deploy/experimental/**' + deploy_shared: + - 'deploy/**' + - '!deploy/core-production/**' + - '!deploy/core-staging/**' + - '!deploy/core-geddes2/**' + - '!deploy/experimental/**' + workflow: + - '.github/workflows/ci-gitops-deployability.yml' + + - id: scope + shell: bash + run: | + set -euo pipefail + if [ "${{ steps.filter.outputs.deploy_shared }}" = 'true' ] || [ "${{ steps.filter.outputs.workflow }}" = 'true' ]; then + echo "run_all=true" >> "$GITHUB_OUTPUT" + else + echo "run_all=false" >> "$GITHUB_OUTPUT" + fi + gitops-validate: + needs: detect-gitops-changes + if: needs.detect-gitops-changes.outputs.run_all == 'true' || needs.detect-gitops-changes.outputs.core_production == 'true' || needs.detect-gitops-changes.outputs.core_staging == 'true' || needs.detect-gitops-changes.outputs.core_geddes2 == 'true' || needs.detect-gitops-changes.outputs.experimental == 'true' runs-on: ubuntu-latest continue-on-error: true steps: @@ -27,26 +78,71 @@ jobs: chmod +x /tmp/kubeconform sudo mv /tmp/kubeconform /usr/local/bin/kubeconform - - name: Render overlays with kustomize (advisory) + - name: Render and validate selected overlays (advisory) + shell: bash + env: + RUN_ALL: ${{ needs.detect-gitops-changes.outputs.run_all }} + CORE_PRODUCTION: ${{ needs.detect-gitops-changes.outputs.core_production }} + CORE_STAGING: ${{ needs.detect-gitops-changes.outputs.core_staging }} + CORE_GEDDES2: ${{ needs.detect-gitops-changes.outputs.core_geddes2 }} + EXPERIMENTAL: ${{ needs.detect-gitops-changes.outputs.experimental }} run: | set -euo pipefail - overlays=( - deploy/core-production - deploy/core-staging - deploy/core-geddes2 - deploy/experimental - ) + overlays=() + if [ "$RUN_ALL" = 'true' ]; then + overlays=( + deploy/core-production + deploy/core-staging + deploy/core-geddes2 + deploy/experimental + ) + else + [ "$CORE_PRODUCTION" = 'true' ] && overlays+=(deploy/core-production) + [ "$CORE_STAGING" = 'true' ] && overlays+=(deploy/core-staging) + [ "$CORE_GEDDES2" = 'true' ] && overlays+=(deploy/core-geddes2) + [ "$EXPERIMENTAL" = 'true' ] && overlays+=(deploy/experimental) + fi + + if [ "${#overlays[@]}" -eq 0 ]; then + echo 'No in-scope overlay changes detected; skipping render/validation.' + { + echo '### GitOps Deployability Summary' + echo + echo '- No in-scope overlay changes detected.' + } >> "$GITHUB_STEP_SUMMARY" + exit 0 + fi + + { + echo '### GitOps Deployability Summary' + echo + echo '| Overlay | Render | Kubeconform |' + echo '|---|---|---|' + } >> "$GITHUB_STEP_SUMMARY" + + status=0 for overlay in "${overlays[@]}"; do - out="/tmp/$(echo "$overlay" | tr '/' '_').yaml" - echo "Rendering $overlay -> $out" - kustomize build --load-restrictor LoadRestrictionsNone "$overlay" > "$out" - done + rendered="/tmp/$(echo "$overlay" | tr '/' '_').yaml" + render_status='ok' + kubeconform_status='ok' - - name: Validate rendered manifests with kubeconform (advisory) - run: | - set -euo pipefail - for rendered in /tmp/deploy_core-production.yaml /tmp/deploy_core-staging.yaml /tmp/deploy_core-geddes2.yaml /tmp/deploy_experimental.yaml; do - echo "Validating $rendered" - kubeconform -summary -strict -ignore-missing-schemas -skip Secret "$rendered" + echo "Rendering $overlay -> $rendered" + if ! kustomize build --load-restrictor LoadRestrictionsNone "$overlay" > "$rendered"; then + render_status='failed' + kubeconform_status='skipped' + status=1 + fi + + if [ "$render_status" = 'ok' ]; then + echo "Validating $rendered" + if ! kubeconform -summary -strict -ignore-missing-schemas -skip Secret "$rendered"; then + kubeconform_status='failed' + status=1 + fi + fi + + echo "| \`$overlay\` | $render_status | $kubeconform_status |" >> "$GITHUB_STEP_SUMMARY" done + + exit "$status" diff --git a/.github/workflows/ci-security-advisory.yml b/.github/workflows/ci-security-advisory.yml new file mode 100644 index 00000000..953a65c0 --- /dev/null +++ b/.github/workflows/ci-security-advisory.yml @@ -0,0 +1,166 @@ +name: CI Security Advisory + +on: + pull_request: + paths: + - 'deploy/**' + - 'docker/**' + - '.github/workflows/**' + - '**/requirements*.txt' + - '**/pyproject.toml' + - '**/poetry.lock' + - '**/Pipfile' + - '**/Pipfile.lock' + - '**/go.mod' + - '**/go.sum' + workflow_dispatch: + +concurrency: + group: ci-security-advisory-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +permissions: + contents: read + +jobs: + detect-security-scope: + runs-on: ubuntu-latest + outputs: + vuln_surface: ${{ steps.filter.outputs.vuln_surface }} + config_surface: ${{ steps.filter.outputs.config_surface }} + steps: + - uses: actions/checkout@v4 + + - uses: dorny/paths-filter@v3 + id: filter + with: + filters: | + vuln_surface: + - 'docker/**' + - '**/requirements*.txt' + - '**/pyproject.toml' + - '**/poetry.lock' + - '**/Pipfile' + - '**/Pipfile.lock' + - '**/go.mod' + - '**/go.sum' + config_surface: + - 'deploy/**' + - 'docker/**' + - '.github/workflows/**' + + trivy-security-advisory: + needs: detect-security-scope + if: needs.detect-security-scope.outputs.vuln_surface == 'true' || needs.detect-security-scope.outputs.config_surface == 'true' + runs-on: ubuntu-latest + continue-on-error: true + steps: + - uses: actions/checkout@v4 + + - name: Run Trivy filesystem vulnerability scan (advisory) + if: needs.detect-security-scope.outputs.vuln_surface == 'true' + continue-on-error: true + uses: aquasecurity/trivy-action@0.33.1 + with: + scan-type: fs + scan-ref: . + scanners: vuln + severity: HIGH,CRITICAL + ignore-unfixed: true + exit-code: '1' + format: json + output: trivy-pr-fs.json + skip-dirs: docker/dask-gateway-server,docs,docs/source/demos,docker/kaniko-build-jobs,slurm,.cursor + + - name: Run Trivy configuration scan (advisory) + if: needs.detect-security-scope.outputs.config_surface == 'true' + continue-on-error: true + uses: aquasecurity/trivy-action@0.33.1 + with: + scan-type: config + scan-ref: . + severity: HIGH,CRITICAL + exit-code: '1' + format: json + output: trivy-pr-config.json + skip-dirs: docker/dask-gateway-server,docs,docs/source/demos,docker/kaniko-build-jobs,slurm,.cursor + + - name: Publish PR Trivy summary (advisory) + if: always() + shell: bash + run: | + set -euo pipefail + python3 - <<'PY' + import json + import os + from collections import Counter + from pathlib import Path + + summary_path = Path(os.environ['GITHUB_STEP_SUMMARY']) + reports = ( + ('Filesystem vulnerability scan', Path('trivy-pr-fs.json'), 'Vulnerabilities'), + ('Configuration misconfiguration scan', Path('trivy-pr-config.json'), 'Misconfigurations'), + ) + + total_high_critical = 0 + + with summary_path.open('a', encoding='utf-8') as summary: + summary.write('### PR Trivy Advisory Summary\n\n') + + for label, report_path, finding_key in reports: + if not report_path.exists(): + summary.write(f'- {label}: skipped (out of scope)\n') + continue + + payload = json.loads(report_path.read_text(encoding='utf-8')) + results = payload.get('Results', []) if isinstance(payload, dict) else payload + + severity_counts = Counter() + target_counts = Counter() + + for result in results: + target = result.get('Target', 'unknown-target') + for finding in result.get(finding_key) or []: + severity = (finding.get('Severity') or 'UNKNOWN').upper() + severity_counts[severity] += 1 + target_counts[target] += 1 + + high_critical = severity_counts.get('HIGH', 0) + severity_counts.get('CRITICAL', 0) + total_high_critical += high_critical + + summary.write(f'\n#### {label}\n\n') + summary.write(f'- HIGH/CRITICAL findings: **{high_critical}**\n') + summary.write(f'- Targets with findings: **{len(target_counts)}**\n\n') + + if high_critical == 0: + summary.write('No HIGH/CRITICAL findings detected.\n') + continue + + summary.write('| Severity | Count |\n') + summary.write('|---|---:|\n') + for severity in ('CRITICAL', 'HIGH', 'MEDIUM', 'LOW', 'UNKNOWN'): + count = severity_counts.get(severity, 0) + if count: + summary.write(f'| {severity} | {count} |\n') + + summary.write('\n| Top targets | Findings |\n') + summary.write('|---|---:|\n') + for target, count in target_counts.most_common(10): + summary.write(f'| `{target}` | {count} |\n') + + if total_high_critical > 0: + print(f'::warning::PR Trivy found {total_high_critical} HIGH/CRITICAL findings. See summary and artifacts.') + else: + print('::notice::PR Trivy found no HIGH/CRITICAL findings in scope.') + PY + + - name: Upload PR Trivy artifacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: trivy-pr-security-${{ github.run_id }} + path: | + trivy-pr-fs.json + trivy-pr-config.json + if-no-files-found: ignore + retention-days: 7 diff --git a/.github/workflows/lint-docker.yml b/.github/workflows/lint-docker.yml index ac83bec6..adcd8426 100644 --- a/.github/workflows/lint-docker.yml +++ b/.github/workflows/lint-docker.yml @@ -2,6 +2,17 @@ name: Container Reliability on: pull_request: + paths: + - 'docker/af-pod-monitor/**' + - 'docker/interlink-slurm-plugin/**' + - 'docker/purdue-af/**' + - 'slurm/**' + - '.github/scripts/container-smoke.sh' + - '.github/workflows/lint-docker.yml' + +concurrency: + group: lint-docker-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true permissions: contents: read @@ -10,6 +21,7 @@ jobs: detect-docker-changes: runs-on: ubuntu-latest outputs: + dockerfiles: ${{ steps.filter.outputs.dockerfiles }} af_pod_monitor: ${{ steps.filter.outputs.af_pod_monitor }} interlink_slurm_plugin: ${{ steps.filter.outputs.interlink_slurm_plugin }} purdue_af: ${{ steps.filter.outputs.purdue_af }} @@ -20,22 +32,26 @@ jobs: id: filter with: filters: | + dockerfiles: + - 'docker/af-pod-monitor/Dockerfile' + - 'docker/interlink-slurm-plugin/Dockerfile.alma8' + - 'docker/purdue-af/Dockerfile' + - '.github/workflows/lint-docker.yml' af_pod_monitor: - 'docker/af-pod-monitor/**' - - '.github/workflows/lint-docker.yml' - '.github/scripts/container-smoke.sh' interlink_slurm_plugin: - 'docker/interlink-slurm-plugin/**' - 'slurm/**' - - '.github/workflows/lint-docker.yml' - '.github/scripts/container-smoke.sh' purdue_af: - 'docker/purdue-af/**' - 'slurm/**' - - '.github/workflows/lint-docker.yml' - '.github/scripts/container-smoke.sh' lint-dockerfiles: + needs: detect-docker-changes + if: needs.detect-docker-changes.outputs.dockerfiles == 'true' runs-on: ubuntu-latest continue-on-error: true steps: diff --git a/.github/workflows/nightly-security-advisory.yml b/.github/workflows/nightly-security-advisory.yml index 3204dc2b..9d7fa398 100644 --- a/.github/workflows/nightly-security-advisory.yml +++ b/.github/workflows/nightly-security-advisory.yml @@ -5,6 +5,10 @@ on: - cron: '17 5 * * *' workflow_dispatch: +concurrency: + group: nightly-security-advisory-${{ github.ref }} + cancel-in-progress: true + permissions: contents: read @@ -20,8 +24,80 @@ jobs: with: scan-type: fs scan-ref: . + scanners: vuln severity: HIGH,CRITICAL ignore-unfixed: true exit-code: '1' - format: table + format: json + output: trivy-nightly-fs.json skip-dirs: docker/dask-gateway-server,docs,docs/source/demos,docker/kaniko-build-jobs,slurm,.cursor + + - name: Publish nightly Trivy summary (advisory) + if: always() + shell: bash + run: | + set -euo pipefail + python3 - <<'PY' + import json + import os + from collections import Counter + from pathlib import Path + + report_path = Path('trivy-nightly-fs.json') + summary_path = Path(os.environ['GITHUB_STEP_SUMMARY']) + title = 'Nightly Trivy Vulnerability Summary' + + with summary_path.open('a', encoding='utf-8') as summary: + summary.write(f'### {title}\n\n') + + if not report_path.exists(): + summary.write('- Trivy report was not generated.\n') + print('::warning::Nightly Trivy report was not generated.') + raise SystemExit(0) + + payload = json.loads(report_path.read_text(encoding='utf-8')) + results = payload.get('Results', []) if isinstance(payload, dict) else payload + + severity_counts = Counter() + target_counts = Counter() + + for result in results: + target = result.get('Target', 'unknown-target') + for vuln in result.get('Vulnerabilities') or []: + severity = (vuln.get('Severity') or 'UNKNOWN').upper() + severity_counts[severity] += 1 + target_counts[target] += 1 + + high_critical = severity_counts.get('HIGH', 0) + severity_counts.get('CRITICAL', 0) + + summary.write(f'- HIGH/CRITICAL findings: **{high_critical}**\n') + summary.write(f'- Targets with findings: **{len(target_counts)}**\n\n') + + if high_critical == 0: + summary.write('No HIGH/CRITICAL vulnerabilities found in scope.\n') + print('::notice::Nightly Trivy found no HIGH/CRITICAL vulnerabilities.') + raise SystemExit(0) + + summary.write('| Severity | Count |\n') + summary.write('|---|---:|\n') + for severity in ('CRITICAL', 'HIGH', 'MEDIUM', 'LOW', 'UNKNOWN'): + count = severity_counts.get(severity, 0) + if count: + summary.write(f'| {severity} | {count} |\n') + + summary.write('\n| Top targets | Findings |\n') + summary.write('|---|---:|\n') + for target, count in target_counts.most_common(10): + summary.write(f'| `{target}` | {count} |\n') + + print(f'::warning::Nightly Trivy found {high_critical} HIGH/CRITICAL vulnerabilities. See summary and artifact.') + PY + + - name: Upload nightly Trivy artifact + if: always() + uses: actions/upload-artifact@v4 + with: + name: trivy-nightly-fs-${{ github.run_id }} + path: trivy-nightly-fs.json + if-no-files-found: ignore + retention-days: 14 From dd78608ef3cdedeb9f17d918d81e9a97affc549d Mon Sep 17 00:00:00 2001 From: Dmitry Kondratyev Date: Wed, 4 Feb 2026 11:19:30 -0500 Subject: [PATCH 11/25] docs(ci): align CI plan with active workflow surface --- .codex/CI_PLAN.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/.codex/CI_PLAN.md b/.codex/CI_PLAN.md index 584661bd..faf1d8f2 100644 --- a/.codex/CI_PLAN.md +++ b/.codex/CI_PLAN.md @@ -43,8 +43,10 @@ Approved exception: - `.github/workflows/lint-json.yml` - `.github/workflows/lint-yaml.yml` - `.github/workflows/ci-repo-quality.yml` +- `.github/workflows/ci-integration-scenarios.yml` - `.github/workflows/lint-docker.yml` - `.github/workflows/ci-gitops-deployability.yml` +- `.github/workflows/ci-security-advisory.yml` - `.github/workflows/nightly-security-advisory.yml` ## Check Architecture @@ -54,8 +56,8 @@ Approved exception: - Risk: broken workflow definitions and silent CI drift. ### B) Repo Quality and Tests (advisory) -- Workflows: `lint-python.yml`, `lint-shell.yml`, `lint-json.yml`, `lint-yaml.yml`, `ci-repo-quality.yml` -- Checks: black/isort check-only, py_compile, pytest advisory, shellcheck/shfmt/bash -n, JSON/YAML parse. +- Workflows: `lint-python.yml`, `lint-shell.yml`, `lint-json.yml`, `lint-yaml.yml`, `ci-repo-quality.yml`, `ci-integration-scenarios.yml` +- Checks: black/isort check-only, py_compile, pytest unit advisory with coverage threshold, shellcheck/shfmt/bash -n, JSON/YAML parse, integration scenario matrix tests via mocked container/monitoring flows. - Risk: script/runtime regressions. ### C) Container Reliability (advisory) @@ -69,8 +71,8 @@ Approved exception: - Risk: Flux reconciliation failures from invalid manifests. ### E) Security Posture (advisory) -- Workflow: `nightly-security-advisory.yml` -- Checks: nightly Trivy filesystem scan. +- Workflows: `nightly-security-advisory.yml`, `ci-security-advisory.yml` +- Checks: nightly Trivy filesystem scan plus PR-time advisory Trivy vulnerability/config scans with run summaries and artifacts. - Risk: security drift in dependencies/configuration. ## Optimization Workstreams (Current) From 52dab13425152b4172ba6328803ed1010fea1f29 Mon Sep 17 00:00:00 2001 From: Dmitry Kondratyev Date: Wed, 4 Feb 2026 11:22:37 -0500 Subject: [PATCH 12/25] ci(docker): enable gha cache for advisory image builds --- .codex/CI_PLAN.md | 2 +- .github/workflows/lint-docker.yml | 45 ++++++++++++++++++++++++++----- 2 files changed, 40 insertions(+), 7 deletions(-) diff --git a/.codex/CI_PLAN.md b/.codex/CI_PLAN.md index faf1d8f2..a70e7d24 100644 --- a/.codex/CI_PLAN.md +++ b/.codex/CI_PLAN.md @@ -62,7 +62,7 @@ Approved exception: ### C) Container Reliability (advisory) - Workflow: `lint-docker.yml` -- Checks: hadolint, targeted docker build jobs, smoke checks via `.github/scripts/container-smoke.sh`. +- Checks: hadolint, targeted Docker Buildx jobs with GitHub Actions layer cache, smoke checks via `.github/scripts/container-smoke.sh`. - Risk: image build/runtime regressions. ### D) GitOps Deployability (advisory) diff --git a/.github/workflows/lint-docker.yml b/.github/workflows/lint-docker.yml index adcd8426..da7fe070 100644 --- a/.github/workflows/lint-docker.yml +++ b/.github/workflows/lint-docker.yml @@ -80,8 +80,19 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Build af-pod-monitor image (advisory) - run: docker build -f docker/af-pod-monitor/Dockerfile -t local/af-pod-monitor:${{ github.sha }} docker/af-pod-monitor + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Build af-pod-monitor image with cache (advisory) + uses: docker/build-push-action@v6 + with: + context: docker/af-pod-monitor + file: docker/af-pod-monitor/Dockerfile + load: true + tags: local/af-pod-monitor:${{ github.sha }} + cache-from: type=gha,scope=af-pod-monitor + cache-to: type=gha,mode=max,scope=af-pod-monitor,ignore-error=true + provenance: false - name: Smoke test af-pod-monitor image (advisory) run: .github/scripts/container-smoke.sh local/af-pod-monitor:${{ github.sha }} af-pod-monitor @@ -94,8 +105,19 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Build interlink-slurm-plugin image (advisory) - run: docker build -f docker/interlink-slurm-plugin/Dockerfile.alma8 -t local/interlink-slurm-plugin:${{ github.sha }} . + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Build interlink-slurm-plugin image with cache (advisory) + uses: docker/build-push-action@v6 + with: + context: . + file: docker/interlink-slurm-plugin/Dockerfile.alma8 + load: true + tags: local/interlink-slurm-plugin:${{ github.sha }} + cache-from: type=gha,scope=interlink-slurm-plugin + cache-to: type=gha,mode=max,scope=interlink-slurm-plugin,ignore-error=true + provenance: false - name: Smoke test interlink-slurm-plugin image (advisory) run: .github/scripts/container-smoke.sh local/interlink-slurm-plugin:${{ github.sha }} interlink-slurm-plugin @@ -108,8 +130,19 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Build purdue-af image (advisory) - run: docker build -f docker/purdue-af/Dockerfile -t local/purdue-af:${{ github.sha }} . + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Build purdue-af image with cache (advisory) + uses: docker/build-push-action@v6 + with: + context: . + file: docker/purdue-af/Dockerfile + load: true + tags: local/purdue-af:${{ github.sha }} + cache-from: type=gha,scope=purdue-af + cache-to: type=gha,mode=max,scope=purdue-af,ignore-error=true + provenance: false - name: Smoke test purdue-af image (advisory) run: .github/scripts/container-smoke.sh local/purdue-af:${{ github.sha }} purdue-af From fd4b57a1b1f647d4cd0e79311ea7acc329904925 Mon Sep 17 00:00:00 2001 From: Dmitry Kondratyev Date: Wed, 4 Feb 2026 11:26:49 -0500 Subject: [PATCH 13/25] ci: fix lint formatting and unit coverage reporting --- .../scripts/integration/mock-docker-cli.sh | 68 +++++++++---------- .github/workflows/ci-repo-quality.yml | 19 +++--- tests/conftest.py | 4 +- .../test_container_smoke_matrix.py | 2 +- tests/unit/test_docker_healthcheck.py | 4 +- tests/unit/test_node_healthcheck.py | 8 ++- tests/unit/test_pod_metrics_exporter.py | 4 +- 7 files changed, 60 insertions(+), 49 deletions(-) diff --git a/.github/scripts/integration/mock-docker-cli.sh b/.github/scripts/integration/mock-docker-cli.sh index 04969cda..c20a19aa 100755 --- a/.github/scripts/integration/mock-docker-cli.sh +++ b/.github/scripts/integration/mock-docker-cli.sh @@ -2,44 +2,44 @@ set -euo pipefail if [ -n "${MOCK_DOCKER_LOG:-}" ]; then - printf '%s\n' "$*" >> "$MOCK_DOCKER_LOG" + printf '%s\n' "$*" >>"$MOCK_DOCKER_LOG" fi cmd="${1:-}" shift || true case "$cmd" in - image) - subcmd="${1:-}" - shift || true - if [ "$subcmd" != "inspect" ]; then - echo "mock docker unsupported image subcommand: $subcmd" >&2 - exit 64 - fi - - if [ -n "${MOCK_DOCKER_INSPECT_STDOUT:-}" ]; then - printf '%s\n' "$MOCK_DOCKER_INSPECT_STDOUT" - fi - if [ -n "${MOCK_DOCKER_INSPECT_STDERR:-}" ]; then - printf '%s\n' "$MOCK_DOCKER_INSPECT_STDERR" >&2 - fi - - exit "${MOCK_DOCKER_INSPECT_EXIT:-0}" - ;; - - run) - if [ -n "${MOCK_DOCKER_RUN_STDOUT:-}" ]; then - printf '%s\n' "$MOCK_DOCKER_RUN_STDOUT" - fi - if [ -n "${MOCK_DOCKER_RUN_STDERR:-}" ]; then - printf '%s\n' "$MOCK_DOCKER_RUN_STDERR" >&2 - fi - - exit "${MOCK_DOCKER_RUN_EXIT:-0}" - ;; - - *) - echo "mock docker unsupported command: $cmd" >&2 - exit 64 - ;; +image) + subcmd="${1:-}" + shift || true + if [ "$subcmd" != "inspect" ]; then + echo "mock docker unsupported image subcommand: $subcmd" >&2 + exit 64 + fi + + if [ -n "${MOCK_DOCKER_INSPECT_STDOUT:-}" ]; then + printf '%s\n' "$MOCK_DOCKER_INSPECT_STDOUT" + fi + if [ -n "${MOCK_DOCKER_INSPECT_STDERR:-}" ]; then + printf '%s\n' "$MOCK_DOCKER_INSPECT_STDERR" >&2 + fi + + exit "${MOCK_DOCKER_INSPECT_EXIT:-0}" + ;; + +run) + if [ -n "${MOCK_DOCKER_RUN_STDOUT:-}" ]; then + printf '%s\n' "$MOCK_DOCKER_RUN_STDOUT" + fi + if [ -n "${MOCK_DOCKER_RUN_STDERR:-}" ]; then + printf '%s\n' "$MOCK_DOCKER_RUN_STDERR" >&2 + fi + + exit "${MOCK_DOCKER_RUN_EXIT:-0}" + ;; + +*) + echo "mock docker unsupported command: $cmd" >&2 + exit 64 + ;; esac diff --git a/.github/workflows/ci-repo-quality.yml b/.github/workflows/ci-repo-quality.yml index 2af6c26c..4dea93df 100644 --- a/.github/workflows/ci-repo-quality.yml +++ b/.github/workflows/ci-repo-quality.yml @@ -22,7 +22,7 @@ jobs: run: | set -euo pipefail python -m pip install --upgrade pip - pip install pytest pytest-cov + pip install pytest coverage - name: Python syntax smoke (advisory) shell: bash @@ -61,16 +61,17 @@ jobs: shell: bash run: | set +e - pytest -q tests/unit \ - --cov=apps/monitoring/af-monitoring/metrics_server.py \ - --cov=apps/monitoring/af-monitoring/node_healthcheck.py \ - --cov=docker/af-pod-monitor/pod-metrics-exporter.py \ - --cov=docker/purdue-af/jupyter/docker_healthcheck.py \ - --cov-report=term-missing \ - --cov-fail-under=70 + python -m coverage run -m pytest -q tests/unit rc=$? + set -e if [ "$rc" -eq 5 ]; then echo 'pytest collected no tests; treating as informational.' exit 0 fi - exit "$rc" + if [ "$rc" -ne 0 ]; then + exit "$rc" + fi + python -m coverage report \ + --show-missing \ + --fail-under=70 \ + --include="apps/monitoring/af-monitoring/metrics_server.py,apps/monitoring/af-monitoring/node_healthcheck.py,docker/af-pod-monitor/pod-metrics-exporter.py,docker/purdue-af/jupyter/docker_healthcheck.py" diff --git a/tests/conftest.py b/tests/conftest.py index 2caf62bf..eac3d579 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -56,7 +56,9 @@ def labels(self, **_labels: str) -> "Gauge": def module_loader(monkeypatch: pytest.MonkeyPatch) -> Callable[..., object]: counter = 0 - def _load(relative_path: str, *, extra_modules: dict[str, object] | None = None) -> object: + def _load( + relative_path: str, *, extra_modules: dict[str, object] | None = None + ) -> object: nonlocal counter counter += 1 module_name = f"test_module_{counter}" diff --git a/tests/integration/test_container_smoke_matrix.py b/tests/integration/test_container_smoke_matrix.py index b3215c6f..5bec78a4 100644 --- a/tests/integration/test_container_smoke_matrix.py +++ b/tests/integration/test_container_smoke_matrix.py @@ -22,7 +22,7 @@ def test_container_smoke_behavior_matrix(self) -> None: temp_path = Path(temp_dir) docker_wrapper = temp_path / "docker" docker_wrapper.write_text( - f"#!/usr/bin/env bash\nexec \"{MOCK_DOCKER_SCRIPT}\" \"$@\"\n", + f'#!/usr/bin/env bash\nexec "{MOCK_DOCKER_SCRIPT}" "$@"\n', encoding="utf-8", ) docker_wrapper.chmod(0o755) diff --git a/tests/unit/test_docker_healthcheck.py b/tests/unit/test_docker_healthcheck.py index bbd7fd1b..64102674 100644 --- a/tests/unit/test_docker_healthcheck.py +++ b/tests/unit/test_docker_healthcheck.py @@ -23,7 +23,9 @@ def glob(self, _pattern: str): return iter([_FakeJsonFile(self.payload)]) -def test_healthcheck_queries_jupyter_api_and_prints_response(monkeypatch, module_loader) -> None: +def test_healthcheck_queries_jupyter_api_and_prints_response( + monkeypatch, module_loader +) -> None: captured = {} payload = json.dumps({"url": "https://af.example/"}).encode("utf-8") diff --git a/tests/unit/test_node_healthcheck.py b/tests/unit/test_node_healthcheck.py index caf58bc4..e1510b5f 100644 --- a/tests/unit/test_node_healthcheck.py +++ b/tests/unit/test_node_healthcheck.py @@ -36,7 +36,9 @@ def _fake_popen(args, **kwargs): monkeypatch.setattr(module.time, "time", lambda: next(times)) monkeypatch.setattr(module.subprocess, "Popen", _fake_popen) - valid, elapsed_ms = module.check_if_directory_exists(("/tmp/validate.txt", "abc123")) + valid, elapsed_ms = module.check_if_directory_exists( + ("/tmp/validate.txt", "abc123") + ) assert valid is True assert elapsed_ms == pytest.approx(200.0) @@ -70,7 +72,9 @@ def kill(self) -> None: proc = FakeProc() monkeypatch.setattr(module.subprocess, "Popen", lambda *_args, **_kwargs: proc) - valid, elapsed_ms = module.check_if_directory_exists(("/tmp/validate.txt", "abc123")) + valid, elapsed_ms = module.check_if_directory_exists( + ("/tmp/validate.txt", "abc123") + ) assert valid is False assert elapsed_ms == 3000 diff --git a/tests/unit/test_pod_metrics_exporter.py b/tests/unit/test_pod_metrics_exporter.py index 7ac617fd..90d2353e 100644 --- a/tests/unit/test_pod_metrics_exporter.py +++ b/tests/unit/test_pod_metrics_exporter.py @@ -72,7 +72,9 @@ def test_update_metrics_home_branch_parses_df_and_ignores_stat_errors( "Filesystem 1K-blocks Used Available Use% Mounted on\n" "/dev/sda1 1000 250 750 25% /home\n" ).encode("utf-8") - monkeypatch.setattr(module.subprocess, "check_output", lambda *_args, **_kwargs: df_output) + monkeypatch.setattr( + module.subprocess, "check_output", lambda *_args, **_kwargs: df_output + ) def _raise_stat(_directory): raise OSError("stat unavailable") From a592a4cd7fa4150a4ce80435368e8f1149d1a79b Mon Sep 17 00:00:00 2001 From: Dmitry Kondratyev Date: Wed, 4 Feb 2026 11:33:54 -0500 Subject: [PATCH 14/25] ci: add formatter autofix workflow and resolve lint-python import order --- .codex/CI_PLAN.md | 6 +- .github/workflows/ci-format-autofix.yml | 173 ++++++++++++++++++ .../test_container_smoke_matrix.py | 2 +- .../test_monitoring_metric_update.py | 2 +- 4 files changed, 179 insertions(+), 4 deletions(-) create mode 100644 .github/workflows/ci-format-autofix.yml diff --git a/.codex/CI_PLAN.md b/.codex/CI_PLAN.md index a70e7d24..dbef85fc 100644 --- a/.codex/CI_PLAN.md +++ b/.codex/CI_PLAN.md @@ -35,6 +35,7 @@ Out of scope: Approved exception: - `slurm/**` is used as a dependency-only trigger in container reliability path filters because maintained Dockerfiles copy `slurm/` artifacts. +- CI auto-commit is enabled for formatter-only fixes in `ci-format-autofix.yml` to reduce lint iteration noise. ## Active Workflow Surface - `.github/workflows/ci-workflow-integrity.yml` @@ -42,6 +43,7 @@ Approved exception: - `.github/workflows/lint-shell.yml` - `.github/workflows/lint-json.yml` - `.github/workflows/lint-yaml.yml` +- `.github/workflows/ci-format-autofix.yml` - `.github/workflows/ci-repo-quality.yml` - `.github/workflows/ci-integration-scenarios.yml` - `.github/workflows/lint-docker.yml` @@ -56,8 +58,8 @@ Approved exception: - Risk: broken workflow definitions and silent CI drift. ### B) Repo Quality and Tests (advisory) -- Workflows: `lint-python.yml`, `lint-shell.yml`, `lint-json.yml`, `lint-yaml.yml`, `ci-repo-quality.yml`, `ci-integration-scenarios.yml` -- Checks: black/isort check-only, py_compile, pytest unit advisory with coverage threshold, shellcheck/shfmt/bash -n, JSON/YAML parse, integration scenario matrix tests via mocked container/monitoring flows. +- Workflows: `lint-python.yml`, `lint-shell.yml`, `lint-json.yml`, `lint-yaml.yml`, `ci-format-autofix.yml`, `ci-repo-quality.yml`, `ci-integration-scenarios.yml` +- Checks: black/isort check-only, py_compile, pytest unit advisory with coverage threshold, shellcheck/shfmt/bash -n, JSON/YAML parse, auto-format commits for changed Python/shell/JSON/YAML files, integration scenario matrix tests via mocked container/monitoring flows. - Risk: script/runtime regressions. ### C) Container Reliability (advisory) diff --git a/.github/workflows/ci-format-autofix.yml b/.github/workflows/ci-format-autofix.yml new file mode 100644 index 00000000..9a97a35c --- /dev/null +++ b/.github/workflows/ci-format-autofix.yml @@ -0,0 +1,173 @@ +name: CI Format Autofix + +on: + pull_request: + types: [opened, synchronize, reopened] + +concurrency: + group: ci-format-autofix-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +permissions: + contents: write + +jobs: + autofix-format: + if: github.event.pull_request.head.repo.full_name == github.repository + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ github.head_ref }} + fetch-depth: 0 + + - name: Detect changed Python files + id: py_changes + uses: tj-actions/changed-files@v45 + with: + separator: "\n" + files: | + **/*.py + files_ignore: | + docker/dask-gateway-server/** + docs/** + docs/source/demos/** + docker/kaniko-build-jobs/** + slurm/** + .cursor/** + .git/** + + - name: Detect changed shell files + id: sh_changes + uses: tj-actions/changed-files@v45 + with: + separator: "\n" + files: | + **/*.sh + **/pixi-wrapper + **/fix-permissions + files_ignore: | + docker/dask-gateway-server/** + docs/** + docs/source/demos/** + docker/kaniko-build-jobs/** + slurm/** + .cursor/** + .git/** + + - name: Detect changed JSON/YAML files + id: data_changes + uses: tj-actions/changed-files@v45 + with: + separator: "\n" + files: | + **/*.json + **/*.yml + **/*.yaml + files_ignore: | + docker/dask-gateway-server/** + docs/** + docs/source/demos/** + docker/kaniko-build-jobs/** + slurm/** + .cursor/** + .git/** + .github/workflows/** + + - name: Set up Python + if: steps.py_changes.outputs.any_changed == 'true' + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Auto-format Python files + if: steps.py_changes.outputs.any_changed == 'true' + shell: bash + run: | + set -euo pipefail + python -m pip install --upgrade pip + pip install black isort + + mapfile -t py_files <<'EOF' + ${{ steps.py_changes.outputs.all_changed_files }} + EOF + + files=() + for f in "${py_files[@]}"; do + [ -f "$f" ] && files+=("$f") + done + + if [ "${#files[@]}" -gt 0 ]; then + black "${files[@]}" + isort --profile black "${files[@]}" + fi + + - name: Install shell formatter + if: steps.sh_changes.outputs.any_changed == 'true' + shell: bash + run: | + set -euo pipefail + SHFMT_VERSION=3.10.0 + curl -fsSL "https://github.com/mvdan/sh/releases/download/v${SHFMT_VERSION}/shfmt_v${SHFMT_VERSION}_linux_amd64" -o /tmp/shfmt + chmod +x /tmp/shfmt + sudo mv /tmp/shfmt /usr/local/bin/shfmt + + - name: Auto-format shell files + if: steps.sh_changes.outputs.any_changed == 'true' + shell: bash + run: | + set -euo pipefail + + mapfile -t sh_files <<'EOF' + ${{ steps.sh_changes.outputs.all_changed_files }} + EOF + + files=() + for f in "${sh_files[@]}"; do + [ -f "$f" ] && files+=("$f") + done + + if [ "${#files[@]}" -gt 0 ]; then + shfmt -w "${files[@]}" + fi + + - name: Set up Node.js + if: steps.data_changes.outputs.any_changed == 'true' + uses: actions/setup-node@v4 + with: + node-version: '20' + + - name: Auto-format JSON/YAML files + if: steps.data_changes.outputs.any_changed == 'true' + shell: bash + run: | + set -euo pipefail + npm install --global prettier + + mapfile -t data_files <<'EOF' + ${{ steps.data_changes.outputs.all_changed_files }} + EOF + + files=() + for f in "${data_files[@]}"; do + [ -f "$f" ] && files+=("$f") + done + + if [ "${#files[@]}" -gt 0 ]; then + prettier --write "${files[@]}" + fi + + - name: Commit and push formatting fixes + shell: bash + run: | + set -euo pipefail + if git diff --quiet; then + echo "No formatter changes to commit." + exit 0 + fi + + git config user.name "github-actions[bot]" + git config user.email "41898282+github-actions[bot]@users.noreply.github.com" + git add -A + git commit -m "ci: auto-format fixable lint issues" + git push origin "HEAD:${{ github.head_ref }}" diff --git a/tests/integration/test_container_smoke_matrix.py b/tests/integration/test_container_smoke_matrix.py index 5bec78a4..1ed3a879 100644 --- a/tests/integration/test_container_smoke_matrix.py +++ b/tests/integration/test_container_smoke_matrix.py @@ -1,10 +1,10 @@ from __future__ import annotations import os -from pathlib import Path import subprocess import tempfile import unittest +from pathlib import Path from common import REPO_ROOT, load_json_fixture diff --git a/tests/integration/test_monitoring_metric_update.py b/tests/integration/test_monitoring_metric_update.py index 05b3a6fe..a647ffcb 100644 --- a/tests/integration/test_monitoring_metric_update.py +++ b/tests/integration/test_monitoring_metric_update.py @@ -1,7 +1,7 @@ from __future__ import annotations -from pathlib import Path import unittest +from pathlib import Path from unittest import mock from uuid import uuid4 From ef3821bd0d9326e2ba8d5ba292ba899fabd6cb5a Mon Sep 17 00:00:00 2001 From: Dmitry Kondratyev Date: Wed, 4 Feb 2026 11:35:35 -0500 Subject: [PATCH 15/25] ci: harden autofix workflow push ref handling --- .github/workflows/ci-format-autofix.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci-format-autofix.yml b/.github/workflows/ci-format-autofix.yml index 9a97a35c..f97b9fee 100644 --- a/.github/workflows/ci-format-autofix.yml +++ b/.github/workflows/ci-format-autofix.yml @@ -158,6 +158,8 @@ jobs: fi - name: Commit and push formatting fixes + env: + PR_HEAD_REF: ${{ github.head_ref }} shell: bash run: | set -euo pipefail @@ -170,4 +172,4 @@ jobs: git config user.email "41898282+github-actions[bot]@users.noreply.github.com" git add -A git commit -m "ci: auto-format fixable lint issues" - git push origin "HEAD:${{ github.head_ref }}" + git push origin "HEAD:${PR_HEAD_REF}" From 15fad76b3882ad630476284cdccb46796a4661d8 Mon Sep 17 00:00:00 2001 From: Dmitry Kondratyev Date: Wed, 4 Feb 2026 11:48:47 -0500 Subject: [PATCH 16/25] docs: modernize CI badges with status and policy signal --- README.md | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 3e7ec8f2..a063fd08 100644 --- a/README.md +++ b/README.md @@ -18,8 +18,17 @@ Each user is provided with a 25GB home directory at first login. These directori [![Documentation Status](https://readthedocs.org/projects/purdue-af/badge/?version=latest)](https://purdue-af.readthedocs.io/en/latest/?badge=latest) -[![Workflow Integrity](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-workflow-integrity.yml/badge.svg)](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-workflow-integrity.yml) -[![Repo Quality](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-repo-quality.yml/badge.svg)](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-repo-quality.yml) -[![Container Reliability](https://github.com/PurdueAF/purdue-af/actions/workflows/lint-docker.yml/badge.svg)](https://github.com/PurdueAF/purdue-af/actions/workflows/lint-docker.yml) -[![GitOps Deployability](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-gitops-deployability.yml/badge.svg)](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-gitops-deployability.yml) -[![Nightly Security](https://github.com/PurdueAF/purdue-af/actions/workflows/nightly-security-advisory.yml/badge.svg)](https://github.com/PurdueAF/purdue-af/actions/workflows/nightly-security-advisory.yml) +### Runtime Status + +[![Workflow Integrity](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-workflow-integrity.yml/badge.svg?branch=main)](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-workflow-integrity.yml) +[![Repo Quality](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-repo-quality.yml/badge.svg?branch=main)](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-repo-quality.yml) +[![Container Reliability](https://github.com/PurdueAF/purdue-af/actions/workflows/lint-docker.yml/badge.svg?branch=main)](https://github.com/PurdueAF/purdue-af/actions/workflows/lint-docker.yml) +[![GitOps Deployability](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-gitops-deployability.yml/badge.svg?branch=main)](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-gitops-deployability.yml) +[![Nightly Security](https://github.com/PurdueAF/purdue-af/actions/workflows/nightly-security-advisory.yml/badge.svg?branch=main)](https://github.com/PurdueAF/purdue-af/actions/workflows/nightly-security-advisory.yml) + +### Policy Badges + +[![Coverage Gate](https://img.shields.io/badge/Coverage%20Gate-%3E%3D70%25%20%28advisory%29-4c1)](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-repo-quality.yml) +[![Security Scans](https://img.shields.io/badge/Security%20Scans-PR%20%2B%20Nightly-0366d6)](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-security-advisory.yml) +[![Validation Mode](https://img.shields.io/badge/Validation%20Mode-Advisory--first-f59e0b)](https://github.com/PurdueAF/purdue-af/actions) +[![Autofix](https://img.shields.io/badge/Autofix-Python%2FShell%2FJSON%2FYAML-7c3aed)](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-format-autofix.yml) From 6304b85c26a5edd2dc8da93095e4ee44a3c88463 Mon Sep 17 00:00:00 2001 From: Dmitry Kondratyev Date: Wed, 4 Feb 2026 11:49:39 -0500 Subject: [PATCH 17/25] ci: trim duplicate checks and path-scope fast lint workflows --- .github/workflows/ci-repo-quality.yml | 45 ++++++--------------- .github/workflows/ci-workflow-integrity.yml | 6 +++ .github/workflows/lint-json.yml | 13 ++++++ .github/workflows/lint-python.yml | 13 ++++++ .github/workflows/lint-shell.yml | 15 +++++++ .github/workflows/lint-yaml.yml | 16 ++++++++ 6 files changed, 75 insertions(+), 33 deletions(-) diff --git a/.github/workflows/ci-repo-quality.yml b/.github/workflows/ci-repo-quality.yml index 4dea93df..06a96532 100644 --- a/.github/workflows/ci-repo-quality.yml +++ b/.github/workflows/ci-repo-quality.yml @@ -2,6 +2,18 @@ name: CI Repo Quality on: pull_request: + paths: + - 'tests/unit/**' + - 'tests/conftest.py' + - 'apps/monitoring/af-monitoring/metrics_server.py' + - 'apps/monitoring/af-monitoring/node_healthcheck.py' + - 'docker/af-pod-monitor/pod-metrics-exporter.py' + - 'docker/purdue-af/jupyter/docker_healthcheck.py' + - '.github/workflows/ci-repo-quality.yml' + +concurrency: + group: ci-repo-quality-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true permissions: contents: read @@ -24,39 +36,6 @@ jobs: python -m pip install --upgrade pip pip install pytest coverage - - name: Python syntax smoke (advisory) - shell: bash - run: | - set -euo pipefail - mapfile -t py_files < <(find . -type f -name '*.py' \ - -not -path './docker/dask-gateway-server/*' \ - -not -path './docker/kaniko-build-jobs/*' \ - -not -path './docs/*' \ - -not -path './slurm/*' \ - -not -path './.cursor/*' \ - -not -path './.git/*' | sort) - - if [ "${#py_files[@]}" -gt 0 ]; then - python -m py_compile "${py_files[@]}" - fi - - - name: Shell syntax smoke (advisory) - shell: bash - run: | - set -euo pipefail - mapfile -t sh_files < <(find . -type f \ - \( -name '*.sh' -o -name 'pixi-wrapper' -o -name 'fix-permissions' \) \ - -not -path './docker/dask-gateway-server/*' \ - -not -path './docker/kaniko-build-jobs/*' \ - -not -path './docs/*' \ - -not -path './slurm/*' \ - -not -path './.cursor/*' \ - -not -path './.git/*' | sort) - - for f in "${sh_files[@]}"; do - bash -n "$f" - done - - name: Run pytest (advisory) shell: bash run: | diff --git a/.github/workflows/ci-workflow-integrity.yml b/.github/workflows/ci-workflow-integrity.yml index deacfa94..c3b30c66 100644 --- a/.github/workflows/ci-workflow-integrity.yml +++ b/.github/workflows/ci-workflow-integrity.yml @@ -2,6 +2,12 @@ name: CI Workflow Integrity on: pull_request: + paths: + - '.github/workflows/**' + +concurrency: + group: ci-workflow-integrity-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true permissions: contents: read diff --git a/.github/workflows/lint-json.yml b/.github/workflows/lint-json.yml index d223cfe7..1c5d56f2 100644 --- a/.github/workflows/lint-json.yml +++ b/.github/workflows/lint-json.yml @@ -2,6 +2,19 @@ name: Lint JSON on: pull_request: + paths: + - '**/*.json' + - '.github/workflows/lint-json.yml' + - '!docker/dask-gateway-server/**' + - '!docker/kaniko-build-jobs/**' + - '!docs/**' + - '!slurm/**' + - '!.cursor/**' + - '!.git/**' + +concurrency: + group: lint-json-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true permissions: contents: read diff --git a/.github/workflows/lint-python.yml b/.github/workflows/lint-python.yml index 80dd38aa..149a4c82 100644 --- a/.github/workflows/lint-python.yml +++ b/.github/workflows/lint-python.yml @@ -2,6 +2,19 @@ name: Lint Python on: pull_request: + paths: + - '**/*.py' + - '.github/workflows/lint-python.yml' + - '!docker/dask-gateway-server/**' + - '!docker/kaniko-build-jobs/**' + - '!docs/**' + - '!slurm/**' + - '!.cursor/**' + - '!.git/**' + +concurrency: + group: lint-python-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true permissions: contents: read diff --git a/.github/workflows/lint-shell.yml b/.github/workflows/lint-shell.yml index d12762ed..030dcf60 100644 --- a/.github/workflows/lint-shell.yml +++ b/.github/workflows/lint-shell.yml @@ -2,6 +2,21 @@ name: Lint Shell Scripts on: pull_request: + paths: + - '**/*.sh' + - '**/pixi-wrapper' + - '**/fix-permissions' + - '.github/workflows/lint-shell.yml' + - '!docker/dask-gateway-server/**' + - '!docker/kaniko-build-jobs/**' + - '!docs/**' + - '!slurm/**' + - '!.cursor/**' + - '!.git/**' + +concurrency: + group: lint-shell-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true permissions: contents: read diff --git a/.github/workflows/lint-yaml.yml b/.github/workflows/lint-yaml.yml index 6158c887..44edbd9b 100644 --- a/.github/workflows/lint-yaml.yml +++ b/.github/workflows/lint-yaml.yml @@ -2,6 +2,21 @@ name: Lint YAML on: pull_request: + paths: + - '**/*.yml' + - '**/*.yaml' + - '!docker/dask-gateway-server/**' + - '!docker/kaniko-build-jobs/**' + - '!docs/**' + - '!slurm/**' + - '!.cursor/**' + - '!.git/**' + - '!.github/workflows/**' + - '.github/workflows/lint-yaml.yml' + +concurrency: + group: lint-yaml-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true permissions: contents: read @@ -39,6 +54,7 @@ jobs: Path('slurm'), Path('.cursor'), Path('.git'), + Path('.github/workflows'), ) filtered = [] From 237dc69183a4129c05480c784e33aa709ec4db08 Mon Sep 17 00:00:00 2001 From: Dmitry Kondratyev Date: Wed, 4 Feb 2026 11:51:34 -0500 Subject: [PATCH 18/25] ci: harden runtime and security workflow execution behavior --- .github/workflows/ci-gitops-deployability.yml | 52 +++++++++++ .github/workflows/ci-security-advisory.yml | 82 +++++++++++++++-- .github/workflows/lint-docker.yml | 89 +++++++++++++++++++ .../workflows/nightly-security-advisory.yml | 16 ++++ 4 files changed, 234 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci-gitops-deployability.yml b/.github/workflows/ci-gitops-deployability.yml index d83902b0..f37b5a53 100644 --- a/.github/workflows/ci-gitops-deployability.yml +++ b/.github/workflows/ci-gitops-deployability.yml @@ -16,6 +16,7 @@ permissions: jobs: detect-gitops-changes: runs-on: ubuntu-latest + timeout-minutes: 5 outputs: run_all: ${{ steps.scope.outputs.run_all }} core_production: ${{ steps.filter.outputs.core_production }} @@ -56,10 +57,48 @@ jobs: echo "run_all=false" >> "$GITHUB_OUTPUT" fi + - name: Publish GitOps validation plan + if: always() + shell: bash + run: | + set -euo pipefail + run_all="${{ steps.scope.outputs.run_all }}" + { + echo '### GitOps Deployability Plan' + echo + echo "- Full overlay run: \`$run_all\`" + echo + echo '| Overlay | Decision |' + echo '|---|---|' + if [ "$run_all" = 'true' ] || [ "${{ steps.filter.outputs.core_production }}" = 'true' ]; then + echo '| `deploy/core-production` | run |' + else + echo '| `deploy/core-production` | skipped |' + fi + if [ "$run_all" = 'true' ] || [ "${{ steps.filter.outputs.core_staging }}" = 'true' ]; then + echo '| `deploy/core-staging` | run |' + else + echo '| `deploy/core-staging` | skipped |' + fi + if [ "$run_all" = 'true' ] || [ "${{ steps.filter.outputs.core_geddes2 }}" = 'true' ]; then + echo '| `deploy/core-geddes2` | run |' + else + echo '| `deploy/core-geddes2` | skipped |' + fi + if [ "$run_all" = 'true' ] || [ "${{ steps.filter.outputs.experimental }}" = 'true' ]; then + echo '| `deploy/experimental` | run |' + else + echo '| `deploy/experimental` | skipped |' + fi + echo + echo '- Mode: advisory (gitops-validate uses continue-on-error).' + } >> "$GITHUB_STEP_SUMMARY" + gitops-validate: needs: detect-gitops-changes if: needs.detect-gitops-changes.outputs.run_all == 'true' || needs.detect-gitops-changes.outputs.core_production == 'true' || needs.detect-gitops-changes.outputs.core_staging == 'true' || needs.detect-gitops-changes.outputs.core_geddes2 == 'true' || needs.detect-gitops-changes.outputs.experimental == 'true' runs-on: ubuntu-latest + timeout-minutes: 25 continue-on-error: true steps: - uses: actions/checkout@v4 @@ -110,6 +149,7 @@ jobs: echo '### GitOps Deployability Summary' echo echo '- No in-scope overlay changes detected.' + echo '- Mode: advisory (job continue-on-error=true).' } >> "$GITHUB_STEP_SUMMARY" exit 0 fi @@ -145,4 +185,16 @@ jobs: echo "| \`$overlay\` | $render_status | $kubeconform_status |" >> "$GITHUB_STEP_SUMMARY" done + if [ "$status" -eq 0 ]; then + overall_result='pass' + else + overall_result='issues-detected' + fi + + { + echo + echo "- Overall result: **$overall_result**" + echo '- Mode: advisory (job continue-on-error=true).' + } >> "$GITHUB_STEP_SUMMARY" + exit "$status" diff --git a/.github/workflows/ci-security-advisory.yml b/.github/workflows/ci-security-advisory.yml index 953a65c0..2aa7d146 100644 --- a/.github/workflows/ci-security-advisory.yml +++ b/.github/workflows/ci-security-advisory.yml @@ -25,6 +25,7 @@ permissions: jobs: detect-security-scope: runs-on: ubuntu-latest + timeout-minutes: 5 outputs: vuln_surface: ${{ steps.filter.outputs.vuln_surface }} config_surface: ${{ steps.filter.outputs.config_surface }} @@ -49,15 +50,41 @@ jobs: - 'docker/**' - '.github/workflows/**' + - name: Publish security scan plan + if: always() + shell: bash + run: | + set -euo pipefail + { + echo '### Security Advisory Scan Plan' + echo + echo '| Scan | Decision |' + echo '|---|---|' + if [ "${{ steps.filter.outputs.vuln_surface }}" = 'true' ]; then + echo '| Filesystem vulnerability scan | run |' + else + echo '| Filesystem vulnerability scan | skipped |' + fi + if [ "${{ steps.filter.outputs.config_surface }}" = 'true' ]; then + echo '| Configuration misconfiguration scan | run |' + else + echo '| Configuration misconfiguration scan | skipped |' + fi + echo + echo '- Workflow mode: advisory (scan job uses continue-on-error).' + } >> "$GITHUB_STEP_SUMMARY" + trivy-security-advisory: needs: detect-security-scope if: needs.detect-security-scope.outputs.vuln_surface == 'true' || needs.detect-security-scope.outputs.config_surface == 'true' runs-on: ubuntu-latest + timeout-minutes: 30 continue-on-error: true steps: - uses: actions/checkout@v4 - name: Run Trivy filesystem vulnerability scan (advisory) + id: fs_scan if: needs.detect-security-scope.outputs.vuln_surface == 'true' continue-on-error: true uses: aquasecurity/trivy-action@0.33.1 @@ -73,6 +100,7 @@ jobs: skip-dirs: docker/dask-gateway-server,docs,docs/source/demos,docker/kaniko-build-jobs,slurm,.cursor - name: Run Trivy configuration scan (advisory) + id: config_scan if: needs.detect-security-scope.outputs.config_surface == 'true' continue-on-error: true uses: aquasecurity/trivy-action@0.33.1 @@ -88,6 +116,11 @@ jobs: - name: Publish PR Trivy summary (advisory) if: always() shell: bash + env: + VULN_SURFACE: ${{ needs.detect-security-scope.outputs.vuln_surface }} + CONFIG_SURFACE: ${{ needs.detect-security-scope.outputs.config_surface }} + FS_SCAN_OUTCOME: ${{ steps.fs_scan.outcome || 'skipped' }} + CONFIG_SCAN_OUTCOME: ${{ steps.config_scan.outcome || 'skipped' }} run: | set -euo pipefail python3 - <<'PY' @@ -98,18 +131,44 @@ jobs: summary_path = Path(os.environ['GITHUB_STEP_SUMMARY']) reports = ( - ('Filesystem vulnerability scan', Path('trivy-pr-fs.json'), 'Vulnerabilities'), - ('Configuration misconfiguration scan', Path('trivy-pr-config.json'), 'Misconfigurations'), + ( + 'Filesystem vulnerability scan', + Path('trivy-pr-fs.json'), + 'Vulnerabilities', + os.environ.get('VULN_SURFACE', 'false') == 'true', + os.environ.get('FS_SCAN_OUTCOME', 'skipped'), + ), + ( + 'Configuration misconfiguration scan', + Path('trivy-pr-config.json'), + 'Misconfigurations', + os.environ.get('CONFIG_SURFACE', 'false') == 'true', + os.environ.get('CONFIG_SCAN_OUTCOME', 'skipped'), + ), ) total_high_critical = 0 + missing_reports = 0 with summary_path.open('a', encoding='utf-8') as summary: summary.write('### PR Trivy Advisory Summary\n\n') + summary.write('| Scan | Scope | Step outcome | Report |\n') + summary.write('|---|---|---|---|\n') + + for label, report_path, _, in_scope, outcome in reports: + scope_status = 'run' if in_scope else 'skipped' + report_status = 'present' if report_path.exists() else 'missing' + summary.write(f'| {label} | {scope_status} | `{outcome}` | {report_status} |\n') + if in_scope and not report_path.exists(): + missing_reports += 1 + + for label, report_path, finding_key, in_scope, _ in reports: + if not in_scope: + summary.write(f'\n#### {label}\n\nSkipped by path scope.\n') + continue - for label, report_path, finding_key in reports: if not report_path.exists(): - summary.write(f'- {label}: skipped (out of scope)\n') + summary.write(f'\n#### {label}\n\nReport missing (scan did not produce expected output).\n') continue payload = json.loads(report_path.read_text(encoding='utf-8')) @@ -148,7 +207,20 @@ jobs: for target, count in target_counts.most_common(10): summary.write(f'| `{target}` | {count} |\n') - if total_high_critical > 0: + if missing_reports > 0: + overall_result = 'report-missing' + elif total_high_critical > 0: + overall_result = 'findings-detected' + else: + overall_result = 'clear' + + summary.write('\n') + summary.write(f'- Overall result: **{overall_result}**\n') + summary.write('- Mode: advisory (job continue-on-error=true).\n') + + if missing_reports > 0: + print(f'::warning::PR Trivy missing report files: {missing_reports}. See summary and artifacts.') + elif total_high_critical > 0: print(f'::warning::PR Trivy found {total_high_critical} HIGH/CRITICAL findings. See summary and artifacts.') else: print('::notice::PR Trivy found no HIGH/CRITICAL findings in scope.') diff --git a/.github/workflows/lint-docker.yml b/.github/workflows/lint-docker.yml index da7fe070..37678f98 100644 --- a/.github/workflows/lint-docker.yml +++ b/.github/workflows/lint-docker.yml @@ -20,6 +20,7 @@ permissions: jobs: detect-docker-changes: runs-on: ubuntu-latest + timeout-minutes: 5 outputs: dockerfiles: ${{ steps.filter.outputs.dockerfiles }} af_pod_monitor: ${{ steps.filter.outputs.af_pod_monitor }} @@ -49,10 +50,45 @@ jobs: - 'slurm/**' - '.github/scripts/container-smoke.sh' + - name: Publish container reliability plan + if: always() + shell: bash + run: | + set -euo pipefail + { + echo '### Container Reliability Plan' + echo + echo '| Check | Decision |' + echo '|---|---|' + if [ "${{ steps.filter.outputs.dockerfiles }}" = 'true' ]; then + echo '| Dockerfile lint | run |' + else + echo '| Dockerfile lint | skipped |' + fi + if [ "${{ steps.filter.outputs.af_pod_monitor }}" = 'true' ]; then + echo '| af-pod-monitor build/smoke | run |' + else + echo '| af-pod-monitor build/smoke | skipped |' + fi + if [ "${{ steps.filter.outputs.interlink_slurm_plugin }}" = 'true' ]; then + echo '| interlink-slurm-plugin build/smoke | run |' + else + echo '| interlink-slurm-plugin build/smoke | skipped |' + fi + if [ "${{ steps.filter.outputs.purdue_af }}" = 'true' ]; then + echo '| purdue-af build/smoke | run |' + else + echo '| purdue-af build/smoke | skipped |' + fi + echo + echo '- Workflow mode: advisory (all jobs use continue-on-error).' + } >> "$GITHUB_STEP_SUMMARY" + lint-dockerfiles: needs: detect-docker-changes if: needs.detect-docker-changes.outputs.dockerfiles == 'true' runs-on: ubuntu-latest + timeout-minutes: 12 continue-on-error: true steps: - uses: actions/checkout@v4 @@ -66,16 +102,28 @@ jobs: sudo mv /tmp/hadolint /usr/local/bin/hadolint - name: Run hadolint (check-only, advisory) + id: hadolint run: | set -euo pipefail hadolint --ignore DL3041 --ignore DL3033 --failure-threshold warning docker/af-pod-monitor/Dockerfile hadolint --ignore DL3041 --ignore DL3033 --failure-threshold warning docker/interlink-slurm-plugin/Dockerfile.alma8 hadolint --ignore DL3041 --ignore DL3033 --failure-threshold warning docker/purdue-af/Dockerfile + - name: Publish Dockerfile lint advisory summary + if: always() + run: | + { + echo '### Dockerfile Lint Advisory Summary' + echo + echo "- Hadolint outcome: `${{ steps.hadolint.outcome }}`" + echo '- Mode: advisory (job continue-on-error=true).' + } >> "$GITHUB_STEP_SUMMARY" + build-af-pod-monitor: needs: detect-docker-changes if: needs.detect-docker-changes.outputs.af_pod_monitor == 'true' runs-on: ubuntu-latest + timeout-minutes: 35 continue-on-error: true steps: - uses: actions/checkout@v4 @@ -84,6 +132,7 @@ jobs: uses: docker/setup-buildx-action@v3 - name: Build af-pod-monitor image with cache (advisory) + id: build_image uses: docker/build-push-action@v6 with: context: docker/af-pod-monitor @@ -95,12 +144,25 @@ jobs: provenance: false - name: Smoke test af-pod-monitor image (advisory) + id: smoke_test run: .github/scripts/container-smoke.sh local/af-pod-monitor:${{ github.sha }} af-pod-monitor + - name: Publish af-pod-monitor advisory summary + if: always() + run: | + { + echo '### af-pod-monitor Container Advisory Summary' + echo + echo "- Build outcome: `${{ steps.build_image.outcome }}`" + echo "- Smoke test outcome: `${{ steps.smoke_test.outcome }}`" + echo '- Mode: advisory (job continue-on-error=true).' + } >> "$GITHUB_STEP_SUMMARY" + build-interlink-slurm-plugin: needs: detect-docker-changes if: needs.detect-docker-changes.outputs.interlink_slurm_plugin == 'true' runs-on: ubuntu-latest + timeout-minutes: 35 continue-on-error: true steps: - uses: actions/checkout@v4 @@ -109,6 +171,7 @@ jobs: uses: docker/setup-buildx-action@v3 - name: Build interlink-slurm-plugin image with cache (advisory) + id: build_image uses: docker/build-push-action@v6 with: context: . @@ -120,12 +183,25 @@ jobs: provenance: false - name: Smoke test interlink-slurm-plugin image (advisory) + id: smoke_test run: .github/scripts/container-smoke.sh local/interlink-slurm-plugin:${{ github.sha }} interlink-slurm-plugin + - name: Publish interlink-slurm-plugin advisory summary + if: always() + run: | + { + echo '### interlink-slurm-plugin Container Advisory Summary' + echo + echo "- Build outcome: `${{ steps.build_image.outcome }}`" + echo "- Smoke test outcome: `${{ steps.smoke_test.outcome }}`" + echo '- Mode: advisory (job continue-on-error=true).' + } >> "$GITHUB_STEP_SUMMARY" + build-purdue-af: needs: detect-docker-changes if: needs.detect-docker-changes.outputs.purdue_af == 'true' runs-on: ubuntu-latest + timeout-minutes: 35 continue-on-error: true steps: - uses: actions/checkout@v4 @@ -134,6 +210,7 @@ jobs: uses: docker/setup-buildx-action@v3 - name: Build purdue-af image with cache (advisory) + id: build_image uses: docker/build-push-action@v6 with: context: . @@ -145,4 +222,16 @@ jobs: provenance: false - name: Smoke test purdue-af image (advisory) + id: smoke_test run: .github/scripts/container-smoke.sh local/purdue-af:${{ github.sha }} purdue-af + + - name: Publish purdue-af advisory summary + if: always() + run: | + { + echo '### purdue-af Container Advisory Summary' + echo + echo "- Build outcome: `${{ steps.build_image.outcome }}`" + echo "- Smoke test outcome: `${{ steps.smoke_test.outcome }}`" + echo '- Mode: advisory (job continue-on-error=true).' + } >> "$GITHUB_STEP_SUMMARY" diff --git a/.github/workflows/nightly-security-advisory.yml b/.github/workflows/nightly-security-advisory.yml index 9d7fa398..c61012e5 100644 --- a/.github/workflows/nightly-security-advisory.yml +++ b/.github/workflows/nightly-security-advisory.yml @@ -15,11 +15,13 @@ permissions: jobs: trivy-filesystem: runs-on: ubuntu-latest + timeout-minutes: 30 continue-on-error: true steps: - uses: actions/checkout@v4 - name: Run Trivy filesystem scan (advisory) + id: fs_scan uses: aquasecurity/trivy-action@0.33.1 with: scan-type: fs @@ -35,6 +37,8 @@ jobs: - name: Publish nightly Trivy summary (advisory) if: always() shell: bash + env: + FS_SCAN_OUTCOME: ${{ steps.fs_scan.outcome || 'unknown' }} run: | set -euo pipefail python3 - <<'PY' @@ -46,12 +50,18 @@ jobs: report_path = Path('trivy-nightly-fs.json') summary_path = Path(os.environ['GITHUB_STEP_SUMMARY']) title = 'Nightly Trivy Vulnerability Summary' + scan_outcome = os.environ.get('FS_SCAN_OUTCOME', 'unknown') with summary_path.open('a', encoding='utf-8') as summary: summary.write(f'### {title}\n\n') + summary.write('| Scan | Step outcome | Report |\n') + summary.write('|---|---|---|\n') + summary.write(f'| Filesystem vulnerability scan | `{scan_outcome}` | {"present" if report_path.exists() else "missing"} |\n\n') if not report_path.exists(): summary.write('- Trivy report was not generated.\n') + summary.write('- Overall result: **scan-step-failed**\n') + summary.write('- Mode: advisory (job continue-on-error=true).\n') print('::warning::Nightly Trivy report was not generated.') raise SystemExit(0) @@ -75,6 +85,8 @@ jobs: if high_critical == 0: summary.write('No HIGH/CRITICAL vulnerabilities found in scope.\n') + summary.write('- Overall result: **clear**\n') + summary.write('- Mode: advisory (job continue-on-error=true).\n') print('::notice::Nightly Trivy found no HIGH/CRITICAL vulnerabilities.') raise SystemExit(0) @@ -90,6 +102,10 @@ jobs: for target, count in target_counts.most_common(10): summary.write(f'| `{target}` | {count} |\n') + summary.write('\n') + summary.write('- Overall result: **findings-detected**\n') + summary.write('- Mode: advisory (job continue-on-error=true).\n') + print(f'::warning::Nightly Trivy found {high_critical} HIGH/CRITICAL vulnerabilities. See summary and artifact.') PY From 5cb774d195199ef2a99fffbaae8b47e87aaad454 Mon Sep 17 00:00:00 2001 From: Dmitry Kondratyev Date: Wed, 4 Feb 2026 12:00:27 -0500 Subject: [PATCH 19/25] ci: path-scope and harden autofix workflow runtime behavior --- .github/workflows/ci-format-autofix.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.github/workflows/ci-format-autofix.yml b/.github/workflows/ci-format-autofix.yml index f97b9fee..8902c6cb 100644 --- a/.github/workflows/ci-format-autofix.yml +++ b/.github/workflows/ci-format-autofix.yml @@ -3,6 +3,15 @@ name: CI Format Autofix on: pull_request: types: [opened, synchronize, reopened] + paths: + - '**/*.py' + - '**/*.sh' + - '**/*.json' + - '**/*.yml' + - '**/*.yaml' + - '**/pixi-wrapper' + - '**/fix-permissions' + - '.github/workflows/ci-format-autofix.yml' concurrency: group: ci-format-autofix-${{ github.event.pull_request.number || github.ref }} @@ -15,6 +24,7 @@ jobs: autofix-format: if: github.event.pull_request.head.repo.full_name == github.repository runs-on: ubuntu-latest + timeout-minutes: 15 steps: - uses: actions/checkout@v4 with: From 61f918baf22e92ed87c8801bbc557f0d589f3292 Mon Sep 17 00:00:00 2001 From: Dmitry Kondratyev Date: Wed, 4 Feb 2026 12:01:18 -0500 Subject: [PATCH 20/25] docs: elevate CI section with expanded runtime badges and CI profile table --- README.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/README.md b/README.md index a063fd08..926e2e30 100644 --- a/README.md +++ b/README.md @@ -22,8 +22,11 @@ Each user is provided with a 25GB home directory at first login. These directori [![Workflow Integrity](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-workflow-integrity.yml/badge.svg?branch=main)](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-workflow-integrity.yml) [![Repo Quality](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-repo-quality.yml/badge.svg?branch=main)](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-repo-quality.yml) +[![CI Format Autofix](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-format-autofix.yml/badge.svg?branch=main)](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-format-autofix.yml) +[![CI Integration Scenarios](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-integration-scenarios.yml/badge.svg?branch=main)](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-integration-scenarios.yml) [![Container Reliability](https://github.com/PurdueAF/purdue-af/actions/workflows/lint-docker.yml/badge.svg?branch=main)](https://github.com/PurdueAF/purdue-af/actions/workflows/lint-docker.yml) [![GitOps Deployability](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-gitops-deployability.yml/badge.svg?branch=main)](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-gitops-deployability.yml) +[![CI Security Advisory](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-security-advisory.yml/badge.svg?branch=main)](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-security-advisory.yml) [![Nightly Security](https://github.com/PurdueAF/purdue-af/actions/workflows/nightly-security-advisory.yml/badge.svg?branch=main)](https://github.com/PurdueAF/purdue-af/actions/workflows/nightly-security-advisory.yml) ### Policy Badges @@ -32,3 +35,16 @@ Each user is provided with a 25GB home directory at first login. These directori [![Security Scans](https://img.shields.io/badge/Security%20Scans-PR%20%2B%20Nightly-0366d6)](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-security-advisory.yml) [![Validation Mode](https://img.shields.io/badge/Validation%20Mode-Advisory--first-f59e0b)](https://github.com/PurdueAF/purdue-af/actions) [![Autofix](https://img.shields.io/badge/Autofix-Python%2FShell%2FJSON%2FYAML-7c3aed)](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-format-autofix.yml) + +### CI Profile + +| Signal | Workflow | Trigger | Mode (advisory/blocking) | Notes | +|---|---|---|---|---| +| Workflow integrity | [CI Workflow Integrity](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-workflow-integrity.yml) | Pull request (`.github/workflows/**`) | advisory | Actionlint + workflow YAML parse | +| Repo quality | [CI Repo Quality](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-repo-quality.yml) | Pull request (unit/runtime paths) | advisory | Unit tests + 70% coverage policy signal | +| Format autofix | [CI Format Autofix](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-format-autofix.yml) | Pull request open/sync/reopen (format-targeted paths) | advisory | Auto-formats and pushes fix commits to PR branch | +| Integration scenarios | [CI Integration Scenarios](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-integration-scenarios.yml) | Pull request (integration paths) | advisory | Scripted integration scenario run | +| Container reliability | [Container Reliability](https://github.com/PurdueAF/purdue-af/actions/workflows/lint-docker.yml) | Pull request (container/slurm paths) | advisory | Hadolint + image build/smoke checks | +| GitOps deployability | [CI GitOps Deployability](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-gitops-deployability.yml) | Pull request (`deploy/**`) | advisory | Kustomize render + kubeconform validation | +| Security advisory (PR) | [CI Security Advisory](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-security-advisory.yml) | Pull request (security-relevant paths) + manual dispatch | advisory | Trivy vuln/config scans with summary + artifacts | +| Security advisory (nightly) | [Nightly Security Advisory](https://github.com/PurdueAF/purdue-af/actions/workflows/nightly-security-advisory.yml) | Nightly schedule + manual dispatch | advisory | Trivy filesystem scan with nightly summary | From dbe0c48f5873d306e1c2471cee6e3cbf9d514c3d Mon Sep 17 00:00:00 2001 From: Dmitry Kondratyev Date: Wed, 4 Feb 2026 12:02:52 -0500 Subject: [PATCH 21/25] ci: pin formatter and lint toolchain versions for deterministic runs --- .github/workflows/ci-format-autofix.yml | 32 ++++++++++++++++++++++--- .github/workflows/ci-repo-quality.yml | 11 ++++++++- .github/workflows/lint-python.yml | 11 ++++++++- .github/workflows/lint-shell.yml | 15 +++++++++--- 4 files changed, 61 insertions(+), 8 deletions(-) diff --git a/.github/workflows/ci-format-autofix.yml b/.github/workflows/ci-format-autofix.yml index 8902c6cb..4ae301aa 100644 --- a/.github/workflows/ci-format-autofix.yml +++ b/.github/workflows/ci-format-autofix.yml @@ -25,6 +25,11 @@ jobs: if: github.event.pull_request.head.repo.full_name == github.repository runs-on: ubuntu-latest timeout-minutes: 15 + env: + BLACK_VERSION: '24.10.0' + ISORT_VERSION: '5.13.2' + SHFMT_VERSION: '3.10.0' + PRETTIER_VERSION: '3.3.3' steps: - uses: actions/checkout@v4 with: @@ -96,7 +101,7 @@ jobs: run: | set -euo pipefail python -m pip install --upgrade pip - pip install black isort + pip install "black==${BLACK_VERSION}" "isort==${ISORT_VERSION}" mapfile -t py_files <<'EOF' ${{ steps.py_changes.outputs.all_changed_files }} @@ -112,16 +117,30 @@ jobs: isort --profile black "${files[@]}" fi + - name: Tool versions (Python formatters) + if: steps.py_changes.outputs.any_changed == 'true' + shell: bash + run: | + set -euo pipefail + black --version + isort --version-number + - name: Install shell formatter if: steps.sh_changes.outputs.any_changed == 'true' shell: bash run: | set -euo pipefail - SHFMT_VERSION=3.10.0 curl -fsSL "https://github.com/mvdan/sh/releases/download/v${SHFMT_VERSION}/shfmt_v${SHFMT_VERSION}_linux_amd64" -o /tmp/shfmt chmod +x /tmp/shfmt sudo mv /tmp/shfmt /usr/local/bin/shfmt + - name: Tool versions (Shell formatter) + if: steps.sh_changes.outputs.any_changed == 'true' + shell: bash + run: | + set -euo pipefail + shfmt --version + - name: Auto-format shell files if: steps.sh_changes.outputs.any_changed == 'true' shell: bash @@ -152,7 +171,7 @@ jobs: shell: bash run: | set -euo pipefail - npm install --global prettier + npm install --global "prettier@${PRETTIER_VERSION}" mapfile -t data_files <<'EOF' ${{ steps.data_changes.outputs.all_changed_files }} @@ -167,6 +186,13 @@ jobs: prettier --write "${files[@]}" fi + - name: Tool versions (Data formatter) + if: steps.data_changes.outputs.any_changed == 'true' + shell: bash + run: | + set -euo pipefail + prettier --version + - name: Commit and push formatting fixes env: PR_HEAD_REF: ${{ github.head_ref }} diff --git a/.github/workflows/ci-repo-quality.yml b/.github/workflows/ci-repo-quality.yml index 06a96532..3189f564 100644 --- a/.github/workflows/ci-repo-quality.yml +++ b/.github/workflows/ci-repo-quality.yml @@ -22,6 +22,9 @@ jobs: repo-quality: runs-on: ubuntu-latest continue-on-error: true + env: + PYTEST_VERSION: '8.4.0' + COVERAGE_VERSION: '7.6.1' steps: - uses: actions/checkout@v4 @@ -34,7 +37,13 @@ jobs: run: | set -euo pipefail python -m pip install --upgrade pip - pip install pytest coverage + pip install "pytest==${PYTEST_VERSION}" "coverage==${COVERAGE_VERSION}" + + - name: Tool versions + run: | + set -euo pipefail + pytest --version + python -m coverage --version - name: Run pytest (advisory) shell: bash diff --git a/.github/workflows/lint-python.yml b/.github/workflows/lint-python.yml index 149a4c82..1a3aae7f 100644 --- a/.github/workflows/lint-python.yml +++ b/.github/workflows/lint-python.yml @@ -23,6 +23,9 @@ jobs: lint-python: runs-on: ubuntu-latest continue-on-error: true + env: + BLACK_VERSION: '24.10.0' + ISORT_VERSION: '5.13.2' steps: - uses: actions/checkout@v4 @@ -35,7 +38,13 @@ jobs: run: | set -euo pipefail python -m pip install --upgrade pip - pip install black isort + pip install "black==${BLACK_VERSION}" "isort==${ISORT_VERSION}" + + - name: Tool versions + run: | + set -euo pipefail + black --version + isort --version-number - name: Run black/isort/py_compile (check-only, advisory) shell: bash diff --git a/.github/workflows/lint-shell.yml b/.github/workflows/lint-shell.yml index 030dcf60..f0e076e5 100644 --- a/.github/workflows/lint-shell.yml +++ b/.github/workflows/lint-shell.yml @@ -25,19 +25,28 @@ jobs: lint-shell: runs-on: ubuntu-latest continue-on-error: true + env: + SHELLCHECK_VERSION: '0.10.0' + SHFMT_VERSION: '3.10.0' steps: - uses: actions/checkout@v4 - name: Install shell tooling run: | set -euo pipefail - sudo apt-get update - sudo apt-get install -y shellcheck - SHFMT_VERSION=3.10.0 + curl -fsSL "https://github.com/koalaman/shellcheck/releases/download/v${SHELLCHECK_VERSION}/shellcheck-v${SHELLCHECK_VERSION}.linux.x86_64.tar.xz" -o /tmp/shellcheck.tar.xz + tar -xJf /tmp/shellcheck.tar.xz -C /tmp + sudo mv "/tmp/shellcheck-v${SHELLCHECK_VERSION}/shellcheck" /usr/local/bin/shellcheck curl -fsSL "https://github.com/mvdan/sh/releases/download/v${SHFMT_VERSION}/shfmt_v${SHFMT_VERSION}_linux_amd64" -o /tmp/shfmt chmod +x /tmp/shfmt sudo mv /tmp/shfmt /usr/local/bin/shfmt + - name: Tool versions + run: | + set -euo pipefail + shellcheck --version + shfmt --version + - name: Run shellcheck/shfmt/bash -n (check-only, advisory) shell: bash run: | From 903233f324550c1069aee3e1c372d483ed8ad2b8 Mon Sep 17 00:00:00 2001 From: Dmitry Kondratyev Date: Wed, 4 Feb 2026 12:22:31 -0500 Subject: [PATCH 22/25] docs(ci): sync CI plan with current runtime strategy --- .codex/CI_PLAN.md | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/.codex/CI_PLAN.md b/.codex/CI_PLAN.md index dbef85fc..12f6c491 100644 --- a/.codex/CI_PLAN.md +++ b/.codex/CI_PLAN.md @@ -6,8 +6,9 @@ Deliver one draft PR from `codex/ci` to `main` with a stable advisory-first CI b ## Current Status - PR branch: `codex/ci` - Delivery model: single PR `codex/ci -> main` -- Existing CI baseline is green on PR checks. -- Formatter/linter workflows are check-only (no CI writeback commits). +- PR #21 is open against `main` (not draft). +- Existing CI baseline is green on fast PR checks; container build jobs are the long pole. +- Lint workflows are check-only; formatter autofix workflow can commit formatting-only fixes to PR branches. ## Success Criteria - CI remains stable on `pull_request` runs for all configured workflows. @@ -60,21 +61,25 @@ Approved exception: ### B) Repo Quality and Tests (advisory) - Workflows: `lint-python.yml`, `lint-shell.yml`, `lint-json.yml`, `lint-yaml.yml`, `ci-format-autofix.yml`, `ci-repo-quality.yml`, `ci-integration-scenarios.yml` - Checks: black/isort check-only, py_compile, pytest unit advisory with coverage threshold, shellcheck/shfmt/bash -n, JSON/YAML parse, auto-format commits for changed Python/shell/JSON/YAML files, integration scenario matrix tests via mocked container/monitoring flows. +- Execution model: fast workflows are path-scoped with PR concurrency cancellation; formatter/lint tool versions are pinned for deterministic behavior. - Risk: script/runtime regressions. ### C) Container Reliability (advisory) - Workflow: `lint-docker.yml` - Checks: hadolint, targeted Docker Buildx jobs with GitHub Actions layer cache, smoke checks via `.github/scripts/container-smoke.sh`. +- Execution model: path-scoped change detection, explicit job timeouts, and advisory summaries in run output. - Risk: image build/runtime regressions. ### D) GitOps Deployability (advisory) - Workflow: `ci-gitops-deployability.yml` - Checks: kustomize render + kubeconform schema validation. +- Execution model: overlay-scoped detection, explicit job timeouts, and advisory plan/result summaries in run output. - Risk: Flux reconciliation failures from invalid manifests. ### E) Security Posture (advisory) - Workflows: `nightly-security-advisory.yml`, `ci-security-advisory.yml` - Checks: nightly Trivy filesystem scan plus PR-time advisory Trivy vulnerability/config scans with run summaries and artifacts. +- Execution model: path-scoped PR scans, explicit scan timeouts, and summary tables for scan scope/outcomes. - Risk: security drift in dependencies/configuration. ## Optimization Workstreams (Current) @@ -109,7 +114,6 @@ Goal: - No side branches. - No force-push on shared campaign work. - Daily sync: merge `main` into `codex/ci` (no rebase). -- Keep PR draft until optimization baseline is stable. ## Constraint Challenge Protocol If any hard constraint must be challenged, submit an `EXCEPTION REQUEST` with: From 600da03f0a059996f5d509081d486bc1d772a78b Mon Sep 17 00:00:00 2001 From: Dmitry Kondratyev Date: Wed, 4 Feb 2026 12:25:04 -0500 Subject: [PATCH 23/25] ci: fix actionlint shell warnings in workflow summaries --- .github/workflows/ci-gitops-deployability.yml | 18 +++++++++--------- .github/workflows/lint-docker.yml | 14 +++++++------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/.github/workflows/ci-gitops-deployability.yml b/.github/workflows/ci-gitops-deployability.yml index f37b5a53..9167c264 100644 --- a/.github/workflows/ci-gitops-deployability.yml +++ b/.github/workflows/ci-gitops-deployability.yml @@ -66,29 +66,29 @@ jobs: { echo '### GitOps Deployability Plan' echo - echo "- Full overlay run: \`$run_all\`" + echo "- Full overlay run: $run_all" echo echo '| Overlay | Decision |' echo '|---|---|' if [ "$run_all" = 'true' ] || [ "${{ steps.filter.outputs.core_production }}" = 'true' ]; then - echo '| `deploy/core-production` | run |' + echo '| deploy/core-production | run |' else - echo '| `deploy/core-production` | skipped |' + echo '| deploy/core-production | skipped |' fi if [ "$run_all" = 'true' ] || [ "${{ steps.filter.outputs.core_staging }}" = 'true' ]; then - echo '| `deploy/core-staging` | run |' + echo '| deploy/core-staging | run |' else - echo '| `deploy/core-staging` | skipped |' + echo '| deploy/core-staging | skipped |' fi if [ "$run_all" = 'true' ] || [ "${{ steps.filter.outputs.core_geddes2 }}" = 'true' ]; then - echo '| `deploy/core-geddes2` | run |' + echo '| deploy/core-geddes2 | run |' else - echo '| `deploy/core-geddes2` | skipped |' + echo '| deploy/core-geddes2 | skipped |' fi if [ "$run_all" = 'true' ] || [ "${{ steps.filter.outputs.experimental }}" = 'true' ]; then - echo '| `deploy/experimental` | run |' + echo '| deploy/experimental | run |' else - echo '| `deploy/experimental` | skipped |' + echo '| deploy/experimental | skipped |' fi echo echo '- Mode: advisory (gitops-validate uses continue-on-error).' diff --git a/.github/workflows/lint-docker.yml b/.github/workflows/lint-docker.yml index 37678f98..6764fefa 100644 --- a/.github/workflows/lint-docker.yml +++ b/.github/workflows/lint-docker.yml @@ -115,7 +115,7 @@ jobs: { echo '### Dockerfile Lint Advisory Summary' echo - echo "- Hadolint outcome: `${{ steps.hadolint.outcome }}`" + echo "- Hadolint outcome: ${{ steps.hadolint.outcome }}" echo '- Mode: advisory (job continue-on-error=true).' } >> "$GITHUB_STEP_SUMMARY" @@ -153,8 +153,8 @@ jobs: { echo '### af-pod-monitor Container Advisory Summary' echo - echo "- Build outcome: `${{ steps.build_image.outcome }}`" - echo "- Smoke test outcome: `${{ steps.smoke_test.outcome }}`" + echo "- Build outcome: ${{ steps.build_image.outcome }}" + echo "- Smoke test outcome: ${{ steps.smoke_test.outcome }}" echo '- Mode: advisory (job continue-on-error=true).' } >> "$GITHUB_STEP_SUMMARY" @@ -192,8 +192,8 @@ jobs: { echo '### interlink-slurm-plugin Container Advisory Summary' echo - echo "- Build outcome: `${{ steps.build_image.outcome }}`" - echo "- Smoke test outcome: `${{ steps.smoke_test.outcome }}`" + echo "- Build outcome: ${{ steps.build_image.outcome }}" + echo "- Smoke test outcome: ${{ steps.smoke_test.outcome }}" echo '- Mode: advisory (job continue-on-error=true).' } >> "$GITHUB_STEP_SUMMARY" @@ -231,7 +231,7 @@ jobs: { echo '### purdue-af Container Advisory Summary' echo - echo "- Build outcome: `${{ steps.build_image.outcome }}`" - echo "- Smoke test outcome: `${{ steps.smoke_test.outcome }}`" + echo "- Build outcome: ${{ steps.build_image.outcome }}" + echo "- Smoke test outcome: ${{ steps.smoke_test.outcome }}" echo '- Mode: advisory (job continue-on-error=true).' } >> "$GITHUB_STEP_SUMMARY" From 6999e2f15b8f457b6fa8750ebe84ccbecd7bcf70 Mon Sep 17 00:00:00 2001 From: Dmitry Kondratyev Date: Wed, 4 Feb 2026 13:26:38 -0500 Subject: [PATCH 24/25] ci: remove docker build timeouts and trim build context --- .codex/CI_PLAN.md | 3 ++- .dockerignore | 14 ++++++++++++++ .github/workflows/lint-docker.yml | 3 --- 3 files changed, 16 insertions(+), 4 deletions(-) create mode 100644 .dockerignore diff --git a/.codex/CI_PLAN.md b/.codex/CI_PLAN.md index 12f6c491..384ff43c 100644 --- a/.codex/CI_PLAN.md +++ b/.codex/CI_PLAN.md @@ -8,6 +8,7 @@ Deliver one draft PR from `codex/ci` to `main` with a stable advisory-first CI b - Delivery model: single PR `codex/ci -> main` - PR #21 is open against `main` (not draft). - Existing CI baseline is green on fast PR checks; container build jobs are the long pole. +- Root-context Docker builds are cache-enabled and use a repo-level `.dockerignore` to reduce context size. - Lint workflows are check-only; formatter autofix workflow can commit formatting-only fixes to PR branches. ## Success Criteria @@ -67,7 +68,7 @@ Approved exception: ### C) Container Reliability (advisory) - Workflow: `lint-docker.yml` - Checks: hadolint, targeted Docker Buildx jobs with GitHub Actions layer cache, smoke checks via `.github/scripts/container-smoke.sh`. -- Execution model: path-scoped change detection, explicit job timeouts, and advisory summaries in run output. +- Execution model: path-scoped change detection, no per-job timeout cap on Docker build jobs, root-context `.dockerignore` optimization, and advisory summaries in run output. - Risk: image build/runtime regressions. ### D) GitOps Deployability (advisory) diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 00000000..6f75458c --- /dev/null +++ b/.dockerignore @@ -0,0 +1,14 @@ +# Keep root-context Docker builds lean for CI. +# This file affects builds that use `context: .` in GitHub Actions. +** + +# Keep required sources for maintained root-context Dockerfiles. +!docker/ +!docker/interlink-slurm-plugin/ +!docker/interlink-slurm-plugin/** +!docker/purdue-af/ +!docker/purdue-af/** +!slurm/ +!slurm/slurm-24.05.1-1.el8.x86_64.rpm +!slurm/slurm-configs/ +!slurm/slurm-configs/** diff --git a/.github/workflows/lint-docker.yml b/.github/workflows/lint-docker.yml index 6764fefa..2489aee9 100644 --- a/.github/workflows/lint-docker.yml +++ b/.github/workflows/lint-docker.yml @@ -123,7 +123,6 @@ jobs: needs: detect-docker-changes if: needs.detect-docker-changes.outputs.af_pod_monitor == 'true' runs-on: ubuntu-latest - timeout-minutes: 35 continue-on-error: true steps: - uses: actions/checkout@v4 @@ -162,7 +161,6 @@ jobs: needs: detect-docker-changes if: needs.detect-docker-changes.outputs.interlink_slurm_plugin == 'true' runs-on: ubuntu-latest - timeout-minutes: 35 continue-on-error: true steps: - uses: actions/checkout@v4 @@ -201,7 +199,6 @@ jobs: needs: detect-docker-changes if: needs.detect-docker-changes.outputs.purdue_af == 'true' runs-on: ubuntu-latest - timeout-minutes: 35 continue-on-error: true steps: - uses: actions/checkout@v4 From 7f675f192ebd0c8c529c1d154d909e0e8b792f31 Mon Sep 17 00:00:00 2001 From: Dmitry Kondratyev Date: Wed, 4 Feb 2026 18:37:21 -0500 Subject: [PATCH 25/25] ci: bound docker builds and improve build visibility --- .codex/CI_PLAN.md | 2 +- .github/workflows/lint-docker.yml | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/.codex/CI_PLAN.md b/.codex/CI_PLAN.md index 384ff43c..7b82984b 100644 --- a/.codex/CI_PLAN.md +++ b/.codex/CI_PLAN.md @@ -68,7 +68,7 @@ Approved exception: ### C) Container Reliability (advisory) - Workflow: `lint-docker.yml` - Checks: hadolint, targeted Docker Buildx jobs with GitHub Actions layer cache, smoke checks via `.github/scripts/container-smoke.sh`. -- Execution model: path-scoped change detection, no per-job timeout cap on Docker build jobs, root-context `.dockerignore` optimization, and advisory summaries in run output. +- Execution model: path-scoped change detection, 120-minute per-job timeout cap for Docker build jobs, root-context `.dockerignore` optimization, BuildKit plain progress logging, and advisory summaries in run output. - Risk: image build/runtime regressions. ### D) GitOps Deployability (advisory) diff --git a/.github/workflows/lint-docker.yml b/.github/workflows/lint-docker.yml index 2489aee9..887e63db 100644 --- a/.github/workflows/lint-docker.yml +++ b/.github/workflows/lint-docker.yml @@ -123,7 +123,10 @@ jobs: needs: detect-docker-changes if: needs.detect-docker-changes.outputs.af_pod_monitor == 'true' runs-on: ubuntu-latest + timeout-minutes: 120 continue-on-error: true + env: + BUILDKIT_PROGRESS: plain steps: - uses: actions/checkout@v4 @@ -137,6 +140,7 @@ jobs: context: docker/af-pod-monitor file: docker/af-pod-monitor/Dockerfile load: true + pull: true tags: local/af-pod-monitor:${{ github.sha }} cache-from: type=gha,scope=af-pod-monitor cache-to: type=gha,mode=max,scope=af-pod-monitor,ignore-error=true @@ -161,7 +165,10 @@ jobs: needs: detect-docker-changes if: needs.detect-docker-changes.outputs.interlink_slurm_plugin == 'true' runs-on: ubuntu-latest + timeout-minutes: 120 continue-on-error: true + env: + BUILDKIT_PROGRESS: plain steps: - uses: actions/checkout@v4 @@ -175,6 +182,7 @@ jobs: context: . file: docker/interlink-slurm-plugin/Dockerfile.alma8 load: true + pull: true tags: local/interlink-slurm-plugin:${{ github.sha }} cache-from: type=gha,scope=interlink-slurm-plugin cache-to: type=gha,mode=max,scope=interlink-slurm-plugin,ignore-error=true @@ -199,7 +207,10 @@ jobs: needs: detect-docker-changes if: needs.detect-docker-changes.outputs.purdue_af == 'true' runs-on: ubuntu-latest + timeout-minutes: 120 continue-on-error: true + env: + BUILDKIT_PROGRESS: plain steps: - uses: actions/checkout@v4 @@ -213,6 +224,7 @@ jobs: context: . file: docker/purdue-af/Dockerfile load: true + pull: true tags: local/purdue-af:${{ github.sha }} cache-from: type=gha,scope=purdue-af cache-to: type=gha,mode=max,scope=purdue-af,ignore-error=true