diff --git a/.codex/CI_PLAN.md b/.codex/CI_PLAN.md new file mode 100644 index 00000000..7b82984b --- /dev/null +++ b/.codex/CI_PLAN.md @@ -0,0 +1,126 @@ +# CI/CD Campaign Plan (Current State) + +## Mission +Deliver one draft PR from `codex/ci` to `main` with a stable advisory-first CI baseline, then optimize test depth, integration realism, and security signal without broad refactors. + +## Current Status +- PR branch: `codex/ci` +- Delivery model: single PR `codex/ci -> main` +- PR #21 is open against `main` (not draft). +- Existing CI baseline is green on fast PR checks; container build jobs are the long pole. +- Root-context Docker builds are cache-enabled and use a repo-level `.dockerignore` to reduce context size. +- Lint workflows are check-only; formatter autofix workflow can commit formatting-only fixes to PR branches. + +## Success Criteria +- CI remains stable on `pull_request` runs for all configured workflows. +- Optimization phase adds meaningful unit and integration coverage for repo-owned code. +- Security checks include nightly advisory plus PR-time advisory signal. +- `README.md` keeps A-E category badges aligned with active workflows. +- `.codex/CI_PLAN.md` remains the single source of truth. + +## In-Scope / Out-of-Scope Paths +In scope: +- `.github/**` +- `apps/**` +- `deploy/**` +- `docker/**` (except exclusions) +- `README.md` +- `.codex/CI_PLAN.md` + +Out of scope: +- `docker/dask-gateway-server/**` +- `docs/**` +- `docs/source/demos/**` +- `docker/kaniko-build-jobs/**` +- `slurm/**` +- `.cursor/**` + +Approved exception: +- `slurm/**` is used as a dependency-only trigger in container reliability path filters because maintained Dockerfiles copy `slurm/` artifacts. +- CI auto-commit is enabled for formatter-only fixes in `ci-format-autofix.yml` to reduce lint iteration noise. 
+ +## Active Workflow Surface +- `.github/workflows/ci-workflow-integrity.yml` +- `.github/workflows/lint-python.yml` +- `.github/workflows/lint-shell.yml` +- `.github/workflows/lint-json.yml` +- `.github/workflows/lint-yaml.yml` +- `.github/workflows/ci-format-autofix.yml` +- `.github/workflows/ci-repo-quality.yml` +- `.github/workflows/ci-integration-scenarios.yml` +- `.github/workflows/lint-docker.yml` +- `.github/workflows/ci-gitops-deployability.yml` +- `.github/workflows/ci-security-advisory.yml` +- `.github/workflows/nightly-security-advisory.yml` + +## Check Architecture +### A) CI System Integrity (advisory) +- Workflow: `ci-workflow-integrity.yml` +- Checks: actionlint + workflow YAML parse. +- Risk: broken workflow definitions and silent CI drift. + +### B) Repo Quality and Tests (advisory) +- Workflows: `lint-python.yml`, `lint-shell.yml`, `lint-json.yml`, `lint-yaml.yml`, `ci-format-autofix.yml`, `ci-repo-quality.yml`, `ci-integration-scenarios.yml` +- Checks: black/isort check-only, py_compile, pytest unit advisory with coverage threshold, shellcheck/shfmt/bash -n, JSON/YAML parse, auto-format commits for changed Python/shell/JSON/YAML files, integration scenario matrix tests via mocked container/monitoring flows. +- Execution model: fast workflows are path-scoped with PR concurrency cancellation; formatter/lint tool versions are pinned for deterministic behavior. +- Risk: script/runtime regressions. + +### C) Container Reliability (advisory) +- Workflow: `lint-docker.yml` +- Checks: hadolint, targeted Docker Buildx jobs with GitHub Actions layer cache, smoke checks via `.github/scripts/container-smoke.sh`. +- Execution model: path-scoped change detection, 120-minute per-job timeout cap for Docker build jobs, root-context `.dockerignore` optimization, BuildKit plain progress logging, and advisory summaries in run output. +- Risk: image build/runtime regressions. 
+ +### D) GitOps Deployability (advisory) +- Workflow: `ci-gitops-deployability.yml` +- Checks: kustomize render + kubeconform schema validation. +- Execution model: overlay-scoped detection, explicit job timeouts, and advisory plan/result summaries in run output. +- Risk: Flux reconciliation failures from invalid manifests. + +### E) Security Posture (advisory) +- Workflows: `nightly-security-advisory.yml`, `ci-security-advisory.yml` +- Checks: nightly Trivy filesystem scan plus PR-time advisory Trivy vulnerability/config scans with run summaries and artifacts. +- Execution model: path-scoped PR scans, explicit scan timeouts, and summary tables for scan scope/outcomes. +- Risk: security drift in dependencies/configuration. + +## Optimization Workstreams (Current) +### Worker 1: Coverage Optimizer +File lane: +- `tests/unit/**` +- `tests/conftest.py` +- `.github/workflows/lint-python.yml` +- `.github/workflows/ci-repo-quality.yml` +Goal: +- Increase meaningful Python test coverage and publish coverage in CI (advisory threshold first). + +### Worker 2: Integration Scenarios +File lane: +- `tests/integration/**` +- `tests/fixtures/**` +- `.github/workflows/ci-integration-scenarios.yml` (new) +- `.github/scripts/integration/**` +Goal: +- Add realistic automated integration scenarios with deterministic mocks and PR advisory execution. + +### Worker 3: Security and Runtime Optimizer +File lane: +- `.github/workflows/nightly-security-advisory.yml` +- `.github/workflows/ci-security-advisory.yml` (new) +- `.github/workflows/lint-docker.yml` +- `.github/workflows/ci-gitops-deployability.yml` +Goal: +- Add PR-time advisory security checks and reduce CI runtime/noise safely. + +## Branch and Sync Rules +- No side branches. +- No force-push on shared campaign work. +- Daily sync: merge `main` into `codex/ci` (no rebase). 
+ +## Constraint Challenge Protocol +If any hard constraint must be challenged, submit an `EXCEPTION REQUEST` with: +1) challenged constraint, +2) concrete risk if unchanged, +3) minimal exception requested, +4) rollback path. + +No exception is implemented without explicit user approval. diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 00000000..6f75458c --- /dev/null +++ b/.dockerignore @@ -0,0 +1,14 @@ +# Keep root-context Docker builds lean for CI. +# This file affects builds that use `context: .` in GitHub Actions. +** + +# Keep required sources for maintained root-context Dockerfiles. +!docker/ +!docker/interlink-slurm-plugin/ +!docker/interlink-slurm-plugin/** +!docker/purdue-af/ +!docker/purdue-af/** +!slurm/ +!slurm/slurm-24.05.1-1.el8.x86_64.rpm +!slurm/slurm-configs/ +!slurm/slurm-configs/** diff --git a/.github/scripts/container-smoke.sh b/.github/scripts/container-smoke.sh new file mode 100755 index 00000000..6fca637d --- /dev/null +++ b/.github/scripts/container-smoke.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash +set -euo pipefail + +if [ "$#" -ne 2 ]; then + echo "Usage: $0 <image> <profile>" >&2 + exit 2 +fi + +image="$1" +profile="$2" + +docker image inspect "$image" >/dev/null + +case "$profile" in +af-pod-monitor) + docker run --rm --entrypoint python "$image" -c "import prometheus_client" + ;; +interlink-slurm-plugin) + docker run --rm --entrypoint /bin/sh "$image" -lc 'test -x /sidecar/slurm-sidecar' + ;; +purdue-af) + docker run --rm --entrypoint /bin/bash "$image" -lc 'python --version && jupyter --version >/dev/null' + ;; +*) + echo "Unknown profile: $profile" >&2 + exit 2 + ;; +esac + +echo "Smoke checks passed for profile: $profile" diff --git a/.github/scripts/integration/mock-docker-cli.sh b/.github/scripts/integration/mock-docker-cli.sh new file mode 100755 index 00000000..c20a19aa --- /dev/null +++ b/.github/scripts/integration/mock-docker-cli.sh @@ -0,0 +1,45 @@ +#!/usr/bin/env bash +set -euo pipefail + +if [ -n "${MOCK_DOCKER_LOG:-}" 
]; then + printf '%s\n' "$*" >>"$MOCK_DOCKER_LOG" +fi + +cmd="${1:-}" +shift || true + +case "$cmd" in +image) + subcmd="${1:-}" + shift || true + if [ "$subcmd" != "inspect" ]; then + echo "mock docker unsupported image subcommand: $subcmd" >&2 + exit 64 + fi + + if [ -n "${MOCK_DOCKER_INSPECT_STDOUT:-}" ]; then + printf '%s\n' "$MOCK_DOCKER_INSPECT_STDOUT" + fi + if [ -n "${MOCK_DOCKER_INSPECT_STDERR:-}" ]; then + printf '%s\n' "$MOCK_DOCKER_INSPECT_STDERR" >&2 + fi + + exit "${MOCK_DOCKER_INSPECT_EXIT:-0}" + ;; + +run) + if [ -n "${MOCK_DOCKER_RUN_STDOUT:-}" ]; then + printf '%s\n' "$MOCK_DOCKER_RUN_STDOUT" + fi + if [ -n "${MOCK_DOCKER_RUN_STDERR:-}" ]; then + printf '%s\n' "$MOCK_DOCKER_RUN_STDERR" >&2 + fi + + exit "${MOCK_DOCKER_RUN_EXIT:-0}" + ;; + +*) + echo "mock docker unsupported command: $cmd" >&2 + exit 64 + ;; +esac diff --git a/.github/scripts/integration/run-integration-scenarios.sh b/.github/scripts/integration/run-integration-scenarios.sh new file mode 100755 index 00000000..20190685 --- /dev/null +++ b/.github/scripts/integration/run-integration-scenarios.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +set -euo pipefail + +repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." 
&& pwd)" + +cd "$repo_root" +python3 -m unittest discover -s tests/integration -p 'test_*.py' -v diff --git a/.github/workflows/ci-format-autofix.yml b/.github/workflows/ci-format-autofix.yml new file mode 100644 index 00000000..4ae301aa --- /dev/null +++ b/.github/workflows/ci-format-autofix.yml @@ -0,0 +1,211 @@ +name: CI Format Autofix + +on: + pull_request: + types: [opened, synchronize, reopened] + paths: + - '**/*.py' + - '**/*.sh' + - '**/*.json' + - '**/*.yml' + - '**/*.yaml' + - '**/pixi-wrapper' + - '**/fix-permissions' + - '.github/workflows/ci-format-autofix.yml' + +concurrency: + group: ci-format-autofix-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +permissions: + contents: write + +jobs: + autofix-format: + if: github.event.pull_request.head.repo.full_name == github.repository + runs-on: ubuntu-latest + timeout-minutes: 15 + env: + BLACK_VERSION: '24.10.0' + ISORT_VERSION: '5.13.2' + SHFMT_VERSION: '3.10.0' + PRETTIER_VERSION: '3.3.3' + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ github.head_ref }} + fetch-depth: 0 + + - name: Detect changed Python files + id: py_changes + uses: tj-actions/changed-files@v45 + with: + separator: "\n" + files: | + **/*.py + files_ignore: | + docker/dask-gateway-server/** + docs/** + docs/source/demos/** + docker/kaniko-build-jobs/** + slurm/** + .cursor/** + .git/** + + - name: Detect changed shell files + id: sh_changes + uses: tj-actions/changed-files@v45 + with: + separator: "\n" + files: | + **/*.sh + **/pixi-wrapper + **/fix-permissions + files_ignore: | + docker/dask-gateway-server/** + docs/** + docs/source/demos/** + docker/kaniko-build-jobs/** + slurm/** + .cursor/** + .git/** + + - name: Detect changed JSON/YAML files + id: data_changes + uses: tj-actions/changed-files@v45 + with: + separator: "\n" + files: | + **/*.json + **/*.yml + **/*.yaml + files_ignore: | + docker/dask-gateway-server/** + docs/** + docs/source/demos/** + docker/kaniko-build-jobs/** + 
slurm/** + .cursor/** + .git/** + .github/workflows/** + + - name: Set up Python + if: steps.py_changes.outputs.any_changed == 'true' + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Auto-format Python files + if: steps.py_changes.outputs.any_changed == 'true' + shell: bash + run: | + set -euo pipefail + python -m pip install --upgrade pip + pip install "black==${BLACK_VERSION}" "isort==${ISORT_VERSION}" + + mapfile -t py_files <<'EOF' + ${{ steps.py_changes.outputs.all_changed_files }} + EOF + + files=() + for f in "${py_files[@]}"; do + [ -f "$f" ] && files+=("$f") + done + + if [ "${#files[@]}" -gt 0 ]; then + black "${files[@]}" + isort --profile black "${files[@]}" + fi + + - name: Tool versions (Python formatters) + if: steps.py_changes.outputs.any_changed == 'true' + shell: bash + run: | + set -euo pipefail + black --version + isort --version-number + + - name: Install shell formatter + if: steps.sh_changes.outputs.any_changed == 'true' + shell: bash + run: | + set -euo pipefail + curl -fsSL "https://github.com/mvdan/sh/releases/download/v${SHFMT_VERSION}/shfmt_v${SHFMT_VERSION}_linux_amd64" -o /tmp/shfmt + chmod +x /tmp/shfmt + sudo mv /tmp/shfmt /usr/local/bin/shfmt + + - name: Tool versions (Shell formatter) + if: steps.sh_changes.outputs.any_changed == 'true' + shell: bash + run: | + set -euo pipefail + shfmt --version + + - name: Auto-format shell files + if: steps.sh_changes.outputs.any_changed == 'true' + shell: bash + run: | + set -euo pipefail + + mapfile -t sh_files <<'EOF' + ${{ steps.sh_changes.outputs.all_changed_files }} + EOF + + files=() + for f in "${sh_files[@]}"; do + [ -f "$f" ] && files+=("$f") + done + + if [ "${#files[@]}" -gt 0 ]; then + shfmt -w "${files[@]}" + fi + + - name: Set up Node.js + if: steps.data_changes.outputs.any_changed == 'true' + uses: actions/setup-node@v4 + with: + node-version: '20' + + - name: Auto-format JSON/YAML files + if: steps.data_changes.outputs.any_changed == 'true' + shell: 
bash + run: | + set -euo pipefail + npm install --global "prettier@${PRETTIER_VERSION}" + + mapfile -t data_files <<'EOF' + ${{ steps.data_changes.outputs.all_changed_files }} + EOF + + files=() + for f in "${data_files[@]}"; do + [ -f "$f" ] && files+=("$f") + done + + if [ "${#files[@]}" -gt 0 ]; then + prettier --write "${files[@]}" + fi + + - name: Tool versions (Data formatter) + if: steps.data_changes.outputs.any_changed == 'true' + shell: bash + run: | + set -euo pipefail + prettier --version + + - name: Commit and push formatting fixes + env: + PR_HEAD_REF: ${{ github.head_ref }} + shell: bash + run: | + set -euo pipefail + if git diff --quiet; then + echo "No formatter changes to commit." + exit 0 + fi + + git config user.name "github-actions[bot]" + git config user.email "41898282+github-actions[bot]@users.noreply.github.com" + git add -A + git commit -m "ci: auto-format fixable lint issues" + git push origin "HEAD:${PR_HEAD_REF}" diff --git a/.github/workflows/ci-gitops-deployability.yml b/.github/workflows/ci-gitops-deployability.yml new file mode 100644 index 00000000..9167c264 --- /dev/null +++ b/.github/workflows/ci-gitops-deployability.yml @@ -0,0 +1,200 @@ +name: CI GitOps Deployability + +on: + pull_request: + paths: + - 'deploy/**' + - '.github/workflows/ci-gitops-deployability.yml' + +concurrency: + group: ci-gitops-deployability-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +permissions: + contents: read + +jobs: + detect-gitops-changes: + runs-on: ubuntu-latest + timeout-minutes: 5 + outputs: + run_all: ${{ steps.scope.outputs.run_all }} + core_production: ${{ steps.filter.outputs.core_production }} + core_staging: ${{ steps.filter.outputs.core_staging }} + core_geddes2: ${{ steps.filter.outputs.core_geddes2 }} + experimental: ${{ steps.filter.outputs.experimental }} + steps: + - uses: actions/checkout@v4 + + - uses: dorny/paths-filter@v3 + id: filter + with: + filters: | + core_production: + - 
'deploy/core-production/**' + core_staging: + - 'deploy/core-staging/**' + core_geddes2: + - 'deploy/core-geddes2/**' + experimental: + - 'deploy/experimental/**' + deploy_shared: + - 'deploy/**' + - '!deploy/core-production/**' + - '!deploy/core-staging/**' + - '!deploy/core-geddes2/**' + - '!deploy/experimental/**' + workflow: + - '.github/workflows/ci-gitops-deployability.yml' + + - id: scope + shell: bash + run: | + set -euo pipefail + if [ "${{ steps.filter.outputs.deploy_shared }}" = 'true' ] || [ "${{ steps.filter.outputs.workflow }}" = 'true' ]; then + echo "run_all=true" >> "$GITHUB_OUTPUT" + else + echo "run_all=false" >> "$GITHUB_OUTPUT" + fi + + - name: Publish GitOps validation plan + if: always() + shell: bash + run: | + set -euo pipefail + run_all="${{ steps.scope.outputs.run_all }}" + { + echo '### GitOps Deployability Plan' + echo + echo "- Full overlay run: $run_all" + echo + echo '| Overlay | Decision |' + echo '|---|---|' + if [ "$run_all" = 'true' ] || [ "${{ steps.filter.outputs.core_production }}" = 'true' ]; then + echo '| deploy/core-production | run |' + else + echo '| deploy/core-production | skipped |' + fi + if [ "$run_all" = 'true' ] || [ "${{ steps.filter.outputs.core_staging }}" = 'true' ]; then + echo '| deploy/core-staging | run |' + else + echo '| deploy/core-staging | skipped |' + fi + if [ "$run_all" = 'true' ] || [ "${{ steps.filter.outputs.core_geddes2 }}" = 'true' ]; then + echo '| deploy/core-geddes2 | run |' + else + echo '| deploy/core-geddes2 | skipped |' + fi + if [ "$run_all" = 'true' ] || [ "${{ steps.filter.outputs.experimental }}" = 'true' ]; then + echo '| deploy/experimental | run |' + else + echo '| deploy/experimental | skipped |' + fi + echo + echo '- Mode: advisory (gitops-validate uses continue-on-error).' 
+ } >> "$GITHUB_STEP_SUMMARY" + + gitops-validate: + needs: detect-gitops-changes + if: needs.detect-gitops-changes.outputs.run_all == 'true' || needs.detect-gitops-changes.outputs.core_production == 'true' || needs.detect-gitops-changes.outputs.core_staging == 'true' || needs.detect-gitops-changes.outputs.core_geddes2 == 'true' || needs.detect-gitops-changes.outputs.experimental == 'true' + runs-on: ubuntu-latest + timeout-minutes: 25 + continue-on-error: true + steps: + - uses: actions/checkout@v4 + + - name: Set up kustomize + uses: imranismail/setup-kustomize@v2 + with: + kustomize-version: '5.4.2' + + - name: Install kubeconform + run: | + set -euo pipefail + KUBECONFORM_VERSION=0.6.7 + curl -fsSL "https://github.com/yannh/kubeconform/releases/download/v${KUBECONFORM_VERSION}/kubeconform-linux-amd64.tar.gz" -o /tmp/kubeconform.tar.gz + tar -xzf /tmp/kubeconform.tar.gz -C /tmp kubeconform + chmod +x /tmp/kubeconform + sudo mv /tmp/kubeconform /usr/local/bin/kubeconform + + - name: Render and validate selected overlays (advisory) + shell: bash + env: + RUN_ALL: ${{ needs.detect-gitops-changes.outputs.run_all }} + CORE_PRODUCTION: ${{ needs.detect-gitops-changes.outputs.core_production }} + CORE_STAGING: ${{ needs.detect-gitops-changes.outputs.core_staging }} + CORE_GEDDES2: ${{ needs.detect-gitops-changes.outputs.core_geddes2 }} + EXPERIMENTAL: ${{ needs.detect-gitops-changes.outputs.experimental }} + run: | + set -euo pipefail + + overlays=() + if [ "$RUN_ALL" = 'true' ]; then + overlays=( + deploy/core-production + deploy/core-staging + deploy/core-geddes2 + deploy/experimental + ) + else + [ "$CORE_PRODUCTION" = 'true' ] && overlays+=(deploy/core-production) + [ "$CORE_STAGING" = 'true' ] && overlays+=(deploy/core-staging) + [ "$CORE_GEDDES2" = 'true' ] && overlays+=(deploy/core-geddes2) + [ "$EXPERIMENTAL" = 'true' ] && overlays+=(deploy/experimental) + fi + + if [ "${#overlays[@]}" -eq 0 ]; then + echo 'No in-scope overlay changes detected; skipping 
render/validation.' + { + echo '### GitOps Deployability Summary' + echo + echo '- No in-scope overlay changes detected.' + echo '- Mode: advisory (job continue-on-error=true).' + } >> "$GITHUB_STEP_SUMMARY" + exit 0 + fi + + { + echo '### GitOps Deployability Summary' + echo + echo '| Overlay | Render | Kubeconform |' + echo '|---|---|---|' + } >> "$GITHUB_STEP_SUMMARY" + + status=0 + for overlay in "${overlays[@]}"; do + rendered="/tmp/$(echo "$overlay" | tr '/' '_').yaml" + render_status='ok' + kubeconform_status='ok' + + echo "Rendering $overlay -> $rendered" + if ! kustomize build --load-restrictor LoadRestrictionsNone "$overlay" > "$rendered"; then + render_status='failed' + kubeconform_status='skipped' + status=1 + fi + + if [ "$render_status" = 'ok' ]; then + echo "Validating $rendered" + if ! kubeconform -summary -strict -ignore-missing-schemas -skip Secret "$rendered"; then + kubeconform_status='failed' + status=1 + fi + fi + + echo "| \`$overlay\` | $render_status | $kubeconform_status |" >> "$GITHUB_STEP_SUMMARY" + done + + if [ "$status" -eq 0 ]; then + overall_result='pass' + else + overall_result='issues-detected' + fi + + { + echo + echo "- Overall result: **$overall_result**" + echo '- Mode: advisory (job continue-on-error=true).' 
+ } >> "$GITHUB_STEP_SUMMARY" + + exit "$status" diff --git a/.github/workflows/ci-integration-scenarios.yml b/.github/workflows/ci-integration-scenarios.yml new file mode 100644 index 00000000..2908b38d --- /dev/null +++ b/.github/workflows/ci-integration-scenarios.yml @@ -0,0 +1,31 @@ +name: CI Integration Scenarios + +on: + pull_request: + paths: + - tests/integration/** + - tests/fixtures/** + - .github/scripts/integration/** + - .github/scripts/container-smoke.sh + - apps/monitoring/af-monitoring/metrics_server.py + - apps/monitoring/af-monitoring/node_healthcheck.py + - .github/workflows/ci-integration-scenarios.yml + +permissions: + contents: read + +jobs: + integration-scenarios: + runs-on: ubuntu-latest + continue-on-error: true + timeout-minutes: 10 + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Run integration scenarios (advisory) + run: bash .github/scripts/integration/run-integration-scenarios.sh diff --git a/.github/workflows/ci-repo-quality.yml b/.github/workflows/ci-repo-quality.yml new file mode 100644 index 00000000..3189f564 --- /dev/null +++ b/.github/workflows/ci-repo-quality.yml @@ -0,0 +1,65 @@ +name: CI Repo Quality + +on: + pull_request: + paths: + - 'tests/unit/**' + - 'tests/conftest.py' + - 'apps/monitoring/af-monitoring/metrics_server.py' + - 'apps/monitoring/af-monitoring/node_healthcheck.py' + - 'docker/af-pod-monitor/pod-metrics-exporter.py' + - 'docker/purdue-af/jupyter/docker_healthcheck.py' + - '.github/workflows/ci-repo-quality.yml' + +concurrency: + group: ci-repo-quality-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +permissions: + contents: read + +jobs: + repo-quality: + runs-on: ubuntu-latest + continue-on-error: true + env: + PYTEST_VERSION: '8.4.0' + COVERAGE_VERSION: '7.6.1' + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + 
python-version: '3.11' + + - name: Install quality tooling + run: | + set -euo pipefail + python -m pip install --upgrade pip + pip install "pytest==${PYTEST_VERSION}" "coverage==${COVERAGE_VERSION}" + + - name: Tool versions + run: | + set -euo pipefail + pytest --version + python -m coverage --version + + - name: Run pytest (advisory) + shell: bash + run: | + set +e + python -m coverage run -m pytest -q tests/unit + rc=$? + set -e + if [ "$rc" -eq 5 ]; then + echo 'pytest collected no tests; treating as informational.' + exit 0 + fi + if [ "$rc" -ne 0 ]; then + exit "$rc" + fi + python -m coverage report \ + --show-missing \ + --fail-under=70 \ + --include="apps/monitoring/af-monitoring/metrics_server.py,apps/monitoring/af-monitoring/node_healthcheck.py,docker/af-pod-monitor/pod-metrics-exporter.py,docker/purdue-af/jupyter/docker_healthcheck.py" diff --git a/.github/workflows/ci-security-advisory.yml b/.github/workflows/ci-security-advisory.yml new file mode 100644 index 00000000..2aa7d146 --- /dev/null +++ b/.github/workflows/ci-security-advisory.yml @@ -0,0 +1,238 @@ +name: CI Security Advisory + +on: + pull_request: + paths: + - 'deploy/**' + - 'docker/**' + - '.github/workflows/**' + - '**/requirements*.txt' + - '**/pyproject.toml' + - '**/poetry.lock' + - '**/Pipfile' + - '**/Pipfile.lock' + - '**/go.mod' + - '**/go.sum' + workflow_dispatch: + +concurrency: + group: ci-security-advisory-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +permissions: + contents: read + +jobs: + detect-security-scope: + runs-on: ubuntu-latest + timeout-minutes: 5 + outputs: + vuln_surface: ${{ steps.filter.outputs.vuln_surface }} + config_surface: ${{ steps.filter.outputs.config_surface }} + steps: + - uses: actions/checkout@v4 + + - uses: dorny/paths-filter@v3 + id: filter + with: + filters: | + vuln_surface: + - 'docker/**' + - '**/requirements*.txt' + - '**/pyproject.toml' + - '**/poetry.lock' + - '**/Pipfile' + - '**/Pipfile.lock' + - 
'**/go.mod' + - '**/go.sum' + config_surface: + - 'deploy/**' + - 'docker/**' + - '.github/workflows/**' + + - name: Publish security scan plan + if: always() + shell: bash + run: | + set -euo pipefail + { + echo '### Security Advisory Scan Plan' + echo + echo '| Scan | Decision |' + echo '|---|---|' + if [ "${{ steps.filter.outputs.vuln_surface }}" = 'true' ]; then + echo '| Filesystem vulnerability scan | run |' + else + echo '| Filesystem vulnerability scan | skipped |' + fi + if [ "${{ steps.filter.outputs.config_surface }}" = 'true' ]; then + echo '| Configuration misconfiguration scan | run |' + else + echo '| Configuration misconfiguration scan | skipped |' + fi + echo + echo '- Workflow mode: advisory (scan job uses continue-on-error).' + } >> "$GITHUB_STEP_SUMMARY" + + trivy-security-advisory: + needs: detect-security-scope + if: needs.detect-security-scope.outputs.vuln_surface == 'true' || needs.detect-security-scope.outputs.config_surface == 'true' + runs-on: ubuntu-latest + timeout-minutes: 30 + continue-on-error: true + steps: + - uses: actions/checkout@v4 + + - name: Run Trivy filesystem vulnerability scan (advisory) + id: fs_scan + if: needs.detect-security-scope.outputs.vuln_surface == 'true' + continue-on-error: true + uses: aquasecurity/trivy-action@0.33.1 + with: + scan-type: fs + scan-ref: . + scanners: vuln + severity: HIGH,CRITICAL + ignore-unfixed: true + exit-code: '1' + format: json + output: trivy-pr-fs.json + skip-dirs: docker/dask-gateway-server,docs,docs/source/demos,docker/kaniko-build-jobs,slurm,.cursor + + - name: Run Trivy configuration scan (advisory) + id: config_scan + if: needs.detect-security-scope.outputs.config_surface == 'true' + continue-on-error: true + uses: aquasecurity/trivy-action@0.33.1 + with: + scan-type: config + scan-ref: . 
+ severity: HIGH,CRITICAL + exit-code: '1' + format: json + output: trivy-pr-config.json + skip-dirs: docker/dask-gateway-server,docs,docs/source/demos,docker/kaniko-build-jobs,slurm,.cursor + + - name: Publish PR Trivy summary (advisory) + if: always() + shell: bash + env: + VULN_SURFACE: ${{ needs.detect-security-scope.outputs.vuln_surface }} + CONFIG_SURFACE: ${{ needs.detect-security-scope.outputs.config_surface }} + FS_SCAN_OUTCOME: ${{ steps.fs_scan.outcome || 'skipped' }} + CONFIG_SCAN_OUTCOME: ${{ steps.config_scan.outcome || 'skipped' }} + run: | + set -euo pipefail + python3 - <<'PY' + import json + import os + from collections import Counter + from pathlib import Path + + summary_path = Path(os.environ['GITHUB_STEP_SUMMARY']) + reports = ( + ( + 'Filesystem vulnerability scan', + Path('trivy-pr-fs.json'), + 'Vulnerabilities', + os.environ.get('VULN_SURFACE', 'false') == 'true', + os.environ.get('FS_SCAN_OUTCOME', 'skipped'), + ), + ( + 'Configuration misconfiguration scan', + Path('trivy-pr-config.json'), + 'Misconfigurations', + os.environ.get('CONFIG_SURFACE', 'false') == 'true', + os.environ.get('CONFIG_SCAN_OUTCOME', 'skipped'), + ), + ) + + total_high_critical = 0 + missing_reports = 0 + + with summary_path.open('a', encoding='utf-8') as summary: + summary.write('### PR Trivy Advisory Summary\n\n') + summary.write('| Scan | Scope | Step outcome | Report |\n') + summary.write('|---|---|---|---|\n') + + for label, report_path, _, in_scope, outcome in reports: + scope_status = 'run' if in_scope else 'skipped' + report_status = 'present' if report_path.exists() else 'missing' + summary.write(f'| {label} | {scope_status} | `{outcome}` | {report_status} |\n') + if in_scope and not report_path.exists(): + missing_reports += 1 + + for label, report_path, finding_key, in_scope, _ in reports: + if not in_scope: + summary.write(f'\n#### {label}\n\nSkipped by path scope.\n') + continue + + if not report_path.exists(): + summary.write(f'\n#### {label}\n\nReport 
missing (scan did not produce expected output).\n') + continue + + payload = json.loads(report_path.read_text(encoding='utf-8')) + results = payload.get('Results', []) if isinstance(payload, dict) else payload + + severity_counts = Counter() + target_counts = Counter() + + for result in results: + target = result.get('Target', 'unknown-target') + for finding in result.get(finding_key) or []: + severity = (finding.get('Severity') or 'UNKNOWN').upper() + severity_counts[severity] += 1 + target_counts[target] += 1 + + high_critical = severity_counts.get('HIGH', 0) + severity_counts.get('CRITICAL', 0) + total_high_critical += high_critical + + summary.write(f'\n#### {label}\n\n') + summary.write(f'- HIGH/CRITICAL findings: **{high_critical}**\n') + summary.write(f'- Targets with findings: **{len(target_counts)}**\n\n') + + if high_critical == 0: + summary.write('No HIGH/CRITICAL findings detected.\n') + continue + + summary.write('| Severity | Count |\n') + summary.write('|---|---:|\n') + for severity in ('CRITICAL', 'HIGH', 'MEDIUM', 'LOW', 'UNKNOWN'): + count = severity_counts.get(severity, 0) + if count: + summary.write(f'| {severity} | {count} |\n') + + summary.write('\n| Top targets | Findings |\n') + summary.write('|---|---:|\n') + for target, count in target_counts.most_common(10): + summary.write(f'| `{target}` | {count} |\n') + + if missing_reports > 0: + overall_result = 'report-missing' + elif total_high_critical > 0: + overall_result = 'findings-detected' + else: + overall_result = 'clear' + + summary.write('\n') + summary.write(f'- Overall result: **{overall_result}**\n') + summary.write('- Mode: advisory (job continue-on-error=true).\n') + + if missing_reports > 0: + print(f'::warning::PR Trivy missing report files: {missing_reports}. See summary and artifacts.') + elif total_high_critical > 0: + print(f'::warning::PR Trivy found {total_high_critical} HIGH/CRITICAL findings. 
See summary and artifacts.') + else: + print('::notice::PR Trivy found no HIGH/CRITICAL findings in scope.') + PY + + - name: Upload PR Trivy artifacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: trivy-pr-security-${{ github.run_id }} + path: | + trivy-pr-fs.json + trivy-pr-config.json + if-no-files-found: ignore + retention-days: 7 diff --git a/.github/workflows/ci-workflow-integrity.yml b/.github/workflows/ci-workflow-integrity.yml new file mode 100644 index 00000000..c3b30c66 --- /dev/null +++ b/.github/workflows/ci-workflow-integrity.yml @@ -0,0 +1,49 @@ +name: CI Workflow Integrity + +on: + pull_request: + paths: + - '.github/workflows/**' + +concurrency: + group: ci-workflow-integrity-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +permissions: + contents: read + +jobs: + workflow-integrity: + runs-on: ubuntu-latest + continue-on-error: true + steps: + - uses: actions/checkout@v4 + + - name: Install actionlint + run: | + set -euo pipefail + mkdir -p "$HOME/.local/bin" + bash <(curl -sSfL https://raw.githubusercontent.com/rhysd/actionlint/main/scripts/download-actionlint.bash) 1.7.4 "$HOME/.local/bin" + echo "$HOME/.local/bin" >> "$GITHUB_PATH" + + - name: Run actionlint (advisory) + run: actionlint -color + + - name: Validate workflow YAML parse (advisory) + run: | + set -euo pipefail + python3 -m pip install --upgrade pip + python3 -m pip install pyyaml + python3 - <<'PY' + from pathlib import Path + import yaml + + workflows = sorted(Path('.github/workflows').glob('*.y*ml')) + if not workflows: + raise SystemExit('No workflow files found in .github/workflows') + + for wf in workflows: + with wf.open('r', encoding='utf-8') as f: + list(yaml.safe_load_all(f)) + print(f'Parsed {len(workflows)} workflow file(s).') + PY diff --git a/.github/workflows/lint-docker.yml b/.github/workflows/lint-docker.yml index f0733fdc..887e63db 100644 --- a/.github/workflows/lint-docker.yml +++ 
b/.github/workflows/lint-docker.yml @@ -1,22 +1,246 @@ -name: Lint Dockerfiles +name: Container Reliability on: - push: - paths: - - '**/Dockerfile*' pull_request: paths: - - '**/Dockerfile*' + - 'docker/af-pod-monitor/**' + - 'docker/interlink-slurm-plugin/**' + - 'docker/purdue-af/**' + - 'slurm/**' + - '.github/scripts/container-smoke.sh' + - '.github/workflows/lint-docker.yml' + +concurrency: + group: lint-docker-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +permissions: + contents: read jobs: - lint: + detect-docker-changes: runs-on: ubuntu-latest + timeout-minutes: 5 + outputs: + dockerfiles: ${{ steps.filter.outputs.dockerfiles }} + af_pod_monitor: ${{ steps.filter.outputs.af_pod_monitor }} + interlink_slurm_plugin: ${{ steps.filter.outputs.interlink_slurm_plugin }} + purdue_af: ${{ steps.filter.outputs.purdue_af }} steps: - uses: actions/checkout@v4 + + - uses: dorny/paths-filter@v3 + id: filter + with: + filters: | + dockerfiles: + - 'docker/af-pod-monitor/Dockerfile' + - 'docker/interlink-slurm-plugin/Dockerfile.alma8' + - 'docker/purdue-af/Dockerfile' + - '.github/workflows/lint-docker.yml' + af_pod_monitor: + - 'docker/af-pod-monitor/**' + - '.github/scripts/container-smoke.sh' + interlink_slurm_plugin: + - 'docker/interlink-slurm-plugin/**' + - 'slurm/**' + - '.github/scripts/container-smoke.sh' + purdue_af: + - 'docker/purdue-af/**' + - 'slurm/**' + - '.github/scripts/container-smoke.sh' + + - name: Publish container reliability plan + if: always() + shell: bash + run: | + set -euo pipefail + { + echo '### Container Reliability Plan' + echo + echo '| Check | Decision |' + echo '|---|---|' + if [ "${{ steps.filter.outputs.dockerfiles }}" = 'true' ]; then + echo '| Dockerfile lint | run |' + else + echo '| Dockerfile lint | skipped |' + fi + if [ "${{ steps.filter.outputs.af_pod_monitor }}" = 'true' ]; then + echo '| af-pod-monitor build/smoke | run |' + else + echo '| af-pod-monitor build/smoke | skipped |' + fi + 
if [ "${{ steps.filter.outputs.interlink_slurm_plugin }}" = 'true' ]; then + echo '| interlink-slurm-plugin build/smoke | run |' + else + echo '| interlink-slurm-plugin build/smoke | skipped |' + fi + if [ "${{ steps.filter.outputs.purdue_af }}" = 'true' ]; then + echo '| purdue-af build/smoke | run |' + else + echo '| purdue-af build/smoke | skipped |' + fi + echo + echo '- Workflow mode: advisory (all jobs use continue-on-error).' + } >> "$GITHUB_STEP_SUMMARY" + + lint-dockerfiles: + needs: detect-docker-changes + if: needs.detect-docker-changes.outputs.dockerfiles == 'true' + runs-on: ubuntu-latest + timeout-minutes: 12 + continue-on-error: true + steps: + - uses: actions/checkout@v4 + - name: Install hadolint run: | - sudo wget -O /bin/hadolint https://github.com/hadolint/hadolint/releases/latest/download/hadolint-$(uname -s)-$(uname -m) - sudo chmod +x /bin/hadolint - - name: Run hadolint + set -euo pipefail + HADOLINT_VERSION=v2.12.0 + curl -fsSL "https://github.com/hadolint/hadolint/releases/download/${HADOLINT_VERSION}/hadolint-Linux-x86_64" -o /tmp/hadolint + chmod +x /tmp/hadolint + sudo mv /tmp/hadolint /usr/local/bin/hadolint + + - name: Run hadolint (check-only, advisory) + id: hadolint + run: | + set -euo pipefail + hadolint --ignore DL3041 --ignore DL3033 --failure-threshold warning docker/af-pod-monitor/Dockerfile + hadolint --ignore DL3041 --ignore DL3033 --failure-threshold warning docker/interlink-slurm-plugin/Dockerfile.alma8 + hadolint --ignore DL3041 --ignore DL3033 --failure-threshold warning docker/purdue-af/Dockerfile + + - name: Publish Dockerfile lint advisory summary + if: always() + run: | + { + echo '### Dockerfile Lint Advisory Summary' + echo + echo "- Hadolint outcome: ${{ steps.hadolint.outcome }}" + echo '- Mode: advisory (job continue-on-error=true).' 
+ } >> "$GITHUB_STEP_SUMMARY" + + build-af-pod-monitor: + needs: detect-docker-changes + if: needs.detect-docker-changes.outputs.af_pod_monitor == 'true' + runs-on: ubuntu-latest + timeout-minutes: 120 + continue-on-error: true + env: + BUILDKIT_PROGRESS: plain + steps: + - uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Build af-pod-monitor image with cache (advisory) + id: build_image + uses: docker/build-push-action@v6 + with: + context: docker/af-pod-monitor + file: docker/af-pod-monitor/Dockerfile + load: true + pull: true + tags: local/af-pod-monitor:${{ github.sha }} + cache-from: type=gha,scope=af-pod-monitor + cache-to: type=gha,mode=max,scope=af-pod-monitor,ignore-error=true + provenance: false + + - name: Smoke test af-pod-monitor image (advisory) + id: smoke_test + run: .github/scripts/container-smoke.sh local/af-pod-monitor:${{ github.sha }} af-pod-monitor + + - name: Publish af-pod-monitor advisory summary + if: always() + run: | + { + echo '### af-pod-monitor Container Advisory Summary' + echo + echo "- Build outcome: ${{ steps.build_image.outcome }}" + echo "- Smoke test outcome: ${{ steps.smoke_test.outcome }}" + echo '- Mode: advisory (job continue-on-error=true).' + } >> "$GITHUB_STEP_SUMMARY" + + build-interlink-slurm-plugin: + needs: detect-docker-changes + if: needs.detect-docker-changes.outputs.interlink_slurm_plugin == 'true' + runs-on: ubuntu-latest + timeout-minutes: 120 + continue-on-error: true + env: + BUILDKIT_PROGRESS: plain + steps: + - uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Build interlink-slurm-plugin image with cache (advisory) + id: build_image + uses: docker/build-push-action@v6 + with: + context: . 
+ file: docker/interlink-slurm-plugin/Dockerfile.alma8 + load: true + pull: true + tags: local/interlink-slurm-plugin:${{ github.sha }} + cache-from: type=gha,scope=interlink-slurm-plugin + cache-to: type=gha,mode=max,scope=interlink-slurm-plugin,ignore-error=true + provenance: false + + - name: Smoke test interlink-slurm-plugin image (advisory) + id: smoke_test + run: .github/scripts/container-smoke.sh local/interlink-slurm-plugin:${{ github.sha }} interlink-slurm-plugin + + - name: Publish interlink-slurm-plugin advisory summary + if: always() + run: | + { + echo '### interlink-slurm-plugin Container Advisory Summary' + echo + echo "- Build outcome: ${{ steps.build_image.outcome }}" + echo "- Smoke test outcome: ${{ steps.smoke_test.outcome }}" + echo '- Mode: advisory (job continue-on-error=true).' + } >> "$GITHUB_STEP_SUMMARY" + + build-purdue-af: + needs: detect-docker-changes + if: needs.detect-docker-changes.outputs.purdue_af == 'true' + runs-on: ubuntu-latest + timeout-minutes: 120 + continue-on-error: true + env: + BUILDKIT_PROGRESS: plain + steps: + - uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Build purdue-af image with cache (advisory) + id: build_image + uses: docker/build-push-action@v6 + with: + context: . + file: docker/purdue-af/Dockerfile + load: true + pull: true + tags: local/purdue-af:${{ github.sha }} + cache-from: type=gha,scope=purdue-af + cache-to: type=gha,mode=max,scope=purdue-af,ignore-error=true + provenance: false + + - name: Smoke test purdue-af image (advisory) + id: smoke_test + run: .github/scripts/container-smoke.sh local/purdue-af:${{ github.sha }} purdue-af + + - name: Publish purdue-af advisory summary + if: always() run: | - find . 
-type f -iname 'Dockerfile*' -not -path './docker/dask-gateway-server/*' -exec hadolint --ignore DL3041 --ignore DL3033 --failure-threshold warning {} + \ No newline at end of file + { + echo '### purdue-af Container Advisory Summary' + echo + echo "- Build outcome: ${{ steps.build_image.outcome }}" + echo "- Smoke test outcome: ${{ steps.smoke_test.outcome }}" + echo '- Mode: advisory (job continue-on-error=true).' + } >> "$GITHUB_STEP_SUMMARY" diff --git a/.github/workflows/lint-json.yml b/.github/workflows/lint-json.yml index 84c5e6b5..1c5d56f2 100644 --- a/.github/workflows/lint-json.yml +++ b/.github/workflows/lint-json.yml @@ -1,41 +1,50 @@ name: Lint JSON on: - push: - paths: - - '**.json' pull_request: paths: - - '**.json' + - '**/*.json' + - '.github/workflows/lint-json.yml' + - '!docker/dask-gateway-server/**' + - '!docker/kaniko-build-jobs/**' + - '!docs/**' + - '!slurm/**' + - '!.cursor/**' + - '!.git/**' + +concurrency: + group: lint-json-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true permissions: - contents: write - pull-requests: write + contents: read jobs: - lint: + json-parse: runs-on: ubuntu-latest + continue-on-error: true steps: - uses: actions/checkout@v4 - with: - token: ${{ secrets.GITHUB_TOKEN }} - - name: Set up Node.js - uses: actions/setup-node@v4 - with: - node-version: '20' - - name: Install prettier - run: npm install -g prettier - - name: Format JSON files - run: | - find . 
-name "*.json" | grep -v "docker/dask-gateway-server" | xargs prettier --write - - name: Commit changes - if: ${{ github.event_name == 'pull_request' || github.ref_type == 'branch' }} + + - name: Validate JSON files (check-only, advisory) + shell: bash run: | - git config --local user.email "action@github.com" - git config --local user.name "GitHub Action" - REF_NAME="${GITHUB_HEAD_REF:-${GITHUB_REF_NAME}}" - git fetch origin "$REF_NAME" - git checkout -B "$REF_NAME" "origin/$REF_NAME" || git checkout -B "$REF_NAME" - git add -A - git diff --quiet && git diff --staged --quiet || git commit -m "Apply prettier JSON formatting" - git push origin "$REF_NAME" \ No newline at end of file + set -euo pipefail + mapfile -t files < <(find . -type f -name '*.json' \ + -not -path './docker/dask-gateway-server/*' \ + -not -path './docker/kaniko-build-jobs/*' \ + -not -path './docs/*' \ + -not -path './slurm/*' \ + -not -path './.cursor/*' \ + -not -path './.git/*' | sort) + + if [ "${#files[@]}" -eq 0 ]; then + echo 'No in-scope JSON files found.' + exit 0 + fi + + for f in "${files[@]}"; do + python3 -m json.tool "$f" >/dev/null + done + + echo "Validated ${#files[@]} JSON file(s)." 
diff --git a/.github/workflows/lint-python.yml b/.github/workflows/lint-python.yml index 32e273ec..1a3aae7f 100644 --- a/.github/workflows/lint-python.yml +++ b/.github/workflows/lint-python.yml @@ -1,44 +1,68 @@ name: Lint Python on: - push: - paths: - - '**.py' pull_request: paths: - - '**.py' + - '**/*.py' + - '.github/workflows/lint-python.yml' + - '!docker/dask-gateway-server/**' + - '!docker/kaniko-build-jobs/**' + - '!docs/**' + - '!slurm/**' + - '!.cursor/**' + - '!.git/**' + +concurrency: + group: lint-python-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true permissions: - contents: write - pull-requests: write + contents: read jobs: - lint: + lint-python: runs-on: ubuntu-latest + continue-on-error: true + env: + BLACK_VERSION: '24.10.0' + ISORT_VERSION: '5.13.2' steps: - uses: actions/checkout@v4 - with: - token: ${{ secrets.GITHUB_TOKEN }} + - name: Set up Python uses: actions/setup-python@v5 with: python-version: '3.11' - - name: Install dependencies + + - name: Install tooling run: | + set -euo pipefail python -m pip install --upgrade pip - pip install black isort - - name: Run black - run: black . --exclude docker/dask-gateway-server - - name: Run isort - run: isort . 
--skip docker/dask-gateway-server - - name: Commit changes - if: ${{ github.event_name == 'pull_request' || github.ref_type == 'branch' }} + pip install "black==${BLACK_VERSION}" "isort==${ISORT_VERSION}" + + - name: Tool versions + run: | + set -euo pipefail + black --version + isort --version-number + + - name: Run black/isort/py_compile (check-only, advisory) + shell: bash run: | - git config --local user.email "action@github.com" - git config --local user.name "GitHub Action" - REF_NAME="${GITHUB_HEAD_REF:-${GITHUB_REF_NAME}}" - git fetch origin "$REF_NAME" - git checkout -B "$REF_NAME" "origin/$REF_NAME" || git checkout -B "$REF_NAME" - git add -A - git diff --quiet && git diff --staged --quiet || git commit -m "Apply black and isort formatting" - git push origin "$REF_NAME" \ No newline at end of file + set -euo pipefail + mapfile -t files < <(find . -type f -name '*.py' \ + -not -path './docker/dask-gateway-server/*' \ + -not -path './docker/kaniko-build-jobs/*' \ + -not -path './docs/*' \ + -not -path './slurm/*' \ + -not -path './.cursor/*' \ + -not -path './.git/*' | sort) + + if [ "${#files[@]}" -eq 0 ]; then + echo 'No in-scope Python files found.' 
+ exit 0 + fi + + black --check "${files[@]}" + isort --profile black --check-only "${files[@]}" + python -m py_compile "${files[@]}" diff --git a/.github/workflows/lint-shell.yml b/.github/workflows/lint-shell.yml index c3d65b08..f0e076e5 100644 --- a/.github/workflows/lint-shell.yml +++ b/.github/workflows/lint-shell.yml @@ -1,40 +1,76 @@ name: Lint Shell Scripts on: - push: - paths: - - '**.sh' pull_request: paths: - - '**.sh' + - '**/*.sh' + - '**/pixi-wrapper' + - '**/fix-permissions' + - '.github/workflows/lint-shell.yml' + - '!docker/dask-gateway-server/**' + - '!docker/kaniko-build-jobs/**' + - '!docs/**' + - '!slurm/**' + - '!.cursor/**' + - '!.git/**' + +concurrency: + group: lint-shell-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true permissions: - contents: write - pull-requests: write + contents: read jobs: - lint: + lint-shell: runs-on: ubuntu-latest + continue-on-error: true + env: + SHELLCHECK_VERSION: '0.10.0' + SHFMT_VERSION: '3.10.0' steps: - uses: actions/checkout@v4 - with: - token: ${{ secrets.GITHUB_TOKEN }} - - name: Install shfmt + + - name: Install shell tooling run: | - curl -L https://github.com/mvdan/sh/releases/download/v3.7.0/shfmt_v3.7.0_linux_amd64 -o shfmt - chmod +x shfmt - sudo mv shfmt /usr/local/bin/ - - name: Format shell scripts + set -euo pipefail + curl -fsSL "https://github.com/koalaman/shellcheck/releases/download/v${SHELLCHECK_VERSION}/shellcheck-v${SHELLCHECK_VERSION}.linux.x86_64.tar.xz" -o /tmp/shellcheck.tar.xz + tar -xJf /tmp/shellcheck.tar.xz -C /tmp + sudo mv "/tmp/shellcheck-v${SHELLCHECK_VERSION}/shellcheck" /usr/local/bin/shellcheck + curl -fsSL "https://github.com/mvdan/sh/releases/download/v${SHFMT_VERSION}/shfmt_v${SHFMT_VERSION}_linux_amd64" -o /tmp/shfmt + chmod +x /tmp/shfmt + sudo mv /tmp/shfmt /usr/local/bin/shfmt + + - name: Tool versions run: | - find . 
-type f -name '*.sh' -not -path './docker/dask-gateway-server/*' -exec shfmt -w {} + - - name: Commit changes - if: ${{ github.event_name == 'pull_request' || github.ref_type == 'branch' }} + set -euo pipefail + shellcheck --version + shfmt --version + + - name: Run shellcheck/shfmt/bash -n (check-only, advisory) + shell: bash run: | - git config --local user.email "action@github.com" - git config --local user.name "GitHub Action" - REF_NAME="${GITHUB_HEAD_REF:-${GITHUB_REF_NAME}}" - git fetch origin "$REF_NAME" - git checkout -B "$REF_NAME" "origin/$REF_NAME" || git checkout -B "$REF_NAME" - git add -A - git diff --quiet && git diff --staged --quiet || git commit -m "Apply shfmt shell script formatting" - git push origin "$REF_NAME" \ No newline at end of file + set -euo pipefail + mapfile -t files < <(find . -type f \ + \( -name '*.sh' -o -name 'pixi-wrapper' -o -name 'fix-permissions' \) \ + -not -path './docker/dask-gateway-server/*' \ + -not -path './docker/kaniko-build-jobs/*' \ + -not -path './docs/*' \ + -not -path './slurm/*' \ + -not -path './.cursor/*' \ + -not -path './.git/*' | sort) + + if [ "${#files[@]}" -eq 0 ]; then + echo 'No in-scope shell files found.' 
+ exit 0 + fi + + shellcheck -S error -s bash "${files[@]}" + mapfile -t shfmt_files < <(printf '%s\n' "${files[@]}" | grep -E '\.sh$' || true) + if [ "${#shfmt_files[@]}" -gt 0 ]; then + shfmt -d "${shfmt_files[@]}" + fi + + for f in "${files[@]}"; do + bash -n "$f" + done diff --git a/.github/workflows/lint-yaml.yml b/.github/workflows/lint-yaml.yml index 532884f1..44edbd9b 100644 --- a/.github/workflows/lint-yaml.yml +++ b/.github/workflows/lint-yaml.yml @@ -1,43 +1,75 @@ name: Lint YAML on: - push: - paths: - - '**.yml' - - '**.yaml' pull_request: paths: - - '**.yml' - - '**.yaml' + - '**/*.yml' + - '**/*.yaml' + - '!docker/dask-gateway-server/**' + - '!docker/kaniko-build-jobs/**' + - '!docs/**' + - '!slurm/**' + - '!.cursor/**' + - '!.git/**' + - '!.github/workflows/**' + - '.github/workflows/lint-yaml.yml' + +concurrency: + group: lint-yaml-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true permissions: - contents: write - pull-requests: write + contents: read jobs: - lint: + yaml-parse: runs-on: ubuntu-latest + continue-on-error: true steps: - uses: actions/checkout@v4 - with: - token: ${{ secrets.GITHUB_TOKEN }} - - name: Set up Node.js - uses: actions/setup-node@v4 - with: - node-version: '20' - - name: Install prettier - run: npm install -g prettier - - name: Format YAML files + + - name: Install parser dependency run: | - find . 
-name "*.yml" -o -name "*.yaml" | grep -v "docker/dask-gateway-server" | grep -v "templates" | grep -v ".github/workflows" | xargs prettier --write - - name: Commit changes - if: ${{ github.event_name == 'pull_request' || github.ref_type == 'branch' }} + set -euo pipefail + python3 -m pip install --upgrade pip + python3 -m pip install pyyaml + + - name: Validate YAML files (check-only, advisory) + shell: bash run: | - git config --local user.email "action@github.com" - git config --local user.name "GitHub Action" - REF_NAME="${GITHUB_HEAD_REF:-${GITHUB_REF_NAME}}" - git fetch origin "$REF_NAME" - git checkout -B "$REF_NAME" "origin/$REF_NAME" || git checkout -B "$REF_NAME" - git add -A - git diff --quiet && git diff --staged --quiet || git commit -m "Apply prettier YAML formatting" - git push origin "$REF_NAME" \ No newline at end of file + set -euo pipefail + python3 - <<'PY' + from pathlib import Path + import yaml + + files = sorted( + p for p in Path('.').rglob('*') + if p.is_file() and p.suffix in {'.yml', '.yaml'} + ) + + excluded_prefixes = ( + Path('docker/dask-gateway-server'), + Path('docker/kaniko-build-jobs'), + Path('docs'), + Path('slurm'), + Path('.cursor'), + Path('.git'), + Path('.github/workflows'), + ) + + filtered = [] + for p in files: + if any(str(p).startswith(str(prefix) + '/') or p == prefix for prefix in excluded_prefixes): + continue + filtered.append(p) + + if not filtered: + print('No in-scope YAML files found.') + raise SystemExit(0) + + for p in filtered: + with p.open('r', encoding='utf-8') as f: + list(yaml.safe_load_all(f)) + + print(f'Validated {len(filtered)} YAML file(s).') + PY diff --git a/.github/workflows/nightly-security-advisory.yml b/.github/workflows/nightly-security-advisory.yml new file mode 100644 index 00000000..c61012e5 --- /dev/null +++ b/.github/workflows/nightly-security-advisory.yml @@ -0,0 +1,119 @@ +name: Nightly Security Advisory + +on: + schedule: + - cron: '17 5 * * *' + workflow_dispatch: + +concurrency: + 
group: nightly-security-advisory-${{ github.ref }} + cancel-in-progress: true + +permissions: + contents: read + +jobs: + trivy-filesystem: + runs-on: ubuntu-latest + timeout-minutes: 30 + continue-on-error: true + steps: + - uses: actions/checkout@v4 + + - name: Run Trivy filesystem scan (advisory) + id: fs_scan + uses: aquasecurity/trivy-action@0.33.1 + with: + scan-type: fs + scan-ref: . + scanners: vuln + severity: HIGH,CRITICAL + ignore-unfixed: true + exit-code: '1' + format: json + output: trivy-nightly-fs.json + skip-dirs: docker/dask-gateway-server,docs,docs/source/demos,docker/kaniko-build-jobs,slurm,.cursor + + - name: Publish nightly Trivy summary (advisory) + if: always() + shell: bash + env: + FS_SCAN_OUTCOME: ${{ steps.fs_scan.outcome || 'unknown' }} + run: | + set -euo pipefail + python3 - <<'PY' + import json + import os + from collections import Counter + from pathlib import Path + + report_path = Path('trivy-nightly-fs.json') + summary_path = Path(os.environ['GITHUB_STEP_SUMMARY']) + title = 'Nightly Trivy Vulnerability Summary' + scan_outcome = os.environ.get('FS_SCAN_OUTCOME', 'unknown') + + with summary_path.open('a', encoding='utf-8') as summary: + summary.write(f'### {title}\n\n') + summary.write('| Scan | Step outcome | Report |\n') + summary.write('|---|---|---|\n') + summary.write(f'| Filesystem vulnerability scan | `{scan_outcome}` | {"present" if report_path.exists() else "missing"} |\n\n') + + if not report_path.exists(): + summary.write('- Trivy report was not generated.\n') + summary.write('- Overall result: **scan-step-failed**\n') + summary.write('- Mode: advisory (job continue-on-error=true).\n') + print('::warning::Nightly Trivy report was not generated.') + raise SystemExit(0) + + payload = json.loads(report_path.read_text(encoding='utf-8')) + results = payload.get('Results', []) if isinstance(payload, dict) else payload + + severity_counts = Counter() + target_counts = Counter() + + for result in results: + target = 
result.get('Target', 'unknown-target') + for vuln in result.get('Vulnerabilities') or []: + severity = (vuln.get('Severity') or 'UNKNOWN').upper() + severity_counts[severity] += 1 + target_counts[target] += 1 + + high_critical = severity_counts.get('HIGH', 0) + severity_counts.get('CRITICAL', 0) + + summary.write(f'- HIGH/CRITICAL findings: **{high_critical}**\n') + summary.write(f'- Targets with findings: **{len(target_counts)}**\n\n') + + if high_critical == 0: + summary.write('No HIGH/CRITICAL vulnerabilities found in scope.\n') + summary.write('- Overall result: **clear**\n') + summary.write('- Mode: advisory (job continue-on-error=true).\n') + print('::notice::Nightly Trivy found no HIGH/CRITICAL vulnerabilities.') + raise SystemExit(0) + + summary.write('| Severity | Count |\n') + summary.write('|---|---:|\n') + for severity in ('CRITICAL', 'HIGH', 'MEDIUM', 'LOW', 'UNKNOWN'): + count = severity_counts.get(severity, 0) + if count: + summary.write(f'| {severity} | {count} |\n') + + summary.write('\n| Top targets | Findings |\n') + summary.write('|---|---:|\n') + for target, count in target_counts.most_common(10): + summary.write(f'| `{target}` | {count} |\n') + + summary.write('\n') + summary.write('- Overall result: **findings-detected**\n') + summary.write('- Mode: advisory (job continue-on-error=true).\n') + + print(f'::warning::Nightly Trivy found {high_critical} HIGH/CRITICAL vulnerabilities. See summary and artifact.') + PY + + - name: Upload nightly Trivy artifact + if: always() + uses: actions/upload-artifact@v4 + with: + name: trivy-nightly-fs-${{ github.run_id }} + path: trivy-nightly-fs.json + if-no-files-found: ignore + retention-days: 14 diff --git a/README.md b/README.md index 0faaac5d..926e2e30 100644 --- a/README.md +++ b/README.md @@ -18,9 +18,33 @@ Each user is provided with a 25GB home directory at first login. 
These directori [![Documentation Status](https://readthedocs.org/projects/purdue-af/badge/?version=latest)](https://purdue-af.readthedocs.io/en/latest/?badge=latest) -[![Python](https://github.com/PurdueAF/purdue-af/workflows/Lint%20Python/badge.svg)](https://github.com/PurdueAF/purdue-af/actions?query=workflow%3A%22Lint+Python%22) -[![YAML](https://github.com/PurdueAF/purdue-af/workflows/Lint%20YAML/badge.svg)](https://github.com/PurdueAF/purdue-af/actions?query=workflow%3A%22Lint+YAML%22) -[![JSON](https://github.com/PurdueAF/purdue-af/workflows/Lint%20JSON/badge.svg)](https://github.com/PurdueAF/purdue-af/actions?query=workflow%3A%22Lint+JSON%22) -[![Shell](https://github.com/PurdueAF/purdue-af/workflows/Lint%20Shell%20Scripts/badge.svg)](https://github.com/PurdueAF/purdue-af/actions?query=workflow%3A%22Lint+Shell+Scripts%22) -[![Docker](https://github.com/PurdueAF/purdue-af/workflows/Lint%20Dockerfiles/badge.svg)](https://github.com/PurdueAF/purdue-af/actions?query=workflow%3A%22Lint+Dockerfiles%22) - +### Runtime Status + +[![Workflow Integrity](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-workflow-integrity.yml/badge.svg?branch=main)](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-workflow-integrity.yml) +[![Repo Quality](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-repo-quality.yml/badge.svg?branch=main)](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-repo-quality.yml) +[![CI Format Autofix](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-format-autofix.yml/badge.svg?branch=main)](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-format-autofix.yml) +[![CI Integration Scenarios](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-integration-scenarios.yml/badge.svg?branch=main)](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-integration-scenarios.yml) +[![Container 
Reliability](https://github.com/PurdueAF/purdue-af/actions/workflows/lint-docker.yml/badge.svg?branch=main)](https://github.com/PurdueAF/purdue-af/actions/workflows/lint-docker.yml) +[![GitOps Deployability](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-gitops-deployability.yml/badge.svg?branch=main)](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-gitops-deployability.yml) +[![CI Security Advisory](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-security-advisory.yml/badge.svg?branch=main)](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-security-advisory.yml) +[![Nightly Security](https://github.com/PurdueAF/purdue-af/actions/workflows/nightly-security-advisory.yml/badge.svg?branch=main)](https://github.com/PurdueAF/purdue-af/actions/workflows/nightly-security-advisory.yml) + +### Policy Badges + +[![Coverage Gate](https://img.shields.io/badge/Coverage%20Gate-%3E%3D70%25%20%28advisory%29-4c1)](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-repo-quality.yml) +[![Security Scans](https://img.shields.io/badge/Security%20Scans-PR%20%2B%20Nightly-0366d6)](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-security-advisory.yml) +[![Validation Mode](https://img.shields.io/badge/Validation%20Mode-Advisory--first-f59e0b)](https://github.com/PurdueAF/purdue-af/actions) +[![Autofix](https://img.shields.io/badge/Autofix-Python%2FShell%2FJSON%2FYAML-7c3aed)](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-format-autofix.yml) + +### CI Profile + +| Signal | Workflow | Trigger | Mode (advisory/blocking) | Notes | +|---|---|---|---|---| +| Workflow integrity | [CI Workflow Integrity](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-workflow-integrity.yml) | Pull request (`.github/workflows/**`) | advisory | Actionlint + workflow YAML parse | +| Repo quality | [CI Repo Quality](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-repo-quality.yml) | Pull request (unit/runtime paths) | 
advisory | Unit tests + 70% coverage policy signal | +| Format autofix | [CI Format Autofix](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-format-autofix.yml) | Pull request open/sync/reopen (format-targeted paths) | advisory | Auto-formats and pushes fix commits to PR branch | +| Integration scenarios | [CI Integration Scenarios](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-integration-scenarios.yml) | Pull request (integration paths) | advisory | Scripted integration scenario run | +| Container reliability | [Container Reliability](https://github.com/PurdueAF/purdue-af/actions/workflows/lint-docker.yml) | Pull request (container/slurm paths) | advisory | Hadolint + image build/smoke checks | +| GitOps deployability | [CI GitOps Deployability](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-gitops-deployability.yml) | Pull request (`deploy/**`) | advisory | Kustomize render + kubeconform validation | +| Security advisory (PR) | [CI Security Advisory](https://github.com/PurdueAF/purdue-af/actions/workflows/ci-security-advisory.yml) | Pull request (security-relevant paths) + manual dispatch | advisory | Trivy vuln/config scans with summary + artifacts | +| Security advisory (nightly) | [Nightly Security Advisory](https://github.com/PurdueAF/purdue-af/actions/workflows/nightly-security-advisory.yml) | Nightly schedule + manual dispatch | advisory | Trivy filesystem scan with nightly summary | diff --git a/docker/af-pod-monitor/Dockerfile b/docker/af-pod-monitor/Dockerfile index f03b2fba..c6f05b0b 100644 --- a/docker/af-pod-monitor/Dockerfile +++ b/docker/af-pod-monitor/Dockerfile @@ -2,10 +2,10 @@ FROM python:3.8-slim WORKDIR /etc/ -RUN pip install --no-cache-dir prometheus_client==0.22.1 +RUN pip install --no-cache-dir prometheus_client==0.21.1 COPY pod-metrics-exporter.py /etc/ RUN chmod +x /etc/pod-metrics-exporter.py -CMD ["python", "/etc/pod-metrics-exporter.py"] \ No newline at end of file +CMD ["python", 
"/etc/pod-metrics-exporter.py"] diff --git a/docker/purdue-af/genaistudio/genaistudio.py b/docker/purdue-af/genaistudio/genaistudio.py index 965a65a7..832d3be2 100644 --- a/docker/purdue-af/genaistudio/genaistudio.py +++ b/docker/purdue-af/genaistudio/genaistudio.py @@ -1,5 +1,4 @@ -from jupyter_ai_magics.providers import (BaseProvider, EnvAuthStrategy, - TextField) +from jupyter_ai_magics.providers import BaseProvider, EnvAuthStrategy, TextField from langchain_openai import ChatOpenAI diff --git a/docker/purdue-af/scripts/eos-connect.sh b/docker/purdue-af/scripts/eos-connect.sh index 055e8984..33d51ec8 100644 --- a/docker/purdue-af/scripts/eos-connect.sh +++ b/docker/purdue-af/scripts/eos-connect.sh @@ -31,7 +31,7 @@ if [[ $krb_ticket = "" ]]; then echo " > Kerberos authentication failed!" echo "" return 1 - else: + else echo " > Kerberos authentication complete!" echo "" fi diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 00000000..eac3d579 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,77 @@ +from __future__ import annotations + +import importlib.util +import sys +from pathlib import Path +from types import ModuleType +from typing import Callable + +import pytest + +REPO_ROOT = Path(__file__).resolve().parents[1] + + +class RecordingGauge: + def __init__(self) -> None: + self.values: list[float] = [] + self.label_children: dict[tuple[tuple[str, str], ...], "RecordingGauge"] = {} + + def set(self, value: float) -> None: + self.values.append(value) + + def labels(self, **labels: str) -> "RecordingGauge": + key = tuple(sorted(labels.items())) + child = self.label_children.get(key) + if child is None: + child = RecordingGauge() + self.label_children[key] = child + return child + + +@pytest.fixture +def recording_gauge_cls(): + return RecordingGauge + + +@pytest.fixture +def prometheus_stub() -> ModuleType: + module = ModuleType("prometheus_client") + + class Gauge: + def __init__(self, *_args, **_kwargs) -> None: + self.values = [] + + 
def set(self, value: float) -> None: + self.values.append(value) + + def labels(self, **_labels: str) -> "Gauge": + return self + + module.Gauge = Gauge + module.start_http_server = lambda *_args, **_kwargs: None + return module + + +@pytest.fixture +def module_loader(monkeypatch: pytest.MonkeyPatch) -> Callable[..., object]: + counter = 0 + + def _load( + relative_path: str, *, extra_modules: dict[str, object] | None = None + ) -> object: + nonlocal counter + counter += 1 + module_name = f"test_module_{counter}" + module_path = REPO_ROOT / relative_path + + if extra_modules: + for name, module in extra_modules.items(): + monkeypatch.setitem(sys.modules, name, module) + + spec = importlib.util.spec_from_file_location(module_name, module_path) + assert spec is not None and spec.loader is not None + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + return _load diff --git a/tests/fixtures/container_smoke/matrix.json b/tests/fixtures/container_smoke/matrix.json new file mode 100644 index 00000000..78619afd --- /dev/null +++ b/tests/fixtures/container_smoke/matrix.json @@ -0,0 +1,102 @@ +[ + { + "name": "af_pod_monitor_success", + "image": "ghcr.io/purdue-af/af-pod-monitor:test", + "profile": "af-pod-monitor", + "mock": { + "inspect_exit": 0, + "run_exit": 0 + }, + "expected": { + "exit_code": 0, + "stdout_contains": [ + "Smoke checks passed for profile: af-pod-monitor" + ], + "stderr_contains": [], + "log_lines": [ + "image inspect ghcr.io/purdue-af/af-pod-monitor:test", + "run --rm --entrypoint python ghcr.io/purdue-af/af-pod-monitor:test -c import prometheus_client" + ] + } + }, + { + "name": "interlink_slurm_plugin_failure", + "image": "ghcr.io/purdue-af/interlink-slurm-plugin:test", + "profile": "interlink-slurm-plugin", + "mock": { + "inspect_exit": 0, + "run_exit": 1, + "run_stderr": "missing /sidecar/slurm-sidecar" + }, + "expected": { + "exit_code": 1, + "stdout_contains": [], + "stderr_contains": [ + "missing 
/sidecar/slurm-sidecar" + ], + "log_lines": [ + "image inspect ghcr.io/purdue-af/interlink-slurm-plugin:test", + "run --rm --entrypoint /bin/sh ghcr.io/purdue-af/interlink-slurm-plugin:test -lc test -x /sidecar/slurm-sidecar" + ] + } + }, + { + "name": "unknown_profile_rejected", + "image": "ghcr.io/purdue-af/custom:test", + "profile": "unknown-profile", + "mock": { + "inspect_exit": 0, + "run_exit": 0 + }, + "expected": { + "exit_code": 2, + "stdout_contains": [], + "stderr_contains": [ + "Unknown profile: unknown-profile" + ], + "log_lines": [ + "image inspect ghcr.io/purdue-af/custom:test" + ] + } + }, + { + "name": "inspect_failure_short_circuit", + "image": "ghcr.io/purdue-af/purdue-af:test", + "profile": "purdue-af", + "mock": { + "inspect_exit": 1, + "inspect_stderr": "image not found", + "run_exit": 0 + }, + "expected": { + "exit_code": 1, + "stdout_contains": [], + "stderr_contains": [ + "image not found" + ], + "log_lines": [ + "image inspect ghcr.io/purdue-af/purdue-af:test" + ] + } + }, + { + "name": "purdue_af_success", + "image": "ghcr.io/purdue-af/purdue-af:test", + "profile": "purdue-af", + "mock": { + "inspect_exit": 0, + "run_exit": 0 + }, + "expected": { + "exit_code": 0, + "stdout_contains": [ + "Smoke checks passed for profile: purdue-af" + ], + "stderr_contains": [], + "log_lines": [ + "image inspect ghcr.io/purdue-af/purdue-af:test", + "run --rm --entrypoint /bin/bash ghcr.io/purdue-af/purdue-af:test -lc python --version && jupyter --version >/dev/null" + ] + } + } +] diff --git a/tests/fixtures/monitoring/event_rate_cases.json b/tests/fixtures/monitoring/event_rate_cases.json new file mode 100644 index 00000000..785a3de0 --- /dev/null +++ b/tests/fixtures/monitoring/event_rate_cases.json @@ -0,0 +1,12 @@ +[ + { + "name": "valid_event_rate", + "fixture_file": "event_rate_valid.txt", + "expected_gauge_value": 128.5 + }, + { + "name": "invalid_event_rate", + "fixture_file": "event_rate_invalid.txt", + "expected_gauge_value": 0 + } +] diff --git 
a/tests/fixtures/monitoring/event_rate_invalid.txt b/tests/fixtures/monitoring/event_rate_invalid.txt new file mode 100644 index 00000000..3fb64bad --- /dev/null +++ b/tests/fixtures/monitoring/event_rate_invalid.txt @@ -0,0 +1 @@ +not-a-number diff --git a/tests/fixtures/monitoring/event_rate_valid.txt b/tests/fixtures/monitoring/event_rate_valid.txt new file mode 100644 index 00000000..ae500ed9 --- /dev/null +++ b/tests/fixtures/monitoring/event_rate_valid.txt @@ -0,0 +1 @@ +128.5 diff --git a/tests/fixtures/node_health/checksum_cases.json b/tests/fixtures/node_health/checksum_cases.json new file mode 100644 index 00000000..954d99b8 --- /dev/null +++ b/tests/fixtures/node_health/checksum_cases.json @@ -0,0 +1,57 @@ +[ + { + "name": "checksum_match", + "mode": "normal", + "filename": "/depot/cms/purdue-af/validate-mount.txt", + "expected_checksum": "13dede34ee8dc7e5b70c9cd06ac15467", + "md5_stdout": "13dede34ee8dc7e5b70c9cd06ac15467 /depot/cms/purdue-af/validate-mount.txt\n", + "md5_stderr": "", + "returncode": 0, + "start_time": 1000.0, + "end_time": 1000.123, + "expected_result": true, + "expected_ping_ms": 123.0, + "expect_killed": false + }, + { + "name": "checksum_mismatch", + "mode": "normal", + "filename": "/work/projects/purdue-af/validate-mount.txt", + "expected_checksum": "f4cb7f2740ba3e87edfbda6c70fa94c2", + "md5_stdout": "00000000000000000000000000000000 /work/projects/purdue-af/validate-mount.txt\n", + "md5_stderr": "", + "returncode": 0, + "start_time": 2000.0, + "end_time": 2000.05, + "expected_result": false, + "expected_ping_ms": 50.0, + "expect_killed": false + }, + { + "name": "md5_error_returncode", + "mode": "normal", + "filename": "/eos/purdue/store/user/dkondrat/test.root", + "expected_checksum": "18864b0de8ae5a6a8d3b459a7999b431", + "md5_stdout": "", + "md5_stderr": "No such file or directory", + "returncode": 1, + "start_time": 3000.0, + "end_time": 3000.08, + "expected_result": false, + "expected_ping_ms": 80.0, + "expect_killed": false + 
}, + { + "name": "md5_timeout", + "mode": "timeout", + "filename": "/cvmfs/cms.cern.ch/SITECONF/T2_US_Purdue/Purdue-Hadoop/JobConfig/site-local-config.xml", + "expected_checksum": "3b570d80272b7188c13cef51e58b7151", + "md5_stdout": "", + "md5_stderr": "", + "returncode": 124, + "start_time": 4000.0, + "expected_result": false, + "expected_ping_ms": 3000, + "expect_killed": true + } +] diff --git a/tests/integration/common.py b/tests/integration/common.py new file mode 100644 index 00000000..9ad1358a --- /dev/null +++ b/tests/integration/common.py @@ -0,0 +1,90 @@ +from __future__ import annotations + +import importlib.util +import json +import sys +import types +from pathlib import Path +from typing import Any + +REPO_ROOT = Path(__file__).resolve().parents[2] +FIXTURES_ROOT = REPO_ROOT / "tests" / "fixtures" + + +class FakeGaugeChild: + def __init__(self, labels: dict[str, str]): + self.labels = labels + self.value: float | int | None = None + self.history: list[float | int] = [] + + def set(self, value: float | int) -> None: + self.value = value + self.history.append(value) + + +class FakeGauge: + def __init__( + self, + name: str, + description: str, + label_names: list[str] | tuple[str, ...] 
| None = None, + ): + self.name = name + self.description = description + self.label_names = tuple(label_names or ()) + self.value: float | int | None = None + self.history: list[float | int] = [] + self.children: dict[tuple[tuple[str, str], ...], FakeGaugeChild] = {} + + def set(self, value: float | int) -> None: + self.value = value + self.history.append(value) + + def labels(self, *args: str, **kwargs: str) -> FakeGaugeChild: + if args and kwargs: + raise ValueError("labels accepts positional or keyword labels, not both") + + if args: + if len(args) != len(self.label_names): + raise ValueError("label count does not match") + label_values = dict(zip(self.label_names, args)) + else: + label_values = {name: kwargs[name] for name in self.label_names} + + key = tuple((name, label_values[name]) for name in self.label_names) + child = self.children.get(key) + if child is None: + child = FakeGaugeChild(label_values) + self.children[key] = child + return child + + +def load_json_fixture(relative_path: str) -> Any: + fixture_path = FIXTURES_ROOT / relative_path + with fixture_path.open("r", encoding="utf-8") as f: + return json.load(f) + + +def load_module_with_fake_prometheus(relative_path: str, module_name: str): + module_path = REPO_ROOT / relative_path + spec = importlib.util.spec_from_file_location(module_name, module_path) + if spec is None or spec.loader is None: + raise RuntimeError(f"Could not load module spec for {module_path}") + + module = importlib.util.module_from_spec(spec) + + fake_prometheus = types.ModuleType("prometheus_client") + fake_prometheus.Gauge = FakeGauge + fake_prometheus.start_http_server = lambda *_args, **_kwargs: None + + original_prometheus = sys.modules.get("prometheus_client") + sys.modules["prometheus_client"] = fake_prometheus + try: + spec.loader.exec_module(module) + finally: + if original_prometheus is None: + del sys.modules["prometheus_client"] + else: + sys.modules["prometheus_client"] = original_prometheus + + return module 
diff --git a/tests/integration/test_container_smoke_matrix.py b/tests/integration/test_container_smoke_matrix.py new file mode 100644 index 00000000..1ed3a879 --- /dev/null +++ b/tests/integration/test_container_smoke_matrix.py @@ -0,0 +1,68 @@ +from __future__ import annotations + +import os +import subprocess +import tempfile +import unittest +from pathlib import Path + +from common import REPO_ROOT, load_json_fixture + +CONTAINER_SMOKE_SCRIPT = REPO_ROOT / ".github/scripts/container-smoke.sh" +MOCK_DOCKER_SCRIPT = REPO_ROOT / ".github/scripts/integration/mock-docker-cli.sh" + + +class ContainerSmokeBehaviorMatrixIntegrationTest(unittest.TestCase): + def test_container_smoke_behavior_matrix(self) -> None: + cases = load_json_fixture("container_smoke/matrix.json") + + for case in cases: + with self.subTest(case=case["name"]): + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + docker_wrapper = temp_path / "docker" + docker_wrapper.write_text( + f'#!/usr/bin/env bash\nexec "{MOCK_DOCKER_SCRIPT}" "$@"\n', + encoding="utf-8", + ) + docker_wrapper.chmod(0o755) + + log_file = temp_path / "docker.log" + env = os.environ.copy() + env["PATH"] = f"{temp_path}:{env.get('PATH', '')}" + env["MOCK_DOCKER_LOG"] = str(log_file) + env["MOCK_DOCKER_INSPECT_EXIT"] = str(case["mock"]["inspect_exit"]) + env["MOCK_DOCKER_RUN_EXIT"] = str(case["mock"]["run_exit"]) + env["MOCK_DOCKER_INSPECT_STDERR"] = case["mock"].get( + "inspect_stderr", "" + ) + env["MOCK_DOCKER_RUN_STDERR"] = case["mock"].get("run_stderr", "") + + result = subprocess.run( + [ + "bash", + str(CONTAINER_SMOKE_SCRIPT), + case["image"], + case["profile"], + ], + capture_output=True, + text=True, + check=False, + env=env, + ) + + expected = case["expected"] + self.assertEqual(result.returncode, expected["exit_code"]) + for expected_text in expected["stdout_contains"]: + self.assertIn(expected_text, result.stdout) + for expected_text in expected["stderr_contains"]: + 
self.assertIn(expected_text, result.stderr) + + logged_lines = [] + if log_file.exists(): + logged_lines = log_file.read_text(encoding="utf-8").splitlines() + self.assertEqual(logged_lines, expected["log_lines"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/integration/test_monitoring_metric_update.py b/tests/integration/test_monitoring_metric_update.py new file mode 100644 index 00000000..a647ffcb --- /dev/null +++ b/tests/integration/test_monitoring_metric_update.py @@ -0,0 +1,59 @@ +from __future__ import annotations + +import unittest +from pathlib import Path +from unittest import mock +from uuid import uuid4 + +from common import FIXTURES_ROOT, load_json_fixture, load_module_with_fake_prometheus + +METRIC_FILE = "/work/projects/purdue-af/agc/metrics/event_rate.txt" +MODULE_PATH = "apps/monitoring/af-monitoring/metrics_server.py" + + +class MonitoringMetricUpdateFlowIntegrationTest(unittest.TestCase): + def setUp(self) -> None: + module_name = f"metrics_server_integration_{uuid4().hex}" + self.module = load_module_with_fake_prometheus(MODULE_PATH, module_name) + + def _patched_open_for_fixture(self, fixture_path: Path): + real_open = open + + def _patched_open(path, *args, **kwargs): + if str(path) == METRIC_FILE: + return real_open(fixture_path, *args, **kwargs) + return real_open(path, *args, **kwargs) + + return _patched_open + + def test_fixture_backed_metric_updates(self) -> None: + cases = load_json_fixture("monitoring/event_rate_cases.json") + + for case in cases: + fixture_path = FIXTURES_ROOT / "monitoring" / case["fixture_file"] + with self.subTest(case=case["name"]), mock.patch( + "builtins.open", + side_effect=self._patched_open_for_fixture(fixture_path), + ): + self.module.update_metrics() + self.assertEqual( + self.module.event_rate_per_worker.history[-1], + case["expected_gauge_value"], + ) + + def test_missing_metric_file_falls_back_to_zero(self) -> None: + real_open = open + + def _patched_open(path, *args, **kwargs): + 
if str(path) == METRIC_FILE: + raise FileNotFoundError("event rate fixture not found") + return real_open(path, *args, **kwargs) + + with mock.patch("builtins.open", side_effect=_patched_open): + self.module.update_metrics() + + self.assertEqual(self.module.event_rate_per_worker.history[-1], 0) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/integration/test_node_healthcheck_integration.py b/tests/integration/test_node_healthcheck_integration.py new file mode 100644 index 00000000..17abc15d --- /dev/null +++ b/tests/integration/test_node_healthcheck_integration.py @@ -0,0 +1,85 @@ +from __future__ import annotations + +import subprocess +import unittest +from unittest import mock +from uuid import uuid4 + +from common import load_json_fixture, load_module_with_fake_prometheus + +MODULE_PATH = "apps/monitoring/af-monitoring/node_healthcheck.py" + + +class FakeMd5Process: + def __init__(self, case: dict): + self.mode = case["mode"] + self.stdout = case["md5_stdout"] + self.stderr = case["md5_stderr"] + self.returncode = case["returncode"] + self.killed = False + self.communicate_calls = 0 + self.timeout_history: list[float | int | None] = [] + + def communicate(self, timeout=None): + self.communicate_calls += 1 + self.timeout_history.append(timeout) + if self.mode == "timeout" and self.communicate_calls == 1: + raise subprocess.TimeoutExpired(cmd="/usr/bin/md5sum", timeout=timeout) + return self.stdout, self.stderr + + def kill(self): + self.killed = True + + +class NodeHealthChecksumTimeoutIntegrationTest(unittest.TestCase): + def setUp(self) -> None: + module_name = f"node_healthcheck_integration_{uuid4().hex}" + self.module = load_module_with_fake_prometheus(MODULE_PATH, module_name) + + def test_checksum_and_timeout_matrix(self) -> None: + cases = load_json_fixture("node_health/checksum_cases.json") + + for case in cases: + process = FakeMd5Process(case) + time_values = [case["start_time"]] + if case["mode"] != "timeout": + 
time_values.append(case["end_time"]) + + with self.subTest(case=case["name"]), mock.patch.object( + self.module.subprocess, + "Popen", + return_value=process, + ) as popen_mock, mock.patch.object( + self.module.time, + "time", + side_effect=time_values, + ): + result, ping_ms = self.module.check_if_directory_exists( + (case["filename"], case["expected_checksum"]) + ) + + self.assertEqual(result, case["expected_result"]) + self.assertEqual(process.killed, case["expect_killed"]) + self.assertEqual( + popen_mock.call_args[0][0], + ["/usr/bin/md5sum", case["filename"]], + ) + if case["mode"] == "timeout": + self.assertEqual(process.timeout_history, [3, None]) + else: + self.assertEqual(process.timeout_history, [3]) + + expected_ping_ms = case["expected_ping_ms"] + if isinstance(expected_ping_ms, float): + self.assertAlmostEqual(ping_ms, expected_ping_ms, delta=0.001) + else: + self.assertEqual(ping_ms, expected_ping_ms) + + if case["mode"] == "timeout": + self.assertEqual(process.communicate_calls, 2) + else: + self.assertEqual(process.communicate_calls, 1) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/unit/test_docker_healthcheck.py b/tests/unit/test_docker_healthcheck.py new file mode 100644 index 00000000..64102674 --- /dev/null +++ b/tests/unit/test_docker_healthcheck.py @@ -0,0 +1,64 @@ +from __future__ import annotations + +import json +from types import ModuleType + + +class _FakeJsonFile: + def __init__(self, payload: bytes) -> None: + self.payload = payload + + def read_bytes(self) -> bytes: + return self.payload + + +class _FakePath: + def __init__(self, payload: bytes) -> None: + self.payload = payload + + def __truediv__(self, _part: str) -> "_FakePath": + return self + + def glob(self, _pattern: str): + return iter([_FakeJsonFile(self.payload)]) + + +def test_healthcheck_queries_jupyter_api_and_prints_response( + monkeypatch, module_loader +) -> None: + captured = {} + payload = json.dumps({"url": 
"https://af.example/"}).encode("utf-8") + + class _FakeResponse: + def __init__(self) -> None: + self.content = b"healthy" + self.raise_calls = 0 + + def raise_for_status(self) -> None: + self.raise_calls += 1 + + fake_response = _FakeResponse() + requests_stub = ModuleType("requests") + + def _fake_get(url: str, verify: bool): + captured["url"] = url + captured["verify"] = verify + return fake_response + + requests_stub.get = _fake_get + + pathlib_stub = ModuleType("pathlib") + pathlib_stub.Path = lambda _value: _FakePath(payload) + + printed = [] + monkeypatch.setenv("NB_USER", "alice") + monkeypatch.setattr("builtins.print", lambda value: printed.append(value)) + + module_loader( + "docker/purdue-af/jupyter/docker_healthcheck.py", + extra_modules={"pathlib": pathlib_stub, "requests": requests_stub}, + ) + + assert captured == {"url": "https://af.example/api", "verify": False} + assert fake_response.raise_calls == 1 + assert printed == [b"healthy"] diff --git a/tests/unit/test_metrics_server.py b/tests/unit/test_metrics_server.py new file mode 100644 index 00000000..2967aa7c --- /dev/null +++ b/tests/unit/test_metrics_server.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +from unittest.mock import mock_open + + +def test_update_metrics_sets_event_rate_from_file( + monkeypatch, module_loader, prometheus_stub, recording_gauge_cls +) -> None: + module = module_loader( + "apps/monitoring/af-monitoring/metrics_server.py", + extra_modules={"prometheus_client": prometheus_stub}, + ) + module.event_rate_per_worker = recording_gauge_cls() + monkeypatch.setattr("builtins.open", mock_open(read_data="42.5\n")) + + module.update_metrics() + + assert module.event_rate_per_worker.values == [42.5] + + +def test_update_metrics_sets_zero_when_read_fails( + monkeypatch, module_loader, prometheus_stub, recording_gauge_cls +) -> None: + module = module_loader( + "apps/monitoring/af-monitoring/metrics_server.py", + extra_modules={"prometheus_client": prometheus_stub}, + ) + 
module.event_rate_per_worker = recording_gauge_cls() + + def _raise(*_args, **_kwargs): + raise OSError("not found") + + monkeypatch.setattr("builtins.open", _raise) + + module.update_metrics() + + assert module.event_rate_per_worker.values == [0] diff --git a/tests/unit/test_node_healthcheck.py b/tests/unit/test_node_healthcheck.py new file mode 100644 index 00000000..e1510b5f --- /dev/null +++ b/tests/unit/test_node_healthcheck.py @@ -0,0 +1,111 @@ +from __future__ import annotations + +import subprocess + +import pytest + + +def test_check_if_directory_exists_reports_success_for_matching_checksum( + monkeypatch, module_loader, prometheus_stub +) -> None: + module = module_loader( + "apps/monitoring/af-monitoring/node_healthcheck.py", + extra_modules={"prometheus_client": prometheus_stub}, + ) + + class FakeProc: + returncode = 0 + + def __init__(self) -> None: + self.killed = False + + def communicate(self, timeout=None): + return ("abc123 /tmp/validate.txt\n", "") + + def kill(self) -> None: + self.killed = True + + proc = FakeProc() + popen_calls = [] + + def _fake_popen(args, **kwargs): + popen_calls.append((args, kwargs)) + return proc + + times = iter([100.0, 100.2]) + monkeypatch.setattr(module.time, "time", lambda: next(times)) + monkeypatch.setattr(module.subprocess, "Popen", _fake_popen) + + valid, elapsed_ms = module.check_if_directory_exists( + ("/tmp/validate.txt", "abc123") + ) + + assert valid is True + assert elapsed_ms == pytest.approx(200.0) + assert popen_calls[0][0] == ["/usr/bin/md5sum", "/tmp/validate.txt"] + + +def test_check_if_directory_exists_returns_timeout_result( + monkeypatch, module_loader, prometheus_stub +) -> None: + module = module_loader( + "apps/monitoring/af-monitoring/node_healthcheck.py", + extra_modules={"prometheus_client": prometheus_stub}, + ) + + class FakeProc: + returncode = 0 + + def __init__(self) -> None: + self.killed = False + self.calls = 0 + + def communicate(self, timeout=None): + self.calls += 1 + if 
self.calls == 1: + raise subprocess.TimeoutExpired(cmd="md5sum", timeout=timeout) + return ("", "") + + def kill(self) -> None: + self.killed = True + + proc = FakeProc() + monkeypatch.setattr(module.subprocess, "Popen", lambda *_args, **_kwargs: proc) + + valid, elapsed_ms = module.check_if_directory_exists( + ("/tmp/validate.txt", "abc123") + ) + + assert valid is False + assert elapsed_ms == 3000 + assert proc.killed is True + + +def test_update_metrics_writes_mount_health_and_ping( + monkeypatch, module_loader, prometheus_stub, recording_gauge_cls +) -> None: + module = module_loader( + "apps/monitoring/af-monitoring/node_healthcheck.py", + extra_modules={"prometheus_client": prometheus_stub}, + ) + module.mount_valid = recording_gauge_cls() + module.mount_ping_ms = recording_gauge_cls() + module.mounts = { + "mount-a": ("/mnt/a", "sum-a"), + "mount-b": ("/mnt/b", "sum-b"), + } + responses = iter([(True, 12.5), (False, 22.5)]) + monkeypatch.setattr( + module, + "check_if_directory_exists", + lambda _path_tuple: next(responses), + ) + + module.update_metrics() + + key_a = (("mount_name", "mount-a"), ("mount_path", "/mnt/a")) + key_b = (("mount_name", "mount-b"), ("mount_path", "/mnt/b")) + assert module.mount_valid.label_children[key_a].values == [1] + assert module.mount_valid.label_children[key_b].values == [0] + assert module.mount_ping_ms.label_children[key_a].values == [12.5] + assert module.mount_ping_ms.label_children[key_b].values == [22.5] diff --git a/tests/unit/test_pod_metrics_exporter.py b/tests/unit/test_pod_metrics_exporter.py new file mode 100644 index 00000000..90d2353e --- /dev/null +++ b/tests/unit/test_pod_metrics_exporter.py @@ -0,0 +1,89 @@ +from __future__ import annotations + +import glob +import os +from types import SimpleNamespace + + +def _load_exporter(monkeypatch, module_loader, prometheus_stub): + monkeypatch.setattr(os, "listdir", lambda _path: ["jovyan", "slurm", "alice"]) + monkeypatch.setattr(glob, "glob", lambda _pattern: 
["/home/alice"]) + return module_loader( + "docker/af-pod-monitor/pod-metrics-exporter.py", + extra_modules={"prometheus_client": prometheus_stub}, + ) + + +def test_module_initializes_directories_from_non_skipped_user( + monkeypatch, module_loader, prometheus_stub +) -> None: + module = _load_exporter(monkeypatch, module_loader, prometheus_stub) + + assert module.username == "alice" + assert module.directories == { + "home": "/home/alice", + "work": "/work/users/alice/", + } + + +def test_update_metrics_work_branch_sets_usage_and_access_time( + monkeypatch, module_loader, prometheus_stub, recording_gauge_cls +) -> None: + module = _load_exporter(monkeypatch, module_loader, prometheus_stub) + module.metrics = { + "work_dir_used": recording_gauge_cls(), + "work_dir_size": recording_gauge_cls(), + "work_dir_util": recording_gauge_cls(), + "work_dir_last_accessed": recording_gauge_cls(), + } + module.dl = "work" + monkeypatch.setattr( + module.subprocess, + "check_output", + lambda *_args, **_kwargs: b"2048 /work/users/alice/\n", + ) + monkeypatch.setattr( + module.os, + "stat", + lambda _directory: SimpleNamespace(st_atime=1700000000.0), + ) + + module.update_metrics("work") + + assert module.metrics["work_dir_used"].values == [2048] + assert module.metrics["work_dir_size"].values == [104857600] + assert module.metrics["work_dir_util"].values == [2048 / 104857600] + assert module.metrics["work_dir_last_accessed"].values == [1700000000.0] + + +def test_update_metrics_home_branch_parses_df_and_ignores_stat_errors( + monkeypatch, module_loader, prometheus_stub, recording_gauge_cls +) -> None: + module = _load_exporter(monkeypatch, module_loader, prometheus_stub) + module.metrics = { + "home_dir_used": recording_gauge_cls(), + "home_dir_size": recording_gauge_cls(), + "home_dir_util": recording_gauge_cls(), + "home_dir_last_accessed": recording_gauge_cls(), + } + module.dl = "home" + + df_output = ( + "Filesystem 1K-blocks Used Available Use% Mounted on\n" + "/dev/sda1 
1000 250 750 25% /home\n" + ).encode("utf-8") + monkeypatch.setattr( + module.subprocess, "check_output", lambda *_args, **_kwargs: df_output + ) + + def _raise_stat(_directory): + raise OSError("stat unavailable") + + monkeypatch.setattr(module.os, "stat", _raise_stat) + + module.update_metrics("home") + + assert module.metrics["home_dir_used"].values == [250] + assert module.metrics["home_dir_size"].values == [1000] + assert module.metrics["home_dir_util"].values == [0.25] + assert module.metrics["home_dir_last_accessed"].values == []