diff --git a/.github/workflows/e2e-matrix.yml b/.github/workflows/e2e-matrix.yml
index 447cc00b76..bbe3d21e5d 100644
--- a/.github/workflows/e2e-matrix.yml
+++ b/.github/workflows/e2e-matrix.yml
@@ -17,12 +17,7 @@ name: E2E Storage Matrix
 on:
   push:
     branches:
-      - chore/ci/e2e-matrix-skeleton
-  pull_request:
-    types: [opened, reopened, synchronize, labeled, unlabeled]
-    branches:
-      - main
-      - chore/ci/e2e-matrix-skeleton
+      - ci-nested-matrix-test-run
   schedule:
     - cron: "30 2 * * *"
   workflow_dispatch:
@@ -45,8 +40,9 @@ jobs:
             storage_class: linstor-thin-r2
             parent_storage_class: linstor-thin-r1-immediate
             image_storage_class: linstor-thin-r1-immediate
+            hotplug_storage_class: linstor-thin-r1-immediate
             attach_disk_size: 10Gi
-            data_disk_count: 2
+            data_disk_count: 1
     concurrency:
       group: setup-${{ github.head_ref || github.ref_name }}-${{ matrix.profile }}
       cancel-in-progress: true
@@ -56,8 +52,15 @@
       STORAGE_CLASS: ${{ matrix.storage_class }}
       PARENT_STORAGE_CLASS: ${{ matrix.parent_storage_class }}
       IMAGE_STORAGE_CLASS: ${{ matrix.image_storage_class }}
+      HOTPLUG_STORAGE_CLASS: ${{ matrix.hotplug_storage_class || matrix.parent_storage_class }}
       ATTACH_DISK_SIZE: ${{ matrix.attach_disk_size }}
       DATA_DISK_COUNT: ${{ matrix.data_disk_count }}
+    outputs:
+      run_id: ${{ steps.setup-output.outputs.run_id }}
+      run_artifact: ${{ steps.setup-output.outputs.run_artifact }}
+      profile: ${{ steps.setup-output.outputs.profile }}
+      storage_name: ${{ steps.setup-output.outputs.storage_name }}
+      storage_class: ${{ steps.setup-output.outputs.storage_class }}
     steps:
       - uses: actions/checkout@v4
 
@@ -78,14 +81,7 @@
           version: "latest"
 
       - name: Setup d8
-        uses: werf/trdl/actions/setup-app@v0.12.2
-        with:
-          repo: d8
-          url: https://deckhouse.ru/downloads/deckhouse-cli-trdl/
-          root-version: 1
-          root-sha512: 343bd5f0d8811254e5f0b6fe292372a7b7eda08d276ff255229200f84e58a8151ab2729df3515cb11372dc3899c70df172a4e54c8a596a73d67ae790466a0491
-          group: 0
-          channel: stable
+        uses: ./.github/actions/install-d8
 
       - name: Install yq
         run: |
@@ -93,6 +89,16 @@
           curl -L -o /usr/local/bin/yq https://github.com/mikefarah/yq/releases/download/v4.44.1/yq_linux_amd64
           chmod +x /usr/local/bin/yq
 
+      - name: Export setup outputs
+        id: setup-output
+        run: |
+          set -euo pipefail
+          echo "run_id=${RUN_ID}" >> "$GITHUB_OUTPUT"
+          echo "run_artifact=nested-run-${RUN_ID}" >> "$GITHUB_OUTPUT"
+          echo "profile=${PROFILE}" >> "$GITHUB_OUTPUT"
+          echo "storage_name=${{ matrix.storage_name }}" >> "$GITHUB_OUTPUT"
+          echo "storage_class=${STORAGE_CLASS}" >> "$GITHUB_OUTPUT"
+
       - name: Setup nested environment
         env:
           RUN_ID: ${{ env.RUN_ID }}
@@ -100,6 +106,7 @@
           STORAGE_CLASS: ${{ env.STORAGE_CLASS }}
           PARENT_STORAGE_CLASS: ${{ env.PARENT_STORAGE_CLASS }}
           IMAGE_STORAGE_CLASS: ${{ env.IMAGE_STORAGE_CLASS }}
+          HOTPLUG_STORAGE_CLASS: ${{ env.HOTPLUG_STORAGE_CLASS || env.PARENT_STORAGE_CLASS }}
           ATTACH_DISK_SIZE: ${{ env.ATTACH_DISK_SIZE }}
           DATA_DISK_COUNT: ${{ matrix.data_disk_count }}
           REGISTRY_DOCKER_CFG: ${{ secrets.DEV_REGISTRY_DOCKER_CFG }}
@@ -109,9 +116,100 @@
         run: |
           task ci:setup-nested-env
 
+      - name: Upload nested artifacts
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ steps.setup-output.outputs.run_artifact }}
+          path: |
+            ci/dvp-e2e/tmp/runs/${{ env.RUN_ID }}
+          if-no-files-found: ignore
+
+  tests:
+    name: Run E2E (${{ matrix.profile }} / ${{ matrix.storage_class }})
+    runs-on: ubuntu-latest
+    needs: setup
+    timeout-minutes: 300
+    strategy:
+      matrix:
+        include:
+          - profile: sds-replicated-volume
+            storage_name: sds
+            storage_class: linstor-thin-r2
+            parent_storage_class: linstor-thin-r1-immediate
+            image_storage_class: linstor-thin-r1-immediate
+            attach_disk_size: 10Gi
+            data_disk_count: 1
+    env:
+      GO_VERSION: "1.24.6"
+      TIMEOUT: 4h
+      RUN_ID: ${{ needs.setup.outputs.run_id }}
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Go ${{ env.GO_VERSION }}
+        uses: actions/setup-go@v5
+        with:
+          go-version: ${{ env.GO_VERSION }}
+
+      - name: Install Task
+        uses: arduino/setup-task@v2
+        with:
+          version: 3.x
+          repo-token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Install kubectl
+        uses: azure/setup-kubectl@v4
+        with:
+          version: "latest"
+
+      - name: Setup d8
+        uses: ./.github/actions/install-d8
+
+      - name: Install ginkgo
+        working-directory: test/e2e
+        run: |
+          go install github.com/onsi/ginkgo/v2/ginkgo@latest
+
+      - name: Download dependencies
+        working-directory: test/e2e
+        run: |
+          go mod download
+
+      - name: Download nested run artifacts
+        uses: actions/download-artifact@v4
+        with:
+          name: nested-run-${{ env.RUN_ID }}
+          path: ci/dvp-e2e/tmp/runs/${{ env.RUN_ID }}
+          merge-multiple: true
+
+      - name: Configure kubeconfig env
+        working-directory: ci/dvp-e2e
+        env:
+          RUN_ID: ${{ env.RUN_ID }}
+        run: |
+          task ci:kubeconfig:ensure
+
+      - name: Pause for manual inspection
+        working-directory: ci/dvp-e2e
+        env:
+          RUN_ID: ${{ env.RUN_ID }}
+          MANUAL_WAIT_SECONDS: ${{ vars.MANUAL_WAIT_SECONDS || '36000' }}
+        run: |
+          task ci:manual-wait
+
+      - name: Run E2E tests
+        working-directory: test/e2e
+        env:
+          STORAGE_CLASS_NAME: ${{ matrix.storage_class }}
+        run: |
+          task run:ci -v
+
   cleanup:
     name: Cleanup (${{ matrix.profile }})
-    needs: setup
+    needs:
+      - setup
+      - tests
     if: always()
     runs-on: ubuntu-latest
     strategy:
@@ -123,9 +221,7 @@
             parent_storage_class: linstor-thin-r1-immediate
             image_storage_class: linstor-thin-r1-immediate
             attach_disk_size: 10Gi
-            data_disk_count: 2
-    env:
-      CLEANUP_PREFIX: ${{ vars.CLEANUP_PREFIX || 'nightly-nested-e2e-' }}
+            data_disk_count: 1
 
     steps:
       - uses: actions/checkout@v4
@@ -142,7 +238,9 @@
       - name: Cleanup test namespaces
         working-directory: ci/dvp-e2e
         run: |
+          # Cleanup specific RUN_ID for this matrix leg
+          RUN_ID="nightly-nested-e2e-${{ matrix.storage_name }}-${{ github.run_number }}"
           task cleanup:namespaces \
-            PREFIX="${CLEANUP_PREFIX}" \
+            PREFIX="${RUN_ID}" \
             API_URL="${E2E_K8S_URL}" \
             SA_TOKEN="${{ secrets.E2E_NESTED_SA_SECRET }}"
diff --git a/ci/dvp-e2e/Taskfile.yaml b/ci/dvp-e2e/Taskfile.yaml
index e4ff030c00..f5a02029f3 100644
--- a/ci/dvp-e2e/Taskfile.yaml
+++ b/ci/dvp-e2e/Taskfile.yaml
@@ -96,6 +96,7 @@ tasks:
       STORAGE_CLASS: '{{ .STORAGE_CLASS | default (env "STORAGE_CLASS") | default "" }}'
       IMAGE_STORAGE_CLASS: '{{ .IMAGE_STORAGE_CLASS | default (env "IMAGE_STORAGE_CLASS") | default "" }}'
       PARENT_STORAGE_CLASS: '{{ .PARENT_STORAGE_CLASS | default (env "PARENT_STORAGE_CLASS") | default "" }}'
+      HOTPLUG_STORAGE_CLASS: '{{ .HOTPLUG_STORAGE_CLASS | default (env "HOTPLUG_STORAGE_CLASS") | default "" }}'
       ATTACH_DISK_SIZE: '{{ .ATTACH_DISK_SIZE | default (env "ATTACH_DISK_SIZE") | default "10Gi" }}'
       DATA_DISK_COUNT: '{{ .DATA_DISK_COUNT | default (env "DATA_DISK_COUNT") | default "2" }}'
       REGISTRY_DOCKER_CFG: '{{ .REGISTRY_DOCKER_CFG | default (env "REGISTRY_DOCKER_CFG") | default "" }}'
@@ -105,7 +106,7 @@
       VALUES_FILE_PATH: '{{ printf "%s/values.yaml" .RUN_DIR }}'
       PARENT_KUBECONFIG_PATH: '{{ printf "%s/parent.kubeconfig" .RUN_DIR }}'
       NESTED_KUBECONFIG_PATH: '{{ printf "%s/nested/kubeconfig" .RUN_DIR }}'
-      EFFECTIVE_DISK_SC: "{{ if .IMAGE_STORAGE_CLASS }}{{ .IMAGE_STORAGE_CLASS }}{{ else }}{{ .STORAGE_CLASS }}{{ end }}"
+      EFFECTIVE_DISK_SC: "{{ if .HOTPLUG_STORAGE_CLASS }}{{ .HOTPLUG_STORAGE_CLASS }}{{ else if .IMAGE_STORAGE_CLASS }}{{ .IMAGE_STORAGE_CLASS }}{{ else }}{{ .STORAGE_CLASS }}{{ end }}"
     cmds:
       - task: ci:prepare-env
         vars:
@@ -114,6 +115,7 @@
           PROFILE: "{{ .PROFILE }}"
           STORAGE_CLASS: "{{ .STORAGE_CLASS }}"
           PARENT_STORAGE_CLASS: "{{ .PARENT_STORAGE_CLASS }}"
+          HOTPLUG_STORAGE_CLASS: "{{ .HOTPLUG_STORAGE_CLASS }}"
           REGISTRY_DOCKER_CFG: "{{ .REGISTRY_DOCKER_CFG }}"
           API_URL: "{{ .API_URL }}"
           SA_TOKEN: "{{ .SA_TOKEN }}"
@@ -124,9 +126,11 @@
           PARENT_KUBECONFIG: "{{ .PARENT_KUBECONFIG_PATH }}"
           REGISTRY_DOCKER_CFG: "{{ .REGISTRY_DOCKER_CFG }}"
           TARGET_STORAGE_CLASS: "{{ .PARENT_STORAGE_CLASS }}"
+          PARENT_STORAGE_CLASS: "{{ .PARENT_STORAGE_CLASS }}"
           ATTACH_DISK_SIZE: "{{ .ATTACH_DISK_SIZE }}"
           EFFECTIVE_DISK_SC: "{{ .EFFECTIVE_DISK_SC }}"
           NAMESPACE: "{{ .RUN_ID }}"
+          NESTED_DIR: "{{ .RUN_DIR }}/nested"
           NESTED_KUBECONFIG: "{{ .NESTED_KUBECONFIG_PATH }}"
           SDS_SC_NAME: "{{ .STORAGE_CLASS }}"
           DATA_DISK_COUNT: "{{ .DATA_DISK_COUNT }}"
@@ -139,6 +143,7 @@
       PROFILE: '{{ .PROFILE | default (env "PROFILE") | default "" }}'
       STORAGE_CLASS: '{{ .STORAGE_CLASS | default (env "STORAGE_CLASS") | default "" }}'
       PARENT_STORAGE_CLASS: '{{ .PARENT_STORAGE_CLASS | default (env "PARENT_STORAGE_CLASS") | default "" }}'
+      HOTPLUG_STORAGE_CLASS: '{{ .HOTPLUG_STORAGE_CLASS | default (env "HOTPLUG_STORAGE_CLASS") | default "" }}'
       REGISTRY_DOCKER_CFG: '{{ .REGISTRY_DOCKER_CFG | default (env "REGISTRY_DOCKER_CFG") | default "" }}'
       API_URL: '{{ .API_URL | default (env "API_URL") | default (env "E2E_K8S_URL") | default "" }}'
       SA_TOKEN: '{{ .SA_TOKEN | default (env "SA_TOKEN") | default (env "E2E_NESTED_SA_SECRET") | default "" }}'
@@ -481,10 +486,7 @@
         set -euo pipefail
         NESTED_DIR="{{ .NESTED_DIR }}"
         NESTED_KUBECONFIG="{{ .NESTED_KUBECONFIG }}"
-        if ! mkdir -p "${NESTED_DIR}"; then
-          echo "[ERR] Failed to create nested directory: ${NESTED_DIR}" >&2
-          exit 1
-        fi
+        mkdir -p "${NESTED_DIR}" "$(dirname "${NESTED_KUBECONFIG}")"
       - chmod +x scripts/build_nested_kubeconfig.sh
       - |
        scripts/build_nested_kubeconfig.sh \
@@ -535,16 +537,21 @@
         echo "[CLEANUP] Prefix='{{ .PREFIX }}'"
         ns_list=$(kubectl get ns -o json | jq -r --arg p '{{ .PREFIX }}' '.items[].metadata.name | select(startswith($p))')
         if [ -z "${ns_list}" ]; then
-          echo "[INFO] No namespaces to delete"; exit 0
+          echo "[INFO] No namespaces to delete"
+        else
+          for ns in $ns_list; do
+            echo "[CLEANUP] Deleting namespace $ns ..."
+            kubectl delete ns "$ns" --wait=false || true
+          done
+          echo "[CLEANUP] Waiting for namespaces to be deleted..."
+          for ns in $ns_list; do
+            kubectl wait --for=delete ns/"$ns" --timeout=600s || echo "[WARN] Namespace $ns was not fully deleted within timeout"
+          done
         fi
-        for ns in $ns_list; do
-          echo "[CLEANUP] Deleting namespace $ns ..."
-          kubectl delete ns "$ns" --wait=false || true
-        done
-        echo "[CLEANUP] Waiting for namespaces to be deleted..."
-        for ns in $ns_list; do
-          kubectl wait --for=delete ns/"$ns" --timeout=600s || echo "[WARN] Namespace $ns was not fully deleted within timeout"
-        done
+        # Cleanup cluster-scoped resources for this run-id (if any)
+        echo "[CLEANUP] Deleting cluster-scoped resources labeled with run-id='{{ .PREFIX }}'"
+        kubectl delete virtualmachineclass -l e2e.deckhouse.io/run-id='{{ .PREFIX }}' --ignore-not-found || true
+        kubectl delete clusterrolebinding -l e2e.deckhouse.io/run-id='{{ .PREFIX }}' --ignore-not-found || true
 
   # ------------------------------------------------------------
   # CI helpers: kubeconfig + registry
@@ -574,6 +581,7 @@
       PARENT_KUBECONFIG: '{{ .PARENT_KUBECONFIG | default (env "KUBECONFIG") }}'
       REGISTRY_DOCKER_CFG: '{{ .REGISTRY_DOCKER_CFG | default (env "REGISTRY_DOCKER_CFG") | default "" }}'
       TARGET_STORAGE_CLASS: "{{ .TARGET_STORAGE_CLASS }}"
+      PARENT_STORAGE_CLASS: "{{ .PARENT_STORAGE_CLASS | default .TARGET_STORAGE_CLASS }}"
       ATTACH_DISK_SIZE: '{{ .ATTACH_DISK_SIZE | default "10Gi" }}'
       EFFECTIVE_DISK_SC: "{{ .EFFECTIVE_DISK_SC }}"
       NAMESPACE: "{{ .NAMESPACE }}"
@@ -620,3 +628,71 @@
           TMP_DIR: "{{ .TMP_DIR }}"
           NESTED_KUBECONFIG: "{{ .NESTED_KUBECONFIG }}"
           SDS_SC_NAME: "{{ .SDS_SC_NAME }}"
+  ci:kubeconfig:ensure:
+    desc: Ensure nested kubeconfig exists at the expected path
+    vars:
+      RUN_ID: '{{ .RUN_ID | default (env "RUN_ID") | default "" }}'
+      WORKSPACE:
+        sh: git rev-parse --show-toplevel 2>/dev/null || pwd
+    cmds:
+      - |
+        set -euo pipefail
+        RUN_ID="{{ .RUN_ID }}"
+        if [ -z "$RUN_ID" ]; then
+          echo "[ERR] RUN_ID must be provided to locate nested kubeconfig" >&2
+          exit 1
+        fi
+        WORKSPACE="${GITHUB_WORKSPACE:-{{ .WORKSPACE }}}"
+        TARGET_PATH="$WORKSPACE/ci/dvp-e2e/tmp/runs/$RUN_ID/nested/kubeconfig"
+        if [ ! -s "$TARGET_PATH" ]; then
+          echo "[ERR] Nested kubeconfig not found at $TARGET_PATH" >&2
+          exit 1
+        fi
+        echo "[INFO] Using nested kubeconfig at $TARGET_PATH"
+        [ -n "${GITHUB_ENV:-}" ] && echo "KUBECONFIG=$TARGET_PATH" >> "$GITHUB_ENV"
+  ci:manual-wait:
+    desc: Pause execution to allow manual SSH inspection of nested cluster
+    vars:
+      RUN_ID: '{{ .RUN_ID | default (env "RUN_ID") | default "" }}'
+      WORKSPACE:
+        sh: git rev-parse --show-toplevel 2>/dev/null || pwd
+      WAIT_SECONDS: '{{ .WAIT_SECONDS | default (env "MANUAL_WAIT_SECONDS") | default "36000" }}'
+    cmds:
+      - |
+        set -euo pipefail
+        RUN_ID="{{ .RUN_ID }}"
+        WAIT="{{ .WAIT_SECONDS }}"
+        if [ -z "$RUN_ID" ]; then
+          echo "[ERR] RUN_ID must be set for ci:manual-wait" >&2
+          exit 1
+        fi
+        if ! [[ "$WAIT" =~ ^[0-9]+$ ]]; then
+          echo "[ERR] WAIT_SECONDS must be numeric (got '$WAIT')" >&2
+          exit 1
+        fi
+        if [ "$WAIT" -le 0 ]; then
+          echo "[INFO] Manual wait skipped (WAIT_SECONDS=$WAIT)"
+          exit 0
+        fi
+        WORKSPACE='{{ .WORKSPACE }}'
+        PARENT_KUBECONFIG="$WORKSPACE/ci/dvp-e2e/tmp/runs/$RUN_ID/parent.kubeconfig"
+        echo "[INFO] Pausing for $WAIT seconds before running tests."
+        echo "[INFO] Use parent kubeconfig for SSH tunneling:"
+        echo "  export KUBECONFIG=$PARENT_KUBECONFIG"
+        echo "  d8 v ssh --namespace $RUN_ID --username ubuntu "
+        echo "[INFO] Press Ctrl+C in the workflow run to cancel wait early."
+        START_TS=$(date +%s)
+        END=$((START_TS + WAIT))
+        LAST_NOTE=$START_TS
+        while true; do
+          NOW=$(date +%s)
+          [ "$NOW" -ge "$END" ] && break
+          REM=$((END - NOW))
+          printf '[INFO] Manual wait: %d seconds remaining...\n' "$REM"
+          if [ $((NOW - LAST_NOTE)) -ge 300 ]; then
+            echo "[INFO] Cluster should be ready for manual SSH troubleshooting."
+            LAST_NOTE=$NOW
+          fi
+          sleep 60 || true
+        done
+        echo "[INFO] Manual wait finished; proceeding to tests."
diff --git a/ci/dvp-e2e/charts/infra/templates/ingress.yaml b/ci/dvp-e2e/charts/infra/templates/ingress.yaml
index b419188353..113142af11 100644
--- a/ci/dvp-e2e/charts/infra/templates/ingress.yaml
+++ b/ci/dvp-e2e/charts/infra/templates/ingress.yaml
@@ -3,6 +3,8 @@ kind: Service
 metadata:
   name: dvp-over-dvp-443
   namespace: {{ .Values.namespace }}
+  labels:
+    e2e.deckhouse.io/run-id: "{{ .Values.namespace }}"
 spec:
   ports:
     - port: 443
@@ -20,6 +22,8 @@ metadata:
   annotations:
     nginx.ingress.kubernetes.io/ssl-passthrough: "true"
     nginx.ingress.kubernetes.io/backend-protocol: "HTTPS"
+  labels:
+    e2e.deckhouse.io/run-id: "{{ .Values.namespace }}"
 spec:
   ingressClassName: nginx
   rules:
diff --git a/ci/dvp-e2e/charts/infra/templates/jump-host/deploy.yaml b/ci/dvp-e2e/charts/infra/templates/jump-host/deploy.yaml
index a6bee4278a..7ff0c3e8d5 100644
--- a/ci/dvp-e2e/charts/infra/templates/jump-host/deploy.yaml
+++ b/ci/dvp-e2e/charts/infra/templates/jump-host/deploy.yaml
@@ -4,6 +4,8 @@ kind: Deployment
 metadata:
   name: jump-host
   namespace: {{ .Values.namespace }}
+  labels:
+    e2e.deckhouse.io/run-id: "{{ .Values.namespace }}"
 spec:
   replicas: 1
   selector:
@@ -13,6 +15,7 @@ spec:
     metadata:
       labels:
         app: jump-host
+        e2e.deckhouse.io/run-id: "{{ .Values.namespace }}"
     spec:
       containers:
         - name: jump-host
diff --git a/ci/dvp-e2e/charts/infra/templates/jump-host/svc.yaml b/ci/dvp-e2e/charts/infra/templates/jump-host/svc.yaml
index e2b809dcab..09adddbc27 100644
--- a/ci/dvp-e2e/charts/infra/templates/jump-host/svc.yaml
+++ b/ci/dvp-e2e/charts/infra/templates/jump-host/svc.yaml
@@ -4,6 +4,8 @@ kind: Service
 metadata:
   name: jump-host
   namespace: {{ .Values.namespace }}
+  labels:
+    e2e.deckhouse.io/run-id: "{{ .Values.namespace }}"
 spec:
   type: NodePort
   selector:
diff --git a/ci/dvp-e2e/charts/infra/templates/ns.yaml b/ci/dvp-e2e/charts/infra/templates/ns.yaml
index 064087cab7..2ddec28c58 100644
--- a/ci/dvp-e2e/charts/infra/templates/ns.yaml
+++ b/ci/dvp-e2e/charts/infra/templates/ns.yaml
@@ -4,3 +4,4 @@ metadata:
   name: {{ .Values.namespace }}
   labels:
     heritage: deckhouse
+    e2e.deckhouse.io/run-id: "{{ .Values.namespace }}"
diff --git a/ci/dvp-e2e/charts/infra/templates/rbac/rbac.yaml b/ci/dvp-e2e/charts/infra/templates/rbac/rbac.yaml
index 1a6a4b9846..6e1c531459 100644
--- a/ci/dvp-e2e/charts/infra/templates/rbac/rbac.yaml
+++ b/ci/dvp-e2e/charts/infra/templates/rbac/rbac.yaml
@@ -3,6 +3,8 @@ kind: ServiceAccount
 metadata:
   name: dkp-sa
   namespace: {{ .Values.namespace }}
+  labels:
+    e2e.deckhouse.io/run-id: "{{ .Values.namespace }}"
 ---
 apiVersion: v1
 kind: Secret
@@ -11,6 +13,8 @@ metadata:
   namespace: {{ .Values.namespace }}
   annotations:
     kubernetes.io/service-account.name: dkp-sa
+  labels:
+    e2e.deckhouse.io/run-id: "{{ .Values.namespace }}"
 type: kubernetes.io/service-account-token
 ---
 apiVersion: rbac.authorization.k8s.io/v1
@@ -18,6 +22,8 @@ kind: RoleBinding
 metadata:
   name: dkp-sa-rb
   namespace: {{ .Values.namespace }}
+  labels:
+    e2e.deckhouse.io/run-id: "{{ .Values.namespace }}"
 subjects:
   - kind: ServiceAccount
     name: dkp-sa
@@ -31,6 +37,8 @@ apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRoleBinding
 metadata:
   name: dkp-sa-cluster-admin-{{ .Values.namespace }}
+  labels:
+    e2e.deckhouse.io/run-id: "{{ .Values.namespace }}"
 subjects:
   - kind: ServiceAccount
     name: dkp-sa
diff --git a/ci/dvp-e2e/charts/infra/templates/vi.yaml b/ci/dvp-e2e/charts/infra/templates/vi.yaml
index 66034a649d..3aa7acec04 100644
--- a/ci/dvp-e2e/charts/infra/templates/vi.yaml
+++ b/ci/dvp-e2e/charts/infra/templates/vi.yaml
@@ -4,6 +4,8 @@ kind: VirtualImage
 metadata:
   name: image
   namespace: {{ .Values.namespace }}
+  labels:
+    e2e.deckhouse.io/run-id: "{{ .Values.namespace }}"
 spec:
   storage: ContainerRegistry
   dataSource:
diff --git a/ci/dvp-e2e/charts/infra/templates/vmc.yaml b/ci/dvp-e2e/charts/infra/templates/vmc.yaml
index 39330ced39..db7d46cb74 100644
--- a/ci/dvp-e2e/charts/infra/templates/vmc.yaml
+++ b/ci/dvp-e2e/charts/infra/templates/vmc.yaml
@@ -2,6 +2,8 @@ apiVersion: virtualization.deckhouse.io/v1alpha2
 kind: VirtualMachineClass
 metadata:
   name: "{{ .Values.namespace }}-cpu"
+  labels:
+    e2e.deckhouse.io/run-id: "{{ .Values.namespace }}"
 spec:
   cpu:
     type: Discovery
diff --git a/ci/dvp-e2e/scripts/attach_worker_disks.sh b/ci/dvp-e2e/scripts/attach_worker_disks.sh
index f6d0b2ca94..e11ab63fe2 100755
--- a/ci/dvp-e2e/scripts/attach_worker_disks.sh
+++ b/ci/dvp-e2e/scripts/attach_worker_disks.sh
@@ -22,7 +22,7 @@ set -euo pipefail
 namespace=""
 storage_class=""
 disk_size="10Gi"
-disk_count="2"
+disk_count="1"
 kubeconfig="${KUBECONFIG:-}"
 
 while getopts ":n:s:z:c:k:" opt; do
@@ -50,6 +50,12 @@ fi
 
 echo "[INFRA] Attaching ${disk_count} storage disks to worker VMs using hotplug in namespace ${namespace}"
 
+# Cleanup stale hp-volume pods (older than 10 minutes) to avoid interference
+echo "[INFRA] Cleaning up stale hp-volume pods (older than 10m) before attachment"
+kubectl -n "${namespace}" get pods --no-headers 2>/dev/null \
+  | awk '$1 ~ /^hp-volume-/ && $3 == "Running" && $5 ~ /[0-9]+m/ { split($5,t,"m"); if (t[1] > 10) print $1 }' \
+  | xargs -r kubectl -n "${namespace}" delete pod --force --grace-period=0 2>/dev/null || true
+
 # Wait for worker VMs
 for i in $(seq 1 50); do
   worker_count=$(kubectl -n "${namespace}" get vm -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' 2>/dev/null | grep -c worker || echo "0")
@@ -90,7 +96,7 @@ for vm in "${workers[@]}"; do
     vd="storage-disk-${disk_num}-$vm"
     echo "[INFRA] Creating VirtualDisk $vd (${disk_size}, sc=${storage_class})"
 
-    cat > "/tmp/vd-$vd.yaml" </dev/null 2>&1 || kubectl -n "${namespace}" apply -f - </dev/null 2>&1 || kubectl -n "${namespace}" apply -f "/tmp/vd-$vd.yaml"
 
     # Wait for VirtualDisk to be Ready
     echo "[INFRA] Waiting for VirtualDisk $vd to be Ready..."
@@ -115,7 +120,7 @@ EOF
      echo "[INFRA] VD $vd phase=$vd_phase; retry $j/50"
       sleep 5
     done
-
+
     if [ "$vd_phase" != "Ready" ]; then
       echo "[ERROR] VirtualDisk $vd not Ready"
       kubectl -n "${namespace}" get vd "$vd" -o yaml || true
@@ -123,34 +128,51 @@ EOF
       exit 1
     fi
 
-    # Wait for PVC
-    pvc_name=""
-    for j in $(seq 1 50); do
-      pvc_name=$(kubectl -n "${namespace}" get vd "$vd" -o jsonpath='{.status.target.persistentVolumeClaimName}' 2>/dev/null || true)
-      [ -n "$pvc_name" ] && break
-      echo "[INFRA] Waiting for PVC name for VD $vd; retry $j/50"
-      sleep 3
-    done
-
-    if [ -n "$pvc_name" ]; then
-      echo "[INFRA] Waiting PVC $pvc_name to reach phase=Bound..."
-      for j in $(seq 1 120); do
-        pvc_phase=$(kubectl -n "${namespace}" get pvc "$pvc_name" -o jsonpath='{.status.phase}' 2>/dev/null || true)
-        if [ "$pvc_phase" = "Bound" ]; then
-          break
-        fi
-        [ $((j % 10)) -eq 0 ] && echo "[INFRA] PVC $pvc_name phase=$pvc_phase; retry $j/120"
-        sleep 2
-      done
-      if [ "$pvc_phase" != "Bound" ]; then
-        echo "[WARN] PVC $pvc_name not Bound after waiting"
+    # Ensure VirtualDisk is not marked in use before attaching
+    in_use="false"
+    for j in $(seq 1 30); do
+      in_use=$(kubectl -n "${namespace}" get vd "$vd" -o json 2>/dev/null | jq -r '.status.inUse // false' || echo "false")
+      if [ "$in_use" = "false" ]; then
+        break
       fi
+      echo "[INFRA] VD $vd inUse=$in_use; retry $j/30"
+      sleep 5
+    done
+
+    if [ "$in_use" != "false" ]; then
+      echo "[ERROR] VirtualDisk $vd remains InUse; aborting attachment"
+      kubectl -n "${namespace}" get vd "$vd" -o yaml || true
+      kubectl -n "${namespace}" get events --sort-by=.lastTimestamp | tail -n 100 || true
+      exit 1
+    fi
+
+    # Skip if VM already reports this disk attached/hotplugged
+    if kubectl -n "${namespace}" get vm "$vm" -o json 2>/dev/null \
+      | jq -e --arg disk "$vd" '
+          ([.status.blockDeviceRefs[]?
+            | select(.name == $disk and .attached == true)
+          ] | length) > 0' >/dev/null; then
+      echo "[INFO] VM $vm already has disk $vd attached; skipping VMBDA creation"
+      continue
+    fi
+
+    # Skip if there is an existing non-failed VMBDA for this disk
+    conflict_vmbda=$(kubectl -n "${namespace}" get vmbda -o json 2>/dev/null \
+      | jq -r --arg name "$vd" '
+          .items[]?
+          | select(.spec.blockDeviceRef.kind == "VirtualDisk"
+              and .spec.blockDeviceRef.name == $name
+              and (.status.phase != "" and .status.phase != "Failed"))
+          | .metadata.name' | head -n 1)
+    if [ -n "${conflict_vmbda:-}" ]; then
+      echo "[WARN] Found existing VMBDA $conflict_vmbda for disk $vd; skipping"
+      continue
     fi
 
     # Create hotplug attachment
     att="att-$vd"
     echo "[INFRA] Creating VirtualMachineBlockDeviceAttachment $att for VM $vm"
-    cat > "/tmp/att-$att.yaml" </dev/null 2>&1 || kubectl -n "${namespace}" apply -f - </dev/null 2>&1 || kubectl -n "${namespace}" apply -f "/tmp/att-$att.yaml"
+
+    # Give controller time to react on creation
+    sleep 60
 
     # Wait for attachment
     echo "[INFRA] Waiting for VMBDA $att to be Attached..."
     att_phase=""
     success_by_vm=0
-    for i in $(seq 1 50); do
+    for i in $(seq 1 100); do
       att_phase=$(kubectl -n "${namespace}" get vmbda "$att" -o jsonpath='{.status.phase}' 2>/dev/null || true)
       if [ "$att_phase" = "Attached" ]; then
         echo "[INFRA] Disk $vd attached to VM $vm"
@@ -187,7 +211,7 @@
         success_by_vm=1
         break
       fi
-      [ $((i % 10)) -eq 0 ] && echo "[INFRA] Disk $vd phase=$att_phase; retry $i/50"
+      [ $((i % 10)) -eq 0 ] && echo "[INFRA] Disk $vd phase=$att_phase; retry $i/100"
       sleep 5
     done
 
@@ -201,6 +225,17 @@ EOF
   done
 
   echo "[INFRA] VM $vm configured with hotplug disks"
+
+  echo "[DEBUG] BlockDeviceRefs for VM $vm"
+  kubectl -n "${namespace}" get vm "$vm" -o json 2>/dev/null | jq '.status.blockDeviceRefs' || true
+  echo "[DEBUG] BlockDevices in cluster (all namespaces)"
+  kubectl get blockdevices.storage.deckhouse.io -A 2>/dev/null || true
+
+  # Throttle between VMs to avoid concurrent hotplug flaps
+  if [ ${#workers[@]} -gt 1 ]; then
+    echo "[INFRA] Waiting 60s before processing next VM..."
+    sleep 60
+  fi
 done
 
 echo "[INFRA] All worker VMs configured with storage disks via hotplug"
diff --git a/ci/dvp-e2e/scripts/configure_sds_storage.sh b/ci/dvp-e2e/scripts/configure_sds_storage.sh
index 8f3b6d6ff6..f1a7e11fb2 100755
--- a/ci/dvp-e2e/scripts/configure_sds_storage.sh
+++ b/ci/dvp-e2e/scripts/configure_sds_storage.sh
@@ -123,21 +123,28 @@ if [ -z "$NODES" ]; then
   NODES=$(kubectl get nodes -o json | jq -r '.items[].metadata.name')
 fi
 
+MATCH_EXPR_TYPE=$(yq eval -n '
+  .key = "status.blockdevice.storage.deckhouse.io/type" |
+  .operator = "In" |
+  .values = ["disk"]
+')
+
+MATCH_EXPR_MODEL=$(yq eval -n '
+  .key = "status.blockdevice.storage.deckhouse.io/model" |
+  .operator = "In" |
+  .values = ["QEMU-HARDDISK"]
+')
+
 for node in $NODES; do
   [ -z "$node" ] && continue
-  MATCH_EXPR=$(yq eval -n '
-    .key = "storage.deckhouse.io/device-path" |
-    .operator = "In" |
-    .values = ["/dev/sdb","/dev/vdb","/dev/xvdb","/dev/sdc","/dev/vdc","/dev/xvdc","/dev/sdd","/dev/vdd","/dev/xvdd"]
-  ')
-  NODE="$node" MATCH_EXPR="$MATCH_EXPR" yq eval -n '
+  NODE="$node" MATCH_EXPR_TYPE="$MATCH_EXPR_TYPE" MATCH_EXPR_MODEL="$MATCH_EXPR_MODEL" yq eval -n '
     .apiVersion = "storage.deckhouse.io/v1alpha1" |
     .kind = "LVMVolumeGroup" |
     .metadata.name = "data-" + env(NODE) |
     .spec.type = "Local" |
     .spec.local.nodeName = env(NODE) |
     .spec.actualVGNameOnTheNode = "data" |
-    .spec.blockDeviceSelector.matchExpressions = [ env(MATCH_EXPR) ]
+    .spec.blockDeviceSelector.matchExpressions = [ env(MATCH_EXPR_TYPE), env(MATCH_EXPR_MODEL) ]
   ' | kubectl apply -f -
 done