Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -653,6 +653,13 @@ tests:
test:
- chain: openshift-e2e-test-qe
workflow: cucushift-installer-rehearse-aws-ipi-edge-zone-cco-manual-security-token-service
- as: aws-ipi-zone-consistency-f14
  cron: 30 10 3,17 * *
  steps:
    cluster_profile: aws-qe
    env:
      BASE_DOMAIN: qe.devcluster.openshift.com
    workflow: cucushift-installer-rehearse-aws-cases-zone-consistency
- as: aws-ipi-localzone-sts-fips-mini-perm-f28-destructive
cron: 10 19 27 * *
steps:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23154,6 +23154,88 @@ periodics:
- name: result-aggregator
secret:
secretName: result-aggregator
- agent: kubernetes
  cluster: build09
  cron: 30 10 3,17 * *
  decorate: true
  decoration_config:
    skip_cloning: true
  extra_refs:
  - base_ref: release-4.22
    org: openshift
    repo: openshift-tests-private
  labels:
    ci-operator.openshift.io/cloud: aws
    ci-operator.openshift.io/cloud-cluster-profile: aws-qe
    ci-operator.openshift.io/variant: amd64-nightly
    ci.openshift.io/generator: prowgen
    job-release: "4.22"
    pj-rehearse.openshift.io/can-be-rehearsed: "true"
  name: periodic-ci-openshift-openshift-tests-private-release-4.22-amd64-nightly-aws-ipi-zone-consistency-f14
  spec:
    containers:
    - args:
      - --gcs-upload-secret=/secrets/gcs/service-account.json
      - --image-import-pull-secret=/etc/pull-secret/.dockerconfigjson
      - --lease-server-credentials-file=/etc/boskos/credentials
      - --oauth-token-path=/usr/local/github-credentials/oauth
      - --report-credentials-file=/etc/report/credentials
      - --secret-dir=/secrets/ci-pull-credentials
      - --target=aws-ipi-zone-consistency-f14
      - --variant=amd64-nightly
      command:
      - ci-operator
      image: quay-proxy.ci.openshift.org/openshift/ci:ci_ci-operator_latest
      imagePullPolicy: Always
      name: ""
      resources:
        requests:
          cpu: 10m
      volumeMounts:
      - mountPath: /etc/boskos
        name: boskos
        readOnly: true
      - mountPath: /secrets/ci-pull-credentials
        name: ci-pull-credentials
        readOnly: true
      - mountPath: /secrets/gcs
        name: gcs-credentials
        readOnly: true
      - mountPath: /usr/local/github-credentials
        name: github-credentials-openshift-ci-robot-private-git-cloner
        readOnly: true
      - mountPath: /secrets/manifest-tool
        name: manifest-tool-local-pusher
        readOnly: true
      - mountPath: /etc/pull-secret
        name: pull-secret
        readOnly: true
      - mountPath: /etc/report
        name: result-aggregator
        readOnly: true
    serviceAccountName: ci-operator
    volumes:
    - name: boskos
      secret:
        items:
        - key: credentials
          path: credentials
        secretName: boskos-credentials
    - name: ci-pull-credentials
      secret:
        secretName: ci-pull-credentials
    - name: github-credentials-openshift-ci-robot-private-git-cloner
      secret:
        secretName: github-credentials-openshift-ci-robot-private-git-cloner
    - name: manifest-tool-local-pusher
      secret:
        secretName: manifest-tool-local-pusher
    - name: pull-secret
      secret:
        secretName: registry-pull-credentials
    - name: result-aggregator
      secret:
        secretName: result-aggregator
- agent: kubernetes
cluster: build09
cron: 26 15 5,12,19,26 * *
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
approvers:
- jianlinliu
- gpei
- yunjiang29
- liweinan
reviewers:
- jianlinliu
- gpei
- yunjiang29
- liweinan
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
#!/bin/bash

# OCPBUGS-69923 - Verify control plane machine zone allocation consistency in manifests
# Run 10 iterations to verify CAPI and MAPI zone allocation is deterministic
#
# IMPORTANT: Must compare CORRECT files:
# - CAPI zones: cluster-api/machines/10_inframachine_*-master-*.yaml (from subnet filter)
# - MAPI zones: openshift/99_openshift-machine-api_master-control-plane-machine-set.yaml (failureDomains)
#
# NOTE: openshift/99_openshift-cluster-api_master-machines-*.yaml is MAPI (despite the name)!

# Strict mode: stop on any failing command, on a failure anywhere in a
# pipeline, and on expansion of an unset variable.
set -o errexit -o pipefail -o nounset

# Point the AWS SDK at the credentials file shipped in the cluster profile.
AWS_SHARED_CREDENTIALS_FILE="${CLUSTER_PROFILE_DIR}/.awscred"
export AWS_SHARED_CREDENTIALS_FILE

# The leased resource name is used as the AWS region; the cluster name is
# built from the namespace and the unique hash.
REGION="${LEASED_RESOURCE}"
CLUSTER_NAME="${NAMESPACE}-${UNIQUE_HASH}"

# Both values are embedded verbatim into every generated install-config.yaml.
SSH_PUB_KEY=$(cat "${CLUSTER_PROFILE_DIR}/ssh-publickey")
PULL_SECRET=$(cat "${CLUSTER_PROFILE_DIR}/pull-secret")

# Scratch directory; wiped and recreated at the start of every iteration.
WORK_DIR="/tmp/test-zone-consistency"

# Record which installer build produced the manifests.
printf '%s\n' "openshift-install version:"
openshift-install version
printf '\n'

# Number of iterations whose CAPI and MAPI zones disagreed.
TOTAL_FAILURES=0

for iteration in $(seq 1 10); do
echo "=========================================="
echo "Iteration $iteration/10"
echo "=========================================="

# Clean up everything from previous iteration
# (every iteration starts from an empty WORK_DIR, so manifests never carry over)
rm -rf "${WORK_DIR}"
mkdir -p "${WORK_DIR}"

# Create install-config.yaml (without specifying zones - triggers the bug path)
# The heredoc delimiter is unquoted, so ${BASE_DOMAIN}, ${CLUSTER_NAME}, ${REGION},
# ${PULL_SECRET} and ${SSH_PUB_KEY} are expanded by the shell before the file is written.
# Only the region is set under platform.aws; neither machine pool lists zones.
cat > "${WORK_DIR}/install-config.yaml" << EOF
apiVersion: v1
baseDomain: ${BASE_DOMAIN}
metadata:
name: ${CLUSTER_NAME}
controlPlane:
architecture: amd64
hyperthreading: Enabled
name: master
replicas: 3
compute:
- architecture: amd64
hyperthreading: Enabled
name: worker
replicas: 3
platform:
aws:
region: ${REGION}
pullSecret: >
${PULL_SECRET}
sshKey: |
${SSH_PUB_KEY}
EOF

# Backup install-config.yaml before it gets consumed by create manifests
# (the backup is what later gets copied into the iteration's artifact directory)
cp "${WORK_DIR}/install-config.yaml" "${WORK_DIR}/install-config.yaml.backup"

# Generate manifests (this consumes install-config.yaml)
# Under errexit, a failure here aborts the whole script.
openshift-install create manifests --dir "${WORK_DIR}"

# Extract CAPI zones from cluster-api/machines/10_inframachine_*-master-*.yaml
# These are the REAL CAPI AWSMachine objects (not the misleadingly named
# openshift/99_openshift-cluster-api_* files).
#
# Results:
#   capi_zones - space-separated zones in sorted manifest-file order, or the
#                placeholder "<none>" when nothing could be extracted
#   capi_count - number of zones actually extracted (0 on extraction failure)
capi_zones=""
# Read the file list line by line rather than word-splitting $(find ...), so
# a path containing whitespace or glob characters is passed through intact.
while IFS= read -r file; do
  # Zone is the trailing part of the subnet filter name
  # (e.g., "cluster-subnet-private-us-east-1a" -> "us-east-1a").
  subnet_name=$(yq-go r "$file" 'spec.subnet.filters[0].values[0]' 2>/dev/null || echo "")
  # The pattern only matches "<region><letter>" at the end of the name, so an
  # empty or "null" yq result simply yields no zone.
  zone=$(echo "$subnet_name" | grep -oE "${REGION}[a-z]$" || echo "")
  if [ -n "$zone" ]; then
    capi_zones="${capi_zones} ${zone}"
  else
    echo " WARNING: no zone found in ${file} (subnet filter value: '${subnet_name}')"
  fi
done < <(find "$WORK_DIR"/cluster-api/machines -name "10_inframachine_*-master-*.yaml" -type f 2>/dev/null | sort)
capi_zones=$(echo "$capi_zones" | xargs)
capi_count=$(echo "$capi_zones" | wc -w | xargs)

if [ "$capi_count" -eq 0 ]; then
  echo " ERROR: no CAPI control plane zones could be extracted from ${WORK_DIR}/cluster-api/machines"
  # The MAPI list is capped at capi_count, so with nothing extracted both
  # lists would be empty and the string comparison of the two lists would
  # report PASS for "" = "". A placeholder makes that comparison fail instead
  # of passing vacuously. capi_count stays 0 on purpose.
  capi_zones="<none>"
fi

# Collect MAPI zones from the ControlPlaneMachineSet failureDomains list.
# File: openshift/99_openshift-machine-api_master-control-plane-machine-set.yaml
# At most capi_count entries are read, so both lists cover the same positions.
cpms_file="$WORK_DIR/openshift/99_openshift-machine-api_master-control-plane-machine-set.yaml"
mapi_zones=""
mapi_count=0
if [ ! -f "$cpms_file" ]; then
  echo " ERROR: ControlPlaneMachineSet file not found!"
else
  for ((idx = 0; idx < capi_count; idx++)); do
    zone=$(yq-go r "$cpms_file" "spec.template.machines_v1beta1_machine_openshift_io.failureDomains.aws[$idx].placement.availabilityZone" 2>/dev/null || echo "")
    # An empty or "null" result means the list has no entry at this index.
    case "$zone" in
      "" | null) break ;;
    esac
    mapi_zones+=" ${zone}"
    mapi_count=$((idx + 1))
  done
fi
# xargs collapses the leading separator and any repeated whitespace.
mapi_zones=$(echo "$mapi_zones" | xargs)

# Persist this iteration's manifests under ARTIFACT_DIR before the verdict is
# known, so they are available whether the comparison passes or fails.
iteration_artifact_dir="${ARTIFACT_DIR}/iteration-${iteration}"
mkdir -p "${iteration_artifact_dir}"

echo " Saving manifests to ${iteration_artifact_dir}..."

# CAPI machine manifests (the whole directory, if it was generated)
if [[ -d "${WORK_DIR}/cluster-api/machines" ]]; then
  cp -r "${WORK_DIR}/cluster-api/machines" "${iteration_artifact_dir}/capi-machines"
fi

# ControlPlaneMachineSet manifest (if it was generated)
if [[ -f "${cpms_file}" ]]; then
  cp "${cpms_file}" "${iteration_artifact_dir}/control-plane-machine-set.yaml"
fi

# install-config.yaml is restored from the backup copy, because the original
# was consumed by "create manifests"
if [[ -f "${WORK_DIR}/install-config.yaml.backup" ]]; then
  cp "${WORK_DIR}/install-config.yaml.backup" "${iteration_artifact_dir}/install-config.yaml"
fi

# Show both zone lists side by side ahead of the comparison
echo " CAPI zones (from cluster-api/machines/10_inframachine_*): $capi_zones"
echo " MAPI zones (from ControlPlaneMachineSet failureDomains): $mapi_zones"

if [ "$capi_zones" = "$mapi_zones" ]; then
echo " PASS"
else
echo " FAIL: zones mismatch - CAPI and MAPI have different zone assignments"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did you hit the issue described in the bug using this script?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we print error details? E.g., the zone in MAPI is us-east-1a, but in CAPI it's us-east-1b. We could also consider saving the relevant manifests in the ARTIFACT dir; it would be helpful for debugging.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay let me improve this part.

Copy link
Contributor Author

@liweinan liweinan Jan 29, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did you hit the issue described in the bug using this script?

This commit can be used to reproduce the problem:

I have a similar local script that can see the failure output: https://gist.github.com/liweinan/9d65abf9759370f141d4eec93fa7de17


# Report every position at which the two zone lists disagree
echo " ERROR DETAILS:"
IFS=' ' read -r -a capi_array <<< "$capi_zones"
IFS=' ' read -r -a mapi_array <<< "$mapi_zones"
capi_len=${#capi_array[@]}
mapi_len=${#mapi_array[@]}

# Walk the CAPI positions; a position missing on the MAPI side reads as empty
for ((i = 0; i < capi_len; i++)); do
  capi_zone="${capi_array[i]:-}"
  mapi_zone="${mapi_array[i]:-}"
  if [ "$capi_zone" = "$mapi_zone" ]; then
    continue
  fi
  echo " Position $((i+1)): CAPI has '$capi_zone' but MAPI has '$mapi_zone'"
done

# Lists of different length get an additional summary line
if [ "$capi_len" -ne "$mapi_len" ]; then
  echo " Zone count mismatch: CAPI has ${capi_len} zones, MAPI has ${mapi_len} zones"
fi

TOTAL_FAILURES=$((TOTAL_FAILURES + 1))
fi

# Delete all generated files for next iteration
rm -rf "${WORK_DIR}"
done

# Summary banner: one blank line, then the fixed header lines.
printf '%s\n' \
  "" \
  "==========================================" \
  "Final Result: 10 iterations completed" \
  "All manifests saved to ${ARTIFACT_DIR}/ for verification"
if ((TOTAL_FAILURES == 0)); then
  echo "PASS: All iterations have consistent zone allocation between CAPI and MAPI"
else
  echo "FAIL: $TOTAL_FAILURES iterations had zone mismatches (OCPBUGS-69923)"
fi
printf '%s\n' "=========================================="

# The exit status is the number of failed iterations (0 when all passed).
exit "$TOTAL_FAILURES"
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{
"path": "cucushift/installer/rehearse/aws/cases/zone-consistency/cucushift-installer-rehearse-aws-cases-zone-consistency-ref.yaml",
"owners": {
"approvers": [
"jianlinliu",
"gpei",
"yunjiang29",
"liweinan"
],
"reviewers": [
"jianlinliu",
"gpei",
"yunjiang29",
"liweinan"
]
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
ref:
  as: cucushift-installer-rehearse-aws-cases-zone-consistency
  from: upi-installer
  grace_period: 10m
  commands: cucushift-installer-rehearse-aws-cases-zone-consistency-commands.sh
  resources:
    requests:
      cpu: 10m
      memory: 100Mi
  env:
  # Every step parameter carries its own description so it is visible to
  # anyone overriding it from a job configuration.
  - name: BASE_DOMAIN
    default: "qe.devcluster.openshift.com"
    documentation: |-
      Base domain written to the baseDomain field of the install-config.yaml
      that the step generates for each iteration.
  documentation: >-
    Verify control plane machine zone allocation consistency in manifests (OCPBUGS-69923).
    This step generates manifests using openshift-install and verifies that CAPI and MAPI
    manifests have consistent zone allocation for control plane machines.
    This is a static validation test that does not require actual cluster installation.
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{
"path": "cucushift/installer/rehearse/aws/cases/zone-consistency/cucushift-installer-rehearse-aws-cases-zone-consistency-workflow.yaml",
"owners": {
"approvers": [
"jianlinliu",
"gpei",
"yunjiang29",
"liweinan"
],
"reviewers": [
"jianlinliu",
"gpei",
"yunjiang29",
"liweinan"
]
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
workflow:
  as: cucushift-installer-rehearse-aws-cases-zone-consistency
  steps:
    # Only a pre phase is declared: the validation chain, then the
    # reportportal marker step.
    pre:
    - chain: cucushift-installer-rehearse-aws-cases-zone-consistency-provision
    - ref: cucushift-installer-reportportal-marker
  documentation: |-
    This workflow runs static validation tests for OCPBUGS-69923: Control plane machine
    zone allocation consistency. It verifies that CAPI and MAPI manifests have consistent
    zone allocation for control plane machines. No cluster provisioning or deprovisioning
    is needed as these are configuration validation tests only.
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
approvers:
- jianlinliu
- gpei
- yunjiang29
- liweinan
reviewers:
- jianlinliu
- gpei
- yunjiang29
- liweinan
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{
"path": "cucushift/installer/rehearse/aws/cases/zone-consistency/provision/cucushift-installer-rehearse-aws-cases-zone-consistency-provision-chain.yaml",
"owners": {
"approvers": [
"jianlinliu",
"gpei",
"yunjiang29",
"liweinan"
],
"reviewers": [
"jianlinliu",
"gpei",
"yunjiang29",
"liweinan"
]
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
chain:
  as: cucushift-installer-rehearse-aws-cases-zone-consistency-provision
  steps:
  # Single step: the manifest zone-consistency check.
  - ref: cucushift-installer-rehearse-aws-cases-zone-consistency
  documentation: |-
    Run static validation tests for OCPBUGS-69923: Control plane machine zone allocation
    consistency. This generates manifests using openshift-install and verifies that CAPI
    and MAPI manifests have consistent zone allocation. No cluster provisioning or
    deprovisioning is needed as these are configuration validation tests only.