137 changes: 137 additions & 0 deletions .github/workflows/test-E2E.yml
@@ -280,6 +280,143 @@ jobs:
# Check events
kubectl get events -n $DB_NS --sort-by='.lastTimestamp'

- name: Verify mount options are set by PV controller
run: |
echo "Verifying PV mount options are set by the PV controller..."

# Get the PVC and PV names from the existing cluster
pvc_name=$(kubectl -n ${{ env.DB_NS }} get pvc -l cnpg.io/cluster=${{ env.DB_NAME }} -o jsonpath='{.items[0].metadata.name}')
pv_name=$(kubectl -n ${{ env.DB_NS }} get pvc $pvc_name -o jsonpath='{.spec.volumeName}')

echo "PVC name: $pvc_name"
echo "PV name: $pv_name"

if [ -z "$pv_name" ]; then
echo "❌ Failed to find PV bound to PVC $pvc_name"
exit 1
fi

# Get mount options from PV
mount_options=$(kubectl get pv $pv_name -o jsonpath='{.spec.mountOptions}')
echo "PV mount options: $mount_options"

# Check for security mount options (nodev, nosuid, noexec)
if echo "$mount_options" | grep -q "nodev" && \
echo "$mount_options" | grep -q "nosuid" && \
echo "$mount_options" | grep -q "noexec"; then
echo "✓ PV mount options (nodev, nosuid, noexec) are set correctly"
else
echo "❌ PV mount options are missing. Expected nodev, nosuid, noexec"
exit 1
fi

- name: Test PV reclaim policy default and explicit Delete
shell: bash
run: |
echo "Testing PV reclaim policy - default (Retain) and explicit Delete..."

# Test 1: Verify default policy is Retain on the existing cluster
echo "=== Test 1: Verify default PV reclaim policy is Retain ==="

# Get the PVC and PV names from the existing cluster
pvc_name=$(kubectl -n ${{ env.DB_NS }} get pvc -l cnpg.io/cluster=${{ env.DB_NAME }} -o jsonpath='{.items[0].metadata.name}')
pv_name=$(kubectl -n ${{ env.DB_NS }} get pvc $pvc_name -o jsonpath='{.spec.volumeName}')

echo "PVC name: $pvc_name"
echo "PV name: $pv_name"

if [ -z "$pv_name" ]; then
echo "❌ Failed to find PV bound to PVC $pvc_name"
exit 1
fi

# Verify default PV reclaim policy is Retain
current_policy=$(kubectl get pv $pv_name -o jsonpath='{.spec.persistentVolumeReclaimPolicy}')
echo "Current PV reclaim policy: $current_policy"

if [ "$current_policy" != "Retain" ]; then
echo "❌ Expected default PV reclaim policy to be 'Retain', but got '$current_policy'"
exit 1
fi
echo "✓ Default PV reclaim policy is correctly set to Retain"

# Test 2: Change policy to Delete and verify PV is deleted with cluster
echo ""
echo "=== Test 2: Change policy to Delete and verify PV cleanup ==="

# Patch the existing DocumentDB to set persistentVolumeReclaimPolicy to Delete
echo "Patching DocumentDB to set persistentVolumeReclaimPolicy to Delete..."
kubectl -n ${{ env.DB_NS }} patch documentdb ${{ env.DB_NAME }} --type=merge \
-p '{"spec":{"resource":{"storage":{"persistentVolumeReclaimPolicy":"Delete"}}}}'

# Wait for PV controller to update the PV reclaim policy
echo "Waiting for PV reclaim policy to be updated to Delete..."
MAX_RETRIES=30
SLEEP_INTERVAL=5
ITER=0
while [ $ITER -lt $MAX_RETRIES ]; do
new_policy=$(kubectl get pv $pv_name -o jsonpath='{.spec.persistentVolumeReclaimPolicy}')
if [ "$new_policy" == "Delete" ]; then
echo "✓ PV reclaim policy updated to Delete"
break
else
echo "PV reclaim policy is still '$new_policy'. Waiting..."
sleep $SLEEP_INTERVAL
fi
((++ITER))
done

if [ "$new_policy" != "Delete" ]; then
echo "❌ PV reclaim policy was not updated to Delete within expected time"
exit 1
fi

# Delete the DocumentDB cluster
echo "Deleting DocumentDB cluster to test PV cleanup with Delete policy..."
kubectl -n ${{ env.DB_NS }} delete documentdb ${{ env.DB_NAME }} --wait=false

# Wait for DocumentDB to be deleted
echo "Waiting for DocumentDB to be deleted..."
MAX_RETRIES=30
SLEEP_INTERVAL=10
ITER=0
while [ $ITER -lt $MAX_RETRIES ]; do
db_exists=$(kubectl -n ${{ env.DB_NS }} get documentdb ${{ env.DB_NAME }} --ignore-not-found)
if [ -z "$db_exists" ]; then
echo "✓ DocumentDB deleted successfully."
break
else
echo "DocumentDB still exists. Waiting..."
sleep $SLEEP_INTERVAL
fi
((++ITER))
done

# Verify no PVsRetained warning event was emitted (since policy is Delete)
events=$(kubectl -n ${{ env.DB_NS }} get events --field-selector reason=PVsRetained,involvedObject.name=${{ env.DB_NAME }} --ignore-not-found -o jsonpath='{.items}')
if [ -z "$events" ] || [ "$events" == "[]" ]; then
echo "✓ No PVsRetained warning event emitted (expected for Delete policy)"
else
echo "⚠️ Unexpected PVsRetained event found for Delete policy cluster"
fi

# Wait a bit for PV to be deleted (the storage class handles actual deletion)
echo "Waiting for PV to be deleted..."
sleep 30

# Verify PV was deleted (because reclaim policy is Delete)
pv_exists=$(kubectl get pv $pv_name --ignore-not-found 2>/dev/null)
if [ -z "$pv_exists" ]; then
echo "✓ PV $pv_name was deleted as expected (Delete policy)"
else
pv_status=$(kubectl get pv $pv_name -o jsonpath='{.status.phase}')
echo "⚠️ PV $pv_name still exists with status: $pv_status"
echo "Note: PV deletion depends on the storage provisioner. The reclaim policy was correctly set to Delete."
Comment on lines +403 to +414
Copilot AI Feb 2, 2026

The test treats PV persistence after deletion as a warning rather than a failure, which could mask real issues with the Delete policy implementation. Consider polling with a timeout and failing the test if the PV still exists after a reasonable wait period (e.g., 60 seconds).

Suggested change
- # Wait a bit for PV to be deleted (the storage class handles actual deletion)
- echo "Waiting for PV to be deleted..."
- sleep 30
- # Verify PV was deleted (because reclaim policy is Delete)
- pv_exists=$(kubectl get pv $pv_name --ignore-not-found 2>/dev/null)
- if [ -z "$pv_exists" ]; then
- echo "✓ PV $pv_name was deleted as expected (Delete policy)"
- else
- pv_status=$(kubectl get pv $pv_name -o jsonpath='{.status.phase}')
- echo "⚠️ PV $pv_name still exists with status: $pv_status"
- echo "Note: PV deletion depends on the storage provisioner. The reclaim policy was correctly set to Delete."
+ # Wait (with timeout) for PV to be deleted (the storage class handles actual deletion)
+ echo "Waiting for PV $pv_name to be deleted (up to 60 seconds)..."
+ MAX_PV_WAIT_SECONDS=60
+ PV_POLL_INTERVAL=5
+ ELAPSED=0
+ while [ $ELAPSED -lt $MAX_PV_WAIT_SECONDS ]; do
+ pv_exists=$(kubectl get pv $pv_name --ignore-not-found 2>/dev/null)
+ if [ -z "$pv_exists" ]; then
+ echo "✓ PV $pv_name was deleted as expected (Delete policy)"
+ break
+ fi
+ echo "PV $pv_name still exists. Waiting..."
+ sleep $PV_POLL_INTERVAL
+ ELAPSED=$((ELAPSED + PV_POLL_INTERVAL))
+ done
+ # After waiting, fail the test if the PV still exists
+ pv_exists=$(kubectl get pv $pv_name --ignore-not-found 2>/dev/null)
+ if [ -n "$pv_exists" ]; then
+ pv_status=$(kubectl get pv $pv_name -o jsonpath='{.status.phase}')
+ echo "❌ PV $pv_name still exists after ${MAX_PV_WAIT_SECONDS}s with status: $pv_status"
+ echo "PV should be deleted when reclaim policy is Delete. Investigate storage provisioner or policy configuration."
+ exit 1

fi

echo ""
echo "✓ PV reclaim policy test completed successfully"

- name: Collect comprehensive logs on failure
if: failure()
uses: ./.github/actions/collect-logs
204 changes: 203 additions & 1 deletion .github/workflows/test-backup-and-restore.yml
@@ -354,4 +354,206 @@ jobs:
((++ITER))
done
echo "❌ Expired backup was not cleaned up within expected time."
exit 1
exit 1

- name: Test PV retention after DocumentDB deletion
id: test-pv-retention
shell: bash
run: |
echo "Testing PV retention after DocumentDB deletion..."

# Get the PVC name and PV name before deleting the DocumentDB
# PVCs are created by CNPG and labeled with cnpg.io/cluster
pvc_name=$(kubectl -n ${{ env.DB_NS }} get pvc -l cnpg.io/cluster=${{ env.DB_RESTORE_NAME }} -o jsonpath='{.items[0].metadata.name}')
echo "PVC name: $pvc_name"

if [ -z "$pvc_name" ]; then
echo "❌ Failed to find PVC for cluster ${{ env.DB_RESTORE_NAME }}"
exit 1
fi

# Get the PV name bound to this PVC
pv_name=$(kubectl -n ${{ env.DB_NS }} get pvc $pvc_name -o jsonpath='{.spec.volumeName}')
echo "PV name: $pv_name"

if [ -z "$pv_name" ]; then
echo "❌ Failed to find PV bound to PVC $pvc_name"
exit 1
fi

# Check current PV reclaim policy - should be Retain by default
current_policy=$(kubectl get pv $pv_name -o jsonpath='{.spec.persistentVolumeReclaimPolicy}')
echo "Current PV reclaim policy: $current_policy"

if [ "$current_policy" != "Retain" ]; then
echo "❌ Expected PV reclaim policy to be 'Retain' (default), but got '$current_policy'"
exit 1
fi
echo "✓ PV reclaim policy is correctly set to Retain (default)"

# Delete the restored DocumentDB cluster
kubectl -n ${{ env.DB_NS }} delete documentdb ${{ env.DB_RESTORE_NAME }} --wait=false

# Wait for DocumentDB to be deleted
echo "Waiting for DocumentDB to be deleted..."
MAX_RETRIES=30
SLEEP_INTERVAL=10
ITER=0
while [ $ITER -lt $MAX_RETRIES ]; do
db_exists=$(kubectl -n ${{ env.DB_NS }} get documentdb ${{ env.DB_RESTORE_NAME }} --ignore-not-found)
if [ -z "$db_exists" ]; then
echo "✓ DocumentDB deleted successfully."
break
else
echo "DocumentDB still exists. Waiting..."
sleep $SLEEP_INTERVAL
fi
((++ITER))
done

# Verify PV still exists (because reclaim policy is Retain)
pv_exists=$(kubectl get pv $pv_name --ignore-not-found)
if [ -n "$pv_exists" ]; then
echo "✓ PV $pv_name retained after DocumentDB deletion"
else
echo "❌ PV $pv_name was deleted unexpectedly"
exit 1
fi

# Store PV name for later steps using GitHub Actions output (more robust than temp files)
echo "pv_name=$pv_name" >> $GITHUB_OUTPUT
Comment on lines +423 to +424
Copilot AI Feb 2, 2026

Good use of GitHub Actions outputs instead of temporary files. However, consider adding error handling to verify that pv_name is not empty before writing to GITHUB_OUTPUT to prevent silent failures in subsequent steps.

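A minimal sketch of the guard this comment suggests, reusing the step's existing pv_name variable (illustrative only, not part of the diff):

# Hypothetical guard (not in the PR): fail fast if pv_name is empty
# instead of writing a blank value that later steps would silently consume.
if [ -z "$pv_name" ]; then
  echo "❌ pv_name is empty; cannot record it for later steps"
  exit 1
fi
echo "pv_name=$pv_name" >> "$GITHUB_OUTPUT"

With such a check in place, the later "Restore DocumentDB from retained PV" step could rely on steps.test-pv-retention.outputs.pv_name always being non-empty.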

- name: Restore DocumentDB from retained PV
shell: bash
run: |
pv_name="${{ steps.test-pv-retention.outputs.pv_name }}"
echo "Restoring DocumentDB from retained PV: $pv_name"

# Check the PV status - it should be in "Released" state after PVC deletion
pv_status=$(kubectl get pv $pv_name -o jsonpath='{.status.phase}')
echo "PV status: $pv_status"

# Clear the claimRef from the PV so a new PVC can bind to it
# When a PV is in "Released" state, it still has a claimRef to the old deleted PVC
echo "Clearing claimRef from PV $pv_name to allow new PVC binding..."
kubectl patch pv $pv_name --type=json -p='[{"op": "remove", "path": "/spec/claimRef"}]'

# Verify PV is now Available
pv_status=$(kubectl get pv $pv_name -o jsonpath='{.status.phase}')
echo "PV status after clearing claimRef: $pv_status"

# Create a new PVC that binds to the retained PV
new_pvc_name="recovered-pvc-from-pv"
echo "Creating new PVC $new_pvc_name to bind to retained PV $pv_name"

# Get the storage capacity from the PV
pv_capacity=$(kubectl get pv $pv_name -o jsonpath='{.spec.capacity.storage}')
echo "PV capacity: $pv_capacity"

cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: $new_pvc_name
namespace: ${{ env.DB_NS }}
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: $pv_capacity
storageClassName: csi-hostpath-sc
volumeName: $pv_name
EOF

# Wait for PVC to be bound
echo "Waiting for PVC to be bound to PV..."
MAX_RETRIES=30
SLEEP_INTERVAL=5
ITER=0
while [ $ITER -lt $MAX_RETRIES ]; do
pvc_status=$(kubectl -n ${{ env.DB_NS }} get pvc $new_pvc_name -o jsonpath='{.status.phase}')
if [ "$pvc_status" == "Bound" ]; then
echo "✓ PVC $new_pvc_name is now bound to PV $pv_name"
break
else
echo "PVC status: $pvc_status. Waiting..."
sleep $SLEEP_INTERVAL
fi
((++ITER))
done

if [ "$pvc_status" != "Bound" ]; then
echo "❌ PVC failed to bind to PV within expected time"
exit 1
fi

# Create DocumentDB resource with PVC recovery
echo "Creating DocumentDB with PVC recovery from $new_pvc_name"
cat <<EOF | kubectl apply -f -
apiVersion: documentdb.io/preview
kind: DocumentDB
metadata:
name: ${{ env.DB_RESTORE_NAME }}-from-pvc
namespace: ${{ env.DB_NS }}
spec:
nodeCount: ${{ matrix.node_count }}
instancesPerNode: ${{ matrix.instances_per_node }}
documentDBImage: ghcr.io/microsoft/documentdb/documentdb-local:16
gatewayImage: ghcr.io/microsoft/documentdb/documentdb-local:16
resource:
storage:
pvcSize: 5Gi
storageClass: csi-hostpath-sc
exposeViaService:
serviceType: ClusterIP
bootstrap:
recovery:
pvc:
name: $new_pvc_name
EOF

- name: Setup port forwarding for PVC restored cluster
uses: ./.github/actions/setup-port-forwarding
with:
namespace: ${{ env.DB_NS }}
cluster-name: ${{ env.DB_RESTORE_NAME }}-from-pvc
port: ${{ env.DB_PORT }}
architecture: ${{ matrix.architecture }}
test-type: 'comprehensive'

- name: Validate data exists after PVC restoration
run: |
echo "Validating data exists after PVC restoration..."

# Validate that the restored cluster has the expected data
count=$(mongosh 127.0.0.1:$DB_PORT --quiet --eval "db.testCollection.countDocuments({})" -u $DB_USERNAME -p $DB_PASSWORD --authenticationMechanism SCRAM-SHA-256 --tls --tlsAllowInvalidCertificates)
if [ "$count" -eq 100 ]; then
echo "✓ Data validation completed successfully after PVC restoration on ${{ matrix.architecture }}"
else
echo "❌ Data validation failed after PVC restoration on ${{ matrix.architecture }}. Count: $count"
exit 1
fi

- name: Cleanup PVC restored cluster port forwarding
if: always()
run: |
# Stop port-forward if it exists
if [ -f /tmp/pf_pid ]; then
PF_PID=$(cat /tmp/pf_pid)
kill $PF_PID 2>/dev/null || true
rm -f /tmp/pf_pid
fi

# Clean up output log
rm -f /tmp/pf_output.log

- name: Collect logs on failure
if: failure()
uses: ./.github/actions/collect-logs
with:
architecture: ${{ matrix.architecture }}
operator-namespace: ${{ env.OPERATOR_NS }}
db-namespace: ${{ env.DB_NS }}
db-cluster-name: ${{ env.DB_NAME }}
cert-manager-namespace: ${{ env.CERT_MANAGER_NS }}