137 changes: 137 additions & 0 deletions .github/workflows/test-E2E.yml
@@ -280,6 +280,143 @@ jobs:
# Check events
kubectl get events -n $DB_NS --sort-by='.lastTimestamp'

- name: Verify mount options are set by PV controller
run: |
echo "Verifying PV mount options are set by the PV controller..."

# Get the PVC and PV names from the existing cluster
pvc_name=$(kubectl -n ${{ env.DB_NS }} get pvc -l cnpg.io/cluster=${{ env.DB_NAME }} -o jsonpath='{.items[0].metadata.name}')
pv_name=$(kubectl -n ${{ env.DB_NS }} get pvc $pvc_name -o jsonpath='{.spec.volumeName}')

echo "PVC name: $pvc_name"
echo "PV name: $pv_name"

if [ -z "$pv_name" ]; then
echo "❌ Failed to find PV bound to PVC $pvc_name"
exit 1
fi

# Get mount options from PV
mount_options=$(kubectl get pv $pv_name -o jsonpath='{.spec.mountOptions}')
echo "PV mount options: $mount_options"

# Check for security mount options (nodev, nosuid, noexec)
if echo "$mount_options" | grep -q "nodev" && \
echo "$mount_options" | grep -q "nosuid" && \
echo "$mount_options" | grep -q "noexec"; then
echo "✓ PV mount options (nodev, nosuid, noexec) are set correctly"
else
echo "❌ PV mount options are missing. Expected nodev, nosuid, noexec"
exit 1
fi

- name: Test PV reclaim policy default and explicit Delete
shell: bash
run: |
echo "Testing PV reclaim policy - default (Retain) and explicit Delete..."

# Test 1: Verify default policy is Retain on the existing cluster
echo "=== Test 1: Verify default PV reclaim policy is Retain ==="

# Get the PVC and PV names from the existing cluster
pvc_name=$(kubectl -n ${{ env.DB_NS }} get pvc -l cnpg.io/cluster=${{ env.DB_NAME }} -o jsonpath='{.items[0].metadata.name}')
pv_name=$(kubectl -n ${{ env.DB_NS }} get pvc $pvc_name -o jsonpath='{.spec.volumeName}')

echo "PVC name: $pvc_name"
echo "PV name: $pv_name"

if [ -z "$pv_name" ]; then
echo "❌ Failed to find PV bound to PVC $pvc_name"
exit 1
fi

# Verify default PV reclaim policy is Retain
current_policy=$(kubectl get pv $pv_name -o jsonpath='{.spec.persistentVolumeReclaimPolicy}')
echo "Current PV reclaim policy: $current_policy"

if [ "$current_policy" != "Retain" ]; then
echo "❌ Expected default PV reclaim policy to be 'Retain', but got '$current_policy'"
exit 1
fi
echo "✓ Default PV reclaim policy is correctly set to Retain"

# Test 2: Change policy to Delete and verify PV is deleted with cluster
echo ""
echo "=== Test 2: Change policy to Delete and verify PV cleanup ==="

# Patch the existing DocumentDB to set persistentVolumeReclaimPolicy to Delete
echo "Patching DocumentDB to set persistentVolumeReclaimPolicy to Delete..."
kubectl -n ${{ env.DB_NS }} patch documentdb ${{ env.DB_NAME }} --type=merge \
-p '{"spec":{"resource":{"storage":{"persistentVolumeReclaimPolicy":"Delete"}}}}'

# Wait for PV controller to update the PV reclaim policy
echo "Waiting for PV reclaim policy to be updated to Delete..."
MAX_RETRIES=30
SLEEP_INTERVAL=5
ITER=0
while [ $ITER -lt $MAX_RETRIES ]; do
new_policy=$(kubectl get pv $pv_name -o jsonpath='{.spec.persistentVolumeReclaimPolicy}')
if [ "$new_policy" == "Delete" ]; then
echo "✓ PV reclaim policy updated to Delete"
break
else
echo "PV reclaim policy is still '$new_policy'. Waiting..."
sleep $SLEEP_INTERVAL
fi
((++ITER))
done

if [ "$new_policy" != "Delete" ]; then
echo "❌ PV reclaim policy was not updated to Delete within expected time"
exit 1
fi

# Delete the DocumentDB cluster
echo "Deleting DocumentDB cluster to test PV cleanup with Delete policy..."
kubectl -n ${{ env.DB_NS }} delete documentdb ${{ env.DB_NAME }} --wait=false

# Wait for DocumentDB to be deleted
echo "Waiting for DocumentDB to be deleted..."
MAX_RETRIES=30
SLEEP_INTERVAL=10
ITER=0
while [ $ITER -lt $MAX_RETRIES ]; do
db_exists=$(kubectl -n ${{ env.DB_NS }} get documentdb ${{ env.DB_NAME }} --ignore-not-found)
if [ -z "$db_exists" ]; then
echo "✓ DocumentDB deleted successfully."
break
else
echo "DocumentDB still exists. Waiting..."
sleep $SLEEP_INTERVAL
fi
((++ITER))
done

# Verify no PVsRetained warning event was emitted (since policy is Delete)
events=$(kubectl -n ${{ env.DB_NS }} get events --field-selector reason=PVsRetained,involvedObject.name=${{ env.DB_NAME }} --ignore-not-found -o jsonpath='{.items}')
if [ -z "$events" ] || [ "$events" == "[]" ]; then
echo "✓ No PVsRetained warning event emitted (expected for Delete policy)"
else
echo "⚠️ Unexpected PVsRetained event found for Delete policy cluster"
fi

# Wait a bit for PV to be deleted (the storage class handles actual deletion)
echo "Waiting for PV to be deleted..."
sleep 30

# Verify PV was deleted (because reclaim policy is Delete)
pv_exists=$(kubectl get pv $pv_name --ignore-not-found 2>/dev/null)
if [ -z "$pv_exists" ]; then
echo "✓ PV $pv_name was deleted as expected (Delete policy)"
else
pv_status=$(kubectl get pv $pv_name -o jsonpath='{.status.phase}')
echo "⚠️ PV $pv_name still exists with status: $pv_status"
echo "Note: PV deletion depends on the storage provisioner. The reclaim policy was correctly set to Delete."
Comment on lines +403 to +414
Copilot AI Feb 2, 2026

The test treats PV persistence after deletion as a warning rather than a failure, which could mask real issues with the Delete policy implementation. Consider polling with a timeout and failing the test if the PV still exists after a reasonable wait period (e.g., 60 seconds).

Suggested change
- # Wait a bit for PV to be deleted (the storage class handles actual deletion)
- echo "Waiting for PV to be deleted..."
- sleep 30
- # Verify PV was deleted (because reclaim policy is Delete)
- pv_exists=$(kubectl get pv $pv_name --ignore-not-found 2>/dev/null)
- if [ -z "$pv_exists" ]; then
- echo "✓ PV $pv_name was deleted as expected (Delete policy)"
- else
- pv_status=$(kubectl get pv $pv_name -o jsonpath='{.status.phase}')
- echo "⚠️ PV $pv_name still exists with status: $pv_status"
- echo "Note: PV deletion depends on the storage provisioner. The reclaim policy was correctly set to Delete."
+ # Wait (with timeout) for PV to be deleted (the storage class handles actual deletion)
+ echo "Waiting for PV $pv_name to be deleted (up to 60 seconds)..."
+ MAX_PV_WAIT_SECONDS=60
+ PV_POLL_INTERVAL=5
+ ELAPSED=0
+ while [ $ELAPSED -lt $MAX_PV_WAIT_SECONDS ]; do
+ pv_exists=$(kubectl get pv $pv_name --ignore-not-found 2>/dev/null)
+ if [ -z "$pv_exists" ]; then
+ echo "✓ PV $pv_name was deleted as expected (Delete policy)"
+ break
+ fi
+ echo "PV $pv_name still exists. Waiting..."
+ sleep $PV_POLL_INTERVAL
+ ELAPSED=$((ELAPSED + PV_POLL_INTERVAL))
+ done
+ # After waiting, fail the test if the PV still exists
+ pv_exists=$(kubectl get pv $pv_name --ignore-not-found 2>/dev/null)
+ if [ -n "$pv_exists" ]; then
+ pv_status=$(kubectl get pv $pv_name -o jsonpath='{.status.phase}')
+ echo "❌ PV $pv_name still exists after ${MAX_PV_WAIT_SECONDS}s with status: $pv_status"
+ echo "PV should be deleted when reclaim policy is Delete. Investigate storage provisioner or policy configuration."
+ exit 1

fi

echo ""
echo "✓ PV reclaim policy test completed successfully"

- name: Collect comprehensive logs on failure
if: failure()
uses: ./.github/actions/collect-logs
204 changes: 203 additions & 1 deletion .github/workflows/test-backup-and-restore.yml
@@ -354,4 +354,206 @@ jobs:
((++ITER))
done
echo "❌ Expired backup was not cleaned up within expected time."
exit 1
exit 1

- name: Test PV retention after DocumentDB deletion
id: test-pv-retention
shell: bash
run: |
echo "Testing PV retention after DocumentDB deletion..."

# Get the PVC name and PV name before deleting the DocumentDB
# PVCs are created by CNPG and labeled with cnpg.io/cluster
pvc_name=$(kubectl -n ${{ env.DB_NS }} get pvc -l cnpg.io/cluster=${{ env.DB_RESTORE_NAME }} -o jsonpath='{.items[0].metadata.name}')
echo "PVC name: $pvc_name"

if [ -z "$pvc_name" ]; then
echo "❌ Failed to find PVC for cluster ${{ env.DB_RESTORE_NAME }}"
exit 1
fi

# Get the PV name bound to this PVC
pv_name=$(kubectl -n ${{ env.DB_NS }} get pvc $pvc_name -o jsonpath='{.spec.volumeName}')
echo "PV name: $pv_name"

if [ -z "$pv_name" ]; then
echo "❌ Failed to find PV bound to PVC $pvc_name"
exit 1
fi

# Check current PV reclaim policy - should be Retain by default
current_policy=$(kubectl get pv $pv_name -o jsonpath='{.spec.persistentVolumeReclaimPolicy}')
echo "Current PV reclaim policy: $current_policy"

if [ "$current_policy" != "Retain" ]; then
echo "❌ Expected PV reclaim policy to be 'Retain' (default), but got '$current_policy'"
exit 1
fi
echo "✓ PV reclaim policy is correctly set to Retain (default)"

# Delete the restored DocumentDB cluster
kubectl -n ${{ env.DB_NS }} delete documentdb ${{ env.DB_RESTORE_NAME }} --wait=false

# Wait for DocumentDB to be deleted
echo "Waiting for DocumentDB to be deleted..."
MAX_RETRIES=30
SLEEP_INTERVAL=10
ITER=0
while [ $ITER -lt $MAX_RETRIES ]; do
db_exists=$(kubectl -n ${{ env.DB_NS }} get documentdb ${{ env.DB_RESTORE_NAME }} --ignore-not-found)
if [ -z "$db_exists" ]; then
echo "✓ DocumentDB deleted successfully."
break
else
echo "DocumentDB still exists. Waiting..."
sleep $SLEEP_INTERVAL
fi
((++ITER))
done

# Verify PV still exists (because reclaim policy is Retain)
pv_exists=$(kubectl get pv $pv_name --ignore-not-found)
if [ -n "$pv_exists" ]; then
echo "✓ PV $pv_name retained after DocumentDB deletion"
else
echo "❌ PV $pv_name was deleted unexpectedly"
exit 1
fi

# Store PV name for later steps using GitHub Actions output (more robust than temp files)
echo "pv_name=$pv_name" >> $GITHUB_OUTPUT
Comment on lines +423 to +424
Copilot AI Feb 2, 2026

Good use of GitHub Actions outputs instead of temporary files. However, consider adding error handling to verify that pv_name is not empty before writing to GITHUB_OUTPUT to prevent silent failures in subsequent steps.

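A minimal sketch of the guard this comment suggests, reusing the step's existing pv_name variable (illustrative only, not part of the diff):

# Hypothetical guard (not in the PR): fail fast if pv_name is empty
# instead of writing a blank value that later steps would silently consume.
if [ -z "$pv_name" ]; then
  echo "❌ pv_name is empty; cannot record it for later steps"
  exit 1
fi
echo "pv_name=$pv_name" >> "$GITHUB_OUTPUT"

With such a check in place, the later "Restore DocumentDB from retained PV" step could rely on steps.test-pv-retention.outputs.pv_name always being non-empty.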

- name: Restore DocumentDB from retained PV
shell: bash
run: |
pv_name="${{ steps.test-pv-retention.outputs.pv_name }}"
echo "Restoring DocumentDB from retained PV: $pv_name"

# Check the PV status - it should be in "Released" state after PVC deletion
pv_status=$(kubectl get pv $pv_name -o jsonpath='{.status.phase}')
echo "PV status: $pv_status"

# Clear the claimRef from the PV so a new PVC can bind to it
# When a PV is in "Released" state, it still has a claimRef to the old deleted PVC
echo "Clearing claimRef from PV $pv_name to allow new PVC binding..."
kubectl patch pv $pv_name --type=json -p='[{"op": "remove", "path": "/spec/claimRef"}]'

# Verify PV is now Available
pv_status=$(kubectl get pv $pv_name -o jsonpath='{.status.phase}')
echo "PV status after clearing claimRef: $pv_status"

# Create a new PVC that binds to the retained PV
new_pvc_name="recovered-pvc-from-pv"
echo "Creating new PVC $new_pvc_name to bind to retained PV $pv_name"

# Get the storage capacity from the PV
pv_capacity=$(kubectl get pv $pv_name -o jsonpath='{.spec.capacity.storage}')
echo "PV capacity: $pv_capacity"

cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: $new_pvc_name
namespace: ${{ env.DB_NS }}
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: $pv_capacity
storageClassName: csi-hostpath-sc
volumeName: $pv_name
EOF

# Wait for PVC to be bound
echo "Waiting for PVC to be bound to PV..."
MAX_RETRIES=30
SLEEP_INTERVAL=5
ITER=0
while [ $ITER -lt $MAX_RETRIES ]; do
pvc_status=$(kubectl -n ${{ env.DB_NS }} get pvc $new_pvc_name -o jsonpath='{.status.phase}')
if [ "$pvc_status" == "Bound" ]; then
echo "✓ PVC $new_pvc_name is now bound to PV $pv_name"
break
else
echo "PVC status: $pvc_status. Waiting..."
sleep $SLEEP_INTERVAL
fi
((++ITER))
done

if [ "$pvc_status" != "Bound" ]; then
echo "❌ PVC failed to bind to PV within expected time"
exit 1
fi

# Create DocumentDB resource with PVC recovery
echo "Creating DocumentDB with PVC recovery from $new_pvc_name"
cat <<EOF | kubectl apply -f -
apiVersion: documentdb.io/preview
kind: DocumentDB
metadata:
name: ${{ env.DB_RESTORE_NAME }}-from-pvc
namespace: ${{ env.DB_NS }}
spec:
nodeCount: ${{ matrix.node_count }}
instancesPerNode: ${{ matrix.instances_per_node }}
documentDBImage: ghcr.io/microsoft/documentdb/documentdb-local:16
gatewayImage: ghcr.io/microsoft/documentdb/documentdb-local:16
resource:
storage:
pvcSize: 5Gi
storageClass: csi-hostpath-sc
exposeViaService:
serviceType: ClusterIP
bootstrap:
recovery:
pvc:
name: $new_pvc_name
EOF

- name: Setup port forwarding for PVC restored cluster
uses: ./.github/actions/setup-port-forwarding
with:
namespace: ${{ env.DB_NS }}
cluster-name: ${{ env.DB_RESTORE_NAME }}-from-pvc
port: ${{ env.DB_PORT }}
architecture: ${{ matrix.architecture }}
test-type: 'comprehensive'

- name: Validate data exists after PVC restoration
run: |
echo "Validating data exists after PVC restoration..."

# Validate that the restored cluster has the expected data
count=$(mongosh 127.0.0.1:$DB_PORT --quiet --eval "db.testCollection.countDocuments({})" -u $DB_USERNAME -p $DB_PASSWORD --authenticationMechanism SCRAM-SHA-256 --tls --tlsAllowInvalidCertificates)
if [ "$count" -eq 100 ]; then
echo "✓ Data validation completed successfully after PVC restoration on ${{ matrix.architecture }}"
else
echo "❌ Data validation failed after PVC restoration on ${{ matrix.architecture }}. Count: $count"
exit 1
fi

- name: Cleanup PVC restored cluster port forwarding
if: always()
run: |
# Stop port-forward if it exists
if [ -f /tmp/pf_pid ]; then
PF_PID=$(cat /tmp/pf_pid)
kill $PF_PID 2>/dev/null || true
rm -f /tmp/pf_pid
fi

# Clean up output log
rm -f /tmp/pf_output.log

- name: Collect logs on failure
if: failure()
uses: ./.github/actions/collect-logs
with:
architecture: ${{ matrix.architecture }}
operator-namespace: ${{ env.OPERATOR_NS }}
db-namespace: ${{ env.DB_NS }}
db-cluster-name: ${{ env.DB_NAME }}
cert-manager-namespace: ${{ env.CERT_MANAGER_NS }}