From 1c31cb88eb448aa1beed1588d77debf618c9acbe Mon Sep 17 00:00:00 2001 From: Gerald Barker Date: Sat, 31 Jan 2026 22:29:44 +0000 Subject: [PATCH 1/2] Refactor and tidy the e2e tests --- api/v1/nodedrain_types.go | 12 +- api/v1/zz_generated.deepcopy.go | 9 +- .../crd/bases/k8s.gezb.co.uk_nodedrains.yaml | 24 +- internal/controller/nodedrain_controller.go | 139 ++-- .../controller/nodedrain_controller_test.go | 45 +- internal/events/recorder.go | 1 - magefile.go | 2 +- test/e2e/e2e_suite_test.go | 14 +- test/e2e/e2e_test.go | 604 ++++++++++-------- test/utils/utils.go | 192 ++++++ 10 files changed, 680 insertions(+), 362 deletions(-) diff --git a/api/v1/nodedrain_types.go b/api/v1/nodedrain_types.go index 8c454d1..b0c13cd 100644 --- a/api/v1/nodedrain_types.go +++ b/api/v1/nodedrain_types.go @@ -44,6 +44,7 @@ type NodeDrainSpec struct { // NodeDrainStatus defines the observed state of NodeDrain. type NodeDrainStatus struct { // Phase represents the progress of this nodeDrain + // +operator-sdk:csv:customresourcedefinitions:type=status Phase NodeDrainPhase `json:"phase"` // The last time the status has been updated LastUpdate metav1.Time `json:"lastUpdate,omitempty"` @@ -51,13 +52,14 @@ type NodeDrainStatus struct { LastError string `json:"lastError,omitempty"` // PodsToBeEvicted is the list of pods for the controller needs to evict PodsToBeEvicted []NamespaceAndName `json:"podsToBeEvicted,omitempty"` - // PendingPods is a list of pending pods for eviction - PendingPods []string `json:"pendingPods,omitempty"` + // PendingEvictionPods is a list of pods still to be evicted + PendingEvictionPods []string `json:"PendingEvictionPods,omitempty"` + // PodsToRestart is the list of pods that we are waiting to restart + PodsToRestart []NamespaceAndName `json:"PodsToRestart,omitempty"` // TotalPods is the total number of all pods on the node from the start - // +operator-sdk:csv:customresourcedefinitions:type=status TotalPods int `json:"totalPods,omitempty"` // EvictionPods is 
the total number of pods up for eviction from the start - EvictionPodCount int `json:"evictionPods,omitempty"` + EvictionPodCount int `json:"evictionPods"` // Percentage completion of draining the node DrainProgress int `json:"drainProgress,omitempty"` // PodsBlockingDrain is a list of pods that are blocking the draining of this node @@ -67,7 +69,7 @@ type NodeDrainStatus struct { // +kubebuilder:object:root=true // +kubebuilder:subresource:status // +kubebuilder:printcolumn:name="Phase",type="string",JSONPath=".status.phase",description="Phase of the NodeDrain" -// +kubebuilder:printcolumn:name="Pods BlockingDrain",type="string",JSONPath=".status.podsblockingdrain",description="Pods that are blocking drain" +// +kubebuilder:printcolumn:name="Pods BlockingDrain",type="string",JSONPath=".status.podsBlockingDrain",description="Pods that are blocking drain" // NodeDrain is the Schema for the nodedrains API. type NodeDrain struct { diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go index 7e80794..f39ffc3 100644 --- a/api/v1/zz_generated.deepcopy.go +++ b/api/v1/zz_generated.deepcopy.go @@ -195,11 +195,16 @@ func (in *NodeDrainStatus) DeepCopyInto(out *NodeDrainStatus) { *out = make([]NamespaceAndName, len(*in)) copy(*out, *in) } - if in.PendingPods != nil { - in, out := &in.PendingPods, &out.PendingPods + if in.PendingEvictionPods != nil { + in, out := &in.PendingEvictionPods, &out.PendingEvictionPods *out = make([]string, len(*in)) copy(*out, *in) } + if in.PodsToRestart != nil { + in, out := &in.PodsToRestart, &out.PodsToRestart + *out = make([]NamespaceAndName, len(*in)) + copy(*out, *in) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeDrainStatus. 
diff --git a/config/crd/bases/k8s.gezb.co.uk_nodedrains.yaml b/config/crd/bases/k8s.gezb.co.uk_nodedrains.yaml index 02fef2c..3d84ee9 100644 --- a/config/crd/bases/k8s.gezb.co.uk_nodedrains.yaml +++ b/config/crd/bases/k8s.gezb.co.uk_nodedrains.yaml @@ -20,7 +20,7 @@ spec: name: Phase type: string - description: Pods that are blocking drain - jsonPath: .status.podsblockingdrain + jsonPath: .status.podsBlockingDrain name: Pods BlockingDrain type: string name: v1 @@ -73,6 +73,22 @@ spec: status: description: NodeDrainStatus defines the observed state of NodeDrain. properties: + PendingEvictionPods: + description: PendingEvictionPods is a list of pods still to be evicted + items: + type: string + type: array + PodsToRestart: + description: PodsToRestart is the list of pods that we are waiting + to restart + items: + properties: + name: + type: string + namespace: + type: string + type: object + type: array drainProgress: description: Percentage completion of draining the node type: integer @@ -88,11 +104,6 @@ spec: description: The last time the status has been updated format: date-time type: string - pendingPods: - description: PendingPods is a list of pending pods for eviction - items: - type: string - type: array phase: description: Phase represents the progress of this nodeDrain type: string @@ -116,6 +127,7 @@ spec: from the start type: integer required: + - evictionPods - phase type: object type: object diff --git a/internal/controller/nodedrain_controller.go b/internal/controller/nodedrain_controller.go index 686247c..7c781dc 100644 --- a/internal/controller/nodedrain_controller.go +++ b/internal/controller/nodedrain_controller.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "regexp" + "slices" "strings" "time" @@ -130,7 +131,8 @@ func (r *NodeDrainReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( needsRequeue, drainError, err = r.reconcileDraining(drainer, nodeDrain) case gezbcoukalphav1.NodeDrainPhaseWaitForPodsToRestart: needUpdate = true - 
needsRequeue = r.reconcileWaitForPodsToRestart(ctx, nodeDrain) + needsRequeue = true + err = r.reconcileWaitForPodsToRestart(ctx, drainer, nodeDrain) } @@ -159,7 +161,7 @@ func (r *NodeDrainReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( } } if needsRequeue { - return ctrl.Result{Requeue: true}, nil + return ctrl.Result{Requeue: true, RequeueAfter: 5 * time.Second}, nil } return ctrl.Result{}, nil } @@ -220,7 +222,7 @@ func (r *NodeDrainReconciler) reconcilePending(ctx context.Context, drainer *dra nodeDrain.Status.TotalPods = len(podlist.Items) err = r.updatePendingPodCount(drainer, nodeDrain) - nodeDrain.Status.EvictionPodCount = len(nodeDrain.Status.PendingPods) + nodeDrain.Status.EvictionPodCount = len(nodeDrain.Status.PendingEvictionPods) if err != nil { return false, err } @@ -232,9 +234,20 @@ func (r *NodeDrainReconciler) reconcilePending(ctx context.Context, drainer *dra // reconcileCordoned Checks that no pods exist that block draining this node & Checks all nodes for this k8s version&role are cordened, // once both are true update status to Draining func (r *NodeDrainReconciler) reconcileCordoned(ctx context.Context, drainer *drain.Helper, nodeDrain *gezbcoukalphav1.NodeDrain) (bool, bool, error) { + + allNodesCordoned, err := r.areAllNodesCordoned(ctx, nodeDrain) + if err != nil { + return true, false, err + } + if !allNodesCordoned { + setStatus(r.Recorder, nodeDrain, gezbcoukalphav1.NodeDrainPhaseOtherNodesNotCordoned) + // requeue this request so we check when the blocking pod(s) have been removed + return true, true, nil + } + // update status with pods to be deleted // ignore updated here we need to save the CR anyway - err := r.updatePendingPodCount(drainer, nodeDrain) + err = r.updatePendingPodCount(drainer, nodeDrain) if err != nil { return false, true, err } @@ -243,73 +256,70 @@ func (r *NodeDrainReconciler) reconcileCordoned(ctx context.Context, drainer *dr if err != nil { return false, false, err } - 
nodeDrain.Status.PodsBlockingDrain = strings.Join(podsBlocking, ",") + podsBlockingString := strings.Join(podsBlocking, ",") + if podsBlockingString != nodeDrain.Status.PodsBlockingDrain { + r.logger.Info(fmt.Sprintf("Pods blocking drain for node %s are [%s]", nodeDrain.Spec.NodeName, podsBlockingString)) + nodeDrain.Status.PodsBlockingDrain = podsBlockingString + } if len(podsBlocking) > 0 { setStatus(r.Recorder, nodeDrain, gezbcoukalphav1.NodeDrainPhasePodsBlocking) // requeue this request so we check when the blocking pod(s) have been removed return true, true, nil } - allNodesCordoned, err := r.areAllNodesCordoned(ctx, nodeDrain) - if err != nil { - return true, false, err - } - if !allNodesCordoned { - setStatus(r.Recorder, nodeDrain, gezbcoukalphav1.NodeDrainPhaseOtherNodesNotCordoned) - // requeue this request so we check when the blocking pod(s) have been removed - return true, true, nil - } setStatus(r.Recorder, nodeDrain, gezbcoukalphav1.NodeDrainPhaseDraining) - return true, false, nil + return true, true, nil } // reconcileDraining Drains the given node func (r *NodeDrainReconciler) reconcileDraining(drainer *drain.Helper, nodeDrain *gezbcoukalphav1.NodeDrain) (bool, bool, error) { - if nodeDrain.Status.Phase == gezbcoukalphav1.NodeDrainPhaseDraining { - - pendingList, errlist := drainer.GetPodsForDeletion(nodeDrain.Spec.NodeName) - if errlist != nil { - return false, false, fmt.Errorf("failed to get pods for eviction while initializing status: %v", errlist) - } - if pendingList != nil { - nodeDrain.Status.PodsToBeEvicted = GetNameSpaceAndName(pendingList.Pods()) - } - nodeName := nodeDrain.Spec.NodeName - r.logger.Info(fmt.Sprintf("Evict all Pods from Node %s", nodeName), "nodeName", nodeName) - if err := drain.RunNodeDrain(drainer, nodeName); err != nil { - r.logger.Info("Not all pods evicted", "nodeName", nodeName, "error", err) - setStatus(r.Recorder, nodeDrain, gezbcoukalphav1.NodeDrainPhaseFailed) - // ignore updated here we need to save the CR 
anyway - err = r.updatePendingPodCount(drainer, nodeDrain) - if err != nil { - return false, true, err - } - return false, true, nil - } - err := r.updatePendingPodCount(drainer, nodeDrain) + pendingList, errlist := drainer.GetPodsForDeletion(nodeDrain.Spec.NodeName) + if errlist != nil { + return false, false, fmt.Errorf("failed to get pods for eviction while initializing status: %v", errlist) + } + if pendingList != nil { + nodeDrain.Status.PodsToBeEvicted = GetNameSpaceAndName(pendingList.Pods()) + } + if nodeDrain.Spec.WaitForPodsToRestart { + nodeDrain.Status.PodsToRestart = GetNameSpaceAndName(pendingList.Pods()) + r.logger.Info(fmt.Sprintf("Pods to wait to restart on node %s - init to %v", nodeDrain.Spec.NodeName, nodeDrain.Status.PodsToRestart)) + } + nodeName := nodeDrain.Spec.NodeName + r.logger.Info(fmt.Sprintf("Evict all Pods from Node %s", nodeName), "nodeName", nodeName) + if err := drain.RunNodeDrain(drainer, nodeName); err != nil { + r.logger.Info("Not all pods evicted", "nodeName", nodeName, "error", err) + setStatus(r.Recorder, nodeDrain, gezbcoukalphav1.NodeDrainPhaseFailed) + // ignore updated here we need to save the CR anyway + err = r.updatePendingPodCount(drainer, nodeDrain) if err != nil { - return false, false, err + return false, true, err } - nodeDrain.Status.DrainProgress = 100 - + return false, true, nil } + if nodeDrain.Spec.WaitForPodsToRestart { - message := fmt.Sprintf("Waiting for %d pod(s) to restart", len(nodeDrain.Status.PodsToBeEvicted)) + message := fmt.Sprintf("Waiting for [%v] pod(s) to restart", nodeDrain.Status.PodsToRestart) publishEvent(r.Recorder, nodeDrain, corev1.EventTypeNormal, events.EventReasonWaitingForPodsToRestart, message) setStatus(r.Recorder, nodeDrain, gezbcoukalphav1.NodeDrainPhaseWaitForPodsToRestart) // requeue to wait for the pods to be running return true, true, nil } else { + err := r.updatePendingPodCount(drainer, nodeDrain) + if err != nil { + return false, false, err + } + 
nodeDrain.Status.DrainProgress = 100 setStatus(r.Recorder, nodeDrain, gezbcoukalphav1.NodeDrainPhaseCompleted) return false, false, nil } } // reconcileWaitForPodsToRestart wiats for all evicted pods to be running again -func (r *NodeDrainReconciler) reconcileWaitForPodsToRestart(ctx context.Context, nodeDrain *gezbcoukalphav1.NodeDrain) bool { - allPodsRunning := false +func (r *NodeDrainReconciler) reconcileWaitForPodsToRestart(ctx context.Context, drainer *drain.Helper, nodeDrain *gezbcoukalphav1.NodeDrain) error { + r.logger.Info(fmt.Sprintf("ReconcileWaitForRestarts() for node %s - init to %v", nodeDrain.Spec.NodeName, nodeDrain.Status.PodsToRestart)) - for _, nameAndNamespace := range nodeDrain.Status.PodsToBeEvicted { + runningPods := []string{} + for _, nameAndNamespace := range nodeDrain.Status.PodsToRestart { pod := corev1.Pod{} err := r.Get(ctx, client.ObjectKey{Namespace: nameAndNamespace.Namespace, Name: nameAndNamespace.Name}, &pod) if err != nil { @@ -318,21 +328,32 @@ func (r *NodeDrainReconciler) reconcileWaitForPodsToRestart(ctx context.Context, } else { // check if the pod is running if !r.PodStatusChecker.CheckStatus(&pod) { - r.logger.Info("Pod not running yet", "Pod.Namespace", nameAndNamespace.Namespace, "Pod.Name", nameAndNamespace.Name) - allPodsRunning = false + r.logger.Info("Pod(s) not running yet", "Pod.Namespace", nameAndNamespace.Namespace, "pod.Name", nameAndNamespace.Name) } else { - r.logger.Info("Pod running", "Pod.Namespace", nameAndNamespace.Namespace, "Pod.Name", nameAndNamespace.Name) - allPodsRunning = true + r.logger.Info("Pod(s) running", "Pod.Namespace", nameAndNamespace.Namespace, "pod.Name", nameAndNamespace.Name) + runningPods = append(runningPods, nameAndNamespace.Name) } } } - if allPodsRunning { + for _, podName := range runningPods { + nodeDrain.Status.PodsToRestart = slices.DeleteFunc(nodeDrain.Status.PodsToRestart, func(cmp gezbcoukalphav1.NamespaceAndName) bool { + return cmp.Name == podName + }) + } + 
r.logger.Info(fmt.Sprintf("%s Pods are now down to %v", nodeDrain.Spec.NodeName, nodeDrain.Status.PodsToRestart)) + + if len(nodeDrain.Status.PodsToRestart) == 0 { r.logger.Info("All pods running, updating status to complete") + err := r.updatePendingPodCount(drainer, nodeDrain) + if err != nil { + return err + } + nodeDrain.Status.DrainProgress = 100 setStatus(r.Recorder, nodeDrain, gezbcoukalphav1.NodeDrainPhaseCompleted) - return false + return nil } - return true + return nil } func (r *NodeDrainReconciler) getBlockingPods(ctx context.Context, nodeDrain *gezbcoukalphav1.NodeDrain) ([]string, error) { @@ -345,12 +366,8 @@ func (r *NodeDrainReconciler) getBlockingPods(ctx context.Context, nodeDrain *ge } for _, drainCheck := range drainCheckList.Items { pattern := regexp.MustCompile(drainCheck.Spec.PodRegex) - for _, podName := range nodeDrain.Status.PendingPods { + for _, podName := range nodeDrain.Status.PendingEvictionPods { if pattern.MatchString(podName) { - blockingMessage := fmt.Sprintf("Node %s is blocked from draining by pod: %s", nodeDrain.Spec.NodeName, podName) - r.logger.Info(blockingMessage) - publishEvent(r.Recorder, nodeDrain, corev1.EventTypeWarning, events.EventReasonDrainBlockedByPods, blockingMessage) - r.logger.Info(fmt.Sprintf("Node %s is blocked from draining by pod: %s", nodeDrain.Spec.NodeName, podName)) podsBlocking = append(podsBlocking, podName) } } @@ -398,12 +415,12 @@ func (r *NodeDrainReconciler) updatePendingPodCount(drainer *drain.Helper, nodeD return fmt.Errorf("failed to get pods for eviction while initializing status: %v", errlist) } if pendingList != nil { - nodeDrain.Status.PendingPods = GetPodNameList(pendingList.Pods()) + nodeDrain.Status.PendingEvictionPods = GetPodNameList(pendingList.Pods()) } else { - nodeDrain.Status.PendingPods = []string{} + nodeDrain.Status.PendingEvictionPods = []string{} } if nodeDrain.Status.EvictionPodCount > 0 { - nodeDrain.Status.DrainProgress = (nodeDrain.Status.EvictionPodCount - 
len(nodeDrain.Status.PendingPods)) * 100 / nodeDrain.Status.EvictionPodCount + nodeDrain.Status.DrainProgress = (nodeDrain.Status.EvictionPodCount - len(nodeDrain.Status.PendingEvictionPods)) * 100 / nodeDrain.Status.EvictionPodCount } return nil } @@ -415,9 +432,9 @@ func (r *NodeDrainReconciler) onReconcileErrorWithRequeue(ctx context.Context, d if nodeDrain.Spec.NodeName != "" { pendingList, _ := drainer.GetPodsForDeletion(nodeDrain.Spec.NodeName) if pendingList != nil { - nodeDrain.Status.PendingPods = GetPodNameList(pendingList.Pods()) + nodeDrain.Status.PendingEvictionPods = GetPodNameList(pendingList.Pods()) if nodeDrain.Status.EvictionPodCount != 0 { - nodeDrain.Status.DrainProgress = (nodeDrain.Status.EvictionPodCount - len(nodeDrain.Status.PendingPods)) * 100 / nodeDrain.Status.EvictionPodCount + nodeDrain.Status.DrainProgress = (nodeDrain.Status.EvictionPodCount - len(nodeDrain.Status.PendingEvictionPods)) * 100 / nodeDrain.Status.EvictionPodCount } } } diff --git a/internal/controller/nodedrain_controller_test.go b/internal/controller/nodedrain_controller_test.go index 8e4fa01..7eb39d6 100644 --- a/internal/controller/nodedrain_controller_test.go +++ b/internal/controller/nodedrain_controller_test.go @@ -63,7 +63,8 @@ var _ = Describe("Node Drain", func() { Expect(nodeDrain.Status.LastError).To(Equal("")) Expect(nodeDrain.Status.TotalPods).To(Equal(0)) - Expect(nodeDrain.Status.PendingPods).To(BeEmpty()) + Expect(nodeDrain.Status.PendingEvictionPods).To(BeEmpty()) + Expect(nodeDrain.Status.PodsToRestart).To(BeEmpty()) Expect(nodeDrain.Status.EvictionPodCount).To(Equal(0)) Expect(nodeDrain.Status.DrainProgress).To(Equal(100)) Expect(nodeDrain.Status.PodsBlockingDrain).To(Equal("")) @@ -93,7 +94,8 @@ var _ = Describe("Node Drain", func() { verifyStatusEvent(gezbcoukalphav1.NodeDrainPhaseCompleted) Expect(nodeDrain.Status.LastError).To(Equal("")) - Expect(nodeDrain.Status.PendingPods).To(BeEmpty()) + 
Expect(nodeDrain.Status.PendingEvictionPods).To(BeEmpty()) + Expect(nodeDrain.Status.PodsToRestart).To(BeEmpty()) Expect(nodeDrain.Status.TotalPods).To(Equal(0)) Expect(nodeDrain.Status.EvictionPodCount).To(Equal(0)) Expect(nodeDrain.Status.DrainProgress).To(Equal(100)) @@ -136,7 +138,8 @@ var _ = Describe("Node Drain", func() { Expect(nodeDrain.Status.LastError).To(Equal("")) Expect(nodeDrain.Status.TotalPods).To(Equal(3)) - Expect(nodeDrain.Status.PendingPods).To(BeEmpty()) + Expect(nodeDrain.Status.PendingEvictionPods).To(BeEmpty()) + Expect(nodeDrain.Status.PodsToRestart).To(BeEmpty()) Expect(nodeDrain.Status.PodsToBeEvicted).To(Equal([]gezbcoukalphav1.NamespaceAndName{ { Namespace: testNamespace, @@ -204,13 +207,11 @@ var _ = Describe("Node Drain", func() { Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(nodeDrainCR), nodeDrain)).To(Succeed()) Expect(nodeDrain.Status.PodsBlockingDrain).To(Equal("block-1,block-2")) - verifyEvent(events.EventReasonDrainBlockedByPods, "Node node1 is blocked from draining by pod: block-1") - verifyEvent(events.EventReasonDrainBlockedByPods, "Node node1 is blocked from draining by pod: block-2") - Expect(nodeDrain.Status.LastError).To(Equal("")) Expect(nodeDrain.Status.TotalPods).To(Equal(4)) Expect(nodeDrain.Status.PodsToBeEvicted).To(BeEmpty()) - Expect(nodeDrain.Status.PendingPods).To(Equal([]string{"block-1", "block-2", "pod1", "shouldnotblock-1"})) + Expect(nodeDrain.Status.PendingEvictionPods).To(Equal([]string{"block-1", "block-2", "pod1", "shouldnotblock-1"})) + Expect(nodeDrain.Status.PodsToRestart).To(BeEmpty()) Expect(nodeDrain.Status.EvictionPodCount).To(Equal(4)) Expect(nodeDrain.Status.DrainProgress).To(Equal(0)) @@ -221,13 +222,13 @@ var _ = Describe("Node Drain", func() { By("Check the status has been updated") Eventually(func(g Gomega) { g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(nodeDrainCR), nodeDrain)).To(Succeed()) - g.Expect(nodeDrain.Status.PendingPods).To(HaveLen(3)) + 
g.Expect(nodeDrain.Status.PendingEvictionPods).To(HaveLen(3)) }, timeout, interval).Should(Succeed()) Expect(nodeDrain.Status.LastError).To(Equal("")) Expect(nodeDrain.Status.TotalPods).To(Equal(4)) Expect(nodeDrain.Status.PodsToBeEvicted).To(BeNil()) - Expect(nodeDrain.Status.PendingPods).To(Equal([]string{"block-2", "pod1", "shouldnotblock-1"})) + Expect(nodeDrain.Status.PendingEvictionPods).To(Equal([]string{"block-2", "pod1", "shouldnotblock-1"})) Expect(nodeDrain.Status.EvictionPodCount).To(Equal(4)) Expect(nodeDrain.Status.DrainProgress).To(Equal(25)) Expect(nodeDrain.Status.PodsBlockingDrain).To(Equal("block-2")) @@ -258,7 +259,8 @@ var _ = Describe("Node Drain", func() { }, } Expect(nodeDrain.Status.PodsToBeEvicted).To(Equal(podsToBeEvicted)) - Expect(nodeDrain.Status.PendingPods).To(BeEmpty()) + Expect(nodeDrain.Status.PendingEvictionPods).To(BeEmpty()) + Expect(nodeDrain.Status.PodsToRestart).To(BeEmpty()) Expect(nodeDrain.Status.EvictionPodCount).To(Equal(4)) Expect(nodeDrain.Status.DrainProgress).To(Equal(100)) Expect(nodeDrain.Status.PodsBlockingDrain).To(Equal("")) @@ -305,7 +307,8 @@ var _ = Describe("Node Drain", func() { }, } Expect(nodeDrain.Status.PodsToBeEvicted).To(BeNil()) - Expect(nodeDrain.Status.PendingPods).To(Equal([]string{"pod1"})) + Expect(nodeDrain.Status.PendingEvictionPods).To(Equal([]string{"pod1"})) + Expect(nodeDrain.Status.PodsToRestart).To(BeEmpty()) Expect(nodeDrain.Status.EvictionPodCount).To(Equal(1)) Expect(nodeDrain.Status.DrainProgress).To(Equal(0)) Expect(nodeDrain.Status.PodsBlockingDrain).To(Equal("")) @@ -326,7 +329,8 @@ var _ = Describe("Node Drain", func() { Expect(nodeDrain.Status.LastError).To(Equal("")) Expect(nodeDrain.Status.TotalPods).To(Equal(1)) Expect(nodeDrain.Status.PodsToBeEvicted).To(Equal(podsToBeEvicted)) - Expect(nodeDrain.Status.PendingPods).To(BeEmpty()) + Expect(nodeDrain.Status.PendingEvictionPods).To(BeEmpty()) + Expect(nodeDrain.Status.PodsToRestart).To(BeEmpty()) 
Expect(nodeDrain.Status.EvictionPodCount).To(Equal(1)) Expect(nodeDrain.Status.DrainProgress).To(Equal(100)) Expect(nodeDrain.Status.PodsBlockingDrain).To(Equal("")) @@ -384,7 +388,8 @@ var _ = Describe("Node Drain", func() { }, } Expect(nodeDrain.Status.PodsToBeEvicted).To(Equal(podsToBeEvicted)) - Expect(nodeDrain.Status.PendingPods).To(BeEmpty()) + Expect(nodeDrain.Status.PendingEvictionPods).To(BeEmpty()) + Expect(nodeDrain.Status.PodsToRestart).To(BeEmpty()) Expect(nodeDrain.Status.DrainProgress).To(Equal(100)) Expect(nodeDrain.Status.PodsBlockingDrain).To(Equal("")) @@ -414,7 +419,8 @@ var _ = Describe("Node Drain", func() { Expect(nodeDrain.Status.TotalPods).To(Equal(0)) Expect(nodeDrain.Status.EvictionPodCount).To(Equal(0)) Expect(nodeDrain.Status.PodsToBeEvicted).To(BeEmpty()) - Expect(nodeDrain.Status.PendingPods).To(BeEmpty()) + Expect(nodeDrain.Status.PendingEvictionPods).To(BeEmpty()) + Expect(nodeDrain.Status.PodsToRestart).To(BeEmpty()) Expect(nodeDrain.Status.DrainProgress).To(Equal(0)) Expect(nodeDrain.Status.PodsBlockingDrain).To(Equal("")) }) @@ -478,8 +484,9 @@ var _ = Describe("Node Drain", func() { }, } Expect(nodeDrain.Status.PodsToBeEvicted).To(Equal(podsToBeEvicted)) - Expect(nodeDrain.Status.PendingPods).To(BeEmpty()) - Expect(nodeDrain.Status.DrainProgress).To(Equal(100)) + Expect(nodeDrain.Status.PendingEvictionPods).To(Equal([]string{"pod1", "pod2", "pod3"})) + Expect(nodeDrain.Status.PodsToRestart).To(Equal(podsToBeEvicted)) + Expect(nodeDrain.Status.DrainProgress).To(Equal(0)) Expect(nodeDrain.Status.PodsBlockingDrain).To(Equal("")) }) @@ -521,7 +528,8 @@ var _ = Describe("Node Drain", func() { Expect(nodeDrain.Status.LastError).To(Equal("")) Expect(nodeDrain.Status.TotalPods).To(Equal(0)) - Expect(nodeDrain.Status.PendingPods).To(BeEmpty()) + Expect(nodeDrain.Status.PodsToRestart).To(BeEmpty()) + Expect(nodeDrain.Status.PendingEvictionPods).To(BeEmpty()) Expect(nodeDrain.Status.PodsToBeEvicted).To(BeEmpty()) 
Expect(nodeDrain.Status.EvictionPodCount).To(Equal(0)) Expect(nodeDrain.Status.DrainProgress).To(Equal(0)) @@ -566,8 +574,9 @@ var _ = Describe("Node Drain", func() { Expect(nodeDrain.Status.LastError).To(Equal("")) Expect(nodeDrain.Status.TotalPods).To(Equal(0)) - Expect(nodeDrain.Status.PendingPods).To(BeEmpty()) + Expect(nodeDrain.Status.PendingEvictionPods).To(BeEmpty()) Expect(nodeDrain.Status.PodsToBeEvicted).To(BeEmpty()) + Expect(nodeDrain.Status.PodsToRestart).To(BeEmpty()) Expect(nodeDrain.Status.EvictionPodCount).To(Equal(0)) Expect(nodeDrain.Status.DrainProgress).To(Equal(0)) Expect(nodeDrain.Status.PodsBlockingDrain).To(Equal("")) diff --git a/internal/events/recorder.go b/internal/events/recorder.go index 469f1c3..8018e28 100644 --- a/internal/events/recorder.go +++ b/internal/events/recorder.go @@ -32,7 +32,6 @@ import ( const ( EventReasonNodeNotFound = "NodeNotFound" EventReasonStatusUpdated = "StatusUpdated" - EventReasonDrainBlockedByPods = "DrainingBlocked" EventReasonWaitingForPodsToRestart = "WaitingForPodsToRestart" ) diff --git a/magefile.go b/magefile.go index acc669e..b8357f2 100644 --- a/magefile.go +++ b/magefile.go @@ -10,7 +10,7 @@ import ( const ( K3S_IMAGE = "rancher/k3s:v1.34.3-k3s1" CLUSTER_NAME = "nd-e2e" - AGENTS = "3" + AGENTS = "4" ) // Runs the End-to-End Test in a local Kind Cluster diff --git a/test/e2e/e2e_suite_test.go b/test/e2e/e2e_suite_test.go index dc48f57..1edd7ad 100644 --- a/test/e2e/e2e_suite_test.go +++ b/test/e2e/e2e_suite_test.go @@ -30,6 +30,9 @@ const worker2Node = "k3d-nd-e2e-agent-1" // worker3Node is the name of the node reserved for further testing const worker3Node = "k3d-nd-e2e-agent-2" +// worker4Node is the name of the node reserved for further testing +const worker4Node = "k3d-nd-e2e-agent-3" + var ( // Optional Environment Variables: // - PROMETHEUS_INSTALL_SKIP=true: Skips Prometheus Operator installation during test setup. 
@@ -43,10 +46,11 @@ var ( isPrometheusOperatorAlreadyInstalled = false // isCertManagerAlreadyInstalled will be set true when CertManager CRDs be found on the cluster isCertManagerAlreadyInstalled = false - // projectImage is the name of the image which will be build and loaded // with the code source changes to be tested. projectImage = "node-drain-controller:e2e-tests" + // testNodes is an array of the nodes that are reserved for drain testing + testNodes = []string{worker2Node, worker3Node, worker4Node} ) // TestE2E runs the end-to-end (e2e) test suite for the project. These tests execute in an isolated, @@ -65,15 +69,21 @@ var _ = BeforeSuite(func() { ExpectWithOffset(1, err).NotTo(HaveOccurred(), "Failed to set KUBECONFIG environment variable") By("Labeling and cordening the test nodes so only our workloads run on them") + // worker2 utils.LabelNode(worker2Node, k8sRole) utils.CordonNode(worker2Node) + // worker 3 utils.LabelNode(worker3Node, k8sRole) utils.CordonNode(worker3Node) + // worker 4 + utils.LabelNode(worker4Node, k8sRole) + utils.CordonNode(worker4Node) + By("Draining the test nodes to ensure a clean state") - utils.DrainNodes([]string{worker2Node, worker3Node}) + utils.DrainNodes([]string{worker2Node, worker3Node, worker4Node}) By("Ensure that Prometheus is enabled") _ = utils.UncommentCode("config/default/kustomization.yaml", "#- ../prometheus", "#") diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go index 0025e51..aeefc9a 100644 --- a/test/e2e/e2e_test.go +++ b/test/e2e/e2e_test.go @@ -6,7 +6,6 @@ import ( "os" "os/exec" "path/filepath" - "strings" "time" . 
"github.com/onsi/ginkgo/v2" @@ -84,8 +83,8 @@ var _ = Describe("Manager", Ordered, func() { } }) - SetDefaultEventuallyTimeout(2 * time.Minute) - SetDefaultEventuallyPollingInterval(time.Second) + SetDefaultEventuallyTimeout(2 * 2 * time.Minute) + SetDefaultEventuallyPollingInterval(5 * time.Second) Context("Manager", func() { It("should run successfully", func() { @@ -203,7 +202,7 @@ var _ = Describe("Manager", Ordered, func() { g.Expect(err).NotTo(HaveOccurred()) g.Expect(output).To(Equal("Succeeded"), "curl pod in wrong status") } - Eventually(verifyCurlUp, 5*time.Minute).Should(Succeed()) + Eventually(verifyCurlUp, 5*2*time.Minute).Should(Succeed()) By("getting the metrics by checking curl-metrics logs") metricsOutput := getMetricsOutput() @@ -220,436 +219,509 @@ var _ = Describe("Manager", Ordered, func() { // +kubebuilder:scaffold:e2e-webhooks-checks - It("should drain a node when there are no pods blocking the drain", func() { + It("should drain a node if there are no drianchecks", func() { - By("Uncordening our test node") + By("Un-cordoned our test node") utils.UncordonNode(worker2Node) By("Verifying the number of pods on the node(s) is 0") - expectNumberOfPodsRunning(0) + utils.ExpectNumberOfPodsRunning(testNodes, 0) - createStatefulSetWithName("nginx") - createStatefulSetWithName("blocking") + utils.CreateStatefulSetWithName("nginx", 1, k8sRole) + utils.CreateStatefulSetWithName("blocking", 2, k8sRole) By("Waiting for all pods to be running") - expectNumberOfPodsRunning(2) + utils.ExpectNumberOfPodsRunning(testNodes, 3) - nodeDrainFile := applyNodeDrain(false, K8sVersionRegex, k8sRole) + preDrainPodStartTimes := utils.GetPodStartTimesOnNodes(testNodes) - By("Waiting for all pods to be removed") - expectNumberOfPodsRunning(0) + utils.ApplyNodeDrain(worker2Node, false, K8sVersionRegex, k8sRole) + + By("Drain should run and there should be no pods left") + utils.ExpectNumberOfPodsRunning(testNodes, 0) + + By("Un-cordoned other test node to allow pods to 
restart there (simulate new node)") + utils.UncordonNode(worker3Node) + + By("Waiting for all pods to be restarted on other nodes") + utils.ExpectNumberOfPodsRunning(testNodes, 3) + + postDrainPodEndTimes := utils.GetPodStartTimesOnNodes(testNodes) + + for podName, preDrainStartTime := range preDrainPodStartTimes { + newStartTime, exists := postDrainPodEndTimes[podName] + Expect(exists).To(BeTrue(), "Pod %s not found after drain", podName) + Expect(newStartTime.After(preDrainStartTime)).To(BeTrue()) + } nodeDrainIsPhaseCompleted := func(g Gomega) { - cmd := exec.Command("kubectl", "get", "nodedrain", "nodedrain-sample") + cmd := exec.Command("kubectl", "get", "nodedrain", worker2Node) output, err := utils.Run(cmd) g.Expect(err).NotTo(HaveOccurred()) - g.Expect(output).To(ContainSubstring("nodedrain-sample Completed")) + g.Expect(output).To(ContainSubstring(fmt.Sprintf("%s %s", worker2Node, "Completed"))) + } - Eventually(nodeDrainIsPhaseCompleted, 5*time.Minute).Should(Succeed()) + Eventually(nodeDrainIsPhaseCompleted, 2*time.Minute).Should(Succeed()) By("Clean up after the test") - By("deleting the NodeDrain") - cmd := exec.Command("kubectl", "delete", "-f", nodeDrainFile) - _, err := utils.Run(cmd) - Expect(err).NotTo(HaveOccurred(), "Failed delete nodeDrain") - err = os.Remove(nodeDrainFile) - Expect(err).NotTo(HaveOccurred(), "Failed remove nodeDrainFile") + By("Deleting statefulsets") + utils.DeleteStatefulSet("nginx") + utils.DeleteStatefulSet("blocking") - By("Deleting test statefulsets") - cmd = exec.Command("kubectl", "delete", "statefulset", "nginx") - _, err = utils.Run(cmd) - Expect(err).NotTo(HaveOccurred(), "Failed delete nginx statefulset") - cmd = exec.Command("kubectl", "delete", "statefulset", "blocking") - _, err = utils.Run(cmd) - Expect(err).NotTo(HaveOccurred(), "Failed delete blocking statefulset") + By("deleting the NodeDrain") + utils.DeleteNodeDrain(worker2Node) - By("re-cordening our test node") + By("Re-cordoned our test nodes") 
utils.CordonNode(worker2Node) + utils.CordonNode(worker3Node) }) }) It("drain should be blocked by a DrainCheck until pods matching the regex are deleted", func() { - By("Uncordening our test node") + By("Un-cordoned our test node") utils.UncordonNode(worker2Node) By("Verifying the number of pods on the node(s) is 2") - expectNumberOfPodsRunning(0) + utils.ExpectNumberOfPodsRunning(testNodes, 0) - createStatefulSetWithName("nginx") - createStatefulSetWithName("blocking") + utils.CreateStatefulSetWithName("nginx", 1, k8sRole) + utils.CreateStatefulSetWithName("blocking", 2, k8sRole) By("Waiting for all pods to be running") - expectNumberOfPodsRunning(2) + utils.ExpectNumberOfPodsRunning(testNodes, 3) - By("creating a DrainCheck for `blocking-.` pods") + By("Creating a DrainCheck for `blocking-.` pods") - drainCheckFile := applyDrainCheck("^blocking-.*") + utils.ApplyDrainCheck("blocking", "^blocking-.*") - nodeDrainFile := applyNodeDrain(false, K8sVersionRegex, k8sRole) + preDrainPodStartTimes := utils.GetPodStartTimesOnNodes(testNodes) - By("Checking we still have all pods running") - expectNumberOfPodsRunning(2) + utils.ApplyNodeDrain(worker2Node, false, K8sVersionRegex, k8sRole) + + By("Checking all pods are still running") + utils.ExpectNumberOfPodsRunning(testNodes, 3) nodedrainIsPhaseCordened := func(g Gomega) { - cmd := exec.Command("kubectl", "get", "nodedrain", "nodedrain-sample") + cmd := exec.Command("kubectl", "get", "nodedrain", worker2Node) output, err := utils.Run(cmd) g.Expect(err).NotTo(HaveOccurred()) - g.Expect(output).To(ContainSubstring("nodedrain-sample PodsBlockingDrain"), - "coredened NodeDrain not found") + g.Expect(output).To(ContainSubstring(fmt.Sprintf("%s %s", worker2Node, "PodsBlockingDrain"))) } - Eventually(nodedrainIsPhaseCordened, 5*time.Minute).Should(Succeed()) + Eventually(nodedrainIsPhaseCordened, 2*time.Minute).Should(Succeed()) - By("Clean up after the test") + By("Deleting the first blocking pod the drain should be held") + 
utils.DeletePod("blocking-0") - By("Deleting the blocking statefulsets") - cmd := exec.Command("kubectl", "delete", "statefulset", "blocking") - _, err := utils.Run(cmd) - Expect(err).NotTo(HaveOccurred(), "Failed delete nodeDrain") + By("Checking 2 pods are running on our test nodes") + utils.ExpectNumberOfPodsRunning(testNodes, 2) - By("Drain should run and we should be left with no pods") - expectNumberOfPodsRunning(0) + Eventually(nodedrainIsPhaseCordened, 2*time.Minute).Should(Succeed()) + + By("Deleting the final blocking pod to allow the drain to continue") + utils.DeletePod("blocking-1") + + By("Drain should run and there should be left with no pods running") + utils.ExpectNumberOfPodsRunning(testNodes, 0) nodeDrainIsPhaseCompleted := func(g Gomega) { - cmd := exec.Command("kubectl", "get", "nodedrain", "nodedrain-sample") + cmd := exec.Command("kubectl", "get", "nodedrain", worker2Node) output, err := utils.Run(cmd) g.Expect(err).NotTo(HaveOccurred()) - g.Expect(output).To(ContainSubstring("nodedrain-sample Completed")) + g.Expect(output).To(ContainSubstring(fmt.Sprintf("%s %s", worker2Node, "Completed"))) } - Eventually(nodeDrainIsPhaseCompleted, 5*time.Minute).Should(Succeed()) + Eventually(nodeDrainIsPhaseCompleted, 2*time.Minute).Should(Succeed()) + + By("Un-cordoned other test node to allow pods to restart there (simulate new node)") + utils.UncordonNode(worker3Node) + + By("Waiting for all pods to be restarted on other nodes") + utils.ExpectNumberOfPodsRunning(testNodes, 3) + + havePodsRestarted(preDrainPodStartTimes) By("Clean up after the test") - By("Deleting nginx statefulsets") - cmd = exec.Command("kubectl", "delete", "statefulset", "nginx") - _, err = utils.Run(cmd) - Expect(err).NotTo(HaveOccurred(), "Failed delete nginx statefulset") + By("Deleting statefulsets") + utils.DeleteStatefulSet("nginx") + utils.DeleteStatefulSet("blocking") - By("deleting the NpdeDrain") - cmd = exec.Command("kubectl", "delete", "-f", nodeDrainFile) - _, err = 
utils.Run(cmd) - Expect(err).NotTo(HaveOccurred(), "Failed delete nodeDrain") - err = os.Remove(nodeDrainFile) - Expect(err).NotTo(HaveOccurred(), "Failed remove nodeDrainFile") + By("deleting the NodeDrain") + utils.DeleteNodeDrain(worker2Node) By("removing the drainCheck") - cmd = exec.Command("kubectl", "delete", "-f", drainCheckFile) - _, err = utils.Run(cmd) - Expect(err).NotTo(HaveOccurred(), "Failed remove drainCheck") - err = os.Remove(drainCheckFile) - Expect(err).NotTo(HaveOccurred(), "Failed remove nodeDrainFile") + utils.DeleteDrainCheck("blocking") - By("re-cordening our test node") + By("re-cordoned our test node") utils.CordonNode(worker2Node) + utils.CordonNode(worker3Node) }) - It("if another node with the same 'role' & version exists uncodened should hold until it is cordened", func() { + It("if another node with the same 'role' & version exists un-cordoned should hold until it is cordoned", func() { - By("Uncordening our test node") + By("Un-cordoned our test node") utils.UncordonNode(worker2Node) By("Verifying the number of pods on the node(s) is 0") - expectNumberOfPodsRunning(0) + utils.ExpectNumberOfPodsRunning(testNodes, 0) - createStatefulSetWithName("nginx") - createStatefulSetWithName("nginx2") + utils.CreateStatefulSetWithName("nginx", 1, k8sRole) + utils.CreateStatefulSetWithName("nginx2", 1, k8sRole) By("Waiting for all pods to be running") - expectNumberOfPodsRunning(2) + utils.ExpectNumberOfPodsRunning(testNodes, 2) - // By("Cordening the worker2 now it has workload on it") - // utils.CordonNode(worker2Node) + preDrainPodStartTimes := utils.GetPodStartTimesOnNodes(testNodes) - By("Uncordening worker3") + By("Un-cordoned worker3 - should block the drain on worker2") utils.UncordonNode(worker3Node) - nodeDrainFile := applyNodeDrain(false, K8sVersionRegex, k8sRole) + utils.ApplyNodeDrain(worker2Node, false, K8sVersionRegex, k8sRole) nodeDrainIsPhaseOtherNodesNotCordoned := func(g Gomega) { - cmd := exec.Command("kubectl", "get", 
"nodedrain", "nodedrain-sample") + cmd := exec.Command("kubectl", "get", "nodedrain", worker2Node) output, err := utils.Run(cmd) g.Expect(err).NotTo(HaveOccurred()) - g.Expect(output).To(ContainSubstring("nodedrain-sample OtherNodesNotCordoned")) + g.Expect(output).To(ContainSubstring(fmt.Sprintf("%s %s", worker2Node, "OtherNodesNotCordoned"))) } - Eventually(nodeDrainIsPhaseOtherNodesNotCordoned, 5*time.Minute).Should(Succeed()) + Eventually(nodeDrainIsPhaseOtherNodesNotCordoned, 2*time.Minute).Should(Succeed()) - By("Cordening worker3") + By("cordoned worker3") cmd := exec.Command("kubectl", "cordon", worker3Node) _, err := utils.Run(cmd) Expect(err).NotTo(HaveOccurred(), "Failed delete nodeDrain") - By("Drain should run and we should be left no pods") - expectNumberOfPodsRunning(0) + By("Drain should run and there should be left with no pods running") + utils.ExpectNumberOfPodsRunning(testNodes, 0) nodeDrainIsPhaseCompleted := func(g Gomega) { - cmd := exec.Command("kubectl", "get", "nodedrain", "nodedrain-sample") + cmd := exec.Command("kubectl", "get", "nodedrain", worker2Node) output, err := utils.Run(cmd) g.Expect(err).NotTo(HaveOccurred()) - g.Expect(output).To(ContainSubstring("nodedrain-sample Completed")) + g.Expect(output).To(ContainSubstring(fmt.Sprintf("%s %s", worker2Node, "Completed"))) } - Eventually(nodeDrainIsPhaseCompleted, 5*time.Minute).Should(Succeed()) + Eventually(nodeDrainIsPhaseCompleted, 2*time.Minute).Should(Succeed()) - By("Clean up after the test") + By("Un-cordoned other test node to allow pods to restart there (simulate new node)") + utils.UncordonNode(worker3Node) - By("Deleting nginx statefulsets") - cmd = exec.Command("kubectl", "delete", "statefulset", "nginx") - _, err = utils.Run(cmd) - Expect(err).NotTo(HaveOccurred(), "Failed delete nginx statefulset") + By("Waiting for all pods to be restarted on other nodes") + utils.ExpectNumberOfPodsRunning(testNodes, 2) - By("Deleting nginx2 statefulsets") - cmd = 
exec.Command("kubectl", "delete", "statefulset", "nginx2") - _, err = utils.Run(cmd) - Expect(err).NotTo(HaveOccurred(), "Failed delete nginx statefulset") + havePodsRestarted(preDrainPodStartTimes) + + By("Clean up after the test") + + By("Deleting statefulsets") + utils.DeleteStatefulSet("nginx") + utils.DeleteStatefulSet("nginx2") By("deleting the NodeDrain") - cmd = exec.Command("kubectl", "delete", "-f", nodeDrainFile) - _, err = utils.Run(cmd) - Expect(err).NotTo(HaveOccurred(), "Failed delete nodeDrain") - err = os.Remove(nodeDrainFile) - Expect(err).NotTo(HaveOccurred(), "Failed remove nodeDrainFile") + utils.DeleteNodeDrain(worker2Node) - By("re-cordening our test nodes") + By("re-cordoned our test node") utils.CordonNode(worker2Node) utils.CordonNode(worker3Node) - }) It("if configured will wait for evicted pods to get to running state", func() { - By("Uncordening our test nodes") + By("Un-cordoned our test nodes") utils.UncordonNode(worker2Node) By("Verifying the number of pods on the node(s) is 0") - expectNumberOfPodsRunning(0) + utils.ExpectNumberOfPodsRunning(testNodes, 0) - createStatefulSetWithName("nginx") - createStatefulSetWithName("nginx2") + utils.CreateStatefulSetWithName("nginx", 1, k8sRole) + utils.CreateStatefulSetWithName("nginx2", 1, k8sRole) By("Waiting for all pods to be running") - expectNumberOfPodsRunning(2) + utils.ExpectNumberOfPodsRunning(testNodes, 2) + + preDrainPodStartTimes := utils.GetPodStartTimesOnNodes(testNodes) - nodeDrainFile := applyNodeDrain(true, K8sVersionRegex, k8sRole) + utils.ApplyNodeDrain(worker2Node, true, K8sVersionRegex, k8sRole) - By("Drain should run and we should be left no pods") - expectNumberOfPodsRunning(0) + By("Drain should run and there should be left with no pods running") + utils.ExpectNumberOfPodsRunning(testNodes, 0) nodeDrainIsPhaseWaitForPodsToRestart := func(g Gomega) { - cmd := exec.Command("kubectl", "get", "nodedrain", "nodedrain-sample") + cmd := exec.Command("kubectl", "get", 
"nodedrain", worker2Node) output, err := utils.Run(cmd) g.Expect(err).NotTo(HaveOccurred()) - g.Expect(output).To(ContainSubstring("nodedrain-sample WaitForPodsToRestart")) + g.Expect(output).To(ContainSubstring(fmt.Sprintf("%s %s", worker2Node, "WaitForPodsToRestart"))) } - Eventually(nodeDrainIsPhaseWaitForPodsToRestart, 5*time.Minute).Should(Succeed()) + Eventually(nodeDrainIsPhaseWaitForPodsToRestart, 2*time.Minute).Should(Succeed()) - By("Uncordoning node (simulate new node) Pods should start and drain should go to completed") + By("Un-cordoning node (simulate new node) Pods should start and drain should go to completed") utils.UncordonNode(worker2Node) nodeDrainIsPhaseCompleted := func(g Gomega) { - cmd := exec.Command("kubectl", "get", "nodedrain", "nodedrain-sample") + cmd := exec.Command("kubectl", "get", "nodedrain", worker2Node) output, err := utils.Run(cmd) g.Expect(err).NotTo(HaveOccurred()) - g.Expect(output).To(ContainSubstring("nodedrain-sample Completed")) + g.Expect(output).To(ContainSubstring(fmt.Sprintf("%s %s", worker2Node, "WaitForPodsToRestart"))) } - Eventually(nodeDrainIsPhaseCompleted, 5*time.Minute).Should(Succeed()) + Eventually(nodeDrainIsPhaseCompleted, 2*time.Minute).Should(Succeed()) - By("Clean up after the test") + By("Un-cordoned other test node to allow pods to restart there (simulate new node)") + utils.UncordonNode(worker3Node) - By("Deleting nginx statefulsets") - cmd := exec.Command("kubectl", "delete", "statefulset", "nginx") - _, err := utils.Run(cmd) - Expect(err).NotTo(HaveOccurred(), "Failed delete nginx statefulset") + By("Waiting for all pods to be restarted on other nodes") + utils.ExpectNumberOfPodsRunning(testNodes, 2) - By("Deleting nginx2 statefulsets") - cmd = exec.Command("kubectl", "delete", "statefulset", "nginx2") - _, err = utils.Run(cmd) - Expect(err).NotTo(HaveOccurred(), "Failed delete nginx statefulset") + havePodsRestarted(preDrainPodStartTimes) + + By("Clean up after the test") + + By("Deleting 
statefulsets") + utils.DeleteStatefulSet("nginx") + utils.DeleteStatefulSet("nginx2") By("deleting the NodeDrain") - cmd = exec.Command("kubectl", "delete", "-f", nodeDrainFile) - _, err = utils.Run(cmd) - Expect(err).NotTo(HaveOccurred(), "Failed delete nodeDrain") - err = os.Remove(nodeDrainFile) - Expect(err).NotTo(HaveOccurred(), "Failed remove nodeDrainFile") + utils.DeleteNodeDrain(worker2Node) - By("re-cordening our test nodes") + By("re-cordoned our test node") utils.CordonNode(worker2Node) + utils.CordonNode(worker3Node) }) It("if NodeDrain Role does not match the nodes role the drain will fail", func() { - By("Uncordening our test nodes") + By("Un-cordoned our test nodes") utils.UncordonNode(worker2Node) - nodeDrainFile := applyNodeDrain(true, K8sVersionRegex, "wrong-role") + utils.ApplyNodeDrain(worker2Node, true, K8sVersionRegex, "wrong-role") nodeDrainIsPhaseFailed := func(g Gomega) { - cmd := exec.Command("kubectl", "get", "nodedrain", "nodedrain-sample") + cmd := exec.Command("kubectl", "get", "nodedrain", worker2Node) output, err := utils.Run(cmd) g.Expect(err).NotTo(HaveOccurred()) - g.Expect(output).To(ContainSubstring("nodedrain-sample Failed")) + g.Expect(output).To(ContainSubstring(fmt.Sprintf("%s %s", worker2Node, "Failed"))) } - Eventually(nodeDrainIsPhaseFailed, 5*time.Minute).Should(Succeed()) + Eventually(nodeDrainIsPhaseFailed, 2*time.Minute).Should(Succeed()) By("Clean up after the test") By("deleting the NodeDrain") - cmd := exec.Command("kubectl", "delete", "-f", nodeDrainFile) - _, err := utils.Run(cmd) - Expect(err).NotTo(HaveOccurred(), "Failed delete nodeDrain") - err = os.Remove(nodeDrainFile) - Expect(err).NotTo(HaveOccurred(), "Failed remove nodeDrainFile") + utils.DeleteNodeDrain(worker2Node) - By("re-cordening our test nodes") + By("Re-cordoned our test nodes") utils.CordonNode(worker2Node) }) It("if NodeDrain K8S Version regexp does not match the nodes kubelet version the drain will fail", func() { - By("Uncordening our test 
nodes") + By("Un-cordoned our test nodes") utils.UncordonNode(worker2Node) - nodeDrainFile := applyNodeDrain(true, "^v1\\.30\\..*$", k8sRole) + utils.ApplyNodeDrain(worker2Node, true, "^v1\\.30\\..*$", k8sRole) - nodeDrainIsPhaseFailed := func(g Gomega) { - cmd := exec.Command("kubectl", "get", "nodedrain", "nodedrain-sample") - output, err := utils.Run(cmd) - g.Expect(err).NotTo(HaveOccurred()) - g.Expect(output).To(ContainSubstring("nodedrain-sample Failed")) - } - Eventually(nodeDrainIsPhaseFailed, 5*time.Minute).Should(Succeed()) + nodeDrainIsPhaseFailed := nodeDrainInPhaseFunc(worker2Node, "Failed") + Eventually(nodeDrainIsPhaseFailed, 2*time.Minute).Should(Succeed()) By("Clean up after the test") By("deleting the NodeDrain") - cmd := exec.Command("kubectl", "delete", "-f", nodeDrainFile) - _, err := utils.Run(cmd) - Expect(err).NotTo(HaveOccurred(), "Failed delete nodeDrain") - err = os.Remove(nodeDrainFile) - Expect(err).NotTo(HaveOccurred(), "Failed remove nodeDrainFile") + utils.DeleteNodeDrain(worker2Node) - By("re-cordening our test nodes") + By("Re-cordoned our test nodes") utils.CordonNode(worker2Node) }) -}) -func applyDrainCheck(podRegex string) string { - By("Creating DrainCheck for:" + podRegex) - - drainCheck := fmt.Sprintf(` -apiVersion: k8s.gezb.co.uk/v1 -kind: DrainCheck -metadata: - name: draincheck-sample -spec: - podRegex: %s -`, podRegex) - - drainCheckFile, err := utils.CreateTempFile(drainCheck) - Expect(err).NotTo(HaveOccurred(), "Failed creating DrainCheck file to apply: "+drainCheckFile) - - cmd := exec.Command("kubectl", "apply", "-f", drainCheckFile) - _, err = utils.Run(cmd) - Expect(err).NotTo(HaveOccurred(), "Failed apply DrainCheck") - return drainCheckFile -} + It("multi node with multiple drianchecks", func() { -func applyNodeDrain(waitForPodToRestart bool, versionToDrainRegex string, role string) string { - By("Creating NodeDrain for node :" + worker2Node) - - nodeDrain := fmt.Sprintf(` -apiVersion: k8s.gezb.co.uk/v1 -kind: 
NodeDrain -metadata: - name: nodedrain-sample -spec: - nodeName: %s - versionToDrainRegex: %s - nodeRole: %s - waitForPodsToRestart: %t -`, worker2Node, versionToDrainRegex, role, waitForPodToRestart) - - nodeDrainFile, err := utils.CreateTempFile(nodeDrain) - Expect(err).NotTo(HaveOccurred(), "Failed creating NodeDrain file to apply: "+nodeDrainFile) - - cmd := exec.Command("kubectl", "apply", "-f", nodeDrainFile) - _, err = utils.Run(cmd) - Expect(err).NotTo(HaveOccurred(), "Failed apply NodeDrain") - return nodeDrainFile -} + By("Un-cordoned our test node(s)") + utils.UncordonNode(worker2Node) + utils.UncordonNode(worker3Node) -func expectNumberOfPodsRunning(expected int) { - var worker2, worker3 []string - verifyAllPodsRunning := func(g Gomega) { - cmd := exec.Command("kubectl", "get", "pods", - "-o", "wide", - "-A", - "--field-selector", "spec.nodeName="+worker2Node) - output, err := utils.Run(cmd) - g.Expect(err).NotTo(HaveOccurred()) - worker2 = strings.Split(output, "\n") - worker2 = worker2[:len(worker2)-1] // remove the empty last line - worker2 = worker2[1:] // remove header - for _, line := range worker2 { - g.Expect(line).To(ContainSubstring("Running")) + By("Starting some test workload across both nodes") + + utils.ExpectNumberOfPodsRunning(testNodes, 0) + utils.CreateStatefulSetWithName("nginx", 4, k8sRole) + utils.CreateStatefulSetWithName("test-app-1", 2, k8sRole) + utils.CreateStatefulSetWithName("test-app-2", 2, k8sRole) + utils.CreateStatefulSetWithName("vault", 3, k8sRole) + utils.CreateStatefulSetWithName("elasticsearch-data", 3, k8sRole) + + totalPods := 14 + + By("Verifying the number of pods on the node(s) is totalPods") + utils.ExpectNumberOfPodsRunning(testNodes, totalPods) + + preDrainPodStartTimes := utils.GetPodStartTimesOnNodes(testNodes) + + By("Creating a DrainCheck(s)") + + utils.ApplyDrainCheck("vault", "^vault-.*") + utils.ApplyDrainCheck("elasticsearch-data", "^elasticsearch-data-.*") + + utils.ApplyNodeDrain(worker2Node, true, 
K8sVersionRegex, k8sRole) + + By("Checking all pods are still running") + utils.ExpectNumberOfPodsRunning(testNodes, totalPods) + + By("Verifying status is OtherNodesNotCordoned ") + + worker2IsPhaseOtherNodesNotCordoned := nodeDrainInPhaseFunc(worker2Node, "OtherNodesNotCordoned") + Eventually(worker2IsPhaseOtherNodesNotCordoned, 2*time.Minute).Should(Succeed()) + + By("Applying a NodeDrain to the other test code") + utils.ApplyNodeDrain(worker3Node, true, K8sVersionRegex, k8sRole) + + By("Verifying both nodedrains are in PodsBlockingDrain phase") + worker2IsPhasePodsBlockingDrain := nodeDrainInPhaseFunc(worker2Node, "PodsBlockingDrain") + Eventually(worker2IsPhasePodsBlockingDrain, 2*time.Minute).Should(Succeed()) + + worker3IsPhasePodsBlockingDrain := nodeDrainInPhaseFunc(worker3Node, "PodsBlockingDrain") + Eventually(worker3IsPhasePodsBlockingDrain, 2*time.Minute).Should(Succeed()) + + // validate all pods are still running + utils.ExpectNumberOfPodsRunning(testNodes, totalPods) + + By("Removing all blocking pods on worker 2 allows it move on and drain the node") + + blockingPods := []string{"vault-0", "vault-1", "vault-2", "elasticsearch-data-0", "elasticsearch-data-1", "elasticsearch-data-2"} + podsToBeDeleted := []string{} + podsOnNode := utils.GetPodsOnNode(worker2Node) + + for _, blockingPod := range blockingPods { + for _, pod := range podsOnNode { + if pod == blockingPod { + podsToBeDeleted = append(podsToBeDeleted, pod) + } + } } - cmd = exec.Command("kubectl", "get", "pods", - "-o", "wide", - "-A", - "--field-selector", "spec.nodeName="+worker3Node) - output, err = utils.Run(cmd) - g.Expect(err).NotTo(HaveOccurred()) - worker3 = strings.Split(output, "\n") - worker3 = worker3[:len(worker3)-1] // remove the empty last line - worker3 = worker3[1:] // remove header - for _, line := range worker3 { - g.Expect(line).To(ContainSubstring("Running")) + + Expect(len(podsOnNode)-len(podsToBeDeleted)).NotTo(BeZero(), "We need non blocking pods on the node") + + 
for _, pod := range podsToBeDeleted { + // Phase should still be Pods Blocking Drain on both nodes + Eventually(worker2IsPhasePodsBlockingDrain, 2*time.Minute).Should(Succeed()) + Eventually(worker3IsPhasePodsBlockingDrain, 2*time.Minute).Should(Succeed()) + utils.DeletePod(pod) } - g.Expect(len(worker2) + len(worker3)).To(Equal(expected)) + + // We should now be running totalPods - however many pods were on worker 2 + utils.ExpectNumberOfPodsRunning(testNodes, totalPods-len(podsOnNode)) + utils.ExpectNumberOfPodsRunning([]string{worker2Node}, 0) + utils.ExpectNumberOfPodsRunning([]string{worker3Node}, totalPods-len(podsOnNode)) + + // worker 3 should still be Phase PodsBlocking Drain + Eventually(worker3IsPhasePodsBlockingDrain, 2*time.Minute).Should(Succeed()) + + // Worker 2 should be Phase WaitForPodsToRestart (they cant start atm as all nodes are cordoned) + worker2IsPhaseWaitForPodsToRestart := nodeDrainInPhaseFunc(worker2Node, "WaitForPodsToRestart") + Eventually(worker2IsPhaseWaitForPodsToRestart, 2*time.Minute).Should(Succeed()) + + // Un-cordon worker 4 so pods can restart + utils.UncordonNode(worker4Node) + + worker2IsPhaseComplete := nodeDrainInPhaseFunc(worker2Node, "Complete") + Eventually(worker2IsPhaseComplete, 2*time.Minute).Should(Succeed()) + + // validate all pods are now running + utils.ExpectNumberOfPodsRunning(testNodes, totalPods) + + // Re-Cordon worker4 so all nodes are cordoned + utils.CordonNode(worker4Node) + + By("Removing all blocking pods on worker 3 allows it move on and drain the node") + + podsToBeDeleted = []string{} + podsOnNode = utils.GetPodsOnNode(worker3Node) + + for _, blockingPod := range blockingPods { + for _, pod := range podsOnNode { + if pod == blockingPod { + podsToBeDeleted = append(podsToBeDeleted, pod) + } + } + } + + Expect(len(podsOnNode)-len(podsToBeDeleted)).NotTo(BeZero(), "We need non blocking pods on the node") + + for _, pod := range podsToBeDeleted { + // Phase should still be Pods Blocking Drain on both 
nodes + Eventually(worker2IsPhaseComplete, 2*time.Minute).Should(Succeed()) + Eventually(worker3IsPhasePodsBlockingDrain, 2*time.Minute).Should(Succeed()) + utils.DeletePod(pod) + } + + worker3IsPhaseWaitForPodsToRestart := nodeDrainInPhaseFunc(worker3Node, "WaitForPodsToRestart") + Eventually(worker3IsPhaseWaitForPodsToRestart, 2*time.Minute).Should(Succeed()) + + // Un-cordon worker 4 so pods can restart + utils.UncordonNode(worker4Node) + + worker3IsPhaseComplete := nodeDrainInPhaseFunc(worker3Node, "Complete") + Eventually(worker2IsPhaseComplete, 2*time.Minute).Should(Succeed()) + Eventually(worker3IsPhaseComplete, 2*time.Minute).Should(Succeed()) + + // validate all pods are now running + utils.ExpectNumberOfPodsRunning(testNodes, totalPods) + + Eventually(worker2IsPhaseComplete, 2*time.Minute).Should(Succeed()) + Eventually(worker3IsPhaseComplete, 2*time.Minute).Should(Succeed()) + + // We should now be running totalPods - however many pods were on worker 3 + utils.ExpectNumberOfPodsRunning(testNodes, totalPods) + utils.ExpectNumberOfPodsRunning([]string{worker2Node}, 0) + utils.ExpectNumberOfPodsRunning([]string{worker3Node}, 0) + + // Check all pods have restarted + havePodsRestarted(preDrainPodStartTimes) + + By("Clean up after the test") + + By("Deleting statefulset(s)") + + utils.DeleteStatefulSet("nginx") + utils.DeleteStatefulSet("test-app-1") + utils.DeleteStatefulSet("test-app-2") + utils.DeleteStatefulSet("vault") + utils.DeleteStatefulSet("elasticsearch-data") + + By("deleting the NodeDrain(s)") + utils.DeleteNodeDrain(worker2Node) + utils.DeleteNodeDrain(worker3Node) + + By("removing the drainCheck(s)") + utils.DeleteDrainCheck("vault") + utils.DeleteDrainCheck("elasticsearch-data") + + By("re-cordoned our test node") + utils.CordonNode(worker2Node) + utils.CordonNode(worker3Node) + utils.CordonNode(worker4Node) + }) + +}) + +func nodeDrainInPhaseFunc(node string, phase string) func(g Gomega) { + return func(g Gomega) { + cmd := 
exec.Command("kubectl", "get", "nodedrain", node) + output, err := utils.Run(cmd) + g.ExpectWithOffset(1, err).NotTo(HaveOccurred()) + g.ExpectWithOffset(1, output).To(ContainSubstring(fmt.Sprintf("%s %s", node, phase))) } - EventuallyWithOffset(-2, verifyAllPodsRunning, time.Minute).Should(Succeed()) } -func createStatefulSetWithName(name string) { - By("starting StatefulSet with name " + name) - inflate := fmt.Sprintf(` -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: {{.metadata.name}} -spec: - selector: - matchLabels: - app: {{.metadata.name}} - replicas: 1 - template: - metadata: - labels: - app: {{.metadata.name}} - spec: - terminationGracePeriodSeconds: 60 - containers: - - name: {{.metadata.name}} - image: public.ecr.aws/eks-distro/kubernetes/pause:3.7 - nodeSelector: - kubernetes.io/os: linux - kubernetes.io/arch: amd64 - role: %s -`, k8sRole) - inflate = strings.ReplaceAll(inflate, "{{.metadata.name}}", name) - inflateFile, err := utils.CreateTempFile(inflate) - Expect(err).NotTo(HaveOccurred(), "Failed apply "+name) - cmd := exec.Command("kubectl", "apply", "-f", inflateFile) - _, err = utils.Run(cmd) - Expect(err).NotTo(HaveOccurred(), "Failed apply "+name) - err = os.Remove(inflateFile) - Expect(err).NotTo(HaveOccurred(), "Failed remove nodeDrainFile") +// havePodsRestarted checks that all pods in preDrainPodStartTimes have restarted +func havePodsRestarted(preDrainPodStartTimes map[string]time.Time) { + postDrainPodEndTimes := utils.GetPodStartTimesOnNodes(testNodes) + for podName, preDrainStartTime := range preDrainPodStartTimes { + newStartTime, exists := postDrainPodEndTimes[podName] + ExpectWithOffset(1, exists).To(BeTrue(), "Pod %s not found after drain", podName) + ExpectWithOffset(1, newStartTime.After(preDrainStartTime)).To(BeTrue(), fmt.Sprintf("Pod %s has not been restarted", podName)) + } } // serviceAccountToken returns a token for the specified service account in the given namespace. 
diff --git a/test/utils/utils.go b/test/utils/utils.go index 2303d10..9dc40a2 100644 --- a/test/utils/utils.go +++ b/test/utils/utils.go @@ -7,11 +7,17 @@ import ( "os" "os/exec" "strings" + "time" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" ) +type PodDetails struct { + Name string + Status string +} + const ( prometheusOperatorVersion = "v0.77.1" prometheusOperatorURL = "https://github.com/prometheus-operator/prometheus-operator/" + @@ -243,6 +249,192 @@ func UncommentCode(filename, target, prefix string) error { return os.WriteFile(filename, out.Bytes(), 0644) } +func ApplyDrainCheck(name string, podRegex string) { + By("Creating DrainCheck for:" + podRegex) + + drainCheck := fmt.Sprintf(` +apiVersion: k8s.gezb.co.uk/v1 +kind: DrainCheck +metadata: + name: %s +spec: + podRegex: %s +`, name, podRegex) + + drainCheckFile, err := CreateTempFile(drainCheck) + Expect(err).NotTo(HaveOccurred(), "Failed creating DrainCheck file to apply: "+drainCheckFile) + + cmd := exec.Command("kubectl", "apply", "-f", drainCheckFile) + _, err = Run(cmd) + Expect(err).NotTo(HaveOccurred(), "Failed apply DrainCheck") +} + +// DeleteDrainCheck deletes the given DrainCheck resource +func DeleteDrainCheck(drainCheckName string) { + cmd := exec.Command("kubectl", "delete", "draincheck", drainCheckName) + _, err := Run(cmd) + ExpectWithOffset(1, err).NotTo(HaveOccurred(), "Failed delete drainCheck") +} + +// creates and applies a NodeDrain resource for the given node +func ApplyNodeDrain(nodeName string, waitForPodToRestart bool, versionToDrainRegex string, role string) { + By("Creating NodeDrain for node :" + nodeName) + + nodeDrain := fmt.Sprintf(` +apiVersion: k8s.gezb.co.uk/v1 +kind: NodeDrain +metadata: + name: %s +spec: + nodeName: %s + versionToDrainRegex: %s + nodeRole: %s + waitForPodsToRestart: %t +`, nodeName, nodeName, versionToDrainRegex, role, waitForPodToRestart) + + nodeDrainFile, err := CreateTempFile(nodeDrain) + Expect(err).NotTo(HaveOccurred(), "Failed 
creating NodeDrain file to apply: "+nodeDrainFile) + + cmd := exec.Command("kubectl", "apply", "-f", nodeDrainFile) + _, err = Run(cmd) + Expect(err).NotTo(HaveOccurred(), "Failed apply NodeDrain") +} + +// DeleteNodeDrain deletes the given NodeDrain resource +func DeleteNodeDrain(NodeDrainName string) { + cmd := exec.Command("kubectl", "delete", "nodedrain", NodeDrainName) + _, err := Run(cmd) + ExpectWithOffset(1, err).NotTo(HaveOccurred(), "Failed delete nodeDrain") +} + +// asserts that the number of pods running on the given nodes equals expected +func ExpectNumberOfPodsRunning(nodes []string, expected int) { + verifyAllPodsRunning := func(g Gomega) { + numberOfPods := 0 + for _, nodeName := range nodes { + podDetailsForNode := getPodDetailsForNode(nodeName) + for _, podDetails := range podDetailsForNode { + g.Expect(podDetails.Status).To(Equal("Running")) + } + numberOfPods += len(podDetailsForNode) + } + g.ExpectWithOffset(1, numberOfPods).To(Equal(expected)) + } + EventuallyWithOffset(1, verifyAllPodsRunning, time.Minute).Should(Succeed()) +} + +// GePodsForNode returns the list of pods running on a given node +func GetPodsOnNode(nodeName string) []string { + podDetails := getPodDetailsForNode(nodeName) + pods := make([]string, 0, len(podDetails)) + for _, pd := range podDetails { + pods = append(pods, pd.Name) + } + return pods +} + +// gets the output from Kubectl get pods filtered for a given node +func getPodDetailsForNode(nodeName string) []PodDetails { + cmd := exec.Command("kubectl", "get", "pods", + "-o", "wide", + "-A", + "--field-selector", fmt.Sprintf("spec.nodeName=%s", nodeName)) + output, err := Run(cmd) + Expect(err).NotTo(HaveOccurred()) + lines := strings.Split(output, "\n") + lines = lines[:len(lines)-1] // remove the empty last line + lines = lines[1:] // remove header + podDetails := make([]PodDetails, 0, len(lines)) + for _, line := range lines { + fields := strings.Fields(line) + podDetails = append(podDetails, PodDetails{ + Name: 
fields[1], + Status: fields[3], + }) + } + return podDetails +} + +// creates a StatefulSet with the given name and replicas that schedules on nodes with the given role +func CreateStatefulSetWithName(name string, replicas int, role string) { + By("starting StatefulSet with name " + name) + inflate := fmt.Sprintf(` +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: {{.metadata.name}} +spec: + selector: + matchLabels: + app: {{.metadata.name}} + replicas: %d + template: + metadata: + labels: + app: {{.metadata.name}} + spec: + terminationGracePeriodSeconds: 60 + containers: + - name: {{.metadata.name}} + image: public.ecr.aws/eks-distro/kubernetes/pause:3.7 + nodeSelector: + kubernetes.io/os: linux + kubernetes.io/arch: amd64 + role: %s +`, replicas, role) + inflate = strings.ReplaceAll(inflate, "{{.metadata.name}}", name) + inflateFile, err := CreateTempFile(inflate) + Expect(err).NotTo(HaveOccurred(), "Failed apply "+name) + cmd := exec.Command("kubectl", "apply", "-f", inflateFile) + _, err = Run(cmd) + Expect(err).NotTo(HaveOccurred(), "Failed apply "+name) + err = os.Remove(inflateFile) + Expect(err).NotTo(HaveOccurred(), "Failed remove inflate file") +} + +// DeletePod deletes the given pod +func DeletePod(podName string) { + cmd := exec.Command("kubectl", "delete", "pod", podName) + _, err := Run(cmd) + ExpectWithOffset(1, err).NotTo(HaveOccurred(), "Failed delete pod") +} + +// DeleteStatefulSet deletes the given StatefulSet +func DeleteStatefulSet(podName string) { + cmd := exec.Command("kubectl", "delete", "statefulset", podName) + _, err := Run(cmd) + ExpectWithOffset(1, err).NotTo(HaveOccurred(), "Failed delete nodeDrain") +} + +// Returns a map of pod names to their start times on the specified nodes +func GetPodStartTimesOnNodes(nodes []string) map[string]time.Time { + podStartTimes := make(map[string]time.Time) + for _, nodeName := range nodes { + cmd := exec.Command("kubectl", "get", "pods", + "-o", "wide", + "-A", + 
"--field-selector", fmt.Sprintf("spec.nodeName=%s", nodeName)) + output, err := Run(cmd) + Expect(err).NotTo(HaveOccurred()) + lines := strings.Split(output, "\n") + lines = lines[:len(lines)-1] // remove the empty last line + lines = lines[1:] // remove header + for _, pod := range lines { + fields := strings.Fields(pod) + podName := fields[1] + namespace := fields[0] + // Get detailed pod info to extract the start time + cmdDetail := exec.Command("kubectl", "get", "pod", podName, "-n", namespace, "-o", "jsonpath={.status.startTime}") + startTimeStr, err := Run(cmdDetail) + Expect(err).NotTo(HaveOccurred()) + startTime, err := time.Parse(time.RFC3339, startTimeStr) + Expect(err).NotTo(HaveOccurred()) + podStartTimes[podName] = startTime + } + } + return podStartTimes +} + // CreateTempFile creates a temporary file containing the given content func CreateTempFile(content string) (string, error) { tmpFile, err := os.CreateTemp("", "k8syaml") From a595282e82e68bae678902978b05aba05307d868 Mon Sep 17 00:00:00 2001 From: Gerald Barker Date: Wed, 4 Feb 2026 20:51:23 +0000 Subject: [PATCH 2/2] WIP --- README.md | 1 - api/v1/nodedrain_types.go | 10 +- .../crd/bases/k8s.gezb.co.uk_nodedrains.yaml | 12 +- config/samples/k8s_v1_nodedrain.yaml | 1 - internal/controller/nodedrain_controller.go | 102 +++--- .../controller/nodedrain_controller_test.go | 306 +++++++++++++----- internal/controller/suite_test.go | 3 +- test/utils/utils.go | 4 +- 8 files changed, 298 insertions(+), 141 deletions(-) diff --git a/README.md b/README.md index 7bc205e..7a48e57 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,6 @@ metadata: name: nodedrain-node1 spec: nodeName: node1 - disableCordon: false # optional waitForPodsToRestart: true # optional ``` diff --git a/api/v1/nodedrain_types.go b/api/v1/nodedrain_types.go index b0c13cd..979d619 100644 --- a/api/v1/nodedrain_types.go +++ b/api/v1/nodedrain_types.go @@ -8,7 +8,9 @@ type NodeDrainPhase string const ( // NodeDrainFinalizer is a finalizer 
for a NodeMaintenance CR deletion - NodeDrainFinalizer string = "co.uk.gezb,NodeDrain" + NodeDrainFinalizer string = "co.uk.gezb.NodeDrain" + + NodeDrainAnnotation string = "co.uk.gezb.node-restart-controller" NodeDrainPhasePending NodeDrainPhase = "Pending" NodeDrainPhaseCordoned NodeDrainPhase = "Cordoned" @@ -33,12 +35,8 @@ type NodeDrainSpec struct { VersionToDrainRegex string `json:"versionToDrainRegex"` // NodeRole is the nodes expected "role" label NodeRole string `json:"nodeRole"` - // DisableCordon stop the controller cordoning the node - // +kubebuilder:validation:Optional - DisableCordon bool `json:"disableCordon"` // WaitForPods waits for the evicted pods to be running again before completing - // +kubebuilder:validation:Optional - WaitForPodsToRestart bool `json:"waitForPodsToRestart"` + SkipWaitForPodsToRestart bool `json:"skipWaitForPodsToRestart"` } // NodeDrainStatus defines the observed state of NodeDrain. diff --git a/config/crd/bases/k8s.gezb.co.uk_nodedrains.yaml b/config/crd/bases/k8s.gezb.co.uk_nodedrains.yaml index 3d84ee9..037920f 100644 --- a/config/crd/bases/k8s.gezb.co.uk_nodedrains.yaml +++ b/config/crd/bases/k8s.gezb.co.uk_nodedrains.yaml @@ -48,26 +48,24 @@ spec: spec: description: NodeDrainSpec defines the desired state of NodeDrain. 
properties: - disableCordon: - description: DisableCordon stop the controller cordoning the node - type: boolean nodeName: description: NodeName is the name of the node to drain type: string nodeRole: description: NodeRole is the nodes expected "role" label type: string + skipWaitForPodsToRestart: + description: WaitForPods waits for the evicted pods to be running + again before completing + type: boolean versionToDrainRegex: description: VersionToDrainRegex is a regex to match the expected kubernetes version that we want to Drain type: string - waitForPodsToRestart: - description: WaitForPods waits for the evicted pods to be running - again before completing - type: boolean required: - nodeName - nodeRole + - skipWaitForPodsToRestart - versionToDrainRegex type: object status: diff --git a/config/samples/k8s_v1_nodedrain.yaml b/config/samples/k8s_v1_nodedrain.yaml index 52a82fc..7dea3af 100644 --- a/config/samples/k8s_v1_nodedrain.yaml +++ b/config/samples/k8s_v1_nodedrain.yaml @@ -4,5 +4,4 @@ metadata: name: nodedrain-sample spec: nodeName: node1 - disableCordon: false # optional waitForPodsToRestart: true # optional diff --git a/internal/controller/nodedrain_controller.go b/internal/controller/nodedrain_controller.go index 7c781dc..40be898 100644 --- a/internal/controller/nodedrain_controller.go +++ b/internal/controller/nodedrain_controller.go @@ -72,7 +72,6 @@ func (r *NodeDrainReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( // Request object not found, could have been deleted after reconcile request. // Owned objects are automatically garbage collected. For additional cleanup logic use finalizers. // Return and don't requeue - r.logger.Info("NodeDrain not found", "name", req.NamespacedName) return ctrl.Result{}, nil } // Error reading the object - requeue the request. 
@@ -80,20 +79,40 @@ func (r *NodeDrainReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
 		return ctrl.Result{}, err
 	}
 
-	if !controllerutil.ContainsFinalizer(nodeDrain, gezbcoukalphav1.NodeDrainFinalizer) && nodeDrain.DeletionTimestamp.IsZero() {
-		// Add finalizer when object is created
-		controllerutil.AddFinalizer(nodeDrain, gezbcoukalphav1.NodeDrainFinalizer)
-
-	} else if controllerutil.ContainsFinalizer(nodeDrain, gezbcoukalphav1.NodeDrainFinalizer) && !nodeDrain.DeletionTimestamp.IsZero() {
+	if nodeDrain.ObjectMeta.DeletionTimestamp.IsZero() {
+		if !controllerutil.ContainsFinalizer(nodeDrain, gezbcoukalphav1.NodeDrainFinalizer) {
+			controllerutil.AddFinalizer(nodeDrain, gezbcoukalphav1.NodeDrainFinalizer)
+			if err := r.Update(ctx, nodeDrain); err != nil {
+				return ctrl.Result{}, err
+			}
+		}
+	} else { // The object is being deleted
+		if controllerutil.ContainsFinalizer(nodeDrain, gezbcoukalphav1.NodeDrainFinalizer) {
 
-		// Do nothing special on deletion for now
-
-		// Remove finalizer
-		controllerutil.RemoveFinalizer(nodeDrain, gezbcoukalphav1.NodeDrainFinalizer)
-		if err := r.Update(ctx, nodeDrain); err != nil {
-			return r.onReconcileError(ctx, nil, nodeDrain, err)
+			// Remove our annotation from the node before dropping the finalizer
+			drainer, err := createDrainer(ctx, r.MgrConfig)
+			if err != nil {
+				return ctrl.Result{}, err
+			}
+			node, err := r.fetchNode(ctx, drainer, nodeDrain.Spec.NodeName)
+			if err != nil {
+				return ctrl.Result{}, err
+			}
+			if metav1.HasAnnotation(node.ObjectMeta, gezbcoukalphav1.NodeDrainAnnotation) {
+				delete(node.ObjectMeta.Annotations, gezbcoukalphav1.NodeDrainAnnotation)
+				if err := r.Update(ctx, node); err != nil {
+					return ctrl.Result{}, err
+				}
+			}
+
+			// remove our finalizer from the list and update it.
+ controllerutil.RemoveFinalizer(nodeDrain, gezbcoukalphav1.NodeDrainFinalizer) + if err := r.Update(ctx, nodeDrain); err != nil { + return ctrl.Result{}, err + } } + // Stop reconciliation as the item is being deleted return ctrl.Result{}, nil } @@ -119,7 +138,7 @@ func (r *NodeDrainReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( // switch on status switch nodeDrain.Status.Phase { case gezbcoukalphav1.NodeDrainPhasePending: - needUpdate, err = r.reconcilePending(ctx, drainer, nodeDrain) + needsRequeue, needUpdate, err = r.reconcilePending(ctx, drainer, nodeDrain) case gezbcoukalphav1.NodeDrainPhaseCordoned: needUpdate, needsRequeue, err = r.reconcileCordoned(ctx, drainer, nodeDrain) case gezbcoukalphav1.NodeDrainPhasePodsBlocking: @@ -167,7 +186,7 @@ func (r *NodeDrainReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( } // reconcilePending checks the node exists, cordens the node if it needs to and updated -func (r *NodeDrainReconciler) reconcilePending(ctx context.Context, drainer *drain.Helper, nodeDrain *gezbcoukalphav1.NodeDrain) (bool, error) { +func (r *NodeDrainReconciler) reconcilePending(ctx context.Context, drainer *drain.Helper, nodeDrain *gezbcoukalphav1.NodeDrain) (bool, bool, error) { nodeName := nodeDrain.Spec.NodeName node, err := r.fetchNode(ctx, drainer, nodeName) if err != nil { @@ -176,10 +195,10 @@ func (r *NodeDrainReconciler) reconcilePending(ctx context.Context, drainer *dra message := fmt.Sprintf("Node: %s not found", nodeName) publishEvent(r.Recorder, nodeDrain, corev1.EventTypeWarning, events.EventReasonNodeNotFound, message) setStatus(r.Recorder, nodeDrain, gezbcoukalphav1.NodeDrainPhaseFailed) - return true, nil + return false, true, nil } else { r.logger.Error(err, "Unexpected error for the NodeName field", "NodeName", nodeName) - return false, err + return true, false, err } } @@ -188,7 +207,7 @@ func (r *NodeDrainReconciler) reconcilePending(ctx context.Context, drainer *dra r.logger.Info(message) 
publishEvent(r.Recorder, nodeDrain, corev1.EventTypeWarning, events.EventReasonNodeNotFound, message) setStatus(r.Recorder, nodeDrain, gezbcoukalphav1.NodeDrainPhaseFailed) - return true, nil + return false, true, nil } pattern := regexp.MustCompile(nodeDrain.Spec.VersionToDrainRegex) @@ -197,18 +216,20 @@ func (r *NodeDrainReconciler) reconcilePending(ctx context.Context, drainer *dra r.logger.Info(message) publishEvent(r.Recorder, nodeDrain, corev1.EventTypeWarning, events.EventReasonNodeNotFound, message) setStatus(r.Recorder, nodeDrain, gezbcoukalphav1.NodeDrainPhaseFailed) - return true, nil + return false, true, nil } - if nodeDrain.Spec.DisableCordon { - r.logger.Info("NOT Cordoning node", "node", nodeName) - } else { - if !node.Spec.Unschedulable { - // cordon node - r.logger.Info("Cordoning node", "node", nodeName) - if err = drain.RunCordonOrUncordon(drainer, node, true); err != nil { - return false, err - } + // Add Annotation + metav1.SetMetaDataAnnotation(&node.ObjectMeta, gezbcoukalphav1.NodeDrainAnnotation, "true") + if err := r.Update(ctx, node); err != nil { + return true, false, err + } + + if !node.Spec.Unschedulable { + // cordon node + r.logger.Info("Cordoning node", "node", nodeName) + if err = drain.RunCordonOrUncordon(drainer, node, true); err != nil { + return true, false, err } } @@ -217,18 +238,18 @@ func (r *NodeDrainReconciler) reconcilePending(ctx context.Context, drainer *dra FieldSelector: fields.SelectorFromSet(fields.Set{"spec.nodeName": nodeDrain.Spec.NodeName}).String(), }) if err != nil { - return false, err + return true, false, err } nodeDrain.Status.TotalPods = len(podlist.Items) err = r.updatePendingPodCount(drainer, nodeDrain) - nodeDrain.Status.EvictionPodCount = len(nodeDrain.Status.PendingEvictionPods) if err != nil { - return false, err + return false, false, err } + nodeDrain.Status.EvictionPodCount = len(nodeDrain.Status.PendingEvictionPods) setStatus(r.Recorder, nodeDrain, gezbcoukalphav1.NodeDrainPhaseCordoned) - 
return true, nil + return false, true, nil } // reconcileCordoned Checks that no pods exist that block draining this node & Checks all nodes for this k8s version&role are cordened, @@ -245,7 +266,7 @@ func (r *NodeDrainReconciler) reconcileCordoned(ctx context.Context, drainer *dr return true, true, nil } - // update status with pods to be deleted + // update status with pods to be deleted now all the nodes we expect are cordoned // ignore updated here we need to save the CR anyway err = r.updatePendingPodCount(drainer, nodeDrain) if err != nil { @@ -280,7 +301,8 @@ func (r *NodeDrainReconciler) reconcileDraining(drainer *drain.Helper, nodeDrain if pendingList != nil { nodeDrain.Status.PodsToBeEvicted = GetNameSpaceAndName(pendingList.Pods()) } - if nodeDrain.Spec.WaitForPodsToRestart { + if !nodeDrain.Spec.SkipWaitForPodsToRestart { + // populate the list of pods we need to wait for nodeDrain.Status.PodsToRestart = GetNameSpaceAndName(pendingList.Pods()) r.logger.Info(fmt.Sprintf("Pods to wait to restart on node %s - init to %v", nodeDrain.Spec.NodeName, nodeDrain.Status.PodsToRestart)) } @@ -297,13 +319,7 @@ func (r *NodeDrainReconciler) reconcileDraining(drainer *drain.Helper, nodeDrain return false, true, nil } - if nodeDrain.Spec.WaitForPodsToRestart { - message := fmt.Sprintf("Waiting for [%v] pod(s) to restart", nodeDrain.Status.PodsToRestart) - publishEvent(r.Recorder, nodeDrain, corev1.EventTypeNormal, events.EventReasonWaitingForPodsToRestart, message) - setStatus(r.Recorder, nodeDrain, gezbcoukalphav1.NodeDrainPhaseWaitForPodsToRestart) - // requeue to wait for the pods to be running - return true, true, nil - } else { + if nodeDrain.Spec.SkipWaitForPodsToRestart { err := r.updatePendingPodCount(drainer, nodeDrain) if err != nil { return false, false, err @@ -311,6 +327,12 @@ func (r *NodeDrainReconciler) reconcileDraining(drainer *drain.Helper, nodeDrain nodeDrain.Status.DrainProgress = 100 setStatus(r.Recorder, nodeDrain, 
gezbcoukalphav1.NodeDrainPhaseCompleted) return false, false, nil + } else { + message := fmt.Sprintf("Waiting for [%v] pod(s) to restart", nodeDrain.Status.PodsToRestart) + publishEvent(r.Recorder, nodeDrain, corev1.EventTypeNormal, events.EventReasonWaitingForPodsToRestart, message) + setStatus(r.Recorder, nodeDrain, gezbcoukalphav1.NodeDrainPhaseWaitForPodsToRestart) + // requeue to wait for the pods to be running + return true, true, nil } } diff --git a/internal/controller/nodedrain_controller_test.go b/internal/controller/nodedrain_controller_test.go index 7eb39d6..72ac6ca 100644 --- a/internal/controller/nodedrain_controller_test.go +++ b/internal/controller/nodedrain_controller_test.go @@ -37,22 +37,21 @@ var _ = Describe("Node Drain", func() { } ClearEventsCache() }) - var nodeDrainCR *gezbcoukalphav1.NodeDrain - When("NodeDrain CR for a node that is empty", func() { - BeforeEach(func() { + var node *corev1.Node + var nodeDrain *gezbcoukalphav1.NodeDrain + It("Setup for test", func() { node := getTestNode("node1", false) Expect(k8sClient.Create(ctx, node)).To(Succeed()) - DeferCleanup(k8sClient.Delete, ctx, node) - nodeDrainCR = getTestNodeDrain(false, false) - Expect(k8sClient.Create(ctx, nodeDrainCR)).To(Succeed()) - DeferCleanup(k8sClient.Delete, ctx, nodeDrainCR) + nodeDrain = getTestNodeDrain(false) + Expect(k8sClient.Create(ctx, nodeDrain)).To(Succeed()) }) + It("should goto a completed state", func() { nodeDrain := &gezbcoukalphav1.NodeDrain{} Eventually(func(g Gomega) { - g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(nodeDrainCR), nodeDrain)).To(Succeed()) + g.Expect(k8sClient.Get(ctx, client.ObjectKey{Name: "node-drain", Namespace: testNamespace}, nodeDrain)).To(Succeed()) g.Expect(nodeDrain.Status.Phase).To(Equal(gezbcoukalphav1.NodeDrainPhaseCompleted)) }, timeout, interval).Should(Succeed()) @@ -68,23 +67,36 @@ var _ = Describe("Node Drain", func() { Expect(nodeDrain.Status.EvictionPodCount).To(Equal(0)) 
Expect(nodeDrain.Status.DrainProgress).To(Equal(100)) Expect(nodeDrain.Status.PodsBlockingDrain).To(Equal("")) + Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(node), node)) + Expect(metav1.HasAnnotation(node.ObjectMeta, gezbcoukalphav1.NodeDrainAnnotation)).To(BeTrue()) + }) + + It("Cleanup after test", func() { + // if these are left to deferCleanup() we get timing issues + // where the node does not exist when the controller comes to delete the annotation + Expect(k8sClient.Delete(ctx, nodeDrain)).To(Succeed()) + + Expect(k8sClient.Delete(ctx, node)).To(Succeed()) }) }) When("NodeDrain CR for a node that is empty, already cordened", func() { + var node *corev1.Node + var nodeDrain *gezbcoukalphav1.NodeDrain BeforeEach(func() { - node := getTestNode("node1", true) + node = getTestNode("node1", true) Expect(k8sClient.Create(ctx, node)).To(Succeed()) - DeferCleanup(k8sClient.Delete, ctx, node) + Eventually(func(g Gomega) { + g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(node), node)).To(Succeed()) + }, timeout, interval).Should(Succeed()) - nodeDrainCR = getTestNodeDrain(false, false) - Expect(k8sClient.Create(ctx, nodeDrainCR)).To(Succeed()) - DeferCleanup(k8sClient.Delete, ctx, nodeDrainCR) + nodeDrain = getTestNodeDrain(false) + Expect(k8sClient.Create(ctx, nodeDrain)).To(Succeed()) }) It("should goto a completed state", func() { nodeDrain := &gezbcoukalphav1.NodeDrain{} Eventually(func(g Gomega) { - g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(nodeDrainCR), nodeDrain)).To(Succeed()) + g.Expect(k8sClient.Get(ctx, client.ObjectKey{Name: "node-drain", Namespace: testNamespace}, nodeDrain)).To(Succeed()) g.Expect(nodeDrain.Status.Phase).To(Equal(gezbcoukalphav1.NodeDrainPhaseCompleted)) }, timeout, interval).Should(Succeed()) @@ -101,14 +113,24 @@ var _ = Describe("Node Drain", func() { Expect(nodeDrain.Status.DrainProgress).To(Equal(100)) Expect(nodeDrain.Status.PodsBlockingDrain).To(Equal("")) + Expect(k8sClient.Get(ctx, 
client.ObjectKeyFromObject(node), node)) + Expect(metav1.HasAnnotation(node.ObjectMeta, gezbcoukalphav1.NodeDrainAnnotation)).To(BeTrue()) + }) + + It("Cleanup after test", func() { + // if these are left to deferCleanup() we get timing issues + // where the node does not exist when the controller comes to delete the annotation + Expect(k8sClient.Delete(ctx, nodeDrain)).To(Succeed()) + Expect(k8sClient.Delete(ctx, node)).To(Succeed()) }) }) When("NodeDrain CR for a node that is not empty", func() { - BeforeEach(func() { - node := getTestNode("node1", false) + var node *corev1.Node + var nodeDrain *gezbcoukalphav1.NodeDrain + It("Setup for test", func() { + node = getTestNode("node1", false) Expect(k8sClient.Create(ctx, node)).To(Succeed()) - DeferCleanup(k8sClient.Delete, ctx, node) pod1 := getTestPod("pod1") Expect(k8sClient.Create(ctx, pod1)).To(Succeed()) @@ -119,15 +141,13 @@ var _ = Describe("Node Drain", func() { // this will be done by the drain // DeferCleanup(k8sClient.Delete, ctx, pod1) - nodeDrainCR = getTestNodeDrain(false, false) - Expect(k8sClient.Create(ctx, nodeDrainCR)).To(Succeed()) - DeferCleanup(k8sClient.Delete, ctx, nodeDrainCR) + nodeDrain = getTestNodeDrain(false) + Expect(k8sClient.Create(ctx, nodeDrain)).To(Succeed()) }) It("should goto a completed state", func() { - nodeDrain := &gezbcoukalphav1.NodeDrain{} Eventually(func(g Gomega) { - g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(nodeDrainCR), nodeDrain)).To(Succeed()) + g.Expect(k8sClient.Get(ctx, client.ObjectKey{Name: "node-drain", Namespace: testNamespace}, nodeDrain)).To(Succeed()) g.Expect(nodeDrain.Status.Phase).To(Equal(gezbcoukalphav1.NodeDrainPhaseCompleted)) }, timeout, interval).Should(Succeed()) @@ -157,17 +177,26 @@ var _ = Describe("Node Drain", func() { Expect(nodeDrain.Status.EvictionPodCount).To(Equal(3)) Expect(nodeDrain.Status.DrainProgress).To(Equal(100)) Expect(nodeDrain.Status.PodsBlockingDrain).To(Equal("")) + Expect(k8sClient.Get(ctx, 
client.ObjectKeyFromObject(node), node)) + Expect(metav1.HasAnnotation(node.ObjectMeta, gezbcoukalphav1.NodeDrainAnnotation)).To(BeTrue()) + }) + It("Cleanup after test", func() { + // if these are left to deferCleanup() we get timing issues + // where the node does not exist when the controller comes to delete the annotation + Expect(k8sClient.Delete(ctx, nodeDrain)).To(Succeed()) + Expect(k8sClient.Delete(ctx, node)).To(Succeed()) }) }) When("Drain gets blocked by a draincheck", func() { var blockpod *corev1.Pod var blockpod2 *corev1.Pod - BeforeEach(func() { - node := getTestNode("node1", false) + var node *corev1.Node + var nodeDrain *gezbcoukalphav1.NodeDrain + It("Setup for test", func() { + node = getTestNode("node1", false) Expect(k8sClient.Create(ctx, node)).To(Succeed()) - DeferCleanup(k8sClient.Delete, ctx, node) pod1 := getTestPod("pod1") Expect(k8sClient.Create(ctx, pod1)).To(Succeed()) @@ -188,23 +217,23 @@ var _ = Describe("Node Drain", func() { // this will be done later in the test to complete the drain DeferCleanup(k8sClient.Delete, ctx, drainCheck) - nodeDrainCR = getTestNodeDrain(false, false) - Expect(k8sClient.Create(ctx, nodeDrainCR)).To(Succeed()) - DeferCleanup(k8sClient.Delete, ctx, nodeDrainCR) + nodeDrain = getTestNodeDrain(false) + Expect(k8sClient.Create(ctx, nodeDrain)).To(Succeed()) }) It(" should be blocked by pods block-1 and block-2", func() { - nodeDrain := &gezbcoukalphav1.NodeDrain{} Eventually(func(g Gomega) { - g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(nodeDrainCR), nodeDrain)).To(Succeed()) + g.Expect(k8sClient.Get(ctx, client.ObjectKey{Name: "node-drain", Namespace: testNamespace}, nodeDrain)).To(Succeed()) g.Expect(nodeDrain.Status.Phase).To(Equal(gezbcoukalphav1.NodeDrainPhasePodsBlocking)) }, timeout, interval).Should(Succeed()) + Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(node), node)) + Expect(metav1.HasAnnotation(node.ObjectMeta, gezbcoukalphav1.NodeDrainAnnotation)).To(BeTrue()) 
verifyStatusEvent(gezbcoukalphav1.NodeDrainPhasePending) verifyStatusEvent(gezbcoukalphav1.NodeDrainPhaseCordoned) verifyStatusEvent(gezbcoukalphav1.NodeDrainPhasePodsBlocking) - Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(nodeDrainCR), nodeDrain)).To(Succeed()) + Expect(k8sClient.Get(ctx, client.ObjectKey{Name: "node-drain", Namespace: testNamespace}, nodeDrain)).To(Succeed()) Expect(nodeDrain.Status.PodsBlockingDrain).To(Equal("block-1,block-2")) Expect(nodeDrain.Status.LastError).To(Equal("")) @@ -221,7 +250,7 @@ var _ = Describe("Node Drain", func() { By("Check the status has been updated") Eventually(func(g Gomega) { - g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(nodeDrainCR), nodeDrain)).To(Succeed()) + g.Expect(k8sClient.Get(ctx, client.ObjectKey{Name: "node-drain", Namespace: testNamespace}, nodeDrain)).To(Succeed()) g.Expect(nodeDrain.Status.PendingEvictionPods).To(HaveLen(3)) }, timeout, interval).Should(Succeed()) @@ -239,7 +268,7 @@ var _ = Describe("Node Drain", func() { By("The drain should run to completion") Eventually(func(g Gomega) { - g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(nodeDrainCR), nodeDrain)).To(Succeed()) + g.Expect(k8sClient.Get(ctx, client.ObjectKey{Name: "node-drain", Namespace: testNamespace}, nodeDrain)).To(Succeed()) g.Expect(nodeDrain.Status.Phase).To(Equal(gezbcoukalphav1.NodeDrainPhaseCompleted)) }, timeout, interval).Should(Succeed()) @@ -264,35 +293,44 @@ var _ = Describe("Node Drain", func() { Expect(nodeDrain.Status.EvictionPodCount).To(Equal(4)) Expect(nodeDrain.Status.DrainProgress).To(Equal(100)) Expect(nodeDrain.Status.PodsBlockingDrain).To(Equal("")) + Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(node), node)) + Expect(metav1.HasAnnotation(node.ObjectMeta, gezbcoukalphav1.NodeDrainAnnotation)).To(BeTrue()) + }) + + It("Cleanup after test", func() { + // if these are left to deferCleanup() we get timing issues + // where the node does not exist when the controller comes to delete 
the annotation + Expect(k8sClient.Delete(ctx, nodeDrain)).To(Succeed()) + Expect(k8sClient.Delete(ctx, node)).To(Succeed()) }) }) When("Drain does not run if another node for the same role & version has not been cordened ", func() { - var node2 *corev1.Node - BeforeEach(func() { - node := getTestNode("node1", true) + var node, node2 *corev1.Node + var nodeDrain *gezbcoukalphav1.NodeDrain + It("Setup for test", func() { + node = getTestNode("node1", true) Expect(k8sClient.Create(ctx, node)).To(Succeed()) - DeferCleanup(k8sClient.Delete, ctx, node) node2 = getTestNode("uncodened-node", false) Expect(k8sClient.Create(ctx, node2)).To(Succeed()) - DeferCleanup(k8sClient.Delete, ctx, node2) pod1 := getTestPod("pod1") Expect(k8sClient.Create(ctx, pod1)).To(Succeed()) // this will be done by the drain // DeferCleanup(k8sClient.Delete, ctx, pod1) - nodeDrainCR = getTestNodeDrain(true, false) - Expect(k8sClient.Create(ctx, nodeDrainCR)).To(Succeed()) - DeferCleanup(k8sClient.Delete, ctx, nodeDrainCR) + nodeDrain = getTestNodeDrain(false) + Expect(k8sClient.Create(ctx, nodeDrain)).To(Succeed()) }) It("should stop at OtherNodesNotCordoned when another node is not cordened", func() { nodeDrain := &gezbcoukalphav1.NodeDrain{} Eventually(func(g Gomega) { - g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(nodeDrainCR), nodeDrain)).To(Succeed()) + g.Expect(k8sClient.Get(ctx, client.ObjectKey{Name: "node-drain", Namespace: testNamespace}, nodeDrain)).To(Succeed()) g.Expect(nodeDrain.Status.Phase).To(Equal(gezbcoukalphav1.NodeDrainPhaseOtherNodesNotCordoned)) }, timeout, interval).Should(Succeed()) + Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(node), node)) + Expect(metav1.HasAnnotation(node.ObjectMeta, gezbcoukalphav1.NodeDrainAnnotation)).To(BeTrue()) verifyStatusEvent(gezbcoukalphav1.NodeDrainPhasePending) verifyStatusEvent(gezbcoukalphav1.NodeDrainPhaseCordoned) @@ -319,7 +357,7 @@ var _ = Describe("Node Drain", func() { Expect(k8sClient.Update(ctx, 
node2)).To(Succeed()) Eventually(func(g Gomega) { - g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(nodeDrainCR), nodeDrain)).To(Succeed()) + g.Expect(k8sClient.Get(ctx, client.ObjectKey{Name: "node-drain", Namespace: testNamespace}, nodeDrain)).To(Succeed()) g.Expect(nodeDrain.Status.Phase).To(Equal(gezbcoukalphav1.NodeDrainPhaseCompleted)) }, timeout, interval).Should(Succeed()) @@ -334,14 +372,25 @@ var _ = Describe("Node Drain", func() { Expect(nodeDrain.Status.EvictionPodCount).To(Equal(1)) Expect(nodeDrain.Status.DrainProgress).To(Equal(100)) Expect(nodeDrain.Status.PodsBlockingDrain).To(Equal("")) + Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(node), node)) + Expect(metav1.HasAnnotation(node.ObjectMeta, gezbcoukalphav1.NodeDrainAnnotation)).To(BeTrue()) + }) + + It("Cleanup after test", func() { + // if these are left to deferCleanup() we get timing issues + // where the node does not exist when the controller comes to delete the annotation + Expect(k8sClient.Delete(ctx, nodeDrain)).To(Succeed()) + Expect(k8sClient.Delete(ctx, node)).To(Succeed()) + Expect(k8sClient.Delete(ctx, node2)).To(Succeed()) }) }) When("NodeDrain CR for a node that is not empty", func() { - BeforeEach(func() { - node := getTestNode("node1", false) + var node *corev1.Node + var nodeDrain *gezbcoukalphav1.NodeDrain + It("Setup for test", func() { + node = getTestNode("node1", false) Expect(k8sClient.Create(ctx, node)).To(Succeed()) - DeferCleanup(k8sClient.Delete, ctx, node) pod1 := getTestPod("pod1") Expect(k8sClient.Create(ctx, pod1)).To(Succeed()) @@ -354,14 +403,13 @@ var _ = Describe("Node Drain", func() { // DeferCleanup(k8sClient.Delete, ctx, pod2) // DeferCleanup(k8sClient.Delete, ctx, pod3) - nodeDrainCR = getTestNodeDrain(false, false) - Expect(k8sClient.Create(ctx, nodeDrainCR)).To(Succeed()) - DeferCleanup(k8sClient.Delete, ctx, nodeDrainCR) + nodeDrain = getTestNodeDrain(false) + Expect(k8sClient.Create(ctx, nodeDrain)).To(Succeed()) }) It("should goto a 
completed state", func() { nodeDrain := &gezbcoukalphav1.NodeDrain{} Eventually(func(g Gomega) { - g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(nodeDrainCR), nodeDrain)).To(Succeed()) + g.Expect(k8sClient.Get(ctx, client.ObjectKey{Name: "node-drain", Namespace: testNamespace}, nodeDrain)).To(Succeed()) g.Expect(nodeDrain.Status.Phase).To(Equal(gezbcoukalphav1.NodeDrainPhaseCompleted)) }, timeout, interval).Should(Succeed()) @@ -392,21 +440,29 @@ var _ = Describe("Node Drain", func() { Expect(nodeDrain.Status.PodsToRestart).To(BeEmpty()) Expect(nodeDrain.Status.DrainProgress).To(Equal(100)) Expect(nodeDrain.Status.PodsBlockingDrain).To(Equal("")) + Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(node), node)) + Expect(metav1.HasAnnotation(node.ObjectMeta, gezbcoukalphav1.NodeDrainAnnotation)).To(BeTrue()) + }) + It("Cleanup after test", func() { + // if these are left to deferCleanup() we get timing issues + // where the node does not exist when the controller comes to delete the annotation + Expect(k8sClient.Delete(ctx, nodeDrain)).To(Succeed()) + Expect(k8sClient.Delete(ctx, node)).To(Succeed()) }) }) When("NodeDrain CR for a node that does not exist", func() { - BeforeEach(func() { - nodeDrainCR = getTestNodeDrain(false, false) - Expect(k8sClient.Create(ctx, nodeDrainCR)).To(Succeed()) - DeferCleanup(k8sClient.Delete, ctx, nodeDrainCR) + var nodeDrain *gezbcoukalphav1.NodeDrain + It("Setup for test", func() { + nodeDrain = getTestNodeDrain(false) + Expect(k8sClient.Create(ctx, nodeDrain)).To(Succeed()) }) It("should goto a failed state as the node does not exist", func() { By("check nodeDrain CR status was Failed") nodeDrain := &gezbcoukalphav1.NodeDrain{} Eventually(func(g Gomega) { - g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(nodeDrainCR), nodeDrain)).To(Succeed()) + g.Expect(k8sClient.Get(ctx, client.ObjectKey{Name: "node-drain", Namespace: testNamespace}, nodeDrain)).To(Succeed()) 
g.Expect(nodeDrain.Status.Phase).To(Equal(gezbcoukalphav1.NodeDrainPhaseFailed)) }, timeout, interval).Should(Succeed()) @@ -424,14 +480,21 @@ var _ = Describe("Node Drain", func() { Expect(nodeDrain.Status.DrainProgress).To(Equal(0)) Expect(nodeDrain.Status.PodsBlockingDrain).To(Equal("")) }) + + It("Cleanup after test", func() { + // if these are left to deferCleanup() we get timing issues + // where the node does not exist when the controller comes to delete the annotation + Expect(k8sClient.Delete(ctx, nodeDrain)).To(Succeed()) + }) }) When("NodeDrain CR for a node that is not empty, waits for pods to restart when requested", func() { - BeforeEach(func() { + var node *corev1.Node + var nodeDrain *gezbcoukalphav1.NodeDrain + It("Setup for test", func() { statusChecker = neverTruePodStatusChecker{} - node := getTestNode("node1", false) + node = getTestNode("node1", false) Expect(k8sClient.Create(ctx, node)).To(Succeed()) - DeferCleanup(k8sClient.Delete, ctx, node) pod1 := getTestPod("pod1") Expect(k8sClient.Create(ctx, pod1)).To(Succeed()) @@ -444,9 +507,8 @@ var _ = Describe("Node Drain", func() { // DeferCleanup(k8sClient.Delete, ctx, pod2) // DeferCleanup(k8sClient.Delete, ctx, pod3) - nodeDrainCR = getTestNodeDrain(false, true) - Expect(k8sClient.Create(ctx, nodeDrainCR)).To(Succeed()) - DeferCleanup(k8sClient.Delete, ctx, nodeDrainCR) + nodeDrain = getTestNodeDrain(true) + Expect(k8sClient.Create(ctx, nodeDrain)).To(Succeed()) }) AfterEach(func() { @@ -457,7 +519,7 @@ var _ = Describe("Node Drain", func() { nodeDrain := &gezbcoukalphav1.NodeDrain{} Eventually(func(g Gomega) { - g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(nodeDrainCR), nodeDrain)).To(Succeed()) + g.Expect(k8sClient.Get(ctx, client.ObjectKey{Name: "node-drain", Namespace: testNamespace}, nodeDrain)).To(Succeed()) g.Expect(nodeDrain.Status.Phase).To(Equal(gezbcoukalphav1.NodeDrainPhaseWaitForPodsToRestart)) }, timeout, interval).Should(Succeed()) @@ -488,16 +550,25 @@ var _ = 
Describe("Node Drain", func() { Expect(nodeDrain.Status.PodsToRestart).To(Equal(podsToBeEvicted)) Expect(nodeDrain.Status.DrainProgress).To(Equal(0)) Expect(nodeDrain.Status.PodsBlockingDrain).To(Equal("")) + Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(node), node)) + Expect(metav1.HasAnnotation(node.ObjectMeta, gezbcoukalphav1.NodeDrainAnnotation)).To(BeTrue()) + }) + It("Cleanup after test", func() { + // if these are left to deferCleanup() we get timing issues + // where the node does not exist when the controller comes to delete the annotation + Expect(k8sClient.Delete(ctx, nodeDrain)).To(Succeed()) + Expect(k8sClient.Delete(ctx, node)).To(Succeed()) }) }) When("NodeDrain CR for a node that does not match the role", func() { - BeforeEach(func() { - node := getTestNode("node1", false) + var node *corev1.Node + var nodeDrain *gezbcoukalphav1.NodeDrain + It("Setup for test", func() { + node = getTestNode("node1", false) node.Labels["role"] = "not-test-role" Expect(k8sClient.Create(ctx, node)).To(Succeed()) - DeferCleanup(k8sClient.Delete, ctx, node) pod1 := getTestPod("pod1") Expect(k8sClient.Create(ctx, pod1)).To(Succeed()) @@ -509,15 +580,14 @@ var _ = Describe("Node Drain", func() { DeferCleanup(k8sClient.Delete, ctx, pod2) DeferCleanup(k8sClient.Delete, ctx, pod3) - nodeDrainCR = getTestNodeDrain(false, false) - Expect(k8sClient.Create(ctx, nodeDrainCR)).To(Succeed()) - DeferCleanup(k8sClient.Delete, ctx, nodeDrainCR) + nodeDrain = getTestNodeDrain(false) + Expect(k8sClient.Create(ctx, nodeDrain)).To(Succeed()) }) It("should goto a failed state", func() { nodeDrain := &gezbcoukalphav1.NodeDrain{} Eventually(func(g Gomega) { - g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(nodeDrainCR), nodeDrain)).To(Succeed()) + g.Expect(k8sClient.Get(ctx, client.ObjectKey{Name: "node-drain", Namespace: testNamespace}, nodeDrain)).To(Succeed()) g.Expect(nodeDrain.Status.Phase).To(Equal(gezbcoukalphav1.NodeDrainPhaseFailed)) }, timeout, 
interval).Should(Succeed()) @@ -534,15 +604,24 @@ var _ = Describe("Node Drain", func() { Expect(nodeDrain.Status.EvictionPodCount).To(Equal(0)) Expect(nodeDrain.Status.DrainProgress).To(Equal(0)) Expect(nodeDrain.Status.PodsBlockingDrain).To(Equal("")) + Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(node), node)) + Expect(metav1.HasAnnotation(node.ObjectMeta, gezbcoukalphav1.NodeDrainAnnotation)).To(BeFalse()) + }) + It("Cleanup after test", func() { + // if these are left to deferCleanup() we get timing issues + // where the node does not exist when the controller comes to delete the annotation + Expect(k8sClient.Delete(ctx, nodeDrain)).To(Succeed()) + Expect(k8sClient.Delete(ctx, node)).To(Succeed()) }) }) When("NodeDrain CR for a node that has version does not match the regex", func() { - BeforeEach(func() { - node := getTestNode("node1", false) + var node *corev1.Node + var nodeDrain *gezbcoukalphav1.NodeDrain + It("Setup for test", func() { + node = getTestNode("node1", false) Expect(k8sClient.Create(ctx, node)).To(Succeed()) - DeferCleanup(k8sClient.Delete, ctx, node) pod1 := getTestPod("pod1") Expect(k8sClient.Create(ctx, pod1)).To(Succeed()) @@ -554,16 +633,15 @@ var _ = Describe("Node Drain", func() { DeferCleanup(k8sClient.Delete, ctx, pod2) DeferCleanup(k8sClient.Delete, ctx, pod3) - nodeDrainCR = getTestNodeDrain(false, false) - nodeDrainCR.Spec.VersionToDrainRegex = "^1.34.*$" - Expect(k8sClient.Create(ctx, nodeDrainCR)).To(Succeed()) - DeferCleanup(k8sClient.Delete, ctx, nodeDrainCR) + nodeDrain = getTestNodeDrain(false) + nodeDrain.Spec.VersionToDrainRegex = "^1.34.*$" + Expect(k8sClient.Create(ctx, nodeDrain)).To(Succeed()) }) It("should goto a failed state", func() { nodeDrain := &gezbcoukalphav1.NodeDrain{} Eventually(func(g Gomega) { - g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(nodeDrainCR), nodeDrain)).To(Succeed()) + g.Expect(k8sClient.Get(ctx, client.ObjectKey{Name: "node-drain", Namespace: testNamespace}, 
nodeDrain)).To(Succeed()) g.Expect(nodeDrain.Status.Phase).To(Equal(gezbcoukalphav1.NodeDrainPhaseFailed)) }, timeout, interval).Should(Succeed()) @@ -580,10 +658,73 @@ var _ = Describe("Node Drain", func() { Expect(nodeDrain.Status.EvictionPodCount).To(Equal(0)) Expect(nodeDrain.Status.DrainProgress).To(Equal(0)) Expect(nodeDrain.Status.PodsBlockingDrain).To(Equal("")) + Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(node), node)) + Expect(metav1.HasAnnotation(node.ObjectMeta, gezbcoukalphav1.NodeDrainAnnotation)).To(BeFalse()) + }) + It("Cleanup after test", func() { + // if these are left to deferCleanup() we get timing issues + // where the node does not exist when the controller comes to delete the annotation + Expect(k8sClient.Delete(ctx, nodeDrain)).To(Succeed()) + Expect(k8sClient.Delete(ctx, node)).To(Succeed()) }) }) + When("Drain does not run if another node for the same role & version has not been cordened ", func() { + var node, node2 *corev1.Node + var nodeDrain *gezbcoukalphav1.NodeDrain + It("Setup for test", func() { + node = getTestNode("node1", true) + Expect(k8sClient.Create(ctx, node)).To(Succeed()) + node2 = getTestNode("uncodened-node", false) + Expect(k8sClient.Create(ctx, node2)).To(Succeed()) + + pod1 := getTestPod("pod1") + Expect(k8sClient.Create(ctx, pod1)).To(Succeed()) + // this will be done by the drain + // DeferCleanup(k8sClient.Delete, ctx, pod1) + + nodeDrain = getTestNodeDrain(false) + Expect(k8sClient.Create(ctx, nodeDrain)).To(Succeed()) + }) + + It("when a nodeDrain is deleted annotation gets removed", func() { + Eventually(func(g Gomega) { + g.Expect(k8sClient.Get(ctx, client.ObjectKey{Name: "node-drain", Namespace: testNamespace}, nodeDrain)).To(Succeed()) + g.Expect(nodeDrain.Status.Phase).To(Equal(gezbcoukalphav1.NodeDrainPhaseOtherNodesNotCordoned)) + }, timeout, interval).Should(Succeed()) + Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(node), node)) + Expect(metav1.HasAnnotation(node.ObjectMeta, 
gezbcoukalphav1.NodeDrainAnnotation)).To(BeTrue()) + + verifyStatusEvent(gezbcoukalphav1.NodeDrainPhasePending) + verifyStatusEvent(gezbcoukalphav1.NodeDrainPhaseCordoned) + verifyStatusEvent(gezbcoukalphav1.NodeDrainPhaseOtherNodesNotCordoned) + + Expect(nodeDrain.Status.LastError).To(Equal("")) + Expect(nodeDrain.Status.TotalPods).To(Equal(1)) + Expect(nodeDrain.Status.PodsToBeEvicted).To(BeNil()) + Expect(nodeDrain.Status.PendingEvictionPods).To(Equal([]string{"pod1"})) + Expect(nodeDrain.Status.PodsToRestart).To(BeEmpty()) + Expect(nodeDrain.Status.EvictionPodCount).To(Equal(1)) + Expect(nodeDrain.Status.DrainProgress).To(Equal(0)) + Expect(nodeDrain.Status.PodsBlockingDrain).To(Equal("")) + + Expect(k8sClient.Get(ctx, client.ObjectKey{Name: "node-drain", Namespace: testNamespace}, nodeDrain)).To(Succeed()) + Expect(k8sClient.Delete(ctx, nodeDrain)).To(Succeed()) + Eventually(func(g Gomega) { + g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(node), node)).To(Succeed()) + g.Expect(metav1.HasAnnotation(node.ObjectMeta, gezbcoukalphav1.NodeDrainAnnotation)).To(BeFalse()) + }, timeout, interval).Should(Succeed()) + }) + + It("Cleanup after test", func() { + // if these are left to deferCleanup() we get timing issues + // where the node does not exist when the controller comes to delete the annotation + Expect(k8sClient.Delete(ctx, nodeDrain)).To(Succeed()) + Expect(k8sClient.Delete(ctx, node)).To(Succeed()) + Expect(k8sClient.Delete(ctx, node2)).To(Succeed()) + }) + }) }) func getTestPod(podName string) *corev1.Pod { @@ -613,18 +754,17 @@ func getTestPod(podName string) *corev1.Pod { } } -func getTestNodeDrain(disableCordon bool, waitForRestart bool) *gezbcoukalphav1.NodeDrain { +func getTestNodeDrain(waitForRestart bool) *gezbcoukalphav1.NodeDrain { return &gezbcoukalphav1.NodeDrain{ ObjectMeta: metav1.ObjectMeta{ Name: "node-drain", Namespace: testNamespace, }, Spec: gezbcoukalphav1.NodeDrainSpec{ - NodeName: "node1", - VersionToDrainRegex: "^1.35.*$", - 
NodeRole: "test-role", - DisableCordon: disableCordon, - WaitForPodsToRestart: waitForRestart, + NodeName: "node1", + VersionToDrainRegex: "^1.35.*$", + NodeRole: "test-role", + SkipWaitForPodsToRestart: !waitForRestart, }, } } diff --git a/internal/controller/suite_test.go b/internal/controller/suite_test.go index b47be42..5f2f6b2 100644 --- a/internal/controller/suite_test.go +++ b/internal/controller/suite_test.go @@ -2,6 +2,7 @@ package controller import ( "context" + "fmt" "os" "path/filepath" "sync" @@ -134,7 +135,7 @@ func (e *EventRecorder) DetectedEvent(reason string, msg string) bool { func TestControllers(t *testing.T) { RegisterFailHandler(Fail) - + _, _ = fmt.Fprintf(GinkgoWriter, "Starting node-drainer unit test suite\n") suiteConfig, reporterConfig := GinkgoConfiguration() reporterConfig.Verbose = true RunSpecs(t, "Controller Suite", suiteConfig, reporterConfig) diff --git a/test/utils/utils.go b/test/utils/utils.go index 9dc40a2..630d0a6 100644 --- a/test/utils/utils.go +++ b/test/utils/utils.go @@ -289,8 +289,8 @@ spec: nodeName: %s versionToDrainRegex: %s nodeRole: %s - waitForPodsToRestart: %t -`, nodeName, nodeName, versionToDrainRegex, role, waitForPodToRestart) + skipWaitForPodsToRestart: %t +`, nodeName, nodeName, versionToDrainRegex, role, !waitForPodToRestart) nodeDrainFile, err := CreateTempFile(nodeDrain) Expect(err).NotTo(HaveOccurred(), "Failed creating NodeDrain file to apply: "+nodeDrainFile)