
Commit 763a520

Add OTE helper functions in test/library/ote
Add two new OTE helper functions for common test scenarios:

1. WaitForAPIServerRollout: Waits for all API server pods to be recreated after a configuration change. Unlike WaitForAPIServerToStabilizeOnTheSameRevision, this specifically waits for NEW pods to replace old ones.
2. WaitForFeatureGateEnabled: Waits for a specific feature gate to be enabled in the cluster by polling the FeatureGate resource.

These functions are needed for testing configuration changes and feature gate enablement in operator e2e tests, particularly for EventTTL configuration tests in cluster-kube-apiserver-operator.
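For context, a minimal sketch (not part of this commit) of how a configuration-change test might combine these helpers; the package name, the ote import path, the client variables, the namespace, and the timeouts below are illustrative assumptions:

package e2e_test // illustrative placement; the ote import path is assumed from this repo's layout

import (
	"testing"
	"time"

	configv1client "github.com/openshift/client-go/config/clientset/versioned/typed/config/v1"
	"github.com/openshift/library-go/test/ote"
	"k8s.io/client-go/kubernetes"
)

// verifyRolloutAfterConfigChange is a hypothetical test step: after applying a
// configuration change elsewhere, wait for new API server pods and then for a
// healthy ClusterOperator.
func verifyRolloutAfterConfigChange(t *testing.T, kubeClient kubernetes.Interface, configClient configv1client.ConfigV1Interface) {
	podClient := kubeClient.CoreV1().Pods("openshift-kube-apiserver") // namespace assumed for illustration
	coClient := configClient.ClusterOperators()

	// Wait until every API server pod has been replaced by a new, running pod.
	if err := ote.WaitForAPIServerRollout(t, podClient, "apiserver=true", 30*time.Minute); err != nil {
		t.Fatal(err)
	}
	// Then wait for the operator to settle into Available=True, Progressing=False, Degraded=False.
	if err := ote.WaitForClusterOperatorHealthy(t, coClient, "kube-apiserver", 10*time.Minute, 1.0); err != nil {
		t.Fatal(err)
	}
}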
1 parent ab97ebb commit 763a520

File tree

1 file changed: +374 −0 lines changed


test/ote/util.go

Lines changed: 374 additions & 0 deletions
@@ -0,0 +1,374 @@
package ote

import (
	"context"
	"fmt"
	"reflect"
	"time"

	configv1client "github.com/openshift/client-go/config/clientset/versioned/typed/config/v1"
	"github.com/openshift/library-go/test/library"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/wait"
	corev1client "k8s.io/client-go/kubernetes/typed/core/v1"
)

// WaitForClusterOperatorStatus waits for a ClusterOperator to reach the expected status conditions.
// Example usage:
//
//	err := WaitForClusterOperatorStatus(t, coClient, "kube-apiserver",
//		map[string]string{"Available": "True", "Progressing": "False", "Degraded": "False"},
//		10*time.Minute, 1.0)
func WaitForClusterOperatorStatus(t library.LoggingT, coClient configv1client.ClusterOperatorInterface, coName string, expectedStatus map[string]string, timeout time.Duration, waitMultiplier float64) error {
	stableDelay := 100 * time.Second

	// Apply timeout multiplier
	if waitMultiplier != 1.0 {
		timeout = time.Duration(float64(timeout) * waitMultiplier)
		t.Logf("Adjusted timeout for cluster type: %v (multiplier: %.2f)", timeout, waitMultiplier)
	}

	t.Logf("Waiting for ClusterOperator %s to reach status %v (timeout: %v)", coName, expectedStatus, timeout)

	attempt := 0
	consecutiveErrors := 0
	maxConsecutiveErrors := 5
	var lastStatus map[string]string
	var lastConditionDetails map[string]conditionDetail

	errCo := wait.PollUntilContextTimeout(context.Background(), 20*time.Second, timeout, false, func(ctx context.Context) (bool, error) {
		attempt++
		gottenStatus, conditionDetails, err := getClusterOperatorConditionStatusWithDetails(ctx, coClient, coName, expectedStatus)
		if err != nil {
			consecutiveErrors++
			t.Logf("[Attempt %d] Error getting ClusterOperator status: %v (consecutive errors: %d)", attempt, err, consecutiveErrors)
			// Fail fast if we hit too many consecutive errors
			if consecutiveErrors >= maxConsecutiveErrors {
				return false, fmt.Errorf("too many consecutive errors (%d) getting ClusterOperator status: %w", consecutiveErrors, err)
			}
			return false, nil
		}
		consecutiveErrors = 0

		// Log detailed status changes
		if !reflect.DeepEqual(lastStatus, gottenStatus) && lastStatus != nil {
			logConditionChanges(t, attempt, coName, lastConditionDetails, conditionDetails)
		}
		lastStatus = gottenStatus
		lastConditionDetails = conditionDetails

		eq := reflect.DeepEqual(expectedStatus, gottenStatus)
		if eq {
			// Check if this is the stable healthy state
			isHealthyState := reflect.DeepEqual(expectedStatus, map[string]string{"Available": "True", "Progressing": "False", "Degraded": "False"})
			if isHealthyState {
				// For True/False/False, wait some additional time and double check to ensure it is stably healthy
				t.Logf("[Attempt %d] ClusterOperator %s reached healthy state, waiting %v for stability check", attempt, coName, stableDelay)
				time.Sleep(stableDelay)

				gottenStatus, conditionDetails, err := getClusterOperatorConditionStatusWithDetails(ctx, coClient, coName, expectedStatus)
				if err != nil {
					t.Logf("Error during stability check: %v", err)
					return false, nil
				}

				eq := reflect.DeepEqual(expectedStatus, gottenStatus)
				if eq {
					t.Logf("ClusterOperator %s is stably available/non-progressing/non-degraded", coName)
					return true, nil
				}
				t.Logf("ClusterOperator %s became unstable during stability check", coName)
				logConditionDetails(t, conditionDetails)
				return false, nil
			} else {
				t.Logf("[Attempt %d] ClusterOperator %s reached expected status: %v", attempt, coName, gottenStatus)
				return true, nil
			}
		}

		// Log progress every 3 attempts (1 minute) with detailed condition info
		if attempt%3 == 0 {
			t.Logf("[Attempt %d] ClusterOperator %s status check:", attempt, coName)
			logConditionDetails(t, conditionDetails)
			t.Logf(" Expected: %v", expectedStatus)
		}
		return false, nil
	})

	if errCo != nil {
		t.Logf("Failed waiting for ClusterOperator %s to reach expected status", coName)
		if lastConditionDetails != nil {
			t.Logf("Final ClusterOperator %s status:", coName)
			logConditionDetails(t, lastConditionDetails)
		}
	}
	return errCo
}

// conditionDetail holds detailed information about a ClusterOperator condition
type conditionDetail struct {
	Type               string
	Status             string
	Reason             string
	Message            string
	LastTransitionTime time.Time
}

// getClusterOperatorConditionStatusWithDetails retrieves detailed status information for specified conditions
func getClusterOperatorConditionStatusWithDetails(ctx context.Context, coClient configv1client.ClusterOperatorInterface, coName string, statusToCompare map[string]string) (map[string]string, map[string]conditionDetail, error) {
	co, err := coClient.Get(ctx, coName, metav1.GetOptions{})
	if err != nil {
		return nil, nil, fmt.Errorf("failed to get ClusterOperator %s: %w", coName, err)
	}

	statusMap := make(map[string]string)
	detailsMap := make(map[string]conditionDetail)

	for conditionType := range statusToCompare {
		found := false
		for _, condition := range co.Status.Conditions {
			if string(condition.Type) == conditionType {
				statusMap[conditionType] = string(condition.Status)
				detailsMap[conditionType] = conditionDetail{
					Type:               string(condition.Type),
					Status:             string(condition.Status),
					Reason:             condition.Reason,
					Message:            condition.Message,
					LastTransitionTime: condition.LastTransitionTime.Time,
				}
				found = true
				break
			}
		}
		if !found {
			statusMap[conditionType] = "Unknown"
			detailsMap[conditionType] = conditionDetail{
				Type:    conditionType,
				Status:  "Unknown",
				Reason:  "ConditionNotFound",
				Message: "Condition not present in ClusterOperator status",
			}
		}
	}

	return statusMap, detailsMap, nil
}

// logConditionDetails logs detailed information about all conditions
func logConditionDetails(t library.LoggingT, details map[string]conditionDetail) {
	// Sort condition types for consistent output
	conditionTypes := []string{"Available", "Progressing", "Degraded"}

	for _, condType := range conditionTypes {
		if detail, ok := details[condType]; ok {
			if detail.Status == "Unknown" {
				t.Logf(" %s: %s (%s)", detail.Type, detail.Status, detail.Reason)
			} else {
				msg := detail.Message
				if len(msg) > 100 {
					msg = msg[:97] + "..."
				}
				if detail.Reason != "" {
					t.Logf(" %s=%s (reason: %s, message: %s)", detail.Type, detail.Status, detail.Reason, msg)
				} else {
					t.Logf(" %s=%s (message: %s)", detail.Type, detail.Status, msg)
				}
			}
		}
	}

	// Log any other conditions not in the standard set
	for condType, detail := range details {
		isStandard := false
		for _, std := range conditionTypes {
			if condType == std {
				isStandard = true
				break
			}
		}
		if !isStandard {
			msg := detail.Message
			if len(msg) > 100 {
				msg = msg[:97] + "..."
			}
			t.Logf(" %s=%s (reason: %s, message: %s)", detail.Type, detail.Status, detail.Reason, msg)
		}
	}
}

// logConditionChanges logs what changed between two condition states
func logConditionChanges(t library.LoggingT, attempt int, coName string, oldDetails, newDetails map[string]conditionDetail) {
	t.Logf("[Attempt %d] ClusterOperator %s status changed:", attempt, coName)
	for condType, newDetail := range newDetails {
		if oldDetail, ok := oldDetails[condType]; ok {
			if oldDetail.Status != newDetail.Status {
				t.Logf(" %s: %s -> %s (reason: %s)", condType, oldDetail.Status, newDetail.Status, newDetail.Reason)
				if newDetail.Message != "" {
					msg := newDetail.Message
					if len(msg) > 150 {
						msg = msg[:147] + "..."
					}
					t.Logf(" Message: %s", msg)
				}
			} else if oldDetail.Reason != newDetail.Reason || oldDetail.Message != newDetail.Message {
				t.Logf(" %s: %s (reason changed: %s -> %s)", condType, newDetail.Status, oldDetail.Reason, newDetail.Reason)
			}
		} else {
			t.Logf(" %s: (new) -> %s (reason: %s)", condType, newDetail.Status, newDetail.Reason)
		}
	}
}

// GetClusterOperatorConditionStatus retrieves the current status values for specified conditions of a ClusterOperator.
// Example usage:
//
//	status, err := GetClusterOperatorConditionStatus(ctx, coClient, "kube-apiserver",
//		map[string]string{"Available": "", "Progressing": "", "Degraded": ""})
//	Returns: map[string]string{"Available": "True", "Progressing": "False", "Degraded": "False"}
func GetClusterOperatorConditionStatus(ctx context.Context, coClient configv1client.ClusterOperatorInterface, coName string, statusToCompare map[string]string) (map[string]string, error) {
	co, err := coClient.Get(ctx, coName, metav1.GetOptions{})
	if err != nil {
		return nil, fmt.Errorf("failed to get ClusterOperator %s: %w", coName, err)
	}

	newStatusToCompare := make(map[string]string)
	for conditionType := range statusToCompare {
		// Find the condition with the matching type
		conditionStatus := "Unknown"
		for _, condition := range co.Status.Conditions {
			if string(condition.Type) == conditionType {
				conditionStatus = string(condition.Status)
				break
			}
		}
		newStatusToCompare[conditionType] = conditionStatus
	}

	return newStatusToCompare, nil
}

// WaitForClusterOperatorHealthy is a convenience wrapper for waiting for a ClusterOperator
// to reach the standard healthy state (Available=True, Progressing=False, Degraded=False).
// Example usage:
//
//	err := WaitForClusterOperatorHealthy(t, coClient, "kube-apiserver", 10*time.Minute, 1.0)
func WaitForClusterOperatorHealthy(t library.LoggingT, coClient configv1client.ClusterOperatorInterface, coName string, timeout time.Duration, waitMultiplier float64) error {
	return WaitForClusterOperatorStatus(t, coClient, coName,
		map[string]string{"Available": "True", "Progressing": "False", "Degraded": "False"},
		timeout, waitMultiplier)
}

// WaitForAPIServerRollout waits for all API server pods to be recreated and running
// after a configuration change. Unlike WaitForAPIServerToStabilizeOnTheSameRevision which
// waits for pods to converge on the same revision, this function specifically waits for
// NEW pods (created after the function is called) to replace the old ones.
//
// This is useful when you make a configuration change and need to ensure all pods have
// been recreated with the new configuration, not just that they're on the same revision.
//
// Parameters:
//   - t: Logger interface for test output
//   - podClient: Pod client interface for the target namespace
//   - labelSelector: Label selector to identify API server pods (e.g., "apiserver=true")
//   - timeout: Maximum time to wait for rollout to complete
//
// Returns:
//   - error if timeout is reached or an error occurs during polling
//
// Note:
//   - All existing pods must be replaced by new pods created after this function is called
//   - Supports both single-node and multi-node deployments
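//
// Example usage (illustrative; the podClient variable and the timeout value below are
// assumptions for the example, not defined in this file):
//
//	err := WaitForAPIServerRollout(t, podClient, "apiserver=true", 30*time.Minute)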
func WaitForAPIServerRollout(t library.LoggingT, podClient corev1client.PodInterface, labelSelector string, timeout time.Duration) error {
	rolloutStartTime := time.Now()

	// Get current pods before we start waiting
	initialPods, err := podClient.List(context.Background(), metav1.ListOptions{
		LabelSelector: labelSelector,
	})
	if err != nil {
		t.Logf("Warning: Could not get initial pods: %v", err)
	}

	var oldestPodTime time.Time
	initialRevision := ""
	if initialPods != nil && len(initialPods.Items) > 0 {
		oldestPodTime = initialPods.Items[0].CreationTimestamp.Time
		for _, pod := range initialPods.Items {
			if pod.CreationTimestamp.Time.Before(oldestPodTime) {
				oldestPodTime = pod.CreationTimestamp.Time
			}
			if rev, ok := pod.Labels["revision"]; ok && initialRevision == "" {
				initialRevision = rev
			}
		}
		t.Logf("Initial state: %d pods, oldest created at %s, initial revision: %s",
			len(initialPods.Items), oldestPodTime.Format(time.RFC3339), initialRevision)
	}

	attempt := 0
	lastPodCount := 0
	lastNotRunningCount := 0

	return wait.PollUntilContextTimeout(context.Background(), 15*time.Second, timeout, false, func(ctx context.Context) (bool, error) {
		attempt++
		pods, err := podClient.List(ctx, metav1.ListOptions{
			LabelSelector: labelSelector,
		})
		if err != nil {
			t.Logf("[Attempt %d] Error listing pods: %v", attempt, err)
			return false, nil
		}

		if len(pods.Items) == 0 {
			t.Logf("[Attempt %d] No pods found yet", attempt)
			return false, nil
		}

		// Count pods and check if we have new pods (created after rollout started)
		notRunningCount := 0
		newPodsCount := 0
		runningNewPodsCount := 0
		var notRunningPods []string
		var currentRevision string

		for _, pod := range pods.Items {
			isNewPod := pod.CreationTimestamp.Time.After(rolloutStartTime)

			if pod.Status.Phase != corev1.PodRunning {
				notRunningCount++
				notRunningPods = append(notRunningPods, fmt.Sprintf("%s (%s)", pod.Name, pod.Status.Phase))
			}

			if isNewPod {
				newPodsCount++
				if pod.Status.Phase == corev1.PodRunning {
					runningNewPodsCount++
				}
			}

			if rev, ok := pod.Labels["revision"]; ok && currentRevision == "" {
				currentRevision = rev
			}
		}

		// Success condition: ALL pods must be new (created after rolloutStartTime) and running
		expectedPodCount := len(pods.Items)
		allPodsNewAndRunning := newPodsCount == expectedPodCount && runningNewPodsCount == expectedPodCount

		// Log only when state changes or every 4th attempt (1 minute)
		if notRunningCount != lastNotRunningCount || len(pods.Items) != lastPodCount || attempt%4 == 0 {
			if notRunningCount > 0 {
				t.Logf("[Attempt %d] %d/%d pods running. Not running: %v. New pods: %d/%d running",
					attempt, len(pods.Items)-notRunningCount, len(pods.Items), notRunningPods, runningNewPodsCount, newPodsCount)
			} else {
				t.Logf("[Attempt %d] All %d pods are running. New pods: %d/%d. Revision: %s",
					attempt, len(pods.Items), runningNewPodsCount, newPodsCount, currentRevision)
			}
			lastPodCount = len(pods.Items)
			lastNotRunningCount = notRunningCount
		}

		return allPodsNewAndRunning, nil
	})
}
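For reference, a minimal sketch of how the client types these helpers expect could be constructed from a *rest.Config; the package placement, the buildClients name, and the namespace parameter are assumptions for illustration, not part of this commit:

package ote_test // illustrative placement; not part of the commit

import (
	configv1client "github.com/openshift/client-go/config/clientset/versioned/typed/config/v1"
	"k8s.io/client-go/kubernetes"
	corev1client "k8s.io/client-go/kubernetes/typed/core/v1"
	"k8s.io/client-go/rest"
)

// buildClients shows how the ClusterOperator and Pod clients used by the helpers
// above could be built from a *rest.Config (kubeconfig loading is out of scope here).
func buildClients(config *rest.Config, namespace string) (configv1client.ClusterOperatorInterface, corev1client.PodInterface, error) {
	configClient, err := configv1client.NewForConfig(config)
	if err != nil {
		return nil, nil, err
	}
	kubeClient, err := kubernetes.NewForConfig(config)
	if err != nil {
		return nil, nil, err
	}
	return configClient.ClusterOperators(), kubeClient.CoreV1().Pods(namespace), nil
}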
