14 changes: 12 additions & 2 deletions pkg/controller/statusmanager/machineconfig_status.go
@@ -125,7 +125,8 @@ func (status *StatusManager) SetFromMachineConfigPool(mcPools []mcfgv1.MachineCo
status.Lock()
defer status.Unlock()
// The status.renderedMachineConfigs is a non-nil map at the time when SetFromMachineConfigPool method is invoked.
for role, machineConfigs := range status.renderedMachineConfigs {
// First check if any role is degraded
for role := range status.renderedMachineConfigs {
pools, err := status.findMachineConfigPoolsForLabel(mcPools, map[string]string{names.MachineConfigLabelRoleKey: role})
if err != nil {
klog.Errorf("failed to get machine config pools for the role %s: %v", role, err)
@@ -135,7 +136,16 @@ func (status *StatusManager) SetFromMachineConfigPool(mcPools []mcfgv1.MachineCo
status.setDegraded(MachineConfig, "MachineConfig", fmt.Sprintf("%s machine config pool in degraded state", degradedPool))
return nil
}
status.setNotDegraded(MachineConfig)
}
// No degraded pools, so clear degraded status
status.setNotDegraded(MachineConfig)

// Now check for progressing and process machine configs
for role, machineConfigs := range status.renderedMachineConfigs {
pools, err := status.findMachineConfigPoolsForLabel(mcPools, map[string]string{names.MachineConfigLabelRoleKey: role})
if err != nil {
klog.Errorf("failed to get machine config pools for the role %s: %v", role, err)
}

progressingPool := status.isAnyMachineConfigPoolProgressing(pools)
if progressingPool != "" {
28 changes: 25 additions & 3 deletions pkg/controller/statusmanager/status_manager.go
@@ -8,6 +8,7 @@ import (
"reflect"
"strings"
"sync"
"time"

"github.com/ghodss/yaml"
"github.com/openshift/cluster-network-operator/pkg/hypershift"
@@ -61,6 +62,9 @@ const (
const (
ClusteredNameSeparator = '/'
fieldManager = "cluster-network-operator/status-manager"

// degradedFailureDurationThreshold is how long a failure must persist before the Degraded status is set
degradedFailureDurationThreshold = 2 * time.Minute
)

// keepCRDs is a list of CRD names that won't be removed from the system even if
@@ -105,6 +109,11 @@ type StatusManager struct {
failing [maxStatusLevel]*operv1.OperatorCondition
installComplete bool

// failureFirstSeen tracks when each StatusLevel first started failing.
failureFirstSeen map[StatusLevel]time.Time

clock clock.PassiveClock

// All our informers and listers
dsInformers map[string]cache.SharedIndexInformer
dsListers map[string]DaemonSetLister
@@ -135,6 +144,8 @@ func New(client cnoclient.Client, name, cluster string) *StatusManager {
name: name,
hyperShiftConfig: hypershift.NewHyperShiftConfig(),

failureFirstSeen: map[StatusLevel]time.Time{},
clock: clock.RealClock{},
dsInformers: map[string]cache.SharedIndexInformer{},
dsListers: map[string]DaemonSetLister{},
depInformers: map[string]cache.SharedIndexInformer{},
@@ -545,6 +556,18 @@ func (status *StatusManager) syncDegraded() {
}

func (status *StatusManager) setDegraded(statusLevel StatusLevel, reason, message string) {
// Track when we first saw this failure
if _, exists := status.failureFirstSeen[statusLevel]; !exists {
status.failureFirstSeen[statusLevel] = status.clock.Now()
return // Don't set Degraded on first failure
}

// Check if failure has persisted long enough
if status.clock.Since(status.failureFirstSeen[statusLevel]) < degradedFailureDurationThreshold {
return // Not persistent enough yet
}

Review comment (Contributor), attached to the threshold check above:

There are at least some places where we're already doing timeouts, like SetFromPods in pkg/controller/statusmanager/pod_status.go, which only calls setDegraded if a rollout has been hung for longer than 10 minutes. So it doesn't need another timeout here...

Maybe split out a MaybeSetDegraded() that does the degradedFailureDurationThreshold check, and leave SetDegraded() to set it degraded right away?

You'll have to look over each caller to see if they're doing their own timeouts already (though it might be just pod_status that does that...)

// Set Degraded - failure has persisted for 2+ minutes
status.failing[statusLevel] = &operv1.OperatorCondition{
Type: operv1.OperatorStatusTypeDegraded,
Status: operv1.ConditionTrue,
@@ -555,9 +578,8 @@ func (status *StatusManager) setDegraded(statusLevel StatusLevel, reason, messag
}

func (status *StatusManager) setNotDegraded(statusLevel StatusLevel) {
if status.failing[statusLevel] != nil {
status.failing[statusLevel] = nil
}
status.failing[statusLevel] = nil
delete(status.failureFirstSeen, statusLevel) // Clear failure tracking
status.syncDegraded()
}

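The reviewer's suggested split could look roughly like the sketch below. This is a minimal sketch, assuming the PR's failureFirstSeen map, clock field, failing array, and syncDegraded helper; the name maybeSetDegraded and the exact receiver wiring are illustrative, not part of this PR.

// maybeSetDegraded debounces: it records when a failure was first seen and only
// escalates to Degraded once the failure has persisted past the threshold.
func (status *StatusManager) maybeSetDegraded(statusLevel StatusLevel, reason, message string) {
	firstSeen, exists := status.failureFirstSeen[statusLevel]
	if !exists {
		status.failureFirstSeen[statusLevel] = status.clock.Now()
		return
	}
	if status.clock.Since(firstSeen) < degradedFailureDurationThreshold {
		return
	}
	status.setDegraded(statusLevel, reason, message)
}

// setDegraded keeps its pre-PR behaviour and sets the condition immediately,
// for callers such as SetFromPods in pod_status.go that already apply their own timeout.
func (status *StatusManager) setDegraded(statusLevel StatusLevel, reason, message string) {
	status.failing[statusLevel] = &operv1.OperatorCondition{
		Type:    operv1.OperatorStatusTypeDegraded,
		Status:  operv1.ConditionTrue,
		Reason:  reason,
		Message: message,
	}
	status.syncDegraded()
}

Under this split, callers that want debouncing (such as SetFromMachineConfigPool) would switch to the debounced variant, while callers that already time out on their own keep the immediate semantics.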
49 changes: 46 additions & 3 deletions pkg/controller/statusmanager/status_manager_test.go
@@ -25,6 +25,7 @@ import (
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/kubernetes/scheme"
testingclock "k8s.io/utils/clock/testing"

crclient "sigs.k8s.io/controller-runtime/pkg/client"
)
@@ -363,7 +364,8 @@ func TestStatusManagerSetDegraded(t *testing.T) {
Reason: "Pods",
}

// Initial failure status
// Initial failure status - backdate so it sets Degraded immediately
status.failureFirstSeen[OperatorConfig] = time.Now().Add(-3 * time.Minute)
status.SetDegraded(OperatorConfig, "Operator", "")
oc, err := getOC(client)
if err != nil {
@@ -373,7 +375,8 @@
t.Fatalf("unexpected Status.Conditions: %#v", oc.Status.Conditions)
}

// Setting a higher-level status should override it
// Setting a higher-level status should override it - backdate this one too
status.failureFirstSeen[ClusterConfig] = time.Now().Add(-3 * time.Minute)
status.SetDegraded(ClusterConfig, "Cluster", "")
oc, err = getOC(client)
if err != nil {
@@ -383,7 +386,8 @@
t.Fatalf("unexpected Status.Conditions: %#v", oc.Status.Conditions)
}

// Setting a lower-level status should be ignored
// Setting a lower-level status should be ignored - backdate
status.failureFirstSeen[PodDeployment] = time.Now().Add(-3 * time.Minute)
status.SetDegraded(PodDeployment, "Pods", "")
oc, err = getOC(client)
if err != nil {
@@ -428,6 +432,7 @@ func TestStatusManagerSetFromIPsecConfigs(t *testing.T) {
client := fake.NewFakeClient()
status := New(client, "testing", names.StandAloneClusterName)
setFakeListers(status)
status.clock = testingclock.NewFakeClock(time.Now())
no := &operv1.Network{ObjectMeta: metav1.ObjectMeta{Name: names.OPERATOR_CONFIG},
Spec: operv1.NetworkSpec{DefaultNetwork: operv1.DefaultNetworkDefinition{
OVNKubernetesConfig: &operv1.OVNKubernetesConfig{IPsecConfig: &operv1.IPsecConfig{Mode: operv1.IPsecModeFull}}}}}
@@ -510,6 +515,14 @@ func TestStatusManagerSetFromIPsecConfigs(t *testing.T) {
Status: mcfgv1.MachineConfigPoolStatus{Conditions: []mcfgv1.MachineConfigPoolCondition{{Type: mcfgv1.MachineConfigPoolDegraded,
Status: v1.ConditionTrue}}}}
mcPools = append(mcPools, workerIPsecMachineConfigPool)
// First call records the failure
err = status.SetFromMachineConfigPool(mcPools)
if err != nil {
t.Fatalf("error processing machine config pools: %v", err)
}
// Advance time past the debouncing threshold
status.clock.(*testingclock.FakeClock).Step(3 * time.Minute)
// Second call sets Degraded
err = status.SetFromMachineConfigPool(mcPools)
if err != nil {
t.Fatalf("error processing machine config pools: %v", err)
@@ -654,6 +667,15 @@ func TestStatusManagerSetFromIPsecConfigs(t *testing.T) {
masterIPsecmachineConfigPool.Status = mcfgv1.MachineConfigPoolStatus{Conditions: []mcfgv1.MachineConfigPoolCondition{{Type: mcfgv1.MachineConfigPoolDegraded,
Status: v1.ConditionTrue}}, Configuration: mcfgv1.MachineConfigPoolStatusConfiguration{
Source: []v1.ObjectReference{{Name: masterMachineConfigIPsecExtName}}}}
// First call records the failure
err = status.SetFromMachineConfigPool([]mcfgv1.MachineConfigPool{masterIPsecmachineConfigPool,
workerIPsecMachineConfigPool})
if err != nil {
t.Fatalf("error processing machine config pools: %v", err)
}
// Advance time past threshold
status.clock.(*testingclock.FakeClock).Step(3 * time.Minute)
// Second call sets Degraded
err = status.SetFromMachineConfigPool([]mcfgv1.MachineConfigPool{masterIPsecmachineConfigPool,
workerIPsecMachineConfigPool})
if err != nil {
@@ -775,6 +797,7 @@ func TestStatusManagerSetFromIPsecConfigs(t *testing.T) {
func TestStatusManagerSetFromDaemonSets(t *testing.T) {
client := fake.NewFakeClient()
status := New(client, "testing", names.StandAloneClusterName)
status.clock = testingclock.NewFakeClock(time.Now())
setFakeListers(status)
no := &operv1.Network{ObjectMeta: metav1.ObjectMeta{Name: names.OPERATOR_CONFIG}}
setOC(t, client, no)
@@ -1158,6 +1181,11 @@ func TestStatusManagerSetFromDaemonSets(t *testing.T) {
}
}
setLastPodState(t, client, "testing", ps)
// First call records the failure
status.SetFromPods()
// Advance time past the debouncing threshold
status.clock.(*testingclock.FakeClock).Step(3 * time.Minute)
// Second call sets Degraded
status.SetFromPods()

co, oc, err = getStatuses(client, "testing")
@@ -1377,6 +1405,7 @@ func TestStatusManagerSetFromDaemonSets(t *testing.T) {
func TestStatusManagerSetFromDeployments(t *testing.T) {
client := fake.NewFakeClient()
status := New(client, "testing", names.StandAloneClusterName)
status.clock = testingclock.NewFakeClock(time.Now())
setFakeListers(status)
no := &operv1.Network{ObjectMeta: metav1.ObjectMeta{Name: names.OPERATOR_CONFIG}}
setOC(t, client, no)
@@ -1605,6 +1634,11 @@ func TestStatusManagerSetFromDeployments(t *testing.T) {
}
}
setLastPodState(t, client, "testing", ps)
// First call records the failure
status.SetFromPods()
// Advance time past the debouncing threshold
status.clock.(*testingclock.FakeClock).Step(3 * time.Minute)
// Second call sets Degraded
status.SetFromPods()

co, oc, err = getStatuses(client, "testing")
@@ -1909,7 +1943,16 @@ func TestStatusManagerCheckCrashLoopBackOffPods(t *testing.T) {
}
set(t, client, podC)

// First call to SetFromPods() will record the failure but not set Degraded yet
status.SetFromPods()

// Simulate time passing beyond the degraded threshold by setting failure first-seen time
// to 3 minutes ago
status.failureFirstSeen[RolloutHung] = time.Now().Add(-3 * time.Minute)

// Second call to SetFromPods() will now set Degraded since the failure has persisted
status.SetFromPods()

oc, err = getOC(client)
if err != nil {
t.Fatalf("error getting ClusterOperator: %v", err)
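For reference, the fake-clock pattern the updated tests rely on can be exercised in isolation. The sketch below uses a hypothetical debouncer type (not from the repo) together with k8s.io/utils/clock and its testing FakeClock, mirroring the record-then-advance-then-recheck sequence used in the tests above.

package main

import (
	"fmt"
	"time"

	"k8s.io/utils/clock"
	testingclock "k8s.io/utils/clock/testing"
)

// debouncer mirrors the PR's approach: remember when a failure was first seen
// and only report "degraded" once it has persisted past a threshold.
type debouncer struct {
	clock     clock.PassiveClock
	firstSeen map[string]time.Time
	threshold time.Duration
}

func (d *debouncer) degraded(key string) bool {
	if _, ok := d.firstSeen[key]; !ok {
		d.firstSeen[key] = d.clock.Now()
		return false // first observation only records the failure
	}
	return d.clock.Since(d.firstSeen[key]) >= d.threshold
}

func main() {
	fc := testingclock.NewFakeClock(time.Now())
	d := &debouncer{clock: fc, firstSeen: map[string]time.Time{}, threshold: 2 * time.Minute}

	fmt.Println(d.degraded("MachineConfig")) // false: failure just recorded
	fc.Step(3 * time.Minute)                 // advance the fake clock past the threshold
	fmt.Println(d.degraded("MachineConfig")) // true: failure has persisted long enough
}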