14 changes: 12 additions & 2 deletions pkg/controller/statusmanager/machineconfig_status.go
@@ -125,7 +125,8 @@ func (status *StatusManager) SetFromMachineConfigPool(mcPools []mcfgv1.MachineCo
status.Lock()
defer status.Unlock()
// The status.renderedMachineConfigs is a non-nil map at the time when SetFromMachineConfigPool method is invoked.
for role, machineConfigs := range status.renderedMachineConfigs {
// First check if any role is degraded
for role := range status.renderedMachineConfigs {
pools, err := status.findMachineConfigPoolsForLabel(mcPools, map[string]string{names.MachineConfigLabelRoleKey: role})
if err != nil {
klog.Errorf("failed to get machine config pools for the role %s: %v", role, err)
@@ -135,7 +136,16 @@ func (status *StatusManager) SetFromMachineConfigPool(mcPools []mcfgv1.MachineCo
status.setDegraded(MachineConfig, "MachineConfig", fmt.Sprintf("%s machine config pool in degraded state", degradedPool))
return nil
}
status.setNotDegraded(MachineConfig)
}
// No degraded pools, so clear degraded status
status.setNotDegraded(MachineConfig)

// Now check for progressing and process machine configs
for role, machineConfigs := range status.renderedMachineConfigs {
pools, err := status.findMachineConfigPoolsForLabel(mcPools, map[string]string{names.MachineConfigLabelRoleKey: role})
if err != nil {
klog.Errorf("failed to get machine config pools for the role %s: %v", role, err)
}

progressingPool := status.isAnyMachineConfigPoolProgressing(pools)
if progressingPool != "" {
28 changes: 25 additions & 3 deletions pkg/controller/statusmanager/status_manager.go
@@ -8,6 +8,7 @@ import (
"reflect"
"strings"
"sync"
"time"

"github.com/ghodss/yaml"
"github.com/openshift/cluster-network-operator/pkg/hypershift"
@@ -61,6 +62,9 @@ const (
const (
ClusteredNameSeparator = '/'
fieldManager = "cluster-network-operator/status-manager"

// degradedFailureDurationThreshold is how long a failure must persist before the Degraded status is set
degradedFailureDurationThreshold = 2 * time.Minute
)

// keepCRDs is a list of CRD names that won't be removed from the system even if
@@ -105,6 +109,11 @@ type StatusManager struct {
failing [maxStatusLevel]*operv1.OperatorCondition
installComplete bool

// failureFirstSeen tracks when each StatusLevel first started failing.
failureFirstSeen map[StatusLevel]time.Time

clock clock.PassiveClock

// All our informers and listers
dsInformers map[string]cache.SharedIndexInformer
dsListers map[string]DaemonSetLister
@@ -135,6 +144,8 @@ func New(client cnoclient.Client, name, cluster string) *StatusManager {
name: name,
hyperShiftConfig: hypershift.NewHyperShiftConfig(),

failureFirstSeen: map[StatusLevel]time.Time{},
clock: clock.RealClock{},
dsInformers: map[string]cache.SharedIndexInformer{},
dsListers: map[string]DaemonSetLister{},
depInformers: map[string]cache.SharedIndexInformer{},
@@ -545,6 +556,18 @@ func (status *StatusManager) syncDegraded() {
}

func (status *StatusManager) setDegraded(statusLevel StatusLevel, reason, message string) {
// Track when we first saw this failure
if _, exists := status.failureFirstSeen[statusLevel]; !exists {
status.failureFirstSeen[statusLevel] = status.clock.Now()
return // Don't set Degraded on first failure
}

// Check if failure has persisted long enough
if status.clock.Since(status.failureFirstSeen[statusLevel]) < degradedFailureDurationThreshold {
return // Not persistent enough yet
}

Review comment (Contributor), attached to the threshold check above:

There are at least some places where we're already doing timeouts, like SetFromPods in pkg/controller/statusmanager/pod_status.go, which only calls setDegraded if a rollout has been hung for longer than 10 minutes. So it doesn't need another timeout here...

Maybe split out a MaybeSetDegraded() that does the degradedFailureDurationThreshold check, and leave SetDegraded() to set it degraded right away?

You'll have to look over each caller to see if they're doing their own timeouts already (though it might be just pod_status that does that...)

// Set Degraded - failure has persisted for 2+ minutes
status.failing[statusLevel] = &operv1.OperatorCondition{
Type: operv1.OperatorStatusTypeDegraded,
Status: operv1.ConditionTrue,
@@ -555,9 +578,8 @@ func (status *StatusManager) setDegraded(statusLevel StatusLevel, reason, messag
}

func (status *StatusManager) setNotDegraded(statusLevel StatusLevel) {
if status.failing[statusLevel] != nil {
status.failing[statusLevel] = nil
}
status.failing[statusLevel] = nil
delete(status.failureFirstSeen, statusLevel) // Clear failure tracking
status.syncDegraded()
}

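The reviewer's suggested split could look roughly like the sketch below. This is a minimal sketch, assuming the PR's failureFirstSeen map, clock field, failing array, and syncDegraded helper; the name maybeSetDegraded and the exact receiver wiring are illustrative, not part of this PR.

// maybeSetDegraded debounces: it records when a failure was first seen and only
// escalates to Degraded once the failure has persisted past the threshold.
func (status *StatusManager) maybeSetDegraded(statusLevel StatusLevel, reason, message string) {
	firstSeen, exists := status.failureFirstSeen[statusLevel]
	if !exists {
		status.failureFirstSeen[statusLevel] = status.clock.Now()
		return
	}
	if status.clock.Since(firstSeen) < degradedFailureDurationThreshold {
		return
	}
	status.setDegraded(statusLevel, reason, message)
}

// setDegraded keeps its pre-PR behaviour and sets the condition immediately,
// for callers such as SetFromPods in pod_status.go that already apply their own timeout.
func (status *StatusManager) setDegraded(statusLevel StatusLevel, reason, message string) {
	status.failing[statusLevel] = &operv1.OperatorCondition{
		Type:    operv1.OperatorStatusTypeDegraded,
		Status:  operv1.ConditionTrue,
		Reason:  reason,
		Message: message,
	}
	status.syncDegraded()
}

Under this split, callers that want debouncing (such as SetFromMachineConfigPool) would switch to the debounced variant, while callers that already time out on their own keep the immediate semantics.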
49 changes: 46 additions & 3 deletions pkg/controller/statusmanager/status_manager_test.go
@@ -25,6 +25,7 @@ import (
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/kubernetes/scheme"
testingclock "k8s.io/utils/clock/testing"

crclient "sigs.k8s.io/controller-runtime/pkg/client"
)
@@ -363,7 +364,8 @@ func TestStatusManagerSetDegraded(t *testing.T) {
Reason: "Pods",
}

// Initial failure status
// Initial failure status - backdate so it sets Degraded immediately
status.failureFirstSeen[OperatorConfig] = time.Now().Add(-3 * time.Minute)
status.SetDegraded(OperatorConfig, "Operator", "")
oc, err := getOC(client)
if err != nil {
@@ -373,7 +375,8 @@
t.Fatalf("unexpected Status.Conditions: %#v", oc.Status.Conditions)
}

// Setting a higher-level status should override it
// Setting a higher-level status should override it - backdate this one too
status.failureFirstSeen[ClusterConfig] = time.Now().Add(-3 * time.Minute)
status.SetDegraded(ClusterConfig, "Cluster", "")
oc, err = getOC(client)
if err != nil {
@@ -383,7 +386,8 @@
t.Fatalf("unexpected Status.Conditions: %#v", oc.Status.Conditions)
}

// Setting a lower-level status should be ignored
// Setting a lower-level status should be ignored - backdate
status.failureFirstSeen[PodDeployment] = time.Now().Add(-3 * time.Minute)
status.SetDegraded(PodDeployment, "Pods", "")
oc, err = getOC(client)
if err != nil {
@@ -428,6 +432,7 @@ func TestStatusManagerSetFromIPsecConfigs(t *testing.T) {
client := fake.NewFakeClient()
status := New(client, "testing", names.StandAloneClusterName)
setFakeListers(status)
status.clock = testingclock.NewFakeClock(time.Now())
no := &operv1.Network{ObjectMeta: metav1.ObjectMeta{Name: names.OPERATOR_CONFIG},
Spec: operv1.NetworkSpec{DefaultNetwork: operv1.DefaultNetworkDefinition{
OVNKubernetesConfig: &operv1.OVNKubernetesConfig{IPsecConfig: &operv1.IPsecConfig{Mode: operv1.IPsecModeFull}}}}}
@@ -510,6 +515,14 @@ func TestStatusManagerSetFromIPsecConfigs(t *testing.T) {
Status: mcfgv1.MachineConfigPoolStatus{Conditions: []mcfgv1.MachineConfigPoolCondition{{Type: mcfgv1.MachineConfigPoolDegraded,
Status: v1.ConditionTrue}}}}
mcPools = append(mcPools, workerIPsecMachineConfigPool)
// First call records the failure
err = status.SetFromMachineConfigPool(mcPools)
if err != nil {
t.Fatalf("error processing machine config pools: %v", err)
}
// Advance time past the debouncing threshold
status.clock.(*testingclock.FakeClock).Step(3 * time.Minute)
// Second call sets Degraded
err = status.SetFromMachineConfigPool(mcPools)
if err != nil {
t.Fatalf("error processing machine config pools: %v", err)
@@ -654,6 +667,15 @@ func TestStatusManagerSetFromIPsecConfigs(t *testing.T) {
masterIPsecmachineConfigPool.Status = mcfgv1.MachineConfigPoolStatus{Conditions: []mcfgv1.MachineConfigPoolCondition{{Type: mcfgv1.MachineConfigPoolDegraded,
Status: v1.ConditionTrue}}, Configuration: mcfgv1.MachineConfigPoolStatusConfiguration{
Source: []v1.ObjectReference{{Name: masterMachineConfigIPsecExtName}}}}
// First call records the failure
err = status.SetFromMachineConfigPool([]mcfgv1.MachineConfigPool{masterIPsecmachineConfigPool,
workerIPsecMachineConfigPool})
if err != nil {
t.Fatalf("error processing machine config pools: %v", err)
}
// Advance time past threshold
status.clock.(*testingclock.FakeClock).Step(3 * time.Minute)
// Second call sets Degraded
err = status.SetFromMachineConfigPool([]mcfgv1.MachineConfigPool{masterIPsecmachineConfigPool,
workerIPsecMachineConfigPool})
if err != nil {
@@ -775,6 +797,7 @@ func TestStatusManagerSetFromIPsecConfigs(t *testing.T) {
func TestStatusManagerSetFromDaemonSets(t *testing.T) {
client := fake.NewFakeClient()
status := New(client, "testing", names.StandAloneClusterName)
status.clock = testingclock.NewFakeClock(time.Now())
setFakeListers(status)
no := &operv1.Network{ObjectMeta: metav1.ObjectMeta{Name: names.OPERATOR_CONFIG}}
setOC(t, client, no)
@@ -1158,6 +1181,11 @@ func TestStatusManagerSetFromDaemonSets(t *testing.T) {
}
}
setLastPodState(t, client, "testing", ps)
// First call records the failure
status.SetFromPods()
// Advance time past the debouncing threshold
status.clock.(*testingclock.FakeClock).Step(3 * time.Minute)
// Second call sets Degraded
status.SetFromPods()

co, oc, err = getStatuses(client, "testing")
@@ -1377,6 +1405,7 @@ func TestStatusManagerSetFromDaemonSets(t *testing.T) {
func TestStatusManagerSetFromDeployments(t *testing.T) {
client := fake.NewFakeClient()
status := New(client, "testing", names.StandAloneClusterName)
status.clock = testingclock.NewFakeClock(time.Now())
setFakeListers(status)
no := &operv1.Network{ObjectMeta: metav1.ObjectMeta{Name: names.OPERATOR_CONFIG}}
setOC(t, client, no)
@@ -1605,6 +1634,11 @@ func TestStatusManagerSetFromDeployments(t *testing.T) {
}
}
setLastPodState(t, client, "testing", ps)
// First call records the failure
status.SetFromPods()
// Advance time past the debouncing threshold
status.clock.(*testingclock.FakeClock).Step(3 * time.Minute)
// Second call sets Degraded
status.SetFromPods()

co, oc, err = getStatuses(client, "testing")
@@ -1909,7 +1943,16 @@ func TestStatusManagerCheckCrashLoopBackOffPods(t *testing.T) {
}
set(t, client, podC)

// First call to SetFromPods() will record the failure but not set Degraded yet
status.SetFromPods()

// Simulate time passing beyond the degraded threshold by setting failure first-seen time
// to 3 minutes ago
status.failureFirstSeen[RolloutHung] = time.Now().Add(-3 * time.Minute)

// Second call to SetFromPods() will now set Degraded since the failure has persisted
status.SetFromPods()

oc, err = getOC(client)
if err != nil {
t.Fatalf("error getting ClusterOperator: %v", err)
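For reference, the fake-clock pattern the updated tests rely on can be exercised in isolation. The sketch below uses a hypothetical debouncer type (not from the repo) together with k8s.io/utils/clock and its testing FakeClock, mirroring the record-then-advance-then-recheck sequence used in the tests above.

package main

import (
	"fmt"
	"time"

	"k8s.io/utils/clock"
	testingclock "k8s.io/utils/clock/testing"
)

// debouncer mirrors the PR's approach: remember when a failure was first seen
// and only report "degraded" once it has persisted past a threshold.
type debouncer struct {
	clock     clock.PassiveClock
	firstSeen map[string]time.Time
	threshold time.Duration
}

func (d *debouncer) degraded(key string) bool {
	if _, ok := d.firstSeen[key]; !ok {
		d.firstSeen[key] = d.clock.Now()
		return false // first observation only records the failure
	}
	return d.clock.Since(d.firstSeen[key]) >= d.threshold
}

func main() {
	fc := testingclock.NewFakeClock(time.Now())
	d := &debouncer{clock: fc, firstSeen: map[string]time.Time{}, threshold: 2 * time.Minute}

	fmt.Println(d.degraded("MachineConfig")) // false: failure just recorded
	fc.Step(3 * time.Minute)                 // advance the fake clock past the threshold
	fmt.Println(d.degraded("MachineConfig")) // true: failure has persisted long enough
}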