Skip to content

Commit a4fe451

Browse files
committed
Make pod health more representative of pods over containers
1 parent 2c756ca commit a4fe451

File tree

1 file changed

+85
-22
lines changed

1 file changed

+85
-22
lines changed

internal/infrastructure/k8s/pod_health.go

Lines changed: 85 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -670,8 +670,8 @@ func (self *KubeClient) GetSimpleHealthStatus(ctx context.Context, namespace str
670670
hasCrashing := false
671671
hasPending := false
672672
hasTerminating := false
673-
readyCount := 0
674-
allInstances := make([]SimpleInstanceStatus, 0)
673+
readyPodCount := 0
674+
allInstances := make([]SimpleInstanceStatus, 0, len(podStatuses))
675675

676676
for _, podStatus := range podStatuses {
677677
// Check if pod itself is terminating
@@ -684,75 +684,138 @@ func (self *KubeClient) GetSimpleHealthStatus(ctx context.Context, namespace str
684684
hasCrashing = true
685685
}
686686

687+
// For grouping logic - track pod-level health
688+
podHasCrashing := false
689+
podHasPending := false
690+
podHasTerminating := false
691+
podReadyContainers := 0
692+
podTotalMainContainers := len(podStatus.Instances)
693+
var podState ContainerState
694+
var podEvents []models.EventRecord
695+
var maxRestartCount int32
696+
697+
// Process main containers
687698
for _, instance := range podStatus.Instances {
688-
allInstances = append(allInstances, SimpleInstanceStatus{
689-
KubernetesName: instance.KubernetesName,
690-
Status: instance.State,
691-
RestartCount: instance.RestartCount,
692-
PodCreatedAt: instance.PodCreatedAt,
693-
Events: instance.Events,
694-
})
699+
// Collect all events from all containers in this pod
700+
podEvents = append(podEvents, instance.Events...)
701+
702+
// Track highest restart count
703+
if instance.RestartCount > maxRestartCount {
704+
maxRestartCount = instance.RestartCount
705+
}
695706

696-
// Count ready instances and detect pending/crashing/terminating states
707+
// Track pod-level states
697708
switch instance.State {
698709
case ContainerStateCrashing:
710+
podHasCrashing = true
699711
hasCrashing = true
712+
podState = ContainerStateCrashing // Crashing takes precedence
700713
case ContainerStateTerminating:
714+
podHasTerminating = true
701715
hasTerminating = true
716+
if podState != ContainerStateCrashing {
717+
podState = ContainerStateTerminating
718+
}
702719
case ContainerStateRunning:
703720
if instance.Ready {
704-
readyCount++
721+
podReadyContainers++
705722
} else {
723+
podHasPending = true
706724
hasPending = true
725+
if podState != ContainerStateCrashing && podState != ContainerStateTerminating {
726+
podState = ContainerStateNotReady
727+
}
707728
}
708729
case ContainerStateNotReady, ContainerStateWaiting, ContainerStateStarting, ContainerStateImagePullError:
730+
podHasPending = true
709731
hasPending = true
732+
if podState != ContainerStateCrashing && podState != ContainerStateTerminating {
733+
podState = instance.State
734+
}
710735
case ContainerStateTerminated:
711736
// Terminated containers might be crashing if they have restart counts or failed
712737
if instance.IsCrashing {
738+
podHasCrashing = true
713739
hasCrashing = true
740+
podState = ContainerStateCrashing
741+
} else if podState == "" {
742+
podState = ContainerStateTerminated
714743
}
715744
}
716745
}
717746

718-
// Also check init containers
747+
// Process init containers, skipping those that terminated without crashing
719748
for _, instance := range podStatus.InstanceDependencies {
720-
allInstances = append(allInstances, SimpleInstanceStatus{
721-
KubernetesName: instance.KubernetesName,
722-
Status: instance.State,
723-
RestartCount: instance.RestartCount,
724-
PodCreatedAt: instance.PodCreatedAt,
725-
Events: instance.Events,
726-
})
749+
// Skip terminated init containers as they're expected to be terminated after successful completion
750+
if instance.State == ContainerStateTerminated && !instance.IsCrashing {
751+
continue
752+
}
753+
754+
// Collect events from init containers that are still relevant
755+
podEvents = append(podEvents, instance.Events...)
756+
757+
// Track highest restart count including init containers
758+
if instance.RestartCount > maxRestartCount {
759+
maxRestartCount = instance.RestartCount
760+
}
727761

728762
// Init containers failing can affect overall health
729763
switch instance.State {
730764
case ContainerStateCrashing:
765+
podHasCrashing = true
731766
hasCrashing = true
767+
podState = ContainerStateCrashing
732768
case ContainerStateTerminating:
769+
podHasTerminating = true
733770
hasTerminating = true
771+
if podState != ContainerStateCrashing {
772+
podState = ContainerStateTerminating
773+
}
734774
case ContainerStateWaiting, ContainerStateStarting, ContainerStateImagePullError:
775+
podHasPending = true
735776
hasPending = true
777+
if podState != ContainerStateCrashing && podState != ContainerStateTerminating {
778+
podState = instance.State
779+
}
736780
case ContainerStateTerminated:
737781
if instance.IsCrashing {
782+
podHasCrashing = true
738783
hasCrashing = true
784+
podState = ContainerStateCrashing
739785
}
740786
}
741787
}
788+
789+
// Determine final pod state - if all main containers are ready and running, pod is running
790+
if !podHasCrashing && !podHasTerminating && !podHasPending && podReadyContainers == podTotalMainContainers && podTotalMainContainers > 0 {
791+
podState = ContainerStateRunning
792+
readyPodCount++
793+
}
794+
795+
// Create a single SimpleInstanceStatus representing the entire pod
796+
podInstanceStatus := SimpleInstanceStatus{
797+
KubernetesName: podStatus.KubernetesName, // Use pod name instead of container name
798+
Status: podState,
799+
RestartCount: maxRestartCount, // Use highest restart count from all containers
800+
PodCreatedAt: podStatus.CreatedAt,
801+
Events: podEvents, // Combine events from all containers
802+
}
803+
804+
allInstances = append(allInstances, podInstanceStatus)
742805
}
743806

744807
// Determine health status based on priority:
745808
// 1. Crashing takes precedence over everything (indicates real problems)
746809
// 2. Terminating comes next (planned shutdown/scaling)
747-
// 3. Pending if any containers are not ready or we don't have enough instances
748-
// 4. Active only if all expected instances are ready and running
810+
// 3. Pending if any containers are not ready or fewer pods are ready than expected
811+
// 4. Active only if all expected pod replicas are ready and running
749812
var health InstanceHealth
750813
switch {
751814
case hasCrashing:
752815
health = InstanceHealthCrashing
753816
case hasTerminating:
754817
health = InstanceHealthTerminating
755-
case hasPending || readyCount < expectedInstances:
818+
case hasPending || readyPodCount < expectedInstances:
756819
health = InstanceHealthPending
757820
default:
758821
health = InstanceHealthActive

0 commit comments

Comments
 (0)