
Commit 7d801be

e2e: PP: cover ExecCPUAffinity support in tests

Add basic e2e tests that check the default behavior of the performance profile with `ExecCPUAffinity: first` enabled by default.

Signed-off-by: Shereen Haj <shajmakh@redhat.com>
1 parent: 500a805

File tree: 4 files changed, +219 −29 lines

test/e2e/performanceprofile/functests/11_mixedcpus/mixedcpus.go

Lines changed: 129 additions & 0 deletions
@@ -510,6 +510,129 @@ var _ = Describe("Mixedcpus", Ordered, Label(string(label.MixedCPUs)), func() {
 			})
 		})
 	})
+
+	Context("Check exec-cpu-affinity feature", func() {
+		When("exec-cpu-affinity is enabled (default in PP)", func() {
+			var workerRTNode *corev1.Node
+			var profile *performancev2.PerformanceProfile
+			var getter cgroup.ControllersGetter
+			var updatedShared, updatedIsolated cpuset.CPUSet
+
+			BeforeEach(func() {
+				By("Checking if exec-cpu-affinity is enabled by default in the profile")
+				profile, _ = profiles.GetByNodeLabels(testutils.NodeSelectorLabels)
+				Expect(profile).ToNot(BeNil(), "Failed to get performance profile")
+				if profile.Annotations != nil {
+					val, ok := profile.Annotations[performancev2.PerformanceProfileDisableExecCPUAffinityAnnotation]
+					if ok && val == "true" {
+						// fail loudly because the default should be enabled
+						Fail("exec-cpu-affinity is disabled in the profile")
+					}
+				}
+
+				By("Updating performance profile to have enough shared cpus if needed")
+				// update the profile to have 2 shared cpus
+				currentShared := mustParse(string(*profile.Spec.CPU.Shared))
+				if len(currentShared.List()) < 2 {
+					testlog.Info("shared cpuset has less than 2 cpus; this test requires at least 2 shared cpus; update the profile")
+					isolated := mustParse(string(*profile.Spec.CPU.Isolated))
+
+					// we need 4 isolated and shared CPUs in total:
+					// 1 as a buffer for the node's base load
+					// 1 for the test guaranteed pod request
+					// 2 as shared cpus
+					leastIsolatedCpus := 3
+					if len(currentShared.List()) == 0 {
+						leastIsolatedCpus = 4
+					}
+					if len(isolated.List()) < leastIsolatedCpus {
+						Skip(fmt.Sprintf("isolated cpuset has less than %d cpus; this test requires at least %d isolated cpus", leastIsolatedCpus, leastIsolatedCpus))
+					}
+
+					updatedShared = cpuset.New(isolated.List()[0], isolated.List()[1])
+					updatedIsolated = cpuset.New(isolated.List()[2:]...)
+
+					if len(currentShared.List()) == 1 {
+						updatedShared = cpuset.New(currentShared.List()[0], isolated.List()[0])
+						updatedIsolated = cpuset.New(isolated.List()[1:]...)
+					}
+
+					testlog.Infof("shared cpu ids to be updated are: %q", updatedShared.String())
+					profile.Spec.CPU.Isolated = cpuSetToPerformanceCPUSet(&updatedIsolated)
+					profile.Spec.CPU.Shared = cpuSetToPerformanceCPUSet(&updatedShared)
+					profile.Spec.WorkloadHints.MixedCpus = ptr.To(true) // if not already
+
+					profiles.UpdateWithRetry(profile)
+
+					poolName := poolname.GetByProfile(context.TODO(), profile)
+					By(fmt.Sprintf("Applying changes in performance profile and waiting until %s starts updating", poolName))
+					profilesupdate.WaitForTuningUpdating(context.TODO(), profile)
+					By(fmt.Sprintf("Waiting until %s finishes updating", poolName))
+					profilesupdate.WaitForTuningUpdated(context.TODO(), profile)
+				}
+
+				workerRTNodes, err := nodes.GetByLabels(testutils.NodeSelectorLabels)
+				Expect(err).ToNot(HaveOccurred())
+				workerRTNodes, err = nodes.MatchingOptionalSelector(workerRTNodes)
+				Expect(err).ToNot(HaveOccurred(), fmt.Sprintf("error looking for the optional selector: %v", err))
+				Expect(workerRTNodes).ToNot(BeEmpty())
+				workerRTNode = &workerRTNodes[0]
+
+				getter, err = cgroup.BuildGetter(ctx, testclient.DataPlaneClient, testclient.K8sClient)
+				Expect(err).ToNot(HaveOccurred())
+			})
+
+			It("should pin exec process to first shared CPU of the container - guaranteed pod", func() {
+				By("Creating a guaranteed test pod with shared CPU request")
+				rl := &corev1.ResourceList{
+					corev1.ResourceCPU:    resource.MustParse("1"),
+					corev1.ResourceMemory: resource.MustParse("100Mi"),
+					sharedCpusResource:    resource.MustParse("1"),
+				}
+				testPod := makePod(ctx, testclient.DataPlaneClient, testutils.NamespaceTesting,
+					withRequests(rl),
+					withLimits(rl),
+					onNode(workerRTNode.Name),
+					withRuntime(components.GetComponentName(profile.Name, components.ComponentNamePrefix)))
+
+				Expect(testclient.Client.Create(ctx, testPod)).To(Succeed(), "Failed to create test pod")
+				testPod, err := pods.WaitForCondition(ctx, client.ObjectKeyFromObject(testPod), corev1.PodReady, corev1.ConditionTrue, 5*time.Minute)
+				Expect(err).ToNot(HaveOccurred())
+				defer func() {
+					if testPod != nil {
+						testlog.Infof("deleting pod %q", testPod.Name)
+						Expect(pods.Delete(ctx, testPod)).To(BeTrue(), "Failed to delete pod")
+					}
+				}()
+
+				By("Prepare comparable data")
+				cpusetCfg := &controller.CpuSet{}
+				Expect(getter.Container(ctx, testPod, testPod.Spec.Containers[0].Name, cpusetCfg)).To(Succeed(), "Failed to get cpuset config for test pod")
+
+				cpusIncludingShared, err := cpuset.Parse(cpusetCfg.Cpus)
+				Expect(err).ToNot(HaveOccurred(), "Failed to parse cpuset config for test pod cpus=%q", cpusetCfg.Cpus)
+				cntShared := cpusIncludingShared.Difference(updatedIsolated)
+				firstSharedCPU := cntShared.List()[0]
+				testlog.Infof("first shared CPU: %d", firstSharedCPU)
+
+				sharedCpuRequest := testPod.Spec.Containers[0].Resources.Requests.Name(sharedCpusResource, resource.DecimalSI).Value()
+				retries := int(10 / sharedCpuRequest)
+				By("Run exec command on the pod and verify the process is pinned to the first shared CPU")
+
+				for i := 0; i < retries; i++ {
+					cmd := []string{"/bin/bash", "-c", "sleep 10 & SLPID=$!; ps -o psr -p $SLPID;"}
+					output, err := pods.ExecCommandOnPod(testclient.K8sClient, testPod, testPod.Spec.Containers[0].Name, cmd)
+					Expect(err).ToNot(HaveOccurred(), "Failed to exec command on the pod; retry %d", i)
+					strout := string(output)
+					testlog.Infof("exec command output: %s", strout)
+
+					execProcessCPUs := strings.TrimSpace(strout)
+					Expect(execProcessCPUs).ToNot(BeEmpty(), "Failed to get exec process CPU; retry %d", i)
+					Expect(execProcessCPUs).To(Equal(firstSharedCPU), "Exec process CPU is not the first shared CPU; retry %d", i)
+				}
+			})
+		})
+	})
 })
 
 func setup(ctx context.Context) func(ctx2 context.Context) {
@@ -659,6 +782,12 @@ func withRuntime(name string) func(p *corev1.Pod) {
 	}
 }
 
+func onNode(nodeName string) func(p *corev1.Pod) {
+	return func(p *corev1.Pod) {
+		p.Spec.NodeName = nodeName
+	}
+}
+
 func getTestingNamespace() corev1.Namespace {
 	return *namespaces.TestingNamespace
 }
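A note on the exec probe above: `ps -o psr -p $SLPID` prints a `PSR` header row before the processor id, and `firstSharedCPU` is an `int` taken from `cntShared.List()[0]` while the exec output is a string, so the direct `Equal` comparison is brittle. Below is a minimal sketch of a parsing helper that would make the check type-safe; the helper name is hypothetical and not part of this commit.

```go
package main

import (
	"fmt"
	"strconv"
	"strings"
)

// parsePSR extracts the processor id from `ps -o psr -p <pid>` output,
// which typically looks like:
//
//	PSR
//	  3
//
// Returning an int allows a direct comparison against firstSharedCPU.
func parsePSR(output string) (int, error) {
	lines := strings.Split(strings.TrimSpace(output), "\n")
	last := strings.TrimSpace(lines[len(lines)-1]) // the value follows the header row
	if last == "" || strings.EqualFold(last, "PSR") {
		return 0, fmt.Errorf("no processor id found in %q", output)
	}
	return strconv.Atoi(last)
}

func main() {
	psr, err := parsePSR("PSR\n  3\n")
	fmt.Println(psr, err) // 3 <nil>
}
```

Inside the retry loop the assertion could then read `psr, err := parsePSR(strout)` followed by `Expect(err).ToNot(HaveOccurred())` and `Expect(psr).To(Equal(firstSharedCPU))`.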

test/e2e/performanceprofile/functests/1_performance/cpu_management.go

Lines changed: 66 additions & 28 deletions
@@ -12,7 +12,6 @@ import (
 
 	appsv1 "k8s.io/api/apps/v1"
 	corev1 "k8s.io/api/core/v1"
-	"k8s.io/apimachinery/pkg/api/errors"
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/labels"
@@ -247,7 +246,7 @@ var _ = Describe("[rfe_id:27363][performance] CPU Management", Ordered, func() {
 	})
 
 	AfterEach(func() {
-		deleteTestPod(context.TODO(), testpod)
+		Expect(pods.Delete(context.TODO(), testpod)).To(BeTrue(), "Failed to delete pod")
 	})
 
 	DescribeTable("Verify CPU usage by stress PODs", func(ctx context.Context, guaranteed bool) {
@@ -338,7 +337,7 @@ var _ = Describe("[rfe_id:27363][performance] CPU Management", Ordered, func() {
 			Expect(err).ToNot(HaveOccurred())
 		})
 		AfterEach(func() {
-			deleteTestPod(context.TODO(), testpod)
+			Expect(pods.Delete(context.TODO(), testpod)).To(BeTrue(), "Failed to delete pod")
 		})
 		When("kubelet is restart", func() {
 			It("[test_id: 73501] defaultCpuset should not change", func() {
@@ -421,7 +420,7 @@ var _ = Describe("[rfe_id:27363][performance] CPU Management", Ordered, func() {
 
 		AfterEach(func() {
 			if testpod != nil {
-				deleteTestPod(context.TODO(), testpod)
+				Expect(pods.Delete(context.TODO(), testpod)).To(BeTrue(), "Failed to delete pod")
 			}
 		})
 
@@ -480,7 +479,7 @@ var _ = Describe("[rfe_id:27363][performance] CPU Management", Ordered, func() {
 				fmt.Sprintf("IRQ still active on CPU%s", psr))
 
 			By("Checking that after removing POD default smp affinity is returned back to all active CPUs")
-			deleteTestPod(context.TODO(), testpod)
+			Expect(pods.Delete(context.TODO(), testpod)).To(BeTrue(), "Failed to delete pod")
 			defaultSmpAffinitySet, err = nodes.GetDefaultSmpAffinitySet(context.TODO(), workerRTNode)
 			Expect(err).ToNot(HaveOccurred())
 
@@ -579,7 +578,7 @@ var _ = Describe("[rfe_id:27363][performance] CPU Management", Ordered, func() {
 			if testpod == nil {
 				return
 			}
-			deleteTestPod(context.TODO(), testpod)
+			Expect(pods.Delete(context.TODO(), testpod)).To(BeTrue(), "Failed to delete pod")
 		})
 
 		It("[test_id:49149] should reject pods which request integral CPUs not aligned with machine SMT level", func() {
@@ -632,7 +631,7 @@ var _ = Describe("[rfe_id:27363][performance] CPU Management", Ordered, func() {
 			if testpod == nil {
 				return
 			}
-			deleteTestPod(context.TODO(), testpod)
+			Expect(pods.Delete(context.TODO(), testpod)).To(BeTrue(), "Failed to delete pod")
 		})
 
 		DescribeTable("Verify Hyper-Thread aware scheduling for guaranteed pods",
@@ -679,7 +678,7 @@ var _ = Describe("[rfe_id:27363][performance] CPU Management", Ordered, func() {
 				testpod = startHTtestPod(ctx, cpuCount)
 				Expect(checkPodHTSiblings(ctx, testpod)).To(BeTrue(), "Pod cpu set does not map to host cpu sibling pairs")
 				By("Deleting test pod...")
-				deleteTestPod(ctx, testpod)
+				Expect(pods.Delete(ctx, testpod)).To(BeTrue(), "Failed to delete pod")
 			}
 		},
 
@@ -982,7 +981,7 @@ var _ = Describe("[rfe_id:27363][performance] CPU Management", Ordered, func() {
 			defer func() {
 				if guaranteedPod != nil {
 					testlog.Infof("deleting pod %q", guaranteedPod.Name)
-					deleteTestPod(ctx, guaranteedPod)
+					Expect(pods.Delete(ctx, guaranteedPod)).To(BeTrue(), "Failed to delete guaranteed pod")
 				}
 			}()
 
@@ -1013,7 +1012,7 @@ var _ = Describe("[rfe_id:27363][performance] CPU Management", Ordered, func() {
 			defer func() {
 				if bestEffortPod != nil {
 					testlog.Infof("deleting pod %q", bestEffortPod.Name)
-					deleteTestPod(ctx, bestEffortPod)
+					Expect(pods.Delete(ctx, bestEffortPod)).To(BeTrue(), "Failed to delete best-effort pod")
 				}
 			}()
 
@@ -1067,6 +1066,63 @@ var _ = Describe("[rfe_id:27363][performance] CPU Management", Ordered, func() {
 		})
 	})
 
+	Context("Check exec-cpu-affinity feature", func() {
+		When("exec-cpu-affinity is enabled (default in PP)", func() {
+			// shared-cpus case is covered in the 11_mixedcpus test
+			// legacy test is covered in 2_performance_update
+
+			BeforeEach(func() {
+				By("Checking if exec-cpu-affinity is enabled by default in the profile")
+				profile, _ := profiles.GetByNodeLabels(testutils.NodeSelectorLabels)
+				Expect(profile).ToNot(BeNil(), "Failed to get performance profile")
+				if profile.Annotations != nil {
+					val, ok := profile.Annotations[performancev2.PerformanceProfileDisableExecCPUAffinityAnnotation]
+					if ok && val == "true" {
+						// fail loudly because the default should be enabled
+						Fail("exec-cpu-affinity is disabled in the profile")
+					}
+				}
+			})
+
+			It("should pin exec process to first CPU dedicated to the container - guaranteed pod", func() {
+				By("Creating a guaranteed test pod")
+				testPod := makePod(ctx, workerRTNode, true)
+				Expect(testclient.Client.Create(ctx, testPod)).To(Succeed(), "Failed to create test pod")
+				testPod, err = pods.WaitForCondition(ctx, client.ObjectKeyFromObject(testPod), corev1.PodReady, corev1.ConditionTrue, 5*time.Minute)
+				Expect(err).ToNot(HaveOccurred())
+				defer func() {
+					if testPod != nil {
+						testlog.Infof("deleting pod %q", testPod.Name)
+						Expect(pods.Delete(ctx, testPod)).To(BeTrue(), "Failed to delete test pod")
+					}
+				}()
+
+				cpusetCfg := &controller.CpuSet{}
+				Expect(getter.Container(ctx, testPod, testPod.Spec.Containers[0].Name, cpusetCfg)).To(Succeed(), "Failed to get cpuset config for test pod")
+
+				// assuming no shared cpus are used
+				cpusList := strings.Split(cpusetCfg.Cpus, ",")
+				Expect(cpusList).ToNot(BeEmpty())
+				firstExclusiveCPU := strings.TrimSpace(cpusList[0])
+				testlog.Infof("first exclusive CPU: %s", firstExclusiveCPU)
+
+				cpuRequest := testPod.Spec.Containers[0].Resources.Requests.Name(corev1.ResourceCPU, resource.DecimalSI).Value()
+				retries := int(10 / cpuRequest)
+				By("Run exec command on the pod and verify the process is pinned to the first exclusive CPU")
+
+				for i := 0; i < retries; i++ {
+					cmd := []string{"/bin/bash", "-c", "sleep 10 & SLPID=$!; ps -o psr -p $SLPID;"}
+					output, err := pods.ExecCommandOnPod(testclient.K8sClient, testPod, testPod.Spec.Containers[0].Name, cmd)
+					Expect(err).ToNot(HaveOccurred(), "Failed to exec command on the pod; retry %d", i)
+					testlog.Infof("exec command output: %s", string(output))
+
+					execProcessCPUs := strings.TrimSpace(string(output))
+					Expect(execProcessCPUs).ToNot(BeEmpty(), "Failed to get exec process CPU; retry %d", i)
+					Expect(execProcessCPUs).To(Equal(firstExclusiveCPU), "Exec process CPU is not the first exclusive CPU; retry %d", i)
+				}
+			})
+		})
+	})
 })
 
 func extractConfigInfo(output string) (*ContainerConfig, error) {
@@ -1317,24 +1373,6 @@ func getTestPodWithAnnotations(annotations map[string]string, cpus int) *corev1.
 	return testpod
 }
 
-func deleteTestPod(ctx context.Context, testpod *corev1.Pod) (types.UID, bool) {
-	// it possible that the pod already was deleted as part of the test, in this case we want to skip teardown
-	err := testclient.DataPlaneClient.Get(ctx, client.ObjectKeyFromObject(testpod), testpod)
-	if errors.IsNotFound(err) {
-		return "", false
-	}
-
-	testpodUID := testpod.UID
-
-	err = testclient.DataPlaneClient.Delete(ctx, testpod)
-	Expect(err).ToNot(HaveOccurred())
-
-	err = pods.WaitForDeletion(ctx, testpod, pods.DefaultDeletionTimeout*time.Second)
-	Expect(err).ToNot(HaveOccurred())
-
-	return testpodUID, true
-}
-
 func cpuSpecToString(cpus *performancev2.CPU) (string, error) {
 	if cpus == nil {
 		return "", fmt.Errorf("performance CPU field is nil")
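One detail worth flagging in the guaranteed-pod test above: `firstExclusiveCPU` is derived from `strings.Split(cpusetCfg.Cpus, ",")[0]`, which yields a range such as `2-3` when the cgroup reports the cpuset in range notation. Below is a short sketch of a more defensive variant, assuming the same `k8s.io/utils/cpuset` package that the mixed-cpus test above already uses; the variable names are illustrative only.

```go
package main

import (
	"fmt"

	"k8s.io/utils/cpuset"
)

func main() {
	// A container cpuset read from the cgroup may use range notation, e.g. "2-3,6".
	// Splitting on "," alone would report "2-3" as the first CPU; parsing the set
	// first makes the lowest CPU id explicit.
	cpus, err := cpuset.Parse("2-3,6")
	if err != nil {
		panic(err)
	}
	firstExclusiveCPU := cpus.List()[0] // List returns the CPU ids in ascending order
	fmt.Println(firstExclusiveCPU)      // prints 2
}
```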

test/e2e/performanceprofile/functests/1_performance/irqbalance.go

Lines changed: 1 addition & 1 deletion
@@ -213,7 +213,7 @@ var _ = Describe("[performance] Checking IRQBalance settings", Ordered, func() {
 		defer func() {
 			if testpod != nil {
 				testlog.Infof("deleting pod %q", testpod.Name)
-				deleteTestPod(context.TODO(), testpod)
+				Expect(pods.Delete(context.TODO(), testpod)).To(BeTrue(), "Failed to delete pod")
 			}
 			bannedCPUs, err := getIrqBalanceBannedCPUs(context.TODO(), targetNode)
 			Expect(err).ToNot(HaveOccurred(), "failed to extract the banned CPUs from node %q", targetNode.Name)

test/e2e/performanceprofile/functests/utils/pods/pods.go

Lines changed: 23 additions & 0 deletions
@@ -20,6 +20,7 @@ import (
 	"k8s.io/client-go/kubernetes"
 	"k8s.io/client-go/kubernetes/scheme"
 	"k8s.io/client-go/tools/remotecommand"
+	"k8s.io/klog/v2"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 
 	testclient "github.com/openshift/cluster-node-tuning-operator/test/e2e/performanceprofile/functests/utils/client"
@@ -51,6 +52,28 @@ func GetTestPod() *corev1.Pod {
 	}
 }
 
+func Delete(ctx context.Context, pod *corev1.Pod) bool {
+	err := testclient.DataPlaneClient.Get(ctx, client.ObjectKeyFromObject(pod), pod)
+	if errors.IsNotFound(err) {
+		klog.InfoS("pod already deleted", "namespace", pod.Namespace, "name", pod.Name)
+		return true
+	}
+
+	err = testclient.DataPlaneClient.Delete(ctx, pod)
+	if err != nil {
+		klog.ErrorS(err, "failed to delete pod", "namespace", pod.Namespace, "name", pod.Name)
+		return false
+	}
+
+	err = WaitForDeletion(ctx, pod, DefaultDeletionTimeout*time.Second)
+	if err != nil {
+		klog.ErrorS(err, "failed to wait for pod deletion", "namespace", pod.Namespace, "name", pod.Name)
+		return false
+	}
+
+	return true
+}
+
 // WaitForDeletion waits until the pod will be removed from the cluster
 func WaitForDeletion(ctx context.Context, pod *corev1.Pod, timeout time.Duration) error {
 	return wait.PollUntilContextTimeout(ctx, time.Second, timeout, true, func(ctx context.Context) (bool, error) {
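The new `pods.Delete` helper replaces the per-suite `deleteTestPod` functions: it treats an already-removed pod as success, logs failures via `klog` instead of asserting, and returns a bool so each call site decides how to react. The call sites updated in this commit follow the deferred-cleanup shape below, shown only as an illustration mirroring the diffs above.

```go
defer func() {
	if testpod != nil {
		testlog.Infof("deleting pod %q", testpod.Name)
		// Delete is idempotent, so it is safe to call even if the test body already removed the pod.
		Expect(pods.Delete(context.TODO(), testpod)).To(BeTrue(), "Failed to delete pod")
	}
}()
```

Note that, unlike the removed `deleteTestPod`, `Delete` does not return the pod UID; a caller that needs it would have to capture `testpod.UID` before the call.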
