
Commit b7824a4

e2e: PP: cover ExecCPUAffinity support in tests
Add basic e2e tests that check the default behavior of the performance profile with `ExecCPUAffinity: first` enabled by default.

Signed-off-by: Shereen Haj <shajmakh@redhat.com>
1 parent c3627af commit b7824a4
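
Both new specs share the same verification idea: exec into the test container, background a short-lived process (`sleep 10 & SLPID=$!; ps -o psr -p $SLPID;`), strip the PSR header from the output, and compare the reported processor id with the first CPU of the container's cpuset. A minimal standalone sketch of that parsing step, outside the Ginkgo suites; the sample output and expected CPU id below are illustrative, not values taken from the commit:

package main

import (
    "fmt"
    "strconv"
    "strings"
)

// parsePSR extracts the processor id from `ps -o psr -p <pid>` output,
// which looks like "PSR\n  4\n".
func parsePSR(out string) (int, error) {
    out = strings.ReplaceAll(out, "PSR", "")
    return strconv.Atoi(strings.TrimSpace(out))
}

func main() {
    sample := "PSR\n  4\n" // hypothetical exec output
    cpu, err := parsePSR(sample)
    if err != nil {
        panic(err)
    }
    firstSharedCPU := 4 // hypothetical first CPU of the container's shared set
    fmt.Printf("exec process ran on CPU %d; pinned as expected: %v\n", cpu, cpu == firstSharedCPU)
}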

File tree: 4 files changed (+226, -31 lines)


test/e2e/performanceprofile/functests/11_mixedcpus/mixedcpus.go

Lines changed: 132 additions & 0 deletions
@@ -4,6 +4,7 @@ import (
     "context"
     "encoding/json"
     "fmt"
+    "math"
     "strconv"
     "strings"
     "time"
@@ -510,6 +511,131 @@ var _ = Describe("Mixedcpus", Ordered, Label(string(label.MixedCPUs)), func() {
             })
         })
     })
+
+    Context("Check exec-cpu-affinity feature", func() {
+        When("exec-cpu-affinity is enabled (default in PP)", func() {
+            var workerRTNode *corev1.Node
+            var profile *performancev2.PerformanceProfile
+            var getter cgroup.ControllersGetter
+            var updatedShared, updatedIsolated cpuset.CPUSet
+
+            BeforeEach(func() {
+                By("Checking if exec-cpu-affinity is enabled by default in the profile")
+                profile, _ = profiles.GetByNodeLabels(testutils.NodeSelectorLabels)
+                Expect(profile).ToNot(BeNil(), "Failed to get performance profile")
+                if profile.Annotations != nil {
+                    val, ok := profile.Annotations[performancev2.PerformanceProfileDisableExecCPUAffinityAnnotation]
+                    if ok && val == "true" {
+                        // fail loudly because the default should be enabled
+                        Fail("exec-cpu-affinity is disabled in the profile")
+                    }
+                }
+
+                By("Updating performance profile to have enough shared cpus if needed")
+                updatedIsolated = *mustParse(string(*profile.Spec.CPU.Isolated))
+                currentShared := mustParse(string(*profile.Spec.CPU.Shared))
+                if len(currentShared.List()) < 2 {
+                    testlog.Info("shared cpuset has less than 2 cpus; this test requires at least 2 shared cpus; updating the profile")
+                    isolated := mustParse(string(*profile.Spec.CPU.Isolated))
+
+                    // we need 4 isolated and shared CPUs in total:
+                    // 1 as a buffer for the node's base load
+                    // 1 for the test gu pod request
+                    // 2 as shared cpus
+                    leastIsolatedCpus := 3
+                    if len(currentShared.List()) == 0 {
+                        leastIsolatedCpus = 4
+                    }
+                    if len(isolated.List()) < leastIsolatedCpus {
+                        Skip(fmt.Sprintf("isolated cpuset has less than %d cpus; this test requires at least %d isolated cpus", leastIsolatedCpus, leastIsolatedCpus))
+                    }
+
+                    updatedShared = cpuset.New(isolated.List()[0], isolated.List()[1])
+                    updatedIsolated = cpuset.New(isolated.List()[2:]...)
+
+                    if len(currentShared.List()) == 1 {
+                        updatedShared = cpuset.New(currentShared.List()[0], isolated.List()[0])
+                        updatedIsolated = cpuset.New(isolated.List()[1:]...)
+                    }
+
+                    testlog.Infof("shared cpu ids to be updated are: %q", updatedShared.String())
+                    profile.Spec.CPU.Isolated = cpuSetToPerformanceCPUSet(&updatedIsolated)
+                    profile.Spec.CPU.Shared = cpuSetToPerformanceCPUSet(&updatedShared)
+                    profile.Spec.WorkloadHints.MixedCpus = ptr.To(true) // if not already
+
+                    profiles.UpdateWithRetry(profile)
+
+                    poolName := poolname.GetByProfile(context.TODO(), profile)
+                    By(fmt.Sprintf("Applying changes in performance profile and waiting until %s starts updating", poolName))
+                    profilesupdate.WaitForTuningUpdating(context.TODO(), profile)
+                    By(fmt.Sprintf("Waiting until %s finishes updating", poolName))
+                    profilesupdate.WaitForTuningUpdated(context.TODO(), profile)
+                }
+
+                workerRTNodes, err := nodes.GetByLabels(testutils.NodeSelectorLabels)
+                Expect(err).ToNot(HaveOccurred())
+                workerRTNodes, err = nodes.MatchingOptionalSelector(workerRTNodes)
+                Expect(err).ToNot(HaveOccurred(), fmt.Sprintf("error looking for the optional selector: %v", err))
+                Expect(workerRTNodes).ToNot(BeEmpty())
+                workerRTNode = &workerRTNodes[0]
+
+                getter, err = cgroup.BuildGetter(ctx, testclient.DataPlaneClient, testclient.K8sClient)
+                Expect(err).ToNot(HaveOccurred())
+            })
+
+            It("should pin the exec process to the first shared CPU of the container - guaranteed pod", func() {
+                By("Creating a guaranteed test pod with a shared CPU request")
+                rl := &corev1.ResourceList{
+                    corev1.ResourceCPU:    resource.MustParse("1"),
+                    corev1.ResourceMemory: resource.MustParse("100Mi"),
+                    sharedCpusResource:    resource.MustParse("1"),
+                }
+                testPod := makePod(ctx, testclient.DataPlaneClient, testutils.NamespaceTesting,
+                    withRequests(rl),
+                    withLimits(rl),
+                    onNode(workerRTNode.Name),
+                    withRuntime(components.GetComponentName(profile.Name, components.ComponentNamePrefix)))
+
+                Expect(testclient.Client.Create(ctx, testPod)).To(Succeed(), "Failed to create test pod")
+                testPod, err := pods.WaitForCondition(ctx, client.ObjectKeyFromObject(testPod), corev1.PodReady, corev1.ConditionTrue, 5*time.Minute)
+                Expect(err).ToNot(HaveOccurred())
+                defer func() {
+                    if testPod != nil {
+                        testlog.Infof("deleting pod %q", testPod.Name)
+                        Expect(pods.Delete(ctx, testPod)).To(BeTrue(), "Failed to delete pod")
+                    }
+                }()
+
+                By("Prepare comparable data")
+                cpusetCfg := &controller.CpuSet{}
+                Expect(getter.Container(ctx, testPod, testPod.Spec.Containers[0].Name, cpusetCfg)).To(Succeed(), "Failed to get cpuset config for test pod")
+
+                cpusIncludingShared, err := cpuset.Parse(cpusetCfg.Cpus)
+                Expect(err).ToNot(HaveOccurred(), "Failed to parse cpuset config for test pod cpus=%q", cpusetCfg.Cpus)
+                testlog.Infof("cpus including shared: %s", cpusIncludingShared.String())
+                cntShared := cpusIncludingShared.Difference(updatedIsolated)
+                firstSharedCPU := cntShared.List()[0]
+                testlog.Infof("first shared CPU: %d; all shared CPUs: %s", firstSharedCPU, cntShared.String())
+
+                retries := int(math.Ceil(float64(10) / float64(cntShared.Size())))
+                By("Run exec command on the pod and verify the process is pinned to the first shared CPU")
+
+                for i := 0; i < retries; i++ {
+                    cmd := []string{"/bin/bash", "-c", "sleep 10 & SLPID=$!; ps -o psr -p $SLPID;"}
+                    output, err := pods.ExecCommandOnPod(testclient.K8sClient, testPod, testPod.Spec.Containers[0].Name, cmd)
+                    Expect(err).ToNot(HaveOccurred(), "Failed to exec command on the pod; retry %d", i)
+                    strout := string(output)
+                    testlog.Infof("retry %d exec command output: %s", i, strout)
+
+                    strout = strings.ReplaceAll(strout, "PSR", "")
+                    execProcessCPUs := strings.TrimSpace(strout)
+                    testlog.Infof("exec process CPU: %s", execProcessCPUs)
+                    Expect(execProcessCPUs).ToNot(BeEmpty(), "Failed to get exec process CPU; retry %d", i)
+                    Expect(execProcessCPUs).To(Equal(strconv.Itoa(firstSharedCPU)), "Exec process CPU is not the first shared CPU; retry %d", i)
+                }
+            })
+        })
+    })
 })

 func setup(ctx context.Context) func(ctx2 context.Context) {
@@ -659,6 +785,12 @@ func withRuntime(name string) func(p *corev1.Pod) {
     }
 }

+func onNode(nodeName string) func(p *corev1.Pod) {
+    return func(p *corev1.Pod) {
+        p.Spec.NodeName = nodeName
+    }
+}
+
 func getTestingNamespace() corev1.Namespace {
     return *namespaces.TestingNamespace
 }
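
The retry loop above is sized so that roughly ten exec attempts are spread over the CPUs the process could legally land on: retries = ceil(10 / n), where n is the size of the container's shared set here (and the CPU request in the cpu_management variant below). A tiny sketch of that arithmetic; the CPU counts are example values, not taken from the commit:

package main

import (
    "fmt"
    "math"
)

func main() {
    // mirrors retries := int(math.Ceil(float64(10) / float64(n))) from the tests
    for _, n := range []int{1, 2, 4} {
        retries := int(math.Ceil(float64(10) / float64(n)))
        fmt.Printf("candidate CPUs: %d -> retries: %d\n", n, retries)
    }
}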

test/e2e/performanceprofile/functests/1_performance/cpu_management.go

Lines changed: 70 additions & 30 deletions
@@ -4,6 +4,7 @@ import (
     "context"
     "encoding/json"
     "fmt"
+    "math"
     "os"
     "regexp"
     "strconv"
@@ -12,7 +13,6 @@ import (

     appsv1 "k8s.io/api/apps/v1"
     corev1 "k8s.io/api/core/v1"
-    "k8s.io/apimachinery/pkg/api/errors"
     "k8s.io/apimachinery/pkg/api/resource"
     metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
     "k8s.io/apimachinery/pkg/labels"
@@ -248,7 +248,7 @@ var _ = Describe("[rfe_id:27363][performance] CPU Management", Ordered, func() {
     })

     AfterEach(func() {
-        deleteTestPod(context.TODO(), testpod)
+        Expect(pods.Delete(context.TODO(), testpod)).To(BeTrue(), "Failed to delete pod")
     })

     DescribeTable("Verify CPU usage by stress PODs", func(ctx context.Context, guaranteed bool) {
@@ -339,7 +339,7 @@ var _ = Describe("[rfe_id:27363][performance] CPU Management", Ordered, func() {
         Expect(err).ToNot(HaveOccurred())
     })
     AfterEach(func() {
-        deleteTestPod(context.TODO(), testpod)
+        Expect(pods.Delete(context.TODO(), testpod)).To(BeTrue(), "Failed to delete pod")
     })
     When("kubelet is restart", func() {
         It("[test_id: 73501] defaultCpuset should not change", func() {
@@ -422,7 +422,7 @@ var _ = Describe("[rfe_id:27363][performance] CPU Management", Ordered, func() {

     AfterEach(func() {
         if testpod != nil {
-            deleteTestPod(context.TODO(), testpod)
+            Expect(pods.Delete(context.TODO(), testpod)).To(BeTrue(), "Failed to delete pod")
         }
     })

@@ -481,7 +481,7 @@ var _ = Describe("[rfe_id:27363][performance] CPU Management", Ordered, func() {
         fmt.Sprintf("IRQ still active on CPU%s", psr))

     By("Checking that after removing POD default smp affinity is returned back to all active CPUs")
-    deleteTestPod(context.TODO(), testpod)
+    Expect(pods.Delete(context.TODO(), testpod)).To(BeTrue(), "Failed to delete pod")
     defaultSmpAffinitySet, err = nodes.GetDefaultSmpAffinitySet(context.TODO(), workerRTNode)
     Expect(err).ToNot(HaveOccurred())

@@ -580,7 +580,7 @@ var _ = Describe("[rfe_id:27363][performance] CPU Management", Ordered, func() {
         if testpod == nil {
             return
         }
-        deleteTestPod(context.TODO(), testpod)
+        Expect(pods.Delete(context.TODO(), testpod)).To(BeTrue(), "Failed to delete pod")
     })

     It("[test_id:49149] should reject pods which request integral CPUs not aligned with machine SMT level", func() {
@@ -633,7 +633,7 @@ var _ = Describe("[rfe_id:27363][performance] CPU Management", Ordered, func() {
         if testpod == nil {
             return
         }
-        deleteTestPod(context.TODO(), testpod)
+        Expect(pods.Delete(context.TODO(), testpod)).To(BeTrue(), "Failed to delete pod")
     })

     DescribeTable("Verify Hyper-Thread aware scheduling for guaranteed pods",
@@ -680,7 +680,7 @@ var _ = Describe("[rfe_id:27363][performance] CPU Management", Ordered, func() {
         testpod = startHTtestPod(ctx, cpuCount)
         Expect(checkPodHTSiblings(ctx, testpod)).To(BeTrue(), "Pod cpu set does not map to host cpu sibling pairs")
         By("Deleting test pod...")
-        deleteTestPod(ctx, testpod)
+        Expect(pods.Delete(ctx, testpod)).To(BeTrue(), "Failed to delete pod")
     }
 },

@@ -983,7 +983,7 @@ var _ = Describe("[rfe_id:27363][performance] CPU Management", Ordered, func() {
     defer func() {
         if guaranteedPod != nil {
             testlog.Infof("deleting pod %q", guaranteedPod.Name)
-            deleteTestPod(ctx, guaranteedPod)
+            Expect(pods.Delete(ctx, guaranteedPod)).To(BeTrue(), "Failed to delete guaranteed pod")
         }
     }()

@@ -1014,7 +1014,7 @@ var _ = Describe("[rfe_id:27363][performance] CPU Management", Ordered, func() {
     defer func() {
         if bestEffortPod != nil {
             testlog.Infof("deleting pod %q", bestEffortPod.Name)
-            deleteTestPod(ctx, bestEffortPod)
+            Expect(pods.Delete(ctx, bestEffortPod)).To(BeTrue(), "Failed to delete best-effort pod")
         }
     }()

@@ -1142,15 +1142,73 @@ var _ = Describe("[rfe_id:27363][performance] CPU Management", Ordered, func() {
             defer func() {
                 if guPod != nil {
                     testlog.Infof("deleting pod %q", guPod.Name)
-                    deleteTestPod(ctx, guPod)
+                    Expect(pods.Delete(ctx, guPod)).To(BeTrue(), "Failed to delete guaranteed pod")
                 }
                 if buPod != nil {
                     testlog.Infof("deleting pod %q", buPod.Name)
-                    deleteTestPod(ctx, buPod)
+                    Expect(pods.Delete(ctx, buPod)).To(BeTrue(), "Failed to delete burstable pod")
                 }
             }()
         })
     })
+
+    Context("Check exec-cpu-affinity feature", func() {
+        When("exec-cpu-affinity is enabled (default in PP)", func() {
+            // the shared-cpus case is covered in the 11_mixedcpus suite
+            BeforeEach(func() {
+                By("Checking if exec-cpu-affinity is enabled by default in the profile")
+                profile, _ := profiles.GetByNodeLabels(testutils.NodeSelectorLabels)
+                Expect(profile).ToNot(BeNil(), "Failed to get performance profile")
+                if profile.Annotations != nil {
+                    val, ok := profile.Annotations[performancev2.PerformanceProfileDisableExecCPUAffinityAnnotation]
+                    if ok && val == "true" {
+                        // fail loudly because the default should be enabled
+                        Fail("exec-cpu-affinity is disabled in the profile")
+                    }
+                }
+            })
+
+            It("should pin the exec process to the first CPU dedicated to the container - guaranteed pod", func() {
+                By("Creating a guaranteed test pod")
+                testPod := makePod(ctx, workerRTNode, true)
+                Expect(testclient.Client.Create(ctx, testPod)).To(Succeed(), "Failed to create test pod")
+                testPod, err = pods.WaitForCondition(ctx, client.ObjectKeyFromObject(testPod), corev1.PodReady, corev1.ConditionTrue, 5*time.Minute)
+                Expect(err).ToNot(HaveOccurred())
+                defer func() {
+                    if testPod != nil {
+                        testlog.Infof("deleting pod %q", testPod.Name)
+                        Expect(pods.Delete(ctx, testPod)).To(BeTrue(), "Failed to delete test pod")
+                    }
+                }()

+                cpusetCfg := &controller.CpuSet{}
+                Expect(getter.Container(ctx, testPod, testPod.Spec.Containers[0].Name, cpusetCfg)).To(Succeed(), "Failed to get cpuset config for test pod")
+
+                cpusList := strings.Split(cpusetCfg.Cpus, ",")
+                Expect(cpusList).ToNot(BeEmpty())
+                // assumes no shared cpus are configured in this suite; the first element may use range notation (e.g. "4-7")
+                firstExclusiveCPU := strings.TrimSpace(strings.Split(cpusList[0], "-")[0])
+                testlog.Infof("first exclusive CPU: %s, all exclusive CPUs: %s", firstExclusiveCPU, strings.Join(cpusList, ","))
+
+                cpuRequest := testPod.Spec.Containers[0].Resources.Requests.Name(corev1.ResourceCPU, resource.DecimalSI).Value()
+                retries := int(math.Ceil(float64(10) / float64(cpuRequest)))
+                By("Run exec command on the pod and verify the process is pinned to the first exclusive CPU")
+
+                for i := 0; i < retries; i++ {
+                    cmd := []string{"/bin/bash", "-c", "sleep 10 & SLPID=$!; ps -o psr -p $SLPID;"}
+                    output, err := pods.ExecCommandOnPod(testclient.K8sClient, testPod, testPod.Spec.Containers[0].Name, cmd)
+                    Expect(err).ToNot(HaveOccurred(), "Failed to exec command on the pod; retry %d", i)
+                    strout := string(output)
+                    testlog.Infof("retry %d exec command output: %s", i, strout)
+
+                    strout = strings.ReplaceAll(strout, "PSR", "")
+                    execProcessCPUs := strings.TrimSpace(strout)
+                    Expect(execProcessCPUs).ToNot(BeEmpty(), "Failed to get exec process CPU; retry %d", i)
+                    Expect(execProcessCPUs).To(Equal(firstExclusiveCPU), "Exec process CPU is not the first exclusive CPU; retry %d", i)
+                }
+            })
+        })
+    })
 })

 func extractConfigInfo(output string) (*ContainerConfig, error) {
@@ -1401,24 +1459,6 @@ func getTestPodWithAnnotations(annotations map[string]string, cpus int) *corev1.
     return testpod
 }

-func deleteTestPod(ctx context.Context, testpod *corev1.Pod) (types.UID, bool) {
-    // it possible that the pod already was deleted as part of the test, in this case we want to skip teardown
-    err := testclient.DataPlaneClient.Get(ctx, client.ObjectKeyFromObject(testpod), testpod)
-    if errors.IsNotFound(err) {
-        return "", false
-    }
-
-    testpodUID := testpod.UID
-
-    err = testclient.DataPlaneClient.Delete(ctx, testpod)
-    Expect(err).ToNot(HaveOccurred())
-
-    err = pods.WaitForDeletion(ctx, testpod, pods.DefaultDeletionTimeout*time.Second)
-    Expect(err).ToNot(HaveOccurred())
-
-    return testpodUID, true
-}
-
 func cpuSpecToString(cpus *performancev2.CPU) (string, error) {
     if cpus == nil {
         return "", fmt.Errorf("performance CPU field is nil")

test/e2e/performanceprofile/functests/1_performance/irqbalance.go

Lines changed: 1 addition & 1 deletion
@@ -213,7 +213,7 @@ var _ = Describe("[performance] Checking IRQBalance settings", Ordered, func() {
         defer func() {
             if testpod != nil {
                 testlog.Infof("deleting pod %q", testpod.Name)
-                deleteTestPod(context.TODO(), testpod)
+                Expect(pods.Delete(context.TODO(), testpod)).To(BeTrue(), "Failed to delete pod")
             }
             bannedCPUs, err := getIrqBalanceBannedCPUs(context.TODO(), targetNode)
             Expect(err).ToNot(HaveOccurred(), "failed to extract the banned CPUs from node %q", targetNode.Name)

test/e2e/performanceprofile/functests/utils/pods/pods.go

Lines changed: 23 additions & 0 deletions
@@ -20,6 +20,7 @@ import (
     "k8s.io/client-go/kubernetes"
     "k8s.io/client-go/kubernetes/scheme"
     "k8s.io/client-go/tools/remotecommand"
+    "k8s.io/klog/v2"
     "sigs.k8s.io/controller-runtime/pkg/client"

     testclient "github.com/openshift/cluster-node-tuning-operator/test/e2e/performanceprofile/functests/utils/client"
@@ -51,6 +52,28 @@ func GetTestPod() *corev1.Pod {
     }
 }

+func Delete(ctx context.Context, pod *corev1.Pod) bool {
+    err := testclient.DataPlaneClient.Get(ctx, client.ObjectKeyFromObject(pod), pod)
+    if errors.IsNotFound(err) {
+        klog.InfoS("pod already deleted", "namespace", pod.Namespace, "name", pod.Name)
+        return true
+    }
+
+    err = testclient.DataPlaneClient.Delete(ctx, pod)
+    if err != nil {
+        klog.ErrorS(err, "failed to delete pod", "namespace", pod.Namespace, "name", pod.Name)
+        return false
+    }
+
+    err = WaitForDeletion(ctx, pod, DefaultDeletionTimeout*time.Second)
+    if err != nil {
+        klog.ErrorS(err, "failed to wait for pod deletion", "namespace", pod.Namespace, "name", pod.Name)
+        return false
+    }
+
+    return true
+}
+
 // WaitForDeletion waits until the pod will be removed from the cluster
 func WaitForDeletion(ctx context.Context, pod *corev1.Pod, timeout time.Duration) error {
     return wait.PollUntilContextTimeout(ctx, time.Second, timeout, true, func(ctx context.Context) (bool, error) {
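
The new pods.Delete helper replaces the suite-local deleteTestPod, and every call site above follows the same teardown pattern. A short usage sketch in the style of the updated suites; the testpod variable and the surrounding suite wiring are assumed:

package example

import (
    "context"

    . "github.com/onsi/ginkgo/v2"
    . "github.com/onsi/gomega"
    corev1 "k8s.io/api/core/v1"

    "github.com/openshift/cluster-node-tuning-operator/test/e2e/performanceprofile/functests/utils/pods"
)

var testpod *corev1.Pod // assumed to be created by the spec under test

var _ = AfterEach(func() {
    if testpod != nil {
        // Delete returns false on any failure, so the assertion turns a leaked pod into a spec failure
        Expect(pods.Delete(context.TODO(), testpod)).To(BeTrue(), "Failed to delete pod")
    }
})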
