diff --git a/api/v1alpha1/deviceconfig_types.go b/api/v1alpha1/deviceconfig_types.go index 94a4d6a2..b6f186c0 100644 --- a/api/v1alpha1/deviceconfig_types.go +++ b/api/v1alpha1/deviceconfig_types.go @@ -94,7 +94,9 @@ type DriverSpec struct { // +kubebuilder:default=true Enable *bool `json:"enable,omitempty"` - // blacklist amdgpu drivers on the host + // blacklist amdgpu drivers on the host. Node reboot is required to apply the blacklist on the worker nodes. + // Not working for OpenShift cluster. OpenShift users please use the Machine Config Operator (MCO) resource to configure amdgpu blacklist. + // Example MCO resource is available at https://instinct.docs.amd.com/projects/gpu-operator/en/latest/installation/openshift-olm.html#create-blacklist-for-installing-out-of-tree-kernel-module //+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="BlacklistDrivers",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:blacklistDrivers"} Blacklist *bool `json:"blacklist,omitempty"` @@ -117,7 +119,7 @@ type DriverSpec struct { // example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3 //+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="Image",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:image"} // +optional - // +kubebuilder:validation:Pattern=`^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$` + // +kubebuilder:validation:Pattern=`^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$` Image string `json:"image,omitempty"` // driver image registry TLS setting for the container image @@ -251,12 +253,11 @@ type DevicePluginSpec struct { // +optional DevicePluginTolerations []v1.Toleration `json:"devicePluginTolerations,omitempty"` - // resource naming strategy for device plugin - 
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="ResourceNamingStrategy",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:ResourceNamingStrategy"} - // +kubebuilder:validation:Enum=single;mixed - // +kubebuilder:default:="single" + // device plugin arguments is used to pass supported flags and their values while starting device plugin daemonset + // supported flag values: {"resource_naming_strategy": {"single", "mixed"}} + //+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="DevicePluginArguments",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:devicePluginArguments"} // +optional - ResourceNamingStrategy string `json:"resourceNamingStrategy,omitempty"` + DevicePluginArguments map[string]string `json:"devicePluginArguments,omitempty"` // node labeller image //+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="NodeLabellerImage",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:nodeLabellerImage"} diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index bbbd03c0..c2be36c9 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -267,6 +267,13 @@ func (in *DevicePluginSpec) DeepCopyInto(out *DevicePluginSpec) { (*in)[i].DeepCopyInto(&(*out)[i]) } } + if in.DevicePluginArguments != nil { + in, out := &in.DevicePluginArguments, &out.DevicePluginArguments + *out = make(map[string]string, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } if in.NodeLabellerTolerations != nil { in, out := &in.NodeLabellerTolerations, &out.NodeLabellerTolerations *out = make([]v1.Toleration, len(*in)) diff --git a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml index 495f3670..134634b9 100644 --- a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml +++ b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml @@ -29,12 +29,30 @@ 
metadata: } } ] - capabilities: Basic Install - createdAt: "2025-03-20T06:06:57Z" + capabilities: Seamless Upgrades + categories: AI/Machine Learning,Monitoring + containerImage: docker.io/rocm/gpu-operator:v1.2.0 + createdAt: "2025-04-02T23:22:18Z" + description: |- + Operator responsible for deploying AMD GPU kernel drivers, device plugin, device test runner and device metrics exporter + For more information, visit [documentation](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/) + devicePluginImage: docker.io/rocm/k8s-device-plugin:rhubi-latest + features.operators.openshift.io/disconnected: "true" + features.operators.openshift.io/fips-compliant: "false" + features.operators.openshift.io/proxy-aware: "true" + features.operators.openshift.io/tls-profiles: "false" + features.operators.openshift.io/token-auth-aws: "false" + features.operators.openshift.io/token-auth-azure: "false" + features.operators.openshift.io/token-auth-gcp: "false" + metricsExporterImage: docker.io/rocm/device-metrics-exporter:v1.2.0 + nodelabellerImage: docker.io/rocm/k8s-device-plugin:labeller-rhubi-latest + operatorframework.io/cluster-monitoring: "true" operatorframework.io/suggested-namespace: openshift-amd-gpu + operators.openshift.io/valid-subscription: '[]' operators.operatorframework.io/builder: operator-sdk-v1.32.0 operators.operatorframework.io/project_layout: go.kubebuilder.io/v3 repository: https://github.com/ROCm/gpu-operator + support: Advanced Micro Devices, Inc. 
name: amd-gpu-operator.v1.2.0 namespace: placeholder spec: @@ -152,6 +170,13 @@ spec: path: devicePlugin x-descriptors: - urn:alm:descriptor:com.amd.deviceconfigs:devicePlugin + - description: 'device plugin arguments is used to pass supported flags and + their values while starting device plugin daemonset supported flag values: + {"resource_naming_strategy": {"single", "mixed"}}' + displayName: DevicePluginArguments + path: devicePlugin.devicePluginArguments + x-descriptors: + - urn:alm:descriptor:com.amd.deviceconfigs:devicePluginArguments - description: device plugin image displayName: DevicePluginImage path: devicePlugin.devicePluginImage @@ -192,11 +217,6 @@ spec: path: devicePlugin.nodeLabellerTolerations x-descriptors: - urn:alm:descriptor:com.amd.deviceconfigs:nodeLabellerTolerations - - description: resource naming strategy for device plugin - displayName: ResourceNamingStrategy - path: devicePlugin.resourceNamingStrategy - x-descriptors: - - urn:alm:descriptor:com.amd.deviceconfigs:ResourceNamingStrategy - description: upgrade policy for device plugin and node labeller daemons displayName: UpgradePolicy path: devicePlugin.upgradePolicy @@ -227,7 +247,10 @@ spec: path: driver.amdgpuInstallerRepoURL x-descriptors: - urn:alm:descriptor:com.amd.deviceconfigs:amdgpuInstallerRepoURL - - description: blacklist amdgpu drivers on the host + - description: blacklist amdgpu drivers on the host. Node reboot is required + to apply the blacklist on the worker nodes. Not working for OpenShift cluster. + OpenShift users please use the Machine Config Operator (MCO) resource to + configure amdgpu blacklist. 
Example MCO resource is available at https://instinct.docs.amd.com/projects/gpu-operator/en/latest/installation/openshift-olm.html#create-blacklist-for-installing-out-of-tree-kernel-module displayName: BlacklistDrivers path: driver.blacklist x-descriptors: @@ -606,7 +629,7 @@ spec: - urn:alm:descriptor:com.amd.deviceconfigs:nodeModuleStatus version: v1alpha1 description: |- - Operator responsible for deploying AMD GPU kernel drivers and device plugin + Operator responsible for deploying AMD GPU kernel drivers, device plugin, device test runner and device metrics exporter For more information, visit [documentation](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/) displayName: amd-gpu-operator icon: @@ -1110,11 +1133,24 @@ spec: - supported: true type: AllNamespaces keywords: - - amd-gpu-operator + - AMD + - GPU + - AI + - Deep Learning + - Hardware + - Driver + - Monitoring links: - - name: Amd Gpu Operator - url: https://amd-gpu-operator.domain - maturity: alpha + - name: AMD GPU Operator + url: https://github.com/ROCm/gpu-operator + maintainers: + - email: Yan.Sun3@amd.com + name: Yan Sun + - email: farshad.ghodsian@amd.com + name: Farshad Ghodsian + - email: shrey.ajmera@amd.com + name: Shrey Ajmera + maturity: stable provider: - name: amd-gpu-operator + name: Advanced Micro Devices, Inc. 
version: 1.2.0 diff --git a/bundle/manifests/amd.com_deviceconfigs.yaml b/bundle/manifests/amd.com_deviceconfigs.yaml index a0476c71..d2669dc1 100644 --- a/bundle/manifests/amd.com_deviceconfigs.yaml +++ b/bundle/manifests/amd.com_deviceconfigs.yaml @@ -190,6 +190,13 @@ spec: devicePlugin: description: device plugin properties: + devicePluginArguments: + additionalProperties: + type: string + description: |- + device plugin arguments is used to pass supported flags and their values while starting device plugin daemonset + supported flag values: {"resource_naming_strategy": {"single", "mixed"}} + type: object devicePluginImage: description: device plugin image pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ @@ -306,13 +313,6 @@ spec: type: string type: object type: array - resourceNamingStrategy: - default: single - description: resource naming strategy for device plugin - enum: - - single - - mixed - type: string upgradePolicy: description: upgrade policy for device plugin and node labeller daemons @@ -342,7 +342,10 @@ spec: installer URL is https://repo.radeon.com/amdgpu-install by default type: string blacklist: - description: blacklist amdgpu drivers on the host + description: |- + blacklist amdgpu drivers on the host. Node reboot is required to apply the baclklist on the worker nodes. + Not working for OpenShift cluster. OpenShift users please use the Machine Config Operator (MCO) resource to configure amdgpu blacklist. 
+ Example MCO resource is available at https://instinct.docs.amd.com/projects/gpu-operator/en/latest/installation/openshift-olm.html#create-blacklist-for-installing-out-of-tree-kernel-module type: boolean enable: default: true @@ -357,7 +360,7 @@ spec: for OpenShift the default value is image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod image tag will be in the format of --- example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3 - pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ + pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ type: string imageRegistrySecret: description: secrets used for pull/push images from/to private diff --git a/cmd/main.go b/cmd/main.go index b3c985ff..168a730d 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -35,9 +35,6 @@ package main import ( "flag" - "github.com/ROCm/gpu-operator/internal/configmanager" - "github.com/ROCm/gpu-operator/internal/metricsexporter" - "github.com/ROCm/gpu-operator/internal/testrunner" kmmv1beta1 "github.com/rh-ecosystem-edge/kernel-module-management/api/v1beta1" "k8s.io/apimachinery/pkg/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime" @@ -51,11 +48,15 @@ import ( _ "k8s.io/client-go/plugin/pkg/client/auth" gpuev1alpha1 "github.com/ROCm/gpu-operator/api/v1alpha1" + utils "github.com/ROCm/gpu-operator/internal" "github.com/ROCm/gpu-operator/internal/cmd" "github.com/ROCm/gpu-operator/internal/config" + "github.com/ROCm/gpu-operator/internal/configmanager" "github.com/ROCm/gpu-operator/internal/controllers" "github.com/ROCm/gpu-operator/internal/kmmmodule" + "github.com/ROCm/gpu-operator/internal/metricsexporter" "github.com/ROCm/gpu-operator/internal/nodelabeller" + "github.com/ROCm/gpu-operator/internal/testrunner" //+kubebuilder:scaffold:imports ) @@ -107,8 
+108,9 @@ func main() { } client := mgr.GetClient() - kmmHandler := kmmmodule.NewKMMModule(client, scheme) - nlHandler := nodelabeller.NewNodeLabeller(scheme) + isOpenShift := utils.IsOpenShift(setupLogger) + kmmHandler := kmmmodule.NewKMMModule(client, scheme, isOpenShift) + nlHandler := nodelabeller.NewNodeLabeller(scheme, isOpenShift) metricsHandler := metricsexporter.NewMetricsExporter(scheme) testrunnerHandler := testrunner.NewTestRunner(scheme) configmanagerHandler := configmanager.NewConfigManager(scheme) diff --git a/config/crd/bases/amd.com_deviceconfigs.yaml b/config/crd/bases/amd.com_deviceconfigs.yaml index 5f7a02bd..7916a7e6 100644 --- a/config/crd/bases/amd.com_deviceconfigs.yaml +++ b/config/crd/bases/amd.com_deviceconfigs.yaml @@ -186,6 +186,13 @@ spec: devicePlugin: description: device plugin properties: + devicePluginArguments: + additionalProperties: + type: string + description: |- + device plugin arguments is used to pass supported flags and their values while starting device plugin daemonset + supported flag values: {"resource_naming_strategy": {"single", "mixed"}} + type: object devicePluginImage: description: device plugin image pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ @@ -302,13 +309,6 @@ spec: type: string type: object type: array - resourceNamingStrategy: - default: single - description: resource naming strategy for device plugin - enum: - - single - - mixed - type: string upgradePolicy: description: upgrade policy for device plugin and node labeller daemons @@ -338,7 +338,10 @@ spec: installer URL is https://repo.radeon.com/amdgpu-install by default type: string blacklist: - description: blacklist amdgpu drivers on the host + description: |- + blacklist amdgpu drivers on the host. Node reboot is required to apply the baclklist on the worker nodes. + Not working for OpenShift cluster. 
OpenShift users please use the Machine Config Operator (MCO) resource to configure amdgpu blacklist. + Example MCO resource is available at https://instinct.docs.amd.com/projects/gpu-operator/en/latest/installation/openshift-olm.html#create-blacklist-for-installing-out-of-tree-kernel-module type: boolean enable: default: true @@ -353,7 +356,7 @@ spec: for OpenShift the default value is image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod image tag will be in the format of --- example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3 - pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ + pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ type: string imageRegistrySecret: description: secrets used for pull/push images from/to private diff --git a/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml b/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml index a5b6cd65..878483bd 100644 --- a/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml +++ b/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml @@ -3,9 +3,27 @@ kind: ClusterServiceVersion metadata: annotations: alm-examples: '[]' - capabilities: Basic Install + capabilities: Seamless Upgrades + categories: AI/Machine Learning,Monitoring + containerImage: docker.io/rocm/gpu-operator:v1.2.0 + description: |- + Operator responsible for deploying AMD GPU kernel drivers, device plugin, device test runner and device metrics exporter + For more information, visit [documentation](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/) + devicePluginImage: docker.io/rocm/k8s-device-plugin:rhubi-latest + features.operators.openshift.io/disconnected: "true" + features.operators.openshift.io/fips-compliant: "false" + 
features.operators.openshift.io/proxy-aware: "true" + features.operators.openshift.io/tls-profiles: "false" + features.operators.openshift.io/token-auth-aws: "false" + features.operators.openshift.io/token-auth-azure: "false" + features.operators.openshift.io/token-auth-gcp: "false" + metricsExporterImage: docker.io/rocm/device-metrics-exporter:v1.2.0 + nodelabellerImage: docker.io/rocm/k8s-device-plugin:labeller-rhubi-latest + operatorframework.io/cluster-monitoring: "true" operatorframework.io/suggested-namespace: openshift-amd-gpu + operators.openshift.io/valid-subscription: '[]' repository: https://github.com/ROCm/gpu-operator + support: Advanced Micro Devices, Inc. name: amd-gpu-operator.v0.0.0 namespace: placeholder spec: @@ -123,6 +141,13 @@ spec: path: devicePlugin x-descriptors: - urn:alm:descriptor:com.amd.deviceconfigs:devicePlugin + - description: 'device plugin arguments is used to pass supported flags and + their values while starting device plugin daemonset supported flag values: + {"resource_naming_strategy": {"single", "mixed"}}' + displayName: DevicePluginArguments + path: devicePlugin.devicePluginArguments + x-descriptors: + - urn:alm:descriptor:com.amd.deviceconfigs:devicePluginArguments - description: device plugin image displayName: DevicePluginImage path: devicePlugin.devicePluginImage @@ -163,11 +188,6 @@ spec: path: devicePlugin.nodeLabellerTolerations x-descriptors: - urn:alm:descriptor:com.amd.deviceconfigs:nodeLabellerTolerations - - description: resource naming strategy for device plugin - displayName: ResourceNamingStrategy - path: devicePlugin.resourceNamingStrategy - x-descriptors: - - urn:alm:descriptor:com.amd.deviceconfigs:ResourceNamingStrategy - description: upgrade policy for device plugin and node labeller daemons displayName: UpgradePolicy path: devicePlugin.upgradePolicy @@ -198,7 +218,10 @@ spec: path: driver.amdgpuInstallerRepoURL x-descriptors: - urn:alm:descriptor:com.amd.deviceconfigs:amdgpuInstallerRepoURL - - 
description: blacklist amdgpu drivers on the host + - description: blacklist amdgpu drivers on the host. Node reboot is required + to apply the blacklist on the worker nodes. Not working for OpenShift cluster. + OpenShift users please use the Machine Config Operator (MCO) resource to + configure amdgpu blacklist. Example MCO resource is available at https://instinct.docs.amd.com/projects/gpu-operator/en/latest/installation/openshift-olm.html#create-blacklist-for-installing-out-of-tree-kernel-module displayName: BlacklistDrivers path: driver.blacklist x-descriptors: @@ -577,7 +600,7 @@ spec: - urn:alm:descriptor:com.amd.deviceconfigs:nodeModuleStatus version: v1alpha1 description: |- - Operator responsible for deploying AMD GPU kernel drivers and device plugin + Operator responsible for deploying AMD GPU kernel drivers, device plugin, device test runner and device metrics exporter For more information, visit [documentation](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/) displayName: amd-gpu-operator icon: @@ -597,11 +620,24 @@ spec: - supported: true type: AllNamespaces keywords: - - amd-gpu-operator + - AMD + - GPU + - AI + - Deep Learning + - Hardware + - Driver + - Monitoring links: - - name: Amd Gpu Operator - url: https://amd-gpu-operator.domain - maturity: alpha + - name: AMD GPU Operator + url: https://github.com/ROCm/gpu-operator + maintainers: + - email: Yan.Sun3@amd.com + name: Yan Sun + - email: farshad.ghodsian@amd.com + name: Farshad Ghodsian + - email: shrey.ajmera@amd.com + name: Shrey Ajmera + maturity: stable provider: - name: amd-gpu-operator + name: Advanced Micro Devices, Inc. 
version: 0.0.0 diff --git a/docs/device_plugin/device-plugin.md b/docs/device_plugin/device-plugin.md new file mode 100644 index 00000000..4ecfb97b --- /dev/null +++ b/docs/device_plugin/device-plugin.md @@ -0,0 +1,112 @@ +# Device Plugin + +## Configure device plugin + +To start the Device Plugin along with the GPU Operator configure fields under the ``` spec/devicePlugin ``` field in deviceconfig Custom Resource(CR) + +```yaml + devicePlugin: + # Specify the device plugin image + # default value is rocm/k8s-device-plugin:latest + devicePluginImage: rocm/k8s-device-plugin:latest + + # The device plugin arguments is used to pass supported flags and their values while starting device plugin daemonset + devicePluginArguments: + resource_naming_strategy: single + + # Specify the node labeller image + # default value is rocm/k8s-device-plugin:labeller-latest + nodeLabellerImage: rocm/k8s-device-plugin:labeller-latest + + # Specify whether to bring up node labeller component + # default value is true + enableNodeLabeller: True + +``` + +The **device-plugin** pods start after updating the **DeviceConfig** CR + +```bash +#kubectl get pods -n kube-amd-gpu +NAME READY STATUS RESTARTS AGE +amd-gpu-operator-gpu-operator-charts-controller-manager-77tpmgn 1/1 Running 0 4h9m +amd-gpu-operator-kmm-controller-6d459dffcf-lbgtt 1/1 Running 0 4h9m +amd-gpu-operator-kmm-webhook-server-5fdc8b995-qgj49 1/1 Running 0 4h9m +amd-gpu-operator-node-feature-discovery-gc-78989c896-7lh8t 1/1 Running 0 3h48m +amd-gpu-operator-node-feature-discovery-master-b8bffc48b-6rnz6 1/1 Running 0 4h9m +amd-gpu-operator-node-feature-discovery-worker-m9lwn 1/1 Running 0 4h9m +test-deviceconfig-device-plugin-rk5f4 1/1 Running 0 134m +test-deviceconfig-node-labeller-bxk7x 1/1 Running 0 134m +``` + +
+Note: The Device Plugin name will be prefixed with the name of your DeviceConfig custom resource +

+ +## Device Plugin DeviceConfig +| Field Name | Details | +|----------------------------------|----------------------------------------------| +| **DevicePluginImage** | Device plugin image | +| **DevicePluginImagePullPolicy** | One of Always, Never, IfNotPresent. | +| **NodeLabellerImage** | Node labeller image | +| **NodeLabellerImagePullPolicy** | One of Always, Never, IfNotPresent. | +| **EnableNodeLabeller** | Enable/Disable node labeller with True/False | +| **DevicePluginArguments** | The flag/values to pass on to Device Plugin | +
+ +1. Both the `ImagePullPolicy` fields default to `Always` if `:latest` tag is specified on the respective Image, or defaults to `IfNotPresent` otherwise. This is default k8s behaviour for `ImagePullPolicy` + +2. `DevicePluginArguments` is of type `map[string]string`. Currently supported key value pairs to set under `DevicePluginArguments` are: + -> "resource_naming_strategy": {"single", "mixed"} + +## How to choose Resource Naming Strategy + +To customize the way device plugin reports gpu resources to kubernetes as allocatable k8s resources, use the `single` or `mixed` resource naming strategy in **DeviceConfig** CR +Before understanding each strategy, please note the definition of homogeneous and heterogeneous nodes + +Homogeneous node: A node whose gpu's follow the same compute-memory partition style + -> Example: A node of 8 GPU's where all 8 GPU's are following CPX-NPS4 partition style + +Heterogeneous node: A node whose gpu's follow different compute-memory partition styles + -> Example: A node of 8 GPU's where 5 GPU's are following SPX-NPS1 and 3 GPU's are following CPX-NPS1 + +### Single + +In `single` mode, the device plugin reports all gpu's (regardless of whether they are whole gpu's or partitions of a gpu) under the resource name `amd.com/gpu` +This mode is supported for homogeneous nodes but not supported for heterogeneous nodes + +A node which has 8 GPUs where all GPUs are not partitioned will report its resources as: + +```bash +amd.com/gpu: 8 +``` + +A node which has 8 GPUs where all GPUs are partitioned using CPX-NPS4 style will report its resources as: + +```bash +amd.com/gpu: 64 +``` + +### Mixed + +In `mixed` mode, the device plugin reports all gpu's under a name which matches its partition style. 
+This mode is supported for both homogeneous nodes and heterogeneous nodes + +A node which has 8 GPUs which are all partitioned using CPX-NPS4 style will report its resources as: + +```bash +amd.com/cpx_nps4: 64 +``` + +A node which has 8 GPUs where 5 GPU's are following SPX-NPS1 and 3 GPU's are following CPX-NPS1 will report its resources as: + +```bash +amd.com/spx_nps1: 5 +amd.com/cpx_nps1: 24 +``` + +#### **Notes** + +- If `resource_naming_strategy` is not passed using `DevicePluginArguments` field in CR, then device plugin will internally default to `single` resource naming strategy. This maintains backwards compatibility with earlier release of device plugin with reported resource name of `amd.com/gpu` +- If a node has GPUs which do not support partitioning, such as MI210, then the GPUs are reported under resource name `amd.com/gpu` regardless of the resource naming strategy +- These different naming styles of resources, for example, `amd.com/cpx_nps1` should be followed when requesting for resources in a pod spec \ No newline at end of file diff --git a/docs/drivers/installation.md b/docs/drivers/installation.md index 890da553..ead38e4d 100644 --- a/docs/drivers/installation.md +++ b/docs/drivers/installation.md @@ -18,12 +18,19 @@ Before installing the AMD GPU driver: Before installing the out-of-tree AMD GPU driver, you must blacklist the inbox AMD GPU driver: +- These commands need to either be run as `root` or by using `sudo` - Create blacklist configuration file on worker nodes: ```bash echo "blacklist amdgpu" > /etc/modprobe.d/blacklist-amdgpu.conf ``` +- After creating the blacklist configuration file, you need to rebuild the initramfs for the change to take effect: + +```bash +update-initramfs -u -k all +``` + - Reboot the worker node to apply the blacklist - Verify the blacklisting: diff --git a/docs/index.md b/docs/index.md index 3a8340ea..130064ba 100644 --- a/docs/index.md +++ b/docs/index.md @@ -13,8 +13,46 @@ The AMD GPU Operator simplifies the
deployment and management of AMD Instinct GP ## Compatibility -- **Kubernetes**: 1.29.0 -- Please refer to the [ROCm documentation](https://rocm.docs.amd.com/en/latest/compatibility/compatibility-matrix.html) for the compatibility matrix for the AMD GPU DKMS driver. +### Supported Hardware + +| **GPUs** | | +| --- | --- | +| AMD Instinct™ MI300X | ✅ Supported | +| AMD Instinct™ MI250 | ✅ Supported | +| AMD Instinct™ MI210 | ✅ Supported | + +### OS & Platform Support Matrix + +Below is a matrix of supported Operating systems and the corresponding Kubernetes version that have been validated to work. We will continue to add more Operating Systems and future versions of Kubernetes with each release of the AMD GPU Operator and Metrics Exporter. + + + + + + + + + + + + + + + + + + + + + + + + + + +
Operating SystemKubernetesRed Hat OpenShift
Ubuntu 22.04 LTS1.29—1.31
Ubuntu 24.04 LTS1.29—1.31
Red Hat Core OS (RHCOS)4.16—4.18
+ +Please refer to the [ROCM documentaiton](https://rocm.docs.amd.com/en/latest/compatibility/compatibility-matrix.html) for the compatability matrix for the AMD GPU DKMS driver. ## Prerequisites diff --git a/docs/installation/openshift-olm.md b/docs/installation/openshift-olm.md index 89625fc1..489644d9 100644 --- a/docs/installation/openshift-olm.md +++ b/docs/installation/openshift-olm.md @@ -204,6 +204,13 @@ spec: "feature.node.kubernetes.io/amd-gpu": "true" ``` +Things to note: +1. By default, there is no need to specify the image field in CR for Openshift. Default will be used which is: image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod + +2. If users specify image, $MOD_NAMESPACE can be a place holder , KMM Operator can automatically translate it to the namespace + +3. Openshift internal registry has image url restriction, OpenShift users cannot use image like `/` , it requires the image URL to be `//`. However, if any other registry is being used by the user, the image URL can be of either form. + The operator will: 1. Collect worker node system specifications diff --git a/docs/metrics/ecc-error-injection.md b/docs/metrics/ecc-error-injection.md new file mode 100644 index 00000000..f3f17926 --- /dev/null +++ b/docs/metrics/ecc-error-injection.md @@ -0,0 +1,199 @@ +## ECC Error Injection Testing + +The Metric Exporter has the capability to check for unhealthy GPUs via the monitoring of ECC Errors that can occur when a GPU is not functioning as expected. When an ECC error is detected the Metrics Exporter will now mark the offending GPU as unhealthy and add a node label to indicate which GPU on the node is unhealthy. The Kubernetes Device Plugin also listens to the health metrics coming from the Metrics Exporter to determine GPU status, marking GPUs as schedulable if healthy and unschedulable if unhealthy. 
+ +This health check workflow runs automatically on every node the Device Metrics Exporter is running on, with the Metrics Exporter polling GPUs every 30 seconds and the device plugin checking health status at the same interval, ensuring updates within one minute. Users can customize the default ECC error threshold (set to 0) via the `HealthThresholds` field in the metrics exporter ConfigMap. As part of this workflow healthy GPUs are made available for Kubernetes job scheduling, while ensuring no new jobs are scheduled on an unhealthy GPUs. + +## To do error injection follow these steps + +We have added a new `metricsclient` to the Device Metrics Exporter pod that can be used to inject ECC errors into an otherwise healthy GPU for testing the above health check workflow. This is fairly simple and don't worry this does not harm your GPU as any errors that are being injected are debugging in nature and not real errors. The steps to do this have been outlined below: + +### 1. Set Node Name + +Use an environment variable to set the Kubernetes node name to indicate which node you want to test error injection on: + +```bash +NODE_NAME= +``` + +Replace with the name of the node you want to test. If you are running this from the same node you want to test you can grab the hostname using: + +```bash +NODE_NAME=$(hostname) +``` + +### 2. Set Metrics Exporter Pod Name + +Since you have to execute the `metricsclient` from directly within the Device Metrics Exporter pod we need to get the Metrics Exporter pod name running on the node: + +```bash +METRICS_POD=$(kubectl get pods -n kube-amd-gpu --field-selector spec.nodeName=$NODE_NAME --no-headers -o custom-columns=":metadata.name" | grep '^gpu-operator-metrics-exporter-' | head -n 1) +``` + +### 3. 
Check Metrics Client to see GPU Health + +Now that you have the name of the metrics exporter pod you can use the metricsclient to check the current health of all GPUs on the node: + +```bash +kubectl exec -n kube-amd-gpu $METRICS_POD -c metrics-exporter-container -- metricsclient +``` + +You should see a list of all the GPUs on that node along with their corresponding status. In most cases all GPUs should report as being `healthy`. + +```bash +ID Health Associated Workload +------------------------------------------------ +1 healthy [] +0 healthy [] +7 healthy [] +6 healthy [] +5 healthy [] +4 healthy [] +3 healthy [] +2 healthy [] +------------------------------------------------ +``` + +### 4. Inject ECC Errors on GPU 0 + +In order to simulate errors on a GPU we will be using a json file that specifies a GPU ID along with counters for several ECC Uncorrectable error fields that are being monitored by the Device Metrics Exporter. In the below example you can see that we are specifying `GPU 0` and injecting 1 `GPU_ECC_UNCORRECT_SEM` error and 2 `GPU_ECC_UNCORRECT_FUSE` errors. We use the `metricslient -ecc-file-path ` command to specify the json file we want to inject into the metrics table. 
To create the json file and execute the metricsclient command all in in one go run the following: + +```bash +kubectl exec -n kube-amd-gpu $METRICS_POD -c metrics-exporter-container -- sh -c 'cat > /tmp/ecc.json < /tmp/delete_ecc.json <--- example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3 - pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ + pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ type: string imageRegistrySecret: description: secrets used for pull/push images from/to private registry diff --git a/helm-charts-openshift/Chart.lock b/helm-charts-openshift/Chart.lock index 6e3c4ccc..8eb0ba07 100644 --- a/helm-charts-openshift/Chart.lock +++ b/helm-charts-openshift/Chart.lock @@ -6,4 +6,4 @@ dependencies: repository: file://./charts/kmm version: v1.0.0 digest: sha256:25200c34a5cc846a1275e5bf3fc637b19e909dc68de938189c5278d77d03f5ac -generated: "2025-03-20T06:06:55.80187139Z" +generated: "2025-03-26T20:10:56.781691243Z" diff --git a/helm-charts-openshift/crds/deviceconfig-crd.yaml b/helm-charts-openshift/crds/deviceconfig-crd.yaml index 6058c151..81c564c1 100644 --- a/helm-charts-openshift/crds/deviceconfig-crd.yaml +++ b/helm-charts-openshift/crds/deviceconfig-crd.yaml @@ -194,6 +194,13 @@ spec: devicePlugin: description: device plugin properties: + devicePluginArguments: + additionalProperties: + type: string + description: |- + device plugin arguments is used to pass supported flags and their values while starting device plugin daemonset + supported flag values: {"resource_naming_strategy": {"single", "mixed"}} + type: object devicePluginImage: description: device plugin image pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ @@ -310,13 +317,6 @@ spec: type: 
string type: object type: array - resourceNamingStrategy: - default: single - description: resource naming strategy for device plugin - enum: - - single - - mixed - type: string upgradePolicy: description: upgrade policy for device plugin and node labeller daemons @@ -346,7 +346,10 @@ spec: installer URL is https://repo.radeon.com/amdgpu-install by default type: string blacklist: - description: blacklist amdgpu drivers on the host + description: |- + blacklist amdgpu drivers on the host. Node reboot is required to apply the baclklist on the worker nodes. + Not working for OpenShift cluster. OpenShift users please use the Machine Config Operator (MCO) resource to configure amdgpu blacklist. + Example MCO resource is available at https://instinct.docs.amd.com/projects/gpu-operator/en/latest/installation/openshift-olm.html#create-blacklist-for-installing-out-of-tree-kernel-module type: boolean enable: default: true @@ -361,7 +364,7 @@ spec: for OpenShift the default value is image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod image tag will be in the format of --- example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3 - pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ + pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ type: string imageRegistrySecret: description: secrets used for pull/push images from/to private registry diff --git a/internal/controllers/mock_upgrademgr.go b/internal/controllers/mock_upgrademgr.go index 7db0fa9c..03944030 100644 --- a/internal/controllers/mock_upgrademgr.go +++ b/internal/controllers/mock_upgrademgr.go @@ -216,6 +216,20 @@ func (mr *MockupgradeMgrHelperAPIMockRecorder) deleteRebootPod(ctx, nodeName, dc return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "deleteRebootPod", 
reflect.TypeOf((*MockupgradeMgrHelperAPI)(nil).deleteRebootPod), ctx, nodeName, dc, force, genId) } +// getBootID mocks base method. +func (m *MockupgradeMgrHelperAPI) getBootID(nodeName string) string { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "getBootID", nodeName) + ret0, _ := ret[0].(string) + return ret0 +} + +// getBootID indicates an expected call of getBootID. +func (mr *MockupgradeMgrHelperAPIMockRecorder) getBootID(nodeName any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "getBootID", reflect.TypeOf((*MockupgradeMgrHelperAPI)(nil).getBootID), nodeName) +} + // getNode mocks base method. func (m *MockupgradeMgrHelperAPI) getNode(ctx context.Context, nodeName string) (*v1.Node, error) { m.ctrl.T.Helper() @@ -465,6 +479,18 @@ func (mr *MockupgradeMgrHelperAPIMockRecorder) isUpgradePolicyViolated(upgradeIn return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "isUpgradePolicyViolated", reflect.TypeOf((*MockupgradeMgrHelperAPI)(nil).isUpgradePolicyViolated), upgradeInProgress, upgradeFailedState, totalNodes, deviceConfig) } +// setBootID mocks base method. +func (m *MockupgradeMgrHelperAPI) setBootID(nodeName, bootID string) { + m.ctrl.T.Helper() + m.ctrl.Call(m, "setBootID", nodeName, bootID) +} + +// setBootID indicates an expected call of setBootID. +func (mr *MockupgradeMgrHelperAPIMockRecorder) setBootID(nodeName, bootID any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "setBootID", reflect.TypeOf((*MockupgradeMgrHelperAPI)(nil).setBootID), nodeName, bootID) +} + // setNodeStatus mocks base method. 
func (m *MockupgradeMgrHelperAPI) setNodeStatus(ctx context.Context, nodeName string, status v1alpha1.UpgradeState) { m.ctrl.T.Helper() diff --git a/internal/controllers/upgrademgr.go b/internal/controllers/upgrademgr.go index c5f3fd6e..ad2ee41c 100644 --- a/internal/controllers/upgrademgr.go +++ b/internal/controllers/upgrademgr.go @@ -151,11 +151,6 @@ func (n *upgradeMgr) HandleUpgrade(ctx context.Context, deviceConfig *amdv1alpha // 1. Set init status for unprocessed nodes n.helper.handleInitStatus(ctx, &nodeList.Items[i]) - if !n.helper.isNodeReadyForUpgrade(ctx, &nodeList.Items[i]) { - res = ctrl.Result{Requeue: true, RequeueAfter: time.Second * 20} - continue - } - // 2. Handle failed nodes if n.helper.isNodeStateUpgradeFailed(ctx, &nodeList.Items[i], deviceConfig) { n.helper.clearUpgradeStartTime(nodeList.Items[i].Name) @@ -193,6 +188,11 @@ func (n *upgradeMgr) HandleUpgrade(ctx context.Context, deviceConfig *amdv1alpha continue } + if !n.helper.isNodeReadyForUpgrade(ctx, &nodeList.Items[i]) { + res = ctrl.Result{Requeue: true, RequeueAfter: time.Second * 20} + continue + } + //This node is a candidate for selection candidateNodes = append(candidateNodes, nodeList.Items[i]) } @@ -287,6 +287,8 @@ type upgradeMgrHelperAPI interface { setUpgradeStartTime(nodeName string) clearUpgradeStartTime(nodeName string) checkUpgradeTimeExceeded(ctx context.Context, nodeName string, deviceConfig *amdv1alpha1.DeviceConfig) bool + getBootID(nodeName string) string + setBootID(nodeName string, bootID string) clearNodeStatus() isInit() bool } @@ -297,6 +299,7 @@ type upgradeMgrHelper struct { drainHelper *drain.Helper nodeStatus *sync.Map nodeUpgradeStartTime *sync.Map + nodeBootID *sync.Map init bool currentSpec driverSpec } @@ -313,6 +316,7 @@ func newUpgradeMgrHelperHandler(client client.Client, k8sInterface kubernetes.In k8sInterface: k8sInterface, nodeStatus: new(sync.Map), nodeUpgradeStartTime: new(sync.Map), + nodeBootID: new(sync.Map), } } @@ -527,6 +531,18 @@ func (h 
*upgradeMgrHelper) checkUpgradeTimeExceeded(ctx context.Context, nodeNam return false } +func (h *upgradeMgrHelper) getBootID(nodeName string) string { + if value, ok := h.nodeBootID.Load(nodeName); ok { + return value.(string) + } + + return "" +} + +func (h *upgradeMgrHelper) setBootID(nodeName string, currentbootID string) { + h.nodeBootID.Store(nodeName, currentbootID) +} + func (h *upgradeMgrHelper) getNodeStatus(nodeName string) amdv1alpha1.UpgradeState { if value, ok := h.nodeStatus.Load(nodeName); ok { @@ -867,6 +883,8 @@ func (h *upgradeMgrHelper) handleNodeReboot(ctx context.Context, node *v1.Node, // Wait for the driver upgrade to complete waitForDriverUpgrade() + currentBootID := node.Status.NodeInfo.BootID + h.setBootID(node.Name, currentBootID) if err := h.client.Create(ctx, rebootPod); err != nil { logger.Error(err, fmt.Sprintf("Node: %v State: %v RebootPod Create failed with Error: %v", node.Name, h.getNodeStatus(node.Name), err)) // Mark the state as failed @@ -888,6 +906,11 @@ func (h *upgradeMgrHelper) handleNodeReboot(ctx context.Context, node *v1.Node, } } + if nodeObj.Status.NodeInfo.BootID != h.getBootID(node.Name) { + h.setBootID(node.Name, nodeObj.Status.NodeInfo.BootID) + logger.Info(fmt.Sprintf("Node: %v has rebooted", node.Name)) + return + } // If node is NotReady, proceed; otherwise, wait for the next tick if nodeNotReady { logger.Info(fmt.Sprintf("Node: %v has moved to NotReady", node.Name)) diff --git a/internal/kmmmodule/kmmmodule.go b/internal/kmmmodule/kmmmodule.go index 9aa6632b..9ca34383 100644 --- a/internal/kmmmodule/kmmmodule.go +++ b/internal/kmmmodule/kmmmodule.go @@ -55,10 +55,8 @@ import ( "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/intstr" - "k8s.io/client-go/discovery" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" - ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" 
"sigs.k8s.io/controller-runtime/pkg/log" @@ -107,27 +105,14 @@ type kmmModule struct { isOpenShift bool } -func NewKMMModule(client client.Client, scheme *runtime.Scheme) KMMModuleAPI { +func NewKMMModule(client client.Client, scheme *runtime.Scheme, isOpenShift bool) KMMModuleAPI { return &kmmModule{ client: client, scheme: scheme, - isOpenShift: isOpenshift(), + isOpenShift: isOpenShift, } } -func isOpenshift() bool { - if dc, err := discovery.NewDiscoveryClientForConfig(ctrl.GetConfigOrDie()); err == nil { - if gplist, err := dc.ServerGroups(); err == nil { - for _, gp := range gplist.Groups { - if gp.Name == "route.openshift.io" { - return true - } - } - } - } - return false -} - func (km *kmmModule) SetNodeVersionLabelAsDesired(ctx context.Context, devConfig *amdv1alpha1.DeviceConfig, nodes *v1.NodeList) error { // for each selected node // put the KMM version label given by CR's driver version @@ -272,8 +257,14 @@ func (km *kmmModule) SetDevicePluginAsDesired(ds *appsv1.DaemonSet, devConfig *a return fmt.Errorf("daemon set is not initialized, zero pointer") } - resourceNamingStrategy := devConfig.Spec.DevicePlugin.ResourceNamingStrategy - command := []string{"sh", "-c", fmt.Sprintf("./k8s-device-plugin -logtostderr=true -stderrthreshold=INFO -v=5 -pulse=30 -resource_naming_strategy=%s", resourceNamingStrategy)} + commandArgs := "./k8s-device-plugin -logtostderr=true -stderrthreshold=INFO -v=5 -pulse=30" + + devicePluginArguments := devConfig.Spec.DevicePlugin.DevicePluginArguments + for key, val := range devicePluginArguments { + commandArgs += " -" + key + "=" + val + } + + command := []string{"sh", "-c", commandArgs} nodeSelector := map[string]string{} for key, val := range devConfig.Spec.Selector { nodeSelector[key] = val diff --git a/internal/nodelabeller/nodelabeller.go b/internal/nodelabeller/nodelabeller.go index 8f60805b..81293fd9 100644 --- a/internal/nodelabeller/nodelabeller.go +++ b/internal/nodelabeller/nodelabeller.go @@ -42,9 +42,7 @@ import ( 
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/util/intstr" - "k8s.io/client-go/discovery" "k8s.io/utils/ptr" - ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" ) @@ -54,6 +52,8 @@ const ( defaultNodeLabellerImage = "rocm/k8s-device-plugin:labeller-latest" defaultUbiNodeLabellerImage = "rocm/k8s-node-labeller:rhubi-latest" defaultInitContainerImage = "busybox:1.36" + defaultBlacklistFileName = "blacklist-amdgpu.conf" + openShiftBlacklistFileName = "blacklist-amdgpu-by-operator.conf" ) //go:generate mockgen -source=nodelabeller.go -package=nodelabeller -destination=mock_nodelabeller.go NodeLabeller @@ -66,26 +66,13 @@ type nodeLabeller struct { isOpenShift bool } -func NewNodeLabeller(scheme *runtime.Scheme) NodeLabeller { +func NewNodeLabeller(scheme *runtime.Scheme, isOpenshift bool) NodeLabeller { return &nodeLabeller{ scheme: scheme, - isOpenShift: isOpenshift(), + isOpenShift: isOpenshift, } } -func isOpenshift() bool { - if dc, err := discovery.NewDiscoveryClientForConfig(ctrl.GetConfigOrDie()); err == nil { - if gplist, err := dc.ServerGroups(); err == nil { - for _, gp := range gplist.Groups { - if gp.Name == "route.openshift.io" { - return true - } - } - } - } - return false -} - func (nl *nodeLabeller) SetNodeLabellerAsDesired(ds *appsv1.DaemonSet, devConfig *amdv1alpha1.DeviceConfig) error { if ds == nil { return fmt.Errorf("daemon set is not initialized, zero pointer") @@ -144,15 +131,19 @@ func (nl *nodeLabeller) SetNodeLabellerAsDesired(ds *appsv1.DaemonSet, devConfig }, } - var initContainerCommand []string + blackListFileName := defaultBlacklistFileName + if nl.isOpenShift { + blackListFileName = openShiftBlacklistFileName + } + var initContainerCommand []string if devConfig.Spec.Driver.Blacklist != nil && *devConfig.Spec.Driver.Blacklist { // if users want to apply the blacklist, init container will add the amdgpu to the blacklist - 
initContainerCommand = []string{"sh", "-c", "echo \"# added by gpu operator \nblacklist amdgpu\" > /host-etc/modprobe.d/blacklist-amdgpu.conf; while [ ! -d /host-sys/class/kfd ] || [ ! -d /host-sys/module/amdgpu/drivers/ ]; do echo \"amdgpu driver is not loaded \"; sleep 2 ;done"} + initContainerCommand = []string{"sh", "-c", fmt.Sprintf("echo \"# added by gpu operator \nblacklist amdgpu\" > /host-etc/modprobe.d/%v; while [ ! -d /host-sys/class/kfd ] || [ ! -d /host-sys/module/amdgpu/drivers/ ]; do echo \"amdgpu driver is not loaded \"; sleep 2 ;done", blackListFileName)} } else { // if users disabled the KMM driver, or disabled the blacklist // init container will remove any hanging amdgpu blacklist entry from the list - initContainerCommand = []string{"sh", "-c", "rm -f /host-etc/modprobe.d/blacklist-amdgpu.conf; while [ ! -d /host-sys/class/kfd ] || [ ! -d /host-sys/module/amdgpu/drivers/ ]; do echo \"amdgpu driver is not loaded \"; sleep 2 ;done"} + initContainerCommand = []string{"sh", "-c", fmt.Sprintf("rm -f /host-etc/modprobe.d/%v; while [ ! -d /host-sys/class/kfd ] || [ ! -d /host-sys/module/amdgpu/drivers/ ]; do echo \"amdgpu driver is not loaded \"; sleep 2 ;done", blackListFileName)} } initContainerImage := defaultInitContainerImage diff --git a/internal/utils.go b/internal/utils.go index 9c67f1d3..bc642343 100644 --- a/internal/utils.go +++ b/internal/utils.go @@ -17,17 +17,28 @@ limitations under the License. 
package utils import ( + "context" "fmt" "strings" - amdv1alpha1 "github.com/ROCm/gpu-operator/api/v1alpha1" + "github.com/go-logr/logr" v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" + + amdv1alpha1 "github.com/ROCm/gpu-operator/api/v1alpha1" + "github.com/ROCm/gpu-operator/internal/cmd" ) const ( - defaultOcDriversVersion = "6.2.2" - NodeFeatureLabelAmdGpu = "feature.node.kubernetes.io/amd-gpu" - NodeFeatureLabelAmdVGpu = "feature.node.kubernetes.io/amd-vgpu" + defaultOcDriversVersion = "6.2.2" + openShiftNodeLabel = "node.openshift.io/os_id" + NodeFeatureLabelAmdGpu = "feature.node.kubernetes.io/amd-gpu" + NodeFeatureLabelAmdVGpu = "feature.node.kubernetes.io/amd-vgpu" + ResourceNamingStrategyFlag = "resource_naming_strategy" + SingleStrategy = "single" + MixedStrategy = "mixed" ) func GetDriverVersion(node v1.Node, deviceConfig amdv1alpha1.DeviceConfig) (string, error) { @@ -88,3 +99,30 @@ func HasNodeLabelKey(node v1.Node, labelKey string) bool { } return false } + +func IsOpenShift(logger logr.Logger) bool { + config, err := rest.InClusterConfig() + if err != nil { + cmd.FatalError(logger, err, "unable to get cluster config") + } + // creates the clientset + clientset, err := kubernetes.NewForConfig(config) + if err != nil { + cmd.FatalError(logger, err, "unable to create cluster clientset") + } + // Check for OpenShift-specific labels on nodes + nodes, err := clientset.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{}) + if err != nil { + cmd.FatalError(logger, err, "unable to list nodes") + } + + isOpenShift := false + for _, node := range nodes.Items { + if _, exists := node.Labels[openShiftNodeLabel]; exists { + isOpenShift = true + break + } + } + logger.Info(fmt.Sprintf("IsOpenShift: %+v", isOpenShift)) + return isOpenShift +} diff --git a/internal/utils_container/Dockerfile b/internal/utils_container/Dockerfile index 59e84fda..a40f740b 100644 --- 
a/internal/utils_container/Dockerfile +++ b/internal/utils_container/Dockerfile @@ -1,31 +1,9 @@ -# Base image -FROM alpine:3.20.3 +FROM registry.access.redhat.com/ubi9/ubi-minimal:9.3 -# Install build dependencies -RUN apk add --no-cache \ - bash \ - build-base \ - automake \ - autoconf \ - libtool \ - pkgconfig \ - gettext-dev \ - bison \ - wget \ - tar \ - flex \ - linux-headers - -# Set working directory -WORKDIR /tmp - -RUN wget https://github.com/util-linux/util-linux/archive/v2.40.tar.gz && tar -xzf v2.40.tar.gz - -# Build and install nsenter only -WORKDIR /tmp/util-linux-2.40 -RUN ./autogen.sh && \ - ./configure --disable-all-programs --enable-nsenter && \ - make nsenter && \ - cp nsenter /nsenter +# Install nsenter from util-linux package +RUN microdnf install -y util-linux && \ + cp /usr/bin/nsenter /nsenter && \ + microdnf clean all +# Set entrypoint to nsenter ENTRYPOINT ["/nsenter"] diff --git a/internal/validator/specValidators.go b/internal/validator/specValidators.go index f6d87ca7..b804c488 100644 --- a/internal/validator/specValidators.go +++ b/internal/validator/specValidators.go @@ -21,6 +21,7 @@ import ( "fmt" amdv1alpha1 "github.com/ROCm/gpu-operator/api/v1alpha1" + utils "github.com/ROCm/gpu-operator/internal" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -86,5 +87,29 @@ func ValidateDevicePluginSpec(ctx context.Context, client client.Client, devConf } } + supportedFlagValues := map[string][]string{ + utils.ResourceNamingStrategyFlag: {utils.SingleStrategy, utils.MixedStrategy}, + } + + devicePluginArguments := devConfig.Spec.DevicePlugin.DevicePluginArguments + for key, val := range devicePluginArguments { + validValues, validKey := supportedFlagValues[key] + if !validKey { + return fmt.Errorf("Invalid flag: %s", key) + } + validKeyValue := false + + for _, validVal := range validValues { + if val == validVal { + validKeyValue = true + break + } + } + + if !validKeyValue { + return fmt.Errorf("Invalid flag value: %s=%s. 
Supported values: %v", key, val, supportedFlagValues[key]) + } + } + return nil } diff --git a/tests/e2e/Makefile b/tests/e2e/Makefile index 90b1e791..e00185c7 100644 --- a/tests/e2e/Makefile +++ b/tests/e2e/Makefile @@ -7,8 +7,11 @@ E2E_KUBE_RBAC_PROXY_CURL_IMAGE ?= curlimages/curl:7.78.0 E2E_UBUNTU_BASE_IMAGE ?= ubuntu:22.04 E2E_MINIO_IMAGE ?= minio/minio:latest E2E_EXPORTER_IMAGE ?= rocm/device-metrics-exporter:v1.2.0 +E2E_EXPORTER_IMAGE_2 ?= rocm/device-metrics-exporter:v1.1.1-beta.0 E2E_DEVICE_PLUGIN_IMAGE ?= rocm/k8s-device-plugin:latest E2E_NODE_LABELLER_IMAGE ?= rocm/k8s-device-plugin:labeller-latest +E2E_DEVICE_PLUGIN_IMAGE_2 ?= rocm/k8s-device-plugin:1.31.0.6 +E2E_NODE_LABELLER_IMAGE_2 ?= rocm/k8s-device-plugin:labeller-1.31.0.6 E2E_TEST_RUNNER_IMAGE ?= rocm/test-runner:v1.2.0-beta.0 export E2E_INIT_CONTAINER_IMAGE @@ -16,8 +19,11 @@ export E2E_KUBE_RBAC_PROXY_CURL_IMAGE export E2E_UBUNTU_BASE_IMAGE export E2E_MINIO_IMAGE export E2E_EXPORTER_IMAGE +export E2E_EXPORTER_IMAGE_2 export E2E_DEVICE_PLUGIN_IMAGE export E2E_NODE_LABELLER_IMAGE +export E2E_DEVICE_PLUGIN_IMAGE_2 +export E2E_NODE_LABELLER_IMAGE_2 export E2E_TEST_RUNNER_IMAGE export E2E_DCM_IMAGE diff --git a/tests/e2e/cluster_test.go b/tests/e2e/cluster_test.go index ed22eb99..40fd52e5 100644 --- a/tests/e2e/cluster_test.go +++ b/tests/e2e/cluster_test.go @@ -1051,7 +1051,7 @@ func (s *E2ESuite) TestWorkloadRequestedGPUs(c *C) { s.verifyDeviceConfigStatus(devCfg, c) s.verifyNodeGPULabel(devCfg, c) - ret, err := utils.GetAMDGPUCount(ctx, s.clientSet) + ret, err := utils.GetAMDGPUCount(ctx, s.clientSet, "gpu") if err != nil { logger.Errorf("error: %v", err) } @@ -1078,7 +1078,7 @@ func (s *E2ESuite) TestWorkloadRequestedGPUs(c *C) { err = utils.DeployRocmPods(context.TODO(), s.clientSet, res) assert.NoError(c, err, "failed to deploy pods") s.verifyROCMPOD(true, c) - err = utils.VerifyROCMPODResourceCount(ctx, s.clientSet, gpuReqCount) + err = utils.VerifyROCMPODResourceCount(ctx, s.clientSet, 
gpuReqCount, "gpu") assert.NoError(c, err, fmt.Sprintf("%v", err)) // delete @@ -1092,6 +1092,244 @@ func (s *E2ESuite) TestWorkloadRequestedGPUs(c *C) { assert.NoError(c, err, "failed to reboot nodes") } +func (s *E2ESuite) TestWorkloadRequestedGPUsHomogeneousSingle(c *C) { + if s.simEnable { + c.Skip("Skipping for non amd gpu testbed") + } + if !dcmImageDefined { + c.Skip("skip DCM test because E2E_DCM_IMAGE is not defined") + } + + s.configMapHelper(c) + + logger.Infof("Add node label after pod comes up") + time.Sleep(30 * time.Second) + + nodes := utils.GetAMDGpuWorker(s.clientSet, s.openshift) + nodeNames := make([]string, 0) + for _, node := range nodes { + nodeNames = append(nodeNames, node.Name) + } + for _, nodeName := range nodeNames { + s.addRemoveNodeLabels(nodeName, "e2e_profile2") + } + + logs := s.getLogs() + if strings.Contains(logs, "Partition completed successfully") && (!strings.Contains(logs, "ERROR")) && (s.eventHelper("SuccessfullyPartitioned", "Normal")) { + logger.Infof("Successfully tested homogenous default partitioning") + } else { + logger.Errorf("Failure test homogenous partitioning") + } + devCfgDcm := s.getDeviceConfigForDCM(c) + s.deleteDeviceConfig(devCfgDcm, c) + + time.Sleep(60 * time.Second) + + ctx := context.TODO() + logger.Infof("create %v", s.cfgName) + devCfg := s.getDeviceConfig(c) + driverEnable := false + devCfg.Spec.Driver.Enable = &driverEnable + s.createDeviceConfig(devCfg, c) + s.checkNFDWorkerStatus(s.ns, c, "") + s.checkNodeLabellerStatus(s.ns, c, devCfg) + s.verifyDeviceConfigStatus(devCfg, c) + s.verifyNodeGPULabel(devCfg, c) + + ret, err := utils.GetAMDGPUCount(ctx, s.clientSet, "gpu") + if err != nil { + logger.Errorf("error: %v", err) + } + var minGPU int = 10000 + for _, v := range ret { + if v < minGPU { + minGPU = v + } + } + assert.Greater(c, minGPU, 0, "did not find any server with amd gpu") + + gpuLimitCount := minGPU + gpuReqCount := minGPU + + res := &v1.ResourceRequirements{ + Limits: v1.ResourceList{ 
+ "amd.com/gpu": resource.MustParse(fmt.Sprintf("%d", gpuLimitCount)), + }, + Requests: v1.ResourceList{ + "amd.com/gpu": resource.MustParse(fmt.Sprintf("%d", gpuReqCount)), + }, + } + + err = utils.DeployRocmPods(context.TODO(), s.clientSet, res) + assert.NoError(c, err, "failed to deploy pods") + err = utils.VerifyROCMPODResourceCount(ctx, s.clientSet, gpuReqCount, "gpu") + assert.NoError(c, err, fmt.Sprintf("%v", err)) + + // delete + s.deleteDeviceConfig(devCfg, c) + + err = utils.DelRocmPods(context.TODO(), s.clientSet) + assert.NoError(c, err, "failed to remove rocm pods") +} + +func (s *E2ESuite) TestWorkloadRequestedGPUsHomogeneousMixed(c *C) { + if s.simEnable { + c.Skip("Skipping for non amd gpu testbed") + } + if !dcmImageDefined { + c.Skip("skip DCM test because E2E_DCM_IMAGE is not defined") + } + + s.configMapHelper(c) + + logger.Infof("Add node label after pod comes up") + time.Sleep(30 * time.Second) + + nodes := utils.GetAMDGpuWorker(s.clientSet, s.openshift) + nodeNames := make([]string, 0) + for _, node := range nodes { + nodeNames = append(nodeNames, node.Name) + } + for _, nodeName := range nodeNames { + s.addRemoveNodeLabels(nodeName, "e2e_profile2") + } + + logs := s.getLogs() + if strings.Contains(logs, "Partition completed successfully") && (!strings.Contains(logs, "ERROR")) && (s.eventHelper("SuccessfullyPartitioned", "Normal")) { + logger.Infof("Successfully tested homogeneous partitioning") + } else { + logger.Errorf("Failure test homogeneous partitioning") + } + devCfgDcm := s.getDeviceConfigForDCM(c) + s.deleteDeviceConfig(devCfgDcm, c) + time.Sleep(60 * time.Second) + ctx := context.TODO() + logger.Infof("create %v", s.cfgName) + devCfg := s.getDeviceConfig(c) + driverEnable := false + devCfg.Spec.Driver.Enable = &driverEnable + devCfg.Spec.DevicePlugin.DevicePluginArguments = map[string]string{"resource_naming_strategy": "mixed"} + s.createDeviceConfig(devCfg, c) + s.checkNFDWorkerStatus(s.ns, c, "") + s.checkNodeLabellerStatus(s.ns, 
c, devCfg) + s.verifyDeviceConfigStatus(devCfg, c) + + ret, err := utils.GetAMDGPUCount(ctx, s.clientSet, "cpx_nps4") + if err != nil { + logger.Errorf("error: %v", err) + } + var minGPU int = 10000 + for _, v := range ret { + if v < minGPU { + minGPU = v + } + } + assert.Greater(c, minGPU, 0, "did not find any server with amd gpu") + + gpuLimitCount := minGPU + gpuReqCount := minGPU + + res := &v1.ResourceRequirements{ + Limits: v1.ResourceList{ + "amd.com/cpx_nps4": resource.MustParse(fmt.Sprintf("%d", gpuLimitCount)), + }, + Requests: v1.ResourceList{ + "amd.com/cpx_nps4": resource.MustParse(fmt.Sprintf("%d", gpuReqCount)), + }, + } + + err = utils.DeployRocmPods(context.TODO(), s.clientSet, res) + assert.NoError(c, err, "failed to deploy pods") + err = utils.VerifyROCMPODResourceCount(ctx, s.clientSet, gpuReqCount, "cpx_nps4") + assert.NoError(c, err, fmt.Sprintf("%v", err)) + + // delete + s.deleteDeviceConfig(devCfg, c) + + err = utils.DelRocmPods(context.TODO(), s.clientSet) + assert.NoError(c, err, "failed to remove rocm pods") + +} + +func (s *E2ESuite) TestWorkloadRequestedGPUsHeterogeneousMixed(c *C) { + if s.simEnable { + c.Skip("Skipping for non amd gpu testbed") + } + if !dcmImageDefined { + c.Skip("skip DCM test because E2E_DCM_IMAGE is not defined") + } + + s.configMapHelper(c) + + logger.Infof("Add node label after pod comes up") + time.Sleep(30 * time.Second) + + nodes := utils.GetAMDGpuWorker(s.clientSet, s.openshift) + nodeNames := make([]string, 0) + for _, node := range nodes { + nodeNames = append(nodeNames, node.Name) + } + for _, nodeName := range nodeNames { + s.addRemoveNodeLabels(nodeName, "e2e_profile1") + } + + logs := s.getLogs() + if strings.Contains(logs, "Partition completed successfully") && (!strings.Contains(logs, "ERROR")) && (s.eventHelper("SuccessfullyPartitioned", "Normal")) { + logger.Infof("Successfully tested homogeneous partitioning") + } else { + logger.Errorf("Failure test heterogenous partitioning") + } + devCfgDcm := 
s.getDeviceConfigForDCM(c) + s.deleteDeviceConfig(devCfgDcm, c) + time.Sleep(60 * time.Second) + + ctx := context.TODO() + logger.Infof("create %v", s.cfgName) + devCfg := s.getDeviceConfig(c) + driverEnable := false + devCfg.Spec.Driver.Enable = &driverEnable + devCfg.Spec.DevicePlugin.DevicePluginArguments = map[string]string{"resource_naming_strategy": "mixed"} + s.createDeviceConfig(devCfg, c) + s.checkNFDWorkerStatus(s.ns, c, "") + s.checkNodeLabellerStatus(s.ns, c, devCfg) + s.verifyDeviceConfigStatus(devCfg, c) + + ret, err := utils.GetAMDGPUCount(ctx, s.clientSet, "cpx_nps1") + if err != nil { + logger.Errorf("error: %v", err) + } + var minGPU int = 10000 + for _, v := range ret { + if v < minGPU { + minGPU = v + } + } + assert.Greater(c, minGPU, 0, "did not find any server with amd gpu") + + gpuLimitCount := minGPU + gpuReqCount := minGPU + + res := &v1.ResourceRequirements{ + Limits: v1.ResourceList{ + "amd.com/cpx_nps1": resource.MustParse(fmt.Sprintf("%d", gpuLimitCount)), + }, + Requests: v1.ResourceList{ + "amd.com/cpx_nps1": resource.MustParse(fmt.Sprintf("%d", gpuReqCount)), + }, + } + + err = utils.DeployRocmPods(context.TODO(), s.clientSet, res) + assert.NoError(c, err, "failed to deploy pods") + err = utils.VerifyROCMPODResourceCount(ctx, s.clientSet, gpuReqCount, "cpx_nps1") + assert.NoError(c, err, fmt.Sprintf("%v", err)) + + // delete + s.deleteDeviceConfig(devCfg, c) + + err = utils.DelRocmPods(context.TODO(), s.clientSet) + assert.NoError(c, err, "failed to remove rocm pods") +} + func (s *E2ESuite) TestKubeRbacProxyClusterIP(c *C) { _, err := s.dClient.DeviceConfigs(s.ns).Get("deviceconfig-kuberbac-clusterip", metav1.GetOptions{}) assert.Errorf(c, err, "config deviceconfig-kuberbac-clusterip exists") @@ -1877,8 +2115,8 @@ func (s *E2ESuite) TestDevicePluginNodeLabellerDaemonSetUpgrade(c *C) { // upgrade // update the CR's device plugin with image - devCfg.Spec.DevicePlugin.DevicePluginImage = devicePluginImage - 
devCfg.Spec.DevicePlugin.NodeLabellerImage = nodeLabellerImage + devCfg.Spec.DevicePlugin.DevicePluginImage = devicePluginImage2 + devCfg.Spec.DevicePlugin.NodeLabellerImage = nodeLabellerImage2 s.patchDevicePluginImage(devCfg, c) s.patchNodeLabellerImage(devCfg, c) s.verifyDevicePluginStatus(s.ns, c, devCfg) @@ -1911,7 +2149,7 @@ func (s *E2ESuite) TestMetricsExporterDaemonSetUpgrade(c *C) { // upgrade // update the CR's device plugin with image - devCfg.Spec.MetricsExporter.Image = exporterImage + devCfg.Spec.MetricsExporter.Image = exporterImage2 s.patchMetricsExporterImage(devCfg, c) s.verifyDeviceConfigStatus(devCfg, c) s.checkMetricsExporterStatus(devCfg, s.ns, v1.ServiceTypeClusterIP, c) diff --git a/tests/e2e/dcm_e2e_test.go b/tests/e2e/dcm_e2e_test.go index f3f8b9df..cd11bd3c 100644 --- a/tests/e2e/dcm_e2e_test.go +++ b/tests/e2e/dcm_e2e_test.go @@ -72,7 +72,7 @@ func (s *E2ESuite) addRemoveNodeLabels(nodeName string, selectedProfile string) logger.Infof("Error adding node lbels: %s\n", err.Error()) return } - time.Sleep(15 * time.Second) + time.Sleep(45 * time.Second) // Allow partition to happen err = utils.DeleteNodeLabel(s.clientSet, nodeName, "dcm.amd.com/gpu-config-profile") _ = utils.DeleteNodeLabel(s.clientSet, nodeName, "dcm.amd.com/apply-gpu-config-profile") @@ -269,6 +269,7 @@ func (s *E2ESuite) createConfigMap() GPUConfigProfiles { { ComputePartition: "CPX", MemoryPartition: "NPS4", + NumGPUsAssigned: 1, }, } diff --git a/tests/e2e/init.go b/tests/e2e/init.go index d7a863eb..973cedb6 100644 --- a/tests/e2e/init.go +++ b/tests/e2e/init.go @@ -25,8 +25,11 @@ var ( initContainerImage string kubeRbacProxyCurlImage string exporterImage string + exporterImage2 string devicePluginImage string nodeLabellerImage string + devicePluginImage2 string + nodeLabellerImage2 string testRunnerImage string driverImageRepo string ) @@ -46,6 +49,10 @@ func init() { if !ok { log.Fatalf("E2E_EXPORTER_IMAGE is not defined") } + exporterImage2, ok = 
os.LookupEnv("E2E_EXPORTER_IMAGE_2") + if !ok { + log.Fatalf("E2E_EXPORTER_IMAGE_2 is not defined") + } devicePluginImage, ok = os.LookupEnv("E2E_DEVICE_PLUGIN_IMAGE") if !ok { log.Fatalf("E2E_DEVICE_PLUGIN_IMAGE is not defined") @@ -54,6 +61,14 @@ func init() { if !ok { log.Fatalf("E2E_NODE_LABELLER_IMAGE is not defined") } + devicePluginImage2, ok = os.LookupEnv("E2E_DEVICE_PLUGIN_IMAGE_2") + if !ok { + log.Fatalf("E2E_DEVICE_PLUGIN_IMAGE_2 is not defined") + } + nodeLabellerImage2, ok = os.LookupEnv("E2E_NODE_LABELLER_IMAGE_2") + if !ok { + log.Fatalf("E2E_NODE_LABELLER_IMAGE_2 is not defined") + } testRunnerImage, ok = os.LookupEnv("E2E_TEST_RUNNER_IMAGE") if !ok { log.Fatalf("E2E_TEST_RUNNER_IMAGE is not defined") diff --git a/tests/e2e/testrunner_test.go b/tests/e2e/testrunner_test.go index 305b6f59..aa7b37a7 100644 --- a/tests/e2e/testrunner_test.go +++ b/tests/e2e/testrunner_test.go @@ -200,7 +200,7 @@ func (s *E2ESuite) createTestRunnerConfigmap(valid bool, devCfg *v1alpha1.Device } func (s *E2ESuite) scheduleWorkloadOnNodeWithMaxGPUs(c *C) string { - ret, err := utils.GetAMDGPUCount(context.TODO(), s.clientSet) + ret, err := utils.GetAMDGPUCount(context.TODO(), s.clientSet, "gpu") if err != nil { logger.Errorf("error: %v", err) } @@ -228,7 +228,7 @@ func (s *E2ESuite) scheduleWorkloadOnNodeWithMaxGPUs(c *C) string { err = utils.DeployRocmPods(context.TODO(), s.clientSet, res) assert.NoError(c, err, "failed to deploy pods") - err = utils.VerifyROCMPODResourceCount(context.TODO(), s.clientSet, gpuReqCount) + err = utils.VerifyROCMPODResourceCount(context.TODO(), s.clientSet, gpuReqCount, "gpu") assert.NoError(c, err, fmt.Sprintf("%v", err)) return nodeWithMaxGPU @@ -730,7 +730,7 @@ func (s *E2ESuite) TestTestRunnerLogsExport(c *C) { func (s *E2ESuite) getGPUNodeName() (nodeWithMaxGPU string) { var maxPerNodeGPU int = 0 - ret, err := utils.GetAMDGPUCount(context.TODO(), s.clientSet) + ret, err := utils.GetAMDGPUCount(context.TODO(), s.clientSet, "gpu") if 
err != nil { logger.Printf("Unable to fetch gpu nodes. Error %v", err) return diff --git a/tests/e2e/utils/utils.go b/tests/e2e/utils/utils.go index 9c9dcf9f..5813ccf5 100644 --- a/tests/e2e/utils/utils.go +++ b/tests/e2e/utils/utils.go @@ -598,14 +598,6 @@ func GetWorkerNodes(cl *kubernetes.Clientset) []*v1.Node { func GetAMDGpuWorker(cl *kubernetes.Clientset, isOpenshift bool) []v1.Node { ret := make([]v1.Node, 0) labelSelector := labels.NewSelector() - if !isOpenshift { - r, _ := labels.NewRequirement( - "node-role.kubernetes.io/control-plane", - selection.DoesNotExist, - nil, - ) - labelSelector = labelSelector.Add(*r) - } r, _ := labels.NewRequirement( "feature.node.kubernetes.io/amd-gpu", selection.Equals, @@ -766,7 +758,7 @@ func DelRocmPodsByNodeNames(ctx context.Context, cl *kubernetes.Clientset, } -func GetAMDGPUCount(ctx context.Context, cl *kubernetes.Clientset) (map[string]int, error) { +func GetAMDGPUCount(ctx context.Context, cl *kubernetes.Clientset, resourceType string) (map[string]int, error) { ret := make(map[string]int) // Get the list of nodes @@ -777,7 +769,8 @@ func GetAMDGPUCount(ctx context.Context, cl *kubernetes.Clientset) (map[string]i // Iterate over the nodes and count AMD GPUs for _, node := range nodes.Items { - if val, ok := node.Status.Capacity["amd.com/gpu"]; ok { + resourceKey := v1.ResourceName("amd.com/" + resourceType) + if val, ok := node.Status.Capacity[resourceKey]; ok { num, err := strconv.ParseInt(val.String(), 10, 64) if err != nil { log.Infof("error: %v", err) @@ -790,7 +783,7 @@ func GetAMDGPUCount(ctx context.Context, cl *kubernetes.Clientset) (map[string]i } func VerifyROCMPODResourceCount(ctx context.Context, cl *kubernetes.Clientset, - gpuReqCount int) error { + gpuReqCount int, resourceType string) error { its, err := cl.CoreV1().Pods("").List(ctx, metav1.ListOptions{ @@ -805,7 +798,8 @@ func VerifyROCMPODResourceCount(ctx context.Context, cl *kubernetes.Clientset, continue } - if gpu, ok := 
cntr.Resources.Requests["amd.com/gpu"]; ok { + resourceKey := v1.ResourceName("amd.com/" + resourceType) + if gpu, ok := cntr.Resources.Requests[resourceKey]; ok { gpuAssignedCount := int(gpu.Value()) if gpuReqCount < gpuAssignedCount { return fmt.Errorf("gpu requested %d got %d",