From 70a556a821671352d02ad705703586a0c15aa515 Mon Sep 17 00:00:00 2001 From: vm Date: Fri, 21 Mar 2025 05:08:08 +0000 Subject: [PATCH 01/16] GPUOP-210 --- api/v1alpha1/deviceconfig_types.go | 2 +- .../manifests/amd-gpu-operator.clusterserviceversion.yaml | 2 +- bundle/manifests/amd.com_deviceconfigs.yaml | 2 +- config/crd/bases/amd.com_deviceconfigs.yaml | 2 +- docs/installation/openshift-olm.md | 7 +++++++ helm-charts-k8s/Chart.lock | 2 +- helm-charts-k8s/crds/deviceconfig-crd.yaml | 2 +- helm-charts-openshift/Chart.lock | 2 +- helm-charts-openshift/crds/deviceconfig-crd.yaml | 2 +- 9 files changed, 15 insertions(+), 8 deletions(-) diff --git a/api/v1alpha1/deviceconfig_types.go b/api/v1alpha1/deviceconfig_types.go index 94a4d6a2..7c187780 100644 --- a/api/v1alpha1/deviceconfig_types.go +++ b/api/v1alpha1/deviceconfig_types.go @@ -117,7 +117,7 @@ type DriverSpec struct { // example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3 //+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="Image",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:image"} // +optional - // +kubebuilder:validation:Pattern=`^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$` + // +kubebuilder:validation:Pattern=`^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$` Image string `json:"image,omitempty"` // driver image registry TLS setting for the container image diff --git a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml index 495f3670..86d76210 100644 --- a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml +++ b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml @@ -30,7 +30,7 @@ metadata: } ] capabilities: Basic Install - createdAt: "2025-03-20T06:06:57Z" + createdAt: "2025-03-21T05:09:41Z" operatorframework.io/suggested-namespace: openshift-amd-gpu operators.operatorframework.io/builder: operator-sdk-v1.32.0 operators.operatorframework.io/project_layout: go.kubebuilder.io/v3 diff --git a/bundle/manifests/amd.com_deviceconfigs.yaml b/bundle/manifests/amd.com_deviceconfigs.yaml index a0476c71..898220ac 100644 --- a/bundle/manifests/amd.com_deviceconfigs.yaml +++ b/bundle/manifests/amd.com_deviceconfigs.yaml @@ -357,7 +357,7 @@ spec: for OpenShift the default value is image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod image tag will be in the format of --- example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3 - pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ + pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ type: string imageRegistrySecret: description: secrets used for pull/push images from/to private diff --git a/config/crd/bases/amd.com_deviceconfigs.yaml b/config/crd/bases/amd.com_deviceconfigs.yaml index 5f7a02bd..05e8b815 100644 --- a/config/crd/bases/amd.com_deviceconfigs.yaml +++ b/config/crd/bases/amd.com_deviceconfigs.yaml @@ -353,7 +353,7 @@ spec: for OpenShift the default value is image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod image tag will be in the format of --- example tag is 
coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3 - pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ + pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ type: string imageRegistrySecret: description: secrets used for pull/push images from/to private diff --git a/docs/installation/openshift-olm.md index 89625fc1..489644d9 100644 --- a/docs/installation/openshift-olm.md +++ b/docs/installation/openshift-olm.md @@ -204,6 +204,13 @@ spec: "feature.node.kubernetes.io/amd-gpu": "true" ``` +Things to note: +1. By default, there is no need to specify the image field in the CR for OpenShift. The default, image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod, will be used. + +2. If users specify an image, $MOD_NAMESPACE can be used as a placeholder; the KMM Operator automatically translates it to the namespace. + +3. The OpenShift internal registry restricts image URLs: OpenShift users cannot use an image like `/`, and the image URL is required to be `//`. However, if any other registry is used, the image URL can be of either form. + The operator will: 1. Collect worker node system specifications diff --git a/helm-charts-k8s/Chart.lock index 6ad80130..477dc578 100644 --- a/helm-charts-k8s/Chart.lock +++ b/helm-charts-k8s/Chart.lock @@ -6,4 +6,4 @@ dependencies: repository: file://./charts/kmm version: v1.0.0 digest: sha256:f9a315dd2ce3d515ebf28c8e9a6a82158b493ca2686439ec381487761261b597 -generated: "2025-03-20T06:06:33.9562362Z" +generated: "2025-03-21T05:09:30.645342377Z" diff --git a/helm-charts-k8s/crds/deviceconfig-crd.yaml index 6058c151..707eb1ce 100644 --- a/helm-charts-k8s/crds/deviceconfig-crd.yaml +++ b/helm-charts-k8s/crds/deviceconfig-crd.yaml @@ -361,7 +361,7 @@ spec: for OpenShift the default value is image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod image tag will be in the format of --- example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3 - pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ + pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ type: string imageRegistrySecret: description: secrets used for pull/push images from/to private registry diff --git a/helm-charts-openshift/Chart.lock index 6e3c4ccc..f39ad5ac 100644 --- a/helm-charts-openshift/Chart.lock +++ b/helm-charts-openshift/Chart.lock @@ -6,4 +6,4 @@ dependencies: repository: file://./charts/kmm version: v1.0.0 digest: sha256:25200c34a5cc846a1275e5bf3fc637b19e909dc68de938189c5278d77d03f5ac -generated: "2025-03-20T06:06:55.80187139Z" +generated: "2025-03-21T05:09:40.013067636Z" diff --git a/helm-charts-openshift/crds/deviceconfig-crd.yaml index 6058c151..707eb1ce 100644 --- a/helm-charts-openshift/crds/deviceconfig-crd.yaml +++ b/helm-charts-openshift/crds/deviceconfig-crd.yaml @@ -361,7 +361,7 @@ spec: for OpenShift the default value is image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod image tag will be in the format of ---
example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3 - pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ + pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ type: string imageRegistrySecret: description: secrets used for pull/push images from/to private registry From 40bf349916f10e219840cd119e282f91106063d6 Mon Sep 17 00:00:00 2001 From: yansun1996 Date: Mon, 24 Mar 2025 22:31:12 +0000 Subject: [PATCH 02/16] Refactor IsOpenshift function based on RedHat suggestion --- cmd/main.go | 12 +++++---- internal/kmmmodule/kmmmodule.go | 19 ++------------ internal/nodelabeller/nodelabeller.go | 19 ++------------ internal/utils.go | 37 ++++++++++++++++++++++++++- 4 files changed, 47 insertions(+), 40 deletions(-) diff --git a/cmd/main.go b/cmd/main.go index b3c985ff..168a730d 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -35,9 +35,6 @@ package main import ( "flag" - "github.com/ROCm/gpu-operator/internal/configmanager" - "github.com/ROCm/gpu-operator/internal/metricsexporter" - "github.com/ROCm/gpu-operator/internal/testrunner" kmmv1beta1 "github.com/rh-ecosystem-edge/kernel-module-management/api/v1beta1" "k8s.io/apimachinery/pkg/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime" @@ -51,11 +48,15 @@ import ( _ "k8s.io/client-go/plugin/pkg/client/auth" gpuev1alpha1 "github.com/ROCm/gpu-operator/api/v1alpha1" + utils "github.com/ROCm/gpu-operator/internal" "github.com/ROCm/gpu-operator/internal/cmd" "github.com/ROCm/gpu-operator/internal/config" + "github.com/ROCm/gpu-operator/internal/configmanager" "github.com/ROCm/gpu-operator/internal/controllers" "github.com/ROCm/gpu-operator/internal/kmmmodule" + "github.com/ROCm/gpu-operator/internal/metricsexporter" "github.com/ROCm/gpu-operator/internal/nodelabeller" + "github.com/ROCm/gpu-operator/internal/testrunner" //+kubebuilder:scaffold:imports ) @@ -107,8 +108,9 @@ func main() { } client := mgr.GetClient() - kmmHandler := kmmmodule.NewKMMModule(client, scheme) - nlHandler := nodelabeller.NewNodeLabeller(scheme) + isOpenShift := utils.IsOpenShift(setupLogger) + kmmHandler := kmmmodule.NewKMMModule(client, scheme, isOpenShift) + nlHandler := nodelabeller.NewNodeLabeller(scheme, isOpenShift) metricsHandler := metricsexporter.NewMetricsExporter(scheme) testrunnerHandler := testrunner.NewTestRunner(scheme) configmanagerHandler := configmanager.NewConfigManager(scheme) diff --git a/internal/kmmmodule/kmmmodule.go b/internal/kmmmodule/kmmmodule.go index 9aa6632b..8c753604 100644 --- a/internal/kmmmodule/kmmmodule.go +++ b/internal/kmmmodule/kmmmodule.go @@ -55,10 +55,8 @@ import ( "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/intstr" - "k8s.io/client-go/discovery" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" - ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" "sigs.k8s.io/controller-runtime/pkg/log" @@ -107,27 +105,14 @@ type kmmModule struct { isOpenShift bool } -func NewKMMModule(client client.Client, scheme *runtime.Scheme) KMMModuleAPI { +func NewKMMModule(client client.Client, scheme *runtime.Scheme, isOpenShift bool) KMMModuleAPI { return &kmmModule{ client: client, scheme: scheme, - isOpenShift: isOpenshift(), + isOpenShift: isOpenShift, } } -func isOpenshift() bool { - if 
dc, err := discovery.NewDiscoveryClientForConfig(ctrl.GetConfigOrDie()); err == nil { - if gplist, err := dc.ServerGroups(); err == nil { - for _, gp := range gplist.Groups { - if gp.Name == "route.openshift.io" { - return true - } - } - } - } - return false -} - func (km *kmmModule) SetNodeVersionLabelAsDesired(ctx context.Context, devConfig *amdv1alpha1.DeviceConfig, nodes *v1.NodeList) error { // for each selected node // put the KMM version label given by CR's driver version diff --git a/internal/nodelabeller/nodelabeller.go b/internal/nodelabeller/nodelabeller.go index 8f60805b..959bf39f 100644 --- a/internal/nodelabeller/nodelabeller.go +++ b/internal/nodelabeller/nodelabeller.go @@ -42,9 +42,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/util/intstr" - "k8s.io/client-go/discovery" "k8s.io/utils/ptr" - ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" ) @@ -66,26 +64,13 @@ type nodeLabeller struct { isOpenShift bool } -func NewNodeLabeller(scheme *runtime.Scheme) NodeLabeller { +func NewNodeLabeller(scheme *runtime.Scheme, isOpenshift bool) NodeLabeller { return &nodeLabeller{ scheme: scheme, - isOpenShift: isOpenshift(), + isOpenShift: isOpenshift, } } -func isOpenshift() bool { - if dc, err := discovery.NewDiscoveryClientForConfig(ctrl.GetConfigOrDie()); err == nil { - if gplist, err := dc.ServerGroups(); err == nil { - for _, gp := range gplist.Groups { - if gp.Name == "route.openshift.io" { - return true - } - } - } - } - return false -} - func (nl *nodeLabeller) SetNodeLabellerAsDesired(ds *appsv1.DaemonSet, devConfig *amdv1alpha1.DeviceConfig) error { if ds == nil { return fmt.Errorf("daemon set is not initialized, zero pointer") diff --git a/internal/utils.go b/internal/utils.go index 9c67f1d3..b34dd031 100644 --- a/internal/utils.go +++ b/internal/utils.go @@ -17,15 +17,23 @@ limitations under the License. 
package utils import ( + "context" "fmt" "strings" - amdv1alpha1 "github.com/ROCm/gpu-operator/api/v1alpha1" + "github.com/go-logr/logr" v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" + + amdv1alpha1 "github.com/ROCm/gpu-operator/api/v1alpha1" + "github.com/ROCm/gpu-operator/internal/cmd" ) const ( defaultOcDriversVersion = "6.2.2" + openShiftNodeLabel = "node.openshift.io/os_id" NodeFeatureLabelAmdGpu = "feature.node.kubernetes.io/amd-gpu" NodeFeatureLabelAmdVGpu = "feature.node.kubernetes.io/amd-vgpu" ) @@ -88,3 +96,30 @@ func HasNodeLabelKey(node v1.Node, labelKey string) bool { } return false } + +func IsOpenShift(logger logr.Logger) bool { + config, err := rest.InClusterConfig() + if err != nil { + cmd.FatalError(logger, err, "unable to get cluster config") + } + // creates the clientset + clientset, err := kubernetes.NewForConfig(config) + if err != nil { + cmd.FatalError(logger, err, "unable to create cluster clientset") + } + // Check for OpenShift-specific labels on nodes + nodes, err := clientset.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{}) + if err != nil { + cmd.FatalError(logger, err, "unable to list nodes") + } + + isOpenShift := false + for _, node := range nodes.Items { + if _, exists := node.Labels[openShiftNodeLabel]; exists { + isOpenShift = true + break + } + } + logger.Info(fmt.Sprintf("IsOpenShift: %+v", isOpenShift)) + return isOpenShift +} From 044c6b4794046150131e7d153607ad077f52d012 Mon Sep 17 00:00:00 2001 From: vm Date: Tue, 25 Mar 2025 06:30:37 +0000 Subject: [PATCH 03/16] Device Plugin Args option --- api/v1alpha1/deviceconfig_types.go | 9 +++---- api/v1alpha1/zz_generated.deepcopy.go | 7 ++++++ ...md-gpu-operator.clusterserviceversion.yaml | 14 ++++++----- bundle/manifests/amd.com_deviceconfigs.yaml | 14 +++++------ config/crd/bases/amd.com_deviceconfigs.yaml | 14 +++++------ ...md-gpu-operator.clusterserviceversion.yaml | 12 +++++---- helm-charts-k8s/Chart.lock | 2 +- helm-charts-k8s/crds/deviceconfig-crd.yaml | 14 +++++------ helm-charts-openshift/Chart.lock | 2 +- .../crds/deviceconfig-crd.yaml | 14 +++++------ internal/kmmmodule/kmmmodule.go | 10 ++++++-- internal/utils.go | 11 +++++--- internal/validator/specValidators.go | 25 +++++++++++++++++++ tests/e2e/Makefile | 6 +++++ tests/e2e/cluster_test.go | 6 ++--- tests/e2e/init.go | 15 +++++++++++ 16 files changed, 120 insertions(+), 55 deletions(-) diff --git a/api/v1alpha1/deviceconfig_types.go b/api/v1alpha1/deviceconfig_types.go index 7c187780..503c0939 100644 --- a/api/v1alpha1/deviceconfig_types.go +++ b/api/v1alpha1/deviceconfig_types.go @@ -251,12 +251,11 @@ type DevicePluginSpec struct { // +optional DevicePluginTolerations []v1.Toleration `json:"devicePluginTolerations,omitempty"` - // resource naming strategy for device plugin - //+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="ResourceNamingStrategy",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:ResourceNamingStrategy"} - // +kubebuilder:validation:Enum=single;mixed - // +kubebuilder:default:="single" + // device plugin arguments is used to pass supported flags and their values while starting device plugin daemonset + // supported flag values: {"resource_naming_strategy": {"single", "mixed"}} + //+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="DevicePluginArguments",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:devicePluginArguments"} // +optional - ResourceNamingStrategy string 
`json:"resourceNamingStrategy,omitempty"` + DevicePluginArguments map[string]string `json:"devicePluginArguments,omitempty"` // node labeller image //+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="NodeLabellerImage",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:nodeLabellerImage"} diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index bbbd03c0..c2be36c9 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -267,6 +267,13 @@ func (in *DevicePluginSpec) DeepCopyInto(out *DevicePluginSpec) { (*in)[i].DeepCopyInto(&(*out)[i]) } } + if in.DevicePluginArguments != nil { + in, out := &in.DevicePluginArguments, &out.DevicePluginArguments + *out = make(map[string]string, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } if in.NodeLabellerTolerations != nil { in, out := &in.NodeLabellerTolerations, &out.NodeLabellerTolerations *out = make([]v1.Toleration, len(*in)) diff --git a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml index 86d76210..45078acb 100644 --- a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml +++ b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml @@ -30,7 +30,7 @@ metadata: } ] capabilities: Basic Install - createdAt: "2025-03-21T05:09:41Z" + createdAt: "2025-03-25T06:19:27Z" operatorframework.io/suggested-namespace: openshift-amd-gpu operators.operatorframework.io/builder: operator-sdk-v1.32.0 operators.operatorframework.io/project_layout: go.kubebuilder.io/v3 @@ -152,6 +152,13 @@ spec: path: devicePlugin x-descriptors: - urn:alm:descriptor:com.amd.deviceconfigs:devicePlugin + - description: 'device plugin arguments is used to pass supported flags and + their values while starting device plugin daemonset supported flag values: + {"resource_naming_strategy": {"single", "mixed"}}' + displayName: DevicePluginArguments + path: devicePlugin.devicePluginArguments + x-descriptors: + - urn:alm:descriptor:com.amd.deviceconfigs:devicePluginArguments - description: device plugin image displayName: DevicePluginImage path: devicePlugin.devicePluginImage @@ -192,11 +199,6 @@ spec: path: devicePlugin.nodeLabellerTolerations x-descriptors: - urn:alm:descriptor:com.amd.deviceconfigs:nodeLabellerTolerations - - description: resource naming strategy for device plugin - displayName: ResourceNamingStrategy - path: devicePlugin.resourceNamingStrategy - x-descriptors: - - urn:alm:descriptor:com.amd.deviceconfigs:ResourceNamingStrategy - description: upgrade policy for device plugin and node labeller daemons displayName: UpgradePolicy path: devicePlugin.upgradePolicy diff --git a/bundle/manifests/amd.com_deviceconfigs.yaml b/bundle/manifests/amd.com_deviceconfigs.yaml index 898220ac..c9123ffe 100644 --- a/bundle/manifests/amd.com_deviceconfigs.yaml +++ b/bundle/manifests/amd.com_deviceconfigs.yaml @@ -190,6 +190,13 @@ spec: devicePlugin: description: device plugin properties: + devicePluginArguments: + additionalProperties: + type: string + description: |- + device plugin arguments is used to pass supported flags and their values while starting device plugin daemonset + supported flag values: {"resource_naming_strategy": {"single", "mixed"}} + type: object devicePluginImage: description: device plugin image pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ @@ -306,13 +313,6 @@ spec: type: string 
type: object type: array - resourceNamingStrategy: - default: single - description: resource naming strategy for device plugin - enum: - - single - - mixed - type: string upgradePolicy: description: upgrade policy for device plugin and node labeller daemons diff --git a/config/crd/bases/amd.com_deviceconfigs.yaml b/config/crd/bases/amd.com_deviceconfigs.yaml index 05e8b815..24c2b053 100644 --- a/config/crd/bases/amd.com_deviceconfigs.yaml +++ b/config/crd/bases/amd.com_deviceconfigs.yaml @@ -186,6 +186,13 @@ spec: devicePlugin: description: device plugin properties: + devicePluginArguments: + additionalProperties: + type: string + description: |- + device plugin arguments is used to pass supported flags and their values while starting device plugin daemonset + supported flag values: {"resource_naming_strategy": {"single", "mixed"}} + type: object devicePluginImage: description: device plugin image pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ @@ -302,13 +309,6 @@ spec: type: string type: object type: array - resourceNamingStrategy: - default: single - description: resource naming strategy for device plugin - enum: - - single - - mixed - type: string upgradePolicy: description: upgrade policy for device plugin and node labeller daemons diff --git a/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml b/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml index a5b6cd65..a9f4d685 100644 --- a/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml +++ b/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml @@ -123,6 +123,13 @@ spec: path: devicePlugin x-descriptors: - urn:alm:descriptor:com.amd.deviceconfigs:devicePlugin + - description: 'device plugin arguments is used to pass supported flags and + their values while starting device plugin daemonset supported flag values: + {"resource_naming_strategy": {"single", "mixed"}}' + displayName: DevicePluginArguments + path: devicePlugin.devicePluginArguments + x-descriptors: + - urn:alm:descriptor:com.amd.deviceconfigs:devicePluginArguments - description: device plugin image displayName: DevicePluginImage path: devicePlugin.devicePluginImage @@ -163,11 +170,6 @@ spec: path: devicePlugin.nodeLabellerTolerations x-descriptors: - urn:alm:descriptor:com.amd.deviceconfigs:nodeLabellerTolerations - - description: resource naming strategy for device plugin - displayName: ResourceNamingStrategy - path: devicePlugin.resourceNamingStrategy - x-descriptors: - - urn:alm:descriptor:com.amd.deviceconfigs:ResourceNamingStrategy - description: upgrade policy for device plugin and node labeller daemons displayName: UpgradePolicy path: devicePlugin.upgradePolicy diff --git a/helm-charts-k8s/Chart.lock b/helm-charts-k8s/Chart.lock index 477dc578..54b4cb8c 100644 --- a/helm-charts-k8s/Chart.lock +++ b/helm-charts-k8s/Chart.lock @@ -6,4 +6,4 @@ dependencies: repository: file://./charts/kmm version: v1.0.0 digest: sha256:f9a315dd2ce3d515ebf28c8e9a6a82158b493ca2686439ec381487761261b597 -generated: "2025-03-21T05:09:30.645342377Z" +generated: "2025-03-25T06:19:17.248998622Z" diff --git a/helm-charts-k8s/crds/deviceconfig-crd.yaml b/helm-charts-k8s/crds/deviceconfig-crd.yaml index 707eb1ce..502f4b89 100644 --- a/helm-charts-k8s/crds/deviceconfig-crd.yaml +++ b/helm-charts-k8s/crds/deviceconfig-crd.yaml @@ -194,6 +194,13 @@ spec: devicePlugin: description: device plugin properties: + devicePluginArguments: + additionalProperties: + type: 
string + description: |- + device plugin arguments is used to pass supported flags and their values while starting device plugin daemonset + supported flag values: {"resource_naming_strategy": {"single", "mixed"}} + type: object devicePluginImage: description: device plugin image pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ @@ -310,13 +317,6 @@ spec: type: string type: object type: array - resourceNamingStrategy: - default: single - description: resource naming strategy for device plugin - enum: - - single - - mixed - type: string upgradePolicy: description: upgrade policy for device plugin and node labeller daemons diff --git a/helm-charts-openshift/Chart.lock b/helm-charts-openshift/Chart.lock index f39ad5ac..6e9b718d 100644 --- a/helm-charts-openshift/Chart.lock +++ b/helm-charts-openshift/Chart.lock @@ -6,4 +6,4 @@ dependencies: repository: file://./charts/kmm version: v1.0.0 digest: sha256:25200c34a5cc846a1275e5bf3fc637b19e909dc68de938189c5278d77d03f5ac -generated: "2025-03-21T05:09:40.013067636Z" +generated: "2025-03-25T06:19:26.060856628Z" diff --git a/helm-charts-openshift/crds/deviceconfig-crd.yaml b/helm-charts-openshift/crds/deviceconfig-crd.yaml index 707eb1ce..502f4b89 100644 --- a/helm-charts-openshift/crds/deviceconfig-crd.yaml +++ b/helm-charts-openshift/crds/deviceconfig-crd.yaml @@ -194,6 +194,13 @@ spec: devicePlugin: description: device plugin properties: + devicePluginArguments: + additionalProperties: + type: string + description: |- + device plugin arguments is used to pass supported flags and their values while starting device plugin daemonset + supported flag values: {"resource_naming_strategy": {"single", "mixed"}} + type: object devicePluginImage: description: device plugin image pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ @@ -310,13 +317,6 @@ spec: type: string type: object type: array - resourceNamingStrategy: - default: single - description: resource naming strategy for device plugin - enum: - - single - - mixed - type: string upgradePolicy: description: upgrade policy for device plugin and node labeller daemons diff --git a/internal/kmmmodule/kmmmodule.go b/internal/kmmmodule/kmmmodule.go index 8c753604..9ca34383 100644 --- a/internal/kmmmodule/kmmmodule.go +++ b/internal/kmmmodule/kmmmodule.go @@ -257,8 +257,14 @@ func (km *kmmModule) SetDevicePluginAsDesired(ds *appsv1.DaemonSet, devConfig *a return fmt.Errorf("daemon set is not initialized, zero pointer") } - resourceNamingStrategy := devConfig.Spec.DevicePlugin.ResourceNamingStrategy - command := []string{"sh", "-c", fmt.Sprintf("./k8s-device-plugin -logtostderr=true -stderrthreshold=INFO -v=5 -pulse=30 -resource_naming_strategy=%s", resourceNamingStrategy)} + commandArgs := "./k8s-device-plugin -logtostderr=true -stderrthreshold=INFO -v=5 -pulse=30" + + devicePluginArguments := devConfig.Spec.DevicePlugin.DevicePluginArguments + for key, val := range devicePluginArguments { + commandArgs += " -" + key + "=" + val + } + + command := []string{"sh", "-c", commandArgs} nodeSelector := map[string]string{} for key, val := range devConfig.Spec.Selector { nodeSelector[key] = val diff --git a/internal/utils.go b/internal/utils.go index b34dd031..bc642343 100644 --- a/internal/utils.go +++ b/internal/utils.go @@ -32,10 +32,13 @@ import ( ) const ( - defaultOcDriversVersion = "6.2.2" - openShiftNodeLabel = "node.openshift.io/os_id" - 
NodeFeatureLabelAmdGpu = "feature.node.kubernetes.io/amd-gpu" - NodeFeatureLabelAmdVGpu = "feature.node.kubernetes.io/amd-vgpu" + defaultOcDriversVersion = "6.2.2" + openShiftNodeLabel = "node.openshift.io/os_id" + NodeFeatureLabelAmdGpu = "feature.node.kubernetes.io/amd-gpu" + NodeFeatureLabelAmdVGpu = "feature.node.kubernetes.io/amd-vgpu" + ResourceNamingStrategyFlag = "resource_naming_strategy" + SingleStrategy = "single" + MixedStrategy = "mixed" ) func GetDriverVersion(node v1.Node, deviceConfig amdv1alpha1.DeviceConfig) (string, error) { diff --git a/internal/validator/specValidators.go b/internal/validator/specValidators.go index f6d87ca7..b804c488 100644 --- a/internal/validator/specValidators.go +++ b/internal/validator/specValidators.go @@ -21,6 +21,7 @@ import ( "fmt" amdv1alpha1 "github.com/ROCm/gpu-operator/api/v1alpha1" + utils "github.com/ROCm/gpu-operator/internal" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -86,5 +87,29 @@ func ValidateDevicePluginSpec(ctx context.Context, client client.Client, devConf } } + supportedFlagValues := map[string][]string{ + utils.ResourceNamingStrategyFlag: {utils.SingleStrategy, utils.MixedStrategy}, + } + + devicePluginArguments := devConfig.Spec.DevicePlugin.DevicePluginArguments + for key, val := range devicePluginArguments { + validValues, validKey := supportedFlagValues[key] + if !validKey { + return fmt.Errorf("Invalid flag: %s", key) + } + validKeyValue := false + + for _, validVal := range validValues { + if val == validVal { + validKeyValue = true + break + } + } + + if !validKeyValue { + return fmt.Errorf("Invalid flag value: %s=%s. Supported values: %v", key, val, supportedFlagValues[key]) + } + } + return nil } diff --git a/tests/e2e/Makefile b/tests/e2e/Makefile index 90b1e791..e00185c7 100644 --- a/tests/e2e/Makefile +++ b/tests/e2e/Makefile @@ -7,8 +7,11 @@ E2E_KUBE_RBAC_PROXY_CURL_IMAGE ?= curlimages/curl:7.78.0 E2E_UBUNTU_BASE_IMAGE ?= ubuntu:22.04 E2E_MINIO_IMAGE ?= minio/minio:latest E2E_EXPORTER_IMAGE ?= rocm/device-metrics-exporter:v1.2.0 +E2E_EXPORTER_IMAGE_2 ?= rocm/device-metrics-exporter:v1.1.1-beta.0 E2E_DEVICE_PLUGIN_IMAGE ?= rocm/k8s-device-plugin:latest E2E_NODE_LABELLER_IMAGE ?= rocm/k8s-device-plugin:labeller-latest +E2E_DEVICE_PLUGIN_IMAGE_2 ?= rocm/k8s-device-plugin:1.31.0.6 +E2E_NODE_LABELLER_IMAGE_2 ?= rocm/k8s-device-plugin:labeller-1.31.0.6 E2E_TEST_RUNNER_IMAGE ?= rocm/test-runner:v1.2.0-beta.0 export E2E_INIT_CONTAINER_IMAGE @@ -16,8 +19,11 @@ export E2E_KUBE_RBAC_PROXY_CURL_IMAGE export E2E_UBUNTU_BASE_IMAGE export E2E_MINIO_IMAGE export E2E_EXPORTER_IMAGE +export E2E_EXPORTER_IMAGE_2 export E2E_DEVICE_PLUGIN_IMAGE export E2E_NODE_LABELLER_IMAGE +export E2E_DEVICE_PLUGIN_IMAGE_2 +export E2E_NODE_LABELLER_IMAGE_2 export E2E_TEST_RUNNER_IMAGE export E2E_DCM_IMAGE diff --git a/tests/e2e/cluster_test.go b/tests/e2e/cluster_test.go index ed22eb99..5c10af8f 100644 --- a/tests/e2e/cluster_test.go +++ b/tests/e2e/cluster_test.go @@ -1877,8 +1877,8 @@ func (s *E2ESuite) TestDevicePluginNodeLabellerDaemonSetUpgrade(c *C) { // upgrade // update the CR's device plugin with image - devCfg.Spec.DevicePlugin.DevicePluginImage = devicePluginImage - devCfg.Spec.DevicePlugin.NodeLabellerImage = nodeLabellerImage + devCfg.Spec.DevicePlugin.DevicePluginImage = devicePluginImage2 + devCfg.Spec.DevicePlugin.NodeLabellerImage = nodeLabellerImage2 s.patchDevicePluginImage(devCfg, c) s.patchNodeLabellerImage(devCfg, c) s.verifyDevicePluginStatus(s.ns, c, devCfg) @@ -1911,7 +1911,7 @@ func (s *E2ESuite) 
TestMetricsExporterDaemonSetUpgrade(c *C) { // upgrade // update the CR's device plugin with image - devCfg.Spec.MetricsExporter.Image = exporterImage + devCfg.Spec.MetricsExporter.Image = exporterImage2 s.patchMetricsExporterImage(devCfg, c) s.verifyDeviceConfigStatus(devCfg, c) s.checkMetricsExporterStatus(devCfg, s.ns, v1.ServiceTypeClusterIP, c) diff --git a/tests/e2e/init.go b/tests/e2e/init.go index d7a863eb..973cedb6 100644 --- a/tests/e2e/init.go +++ b/tests/e2e/init.go @@ -25,8 +25,11 @@ var ( initContainerImage string kubeRbacProxyCurlImage string exporterImage string + exporterImage2 string devicePluginImage string nodeLabellerImage string + devicePluginImage2 string + nodeLabellerImage2 string testRunnerImage string driverImageRepo string ) @@ -46,6 +49,10 @@ func init() { if !ok { log.Fatalf("E2E_EXPORTER_IMAGE is not defined") } + exporterImage2, ok = os.LookupEnv("E2E_EXPORTER_IMAGE_2") + if !ok { + log.Fatalf("E2E_EXPORTER_IMAGE_2 is not defined") + } devicePluginImage, ok = os.LookupEnv("E2E_DEVICE_PLUGIN_IMAGE") if !ok { log.Fatalf("E2E_DEVICE_PLUGIN_IMAGE is not defined") @@ -54,6 +61,14 @@ func init() { if !ok { log.Fatalf("E2E_NODE_LABELLER_IMAGE is not defined") } + devicePluginImage2, ok = os.LookupEnv("E2E_DEVICE_PLUGIN_IMAGE_2") + if !ok { + log.Fatalf("E2E_DEVICE_PLUGIN_IMAGE_2 is not defined") + } + nodeLabellerImage2, ok = os.LookupEnv("E2E_NODE_LABELLER_IMAGE_2") + if !ok { + log.Fatalf("E2E_NODE_LABELLER_IMAGE_2 is not defined") + } testRunnerImage, ok = os.LookupEnv("E2E_TEST_RUNNER_IMAGE") if !ok { log.Fatalf("E2E_TEST_RUNNER_IMAGE is not defined") From 5aaffe5fe36a3c7346b91ee34b38d64893c15dca Mon Sep 17 00:00:00 2001 From: vm Date: Wed, 26 Mar 2025 06:18:58 +0000 Subject: [PATCH 04/16] Regression caused from previous commit --- internal/controllers/upgrademgr.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/internal/controllers/upgrademgr.go b/internal/controllers/upgrademgr.go index c5f3fd6e..a5e519b2 100644 --- a/internal/controllers/upgrademgr.go +++ b/internal/controllers/upgrademgr.go @@ -151,11 +151,6 @@ func (n *upgradeMgr) HandleUpgrade(ctx context.Context, deviceConfig *amdv1alpha // 1. Set init status for unprocessed nodes n.helper.handleInitStatus(ctx, &nodeList.Items[i]) - if !n.helper.isNodeReadyForUpgrade(ctx, &nodeList.Items[i]) { - res = ctrl.Result{Requeue: true, RequeueAfter: time.Second * 20} - continue - } - // 2. 
Handle failed nodes if n.helper.isNodeStateUpgradeFailed(ctx, &nodeList.Items[i], deviceConfig) { n.helper.clearUpgradeStartTime(nodeList.Items[i].Name) @@ -193,6 +188,11 @@ func (n *upgradeMgr) HandleUpgrade(ctx context.Context, deviceConfig *amdv1alpha continue } + if !n.helper.isNodeReadyForUpgrade(ctx, &nodeList.Items[i]) { + res = ctrl.Result{Requeue: true, RequeueAfter: time.Second * 20} + continue + } + //This node is a candidate for selection candidateNodes = append(candidateNodes, nodeList.Items[i]) } From 42692bd60d92e67b330023c22870dfb85662803a Mon Sep 17 00:00:00 2001 From: vm Date: Wed, 26 Mar 2025 03:14:38 +0000 Subject: [PATCH 05/16] Device Plugin e2e for homogeneous/heterogeneous with single/mixed strategy --- tests/e2e/cluster_test.go | 242 ++++++++++++++++++++++++++++++++++- tests/e2e/dcm_e2e_test.go | 3 +- tests/e2e/testrunner_test.go | 6 +- tests/e2e/utils/utils.go | 18 +-- 4 files changed, 251 insertions(+), 18 deletions(-) diff --git a/tests/e2e/cluster_test.go b/tests/e2e/cluster_test.go index 5c10af8f..40fd52e5 100644 --- a/tests/e2e/cluster_test.go +++ b/tests/e2e/cluster_test.go @@ -1051,7 +1051,7 @@ func (s *E2ESuite) TestWorkloadRequestedGPUs(c *C) { s.verifyDeviceConfigStatus(devCfg, c) s.verifyNodeGPULabel(devCfg, c) - ret, err := utils.GetAMDGPUCount(ctx, s.clientSet) + ret, err := utils.GetAMDGPUCount(ctx, s.clientSet, "gpu") if err != nil { logger.Errorf("error: %v", err) } @@ -1078,7 +1078,7 @@ func (s *E2ESuite) TestWorkloadRequestedGPUs(c *C) { err = utils.DeployRocmPods(context.TODO(), s.clientSet, res) assert.NoError(c, err, "failed to deploy pods") s.verifyROCMPOD(true, c) - err = utils.VerifyROCMPODResourceCount(ctx, s.clientSet, gpuReqCount) + err = utils.VerifyROCMPODResourceCount(ctx, s.clientSet, gpuReqCount, "gpu") assert.NoError(c, err, fmt.Sprintf("%v", err)) // delete @@ -1092,6 +1092,244 @@ func (s *E2ESuite) TestWorkloadRequestedGPUs(c *C) { assert.NoError(c, err, "failed to reboot nodes") } +func (s *E2ESuite) TestWorkloadRequestedGPUsHomogeneousSingle(c *C) { + if s.simEnable { + c.Skip("Skipping for non amd gpu testbed") + } + if !dcmImageDefined { + c.Skip("skip DCM test because E2E_DCM_IMAGE is not defined") + } + + s.configMapHelper(c) + + logger.Infof("Add node label after pod comes up") + time.Sleep(30 * time.Second) + + nodes := utils.GetAMDGpuWorker(s.clientSet, s.openshift) + nodeNames := make([]string, 0) + for _, node := range nodes { + nodeNames = append(nodeNames, node.Name) + } + for _, nodeName := range nodeNames { + s.addRemoveNodeLabels(nodeName, "e2e_profile2") + } + + logs := s.getLogs() + if strings.Contains(logs, "Partition completed successfully") && (!strings.Contains(logs, "ERROR")) && (s.eventHelper("SuccessfullyPartitioned", "Normal")) { + logger.Infof("Successfully tested homogenous default partitioning") + } else { + logger.Errorf("Failure test homogenous partitioning") + } + devCfgDcm := s.getDeviceConfigForDCM(c) + s.deleteDeviceConfig(devCfgDcm, c) + + time.Sleep(60 * time.Second) + + ctx := context.TODO() + logger.Infof("create %v", s.cfgName) + devCfg := s.getDeviceConfig(c) + driverEnable := false + devCfg.Spec.Driver.Enable = &driverEnable + s.createDeviceConfig(devCfg, c) + s.checkNFDWorkerStatus(s.ns, c, "") + s.checkNodeLabellerStatus(s.ns, c, devCfg) + s.verifyDeviceConfigStatus(devCfg, c) + s.verifyNodeGPULabel(devCfg, c) + + ret, err := utils.GetAMDGPUCount(ctx, s.clientSet, "gpu") + if err != nil { + logger.Errorf("error: %v", err) + } + var minGPU int = 10000 + for _, v := range ret { + if v < 
minGPU { + minGPU = v + } + } + assert.Greater(c, minGPU, 0, "did not find any server with amd gpu") + + gpuLimitCount := minGPU + gpuReqCount := minGPU + + res := &v1.ResourceRequirements{ + Limits: v1.ResourceList{ + "amd.com/gpu": resource.MustParse(fmt.Sprintf("%d", gpuLimitCount)), + }, + Requests: v1.ResourceList{ + "amd.com/gpu": resource.MustParse(fmt.Sprintf("%d", gpuReqCount)), + }, + } + + err = utils.DeployRocmPods(context.TODO(), s.clientSet, res) + assert.NoError(c, err, "failed to deploy pods") + err = utils.VerifyROCMPODResourceCount(ctx, s.clientSet, gpuReqCount, "gpu") + assert.NoError(c, err, fmt.Sprintf("%v", err)) + + // delete + s.deleteDeviceConfig(devCfg, c) + + err = utils.DelRocmPods(context.TODO(), s.clientSet) + assert.NoError(c, err, "failed to remove rocm pods") +} + +func (s *E2ESuite) TestWorkloadRequestedGPUsHomogeneousMixed(c *C) { + if s.simEnable { + c.Skip("Skipping for non amd gpu testbed") + } + if !dcmImageDefined { + c.Skip("skip DCM test because E2E_DCM_IMAGE is not defined") + } + + s.configMapHelper(c) + + logger.Infof("Add node label after pod comes up") + time.Sleep(30 * time.Second) + + nodes := utils.GetAMDGpuWorker(s.clientSet, s.openshift) + nodeNames := make([]string, 0) + for _, node := range nodes { + nodeNames = append(nodeNames, node.Name) + } + for _, nodeName := range nodeNames { + s.addRemoveNodeLabels(nodeName, "e2e_profile2") + } + + logs := s.getLogs() + if strings.Contains(logs, "Partition completed successfully") && (!strings.Contains(logs, "ERROR")) && (s.eventHelper("SuccessfullyPartitioned", "Normal")) { + logger.Infof("Successfully tested homogeneous partitioning") + } else { + logger.Errorf("Failure test homogeneous partitioning") + } + devCfgDcm := s.getDeviceConfigForDCM(c) + s.deleteDeviceConfig(devCfgDcm, c) + time.Sleep(60 * time.Second) + ctx := context.TODO() + logger.Infof("create %v", s.cfgName) + devCfg := s.getDeviceConfig(c) + driverEnable := false + devCfg.Spec.Driver.Enable = &driverEnable + devCfg.Spec.DevicePlugin.DevicePluginArguments = map[string]string{"resource_naming_strategy": "mixed"} + s.createDeviceConfig(devCfg, c) + s.checkNFDWorkerStatus(s.ns, c, "") + s.checkNodeLabellerStatus(s.ns, c, devCfg) + s.verifyDeviceConfigStatus(devCfg, c) + + ret, err := utils.GetAMDGPUCount(ctx, s.clientSet, "cpx_nps4") + if err != nil { + logger.Errorf("error: %v", err) + } + var minGPU int = 10000 + for _, v := range ret { + if v < minGPU { + minGPU = v + } + } + assert.Greater(c, minGPU, 0, "did not find any server with amd gpu") + + gpuLimitCount := minGPU + gpuReqCount := minGPU + + res := &v1.ResourceRequirements{ + Limits: v1.ResourceList{ + "amd.com/cpx_nps4": resource.MustParse(fmt.Sprintf("%d", gpuLimitCount)), + }, + Requests: v1.ResourceList{ + "amd.com/cpx_nps4": resource.MustParse(fmt.Sprintf("%d", gpuReqCount)), + }, + } + + err = utils.DeployRocmPods(context.TODO(), s.clientSet, res) + assert.NoError(c, err, "failed to deploy pods") + err = utils.VerifyROCMPODResourceCount(ctx, s.clientSet, gpuReqCount, "cpx_nps4") + assert.NoError(c, err, fmt.Sprintf("%v", err)) + + // delete + s.deleteDeviceConfig(devCfg, c) + + err = utils.DelRocmPods(context.TODO(), s.clientSet) + assert.NoError(c, err, "failed to remove rocm pods") + +} + +func (s *E2ESuite) TestWorkloadRequestedGPUsHeterogeneousMixed(c *C) { + if s.simEnable { + c.Skip("Skipping for non amd gpu testbed") + } + if !dcmImageDefined { + c.Skip("skip DCM test because E2E_DCM_IMAGE is not defined") + } + + s.configMapHelper(c) + + logger.Infof("Add 
node label after pod comes up") + time.Sleep(30 * time.Second) + + nodes := utils.GetAMDGpuWorker(s.clientSet, s.openshift) + nodeNames := make([]string, 0) + for _, node := range nodes { + nodeNames = append(nodeNames, node.Name) + } + for _, nodeName := range nodeNames { + s.addRemoveNodeLabels(nodeName, "e2e_profile1") + } + + logs := s.getLogs() + if strings.Contains(logs, "Partition completed successfully") && (!strings.Contains(logs, "ERROR")) && (s.eventHelper("SuccessfullyPartitioned", "Normal")) { + logger.Infof("Successfully tested homogeneous partitioning") + } else { + logger.Errorf("Failure test heterogenous partitioning") + } + devCfgDcm := s.getDeviceConfigForDCM(c) + s.deleteDeviceConfig(devCfgDcm, c) + time.Sleep(60 * time.Second) + + ctx := context.TODO() + logger.Infof("create %v", s.cfgName) + devCfg := s.getDeviceConfig(c) + driverEnable := false + devCfg.Spec.Driver.Enable = &driverEnable + devCfg.Spec.DevicePlugin.DevicePluginArguments = map[string]string{"resource_naming_strategy": "mixed"} + s.createDeviceConfig(devCfg, c) + s.checkNFDWorkerStatus(s.ns, c, "") + s.checkNodeLabellerStatus(s.ns, c, devCfg) + s.verifyDeviceConfigStatus(devCfg, c) + + ret, err := utils.GetAMDGPUCount(ctx, s.clientSet, "cpx_nps1") + if err != nil { + logger.Errorf("error: %v", err) + } + var minGPU int = 10000 + for _, v := range ret { + if v < minGPU { + minGPU = v + } + } + assert.Greater(c, minGPU, 0, "did not find any server with amd gpu") + + gpuLimitCount := minGPU + gpuReqCount := minGPU + + res := &v1.ResourceRequirements{ + Limits: v1.ResourceList{ + "amd.com/cpx_nps1": resource.MustParse(fmt.Sprintf("%d", gpuLimitCount)), + }, + Requests: v1.ResourceList{ + "amd.com/cpx_nps1": resource.MustParse(fmt.Sprintf("%d", gpuReqCount)), + }, + } + + err = utils.DeployRocmPods(context.TODO(), s.clientSet, res) + assert.NoError(c, err, "failed to deploy pods") + err = utils.VerifyROCMPODResourceCount(ctx, s.clientSet, gpuReqCount, "cpx_nps1") + assert.NoError(c, err, fmt.Sprintf("%v", err)) + + // delete + s.deleteDeviceConfig(devCfg, c) + + err = utils.DelRocmPods(context.TODO(), s.clientSet) + assert.NoError(c, err, "failed to remove rocm pods") +} + func (s *E2ESuite) TestKubeRbacProxyClusterIP(c *C) { _, err := s.dClient.DeviceConfigs(s.ns).Get("deviceconfig-kuberbac-clusterip", metav1.GetOptions{}) assert.Errorf(c, err, "config deviceconfig-kuberbac-clusterip exists") diff --git a/tests/e2e/dcm_e2e_test.go b/tests/e2e/dcm_e2e_test.go index f3f8b9df..cd11bd3c 100644 --- a/tests/e2e/dcm_e2e_test.go +++ b/tests/e2e/dcm_e2e_test.go @@ -72,7 +72,7 @@ func (s *E2ESuite) addRemoveNodeLabels(nodeName string, selectedProfile string) logger.Infof("Error adding node lbels: %s\n", err.Error()) return } - time.Sleep(15 * time.Second) + time.Sleep(45 * time.Second) // Allow partition to happen err = utils.DeleteNodeLabel(s.clientSet, nodeName, "dcm.amd.com/gpu-config-profile") _ = utils.DeleteNodeLabel(s.clientSet, nodeName, "dcm.amd.com/apply-gpu-config-profile") @@ -269,6 +269,7 @@ func (s *E2ESuite) createConfigMap() GPUConfigProfiles { { ComputePartition: "CPX", MemoryPartition: "NPS4", + NumGPUsAssigned: 1, }, } diff --git a/tests/e2e/testrunner_test.go b/tests/e2e/testrunner_test.go index 305b6f59..aa7b37a7 100644 --- a/tests/e2e/testrunner_test.go +++ b/tests/e2e/testrunner_test.go @@ -200,7 +200,7 @@ func (s *E2ESuite) createTestRunnerConfigmap(valid bool, devCfg *v1alpha1.Device } func (s *E2ESuite) scheduleWorkloadOnNodeWithMaxGPUs(c *C) string { - ret, err := 
utils.GetAMDGPUCount(context.TODO(), s.clientSet) + ret, err := utils.GetAMDGPUCount(context.TODO(), s.clientSet, "gpu") if err != nil { logger.Errorf("error: %v", err) } @@ -228,7 +228,7 @@ func (s *E2ESuite) scheduleWorkloadOnNodeWithMaxGPUs(c *C) string { err = utils.DeployRocmPods(context.TODO(), s.clientSet, res) assert.NoError(c, err, "failed to deploy pods") - err = utils.VerifyROCMPODResourceCount(context.TODO(), s.clientSet, gpuReqCount) + err = utils.VerifyROCMPODResourceCount(context.TODO(), s.clientSet, gpuReqCount, "gpu") assert.NoError(c, err, fmt.Sprintf("%v", err)) return nodeWithMaxGPU @@ -730,7 +730,7 @@ func (s *E2ESuite) TestTestRunnerLogsExport(c *C) { func (s *E2ESuite) getGPUNodeName() (nodeWithMaxGPU string) { var maxPerNodeGPU int = 0 - ret, err := utils.GetAMDGPUCount(context.TODO(), s.clientSet) + ret, err := utils.GetAMDGPUCount(context.TODO(), s.clientSet, "gpu") if err != nil { logger.Printf("Unable to fetch gpu nodes. Error %v", err) return diff --git a/tests/e2e/utils/utils.go b/tests/e2e/utils/utils.go index 9c9dcf9f..5813ccf5 100644 --- a/tests/e2e/utils/utils.go +++ b/tests/e2e/utils/utils.go @@ -598,14 +598,6 @@ func GetWorkerNodes(cl *kubernetes.Clientset) []*v1.Node { func GetAMDGpuWorker(cl *kubernetes.Clientset, isOpenshift bool) []v1.Node { ret := make([]v1.Node, 0) labelSelector := labels.NewSelector() - if !isOpenshift { - r, _ := labels.NewRequirement( - "node-role.kubernetes.io/control-plane", - selection.DoesNotExist, - nil, - ) - labelSelector = labelSelector.Add(*r) - } r, _ := labels.NewRequirement( "feature.node.kubernetes.io/amd-gpu", selection.Equals, @@ -766,7 +758,7 @@ func DelRocmPodsByNodeNames(ctx context.Context, cl *kubernetes.Clientset, } -func GetAMDGPUCount(ctx context.Context, cl *kubernetes.Clientset) (map[string]int, error) { +func GetAMDGPUCount(ctx context.Context, cl *kubernetes.Clientset, resourceType string) (map[string]int, error) { ret := make(map[string]int) // Get the list of nodes @@ -777,7 +769,8 @@ func GetAMDGPUCount(ctx context.Context, cl *kubernetes.Clientset) (map[string]i // Iterate over the nodes and count AMD GPUs for _, node := range nodes.Items { - if val, ok := node.Status.Capacity["amd.com/gpu"]; ok { + resourceKey := v1.ResourceName("amd.com/" + resourceType) + if val, ok := node.Status.Capacity[resourceKey]; ok { num, err := strconv.ParseInt(val.String(), 10, 64) if err != nil { log.Infof("error: %v", err) @@ -790,7 +783,7 @@ func GetAMDGPUCount(ctx context.Context, cl *kubernetes.Clientset) (map[string]i } func VerifyROCMPODResourceCount(ctx context.Context, cl *kubernetes.Clientset, - gpuReqCount int) error { + gpuReqCount int, resourceType string) error { its, err := cl.CoreV1().Pods("").List(ctx, metav1.ListOptions{ @@ -805,7 +798,8 @@ func VerifyROCMPODResourceCount(ctx context.Context, cl *kubernetes.Clientset, continue } - if gpu, ok := cntr.Resources.Requests["amd.com/gpu"]; ok { + resourceKey := v1.ResourceName("amd.com/" + resourceType) + if gpu, ok := cntr.Resources.Requests[resourceKey]; ok { gpuAssignedCount := int(gpu.Value()) if gpuReqCount < gpuAssignedCount { return fmt.Errorf("gpu requested %d got %d", From 16d74f4592ea107f50d3ef78d0d1424028ec394d Mon Sep 17 00:00:00 2001 From: yansun1996 Date: Fri, 28 Mar 2025 23:23:34 +0000 Subject: [PATCH 06/16] [DOC] Add note that RVS test isn't compatible with partitioned GPU yet --- docs/test/auto-unhealthy-device-test.md | 4 ++++ docs/test/manual-test.md | 4 ++++ docs/test/pre-start-job-test.md | 4 ++++ 3 files changed, 12 insertions(+) diff 
--git a/docs/test/auto-unhealthy-device-test.md b/docs/test/auto-unhealthy-device-test.md index 0b6e9cb3..354cc0c7 100644 --- a/docs/test/auto-unhealthy-device-test.md +++ b/docs/test/auto-unhealthy-device-test.md @@ -4,6 +4,10 @@ Test runner is periodically watching for the device health status from device metrics exporter per 30 seconds. Once exporter reported GPU status is unhealthy, test runner will start to run one-time test on the unhealthy GPU. The test result will be exported as Kubernetes event. +```{warning} +The Test Runner's RVS test recipes aren't compatible with partitioned GPU. If you're using partitoned GPU please disable the test runner from ```DeviceConfig``` by setting ```spec/testRunner/enable``` to ```false```. +``` + ## Configure test runner To start the Test Runner along with the GPU Operator, Device Metrics Exporter must be enabled since Test Runner is depending on the exported health status. Configure the ``` spec/metricsExporter/enable ``` field in deviceconfig Custom Resource(CR) to enable/disable metrics exporter and configure the ``` spec/testRunner/enable ``` field in deviceconfig Custom Resource(CR) to enable/disable test runner. diff --git a/docs/test/manual-test.md b/docs/test/manual-test.md index c00ac288..7d14f1a9 100644 --- a/docs/test/manual-test.md +++ b/docs/test/manual-test.md @@ -4,6 +4,10 @@ To start the manual test, directly use the test runner image to create the Kubernetes job and related resources, then the test will be triggered manually. +```{warning} +The Test Runner's RVS test recipes aren't compatible with partitioned GPU. If you're using partitoned GPU please reset the GPU partition configuration and run the manual test against the non-partitioned GPU. +``` + ## Use Case 1 - GPU is unhealthy on the node When any GPU on a specific worker node is unhealthy, you can manually trigger a test / benchmark run on that worker node to check more details on the unhealthy state. The test job requires RBAC config to grant the test runner access to export events and add node labels to the cluster. Here is an example of configuring the RBAC and Job resources: diff --git a/docs/test/pre-start-job-test.md b/docs/test/pre-start-job-test.md index 2bad5332..f11e765d 100644 --- a/docs/test/pre-start-job-test.md +++ b/docs/test/pre-start-job-test.md @@ -4,6 +4,10 @@ Test runner can be embedded as an init container within your Kubernetes workload pod definition. The init container will be executed before the actual workload containers start, in that way the system could be tested right before the workload start to use the hardware resource. +```{warning} +The Test Runner's RVS test recipes aren't compatible with partitioned GPU. If you're using partitoned GPU, don't run the test runner as init container to perform the pre-start job test. +``` + ## Configure pre-start init container The init container requires RBAC config to grant the pod access to export events and add node labels to the cluster. 
Here is an example of configuring the RBAC and Job resources: From 66b51d5ab83c33502dad49b185abbb007c001fd0 Mon Sep 17 00:00:00 2001 From: yansun1996 Date: Mon, 31 Mar 2025 09:01:10 +0000 Subject: [PATCH 07/16] Address comments --- docs/test/auto-unhealthy-device-test.md | 2 +- docs/test/manual-test.md | 2 +- docs/test/pre-start-job-test.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/test/auto-unhealthy-device-test.md b/docs/test/auto-unhealthy-device-test.md index 354cc0c7..c610a32c 100644 --- a/docs/test/auto-unhealthy-device-test.md +++ b/docs/test/auto-unhealthy-device-test.md @@ -5,7 +5,7 @@ Test runner is periodically watching for the device health status from device metrics exporter per 30 seconds. Once exporter reported GPU status is unhealthy, test runner will start to run one-time test on the unhealthy GPU. The test result will be exported as Kubernetes event. ```{warning} -The Test Runner's RVS test recipes aren't compatible with partitioned GPU. If you're using partitoned GPU please disable the test runner from ```DeviceConfig``` by setting ```spec/testRunner/enable``` to ```false```. +The RVS test recipes in the Test Runner aren't compatible with partitioned GPUs. To address this, either disable the test runner by setting ```spec/testRunner/enable``` to ```false```, or configure the test runner to run only on nodes without partitioned GPUs by using ```spec/testRunner/selector```. ``` ## Configure test runner diff --git a/docs/test/manual-test.md b/docs/test/manual-test.md index 7d14f1a9..c4ba4bae 100644 --- a/docs/test/manual-test.md +++ b/docs/test/manual-test.md @@ -5,7 +5,7 @@ To start the manual test, directly use the test runner image to create the Kubernetes job and related resources, then the test will be triggered manually. ```{warning} -The Test Runner's RVS test recipes aren't compatible with partitioned GPU. If you're using partitoned GPU please reset the GPU partition configuration and run the manual test against the non-partitioned GPU. +The RVS test recipes in the Test Runner are not compatible with partitioned GPUs. If you are using a partitioned GPU, please reset the GPU partition configuration and conduct the manual test on a non-partitioned GPU. ``` ## Use Case 1 - GPU is unhealthy on the node diff --git a/docs/test/pre-start-job-test.md b/docs/test/pre-start-job-test.md index f11e765d..d5133faa 100644 --- a/docs/test/pre-start-job-test.md +++ b/docs/test/pre-start-job-test.md @@ -5,7 +5,7 @@ Test runner can be embedded as an init container within your Kubernetes workload pod definition. The init container will be executed before the actual workload containers start, in that way the system could be tested right before the workload start to use the hardware resource. ```{warning} -The Test Runner's RVS test recipes aren't compatible with partitioned GPU. If you're using partitoned GPU, don't run the test runner as init container to perform the pre-start job test. +The RVS test recipes in the Test Runner are not compatible with partitioned GPUs. If you are using a partitioned GPU, avoid running the Test Runner as an init container for the pre-start job test. 
``` ## Configure pre-start init container From d7231ef1f87e026c87dd8296dd0b4618999d10fa Mon Sep 17 00:00:00 2001 From: vm Date: Fri, 28 Mar 2025 04:12:20 +0000 Subject: [PATCH 08/16] BootID support for Reboot during Driver Upgrade --- internal/controllers/mock_upgrademgr.go | 26 +++++++++++++++++++++++++ internal/controllers/upgrademgr.go | 23 ++++++++++++++++++++++ 2 files changed, 49 insertions(+) diff --git a/internal/controllers/mock_upgrademgr.go b/internal/controllers/mock_upgrademgr.go index 7db0fa9c..03944030 100644 --- a/internal/controllers/mock_upgrademgr.go +++ b/internal/controllers/mock_upgrademgr.go @@ -216,6 +216,20 @@ func (mr *MockupgradeMgrHelperAPIMockRecorder) deleteRebootPod(ctx, nodeName, dc return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "deleteRebootPod", reflect.TypeOf((*MockupgradeMgrHelperAPI)(nil).deleteRebootPod), ctx, nodeName, dc, force, genId) } +// getBootID mocks base method. +func (m *MockupgradeMgrHelperAPI) getBootID(nodeName string) string { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "getBootID", nodeName) + ret0, _ := ret[0].(string) + return ret0 +} + +// getBootID indicates an expected call of getBootID. +func (mr *MockupgradeMgrHelperAPIMockRecorder) getBootID(nodeName any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "getBootID", reflect.TypeOf((*MockupgradeMgrHelperAPI)(nil).getBootID), nodeName) +} + // getNode mocks base method. func (m *MockupgradeMgrHelperAPI) getNode(ctx context.Context, nodeName string) (*v1.Node, error) { m.ctrl.T.Helper() @@ -465,6 +479,18 @@ func (mr *MockupgradeMgrHelperAPIMockRecorder) isUpgradePolicyViolated(upgradeIn return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "isUpgradePolicyViolated", reflect.TypeOf((*MockupgradeMgrHelperAPI)(nil).isUpgradePolicyViolated), upgradeInProgress, upgradeFailedState, totalNodes, deviceConfig) } +// setBootID mocks base method. +func (m *MockupgradeMgrHelperAPI) setBootID(nodeName, bootID string) { + m.ctrl.T.Helper() + m.ctrl.Call(m, "setBootID", nodeName, bootID) +} + +// setBootID indicates an expected call of setBootID. +func (mr *MockupgradeMgrHelperAPIMockRecorder) setBootID(nodeName, bootID any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "setBootID", reflect.TypeOf((*MockupgradeMgrHelperAPI)(nil).setBootID), nodeName, bootID) +} + // setNodeStatus mocks base method. 
func (m *MockupgradeMgrHelperAPI) setNodeStatus(ctx context.Context, nodeName string, status v1alpha1.UpgradeState) { m.ctrl.T.Helper() diff --git a/internal/controllers/upgrademgr.go b/internal/controllers/upgrademgr.go index a5e519b2..ad2ee41c 100644 --- a/internal/controllers/upgrademgr.go +++ b/internal/controllers/upgrademgr.go @@ -287,6 +287,8 @@ type upgradeMgrHelperAPI interface { setUpgradeStartTime(nodeName string) clearUpgradeStartTime(nodeName string) checkUpgradeTimeExceeded(ctx context.Context, nodeName string, deviceConfig *amdv1alpha1.DeviceConfig) bool + getBootID(nodeName string) string + setBootID(nodeName string, bootID string) clearNodeStatus() isInit() bool } @@ -297,6 +299,7 @@ type upgradeMgrHelper struct { drainHelper *drain.Helper nodeStatus *sync.Map nodeUpgradeStartTime *sync.Map + nodeBootID *sync.Map init bool currentSpec driverSpec } @@ -313,6 +316,7 @@ func newUpgradeMgrHelperHandler(client client.Client, k8sInterface kubernetes.In k8sInterface: k8sInterface, nodeStatus: new(sync.Map), nodeUpgradeStartTime: new(sync.Map), + nodeBootID: new(sync.Map), } } @@ -527,6 +531,18 @@ func (h *upgradeMgrHelper) checkUpgradeTimeExceeded(ctx context.Context, nodeNam return false } +func (h *upgradeMgrHelper) getBootID(nodeName string) string { + if value, ok := h.nodeBootID.Load(nodeName); ok { + return value.(string) + } + + return "" +} + +func (h *upgradeMgrHelper) setBootID(nodeName string, currentbootID string) { + h.nodeBootID.Store(nodeName, currentbootID) +} + func (h *upgradeMgrHelper) getNodeStatus(nodeName string) amdv1alpha1.UpgradeState { if value, ok := h.nodeStatus.Load(nodeName); ok { @@ -867,6 +883,8 @@ func (h *upgradeMgrHelper) handleNodeReboot(ctx context.Context, node *v1.Node, // Wait for the driver upgrade to complete waitForDriverUpgrade() + currentBootID := node.Status.NodeInfo.BootID + h.setBootID(node.Name, currentBootID) if err := h.client.Create(ctx, rebootPod); err != nil { logger.Error(err, fmt.Sprintf("Node: %v State: %v RebootPod Create failed with Error: %v", node.Name, h.getNodeStatus(node.Name), err)) // Mark the state as failed @@ -888,6 +906,11 @@ func (h *upgradeMgrHelper) handleNodeReboot(ctx context.Context, node *v1.Node, } } + if nodeObj.Status.NodeInfo.BootID != h.getBootID(node.Name) { + h.setBootID(node.Name, nodeObj.Status.NodeInfo.BootID) + logger.Info(fmt.Sprintf("Node: %v has rebooted", node.Name)) + return + } // If node is NotReady, proceed; otherwise, wait for the next tick if nodeNotReady { logger.Info(fmt.Sprintf("Node: %v has moved to NotReady", node.Name)) From b64f29b521ab7d3091bf0de15046435ae89d9547 Mon Sep 17 00:00:00 2001 From: vm Date: Wed, 26 Mar 2025 07:09:14 +0000 Subject: [PATCH 09/16] Device Plugin Usage documentation from GPU Operator --- docs/device_plugin/device-plugin.md | 112 ++++++++++++++++++++++++++++ docs/sphinx/_toc.yml | 3 + docs/sphinx/_toc.yml.in | 3 + 3 files changed, 118 insertions(+) create mode 100644 docs/device_plugin/device-plugin.md diff --git a/docs/device_plugin/device-plugin.md b/docs/device_plugin/device-plugin.md new file mode 100644 index 00000000..4ecfb97b --- /dev/null +++ b/docs/device_plugin/device-plugin.md @@ -0,0 +1,112 @@ +# Device Plugin + +## Configure device plugin + +To start the Device Plugin along with the GPU Operator configure fields under the ``` spec/devicePlugin ``` field in deviceconfig Custom Resource(CR) + +```yaml + devicePlugin: + # Specify the device plugin image + # default value is rocm/k8s-device-plugin:latest + devicePluginImage: 
rocm/k8s-device-plugin:latest + + # The device plugin arguments is used to pass supported flags and their values while starting device plugin daemonset + devicePluginArguments: + resource_naming_strategy: single + + # Specify the node labeller image + # default value is rocm/k8s-device-plugin:labeller-latest + nodeLabellerImage: rocm/k8s-device-plugin:labeller-latest + + # Specify whether to bring up node labeller component + # default value is true + enableNodeLabeller: True + +``` + +The **device-plugin** pods start after updating the **DeviceConfig** CR + +```bash +#kubectl get pods -n kube-amd-gpu +NAME READY STATUS RESTARTS AGE +amd-gpu-operator-gpu-operator-charts-controller-manager-77tpmgn 1/1 Running 0 4h9m +amd-gpu-operator-kmm-controller-6d459dffcf-lbgtt 1/1 Running 0 4h9m +amd-gpu-operator-kmm-webhook-server-5fdc8b995-qgj49 1/1 Running 0 4h9m +amd-gpu-operator-node-feature-discovery-gc-78989c896-7lh8t 1/1 Running 0 3h48m +amd-gpu-operator-node-feature-discovery-master-b8bffc48b-6rnz6 1/1 Running 0 4h9m +amd-gpu-operator-node-feature-discovery-worker-m9lwn 1/1 Running 0 4h9m +test-deviceconfig-device-plugin-rk5f4 1/1 Running 0 134m +test-deviceconfig-node-labeller-bxk7x 1/1 Running 0 134m +``` + +
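+As an illustration, the `devicePlugin` settings shown above can also be applied to an existing CR with a merge patch. The CR name `test-deviceconfig` and the `kube-amd-gpu` namespace below match the listing above; substitute your own values:
+
+```bash
+# Sketch: update only the device plugin arguments of an existing DeviceConfig.
+# The operator is expected to reconcile the device-plugin daemonset afterwards.
+kubectl patch deviceconfigs test-deviceconfig -n kube-amd-gpu --type merge \
+  -p '{"spec":{"devicePlugin":{"devicePluginArguments":{"resource_naming_strategy":"single"}}}}'
+```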
+> **Note**: The Device Plugin name will be prefixed with the name of your DeviceConfig custom resource.
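+For example, a quick way to confirm which device-plugin and node-labeller pods belong to a given DeviceConfig (assumed here to be named `test-deviceconfig`, as in the listing above) is to filter on that prefix:
+
+```bash
+# Lists only the pods whose names start with the DeviceConfig name.
+kubectl get pods -n kube-amd-gpu | grep '^test-deviceconfig-'
+```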

+ +## Device Plugin DeviceConfig +| Field Name | Details | +|----------------------------------|----------------------------------------------| +| **DevicePluginImage** | Device plugin image | +| **DevicePluginImagePullPolicy** | One of Always, Never, IfNotPresent. | +| **NodeLabellerImage** | Node labeller image | +| **NodeLabellerImagePullPolicy** | One of Always, Never, IfNotPresent. | +| **EnableNodeLabeller** | Enable/Disable node labeller with True/False | +| **DevicePluginArguments** | The flag/values to pass on to Device Plugin | +
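+As a minimal sketch of how a workload consumes what the device plugin advertises, the hypothetical pod below requests one GPU under the default `amd.com/gpu` resource name; the image and pod name are placeholders, and the resource name changes if you use the `mixed` strategy described in the next section:
+
+```bash
+# Hypothetical smoke-test pod requesting a single AMD GPU.
+cat <<'EOF' | kubectl apply -f -
+apiVersion: v1
+kind: Pod
+metadata:
+  name: gpu-smoke-test
+spec:
+  restartPolicy: Never
+  containers:
+    - name: rocm
+      image: rocm/rocm-terminal:latest
+      command: ["rocm-smi"]
+      resources:
+        limits:
+          amd.com/gpu: 1
+EOF
+```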
+ +1. Both the `ImagePullPolicy` fields default to `Always` if `:latest` tag is specified on the respective Image, or defaults to `IfNotPresent` otherwise. This is default k8s behaviour for `ImagePullPolicy` + +2. `DevicePluginArguments` is of type `map[string]string`. Currently supported key value pairs to set under `DevicePluginArguments` are: + -> "resource_naming_strategy": {"single", "mixed"} + +## How to choose Resource Naming Strategy + +To customize the way device plugin reports gpu resources to kubernetes as allocatable k8s resources, use the `single` or `mixed` resource naming strategy in **DeviceConfig** CR +Before understanding each strategy, please note the definition of homogeneous and heterogeneous nodes + +Homogeneous node: A node whose gpu's follow the same compute-memory partition style + -> Example: A node of 8 GPU's where all 8 GPU's are following CPX-NPS4 partition style + +Heterogeneous node: A node whose gpu's follow different compute-memory partition styles + -> Example: A node of 8 GPU's where 5 GPU's are following SPX-NPS1 and 3 GPU's are following CPX-NPS1 + +### Single + +In `single` mode, the device plugin reports all gpu's (regardless of whether they are whole gpu's or partitions of a gpu) under the resource name `amd.com/gpu` +This mode is supported for homogeneous nodes but not supported for heterogeneous nodes + +A node which has 8 GPUs where all GPUs are not partitioned will report its resources as: + +```bash +amd.com/gpu: 8 +``` + +A node which has 8 GPUs where all GPUs are partitioned using CPX-NPS4 style will report its resources as: + +```bash +amd.com/gpu: 64 +``` + +### Mixed + +In `mixed` mode, the device plugin reports all gpu's under a name which matches its partition style. +This mode is supported for both homogeneous nodes and heterogeneous nodes + +A node which has 8 GPUs which are all partitioned using CPX-NPS4 style will report its resources as: + +```bash +amd.com/cpx_nps4: 64 +``` + +A node which has 8 GPUs where 5 GPU's are following SPX-NPS1 and 3 GPU's are following CPX-NPS1 will report its resources as: + +```bash +amd.com/spx_nps1: 5 +amd.com/cpx_nps1: 24 +``` + +#### **Notes** + +- If `resource_naming_strategy` is not passed using `DevicePluginArguments` field in CR, then device plugin will internally default to `single` resource naming strategy. 
This maintains backwards compatibility with earlier release of device plugin with reported resource name of `amd.com/gpu` +- If a node has GPUs which do not support partitioning, such as MI210, then the GPUs are reported under resource name `amd.com/gpu` regardless of the resource naming strategy +- These different naming styles of resources, for example, `amd.com/cpx_nps1` should be followed when requesting for resources in a pod spec \ No newline at end of file diff --git a/docs/sphinx/_toc.yml b/docs/sphinx/_toc.yml index a232e7ab..62786ea4 100644 --- a/docs/sphinx/_toc.yml +++ b/docs/sphinx/_toc.yml @@ -44,6 +44,9 @@ subtrees: - file: test/manual-test - file: test/pre-start-job-test - file: test/appendix-test-recipe + - caption: Device Plugin + entries: + - file: device_plugin/device-plugin - caption: Specialized Networks entries: - file: specialized_networks/airgapped-install diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index a232e7ab..62786ea4 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -44,6 +44,9 @@ subtrees: - file: test/manual-test - file: test/pre-start-job-test - file: test/appendix-test-recipe + - caption: Device Plugin + entries: + - file: device_plugin/device-plugin - caption: Specialized Networks entries: - file: specialized_networks/airgapped-install From 22854130e073c474f37543fda9b6e53e8b2d7d30 Mon Sep 17 00:00:00 2001 From: yansun1996 Date: Wed, 26 Mar 2025 20:13:14 +0000 Subject: [PATCH 10/16] Optimize the docs and filename for blacklist function --- api/v1alpha1/deviceconfig_types.go | 4 +++- .../amd-gpu-operator.clusterserviceversion.yaml | 7 +++++-- bundle/manifests/amd.com_deviceconfigs.yaml | 5 ++++- config/crd/bases/amd.com_deviceconfigs.yaml | 5 ++++- .../amd-gpu-operator.clusterserviceversion.yaml | 5 ++++- helm-charts-k8s/Chart.lock | 2 +- helm-charts-k8s/crds/deviceconfig-crd.yaml | 5 ++++- helm-charts-openshift/Chart.lock | 2 +- helm-charts-openshift/crds/deviceconfig-crd.yaml | 5 ++++- internal/nodelabeller/nodelabeller.go | 12 +++++++++--- 10 files changed, 39 insertions(+), 13 deletions(-) diff --git a/api/v1alpha1/deviceconfig_types.go b/api/v1alpha1/deviceconfig_types.go index 503c0939..b6f186c0 100644 --- a/api/v1alpha1/deviceconfig_types.go +++ b/api/v1alpha1/deviceconfig_types.go @@ -94,7 +94,9 @@ type DriverSpec struct { // +kubebuilder:default=true Enable *bool `json:"enable,omitempty"` - // blacklist amdgpu drivers on the host + // blacklist amdgpu drivers on the host. Node reboot is required to apply the baclklist on the worker nodes. + // Not working for OpenShift cluster. OpenShift users please use the Machine Config Operator (MCO) resource to configure amdgpu blacklist. 
+ // Example MCO resource is available at https://instinct.docs.amd.com/projects/gpu-operator/en/latest/installation/openshift-olm.html#create-blacklist-for-installing-out-of-tree-kernel-module //+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="BlacklistDrivers",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:blacklistDrivers"} Blacklist *bool `json:"blacklist,omitempty"` diff --git a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml index 45078acb..3a6cd86b 100644 --- a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml +++ b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml @@ -30,7 +30,7 @@ metadata: } ] capabilities: Basic Install - createdAt: "2025-03-25T06:19:27Z" + createdAt: "2025-03-26T20:10:59Z" operatorframework.io/suggested-namespace: openshift-amd-gpu operators.operatorframework.io/builder: operator-sdk-v1.32.0 operators.operatorframework.io/project_layout: go.kubebuilder.io/v3 @@ -229,7 +229,10 @@ spec: path: driver.amdgpuInstallerRepoURL x-descriptors: - urn:alm:descriptor:com.amd.deviceconfigs:amdgpuInstallerRepoURL - - description: blacklist amdgpu drivers on the host + - description: blacklist amdgpu drivers on the host. Node reboot is required + to apply the baclklist on the worker nodes. Not working for OpenShift cluster. + OpenShift users please use the Machine Config Operator (MCO) resource to + configure amdgpu blacklist. Example MCO resource is available at https://instinct.docs.amd.com/projects/gpu-operator/en/latest/installation/openshift-olm.html#create-blacklist-for-installing-out-of-tree-kernel-module displayName: BlacklistDrivers path: driver.blacklist x-descriptors: diff --git a/bundle/manifests/amd.com_deviceconfigs.yaml b/bundle/manifests/amd.com_deviceconfigs.yaml index c9123ffe..d2669dc1 100644 --- a/bundle/manifests/amd.com_deviceconfigs.yaml +++ b/bundle/manifests/amd.com_deviceconfigs.yaml @@ -342,7 +342,10 @@ spec: installer URL is https://repo.radeon.com/amdgpu-install by default type: string blacklist: - description: blacklist amdgpu drivers on the host + description: |- + blacklist amdgpu drivers on the host. Node reboot is required to apply the baclklist on the worker nodes. + Not working for OpenShift cluster. OpenShift users please use the Machine Config Operator (MCO) resource to configure amdgpu blacklist. + Example MCO resource is available at https://instinct.docs.amd.com/projects/gpu-operator/en/latest/installation/openshift-olm.html#create-blacklist-for-installing-out-of-tree-kernel-module type: boolean enable: default: true diff --git a/config/crd/bases/amd.com_deviceconfigs.yaml b/config/crd/bases/amd.com_deviceconfigs.yaml index 24c2b053..7916a7e6 100644 --- a/config/crd/bases/amd.com_deviceconfigs.yaml +++ b/config/crd/bases/amd.com_deviceconfigs.yaml @@ -338,7 +338,10 @@ spec: installer URL is https://repo.radeon.com/amdgpu-install by default type: string blacklist: - description: blacklist amdgpu drivers on the host + description: |- + blacklist amdgpu drivers on the host. Node reboot is required to apply the baclklist on the worker nodes. + Not working for OpenShift cluster. OpenShift users please use the Machine Config Operator (MCO) resource to configure amdgpu blacklist. 
+ Example MCO resource is available at https://instinct.docs.amd.com/projects/gpu-operator/en/latest/installation/openshift-olm.html#create-blacklist-for-installing-out-of-tree-kernel-module type: boolean enable: default: true diff --git a/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml b/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml index a9f4d685..f91b8a24 100644 --- a/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml +++ b/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml @@ -200,7 +200,10 @@ spec: path: driver.amdgpuInstallerRepoURL x-descriptors: - urn:alm:descriptor:com.amd.deviceconfigs:amdgpuInstallerRepoURL - - description: blacklist amdgpu drivers on the host + - description: blacklist amdgpu drivers on the host. Node reboot is required + to apply the baclklist on the worker nodes. Not working for OpenShift cluster. + OpenShift users please use the Machine Config Operator (MCO) resource to + configure amdgpu blacklist. Example MCO resource is available at https://instinct.docs.amd.com/projects/gpu-operator/en/latest/installation/openshift-olm.html#create-blacklist-for-installing-out-of-tree-kernel-module displayName: BlacklistDrivers path: driver.blacklist x-descriptors: diff --git a/helm-charts-k8s/Chart.lock b/helm-charts-k8s/Chart.lock index 54b4cb8c..f42b6cfb 100644 --- a/helm-charts-k8s/Chart.lock +++ b/helm-charts-k8s/Chart.lock @@ -6,4 +6,4 @@ dependencies: repository: file://./charts/kmm version: v1.0.0 digest: sha256:f9a315dd2ce3d515ebf28c8e9a6a82158b493ca2686439ec381487761261b597 -generated: "2025-03-25T06:19:17.248998622Z" +generated: "2025-03-26T20:10:45.247725094Z" diff --git a/helm-charts-k8s/crds/deviceconfig-crd.yaml b/helm-charts-k8s/crds/deviceconfig-crd.yaml index 502f4b89..81c564c1 100644 --- a/helm-charts-k8s/crds/deviceconfig-crd.yaml +++ b/helm-charts-k8s/crds/deviceconfig-crd.yaml @@ -346,7 +346,10 @@ spec: installer URL is https://repo.radeon.com/amdgpu-install by default type: string blacklist: - description: blacklist amdgpu drivers on the host + description: |- + blacklist amdgpu drivers on the host. Node reboot is required to apply the baclklist on the worker nodes. + Not working for OpenShift cluster. OpenShift users please use the Machine Config Operator (MCO) resource to configure amdgpu blacklist. + Example MCO resource is available at https://instinct.docs.amd.com/projects/gpu-operator/en/latest/installation/openshift-olm.html#create-blacklist-for-installing-out-of-tree-kernel-module type: boolean enable: default: true diff --git a/helm-charts-openshift/Chart.lock b/helm-charts-openshift/Chart.lock index 6e9b718d..8eb0ba07 100644 --- a/helm-charts-openshift/Chart.lock +++ b/helm-charts-openshift/Chart.lock @@ -6,4 +6,4 @@ dependencies: repository: file://./charts/kmm version: v1.0.0 digest: sha256:25200c34a5cc846a1275e5bf3fc637b19e909dc68de938189c5278d77d03f5ac -generated: "2025-03-25T06:19:26.060856628Z" +generated: "2025-03-26T20:10:56.781691243Z" diff --git a/helm-charts-openshift/crds/deviceconfig-crd.yaml b/helm-charts-openshift/crds/deviceconfig-crd.yaml index 502f4b89..81c564c1 100644 --- a/helm-charts-openshift/crds/deviceconfig-crd.yaml +++ b/helm-charts-openshift/crds/deviceconfig-crd.yaml @@ -346,7 +346,10 @@ spec: installer URL is https://repo.radeon.com/amdgpu-install by default type: string blacklist: - description: blacklist amdgpu drivers on the host + description: |- + blacklist amdgpu drivers on the host. 
Node reboot is required to apply the baclklist on the worker nodes. + Not working for OpenShift cluster. OpenShift users please use the Machine Config Operator (MCO) resource to configure amdgpu blacklist. + Example MCO resource is available at https://instinct.docs.amd.com/projects/gpu-operator/en/latest/installation/openshift-olm.html#create-blacklist-for-installing-out-of-tree-kernel-module type: boolean enable: default: true diff --git a/internal/nodelabeller/nodelabeller.go b/internal/nodelabeller/nodelabeller.go index 959bf39f..81293fd9 100644 --- a/internal/nodelabeller/nodelabeller.go +++ b/internal/nodelabeller/nodelabeller.go @@ -52,6 +52,8 @@ const ( defaultNodeLabellerImage = "rocm/k8s-device-plugin:labeller-latest" defaultUbiNodeLabellerImage = "rocm/k8s-node-labeller:rhubi-latest" defaultInitContainerImage = "busybox:1.36" + defaultBlacklistFileName = "blacklist-amdgpu.conf" + openShiftBlacklistFileName = "blacklist-amdgpu-by-operator.conf" ) //go:generate mockgen -source=nodelabeller.go -package=nodelabeller -destination=mock_nodelabeller.go NodeLabeller @@ -129,15 +131,19 @@ func (nl *nodeLabeller) SetNodeLabellerAsDesired(ds *appsv1.DaemonSet, devConfig }, } - var initContainerCommand []string + blackListFileName := defaultBlacklistFileName + if nl.isOpenShift { + blackListFileName = openShiftBlacklistFileName + } + var initContainerCommand []string if devConfig.Spec.Driver.Blacklist != nil && *devConfig.Spec.Driver.Blacklist { // if users want to apply the blacklist, init container will add the amdgpu to the blacklist - initContainerCommand = []string{"sh", "-c", "echo \"# added by gpu operator \nblacklist amdgpu\" > /host-etc/modprobe.d/blacklist-amdgpu.conf; while [ ! -d /host-sys/class/kfd ] || [ ! -d /host-sys/module/amdgpu/drivers/ ]; do echo \"amdgpu driver is not loaded \"; sleep 2 ;done"} + initContainerCommand = []string{"sh", "-c", fmt.Sprintf("echo \"# added by gpu operator \nblacklist amdgpu\" > /host-etc/modprobe.d/%v; while [ ! -d /host-sys/class/kfd ] || [ ! -d /host-sys/module/amdgpu/drivers/ ]; do echo \"amdgpu driver is not loaded \"; sleep 2 ;done", blackListFileName)} } else { // if users disabled the KMM driver, or disabled the blacklist // init container will remove any hanging amdgpu blacklist entry from the list - initContainerCommand = []string{"sh", "-c", "rm -f /host-etc/modprobe.d/blacklist-amdgpu.conf; while [ ! -d /host-sys/class/kfd ] || [ ! -d /host-sys/module/amdgpu/drivers/ ]; do echo \"amdgpu driver is not loaded \"; sleep 2 ;done"} + initContainerCommand = []string{"sh", "-c", fmt.Sprintf("rm -f /host-etc/modprobe.d/%v; while [ ! -d /host-sys/class/kfd ] || [ ! 
-d /host-sys/module/amdgpu/drivers/ ]; do echo \"amdgpu driver is not loaded \"; sleep 2 ;done", blackListFileName)} } initContainerImage := defaultInitContainerImage From 9630502228943bcec9fac2607824be3b5c932fa4 Mon Sep 17 00:00:00 2001 From: vm Date: Wed, 2 Apr 2025 05:37:22 +0000 Subject: [PATCH 11/16] Rhubi based utils container --- internal/utils_container/Dockerfile | 36 ++++++----------------------- 1 file changed, 7 insertions(+), 29 deletions(-) diff --git a/internal/utils_container/Dockerfile b/internal/utils_container/Dockerfile index 59e84fda..ada5a760 100644 --- a/internal/utils_container/Dockerfile +++ b/internal/utils_container/Dockerfile @@ -1,31 +1,9 @@ -# Base image -FROM alpine:3.20.3 +FROM registry.access.redhat.com/ubi9/ubi:9.3 -# Install build dependencies -RUN apk add --no-cache \ - bash \ - build-base \ - automake \ - autoconf \ - libtool \ - pkgconfig \ - gettext-dev \ - bison \ - wget \ - tar \ - flex \ - linux-headers +# Install nsenter from util-linux package +RUN dnf install -y util-linux && \ + cp /usr/bin/nsenter /nsenter && \ + dnf clean all -# Set working directory -WORKDIR /tmp - -RUN wget https://github.com/util-linux/util-linux/archive/v2.40.tar.gz && tar -xzf v2.40.tar.gz - -# Build and install nsenter only -WORKDIR /tmp/util-linux-2.40 -RUN ./autogen.sh && \ - ./configure --disable-all-programs --enable-nsenter && \ - make nsenter && \ - cp nsenter /nsenter - -ENTRYPOINT ["/nsenter"] +# Set entrypoint to nsenter +ENTRYPOINT ["/nsenter"] \ No newline at end of file From 61bd320e1e181523d7cbf5d0fb6d7a899bbfffe7 Mon Sep 17 00:00:00 2001 From: Sriram Ravishankar <79412470+sriram-30@users.noreply.github.com> Date: Wed, 2 Apr 2025 11:25:23 +0530 Subject: [PATCH 12/16] use ubi minimal image for smaller size --- internal/utils_container/Dockerfile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/internal/utils_container/Dockerfile b/internal/utils_container/Dockerfile index ada5a760..a40f740b 100644 --- a/internal/utils_container/Dockerfile +++ b/internal/utils_container/Dockerfile @@ -1,9 +1,9 @@ -FROM registry.access.redhat.com/ubi9/ubi:9.3 +FROM registry.access.redhat.com/ubi9/ubi-minimal:9.3 # Install nsenter from util-linux package -RUN dnf install -y util-linux && \ +RUN microdnf install -y util-linux && \ cp /usr/bin/nsenter /nsenter && \ - dnf clean all + microdnf clean all # Set entrypoint to nsenter -ENTRYPOINT ["/nsenter"] \ No newline at end of file +ENTRYPOINT ["/nsenter"] From f4b47f2ef355dafc25eec874c95227d83ce5233b Mon Sep 17 00:00:00 2001 From: yansun1996 Date: Wed, 2 Apr 2025 23:23:03 +0000 Subject: [PATCH 13/16] Push OLM changes for certification on OperatorHub --- ...md-gpu-operator.clusterserviceversion.yaml | 47 +++++++++++++++---- ...md-gpu-operator.clusterserviceversion.yaml | 45 +++++++++++++++--- 2 files changed, 77 insertions(+), 15 deletions(-) diff --git a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml index 3a6cd86b..134634b9 100644 --- a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml +++ b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml @@ -29,12 +29,30 @@ metadata: } } ] - capabilities: Basic Install - createdAt: "2025-03-26T20:10:59Z" + capabilities: Seamless Upgrades + categories: AI/Machine Learning,Monitoring + containerImage: docker.io/rocm/gpu-operator:v1.2.0 + createdAt: "2025-04-02T23:22:18Z" + description: |- + Operator responsible for deploying AMD GPU kernel drivers, device plugin, device 
test runner and device metrics exporter + For more information, visit [documentation](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/) + devicePluginImage: docker.io/rocm/k8s-device-plugin:rhubi-latest + features.operators.openshift.io/disconnected: "true" + features.operators.openshift.io/fips-compliant: "false" + features.operators.openshift.io/proxy-aware: "true" + features.operators.openshift.io/tls-profiles: "false" + features.operators.openshift.io/token-auth-aws: "false" + features.operators.openshift.io/token-auth-azure: "false" + features.operators.openshift.io/token-auth-gcp: "false" + metricsExporterImage: docker.io/rocm/device-metrics-exporter:v1.2.0 + nodelabellerImage: docker.io/rocm/k8s-device-plugin:labeller-rhubi-latest + operatorframework.io/cluster-monitoring: "true" operatorframework.io/suggested-namespace: openshift-amd-gpu + operators.openshift.io/valid-subscription: '[]' operators.operatorframework.io/builder: operator-sdk-v1.32.0 operators.operatorframework.io/project_layout: go.kubebuilder.io/v3 repository: https://github.com/ROCm/gpu-operator + support: Advanced Micro Devices, Inc. name: amd-gpu-operator.v1.2.0 namespace: placeholder spec: @@ -611,7 +629,7 @@ spec: - urn:alm:descriptor:com.amd.deviceconfigs:nodeModuleStatus version: v1alpha1 description: |- - Operator responsible for deploying AMD GPU kernel drivers and device plugin + Operator responsible for deploying AMD GPU kernel drivers, device plugin, device test runner and device metrics exporter For more information, visit [documentation](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/) displayName: amd-gpu-operator icon: @@ -1115,11 +1133,24 @@ spec: - supported: true type: AllNamespaces keywords: - - amd-gpu-operator + - AMD + - GPU + - AI + - Deep Learning + - Hardware + - Driver + - Monitoring links: - - name: Amd Gpu Operator - url: https://amd-gpu-operator.domain - maturity: alpha + - name: AMD GPU Operator + url: https://github.com/ROCm/gpu-operator + maintainers: + - email: Yan.Sun3@amd.com + name: Yan Sun + - email: farshad.ghodsian@amd.com + name: Farshad Ghodsian + - email: shrey.ajmera@amd.com + name: Shrey Ajmera + maturity: stable provider: - name: amd-gpu-operator + name: Advanced Micro Devices, Inc. 
version: 1.2.0 diff --git a/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml b/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml index f91b8a24..878483bd 100644 --- a/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml +++ b/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml @@ -3,9 +3,27 @@ kind: ClusterServiceVersion metadata: annotations: alm-examples: '[]' - capabilities: Basic Install + capabilities: Seamless Upgrades + categories: AI/Machine Learning,Monitoring + containerImage: docker.io/rocm/gpu-operator:v1.2.0 + description: |- + Operator responsible for deploying AMD GPU kernel drivers, device plugin, device test runner and device metrics exporter + For more information, visit [documentation](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/) + devicePluginImage: docker.io/rocm/k8s-device-plugin:rhubi-latest + features.operators.openshift.io/disconnected: "true" + features.operators.openshift.io/fips-compliant: "false" + features.operators.openshift.io/proxy-aware: "true" + features.operators.openshift.io/tls-profiles: "false" + features.operators.openshift.io/token-auth-aws: "false" + features.operators.openshift.io/token-auth-azure: "false" + features.operators.openshift.io/token-auth-gcp: "false" + metricsExporterImage: docker.io/rocm/device-metrics-exporter:v1.2.0 + nodelabellerImage: docker.io/rocm/k8s-device-plugin:labeller-rhubi-latest + operatorframework.io/cluster-monitoring: "true" operatorframework.io/suggested-namespace: openshift-amd-gpu + operators.openshift.io/valid-subscription: '[]' repository: https://github.com/ROCm/gpu-operator + support: Advanced Micro Devices, Inc. name: amd-gpu-operator.v0.0.0 namespace: placeholder spec: @@ -582,7 +600,7 @@ spec: - urn:alm:descriptor:com.amd.deviceconfigs:nodeModuleStatus version: v1alpha1 description: |- - Operator responsible for deploying AMD GPU kernel drivers and device plugin + Operator responsible for deploying AMD GPU kernel drivers, device plugin, device test runner and device metrics exporter For more information, visit [documentation](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/) displayName: amd-gpu-operator icon: @@ -602,11 +620,24 @@ spec: - supported: true type: AllNamespaces keywords: - - amd-gpu-operator + - AMD + - GPU + - AI + - Deep Learning + - Hardware + - Driver + - Monitoring links: - - name: Amd Gpu Operator - url: https://amd-gpu-operator.domain - maturity: alpha + - name: AMD GPU Operator + url: https://github.com/ROCm/gpu-operator + maintainers: + - email: Yan.Sun3@amd.com + name: Yan Sun + - email: farshad.ghodsian@amd.com + name: Farshad Ghodsian + - email: shrey.ajmera@amd.com + name: Shrey Ajmera + maturity: stable provider: - name: amd-gpu-operator + name: Advanced Micro Devices, Inc. 
version: 0.0.0 From 6ac7da46124c5249b00ea1a616d9754b72a13144 Mon Sep 17 00:00:00 2001 From: Abhishek Patil Date: Thu, 3 Apr 2025 18:35:05 -0700 Subject: [PATCH 14/16] Added Test Runner overview page, ECC error injection test page, compatibility matrix on index page, added missing intramfs rebuild step on Driver Installation page, updated the TOC to reflect new additions --- docs/drivers/installation.md | 5 + docs/index.md | 42 +++++- docs/metrics/ecc-error-injection.md | 199 ++++++++++++++++++++++++++++ docs/sphinx/_toc.yml.in | 3 + docs/test/test-runner-overview.md | 34 +++++ 5 files changed, 281 insertions(+), 2 deletions(-) create mode 100644 docs/metrics/ecc-error-injection.md create mode 100644 docs/test/test-runner-overview.md diff --git a/docs/drivers/installation.md b/docs/drivers/installation.md index 890da553..11cb2a73 100644 --- a/docs/drivers/installation.md +++ b/docs/drivers/installation.md @@ -18,12 +18,17 @@ Before installing the AMD GPU driver: Before installing the out-of-tree AMD GPU driver, you must blacklist the inbox AMD GPU driver: +- These commands need to either be run as `root` or by using `sudo` - Create blacklist configuration file on worker nodes: ```bash echo "blacklist amdgpu" > /etc/modprobe.d/blacklist-amdgpu.conf ``` +- After blacklist configuration file, you need to rebuild the initramfs for the change to take effect: +```bash +echo update-initramfs -u -k all +``` - Reboot the worker node to apply the blacklist - Verify the blacklisting: diff --git a/docs/index.md b/docs/index.md index 3a8340ea..ecfc95e7 100644 --- a/docs/index.md +++ b/docs/index.md @@ -12,9 +12,47 @@ The AMD GPU Operator simplifies the deployment and management of AMD Instinct GP - Automatic worker node labeling for GPU-enabled nodes ## Compatibility +### Supported Hardware -- **Kubernetes**: 1.29.0 -- Please refer to the [ROCm documentation](https://rocm.docs.amd.com/en/latest/compatibility/compatibility-matrix.html) for the compatibility matrix for the AMD GPU DKMS driver. +| **GPUs** | | +| --- | --- | +| AMD Instinct™ MI300X | ✅ Supported | +| AMD Instinct™ MI250 | ✅ Supported | +| AMD Instinct™ MI210 | ✅ Supported | + +### OS & Platform Support Matrix + +Below is a matrix of supported Operating systems and the corresponding Kubernetes version that have been validated to work. We will continue to add more Operating Systems and future versions of Kubernetes with each release of the AMD GPU Operator and Metrics Exporter. + + + + + + + + + + + + + + + + + + + + + + + + + + +
+  <tr>
+    <th>Operating System</th>
+    <th>Kubernetes</th>
+    <th>Red Hat OpenShift</th>
+  </tr>
+  <tr>
+    <td>Ubuntu 22.04 LTS</td>
+    <td>1.29—1.31</td>
+    <td></td>
+  </tr>
+  <tr>
+    <td>Ubuntu 24.04 LTS</td>
+    <td>1.29—1.31</td>
+    <td></td>
+  </tr>
+  <tr>
+    <td>Red Hat Core OS (RHCOS)</td>
+    <td></td>
+    <td>4.16—4.17</td>
+  </tr>
+ + +Please refer to the [ROCM documentaiton](https://rocm.docs.amd.com/en/latest/compatibility/compatibility-matrix.html) for the compatability matrix for the AMD GPU DKMS driver. ## Prerequisites diff --git a/docs/metrics/ecc-error-injection.md b/docs/metrics/ecc-error-injection.md new file mode 100644 index 00000000..f3f17926 --- /dev/null +++ b/docs/metrics/ecc-error-injection.md @@ -0,0 +1,199 @@ +## ECC Error Injection Testing + +The Metric Exporter has the capability to check for unhealthy GPUs via the monitoring of ECC Errors that can occur when a GPU is not functioning as expected. When an ECC error is detected the Metrics Exporter will now mark the offending GPU as unhealthy and add a node label to indicate which GPU on the node is unhealthy. The Kubernetes Device Plugin also listens to the health metrics coming from the Metrics Exporter to determine GPU status, marking GPUs as schedulable if healthy and unschedulable if unhealthy. + +This health check workflow runs automatically on every node the Device Metrics Exporter is running on, with the Metrics Exporter polling GPUs every 30 seconds and the device plugin checking health status at the same interval, ensuring updates within one minute. Users can customize the default ECC error threshold (set to 0) via the `HealthThresholds` field in the metrics exporter ConfigMap. As part of this workflow healthy GPUs are made available for Kubernetes job scheduling, while ensuring no new jobs are scheduled on an unhealthy GPUs. + +## To do error injection follow these steps + +We have added a new `metricsclient` to the Device Metrics Exporter pod that can be used to inject ECC errors into an otherwise healthy GPU for testing the above health check workflow. This is fairly simple and don't worry this does not harm your GPU as any errors that are being injected are debugging in nature and not real errors. The steps to do this have been outlined below: + +### 1. Set Node Name + +Use an environment variable to set the Kubernetes node name to indicate which node you want to test error injection on: + +```bash +NODE_NAME= +``` + +Replace with the name of the node you want to test. If you are running this from the same node you want to test you can grab the hostname using: + +```bash +NODE_NAME=$(hostname) +``` + +### 2. Set Metrics Exporter Pod Name + +Since you have to execute the `metricsclient` from directly within the Device Metrics Exporter pod we need to get the Metrics Exporter pod name running on the node: + +```bash +METRICS_POD=$(kubectl get pods -n kube-amd-gpu --field-selector spec.nodeName=$NODE_NAME --no-headers -o custom-columns=":metadata.name" | grep '^gpu-operator-metrics-exporter-' | head -n 1) +``` + +### 3. Check Metrics Client to see GPU Health + +Now that you have the name of the metrics exporter pod you can use the metricsclient to check the current health of all GPUs on the node: + +```bash +kubectl exec -n kube-amd-gpu $METRICS_POD -c metrics-exporter-container -- metricsclient +``` + +You should see a list of all the GPUs on that node along with their corresponding status. In most cases all GPUs should report as being `healthy`. + +```bash +ID Health Associated Workload +------------------------------------------------ +1 healthy [] +0 healthy [] +7 healthy [] +6 healthy [] +5 healthy [] +4 healthy [] +3 healthy [] +2 healthy [] +------------------------------------------------ +``` + +### 4. 
Inject ECC Errors on GPU 0 + +In order to simulate errors on a GPU we will be using a json file that specifies a GPU ID along with counters for several ECC Uncorrectable error fields that are being monitored by the Device Metrics Exporter. In the below example you can see that we are specifying `GPU 0` and injecting 1 `GPU_ECC_UNCORRECT_SEM` error and 2 `GPU_ECC_UNCORRECT_FUSE` errors. We use the `metricslient -ecc-file-path ` command to specify the json file we want to inject into the metrics table. To create the json file and execute the metricsclient command all in in one go run the following: + +```bash +kubectl exec -n kube-amd-gpu $METRICS_POD -c metrics-exporter-container -- sh -c 'cat > /tmp/ecc.json < /tmp/delete_ecc.json < Date: Thu, 3 Apr 2025 18:55:30 -0700 Subject: [PATCH 15/16] Fixed linting/markdown errors --- docs/drivers/installation.md | 4 +++- docs/index.md | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/drivers/installation.md b/docs/drivers/installation.md index 11cb2a73..ead38e4d 100644 --- a/docs/drivers/installation.md +++ b/docs/drivers/installation.md @@ -24,11 +24,13 @@ Before installing the out-of-tree AMD GPU driver, you must blacklist the inbox A ```bash echo "blacklist amdgpu" > /etc/modprobe.d/blacklist-amdgpu.conf ``` + - After blacklist configuration file, you need to rebuild the initramfs for the change to take effect: ```bash echo update-initramfs -u -k all -``` +``` + - Reboot the worker node to apply the blacklist - Verify the blacklisting: diff --git a/docs/index.md b/docs/index.md index ecfc95e7..9348b933 100644 --- a/docs/index.md +++ b/docs/index.md @@ -12,6 +12,7 @@ The AMD GPU Operator simplifies the deployment and management of AMD Instinct GP - Automatic worker node labeling for GPU-enabled nodes ## Compatibility + ### Supported Hardware | **GPUs** | | @@ -51,7 +52,6 @@ Below is a matrix of supported Operating systems and the corresponding Kubernete - Please refer to the [ROCM documentaiton](https://rocm.docs.amd.com/en/latest/compatibility/compatibility-matrix.html) for the compatability matrix for the AMD GPU DKMS driver. ## Prerequisites From 7f3f7e4d7d9f60080e2b0f7aa69a6517562ec911 Mon Sep 17 00:00:00 2001 From: im-AbhiP <8828883+im-AbhiP@users.noreply.github.com> Date: Thu, 24 Apr 2025 11:40:39 -0700 Subject: [PATCH 16/16] Update index.md to reflect 4.18 support of OpenShift --- docs/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/index.md b/docs/index.md index 9348b933..130064ba 100644 --- a/docs/index.md +++ b/docs/index.md @@ -47,7 +47,7 @@ Below is a matrix of supported Operating systems and the corresponding Kubernete Red Hat Core OS (RHCOS) - 4.16—4.17 + 4.16—4.18