diff --git a/docs/api-reference/operator-api.md b/docs/api-reference/operator-api.md
index 4e93f333b..cec98f706 100644
--- a/docs/api-reference/operator-api.md
+++ b/docs/api-reference/operator-api.md
@@ -608,9 +608,7 @@ _Appears in:_
_Underlying type:_ _string_
-TopologyDomain represents a predefined topology level in the hierarchy.
-Topology ordering (broadest to narrowest):
-Region > Zone > DataCenter > Block > Rack > Host > Numa
+TopologyDomain represents a level in the cluster topology hierarchy.
@@ -634,16 +632,19 @@ _Appears in:_
TopologyLevel defines a single level in the topology hierarchy.
+Maps a platform-agnostic domain to a platform-specific node label key,
+giving workload operators a consistent way to reference topology levels when defining TopologyConstraints.
_Appears in:_
+- [ClusterTopologyConfiguration](#clustertopologyconfiguration)
- [ClusterTopologySpec](#clustertopologyspec)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
-| `domain` _[TopologyDomain](#topologydomain)_ | Domain is the predefined level identifier used in TopologyConstraint references.
Must be one of: region, zone, datacenter, block, rack, host, numa | | Enum: [region zone datacenter block rack host numa]
Required: \{\}
|
-| `key` _string_ | Key is the node label key that identifies this topology domain.
Must be a valid Kubernetes label key (qualified name).
Examples: "topology.kubernetes.io/zone", "kubernetes.io/hostname" | | MaxLength: 63
MinLength: 1
Required: \{\}
|
+| `domain` _[TopologyDomain](#topologydomain)_ | Domain is a platform provider-agnostic level identifier.
Must be one of: region, zone, datacenter, block, rack, host, numa | | Enum: [region zone datacenter block rack host numa]
Required: \{\}
|
+| `key` _string_ | Key is the node label key that identifies this topology domain.
Must be a valid Kubernetes label key (qualified name).
Examples: "topology.kubernetes.io/zone", "kubernetes.io/hostname" | | MaxLength: 63
MinLength: 1
Pattern: `^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]/)?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$`
Required: \{\}
|
@@ -702,7 +703,7 @@ _Appears in:_
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `enabled` _boolean_ | Enabled indicates whether topology-aware scheduling is enabled. | | |
-| `name` _string_ | Name is the ClusterTopology resource name to use.
Defaults to "grove-topology" if not specified when topology is enabled. | | |
+| `levels` _[TopologyLevel](#topologylevel) array_ | Levels is an ordered list of topology levels from broadest to narrowest scope.
Used to create/update the ClusterTopology CR at operator startup. | | |
#### ControllerConfiguration
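
For illustration, a minimal sketch of the new configuration shape, assuming the `ClusterTopologyConfiguration` and `TopologyLevel` types as they appear later in this diff (`operator/api/config/v1alpha1` and `operator/api/core/v1alpha1`); the domain constants are from the diff, the label keys are examples, not defaults:

```go
package main

import (
	"fmt"

	configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1"
	corev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1"
)

func main() {
	// Levels are ordered broadest to narrowest; the operator uses this list
	// to create/update the ClusterTopology CR at startup instead of looking
	// up a pre-existing resource by name.
	cfg := configv1alpha1.ClusterTopologyConfiguration{
		Enabled: true,
		Levels: []corev1alpha1.TopologyLevel{
			{Domain: corev1alpha1.TopologyDomainRegion, Key: "topology.kubernetes.io/region"},
			{Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"},
			{Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"},
		},
	}
	fmt.Printf("topology enabled=%t, %d levels\n", cfg.Enabled, len(cfg.Levels))
}
```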
diff --git a/docs/designs/topology.md b/docs/designs/topology.md
index 3d761add4..c83dfe76a 100644
--- a/docs/designs/topology.md
+++ b/docs/designs/topology.md
@@ -172,6 +172,7 @@ Domain TopologyDomain `json:"domain"`
// +kubebuilder:validation:Required
// +kubebuilder:validation:MinLength=1
// +kubebuilder:validation:MaxLength=63
+// +kubebuilder:validation:Pattern=`^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]/)?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$`
Key string `json:"key"`
}
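
The added `Pattern` mirrors the Kubernetes qualified-name grammar for label keys. A quick sketch of what it accepts and rejects, with the regex copied verbatim from the marker above (note that `MinLength`/`MaxLength` are enforced separately):

```go
package main

import (
	"fmt"
	"regexp"
)

// Copied from the kubebuilder Pattern marker for TopologyLevel.Key:
// an optional DNS-style prefix followed by a qualified name segment.
var labelKeyRe = regexp.MustCompile(`^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]/)?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$`)

func main() {
	for _, key := range []string{
		"topology.kubernetes.io/zone", // valid: prefixed label key
		"kubernetes.io/hostname",      // valid
		"hostname",                    // valid: bare name segment
		"-rack",                       // invalid: must start alphanumeric
		"rack-",                       // invalid: must end alphanumeric
	} {
		fmt.Printf("%-30s matches=%t\n", key, labelKeyRe.MatchString(key))
	}
}
```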
diff --git a/hack/ld-flags.sh b/hack/ld-flags.sh
index 6eed0ab8f..387eb2794 100644
--- a/hack/ld-flags.sh
+++ b/hack/ld-flags.sh
@@ -39,8 +39,7 @@ function build_ld_flags() {
FLAGS="-X $PACKAGE_PATH/version.gitCommit=$(git rev-parse --verify HEAD)
-X $PACKAGE_PATH/version.gitTreeState=$tree_state
- -X $PACKAGE_PATH/version.buildDate=$build_date
- -X $PACKAGE_PATH/version.programName=$PROGRAM_NAME"
+ -X $PACKAGE_PATH/version.buildDate=$build_date"
# The k8s.component-base/version.gitVersion can not be set to the version of grove
# due to the error: "emulation version 1.33 is not between [1.31, 0.1.0-dev]".
diff --git a/operator/Makefile b/operator/Makefile
index 9c89712af..0645cbbaa 100644
--- a/operator/Makefile
+++ b/operator/Makefile
@@ -86,12 +86,6 @@ cover-html: test-cover
@go tool cover -html=coverage.out -o coverage.html
@echo "Coverage report generated at coverage.html"
-# Run envtest tests (requires envtest binaries)
-.PHONY: test-envtest
-test-envtest: $(SETUP_ENVTEST)
- @echo "Running envtest with CRD validation..."
- @KUBEBUILDER_ASSETS=$$($(SETUP_ENVTEST) use -p path) go test ./internal/webhook/admission/clustertopology/validation -v -run TestClusterTopologyCRDValidation
-
# Run e2e tests
.PHONY: test-e2e
test-e2e:
diff --git a/operator/api/common/constants/constants.go b/operator/api/common/constants/constants.go
index ae3568cf8..437feaffa 100644
--- a/operator/api/common/constants/constants.go
+++ b/operator/api/common/constants/constants.go
@@ -16,6 +16,15 @@
package constants
+const (
+ // OperatorName is the name of the Grove operator.
+ OperatorName = "grove-operator"
+ // OperatorConfigGroupName is the name of the group for Grove operator configuration.
+ OperatorConfigGroupName = "operator.config.grove.io"
+ // OperatorGroupName is the name of the group for all Grove custom resources.
+ OperatorGroupName = "grove.io"
+)
+
// Constants for finalizers.
const (
// FinalizerPodCliqueSet is the finalizer for PodCliqueSet that is added to `.metadata.finalizers[]` slice. This will be placed on all PodCliqueSet resources
@@ -107,4 +116,6 @@ const (
KindPodClique = "PodClique"
// KindPodCliqueScalingGroup is the kind for a PodCliqueScalingGroup resource.
KindPodCliqueScalingGroup = "PodCliqueScalingGroup"
+ // KindClusterTopology is the kind for a ClusterTopology resource.
+ KindClusterTopology = "ClusterTopology"
)
diff --git a/operator/api/config/v1alpha1/defaults.go b/operator/api/config/v1alpha1/defaults.go
index b9e8e7359..e2752790d 100644
--- a/operator/api/config/v1alpha1/defaults.go
+++ b/operator/api/config/v1alpha1/defaults.go
@@ -27,7 +27,6 @@ const (
defaultLeaderElectionResourceLock = "leases"
defaultLeaderElectionResourceName = "grove-operator-leader-election"
defaultWebhookServerTLSServerCertDir = "/etc/grove-operator/webhook-certs"
- defaultTopologyName = "grove-topology"
)
// SetDefaults_ClientConnectionConfiguration sets defaults for the k8s client connection.
@@ -115,10 +114,3 @@ func SetDefaults_PodCliqueScalingGroupControllerConfiguration(obj *PodCliqueScal
obj.ConcurrentSyncs = ptr.To(1)
}
}
-
-// SetDefaults_ClusterTopologyConfiguration sets defaults for the ClusterTopologyConfiguration.
-func SetDefaults_ClusterTopologyConfiguration(obj *ClusterTopologyConfiguration) {
- if obj.Enabled && obj.Name == "" {
- obj.Name = defaultTopologyName
- }
-}
diff --git a/operator/api/config/v1alpha1/defaults_test.go b/operator/api/config/v1alpha1/defaults_test.go
deleted file mode 100644
index 35209914d..000000000
--- a/operator/api/config/v1alpha1/defaults_test.go
+++ /dev/null
@@ -1,85 +0,0 @@
-// /*
-// Copyright 2025 The Grove Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// */
-
-package v1alpha1
-
-import (
- "testing"
-
- "github.com/stretchr/testify/assert"
-)
-
-func TestSetDefaults_ClusterTopologyConfiguration(t *testing.T) {
- tests := []struct {
- name string
- input ClusterTopologyConfiguration
- expected ClusterTopologyConfiguration
- }{
- {
- name: "enabled with no name defaults to grove-topology",
- input: ClusterTopologyConfiguration{
- Enabled: true,
- Name: "",
- },
- expected: ClusterTopologyConfiguration{
- Enabled: true,
- Name: "grove-topology",
- },
- },
- {
- name: "enabled with custom name preserves name",
- input: ClusterTopologyConfiguration{
- Enabled: true,
- Name: "custom-topology",
- },
- expected: ClusterTopologyConfiguration{
- Enabled: true,
- Name: "custom-topology",
- },
- },
- {
- name: "disabled with no name remains empty",
- input: ClusterTopologyConfiguration{
- Enabled: false,
- Name: "",
- },
- expected: ClusterTopologyConfiguration{
- Enabled: false,
- Name: "",
- },
- },
- {
- name: "disabled with name preserves name",
- input: ClusterTopologyConfiguration{
- Enabled: false,
- Name: "some-topology",
- },
- expected: ClusterTopologyConfiguration{
- Enabled: false,
- Name: "some-topology",
- },
- },
- }
-
- for _, tt := range tests {
- t.Run(tt.name, func(t *testing.T) {
- cfg := tt.input
- SetDefaults_ClusterTopologyConfiguration(&cfg)
- assert.Equal(t, tt.expected, cfg)
- })
- }
-}
diff --git a/operator/api/config/v1alpha1/register.go b/operator/api/config/v1alpha1/register.go
index c4f0f181a..ccd73f208 100644
--- a/operator/api/config/v1alpha1/register.go
+++ b/operator/api/config/v1alpha1/register.go
@@ -17,16 +17,15 @@
package v1alpha1
import (
+ "github.com/ai-dynamo/grove/operator/api/common/constants"
+
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
)
-// GroupName is the group name used in this package
-const GroupName = "operator.config.grove.io"
-
var (
// SchemeGroupVersion is group version used to register these objects
- SchemeGroupVersion = schema.GroupVersion{Group: GroupName, Version: "v1alpha1"}
+ SchemeGroupVersion = schema.GroupVersion{Group: constants.OperatorConfigGroupName, Version: "v1alpha1"}
// SchemeBuilder is used to add go types to the GroupVersionKind scheme
SchemeBuilder = runtime.NewSchemeBuilder(addKnownTypes, addDefaultingFuncs)
// AddToScheme is a reference to the Scheme Builder's AddToScheme function.
diff --git a/operator/api/config/v1alpha1/types.go b/operator/api/config/v1alpha1/types.go
index 130189a84..a1d2aa1f7 100644
--- a/operator/api/config/v1alpha1/types.go
+++ b/operator/api/config/v1alpha1/types.go
@@ -17,6 +17,8 @@
package v1alpha1
import (
+ corev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1"
+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
@@ -55,14 +57,14 @@ var (
type OperatorConfiguration struct {
metav1.TypeMeta `json:",inline"`
ClientConnection ClientConnectionConfiguration `json:"runtimeClientConnection"`
- LeaderElection LeaderElectionConfiguration
- Server ServerConfiguration `json:"server"`
- Debugging *DebuggingConfiguration `json:"debugging,omitempty"`
- Controllers ControllerConfiguration `json:"controllers"`
- LogLevel LogLevel `json:"logLevel"`
- LogFormat LogFormat `json:"logFormat"`
- Authorizer AuthorizerConfig `json:"authorizer"`
- ClusterTopology ClusterTopologyConfiguration `json:"clusterTopology"`
+ LeaderElection LeaderElectionConfiguration `json:"leaderElection"`
+ Server ServerConfiguration `json:"server"`
+ Debugging *DebuggingConfiguration `json:"debugging,omitempty"`
+ Controllers ControllerConfiguration `json:"controllers"`
+ LogLevel LogLevel `json:"logLevel"`
+ LogFormat LogFormat `json:"logFormat"`
+ Authorizer AuthorizerConfig `json:"authorizer"`
+ ClusterTopology ClusterTopologyConfiguration `json:"clusterTopology"`
}
// LeaderElectionConfiguration defines the configuration for the leader election.
@@ -193,8 +195,8 @@ type AuthorizerConfig struct {
type ClusterTopologyConfiguration struct {
// Enabled indicates whether topology-aware scheduling is enabled.
Enabled bool `json:"enabled"`
- // Name is the ClusterTopology resource name to use.
- // Defaults to "grove-topology" if not specified when topology is enabled.
+ // Levels is an ordered list of topology levels from broadest to narrowest scope.
+ // Used to create/update the ClusterTopology CR at operator startup.
// +optional
- Name string `json:"name,omitempty"`
+ Levels []corev1alpha1.TopologyLevel `json:"levels,omitempty"`
}
diff --git a/operator/api/config/v1alpha1/zz_generated.deepcopy.go b/operator/api/config/v1alpha1/zz_generated.deepcopy.go
index 2cab37b4c..70bc1f6ae 100644
--- a/operator/api/config/v1alpha1/zz_generated.deepcopy.go
+++ b/operator/api/config/v1alpha1/zz_generated.deepcopy.go
@@ -22,6 +22,7 @@ limitations under the License.
package v1alpha1
import (
+ corev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1"
runtime "k8s.io/apimachinery/pkg/runtime"
)
@@ -65,6 +66,11 @@ func (in *ClientConnectionConfiguration) DeepCopy() *ClientConnectionConfigurati
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *ClusterTopologyConfiguration) DeepCopyInto(out *ClusterTopologyConfiguration) {
*out = *in
+ if in.Levels != nil {
+ in, out := &in.Levels, &out.Levels
+ *out = make([]corev1alpha1.TopologyLevel, len(*in))
+ copy(*out, *in)
+ }
return
}
@@ -151,7 +157,7 @@ func (in *OperatorConfiguration) DeepCopyInto(out *OperatorConfiguration) {
}
in.Controllers.DeepCopyInto(&out.Controllers)
in.Authorizer.DeepCopyInto(&out.Authorizer)
- out.ClusterTopology = in.ClusterTopology
+ in.ClusterTopology.DeepCopyInto(&out.ClusterTopology)
return
}
diff --git a/operator/api/config/v1alpha1/zz_generated.defaults.go b/operator/api/config/v1alpha1/zz_generated.defaults.go
index 3583b3ef3..27cb68efa 100644
--- a/operator/api/config/v1alpha1/zz_generated.defaults.go
+++ b/operator/api/config/v1alpha1/zz_generated.defaults.go
@@ -41,5 +41,4 @@ func SetObjectDefaults_OperatorConfiguration(in *OperatorConfiguration) {
SetDefaults_PodCliqueSetControllerConfiguration(&in.Controllers.PodCliqueSet)
SetDefaults_PodCliqueControllerConfiguration(&in.Controllers.PodClique)
SetDefaults_PodCliqueScalingGroupControllerConfiguration(&in.Controllers.PodCliqueScalingGroup)
- SetDefaults_ClusterTopologyConfiguration(&in.ClusterTopology)
}
diff --git a/operator/api/config/validation/validation.go b/operator/api/config/validation/validation.go
index 05b2a601d..b3e3bca65 100644
--- a/operator/api/config/validation/validation.go
+++ b/operator/api/config/validation/validation.go
@@ -17,9 +17,12 @@
package validation
import (
+ "fmt"
+ "slices"
"strings"
configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1"
+ corev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/sets"
@@ -114,11 +117,51 @@ func mustBeGreaterThanZeroDuration(duration metav1.Duration, fldPath *field.Path
}
// validateClusterTopologyConfiguration validates the cluster topology configuration.
-// When cluster topology is enabled, it ensures the topology name is provided.
+// When cluster topology is enabled, it ensures the topology levels are provided,
+// and validates domain and key uniqueness.
func validateClusterTopologyConfiguration(clusterTopologyCfg configv1alpha1.ClusterTopologyConfiguration, fldPath *field.Path) field.ErrorList {
allErrs := field.ErrorList{}
- if clusterTopologyCfg.Enabled && len(strings.TrimSpace(clusterTopologyCfg.Name)) == 0 {
- allErrs = append(allErrs, field.Required(fldPath.Child("name"), "clusterTopology name is required"))
+ if !clusterTopologyCfg.Enabled {
+ return allErrs
+ }
+ allErrs = validateClusterTopologyLevels(clusterTopologyCfg.Levels, fldPath.Child("levels"))
+ return allErrs
+}
+
+func validateClusterTopologyLevels(levels []corev1alpha1.TopologyLevel, fldPath *field.Path) field.ErrorList {
+ allErrs := field.ErrorList{}
+ if len(levels) == 0 {
+ allErrs = append(allErrs, field.Required(fldPath, "levels are required when topology is enabled"))
+ }
+ allErrs = append(allErrs, mustHaveSupportedTopologyDomains(levels, fldPath)...)
+ allErrs = append(allErrs, mustHaveUniqueTopologyLevels(levels, fldPath)...)
+ return allErrs
+}
+
+func mustHaveSupportedTopologyDomains(levels []corev1alpha1.TopologyLevel, fldPath *field.Path) field.ErrorList {
+ allErrs := field.ErrorList{}
+ supportedDomains := corev1alpha1.SupportedTopologyDomains()
+ for i, level := range levels {
+ if !slices.Contains(supportedDomains, level.Domain) {
+ allErrs = append(allErrs, field.Invalid(fldPath.Index(i).Child("domain"), level.Domain, fmt.Sprintf("must be one of %v", supportedDomains)))
+ }
+ }
+ return allErrs
+}
+
+func mustHaveUniqueTopologyLevels(levels []corev1alpha1.TopologyLevel, fldPath *field.Path) field.ErrorList {
+ allErrs := field.ErrorList{}
+ seenDomains := make(map[corev1alpha1.TopologyDomain]struct{})
+ seenKeys := make(map[string]struct{})
+ for i, level := range levels {
+ if _, exists := seenDomains[level.Domain]; exists {
+ allErrs = append(allErrs, field.Duplicate(fldPath.Index(i).Child("domain"), level.Domain))
+ }
+ if _, exists := seenKeys[level.Key]; exists {
+ allErrs = append(allErrs, field.Duplicate(fldPath.Index(i).Child("key"), level.Key))
+ }
+ seenDomains[level.Domain] = struct{}{}
+ seenKeys[level.Key] = struct{}{}
}
return allErrs
}
diff --git a/operator/api/config/validation/validation_test.go b/operator/api/config/validation/validation_test.go
index 3e6099739..3f3f1906a 100644
--- a/operator/api/config/validation/validation_test.go
+++ b/operator/api/config/validation/validation_test.go
@@ -20,6 +20,7 @@ import (
"testing"
configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1"
+ corev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1"
"github.com/stretchr/testify/assert"
"k8s.io/apimachinery/pkg/util/validation/field"
@@ -27,66 +28,182 @@ import (
func TestValidateClusterTopologyConfiguration(t *testing.T) {
tests := []struct {
- name string
- config configv1alpha1.ClusterTopologyConfiguration
- expectError bool
- errorField string
+ name string
+ config configv1alpha1.ClusterTopologyConfiguration
+ expectErrors int
+ expectedFields []string
+ expectedTypes []field.ErrorType
}{
{
- name: "valid: enabled with name",
+ name: "valid: disabled with no levels",
config: configv1alpha1.ClusterTopologyConfiguration{
- Enabled: true,
- Name: "my-topology",
+ Enabled: false,
},
- expectError: false,
+ expectErrors: 0,
},
{
- name: "valid: disabled with no name",
+ name: "valid: disabled with levels (levels are ignored when disabled)",
config: configv1alpha1.ClusterTopologyConfiguration{
Enabled: false,
- Name: "",
+ Levels: []corev1alpha1.TopologyLevel{
+ {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"},
+ },
},
- expectError: false,
+ expectErrors: 0,
},
{
- name: "valid: disabled with name",
+ name: "valid: enabled with single level",
config: configv1alpha1.ClusterTopologyConfiguration{
- Enabled: false,
- Name: "my-topology",
+ Enabled: true,
+ Levels: []corev1alpha1.TopologyLevel{
+ {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"},
+ },
+ },
+ expectErrors: 0,
+ },
+ {
+ name: "valid: enabled with multiple levels",
+ config: configv1alpha1.ClusterTopologyConfiguration{
+ Enabled: true,
+ Levels: []corev1alpha1.TopologyLevel{
+ {Domain: corev1alpha1.TopologyDomainRegion, Key: "topology.kubernetes.io/region"},
+ {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"},
+ {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"},
+ },
},
- expectError: false,
+ expectErrors: 0,
},
{
- name: "invalid: enabled with empty name",
+ name: "valid: enabled with all supported domains",
config: configv1alpha1.ClusterTopologyConfiguration{
Enabled: true,
- Name: "",
+ Levels: []corev1alpha1.TopologyLevel{
+ {Domain: corev1alpha1.TopologyDomainRegion, Key: "topology.kubernetes.io/region"},
+ {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"},
+ {Domain: corev1alpha1.TopologyDomainDataCenter, Key: "topology.kubernetes.io/datacenter"},
+ {Domain: corev1alpha1.TopologyDomainBlock, Key: "topology.kubernetes.io/block"},
+ {Domain: corev1alpha1.TopologyDomainRack, Key: "topology.kubernetes.io/rack"},
+ {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"},
+ {Domain: corev1alpha1.TopologyDomainNuma, Key: "topology.kubernetes.io/numa"},
+ },
},
- expectError: true,
- errorField: "clusterTopology.name",
+ expectErrors: 0,
},
{
- name: "invalid: enabled with whitespace-only name",
+ name: "invalid: enabled with empty levels",
config: configv1alpha1.ClusterTopologyConfiguration{
Enabled: true,
- Name: " ",
+ Levels: []corev1alpha1.TopologyLevel{},
},
- expectError: true,
- errorField: "clusterTopology.name",
+ expectErrors: 1,
+ expectedFields: []string{"clusterTopology.levels"},
+ expectedTypes: []field.ErrorType{field.ErrorTypeRequired},
+ },
+ {
+ name: "invalid: enabled with nil levels",
+ config: configv1alpha1.ClusterTopologyConfiguration{
+ Enabled: true,
+ Levels: nil,
+ },
+ expectErrors: 1,
+ expectedFields: []string{"clusterTopology.levels"},
+ expectedTypes: []field.ErrorType{field.ErrorTypeRequired},
+ },
+ {
+ name: "invalid: unsupported domain",
+ config: configv1alpha1.ClusterTopologyConfiguration{
+ Enabled: true,
+ Levels: []corev1alpha1.TopologyLevel{
+ {Domain: "invalid-domain", Key: "some.key"},
+ },
+ },
+ expectErrors: 1,
+ expectedFields: []string{"clusterTopology.levels[0].domain"},
+ expectedTypes: []field.ErrorType{field.ErrorTypeInvalid},
+ },
+ {
+ name: "invalid: duplicate domains",
+ config: configv1alpha1.ClusterTopologyConfiguration{
+ Enabled: true,
+ Levels: []corev1alpha1.TopologyLevel{
+ {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"},
+ {Domain: corev1alpha1.TopologyDomainZone, Key: "another.zone.key"},
+ },
+ },
+ expectErrors: 1,
+ expectedFields: []string{"clusterTopology.levels[1].domain"},
+ expectedTypes: []field.ErrorType{field.ErrorTypeDuplicate},
+ },
+ {
+ name: "invalid: duplicate keys",
+ config: configv1alpha1.ClusterTopologyConfiguration{
+ Enabled: true,
+ Levels: []corev1alpha1.TopologyLevel{
+ {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"},
+ {Domain: corev1alpha1.TopologyDomainHost, Key: "topology.kubernetes.io/zone"},
+ },
+ },
+ expectErrors: 1,
+ expectedFields: []string{"clusterTopology.levels[1].key"},
+ expectedTypes: []field.ErrorType{field.ErrorTypeDuplicate},
+ },
+ {
+ name: "invalid: duplicate domains and keys",
+ config: configv1alpha1.ClusterTopologyConfiguration{
+ Enabled: true,
+ Levels: []corev1alpha1.TopologyLevel{
+ {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"},
+ {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"},
+ },
+ },
+ expectErrors: 2,
+ expectedFields: []string{"clusterTopology.levels[1].domain", "clusterTopology.levels[1].key"},
+ expectedTypes: []field.ErrorType{field.ErrorTypeDuplicate, field.ErrorTypeDuplicate},
+ },
+ {
+ name: "invalid: multiple unsupported domains",
+ config: configv1alpha1.ClusterTopologyConfiguration{
+ Enabled: true,
+ Levels: []corev1alpha1.TopologyLevel{
+ {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"},
+ {Domain: "invalid1", Key: "key1"},
+ {Domain: "invalid2", Key: "key2"},
+ },
+ },
+ expectErrors: 2,
+ expectedFields: []string{"clusterTopology.levels[1].domain", "clusterTopology.levels[2].domain"},
+ expectedTypes: []field.ErrorType{field.ErrorTypeInvalid, field.ErrorTypeInvalid},
+ },
+ {
+ name: "invalid: multiple validation errors - unsupported domain and duplicates",
+ config: configv1alpha1.ClusterTopologyConfiguration{
+ Enabled: true,
+ Levels: []corev1alpha1.TopologyLevel{
+ {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"},
+ {Domain: "invalid", Key: "invalid.key"},
+ {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"},
+ },
+ },
+ expectErrors: 3,
+ expectedFields: []string{"clusterTopology.levels[1].domain", "clusterTopology.levels[2].domain", "clusterTopology.levels[2].key"},
+ expectedTypes: []field.ErrorType{field.ErrorTypeInvalid, field.ErrorTypeDuplicate, field.ErrorTypeDuplicate},
},
}
- for _, tt := range tests {
- t.Run(tt.name, func(t *testing.T) {
- errs := validateClusterTopologyConfiguration(tt.config, field.NewPath("clusterTopology"))
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ errs := validateClusterTopologyConfiguration(test.config, field.NewPath("clusterTopology"))
+
+ assert.Len(t, errs, test.expectErrors, "expected %d validation errors but got %d: %v", test.expectErrors, len(errs), errs)
- if tt.expectError {
- assert.NotEmpty(t, errs, "expected validation errors but got none")
- if len(errs) > 0 {
- assert.Equal(t, tt.errorField, errs[0].Field)
+ if test.expectErrors > 0 {
+ // Verify each expected error
+ for i, expectedField := range test.expectedFields {
+ assert.Equal(t, expectedField, errs[i].Field, "error %d: expected field %s but got %s", i, expectedField, errs[i].Field)
+ if i < len(test.expectedTypes) {
+ assert.Equal(t, test.expectedTypes[i], errs[i].Type, "error %d: expected type %s but got %s", i, test.expectedTypes[i], errs[i].Type)
+ }
}
- } else {
- assert.Empty(t, errs, "expected no validation errors but got: %v", errs)
}
})
}
diff --git a/operator/api/core/v1alpha1/clustertopology.go b/operator/api/core/v1alpha1/clustertopology.go
index dc756e824..5de382283 100644
--- a/operator/api/core/v1alpha1/clustertopology.go
+++ b/operator/api/core/v1alpha1/clustertopology.go
@@ -17,9 +17,18 @@
package v1alpha1
import (
+ "slices"
+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
+const (
+ // DefaultClusterTopologyName is the name of the default ClusterTopology resource managed by the operator.
+ // Currently, Grove operator generates and maintains a single ClusterTopology resource per cluster. There is no support
+ // for externally created ClusterTopology resources. However, this may change in the future to allow more flexibility.
+ DefaultClusterTopologyName = "grove-topology"
+)
+
// +genclient
// +genclient:nonNamespaced
// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
@@ -52,12 +61,32 @@ type ClusterTopologySpec struct {
// This field is immutable after creation.
// +kubebuilder:validation:MinItems=1
// +kubebuilder:validation:MaxItems=7
+ // +kubebuilder:validation:XValidation:rule="self.all(x, self.filter(y, y.domain == x.domain).size() == 1)",message="domain must be unique across all levels"
+ // +kubebuilder:validation:XValidation:rule="self.all(x, self.filter(y, y.key == x.key).size() == 1)",message="key must be unique across all levels"
Levels []TopologyLevel `json:"levels"`
}
-// TopologyDomain represents a predefined topology level in the hierarchy.
-// Topology ordering (broadest to narrowest):
-// Region > Zone > DataCenter > Block > Rack > Host > Numa
+// TopologyLevel defines a single level in the topology hierarchy.
+// Maps a platform-agnostic domain to a platform-specific node label key,
+// giving workload operators a consistent way to reference topology levels when defining TopologyConstraints.
+type TopologyLevel struct {
+ // Domain is a platform provider-agnostic level identifier.
+ // Must be one of: region, zone, datacenter, block, rack, host, numa
+ // +kubebuilder:validation:Required
+ // +kubebuilder:validation:Enum=region;zone;datacenter;block;rack;host;numa
+ Domain TopologyDomain `json:"domain"`
+
+ // Key is the node label key that identifies this topology domain.
+ // Must be a valid Kubernetes label key (qualified name).
+ // Examples: "topology.kubernetes.io/zone", "kubernetes.io/hostname"
+ // +kubebuilder:validation:Required
+ // +kubebuilder:validation:MinLength=1
+ // +kubebuilder:validation:MaxLength=63
+ // +kubebuilder:validation:Pattern=`^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]/)?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$`
+ Key string `json:"key"`
+}
+
+// TopologyDomain represents a level in the cluster topology hierarchy.
type TopologyDomain string
const (
@@ -77,8 +106,20 @@ const (
TopologyDomainNuma TopologyDomain = "numa"
)
+// SupportedTopologyDomains returns all supported topology domain values.
+func SupportedTopologyDomains() []TopologyDomain {
+ topologyDomains := make([]TopologyDomain, 0, len(topologyDomainOrder))
+ for domain := range topologyDomainOrder {
+ topologyDomains = append(topologyDomains, domain)
+ }
+ return topologyDomains
+}
+
// topologyDomainOrder defines the hierarchical order of topology domains from broadest to narrowest.
// Lower value = broader scope (e.g., Region is broader than Zone).
+// Context: TopologyDomain is used when defining TopologyConstraint in workload specifications. TopologyConstraints at different
+// levels (PodClique, PodCliqueScalingGroup and PodCliqueSet) capture the required packing guarantees that a scheduler backend
+// must provide.
var topologyDomainOrder = map[TopologyDomain]int{
TopologyDomainRegion: 0,
TopologyDomainZone: 1,
@@ -89,63 +130,9 @@ var topologyDomainOrder = map[TopologyDomain]int{
TopologyDomainNuma: 6,
}
-// Compare compares this domain with another domain.
-// Returns:
-// - negative value if this domain is broader than other
-// - zero if domains are equal
-// - positive value if this domain is narrower than other
-//
-// This method assumes both domains are valid (enforced by kubebuilder validation).
-func (d TopologyDomain) Compare(other TopologyDomain) int {
- return topologyDomainOrder[d] - topologyDomainOrder[other]
-}
-
-// BroaderThan returns true if this domain represents a broader scope than the other domain.
-// For example, Region.BroaderThan(Zone) returns true.
-func (d TopologyDomain) BroaderThan(other TopologyDomain) bool {
- return d.Compare(other) < 0
-}
-
-// NarrowerThan returns true if this domain represents a narrower scope than the other domain.
-// For example, Zone.NarrowerThan(Region) returns true.
-func (d TopologyDomain) NarrowerThan(other TopologyDomain) bool {
- return d.Compare(other) > 0
-}
-
-// TopologyLevel defines a single level in the topology hierarchy.
-type TopologyLevel struct {
- // Domain is the predefined level identifier used in TopologyConstraint references.
- // Must be one of: region, zone, datacenter, block, rack, host, numa
- // +kubebuilder:validation:Required
- // +kubebuilder:validation:Enum=region;zone;datacenter;block;rack;host;numa
- Domain TopologyDomain `json:"domain"`
-
- // Key is the node label key that identifies this topology domain.
- // Must be a valid Kubernetes label key (qualified name).
- // Examples: "topology.kubernetes.io/zone", "kubernetes.io/hostname"
- // +kubebuilder:validation:Required
- // +kubebuilder:validation:MinLength=1
- // +kubebuilder:validation:MaxLength=63
- Key string `json:"key"`
-}
-
-// Compare compares this topology level with another level based on their domains.
-// Returns:
-// - negative value if this level is broader than other
-// - zero if levels have equal domain scope
-// - positive value if this level is narrower than other
-func (l TopologyLevel) Compare(other TopologyLevel) int {
- return l.Domain.Compare(other.Domain)
-}
-
-// BroaderThan returns true if this level represents a broader scope than the other level.
-// For example, a Region level is broader than a Zone level.
-func (l TopologyLevel) BroaderThan(other TopologyLevel) bool {
- return l.Domain.BroaderThan(other.Domain)
-}
-
-// NarrowerThan returns true if this level represents a narrower scope than the other level.
-// For example, a Zone level is narrower than a Region level.
-func (l TopologyLevel) NarrowerThan(other TopologyLevel) bool {
- return l.Domain.NarrowerThan(other.Domain)
+// SortTopologyLevels sorts a slice of TopologyLevel from broadest to narrowest scope, sorting by their Domain.
+func SortTopologyLevels(levels []TopologyLevel) {
+ slices.SortFunc(levels, func(a, b TopologyLevel) int {
+ return topologyDomainOrder[a.Domain] - topologyDomainOrder[b.Domain]
+ })
}
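
Since `topologyDomainOrder` backs both new helpers, a short usage sketch: `SortTopologyLevels` orders in place, while `SupportedTopologyDomains` ranges over a map and so returns domains in no particular order — callers needing hierarchy order should sort:

```go
package main

import (
	"fmt"

	corev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1"
)

func main() {
	levels := []corev1alpha1.TopologyLevel{
		{Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"},
		{Domain: corev1alpha1.TopologyDomainRegion, Key: "topology.kubernetes.io/region"},
		{Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"},
	}
	// Sorts in place: region, zone, host (broadest to narrowest).
	corev1alpha1.SortTopologyLevels(levels)
	for _, l := range levels {
		fmt.Printf("%s -> %s\n", l.Domain, l.Key)
	}
}
```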
diff --git a/operator/api/core/v1alpha1/clustertopology_test.go b/operator/api/core/v1alpha1/clustertopology_test.go
index cfc39333f..dce6fe9bc 100644
--- a/operator/api/core/v1alpha1/clustertopology_test.go
+++ b/operator/api/core/v1alpha1/clustertopology_test.go
@@ -18,117 +18,136 @@ package v1alpha1
import (
"testing"
+
+ "github.com/stretchr/testify/assert"
)
-func TestTopologyDomainComparison(t *testing.T) {
+func TestSupportedTopologyDomains(t *testing.T) {
+ expectedDomains := []TopologyDomain{
+ TopologyDomainRegion,
+ TopologyDomainZone,
+ TopologyDomainDataCenter,
+ TopologyDomainBlock,
+ TopologyDomainRack,
+ TopologyDomainHost,
+ TopologyDomainNuma,
+ }
+
+ domains := SupportedTopologyDomains()
+
+ // Verify there are no duplicate domains
+ seen := make(map[TopologyDomain]bool)
+ for _, domain := range domains {
+ assert.False(t, seen[domain], "domain %s should not be duplicated", domain)
+ seen[domain] = true
+ }
+ // Verify all expected domains are included
+ for _, expected := range expectedDomains {
+ assert.True(t, seen[expected], "domain %s should be in the returned list", expected)
+ }
+}
+
+func TestSortTopologyLevels(t *testing.T) {
tests := []struct {
- name string
- domain TopologyDomain
- other TopologyDomain
- expectedCompare int
- expectedBroaderThan bool
- expectedNarrowerThan bool
+ name string
+ input []TopologyLevel
+ expected []TopologyLevel
}{
{
- name: "region vs zone",
- domain: TopologyDomainRegion,
- other: TopologyDomainZone,
- expectedCompare: -1,
- expectedBroaderThan: true,
- expectedNarrowerThan: false,
- },
- {
- name: "zone vs region",
- domain: TopologyDomainZone,
- other: TopologyDomainRegion,
- expectedCompare: 1,
- expectedBroaderThan: false,
- expectedNarrowerThan: true,
- },
- {
- name: "region vs region (equal)",
- domain: TopologyDomainRegion,
- other: TopologyDomainRegion,
- expectedCompare: 0,
- expectedBroaderThan: false,
- expectedNarrowerThan: false,
- },
- {
- name: "host vs numa",
- domain: TopologyDomainHost,
- other: TopologyDomainNuma,
- expectedCompare: -1,
- expectedBroaderThan: true,
- expectedNarrowerThan: false,
+ name: "empty slice",
+ input: []TopologyLevel{},
+ expected: []TopologyLevel{},
},
{
- name: "region vs numa (multiple levels apart)",
- domain: TopologyDomainRegion,
- other: TopologyDomainNuma,
- expectedCompare: -6,
- expectedBroaderThan: true,
- expectedNarrowerThan: false,
+ name: "single level",
+ input: []TopologyLevel{
+ {Domain: TopologyDomainZone, Key: "zone-key"},
+ },
+ expected: []TopologyLevel{
+ {Domain: TopologyDomainZone, Key: "zone-key"},
+ },
},
{
- name: "numa vs region (multiple levels apart)",
- domain: TopologyDomainNuma,
- other: TopologyDomainRegion,
- expectedCompare: 6,
- expectedBroaderThan: false,
- expectedNarrowerThan: true,
+ name: "already sorted",
+ input: []TopologyLevel{
+ {Domain: TopologyDomainRegion, Key: "region-key"},
+ {Domain: TopologyDomainZone, Key: "zone-key"},
+ {Domain: TopologyDomainHost, Key: "host-key"},
+ },
+ expected: []TopologyLevel{
+ {Domain: TopologyDomainRegion, Key: "region-key"},
+ {Domain: TopologyDomainZone, Key: "zone-key"},
+ {Domain: TopologyDomainHost, Key: "host-key"},
+ },
},
{
- name: "datacenter vs rack",
- domain: TopologyDomainDataCenter,
- other: TopologyDomainRack,
- expectedCompare: -2,
- expectedBroaderThan: true,
- expectedNarrowerThan: false,
+ name: "reverse sorted",
+ input: []TopologyLevel{
+ {Domain: TopologyDomainHost, Key: "host-key"},
+ {Domain: TopologyDomainZone, Key: "zone-key"},
+ {Domain: TopologyDomainRegion, Key: "region-key"},
+ },
+ expected: []TopologyLevel{
+ {Domain: TopologyDomainRegion, Key: "region-key"},
+ {Domain: TopologyDomainZone, Key: "zone-key"},
+ {Domain: TopologyDomainHost, Key: "host-key"},
+ },
},
{
- name: "rack vs datacenter",
- domain: TopologyDomainRack,
- other: TopologyDomainDataCenter,
- expectedCompare: 2,
- expectedBroaderThan: false,
- expectedNarrowerThan: true,
+ name: "random order",
+ input: []TopologyLevel{
+ {Domain: TopologyDomainRack, Key: "rack-key"},
+ {Domain: TopologyDomainRegion, Key: "region-key"},
+ {Domain: TopologyDomainNuma, Key: "numa-key"},
+ {Domain: TopologyDomainZone, Key: "zone-key"},
+ },
+ expected: []TopologyLevel{
+ {Domain: TopologyDomainRegion, Key: "region-key"},
+ {Domain: TopologyDomainZone, Key: "zone-key"},
+ {Domain: TopologyDomainRack, Key: "rack-key"},
+ {Domain: TopologyDomainNuma, Key: "numa-key"},
+ },
},
{
- name: "rack vs zone (narrower vs broader)",
- domain: TopologyDomainRack,
- other: TopologyDomainZone,
- expectedCompare: 3,
- expectedBroaderThan: false,
- expectedNarrowerThan: true,
- },
- {
- name: "zone vs rack (broader vs narrower)",
- domain: TopologyDomainZone,
- other: TopologyDomainRack,
- expectedCompare: -3,
- expectedBroaderThan: true,
- expectedNarrowerThan: false,
+ name: "all domains",
+ input: []TopologyLevel{
+ {Domain: TopologyDomainNuma, Key: "numa-key"},
+ {Domain: TopologyDomainHost, Key: "host-key"},
+ {Domain: TopologyDomainRack, Key: "rack-key"},
+ {Domain: TopologyDomainBlock, Key: "block-key"},
+ {Domain: TopologyDomainDataCenter, Key: "dc-key"},
+ {Domain: TopologyDomainZone, Key: "zone-key"},
+ {Domain: TopologyDomainRegion, Key: "region-key"},
+ },
+ expected: []TopologyLevel{
+ {Domain: TopologyDomainRegion, Key: "region-key"},
+ {Domain: TopologyDomainZone, Key: "zone-key"},
+ {Domain: TopologyDomainDataCenter, Key: "dc-key"},
+ {Domain: TopologyDomainBlock, Key: "block-key"},
+ {Domain: TopologyDomainRack, Key: "rack-key"},
+ {Domain: TopologyDomainHost, Key: "host-key"},
+ {Domain: TopologyDomainNuma, Key: "numa-key"},
+ },
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
- // Test Compare()
- gotCompare := tt.domain.Compare(tt.other)
- if gotCompare != tt.expectedCompare {
- t.Errorf("Compare() = %v, want %v", gotCompare, tt.expectedCompare)
- }
+ // Make a copy to avoid modifying test data
+ input := make([]TopologyLevel, len(tt.input))
+ copy(input, tt.input)
+
+ // Sort the input
+ SortTopologyLevels(input)
- // Test BroaderThan()
- gotBroaderThan := tt.domain.BroaderThan(tt.other)
- if gotBroaderThan != tt.expectedBroaderThan {
- t.Errorf("BroaderThan() = %v, want %v", gotBroaderThan, tt.expectedBroaderThan)
+ // Check if sorted correctly
+ if len(input) != len(tt.expected) {
+ t.Fatalf("length mismatch: got %d, want %d", len(input), len(tt.expected))
}
- // Test NarrowerThan()
- gotNarrowerThan := tt.domain.NarrowerThan(tt.other)
- if gotNarrowerThan != tt.expectedNarrowerThan {
- t.Errorf("NarrowerThan() = %v, want %v", gotNarrowerThan, tt.expectedNarrowerThan)
+ for i := range input {
+ assert.Equal(t, tt.expected[i].Domain, input[i].Domain, "at index %d: got domain %v, want %v", i, input[i].Domain, tt.expected[i].Domain)
+ assert.Equal(t, tt.expected[i].Key, input[i].Key, "at index %d: got key %v, want %v", i, input[i].Key, tt.expected[i].Key)
}
})
}
diff --git a/operator/api/core/v1alpha1/crds/grove.io_clustertopologies.yaml b/operator/api/core/v1alpha1/crds/grove.io_clustertopologies.yaml
index f923ab48d..47630912e 100644
--- a/operator/api/core/v1alpha1/crds/grove.io_clustertopologies.yaml
+++ b/operator/api/core/v1alpha1/crds/grove.io_clustertopologies.yaml
@@ -49,12 +49,14 @@ spec:
The order in this list defines the hierarchy (index 0 = broadest level).
This field is immutable after creation.
items:
- description: TopologyLevel defines a single level in the topology
- hierarchy.
+ description: |-
+ TopologyLevel defines a single level in the topology hierarchy.
+ Maps a platform-agnostic domain to a platform-specific node label key,
+ giving workload operators a consistent way to reference topology levels when defining TopologyConstraints.
properties:
domain:
description: |-
- Domain is the predefined level identifier used in TopologyConstraint references.
+ Domain is a platform provider-agnostic level identifier.
Must be one of: region, zone, datacenter, block, rack, host, numa
enum:
- region
@@ -72,6 +74,7 @@ spec:
Examples: "topology.kubernetes.io/zone", "kubernetes.io/hostname"
maxLength: 63
minLength: 1
+ pattern: ^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]/)?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$
type: string
required:
- domain
@@ -80,6 +83,12 @@ spec:
maxItems: 7
minItems: 1
type: array
+ x-kubernetes-validations:
+ - message: domain must be unique across all levels
+ rule: self.all(x, self.filter(y, y.domain == x.domain).size() ==
+ 1)
+ - message: key must be unique across all levels
+ rule: self.all(x, self.filter(y, y.key == x.key).size() == 1)
required:
- levels
type: object
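
The two CEL rules enforce pairwise uniqueness. For readers unfamiliar with `self.all(x, self.filter(y, ...).size() == 1)`, here is an equivalent check in plain Go — illustrative only; the API server evaluates the CEL form at admission:

```go
package main

import "fmt"

type level struct{ domain, key string }

// uniqueBy mirrors the CEL rule
//   self.all(x, self.filter(y, y.f == x.f).size() == 1)
// i.e. every element's field value occurs exactly once in the list.
func uniqueBy(levels []level, field func(level) string) bool {
	counts := make(map[string]int, len(levels))
	for _, l := range levels {
		counts[field(l)]++
	}
	for _, l := range levels {
		if counts[field(l)] != 1 {
			return false
		}
	}
	return true
}

func main() {
	levels := []level{
		{"zone", "topology.kubernetes.io/zone"},
		{"zone", "another.zone.key"}, // duplicate domain: CRD admission rejects this
	}
	fmt.Println("domains unique:", uniqueBy(levels, func(l level) string { return l.domain }))
	fmt.Println("keys unique:", uniqueBy(levels, func(l level) string { return l.key }))
}
```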
diff --git a/operator/api/core/v1alpha1/register.go b/operator/api/core/v1alpha1/register.go
index b397b95bd..c400e30f2 100644
--- a/operator/api/core/v1alpha1/register.go
+++ b/operator/api/core/v1alpha1/register.go
@@ -17,19 +17,16 @@
package v1alpha1
import (
+ "github.com/ai-dynamo/grove/operator/api/common/constants"
+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
)
-const (
- // GroupName is the name of the group for all resources defined in this package.
- GroupName = "grove.io"
-)
-
var (
// SchemeGroupVersion is group version used to register these objects
- SchemeGroupVersion = schema.GroupVersion{Group: GroupName, Version: "v1alpha1"}
+ SchemeGroupVersion = schema.GroupVersion{Group: constants.OperatorGroupName, Version: "v1alpha1"}
// SchemeBuilder is used to add go types to the GroupVersionKind scheme
SchemeBuilder = runtime.NewSchemeBuilder(addKnownTypes)
// AddToScheme is a reference to the Scheme Builder's AddToScheme function.
diff --git a/operator/charts/templates/_helpers.tpl b/operator/charts/templates/_helpers.tpl
index 60cd3a1fd..3399ec233 100644
--- a/operator/charts/templates/_helpers.tpl
+++ b/operator/charts/templates/_helpers.tpl
@@ -34,8 +34,12 @@ config.yaml: |
{{- if .Values.config.clusterTopology }}
clusterTopology:
enabled: {{ .Values.config.clusterTopology.enabled }}
- {{- if .Values.config.clusterTopology.name }}
- name: {{ .Values.config.clusterTopology.name }}
+ {{- if .Values.config.clusterTopology.levels }}
+ levels:
+ {{- range .Values.config.clusterTopology.levels }}
+ - domain: {{ .domain }}
+ key: {{ .key }}
+ {{- end }}
{{- end }}
{{- end }}
{{- if .Values.config.authorizer.enabled }}
@@ -150,4 +154,18 @@ release: "{{ .Release.Name }}"
{{- range $key, $val := .Values.webhookServerSecret.labels }}
{{ $key }}: {{ $val }}
{{- end }}
-{{- end -}}
\ No newline at end of file
+{{- end -}}
+
+{{- define "operator.lease.role.labels" -}}
+{{- include "common.chart.labels" . }}
+{{- range $key, $val := .Values.leaseRole.labels }}
+{{ $key }}: {{ $val }}
+{{- end }}
+{{- end -}}
+
+{{- define "operator.lease.rolebinding.labels" -}}
+{{- include "common.chart.labels" . }}
+{{- range $key, $val := .Values.leaseRoleBinding.labels }}
+{{ $key }}: {{ $val }}
+{{- end }}
+{{- end -}}
diff --git a/operator/charts/templates/clusterrole.yaml b/operator/charts/templates/clusterrole.yaml
index 4cc2c83d7..5fcd04eaf 100644
--- a/operator/charts/templates/clusterrole.yaml
+++ b/operator/charts/templates/clusterrole.yaml
@@ -26,6 +26,7 @@ rules:
- podcliques
- podcliques/status
- podcliquescalinggroups
+ - clustertopologies
- podcliquescalinggroups/status
verbs:
- create
@@ -36,14 +37,6 @@ rules:
- deletecollection
- patch
- update
-- apiGroups:
- - grove.io
- resources:
- - clustertopologies
- verbs:
- - get
- - list
- - watch
- apiGroups:
- ""
resources:
@@ -136,4 +129,15 @@ rules:
- patch
- update
- watch
+- apiGroups:
+ - kai.scheduler
+ resources:
+ - topologies
+ verbs:
+ - create
+ - get
+ - list
+ - watch
+ - patch
+ - update
diff --git a/operator/charts/templates/clustertopology-validating-webhook-config.yaml b/operator/charts/templates/clustertopology-validating-webhook-config.yaml
deleted file mode 100644
index 0a562b209..000000000
--- a/operator/charts/templates/clustertopology-validating-webhook-config.yaml
+++ /dev/null
@@ -1,34 +0,0 @@
----
-apiVersion: admissionregistration.k8s.io/v1
-kind: ValidatingWebhookConfiguration
-metadata:
- name: clustertopology-validating-webhook
- labels:
-{{- include "operator.clustertopology.validating.webhook.labels" . | nindent 4 }}
-webhooks:
- - admissionReviewVersions:
- - v1
- clientConfig:
- service:
- name: {{ required ".Values.service.name is required" .Values.service.name }}
- namespace: {{ .Release.Namespace }}
- path: /webhooks/validate-clustertopology
- port: {{ required ".Values.config.server.webhooks.port" .Values.config.server.webhooks.port }}
- failurePolicy: Fail
- matchPolicy: Exact
- name: clustertopology.validating.webhooks.grove.io
- namespaceSelector: { }
- rules:
- - apiGroups:
- - "grove.io"
- apiVersions:
- - "v1alpha1"
- operations:
- - CREATE
- - UPDATE
- resources:
- - clustertopologies
- scope: Cluster
- sideEffects: None
- timeoutSeconds: 10
-
diff --git a/operator/charts/templates/leaderelection-role.yaml b/operator/charts/templates/leaderelection-role.yaml
new file mode 100644
index 000000000..32524c1e6
--- /dev/null
+++ b/operator/charts/templates/leaderelection-role.yaml
@@ -0,0 +1,27 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+ labels:
+{{- include "operator.lease.role.labels" . | nindent 4 }}
+ name: {{ required ".Values.leaseRole.name is required" .Values.leaseRole.name }}
+ namespace: '{{ .Release.Namespace }}'
+rules:
+ - apiGroups:
+ - coordination.k8s.io
+ resources:
+ - leases
+ verbs:
+ - get
+ - list
+ - watch
+ - create
+ - update
+ - patch
+ - delete
+ - apiGroups:
+ - ""
+ resources:
+ - events
+ verbs:
+ - create
+ - patch
\ No newline at end of file
diff --git a/operator/charts/templates/leaderelection-rolebinding.yaml b/operator/charts/templates/leaderelection-rolebinding.yaml
new file mode 100644
index 000000000..f29f8a0eb
--- /dev/null
+++ b/operator/charts/templates/leaderelection-rolebinding.yaml
@@ -0,0 +1,15 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+ name: {{ required ".Values.leaseRoleBinding.name is required" .Values.leaseRoleBinding.name }}
+ labels:
+{{- include "operator.lease.rolebinding.labels" . | nindent 4 }}
+ namespace: '{{ .Release.Namespace }}'
+roleRef:
+ apiGroup: rbac.authorization.k8s.io
+ kind: Role
+ name: {{ required ".Values.leaseRole.name is required" .Values.leaseRole.name }}
+subjects:
+ - kind: ServiceAccount
+ name: {{ required ".Values.serviceAccount.name is required" .Values.serviceAccount.name }}
+ namespace: '{{ .Release.Namespace }}'
\ No newline at end of file
diff --git a/operator/charts/values.yaml b/operator/charts/values.yaml
index cc62d8d5b..c7b03db2c 100644
--- a/operator/charts/values.yaml
+++ b/operator/charts/values.yaml
@@ -62,7 +62,21 @@ config:
logFormat: json
clusterTopology:
enabled: false
- name: "grove-topology"
+# levels:
+# - domain: zone
+# key: topology.kubernetes.io/zone
+# - domain: datacenter
+# key:
+# - domain: block
+# key:
+# - domain: rack
+# key:
+# - domain: host
+# key: kubernetes.io/hostname
+# - domain: region
+# key: topology.kubernetes.io/region
+# - domain: numa
+# key:
authorizer:
enabled: false
exemptServiceAccountUserNames:
@@ -114,6 +128,20 @@ clusterRoleBinding:
app.kubernetes.io/name: grove-operator
app.kubernetes.io/part-of: grove
+leaseRole:
+ name: grove:system:grove-operator-lease
+ labels:
+ app.kubernetes.io/component: operator-lease-role
+ app.kubernetes.io/name: grove-operator
+ app.kubernetes.io/part-of: grove
+
+leaseRoleBinding:
+ name: grove:system:grove-operator-lease
+ labels:
+ app.kubernetes.io/component: operator-lease-rolebinding
+ app.kubernetes.io/name: grove-operator
+ app.kubernetes.io/part-of: grove
+
webhooks:
podCliqueSetValidationWebhook:
labels:
@@ -125,11 +153,6 @@ webhooks:
app.kubernetes.io/component: operator-pcs-defaulting-webhook
app.kubernetes.io/name: pcs-defaulting-webhook
app.kubernetes.io/part-of: grove
- clusterTopologyValidationWebhook:
- labels:
- app.kubernetes.io/component: operator-clustertopology-validating-webhook
- app.kubernetes.io/name: clustertopology-validating-webhook
- app.kubernetes.io/part-of: grove
authorizerWebhook:
labels:
app.kubernetes.io/component: operator-authorizer-webhook
diff --git a/operator/cmd/cli/cli.go b/operator/cmd/cli/cli.go
new file mode 100644
index 000000000..2d53f52c6
--- /dev/null
+++ b/operator/cmd/cli/cli.go
@@ -0,0 +1,124 @@
+// /*
+// Copyright 2025 The Grove Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// */
+
+package cli
+
+import (
+ "fmt"
+ "os"
+
+ apicommonconstants "github.com/ai-dynamo/grove/operator/api/common/constants"
+ configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1"
+ operatorvalidation "github.com/ai-dynamo/grove/operator/api/config/validation"
+
+ "github.com/spf13/pflag"
+ "k8s.io/apimachinery/pkg/runtime"
+ "k8s.io/apimachinery/pkg/runtime/serializer"
+)
+
+const (
+ // ExitSuccess is the exit code indicating that the application exited with no error.
+ ExitSuccess = iota
+ // ExitErrParseCLIArgs is the exit code indicating that the application exited due to an error parsing CLI arguments.
+ ExitErrParseCLIArgs
+ // ExitErrLoadOperatorConfig indicates that the application exited due to an error loading the operator configuration.
+ ExitErrLoadOperatorConfig
+ // ExitErrSynchronizeTopology indicates that the application exited due to an error synchronizing the cluster topology.
+ ExitErrSynchronizeTopology
+ // ExitErrInitializeManager indicates that the application exited due to an error initializing the manager.
+ // This includes registration of controllers and webhooks and setting up probes.
+ ExitErrInitializeManager
+ // ExitErrStart indicates that the application exited due to an error when starting the application.
+ ExitErrStart
+)
+
+var (
+ errParseCLIArgs = fmt.Errorf("error parsing cli arguments")
+ errMissingLaunchOption = fmt.Errorf("missing required launch option")
+ errLoadOperatorConfig = fmt.Errorf("cannot load %q operator config", apicommonconstants.OperatorName)
+ errInvalidOperatorConfig = fmt.Errorf("invalid %q operator config", apicommonconstants.OperatorName)
+)
+
+// LaunchOptions defines options for launching the operator.
+type LaunchOptions struct {
+ // ConfigFile is the path to the operator configuration file.
+ ConfigFile string
+ // Version indicates whether to print the operator version and exit.
+ Version bool
+}
+
+// ParseLaunchOptions parses the CLI arguments for the operator.
+func ParseLaunchOptions(cliArgs []string) (*LaunchOptions, error) {
+ launchOpts := &LaunchOptions{}
+ flagSet := pflag.NewFlagSet(apicommonconstants.OperatorName, pflag.ContinueOnError)
+ launchOpts.mapFlags(flagSet)
+ if err := flagSet.Parse(cliArgs); err != nil {
+ return nil, fmt.Errorf("%w: %w", errParseCLIArgs, err)
+ }
+ if err := launchOpts.validate(); err != nil {
+ return nil, err
+ }
+ return launchOpts, nil
+}
+
+// LoadAndValidateOperatorConfig loads and validates the operator configuration from the specified path in the launch options.
+func (o *LaunchOptions) LoadAndValidateOperatorConfig() (*configv1alpha1.OperatorConfiguration, error) {
+ operatorConfig, err := o.loadOperatorConfig()
+ if err != nil {
+ return nil, err
+ }
+ if errs := operatorvalidation.ValidateOperatorConfiguration(operatorConfig); len(errs) > 0 {
+ return nil, fmt.Errorf("%w: %w", errInvalidOperatorConfig, errs.ToAggregate())
+ }
+ return operatorConfig, nil
+}
+
+func (o *LaunchOptions) loadOperatorConfig() (*configv1alpha1.OperatorConfiguration, error) {
+ // Set up scheme and decoder for operator configuration
+ configScheme := runtime.NewScheme()
+ if err := configv1alpha1.AddToScheme(configScheme); err != nil {
+ return nil, fmt.Errorf("%w: error adding to scheme: %w", errLoadOperatorConfig, err)
+ }
+ configDecoder := serializer.NewCodecFactory(configScheme).UniversalDecoder()
+
+ // Read configuration file
+ operatorConfigBytes, err := os.ReadFile(o.ConfigFile)
+ if err != nil {
+ return nil, fmt.Errorf("%w: error reading file: %w", errLoadOperatorConfig, err)
+ }
+
+ // Decode configuration
+ operatorConfig := &configv1alpha1.OperatorConfiguration{}
+ if err = runtime.DecodeInto(configDecoder, operatorConfigBytes, operatorConfig); err != nil {
+ return nil, fmt.Errorf("%w: error decoding operator config: %w", errLoadOperatorConfig, err)
+ }
+ return operatorConfig, nil
+}
+
+func (o *LaunchOptions) mapFlags(fs *pflag.FlagSet) {
+ fs.StringVar(&o.ConfigFile, "config", "", "Path to the operator configuration file.")
+ fs.BoolVarP(&o.Version, "version", "v", false, "Print the version and exit.")
+}
+
+func (o *LaunchOptions) validate() error {
+ if len(o.ConfigFile) == 0 && !o.Version {
+ return fmt.Errorf("%w: one of version or operator config file must be specified", errMissingLaunchOption)
+ }
+ if len(o.ConfigFile) > 0 && o.Version {
+ return fmt.Errorf("%w: both version and operator config file cannot be specified", errMissingLaunchOption)
+ }
+ return nil
+}
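
A sketch of how `main` might wire these helpers together, using only the exported names and exit codes defined above (the actual `operator/cmd/main.go` changes appear further down; version printing and manager startup are elided here):

```go
package main

import (
	"fmt"
	"os"

	"github.com/ai-dynamo/grove/operator/cmd/cli"
)

func main() {
	opts, err := cli.ParseLaunchOptions(os.Args[1:])
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(cli.ExitErrParseCLIArgs)
	}
	if opts.Version {
		// Print version info, then exit (elided).
		os.Exit(cli.ExitSuccess)
	}
	operatorCfg, err := opts.LoadAndValidateOperatorConfig()
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(cli.ExitErrLoadOperatorConfig)
	}
	_ = operatorCfg // hand off to manager initialization (elided)
}
```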
diff --git a/operator/cmd/cli/cli_test.go b/operator/cmd/cli/cli_test.go
new file mode 100644
index 000000000..19b8b67c8
--- /dev/null
+++ b/operator/cmd/cli/cli_test.go
@@ -0,0 +1,171 @@
+// /*
+// Copyright 2025 The Grove Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// */
+
+package cli
+
+import (
+ "errors"
+ "testing"
+
+ apicommonconstants "github.com/ai-dynamo/grove/operator/api/common/constants"
+ configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1"
+
+ "github.com/spf13/pflag"
+ "github.com/stretchr/testify/assert"
+ "github.com/stretchr/testify/require"
+)
+
+// TestParseLaunchOptions tests the parsing of CLI arguments into LaunchOptions.
+func TestParseLaunchOptions(t *testing.T) {
+ tests := []struct {
+ name string
+ args []string
+ expected *LaunchOptions
+ expectedError error
+ }{
+ {
+ name: "valid config file",
+ args: []string{"--config", "testdata/valid-config.yaml"},
+ expected: &LaunchOptions{
+ ConfigFile: "testdata/valid-config.yaml",
+ },
+ },
+ {
+ name: "no CLI args",
+ args: []string{},
+ expected: nil,
+ expectedError: errMissingLaunchOption,
+ },
+ {
+ name: "unknown CLI flag",
+ args: []string{"--unknown-flag", "value"},
+ expected: nil,
+ expectedError: errParseCLIArgs,
+ },
+ {
+ name: "invalid CLI flag format",
+ args: []string{"---config", "test.yaml"},
+ expected: nil,
+ expectedError: errParseCLIArgs,
+ },
+ {
+ name: "no value specified for config-file flag",
+ args: []string{"--config"},
+ expected: nil,
+ expectedError: errParseCLIArgs,
+ },
+ {
+ name: "multiple config_file flags", // last one should take precedence
+ args: []string{"--config", "first.yaml", "--config", "second.yaml"},
+ expected: &LaunchOptions{
+ ConfigFile: "second.yaml",
+ },
+ },
+ }
+
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ opts, err := ParseLaunchOptions(test.args)
+
+ if test.expectedError != nil {
+ require.Error(t, err)
+ assert.True(t, errors.Is(err, test.expectedError),
+ "expected error to wrap %v, got %v", test.expectedError, err)
+ } else {
+ require.NoError(t, err)
+ require.NotNil(t, opts)
+ assert.Equal(t, test.expected.ConfigFile, opts.ConfigFile)
+ }
+ })
+ }
+}
+
+// TestMapFlags tests that flags are properly mapped to the LaunchOptions struct.
+func TestMapFlags(t *testing.T) {
+ opts := &LaunchOptions{}
+ flagSet := pflag.NewFlagSet(apicommonconstants.OperatorName, pflag.ContinueOnError)
+ opts.mapFlags(flagSet)
+
+ // Verify the config flag was registered
+ flag := flagSet.Lookup("config")
+ require.NotNil(t, flag, "config flag should be registered")
+ assert.Equal(t, "", flag.DefValue, "config should have empty default value")
+ assert.Equal(t, "string", flag.Value.Type(), "config should be a string flag")
+}
+
+// TestLoadAndValidateOperatorConfig tests loading and validating operator configuration.
+func TestLoadAndValidateOperatorConfig(t *testing.T) {
+ tests := []struct {
+ name string
+ configFile string
+ expectedError error
+ validateFunc func(t *testing.T, config *configv1alpha1.OperatorConfiguration)
+ }{
+ {
+ name: "valid config",
+ configFile: "testdata/valid-config.yaml",
+ validateFunc: func(t *testing.T, config *configv1alpha1.OperatorConfiguration) {
+ require.NotNil(t, config)
+ assert.Equal(t, configv1alpha1.LogLevel("info"), config.LogLevel)
+ assert.Equal(t, configv1alpha1.LogFormat("json"), config.LogFormat)
+ assert.Equal(t, float32(100), config.ClientConnection.QPS)
+ assert.Equal(t, 150, config.ClientConnection.Burst)
+ assert.Equal(t, 9443, config.Server.Webhooks.Port)
+ assert.NotNil(t, config.Server.HealthProbes)
+ assert.Equal(t, 9444, config.Server.HealthProbes.Port)
+ assert.NotNil(t, config.Server.Metrics)
+ assert.Equal(t, 9445, config.Server.Metrics.Port)
+ assert.NotNil(t, config.Controllers.PodCliqueSet.ConcurrentSyncs)
+ assert.Equal(t, 3, *config.Controllers.PodCliqueSet.ConcurrentSyncs)
+ },
+ },
+ {
+ name: "invalid operator config",
+ configFile: "testdata/invalid-config.yaml",
+ expectedError: errInvalidOperatorConfig,
+ },
+ {
+ name: "non existent config file",
+ configFile: "testdata/nonexistent.yaml",
+ expectedError: errLoadOperatorConfig,
+ },
+ }
+
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ opts := &LaunchOptions{
+ ConfigFile: test.configFile,
+ }
+
+ config, err := opts.LoadAndValidateOperatorConfig()
+
+ if test.expectedError != nil {
+ require.Error(t, err)
+ assert.True(t, errors.Is(err, test.expectedError),
+ "expected error to wrap %v, got %v", test.expectedError, err)
+ assert.Nil(t, config)
+ } else {
+ require.NoError(t, err)
+ require.NotNil(t, config)
+ if test.validateFunc != nil {
+ test.validateFunc(t, config)
+ }
+ }
+ })
+ }
+}
diff --git a/operator/cmd/cli/testdata/invalid-config.yaml b/operator/cmd/cli/testdata/invalid-config.yaml
new file mode 100644
index 000000000..34c36b9c3
--- /dev/null
+++ b/operator/cmd/cli/testdata/invalid-config.yaml
@@ -0,0 +1,26 @@
+apiVersion: operator.config.grove.io/v1alpha1
+kind: OperatorConfiguration
+runtimeClientConnection:
+ qps: 100
+ burst: -10 # invalid value
+leaderElection:
+ enabled: false
+server:
+ webhooks:
+ bindAddress: 0.0.0.0
+ port: 9443
+ serverCertDir: /etc/grove-operator/webhook-certs
+controllers:
+ podCliqueSet:
+ concurrentSyncs: 3
+ podClique:
+ concurrentSyncs: 3
+ podCliqueScalingGroup:
+ concurrentSyncs: 2
+logLevel: info
+logFormat: json
+authorizer:
+ enabled: false
+clusterTopology:
+ enabled: false
+
diff --git a/operator/cmd/cli/testdata/valid-config.yaml b/operator/cmd/cli/testdata/valid-config.yaml
new file mode 100644
index 000000000..570c4630c
--- /dev/null
+++ b/operator/cmd/cli/testdata/valid-config.yaml
@@ -0,0 +1,38 @@
+apiVersion: operator.config.grove.io/v1alpha1
+kind: OperatorConfiguration
+runtimeClientConnection:
+ qps: 100
+ burst: 150
+leaderElection:
+ enabled: true
+ leaseDuration: 15s
+ renewDeadline: 10s
+ retryPeriod: 2s
+ resourceLock: leases
+ resourceName: grove-operator-leader-election
+ resourceNamespace: default
+server:
+ webhooks:
+ bindAddress: 0.0.0.0
+ port: 9443
+ serverCertDir: /etc/grove-operator/webhook-certs
+ healthProbes:
+ bindAddress: 0.0.0.0
+ port: 9444
+ metrics:
+ bindAddress: 0.0.0.0
+ port: 9445
+controllers:
+ podCliqueSet:
+ concurrentSyncs: 3
+ podClique:
+ concurrentSyncs: 3
+ podCliqueScalingGroup:
+ concurrentSyncs: 2
+logLevel: info
+logFormat: json
+authorizer:
+ enabled: false
+clusterTopology:
+ enabled: false
+
diff --git a/operator/cmd/main.go b/operator/cmd/main.go
index f0a9ec310..8d38eeb98 100644
--- a/operator/cmd/main.go
+++ b/operator/cmd/main.go
@@ -18,20 +18,23 @@ package main
import (
"context"
+ "errors"
"flag"
"fmt"
+ "io"
"os"
+ apicommonconstants "github.com/ai-dynamo/grove/operator/api/common/constants"
configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1"
- grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1"
- groveopts "github.com/ai-dynamo/grove/operator/cmd/opts"
+ corev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1"
+ "github.com/ai-dynamo/grove/operator/cmd/cli"
+ "github.com/ai-dynamo/grove/operator/internal/clustertopology"
grovectrl "github.com/ai-dynamo/grove/operator/internal/controller"
"github.com/ai-dynamo/grove/operator/internal/controller/cert"
grovelogger "github.com/ai-dynamo/grove/operator/internal/logger"
groveversion "github.com/ai-dynamo/grove/operator/internal/version"
"github.com/spf13/pflag"
- "k8s.io/apimachinery/pkg/types"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
)
@@ -42,73 +45,86 @@ var (
func main() {
ctrl.SetLogger(grovelogger.MustNewLogger(false, configv1alpha1.InfoLevel, configv1alpha1.LogFormatJSON))
+ groveInfo := groveversion.New()
- fs := pflag.CommandLine
- groveversion.AddFlags(fs)
- cliOpts := groveopts.NewCLIOptions(fs)
-
- // parse and print command line flags
- pflag.Parse()
- groveversion.PrintVersionAndExitIfRequested()
-
- logger.Info("Starting grove operator", "version", groveversion.Get())
- printFlags()
-
- operatorCfg, err := initializeOperatorConfig(cliOpts)
+ launchOpts, err := cli.ParseLaunchOptions(os.Args[1:])
+ if err != nil {
+ handleErrorAndExit(err, cli.ExitErrParseCLIArgs)
+ }
+ if launchOpts.Version {
+ _, _ = fmt.Fprintf(io.Writer(os.Stdout), "%s %v\n", apicommonconstants.OperatorName, groveInfo)
+ os.Exit(cli.ExitSuccess)
+ }
+ operatorConfig, err := launchOpts.LoadAndValidateOperatorConfig()
if err != nil {
- logger.Error(err, "failed to initialize operator configuration")
- os.Exit(1)
+ logger.Error(err, "failed to load operator config")
+ handleErrorAndExit(err, cli.ExitErrLoadOperatorConfig)
}
- mgr, err := grovectrl.CreateManager(operatorCfg)
+ logger.Info("Starting grove operator", "grove-info", groveInfo.Verbose())
+ printFlags()
+
+ mgr, err := grovectrl.CreateManager(operatorConfig)
if err != nil {
logger.Error(err, "failed to create grove controller manager")
- os.Exit(1)
+ handleErrorAndExit(err, cli.ExitErrInitializeManager)
}
ctx := ctrl.SetupSignalHandler()
- if err = validateClusterTopology(ctx, mgr.GetAPIReader(), operatorCfg.ClusterTopology); err != nil {
- logger.Error(err, "cannot validate cluster topology, operator cannot start")
- os.Exit(1)
+ // Initialize or clean up ClusterTopology based on operator configuration.
+ // This must be done before starting the controllers that may depend on the ClusterTopology resource.
+ // NOTE: In this version of the operator, the synchronization additionally ensures that the KAI Topology resource
+ // is created from the ClusterTopology. Once support for pluggable scheduler backends is introduced,
+ // handling of scheduler-specific resources will be delegated to the backend scheduler controller.
+ if err = synchronizeTopology(ctx, mgr, operatorConfig); err != nil {
+ logger.Error(err, "failed to synchronize cluster topology")
+ handleErrorAndExit(err, cli.ExitErrSynchronizeTopology)
}
webhookCertsReadyCh := make(chan struct{})
- if err = cert.ManageWebhookCerts(mgr, operatorCfg.Server.Webhooks.ServerCertDir, operatorCfg.Authorizer.Enabled, webhookCertsReadyCh); err != nil {
+ if err = cert.ManageWebhookCerts(mgr, operatorConfig.Server.Webhooks.ServerCertDir, operatorConfig.Authorizer.Enabled, webhookCertsReadyCh); err != nil {
logger.Error(err, "failed to setup cert rotation")
- os.Exit(1)
+ handleErrorAndExit(err, cli.ExitErrInitializeManager)
}
if err = grovectrl.SetupHealthAndReadinessEndpoints(mgr, webhookCertsReadyCh); err != nil {
logger.Error(err, "failed to set up health and readiness for grove controller manager")
- os.Exit(1)
+ handleErrorAndExit(err, cli.ExitErrInitializeManager)
}
// Certificates need to be generated before the webhooks are started, which can only happen once the manager is started.
// Block while generating the certificates, and then start the webhooks.
go func() {
- if err = grovectrl.RegisterControllersAndWebhooks(mgr, logger, operatorCfg, webhookCertsReadyCh); err != nil {
+ if err = grovectrl.RegisterControllersAndWebhooks(mgr, logger, operatorConfig, webhookCertsReadyCh); err != nil {
logger.Error(err, "failed to initialize grove controller manager")
- os.Exit(1)
+ handleErrorAndExit(err, cli.ExitErrInitializeManager)
}
}()
logger.Info("Starting manager")
if err = mgr.Start(ctx); err != nil {
- logger.Error(err, "Error running manager")
- os.Exit(1)
+ logger.Error(err, "Error starting controller manager")
+ handleErrorAndExit(err, cli.ExitErrStart)
}
}
-func initializeOperatorConfig(cliOpts *groveopts.CLIOptions) (*configv1alpha1.OperatorConfiguration, error) {
- // complete and validate operator configuration
- if err := cliOpts.Complete(); err != nil {
- return nil, err
+func synchronizeTopology(ctx context.Context, mgr ctrl.Manager, operatorCfg *configv1alpha1.OperatorConfiguration) error {
+ cl, err := client.New(mgr.GetConfig(), client.Options{Scheme: mgr.GetScheme()})
+ if err != nil {
+ return fmt.Errorf("failed to create client for topology synchronization: %w", err)
}
- if err := cliOpts.Validate(); err != nil {
- return nil, err
+ if !operatorCfg.ClusterTopology.Enabled {
+ logger.Info("cluster topology is disabled, deleting existing ClusterTopology if any")
+ return clustertopology.DeleteClusterTopology(ctx, cl, corev1alpha1.DefaultClusterTopologyName)
}
- return cliOpts.Config, nil
+ // create or update ClusterTopology based on configuration
+ clusterTopology, err := clustertopology.EnsureClusterTopology(ctx, cl, logger, corev1alpha1.DefaultClusterTopologyName, operatorCfg.ClusterTopology.Levels)
+ if err != nil {
+ return err
+ }
+ // create or update KAI Topology based on ClusterTopology
+ return clustertopology.EnsureKAITopology(ctx, cl, logger, corev1alpha1.DefaultClusterTopologyName, clusterTopology)
}
func printFlags() {
@@ -119,14 +135,11 @@ func printFlags() {
logger.Info("Running with flags", flagKVs...)
}
-func validateClusterTopology(ctx context.Context, reader client.Reader, clusterTopologyConfig configv1alpha1.ClusterTopologyConfiguration) error {
- if !clusterTopologyConfig.Enabled {
- return nil
- }
- var clusterTopology grovecorev1alpha1.ClusterTopology
- if err := reader.Get(ctx, types.NamespacedName{Name: clusterTopologyConfig.Name}, &clusterTopology); err != nil {
- return fmt.Errorf("failed to fetch ClusterTopology %s: %w", clusterTopologyConfig.Name, err)
+// handleErrorAndExit gracefully handles errors before exiting the program.
+func handleErrorAndExit(err error, exitCode int) {
+ if errors.Is(err, pflag.ErrHelp) {
+ os.Exit(cli.ExitSuccess)
}
- logger.Info("topology validated successfully", "cluster topology", clusterTopologyConfig.Name)
- return nil
+ _, _ = fmt.Fprintf(os.Stderr, "Err: %v\n", err)
+ os.Exit(exitCode)
}
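
The startup path above maps each failure mode to a distinct process exit code from the new `cli` package. The definitions are not part of this hunk; as a quick reference, a minimal sketch of how such constants could be declared — the names are taken from the calls above, but the concrete values are illustrative assumptions:

```go
// Sketch only: exit-code constants matching the names referenced in main.go.
// The concrete values are assumptions, not taken from this patch.
package cli

const (
	// ExitSuccess is used for normal termination, including --help and --version.
	ExitSuccess = 0
	// Distinct non-zero codes let callers distinguish startup failure modes.
	ExitErrParseCLIArgs        = 10
	ExitErrLoadOperatorConfig  = 11
	ExitErrInitializeManager   = 12
	ExitErrSynchronizeTopology = 13
	ExitErrStart               = 14
)
```
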
diff --git a/operator/cmd/opts/options.go b/operator/cmd/opts/options.go
deleted file mode 100644
index 6f69608d1..000000000
--- a/operator/cmd/opts/options.go
+++ /dev/null
@@ -1,80 +0,0 @@
-// /*
-// Copyright 2024 The Grove Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// */
-
-package opts
-
-import (
- "fmt"
- "os"
-
- configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1"
- operatorvalidation "github.com/ai-dynamo/grove/operator/api/config/validation"
-
- "github.com/spf13/pflag"
- "k8s.io/apimachinery/pkg/runtime"
- "k8s.io/apimachinery/pkg/runtime/serializer"
- utilruntime "k8s.io/apimachinery/pkg/util/runtime"
-)
-
-var configDecoder runtime.Decoder
-
-func init() {
- configScheme := runtime.NewScheme()
- utilruntime.Must(configv1alpha1.AddToScheme(configScheme))
- configDecoder = serializer.NewCodecFactory(configScheme).UniversalDecoder()
-}
-
-// CLIOptions provides convenience abstraction to initialize and validate OperatorConfiguration from CLI flags.
-type CLIOptions struct {
- configFile string
- // Config is the operator configuration initialized from the CLI flags.
- Config *configv1alpha1.OperatorConfiguration
-}
-
-// NewCLIOptions creates a new CLIOptions and adds the required CLI flags to the flag.flagSet.
-func NewCLIOptions(fs *pflag.FlagSet) *CLIOptions {
- cliOpts := &CLIOptions{}
- cliOpts.addFlags(fs)
- return cliOpts
-}
-
-// Complete reads the configuration file and decodes it into an OperatorConfiguration.
-func (o *CLIOptions) Complete() error {
- if len(o.configFile) == 0 {
- return fmt.Errorf("missing config file")
- }
- data, err := os.ReadFile(o.configFile)
- if err != nil {
- return fmt.Errorf("error reading config file: %w", err)
- }
- o.Config = &configv1alpha1.OperatorConfiguration{}
- if err = runtime.DecodeInto(configDecoder, data, o.Config); err != nil {
- return fmt.Errorf("error decoding config: %w", err)
- }
- return nil
-}
-
-// Validate validates the created OperatorConfiguration.
-func (o *CLIOptions) Validate() error {
- if errs := operatorvalidation.ValidateOperatorConfiguration(o.Config); errs != nil {
- return errs.ToAggregate()
- }
- return nil
-}
-
-func (o *CLIOptions) addFlags(fs *pflag.FlagSet) {
- fs.StringVar(&o.configFile, "config", o.configFile, "Path to configuration file.")
-}
diff --git a/operator/e2e/setup/k8s_clusters.go b/operator/e2e/setup/k8s_clusters.go
index df98843ef..dfa45dcf7 100644
--- a/operator/e2e/setup/k8s_clusters.go
+++ b/operator/e2e/setup/k8s_clusters.go
@@ -1031,7 +1031,7 @@ func buildLDFlagsForE2E() string {
// Build the ldflags string
// Note: The module path changed from github.com/NVIDIA to github.com/ai-dynamo
- ldflags := fmt.Sprintf("-X github.com/ai-dynamo/grove/operator/internal/version.gitCommit=%s -X github.com/ai-dynamo/grove/operator/internal/version.gitTreeState=%s -X github.com/ai-dynamo/grove/operator/internal/version.buildDate=%s -X github.com/ai-dynamo/grove/operator/internal/version.gitVersion=E2E_TESTS -X github.com/ai-dynamo/grove/operator/internal/version.programName=grove-operator",
+ ldflags := fmt.Sprintf("-X github.com/ai-dynamo/grove/operator/internal/version.gitCommit=%s -X github.com/ai-dynamo/grove/operator/internal/version.gitTreeState=%s -X github.com/ai-dynamo/grove/operator/internal/version.buildDate=%s -X github.com/ai-dynamo/grove/operator/internal/version.gitVersion=E2E_TESTS",
gitCommit, treeState, buildDate)
return ldflags
diff --git a/operator/e2e/setup/skaffold.go b/operator/e2e/setup/skaffold.go
index bc61f40c4..782a29d3a 100644
--- a/operator/e2e/setup/skaffold.go
+++ b/operator/e2e/setup/skaffold.go
@@ -25,6 +25,7 @@ import (
"os/exec"
"path/filepath"
"strings"
+ "time"
"github.com/ai-dynamo/grove/operator/e2e/utils"
"k8s.io/client-go/rest"
@@ -144,7 +145,11 @@ func runSkaffoldBuild(ctx context.Context, absSkaffoldPath, skaffoldDir, kubecon
// Add the skaffold.yaml path
args = append(args, "-f", absSkaffoldPath)
- cmd := exec.CommandContext(ctx, "skaffold", args...)
+ // Add timeout to prevent indefinite hangs (normal builds should complete in <10 minutes)
+ buildCtx, cancel := context.WithTimeout(ctx, 15*time.Minute)
+ defer cancel()
+
+ cmd := exec.CommandContext(buildCtx, "skaffold", args...)
cmd.Dir = skaffoldDir
// Set up environment variables
diff --git a/operator/go.mod b/operator/go.mod
index 79779c20e..03ba96fa1 100644
--- a/operator/go.mod
+++ b/operator/go.mod
@@ -1,8 +1,9 @@
module github.com/ai-dynamo/grove/operator
-go 1.24.0
+go 1.24.5
require (
+ github.com/NVIDIA/KAI-scheduler v0.12.0
github.com/ai-dynamo/grove/operator/api v0.0.0
github.com/ai-dynamo/grove/scheduler/api v0.0.0
github.com/docker/docker v28.3.3+incompatible
@@ -53,7 +54,7 @@ require (
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
github.com/dimchansky/utfbom v1.1.1 // indirect
github.com/distribution/reference v0.6.0 // indirect
- github.com/docker/cli v27.0.3+incompatible // indirect
+ github.com/docker/cli v27.1.1+incompatible // indirect
github.com/docker/distribution v2.8.2+incompatible // indirect
github.com/docker/docker-credential-helpers v0.8.2 // indirect
github.com/docker/go v1.5.1-1.0.20160303222718-d30aec9fd63c // indirect
@@ -82,7 +83,7 @@ require (
github.com/google/btree v1.1.3 // indirect
github.com/google/gnostic-models v0.7.0 // indirect
github.com/google/go-cmp v0.7.0 // indirect
- github.com/google/go-containerregistry v0.19.1 // indirect
+ github.com/google/go-containerregistry v0.20.2 // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/gorilla/mux v1.8.1 // indirect
github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 // indirect
@@ -129,16 +130,15 @@ require (
github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f // indirect
- github.com/onsi/ginkgo/v2 v2.23.4 // indirect
github.com/opencontainers/go-digest v1.0.0 // indirect
github.com/opencontainers/image-spec v1.1.1 // indirect
github.com/pelletier/go-toml/v2 v2.2.3 // indirect
github.com/peterbourgon/diskv v2.0.1+incompatible // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
- github.com/prometheus/client_golang v1.23.0 // indirect
+ github.com/prometheus/client_golang v1.23.2 // indirect
github.com/prometheus/client_model v0.6.2 // indirect
- github.com/prometheus/common v0.65.0 // indirect
+ github.com/prometheus/common v0.66.1 // indirect
github.com/prometheus/procfs v0.17.0 // indirect
github.com/rancher/wharfie v0.6.2 // indirect
github.com/rubenv/sql-migrate v1.8.0 // indirect
@@ -161,7 +161,7 @@ require (
github.com/xeipuuv/gojsonschema v1.2.0 // indirect
github.com/xlab/treeprint v1.2.0 // indirect
go.opentelemetry.io/auto/sdk v1.1.0 // indirect
- go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0 // indirect
+ go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.59.0 // indirect
go.opentelemetry.io/otel v1.35.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.32.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.34.0 // indirect
@@ -187,9 +187,9 @@ require (
golang.org/x/time v0.12.0 // indirect
gomodules.xyz/jsonpatch/v2 v2.5.0 // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20250303144028-a0af3efb3deb // indirect
- google.golang.org/genproto/googleapis/rpc v0.0.0-20250303144028-a0af3efb3deb // indirect
+ google.golang.org/genproto/googleapis/rpc v0.0.0-20250313205543-e70fdf4c4cb4 // indirect
google.golang.org/grpc v1.72.1 // indirect
- google.golang.org/protobuf v1.36.7 // indirect
+ google.golang.org/protobuf v1.36.8 // indirect
gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/ini.v1 v1.67.0 // indirect
@@ -198,7 +198,7 @@ require (
k8s.io/apiserver v0.34.2 // indirect
k8s.io/component-base v0.34.2 // indirect
k8s.io/kube-openapi v0.0.0-20250814151709-d7b6acb124c3 // indirect
- k8s.io/kubectl v0.34.0 // indirect
+ k8s.io/kubectl v0.34.1 // indirect
oras.land/oras-go/v2 v2.6.0 // indirect
sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect
sigs.k8s.io/kustomize/api v0.20.1 // indirect
diff --git a/operator/go.sum b/operator/go.sum
index e4bc0d7f0..03c075fdf 100644
--- a/operator/go.sum
+++ b/operator/go.sum
@@ -24,6 +24,8 @@ github.com/Masterminds/squirrel v1.5.4 h1:uUcX/aBc8O7Fg9kaISIUsHXdKuqehiXAMQTYX8
github.com/Masterminds/squirrel v1.5.4/go.mod h1:NNaOrjSoIDfDA40n7sr2tPNZRfjzjA400rg+riTZj10=
github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY=
github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU=
+github.com/NVIDIA/KAI-scheduler v0.12.0 h1:3rhB0TdbJYNgOiZ6E0cAa+VJnr4srhoigJI/PkY85vI=
+github.com/NVIDIA/KAI-scheduler v0.12.0/go.mod h1:56asdZ2ewWn2ALmPM93zxxGWi+8MX3Q0aRlvv2ytfuA=
github.com/Shopify/logrus-bugsnag v0.0.0-20170309145241-6dbc35f2c30d/go.mod h1:HI8ITrYtUY+O+ZhtlqUnD8+KwNPOyugEhfP9fdUIaEQ=
github.com/Shopify/logrus-bugsnag v0.0.0-20171204204709-577dee27f20d h1:UrqY+r/OJnIp5u0s1SbQ8dVfLCZJsnvazdBP5hS4iRs=
github.com/Shopify/logrus-bugsnag v0.0.0-20171204204709-577dee27f20d/go.mod h1:HI8ITrYtUY+O+ZhtlqUnD8+KwNPOyugEhfP9fdUIaEQ=
@@ -97,8 +99,8 @@ github.com/distribution/reference v0.6.0 h1:0IXCQ5g4/QMHHkarYzh5l+u8T3t73zM5Qvfr
github.com/distribution/reference v0.6.0/go.mod h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E=
github.com/dlclark/regexp2 v1.11.0 h1:G/nrcoOa7ZXlpoa/91N3X7mM3r8eIlMBBJZvsz/mxKI=
github.com/dlclark/regexp2 v1.11.0/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8=
-github.com/docker/cli v27.0.3+incompatible h1:usGs0/BoBW8MWxGeEtqPMkzOY56jZ6kYlSN5BLDioCQ=
-github.com/docker/cli v27.0.3+incompatible/go.mod h1:JLrzqnKDaYBop7H2jaqPtU4hHvMKP+vjCwu2uszcLI8=
+github.com/docker/cli v27.1.1+incompatible h1:goaZxOqs4QKxznZjjBWKONQci/MywhtRv2oNn0GkeZE=
+github.com/docker/cli v27.1.1+incompatible/go.mod h1:JLrzqnKDaYBop7H2jaqPtU4hHvMKP+vjCwu2uszcLI8=
github.com/docker/distribution v2.7.1+incompatible/go.mod h1:J2gT2udsDAN96Uj4KfcMRqY0/ypR+oyYUYmja8H+y+w=
github.com/docker/distribution v2.8.2+incompatible h1:T3de5rq0dB1j30rp0sA2rER+m322EBzniBPB6ZIzuh8=
github.com/docker/distribution v2.8.2+incompatible/go.mod h1:J2gT2udsDAN96Uj4KfcMRqY0/ypR+oyYUYmja8H+y+w=
@@ -197,13 +199,13 @@ github.com/google/gnostic-models v0.7.0/go.mod h1:whL5G0m6dmc5cPxKc5bdKdEN3UjI7O
github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
-github.com/google/go-containerregistry v0.19.1 h1:yMQ62Al6/V0Z7CqIrrS1iYoA5/oQCm88DeNujc7C1KY=
-github.com/google/go-containerregistry v0.19.1/go.mod h1:YCMFNQeeXeLF+dnhhWkqDItx/JSkH01j1Kis4PsjzFI=
+github.com/google/go-containerregistry v0.20.2 h1:B1wPJ1SN/S7pB+ZAimcciVD+r+yV/l/DSArMxlbwseo=
+github.com/google/go-containerregistry v0.20.2/go.mod h1:z38EKdKh4h7IP2gSfUUqEvalZBqs6AoLeWfUy34nQC8=
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0=
github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
-github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 h1:BHT72Gu3keYf3ZEu2J0b1vyeLSOYI8bm5wbJM/8yDe8=
-github.com/google/pprof v0.0.0-20250403155104-27863c87afa6/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA=
+github.com/google/pprof v0.0.0-20250820193118-f64d9cf942d6 h1:EEHtgt9IwisQ2AZ4pIsMjahcegHh6rmhqxzIRQIyepY=
+github.com/google/pprof v0.0.0-20250820193118-f64d9cf942d6/go.mod h1:I6V7YzU0XDpsHqbsyrghnFZLO1gwK6NPTNvmetQIk9U=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/gorilla/handlers v1.5.2 h1:cLTUSsNkgcwhgRqvCNmdbRWG0A3N4F+M2nWKdScwyEE=
@@ -356,13 +358,14 @@ github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f h1:y5//uYreIhSUg3J
github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw=
github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno=
github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE=
+github.com/onsi/ginkgo v1.12.0 h1:Iw5WCbBcaAAd0fpRb1c9r5YCylv4XDoCSigm1zLevwU=
github.com/onsi/ginkgo v1.12.0/go.mod h1:oUhWkIvk5aDxtKvDDuw8gItl8pKl42LzjC9KZE0HfGg=
-github.com/onsi/ginkgo/v2 v2.23.4 h1:ktYTpKJAVZnDT4VjxSbiBenUjmlL/5QkBEocaWXiQus=
-github.com/onsi/ginkgo/v2 v2.23.4/go.mod h1:Bt66ApGPBFzHyR+JO10Zbt0Gsp4uWxu5mIOTusL46e8=
+github.com/onsi/ginkgo/v2 v2.25.3 h1:Ty8+Yi/ayDAGtk4XxmmfUy4GabvM+MegeB4cDLRi6nw=
+github.com/onsi/ginkgo/v2 v2.25.3/go.mod h1:43uiyQC4Ed2tkOzLsEYm7hnrb7UJTWHYNsuy3bG/snE=
github.com/onsi/gomega v1.7.1/go.mod h1:XdKZgCCFLUoM/7CFJVPcG8C1xQ1AJ0vpAezJrB7JYyY=
github.com/onsi/gomega v1.9.0/go.mod h1:Ho0h+IUsWyvy1OpqCwxlQ/21gkhVunqlU8fDGcoTdcA=
-github.com/onsi/gomega v1.38.0 h1:c/WX+w8SLAinvuKKQFh77WEucCnPk4j2OTUr7lt7BeY=
-github.com/onsi/gomega v1.38.0/go.mod h1:OcXcwId0b9QsE7Y49u+BTrL4IdKOBOKnD6VQNTJEB6o=
+github.com/onsi/gomega v1.38.2 h1:eZCjf2xjZAqe+LeWvKb5weQ+NcPwX84kqJ0cZNxok2A=
+github.com/onsi/gomega v1.38.2/go.mod h1:W2MJcYxRGV63b418Ai34Ud0hEdTVXq9NW9+Sx6uXf3k=
github.com/open-policy-agent/cert-controller v0.14.0 h1:TPc19BOHOs4tARruTT5o4bzir7Ed6FF+j3EXP/nmZBs=
github.com/open-policy-agent/cert-controller v0.14.0/go.mod h1:UhE/FU54DnKo+Rt0Yf3r+oKjgy6kqSH8Vsjo+5bGrSo=
github.com/open-policy-agent/frameworks/constraint v0.0.0-20241101234656-e78c8abd754a h1:gQtOJ50XFyL2Xh3lDD9zP4KQ2PY4mZKQ9hDcWc81Sp8=
@@ -394,8 +397,8 @@ github.com/prometheus/client_golang v0.9.0-pre1.0.20180209125602-c332b6f63c06/go
github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw=
github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo=
github.com/prometheus/client_golang v1.1.0/go.mod h1:I1FGZT9+L76gKKOs5djB6ezCbFQP1xR9D75/vuwEF3g=
-github.com/prometheus/client_golang v1.23.0 h1:ust4zpdl9r4trLY/gSjlm07PuiBq2ynaXXlptpfy8Uc=
-github.com/prometheus/client_golang v1.23.0/go.mod h1:i/o0R9ByOnHX0McrTMTyhYvKE4haaf2mW08I+jGAjEE=
+github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o=
+github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg=
github.com/prometheus/client_model v0.0.0-20171117100541-99fa1f4be8e5/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo=
github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo=
github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
@@ -404,8 +407,8 @@ github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvM
github.com/prometheus/common v0.0.0-20180110214958-89604d197083/go.mod h1:daVV7qP5qjZbuso7PdcryaAu0sAZbrN9i7WWcTMWvro=
github.com/prometheus/common v0.4.1/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4=
github.com/prometheus/common v0.6.0/go.mod h1:eBmuwkDJBwy6iBfxCBob6t6dR6ENT/y+J+Zk0j9GMYc=
-github.com/prometheus/common v0.65.0 h1:QDwzd+G1twt//Kwj/Ww6E9FQq1iVMmODnILtW1t2VzE=
-github.com/prometheus/common v0.65.0/go.mod h1:0gZns+BLRQ3V6NdaerOhMbwwRbNh9hkGINtQAsP5GS8=
+github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs=
+github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA=
github.com/prometheus/procfs v0.0.0-20180125133057-cb4147076ac7/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk=
github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk=
github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA=
@@ -508,8 +511,8 @@ go.opentelemetry.io/contrib/bridges/prometheus v0.57.0 h1:UW0+QyeyBVhn+COBec3nGh
go.opentelemetry.io/contrib/bridges/prometheus v0.57.0/go.mod h1:ppciCHRLsyCio54qbzQv0E4Jyth/fLWDTJYfvWpcSVk=
go.opentelemetry.io/contrib/exporters/autoexport v0.57.0 h1:jmTVJ86dP60C01K3slFQa2NQ/Aoi7zA+wy7vMOKD9H4=
go.opentelemetry.io/contrib/exporters/autoexport v0.57.0/go.mod h1:EJBheUMttD/lABFyLXhce47Wr6DPWYReCzaZiXadH7g=
-go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0 h1:yd02MEjBdJkG3uabWP9apV+OuWRIXGDuJEUJbOHmCFU=
-go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0/go.mod h1:umTcuxiv1n/s/S6/c2AT/g2CQ7u5C59sHDNmfSwgz7Q=
+go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.59.0 h1:CV7UdSGJt/Ao6Gp4CXckLxVRRsRgDHoI8XjbL3PDl8s=
+go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.59.0/go.mod h1:FRmFuRJfag1IZ2dPkHnEoSFVgTVPUd2qf5Vi69hLb8I=
go.opentelemetry.io/otel v1.35.0 h1:xKWKPxrxB6OtMCbmMY021CqC45J+3Onta9MqjhnusiQ=
go.opentelemetry.io/otel v1.35.0/go.mod h1:UEqy8Zp11hpkUrL73gSlELM0DupHoiq72dR+Zqel/+Y=
go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploggrpc v0.8.0 h1:WzNab7hOOLzdDF/EoWCt4glhrbMPVMOO5JYTmpz36Ls=
@@ -637,13 +640,13 @@ gomodules.xyz/jsonpatch/v2 v2.5.0 h1:JELs8RLM12qJGXU4u/TO3V25KW8GreMKl9pdkk14RM0
gomodules.xyz/jsonpatch/v2 v2.5.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY=
google.golang.org/genproto/googleapis/api v0.0.0-20250303144028-a0af3efb3deb h1:p31xT4yrYrSM/G4Sn2+TNUkVhFCbG9y8itM2S6Th950=
google.golang.org/genproto/googleapis/api v0.0.0-20250303144028-a0af3efb3deb/go.mod h1:jbe3Bkdp+Dh2IrslsFCklNhweNTBgSYanP1UXhJDhKg=
-google.golang.org/genproto/googleapis/rpc v0.0.0-20250303144028-a0af3efb3deb h1:TLPQVbx1GJ8VKZxz52VAxl1EBgKXXbTiU9Fc5fZeLn4=
-google.golang.org/genproto/googleapis/rpc v0.0.0-20250303144028-a0af3efb3deb/go.mod h1:LuRYeWDFV6WOn90g357N17oMCaxpgCnbi/44qJvDn2I=
+google.golang.org/genproto/googleapis/rpc v0.0.0-20250313205543-e70fdf4c4cb4 h1:iK2jbkWL86DXjEx0qiHcRE9dE4/Ahua5k6V8OWFb//c=
+google.golang.org/genproto/googleapis/rpc v0.0.0-20250313205543-e70fdf4c4cb4/go.mod h1:LuRYeWDFV6WOn90g357N17oMCaxpgCnbi/44qJvDn2I=
google.golang.org/grpc v1.0.5/go.mod h1:yo6s7OP7yaDglbqo1J04qKzAhqBH6lvTonzMVmEdcZw=
google.golang.org/grpc v1.72.1 h1:HR03wO6eyZ7lknl75XlxABNVLLFc2PAb6mHlYh756mA=
google.golang.org/grpc v1.72.1/go.mod h1:wH5Aktxcg25y1I3w7H69nHfXdOG3UiadoBtjh3izSDM=
-google.golang.org/protobuf v1.36.7 h1:IgrO7UwFQGJdRNXH/sQux4R1Dj1WAKcLElzeeRaXV2A=
-google.golang.org/protobuf v1.36.7/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY=
+google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc=
+google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU=
gopkg.in/airbrake/gobrake.v2 v2.0.9/go.mod h1:/h5ZAUhDkGaJfjzjKLSjv6zCL6O0LLBxU4K+aSYdM/U=
gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw=
gopkg.in/cenkalti/backoff.v2 v2.2.1 h1:eJ9UAg01/HIHG987TwxvnzK2MgxXq97YY6rYDpY9aII=
@@ -694,12 +697,12 @@ k8s.io/component-base v0.34.2 h1:HQRqK9x2sSAsd8+R4xxRirlTjowsg6fWCPwWYeSvogQ=
k8s.io/component-base v0.34.2/go.mod h1:9xw2FHJavUHBFpiGkZoKuYZ5pdtLKe97DEByaA+hHbM=
k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk=
k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE=
-k8s.io/kube-aggregator v0.33.4 h1:TdIJKHb0/bLpby7FblXIaVEzyA1jGEjzt/n9cRvwq8U=
-k8s.io/kube-aggregator v0.33.4/go.mod h1:wZuctdRvGde5bwzxkZRs0GYj2KOpCNgx8rRGVoNb62k=
+k8s.io/kube-aggregator v0.34.1 h1:WNLV0dVNoFKmuyvdWLd92iDSyD/TSTjqwaPj0U9XAEU=
+k8s.io/kube-aggregator v0.34.1/go.mod h1:RU8j+5ERfp0h+gIvWtxRPfsa5nK7rboDm8RST8BJfYQ=
k8s.io/kube-openapi v0.0.0-20250814151709-d7b6acb124c3 h1:liMHz39T5dJO1aOKHLvwaCjDbf07wVh6yaUlTpunnkE=
k8s.io/kube-openapi v0.0.0-20250814151709-d7b6acb124c3/go.mod h1:UZ2yyWbFTpuhSbFhv24aGNOdoRdJZgsIObGBUaYVsts=
-k8s.io/kubectl v0.34.0 h1:NcXz4TPTaUwhiX4LU+6r6udrlm0NsVnSkP3R9t0dmxs=
-k8s.io/kubectl v0.34.0/go.mod h1:bmd0W5i+HuG7/p5sqicr0Li0rR2iIhXL0oUyLF3OjR4=
+k8s.io/kubectl v0.34.1 h1:1qP1oqT5Xc93K+H8J7ecpBjaz511gan89KO9Vbsh/OI=
+k8s.io/kubectl v0.34.1/go.mod h1:JRYlhJpGPyk3dEmJ+BuBiOB9/dAvnrALJEiY/C5qa6A=
k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 h1:SjGebBtkBqHFOli+05xYbK8YF1Dzkbzn+gDM4X9T4Ck=
k8s.io/utils v0.0.0-20251002143259-bc988d571ff4/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
oras.land/oras-go/v2 v2.6.0 h1:X4ELRsiGkrbeox69+9tzTu492FMUu7zJQW6eJU+I2oc=
diff --git a/operator/hack/install-helm-charts.sh b/operator/hack/install-helm-charts.sh
deleted file mode 100755
index 0f85dba5e..000000000
--- a/operator/hack/install-helm-charts.sh
+++ /dev/null
@@ -1,34 +0,0 @@
-#!/usr/bin/env bash
-# /*
-# Copyright 2024 The Grove Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# */
-
-
-set -o errexit
-set -o nounset
-set -o pipefail
-
-
-function check_prerequisites() {
- if ! command -v helm &> /dev/null; then
- echo "helm is not installed. Please install helm from https://helm.sh/docs/intro/install"
- exit 1
- fi
-}
-
-function verify_and_install_charts() {
-
-}
-
diff --git a/operator/initc/cmd/main.go b/operator/initc/cmd/main.go
index f0ea4fabf..cae3e006f 100644
--- a/operator/initc/cmd/main.go
+++ b/operator/initc/cmd/main.go
@@ -26,7 +26,6 @@ import (
"github.com/ai-dynamo/grove/operator/initc/cmd/opts"
"github.com/ai-dynamo/grove/operator/initc/internal"
"github.com/ai-dynamo/grove/operator/internal/logger"
- "github.com/ai-dynamo/grove/operator/internal/version"
)
var (
@@ -41,9 +40,8 @@ func main() {
log.Error(err, "Failed to generate configuration for the init container from the flags")
os.Exit(1)
}
- version.PrintVersionAndExitIfRequested()
- log.Info("Starting grove init container", "version", version.Get())
+ log.Info("Starting grove init container")
podCliqueDependencies, err := config.GetPodCliqueDependencies()
if err != nil {
diff --git a/operator/initc/cmd/opts/options.go b/operator/initc/cmd/opts/options.go
index 3759b3dc9..76125c96d 100644
--- a/operator/initc/cmd/opts/options.go
+++ b/operator/initc/cmd/opts/options.go
@@ -23,7 +23,6 @@ import (
grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1"
groveerr "github.com/ai-dynamo/grove/operator/internal/errors"
- "github.com/ai-dynamo/grove/operator/internal/version"
"github.com/spf13/pflag"
)
@@ -43,11 +42,10 @@ type CLIOptions struct {
}
// RegisterFlags registers all the flags that are defined for the init container.
-func (c *CLIOptions) RegisterFlags(fs *pflag.FlagSet) {
+func (c *CLIOptions) RegisterFlags() {
// --podcliques=:
// --podcliques=podclique-a:3 --podcliques=podclique-b:4 and so on for each PodClique.
pflag.StringArrayVarP(&c.podCliques, "podcliques", "p", nil, "podclique name and minAvailable replicas separated by a colon, repeated for each podclique")
- version.AddFlags(fs)
}
// GetPodCliqueDependencies returns the PodClique information as a map with the minAvailable associated with each PodClique name.
@@ -91,7 +89,7 @@ func InitializeCLIOptions() (CLIOptions, error) {
config := CLIOptions{
podCliques: make([]string, 0),
}
- config.RegisterFlags(pflag.CommandLine)
+ config.RegisterFlags()
pflag.Parse()
return config, nil
}
diff --git a/operator/internal/client/scheme.go b/operator/internal/client/scheme.go
index ca4bd021d..8b43467d8 100644
--- a/operator/internal/client/scheme.go
+++ b/operator/internal/client/scheme.go
@@ -20,6 +20,7 @@ import (
configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1"
grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1"
+ kaitopologyv1alpha1 "github.com/NVIDIA/KAI-scheduler/pkg/apis/kai/v1alpha1"
schedv1alpha1 "github.com/ai-dynamo/grove/scheduler/api/core/v1alpha1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
@@ -35,6 +36,7 @@ func init() {
configv1alpha1.AddToScheme,
grovecorev1alpha1.AddToScheme,
schedv1alpha1.AddToScheme,
+ kaitopologyv1alpha1.AddToScheme,
k8sscheme.AddToScheme,
)
utilruntime.Must(metav1.AddMetaToScheme(Scheme))
diff --git a/operator/internal/clustertopology/clustertopology.go b/operator/internal/clustertopology/clustertopology.go
new file mode 100644
index 000000000..137b56324
--- /dev/null
+++ b/operator/internal/clustertopology/clustertopology.go
@@ -0,0 +1,154 @@
+// /*
+// Copyright 2025 The Grove Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// */
+
+package clustertopology
+
+import (
+ "context"
+ "fmt"
+ "reflect"
+
+ corev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1"
+
+ kaitopologyv1alpha1 "github.com/NVIDIA/KAI-scheduler/pkg/apis/kai/v1alpha1"
+ "github.com/go-logr/logr"
+ "github.com/samber/lo"
+ apierrors "k8s.io/apimachinery/pkg/api/errors"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+ "k8s.io/apimachinery/pkg/runtime"
+ ctrl "sigs.k8s.io/controller-runtime"
+ "sigs.k8s.io/controller-runtime/pkg/client"
+ "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
+)
+
+// EnsureClusterTopology ensures that the ClusterTopology is created or updated in the cluster.
+func EnsureClusterTopology(ctx context.Context, cl client.Client, logger logr.Logger, name string, topologyLevels []corev1alpha1.TopologyLevel) (*corev1alpha1.ClusterTopology, error) {
+ desiredTopology := buildClusterTopology(name, topologyLevels)
+ existingTopology := &corev1alpha1.ClusterTopology{}
+ err := cl.Get(ctx, client.ObjectKey{Name: name}, existingTopology)
+ if err != nil {
+ // If not found, create a new ClusterTopology
+ if apierrors.IsNotFound(err) {
+ if err = cl.Create(ctx, desiredTopology); err != nil {
+ return nil, fmt.Errorf("failed to create ClusterTopology %s: %w", name, err)
+ }
+ logger.Info("Created ClusterTopology", "name", name)
+ return desiredTopology, nil
+ }
+ return nil, fmt.Errorf("failed to get ClusterTopology %s: %w", name, err)
+ }
+
+ // Update existing ClusterTopology if there are changes
+ if isClusterTopologyChanged(existingTopology, desiredTopology) {
+ existingTopology.Spec = desiredTopology.Spec
+ if err = cl.Update(ctx, existingTopology); err != nil {
+ return nil, fmt.Errorf("failed to update ClusterTopology %s: %w", name, err)
+ }
+ logger.Info("Updated ClusterTopology successfully", "name", name)
+ }
+ return existingTopology, nil
+}
+
+// EnsureKAITopology ensures that the corresponding KAI ClusterTopology resource is created.
+func EnsureKAITopology(ctx context.Context, cl client.Client, logger logr.Logger, name string, clusterTopology *corev1alpha1.ClusterTopology) error {
+ desiredTopology, err := buildKAITopology(name, clusterTopology, cl.Scheme())
+ if err != nil {
+ return fmt.Errorf("failed to build KAI Topology: %w", err)
+ }
+ existingTopology := &kaitopologyv1alpha1.Topology{}
+ if err = cl.Get(ctx, client.ObjectKey{Name: name}, existingTopology); err != nil {
+ if apierrors.IsNotFound(err) {
+ if err = cl.Create(ctx, desiredTopology); err != nil {
+ return fmt.Errorf("failed to create KAI Topology %s: %w", name, err)
+ }
+ logger.Info("Created KAI Topology", "name", name)
+ return nil
+ }
+ return fmt.Errorf("failed to get KAI Topology %s: %w", name, err)
+ }
+
+ // If the existing KAI Topology is not controlled by the passed-in ClusterTopology, error out.
+ if !metav1.IsControlledBy(existingTopology, clusterTopology) {
+ return fmt.Errorf("KAI Topology %s is not owned by ClusterTopology %s. It is required that KAI Topology by this name is created by Grove operator and has ClusterTopology set as its owner", name, clusterTopology.Name)
+ }
+
+ if isKAITopologyChanged(existingTopology, desiredTopology) {
+ // KAI Topology needs to be updated. Since KAI Topology has immutable levels, we need to delete and recreate it.
+ // See https://github.com/NVIDIA/KAI-Scheduler/blob/130ab4468f96b25b1899ad2eced0a072dff033e9/pkg/apis/kai/v1alpha1/topology_types.go#L60
+ if err = cl.Delete(ctx, existingTopology); err != nil {
+ return fmt.Errorf("failed to recreate (action: delete) existing KAI Topology %s: %w", name, err)
+ }
+ if err = cl.Create(ctx, desiredTopology); err != nil {
+ return fmt.Errorf("failed to recreate (action: create) KAI Topology %s: %w", name, err)
+ }
+ logger.Info("Recreated KAI Topology with updated levels", "name", name)
+ }
+ return nil
+}
+
+// DeleteClusterTopology deletes the ClusterTopology with the given name.
+func DeleteClusterTopology(ctx context.Context, cl client.Client, name string) error {
+ if err := client.IgnoreNotFound(cl.Delete(ctx, &corev1alpha1.ClusterTopology{
+ ObjectMeta: ctrl.ObjectMeta{
+ Name: name,
+ },
+ })); err != nil {
+ return fmt.Errorf("failed to delete ClusterTopology %s: %w", name, err)
+ }
+ return nil
+}
+
+func buildClusterTopology(name string, topologyLevels []corev1alpha1.TopologyLevel) *corev1alpha1.ClusterTopology {
+ sortedTopologyLevels := make([]corev1alpha1.TopologyLevel, len(topologyLevels))
+ copy(sortedTopologyLevels, topologyLevels)
+ corev1alpha1.SortTopologyLevels(sortedTopologyLevels)
+ return &corev1alpha1.ClusterTopology{
+ ObjectMeta: ctrl.ObjectMeta{
+ Name: name,
+ },
+ Spec: corev1alpha1.ClusterTopologySpec{
+ Levels: sortedTopologyLevels,
+ },
+ }
+}
+
+func isClusterTopologyChanged(oldTopology, newTopology *corev1alpha1.ClusterTopology) bool {
+ return !reflect.DeepEqual(oldTopology.Spec, newTopology.Spec)
+}
+
+func buildKAITopology(name string, clusterTopology *corev1alpha1.ClusterTopology, scheme *runtime.Scheme) (*kaitopologyv1alpha1.Topology, error) {
+ kaiTopologyLevels := lo.Map(clusterTopology.Spec.Levels, func(clusterTopologyLevel corev1alpha1.TopologyLevel, _ int) kaitopologyv1alpha1.TopologyLevel {
+ return kaitopologyv1alpha1.TopologyLevel{
+ NodeLabel: clusterTopologyLevel.Key,
+ }
+ })
+ kaiTopology := &kaitopologyv1alpha1.Topology{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: name,
+ },
+ Spec: kaitopologyv1alpha1.TopologySpec{
+ Levels: kaiTopologyLevels,
+ },
+ }
+ if err := controllerutil.SetControllerReference(clusterTopology, kaiTopology, scheme); err != nil {
+ return nil, fmt.Errorf("failed to set owner reference for KAI Topology: %w", err)
+ }
+ return kaiTopology, nil
+}
+
+func isKAITopologyChanged(oldTopology, newTopology *kaitopologyv1alpha1.Topology) bool {
+ return !reflect.DeepEqual(oldTopology.Spec.Levels, newTopology.Spec.Levels)
+}
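
Taken together, the two `Ensure` helpers are meant to be called in order: the Grove-facing ClusterTopology first, then the scheduler-facing KAI Topology derived from it, as `synchronizeTopology` in main.go does above. A minimal caller sketch under those assumptions — the levels shown are example values, and `syncTopologySketch` is a hypothetical name, not part of the patch:

```go
// Illustrative sketch (not part of the patch): mirrors the call order used by
// synchronizeTopology in main.go.
package example

import (
	"context"

	corev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1"
	"github.com/ai-dynamo/grove/operator/internal/clustertopology"

	"github.com/go-logr/logr"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

func syncTopologySketch(ctx context.Context, cl client.Client, logger logr.Logger) error {
	// Example levels, broadest to narrowest; real deployments take these from
	// the operator configuration.
	levels := []corev1alpha1.TopologyLevel{
		{Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"},
		{Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"},
	}
	// Ensure the ClusterTopology first; it becomes the controlling owner of
	// the KAI Topology created from it, so deleting the ClusterTopology also
	// garbage-collects the KAI resource.
	ct, err := clustertopology.EnsureClusterTopology(ctx, cl, logger, corev1alpha1.DefaultClusterTopologyName, levels)
	if err != nil {
		return err
	}
	return clustertopology.EnsureKAITopology(ctx, cl, logger, corev1alpha1.DefaultClusterTopologyName, ct)
}
```
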
diff --git a/operator/internal/clustertopology/clustertopology_test.go b/operator/internal/clustertopology/clustertopology_test.go
new file mode 100644
index 000000000..e4be57a21
--- /dev/null
+++ b/operator/internal/clustertopology/clustertopology_test.go
@@ -0,0 +1,774 @@
+// /*
+// Copyright 2025 The Grove Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// */
+
+package clustertopology
+
+import (
+ "context"
+ "errors"
+ "testing"
+
+ apicommonconstants "github.com/ai-dynamo/grove/operator/api/common/constants"
+ corev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1"
+ testutils "github.com/ai-dynamo/grove/operator/test/utils"
+
+ kaitopologyv1alpha1 "github.com/NVIDIA/KAI-scheduler/pkg/apis/kai/v1alpha1"
+ "github.com/go-logr/logr"
+ "github.com/stretchr/testify/assert"
+ "github.com/stretchr/testify/require"
+ apierrors "k8s.io/apimachinery/pkg/api/errors"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+ "k8s.io/apimachinery/pkg/util/uuid"
+ "k8s.io/utils/ptr"
+ "sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+const topologyName = "test-topology"
+
+func TestEnsureClusterTopologyWhenNonExists(t *testing.T) {
+ // Setup
+ ctx := context.Background()
+ cl := testutils.CreateDefaultFakeClient(nil)
+ logger := logr.Discard()
+
+ topologyLevels := []corev1alpha1.TopologyLevel{
+ {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"},
+ {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"},
+ }
+
+ // Test
+ topology, err := EnsureClusterTopology(ctx, cl, logger, topologyName, topologyLevels)
+
+ // Assert
+ require.NoError(t, err)
+ assert.NotNil(t, topology)
+ assert.Equal(t, topologyName, topology.Name)
+ assert.Equal(t, topologyLevels, topology.Spec.Levels)
+
+ // Verify it was actually created
+ fetched := &corev1alpha1.ClusterTopology{}
+ err = cl.Get(ctx, client.ObjectKey{Name: topologyName}, fetched)
+ require.NoError(t, err)
+ assert.Equal(t, topologyLevels, fetched.Spec.Levels)
+}
+
+func TestEnsureClusterTopologyWhenUpdated(t *testing.T) {
+ // Setup
+ ctx := context.Background()
+ topologyLevels := []corev1alpha1.TopologyLevel{
+ {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"},
+ {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"},
+ }
+ // Create initial topology
+ existing := &corev1alpha1.ClusterTopology{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: topologyName,
+ },
+ Spec: corev1alpha1.ClusterTopologySpec{
+ Levels: topologyLevels,
+ },
+ }
+ cl := testutils.CreateDefaultFakeClient([]client.Object{existing})
+ logger := logr.Discard()
+
+ // Update with new levels and test
+ updatedTopologyLevels := []corev1alpha1.TopologyLevel{
+ {Domain: corev1alpha1.TopologyDomainRegion, Key: "topology.kubernetes.io/region"},
+ {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"},
+ {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"},
+ }
+ topology, err := EnsureClusterTopology(ctx, cl, logger, topologyName, updatedTopologyLevels)
+
+ // Assert
+ require.NoError(t, err)
+ assert.NotNil(t, topology)
+ assert.Equal(t, updatedTopologyLevels, topology.Spec.Levels)
+
+ // Verify it was actually updated
+ fetched := &corev1alpha1.ClusterTopology{}
+ err = cl.Get(ctx, client.ObjectKey{Name: topologyName}, fetched)
+ require.NoError(t, err)
+ assert.Equal(t, updatedTopologyLevels, fetched.Spec.Levels)
+}
+
+func TestEnsureClusterTopologyWhenNoUpdate(t *testing.T) {
+ // Setup
+ ctx := context.Background()
+ topologyLevels := []corev1alpha1.TopologyLevel{
+ {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"},
+ {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"},
+ }
+ existing := &corev1alpha1.ClusterTopology{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: topologyName,
+ ResourceVersion: "1",
+ },
+ Spec: corev1alpha1.ClusterTopologySpec{
+ Levels: topologyLevels,
+ },
+ }
+ cl := testutils.CreateDefaultFakeClient([]client.Object{existing})
+ logger := logr.Discard()
+
+ // Test
+ topology, err := EnsureClusterTopology(ctx, cl, logger, topologyName, topologyLevels)
+ // Assert
+ require.NoError(t, err)
+ assert.NotNil(t, topology)
+ assert.Equal(t, topologyLevels, topology.Spec.Levels)
+ // Resource version should remain the same (no update occurred)
+ assert.Equal(t, "1", topology.ResourceVersion)
+}
+
+func TestEnsureClusterTopologyWhenCreateError(t *testing.T) {
+ ctx := context.Background()
+ logger := logr.Discard()
+ topologyLevels := []corev1alpha1.TopologyLevel{
+ {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"},
+ }
+ // Create a client that will fail on create
+ createErr := apierrors.NewInternalError(assert.AnError)
+ cl := testutils.NewTestClientBuilder().
+ RecordErrorForObjects(testutils.ClientMethodCreate, createErr, client.ObjectKey{Name: topologyName}).
+ Build()
+
+ topology, err := EnsureClusterTopology(ctx, cl, logger, topologyName, topologyLevels)
+ assert.Error(t, err)
+ assert.Nil(t, topology)
+ assert.True(t, errors.Is(err, createErr))
+}
+
+func TestEnsureClusterTopologyWhenUpdateError(t *testing.T) {
+ // Setup
+ ctx := context.Background()
+ topologyLevels := []corev1alpha1.TopologyLevel{
+ {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"},
+ {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"},
+ }
+ existing := &corev1alpha1.ClusterTopology{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: topologyName,
+ },
+ Spec: corev1alpha1.ClusterTopologySpec{
+ Levels: topologyLevels,
+ },
+ }
+ // Create a client that will fail on update
+ updateErr := apierrors.NewInternalError(assert.AnError)
+ cl := testutils.NewTestClientBuilder().
+ WithObjects(existing).
+ RecordErrorForObjects(testutils.ClientMethodUpdate, updateErr, client.ObjectKey{Name: topologyName}).
+ Build()
+ logger := logr.Discard()
+ // Test
+ updatedTopologyLevels := []corev1alpha1.TopologyLevel{
+ {Domain: corev1alpha1.TopologyDomainRegion, Key: "topology.kubernetes.io/region"},
+ {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"},
+ {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"},
+ }
+ topology, err := EnsureClusterTopology(ctx, cl, logger, topologyName, updatedTopologyLevels)
+ assert.Error(t, err)
+ assert.Nil(t, topology)
+ assert.True(t, errors.Is(err, updateErr))
+}
+
+func TestEnsureClusterTopologyWhenGetError(t *testing.T) {
+ ctx := context.Background()
+ logger := logr.Discard()
+
+ // Create a client that will fail on get (non-NotFound error)
+ getErr := apierrors.NewInternalError(assert.AnError)
+ cl := testutils.NewTestClientBuilder().
+ RecordErrorForObjects(testutils.ClientMethodGet, getErr, client.ObjectKey{Name: topologyName}).
+ Build()
+
+ topologyLevels := []corev1alpha1.TopologyLevel{
+ {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"},
+ }
+ topology, err := EnsureClusterTopology(ctx, cl, logger, topologyName, topologyLevels)
+ assert.Error(t, err)
+ assert.Nil(t, topology)
+ assert.Contains(t, err.Error(), "failed to get ClusterTopology")
+}
+
+func TestEnsureKAITopologyWhenNonExists(t *testing.T) {
+ // Setup
+ ctx := context.Background()
+ cl := testutils.CreateDefaultFakeClient(nil)
+ logger := logr.Discard()
+
+ topologyLevels := []corev1alpha1.TopologyLevel{
+ {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"},
+ {Domain: corev1alpha1.TopologyDomainRack, Key: "topology.kubernetes.io/rack"},
+ {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"},
+ }
+ clusterTopology := &corev1alpha1.ClusterTopology{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: topologyName,
+ UID: uuid.NewUUID(),
+ },
+ Spec: corev1alpha1.ClusterTopologySpec{
+ Levels: topologyLevels,
+ },
+ }
+
+ // Test
+ err := EnsureKAITopology(ctx, cl, logger, topologyName, clusterTopology)
+ require.NoError(t, err)
+
+ // Verify KAI Topology was created
+ kaiTopology := &kaitopologyv1alpha1.Topology{}
+ err = cl.Get(ctx, client.ObjectKey{Name: topologyName}, kaiTopology)
+ require.NoError(t, err)
+ assert.Equal(t, topologyName, kaiTopology.Name)
+ assert.Len(t, kaiTopology.Spec.Levels, 3)
+ assert.Equal(t, "topology.kubernetes.io/zone", kaiTopology.Spec.Levels[0].NodeLabel)
+ assert.Equal(t, "topology.kubernetes.io/rack", kaiTopology.Spec.Levels[1].NodeLabel)
+ assert.Equal(t, "kubernetes.io/hostname", kaiTopology.Spec.Levels[2].NodeLabel)
+
+ // Verify owner reference is set
+ assert.True(t, metav1.IsControlledBy(kaiTopology, clusterTopology))
+}
+
+func TestEnsureKAITopologyWhenClusterTopologyUpdated(t *testing.T) {
+ // Setup
+ ctx := context.Background()
+ topologyLevels := []corev1alpha1.TopologyLevel{
+ {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"},
+ {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"},
+ }
+ clusterTopology := &corev1alpha1.ClusterTopology{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: topologyName,
+ UID: uuid.NewUUID(),
+ },
+ Spec: corev1alpha1.ClusterTopologySpec{
+ Levels: topologyLevels,
+ },
+ }
+
+ // Create existing KAI Topology with old levels
+ existingKAITopology := &kaitopologyv1alpha1.Topology{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: topologyName,
+ OwnerReferences: []metav1.OwnerReference{
+ {
+ APIVersion: corev1alpha1.SchemeGroupVersion.String(),
+ Kind: apicommonconstants.KindClusterTopology,
+ Name: clusterTopology.Name,
+ UID: clusterTopology.UID,
+ Controller: ptr.To(true),
+ BlockOwnerDeletion: ptr.To(true),
+ },
+ },
+ },
+ Spec: kaitopologyv1alpha1.TopologySpec{
+ Levels: []kaitopologyv1alpha1.TopologyLevel{
+ {NodeLabel: "kubernetes.io/hostname"},
+ },
+ },
+ }
+
+ cl := testutils.CreateDefaultFakeClient([]client.Object{clusterTopology, existingKAITopology})
+ logger := logr.Discard()
+
+ err := EnsureKAITopology(ctx, cl, logger, topologyName, clusterTopology)
+ require.NoError(t, err)
+
+ // Verify KAI Topology was recreated with new levels
+ kaiTopology := &kaitopologyv1alpha1.Topology{}
+ err = cl.Get(ctx, client.ObjectKey{Name: topologyName}, kaiTopology)
+ require.NoError(t, err)
+ // Generation 0 indicates a freshly created object, confirming the topology was recreated rather than updated in place.
+ assert.Equal(t, int64(0), kaiTopology.Generation)
+ assert.Len(t, kaiTopology.Spec.Levels, 2)
+ assert.Equal(t, "topology.kubernetes.io/zone", kaiTopology.Spec.Levels[0].NodeLabel)
+ assert.Equal(t, "kubernetes.io/hostname", kaiTopology.Spec.Levels[1].NodeLabel)
+}
+
+func TestEnsureKAITopologyWhenNoChangeRequired(t *testing.T) {
+ // Setup
+ ctx := context.Background()
+ topologyLevels := []corev1alpha1.TopologyLevel{
+ {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"},
+ {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"},
+ }
+ clusterTopology := &corev1alpha1.ClusterTopology{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: topologyName,
+ UID: uuid.NewUUID(),
+ },
+ Spec: corev1alpha1.ClusterTopologySpec{
+ Levels: topologyLevels,
+ },
+ }
+
+ // Create existing KAI Topology with same levels
+ existingKAITopology := &kaitopologyv1alpha1.Topology{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: topologyName,
+ ResourceVersion: "1",
+ OwnerReferences: []metav1.OwnerReference{
+ {
+ APIVersion: "grove.io/v1alpha1",
+ Kind: "ClusterTopology",
+ Name: clusterTopology.Name,
+ UID: clusterTopology.UID,
+ Controller: ptr.To(true),
+ BlockOwnerDeletion: ptr.To(true),
+ },
+ },
+ },
+ Spec: kaitopologyv1alpha1.TopologySpec{
+ Levels: []kaitopologyv1alpha1.TopologyLevel{
+ {NodeLabel: "topology.kubernetes.io/zone"},
+ {NodeLabel: "kubernetes.io/hostname"},
+ },
+ },
+ }
+
+ cl := testutils.CreateDefaultFakeClient([]client.Object{clusterTopology, existingKAITopology})
+ logger := logr.Discard()
+
+ err := EnsureKAITopology(ctx, cl, logger, topologyName, clusterTopology)
+ require.NoError(t, err)
+
+ // Verify KAI Topology was NOT recreated (resource version should be same)
+ kaiTopology := &kaitopologyv1alpha1.Topology{}
+ err = cl.Get(ctx, client.ObjectKey{Name: topologyName}, kaiTopology)
+ require.NoError(t, err)
+ assert.Equal(t, int64(0), kaiTopology.Generation)
+ assert.Equal(t, "1", kaiTopology.ResourceVersion)
+}
+
+func TestEnsureKAITopologyWithUnknownOwner(t *testing.T) {
+ // Setup
+ ctx := context.Background()
+ topologyLevels := []corev1alpha1.TopologyLevel{
+ {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"},
+ {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"},
+ }
+ clusterTopology := &corev1alpha1.ClusterTopology{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: topologyName,
+ UID: uuid.NewUUID(),
+ },
+ Spec: corev1alpha1.ClusterTopologySpec{
+ Levels: topologyLevels,
+ },
+ }
+
+ // Create existing KAI Topology owned by different ClusterTopology
+ existingKAITopology := &kaitopologyv1alpha1.Topology{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: topologyName,
+ OwnerReferences: []metav1.OwnerReference{
+ {
+ APIVersion: "grove.io/v1alpha1",
+ Kind: "ClusterTopology",
+ Name: "different-owner",
+ UID: "different-uid",
+ Controller: ptr.To(true),
+ BlockOwnerDeletion: ptr.To(true),
+ },
+ },
+ },
+ Spec: kaitopologyv1alpha1.TopologySpec{
+ Levels: []kaitopologyv1alpha1.TopologyLevel{
+ {NodeLabel: "topology.kubernetes.io/zone"},
+ },
+ },
+ }
+
+ cl := testutils.CreateDefaultFakeClient([]client.Object{clusterTopology, existingKAITopology})
+ logger := logr.Discard()
+ // Test and Assert
+ err := EnsureKAITopology(ctx, cl, logger, topologyName, clusterTopology)
+ assert.Error(t, err)
+}
+
+func TestEnsureKAITopologyWhenGetError(t *testing.T) {
+ // Setup
+ ctx := context.Background()
+ topologyLevels := []corev1alpha1.TopologyLevel{
+ {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"},
+ {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"},
+ }
+ clusterTopology := &corev1alpha1.ClusterTopology{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: topologyName,
+ UID: uuid.NewUUID(),
+ },
+ Spec: corev1alpha1.ClusterTopologySpec{
+ Levels: topologyLevels,
+ },
+ }
+
+ // Create a client that will fail on get (non-NotFound error)
+ getErr := apierrors.NewInternalError(assert.AnError)
+ cl := testutils.NewTestClientBuilder().
+ WithObjects(clusterTopology).
+ RecordErrorForObjects(testutils.ClientMethodGet, getErr, client.ObjectKey{Name: topologyName}).
+ Build()
+ logger := logr.Discard()
+ // Test
+ err := EnsureKAITopology(ctx, cl, logger, topologyName, clusterTopology)
+ // Assert
+ assert.Error(t, err)
+ assert.True(t, errors.Is(err, getErr))
+}
+
+func TestEnsureKAITopologyWhenCreateError(t *testing.T) {
+ // Setup
+ ctx := context.Background()
+ topologyLevels := []corev1alpha1.TopologyLevel{
+ {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"},
+ {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"},
+ }
+ clusterTopology := &corev1alpha1.ClusterTopology{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: topologyName,
+ UID: uuid.NewUUID(),
+ },
+ Spec: corev1alpha1.ClusterTopologySpec{
+ Levels: topologyLevels,
+ },
+ }
+
+ // Create a client that will fail on create
+ createErr := apierrors.NewInternalError(assert.AnError)
+ cl := testutils.NewTestClientBuilder().
+ WithObjects(clusterTopology).
+ RecordErrorForObjects(testutils.ClientMethodCreate, createErr, client.ObjectKey{Name: topologyName}).
+ Build()
+ logger := logr.Discard()
+
+ // Test
+ err := EnsureKAITopology(ctx, cl, logger, topologyName, clusterTopology)
+ // Assert
+ assert.Error(t, err)
+ assert.True(t, errors.Is(err, createErr))
+}
+
+func TestEnsureKAITopologyWhenDeleteError(t *testing.T) {
+ // Setup
+ ctx := context.Background()
+ // New topology levels that the ClusterTopology will have
+ topologyLevels := []corev1alpha1.TopologyLevel{
+ {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"},
+ {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"},
+ }
+ clusterTopology := &corev1alpha1.ClusterTopology{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: topologyName,
+ UID: uuid.NewUUID(),
+ },
+ Spec: corev1alpha1.ClusterTopologySpec{
+ Levels: topologyLevels,
+ },
+ }
+
+ // Create existing KAI Topology with OLD/DIFFERENT levels to trigger delete-and-recreate
+ existingKAITopology := &kaitopologyv1alpha1.Topology{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: topologyName,
+ OwnerReferences: []metav1.OwnerReference{
+ {
+ APIVersion: "grove.io/v1alpha1",
+ Kind: "ClusterTopology",
+ Name: clusterTopology.Name,
+ UID: clusterTopology.UID,
+ Controller: ptr.To(true),
+ BlockOwnerDeletion: ptr.To(true),
+ },
+ },
+ },
+ Spec: kaitopologyv1alpha1.TopologySpec{
+ Levels: []kaitopologyv1alpha1.TopologyLevel{
+ {NodeLabel: "old-label"}, // Different from ClusterTopology levels
+ },
+ },
+ }
+
+ // Create a client that will fail on delete
+ deleteErr := apierrors.NewInternalError(assert.AnError)
+ cl := testutils.NewTestClientBuilder().
+ WithObjects(clusterTopology, existingKAITopology).
+ RecordErrorForObjects(testutils.ClientMethodDelete, deleteErr, client.ObjectKey{Name: topologyName}).
+ Build()
+ logger := logr.Discard()
+
+ // Test
+ err := EnsureKAITopology(ctx, cl, logger, topologyName, clusterTopology)
+ // Assert
+ assert.Error(t, err)
+ assert.True(t, errors.Is(err, deleteErr))
+}
+
+func TestEnsureKAITopologyWhenCreateFailsDuringRecreate(t *testing.T) {
+ // Setup
+ ctx := context.Background()
+ updatedTopologyLevels := []corev1alpha1.TopologyLevel{
+ {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"},
+ {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"},
+ }
+ clusterTopology := &corev1alpha1.ClusterTopology{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: topologyName,
+ UID: uuid.NewUUID(),
+ },
+ Spec: corev1alpha1.ClusterTopologySpec{
+ Levels: updatedTopologyLevels,
+ },
+ }
+
+ // Create existing KAI Topology with old levels
+ existingKAITopology := &kaitopologyv1alpha1.Topology{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: topologyName,
+ OwnerReferences: []metav1.OwnerReference{
+ {
+ APIVersion: "grove.io/v1alpha1",
+ Kind: "ClusterTopology",
+ Name: clusterTopology.Name,
+ UID: clusterTopology.UID,
+ Controller: ptr.To(true),
+ BlockOwnerDeletion: ptr.To(true),
+ },
+ },
+ },
+ Spec: kaitopologyv1alpha1.TopologySpec{
+ Levels: []kaitopologyv1alpha1.TopologyLevel{
+ {NodeLabel: "kubernetes.io/hostname"},
+ },
+ },
+ }
+
+ // Create a client that succeeds when deleting the KAI Topology resource but fails on create
+ // This simulates the scenario where delete succeeds but create fails during recreation
+ createErr := apierrors.NewInternalError(assert.AnError)
+ cl := testutils.NewTestClientBuilder().
+ WithObjects(clusterTopology, existingKAITopology).
+ RecordErrorForObjects(testutils.ClientMethodCreate, createErr, client.ObjectKey{Name: topologyName}).
+ Build()
+ logger := logr.Discard()
+
+ err := EnsureKAITopology(ctx, cl, logger, topologyName, clusterTopology)
+ assert.Error(t, err)
+ // After delete succeeds, create will fail, so we get the create error message
+ assert.True(t, errors.Is(err, createErr))
+}
+
+func TestDeleteClusterTopology(t *testing.T) {
+ // Setup
+ ctx := context.Background()
+ topologyLevels := []corev1alpha1.TopologyLevel{
+ {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"},
+ {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"},
+ }
+ existing := &corev1alpha1.ClusterTopology{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: topologyName,
+ },
+ Spec: corev1alpha1.ClusterTopologySpec{
+ Levels: topologyLevels,
+ },
+ }
+ cl := testutils.CreateDefaultFakeClient([]client.Object{existing})
+
+ // Test
+ err := DeleteClusterTopology(ctx, cl, topologyName)
+ // Assert
+ require.NoError(t, err)
+ // Verify it was deleted
+ fetched := &corev1alpha1.ClusterTopology{}
+ err = cl.Get(ctx, client.ObjectKey{Name: topologyName}, fetched)
+ assert.True(t, apierrors.IsNotFound(err))
+}
+
+func TestDeleteClusterTopologyWhenNoneExists(t *testing.T) {
+ // Setup
+ ctx := context.Background()
+ cl := testutils.CreateDefaultFakeClient(nil)
+ // Test and Assert
+ // Should not error when deleting a non-existent topology
+ err := DeleteClusterTopology(ctx, cl, topologyName)
+ assert.NoError(t, err)
+}
+
+func TestDeleteClusterTopologyError(t *testing.T) {
+ ctx := context.Background()
+ topologyLevels := []corev1alpha1.TopologyLevel{
+ {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"},
+ {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"},
+ }
+ existing := &corev1alpha1.ClusterTopology{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: topologyName,
+ },
+ Spec: corev1alpha1.ClusterTopologySpec{
+ Levels: topologyLevels,
+ },
+ }
+
+ // Create a client that will fail on delete (non-NotFound error)
+ deleteErr := apierrors.NewInternalError(assert.AnError)
+ cl := testutils.NewTestClientBuilder().
+ WithObjects(existing).
+ RecordErrorForObjects(testutils.ClientMethodDelete, deleteErr, client.ObjectKey{Name: topologyName}).
+ Build()
+
+ err := DeleteClusterTopology(ctx, cl, topologyName)
+ assert.Error(t, err)
+ assert.True(t, errors.Is(err, deleteErr))
+}
+
+func TestBuildClusterTopology(t *testing.T) {
+ topologyLevels := []corev1alpha1.TopologyLevel{
+ {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"},
+ {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"},
+ }
+ topology := buildClusterTopology(topologyName, topologyLevels)
+ assert.Equal(t, topologyName, topology.Name)
+ assert.Equal(t, topologyLevels, topology.Spec.Levels)
+}
+
+func TestIsClusterTopologyChanged(t *testing.T) {
+ tests := []struct {
+ name string
+ oldLevels []corev1alpha1.TopologyLevel
+ newLevels []corev1alpha1.TopologyLevel
+ expectedChanged bool
+ }{
+ {
+ name: "No change in levels",
+ oldLevels: []corev1alpha1.TopologyLevel{
+ {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"},
+ {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"},
+ },
+ newLevels: []corev1alpha1.TopologyLevel{
+ {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"},
+ {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"},
+ },
+ expectedChanged: false,
+ },
+ {
+ name: "Change in levels",
+ oldLevels: []corev1alpha1.TopologyLevel{
+ {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"},
+ {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"},
+ },
+ newLevels: []corev1alpha1.TopologyLevel{
+ {Domain: corev1alpha1.TopologyDomainRegion, Key: "topology.kubernetes.io/region"},
+ },
+ expectedChanged: true,
+ },
+ }
+
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ oldTopology := &corev1alpha1.ClusterTopology{
+ Spec: corev1alpha1.ClusterTopologySpec{
+ Levels: test.oldLevels,
+ },
+ }
+ newTopology := &corev1alpha1.ClusterTopology{
+ Spec: corev1alpha1.ClusterTopologySpec{
+ Levels: test.newLevels,
+ },
+ }
+ topologyChanged := isClusterTopologyChanged(oldTopology, newTopology)
+ assert.Equal(t, test.expectedChanged, topologyChanged)
+ })
+ }
+}
+
+func TestBuildKAITopology(t *testing.T) {
+ // Setup
+ cl := testutils.CreateDefaultFakeClient(nil)
+ topologyLevels := []corev1alpha1.TopologyLevel{
+ {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"},
+ {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"},
+ }
+ clusterTopology := &corev1alpha1.ClusterTopology{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: topologyName,
+ UID: uuid.NewUUID(),
+ },
+ Spec: corev1alpha1.ClusterTopologySpec{
+ Levels: topologyLevels,
+ },
+ }
+
+ // Test
+ kaiTopology, err := buildKAITopology(topologyName, clusterTopology, cl.Scheme())
+ // Assert
+ require.NoError(t, err)
+ assert.Equal(t, topologyName, kaiTopology.Name)
+ assert.Len(t, kaiTopology.Spec.Levels, 2)
+ assert.Equal(t, "topology.kubernetes.io/zone", kaiTopology.Spec.Levels[0].NodeLabel)
+ assert.Equal(t, "kubernetes.io/hostname", kaiTopology.Spec.Levels[1].NodeLabel)
+ assert.True(t, metav1.IsControlledBy(kaiTopology, clusterTopology))
+}
+
+func TestIsKAITopologyChanged(t *testing.T) {
+ tests := []struct {
+ name string
+ oldLevels []kaitopologyv1alpha1.TopologyLevel
+ newLevels []kaitopologyv1alpha1.TopologyLevel
+ expectedChanged bool
+ }{
+ {
+ name: "No change in levels",
+ oldLevels: []kaitopologyv1alpha1.TopologyLevel{
+ {NodeLabel: "topology.kubernetes.io/zone"},
+ {NodeLabel: "kubernetes.io/hostname"},
+ },
+ newLevels: []kaitopologyv1alpha1.TopologyLevel{
+ {NodeLabel: "topology.kubernetes.io/zone"},
+ {NodeLabel: "kubernetes.io/hostname"},
+ },
+ expectedChanged: false,
+ },
+ {
+ name: "Change in levels",
+ oldLevels: []kaitopologyv1alpha1.TopologyLevel{
+ {NodeLabel: "topology.kubernetes.io/zone"},
+ {NodeLabel: "kubernetes.io/hostname"},
+ },
+ newLevels: []kaitopologyv1alpha1.TopologyLevel{
+ {NodeLabel: "topology.kubernetes.io/region"},
+ },
+ expectedChanged: true,
+ },
+ }
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ oldTopology := &kaitopologyv1alpha1.Topology{
+ Spec: kaitopologyv1alpha1.TopologySpec{
+ Levels: test.oldLevels,
+ },
+ }
+ newTopology := &kaitopologyv1alpha1.Topology{
+ Spec: kaitopologyv1alpha1.TopologySpec{
+ Levels: test.newLevels,
+ },
+ }
+ topologyChanged := isKAITopologyChanged(oldTopology, newTopology)
+ assert.Equal(t, test.expectedChanged, topologyChanged)
+ })
+ }
+}
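
The tests above pin down the ensure semantics of EnsureKAITopology: create the KAI Topology when it is missing, leave it untouched when the levels already match, fail when it is controlled by a different ClusterTopology, and delete-and-recreate when the levels changed. The following is a minimal self-contained sketch of that flow, reconstructed from the assertions with stand-in types; it is not the actual implementation.

package main

import (
	"fmt"
	"reflect"
)

// topology is a simplified stand-in for the KAI Topology object.
type topology struct {
	ownerUID string
	levels   []string
}

// ensure mirrors the behavior asserted in the tests: create when absent,
// no-op when unchanged, error on a foreign owner, recreate on changed levels.
func ensure(store map[string]*topology, name, ownerUID string, levels []string) error {
	existing, ok := store[name]
	if !ok {
		store[name] = &topology{ownerUID: ownerUID, levels: levels}
		return nil
	}
	if existing.ownerUID != ownerUID {
		return fmt.Errorf("topology %q is controlled by a different ClusterTopology", name)
	}
	if reflect.DeepEqual(existing.levels, levels) {
		return nil // no change required; the existing object is left as-is
	}
	// Levels changed: delete and recreate, which is why the tests exercise
	// delete errors and create-during-recreate errors independently.
	delete(store, name)
	store[name] = &topology{ownerUID: ownerUID, levels: levels}
	return nil
}

func main() {
	store := map[string]*topology{}
	_ = ensure(store, "grove-topology", "uid-1", []string{"topology.kubernetes.io/zone", "kubernetes.io/hostname"})
	fmt.Println(store["grove-topology"].levels)
}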
diff --git a/operator/internal/controller/cert/cert.go b/operator/internal/controller/cert/cert.go
index d7478e949..2cd60b4d2 100644
--- a/operator/internal/controller/cert/cert.go
+++ b/operator/internal/controller/cert/cert.go
@@ -22,7 +22,6 @@ import (
"strings"
"github.com/ai-dynamo/grove/operator/internal/constants"
- clustertopologyvalidation "github.com/ai-dynamo/grove/operator/internal/webhook/admission/clustertopology/validation"
authorizationwebhook "github.com/ai-dynamo/grove/operator/internal/webhook/admission/pcs/authorization"
defaultingwebhook "github.com/ai-dynamo/grove/operator/internal/webhook/admission/pcs/defaulting"
validatingwebhook "github.com/ai-dynamo/grove/operator/internal/webhook/admission/pcs/validation"
@@ -88,10 +87,6 @@ func getWebhooks(authorizerEnabled bool) []cert.WebhookInfo {
Type: cert.Validating,
Name: validatingwebhook.Name,
},
- {
- Type: cert.Validating,
- Name: clustertopologyvalidation.Name,
- },
}
if authorizerEnabled {
webhooks = append(webhooks, cert.WebhookInfo{
diff --git a/operator/internal/controller/cert/cert_test.go b/operator/internal/controller/cert/cert_test.go
index 87f1b6362..7f9e4c03f 100644
--- a/operator/internal/controller/cert/cert_test.go
+++ b/operator/internal/controller/cert/cert_test.go
@@ -32,25 +32,23 @@ func TestGetWebhooks(t *testing.T) {
t.Run("authorizer disabled", func(t *testing.T) {
webhooks := getWebhooks(false)
- // Expect 3 webhooks: podcliqueset-defaulting, podcliqueset-validating, clustertopology-validating
- require.Len(t, webhooks, 3)
+ // Expect 2 webhooks: podcliqueset-defaulting, podcliqueset-validating
+ require.Len(t, webhooks, 2)
// Check that defaulting and validating webhooks are present
assert.Equal(t, cert.Mutating, webhooks[0].Type) // podcliqueset-defaulting-webhook
assert.Equal(t, cert.Validating, webhooks[1].Type) // podcliqueset-validating-webhook
- assert.Equal(t, cert.Validating, webhooks[2].Type) // clustertopology-validating-webhook
})
// Test with authorizer enabled
t.Run("authorizer enabled", func(t *testing.T) {
webhooks := getWebhooks(true)
- // Expect 4 webhooks: the 3 base webhooks plus the authorizer-webhook
- require.Len(t, webhooks, 4)
- // Check that all four webhooks are present
+ // Expect 3 webhooks: the 2 base webhooks plus the authorizer-webhook
+ require.Len(t, webhooks, 3)
+ // Check that all three webhooks are present
assert.Equal(t, cert.Mutating, webhooks[0].Type) // podcliqueset-defaulting-webhook
assert.Equal(t, cert.Validating, webhooks[1].Type) // podcliqueset-validating-webhook
- assert.Equal(t, cert.Validating, webhooks[2].Type) // clustertopology-validating-webhook
- assert.Equal(t, cert.Validating, webhooks[3].Type) // authorizer-webhook
+ assert.Equal(t, cert.Validating, webhooks[2].Type) // authorizer-webhook
})
}
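
For reference, this is the getWebhooks shape implied by the hunk above and its updated tests, reconstructed with local stand-in types rather than the real cert package:

package main

import "fmt"

type webhookType string

const (
	mutating   webhookType = "Mutating"
	validating webhookType = "Validating"
)

type webhookInfo struct {
	Type webhookType
	Name string
}

// getWebhooks mirrors the post-change behavior: two base webhooks, plus the
// authorizer webhook when enabled.
func getWebhooks(authorizerEnabled bool) []webhookInfo {
	webhooks := []webhookInfo{
		{Type: mutating, Name: "podcliqueset-defaulting-webhook"},
		{Type: validating, Name: "podcliqueset-validating-webhook"},
	}
	if authorizerEnabled {
		webhooks = append(webhooks, webhookInfo{Type: validating, Name: "authorizer-webhook"})
	}
	return webhooks
}

func main() {
	fmt.Println(len(getWebhooks(false)), len(getWebhooks(true))) // prints: 2 3
}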
diff --git a/operator/internal/controller/manager.go b/operator/internal/controller/manager.go
index 5c5155614..0ff633593 100644
--- a/operator/internal/controller/manager.go
+++ b/operator/internal/controller/manager.go
@@ -46,7 +46,7 @@ const (
var (
waitTillWebhookCertsReady = cert.WaitTillWebhookCertsReady
registerControllersWithMgr = RegisterControllers
- registerWebhooksWithMgr = webhook.RegisterWebhooks
+ registerWebhooksWithMgr = webhook.Register
)
// CreateManager creates the manager.
@@ -56,7 +56,7 @@ func CreateManager(operatorCfg *configv1alpha1.OperatorConfiguration) (ctrl.Mana
// RegisterControllersAndWebhooks adds all the controllers and webhooks to the controller-manager using the passed in Config.
func RegisterControllersAndWebhooks(mgr ctrl.Manager, logger logr.Logger, operatorCfg *configv1alpha1.OperatorConfiguration, certsReady chan struct{}) error {
- // Controllers will not work unless the webhoooks are fully configured and operational.
+ // Controllers will not work unless the webhooks are fully configured and operational.
// For webhooks to work cert-controller should finish its work of generating and injecting certificates.
waitTillWebhookCertsReady(logger, certsReady)
if err := registerControllersWithMgr(mgr, operatorCfg.Controllers); err != nil {
diff --git a/operator/internal/controller/podclique/components/pod/initcontainer.go b/operator/internal/controller/podclique/components/pod/initcontainer.go
index c2ad8c945..fa77297e2 100644
--- a/operator/internal/controller/podclique/components/pod/initcontainer.go
+++ b/operator/internal/controller/podclique/components/pod/initcontainer.go
@@ -26,7 +26,7 @@ import (
"github.com/ai-dynamo/grove/operator/internal/constants"
"github.com/ai-dynamo/grove/operator/internal/controller/common/component"
groveerr "github.com/ai-dynamo/grove/operator/internal/errors"
- "github.com/ai-dynamo/grove/operator/internal/version"
+ groveversion "github.com/ai-dynamo/grove/operator/internal/version"
"github.com/samber/lo"
corev1 "k8s.io/api/core/v1"
@@ -107,7 +107,7 @@ func addInitContainer(pcs *grovecorev1alpha1.PodCliqueSet, pclq *grovecorev1alph
pod.Spec.InitContainers = append(pod.Spec.InitContainers, corev1.Container{
Name: initContainerName,
- Image: fmt.Sprintf("%s:%s", image, version.Get().GitVersion),
+ Image: fmt.Sprintf("%s:%s", image, groveversion.New().GitVersion),
Args: args,
VolumeMounts: []corev1.VolumeMount{
{
diff --git a/operator/internal/utils/ioutil/ioutil.go b/operator/internal/utils/ioutil/ioutil.go
new file mode 100644
index 000000000..ed51c19b6
--- /dev/null
+++ b/operator/internal/utils/ioutil/ioutil.go
@@ -0,0 +1,26 @@
+// /*
+// Copyright 2025 The Grove Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// */
+
+package ioutil
+
+import "io"
+
+// CloseQuietly closes a non-nil io.Closer, ignoring any error returned by the close operation.
+func CloseQuietly(closer io.Closer) {
+ if closer != nil {
+ _ = closer.Close()
+ }
+}
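
A typical call site for the new helper might look like the sketch below; readFile and the file path are made up for illustration, while the import path is the package introduced above.

package main

import (
	"fmt"
	"io"
	"os"

	"github.com/ai-dynamo/grove/operator/internal/utils/ioutil"
)

// readFile opens a file, reads it fully, and lets CloseQuietly handle the
// close, since a close error on a read-only handle is rarely actionable.
func readFile(path string) ([]byte, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer ioutil.CloseQuietly(f)
	return io.ReadAll(f)
}

func main() {
	data, err := readFile("/etc/hosts")
	if err != nil {
		fmt.Println("read failed:", err)
		return
	}
	fmt.Printf("read %d bytes\n", len(data))
}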
diff --git a/operator/internal/utils/ioutil/ioutil_test.go b/operator/internal/utils/ioutil/ioutil_test.go
new file mode 100644
index 000000000..c3831531c
--- /dev/null
+++ b/operator/internal/utils/ioutil/ioutil_test.go
@@ -0,0 +1,133 @@
+// /*
+// Copyright 2025 The Grove Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// */
+
+package ioutil
+
+import (
+ "errors"
+ "io"
+ "testing"
+
+ "github.com/stretchr/testify/assert"
+)
+
+// TestCloseQuietly tests the CloseQuietly function with various scenarios.
+func TestCloseQuietly(t *testing.T) {
+ tests := []struct {
+ name string
+ closer io.Closer
+ validate func(t *testing.T, closer io.Closer)
+ }{
+ {
+ name: "close without errors",
+ closer: &mockCloser{
+ closeErr: nil,
+ },
+ validate: func(t *testing.T, closer io.Closer) {
+ mock := closer.(*mockCloser)
+ assert.True(t, mock.closed, "closer should be closed")
+ assert.Equal(t, 1, mock.callCount, "Close should be called once")
+ },
+ },
+ {
+ name: "close with error",
+ closer: &mockCloser{
+ closeErr: errors.New("close failed"),
+ },
+ validate: func(t *testing.T, closer io.Closer) {
+ mock := closer.(*mockCloser)
+ assert.True(t, mock.closed, "closer should be closed despite error")
+ assert.Equal(t, 1, mock.callCount, "Close should be called once")
+ },
+ },
+ {
+ name: "closer is nil",
+ closer: nil,
+ validate: func(_ *testing.T, _ io.Closer) {
+ // No panic should occur, nothing to validate
+ },
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ // This should not panic regardless of error
+ assert.NotPanics(t, func() {
+ CloseQuietly(tt.closer)
+ }, "CloseQuietly should not panic")
+
+ if tt.validate != nil {
+ tt.validate(t, tt.closer)
+ }
+ })
+ }
+}
+
+// TestCloseQuietlyWithRealCloser tests with a more realistic closer.
+func TestCloseQuietlyWithRealCloser(t *testing.T) {
+ // Using a simple pipe as a real io.Closer
+ r, w := io.Pipe()
+
+ // Close the reader first
+ assert.NotPanics(t, func() {
+ CloseQuietly(r)
+ })
+
+ // Close the writer
+ assert.NotPanics(t, func() {
+ CloseQuietly(w)
+ })
+
+ // Try to close again (should not panic even if already closed)
+ assert.NotPanics(t, func() {
+ CloseQuietly(r)
+ CloseQuietly(w)
+ })
+}
+
+// TestCloseQuietlyWithPanic verifies that CloseQuietly suppresses only errors, not panics.
+func TestCloseQuietlyWithPanic(t *testing.T) {
+ closer := &panicCloser{}
+
+ // CloseQuietly should not catch panics, only suppress errors
+ assert.Panics(t, func() {
+ CloseQuietly(closer)
+ }, "CloseQuietly should not suppress panics, only errors")
+}
+
+// mockCloser is a mock implementation of io.Closer for testing.
+type mockCloser struct {
+ closed bool
+ closeErr error
+ callCount int
+}
+
+func (m *mockCloser) Close() error {
+ m.callCount++
+ m.closed = true
+ return m.closeErr
+}
+
+// panicCloser is a closer that panics when closed.
+type panicCloser struct{}
+
+func (p *panicCloser) Close() error {
+ panic("intentional panic")
+}
diff --git a/operator/internal/version/version.go b/operator/internal/version/version.go
index ee1bc7018..e7ba705b5 100644
--- a/operator/internal/version/version.go
+++ b/operator/internal/version/version.go
@@ -17,21 +17,15 @@
package version
import (
+ "encoding/json"
"fmt"
- "io"
- "os"
"runtime"
-
- "github.com/spf13/pflag"
- apimachineryversion "k8s.io/apimachinery/pkg/version"
)
// These variables will be set when building the grove operator via LD_FLAGS.
// They have been borrowed from the k8s.io/component-base repository. We do not want
// the dependencies that k8s.io/component-base pulls in, as the aim is to keep a lean set of dependencies.
var (
- // programName is the name of the operator.
- programName = "grove-operator"
// gitVersion is the semantic version for grove operator.
gitVersion = "v0.0.0-master+$Format:%H$"
// gitCommit is the SHA1 from git, output of $(git rev-parse HEAD)
@@ -39,18 +33,23 @@ var (
// gitTreeState is the state of git tree, either "clean" or "dirty"
gitTreeState = ""
// buildDate is the date (in ISO8601 format) at which the build was done. Output of $(date -u +'%Y-%m-%dT%H:%M:%SZ')
- buildDate = "1970-01-01T00:00:00Z"
- versionFlag bool
+ buildDate = "1970-01-01T00:00:00Z"
)
-// AddFlags adds the --version flag to the flag.FlagSet.
-func AddFlags(fs *pflag.FlagSet) {
- fs.BoolVar(&versionFlag, "version", false, "version prints the version information and quits")
+// GroveInfo holds version and build information about the grove operator.
+type GroveInfo struct {
+ GitVersion string `json:"gitVersion"`
+ GitCommit string `json:"gitCommit"`
+ GitTreeState string `json:"gitTreeState"`
+ BuildDate string `json:"buildDate"`
+ GoVersion string `json:"goVersion"`
+ Compiler string `json:"compiler"`
+ Platform string `json:"platform"`
}
-// Get returns the version details for the grove operator.
-func Get() apimachineryversion.Info {
- return apimachineryversion.Info{
+// New creates a new GroveInfo with the build and version information.
+func New() GroveInfo {
+ return GroveInfo{
GitVersion: gitVersion,
GitCommit: gitCommit,
GitTreeState: gitTreeState,
@@ -61,11 +60,16 @@ func Get() apimachineryversion.Info {
}
}
-// PrintVersionAndExitIfRequested will check if --version is passed and if it is
-// then it will print the version information and quit.
-func PrintVersionAndExitIfRequested() {
- if versionFlag {
- _, _ = fmt.Fprintf(io.Writer(os.Stdout), "%s %v\n", programName, Get())
- os.Exit(0)
+// Version returns the semantic version of the Grove operator.
+func (g GroveInfo) Version() string {
+ return g.GitVersion
+}
+
+// Verbose returns the version and build information as a single JSON-encoded string.
+func (g GroveInfo) Verbose() string {
+ infoBytes, err := json.Marshal(g)
+ if err != nil {
+ return fmt.Sprintf("error generating verbose version information: %v\n", err)
}
+ return string(infoBytes)
}
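
With AddFlags and PrintVersionAndExitIfRequested removed, callers are expected to wire a version flag themselves. A minimal sketch under that assumption follows; the flag names and the use of the standard flag package are illustrative, since the operator previously used pflag:

package main

import (
	"flag"
	"fmt"
	"os"

	groveversion "github.com/ai-dynamo/grove/operator/internal/version"
)

func main() {
	versionFlag := flag.Bool("version", false, "print version information and quit")
	verbose := flag.Bool("verbose", false, "print full build metadata as JSON")
	flag.Parse()

	if *versionFlag {
		info := groveversion.New()
		if *verbose {
			fmt.Println(info.Verbose()) // JSON with git commit, tree state, build date, platform
		} else {
			fmt.Println(info.Version()) // just the semantic git version
		}
		os.Exit(0)
	}
}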
diff --git a/operator/internal/webhook/admission/clustertopology/validation/clustertopology.go b/operator/internal/webhook/admission/clustertopology/validation/clustertopology.go
deleted file mode 100644
index 1cbc032e6..000000000
--- a/operator/internal/webhook/admission/clustertopology/validation/clustertopology.go
+++ /dev/null
@@ -1,182 +0,0 @@
-// /*
-// Copyright 2025 The Grove Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// */
-
-package validation
-
-import (
- "fmt"
-
- grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1"
-
- "github.com/samber/lo"
- admissionv1 "k8s.io/api/admission/v1"
- apivalidation "k8s.io/apimachinery/pkg/api/validation"
- metav1validation "k8s.io/apimachinery/pkg/apis/meta/v1/validation"
- "k8s.io/apimachinery/pkg/util/validation/field"
-)
-
-// clusterTopologyValidator validates ClusterTopology resources for create and update operations.
-type clusterTopologyValidator struct {
- operation admissionv1.Operation
- clusterTopology *grovecorev1alpha1.ClusterTopology
-}
-
-// newClusterTopologyValidator creates a new ClusterTopology validator for the given operation.
-func newClusterTopologyValidator(clusterTopology *grovecorev1alpha1.ClusterTopology, operation admissionv1.Operation) *clusterTopologyValidator {
- return &clusterTopologyValidator{
- operation: operation,
- clusterTopology: clusterTopology,
- }
-}
-
-// ---------------------------- validate create of ClusterTopology -----------------------------------------------
-
-// validate validates the ClusterTopology object.
-func (v *clusterTopologyValidator) validate() error {
- allErrs := field.ErrorList{}
-
- // Note: Metadata validation is handled by the API server
- errs := v.validateClusterTopologySpec(field.NewPath("spec"))
- if len(errs) != 0 {
- allErrs = append(allErrs, errs...)
- }
-
- return allErrs.ToAggregate()
-}
-
-// validateClusterTopologySpec validates the specification of a ClusterTopology object.
-func (v *clusterTopologyValidator) validateClusterTopologySpec(fldPath *field.Path) field.ErrorList {
- allErrs := field.ErrorList{}
-
- levelsPath := fldPath.Child("levels")
-
- // Note: MinItems and MaxItems are enforced by kubebuilder validation
- // (+kubebuilder:validation:MinItems=1, +kubebuilder:validation:MaxItems=7)
-
- // First, validate each level individually (domain validity, key format, etc.)
- lo.ForEach(v.clusterTopology.Spec.Levels, func(level grovecorev1alpha1.TopologyLevel, i int) {
- levelPath := levelsPath.Index(i)
- allErrs = append(allErrs, v.validateTopologyLevel(level, levelPath)...)
- })
-
- // Then validate hierarchical order (assumes all domains are valid)
- // This also ensures domain uniqueness
- allErrs = append(allErrs, validateTopologyLevelOrder(v.clusterTopology.Spec.Levels, levelsPath)...)
-
- // Validate that all keys are unique
- allErrs = append(allErrs, validateTopologyLevelKeyUniqueness(v.clusterTopology.Spec.Levels, levelsPath)...)
-
- return allErrs
-}
-
-// validateTopologyLevel validates a single topology level in isolation.
-func (v *clusterTopologyValidator) validateTopologyLevel(level grovecorev1alpha1.TopologyLevel, fldPath *field.Path) field.ErrorList {
- allErrs := field.ErrorList{}
-
- // Note: Domain validation is handled by kubebuilder enum validation
- // (+kubebuilder:validation:Enum=region;zone;datacenter;block;rack;host;numa)
-
- // Note: Key required and length validation is handled by kubebuilder validation
- // (+kubebuilder:validation:Required, +kubebuilder:validation:MinLength=1, +kubebuilder:validation:MaxLength=63)
-
- // Validate key format is a valid Kubernetes label key
- keyPath := fldPath.Child("key")
- allErrs = append(allErrs, metav1validation.ValidateLabelName(level.Key, keyPath)...)
-
- return allErrs
-}
-
-// validateTopologyLevelOrder validates that topology levels are in the correct hierarchical order
-// (broadest to narrowest: Region > Zone > DataCenter > Block > Rack > Host > Numa).
-// This function assumes all domains have already been validated to be valid topology domains.
-// This validation also ensures domain uniqueness (duplicate domains would have the same order value).
-func validateTopologyLevelOrder(levels []grovecorev1alpha1.TopologyLevel, fldPath *field.Path) field.ErrorList {
- allErrs := field.ErrorList{}
-
- for i := 1; i < len(levels); i++ {
- prevLevel := levels[i-1]
- currLevel := levels[i]
-
- // Current level must be narrower than previous level
- if currLevel.BroaderThan(prevLevel) {
- allErrs = append(allErrs, field.Invalid(fldPath.Index(i).Child("domain"), currLevel.Domain,
- fmt.Sprintf("topology levels must be in hierarchical order (broadest to narrowest). Domain '%s' at position %d cannot come after '%s' at position %d",
- currLevel.Domain, i, prevLevel.Domain, i-1)))
- } else if currLevel.Compare(prevLevel) == 0 {
- allErrs = append(allErrs, field.Duplicate(fldPath.Index(i).Child("domain"), currLevel.Domain))
- }
- }
-
- return allErrs
-}
-
-// validateTopologyLevelKeyUniqueness validates that all keys in the topology levels are unique.
-func validateTopologyLevelKeyUniqueness(levels []grovecorev1alpha1.TopologyLevel, fldPath *field.Path) field.ErrorList {
- allErrs := field.ErrorList{}
- seenKeys := make(map[string]int) // map key to first index where it appears
-
- for i, level := range levels {
- if firstIndex, exists := seenKeys[level.Key]; exists {
- allErrs = append(allErrs, field.Invalid(fldPath.Index(i).Child("key"), level.Key,
- fmt.Sprintf("duplicate key: already used at levels[%d]", firstIndex)))
- } else {
- seenKeys[level.Key] = i
- }
- }
-
- return allErrs
-}
-
-// ---------------------------- validate update of ClusterTopology -----------------------------------------------
-
-// validateUpdate validates the update to a ClusterTopology object.
-func (v *clusterTopologyValidator) validateUpdate(oldClusterTopology *grovecorev1alpha1.ClusterTopology) error {
- allErrs := field.ErrorList{}
- allErrs = append(allErrs, v.validateClusterTopologySpecUpdate(&v.clusterTopology.Spec, &oldClusterTopology.Spec, field.NewPath("spec"))...)
- return allErrs.ToAggregate()
-}
-
-// validateClusterTopologySpecUpdate validates updates to the ClusterTopology specification.
-func (v *clusterTopologyValidator) validateClusterTopologySpecUpdate(oldSpec, newSpec *grovecorev1alpha1.ClusterTopologySpec, fldPath *field.Path) field.ErrorList {
- allErrs := field.ErrorList{}
-
- levelsPath := fldPath.Child("levels")
-
- // Validate that the number of levels hasn't changed
- if len(newSpec.Levels) != len(oldSpec.Levels) {
- allErrs = append(allErrs, field.Forbidden(levelsPath, "not allowed to add or remove topology levels"))
- return allErrs
- }
-
- // Validate that the order and domains of levels haven't changed
- allErrs = append(allErrs, validateTopologyLevelImmutableFields(newSpec.Levels, oldSpec.Levels, levelsPath)...)
-
- return allErrs
-}
-
-// validateTopologyLevelImmutableFields validates that immutable fields in topology levels haven't changed during an update.
-func validateTopologyLevelImmutableFields(newLevels, oldLevels []grovecorev1alpha1.TopologyLevel, fldPath *field.Path) field.ErrorList {
- allErrs := field.ErrorList{}
-
- for i := range newLevels {
- levelPath := fldPath.Index(i)
-
- allErrs = append(allErrs, apivalidation.ValidateImmutableField(newLevels[i].Domain, oldLevels[i].Domain, levelPath.Child("domain"))...)
- // Note: Key is allowed to change (not in the requirements), but validation has already occurred in validate()
- }
-
- return allErrs
-}
diff --git a/operator/internal/webhook/admission/clustertopology/validation/clustertopology_crd_test.go b/operator/internal/webhook/admission/clustertopology/validation/clustertopology_crd_test.go
deleted file mode 100644
index 20c774b0a..000000000
--- a/operator/internal/webhook/admission/clustertopology/validation/clustertopology_crd_test.go
+++ /dev/null
@@ -1,252 +0,0 @@
-// /*
-// Copyright 2025 The Grove Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// */
-
-package validation
-
-import (
- "context"
- "os"
- "testing"
- "time"
-
- grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1"
-
- "github.com/stretchr/testify/assert"
- "github.com/stretchr/testify/require"
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- "k8s.io/apimachinery/pkg/runtime"
- utilruntime "k8s.io/apimachinery/pkg/util/runtime"
- "k8s.io/client-go/kubernetes/scheme"
- "sigs.k8s.io/controller-runtime/pkg/client"
- "sigs.k8s.io/controller-runtime/pkg/envtest"
-)
-
-func TestClusterTopologyCRDValidation(t *testing.T) {
- testEnv, k8sClient := setupEnvTest(t)
- defer teardownEnvTest(t, testEnv)
-
- tests := []struct {
- name string
- ct *grovecorev1alpha1.ClusterTopology
- expectError bool
- errorContains string
- }{
- {
- name: "valid cluster topology with single level",
- ct: &grovecorev1alpha1.ClusterTopology{
- ObjectMeta: metav1.ObjectMeta{Name: "test-valid-single"},
- Spec: grovecorev1alpha1.ClusterTopologySpec{
- Levels: []grovecorev1alpha1.TopologyLevel{
- {
- Domain: grovecorev1alpha1.TopologyDomainRegion,
- Key: "topology.kubernetes.io/region",
- },
- },
- },
- },
- expectError: false,
- },
- {
- name: "valid cluster topology with multiple levels",
- ct: &grovecorev1alpha1.ClusterTopology{
- ObjectMeta: metav1.ObjectMeta{Name: "test-valid-multiple"},
- Spec: grovecorev1alpha1.ClusterTopologySpec{
- Levels: []grovecorev1alpha1.TopologyLevel{
- {
- Domain: grovecorev1alpha1.TopologyDomainRegion,
- Key: "topology.kubernetes.io/region",
- },
- {
- Domain: grovecorev1alpha1.TopologyDomainZone,
- Key: "topology.kubernetes.io/zone",
- },
- {
- Domain: grovecorev1alpha1.TopologyDomainHost,
- Key: "kubernetes.io/hostname",
- },
- },
- },
- },
- expectError: false,
- },
- {
- name: "reject invalid domain",
- ct: &grovecorev1alpha1.ClusterTopology{
- ObjectMeta: metav1.ObjectMeta{Name: "test-invalid-domain"},
- Spec: grovecorev1alpha1.ClusterTopologySpec{
- Levels: []grovecorev1alpha1.TopologyLevel{
- {
- Domain: "invalid-domain", // Not in enum
- Key: "test.io/key",
- },
- },
- },
- },
- expectError: true,
- errorContains: "supported values",
- },
- {
- name: "reject empty key",
- ct: &grovecorev1alpha1.ClusterTopology{
- ObjectMeta: metav1.ObjectMeta{Name: "test-empty-key"},
- Spec: grovecorev1alpha1.ClusterTopologySpec{
- Levels: []grovecorev1alpha1.TopologyLevel{
- {
- Domain: grovecorev1alpha1.TopologyDomainRegion,
- Key: "", // Empty key
- },
- },
- },
- },
- expectError: true,
- errorContains: "spec.levels[0].key",
- },
- {
- name: "reject key too long",
- ct: &grovecorev1alpha1.ClusterTopology{
- ObjectMeta: metav1.ObjectMeta{Name: "test-key-too-long"},
- Spec: grovecorev1alpha1.ClusterTopologySpec{
- Levels: []grovecorev1alpha1.TopologyLevel{
- {
- Domain: grovecorev1alpha1.TopologyDomainRegion,
- // Create a key longer than 63 characters (MaxLength validation)
- Key: "this-is-a-very-long-key-that-exceeds-the-maximum-allowed-length-of-63-characters",
- },
- },
- },
- },
- expectError: true,
- errorContains: "spec.levels[0].key",
- },
- {
- name: "reject empty levels array",
- ct: &grovecorev1alpha1.ClusterTopology{
- ObjectMeta: metav1.ObjectMeta{Name: "test-empty-levels"},
- Spec: grovecorev1alpha1.ClusterTopologySpec{
- Levels: []grovecorev1alpha1.TopologyLevel{}, // Empty array
- },
- },
- expectError: true,
- errorContains: "spec.levels",
- },
- {
- name: "reject too many levels",
- ct: &grovecorev1alpha1.ClusterTopology{
- ObjectMeta: metav1.ObjectMeta{Name: "test-too-many-levels"},
- Spec: grovecorev1alpha1.ClusterTopologySpec{
- Levels: []grovecorev1alpha1.TopologyLevel{
- {Domain: grovecorev1alpha1.TopologyDomainRegion, Key: "key1"},
- {Domain: grovecorev1alpha1.TopologyDomainZone, Key: "key2"},
- {Domain: grovecorev1alpha1.TopologyDomainDataCenter, Key: "key3"},
- {Domain: grovecorev1alpha1.TopologyDomainBlock, Key: "key4"},
- {Domain: grovecorev1alpha1.TopologyDomainRack, Key: "key5"},
- {Domain: grovecorev1alpha1.TopologyDomainHost, Key: "key6"},
- {Domain: grovecorev1alpha1.TopologyDomainNuma, Key: "key7"},
- {Domain: "region", Key: "key8"}, // 8th level - exceeds max
- },
- },
- },
- expectError: true,
- errorContains: "spec.levels",
- },
- {
- name: "reject missing domain",
- ct: &grovecorev1alpha1.ClusterTopology{
- ObjectMeta: metav1.ObjectMeta{Name: "test-missing-domain"},
- Spec: grovecorev1alpha1.ClusterTopologySpec{
- Levels: []grovecorev1alpha1.TopologyLevel{
- {
- Key: "test.io/key", // Missing domain
- },
- },
- },
- },
- expectError: true,
- // API server should reject due to Required validation
- },
- {
- name: "reject missing key",
- ct: &grovecorev1alpha1.ClusterTopology{
- ObjectMeta: metav1.ObjectMeta{Name: "test-missing-key"},
- Spec: grovecorev1alpha1.ClusterTopologySpec{
- Levels: []grovecorev1alpha1.TopologyLevel{
- {
- Domain: grovecorev1alpha1.TopologyDomainRegion,
- // Missing key
- },
- },
- },
- },
- expectError: true,
- // API server should reject due to Required validation
- },
- }
-
- for _, tt := range tests {
- t.Run(tt.name, func(t *testing.T) {
- err := k8sClient.Create(context.Background(), tt.ct)
-
- if tt.expectError {
- assert.Error(t, err)
- if tt.errorContains != "" {
- assert.Contains(t, err.Error(), tt.errorContains)
- }
- } else {
- assert.NoError(t, err)
- // Clean up successfully created resource
- _ = k8sClient.Delete(context.Background(), tt.ct)
- }
- })
- }
-}
-
-// setupEnvTest creates an envtest environment with ClusterTopology CRDs loaded
-func setupEnvTest(t *testing.T) (*envtest.Environment, client.Client) {
- // Check if KUBEBUILDER_ASSETS is set
- if os.Getenv("KUBEBUILDER_ASSETS") == "" {
- t.Skip("Skipping envtest: KUBEBUILDER_ASSETS not set. Run 'make test-envtest' to execute this test.")
- }
-
- // Point to CRD directory (relative from this test file location)
- crdPath := "../../../../../api/core/v1alpha1/crds"
-
- testEnv := &envtest.Environment{
- CRDDirectoryPaths: []string{crdPath},
- ErrorIfCRDPathMissing: true,
- }
-
- cfg, err := testEnv.Start()
- require.NoError(t, err)
-
- // Create client with proper scheme
- testScheme := runtime.NewScheme()
- utilruntime.Must(scheme.AddToScheme(testScheme))
- utilruntime.Must(grovecorev1alpha1.AddToScheme(testScheme))
-
- k8sClient, err := client.New(cfg, client.Options{Scheme: testScheme})
- require.NoError(t, err)
-
- // Give CRDs a moment to be fully ready
- time.Sleep(100 * time.Millisecond)
-
- return testEnv, k8sClient
-}
-
-// teardownEnvTest stops the envtest environment
-func teardownEnvTest(t *testing.T, testEnv *envtest.Environment) {
- err := testEnv.Stop()
- require.NoError(t, err)
-}
diff --git a/operator/internal/webhook/admission/clustertopology/validation/clustertopology_test.go b/operator/internal/webhook/admission/clustertopology/validation/clustertopology_test.go
deleted file mode 100644
index 1e4da5c58..000000000
--- a/operator/internal/webhook/admission/clustertopology/validation/clustertopology_test.go
+++ /dev/null
@@ -1,500 +0,0 @@
-// /*
-// Copyright 2025 The Grove Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// */
-
-package validation
-
-import (
- "context"
- "testing"
-
- grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1"
-
- "github.com/go-logr/logr"
- "github.com/stretchr/testify/assert"
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-)
-
-func TestValidateCreate(t *testing.T) {
- tests := []struct {
- name string
- clusterTopology *grovecorev1alpha1.ClusterTopology
- expectedErr bool
- expectedErrMsg string
- }{
- {
- name: "valid cluster topology with single level",
- clusterTopology: &grovecorev1alpha1.ClusterTopology{
- ObjectMeta: metav1.ObjectMeta{
- Name: "test-topology",
- },
- Spec: grovecorev1alpha1.ClusterTopologySpec{
- Levels: []grovecorev1alpha1.TopologyLevel{
- {
- Domain: grovecorev1alpha1.TopologyDomainZone,
- Key: "topology.kubernetes.io/zone",
- },
- },
- },
- },
- expectedErr: false,
- },
- {
- name: "valid cluster topology with multiple levels",
- clusterTopology: &grovecorev1alpha1.ClusterTopology{
- ObjectMeta: metav1.ObjectMeta{
- Name: "test-topology",
- },
- Spec: grovecorev1alpha1.ClusterTopologySpec{
- Levels: []grovecorev1alpha1.TopologyLevel{
- {
- Domain: grovecorev1alpha1.TopologyDomainRegion,
- Key: "topology.kubernetes.io/region",
- },
- {
- Domain: grovecorev1alpha1.TopologyDomainZone,
- Key: "topology.kubernetes.io/zone",
- },
- {
- Domain: grovecorev1alpha1.TopologyDomainHost,
- Key: "kubernetes.io/hostname",
- },
- },
- },
- },
- expectedErr: false,
- },
- {
- name: "invalid - duplicate domain (caught by order validation)",
- clusterTopology: &grovecorev1alpha1.ClusterTopology{
- ObjectMeta: metav1.ObjectMeta{
- Name: "test-topology",
- },
- Spec: grovecorev1alpha1.ClusterTopologySpec{
- Levels: []grovecorev1alpha1.TopologyLevel{
- {
- Domain: grovecorev1alpha1.TopologyDomainZone,
- Key: "topology.kubernetes.io/zone",
- },
- {
- Domain: grovecorev1alpha1.TopologyDomainZone,
- Key: "custom.io/zone",
- },
- },
- },
- },
- expectedErr: true,
- expectedErrMsg: "Duplicate value",
- },
- {
- name: "invalid - duplicate key",
- clusterTopology: &grovecorev1alpha1.ClusterTopology{
- ObjectMeta: metav1.ObjectMeta{
- Name: "test-topology",
- },
- Spec: grovecorev1alpha1.ClusterTopologySpec{
- Levels: []grovecorev1alpha1.TopologyLevel{
- {
- Domain: grovecorev1alpha1.TopologyDomainRegion,
- Key: "topology.kubernetes.io/zone",
- },
- {
- Domain: grovecorev1alpha1.TopologyDomainZone,
- Key: "topology.kubernetes.io/zone",
- },
- },
- },
- },
- expectedErr: true,
- expectedErrMsg: "duplicate key",
- },
- {
- name: "invalid - key not a valid label (has space)",
- clusterTopology: &grovecorev1alpha1.ClusterTopology{
- ObjectMeta: metav1.ObjectMeta{
- Name: "test-topology",
- },
- Spec: grovecorev1alpha1.ClusterTopologySpec{
- Levels: []grovecorev1alpha1.TopologyLevel{
- {
- Domain: grovecorev1alpha1.TopologyDomainZone,
- Key: "Invalid value",
- },
- },
- },
- },
- expectedErr: true,
- expectedErrMsg: "name part must consist of alphanumeric characters",
- },
- {
- name: "invalid - key prefix has invalid characters (double dots)",
- clusterTopology: &grovecorev1alpha1.ClusterTopology{
- ObjectMeta: metav1.ObjectMeta{
- Name: "test-topology",
- },
- Spec: grovecorev1alpha1.ClusterTopologySpec{
- Levels: []grovecorev1alpha1.TopologyLevel{
- {
- Domain: grovecorev1alpha1.TopologyDomainZone,
- Key: "invalid..label/key",
- },
- },
- },
- },
- expectedErr: true,
- expectedErrMsg: "prefix part",
- },
- {
- name: "invalid - key name part too long",
- clusterTopology: &grovecorev1alpha1.ClusterTopology{
- ObjectMeta: metav1.ObjectMeta{
- Name: "test-topology",
- },
- Spec: grovecorev1alpha1.ClusterTopologySpec{
- Levels: []grovecorev1alpha1.TopologyLevel{
- {
- Domain: grovecorev1alpha1.TopologyDomainZone,
- Key: "example.com/this-is-a-very-long-name-that-exceeds-the-maximum-length-of-sixtythree-characters",
- },
- },
- },
- },
- expectedErr: true,
- expectedErrMsg: "name part must be no more than 63 characters",
- },
- {
- name: "invalid - levels out of order (zone before region)",
- clusterTopology: &grovecorev1alpha1.ClusterTopology{
- ObjectMeta: metav1.ObjectMeta{
- Name: "test-topology",
- },
- Spec: grovecorev1alpha1.ClusterTopologySpec{
- Levels: []grovecorev1alpha1.TopologyLevel{
- {
- Domain: grovecorev1alpha1.TopologyDomainZone,
- Key: "topology.kubernetes.io/zone",
- },
- {
- Domain: grovecorev1alpha1.TopologyDomainRegion,
- Key: "topology.kubernetes.io/region",
- },
- },
- },
- },
- expectedErr: true,
- expectedErrMsg: "topology levels must be in hierarchical order",
- },
- {
- name: "valid - correct hierarchical order (region > zone > datacenter > rack > host > numa)",
- clusterTopology: &grovecorev1alpha1.ClusterTopology{
- ObjectMeta: metav1.ObjectMeta{
- Name: "test-topology",
- },
- Spec: grovecorev1alpha1.ClusterTopologySpec{
- Levels: []grovecorev1alpha1.TopologyLevel{
- {
- Domain: grovecorev1alpha1.TopologyDomainRegion,
- Key: "topology.kubernetes.io/region",
- },
- {
- Domain: grovecorev1alpha1.TopologyDomainZone,
- Key: "topology.kubernetes.io/zone",
- },
- {
- Domain: grovecorev1alpha1.TopologyDomainDataCenter,
- Key: "topology.kubernetes.io/datacenter",
- },
- {
- Domain: grovecorev1alpha1.TopologyDomainRack,
- Key: "topology.kubernetes.io/rack",
- },
- {
- Domain: grovecorev1alpha1.TopologyDomainHost,
- Key: "kubernetes.io/hostname",
- },
- {
- Domain: grovecorev1alpha1.TopologyDomainNuma,
- Key: "topology.kubernetes.io/numa",
- },
- },
- },
- },
- expectedErr: false,
- },
- }
-
- handler := &Handler{logger: logr.Discard()}
- ctx := context.Background()
-
- for _, tt := range tests {
- t.Run(tt.name, func(t *testing.T) {
- _, err := handler.ValidateCreate(ctx, tt.clusterTopology)
-
- if tt.expectedErr {
- assert.Error(t, err)
- if tt.expectedErrMsg != "" {
- assert.Contains(t, err.Error(), tt.expectedErrMsg)
- }
- } else {
- assert.NoError(t, err)
- }
- })
- }
-}
-
-func TestValidateUpdate(t *testing.T) {
- tests := []struct {
- name string
- oldClusterTopology *grovecorev1alpha1.ClusterTopology
- newClusterTopology *grovecorev1alpha1.ClusterTopology
- expectedErr bool
- expectedErrMsg string
- }{
- {
- name: "valid update - key changed",
- oldClusterTopology: &grovecorev1alpha1.ClusterTopology{
- ObjectMeta: metav1.ObjectMeta{
- Name: "test-topology",
- },
- Spec: grovecorev1alpha1.ClusterTopologySpec{
- Levels: []grovecorev1alpha1.TopologyLevel{
- {
- Domain: grovecorev1alpha1.TopologyDomainZone,
- Key: "topology.kubernetes.io/zone",
- },
- },
- },
- },
- newClusterTopology: &grovecorev1alpha1.ClusterTopology{
- ObjectMeta: metav1.ObjectMeta{
- Name: "test-topology",
- },
- Spec: grovecorev1alpha1.ClusterTopologySpec{
- Levels: []grovecorev1alpha1.TopologyLevel{
- {
- Domain: grovecorev1alpha1.TopologyDomainZone,
- Key: "custom.io/zone",
- },
- },
- },
- },
- expectedErr: false,
- },
- {
- name: "invalid - domain changed",
- oldClusterTopology: &grovecorev1alpha1.ClusterTopology{
- ObjectMeta: metav1.ObjectMeta{
- Name: "test-topology",
- },
- Spec: grovecorev1alpha1.ClusterTopologySpec{
- Levels: []grovecorev1alpha1.TopologyLevel{
- {
- Domain: grovecorev1alpha1.TopologyDomainZone,
- Key: "topology.kubernetes.io/zone",
- },
- },
- },
- },
- newClusterTopology: &grovecorev1alpha1.ClusterTopology{
- ObjectMeta: metav1.ObjectMeta{
- Name: "test-topology",
- },
- Spec: grovecorev1alpha1.ClusterTopologySpec{
- Levels: []grovecorev1alpha1.TopologyLevel{
- {
- Domain: grovecorev1alpha1.TopologyDomainRegion,
- Key: "topology.kubernetes.io/region",
- },
- },
- },
- },
- expectedErr: true,
- expectedErrMsg: "field is immutable",
- },
- {
- name: "invalid - level added",
- oldClusterTopology: &grovecorev1alpha1.ClusterTopology{
- ObjectMeta: metav1.ObjectMeta{
- Name: "test-topology",
- },
- Spec: grovecorev1alpha1.ClusterTopologySpec{
- Levels: []grovecorev1alpha1.TopologyLevel{
- {
- Domain: grovecorev1alpha1.TopologyDomainZone,
- Key: "topology.kubernetes.io/zone",
- },
- },
- },
- },
- newClusterTopology: &grovecorev1alpha1.ClusterTopology{
- ObjectMeta: metav1.ObjectMeta{
- Name: "test-topology",
- },
- Spec: grovecorev1alpha1.ClusterTopologySpec{
- Levels: []grovecorev1alpha1.TopologyLevel{
- {
- Domain: grovecorev1alpha1.TopologyDomainZone,
- Key: "topology.kubernetes.io/zone",
- },
- {
- Domain: grovecorev1alpha1.TopologyDomainHost,
- Key: "kubernetes.io/hostname",
- },
- },
- },
- },
- expectedErr: true,
- expectedErrMsg: "not allowed to add or remove topology levels",
- },
- {
- name: "invalid - level removed",
- oldClusterTopology: &grovecorev1alpha1.ClusterTopology{
- ObjectMeta: metav1.ObjectMeta{
- Name: "test-topology",
- },
- Spec: grovecorev1alpha1.ClusterTopologySpec{
- Levels: []grovecorev1alpha1.TopologyLevel{
- {
- Domain: grovecorev1alpha1.TopologyDomainRegion,
- Key: "topology.kubernetes.io/region",
- },
- {
- Domain: grovecorev1alpha1.TopologyDomainZone,
- Key: "topology.kubernetes.io/zone",
- },
- },
- },
- },
- newClusterTopology: &grovecorev1alpha1.ClusterTopology{
- ObjectMeta: metav1.ObjectMeta{
- Name: "test-topology",
- },
- Spec: grovecorev1alpha1.ClusterTopologySpec{
- Levels: []grovecorev1alpha1.TopologyLevel{
- {
- Domain: grovecorev1alpha1.TopologyDomainRegion,
- Key: "topology.kubernetes.io/region",
- },
- },
- },
- },
- expectedErr: true,
- expectedErrMsg: "not allowed to add or remove topology levels",
- },
- {
- name: "invalid - update creates duplicate key",
- oldClusterTopology: &grovecorev1alpha1.ClusterTopology{
- ObjectMeta: metav1.ObjectMeta{
- Name: "test-topology",
- },
- Spec: grovecorev1alpha1.ClusterTopologySpec{
- Levels: []grovecorev1alpha1.TopologyLevel{
- {
- Domain: grovecorev1alpha1.TopologyDomainRegion,
- Key: "topology.kubernetes.io/region",
- },
- {
- Domain: grovecorev1alpha1.TopologyDomainZone,
- Key: "topology.kubernetes.io/zone",
- },
- },
- },
- },
- newClusterTopology: &grovecorev1alpha1.ClusterTopology{
- ObjectMeta: metav1.ObjectMeta{
- Name: "test-topology",
- },
- Spec: grovecorev1alpha1.ClusterTopologySpec{
- Levels: []grovecorev1alpha1.TopologyLevel{
- {
- Domain: grovecorev1alpha1.TopologyDomainRegion,
- Key: "topology.kubernetes.io/zone",
- },
- {
- Domain: grovecorev1alpha1.TopologyDomainZone,
- Key: "topology.kubernetes.io/zone",
- },
- },
- },
- },
- expectedErr: true,
- expectedErrMsg: "duplicate key",
- },
- {
- name: "valid update - multiple levels, keys changed",
- oldClusterTopology: &grovecorev1alpha1.ClusterTopology{
- ObjectMeta: metav1.ObjectMeta{
- Name: "test-topology",
- },
- Spec: grovecorev1alpha1.ClusterTopologySpec{
- Levels: []grovecorev1alpha1.TopologyLevel{
- {
- Domain: grovecorev1alpha1.TopologyDomainRegion,
- Key: "topology.kubernetes.io/region",
- },
- {
- Domain: grovecorev1alpha1.TopologyDomainZone,
- Key: "topology.kubernetes.io/zone",
- },
- {
- Domain: grovecorev1alpha1.TopologyDomainHost,
- Key: "kubernetes.io/hostname",
- },
- },
- },
- },
- newClusterTopology: &grovecorev1alpha1.ClusterTopology{
- ObjectMeta: metav1.ObjectMeta{
- Name: "test-topology",
- },
- Spec: grovecorev1alpha1.ClusterTopologySpec{
- Levels: []grovecorev1alpha1.TopologyLevel{
- {
- Domain: grovecorev1alpha1.TopologyDomainRegion,
- Key: "custom.io/region",
- },
- {
- Domain: grovecorev1alpha1.TopologyDomainZone,
- Key: "custom.io/zone",
- },
- {
- Domain: grovecorev1alpha1.TopologyDomainHost,
- Key: "custom.io/host",
- },
- },
- },
- },
- expectedErr: false,
- },
- }
-
- handler := &Handler{logger: logr.Discard()}
- ctx := context.Background()
-
- for _, tt := range tests {
- t.Run(tt.name, func(t *testing.T) {
- _, err := handler.ValidateUpdate(ctx, tt.oldClusterTopology, tt.newClusterTopology)
-
- if tt.expectedErr {
- assert.Error(t, err)
- if tt.expectedErrMsg != "" {
- assert.Contains(t, err.Error(), tt.expectedErrMsg)
- }
- } else {
- assert.NoError(t, err)
- }
- })
- }
-}
diff --git a/operator/internal/webhook/admission/clustertopology/validation/handler.go b/operator/internal/webhook/admission/clustertopology/validation/handler.go
deleted file mode 100644
index fb1e7dba8..000000000
--- a/operator/internal/webhook/admission/clustertopology/validation/handler.go
+++ /dev/null
@@ -1,102 +0,0 @@
-// /*
-// Copyright 2025 The Grove Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// */
-
-package validation
-
-import (
- "context"
- "fmt"
-
- "github.com/ai-dynamo/grove/operator/api/core/v1alpha1"
- "github.com/ai-dynamo/grove/operator/internal/errors"
-
- "github.com/go-logr/logr"
- admissionv1 "k8s.io/api/admission/v1"
- "k8s.io/apimachinery/pkg/runtime"
- "sigs.k8s.io/controller-runtime/pkg/manager"
- "sigs.k8s.io/controller-runtime/pkg/webhook/admission"
-)
-
-const (
- // ErrValidateCreateClusterTopology is the error code returned where the request to create a ClusterTopology is invalid.
- ErrValidateCreateClusterTopology v1alpha1.ErrorCode = "ERR_VALIDATE_CREATE_CLUSTERTOPOLOGY"
- // ErrValidateUpdateClusterTopology is the error code returned where the request to update a ClusterTopology is invalid.
- ErrValidateUpdateClusterTopology v1alpha1.ErrorCode = "ERR_VALIDATE_UPDATE_CLUSTERTOPOLOGY"
-)
-
-// Handler is a handler for validating ClusterTopology resources.
-type Handler struct {
- logger logr.Logger
-}
-
-// NewHandler creates a new handler for ClusterTopology Webhook.
-func NewHandler(mgr manager.Manager) *Handler {
- return &Handler{
- logger: mgr.GetLogger().WithName("webhook").WithName(Name),
- }
-}
-
-// ValidateCreate validates a ClusterTopology create request.
-func (h *Handler) ValidateCreate(ctx context.Context, obj runtime.Object) (admission.Warnings, error) {
- h.logValidatorFunctionInvocation(ctx)
- clusterTopology, err := castToClusterTopology(obj)
- if err != nil {
- return nil, errors.WrapError(err, ErrValidateCreateClusterTopology, string(admissionv1.Create), "failed to cast object to ClusterTopology")
- }
- return nil, newClusterTopologyValidator(clusterTopology, admissionv1.Create).validate()
-}
-
-// ValidateUpdate validates a ClusterTopology update request.
-func (h *Handler) ValidateUpdate(ctx context.Context, oldObj, newObj runtime.Object) (admission.Warnings, error) {
- h.logValidatorFunctionInvocation(ctx)
- newClusterTopology, err := castToClusterTopology(newObj)
- if err != nil {
- return nil, errors.WrapError(err, ErrValidateUpdateClusterTopology, string(admissionv1.Update), "failed to cast new object to ClusterTopology")
- }
- oldClusterTopology, err := castToClusterTopology(oldObj)
- if err != nil {
- return nil, errors.WrapError(err, ErrValidateUpdateClusterTopology, string(admissionv1.Update), "failed to cast old object to ClusterTopology")
- }
- validator := newClusterTopologyValidator(newClusterTopology, admissionv1.Update)
- if err := validator.validate(); err != nil {
- return nil, err
- }
- return nil, validator.validateUpdate(oldClusterTopology)
-}
-
-// ValidateDelete validates a ClusterTopology delete request.
-func (h *Handler) ValidateDelete(_ context.Context, _ runtime.Object) (admission.Warnings, error) {
- return nil, nil
-}
-
-// castToClusterTopology attempts to cast a runtime.Object to a ClusterTopology.
-func castToClusterTopology(obj runtime.Object) (*v1alpha1.ClusterTopology, error) {
- clusterTopology, ok := obj.(*v1alpha1.ClusterTopology)
- if !ok {
- return nil, fmt.Errorf("expected a ClusterTopology object but got %T", obj)
- }
- return clusterTopology, nil
-}
-
-// logValidatorFunctionInvocation logs details about the validation request including user and operation information.
-func (h *Handler) logValidatorFunctionInvocation(ctx context.Context) {
- req, err := admission.RequestFromContext(ctx)
- if err != nil {
- h.logger.Error(err, "failed to get request from context")
- return
- }
- h.logger.Info("ClusterTopology validation webhook invoked", "name", req.Name, "namespace", req.Namespace, "operation", req.Operation, "user", req.UserInfo.Username)
-}
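
The handler deleted above implemented controller-runtime's three-method validator contract (ValidateCreate/ValidateUpdate/ValidateDelete returning admission.Warnings). As a minimal sketch, assuming controller-runtime v0.15+ where that interface lives in sigs.k8s.io/controller-runtime/pkg/webhook/admission, a compile-time assertion of this shape would have guarded the contract:

```go
package validation

import (
	"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
)

// Compile-time guard: the build breaks if *Handler ever stops satisfying
// the CustomValidator contract that admission.WithCustomValidator expects.
var _ admission.CustomValidator = &Handler{}
```
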
diff --git a/operator/internal/webhook/admission/clustertopology/validation/register.go b/operator/internal/webhook/admission/clustertopology/validation/register.go
deleted file mode 100644
index 76c579577..000000000
--- a/operator/internal/webhook/admission/clustertopology/validation/register.go
+++ /dev/null
@@ -1,39 +0,0 @@
-// /*
-// Copyright 2025 The Grove Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// */
-
-package validation
-
-import (
- "github.com/ai-dynamo/grove/operator/api/core/v1alpha1"
-
- "sigs.k8s.io/controller-runtime/pkg/manager"
- "sigs.k8s.io/controller-runtime/pkg/webhook/admission"
-)
-
-const (
- // Name is the name of the validating webhook handler for ClusterTopology.
- Name = "clustertopology-validating-webhook"
- webhookPath = "/webhooks/validate-clustertopology"
-)
-
-// RegisterWithManager registers the webhook with the manager.
-func (h *Handler) RegisterWithManager(mgr manager.Manager) error {
- webhook := admission.
- WithCustomValidator(mgr.GetScheme(), &v1alpha1.ClusterTopology{}, h).
- WithRecoverPanic(true)
- mgr.GetWebhookServer().Register(webhookPath, webhook)
- return nil
-}
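
The register.go deleted here followed the standard controller-runtime builder chain. A generic sketch of the same wiring, where the resource type and webhook path (myv1.Widget, /webhooks/validate-widget) are illustrative placeholders rather than anything in this repository:

```go
package validation

import (
	"sigs.k8s.io/controller-runtime/pkg/manager"
	"sigs.k8s.io/controller-runtime/pkg/webhook/admission"

	myv1 "example.com/project/api/v1" // hypothetical API package
)

// registerValidator wires a CustomValidator for a CRD type into the
// manager's webhook server at a fixed path.
func registerValidator(mgr manager.Manager, h admission.CustomValidator) {
	wh := admission.
		WithCustomValidator(mgr.GetScheme(), &myv1.Widget{}, h).
		// Convert handler panics into 500 responses instead of
		// crashing the webhook server.
		WithRecoverPanic(true)
	mgr.GetWebhookServer().Register("/webhooks/validate-widget", wh)
}
```
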
diff --git a/operator/internal/webhook/register.go b/operator/internal/webhook/register.go
index b46ea1cb4..ee77da1b3 100644
--- a/operator/internal/webhook/register.go
+++ b/operator/internal/webhook/register.go
@@ -23,7 +23,6 @@ import (
configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1"
"github.com/ai-dynamo/grove/operator/internal/constants"
- ctvalidation "github.com/ai-dynamo/grove/operator/internal/webhook/admission/clustertopology/validation"
"github.com/ai-dynamo/grove/operator/internal/webhook/admission/pcs/authorization"
"github.com/ai-dynamo/grove/operator/internal/webhook/admission/pcs/defaulting"
pcsvalidation "github.com/ai-dynamo/grove/operator/internal/webhook/admission/pcs/validation"
@@ -31,8 +30,8 @@ import (
"sigs.k8s.io/controller-runtime/pkg/manager"
)
-// RegisterWebhooks registers the webhooks with the controller manager.
-func RegisterWebhooks(mgr manager.Manager, authorizerConfig configv1alpha1.AuthorizerConfig) error {
+// Register registers the webhooks with the controller manager.
+func Register(mgr manager.Manager, authorizerConfig configv1alpha1.AuthorizerConfig) error {
defaultingWebhook := defaulting.NewHandler(mgr)
slog.Info("Registering webhook with manager", "handler", defaulting.Name)
if err := defaultingWebhook.RegisterWithManager(mgr); err != nil {
@@ -43,11 +42,6 @@ func RegisterWebhooks(mgr manager.Manager, authorizerConfig configv1alpha1.Autho
if err := pcsValidatingWebhook.RegisterWithManager(mgr); err != nil {
return fmt.Errorf("failed adding %s webhook handler: %v", pcsvalidation.Name, err)
}
- ctValidatingWebhook := ctvalidation.NewHandler(mgr)
- slog.Info("Registering webhook with manager", "handler", ctvalidation.Name)
- if err := ctValidatingWebhook.RegisterWithManager(mgr); err != nil {
- return fmt.Errorf("failed adding %s webhook handler: %v", ctvalidation.Name, err)
- }
if authorizerConfig.Enabled {
serviceAccountName, ok := os.LookupEnv(constants.EnvVarServiceAccountName)
if !ok {
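
With the rename, call sites shorten from webhook.RegisterWebhooks to webhook.Register. A hedged sketch of a typical call site; setupWebhooks and the way the AuthorizerConfig is obtained are assumptions for illustration, and only webhook.Register itself comes from this diff:

```go
package app

import (
	"fmt"

	configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1"
	"github.com/ai-dynamo/grove/operator/internal/webhook"

	"sigs.k8s.io/controller-runtime/pkg/manager"
)

// setupWebhooks registers all admission webhooks on the manager.
func setupWebhooks(mgr manager.Manager, authorizerConfig configv1alpha1.AuthorizerConfig) error {
	if err := webhook.Register(mgr, authorizerConfig); err != nil {
		return fmt.Errorf("failed to register webhooks: %w", err)
	}
	return nil
}
```
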
diff --git a/operator/internal/webhook/register_test.go b/operator/internal/webhook/register_test.go
index 18403e87b..430adeb61 100644
--- a/operator/internal/webhook/register_test.go
+++ b/operator/internal/webhook/register_test.go
@@ -91,7 +91,7 @@ func TestRegisterWebhooks_WithoutAuthorizer(t *testing.T) {
Enabled: false,
}
- err := RegisterWebhooks(mgr, authorizerConfig)
+ err := Register(mgr, authorizerConfig)
require.NoError(t, err)
}
@@ -120,7 +120,7 @@ func TestRegisterWebhooks_WithAuthorizerMissingEnvVar(t *testing.T) {
Enabled: true,
}
- err = RegisterWebhooks(mgr, authorizerConfig)
+ err = Register(mgr, authorizerConfig)
require.Error(t, err)
assert.Contains(t, err.Error(), constants.EnvVarServiceAccountName)
}
@@ -149,7 +149,7 @@ func TestRegisterWebhooks_WithAuthorizerMissingNamespaceFile(t *testing.T) {
Enabled: true,
}
- err := RegisterWebhooks(mgr, authorizerConfig)
+ err := Register(mgr, authorizerConfig)
require.Error(t, err)
assert.Contains(t, err.Error(), "error reading namespace file")
}
@@ -194,7 +194,7 @@ func TestRegisterWebhooks_WithAuthorizerSuccess(t *testing.T) {
Enabled: true,
}
- err = RegisterWebhooks(mgr, authorizerConfig)
+ err = Register(mgr, authorizerConfig)
// Will error because it tries to read the hardcoded namespace file path
require.Error(t, err)
}
diff --git a/operator/skaffold.yaml b/operator/skaffold.yaml
index 2f58e72af..a7a5e73bf 100644
--- a/operator/skaffold.yaml
+++ b/operator/skaffold.yaml
@@ -33,8 +33,6 @@ build:
- VERSION
flags:
- -v
- ldflags:
- - '{{.LD_FLAGS}}'
main: ./initc/cmd
deploy:
helm: # see https://skaffold.dev/docs/deployers/helm/
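
Removing the ldflags stanza means skaffold no longer stamps the -X version variables into the initc binary. One alternative, assumed here rather than introduced by this change, is the VCS metadata the Go toolchain embeds on its own (Go 1.18+ building from a git checkout), readable via runtime/debug:

```go
package main

import (
	"fmt"
	"runtime/debug"
)

// gitCommit returns the VCS revision embedded by the Go toolchain at build
// time, or "unknown" when built outside a checkout (or with -buildvcs=false).
func gitCommit() string {
	info, ok := debug.ReadBuildInfo()
	if !ok {
		return "unknown"
	}
	for _, s := range info.Settings {
		if s.Key == "vcs.revision" {
			return s.Value
		}
	}
	return "unknown"
}

func main() {
	fmt.Println(gitCommit())
}
```
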