diff --git a/docs/api-reference/operator-api.md b/docs/api-reference/operator-api.md index 4e93f333b..cec98f706 100644 --- a/docs/api-reference/operator-api.md +++ b/docs/api-reference/operator-api.md @@ -608,9 +608,7 @@ _Appears in:_ _Underlying type:_ _string_ -TopologyDomain represents a predefined topology level in the hierarchy. -Topology ordering (broadest to narrowest): -Region > Zone > DataCenter > Block > Rack > Host > Numa +TopologyDomain represents a level in the cluster topology hierarchy. @@ -634,16 +632,19 @@ _Appears in:_ TopologyLevel defines a single level in the topology hierarchy. +Maps a platform-agnostic domain to a platform-specific node label key, +giving workload operators a consistent way to reference topology levels when defining TopologyConstraints. _Appears in:_ +- [ClusterTopologyConfiguration](#clustertopologyconfiguration) - [ClusterTopologySpec](#clustertopologyspec) | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `domain` _[TopologyDomain](#topologydomain)_ | Domain is the predefined level identifier used in TopologyConstraint references.<br />
Must be one of: region, zone, datacenter, block, rack, host, numa | | Enum: [region zone datacenter block rack host numa]
Required: \{\}
| -| `key` _string_ | Key is the node label key that identifies this topology domain.
Must be a valid Kubernetes label key (qualified name).
Examples: "topology.kubernetes.io/zone", "kubernetes.io/hostname" | | MaxLength: 63
MinLength: 1
Required: \{\}
| +| `domain` _[TopologyDomain](#topologydomain)_ | Domain is a platform provider-agnostic level identifier.
Must be one of: region, zone, datacenter, block, rack, host, numa | | Enum: [region zone datacenter block rack host numa]
Required: \{\}
| +| `key` _string_ | Key is the node label key that identifies this topology domain.
Must be a valid Kubernetes label key (qualified name).
Examples: "topology.kubernetes.io/zone", "kubernetes.io/hostname" | | MaxLength: 63
MinLength: 1
Pattern: `^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]/)?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$`
Required: \{\}
| @@ -702,7 +703,7 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | | `enabled` _boolean_ | Enabled indicates whether topology-aware scheduling is enabled. | | | -| `name` _string_ | Name is the ClusterTopology resource name to use.
Defaults to "grove-topology" if not specified when topology is enabled. | | | +| `levels` _[TopologyLevel](#topologylevel) array_ | Levels is an ordered list of topology levels from broadest to narrowest scope.
Used to create/update the ClusterTopology CR at operator startup. | | | #### ControllerConfiguration diff --git a/docs/designs/topology.md b/docs/designs/topology.md index 3d761add4..c83dfe76a 100644 --- a/docs/designs/topology.md +++ b/docs/designs/topology.md @@ -172,6 +172,7 @@ Domain TopologyDomain `json:"domain"` // +kubebuilder:validation:Required // +kubebuilder:validation:MinLength=1 // +kubebuilder:validation:MaxLength=63 +// +kubebuilder:validation:Pattern=`^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]/)?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$` Key string `json:"key"` } diff --git a/hack/ld-flags.sh b/hack/ld-flags.sh index 6eed0ab8f..387eb2794 100644 --- a/hack/ld-flags.sh +++ b/hack/ld-flags.sh @@ -39,8 +39,7 @@ function build_ld_flags() { FLAGS="-X $PACKAGE_PATH/version.gitCommit=$(git rev-parse --verify HEAD) -X $PACKAGE_PATH/version.gitTreeState=$tree_state - -X $PACKAGE_PATH/version.buildDate=$build_date - -X $PACKAGE_PATH/version.programName=$PROGRAM_NAME" + -X $PACKAGE_PATH/version.buildDate=$build_date" # The k8s.component-base/version.gitVersion can not be set to the version of grove # due to the error: "emulation version 1.33 is not between [1.31, 0.1.0-dev]". diff --git a/operator/Makefile b/operator/Makefile index 9c89712af..0645cbbaa 100644 --- a/operator/Makefile +++ b/operator/Makefile @@ -86,12 +86,6 @@ cover-html: test-cover @go tool cover -html=coverage.out -o coverage.html @echo "Coverage report generated at coverage.html" -# Run envtest tests (requires envtest binaries) -.PHONY: test-envtest -test-envtest: $(SETUP_ENVTEST) - @echo "Running envtest with CRD validation..." - @KUBEBUILDER_ASSETS=$$($(SETUP_ENVTEST) use -p path) go test ./internal/webhook/admission/clustertopology/validation -v -run TestClusterTopologyCRDValidation - # Run e2e tests .PHONY: test-e2e test-e2e: diff --git a/operator/api/common/constants/constants.go b/operator/api/common/constants/constants.go index ae3568cf8..437feaffa 100644 --- a/operator/api/common/constants/constants.go +++ b/operator/api/common/constants/constants.go @@ -16,6 +16,15 @@ package constants +const ( + // OperatorName is the name of the Grove operator. + OperatorName = "grove-operator" + // OperatorConfigGroupName is the name of the group for Grove operator configuration. + OperatorConfigGroupName = "operator.config.grove.io" + // OperatorGroupName is the name of the group for all Grove custom resources. + OperatorGroupName = "grove.io" +) + // Constants for finalizers. const ( // FinalizerPodCliqueSet is the finalizer for PodCliqueSet that is added to `.metadata.finalizers[]` slice. This will be placed on all PodCliqueSet resources @@ -107,4 +116,6 @@ const ( KindPodClique = "PodClique" // KindPodCliqueScalingGroup is the kind for a PodCliqueScalingGroup resource. KindPodCliqueScalingGroup = "PodCliqueScalingGroup" + // KindClusterTopology is the kind for a ClusterTopology resource. + KindClusterTopology = "ClusterTopology" ) diff --git a/operator/api/config/v1alpha1/defaults.go b/operator/api/config/v1alpha1/defaults.go index b9e8e7359..e2752790d 100644 --- a/operator/api/config/v1alpha1/defaults.go +++ b/operator/api/config/v1alpha1/defaults.go @@ -27,7 +27,6 @@ const ( defaultLeaderElectionResourceLock = "leases" defaultLeaderElectionResourceName = "grove-operator-leader-election" defaultWebhookServerTLSServerCertDir = "/etc/grove-operator/webhook-certs" - defaultTopologyName = "grove-topology" ) // SetDefaults_ClientConnectionConfiguration sets defaults for the k8s client connection. 
@@ -115,10 +114,3 @@ func SetDefaults_PodCliqueScalingGroupControllerConfiguration(obj *PodCliqueScal obj.ConcurrentSyncs = ptr.To(1) } } - -// SetDefaults_ClusterTopologyConfiguration sets defaults for the ClusterTopologyConfiguration. -func SetDefaults_ClusterTopologyConfiguration(obj *ClusterTopologyConfiguration) { - if obj.Enabled && obj.Name == "" { - obj.Name = defaultTopologyName - } -} diff --git a/operator/api/config/v1alpha1/defaults_test.go b/operator/api/config/v1alpha1/defaults_test.go deleted file mode 100644 index 35209914d..000000000 --- a/operator/api/config/v1alpha1/defaults_test.go +++ /dev/null @@ -1,85 +0,0 @@ -// /* -// Copyright 2025 The Grove Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// */ - -package v1alpha1 - -import ( - "testing" - - "github.com/stretchr/testify/assert" -) - -func TestSetDefaults_ClusterTopologyConfiguration(t *testing.T) { - tests := []struct { - name string - input ClusterTopologyConfiguration - expected ClusterTopologyConfiguration - }{ - { - name: "enabled with no name defaults to grove-topology", - input: ClusterTopologyConfiguration{ - Enabled: true, - Name: "", - }, - expected: ClusterTopologyConfiguration{ - Enabled: true, - Name: "grove-topology", - }, - }, - { - name: "enabled with custom name preserves name", - input: ClusterTopologyConfiguration{ - Enabled: true, - Name: "custom-topology", - }, - expected: ClusterTopologyConfiguration{ - Enabled: true, - Name: "custom-topology", - }, - }, - { - name: "disabled with no name remains empty", - input: ClusterTopologyConfiguration{ - Enabled: false, - Name: "", - }, - expected: ClusterTopologyConfiguration{ - Enabled: false, - Name: "", - }, - }, - { - name: "disabled with name preserves name", - input: ClusterTopologyConfiguration{ - Enabled: false, - Name: "some-topology", - }, - expected: ClusterTopologyConfiguration{ - Enabled: false, - Name: "some-topology", - }, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - cfg := tt.input - SetDefaults_ClusterTopologyConfiguration(&cfg) - assert.Equal(t, tt.expected, cfg) - }) - } -} diff --git a/operator/api/config/v1alpha1/register.go b/operator/api/config/v1alpha1/register.go index c4f0f181a..ccd73f208 100644 --- a/operator/api/config/v1alpha1/register.go +++ b/operator/api/config/v1alpha1/register.go @@ -17,16 +17,15 @@ package v1alpha1 import ( + "github.com/ai-dynamo/grove/operator/api/common/constants" + "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/schema" ) -// GroupName is the group name used in this package -const GroupName = "operator.config.grove.io" - var ( // SchemeGroupVersion is group version used to register these objects - SchemeGroupVersion = schema.GroupVersion{Group: GroupName, Version: "v1alpha1"} + SchemeGroupVersion = schema.GroupVersion{Group: constants.OperatorConfigGroupName, Version: "v1alpha1"} // SchemeBuilder is used to add go types to the GroupVersionKind scheme SchemeBuilder = runtime.NewSchemeBuilder(addKnownTypes, addDefaultingFuncs) 
// AddToScheme is a reference to the Scheme Builder's AddToScheme function. diff --git a/operator/api/config/v1alpha1/types.go b/operator/api/config/v1alpha1/types.go index 130189a84..a1d2aa1f7 100644 --- a/operator/api/config/v1alpha1/types.go +++ b/operator/api/config/v1alpha1/types.go @@ -17,6 +17,8 @@ package v1alpha1 import ( + corev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) @@ -55,14 +57,14 @@ var ( type OperatorConfiguration struct { metav1.TypeMeta `json:",inline"` ClientConnection ClientConnectionConfiguration `json:"runtimeClientConnection"` - LeaderElection LeaderElectionConfiguration - Server ServerConfiguration `json:"server"` - Debugging *DebuggingConfiguration `json:"debugging,omitempty"` - Controllers ControllerConfiguration `json:"controllers"` - LogLevel LogLevel `json:"logLevel"` - LogFormat LogFormat `json:"logFormat"` - Authorizer AuthorizerConfig `json:"authorizer"` - ClusterTopology ClusterTopologyConfiguration `json:"clusterTopology"` + LeaderElection LeaderElectionConfiguration `json:"leaderElection"` + Server ServerConfiguration `json:"server"` + Debugging *DebuggingConfiguration `json:"debugging,omitempty"` + Controllers ControllerConfiguration `json:"controllers"` + LogLevel LogLevel `json:"logLevel"` + LogFormat LogFormat `json:"logFormat"` + Authorizer AuthorizerConfig `json:"authorizer"` + ClusterTopology ClusterTopologyConfiguration `json:"clusterTopology"` } // LeaderElectionConfiguration defines the configuration for the leader election. @@ -193,8 +195,8 @@ type AuthorizerConfig struct { type ClusterTopologyConfiguration struct { // Enabled indicates whether topology-aware scheduling is enabled. Enabled bool `json:"enabled"` - // Name is the ClusterTopology resource name to use. - // Defaults to "grove-topology" if not specified when topology is enabled. + // Levels is an ordered list of topology levels from broadest to narrowest scope. + // Used to create/update the ClusterTopology CR at operator startup. // +optional - Name string `json:"name,omitempty"` + Levels []corev1alpha1.TopologyLevel `json:"levels,omitempty"` } diff --git a/operator/api/config/v1alpha1/zz_generated.deepcopy.go b/operator/api/config/v1alpha1/zz_generated.deepcopy.go index 2cab37b4c..70bc1f6ae 100644 --- a/operator/api/config/v1alpha1/zz_generated.deepcopy.go +++ b/operator/api/config/v1alpha1/zz_generated.deepcopy.go @@ -22,6 +22,7 @@ limitations under the License. package v1alpha1 import ( + corev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" runtime "k8s.io/apimachinery/pkg/runtime" ) @@ -65,6 +66,11 @@ func (in *ClientConnectionConfiguration) DeepCopy() *ClientConnectionConfigurati // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *ClusterTopologyConfiguration) DeepCopyInto(out *ClusterTopologyConfiguration) { *out = *in + if in.Levels != nil { + in, out := &in.Levels, &out.Levels + *out = make([]corev1alpha1.TopologyLevel, len(*in)) + copy(*out, *in) + } return } @@ -151,7 +157,7 @@ func (in *OperatorConfiguration) DeepCopyInto(out *OperatorConfiguration) { } in.Controllers.DeepCopyInto(&out.Controllers) in.Authorizer.DeepCopyInto(&out.Authorizer) - out.ClusterTopology = in.ClusterTopology + in.ClusterTopology.DeepCopyInto(&out.ClusterTopology) return } diff --git a/operator/api/config/v1alpha1/zz_generated.defaults.go b/operator/api/config/v1alpha1/zz_generated.defaults.go index 3583b3ef3..27cb68efa 100644 --- a/operator/api/config/v1alpha1/zz_generated.defaults.go +++ b/operator/api/config/v1alpha1/zz_generated.defaults.go @@ -41,5 +41,4 @@ func SetObjectDefaults_OperatorConfiguration(in *OperatorConfiguration) { SetDefaults_PodCliqueSetControllerConfiguration(&in.Controllers.PodCliqueSet) SetDefaults_PodCliqueControllerConfiguration(&in.Controllers.PodClique) SetDefaults_PodCliqueScalingGroupControllerConfiguration(&in.Controllers.PodCliqueScalingGroup) - SetDefaults_ClusterTopologyConfiguration(&in.ClusterTopology) } diff --git a/operator/api/config/validation/validation.go b/operator/api/config/validation/validation.go index 05b2a601d..b3e3bca65 100644 --- a/operator/api/config/validation/validation.go +++ b/operator/api/config/validation/validation.go @@ -17,9 +17,12 @@ package validation import ( + "fmt" + "slices" "strings" configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1" + corev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/sets" @@ -114,11 +117,51 @@ func mustBeGreaterThanZeroDuration(duration metav1.Duration, fldPath *field.Path } // validateClusterTopologyConfiguration validates the cluster topology configuration. -// When cluster topology is enabled, it ensures the topology name is provided. +// When cluster topology is enabled, it ensures the topology levels are provided, +// and validates domain and key uniqueness. func validateClusterTopologyConfiguration(clusterTopologyCfg configv1alpha1.ClusterTopologyConfiguration, fldPath *field.Path) field.ErrorList { allErrs := field.ErrorList{} - if clusterTopologyCfg.Enabled && len(strings.TrimSpace(clusterTopologyCfg.Name)) == 0 { - allErrs = append(allErrs, field.Required(fldPath.Child("name"), "clusterTopology name is required")) + if !clusterTopologyCfg.Enabled { + return allErrs + } + allErrs = validateClusterTopologyLevels(clusterTopologyCfg.Levels, fldPath.Child("levels")) + return allErrs +} + +func validateClusterTopologyLevels(levels []corev1alpha1.TopologyLevel, fldPath *field.Path) field.ErrorList { + allErrs := field.ErrorList{} + if len(levels) == 0 { + allErrs = append(allErrs, field.Required(fldPath, "levels are required when topology is enabled")) + } + allErrs = append(allErrs, mustHaveSupportedTopologyDomains(levels, fldPath)...) + allErrs = append(allErrs, mustHaveUniqueTopologyLevels(levels, fldPath)...)
+ return allErrs +} + +func mustHaveSupportedTopologyDomains(levels []corev1alpha1.TopologyLevel, fldPath *field.Path) field.ErrorList { + allErrs := field.ErrorList{} + supportedDomains := corev1alpha1.SupportedTopologyDomains() + for i, level := range levels { + if !slices.Contains(supportedDomains, level.Domain) { + allErrs = append(allErrs, field.Invalid(fldPath.Index(i).Child("domain"), level.Domain, fmt.Sprintf("must be one of %v", supportedDomains))) + } + } + return allErrs +} + +func mustHaveUniqueTopologyLevels(levels []corev1alpha1.TopologyLevel, fldPath *field.Path) field.ErrorList { + allErrs := field.ErrorList{} + seenDomains := make(map[corev1alpha1.TopologyDomain]struct{}) + seenKeys := make(map[string]struct{}) + for i, level := range levels { + if _, exists := seenDomains[level.Domain]; exists { + allErrs = append(allErrs, field.Duplicate(fldPath.Index(i).Child("domain"), level.Domain)) + } + if _, exists := seenKeys[level.Key]; exists { + allErrs = append(allErrs, field.Duplicate(fldPath.Index(i).Child("key"), level.Key)) + } + seenDomains[level.Domain] = struct{}{} + seenKeys[level.Key] = struct{}{} } return allErrs } diff --git a/operator/api/config/validation/validation_test.go b/operator/api/config/validation/validation_test.go index 3e6099739..3f3f1906a 100644 --- a/operator/api/config/validation/validation_test.go +++ b/operator/api/config/validation/validation_test.go @@ -20,6 +20,7 @@ import ( "testing" configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1" + corev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" "github.com/stretchr/testify/assert" "k8s.io/apimachinery/pkg/util/validation/field" @@ -27,66 +28,182 @@ import ( func TestValidateClusterTopologyConfiguration(t *testing.T) { tests := []struct { - name string - config configv1alpha1.ClusterTopologyConfiguration - expectError bool - errorField string + name string + config configv1alpha1.ClusterTopologyConfiguration + expectErrors int + expectedFields []string + expectedTypes []field.ErrorType }{ { - name: "valid: enabled with name", + name: "valid: disabled with no levels", config: configv1alpha1.ClusterTopologyConfiguration{ - Enabled: true, - Name: "my-topology", + Enabled: false, }, - expectError: false, + expectErrors: 0, }, { - name: "valid: disabled with no name", + name: "valid: disabled with levels (levels are ignored when disabled)", config: configv1alpha1.ClusterTopologyConfiguration{ Enabled: false, - Name: "", + Levels: []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"}, + }, }, - expectError: false, + expectErrors: 0, }, { - name: "valid: disabled with name", + name: "valid: enabled with single level", config: configv1alpha1.ClusterTopologyConfiguration{ - Enabled: false, - Name: "my-topology", + Enabled: true, + Levels: []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"}, + }, + }, + expectErrors: 0, + }, + { + name: "valid: enabled with multiple levels", + config: configv1alpha1.ClusterTopologyConfiguration{ + Enabled: true, + Levels: []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainRegion, Key: "topology.kubernetes.io/region"}, + {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"}, + {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"}, + }, }, - expectError: false, + expectErrors: 0, }, { - name: "invalid: enabled with empty name", + name: "valid: enabled with all supported 
domains", config: configv1alpha1.ClusterTopologyConfiguration{ Enabled: true, - Name: "", + Levels: []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainRegion, Key: "topology.kubernetes.io/region"}, + {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"}, + {Domain: corev1alpha1.TopologyDomainDataCenter, Key: "topology.kubernetes.io/datacenter"}, + {Domain: corev1alpha1.TopologyDomainBlock, Key: "topology.kubernetes.io/block"}, + {Domain: corev1alpha1.TopologyDomainRack, Key: "topology.kubernetes.io/rack"}, + {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"}, + {Domain: corev1alpha1.TopologyDomainNuma, Key: "topology.kubernetes.io/numa"}, + }, }, - expectError: true, - errorField: "clusterTopology.name", + expectErrors: 0, }, { - name: "invalid: enabled with whitespace-only name", + name: "invalid: enabled with empty levels", config: configv1alpha1.ClusterTopologyConfiguration{ Enabled: true, - Name: " ", + Levels: []corev1alpha1.TopologyLevel{}, }, - expectError: true, - errorField: "clusterTopology.name", + expectErrors: 1, + expectedFields: []string{"clusterTopology.levels"}, + expectedTypes: []field.ErrorType{field.ErrorTypeRequired}, + }, + { + name: "invalid: enabled with nil levels", + config: configv1alpha1.ClusterTopologyConfiguration{ + Enabled: true, + Levels: nil, + }, + expectErrors: 1, + expectedFields: []string{"clusterTopology.levels"}, + expectedTypes: []field.ErrorType{field.ErrorTypeRequired}, + }, + { + name: "invalid: unsupported domain", + config: configv1alpha1.ClusterTopologyConfiguration{ + Enabled: true, + Levels: []corev1alpha1.TopologyLevel{ + {Domain: "invalid-domain", Key: "some.key"}, + }, + }, + expectErrors: 1, + expectedFields: []string{"clusterTopology.levels[0].domain"}, + expectedTypes: []field.ErrorType{field.ErrorTypeInvalid}, + }, + { + name: "invalid: duplicate domains", + config: configv1alpha1.ClusterTopologyConfiguration{ + Enabled: true, + Levels: []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"}, + {Domain: corev1alpha1.TopologyDomainZone, Key: "another.zone.key"}, + }, + }, + expectErrors: 1, + expectedFields: []string{"clusterTopology.levels[1].domain"}, + expectedTypes: []field.ErrorType{field.ErrorTypeDuplicate}, + }, + { + name: "invalid: duplicate keys", + config: configv1alpha1.ClusterTopologyConfiguration{ + Enabled: true, + Levels: []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"}, + {Domain: corev1alpha1.TopologyDomainHost, Key: "topology.kubernetes.io/zone"}, + }, + }, + expectErrors: 1, + expectedFields: []string{"clusterTopology.levels[1].key"}, + expectedTypes: []field.ErrorType{field.ErrorTypeDuplicate}, + }, + { + name: "invalid: duplicate domains and keys", + config: configv1alpha1.ClusterTopologyConfiguration{ + Enabled: true, + Levels: []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"}, + {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"}, + }, + }, + expectErrors: 2, + expectedFields: []string{"clusterTopology.levels[1].domain", "clusterTopology.levels[1].key"}, + expectedTypes: []field.ErrorType{field.ErrorTypeDuplicate, field.ErrorTypeDuplicate}, + }, + { + name: "invalid: multiple unsupported domains", + config: configv1alpha1.ClusterTopologyConfiguration{ + Enabled: true, + Levels: []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainZone, 
Key: "topology.kubernetes.io/zone"}, + {Domain: "invalid1", Key: "key1"}, + {Domain: "invalid2", Key: "key2"}, + }, + }, + expectErrors: 2, + expectedFields: []string{"clusterTopology.levels[1].domain", "clusterTopology.levels[2].domain"}, + expectedTypes: []field.ErrorType{field.ErrorTypeInvalid, field.ErrorTypeInvalid}, + }, + { + name: "invalid: multiple validation errors - unsupported domain and duplicates", + config: configv1alpha1.ClusterTopologyConfiguration{ + Enabled: true, + Levels: []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"}, + {Domain: "invalid", Key: "invalid.key"}, + {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"}, + }, + }, + expectErrors: 3, + expectedFields: []string{"clusterTopology.levels[1].domain", "clusterTopology.levels[2].domain", "clusterTopology.levels[2].key"}, + expectedTypes: []field.ErrorType{field.ErrorTypeInvalid, field.ErrorTypeDuplicate, field.ErrorTypeDuplicate}, }, } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - errs := validateClusterTopologyConfiguration(tt.config, field.NewPath("clusterTopology")) + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + errs := validateClusterTopologyConfiguration(test.config, field.NewPath("clusterTopology")) + + assert.Len(t, errs, test.expectErrors, "expected %d validation errors but got %d: %v", test.expectErrors, len(errs), errs) - if tt.expectError { - assert.NotEmpty(t, errs, "expected validation errors but got none") - if len(errs) > 0 { - assert.Equal(t, tt.errorField, errs[0].Field) + if test.expectErrors > 0 { + // Verify each expected error + for i, expectedField := range test.expectedFields { + assert.Equal(t, expectedField, errs[i].Field, "error %d: expected field %s but got %s", i, expectedField, errs[i].Field) + if i < len(test.expectedTypes) { + assert.Equal(t, test.expectedTypes[i], errs[i].Type, "error %d: expected type %s but got %s", i, test.expectedTypes[i], errs[i].Type) + } } - } else { - assert.Empty(t, errs, "expected no validation errors but got: %v", errs) } }) } diff --git a/operator/api/core/v1alpha1/clustertopology.go b/operator/api/core/v1alpha1/clustertopology.go index dc756e824..5de382283 100644 --- a/operator/api/core/v1alpha1/clustertopology.go +++ b/operator/api/core/v1alpha1/clustertopology.go @@ -17,9 +17,18 @@ package v1alpha1 import ( + "slices" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) +const ( + // DefaultClusterTopologyName is the name of the default ClusterTopology resource managed by the operator. + // Currently, Grove operator generates and maintains a single ClusterTopology resource per cluster. There is no support + // for externally created ClusterTopology resources. However, this may change in the future to allow more flexibility. + DefaultClusterTopologyName = "grove-topology" +) + // +genclient // +genclient:nonNamespaced // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object @@ -52,12 +61,32 @@ type ClusterTopologySpec struct { // This field is immutable after creation. 
// +kubebuilder:validation:MinItems=1 // +kubebuilder:validation:MaxItems=7 + // +kubebuilder:validation:XValidation:rule="self.all(x, self.filter(y, y.domain == x.domain).size() == 1)",message="domain must be unique across all levels" + // +kubebuilder:validation:XValidation:rule="self.all(x, self.filter(y, y.key == x.key).size() == 1)",message="key must be unique across all levels" Levels []TopologyLevel `json:"levels"` } -// TopologyDomain represents a predefined topology level in the hierarchy. -// Topology ordering (broadest to narrowest): -// Region > Zone > DataCenter > Block > Rack > Host > Numa +// TopologyLevel defines a single level in the topology hierarchy. +// Maps a platform-agnostic domain to a platform-specific node label key, +// giving workload operators a consistent way to reference topology levels when defining TopologyConstraints. +type TopologyLevel struct { + // Domain is a platform provider-agnostic level identifier. + // Must be one of: region, zone, datacenter, block, rack, host, numa + // +kubebuilder:validation:Required + // +kubebuilder:validation:Enum=region;zone;datacenter;block;rack;host;numa + Domain TopologyDomain `json:"domain"` + + // Key is the node label key that identifies this topology domain. + // Must be a valid Kubernetes label key (qualified name). + // Examples: "topology.kubernetes.io/zone", "kubernetes.io/hostname" + // +kubebuilder:validation:Required + // +kubebuilder:validation:MinLength=1 + // +kubebuilder:validation:MaxLength=63 + // +kubebuilder:validation:Pattern=`^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]/)?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$` + Key string `json:"key"` +} + +// TopologyDomain represents a level in the cluster topology hierarchy. type TopologyDomain string const ( @@ -77,8 +106,20 @@ const ( TopologyDomainNuma TopologyDomain = "numa" ) +// SupportedTopologyDomains returns all supported topology domain values. +func SupportedTopologyDomains() []TopologyDomain { + topologyDomains := make([]TopologyDomain, 0, len(topologyDomainOrder)) + for domain := range topologyDomainOrder { + topologyDomains = append(topologyDomains, domain) + } + return topologyDomains +} + // topologyDomainOrder defines the hierarchical order of topology domains from broadest to narrowest. // Lower value = broader scope (e.g., Region is broader than Zone). +// Context: TopologyDomain is used when defining TopologyConstraint in workload specifications. TopologyConstraint at different +// levels (PodClique, PodCliqueScalingGroup and PodCliqueSet) captures required packing guarantees that a scheduler backend +// must provide. var topologyDomainOrder = map[TopologyDomain]int{ TopologyDomainRegion: 0, TopologyDomainZone: 1, @@ -89,63 +130,9 @@ var topologyDomainOrder = map[TopologyDomain]int{ TopologyDomainNuma: 6, } -// Compare compares this domain with another domain. -// Returns: -// - negative value if this domain is broader than other -// - zero if domains are equal -// - positive value if this domain is narrower than other -// -// This method assumes both domains are valid (enforced by kubebuilder validation). -func (d TopologyDomain) Compare(other TopologyDomain) int { - return topologyDomainOrder[d] - topologyDomainOrder[other] -} - -// BroaderThan returns true if this domain represents a broader scope than the other domain. -// For example, Region.BroaderThan(Zone) returns true.
-func (d TopologyDomain) BroaderThan(other TopologyDomain) bool { - return d.Compare(other) < 0 -} - -// NarrowerThan returns true if this domain represents a narrower scope than the other domain. -// For example, Zone.NarrowerThan(Region) returns true. -func (d TopologyDomain) NarrowerThan(other TopologyDomain) bool { - return d.Compare(other) > 0 -} - -// TopologyLevel defines a single level in the topology hierarchy. -type TopologyLevel struct { - // Domain is the predefined level identifier used in TopologyConstraint references. - // Must be one of: region, zone, datacenter, block, rack, host, numa - // +kubebuilder:validation:Required - // +kubebuilder:validation:Enum=region;zone;datacenter;block;rack;host;numa - Domain TopologyDomain `json:"domain"` - - // Key is the node label key that identifies this topology domain. - // Must be a valid Kubernetes label key (qualified name). - // Examples: "topology.kubernetes.io/zone", "kubernetes.io/hostname" - // +kubebuilder:validation:Required - // +kubebuilder:validation:MinLength=1 - // +kubebuilder:validation:MaxLength=63 - Key string `json:"key"` -} - -// Compare compares this topology level with another level based on their domains. -// Returns: -// - negative value if this level is broader than other -// - zero if levels have equal domain scope -// - positive value if this level is narrower than other -func (l TopologyLevel) Compare(other TopologyLevel) int { - return l.Domain.Compare(other.Domain) -} - -// BroaderThan returns true if this level represents a broader scope than the other level. -// For example, a Region level is broader than a Zone level. -func (l TopologyLevel) BroaderThan(other TopologyLevel) bool { - return l.Domain.BroaderThan(other.Domain) -} - -// NarrowerThan returns true if this level represents a narrower scope than the other level. -// For example, a Zone level is narrower than a Region level. -func (l TopologyLevel) NarrowerThan(other TopologyLevel) bool { - return l.Domain.NarrowerThan(other.Domain) +// SortTopologyLevels sorts a slice of TopologyLevel from broadest to narrowest scope, sorting by their Domain. 
+func SortTopologyLevels(levels []TopologyLevel) { + slices.SortFunc(levels, func(a, b TopologyLevel) int { + return topologyDomainOrder[a.Domain] - topologyDomainOrder[b.Domain] + }) } diff --git a/operator/api/core/v1alpha1/clustertopology_test.go b/operator/api/core/v1alpha1/clustertopology_test.go index cfc39333f..dce6fe9bc 100644 --- a/operator/api/core/v1alpha1/clustertopology_test.go +++ b/operator/api/core/v1alpha1/clustertopology_test.go @@ -18,117 +18,136 @@ package v1alpha1 import ( "testing" + + "github.com/stretchr/testify/assert" ) -func TestTopologyDomainComparison(t *testing.T) { +func TestSupportedTopologyDomains(t *testing.T) { + expectedDomains := []TopologyDomain{ + TopologyDomainRegion, + TopologyDomainZone, + TopologyDomainDataCenter, + TopologyDomainBlock, + TopologyDomainRack, + TopologyDomainHost, + TopologyDomainNuma, + } + + domains := SupportedTopologyDomains() + + // Verify all expected domains are present and no duplicates + seen := make(map[TopologyDomain]bool) + for _, domain := range domains { + assert.False(t, seen[domain], "domain %s should not be duplicated", domain) + seen[domain] = true + } + // Verify all expected domains are included + for _, expected := range expectedDomains { + assert.True(t, seen[expected], "domain %s should be in the returned list", expected) + } +} + +func TestSortTopologyLevels(t *testing.T) { tests := []struct { - name string - domain TopologyDomain - other TopologyDomain - expectedCompare int - expectedBroaderThan bool - expectedNarrowerThan bool + name string + input []TopologyLevel + expected []TopologyLevel }{ { - name: "region vs zone", - domain: TopologyDomainRegion, - other: TopologyDomainZone, - expectedCompare: -1, - expectedBroaderThan: true, - expectedNarrowerThan: false, - }, - { - name: "zone vs region", - domain: TopologyDomainZone, - other: TopologyDomainRegion, - expectedCompare: 1, - expectedBroaderThan: false, - expectedNarrowerThan: true, - }, - { - name: "region vs region (equal)", - domain: TopologyDomainRegion, - other: TopologyDomainRegion, - expectedCompare: 0, - expectedBroaderThan: false, - expectedNarrowerThan: false, - }, - { - name: "host vs numa", - domain: TopologyDomainHost, - other: TopologyDomainNuma, - expectedCompare: -1, - expectedBroaderThan: true, - expectedNarrowerThan: false, + name: "empty slice", + input: []TopologyLevel{}, + expected: []TopologyLevel{}, }, { - name: "region vs numa (multiple levels apart)", - domain: TopologyDomainRegion, - other: TopologyDomainNuma, - expectedCompare: -6, - expectedBroaderThan: true, - expectedNarrowerThan: false, + name: "single level", + input: []TopologyLevel{ + {Domain: TopologyDomainZone, Key: "zone-key"}, + }, + expected: []TopologyLevel{ + {Domain: TopologyDomainZone, Key: "zone-key"}, + }, }, { - name: "numa vs region (multiple levels apart)", - domain: TopologyDomainNuma, - other: TopologyDomainRegion, - expectedCompare: 6, - expectedBroaderThan: false, - expectedNarrowerThan: true, + name: "already sorted", + input: []TopologyLevel{ + {Domain: TopologyDomainRegion, Key: "region-key"}, + {Domain: TopologyDomainZone, Key: "zone-key"}, + {Domain: TopologyDomainHost, Key: "host-key"}, + }, + expected: []TopologyLevel{ + {Domain: TopologyDomainRegion, Key: "region-key"}, + {Domain: TopologyDomainZone, Key: "zone-key"}, + {Domain: TopologyDomainHost, Key: "host-key"}, + }, }, { - name: "datacenter vs rack", - domain: TopologyDomainDataCenter, - other: TopologyDomainRack, - expectedCompare: -2, - expectedBroaderThan: true, - expectedNarrowerThan: 
false, + name: "reverse sorted", + input: []TopologyLevel{ + {Domain: TopologyDomainHost, Key: "host-key"}, + {Domain: TopologyDomainZone, Key: "zone-key"}, + {Domain: TopologyDomainRegion, Key: "region-key"}, + }, + expected: []TopologyLevel{ + {Domain: TopologyDomainRegion, Key: "region-key"}, + {Domain: TopologyDomainZone, Key: "zone-key"}, + {Domain: TopologyDomainHost, Key: "host-key"}, + }, }, { - name: "rack vs datacenter", - domain: TopologyDomainRack, - other: TopologyDomainDataCenter, - expectedCompare: 2, - expectedBroaderThan: false, - expectedNarrowerThan: true, + name: "random order", + input: []TopologyLevel{ + {Domain: TopologyDomainRack, Key: "rack-key"}, + {Domain: TopologyDomainRegion, Key: "region-key"}, + {Domain: TopologyDomainNuma, Key: "numa-key"}, + {Domain: TopologyDomainZone, Key: "zone-key"}, + }, + expected: []TopologyLevel{ + {Domain: TopologyDomainRegion, Key: "region-key"}, + {Domain: TopologyDomainZone, Key: "zone-key"}, + {Domain: TopologyDomainRack, Key: "rack-key"}, + {Domain: TopologyDomainNuma, Key: "numa-key"}, + }, }, { - name: "rack vs zone (narrower vs broader)", - domain: TopologyDomainRack, - other: TopologyDomainZone, - expectedCompare: 3, - expectedBroaderThan: false, - expectedNarrowerThan: true, - }, - { - name: "zone vs rack (broader vs narrower)", - domain: TopologyDomainZone, - other: TopologyDomainRack, - expectedCompare: -3, - expectedBroaderThan: true, - expectedNarrowerThan: false, + name: "all domains", + input: []TopologyLevel{ + {Domain: TopologyDomainNuma, Key: "numa-key"}, + {Domain: TopologyDomainHost, Key: "host-key"}, + {Domain: TopologyDomainRack, Key: "rack-key"}, + {Domain: TopologyDomainBlock, Key: "block-key"}, + {Domain: TopologyDomainDataCenter, Key: "dc-key"}, + {Domain: TopologyDomainZone, Key: "zone-key"}, + {Domain: TopologyDomainRegion, Key: "region-key"}, + }, + expected: []TopologyLevel{ + {Domain: TopologyDomainRegion, Key: "region-key"}, + {Domain: TopologyDomainZone, Key: "zone-key"}, + {Domain: TopologyDomainDataCenter, Key: "dc-key"}, + {Domain: TopologyDomainBlock, Key: "block-key"}, + {Domain: TopologyDomainRack, Key: "rack-key"}, + {Domain: TopologyDomainHost, Key: "host-key"}, + {Domain: TopologyDomainNuma, Key: "numa-key"}, + }, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - // Test Compare() - gotCompare := tt.domain.Compare(tt.other) - if gotCompare != tt.expectedCompare { - t.Errorf("Compare() = %v, want %v", gotCompare, tt.expectedCompare) - } + // Make a copy to avoid modifying test data + input := make([]TopologyLevel, len(tt.input)) + copy(input, tt.input) + + // Sort the input + SortTopologyLevels(input) - // Test BroaderThan() - gotBroaderThan := tt.domain.BroaderThan(tt.other) - if gotBroaderThan != tt.expectedBroaderThan { - t.Errorf("BroaderThan() = %v, want %v", gotBroaderThan, tt.expectedBroaderThan) + // Check if sorted correctly + if len(input) != len(tt.expected) { + t.Fatalf("length mismatch: got %d, want %d", len(input), len(tt.expected)) } - // Test NarrowerThan() - gotNarrowerThan := tt.domain.NarrowerThan(tt.other) - if gotNarrowerThan != tt.expectedNarrowerThan { - t.Errorf("NarrowerThan() = %v, want %v", gotNarrowerThan, tt.expectedNarrowerThan) + for i := range input { + assert.Equal(t, input[i].Domain, tt.expected[i].Domain, "at index %d: got domain %v, want %v", i, input[i].Domain, tt.expected[i].Domain) + assert.Equal(t, input[i].Key, tt.expected[i].Key, "at index %d: got key %v, want %v", i, input[i].Key, tt.expected[i].Key) } }) } diff --git 
a/operator/api/core/v1alpha1/crds/grove.io_clustertopologies.yaml b/operator/api/core/v1alpha1/crds/grove.io_clustertopologies.yaml index f923ab48d..47630912e 100644 --- a/operator/api/core/v1alpha1/crds/grove.io_clustertopologies.yaml +++ b/operator/api/core/v1alpha1/crds/grove.io_clustertopologies.yaml @@ -49,12 +49,14 @@ spec: The order in this list defines the hierarchy (index 0 = broadest level). This field is immutable after creation. items: - description: TopologyLevel defines a single level in the topology - hierarchy. + description: |- + TopologyLevel defines a single level in the topology hierarchy. + Maps a platform-agnostic domain to a platform-specific node label key, + giving workload operators a consistent way to reference topology levels when defining TopologyConstraints. properties: domain: description: |- - Domain is the predefined level identifier used in TopologyConstraint references. + Domain is a platform provider-agnostic level identifier. Must be one of: region, zone, datacenter, block, rack, host, numa enum: - region @@ -72,6 +74,7 @@ spec: Examples: "topology.kubernetes.io/zone", "kubernetes.io/hostname" maxLength: 63 minLength: 1 + pattern: ^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]/)?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$ type: string required: - domain @@ -80,6 +83,12 @@ spec: maxItems: 7 minItems: 1 type: array + x-kubernetes-validations: + - message: domain must be unique across all levels + rule: self.all(x, self.filter(y, y.domain == x.domain).size() == + 1) + - message: key must be unique across all levels + rule: self.all(x, self.filter(y, y.key == x.key).size() == 1) required: - levels type: object diff --git a/operator/api/core/v1alpha1/register.go b/operator/api/core/v1alpha1/register.go index b397b95bd..c400e30f2 100644 --- a/operator/api/core/v1alpha1/register.go +++ b/operator/api/core/v1alpha1/register.go @@ -17,19 +17,16 @@ package v1alpha1 import ( + "github.com/ai-dynamo/grove/operator/api/common/constants" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/schema" ) -const ( - // GroupName is the name of the group for all resources defined in this package. - GroupName = "grove.io" -) - var ( // SchemeGroupVersion is group version used to register these objects - SchemeGroupVersion = schema.GroupVersion{Group: GroupName, Version: "v1alpha1"} + SchemeGroupVersion = schema.GroupVersion{Group: constants.OperatorGroupName, Version: "v1alpha1"} // SchemeBuilder is used to add go types to the GroupVersionKind scheme SchemeBuilder = runtime.NewSchemeBuilder(addKnownTypes) // AddToScheme is a reference to the Scheme Builder's AddToScheme function.
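For reference, a ClusterTopology object that satisfies the regenerated CRD schema above might look like the following sketch (the operator creates and maintains this cluster-scoped resource itself under the default name grove-topology; the level values shown are illustrative):

```yaml
# Illustrative ClusterTopology; levels are ordered broadest -> narrowest.
apiVersion: grove.io/v1alpha1
kind: ClusterTopology
metadata:
  name: grove-topology                    # DefaultClusterTopologyName
spec:
  levels:
    - domain: region                      # enum-validated domain value
      key: topology.kubernetes.io/region  # qualified label key (pattern-validated)
    - domain: zone
      key: topology.kubernetes.io/zone
    - domain: host
      key: kubernetes.io/hostname
```

With the dedicated validating webhook removed, duplicate domains or keys, an out-of-range levels count, and malformed label keys are now rejected at admission by the API server through the schema's enum, pattern, and x-kubernetes-validations CEL rules.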
diff --git a/operator/charts/templates/_helpers.tpl b/operator/charts/templates/_helpers.tpl index 60cd3a1fd..3399ec233 100644 --- a/operator/charts/templates/_helpers.tpl +++ b/operator/charts/templates/_helpers.tpl @@ -34,8 +34,12 @@ config.yaml: | {{- if .Values.config.clusterTopology }} clusterTopology: enabled: {{ .Values.config.clusterTopology.enabled }} - {{- if .Values.config.clusterTopology.name }} - name: {{ .Values.config.clusterTopology.name }} + {{- if .Values.config.clusterTopology.levels }} + levels: + {{- range .Values.config.clusterTopology.levels }} + - domain: {{ .domain }} + key: {{ .key }} + {{- end }} {{- end }} {{- end }} {{- if .Values.config.authorizer.enabled }} @@ -150,4 +154,18 @@ release: "{{ .Release.Name }}" {{- range $key, $val := .Values.webhookServerSecret.labels }} {{ $key }}: {{ $val }} {{- end }} -{{- end -}} \ No newline at end of file +{{- end -}} + +{{- define "operator.lease.role.labels" -}} +{{- include "common.chart.labels" . }} +{{- range $key, $val := .Values.leaseRole.labels }} +{{ $key }}: {{ $val }} +{{- end }} +{{- end -}} + +{{- define "operator.lease.rolebinding.labels" -}} +{{- include "common.chart.labels" . }} +{{- range $key, $val := .Values.leaseRoleBinding.labels }} +{{ $key }}: {{ $val }} +{{- end }} +{{- end -}} diff --git a/operator/charts/templates/clusterrole.yaml b/operator/charts/templates/clusterrole.yaml index 4cc2c83d7..5fcd04eaf 100644 --- a/operator/charts/templates/clusterrole.yaml +++ b/operator/charts/templates/clusterrole.yaml @@ -26,6 +26,7 @@ rules: - podcliques - podcliques/status - podcliquescalinggroups + - clustertopologies - podcliquescalinggroups/status verbs: - create @@ -36,14 +37,6 @@ rules: - deletecollection - patch - update -- apiGroups: - - grove.io - resources: - - clustertopologies - verbs: - - get - - list - - watch - apiGroups: - "" resources: @@ -136,4 +129,15 @@ rules: - patch - update - watch +- apiGroups: + - kai.scheduler + resources: + - topologies + verbs: + - create + - get + - list + - watch + - patch + - update diff --git a/operator/charts/templates/clustertopology-validating-webhook-config.yaml b/operator/charts/templates/clustertopology-validating-webhook-config.yaml deleted file mode 100644 index 0a562b209..000000000 --- a/operator/charts/templates/clustertopology-validating-webhook-config.yaml +++ /dev/null @@ -1,34 +0,0 @@ ---- -apiVersion: admissionregistration.k8s.io/v1 -kind: ValidatingWebhookConfiguration -metadata: - name: clustertopology-validating-webhook - labels: -{{- include "operator.clustertopology.validating.webhook.labels" . 
| nindent 4 }} -webhooks: - - admissionReviewVersions: - - v1 - clientConfig: - service: - name: {{ required ".Values.service.name is required" .Values.service.name }} - namespace: {{ .Release.Namespace }} - path: /webhooks/validate-clustertopology - port: {{ required ".Values.config.server.webhooks.port" .Values.config.server.webhooks.port }} - failurePolicy: Fail - matchPolicy: Exact - name: clustertopology.validating.webhooks.grove.io - namespaceSelector: { } - rules: - - apiGroups: - - "grove.io" - apiVersions: - - "v1alpha1" - operations: - - CREATE - - UPDATE - resources: - - clustertopologies - scope: Cluster - sideEffects: None - timeoutSeconds: 10 - diff --git a/operator/charts/templates/leaderelection-role.yaml b/operator/charts/templates/leaderelection-role.yaml new file mode 100644 index 000000000..32524c1e6 --- /dev/null +++ b/operator/charts/templates/leaderelection-role.yaml @@ -0,0 +1,27 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + labels: +{{- include "operator.lease.role.labels" . | nindent 4 }} + name: {{ required ".Values.leaseRole.name is required" .Values.leaseRole.name }} + namespace: '{{ .Release.Namespace }}' +rules: + - apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - "" + resources: + - events + verbs: + - create + - patch \ No newline at end of file diff --git a/operator/charts/templates/leaderelection-rolebinding.yaml b/operator/charts/templates/leaderelection-rolebinding.yaml new file mode 100644 index 000000000..f29f8a0eb --- /dev/null +++ b/operator/charts/templates/leaderelection-rolebinding.yaml @@ -0,0 +1,15 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: {{ required ".Values.leaseRoleBinding.name is required" .Values.leaseRoleBinding.name }} + labels: +{{- include "operator.lease.rolebinding.labels" . 
| nindent 4 }} + namespace: '{{ .Release.Namespace }}' +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: {{ required ".Values.leaseRole.name is required" .Values.leaseRole.name }} +subjects: + - kind: ServiceAccount + name: {{ required ".Values.serviceAccount.name is required" .Values.serviceAccount.name }} + namespace: '{{ .Release.Namespace }}' \ No newline at end of file diff --git a/operator/charts/values.yaml b/operator/charts/values.yaml index cc62d8d5b..c7b03db2c 100644 --- a/operator/charts/values.yaml +++ b/operator/charts/values.yaml @@ -62,7 +62,21 @@ config: logFormat: json clusterTopology: enabled: false - name: "grove-topology" +# levels: +# - domain: zone +# key: topology.kubernetes.io/zone +# - domain: datacenter +# key: +# - domain: block +# key: +# - domain: rack +# key: +# - domain: host +# key: kubernetes.io/hostname +# - domain: region +# key: topology.kubernetes.io/region +# - domain: numa +# key: authorizer: enabled: false exemptServiceAccountUserNames: @@ -114,6 +128,20 @@ clusterRoleBinding: app.kubernetes.io/name: grove-operator app.kubernetes.io/part-of: grove +leaseRole: + name: grove:system:grove-operator-lease + labels: + app.kubernetes.io/component: operator-lease-role + app.kubernetes.io/name: grove-operator + app.kubernetes.io/part-of: grove + +leaseRoleBinding: + name: grove:system:grove-operator-lease + labels: + app.kubernetes.io/component: operator-lease-rolebinding + app.kubernetes.io/name: grove-operator + app.kubernetes.io/part-of: grove + webhooks: podCliqueSetValidationWebhook: labels: @@ -125,11 +153,6 @@ webhooks: app.kubernetes.io/component: operator-pcs-defaulting-webhook app.kubernetes.io/name: pcs-defaulting-webhook app.kubernetes.io/part-of: grove - clusterTopologyValidationWebhook: - labels: - app.kubernetes.io/component: operator-clustertopology-validating-webhook - app.kubernetes.io/name: clustertopology-validating-webhook - app.kubernetes.io/part-of: grove authorizerWebhook: labels: app.kubernetes.io/component: operator-authorizer-webhook diff --git a/operator/cmd/cli/cli.go b/operator/cmd/cli/cli.go new file mode 100644 index 000000000..2d53f52c6 --- /dev/null +++ b/operator/cmd/cli/cli.go @@ -0,0 +1,124 @@ +// /* +// Copyright 2025 The Grove Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// */ + +package cli + +import ( + "fmt" + "os" + + apicommonconstants "github.com/ai-dynamo/grove/operator/api/common/constants" + configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1" + operatorvalidation "github.com/ai-dynamo/grove/operator/api/config/validation" + + "github.com/spf13/pflag" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/serializer" +) + +const ( + // ExitSuccess is the exit code indicating that the application exited with no error. + ExitSuccess = iota + // ExitErrParseCLIArgs is the exit code indicating that the application exited due to an error parsing CLI arguments. 
+ ExitErrParseCLIArgs + // ExitErrLoadOperatorConfig indicates that the application exited due to an error loading the operator configuration. + ExitErrLoadOperatorConfig + // ExitErrSynchronizeTopology indicates that the application exited due to an error synchronizing the cluster topology. + ExitErrSynchronizeTopology + // ExitErrInitializeManager indicates that the application exited due to an error initializing the manager. + // This includes registration of controllers and webhooks and setting up probes. + ExitErrInitializeManager + // ExitErrStart indicates that the application exited due to an error when starting the application. + ExitErrStart +) + +var ( + errParseCLIArgs = fmt.Errorf("error parsing cli arguments") + errMissingLaunchOption = fmt.Errorf("missing required launch option") + errLoadOperatorConfig = fmt.Errorf("cannot load %q operator config", apicommonconstants.OperatorName) + errInvalidOperatorConfig = fmt.Errorf("invalid %q operator config", apicommonconstants.OperatorName) +) + +// LaunchOptions defines options for launching the operator. +type LaunchOptions struct { + // ConfigFile is the path to the operator configuration file. + ConfigFile string + // Version indicates whether to print the operator version and exit. + Version bool +} + +// ParseLaunchOptions parses the CLI arguments for the operator. +func ParseLaunchOptions(cliArgs []string) (*LaunchOptions, error) { + launchOpts := &LaunchOptions{} + flagSet := pflag.NewFlagSet(apicommonconstants.OperatorName, pflag.ContinueOnError) + launchOpts.mapFlags(flagSet) + if err := flagSet.Parse(cliArgs); err != nil { + return nil, fmt.Errorf("%w: %w", errParseCLIArgs, err) + } + if err := launchOpts.validate(); err != nil { + return nil, err + } + return launchOpts, nil +} + +// LoadAndValidateOperatorConfig loads and validates the operator configuration from the specified path in the launch options. 
+func (o *LaunchOptions) LoadAndValidateOperatorConfig() (*configv1alpha1.OperatorConfiguration, error) { + operatorConfig, err := o.loadOperatorConfig() + if err != nil { + return nil, err + } + if errs := operatorvalidation.ValidateOperatorConfiguration(operatorConfig); len(errs) > 0 { + return nil, fmt.Errorf("%w: %w", errInvalidOperatorConfig, errs.ToAggregate()) + } + return operatorConfig, nil +} + +func (o *LaunchOptions) loadOperatorConfig() (*configv1alpha1.OperatorConfiguration, error) { + // Set up scheme and decoder for operator configuration + configScheme := runtime.NewScheme() + if err := configv1alpha1.AddToScheme(configScheme); err != nil { + return nil, fmt.Errorf("%w: error adding to scheme: %w", errLoadOperatorConfig, err) + } + configDecoder := serializer.NewCodecFactory(configScheme).UniversalDecoder() + + // Read configuration file + operatorConfigBytes, err := os.ReadFile(o.ConfigFile) + if err != nil { + return nil, fmt.Errorf("%w: error reading file: %w", errLoadOperatorConfig, err) + } + + // Decode configuration + operatorConfig := &configv1alpha1.OperatorConfiguration{} + if err = runtime.DecodeInto(configDecoder, operatorConfigBytes, operatorConfig); err != nil { + return nil, fmt.Errorf("%w: error decoding operator config: %w", errLoadOperatorConfig, err) + } + return operatorConfig, nil +} + +func (o *LaunchOptions) mapFlags(fs *pflag.FlagSet) { + fs.StringVar(&o.ConfigFile, "config", "", "Path to the operator configuration file.") + fs.BoolVarP(&o.Version, "version", "v", false, "Print the version and exit.") +} + +func (o *LaunchOptions) validate() error { + if len(o.ConfigFile) == 0 && !o.Version { + return fmt.Errorf("%w: one of version or operator config file must be specified", errMissingLaunchOption) + } + if len(o.ConfigFile) > 0 && o.Version { + return fmt.Errorf("%w: both version and operator config file cannot be specified", errMissingLaunchOption) + } + return nil +} diff --git a/operator/cmd/cli/cli_test.go b/operator/cmd/cli/cli_test.go new file mode 100644 index 000000000..19b8b67c8 --- /dev/null +++ b/operator/cmd/cli/cli_test.go @@ -0,0 +1,171 @@ +// /* +// Copyright 2025 The Grove Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// */ + +package cli + +import ( + "errors" + "testing" + + apicommonconstants "github.com/ai-dynamo/grove/operator/api/common/constants" + configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1" + + "github.com/spf13/pflag" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// TestParseLaunchOptions tests the parsing of CLI arguments into LaunchOptions. 
+func TestParseLaunchOptions(t *testing.T) { + tests := []struct { + name string + args []string + expected *LaunchOptions + expectedError error + }{ + { + name: "valid config file", + args: []string{"--config", "testdata/valid-config.yaml"}, + expected: &LaunchOptions{ + ConfigFile: "testdata/valid-config.yaml", + }, + }, + { + name: "no CLI args", + args: []string{}, + expected: nil, + expectedError: errMissingLaunchOption, + }, + { + name: "unknown CLI flag", + args: []string{"--unknown-flag", "value"}, + expected: nil, + expectedError: errParseCLIArgs, + }, + { + name: "invalid CLI flag format", + args: []string{"---config", "test.yaml"}, + expected: nil, + expectedError: errParseCLIArgs, + }, + { + name: "no value specified for config-file flag", + args: []string{"--config"}, + expected: nil, + expectedError: errParseCLIArgs, + }, + { + name: "multiple config_file flags", // last one should take precedence + args: []string{"--config", "first.yaml", "--config", "second.yaml"}, + expected: &LaunchOptions{ + ConfigFile: "second.yaml", + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + opts, err := ParseLaunchOptions(test.args) + + if test.expectedError != nil { + require.Error(t, err) + assert.True(t, errors.Is(err, test.expectedError), + "expected error to wrap %v, got %v", test.expectedError, err) + } else { + require.NoError(t, err) + require.NotNil(t, opts) + assert.Equal(t, test.expected.ConfigFile, opts.ConfigFile) + } + }) + } +} + +// TestMapFlags tests that flags are properly mapped to the LaunchOptions struct. +func TestMapFlags(t *testing.T) { + opts := &LaunchOptions{} + flagSet := pflag.NewFlagSet(apicommonconstants.OperatorName, pflag.ContinueOnError) + opts.mapFlags(flagSet) + + // Verify the config-file flag was registered + flag := flagSet.Lookup("config") + require.NotNil(t, flag, "config flag should be registered") + assert.Equal(t, "", flag.DefValue, "config should have empty default value") + assert.Equal(t, "string", flag.Value.Type(), "config should be a string flag") +} + +// TestLoadAndValidateOperatorConfig tests loading and validating operator configuration. 
+func TestLoadAndValidateOperatorConfig(t *testing.T) { + tests := []struct { + name string + configFile string + expectedError error + validateFunc func(t *testing.T, config *configv1alpha1.OperatorConfiguration) + }{ + { + name: "valid config", + configFile: "testdata/valid-config.yaml", + validateFunc: func(t *testing.T, config *configv1alpha1.OperatorConfiguration) { + require.NotNil(t, config) + assert.Equal(t, configv1alpha1.LogLevel("info"), config.LogLevel) + assert.Equal(t, configv1alpha1.LogFormat("json"), config.LogFormat) + assert.Equal(t, float32(100), config.ClientConnection.QPS) + assert.Equal(t, 150, config.ClientConnection.Burst) + assert.Equal(t, 9443, config.Server.Webhooks.Port) + assert.NotNil(t, config.Server.HealthProbes) + assert.Equal(t, 9444, config.Server.HealthProbes.Port) + assert.NotNil(t, config.Server.Metrics) + assert.Equal(t, 9445, config.Server.Metrics.Port) + assert.NotNil(t, config.Controllers.PodCliqueSet.ConcurrentSyncs) + assert.Equal(t, 3, *config.Controllers.PodCliqueSet.ConcurrentSyncs) + }, + }, + { + name: "invalid operator config", + configFile: "testdata/invalid-config.yaml", + expectedError: errInvalidOperatorConfig, + }, + { + name: "non existent config file", + configFile: "testdata/nonexistent.yaml", + expectedError: errLoadOperatorConfig, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + opts := &LaunchOptions{ + ConfigFile: test.configFile, + } + + config, err := opts.LoadAndValidateOperatorConfig() + + if test.expectedError != nil { + require.Error(t, err) + if test.expectedError != nil { + assert.True(t, errors.Is(err, test.expectedError), + "expected error to wrap %v, got %v", test.expectedError, err) + } + assert.Nil(t, config) + } else { + require.NoError(t, err) + require.NotNil(t, config) + if test.validateFunc != nil { + test.validateFunc(t, config) + } + } + }) + } +} diff --git a/operator/cmd/cli/testdata/invalid-config.yaml b/operator/cmd/cli/testdata/invalid-config.yaml new file mode 100644 index 000000000..34c36b9c3 --- /dev/null +++ b/operator/cmd/cli/testdata/invalid-config.yaml @@ -0,0 +1,26 @@ +apiVersion: operator.config.grove.io/v1alpha1 +kind: OperatorConfiguration +runtimeClientConnection: + qps: 100 + burst: -10 # invalid value +leaderElection: + enabled: false +server: + webhooks: + bindAddress: 0.0.0.0 + port: 9443 + serverCertDir: /etc/grove-operator/webhook-certs +controllers: + podCliqueSet: + concurrentSyncs: 3 + podClique: + concurrentSyncs: 3 + podCliqueScalingGroup: + concurrentSyncs: 2 +logLevel: info +logFormat: json +authorizer: + enabled: false +clusterTopology: + enabled: false + diff --git a/operator/cmd/cli/testdata/valid-config.yaml b/operator/cmd/cli/testdata/valid-config.yaml new file mode 100644 index 000000000..570c4630c --- /dev/null +++ b/operator/cmd/cli/testdata/valid-config.yaml @@ -0,0 +1,38 @@ +apiVersion: operator.config.grove.io/v1alpha1 +kind: OperatorConfiguration +runtimeClientConnection: + qps: 100 + burst: 150 +leaderElection: + enabled: true + leaseDuration: 15s + renewDeadline: 10s + retryPeriod: 2s + resourceLock: leases + resourceName: grove-operator-leader-election + resourceNamespace: default +server: + webhooks: + bindAddress: 0.0.0.0 + port: 9443 + serverCertDir: /etc/grove-operator/webhook-certs + healthProbes: + bindAddress: 0.0.0.0 + port: 9444 + metrics: + bindAddress: 0.0.0.0 + port: 9445 +controllers: + podCliqueSet: + concurrentSyncs: 3 + podClique: + concurrentSyncs: 3 + podCliqueScalingGroup: + concurrentSyncs: 2 +logLevel: info 
+logFormat: json
+authorizer:
+  enabled: false
+clusterTopology:
+  enabled: false
+
diff --git a/operator/cmd/main.go b/operator/cmd/main.go
index f0a9ec310..8d38eeb98 100644
--- a/operator/cmd/main.go
+++ b/operator/cmd/main.go
@@ -18,20 +18,23 @@ package main
 
 import (
 	"context"
+	"errors"
 	"flag"
 	"fmt"
+	"io"
 	"os"
 
+	apicommonconstants "github.com/ai-dynamo/grove/operator/api/common/constants"
 	configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1"
-	grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1"
-	groveopts "github.com/ai-dynamo/grove/operator/cmd/opts"
+	corev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1"
+	"github.com/ai-dynamo/grove/operator/cmd/cli"
+	"github.com/ai-dynamo/grove/operator/internal/clustertopology"
 	grovectrl "github.com/ai-dynamo/grove/operator/internal/controller"
 	"github.com/ai-dynamo/grove/operator/internal/controller/cert"
 	grovelogger "github.com/ai-dynamo/grove/operator/internal/logger"
 	groveversion "github.com/ai-dynamo/grove/operator/internal/version"
 	"github.com/spf13/pflag"
-	"k8s.io/apimachinery/pkg/types"
 	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 )
@@ -42,73 +45,86 @@ var (
 
 func main() {
 	ctrl.SetLogger(grovelogger.MustNewLogger(false, configv1alpha1.InfoLevel, configv1alpha1.LogFormatJSON))
+	groveInfo := groveversion.New()
 
-	fs := pflag.CommandLine
-	groveversion.AddFlags(fs)
-	cliOpts := groveopts.NewCLIOptions(fs)
-
-	// parse and print command line flags
-	pflag.Parse()
-	groveversion.PrintVersionAndExitIfRequested()
-
-	logger.Info("Starting grove operator", "version", groveversion.Get())
-	printFlags()
-
-	operatorCfg, err := initializeOperatorConfig(cliOpts)
+	launchOpts, err := cli.ParseLaunchOptions(os.Args[1:])
+	if err != nil {
+		handleErrorAndExit(err, cli.ExitErrParseCLIArgs)
+	}
+	if launchOpts.Version {
+		_, _ = fmt.Fprintf(io.Writer(os.Stdout), "%s %v\n", apicommonconstants.OperatorName, groveInfo)
+		os.Exit(cli.ExitSuccess)
+	}
+	operatorConfig, err := launchOpts.LoadAndValidateOperatorConfig()
 	if err != nil {
-		logger.Error(err, "failed to initialize operator configuration")
-		os.Exit(1)
+		logger.Error(err, "failed to load operator config")
+		handleErrorAndExit(err, cli.ExitErrLoadOperatorConfig)
 	}
 
-	mgr, err := grovectrl.CreateManager(operatorCfg)
+	logger.Info("Starting grove operator", "grove-info", groveInfo.Verbose())
+	printFlags()
+
+	mgr, err := grovectrl.CreateManager(operatorConfig)
 	if err != nil {
 		logger.Error(err, "failed to create grove controller manager")
-		os.Exit(1)
+		handleErrorAndExit(err, cli.ExitErrInitializeManager)
 	}
 
 	ctx := ctrl.SetupSignalHandler()
-	if err = validateClusterTopology(ctx, mgr.GetAPIReader(), operatorCfg.ClusterTopology); err != nil {
-		logger.Error(err, "cannot validate cluster topology, operator cannot start")
-		os.Exit(1)
+	// Initialize or clean up ClusterTopology based on the operator configuration.
+	// This must be done before starting the controllers that may depend on the ClusterTopology resource.
+	// NOTE: In this version of the operator, the synchronization will additionally ensure that the KAI Topology resource
+	// is created based on the ClusterTopology. When we introduce support for pluggable scheduler backends,
+	// handling of scheduler-specific resources will be delegated to the backend scheduler controller.
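+	// synchronizeTopology builds a direct (uncached) client, since the manager's own client reads
+	// from a cache that only starts serving once mgr.Start runs, which happens later in main.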
+ if err = synchronizeTopology(ctx, mgr, operatorConfig); err != nil { + logger.Error(err, "failed to synchronize cluster topology") + handleErrorAndExit(err, cli.ExitErrSynchronizeTopology) } webhookCertsReadyCh := make(chan struct{}) - if err = cert.ManageWebhookCerts(mgr, operatorCfg.Server.Webhooks.ServerCertDir, operatorCfg.Authorizer.Enabled, webhookCertsReadyCh); err != nil { + if err = cert.ManageWebhookCerts(mgr, operatorConfig.Server.Webhooks.ServerCertDir, operatorConfig.Authorizer.Enabled, webhookCertsReadyCh); err != nil { logger.Error(err, "failed to setup cert rotation") - os.Exit(1) + handleErrorAndExit(err, cli.ExitErrInitializeManager) } if err = grovectrl.SetupHealthAndReadinessEndpoints(mgr, webhookCertsReadyCh); err != nil { logger.Error(err, "failed to set up health and readiness for grove controller manager") - os.Exit(1) + handleErrorAndExit(err, cli.ExitErrInitializeManager) } // Certificates need to be generated before the webhooks are started, which can only happen once the manager is started. // Block while generating the certificates, and then start the webhooks. go func() { - if err = grovectrl.RegisterControllersAndWebhooks(mgr, logger, operatorCfg, webhookCertsReadyCh); err != nil { + if err = grovectrl.RegisterControllersAndWebhooks(mgr, logger, operatorConfig, webhookCertsReadyCh); err != nil { logger.Error(err, "failed to initialize grove controller manager") - os.Exit(1) + handleErrorAndExit(err, cli.ExitErrInitializeManager) } }() logger.Info("Starting manager") if err = mgr.Start(ctx); err != nil { - logger.Error(err, "Error running manager") - os.Exit(1) + logger.Error(err, "Error starting controller manager") + handleErrorAndExit(err, cli.ExitErrStart) } } -func initializeOperatorConfig(cliOpts *groveopts.CLIOptions) (*configv1alpha1.OperatorConfiguration, error) { - // complete and validate operator configuration - if err := cliOpts.Complete(); err != nil { - return nil, err +func synchronizeTopology(ctx context.Context, mgr ctrl.Manager, operatorCfg *configv1alpha1.OperatorConfiguration) error { + cl, err := client.New(mgr.GetConfig(), client.Options{Scheme: mgr.GetScheme()}) + if err != nil { + return fmt.Errorf("failed to create client for topology synchronization: %w", err) } - if err := cliOpts.Validate(); err != nil { - return nil, err + if !operatorCfg.ClusterTopology.Enabled { + logger.Info("cluster topology is disabled, deleting existing ClusterTopology if any") + return clustertopology.DeleteClusterTopology(ctx, cl, corev1alpha1.DefaultClusterTopologyName) } - return cliOpts.Config, nil + // create or update ClusterTopology based on configuration + clusterTopology, err := clustertopology.EnsureClusterTopology(ctx, cl, logger, corev1alpha1.DefaultClusterTopologyName, operatorCfg.ClusterTopology.Levels) + if err != nil { + return err + } + // create or update KAI Topology based on ClusterTopology + return clustertopology.EnsureKAITopology(ctx, cl, logger, corev1alpha1.DefaultClusterTopologyName, clusterTopology) } func printFlags() { @@ -119,14 +135,11 @@ func printFlags() { logger.Info("Running with flags", flagKVs...) 
} -func validateClusterTopology(ctx context.Context, reader client.Reader, clusterTopologyConfig configv1alpha1.ClusterTopologyConfiguration) error { - if !clusterTopologyConfig.Enabled { - return nil - } - var clusterTopology grovecorev1alpha1.ClusterTopology - if err := reader.Get(ctx, types.NamespacedName{Name: clusterTopologyConfig.Name}, &clusterTopology); err != nil { - return fmt.Errorf("failed to fetch ClusterTopology %s: %w", clusterTopologyConfig.Name, err) +// handleErrorAndExit gracefully handles errors before exiting the program. +func handleErrorAndExit(err error, exitCode int) { + if errors.Is(err, pflag.ErrHelp) { + os.Exit(cli.ExitSuccess) } - logger.Info("topology validated successfully", "cluster topology", clusterTopologyConfig.Name) - return nil + _, _ = fmt.Fprintf(os.Stderr, "Err: %v\n", err) + os.Exit(exitCode) } diff --git a/operator/cmd/opts/options.go b/operator/cmd/opts/options.go deleted file mode 100644 index 6f69608d1..000000000 --- a/operator/cmd/opts/options.go +++ /dev/null @@ -1,80 +0,0 @@ -// /* -// Copyright 2024 The Grove Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// */ - -package opts - -import ( - "fmt" - "os" - - configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1" - operatorvalidation "github.com/ai-dynamo/grove/operator/api/config/validation" - - "github.com/spf13/pflag" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/runtime/serializer" - utilruntime "k8s.io/apimachinery/pkg/util/runtime" -) - -var configDecoder runtime.Decoder - -func init() { - configScheme := runtime.NewScheme() - utilruntime.Must(configv1alpha1.AddToScheme(configScheme)) - configDecoder = serializer.NewCodecFactory(configScheme).UniversalDecoder() -} - -// CLIOptions provides convenience abstraction to initialize and validate OperatorConfiguration from CLI flags. -type CLIOptions struct { - configFile string - // Config is the operator configuration initialized from the CLI flags. - Config *configv1alpha1.OperatorConfiguration -} - -// NewCLIOptions creates a new CLIOptions and adds the required CLI flags to the flag.flagSet. -func NewCLIOptions(fs *pflag.FlagSet) *CLIOptions { - cliOpts := &CLIOptions{} - cliOpts.addFlags(fs) - return cliOpts -} - -// Complete reads the configuration file and decodes it into an OperatorConfiguration. -func (o *CLIOptions) Complete() error { - if len(o.configFile) == 0 { - return fmt.Errorf("missing config file") - } - data, err := os.ReadFile(o.configFile) - if err != nil { - return fmt.Errorf("error reading config file: %w", err) - } - o.Config = &configv1alpha1.OperatorConfiguration{} - if err = runtime.DecodeInto(configDecoder, data, o.Config); err != nil { - return fmt.Errorf("error decoding config: %w", err) - } - return nil -} - -// Validate validates the created OperatorConfiguration. 
-func (o *CLIOptions) Validate() error { - if errs := operatorvalidation.ValidateOperatorConfiguration(o.Config); errs != nil { - return errs.ToAggregate() - } - return nil -} - -func (o *CLIOptions) addFlags(fs *pflag.FlagSet) { - fs.StringVar(&o.configFile, "config", o.configFile, "Path to configuration file.") -} diff --git a/operator/e2e/setup/k8s_clusters.go b/operator/e2e/setup/k8s_clusters.go index df98843ef..dfa45dcf7 100644 --- a/operator/e2e/setup/k8s_clusters.go +++ b/operator/e2e/setup/k8s_clusters.go @@ -1031,7 +1031,7 @@ func buildLDFlagsForE2E() string { // Build the ldflags string // Note: The module path changed from github.com/NVIDIA to github.com/ai-dynamo - ldflags := fmt.Sprintf("-X github.com/ai-dynamo/grove/operator/internal/version.gitCommit=%s -X github.com/ai-dynamo/grove/operator/internal/version.gitTreeState=%s -X github.com/ai-dynamo/grove/operator/internal/version.buildDate=%s -X github.com/ai-dynamo/grove/operator/internal/version.gitVersion=E2E_TESTS -X github.com/ai-dynamo/grove/operator/internal/version.programName=grove-operator", + ldflags := fmt.Sprintf("-X github.com/ai-dynamo/grove/operator/internal/version.gitCommit=%s -X github.com/ai-dynamo/grove/operator/internal/version.gitTreeState=%s -X github.com/ai-dynamo/grove/operator/internal/version.buildDate=%s -X github.com/ai-dynamo/grove/operator/internal/version.gitVersion=E2E_TESTS", gitCommit, treeState, buildDate) return ldflags diff --git a/operator/e2e/setup/skaffold.go b/operator/e2e/setup/skaffold.go index bc61f40c4..782a29d3a 100644 --- a/operator/e2e/setup/skaffold.go +++ b/operator/e2e/setup/skaffold.go @@ -25,6 +25,7 @@ import ( "os/exec" "path/filepath" "strings" + "time" "github.com/ai-dynamo/grove/operator/e2e/utils" "k8s.io/client-go/rest" @@ -144,7 +145,11 @@ func runSkaffoldBuild(ctx context.Context, absSkaffoldPath, skaffoldDir, kubecon // Add the skaffold.yaml path args = append(args, "-f", absSkaffoldPath) - cmd := exec.CommandContext(ctx, "skaffold", args...) + // Add timeout to prevent indefinite hangs (normal builds should complete in <10 minutes) + buildCtx, cancel := context.WithTimeout(ctx, 15*time.Minute) + defer cancel() + + cmd := exec.CommandContext(buildCtx, "skaffold", args...) 
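+	// exec.CommandContext kills the skaffold process if buildCtx is cancelled or times out
+	// before the build completes on its own.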
cmd.Dir = skaffoldDir // Set up environment variables diff --git a/operator/go.mod b/operator/go.mod index 79779c20e..03ba96fa1 100644 --- a/operator/go.mod +++ b/operator/go.mod @@ -1,8 +1,9 @@ module github.com/ai-dynamo/grove/operator -go 1.24.0 +go 1.24.5 require ( + github.com/NVIDIA/KAI-scheduler v0.12.0 github.com/ai-dynamo/grove/operator/api v0.0.0 github.com/ai-dynamo/grove/scheduler/api v0.0.0 github.com/docker/docker v28.3.3+incompatible @@ -53,7 +54,7 @@ require ( github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/dimchansky/utfbom v1.1.1 // indirect github.com/distribution/reference v0.6.0 // indirect - github.com/docker/cli v27.0.3+incompatible // indirect + github.com/docker/cli v27.1.1+incompatible // indirect github.com/docker/distribution v2.8.2+incompatible // indirect github.com/docker/docker-credential-helpers v0.8.2 // indirect github.com/docker/go v1.5.1-1.0.20160303222718-d30aec9fd63c // indirect @@ -82,7 +83,7 @@ require ( github.com/google/btree v1.1.3 // indirect github.com/google/gnostic-models v0.7.0 // indirect github.com/google/go-cmp v0.7.0 // indirect - github.com/google/go-containerregistry v0.19.1 // indirect + github.com/google/go-containerregistry v0.20.2 // indirect github.com/google/uuid v1.6.0 // indirect github.com/gorilla/mux v1.8.1 // indirect github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 // indirect @@ -129,16 +130,15 @@ require ( github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f // indirect - github.com/onsi/ginkgo/v2 v2.23.4 // indirect github.com/opencontainers/go-digest v1.0.0 // indirect github.com/opencontainers/image-spec v1.1.1 // indirect github.com/pelletier/go-toml/v2 v2.2.3 // indirect github.com/peterbourgon/diskv v2.0.1+incompatible // indirect github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect - github.com/prometheus/client_golang v1.23.0 // indirect + github.com/prometheus/client_golang v1.23.2 // indirect github.com/prometheus/client_model v0.6.2 // indirect - github.com/prometheus/common v0.65.0 // indirect + github.com/prometheus/common v0.66.1 // indirect github.com/prometheus/procfs v0.17.0 // indirect github.com/rancher/wharfie v0.6.2 // indirect github.com/rubenv/sql-migrate v1.8.0 // indirect @@ -161,7 +161,7 @@ require ( github.com/xeipuuv/gojsonschema v1.2.0 // indirect github.com/xlab/treeprint v1.2.0 // indirect go.opentelemetry.io/auto/sdk v1.1.0 // indirect - go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0 // indirect + go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.59.0 // indirect go.opentelemetry.io/otel v1.35.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.32.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.34.0 // indirect @@ -187,9 +187,9 @@ require ( golang.org/x/time v0.12.0 // indirect gomodules.xyz/jsonpatch/v2 v2.5.0 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20250303144028-a0af3efb3deb // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20250303144028-a0af3efb3deb // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20250313205543-e70fdf4c4cb4 // indirect google.golang.org/grpc v1.72.1 // indirect - google.golang.org/protobuf v1.36.7 // indirect + google.golang.org/protobuf v1.36.8 
// indirect gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/ini.v1 v1.67.0 // indirect @@ -198,7 +198,7 @@ require ( k8s.io/apiserver v0.34.2 // indirect k8s.io/component-base v0.34.2 // indirect k8s.io/kube-openapi v0.0.0-20250814151709-d7b6acb124c3 // indirect - k8s.io/kubectl v0.34.0 // indirect + k8s.io/kubectl v0.34.1 // indirect oras.land/oras-go/v2 v2.6.0 // indirect sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect sigs.k8s.io/kustomize/api v0.20.1 // indirect diff --git a/operator/go.sum b/operator/go.sum index e4bc0d7f0..03c075fdf 100644 --- a/operator/go.sum +++ b/operator/go.sum @@ -24,6 +24,8 @@ github.com/Masterminds/squirrel v1.5.4 h1:uUcX/aBc8O7Fg9kaISIUsHXdKuqehiXAMQTYX8 github.com/Masterminds/squirrel v1.5.4/go.mod h1:NNaOrjSoIDfDA40n7sr2tPNZRfjzjA400rg+riTZj10= github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY= github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU= +github.com/NVIDIA/KAI-scheduler v0.12.0 h1:3rhB0TdbJYNgOiZ6E0cAa+VJnr4srhoigJI/PkY85vI= +github.com/NVIDIA/KAI-scheduler v0.12.0/go.mod h1:56asdZ2ewWn2ALmPM93zxxGWi+8MX3Q0aRlvv2ytfuA= github.com/Shopify/logrus-bugsnag v0.0.0-20170309145241-6dbc35f2c30d/go.mod h1:HI8ITrYtUY+O+ZhtlqUnD8+KwNPOyugEhfP9fdUIaEQ= github.com/Shopify/logrus-bugsnag v0.0.0-20171204204709-577dee27f20d h1:UrqY+r/OJnIp5u0s1SbQ8dVfLCZJsnvazdBP5hS4iRs= github.com/Shopify/logrus-bugsnag v0.0.0-20171204204709-577dee27f20d/go.mod h1:HI8ITrYtUY+O+ZhtlqUnD8+KwNPOyugEhfP9fdUIaEQ= @@ -97,8 +99,8 @@ github.com/distribution/reference v0.6.0 h1:0IXCQ5g4/QMHHkarYzh5l+u8T3t73zM5Qvfr github.com/distribution/reference v0.6.0/go.mod h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E= github.com/dlclark/regexp2 v1.11.0 h1:G/nrcoOa7ZXlpoa/91N3X7mM3r8eIlMBBJZvsz/mxKI= github.com/dlclark/regexp2 v1.11.0/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8= -github.com/docker/cli v27.0.3+incompatible h1:usGs0/BoBW8MWxGeEtqPMkzOY56jZ6kYlSN5BLDioCQ= -github.com/docker/cli v27.0.3+incompatible/go.mod h1:JLrzqnKDaYBop7H2jaqPtU4hHvMKP+vjCwu2uszcLI8= +github.com/docker/cli v27.1.1+incompatible h1:goaZxOqs4QKxznZjjBWKONQci/MywhtRv2oNn0GkeZE= +github.com/docker/cli v27.1.1+incompatible/go.mod h1:JLrzqnKDaYBop7H2jaqPtU4hHvMKP+vjCwu2uszcLI8= github.com/docker/distribution v2.7.1+incompatible/go.mod h1:J2gT2udsDAN96Uj4KfcMRqY0/ypR+oyYUYmja8H+y+w= github.com/docker/distribution v2.8.2+incompatible h1:T3de5rq0dB1j30rp0sA2rER+m322EBzniBPB6ZIzuh8= github.com/docker/distribution v2.8.2+incompatible/go.mod h1:J2gT2udsDAN96Uj4KfcMRqY0/ypR+oyYUYmja8H+y+w= @@ -197,13 +199,13 @@ github.com/google/gnostic-models v0.7.0/go.mod h1:whL5G0m6dmc5cPxKc5bdKdEN3UjI7O github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= -github.com/google/go-containerregistry v0.19.1 h1:yMQ62Al6/V0Z7CqIrrS1iYoA5/oQCm88DeNujc7C1KY= -github.com/google/go-containerregistry v0.19.1/go.mod h1:YCMFNQeeXeLF+dnhhWkqDItx/JSkH01j1Kis4PsjzFI= +github.com/google/go-containerregistry v0.20.2 h1:B1wPJ1SN/S7pB+ZAimcciVD+r+yV/l/DSArMxlbwseo= +github.com/google/go-containerregistry v0.20.2/go.mod h1:z38EKdKh4h7IP2gSfUUqEvalZBqs6AoLeWfUy34nQC8= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/gofuzz v1.2.0 
h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= -github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 h1:BHT72Gu3keYf3ZEu2J0b1vyeLSOYI8bm5wbJM/8yDe8= -github.com/google/pprof v0.0.0-20250403155104-27863c87afa6/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA= +github.com/google/pprof v0.0.0-20250820193118-f64d9cf942d6 h1:EEHtgt9IwisQ2AZ4pIsMjahcegHh6rmhqxzIRQIyepY= +github.com/google/pprof v0.0.0-20250820193118-f64d9cf942d6/go.mod h1:I6V7YzU0XDpsHqbsyrghnFZLO1gwK6NPTNvmetQIk9U= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/gorilla/handlers v1.5.2 h1:cLTUSsNkgcwhgRqvCNmdbRWG0A3N4F+M2nWKdScwyEE= @@ -356,13 +358,14 @@ github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f h1:y5//uYreIhSUg3J github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw= github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= +github.com/onsi/ginkgo v1.12.0 h1:Iw5WCbBcaAAd0fpRb1c9r5YCylv4XDoCSigm1zLevwU= github.com/onsi/ginkgo v1.12.0/go.mod h1:oUhWkIvk5aDxtKvDDuw8gItl8pKl42LzjC9KZE0HfGg= -github.com/onsi/ginkgo/v2 v2.23.4 h1:ktYTpKJAVZnDT4VjxSbiBenUjmlL/5QkBEocaWXiQus= -github.com/onsi/ginkgo/v2 v2.23.4/go.mod h1:Bt66ApGPBFzHyR+JO10Zbt0Gsp4uWxu5mIOTusL46e8= +github.com/onsi/ginkgo/v2 v2.25.3 h1:Ty8+Yi/ayDAGtk4XxmmfUy4GabvM+MegeB4cDLRi6nw= +github.com/onsi/ginkgo/v2 v2.25.3/go.mod h1:43uiyQC4Ed2tkOzLsEYm7hnrb7UJTWHYNsuy3bG/snE= github.com/onsi/gomega v1.7.1/go.mod h1:XdKZgCCFLUoM/7CFJVPcG8C1xQ1AJ0vpAezJrB7JYyY= github.com/onsi/gomega v1.9.0/go.mod h1:Ho0h+IUsWyvy1OpqCwxlQ/21gkhVunqlU8fDGcoTdcA= -github.com/onsi/gomega v1.38.0 h1:c/WX+w8SLAinvuKKQFh77WEucCnPk4j2OTUr7lt7BeY= -github.com/onsi/gomega v1.38.0/go.mod h1:OcXcwId0b9QsE7Y49u+BTrL4IdKOBOKnD6VQNTJEB6o= +github.com/onsi/gomega v1.38.2 h1:eZCjf2xjZAqe+LeWvKb5weQ+NcPwX84kqJ0cZNxok2A= +github.com/onsi/gomega v1.38.2/go.mod h1:W2MJcYxRGV63b418Ai34Ud0hEdTVXq9NW9+Sx6uXf3k= github.com/open-policy-agent/cert-controller v0.14.0 h1:TPc19BOHOs4tARruTT5o4bzir7Ed6FF+j3EXP/nmZBs= github.com/open-policy-agent/cert-controller v0.14.0/go.mod h1:UhE/FU54DnKo+Rt0Yf3r+oKjgy6kqSH8Vsjo+5bGrSo= github.com/open-policy-agent/frameworks/constraint v0.0.0-20241101234656-e78c8abd754a h1:gQtOJ50XFyL2Xh3lDD9zP4KQ2PY4mZKQ9hDcWc81Sp8= @@ -394,8 +397,8 @@ github.com/prometheus/client_golang v0.9.0-pre1.0.20180209125602-c332b6f63c06/go github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw= github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo= github.com/prometheus/client_golang v1.1.0/go.mod h1:I1FGZT9+L76gKKOs5djB6ezCbFQP1xR9D75/vuwEF3g= -github.com/prometheus/client_golang v1.23.0 h1:ust4zpdl9r4trLY/gSjlm07PuiBq2ynaXXlptpfy8Uc= -github.com/prometheus/client_golang v1.23.0/go.mod h1:i/o0R9ByOnHX0McrTMTyhYvKE4haaf2mW08I+jGAjEE= +github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= +github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= github.com/prometheus/client_model v0.0.0-20171117100541-99fa1f4be8e5/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= github.com/prometheus/client_model 
v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= @@ -404,8 +407,8 @@ github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvM github.com/prometheus/common v0.0.0-20180110214958-89604d197083/go.mod h1:daVV7qP5qjZbuso7PdcryaAu0sAZbrN9i7WWcTMWvro= github.com/prometheus/common v0.4.1/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4= github.com/prometheus/common v0.6.0/go.mod h1:eBmuwkDJBwy6iBfxCBob6t6dR6ENT/y+J+Zk0j9GMYc= -github.com/prometheus/common v0.65.0 h1:QDwzd+G1twt//Kwj/Ww6E9FQq1iVMmODnILtW1t2VzE= -github.com/prometheus/common v0.65.0/go.mod h1:0gZns+BLRQ3V6NdaerOhMbwwRbNh9hkGINtQAsP5GS8= +github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs= +github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA= github.com/prometheus/procfs v0.0.0-20180125133057-cb4147076ac7/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA= @@ -508,8 +511,8 @@ go.opentelemetry.io/contrib/bridges/prometheus v0.57.0 h1:UW0+QyeyBVhn+COBec3nGh go.opentelemetry.io/contrib/bridges/prometheus v0.57.0/go.mod h1:ppciCHRLsyCio54qbzQv0E4Jyth/fLWDTJYfvWpcSVk= go.opentelemetry.io/contrib/exporters/autoexport v0.57.0 h1:jmTVJ86dP60C01K3slFQa2NQ/Aoi7zA+wy7vMOKD9H4= go.opentelemetry.io/contrib/exporters/autoexport v0.57.0/go.mod h1:EJBheUMttD/lABFyLXhce47Wr6DPWYReCzaZiXadH7g= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0 h1:yd02MEjBdJkG3uabWP9apV+OuWRIXGDuJEUJbOHmCFU= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0/go.mod h1:umTcuxiv1n/s/S6/c2AT/g2CQ7u5C59sHDNmfSwgz7Q= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.59.0 h1:CV7UdSGJt/Ao6Gp4CXckLxVRRsRgDHoI8XjbL3PDl8s= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.59.0/go.mod h1:FRmFuRJfag1IZ2dPkHnEoSFVgTVPUd2qf5Vi69hLb8I= go.opentelemetry.io/otel v1.35.0 h1:xKWKPxrxB6OtMCbmMY021CqC45J+3Onta9MqjhnusiQ= go.opentelemetry.io/otel v1.35.0/go.mod h1:UEqy8Zp11hpkUrL73gSlELM0DupHoiq72dR+Zqel/+Y= go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploggrpc v0.8.0 h1:WzNab7hOOLzdDF/EoWCt4glhrbMPVMOO5JYTmpz36Ls= @@ -637,13 +640,13 @@ gomodules.xyz/jsonpatch/v2 v2.5.0 h1:JELs8RLM12qJGXU4u/TO3V25KW8GreMKl9pdkk14RM0 gomodules.xyz/jsonpatch/v2 v2.5.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= google.golang.org/genproto/googleapis/api v0.0.0-20250303144028-a0af3efb3deb h1:p31xT4yrYrSM/G4Sn2+TNUkVhFCbG9y8itM2S6Th950= google.golang.org/genproto/googleapis/api v0.0.0-20250303144028-a0af3efb3deb/go.mod h1:jbe3Bkdp+Dh2IrslsFCklNhweNTBgSYanP1UXhJDhKg= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250303144028-a0af3efb3deb h1:TLPQVbx1GJ8VKZxz52VAxl1EBgKXXbTiU9Fc5fZeLn4= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250303144028-a0af3efb3deb/go.mod h1:LuRYeWDFV6WOn90g357N17oMCaxpgCnbi/44qJvDn2I= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250313205543-e70fdf4c4cb4 h1:iK2jbkWL86DXjEx0qiHcRE9dE4/Ahua5k6V8OWFb//c= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250313205543-e70fdf4c4cb4/go.mod h1:LuRYeWDFV6WOn90g357N17oMCaxpgCnbi/44qJvDn2I= google.golang.org/grpc v1.0.5/go.mod 
h1:yo6s7OP7yaDglbqo1J04qKzAhqBH6lvTonzMVmEdcZw= google.golang.org/grpc v1.72.1 h1:HR03wO6eyZ7lknl75XlxABNVLLFc2PAb6mHlYh756mA= google.golang.org/grpc v1.72.1/go.mod h1:wH5Aktxcg25y1I3w7H69nHfXdOG3UiadoBtjh3izSDM= -google.golang.org/protobuf v1.36.7 h1:IgrO7UwFQGJdRNXH/sQux4R1Dj1WAKcLElzeeRaXV2A= -google.golang.org/protobuf v1.36.7/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= +google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc= +google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU= gopkg.in/airbrake/gobrake.v2 v2.0.9/go.mod h1:/h5ZAUhDkGaJfjzjKLSjv6zCL6O0LLBxU4K+aSYdM/U= gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= gopkg.in/cenkalti/backoff.v2 v2.2.1 h1:eJ9UAg01/HIHG987TwxvnzK2MgxXq97YY6rYDpY9aII= @@ -694,12 +697,12 @@ k8s.io/component-base v0.34.2 h1:HQRqK9x2sSAsd8+R4xxRirlTjowsg6fWCPwWYeSvogQ= k8s.io/component-base v0.34.2/go.mod h1:9xw2FHJavUHBFpiGkZoKuYZ5pdtLKe97DEByaA+hHbM= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= -k8s.io/kube-aggregator v0.33.4 h1:TdIJKHb0/bLpby7FblXIaVEzyA1jGEjzt/n9cRvwq8U= -k8s.io/kube-aggregator v0.33.4/go.mod h1:wZuctdRvGde5bwzxkZRs0GYj2KOpCNgx8rRGVoNb62k= +k8s.io/kube-aggregator v0.34.1 h1:WNLV0dVNoFKmuyvdWLd92iDSyD/TSTjqwaPj0U9XAEU= +k8s.io/kube-aggregator v0.34.1/go.mod h1:RU8j+5ERfp0h+gIvWtxRPfsa5nK7rboDm8RST8BJfYQ= k8s.io/kube-openapi v0.0.0-20250814151709-d7b6acb124c3 h1:liMHz39T5dJO1aOKHLvwaCjDbf07wVh6yaUlTpunnkE= k8s.io/kube-openapi v0.0.0-20250814151709-d7b6acb124c3/go.mod h1:UZ2yyWbFTpuhSbFhv24aGNOdoRdJZgsIObGBUaYVsts= -k8s.io/kubectl v0.34.0 h1:NcXz4TPTaUwhiX4LU+6r6udrlm0NsVnSkP3R9t0dmxs= -k8s.io/kubectl v0.34.0/go.mod h1:bmd0W5i+HuG7/p5sqicr0Li0rR2iIhXL0oUyLF3OjR4= +k8s.io/kubectl v0.34.1 h1:1qP1oqT5Xc93K+H8J7ecpBjaz511gan89KO9Vbsh/OI= +k8s.io/kubectl v0.34.1/go.mod h1:JRYlhJpGPyk3dEmJ+BuBiOB9/dAvnrALJEiY/C5qa6A= k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 h1:SjGebBtkBqHFOli+05xYbK8YF1Dzkbzn+gDM4X9T4Ck= k8s.io/utils v0.0.0-20251002143259-bc988d571ff4/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= oras.land/oras-go/v2 v2.6.0 h1:X4ELRsiGkrbeox69+9tzTu492FMUu7zJQW6eJU+I2oc= diff --git a/operator/hack/install-helm-charts.sh b/operator/hack/install-helm-charts.sh deleted file mode 100755 index 0f85dba5e..000000000 --- a/operator/hack/install-helm-charts.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env bash -# /* -# Copyright 2024 The Grove Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# */ - - -set -o errexit -set -o nounset -set -o pipefail - - -function check_prerequisites() { - if ! command -v helm &> /dev/null; then - echo "helm is not installed. 
Please install helm from https://helm.sh/docs/intro/install" - exit 1 - fi -} - -function verify_and_install_charts() { - -} - diff --git a/operator/initc/cmd/main.go b/operator/initc/cmd/main.go index f0ea4fabf..cae3e006f 100644 --- a/operator/initc/cmd/main.go +++ b/operator/initc/cmd/main.go @@ -26,7 +26,6 @@ import ( "github.com/ai-dynamo/grove/operator/initc/cmd/opts" "github.com/ai-dynamo/grove/operator/initc/internal" "github.com/ai-dynamo/grove/operator/internal/logger" - "github.com/ai-dynamo/grove/operator/internal/version" ) var ( @@ -41,9 +40,8 @@ func main() { log.Error(err, "Failed to generate configuration for the init container from the flags") os.Exit(1) } - version.PrintVersionAndExitIfRequested() - log.Info("Starting grove init container", "version", version.Get()) + log.Info("Starting grove init container") podCliqueDependencies, err := config.GetPodCliqueDependencies() if err != nil { diff --git a/operator/initc/cmd/opts/options.go b/operator/initc/cmd/opts/options.go index 3759b3dc9..76125c96d 100644 --- a/operator/initc/cmd/opts/options.go +++ b/operator/initc/cmd/opts/options.go @@ -23,7 +23,6 @@ import ( grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" groveerr "github.com/ai-dynamo/grove/operator/internal/errors" - "github.com/ai-dynamo/grove/operator/internal/version" "github.com/spf13/pflag" ) @@ -43,11 +42,10 @@ type CLIOptions struct { } // RegisterFlags registers all the flags that are defined for the init container. -func (c *CLIOptions) RegisterFlags(fs *pflag.FlagSet) { +func (c *CLIOptions) RegisterFlags() { // --podcliques=: // --podcliques=podclique-a:3 --podcliques=podclique-b:4 and so on for each PodClique. pflag.StringArrayVarP(&c.podCliques, "podcliques", "p", nil, "podclique name and minAvailable replicas seperated by comma, repeated for each podclique") - version.AddFlags(fs) } // GetPodCliqueDependencies returns the PodClique information as a map with the minAvailable associated with each PodClique name. @@ -91,7 +89,7 @@ func InitializeCLIOptions() (CLIOptions, error) { config := CLIOptions{ podCliques: make([]string, 0), } - config.RegisterFlags(pflag.CommandLine) + config.RegisterFlags() pflag.Parse() return config, nil } diff --git a/operator/internal/client/scheme.go b/operator/internal/client/scheme.go index ca4bd021d..8b43467d8 100644 --- a/operator/internal/client/scheme.go +++ b/operator/internal/client/scheme.go @@ -20,6 +20,7 @@ import ( configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1" grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" + kaitopologyv1alpha1 "github.com/NVIDIA/KAI-scheduler/pkg/apis/kai/v1alpha1" schedv1alpha1 "github.com/ai-dynamo/grove/scheduler/api/core/v1alpha1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" @@ -35,6 +36,7 @@ func init() { configv1alpha1.AddToScheme, grovecorev1alpha1.AddToScheme, schedv1alpha1.AddToScheme, + kaitopologyv1alpha1.AddToScheme, k8sscheme.AddToScheme, ) utilruntime.Must(metav1.AddMetaToScheme(Scheme)) diff --git a/operator/internal/clustertopology/clustertopology.go b/operator/internal/clustertopology/clustertopology.go new file mode 100644 index 000000000..137b56324 --- /dev/null +++ b/operator/internal/clustertopology/clustertopology.go @@ -0,0 +1,154 @@ +// /* +// Copyright 2025 The Grove Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// */ + +package clustertopology + +import ( + "context" + "fmt" + "reflect" + + corev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" + + kaitopologyv1alpha1 "github.com/NVIDIA/KAI-scheduler/pkg/apis/kai/v1alpha1" + "github.com/go-logr/logr" + "github.com/samber/lo" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" +) + +// EnsureClusterTopology ensures that the ClusterTopology is created or updated in the cluster. +func EnsureClusterTopology(ctx context.Context, cl client.Client, logger logr.Logger, name string, topologyLevels []corev1alpha1.TopologyLevel) (*corev1alpha1.ClusterTopology, error) { + desiredTopology := buildClusterTopology(name, topologyLevels) + existingTopology := &corev1alpha1.ClusterTopology{} + err := cl.Get(ctx, client.ObjectKey{Name: name}, existingTopology) + if err != nil { + // If not found, create a new ClusterTopology + if apierrors.IsNotFound(err) { + if err = cl.Create(ctx, desiredTopology); err != nil { + return nil, fmt.Errorf("failed to create ClusterTopology %s: %w", name, err) + } + logger.Info("Created ClusterTopology", "name", name) + return desiredTopology, nil + } + return nil, fmt.Errorf("failed to get ClusterTopology %s: %w", name, err) + } + + // Update existing ClusterTopology if there are changes + if isClusterTopologyChanged(existingTopology, desiredTopology) { + existingTopology.Spec = desiredTopology.Spec + if err = cl.Update(ctx, existingTopology); err != nil { + return nil, fmt.Errorf("failed to update ClusterTopology %s: %w", name, err) + } + logger.Info("Updated ClusterTopology successfully", "name", name) + } + return existingTopology, nil +} + +// EnsureKAITopology ensures that the corresponding KAI ClusterTopology resource is created. +func EnsureKAITopology(ctx context.Context, cl client.Client, logger logr.Logger, name string, clusterTopology *corev1alpha1.ClusterTopology) error { + desiredTopology, err := buildKAITopology(name, clusterTopology, cl.Scheme()) + if err != nil { + return fmt.Errorf("failed to build KAI Topology: %w", err) + } + existingTopology := &kaitopologyv1alpha1.Topology{} + if err = cl.Get(ctx, client.ObjectKey{Name: name}, existingTopology); err != nil { + if apierrors.IsNotFound(err) { + if err = cl.Create(ctx, desiredTopology); err != nil { + return fmt.Errorf("failed to create KAI Topology %s: %w", name, err) + } + logger.Info("Created KAI Topology", "name", name) + return nil + } + return fmt.Errorf("failed to get KAI Topology %s: %w", name, err) + } + + // If the existing KAI topology does not have passed in ClusterTopology as owner, then error out. + if !metav1.IsControlledBy(existingTopology, clusterTopology) { + return fmt.Errorf("KAI Topology %s is not owned by ClusterTopology %s. 
It must be created by the Grove operator and have the ClusterTopology set as its controlling owner", name, clusterTopology.Name)
+	}
+
+	if isKAITopologyChanged(existingTopology, desiredTopology) {
+		// KAI Topology needs to be updated. Since KAI Topology has immutable levels, we need to delete and recreate it.
+		// See https://github.com/NVIDIA/KAI-Scheduler/blob/130ab4468f96b25b1899ad2eced0a072dff033e9/pkg/apis/kai/v1alpha1/topology_types.go#L60
+		if err = cl.Delete(ctx, existingTopology); err != nil {
+			return fmt.Errorf("failed to recreate (action: delete) existing KAI Topology %s: %w", name, err)
+		}
+		if err = cl.Create(ctx, desiredTopology); err != nil {
+			return fmt.Errorf("failed to recreate (action: create) KAI Topology %s: %w", name, err)
+		}
+		logger.Info("Recreated KAI Topology with updated levels", "name", name)
+	}
+	return nil
+}
+
+// DeleteClusterTopology deletes the ClusterTopology with the given name.
+func DeleteClusterTopology(ctx context.Context, cl client.Client, name string) error {
+	if err := client.IgnoreNotFound(cl.Delete(ctx, &corev1alpha1.ClusterTopology{
+		ObjectMeta: ctrl.ObjectMeta{
+			Name: name,
+		},
+	})); err != nil {
+		return fmt.Errorf("failed to delete ClusterTopology %s: %w", name, err)
+	}
+	return nil
+}
+
+func buildClusterTopology(name string, topologyLevels []corev1alpha1.TopologyLevel) *corev1alpha1.ClusterTopology {
+	sortedTopologyLevels := make([]corev1alpha1.TopologyLevel, len(topologyLevels))
+	copy(sortedTopologyLevels, topologyLevels)
+	corev1alpha1.SortTopologyLevels(sortedTopologyLevels)
+	return &corev1alpha1.ClusterTopology{
+		ObjectMeta: ctrl.ObjectMeta{
+			Name: name,
+		},
+		Spec: corev1alpha1.ClusterTopologySpec{
+			Levels: sortedTopologyLevels,
+		},
+	}
+}
+
+func isClusterTopologyChanged(oldTopology, newTopology *corev1alpha1.ClusterTopology) bool {
+	return !reflect.DeepEqual(oldTopology.Spec, newTopology.Spec)
+}
+
+func buildKAITopology(name string, clusterTopology *corev1alpha1.ClusterTopology, scheme *runtime.Scheme) (*kaitopologyv1alpha1.Topology, error) {
+	kaiTopologyLevels := lo.Map(clusterTopology.Spec.Levels, func(clusterTopologyLevel corev1alpha1.TopologyLevel, _ int) kaitopologyv1alpha1.TopologyLevel {
+		return kaitopologyv1alpha1.TopologyLevel{
+			NodeLabel: clusterTopologyLevel.Key,
+		}
+	})
+	kaiTopology := &kaitopologyv1alpha1.Topology{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: name,
+		},
+		Spec: kaitopologyv1alpha1.TopologySpec{
+			Levels: kaiTopologyLevels,
+		},
+	}
+	if err := controllerutil.SetControllerReference(clusterTopology, kaiTopology, scheme); err != nil {
+		return nil, fmt.Errorf("failed to set owner reference for KAI Topology: %w", err)
+	}
+	return kaiTopology, nil
+}
+
+func isKAITopologyChanged(oldTopology, newTopology *kaitopologyv1alpha1.Topology) bool {
+	return !reflect.DeepEqual(oldTopology.Spec.Levels, newTopology.Spec.Levels)
+}
diff --git a/operator/internal/clustertopology/clustertopology_test.go b/operator/internal/clustertopology/clustertopology_test.go
new file mode 100644
index 000000000..e4be57a21
--- /dev/null
+++ b/operator/internal/clustertopology/clustertopology_test.go
@@ -0,0 +1,774 @@
+// /*
+// Copyright 2025 The Grove Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// */ + +package clustertopology + +import ( + "context" + "errors" + "testing" + + apicommonconstants "github.com/ai-dynamo/grove/operator/api/common/constants" + corev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" + testutils "github.com/ai-dynamo/grove/operator/test/utils" + + kaitopologyv1alpha1 "github.com/NVIDIA/KAI-scheduler/pkg/apis/kai/v1alpha1" + "github.com/go-logr/logr" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/uuid" + "k8s.io/utils/ptr" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +const topologyName = "test-topology" + +func TestEnsureClusterTopologyWhenNonExists(t *testing.T) { + // Setup + ctx := context.Background() + cl := testutils.CreateDefaultFakeClient(nil) + logger := logr.Discard() + + topologyLevels := []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"}, + {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"}, + } + + // Test + topology, err := EnsureClusterTopology(ctx, cl, logger, topologyName, topologyLevels) + + // Assert + require.NoError(t, err) + assert.NotNil(t, topology) + assert.Equal(t, topologyName, topology.Name) + assert.Equal(t, topologyLevels, topology.Spec.Levels) + + // Verify it was actually created + fetched := &corev1alpha1.ClusterTopology{} + err = cl.Get(ctx, client.ObjectKey{Name: topologyName}, fetched) + require.NoError(t, err) + assert.Equal(t, topologyLevels, fetched.Spec.Levels) +} + +func TestEnsureClusterTopologyWhenUpdated(t *testing.T) { + // Setup + ctx := context.Background() + topologyLevels := []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"}, + {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"}, + } + // Create initial topology + existing := &corev1alpha1.ClusterTopology{ + ObjectMeta: metav1.ObjectMeta{ + Name: topologyName, + }, + Spec: corev1alpha1.ClusterTopologySpec{ + Levels: topologyLevels, + }, + } + cl := testutils.CreateDefaultFakeClient([]client.Object{existing}) + logger := logr.Discard() + + // Update with new levels and test + updatedTopologyLevels := []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainRegion, Key: "topology.kubernetes.io/region"}, + {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"}, + {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"}, + } + topology, err := EnsureClusterTopology(ctx, cl, logger, topologyName, updatedTopologyLevels) + + // Assert + require.NoError(t, err) + assert.NotNil(t, topology) + assert.Equal(t, updatedTopologyLevels, topology.Spec.Levels) + + // Verify it was actually updated + fetched := &corev1alpha1.ClusterTopology{} + err = cl.Get(ctx, client.ObjectKey{Name: topologyName}, fetched) + require.NoError(t, err) + assert.Equal(t, updatedTopologyLevels, fetched.Spec.Levels) +} + +func TestEnsureClusterTopologyWhenNoUpdate(t *testing.T) { + // Setup + 
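+	// Seeding ResourceVersion "1" lets the test verify below that no Update call was issued.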
ctx := context.Background() + topologyLevels := []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"}, + {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"}, + } + existing := &corev1alpha1.ClusterTopology{ + ObjectMeta: metav1.ObjectMeta{ + Name: topologyName, + ResourceVersion: "1", + }, + Spec: corev1alpha1.ClusterTopologySpec{ + Levels: topologyLevels, + }, + } + cl := testutils.CreateDefaultFakeClient([]client.Object{existing}) + logger := logr.Discard() + + // Test + topology, err := EnsureClusterTopology(ctx, cl, logger, topologyName, topologyLevels) + // Assert + require.NoError(t, err) + assert.NotNil(t, topology) + assert.Equal(t, topologyLevels, topology.Spec.Levels) + // Resource version should remain the same (no update occurred) + assert.Equal(t, "1", topology.ResourceVersion) +} + +func TestEnsureClusterTopologyWhenCreateError(t *testing.T) { + ctx := context.Background() + logger := logr.Discard() + topologyLevels := []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"}, + } + // Create a client that will fail on create + createErr := apierrors.NewInternalError(assert.AnError) + cl := testutils.NewTestClientBuilder(). + RecordErrorForObjects(testutils.ClientMethodCreate, createErr, client.ObjectKey{Name: topologyName}). + Build() + + topology, err := EnsureClusterTopology(ctx, cl, logger, topologyName, topologyLevels) + assert.Error(t, err) + assert.Nil(t, topology) + assert.True(t, errors.Is(err, createErr)) +} + +func TestEnsureClusterTopologyWhenUpdateError(t *testing.T) { + // Setup + ctx := context.Background() + topologyLevels := []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"}, + {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"}, + } + existing := &corev1alpha1.ClusterTopology{ + ObjectMeta: metav1.ObjectMeta{ + Name: topologyName, + }, + Spec: corev1alpha1.ClusterTopologySpec{ + Levels: topologyLevels, + }, + } + // Create a client that will fail on update + updateErr := apierrors.NewInternalError(assert.AnError) + cl := testutils.NewTestClientBuilder(). + WithObjects(existing). + RecordErrorForObjects(testutils.ClientMethodUpdate, updateErr, client.ObjectKey{Name: topologyName}). + Build() + logger := logr.Discard() + // Test + updatedTopologyLevels := []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainRegion, Key: "topology.kubernetes.io/region"}, + {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"}, + {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"}, + } + topology, err := EnsureClusterTopology(ctx, cl, logger, topologyName, updatedTopologyLevels) + assert.Error(t, err) + assert.Nil(t, topology) + assert.True(t, errors.Is(err, updateErr)) +} + +func TestEnsureClusterTopologyWhenGetError(t *testing.T) { + ctx := context.Background() + logger := logr.Discard() + + // Create a client that will fail on get (non-NotFound error) + getErr := apierrors.NewInternalError(assert.AnError) + cl := testutils.NewTestClientBuilder(). + RecordErrorForObjects(testutils.ClientMethodGet, getErr, client.ObjectKey{Name: topologyName}). 
+		Build()
+
+	topologyLevels := []corev1alpha1.TopologyLevel{
+		{Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"},
+	}
+	topology, err := EnsureClusterTopology(ctx, cl, logger, topologyName, topologyLevels)
+	assert.Error(t, err)
+	assert.Nil(t, topology)
+	assert.Contains(t, err.Error(), "failed to get ClusterTopology")
+}
+
+func TestEnsureKAITopologyWhenNonExists(t *testing.T) {
+	// Setup
+	ctx := context.Background()
+	cl := testutils.CreateDefaultFakeClient(nil)
+	logger := logr.Discard()
+
+	topologyLevels := []corev1alpha1.TopologyLevel{
+		{Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"},
+		{Domain: corev1alpha1.TopologyDomainRack, Key: "topology.kubernetes.io/rack"},
+		{Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"},
+	}
+	clusterTopology := &corev1alpha1.ClusterTopology{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: topologyName,
+			UID:  uuid.NewUUID(),
+		},
+		Spec: corev1alpha1.ClusterTopologySpec{
+			Levels: topologyLevels,
+		},
+	}
+
+	// Test
+	err := EnsureKAITopology(ctx, cl, logger, topologyName, clusterTopology)
+	require.NoError(t, err)
+
+	// Verify KAI Topology was created
+	kaiTopology := &kaitopologyv1alpha1.Topology{}
+	err = cl.Get(ctx, client.ObjectKey{Name: topologyName}, kaiTopology)
+	require.NoError(t, err)
+	assert.Equal(t, topologyName, kaiTopology.Name)
+	assert.Len(t, kaiTopology.Spec.Levels, 3)
+	assert.Equal(t, "topology.kubernetes.io/zone", kaiTopology.Spec.Levels[0].NodeLabel)
+	assert.Equal(t, "topology.kubernetes.io/rack", kaiTopology.Spec.Levels[1].NodeLabel)
+	assert.Equal(t, "kubernetes.io/hostname", kaiTopology.Spec.Levels[2].NodeLabel)
+
+	// Verify owner reference is set
+	assert.True(t, metav1.IsControlledBy(kaiTopology, clusterTopology))
+}
+
+func TestEnsureKAITopologyWhenClusterTopologyUpdated(t *testing.T) {
+	// Setup
+	ctx := context.Background()
+	topologyLevels := []corev1alpha1.TopologyLevel{
+		{Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"},
+		{Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"},
+	}
+	clusterTopology := &corev1alpha1.ClusterTopology{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: topologyName,
+			UID:  uuid.NewUUID(),
+		},
+		Spec: corev1alpha1.ClusterTopologySpec{
+			Levels: topologyLevels,
+		},
+	}
+
+	// Create existing KAI Topology with old levels
+	existingKAITopology := &kaitopologyv1alpha1.Topology{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: topologyName,
+			OwnerReferences: []metav1.OwnerReference{
+				{
+					APIVersion:         corev1alpha1.SchemeGroupVersion.String(),
+					Kind:               apicommonconstants.KindClusterTopology,
+					Name:               clusterTopology.Name,
+					UID:                clusterTopology.UID,
+					Controller:         ptr.To(true),
+					BlockOwnerDeletion: ptr.To(true),
+				},
+			},
+		},
+		Spec: kaitopologyv1alpha1.TopologySpec{
+			Levels: []kaitopologyv1alpha1.TopologyLevel{
+				{NodeLabel: "kubernetes.io/hostname"},
+			},
+		},
+	}
+
+	cl := testutils.CreateDefaultFakeClient([]client.Object{clusterTopology, existingKAITopology})
+	logger := logr.Discard()
+
+	err := EnsureKAITopology(ctx, cl, logger, topologyName, clusterTopology)
+	require.NoError(t, err)
+
+	// Verify KAI Topology was recreated with new levels
+	kaiTopology := &kaitopologyv1alpha1.Topology{}
+	err = cl.Get(ctx, client.ObjectKey{Name: topologyName}, kaiTopology)
+	require.NoError(t, err)
+	// The recreated object reports Generation 0 (the fake client's zero value on create), which confirms the resource was recreated rather than updated in place.
+ assert.Equal(t, int64(0), kaiTopology.Generation) + assert.Len(t, kaiTopology.Spec.Levels, 2) + assert.Equal(t, "topology.kubernetes.io/zone", kaiTopology.Spec.Levels[0].NodeLabel) + assert.Equal(t, "kubernetes.io/hostname", kaiTopology.Spec.Levels[1].NodeLabel) +} + +func TestEnsureKAITopologyWhenNoChangeRequired(t *testing.T) { + // Setup + ctx := context.Background() + topologyLevels := []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"}, + {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"}, + } + clusterTopology := &corev1alpha1.ClusterTopology{ + ObjectMeta: metav1.ObjectMeta{ + Name: topologyName, + UID: uuid.NewUUID(), + }, + Spec: corev1alpha1.ClusterTopologySpec{ + Levels: topologyLevels, + }, + } + + // Create existing KAI Topology with same levels + existingKAITopology := &kaitopologyv1alpha1.Topology{ + ObjectMeta: metav1.ObjectMeta{ + Name: topologyName, + ResourceVersion: "1", + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "grove.io/v1alpha1", + Kind: "ClusterTopology", + Name: clusterTopology.Name, + UID: clusterTopology.UID, + Controller: ptr.To(true), + BlockOwnerDeletion: ptr.To(true), + }, + }, + }, + Spec: kaitopologyv1alpha1.TopologySpec{ + Levels: []kaitopologyv1alpha1.TopologyLevel{ + {NodeLabel: "topology.kubernetes.io/zone"}, + {NodeLabel: "kubernetes.io/hostname"}, + }, + }, + } + + cl := testutils.CreateDefaultFakeClient([]client.Object{clusterTopology, existingKAITopology}) + logger := logr.Discard() + + err := EnsureKAITopology(ctx, cl, logger, topologyName, clusterTopology) + require.NoError(t, err) + + // Verify KAI Topology was NOT recreated (resource version should be same) + kaiTopology := &kaitopologyv1alpha1.Topology{} + err = cl.Get(ctx, client.ObjectKey{Name: topologyName}, kaiTopology) + require.NoError(t, err) + assert.Equal(t, int64(0), kaiTopology.Generation) + assert.Equal(t, "1", kaiTopology.ResourceVersion) +} + +func TestEnsureKAITopologyWithUnknownOwner(t *testing.T) { + // Setup + ctx := context.Background() + topologyLevels := []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"}, + {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"}, + } + clusterTopology := &corev1alpha1.ClusterTopology{ + ObjectMeta: metav1.ObjectMeta{ + Name: topologyName, + UID: uuid.NewUUID(), + }, + Spec: corev1alpha1.ClusterTopologySpec{ + Levels: topologyLevels, + }, + } + + // Create existing KAI Topology owned by different ClusterTopology + existingKAITopology := &kaitopologyv1alpha1.Topology{ + ObjectMeta: metav1.ObjectMeta{ + Name: topologyName, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "grove.io/v1alpha1", + Kind: "ClusterTopology", + Name: "different-owner", + UID: "different-uid", + Controller: ptr.To(true), + BlockOwnerDeletion: ptr.To(true), + }, + }, + }, + Spec: kaitopologyv1alpha1.TopologySpec{ + Levels: []kaitopologyv1alpha1.TopologyLevel{ + {NodeLabel: "topology.kubernetes.io/zone"}, + }, + }, + } + + cl := testutils.CreateDefaultFakeClient([]client.Object{clusterTopology, existingKAITopology}) + logger := logr.Discard() + // Test and Assert + err := EnsureKAITopology(ctx, cl, logger, topologyName, clusterTopology) + assert.Error(t, err) +} + +func TestEnsureKAITopologyWhenGetError(t *testing.T) { + // Setup + ctx := context.Background() + topologyLevels := []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainZone, Key: 
"topology.kubernetes.io/zone"}, + {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"}, + } + clusterTopology := &corev1alpha1.ClusterTopology{ + ObjectMeta: metav1.ObjectMeta{ + Name: topologyName, + UID: uuid.NewUUID(), + }, + Spec: corev1alpha1.ClusterTopologySpec{ + Levels: topologyLevels, + }, + } + + // Create a client that will fail on get (non-NotFound error) + getErr := apierrors.NewInternalError(assert.AnError) + cl := testutils.NewTestClientBuilder(). + WithObjects(clusterTopology). + RecordErrorForObjects(testutils.ClientMethodGet, getErr, client.ObjectKey{Name: topologyName}). + Build() + logger := logr.Discard() + // Test + err := EnsureKAITopology(ctx, cl, logger, topologyName, clusterTopology) + // Assert + assert.Error(t, err) + assert.True(t, errors.Is(err, getErr)) +} + +func TestEnsureKAITopologyWhenCreateError(t *testing.T) { + // Setup + ctx := context.Background() + topologyLevels := []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"}, + {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"}, + } + clusterTopology := &corev1alpha1.ClusterTopology{ + ObjectMeta: metav1.ObjectMeta{ + Name: topologyName, + UID: uuid.NewUUID(), + }, + Spec: corev1alpha1.ClusterTopologySpec{ + Levels: topologyLevels, + }, + } + + // Create a client that will fail on create + createErr := apierrors.NewInternalError(assert.AnError) + cl := testutils.NewTestClientBuilder(). + WithObjects(clusterTopology). + RecordErrorForObjects(testutils.ClientMethodCreate, createErr, client.ObjectKey{Name: topologyName}). + Build() + logger := logr.Discard() + + // Test + err := EnsureKAITopology(ctx, cl, logger, topologyName, clusterTopology) + // Assert + assert.Error(t, err) + assert.True(t, errors.Is(err, createErr)) +} + +func TestEnsureKAITopologyWhenDeleteError(t *testing.T) { + // Setup + ctx := context.Background() + // New topology levels that ClusterTopology will have + topologyLevels := []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"}, + {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"}, + } + clusterTopology := &corev1alpha1.ClusterTopology{ + ObjectMeta: metav1.ObjectMeta{ + Name: topologyName, + UID: uuid.NewUUID(), + }, + Spec: corev1alpha1.ClusterTopologySpec{ + Levels: topologyLevels, + }, + } + + // Create existing KAI Topology with OLD/DIFFERENT levels to trigger delete-and-recreate + existingKAITopology := &kaitopologyv1alpha1.Topology{ + ObjectMeta: metav1.ObjectMeta{ + Name: topologyName, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "grove.io/v1alpha1", + Kind: "ClusterTopology", + Name: clusterTopology.Name, + UID: clusterTopology.UID, + Controller: ptr.To(true), + BlockOwnerDeletion: ptr.To(true), + }, + }, + }, + Spec: kaitopologyv1alpha1.TopologySpec{ + Levels: []kaitopologyv1alpha1.TopologyLevel{ + {NodeLabel: "old-label"}, // Different from ClusterTopology levels + }, + }, + } + + // Create a client that will fail on delete + deleteErr := apierrors.NewInternalError(assert.AnError) + cl := testutils.NewTestClientBuilder(). + WithObjects(clusterTopology, existingKAITopology). + RecordErrorForObjects(testutils.ClientMethodDelete, deleteErr, client.ObjectKey{Name: topologyName}). 
+		Build()
+	logger := logr.Discard()
+
+	// Test
+	err := EnsureKAITopology(ctx, cl, logger, topologyName, clusterTopology)
+	// Assert
+	assert.Error(t, err)
+	assert.True(t, errors.Is(err, deleteErr))
+}
+
+func TestEnsureKAITopologyWhenCreateFailsDuringRecreate(t *testing.T) {
+	// Setup
+	ctx := context.Background()
+	updatedTopologyLevels := []corev1alpha1.TopologyLevel{
+		{Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"},
+		{Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"},
+	}
+	clusterTopology := &corev1alpha1.ClusterTopology{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: topologyName,
+			UID:  uuid.NewUUID(),
+		},
+		Spec: corev1alpha1.ClusterTopologySpec{
+			Levels: updatedTopologyLevels,
+		},
+	}
+
+	// Create an existing KAI Topology with old levels
+	existingKAITopology := &kaitopologyv1alpha1.Topology{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: topologyName,
+			OwnerReferences: []metav1.OwnerReference{
+				{
+					APIVersion:         "grove.io/v1alpha1",
+					Kind:               "ClusterTopology",
+					Name:               clusterTopology.Name,
+					UID:                clusterTopology.UID,
+					Controller:         ptr.To(true),
+					BlockOwnerDeletion: ptr.To(true),
+				},
+			},
+		},
+		Spec: kaitopologyv1alpha1.TopologySpec{
+			Levels: []kaitopologyv1alpha1.TopologyLevel{
+				{NodeLabel: "kubernetes.io/hostname"},
+			},
+		},
+	}
+
+	// Create a client that succeeds when deleting the KAI Topology resource but fails on create
+	// This simulates the scenario where delete succeeds but create fails during recreation
+	createErr := apierrors.NewInternalError(assert.AnError)
+	cl := testutils.NewTestClientBuilder().
+		WithObjects(clusterTopology, existingKAITopology).
+		RecordErrorForObjects(testutils.ClientMethodCreate, createErr, client.ObjectKey{Name: topologyName}).
+		Build()
+	logger := logr.Discard()
+
+	err := EnsureKAITopology(ctx, cl, logger, topologyName, clusterTopology)
+	assert.Error(t, err)
+	// After delete succeeds, create will fail, so we get the create error message
+	assert.True(t, errors.Is(err, createErr))
+}
+
+func TestDeleteClusterTopology(t *testing.T) {
+	// Setup
+	ctx := context.Background()
+	topologyLevels := []corev1alpha1.TopologyLevel{
+		{Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"},
+		{Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"},
+	}
+	existing := &corev1alpha1.ClusterTopology{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: topologyName,
+		},
+		Spec: corev1alpha1.ClusterTopologySpec{
+			Levels: topologyLevels,
+		},
+	}
+	cl := testutils.CreateDefaultFakeClient([]client.Object{existing})
+
+	// Test
+	err := DeleteClusterTopology(ctx, cl, topologyName)
+	// Assert
+	require.NoError(t, err)
+	// Verify it was deleted
+	fetched := &corev1alpha1.ClusterTopology{}
+	err = cl.Get(ctx, client.ObjectKey{Name: topologyName}, fetched)
+	assert.True(t, apierrors.IsNotFound(err))
+}
+
+func TestDeleteClusterTopologyWhenNoneExists(t *testing.T) {
+	// Setup
+	ctx := context.Background()
+	cl := testutils.CreateDefaultFakeClient(nil)
+	// Test and Assert
+	// Should not error when deleting a non-existent topology
+	err := DeleteClusterTopology(ctx, cl, topologyName)
+	assert.NoError(t, err)
+}
+
+func TestDeleteClusterTopologyError(t *testing.T) {
+	ctx := context.Background()
+	topologyLevels := []corev1alpha1.TopologyLevel{
+		{Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"},
+		{Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"},
+	}
+	existing := &corev1alpha1.ClusterTopology{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: 
topologyName, + }, + Spec: corev1alpha1.ClusterTopologySpec{ + Levels: topologyLevels, + }, + } + + // Create a client that will fail on delete (non-NotFound error) + deleteErr := apierrors.NewInternalError(assert.AnError) + cl := testutils.NewTestClientBuilder(). + WithObjects(existing). + RecordErrorForObjects(testutils.ClientMethodDelete, deleteErr, client.ObjectKey{Name: topologyName}). + Build() + + err := DeleteClusterTopology(ctx, cl, topologyName) + assert.Error(t, err) + assert.True(t, errors.Is(err, deleteErr)) +} + +func TestBuildClusterTopology(t *testing.T) { + topologyLevels := []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"}, + {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"}, + } + topology := buildClusterTopology(topologyName, topologyLevels) + assert.Equal(t, topologyName, topology.Name) + assert.Equal(t, topologyLevels, topology.Spec.Levels) +} + +func TestIsClusterTopologyChanged(t *testing.T) { + tests := []struct { + name string + oldLevels []corev1alpha1.TopologyLevel + newLevels []corev1alpha1.TopologyLevel + expectedChanged bool + }{ + { + name: "No change in levels", + oldLevels: []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"}, + {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"}, + }, + newLevels: []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"}, + {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"}, + }, + expectedChanged: false, + }, + { + name: "Change in levels", + oldLevels: []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"}, + {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"}, + }, + newLevels: []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainRegion, Key: "topology.kubernetes.io/region"}, + }, + expectedChanged: true, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + oldTopology := &corev1alpha1.ClusterTopology{ + Spec: corev1alpha1.ClusterTopologySpec{ + Levels: test.oldLevels, + }, + } + newTopology := &corev1alpha1.ClusterTopology{ + Spec: corev1alpha1.ClusterTopologySpec{ + Levels: test.newLevels, + }, + } + topologyChanged := isClusterTopologyChanged(oldTopology, newTopology) + assert.Equal(t, test.expectedChanged, topologyChanged) + }) + } +} + +func TestBuildKAITopology(t *testing.T) { + // Setup + cl := testutils.CreateDefaultFakeClient(nil) + topologyLevels := []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"}, + {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"}, + } + clusterTopology := &corev1alpha1.ClusterTopology{ + ObjectMeta: metav1.ObjectMeta{ + Name: topologyName, + UID: uuid.NewUUID(), + }, + Spec: corev1alpha1.ClusterTopologySpec{ + Levels: topologyLevels, + }, + } + + // Test + kaiTopology, err := buildKAITopology(topologyName, clusterTopology, cl.Scheme()) + // Assert + require.NoError(t, err) + assert.Equal(t, topologyName, kaiTopology.Name) + assert.Len(t, kaiTopology.Spec.Levels, 2) + assert.Equal(t, "topology.kubernetes.io/zone", kaiTopology.Spec.Levels[0].NodeLabel) + assert.Equal(t, "kubernetes.io/hostname", kaiTopology.Spec.Levels[1].NodeLabel) + assert.True(t, metav1.IsControlledBy(kaiTopology, clusterTopology)) +} + +func TestIsKAITopologyChanged(t *testing.T) { + 
tests := []struct { + name string + oldLevels []kaitopologyv1alpha1.TopologyLevel + newLevels []kaitopologyv1alpha1.TopologyLevel + expectedChanged bool + }{ + { + name: "No change in levels", + oldLevels: []kaitopologyv1alpha1.TopologyLevel{ + {NodeLabel: "topology.kubernetes.io/zone"}, + {NodeLabel: "kubernetes.io/hostname"}, + }, + newLevels: []kaitopologyv1alpha1.TopologyLevel{ + {NodeLabel: "topology.kubernetes.io/zone"}, + {NodeLabel: "kubernetes.io/hostname"}, + }, + expectedChanged: false, + }, + { + name: "Change in levels", + oldLevels: []kaitopologyv1alpha1.TopologyLevel{ + {NodeLabel: "topology.kubernetes.io/zone"}, + {NodeLabel: "kubernetes.io/hostname"}, + }, + newLevels: []kaitopologyv1alpha1.TopologyLevel{ + {NodeLabel: "topology.kubernetes.io/region"}, + }, + expectedChanged: true, + }, + } + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + oldTopology := &kaitopologyv1alpha1.Topology{ + Spec: kaitopologyv1alpha1.TopologySpec{ + Levels: test.oldLevels, + }, + } + newTopology := &kaitopologyv1alpha1.Topology{ + Spec: kaitopologyv1alpha1.TopologySpec{ + Levels: test.newLevels, + }, + } + topologyChanged := isKAITopologyChanged(oldTopology, newTopology) + assert.Equal(t, test.expectedChanged, topologyChanged) + }) + } +} diff --git a/operator/internal/controller/cert/cert.go b/operator/internal/controller/cert/cert.go index d7478e949..2cd60b4d2 100644 --- a/operator/internal/controller/cert/cert.go +++ b/operator/internal/controller/cert/cert.go @@ -22,7 +22,6 @@ import ( "strings" "github.com/ai-dynamo/grove/operator/internal/constants" - clustertopologyvalidation "github.com/ai-dynamo/grove/operator/internal/webhook/admission/clustertopology/validation" authorizationwebhook "github.com/ai-dynamo/grove/operator/internal/webhook/admission/pcs/authorization" defaultingwebhook "github.com/ai-dynamo/grove/operator/internal/webhook/admission/pcs/defaulting" validatingwebhook "github.com/ai-dynamo/grove/operator/internal/webhook/admission/pcs/validation" @@ -88,10 +87,6 @@ func getWebhooks(authorizerEnabled bool) []cert.WebhookInfo { Type: cert.Validating, Name: validatingwebhook.Name, }, - { - Type: cert.Validating, - Name: clustertopologyvalidation.Name, - }, } if authorizerEnabled { webhooks = append(webhooks, cert.WebhookInfo{ diff --git a/operator/internal/controller/cert/cert_test.go b/operator/internal/controller/cert/cert_test.go index 87f1b6362..7f9e4c03f 100644 --- a/operator/internal/controller/cert/cert_test.go +++ b/operator/internal/controller/cert/cert_test.go @@ -32,25 +32,23 @@ func TestGetWebhooks(t *testing.T) { t.Run("authorizer disabled", func(t *testing.T) { webhooks := getWebhooks(false) - // Expect 3 webhooks: podcliqueset-defaulting, podcliqueset-validating, clustertopology-validating - require.Len(t, webhooks, 3) + // Expect 2 webhooks: podcliqueset-defaulting, podcliqueset-validating + require.Len(t, webhooks, 2) // Check that defaulting and validating webhooks are present assert.Equal(t, cert.Mutating, webhooks[0].Type) // podcliqueset-defaulting-webhook assert.Equal(t, cert.Validating, webhooks[1].Type) // podcliqueset-validating-webhook - assert.Equal(t, cert.Validating, webhooks[2].Type) // clustertopology-validating-webhook }) // Test with authorizer enabled t.Run("authorizer enabled", func(t *testing.T) { webhooks := getWebhooks(true) - // Expect 4 webhooks: the 3 base webhooks plus the authorizer-webhook - require.Len(t, webhooks, 4) - // Check that all four webhooks are present + // Expect 3 webhooks: the 2 base webhooks 
plus the authorizer-webhook + require.Len(t, webhooks, 3) + // Check that all three webhooks are present assert.Equal(t, cert.Mutating, webhooks[0].Type) // podcliqueset-defaulting-webhook assert.Equal(t, cert.Validating, webhooks[1].Type) // podcliqueset-validating-webhook - assert.Equal(t, cert.Validating, webhooks[2].Type) // clustertopology-validating-webhook - assert.Equal(t, cert.Validating, webhooks[3].Type) // authorizer-webhook + assert.Equal(t, cert.Validating, webhooks[2].Type) // authorizer-webhook }) } diff --git a/operator/internal/controller/manager.go b/operator/internal/controller/manager.go index 5c5155614..0ff633593 100644 --- a/operator/internal/controller/manager.go +++ b/operator/internal/controller/manager.go @@ -46,7 +46,7 @@ const ( var ( waitTillWebhookCertsReady = cert.WaitTillWebhookCertsReady registerControllersWithMgr = RegisterControllers - registerWebhooksWithMgr = webhook.RegisterWebhooks + registerWebhooksWithMgr = webhook.Register ) // CreateManager creates the manager. @@ -56,7 +56,7 @@ func CreateManager(operatorCfg *configv1alpha1.OperatorConfiguration) (ctrl.Mana // RegisterControllersAndWebhooks adds all the controllers and webhooks to the controller-manager using the passed in Config. func RegisterControllersAndWebhooks(mgr ctrl.Manager, logger logr.Logger, operatorCfg *configv1alpha1.OperatorConfiguration, certsReady chan struct{}) error { - // Controllers will not work unless the webhoooks are fully configured and operational. + // Controllers will not work unless the webhooks are fully configured and operational. // For webhooks to work cert-controller should finish its work of generating and injecting certificates. waitTillWebhookCertsReady(logger, certsReady) if err := registerControllersWithMgr(mgr, operatorCfg.Controllers); err != nil { diff --git a/operator/internal/controller/podclique/components/pod/initcontainer.go b/operator/internal/controller/podclique/components/pod/initcontainer.go index c2ad8c945..fa77297e2 100644 --- a/operator/internal/controller/podclique/components/pod/initcontainer.go +++ b/operator/internal/controller/podclique/components/pod/initcontainer.go @@ -26,7 +26,7 @@ import ( "github.com/ai-dynamo/grove/operator/internal/constants" "github.com/ai-dynamo/grove/operator/internal/controller/common/component" groveerr "github.com/ai-dynamo/grove/operator/internal/errors" - "github.com/ai-dynamo/grove/operator/internal/version" + groveversion "github.com/ai-dynamo/grove/operator/internal/version" "github.com/samber/lo" corev1 "k8s.io/api/core/v1" @@ -107,7 +107,7 @@ func addInitContainer(pcs *grovecorev1alpha1.PodCliqueSet, pclq *grovecorev1alph pod.Spec.InitContainers = append(pod.Spec.InitContainers, corev1.Container{ Name: initContainerName, - Image: fmt.Sprintf("%s:%s", image, version.Get().GitVersion), + Image: fmt.Sprintf("%s:%s", image, groveversion.New().GitVersion), Args: args, VolumeMounts: []corev1.VolumeMount{ { diff --git a/operator/internal/utils/ioutil/ioutil.go b/operator/internal/utils/ioutil/ioutil.go new file mode 100644 index 000000000..ed51c19b6 --- /dev/null +++ b/operator/internal/utils/ioutil/ioutil.go @@ -0,0 +1,26 @@ +// /* +// Copyright 2025 The Grove Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//	http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// */
+
+package ioutil
+
+import "io"
+
+// CloseQuietly safely closes an io.Closer, ignoring any error from the close operation.
+func CloseQuietly(closer io.Closer) {
+	if closer != nil {
+		_ = closer.Close()
+	}
+}
diff --git a/operator/internal/utils/ioutil/ioutil_test.go b/operator/internal/utils/ioutil/ioutil_test.go
new file mode 100644
index 000000000..c3831531c
--- /dev/null
+++ b/operator/internal/utils/ioutil/ioutil_test.go
@@ -0,0 +1,133 @@
+// /*
+// Copyright 2025 The Grove Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//	http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// */
+
+package ioutil
+
+import (
+	"errors"
+	"io"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+// TestCloseQuietly tests the CloseQuietly function with various scenarios.
+func TestCloseQuietly(t *testing.T) {
+	tests := []struct {
+		name      string
+		closer    io.Closer
+		expectErr bool
+		validate  func(t *testing.T, closer io.Closer)
+	}{
+		{
+			name: "close without errors",
+			closer: &mockCloser{
+				closeErr: nil,
+			},
+			expectErr: false,
+			validate: func(t *testing.T, closer io.Closer) {
+				mock := closer.(*mockCloser)
+				assert.True(t, mock.closed, "closer should be closed")
+				assert.Equal(t, 1, mock.callCount, "Close should be called once")
+			},
+		},
+		{
+			name: "close with error",
+			closer: &mockCloser{
+				closeErr: errors.New("close failed"),
+			},
+			expectErr: true,
+			validate: func(t *testing.T, closer io.Closer) {
+				mock := closer.(*mockCloser)
+				assert.True(t, mock.closed, "closer should be closed despite error")
+				assert.Equal(t, 1, mock.callCount, "Close should be called once")
+			},
+		},
+		{
+			name:      "closer is nil",
+			closer:    nil,
+			expectErr: false,
+			validate: func(_ *testing.T, _ io.Closer) {
+				// No panic should occur, nothing to validate
+			},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			// This should not panic regardless of error
+			assert.NotPanics(t, func() {
+				CloseQuietly(tt.closer)
+			}, "CloseQuietly should not panic")
+
+			if tt.validate != nil {
+				tt.validate(t, tt.closer)
+			}
+		})
+	}
+}
+
+// TestCloseQuietlyWithRealCloser tests with a more realistic closer. 
+func TestCloseQuietlyWithRealCloser(t *testing.T) { + // Using a simple pipe as a real io.Closer + r, w := io.Pipe() + + // Close the reader first + assert.NotPanics(t, func() { + CloseQuietly(r) + }) + + // Close the writer + assert.NotPanics(t, func() { + CloseQuietly(w) + }) + + // Try to close again (should not panic even if already closed) + assert.NotPanics(t, func() { + CloseQuietly(r) + CloseQuietly(w) + }) +} + +// TestCloseQuietlyWithPanic tests that CloseQuietly doesn't handle panics (only errors). +func TestCloseQuietlyWithPanic(t *testing.T) { + closer := &panicCloser{} + + // CloseQuietly should not catch panics, only suppress errors + assert.Panics(t, func() { + CloseQuietly(closer) + }, "CloseQuietly should not suppress panics, only errors") +} + +// mockCloser is a mock implementation of io.Closer for testing. +type mockCloser struct { + closed bool + closeErr error + callCount int +} + +func (m *mockCloser) Close() error { + m.callCount++ + m.closed = true + return m.closeErr +} + +// panicCloser is a closer that panics when closed. +type panicCloser struct{} + +func (p *panicCloser) Close() error { + panic("intentional panic") +} diff --git a/operator/internal/version/version.go b/operator/internal/version/version.go index ee1bc7018..e7ba705b5 100644 --- a/operator/internal/version/version.go +++ b/operator/internal/version/version.go @@ -17,21 +17,15 @@ package version import ( + "encoding/json" "fmt" - "io" - "os" "runtime" - - "github.com/spf13/pflag" - apimachineryversion "k8s.io/apimachinery/pkg/version" ) // These variables will be set during building the grove operator via LD_FLAGS // These variables have been borrowed from k8s.io/component-base repository. We do not want // the dependencies that k8s.io/component-base pulls in as the attempt is the keep a lean set of dependencies. var ( - // programName is the name of the operator. - programName = "grove-operator" // gitVersion is the semantic version for grove operator. gitVersion = "v0.0.0-master+$Format:%H$" // gitCommit is the SHA1 from git, output of $(git rev-parse HEAD) @@ -39,18 +33,23 @@ var ( // gitTreeState is the state of git tree, either "clean" or "dirty" gitTreeState = "" // buildDate is the date (in ISO8601 format) at which the build was done. Output of $(date -u +'%Y-%m-%dT%H:%M:%SZ') - buildDate = "1970-01-01T00:00:00Z" - versionFlag bool + buildDate = "1970-01-01T00:00:00Z" ) -// AddFlags adds the --version flag to the flag.FlagSet. -func AddFlags(fs *pflag.FlagSet) { - fs.BoolVar(&versionFlag, "version", false, "version prints the version information and quits") +// GroveInfo holds version and build information about the grove operator. +type GroveInfo struct { + GitVersion string `json:"gitVersion"` + GitCommit string `json:"gitCommit"` + GitTreeState string `json:"gitTreeState"` + BuildDate string `json:"buildDate"` + GoVersion string `json:"goVersion"` + Compiler string `json:"compiler"` + Platform string `json:"platform"` } -// Get returns the version details for the grove operator. -func Get() apimachineryversion.Info { - return apimachineryversion.Info{ +// New creates a new GroveInfo with the build and version information. +func New() GroveInfo { + return GroveInfo{ GitVersion: gitVersion, GitCommit: gitCommit, GitTreeState: gitTreeState, @@ -61,11 +60,16 @@ func Get() apimachineryversion.Info { } } -// PrintVersionAndExitIfRequested will check if --version is passed and if it is -// then it will print the version information and quit. 
-func PrintVersionAndExitIfRequested() {
-	if versionFlag {
-		_, _ = fmt.Fprintf(io.Writer(os.Stdout), "%s %v\n", programName, Get())
-		os.Exit(0)
+// Version returns the version information for the Grove operator.
+func (g GroveInfo) Version() string {
+	return g.GitVersion
+}
+
+// Verbose returns a detailed JSON string with version and build information.
+func (g GroveInfo) Verbose() string {
+	infoBytes, err := json.Marshal(g)
+	if err != nil {
+		return fmt.Sprintf("error generating verbose version information: %v\n", err)
 	}
+	return string(infoBytes)
 }
diff --git a/operator/internal/webhook/admission/clustertopology/validation/clustertopology.go b/operator/internal/webhook/admission/clustertopology/validation/clustertopology.go
deleted file mode 100644
index 1cbc032e6..000000000
--- a/operator/internal/webhook/admission/clustertopology/validation/clustertopology.go
+++ /dev/null
@@ -1,182 +0,0 @@
-// /*
-// Copyright 2025 The Grove Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//	http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// */
-
-package validation
-
-import (
-	"fmt"
-
-	grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1"
-
-	"github.com/samber/lo"
-	admissionv1 "k8s.io/api/admission/v1"
-	apivalidation "k8s.io/apimachinery/pkg/api/validation"
-	metav1validation "k8s.io/apimachinery/pkg/apis/meta/v1/validation"
-	"k8s.io/apimachinery/pkg/util/validation/field"
-)
-
-// clusterTopologyValidator validates ClusterTopology resources for create and update operations.
-type clusterTopologyValidator struct {
-	operation       admissionv1.Operation
-	clusterTopology *grovecorev1alpha1.ClusterTopology
-}
-
-// newClusterTopologyValidator creates a new ClusterTopology validator for the given operation.
-func newClusterTopologyValidator(clusterTopology *grovecorev1alpha1.ClusterTopology, operation admissionv1.Operation) *clusterTopologyValidator {
-	return &clusterTopologyValidator{
-		operation:       operation,
-		clusterTopology: clusterTopology,
-	}
-}
-
-// ---------------------------- validate create of ClusterTopology -----------------------------------------------
-
-// validate validates the ClusterTopology object.
-func (v *clusterTopologyValidator) validate() error {
-	allErrs := field.ErrorList{}
-
-	// Note: Metadata validation is handled by the API server
-	errs := v.validateClusterTopologySpec(field.NewPath("spec"))
-	if len(errs) != 0 {
-		allErrs = append(allErrs, errs...)
-	}
-
-	return allErrs.ToAggregate()
-}
-
-// validateClusterTopologySpec validates the specification of a ClusterTopology object.
-func (v *clusterTopologyValidator) validateClusterTopologySpec(fldPath *field.Path) field.ErrorList {
-	allErrs := field.ErrorList{}
-
-	levelsPath := fldPath.Child("levels")
-
-	// Note: MinItems and MaxItems are enforced by kubebuilder validation
-	// (+kubebuilder:validation:MinItems=1, +kubebuilder:validation:MaxItems=7)
-
-	// First, validate each level individually (domain validity, key format, etc.)
- lo.ForEach(v.clusterTopology.Spec.Levels, func(level grovecorev1alpha1.TopologyLevel, i int) { - levelPath := levelsPath.Index(i) - allErrs = append(allErrs, v.validateTopologyLevel(level, levelPath)...) - }) - - // Then validate hierarchical order (assumes all domains are valid) - // This also ensures domain uniqueness - allErrs = append(allErrs, validateTopologyLevelOrder(v.clusterTopology.Spec.Levels, levelsPath)...) - - // Validate that all keys are unique - allErrs = append(allErrs, validateTopologyLevelKeyUniqueness(v.clusterTopology.Spec.Levels, levelsPath)...) - - return allErrs -} - -// validateTopologyLevel validates a single topology level in isolation. -func (v *clusterTopologyValidator) validateTopologyLevel(level grovecorev1alpha1.TopologyLevel, fldPath *field.Path) field.ErrorList { - allErrs := field.ErrorList{} - - // Note: Domain validation is handled by kubebuilder enum validation - // (+kubebuilder:validation:Enum=region;zone;datacenter;block;rack;host;numa) - - // Note: Key required and length validation is handled by kubebuilder validation - // (+kubebuilder:validation:Required, +kubebuilder:validation:MinLength=1, +kubebuilder:validation:MaxLength=63) - - // Validate key format is a valid Kubernetes label key - keyPath := fldPath.Child("key") - allErrs = append(allErrs, metav1validation.ValidateLabelName(level.Key, keyPath)...) - - return allErrs -} - -// validateTopologyLevelOrder validates that topology levels are in the correct hierarchical order -// (broadest to narrowest: Region > Zone > DataCenter > Block > Rack > Host > Numa). -// This function assumes all domains have already been validated to be valid topology domains. -// This validation also ensures domain uniqueness (duplicate domains would have the same order value). -func validateTopologyLevelOrder(levels []grovecorev1alpha1.TopologyLevel, fldPath *field.Path) field.ErrorList { - allErrs := field.ErrorList{} - - for i := 1; i < len(levels); i++ { - prevLevel := levels[i-1] - currLevel := levels[i] - - // Current level must be narrower than previous level - if currLevel.BroaderThan(prevLevel) { - allErrs = append(allErrs, field.Invalid(fldPath.Index(i).Child("domain"), currLevel.Domain, - fmt.Sprintf("topology levels must be in hierarchical order (broadest to narrowest). Domain '%s' at position %d cannot come after '%s' at position %d", - currLevel.Domain, i, prevLevel.Domain, i-1))) - } else if currLevel.Compare(prevLevel) == 0 { - allErrs = append(allErrs, field.Duplicate(fldPath.Index(i).Child("domain"), currLevel.Domain)) - } - } - - return allErrs -} - -// validateTopologyLevelKeyUniqueness validates that all keys in the topology levels are unique. -func validateTopologyLevelKeyUniqueness(levels []grovecorev1alpha1.TopologyLevel, fldPath *field.Path) field.ErrorList { - allErrs := field.ErrorList{} - seenKeys := make(map[string]int) // map key to first index where it appears - - for i, level := range levels { - if firstIndex, exists := seenKeys[level.Key]; exists { - allErrs = append(allErrs, field.Invalid(fldPath.Index(i).Child("key"), level.Key, - fmt.Sprintf("duplicate key: already used at levels[%d]", firstIndex))) - } else { - seenKeys[level.Key] = i - } - } - - return allErrs -} - -// ---------------------------- validate update of ClusterTopology ----------------------------------------------- - -// validateUpdate validates the update to a ClusterTopology object. 
-func (v *clusterTopologyValidator) validateUpdate(oldClusterTopology *grovecorev1alpha1.ClusterTopology) error { - allErrs := field.ErrorList{} - allErrs = append(allErrs, v.validateClusterTopologySpecUpdate(&v.clusterTopology.Spec, &oldClusterTopology.Spec, field.NewPath("spec"))...) - return allErrs.ToAggregate() -} - -// validateClusterTopologySpecUpdate validates updates to the ClusterTopology specification. -func (v *clusterTopologyValidator) validateClusterTopologySpecUpdate(oldSpec, newSpec *grovecorev1alpha1.ClusterTopologySpec, fldPath *field.Path) field.ErrorList { - allErrs := field.ErrorList{} - - levelsPath := fldPath.Child("levels") - - // Validate that the number of levels hasn't changed - if len(newSpec.Levels) != len(oldSpec.Levels) { - allErrs = append(allErrs, field.Forbidden(levelsPath, "not allowed to add or remove topology levels")) - return allErrs - } - - // Validate that the order and domains of levels haven't changed - allErrs = append(allErrs, validateTopologyLevelImmutableFields(newSpec.Levels, oldSpec.Levels, levelsPath)...) - - return allErrs -} - -// validateTopologyLevelImmutableFields validates that immutable fields in topology levels haven't changed during an update. -func validateTopologyLevelImmutableFields(newLevels, oldLevels []grovecorev1alpha1.TopologyLevel, fldPath *field.Path) field.ErrorList { - allErrs := field.ErrorList{} - - for i := range newLevels { - levelPath := fldPath.Index(i) - - allErrs = append(allErrs, apivalidation.ValidateImmutableField(newLevels[i].Domain, oldLevels[i].Domain, levelPath.Child("domain"))...) - // Note: Key is allowed to change (not in the requirements), but validation has already occurred in validate() - } - - return allErrs -} diff --git a/operator/internal/webhook/admission/clustertopology/validation/clustertopology_crd_test.go b/operator/internal/webhook/admission/clustertopology/validation/clustertopology_crd_test.go deleted file mode 100644 index 20c774b0a..000000000 --- a/operator/internal/webhook/admission/clustertopology/validation/clustertopology_crd_test.go +++ /dev/null @@ -1,252 +0,0 @@ -// /* -// Copyright 2025 The Grove Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// */ - -package validation - -import ( - "context" - "os" - "testing" - "time" - - grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - utilruntime "k8s.io/apimachinery/pkg/util/runtime" - "k8s.io/client-go/kubernetes/scheme" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/envtest" -) - -func TestClusterTopologyCRDValidation(t *testing.T) { - testEnv, k8sClient := setupEnvTest(t) - defer teardownEnvTest(t, testEnv) - - tests := []struct { - name string - ct *grovecorev1alpha1.ClusterTopology - expectError bool - errorContains string - }{ - { - name: "valid cluster topology with single level", - ct: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{Name: "test-valid-single"}, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainRegion, - Key: "topology.kubernetes.io/region", - }, - }, - }, - }, - expectError: false, - }, - { - name: "valid cluster topology with multiple levels", - ct: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{Name: "test-valid-multiple"}, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainRegion, - Key: "topology.kubernetes.io/region", - }, - { - Domain: grovecorev1alpha1.TopologyDomainZone, - Key: "topology.kubernetes.io/zone", - }, - { - Domain: grovecorev1alpha1.TopologyDomainHost, - Key: "kubernetes.io/hostname", - }, - }, - }, - }, - expectError: false, - }, - { - name: "reject invalid domain", - ct: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{Name: "test-invalid-domain"}, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: "invalid-domain", // Not in enum - Key: "test.io/key", - }, - }, - }, - }, - expectError: true, - errorContains: "supported values", - }, - { - name: "reject empty key", - ct: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{Name: "test-empty-key"}, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainRegion, - Key: "", // Empty key - }, - }, - }, - }, - expectError: true, - errorContains: "spec.levels[0].key", - }, - { - name: "reject key too long", - ct: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{Name: "test-key-too-long"}, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainRegion, - // Create a key longer than 63 characters (MaxLength validation) - Key: "this-is-a-very-long-key-that-exceeds-the-maximum-allowed-length-of-63-characters", - }, - }, - }, - }, - expectError: true, - errorContains: "spec.levels[0].key", - }, - { - name: "reject empty levels array", - ct: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{Name: "test-empty-levels"}, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{}, // Empty array - }, - }, - expectError: true, - errorContains: "spec.levels", - }, - { - name: "reject too many levels", - ct: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{Name: "test-too-many-levels"}, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - 
Levels: []grovecorev1alpha1.TopologyLevel{ - {Domain: grovecorev1alpha1.TopologyDomainRegion, Key: "key1"}, - {Domain: grovecorev1alpha1.TopologyDomainZone, Key: "key2"}, - {Domain: grovecorev1alpha1.TopologyDomainDataCenter, Key: "key3"}, - {Domain: grovecorev1alpha1.TopologyDomainBlock, Key: "key4"}, - {Domain: grovecorev1alpha1.TopologyDomainRack, Key: "key5"}, - {Domain: grovecorev1alpha1.TopologyDomainHost, Key: "key6"}, - {Domain: grovecorev1alpha1.TopologyDomainNuma, Key: "key7"}, - {Domain: "region", Key: "key8"}, // 8th level - exceeds max - }, - }, - }, - expectError: true, - errorContains: "spec.levels", - }, - { - name: "reject missing domain", - ct: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{Name: "test-missing-domain"}, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Key: "test.io/key", // Missing domain - }, - }, - }, - }, - expectError: true, - // API server should reject due to Required validation - }, - { - name: "reject missing key", - ct: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{Name: "test-missing-key"}, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainRegion, - // Missing key - }, - }, - }, - }, - expectError: true, - // API server should reject due to Required validation - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - err := k8sClient.Create(context.Background(), tt.ct) - - if tt.expectError { - assert.Error(t, err) - if tt.errorContains != "" { - assert.Contains(t, err.Error(), tt.errorContains) - } - } else { - assert.NoError(t, err) - // Clean up successfully created resource - _ = k8sClient.Delete(context.Background(), tt.ct) - } - }) - } -} - -// setupEnvTest creates an envtest environment with ClusterTopology CRDs loaded -func setupEnvTest(t *testing.T) (*envtest.Environment, client.Client) { - // Check if KUBEBUILDER_ASSETS is set - if os.Getenv("KUBEBUILDER_ASSETS") == "" { - t.Skip("Skipping envtest: KUBEBUILDER_ASSETS not set. Run 'make test-envtest' to execute this test.") - } - - // Point to CRD directory (relative from this test file location) - crdPath := "../../../../../api/core/v1alpha1/crds" - - testEnv := &envtest.Environment{ - CRDDirectoryPaths: []string{crdPath}, - ErrorIfCRDPathMissing: true, - } - - cfg, err := testEnv.Start() - require.NoError(t, err) - - // Create client with proper scheme - testScheme := runtime.NewScheme() - utilruntime.Must(scheme.AddToScheme(testScheme)) - utilruntime.Must(grovecorev1alpha1.AddToScheme(testScheme)) - - k8sClient, err := client.New(cfg, client.Options{Scheme: testScheme}) - require.NoError(t, err) - - // Give CRDs a moment to be fully ready - time.Sleep(100 * time.Millisecond) - - return testEnv, k8sClient -} - -// teardownEnvTest stops the envtest environment -func teardownEnvTest(t *testing.T, testEnv *envtest.Environment) { - err := testEnv.Stop() - require.NoError(t, err) -} diff --git a/operator/internal/webhook/admission/clustertopology/validation/clustertopology_test.go b/operator/internal/webhook/admission/clustertopology/validation/clustertopology_test.go deleted file mode 100644 index 1e4da5c58..000000000 --- a/operator/internal/webhook/admission/clustertopology/validation/clustertopology_test.go +++ /dev/null @@ -1,500 +0,0 @@ -// /* -// Copyright 2025 The Grove Authors. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// */ - -package validation - -import ( - "context" - "testing" - - grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" - - "github.com/go-logr/logr" - "github.com/stretchr/testify/assert" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" -) - -func TestValidateCreate(t *testing.T) { - tests := []struct { - name string - clusterTopology *grovecorev1alpha1.ClusterTopology - expectedErr bool - expectedErrMsg string - }{ - { - name: "valid cluster topology with single level", - clusterTopology: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - }, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainZone, - Key: "topology.kubernetes.io/zone", - }, - }, - }, - }, - expectedErr: false, - }, - { - name: "valid cluster topology with multiple levels", - clusterTopology: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - }, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainRegion, - Key: "topology.kubernetes.io/region", - }, - { - Domain: grovecorev1alpha1.TopologyDomainZone, - Key: "topology.kubernetes.io/zone", - }, - { - Domain: grovecorev1alpha1.TopologyDomainHost, - Key: "kubernetes.io/hostname", - }, - }, - }, - }, - expectedErr: false, - }, - { - name: "invalid - duplicate domain (caught by order validation)", - clusterTopology: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - }, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainZone, - Key: "topology.kubernetes.io/zone", - }, - { - Domain: grovecorev1alpha1.TopologyDomainZone, - Key: "custom.io/zone", - }, - }, - }, - }, - expectedErr: true, - expectedErrMsg: "Duplicate value", - }, - { - name: "invalid - duplicate key", - clusterTopology: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - }, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainRegion, - Key: "topology.kubernetes.io/zone", - }, - { - Domain: grovecorev1alpha1.TopologyDomainZone, - Key: "topology.kubernetes.io/zone", - }, - }, - }, - }, - expectedErr: true, - expectedErrMsg: "duplicate key", - }, - { - name: "invalid - key not a valid label (has space)", - clusterTopology: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - }, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainZone, - Key: "Invalid value", - }, - }, - }, - }, - expectedErr: true, - expectedErrMsg: "name part must consist of alphanumeric characters", - }, - { - name: "invalid - key 
prefix has invalid characters (double dots)", - clusterTopology: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - }, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainZone, - Key: "invalid..label/key", - }, - }, - }, - }, - expectedErr: true, - expectedErrMsg: "prefix part", - }, - { - name: "invalid - key name part too long", - clusterTopology: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - }, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainZone, - Key: "example.com/this-is-a-very-long-name-that-exceeds-the-maximum-length-of-sixtythree-characters", - }, - }, - }, - }, - expectedErr: true, - expectedErrMsg: "name part must be no more than 63 characters", - }, - { - name: "invalid - levels out of order (zone before region)", - clusterTopology: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - }, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainZone, - Key: "topology.kubernetes.io/zone", - }, - { - Domain: grovecorev1alpha1.TopologyDomainRegion, - Key: "topology.kubernetes.io/region", - }, - }, - }, - }, - expectedErr: true, - expectedErrMsg: "topology levels must be in hierarchical order", - }, - { - name: "valid - correct hierarchical order (region > zone > datacenter > rack > host > numa)", - clusterTopology: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - }, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainRegion, - Key: "topology.kubernetes.io/region", - }, - { - Domain: grovecorev1alpha1.TopologyDomainZone, - Key: "topology.kubernetes.io/zone", - }, - { - Domain: grovecorev1alpha1.TopologyDomainDataCenter, - Key: "topology.kubernetes.io/datacenter", - }, - { - Domain: grovecorev1alpha1.TopologyDomainRack, - Key: "topology.kubernetes.io/rack", - }, - { - Domain: grovecorev1alpha1.TopologyDomainHost, - Key: "kubernetes.io/hostname", - }, - { - Domain: grovecorev1alpha1.TopologyDomainNuma, - Key: "topology.kubernetes.io/numa", - }, - }, - }, - }, - expectedErr: false, - }, - } - - handler := &Handler{logger: logr.Discard()} - ctx := context.Background() - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - _, err := handler.ValidateCreate(ctx, tt.clusterTopology) - - if tt.expectedErr { - assert.Error(t, err) - if tt.expectedErrMsg != "" { - assert.Contains(t, err.Error(), tt.expectedErrMsg) - } - } else { - assert.NoError(t, err) - } - }) - } -} - -func TestValidateUpdate(t *testing.T) { - tests := []struct { - name string - oldClusterTopology *grovecorev1alpha1.ClusterTopology - newClusterTopology *grovecorev1alpha1.ClusterTopology - expectedErr bool - expectedErrMsg string - }{ - { - name: "valid update - key changed", - oldClusterTopology: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - }, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainZone, - Key: "topology.kubernetes.io/zone", - }, - }, - }, - }, - newClusterTopology: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: 
"test-topology", - }, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainZone, - Key: "custom.io/zone", - }, - }, - }, - }, - expectedErr: false, - }, - { - name: "invalid - domain changed", - oldClusterTopology: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - }, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainZone, - Key: "topology.kubernetes.io/zone", - }, - }, - }, - }, - newClusterTopology: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - }, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainRegion, - Key: "topology.kubernetes.io/region", - }, - }, - }, - }, - expectedErr: true, - expectedErrMsg: "field is immutable", - }, - { - name: "invalid - level added", - oldClusterTopology: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - }, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainZone, - Key: "topology.kubernetes.io/zone", - }, - }, - }, - }, - newClusterTopology: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - }, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainZone, - Key: "topology.kubernetes.io/zone", - }, - { - Domain: grovecorev1alpha1.TopologyDomainHost, - Key: "kubernetes.io/hostname", - }, - }, - }, - }, - expectedErr: true, - expectedErrMsg: "not allowed to add or remove topology levels", - }, - { - name: "invalid - level removed", - oldClusterTopology: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - }, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainRegion, - Key: "topology.kubernetes.io/region", - }, - { - Domain: grovecorev1alpha1.TopologyDomainZone, - Key: "topology.kubernetes.io/zone", - }, - }, - }, - }, - newClusterTopology: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - }, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainRegion, - Key: "topology.kubernetes.io/region", - }, - }, - }, - }, - expectedErr: true, - expectedErrMsg: "not allowed to add or remove topology levels", - }, - { - name: "invalid - update creates duplicate key", - oldClusterTopology: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - }, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainRegion, - Key: "topology.kubernetes.io/region", - }, - { - Domain: grovecorev1alpha1.TopologyDomainZone, - Key: "topology.kubernetes.io/zone", - }, - }, - }, - }, - newClusterTopology: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - }, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainRegion, - Key: "topology.kubernetes.io/zone", - }, - { - Domain: 
grovecorev1alpha1.TopologyDomainZone, - Key: "topology.kubernetes.io/zone", - }, - }, - }, - }, - expectedErr: true, - expectedErrMsg: "duplicate key", - }, - { - name: "valid update - multiple levels, keys changed", - oldClusterTopology: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - }, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainRegion, - Key: "topology.kubernetes.io/region", - }, - { - Domain: grovecorev1alpha1.TopologyDomainZone, - Key: "topology.kubernetes.io/zone", - }, - { - Domain: grovecorev1alpha1.TopologyDomainHost, - Key: "kubernetes.io/hostname", - }, - }, - }, - }, - newClusterTopology: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - }, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainRegion, - Key: "custom.io/region", - }, - { - Domain: grovecorev1alpha1.TopologyDomainZone, - Key: "custom.io/zone", - }, - { - Domain: grovecorev1alpha1.TopologyDomainHost, - Key: "custom.io/host", - }, - }, - }, - }, - expectedErr: false, - }, - } - - handler := &Handler{logger: logr.Discard()} - ctx := context.Background() - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - _, err := handler.ValidateUpdate(ctx, tt.oldClusterTopology, tt.newClusterTopology) - - if tt.expectedErr { - assert.Error(t, err) - if tt.expectedErrMsg != "" { - assert.Contains(t, err.Error(), tt.expectedErrMsg) - } - } else { - assert.NoError(t, err) - } - }) - } -} diff --git a/operator/internal/webhook/admission/clustertopology/validation/handler.go b/operator/internal/webhook/admission/clustertopology/validation/handler.go deleted file mode 100644 index fb1e7dba8..000000000 --- a/operator/internal/webhook/admission/clustertopology/validation/handler.go +++ /dev/null @@ -1,102 +0,0 @@ -// /* -// Copyright 2025 The Grove Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// */ - -package validation - -import ( - "context" - "fmt" - - "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" - "github.com/ai-dynamo/grove/operator/internal/errors" - - "github.com/go-logr/logr" - admissionv1 "k8s.io/api/admission/v1" - "k8s.io/apimachinery/pkg/runtime" - "sigs.k8s.io/controller-runtime/pkg/manager" - "sigs.k8s.io/controller-runtime/pkg/webhook/admission" -) - -const ( - // ErrValidateCreateClusterTopology is the error code returned where the request to create a ClusterTopology is invalid. - ErrValidateCreateClusterTopology v1alpha1.ErrorCode = "ERR_VALIDATE_CREATE_CLUSTERTOPOLOGY" - // ErrValidateUpdateClusterTopology is the error code returned where the request to update a ClusterTopology is invalid. - ErrValidateUpdateClusterTopology v1alpha1.ErrorCode = "ERR_VALIDATE_UPDATE_CLUSTERTOPOLOGY" -) - -// Handler is a handler for validating ClusterTopology resources. 
-type Handler struct { - logger logr.Logger -} - -// NewHandler creates a new handler for ClusterTopology Webhook. -func NewHandler(mgr manager.Manager) *Handler { - return &Handler{ - logger: mgr.GetLogger().WithName("webhook").WithName(Name), - } -} - -// ValidateCreate validates a ClusterTopology create request. -func (h *Handler) ValidateCreate(ctx context.Context, obj runtime.Object) (admission.Warnings, error) { - h.logValidatorFunctionInvocation(ctx) - clusterTopology, err := castToClusterTopology(obj) - if err != nil { - return nil, errors.WrapError(err, ErrValidateCreateClusterTopology, string(admissionv1.Create), "failed to cast object to ClusterTopology") - } - return nil, newClusterTopologyValidator(clusterTopology, admissionv1.Create).validate() -} - -// ValidateUpdate validates a ClusterTopology update request. -func (h *Handler) ValidateUpdate(ctx context.Context, oldObj, newObj runtime.Object) (admission.Warnings, error) { - h.logValidatorFunctionInvocation(ctx) - newClusterTopology, err := castToClusterTopology(newObj) - if err != nil { - return nil, errors.WrapError(err, ErrValidateUpdateClusterTopology, string(admissionv1.Update), "failed to cast new object to ClusterTopology") - } - oldClusterTopology, err := castToClusterTopology(oldObj) - if err != nil { - return nil, errors.WrapError(err, ErrValidateUpdateClusterTopology, string(admissionv1.Update), "failed to cast old object to ClusterTopology") - } - validator := newClusterTopologyValidator(newClusterTopology, admissionv1.Update) - if err := validator.validate(); err != nil { - return nil, err - } - return nil, validator.validateUpdate(oldClusterTopology) -} - -// ValidateDelete validates a ClusterTopology delete request. -func (h *Handler) ValidateDelete(_ context.Context, _ runtime.Object) (admission.Warnings, error) { - return nil, nil -} - -// castToClusterTopology attempts to cast a runtime.Object to a ClusterTopology. -func castToClusterTopology(obj runtime.Object) (*v1alpha1.ClusterTopology, error) { - clusterTopology, ok := obj.(*v1alpha1.ClusterTopology) - if !ok { - return nil, fmt.Errorf("expected a ClusterTopology object but got %T", obj) - } - return clusterTopology, nil -} - -// logValidatorFunctionInvocation logs details about the validation request including user and operation information. -func (h *Handler) logValidatorFunctionInvocation(ctx context.Context) { - req, err := admission.RequestFromContext(ctx) - if err != nil { - h.logger.Error(err, "failed to get request from context") - return - } - h.logger.Info("ClusterTopology validation webhook invoked", "name", req.Name, "namespace", req.Namespace, "operation", req.Operation, "user", req.UserInfo.Username) -} diff --git a/operator/internal/webhook/admission/clustertopology/validation/register.go b/operator/internal/webhook/admission/clustertopology/validation/register.go deleted file mode 100644 index 76c579577..000000000 --- a/operator/internal/webhook/admission/clustertopology/validation/register.go +++ /dev/null @@ -1,39 +0,0 @@ -// /* -// Copyright 2025 The Grove Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// */ - -package validation - -import ( - "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" - - "sigs.k8s.io/controller-runtime/pkg/manager" - "sigs.k8s.io/controller-runtime/pkg/webhook/admission" -) - -const ( - // Name is the name of the validating webhook handler for ClusterTopology. - Name = "clustertopology-validating-webhook" - webhookPath = "/webhooks/validate-clustertopology" -) - -// RegisterWithManager registers the webhook with the manager. -func (h *Handler) RegisterWithManager(mgr manager.Manager) error { - webhook := admission. - WithCustomValidator(mgr.GetScheme(), &v1alpha1.ClusterTopology{}, h). - WithRecoverPanic(true) - mgr.GetWebhookServer().Register(webhookPath, webhook) - return nil -} diff --git a/operator/internal/webhook/register.go b/operator/internal/webhook/register.go index b46ea1cb4..ee77da1b3 100644 --- a/operator/internal/webhook/register.go +++ b/operator/internal/webhook/register.go @@ -23,7 +23,6 @@ import ( configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1" "github.com/ai-dynamo/grove/operator/internal/constants" - ctvalidation "github.com/ai-dynamo/grove/operator/internal/webhook/admission/clustertopology/validation" "github.com/ai-dynamo/grove/operator/internal/webhook/admission/pcs/authorization" "github.com/ai-dynamo/grove/operator/internal/webhook/admission/pcs/defaulting" pcsvalidation "github.com/ai-dynamo/grove/operator/internal/webhook/admission/pcs/validation" @@ -31,8 +30,8 @@ import ( "sigs.k8s.io/controller-runtime/pkg/manager" ) -// RegisterWebhooks registers the webhooks with the controller manager. -func RegisterWebhooks(mgr manager.Manager, authorizerConfig configv1alpha1.AuthorizerConfig) error { +// Register registers the webhooks with the controller manager. 
+func Register(mgr manager.Manager, authorizerConfig configv1alpha1.AuthorizerConfig) error { defaultingWebhook := defaulting.NewHandler(mgr) slog.Info("Registering webhook with manager", "handler", defaulting.Name) if err := defaultingWebhook.RegisterWithManager(mgr); err != nil { @@ -43,11 +42,6 @@ func RegisterWebhooks(mgr manager.Manager, authorizerConfig configv1alpha1.Autho if err := pcsValidatingWebhook.RegisterWithManager(mgr); err != nil { return fmt.Errorf("failed adding %s webhook handler: %v", pcsvalidation.Name, err) } - ctValidatingWebhook := ctvalidation.NewHandler(mgr) - slog.Info("Registering webhook with manager", "handler", ctvalidation.Name) - if err := ctValidatingWebhook.RegisterWithManager(mgr); err != nil { - return fmt.Errorf("failed adding %s webhook handler: %v", ctvalidation.Name, err) - } if authorizerConfig.Enabled { serviceAccountName, ok := os.LookupEnv(constants.EnvVarServiceAccountName) if !ok { diff --git a/operator/internal/webhook/register_test.go b/operator/internal/webhook/register_test.go index 18403e87b..430adeb61 100644 --- a/operator/internal/webhook/register_test.go +++ b/operator/internal/webhook/register_test.go @@ -91,7 +91,7 @@ func TestRegisterWebhooks_WithoutAuthorizer(t *testing.T) { Enabled: false, } - err := RegisterWebhooks(mgr, authorizerConfig) + err := Register(mgr, authorizerConfig) require.NoError(t, err) } @@ -120,7 +120,7 @@ func TestRegisterWebhooks_WithAuthorizerMissingEnvVar(t *testing.T) { Enabled: true, } - err = RegisterWebhooks(mgr, authorizerConfig) + err = Register(mgr, authorizerConfig) require.Error(t, err) assert.Contains(t, err.Error(), constants.EnvVarServiceAccountName) } @@ -149,7 +149,7 @@ func TestRegisterWebhooks_WithAuthorizerMissingNamespaceFile(t *testing.T) { Enabled: true, } - err := RegisterWebhooks(mgr, authorizerConfig) + err := Register(mgr, authorizerConfig) require.Error(t, err) assert.Contains(t, err.Error(), "error reading namespace file") } @@ -194,7 +194,7 @@ func TestRegisterWebhooks_WithAuthorizerSuccess(t *testing.T) { Enabled: true, } - err = RegisterWebhooks(mgr, authorizerConfig) + err = Register(mgr, authorizerConfig) // Will error because it tries to read the hardcoded namespace file path require.Error(t, err) } diff --git a/operator/skaffold.yaml b/operator/skaffold.yaml index 2f58e72af..a7a5e73bf 100644 --- a/operator/skaffold.yaml +++ b/operator/skaffold.yaml @@ -33,8 +33,6 @@ build: - VERSION flags: - -v - ldflags: - - '{{.LD_FLAGS}}' main: ./initc/cmd deploy: helm: # see https://skaffold.dev/docs/deployers/helm/