From 1ebe37e8e6417f53432945a540d18ff0aab4f6b8 Mon Sep 17 00:00:00 2001
From: Ron Kahn
Date: Mon, 22 Dec 2025 14:41:31 +0200
Subject: [PATCH 01/23] refactor: migrate ClusterTopology validation to CRD and enable config-driven CR creation

Move validation logic from the admission webhook to struct-level
kubebuilder markers and CEL rules. The operator now creates/updates the
ClusterTopology CR from configuration at startup, eliminating the need
for pre-created resources.

- Add CEL validation for domain/key uniqueness to ClusterTopology CRD
- Add Pattern validation for Kubernetes label key format
- Add Levels field to OperatorConfiguration.ClusterTopologyConfiguration
- Remove admission webhook validation code and registration
- Implement ensureClusterTopology() to manage CR lifecycle at startup
- Update configuration validation with domain/key uniqueness checks
- Remove ClusterTopology webhook from cert management
- Update tests to reflect webhook removal
---
 docs/designs/topology.md                      |   1 +
 operator/Makefile                             |   1 -
 operator/api/config/v1alpha1/types.go         |  34 +-
 .../config/v1alpha1/zz_generated.deepcopy.go  |  23 +-
 operator/api/config/validation/validation.go  |  40 +-
 operator/api/core/v1alpha1/clustertopology.go |   3 +
 .../crds/grove.io_clustertopologies.yaml      |   7 +
 operator/cmd/main.go                          |  61 ++-
 operator/internal/controller/cert/cert.go     |   5 -
 .../internal/controller/cert/cert_test.go     |  14 +-
 .../validation/clustertopology.go             | 182 -------
 .../validation/clustertopology_crd_test.go    | 252 ---------
 .../validation/clustertopology_test.go        | 500 ------------------
 .../clustertopology/validation/handler.go     | 102 ----
 .../clustertopology/validation/register.go    |  39 --
 operator/internal/webhook/register.go         |   6 -
 16 files changed, 160 insertions(+), 1110 deletions(-)
 delete mode 100644 operator/internal/webhook/admission/clustertopology/validation/clustertopology.go
 delete mode 100644 operator/internal/webhook/admission/clustertopology/validation/clustertopology_crd_test.go
 delete mode 100644 operator/internal/webhook/admission/clustertopology/validation/clustertopology_test.go
 delete mode 100644 operator/internal/webhook/admission/clustertopology/validation/handler.go
 delete mode 100644 operator/internal/webhook/admission/clustertopology/validation/register.go

diff --git a/docs/designs/topology.md b/docs/designs/topology.md
index 3d761add4..c83dfe76a 100644
--- a/docs/designs/topology.md
+++ b/docs/designs/topology.md
@@ -172,6 +172,7 @@ Domain TopologyDomain `json:"domain"`
 // +kubebuilder:validation:Required
 // +kubebuilder:validation:MinLength=1
 // +kubebuilder:validation:MaxLength=63
+// +kubebuilder:validation:Pattern=`^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]/)?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$`
 Key string `json:"key"`
 }
 
diff --git a/operator/Makefile b/operator/Makefile
index 9c89712af..a4012e7c5 100644
--- a/operator/Makefile
+++ b/operator/Makefile
@@ -90,7 +90,6 @@ cover-html: test-cover
 .PHONY: test-envtest
 test-envtest: $(SETUP_ENVTEST)
 	@echo "Running envtest with CRD validation..."
-	@KUBEBUILDER_ASSETS=$$($(SETUP_ENVTEST) use -p path) go test ./internal/webhook/admission/clustertopology/validation -v -run TestClusterTopologyCRDValidation
 
 # Run e2e tests
 .PHONY: test-e2e
diff --git a/operator/api/config/v1alpha1/types.go b/operator/api/config/v1alpha1/types.go
index 130189a84..7d983b3ee 100644
--- a/operator/api/config/v1alpha1/types.go
+++ b/operator/api/config/v1alpha1/types.go
@@ -193,8 +193,40 @@ type AuthorizerConfig struct {
 type ClusterTopologyConfiguration struct {
 	// Enabled indicates whether topology-aware scheduling is enabled.
 	Enabled bool `json:"enabled"`
 	// Name is the ClusterTopology resource name to use.
 	// Defaults to "grove-topology" if not specified when topology is enabled.
 	// +optional
 	Name string `json:"name,omitempty"`
+	// Levels is an ordered list of topology levels from broadest to narrowest scope.
+	// Used to create/update the ClusterTopology CR at operator startup.
+	// +optional
+	Levels []TopologyLevel `json:"levels,omitempty"`
+}
+
+// TopologyDomain represents a predefined topology level in the hierarchy.
+type TopologyDomain string
+
+const (
+	// TopologyDomainRegion represents the region level in the topology hierarchy.
+	TopologyDomainRegion TopologyDomain = "region"
+	// TopologyDomainZone represents the zone level in the topology hierarchy.
+	TopologyDomainZone TopologyDomain = "zone"
+	// TopologyDomainDataCenter represents the datacenter level in the topology hierarchy.
+	TopologyDomainDataCenter TopologyDomain = "datacenter"
+	// TopologyDomainBlock represents the block level in the topology hierarchy.
+	TopologyDomainBlock TopologyDomain = "block"
+	// TopologyDomainRack represents the rack level in the topology hierarchy.
+	TopologyDomainRack TopologyDomain = "rack"
+	// TopologyDomainHost represents the host level in the topology hierarchy.
+	TopologyDomainHost TopologyDomain = "host"
+	// TopologyDomainNuma represents the numa level in the topology hierarchy.
+	TopologyDomainNuma TopologyDomain = "numa"
+)
+
+// TopologyLevel defines a single level in the topology hierarchy.
+type TopologyLevel struct {
+	// Domain is the predefined level identifier used in TopologyConstraint references.
+	Domain TopologyDomain `json:"domain"`
+	// Key is the node label key that identifies this topology domain.
+	Key string `json:"key"`
 }
diff --git a/operator/api/config/v1alpha1/zz_generated.deepcopy.go b/operator/api/config/v1alpha1/zz_generated.deepcopy.go
index 2cab37b4c..317f41bc1 100644
--- a/operator/api/config/v1alpha1/zz_generated.deepcopy.go
+++ b/operator/api/config/v1alpha1/zz_generated.deepcopy.go
@@ -65,6 +65,11 @@ func (in *ClientConnectionConfiguration) DeepCopy() *ClientConnectionConfigurati
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *ClusterTopologyConfiguration) DeepCopyInto(out *ClusterTopologyConfiguration) {
 	*out = *in
+	if in.Levels != nil {
+		in, out := &in.Levels, &out.Levels
+		*out = make([]TopologyLevel, len(*in))
+		copy(*out, *in)
+	}
 	return
 }
 
@@ -151,7 +156,7 @@ func (in *OperatorConfiguration) DeepCopyInto(out *OperatorConfiguration) {
 	}
 	in.Controllers.DeepCopyInto(&out.Controllers)
 	in.Authorizer.DeepCopyInto(&out.Authorizer)
-	out.ClusterTopology = in.ClusterTopology
+	in.ClusterTopology.DeepCopyInto(&out.ClusterTopology)
 	return
 }
 
@@ -279,6 +284,22 @@ func (in *ServerConfiguration) DeepCopy() *ServerConfiguration {
 	return out
 }
 
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *TopologyLevel) DeepCopyInto(out *TopologyLevel) {
+	*out = *in
+	return
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TopologyLevel.
+func (in *TopologyLevel) DeepCopy() *TopologyLevel {
+	if in == nil {
+		return nil
+	}
+	out := new(TopologyLevel)
+	in.DeepCopyInto(out)
+	return out
+}
+
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *WebhookServer) DeepCopyInto(out *WebhookServer) {
 	*out = *in
diff --git a/operator/api/config/validation/validation.go b/operator/api/config/validation/validation.go
index 05b2a601d..414683176 100644
--- a/operator/api/config/validation/validation.go
+++ b/operator/api/config/validation/validation.go
@@ -114,11 +114,45 @@ func mustBeGreaterThanZeroDuration(duration metav1.Duration, fldPath *field.Path
 }
 
 // validateClusterTopologyConfiguration validates the cluster topology configuration.
-// When cluster topology is enabled, it ensures the topology name is provided.
+// When cluster topology is enabled, it ensures the topology name and levels are provided,
+// and validates domain and key uniqueness.
 func validateClusterTopologyConfiguration(clusterTopologyCfg configv1alpha1.ClusterTopologyConfiguration, fldPath *field.Path) field.ErrorList {
 	allErrs := field.ErrorList{}
-	if clusterTopologyCfg.Enabled && len(strings.TrimSpace(clusterTopologyCfg.Name)) == 0 {
-		allErrs = append(allErrs, field.Required(fldPath.Child("name"), "clusterTopology name is required"))
+
+	if !clusterTopologyCfg.Enabled {
+		return allErrs
+	}
+
+	if len(strings.TrimSpace(clusterTopologyCfg.Name)) == 0 {
+		allErrs = append(allErrs, field.Required(fldPath.Child("name"), "name is required when topology is enabled"))
 	}
+
+	if len(clusterTopologyCfg.Levels) == 0 {
+		allErrs = append(allErrs, field.Required(fldPath.Child("levels"), "levels are required when topology is enabled"))
+	}
+
+	levelsPath := fldPath.Child("levels")
+	if len(clusterTopologyCfg.Levels) > 7 {
+		allErrs = append(allErrs, field.TooMany(levelsPath, len(clusterTopologyCfg.Levels), 7))
+	}
+
+	seenDomains := make(map[string]int)
+	for i, level := range clusterTopologyCfg.Levels {
+		if _, exists := seenDomains[string(level.Domain)]; exists {
+			allErrs = append(allErrs, field.Duplicate(levelsPath.Index(i).Child("domain"), level.Domain))
+		} else {
+			seenDomains[string(level.Domain)] = i
+		}
+	}
+
+	seenKeys := make(map[string]int)
+	for i, level := range clusterTopologyCfg.Levels {
+		if _, exists := seenKeys[level.Key]; exists {
+			allErrs = append(allErrs, field.Duplicate(levelsPath.Index(i).Child("key"), level.Key))
+		} else {
+			seenKeys[level.Key] = i
+		}
+	}
+
 	return allErrs
 }
diff --git a/operator/api/core/v1alpha1/clustertopology.go b/operator/api/core/v1alpha1/clustertopology.go
index dc756e824..1ae3cdce6 100644
--- a/operator/api/core/v1alpha1/clustertopology.go
+++ b/operator/api/core/v1alpha1/clustertopology.go
@@ -52,6 +52,8 @@ type ClusterTopologySpec struct {
 	// This field is immutable after creation.
// +kubebuilder:validation:MinItems=1 // +kubebuilder:validation:MaxItems=7 + // +kubebuilder:validation:XValidation:rule="self.all(x, self.filter(y, y.domain == x.domain).size() == 1)",message="domain must be unique across all levels" + // +kubebuilder:validation:XValidation:rule="self.all(x, self.filter(y, y.key == x.key).size() == 1)",message="key must be unique across all levels" Levels []TopologyLevel `json:"levels"` } @@ -126,6 +128,7 @@ type TopologyLevel struct { // +kubebuilder:validation:Required // +kubebuilder:validation:MinLength=1 // +kubebuilder:validation:MaxLength=63 + // +kubebuilder:validation:Pattern=`^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]/)?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$` Key string `json:"key"` } diff --git a/operator/api/core/v1alpha1/crds/grove.io_clustertopologies.yaml b/operator/api/core/v1alpha1/crds/grove.io_clustertopologies.yaml index f923ab48d..5328e0aa6 100644 --- a/operator/api/core/v1alpha1/crds/grove.io_clustertopologies.yaml +++ b/operator/api/core/v1alpha1/crds/grove.io_clustertopologies.yaml @@ -72,6 +72,7 @@ spec: Examples: "topology.kubernetes.io/zone", "kubernetes.io/hostname" maxLength: 63 minLength: 1 + pattern: ^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]/)?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$ type: string required: - domain @@ -80,6 +81,12 @@ spec: maxItems: 7 minItems: 1 type: array + x-kubernetes-validations: + - message: domain must be unique across all levels + rule: self.all(x, self.filter(y, y.domain == x.domain).size() == + 1) + - message: key must be unique across all levels + rule: self.all(x, self.filter(y, y.key == x.key).size() == 1) required: - levels type: object diff --git a/operator/cmd/main.go b/operator/cmd/main.go index f0a9ec310..a0bb96beb 100644 --- a/operator/cmd/main.go +++ b/operator/cmd/main.go @@ -18,6 +18,7 @@ package main import ( "context" + "errors" "flag" "fmt" "os" @@ -31,6 +32,8 @@ import ( groveversion "github.com/ai-dynamo/grove/operator/internal/version" "github.com/spf13/pflag" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" @@ -68,8 +71,8 @@ func main() { ctx := ctrl.SetupSignalHandler() - if err = validateClusterTopology(ctx, mgr.GetAPIReader(), operatorCfg.ClusterTopology); err != nil { - logger.Error(err, "cannot validate cluster topology, operator cannot start") + if err = ensureClusterTopology(ctx, mgr.GetClient(), operatorCfg.ClusterTopology); err != nil { + logger.Error(err, "cannot create/update cluster topology, operator cannot start") os.Exit(1) } @@ -119,14 +122,56 @@ func printFlags() { logger.Info("Running with flags", flagKVs...) 
} -func validateClusterTopology(ctx context.Context, reader client.Reader, clusterTopologyConfig configv1alpha1.ClusterTopologyConfiguration) error { - if !clusterTopologyConfig.Enabled { +func ensureClusterTopology(ctx context.Context, client client.Client, config configv1alpha1.ClusterTopologyConfiguration) error { + if !config.Enabled { return nil } - var clusterTopology grovecorev1alpha1.ClusterTopology - if err := reader.Get(ctx, types.NamespacedName{Name: clusterTopologyConfig.Name}, &clusterTopology); err != nil { - return fmt.Errorf("failed to fetch ClusterTopology %s: %w", clusterTopologyConfig.Name, err) + + if config.Name == "" { + return errors.New("ClusterTopology name is required when enabled") + } + + if len(config.Levels) == 0 { + return errors.New("ClusterTopology levels are required when enabled") + } + + topology := &grovecorev1alpha1.ClusterTopology{ + ObjectMeta: metav1.ObjectMeta{ + Name: config.Name, + }, + Spec: grovecorev1alpha1.ClusterTopologySpec{ + Levels: convertTopologyLevels(config.Levels), + }, + } + + err := client.Create(ctx, topology) + if err != nil { + if apierrors.IsAlreadyExists(err) { + existing := &grovecorev1alpha1.ClusterTopology{} + if err := client.Get(ctx, types.NamespacedName{Name: config.Name}, existing); err != nil { + return fmt.Errorf("failed to get existing ClusterTopology: %w", err) + } + existing.Spec = topology.Spec + if err := client.Update(ctx, existing); err != nil { + return fmt.Errorf("failed to update ClusterTopology: %w", err) + } + logger.Info("cluster topology updated successfully", "name", config.Name) + return nil + } + return fmt.Errorf("failed to create ClusterTopology: %w", err) } - logger.Info("topology validated successfully", "cluster topology", clusterTopologyConfig.Name) + + logger.Info("cluster topology created successfully", "name", config.Name) return nil } + +func convertTopologyLevels(levels []configv1alpha1.TopologyLevel) []grovecorev1alpha1.TopologyLevel { + result := make([]grovecorev1alpha1.TopologyLevel, len(levels)) + for i, level := range levels { + result[i] = grovecorev1alpha1.TopologyLevel{ + Domain: grovecorev1alpha1.TopologyDomain(level.Domain), + Key: level.Key, + } + } + return result +} diff --git a/operator/internal/controller/cert/cert.go b/operator/internal/controller/cert/cert.go index d7478e949..2cd60b4d2 100644 --- a/operator/internal/controller/cert/cert.go +++ b/operator/internal/controller/cert/cert.go @@ -22,7 +22,6 @@ import ( "strings" "github.com/ai-dynamo/grove/operator/internal/constants" - clustertopologyvalidation "github.com/ai-dynamo/grove/operator/internal/webhook/admission/clustertopology/validation" authorizationwebhook "github.com/ai-dynamo/grove/operator/internal/webhook/admission/pcs/authorization" defaultingwebhook "github.com/ai-dynamo/grove/operator/internal/webhook/admission/pcs/defaulting" validatingwebhook "github.com/ai-dynamo/grove/operator/internal/webhook/admission/pcs/validation" @@ -88,10 +87,6 @@ func getWebhooks(authorizerEnabled bool) []cert.WebhookInfo { Type: cert.Validating, Name: validatingwebhook.Name, }, - { - Type: cert.Validating, - Name: clustertopologyvalidation.Name, - }, } if authorizerEnabled { webhooks = append(webhooks, cert.WebhookInfo{ diff --git a/operator/internal/controller/cert/cert_test.go b/operator/internal/controller/cert/cert_test.go index 87f1b6362..7f9e4c03f 100644 --- a/operator/internal/controller/cert/cert_test.go +++ b/operator/internal/controller/cert/cert_test.go @@ -32,25 +32,23 @@ func TestGetWebhooks(t *testing.T) { 
t.Run("authorizer disabled", func(t *testing.T) { webhooks := getWebhooks(false) - // Expect 3 webhooks: podcliqueset-defaulting, podcliqueset-validating, clustertopology-validating - require.Len(t, webhooks, 3) + // Expect 2 webhooks: podcliqueset-defaulting, podcliqueset-validating + require.Len(t, webhooks, 2) // Check that defaulting and validating webhooks are present assert.Equal(t, cert.Mutating, webhooks[0].Type) // podcliqueset-defaulting-webhook assert.Equal(t, cert.Validating, webhooks[1].Type) // podcliqueset-validating-webhook - assert.Equal(t, cert.Validating, webhooks[2].Type) // clustertopology-validating-webhook }) // Test with authorizer enabled t.Run("authorizer enabled", func(t *testing.T) { webhooks := getWebhooks(true) - // Expect 4 webhooks: the 3 base webhooks plus the authorizer-webhook - require.Len(t, webhooks, 4) - // Check that all four webhooks are present + // Expect 3 webhooks: the 2 base webhooks plus the authorizer-webhook + require.Len(t, webhooks, 3) + // Check that all three webhooks are present assert.Equal(t, cert.Mutating, webhooks[0].Type) // podcliqueset-defaulting-webhook assert.Equal(t, cert.Validating, webhooks[1].Type) // podcliqueset-validating-webhook - assert.Equal(t, cert.Validating, webhooks[2].Type) // clustertopology-validating-webhook - assert.Equal(t, cert.Validating, webhooks[3].Type) // authorizer-webhook + assert.Equal(t, cert.Validating, webhooks[2].Type) // authorizer-webhook }) } diff --git a/operator/internal/webhook/admission/clustertopology/validation/clustertopology.go b/operator/internal/webhook/admission/clustertopology/validation/clustertopology.go deleted file mode 100644 index 1cbc032e6..000000000 --- a/operator/internal/webhook/admission/clustertopology/validation/clustertopology.go +++ /dev/null @@ -1,182 +0,0 @@ -// /* -// Copyright 2025 The Grove Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// */ - -package validation - -import ( - "fmt" - - grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" - - "github.com/samber/lo" - admissionv1 "k8s.io/api/admission/v1" - apivalidation "k8s.io/apimachinery/pkg/api/validation" - metav1validation "k8s.io/apimachinery/pkg/apis/meta/v1/validation" - "k8s.io/apimachinery/pkg/util/validation/field" -) - -// clusterTopologyValidator validates ClusterTopology resources for create and update operations. -type clusterTopologyValidator struct { - operation admissionv1.Operation - clusterTopology *grovecorev1alpha1.ClusterTopology -} - -// newClusterTopologyValidator creates a new ClusterTopology validator for the given operation. -func newClusterTopologyValidator(clusterTopology *grovecorev1alpha1.ClusterTopology, operation admissionv1.Operation) *clusterTopologyValidator { - return &clusterTopologyValidator{ - operation: operation, - clusterTopology: clusterTopology, - } -} - -// ---------------------------- validate create of ClusterTopology ----------------------------------------------- - -// validate validates the ClusterTopology object. 
-func (v *clusterTopologyValidator) validate() error { - allErrs := field.ErrorList{} - - // Note: Metadata validation is handled by the API server - errs := v.validateClusterTopologySpec(field.NewPath("spec")) - if len(errs) != 0 { - allErrs = append(allErrs, errs...) - } - - return allErrs.ToAggregate() -} - -// validateClusterTopologySpec validates the specification of a ClusterTopology object. -func (v *clusterTopologyValidator) validateClusterTopologySpec(fldPath *field.Path) field.ErrorList { - allErrs := field.ErrorList{} - - levelsPath := fldPath.Child("levels") - - // Note: MinItems and MaxItems are enforced by kubebuilder validation - // (+kubebuilder:validation:MinItems=1, +kubebuilder:validation:MaxItems=7) - - // First, validate each level individually (domain validity, key format, etc.) - lo.ForEach(v.clusterTopology.Spec.Levels, func(level grovecorev1alpha1.TopologyLevel, i int) { - levelPath := levelsPath.Index(i) - allErrs = append(allErrs, v.validateTopologyLevel(level, levelPath)...) - }) - - // Then validate hierarchical order (assumes all domains are valid) - // This also ensures domain uniqueness - allErrs = append(allErrs, validateTopologyLevelOrder(v.clusterTopology.Spec.Levels, levelsPath)...) - - // Validate that all keys are unique - allErrs = append(allErrs, validateTopologyLevelKeyUniqueness(v.clusterTopology.Spec.Levels, levelsPath)...) - - return allErrs -} - -// validateTopologyLevel validates a single topology level in isolation. -func (v *clusterTopologyValidator) validateTopologyLevel(level grovecorev1alpha1.TopologyLevel, fldPath *field.Path) field.ErrorList { - allErrs := field.ErrorList{} - - // Note: Domain validation is handled by kubebuilder enum validation - // (+kubebuilder:validation:Enum=region;zone;datacenter;block;rack;host;numa) - - // Note: Key required and length validation is handled by kubebuilder validation - // (+kubebuilder:validation:Required, +kubebuilder:validation:MinLength=1, +kubebuilder:validation:MaxLength=63) - - // Validate key format is a valid Kubernetes label key - keyPath := fldPath.Child("key") - allErrs = append(allErrs, metav1validation.ValidateLabelName(level.Key, keyPath)...) - - return allErrs -} - -// validateTopologyLevelOrder validates that topology levels are in the correct hierarchical order -// (broadest to narrowest: Region > Zone > DataCenter > Block > Rack > Host > Numa). -// This function assumes all domains have already been validated to be valid topology domains. -// This validation also ensures domain uniqueness (duplicate domains would have the same order value). -func validateTopologyLevelOrder(levels []grovecorev1alpha1.TopologyLevel, fldPath *field.Path) field.ErrorList { - allErrs := field.ErrorList{} - - for i := 1; i < len(levels); i++ { - prevLevel := levels[i-1] - currLevel := levels[i] - - // Current level must be narrower than previous level - if currLevel.BroaderThan(prevLevel) { - allErrs = append(allErrs, field.Invalid(fldPath.Index(i).Child("domain"), currLevel.Domain, - fmt.Sprintf("topology levels must be in hierarchical order (broadest to narrowest). Domain '%s' at position %d cannot come after '%s' at position %d", - currLevel.Domain, i, prevLevel.Domain, i-1))) - } else if currLevel.Compare(prevLevel) == 0 { - allErrs = append(allErrs, field.Duplicate(fldPath.Index(i).Child("domain"), currLevel.Domain)) - } - } - - return allErrs -} - -// validateTopologyLevelKeyUniqueness validates that all keys in the topology levels are unique. 
-func validateTopologyLevelKeyUniqueness(levels []grovecorev1alpha1.TopologyLevel, fldPath *field.Path) field.ErrorList { - allErrs := field.ErrorList{} - seenKeys := make(map[string]int) // map key to first index where it appears - - for i, level := range levels { - if firstIndex, exists := seenKeys[level.Key]; exists { - allErrs = append(allErrs, field.Invalid(fldPath.Index(i).Child("key"), level.Key, - fmt.Sprintf("duplicate key: already used at levels[%d]", firstIndex))) - } else { - seenKeys[level.Key] = i - } - } - - return allErrs -} - -// ---------------------------- validate update of ClusterTopology ----------------------------------------------- - -// validateUpdate validates the update to a ClusterTopology object. -func (v *clusterTopologyValidator) validateUpdate(oldClusterTopology *grovecorev1alpha1.ClusterTopology) error { - allErrs := field.ErrorList{} - allErrs = append(allErrs, v.validateClusterTopologySpecUpdate(&v.clusterTopology.Spec, &oldClusterTopology.Spec, field.NewPath("spec"))...) - return allErrs.ToAggregate() -} - -// validateClusterTopologySpecUpdate validates updates to the ClusterTopology specification. -func (v *clusterTopologyValidator) validateClusterTopologySpecUpdate(oldSpec, newSpec *grovecorev1alpha1.ClusterTopologySpec, fldPath *field.Path) field.ErrorList { - allErrs := field.ErrorList{} - - levelsPath := fldPath.Child("levels") - - // Validate that the number of levels hasn't changed - if len(newSpec.Levels) != len(oldSpec.Levels) { - allErrs = append(allErrs, field.Forbidden(levelsPath, "not allowed to add or remove topology levels")) - return allErrs - } - - // Validate that the order and domains of levels haven't changed - allErrs = append(allErrs, validateTopologyLevelImmutableFields(newSpec.Levels, oldSpec.Levels, levelsPath)...) - - return allErrs -} - -// validateTopologyLevelImmutableFields validates that immutable fields in topology levels haven't changed during an update. -func validateTopologyLevelImmutableFields(newLevels, oldLevels []grovecorev1alpha1.TopologyLevel, fldPath *field.Path) field.ErrorList { - allErrs := field.ErrorList{} - - for i := range newLevels { - levelPath := fldPath.Index(i) - - allErrs = append(allErrs, apivalidation.ValidateImmutableField(newLevels[i].Domain, oldLevels[i].Domain, levelPath.Child("domain"))...) - // Note: Key is allowed to change (not in the requirements), but validation has already occurred in validate() - } - - return allErrs -} diff --git a/operator/internal/webhook/admission/clustertopology/validation/clustertopology_crd_test.go b/operator/internal/webhook/admission/clustertopology/validation/clustertopology_crd_test.go deleted file mode 100644 index 20c774b0a..000000000 --- a/operator/internal/webhook/admission/clustertopology/validation/clustertopology_crd_test.go +++ /dev/null @@ -1,252 +0,0 @@ -// /* -// Copyright 2025 The Grove Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// */ - -package validation - -import ( - "context" - "os" - "testing" - "time" - - grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - utilruntime "k8s.io/apimachinery/pkg/util/runtime" - "k8s.io/client-go/kubernetes/scheme" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/envtest" -) - -func TestClusterTopologyCRDValidation(t *testing.T) { - testEnv, k8sClient := setupEnvTest(t) - defer teardownEnvTest(t, testEnv) - - tests := []struct { - name string - ct *grovecorev1alpha1.ClusterTopology - expectError bool - errorContains string - }{ - { - name: "valid cluster topology with single level", - ct: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{Name: "test-valid-single"}, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainRegion, - Key: "topology.kubernetes.io/region", - }, - }, - }, - }, - expectError: false, - }, - { - name: "valid cluster topology with multiple levels", - ct: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{Name: "test-valid-multiple"}, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainRegion, - Key: "topology.kubernetes.io/region", - }, - { - Domain: grovecorev1alpha1.TopologyDomainZone, - Key: "topology.kubernetes.io/zone", - }, - { - Domain: grovecorev1alpha1.TopologyDomainHost, - Key: "kubernetes.io/hostname", - }, - }, - }, - }, - expectError: false, - }, - { - name: "reject invalid domain", - ct: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{Name: "test-invalid-domain"}, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: "invalid-domain", // Not in enum - Key: "test.io/key", - }, - }, - }, - }, - expectError: true, - errorContains: "supported values", - }, - { - name: "reject empty key", - ct: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{Name: "test-empty-key"}, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainRegion, - Key: "", // Empty key - }, - }, - }, - }, - expectError: true, - errorContains: "spec.levels[0].key", - }, - { - name: "reject key too long", - ct: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{Name: "test-key-too-long"}, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainRegion, - // Create a key longer than 63 characters (MaxLength validation) - Key: "this-is-a-very-long-key-that-exceeds-the-maximum-allowed-length-of-63-characters", - }, - }, - }, - }, - expectError: true, - errorContains: "spec.levels[0].key", - }, - { - name: "reject empty levels array", - ct: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{Name: "test-empty-levels"}, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{}, // Empty array - }, - }, - expectError: true, - errorContains: "spec.levels", - }, - { - name: "reject too many levels", - ct: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{Name: "test-too-many-levels"}, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - 
Levels: []grovecorev1alpha1.TopologyLevel{ - {Domain: grovecorev1alpha1.TopologyDomainRegion, Key: "key1"}, - {Domain: grovecorev1alpha1.TopologyDomainZone, Key: "key2"}, - {Domain: grovecorev1alpha1.TopologyDomainDataCenter, Key: "key3"}, - {Domain: grovecorev1alpha1.TopologyDomainBlock, Key: "key4"}, - {Domain: grovecorev1alpha1.TopologyDomainRack, Key: "key5"}, - {Domain: grovecorev1alpha1.TopologyDomainHost, Key: "key6"}, - {Domain: grovecorev1alpha1.TopologyDomainNuma, Key: "key7"}, - {Domain: "region", Key: "key8"}, // 8th level - exceeds max - }, - }, - }, - expectError: true, - errorContains: "spec.levels", - }, - { - name: "reject missing domain", - ct: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{Name: "test-missing-domain"}, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Key: "test.io/key", // Missing domain - }, - }, - }, - }, - expectError: true, - // API server should reject due to Required validation - }, - { - name: "reject missing key", - ct: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{Name: "test-missing-key"}, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainRegion, - // Missing key - }, - }, - }, - }, - expectError: true, - // API server should reject due to Required validation - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - err := k8sClient.Create(context.Background(), tt.ct) - - if tt.expectError { - assert.Error(t, err) - if tt.errorContains != "" { - assert.Contains(t, err.Error(), tt.errorContains) - } - } else { - assert.NoError(t, err) - // Clean up successfully created resource - _ = k8sClient.Delete(context.Background(), tt.ct) - } - }) - } -} - -// setupEnvTest creates an envtest environment with ClusterTopology CRDs loaded -func setupEnvTest(t *testing.T) (*envtest.Environment, client.Client) { - // Check if KUBEBUILDER_ASSETS is set - if os.Getenv("KUBEBUILDER_ASSETS") == "" { - t.Skip("Skipping envtest: KUBEBUILDER_ASSETS not set. Run 'make test-envtest' to execute this test.") - } - - // Point to CRD directory (relative from this test file location) - crdPath := "../../../../../api/core/v1alpha1/crds" - - testEnv := &envtest.Environment{ - CRDDirectoryPaths: []string{crdPath}, - ErrorIfCRDPathMissing: true, - } - - cfg, err := testEnv.Start() - require.NoError(t, err) - - // Create client with proper scheme - testScheme := runtime.NewScheme() - utilruntime.Must(scheme.AddToScheme(testScheme)) - utilruntime.Must(grovecorev1alpha1.AddToScheme(testScheme)) - - k8sClient, err := client.New(cfg, client.Options{Scheme: testScheme}) - require.NoError(t, err) - - // Give CRDs a moment to be fully ready - time.Sleep(100 * time.Millisecond) - - return testEnv, k8sClient -} - -// teardownEnvTest stops the envtest environment -func teardownEnvTest(t *testing.T, testEnv *envtest.Environment) { - err := testEnv.Stop() - require.NoError(t, err) -} diff --git a/operator/internal/webhook/admission/clustertopology/validation/clustertopology_test.go b/operator/internal/webhook/admission/clustertopology/validation/clustertopology_test.go deleted file mode 100644 index 1e4da5c58..000000000 --- a/operator/internal/webhook/admission/clustertopology/validation/clustertopology_test.go +++ /dev/null @@ -1,500 +0,0 @@ -// /* -// Copyright 2025 The Grove Authors. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// */ - -package validation - -import ( - "context" - "testing" - - grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" - - "github.com/go-logr/logr" - "github.com/stretchr/testify/assert" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" -) - -func TestValidateCreate(t *testing.T) { - tests := []struct { - name string - clusterTopology *grovecorev1alpha1.ClusterTopology - expectedErr bool - expectedErrMsg string - }{ - { - name: "valid cluster topology with single level", - clusterTopology: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - }, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainZone, - Key: "topology.kubernetes.io/zone", - }, - }, - }, - }, - expectedErr: false, - }, - { - name: "valid cluster topology with multiple levels", - clusterTopology: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - }, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainRegion, - Key: "topology.kubernetes.io/region", - }, - { - Domain: grovecorev1alpha1.TopologyDomainZone, - Key: "topology.kubernetes.io/zone", - }, - { - Domain: grovecorev1alpha1.TopologyDomainHost, - Key: "kubernetes.io/hostname", - }, - }, - }, - }, - expectedErr: false, - }, - { - name: "invalid - duplicate domain (caught by order validation)", - clusterTopology: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - }, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainZone, - Key: "topology.kubernetes.io/zone", - }, - { - Domain: grovecorev1alpha1.TopologyDomainZone, - Key: "custom.io/zone", - }, - }, - }, - }, - expectedErr: true, - expectedErrMsg: "Duplicate value", - }, - { - name: "invalid - duplicate key", - clusterTopology: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - }, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainRegion, - Key: "topology.kubernetes.io/zone", - }, - { - Domain: grovecorev1alpha1.TopologyDomainZone, - Key: "topology.kubernetes.io/zone", - }, - }, - }, - }, - expectedErr: true, - expectedErrMsg: "duplicate key", - }, - { - name: "invalid - key not a valid label (has space)", - clusterTopology: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - }, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainZone, - Key: "Invalid value", - }, - }, - }, - }, - expectedErr: true, - expectedErrMsg: "name part must consist of alphanumeric characters", - }, - { - name: "invalid - key 
prefix has invalid characters (double dots)", - clusterTopology: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - }, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainZone, - Key: "invalid..label/key", - }, - }, - }, - }, - expectedErr: true, - expectedErrMsg: "prefix part", - }, - { - name: "invalid - key name part too long", - clusterTopology: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - }, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainZone, - Key: "example.com/this-is-a-very-long-name-that-exceeds-the-maximum-length-of-sixtythree-characters", - }, - }, - }, - }, - expectedErr: true, - expectedErrMsg: "name part must be no more than 63 characters", - }, - { - name: "invalid - levels out of order (zone before region)", - clusterTopology: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - }, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainZone, - Key: "topology.kubernetes.io/zone", - }, - { - Domain: grovecorev1alpha1.TopologyDomainRegion, - Key: "topology.kubernetes.io/region", - }, - }, - }, - }, - expectedErr: true, - expectedErrMsg: "topology levels must be in hierarchical order", - }, - { - name: "valid - correct hierarchical order (region > zone > datacenter > rack > host > numa)", - clusterTopology: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - }, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainRegion, - Key: "topology.kubernetes.io/region", - }, - { - Domain: grovecorev1alpha1.TopologyDomainZone, - Key: "topology.kubernetes.io/zone", - }, - { - Domain: grovecorev1alpha1.TopologyDomainDataCenter, - Key: "topology.kubernetes.io/datacenter", - }, - { - Domain: grovecorev1alpha1.TopologyDomainRack, - Key: "topology.kubernetes.io/rack", - }, - { - Domain: grovecorev1alpha1.TopologyDomainHost, - Key: "kubernetes.io/hostname", - }, - { - Domain: grovecorev1alpha1.TopologyDomainNuma, - Key: "topology.kubernetes.io/numa", - }, - }, - }, - }, - expectedErr: false, - }, - } - - handler := &Handler{logger: logr.Discard()} - ctx := context.Background() - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - _, err := handler.ValidateCreate(ctx, tt.clusterTopology) - - if tt.expectedErr { - assert.Error(t, err) - if tt.expectedErrMsg != "" { - assert.Contains(t, err.Error(), tt.expectedErrMsg) - } - } else { - assert.NoError(t, err) - } - }) - } -} - -func TestValidateUpdate(t *testing.T) { - tests := []struct { - name string - oldClusterTopology *grovecorev1alpha1.ClusterTopology - newClusterTopology *grovecorev1alpha1.ClusterTopology - expectedErr bool - expectedErrMsg string - }{ - { - name: "valid update - key changed", - oldClusterTopology: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - }, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainZone, - Key: "topology.kubernetes.io/zone", - }, - }, - }, - }, - newClusterTopology: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: 
"test-topology", - }, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainZone, - Key: "custom.io/zone", - }, - }, - }, - }, - expectedErr: false, - }, - { - name: "invalid - domain changed", - oldClusterTopology: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - }, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainZone, - Key: "topology.kubernetes.io/zone", - }, - }, - }, - }, - newClusterTopology: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - }, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainRegion, - Key: "topology.kubernetes.io/region", - }, - }, - }, - }, - expectedErr: true, - expectedErrMsg: "field is immutable", - }, - { - name: "invalid - level added", - oldClusterTopology: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - }, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainZone, - Key: "topology.kubernetes.io/zone", - }, - }, - }, - }, - newClusterTopology: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - }, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainZone, - Key: "topology.kubernetes.io/zone", - }, - { - Domain: grovecorev1alpha1.TopologyDomainHost, - Key: "kubernetes.io/hostname", - }, - }, - }, - }, - expectedErr: true, - expectedErrMsg: "not allowed to add or remove topology levels", - }, - { - name: "invalid - level removed", - oldClusterTopology: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - }, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainRegion, - Key: "topology.kubernetes.io/region", - }, - { - Domain: grovecorev1alpha1.TopologyDomainZone, - Key: "topology.kubernetes.io/zone", - }, - }, - }, - }, - newClusterTopology: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - }, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainRegion, - Key: "topology.kubernetes.io/region", - }, - }, - }, - }, - expectedErr: true, - expectedErrMsg: "not allowed to add or remove topology levels", - }, - { - name: "invalid - update creates duplicate key", - oldClusterTopology: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - }, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainRegion, - Key: "topology.kubernetes.io/region", - }, - { - Domain: grovecorev1alpha1.TopologyDomainZone, - Key: "topology.kubernetes.io/zone", - }, - }, - }, - }, - newClusterTopology: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - }, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainRegion, - Key: "topology.kubernetes.io/zone", - }, - { - Domain: 
grovecorev1alpha1.TopologyDomainZone, - Key: "topology.kubernetes.io/zone", - }, - }, - }, - }, - expectedErr: true, - expectedErrMsg: "duplicate key", - }, - { - name: "valid update - multiple levels, keys changed", - oldClusterTopology: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - }, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainRegion, - Key: "topology.kubernetes.io/region", - }, - { - Domain: grovecorev1alpha1.TopologyDomainZone, - Key: "topology.kubernetes.io/zone", - }, - { - Domain: grovecorev1alpha1.TopologyDomainHost, - Key: "kubernetes.io/hostname", - }, - }, - }, - }, - newClusterTopology: &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - }, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: []grovecorev1alpha1.TopologyLevel{ - { - Domain: grovecorev1alpha1.TopologyDomainRegion, - Key: "custom.io/region", - }, - { - Domain: grovecorev1alpha1.TopologyDomainZone, - Key: "custom.io/zone", - }, - { - Domain: grovecorev1alpha1.TopologyDomainHost, - Key: "custom.io/host", - }, - }, - }, - }, - expectedErr: false, - }, - } - - handler := &Handler{logger: logr.Discard()} - ctx := context.Background() - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - _, err := handler.ValidateUpdate(ctx, tt.oldClusterTopology, tt.newClusterTopology) - - if tt.expectedErr { - assert.Error(t, err) - if tt.expectedErrMsg != "" { - assert.Contains(t, err.Error(), tt.expectedErrMsg) - } - } else { - assert.NoError(t, err) - } - }) - } -} diff --git a/operator/internal/webhook/admission/clustertopology/validation/handler.go b/operator/internal/webhook/admission/clustertopology/validation/handler.go deleted file mode 100644 index fb1e7dba8..000000000 --- a/operator/internal/webhook/admission/clustertopology/validation/handler.go +++ /dev/null @@ -1,102 +0,0 @@ -// /* -// Copyright 2025 The Grove Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// */ - -package validation - -import ( - "context" - "fmt" - - "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" - "github.com/ai-dynamo/grove/operator/internal/errors" - - "github.com/go-logr/logr" - admissionv1 "k8s.io/api/admission/v1" - "k8s.io/apimachinery/pkg/runtime" - "sigs.k8s.io/controller-runtime/pkg/manager" - "sigs.k8s.io/controller-runtime/pkg/webhook/admission" -) - -const ( - // ErrValidateCreateClusterTopology is the error code returned where the request to create a ClusterTopology is invalid. - ErrValidateCreateClusterTopology v1alpha1.ErrorCode = "ERR_VALIDATE_CREATE_CLUSTERTOPOLOGY" - // ErrValidateUpdateClusterTopology is the error code returned where the request to update a ClusterTopology is invalid. - ErrValidateUpdateClusterTopology v1alpha1.ErrorCode = "ERR_VALIDATE_UPDATE_CLUSTERTOPOLOGY" -) - -// Handler is a handler for validating ClusterTopology resources. 
-type Handler struct { - logger logr.Logger -} - -// NewHandler creates a new handler for ClusterTopology Webhook. -func NewHandler(mgr manager.Manager) *Handler { - return &Handler{ - logger: mgr.GetLogger().WithName("webhook").WithName(Name), - } -} - -// ValidateCreate validates a ClusterTopology create request. -func (h *Handler) ValidateCreate(ctx context.Context, obj runtime.Object) (admission.Warnings, error) { - h.logValidatorFunctionInvocation(ctx) - clusterTopology, err := castToClusterTopology(obj) - if err != nil { - return nil, errors.WrapError(err, ErrValidateCreateClusterTopology, string(admissionv1.Create), "failed to cast object to ClusterTopology") - } - return nil, newClusterTopologyValidator(clusterTopology, admissionv1.Create).validate() -} - -// ValidateUpdate validates a ClusterTopology update request. -func (h *Handler) ValidateUpdate(ctx context.Context, oldObj, newObj runtime.Object) (admission.Warnings, error) { - h.logValidatorFunctionInvocation(ctx) - newClusterTopology, err := castToClusterTopology(newObj) - if err != nil { - return nil, errors.WrapError(err, ErrValidateUpdateClusterTopology, string(admissionv1.Update), "failed to cast new object to ClusterTopology") - } - oldClusterTopology, err := castToClusterTopology(oldObj) - if err != nil { - return nil, errors.WrapError(err, ErrValidateUpdateClusterTopology, string(admissionv1.Update), "failed to cast old object to ClusterTopology") - } - validator := newClusterTopologyValidator(newClusterTopology, admissionv1.Update) - if err := validator.validate(); err != nil { - return nil, err - } - return nil, validator.validateUpdate(oldClusterTopology) -} - -// ValidateDelete validates a ClusterTopology delete request. -func (h *Handler) ValidateDelete(_ context.Context, _ runtime.Object) (admission.Warnings, error) { - return nil, nil -} - -// castToClusterTopology attempts to cast a runtime.Object to a ClusterTopology. -func castToClusterTopology(obj runtime.Object) (*v1alpha1.ClusterTopology, error) { - clusterTopology, ok := obj.(*v1alpha1.ClusterTopology) - if !ok { - return nil, fmt.Errorf("expected a ClusterTopology object but got %T", obj) - } - return clusterTopology, nil -} - -// logValidatorFunctionInvocation logs details about the validation request including user and operation information. -func (h *Handler) logValidatorFunctionInvocation(ctx context.Context) { - req, err := admission.RequestFromContext(ctx) - if err != nil { - h.logger.Error(err, "failed to get request from context") - return - } - h.logger.Info("ClusterTopology validation webhook invoked", "name", req.Name, "namespace", req.Namespace, "operation", req.Operation, "user", req.UserInfo.Username) -} diff --git a/operator/internal/webhook/admission/clustertopology/validation/register.go b/operator/internal/webhook/admission/clustertopology/validation/register.go deleted file mode 100644 index 76c579577..000000000 --- a/operator/internal/webhook/admission/clustertopology/validation/register.go +++ /dev/null @@ -1,39 +0,0 @@ -// /* -// Copyright 2025 The Grove Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// */ - -package validation - -import ( - "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" - - "sigs.k8s.io/controller-runtime/pkg/manager" - "sigs.k8s.io/controller-runtime/pkg/webhook/admission" -) - -const ( - // Name is the name of the validating webhook handler for ClusterTopology. - Name = "clustertopology-validating-webhook" - webhookPath = "/webhooks/validate-clustertopology" -) - -// RegisterWithManager registers the webhook with the manager. -func (h *Handler) RegisterWithManager(mgr manager.Manager) error { - webhook := admission. - WithCustomValidator(mgr.GetScheme(), &v1alpha1.ClusterTopology{}, h). - WithRecoverPanic(true) - mgr.GetWebhookServer().Register(webhookPath, webhook) - return nil -} diff --git a/operator/internal/webhook/register.go b/operator/internal/webhook/register.go index b46ea1cb4..49dff0dc3 100644 --- a/operator/internal/webhook/register.go +++ b/operator/internal/webhook/register.go @@ -23,7 +23,6 @@ import ( configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1" "github.com/ai-dynamo/grove/operator/internal/constants" - ctvalidation "github.com/ai-dynamo/grove/operator/internal/webhook/admission/clustertopology/validation" "github.com/ai-dynamo/grove/operator/internal/webhook/admission/pcs/authorization" "github.com/ai-dynamo/grove/operator/internal/webhook/admission/pcs/defaulting" pcsvalidation "github.com/ai-dynamo/grove/operator/internal/webhook/admission/pcs/validation" @@ -43,11 +42,6 @@ func RegisterWebhooks(mgr manager.Manager, authorizerConfig configv1alpha1.Autho if err := pcsValidatingWebhook.RegisterWithManager(mgr); err != nil { return fmt.Errorf("failed adding %s webhook handler: %v", pcsvalidation.Name, err) } - ctValidatingWebhook := ctvalidation.NewHandler(mgr) - slog.Info("Registering webhook with manager", "handler", ctvalidation.Name) - if err := ctValidatingWebhook.RegisterWithManager(mgr); err != nil { - return fmt.Errorf("failed adding %s webhook handler: %v", ctvalidation.Name, err) - } if authorizerConfig.Enabled { serviceAccountName, ok := os.LookupEnv(constants.EnvVarServiceAccountName) if !ok { From cde881659082f309bc9631a28615b16903c057c5 Mon Sep 17 00:00:00 2001 From: Ron Kahn Date: Wed, 24 Dec 2025 17:42:56 +0200 Subject: [PATCH 02/23] feat: implement ClusterTopology management with KAI integration Signed-off-by: Ron Kahn --- operator/api/config/v1alpha1/defaults.go | 7 - operator/api/config/v1alpha1/defaults_test.go | 85 ----------- operator/api/config/v1alpha1/types.go | 3 + operator/api/config/validation/validation.go | 41 ++--- .../api/config/validation/validation_test.go | 19 +-- operator/api/core/v1alpha1/clustertopology.go | 4 + ...tertopology-validating-webhook-config.yaml | 34 ----- operator/charts/values.yaml | 5 - operator/cmd/main.go | 73 +-------- operator/go.mod | 20 +-- operator/go.sum | 51 ++++--- operator/internal/client/scheme.go | 2 + operator/internal/topology/manager.go | 142 ++++++++++++++++++ operator/internal/topology/manager_test.go | 129 ++++++++++++++++ 14 files changed, 354 insertions(+), 261 deletions(-) delete mode 100644 operator/api/config/v1alpha1/defaults_test.go delete mode 100644 operator/charts/templates/clustertopology-validating-webhook-config.yaml create mode 100644 operator/internal/topology/manager.go create mode 100644 operator/internal/topology/manager_test.go diff --git a/operator/api/config/v1alpha1/defaults.go 
b/operator/api/config/v1alpha1/defaults.go index b9e8e7359..f9451338d 100644 --- a/operator/api/config/v1alpha1/defaults.go +++ b/operator/api/config/v1alpha1/defaults.go @@ -115,10 +115,3 @@ func SetDefaults_PodCliqueScalingGroupControllerConfiguration(obj *PodCliqueScal obj.ConcurrentSyncs = ptr.To(1) } } - -// SetDefaults_ClusterTopologyConfiguration sets defaults for the ClusterTopologyConfiguration. -func SetDefaults_ClusterTopologyConfiguration(obj *ClusterTopologyConfiguration) { - if obj.Enabled && obj.Name == "" { - obj.Name = defaultTopologyName - } -} diff --git a/operator/api/config/v1alpha1/defaults_test.go b/operator/api/config/v1alpha1/defaults_test.go deleted file mode 100644 index 35209914d..000000000 --- a/operator/api/config/v1alpha1/defaults_test.go +++ /dev/null @@ -1,85 +0,0 @@ -// /* -// Copyright 2025 The Grove Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// */ - -package v1alpha1 - -import ( - "testing" - - "github.com/stretchr/testify/assert" -) - -func TestSetDefaults_ClusterTopologyConfiguration(t *testing.T) { - tests := []struct { - name string - input ClusterTopologyConfiguration - expected ClusterTopologyConfiguration - }{ - { - name: "enabled with no name defaults to grove-topology", - input: ClusterTopologyConfiguration{ - Enabled: true, - Name: "", - }, - expected: ClusterTopologyConfiguration{ - Enabled: true, - Name: "grove-topology", - }, - }, - { - name: "enabled with custom name preserves name", - input: ClusterTopologyConfiguration{ - Enabled: true, - Name: "custom-topology", - }, - expected: ClusterTopologyConfiguration{ - Enabled: true, - Name: "custom-topology", - }, - }, - { - name: "disabled with no name remains empty", - input: ClusterTopologyConfiguration{ - Enabled: false, - Name: "", - }, - expected: ClusterTopologyConfiguration{ - Enabled: false, - Name: "", - }, - }, - { - name: "disabled with name preserves name", - input: ClusterTopologyConfiguration{ - Enabled: false, - Name: "some-topology", - }, - expected: ClusterTopologyConfiguration{ - Enabled: false, - Name: "some-topology", - }, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - cfg := tt.input - SetDefaults_ClusterTopologyConfiguration(&cfg) - assert.Equal(t, tt.expected, cfg) - }) - } -} diff --git a/operator/api/config/v1alpha1/types.go b/operator/api/config/v1alpha1/types.go index 7d983b3ee..826f7d027 100644 --- a/operator/api/config/v1alpha1/types.go +++ b/operator/api/config/v1alpha1/types.go @@ -219,6 +219,9 @@ const ( TopologyDomainNuma TopologyDomain = "numa" ) +// MaxTopologyLevels is the maximum number of topology levels supported. +const MaxTopologyLevels = 7 + // TopologyLevel defines a single level in the topology hierarchy. type TopologyLevel struct { // Domain is the predefined level identifier used in TopologyConstraint references. 
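
[Editor's note, not part of the patch: with defaults.go gone and MaxTopologyLevels introduced above, the operator configuration now carries the whole hierarchy directly. A minimal sketch of a fully populated configuration follows; the example.com/* label keys are illustrative placeholders, not values shipped by Grove.]

package main

import (
	"fmt"

	configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1"
)

func main() {
	// All seven predefined domains, ordered broadest to narrowest.
	cfg := configv1alpha1.ClusterTopologyConfiguration{
		Enabled: true,
		Levels: []configv1alpha1.TopologyLevel{
			{Domain: configv1alpha1.TopologyDomainRegion, Key: "topology.kubernetes.io/region"},
			{Domain: configv1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"},
			{Domain: configv1alpha1.TopologyDomainDataCenter, Key: "example.com/datacenter"},
			{Domain: configv1alpha1.TopologyDomainBlock, Key: "example.com/block"},
			{Domain: configv1alpha1.TopologyDomainRack, Key: "example.com/rack"},
			{Domain: configv1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"},
			{Domain: configv1alpha1.TopologyDomainNuma, Key: "example.com/numa"},
		},
	}
	// MaxTopologyLevels bounds the list; an eighth level would be rejected
	// by the configuration validation with field.TooMany.
	fmt.Println(len(cfg.Levels) <= configv1alpha1.MaxTopologyLevels) // true
}
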
diff --git a/operator/api/config/validation/validation.go b/operator/api/config/validation/validation.go index 414683176..e6475ab2f 100644 --- a/operator/api/config/validation/validation.go +++ b/operator/api/config/validation/validation.go @@ -123,36 +123,41 @@ func validateClusterTopologyConfiguration(clusterTopologyCfg configv1alpha1.Clus return allErrs } - if len(strings.TrimSpace(clusterTopologyCfg.Name)) == 0 { - allErrs = append(allErrs, field.Required(fldPath.Child("name"), "name is required when topology is enabled")) - } + allErrs = validateClusterTopologyLevels(clusterTopologyCfg.Levels, fldPath.Child("levels")) + + return allErrs +} - if len(clusterTopologyCfg.Levels) == 0 { +func validateClusterTopologyLevels(levels []configv1alpha1.TopologyLevel, fldPath *field.Path) field.ErrorList { + allErrs := field.ErrorList{} + if len(levels) == 0 { allErrs = append(allErrs, field.Required(fldPath.Child("levels"), "levels are required when topology is enabled")) } - levelsPath := fldPath.Child("levels") - if len(clusterTopologyCfg.Levels) > 7 { - allErrs = append(allErrs, field.TooMany(levelsPath, len(clusterTopologyCfg.Levels), 7)) + if len(levels) > configv1alpha1.MaxTopologyLevels { + allErrs = append(allErrs, field.TooMany(fldPath, len(levels), configv1alpha1.MaxTopologyLevels)) } - seenDomains := make(map[string]int) - for i, level := range clusterTopologyCfg.Levels { - if _, exists := seenDomains[string(level.Domain)]; exists { - allErrs = append(allErrs, field.Duplicate(levelsPath.Index(i).Child("domain"), level.Domain)) - } else { - seenDomains[string(level.Domain)] = i - } - } + allErrs = append(allErrs, validateLevelsUniqueness(levels, fldPath)...) + return allErrs +} + +func validateLevelsUniqueness(levels []configv1alpha1.TopologyLevel, fldPath *field.Path) field.ErrorList { + allErrs := field.ErrorList{} + seenDomains := make(map[string]int) seenKeys := make(map[string]int) - for i, level := range clusterTopologyCfg.Levels { + for i, level := range levels { if _, exists := seenKeys[level.Key]; exists { - allErrs = append(allErrs, field.Duplicate(levelsPath.Index(i).Child("key"), level.Key)) + allErrs = append(allErrs, field.Duplicate(fldPath.Index(i).Child("key"), level.Key)) } else { seenKeys[level.Key] = i } + if _, exists := seenDomains[string(level.Domain)]; exists { + allErrs = append(allErrs, field.Duplicate(fldPath.Index(i).Child("domain"), level.Domain)) + } else { + seenDomains[string(level.Domain)] = i + } } - return allErrs } diff --git a/operator/api/config/validation/validation_test.go b/operator/api/config/validation/validation_test.go index 3e6099739..4b15439de 100644 --- a/operator/api/config/validation/validation_test.go +++ b/operator/api/config/validation/validation_test.go @@ -33,46 +33,41 @@ func TestValidateClusterTopologyConfiguration(t *testing.T) { errorField string }{ { - name: "valid: enabled with name", + name: "valid: enabled with levels", config: configv1alpha1.ClusterTopologyConfiguration{ Enabled: true, - Name: "my-topology", }, expectError: false, }, { - name: "valid: disabled with no name", + name: "valid: disabled with no levels", config: configv1alpha1.ClusterTopologyConfiguration{ Enabled: false, - Name: "", }, expectError: false, }, { - name: "valid: disabled with name", + name: "valid: disabled with levels", config: configv1alpha1.ClusterTopologyConfiguration{ Enabled: false, - Name: "my-topology", }, expectError: false, }, { - name: "invalid: enabled with empty name", + name: "invalid: enabled with empty levels", config: 
configv1alpha1.ClusterTopologyConfiguration{ Enabled: true, - Name: "", }, expectError: true, - errorField: "clusterTopology.name", + errorField: "clusterTopology.levels", }, { - name: "invalid: enabled with whitespace-only name", + name: "invalid: enabled with empty levels", config: configv1alpha1.ClusterTopologyConfiguration{ Enabled: true, - Name: " ", }, expectError: true, - errorField: "clusterTopology.name", + errorField: "clusterTopology.levels", }, } diff --git a/operator/api/core/v1alpha1/clustertopology.go b/operator/api/core/v1alpha1/clustertopology.go index 1ae3cdce6..8fda48f1b 100644 --- a/operator/api/core/v1alpha1/clustertopology.go +++ b/operator/api/core/v1alpha1/clustertopology.go @@ -20,6 +20,10 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) +const ( + ClusterTopologyName = "grove-topology" +) + // +genclient // +genclient:nonNamespaced // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object diff --git a/operator/charts/templates/clustertopology-validating-webhook-config.yaml b/operator/charts/templates/clustertopology-validating-webhook-config.yaml deleted file mode 100644 index 0a562b209..000000000 --- a/operator/charts/templates/clustertopology-validating-webhook-config.yaml +++ /dev/null @@ -1,34 +0,0 @@ ---- -apiVersion: admissionregistration.k8s.io/v1 -kind: ValidatingWebhookConfiguration -metadata: - name: clustertopology-validating-webhook - labels: -{{- include "operator.clustertopology.validating.webhook.labels" . | nindent 4 }} -webhooks: - - admissionReviewVersions: - - v1 - clientConfig: - service: - name: {{ required ".Values.service.name is required" .Values.service.name }} - namespace: {{ .Release.Namespace }} - path: /webhooks/validate-clustertopology - port: {{ required ".Values.config.server.webhooks.port" .Values.config.server.webhooks.port }} - failurePolicy: Fail - matchPolicy: Exact - name: clustertopology.validating.webhooks.grove.io - namespaceSelector: { } - rules: - - apiGroups: - - "grove.io" - apiVersions: - - "v1alpha1" - operations: - - CREATE - - UPDATE - resources: - - clustertopologies - scope: Cluster - sideEffects: None - timeoutSeconds: 10 - diff --git a/operator/charts/values.yaml b/operator/charts/values.yaml index cc62d8d5b..da6105cbb 100644 --- a/operator/charts/values.yaml +++ b/operator/charts/values.yaml @@ -125,11 +125,6 @@ webhooks: app.kubernetes.io/component: operator-pcs-defaulting-webhook app.kubernetes.io/name: pcs-defaulting-webhook app.kubernetes.io/part-of: grove - clusterTopologyValidationWebhook: - labels: - app.kubernetes.io/component: operator-clustertopology-validating-webhook - app.kubernetes.io/name: clustertopology-validating-webhook - app.kubernetes.io/part-of: grove authorizerWebhook: labels: app.kubernetes.io/component: operator-authorizer-webhook diff --git a/operator/cmd/main.go b/operator/cmd/main.go index a0bb96beb..8822b6113 100644 --- a/operator/cmd/main.go +++ b/operator/cmd/main.go @@ -17,26 +17,20 @@ package main import ( - "context" - "errors" "flag" - "fmt" "os" configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1" - grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" + corev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" groveopts "github.com/ai-dynamo/grove/operator/cmd/opts" grovectrl "github.com/ai-dynamo/grove/operator/internal/controller" "github.com/ai-dynamo/grove/operator/internal/controller/cert" grovelogger "github.com/ai-dynamo/grove/operator/internal/logger" + 
"github.com/ai-dynamo/grove/operator/internal/topology" groveversion "github.com/ai-dynamo/grove/operator/internal/version" "github.com/spf13/pflag" - apierrors "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/types" ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/client" ) var ( @@ -70,10 +64,11 @@ func main() { } ctx := ctrl.SetupSignalHandler() - - if err = ensureClusterTopology(ctx, mgr.GetClient(), operatorCfg.ClusterTopology); err != nil { - logger.Error(err, "cannot create/update cluster topology, operator cannot start") - os.Exit(1) + if operatorCfg.ClusterTopology.Enabled { + if err = topology.EnsureTopology(ctx, mgr.GetClient(), corev1alpha1.ClusterTopologyName, operatorCfg.ClusterTopology.Levels); err != nil { + logger.Error(err, "cannot create/update cluster topology, operator cannot start") + os.Exit(1) + } } webhookCertsReadyCh := make(chan struct{}) @@ -121,57 +116,3 @@ func printFlags() { }) logger.Info("Running with flags", flagKVs...) } - -func ensureClusterTopology(ctx context.Context, client client.Client, config configv1alpha1.ClusterTopologyConfiguration) error { - if !config.Enabled { - return nil - } - - if config.Name == "" { - return errors.New("ClusterTopology name is required when enabled") - } - - if len(config.Levels) == 0 { - return errors.New("ClusterTopology levels are required when enabled") - } - - topology := &grovecorev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: config.Name, - }, - Spec: grovecorev1alpha1.ClusterTopologySpec{ - Levels: convertTopologyLevels(config.Levels), - }, - } - - err := client.Create(ctx, topology) - if err != nil { - if apierrors.IsAlreadyExists(err) { - existing := &grovecorev1alpha1.ClusterTopology{} - if err := client.Get(ctx, types.NamespacedName{Name: config.Name}, existing); err != nil { - return fmt.Errorf("failed to get existing ClusterTopology: %w", err) - } - existing.Spec = topology.Spec - if err := client.Update(ctx, existing); err != nil { - return fmt.Errorf("failed to update ClusterTopology: %w", err) - } - logger.Info("cluster topology updated successfully", "name", config.Name) - return nil - } - return fmt.Errorf("failed to create ClusterTopology: %w", err) - } - - logger.Info("cluster topology created successfully", "name", config.Name) - return nil -} - -func convertTopologyLevels(levels []configv1alpha1.TopologyLevel) []grovecorev1alpha1.TopologyLevel { - result := make([]grovecorev1alpha1.TopologyLevel, len(levels)) - for i, level := range levels { - result[i] = grovecorev1alpha1.TopologyLevel{ - Domain: grovecorev1alpha1.TopologyDomain(level.Domain), - Key: level.Key, - } - } - return result -} diff --git a/operator/go.mod b/operator/go.mod index 79779c20e..e561277d4 100644 --- a/operator/go.mod +++ b/operator/go.mod @@ -1,8 +1,9 @@ module github.com/ai-dynamo/grove/operator -go 1.24.0 +go 1.24.4 require ( + github.com/NVIDIA/KAI-scheduler v0.12.0 github.com/ai-dynamo/grove/operator/api v0.0.0 github.com/ai-dynamo/grove/scheduler/api v0.0.0 github.com/docker/docker v28.3.3+incompatible @@ -53,7 +54,7 @@ require ( github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/dimchansky/utfbom v1.1.1 // indirect github.com/distribution/reference v0.6.0 // indirect - github.com/docker/cli v27.0.3+incompatible // indirect + github.com/docker/cli v27.1.1+incompatible // indirect github.com/docker/distribution v2.8.2+incompatible // indirect 
github.com/docker/docker-credential-helpers v0.8.2 // indirect github.com/docker/go v1.5.1-1.0.20160303222718-d30aec9fd63c // indirect @@ -82,7 +83,7 @@ require ( github.com/google/btree v1.1.3 // indirect github.com/google/gnostic-models v0.7.0 // indirect github.com/google/go-cmp v0.7.0 // indirect - github.com/google/go-containerregistry v0.19.1 // indirect + github.com/google/go-containerregistry v0.20.2 // indirect github.com/google/uuid v1.6.0 // indirect github.com/gorilla/mux v1.8.1 // indirect github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 // indirect @@ -129,16 +130,15 @@ require ( github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f // indirect - github.com/onsi/ginkgo/v2 v2.23.4 // indirect github.com/opencontainers/go-digest v1.0.0 // indirect github.com/opencontainers/image-spec v1.1.1 // indirect github.com/pelletier/go-toml/v2 v2.2.3 // indirect github.com/peterbourgon/diskv v2.0.1+incompatible // indirect github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect - github.com/prometheus/client_golang v1.23.0 // indirect + github.com/prometheus/client_golang v1.23.2 // indirect github.com/prometheus/client_model v0.6.2 // indirect - github.com/prometheus/common v0.65.0 // indirect + github.com/prometheus/common v0.66.1 // indirect github.com/prometheus/procfs v0.17.0 // indirect github.com/rancher/wharfie v0.6.2 // indirect github.com/rubenv/sql-migrate v1.8.0 // indirect @@ -161,7 +161,7 @@ require ( github.com/xeipuuv/gojsonschema v1.2.0 // indirect github.com/xlab/treeprint v1.2.0 // indirect go.opentelemetry.io/auto/sdk v1.1.0 // indirect - go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0 // indirect + go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.59.0 // indirect go.opentelemetry.io/otel v1.35.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.32.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.34.0 // indirect @@ -187,9 +187,9 @@ require ( golang.org/x/time v0.12.0 // indirect gomodules.xyz/jsonpatch/v2 v2.5.0 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20250303144028-a0af3efb3deb // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20250303144028-a0af3efb3deb // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20250313205543-e70fdf4c4cb4 // indirect google.golang.org/grpc v1.72.1 // indirect - google.golang.org/protobuf v1.36.7 // indirect + google.golang.org/protobuf v1.36.8 // indirect gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/ini.v1 v1.67.0 // indirect @@ -198,7 +198,7 @@ require ( k8s.io/apiserver v0.34.2 // indirect k8s.io/component-base v0.34.2 // indirect k8s.io/kube-openapi v0.0.0-20250814151709-d7b6acb124c3 // indirect - k8s.io/kubectl v0.34.0 // indirect + k8s.io/kubectl v0.34.1 // indirect oras.land/oras-go/v2 v2.6.0 // indirect sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect sigs.k8s.io/kustomize/api v0.20.1 // indirect diff --git a/operator/go.sum b/operator/go.sum index e4bc0d7f0..03c075fdf 100644 --- a/operator/go.sum +++ b/operator/go.sum @@ -24,6 +24,8 @@ github.com/Masterminds/squirrel v1.5.4 h1:uUcX/aBc8O7Fg9kaISIUsHXdKuqehiXAMQTYX8 github.com/Masterminds/squirrel v1.5.4/go.mod 
h1:NNaOrjSoIDfDA40n7sr2tPNZRfjzjA400rg+riTZj10= github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY= github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU= +github.com/NVIDIA/KAI-scheduler v0.12.0 h1:3rhB0TdbJYNgOiZ6E0cAa+VJnr4srhoigJI/PkY85vI= +github.com/NVIDIA/KAI-scheduler v0.12.0/go.mod h1:56asdZ2ewWn2ALmPM93zxxGWi+8MX3Q0aRlvv2ytfuA= github.com/Shopify/logrus-bugsnag v0.0.0-20170309145241-6dbc35f2c30d/go.mod h1:HI8ITrYtUY+O+ZhtlqUnD8+KwNPOyugEhfP9fdUIaEQ= github.com/Shopify/logrus-bugsnag v0.0.0-20171204204709-577dee27f20d h1:UrqY+r/OJnIp5u0s1SbQ8dVfLCZJsnvazdBP5hS4iRs= github.com/Shopify/logrus-bugsnag v0.0.0-20171204204709-577dee27f20d/go.mod h1:HI8ITrYtUY+O+ZhtlqUnD8+KwNPOyugEhfP9fdUIaEQ= @@ -97,8 +99,8 @@ github.com/distribution/reference v0.6.0 h1:0IXCQ5g4/QMHHkarYzh5l+u8T3t73zM5Qvfr github.com/distribution/reference v0.6.0/go.mod h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E= github.com/dlclark/regexp2 v1.11.0 h1:G/nrcoOa7ZXlpoa/91N3X7mM3r8eIlMBBJZvsz/mxKI= github.com/dlclark/regexp2 v1.11.0/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8= -github.com/docker/cli v27.0.3+incompatible h1:usGs0/BoBW8MWxGeEtqPMkzOY56jZ6kYlSN5BLDioCQ= -github.com/docker/cli v27.0.3+incompatible/go.mod h1:JLrzqnKDaYBop7H2jaqPtU4hHvMKP+vjCwu2uszcLI8= +github.com/docker/cli v27.1.1+incompatible h1:goaZxOqs4QKxznZjjBWKONQci/MywhtRv2oNn0GkeZE= +github.com/docker/cli v27.1.1+incompatible/go.mod h1:JLrzqnKDaYBop7H2jaqPtU4hHvMKP+vjCwu2uszcLI8= github.com/docker/distribution v2.7.1+incompatible/go.mod h1:J2gT2udsDAN96Uj4KfcMRqY0/ypR+oyYUYmja8H+y+w= github.com/docker/distribution v2.8.2+incompatible h1:T3de5rq0dB1j30rp0sA2rER+m322EBzniBPB6ZIzuh8= github.com/docker/distribution v2.8.2+incompatible/go.mod h1:J2gT2udsDAN96Uj4KfcMRqY0/ypR+oyYUYmja8H+y+w= @@ -197,13 +199,13 @@ github.com/google/gnostic-models v0.7.0/go.mod h1:whL5G0m6dmc5cPxKc5bdKdEN3UjI7O github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= -github.com/google/go-containerregistry v0.19.1 h1:yMQ62Al6/V0Z7CqIrrS1iYoA5/oQCm88DeNujc7C1KY= -github.com/google/go-containerregistry v0.19.1/go.mod h1:YCMFNQeeXeLF+dnhhWkqDItx/JSkH01j1Kis4PsjzFI= +github.com/google/go-containerregistry v0.20.2 h1:B1wPJ1SN/S7pB+ZAimcciVD+r+yV/l/DSArMxlbwseo= +github.com/google/go-containerregistry v0.20.2/go.mod h1:z38EKdKh4h7IP2gSfUUqEvalZBqs6AoLeWfUy34nQC8= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= -github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 h1:BHT72Gu3keYf3ZEu2J0b1vyeLSOYI8bm5wbJM/8yDe8= -github.com/google/pprof v0.0.0-20250403155104-27863c87afa6/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA= +github.com/google/pprof v0.0.0-20250820193118-f64d9cf942d6 h1:EEHtgt9IwisQ2AZ4pIsMjahcegHh6rmhqxzIRQIyepY= +github.com/google/pprof v0.0.0-20250820193118-f64d9cf942d6/go.mod h1:I6V7YzU0XDpsHqbsyrghnFZLO1gwK6NPTNvmetQIk9U= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/gorilla/handlers v1.5.2 h1:cLTUSsNkgcwhgRqvCNmdbRWG0A3N4F+M2nWKdScwyEE= @@ -356,13 
+358,14 @@ github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f h1:y5//uYreIhSUg3J github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw= github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= +github.com/onsi/ginkgo v1.12.0 h1:Iw5WCbBcaAAd0fpRb1c9r5YCylv4XDoCSigm1zLevwU= github.com/onsi/ginkgo v1.12.0/go.mod h1:oUhWkIvk5aDxtKvDDuw8gItl8pKl42LzjC9KZE0HfGg= -github.com/onsi/ginkgo/v2 v2.23.4 h1:ktYTpKJAVZnDT4VjxSbiBenUjmlL/5QkBEocaWXiQus= -github.com/onsi/ginkgo/v2 v2.23.4/go.mod h1:Bt66ApGPBFzHyR+JO10Zbt0Gsp4uWxu5mIOTusL46e8= +github.com/onsi/ginkgo/v2 v2.25.3 h1:Ty8+Yi/ayDAGtk4XxmmfUy4GabvM+MegeB4cDLRi6nw= +github.com/onsi/ginkgo/v2 v2.25.3/go.mod h1:43uiyQC4Ed2tkOzLsEYm7hnrb7UJTWHYNsuy3bG/snE= github.com/onsi/gomega v1.7.1/go.mod h1:XdKZgCCFLUoM/7CFJVPcG8C1xQ1AJ0vpAezJrB7JYyY= github.com/onsi/gomega v1.9.0/go.mod h1:Ho0h+IUsWyvy1OpqCwxlQ/21gkhVunqlU8fDGcoTdcA= -github.com/onsi/gomega v1.38.0 h1:c/WX+w8SLAinvuKKQFh77WEucCnPk4j2OTUr7lt7BeY= -github.com/onsi/gomega v1.38.0/go.mod h1:OcXcwId0b9QsE7Y49u+BTrL4IdKOBOKnD6VQNTJEB6o= +github.com/onsi/gomega v1.38.2 h1:eZCjf2xjZAqe+LeWvKb5weQ+NcPwX84kqJ0cZNxok2A= +github.com/onsi/gomega v1.38.2/go.mod h1:W2MJcYxRGV63b418Ai34Ud0hEdTVXq9NW9+Sx6uXf3k= github.com/open-policy-agent/cert-controller v0.14.0 h1:TPc19BOHOs4tARruTT5o4bzir7Ed6FF+j3EXP/nmZBs= github.com/open-policy-agent/cert-controller v0.14.0/go.mod h1:UhE/FU54DnKo+Rt0Yf3r+oKjgy6kqSH8Vsjo+5bGrSo= github.com/open-policy-agent/frameworks/constraint v0.0.0-20241101234656-e78c8abd754a h1:gQtOJ50XFyL2Xh3lDD9zP4KQ2PY4mZKQ9hDcWc81Sp8= @@ -394,8 +397,8 @@ github.com/prometheus/client_golang v0.9.0-pre1.0.20180209125602-c332b6f63c06/go github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw= github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo= github.com/prometheus/client_golang v1.1.0/go.mod h1:I1FGZT9+L76gKKOs5djB6ezCbFQP1xR9D75/vuwEF3g= -github.com/prometheus/client_golang v1.23.0 h1:ust4zpdl9r4trLY/gSjlm07PuiBq2ynaXXlptpfy8Uc= -github.com/prometheus/client_golang v1.23.0/go.mod h1:i/o0R9ByOnHX0McrTMTyhYvKE4haaf2mW08I+jGAjEE= +github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= +github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= github.com/prometheus/client_model v0.0.0-20171117100541-99fa1f4be8e5/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= @@ -404,8 +407,8 @@ github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvM github.com/prometheus/common v0.0.0-20180110214958-89604d197083/go.mod h1:daVV7qP5qjZbuso7PdcryaAu0sAZbrN9i7WWcTMWvro= github.com/prometheus/common v0.4.1/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4= github.com/prometheus/common v0.6.0/go.mod h1:eBmuwkDJBwy6iBfxCBob6t6dR6ENT/y+J+Zk0j9GMYc= -github.com/prometheus/common v0.65.0 h1:QDwzd+G1twt//Kwj/Ww6E9FQq1iVMmODnILtW1t2VzE= -github.com/prometheus/common v0.65.0/go.mod h1:0gZns+BLRQ3V6NdaerOhMbwwRbNh9hkGINtQAsP5GS8= +github.com/prometheus/common v0.66.1 
h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs= +github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA= github.com/prometheus/procfs v0.0.0-20180125133057-cb4147076ac7/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA= @@ -508,8 +511,8 @@ go.opentelemetry.io/contrib/bridges/prometheus v0.57.0 h1:UW0+QyeyBVhn+COBec3nGh go.opentelemetry.io/contrib/bridges/prometheus v0.57.0/go.mod h1:ppciCHRLsyCio54qbzQv0E4Jyth/fLWDTJYfvWpcSVk= go.opentelemetry.io/contrib/exporters/autoexport v0.57.0 h1:jmTVJ86dP60C01K3slFQa2NQ/Aoi7zA+wy7vMOKD9H4= go.opentelemetry.io/contrib/exporters/autoexport v0.57.0/go.mod h1:EJBheUMttD/lABFyLXhce47Wr6DPWYReCzaZiXadH7g= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0 h1:yd02MEjBdJkG3uabWP9apV+OuWRIXGDuJEUJbOHmCFU= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0/go.mod h1:umTcuxiv1n/s/S6/c2AT/g2CQ7u5C59sHDNmfSwgz7Q= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.59.0 h1:CV7UdSGJt/Ao6Gp4CXckLxVRRsRgDHoI8XjbL3PDl8s= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.59.0/go.mod h1:FRmFuRJfag1IZ2dPkHnEoSFVgTVPUd2qf5Vi69hLb8I= go.opentelemetry.io/otel v1.35.0 h1:xKWKPxrxB6OtMCbmMY021CqC45J+3Onta9MqjhnusiQ= go.opentelemetry.io/otel v1.35.0/go.mod h1:UEqy8Zp11hpkUrL73gSlELM0DupHoiq72dR+Zqel/+Y= go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploggrpc v0.8.0 h1:WzNab7hOOLzdDF/EoWCt4glhrbMPVMOO5JYTmpz36Ls= @@ -637,13 +640,13 @@ gomodules.xyz/jsonpatch/v2 v2.5.0 h1:JELs8RLM12qJGXU4u/TO3V25KW8GreMKl9pdkk14RM0 gomodules.xyz/jsonpatch/v2 v2.5.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= google.golang.org/genproto/googleapis/api v0.0.0-20250303144028-a0af3efb3deb h1:p31xT4yrYrSM/G4Sn2+TNUkVhFCbG9y8itM2S6Th950= google.golang.org/genproto/googleapis/api v0.0.0-20250303144028-a0af3efb3deb/go.mod h1:jbe3Bkdp+Dh2IrslsFCklNhweNTBgSYanP1UXhJDhKg= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250303144028-a0af3efb3deb h1:TLPQVbx1GJ8VKZxz52VAxl1EBgKXXbTiU9Fc5fZeLn4= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250303144028-a0af3efb3deb/go.mod h1:LuRYeWDFV6WOn90g357N17oMCaxpgCnbi/44qJvDn2I= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250313205543-e70fdf4c4cb4 h1:iK2jbkWL86DXjEx0qiHcRE9dE4/Ahua5k6V8OWFb//c= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250313205543-e70fdf4c4cb4/go.mod h1:LuRYeWDFV6WOn90g357N17oMCaxpgCnbi/44qJvDn2I= google.golang.org/grpc v1.0.5/go.mod h1:yo6s7OP7yaDglbqo1J04qKzAhqBH6lvTonzMVmEdcZw= google.golang.org/grpc v1.72.1 h1:HR03wO6eyZ7lknl75XlxABNVLLFc2PAb6mHlYh756mA= google.golang.org/grpc v1.72.1/go.mod h1:wH5Aktxcg25y1I3w7H69nHfXdOG3UiadoBtjh3izSDM= -google.golang.org/protobuf v1.36.7 h1:IgrO7UwFQGJdRNXH/sQux4R1Dj1WAKcLElzeeRaXV2A= -google.golang.org/protobuf v1.36.7/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= +google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc= +google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU= gopkg.in/airbrake/gobrake.v2 v2.0.9/go.mod h1:/h5ZAUhDkGaJfjzjKLSjv6zCL6O0LLBxU4K+aSYdM/U= gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= gopkg.in/cenkalti/backoff.v2 v2.2.1 h1:eJ9UAg01/HIHG987TwxvnzK2MgxXq97YY6rYDpY9aII= @@ -694,12 +697,12 @@ k8s.io/component-base 
v0.34.2 h1:HQRqK9x2sSAsd8+R4xxRirlTjowsg6fWCPwWYeSvogQ= k8s.io/component-base v0.34.2/go.mod h1:9xw2FHJavUHBFpiGkZoKuYZ5pdtLKe97DEByaA+hHbM= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= -k8s.io/kube-aggregator v0.33.4 h1:TdIJKHb0/bLpby7FblXIaVEzyA1jGEjzt/n9cRvwq8U= -k8s.io/kube-aggregator v0.33.4/go.mod h1:wZuctdRvGde5bwzxkZRs0GYj2KOpCNgx8rRGVoNb62k= +k8s.io/kube-aggregator v0.34.1 h1:WNLV0dVNoFKmuyvdWLd92iDSyD/TSTjqwaPj0U9XAEU= +k8s.io/kube-aggregator v0.34.1/go.mod h1:RU8j+5ERfp0h+gIvWtxRPfsa5nK7rboDm8RST8BJfYQ= k8s.io/kube-openapi v0.0.0-20250814151709-d7b6acb124c3 h1:liMHz39T5dJO1aOKHLvwaCjDbf07wVh6yaUlTpunnkE= k8s.io/kube-openapi v0.0.0-20250814151709-d7b6acb124c3/go.mod h1:UZ2yyWbFTpuhSbFhv24aGNOdoRdJZgsIObGBUaYVsts= -k8s.io/kubectl v0.34.0 h1:NcXz4TPTaUwhiX4LU+6r6udrlm0NsVnSkP3R9t0dmxs= -k8s.io/kubectl v0.34.0/go.mod h1:bmd0W5i+HuG7/p5sqicr0Li0rR2iIhXL0oUyLF3OjR4= +k8s.io/kubectl v0.34.1 h1:1qP1oqT5Xc93K+H8J7ecpBjaz511gan89KO9Vbsh/OI= +k8s.io/kubectl v0.34.1/go.mod h1:JRYlhJpGPyk3dEmJ+BuBiOB9/dAvnrALJEiY/C5qa6A= k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 h1:SjGebBtkBqHFOli+05xYbK8YF1Dzkbzn+gDM4X9T4Ck= k8s.io/utils v0.0.0-20251002143259-bc988d571ff4/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= oras.land/oras-go/v2 v2.6.0 h1:X4ELRsiGkrbeox69+9tzTu492FMUu7zJQW6eJU+I2oc= diff --git a/operator/internal/client/scheme.go b/operator/internal/client/scheme.go index ca4bd021d..8b43467d8 100644 --- a/operator/internal/client/scheme.go +++ b/operator/internal/client/scheme.go @@ -20,6 +20,7 @@ import ( configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1" grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" + kaitopologyv1alpha1 "github.com/NVIDIA/KAI-scheduler/pkg/apis/kai/v1alpha1" schedv1alpha1 "github.com/ai-dynamo/grove/scheduler/api/core/v1alpha1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" @@ -35,6 +36,7 @@ func init() { configv1alpha1.AddToScheme, grovecorev1alpha1.AddToScheme, schedv1alpha1.AddToScheme, + kaitopologyv1alpha1.AddToScheme, k8sscheme.AddToScheme, ) utilruntime.Must(metav1.AddMetaToScheme(Scheme)) diff --git a/operator/internal/topology/manager.go b/operator/internal/topology/manager.go new file mode 100644 index 000000000..ca02bf30b --- /dev/null +++ b/operator/internal/topology/manager.go @@ -0,0 +1,142 @@ +// /* +// Copyright 2025 The Grove Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// */ + +package topology + +import ( + "context" + "fmt" + "sort" + + kaitopologyv1alpha1 "github.com/NVIDIA/KAI-scheduler/pkg/apis/kai/v1alpha1" + configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1" + corev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" + + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" +) + +var logger = ctrl.Log.WithName("topology-manager") + +// EnsureTopology creates or updates the ClusterTopology CR from configuration +func EnsureTopology(ctx context.Context, client client.Client, name string, levels []configv1alpha1.TopologyLevel) error { + topology := &corev1alpha1.ClusterTopology{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + }, + Spec: corev1alpha1.ClusterTopologySpec{ + Levels: convertTopologyLevels(levels), + }, + } + + err := upsertClusterTopology(ctx, client, topology, name) + if err != nil { + return err + } + return ensureKAITopology(ctx, client, topology) +} + +func upsertClusterTopology(ctx context.Context, client client.Client, topology *corev1alpha1.ClusterTopology, name string) error { + err := client.Create(ctx, topology) + if err != nil { + if !apierrors.IsAlreadyExists(err) { + return fmt.Errorf("failed to create ClusterTopology: %w", err) + } + existing := &corev1alpha1.ClusterTopology{} + if err := client.Get(ctx, types.NamespacedName{Name: name}, existing); err != nil { + return fmt.Errorf("failed to get existing ClusterTopology: %w", err) + } + existing.Spec = topology.Spec + if err := client.Update(ctx, existing); err != nil { + return fmt.Errorf("failed to update ClusterTopology: %w", err) + } + logger.Info("cluster topology updated successfully", "name", name) + } else { + logger.Info("cluster topology created successfully", "name", name) + } + return nil +} + +// convertTopologyLevels converts configuration topology levels to core API topology levels +func convertTopologyLevels(levels []configv1alpha1.TopologyLevel) []corev1alpha1.TopologyLevel { + result := make([]corev1alpha1.TopologyLevel, len(levels)) + for i, level := range levels { + result[i] = corev1alpha1.TopologyLevel{ + Domain: corev1alpha1.TopologyDomain(level.Domain), + Key: level.Key, + } + } + return result +} + +// convertClusterTopologyToKai converts ClusterTopology to KAI Topology with sorted levels +func convertClusterTopologyToKai(clusterTopology *corev1alpha1.ClusterTopology) *kaitopologyv1alpha1.Topology { + sortedLevels := make([]corev1alpha1.TopologyLevel, len(clusterTopology.Spec.Levels)) + copy(sortedLevels, clusterTopology.Spec.Levels) + + sort.Slice(sortedLevels, func(i, j int) bool { + return sortedLevels[i].Domain.Compare(sortedLevels[j].Domain) < 0 + }) + + kaiLevels := make([]kaitopologyv1alpha1.TopologyLevel, len(sortedLevels)) + for i, level := range sortedLevels { + kaiLevels[i] = kaitopologyv1alpha1.TopologyLevel{ + NodeLabel: level.Key, + } + } + + return &kaitopologyv1alpha1.Topology{ + ObjectMeta: metav1.ObjectMeta{ + Name: clusterTopology.Name, + }, + Spec: kaitopologyv1alpha1.TopologySpec{ + Levels: kaiLevels, + }, + } +} + +// ensureKAITopology creates or updates the KAI Topology CR from ClusterTopology +func ensureKAITopology(ctx context.Context, client client.Client, clusterTopology *corev1alpha1.ClusterTopology) error { + kaiTopology := 
convertClusterTopologyToKai(clusterTopology) + + if err := controllerutil.SetControllerReference(clusterTopology, kaiTopology, client.Scheme()); err != nil { + return fmt.Errorf("failed to set owner reference for KAI Topology: %w", err) + } + + err := client.Create(ctx, kaiTopology) + if err != nil { + if apierrors.IsAlreadyExists(err) { + existing := &kaitopologyv1alpha1.Topology{} + if err := client.Get(ctx, types.NamespacedName{Name: kaiTopology.Name}, existing); err != nil { + return fmt.Errorf("failed to get existing KAI Topology: %w", err) + } + existing.Spec = kaiTopology.Spec + if err := client.Update(ctx, existing); err != nil { + return fmt.Errorf("failed to update KAI Topology: %w", err) + } + logger.Info("KAI topology updated successfully", "name", kaiTopology.Name) + return nil + } + return fmt.Errorf("failed to create KAI Topology: %w", err) + } + + logger.Info("KAI topology created successfully", "name", kaiTopology.Name) + return nil +} diff --git a/operator/internal/topology/manager_test.go b/operator/internal/topology/manager_test.go new file mode 100644 index 000000000..bb926b404 --- /dev/null +++ b/operator/internal/topology/manager_test.go @@ -0,0 +1,129 @@ +// /* +// Copyright 2025 The Grove Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// */ + +package topology + +import ( + "context" + "testing" + + kaitopologyv1alpha1 "github.com/NVIDIA/KAI-scheduler/pkg/apis/kai/v1alpha1" + configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1" + corev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +func TestConvertTopologyLevels(t *testing.T) { + configLevels := []configv1alpha1.TopologyLevel{ + {Domain: "region", Key: "topology.kubernetes.io/region"}, + {Domain: "zone", Key: "topology.kubernetes.io/zone"}, + } + + coreLevels := convertTopologyLevels(configLevels) + + require.Len(t, coreLevels, 2) + assert.Equal(t, corev1alpha1.TopologyDomain("region"), coreLevels[0].Domain) + assert.Equal(t, "topology.kubernetes.io/region", coreLevels[0].Key) + assert.Equal(t, corev1alpha1.TopologyDomain("zone"), coreLevels[1].Domain) + assert.Equal(t, "topology.kubernetes.io/zone", coreLevels[1].Key) +} + +func TestConvertClusterTopologyToKai(t *testing.T) { + clusterTopology := &corev1alpha1.ClusterTopology{ + ObjectMeta: metav1.ObjectMeta{ + Name: "grove-topology", + }, + Spec: corev1alpha1.ClusterTopologySpec{ + Levels: []corev1alpha1.TopologyLevel{ + {Domain: "zone", Key: "topology.kubernetes.io/zone"}, + {Domain: "region", Key: "topology.kubernetes.io/region"}, + }, + }, + } + + kaiTopology := convertClusterTopologyToKai(clusterTopology) + + assert.Equal(t, "grove-topology", kaiTopology.Name) + require.Len(t, kaiTopology.Spec.Levels, 2) + assert.Equal(t, "topology.kubernetes.io/region", kaiTopology.Spec.Levels[0].NodeLabel) + assert.Equal(t, "topology.kubernetes.io/zone", kaiTopology.Spec.Levels[1].NodeLabel) +} + +func TestEnsureTopology_Create(t *testing.T) { + scheme := runtime.NewScheme() + _ = corev1alpha1.AddToScheme(scheme) + _ = kaitopologyv1alpha1.AddToScheme(scheme) + client := fake.NewClientBuilder().WithScheme(scheme).Build() + + levels := []configv1alpha1.TopologyLevel{ + {Domain: "zone", Key: "topology.kubernetes.io/zone"}, + } + + err := EnsureTopology(context.Background(), client, "test-topology", levels) + assert.NoError(t, err) + + ct := &corev1alpha1.ClusterTopology{} + err = client.Get(context.Background(), types.NamespacedName{Name: "test-topology"}, ct) + assert.NoError(t, err) + assert.Len(t, ct.Spec.Levels, 1) + + kaiTopology := &kaitopologyv1alpha1.Topology{} + err = client.Get(context.Background(), types.NamespacedName{Name: "test-topology"}, kaiTopology) + assert.NoError(t, err) + assert.Len(t, kaiTopology.Spec.Levels, 1) + assert.Equal(t, "topology.kubernetes.io/zone", kaiTopology.Spec.Levels[0].NodeLabel) +} + +func TestEnsureTopology_Update(t *testing.T) { + scheme := runtime.NewScheme() + _ = corev1alpha1.AddToScheme(scheme) + _ = kaitopologyv1alpha1.AddToScheme(scheme) + + existing := &corev1alpha1.ClusterTopology{ + ObjectMeta: metav1.ObjectMeta{Name: "test-topology"}, + Spec: corev1alpha1.ClusterTopologySpec{ + Levels: []corev1alpha1.TopologyLevel{ + {Domain: "zone", Key: "old-key"}, + }, + }, + } + + client := fake.NewClientBuilder().WithScheme(scheme).WithObjects(existing).Build() + + levels := []configv1alpha1.TopologyLevel{ + {Domain: "zone", Key: "new-key"}, + } + + err := EnsureTopology(context.Background(), client, "test-topology", levels) + assert.NoError(t, err) + + ct := &corev1alpha1.ClusterTopology{} + err = 
client.Get(context.Background(), types.NamespacedName{Name: "test-topology"}, ct)
+	assert.NoError(t, err)
+	assert.Equal(t, "new-key", ct.Spec.Levels[0].Key)
+
+	kaiTopology := &kaitopologyv1alpha1.Topology{}
+	err = client.Get(context.Background(), types.NamespacedName{Name: "test-topology"}, kaiTopology)
+	assert.NoError(t, err)
+	assert.Len(t, kaiTopology.Spec.Levels, 1)
+	assert.Equal(t, "new-key", kaiTopology.Spec.Levels[0].NodeLabel)
+}

From a225df1dabfe129a84c592aa3ec7b96ede0fecc8 Mon Sep 17 00:00:00 2001
From: Ron Kahn
Date: Wed, 24 Dec 2025 20:38:43 +0200
Subject: [PATCH 03/23] feat: add Kubernetes client creation for
 ClusterTopology management

Signed-off-by: Ron Kahn

---
 operator/api/config/v1alpha1/zz_generated.defaults.go |  1 -
 operator/cmd/main.go                                  | 11 ++++++++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/operator/api/config/v1alpha1/zz_generated.defaults.go b/operator/api/config/v1alpha1/zz_generated.defaults.go
index 3583b3ef3..27cb68efa 100644
--- a/operator/api/config/v1alpha1/zz_generated.defaults.go
+++ b/operator/api/config/v1alpha1/zz_generated.defaults.go
@@ -41,5 +41,4 @@ func SetObjectDefaults_OperatorConfiguration(in *OperatorConfiguration) {
 	SetDefaults_PodCliqueSetControllerConfiguration(&in.Controllers.PodCliqueSet)
 	SetDefaults_PodCliqueControllerConfiguration(&in.Controllers.PodClique)
 	SetDefaults_PodCliqueScalingGroupControllerConfiguration(&in.Controllers.PodCliqueScalingGroup)
-	SetDefaults_ClusterTopologyConfiguration(&in.ClusterTopology)
 }
diff --git a/operator/cmd/main.go b/operator/cmd/main.go
index 8822b6113..58028c8b2 100644
--- a/operator/cmd/main.go
+++ b/operator/cmd/main.go
@@ -28,6 +28,7 @@ import (
 	grovelogger "github.com/ai-dynamo/grove/operator/internal/logger"
 	"github.com/ai-dynamo/grove/operator/internal/topology"
 	groveversion "github.com/ai-dynamo/grove/operator/internal/version"
+	"sigs.k8s.io/controller-runtime/pkg/client"

 	"github.com/spf13/pflag"
 	ctrl "sigs.k8s.io/controller-runtime"
@@ -65,7 +66,15 @@ func main() {

 	ctx := ctrl.SetupSignalHandler()
 	if operatorCfg.ClusterTopology.Enabled {
-		if err = topology.EnsureTopology(ctx, mgr.GetClient(), corev1alpha1.ClusterTopologyName, operatorCfg.ClusterTopology.Levels); err != nil {
+		// create Kubernetes client for cluster topology
+		// the manager's default client is not usable before the manager starts (mgr.Start())
+		topologyK8sClient, err := client.New(mgr.GetConfig(), client.Options{Scheme: mgr.GetScheme()})
+		if err != nil {
+			logger.Error(err, "failed to create Kubernetes client")
+			os.Exit(1)
+		}
+		if err = topology.EnsureTopology(ctx, topologyK8sClient,
+			corev1alpha1.ClusterTopologyName, operatorCfg.ClusterTopology.Levels); err != nil {
 			logger.Error(err, "cannot create/update cluster topology, operator cannot start")
 			os.Exit(1)
 		}

From 87093fa53fbd9e64099b6eed02bb842dc9a2b93a Mon Sep 17 00:00:00 2001
From: Ron Kahn
Date: Wed, 24 Dec 2025 20:39:05 +0200
Subject: [PATCH 04/23] fix typo

Signed-off-by: Ron Kahn

---
 operator/cmd/main.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/operator/cmd/main.go b/operator/cmd/main.go
index 58028c8b2..aaa717d0c 100644
--- a/operator/cmd/main.go
+++ b/operator/cmd/main.go
@@ -66,7 +66,7 @@ func main() {

 	ctx := ctrl.SetupSignalHandler()
 	if operatorCfg.ClusterTopology.Enabled {
-		// create Kubernetes client for cluster topology
+		// create a Kubernetes client for cluster topology
 		// the manager's default client is not usable before the manager starts (mgr.Start())
 		topologyK8sClient, err := client.New(mgr.GetConfig(), client.Options{Scheme:
mgr.GetScheme()}) if err != nil { From 62ba62aaa412d76c67abba521fd8483376842a75 Mon Sep 17 00:00:00 2001 From: Ron Kahn Date: Wed, 24 Dec 2025 20:41:16 +0200 Subject: [PATCH 05/23] chore: downgrade Go version to 1.24.0 in go.mod Signed-off-by: Ron Kahn --- operator/go.mod | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operator/go.mod b/operator/go.mod index e561277d4..8fc3eac78 100644 --- a/operator/go.mod +++ b/operator/go.mod @@ -1,6 +1,6 @@ module github.com/ai-dynamo/grove/operator -go 1.24.4 +go 1.24.0 require ( github.com/NVIDIA/KAI-scheduler v0.12.0 From 0fffc60a928cf8043ecbb9368607d7d184e26c6c Mon Sep 17 00:00:00 2001 From: Ron Kahn Date: Wed, 24 Dec 2025 20:44:21 +0200 Subject: [PATCH 06/23] feat: update ClusterTopology resource documentation and validation patterns Signed-off-by: Ron Kahn --- docs/api-reference/operator-api.md | 43 ++++++++++++++++++- operator/api/core/v1alpha1/clustertopology.go | 1 + operator/cmd/main.go | 2 +- operator/internal/topology/manager.go | 2 +- operator/internal/topology/manager_test.go | 2 +- 5 files changed, 45 insertions(+), 5 deletions(-) diff --git a/docs/api-reference/operator-api.md b/docs/api-reference/operator-api.md index 4e93f333b..2ef1be6ac 100644 --- a/docs/api-reference/operator-api.md +++ b/docs/api-reference/operator-api.md @@ -643,7 +643,7 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | | `domain` _[TopologyDomain](#topologydomain)_ | Domain is the predefined level identifier used in TopologyConstraint references.
Must be one of: region, zone, datacenter, block, rack, host, numa | | Enum: [region zone datacenter block rack host numa]
Required: \{\}
| -| `key` _string_ | Key is the node label key that identifies this topology domain.
Must be a valid Kubernetes label key (qualified name).
Examples: "topology.kubernetes.io/zone", "kubernetes.io/hostname" | | MaxLength: 63
MinLength: 1
Required: \{\}
| +| `key` _string_ | Key is the node label key that identifies this topology domain.
Must be a valid Kubernetes label key (qualified name).
Examples: "topology.kubernetes.io/zone", "kubernetes.io/hostname" | | MaxLength: 63
MinLength: 1
Pattern: `^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]/)?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$`
Required: \{\}
| @@ -702,7 +702,7 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | | `enabled` _boolean_ | Enabled indicates whether topology-aware scheduling is enabled. | | | -| `name` _string_ | Name is the ClusterTopology resource name to use.
Defaults to "grove-topology" if not specified when topology is enabled. | | | +| `levels` _[TopologyLevel](#topologylevel) array_ | Levels is an ordered list of topology levels from broadest to narrowest scope.
Used to create/update the ClusterTopology CR at operator startup. | | | #### ControllerConfiguration @@ -882,6 +882,45 @@ _Appears in:_ | `metrics` _[Server](#server)_ | Metrics is the configuration for serving the metrics endpoint. | | | +#### TopologyDomain + +_Underlying type:_ _string_ + +TopologyDomain represents a predefined topology level in the hierarchy. + + + +_Appears in:_ +- [TopologyLevel](#topologylevel) + +| Field | Description | +| --- | --- | +| `region` | TopologyDomainRegion represents the region level in the topology hierarchy.
| +| `zone` | TopologyDomainZone represents the zone level in the topology hierarchy.
| +| `datacenter` | TopologyDomainDataCenter represents the datacenter level in the topology hierarchy.
| +| `block` | TopologyDomainBlock represents the block level in the topology hierarchy.
| +| `rack` | TopologyDomainRack represents the rack level in the topology hierarchy.
| +| `host` | TopologyDomainHost represents the host level in the topology hierarchy.
| +| `numa` | TopologyDomainNuma represents the numa level in the topology hierarchy.
| + + +#### TopologyLevel + + + +TopologyLevel defines a single level in the topology hierarchy. + + + +_Appears in:_ +- [ClusterTopologyConfiguration](#clustertopologyconfiguration) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `domain` _[TopologyDomain](#topologydomain)_ | Domain is the predefined level identifier used in TopologyConstraint references. | | | +| `key` _string_ | Key is the node label key that identifies this topology domain. | | | + + #### WebhookServer diff --git a/operator/api/core/v1alpha1/clustertopology.go b/operator/api/core/v1alpha1/clustertopology.go index 8fda48f1b..b06da417a 100644 --- a/operator/api/core/v1alpha1/clustertopology.go +++ b/operator/api/core/v1alpha1/clustertopology.go @@ -21,6 +21,7 @@ import ( ) const ( + // ClusterTopologyName is the name of the default ClusterTopology resource managed by the operator. ClusterTopologyName = "grove-topology" ) diff --git a/operator/cmd/main.go b/operator/cmd/main.go index aaa717d0c..7cb2857a7 100644 --- a/operator/cmd/main.go +++ b/operator/cmd/main.go @@ -28,10 +28,10 @@ import ( grovelogger "github.com/ai-dynamo/grove/operator/internal/logger" "github.com/ai-dynamo/grove/operator/internal/topology" groveversion "github.com/ai-dynamo/grove/operator/internal/version" - "sigs.k8s.io/controller-runtime/pkg/client" "github.com/spf13/pflag" ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" ) var ( diff --git a/operator/internal/topology/manager.go b/operator/internal/topology/manager.go index ca02bf30b..b74bee3ca 100644 --- a/operator/internal/topology/manager.go +++ b/operator/internal/topology/manager.go @@ -21,10 +21,10 @@ import ( "fmt" "sort" - kaitopologyv1alpha1 "github.com/NVIDIA/KAI-scheduler/pkg/apis/kai/v1alpha1" configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1" corev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" + kaitopologyv1alpha1 "github.com/NVIDIA/KAI-scheduler/pkg/apis/kai/v1alpha1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" diff --git a/operator/internal/topology/manager_test.go b/operator/internal/topology/manager_test.go index bb926b404..6a692f25b 100644 --- a/operator/internal/topology/manager_test.go +++ b/operator/internal/topology/manager_test.go @@ -20,10 +20,10 @@ import ( "context" "testing" - kaitopologyv1alpha1 "github.com/NVIDIA/KAI-scheduler/pkg/apis/kai/v1alpha1" configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1" corev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" + kaitopologyv1alpha1 "github.com/NVIDIA/KAI-scheduler/pkg/apis/kai/v1alpha1" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" From ff31dc3961f6a34071a25be5ebfc92263d250923 Mon Sep 17 00:00:00 2001 From: Ron Kahn Date: Wed, 24 Dec 2025 21:44:02 +0200 Subject: [PATCH 07/23] feat: enhance ClusterTopology configuration with levels and update role permissions Signed-off-by: Ron Kahn --- operator/charts/templates/_helpers.tpl | 8 ++++++-- operator/charts/templates/clusterrole.yaml | 21 +++++++++++++-------- operator/charts/values.yaml | 13 +++++++++++-- 3 files changed, 30 insertions(+), 12 deletions(-) diff --git a/operator/charts/templates/_helpers.tpl b/operator/charts/templates/_helpers.tpl index 60cd3a1fd..170a61e9f 100644 --- a/operator/charts/templates/_helpers.tpl +++ b/operator/charts/templates/_helpers.tpl @@ -34,8 +34,12 
@@ config.yaml: | {{- if .Values.config.clusterTopology }} clusterTopology: enabled: {{ .Values.config.clusterTopology.enabled }} - {{- if .Values.config.clusterTopology.name }} - name: {{ .Values.config.clusterTopology.name }} + {{- if .Values.config.clusterTopology.levels }} + levels: + {{- range .Values.config.clusterTopology.levels }} + - domain: {{ .domain }} + key: {{ .key }} + {{- end }} {{- end }} {{- end }} {{- if .Values.config.authorizer.enabled }} diff --git a/operator/charts/templates/clusterrole.yaml b/operator/charts/templates/clusterrole.yaml index 4cc2c83d7..3a454d926 100644 --- a/operator/charts/templates/clusterrole.yaml +++ b/operator/charts/templates/clusterrole.yaml @@ -26,6 +26,8 @@ rules: - podcliques - podcliques/status - podcliquescalinggroups + - clustertopologies + - clustertopologies/status - podcliquescalinggroups/status verbs: - create @@ -36,14 +38,6 @@ rules: - deletecollection - patch - update -- apiGroups: - - grove.io - resources: - - clustertopologies - verbs: - - get - - list - - watch - apiGroups: - "" resources: @@ -136,4 +130,15 @@ rules: - patch - update - watch +- apiGroups: + - kai.io + resources: + - topologies + verbs: + - create + - get + - list + - watch + - patch + - update diff --git a/operator/charts/values.yaml b/operator/charts/values.yaml index da6105cbb..8a34b0fcc 100644 --- a/operator/charts/values.yaml +++ b/operator/charts/values.yaml @@ -61,8 +61,17 @@ config: logLevel: info logFormat: json clusterTopology: - enabled: false - name: "grove-topology" + enabled: true + name: default + levels: + - domain: region + key: region + - domain: zone + key: zone + - domain: datacenter + key: datacenter + - domain: block + key: block authorizer: enabled: false exemptServiceAccountUserNames: From 1d2d50d5c22b3b9270c568512e2d30bee3bffa92 Mon Sep 17 00:00:00 2001 From: Ron Kahn Date: Wed, 24 Dec 2025 21:44:30 +0200 Subject: [PATCH 08/23] feat: improve validation error messages for ClusterTopology levels Signed-off-by: Ron Kahn --- operator/api/config/v1alpha1/defaults.go | 1 - operator/api/config/validation/validation.go | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/operator/api/config/v1alpha1/defaults.go b/operator/api/config/v1alpha1/defaults.go index f9451338d..e2752790d 100644 --- a/operator/api/config/v1alpha1/defaults.go +++ b/operator/api/config/v1alpha1/defaults.go @@ -27,7 +27,6 @@ const ( defaultLeaderElectionResourceLock = "leases" defaultLeaderElectionResourceName = "grove-operator-leader-election" defaultWebhookServerTLSServerCertDir = "/etc/grove-operator/webhook-certs" - defaultTopologyName = "grove-topology" ) // SetDefaults_ClientConnectionConfiguration sets defaults for the k8s client connection. 
diff --git a/operator/api/config/validation/validation.go b/operator/api/config/validation/validation.go index e6475ab2f..13d188200 100644 --- a/operator/api/config/validation/validation.go +++ b/operator/api/config/validation/validation.go @@ -131,7 +131,7 @@ func validateClusterTopologyConfiguration(clusterTopologyCfg configv1alpha1.Clus func validateClusterTopologyLevels(levels []configv1alpha1.TopologyLevel, fldPath *field.Path) field.ErrorList { allErrs := field.ErrorList{} if len(levels) == 0 { - allErrs = append(allErrs, field.Required(fldPath.Child("levels"), "levels are required when topology is enabled")) + allErrs = append(allErrs, field.Required(fldPath, "levels are required when topology is enabled")) } if len(levels) > configv1alpha1.MaxTopologyLevels { From d5aefdb081ba3e07c45320139ffe90a7850b36c7 Mon Sep 17 00:00:00 2001 From: Ron Kahn Date: Wed, 24 Dec 2025 22:53:37 +0200 Subject: [PATCH 09/23] feat: update API group for ClusterRole in clusterrole.yaml Signed-off-by: Ron Kahn --- operator/charts/templates/clusterrole.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operator/charts/templates/clusterrole.yaml b/operator/charts/templates/clusterrole.yaml index 3a454d926..fa0491943 100644 --- a/operator/charts/templates/clusterrole.yaml +++ b/operator/charts/templates/clusterrole.yaml @@ -131,7 +131,7 @@ rules: - update - watch - apiGroups: - - kai.io + - kai.scheduler resources: - topologies verbs: From 633e9a167ed03ec590fc08981759377d5fb41c44 Mon Sep 17 00:00:00 2001 From: Ron Kahn Date: Wed, 24 Dec 2025 23:06:11 +0200 Subject: [PATCH 10/23] feat: refactor ClusterTopology management and add unit tests Signed-off-by: Ron Kahn --- operator/internal/topology/manager_test.go | 129 ------- .../topology/{manager.go => topology.go} | 110 ++++-- operator/internal/topology/topology_test.go | 350 ++++++++++++++++++ 3 files changed, 422 insertions(+), 167 deletions(-) delete mode 100644 operator/internal/topology/manager_test.go rename operator/internal/topology/{manager.go => topology.go} (51%) create mode 100644 operator/internal/topology/topology_test.go diff --git a/operator/internal/topology/manager_test.go b/operator/internal/topology/manager_test.go deleted file mode 100644 index 6a692f25b..000000000 --- a/operator/internal/topology/manager_test.go +++ /dev/null @@ -1,129 +0,0 @@ -// /* -// Copyright 2025 The Grove Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// */ - -package topology - -import ( - "context" - "testing" - - configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1" - corev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" - - kaitopologyv1alpha1 "github.com/NVIDIA/KAI-scheduler/pkg/apis/kai/v1alpha1" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/types" - "sigs.k8s.io/controller-runtime/pkg/client/fake" -) - -func TestConvertTopologyLevels(t *testing.T) { - configLevels := []configv1alpha1.TopologyLevel{ - {Domain: "region", Key: "topology.kubernetes.io/region"}, - {Domain: "zone", Key: "topology.kubernetes.io/zone"}, - } - - coreLevels := convertTopologyLevels(configLevels) - - require.Len(t, coreLevels, 2) - assert.Equal(t, corev1alpha1.TopologyDomain("region"), coreLevels[0].Domain) - assert.Equal(t, "topology.kubernetes.io/region", coreLevels[0].Key) - assert.Equal(t, corev1alpha1.TopologyDomain("zone"), coreLevels[1].Domain) - assert.Equal(t, "topology.kubernetes.io/zone", coreLevels[1].Key) -} - -func TestConvertClusterTopologyToKai(t *testing.T) { - clusterTopology := &corev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "grove-topology", - }, - Spec: corev1alpha1.ClusterTopologySpec{ - Levels: []corev1alpha1.TopologyLevel{ - {Domain: "zone", Key: "topology.kubernetes.io/zone"}, - {Domain: "region", Key: "topology.kubernetes.io/region"}, - }, - }, - } - - kaiTopology := convertClusterTopologyToKai(clusterTopology) - - assert.Equal(t, "grove-topology", kaiTopology.Name) - require.Len(t, kaiTopology.Spec.Levels, 2) - assert.Equal(t, "topology.kubernetes.io/region", kaiTopology.Spec.Levels[0].NodeLabel) - assert.Equal(t, "topology.kubernetes.io/zone", kaiTopology.Spec.Levels[1].NodeLabel) -} - -func TestEnsureTopology_Create(t *testing.T) { - scheme := runtime.NewScheme() - _ = corev1alpha1.AddToScheme(scheme) - _ = kaitopologyv1alpha1.AddToScheme(scheme) - client := fake.NewClientBuilder().WithScheme(scheme).Build() - - levels := []configv1alpha1.TopologyLevel{ - {Domain: "zone", Key: "topology.kubernetes.io/zone"}, - } - - err := EnsureTopology(context.Background(), client, "test-topology", levels) - assert.NoError(t, err) - - ct := &corev1alpha1.ClusterTopology{} - err = client.Get(context.Background(), types.NamespacedName{Name: "test-topology"}, ct) - assert.NoError(t, err) - assert.Len(t, ct.Spec.Levels, 1) - - kaiTopology := &kaitopologyv1alpha1.Topology{} - err = client.Get(context.Background(), types.NamespacedName{Name: "test-topology"}, kaiTopology) - assert.NoError(t, err) - assert.Len(t, kaiTopology.Spec.Levels, 1) - assert.Equal(t, "topology.kubernetes.io/zone", kaiTopology.Spec.Levels[0].NodeLabel) -} - -func TestEnsureTopology_Update(t *testing.T) { - scheme := runtime.NewScheme() - _ = corev1alpha1.AddToScheme(scheme) - _ = kaitopologyv1alpha1.AddToScheme(scheme) - - existing := &corev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{Name: "test-topology"}, - Spec: corev1alpha1.ClusterTopologySpec{ - Levels: []corev1alpha1.TopologyLevel{ - {Domain: "zone", Key: "old-key"}, - }, - }, - } - - client := fake.NewClientBuilder().WithScheme(scheme).WithObjects(existing).Build() - - levels := []configv1alpha1.TopologyLevel{ - {Domain: "zone", Key: "new-key"}, - } - - err := EnsureTopology(context.Background(), client, "test-topology", levels) - assert.NoError(t, err) - - ct := &corev1alpha1.ClusterTopology{} - err = 
client.Get(context.Background(), types.NamespacedName{Name: "test-topology"}, ct)
-	assert.NoError(t, err)
-	assert.Equal(t, "new-key", ct.Spec.Levels[0].Key)
-
-	kaiTopology := &kaitopologyv1alpha1.Topology{}
-	err = client.Get(context.Background(), types.NamespacedName{Name: "test-topology"}, kaiTopology)
-	assert.NoError(t, err)
-	assert.Len(t, kaiTopology.Spec.Levels, 1)
-	assert.Equal(t, "new-key", kaiTopology.Spec.Levels[0].NodeLabel)
-}
diff --git a/operator/internal/topology/manager.go b/operator/internal/topology/topology.go
similarity index 51%
rename from operator/internal/topology/manager.go
rename to operator/internal/topology/topology.go
index b74bee3ca..29d482a72 100644
--- a/operator/internal/topology/manager.go
+++ b/operator/internal/topology/topology.go
@@ -19,12 +19,13 @@ package topology
 import (
 	"context"
 	"fmt"
+	"reflect"
 	"sort"
 
+	kaitopologyv1alpha1 "github.com/NVIDIA/KAI-scheduler/pkg/apis/kai/v1alpha1"
 	configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1"
 	corev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1"
-
-	kaitopologyv1alpha1 "github.com/NVIDIA/KAI-scheduler/pkg/apis/kai/v1alpha1"
+	"github.com/samber/lo"
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/types"
@@ -45,33 +46,47 @@
 			Levels: convertTopologyLevels(levels),
 		},
 	}
-
-	err := upsertClusterTopology(ctx, client, topology, name)
+	upsertedTopology, err := upsertClusterTopology(ctx, client, topology)
 	if err != nil {
 		return err
 	}
-	return ensureKAITopology(ctx, client, topology)
+	return ensureKAITopology(ctx, client, upsertedTopology)
 }
 
-func upsertClusterTopology(ctx context.Context, client client.Client, topology *corev1alpha1.ClusterTopology, name string) error {
-	err := client.Create(ctx, topology)
-	if err != nil {
-		if !apierrors.IsAlreadyExists(err) {
-			return fmt.Errorf("failed to create ClusterTopology: %w", err)
-		}
-		existing := &corev1alpha1.ClusterTopology{}
-		if err := client.Get(ctx, types.NamespacedName{Name: name}, existing); err != nil {
-			return fmt.Errorf("failed to get existing ClusterTopology: %w", err)
-		}
-		existing.Spec = topology.Spec
-		if err := client.Update(ctx, existing); err != nil {
-			return fmt.Errorf("failed to update ClusterTopology: %w", err)
+// upsertClusterTopology ensures that a ClusterTopology object exists by creating or updating it based on the desired state.
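+// It first fetches the existing object: if none is found, the desired object is created;
+// otherwise the spec is rewritten only when the levels actually differ, so repeated
+// operator startups do not issue no-op updates to the API server.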
+func upsertClusterTopology(ctx context.Context, k8sClient client.Client, desiredClusterTopology *corev1alpha1.ClusterTopology) (*corev1alpha1.ClusterTopology, error) {
+	existing := &corev1alpha1.ClusterTopology{}
+	if err := k8sClient.Get(ctx, types.NamespacedName{Name: desiredClusterTopology.Name}, existing); err != nil {
+		if apierrors.IsNotFound(err) {
+			if err := k8sClient.Create(ctx, desiredClusterTopology); err != nil {
+				return nil, fmt.Errorf("failed to create ClusterTopology: %w", err)
+			}
+			return desiredClusterTopology, nil
 		}
-		logger.Info("cluster topology updated successfully", "name", name)
-	} else {
-		logger.Info("cluster topology created successfully", "name", name)
+		return nil, fmt.Errorf("failed to get existing ClusterTopology: %w", err)
 	}
-	return nil
+	isChanged := isClusterTopologyChanged(existing, desiredClusterTopology)
+	if !isChanged {
+		return existing, nil
+	}
+	existing.Spec = desiredClusterTopology.Spec
+	err := k8sClient.Update(ctx, existing)
+	if err != nil {
+		return nil, fmt.Errorf("failed to update ClusterTopology: %w", err)
+	}
+	return existing, nil
+}
+
+// isClusterTopologyChanged reports whether the desired topology levels differ from the existing ones.
+func isClusterTopologyChanged(existing *corev1alpha1.ClusterTopology, desiredClusterTopology *corev1alpha1.ClusterTopology) bool {
+	existingLevels := lo.SliceToMap(existing.Spec.Levels, func(l corev1alpha1.TopologyLevel) (corev1alpha1.TopologyDomain, string) {
+		return l.Domain, l.Key
+	})
+	newLevels := lo.SliceToMap(desiredClusterTopology.Spec.Levels, func(l corev1alpha1.TopologyLevel) (corev1alpha1.TopologyDomain, string) {
+		return l.Domain, l.Key
+	})
+	isChanged := !reflect.DeepEqual(existingLevels, newLevels)
+	return isChanged
 }
 
 // convertTopologyLevels converts configuration topology levels to core API topology levels
@@ -112,31 +127,50 @@ func convertClusterTopologyToKai(clusterTopology *corev1alpha1.ClusterTopology)
 	}
 }
 
-// ensureKAITopology creates or updates the KAI Topology CR from ClusterTopology
-func ensureKAITopology(ctx context.Context, client client.Client, clusterTopology *corev1alpha1.ClusterTopology) error {
-	kaiTopology := convertClusterTopologyToKai(clusterTopology)
-
+// createKAITopology creates the KAI Topology, setting the owning ClusterTopology as its controller reference.
+func createKAITopology(ctx context.Context, client client.Client, clusterTopology *corev1alpha1.ClusterTopology, kaiTopology *kaitopologyv1alpha1.Topology) error {
 	if err := controllerutil.SetControllerReference(clusterTopology, kaiTopology, client.Scheme()); err != nil {
 		return fmt.Errorf("failed to set owner reference for KAI Topology: %w", err)
 	}
+	if err := client.Create(ctx, kaiTopology); err != nil {
+		return fmt.Errorf("failed to create KAI Topology: %w", err)
+	}
+	return nil
+}
+
+// ensureKAITopology creates the KAI Topology for the given ClusterTopology, recreating it when the immutable levels change.
+func ensureKAITopology(ctx context.Context, client client.Client, clusterTopology *corev1alpha1.ClusterTopology) error {
+	desiredKaiTopology := convertClusterTopologyToKai(clusterTopology)
+
+	existing := &kaitopologyv1alpha1.Topology{}
+	err := client.Get(ctx, types.NamespacedName{Name: desiredKaiTopology.Name}, existing)
 
-	err := client.Create(ctx, kaiTopology)
 	if err != nil {
-		if apierrors.IsAlreadyExists(err) {
-			existing := &kaitopologyv1alpha1.Topology{}
-			if err := client.Get(ctx, types.NamespacedName{Name: kaiTopology.Name}, existing); err != nil {
-				return fmt.Errorf("failed to get existing KAI Topology: %w", err)
-			}
-			existing.Spec = kaiTopology.Spec
-			if err := client.Update(ctx, existing); err != nil {
-				return fmt.Errorf("failed to update KAI Topology: %w", err)
+		if apierrors.IsNotFound(err) {
+			if err := createKAITopology(ctx, client,
clusterTopology, desiredKaiTopology); err != nil {
+				return err
 			}
-			logger.Info("KAI topology updated successfully", "name", kaiTopology.Name)
+			logger.Info("KAI topology created successfully", "name", desiredKaiTopology.Name)
 			return nil
 		}
-		return fmt.Errorf("failed to create KAI Topology: %w", err)
+		return fmt.Errorf("failed to get existing KAI Topology: %w", err)
+	}
+
+	if !metav1.IsControlledBy(existing, clusterTopology) {
+		return fmt.Errorf("KAI Topology %s is owned by a different controller", existing.Name)
+	}
+
+	if !reflect.DeepEqual(existing.Spec.Levels, desiredKaiTopology.Spec.Levels) {
+		// the levels have changed; since they are immutable, delete the existing object and recreate it
+		if err = client.Delete(ctx, existing); err != nil {
+			return fmt.Errorf("failed to delete KAI Topology for recreation: %w", err)
+		}
+
+		if err = createKAITopology(ctx, client, clusterTopology, desiredKaiTopology); err != nil {
+			return err
+		}
+		logger.Info("KAI topology recreated with new levels", "name", desiredKaiTopology.Name)
+		return nil
 	}
-	logger.Info("KAI topology created successfully", "name", kaiTopology.Name)
+	logger.Info("KAI topology unchanged", "name", desiredKaiTopology.Name)
 	return nil
 }
diff --git a/operator/internal/topology/topology_test.go b/operator/internal/topology/topology_test.go
new file mode 100644
index 000000000..2f6c55e4d
--- /dev/null
+++ b/operator/internal/topology/topology_test.go
@@ -0,0 +1,350 @@
+// /*
+// Copyright 2025 The Grove Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// */ + +package topology + +import ( + "context" + "testing" + + configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1" + corev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" + + kaitopologyv1alpha1 "github.com/NVIDIA/KAI-scheduler/pkg/apis/kai/v1alpha1" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +func TestConvertTopologyLevels(t *testing.T) { + configLevels := []configv1alpha1.TopologyLevel{ + {Domain: "region", Key: "topology.kubernetes.io/region"}, + {Domain: "zone", Key: "topology.kubernetes.io/zone"}, + } + + coreLevels := convertTopologyLevels(configLevels) + + require.Len(t, coreLevels, 2) + assert.Equal(t, corev1alpha1.TopologyDomain("region"), coreLevels[0].Domain) + assert.Equal(t, "topology.kubernetes.io/region", coreLevels[0].Key) + assert.Equal(t, corev1alpha1.TopologyDomain("zone"), coreLevels[1].Domain) + assert.Equal(t, "topology.kubernetes.io/zone", coreLevels[1].Key) +} + +func TestConvertClusterTopologyToKai(t *testing.T) { + clusterTopology := &corev1alpha1.ClusterTopology{ + ObjectMeta: metav1.ObjectMeta{ + Name: "grove-topology", + }, + Spec: corev1alpha1.ClusterTopologySpec{ + Levels: []corev1alpha1.TopologyLevel{ + {Domain: "zone", Key: "topology.kubernetes.io/zone"}, + {Domain: "region", Key: "topology.kubernetes.io/region"}, + }, + }, + } + + kaiTopology := convertClusterTopologyToKai(clusterTopology) + + assert.Equal(t, "grove-topology", kaiTopology.Name) + require.Len(t, kaiTopology.Spec.Levels, 2) + assert.Equal(t, "topology.kubernetes.io/region", kaiTopology.Spec.Levels[0].NodeLabel) + assert.Equal(t, "topology.kubernetes.io/zone", kaiTopology.Spec.Levels[1].NodeLabel) +} + +func TestEnsureTopology_Create(t *testing.T) { + scheme := runtime.NewScheme() + _ = corev1alpha1.AddToScheme(scheme) + _ = kaitopologyv1alpha1.AddToScheme(scheme) + client := fake.NewClientBuilder().WithScheme(scheme).Build() + + levels := []configv1alpha1.TopologyLevel{ + {Domain: "zone", Key: "topology.kubernetes.io/zone"}, + } + + err := EnsureTopology(context.Background(), client, "test-topology", levels) + assert.NoError(t, err) + + ct := &corev1alpha1.ClusterTopology{} + err = client.Get(context.Background(), types.NamespacedName{Name: "test-topology"}, ct) + assert.NoError(t, err) + assert.Len(t, ct.Spec.Levels, 1) + + kaiTopology := &kaitopologyv1alpha1.Topology{} + err = client.Get(context.Background(), types.NamespacedName{Name: "test-topology"}, kaiTopology) + assert.NoError(t, err) + assert.Len(t, kaiTopology.Spec.Levels, 1) + assert.Equal(t, "topology.kubernetes.io/zone", kaiTopology.Spec.Levels[0].NodeLabel) +} + +func TestEnsureTopology_Update(t *testing.T) { + scheme := runtime.NewScheme() + _ = corev1alpha1.AddToScheme(scheme) + _ = kaitopologyv1alpha1.AddToScheme(scheme) + + existing := &corev1alpha1.ClusterTopology{ + ObjectMeta: metav1.ObjectMeta{Name: "test-topology"}, + Spec: corev1alpha1.ClusterTopologySpec{ + Levels: []corev1alpha1.TopologyLevel{ + {Domain: "zone", Key: "old-key"}, + }, + }, + } + + client := fake.NewClientBuilder().WithScheme(scheme).WithObjects(existing).Build() + + levels := []configv1alpha1.TopologyLevel{ + {Domain: "zone", Key: "new-key"}, + } + + err := EnsureTopology(context.Background(), client, "test-topology", levels) + assert.NoError(t, err) + + ct := &corev1alpha1.ClusterTopology{} + err = 
client.Get(context.Background(), types.NamespacedName{Name: "test-topology"}, ct) + assert.NoError(t, err) + assert.Equal(t, "new-key", ct.Spec.Levels[0].Key) + + kaiTopology := &kaitopologyv1alpha1.Topology{} + err = client.Get(context.Background(), types.NamespacedName{Name: "test-topology"}, kaiTopology) + assert.NoError(t, err) + assert.Len(t, kaiTopology.Spec.Levels, 1) + assert.Equal(t, "new-key", kaiTopology.Spec.Levels[0].NodeLabel) +} + +func Test_ensureKAITopology_CreateNew(t *testing.T) { + scheme := runtime.NewScheme() + _ = corev1alpha1.AddToScheme(scheme) + _ = kaitopologyv1alpha1.AddToScheme(scheme) + client := fake.NewClientBuilder().WithScheme(scheme).Build() + + clusterTopology := &corev1alpha1.ClusterTopology{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-topology", + UID: types.UID("cluster-topology-uid"), + }, + Spec: corev1alpha1.ClusterTopologySpec{ + Levels: []corev1alpha1.TopologyLevel{ + {Domain: "region", Key: "topology.kubernetes.io/region"}, + }, + }, + } + + err := ensureKAITopology(context.Background(), client, clusterTopology) + assert.NoError(t, err) + + kaiTopology := &kaitopologyv1alpha1.Topology{} + err = client.Get(context.Background(), types.NamespacedName{Name: "test-topology"}, kaiTopology) + assert.NoError(t, err) + assert.Len(t, kaiTopology.Spec.Levels, 1) + assert.Equal(t, "topology.kubernetes.io/region", kaiTopology.Spec.Levels[0].NodeLabel) + + ownerRef := metav1.GetControllerOf(kaiTopology) + require.NotNil(t, ownerRef) + assert.Equal(t, "test-topology", ownerRef.Name) + assert.Equal(t, clusterTopology.UID, ownerRef.UID) +} + +func Test_ensureKAITopology_ExistsUnchanged(t *testing.T) { + scheme := runtime.NewScheme() + _ = corev1alpha1.AddToScheme(scheme) + _ = kaitopologyv1alpha1.AddToScheme(scheme) + + clusterTopology := &corev1alpha1.ClusterTopology{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-topology", + UID: types.UID("cluster-topology-uid"), + }, + Spec: corev1alpha1.ClusterTopologySpec{ + Levels: []corev1alpha1.TopologyLevel{ + {Domain: "region", Key: "topology.kubernetes.io/region"}, + }, + }, + } + + existingKAI := &kaitopologyv1alpha1.Topology{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-topology", + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "core.grove.io/v1alpha1", + Kind: "ClusterTopology", + Name: "test-topology", + UID: clusterTopology.UID, + Controller: ptr(true), + BlockOwnerDeletion: ptr(true), + }, + }, + }, + Spec: kaitopologyv1alpha1.TopologySpec{ + Levels: []kaitopologyv1alpha1.TopologyLevel{ + {NodeLabel: "topology.kubernetes.io/region"}, + }, + }, + } + + client := fake.NewClientBuilder().WithScheme(scheme).WithObjects(clusterTopology, existingKAI).Build() + + err := ensureKAITopology(context.Background(), client, clusterTopology) + assert.NoError(t, err) + + kaiTopology := &kaitopologyv1alpha1.Topology{} + err = client.Get(context.Background(), types.NamespacedName{Name: "test-topology"}, kaiTopology) + assert.NoError(t, err) + assert.Len(t, kaiTopology.Spec.Levels, 1) + assert.Equal(t, "topology.kubernetes.io/region", kaiTopology.Spec.Levels[0].NodeLabel) +} + +func Test_ensureKAITopology_WrongOwner(t *testing.T) { + scheme := runtime.NewScheme() + _ = corev1alpha1.AddToScheme(scheme) + _ = kaitopologyv1alpha1.AddToScheme(scheme) + + clusterTopology := &corev1alpha1.ClusterTopology{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-topology", + UID: types.UID("cluster-topology-uid"), + }, + Spec: corev1alpha1.ClusterTopologySpec{ + Levels: []corev1alpha1.TopologyLevel{ + {Domain: "region", Key: 
"topology.kubernetes.io/region"}, + }, + }, + } + + existingKAI := &kaitopologyv1alpha1.Topology{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-topology", + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "core.grove.io/v1alpha1", + Kind: "ClusterTopology", + Name: "different-owner", + UID: types.UID("different-uid"), + Controller: ptr(true), + BlockOwnerDeletion: ptr(true), + }, + }, + }, + Spec: kaitopologyv1alpha1.TopologySpec{ + Levels: []kaitopologyv1alpha1.TopologyLevel{ + {NodeLabel: "topology.kubernetes.io/region"}, + }, + }, + } + + client := fake.NewClientBuilder().WithScheme(scheme).WithObjects(clusterTopology, existingKAI).Build() + + err := ensureKAITopology(context.Background(), client, clusterTopology) + assert.Error(t, err) + assert.Contains(t, err.Error(), "owned by a different controller") +} + +func Test_ensureKAITopology_RecreateOnLevelsChange(t *testing.T) { + scheme := runtime.NewScheme() + _ = corev1alpha1.AddToScheme(scheme) + _ = kaitopologyv1alpha1.AddToScheme(scheme) + + clusterTopology := &corev1alpha1.ClusterTopology{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-topology", + UID: types.UID("cluster-topology-uid"), + }, + Spec: corev1alpha1.ClusterTopologySpec{ + Levels: []corev1alpha1.TopologyLevel{ + {Domain: "region", Key: "topology.kubernetes.io/region"}, + {Domain: "zone", Key: "topology.kubernetes.io/zone"}, + }, + }, + } + + existingKAI := &kaitopologyv1alpha1.Topology{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-topology", + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "core.grove.io/v1alpha1", + Kind: "ClusterTopology", + Name: "test-topology", + UID: clusterTopology.UID, + Controller: ptr(true), + BlockOwnerDeletion: ptr(true), + }, + }, + }, + Spec: kaitopologyv1alpha1.TopologySpec{ + Levels: []kaitopologyv1alpha1.TopologyLevel{ + {NodeLabel: "old-label"}, + }, + }, + } + + client := fake.NewClientBuilder().WithScheme(scheme).WithObjects(clusterTopology, existingKAI).Build() + + err := ensureKAITopology(context.Background(), client, clusterTopology) + assert.NoError(t, err) + + kaiTopology := &kaitopologyv1alpha1.Topology{} + err = client.Get(context.Background(), types.NamespacedName{Name: "test-topology"}, kaiTopology) + assert.NoError(t, err) + assert.Len(t, kaiTopology.Spec.Levels, 2) + assert.Equal(t, "topology.kubernetes.io/region", kaiTopology.Spec.Levels[0].NodeLabel) + assert.Equal(t, "topology.kubernetes.io/zone", kaiTopology.Spec.Levels[1].NodeLabel) +} + +func Test_createKAITopology_Success(t *testing.T) { + scheme := runtime.NewScheme() + _ = corev1alpha1.AddToScheme(scheme) + _ = kaitopologyv1alpha1.AddToScheme(scheme) + client := fake.NewClientBuilder().WithScheme(scheme).Build() + + clusterTopology := &corev1alpha1.ClusterTopology{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-topology", + UID: types.UID("cluster-topology-uid"), + }, + } + + kaiTopology := &kaitopologyv1alpha1.Topology{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-kai-topology", + }, + Spec: kaitopologyv1alpha1.TopologySpec{ + Levels: []kaitopologyv1alpha1.TopologyLevel{ + {NodeLabel: "topology.kubernetes.io/region"}, + }, + }, + } + + err := createKAITopology(context.Background(), client, clusterTopology, kaiTopology) + assert.NoError(t, err) + + created := &kaitopologyv1alpha1.Topology{} + err = client.Get(context.Background(), types.NamespacedName{Name: "test-kai-topology"}, created) + assert.NoError(t, err) + + ownerRef := metav1.GetControllerOf(created) + require.NotNil(t, ownerRef) + assert.Equal(t, "test-topology", 
ownerRef.Name) + assert.Equal(t, clusterTopology.UID, ownerRef.UID) +} + +func ptr(b bool) *bool { + return &b +} From 17d8d8fccd6961cdbcf459b623d2996992e90f4c Mon Sep 17 00:00:00 2001 From: Ron Kahn Date: Wed, 24 Dec 2025 23:25:23 +0200 Subject: [PATCH 11/23] feat: enhance EnsureTopology tests with comprehensive scenarios and validation Signed-off-by: Ron Kahn --- operator/internal/topology/topology_test.go | 589 +++++++++++--------- 1 file changed, 335 insertions(+), 254 deletions(-) diff --git a/operator/internal/topology/topology_test.go b/operator/internal/topology/topology_test.go index 2f6c55e4d..3d4ff1c58 100644 --- a/operator/internal/topology/topology_test.go +++ b/operator/internal/topology/topology_test.go @@ -22,6 +22,7 @@ import ( configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1" corev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" + "k8s.io/utils/ptr" kaitopologyv1alpha1 "github.com/NVIDIA/KAI-scheduler/pkg/apis/kai/v1alpha1" "github.com/stretchr/testify/assert" @@ -29,6 +30,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" ) @@ -68,283 +70,362 @@ func TestConvertClusterTopologyToKai(t *testing.T) { assert.Equal(t, "topology.kubernetes.io/zone", kaiTopology.Spec.Levels[1].NodeLabel) } -func TestEnsureTopology_Create(t *testing.T) { - scheme := runtime.NewScheme() - _ = corev1alpha1.AddToScheme(scheme) - _ = kaitopologyv1alpha1.AddToScheme(scheme) - client := fake.NewClientBuilder().WithScheme(scheme).Build() - - levels := []configv1alpha1.TopologyLevel{ - {Domain: "zone", Key: "topology.kubernetes.io/zone"}, - } - - err := EnsureTopology(context.Background(), client, "test-topology", levels) - assert.NoError(t, err) - - ct := &corev1alpha1.ClusterTopology{} - err = client.Get(context.Background(), types.NamespacedName{Name: "test-topology"}, ct) - assert.NoError(t, err) - assert.Len(t, ct.Spec.Levels, 1) - - kaiTopology := &kaitopologyv1alpha1.Topology{} - err = client.Get(context.Background(), types.NamespacedName{Name: "test-topology"}, kaiTopology) - assert.NoError(t, err) - assert.Len(t, kaiTopology.Spec.Levels, 1) - assert.Equal(t, "topology.kubernetes.io/zone", kaiTopology.Spec.Levels[0].NodeLabel) -} - -func TestEnsureTopology_Update(t *testing.T) { - scheme := runtime.NewScheme() - _ = corev1alpha1.AddToScheme(scheme) - _ = kaitopologyv1alpha1.AddToScheme(scheme) - - existing := &corev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{Name: "test-topology"}, - Spec: corev1alpha1.ClusterTopologySpec{ - Levels: []corev1alpha1.TopologyLevel{ - {Domain: "zone", Key: "old-key"}, +func TestEnsureTopology(t *testing.T) { + tests := []struct { + name string + setup func(builder *fake.ClientBuilder) + levels []configv1alpha1.TopologyLevel + wantErr bool + errContains string + validate func(*testing.T, client.Client) + }{ + { + name: "create_new", + setup: nil, + levels: []configv1alpha1.TopologyLevel{ + {Domain: "zone", Key: "topology.kubernetes.io/zone"}, }, - }, - } - - client := fake.NewClientBuilder().WithScheme(scheme).WithObjects(existing).Build() - - levels := []configv1alpha1.TopologyLevel{ - {Domain: "zone", Key: "new-key"}, - } - - err := EnsureTopology(context.Background(), client, "test-topology", levels) - assert.NoError(t, err) - - ct := &corev1alpha1.ClusterTopology{} - err = client.Get(context.Background(), types.NamespacedName{Name: 
"test-topology"}, ct) - assert.NoError(t, err) - assert.Equal(t, "new-key", ct.Spec.Levels[0].Key) - - kaiTopology := &kaitopologyv1alpha1.Topology{} - err = client.Get(context.Background(), types.NamespacedName{Name: "test-topology"}, kaiTopology) - assert.NoError(t, err) - assert.Len(t, kaiTopology.Spec.Levels, 1) - assert.Equal(t, "new-key", kaiTopology.Spec.Levels[0].NodeLabel) -} - -func Test_ensureKAITopology_CreateNew(t *testing.T) { - scheme := runtime.NewScheme() - _ = corev1alpha1.AddToScheme(scheme) - _ = kaitopologyv1alpha1.AddToScheme(scheme) - client := fake.NewClientBuilder().WithScheme(scheme).Build() - - clusterTopology := &corev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - UID: types.UID("cluster-topology-uid"), - }, - Spec: corev1alpha1.ClusterTopologySpec{ - Levels: []corev1alpha1.TopologyLevel{ - {Domain: "region", Key: "topology.kubernetes.io/region"}, + wantErr: false, + validate: func(t *testing.T, c client.Client) { + ct := &corev1alpha1.ClusterTopology{} + err := c.Get(context.Background(), types.NamespacedName{Name: "test-topology"}, ct) + require.NoError(t, err) + assert.Len(t, ct.Spec.Levels, 1) + assert.Equal(t, corev1alpha1.TopologyDomain("zone"), ct.Spec.Levels[0].Domain) + + kai := &kaitopologyv1alpha1.Topology{} + err = c.Get(context.Background(), types.NamespacedName{Name: "test-topology"}, kai) + require.NoError(t, err) + assert.Len(t, kai.Spec.Levels, 1) + assert.Equal(t, "topology.kubernetes.io/zone", kai.Spec.Levels[0].NodeLabel) + + ownerRef := metav1.GetControllerOf(kai) + require.NotNil(t, ownerRef) + assert.Equal(t, "test-topology", ownerRef.Name) }, }, - } - - err := ensureKAITopology(context.Background(), client, clusterTopology) - assert.NoError(t, err) - - kaiTopology := &kaitopologyv1alpha1.Topology{} - err = client.Get(context.Background(), types.NamespacedName{Name: "test-topology"}, kaiTopology) - assert.NoError(t, err) - assert.Len(t, kaiTopology.Spec.Levels, 1) - assert.Equal(t, "topology.kubernetes.io/region", kaiTopology.Spec.Levels[0].NodeLabel) - - ownerRef := metav1.GetControllerOf(kaiTopology) - require.NotNil(t, ownerRef) - assert.Equal(t, "test-topology", ownerRef.Name) - assert.Equal(t, clusterTopology.UID, ownerRef.UID) -} - -func Test_ensureKAITopology_ExistsUnchanged(t *testing.T) { - scheme := runtime.NewScheme() - _ = corev1alpha1.AddToScheme(scheme) - _ = kaitopologyv1alpha1.AddToScheme(scheme) - - clusterTopology := &corev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - UID: types.UID("cluster-topology-uid"), + { + name: "update_cluster_create_kai", + setup: func(builder *fake.ClientBuilder) { + existing := &corev1alpha1.ClusterTopology{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-topology", + UID: types.UID("cluster-uid"), + }, + Spec: corev1alpha1.ClusterTopologySpec{ + Levels: []corev1alpha1.TopologyLevel{ + {Domain: "zone", Key: "old-key"}, + }, + }, + } + builder.WithObjects(existing) + }, + levels: []configv1alpha1.TopologyLevel{ + {Domain: "zone", Key: "new-key"}, + }, + wantErr: false, + validate: func(t *testing.T, c client.Client) { + ct := &corev1alpha1.ClusterTopology{} + err := c.Get(context.Background(), types.NamespacedName{Name: "test-topology"}, ct) + require.NoError(t, err) + assert.Equal(t, "new-key", ct.Spec.Levels[0].Key) + + kai := &kaitopologyv1alpha1.Topology{} + err = c.Get(context.Background(), types.NamespacedName{Name: "test-topology"}, kai) + require.NoError(t, err) + assert.Len(t, kai.Spec.Levels, 1) + 
assert.Equal(t, "new-key", kai.Spec.Levels[0].NodeLabel) + }, }, - Spec: corev1alpha1.ClusterTopologySpec{ - Levels: []corev1alpha1.TopologyLevel{ + { + name: "both_exist_unchanged", + setup: func(builder *fake.ClientBuilder) { + ct := &corev1alpha1.ClusterTopology{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-topology", + UID: types.UID("cluster-uid"), + }, + Spec: corev1alpha1.ClusterTopologySpec{ + Levels: []corev1alpha1.TopologyLevel{ + {Domain: "region", Key: "topology.kubernetes.io/region"}, + }, + }, + } + + kai := &kaitopologyv1alpha1.Topology{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-topology", + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "core.grove.io/v1alpha1", + Kind: "ClusterTopology", + Name: "test-topology", + UID: ct.UID, + Controller: ptr.To(true), + BlockOwnerDeletion: ptr.To(true), + }, + }, + }, + Spec: kaitopologyv1alpha1.TopologySpec{ + Levels: []kaitopologyv1alpha1.TopologyLevel{ + {NodeLabel: "topology.kubernetes.io/region"}, + }, + }, + } + + builder.WithObjects(ct, kai) + }, + levels: []configv1alpha1.TopologyLevel{ {Domain: "region", Key: "topology.kubernetes.io/region"}, }, - }, - } - - existingKAI := &kaitopologyv1alpha1.Topology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - OwnerReferences: []metav1.OwnerReference{ - { - APIVersion: "core.grove.io/v1alpha1", - Kind: "ClusterTopology", - Name: "test-topology", - UID: clusterTopology.UID, - Controller: ptr(true), - BlockOwnerDeletion: ptr(true), - }, + wantErr: false, + validate: func(t *testing.T, c client.Client) { + ct := &corev1alpha1.ClusterTopology{} + err := c.Get(context.Background(), types.NamespacedName{Name: "test-topology"}, ct) + require.NoError(t, err) + assert.Len(t, ct.Spec.Levels, 1) + + kai := &kaitopologyv1alpha1.Topology{} + err = c.Get(context.Background(), types.NamespacedName{Name: "test-topology"}, kai) + require.NoError(t, err) + assert.Len(t, kai.Spec.Levels, 1) }, }, - Spec: kaitopologyv1alpha1.TopologySpec{ - Levels: []kaitopologyv1alpha1.TopologyLevel{ - {NodeLabel: "topology.kubernetes.io/region"}, + { + name: "both_need_changes", + setup: func(builder *fake.ClientBuilder) { + ct := &corev1alpha1.ClusterTopology{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-topology", + UID: types.UID("cluster-uid"), + }, + Spec: corev1alpha1.ClusterTopologySpec{ + Levels: []corev1alpha1.TopologyLevel{ + {Domain: "region", Key: "old-region-key"}, + }, + }, + } + + kai := &kaitopologyv1alpha1.Topology{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-topology", + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "core.grove.io/v1alpha1", + Kind: "ClusterTopology", + Name: "test-topology", + UID: ct.UID, + Controller: ptr.To(true), + BlockOwnerDeletion: ptr.To(true), + }, + }, + }, + Spec: kaitopologyv1alpha1.TopologySpec{ + Levels: []kaitopologyv1alpha1.TopologyLevel{ + {NodeLabel: "old-region-key"}, + }, + }, + } + + builder.WithObjects(ct, kai) + }, + levels: []configv1alpha1.TopologyLevel{ + {Domain: "region", Key: "new-region-key"}, + {Domain: "zone", Key: "topology.kubernetes.io/zone"}, + }, + wantErr: false, + validate: func(t *testing.T, c client.Client) { + ct := &corev1alpha1.ClusterTopology{} + err := c.Get(context.Background(), types.NamespacedName{Name: "test-topology"}, ct) + require.NoError(t, err) + assert.Len(t, ct.Spec.Levels, 2) + assert.Equal(t, "new-region-key", ct.Spec.Levels[0].Key) + + kai := &kaitopologyv1alpha1.Topology{} + err = c.Get(context.Background(), types.NamespacedName{Name: "test-topology"}, kai) + 
require.NoError(t, err) + assert.Len(t, kai.Spec.Levels, 2) }, }, - } - - client := fake.NewClientBuilder().WithScheme(scheme).WithObjects(clusterTopology, existingKAI).Build() - - err := ensureKAITopology(context.Background(), client, clusterTopology) - assert.NoError(t, err) - - kaiTopology := &kaitopologyv1alpha1.Topology{} - err = client.Get(context.Background(), types.NamespacedName{Name: "test-topology"}, kaiTopology) - assert.NoError(t, err) - assert.Len(t, kaiTopology.Spec.Levels, 1) - assert.Equal(t, "topology.kubernetes.io/region", kaiTopology.Spec.Levels[0].NodeLabel) -} - -func Test_ensureKAITopology_WrongOwner(t *testing.T) { - scheme := runtime.NewScheme() - _ = corev1alpha1.AddToScheme(scheme) - _ = kaitopologyv1alpha1.AddToScheme(scheme) - - clusterTopology := &corev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - UID: types.UID("cluster-topology-uid"), + { + name: "empty_levels", + setup: nil, + levels: []configv1alpha1.TopologyLevel{}, + wantErr: false, + validate: func(t *testing.T, c client.Client) { + ct := &corev1alpha1.ClusterTopology{} + err := c.Get(context.Background(), types.NamespacedName{Name: "test-topology"}, ct) + require.NoError(t, err) + assert.Len(t, ct.Spec.Levels, 0) + + kai := &kaitopologyv1alpha1.Topology{} + err = c.Get(context.Background(), types.NamespacedName{Name: "test-topology"}, kai) + require.NoError(t, err) + assert.Len(t, kai.Spec.Levels, 0) + }, }, - Spec: corev1alpha1.ClusterTopologySpec{ - Levels: []corev1alpha1.TopologyLevel{ + { + name: "kai_wrong_owner", + setup: func(builder *fake.ClientBuilder) { + ct := &corev1alpha1.ClusterTopology{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-topology", + UID: types.UID("cluster-uid"), + }, + Spec: corev1alpha1.ClusterTopologySpec{ + Levels: []corev1alpha1.TopologyLevel{ + {Domain: "region", Key: "topology.kubernetes.io/region"}, + }, + }, + } + + kai := &kaitopologyv1alpha1.Topology{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-topology", + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "core.grove.io/v1alpha1", + Kind: "ClusterTopology", + Name: "different-owner", + UID: types.UID("different-uid"), + Controller: ptr.To(true), + BlockOwnerDeletion: ptr.To(true), + }, + }, + }, + Spec: kaitopologyv1alpha1.TopologySpec{ + Levels: []kaitopologyv1alpha1.TopologyLevel{ + {NodeLabel: "topology.kubernetes.io/region"}, + }, + }, + } + + builder.WithObjects(ct, kai) + }, + levels: []configv1alpha1.TopologyLevel{ {Domain: "region", Key: "topology.kubernetes.io/region"}, }, + wantErr: true, + errContains: "owned by a different controller", }, - } - - existingKAI := &kaitopologyv1alpha1.Topology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - OwnerReferences: []metav1.OwnerReference{ - { - APIVersion: "core.grove.io/v1alpha1", - Kind: "ClusterTopology", - Name: "different-owner", - UID: types.UID("different-uid"), - Controller: ptr(true), - BlockOwnerDeletion: ptr(true), - }, + { + name: "kai_no_owner", + setup: func(builder *fake.ClientBuilder) { + ct := &corev1alpha1.ClusterTopology{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-topology", + UID: types.UID("cluster-uid"), + }, + Spec: corev1alpha1.ClusterTopologySpec{ + Levels: []corev1alpha1.TopologyLevel{ + {Domain: "region", Key: "topology.kubernetes.io/region"}, + }, + }, + } + + kai := &kaitopologyv1alpha1.Topology{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-topology", + }, + Spec: kaitopologyv1alpha1.TopologySpec{ + Levels: []kaitopologyv1alpha1.TopologyLevel{ + {NodeLabel: 
"topology.kubernetes.io/region"}, + }, + }, + } + + builder.WithObjects(ct, kai) }, - }, - Spec: kaitopologyv1alpha1.TopologySpec{ - Levels: []kaitopologyv1alpha1.TopologyLevel{ - {NodeLabel: "topology.kubernetes.io/region"}, + levels: []configv1alpha1.TopologyLevel{ + {Domain: "region", Key: "topology.kubernetes.io/region"}, }, + wantErr: true, + errContains: "owned by a different controller", }, - } - - client := fake.NewClientBuilder().WithScheme(scheme).WithObjects(clusterTopology, existingKAI).Build() - - err := ensureKAITopology(context.Background(), client, clusterTopology) - assert.Error(t, err) - assert.Contains(t, err.Error(), "owned by a different controller") -} - -func Test_ensureKAITopology_RecreateOnLevelsChange(t *testing.T) { - scheme := runtime.NewScheme() - _ = corev1alpha1.AddToScheme(scheme) - _ = kaitopologyv1alpha1.AddToScheme(scheme) - - clusterTopology := &corev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - UID: types.UID("cluster-topology-uid"), - }, - Spec: corev1alpha1.ClusterTopologySpec{ - Levels: []corev1alpha1.TopologyLevel{ + { + name: "levels_change_recreate_kai", + setup: func(builder *fake.ClientBuilder) { + ct := &corev1alpha1.ClusterTopology{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-topology", + UID: types.UID("cluster-uid"), + }, + Spec: corev1alpha1.ClusterTopologySpec{ + Levels: []corev1alpha1.TopologyLevel{ + {Domain: "region", Key: "topology.kubernetes.io/region"}, + }, + }, + } + + kai := &kaitopologyv1alpha1.Topology{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-topology", + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "core.grove.io/v1alpha1", + Kind: "ClusterTopology", + Name: "test-topology", + UID: ct.UID, + Controller: ptr.To(true), + BlockOwnerDeletion: ptr.To(true), + }, + }, + }, + Spec: kaitopologyv1alpha1.TopologySpec{ + Levels: []kaitopologyv1alpha1.TopologyLevel{ + {NodeLabel: "old-label"}, + }, + }, + } + + builder.WithObjects(ct, kai) + }, + levels: []configv1alpha1.TopologyLevel{ {Domain: "region", Key: "topology.kubernetes.io/region"}, {Domain: "zone", Key: "topology.kubernetes.io/zone"}, }, - }, - } - - existingKAI := &kaitopologyv1alpha1.Topology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - OwnerReferences: []metav1.OwnerReference{ - { - APIVersion: "core.grove.io/v1alpha1", - Kind: "ClusterTopology", - Name: "test-topology", - UID: clusterTopology.UID, - Controller: ptr(true), - BlockOwnerDeletion: ptr(true), - }, - }, - }, - Spec: kaitopologyv1alpha1.TopologySpec{ - Levels: []kaitopologyv1alpha1.TopologyLevel{ - {NodeLabel: "old-label"}, + wantErr: false, + validate: func(t *testing.T, c client.Client) { + ct := &corev1alpha1.ClusterTopology{} + err := c.Get(context.Background(), types.NamespacedName{Name: "test-topology"}, ct) + require.NoError(t, err) + assert.Len(t, ct.Spec.Levels, 2) + + kai := &kaitopologyv1alpha1.Topology{} + err = c.Get(context.Background(), types.NamespacedName{Name: "test-topology"}, kai) + require.NoError(t, err) + assert.Len(t, kai.Spec.Levels, 2) + assert.Equal(t, "topology.kubernetes.io/region", kai.Spec.Levels[0].NodeLabel) + assert.Equal(t, "topology.kubernetes.io/zone", kai.Spec.Levels[1].NodeLabel) }, }, } - client := fake.NewClientBuilder().WithScheme(scheme).WithObjects(clusterTopology, existingKAI).Build() - - err := ensureKAITopology(context.Background(), client, clusterTopology) - assert.NoError(t, err) - - kaiTopology := &kaitopologyv1alpha1.Topology{} - err = client.Get(context.Background(), 
types.NamespacedName{Name: "test-topology"}, kaiTopology) - assert.NoError(t, err) - assert.Len(t, kaiTopology.Spec.Levels, 2) - assert.Equal(t, "topology.kubernetes.io/region", kaiTopology.Spec.Levels[0].NodeLabel) - assert.Equal(t, "topology.kubernetes.io/zone", kaiTopology.Spec.Levels[1].NodeLabel) -} - -func Test_createKAITopology_Success(t *testing.T) { - scheme := runtime.NewScheme() - _ = corev1alpha1.AddToScheme(scheme) - _ = kaitopologyv1alpha1.AddToScheme(scheme) - client := fake.NewClientBuilder().WithScheme(scheme).Build() - - clusterTopology := &corev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - UID: types.UID("cluster-topology-uid"), - }, + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + scheme := runtime.NewScheme() + _ = corev1alpha1.AddToScheme(scheme) + _ = kaitopologyv1alpha1.AddToScheme(scheme) + + builder := fake.NewClientBuilder().WithScheme(scheme) + if tt.setup != nil { + tt.setup(builder) + } + c := builder.Build() + + err := EnsureTopology(context.Background(), c, "test-topology", tt.levels) + + if tt.wantErr { + require.Error(t, err) + if tt.errContains != "" { + assert.Contains(t, err.Error(), tt.errContains) + } + } else { + require.NoError(t, err) + if tt.validate != nil { + tt.validate(t, c) + } + } + }) } - - kaiTopology := &kaitopologyv1alpha1.Topology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-kai-topology", - }, - Spec: kaitopologyv1alpha1.TopologySpec{ - Levels: []kaitopologyv1alpha1.TopologyLevel{ - {NodeLabel: "topology.kubernetes.io/region"}, - }, - }, - } - - err := createKAITopology(context.Background(), client, clusterTopology, kaiTopology) - assert.NoError(t, err) - - created := &kaitopologyv1alpha1.Topology{} - err = client.Get(context.Background(), types.NamespacedName{Name: "test-kai-topology"}, created) - assert.NoError(t, err) - - ownerRef := metav1.GetControllerOf(created) - require.NotNil(t, ownerRef) - assert.Equal(t, "test-topology", ownerRef.Name) - assert.Equal(t, clusterTopology.UID, ownerRef.UID) -} - -func ptr(b bool) *bool { - return &b } From 9c3353bb1daf23d801fae00e92c7f9f4db98c76e Mon Sep 17 00:00:00 2001 From: Ron Kahn Date: Wed, 24 Dec 2025 23:30:00 +0200 Subject: [PATCH 12/23] feat: reorganize imports in topology.go and topology_test.go for consistency Signed-off-by: Ron Kahn --- operator/internal/topology/topology.go | 3 ++- operator/internal/topology/topology_test.go | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/operator/internal/topology/topology.go b/operator/internal/topology/topology.go index 29d482a72..0395f904e 100644 --- a/operator/internal/topology/topology.go +++ b/operator/internal/topology/topology.go @@ -22,9 +22,10 @@ import ( "reflect" "sort" - kaitopologyv1alpha1 "github.com/NVIDIA/KAI-scheduler/pkg/apis/kai/v1alpha1" configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1" corev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" + + kaitopologyv1alpha1 "github.com/NVIDIA/KAI-scheduler/pkg/apis/kai/v1alpha1" "github.com/samber/lo" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" diff --git a/operator/internal/topology/topology_test.go b/operator/internal/topology/topology_test.go index 3d4ff1c58..45fc7ac19 100644 --- a/operator/internal/topology/topology_test.go +++ b/operator/internal/topology/topology_test.go @@ -22,7 +22,6 @@ import ( configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1" corev1alpha1 
"github.com/ai-dynamo/grove/operator/api/core/v1alpha1" - "k8s.io/utils/ptr" kaitopologyv1alpha1 "github.com/NVIDIA/KAI-scheduler/pkg/apis/kai/v1alpha1" "github.com/stretchr/testify/assert" @@ -30,6 +29,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" + "k8s.io/utils/ptr" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" ) From 2813ddfe61017a3c50fe7d5ba4dcee8a44a849b3 Mon Sep 17 00:00:00 2001 From: Ron Kahn Date: Thu, 25 Dec 2025 10:36:40 +0200 Subject: [PATCH 13/23] feat: implement EnsureDeleteClusterTopology function and enhance main logic for topology management Signed-off-by: Ron Kahn --- operator/cmd/main.go | 9 ++- operator/internal/topology/topology.go | 14 ++++ operator/internal/topology/topology_test.go | 73 +++++++++++++++++++++ 3 files changed, 95 insertions(+), 1 deletion(-) diff --git a/operator/cmd/main.go b/operator/cmd/main.go index 7cb2857a7..cbbc0349f 100644 --- a/operator/cmd/main.go +++ b/operator/cmd/main.go @@ -64,11 +64,12 @@ func main() { os.Exit(1) } + topologyK8sClient, err := client.New(mgr.GetConfig(), client.Options{Scheme: mgr.GetScheme()}) ctx := ctrl.SetupSignalHandler() if operatorCfg.ClusterTopology.Enabled { + logger.Info("Topology is enabled") // create a Kubernetes client for cluster topology // the default client manager is not running prior (mgr.Start()) - topologyK8sClient, err := client.New(mgr.GetConfig(), client.Options{Scheme: mgr.GetScheme()}) if err != nil { logger.Error(err, "failed to create Kubernetes client") os.Exit(1) @@ -78,6 +79,12 @@ func main() { logger.Error(err, "cannot create/update cluster topology, operator cannot start") os.Exit(1) } + } else { + logger.Info("Topology is disabled") + err = topology.EnsureDeleteClusterTopology(ctx, topologyK8sClient, corev1alpha1.ClusterTopologyName) + if err != nil { + logger.Error(err, "non-fatal: cannot delete cluster topology") + } } webhookCertsReadyCh := make(chan struct{}) diff --git a/operator/internal/topology/topology.go b/operator/internal/topology/topology.go index 0395f904e..4d90a79f6 100644 --- a/operator/internal/topology/topology.go +++ b/operator/internal/topology/topology.go @@ -175,3 +175,17 @@ func ensureKAITopology(ctx context.Context, client client.Client, clusterTopolog logger.Info("KAI topology unchanged", "name", desiredKaiTopology.Name) return nil } + +// EnsureDeleteClusterTopology deletes the ClusterTopology with the given name if it exists +func EnsureDeleteClusterTopology(ctx context.Context, client client.Client, name string) error { + kaiTopology := &corev1alpha1.ClusterTopology{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + }, + } + err := client.Delete(ctx, kaiTopology) + if err != nil && !apierrors.IsNotFound(err) { + return fmt.Errorf("failed to delete Cluster-Topology: %w", err) + } + return nil +} diff --git a/operator/internal/topology/topology_test.go b/operator/internal/topology/topology_test.go index 45fc7ac19..5972046c9 100644 --- a/operator/internal/topology/topology_test.go +++ b/operator/internal/topology/topology_test.go @@ -26,6 +26,7 @@ import ( kaitopologyv1alpha1 "github.com/NVIDIA/KAI-scheduler/pkg/apis/kai/v1alpha1" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" @@ -429,3 +430,75 @@ func TestEnsureTopology(t *testing.T) { }) } } + +func 
TestEnsureDeleteClusterTopology(t *testing.T) { + tests := []struct { + name string + setup func(builder *fake.ClientBuilder) + wantErr bool + errContains string + validate func(*testing.T, client.Client) + }{ + { + name: "successful_delete", + setup: func(builder *fake.ClientBuilder) { + ct := &corev1alpha1.ClusterTopology{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-topology", + }, + Spec: corev1alpha1.ClusterTopologySpec{ + Levels: []corev1alpha1.TopologyLevel{ + {Domain: "region", Key: "topology.kubernetes.io/region"}, + }, + }, + } + builder.WithObjects(ct) + }, + wantErr: false, + validate: func(t *testing.T, c client.Client) { + ct := &corev1alpha1.ClusterTopology{} + err := c.Get(context.Background(), types.NamespacedName{Name: "test-topology"}, ct) + require.Error(t, err) + assert.True(t, apierrors.IsNotFound(err)) + }, + }, + { + name: "already_deleted", + setup: nil, + wantErr: false, + validate: func(t *testing.T, c client.Client) { + ct := &corev1alpha1.ClusterTopology{} + err := c.Get(context.Background(), types.NamespacedName{Name: "test-topology"}, ct) + require.Error(t, err) + assert.True(t, apierrors.IsNotFound(err)) + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + scheme := runtime.NewScheme() + _ = corev1alpha1.AddToScheme(scheme) + + builder := fake.NewClientBuilder().WithScheme(scheme) + if tt.setup != nil { + tt.setup(builder) + } + c := builder.Build() + + err := EnsureDeleteClusterTopology(context.Background(), c, "test-topology") + + if tt.wantErr { + require.Error(t, err) + if tt.errContains != "" { + assert.Contains(t, err.Error(), tt.errContains) + } + } else { + require.NoError(t, err) + if tt.validate != nil { + tt.validate(t, c) + } + } + }) + } +} From 75d6858eee95d0f68518aadfbaf1064c5cbdc044 Mon Sep 17 00:00:00 2001 From: Ron Kahn Date: Thu, 25 Dec 2025 10:49:43 +0200 Subject: [PATCH 14/23] feat: update Go version to 1.24.5 in go.mod (bc KAI) Signed-off-by: Ron Kahn --- operator/go.mod | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operator/go.mod b/operator/go.mod index 8fc3eac78..03ba96fa1 100644 --- a/operator/go.mod +++ b/operator/go.mod @@ -1,6 +1,6 @@ module github.com/ai-dynamo/grove/operator -go 1.24.0 +go 1.24.5 require ( github.com/NVIDIA/KAI-scheduler v0.12.0 From d0eb65fb870db0e33c93140939584c21df6606df Mon Sep 17 00:00:00 2001 From: Ron Kahn <122778260+Ronkahn21@users.noreply.github.com> Date: Thu, 25 Dec 2025 15:04:23 +0200 Subject: [PATCH 15/23] Update values.yaml disable the feature by default --- operator/charts/values.yaml | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/operator/charts/values.yaml b/operator/charts/values.yaml index 8a34b0fcc..af1a13fad 100644 --- a/operator/charts/values.yaml +++ b/operator/charts/values.yaml @@ -61,17 +61,7 @@ config: logLevel: info logFormat: json clusterTopology: - enabled: true - name: default - levels: - - domain: region - key: region - - domain: zone - key: zone - - domain: datacenter - key: datacenter - - domain: block - key: block + enabled: false authorizer: enabled: false exemptServiceAccountUserNames: From 3be0dbeb0b9f7c149be1040b66464995781cbe81 Mon Sep 17 00:00:00 2001 From: Madhav Bhargava Date: Sun, 28 Dec 2025 23:08:10 +0530 Subject: [PATCH 16/23] Refactoring of the PR containing the following changes: * Removed duplicate definition of TopologyDomain and TopologyLevel across config and core API. * Simplified sorting and removed unnecessary functions. 
* Added sample values for levels in values.yaml * Improved validations for OperatorConfiguration.ClusterTopology * Added missing JSON tag for LeaderElection in OperatorConfiguration * Refactored operator/cmd * Removed unused install-helm-charts script * Removed Makefile target added for running a specific int test. * Renamed and refactored internal/topology to internal/clustertopology. * Refactored application version handling. Signed-off-by: Madhav Bhargava --- docs/api-reference/operator-api.md | 50 +- operator/Makefile | 5 - operator/api/common/constants/constants.go | 11 + operator/api/config/v1alpha1/register.go | 7 +- operator/api/config/v1alpha1/types.go | 51 +- .../config/v1alpha1/zz_generated.deepcopy.go | 19 +- operator/api/config/validation/validation.go | 42 +- .../api/config/validation/validation_test.go | 174 +++- operator/api/core/v1alpha1/clustertopology.go | 109 +-- .../api/core/v1alpha1/clustertopology_test.go | 195 +++-- .../crds/grove.io_clustertopologies.yaml | 8 +- operator/api/core/v1alpha1/register.go | 9 +- operator/charts/templates/clusterrole.yaml | 1 - operator/charts/values.yaml | 15 + operator/cmd/cli/cli.go | 124 +++ operator/cmd/cli/cli_test.go | 171 ++++ operator/cmd/cli/testdata/invalid-config.yaml | 26 + operator/cmd/cli/testdata/valid-config.yaml | 38 + operator/cmd/cli/version.go | 68 ++ operator/cmd/cli/version_test.go | 158 ++++ operator/cmd/main.go | 116 +-- operator/cmd/opts/options.go | 80 -- operator/hack/install-helm-charts.sh | 34 - operator/initc/cmd/main.go | 4 +- operator/initc/cmd/opts/options.go | 3 - .../clustertopology/clustertopology.go | 154 ++++ .../clustertopology/clustertopology_test.go | 774 ++++++++++++++++++ operator/internal/controller/manager.go | 8 +- .../podclique/components/pod/initcontainer.go | 4 +- operator/internal/topology/topology.go | 191 ----- operator/internal/topology/topology_test.go | 504 ------------ operator/internal/utils/ioutil/ioutil.go | 26 + operator/internal/utils/ioutil/ioutil_test.go | 133 +++ operator/internal/version/version.go | 66 +- operator/internal/webhook/register.go | 4 +- operator/internal/webhook/register_test.go | 8 +- 36 files changed, 2174 insertions(+), 1216 deletions(-) create mode 100644 operator/cmd/cli/cli.go create mode 100644 operator/cmd/cli/cli_test.go create mode 100644 operator/cmd/cli/testdata/invalid-config.yaml create mode 100644 operator/cmd/cli/testdata/valid-config.yaml create mode 100644 operator/cmd/cli/version.go create mode 100644 operator/cmd/cli/version_test.go delete mode 100644 operator/cmd/opts/options.go delete mode 100755 operator/hack/install-helm-charts.sh create mode 100644 operator/internal/clustertopology/clustertopology.go create mode 100644 operator/internal/clustertopology/clustertopology_test.go delete mode 100644 operator/internal/topology/topology.go delete mode 100644 operator/internal/topology/topology_test.go create mode 100644 operator/internal/utils/ioutil/ioutil.go create mode 100644 operator/internal/utils/ioutil/ioutil_test.go diff --git a/docs/api-reference/operator-api.md b/docs/api-reference/operator-api.md index 2ef1be6ac..cd5b46d82 100644 --- a/docs/api-reference/operator-api.md +++ b/docs/api-reference/operator-api.md @@ -86,7 +86,7 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `levels` _[TopologyLevel](#topologylevel) array_ | Levels is an ordered list of topology levels from broadest to narrowest scope.
The order in this list defines the hierarchy (index 0 = broadest level).
This field is immutable after creation. | | MaxItems: 7
MinItems: 1
| +| `levels` _[TopologyLevel](#topologylevel) array_ | Levels is an ordered list of topology levels from broadest to narrowest scope.
The order in this list defines the hierarchy (index 0 = broadest level).
This field is immutable after creation. | | MinItems: 1
| #### ErrorCode @@ -608,9 +608,7 @@ _Appears in:_ _Underlying type:_ _string_ -TopologyDomain represents a predefined topology level in the hierarchy. -Topology ordering (broadest to narrowest): -Region > Zone > DataCenter > Block > Rack > Host > Numa +TopologyDomain represents a level in the cluster topology hierarchy. @@ -634,15 +632,18 @@ _Appears in:_ TopologyLevel defines a single level in the topology hierarchy. +Maps a platform-agnostic domain to a platform-specific node label key, +allowing workload operators a consistent way to reference topology levels when defining TopologyConstraint's. _Appears in:_ +- [ClusterTopologyConfiguration](#clustertopologyconfiguration) - [ClusterTopologySpec](#clustertopologyspec) | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `domain` _[TopologyDomain](#topologydomain)_ | Domain is the predefined level identifier used in TopologyConstraint references.
Must be one of: region, zone, datacenter, block, rack, host, numa | | Enum: [region zone datacenter block rack host numa]
Required: \{\}
| +| `domain` _[TopologyDomain](#topologydomain)_ | Domain is a platform provider-agnostic level identifier.
Must be one of: region, zone, datacenter, block, rack, host, numa | | Enum: [region zone datacenter block rack host numa]
Required: \{\}
| | `key` _string_ | Key is the node label key that identifies this topology domain.
Must be a valid Kubernetes label key (qualified name).
Examples: "topology.kubernetes.io/zone", "kubernetes.io/hostname" | | MaxLength: 63
MinLength: 1
Pattern: `^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]/)?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$`
Required: \{\}
| @@ -882,45 +883,6 @@ _Appears in:_ | `metrics` _[Server](#server)_ | Metrics is the configuration for serving the metrics endpoint. | | | -#### TopologyDomain - -_Underlying type:_ _string_ - -TopologyDomain represents a predefined topology level in the hierarchy. - - - -_Appears in:_ -- [TopologyLevel](#topologylevel) - -| Field | Description | -| --- | --- | -| `region` | TopologyDomainRegion represents the region level in the topology hierarchy.
| -| `zone` | TopologyDomainZone represents the zone level in the topology hierarchy.
| -| `datacenter` | TopologyDomainDataCenter represents the datacenter level in the topology hierarchy.
| -| `block` | TopologyDomainBlock represents the block level in the topology hierarchy.
| -| `rack` | TopologyDomainRack represents the rack level in the topology hierarchy.
| -| `host` | TopologyDomainHost represents the host level in the topology hierarchy.
| -| `numa` | TopologyDomainNuma represents the numa level in the topology hierarchy.
| - - -#### TopologyLevel - - - -TopologyLevel defines a single level in the topology hierarchy. - - - -_Appears in:_ -- [ClusterTopologyConfiguration](#clustertopologyconfiguration) - -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `domain` _[TopologyDomain](#topologydomain)_ | Domain is the predefined level identifier used in TopologyConstraint references. | | | -| `key` _string_ | Key is the node label key that identifies this topology domain. | | | - - #### WebhookServer diff --git a/operator/Makefile b/operator/Makefile index a4012e7c5..0645cbbaa 100644 --- a/operator/Makefile +++ b/operator/Makefile @@ -86,11 +86,6 @@ cover-html: test-cover @go tool cover -html=coverage.out -o coverage.html @echo "Coverage report generated at coverage.html" -# Run envtest tests (requires envtest binaries) -.PHONY: test-envtest -test-envtest: $(SETUP_ENVTEST) - @echo "Running envtest with CRD validation..." - # Run e2e tests .PHONY: test-e2e test-e2e: diff --git a/operator/api/common/constants/constants.go b/operator/api/common/constants/constants.go index ae3568cf8..437feaffa 100644 --- a/operator/api/common/constants/constants.go +++ b/operator/api/common/constants/constants.go @@ -16,6 +16,15 @@ package constants +const ( + // OperatorName is the name of the Grove operator. + OperatorName = "grove-operator" + // OperatorConfigGroupName is the name of the group for Grove operator configuration. + OperatorConfigGroupName = "operator.config.grove.io" + // OperatorGroupName is the name of the group for all Grove custom resources. + OperatorGroupName = "grove.io" +) + // Constants for finalizers. const ( // FinalizerPodCliqueSet is the finalizer for PodCliqueSet that is added to `.metadata.finalizers[]` slice. This will be placed on all PodCliqueSet resources @@ -107,4 +116,6 @@ const ( KindPodClique = "PodClique" // KindPodCliqueScalingGroup is the kind for a PodCliqueScalingGroup resource. KindPodCliqueScalingGroup = "PodCliqueScalingGroup" + // KindClusterTopology is the kind for a ClusterTopology resource. + KindClusterTopology = "ClusterTopology" ) diff --git a/operator/api/config/v1alpha1/register.go b/operator/api/config/v1alpha1/register.go index c4f0f181a..ccd73f208 100644 --- a/operator/api/config/v1alpha1/register.go +++ b/operator/api/config/v1alpha1/register.go @@ -17,16 +17,15 @@ package v1alpha1 import ( + "github.com/ai-dynamo/grove/operator/api/common/constants" + "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/schema" ) -// GroupName is the group name used in this package -const GroupName = "operator.config.grove.io" - var ( // SchemeGroupVersion is group version used to register these objects - SchemeGroupVersion = schema.GroupVersion{Group: GroupName, Version: "v1alpha1"} + SchemeGroupVersion = schema.GroupVersion{Group: constants.OperatorConfigGroupName, Version: "v1alpha1"} // SchemeBuilder is used to add go types to the GroupVersionKind scheme SchemeBuilder = runtime.NewSchemeBuilder(addKnownTypes, addDefaultingFuncs) // AddToScheme is a reference to the Scheme Builder's AddToScheme function. 
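For reference, an illustrative operator configuration stanza for the refactored ClusterTopologyConfiguration (see the types.go change below); the domain/key pairs are sample values, not shipped defaults:

    clusterTopology:
      enabled: true
      levels:
        - domain: zone
          key: topology.kubernetes.io/zone
        - domain: host
          key: kubernetes.io/hostname

Levels are ordered from broadest to narrowest scope, and each domain and key must be unique, matching the uniqueness checks added to validation.go in this patch.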
diff --git a/operator/api/config/v1alpha1/types.go b/operator/api/config/v1alpha1/types.go index 826f7d027..a1d2aa1f7 100644 --- a/operator/api/config/v1alpha1/types.go +++ b/operator/api/config/v1alpha1/types.go @@ -17,6 +17,8 @@ package v1alpha1 import ( + corev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) @@ -55,14 +57,14 @@ var ( type OperatorConfiguration struct { metav1.TypeMeta `json:",inline"` ClientConnection ClientConnectionConfiguration `json:"runtimeClientConnection"` - LeaderElection LeaderElectionConfiguration - Server ServerConfiguration `json:"server"` - Debugging *DebuggingConfiguration `json:"debugging,omitempty"` - Controllers ControllerConfiguration `json:"controllers"` - LogLevel LogLevel `json:"logLevel"` - LogFormat LogFormat `json:"logFormat"` - Authorizer AuthorizerConfig `json:"authorizer"` - ClusterTopology ClusterTopologyConfiguration `json:"clusterTopology"` + LeaderElection LeaderElectionConfiguration `json:"leaderElection"` + Server ServerConfiguration `json:"server"` + Debugging *DebuggingConfiguration `json:"debugging,omitempty"` + Controllers ControllerConfiguration `json:"controllers"` + LogLevel LogLevel `json:"logLevel"` + LogFormat LogFormat `json:"logFormat"` + Authorizer AuthorizerConfig `json:"authorizer"` + ClusterTopology ClusterTopologyConfiguration `json:"clusterTopology"` } // LeaderElectionConfiguration defines the configuration for the leader election. @@ -196,36 +198,5 @@ type ClusterTopologyConfiguration struct { // Levels is an ordered list of topology levels from broadest to narrowest scope. // Used to create/update the ClusterTopology CR at operator startup. // +optional - Levels []TopologyLevel `json:"levels,omitempty"` -} - -// TopologyDomain represents a predefined topology level in the hierarchy. -type TopologyDomain string - -const ( - // TopologyDomainRegion represents the region level in the topology hierarchy. - TopologyDomainRegion TopologyDomain = "region" - // TopologyDomainZone represents the zone level in the topology hierarchy. - TopologyDomainZone TopologyDomain = "zone" - // TopologyDomainDataCenter represents the datacenter level in the topology hierarchy. - TopologyDomainDataCenter TopologyDomain = "datacenter" - // TopologyDomainBlock represents the block level in the topology hierarchy. - TopologyDomainBlock TopologyDomain = "block" - // TopologyDomainRack represents the rack level in the topology hierarchy. - TopologyDomainRack TopologyDomain = "rack" - // TopologyDomainHost represents the host level in the topology hierarchy. - TopologyDomainHost TopologyDomain = "host" - // TopologyDomainNuma represents the numa level in the topology hierarchy. - TopologyDomainNuma TopologyDomain = "numa" -) - -// MaxTopologyLevels is the maximum number of topology levels supported. -const MaxTopologyLevels = 7 - -// TopologyLevel defines a single level in the topology hierarchy. -type TopologyLevel struct { - // Domain is the predefined level identifier used in TopologyConstraint references. - Domain TopologyDomain `json:"domain"` - // Key is the node label key that identifies this topology domain. 
- Key string `json:"key"` + Levels []corev1alpha1.TopologyLevel `json:"levels,omitempty"` } diff --git a/operator/api/config/v1alpha1/zz_generated.deepcopy.go b/operator/api/config/v1alpha1/zz_generated.deepcopy.go index 317f41bc1..70bc1f6ae 100644 --- a/operator/api/config/v1alpha1/zz_generated.deepcopy.go +++ b/operator/api/config/v1alpha1/zz_generated.deepcopy.go @@ -22,6 +22,7 @@ limitations under the License. package v1alpha1 import ( + corev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" runtime "k8s.io/apimachinery/pkg/runtime" ) @@ -67,7 +68,7 @@ func (in *ClusterTopologyConfiguration) DeepCopyInto(out *ClusterTopologyConfigu *out = *in if in.Levels != nil { in, out := &in.Levels, &out.Levels - *out = make([]TopologyLevel, len(*in)) + *out = make([]corev1alpha1.TopologyLevel, len(*in)) copy(*out, *in) } return @@ -284,22 +285,6 @@ func (in *ServerConfiguration) DeepCopy() *ServerConfiguration { return out } -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *TopologyLevel) DeepCopyInto(out *TopologyLevel) { - *out = *in - return -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TopologyLevel. -func (in *TopologyLevel) DeepCopy() *TopologyLevel { - if in == nil { - return nil - } - out := new(TopologyLevel) - in.DeepCopyInto(out) - return out -} - // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *WebhookServer) DeepCopyInto(out *WebhookServer) { *out = *in diff --git a/operator/api/config/validation/validation.go b/operator/api/config/validation/validation.go index 13d188200..b3e3bca65 100644 --- a/operator/api/config/validation/validation.go +++ b/operator/api/config/validation/validation.go @@ -17,9 +17,12 @@ package validation import ( + "fmt" + "slices" "strings" configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1" + corev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/sets" @@ -118,46 +121,47 @@ func mustBeGreaterThanZeroDuration(duration metav1.Duration, fldPath *field.Path // and validates domain and key uniqueness. func validateClusterTopologyConfiguration(clusterTopologyCfg configv1alpha1.ClusterTopologyConfiguration, fldPath *field.Path) field.ErrorList { allErrs := field.ErrorList{} - if !clusterTopologyCfg.Enabled { return allErrs } - allErrs = validateClusterTopologyLevels(clusterTopologyCfg.Levels, fldPath.Child("levels")) - return allErrs } -func validateClusterTopologyLevels(levels []configv1alpha1.TopologyLevel, fldPath *field.Path) field.ErrorList { +func validateClusterTopologyLevels(levels []corev1alpha1.TopologyLevel, fldPath *field.Path) field.ErrorList { allErrs := field.ErrorList{} if len(levels) == 0 { allErrs = append(allErrs, field.Required(fldPath, "levels are required when topology is enabled")) } + allErrs = append(allErrs, mustHaveSupportedTopologyDomains(levels, fldPath)...) + allErrs = append(allErrs, mustHaveUniqueTopologyLevels(levels, fldPath)...) 
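+	// Domain and key errors are accumulated rather than short-circuited, so a single validation pass reports every unsupported or duplicate level.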
+ return allErrs +} - if len(levels) > configv1alpha1.MaxTopologyLevels { - allErrs = append(allErrs, field.TooMany(fldPath, len(levels), configv1alpha1.MaxTopologyLevels)) +func mustHaveSupportedTopologyDomains(levels []corev1alpha1.TopologyLevel, fldPath *field.Path) field.ErrorList { + allErrs := field.ErrorList{} + supportedDomains := corev1alpha1.SupportedTopologyDomains() + for i, level := range levels { + if !slices.Contains(supportedDomains, level.Domain) { + allErrs = append(allErrs, field.Invalid(fldPath.Index(i).Child("domain"), level.Domain, fmt.Sprintf("must be one of %v", supportedDomains))) + } } - - allErrs = append(allErrs, validateLevelsUniqueness(levels, fldPath)...) - return allErrs } -func validateLevelsUniqueness(levels []configv1alpha1.TopologyLevel, fldPath *field.Path) field.ErrorList { +func mustHaveUniqueTopologyLevels(levels []corev1alpha1.TopologyLevel, fldPath *field.Path) field.ErrorList { allErrs := field.ErrorList{} - seenDomains := make(map[string]int) - seenKeys := make(map[string]int) + seenDomains := make(map[corev1alpha1.TopologyDomain]struct{}) + seenKeys := make(map[string]struct{}) for i, level := range levels { + if _, exists := seenDomains[level.Domain]; exists { + allErrs = append(allErrs, field.Duplicate(fldPath.Index(i).Child("domain"), level.Domain)) + } if _, exists := seenKeys[level.Key]; exists { allErrs = append(allErrs, field.Duplicate(fldPath.Index(i).Child("key"), level.Key)) - } else { - seenKeys[level.Key] = i - } - if _, exists := seenDomains[string(level.Domain)]; exists { - allErrs = append(allErrs, field.Duplicate(fldPath.Index(i).Child("domain"), level.Domain)) - } else { - seenDomains[string(level.Domain)] = i } + seenDomains[level.Domain] = struct{}{} + seenKeys[level.Key] = struct{}{} } return allErrs } diff --git a/operator/api/config/validation/validation_test.go b/operator/api/config/validation/validation_test.go index 4b15439de..3f3f1906a 100644 --- a/operator/api/config/validation/validation_test.go +++ b/operator/api/config/validation/validation_test.go @@ -20,6 +20,7 @@ import ( "testing" configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1" + corev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" "github.com/stretchr/testify/assert" "k8s.io/apimachinery/pkg/util/validation/field" @@ -27,61 +28,182 @@ import ( func TestValidateClusterTopologyConfiguration(t *testing.T) { tests := []struct { - name string - config configv1alpha1.ClusterTopologyConfiguration - expectError bool - errorField string + name string + config configv1alpha1.ClusterTopologyConfiguration + expectErrors int + expectedFields []string + expectedTypes []field.ErrorType }{ { - name: "valid: enabled with levels", + name: "valid: disabled with no levels", config: configv1alpha1.ClusterTopologyConfiguration{ - Enabled: true, + Enabled: false, }, - expectError: false, + expectErrors: 0, }, { - name: "valid: disabled with no levels", + name: "valid: disabled with levels (levels are ignored when disabled)", config: configv1alpha1.ClusterTopologyConfiguration{ Enabled: false, + Levels: []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"}, + }, }, - expectError: false, + expectErrors: 0, }, { - name: "valid: disabled with levels", + name: "valid: enabled with single level", config: configv1alpha1.ClusterTopologyConfiguration{ - Enabled: false, + Enabled: true, + Levels: []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainZone, Key: 
"topology.kubernetes.io/zone"}, + }, }, - expectError: false, + expectErrors: 0, }, { - name: "invalid: enabled with empty levels", + name: "valid: enabled with multiple levels", + config: configv1alpha1.ClusterTopologyConfiguration{ + Enabled: true, + Levels: []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainRegion, Key: "topology.kubernetes.io/region"}, + {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"}, + {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"}, + }, + }, + expectErrors: 0, + }, + { + name: "valid: enabled with all supported domains", config: configv1alpha1.ClusterTopologyConfiguration{ Enabled: true, + Levels: []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainRegion, Key: "topology.kubernetes.io/region"}, + {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"}, + {Domain: corev1alpha1.TopologyDomainDataCenter, Key: "topology.kubernetes.io/datacenter"}, + {Domain: corev1alpha1.TopologyDomainBlock, Key: "topology.kubernetes.io/block"}, + {Domain: corev1alpha1.TopologyDomainRack, Key: "topology.kubernetes.io/rack"}, + {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"}, + {Domain: corev1alpha1.TopologyDomainNuma, Key: "topology.kubernetes.io/numa"}, + }, }, - expectError: true, - errorField: "clusterTopology.levels", + expectErrors: 0, }, { name: "invalid: enabled with empty levels", config: configv1alpha1.ClusterTopologyConfiguration{ Enabled: true, + Levels: []corev1alpha1.TopologyLevel{}, + }, + expectErrors: 1, + expectedFields: []string{"clusterTopology.levels"}, + expectedTypes: []field.ErrorType{field.ErrorTypeRequired}, + }, + { + name: "invalid: enabled with nil levels", + config: configv1alpha1.ClusterTopologyConfiguration{ + Enabled: true, + Levels: nil, + }, + expectErrors: 1, + expectedFields: []string{"clusterTopology.levels"}, + expectedTypes: []field.ErrorType{field.ErrorTypeRequired}, + }, + { + name: "invalid: unsupported domain", + config: configv1alpha1.ClusterTopologyConfiguration{ + Enabled: true, + Levels: []corev1alpha1.TopologyLevel{ + {Domain: "invalid-domain", Key: "some.key"}, + }, + }, + expectErrors: 1, + expectedFields: []string{"clusterTopology.levels[0].domain"}, + expectedTypes: []field.ErrorType{field.ErrorTypeInvalid}, + }, + { + name: "invalid: duplicate domains", + config: configv1alpha1.ClusterTopologyConfiguration{ + Enabled: true, + Levels: []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"}, + {Domain: corev1alpha1.TopologyDomainZone, Key: "another.zone.key"}, + }, + }, + expectErrors: 1, + expectedFields: []string{"clusterTopology.levels[1].domain"}, + expectedTypes: []field.ErrorType{field.ErrorTypeDuplicate}, + }, + { + name: "invalid: duplicate keys", + config: configv1alpha1.ClusterTopologyConfiguration{ + Enabled: true, + Levels: []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"}, + {Domain: corev1alpha1.TopologyDomainHost, Key: "topology.kubernetes.io/zone"}, + }, + }, + expectErrors: 1, + expectedFields: []string{"clusterTopology.levels[1].key"}, + expectedTypes: []field.ErrorType{field.ErrorTypeDuplicate}, + }, + { + name: "invalid: duplicate domains and keys", + config: configv1alpha1.ClusterTopologyConfiguration{ + Enabled: true, + Levels: []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"}, + {Domain: 
corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"}, + }, + }, + expectErrors: 2, + expectedFields: []string{"clusterTopology.levels[1].domain", "clusterTopology.levels[1].key"}, + expectedTypes: []field.ErrorType{field.ErrorTypeDuplicate, field.ErrorTypeDuplicate}, + }, + { + name: "invalid: multiple unsupported domains", + config: configv1alpha1.ClusterTopologyConfiguration{ + Enabled: true, + Levels: []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"}, + {Domain: "invalid1", Key: "key1"}, + {Domain: "invalid2", Key: "key2"}, + }, }, - expectError: true, - errorField: "clusterTopology.levels", + expectErrors: 2, + expectedFields: []string{"clusterTopology.levels[1].domain", "clusterTopology.levels[2].domain"}, + expectedTypes: []field.ErrorType{field.ErrorTypeInvalid, field.ErrorTypeInvalid}, + }, + { + name: "invalid: multiple validation errors - unsupported domain and duplicates", + config: configv1alpha1.ClusterTopologyConfiguration{ + Enabled: true, + Levels: []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"}, + {Domain: "invalid", Key: "invalid.key"}, + {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"}, + }, + }, + expectErrors: 3, + expectedFields: []string{"clusterTopology.levels[1].domain", "clusterTopology.levels[2].domain", "clusterTopology.levels[2].key"}, + expectedTypes: []field.ErrorType{field.ErrorTypeInvalid, field.ErrorTypeDuplicate, field.ErrorTypeDuplicate}, }, } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - errs := validateClusterTopologyConfiguration(tt.config, field.NewPath("clusterTopology")) + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + errs := validateClusterTopologyConfiguration(test.config, field.NewPath("clusterTopology")) + + assert.Len(t, errs, test.expectErrors, "expected %d validation errors but got %d: %v", test.expectErrors, len(errs), errs) - if tt.expectError { - assert.NotEmpty(t, errs, "expected validation errors but got none") - if len(errs) > 0 { - assert.Equal(t, tt.errorField, errs[0].Field) + if test.expectErrors > 0 { + // Verify each expected error + for i, expectedField := range test.expectedFields { + assert.Equal(t, expectedField, errs[i].Field, "error %d: expected field %s but got %s", i, expectedField, errs[i].Field) + if i < len(test.expectedTypes) { + assert.Equal(t, test.expectedTypes[i], errs[i].Type, "error %d: expected type %s but got %s", i, test.expectedTypes[i], errs[i].Type) + } } - } else { - assert.Empty(t, errs, "expected no validation errors but got: %v", errs) } }) } diff --git a/operator/api/core/v1alpha1/clustertopology.go b/operator/api/core/v1alpha1/clustertopology.go index b06da417a..5de382283 100644 --- a/operator/api/core/v1alpha1/clustertopology.go +++ b/operator/api/core/v1alpha1/clustertopology.go @@ -17,12 +17,16 @@ package v1alpha1 import ( + "slices" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) const ( - // ClusterTopologyName is the name of the default ClusterTopology resource managed by the operator. - ClusterTopologyName = "grove-topology" + // DefaultClusterTopologyName is the name of the default ClusterTopology resource managed by the operator. + // Currently, Grove operator generates and maintains a single ClusterTopology resource per cluster. There is no support + // for externally created ClusterTopology resources. However, this may change in the future to allow more flexibility. 
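+	// The operator reconciles the resource with this name at startup; see EnsureClusterTopology.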
+	DefaultClusterTopologyName = "grove-topology"
 )
 
 // +genclient
@@ -62,9 +66,27 @@ type ClusterTopologySpec struct {
 	Levels []TopologyLevel `json:"levels"`
 }
 
-// TopologyDomain represents a predefined topology level in the hierarchy.
-// Topology ordering (broadest to narrowest):
-// Region > Zone > DataCenter > Block > Rack > Host > Numa
+// TopologyLevel defines a single level in the topology hierarchy.
+// Maps a platform-agnostic domain to a platform-specific node label key,
+// allowing workload operators a consistent way to reference topology levels when defining TopologyConstraints.
+type TopologyLevel struct {
+	// Domain is a platform provider-agnostic level identifier.
+	// Must be one of: region, zone, datacenter, block, rack, host, numa
+	// +kubebuilder:validation:Required
+	// +kubebuilder:validation:Enum=region;zone;datacenter;block;rack;host;numa
+	Domain TopologyDomain `json:"domain"`
+
+	// Key is the node label key that identifies this topology domain.
+	// Must be a valid Kubernetes label key (qualified name).
+	// Examples: "topology.kubernetes.io/zone", "kubernetes.io/hostname"
+	// +kubebuilder:validation:Required
+	// +kubebuilder:validation:MinLength=1
+	// +kubebuilder:validation:MaxLength=63
+	// +kubebuilder:validation:Pattern=`^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]/)?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$`
+	Key string `json:"key"`
+}
+
+// TopologyDomain represents a level in the cluster topology hierarchy.
 type TopologyDomain string
 
 const (
@@ -84,8 +106,20 @@ const (
 	TopologyDomainNuma TopologyDomain = "numa"
 )
 
+// SupportedTopologyDomains returns all supported topology domain values.
+func SupportedTopologyDomains() []TopologyDomain {
+	topologyDomains := make([]TopologyDomain, 0, len(topologyDomainOrder))
+	for domain := range topologyDomainOrder {
+		topologyDomains = append(topologyDomains, domain)
+	}
+	return topologyDomains
+}
+
 // topologyDomainOrder defines the hierarchical order of topology domains from broadest to narrowest.
 // Lower value = broader scope (e.g., Region is broader than Zone).
+// Context: TopologyDomain is used when defining TopologyConstraint in workload specifications. TopologyConstraint at different
+// levels (PodClique, PodCliqueScalingGroup and PodCliqueSet) captures required packing guarantees that a scheduler backend
+// must provide.
 var topologyDomainOrder = map[TopologyDomain]int{
 	TopologyDomainRegion: 0,
 	TopologyDomainZone:   1,
@@ -96,64 +130,9 @@ var topologyDomainOrder = map[TopologyDomain]int{
 	TopologyDomainNuma:   6,
 }
 
-// Compare compares this domain with another domain.
-// Returns:
-//   - negative value if this domain is broader than other
-//   - zero if domains are equal
-//   - positive value if this domain is narrower than other
-//
-// This method assumes both domains are valid (enforced by kubebuilder validation).
-func (d TopologyDomain) Compare(other TopologyDomain) int {
-	return topologyDomainOrder[d] - topologyDomainOrder[other]
-}
-
-// BroaderThan returns true if this domain represents a broader scope than the other domain.
-// For example, Region.BroaderThan(Zone) returns true.
-func (d TopologyDomain) BroaderThan(other TopologyDomain) bool {
-	return d.Compare(other) < 0
-}
-
-// NarrowerThan returns true if this domain represents a narrower scope than the other domain.
-// For example, Zone.NarrowerThan(Region) returns true.
-func (d TopologyDomain) NarrowerThan(other TopologyDomain) bool {
-	return d.Compare(other) > 0
-}
-
-// TopologyLevel defines a single level in the topology hierarchy.
-type TopologyLevel struct { - // Domain is the predefined level identifier used in TopologyConstraint references. - // Must be one of: region, zone, datacenter, block, rack, host, numa - // +kubebuilder:validation:Required - // +kubebuilder:validation:Enum=region;zone;datacenter;block;rack;host;numa - Domain TopologyDomain `json:"domain"` - - // Key is the node label key that identifies this topology domain. - // Must be a valid Kubernetes label key (qualified name). - // Examples: "topology.kubernetes.io/zone", "kubernetes.io/hostname" - // +kubebuilder:validation:Required - // +kubebuilder:validation:MinLength=1 - // +kubebuilder:validation:MaxLength=63 - // +kubebuilder:validation:Pattern=`^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]/)?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$` - Key string `json:"key"` -} - -// Compare compares this topology level with another level based on their domains. -// Returns: -// - negative value if this level is broader than other -// - zero if levels have equal domain scope -// - positive value if this level is narrower than other -func (l TopologyLevel) Compare(other TopologyLevel) int { - return l.Domain.Compare(other.Domain) -} - -// BroaderThan returns true if this level represents a broader scope than the other level. -// For example, a Region level is broader than a Zone level. -func (l TopologyLevel) BroaderThan(other TopologyLevel) bool { - return l.Domain.BroaderThan(other.Domain) -} - -// NarrowerThan returns true if this level represents a narrower scope than the other level. -// For example, a Zone level is narrower than a Region level. -func (l TopologyLevel) NarrowerThan(other TopologyLevel) bool { - return l.Domain.NarrowerThan(other.Domain) +// SortTopologyLevels sorts a slice of TopologyLevel from broadest to narrowest scope, sorting by their Domain. 
+func SortTopologyLevels(levels []TopologyLevel) { + slices.SortFunc(levels, func(a, b TopologyLevel) int { + return topologyDomainOrder[a.Domain] - topologyDomainOrder[b.Domain] + }) } diff --git a/operator/api/core/v1alpha1/clustertopology_test.go b/operator/api/core/v1alpha1/clustertopology_test.go index cfc39333f..dce6fe9bc 100644 --- a/operator/api/core/v1alpha1/clustertopology_test.go +++ b/operator/api/core/v1alpha1/clustertopology_test.go @@ -18,117 +18,136 @@ package v1alpha1 import ( "testing" + + "github.com/stretchr/testify/assert" ) -func TestTopologyDomainComparison(t *testing.T) { +func TestSupportedTopologyDomains(t *testing.T) { + expectedDomains := []TopologyDomain{ + TopologyDomainRegion, + TopologyDomainZone, + TopologyDomainDataCenter, + TopologyDomainBlock, + TopologyDomainRack, + TopologyDomainHost, + TopologyDomainNuma, + } + + domains := SupportedTopologyDomains() + + // Verify all expected domains are present and no duplicates + seen := make(map[TopologyDomain]bool) + for _, domain := range domains { + assert.False(t, seen[domain], "domain %s should not be duplicated", domain) + seen[domain] = true + } + // Verify all expected domains are included + for _, expected := range expectedDomains { + assert.True(t, seen[expected], "domain %s should be in the returned list", expected) + } +} + +func TestSortTopologyLevels(t *testing.T) { tests := []struct { - name string - domain TopologyDomain - other TopologyDomain - expectedCompare int - expectedBroaderThan bool - expectedNarrowerThan bool + name string + input []TopologyLevel + expected []TopologyLevel }{ { - name: "region vs zone", - domain: TopologyDomainRegion, - other: TopologyDomainZone, - expectedCompare: -1, - expectedBroaderThan: true, - expectedNarrowerThan: false, - }, - { - name: "zone vs region", - domain: TopologyDomainZone, - other: TopologyDomainRegion, - expectedCompare: 1, - expectedBroaderThan: false, - expectedNarrowerThan: true, - }, - { - name: "region vs region (equal)", - domain: TopologyDomainRegion, - other: TopologyDomainRegion, - expectedCompare: 0, - expectedBroaderThan: false, - expectedNarrowerThan: false, - }, - { - name: "host vs numa", - domain: TopologyDomainHost, - other: TopologyDomainNuma, - expectedCompare: -1, - expectedBroaderThan: true, - expectedNarrowerThan: false, + name: "empty slice", + input: []TopologyLevel{}, + expected: []TopologyLevel{}, }, { - name: "region vs numa (multiple levels apart)", - domain: TopologyDomainRegion, - other: TopologyDomainNuma, - expectedCompare: -6, - expectedBroaderThan: true, - expectedNarrowerThan: false, + name: "single level", + input: []TopologyLevel{ + {Domain: TopologyDomainZone, Key: "zone-key"}, + }, + expected: []TopologyLevel{ + {Domain: TopologyDomainZone, Key: "zone-key"}, + }, }, { - name: "numa vs region (multiple levels apart)", - domain: TopologyDomainNuma, - other: TopologyDomainRegion, - expectedCompare: 6, - expectedBroaderThan: false, - expectedNarrowerThan: true, + name: "already sorted", + input: []TopologyLevel{ + {Domain: TopologyDomainRegion, Key: "region-key"}, + {Domain: TopologyDomainZone, Key: "zone-key"}, + {Domain: TopologyDomainHost, Key: "host-key"}, + }, + expected: []TopologyLevel{ + {Domain: TopologyDomainRegion, Key: "region-key"}, + {Domain: TopologyDomainZone, Key: "zone-key"}, + {Domain: TopologyDomainHost, Key: "host-key"}, + }, }, { - name: "datacenter vs rack", - domain: TopologyDomainDataCenter, - other: TopologyDomainRack, - expectedCompare: -2, - expectedBroaderThan: true, - expectedNarrowerThan: 
false,
+			name: "reverse sorted",
+			input: []TopologyLevel{
+				{Domain: TopologyDomainHost, Key: "host-key"},
+				{Domain: TopologyDomainZone, Key: "zone-key"},
+				{Domain: TopologyDomainRegion, Key: "region-key"},
+			},
+			expected: []TopologyLevel{
+				{Domain: TopologyDomainRegion, Key: "region-key"},
+				{Domain: TopologyDomainZone, Key: "zone-key"},
+				{Domain: TopologyDomainHost, Key: "host-key"},
+			},
 		},
 		{
-			name:                 "rack vs datacenter",
-			domain:               TopologyDomainRack,
-			other:                TopologyDomainDataCenter,
-			expectedCompare:      2,
-			expectedBroaderThan:  false,
-			expectedNarrowerThan: true,
+			name: "random order",
+			input: []TopologyLevel{
+				{Domain: TopologyDomainRack, Key: "rack-key"},
+				{Domain: TopologyDomainRegion, Key: "region-key"},
+				{Domain: TopologyDomainNuma, Key: "numa-key"},
+				{Domain: TopologyDomainZone, Key: "zone-key"},
+			},
+			expected: []TopologyLevel{
+				{Domain: TopologyDomainRegion, Key: "region-key"},
+				{Domain: TopologyDomainZone, Key: "zone-key"},
+				{Domain: TopologyDomainRack, Key: "rack-key"},
+				{Domain: TopologyDomainNuma, Key: "numa-key"},
+			},
 		},
 		{
-			name:                 "rack vs zone (narrower vs broader)",
-			domain:               TopologyDomainRack,
-			other:                TopologyDomainZone,
-			expectedCompare:      3,
-			expectedBroaderThan:  false,
-			expectedNarrowerThan: true,
-		},
-		{
-			name:                 "zone vs rack (broader vs narrower)",
-			domain:               TopologyDomainZone,
-			other:                TopologyDomainRack,
-			expectedCompare:      -3,
-			expectedBroaderThan:  true,
-			expectedNarrowerThan: false,
+			name: "all domains",
+			input: []TopologyLevel{
+				{Domain: TopologyDomainNuma, Key: "numa-key"},
+				{Domain: TopologyDomainHost, Key: "host-key"},
+				{Domain: TopologyDomainRack, Key: "rack-key"},
+				{Domain: TopologyDomainBlock, Key: "block-key"},
+				{Domain: TopologyDomainDataCenter, Key: "dc-key"},
+				{Domain: TopologyDomainZone, Key: "zone-key"},
+				{Domain: TopologyDomainRegion, Key: "region-key"},
+			},
+			expected: []TopologyLevel{
+				{Domain: TopologyDomainRegion, Key: "region-key"},
+				{Domain: TopologyDomainZone, Key: "zone-key"},
+				{Domain: TopologyDomainDataCenter, Key: "dc-key"},
+				{Domain: TopologyDomainBlock, Key: "block-key"},
+				{Domain: TopologyDomainRack, Key: "rack-key"},
+				{Domain: TopologyDomainHost, Key: "host-key"},
+				{Domain: TopologyDomainNuma, Key: "numa-key"},
+			},
 		},
 	}
 
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
-			// Test Compare()
-			gotCompare := tt.domain.Compare(tt.other)
-			if gotCompare != tt.expectedCompare {
-				t.Errorf("Compare() = %v, want %v", gotCompare, tt.expectedCompare)
-			}
+			// Make a copy to avoid modifying test data
+			input := make([]TopologyLevel, len(tt.input))
+			copy(input, tt.input)
+
+			// Sort the input
+			SortTopologyLevels(input)
 
-			// Test BroaderThan()
-			gotBroaderThan := tt.domain.BroaderThan(tt.other)
-			if gotBroaderThan != tt.expectedBroaderThan {
-				t.Errorf("BroaderThan() = %v, want %v", gotBroaderThan, tt.expectedBroaderThan)
+			// Check if sorted correctly
+			if len(input) != len(tt.expected) {
+				t.Fatalf("length mismatch: got %d, want %d", len(input), len(tt.expected))
 			}
 
-			// Test NarrowerThan()
-			gotNarrowerThan := tt.domain.NarrowerThan(tt.other)
-			if gotNarrowerThan != tt.expectedNarrowerThan {
-				t.Errorf("NarrowerThan() = %v, want %v", gotNarrowerThan, tt.expectedNarrowerThan)
+			for i := range input {
+				assert.Equal(t, tt.expected[i].Domain, input[i].Domain, "at index %d: got domain %v, want %v", i, input[i].Domain, tt.expected[i].Domain)
+				assert.Equal(t, tt.expected[i].Key, input[i].Key, "at index %d: got key %v, want %v", i, input[i].Key, tt.expected[i].Key)
 			}
 		})
 	}
diff --git
a/operator/api/core/v1alpha1/crds/grove.io_clustertopologies.yaml b/operator/api/core/v1alpha1/crds/grove.io_clustertopologies.yaml
index 5328e0aa6..47630912e 100644
--- a/operator/api/core/v1alpha1/crds/grove.io_clustertopologies.yaml
+++ b/operator/api/core/v1alpha1/crds/grove.io_clustertopologies.yaml
@@ -49,12 +49,14 @@ spec:
                 The order in this list defines the hierarchy (index 0 = broadest level).
                 This field is immutable after creation.
               items:
-                description: TopologyLevel defines a single level in the topology
-                  hierarchy.
+                description: |-
+                  TopologyLevel defines a single level in the topology hierarchy.
+                  Maps a platform-agnostic domain to a platform-specific node label key,
+                  allowing workload operators a consistent way to reference topology levels when defining TopologyConstraints.
                 properties:
                   domain:
                     description: |-
-                      Domain is the predefined level identifier used in TopologyConstraint references.
+                      Domain is a platform provider-agnostic level identifier.
                       Must be one of: region, zone, datacenter, block, rack, host, numa
                     enum:
                     - region
diff --git a/operator/api/core/v1alpha1/register.go b/operator/api/core/v1alpha1/register.go
index b397b95bd..c400e30f2 100644
--- a/operator/api/core/v1alpha1/register.go
+++ b/operator/api/core/v1alpha1/register.go
@@ -17,19 +17,16 @@ package v1alpha1
 
 import (
+	"github.com/ai-dynamo/grove/operator/api/common/constants"
+
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/runtime/schema"
 )
 
-const (
-	// GroupName is the name of the group for all resources defined in this package.
-	GroupName = "grove.io"
-)
-
 var (
 	// SchemeGroupVersion is group version used to register these objects
-	SchemeGroupVersion = schema.GroupVersion{Group: GroupName, Version: "v1alpha1"}
+	SchemeGroupVersion = schema.GroupVersion{Group: constants.OperatorGroupName, Version: "v1alpha1"}
 	// SchemeBuilder is used to add go types to the GroupVersionKind scheme
 	SchemeBuilder = runtime.NewSchemeBuilder(addKnownTypes)
 	// AddToScheme is a reference to the Scheme Builder's AddToScheme function.
diff --git a/operator/charts/templates/clusterrole.yaml b/operator/charts/templates/clusterrole.yaml
index fa0491943..5fcd04eaf 100644
--- a/operator/charts/templates/clusterrole.yaml
+++ b/operator/charts/templates/clusterrole.yaml
@@ -27,7 +27,6 @@ rules:
   - podcliques/status
   - podcliquescalinggroups
   - clustertopologies
-  - clustertopologies/status
   - podcliquescalinggroups/status
   verbs:
   - create
diff --git a/operator/charts/values.yaml b/operator/charts/values.yaml
index af1a13fad..0f3528ea9 100644
--- a/operator/charts/values.yaml
+++ b/operator/charts/values.yaml
@@ -62,6 +62,21 @@ config:
   logFormat: json
   clusterTopology:
     enabled: false
+#    levels:
+#    - domain: region
+#      key: topology.kubernetes.io/region
+#    - domain: zone
+#      key: topology.kubernetes.io/zone
+#    - domain: datacenter
+#      key:
+#    - domain: block
+#      key:
+#    - domain: rack
+#      key:
+#    - domain: host
+#      key: kubernetes.io/hostname
+#    - domain: numa
+#      key:
   authorizer:
     enabled: false
     exemptServiceAccountUserNames:
diff --git a/operator/cmd/cli/cli.go b/operator/cmd/cli/cli.go
new file mode 100644
index 000000000..e44c64704
--- /dev/null
+++ b/operator/cmd/cli/cli.go
@@ -0,0 +1,124 @@
+// /*
+// Copyright 2025 The Grove Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// */ + +package cli + +import ( + "fmt" + "os" + + apicommonconstants "github.com/ai-dynamo/grove/operator/api/common/constants" + configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1" + operatorvalidation "github.com/ai-dynamo/grove/operator/api/config/validation" + "k8s.io/apimachinery/pkg/runtime/serializer" + + "github.com/spf13/pflag" + "k8s.io/apimachinery/pkg/runtime" +) + +const ( + // ExitSuccess is the exit code indicating that the application exited with no error. + ExitSuccess = iota + // ExitErrParseCLIArgs is the exit code indicating that the application exited due to an error parsing CLI arguments. + ExitErrParseCLIArgs + // ExitErrLoadOperatorConfig indicates that the application exited due to an error loading the operator configuration. + ExitErrLoadOperatorConfig + // ExitErrSynchronizeTopology indicates that the application exited due to an error synchronizing the cluster topology. + ExitErrSynchronizeTopology + // ExitErrInitializeManager indicates that the application exited due to an error initializing the manager. + // This includes registration of controllers and webhooks and setting up probes. + ExitErrInitializeManager + // ExitErrStart indicates that the application exited due to an error when starting the application. + ExitErrStart +) + +var ( + errParseCLIArgs = fmt.Errorf("error parsing cli arguments") + errMissingLaunchOption = fmt.Errorf("missing required launch option") + errLoadOperatorConfig = fmt.Errorf("cannot load %q operator config", apicommonconstants.OperatorName) + errInvalidOperatorConfig = fmt.Errorf("invalid %q operator config", apicommonconstants.OperatorName) +) + +// LaunchOptions defines options for launching the operator. +type LaunchOptions struct { + // ConfigFile is the path to the operator configuration file. + ConfigFile string + // Version indicates whether to print the operator version and exit. + Version bool +} + +// ParseLaunchOptions parses the CLI arguments for the operator. +func ParseLaunchOptions(cliArgs []string) (*LaunchOptions, error) { + launchOpts := &LaunchOptions{} + flagSet := pflag.NewFlagSet(apicommonconstants.OperatorName, pflag.ContinueOnError) + launchOpts.mapFlags(flagSet) + if err := flagSet.Parse(cliArgs); err != nil { + return nil, fmt.Errorf("%w: %w", errParseCLIArgs, err) + } + if err := launchOpts.validate(); err != nil { + return nil, err + } + return launchOpts, nil +} + +// LoadAndValidateOperatorConfig loads and validates the operator configuration from the specified path in the launch options. 
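+// Validation failures are aggregated into a single error that wraps errInvalidOperatorConfig.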
+func (o *LaunchOptions) LoadAndValidateOperatorConfig() (*configv1alpha1.OperatorConfiguration, error) { + operatorConfig, err := o.loadOperatorConfig() + if err != nil { + return nil, err + } + if errs := operatorvalidation.ValidateOperatorConfiguration(operatorConfig); len(errs) > 0 { + return nil, fmt.Errorf("%w: %w", errInvalidOperatorConfig, errs.ToAggregate()) + } + return operatorConfig, nil +} + +func (o *LaunchOptions) loadOperatorConfig() (*configv1alpha1.OperatorConfiguration, error) { + // Set up scheme and decoder for operator configuration + configScheme := runtime.NewScheme() + if err := configv1alpha1.AddToScheme(configScheme); err != nil { + return nil, fmt.Errorf("%w: error adding to scheme: %w", errLoadOperatorConfig, err) + } + configDecoder := serializer.NewCodecFactory(configScheme).UniversalDecoder() + + // Read configuration file + operatorConfigBytes, err := os.ReadFile(o.ConfigFile) + if err != nil { + return nil, fmt.Errorf("%w: error reading file: %w", errLoadOperatorConfig, err) + } + + // Decode configuration + operatorConfig := &configv1alpha1.OperatorConfiguration{} + if err = runtime.DecodeInto(configDecoder, operatorConfigBytes, operatorConfig); err != nil { + return nil, fmt.Errorf("%w: error decoding operator config: %w", errLoadOperatorConfig, err) + } + return operatorConfig, nil +} + +func (o *LaunchOptions) mapFlags(fs *pflag.FlagSet) { + fs.StringVar(&o.ConfigFile, "config", "", "Path to the operator configuration file.") + fs.BoolVarP(&o.Version, "version", "v", false, "Print the version and exit.") +} + +func (o *LaunchOptions) validate() error { + if len(o.ConfigFile) == 0 && !o.Version { + return fmt.Errorf("%w: one of version or operator config file must be specified", errMissingLaunchOption) + } + if len(o.ConfigFile) > 0 && o.Version { + return fmt.Errorf("%w: both version and operator config file cannot be specified", errMissingLaunchOption) + } + return nil +} diff --git a/operator/cmd/cli/cli_test.go b/operator/cmd/cli/cli_test.go new file mode 100644 index 000000000..19b8b67c8 --- /dev/null +++ b/operator/cmd/cli/cli_test.go @@ -0,0 +1,171 @@ +// /* +// Copyright 2025 The Grove Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// */ + +package cli + +import ( + "errors" + "testing" + + apicommonconstants "github.com/ai-dynamo/grove/operator/api/common/constants" + configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1" + + "github.com/spf13/pflag" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// TestParseLaunchOptions tests the parsing of CLI arguments into LaunchOptions. 
+func TestParseLaunchOptions(t *testing.T) { + tests := []struct { + name string + args []string + expected *LaunchOptions + expectedError error + }{ + { + name: "valid config file", + args: []string{"--config", "testdata/valid-config.yaml"}, + expected: &LaunchOptions{ + ConfigFile: "testdata/valid-config.yaml", + }, + }, + { + name: "no CLI args", + args: []string{}, + expected: nil, + expectedError: errMissingLaunchOption, + }, + { + name: "unknown CLI flag", + args: []string{"--unknown-flag", "value"}, + expected: nil, + expectedError: errParseCLIArgs, + }, + { + name: "invalid CLI flag format", + args: []string{"---config", "test.yaml"}, + expected: nil, + expectedError: errParseCLIArgs, + }, + { + name: "no value specified for config-file flag", + args: []string{"--config"}, + expected: nil, + expectedError: errParseCLIArgs, + }, + { + name: "multiple config_file flags", // last one should take precedence + args: []string{"--config", "first.yaml", "--config", "second.yaml"}, + expected: &LaunchOptions{ + ConfigFile: "second.yaml", + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + opts, err := ParseLaunchOptions(test.args) + + if test.expectedError != nil { + require.Error(t, err) + assert.True(t, errors.Is(err, test.expectedError), + "expected error to wrap %v, got %v", test.expectedError, err) + } else { + require.NoError(t, err) + require.NotNil(t, opts) + assert.Equal(t, test.expected.ConfigFile, opts.ConfigFile) + } + }) + } +} + +// TestMapFlags tests that flags are properly mapped to the LaunchOptions struct. +func TestMapFlags(t *testing.T) { + opts := &LaunchOptions{} + flagSet := pflag.NewFlagSet(apicommonconstants.OperatorName, pflag.ContinueOnError) + opts.mapFlags(flagSet) + + // Verify the config-file flag was registered + flag := flagSet.Lookup("config") + require.NotNil(t, flag, "config flag should be registered") + assert.Equal(t, "", flag.DefValue, "config should have empty default value") + assert.Equal(t, "string", flag.Value.Type(), "config should be a string flag") +} + +// TestLoadAndValidateOperatorConfig tests loading and validating operator configuration. 
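+// Fixtures under testdata/ exercise both failure modes: the invalid fixture fails semantic validation (negative burst), while a missing file fails loading.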
+func TestLoadAndValidateOperatorConfig(t *testing.T) {
+	tests := []struct {
+		name          string
+		configFile    string
+		expectedError error
+		validateFunc  func(t *testing.T, config *configv1alpha1.OperatorConfiguration)
+	}{
+		{
+			name:       "valid config",
+			configFile: "testdata/valid-config.yaml",
+			validateFunc: func(t *testing.T, config *configv1alpha1.OperatorConfiguration) {
+				require.NotNil(t, config)
+				assert.Equal(t, configv1alpha1.LogLevel("info"), config.LogLevel)
+				assert.Equal(t, configv1alpha1.LogFormat("json"), config.LogFormat)
+				assert.Equal(t, float32(100), config.ClientConnection.QPS)
+				assert.Equal(t, 150, config.ClientConnection.Burst)
+				assert.Equal(t, 9443, config.Server.Webhooks.Port)
+				assert.NotNil(t, config.Server.HealthProbes)
+				assert.Equal(t, 9444, config.Server.HealthProbes.Port)
+				assert.NotNil(t, config.Server.Metrics)
+				assert.Equal(t, 9445, config.Server.Metrics.Port)
+				assert.NotNil(t, config.Controllers.PodCliqueSet.ConcurrentSyncs)
+				assert.Equal(t, 3, *config.Controllers.PodCliqueSet.ConcurrentSyncs)
+			},
+		},
+		{
+			name:          "invalid operator config",
+			configFile:    "testdata/invalid-config.yaml",
+			expectedError: errInvalidOperatorConfig,
+		},
+		{
+			name:          "nonexistent config file",
+			configFile:    "testdata/nonexistent.yaml",
+			expectedError: errLoadOperatorConfig,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			opts := &LaunchOptions{
+				ConfigFile: test.configFile,
+			}
+
+			config, err := opts.LoadAndValidateOperatorConfig()
+
+			if test.expectedError != nil {
+				require.Error(t, err)
+				assert.True(t, errors.Is(err, test.expectedError),
+					"expected error to wrap %v, got %v", test.expectedError, err)
+				assert.Nil(t, config)
+			} else {
+				require.NoError(t, err)
+				require.NotNil(t, config)
+				if test.validateFunc != nil {
+					test.validateFunc(t, config)
+				}
+			}
+		})
+	}
+}
diff --git a/operator/cmd/cli/testdata/invalid-config.yaml b/operator/cmd/cli/testdata/invalid-config.yaml
new file mode 100644
index 000000000..34c36b9c3
--- /dev/null
+++ b/operator/cmd/cli/testdata/invalid-config.yaml
@@ -0,0 +1,26 @@
+apiVersion: operator.config.grove.io/v1alpha1
+kind: OperatorConfiguration
+runtimeClientConnection:
+  qps: 100
+  burst: -10 # invalid value
+leaderElection:
+  enabled: false
+server:
+  webhooks:
+    bindAddress: 0.0.0.0
+    port: 9443
+    serverCertDir: /etc/grove-operator/webhook-certs
+controllers:
+  podCliqueSet:
+    concurrentSyncs: 3
+  podClique:
+    concurrentSyncs: 3
+  podCliqueScalingGroup:
+    concurrentSyncs: 2
+logLevel: info
+logFormat: json
+authorizer:
+  enabled: false
+clusterTopology:
+  enabled: false
+
diff --git a/operator/cmd/cli/testdata/valid-config.yaml b/operator/cmd/cli/testdata/valid-config.yaml
new file mode 100644
index 000000000..570c4630c
--- /dev/null
+++ b/operator/cmd/cli/testdata/valid-config.yaml
@@ -0,0 +1,38 @@
+apiVersion: operator.config.grove.io/v1alpha1
+kind: OperatorConfiguration
+runtimeClientConnection:
+  qps: 100
+  burst: 150
+leaderElection:
+  enabled: true
+  leaseDuration: 15s
+  renewDeadline: 10s
+  retryPeriod: 2s
+  resourceLock: leases
+  resourceName: grove-operator-leader-election
+  resourceNamespace: default
+server:
+  webhooks:
+    bindAddress: 0.0.0.0
+    port: 9443
+    serverCertDir: /etc/grove-operator/webhook-certs
+  healthProbes:
+    bindAddress: 0.0.0.0
+    port: 9444
+  metrics:
+    bindAddress: 0.0.0.0
+    port: 9445
+controllers:
+  podCliqueSet:
+    concurrentSyncs: 3
+  podClique:
+    concurrentSyncs: 3
+  podCliqueScalingGroup:
+    concurrentSyncs: 2
+logLevel: info
+logFormat: json
+authorizer:
+  enabled: false
+clusterTopology:
+  enabled: false
+
diff --git a/operator/cmd/cli/version.go b/operator/cmd/cli/version.go
new file mode 100644
index 000000000..1a3f40433
--- /dev/null
+++ b/operator/cmd/cli/version.go
@@ -0,0 +1,68 @@
+// /*
+// Copyright 2025 The Grove Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// */
+
+package cli
+
+import (
+	"fmt"
+	"runtime/debug"
+
+	apicommonconstants "github.com/ai-dynamo/grove/operator/api/common/constants"
+)
+
+// GetBuildInfo gets the operator build information.
+// If verbose is true, it returns detailed build information; otherwise it returns only the application version.
+func GetBuildInfo(verbose bool) string {
+	info, ok := debug.ReadBuildInfo()
+	if !ok {
+		fmt.Printf("%s: binary build info not embedded\n", apicommonconstants.OperatorName)
+		return ""
+	}
+	if !verbose {
+		return fmt.Sprintf("%s version: %s\n", apicommonconstants.OperatorName, info.Main.Version)
+	}
+	return getVerboseBuildInfo(info)
+}
+
+func getVerboseBuildInfo(info *debug.BuildInfo) string {
+	// Git information
+	gitVersion := getSetting(info, "vcs.version")
+	commitTime := getSetting(info, "vcs.time")
+	dirty := getSetting(info, "vcs.modified")
+	// OS and Arch
+	os := getSetting(info, "GOOS")
+	arch := getSetting(info, "GOARCH")
+
+	return fmt.Sprintf(`
+%s Build Information:
+  Version: %s
+  Go version: %s
+  OS/Arch: %s/%s
+  Git Information:
+    Revision (commit hash): %s
+    Commit Time: %s
+    Modified (dirty): %s
+`, apicommonconstants.OperatorName, info.Main.Version, info.GoVersion, os, arch, gitVersion, commitTime, dirty)
+}
+
+func getSetting(info *debug.BuildInfo, key string) string {
+	for _, setting := range info.Settings {
+		if setting.Key == key {
+			return setting.Value
+		}
+	}
+	return ""
+}
diff --git a/operator/cmd/cli/version_test.go b/operator/cmd/cli/version_test.go
new file mode 100644
index 000000000..797bf6a4a
--- /dev/null
+++ b/operator/cmd/cli/version_test.go
@@ -0,0 +1,158 @@
+// /*
+// Copyright 2025 The Grove Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// */
+
+package cli
+
+import (
+	"runtime/debug"
+	"testing"
+
+	apicommonconstants "github.com/ai-dynamo/grove/operator/api/common/constants"
+
+	"github.com/stretchr/testify/assert"
+)
+
+// TestGetSetting tests the getSetting helper function.
+func TestGetSetting(t *testing.T) {
+	tests := []struct {
+		name     string
+		info     *debug.BuildInfo
+		key      string
+		expected string
+	}{
+		{
+			name: "setting exists",
+			info: &debug.BuildInfo{
+				Settings: []debug.BuildSetting{
+					{Key: "vcs.version", Value: "abc123"},
+					{Key: "vcs.time", Value: "2025-12-28T10:00:00Z"},
+					{Key: "GOOS", Value: "linux"},
+				},
+			},
+			key:      "vcs.version",
+			expected: "abc123",
+		},
+		{
+			name: "setting does not exist",
+			info: &debug.BuildInfo{
+				Settings: []debug.BuildSetting{
+					{Key: "vcs.version", Value: "abc123"},
+					{Key: "GOOS", Value: "linux"},
+				},
+			},
+			key:      "nonexistent.key",
+			expected: "",
+		},
+		{
+			name: "empty settings",
+			info: &debug.BuildInfo{
+				Settings: []debug.BuildSetting{},
+			},
+			key:      "vcs.version",
+			expected: "",
+		},
+		{
+			name: "settings is nil",
+			info: &debug.BuildInfo{
+				Settings: nil,
+			},
+			key:      "vcs.version",
+			expected: "",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result := getSetting(tt.info, tt.key)
+			assert.Equal(t, tt.expected, result)
+		})
+	}
+}
+
+// TestGetVerboseBuildInfo tests the getVerboseBuildInfo helper function.
+func TestGetVerboseBuildInfo(t *testing.T) {
+	tests := []struct {
+		name             string
+		info             *debug.BuildInfo
+		expectedContains []string
+		expectedNotEmpty bool
+	}{
+		{
+			name: "build information is complete",
+			info: &debug.BuildInfo{
+				Main: debug.Module{
+					Version: "v1.0.0",
+				},
+				GoVersion: "go1.21.5",
+				Settings: []debug.BuildSetting{
+					{Key: "vcs.version", Value: "abc123def456"},
+					{Key: "vcs.time", Value: "2025-12-28T10:00:00Z"},
+					{Key: "vcs.modified", Value: "false"},
+					{Key: "GOOS", Value: "linux"},
+					{Key: "GOARCH", Value: "amd64"},
+				},
+			},
+			expectedContains: []string{
+				apicommonconstants.OperatorName + " Build Information:",
+				"Version: v1.0.0",
+				"Go version: go1.21.5",
+				"OS/Arch: linux/amd64",
+				"Revision (commit hash): abc123def456",
+				"Commit Time: 2025-12-28T10:00:00Z",
+				"Modified (dirty): false",
+			},
+			expectedNotEmpty: true,
+		},
+		{
+			name: "build information is missing some settings",
+			info: &debug.BuildInfo{
+				Main: debug.Module{
+					Version: "v0.1.0-dev",
+				},
+				GoVersion: "go1.22.0",
+				Settings: []debug.BuildSetting{
+					{Key: "GOOS", Value: "darwin"},
+					{Key: "GOARCH", Value: "arm64"},
+				},
+			},
+			expectedContains: []string{
+				apicommonconstants.OperatorName + " Build Information:",
+				"Version: v0.1.0-dev",
+				"Go version: go1.22.0",
+				"OS/Arch: darwin/arm64",
+				"Revision (commit hash):",
+				"Commit Time:",
+				"Modified (dirty):",
+			},
+			expectedNotEmpty: true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result := getVerboseBuildInfo(tt.info)
+
+			if tt.expectedNotEmpty {
+				assert.NotEmpty(t, result)
+			}
+
+			for _, expected := range tt.expectedContains {
+				assert.Contains(t, result, expected,
+					"result should contain '%s'", expected)
+			}
+		})
+	}
+}
diff --git a/operator/cmd/main.go b/operator/cmd/main.go
index cbbc0349f..1418e8c48 100644
--- a/operator/cmd/main.go
+++ b/operator/cmd/main.go
@@ -17,21 +17,26 @@ package main
 
 import (
+	"context"
+	"errors"
 	"flag"
+	"fmt"
+	"io"
 	"os"
 
+	apicommonconstants "github.com/ai-dynamo/grove/operator/api/common/constants"
 	configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1"
 	corev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1"
-	groveopts "github.com/ai-dynamo/grove/operator/cmd/opts"
+	"github.com/ai-dynamo/grove/operator/cmd/cli"
+	"github.com/ai-dynamo/grove/operator/internal/clustertopology"
 	grovectrl "github.com/ai-dynamo/grove/operator/internal/controller"
"github.com/ai-dynamo/grove/operator/internal/controller/cert" grovelogger "github.com/ai-dynamo/grove/operator/internal/logger" - "github.com/ai-dynamo/grove/operator/internal/topology" groveversion "github.com/ai-dynamo/grove/operator/internal/version" + "sigs.k8s.io/controller-runtime/pkg/client" "github.com/spf13/pflag" ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/client" ) var ( @@ -40,89 +45,87 @@ var ( func main() { ctrl.SetLogger(grovelogger.MustNewLogger(false, configv1alpha1.InfoLevel, configv1alpha1.LogFormatJSON)) + groveInfo := groveversion.New() - fs := pflag.CommandLine - groveversion.AddFlags(fs) - cliOpts := groveopts.NewCLIOptions(fs) - - // parse and print command line flags - pflag.Parse() - groveversion.PrintVersionAndExitIfRequested() - - logger.Info("Starting grove operator", "version", groveversion.Get()) - printFlags() - - operatorCfg, err := initializeOperatorConfig(cliOpts) + launchOpts, err := cli.ParseLaunchOptions(os.Args[1:]) if err != nil { - logger.Error(err, "failed to initialize operator configuration") - os.Exit(1) + handleErrorAndExit(err, cli.ExitErrParseCLIArgs) + } + if launchOpts.Version { + _, _ = fmt.Fprintf(io.Writer(os.Stdout), "%s %v\n", apicommonconstants.OperatorName, groveInfo) + os.Exit(cli.ExitSuccess) + } + //time.Sleep(20 * time.Minute) + operatorConfig, err := launchOpts.LoadAndValidateOperatorConfig() + if err != nil { + logger.Error(err, "failed to load operator config") + handleErrorAndExit(err, cli.ExitErrLoadOperatorConfig) } - mgr, err := grovectrl.CreateManager(operatorCfg) + logger.Info("Starting grove operator", "grove-info", groveInfo.Verbose()) + printFlags() + + mgr, err := grovectrl.CreateManager(operatorConfig) if err != nil { logger.Error(err, "failed to create grove controller manager") os.Exit(1) } - topologyK8sClient, err := client.New(mgr.GetConfig(), client.Options{Scheme: mgr.GetScheme()}) ctx := ctrl.SetupSignalHandler() - if operatorCfg.ClusterTopology.Enabled { - logger.Info("Topology is enabled") - // create a Kubernetes client for cluster topology - // the default client manager is not running prior (mgr.Start()) - if err != nil { - logger.Error(err, "failed to create Kubernetes client") - os.Exit(1) - } - if err = topology.EnsureTopology(ctx, topologyK8sClient, - corev1alpha1.ClusterTopologyName, operatorCfg.ClusterTopology.Levels); err != nil { - logger.Error(err, "cannot create/update cluster topology, operator cannot start") - os.Exit(1) - } - } else { - logger.Info("Topology is disabled") - err = topology.EnsureDeleteClusterTopology(ctx, topologyK8sClient, corev1alpha1.ClusterTopologyName) - if err != nil { - logger.Error(err, "non-fatal: cannot delete cluster topology") - } + + // Initialize or clean up ClusterTopology based on operator configuration. + // This must be done before starting the controllers that may depend on the ClusterTopology resource. + // NOTE: In this version of the operator the synchronization will additionally ensure that the KAI Topology resource + // is created based on the ClusterTopology. When we introduce support for pluggable scheduler backends, + // handling of scheduler specified resources will be delegated to the backend scheduler controller. 
+ if err = synchronizeTopology(ctx, mgr, operatorConfig); err != nil { + logger.Error(err, "failed to synchronize cluster topology") + os.Exit(cli.ExitErrSynchronizeTopology) } webhookCertsReadyCh := make(chan struct{}) - if err = cert.ManageWebhookCerts(mgr, operatorCfg.Server.Webhooks.ServerCertDir, operatorCfg.Authorizer.Enabled, webhookCertsReadyCh); err != nil { + if err = cert.ManageWebhookCerts(mgr, operatorConfig.Server.Webhooks.ServerCertDir, operatorConfig.Authorizer.Enabled, webhookCertsReadyCh); err != nil { logger.Error(err, "failed to setup cert rotation") - os.Exit(1) + os.Exit(cli.ExitErrInitializeManager) } if err = grovectrl.SetupHealthAndReadinessEndpoints(mgr, webhookCertsReadyCh); err != nil { logger.Error(err, "failed to set up health and readiness for grove controller manager") - os.Exit(1) + os.Exit(cli.ExitErrInitializeManager) } // Certificates need to be generated before the webhooks are started, which can only happen once the manager is started. // Block while generating the certificates, and then start the webhooks. go func() { - if err = grovectrl.RegisterControllersAndWebhooks(mgr, logger, operatorCfg, webhookCertsReadyCh); err != nil { + if err = grovectrl.RegisterControllersAndWebhooks(mgr, logger, operatorConfig, webhookCertsReadyCh); err != nil { logger.Error(err, "failed to initialize grove controller manager") - os.Exit(1) + os.Exit(cli.ExitErrInitializeManager) } }() logger.Info("Starting manager") if err = mgr.Start(ctx); err != nil { - logger.Error(err, "Error running manager") - os.Exit(1) + logger.Error(err, "Error starting controller manager") + os.Exit(cli.ExitErrStart) } } -func initializeOperatorConfig(cliOpts *groveopts.CLIOptions) (*configv1alpha1.OperatorConfiguration, error) { - // complete and validate operator configuration - if err := cliOpts.Complete(); err != nil { - return nil, err +func synchronizeTopology(ctx context.Context, mgr ctrl.Manager, operatorCfg *configv1alpha1.OperatorConfiguration) error { + cl, err := client.New(mgr.GetConfig(), client.Options{Scheme: mgr.GetScheme()}) + if err != nil { + return fmt.Errorf("failed to create client for topology synchronization: %w", err) + } + if !operatorCfg.ClusterTopology.Enabled { + logger.Info("cluster topology is disabled, deleting existing ClusterTopology if any") + return clustertopology.DeleteClusterTopology(ctx, cl, corev1alpha1.DefaultClusterTopologyName) } - if err := cliOpts.Validate(); err != nil { - return nil, err + // create or update ClusterTopology based on configuration + clusterTopology, err := clustertopology.EnsureClusterTopology(ctx, cl, logger, corev1alpha1.DefaultClusterTopologyName, operatorCfg.ClusterTopology.Levels) + if err != nil { + return err } - return cliOpts.Config, nil + // create or update KAI Topology based on ClusterTopology + return clustertopology.EnsureKAITopology(ctx, cl, logger, corev1alpha1.DefaultClusterTopologyName, clusterTopology) } func printFlags() { @@ -132,3 +135,12 @@ func printFlags() { }) logger.Info("Running with flags", flagKVs...) } + +// handleErrorAndExit gracefully handles errors before exiting the program. 
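+// pflag.ErrHelp is treated as a successful exit so that --help output is not reported as an error.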
+func handleErrorAndExit(err error, exitCode int) { + if errors.Is(err, pflag.ErrHelp) { + os.Exit(cli.ExitSuccess) + } + _, _ = fmt.Fprintf(os.Stderr, "Err: %v\n", err) + os.Exit(exitCode) +} diff --git a/operator/cmd/opts/options.go b/operator/cmd/opts/options.go deleted file mode 100644 index 6f69608d1..000000000 --- a/operator/cmd/opts/options.go +++ /dev/null @@ -1,80 +0,0 @@ -// /* -// Copyright 2024 The Grove Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// */ - -package opts - -import ( - "fmt" - "os" - - configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1" - operatorvalidation "github.com/ai-dynamo/grove/operator/api/config/validation" - - "github.com/spf13/pflag" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/runtime/serializer" - utilruntime "k8s.io/apimachinery/pkg/util/runtime" -) - -var configDecoder runtime.Decoder - -func init() { - configScheme := runtime.NewScheme() - utilruntime.Must(configv1alpha1.AddToScheme(configScheme)) - configDecoder = serializer.NewCodecFactory(configScheme).UniversalDecoder() -} - -// CLIOptions provides convenience abstraction to initialize and validate OperatorConfiguration from CLI flags. -type CLIOptions struct { - configFile string - // Config is the operator configuration initialized from the CLI flags. - Config *configv1alpha1.OperatorConfiguration -} - -// NewCLIOptions creates a new CLIOptions and adds the required CLI flags to the flag.flagSet. -func NewCLIOptions(fs *pflag.FlagSet) *CLIOptions { - cliOpts := &CLIOptions{} - cliOpts.addFlags(fs) - return cliOpts -} - -// Complete reads the configuration file and decodes it into an OperatorConfiguration. -func (o *CLIOptions) Complete() error { - if len(o.configFile) == 0 { - return fmt.Errorf("missing config file") - } - data, err := os.ReadFile(o.configFile) - if err != nil { - return fmt.Errorf("error reading config file: %w", err) - } - o.Config = &configv1alpha1.OperatorConfiguration{} - if err = runtime.DecodeInto(configDecoder, data, o.Config); err != nil { - return fmt.Errorf("error decoding config: %w", err) - } - return nil -} - -// Validate validates the created OperatorConfiguration. -func (o *CLIOptions) Validate() error { - if errs := operatorvalidation.ValidateOperatorConfiguration(o.Config); errs != nil { - return errs.ToAggregate() - } - return nil -} - -func (o *CLIOptions) addFlags(fs *pflag.FlagSet) { - fs.StringVar(&o.configFile, "config", o.configFile, "Path to configuration file.") -} diff --git a/operator/hack/install-helm-charts.sh b/operator/hack/install-helm-charts.sh deleted file mode 100755 index 0f85dba5e..000000000 --- a/operator/hack/install-helm-charts.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env bash -# /* -# Copyright 2024 The Grove Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# */ - - -set -o errexit -set -o nounset -set -o pipefail - - -function check_prerequisites() { - if ! command -v helm &> /dev/null; then - echo "helm is not installed. Please install helm from https://helm.sh/docs/intro/install" - exit 1 - fi -} - -function verify_and_install_charts() { - -} - diff --git a/operator/initc/cmd/main.go b/operator/initc/cmd/main.go index f0ea4fabf..cae3e006f 100644 --- a/operator/initc/cmd/main.go +++ b/operator/initc/cmd/main.go @@ -26,7 +26,6 @@ import ( "github.com/ai-dynamo/grove/operator/initc/cmd/opts" "github.com/ai-dynamo/grove/operator/initc/internal" "github.com/ai-dynamo/grove/operator/internal/logger" - "github.com/ai-dynamo/grove/operator/internal/version" ) var ( @@ -41,9 +40,8 @@ func main() { log.Error(err, "Failed to generate configuration for the init container from the flags") os.Exit(1) } - version.PrintVersionAndExitIfRequested() - log.Info("Starting grove init container", "version", version.Get()) + log.Info("Starting grove init container") podCliqueDependencies, err := config.GetPodCliqueDependencies() if err != nil { diff --git a/operator/initc/cmd/opts/options.go b/operator/initc/cmd/opts/options.go index 3759b3dc9..a80f84fb8 100644 --- a/operator/initc/cmd/opts/options.go +++ b/operator/initc/cmd/opts/options.go @@ -23,8 +23,6 @@ import ( grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" groveerr "github.com/ai-dynamo/grove/operator/internal/errors" - "github.com/ai-dynamo/grove/operator/internal/version" - "github.com/spf13/pflag" ) @@ -47,7 +45,6 @@ func (c *CLIOptions) RegisterFlags(fs *pflag.FlagSet) { // --podcliques=: // --podcliques=podclique-a:3 --podcliques=podclique-b:4 and so on for each PodClique. pflag.StringArrayVarP(&c.podCliques, "podcliques", "p", nil, "podclique name and minAvailable replicas seperated by comma, repeated for each podclique") - version.AddFlags(fs) } // GetPodCliqueDependencies returns the PodClique information as a map with the minAvailable associated with each PodClique name. diff --git a/operator/internal/clustertopology/clustertopology.go b/operator/internal/clustertopology/clustertopology.go new file mode 100644 index 000000000..137b56324 --- /dev/null +++ b/operator/internal/clustertopology/clustertopology.go @@ -0,0 +1,154 @@ +// /* +// Copyright 2025 The Grove Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// */
+
+package clustertopology
+
+import (
+	"context"
+	"fmt"
+	"reflect"
+
+	corev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1"
+
+	kaitopologyv1alpha1 "github.com/NVIDIA/KAI-scheduler/pkg/apis/kai/v1alpha1"
+	"github.com/go-logr/logr"
+	"github.com/samber/lo"
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/runtime"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
+)
+
+// EnsureClusterTopology ensures that the ClusterTopology is created or updated in the cluster.
+func EnsureClusterTopology(ctx context.Context, cl client.Client, logger logr.Logger, name string, topologyLevels []corev1alpha1.TopologyLevel) (*corev1alpha1.ClusterTopology, error) {
+	desiredTopology := buildClusterTopology(name, topologyLevels)
+	existingTopology := &corev1alpha1.ClusterTopology{}
+	err := cl.Get(ctx, client.ObjectKey{Name: name}, existingTopology)
+	if err != nil {
+		// If not found, create a new ClusterTopology
+		if apierrors.IsNotFound(err) {
+			if err = cl.Create(ctx, desiredTopology); err != nil {
+				return nil, fmt.Errorf("failed to create ClusterTopology %s: %w", name, err)
+			}
+			logger.Info("Created ClusterTopology", "name", name)
+			return desiredTopology, nil
+		}
+		return nil, fmt.Errorf("failed to get ClusterTopology %s: %w", name, err)
+	}
+
+	// Update existing ClusterTopology if there are changes
+	if isClusterTopologyChanged(existingTopology, desiredTopology) {
+		existingTopology.Spec = desiredTopology.Spec
+		if err = cl.Update(ctx, existingTopology); err != nil {
+			return nil, fmt.Errorf("failed to update ClusterTopology %s: %w", name, err)
+		}
+		logger.Info("Updated ClusterTopology", "name", name)
+	}
+	return existingTopology, nil
+}
+
+// EnsureKAITopology ensures that the corresponding KAI Topology resource exists and is kept in sync with the given ClusterTopology.
+func EnsureKAITopology(ctx context.Context, cl client.Client, logger logr.Logger, name string, clusterTopology *corev1alpha1.ClusterTopology) error {
+	desiredTopology, err := buildKAITopology(name, clusterTopology, cl.Scheme())
+	if err != nil {
+		return fmt.Errorf("failed to build KAI Topology: %w", err)
+	}
+	existingTopology := &kaitopologyv1alpha1.Topology{}
+	if err = cl.Get(ctx, client.ObjectKey{Name: name}, existingTopology); err != nil {
+		if apierrors.IsNotFound(err) {
+			if err = cl.Create(ctx, desiredTopology); err != nil {
+				return fmt.Errorf("failed to create KAI Topology %s: %w", name, err)
+			}
+			logger.Info("Created KAI Topology", "name", name)
+			return nil
+		}
+		return fmt.Errorf("failed to get KAI Topology %s: %w", name, err)
+	}
+
+	// If the existing KAI Topology is not owned by the passed-in ClusterTopology, error out.
+	if !metav1.IsControlledBy(existingTopology, clusterTopology) {
+		return fmt.Errorf("KAI Topology %s is not owned by ClusterTopology %s; a KAI Topology with this name must be created by the Grove operator with the ClusterTopology as its controlling owner", name, clusterTopology.Name)
+	}
+
+	if isKAITopologyChanged(existingTopology, desiredTopology) {
+		// KAI Topology needs to be updated. Since KAI Topology has immutable levels, we need to delete and recreate it.
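+		// Note that delete-then-recreate is not atomic: a reader may briefly observe no KAI
+		// Topology between the two calls. This synchronization runs once at operator startup,
+		// so the window is expected to be brief.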
+ // See https://github.com/NVIDIA/KAI-Scheduler/blob/130ab4468f96b25b1899ad2eced0a072dff033e9/pkg/apis/kai/v1alpha1/topology_types.go#L60 + if err = cl.Delete(ctx, existingTopology); err != nil { + return fmt.Errorf("failed to recreate (action: delete) existing KAI Topology %s: %w", name, err) + } + if err = cl.Create(ctx, desiredTopology); err != nil { + return fmt.Errorf("failed to recreate (action: create) KAI Topology %s: %w", name, err) + } + logger.Info("Recreated KAI Topology with updated levels", "name", name) + } + return nil +} + +// DeleteClusterTopology deletes the ClusterTopology with the given name. +func DeleteClusterTopology(ctx context.Context, cl client.Client, name string) error { + if err := client.IgnoreNotFound(cl.Delete(ctx, &corev1alpha1.ClusterTopology{ + ObjectMeta: ctrl.ObjectMeta{ + Name: name, + }, + })); err != nil { + return fmt.Errorf("failed to delete ClusterTopology %s: %w", name, err) + } + return nil +} + +func buildClusterTopology(name string, topologyLevels []corev1alpha1.TopologyLevel) *corev1alpha1.ClusterTopology { + sortedTopologyLevels := make([]corev1alpha1.TopologyLevel, len(topologyLevels)) + copy(sortedTopologyLevels, topologyLevels) + corev1alpha1.SortTopologyLevels(sortedTopologyLevels) + return &corev1alpha1.ClusterTopology{ + ObjectMeta: ctrl.ObjectMeta{ + Name: name, + }, + Spec: corev1alpha1.ClusterTopologySpec{ + Levels: sortedTopologyLevels, + }, + } +} + +func isClusterTopologyChanged(oldTopology, newTopology *corev1alpha1.ClusterTopology) bool { + return !reflect.DeepEqual(oldTopology.Spec, newTopology.Spec) +} + +func buildKAITopology(name string, clusterTopology *corev1alpha1.ClusterTopology, scheme *runtime.Scheme) (*kaitopologyv1alpha1.Topology, error) { + kaiTopologyLevels := lo.Map(clusterTopology.Spec.Levels, func(clusterTopologyLevel corev1alpha1.TopologyLevel, _ int) kaitopologyv1alpha1.TopologyLevel { + return kaitopologyv1alpha1.TopologyLevel{ + NodeLabel: clusterTopologyLevel.Key, + } + }) + kaiTopology := &kaitopologyv1alpha1.Topology{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + }, + Spec: kaitopologyv1alpha1.TopologySpec{ + Levels: kaiTopologyLevels, + }, + } + if err := controllerutil.SetControllerReference(clusterTopology, kaiTopology, scheme); err != nil { + return nil, fmt.Errorf("failed to set owner reference for KAI Topology: %w", err) + } + return kaiTopology, nil +} + +func isKAITopologyChanged(oldTopology, newTopology *kaitopologyv1alpha1.Topology) bool { + return !reflect.DeepEqual(oldTopology.Spec.Levels, newTopology.Spec.Levels) +} diff --git a/operator/internal/clustertopology/clustertopology_test.go b/operator/internal/clustertopology/clustertopology_test.go new file mode 100644 index 000000000..e4be57a21 --- /dev/null +++ b/operator/internal/clustertopology/clustertopology_test.go @@ -0,0 +1,774 @@ +// /* +// Copyright 2025 The Grove Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// */ + +package clustertopology + +import ( + "context" + "errors" + "testing" + + apicommonconstants "github.com/ai-dynamo/grove/operator/api/common/constants" + corev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" + testutils "github.com/ai-dynamo/grove/operator/test/utils" + + kaitopologyv1alpha1 "github.com/NVIDIA/KAI-scheduler/pkg/apis/kai/v1alpha1" + "github.com/go-logr/logr" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/uuid" + "k8s.io/utils/ptr" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +const topologyName = "test-topology" + +func TestEnsureClusterTopologyWhenNonExists(t *testing.T) { + // Setup + ctx := context.Background() + cl := testutils.CreateDefaultFakeClient(nil) + logger := logr.Discard() + + topologyLevels := []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"}, + {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"}, + } + + // Test + topology, err := EnsureClusterTopology(ctx, cl, logger, topologyName, topologyLevels) + + // Assert + require.NoError(t, err) + assert.NotNil(t, topology) + assert.Equal(t, topologyName, topology.Name) + assert.Equal(t, topologyLevels, topology.Spec.Levels) + + // Verify it was actually created + fetched := &corev1alpha1.ClusterTopology{} + err = cl.Get(ctx, client.ObjectKey{Name: topologyName}, fetched) + require.NoError(t, err) + assert.Equal(t, topologyLevels, fetched.Spec.Levels) +} + +func TestEnsureClusterTopologyWhenUpdated(t *testing.T) { + // Setup + ctx := context.Background() + topologyLevels := []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"}, + {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"}, + } + // Create initial topology + existing := &corev1alpha1.ClusterTopology{ + ObjectMeta: metav1.ObjectMeta{ + Name: topologyName, + }, + Spec: corev1alpha1.ClusterTopologySpec{ + Levels: topologyLevels, + }, + } + cl := testutils.CreateDefaultFakeClient([]client.Object{existing}) + logger := logr.Discard() + + // Update with new levels and test + updatedTopologyLevels := []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainRegion, Key: "topology.kubernetes.io/region"}, + {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"}, + {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"}, + } + topology, err := EnsureClusterTopology(ctx, cl, logger, topologyName, updatedTopologyLevels) + + // Assert + require.NoError(t, err) + assert.NotNil(t, topology) + assert.Equal(t, updatedTopologyLevels, topology.Spec.Levels) + + // Verify it was actually updated + fetched := &corev1alpha1.ClusterTopology{} + err = cl.Get(ctx, client.ObjectKey{Name: topologyName}, fetched) + require.NoError(t, err) + assert.Equal(t, updatedTopologyLevels, fetched.Spec.Levels) +} + +func TestEnsureClusterTopologyWhenNoUpdate(t *testing.T) { + // Setup + ctx := context.Background() + topologyLevels := []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"}, + {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"}, + } + existing := &corev1alpha1.ClusterTopology{ + ObjectMeta: metav1.ObjectMeta{ + Name: topologyName, + ResourceVersion: "1", + }, + Spec: corev1alpha1.ClusterTopologySpec{ + Levels: 
topologyLevels, + }, + } + cl := testutils.CreateDefaultFakeClient([]client.Object{existing}) + logger := logr.Discard() + + // Test + topology, err := EnsureClusterTopology(ctx, cl, logger, topologyName, topologyLevels) + // Assert + require.NoError(t, err) + assert.NotNil(t, topology) + assert.Equal(t, topologyLevels, topology.Spec.Levels) + // Resource version should remain the same (no update occurred) + assert.Equal(t, "1", topology.ResourceVersion) +} + +func TestEnsureClusterTopologyWhenCreateError(t *testing.T) { + ctx := context.Background() + logger := logr.Discard() + topologyLevels := []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"}, + } + // Create a client that will fail on create + createErr := apierrors.NewInternalError(assert.AnError) + cl := testutils.NewTestClientBuilder(). + RecordErrorForObjects(testutils.ClientMethodCreate, createErr, client.ObjectKey{Name: topologyName}). + Build() + + topology, err := EnsureClusterTopology(ctx, cl, logger, topologyName, topologyLevels) + assert.Error(t, err) + assert.Nil(t, topology) + assert.True(t, errors.Is(err, createErr)) +} + +func TestEnsureClusterTopologyWhenUpdateError(t *testing.T) { + // Setup + ctx := context.Background() + topologyLevels := []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"}, + {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"}, + } + existing := &corev1alpha1.ClusterTopology{ + ObjectMeta: metav1.ObjectMeta{ + Name: topologyName, + }, + Spec: corev1alpha1.ClusterTopologySpec{ + Levels: topologyLevels, + }, + } + // Create a client that will fail on update + updateErr := apierrors.NewInternalError(assert.AnError) + cl := testutils.NewTestClientBuilder(). + WithObjects(existing). + RecordErrorForObjects(testutils.ClientMethodUpdate, updateErr, client.ObjectKey{Name: topologyName}). + Build() + logger := logr.Discard() + // Test + updatedTopologyLevels := []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainRegion, Key: "topology.kubernetes.io/region"}, + {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"}, + {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"}, + } + topology, err := EnsureClusterTopology(ctx, cl, logger, topologyName, updatedTopologyLevels) + assert.Error(t, err) + assert.Nil(t, topology) + assert.True(t, errors.Is(err, updateErr)) +} + +func TestEnsureClusterTopologyWhenGetError(t *testing.T) { + ctx := context.Background() + logger := logr.Discard() + + // Create a client that will fail on get (non-NotFound error) + getErr := apierrors.NewInternalError(assert.AnError) + cl := testutils.NewTestClientBuilder(). + RecordErrorForObjects(testutils.ClientMethodGet, getErr, client.ObjectKey{Name: topologyName}). 
+		Build()
+
+	topologyLevels := []corev1alpha1.TopologyLevel{
+		{Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"},
+	}
+	topology, err := EnsureClusterTopology(ctx, cl, logger, topologyName, topologyLevels)
+	assert.Error(t, err)
+	assert.Nil(t, topology)
+	assert.Contains(t, err.Error(), "failed to get ClusterTopology")
+}
+
+func TestEnsureKAITopologyWhenNonExists(t *testing.T) {
+	// Setup
+	ctx := context.Background()
+	cl := testutils.CreateDefaultFakeClient(nil)
+	logger := logr.Discard()
+
+	topologyLevels := []corev1alpha1.TopologyLevel{
+		{Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"},
+		{Domain: corev1alpha1.TopologyDomainRack, Key: "topology.kubernetes.io/rack"},
+		{Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"},
+	}
+	clusterTopology := &corev1alpha1.ClusterTopology{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: topologyName,
+			UID:  uuid.NewUUID(),
+		},
+		Spec: corev1alpha1.ClusterTopologySpec{
+			Levels: topologyLevels,
+		},
+	}
+
+	// Test
+	err := EnsureKAITopology(ctx, cl, logger, topologyName, clusterTopology)
+	require.NoError(t, err)
+
+	// Verify KAI Topology was created
+	kaiTopology := &kaitopologyv1alpha1.Topology{}
+	err = cl.Get(ctx, client.ObjectKey{Name: topologyName}, kaiTopology)
+	require.NoError(t, err)
+	assert.Equal(t, topologyName, kaiTopology.Name)
+	assert.Len(t, kaiTopology.Spec.Levels, 3)
+	assert.Equal(t, "topology.kubernetes.io/zone", kaiTopology.Spec.Levels[0].NodeLabel)
+	assert.Equal(t, "topology.kubernetes.io/rack", kaiTopology.Spec.Levels[1].NodeLabel)
+	assert.Equal(t, "kubernetes.io/hostname", kaiTopology.Spec.Levels[2].NodeLabel)
+
+	// Verify owner reference is set
+	assert.True(t, metav1.IsControlledBy(kaiTopology, clusterTopology))
+}
+
+func TestEnsureKAITopologyWhenClusterTopologyUpdated(t *testing.T) {
+	// Setup
+	ctx := context.Background()
+	topologyLevels := []corev1alpha1.TopologyLevel{
+		{Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"},
+		{Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"},
+	}
+	clusterTopology := &corev1alpha1.ClusterTopology{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: topologyName,
+			UID:  uuid.NewUUID(),
+		},
+		Spec: corev1alpha1.ClusterTopologySpec{
+			Levels: topologyLevels,
+		},
+	}
+
+	// Create existing KAI Topology with old levels
+	existingKAITopology := &kaitopologyv1alpha1.Topology{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: topologyName,
+			OwnerReferences: []metav1.OwnerReference{
+				{
+					APIVersion:         corev1alpha1.SchemeGroupVersion.String(),
+					Kind:               apicommonconstants.KindClusterTopology,
+					Name:               clusterTopology.Name,
+					UID:                clusterTopology.UID,
+					Controller:         ptr.To(true),
+					BlockOwnerDeletion: ptr.To(true),
+				},
+			},
+		},
+		Spec: kaitopologyv1alpha1.TopologySpec{
+			Levels: []kaitopologyv1alpha1.TopologyLevel{
+				{NodeLabel: "kubernetes.io/hostname"},
+			},
+		},
+	}
+
+	cl := testutils.CreateDefaultFakeClient([]client.Object{clusterTopology, existingKAITopology})
+	logger := logr.Discard()
+
+	err := EnsureKAITopology(ctx, cl, logger, topologyName, clusterTopology)
+	require.NoError(t, err)
+
+	// Verify KAI Topology was recreated with new levels
+	kaiTopology := &kaitopologyv1alpha1.Topology{}
+	err = cl.Get(ctx, client.ObjectKey{Name: topologyName}, kaiTopology)
+	require.NoError(t, err)
+	// The recreated object starts with a fresh Generation (left at 0 by the fake client),
+	// confirming that the existing resource was replaced rather than updated in place.
+ assert.Equal(t, int64(0), kaiTopology.Generation) + assert.Len(t, kaiTopology.Spec.Levels, 2) + assert.Equal(t, "topology.kubernetes.io/zone", kaiTopology.Spec.Levels[0].NodeLabel) + assert.Equal(t, "kubernetes.io/hostname", kaiTopology.Spec.Levels[1].NodeLabel) +} + +func TestEnsureKAITopologyWhenNoChangeRequired(t *testing.T) { + // Setup + ctx := context.Background() + topologyLevels := []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"}, + {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"}, + } + clusterTopology := &corev1alpha1.ClusterTopology{ + ObjectMeta: metav1.ObjectMeta{ + Name: topologyName, + UID: uuid.NewUUID(), + }, + Spec: corev1alpha1.ClusterTopologySpec{ + Levels: topologyLevels, + }, + } + + // Create existing KAI Topology with same levels + existingKAITopology := &kaitopologyv1alpha1.Topology{ + ObjectMeta: metav1.ObjectMeta{ + Name: topologyName, + ResourceVersion: "1", + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "grove.io/v1alpha1", + Kind: "ClusterTopology", + Name: clusterTopology.Name, + UID: clusterTopology.UID, + Controller: ptr.To(true), + BlockOwnerDeletion: ptr.To(true), + }, + }, + }, + Spec: kaitopologyv1alpha1.TopologySpec{ + Levels: []kaitopologyv1alpha1.TopologyLevel{ + {NodeLabel: "topology.kubernetes.io/zone"}, + {NodeLabel: "kubernetes.io/hostname"}, + }, + }, + } + + cl := testutils.CreateDefaultFakeClient([]client.Object{clusterTopology, existingKAITopology}) + logger := logr.Discard() + + err := EnsureKAITopology(ctx, cl, logger, topologyName, clusterTopology) + require.NoError(t, err) + + // Verify KAI Topology was NOT recreated (resource version should be same) + kaiTopology := &kaitopologyv1alpha1.Topology{} + err = cl.Get(ctx, client.ObjectKey{Name: topologyName}, kaiTopology) + require.NoError(t, err) + assert.Equal(t, int64(0), kaiTopology.Generation) + assert.Equal(t, "1", kaiTopology.ResourceVersion) +} + +func TestEnsureKAITopologyWithUnknownOwner(t *testing.T) { + // Setup + ctx := context.Background() + topologyLevels := []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"}, + {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"}, + } + clusterTopology := &corev1alpha1.ClusterTopology{ + ObjectMeta: metav1.ObjectMeta{ + Name: topologyName, + UID: uuid.NewUUID(), + }, + Spec: corev1alpha1.ClusterTopologySpec{ + Levels: topologyLevels, + }, + } + + // Create existing KAI Topology owned by different ClusterTopology + existingKAITopology := &kaitopologyv1alpha1.Topology{ + ObjectMeta: metav1.ObjectMeta{ + Name: topologyName, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "grove.io/v1alpha1", + Kind: "ClusterTopology", + Name: "different-owner", + UID: "different-uid", + Controller: ptr.To(true), + BlockOwnerDeletion: ptr.To(true), + }, + }, + }, + Spec: kaitopologyv1alpha1.TopologySpec{ + Levels: []kaitopologyv1alpha1.TopologyLevel{ + {NodeLabel: "topology.kubernetes.io/zone"}, + }, + }, + } + + cl := testutils.CreateDefaultFakeClient([]client.Object{clusterTopology, existingKAITopology}) + logger := logr.Discard() + // Test and Assert + err := EnsureKAITopology(ctx, cl, logger, topologyName, clusterTopology) + assert.Error(t, err) +} + +func TestEnsureKAITopologyWhenGetError(t *testing.T) { + // Setup + ctx := context.Background() + topologyLevels := []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainZone, Key: 
"topology.kubernetes.io/zone"}, + {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"}, + } + clusterTopology := &corev1alpha1.ClusterTopology{ + ObjectMeta: metav1.ObjectMeta{ + Name: topologyName, + UID: uuid.NewUUID(), + }, + Spec: corev1alpha1.ClusterTopologySpec{ + Levels: topologyLevels, + }, + } + + // Create a client that will fail on get (non-NotFound error) + getErr := apierrors.NewInternalError(assert.AnError) + cl := testutils.NewTestClientBuilder(). + WithObjects(clusterTopology). + RecordErrorForObjects(testutils.ClientMethodGet, getErr, client.ObjectKey{Name: topologyName}). + Build() + logger := logr.Discard() + // Test + err := EnsureKAITopology(ctx, cl, logger, topologyName, clusterTopology) + // Assert + assert.Error(t, err) + assert.True(t, errors.Is(err, getErr)) +} + +func TestEnsureKAITopologyWhenCreateError(t *testing.T) { + // Setup + ctx := context.Background() + topologyLevels := []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"}, + {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"}, + } + clusterTopology := &corev1alpha1.ClusterTopology{ + ObjectMeta: metav1.ObjectMeta{ + Name: topologyName, + UID: uuid.NewUUID(), + }, + Spec: corev1alpha1.ClusterTopologySpec{ + Levels: topologyLevels, + }, + } + + // Create a client that will fail on create + createErr := apierrors.NewInternalError(assert.AnError) + cl := testutils.NewTestClientBuilder(). + WithObjects(clusterTopology). + RecordErrorForObjects(testutils.ClientMethodCreate, createErr, client.ObjectKey{Name: topologyName}). + Build() + logger := logr.Discard() + + // Test + err := EnsureKAITopology(ctx, cl, logger, topologyName, clusterTopology) + // Assert + assert.Error(t, err) + assert.True(t, errors.Is(err, createErr)) +} + +func TestEnsureKAITopologyWhenDeleteError(t *testing.T) { + // Setup + ctx := context.Background() + // New topology levels that ClusterTopology will have + topologyLevels := []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"}, + {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"}, + } + clusterTopology := &corev1alpha1.ClusterTopology{ + ObjectMeta: metav1.ObjectMeta{ + Name: topologyName, + UID: uuid.NewUUID(), + }, + Spec: corev1alpha1.ClusterTopologySpec{ + Levels: topologyLevels, + }, + } + + // Create existing KAI Topology with OLD/DIFFERENT levels to trigger delete-and-recreate + existingKAITopology := &kaitopologyv1alpha1.Topology{ + ObjectMeta: metav1.ObjectMeta{ + Name: topologyName, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "grove.io/v1alpha1", + Kind: "ClusterTopology", + Name: clusterTopology.Name, + UID: clusterTopology.UID, + Controller: ptr.To(true), + BlockOwnerDeletion: ptr.To(true), + }, + }, + }, + Spec: kaitopologyv1alpha1.TopologySpec{ + Levels: []kaitopologyv1alpha1.TopologyLevel{ + {NodeLabel: "old-label"}, // Different from ClusterTopology levels + }, + }, + } + + // Create a client that will fail on delete + deleteErr := apierrors.NewInternalError(assert.AnError) + cl := testutils.NewTestClientBuilder(). + WithObjects(clusterTopology, existingKAITopology). + RecordErrorForObjects(testutils.ClientMethodDelete, deleteErr, client.ObjectKey{Name: topologyName}). 
+		Build()
+	logger := logr.Discard()
+
+	// Test
+	err := EnsureKAITopology(ctx, cl, logger, topologyName, clusterTopology)
+	// Assert
+	assert.Error(t, err)
+	assert.True(t, errors.Is(err, deleteErr))
+}
+
+func TestEnsureKAITopologyWhenCreateFailsDuringRecreate(t *testing.T) {
+	// Setup
+	ctx := context.Background()
+	updatedTopologyLevels := []corev1alpha1.TopologyLevel{
+		{Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"},
+		{Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"},
+	}
+	clusterTopology := &corev1alpha1.ClusterTopology{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: topologyName,
+			UID:  uuid.NewUUID(),
+		},
+		Spec: corev1alpha1.ClusterTopologySpec{
+			Levels: updatedTopologyLevels,
+		},
+	}
+
+	// Create existing KAI Topology with old levels
+	existingKAITopology := &kaitopologyv1alpha1.Topology{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: topologyName,
+			OwnerReferences: []metav1.OwnerReference{
+				{
+					APIVersion:         "grove.io/v1alpha1",
+					Kind:               "ClusterTopology",
+					Name:               clusterTopology.Name,
+					UID:                clusterTopology.UID,
+					Controller:         ptr.To(true),
+					BlockOwnerDeletion: ptr.To(true),
+				},
+			},
+		},
+		Spec: kaitopologyv1alpha1.TopologySpec{
+			Levels: []kaitopologyv1alpha1.TopologyLevel{
+				{NodeLabel: "kubernetes.io/hostname"},
+			},
+		},
+	}
+
+	// Create a client that succeeds when deleting the KAI Topology resource but fails on create
+	// This simulates the scenario where delete succeeds but create fails during recreation
+	createErr := apierrors.NewInternalError(assert.AnError)
+	cl := testutils.NewTestClientBuilder().
+		WithObjects(clusterTopology, existingKAITopology).
+		RecordErrorForObjects(testutils.ClientMethodCreate, createErr, client.ObjectKey{Name: topologyName}).
+		Build()
+	logger := logr.Discard()
+
+	err := EnsureKAITopology(ctx, cl, logger, topologyName, clusterTopology)
+	assert.Error(t, err)
+	// After delete succeeds, create will fail, so we get the create error message
+	assert.True(t, errors.Is(err, createErr))
+}
+
+func TestDeleteClusterTopology(t *testing.T) {
+	// Setup
+	ctx := context.Background()
+	topologyLevels := []corev1alpha1.TopologyLevel{
+		{Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"},
+		{Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"},
+	}
+	existing := &corev1alpha1.ClusterTopology{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: topologyName,
+		},
+		Spec: corev1alpha1.ClusterTopologySpec{
+			Levels: topologyLevels,
+		},
+	}
+	cl := testutils.CreateDefaultFakeClient([]client.Object{existing})
+
+	// Test
+	err := DeleteClusterTopology(ctx, cl, topologyName)
+	// Assert
+	require.NoError(t, err)
+	// Verify it was deleted
+	fetched := &corev1alpha1.ClusterTopology{}
+	err = cl.Get(ctx, client.ObjectKey{Name: topologyName}, fetched)
+	assert.True(t, apierrors.IsNotFound(err))
+}
+
+func TestDeleteClusterTopologyWhenNoneExists(t *testing.T) {
+	// Setup
+	ctx := context.Background()
+	cl := testutils.CreateDefaultFakeClient(nil)
+	// Test and Assert
+	// Should not error when deleting non-existent topology
+	err := DeleteClusterTopology(ctx, cl, topologyName)
+	assert.NoError(t, err)
+}
+
+func TestDeleteClusterTopologyError(t *testing.T) {
+	ctx := context.Background()
+	topologyLevels := []corev1alpha1.TopologyLevel{
+		{Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"},
+		{Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"},
+	}
+	existing := &corev1alpha1.ClusterTopology{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:
topologyName, + }, + Spec: corev1alpha1.ClusterTopologySpec{ + Levels: topologyLevels, + }, + } + + // Create a client that will fail on delete (non-NotFound error) + deleteErr := apierrors.NewInternalError(assert.AnError) + cl := testutils.NewTestClientBuilder(). + WithObjects(existing). + RecordErrorForObjects(testutils.ClientMethodDelete, deleteErr, client.ObjectKey{Name: topologyName}). + Build() + + err := DeleteClusterTopology(ctx, cl, topologyName) + assert.Error(t, err) + assert.True(t, errors.Is(err, deleteErr)) +} + +func TestBuildClusterTopology(t *testing.T) { + topologyLevels := []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"}, + {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"}, + } + topology := buildClusterTopology(topologyName, topologyLevels) + assert.Equal(t, topologyName, topology.Name) + assert.Equal(t, topologyLevels, topology.Spec.Levels) +} + +func TestIsClusterTopologyChanged(t *testing.T) { + tests := []struct { + name string + oldLevels []corev1alpha1.TopologyLevel + newLevels []corev1alpha1.TopologyLevel + expectedChanged bool + }{ + { + name: "No change in levels", + oldLevels: []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"}, + {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"}, + }, + newLevels: []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"}, + {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"}, + }, + expectedChanged: false, + }, + { + name: "Change in levels", + oldLevels: []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"}, + {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"}, + }, + newLevels: []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainRegion, Key: "topology.kubernetes.io/region"}, + }, + expectedChanged: true, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + oldTopology := &corev1alpha1.ClusterTopology{ + Spec: corev1alpha1.ClusterTopologySpec{ + Levels: test.oldLevels, + }, + } + newTopology := &corev1alpha1.ClusterTopology{ + Spec: corev1alpha1.ClusterTopologySpec{ + Levels: test.newLevels, + }, + } + topologyChanged := isClusterTopologyChanged(oldTopology, newTopology) + assert.Equal(t, test.expectedChanged, topologyChanged) + }) + } +} + +func TestBuildKAITopology(t *testing.T) { + // Setup + cl := testutils.CreateDefaultFakeClient(nil) + topologyLevels := []corev1alpha1.TopologyLevel{ + {Domain: corev1alpha1.TopologyDomainZone, Key: "topology.kubernetes.io/zone"}, + {Domain: corev1alpha1.TopologyDomainHost, Key: "kubernetes.io/hostname"}, + } + clusterTopology := &corev1alpha1.ClusterTopology{ + ObjectMeta: metav1.ObjectMeta{ + Name: topologyName, + UID: uuid.NewUUID(), + }, + Spec: corev1alpha1.ClusterTopologySpec{ + Levels: topologyLevels, + }, + } + + // Test + kaiTopology, err := buildKAITopology(topologyName, clusterTopology, cl.Scheme()) + // Assert + require.NoError(t, err) + assert.Equal(t, topologyName, kaiTopology.Name) + assert.Len(t, kaiTopology.Spec.Levels, 2) + assert.Equal(t, "topology.kubernetes.io/zone", kaiTopology.Spec.Levels[0].NodeLabel) + assert.Equal(t, "kubernetes.io/hostname", kaiTopology.Spec.Levels[1].NodeLabel) + assert.True(t, metav1.IsControlledBy(kaiTopology, clusterTopology)) +} + +func TestIsKAITopologyChanged(t *testing.T) { + 
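+	// Table-driven cases: identical ordered levels must report no change, while any
+	// difference in the levels must report a change.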
tests := []struct { + name string + oldLevels []kaitopologyv1alpha1.TopologyLevel + newLevels []kaitopologyv1alpha1.TopologyLevel + expectedChanged bool + }{ + { + name: "No change in levels", + oldLevels: []kaitopologyv1alpha1.TopologyLevel{ + {NodeLabel: "topology.kubernetes.io/zone"}, + {NodeLabel: "kubernetes.io/hostname"}, + }, + newLevels: []kaitopologyv1alpha1.TopologyLevel{ + {NodeLabel: "topology.kubernetes.io/zone"}, + {NodeLabel: "kubernetes.io/hostname"}, + }, + expectedChanged: false, + }, + { + name: "Change in levels", + oldLevels: []kaitopologyv1alpha1.TopologyLevel{ + {NodeLabel: "topology.kubernetes.io/zone"}, + {NodeLabel: "kubernetes.io/hostname"}, + }, + newLevels: []kaitopologyv1alpha1.TopologyLevel{ + {NodeLabel: "topology.kubernetes.io/region"}, + }, + expectedChanged: true, + }, + } + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + oldTopology := &kaitopologyv1alpha1.Topology{ + Spec: kaitopologyv1alpha1.TopologySpec{ + Levels: test.oldLevels, + }, + } + newTopology := &kaitopologyv1alpha1.Topology{ + Spec: kaitopologyv1alpha1.TopologySpec{ + Levels: test.newLevels, + }, + } + topologyChanged := isKAITopologyChanged(oldTopology, newTopology) + assert.Equal(t, test.expectedChanged, topologyChanged) + }) + } +} diff --git a/operator/internal/controller/manager.go b/operator/internal/controller/manager.go index 5c5155614..28e2833ef 100644 --- a/operator/internal/controller/manager.go +++ b/operator/internal/controller/manager.go @@ -46,7 +46,7 @@ const ( var ( waitTillWebhookCertsReady = cert.WaitTillWebhookCertsReady registerControllersWithMgr = RegisterControllers - registerWebhooksWithMgr = webhook.RegisterWebhooks + registerWebhooksWithMgr = webhook.Register ) // CreateManager creates the manager. @@ -58,11 +58,11 @@ func CreateManager(operatorCfg *configv1alpha1.OperatorConfiguration) (ctrl.Mana func RegisterControllersAndWebhooks(mgr ctrl.Manager, logger logr.Logger, operatorCfg *configv1alpha1.OperatorConfiguration, certsReady chan struct{}) error { // Controllers will not work unless the webhoooks are fully configured and operational. // For webhooks to work cert-controller should finish its work of generating and injecting certificates. 
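+	// Hence, block until the certificates are ready before registering controllers and webhooks.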
- waitTillWebhookCertsReady(logger, certsReady) - if err := registerControllersWithMgr(mgr, operatorCfg.Controllers); err != nil { + cert.WaitTillWebhookCertsReady(logger, certsReady) + if err := RegisterControllers(mgr, operatorCfg.Controllers); err != nil { return err } - if err := registerWebhooksWithMgr(mgr, operatorCfg.Authorizer); err != nil { + if err := webhook.Register(mgr, operatorCfg.Authorizer); err != nil { return err } return nil diff --git a/operator/internal/controller/podclique/components/pod/initcontainer.go b/operator/internal/controller/podclique/components/pod/initcontainer.go index c2ad8c945..fa77297e2 100644 --- a/operator/internal/controller/podclique/components/pod/initcontainer.go +++ b/operator/internal/controller/podclique/components/pod/initcontainer.go @@ -26,7 +26,7 @@ import ( "github.com/ai-dynamo/grove/operator/internal/constants" "github.com/ai-dynamo/grove/operator/internal/controller/common/component" groveerr "github.com/ai-dynamo/grove/operator/internal/errors" - "github.com/ai-dynamo/grove/operator/internal/version" + groveversion "github.com/ai-dynamo/grove/operator/internal/version" "github.com/samber/lo" corev1 "k8s.io/api/core/v1" @@ -107,7 +107,7 @@ func addInitContainer(pcs *grovecorev1alpha1.PodCliqueSet, pclq *grovecorev1alph pod.Spec.InitContainers = append(pod.Spec.InitContainers, corev1.Container{ Name: initContainerName, - Image: fmt.Sprintf("%s:%s", image, version.Get().GitVersion), + Image: fmt.Sprintf("%s:%s", image, groveversion.New().GitVersion), Args: args, VolumeMounts: []corev1.VolumeMount{ { diff --git a/operator/internal/topology/topology.go b/operator/internal/topology/topology.go deleted file mode 100644 index 4d90a79f6..000000000 --- a/operator/internal/topology/topology.go +++ /dev/null @@ -1,191 +0,0 @@ -// /* -// Copyright 2025 The Grove Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// */ - -package topology - -import ( - "context" - "fmt" - "reflect" - "sort" - - configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1" - corev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" - - kaitopologyv1alpha1 "github.com/NVIDIA/KAI-scheduler/pkg/apis/kai/v1alpha1" - "github.com/samber/lo" - apierrors "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/types" - ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" -) - -var logger = ctrl.Log.WithName("topology-manager") - -// EnsureTopology creates or updates the ClusterTopology CR from configuration -func EnsureTopology(ctx context.Context, client client.Client, name string, levels []configv1alpha1.TopologyLevel) error { - topology := &corev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: name, - }, - Spec: corev1alpha1.ClusterTopologySpec{ - Levels: convertTopologyLevels(levels), - }, - } - upsertTopology, err := upsertClusterTopology(ctx, client, topology) - if err != nil { - return err - } - return ensureKAITopology(ctx, client, upsertTopology) -} - -// upsertClusterTopology ensures that a ClusterTopology object exists by creating or updating it based on the desired state. -func upsertClusterTopology(ctx context.Context, k8sClient client.Client, desireClusterTopology *corev1alpha1.ClusterTopology) (*corev1alpha1.ClusterTopology, error) { - existing := &corev1alpha1.ClusterTopology{} - if err := k8sClient.Get(ctx, types.NamespacedName{Name: desireClusterTopology.Name}, existing); err != nil { - if apierrors.IsNotFound(err) { - if err := k8sClient.Create(ctx, desireClusterTopology); err != nil { - return nil, fmt.Errorf("failed to create ClusterTopology: %w", err) - } - return desireClusterTopology, nil - } - return nil, fmt.Errorf("failed to get existing ClusterTopology: %w", err) - } - isChanged := isClusterTopologyChanged(existing, desireClusterTopology) - if !isChanged { - return existing, nil - } - existing.Spec = desireClusterTopology.Spec - err := k8sClient.Update(ctx, existing) - if err != nil { - return nil, fmt.Errorf("failed to update ClusterTopology: %w", err) - } - return existing, nil -} - -// isClusterTopologyChanged checks if the ClusterTopology is changed -func isClusterTopologyChanged(existing *corev1alpha1.ClusterTopology, desireClusterTopology *corev1alpha1.ClusterTopology) bool { - existingLevels := lo.SliceToMap(existing.Spec.Levels, func(l corev1alpha1.TopologyLevel) (corev1alpha1.TopologyDomain, string) { - return l.Domain, l.Key - }) - newLevels := lo.SliceToMap(desireClusterTopology.Spec.Levels, func(l corev1alpha1.TopologyLevel) (corev1alpha1.TopologyDomain, string) { - return l.Domain, l.Key - }) - isChanged := !reflect.DeepEqual(existingLevels, newLevels) - return isChanged -} - -// convertTopologyLevels converts configuration topology levels to core API topology levels -func convertTopologyLevels(levels []configv1alpha1.TopologyLevel) []corev1alpha1.TopologyLevel { - result := make([]corev1alpha1.TopologyLevel, len(levels)) - for i, level := range levels { - result[i] = corev1alpha1.TopologyLevel{ - Domain: corev1alpha1.TopologyDomain(level.Domain), - Key: level.Key, - } - } - return result -} - -// convertClusterTopologyToKai converts ClusterTopology to KAI Topology with sorted levels -func convertClusterTopologyToKai(clusterTopology *corev1alpha1.ClusterTopology) *kaitopologyv1alpha1.Topology { - 
sortedLevels := make([]corev1alpha1.TopologyLevel, len(clusterTopology.Spec.Levels)) - copy(sortedLevels, clusterTopology.Spec.Levels) - - sort.Slice(sortedLevels, func(i, j int) bool { - return sortedLevels[i].Domain.Compare(sortedLevels[j].Domain) < 0 - }) - - kaiLevels := make([]kaitopologyv1alpha1.TopologyLevel, len(sortedLevels)) - for i, level := range sortedLevels { - kaiLevels[i] = kaitopologyv1alpha1.TopologyLevel{ - NodeLabel: level.Key, - } - } - - return &kaitopologyv1alpha1.Topology{ - ObjectMeta: metav1.ObjectMeta{ - Name: clusterTopology.Name, - }, - Spec: kaitopologyv1alpha1.TopologySpec{ - Levels: kaiLevels, - }, - } -} - -func createKAITopology(ctx context.Context, client client.Client, clusterTopology *corev1alpha1.ClusterTopology, kaiTopology *kaitopologyv1alpha1.Topology) error { - if err := controllerutil.SetControllerReference(clusterTopology, kaiTopology, client.Scheme()); err != nil { - return fmt.Errorf("failed to set owner reference for KAI Topology: %w", err) - } - if err := client.Create(ctx, kaiTopology); err != nil { - return fmt.Errorf("failed to create KAI Topology: %w", err) - } - return nil -} - -func ensureKAITopology(ctx context.Context, client client.Client, clusterTopology *corev1alpha1.ClusterTopology) error { - desiredKaiTopology := convertClusterTopologyToKai(clusterTopology) - - existing := &kaitopologyv1alpha1.Topology{} - err := client.Get(ctx, types.NamespacedName{Name: desiredKaiTopology.Name}, existing) - - if err != nil { - if apierrors.IsNotFound(err) { - if err := createKAITopology(ctx, client, clusterTopology, desiredKaiTopology); err != nil { - return err - } - logger.Info("KAI topology created successfully", "name", desiredKaiTopology.Name) - return nil - } - return fmt.Errorf("failed to get existing KAI Topology: %w", err) - } - - if !metav1.IsControlledBy(existing, clusterTopology) { - return fmt.Errorf("KAI Topology %s is owned by a different controller", existing.Name) - } - - if !reflect.DeepEqual(existing.Spec.Levels, desiredKaiTopology.Spec.Levels) { - // levels have changed, levels are immutable, delete existing and recreate - if err = client.Delete(ctx, existing); err != nil { - return fmt.Errorf("failed to delete KAI Topology for recreation: %w", err) - } - - if err = createKAITopology(ctx, client, clusterTopology, desiredKaiTopology); err != nil { - return err - } - logger.Info("KAI topology recreated with new levels", "name", desiredKaiTopology.Name) - return nil - } - - logger.Info("KAI topology unchanged", "name", desiredKaiTopology.Name) - return nil -} - -// EnsureDeleteClusterTopology deletes the ClusterTopology with the given name if it exists -func EnsureDeleteClusterTopology(ctx context.Context, client client.Client, name string) error { - kaiTopology := &corev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: name, - }, - } - err := client.Delete(ctx, kaiTopology) - if err != nil && !apierrors.IsNotFound(err) { - return fmt.Errorf("failed to delete Cluster-Topology: %w", err) - } - return nil -} diff --git a/operator/internal/topology/topology_test.go b/operator/internal/topology/topology_test.go deleted file mode 100644 index 5972046c9..000000000 --- a/operator/internal/topology/topology_test.go +++ /dev/null @@ -1,504 +0,0 @@ -// /* -// Copyright 2025 The Grove Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// */ - -package topology - -import ( - "context" - "testing" - - configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1" - corev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" - - kaitopologyv1alpha1 "github.com/NVIDIA/KAI-scheduler/pkg/apis/kai/v1alpha1" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - apierrors "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/types" - "k8s.io/utils/ptr" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/client/fake" -) - -func TestConvertTopologyLevels(t *testing.T) { - configLevels := []configv1alpha1.TopologyLevel{ - {Domain: "region", Key: "topology.kubernetes.io/region"}, - {Domain: "zone", Key: "topology.kubernetes.io/zone"}, - } - - coreLevels := convertTopologyLevels(configLevels) - - require.Len(t, coreLevels, 2) - assert.Equal(t, corev1alpha1.TopologyDomain("region"), coreLevels[0].Domain) - assert.Equal(t, "topology.kubernetes.io/region", coreLevels[0].Key) - assert.Equal(t, corev1alpha1.TopologyDomain("zone"), coreLevels[1].Domain) - assert.Equal(t, "topology.kubernetes.io/zone", coreLevels[1].Key) -} - -func TestConvertClusterTopologyToKai(t *testing.T) { - clusterTopology := &corev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "grove-topology", - }, - Spec: corev1alpha1.ClusterTopologySpec{ - Levels: []corev1alpha1.TopologyLevel{ - {Domain: "zone", Key: "topology.kubernetes.io/zone"}, - {Domain: "region", Key: "topology.kubernetes.io/region"}, - }, - }, - } - - kaiTopology := convertClusterTopologyToKai(clusterTopology) - - assert.Equal(t, "grove-topology", kaiTopology.Name) - require.Len(t, kaiTopology.Spec.Levels, 2) - assert.Equal(t, "topology.kubernetes.io/region", kaiTopology.Spec.Levels[0].NodeLabel) - assert.Equal(t, "topology.kubernetes.io/zone", kaiTopology.Spec.Levels[1].NodeLabel) -} - -func TestEnsureTopology(t *testing.T) { - tests := []struct { - name string - setup func(builder *fake.ClientBuilder) - levels []configv1alpha1.TopologyLevel - wantErr bool - errContains string - validate func(*testing.T, client.Client) - }{ - { - name: "create_new", - setup: nil, - levels: []configv1alpha1.TopologyLevel{ - {Domain: "zone", Key: "topology.kubernetes.io/zone"}, - }, - wantErr: false, - validate: func(t *testing.T, c client.Client) { - ct := &corev1alpha1.ClusterTopology{} - err := c.Get(context.Background(), types.NamespacedName{Name: "test-topology"}, ct) - require.NoError(t, err) - assert.Len(t, ct.Spec.Levels, 1) - assert.Equal(t, corev1alpha1.TopologyDomain("zone"), ct.Spec.Levels[0].Domain) - - kai := &kaitopologyv1alpha1.Topology{} - err = c.Get(context.Background(), types.NamespacedName{Name: "test-topology"}, kai) - require.NoError(t, err) - assert.Len(t, kai.Spec.Levels, 1) - assert.Equal(t, "topology.kubernetes.io/zone", kai.Spec.Levels[0].NodeLabel) - - ownerRef := metav1.GetControllerOf(kai) - require.NotNil(t, ownerRef) - assert.Equal(t, "test-topology", ownerRef.Name) - }, - }, - { - 
name: "update_cluster_create_kai", - setup: func(builder *fake.ClientBuilder) { - existing := &corev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - UID: types.UID("cluster-uid"), - }, - Spec: corev1alpha1.ClusterTopologySpec{ - Levels: []corev1alpha1.TopologyLevel{ - {Domain: "zone", Key: "old-key"}, - }, - }, - } - builder.WithObjects(existing) - }, - levels: []configv1alpha1.TopologyLevel{ - {Domain: "zone", Key: "new-key"}, - }, - wantErr: false, - validate: func(t *testing.T, c client.Client) { - ct := &corev1alpha1.ClusterTopology{} - err := c.Get(context.Background(), types.NamespacedName{Name: "test-topology"}, ct) - require.NoError(t, err) - assert.Equal(t, "new-key", ct.Spec.Levels[0].Key) - - kai := &kaitopologyv1alpha1.Topology{} - err = c.Get(context.Background(), types.NamespacedName{Name: "test-topology"}, kai) - require.NoError(t, err) - assert.Len(t, kai.Spec.Levels, 1) - assert.Equal(t, "new-key", kai.Spec.Levels[0].NodeLabel) - }, - }, - { - name: "both_exist_unchanged", - setup: func(builder *fake.ClientBuilder) { - ct := &corev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - UID: types.UID("cluster-uid"), - }, - Spec: corev1alpha1.ClusterTopologySpec{ - Levels: []corev1alpha1.TopologyLevel{ - {Domain: "region", Key: "topology.kubernetes.io/region"}, - }, - }, - } - - kai := &kaitopologyv1alpha1.Topology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - OwnerReferences: []metav1.OwnerReference{ - { - APIVersion: "core.grove.io/v1alpha1", - Kind: "ClusterTopology", - Name: "test-topology", - UID: ct.UID, - Controller: ptr.To(true), - BlockOwnerDeletion: ptr.To(true), - }, - }, - }, - Spec: kaitopologyv1alpha1.TopologySpec{ - Levels: []kaitopologyv1alpha1.TopologyLevel{ - {NodeLabel: "topology.kubernetes.io/region"}, - }, - }, - } - - builder.WithObjects(ct, kai) - }, - levels: []configv1alpha1.TopologyLevel{ - {Domain: "region", Key: "topology.kubernetes.io/region"}, - }, - wantErr: false, - validate: func(t *testing.T, c client.Client) { - ct := &corev1alpha1.ClusterTopology{} - err := c.Get(context.Background(), types.NamespacedName{Name: "test-topology"}, ct) - require.NoError(t, err) - assert.Len(t, ct.Spec.Levels, 1) - - kai := &kaitopologyv1alpha1.Topology{} - err = c.Get(context.Background(), types.NamespacedName{Name: "test-topology"}, kai) - require.NoError(t, err) - assert.Len(t, kai.Spec.Levels, 1) - }, - }, - { - name: "both_need_changes", - setup: func(builder *fake.ClientBuilder) { - ct := &corev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - UID: types.UID("cluster-uid"), - }, - Spec: corev1alpha1.ClusterTopologySpec{ - Levels: []corev1alpha1.TopologyLevel{ - {Domain: "region", Key: "old-region-key"}, - }, - }, - } - - kai := &kaitopologyv1alpha1.Topology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - OwnerReferences: []metav1.OwnerReference{ - { - APIVersion: "core.grove.io/v1alpha1", - Kind: "ClusterTopology", - Name: "test-topology", - UID: ct.UID, - Controller: ptr.To(true), - BlockOwnerDeletion: ptr.To(true), - }, - }, - }, - Spec: kaitopologyv1alpha1.TopologySpec{ - Levels: []kaitopologyv1alpha1.TopologyLevel{ - {NodeLabel: "old-region-key"}, - }, - }, - } - - builder.WithObjects(ct, kai) - }, - levels: []configv1alpha1.TopologyLevel{ - {Domain: "region", Key: "new-region-key"}, - {Domain: "zone", Key: "topology.kubernetes.io/zone"}, - }, - wantErr: false, - validate: func(t *testing.T, c client.Client) { - ct := 
&corev1alpha1.ClusterTopology{} - err := c.Get(context.Background(), types.NamespacedName{Name: "test-topology"}, ct) - require.NoError(t, err) - assert.Len(t, ct.Spec.Levels, 2) - assert.Equal(t, "new-region-key", ct.Spec.Levels[0].Key) - - kai := &kaitopologyv1alpha1.Topology{} - err = c.Get(context.Background(), types.NamespacedName{Name: "test-topology"}, kai) - require.NoError(t, err) - assert.Len(t, kai.Spec.Levels, 2) - }, - }, - { - name: "empty_levels", - setup: nil, - levels: []configv1alpha1.TopologyLevel{}, - wantErr: false, - validate: func(t *testing.T, c client.Client) { - ct := &corev1alpha1.ClusterTopology{} - err := c.Get(context.Background(), types.NamespacedName{Name: "test-topology"}, ct) - require.NoError(t, err) - assert.Len(t, ct.Spec.Levels, 0) - - kai := &kaitopologyv1alpha1.Topology{} - err = c.Get(context.Background(), types.NamespacedName{Name: "test-topology"}, kai) - require.NoError(t, err) - assert.Len(t, kai.Spec.Levels, 0) - }, - }, - { - name: "kai_wrong_owner", - setup: func(builder *fake.ClientBuilder) { - ct := &corev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - UID: types.UID("cluster-uid"), - }, - Spec: corev1alpha1.ClusterTopologySpec{ - Levels: []corev1alpha1.TopologyLevel{ - {Domain: "region", Key: "topology.kubernetes.io/region"}, - }, - }, - } - - kai := &kaitopologyv1alpha1.Topology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - OwnerReferences: []metav1.OwnerReference{ - { - APIVersion: "core.grove.io/v1alpha1", - Kind: "ClusterTopology", - Name: "different-owner", - UID: types.UID("different-uid"), - Controller: ptr.To(true), - BlockOwnerDeletion: ptr.To(true), - }, - }, - }, - Spec: kaitopologyv1alpha1.TopologySpec{ - Levels: []kaitopologyv1alpha1.TopologyLevel{ - {NodeLabel: "topology.kubernetes.io/region"}, - }, - }, - } - - builder.WithObjects(ct, kai) - }, - levels: []configv1alpha1.TopologyLevel{ - {Domain: "region", Key: "topology.kubernetes.io/region"}, - }, - wantErr: true, - errContains: "owned by a different controller", - }, - { - name: "kai_no_owner", - setup: func(builder *fake.ClientBuilder) { - ct := &corev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - UID: types.UID("cluster-uid"), - }, - Spec: corev1alpha1.ClusterTopologySpec{ - Levels: []corev1alpha1.TopologyLevel{ - {Domain: "region", Key: "topology.kubernetes.io/region"}, - }, - }, - } - - kai := &kaitopologyv1alpha1.Topology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - }, - Spec: kaitopologyv1alpha1.TopologySpec{ - Levels: []kaitopologyv1alpha1.TopologyLevel{ - {NodeLabel: "topology.kubernetes.io/region"}, - }, - }, - } - - builder.WithObjects(ct, kai) - }, - levels: []configv1alpha1.TopologyLevel{ - {Domain: "region", Key: "topology.kubernetes.io/region"}, - }, - wantErr: true, - errContains: "owned by a different controller", - }, - { - name: "levels_change_recreate_kai", - setup: func(builder *fake.ClientBuilder) { - ct := &corev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - UID: types.UID("cluster-uid"), - }, - Spec: corev1alpha1.ClusterTopologySpec{ - Levels: []corev1alpha1.TopologyLevel{ - {Domain: "region", Key: "topology.kubernetes.io/region"}, - }, - }, - } - - kai := &kaitopologyv1alpha1.Topology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - OwnerReferences: []metav1.OwnerReference{ - { - APIVersion: "core.grove.io/v1alpha1", - Kind: "ClusterTopology", - Name: "test-topology", - UID: ct.UID, - 
Controller: ptr.To(true), - BlockOwnerDeletion: ptr.To(true), - }, - }, - }, - Spec: kaitopologyv1alpha1.TopologySpec{ - Levels: []kaitopologyv1alpha1.TopologyLevel{ - {NodeLabel: "old-label"}, - }, - }, - } - - builder.WithObjects(ct, kai) - }, - levels: []configv1alpha1.TopologyLevel{ - {Domain: "region", Key: "topology.kubernetes.io/region"}, - {Domain: "zone", Key: "topology.kubernetes.io/zone"}, - }, - wantErr: false, - validate: func(t *testing.T, c client.Client) { - ct := &corev1alpha1.ClusterTopology{} - err := c.Get(context.Background(), types.NamespacedName{Name: "test-topology"}, ct) - require.NoError(t, err) - assert.Len(t, ct.Spec.Levels, 2) - - kai := &kaitopologyv1alpha1.Topology{} - err = c.Get(context.Background(), types.NamespacedName{Name: "test-topology"}, kai) - require.NoError(t, err) - assert.Len(t, kai.Spec.Levels, 2) - assert.Equal(t, "topology.kubernetes.io/region", kai.Spec.Levels[0].NodeLabel) - assert.Equal(t, "topology.kubernetes.io/zone", kai.Spec.Levels[1].NodeLabel) - }, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - scheme := runtime.NewScheme() - _ = corev1alpha1.AddToScheme(scheme) - _ = kaitopologyv1alpha1.AddToScheme(scheme) - - builder := fake.NewClientBuilder().WithScheme(scheme) - if tt.setup != nil { - tt.setup(builder) - } - c := builder.Build() - - err := EnsureTopology(context.Background(), c, "test-topology", tt.levels) - - if tt.wantErr { - require.Error(t, err) - if tt.errContains != "" { - assert.Contains(t, err.Error(), tt.errContains) - } - } else { - require.NoError(t, err) - if tt.validate != nil { - tt.validate(t, c) - } - } - }) - } -} - -func TestEnsureDeleteClusterTopology(t *testing.T) { - tests := []struct { - name string - setup func(builder *fake.ClientBuilder) - wantErr bool - errContains string - validate func(*testing.T, client.Client) - }{ - { - name: "successful_delete", - setup: func(builder *fake.ClientBuilder) { - ct := &corev1alpha1.ClusterTopology{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-topology", - }, - Spec: corev1alpha1.ClusterTopologySpec{ - Levels: []corev1alpha1.TopologyLevel{ - {Domain: "region", Key: "topology.kubernetes.io/region"}, - }, - }, - } - builder.WithObjects(ct) - }, - wantErr: false, - validate: func(t *testing.T, c client.Client) { - ct := &corev1alpha1.ClusterTopology{} - err := c.Get(context.Background(), types.NamespacedName{Name: "test-topology"}, ct) - require.Error(t, err) - assert.True(t, apierrors.IsNotFound(err)) - }, - }, - { - name: "already_deleted", - setup: nil, - wantErr: false, - validate: func(t *testing.T, c client.Client) { - ct := &corev1alpha1.ClusterTopology{} - err := c.Get(context.Background(), types.NamespacedName{Name: "test-topology"}, ct) - require.Error(t, err) - assert.True(t, apierrors.IsNotFound(err)) - }, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - scheme := runtime.NewScheme() - _ = corev1alpha1.AddToScheme(scheme) - - builder := fake.NewClientBuilder().WithScheme(scheme) - if tt.setup != nil { - tt.setup(builder) - } - c := builder.Build() - - err := EnsureDeleteClusterTopology(context.Background(), c, "test-topology") - - if tt.wantErr { - require.Error(t, err) - if tt.errContains != "" { - assert.Contains(t, err.Error(), tt.errContains) - } - } else { - require.NoError(t, err) - if tt.validate != nil { - tt.validate(t, c) - } - } - }) - } -} diff --git a/operator/internal/utils/ioutil/ioutil.go b/operator/internal/utils/ioutil/ioutil.go new file mode 100644 index 000000000..ed51c19b6 
--- /dev/null +++ b/operator/internal/utils/ioutil/ioutil.go @@ -0,0 +1,26 @@ +// /* +// Copyright 2025 The Grove Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// */ + +package ioutil + +import "io" + +// CloseQuietly safely closes an io.Closer, ignoring and suppressing any error during the close operation. +func CloseQuietly(closer io.Closer) { + if closer != nil { + _ = closer.Close() + } +} diff --git a/operator/internal/utils/ioutil/ioutil_test.go b/operator/internal/utils/ioutil/ioutil_test.go new file mode 100644 index 000000000..ff7722d97 --- /dev/null +++ b/operator/internal/utils/ioutil/ioutil_test.go @@ -0,0 +1,133 @@ +// /* +// Copyright 2025 The Grove Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// */ + +package ioutil + +import ( + "errors" + "io" + "testing" + + "github.com/stretchr/testify/assert" +) + +// TestCloseQuietly tests the CloseQuietly function with various scenarios. +func TestCloseQuietly(t *testing.T) { + tests := []struct { + name string + closer io.Closer + expectErr bool + validate func(t *testing.T, closer io.Closer) + }{ + { + name: "close without errors", + closer: &mockCloser{ + closeErr: nil, + }, + expectErr: false, + validate: func(t *testing.T, closer io.Closer) { + mock := closer.(*mockCloser) + assert.True(t, mock.closed, "closer should be closed") + assert.Equal(t, 1, mock.callCount, "Close should be called once") + }, + }, + { + name: "close with error", + closer: &mockCloser{ + closeErr: errors.New("close failed"), + }, + expectErr: true, + validate: func(t *testing.T, closer io.Closer) { + mock := closer.(*mockCloser) + assert.True(t, mock.closed, "closer should be closed despite error") + assert.Equal(t, 1, mock.callCount, "Close should be called once") + }, + }, + { + name: "closer is nil", + closer: nil, + expectErr: false, + validate: func(t *testing.T, closer io.Closer) { + // No panic should occur, nothing to validate + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // This should not panic regardless of error + assert.NotPanics(t, func() { + CloseQuietly(tt.closer) + }, "CloseQuietly should not panic") + + if tt.validate != nil { + tt.validate(t, tt.closer) + } + }) + } +} + +// TestCloseQuietlyWithRealCloser tests with a more realistic closer. 
+func TestCloseQuietlyWithRealCloser(t *testing.T) { + // Using a simple pipe as a real io.Closer + r, w := io.Pipe() + + // Close the reader first + assert.NotPanics(t, func() { + CloseQuietly(r) + }) + + // Close the writer + assert.NotPanics(t, func() { + CloseQuietly(w) + }) + + // Try to close again (should not panic even if already closed) + assert.NotPanics(t, func() { + CloseQuietly(r) + CloseQuietly(w) + }) +} + +// TestCloseQuietlyWithPanic tests that CloseQuietly doesn't handle panics (only errors). +func TestCloseQuietlyWithPanic(t *testing.T) { + closer := &panicCloser{} + + // CloseQuietly should not catch panics, only suppress errors + assert.Panics(t, func() { + CloseQuietly(closer) + }, "CloseQuietly should not suppress panics, only errors") +} + +// mockCloser is a mock implementation of io.Closer for testing. +type mockCloser struct { + closed bool + closeErr error + callCount int +} + +func (m *mockCloser) Close() error { + m.callCount++ + m.closed = true + return m.closeErr +} + +// panicCloser is a closer that panics when closed. +type panicCloser struct{} + +func (p *panicCloser) Close() error { + panic("intentional panic") +} diff --git a/operator/internal/version/version.go b/operator/internal/version/version.go index ee1bc7018..2e3a7f984 100644 --- a/operator/internal/version/version.go +++ b/operator/internal/version/version.go @@ -17,13 +17,9 @@ package version import ( + "encoding/json" "fmt" - "io" - "os" "runtime" - - "github.com/spf13/pflag" - apimachineryversion "k8s.io/apimachinery/pkg/version" ) // These variables will be set during building the grove operator via LD_FLAGS @@ -39,18 +35,23 @@ var ( // gitTreeState is the state of git tree, either "clean" or "dirty" gitTreeState = "" // buildDate is the date (in ISO8601 format) at which the build was done. Output of $(date -u +'%Y-%m-%dT%H:%M:%SZ') - buildDate = "1970-01-01T00:00:00Z" - versionFlag bool + buildDate = "1970-01-01T00:00:00Z" ) -// AddFlags adds the --version flag to the flag.FlagSet. -func AddFlags(fs *pflag.FlagSet) { - fs.BoolVar(&versionFlag, "version", false, "version prints the version information and quits") +// GroveInfo holds version and build information about the grove operator. +type GroveInfo struct { + GitVersion string `json:"gitVersion"` + GitCommit string `json:"gitCommit"` + GitTreeState string `json:"gitTreeState"` + BuildDate string `json:"buildDate"` + GoVersion string `json:"goVersion"` + Compiler string `json:"compiler"` + Platform string `json:"platform"` } -// Get returns the version details for the grove operator. -func Get() apimachineryversion.Info { - return apimachineryversion.Info{ +// New creates a new GroveInfo with the build and version information. +func New() GroveInfo { + return GroveInfo{ GitVersion: gitVersion, GitCommit: gitCommit, GitTreeState: gitTreeState, @@ -61,11 +62,38 @@ func Get() apimachineryversion.Info { } } -// PrintVersionAndExitIfRequested will check if --version is passed and if it is -// then it will print the version information and quit. -func PrintVersionAndExitIfRequested() { - if versionFlag { - _, _ = fmt.Fprintf(io.Writer(os.Stdout), "%s %v\n", programName, Get()) - os.Exit(0) +// String returns the version information. +func (g GroveInfo) String() string { + return g.GitVersion +} + +// Verbose returns a detailed multi-line string with version and build information. 
+func (g GroveInfo) Verbose() string { + infoBytes, err := json.Marshal(g) + if err != nil { + return fmt.Sprintf("error generating verbose version information: %v\n", err) } + return string(infoBytes) } + +//// Get returns the version details for the grove operator. +//func Get(verbose bool) apimachineryversion.Info { +// return apimachineryversion.Info{ +// GitVersion: gitVersion, +// GitCommit: gitCommit, +// GitTreeState: gitTreeState, +// BuildDate: buildDate, +// GoVersion: runtime.Version(), +// Compiler: runtime.Compiler, +// Platform: fmt.Sprintf("%s/%s", runtime.GOOS, runtime.GOARCH), +// } +//} + +//// PrintVersionAndExitIfRequested will check if --version is passed and if it is +//// then it will print the version information and quit. +//func PrintVersionAndExitIfRequested() { +// if versionFlag { +// _, _ = fmt.Fprintf(io.Writer(os.Stdout), "%s %v\n", programName, Get()) +// os.Exit(0) +// } +//} diff --git a/operator/internal/webhook/register.go b/operator/internal/webhook/register.go index 49dff0dc3..ee77da1b3 100644 --- a/operator/internal/webhook/register.go +++ b/operator/internal/webhook/register.go @@ -30,8 +30,8 @@ import ( "sigs.k8s.io/controller-runtime/pkg/manager" ) -// RegisterWebhooks registers the webhooks with the controller manager. -func RegisterWebhooks(mgr manager.Manager, authorizerConfig configv1alpha1.AuthorizerConfig) error { +// Register registers the webhooks with the controller manager. +func Register(mgr manager.Manager, authorizerConfig configv1alpha1.AuthorizerConfig) error { defaultingWebhook := defaulting.NewHandler(mgr) slog.Info("Registering webhook with manager", "handler", defaulting.Name) if err := defaultingWebhook.RegisterWithManager(mgr); err != nil { diff --git a/operator/internal/webhook/register_test.go b/operator/internal/webhook/register_test.go index 18403e87b..430adeb61 100644 --- a/operator/internal/webhook/register_test.go +++ b/operator/internal/webhook/register_test.go @@ -91,7 +91,7 @@ func TestRegisterWebhooks_WithoutAuthorizer(t *testing.T) { Enabled: false, } - err := RegisterWebhooks(mgr, authorizerConfig) + err := Register(mgr, authorizerConfig) require.NoError(t, err) } @@ -120,7 +120,7 @@ func TestRegisterWebhooks_WithAuthorizerMissingEnvVar(t *testing.T) { Enabled: true, } - err = RegisterWebhooks(mgr, authorizerConfig) + err = Register(mgr, authorizerConfig) require.Error(t, err) assert.Contains(t, err.Error(), constants.EnvVarServiceAccountName) } @@ -149,7 +149,7 @@ func TestRegisterWebhooks_WithAuthorizerMissingNamespaceFile(t *testing.T) { Enabled: true, } - err := RegisterWebhooks(mgr, authorizerConfig) + err := Register(mgr, authorizerConfig) require.Error(t, err) assert.Contains(t, err.Error(), "error reading namespace file") } @@ -194,7 +194,7 @@ func TestRegisterWebhooks_WithAuthorizerSuccess(t *testing.T) { Enabled: true, } - err = RegisterWebhooks(mgr, authorizerConfig) + err = Register(mgr, authorizerConfig) // Will error because it tries to read the hardcoded namespace file path require.Error(t, err) } From 36e7af474c9f980530053aa0f7aed93d0cd20d70 Mon Sep 17 00:00:00 2001 From: Madhav Bhargava Date: Sun, 28 Dec 2025 23:22:11 +0530 Subject: [PATCH 17/23] fixed check errors Signed-off-by: Madhav Bhargava --- operator/initc/cmd/opts/options.go | 4 ++-- operator/internal/utils/ioutil/ioutil_test.go | 2 +- operator/internal/version/version.go | 24 ------------------- 3 files changed, 3 insertions(+), 27 deletions(-) diff --git a/operator/initc/cmd/opts/options.go b/operator/initc/cmd/opts/options.go index 
a80f84fb8..836022802 100644 --- a/operator/initc/cmd/opts/options.go +++ b/operator/initc/cmd/opts/options.go @@ -41,7 +41,7 @@ type CLIOptions struct { } // RegisterFlags registers all the flags that are defined for the init container. -func (c *CLIOptions) RegisterFlags(fs *pflag.FlagSet) { +func (c *CLIOptions) RegisterFlags() { // --podcliques=: // --podcliques=podclique-a:3 --podcliques=podclique-b:4 and so on for each PodClique. pflag.StringArrayVarP(&c.podCliques, "podcliques", "p", nil, "podclique name and minAvailable replicas seperated by comma, repeated for each podclique") @@ -88,7 +88,7 @@ func InitializeCLIOptions() (CLIOptions, error) { config := CLIOptions{ podCliques: make([]string, 0), } - config.RegisterFlags(pflag.CommandLine) + config.RegisterFlags() pflag.Parse() return config, nil } diff --git a/operator/internal/utils/ioutil/ioutil_test.go b/operator/internal/utils/ioutil/ioutil_test.go index ff7722d97..f466bef8a 100644 --- a/operator/internal/utils/ioutil/ioutil_test.go +++ b/operator/internal/utils/ioutil/ioutil_test.go @@ -60,7 +60,7 @@ func TestCloseQuietly(t *testing.T) { name: "closer is nil", closer: nil, expectErr: false, - validate: func(t *testing.T, closer io.Closer) { + validate: func(_ *testing.T, closer io.Closer) { // No panic should occur, nothing to validate }, }, diff --git a/operator/internal/version/version.go b/operator/internal/version/version.go index 2e3a7f984..c9ca21e91 100644 --- a/operator/internal/version/version.go +++ b/operator/internal/version/version.go @@ -26,8 +26,6 @@ import ( // These variables have been borrowed from k8s.io/component-base repository. We do not want // the dependencies that k8s.io/component-base pulls in as the attempt is the keep a lean set of dependencies. var ( - // programName is the name of the operator. - programName = "grove-operator" // gitVersion is the semantic version for grove operator. gitVersion = "v0.0.0-master+$Format:%H$" // gitCommit is the SHA1 from git, output of $(git rev-parse HEAD) @@ -75,25 +73,3 @@ func (g GroveInfo) Verbose() string { } return string(infoBytes) } - -//// Get returns the version details for the grove operator. -//func Get(verbose bool) apimachineryversion.Info { -// return apimachineryversion.Info{ -// GitVersion: gitVersion, -// GitCommit: gitCommit, -// GitTreeState: gitTreeState, -// BuildDate: buildDate, -// GoVersion: runtime.Version(), -// Compiler: runtime.Compiler, -// Platform: fmt.Sprintf("%s/%s", runtime.GOOS, runtime.GOARCH), -// } -//} - -//// PrintVersionAndExitIfRequested will check if --version is passed and if it is -//// then it will print the version information and quit. 
-//func PrintVersionAndExitIfRequested() { -// if versionFlag { -// _, _ = fmt.Fprintf(io.Writer(os.Stdout), "%s %v\n", programName, Get()) -// os.Exit(0) -// } -//} From 6d9f9faf17be1bfe1e6ccc947b8704f601149740 Mon Sep 17 00:00:00 2001 From: Madhav Bhargava Date: Mon, 29 Dec 2025 10:45:01 +0530 Subject: [PATCH 18/23] Fixed formatting and check errors Signed-off-by: Madhav Bhargava --- docs/api-reference/operator-api.md | 2 +- operator/cmd/cli/cli.go | 2 +- operator/cmd/cli/version.go | 68 -------- operator/cmd/cli/version_test.go | 158 ------------------ operator/cmd/main.go | 15 +- operator/initc/cmd/opts/options.go | 1 + operator/internal/utils/ioutil/ioutil_test.go | 2 +- operator/internal/version/version.go | 4 +- 8 files changed, 13 insertions(+), 239 deletions(-) delete mode 100644 operator/cmd/cli/version.go delete mode 100644 operator/cmd/cli/version_test.go diff --git a/docs/api-reference/operator-api.md b/docs/api-reference/operator-api.md index cd5b46d82..cec98f706 100644 --- a/docs/api-reference/operator-api.md +++ b/docs/api-reference/operator-api.md @@ -86,7 +86,7 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `levels` _[TopologyLevel](#topologylevel) array_ | Levels is an ordered list of topology levels from broadest to narrowest scope.
<br />The order in this list defines the hierarchy (index 0 = broadest level).<br />This field is immutable after creation. |  | MinItems: 1 <br /> |
+| `levels` _[TopologyLevel](#topologylevel) array_ | Levels is an ordered list of topology levels from broadest to narrowest scope.<br />The order in this list defines the hierarchy (index 0 = broadest level).<br />This field is immutable after creation. |  | MaxItems: 7 <br />MinItems: 1 <br />
| #### ErrorCode diff --git a/operator/cmd/cli/cli.go b/operator/cmd/cli/cli.go index e44c64704..2d53f52c6 100644 --- a/operator/cmd/cli/cli.go +++ b/operator/cmd/cli/cli.go @@ -23,10 +23,10 @@ import ( apicommonconstants "github.com/ai-dynamo/grove/operator/api/common/constants" configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1" operatorvalidation "github.com/ai-dynamo/grove/operator/api/config/validation" - "k8s.io/apimachinery/pkg/runtime/serializer" "github.com/spf13/pflag" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/serializer" ) const ( diff --git a/operator/cmd/cli/version.go b/operator/cmd/cli/version.go deleted file mode 100644 index 1a3f40433..000000000 --- a/operator/cmd/cli/version.go +++ /dev/null @@ -1,68 +0,0 @@ -// /* -// Copyright 2025 The Grove Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// */ - -package cli - -import ( - "fmt" - "runtime/debug" - - apicommonconstants "github.com/ai-dynamo/grove/operator/api/common/constants" -) - -// GetBuildInfo gets the operator build information. -// If verbose is true, it returns detailed build information else it will only return the application version. -func GetBuildInfo(verbose bool) string { - info, ok := debug.ReadBuildInfo() - if !ok { - fmt.Printf("%s: binary build info not embedded\n", apicommonconstants.OperatorName) - return "" - } - if !verbose { - return fmt.Sprintf("%s version: %s\n", apicommonconstants.OperatorName, info.Main.Version) - } - return getVerboseBuildInfo(info) -} - -func getVerboseBuildInfo(info *debug.BuildInfo) string { - // Git information - gitVersion := getSetting(info, "vcs.version") - commitTime := getSetting(info, "vcs.time") - dirty := getSetting(info, "vcs.modified") - // OS and Arch - os := getSetting(info, "GOOS") - arch := getSetting(info, "GOARCH") - - return fmt.Sprintf(` -%s Build Information: - Version: %s - Go version: %s - OS/Arch: %s/%s - Git Information: - Revision (commit hash): %s - Commit Time: %s - Modified (dirty): %s -`, apicommonconstants.OperatorName, info.Main.Version, info.GoVersion, os, arch, gitVersion, commitTime, dirty) -} - -func getSetting(info *debug.BuildInfo, key string) string { - for _, setting := range info.Settings { - if setting.Key == key { - return setting.Value - } - } - return fmt.Sprintf("", key) -} diff --git a/operator/cmd/cli/version_test.go b/operator/cmd/cli/version_test.go deleted file mode 100644 index 797bf6a4a..000000000 --- a/operator/cmd/cli/version_test.go +++ /dev/null @@ -1,158 +0,0 @@ -// /* -// Copyright 2025 The Grove Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// */ - -package cli - -import ( - "runtime/debug" - "testing" - - apicommonconstants "github.com/ai-dynamo/grove/operator/api/common/constants" - - "github.com/stretchr/testify/assert" -) - -// TestGetSetting tests the getSetting helper function. -func TestGetSetting(t *testing.T) { - tests := []struct { - name string - info *debug.BuildInfo - key string - expected string - }{ - { - name: "setting exists", - info: &debug.BuildInfo{ - Settings: []debug.BuildSetting{ - {Key: "vcs.version", Value: "abc123"}, - {Key: "vcs.time", Value: "2025-12-28T10:00:00Z"}, - {Key: "GOOS", Value: "linux"}, - }, - }, - key: "vcs.version", - expected: "abc123", - }, - { - name: "setting does not exist", - info: &debug.BuildInfo{ - Settings: []debug.BuildSetting{ - {Key: "vcs.version", Value: "abc123"}, - {Key: "GOOS", Value: "linux"}, - }, - }, - key: "nonexistent.key", - expected: "", - }, - { - name: "empty key", - info: &debug.BuildInfo{ - Settings: []debug.BuildSetting{}, - }, - key: "vcs.version", - expected: "", - }, - { - name: "settings is nil", - info: &debug.BuildInfo{ - Settings: nil, - }, - key: "vcs.version", - expected: "", - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - result := getSetting(tt.info, tt.key) - assert.Equal(t, tt.expected, result) - }) - } -} - -// TestGetVerboseBuildInfo tests the getVerboseBuildInfo helper function. 
-func TestGetVerboseBuildInfo(t *testing.T) { - tests := []struct { - name string - info *debug.BuildInfo - expectedContains []string - expectedNotEmpty bool - }{ - { - name: "build information is complete", - info: &debug.BuildInfo{ - Main: debug.Module{ - Version: "v1.0.0", - }, - GoVersion: "go1.21.5", - Settings: []debug.BuildSetting{ - {Key: "vcs.version", Value: "abc123def456"}, - {Key: "vcs.time", Value: "2025-12-28T10:00:00Z"}, - {Key: "vcs.modified", Value: "false"}, - {Key: "GOOS", Value: "linux"}, - {Key: "GOARCH", Value: "amd64"}, - }, - }, - expectedContains: []string{ - apicommonconstants.OperatorName + " Build Information:", - "Version: v1.0.0", - "Go version: go1.21.5", - "OS/Arch: linux/amd64", - "Revision (commit hash): abc123def456", - "Commit Time: 2025-12-28T10:00:00Z", - "Modified (dirty): false", - }, - expectedNotEmpty: true, - }, - { - name: "build information is missing some settings", - info: &debug.BuildInfo{ - Main: debug.Module{ - Version: "v0.1.0-dev", - }, - GoVersion: "go1.22.0", - Settings: []debug.BuildSetting{ - {Key: "GOOS", Value: "darwin"}, - {Key: "GOARCH", Value: "arm64"}, - }, - }, - expectedContains: []string{ - apicommonconstants.OperatorName + " Build Information:", - "Version: v0.1.0-dev", - "Go version: go1.22.0", - "OS/Arch: darwin/arm64", - "", - "", - "", - }, - expectedNotEmpty: true, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - result := getVerboseBuildInfo(tt.info) - - if tt.expectedNotEmpty { - assert.NotEmpty(t, result) - } - - for _, expected := range tt.expectedContains { - assert.Contains(t, result, expected, - "result should contain '%s'", expected) - } - }) - } -} diff --git a/operator/cmd/main.go b/operator/cmd/main.go index 1418e8c48..8d38eeb98 100644 --- a/operator/cmd/main.go +++ b/operator/cmd/main.go @@ -33,10 +33,10 @@ import ( "github.com/ai-dynamo/grove/operator/internal/controller/cert" grovelogger "github.com/ai-dynamo/grove/operator/internal/logger" groveversion "github.com/ai-dynamo/grove/operator/internal/version" - "sigs.k8s.io/controller-runtime/pkg/client" "github.com/spf13/pflag" ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" ) var ( @@ -55,7 +55,6 @@ func main() { _, _ = fmt.Fprintf(io.Writer(os.Stdout), "%s %v\n", apicommonconstants.OperatorName, groveInfo) os.Exit(cli.ExitSuccess) } - //time.Sleep(20 * time.Minute) operatorConfig, err := launchOpts.LoadAndValidateOperatorConfig() if err != nil { logger.Error(err, "failed to load operator config") @@ -68,7 +67,7 @@ func main() { mgr, err := grovectrl.CreateManager(operatorConfig) if err != nil { logger.Error(err, "failed to create grove controller manager") - os.Exit(1) + handleErrorAndExit(err, cli.ExitErrInitializeManager) } ctx := ctrl.SetupSignalHandler() @@ -80,18 +79,18 @@ func main() { // handling of scheduler specified resources will be delegated to the backend scheduler controller. 
if err = synchronizeTopology(ctx, mgr, operatorConfig); err != nil { logger.Error(err, "failed to synchronize cluster topology") - os.Exit(cli.ExitErrSynchronizeTopology) + handleErrorAndExit(err, cli.ExitErrSynchronizeTopology) } webhookCertsReadyCh := make(chan struct{}) if err = cert.ManageWebhookCerts(mgr, operatorConfig.Server.Webhooks.ServerCertDir, operatorConfig.Authorizer.Enabled, webhookCertsReadyCh); err != nil { logger.Error(err, "failed to setup cert rotation") - os.Exit(cli.ExitErrInitializeManager) + handleErrorAndExit(err, cli.ExitErrInitializeManager) } if err = grovectrl.SetupHealthAndReadinessEndpoints(mgr, webhookCertsReadyCh); err != nil { logger.Error(err, "failed to set up health and readiness for grove controller manager") - os.Exit(cli.ExitErrInitializeManager) + handleErrorAndExit(err, cli.ExitErrInitializeManager) } // Certificates need to be generated before the webhooks are started, which can only happen once the manager is started. @@ -99,14 +98,14 @@ func main() { go func() { if err = grovectrl.RegisterControllersAndWebhooks(mgr, logger, operatorConfig, webhookCertsReadyCh); err != nil { logger.Error(err, "failed to initialize grove controller manager") - os.Exit(cli.ExitErrInitializeManager) + handleErrorAndExit(err, cli.ExitErrInitializeManager) } }() logger.Info("Starting manager") if err = mgr.Start(ctx); err != nil { logger.Error(err, "Error starting controller manager") - os.Exit(cli.ExitErrStart) + handleErrorAndExit(err, cli.ExitErrStart) } } diff --git a/operator/initc/cmd/opts/options.go b/operator/initc/cmd/opts/options.go index 836022802..76125c96d 100644 --- a/operator/initc/cmd/opts/options.go +++ b/operator/initc/cmd/opts/options.go @@ -23,6 +23,7 @@ import ( grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" groveerr "github.com/ai-dynamo/grove/operator/internal/errors" + "github.com/spf13/pflag" ) diff --git a/operator/internal/utils/ioutil/ioutil_test.go b/operator/internal/utils/ioutil/ioutil_test.go index f466bef8a..c3831531c 100644 --- a/operator/internal/utils/ioutil/ioutil_test.go +++ b/operator/internal/utils/ioutil/ioutil_test.go @@ -60,7 +60,7 @@ func TestCloseQuietly(t *testing.T) { name: "closer is nil", closer: nil, expectErr: false, - validate: func(_ *testing.T, closer io.Closer) { + validate: func(_ *testing.T, _ io.Closer) { // No panic should occur, nothing to validate }, }, diff --git a/operator/internal/version/version.go b/operator/internal/version/version.go index c9ca21e91..e7ba705b5 100644 --- a/operator/internal/version/version.go +++ b/operator/internal/version/version.go @@ -60,8 +60,8 @@ func New() GroveInfo { } } -// String returns the version information. -func (g GroveInfo) String() string { +// Version returns the version information for Grove operator. +func (g GroveInfo) Version() string { return g.GitVersion } From 4a070735bc8cf0dede8cef8c451654c8925019ba Mon Sep 17 00:00:00 2001 From: Madhav Bhargava Date: Mon, 29 Dec 2025 14:40:38 +0530 Subject: [PATCH 19/23] LeaderElection never worked as it was never enabled due to missing JSON tag. Fixing that uncovered missing role and role binding. 
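For context, a sketch of the shape of the one-line fix described above (the type and field names are assumed for illustration, not verified against the repo). With the strict, case-sensitive decoding typically used for component configs, an untagged field is bound to its Go name ("LeaderElection"), so the "leaderElection" key in the config file never reaches it:

    // Before: no JSON tag, so strict decoding of the operator config never
    // populated the field and leader election stayed at its zero value:
    //   LeaderElection LeaderElectionConfiguration
    // After: the tag binds the field to the key actually used in the file:
    LeaderElection LeaderElectionConfiguration `json:"leaderElection,omitempty"`
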
This commit adds the missing role and rolebinding allowing Grove operate on leases Signed-off-by: Madhav Bhargava --- operator/charts/templates/_helpers.tpl | 16 ++++++++++- .../charts/templates/leaderelection-role.yaml | 27 +++++++++++++++++++ .../templates/leaderelection-rolebinding.yaml | 15 +++++++++++ 3 files changed, 57 insertions(+), 1 deletion(-) create mode 100644 operator/charts/templates/leaderelection-role.yaml create mode 100644 operator/charts/templates/leaderelection-rolebinding.yaml diff --git a/operator/charts/templates/_helpers.tpl b/operator/charts/templates/_helpers.tpl index 170a61e9f..3399ec233 100644 --- a/operator/charts/templates/_helpers.tpl +++ b/operator/charts/templates/_helpers.tpl @@ -154,4 +154,18 @@ release: "{{ .Release.Name }}" {{- range $key, $val := .Values.webhookServerSecret.labels }} {{ $key }}: {{ $val }} {{- end }} -{{- end -}} \ No newline at end of file +{{- end -}} + +{{- define "operator.lease.role.labels" -}} +{{- include "common.chart.labels" . }} +{{- range $key, $val := .Values.leaseRole.labels }} +{{ $key }}: {{ $val }} +{{- end }} +{{- end -}} + +{{- define "operator.lease.rolebinding.labels" -}} +{{- include "common.chart.labels" . }} +{{- range $key, $val := .Values.leaseRoleBinding.labels }} +{{ $key }}: {{ $val }} +{{- end }} +{{- end -}} diff --git a/operator/charts/templates/leaderelection-role.yaml b/operator/charts/templates/leaderelection-role.yaml new file mode 100644 index 000000000..32524c1e6 --- /dev/null +++ b/operator/charts/templates/leaderelection-role.yaml @@ -0,0 +1,27 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + labels: +{{- include "operator.lease.role.labels" . | nindent 4 }} + name: {{ required ".Values.leaseRole.name is required" .Values.leaseRole.name }} + namespace: '{{ .Release.Namespace }}' +rules: + - apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - "" + resources: + - events + verbs: + - create + - patch \ No newline at end of file diff --git a/operator/charts/templates/leaderelection-rolebinding.yaml b/operator/charts/templates/leaderelection-rolebinding.yaml new file mode 100644 index 000000000..f29f8a0eb --- /dev/null +++ b/operator/charts/templates/leaderelection-rolebinding.yaml @@ -0,0 +1,15 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: {{ required ".Values.leaseRoleBinding.name is required" .Values.leaseRoleBinding.name }} + labels: +{{- include "operator.lease.rolebinding.labels" . 
| nindent 4 }} + namespace: '{{ .Release.Namespace }}' +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: {{ required ".Values.leaseRole.name is required" .Values.leaseRole.name }} +subjects: + - kind: ServiceAccount + name: {{ required ".Values.serviceAccount.name is required" .Values.serviceAccount.name }} + namespace: '{{ .Release.Namespace }}' \ No newline at end of file From da9bf117c7c09e6bad11d4a06e8db40bf4c26091 Mon Sep 17 00:00:00 2001 From: Madhav Bhargava Date: Mon, 29 Dec 2025 15:16:24 +0530 Subject: [PATCH 20/23] reverted change manager.go->RegisterControllersAndWebhooks to make the test run, to be changed later Signed-off-by: Madhav Bhargava --- operator/internal/controller/manager.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/operator/internal/controller/manager.go b/operator/internal/controller/manager.go index 28e2833ef..0ff633593 100644 --- a/operator/internal/controller/manager.go +++ b/operator/internal/controller/manager.go @@ -56,13 +56,13 @@ func CreateManager(operatorCfg *configv1alpha1.OperatorConfiguration) (ctrl.Mana // RegisterControllersAndWebhooks adds all the controllers and webhooks to the controller-manager using the passed in Config. func RegisterControllersAndWebhooks(mgr ctrl.Manager, logger logr.Logger, operatorCfg *configv1alpha1.OperatorConfiguration, certsReady chan struct{}) error { - // Controllers will not work unless the webhoooks are fully configured and operational. + // Controllers will not work unless the webhooks are fully configured and operational. // For webhooks to work cert-controller should finish its work of generating and injecting certificates. - cert.WaitTillWebhookCertsReady(logger, certsReady) - if err := RegisterControllers(mgr, operatorCfg.Controllers); err != nil { + waitTillWebhookCertsReady(logger, certsReady) + if err := registerControllersWithMgr(mgr, operatorCfg.Controllers); err != nil { return err } - if err := webhook.Register(mgr, operatorCfg.Authorizer); err != nil { + if err := registerWebhooksWithMgr(mgr, operatorCfg.Authorizer); err != nil { return err } return nil From 0d66cc7867d213cd776cc3fcab6ca3f747b74bf2 Mon Sep 17 00:00:00 2001 From: Madhav Bhargava Date: Mon, 29 Dec 2025 18:07:48 +0530 Subject: [PATCH 21/23] removed LD_FLAGS from init container and removed programName as its no longer set Signed-off-by: Madhav Bhargava --- hack/ld-flags.sh | 1 - operator/e2e/setup/k8s_clusters.go | 2 +- operator/skaffold.yaml | 2 -- 3 files changed, 1 insertion(+), 4 deletions(-) diff --git a/hack/ld-flags.sh b/hack/ld-flags.sh index 6eed0ab8f..f5f86d2f8 100644 --- a/hack/ld-flags.sh +++ b/hack/ld-flags.sh @@ -40,7 +40,6 @@ function build_ld_flags() { FLAGS="-X $PACKAGE_PATH/version.gitCommit=$(git rev-parse --verify HEAD) -X $PACKAGE_PATH/version.gitTreeState=$tree_state -X $PACKAGE_PATH/version.buildDate=$build_date - -X $PACKAGE_PATH/version.programName=$PROGRAM_NAME" # The k8s.component-base/version.gitVersion can not be set to the version of grove # due to the error: "emulation version 1.33 is not between [1.31, 0.1.0-dev]". 
diff --git a/operator/e2e/setup/k8s_clusters.go b/operator/e2e/setup/k8s_clusters.go index df98843ef..dfa45dcf7 100644 --- a/operator/e2e/setup/k8s_clusters.go +++ b/operator/e2e/setup/k8s_clusters.go @@ -1031,7 +1031,7 @@ func buildLDFlagsForE2E() string { // Build the ldflags string // Note: The module path changed from github.com/NVIDIA to github.com/ai-dynamo - ldflags := fmt.Sprintf("-X github.com/ai-dynamo/grove/operator/internal/version.gitCommit=%s -X github.com/ai-dynamo/grove/operator/internal/version.gitTreeState=%s -X github.com/ai-dynamo/grove/operator/internal/version.buildDate=%s -X github.com/ai-dynamo/grove/operator/internal/version.gitVersion=E2E_TESTS -X github.com/ai-dynamo/grove/operator/internal/version.programName=grove-operator", + ldflags := fmt.Sprintf("-X github.com/ai-dynamo/grove/operator/internal/version.gitCommit=%s -X github.com/ai-dynamo/grove/operator/internal/version.gitTreeState=%s -X github.com/ai-dynamo/grove/operator/internal/version.buildDate=%s -X github.com/ai-dynamo/grove/operator/internal/version.gitVersion=E2E_TESTS", gitCommit, treeState, buildDate) return ldflags diff --git a/operator/skaffold.yaml b/operator/skaffold.yaml index 2f58e72af..a7a5e73bf 100644 --- a/operator/skaffold.yaml +++ b/operator/skaffold.yaml @@ -33,8 +33,6 @@ build: - VERSION flags: - -v - ldflags: - - '{{.LD_FLAGS}}' main: ./initc/cmd deploy: helm: # see https://skaffold.dev/docs/deployers/helm/ From 5a4569f63b217b216e9a2880e4b6c895aac06beb Mon Sep 17 00:00:00 2001 From: Madhav Bhargava Date: Mon, 29 Dec 2025 18:09:34 +0530 Subject: [PATCH 22/23] fixed ld-flags.sh Signed-off-by: Madhav Bhargava --- hack/ld-flags.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hack/ld-flags.sh b/hack/ld-flags.sh index f5f86d2f8..387eb2794 100644 --- a/hack/ld-flags.sh +++ b/hack/ld-flags.sh @@ -39,7 +39,7 @@ function build_ld_flags() { FLAGS="-X $PACKAGE_PATH/version.gitCommit=$(git rev-parse --verify HEAD) -X $PACKAGE_PATH/version.gitTreeState=$tree_state - -X $PACKAGE_PATH/version.buildDate=$build_date + -X $PACKAGE_PATH/version.buildDate=$build_date" # The k8s.component-base/version.gitVersion can not be set to the version of grove # due to the error: "emulation version 1.33 is not between [1.31, 0.1.0-dev]". 
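As background for the ld-flags changes in the two patches above: the Go linker's -X flag overwrites package-level string variables at link time, which is how gitCommit, gitTreeState, and buildDate receive their real values. A minimal, self-contained illustration under assumed names (the real variables live in the operator's internal version package):

    package main

    import "fmt"

    // Defaults compiled into the binary; overridden at link time, e.g.:
    //   go build -ldflags "-X main.gitCommit=$(git rev-parse HEAD) -X main.buildDate=$(date -u +'%Y-%m-%dT%H:%M:%SZ')"
    var (
        gitCommit = "unknown"
        buildDate = "1970-01-01T00:00:00Z"
    )

    func main() {
        fmt.Printf("commit=%s buildDate=%s\n", gitCommit, buildDate)
    }
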
From c63e4d2e433471ee611426786a79582c43ae9201 Mon Sep 17 00:00:00 2001 From: Ron Kahn Date: Mon, 29 Dec 2025 22:43:47 +0200 Subject: [PATCH 23/23] feat: add timeout to skaffold command and define lease role and role binding in values.yaml Signed-off-by: Ron Kahn --- operator/charts/values.yaml | 14 ++++++++++++++ operator/e2e/setup/skaffold.go | 7 ++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/operator/charts/values.yaml b/operator/charts/values.yaml index 0f3528ea9..c7b03db2c 100644 --- a/operator/charts/values.yaml +++ b/operator/charts/values.yaml @@ -128,6 +128,20 @@ clusterRoleBinding: app.kubernetes.io/name: grove-operator app.kubernetes.io/part-of: grove +leaseRole: + name: grove:system:grove-operator-lease + labels: + app.kubernetes.io/component: operator-lease-role + app.kubernetes.io/name: grove-operator + app.kubernetes.io/part-of: grove + +leaseRoleBinding: + name: grove:system:grove-operator-lease + labels: + app.kubernetes.io/component: operator-lease-rolebinding + app.kubernetes.io/name: grove-operator + app.kubernetes.io/part-of: grove + webhooks: podCliqueSetValidationWebhook: labels: diff --git a/operator/e2e/setup/skaffold.go b/operator/e2e/setup/skaffold.go index bc61f40c4..782a29d3a 100644 --- a/operator/e2e/setup/skaffold.go +++ b/operator/e2e/setup/skaffold.go @@ -25,6 +25,7 @@ import ( "os/exec" "path/filepath" "strings" + "time" "github.com/ai-dynamo/grove/operator/e2e/utils" "k8s.io/client-go/rest" @@ -144,7 +145,11 @@ func runSkaffoldBuild(ctx context.Context, absSkaffoldPath, skaffoldDir, kubecon // Add the skaffold.yaml path args = append(args, "-f", absSkaffoldPath) - cmd := exec.CommandContext(ctx, "skaffold", args...) + // Add timeout to prevent indefinite hangs (normal builds should complete in <10 minutes) + buildCtx, cancel := context.WithTimeout(ctx, 15*time.Minute) + defer cancel() + + cmd := exec.CommandContext(buildCtx, "skaffold", args...) cmd.Dir = skaffoldDir // Set up environment variables