Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
3d594cc
fix: always sort bindings in the scheduler to ensure deterministic de…
michaelawyu Dec 10, 2025
0552f2e
feat: add status back-reporting support (1/, work applier side) (#327)
michaelawyu Dec 10, 2025
ed6177d
feat: stopped update run implementation (#364)
britaniar Dec 11, 2025
1e1765e
chore: fix the collect-logs.sh (#355)
zhiying-lin Dec 12, 2025
2d3d18f
feat: webhook readiness wait for informer cache sync (#367)
weng271190436 Dec 13, 2025
c7b3e8a
chore: bump step-security/harden-runner from 2.13.3 to 2.14.0 (#377)
dependabot[bot] Dec 15, 2025
7bbc6fe
chore: bump actions/upload-artifact from 5 to 6 (#376)
dependabot[bot] Dec 15, 2025
6f3e972
fix: use the mc generation when copying the agentStatus (#378)
zhiying-lin Dec 16, 2025
6511b20
fix: do not emit unexpectedError when getting objs with dynamic clien…
jwtty Dec 18, 2025
22f8aa5
feat: add stopping stage update run implementation (#374)
britaniar Dec 18, 2025
71b330f
fix: fix update run name CEL (#384)
britaniar Dec 18, 2025
f81a13d
fix: bug fixes in rollout controller and work-generator (#379)
zhiying-lin Dec 18, 2025
75d0be7
feat: add status back-reporting support (2/, new controller) (#329)
michaelawyu Dec 22, 2025
aa55878
feat: add implementation for scheduler work queues with batch process…
michaelawyu Dec 22, 2025
a24443c
test: fix flaky report diff e2e (#391)
jwtty Dec 23, 2025
a9d431c
fix: use the correct client in work applier ITs and set a longer time…
michaelawyu Dec 23, 2025
30462ce
fix: add CEL to block user from changing External rollout strategy ty…
britaniar Dec 24, 2025
9291964
feat: skip work applier status updates if possible (#375)
michaelawyu Dec 29, 2025
f7b7af4
ci: update github runners to oci gh arc runners (#381)
koksay Dec 30, 2025
7247a4b
feat: decouple informer cache population and event handling (#380)
weng271190436 Jan 2, 2026
7464dac
fix: fix one case of from cache is not set correctly (#388)
ryanzhang-oss Jan 5, 2026
31621d8
fix: check the caData field too (#386)
ryanzhang-oss Jan 5, 2026
2f1911e
test: convert updateRun test utils public (#397)
Arvindthiru Jan 8, 2026
c0b45ac
docs: add help documentation for 13 missing Makefile targets (#401)
Copilot Jan 8, 2026
41fc0c1
test: update stop update run integration test (#387)
britaniar Jan 8, 2026
004c972
fix: add cluster inventory group to CRD validation webhook (#398)
jwtty Jan 9, 2026
40a0bc3
fix: return unexpected error if no. of updating cluster > maxConcurre…
Arvindthiru Jan 9, 2026
52aa16c
fix: do not register event handler for the same GVR multiple times (#…
weng271190436 Jan 12, 2026
94586a4
Merge remote-tracking branch 'cncf/main' into janBackport
britaniar Jan 12, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ jobs:

- name: Upload logs
if: always()
uses: actions/upload-artifact@v5
uses: actions/upload-artifact@v6
with:
name: e2e-logs-${{ matrix.customized-settings }}
path: test/e2e/logs-${{ matrix.customized-settings }}/
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/codespell.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Harden Runner
uses: step-security/harden-runner@df199fb7be9f65074067a9eb93f12bb4c5547cf2 # v2.13.3
uses: step-security/harden-runner@20cf305ff2072d973412fa9b1e3a4f227bda3c76 # v2.14.0
with:
egress-policy: audit

Expand Down
50 changes: 25 additions & 25 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ $(PROTOC):
unzip $(TOOLS_BIN_DIR)/protoc.zip -d $(TOOLS_BIN_DIR)/protoc_tmp && mv $(TOOLS_BIN_DIR)/protoc_tmp/bin/protoc $(PROTOC) && rm -rf $(TOOLS_BIN_DIR)/protoc.zip $(TOOLS_BIN_DIR)/protoc_tmp

.PHONY: help
help: ## Display this help.
help: ## Display this help
@awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m<target>\033[0m\n"} /^[a-zA-Z_0-9-]+:.*?##/ { printf " \033[36m%-15s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST)


Expand All @@ -150,7 +150,7 @@ help: ## Display this help.
## --------------------------------------

.PHONY: lint
lint: $(GOLANGCI_LINT)
lint: $(GOLANGCI_LINT) ## Run fast linting
$(GOLANGCI_LINT) run -v

.PHONY: lint-full
Expand All @@ -161,36 +161,36 @@ lint-full: $(GOLANGCI_LINT) ## Run slower linters to detect possible issues
## Development
## --------------------------------------

staticcheck: $(STATICCHECK)
staticcheck: $(STATICCHECK) ## Run static analysis
$(STATICCHECK) ./...

.PHONY: fmt
fmt: $(GOIMPORTS) ## Run go fmt against code.
fmt: $(GOIMPORTS) ## Run go fmt against code
go fmt ./...
$(GOIMPORTS) -local go.goms.io/fleet -w $$(go list -f {{.Dir}} ./...)

.PHONY: vet
vet: ## Run go vet against code.
vet: ## Run go vet against code
go vet ./...

## --------------------------------------
## test
## --------------------------------------

.PHONY: test
test: manifests generate fmt vet local-unit-test integration-test ## Run tests.
test: manifests generate fmt vet local-unit-test integration-test ## Run unit tests and integration tests

##
# Set up the timeout parameters as some of the tests (rollout controller) lengths have exceeded the default 10 minute mark.
# TO-DO (chenyu1): enable parallelization for single package integration tests.
.PHONY: local-unit-test
local-unit-test: $(ENVTEST) ## Run tests.
local-unit-test: $(ENVTEST) ## Run unit tests
export CGO_ENABLED=1 && \
export KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) -p path)" && \
go test `go list ./pkg/... ./cmd/...` -race -coverpkg=./... -coverprofile=ut-coverage.xml -covermode=atomic -v -timeout=20m
go test `go list ./pkg/... ./cmd/...` -race -coverpkg=./... -coverprofile=ut-coverage.xml -covermode=atomic -v -timeout=30m

.PHONY: integration-test
integration-test: $(ENVTEST) ## Run tests.
integration-test: $(ENVTEST) ## Run integration tests
export CGO_ENABLED=1 && \
export KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) -p path)" && \
ginkgo -v -p --race --cover --coverpkg=./pkg/scheduler/... ./test/scheduler && \
Expand All @@ -202,14 +202,14 @@ integration-test: $(ENVTEST) ## Run tests.
LABEL_FILTER ?= !custom

.PHONY: e2e-tests
e2e-tests: setup-clusters
e2e-tests: setup-clusters ## Run E2E tests
cd ./test/e2e && ginkgo --timeout=70m --label-filter="$(LABEL_FILTER)" -v -p .

e2e-tests-custom: setup-clusters
e2e-tests-custom: setup-clusters ## Run custom E2E tests with labels
cd ./test/e2e && ginkgo --label-filter="custom" -v -p .

.PHONY: setup-clusters
setup-clusters:
setup-clusters: ## Set up Kind clusters for E2E testing
cd ./test/e2e && chmod +x ./setup.sh && ./setup.sh $(MEMBER_CLUSTER_COUNT)

.PHONY: collect-e2e-logs
Expand All @@ -218,7 +218,7 @@ collect-e2e-logs: ## Collect logs from hub and member agent pods after e2e tests

## reviewable
.PHONY: reviewable
reviewable: fmt vet lint staticcheck
reviewable: fmt vet lint staticcheck ## Run all quality checks before PR
go mod tidy

## --------------------------------------
Expand All @@ -230,7 +230,7 @@ CRD_OPTIONS ?= "crd"

# Generate manifests e.g. CRD, RBAC etc.
.PHONY: manifests
manifests: $(CONTROLLER_GEN)
manifests: $(CONTROLLER_GEN) ## Generate CRDs and manifests
$(CONTROLLER_GEN) \
$(CRD_OPTIONS) rbac:roleName=manager-role webhook paths="./apis/..." output:crd:artifacts:config=config/crd/bases

Expand All @@ -243,7 +243,7 @@ protos: $(PROTOC_GEN_GO) $(PROTOC_GEN_GO_GRPC) $(PROTOC_GEN_GRPC_GATEWAY) $(PROT
apis/protos/azure/compute/v1/vmsizerecommender.proto

# Generate code
generate: $(CONTROLLER_GEN) protos
generate: $(CONTROLLER_GEN) protos ## Generate deep copy methods
$(CONTROLLER_GEN) \
object:headerFile="hack/boilerplate.go.txt" paths="./..."

Expand All @@ -252,17 +252,17 @@ generate: $(CONTROLLER_GEN) protos
## --------------------------------------

.PHONY: build
build: generate fmt vet ## Build agent binaries.
build: generate fmt vet ## Build agent binaries
go build -o bin/hubagent cmd/hubagent/main.go
go build -o bin/memberagent cmd/memberagent/main.go
go build -o bin/crdinstaller cmd/crdinstaller/main.go

.PHONY: run-hubagent
run-hubagent: manifests generate fmt vet ## Run a controllers from your host.
run-hubagent: manifests generate fmt vet ## Run hub-agent from your host
go run ./cmd/hubagent/main.go

.PHONY: run-memberagent
run-memberagent: manifests generate fmt vet ## Run a controllers from your host.
run-memberagent: manifests generate fmt vet ## Run member-agent from your host
go run ./cmd/memberagent/main.go

.PHONY: run-crdinstaller
Expand All @@ -279,7 +279,7 @@ QEMU_VERSION ?= 7.2.0-1
BUILDKIT_VERSION ?= v0.18.1

.PHONY: push
push:
push: ## Build and push all Docker images
$(MAKE) OUTPUT_TYPE="type=registry" docker-build-hub-agent docker-build-member-agent docker-build-refresh-token docker-build-crd-installer

# By default, docker buildx create will pull image moby/buildkit:buildx-stable-1 and hit the too many requests error
Expand All @@ -289,7 +289,7 @@ push:
# we keep the original setup if the build target is x86_64 platforms (default) for compatibility reasons, but will switch to
# a more general setup for non-x86_64 hosts.
#
# On some systems the emulation setup might not work at all (e.g., macOS on Apple Silicon -> Rosetta 2 will be used
# On some systems the emulation setup might not work at all (e.g., macOS on Apple Silicon -> Rosetta 2 will be used
# by Docker Desktop as the default emulation option for AMD64 on ARM64 container compatibility).
.PHONY: docker-buildx-builder
# Note (chenyu1): the step below sets up emulation for building/running non-native binaries on the host. The original
Expand All @@ -313,7 +313,7 @@ docker-buildx-builder:
fi

.PHONY: docker-build-hub-agent
docker-build-hub-agent: docker-buildx-builder
docker-build-hub-agent: docker-buildx-builder ## Build hub-agent image
docker buildx build \
--file docker/$(HUB_AGENT_IMAGE_NAME).Dockerfile \
--output=$(OUTPUT_TYPE) \
Expand All @@ -325,7 +325,7 @@ docker-build-hub-agent: docker-buildx-builder
--build-arg GOOS=$(TARGET_OS) .

.PHONY: docker-build-member-agent
docker-build-member-agent: docker-buildx-builder
docker-build-member-agent: docker-buildx-builder ## Build member-agent image
docker buildx build \
--file docker/$(MEMBER_AGENT_IMAGE_NAME).Dockerfile \
--output=$(OUTPUT_TYPE) \
Expand All @@ -337,7 +337,7 @@ docker-build-member-agent: docker-buildx-builder
--build-arg GOOS=$(TARGET_OS) .

.PHONY: docker-build-refresh-token
docker-build-refresh-token: docker-buildx-builder
docker-build-refresh-token: docker-buildx-builder ## Build refresh-token image
docker buildx build \
--file docker/$(REFRESH_TOKEN_IMAGE_NAME).Dockerfile \
--output=$(OUTPUT_TYPE) \
Expand Down Expand Up @@ -366,7 +366,7 @@ helm-package-arc-member-cluster-agents:
envsubst < charts/member-agent-arc/values.yaml > charts/member-agent-arc/values.yaml.tmp && \
mv charts/member-agent-arc/values.yaml.tmp charts/member-agent-arc/values.yaml && \
helm package charts/member-agent-arc/ --version $(ARC_MEMBER_AGENT_HELMCHART_VERSION)

helm push $(ARC_MEMBER_AGENT_HELMCHART_NAME)-$(ARC_MEMBER_AGENT_HELMCHART_VERSION).tgz oci://$(REGISTRY)

## -----------------------------------
Expand All @@ -379,5 +379,5 @@ clean-bin: ## Remove all generated binaries
rm -rf ./bin

.PHONY: clean-e2e-tests
clean-e2e-tests:
clean-e2e-tests: ## Clean up E2E test clusters
cd ./test/e2e && chmod +x ./stop.sh && ./stop.sh $(MEMBER_CLUSTER_COUNT)
2 changes: 1 addition & 1 deletion apis/placement/v1beta1/clusterresourceplacement_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -524,9 +524,9 @@ const (
type RolloutStrategy struct {
// Type of rollout. The only supported types are "RollingUpdate" and "External".
// Default is "RollingUpdate".
// +kubebuilder:validation:Optional
// +kubebuilder:default=RollingUpdate
// +kubebuilder:validation:Enum=RollingUpdate;External
// +kubebuilder:validation:XValidation:rule="!(self != 'External' && oldSelf == 'External')",message="cannot change rollout strategy type from 'External' to other types"
Type RolloutStrategyType `json:"type,omitempty"`

// Rolling update config params. Present only if RolloutStrategyType = RollingUpdate.
Expand Down
9 changes: 5 additions & 4 deletions apis/placement/v1beta1/stageupdate_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ type UpdateRunObjList interface {
// +kubebuilder:printcolumn:JSONPath=`.status.conditions[?(@.type=="Succeeded")].status`,name="Succeeded",type=string
// +kubebuilder:printcolumn:JSONPath=`.metadata.creationTimestamp`,name="Age",type=date
// +kubebuilder:printcolumn:JSONPath=`.spec.stagedRolloutStrategyName`,name="Strategy",priority=1,type=string
// +kubebuilder:validation:XValidation:rule="size(self.metadata.name) < 128",message="metadata.name max length is 127"
// +kubebuilder:validation:XValidation:rule="size(self.metadata.name) < 64",message="metadata.name max length is 63"

// ClusterStagedUpdateRun represents a stage by stage update process that applies ClusterResourcePlacement
// selected resources to specified clusters.
Expand Down Expand Up @@ -427,7 +427,7 @@ const (
// Its condition status can be one of the following:
// - "True": The staged update run is making progress.
// - "False": The staged update run is waiting/paused/abandoned.
// - "Unknown" means it is unknown.
// - "Unknown": The staged update run is in a transitioning state.
StagedUpdateRunConditionProgressing StagedUpdateRunConditionType = "Progressing"

// StagedUpdateRunConditionSucceeded indicates whether the staged update run is completed successfully.
Expand Down Expand Up @@ -489,7 +489,8 @@ const (
// StageUpdatingConditionProgressing indicates whether the stage updating is making progress.
// Its condition status can be one of the following:
// - "True": The stage updating is making progress.
// - "False": The stage updating is waiting/pausing.
// - "False": The stage updating is waiting.
// - "Unknown": The stage updating is in a transitioning state.
StageUpdatingConditionProgressing StageUpdatingConditionType = "Progressing"

// StageUpdatingConditionSucceeded indicates whether the stage updating is completed successfully.
Expand Down Expand Up @@ -790,7 +791,7 @@ func (c *ClusterApprovalRequestList) GetApprovalRequestObjs() []ApprovalRequestO
// +kubebuilder:printcolumn:JSONPath=`.status.conditions[?(@.type=="Succeeded")].status`,name="Succeeded",type=string
// +kubebuilder:printcolumn:JSONPath=`.metadata.creationTimestamp`,name="Age",type=date
// +kubebuilder:printcolumn:JSONPath=`.spec.stagedRolloutStrategyName`,name="Strategy",priority=1,type=string
// +kubebuilder:validation:XValidation:rule="size(self.metadata.name) < 128",message="metadata.name max length is 127"
// +kubebuilder:validation:XValidation:rule="size(self.metadata.name) < 64",message="metadata.name max length is 63"

// StagedUpdateRun represents a stage by stage update process that applies ResourcePlacement
// selected resources to specified clusters.
Expand Down
14 changes: 13 additions & 1 deletion cmd/hubagent/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ import (
"go.goms.io/fleet/cmd/hubagent/options"
"go.goms.io/fleet/cmd/hubagent/workload"
mcv1beta1 "go.goms.io/fleet/pkg/controllers/membercluster/v1beta1"
readiness "go.goms.io/fleet/pkg/utils/informer/readiness"
"go.goms.io/fleet/pkg/utils/validator"
"go.goms.io/fleet/pkg/webhook"
"go.goms.io/fleet/pkg/webhook/managedresource"
// +kubebuilder:scaffold:imports
Expand Down Expand Up @@ -166,7 +168,17 @@ func main() {

ctx := ctrl.SetupSignalHandler()
if err := workload.SetupControllers(ctx, &wg, mgr, config, opts); err != nil {
klog.ErrorS(err, "unable to set up ready check")
klog.ErrorS(err, "unable to set up controllers")
exitWithErrorFunc()
}

// Add readiness check for dynamic informer cache AFTER controllers are set up.
// This ensures the discovery cache is populated before the hub agent is marked ready,
// which is critical for all controllers that rely on dynamic resource discovery.
// AddReadyzCheck adds an additional readiness check rather than replacing the one registered earlier, provided the name is different.
// Both registered checks need to pass for the manager to be considered ready.
if err := mgr.AddReadyzCheck("informer-cache", readiness.InformerReadinessChecker(validator.ResourceInformer)); err != nil {
klog.ErrorS(err, "unable to set up informer cache readiness check")
exitWithErrorFunc()
}

Expand Down
18 changes: 17 additions & 1 deletion cmd/hubagent/workload/setup.go
Original file line number Diff line number Diff line change
Expand Up @@ -376,7 +376,7 @@ func SetupControllers(ctx context.Context, wg *sync.WaitGroup, mgr ctrl.Manager,
}
defaultFramework := framework.NewFramework(schedulerProfile, mgr)
defaultSchedulingQueue := queue.NewSimplePlacementSchedulingQueue(
queue.WithName(schedulerQueueName),
schedulerQueueName, nil,
)
// we use one scheduler for every 10 concurrent placement
defaultScheduler := scheduler.NewScheduler("DefaultScheduler", defaultFramework, defaultSchedulingQueue, mgr,
Expand Down Expand Up @@ -514,7 +514,23 @@ func SetupControllers(ctx context.Context, wg *sync.WaitGroup, mgr ctrl.Manager,
}
resourceChangeController := controller.NewController(resourceChangeControllerName, controller.ClusterWideKeyFunc, rcr.Reconcile, rateLimiter)

// Set up the InformerPopulator that runs on ALL pods (leader and followers)
// This ensures all pods have synced informer caches for webhook validation
klog.Info("Setting up informer populator")
informerPopulator := &resourcewatcher.InformerPopulator{
DiscoveryClient: discoverClient,
RESTMapper: mgr.GetRESTMapper(),
InformerManager: dynamicInformerManager,
ResourceConfig: resourceConfig,
}

if err := mgr.Add(informerPopulator); err != nil {
klog.ErrorS(err, "Failed to setup informer populator")
return err
}

// Set up a runner that starts all the custom controllers we created above
// This runs ONLY on the leader and adds event handlers to the informers created by InformerPopulator
resourceChangeDetector := &resourcewatcher.ChangeDetector{
DiscoveryClient: discoverClient,
RESTMapper: mgr.GetRESTMapper(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2482,6 +2482,10 @@ spec:
- RollingUpdate
- External
type: string
x-kubernetes-validations:
- message: cannot change rollout strategy type from 'External'
to other types
rule: '!(self != ''External'' && oldSelf == ''External'')'
type: object
required:
- resourceSelectors
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2436,8 +2436,8 @@ spec:
- spec
type: object
x-kubernetes-validations:
- message: metadata.name max length is 127
rule: size(self.metadata.name) < 128
- message: metadata.name max length is 63
rule: size(self.metadata.name) < 64
served: true
storage: true
subresources:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -992,6 +992,10 @@ spec:
- RollingUpdate
- External
type: string
x-kubernetes-validations:
- message: cannot change rollout strategy type from 'External'
to other types
rule: '!(self != ''External'' && oldSelf == ''External'')'
type: object
required:
- resourceSelectors
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1356,8 +1356,8 @@ spec:
- spec
type: object
x-kubernetes-validations:
- message: metadata.name max length is 127
rule: size(self.metadata.name) < 128
- message: metadata.name max length is 63
rule: size(self.metadata.name) < 64
served: true
storage: true
subresources:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,6 @@ func (r *Reconciler) fillInClusterStatus(mc *clusterv1beta1.MemberCluster, cp *c
} else {
// throw an alert
_ = controller.NewUnexpectedBehaviorError(fmt.Errorf("cluster certificate authority data not found in member cluster %s status", mc.Name))
cp.Status.AccessProviders[0].Cluster.InsecureSkipTLSVerify = true
}
}

Expand Down
14 changes: 12 additions & 2 deletions pkg/controllers/membercluster/v1beta1/membercluster_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -515,8 +515,18 @@ func (r *Reconciler) syncInternalMemberClusterStatus(imc *clusterv1beta1.Interna
}

// TODO: We didn't handle condition type: clusterv1beta1.ConditionTypeMemberClusterHealthy.
// Copy Agent status.
mc.Status.AgentStatus = imc.Status.AgentStatus
// Copy Agent status and set ObservedGeneration for agent conditions.
if len(imc.Status.AgentStatus) > 0 {
mc.Status.AgentStatus = make([]clusterv1beta1.AgentStatus, len(imc.Status.AgentStatus))
}
for i := range imc.Status.AgentStatus {
mc.Status.AgentStatus[i] = *imc.Status.AgentStatus[i].DeepCopy()
// Set ObservedGeneration for each agent condition.
for j := range mc.Status.AgentStatus[i].Conditions {
mc.Status.AgentStatus[i].Conditions[j].ObservedGeneration = mc.GetGeneration()
}
}

r.aggregateJoinedCondition(mc)
// Copy resource usages.
mc.Status.ResourceUsage = imc.Status.ResourceUsage
Expand Down
Loading
Loading