diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml
index 88393063..17c33b8a 100644
--- a/.github/workflows/e2e.yaml
+++ b/.github/workflows/e2e.yaml
@@ -26,10 +26,10 @@ jobs:
       - name: Checkout
        uses: actions/checkout@v4
 
-      - name: Setup Go 1.24
+      - name: Setup Go 1.25
        uses: actions/setup-go@v5
        with:
-          go-version: '1.24.5'
+          go-version: '1.25.3'
 
      - uses: azure/setup-kubectl@v4
      - uses: azure/setup-helm@v4.2.0
diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
index 1c49c489..d1e192de 100644
--- a/.github/workflows/main.yaml
+++ b/.github/workflows/main.yaml
@@ -20,10 +20,10 @@ jobs:
        with:
          fetch-depth: 0
 
-      - name: Setup Go 1.24
+      - name: Setup Go 1.25
        uses: actions/setup-go@v5
        with:
-          go-version: '1.24.5'
+          go-version: '1.25.3'
 
      - name: Cache Go modules
        uses: actions/cache@v4
@@ -45,6 +45,7 @@
          GOOS: linux
          GOARCH: amd64
          CGO_ENABLED: 0
+          GOEXPERIMENT: synctest
 
      - name: Build Go binary arm64
        run: go build -ldflags "-s -w -X main.GitCommit=$GITHUB_SHA -X main.GitRef=$GITHUB_REF -X main.Version=${RELEASE_TAG:-$VERSION_TAG}" -o bin/castai-cluster-controller-arm64 .
@@ -52,9 +53,12 @@
          GOOS: linux
          GOARCH: arm64
          CGO_ENABLED: 0
+          GOEXPERIMENT: synctest
 
      - name: Test
        run: go test -short -race -timeout 15m ./...
+        env:
+          GOEXPERIMENT: synctest
 
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v2
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 816a1651..6eb6b0f0 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -22,10 +22,10 @@ jobs:
        with:
          fetch-depth: 0
 
-      - name: Setup Go 1.24
+      - name: Setup Go 1.25
        uses: actions/setup-go@v5
        with:
-          go-version: '1.24.5'
+          go-version: '1.25.3'
 
      - name: Cache Go modules
        uses: actions/cache@v4
@@ -43,6 +43,7 @@
          GOOS: linux
          GOARCH: amd64
          CGO_ENABLED: 0
+          GOEXPERIMENT: synctest
 
      - name: Build Go binary arm64
        run: go build -ldflags "-s -w -X main.GitCommit=$GITHUB_SHA -X main.GitRef=$GITHUB_REF -X main.Version=${VERSION_TAG}" -o bin/castai-cluster-controller-arm64 .
@@ -50,9 +51,12 @@
          GOOS: linux
          GOARCH: arm64
          CGO_ENABLED: 0
+          GOEXPERIMENT: synctest
 
      - name: Test
        run: go test -short -race -timeout 15m ./...
+        env:
+          GOEXPERIMENT: synctest
 
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v2
diff --git a/Makefile b/Makefile
index c65ad4dc..2cdc165c 100644
--- a/Makefile
+++ b/Makefile
@@ -24,7 +24,7 @@ $(GOLANGCI_LINT):
 
 ## build: Build the binary for the specified architecture and create a Docker image. Usually this means ARCH=amd64 should be set if running on an ARM machine. Use `go build .` for simple local build.
 build:
-	CGO_ENABLED=0 GOOS=linux GOARCH=$(ARCH) go build -ldflags "-s -w" -o bin/castai-cluster-controller-$(ARCH) .
+	CGO_ENABLED=0 GOOS=linux GOARCH=$(ARCH) GOEXPERIMENT=synctest go build -ldflags "-s -w" -o bin/castai-cluster-controller-$(ARCH) .
 	docker build --platform=linux/$(ARCH) --build-arg TARGETARCH=$(ARCH) -t $(DOCKER_REPOSITORY):$(VERSION) .
 
 push:
@@ -41,7 +41,7 @@ fix: $(GOLANGCI_LINT)
 .PHONY: fix
 
 test:
-	go test ./... -race -parallel=20
+	GOEXPERIMENT=synctest go test ./... -race -parallel=20
 .PHONY: test
 
 generate-e2e-client:
diff --git a/cmd/controller/run.go b/cmd/controller/run.go
index e83c456a..25cb44ca 100644
--- a/cmd/controller/run.go
+++ b/cmd/controller/run.go
@@ -28,6 +28,7 @@ import (
 	"github.com/castai/cluster-controller/internal/actions/csr"
 	"github.com/castai/cluster-controller/internal/castai"
 	"github.com/castai/cluster-controller/internal/config"
+	"github.com/castai/cluster-controller/internal/informer"
 	"github.com/castai/cluster-controller/internal/controller"
 	"github.com/castai/cluster-controller/internal/controller/logexporter"
 	"github.com/castai/cluster-controller/internal/controller/metricexporter"
@@ -131,6 +132,18 @@ func runController(
 
 	log.Infof("running castai-cluster-controller version %v, log-level: %v", binVersion, logger.Level)
 
+	// Create global informer manager if enabled
+	log.Info("initializing global informer manager...")
+	informerManager := informer.NewManager(
+		log,
+		clientset,
+		cfg.Informer.ResyncPeriod,
+	)
+
+	if err := informerManager.Start(ctx); err != nil {
+		return fmt.Errorf("starting informer manager: %w", err)
+	}
+
 	actionsConfig := controller.Config{
 		PollWaitInterval: 5 * time.Second,
 		PollTimeout:      maxRequestTimeout,
@@ -153,11 +166,16 @@ func runController(
 		client,
 		helmClient,
 		healthzAction,
+		informerManager,
 	)
 	defer func() {
 		if err := svc.Close(); err != nil {
 			log.Errorf("failed to close controller service: %v", err)
 		}
+		// Stop informer manager on shutdown
+		if informerManager != nil {
+			informerManager.Stop()
+		}
 	}()
 
 	if cfg.Metrics.ExportEnabled {
diff --git a/cmd/testserver/run.go b/cmd/testserver/run.go
index b2fcc15c..4994874d 100644
--- a/cmd/testserver/run.go
+++ b/cmd/testserver/run.go
@@ -51,7 +51,8 @@ func run(ctx context.Context) error {
 	// Choose scenarios below by adding/removing/etc. instances of scenarios.XXX()
 	// All scenarios in the list run in parallel (but not necessarily at the same time if preparation takes different time).
 	testScenarios := []scenarios.TestScenario{
-		scenarios.CheckNodeDeletedStuck(300, logger),
+		// scenarios.CheckNodeDeletedStuck(300, logger),
+		scenarios.CheckNodeStatus(10000, logger),
 	}
 
 	var wg sync.WaitGroup
diff --git a/go.mod b/go.mod
index e94a55c6..bc595669 100644
--- a/go.mod
+++ b/go.mod
@@ -1,95 +1,81 @@
 module github.com/castai/cluster-controller
 
-go 1.24.6
+go 1.25.3
 
 require (
 	github.com/bombsimon/logrusr/v4 v4.1.0
 	github.com/cenkalti/backoff/v4 v4.3.0
 	github.com/deepmap/oapi-codegen v1.11.0
-	github.com/evanphx/json-patch v5.9.0+incompatible
+	github.com/evanphx/json-patch v5.9.11+incompatible
 	github.com/fsnotify/fsnotify v1.7.0
 	github.com/go-resty/resty/v2 v2.15.3
 	github.com/golang/mock v1.6.0
 	github.com/google/uuid v1.6.0
 	github.com/kelseyhightower/envconfig v1.4.0
-	github.com/prometheus/client_golang v1.21.1
+	github.com/prometheus/client_golang v1.22.0
 	github.com/prometheus/client_model v0.6.1
 	github.com/samber/lo v1.47.0
 	github.com/sirupsen/logrus v1.9.3
-	github.com/spf13/cobra v1.8.1
+	github.com/spf13/cobra v1.9.1
 	github.com/spf13/viper v1.19.0
 	github.com/stretchr/testify v1.10.0
 	go.uber.org/goleak v1.3.0
 	golang.org/x/net v0.42.0
 	golang.org/x/sync v0.16.0
-	helm.sh/helm/v3 v3.17.4
-	k8s.io/api v0.33.2
-	k8s.io/apiextensions-apiserver v0.32.2
-	k8s.io/apimachinery v0.33.2
-	k8s.io/apiserver v0.32.2
-	k8s.io/cli-runtime v0.32.2
-	k8s.io/client-go v0.33.2
-	k8s.io/component-base v0.32.2
+	helm.sh/helm/v3 v3.18.5
+	k8s.io/api v0.33.3
+	k8s.io/apiextensions-apiserver v0.33.3
+	k8s.io/apimachinery v0.33.3
+	k8s.io/apiserver v0.33.3
+	k8s.io/cli-runtime v0.33.3
+	k8s.io/client-go v0.33.3
+	k8s.io/component-base v0.33.3
 	k8s.io/klog/v2 v2.130.1
-	k8s.io/kubectl v0.32.2
+	k8s.io/kubectl v0.33.3
 	sigs.k8s.io/controller-runtime v0.19.0
-	sigs.k8s.io/yaml v1.4.0
+	sigs.k8s.io/yaml v1.5.0
 )
 
 require (
 	dario.cat/mergo v1.0.1 // indirect
 	github.com/AdaLogics/go-fuzz-headers v0.0.0-20240806141605-e8a1dd7889d6 // indirect
-	github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 // indirect
-	github.com/BurntSushi/toml v1.4.0 // indirect
+	github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c // indirect
+	github.com/BurntSushi/toml v1.5.0 // indirect
 	github.com/MakeNowJust/heredoc v1.0.0 // indirect
 	github.com/Masterminds/goutils v1.1.1 // indirect
 	github.com/Masterminds/semver/v3 v3.3.0 // indirect
 	github.com/Masterminds/sprig/v3 v3.3.0 // indirect
 	github.com/Masterminds/squirrel v1.5.4 // indirect
-	github.com/Microsoft/hcsshim v0.12.9 // indirect
 	github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 // indirect
 	github.com/beorn7/perks v1.0.1 // indirect
 	github.com/blang/semver/v4 v4.0.0 // indirect
 	github.com/cespare/xxhash/v2 v2.3.0 // indirect
 	github.com/chai2010/gettext-go v1.0.2 // indirect
-	github.com/containerd/cgroups/v3 v3.0.5 // indirect
-	github.com/containerd/containerd v1.7.27 // indirect
-	github.com/containerd/continuity v0.4.5 // indirect
+	github.com/containerd/containerd v1.7.29 // indirect
 	github.com/containerd/errdefs v1.0.0 // indirect
 	github.com/containerd/log v0.1.0 // indirect
 	github.com/containerd/platforms v0.2.1 // indirect
-	github.com/containerd/typeurl/v2 v2.2.3 // indirect
-	github.com/cyphar/filepath-securejoin v0.3.6 // indirect
+	github.com/cyphar/filepath-securejoin v0.4.1 // indirect
 	github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
-	github.com/distribution/reference v0.6.0 // indirect
-	github.com/docker/cli v25.0.1+incompatible // indirect
-	github.com/docker/distribution v2.8.3+incompatible // indirect
-	github.com/docker/docker v28.3.3+incompatible // indirect
 	github.com/docker/docker-credential-helpers v0.9.3 // indirect
-	github.com/docker/go-connections v0.5.0 // indirect
 	github.com/docker/go-events v0.0.0-20250114142523-c867878c5e32 // indirect
-	github.com/docker/go-metrics v0.0.1 // indirect
 	github.com/emicklei/go-restful/v3 v3.11.0 // indirect
 	github.com/evanphx/json-patch/v5 v5.9.0 // indirect
 	github.com/exponent-io/jsonpath v0.0.0-20210407135951-1de76d718b3f // indirect
 	github.com/fatih/color v1.14.1 // indirect
-	github.com/felixge/httpsnoop v1.0.4 // indirect
 	github.com/fxamacker/cbor/v2 v2.7.0 // indirect
 	github.com/go-errors/errors v1.4.2 // indirect
 	github.com/go-gorp/gorp/v3 v3.1.0 // indirect
 	github.com/go-logr/logr v1.4.3 // indirect
-	github.com/go-logr/stdr v1.2.2 // indirect
 	github.com/go-openapi/jsonpointer v0.21.0 // indirect
 	github.com/go-openapi/jsonreference v0.20.2 // indirect
 	github.com/go-openapi/swag v0.23.0 // indirect
 	github.com/gobwas/glob v0.2.3 // indirect
 	github.com/gogo/protobuf v1.3.2 // indirect
-	github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8 // indirect
 	github.com/google/btree v1.1.3 // indirect
 	github.com/google/gnostic-models v0.6.9 // indirect
 	github.com/google/go-cmp v0.7.0 // indirect
 	github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 // indirect
-	github.com/gorilla/mux v1.8.1 // indirect
 	github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 // indirect
 	github.com/gosuri/uitable v0.0.4 // indirect
 	github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79 // indirect
@@ -115,10 +101,8 @@ require (
 	github.com/mitchellh/go-wordwrap v1.0.1 // indirect
 	github.com/mitchellh/mapstructure v1.5.0 // indirect
 	github.com/mitchellh/reflectwalk v1.0.2 // indirect
-	github.com/moby/locker v1.0.1 // indirect
 	github.com/moby/spdystream v0.5.0 // indirect
-	github.com/moby/sys/mountinfo v0.7.2 // indirect
-	github.com/moby/term v0.5.0 // indirect
+	github.com/moby/term v0.5.2 // indirect
 	github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
 	github.com/modern-go/reflect2 v1.0.2 // indirect
 	github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00 // indirect
@@ -132,27 +116,25 @@ require (
 	github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
 	github.com/prometheus/common v0.62.0 // indirect
 	github.com/prometheus/procfs v0.15.1 // indirect
-	github.com/rubenv/sql-migrate v1.7.1 // indirect
+	github.com/rubenv/sql-migrate v1.8.0 // indirect
 	github.com/russross/blackfriday/v2 v2.1.0 // indirect
 	github.com/sagikazarmark/locafero v0.4.0 // indirect
 	github.com/sagikazarmark/slog-shim v0.1.0 // indirect
+	github.com/santhosh-tekuri/jsonschema/v6 v6.0.2 // indirect
 	github.com/shopspring/decimal v1.4.0 // indirect
 	github.com/sourcegraph/conc v0.3.0 // indirect
 	github.com/spf13/afero v1.11.0 // indirect
 	github.com/spf13/cast v1.7.0 // indirect
-	github.com/spf13/pflag v1.0.5 // indirect
+	github.com/spf13/pflag v1.0.7 // indirect
 	github.com/subosito/gotenv v1.6.0 // indirect
 	github.com/x448/float16 v0.8.4 // indirect
-	github.com/xeipuuv/gojsonpointer v0.0.0-20190905194746-02993c407bfb // indirect
-	github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415 // indirect
-	github.com/xeipuuv/gojsonschema v1.2.0 // indirect
 	github.com/xlab/treeprint v1.2.0 // indirect
-	go.opentelemetry.io/auto/sdk v1.1.0 // indirect
 	go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 // indirect
 	go.opentelemetry.io/otel v1.37.0 // indirect
-	go.opentelemetry.io/otel/metric v1.37.0 // indirect
 	go.opentelemetry.io/otel/trace v1.37.0 // indirect
 	go.uber.org/multierr v1.11.0 // indirect
+	go.yaml.in/yaml/v2 v2.4.2 // indirect
+	go.yaml.in/yaml/v3 v3.0.3 // indirect
 	golang.org/x/crypto v0.41.0 // indirect
 	golang.org/x/exp v0.0.0-20241108190413-2d47ceb2692f // indirect
 	golang.org/x/oauth2 v0.30.0 // indirect
@@ -169,10 +151,10 @@ require (
 	gopkg.in/yaml.v3 v3.0.1 // indirect
 	k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff // indirect
 	k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738 // indirect
-	oras.land/oras-go v1.2.5 // indirect
+	oras.land/oras-go/v2 v2.6.0 // indirect
 	sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 // indirect
-	sigs.k8s.io/kustomize/api v0.18.0 // indirect
-	sigs.k8s.io/kustomize/kyaml v0.18.1 // indirect
+	sigs.k8s.io/kustomize/api v0.19.0 // indirect
+	sigs.k8s.io/kustomize/kyaml v0.19.0 // indirect
 	sigs.k8s.io/randfill v1.0.0 // indirect
 	sigs.k8s.io/structured-merge-diff/v4 v4.6.0 // indirect
 )
diff --git a/go.sum b/go.sum
index 614eddab..9a909465 100644
--- a/go.sum
+++ b/go.sum
@@ -4,10 +4,10 @@ filippo.io/edwards25519 v1.1.0 h1:FNf4tywRC1HmFuKW5xopWpigGjJKiJSV0Cqo0cJWDaA=
 filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4=
 github.com/AdaLogics/go-fuzz-headers v0.0.0-20240806141605-e8a1dd7889d6 h1:He8afgbRMd7mFxO99hRNu+6tazq8nFF9lIwo9JFroBk=
 github.com/AdaLogics/go-fuzz-headers v0.0.0-20240806141605-e8a1dd7889d6/go.mod h1:8o94RPi1/7XTJvwPpRSzSUedZrtlirdB3r9Z20bi2f8=
-github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 h1:L/gRVlceqvL25UVaW/CKtUDjefjrs0SPonmDGUVOYP0=
-github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E=
-github.com/BurntSushi/toml v1.4.0 h1:kuoIxZQy2WRRk1pttg9asf+WVv6tWQuBNVmK8+nqPr0=
-github.com/BurntSushi/toml v1.4.0/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2lLoLwho=
+github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c h1:udKWzYgxTojEKWjV8V+WSxDXJ4NFATAsZjh8iIbsQIg=
+github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E=
+github.com/BurntSushi/toml v1.5.0 h1:W5quZX/G/csjUnuI8SUYlsHs9M38FC7znL0lIO+DvMg=
+github.com/BurntSushi/toml v1.5.0/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2lLoLwho=
 github.com/DATA-DOG/go-sqlmock v1.5.2 h1:OcvFkGmslmlZibjAjaHm3L//6LiuBgolP7OputlJIzU=
 github.com/DATA-DOG/go-sqlmock v1.5.2/go.mod h1:88MAG/4G7SMwSE3CeA0ZKzrT5CiOU3OJ+JlNzwDqpNU=
 github.com/MakeNowJust/heredoc v1.0.0 h1:cXCdzVdstXyiTqTvfqk9SDHpKNjxuom+DOlyEeQ4pzQ=
@@ -20,20 +20,10 @@ github.com/Masterminds/sprig/v3 v3.3.0 h1:mQh0Yrg1XPo6vjYXgtf5OtijNAKJRNcTdOOGZe
 github.com/Masterminds/sprig/v3 v3.3.0/go.mod h1:Zy1iXRYNqNLUolqCpL4uhk6SHUMAOSCzdgBfDb35Lz0=
 github.com/Masterminds/squirrel v1.5.4 h1:uUcX/aBc8O7Fg9kaISIUsHXdKuqehiXAMQTYX8afzqM=
 github.com/Masterminds/squirrel v1.5.4/go.mod h1:NNaOrjSoIDfDA40n7sr2tPNZRfjzjA400rg+riTZj10=
-github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY=
-github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU=
-github.com/Microsoft/hcsshim v0.12.9 h1:2zJy5KA+l0loz1HzEGqyNnjd3fyZA31ZBCGKacp6lLg=
-github.com/Microsoft/hcsshim v0.12.9/go.mod h1:fJ0gkFAna6ukt0bLdKB8djt4XIJhF/vEPuoIWYVvZ8Y=
-github.com/Shopify/logrus-bugsnag v0.0.0-20171204204709-577dee27f20d h1:UrqY+r/OJnIp5u0s1SbQ8dVfLCZJsnvazdBP5hS4iRs=
-github.com/Shopify/logrus-bugsnag
v0.0.0-20171204204709-577dee27f20d/go.mod h1:HI8ITrYtUY+O+ZhtlqUnD8+KwNPOyugEhfP9fdUIaEQ= -github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= -github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs= github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 h1:DklsrG3dyBCFEj5IhUbnKptjxatkF07cF2ak3yi77so= github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2/go.mod h1:WaHUgvxTVq04UNunO+XhnAqY/wQc+bxr74GqbsZ/Jqw= -github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q= -github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= @@ -42,41 +32,29 @@ github.com/bombsimon/logrusr/v4 v4.1.0 h1:uZNPbwusB0eUXlO8hIUwStE6Lr5bLN6IgYgG+7 github.com/bombsimon/logrusr/v4 v4.1.0/go.mod h1:pjfHC5e59CvjTBIU3V3sGhFWFAnsnhOR03TRc6im0l8= github.com/bshuster-repo/logrus-logstash-hook v1.0.0 h1:e+C0SB5R1pu//O4MQ3f9cFuPGoOVeF2fE4Og9otCc70= github.com/bshuster-repo/logrus-logstash-hook v1.0.0/go.mod h1:zsTqEiSzDgAa/8GZR7E1qaXrhYNDKBYy5/dWPTIflbk= -github.com/bugsnag/bugsnag-go v0.0.0-20141110184014-b1d153021fcd h1:rFt+Y/IK1aEZkEHchZRSq9OQbsSzIT/OrI8YFFmRIng= -github.com/bugsnag/bugsnag-go v0.0.0-20141110184014-b1d153021fcd/go.mod h1:2oa8nejYd4cQ/b0hMIopN0lCRxU0bueqREvZLWFrtK8= -github.com/bugsnag/osext v0.0.0-20130617224835-0dd3f918b21b h1:otBG+dV+YK+Soembjv71DPz3uX/V/6MMlSyD9JBQ6kQ= -github.com/bugsnag/osext v0.0.0-20130617224835-0dd3f918b21b/go.mod h1:obH5gd0BsqsP2LwDJ9aOkm/6J86V6lyAXCoQWGw3K50= -github.com/bugsnag/panicwrap v0.0.0-20151223152923-e2c28503fcd0 h1:nvj0OLI3YqYXer/kZD8Ri1aaunCxIEsOst1BVJswV0o= -github.com/bugsnag/panicwrap v0.0.0-20151223152923-e2c28503fcd0/go.mod h1:D/8v3kj0zr8ZAKg1AQ6crr+5VwKN5eIywRkfhyM/+dE= github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8= github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/chai2010/gettext-go v1.0.2 h1:1Lwwip6Q2QGsAdl/ZKPCwTe9fe0CjlUbqj5bFNSjIRk= github.com/chai2010/gettext-go v1.0.2/go.mod h1:y+wnP2cHYaVj19NZhYKAwEMH2CI1gNHeQQ+5AjwawxA= -github.com/containerd/cgroups/v3 v3.0.5 h1:44na7Ud+VwyE7LIoJ8JTNQOa549a8543BmzaJHo6Bzo= -github.com/containerd/cgroups/v3 v3.0.5/go.mod h1:SA5DLYnXO8pTGYiAHXz94qvLQTKfVM5GEVisn4jpins= -github.com/containerd/containerd v1.7.27 h1:yFyEyojddO3MIGVER2xJLWoCIn+Up4GaHFquP7hsFII= -github.com/containerd/containerd v1.7.27/go.mod h1:xZmPnl75Vc+BLGt4MIfu6bp+fy03gdHAn9bz+FreFR0= -github.com/containerd/continuity v0.4.5 h1:ZRoN1sXq9u7V6QoHMcVWGhOwDFqZ4B9i5H6un1Wh0x4= -github.com/containerd/continuity v0.4.5/go.mod h1:/lNJvtJKUQStBzpVQ1+rasXO1LAWtUQssk28EZvJ3nE= +github.com/containerd/containerd v1.7.29 h1:90fWABQsaN9mJhGkoVnuzEY+o1XDPbg9BTC9QTAHnuE= 
+github.com/containerd/containerd v1.7.29/go.mod h1:azUkWcOvHrWvaiUjSQH0fjzuHIwSPg1WL5PshGP4Szs= github.com/containerd/errdefs v1.0.0 h1:tg5yIfIlQIrxYtu9ajqY42W3lpS19XqdxRQeEwYG8PI= github.com/containerd/errdefs v1.0.0/go.mod h1:+YBYIdtsnF4Iw6nWZhJcqGSg/dwvV7tyJ/kCkyJ2k+M= -github.com/containerd/errdefs/pkg v0.3.0 h1:9IKJ06FvyNlexW690DXuQNx2KA2cUJXx151Xdx3ZPPE= -github.com/containerd/errdefs/pkg v0.3.0/go.mod h1:NJw6s9HwNuRhnjJhM7pylWwMyAkmCQvQ4GpJHEqRLVk= github.com/containerd/log v0.1.0 h1:TCJt7ioM2cr/tfR8GPbGf9/VRAX8D2B4PjzCpfX540I= github.com/containerd/log v0.1.0/go.mod h1:VRRf09a7mHDIRezVKTRCrOq78v577GXq3bSa3EhrzVo= github.com/containerd/platforms v0.2.1 h1:zvwtM3rz2YHPQsF2CHYM8+KtB5dvhISiXh5ZpSBQv6A= github.com/containerd/platforms v0.2.1/go.mod h1:XHCb+2/hzowdiut9rkudds9bE5yJ7npe7dG/wG+uFPw= -github.com/containerd/typeurl/v2 v2.2.3 h1:yNA/94zxWdvYACdYO8zofhrTVuQY73fFU1y++dYSw40= -github.com/containerd/typeurl/v2 v2.2.3/go.mod h1:95ljDnPfD3bAbDJRugOiShd/DlAAsxGtUBhJxIn7SCk= -github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= +github.com/coreos/go-systemd/v22 v22.5.0 h1:RrqgGjYQKalulkV8NGVIfkXQf6YYmOyiJKk8iXXhfZs= +github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= +github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/creack/pty v1.1.18 h1:n56/Zwd5o6whRC5PMGretI4IdRLlmBXYNjScPaBgsbY= github.com/creack/pty v1.1.18/go.mod h1:MOBLtS5ELjhRRrroQr9kyvTxUAFNvYEK993ew/Vr4O4= github.com/cyberdelia/templates v0.0.0-20141128023046-ca7fffd4298c/go.mod h1:GyV+0YP4qX0UQ7r2MoYZ+AvYDp12OF5yg4q8rGnyNh4= -github.com/cyphar/filepath-securejoin v0.3.6 h1:4d9N5ykBnSp5Xn2JkhocYDkOpURL/18CYMpo6xB9uWM= -github.com/cyphar/filepath-securejoin v0.3.6/go.mod h1:Sdj7gXlvMcPZsbhwhQ33GguGLDGQL7h7bg04C/+u9jI= +github.com/cyphar/filepath-securejoin v0.4.1 h1:JyxxyPEaktOD+GAnqIqTf9A8tHyAG22rowi7HkoSU1s= +github.com/cyphar/filepath-securejoin v0.4.1/go.mod h1:Sdj7gXlvMcPZsbhwhQ33GguGLDGQL7h7bg04C/+u9jI= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= @@ -86,30 +64,24 @@ github.com/decred/dcrd/dcrec/secp256k1/v4 v4.0.0-20210816181553-5444fa50b93d/go. 
github.com/decred/dcrd/dcrec/secp256k1/v4 v4.0.1/go.mod h1:hyedUtir6IdtD/7lIxGeCxkaw7y45JueMRL4DIyJDKs= github.com/deepmap/oapi-codegen v1.11.0 h1:f/X2NdIkaBKsSdpeuwLnY/vDI0AtPUrmB5LMgc7YD+A= github.com/deepmap/oapi-codegen v1.11.0/go.mod h1:k+ujhoQGxmQYBZBbxhOZNZf4j08qv5mC+OH+fFTnKxM= -github.com/distribution/distribution/v3 v3.0.0-20221208165359-362910506bc2 h1:aBfCb7iqHmDEIp6fBvC/hQUddQfg+3qdYjwzaiP9Hnc= -github.com/distribution/distribution/v3 v3.0.0-20221208165359-362910506bc2/go.mod h1:WHNsWjnIn2V1LYOrME7e8KxSeKunYHsxEm4am0BUtcI= +github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/rVNCu3HqELle0jiPLLBs70cWOduZpkS1E78= +github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc= +github.com/distribution/distribution/v3 v3.0.0 h1:q4R8wemdRQDClzoNNStftB2ZAfqOiN6UX90KJc4HjyM= +github.com/distribution/distribution/v3 v3.0.0/go.mod h1:tRNuFoZsUdyRVegq8xGNeds4KLjwLCRin/tTo6i1DhU= github.com/distribution/reference v0.6.0 h1:0IXCQ5g4/QMHHkarYzh5l+u8T3t73zM5QvfrDyIgxBk= github.com/distribution/reference v0.6.0/go.mod h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E= -github.com/docker/cli v25.0.1+incompatible h1:mFpqnrS6Hsm3v1k7Wa/BO23oz0k121MTbTO1lpcGSkU= -github.com/docker/cli v25.0.1+incompatible/go.mod h1:JLrzqnKDaYBop7H2jaqPtU4hHvMKP+vjCwu2uszcLI8= -github.com/docker/distribution v2.8.3+incompatible h1:AtKxIZ36LoNK51+Z6RpzLpddBirtxJnzDrHLEKxTAYk= -github.com/docker/distribution v2.8.3+incompatible/go.mod h1:J2gT2udsDAN96Uj4KfcMRqY0/ypR+oyYUYmja8H+y+w= -github.com/docker/docker v28.3.3+incompatible h1:Dypm25kh4rmk49v1eiVbsAtpAsYURjYkaKubwuBdxEI= -github.com/docker/docker v28.3.3+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk= +github.com/dlclark/regexp2 v1.11.0 h1:G/nrcoOa7ZXlpoa/91N3X7mM3r8eIlMBBJZvsz/mxKI= +github.com/dlclark/regexp2 v1.11.0/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8= github.com/docker/docker-credential-helpers v0.9.3 h1:gAm/VtF9wgqJMoxzT3Gj5p4AqIjCBS4wrsOh9yRqcz8= github.com/docker/docker-credential-helpers v0.9.3/go.mod h1:x+4Gbw9aGmChi3qTLZj8Dfn0TD20M/fuWy0E5+WDeCo= -github.com/docker/go-connections v0.5.0 h1:USnMq7hx7gwdVZq1L49hLXaFtUdTADjXGp+uj1Br63c= -github.com/docker/go-connections v0.5.0/go.mod h1:ov60Kzw0kKElRwhNs9UlUHAE/F9Fe6GLaXnqyDdmEXc= github.com/docker/go-events v0.0.0-20250114142523-c867878c5e32 h1:EHZfspsnLAz8Hzccd67D5abwLiqoqym2jz/jOS39mCk= github.com/docker/go-events v0.0.0-20250114142523-c867878c5e32/go.mod h1:Uw6UezgYA44ePAFQYUehOuCzmy5zmg/+nl2ZfMWGkpA= github.com/docker/go-metrics v0.0.1 h1:AgB/0SvBxihN0X8OR4SjsblXkbMvalQ8cjmtKQ2rQV8= github.com/docker/go-metrics v0.0.1/go.mod h1:cG1hvH2utMXtqgqqYE9plW6lDxS3/5ayHzueweSI3Vw= -github.com/docker/libtrust v0.0.0-20150114040149-fa567046d9b1 h1:ZClxb8laGDf5arXfYcAtECDFgAgHklGI8CxgjHnXKJ4= -github.com/docker/libtrust v0.0.0-20150114040149-fa567046d9b1/go.mod h1:cyGadeNEkKy96OOhEzfZl+yxihPEzKnqJwvfuSUqbZE= github.com/emicklei/go-restful/v3 v3.11.0 h1:rAQeMHw1c7zTmncogyy8VvRZwtkmkZ4FxERmMY4rD+g= github.com/emicklei/go-restful/v3 v3.11.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= -github.com/evanphx/json-patch v5.9.0+incompatible h1:fBXyNpNMuTTDdquAq/uisOr2lShz4oaXpDTX2bLe7ls= -github.com/evanphx/json-patch v5.9.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= +github.com/evanphx/json-patch v5.9.11+incompatible h1:ixHHqfcGvxhWkniF1tWxBHA0yb4Z+d1UQi45df52xW8= +github.com/evanphx/json-patch v5.9.11+incompatible/go.mod 
h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= github.com/evanphx/json-patch/v5 v5.9.0 h1:kcBlZQbplgElYIlo/n1hJbls2z/1awpXxpRi0/FOJfg= github.com/evanphx/json-patch/v5 v5.9.0/go.mod h1:VNkHZ/282BpEyt/tObQO8s5CMPmYYq14uClGH4abBuQ= github.com/exponent-io/jsonpath v0.0.0-20210407135951-1de76d718b3f h1:Wl78ApPPB2Wvf/TIe2xdyJxTlb6obmF18d8QdkxNDu4= @@ -135,10 +107,6 @@ github.com/go-errors/errors v1.4.2 h1:J6MZopCL4uSllY1OfXM374weqZFFItUbrImctkmUxI github.com/go-errors/errors v1.4.2/go.mod h1:sIVyrIiJhuEF+Pj9Ebtd6P/rEYROXFi3BopGUQ5a5Og= github.com/go-gorp/gorp/v3 v3.1.0 h1:ItKF/Vbuj31dmV4jxA1qblpSwkl9g1typ24xoe70IGs= github.com/go-gorp/gorp/v3 v3.1.0/go.mod h1:dLEjIyyRNiXvNZ8PSmzpt1GsWAUK8kjVhEpjH8TixEw= -github.com/go-kit/kit v0.8.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as= -github.com/go-logfmt/logfmt v0.3.0/go.mod h1:Qt1PoO58o5twSAckw1HlFXLmHsOX5/0LbT9GBnD5lWE= -github.com/go-logfmt/logfmt v0.4.0/go.mod h1:3RMwSq7FuexP4Kalkev3ejPJsZTpXXBr9+V4qmtdjCk= -github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= @@ -167,36 +135,26 @@ github.com/go-resty/resty/v2 v2.15.3 h1:bqff+hcqAflpiF591hhJzNdkRsFhlB96CYfBwSFv github.com/go-resty/resty/v2 v2.15.3/go.mod h1:0fHAoK7JoBy/Ch36N8VFeMsK7xQOHhvWaC3iOktwmIU= github.com/go-sql-driver/mysql v1.8.1 h1:LedoTUt/eveggdHS9qUFC1EFSa8bU2+1pZjSRpvNJ1Y= github.com/go-sql-driver/mysql v1.8.1/go.mod h1:wEBSXgmK//2ZFJyE+qWnIsVGmvmEKlqwuVSjsCm7DZg= -github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY= github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y= github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8= github.com/goccy/go-json v0.9.7/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I= -github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= github.com/golang-jwt/jwt v3.2.2+incompatible/go.mod h1:8pz2t5EyA70fFQQSrl6XZXzqecmYZeUEB8OUGHkxJ+I= -github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8 h1:f+oWsMOmNPc8JmEHVZIycC7hBoQxHH9pNKQORJNozsQ= -github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8/go.mod h1:wcDNUvekVysuuOpQKo3191zZyTpiI6se1N1ULghS0sw= github.com/golang/mock v1.6.0 h1:ErTB+efbowRARo13NNdxyJji2egdxLGQhRaY+DUumQc= github.com/golang/mock v1.6.0/go.mod h1:p6yTPP+5HYm5mzsMV8JkE6ZKdX+/wYM6Hr+LicevLPs= -github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= -github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= -github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.3/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw= github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= github.com/golang/protobuf v1.5.4 
h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= github.com/golangci/lint-1 v0.0.0-20181222135242-d2cdd8c08219/go.mod h1:/X8TswGSh1pIozq4ZwCfxS0WA5JGXguxk94ar/4c87Y= -github.com/gomodule/redigo v1.8.2 h1:H5XSIre1MB5NbPYFp+i1NBbb5qN1W8Y8YAQoAYbkm8k= -github.com/gomodule/redigo v1.8.2/go.mod h1:P9dn9mFrCBvWhGE1wpxx6fgq7BAeLBk+UUUzlpkBYO0= github.com/google/btree v1.1.3 h1:CVpQJjYgC4VbzxeGVHfvZrv1ctoYCAI8vbl07Fcxlyg= github.com/google/btree v1.1.3/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4= github.com/google/gnostic-models v0.6.9 h1:MU/8wDLif2qCXZmzncUQ/BOfxWfthHi63KqpoNbWqVw= github.com/google/gnostic-models v0.6.9/go.mod h1:CiWsm0s6BSQd1hRn8/QmxqB6BesYcbSZxsz9b0KuDBw= -github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= @@ -209,8 +167,8 @@ github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510/go.mod h1:pupxD2MaaD3 github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/gorilla/handlers v1.5.1 h1:9lRY6j8DEeeBT10CvO9hGW0gmky0BprnvDI5vfhUHH4= -github.com/gorilla/handlers v1.5.1/go.mod h1:t8XrUpc4KVXb7HGyJ4/cEnwQiaxrX/hz1Zv/4g96P1Q= +github.com/gorilla/handlers v1.5.2 h1:cLTUSsNkgcwhgRqvCNmdbRWG0A3N4F+M2nWKdScwyEE= +github.com/gorilla/handlers v1.5.2/go.mod h1:dX+xVpaxdSw+q0Qek8SSsl3dfMk3jNddUkMzo0GtH0w= github.com/gorilla/mux v1.8.0/go.mod h1:DVbg23sWSpFRCP0SfiEN6jmj59UnW/n46BH5rLB71So= github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY= github.com/gorilla/mux v1.8.1/go.mod h1:AKf9I4AEqPTmMytcMc0KkNouC66V3BtZ4qD5fmWSiMQ= @@ -220,13 +178,19 @@ github.com/gosuri/uitable v0.0.4 h1:IG2xLKRvErL3uhY6e1BylFzG+aJiwQviDDTfOKeKTpY= github.com/gosuri/uitable v0.0.4/go.mod h1:tKR86bXuXPZazfOTG1FIzvjIdXzd0mo4Vtn16vt0PJo= github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79 h1:+ngKgrYPPJrOjhax5N+uePQ0Fh1Z7PheYoUI/0nzkPA= github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA= +github.com/grpc-ecosystem/grpc-gateway v1.16.0 h1:gmcG1KaJ57LophUzW0Hy8NmPhnMZb4M0+kPpLofRdBo= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.24.0 h1:TmHmbvxPmaegwhDubVz0lICL0J5Ka2vwTzhoePEXsGE= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.24.0/go.mod h1:qztMSjm835F2bXf+5HKAPIS5qsmQDqZna/PgVt4rWtI= github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= github.com/hashicorp/errwrap v1.1.0 h1:OxrOeh75EUXMY8TBjag2fzXGZ40LB6IKw45YeGUDY2I= github.com/hashicorp/errwrap v1.1.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo= github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM= github.com/hashicorp/golang-lru v0.5.4 h1:YDjusn29QI/Das2iO9M0BHnIbxPeyuCHsjMW+lJfyTc= -github.com/hashicorp/golang-lru v0.5.4/go.mod h1:iADmTwqILo4mZ8BN3D2Q6+9jd8WM5uGBxy+E8yxSoD4= +github.com/hashicorp/golang-lru/arc/v2 v2.0.5 h1:l2zaLDubNhW4XO3LnliVj0GXO3+/CGNJAg1dcN2Fpfw= 
+github.com/hashicorp/golang-lru/arc/v2 v2.0.5/go.mod h1:ny6zBSQZi2JxIeYcv7kt2sH2PXJtirBN7RDhRpxPkxU= +github.com/hashicorp/golang-lru/v2 v2.0.5 h1:wW7h1TG88eUIJ2i69gaE3uNVtEPIagzhGvHgwfx2Vm4= +github.com/hashicorp/golang-lru/v2 v2.0.5/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4= github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= github.com/huandu/xstrings v1.5.0 h1:2ag3IFq9ZDANvthTwTiqSSZLjDc+BedvHPAp5tJy2TI= @@ -237,20 +201,15 @@ github.com/jmoiron/sqlx v1.4.0 h1:1PLqN7S1UYp5t4SrVVnt4nUVNemrDAtxlulVe+Qgm3o= github.com/jmoiron/sqlx v1.4.0/go.mod h1:ZrZ7UsYB/weZdl2Bxg6jCRO9c3YHl8r3ahlKmRT4JLY= github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= -github.com/json-iterator/go v1.1.6/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU= -github.com/json-iterator/go v1.1.7/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= github.com/json-iterator/go v1.1.9/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= -github.com/julienschmidt/httprouter v1.2.0/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7VTCxuUUipMqKk8s4w= github.com/kelseyhightower/envconfig v1.4.0 h1:Im6hONhd3pLkfDFsbRgu68RDNkGF1r3dvMUtDTo2cv8= github.com/kelseyhightower/envconfig v1.4.0/go.mod h1:cccZRl6mQpaq41TPp5QxidR+Sa3axMbJDNb//FQX6Gg= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= -github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= -github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc= github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pretty v0.3.0/go.mod h1:640gp4NfQd8pI5XOwp5fnNeVWj67G7CFk/SaSQn7NBk= @@ -303,7 +262,6 @@ github.com/mattn/go-runewidth v0.0.9 h1:Lm995f3rfxdpd6TSmuVCHVb/QhupuXlYr8sCI/Qd github.com/mattn/go-runewidth v0.0.9/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m2gUSrubnMI= github.com/mattn/go-sqlite3 v1.14.22 h1:2gZY6PC6kBnID23Tichd1K+Z0oS6nE/XwU+Vz/5o4kU= github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y= -github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0= github.com/miekg/dns v1.1.57 h1:Jzi7ApEIzwEPLHWRcafCN9LZSBbqQpxjt/wpgvg7wcM= github.com/miekg/dns v1.1.57/go.mod h1:uqRjCRUuEAA6qsOiJvDd+CFo/vW+y5WR6SNmHE55hZk= github.com/mitchellh/copystructure v1.2.0 h1:vpKXTN4ewci03Vljg/q9QvCGUDttBOGBIa15WveJJGw= @@ -314,28 +272,20 @@ github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyua github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo= github.com/mitchellh/reflectwalk v1.0.2 h1:G2LzWKi524PWgd3mLHV8Y5k7s6XUvT0Gef6zxSIeXaQ= github.com/mitchellh/reflectwalk v1.0.2/go.mod 
h1:mSTlrgnPZtwu0c4WaC2kGObEpuNDbx0jmZXqmk4esnw= -github.com/moby/locker v1.0.1 h1:fOXqR41zeveg4fFODix+1Ch4mj/gT0NE1XJbp/epuBg= -github.com/moby/locker v1.0.1/go.mod h1:S7SDdo5zpBK84bzzVlKr2V0hz+7x9hWbYC/kq7oQppc= github.com/moby/spdystream v0.5.0 h1:7r0J1Si3QO/kjRitvSLVVFUjxMEb/YLj6S9FF62JBCU= github.com/moby/spdystream v0.5.0/go.mod h1:xBAYlnt/ay+11ShkdFKNAG7LsyK/tmNBVvVOwrfMgdI= -github.com/moby/sys/mountinfo v0.7.2 h1:1shs6aH5s4o5H2zQLn796ADW1wMrIwHsyJ2v9KouLrg= -github.com/moby/sys/mountinfo v0.7.2/go.mod h1:1YOa8w8Ih7uW0wALDUgT1dTTSBrZ+HiBLGws92L2RU4= -github.com/moby/sys/userns v0.1.0 h1:tVLXkFOxVu9A64/yh59slHVv9ahO9UIev4JZusOLG/g= -github.com/moby/sys/userns v0.1.0/go.mod h1:IHUYgu/kao6N8YZlp9Cf444ySSvCmDlmzUcYfDHOl28= -github.com/moby/term v0.5.0 h1:xt8Q1nalod/v7BqbG21f8mQPqH+xAaC9C3N3wfWbVP0= -github.com/moby/term v0.5.0/go.mod h1:8FzsFHVUBGZdbDsJw/ot+X+d5HLUbvklYLJ9uGfcI3Y= +github.com/moby/term v0.5.2 h1:6qk3FJAFDs6i/q3W/pQ97SX192qKfZgGjCQqfCJkgzQ= +github.com/moby/term v0.5.2/go.mod h1:d3djjFCrjnB+fl8NJux+EJzu0msscUP+f8it8hPkFLc= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/reflect2 v0.0.0-20180701023420-4b7aa43c6742/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= -github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00 h1:n6/2gBQ3RWajuToeY6ZtZTIKv2v7ThUy5KKusIT0yc0= github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00/go.mod h1:Pm3mSP3c5uWn86xMLZ5Sa7JB9GsEZySvHYXCTK4E9q4= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= -github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f h1:y5//uYreIhSUg3J1GEMiLbxo1LJaP8RfCpH6pymGZus= github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw= github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= @@ -354,7 +304,6 @@ github.com/peterbourgon/diskv v2.0.1+incompatible/go.mod h1:uqqh8zWWbv1HBMNONnaR github.com/phayes/freeport v0.0.0-20220201140144-74d24b5ae9f5 h1:Ii+DKncOVM8Cu1Hc+ETb5K+23HdAMvESYE3ZJ5b5cMI= github.com/phayes/freeport v0.0.0-20220201140144-74d24b5ae9f5/go.mod h1:iIss55rKnNBTvrwdmkUpLnDpZoAHvWaiq5+iMmen4AE= github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA= -github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= @@ -362,30 +311,26 @@ 
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRI github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/poy/onpar v1.1.2 h1:QaNrNiZx0+Nar5dLgTVp5mXkyoVFIbepjyEoGSnhbAY= github.com/poy/onpar v1.1.2/go.mod h1:6X8FLNoxyr9kkmnlqpK6LSoiOtrO6MICtWwEuWkLjzg= -github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw= -github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo= -github.com/prometheus/client_golang v1.1.0/go.mod h1:I1FGZT9+L76gKKOs5djB6ezCbFQP1xR9D75/vuwEF3g= -github.com/prometheus/client_golang v1.21.1 h1:DOvXXTqVzvkIewV/CDPFdejpMCGeMcbGCQ8YOmu+Ibk= -github.com/prometheus/client_golang v1.21.1/go.mod h1:U9NM32ykUErtVBxdvD3zfi+EuFkkaBvMb09mIfe0Zgg= -github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= -github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= +github.com/prometheus/client_golang v1.22.0 h1:rb93p9lokFEsctTys46VnV1kLCDpVZ0a/Y92Vm0Zc6Q= +github.com/prometheus/client_golang v1.22.0/go.mod h1:R7ljNsLXhuQXYZYtw6GAE9AZg8Y7vEW5scdCXrWRXC0= github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY= -github.com/prometheus/common v0.4.1/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4= -github.com/prometheus/common v0.6.0/go.mod h1:eBmuwkDJBwy6iBfxCBob6t6dR6ENT/y+J+Zk0j9GMYc= github.com/prometheus/common v0.62.0 h1:xasJaQlnWAeyHdUBeGjXmutelfJHWMRr+Fg4QszZ2Io= github.com/prometheus/common v0.62.0/go.mod h1:vyBcEuLSvWos9B1+CyL7JZ2up+uFzXhkqml0W5zIY1I= -github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= -github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA= -github.com/prometheus/procfs v0.0.3/go.mod h1:4A/X28fw3Fc593LaREMrKMqOKvUAntwMDaekg4FpcdQ= github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= +github.com/redis/go-redis/extra/rediscmd/v9 v9.0.5 h1:EaDatTxkdHG+U3Bk4EUr+DZ7fOGwTfezUiUJMaIcaho= +github.com/redis/go-redis/extra/rediscmd/v9 v9.0.5/go.mod h1:fyalQWdtzDBECAQFBJuQe5bzQ02jGd5Qcbgb97Flm7U= +github.com/redis/go-redis/extra/redisotel/v9 v9.0.5 h1:EfpWLLCyXw8PSM2/XNJLjI3Pb27yVE+gIAfeqp8LUCc= +github.com/redis/go-redis/extra/redisotel/v9 v9.0.5/go.mod h1:WZjPDy7VNzn77AAfnAfVjZNvfJTYfPetfZk5yoSTLaQ= +github.com/redis/go-redis/v9 v9.7.3 h1:YpPyAayJV+XErNsatSElgRZZVCwXX9QzkKYNvO7x0wM= +github.com/redis/go-redis/v9 v9.7.3/go.mod h1:bGUrSggJ9X9GUmZpZNEOQKaANxSGgOEBRltRTZHSvrA= github.com/rogpeppe/go-internal v1.6.1/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc= github.com/rogpeppe/go-internal v1.8.0/go.mod h1:WmiCO8CzOY8rg0OYDC4/i/2WRWAB6poM+XZ2dLUbcbE= github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= -github.com/rubenv/sql-migrate v1.7.1 h1:f/o0WgfO/GqNuVg+6801K/KW3WdDSupzSjDYODmiUq4= -github.com/rubenv/sql-migrate v1.7.1/go.mod h1:Ob2Psprc0/3ggbM6wCzyYVFFuc6FyZrb2AS+ezLDFb4= +github.com/rubenv/sql-migrate v1.8.0 h1:dXnYiJk9k3wetp7GfQbKJcPHjVJL6YK19tKj8t2Ns0o= 
+github.com/rubenv/sql-migrate v1.8.0/go.mod h1:F2bGFBwCU+pnmbtNYDeKvSuvL6lBVtXDXUUv5t+u1qw= github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/sagikazarmark/locafero v0.4.0 h1:HApY1R9zGo4DBgr7dqsTH/JJxLTTsOt7u6keLGt6kNQ= @@ -394,11 +339,12 @@ github.com/sagikazarmark/slog-shim v0.1.0 h1:diDBnUNK9N/354PgrxMywXnAwEr1QZcOr6g github.com/sagikazarmark/slog-shim v0.1.0/go.mod h1:SrcSrq8aKtyuqEI1uvTDTK1arOWRIczQRv+GVI1AkeQ= github.com/samber/lo v1.47.0 h1:z7RynLwP5nbyRscyvcD043DWYoOcYRv3mV8lBeqOCLc= github.com/samber/lo v1.47.0/go.mod h1:RmDH9Ct32Qy3gduHQuKJ3gW1fMHAnE/fAzQuf6He5cU= +github.com/santhosh-tekuri/jsonschema/v6 v6.0.2 h1:KRzFb2m7YtdldCEkzs6KqmJw4nqEVZGK7IN2kJkjTuQ= +github.com/santhosh-tekuri/jsonschema/v6 v6.0.2/go.mod h1:JXeL+ps8p7/KNMjDQk3TCwPpBy0wYklyWTfbkIzdIFU= github.com/sergi/go-diff v1.2.0 h1:XU+rvMAioB0UC3q1MFrIQy4Vo5/4VsRDQQXHsEya6xQ= github.com/sergi/go-diff v1.2.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM= github.com/shopspring/decimal v1.4.0 h1:bxl37RwXBklmTi0C79JfXCEBD1cqqHt0bbgBAGFp81k= github.com/shopspring/decimal v1.4.0/go.mod h1:gawqmDU56v4yIKSwfBSFip1HdCCXN8/+DMd9qYNcwME= -github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo= github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= github.com/sourcegraph/conc v0.3.0 h1:OQTbbt6P72L20UqAkXXuLOj79LfEanQ+YQFNpLA9ySo= @@ -407,14 +353,14 @@ github.com/spf13/afero v1.11.0 h1:WJQKhtpdm3v2IzqG8VMqrr6Rf3UYpEF239Jy9wNepM8= github.com/spf13/afero v1.11.0/go.mod h1:GH9Y3pIexgf1MTIWtNGyogA5MwRIDXGUr+hbWNoBjkY= github.com/spf13/cast v1.7.0 h1:ntdiHjuueXFgm5nzDRdOS4yfT43P5Fnud6DH50rz/7w= github.com/spf13/cast v1.7.0/go.mod h1:ancEpBxwJDODSW/UG4rDrAqiKolqNNh2DX3mk86cAdo= -github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM= -github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y= -github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= -github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/spf13/cobra v1.9.1 h1:CXSaggrXdbHK9CF+8ywj8Amf7PBRmPCOJugH954Nnlo= +github.com/spf13/cobra v1.9.1/go.mod h1:nDyEzZ8ogv936Cinf6g1RU9MRY64Ir93oCnqb9wxYW0= +github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/spf13/pflag v1.0.7 h1:vN6T9TfwStFPFM5XzjsvmzZkLuaLX+HS+0SeFLRgU6M= +github.com/spf13/pflag v1.0.7/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/spf13/viper v1.19.0 h1:RWq5SEjt8o25SROyN3z2OrDB9l7RPd3lwTWU8EcEdcI= github.com/spf13/viper v1.19.0/go.mod h1:GQUN9bilAbhU/jgc1bKs99f/suXKeUMct8Adx5+Ntkg= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= @@ -442,48 +388,68 @@ github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyC github.com/valyala/fasttemplate v1.2.1/go.mod h1:KHLXt3tVN2HBp8eijSv/kGJopbvo7S+qRAEEKiv+SiQ= github.com/x448/float16 v0.8.4 
h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= -github.com/xeipuuv/gojsonpointer v0.0.0-20180127040702-4e3ac2762d5f/go.mod h1:N2zxlSyiKSe5eX1tZViRH5QA0qijqEDrYZiPEAiq3wU= -github.com/xeipuuv/gojsonpointer v0.0.0-20190905194746-02993c407bfb h1:zGWFAtiMcyryUHoUjUJX0/lt1H2+i2Ka2n+D3DImSNo= -github.com/xeipuuv/gojsonpointer v0.0.0-20190905194746-02993c407bfb/go.mod h1:N2zxlSyiKSe5eX1tZViRH5QA0qijqEDrYZiPEAiq3wU= -github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415 h1:EzJWgHovont7NscjpAxXsDA8S8BMYve8Y5+7cuRE7R0= -github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415/go.mod h1:GwrjFmJcFw6At/Gs6z4yjiIwzuJ1/+UwLxMQDVQXShQ= -github.com/xeipuuv/gojsonschema v1.2.0 h1:LhYJRs+L4fBtjZUfuSZIKGeVu0QRy8e5Xi7D17UxZ74= -github.com/xeipuuv/gojsonschema v1.2.0/go.mod h1:anYRn/JVcOK2ZgGU+IjEV4nwlhoK5sQluxsYJ78Id3Y= github.com/xlab/treeprint v1.2.0 h1:HzHnuAF1plUN2zGlAFHbSQP2qJ0ZAD3XF5XD7OesXRQ= github.com/xlab/treeprint v1.2.0/go.mod h1:gj5Gd3gPdKtR1ikdDK6fnFLdmIS0X30kTTuNd/WEJu0= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= github.com/yuin/goldmark v1.4.1/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= -github.com/yvasiyarov/go-metrics v0.0.0-20140926110328-57bccd1ccd43 h1:+lm10QQTNSBd8DVTNGHx7o/IKu9HYDvLMffDhbyLccI= -github.com/yvasiyarov/go-metrics v0.0.0-20140926110328-57bccd1ccd43/go.mod h1:aX5oPXxHm3bOH+xeAttToC8pqch2ScQN/JoXYupl6xs= -github.com/yvasiyarov/gorelic v0.0.0-20141212073537-a9bba5b9ab50 h1:hlE8//ciYMztlGpl/VA+Zm1AcTPHYkHJPbHqE6WJUXE= -github.com/yvasiyarov/gorelic v0.0.0-20141212073537-a9bba5b9ab50/go.mod h1:NUSPSUX/bi6SeDMUh6brw0nXpxHnc96TguQh0+r/ssA= -github.com/yvasiyarov/newrelic_platform_go v0.0.0-20140908184405-b21fdbd4370f h1:ERexzlUfuTvpE74urLSbIQW0Z/6hF9t8U4NsJLaioAY= -github.com/yvasiyarov/newrelic_platform_go v0.0.0-20140908184405-b21fdbd4370f/go.mod h1:GlGEuHIJweS1mbCqG+7vt2nvWLzLLnRHbXz5JKd/Qbg= -go.opencensus.io v0.24.0 h1:y73uSU6J157QMP2kn2r30vwW1A2W2WFwSCGnAVxeaD0= -go.opencensus.io v0.24.0/go.mod h1:vNK8G9p7aAivkbmorf4v+7Hgx+Zs0yY+0fOtgBfjQKo= go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= +go.opentelemetry.io/contrib/bridges/prometheus v0.57.0 h1:UW0+QyeyBVhn+COBec3nGhfnFe5lwB0ic1JBVjzhk0w= +go.opentelemetry.io/contrib/bridges/prometheus v0.57.0/go.mod h1:ppciCHRLsyCio54qbzQv0E4Jyth/fLWDTJYfvWpcSVk= +go.opentelemetry.io/contrib/exporters/autoexport v0.57.0 h1:jmTVJ86dP60C01K3slFQa2NQ/Aoi7zA+wy7vMOKD9H4= +go.opentelemetry.io/contrib/exporters/autoexport v0.57.0/go.mod h1:EJBheUMttD/lABFyLXhce47Wr6DPWYReCzaZiXadH7g= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 h1:sbiXRNDSWJOTobXh5HyQKjq6wUC5tNybqjIqDpAY4CU= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0/go.mod h1:69uWxva0WgAA/4bu2Yy70SLDBwZXuQ6PbBpbsa5iZrQ= go.opentelemetry.io/otel v1.37.0 h1:9zhNfelUvx0KBfu/gb+ZgeAfAgtWrfHJZcAqFC228wQ= go.opentelemetry.io/otel v1.37.0/go.mod h1:ehE/umFRLnuLa/vSccNq9oS1ErUlkkK71gMcN34UG8I= +go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploggrpc v0.8.0 h1:WzNab7hOOLzdDF/EoWCt4glhrbMPVMOO5JYTmpz36Ls= +go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploggrpc v0.8.0/go.mod 
h1:hKvJwTzJdp90Vh7p6q/9PAOd55dI6WA6sWj62a/JvSs= +go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploghttp v0.8.0 h1:S+LdBGiQXtJdowoJoQPEtI52syEP/JYBUpjO49EQhV8= +go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploghttp v0.8.0/go.mod h1:5KXybFvPGds3QinJWQT7pmXf+TN5YIa7CNYObWRkj50= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.32.0 h1:j7ZSD+5yn+lo3sGV69nW04rRR0jhYnBwjuX3r0HvnK0= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.32.0/go.mod h1:WXbYJTUaZXAbYd8lbgGuvih0yuCfOFC5RJoYnoLcGz8= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.32.0 h1:t/Qur3vKSkUCcDVaSumWF2PKHt85pc7fRvFuoVT8qFU= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.32.0/go.mod h1:Rl61tySSdcOJWoEgYZVtmnKdA0GeKrSqkHC1t+91CH8= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.33.0 h1:Vh5HayB/0HHfOQA7Ctx69E/Y/DcQSMPpKANYVMQ7fBA= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.33.0/go.mod h1:cpgtDBaqD/6ok/UG0jT15/uKjAY8mRA53diogHBg3UI= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.33.0 h1:5pojmb1U1AogINhN3SurB+zm/nIcusopeBNp42f45QM= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.33.0/go.mod h1:57gTHJSE5S1tqg+EKsLPlTWhpHMsWlVmer+LA926XiA= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.32.0 h1:cMyu9O88joYEaI47CnQkxO1XZdpoTF9fEnW2duIddhw= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.32.0/go.mod h1:6Am3rn7P9TVVeXYG+wtcGE7IE1tsQ+bP3AuWcKt/gOI= +go.opentelemetry.io/otel/exporters/prometheus v0.54.0 h1:rFwzp68QMgtzu9PgP3jm9XaMICI6TsofWWPcBDKwlsU= +go.opentelemetry.io/otel/exporters/prometheus v0.54.0/go.mod h1:QyjcV9qDP6VeK5qPyKETvNjmaaEc7+gqjh4SS0ZYzDU= +go.opentelemetry.io/otel/exporters/stdout/stdoutlog v0.8.0 h1:CHXNXwfKWfzS65yrlB2PVds1IBZcdsX8Vepy9of0iRU= +go.opentelemetry.io/otel/exporters/stdout/stdoutlog v0.8.0/go.mod h1:zKU4zUgKiaRxrdovSS2amdM5gOc59slmo/zJwGX+YBg= +go.opentelemetry.io/otel/exporters/stdout/stdoutmetric v1.32.0 h1:SZmDnHcgp3zwlPBS2JX2urGYe/jBKEIT6ZedHRUyCz8= +go.opentelemetry.io/otel/exporters/stdout/stdoutmetric v1.32.0/go.mod h1:fdWW0HtZJ7+jNpTKUR0GpMEDP69nR8YBJQxNiVCE3jk= +go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.32.0 h1:cC2yDI3IQd0Udsux7Qmq8ToKAx1XCilTQECZ0KDZyTw= +go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.32.0/go.mod h1:2PD5Ex6z8CFzDbTdOlwyNIUywRr1DN0ospafJM1wJ+s= +go.opentelemetry.io/otel/log v0.8.0 h1:egZ8vV5atrUWUbnSsHn6vB8R21G2wrKqNiDt3iWertk= +go.opentelemetry.io/otel/log v0.8.0/go.mod h1:M9qvDdUTRCopJcGRKg57+JSQ9LgLBrwwfC32epk5NX8= go.opentelemetry.io/otel/metric v1.37.0 h1:mvwbQS5m0tbmqML4NqK+e3aDiO02vsf/WgbsdpcPoZE= go.opentelemetry.io/otel/metric v1.37.0/go.mod h1:04wGrZurHYKOc+RKeye86GwKiTb9FKm1WHtO+4EVr2E= go.opentelemetry.io/otel/sdk v1.35.0 h1:iPctf8iprVySXSKJffSS79eOjl9pvxV9ZqOWT0QejKY= go.opentelemetry.io/otel/sdk v1.35.0/go.mod h1:+ga1bZliga3DxJ3CQGg3updiaAJoNECOgJREo9KHGQg= +go.opentelemetry.io/otel/sdk/log v0.8.0 h1:zg7GUYXqxk1jnGF/dTdLPrK06xJdrXgqgFLnI4Crxvs= +go.opentelemetry.io/otel/sdk/log v0.8.0/go.mod h1:50iXr0UVwQrYS45KbruFrEt4LvAdCaWWgIrsN3ZQggo= go.opentelemetry.io/otel/sdk/metric v1.35.0 h1:1RriWBmCKgkeHEhM7a2uMjMUfP7MsOF5JpUCaEqEI9o= go.opentelemetry.io/otel/sdk/metric v1.35.0/go.mod h1:is6XYCUMpcKi+ZsOvfluY5YstFnhW0BidkR+gL+qN+w= go.opentelemetry.io/otel/trace v1.37.0 h1:HLdcFNbRQBE2imdSEgm/kwqmQj1Or1l/7bW6mxVK7z4= go.opentelemetry.io/otel/trace v1.37.0/go.mod h1:TlgrlQ+PtQO5XFerSPUYG0JSgGyryXewPGyayAWSBS0= +go.opentelemetry.io/proto/otlp v1.4.0 
h1:TA9WRvW6zMwP+Ssb6fLoUIuirti1gGbP28GcKG1jgeg= +go.opentelemetry.io/proto/otlp v1.4.0/go.mod h1:PPBWZIP98o2ElSqI35IHfu7hIhSwvc5N38Jw8pXuGFY= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= go.uber.org/zap v1.27.0 h1:aJMhYGrd5QSmlpLMr2MftRKl7t8J8PTZPA732ud/XR8= go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= -golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= +go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI= +go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU= +go.yaml.in/yaml/v3 v3.0.3 h1:bXOww4E/J3f66rav3pX3m8w6jDE4knZjGOw8b5Y6iNE= +go.yaml.in/yaml/v3 v3.0.3/go.mod h1:tBHosrYAkRZjRAOREWbDnBXUf08JOwYq++0QNwQiWzI= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= @@ -502,9 +468,7 @@ golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.6.0-dev.0.20220106191415-9b9b3d81d5e3/go.mod h1:3p9vT2HGsQu2K1YbXdKPJLVgG5VJdoTa1poYQBtP1AY= golang.org/x/mod v0.26.0 h1:EGMPT//Ezu+ylkCijjPc+f4Aih7sZvaAr+O3EHBxvZg= golang.org/x/mod v0.26.0/go.mod h1:/j6NAhSk8iQ723BGAUyoAcn7SlD7s15Dp9Nd/SfeaFQ= -golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= -golang.org/x/net v0.0.0-20190613194153-d28f0bde5980/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= @@ -517,19 +481,14 @@ golang.org/x/net v0.42.0 h1:jzkYrhi3YQWD6MLBJcsklgQsoAcw89EcZbJw8Z614hs= golang.org/x/net v0.42.0/go.mod h1:FF1RA5d3u7nAYA4z2TkclSCKh68eSXtiFwcWQpPXdt8= golang.org/x/oauth2 v0.30.0 h1:dnDm7JmhM45NNpd8FDDeLhK6FwqbOf4MLCM9zb1BOHI= golang.org/x/oauth2 v0.30.0/go.mod h1:B++QgG3ZKulg6sRPGD/mqlHQs5rB3Ml9erfeDY7xKlU= -golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.16.0 h1:ycBJEhp9p4vXvUZNszeOq0kGTPghopOL8q0fq3vstxw= golang.org/x/sync v0.16.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= -golang.org/x/sys 
v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20181116152217-5ac8a444bdc5/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20190801041406-cbf593c0f2f3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200116001909-b77594299b42/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -578,6 +537,9 @@ golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8T golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20220411194840-2f41105eb62f/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +google.golang.org/genproto v0.0.0-20240213162025-012b6fc9bca9 h1:9+tzLLstTlPTRyJTh+ah5wIMsBW5c4tQwGTN3thOW9Y= +google.golang.org/genproto/googleapis/api v0.0.0-20250106144421-5f5ef82da422 h1:GVIKPyP/kLIyVOgOnTwFOrvQaQUzOzGMCxgFUOEmm24= +google.golang.org/genproto/googleapis/api v0.0.0-20250106144421-5f5ef82da422/go.mod h1:b6h1vNKhxaSoEI+5jc3PJUCustfli/mRab7295pY7rw= google.golang.org/genproto/googleapis/rpc v0.0.0-20250414145226-207652e42e2e h1:ztQaXfzEXTmCBvbtWYRhJxW+0iJcz2qXfd38/e9l7bA= google.golang.org/genproto/googleapis/rpc v0.0.0-20250414145226-207652e42e2e/go.mod h1:qQ0YXyHHx3XkvlzUtpXDkS29lDSafHMZBAZDc03LQ3A= google.golang.org/grpc v1.71.1 h1:ffsFWr7ygTUscGPI0KKK6TLrGz0476KUvvsbqWK0rPI= @@ -587,7 +549,6 @@ google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQ google.golang.org/protobuf v1.28.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= -gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= @@ -600,7 +561,6 @@ gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= gopkg.in/ini.v1 v1.67.0 h1:Dgnx+6+nfE+IfzjUEISNeydPJh9AXNNsWbGP9KzCsOA= gopkg.in/ini.v1 v1.67.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k= -gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.3.0/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= @@ -611,46 +571,45 @@ gopkg.in/yaml.v3 v3.0.0-20200615113413-eeeca48fe776/go.mod h1:K4uyk7z7BCEPqu6E+C gopkg.in/yaml.v3 
v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -gotest.tools/v3 v3.4.0 h1:ZazjZUfuVeZGLAmlKKuyv3IKP5orXcwtOwDQH6YVr6o= -gotest.tools/v3 v3.4.0/go.mod h1:CtbdzLSsqVhDgMtKsx03ird5YTGB3ar27v0u/yKBW5g= -helm.sh/helm/v3 v3.17.4 h1:GK+vgn9gKCyoH44+f3B5zpA78iH3AK4ywIInDEmmn/g= -helm.sh/helm/v3 v3.17.4/go.mod h1:+uJKMH/UiMzZQOALR3XUf3BLIoczI2RKKD6bMhPh4G8= -k8s.io/api v0.33.2 h1:YgwIS5jKfA+BZg//OQhkJNIfie/kmRsO0BmNaVSimvY= -k8s.io/api v0.33.2/go.mod h1:fhrbphQJSM2cXzCWgqU29xLDuks4mu7ti9vveEnpSXs= -k8s.io/apiextensions-apiserver v0.32.2 h1:2YMk285jWMk2188V2AERy5yDwBYrjgWYggscghPCvV4= -k8s.io/apiextensions-apiserver v0.32.2/go.mod h1:GPwf8sph7YlJT3H6aKUWtd0E+oyShk/YHWQHf/OOgCA= -k8s.io/apimachinery v0.33.2 h1:IHFVhqg59mb8PJWTLi8m1mAoepkUNYmptHsV+Z1m5jY= -k8s.io/apimachinery v0.33.2/go.mod h1:BHW0YOu7n22fFv/JkYOEfkUYNRN0fj0BlvMFWA7b+SM= -k8s.io/apiserver v0.32.2 h1:WzyxAu4mvLkQxwD9hGa4ZfExo3yZZaYzoYvvVDlM6vw= -k8s.io/apiserver v0.32.2/go.mod h1:PEwREHiHNU2oFdte7BjzA1ZyjWjuckORLIK/wLV5goM= -k8s.io/cli-runtime v0.32.2 h1:aKQR4foh9qeyckKRkNXUccP9moxzffyndZAvr+IXMks= -k8s.io/cli-runtime v0.32.2/go.mod h1:a/JpeMztz3xDa7GCyyShcwe55p8pbcCVQxvqZnIwXN8= -k8s.io/client-go v0.33.2 h1:z8CIcc0P581x/J1ZYf4CNzRKxRvQAwoAolYPbtQes+E= -k8s.io/client-go v0.33.2/go.mod h1:9mCgT4wROvL948w6f6ArJNb7yQd7QsvqavDeZHvNmHo= -k8s.io/component-base v0.32.2 h1:1aUL5Vdmu7qNo4ZsE+569PV5zFatM9hl+lb3dEea2zU= -k8s.io/component-base v0.32.2/go.mod h1:PXJ61Vx9Lg+P5mS8TLd7bCIr+eMJRQTyXe8KvkrvJq0= +helm.sh/helm/v3 v3.18.5 h1:Cc3Z5vd6kDrZq9wO9KxKLNEickiTho6/H/dBNRVSos4= +helm.sh/helm/v3 v3.18.5/go.mod h1:L/dXDR2r539oPlFP1PJqKAC1CUgqHJDLkxKpDGrWnyg= +k8s.io/api v0.33.3 h1:SRd5t//hhkI1buzxb288fy2xvjubstenEKL9K51KBI8= +k8s.io/api v0.33.3/go.mod h1:01Y/iLUjNBM3TAvypct7DIj0M0NIZc+PzAHCIo0CYGE= +k8s.io/apiextensions-apiserver v0.33.3 h1:qmOcAHN6DjfD0v9kxL5udB27SRP6SG/MTopmge3MwEs= +k8s.io/apiextensions-apiserver v0.33.3/go.mod h1:oROuctgo27mUsyp9+Obahos6CWcMISSAPzQ77CAQGz8= +k8s.io/apimachinery v0.33.3 h1:4ZSrmNa0c/ZpZJhAgRdcsFcZOw1PQU1bALVQ0B3I5LA= +k8s.io/apimachinery v0.33.3/go.mod h1:BHW0YOu7n22fFv/JkYOEfkUYNRN0fj0BlvMFWA7b+SM= +k8s.io/apiserver v0.33.3 h1:Wv0hGc+QFdMJB4ZSiHrCgN3zL3QRatu56+rpccKC3J4= +k8s.io/apiserver v0.33.3/go.mod h1:05632ifFEe6TxwjdAIrwINHWE2hLwyADFk5mBsQa15E= +k8s.io/cli-runtime v0.33.3 h1:Dgy4vPjNIu8LMJBSvs8W0LcdV0PX/8aGG1DA1W8lklA= +k8s.io/cli-runtime v0.33.3/go.mod h1:yklhLklD4vLS8HNGgC9wGiuHWze4g7x6XQZ+8edsKEo= +k8s.io/client-go v0.33.3 h1:M5AfDnKfYmVJif92ngN532gFqakcGi6RvaOF16efrpA= +k8s.io/client-go v0.33.3/go.mod h1:luqKBQggEf3shbxHY4uVENAxrDISLOarxpTKMiUuujg= +k8s.io/component-base v0.33.3 h1:mlAuyJqyPlKZM7FyaoM/LcunZaaY353RXiOd2+B5tGA= +k8s.io/component-base v0.33.3/go.mod h1:ktBVsBzkI3imDuxYXmVxZ2zxJnYTZ4HAsVj9iF09qp4= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff h1:/usPimJzUKKu+m+TE36gUyGcf03XZEP0ZIKgKj35LS4= k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff/go.mod h1:5jIi+8yX4RIb8wk3XwBo5Pq2ccx4FP10ohkbSKCZoK8= -k8s.io/kubectl v0.32.2 h1:TAkag6+XfSBgkqK9I7ZvwtF0WVtUAvK8ZqTt+5zi1Us= -k8s.io/kubectl v0.32.2/go.mod h1:+h/NQFSPxiDZYX/WZaWw9fwYezGLISP0ud8nQKg+3g8= +k8s.io/kubectl v0.33.3 h1:r/phHvH1iU7gO/l7tTjQk2K01ER7/OAJi8uFHHyWSac= +k8s.io/kubectl 
v0.33.3/go.mod h1:euj2bG56L6kUGOE/ckZbCoudPwuj4Kud7BR0GzyNiT0= k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738 h1:M3sRQVHv7vB20Xc2ybTt7ODCeFj6JSWYFzOFnYeS6Ro= k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= -oras.land/oras-go v1.2.5 h1:XpYuAwAb0DfQsunIyMfeET92emK8km3W4yEzZvUbsTo= -oras.land/oras-go v1.2.5/go.mod h1:PuAwRShRZCsZb7g8Ar3jKKQR/2A/qN+pkYxIOd/FAoo= +oras.land/oras-go/v2 v2.6.0 h1:X4ELRsiGkrbeox69+9tzTu492FMUu7zJQW6eJU+I2oc= +oras.land/oras-go/v2 v2.6.0/go.mod h1:magiQDfG6H1O9APp+rOsvCPcW1GD2MM7vgnKY0Y+u1o= sigs.k8s.io/controller-runtime v0.19.0 h1:nWVM7aq+Il2ABxwiCizrVDSlmDcshi9llbaFbC0ji/Q= sigs.k8s.io/controller-runtime v0.19.0/go.mod h1:iRmWllt8IlaLjvTTDLhRBXIEtkCK6hwVBJJsYS9Ajf4= sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 h1:/Rv+M11QRah1itp8VhT6HoVx1Ray9eB4DBr+K+/sCJ8= sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3/go.mod h1:18nIHnGi6636UCz6m8i4DhaJ65T6EruyzmoQqI2BVDo= -sigs.k8s.io/kustomize/api v0.18.0 h1:hTzp67k+3NEVInwz5BHyzc9rGxIauoXferXyjv5lWPo= -sigs.k8s.io/kustomize/api v0.18.0/go.mod h1:f8isXnX+8b+SGLHQ6yO4JG1rdkZlvhaCf/uZbLVMb0U= -sigs.k8s.io/kustomize/kyaml v0.18.1 h1:WvBo56Wzw3fjS+7vBjN6TeivvpbW9GmRaWZ9CIVmt4E= -sigs.k8s.io/kustomize/kyaml v0.18.1/go.mod h1:C3L2BFVU1jgcddNBE1TxuVLgS46TjObMwW5FT9FcjYo= +sigs.k8s.io/kustomize/api v0.19.0 h1:F+2HB2mU1MSiR9Hp1NEgoU2q9ItNOaBJl0I4Dlus5SQ= +sigs.k8s.io/kustomize/api v0.19.0/go.mod h1:/BbwnivGVcBh1r+8m3tH1VNxJmHSk1PzP5fkP6lbL1o= +sigs.k8s.io/kustomize/kyaml v0.19.0 h1:RFge5qsO1uHhwJsu3ipV7RNolC7Uozc0jUBC/61XSlA= +sigs.k8s.io/kustomize/kyaml v0.19.0/go.mod h1:FeKD5jEOH+FbZPpqUghBP8mrLjJ3+zD3/rf9NNu1cwY= sigs.k8s.io/randfill v0.0.0-20250304075658-069ef1bbf016/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU= sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= sigs.k8s.io/structured-merge-diff/v4 v4.6.0 h1:IUA9nvMmnKWcj5jl84xn+T5MnlZKThmUW1TdblaLVAc= sigs.k8s.io/structured-merge-diff/v4 v4.6.0/go.mod h1:dDy58f92j70zLsuZVuUX5Wp9vtxXpaZnkPGWeqDfCps= -sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E= sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY= +sigs.k8s.io/yaml v1.5.0 h1:M10b2U7aEUY6hRtU870n2VTPgR5RZiL/I6Lcc2F4NUQ= +sigs.k8s.io/yaml v1.5.0/go.mod h1:wZs27Rbxoai4C0f8/9urLZtZtF3avA3gKvGyPdDqTO4= diff --git a/internal/actions/check_node_deleted.go b/internal/actions/check_node_deleted.go index 2d32afde..6c7cad85 100644 --- a/internal/actions/check_node_deleted.go +++ b/internal/actions/check_node_deleted.go @@ -73,7 +73,7 @@ func (h *CheckNodeDeletedHandler) Handle(ctx context.Context, action *castai.Clu boff, h.cfg.retries, func(ctx context.Context) (bool, error) { - return checkNodeDeleted(ctx, h.clientset.CoreV1().Nodes(), req.NodeName, req.NodeID, req.ProviderId, log) + return checkNodeDeleted(ctx, h.clientset, req.NodeName, req.NodeID, req.ProviderId, log) }, func(err error) { log.Warnf("node deletion check failed, will retry: %v", err) diff --git a/internal/actions/check_node_status.go b/internal/actions/check_node_status.go index e423d3d2..228ad166 100644 --- a/internal/actions/check_node_status.go +++ b/internal/actions/check_node_status.go @@ -2,182 +2,62 @@ package actions import ( "context" + "encoding/json" "errors" - "fmt" - "reflect" - "time" - "github.com/samber/lo" "github.com/sirupsen/logrus" corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 
"k8s.io/client-go/kubernetes" - "k8s.io/client-go/kubernetes/typed/core/v1" - "github.com/castai/cluster-controller/internal/castai" - "github.com/castai/cluster-controller/internal/waitext" + "github.com/castai/cluster-controller/internal/informer" ) -var _ ActionHandler = &CheckNodeStatusHandler{} - -func NewCheckNodeStatusHandler(log logrus.FieldLogger, clientset kubernetes.Interface) *CheckNodeStatusHandler { - return &CheckNodeStatusHandler{ +// NewCheckNodeStatusHandler creates a handler for checking node status. +// If informerManager is provided, it uses efficient informer-based watching. +// If informerManager is nil, it falls back to polling-based implementation. +func NewCheckNodeStatusHandler(log logrus.FieldLogger, clientset kubernetes.Interface, informerManager *informer.Manager) ActionHandler { + if informerManager != nil { + log.Info("using informer-based check node status handler") + return &checkNodeStatusInformerHandler{ + log: log, + clientset: clientset, + informerManager: informerManager, + } + } + log.Info("using polling-based check node status handler") + return &checkNodeStatusPollingHandler{ log: log, clientset: clientset, } } -type CheckNodeStatusHandler struct { - log logrus.FieldLogger - clientset kubernetes.Interface -} - -func (h *CheckNodeStatusHandler) Handle(ctx context.Context, action *castai.ClusterAction) error { - if action == nil { - return fmt.Errorf("action is nil %w", errAction) - } - req, ok := action.Data().(*castai.ActionCheckNodeStatus) - if !ok { - return newUnexpectedTypeErr(action.Data(), req) - } - - log := h.log.WithFields(logrus.Fields{ - "node_name": req.NodeName, - "node_id": req.NodeID, - "provider_id": req.ProviderId, - "node_status": req.NodeStatus, - "type": reflect.TypeOf(action.Data().(*castai.ActionCheckNodeStatus)).String(), - ActionIDLogField: action.ID, - }) - - log.Info("checking status of node") - if req.NodeName == "" || - (req.NodeID == "" && req.ProviderId == "") { - return fmt.Errorf("node name or node ID/provider ID is empty %w", errAction) - } - - switch req.NodeStatus { - case castai.ActionCheckNodeStatus_READY: - log.Info("checking node ready") - return h.checkNodeReady(ctx, log, req) - case castai.ActionCheckNodeStatus_DELETED: - log.Info("checking node deleted") - return h.checkNodeDeleted(ctx, log, req) - - } - - return fmt.Errorf("unknown status to check provided node=%s status=%s", req.NodeName, req.NodeStatus) -} - -func (h *CheckNodeStatusHandler) checkNodeDeleted(ctx context.Context, log *logrus.Entry, req *castai.ActionCheckNodeStatus) error { - timeout := 10 - if req.WaitTimeoutSeconds != nil { - timeout = int(*req.WaitTimeoutSeconds) - } - ctx, cancel := context.WithTimeout(ctx, time.Duration(timeout)*time.Second) - defer cancel() - - b := waitext.DefaultExponentialBackoff() - return waitext.Retry( - ctx, - b, - waitext.Forever, - func(ctx context.Context) (bool, error) { - return checkNodeDeleted(ctx, h.clientset.CoreV1().Nodes(), req.NodeName, req.NodeID, req.ProviderId, log) - }, - func(err error) { - log.Warnf("check node %s status failed, will retry: %v", req.NodeName, err) - }, - ) -} +// Shared errors +var errNodeWatcherClosed = errors.New("node watcher closed") -func checkNodeDeleted(ctx context.Context, clientSet v1.NodeInterface, nodeName, nodeID, providerID string, log logrus.FieldLogger) (bool, error) { - // If node is nil - deleted - // If providerID or label have mismatch, then it's reused and deleted - // If label is present and matches - node is not deleted - // All other use cases can be 
found in tests - n, err := getNodeByIDs(ctx, clientSet, nodeName, nodeID, providerID, log) - if errors.Is(err, errNodeDoesNotMatch) { - // it means that node with given name exists, but it does not match requested node ID or provider ID. +// checkNodeDeleted checks if a node is deleted by verifying it doesn't exist or has been replaced. +// Returns (retry, error) where retry indicates if the operation should be retried. +func checkNodeDeleted(ctx context.Context, clientset kubernetes.Interface, nodeName, nodeID, providerID string, log logrus.FieldLogger) (bool, error) { + n, err := getNodeByIDs(ctx, clientset.CoreV1().Nodes(), nodeName, nodeID, providerID, log) + if isNodeDeletedOrReplaced(err) { return false, nil } - - if errors.Is(err, errNodeNotFound) { - return false, nil - } - if err != nil { return true, err } - if n == nil { return false, nil } - return false, errNodeNotDeleted } -func (h *CheckNodeStatusHandler) checkNodeReady(ctx context.Context, _ *logrus.Entry, req *castai.ActionCheckNodeStatus) error { - timeout := 9 * time.Minute - if req.WaitTimeoutSeconds != nil { - timeout = time.Duration(*req.WaitTimeoutSeconds) * time.Second - } - - watchObject := metav1.SingleObject(metav1.ObjectMeta{ - Name: req.NodeName, - }) - watchObject.TimeoutSeconds = lo.ToPtr(int64(timeout.Seconds())) - - ctx, cancel := context.WithTimeout(ctx, timeout) - defer cancel() - - watch, err := h.clientset.CoreV1().Nodes().Watch(ctx, watchObject) - if err != nil { - return fmt.Errorf("creating node watch: %w", err) - } - defer watch.Stop() - - for { - select { - case <-ctx.Done(): - return fmt.Errorf("node %s request timeout: %v %w", req.NodeName, timeout, ctx.Err()) - case r, ok := <-watch.ResultChan(): - if !ok { - return fmt.Errorf("node %s request timeout: %v %w", req.NodeName, timeout, errNodeWatcherClosed) - } - if node, ok := r.Object.(*corev1.Node); ok { - if h.isNodeReady(node, req.NodeID, req.ProviderId) { - return nil - } - } - } - } -} - -func (h *CheckNodeStatusHandler) isNodeReady(node *corev1.Node, castNodeID, providerID string) bool { - // if node has castai node id label, check if it matches the one we are waiting for - // if it doesn't match, we can skip this node. - if err := isNodeIDProviderIDValid(node, castNodeID, providerID, h.log); err != nil { - h.log.WithFields(logrus.Fields{ - "node": node.Name, - "node_id": castNodeID, - "provider_id": providerID, - }).Warnf("node does not match requested node ID or provider ID: %v", err) - return false - } - - for _, cond := range node.Status.Conditions { - if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue && !containsUninitializedNodeTaint(node.Spec.Taints) { - return true - } - } - - return false +// isNodeDeletedOrReplaced checks if the error indicates the node was deleted or replaced. +func isNodeDeletedOrReplaced(err error) bool { + return errors.Is(err, errNodeDoesNotMatch) || errors.Is(err, errNodeNotFound) } +// containsUninitializedNodeTaint checks if taints contain the cloud provider uninitialized taint. func containsUninitializedNodeTaint(taints []corev1.Taint) bool { for _, taint := range taints { - // Some providers like AKS provider adds this taint even if node contains ready condition. 
if taint == taintCloudProviderUninitialized { return true } @@ -189,3 +69,30 @@ var taintCloudProviderUninitialized = corev1.Taint{ Key: "node.cloudprovider.kubernetes.io/uninitialized", Effect: corev1.TaintEffectNoSchedule, } + +// patchNodeCapacityIfNeeded patches the node capacity with network bandwidth if the label exists. +func patchNodeCapacityIfNeeded(ctx context.Context, log *logrus.Entry, clientset kubernetes.Interface, node *corev1.Node) { + bandwidth, ok := node.Labels["scheduling.cast.ai/network-bandwidth"] + if !ok { + return + } + + patch, err := json.Marshal(map[string]interface{}{ + "status": map[string]interface{}{ + "capacity": map[string]interface{}{ + "scheduling.cast.ai/network-bandwidth": bandwidth, + }, + }, + }) + if err != nil { + log.WithError(err).Error("failed to marshal node capacity patch") + return + } + + log.Infof("going to patch node capacity: %v", node.Name) + if err := patchNodeStatus(ctx, log, clientset, node.Name, patch); err != nil { + log.WithError(err).Error("failed to patch node capacity") + return + } + log.Infof("patched node capacity: %v", node.Name) +} diff --git a/internal/actions/check_node_status_informer.go b/internal/actions/check_node_status_informer.go new file mode 100644 index 00000000..d44414fc --- /dev/null +++ b/internal/actions/check_node_status_informer.go @@ -0,0 +1,254 @@ +package actions + +import ( + "context" + "errors" + "fmt" + "reflect" + "time" + + "github.com/sirupsen/logrus" + corev1 "k8s.io/api/core/v1" + k8serrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/cache" + + "github.com/castai/cluster-controller/internal/castai" + "github.com/castai/cluster-controller/internal/informer" +) + +var _ ActionHandler = &checkNodeStatusInformerHandler{} + +type checkNodeStatusInformerHandler struct { + log logrus.FieldLogger + clientset kubernetes.Interface + informerManager *informer.Manager +} + +func (h *checkNodeStatusInformerHandler) Handle(ctx context.Context, action *castai.ClusterAction) error { + if action == nil { + return fmt.Errorf("action is nil %w", errAction) + } + req, ok := action.Data().(*castai.ActionCheckNodeStatus) + if !ok { + return newUnexpectedTypeErr(action.Data(), req) + } + + log := h.log.WithFields(logrus.Fields{ + "node_name": req.NodeName, + "node_id": req.NodeID, + "provider_id": req.ProviderId, + "node_status": req.NodeStatus, + "type": reflect.TypeOf(action.Data().(*castai.ActionCheckNodeStatus)).String(), + ActionIDLogField: action.ID, + }) + + if req.NodeName == "" || + (req.NodeID == "" && req.ProviderId == "") { + return fmt.Errorf("node name or node ID/provider ID is empty %w", errAction) + } + + switch req.NodeStatus { + case castai.ActionCheckNodeStatus_READY: + log.Info("checking node ready") + return h.checkNodeReady(ctx, log, req) + case castai.ActionCheckNodeStatus_DELETED: + log.Info("checking node deleted") + return h.checkNodeDeleted(ctx, log, req) + } + + return fmt.Errorf("unknown status to check provided node=%s status=%s", req.NodeName, req.NodeStatus) +} + +func (h *checkNodeStatusInformerHandler) checkNodeDeleted(ctx context.Context, log *logrus.Entry, req *castai.ActionCheckNodeStatus) error { + timeout := 10 + if req.WaitTimeoutSeconds != nil { + timeout = int(*req.WaitTimeoutSeconds) + } + ctx, cancel := context.WithTimeout(ctx, time.Duration(timeout)*time.Second) + defer cancel() + + return h.checkNodeDeletedWithInformer(ctx, req.NodeName, req.NodeID, req.ProviderId, log) +} + +func (h *checkNodeStatusInformerHandler) 
checkNodeDeletedWithInformer(ctx context.Context, nodeName, nodeID, providerID string, log logrus.FieldLogger) error { + if err := h.checkNodeAlreadyDeleted(nodeName, nodeID, providerID, log); err != nil { + if errors.Is(err, errNodeNotDeleted) { + return h.waitForNodeDeletion(ctx, nodeName, nodeID, providerID, log) + } + return err + } + return nil +} + +func (h *checkNodeStatusInformerHandler) checkNodeAlreadyDeleted(nodeName, nodeID, providerID string, log logrus.FieldLogger) error { + lister := h.informerManager.GetNodeLister() + node, err := lister.Get(nodeName) + if err != nil { + if k8serrors.IsNotFound(err) { + log.Info("node already deleted in cache") + return nil + } + return fmt.Errorf("getting node from lister: %w", err) + } + + if err := isNodeIDProviderIDValid(node, nodeID, providerID, log); err != nil { + if errors.Is(err, errNodeDoesNotMatch) { + log.Info("node name reused, original node deleted") + return nil + } + return fmt.Errorf("validating node ID/provider ID: %w", err) + } + + return errNodeNotDeleted +} + +func (h *checkNodeStatusInformerHandler) waitForNodeDeletion(ctx context.Context, nodeName, nodeID, providerID string, log logrus.FieldLogger) error { + deleted := make(chan struct{}) + nodeInformer := h.informerManager.GetNodeInformer() + + registration, err := nodeInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{ + UpdateFunc: func(oldObj, newObj any) { + h.handleNodeDeletedUpdateEvent(newObj, nodeName, nodeID, providerID, deleted, log) + }, + DeleteFunc: func(obj any) { + h.handleNodeDeletedDeleteEvent(obj, nodeName, deleted, log) + }, + }) + if err != nil { + return fmt.Errorf("failed to add event handler: %w", err) + } + defer func() { + if err := nodeInformer.RemoveEventHandler(registration); err != nil { + log.WithError(err).Warn("failed to remove event handler") + } + }() + + select { + case <-deleted: + return nil + case <-ctx.Done(): + return fmt.Errorf("timeout waiting for node to be deleted: %w", ctx.Err()) + } +} + +func (h *checkNodeStatusInformerHandler) handleNodeDeletedUpdateEvent(newObj any, nodeName, nodeID, providerID string, deleted chan struct{}, log logrus.FieldLogger) { + node, ok := newObj.(*corev1.Node) + if !ok || node.Name != nodeName { + return + } + if err := isNodeIDProviderIDValid(node, nodeID, providerID, log); err != nil { + if errors.Is(err, errNodeDoesNotMatch) { + log.Info("node name reused, original node deleted (update event)") + select { + case deleted <- struct{}{}: + default: + } + } + } +} + +func (h *checkNodeStatusInformerHandler) handleNodeDeletedDeleteEvent(obj any, nodeName string, deleted chan struct{}, log logrus.FieldLogger) { + node, ok := obj.(*corev1.Node) + if !ok { + tombstone, ok := obj.(cache.DeletedFinalStateUnknown) + if !ok { + return + } + node, ok = tombstone.Obj.(*corev1.Node) + if !ok { + return + } + } + if node.Name == nodeName { + log.Info("node deleted (delete event)") + select { + case deleted <- struct{}{}: + default: + } + } +} + +func (h *checkNodeStatusInformerHandler) checkNodeReady(ctx context.Context, log *logrus.Entry, req *castai.ActionCheckNodeStatus) error { + timeout := 9 * time.Minute + if req.WaitTimeoutSeconds != nil { + timeout = time.Duration(*req.WaitTimeoutSeconds) * time.Second + } + + ctx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + + lister := h.informerManager.GetNodeLister() + node, err := lister.Get(req.NodeName) + if err == nil && h.isNodeReady(node, req.NodeID, req.ProviderId) { + log.Info("node already ready in cache") + 
patchNodeCapacityIfNeeded(ctx, log, h.clientset, node) + return nil + } + + ready := make(chan struct{}) + nodeInformer := h.informerManager.GetNodeInformer() + + registration, err := nodeInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: func(obj any) { + h.handleNodeReadyEvent(obj, req, ready, log, "add event") + }, + UpdateFunc: func(oldObj, newObj any) { + h.handleNodeReadyEvent(newObj, req, ready, log, "update event") + }, + }) + if err != nil { + return fmt.Errorf("failed to add event handler: %w", err) + } + defer func() { + if err := nodeInformer.RemoveEventHandler(registration); err != nil { + log.WithError(err).Warn("failed to remove event handler") + } + }() + + select { + case <-ready: + node, err := lister.Get(req.NodeName) + if err != nil { + log.WithError(err).Error("failed to get node, will skip patch") + return nil + } + patchNodeCapacityIfNeeded(ctx, log, h.clientset, node) + return nil + case <-ctx.Done(): + return fmt.Errorf("timeout waiting for node to be ready: %w", ctx.Err()) + } +} + +func (h *checkNodeStatusInformerHandler) handleNodeReadyEvent(obj any, req *castai.ActionCheckNodeStatus, ready chan struct{}, log *logrus.Entry, eventType string) { + node, ok := obj.(*corev1.Node) + if !ok || node.Name != req.NodeName { + return + } + if h.isNodeReady(node, req.NodeID, req.ProviderId) { + log.Infof("node became ready (%s)", eventType) + select { + case ready <- struct{}{}: + default: + } + } +} + +func (h *checkNodeStatusInformerHandler) isNodeReady(node *corev1.Node, castNodeID, providerID string) bool { + if err := isNodeIDProviderIDValid(node, castNodeID, providerID, h.log); err != nil { + h.log.WithFields(logrus.Fields{ + "node": node.Name, + "node_id": castNodeID, + "provider_id": providerID, + }).Warnf("node does not match requested node ID or provider ID: %v", err) + return false + } + + for _, cond := range node.Status.Conditions { + if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue && !containsUninitializedNodeTaint(node.Spec.Taints) { + return true + } + } + + return false +} diff --git a/internal/actions/check_node_status_polling.go b/internal/actions/check_node_status_polling.go new file mode 100644 index 00000000..e8e35220 --- /dev/null +++ b/internal/actions/check_node_status_polling.go @@ -0,0 +1,152 @@ +package actions + +import ( + "context" + "fmt" + "reflect" + "time" + + "github.com/samber/lo" + "github.com/sirupsen/logrus" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + + "github.com/castai/cluster-controller/internal/castai" + "github.com/castai/cluster-controller/internal/waitext" +) + +var _ ActionHandler = &checkNodeStatusPollingHandler{} + +type checkNodeStatusPollingHandler struct { + log logrus.FieldLogger + clientset kubernetes.Interface +} + +func (h *checkNodeStatusPollingHandler) Handle(ctx context.Context, action *castai.ClusterAction) error { + if action == nil { + return fmt.Errorf("action is nil %w", errAction) + } + req, ok := action.Data().(*castai.ActionCheckNodeStatus) + if !ok { + return newUnexpectedTypeErr(action.Data(), req) + } + + log := h.log.WithFields(logrus.Fields{ + "node_name": req.NodeName, + "node_id": req.NodeID, + "provider_id": req.ProviderId, + "node_status": req.NodeStatus, + "type": reflect.TypeOf(action.Data().(*castai.ActionCheckNodeStatus)).String(), + ActionIDLogField: action.ID, + }) + + log.Info("checking status of node") + if req.NodeName == "" || + (req.NodeID == "" && req.ProviderId == "") { + 
return fmt.Errorf("node name or node ID/provider ID is empty %w", errAction) + } + + switch req.NodeStatus { + case castai.ActionCheckNodeStatus_READY: + log.Info("checking node ready") + return h.checkNodeReady(ctx, log, req) + case castai.ActionCheckNodeStatus_DELETED: + log.Info("checking node deleted") + return h.checkNodeDeleted(ctx, log, req) + } + + return fmt.Errorf("unknown status to check provided node=%s status=%s", req.NodeName, req.NodeStatus) +} + +func (h *checkNodeStatusPollingHandler) checkNodeDeleted(ctx context.Context, log *logrus.Entry, req *castai.ActionCheckNodeStatus) error { + timeout := 10 + if req.WaitTimeoutSeconds != nil { + timeout = int(*req.WaitTimeoutSeconds) + } + ctx, cancel := context.WithTimeout(ctx, time.Duration(timeout)*time.Second) + defer cancel() + + b := waitext.DefaultExponentialBackoff() + return waitext.Retry( + ctx, + b, + waitext.Forever, + func(ctx context.Context) (bool, error) { + return checkNodeDeletedPolling(ctx, h.clientset, req.NodeName, req.NodeID, req.ProviderId, log) + }, + func(err error) { + log.Warnf("check node %s status failed, will retry: %v", req.NodeName, err) + }, + ) +} + +func checkNodeDeletedPolling(ctx context.Context, clientset kubernetes.Interface, nodeName, nodeID, providerID string, log logrus.FieldLogger) (bool, error) { + n, err := getNodeByIDs(ctx, clientset.CoreV1().Nodes(), nodeName, nodeID, providerID, log) + if isNodeDeletedOrReplaced(err) { + return false, nil + } + if err != nil { + return true, err + } + if n == nil { + return false, nil + } + return false, errNodeNotDeleted +} + +func (h *checkNodeStatusPollingHandler) checkNodeReady(ctx context.Context, _ *logrus.Entry, req *castai.ActionCheckNodeStatus) error { + timeout := 9 * time.Minute + if req.WaitTimeoutSeconds != nil { + timeout = time.Duration(*req.WaitTimeoutSeconds) * time.Second + } + + watchObject := metav1.SingleObject(metav1.ObjectMeta{ + Name: req.NodeName, + }) + watchObject.TimeoutSeconds = lo.ToPtr(int64(timeout.Seconds())) + + ctx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + + watch, err := h.clientset.CoreV1().Nodes().Watch(ctx, watchObject) + if err != nil { + return fmt.Errorf("creating node watch: %w", err) + } + defer watch.Stop() + + for { + select { + case <-ctx.Done(): + return fmt.Errorf("node %s request timeout: %v %w", req.NodeName, timeout, ctx.Err()) + case r, ok := <-watch.ResultChan(): + if !ok { + return fmt.Errorf("node %s request timeout: %v %w", req.NodeName, timeout, errNodeWatcherClosed) + } + if node, ok := r.Object.(*corev1.Node); ok { + if h.isNodeReady(node, req.NodeID, req.ProviderId) { + return nil + } + } + } + } +} + +func (h *checkNodeStatusPollingHandler) isNodeReady(node *corev1.Node, castNodeID, providerID string) bool { + if err := isNodeIDProviderIDValid(node, castNodeID, providerID, h.log); err != nil { + h.log.WithFields(logrus.Fields{ + "node": node.Name, + "node_id": castNodeID, + "provider_id": providerID, + }).Warnf("node does not match requested node ID or provider ID: %v", err) + return false + } + + for _, cond := range node.Status.Conditions { + if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue && !containsUninitializedNodeTaint(node.Spec.Taints) { + return true + } + } + + return false +} diff --git a/internal/actions/check_node_status_test.go b/internal/actions/check_node_status_test.go index 331a49e6..6b2aa670 100644 --- a/internal/actions/check_node_status_test.go +++ b/internal/actions/check_node_status_test.go @@ -3,6 +3,8 @@ package actions 
import ( "context" "testing" + "testing/synctest" + "time" "github.com/google/uuid" "github.com/samber/lo" @@ -17,6 +19,7 @@ import ( k8stest "k8s.io/client-go/testing" "github.com/castai/cluster-controller/internal/castai" + "github.com/castai/cluster-controller/internal/informer" ) const ( @@ -80,7 +83,7 @@ func TestCheckNodeStatusHandler_Handle_Deleted(t *testing.T) { { name: "provider id of Node is empty but nodeID matches", args: args{ - action: newActionCheckNodeStatus(nodeName, nodeID, providerID, castai.ActionCheckNodeStatus_DELETED, nil), + action: newActionCheckNodeStatus(nodeName, nodeID, providerID, castai.ActionCheckNodeStatus_DELETED, lo.ToPtr(int32(1))), }, fields: fields{ tuneFakeObjects: []runtime.Object{ @@ -97,24 +100,24 @@ func TestCheckNodeStatusHandler_Handle_Deleted(t *testing.T) { }, }, }, - wantErr: errNodeNotDeleted, + wantErr: context.DeadlineExceeded, }, { name: "provider id of request is empty but nodeID matches", args: args{ - action: newActionCheckNodeStatus(nodeName, nodeID, "", castai.ActionCheckNodeStatus_DELETED, nil), + action: newActionCheckNodeStatus(nodeName, nodeID, "", castai.ActionCheckNodeStatus_DELETED, lo.ToPtr(int32(1))), }, fields: fields{ tuneFakeObjects: []runtime.Object{ nodeObject, }, }, - wantErr: errNodeNotDeleted, + wantErr: context.DeadlineExceeded, }, { name: "node id at label is empty but provider ID matches", args: args{ - action: newActionCheckNodeStatus(nodeName, nodeID, providerID, castai.ActionCheckNodeStatus_DELETED, nil), + action: newActionCheckNodeStatus(nodeName, nodeID, providerID, castai.ActionCheckNodeStatus_DELETED, lo.ToPtr(int32(1))), }, fields: fields{ tuneFakeObjects: []runtime.Object{ @@ -129,19 +132,19 @@ func TestCheckNodeStatusHandler_Handle_Deleted(t *testing.T) { }, }, }, - wantErr: errNodeNotDeleted, + wantErr: context.DeadlineExceeded, }, { name: "node id at request is empty but provider ID matches", args: args{ - action: newActionCheckNodeStatus(nodeName, "", providerID, castai.ActionCheckNodeStatus_DELETED, nil), + action: newActionCheckNodeStatus(nodeName, "", providerID, castai.ActionCheckNodeStatus_DELETED, lo.ToPtr(int32(1))), }, fields: fields{ tuneFakeObjects: []runtime.Object{ nodeObject, }, }, - wantErr: errNodeNotDeleted, + wantErr: context.DeadlineExceeded, }, { name: "node with the same name exists but IDs does not match", @@ -179,9 +182,9 @@ func TestCheckNodeStatusHandler_Handle_Deleted(t *testing.T) { }, }, args: args{ - action: newActionCheckNodeStatus(nodeName, nodeID, providerID, castai.ActionCheckNodeStatus_DELETED, nil), + action: newActionCheckNodeStatus(nodeName, nodeID, providerID, castai.ActionCheckNodeStatus_DELETED, lo.ToPtr(int32(1))), }, - wantErr: errNodeNotDeleted, + wantErr: context.DeadlineExceeded, }, } for _, tt := range tests { @@ -189,10 +192,25 @@ func TestCheckNodeStatusHandler_Handle_Deleted(t *testing.T) { t.Run(tt.name, func(t *testing.T) { t.Parallel() clientSet := fake.NewClientset(tt.fields.tuneFakeObjects...) 
+ log := logrus.New() log.SetLevel(logrus.DebugLevel) + + infMgr := informer.NewManager(log, clientSet, 10*time.Minute) + + // Start informer manager + ctx, cancel := context.WithCancel(context.Background()) + t.Cleanup(cancel) + + go func() { + _ = infMgr.Start(ctx) + }() + + // Wait for informer to sync + time.Sleep(100 * time.Millisecond) + h := NewCheckNodeStatusHandler( - log, clientSet) + log, clientSet, infMgr) err := h.Handle(context.Background(), tt.args.action) require.ErrorIs(t, err, tt.wantErr, "unexpected error: %v", err) }) @@ -434,13 +452,33 @@ func TestCheckNodeStatusHandler_Handle_Ready(t *testing.T) { }, } for _, tt := range tests { - tt := tt t.Run(tt.name, func(t *testing.T) { t.Parallel() clientSet := fake.NewClientset() watcher := watch.NewFake() - defer watcher.Stop() + + // Set up watch reactor before starting informer + clientSet.PrependWatchReactor("nodes", k8stest.DefaultWatchReactor(watcher, nil)) + + log := logrus.New() + log.SetLevel(logrus.DebugLevel) + infMgr := informer.NewManager(log, clientSet, 10*time.Minute) + + // Start informer manager + ctx, cancel := context.WithCancel(context.Background()) + t.Cleanup(func() { + cancel() + watcher.Stop() + }) + go func() { + _ = infMgr.Start(ctx) + }() + + // Wait for informer to sync + time.Sleep(100 * time.Millisecond) + + // Send watch events after informer is ready go func() { if len(tt.fields.tuneFakeObjects) == 0 { return @@ -451,11 +489,8 @@ func TestCheckNodeStatusHandler_Handle_Ready(t *testing.T) { watcher.Action(obj.event, obj.object) } }() - clientSet.PrependWatchReactor("nodes", k8stest.DefaultWatchReactor(watcher, nil)) - log := logrus.New() - log.SetLevel(logrus.DebugLevel) - h := NewCheckNodeStatusHandler(log, clientSet) + h := NewCheckNodeStatusHandler(log, clientSet, infMgr) err := h.Handle(context.Background(), tt.args.action) require.ErrorIs(t, err, tt.wantErr, "unexpected error: %v", err) @@ -475,3 +510,369 @@ func newActionCheckNodeStatus(nodeName, nodeID, providerID string, status castai }, } } + +func TestCheckNodeStatusHandler_PatchNodeCapacity(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + node *v1.Node + expectPatch bool + patchShouldErr bool + }{ + { + name: "should patch node with network bandwidth label", + node: &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: nodeName, + Labels: map[string]string{ + castai.LabelNodeID: nodeID, + "scheduling.cast.ai/network-bandwidth": "10Gi", + }, + }, + Spec: v1.NodeSpec{ + ProviderID: providerID, + }, + }, + expectPatch: true, + }, + { + name: "should not patch node without network bandwidth label", + node: &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: nodeName, + Labels: map[string]string{ + castai.LabelNodeID: nodeID, + }, + }, + Spec: v1.NodeSpec{ + ProviderID: providerID, + }, + }, + expectPatch: false, + }, + { + name: "should handle patch error gracefully", + node: &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: nodeName, + Labels: map[string]string{ + castai.LabelNodeID: nodeID, + "scheduling.cast.ai/network-bandwidth": "10Gi", + }, + }, + Spec: v1.NodeSpec{ + ProviderID: providerID, + }, + }, + expectPatch: true, + patchShouldErr: true, + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + clientSet := fake.NewClientset(tt.node) + + if tt.patchShouldErr { + // Make patch fail + clientSet.PrependReactor("patch", "nodes", func(action k8stest.Action) (handled bool, ret runtime.Object, err error) { + return true, nil, context.DeadlineExceeded + }) + } + + 
log := logrus.New() + log.SetLevel(logrus.DebugLevel) + + ctx := context.Background() + patchNodeCapacityIfNeeded(ctx, log.WithField("test", tt.name), clientSet, tt.node) + + if tt.expectPatch && !tt.patchShouldErr { + // Verify patch was called + actions := clientSet.Actions() + var patchFound bool + for _, action := range actions { + if action.GetVerb() == "patch" { + patchFound = true + break + } + } + require.True(t, patchFound, "expected patch action to be called") + } + + if !tt.expectPatch { + // Verify patch was NOT called + actions := clientSet.Actions() + for _, action := range actions { + require.NotEqual(t, "patch", action.GetVerb(), "patch should not be called") + } + } + }) + } +} + +func TestCheckNodeStatusHandler_Handle_Deleted_WithEvents(t *testing.T) { + t.Parallel() + + nodeObject := &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: nodeName, + Labels: map[string]string{ + castai.LabelNodeID: nodeID, + }, + }, + Spec: v1.NodeSpec{ + ProviderID: providerID, + }, + } + + nodeObjectDifferentID := &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: nodeName, + Labels: map[string]string{ + castai.LabelNodeID: "different-node-id", + }, + }, + Spec: v1.NodeSpec{ + ProviderID: "different-provider-id", + }, + } + + tests := []struct { + name string + initialNode *v1.Node + eventType watch.EventType + eventNode *v1.Node + wantErr error + }{ + { + name: "should succeed when node is deleted via delete event", + initialNode: nodeObject, + eventType: watch.Deleted, + eventNode: nodeObject, + wantErr: nil, + }, + { + name: "should succeed when node name is reused via update event", + initialNode: nodeObject, + eventType: watch.Modified, + eventNode: nodeObjectDifferentID, + wantErr: nil, + }, + { + name: "should handle tombstone on delete", + initialNode: nodeObject, + eventType: watch.Deleted, + eventNode: nodeObject, + wantErr: nil, + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + clientSet := fake.NewClientset(tt.initialNode) + watcher := watch.NewFake() + + clientSet.PrependWatchReactor("nodes", k8stest.DefaultWatchReactor(watcher, nil)) + + log := logrus.New() + log.SetLevel(logrus.DebugLevel) + + infMgr := informer.NewManager(log, clientSet, 10*time.Minute) + + ctx, cancel := context.WithCancel(context.Background()) + t.Cleanup(func() { + cancel() + watcher.Stop() + }) + + go func() { + _ = infMgr.Start(ctx) + }() + + // Wait for informer to sync + time.Sleep(100 * time.Millisecond) + + // Send initial add event + watcher.Add(tt.initialNode) + time.Sleep(50 * time.Millisecond) + + // Send delete/update event after a delay + go func() { + time.Sleep(100 * time.Millisecond) + watcher.Action(tt.eventType, tt.eventNode) + }() + + h := NewCheckNodeStatusHandler(log, clientSet, infMgr) + action := newActionCheckNodeStatus(nodeName, nodeID, providerID, castai.ActionCheckNodeStatus_DELETED, lo.ToPtr(int32(5))) + + err := h.Handle(context.Background(), action) + require.ErrorIs(t, err, tt.wantErr) + }) + } +} + +func TestCheckNodeStatusHandler_Handle_Ready_WithAddEvent(t *testing.T) { + t.Parallel() + + nodeUID := types.UID(uuid.New().String()) + nodeObjectReady := &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + UID: nodeUID, + Name: nodeName, + Labels: map[string]string{ + castai.LabelNodeID: nodeID, + }, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionTrue, + }, + }, + }, + Spec: v1.NodeSpec{ + ProviderID: providerID, + }, + } + + tests := []struct { + name string + 
eventType watch.EventType + eventNode *v1.Node + wantErr error + }{ + { + name: "should succeed when node becomes ready via add event", + eventType: watch.Added, + eventNode: nodeObjectReady, + wantErr: nil, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + synctest.Test(t, func(t *testing.T) { + clientSet := fake.NewClientset() + watcher := watch.NewFake() + + clientSet.PrependWatchReactor("nodes", k8stest.DefaultWatchReactor(watcher, nil)) + + log := logrus.New() + log.SetLevel(logrus.DebugLevel) + + infMgr := informer.NewManager(log, clientSet, 10*time.Minute) + + ctx, cancel := context.WithCancel(context.Background()) + t.Cleanup(func() { + cancel() + watcher.Stop() + }) + + go func() { + _ = infMgr.Start(ctx) + }() + + // Wait for informer to sync (cache will be empty) + synctest.Wait() + + h := NewCheckNodeStatusHandler(log, clientSet, infMgr) + action := newActionCheckNodeStatus(nodeName, nodeID, providerID, castai.ActionCheckNodeStatus_READY, lo.ToPtr(int32(5))) + + // Start Handle in a goroutine, it will block waiting for the node + var handleErr error + go func() { + handleErr = h.Handle(context.Background(), action) + }() + + // Wait for Handle to be blocked waiting for events + synctest.Wait() + + // Now send the add event + watcher.Action(tt.eventType, tt.eventNode) + + // Wait for Handle to complete + synctest.Wait() + + require.ErrorIs(t, handleErr, tt.wantErr) + }) + }) + } +} + +func TestCheckNodeStatusHandler_Handle_Ready_AlreadyInCache(t *testing.T) { + t.Parallel() + + nodeUID := types.UID(uuid.New().String()) + nodeObjectReady := &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + UID: nodeUID, + Name: nodeName, + Labels: map[string]string{ + castai.LabelNodeID: nodeID, + }, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionTrue, + }, + }, + }, + Spec: v1.NodeSpec{ + ProviderID: providerID, + }, + } + + tests := []struct { + name string + initialNode *v1.Node + wantErr error + }{ + { + name: "should succeed immediately when node already ready in cache", + initialNode: nodeObjectReady, + wantErr: nil, + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + clientSet := fake.NewClientset(tt.initialNode) + + log := logrus.New() + log.SetLevel(logrus.DebugLevel) + + infMgr := informer.NewManager(log, clientSet, 10*time.Minute) + + ctx, cancel := context.WithCancel(context.Background()) + t.Cleanup(cancel) + + go func() { + _ = infMgr.Start(ctx) + }() + + // Wait for informer to sync with initial state + time.Sleep(100 * time.Millisecond) + + h := NewCheckNodeStatusHandler(log, clientSet, infMgr) + action := newActionCheckNodeStatus(nodeName, nodeID, providerID, castai.ActionCheckNodeStatus_READY, lo.ToPtr(int32(5))) + + err := h.Handle(context.Background(), action) + require.ErrorIs(t, err, tt.wantErr) + }) + } +} diff --git a/internal/actions/delete_node_handler.go b/internal/actions/delete_node_handler.go index 684260a6..ae8c6526 100644 --- a/internal/actions/delete_node_handler.go +++ b/internal/actions/delete_node_handler.go @@ -36,18 +36,15 @@ func NewDeleteNodeHandler(log logrus.FieldLogger, clientset kubernetes.Interface deleteRetryWait: 5 * time.Second, podsTerminationWait: 30 * time.Second, }, - DrainNodeHandler: DrainNodeHandler{ - log: log, - clientset: clientset, - }, + drainCfg: newDrainNodeConfig(""), } } type DeleteNodeHandler struct { - DrainNodeHandler log logrus.FieldLogger clientset kubernetes.Interface 
cfg deleteNodeConfig + drainCfg drainNodeConfig } func (h *DeleteNodeHandler) Handle(ctx context.Context, action *castai.ClusterAction) error { @@ -142,7 +139,7 @@ func (h *DeleteNodeHandler) Handle(ctx context.Context, action *castai.ClusterAc // Create delete options with grace period 0 - force delete. deleteOptions := metav1.NewDeleteOptions(0) deletePod := func(ctx context.Context, pod v1.Pod) error { - return h.deletePod(ctx, *deleteOptions, pod) + return deletePodWithRetry(ctx, h.log, h.clientset, *deleteOptions, pod, h.drainCfg) } deletedPods, failedPods := executeBatchPodActions(ctx, log, pods, deletePod, "delete-pod") diff --git a/internal/actions/delete_node_handler_test.go b/internal/actions/delete_node_handler_test.go index 07611873..84f7e081 100644 --- a/internal/actions/delete_node_handler_test.go +++ b/internal/actions/delete_node_handler_test.go @@ -138,10 +138,10 @@ func TestDeleteNodeHandler_Handle(t *testing.T) { clientSet = fake.NewClientset(tt.fields.tuneFakeObjects...) } h := &DeleteNodeHandler{ - DrainNodeHandler: DrainNodeHandler{clientset: clientSet, log: logrus.New()}, - log: logrus.New(), - clientset: clientSet, - cfg: tt.fields.cfg, + log: logrus.New(), + clientset: clientSet, + cfg: tt.fields.cfg, + drainCfg: newDrainNodeConfig(""), } err := h.Handle(context.Background(), tt.args.action) require.Equal(t, tt.wantErr != nil, err != nil, "expected error: %v, got: %v", tt.wantErr, err) diff --git a/internal/actions/drain_node_handler.go b/internal/actions/drain_node_handler.go index 4ac119b1..806b9c4e 100644 --- a/internal/actions/drain_node_handler.go +++ b/internal/actions/drain_node_handler.go @@ -4,28 +4,21 @@ import ( "context" "errors" "fmt" - "reflect" "strings" "time" "github.com/samber/lo" "github.com/sirupsen/logrus" v1 "k8s.io/api/core/v1" - policyv1 "k8s.io/api/policy/v1" - "k8s.io/api/policy/v1beta1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/fields" - "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/client-go/kubernetes" - "k8s.io/kubectl/pkg/drain" "github.com/castai/cluster-controller/internal/castai" + "github.com/castai/cluster-controller/internal/informer" "github.com/castai/cluster-controller/internal/waitext" ) -var _ ActionHandler = &DrainNodeHandler{} - const ( minDrainTimeout = 0 // Minimal pod drain timeout. ) @@ -40,25 +33,44 @@ type drainNodeConfig struct { skipDeletedTimeoutSeconds int } -func NewDrainNodeHandler(log logrus.FieldLogger, clientset kubernetes.Interface, castNamespace string) *DrainNodeHandler { - return &DrainNodeHandler{ +func newDrainNodeConfig(castNamespace string) drainNodeConfig { + return drainNodeConfig{ + podsDeleteTimeout: 2 * time.Minute, + podDeleteRetries: 5, + podDeleteRetryDelay: 5 * time.Second, + podEvictRetryDelay: 5 * time.Second, + podsTerminationWaitRetryDelay: 10 * time.Second, + castNamespace: castNamespace, + skipDeletedTimeoutSeconds: 60, + } +} + +// NewDrainNodeHandler creates a handler for draining nodes. +// If informerManager is provided, it uses efficient informer-based watching. +// If informerManager is nil, it falls back to polling-based implementation. 
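// Example (editorial sketch, not part of the applied change): how the two modes described
// above can be wired. informer.NewManager and the NewDrainNodeHandler signature are taken
// from this diff; log, clientset, ctx and castNamespace are assumed to come from the
// caller's existing setup, and the manager is started the same way the tests in
// check_node_status_test.go start it.
//
//	informerManager := informer.NewManager(log, clientset, 10*time.Minute)
//	go func() { _ = informerManager.Start(ctx) }()
//
//	drainHandler := NewDrainNodeHandler(log, clientset, castNamespace, informerManager) // informer-based watching
//	pollingDrain := NewDrainNodeHandler(log, clientset, castNamespace, nil)             // polling-based fallback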
+func NewDrainNodeHandler(log logrus.FieldLogger, clientset kubernetes.Interface, castNamespace string, informerManager *informer.Manager) ActionHandler { + cfg := newDrainNodeConfig(castNamespace) + + if informerManager != nil { + log.Info("using informer-based drain node handler") + return &drainNodeInformerHandler{ + log: log, + clientset: clientset, + informerManager: informerManager, + cfg: cfg, + } + } + log.Info("using polling-based drain node handler") + return &drainNodePollingHandler{ log: log, clientset: clientset, - cfg: drainNodeConfig{ - podsDeleteTimeout: 2 * time.Minute, - podDeleteRetries: 5, - podDeleteRetryDelay: 5 * time.Second, - podEvictRetryDelay: 5 * time.Second, - podsTerminationWaitRetryDelay: 10 * time.Second, - castNamespace: castNamespace, - skipDeletedTimeoutSeconds: 60, - }, + cfg: cfg, } } // getDrainTimeout returns drain timeout adjusted to action creation time. // the result is clamped between 0s and the requested timeout. -func (h *DrainNodeHandler) getDrainTimeout(action *castai.ClusterAction) time.Duration { +func getDrainTimeout(action *castai.ClusterAction) time.Duration { timeSinceCreated := time.Since(action.CreatedAt) requestedTimeout := time.Duration(action.ActionDrainNode.DrainTimeoutSeconds) * time.Second @@ -67,122 +79,13 @@ func (h *DrainNodeHandler) getDrainTimeout(action *castai.ClusterAction) time.Du return lo.Clamp(drainTimeout, minDrainTimeout*time.Second, requestedTimeout) } -type DrainNodeHandler struct { - log logrus.FieldLogger - clientset kubernetes.Interface - cfg drainNodeConfig -} - -func (h *DrainNodeHandler) Handle(ctx context.Context, action *castai.ClusterAction) error { - if action == nil { - return fmt.Errorf("action is nil %w", errAction) - } - req, ok := action.Data().(*castai.ActionDrainNode) - if !ok { - return newUnexpectedTypeErr(action.Data(), req) - } - drainTimeout := h.getDrainTimeout(action) - - log := h.log.WithFields(logrus.Fields{ - "node_name": req.NodeName, - "node_id": req.NodeID, - "provider_id": req.ProviderId, - "action": reflect.TypeOf(action.Data().(*castai.ActionDrainNode)).String(), - ActionIDLogField: action.ID, - }) - - log.Info("draining kubernetes node") - if req.NodeName == "" || - (req.NodeID == "" && req.ProviderId == "") { - return fmt.Errorf("node name or node ID/provider ID is empty %w", errAction) - } - - node, err := getNodeByIDs(ctx, h.clientset.CoreV1().Nodes(), req.NodeName, req.NodeID, req.ProviderId, log) - if errors.Is(err, errNodeNotFound) || errors.Is(err, errNodeDoesNotMatch) { - log.Info("node not found, skipping draining") - return nil - } - if err != nil { - return err - } - - log.Info("cordoning node for draining") - - if err := h.cordonNode(ctx, node); err != nil { - return fmt.Errorf("cordoning node %q: %w", req.NodeName, err) - } - - log.Infof("draining node, drain_timeout_seconds=%f, force=%v created_at=%s", drainTimeout.Seconds(), req.Force, action.CreatedAt) - - // First try to evict pods gracefully using eviction API. 
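// Worked example for the getDrainTimeout helper above (editorial note, not part of the
// applied change): with ActionDrainNode.DrainTimeoutSeconds = 600 and an action created
// 45 seconds ago, requestedTimeout is 600s and timeSinceCreated is 45s, so the effective
// drain timeout works out to roughly 555s. For an action older than its requested
// timeout the adjusted value would be negative, and lo.Clamp raises it back to the
// minDrainTimeout floor of 0s.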
- evictCtx, evictCancel := context.WithTimeout(ctx, drainTimeout) - defer evictCancel() - - err = h.evictNodePods(evictCtx, log, node) - - if err == nil { - log.Info("node fully drained via graceful eviction") - return nil - } - - if !req.Force { - return fmt.Errorf("node failed to drain via graceful eviction, force=%v, timeout=%f, will not force delete pods: %w", req.Force, drainTimeout.Seconds(), err) - } - - var podsFailedEvictionErr *podFailedActionError - switch { - case errors.Is(err, context.DeadlineExceeded): - log.Infof("timeout=%f exceeded during pod eviction, force=%v, starting pod deletion", drainTimeout.Seconds(), req.Force) - case errors.As(err, &podsFailedEvictionErr): - log.Infof("some pods failed eviction, force=%v, starting pod deletion: %v", req.Force, err) - default: - // Expected to be errors where we can't continue at all; e.g. missing permissions or lack of connectivity. - return fmt.Errorf("evicting node pods: %w", err) - } - - // If voluntary eviction fails, and we are told to force drain, start deleting pods. - // Try deleting pods gracefully first, then delete with 0 grace period. PDBs are not respected here. - options := []metav1.DeleteOptions{ - {}, - *metav1.NewDeleteOptions(0), - } - - var deleteErr error - for _, o := range options { - deleteCtx, deleteCancel := context.WithTimeout(ctx, h.cfg.podsDeleteTimeout) - - deleteErr = h.deleteNodePods(deleteCtx, log, node, o) - - // Clean-up the child context if we got here; no reason to wait for the function to exit. - deleteCancel() - - if deleteErr == nil { - break - } - - var podsFailedDeletionErr *podFailedActionError - if errors.Is(deleteErr, context.DeadlineExceeded) || errors.As(deleteErr, &podsFailedDeletionErr) { - continue - } - return fmt.Errorf("forcefully deleting pods: %w", deleteErr) - } - - // Note: if some pods remained even after forced deletion, we'd get an error from last call here. - if deleteErr == nil { - log.Info("node drained forcefully") - } else { - log.Warnf("node failed to fully force drain: %v", deleteErr) - } - - return deleteErr -} - -func (h *DrainNodeHandler) cordonNode(ctx context.Context, node *v1.Node) error { +// cordonNode marks a node as unschedulable. +func cordonNode(ctx context.Context, log logrus.FieldLogger, clientset kubernetes.Interface, node *v1.Node) error { if node.Spec.Unschedulable { return nil } - err := patchNode(ctx, h.log, h.clientset, node, func(n *v1.Node) { + err := patchNode(ctx, log, clientset, node, func(n *v1.Node) { n.Spec.Unschedulable = true }) if err != nil { @@ -191,151 +94,15 @@ func (h *DrainNodeHandler) cordonNode(ctx context.Context, node *v1.Node) error return nil } -// Return error if at least one pod failed (but don't wait for it!) => to signal if we should do force delete. - -// evictNodePods attempts voluntarily eviction for all pods on node. -// This method will wait until all evictable pods on the node either terminate or fail deletion. -// A timeout should be used to avoid infinite waits. -// Errors in calling EVICT for individual pods are accumulated. If at least one pod failed this but termination was successful, an instance of podFailedActionError is returned. -// The method will still wait for termination of other evicted pods first. -// A return value of nil means all pods on the node should be evicted and terminated. 
-func (h *DrainNodeHandler) evictNodePods(ctx context.Context, log logrus.FieldLogger, node *v1.Node) error { - pods, err := h.listNodePodsToEvict(ctx, log, node) - if err != nil { - return err - } - - if len(pods) == 0 { - log.Infof("no pods to evict") - return nil - } - log.Infof("evicting %d pods", len(pods)) - groupVersion, err := drain.CheckEvictionSupport(h.clientset) - if err != nil { - return err - } - evictPod := func(ctx context.Context, pod v1.Pod) error { - return h.evictPod(ctx, pod, groupVersion) - } - - _, podsWithFailedEviction := executeBatchPodActions(ctx, log, pods, evictPod, "evict-pod") - var podsToIgnoreForTermination []*v1.Pod - var failedPodsError *podFailedActionError - if len(podsWithFailedEviction) > 0 { - podErrors := lo.Map(podsWithFailedEviction, func(failure podActionFailure, _ int) error { - return fmt.Errorf("pod %s/%s failed eviction: %w", failure.pod.Namespace, failure.pod.Name, failure.err) - }) - failedPodsError = &podFailedActionError{ - Action: "evict", - Errors: podErrors, - } - log.Warnf("some pods failed eviction, will ignore for termination wait: %v", failedPodsError) - podsToIgnoreForTermination = lo.Map(podsWithFailedEviction, func(failure podActionFailure, _ int) *v1.Pod { - return failure.pod - }) - } - - err = h.waitNodePodsTerminated(ctx, log, node, podsToIgnoreForTermination) - if err != nil { - return err - } - if failedPodsError != nil { - return failedPodsError - } - return nil -} - -// deleteNodePods deletes the pods running on node. Use options to control if eviction is graceful or forced. -// This method will wait until all evictable pods on the node either terminate or fail deletion. -// A timeout should be used to avoid infinite waits. -// Errors in calling DELETE for individual pods are accumulated. If at least one pod failed this but termination was successful, an instance of podFailedActionError is returned. -// The method will still wait for termination of other deleted pods first. -// A return value of nil means all pods on the node should be deleted and terminated. 
-func (h *DrainNodeHandler) deleteNodePods(ctx context.Context, log logrus.FieldLogger, node *v1.Node, options metav1.DeleteOptions) error { - pods, err := h.listNodePodsToEvict(ctx, log, node) - if err != nil { - return err - } - - if len(pods) == 0 { - log.Infof("no pods to delete") - return nil - } - - if options.GracePeriodSeconds != nil { - log.Infof("forcefully deleting %d pods with gracePeriod %d", len(pods), *options.GracePeriodSeconds) - } else { - log.Infof("forcefully deleting %d pods", len(pods)) - } - - deletePod := func(ctx context.Context, pod v1.Pod) error { - return h.deletePod(ctx, options, pod) - } - - _, podsWithFailedDeletion := executeBatchPodActions(ctx, log, pods, deletePod, "delete-pod") - var podsToIgnoreForTermination []*v1.Pod - var failedPodsError *podFailedActionError - if len(podsWithFailedDeletion) > 0 { - podErrors := lo.Map(podsWithFailedDeletion, func(failure podActionFailure, _ int) error { - return fmt.Errorf("pod %s/%s failed deletion: %w", failure.pod.Namespace, failure.pod.Name, failure.err) - }) - failedPodsError = &podFailedActionError{ - Action: "delete", - Errors: podErrors, - } - log.Warnf("some pods failed deletion, will ignore for termination wait: %v", failedPodsError) - podsToIgnoreForTermination = lo.Map(podsWithFailedDeletion, func(failure podActionFailure, _ int) *v1.Pod { - return failure.pod - }) - } - - err = h.waitNodePodsTerminated(ctx, log, node, podsToIgnoreForTermination) - if err != nil { - return err - } - if failedPodsError != nil { - return failedPodsError - } - return nil -} - -// listNodePodsToEvict creates a list of pods that are "evictable" on the node. -// The following pods are ignored: -// - static pods -// - DaemonSet pods -// - pods that are already finished (Succeeded or Failed) -// - pods that were marked for deletion recently (Terminating state); the meaning of "recently" is controlled by config -func (h *DrainNodeHandler) listNodePodsToEvict(ctx context.Context, log logrus.FieldLogger, node *v1.Node) ([]v1.Pod, error) { - var pods *v1.PodList - err := waitext.Retry( - ctx, - defaultBackoff(), - defaultMaxRetriesK8SOperation, - func(ctx context.Context) (bool, error) { - p, err := h.clientset.CoreV1().Pods(metav1.NamespaceAll).List(ctx, metav1.ListOptions{ - FieldSelector: fields.SelectorFromSet(fields.Set{"spec.nodeName": node.Name}).String(), - }) - if err != nil { - return true, err - } - pods = p - return false, nil - }, - func(err error) { - log.Warnf("listing pods on node %s: %v", node.Name, err) - }, - ) - if err != nil { - return nil, fmt.Errorf("listing node %v pods: %w", node.Name, err) - } - +// filterPodsToEvict filters pods to only include those that should be evicted. +func filterPodsToEvict(pods []v1.Pod, cfg drainNodeConfig) []v1.Pod { podsToEvict := make([]v1.Pod, 0) castPods := make([]v1.Pod, 0) - // Evict CAST PODs as last ones. - for _, p := range pods.Items { + + for _, p := range pods { // Skip pods that have been recently removed. 
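// Clarifying note (editorial, not part of the applied change): the check below excludes
// a pod from the pods-to-evict list when it already carries a DeletionTimestamp and more
// than cfg.skipDeletedTimeoutSeconds have elapsed since it was set (60s per
// newDrainNodeConfig above); pods that have been terminating that long are left out
// rather than acted on again.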
if !p.ObjectMeta.DeletionTimestamp.IsZero() && - int(time.Since(p.ObjectMeta.GetDeletionTimestamp().Time).Seconds()) > h.cfg.skipDeletedTimeoutSeconds { + int(time.Since(p.ObjectMeta.GetDeletionTimestamp().Time).Seconds()) > cfg.skipDeletedTimeoutSeconds { continue } @@ -344,7 +111,7 @@ func (h *DrainNodeHandler) listNodePodsToEvict(ctx context.Context, log logrus.F continue } - if p.Namespace == h.cfg.castNamespace && !isDaemonSetPod(&p) && !isStaticPod(&p) { + if p.Namespace == cfg.castNamespace && !isDaemonSetPod(&p) && !isStaticPod(&p) { castPods = append(castPods, p) continue } @@ -354,145 +121,8 @@ func (h *DrainNodeHandler) listNodePodsToEvict(ctx context.Context, log logrus.F } } - logCastPodsToEvict(log, castPods) podsToEvict = append(podsToEvict, castPods...) - return podsToEvict, nil -} - -// waitNodePodsTerminated waits until the pods on the node terminate. -// The wait only considers evictable pods (see listNodePodsToEvict). -// If podsToIgnore is not empty, the list is further filtered by it. -// This is useful when you don't expect some pods on the node to terminate (e.g. because eviction failed for them) so there is no reason to wait until timeout. -// The wait can potentially run forever if pods are scheduled on the node and are not evicted/deleted by anything. Use a timeout to avoid infinite wait. -func (h *DrainNodeHandler) waitNodePodsTerminated(ctx context.Context, log logrus.FieldLogger, node *v1.Node, podsToIgnore []*v1.Pod) error { - // Check if context is cancelled before starting any work. - select { - case <-ctx.Done(): - return ctx.Err() - default: - // Continue with the work. - } - - podsToIgnoreLookup := make(map[string]struct{}) - for _, pod := range podsToIgnore { - podsToIgnoreLookup[fmt.Sprintf("%s/%s", pod.Namespace, pod.Name)] = struct{}{} - } - - log.Infof("starting wait for pod termination, %d pods in ignore list", len(podsToIgnore)) - return waitext.Retry( - ctx, - waitext.NewConstantBackoff(h.cfg.podsTerminationWaitRetryDelay), - waitext.Forever, - func(ctx context.Context) (bool, error) { - pods, err := h.listNodePodsToEvict(ctx, log, node) - if err != nil { - return true, fmt.Errorf("listing %q pods to be terminated: %w", node.Name, err) - } - - podsNames := lo.Map(pods, func(p v1.Pod, _ int) string { - return fmt.Sprintf("%s/%s", p.Namespace, p.Name) - }) - - remainingPodsList := podsNames - if len(podsToIgnore) > 0 { - remainingPodsList = lo.Filter(remainingPodsList, func(podName string, _ int) bool { - _, ok := podsToIgnoreLookup[podName] - return !ok - }) - } - if remainingPods := len(remainingPodsList); remainingPods > 0 { - return true, fmt.Errorf("waiting for %d pods (%v) to be terminated on node %v", remainingPods, remainingPodsList, node.Name) - } - return false, nil - }, - func(err error) { - h.log.Warnf("waiting for pod termination on node %v, will retry: %v", node.Name, err) - }, - ) -} - -// evictPod from the k8s node. 
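// Illustrative sketch (example data only) of the ordering guarantee of
// filterPodsToEvict above: pods in the configured CAST namespace are appended
// last, so the controller's own components are evicted after the regular
// workloads. The namespace and timeout values are assumptions for the example.
func exampleFilterPodsToEvictOrdering() {
	cfg := drainNodeConfig{castNamespace: "castai-agent", skipDeletedTimeoutSeconds: 60}
	pods := []v1.Pod{
		{ObjectMeta: metav1.ObjectMeta{Namespace: "castai-agent", Name: "cluster-controller"}},
		{ObjectMeta: metav1.ObjectMeta{Namespace: "default", Name: "web-1"}},
	}

	ordered := filterPodsToEvict(pods, cfg)
	// ordered is expected to be [default/web-1, castai-agent/cluster-controller].
	_ = ordered
}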
Error handling is based on eviction api documentation: -// https://kubernetes.io/docs/tasks/administer-cluster/safely-drain-node/#the-eviction-api -func (h *DrainNodeHandler) evictPod(ctx context.Context, pod v1.Pod, groupVersion schema.GroupVersion) error { - b := waitext.NewConstantBackoff(h.cfg.podEvictRetryDelay) - action := func(ctx context.Context) (bool, error) { - var err error - - h.log.Debugf("requesting eviction for pod %s/%s", pod.Namespace, pod.Name) - if groupVersion == policyv1.SchemeGroupVersion { - err = h.clientset.PolicyV1().Evictions(pod.Namespace).Evict(ctx, &policyv1.Eviction{ - ObjectMeta: metav1.ObjectMeta{ - Name: pod.Name, - Namespace: pod.Namespace, - }, - }) - } else { - err = h.clientset.CoreV1().Pods(pod.Namespace).EvictV1beta1(ctx, &v1beta1.Eviction{ - TypeMeta: metav1.TypeMeta{ - APIVersion: "policy/v1beta1", - Kind: "Eviction", - }, - ObjectMeta: metav1.ObjectMeta{ - Name: pod.Name, - Namespace: pod.Namespace, - }, - }) - } - - if err != nil { - // Pod is not found - ignore. - if apierrors.IsNotFound(err) { - return false, nil - } - - // Pod is misconfigured - stop retry. - if apierrors.IsInternalError(err) { - return false, err - } - } - - // Other errors - retry. - // This includes 429 TooManyRequests (due to throttling) and 429 TooManyRequests + DisruptionBudgetCause (due to violated PDBs) - // This is done to try and do graceful eviction for as long as possible; - // it is expected that caller has a timeout that will stop this process if the PDB can never be satisfied. - // Note: pods only receive SIGTERM signals if they are evicted; if PDB prevents that, the signal will not happen here. - return true, err - } - err := waitext.Retry(ctx, b, waitext.Forever, action, func(err error) { - h.log.Warnf("evict pod %s on node %s in namespace %s, will retry: %v", pod.Name, pod.Spec.NodeName, pod.Namespace, err) - }) - if err != nil { - return fmt.Errorf("evicting pod %s in namespace %s: %w", pod.Name, pod.Namespace, err) - } - return nil -} - -func (h *DrainNodeHandler) deletePod(ctx context.Context, options metav1.DeleteOptions, pod v1.Pod) error { - b := waitext.NewConstantBackoff(h.cfg.podDeleteRetryDelay) - action := func(ctx context.Context) (bool, error) { - err := h.clientset.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, options) - if err != nil { - // Pod is not found - ignore. - if apierrors.IsNotFound(err) { - return false, nil - } - - // Pod is misconfigured - stop retry. - if apierrors.IsInternalError(err) { - return false, err - } - } - - // Other errors - retry. - return true, err - } - err := waitext.Retry(ctx, b, h.cfg.podDeleteRetries, action, func(err error) { - h.log.Warnf("deleting pod %s on node %s in namespace %s, will retry: %v", pod.Name, pod.Spec.NodeName, pod.Namespace, err) - }) - if err != nil { - return fmt.Errorf("deleting pod %s in namespace %s: %w", pod.Name, pod.Namespace, err) - } - return nil + return podsToEvict } func logCastPodsToEvict(log logrus.FieldLogger, castPods []v1.Pod) { @@ -523,10 +153,22 @@ func isControlledBy(p *v1.Pod, kind string) bool { return ctrl != nil && ctrl.Kind == kind } +// shouldIgnorePod checks if a pod should be ignored for termination wait. 
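// A hypothetical test-style sketch of the cases the helper below is expected to
// classify; the pod fixtures and the 60-second skip timeout are example values,
// and the sketch assumes it sits next to the package's existing tests (testing,
// require, time, v1 and metav1 imports).
func TestShouldIgnorePodSketch(t *testing.T) {
	deletedTwoMinutesAgo := metav1.NewTime(time.Now().Add(-2 * time.Minute))

	terminating := &v1.Pod{ObjectMeta: metav1.ObjectMeta{DeletionTimestamp: &deletedTwoMinutesAgo}}
	finished := &v1.Pod{Status: v1.PodStatus{Phase: v1.PodSucceeded}}
	running := &v1.Pod{Status: v1.PodStatus{Phase: v1.PodRunning}}

	require.True(t, shouldIgnorePod(terminating, 60)) // deleted longer ago than the skip timeout
	require.True(t, shouldIgnorePod(finished, 60))    // Succeeded/Failed pods are never waited on
	require.False(t, shouldIgnorePod(running, 60))    // a running pod must terminate before the wait ends
}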
+func shouldIgnorePod(pod *v1.Pod, skipDeletedTimeoutSeconds int) bool { + if !pod.ObjectMeta.DeletionTimestamp.IsZero() && + int(time.Since(pod.ObjectMeta.GetDeletionTimestamp().Time).Seconds()) > skipDeletedTimeoutSeconds { + return true + } + + if pod.Status.Phase == v1.PodSucceeded || pod.Status.Phase == v1.PodFailed { + return true + } + + return false +} + type podFailedActionError struct { - // Action holds context what was the code trying to do. Action string - // Errors should hold an entry per pod, for which the action failed. Errors []error } @@ -537,3 +179,27 @@ func (p *podFailedActionError) Error() string { func (p *podFailedActionError) Unwrap() []error { return p.Errors } + +// deletePodWithRetry deletes a pod with retries. Shared function for drain and delete node handlers. +func deletePodWithRetry(ctx context.Context, log logrus.FieldLogger, clientset kubernetes.Interface, options metav1.DeleteOptions, pod v1.Pod, cfg drainNodeConfig) error { + b := waitext.NewConstantBackoff(cfg.podDeleteRetryDelay) + action := func(ctx context.Context) (bool, error) { + err := clientset.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, options) + if err != nil { + if apierrors.IsNotFound(err) { + return false, nil + } + if apierrors.IsInternalError(err) { + return false, err + } + } + return true, err + } + err := waitext.Retry(ctx, b, cfg.podDeleteRetries, action, func(err error) { + log.Warnf("deleting pod %s on node %s in namespace %s, will retry: %v", pod.Name, pod.Spec.NodeName, pod.Namespace, err) + }) + if err != nil { + return fmt.Errorf("deleting pod %s in namespace %s: %w", pod.Name, pod.Namespace, err) + } + return nil +} diff --git a/internal/actions/drain_node_handler_informer.go b/internal/actions/drain_node_handler_informer.go new file mode 100644 index 00000000..caa988d6 --- /dev/null +++ b/internal/actions/drain_node_handler_informer.go @@ -0,0 +1,544 @@ +package actions + +import ( + "context" + "errors" + "fmt" + "reflect" + "sync" + "time" + + "github.com/samber/lo" + "github.com/sirupsen/logrus" + v1 "k8s.io/api/core/v1" + policyv1 "k8s.io/api/policy/v1" + "k8s.io/api/policy/v1beta1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/cache" + "k8s.io/kubectl/pkg/drain" + + "github.com/castai/cluster-controller/internal/castai" + "github.com/castai/cluster-controller/internal/informer" + "github.com/castai/cluster-controller/internal/waitext" +) + +var _ ActionHandler = &drainNodeInformerHandler{} + +type drainNodeInformerHandler struct { + log logrus.FieldLogger + clientset kubernetes.Interface + informerManager *informer.Manager + cfg drainNodeConfig +} + +func (h *drainNodeInformerHandler) Handle(ctx context.Context, action *castai.ClusterAction) error { + if action == nil { + return fmt.Errorf("action is nil %w", errAction) + } + req, ok := action.Data().(*castai.ActionDrainNode) + if !ok { + return newUnexpectedTypeErr(action.Data(), req) + } + drainTimeout := getDrainTimeout(action) + + log := h.log.WithFields(logrus.Fields{ + "node_name": req.NodeName, + "node_id": req.NodeID, + "provider_id": req.ProviderId, + "action": reflect.TypeOf(action.Data().(*castai.ActionDrainNode)).String(), + ActionIDLogField: action.ID, + }) + + log.Info("draining kubernetes node") + if req.NodeName == "" || + (req.NodeID == "" && req.ProviderId == "") { + return fmt.Errorf("node name or node 
ID/provider ID is empty %w", errAction) + } + + node, err := getNodeByIDs(ctx, h.clientset.CoreV1().Nodes(), req.NodeName, req.NodeID, req.ProviderId, log) + if errors.Is(err, errNodeNotFound) || errors.Is(err, errNodeDoesNotMatch) { + log.Info("node not found, skipping draining") + return nil + } + if err != nil { + return err + } + + log.Info("cordoning node for draining") + + if err := cordonNode(ctx, h.log, h.clientset, node); err != nil { + return fmt.Errorf("cordoning node %q: %w", req.NodeName, err) + } + + log.Infof("draining node, drain_timeout_seconds=%f, force=%v created_at=%s", drainTimeout.Seconds(), req.Force, action.CreatedAt) + + // Skip graceful eviction if drain timeout is 0 - go straight to force deletion if allowed. + if drainTimeout <= 0 { + log.Info("drain timeout is 0, skipping graceful eviction") + if !req.Force { + return fmt.Errorf("drain timeout is 0 and force=%v, cannot drain node without force: %w", req.Force, errAction) + } + } else { + // Try to evict pods gracefully using eviction API. + evictCtx, evictCancel := context.WithTimeout(ctx, drainTimeout) + defer evictCancel() + + err = h.evictNodePods(evictCtx, log, node) + + if err == nil { + log.Info("node fully drained via graceful eviction") + return nil + } + + if !req.Force { + return fmt.Errorf("node failed to drain via graceful eviction, force=%v, timeout=%f, will not force delete pods: %w", req.Force, drainTimeout.Seconds(), err) + } + + var podsFailedEvictionErr *podFailedActionError + switch { + case errors.Is(err, context.DeadlineExceeded): + log.Infof("timeout=%f exceeded during pod eviction, force=%v, starting pod deletion", drainTimeout.Seconds(), req.Force) + case errors.As(err, &podsFailedEvictionErr): + log.Infof("some pods failed eviction, force=%v, starting pod deletion: %v", req.Force, err) + default: + // Expected to be errors where we can't continue at all; e.g. missing permissions or lack of connectivity. 
+ return fmt.Errorf("evicting node pods: %w", err) + } + } + + options := []metav1.DeleteOptions{ + {}, + *metav1.NewDeleteOptions(0), + } + + var deleteErr error + for _, o := range options { + deleteCtx, deleteCancel := context.WithTimeout(ctx, h.cfg.podsDeleteTimeout) + + deleteErr = h.deleteNodePods(deleteCtx, log, node, o) + + deleteCancel() + + if deleteErr == nil { + break + } + + var podsFailedDeletionErr *podFailedActionError + if errors.Is(deleteErr, context.DeadlineExceeded) || errors.As(deleteErr, &podsFailedDeletionErr) { + continue + } + return fmt.Errorf("forcefully deleting pods: %w", deleteErr) + } + + if deleteErr == nil { + log.Info("node drained forcefully") + } else { + log.Warnf("node failed to fully force drain: %v", deleteErr) + } + + return deleteErr +} + +func (h *drainNodeInformerHandler) evictNodePods(ctx context.Context, log logrus.FieldLogger, node *v1.Node) error { + pods, err := h.listNodePodsToEvict(ctx, log, node) + if err != nil { + return err + } + + if len(pods) == 0 { + log.Infof("no pods to evict") + return nil + } + log.Infof("evicting %d pods", len(pods)) + groupVersion, err := drain.CheckEvictionSupport(h.clientset) + if err != nil { + return err + } + evictPod := func(ctx context.Context, pod v1.Pod) error { + return h.evictPod(ctx, pod, groupVersion) + } + + _, podsWithFailedEviction := executeBatchPodActions(ctx, log, pods, evictPod, "evict-pod") + var podsToIgnoreForTermination []*v1.Pod + var failedPodsError *podFailedActionError + if len(podsWithFailedEviction) > 0 { + podErrors := lo.Map(podsWithFailedEviction, func(failure podActionFailure, _ int) error { + return fmt.Errorf("pod %s/%s failed eviction: %w", failure.pod.Namespace, failure.pod.Name, failure.err) + }) + failedPodsError = &podFailedActionError{ + Action: "evict", + Errors: podErrors, + } + log.Warnf("some pods failed eviction, will ignore for termination wait: %v", failedPodsError) + podsToIgnoreForTermination = lo.Map(podsWithFailedEviction, func(failure podActionFailure, _ int) *v1.Pod { + return failure.pod + }) + } + + err = h.waitNodePodsTerminated(ctx, log, node, podsToIgnoreForTermination) + if err != nil { + return err + } + if failedPodsError != nil { + return failedPodsError + } + return nil +} + +func (h *drainNodeInformerHandler) deleteNodePods(ctx context.Context, log logrus.FieldLogger, node *v1.Node, options metav1.DeleteOptions) error { + pods, err := h.listNodePodsToEvict(ctx, log, node) + if err != nil { + return err + } + + if len(pods) == 0 { + log.Infof("no pods to delete") + return nil + } + + if options.GracePeriodSeconds != nil { + log.Infof("forcefully deleting %d pods with gracePeriod %d", len(pods), *options.GracePeriodSeconds) + } else { + log.Infof("forcefully deleting %d pods", len(pods)) + } + + deletePod := func(ctx context.Context, pod v1.Pod) error { + return h.deletePod(ctx, options, pod) + } + + _, podsWithFailedDeletion := executeBatchPodActions(ctx, log, pods, deletePod, "delete-pod") + var podsToIgnoreForTermination []*v1.Pod + var failedPodsError *podFailedActionError + if len(podsWithFailedDeletion) > 0 { + podErrors := lo.Map(podsWithFailedDeletion, func(failure podActionFailure, _ int) error { + return fmt.Errorf("pod %s/%s failed deletion: %w", failure.pod.Namespace, failure.pod.Name, failure.err) + }) + failedPodsError = &podFailedActionError{ + Action: "delete", + Errors: podErrors, + } + log.Warnf("some pods failed deletion, will ignore for termination wait: %v", failedPodsError) + podsToIgnoreForTermination = 
lo.Map(podsWithFailedDeletion, func(failure podActionFailure, _ int) *v1.Pod { + return failure.pod + }) + } + + err = h.waitNodePodsTerminated(ctx, log, node, podsToIgnoreForTermination) + if err != nil { + return err + } + if failedPodsError != nil { + return failedPodsError + } + return nil +} + +func (h *drainNodeInformerHandler) listNodePodsToEvict(ctx context.Context, log logrus.FieldLogger, node *v1.Node) ([]v1.Pod, error) { + lister := h.informerManager.GetPodLister() + + pods, err := lister.List(labels.Everything()) + if err != nil { + return nil, fmt.Errorf("listing pods from cache: %w", err) + } + + podsOnNode := lo.Filter(pods, func(p *v1.Pod, _ int) bool { + return p.Spec.NodeName == node.Name + }) + + // Convert []*v1.Pod to []v1.Pod + podList := make([]v1.Pod, len(podsOnNode)) + for i, p := range podsOnNode { + podList[i] = *p + } + + return filterPodsToEvict(podList, h.cfg), nil +} + +func (h *drainNodeInformerHandler) waitNodePodsTerminated(ctx context.Context, log logrus.FieldLogger, node *v1.Node, podsToIgnore []*v1.Pod) error { + select { + case <-ctx.Done(): + return ctx.Err() + default: + } + + podsToIgnoreLookup := make(map[string]struct{}) + for _, pod := range podsToIgnore { + podsToIgnoreLookup[fmt.Sprintf("%s/%s", pod.Namespace, pod.Name)] = struct{}{} + } + + pods, err := h.listNodePodsToEvict(ctx, log, node) + if err != nil { + return fmt.Errorf("listing %q pods to be terminated: %w", node.Name, err) + } + + tracker := newRemainingPodsTracker(pods, podsToIgnoreLookup) + + if tracker.isEmpty() { + log.Info("no pods to wait for termination") + return nil + } + + log.Infof("waiting for %d pods to terminate (using informer watch), %d pods in ignore list", tracker.count(), len(podsToIgnore)) + + done := make(chan struct{}) + podInformer := h.informerManager.GetPodInformer() + + registration, err := podInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{ + UpdateFunc: func(oldObj, newObj any) { + pod, ok := newObj.(*v1.Pod) + if !ok || pod.Spec.NodeName != node.Name { + return + } + + podKey := fmt.Sprintf("%s/%s", pod.Namespace, pod.Name) + + if !tracker.contains(podKey) { + return + } + + if shouldIgnorePod(pod, h.cfg.skipDeletedTimeoutSeconds) { + remaining := tracker.remove(podKey) + log.Infof("pod %s terminating/completed, %d remaining", podKey, remaining) + + if remaining == 0 { + select { + case done <- struct{}{}: + default: + } + } + } + }, + DeleteFunc: func(obj any) { + pod, ok := obj.(*v1.Pod) + if !ok { + tombstone, ok := obj.(cache.DeletedFinalStateUnknown) + if !ok { + return + } + pod, ok = tombstone.Obj.(*v1.Pod) + if !ok { + return + } + } + + if pod.Spec.NodeName != node.Name { + return + } + + podKey := fmt.Sprintf("%s/%s", pod.Namespace, pod.Name) + remaining := tracker.remove(podKey) + + if remaining == 0 { + log.Info("all pods terminated") + select { + case done <- struct{}{}: + default: + } + } else { + log.Infof("pod %s deleted, %d remaining", podKey, remaining) + } + }, + }) + if err != nil { + return fmt.Errorf("adding event handler: %w", err) + } + defer func() { + if err := podInformer.RemoveEventHandler(registration); err != nil { + log.WithError(err).Warn("failed to remove event handler") + } + }() + + recheckInterval := h.cfg.podsTerminationWaitRetryDelay + if recheckInterval <= 0 { + recheckInterval = 10 * time.Second + } + ticker := time.NewTicker(recheckInterval) + defer ticker.Stop() + + recheckPods := func() (bool, error) { + pods, err := h.listNodePodsToEvict(ctx, log, node) + if err != nil { + return false, err + } + + 
changed, count := tracker.update(pods, podsToIgnoreLookup) + if changed { + log.Infof("re-check: %d pods remaining", count) + return count == 0, nil + } + return false, nil + } + + if allDone, err := recheckPods(); err == nil && allDone { + return nil + } + + for { + select { + case <-done: + return nil + case <-ticker.C: + if allDone, err := recheckPods(); err != nil { + log.Warnf("failed to re-check pods during termination wait: %v", err) + } else if allDone { + return nil + } + case <-ctx.Done(): + remainingPods := tracker.list() + count := tracker.count() + return fmt.Errorf("timeout waiting for %d pods (%v) to terminate: %w", + count, remainingPods, ctx.Err()) + } + } +} + +func (h *drainNodeInformerHandler) evictPod(ctx context.Context, pod v1.Pod, groupVersion schema.GroupVersion) error { + b := waitext.NewConstantBackoff(h.cfg.podEvictRetryDelay) + action := func(ctx context.Context) (bool, error) { + var err error + + h.log.Debugf("requesting eviction for pod %s/%s", pod.Namespace, pod.Name) + if groupVersion == policyv1.SchemeGroupVersion { + err = h.clientset.PolicyV1().Evictions(pod.Namespace).Evict(ctx, &policyv1.Eviction{ + ObjectMeta: metav1.ObjectMeta{ + Name: pod.Name, + Namespace: pod.Namespace, + }, + }) + } else { + err = h.clientset.CoreV1().Pods(pod.Namespace).EvictV1beta1(ctx, &v1beta1.Eviction{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "policy/v1beta1", + Kind: "Eviction", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: pod.Name, + Namespace: pod.Namespace, + }, + }) + } + + if err != nil { + if apierrors.IsNotFound(err) { + return false, nil + } + if apierrors.IsInternalError(err) { + return false, err + } + } + + return true, err + } + err := waitext.Retry(ctx, b, waitext.Forever, action, func(err error) { + h.log.Warnf("evict pod %s on node %s in namespace %s, will retry: %v", pod.Name, pod.Spec.NodeName, pod.Namespace, err) + }) + if err != nil { + return fmt.Errorf("evicting pod %s in namespace %s: %w", pod.Name, pod.Namespace, err) + } + return nil +} + +func (h *drainNodeInformerHandler) deletePod(ctx context.Context, options metav1.DeleteOptions, pod v1.Pod) error { + b := waitext.NewConstantBackoff(h.cfg.podDeleteRetryDelay) + action := func(ctx context.Context) (bool, error) { + err := h.clientset.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, options) + if err != nil { + if apierrors.IsNotFound(err) { + return false, nil + } + if apierrors.IsInternalError(err) { + return false, err + } + } + return true, err + } + err := waitext.Retry(ctx, b, h.cfg.podDeleteRetries, action, func(err error) { + h.log.Warnf("deleting pod %s on node %s in namespace %s, will retry: %v", pod.Name, pod.Spec.NodeName, pod.Namespace, err) + }) + if err != nil { + return fmt.Errorf("deleting pod %s in namespace %s: %w", pod.Name, pod.Namespace, err) + } + return nil +} + +// remainingPodsTracker is a thread-safe tracker for remaining pods during drain operation. 
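// Minimal usage sketch for the tracker defined below (pod names are invented):
// seed it with the evictable pods minus the ignore list, drop entries as
// delete/terminate events arrive from the informer, and treat a zero count as
// the signal that the wait is over.
func exampleRemainingPodsTracker() {
	pods := []v1.Pod{
		{ObjectMeta: metav1.ObjectMeta{Namespace: "default", Name: "web-1"}},
		{ObjectMeta: metav1.ObjectMeta{Namespace: "default", Name: "web-2"}},
	}
	ignore := map[string]struct{}{"default/web-2": {}} // e.g. eviction already failed for this pod

	tracker := newRemainingPodsTracker(pods, ignore)
	if tracker.contains("default/web-1") {
		remaining := tracker.remove("default/web-1") // informer reported the pod as deleted
		if remaining == 0 && tracker.isEmpty() {
			// all tracked pods are gone - the node is drained
		}
	}
}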
+type remainingPodsTracker struct { + mu sync.Mutex + pods map[string]bool +} + +func newRemainingPodsTracker(pods []v1.Pod, podsToIgnore map[string]struct{}) *remainingPodsTracker { + tracker := &remainingPodsTracker{ + pods: make(map[string]bool), + } + for _, p := range pods { + podKey := fmt.Sprintf("%s/%s", p.Namespace, p.Name) + if _, ignored := podsToIgnore[podKey]; !ignored { + tracker.pods[podKey] = true + } + } + return tracker +} + +func (t *remainingPodsTracker) remove(podKey string) int { + t.mu.Lock() + defer t.mu.Unlock() + + delete(t.pods, podKey) + return len(t.pods) +} + +func (t *remainingPodsTracker) contains(podKey string) bool { + t.mu.Lock() + defer t.mu.Unlock() + + return t.pods[podKey] +} + +func (t *remainingPodsTracker) count() int { + t.mu.Lock() + defer t.mu.Unlock() + + return len(t.pods) +} + +func (t *remainingPodsTracker) isEmpty() bool { + t.mu.Lock() + defer t.mu.Unlock() + + return len(t.pods) == 0 +} + +func (t *remainingPodsTracker) list() []string { + t.mu.Lock() + defer t.mu.Unlock() + + pods := make([]string, 0, len(t.pods)) + for podKey := range t.pods { + pods = append(pods, podKey) + } + return pods +} + +func (t *remainingPodsTracker) update(newPods []v1.Pod, podsToIgnore map[string]struct{}) (changed bool, count int) { + t.mu.Lock() + defer t.mu.Unlock() + + newPodsMap := make(map[string]bool) + for _, p := range newPods { + podKey := fmt.Sprintf("%s/%s", p.Namespace, p.Name) + if _, ignored := podsToIgnore[podKey]; !ignored && t.pods[podKey] { + newPodsMap[podKey] = true + } + } + + changed = len(newPodsMap) != len(t.pods) + if changed { + t.pods = newPodsMap + } + return changed, len(t.pods) +} diff --git a/internal/actions/drain_node_handler_polling.go b/internal/actions/drain_node_handler_polling.go new file mode 100644 index 00000000..a00588a7 --- /dev/null +++ b/internal/actions/drain_node_handler_polling.go @@ -0,0 +1,373 @@ +package actions + +import ( + "context" + "errors" + "fmt" + "reflect" + + "github.com/samber/lo" + "github.com/sirupsen/logrus" + v1 "k8s.io/api/core/v1" + policyv1 "k8s.io/api/policy/v1" + "k8s.io/api/policy/v1beta1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/fields" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/kubernetes" + "k8s.io/kubectl/pkg/drain" + + "github.com/castai/cluster-controller/internal/castai" + "github.com/castai/cluster-controller/internal/waitext" +) + +var _ ActionHandler = &drainNodePollingHandler{} + +type drainNodePollingHandler struct { + log logrus.FieldLogger + clientset kubernetes.Interface + cfg drainNodeConfig +} + +func (h *drainNodePollingHandler) Handle(ctx context.Context, action *castai.ClusterAction) error { + if action == nil { + return fmt.Errorf("action is nil %w", errAction) + } + req, ok := action.Data().(*castai.ActionDrainNode) + if !ok { + return newUnexpectedTypeErr(action.Data(), req) + } + drainTimeout := getDrainTimeout(action) + + log := h.log.WithFields(logrus.Fields{ + "node_name": req.NodeName, + "node_id": req.NodeID, + "provider_id": req.ProviderId, + "action": reflect.TypeOf(action.Data().(*castai.ActionDrainNode)).String(), + ActionIDLogField: action.ID, + }) + + log.Info("draining kubernetes node") + if req.NodeName == "" || + (req.NodeID == "" && req.ProviderId == "") { + return fmt.Errorf("node name or node ID/provider ID is empty %w", errAction) + } + + node, err := getNodeByIDs(ctx, h.clientset.CoreV1().Nodes(), req.NodeName, req.NodeID, req.ProviderId, 
log) + if errors.Is(err, errNodeNotFound) || errors.Is(err, errNodeDoesNotMatch) { + log.Info("node not found, skipping draining") + return nil + } + if err != nil { + return err + } + + log.Info("cordoning node for draining") + + if err := cordonNode(ctx, h.log, h.clientset, node); err != nil { + return fmt.Errorf("cordoning node %q: %w", req.NodeName, err) + } + + log.Infof("draining node, drain_timeout_seconds=%f, force=%v created_at=%s", drainTimeout.Seconds(), req.Force, action.CreatedAt) + + // Skip graceful eviction if drain timeout is 0 - go straight to force deletion if allowed. + if drainTimeout <= 0 { + log.Info("drain timeout is 0, skipping graceful eviction") + if !req.Force { + return fmt.Errorf("drain timeout is 0 and force=%v, cannot drain node without force: %w", req.Force, errAction) + } + } else { + // Try to evict pods gracefully using eviction API. + evictCtx, evictCancel := context.WithTimeout(ctx, drainTimeout) + defer evictCancel() + + err = h.evictNodePods(evictCtx, log, node) + + if err == nil { + log.Info("node fully drained via graceful eviction") + return nil + } + + if !req.Force { + return fmt.Errorf("node failed to drain via graceful eviction, force=%v, timeout=%f, will not force delete pods: %w", req.Force, drainTimeout.Seconds(), err) + } + + var podsFailedEvictionErr *podFailedActionError + switch { + case errors.Is(err, context.DeadlineExceeded): + log.Infof("timeout=%f exceeded during pod eviction, force=%v, starting pod deletion", drainTimeout.Seconds(), req.Force) + case errors.As(err, &podsFailedEvictionErr): + log.Infof("some pods failed eviction, force=%v, starting pod deletion: %v", req.Force, err) + default: + return fmt.Errorf("evicting node pods: %w", err) + } + } + + options := []metav1.DeleteOptions{ + {}, + *metav1.NewDeleteOptions(0), + } + + var deleteErr error + for _, o := range options { + deleteCtx, deleteCancel := context.WithTimeout(ctx, h.cfg.podsDeleteTimeout) + + deleteErr = h.deleteNodePods(deleteCtx, log, node, o) + + deleteCancel() + + if deleteErr == nil { + break + } + + var podsFailedDeletionErr *podFailedActionError + if errors.Is(deleteErr, context.DeadlineExceeded) || errors.As(deleteErr, &podsFailedDeletionErr) { + continue + } + return fmt.Errorf("forcefully deleting pods: %w", deleteErr) + } + + if deleteErr == nil { + log.Info("node drained forcefully") + } else { + log.Warnf("node failed to fully force drain: %v", deleteErr) + } + + return deleteErr +} + +func (h *drainNodePollingHandler) evictNodePods(ctx context.Context, log logrus.FieldLogger, node *v1.Node) error { + pods, err := h.listNodePodsToEvict(ctx, log, node) + if err != nil { + return err + } + + if len(pods) == 0 { + log.Infof("no pods to evict") + return nil + } + log.Infof("evicting %d pods", len(pods)) + groupVersion, err := drain.CheckEvictionSupport(h.clientset) + if err != nil { + return err + } + evictPod := func(ctx context.Context, pod v1.Pod) error { + return h.evictPod(ctx, pod, groupVersion) + } + + _, podsWithFailedEviction := executeBatchPodActions(ctx, log, pods, evictPod, "evict-pod") + var podsToIgnoreForTermination []*v1.Pod + var failedPodsError *podFailedActionError + if len(podsWithFailedEviction) > 0 { + podErrors := lo.Map(podsWithFailedEviction, func(failure podActionFailure, _ int) error { + return fmt.Errorf("pod %s/%s failed eviction: %w", failure.pod.Namespace, failure.pod.Name, failure.err) + }) + failedPodsError = &podFailedActionError{ + Action: "evict", + Errors: podErrors, + } + log.Warnf("some pods failed eviction, will 
ignore for termination wait: %v", failedPodsError) + podsToIgnoreForTermination = lo.Map(podsWithFailedEviction, func(failure podActionFailure, _ int) *v1.Pod { + return failure.pod + }) + } + + err = h.waitNodePodsTerminated(ctx, log, node, podsToIgnoreForTermination) + if err != nil { + return err + } + if failedPodsError != nil { + return failedPodsError + } + return nil +} + +func (h *drainNodePollingHandler) deleteNodePods(ctx context.Context, log logrus.FieldLogger, node *v1.Node, options metav1.DeleteOptions) error { + pods, err := h.listNodePodsToEvict(ctx, log, node) + if err != nil { + return err + } + + if len(pods) == 0 { + log.Infof("no pods to delete") + return nil + } + + if options.GracePeriodSeconds != nil { + log.Infof("forcefully deleting %d pods with gracePeriod %d", len(pods), *options.GracePeriodSeconds) + } else { + log.Infof("forcefully deleting %d pods", len(pods)) + } + + deletePod := func(ctx context.Context, pod v1.Pod) error { + return h.deletePod(ctx, options, pod) + } + + _, podsWithFailedDeletion := executeBatchPodActions(ctx, log, pods, deletePod, "delete-pod") + var podsToIgnoreForTermination []*v1.Pod + var failedPodsError *podFailedActionError + if len(podsWithFailedDeletion) > 0 { + podErrors := lo.Map(podsWithFailedDeletion, func(failure podActionFailure, _ int) error { + return fmt.Errorf("pod %s/%s failed deletion: %w", failure.pod.Namespace, failure.pod.Name, failure.err) + }) + failedPodsError = &podFailedActionError{ + Action: "delete", + Errors: podErrors, + } + log.Warnf("some pods failed deletion, will ignore for termination wait: %v", failedPodsError) + podsToIgnoreForTermination = lo.Map(podsWithFailedDeletion, func(failure podActionFailure, _ int) *v1.Pod { + return failure.pod + }) + } + + err = h.waitNodePodsTerminated(ctx, log, node, podsToIgnoreForTermination) + if err != nil { + return err + } + if failedPodsError != nil { + return failedPodsError + } + return nil +} + +func (h *drainNodePollingHandler) listNodePodsToEvict(ctx context.Context, log logrus.FieldLogger, node *v1.Node) ([]v1.Pod, error) { + var pods *v1.PodList + err := waitext.Retry( + ctx, + defaultBackoff(), + defaultMaxRetriesK8SOperation, + func(ctx context.Context) (bool, error) { + p, err := h.clientset.CoreV1().Pods(metav1.NamespaceAll).List(ctx, metav1.ListOptions{ + FieldSelector: fields.SelectorFromSet(fields.Set{"spec.nodeName": node.Name}).String(), + }) + if err != nil { + return true, err + } + pods = p + return false, nil + }, + func(err error) { + log.Warnf("listing pods on node %s: %v", node.Name, err) + }, + ) + if err != nil { + return nil, fmt.Errorf("listing node %v pods: %w", node.Name, err) + } + + return filterPodsToEvict(pods.Items, h.cfg), nil +} + +func (h *drainNodePollingHandler) waitNodePodsTerminated(ctx context.Context, log logrus.FieldLogger, node *v1.Node, podsToIgnore []*v1.Pod) error { + select { + case <-ctx.Done(): + return ctx.Err() + default: + } + + podsToIgnoreLookup := make(map[string]struct{}) + for _, pod := range podsToIgnore { + podsToIgnoreLookup[fmt.Sprintf("%s/%s", pod.Namespace, pod.Name)] = struct{}{} + } + + log.Infof("starting wait for pod termination, %d pods in ignore list", len(podsToIgnore)) + return waitext.Retry( + ctx, + waitext.NewConstantBackoff(h.cfg.podsTerminationWaitRetryDelay), + waitext.Forever, + func(ctx context.Context) (bool, error) { + pods, err := h.listNodePodsToEvict(ctx, log, node) + if err != nil { + return true, fmt.Errorf("listing %q pods to be terminated: %w", node.Name, err) + } + + 
podsNames := lo.Map(pods, func(p v1.Pod, _ int) string { + return fmt.Sprintf("%s/%s", p.Namespace, p.Name) + }) + + remainingPodsList := podsNames + if len(podsToIgnore) > 0 { + remainingPodsList = lo.Filter(remainingPodsList, func(podName string, _ int) bool { + _, ok := podsToIgnoreLookup[podName] + return !ok + }) + } + if remainingPods := len(remainingPodsList); remainingPods > 0 { + return true, fmt.Errorf("waiting for %d pods (%v) to be terminated on node %v", remainingPods, remainingPodsList, node.Name) + } + return false, nil + }, + func(err error) { + h.log.Warnf("waiting for pod termination on node %v, will retry: %v", node.Name, err) + }, + ) +} + +func (h *drainNodePollingHandler) evictPod(ctx context.Context, pod v1.Pod, groupVersion schema.GroupVersion) error { + b := waitext.NewConstantBackoff(h.cfg.podEvictRetryDelay) + action := func(ctx context.Context) (bool, error) { + var err error + + h.log.Debugf("requesting eviction for pod %s/%s", pod.Namespace, pod.Name) + if groupVersion == policyv1.SchemeGroupVersion { + err = h.clientset.PolicyV1().Evictions(pod.Namespace).Evict(ctx, &policyv1.Eviction{ + ObjectMeta: metav1.ObjectMeta{ + Name: pod.Name, + Namespace: pod.Namespace, + }, + }) + } else { + err = h.clientset.CoreV1().Pods(pod.Namespace).EvictV1beta1(ctx, &v1beta1.Eviction{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "policy/v1beta1", + Kind: "Eviction", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: pod.Name, + Namespace: pod.Namespace, + }, + }) + } + + if err != nil { + if apierrors.IsNotFound(err) { + return false, nil + } + if apierrors.IsInternalError(err) { + return false, err + } + } + + return true, err + } + err := waitext.Retry(ctx, b, waitext.Forever, action, func(err error) { + h.log.Warnf("evict pod %s on node %s in namespace %s, will retry: %v", pod.Name, pod.Spec.NodeName, pod.Namespace, err) + }) + if err != nil { + return fmt.Errorf("evicting pod %s in namespace %s: %w", pod.Name, pod.Namespace, err) + } + return nil +} + +func (h *drainNodePollingHandler) deletePod(ctx context.Context, options metav1.DeleteOptions, pod v1.Pod) error { + b := waitext.NewConstantBackoff(h.cfg.podDeleteRetryDelay) + action := func(ctx context.Context) (bool, error) { + err := h.clientset.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, options) + if err != nil { + if apierrors.IsNotFound(err) { + return false, nil + } + if apierrors.IsInternalError(err) { + return false, err + } + } + return true, err + } + err := waitext.Retry(ctx, b, h.cfg.podDeleteRetries, action, func(err error) { + h.log.Warnf("deleting pod %s on node %s in namespace %s, will retry: %v", pod.Name, pod.Spec.NodeName, pod.Namespace, err) + }) + if err != nil { + return fmt.Errorf("deleting pod %s in namespace %s: %w", pod.Name, pod.Namespace, err) + } + return nil +} diff --git a/internal/actions/drain_node_handler_test.go b/internal/actions/drain_node_handler_test.go index 036343bd..c75bf8cb 100644 --- a/internal/actions/drain_node_handler_test.go +++ b/internal/actions/drain_node_handler_test.go @@ -20,6 +20,7 @@ import ( ktest "k8s.io/client-go/testing" "github.com/castai/cluster-controller/internal/castai" + "github.com/castai/cluster-controller/internal/informer" ) func TestGetDrainTimeout(t *testing.T) { @@ -37,12 +38,8 @@ func TestGetDrainTimeout(t *testing.T) { }, CreatedAt: time.Now().UTC(), } - h := DrainNodeHandler{ - log: log, - cfg: drainNodeConfig{}, - } - timeout := h.getDrainTimeout(action) + timeout := getDrainTimeout(action) // We give some wiggle room as the test might get here 
a few milliseconds late. r.InDelta((100 * time.Second).Milliseconds(), timeout.Milliseconds(), 10) @@ -59,12 +56,8 @@ func TestGetDrainTimeout(t *testing.T) { }, CreatedAt: time.Now().UTC().Add(-3 * time.Minute), } - h := DrainNodeHandler{ - log: log, - cfg: drainNodeConfig{}, - } - timeout := h.getDrainTimeout(action) + timeout := getDrainTimeout(action) r.Less(int(math.Floor(timeout.Seconds())), 600) }) @@ -79,12 +72,8 @@ func TestGetDrainTimeout(t *testing.T) { }, CreatedAt: time.Now().UTC().Add(-60 * time.Minute), } - h := DrainNodeHandler{ - log: log, - cfg: drainNodeConfig{}, - } - timeout := h.getDrainTimeout(action) + timeout := getDrainTimeout(action) r.Equal(0, int(timeout.Seconds())) }) } @@ -390,7 +379,7 @@ func TestDrainNodeHandler_Handle(t *testing.T) { wantErrorContains: "failed to drain via graceful eviction", }, { - name: "when eviction timeout is reached and force=false, leaves node cordoned and skip deletion", + name: "when drain timeout is 0 and force=false, returns error without attempting eviction", fields: fields{ clientSet: func(t *testing.T) *fake.Clientset { c := setupFakeClientWithNodePodEviction(nodeName, nodeID, providerID, podName) @@ -402,8 +391,8 @@ func TestDrainNodeHandler_Handle(t *testing.T) { cfg: drainNodeConfig{}, action: newActionDrainNode(nodeName, nodeID, providerID, 0, false), }, - wantErr: context.DeadlineExceeded, - wantErrorContains: "failed to drain via graceful eviction", + wantErr: errAction, + wantErrorContains: "drain timeout is 0", }, { name: "eviction fails and force=true, force remove pods: timeout during eviction", @@ -576,11 +565,22 @@ func TestDrainNodeHandler_Handle(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { t.Parallel() - h := &DrainNodeHandler{ - log: logrus.New(), - clientset: tt.fields.clientSet(t), - cfg: tt.args.cfg, - } + log := logrus.New() + clientset := tt.fields.clientSet(t) + + // Create and start informer manager + infMgr := informer.NewManager(log, clientset, 10*time.Minute) + ctx, cancel := context.WithCancel(context.Background()) + t.Cleanup(cancel) + + go func() { + _ = infMgr.Start(ctx) + }() + + // Wait for informer caches to sync + time.Sleep(100 * time.Millisecond) + + h := NewDrainNodeHandler(log, clientset, tt.args.cfg.castNamespace, infMgr) err := h.Handle(context.Background(), tt.args.action) require.Equal(t, tt.wantErr != nil, err != nil, "expected error: %v, got: %v", tt.wantErr, err) if tt.wantErr != nil { @@ -592,15 +592,15 @@ func TestDrainNodeHandler_Handle(t *testing.T) { return } - n, err := h.clientset.CoreV1().Nodes().Get(context.Background(), tt.args.action.ActionDrainNode.NodeName, metav1.GetOptions{}) + n, err := clientset.CoreV1().Nodes().Get(context.Background(), tt.args.action.ActionDrainNode.NodeName, metav1.GetOptions{}) require.True(t, (err != nil && apierrors.IsNotFound(err)) || (err == nil && n.Spec.Unschedulable == !tt.wantNodeNotCordoned), "expected node to be not found or cordoned, got: %v", err) - _, err = h.clientset.CoreV1().Pods("default").Get(context.Background(), podName, metav1.GetOptions{}) + _, err = clientset.CoreV1().Pods("default").Get(context.Background(), podName, metav1.GetOptions{}) require.True(t, (tt.wantPodIsNotFound && apierrors.IsNotFound(err)) || (!tt.wantPodIsNotFound && err == nil), "expected pod to be not found, got: %v", err) - checkPods(t, h.clientset, "ds-pod", "static-pod", "job-pod") + checkPods(t, clientset, "ds-pod", "static-pod", "job-pod") }) } } diff --git a/internal/actions/types.go b/internal/actions/types.go 
index 1d60069f..d4e98897 100644 --- a/internal/actions/types.go +++ b/internal/actions/types.go @@ -19,10 +19,9 @@ const ( ) var ( - errAction = errors.New("not valid action") - errNodeNotFound = errors.New("node not found") - errNodeDoesNotMatch = fmt.Errorf("node does not match") - errNodeWatcherClosed = fmt.Errorf("node watcher closed, no more events will be received") + errAction = errors.New("not valid action") + errNodeNotFound = errors.New("node not found") + errNodeDoesNotMatch = fmt.Errorf("node does not match") ) func newUnexpectedTypeErr(value, expectedType interface{}) error { diff --git a/internal/config/config.go b/internal/config/config.go index b6b5a844..80503ccf 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -32,8 +32,13 @@ type Config struct { // MaxActionsInProgress serves as a safeguard to limit the number of Goroutines in progress. MaxActionsInProgress int - MonitorMetadataPath string `mapstructure:"monitor_metadata"` - SelfPod Pod `mapstructure:"self_pod"` + MonitorMetadataPath string `mapstructure:"monitor_metadata"` + SelfPod Pod `mapstructure:"self_pod"` + Informer InformerConfig `mapstructure:"informer"` +} + +type InformerConfig struct { + ResyncPeriod time.Duration `mapstructure:"resync_period"` } type Pod struct { @@ -109,6 +114,8 @@ func Get() Config { _ = viper.BindEnv("metrics.port", "METRICS_PORT") _ = viper.BindEnv("metrics.exportenabled", "METRICS_EXPORT_ENABLED") _ = viper.BindEnv("metrics.exportinterval", "METRICS_EXPORT_INTERVAL") + _ = viper.BindEnv("informer.enabled", "INFORMER_ENABLED") + _ = viper.BindEnv("informer.resyncperiod", "INFORMER_RESYNC_PERIOD") cfg = &Config{} if err := viper.Unmarshal(&cfg); err != nil { @@ -173,6 +180,12 @@ func Get() Config { cfg.Metrics.ExportInterval = 30 * time.Second } + // Informer defaults + if cfg.Informer.ResyncPeriod < 1*time.Hour { + // Default to 12 hours, consistent with CSR informer + cfg.Informer.ResyncPeriod = 12 * time.Hour + } + return *cfg } diff --git a/internal/config/config_test.go b/internal/config/config_test.go index c5c8ae0e..80d3a2c1 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -55,6 +55,9 @@ func TestConfig(t *testing.T) { ExportEnabled: false, ExportInterval: 30 * time.Second, }, + Informer: InformerConfig{ + ResyncPeriod: 12 * time.Hour, + }, } require.Equal(t, expected, cfg) diff --git a/internal/controller/controller.go b/internal/controller/controller.go index 5906a508..f8b7f2cf 100644 --- a/internal/controller/controller.go +++ b/internal/controller/controller.go @@ -18,6 +18,7 @@ import ( "github.com/castai/cluster-controller/internal/actions" "github.com/castai/cluster-controller/internal/castai" "github.com/castai/cluster-controller/internal/helm" + "github.com/castai/cluster-controller/internal/informer" "github.com/castai/cluster-controller/internal/metrics" "github.com/castai/cluster-controller/internal/waitext" ) @@ -43,6 +44,7 @@ func NewService( castaiClient castai.CastAIClient, helmClient helm.Client, healthCheck *health.HealthzProvider, + informerManager *informer.Manager, ) *Controller { return &Controller{ log: log, @@ -52,7 +54,7 @@ func NewService( startedActions: map[string]struct{}{}, actionHandlers: map[reflect.Type]actions.ActionHandler{ reflect.TypeOf(&castai.ActionDeleteNode{}): actions.NewDeleteNodeHandler(log, clientset), - reflect.TypeOf(&castai.ActionDrainNode{}): actions.NewDrainNodeHandler(log, clientset, cfg.Namespace), + reflect.TypeOf(&castai.ActionDrainNode{}): 
actions.NewDrainNodeHandler(log, clientset, cfg.Namespace, informerManager), reflect.TypeOf(&castai.ActionPatchNode{}): actions.NewPatchNodeHandler(log, clientset), reflect.TypeOf(&castai.ActionCreateEvent{}): actions.NewCreateEventHandler(log, clientset), reflect.TypeOf(&castai.ActionChartUpsert{}): actions.NewChartUpsertHandler(log, helmClient), @@ -60,7 +62,7 @@ func NewService( reflect.TypeOf(&castai.ActionChartRollback{}): actions.NewChartRollbackHandler(log, helmClient, cfg.Version), reflect.TypeOf(&castai.ActionDisconnectCluster{}): actions.NewDisconnectClusterHandler(log, clientset), reflect.TypeOf(&castai.ActionCheckNodeDeleted{}): actions.NewCheckNodeDeletedHandler(log, clientset), - reflect.TypeOf(&castai.ActionCheckNodeStatus{}): actions.NewCheckNodeStatusHandler(log, clientset), + reflect.TypeOf(&castai.ActionCheckNodeStatus{}): actions.NewCheckNodeStatusHandler(log, clientset, informerManager), reflect.TypeOf(&castai.ActionEvictPod{}): actions.NewEvictPodHandler(log, clientset), reflect.TypeOf(&castai.ActionPatch{}): actions.NewPatchHandler(log, dynamicClient), reflect.TypeOf(&castai.ActionCreate{}): actions.NewCreateHandler(log, dynamicClient), diff --git a/internal/controller/controller_test.go b/internal/controller/controller_test.go index 9213b920..17679cbc 100644 --- a/internal/controller/controller_test.go +++ b/internal/controller/controller_test.go @@ -16,7 +16,7 @@ import ( "github.com/castai/cluster-controller/health" mock_actions "github.com/castai/cluster-controller/internal/actions/mock" "github.com/castai/cluster-controller/internal/castai" - "github.com/castai/cluster-controller/internal/castai/mock" + mock_castai "github.com/castai/cluster-controller/internal/castai/mock" ) // nolint: govet @@ -248,7 +248,7 @@ func TestController_Run(t *testing.T) { nil, client, nil, - health.NewHealthzProvider(health.HealthzCfg{HealthyPollIntervalLimit: pollTimeout}, logrus.New())) + health.NewHealthzProvider(health.HealthzCfg{HealthyPollIntervalLimit: pollTimeout}, logrus.New()), nil) handler := mock_actions.NewMockActionHandler(m) if tt.fields.tuneMockHandler != nil { tt.fields.tuneMockHandler(handler) diff --git a/internal/informer/manager.go b/internal/informer/manager.go new file mode 100644 index 00000000..169c5ec7 --- /dev/null +++ b/internal/informer/manager.go @@ -0,0 +1,200 @@ +package informer + +import ( + "context" + "fmt" + "sync" + "time" + + "github.com/sirupsen/logrus" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/client-go/informers" + "k8s.io/client-go/kubernetes" + listerv1 "k8s.io/client-go/listers/core/v1" + "k8s.io/client-go/tools/cache" + + "github.com/castai/cluster-controller/internal/metrics" +) + +const ( + cacheSyncTimeout = 30 * time.Second +) + +// Manager manages the global SharedInformerFactory and provides +// access to specific informers and listers. +type Manager struct { + log logrus.FieldLogger + clientset kubernetes.Interface + factory informers.SharedInformerFactory + + nodeInformer cache.SharedIndexInformer + nodeLister listerv1.NodeLister + + podInformer cache.SharedIndexInformer + podLister listerv1.PodLister + + started bool + cancelFunc context.CancelFunc + mu sync.RWMutex +} + +// NewManager creates a new Manager with the given clientset and resync period. 
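// Usage sketch for the constructor declared below together with Start, Stop and
// the listers defined later in this file. The fake clientset and the 12h resync
// period are example values; the sketch assumes an extra import of
// "k8s.io/client-go/kubernetes/fake".
func exampleManagerLifecycle(ctx context.Context, log logrus.FieldLogger) error {
	clientset := fake.NewSimpleClientset()
	m := NewManager(log, clientset, 12*time.Hour)

	// Start blocks until the node and pod caches are synced or the sync timeout expires.
	if err := m.Start(ctx); err != nil {
		return fmt.Errorf("starting informer manager: %w", err)
	}
	defer m.Stop()

	// Reads are served from the local cache, not from the API server.
	pods, err := m.GetPodLister().List(labels.Everything())
	if err != nil {
		return err
	}
	log.Infof("pods in informer cache: %d", len(pods))
	return nil
}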
+func NewManager( + log logrus.FieldLogger, + clientset kubernetes.Interface, + resyncPeriod time.Duration, +) *Manager { + factory := informers.NewSharedInformerFactory(clientset, resyncPeriod) + + // Create node informer + nodeInformer := factory.Core().V1().Nodes().Informer() + nodeLister := factory.Core().V1().Nodes().Lister() + + // Create pod informer + podInformer := factory.Core().V1().Pods().Informer() + podLister := factory.Core().V1().Pods().Lister() + + return &Manager{ + log: log, + clientset: clientset, + factory: factory, + nodeInformer: nodeInformer, + nodeLister: nodeLister, + podInformer: podInformer, + podLister: podLister, + } +} + +// Start starts the informer factory and waits for all caches to sync. +// This method blocks until caches are synchronized or the context is cancelled. +func (m *Manager) Start(ctx context.Context) error { + m.mu.Lock() + if m.started { + m.mu.Unlock() + m.log.Warn("informer manager already started") + return nil + } + m.mu.Unlock() + + // Create a cancellable context for the informer + ctx, cancel := context.WithCancel(ctx) + m.mu.Lock() + m.cancelFunc = cancel + m.mu.Unlock() + + // Start the factory in a goroutine + stopCh := make(chan struct{}) + go func() { + <-ctx.Done() + close(stopCh) + }() + + m.log.Info("starting shared informer factory...") + m.factory.Start(stopCh) + + // Wait for cache sync with timeout + syncCtx, syncCancel := context.WithTimeout(ctx, cacheSyncTimeout) + defer syncCancel() + + m.log.Info("waiting for informer caches to sync...") + if !cache.WaitForCacheSync(syncCtx.Done(), m.nodeInformer.HasSynced, m.podInformer.HasSynced) { + metrics.IncrementInformerCacheSyncs("node", "failure") + metrics.IncrementInformerCacheSyncs("pod", "failure") + return fmt.Errorf("failed to sync informer caches within %v", cacheSyncTimeout) + } + + metrics.IncrementInformerCacheSyncs("node", "success") + metrics.IncrementInformerCacheSyncs("pod", "success") + + m.mu.Lock() + m.started = true + m.mu.Unlock() + + m.log.Info("informer caches synced successfully") + + // Start background cache size reporter + go m.reportCacheSize(ctx) + + return nil +} + +// Stop gracefully stops the informer factory. +func (m *Manager) Stop() { + m.mu.Lock() + defer m.mu.Unlock() + + if !m.started { + return + } + + m.log.Info("stopping informer manager...") + if m.cancelFunc != nil { + m.cancelFunc() + m.cancelFunc = nil + } + m.started = false + m.log.Info("informer manager stopped") +} + +// IsStarted returns true if the informer manager has been started and caches are synced. +func (m *Manager) IsStarted() bool { + m.mu.RLock() + defer m.mu.RUnlock() + return m.started +} + +// GetNodeLister returns the node lister for querying the node cache. +func (m *Manager) GetNodeLister() listerv1.NodeLister { + return m.nodeLister +} + +// GetNodeInformer returns the node informer for watching node events. +func (m *Manager) GetNodeInformer() cache.SharedIndexInformer { + return m.nodeInformer +} + +// GetPodLister returns the pod lister for querying the pod cache. +func (m *Manager) GetPodLister() listerv1.PodLister { + return m.podLister +} + +// GetPodInformer returns the pod informer for watching pod events. +func (m *Manager) GetPodInformer() cache.SharedIndexInformer { + return m.podInformer +} + +// GetFactory returns the underlying SharedInformerFactory for advanced use cases. +func (m *Manager) GetFactory() informers.SharedInformerFactory { + return m.factory +} + +// reportCacheSize periodically reports the node and pod cache sizes as metrics. 
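// With the informer metrics registered elsewhere in this patch, the reporter
// below yields gauge samples of roughly this shape on the metrics endpoint
// (values are illustrative only):
//
//	informer_cache_size{resource="node"} 42
//	informer_cache_size{resource="pod"} 1350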
+func (m *Manager) reportCacheSize(ctx context.Context) { + ticker := time.NewTicker(30 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + nodes, err := m.nodeLister.List(labels.Everything()) + if err != nil { + m.log.WithError(err).Warn("failed to list nodes for cache size metric") + } else { + size := len(nodes) + m.log.WithField("cache_size", size).Debug("node informer cache size") + metrics.SetInformerCacheSize("node", size) + } + + pods, err := m.podLister.List(labels.Everything()) + if err != nil { + m.log.WithError(err).Warn("failed to list pods for cache size metric") + } else { + size := len(pods) + m.log.WithField("cache_size", size).Debug("pod informer cache size") + metrics.SetInformerCacheSize("pod", size) + } + } + } +} diff --git a/internal/informer/manager_test.go b/internal/informer/manager_test.go new file mode 100644 index 00000000..6408c8fb --- /dev/null +++ b/internal/informer/manager_test.go @@ -0,0 +1,280 @@ +package informer + +import ( + "context" + "testing" + "time" + + "github.com/sirupsen/logrus" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/client-go/kubernetes/fake" +) + +func TestNewManager(t *testing.T) { + t.Parallel() + + log := logrus.New() + clientset := fake.NewSimpleClientset() + resyncPeriod := 12 * time.Hour + + manager := NewManager(log, clientset, resyncPeriod) + + require.NotNil(t, manager) + require.NotNil(t, manager.factory) + require.NotNil(t, manager.nodeInformer) + require.NotNil(t, manager.nodeLister) + require.False(t, manager.IsStarted()) +} + +func TestManager_Start_Success(t *testing.T) { + t.Parallel() + + log := logrus.New() + log.SetLevel(logrus.ErrorLevel) // Reduce test noise + + // Create fake clientset with some nodes + node1 := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node-1", + Labels: map[string]string{ + "test": "true", + }, + }, + } + node2 := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node-2", + }, + } + + clientset := fake.NewSimpleClientset(node1, node2) + manager := NewManager(log, clientset, 0) + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + // Start should succeed and sync caches + err := manager.Start(ctx) + require.NoError(t, err) + require.True(t, manager.IsStarted()) + + // Should be able to list nodes from cache + lister := manager.GetNodeLister() + require.NotNil(t, lister) + + nodes, err := lister.List(labels.Everything()) + require.NoError(t, err) + require.Len(t, nodes, 2) + + // Cleanup + manager.Stop() + require.False(t, manager.IsStarted()) +} + +func TestManager_Start_AlreadyStarted(t *testing.T) { + t.Parallel() + + log := logrus.New() + log.SetLevel(logrus.ErrorLevel) + + clientset := fake.NewSimpleClientset() + manager := NewManager(log, clientset, 0) + + ctx := context.Background() + + // Start first time + err := manager.Start(ctx) + require.NoError(t, err) + require.True(t, manager.IsStarted()) + + // Start second time should return nil without error + err = manager.Start(ctx) + require.NoError(t, err) + require.True(t, manager.IsStarted()) + + manager.Stop() +} + +func TestManager_Start_ContextCanceled(t *testing.T) { + t.Parallel() + + log := logrus.New() + log.SetLevel(logrus.ErrorLevel) + + clientset := fake.NewSimpleClientset() + manager := NewManager(log, clientset, 0) + + // Create a context that's already canceled + ctx, cancel := 
context.WithCancel(context.Background()) + cancel() + + // Start should fail due to context cancellation + err := manager.Start(ctx) + require.Error(t, err) + require.Contains(t, err.Error(), "failed to sync informer caches") + require.False(t, manager.IsStarted()) +} + +func TestManager_GetNodeLister(t *testing.T) { + t.Parallel() + + log := logrus.New() + log.SetLevel(logrus.ErrorLevel) + + node := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node", + Labels: map[string]string{ + "env": "test", + }, + }, + Spec: corev1.NodeSpec{ + ProviderID: "test-provider-id", + }, + } + + clientset := fake.NewSimpleClientset(node) + manager := NewManager(log, clientset, 0) + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + err := manager.Start(ctx) + require.NoError(t, err) + defer manager.Stop() + + lister := manager.GetNodeLister() + require.NotNil(t, lister) + + // Test Get by name + retrievedNode, err := lister.Get("test-node") + require.NoError(t, err) + require.Equal(t, "test-node", retrievedNode.Name) + require.Equal(t, "test-provider-id", retrievedNode.Spec.ProviderID) + + // Test Get non-existent node + _, err = lister.Get("non-existent") + require.Error(t, err) + require.Contains(t, err.Error(), "not found") + + // Test List all nodes + nodes, err := lister.List(labels.Everything()) + require.NoError(t, err) + require.Len(t, nodes, 1) +} + +func TestManager_Stop(t *testing.T) { + t.Parallel() + + log := logrus.New() + log.SetLevel(logrus.ErrorLevel) + + clientset := fake.NewSimpleClientset() + manager := NewManager(log, clientset, 0) + + // Stop when not started should be safe + manager.Stop() + require.False(t, manager.IsStarted()) + + // Start and then stop + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + err := manager.Start(ctx) + require.NoError(t, err) + require.True(t, manager.IsStarted()) + + manager.Stop() + require.False(t, manager.IsStarted()) + + // Multiple stops should be safe + manager.Stop() + require.False(t, manager.IsStarted()) +} + +func TestManager_IsStarted(t *testing.T) { + t.Parallel() + + log := logrus.New() + log.SetLevel(logrus.ErrorLevel) + + clientset := fake.NewSimpleClientset() + manager := NewManager(log, clientset, 0) + + // Initially not started + require.False(t, manager.IsStarted()) + + // After start + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + err := manager.Start(ctx) + require.NoError(t, err) + require.True(t, manager.IsStarted()) + + // After stop + manager.Stop() + require.False(t, manager.IsStarted()) +} + +func TestManager_GetFactory(t *testing.T) { + t.Parallel() + + log := logrus.New() + clientset := fake.NewSimpleClientset() + manager := NewManager(log, clientset, 0) + + factory := manager.GetFactory() + require.NotNil(t, factory) +} + +func TestManager_CacheUpdates(t *testing.T) { + t.Parallel() + + log := logrus.New() + log.SetLevel(logrus.ErrorLevel) + + // Start with one node + node1 := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node-1", + }, + } + + clientset := fake.NewSimpleClientset(node1) + manager := NewManager(log, clientset, 0) + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + err := manager.Start(ctx) + require.NoError(t, err) + defer manager.Stop() + + lister := manager.GetNodeLister() + + // Should see initial node + nodes, err := lister.List(labels.Everything()) + require.NoError(t, err) + require.Len(t, nodes, 1) + 
+ // Add another node to the fake clientset + node2 := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node-2", + }, + } + _, err = clientset.CoreV1().Nodes().Create(context.Background(), node2, metav1.CreateOptions{}) + require.NoError(t, err) + + // Wait a bit for the informer to pick up the change + time.Sleep(100 * time.Millisecond) + + // Should now see both nodes + nodes, err = lister.List(labels.Everything()) + require.NoError(t, err) + require.Len(t, nodes, 2) +} diff --git a/internal/metrics/custom_metrics.go b/internal/metrics/custom_metrics.go index 9c041d25..d1701e06 100644 --- a/internal/metrics/custom_metrics.go +++ b/internal/metrics/custom_metrics.go @@ -51,3 +51,29 @@ func ActionFinished(actionType string, success bool, duration time.Duration) { actionExecutedCounter.With(prometheus.Labels{"success": strconv.FormatBool(success), "type": actionType}).Inc() actionExecutedDuration.With(prometheus.Labels{"type": actionType}).Observe(duration.Seconds()) } + +// informerCacheSize tracks the size of informer caches. +var informerCacheSize = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "informer_cache_size", + Help: "Number of objects in informer cache by resource type.", + }, + []string{"resource"}, +) + +// informerCacheSyncs tracks informer cache sync attempts. +var informerCacheSyncs = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "informer_cache_syncs_total", + Help: "Informer cache sync attempts by resource and status.", + }, + []string{"resource", "status"}, +) + +func SetInformerCacheSize(resource string, size int) { + informerCacheSize.With(prometheus.Labels{"resource": resource}).Set(float64(size)) +} + +func IncrementInformerCacheSyncs(resource, status string) { + informerCacheSyncs.With(prometheus.Labels{"resource": resource, "status": status}).Inc() +} diff --git a/internal/metrics/register.go b/internal/metrics/register.go index 047d0dca..84fada03 100644 --- a/internal/metrics/register.go +++ b/internal/metrics/register.go @@ -9,5 +9,7 @@ func RegisterCustomMetrics() { actionStartedCounter, actionExecutedCounter, actionExecutedDuration, + informerCacheSize, + informerCacheSyncs, ) } diff --git a/loadtest/scenarios/check_node_status.go b/loadtest/scenarios/check_node_status.go index ea3bc7cd..ad41c34d 100644 --- a/loadtest/scenarios/check_node_status.go +++ b/loadtest/scenarios/check_node_status.go @@ -49,7 +49,7 @@ func (s *checkNodeStatusScenario) Preparation(ctx context.Context, namespace str errGroup.Go(func() error { nodeName := fmt.Sprintf("kwok-check-status-%d", i) s.log.Info(fmt.Sprintf("Creating node %s", nodeName)) - node := NewKwokNode(KwokConfig{}, nodeName) + node := NewKwokNode(KwokConfig{ProviderID: fmt.Sprintf("kwok://%s", nodeName)}, nodeName) _, err := clientset.CoreV1().Nodes().Create(ctx, node, metav1.CreateOptions{}) if err != nil && !apierrors.IsAlreadyExists(err) { @@ -112,6 +112,7 @@ func (s *checkNodeStatusScenario) Run(ctx context.Context, _ string, _ kubernete CreatedAt: time.Now().UTC(), ActionCheckNodeStatus: &castai.ActionCheckNodeStatus{ NodeName: node.Name, + ProviderId: node.Spec.ProviderID, NodeStatus: castai.ActionCheckNodeStatus_READY, }, }) diff --git a/loadtest/scenarios/k8s_objects.go b/loadtest/scenarios/k8s_objects.go index 8d8dd618..ae4411c5 100644 --- a/loadtest/scenarios/k8s_objects.go +++ b/loadtest/scenarios/k8s_objects.go @@ -33,6 +33,8 @@ type KwokConfig struct { // Annotation should match what kwok is configured to use via --manage-nodes-with-annotation-selector // Default is 
DefaultKwokMarker. Value is always KwokMarkerValue. Annotation string + + ProviderID string } // NewKwokNode creates a fake node with reasonable defaults. @@ -69,6 +71,7 @@ func NewKwokNode(cfg KwokConfig, nodeName string) *corev1.Node { Annotations: defaultAnnotations, }, Spec: corev1.NodeSpec{ + ProviderID: cfg.ProviderID, Taints: []corev1.Taint{ { Key: DefaultKwokMarker,