From 4e9d2e311f480e7059f9a9ac085c17f5a2de71d4 Mon Sep 17 00:00:00 2001 From: Sri Roopa Ramesh Babu Date: Fri, 19 Dec 2025 09:58:53 -0500 Subject: [PATCH] App-server Postgres reconciliation on certificate rotation. --- internal/controller/appserver/deployment.go | 16 +- internal/controller/postgres/deployment.go | 45 ++++- internal/controller/utils/constants.go | 6 + internal/controller/watchers/watchers.go | 185 +++++++++++++++++++- test/e2e/postgres_restart_test.go | 84 +++++++++ 5 files changed, 321 insertions(+), 15 deletions(-) diff --git a/internal/controller/appserver/deployment.go b/internal/controller/appserver/deployment.go index bb3ec0156..41301b742 100644 --- a/internal/controller/appserver/deployment.go +++ b/internal/controller/appserver/deployment.go @@ -357,15 +357,19 @@ func GenerateOLSDeployment(r reconciler.Reconciler, cr *olsv1alpha1.OLSConfig) ( // Get ResourceVersions for tracking - these resources should already exist // If they don't exist, we'll get empty strings which is fine for initial creation configMapResourceVersion, _ := utils.GetConfigMapResourceVersion(r, ctx, utils.OLSConfigCmName) + secretResourceVersion, _ := utils.GetSecretResourceVersion(r, ctx, utils.PostgresSecretName) + + annotations := map[string]string{ + utils.OLSConfigMapResourceVersionAnnotation: configMapResourceVersion, + utils.PostgresSecretResourceVersionAnnotation: secretResourceVersion, + } deployment := appsv1.Deployment{ ObjectMeta: metav1.ObjectMeta{ - Name: utils.OLSAppServerDeploymentName, - Namespace: r.GetNamespace(), - Labels: utils.GenerateAppServerSelectorLabels(), - Annotations: map[string]string{ - utils.OLSConfigMapResourceVersionAnnotation: configMapResourceVersion, - }, + Name: utils.OLSAppServerDeploymentName, + Namespace: r.GetNamespace(), + Labels: utils.GenerateAppServerSelectorLabels(), + Annotations: annotations, }, Spec: appsv1.DeploymentSpec{ Selector: &metav1.LabelSelector{ diff --git a/internal/controller/postgres/deployment.go 
b/internal/controller/postgres/deployment.go index 553f85704..d4d509fb4 100644 --- a/internal/controller/postgres/deployment.go +++ b/internal/controller/postgres/deployment.go @@ -171,6 +171,8 @@ func GeneratePostgresDeployment(r reconciler.Reconciler, ctx context.Context, cr // Get ResourceVersions for tracking - these resources should already exist // If they don't exist, we'll get empty strings which is fine for initial creation configMapResourceVersion, _ := utils.GetConfigMapResourceVersion(r, ctx, utils.PostgresConfigMap) + secretResourceVersion, _ := utils.GetSecretResourceVersion(r, ctx, utils.PostgresSecretName) + postgresCertsSecretVersion, _ := utils.GetSecretResourceVersion(r, ctx, utils.PostgresCertsSecretName) deployment := appsv1.Deployment{ ObjectMeta: metav1.ObjectMeta{ @@ -178,11 +180,18 @@ func GeneratePostgresDeployment(r reconciler.Reconciler, ctx context.Context, cr Namespace: r.GetNamespace(), Labels: utils.GeneratePostgresSelectorLabels(), Annotations: map[string]string{ - utils.PostgresConfigMapResourceVersionAnnotation: configMapResourceVersion, + utils.PostgresConfigMapResourceVersionAnnotation: configMapResourceVersion, + utils.PostgresSecretResourceVersionAnnotation: secretResourceVersion, + utils.PostgresCertsSecretResourceVersionAnnotation: postgresCertsSecretVersion, }, }, Spec: appsv1.DeploymentSpec{ Replicas: &cacheReplicas, + // Use Recreate strategy for PostgreSQL since it uses ReadWriteOnce PVC + // This prevents Multi-Attach errors during rolling updates + Strategy: appsv1.DeploymentStrategy{ + Type: appsv1.RecreateDeploymentStrategyType, + }, Selector: &metav1.LabelSelector{ MatchLabels: utils.GeneratePostgresSelectorLabels(), }, @@ -272,6 +281,35 @@ func UpdatePostgresDeployment(r reconciler.Reconciler, ctx context.Context, cr * } } + // Check if Secret ResourceVersion has changed + currentSecretVersion, err := utils.GetSecretResourceVersion(r, ctx, utils.PostgresSecretName) + if err != nil { + r.GetLogger().Info("failed to 
get Secret ResourceVersion", "error", err) + changed = true + } else { + storedSecretVersion := existingDeployment.Annotations[utils.PostgresSecretResourceVersionAnnotation] + if storedSecretVersion != currentSecretVersion { + changed = true + } + } + + // Check if Postgres TLS Certificate Secret ResourceVersion has changed + // This enables smooth rotation when service-ca-operator regenerates certificates + currentPostgresCertsVersion, err := utils.GetSecretResourceVersion(r, ctx, utils.PostgresCertsSecretName) + if err != nil { + r.GetLogger().Info("failed to get Postgres TLS Certificate Secret ResourceVersion", "error", err) + // Don't set changed = true here - TLS cert secret might not exist yet (service-ca-operator creates it) + // We'll track it when it becomes available + } else { + storedPostgresCertsVersion := existingDeployment.Annotations[utils.PostgresCertsSecretResourceVersionAnnotation] + if storedPostgresCertsVersion != currentPostgresCertsVersion { + r.GetLogger().Info("Postgres TLS certificate secret ResourceVersion changed, triggering restart", + "oldVersion", storedPostgresCertsVersion, + "newVersion", currentPostgresCertsVersion) + changed = true + } + } + // If nothing changed, skip update if !changed { return nil @@ -280,6 +318,11 @@ func UpdatePostgresDeployment(r reconciler.Reconciler, ctx context.Context, cr * // Apply changes - always update spec and annotations since something changed existingDeployment.Spec = desiredDeployment.Spec existingDeployment.Annotations[utils.PostgresConfigMapResourceVersionAnnotation] = desiredDeployment.Annotations[utils.PostgresConfigMapResourceVersionAnnotation] + existingDeployment.Annotations[utils.PostgresSecretResourceVersionAnnotation] = desiredDeployment.Annotations[utils.PostgresSecretResourceVersionAnnotation] + // Update TLS cert secret ResourceVersion if it exists in desired deployment + if postgresCertsVersion, ok := 
desiredDeployment.Annotations[utils.PostgresCertsSecretResourceVersionAnnotation]; ok { + existingDeployment.Annotations[utils.PostgresCertsSecretResourceVersionAnnotation] = postgresCertsVersion + } r.GetLogger().Info("updating OLS postgres deployment", "name", existingDeployment.Name) diff --git a/internal/controller/utils/constants.go b/internal/controller/utils/constants.go index 48e71d470..6a0e17c5e 100644 --- a/internal/controller/utils/constants.go +++ b/internal/controller/utils/constants.go @@ -180,6 +180,12 @@ const ( // PostgresSecretResourceVersionAnnotation is the annotation key for tracking Secret ResourceVersion //nolint:gosec // G101: This is an annotation key name, not a credential PostgresSecretResourceVersionAnnotation = "ols.openshift.io/postgres-secret-version" + // PostgresCertsSecretResourceVersionAnnotation is the annotation key for tracking Postgres TLS certificate Secret ResourceVersion + //nolint:gosec // not a credential, just an annotation key + PostgresCertsSecretResourceVersionAnnotation = "ols.openshift.io/postgres-certs-secret-version" + // ConsoleUIServiceCertSecretResourceVersionAnnotation is the annotation key for tracking Console UI TLS certificate Secret ResourceVersion + //nolint:gosec // not a credential, just an annotation key + ConsoleUIServiceCertSecretResourceVersionAnnotation = "ols.openshift.io/console-ui-certs-secret-version" // PostgresServiceName is the name of OLS application Postgres server service PostgresServiceName = "lightspeed-postgres-server" // PostgresSecretName is the name of OLS application Postgres secret diff --git a/internal/controller/watchers/watchers.go b/internal/controller/watchers/watchers.go index 1df81ac5c..5ecd022f7 100644 --- a/internal/controller/watchers/watchers.go +++ b/internal/controller/watchers/watchers.go @@ -3,11 +3,14 @@ package watchers import ( "context" "fmt" + "time" appsv1 "k8s.io/api/apps/v1" v1 "k8s.io/api/core/v1" apiequality "k8s.io/apimachinery/pkg/api/equality" + 
"k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/util/workqueue" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/event" @@ -155,7 +158,24 @@ func (h *ConfigMapUpdateHandler) Create(ctx context.Context, evt event.CreateEve } } - // Fetch the OLSConfig CR to check if this configmap should be watched + // FIRST: Check if this is a system ConfigMap (before checking CR reference) + // This handles cases where system ConfigMaps like openshift-service-ca.crt are recreated + watcherConfig, _ := h.Reconciler.GetWatcherConfig().(*utils.WatcherConfig) + if watcherConfig != nil { + for _, systemCM := range watcherConfig.ConfigMaps.SystemResources { + if cm.GetNamespace() == systemCM.Namespace && cm.GetName() == systemCM.Name { + // This is a system ConfigMap - trigger restart directly + h.Reconciler.GetLogger().Info("Detected system configmap recreation", + "configmap", systemCM.Name, + "namespace", systemCM.Namespace, + "description", systemCM.Description) + ConfigMapWatcherFilter(h.Reconciler, ctx, obj) + return + } + } + } + + // SECOND: Check if this configmap is referenced in the CR (for user-provided configmaps) cr := &olsv1alpha1.OLSConfig{} err := h.Reconciler.Get(ctx, types.NamespacedName{Name: utils.OLSConfigName}, cr) if err != nil { @@ -368,20 +388,170 @@ var restartFuncs = map[string]RestartFunc{ utils.ConsoleUIDeploymentName: console.RestartConsoleUI, } -// restart corresponding deployment -func restartDeployment(r reconciler.Reconciler, ctx context.Context, affectedDeployments []string, namespace string, name string, useLCore bool) { +// deploymentExistsAndReady checks if a deployment exists and is in a stable state (not being created or updated) +// Returns true if deployment exists and the controller has observed the current generation +func deploymentExistsAndReady(r reconciler.Reconciler, ctx context.Context, deploymentName string) bool { + deployment 
:= &appsv1.Deployment{} + err := r.Get(ctx, client.ObjectKey{Name: deploymentName, Namespace: r.GetNamespace()}, deployment) + if err != nil || errors.IsNotFound(err) { + return false + } + // Check if deployment is in a stable state (not being created or updated) + // Deployment is considered ready if the controller has observed the current generation + // ObservedGeneration == Generation means the deployment is stable + return deployment.Status.ObservedGeneration == deployment.Generation +} + +// waitForDeploymentReady waits for a deployment to become ready after restart +func waitForDeploymentReady(r reconciler.Reconciler, ctx context.Context, deploymentName string, timeout time.Duration) error { + return wait.PollUntilContextTimeout(ctx, 2*time.Second, timeout, true, func(ctx context.Context) (bool, error) { + deployment := &appsv1.Deployment{} + err := r.Get(ctx, client.ObjectKey{Name: deploymentName, Namespace: r.GetNamespace()}, deployment) + if err != nil { + return false, err + } + + // Check if deployment is ready + if deployment.Status.ReadyReplicas == *deployment.Spec.Replicas && + deployment.Status.UpdatedReplicas == *deployment.Spec.Replicas && + deployment.Status.UnavailableReplicas == 0 { + // Check Available condition + for _, condition := range deployment.Status.Conditions { + if condition.Type == appsv1.DeploymentAvailable && condition.Status == v1.ConditionTrue { + return true, nil + } + } + } + return false, nil + }) +} +// restartDeployment restarts affected deployments. When conversationCache uses postgres, +// it ensures Postgres restarts first and waits for it to be ready before restarting app server. 
+func restartDeployment(r reconciler.Reconciler, ctx context.Context, affectedDeployments []string, namespace string, name string, useLCore bool) { + // Resolve ACTIVE_BACKEND to actual deployment name and build deployment list + resolvedDeployments := make([]string, 0, len(affectedDeployments)) for _, depName := range affectedDeployments { - // Resolve ACTIVE_BACKEND to actual deployment name if depName == "ACTIVE_BACKEND" { if useLCore { - depName = utils.LCoreDeploymentName + resolvedDeployments = append(resolvedDeployments, utils.LCoreDeploymentName) } else { - depName = utils.OLSAppServerDeploymentName + resolvedDeployments = append(resolvedDeployments, utils.OLSAppServerDeploymentName) } + } else { + resolvedDeployments = append(resolvedDeployments, depName) + } + } + + // Check if conversationCache uses postgres to determine restart strategy + cr := &olsv1alpha1.OLSConfig{} + usesPostgresCache := false + getErr := r.Get(ctx, types.NamespacedName{Name: utils.OLSConfigName}, cr) + if getErr == nil { + usesPostgresCache = cr.Spec.OLSConfig.ConversationCache.Type == olsv1alpha1.Postgres + } + + // Separate deployments into categories + var postgresDeployments []string + var appServerDeployments []string + var otherDeployments []string + + for _, depName := range resolvedDeployments { + switch depName { + case utils.PostgresDeploymentName: + postgresDeployments = append(postgresDeployments, depName) + case utils.OLSAppServerDeploymentName, utils.LCoreDeploymentName: + appServerDeployments = append(appServerDeployments, depName) + default: + otherDeployments = append(otherDeployments, depName) } + } + + // If conversationCache uses postgres and both postgres and app server need restart, + // handle postgres first, then wait, then app server + if usesPostgresCache && len(postgresDeployments) > 0 && len(appServerDeployments) > 0 { + // Check if deployments exist and are in a stable state before trying to restart them + // During initial setup, deployments might not 
exist yet or might still be being created + postgresReady := deploymentExistsAndReady(r, ctx, utils.PostgresDeploymentName) + appServerReady := false + for _, depName := range appServerDeployments { + if deploymentExistsAndReady(r, ctx, depName) { + appServerReady = true + break + } + } + + if !postgresReady || !appServerReady { + r.GetLogger().Info("skipping restart - deployments not ready yet", + "postgresReady", postgresReady, + "appServerReady", appServerReady, + "resource", name, "namespace", namespace) + return + } + + // Step 1: Restart Postgres first + for _, pgDepName := range postgresDeployments { + err := postgres.RestartPostgres(r, ctx) + if err != nil { + r.GetLogger().Error(err, "failed to restart postgres deployment", + "deployment", pgDepName, "resource", name, "namespace", namespace) + return + } + } + + // Step 2: Wait for Postgres to be ready before restarting app server + waitErr := waitForDeploymentReady(r, ctx, utils.PostgresDeploymentName, 5*time.Minute) + if waitErr != nil { + r.GetLogger().Error(waitErr, "postgres deployment did not become ready in time", + "resource", name, "namespace", namespace) + return + } + + // Step 3: Restart app server deployments after postgres is ready + for _, appDepName := range appServerDeployments { + var err error + switch appDepName { + case utils.OLSAppServerDeploymentName: + err = appserver.RestartAppServer(r, ctx) + case utils.LCoreDeploymentName: + err = lcore.RestartLCore(r, ctx) + default: + r.GetLogger().Info("unknown app server deployment name", "deployment", appDepName) + continue + } + + if err != nil { + r.GetLogger().Error(err, "failed to restart deployment", + "deployment", appDepName, "resource", name, "namespace", namespace) + } else { + r.GetLogger().Info("restarted deployment", + "deployment", appDepName, "resource", name, "namespace", namespace) + } + } + + // Step 4: Restart other deployments (e.g., ConsoleUI) + for _, otherDepName := range otherDeployments { + restartFunc, exists := 
restartFuncs[otherDepName] + if !exists { + r.GetLogger().Info("unknown deployment name", "deployment", otherDepName) + continue + } + + err := restartFunc(r, ctx) + if err != nil { + r.GetLogger().Error(err, "failed to restart deployment", + "deployment", otherDepName, "resource", name, "namespace", namespace) + } else { + r.GetLogger().Info("restarted deployment", + "deployment", otherDepName, "resource", name, "namespace", namespace) + } + } + + return + } - // Restart the deployment using the appropriate function + // Normal case: restart all deployments without postgres dependency + for _, depName := range resolvedDeployments { restartFunc, exists := restartFuncs[depName] if !exists { r.GetLogger().Info("unknown deployment name", "deployment", depName) @@ -392,7 +562,6 @@ func restartDeployment(r reconciler.Reconciler, ctx context.Context, affectedDep if err != nil { r.GetLogger().Error(err, "failed to restart deployment", "deployment", depName, "resource", name, "namespace", namespace) - // Continue with other deployments } else { r.GetLogger().Info("restarted deployment", "deployment", depName, "resource", name, "namespace", namespace) diff --git a/test/e2e/postgres_restart_test.go b/test/e2e/postgres_restart_test.go index 6646a93e4..b5b75549d 100644 --- a/test/e2e/postgres_restart_test.go +++ b/test/e2e/postgres_restart_test.go @@ -12,6 +12,8 @@ import ( corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ctrlclient "sigs.k8s.io/controller-runtime/pkg/client" + + olsv1alpha1 "github.com/openshift/lightspeed-operator/api/v1alpha1" ) func invokeOLS(env *OLSTestEnvironment, secret *corev1.Secret, query string, expected_success bool) { @@ -95,4 +97,86 @@ var _ = Describe("Postgres restart", Ordered, Label("Postgres restart"), func() By("Testing HTTPS POST on /v1/query endpoint by OLS user - should work after recovery") invokeOLS(env, secret, "how do I stop a VM?", true) }) + + It("should wait for postgres to be ready before starting app 
server when conversationCache is postgres", FlakeAttempts(3), func() { + By("Verifying conversation cache type is Postgres") + olsConfig := &olsv1alpha1.OLSConfig{ + ObjectMeta: metav1.ObjectMeta{ + Name: OLSCRName, + }, + } + err = env.Client.Get(olsConfig) + Expect(err).NotTo(HaveOccurred()) + if olsConfig.Spec.OLSConfig.ConversationCache.Type != olsv1alpha1.Postgres { + Skip("Skipping test - conversation cache type is not Postgres") + } + + postgresDeployment := &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: PostgresDeploymentName, + Namespace: OLSNameSpace, + }, + } + appServerDeployment := &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: AppServerDeploymentName, + Namespace: OLSNameSpace, + }, + } + + By("Recording initial app server generation") + err = env.Client.Get(appServerDeployment) + Expect(err).NotTo(HaveOccurred()) + initialAppServerGeneration := appServerDeployment.Generation + + By("Scaling down Postgres to trigger a restart") + shutdownPostgres(env) + + By("Waiting for Postgres to reconcile back to ready state") + Eventually(func() bool { + err = env.Client.Get(postgresDeployment) + if err != nil { + return false + } + return postgresDeployment.Status.ReadyReplicas == *postgresDeployment.Spec.Replicas && + postgresDeployment.Status.UpdatedReplicas == *postgresDeployment.Spec.Replicas + }, 5*time.Minute, 5*time.Second).Should(BeTrue(), "Postgres should become ready") + + By("Verifying App Server waits for Postgres before becoming ready") + // App Server should only become ready after Postgres is ready + Eventually(func() bool { + err = env.Client.Get(appServerDeployment) + if err != nil { + return false + } + // Check if app server has been updated/restarted + appServerUpdated := appServerDeployment.Generation > initialAppServerGeneration || + appServerDeployment.Status.UpdatedReplicas == *appServerDeployment.Spec.Replicas + + // Verify postgres is ready + err = env.Client.Get(postgresDeployment) + if err != nil { + return 
false + } + postgresReady := postgresDeployment.Status.ReadyReplicas == *postgresDeployment.Spec.Replicas + + // App server should only be ready if postgres is ready + if appServerUpdated { + Expect(postgresReady).To(BeTrue(), "Postgres must be ready before App Server becomes ready") + } + + return appServerDeployment.Status.ReadyReplicas == *appServerDeployment.Spec.Replicas + }, 8*time.Minute, 5*time.Second).Should(BeTrue(), "App Server should become ready after Postgres is ready") + + By("Verifying both deployments are fully operational") + err = env.Client.Get(postgresDeployment) + Expect(err).NotTo(HaveOccurred()) + Expect(postgresDeployment.Status.ReadyReplicas).To(Equal(*postgresDeployment.Spec.Replicas), + "Postgres should have all replicas ready") + + err = env.Client.Get(appServerDeployment) + Expect(err).NotTo(HaveOccurred()) + Expect(appServerDeployment.Status.ReadyReplicas).To(Equal(*appServerDeployment.Spec.Replicas), + "App Server should have all replicas ready") + }) })