diff --git a/.github/workflows/ci-external-config.yaml b/.github/workflows/ci-external-config.yaml
new file mode 100644
index 00000000..39eaed48
--- /dev/null
+++ b/.github/workflows/ci-external-config.yaml
@@ -0,0 +1,100 @@
+# CI for the external Envoy configuration path: installs the chart into a Kind
+# cluster with envoy.external_config.load_from_configmap=true and then runs the
+# perf-analyzer job against the deployment.
+name: ci [external-config]
+
+on:
+  push:
+    branches:
+      - "**"
+  pull_request:
+    branches:
+      - "main"
+
+jobs:
+  deploy-at-github:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+
+      - name: Set up Kubernetes cluster with Kind
+        uses: helm/kind-action@v1.6.0
+        with:
+          cluster_name: gh-k8s-cluster
+
+      - name: Set up Helm
+        uses: azure/setup-helm@v3
+        with:
+          version: v3.12.0
+
+      - name: Create CMS namespace
+        run: |
+          kubectl create namespace cms
+
+      - name: Install Prometheus Operator CRDs
+        run: |
+          helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
+          helm repo update
+          kubectl create namespace monitoring
+          helm install prometheus-operator prometheus-community/kube-prometheus-stack --namespace monitoring --set prometheusOperator.createCustomResource=false --set defaultRules.create=false --set alertmanager.enabled=false --set prometheus.enabled=false --set grafana.enabled=false
+
+      - name: Install KEDA Autoscaler
+        run: |
+          helm repo add kedacore https://kedacore.github.io/charts
+          helm repo update
+          kubectl create namespace keda
+          helm install keda kedacore/keda --namespace keda
+
+      - name: Mount CVMFS
+        run: |
+          kubectl create namespace cvmfs-csi
+          helm install -n cvmfs-csi cvmfs-csi oci://registry.cern.ch/kubernetes/charts/cvmfs-csi --values cvmfs/values-cvmfs-csi.yaml
+          kubectl apply -f cvmfs/cvmfs-storageclass.yaml -n cvmfs-csi
+
+      # Must run before the chart install: the Envoy ConfigMap template reads
+      # this ConfigMap via `lookup` while rendering and fails if it is missing.
+      - name: Create external Envoy ConfigMap
+        run: |
+          kubectl apply -f tests/envoy-config-test.yaml -n cms
+
+      - name: Deploy Helm chart with external Envoy config
+        run: |
+          helm repo add grafana https://grafana.github.io/helm-charts
+          helm repo add opentelemetry https://open-telemetry.github.io/opentelemetry-helm-charts
+          helm dependency build ./helm/supersonic
+          helm upgrade --install supersonic ./helm/supersonic \
+            --values tests/values-external-envoy-config.yaml -n cms
+
+      - name: CVMFS Mount ready
+        run: |
+          kubectl wait --for condition=Ready pod --all -n cvmfs-csi --timeout 120s
+
+      - name: Envoy proxy ready
+        run: |
+          kubectl wait --for condition=Ready pod -l app.kubernetes.io/component=envoy --timeout 120s -n cms
+
+      - name: Triton server ready
+        run: |
+          kubectl describe pod -l app.kubernetes.io/component=triton -n cms
+          kubectl wait --for condition=Ready pod -l app.kubernetes.io/component=triton --timeout 500s -n cms
+
+      - name: Validate Deployment
+        run: |
+          kubectl get all -n cms
+
+      - name: Run Perf Analyzer Job
+        run: |
+          kubectl apply -f tests/perf-analyzer-job-ci.yaml -n cms
+          kubectl wait --for=condition=complete job/perf-analyzer-job -n cms --timeout=300s || \
+            (echo "Perf-analyzer job did not complete in time or failed." && exit 1)
+
+          POD_NAME=$(kubectl get pods -n cms -l job-name=perf-analyzer-job -o jsonpath="{.items[0].metadata.name}")
+          echo "========== Perf Analyzer Logs =========="
+          kubectl logs -n cms "$POD_NAME"
+          echo "========================================"
+
+      # Tear the Kind cluster down even when an earlier step failed.
+      - name: Cleanup
+        if: always()
+        run: kind delete cluster --name gh-k8s-cluster
diff --git a/docs/.values-table.md b/docs/.values-table.md
index b9082b9b..78b6fe85 100644
--- a/docs/.values-table.md
+++ b/docs/.values-table.md
@@ -27,6 +27,10 @@
 | envoy.replicas | int | `1` | Number of Envoy Proxy pods in Deployment |
 | envoy.image | string | `"envoyproxy/envoy:v1.30.9"` | Envoy Proxy Docker image |
 | envoy.args | list | `["--config-path","/etc/envoy/envoy.yaml","--log-level","info","--log-path","/dev/stdout"]` | Arguments for Envoy |
+| envoy.external_config | object | `{"configmap_key":"","configmap_name":"","load_from_configmap":false}` | External Envoy configuration settings |
+| envoy.external_config.load_from_configmap | bool | `false` | If true, load Envoy configuration from an external ConfigMap instead of generating it
dynamically | +| envoy.external_config.configmap_name | string | `""` | Name of the external ConfigMap containing the Envoy configuration | +| envoy.external_config.configmap_key | string | `""` | Key name in the external ConfigMap (defaults to "envoy.yaml") | | envoy.resources | object | `{"limits":{"cpu":8,"memory":"4G"},"requests":{"cpu":1,"memory":"2G"}}` | Resource requests and limits for Envoy Proxy. Note: an Envoy Proxy with too many connections might run out of CPU | | envoy.annotations | object | `{}` | Annotations for Envoy pods | | envoy.nodeSelector | object | `{}` | Node selector for Envoy pods | diff --git a/docs/configuration-guide.rst b/docs/configuration-guide.rst index 4c4788dc..4f6011a9 100644 --- a/docs/configuration-guide.rst +++ b/docs/configuration-guide.rst @@ -149,6 +149,19 @@ There are two options: In this case, the client connections should be established to ``:8001`` and NOT use SSL. +Some Envoy Proxy parameters, such as load balancing policy, rate limiting, and authentication, +can be configured directly in the ``values.yaml`` file as described in the sections below. + +Alternatively, you can provide an external Envoy configuration file to override the +default configuration completely (the configuration file must be supplied as a ConfigMap): + +.. code-block:: yaml + + envoy: + external_config: + load_from_configmap: true + configmap_name: external-envoy-config + configmap_key: envoy.yaml 5.
(Optional) Configure Rate Limiting in Envoy Proxy
 ======================================================
diff --git a/helm/supersonic/templates/envoy/configmaps.yaml b/helm/supersonic/templates/envoy/configmaps.yaml
index 06e80815..63b2132e 100644
--- a/helm/supersonic/templates/envoy/configmaps.yaml
+++ b/helm/supersonic/templates/envoy/configmaps.yaml
@@ -1,4 +1,5 @@
 {{- /* Define the Envoy configuration in YAML at the top level */}}
+{{- /* (it will be used only if envoy.external_config.load_from_configmap is false) */}}
 {{- define "envoy.configuration.yaml" }}
 {{- with . }}
 admin:
@@ -182,6 +183,11 @@ static_resources:
 
 {{- /* Begin iterating over servers */}}
 {{- if .Values.envoy.enabled }}
+{{- /* Declare $envoyContext up front so it stays in scope for the ConfigMap below even when the external-config branch never populates it */}}
+{{- $envoyContext := dict }}
+
+{{- if not .Values.envoy.external_config.load_from_configmap }}
+{{- /* Only prepare dynamic configuration context when not using external config */}}
 {{- /* Define variables for ports */}}
 {{- $tritonGrpcPort := "" -}}
 {{- $envoyGrpcPort := "" -}}
@@ -202,7 +208,7 @@ static_resources:
 {{- $tritonName := include "supersonic.tritonName" . }}
 
 {{- /* Create a context to pass to the template */}}
-{{- $envoyContext := dict
+{{- $envoyContext = dict
     "envoyAdminPort" $envoyAdminPort
     "envoyGrpcPort" $envoyGrpcPort
     "tritonGrpcPort" $tritonGrpcPort
@@ -212,6 +218,7 @@ static_resources:
     "tritonName" $tritonName
     "root" .
 }}
+{{- end }}
 
 apiVersion: v1
 kind: ConfigMap
@@ -223,10 +230,37 @@ metadata:
     app.kubernetes.io/component: envoy
 data:
   envoy.yaml: |-
+{{- if .Values.envoy.external_config.load_from_configmap }}
+  {{- if .Values.envoy.external_config.configmap_name }}
+    {{- /* Copy the configuration out of the external ConfigMap at render time. NOTE: lookup queries the live cluster and returns an empty result under "helm template" and client-side --dry-run, which ends in the "not found" failure below */}}
+    {{- $configmapName := .Values.envoy.external_config.configmap_name }}
+    {{- $dataKey := .Values.envoy.external_config.configmap_key | default "envoy.yaml" }}
+    {{- $externalConfig := (lookup "v1" "ConfigMap" .Release.Namespace $configmapName) }}
+    {{- if $externalConfig }}
+      {{- if $externalConfig.data }}
+        {{- if hasKey $externalConfig.data $dataKey }}
+          {{- /* Use the data from the external ConfigMap */}}
+          {{- (get $externalConfig.data $dataKey) | nindent 4 }}
+        {{- else }}
+          {{- fail (printf "Expected key '%s' not found in ConfigMap '%s/%s'" $dataKey .Release.Namespace $configmapName) }}
+        {{- end }}
+      {{- else }}
+        {{- fail (printf "No data found in ConfigMap '%s/%s'" .Release.Namespace $configmapName) }}
+      {{- end }}
+    {{- else }}
+      {{- fail (printf "External ConfigMap '%s/%s' not found. Please ensure the ConfigMap exists." .Release.Namespace $configmapName) }}
+    {{- end }}
+  {{- else }}
+    {{- /* Error if load_from_configmap is true but configmap_name is empty */}}
+    {{- fail "envoy.external_config.configmap_name must be specified when envoy.external_config.load_from_configmap is true" }}
+  {{- end }}
+{{- else }}
+{{- /* Use dynamically generated configuration */}}
 {{ include "envoy.configuration.yaml" $envoyContext | indent 4 }}
+{{- end }}
 ---
 
-{{- if .Values.envoy.rate_limiter.prometheus_based.enabled }}
+{{- if and .Values.envoy.rate_limiter.prometheus_based.enabled (not .Values.envoy.external_config.load_from_configmap) }}
 {{- /* Create a ConfigMap for the Lua filter */}}
 apiVersion: v1
 kind: ConfigMap
diff --git a/helm/supersonic/values.schema.json b/helm/supersonic/values.schema.json
index 089419e7..993c0792 100644
--- a/helm/supersonic/values.schema.json
+++ b/helm/supersonic/values.schema.json
@@ -252,6 +252,25 @@
                         "type": "string"
                     }
                 },
+                "external_config": {
+                    "type": "object",
+                    "properties": {
+                        "load_from_configmap": {
+                            "type": "boolean"
+                        },
+                        "configmap_name": {
+                            "type": "string"
+                        },
+                        "configmap_key": {
+                            "type": "string"
+                        }
+                    },
+                    "required": [
+                        "configmap_key",
+                        "configmap_name",
+                        "load_from_configmap"
+                    ]
+                },
                 "resources": {
                     "type": "object",
                     "properties": {
@@ -450,6 +469,7 @@
                 "args",
                 "auth",
                 "enabled",
+                "external_config",
                 "grpc_route_timeout",
                 "image",
                 "ingress",
diff --git a/helm/supersonic/values.yaml b/helm/supersonic/values.yaml
index 6ac4ef07..ca3e2266 100644
--- a/helm/supersonic/values.yaml
+++ b/helm/supersonic/values.yaml
@@ -125,6 +125,15 @@ envoy:
   # -- Arguments for Envoy
   args: ["--config-path", "/etc/envoy/envoy.yaml", "--log-level", "info", "--log-path", "/dev/stdout"]
 
+  # -- External Envoy configuration settings
+  external_config:
+    # -- If true, load Envoy configuration from an external ConfigMap instead of generating it dynamically
+    load_from_configmap: false
+    # -- Name of the external ConfigMap containing the Envoy configuration
+    configmap_name: ""
+    # -- Key name in the external ConfigMap (defaults to "envoy.yaml")
+    configmap_key: ""
+
   # -- Resource requests and limits for Envoy Proxy.
   # Note: an Envoy Proxy with too many connections might run out of CPU
   resources:
diff --git a/tests/envoy-config-test.yaml b/tests/envoy-config-test.yaml
new file mode 100644
index 00000000..793e249e
--- /dev/null
+++ b/tests/envoy-config-test.yaml
@@ -0,0 +1,64 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: external-envoy-config
+  namespace: cms
+data:
+  envoy.yaml: |
+    # External Envoy configuration for CI testing
+    admin:
+      access_log_path: /tmp/admin_access.log
+      address:
+        socket_address:
+          address: 0.0.0.0
+          port_value: 9901
+
+    static_resources:
+      listeners:
+        - name: listener_grpc
+          address:
+            socket_address:
+              address: 0.0.0.0
+              port_value: 8001
+
+          filter_chains:
+            - filters:
+                - name: envoy.filters.network.http_connection_manager
+                  typed_config:
+                    "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager
+                    generate_request_id: true
+                    stat_prefix: ingress_grpc
+                    codec_type: AUTO
+                    route_config:
+                      name: local_route_grpc
+                      virtual_hosts:
+                        - name: backend_grpc
+                          domains: ["*"]
+                          routes:
+                            - match:
+                                prefix: "/"
+                              route:
+                                cluster: triton_grpc_service
+                                timeout: 0s
+
+                    http_filters:
+                      - name: envoy.filters.http.router
+                        typed_config:
+                          "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router
+                    http2_protocol_options: {}
+
+      clusters:
+        - name: triton_grpc_service
+          connect_timeout: 0.25s
+          type: STRICT_DNS
+          lb_policy: LEAST_REQUEST
+          http2_protocol_options: {}
+          load_assignment:
+            cluster_name: triton_grpc_service
+            endpoints:
+              - lb_endpoints:
+                  - endpoint:
+                      address:
+                        socket_address:
+                          address: supersonic-triton
+                          port_value: 8001
diff --git a/tests/values-external-envoy-config.yaml b/tests/values-external-envoy-config.yaml
new file mode 100644
index 00000000..bcff6586
--- /dev/null
+++ b/tests/values-external-envoy-config.yaml
@@ -0,0 +1,47 @@
+# Test values for external Envoy configuration testing
+
+triton:
+  replicas: 1
+  image: fastml/triton-torchgeo:22.07-py3-geometric # works for CMSSW run3
+  command: ["/bin/sh", "-c"]
+  args:
+    - |
+      /opt/tritonserver/bin/tritonserver \
+      --model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoBTag/Combined/data/models/ \
+      --log-verbose=0 \
+      --disable-auto-complete-config \
+      --exit-timeout-secs=60
+  resources:
+    limits: { cpu: 1, memory: 3Gi }
+    requests: { cpu: 1, memory: 1Gi }
+  modelRepository:
+    enabled: true
+    storageType: cvmfs-pvc
+    mountPath: /cvmfs
+  readinessProbe:
+    reset: true
+
+envoy:
+  enabled: true
+  external_config:
+    load_from_configmap: true
+    configmap_name: external-envoy-config
+    configmap_key: envoy.yaml
+
+prometheus:
+  enabled: false
+
+grafana:
+  enabled: false
+
+keda:
+  enabled: false
+
+tempo:
+  enabled: false
+
+opentelemetry-collector:
+  enabled: false
+
+metricsCollector:
+  enabled: false