From 7148c0932819ab65bfff1a73902c3196d45d595e Mon Sep 17 00:00:00 2001 From: kondratyevd Date: Tue, 29 Jul 2025 16:10:03 -0400 Subject: [PATCH 01/27] remove plugin from documentation and create a CI job to test installation instructions --- ...aller-plugin.yaml => ci-installation.yaml} | 15 ++++----- README.md | 26 ++-------------- docs/advanced-monitoring.rst | 3 +- docs/getting-started.rst | 31 +++---------------- 4 files changed, 16 insertions(+), 59 deletions(-) rename .github/workflows/{ci-github-installer-plugin.yaml => ci-installation.yaml} (78%) diff --git a/.github/workflows/ci-github-installer-plugin.yaml b/.github/workflows/ci-installation.yaml similarity index 78% rename from .github/workflows/ci-github-installer-plugin.yaml rename to .github/workflows/ci-installation.yaml index 85295b9e..389f97a8 100644 --- a/.github/workflows/ci-github-installer-plugin.yaml +++ b/.github/workflows/ci-installation.yaml @@ -1,4 +1,4 @@ -name: ci [installer plugin] +name: ci [installation] on: push: @@ -9,7 +9,7 @@ on: - "main" jobs: - test-installer-plugin: + test-installation: runs-on: ubuntu-latest steps: - name: Checkout code @@ -25,9 +25,9 @@ jobs: with: version: v3.12.0 - - name: Create CMS namespace + - name: Create test namespace run: | - kubectl create namespace cms + kubectl create namespace test-ns - name: Install Prometheus Operator CRDs run: | @@ -36,7 +36,7 @@ jobs: kubectl create namespace monitoring helm install prometheus-operator prometheus-community/kube-prometheus-stack --namespace monitoring --set prometheusOperator.createCustomResource=false --set defaultRules.create=false --set alertmanager.enabled=false --set prometheus.enabled=false --set grafana.enabled=false - - name: Install KEDA Autoscaler + - name: Install KEDA Autoscaler CRDs run: | helm repo add kedacore https://kedacore.github.io/charts helm repo update @@ -45,8 +45,9 @@ jobs: - name: Install SuperSONIC from remote repo via plugin run: | - helm plugin install . - helm install-supersonic supersonic --local --values values/values-cms-ci.yaml -n cms + helm repo add fastml https://github.com/fastmachinelearning/SuperSONIC/ + helm repo update + helm install supersonic fastml/supersonic -n test-ns -f values/values-cms-ci.yaml - name: Cleanup run: kind delete cluster --name gh-k8s-cluster \ No newline at end of file diff --git a/README.md b/README.md index e63ca654..15c28a52 100644 --- a/README.md +++ b/README.md @@ -31,30 +31,10 @@ The main components of SuperSONIC are: ## Installation -The installation is done via a custom Helm plugin which takes care of -internal connectivity of the chart components. Standard Helm installation -is also supported, but requires a lot more manual configuration. - -``` -helm plugin install https://github.com/fastmachinelearning/SuperSONIC/ -helm install-supersonic -n -f ``` - -Installer plugin usage: -``` -Usage: - helm install-supersonic [RELEASE_NAME] [flags] - -Flags: - -h, --help Show this help message - -f, --values Specify values file for custom configuration - -n, --namespace Specify Kubernetes namespace for deployment - --version Specify chart version (default: latest version) - Note: Ignored if --local flag is set - --local Install from local chart path instead of remote repository - --path Local chart path (default: ./helm/supersonic) - Only used when --local flag is set -Additional flags will be passed directly to the 'helm install' command +helm repo add fastml https://github.com/fastmachinelearning/SuperSONIC/ +helm repo update +helm install fastml/supersonic -n -f ``` To construct the `values.yaml` file for your application, follow [Configuration guide](http://fastmachinelearning.org/SuperSONIC/configuration-guide.html "Configuration guide"). diff --git a/docs/advanced-monitoring.rst b/docs/advanced-monitoring.rst index 13e63a9f..632789cc 100644 --- a/docs/advanced-monitoring.rst +++ b/docs/advanced-monitoring.rst @@ -43,8 +43,7 @@ Displaying Tracing Data in Grafana If Grafana is enabled in your ``values.yaml``, you can display the tracing data in the Grafana dashboard. In order to achieve this, Grafana needs to have a -Tempo datasource configured. This is done automatically when you install -SuperSONIC via the ``install-supersonic`` plugin. +Tempo datasource configured. If OpenTelemetry Collector and Tempo are enabled, the default Grafana dashboard will include an interactive server map, where you can study tracing data in detail diff --git a/docs/getting-started.rst b/docs/getting-started.rst index 51c246fe..ec36ac8b 100644 --- a/docs/getting-started.rst +++ b/docs/getting-started.rst @@ -19,41 +19,18 @@ Installation - `Configuration reference `_ - `Example values.yaml files `_ - 2. Install Helm plugin to handle SuperSONIC installation + 2. Install Helm repository .. code:: shell - helm plugin install https://github.com/fastmachinelearning/SuperSONIC/ - - - The Helm plugin is needed to ensure internal connectivity of the SuperSONIC - components. Standard Helm installation without a plugin is also supported, - but requires a lot more manual configuration. + helm repo add fastml https://github.com/fastmachinelearning/SuperSONIC/ + helm repo update 3. Modify the following command to install the chart at your cluster: .. code:: shell - helm install-supersonic -n -f - - Installer plugin usage: - - .. code:: shell - - Usage: - helm install-supersonic [RELEASE_NAME] [flags] - - Flags: - -h, --help Show this help message - -f, --values Specify values file for custom configuration - -n, --namespace Specify Kubernetes namespace for deployment - --version Specify chart version (default: latest version) - Note: Ignored if --local flag is set - --local Install from local chart path instead of remote repository - --path Local chart path (default: ./helm/supersonic) - Only used when --local flag is set - Additional flags will be passed directly to the 'helm install' command - + helm install fastml/supersonic -n -f Use a unique meaningful lowercase value as , for example ``supersonic-cms-run3``. From b449a77b2a82adecb1cbb77af87cbf4c87cd06f1 Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Tue, 29 Jul 2025 20:10:34 +0000 Subject: [PATCH 02/27] Update helm docs --- helm/supersonic/README.md | 26 +++----------------------- 1 file changed, 3 insertions(+), 23 deletions(-) diff --git a/helm/supersonic/README.md b/helm/supersonic/README.md index e63ca654..15c28a52 100644 --- a/helm/supersonic/README.md +++ b/helm/supersonic/README.md @@ -31,30 +31,10 @@ The main components of SuperSONIC are: ## Installation -The installation is done via a custom Helm plugin which takes care of -internal connectivity of the chart components. Standard Helm installation -is also supported, but requires a lot more manual configuration. - -``` -helm plugin install https://github.com/fastmachinelearning/SuperSONIC/ -helm install-supersonic -n -f ``` - -Installer plugin usage: -``` -Usage: - helm install-supersonic [RELEASE_NAME] [flags] - -Flags: - -h, --help Show this help message - -f, --values Specify values file for custom configuration - -n, --namespace Specify Kubernetes namespace for deployment - --version Specify chart version (default: latest version) - Note: Ignored if --local flag is set - --local Install from local chart path instead of remote repository - --path Local chart path (default: ./helm/supersonic) - Only used when --local flag is set -Additional flags will be passed directly to the 'helm install' command +helm repo add fastml https://github.com/fastmachinelearning/SuperSONIC/ +helm repo update +helm install fastml/supersonic -n -f ``` To construct the `values.yaml` file for your application, follow [Configuration guide](http://fastmachinelearning.org/SuperSONIC/configuration-guide.html "Configuration guide"). From 1311da54764bb26f63791ca5af7ff4536734601a Mon Sep 17 00:00:00 2001 From: kondratyevd Date: Tue, 29 Jul 2025 16:14:10 -0400 Subject: [PATCH 03/27] fix repo URL --- .github/workflows/ci-installation.yaml | 2 +- README.md | 2 +- docs/getting-started.rst | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci-installation.yaml b/.github/workflows/ci-installation.yaml index 389f97a8..a33da8f0 100644 --- a/.github/workflows/ci-installation.yaml +++ b/.github/workflows/ci-installation.yaml @@ -45,7 +45,7 @@ jobs: - name: Install SuperSONIC from remote repo via plugin run: | - helm repo add fastml https://github.com/fastmachinelearning/SuperSONIC/ + helm repo add fastml https://fastmachinelearning.org/SuperSONIC/ helm repo update helm install supersonic fastml/supersonic -n test-ns -f values/values-cms-ci.yaml diff --git a/README.md b/README.md index 15c28a52..ff7b3362 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ The main components of SuperSONIC are: ## Installation ``` -helm repo add fastml https://github.com/fastmachinelearning/SuperSONIC/ +helm repo add fastml https://fastmachinelearning.org/SuperSONIC helm repo update helm install fastml/supersonic -n -f ``` diff --git a/docs/getting-started.rst b/docs/getting-started.rst index ec36ac8b..bccb249a 100644 --- a/docs/getting-started.rst +++ b/docs/getting-started.rst @@ -23,7 +23,7 @@ Installation .. code:: shell - helm repo add fastml https://github.com/fastmachinelearning/SuperSONIC/ + helm repo add fastml https://fastmachinelearning.org/SuperSONIC/ helm repo update 3. Modify the following command to install the chart at your cluster: From 4f6a0b55e9b9ded5205f9d01d9f33d576157721b Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Tue, 29 Jul 2025 20:14:46 +0000 Subject: [PATCH 04/27] Update helm docs --- helm/supersonic/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/supersonic/README.md b/helm/supersonic/README.md index 15c28a52..ff7b3362 100644 --- a/helm/supersonic/README.md +++ b/helm/supersonic/README.md @@ -32,7 +32,7 @@ The main components of SuperSONIC are: ## Installation ``` -helm repo add fastml https://github.com/fastmachinelearning/SuperSONIC/ +helm repo add fastml https://fastmachinelearning.org/SuperSONIC helm repo update helm install fastml/supersonic -n -f ``` From 75128be51dd8320bcf0faf44008f2569c6e550d1 Mon Sep 17 00:00:00 2001 From: kondratyevd Date: Tue, 29 Jul 2025 16:29:39 -0400 Subject: [PATCH 05/27] expand values.yaml for CMS Geddes config --- values/values-geddes-cms.yaml | 66 ++++++++++++++++++++++++++++++++--- 1 file changed, 61 insertions(+), 5 deletions(-) diff --git a/values/values-geddes-cms.yaml b/values/values-geddes-cms.yaml index 453ae6e6..a602f624 100644 --- a/values/values-geddes-cms.yaml +++ b/values/values-geddes-cms.yaml @@ -1,4 +1,5 @@ -serverLoadThreshold: 100 +serverLoadThreshold: 20 +serverLoadMetric: 'sum by (release) (rate(nv_inference_queue_duration_us{release=~"sonic-server"}[30s]) / (rate(nv_inference_exec_count{release=~"sonic-server"}[30s]) * 1000 + 0.001))' triton: # image: fastml/triton-torchgeo:21.02-py3-geometric # run2 @@ -15,6 +16,12 @@ triton: --log-verbose=0 \ --strict-model-config=false \ --exit-timeout-secs=60 + # --trace-config mode=opentelemetry + # --trace-config=opentelemetry,resource=pod_name=$(hostname) + # --trace-config opentelemetry,url=supersonic-opentelemetry-collector:4318/v1/traces + # --trace-config rate=0.0001 + # --trace-config level=TIMESTAMPS + # --trace-config count=-1 resources: limits: { nvidia.com/gpu: 1, cpu: 2, memory: 4G} requests: { nvidia.com/gpu: 1, cpu: 2, memory: 4G} @@ -57,14 +64,63 @@ tolerations: prometheus: enabled: true server: + useExistingClusterRoleName: supersonic-prometheus-role ingress: enabled: true - hostName: prometheus-cms.geddes.rcac.purdue.edu + hosts: + - prometheus-cms.geddes.rcac.purdue.edu + tls: + - hosts: + - prometheus-cms.geddes.rcac.purdue.edu ingressClassName: public + serviceAccounts: + server: + name: supersonic-prometheus-sa grafana: enabled: true + dashboardsConfigMaps: + default: supersonic-grafana-default-dashboard + datasources: + datasources.yaml: + datasources: + - name: prometheus + type: prometheus + access: proxy + isDefault: true + url: http://supersonic-prometheus-server:9090 + jsonData: + timeInterval: "5s" + tlsSkipVerify: true + - name: tempo + type: tempo + url: http://supersonic-tempo:3100 + access: proxy + isDefault: false + basicAuth: false + jsonData: + timeInterval: "5s" + tlsSkipVerify: true + serviceMap: + datasourceUid: 'prometheus' + nodeGraph: + enabled: true ingress: - enabled: true - hostName: grafana-cms.geddes.rcac.purdue.edu - ingressClassName: public \ No newline at end of file + hosts: + - grafana-cms.geddes.rcac.purdue.edu + tls: + - hosts: + - grafana-cms.geddes.rcac.purdue.edu + ingressClassName: public + grafana.ini: + server: + root_url: https://grafana-cms.geddes.rcac.purdue.edu + serviceAccounts: + grafana: + name: supersonic-grafana-sa + +opentelemetry-collector: + enabled: true + +tempo: + enabled: true \ No newline at end of file From aae90a79c7c98cdefacc3edaeada4dfd55cb2dff Mon Sep 17 00:00:00 2001 From: kondratyevd Date: Tue, 29 Jul 2025 16:41:06 -0400 Subject: [PATCH 06/27] don't validate prometheus --- helm/supersonic/templates/NOTES.txt | 1 - helm/supersonic/templates/_helpers/_prometheus.tpl | 7 ------- 2 files changed, 8 deletions(-) diff --git a/helm/supersonic/templates/NOTES.txt b/helm/supersonic/templates/NOTES.txt index 8c89dc51..57f24f60 100644 --- a/helm/supersonic/templates/NOTES.txt +++ b/helm/supersonic/templates/NOTES.txt @@ -2,7 +2,6 @@ {{- /* Run validation checks */ -}} {{- include "supersonic.validateGrafanaAddressConsistency" . -}} {{- include "supersonic.validateGrafanaValues" . -}} -{{- include "supersonic.validatePrometheus" . -}} {{- include "supersonic.validatePrometheusAddressConsistency" . -}} {{- include "supersonic.validatePrometheusValues" . -}} diff --git a/helm/supersonic/templates/_helpers/_prometheus.tpl b/helm/supersonic/templates/_helpers/_prometheus.tpl index 2a7ffa8a..312d53da 100644 --- a/helm/supersonic/templates/_helpers/_prometheus.tpl +++ b/helm/supersonic/templates/_helpers/_prometheus.tpl @@ -62,13 +62,6 @@ Check if Prometheus exists in the namespace {{- include "supersonic.common.serviceExists" (dict "serviceName" "prometheus" "root" .) -}} {{- end -}} -{{/* -Validate that there is no existing Prometheus instance when enabling a new one -*/}} -{{- define "supersonic.validatePrometheus" -}} -{{- include "supersonic.common.validateNoExistingService" (dict "serviceType" "prometheus" "values" .Values "root" .) -}} -{{- end -}} - {{/* Validate RBAC permissions for Prometheus */}} From 83cd87d0199912eeb4a7a6129754824b717c75f0 Mon Sep 17 00:00:00 2001 From: kondratyevd Date: Tue, 29 Jul 2025 16:41:16 -0400 Subject: [PATCH 07/27] instructions to install from GitHub --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index ff7b3362..85837a4a 100644 --- a/README.md +++ b/README.md @@ -41,6 +41,14 @@ To construct the `values.yaml` file for your application, follow [Configuration The full list of configuration parameters is available in the [Configuration reference](http://fastmachinelearning.org/SuperSONIC/configuration-reference.html "Configuration reference"). +### Install from GitHub + +``` +git clone https://github.com/fastmachinelearning/SuperSONIC.git +cd SuperSONIC +helm dependency build helm/supersonic +helm install helm/supersonic -n -f +``` ## Server diagram From 0bbc22e2121c2be1a7ea77baf416ec424c3d8948 Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Tue, 29 Jul 2025 20:41:46 +0000 Subject: [PATCH 08/27] Update helm docs --- helm/supersonic/README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/helm/supersonic/README.md b/helm/supersonic/README.md index ff7b3362..85837a4a 100644 --- a/helm/supersonic/README.md +++ b/helm/supersonic/README.md @@ -41,6 +41,14 @@ To construct the `values.yaml` file for your application, follow [Configuration The full list of configuration parameters is available in the [Configuration reference](http://fastmachinelearning.org/SuperSONIC/configuration-reference.html "Configuration reference"). +### Install from GitHub + +``` +git clone https://github.com/fastmachinelearning/SuperSONIC.git +cd SuperSONIC +helm dependency build helm/supersonic +helm install helm/supersonic -n -f +``` ## Server diagram From 1b20f4e708c64e859e6b7593e15294b1e4c7c2d8 Mon Sep 17 00:00:00 2001 From: kondratyevd Date: Tue, 29 Jul 2025 16:46:53 -0400 Subject: [PATCH 09/27] update cms values --- values/values-geddes-cms.yaml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/values/values-geddes-cms.yaml b/values/values-geddes-cms.yaml index a602f624..2af0c764 100644 --- a/values/values-geddes-cms.yaml +++ b/values/values-geddes-cms.yaml @@ -18,7 +18,7 @@ triton: --exit-timeout-secs=60 # --trace-config mode=opentelemetry # --trace-config=opentelemetry,resource=pod_name=$(hostname) - # --trace-config opentelemetry,url=supersonic-opentelemetry-collector:4318/v1/traces + # --trace-config opentelemetry,url=sonic-server-opentelemetry-collector:4318/v1/traces # --trace-config rate=0.0001 # --trace-config level=TIMESTAMPS # --trace-config count=-1 @@ -64,7 +64,7 @@ tolerations: prometheus: enabled: true server: - useExistingClusterRoleName: supersonic-prometheus-role + useExistingClusterRoleName: sonic-server-prometheus-role ingress: enabled: true hosts: @@ -75,12 +75,12 @@ prometheus: ingressClassName: public serviceAccounts: server: - name: supersonic-prometheus-sa + name: sonic-server-prometheus-sa grafana: enabled: true dashboardsConfigMaps: - default: supersonic-grafana-default-dashboard + default: sonic-server-grafana-default-dashboard datasources: datasources.yaml: datasources: @@ -88,13 +88,13 @@ grafana: type: prometheus access: proxy isDefault: true - url: http://supersonic-prometheus-server:9090 + url: http://sonic-server-prometheus-server:9090 jsonData: timeInterval: "5s" tlsSkipVerify: true - name: tempo type: tempo - url: http://supersonic-tempo:3100 + url: http://sonic-server-tempo:3100 access: proxy isDefault: false basicAuth: false @@ -117,7 +117,7 @@ grafana: root_url: https://grafana-cms.geddes.rcac.purdue.edu serviceAccounts: grafana: - name: supersonic-grafana-sa + name: sonic-server-grafana-sa opentelemetry-collector: enabled: true From 8c2a376166cbef0facd47885f102aa9c6050888d Mon Sep 17 00:00:00 2001 From: kondratyevd Date: Tue, 29 Jul 2025 16:47:09 -0400 Subject: [PATCH 10/27] configure tracing --- values/values-geddes-cms.yaml | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/values/values-geddes-cms.yaml b/values/values-geddes-cms.yaml index 2af0c764..f1278b0c 100644 --- a/values/values-geddes-cms.yaml +++ b/values/values-geddes-cms.yaml @@ -121,6 +121,17 @@ grafana: opentelemetry-collector: enabled: true - + config: + exporters: + otlp: + endpoint: http://sonic-server-tempo:4317 + otlphttp: + endpoint: http://sonic-server-tempo:4318 + prometheusremotewrite: + endpoint: http://sonic-server-prometheus-server:9090/api/v1/write tempo: - enabled: true \ No newline at end of file + enabled: true + tempo: + metricsGenerator: + enabled: true + remoteWriteUrl: http://sonic-server-prometheus-server:9090/api/v1/write \ No newline at end of file From da1f007caee5c7f13effb7d5a33347018b403c1c Mon Sep 17 00:00:00 2001 From: kondratyevd Date: Tue, 29 Jul 2025 16:53:04 -0400 Subject: [PATCH 11/27] decouple parameters that control tracing rate in OTel and Triton --- helm/supersonic/templates/envoy/configmaps.yaml | 2 +- helm/supersonic/values.yaml | 11 +++++++++-- values/values-geddes-cms.yaml | 2 +- values/values-nautilus-cms.yaml | 2 +- 4 files changed, 12 insertions(+), 5 deletions(-) diff --git a/helm/supersonic/templates/envoy/configmaps.yaml b/helm/supersonic/templates/envoy/configmaps.yaml index a96eaecf..06e80815 100644 --- a/helm/supersonic/templates/envoy/configmaps.yaml +++ b/helm/supersonic/templates/envoy/configmaps.yaml @@ -36,7 +36,7 @@ static_resources: {{- if (index .root.Values "opentelemetry-collector" "enabled") }} tracing: random_sampling: - value: {{ mulf .root.Values.tracing_sampling_rate 100 }} + value: {{ mulf .envoy.tracing_sampling_rate 100 }} provider: name: envoy.tracers.opentelemetry typed_config: diff --git a/helm/supersonic/values.yaml b/helm/supersonic/values.yaml index c261f198..367b6783 100644 --- a/helm/supersonic/values.yaml +++ b/helm/supersonic/values.yaml @@ -25,6 +25,13 @@ triton: --model-repository=/tmp/ \ --log-verbose=0 \ --exit-timeout-secs=60 + # To enable OpenTelemetry tracing: + # --trace-config mode=opentelemetry + # --trace-config=opentelemetry,resource=pod_name=$(hostname) + # --trace-config opentelemetry,url=sonic-server-opentelemetry-collector:4318/v1/traces + # --trace-config rate=100 # 1 in 100 requests + # --trace-config level=TIMESTAMPS + # --trace-config count=-1 # -- Resource limits and requests for each Triton instance. # You can add necessary GPU request here. @@ -170,6 +177,8 @@ envoy: audiences: [] url: "" port: 443 + + tracing_sampling_rate: 0.01 # must be 1 / triton sampling rate autoscaler: @@ -395,8 +404,6 @@ tempo: readinessProbe: initialDelaySeconds: 0 -tracing_sampling_rate: 0.01 - opentelemetry-collector: enabled: false image: diff --git a/values/values-geddes-cms.yaml b/values/values-geddes-cms.yaml index f1278b0c..1b17642e 100644 --- a/values/values-geddes-cms.yaml +++ b/values/values-geddes-cms.yaml @@ -19,7 +19,7 @@ triton: # --trace-config mode=opentelemetry # --trace-config=opentelemetry,resource=pod_name=$(hostname) # --trace-config opentelemetry,url=sonic-server-opentelemetry-collector:4318/v1/traces - # --trace-config rate=0.0001 + # --trace-config rate=100 # 1 in 100 requests # --trace-config level=TIMESTAMPS # --trace-config count=-1 resources: diff --git a/values/values-nautilus-cms.yaml b/values/values-nautilus-cms.yaml index 51bad052..7a015cf3 100644 --- a/values/values-nautilus-cms.yaml +++ b/values/values-nautilus-cms.yaml @@ -59,6 +59,7 @@ envoy: haproxy-ingress.github.io/timeout-queue: "1m" haproxy-ingress.github.io/health-check-interval: "30s" haproxy-ingress.github.io/health-check-rise-count: "1" + tracing_sampling_rate: 0.001 autoscaler: enabled: false @@ -99,7 +100,6 @@ grafana: haproxy-ingress.github.io/proxy-body-size: "512m" haproxy-ingress.github.io/timeout-http-request: "5m" -tracing_sampling_rate: 0.001 opentelemetry-collector: enabled: true tempo: From d4dd10064bea4060ec5776528a47aa3d46237103 Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Tue, 29 Jul 2025 20:53:18 +0000 Subject: [PATCH 12/27] Update JSON schema --- helm/supersonic/values.schema.json | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/helm/supersonic/values.schema.json b/helm/supersonic/values.schema.json index 4ad91bde..c5974cef 100644 --- a/helm/supersonic/values.schema.json +++ b/helm/supersonic/values.schema.json @@ -419,6 +419,9 @@ "port", "url" ] + }, + "tracing_sampling_rate": { + "type": "number" } }, "required": [ @@ -432,7 +435,8 @@ "rate_limiter", "replicas", "resources", - "service" + "service", + "tracing_sampling_rate" ] }, "autoscaler": { @@ -1360,9 +1364,6 @@ "tempo" ] }, - "tracing_sampling_rate": { - "type": "number" - }, "opentelemetry-collector": { "type": "object", "properties": { @@ -1948,7 +1949,6 @@ "serverLoadThreshold", "tempo", "tolerations", - "tracing_sampling_rate", "triton" ] } \ No newline at end of file From 4e3a661f6cc6ada55d75d387f8de2294cc79491e Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Tue, 29 Jul 2025 20:53:37 +0000 Subject: [PATCH 13/27] Update helm docs --- docs/.values-table.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/.values-table.md b/docs/.values-table.md index 0847bf82..8bebe32f 100644 --- a/docs/.values-table.md +++ b/docs/.values-table.md @@ -43,6 +43,7 @@ | envoy.auth.audiences | list | `[]` | | | envoy.auth.url | string | `""` | | | envoy.auth.port | int | `443` | | +| envoy.tracing_sampling_rate | float | `0.01` | | | autoscaler.enabled | bool | `false` | Enable autoscaling (requires Prometheus to also be enabled). Autoscaling will be based on the metric is taken from parameter ``prometheus.serverLoadMetric``, new Triton servers will spawn if the metric exceedds the threshold set by ``prometheus.serverLoadThreshold``. | | autoscaler.minReplicaCount | int | `1` | Minimum and maximum number of Triton servers. Warning: if min=0 and desired Prometheus metric is empty, the first server will never start | | autoscaler.maxReplicaCount | int | `2` | | @@ -125,7 +126,6 @@ | tempo.tempo.receivers.otlp.protocols.http.endpoint | string | `"0.0.0.0:4318"` | | | tempo.tempo.livenessProbe.initialDelaySeconds | int | `0` | | | tempo.tempo.readinessProbe.initialDelaySeconds | int | `0` | | -| tracing_sampling_rate | float | `0.01` | | | opentelemetry-collector.enabled | bool | `false` | | | opentelemetry-collector.image.repository | string | `"otel/opentelemetry-collector-contrib"` | | | opentelemetry-collector.image.tag | string | `"0.120.0"` | | From c0f247b917378c8946af2c4500c48565f9f05ef2 Mon Sep 17 00:00:00 2001 From: kondratyevd Date: Tue, 29 Jul 2025 16:55:30 -0400 Subject: [PATCH 14/27] test both local and remote installations --- .github/workflows/ci-installation.yaml | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci-installation.yaml b/.github/workflows/ci-installation.yaml index a33da8f0..905e3fad 100644 --- a/.github/workflows/ci-installation.yaml +++ b/.github/workflows/ci-installation.yaml @@ -43,11 +43,20 @@ jobs: kubectl create namespace keda helm install keda kedacore/keda --namespace keda - - name: Install SuperSONIC from remote repo via plugin + - name: Test installation of SuperSONIC from remote repo via plugin run: | helm repo add fastml https://fastmachinelearning.org/SuperSONIC/ helm repo update helm install supersonic fastml/supersonic -n test-ns -f values/values-cms-ci.yaml + helm uninstall supersonic -n test-ns + + - name: Test installation of SuperSONIC from GitHub + run: | + git clone https://github.com/fastmachinelearning/SuperSONIC.git + cd SuperSONIC + helm dependency build helm/supersonic + helm install supersonic helm/supersonic -n test-ns -f values/values-cms-ci.yaml + helm uninstall supersonic -n test-ns - name: Cleanup run: kind delete cluster --name gh-k8s-cluster \ No newline at end of file From e79aeebbb5c99c76472853968ec42b71578940c6 Mon Sep 17 00:00:00 2001 From: kondratyevd Date: Tue, 29 Jul 2025 16:59:32 -0400 Subject: [PATCH 15/27] add dependencies to installation test --- .github/workflows/ci-installation.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/ci-installation.yaml b/.github/workflows/ci-installation.yaml index 905e3fad..f5501502 100644 --- a/.github/workflows/ci-installation.yaml +++ b/.github/workflows/ci-installation.yaml @@ -52,6 +52,13 @@ jobs: - name: Test installation of SuperSONIC from GitHub run: | + # Add dependencies + helm repo add prometheus-community https://prometheus-community.github.io/helm-charts + helm repo add grafana https://grafana.github.io/helm-charts + helm repo add opentelemetry https://open-telemetry.github.io/opentelemetry-helm-charts + helm repo update + + # Install SuperSONIC git clone https://github.com/fastmachinelearning/SuperSONIC.git cd SuperSONIC helm dependency build helm/supersonic From e83cabacb005b6f1cb5d3ac3e74d813a48785dc8 Mon Sep 17 00:00:00 2001 From: kondratyevd Date: Tue, 29 Jul 2025 17:02:03 -0400 Subject: [PATCH 16/27] remove installer plugin files --- installer-plugin/installer.py | 135 ------------------------------ installer-plugin/overrides.py | 108 ------------------------ installer-plugin/utils.py | 153 ---------------------------------- 3 files changed, 396 deletions(-) delete mode 100755 installer-plugin/installer.py delete mode 100644 installer-plugin/overrides.py delete mode 100644 installer-plugin/utils.py diff --git a/installer-plugin/installer.py b/installer-plugin/installer.py deleted file mode 100755 index bfc88526..00000000 --- a/installer-plugin/installer.py +++ /dev/null @@ -1,135 +0,0 @@ -#!/usr/bin/env python3 - -import os -import sys -import yaml -import subprocess -import tempfile -import logging -from typing import Optional, Dict -from utils import ( - deep_merge, - parse_args, - setup_logging -) -from overrides import ( - generate_overrides -) - -REPO_CHART = "fastml/supersonic" -REPO_URL = "https://fastmachinelearning.org/SuperSONIC" - -def process_values(values_file: Optional[str], chart_path: str, release_name: str, use_local: bool, version: Optional[str] = None) -> Dict: - """Process and merge values files.""" - logger = logging.getLogger("supersonic-installer") - logger.info("╔══════════════════════════════════════════════════════════════════════") - logger.info("║ Running Helm plugin 'install-supersonic' ") - logger.info("╠══════════════════════════════════════════════════════════════════════") - - # Get default values - if use_local: - if not os.path.isdir(chart_path): - logger.error(f"Error: SuperSONIC chart not found at {chart_path}") - sys.exit(1) - - default_values_path = os.path.join(chart_path, "values.yaml") - if not os.path.isfile(default_values_path): - logger.error("Error: Default values file not found in chart") - sys.exit(1) - logger.info(f"║ Default values: {default_values_path} ") - with open(default_values_path, 'r') as f: - result = yaml.safe_load(f) or {} - else: - # Add repository and fetch default values from remote - subprocess.run(["helm", "repo", "add", "fastml", REPO_URL], check=True) - subprocess.run(["helm", "repo", "update"], check=True) - - cmd = ["helm", "show", "values", REPO_CHART] - if version: - cmd.extend(["--version", version]) - - logger.info("║ Fetching default values from remote repository") - try: - values_output = subprocess.check_output(cmd, text=True) - result = yaml.safe_load(values_output) or {} - except subprocess.CalledProcessError as e: - logger.error(f"Error: Failed to fetch default values from repository: {e}") - sys.exit(1) - - # Load custom values - if values_file: - if not os.path.isfile(values_file): - logger.error(f"Error: values file '{values_file}' not found") - sys.exit(1) - logger.info(f"║ Custom values: {values_file} ") - with open(values_file, 'r') as f: - # Merge custom values with default values - result = deep_merge(result, yaml.safe_load(f) or {}) - - # Generate overrides - overrides = generate_overrides(release_name, result) - logger.info("║ Generated overrides for config sections:") - for key in overrides: - logger.info(f"║ • {key}") - - # Merge overrides with result - result = deep_merge(result, overrides) - return result - -def main() -> None: - """Main entry point.""" - # Setup logging - logger = setup_logging() - - args, _ = parse_args() - - # Process values: merge default values with custom values, then generate overrides - # and merge them onto the result - merged_values = process_values(args.values_file, args.path, args.release_name, args.local, args.version) - - # Write generated values to a temporary file - with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as tmp: - yaml.dump(merged_values, tmp, default_flow_style=False) - tmp_values_file = tmp.name - logger.info(f"║ Writing merged values to temporary file: {tmp_values_file} ") - logger.info("╚══════════════════════════════════════════════════════════════════════") - - try: - # Construct and execute helm command - chart_source = args.path if args.local else REPO_CHART - - # Add dependencies - repo_commands = [] - if merged_values.get("prometheus", {}).get("enabled", False): - repo_commands.append(["helm", "repo", "add", "prometheus-community", "https://prometheus-community.github.io/helm-charts"]) - if merged_values.get("grafana", {}).get("enabled", False): - repo_commands.append(["helm", "repo", "add", "grafana", "https://grafana.github.io/helm-charts"]) - if merged_values.get("opentelemetry-collector", {}).get("enabled", False): - repo_commands.append(["helm", "repo", "add", "opentelemetry", "https://open-telemetry.github.io/opentelemetry-helm-charts"]) - if args.local: - repo_commands.append(["helm", "dependency", "build", chart_source]) - - for cmd in repo_commands: - logger.info(f"\nExecuting: {' '.join(cmd)}") - subprocess.run(cmd, check=True) - - cmd = ["helm", "install", args.release_name, chart_source, "-f", tmp_values_file] - if args.namespace: - cmd.extend(["-n", args.namespace]) - if args.helm_args: - cmd.extend(args.helm_args) - if not args.local and args.version: - cmd.extend(["--version", args.version]) - - logger.info(f"\nExecuting: {' '.join(cmd)}\n") - result = subprocess.run(cmd) - if result.returncode != 0: - sys.exit(result.returncode) - - finally: - # Clean up temporary file - logger.info(f"\n=== Cleaning up temporary valuesfile: {tmp_values_file} ===") - os.unlink(tmp_values_file) - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/installer-plugin/overrides.py b/installer-plugin/overrides.py deleted file mode 100644 index 492907a1..00000000 --- a/installer-plugin/overrides.py +++ /dev/null @@ -1,108 +0,0 @@ -import yaml -from typing import Dict - -def generate_overrides(release_name: str, values: Dict) -> Dict: - """Generate overrides to ensure internal consistency of SuperSONIC components""" - - # This may be changed in the future - prometheus_host = values.get("prometheus", {}).get("server", {}).get("ingress", {}).get("hostName", "") - grafana_host = values.get("grafana", {}).get("ingress", {}).get("hostName", "") - metrics_collector_host = values.get("metricsCollector", {}).get("ingress", {}).get("hostName", "") - - if values.get("prometheus", {}).get("external", {}).get("enabled", False): - prometheus_server = values.get("prometheus", {}).get("external", {}).get("url", "") - prometheus_server = "https://" + prometheus_server.split("//")[-1] - elif values.get("prometheus", {}).get("enabled", False): - prometheus_server = f"http://{release_name}-prometheus-server:9090" - else: - prometheus_server = "" - - # Start with overrides template - overrides_yaml = f""" -prometheus: - server: - useExistingClusterRoleName: {release_name}-prometheus-role - ingress: - hosts: [{prometheus_host}] - tls: - - hosts: [{prometheus_host}] - serviceAccounts: - server: - name: {release_name}-prometheus-sa - -grafana: - dashboardsConfigMaps: - default: {release_name}-grafana-default-dashboard - datasources: - datasources.yaml: - datasources: - - name: prometheus - type: prometheus - access: proxy - isDefault: true - url: {prometheus_server} - jsonData: - timeInterval: "5s" - tlsSkipVerify: true - - name: tempo - type: tempo - url: http://{release_name}-tempo:3100 - access: proxy - isDefault: false - basicAuth: false - jsonData: - timeInterval: "5s" - tlsSkipVerify: true - serviceMap: - datasourceUid: 'prometheus' - nodeGraph: - enabled: true - ingress: - hosts: [{grafana_host}] - tls: - - hosts: [{grafana_host}] - grafana.ini: - server: - root_url: https://{grafana_host} - -metricsCollector: - ingress: - hosts: [{metrics_collector_host}] - tls: - - hosts: [{metrics_collector_host}] -""" - # Parse YAML string into dictionary - overrides = yaml.safe_load(overrides_yaml) - - # Clean up empty values - if not values.get("prometheus", {}).get("server", {}).get("ingress", {}).get("enabled", False): - del overrides["prometheus"]["server"]["ingress"] - if not values.get("grafana", {}).get("ingress", {}).get("enabled", False): - del overrides["grafana"]["ingress"] - del overrides["grafana"]["grafana.ini"] - - # Add OpenTelemetry configuration to Triton args if enabled - if values.get("opentelemetry-collector", {}).get("enabled", False): - # Get existing args from values - triton_args = values.get("triton", {}).get("args", []) - sampling_rate = values.get("tracing_sampling_rate") - if triton_args and sampling_rate>0: - # Get the first (and should be only) argument string - args_str = triton_args[0] - # Remove the last line continuation if it exists - args_str = args_str.rstrip(" \\\n") - # Calculate sampling rate for Triton (1/sampling) - - sampling = max(1, int(1/sampling_rate)) - # Add OpenTelemetry flags - args_str += " \\\n" - args_str += "--trace-config mode=opentelemetry \\\n" - args_str += "--trace-config=opentelemetry,resource=pod_name=$(hostname) \\\n" - args_str += f"--trace-config opentelemetry,url={release_name}-opentelemetry-collector:4318/v1/traces \\\n" - args_str += f"--trace-config rate={sampling} \\\n" - args_str += "--trace-config level=TIMESTAMPS \\\n" - args_str += "--trace-config count=-1" - - overrides["triton"] = {"args": [args_str]} - - return overrides \ No newline at end of file diff --git a/installer-plugin/utils.py b/installer-plugin/utils.py deleted file mode 100644 index 038ca596..00000000 --- a/installer-plugin/utils.py +++ /dev/null @@ -1,153 +0,0 @@ -#!/usr/bin/env python3 - -import argparse -import sys -import logging -from typing import Dict, Any, List, Tuple - -def setup_logging(log_level=logging.INFO) -> logging.Logger: - """ - Set up and configure logging for the installer plugin. - - Args: - log_level: The logging level to use (default: logging.INFO) - - Returns: - The configured logger. - """ - # Get or create the logger - logger = logging.getLogger("supersonic-installer") - - # Clear any existing handlers to avoid duplicate messages - # if the function is called multiple times - if logger.handlers: - logger.handlers.clear() - - logger.setLevel(log_level) - - # Create console handler with a specific format to match the previous print output - console_handler = logging.StreamHandler(sys.stdout) - console_handler.setLevel(log_level) - - # Create a formatter that doesn't include the logger name or timestamp - formatter = logging.Formatter('%(message)s') - console_handler.setFormatter(formatter) - - # Add the handler to the logger - logger.addHandler(console_handler) - - # Prevent propagation to the root logger to avoid duplicate messages - logger.propagate = False - - return logger - -def deep_merge(base: Dict, custom: Dict) -> Dict: - """ - Recursively merge two dictionaries. - Custom values override base values at each level. - """ - result = base.copy() - for key, value in custom.items(): - if key in result and isinstance(result[key], dict) and isinstance(value, dict): - result[key] = deep_merge(result[key], value) - else: - result[key] = value - return result - -class CustomHelpFormatter(argparse.HelpFormatter): - """Custom formatter to match the previous help message style.""" - def format_help(self) -> str: - help_text = [] - help_text.append("SuperSONIC Helm Plugin") - help_text.append("======================") - help_text.append("") - help_text.append("This plugin simplifies the SuperSONIC installation process by") - help_text.append("handling chart dependencies and generating appropriate configurations.") - help_text.append("") - help_text.append("Usage:") - help_text.append(" helm install-supersonic [RELEASE_NAME] [flags]") - help_text.append("") - help_text.append("Flags:") - help_text.append(" -h, --help Show this help message") - help_text.append(" -f, --values Specify values file for custom configuration") - help_text.append(" -n, --namespace Specify Kubernetes namespace for deployment") - help_text.append(" --version Specify chart version (default: latest version)") - help_text.append(" Note: Ignored if --local flag is set") - help_text.append(" --local Install from local chart path instead of remote repository") - help_text.append(" --path Local chart path (default: ./helm/supersonic)") - help_text.append(" Only used when --local flag is set") - help_text.append("Additional flags will be passed directly to the 'helm install' command") - help_text.append("") - help_text.append("Examples:") - help_text.append(" # Install SuperSONIC from official repository") - help_text.append(" helm install-supersonic my-release -f my-values.yaml -n my-namespace") - help_text.append("") - help_text.append(" # Install SuperSONIC from local chart") - help_text.append(" helm install-supersonic my-release -f my-values.yaml -n my-namespace --local --path /helm/supersonic") - help_text.append("") - return "\n".join(help_text) - -def process_remaining_args(args: argparse.Namespace, remaining: List[str]) -> List[str]: - """Process remaining arguments to handle values file and return other helm args.""" - logger = logging.getLogger("supersonic-installer") - helm_args = [] - i = 0 - while i < len(remaining): - if remaining[i] in ['-f', '--values']: - if args.values_file is not None: - logger.error("Error: Multiple values files specified. Only one values file is allowed.") - sys.exit(1) - if i + 1 < len(remaining): - args.values_file = remaining[i + 1] - i += 2 - continue - helm_args.append(remaining[i]) - i += 1 - return helm_args - -def create_parser() -> argparse.ArgumentParser: - """Create and configure argument parser.""" - parser = argparse.ArgumentParser( - description="SuperSONIC Helm Plugin", - formatter_class=CustomHelpFormatter, - usage=argparse.SUPPRESS, # Help is shown in custom format - ) - parser.add_argument( - 'release_name', - help=argparse.SUPPRESS # Help is shown in custom format - ) - parser.add_argument( - '-f', '--values', - dest='values_file', - help=argparse.SUPPRESS, # Help is shown in custom format - default=None - ) - parser.add_argument( - '-n', '--namespace', - help=argparse.SUPPRESS # Help is shown in custom format - ) - parser.add_argument( - '--local', - action='store_true', - help=argparse.SUPPRESS, # Help is shown in custom format - default=False - ) - parser.add_argument( - '--path', - help=argparse.SUPPRESS, # Help is shown in custom format - default="helm/supersonic" - ) - parser.add_argument( - '--version', - help=argparse.SUPPRESS, # Help is shown in custom format - default=None - ) - - return parser - -def parse_args() -> Tuple[argparse.Namespace, List[str]]: - """Parse and process command line arguments.""" - parser = create_parser() - args, remaining = parser.parse_known_args() - args.helm_args = process_remaining_args(args, remaining) - return args, remaining \ No newline at end of file From 87377cabcdc3180c7cfee8e22b18c2ee77d7789a Mon Sep 17 00:00:00 2001 From: kondratyevd Date: Tue, 29 Jul 2025 17:25:56 -0400 Subject: [PATCH 17/27] don't validate grafana existence --- .../templates/_helpers/_grafana.tpl | 20 ------------------- 1 file changed, 20 deletions(-) diff --git a/helm/supersonic/templates/_helpers/_grafana.tpl b/helm/supersonic/templates/_helpers/_grafana.tpl index 08b924cb..66393133 100644 --- a/helm/supersonic/templates/_helpers/_grafana.tpl +++ b/helm/supersonic/templates/_helpers/_grafana.tpl @@ -53,26 +53,6 @@ Get full Grafana URL {{- include "supersonic.common.getServiceUrl" (dict "scheme" (include "supersonic.grafanaScheme" .) "host" (include "supersonic.grafanaHost" .) "port" (include "supersonic.grafanaPort" .)) -}} {{- end -}} -{{/* -Check if Grafana exists in the namespace -*/}} -{{- define "supersonic.grafanaExists" -}} -{{- include "supersonic.common.serviceExists" (dict "serviceName" "grafana" "root" .) -}} -{{- end -}} - -{{/* -Validate that there is no existing Grafana instance when enabling a new one -*/}} -{{- define "supersonic.validateGrafana" -}} -{{- if .Values.grafana.enabled -}} - {{- if include "supersonic.grafanaExists" . -}} - {{- $details := fromJson (include "supersonic.common.getExistingServiceDetails" (dict "serviceType" "grafana" "root" .)) -}} - {{- $url := include "supersonic.common.getServiceDisplayUrl" (dict "scheme" $details.scheme "host" $details.host) -}} - {{- fail (printf "Error: Found existing Grafana instance in the namespace:\n- Namespace: %s\n- URL: %s\n\nTo proceed, either:\n1. Set grafana.enabled=false in values.yaml to use the existing Grafana instance, OR\n2. Uninstall the existing Grafana instance" .Release.Namespace $url) -}} - {{- end -}} -{{- end -}} -{{- end -}} - {{/* Validate Grafana address consistency */}} From e01cbfa7abf30431f3f9a735d7466f42e00317db Mon Sep 17 00:00:00 2001 From: kondratyevd Date: Tue, 29 Jul 2025 17:26:29 -0400 Subject: [PATCH 18/27] fix condition --- helm/supersonic/templates/monitoring/default-dashboard.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/supersonic/templates/monitoring/default-dashboard.yaml b/helm/supersonic/templates/monitoring/default-dashboard.yaml index db472212..30ed0fb6 100644 --- a/helm/supersonic/templates/monitoring/default-dashboard.yaml +++ b/helm/supersonic/templates/monitoring/default-dashboard.yaml @@ -1,4 +1,4 @@ -{{- if and .Values.grafana.enabled (ne (include "supersonic.grafanaExists" .) "true") }} +{{- if .Values.grafana.enabled -}} apiVersion: v1 kind: ConfigMap metadata: From ec296f572a735e807261e699e371475f481e029d Mon Sep 17 00:00:00 2001 From: kondratyevd Date: Tue, 29 Jul 2025 17:36:27 -0400 Subject: [PATCH 19/27] update values files --- values/values-geddes-cms.yaml | 4 +-- values/values-nautilus-atlas.yaml | 28 +++++++++++++++++++-- values/values-nautilus-cms.yaml | 38 +++++++++++++++++++++++++---- values/values-nautilus-icecube.yaml | 24 ++++++++++++++++-- 4 files changed, 82 insertions(+), 12 deletions(-) diff --git a/values/values-geddes-cms.yaml b/values/values-geddes-cms.yaml index 1b17642e..b7381fb8 100644 --- a/values/values-geddes-cms.yaml +++ b/values/values-geddes-cms.yaml @@ -106,6 +106,7 @@ grafana: nodeGraph: enabled: true ingress: + enabled: true hosts: - grafana-cms.geddes.rcac.purdue.edu tls: @@ -115,9 +116,6 @@ grafana: grafana.ini: server: root_url: https://grafana-cms.geddes.rcac.purdue.edu - serviceAccounts: - grafana: - name: sonic-server-grafana-sa opentelemetry-collector: enabled: true diff --git a/values/values-nautilus-atlas.yaml b/values/values-nautilus-atlas.yaml index b32ae7da..7a5263f2 100644 --- a/values/values-nautilus-atlas.yaml +++ b/values/values-nautilus-atlas.yaml @@ -69,20 +69,44 @@ prometheus: grafana: enabled: true + dashboardsConfigMaps: + default: traccc-sonic-grafana-default-dashboard + datasources: + datasources.yaml: + datasources: + - name: prometheus + type: prometheus + access: proxy + isDefault: true + url: prometheus.nrp-nautilus.io + jsonData: + timeInterval: "5s" + tlsSkipVerify: true ingress: enabled: true - hostName: grafana-atlas.nrp-nautilus.io + hosts: + - grafana-atlas.nrp-nautilus.io + tls: + - hosts: + - grafana-atlas.nrp-nautilus.io ingressClassName: haproxy annotations: haproxy-ingress.github.io/cors-enable: "true" haproxy-ingress.github.io/proxy-body-size: "512m" haproxy-ingress.github.io/timeout-http-request: "5m" + grafana.ini: + server: + root_url: https://grafana-atlas.nrp-nautilus.io metricsCollector: enabled: true ingress: enabled: true - hostName: metrics-collector-atlas.nrp-nautilus.io + hosts: + - metrics-collector-atlas.nrp-nautilus.io + tls: + - hosts: + - metrics-collector-atlas.nrp-nautilus.io ingressClassName: haproxy annotations: haproxy-ingress.github.io/cors-enable: "true" diff --git a/values/values-nautilus-cms.yaml b/values/values-nautilus-cms.yaml index 7a015cf3..f0667735 100644 --- a/values/values-nautilus-cms.yaml +++ b/values/values-nautilus-cms.yaml @@ -46,7 +46,11 @@ envoy: loadBalancerPolicy: "LEAST_REQUEST" ingress: enabled: true - hostName: sonic-cms.nrp-nautilus.io + hosts: + - sonic-cms.nrp-nautilus.io + tls: + - hosts: + - sonic-cms.nrp-nautilus.io ingressClassName: haproxy annotations: haproxy-ingress.github.io/cors-enable: "true" @@ -82,7 +86,11 @@ prometheus: server: ingress: enabled: true - hostName: prometheus-cms.nrp-nautilus.io + hosts: + - prometheus-cms.nrp-nautilus.io + tls: + - hosts: + - prometheus-cms.nrp-nautilus.io ingressClassName: haproxy annotations: haproxy-ingress.github.io/cors-enable: "true" @@ -91,17 +99,37 @@ prometheus: grafana: enabled: true + dashboardsConfigMaps: + default: supersonic-grafana-default-dashboard + datasources: + datasources.yaml: + datasources: + - name: prometheus + type: prometheus + access: proxy + isDefault: true + url: http://supersonic-prometheus-server:9090 + jsonData: + timeInterval: "5s" + tlsSkipVerify: true ingress: enabled: true - hostName: grafana-cms.nrp-nautilus.io + hosts: + - grafana-cms.nrp-nautilus.io + tls: + - hosts: + - grafana-cms.nrp-nautilus.io ingressClassName: haproxy annotations: haproxy-ingress.github.io/cors-enable: "true" haproxy-ingress.github.io/proxy-body-size: "512m" haproxy-ingress.github.io/timeout-http-request: "5m" + grafana.ini: + server: + root_url: https://grafana-cms.nrp-nautilus.io opentelemetry-collector: - enabled: true + enabled: false tempo: - enabled: true + enabled: false diff --git a/values/values-nautilus-icecube.yaml b/values/values-nautilus-icecube.yaml index 04c33f48..17169228 100644 --- a/values/values-nautilus-icecube.yaml +++ b/values/values-nautilus-icecube.yaml @@ -87,11 +87,31 @@ prometheus: grafana: enabled: true + dashboardsConfigMaps: + default: supersonic-grafana-default-dashboard + datasources: + datasources.yaml: + datasources: + - name: prometheus + type: prometheus + access: proxy + isDefault: true + url: prometheus.nrp-nautilus.io + jsonData: + timeInterval: "5s" + tlsSkipVerify: true ingress: enabled: true - hostName: grafana-icecube.nrp-nautilus.io + hosts: + - grafana-icecube.nrp-nautilus.io + tls: + - hosts: + - grafana-icecube.nrp-nautilus.io ingressClassName: haproxy annotations: haproxy-ingress.github.io/cors-enable: "true" haproxy-ingress.github.io/backend-protocol: "h2" - haproxy-ingress.github.io/proxy-body-size: "512m" \ No newline at end of file + haproxy-ingress.github.io/proxy-body-size: "512m" + grafana.ini: + server: + root_url: https://grafana-icecube.nrp-nautilus.io \ No newline at end of file From 491fbb02bad09ee509a21496c0772acb1264444d Mon Sep 17 00:00:00 2001 From: kondratyevd Date: Tue, 29 Jul 2025 17:36:36 -0400 Subject: [PATCH 20/27] fix condition --- helm/supersonic/templates/NOTES.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/supersonic/templates/NOTES.txt b/helm/supersonic/templates/NOTES.txt index 57f24f60..59bd5bb7 100644 --- a/helm/supersonic/templates/NOTES.txt +++ b/helm/supersonic/templates/NOTES.txt @@ -37,7 +37,7 @@ Scaling threshold: {{ include "supersonic.defaultThreshold" . }} | | Prometheus UI: {{ include "supersonic.prometheusDisplayUrl" . }} {{- end }} -{{- if or .Values.grafana.enabled (include "supersonic.grafanaExists" .) }} +{{- if .Values.grafana.enabled }} | | Grafana dashboard: {{ include "supersonic.grafanaDisplayUrl" . }} {{- end }} From 7cc7f109f3cf9dcd06cac86a80f594dd601f5e19 Mon Sep 17 00:00:00 2001 From: kondratyevd Date: Tue, 29 Jul 2025 17:47:51 -0400 Subject: [PATCH 21/27] delete obsolete installation scripts --- deploy-geddes-cms.sh | 2 -- deploy-nautilus-atlas.sh | 2 -- deploy-nautilus-cms.sh | 2 -- 3 files changed, 6 deletions(-) delete mode 100644 deploy-geddes-cms.sh delete mode 100644 deploy-nautilus-atlas.sh delete mode 100644 deploy-nautilus-cms.sh diff --git a/deploy-geddes-cms.sh b/deploy-geddes-cms.sh deleted file mode 100644 index 68aeddb0..00000000 --- a/deploy-geddes-cms.sh +++ /dev/null @@ -1,2 +0,0 @@ -helm plugin install https://github.com/fastmachinelearning/SuperSONIC/ -helm install-supersonic supersonic -n cms -f values/values-geddes-cms.yaml \ No newline at end of file diff --git a/deploy-nautilus-atlas.sh b/deploy-nautilus-atlas.sh deleted file mode 100644 index 5429c095..00000000 --- a/deploy-nautilus-atlas.sh +++ /dev/null @@ -1,2 +0,0 @@ -helm plugin install https://github.com/fastmachinelearning/SuperSONIC/ -helm install-supersonic supersonic -n atlas-sonic -f values/values-nautilus-atlas.yaml \ No newline at end of file diff --git a/deploy-nautilus-cms.sh b/deploy-nautilus-cms.sh deleted file mode 100644 index 05646261..00000000 --- a/deploy-nautilus-cms.sh +++ /dev/null @@ -1,2 +0,0 @@ -helm plugin install https://github.com/fastmachinelearning/SuperSONIC/ -helm install-supersonic supersonic -n sonic-server -f values/values-nautilus-cms.yaml \ No newline at end of file From 69f18a7de8e10377156f9da8a30358f913f2ece7 Mon Sep 17 00:00:00 2001 From: kondratyevd Date: Tue, 29 Jul 2025 17:55:01 -0400 Subject: [PATCH 22/27] update ci workflows and add minimal values file --- .../{ci-github-cms.yaml => ci-full.yaml} | 4 +-- ...s-cms-ci.yaml => values-minimal-full.yaml} | 3 +- values/values-minimal.yaml | 29 +++++++++++++++++++ 3 files changed, 32 insertions(+), 4 deletions(-) rename .github/workflows/{ci-github-cms.yaml => ci-full.yaml} (98%) rename values/{values-cms-ci.yaml => values-minimal-full.yaml} (91%) create mode 100644 values/values-minimal.yaml diff --git a/.github/workflows/ci-github-cms.yaml b/.github/workflows/ci-full.yaml similarity index 98% rename from .github/workflows/ci-github-cms.yaml rename to .github/workflows/ci-full.yaml index 5cafb20f..fe7bcf72 100644 --- a/.github/workflows/ci-github-cms.yaml +++ b/.github/workflows/ci-full.yaml @@ -1,4 +1,4 @@ -name: ci [CMS] +name: ci [full] on: push: @@ -55,7 +55,7 @@ jobs: helm repo add opentelemetry https://open-telemetry.github.io/opentelemetry-helm-charts helm dependency build ./helm/supersonic helm upgrade --install supersonic ./helm/supersonic \ - --values values/values-cms-ci.yaml -n cms + --values values/values-minimal-full.yaml -n cms - name: CVMFS Mount ready run: | diff --git a/values/values-cms-ci.yaml b/values/values-minimal-full.yaml similarity index 91% rename from values/values-cms-ci.yaml rename to values/values-minimal-full.yaml index 788aeec9..dd5181d9 100644 --- a/values/values-cms-ci.yaml +++ b/values/values-minimal-full.yaml @@ -1,7 +1,6 @@ triton: replicas: 1 - # image: fastml/triton-torchgeo:21.02-py3-geometric # run2 - image: fastml/triton-torchgeo:22.07-py3-geometric # run3 + image: fastml/triton-torchgeo:22.07-py3-geometric # works for CMSSW run3 command: ["/bin/sh", "-c"] args: - | diff --git a/values/values-minimal.yaml b/values/values-minimal.yaml new file mode 100644 index 00000000..d37b5fca --- /dev/null +++ b/values/values-minimal.yaml @@ -0,0 +1,29 @@ +triton: + replicas: 1 + image: fastml/triton-torchgeo:22.07-py3-geometric # works for CMSSW run3 + command: ["/bin/sh", "-c"] + args: + - | + /opt/tritonserver/bin/tritonserver \ + --model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoBTag/Combined/data/models/ \ + --log-verbose=0 \ + --disable-auto-complete-config \ + --exit-timeout-secs=60 + resources: + limits: { cpu: 1, memory: 3Gi} + requests: { cpu: 1, memory: 1Gi} + modelRepository: + enabled: true + storageType: cvmfs-pvc + mountPath: /cvmfs + readinessProbe: + reset: true + +envoy: + enabled: true + resources: + requests: + cpu: 0.1 + memory: "128Mi" + service: + type: LoadBalancer \ No newline at end of file From a68a0edc6f64cd47399501669f708f4dd147a01e Mon Sep 17 00:00:00 2001 From: kondratyevd Date: Tue, 29 Jul 2025 17:56:00 -0400 Subject: [PATCH 23/27] increase timeout for keda --- .github/workflows/ci-full.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci-full.yaml b/.github/workflows/ci-full.yaml index fe7bcf72..38b68fec 100644 --- a/.github/workflows/ci-full.yaml +++ b/.github/workflows/ci-full.yaml @@ -84,8 +84,8 @@ jobs: - name: Autoscaler ready run: | - kubectl wait --for condition=AbleToScale hpa -l app.kubernetes.io/component=keda --timeout 120s -n cms - kubectl wait --for condition=Ready so -l app.kubernetes.io/component=keda --timeout 120s -n cms + kubectl wait --for condition=AbleToScale hpa -l app.kubernetes.io/component=keda --timeout 180s -n cms + kubectl wait --for condition=Ready so -l app.kubernetes.io/component=keda --timeout 180s -n cms - name: Triton server ready run: | From b28fee475e067d5233a66c854ae7c3492048e645 Mon Sep 17 00:00:00 2001 From: kondratyevd Date: Tue, 29 Jul 2025 17:58:14 -0400 Subject: [PATCH 24/27] fix CI --- .github/workflows/ci-installation.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci-installation.yaml b/.github/workflows/ci-installation.yaml index f5501502..fc6a8d48 100644 --- a/.github/workflows/ci-installation.yaml +++ b/.github/workflows/ci-installation.yaml @@ -47,7 +47,7 @@ jobs: run: | helm repo add fastml https://fastmachinelearning.org/SuperSONIC/ helm repo update - helm install supersonic fastml/supersonic -n test-ns -f values/values-cms-ci.yaml + helm install supersonic fastml/supersonic -n test-ns -f values/values-minimal.yaml helm uninstall supersonic -n test-ns - name: Test installation of SuperSONIC from GitHub @@ -62,7 +62,7 @@ jobs: git clone https://github.com/fastmachinelearning/SuperSONIC.git cd SuperSONIC helm dependency build helm/supersonic - helm install supersonic helm/supersonic -n test-ns -f values/values-cms-ci.yaml + helm install supersonic helm/supersonic -n test-ns -f values/values-minimal.yaml helm uninstall supersonic -n test-ns - name: Cleanup From 108a573d2c5f3a027d3a8ae4cc01eb915882b59f Mon Sep 17 00:00:00 2001 From: kondratyevd Date: Tue, 29 Jul 2025 18:03:51 -0400 Subject: [PATCH 25/27] fix CI - we are already in the repo --- .github/workflows/ci-installation.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/ci-installation.yaml b/.github/workflows/ci-installation.yaml index fc6a8d48..74835b59 100644 --- a/.github/workflows/ci-installation.yaml +++ b/.github/workflows/ci-installation.yaml @@ -59,8 +59,6 @@ jobs: helm repo update # Install SuperSONIC - git clone https://github.com/fastmachinelearning/SuperSONIC.git - cd SuperSONIC helm dependency build helm/supersonic helm install supersonic helm/supersonic -n test-ns -f values/values-minimal.yaml helm uninstall supersonic -n test-ns From f3b5d402b0c73adf2d6dc7975f0378ee5c8f25d1 Mon Sep 17 00:00:00 2001 From: kondratyevd Date: Tue, 29 Jul 2025 18:11:23 -0400 Subject: [PATCH 26/27] update README --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 85837a4a..ecb4c4d7 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,8 @@ The main components of SuperSONIC are: ## Installation +### Install from Helm repository + ``` helm repo add fastml https://fastmachinelearning.org/SuperSONIC helm repo update @@ -46,6 +48,7 @@ The full list of configuration parameters is available in the [Configuration ref ``` git clone https://github.com/fastmachinelearning/SuperSONIC.git cd SuperSONIC +git checkout helm dependency build helm/supersonic helm install helm/supersonic -n -f ``` From ff109d78dc44b955621074ee7449ccc71c729335 Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Tue, 29 Jul 2025 22:11:55 +0000 Subject: [PATCH 27/27] Update helm docs --- helm/supersonic/README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/helm/supersonic/README.md b/helm/supersonic/README.md index 85837a4a..ecb4c4d7 100644 --- a/helm/supersonic/README.md +++ b/helm/supersonic/README.md @@ -31,6 +31,8 @@ The main components of SuperSONIC are: ## Installation +### Install from Helm repository + ``` helm repo add fastml https://fastmachinelearning.org/SuperSONIC helm repo update @@ -46,6 +48,7 @@ The full list of configuration parameters is available in the [Configuration ref ``` git clone https://github.com/fastmachinelearning/SuperSONIC.git cd SuperSONIC +git checkout helm dependency build helm/supersonic helm install helm/supersonic -n -f ```