diff --git a/.github/workflows/ci-github-cms.yaml b/.github/workflows/ci-full.yaml similarity index 96% rename from .github/workflows/ci-github-cms.yaml rename to .github/workflows/ci-full.yaml index 5cafb20f..38b68fec 100644 --- a/.github/workflows/ci-github-cms.yaml +++ b/.github/workflows/ci-full.yaml @@ -1,4 +1,4 @@ -name: ci [CMS] +name: ci [full] on: push: @@ -55,7 +55,7 @@ jobs: helm repo add opentelemetry https://open-telemetry.github.io/opentelemetry-helm-charts helm dependency build ./helm/supersonic helm upgrade --install supersonic ./helm/supersonic \ - --values values/values-cms-ci.yaml -n cms + --values values/values-minimal-full.yaml -n cms - name: CVMFS Mount ready run: | @@ -84,8 +84,8 @@ jobs: - name: Autoscaler ready run: | - kubectl wait --for condition=AbleToScale hpa -l app.kubernetes.io/component=keda --timeout 120s -n cms - kubectl wait --for condition=Ready so -l app.kubernetes.io/component=keda --timeout 120s -n cms + kubectl wait --for condition=AbleToScale hpa -l app.kubernetes.io/component=keda --timeout 180s -n cms + kubectl wait --for condition=Ready so -l app.kubernetes.io/component=keda --timeout 180s -n cms - name: Triton server ready run: | diff --git a/.github/workflows/ci-github-installer-plugin.yaml b/.github/workflows/ci-installation.yaml similarity index 53% rename from .github/workflows/ci-github-installer-plugin.yaml rename to .github/workflows/ci-installation.yaml index 85295b9e..74835b59 100644 --- a/.github/workflows/ci-github-installer-plugin.yaml +++ b/.github/workflows/ci-installation.yaml @@ -1,4 +1,4 @@ -name: ci [installer plugin] +name: ci [installation] on: push: @@ -9,7 +9,7 @@ on: - "main" jobs: - test-installer-plugin: + test-installation: runs-on: ubuntu-latest steps: - name: Checkout code @@ -25,9 +25,9 @@ jobs: with: version: v3.12.0 - - name: Create CMS namespace + - name: Create test namespace run: | - kubectl create namespace cms + kubectl create namespace test-ns - name: Install Prometheus Operator CRDs run: | @@ -36,17 +36,32 @@ jobs: kubectl create namespace monitoring helm install prometheus-operator prometheus-community/kube-prometheus-stack --namespace monitoring --set prometheusOperator.createCustomResource=false --set defaultRules.create=false --set alertmanager.enabled=false --set prometheus.enabled=false --set grafana.enabled=false - - name: Install KEDA Autoscaler + - name: Install KEDA Autoscaler CRDs run: | helm repo add kedacore https://kedacore.github.io/charts helm repo update kubectl create namespace keda helm install keda kedacore/keda --namespace keda - - name: Install SuperSONIC from remote repo via plugin + - name: Test installation of SuperSONIC from remote repo via plugin run: | - helm plugin install . 
-        helm install-supersonic supersonic --local --values values/values-cms-ci.yaml -n cms
+        helm repo add fastml https://fastmachinelearning.org/SuperSONIC/
+        helm repo update
+        helm install supersonic fastml/supersonic -n test-ns -f values/values-minimal.yaml
+        helm uninstall supersonic -n test-ns
+
+    - name: Test installation of SuperSONIC from GitHub
+      run: |
+        # Add dependencies
+        helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
+        helm repo add grafana https://grafana.github.io/helm-charts
+        helm repo add opentelemetry https://open-telemetry.github.io/opentelemetry-helm-charts
+        helm repo update
+
+        # Install SuperSONIC
+        helm dependency build helm/supersonic
+        helm install supersonic helm/supersonic -n test-ns -f values/values-minimal.yaml
+        helm uninstall supersonic -n test-ns
 
     - name: Cleanup
       run: kind delete cluster --name gh-k8s-cluster
\ No newline at end of file
diff --git a/README.md b/README.md
index e63ca654..ecb4c4d7 100644
--- a/README.md
+++ b/README.md
@@ -31,36 +31,27 @@ The main components of SuperSONIC are:
 
 ## Installation
 
-The installation is done via a custom Helm plugin which takes care of
-internal connectivity of the chart components. Standard Helm installation
-is also supported, but requires a lot more manual configuration.
+### Install from Helm repository
 
 ```
-helm plugin install https://github.com/fastmachinelearning/SuperSONIC/
-helm install-supersonic <release-name> -n <namespace> -f <your-values.yaml>
-```
-
-Installer plugin usage:
-```
-Usage:
-  helm install-supersonic [RELEASE_NAME] [flags]
-
-Flags:
-  -h, --help       Show this help message
-  -f, --values     Specify values file for custom configuration
-  -n, --namespace  Specify Kubernetes namespace for deployment
-  --version        Specify chart version (default: latest version)
-                   Note: Ignored if --local flag is set
-  --local          Install from local chart path instead of remote repository
-  --path           Local chart path (default: ./helm/supersonic)
-                   Only used when --local flag is set
-Additional flags will be passed directly to the 'helm install' command
+helm repo add fastml https://fastmachinelearning.org/SuperSONIC
+helm repo update
+helm install <release-name> fastml/supersonic -n <namespace> -f <your-values.yaml>
 ```
 
 To construct the `values.yaml` file for your application, follow [Configuration guide](http://fastmachinelearning.org/SuperSONIC/configuration-guide.html "Configuration guide").
 The full list of configuration parameters is available in the [Configuration reference](http://fastmachinelearning.org/SuperSONIC/configuration-reference.html "Configuration reference").
+### Install from GitHub
+
+```
+git clone https://github.com/fastmachinelearning/SuperSONIC.git
+cd SuperSONIC
+git checkout <tag>
+helm dependency build helm/supersonic
+helm install <release-name> helm/supersonic -n <namespace> -f <your-values.yaml>
+```
 
 ## Server diagram
diff --git a/deploy-geddes-cms.sh b/deploy-geddes-cms.sh
deleted file mode 100644
index 68aeddb0..00000000
--- a/deploy-geddes-cms.sh
+++ /dev/null
@@ -1,2 +0,0 @@
-helm plugin install https://github.com/fastmachinelearning/SuperSONIC/
-helm install-supersonic supersonic -n cms -f values/values-geddes-cms.yaml
\ No newline at end of file
diff --git a/deploy-nautilus-atlas.sh b/deploy-nautilus-atlas.sh
deleted file mode 100644
index 5429c095..00000000
--- a/deploy-nautilus-atlas.sh
+++ /dev/null
@@ -1,2 +0,0 @@
-helm plugin install https://github.com/fastmachinelearning/SuperSONIC/
-helm install-supersonic supersonic -n atlas-sonic -f values/values-nautilus-atlas.yaml
\ No newline at end of file
diff --git a/deploy-nautilus-cms.sh b/deploy-nautilus-cms.sh
deleted file mode 100644
index 05646261..00000000
--- a/deploy-nautilus-cms.sh
+++ /dev/null
@@ -1,2 +0,0 @@
-helm plugin install https://github.com/fastmachinelearning/SuperSONIC/
-helm install-supersonic supersonic -n sonic-server -f values/values-nautilus-cms.yaml
\ No newline at end of file
diff --git a/docs/.values-table.md b/docs/.values-table.md
index 0847bf82..8bebe32f 100644
--- a/docs/.values-table.md
+++ b/docs/.values-table.md
@@ -43,6 +43,7 @@
 | envoy.auth.audiences | list | `[]` | |
 | envoy.auth.url | string | `""` | |
 | envoy.auth.port | int | `443` | |
+| envoy.tracing_sampling_rate | float | `0.01` | |
 | autoscaler.enabled | bool | `false` | Enable autoscaling (requires Prometheus to also be enabled). Autoscaling will be based on the metric taken from parameter ``prometheus.serverLoadMetric``; new Triton servers will spawn if the metric exceeds the threshold set by ``prometheus.serverLoadThreshold``. |
 | autoscaler.minReplicaCount | int | `1` | Minimum and maximum number of Triton servers. Warning: if min=0 and desired Prometheus metric is empty, the first server will never start |
 | autoscaler.maxReplicaCount | int | `2` | |
@@ -125,7 +126,6 @@
 | tempo.tempo.receivers.otlp.protocols.http.endpoint | string | `"0.0.0.0:4318"` | |
 | tempo.tempo.livenessProbe.initialDelaySeconds | int | `0` | |
 | tempo.tempo.readinessProbe.initialDelaySeconds | int | `0` | |
-| tracing_sampling_rate | float | `0.01` | |
 | opentelemetry-collector.enabled | bool | `false` | |
 | opentelemetry-collector.image.repository | string | `"otel/opentelemetry-collector-contrib"` | |
 | opentelemetry-collector.image.tag | string | `"0.120.0"` | |
diff --git a/docs/advanced-monitoring.rst b/docs/advanced-monitoring.rst
index 13e63a9f..632789cc 100644
--- a/docs/advanced-monitoring.rst
+++ b/docs/advanced-monitoring.rst
@@ -43,8 +43,7 @@ Displaying Tracing Data in Grafana
 
 If Grafana is enabled in your ``values.yaml``, you can display the tracing data
 in the Grafana dashboard. In order to achieve this, Grafana needs to have a
-Tempo datasource configured. This is done automatically when you install
-SuperSONIC via the ``install-supersonic`` plugin.
+Tempo datasource configured.
 
 If OpenTelemetry Collector and Tempo are enabled, the default Grafana dashboard
 will include an interactive server map, where you can study tracing data in detail
diff --git a/docs/getting-started.rst b/docs/getting-started.rst
index 51c246fe..bccb249a 100644
--- a/docs/getting-started.rst
+++ b/docs/getting-started.rst
@@ -19,41 +19,18 @@ Installation
      - `Configuration reference `_
      - `Example values.yaml files `_
 
-  2. Install Helm plugin to handle SuperSONIC installation
+  2. Add the SuperSONIC Helm repository
 
     .. code:: shell
 
-       helm plugin install https://github.com/fastmachinelearning/SuperSONIC/
-
-
-    The Helm plugin is needed to ensure internal connectivity of the SuperSONIC
-    components. Standard Helm installation without a plugin is also supported,
-    but requires a lot more manual configuration.
+       helm repo add fastml https://fastmachinelearning.org/SuperSONIC/
+       helm repo update
 
  3. Modify the following command to install the chart on your cluster:
 
    .. code:: shell
 
-      helm install-supersonic <release-name> -n <namespace> -f <your-values.yaml>
-
-   Installer plugin usage:
-
-   .. code:: shell
-
-      Usage:
-        helm install-supersonic [RELEASE_NAME] [flags]
-
-      Flags:
-        -h, --help       Show this help message
-        -f, --values     Specify values file for custom configuration
-        -n, --namespace  Specify Kubernetes namespace for deployment
-        --version        Specify chart version (default: latest version)
-                         Note: Ignored if --local flag is set
-        --local          Install from local chart path instead of remote repository
-        --path           Local chart path (default: ./helm/supersonic)
-                         Only used when --local flag is set
-      Additional flags will be passed directly to the 'helm install' command
-
+      helm install <release-name> fastml/supersonic -n <namespace> -f <your-values.yaml>
 
    Use a unique meaningful lowercase value as <release-name>, for example ``supersonic-cms-run3``.
diff --git a/helm/supersonic/README.md b/helm/supersonic/README.md
index e63ca654..ecb4c4d7 100644
--- a/helm/supersonic/README.md
+++ b/helm/supersonic/README.md
@@ -31,36 +31,27 @@ The main components of SuperSONIC are:
 
 ## Installation
 
-The installation is done via a custom Helm plugin which takes care of
-internal connectivity of the chart components. Standard Helm installation
-is also supported, but requires a lot more manual configuration.
+### Install from Helm repository
 
 ```
-helm plugin install https://github.com/fastmachinelearning/SuperSONIC/
-helm install-supersonic <release-name> -n <namespace> -f <your-values.yaml>
-```
-
-Installer plugin usage:
-```
-Usage:
-  helm install-supersonic [RELEASE_NAME] [flags]
-
-Flags:
-  -h, --help       Show this help message
-  -f, --values     Specify values file for custom configuration
-  -n, --namespace  Specify Kubernetes namespace for deployment
-  --version        Specify chart version (default: latest version)
-                   Note: Ignored if --local flag is set
-  --local          Install from local chart path instead of remote repository
-  --path           Local chart path (default: ./helm/supersonic)
-                   Only used when --local flag is set
-Additional flags will be passed directly to the 'helm install' command
+helm repo add fastml https://fastmachinelearning.org/SuperSONIC
+helm repo update
+helm install <release-name> fastml/supersonic -n <namespace> -f <your-values.yaml>
 ```
 
 To construct the `values.yaml` file for your application, follow [Configuration guide](http://fastmachinelearning.org/SuperSONIC/configuration-guide.html "Configuration guide").
 The full list of configuration parameters is available in the [Configuration reference](http://fastmachinelearning.org/SuperSONIC/configuration-reference.html "Configuration reference").
+### Install from GitHub + +``` +git clone https://github.com/fastmachinelearning/SuperSONIC.git +cd SuperSONIC +git checkout +helm dependency build helm/supersonic +helm install helm/supersonic -n -f +``` ## Server diagram diff --git a/helm/supersonic/templates/NOTES.txt b/helm/supersonic/templates/NOTES.txt index 8c89dc51..59bd5bb7 100644 --- a/helm/supersonic/templates/NOTES.txt +++ b/helm/supersonic/templates/NOTES.txt @@ -2,7 +2,6 @@ {{- /* Run validation checks */ -}} {{- include "supersonic.validateGrafanaAddressConsistency" . -}} {{- include "supersonic.validateGrafanaValues" . -}} -{{- include "supersonic.validatePrometheus" . -}} {{- include "supersonic.validatePrometheusAddressConsistency" . -}} {{- include "supersonic.validatePrometheusValues" . -}} @@ -38,7 +37,7 @@ Scaling threshold: {{ include "supersonic.defaultThreshold" . }} | | Prometheus UI: {{ include "supersonic.prometheusDisplayUrl" . }} {{- end }} -{{- if or .Values.grafana.enabled (include "supersonic.grafanaExists" .) }} +{{- if .Values.grafana.enabled }} | | Grafana dashboard: {{ include "supersonic.grafanaDisplayUrl" . }} {{- end }} diff --git a/helm/supersonic/templates/_helpers/_grafana.tpl b/helm/supersonic/templates/_helpers/_grafana.tpl index 08b924cb..66393133 100644 --- a/helm/supersonic/templates/_helpers/_grafana.tpl +++ b/helm/supersonic/templates/_helpers/_grafana.tpl @@ -53,26 +53,6 @@ Get full Grafana URL {{- include "supersonic.common.getServiceUrl" (dict "scheme" (include "supersonic.grafanaScheme" .) "host" (include "supersonic.grafanaHost" .) "port" (include "supersonic.grafanaPort" .)) -}} {{- end -}} -{{/* -Check if Grafana exists in the namespace -*/}} -{{- define "supersonic.grafanaExists" -}} -{{- include "supersonic.common.serviceExists" (dict "serviceName" "grafana" "root" .) -}} -{{- end -}} - -{{/* -Validate that there is no existing Grafana instance when enabling a new one -*/}} -{{- define "supersonic.validateGrafana" -}} -{{- if .Values.grafana.enabled -}} - {{- if include "supersonic.grafanaExists" . -}} - {{- $details := fromJson (include "supersonic.common.getExistingServiceDetails" (dict "serviceType" "grafana" "root" .)) -}} - {{- $url := include "supersonic.common.getServiceDisplayUrl" (dict "scheme" $details.scheme "host" $details.host) -}} - {{- fail (printf "Error: Found existing Grafana instance in the namespace:\n- Namespace: %s\n- URL: %s\n\nTo proceed, either:\n1. Set grafana.enabled=false in values.yaml to use the existing Grafana instance, OR\n2. Uninstall the existing Grafana instance" .Release.Namespace $url) -}} - {{- end -}} -{{- end -}} -{{- end -}} - {{/* Validate Grafana address consistency */}} diff --git a/helm/supersonic/templates/_helpers/_prometheus.tpl b/helm/supersonic/templates/_helpers/_prometheus.tpl index 2a7ffa8a..312d53da 100644 --- a/helm/supersonic/templates/_helpers/_prometheus.tpl +++ b/helm/supersonic/templates/_helpers/_prometheus.tpl @@ -62,13 +62,6 @@ Check if Prometheus exists in the namespace {{- include "supersonic.common.serviceExists" (dict "serviceName" "prometheus" "root" .) -}} {{- end -}} -{{/* -Validate that there is no existing Prometheus instance when enabling a new one -*/}} -{{- define "supersonic.validatePrometheus" -}} -{{- include "supersonic.common.validateNoExistingService" (dict "serviceType" "prometheus" "values" .Values "root" .) 
-}} -{{- end -}} - {{/* Validate RBAC permissions for Prometheus */}} diff --git a/helm/supersonic/templates/envoy/configmaps.yaml b/helm/supersonic/templates/envoy/configmaps.yaml index a96eaecf..06e80815 100644 --- a/helm/supersonic/templates/envoy/configmaps.yaml +++ b/helm/supersonic/templates/envoy/configmaps.yaml @@ -36,7 +36,7 @@ static_resources: {{- if (index .root.Values "opentelemetry-collector" "enabled") }} tracing: random_sampling: - value: {{ mulf .root.Values.tracing_sampling_rate 100 }} + value: {{ mulf .envoy.tracing_sampling_rate 100 }} provider: name: envoy.tracers.opentelemetry typed_config: diff --git a/helm/supersonic/templates/monitoring/default-dashboard.yaml b/helm/supersonic/templates/monitoring/default-dashboard.yaml index db472212..30ed0fb6 100644 --- a/helm/supersonic/templates/monitoring/default-dashboard.yaml +++ b/helm/supersonic/templates/monitoring/default-dashboard.yaml @@ -1,4 +1,4 @@ -{{- if and .Values.grafana.enabled (ne (include "supersonic.grafanaExists" .) "true") }} +{{- if .Values.grafana.enabled -}} apiVersion: v1 kind: ConfigMap metadata: diff --git a/helm/supersonic/values.schema.json b/helm/supersonic/values.schema.json index 4ad91bde..c5974cef 100644 --- a/helm/supersonic/values.schema.json +++ b/helm/supersonic/values.schema.json @@ -419,6 +419,9 @@ "port", "url" ] + }, + "tracing_sampling_rate": { + "type": "number" } }, "required": [ @@ -432,7 +435,8 @@ "rate_limiter", "replicas", "resources", - "service" + "service", + "tracing_sampling_rate" ] }, "autoscaler": { @@ -1360,9 +1364,6 @@ "tempo" ] }, - "tracing_sampling_rate": { - "type": "number" - }, "opentelemetry-collector": { "type": "object", "properties": { @@ -1948,7 +1949,6 @@ "serverLoadThreshold", "tempo", "tolerations", - "tracing_sampling_rate", "triton" ] } \ No newline at end of file diff --git a/helm/supersonic/values.yaml b/helm/supersonic/values.yaml index c261f198..367b6783 100644 --- a/helm/supersonic/values.yaml +++ b/helm/supersonic/values.yaml @@ -25,6 +25,13 @@ triton: --model-repository=/tmp/ \ --log-verbose=0 \ --exit-timeout-secs=60 + # To enable OpenTelemetry tracing: + # --trace-config mode=opentelemetry + # --trace-config=opentelemetry,resource=pod_name=$(hostname) + # --trace-config opentelemetry,url=sonic-server-opentelemetry-collector:4318/v1/traces + # --trace-config rate=100 # 1 in 100 requests + # --trace-config level=TIMESTAMPS + # --trace-config count=-1 # -- Resource limits and requests for each Triton instance. # You can add necessary GPU request here. 
@@ -170,6 +177,8 @@ envoy: audiences: [] url: "" port: 443 + + tracing_sampling_rate: 0.01 # must be 1 / triton sampling rate autoscaler: @@ -395,8 +404,6 @@ tempo: readinessProbe: initialDelaySeconds: 0 -tracing_sampling_rate: 0.01 - opentelemetry-collector: enabled: false image: diff --git a/installer-plugin/installer.py b/installer-plugin/installer.py deleted file mode 100755 index bfc88526..00000000 --- a/installer-plugin/installer.py +++ /dev/null @@ -1,135 +0,0 @@ -#!/usr/bin/env python3 - -import os -import sys -import yaml -import subprocess -import tempfile -import logging -from typing import Optional, Dict -from utils import ( - deep_merge, - parse_args, - setup_logging -) -from overrides import ( - generate_overrides -) - -REPO_CHART = "fastml/supersonic" -REPO_URL = "https://fastmachinelearning.org/SuperSONIC" - -def process_values(values_file: Optional[str], chart_path: str, release_name: str, use_local: bool, version: Optional[str] = None) -> Dict: - """Process and merge values files.""" - logger = logging.getLogger("supersonic-installer") - logger.info("╔══════════════════════════════════════════════════════════════════════") - logger.info("║ Running Helm plugin 'install-supersonic' ") - logger.info("╠══════════════════════════════════════════════════════════════════════") - - # Get default values - if use_local: - if not os.path.isdir(chart_path): - logger.error(f"Error: SuperSONIC chart not found at {chart_path}") - sys.exit(1) - - default_values_path = os.path.join(chart_path, "values.yaml") - if not os.path.isfile(default_values_path): - logger.error("Error: Default values file not found in chart") - sys.exit(1) - logger.info(f"║ Default values: {default_values_path} ") - with open(default_values_path, 'r') as f: - result = yaml.safe_load(f) or {} - else: - # Add repository and fetch default values from remote - subprocess.run(["helm", "repo", "add", "fastml", REPO_URL], check=True) - subprocess.run(["helm", "repo", "update"], check=True) - - cmd = ["helm", "show", "values", REPO_CHART] - if version: - cmd.extend(["--version", version]) - - logger.info("║ Fetching default values from remote repository") - try: - values_output = subprocess.check_output(cmd, text=True) - result = yaml.safe_load(values_output) or {} - except subprocess.CalledProcessError as e: - logger.error(f"Error: Failed to fetch default values from repository: {e}") - sys.exit(1) - - # Load custom values - if values_file: - if not os.path.isfile(values_file): - logger.error(f"Error: values file '{values_file}' not found") - sys.exit(1) - logger.info(f"║ Custom values: {values_file} ") - with open(values_file, 'r') as f: - # Merge custom values with default values - result = deep_merge(result, yaml.safe_load(f) or {}) - - # Generate overrides - overrides = generate_overrides(release_name, result) - logger.info("║ Generated overrides for config sections:") - for key in overrides: - logger.info(f"║ • {key}") - - # Merge overrides with result - result = deep_merge(result, overrides) - return result - -def main() -> None: - """Main entry point.""" - # Setup logging - logger = setup_logging() - - args, _ = parse_args() - - # Process values: merge default values with custom values, then generate overrides - # and merge them onto the result - merged_values = process_values(args.values_file, args.path, args.release_name, args.local, args.version) - - # Write generated values to a temporary file - with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as tmp: - yaml.dump(merged_values, tmp, 
default_flow_style=False) - tmp_values_file = tmp.name - logger.info(f"║ Writing merged values to temporary file: {tmp_values_file} ") - logger.info("╚══════════════════════════════════════════════════════════════════════") - - try: - # Construct and execute helm command - chart_source = args.path if args.local else REPO_CHART - - # Add dependencies - repo_commands = [] - if merged_values.get("prometheus", {}).get("enabled", False): - repo_commands.append(["helm", "repo", "add", "prometheus-community", "https://prometheus-community.github.io/helm-charts"]) - if merged_values.get("grafana", {}).get("enabled", False): - repo_commands.append(["helm", "repo", "add", "grafana", "https://grafana.github.io/helm-charts"]) - if merged_values.get("opentelemetry-collector", {}).get("enabled", False): - repo_commands.append(["helm", "repo", "add", "opentelemetry", "https://open-telemetry.github.io/opentelemetry-helm-charts"]) - if args.local: - repo_commands.append(["helm", "dependency", "build", chart_source]) - - for cmd in repo_commands: - logger.info(f"\nExecuting: {' '.join(cmd)}") - subprocess.run(cmd, check=True) - - cmd = ["helm", "install", args.release_name, chart_source, "-f", tmp_values_file] - if args.namespace: - cmd.extend(["-n", args.namespace]) - if args.helm_args: - cmd.extend(args.helm_args) - if not args.local and args.version: - cmd.extend(["--version", args.version]) - - logger.info(f"\nExecuting: {' '.join(cmd)}\n") - result = subprocess.run(cmd) - if result.returncode != 0: - sys.exit(result.returncode) - - finally: - # Clean up temporary file - logger.info(f"\n=== Cleaning up temporary valuesfile: {tmp_values_file} ===") - os.unlink(tmp_values_file) - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/installer-plugin/overrides.py b/installer-plugin/overrides.py deleted file mode 100644 index 492907a1..00000000 --- a/installer-plugin/overrides.py +++ /dev/null @@ -1,108 +0,0 @@ -import yaml -from typing import Dict - -def generate_overrides(release_name: str, values: Dict) -> Dict: - """Generate overrides to ensure internal consistency of SuperSONIC components""" - - # This may be changed in the future - prometheus_host = values.get("prometheus", {}).get("server", {}).get("ingress", {}).get("hostName", "") - grafana_host = values.get("grafana", {}).get("ingress", {}).get("hostName", "") - metrics_collector_host = values.get("metricsCollector", {}).get("ingress", {}).get("hostName", "") - - if values.get("prometheus", {}).get("external", {}).get("enabled", False): - prometheus_server = values.get("prometheus", {}).get("external", {}).get("url", "") - prometheus_server = "https://" + prometheus_server.split("//")[-1] - elif values.get("prometheus", {}).get("enabled", False): - prometheus_server = f"http://{release_name}-prometheus-server:9090" - else: - prometheus_server = "" - - # Start with overrides template - overrides_yaml = f""" -prometheus: - server: - useExistingClusterRoleName: {release_name}-prometheus-role - ingress: - hosts: [{prometheus_host}] - tls: - - hosts: [{prometheus_host}] - serviceAccounts: - server: - name: {release_name}-prometheus-sa - -grafana: - dashboardsConfigMaps: - default: {release_name}-grafana-default-dashboard - datasources: - datasources.yaml: - datasources: - - name: prometheus - type: prometheus - access: proxy - isDefault: true - url: {prometheus_server} - jsonData: - timeInterval: "5s" - tlsSkipVerify: true - - name: tempo - type: tempo - url: http://{release_name}-tempo:3100 - access: proxy - isDefault: false - 
basicAuth: false - jsonData: - timeInterval: "5s" - tlsSkipVerify: true - serviceMap: - datasourceUid: 'prometheus' - nodeGraph: - enabled: true - ingress: - hosts: [{grafana_host}] - tls: - - hosts: [{grafana_host}] - grafana.ini: - server: - root_url: https://{grafana_host} - -metricsCollector: - ingress: - hosts: [{metrics_collector_host}] - tls: - - hosts: [{metrics_collector_host}] -""" - # Parse YAML string into dictionary - overrides = yaml.safe_load(overrides_yaml) - - # Clean up empty values - if not values.get("prometheus", {}).get("server", {}).get("ingress", {}).get("enabled", False): - del overrides["prometheus"]["server"]["ingress"] - if not values.get("grafana", {}).get("ingress", {}).get("enabled", False): - del overrides["grafana"]["ingress"] - del overrides["grafana"]["grafana.ini"] - - # Add OpenTelemetry configuration to Triton args if enabled - if values.get("opentelemetry-collector", {}).get("enabled", False): - # Get existing args from values - triton_args = values.get("triton", {}).get("args", []) - sampling_rate = values.get("tracing_sampling_rate") - if triton_args and sampling_rate>0: - # Get the first (and should be only) argument string - args_str = triton_args[0] - # Remove the last line continuation if it exists - args_str = args_str.rstrip(" \\\n") - # Calculate sampling rate for Triton (1/sampling) - - sampling = max(1, int(1/sampling_rate)) - # Add OpenTelemetry flags - args_str += " \\\n" - args_str += "--trace-config mode=opentelemetry \\\n" - args_str += "--trace-config=opentelemetry,resource=pod_name=$(hostname) \\\n" - args_str += f"--trace-config opentelemetry,url={release_name}-opentelemetry-collector:4318/v1/traces \\\n" - args_str += f"--trace-config rate={sampling} \\\n" - args_str += "--trace-config level=TIMESTAMPS \\\n" - args_str += "--trace-config count=-1" - - overrides["triton"] = {"args": [args_str]} - - return overrides \ No newline at end of file diff --git a/installer-plugin/utils.py b/installer-plugin/utils.py deleted file mode 100644 index 038ca596..00000000 --- a/installer-plugin/utils.py +++ /dev/null @@ -1,153 +0,0 @@ -#!/usr/bin/env python3 - -import argparse -import sys -import logging -from typing import Dict, Any, List, Tuple - -def setup_logging(log_level=logging.INFO) -> logging.Logger: - """ - Set up and configure logging for the installer plugin. - - Args: - log_level: The logging level to use (default: logging.INFO) - - Returns: - The configured logger. - """ - # Get or create the logger - logger = logging.getLogger("supersonic-installer") - - # Clear any existing handlers to avoid duplicate messages - # if the function is called multiple times - if logger.handlers: - logger.handlers.clear() - - logger.setLevel(log_level) - - # Create console handler with a specific format to match the previous print output - console_handler = logging.StreamHandler(sys.stdout) - console_handler.setLevel(log_level) - - # Create a formatter that doesn't include the logger name or timestamp - formatter = logging.Formatter('%(message)s') - console_handler.setFormatter(formatter) - - # Add the handler to the logger - logger.addHandler(console_handler) - - # Prevent propagation to the root logger to avoid duplicate messages - logger.propagate = False - - return logger - -def deep_merge(base: Dict, custom: Dict) -> Dict: - """ - Recursively merge two dictionaries. - Custom values override base values at each level. 
- """ - result = base.copy() - for key, value in custom.items(): - if key in result and isinstance(result[key], dict) and isinstance(value, dict): - result[key] = deep_merge(result[key], value) - else: - result[key] = value - return result - -class CustomHelpFormatter(argparse.HelpFormatter): - """Custom formatter to match the previous help message style.""" - def format_help(self) -> str: - help_text = [] - help_text.append("SuperSONIC Helm Plugin") - help_text.append("======================") - help_text.append("") - help_text.append("This plugin simplifies the SuperSONIC installation process by") - help_text.append("handling chart dependencies and generating appropriate configurations.") - help_text.append("") - help_text.append("Usage:") - help_text.append(" helm install-supersonic [RELEASE_NAME] [flags]") - help_text.append("") - help_text.append("Flags:") - help_text.append(" -h, --help Show this help message") - help_text.append(" -f, --values Specify values file for custom configuration") - help_text.append(" -n, --namespace Specify Kubernetes namespace for deployment") - help_text.append(" --version Specify chart version (default: latest version)") - help_text.append(" Note: Ignored if --local flag is set") - help_text.append(" --local Install from local chart path instead of remote repository") - help_text.append(" --path Local chart path (default: ./helm/supersonic)") - help_text.append(" Only used when --local flag is set") - help_text.append("Additional flags will be passed directly to the 'helm install' command") - help_text.append("") - help_text.append("Examples:") - help_text.append(" # Install SuperSONIC from official repository") - help_text.append(" helm install-supersonic my-release -f my-values.yaml -n my-namespace") - help_text.append("") - help_text.append(" # Install SuperSONIC from local chart") - help_text.append(" helm install-supersonic my-release -f my-values.yaml -n my-namespace --local --path /helm/supersonic") - help_text.append("") - return "\n".join(help_text) - -def process_remaining_args(args: argparse.Namespace, remaining: List[str]) -> List[str]: - """Process remaining arguments to handle values file and return other helm args.""" - logger = logging.getLogger("supersonic-installer") - helm_args = [] - i = 0 - while i < len(remaining): - if remaining[i] in ['-f', '--values']: - if args.values_file is not None: - logger.error("Error: Multiple values files specified. 
Only one values file is allowed.") - sys.exit(1) - if i + 1 < len(remaining): - args.values_file = remaining[i + 1] - i += 2 - continue - helm_args.append(remaining[i]) - i += 1 - return helm_args - -def create_parser() -> argparse.ArgumentParser: - """Create and configure argument parser.""" - parser = argparse.ArgumentParser( - description="SuperSONIC Helm Plugin", - formatter_class=CustomHelpFormatter, - usage=argparse.SUPPRESS, # Help is shown in custom format - ) - parser.add_argument( - 'release_name', - help=argparse.SUPPRESS # Help is shown in custom format - ) - parser.add_argument( - '-f', '--values', - dest='values_file', - help=argparse.SUPPRESS, # Help is shown in custom format - default=None - ) - parser.add_argument( - '-n', '--namespace', - help=argparse.SUPPRESS # Help is shown in custom format - ) - parser.add_argument( - '--local', - action='store_true', - help=argparse.SUPPRESS, # Help is shown in custom format - default=False - ) - parser.add_argument( - '--path', - help=argparse.SUPPRESS, # Help is shown in custom format - default="helm/supersonic" - ) - parser.add_argument( - '--version', - help=argparse.SUPPRESS, # Help is shown in custom format - default=None - ) - - return parser - -def parse_args() -> Tuple[argparse.Namespace, List[str]]: - """Parse and process command line arguments.""" - parser = create_parser() - args, remaining = parser.parse_known_args() - args.helm_args = process_remaining_args(args, remaining) - return args, remaining \ No newline at end of file diff --git a/values/values-geddes-cms.yaml b/values/values-geddes-cms.yaml index 453ae6e6..b7381fb8 100644 --- a/values/values-geddes-cms.yaml +++ b/values/values-geddes-cms.yaml @@ -1,4 +1,5 @@ -serverLoadThreshold: 100 +serverLoadThreshold: 20 +serverLoadMetric: 'sum by (release) (rate(nv_inference_queue_duration_us{release=~"sonic-server"}[30s]) / (rate(nv_inference_exec_count{release=~"sonic-server"}[30s]) * 1000 + 0.001))' triton: # image: fastml/triton-torchgeo:21.02-py3-geometric # run2 @@ -15,6 +16,12 @@ triton: --log-verbose=0 \ --strict-model-config=false \ --exit-timeout-secs=60 + # --trace-config mode=opentelemetry + # --trace-config=opentelemetry,resource=pod_name=$(hostname) + # --trace-config opentelemetry,url=sonic-server-opentelemetry-collector:4318/v1/traces + # --trace-config rate=100 # 1 in 100 requests + # --trace-config level=TIMESTAMPS + # --trace-config count=-1 resources: limits: { nvidia.com/gpu: 1, cpu: 2, memory: 4G} requests: { nvidia.com/gpu: 1, cpu: 2, memory: 4G} @@ -57,14 +64,72 @@ tolerations: prometheus: enabled: true server: + useExistingClusterRoleName: sonic-server-prometheus-role ingress: enabled: true - hostName: prometheus-cms.geddes.rcac.purdue.edu + hosts: + - prometheus-cms.geddes.rcac.purdue.edu + tls: + - hosts: + - prometheus-cms.geddes.rcac.purdue.edu ingressClassName: public + serviceAccounts: + server: + name: sonic-server-prometheus-sa grafana: enabled: true + dashboardsConfigMaps: + default: sonic-server-grafana-default-dashboard + datasources: + datasources.yaml: + datasources: + - name: prometheus + type: prometheus + access: proxy + isDefault: true + url: http://sonic-server-prometheus-server:9090 + jsonData: + timeInterval: "5s" + tlsSkipVerify: true + - name: tempo + type: tempo + url: http://sonic-server-tempo:3100 + access: proxy + isDefault: false + basicAuth: false + jsonData: + timeInterval: "5s" + tlsSkipVerify: true + serviceMap: + datasourceUid: 'prometheus' + nodeGraph: + enabled: true ingress: enabled: true - hostName: 
grafana-cms.geddes.rcac.purdue.edu - ingressClassName: public \ No newline at end of file + hosts: + - grafana-cms.geddes.rcac.purdue.edu + tls: + - hosts: + - grafana-cms.geddes.rcac.purdue.edu + ingressClassName: public + grafana.ini: + server: + root_url: https://grafana-cms.geddes.rcac.purdue.edu + +opentelemetry-collector: + enabled: true + config: + exporters: + otlp: + endpoint: http://sonic-server-tempo:4317 + otlphttp: + endpoint: http://sonic-server-tempo:4318 + prometheusremotewrite: + endpoint: http://sonic-server-prometheus-server:9090/api/v1/write +tempo: + enabled: true + tempo: + metricsGenerator: + enabled: true + remoteWriteUrl: http://sonic-server-prometheus-server:9090/api/v1/write \ No newline at end of file diff --git a/values/values-cms-ci.yaml b/values/values-minimal-full.yaml similarity index 91% rename from values/values-cms-ci.yaml rename to values/values-minimal-full.yaml index 788aeec9..dd5181d9 100644 --- a/values/values-cms-ci.yaml +++ b/values/values-minimal-full.yaml @@ -1,7 +1,6 @@ triton: replicas: 1 - # image: fastml/triton-torchgeo:21.02-py3-geometric # run2 - image: fastml/triton-torchgeo:22.07-py3-geometric # run3 + image: fastml/triton-torchgeo:22.07-py3-geometric # works for CMSSW run3 command: ["/bin/sh", "-c"] args: - | diff --git a/values/values-minimal.yaml b/values/values-minimal.yaml new file mode 100644 index 00000000..d37b5fca --- /dev/null +++ b/values/values-minimal.yaml @@ -0,0 +1,29 @@ +triton: + replicas: 1 + image: fastml/triton-torchgeo:22.07-py3-geometric # works for CMSSW run3 + command: ["/bin/sh", "-c"] + args: + - | + /opt/tritonserver/bin/tritonserver \ + --model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoBTag/Combined/data/models/ \ + --log-verbose=0 \ + --disable-auto-complete-config \ + --exit-timeout-secs=60 + resources: + limits: { cpu: 1, memory: 3Gi} + requests: { cpu: 1, memory: 1Gi} + modelRepository: + enabled: true + storageType: cvmfs-pvc + mountPath: /cvmfs + readinessProbe: + reset: true + +envoy: + enabled: true + resources: + requests: + cpu: 0.1 + memory: "128Mi" + service: + type: LoadBalancer \ No newline at end of file diff --git a/values/values-nautilus-atlas.yaml b/values/values-nautilus-atlas.yaml index b32ae7da..7a5263f2 100644 --- a/values/values-nautilus-atlas.yaml +++ b/values/values-nautilus-atlas.yaml @@ -69,20 +69,44 @@ prometheus: grafana: enabled: true + dashboardsConfigMaps: + default: traccc-sonic-grafana-default-dashboard + datasources: + datasources.yaml: + datasources: + - name: prometheus + type: prometheus + access: proxy + isDefault: true + url: prometheus.nrp-nautilus.io + jsonData: + timeInterval: "5s" + tlsSkipVerify: true ingress: enabled: true - hostName: grafana-atlas.nrp-nautilus.io + hosts: + - grafana-atlas.nrp-nautilus.io + tls: + - hosts: + - grafana-atlas.nrp-nautilus.io ingressClassName: haproxy annotations: haproxy-ingress.github.io/cors-enable: "true" haproxy-ingress.github.io/proxy-body-size: "512m" haproxy-ingress.github.io/timeout-http-request: "5m" + grafana.ini: + server: + root_url: https://grafana-atlas.nrp-nautilus.io metricsCollector: enabled: true ingress: enabled: true - hostName: metrics-collector-atlas.nrp-nautilus.io + hosts: + - metrics-collector-atlas.nrp-nautilus.io + tls: + - hosts: + - metrics-collector-atlas.nrp-nautilus.io ingressClassName: haproxy annotations: haproxy-ingress.github.io/cors-enable: "true" diff --git a/values/values-nautilus-cms.yaml b/values/values-nautilus-cms.yaml index 
51bad052..f0667735 100644 --- a/values/values-nautilus-cms.yaml +++ b/values/values-nautilus-cms.yaml @@ -46,7 +46,11 @@ envoy: loadBalancerPolicy: "LEAST_REQUEST" ingress: enabled: true - hostName: sonic-cms.nrp-nautilus.io + hosts: + - sonic-cms.nrp-nautilus.io + tls: + - hosts: + - sonic-cms.nrp-nautilus.io ingressClassName: haproxy annotations: haproxy-ingress.github.io/cors-enable: "true" @@ -59,6 +63,7 @@ envoy: haproxy-ingress.github.io/timeout-queue: "1m" haproxy-ingress.github.io/health-check-interval: "30s" haproxy-ingress.github.io/health-check-rise-count: "1" + tracing_sampling_rate: 0.001 autoscaler: enabled: false @@ -81,7 +86,11 @@ prometheus: server: ingress: enabled: true - hostName: prometheus-cms.nrp-nautilus.io + hosts: + - prometheus-cms.nrp-nautilus.io + tls: + - hosts: + - prometheus-cms.nrp-nautilus.io ingressClassName: haproxy annotations: haproxy-ingress.github.io/cors-enable: "true" @@ -90,18 +99,37 @@ prometheus: grafana: enabled: true + dashboardsConfigMaps: + default: supersonic-grafana-default-dashboard + datasources: + datasources.yaml: + datasources: + - name: prometheus + type: prometheus + access: proxy + isDefault: true + url: http://supersonic-prometheus-server:9090 + jsonData: + timeInterval: "5s" + tlsSkipVerify: true ingress: enabled: true - hostName: grafana-cms.nrp-nautilus.io + hosts: + - grafana-cms.nrp-nautilus.io + tls: + - hosts: + - grafana-cms.nrp-nautilus.io ingressClassName: haproxy annotations: haproxy-ingress.github.io/cors-enable: "true" haproxy-ingress.github.io/proxy-body-size: "512m" haproxy-ingress.github.io/timeout-http-request: "5m" + grafana.ini: + server: + root_url: https://grafana-cms.nrp-nautilus.io -tracing_sampling_rate: 0.001 opentelemetry-collector: - enabled: true + enabled: false tempo: - enabled: true + enabled: false diff --git a/values/values-nautilus-icecube.yaml b/values/values-nautilus-icecube.yaml index 04c33f48..17169228 100644 --- a/values/values-nautilus-icecube.yaml +++ b/values/values-nautilus-icecube.yaml @@ -87,11 +87,31 @@ prometheus: grafana: enabled: true + dashboardsConfigMaps: + default: supersonic-grafana-default-dashboard + datasources: + datasources.yaml: + datasources: + - name: prometheus + type: prometheus + access: proxy + isDefault: true + url: prometheus.nrp-nautilus.io + jsonData: + timeInterval: "5s" + tlsSkipVerify: true ingress: enabled: true - hostName: grafana-icecube.nrp-nautilus.io + hosts: + - grafana-icecube.nrp-nautilus.io + tls: + - hosts: + - grafana-icecube.nrp-nautilus.io ingressClassName: haproxy annotations: haproxy-ingress.github.io/cors-enable: "true" haproxy-ingress.github.io/backend-protocol: "h2" - haproxy-ingress.github.io/proxy-body-size: "512m" \ No newline at end of file + haproxy-ingress.github.io/proxy-body-size: "512m" + grafana.ini: + server: + root_url: https://grafana-icecube.nrp-nautilus.io \ No newline at end of file
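With the installer plugin gone, the release-name-based wiring it used to generate (Grafana dashboard ConfigMap name, Prometheus datasource URL, ingress `hosts`/`tls` lists) now has to be written into each values file explicitly, as the `values/values-geddes-cms.yaml` and `values/values-nautilus-cms.yaml` updates above do. A minimal sketch of that wiring for a new deployment, assuming a release named `supersonic` and an illustrative hostname:

```
grafana:
  enabled: true
  dashboardsConfigMaps:
    default: supersonic-grafana-default-dashboard
  datasources:
    datasources.yaml:
      datasources:
        - name: prometheus
          type: prometheus
          access: proxy
          isDefault: true
          url: http://supersonic-prometheus-server:9090
  ingress:
    enabled: true
    hosts:
      - grafana.example.org       # hostname is illustrative
    tls:
      - hosts:
          - grafana.example.org
  grafana.ini:
    server:
      root_url: https://grafana.example.org
```

The same `hostName` to `hosts`/`tls` migration applies to the `envoy`, `prometheus.server`, and `metricsCollector` ingress sections.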