From 36e7b11e46f0d3ded19557299e44fe0146f6a635 Mon Sep 17 00:00:00 2001 From: kondratyevd Date: Fri, 1 Aug 2025 14:10:48 -0400 Subject: [PATCH 1/3] remove photonObjectCombined model to avoid loading errors --- docs/advanced-monitoring.rst | 1 - docs/configuration-guide.rst | 3 +- values/values-anvil-cms.yaml | 4 +- values/values-geddes-cms.yaml | 73 +++++++++++++++++---------------- values/values-nautilus-cms.yaml | 1 - 5 files changed, 40 insertions(+), 42 deletions(-) diff --git a/docs/advanced-monitoring.rst b/docs/advanced-monitoring.rst index 5be0df5..4de9f91 100644 --- a/docs/advanced-monitoring.rst +++ b/docs/advanced-monitoring.rst @@ -63,7 +63,6 @@ shows how tracing is configured for CMS SuperSONIC instance: --model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoBTag/Combined/data/models/ \ --model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoTauTag/TrainingFiles/data/DeepTauIdSONIC/ \ --model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoMET/METPUSubtraction/data/models/ \ - --model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoEgamma/EgammaPhotonProducers/data/models/ \ --trace-config mode=opentelemetry \ --trace-config=opentelemetry,resource=pod_name=$(hostname) \ --trace-config opentelemetry,url=supersonic-opentelemetry-collector:4318/v1/traces \ diff --git a/docs/configuration-guide.rst b/docs/configuration-guide.rst index d8ac473..4c4788d 100644 --- a/docs/configuration-guide.rst +++ b/docs/configuration-guide.rst @@ -33,7 +33,6 @@ Triton version must be specified in the ``triton.image`` parameter in the values - | /opt/tritonserver/bin/tritonserver \ --model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoBTag/Combined/data/models/ \ - 
--model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoEgamma/EgammaPhotonProducers/data/models/ \ --model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoTauTag/TrainingFiles/data/DeepTauIdSONIC/ \ --model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoMET/METPUSubtraction/data/models/ \ --allow-gpu-metrics=true \ @@ -90,7 +89,7 @@ Triton version must be specified in the ``triton.image`` parameter in the values

-3. Select Resources for Triton Pods +3. Select Resources for Triton Pods ============================================= - You can configure CPU, memory, and GPU resources for Triton pods via the ``triton.resources`` parameter in the values file: diff --git a/values/values-anvil-cms.yaml b/values/values-anvil-cms.yaml index 6b1572a..276bf8f 100644 --- a/values/values-anvil-cms.yaml +++ b/values/values-anvil-cms.yaml @@ -1,12 +1,10 @@ triton: - # image: fastml/triton-torchgeo:21.02-py3-geometric # run2 - image: fastml/triton-torchgeo:22.07-py3-geometric # run3 + image: nvcr.io/nvidia/tritonserver:24.11-py3 command: ["/bin/sh", "-c"] args: - | /opt/tritonserver/bin/tritonserver \ --model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoBTag/Combined/data/models/ \ - --model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoEgamma/EgammaPhotonProducers/data/models/ \ --model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoTauTag/TrainingFiles/data/DeepTauIdSONIC/ \ --model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoMET/METPUSubtraction/data/models/ \ --allow-gpu-metrics=true \ diff --git a/values/values-geddes-cms.yaml b/values/values-geddes-cms.yaml index 4378370..a81fda3 100644 --- a/values/values-geddes-cms.yaml +++ b/values/values-geddes-cms.yaml @@ -1,31 +1,30 @@ serverLoadThreshold: 20 serverLoadMetric: 'sum by (release) (rate(nv_inference_queue_duration_us{release=~"sonic-server"}[30s]) / (rate(nv_inference_exec_count{release=~"sonic-server"}[30s]) * 1000 + 0.001))' -triton: - # image: fastml/triton-torchgeo:21.02-py3-geometric # run2 - image: fastml/triton-torchgeo:22.07-py3-geometric # run3 +triton: + image: nvcr.io/nvidia/tritonserver:24.11-py3 command: ["/bin/sh", "-c"] - args: + args: - |
/opt/tritonserver/bin/tritonserver \ --model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoBTag/Combined/data/models/ \ - --model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoEgamma/EgammaPhotonProducers/data/models/ \ --model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoTauTag/TrainingFiles/data/DeepTauIdSONIC/ \ --model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoMET/METPUSubtraction/data/models/ \ + --trace-config mode=opentelemetry \ + --trace-config=opentelemetry,resource=pod_name=$(hostname) \ + --trace-config opentelemetry,url=sonic-server-opentelemetry-collector:4318/v1/traces \ + --trace-config rate=100 \ + --trace-config level=TIMESTAMPS \ + --trace-config count=-1 \ --allow-gpu-metrics=true \ --log-verbose=0 \ --strict-model-config=false \ --exit-timeout-secs=60 - # --trace-config mode=opentelemetry - # --trace-config=opentelemetry,resource=pod_name=$(hostname) - # --trace-config opentelemetry,url=sonic-server-opentelemetry-collector:4318/v1/traces - # --trace-config rate=100 # 1 in 100 requests - # --trace-config level=TIMESTAMPS - # --trace-config count=-1 + resources: - limits: { nvidia.com/gpu: 1, cpu: 2, memory: 4G} - requests: { nvidia.com/gpu: 1, cpu: 2, memory: 4G} - nodeSelector: {'cms-af-prod': 'true'} + limits: { nvidia.com/gpu: 1, cpu: 2, memory: 4G } + requests: { nvidia.com/gpu: 1, cpu: 2, memory: 4G } + nodeSelector: { "cms-af-prod": "true" } tolerations: - key: hub.jupyter.org/dedicated operator: Equal @@ -43,7 +42,7 @@ triton: envoy: enabled: true - nodeSelector: {'cms-af-prod': 'true'} + nodeSelector: { "cms-af-prod": "true" } tolerations: - key: hub.jupyter.org/dedicated operator: Equal @@ -56,30 +55,33 @@ envoy: enabled: true hostName: sonic-cms.geddes.rcac.purdue.edu ingressClassName: 
public + rate_limiter: + prometheus_based: + enabled: true + tracing_sampling_rate: 0.01 keda: enabled: true minReplicaCount: 1 - maxReplicaCount: 7 + maxReplicaCount: 11 + scaleUp: + stabilizationWindowSeconds: 30 + periodSeconds: 15 + stepsize: 1 + scaleDown: + stabilizationWindowSeconds: 45 + periodSeconds: 45 + stepsize: 1 ingress: enabled: false prometheus: - enabled: true - server: - useExistingClusterRoleName: sonic-server-prometheus-role - ingress: - enabled: true - hosts: - - prometheus-cms.geddes.rcac.purdue.edu - tls: - - hosts: - - prometheus-cms.geddes.rcac.purdue.edu - ingressClassName: public - serviceAccounts: - server: - name: sonic-server-prometheus-sa + external: + enabled: true + url: prometheus-af.geddes.rcac.purdue.edu + port: 443 + scheme: https grafana: enabled: true @@ -92,7 +94,7 @@ grafana: type: prometheus access: proxy isDefault: true - url: http://sonic-server-prometheus-server:9090 + url: https://prometheus-af.geddes.rcac.purdue.edu jsonData: timeInterval: "5s" tlsSkipVerify: true @@ -106,7 +108,7 @@ grafana: timeInterval: "5s" tlsSkipVerify: true serviceMap: - datasourceUid: 'prometheus' + datasourceUid: "prometheus" nodeGraph: enabled: true ingress: @@ -127,13 +129,14 @@ opentelemetry-collector: exporters: otlp: endpoint: http://sonic-server-tempo:4317 - otlphttp: + otlphttp: endpoint: http://sonic-server-tempo:4318 prometheusremotewrite: - endpoint: http://sonic-server-prometheus-server:9090/api/v1/write + endpoint: http://prometheus-server:9090/api/v1/write + tempo: enabled: true tempo: metricsGenerator: enabled: true - remoteWriteUrl: http://sonic-server-prometheus-server:9090/api/v1/write \ No newline at end of file + remoteWriteUrl: http://prometheus-server:9090/api/v1/write diff --git a/values/values-nautilus-cms.yaml b/values/values-nautilus-cms.yaml index 82850c4..b63169e 100644 --- a/values/values-nautilus-cms.yaml +++ b/values/values-nautilus-cms.yaml @@ -18,7 +18,6 @@ triton: --strict-model-config=false \ 
--exit-timeout-secs=60 \ --backend-config=onnxruntime,enable-global-threadpool=1 -# --model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoEgamma/EgammaPhotonProducers/data/models/ \ resources: limits: { cpu: 1, memory: 3G, nvidia.com/gpu: 1} From fe5bfb11f83f4d031a9b03b37ae4024966f29746 Mon Sep 17 00:00:00 2001 From: kondratyevd Date: Fri, 1 Aug 2025 14:20:15 -0400 Subject: [PATCH 2/3] increase CI timeout --- .github/workflows/ci-full.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci-full.yaml b/.github/workflows/ci-full.yaml index b1b5526..90042f6 100644 --- a/.github/workflows/ci-full.yaml +++ b/.github/workflows/ci-full.yaml @@ -99,7 +99,7 @@ jobs: - name: Run Perf Analyzer Job run: | kubectl apply -f ci/perf-analyzer-job.yaml - kubectl wait --for=condition=complete job/perf-analyzer-job -n cms --timeout=240s || \ + kubectl wait --for=condition=complete job/perf-analyzer-job -n cms --timeout=300s || \ (echo "Perf-analyzer job did not complete in time or failed." && exit 1) POD_NAME=$(kubectl get pods -n cms -l job-name=perf-analyzer-job -o jsonpath="{.items[0].metadata.name}") From c9ca1c2325db157fb768376c5d2842106f0f0335 Mon Sep 17 00:00:00 2001 From: kondratyevd Date: Fri, 1 Aug 2025 14:26:24 -0400 Subject: [PATCH 3/3] bump version in Chart.yaml --- helm/supersonic/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/supersonic/Chart.yaml b/helm/supersonic/Chart.yaml index 00440c2..f47c428 100644 --- a/helm/supersonic/Chart.yaml +++ b/helm/supersonic/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v2 name: supersonic description: Server infrastructure for inference-as-a-service in large scientific experiments. icon: https://github.com/fastmachinelearning/SuperSONIC/blob/main/docs/img/SuperSONIC_small_512.png?raw=true -version: 0.2.1 +version: 0.3.0 type: application home: https://fastmachinelearning.org/SuperSONIC/ annotations: