Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci-full.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ jobs:
- name: Run Perf Analyzer Job
run: |
kubectl apply -f ci/perf-analyzer-job.yaml
kubectl wait --for=condition=complete job/perf-analyzer-job -n cms --timeout=240s || \
kubectl wait --for=condition=complete job/perf-analyzer-job -n cms --timeout=300s || \
(echo "Perf-analyzer job did not complete in time or failed." && exit 1)

POD_NAME=$(kubectl get pods -n cms -l job-name=perf-analyzer-job -o jsonpath="{.items[0].metadata.name}")
Expand Down
1 change: 0 additions & 1 deletion docs/advanced-monitoring.rst
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,6 @@ shows how tracing is configured for CMS SuperSONIC instance:
--model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoBTag/Combined/data/models/ \
--model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoTauTag/TrainingFiles/data/DeepTauIdSONIC/ \
--model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoMET/METPUSubtraction/data/models/ \
--model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoEgamma/EgammaPhotonProducers/data/models/ \
--trace-config mode=opentelemetry \
--trace-config=opentelemetry,resource=pod_name=$(hostname) \
--trace-config opentelemetry,url=supersonic-opentelemetry-collector:4318/v1/traces \
Expand Down
3 changes: 1 addition & 2 deletions docs/configuration-guide.rst
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ Triton version must be specified in the ``triton.image`` parameter in the values
- |
/opt/tritonserver/bin/tritonserver \
--model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoBTag/Combined/data/models/ \
--model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoEgamma/EgammaPhotonProducers/data/models/ \
--model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoTauTag/TrainingFiles/data/DeepTauIdSONIC/ \
--model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoMET/METPUSubtraction/data/models/ \
--allow-gpu-metrics=true \
Expand Down Expand Up @@ -90,7 +89,7 @@ Triton version must be specified in the ``triton.image`` parameter in the values
<br><br>


3. Select Resources for Triton Pods
1. Select Resources for Triton Pods
=============================================

- You can configure CPU, memory, and GPU resources for Triton pods via the ``triton.resources`` parameter in the values file:
Expand Down
2 changes: 1 addition & 1 deletion helm/supersonic/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ apiVersion: v2
name: supersonic
description: Server infrastructure for inference-as-a-service in large scientific experiments.
icon: https://github.com/fastmachinelearning/SuperSONIC/blob/main/docs/img/SuperSONIC_small_512.png?raw=true
version: 0.2.1
version: 0.3.0
type: application
home: https://fastmachinelearning.org/SuperSONIC/
annotations:
Expand Down
4 changes: 1 addition & 3 deletions values/values-anvil-cms.yaml
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
triton:
# image: fastml/triton-torchgeo:21.02-py3-geometric # run2
image: fastml/triton-torchgeo:22.07-py3-geometric # run3
image: nvcr.io/nvidia/tritonserver:24.11-py3
command: ["/bin/sh", "-c"]
args:
- |
/opt/tritonserver/bin/tritonserver \
--model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoBTag/Combined/data/models/ \
--model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoEgamma/EgammaPhotonProducers/data/models/ \
--model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoTauTag/TrainingFiles/data/DeepTauIdSONIC/ \
--model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoMET/METPUSubtraction/data/models/ \
--allow-gpu-metrics=true \
Expand Down
73 changes: 38 additions & 35 deletions values/values-geddes-cms.yaml
Original file line number Diff line number Diff line change
@@ -1,31 +1,30 @@
serverLoadThreshold: 20
serverLoadMetric: 'sum by (release) (rate(nv_inference_queue_duration_us{release=~"sonic-server"}[30s]) / (rate(nv_inference_exec_count{release=~"sonic-server"}[30s]) * 1000 + 0.001))'

triton:
# image: fastml/triton-torchgeo:21.02-py3-geometric # run2
image: fastml/triton-torchgeo:22.07-py3-geometric # run3
triton:
image: nvcr.io/nvidia/tritonserver:24.11-py3
command: ["/bin/sh", "-c"]
args:
args:
- |
/opt/tritonserver/bin/tritonserver \
--model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoBTag/Combined/data/models/ \
--model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoEgamma/EgammaPhotonProducers/data/models/ \
--model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoTauTag/TrainingFiles/data/DeepTauIdSONIC/ \
--model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoMET/METPUSubtraction/data/models/ \
--trace-config mode=opentelemetry \
--trace-config=opentelemetry,resource=pod_name=$(hostname) \
--trace-config opentelemetry,url=sonic-server-opentelemetry-collector:4318/v1/traces \
--trace-config rate=100 \
--trace-config level=TIMESTAMPS \
--trace-config count=-1 \
--allow-gpu-metrics=true \
--log-verbose=0 \
--strict-model-config=false \
--exit-timeout-secs=60
# --trace-config mode=opentelemetry
# --trace-config=opentelemetry,resource=pod_name=$(hostname)
# --trace-config opentelemetry,url=sonic-server-opentelemetry-collector:4318/v1/traces
# --trace-config rate=100 # 1 in 100 requests
# --trace-config level=TIMESTAMPS
# --trace-config count=-1

resources:
limits: { nvidia.com/gpu: 1, cpu: 2, memory: 4G}
requests: { nvidia.com/gpu: 1, cpu: 2, memory: 4G}
nodeSelector: {'cms-af-prod': 'true'}
limits: { nvidia.com/gpu: 1, cpu: 2, memory: 4G }
requests: { nvidia.com/gpu: 1, cpu: 2, memory: 4G }
nodeSelector: { "cms-af-prod": "true" }
tolerations:
- key: hub.jupyter.org/dedicated
operator: Equal
Expand All @@ -43,7 +42,7 @@ triton:

envoy:
enabled: true
nodeSelector: {'cms-af-prod': 'true'}
nodeSelector: { "cms-af-prod": "true" }
tolerations:
- key: hub.jupyter.org/dedicated
operator: Equal
Expand All @@ -56,30 +55,33 @@ envoy:
enabled: true
hostName: sonic-cms.geddes.rcac.purdue.edu
ingressClassName: public
rate_limiter:
prometheus_based:
enabled: true
tracing_sampling_rate: 0.01

keda:
enabled: true
minReplicaCount: 1
maxReplicaCount: 7
maxReplicaCount: 11
scaleUp:
stabilizationWindowSeconds: 30
periodSeconds: 15
stepsize: 1
scaleDown:
stabilizationWindowSeconds: 45
periodSeconds: 45
stepsize: 1

ingress:
enabled: false

prometheus:
enabled: true
server:
useExistingClusterRoleName: sonic-server-prometheus-role
ingress:
enabled: true
hosts:
- prometheus-cms.geddes.rcac.purdue.edu
tls:
- hosts:
- prometheus-cms.geddes.rcac.purdue.edu
ingressClassName: public
serviceAccounts:
server:
name: sonic-server-prometheus-sa
external:
enabled: true
url: prometheus-af.geddes.rcac.purdue.edu
port: 443
scheme: https

grafana:
enabled: true
Expand All @@ -92,7 +94,7 @@ grafana:
type: prometheus
access: proxy
isDefault: true
url: http://sonic-server-prometheus-server:9090
url: https://prometheus-af.geddes.rcac.purdue.edu
jsonData:
timeInterval: "5s"
tlsSkipVerify: true
Expand All @@ -106,7 +108,7 @@ grafana:
timeInterval: "5s"
tlsSkipVerify: true
serviceMap:
datasourceUid: 'prometheus'
datasourceUid: "prometheus"
nodeGraph:
enabled: true
ingress:
Expand All @@ -127,13 +129,14 @@ opentelemetry-collector:
exporters:
otlp:
endpoint: http://sonic-server-tempo:4317
otlphttp:
otlphttp:
endpoint: http://sonic-server-tempo:4318
prometheusremotewrite:
endpoint: http://sonic-server-prometheus-server:9090/api/v1/write
endpoint: http://prometheus-server:9090/api/v1/write

tempo:
enabled: true
tempo:
metricsGenerator:
enabled: true
remoteWriteUrl: http://sonic-server-prometheus-server:9090/api/v1/write
remoteWriteUrl: http://prometheus-server:9090/api/v1/write
1 change: 0 additions & 1 deletion values/values-nautilus-cms.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ triton:
--strict-model-config=false \
--exit-timeout-secs=60 \
--backend-config=onnxruntime,enable-global-threadpool=1
# --model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoEgamma/EgammaPhotonProducers/data/models/ \

resources:
limits: { cpu: 1, memory: 3G, nvidia.com/gpu: 1}
Expand Down
Loading