Skip to content

Commit da56a3b

Browse files
committed
Whitelist Nvidia metrics
1 parent c9935ad commit da56a3b

File tree

2 files changed

+16
-0
lines changed

2 files changed

+16
-0
lines changed

cmd/do-agent/aggregation.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,17 @@ var amdAggregatedLabels = []string{
105105
"namespace", "pod", "serial_number", "usergroup_id", "vbios_version", "workload_id",
106106
}
107107

108+
// nvidiaAggregatedLabels contains all the labels we want to aggregate on for NVIDIA GPU metrics.
109+
// These labels are aggregated away, so the resulting series keep only gpu, UUID, and Hostname.
110+
var nvidiaAggregatedLabels = []string{
	// One label per line so future additions show up as single-line diffs.
	"pci_bus_id",
	"device",
	"modelName",
	"DCGM_FI_DRIVER_VERSION",
}
111+
108112
var gpuAggregationSpec = map[string][]string{
113+
// NVIDIA GPU metrics
114+
"DCGM_FI_DEV_GPU_UTIL": nvidiaAggregatedLabels,
115+
"DCGM_FI_DEV_FB_FREE": nvidiaAggregatedLabels,
116+
"DCGM_FI_DEV_FB_USED": nvidiaAggregatedLabels,
117+
"DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL": nvidiaAggregatedLabels,
118+
109119
// GPU Utilization metrics
110120
"amd_gpu_prof_gui_util_percent": amdAggregatedLabels,
111121
"amd_gpu_prof_valu_pipe_issue_util": amdAggregatedLabels,

cmd/do-agent/whitelist.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,12 @@ var dbaasWhitelist = map[string]bool{
165165
}
166166

167167
var gpuWhitelist = map[string]bool{
168+
// NVIDIA GPU metrics
169+
"DCGM_FI_DEV_GPU_UTIL": true,
170+
"DCGM_FI_DEV_FB_FREE": true,
171+
"DCGM_FI_DEV_FB_USED": true,
172+
"DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL": true,
173+
168174
// GPU Utilization
169175
"amd_gpu_prof_gui_util_percent": true,
170176
"amd_gpu_prof_valu_pipe_issue_util": true,

0 commit comments

Comments
 (0)