diff --git a/cmd/do-agent/aggregation.go b/cmd/do-agent/aggregation.go index 847a7fb0..23f0ab31 100644 --- a/cmd/do-agent/aggregation.go +++ b/cmd/do-agent/aggregation.go @@ -95,3 +95,73 @@ var k8sAggregationSpec = map[string][]string{ var mongoAggregationSpec = map[string][]string{ "mongoagent_data_usage_percentage": {"cluster_uuid"}, } + +// amdAggregatedLabels is the label list that every AMD GPU metric in gpuAggregationSpec aggregates on. +// NOTE(review): the intent appears to be that only gpu_id and hostname are kept afterwards; confirm against the aggregator. +var amdAggregatedLabels = []string{ + "card_model", "card_series", "card_vendor", "cluster_name", "container", + "driver_version", "gpu_compute_partition_type", "gpu_memory_partition_type", + "gpu_partition_id", "gpu_uuid", "job_id", "job_partition", "job_user", + "namespace", "pod", "serial_number", "usergroup_id", "vbios_version", "workload_id", +} + +var gpuAggregationSpec = map[string][]string{ + // GPU Utilization metrics + "amd_gpu_prof_gui_util_percent": amdAggregatedLabels, + "amd_gpu_prof_valu_pipe_issue_util": amdAggregatedLabels, + "amd_gpu_prof_tensor_active_percent": amdAggregatedLabels, + "amd_gpu_prof_occupancy_percent": amdAggregatedLabels, + "amd_gpu_prof_fetch_size": amdAggregatedLabels, + "amd_gpu_prof_write_size": amdAggregatedLabels, + + // GPU VRAM usage metrics + "amd_gpu_used_vram": amdAggregatedLabels, + "amd_gpu_total_vram": amdAggregatedLabels, + "amd_gpu_free_vram": amdAggregatedLabels, + + // XGMI Bandwidth metrics - all neighbors 0-7 + "amd_xgmi_neighbor_0_tx_throughput": amdAggregatedLabels, + "amd_xgmi_neighbor_1_tx_throughput": amdAggregatedLabels, + "amd_xgmi_neighbor_2_tx_throughput": amdAggregatedLabels, + "amd_xgmi_neighbor_3_tx_throughput": amdAggregatedLabels, + "amd_xgmi_neighbor_4_tx_throughput": amdAggregatedLabels, + "amd_xgmi_neighbor_5_tx_throughput": amdAggregatedLabels, + "amd_xgmi_neighbor_6_tx_throughput": amdAggregatedLabels, + "amd_xgmi_neighbor_7_tx_throughput": amdAggregatedLabels, + + "amd_xgmi_neighbor_0_response_tx": amdAggregatedLabels, + 
"amd_xgmi_neighbor_1_response_tx": amdAggregatedLabels, + "amd_xgmi_neighbor_2_response_tx": amdAggregatedLabels, + "amd_xgmi_neighbor_3_response_tx": amdAggregatedLabels, + "amd_xgmi_neighbor_4_response_tx": amdAggregatedLabels, + "amd_xgmi_neighbor_5_response_tx": amdAggregatedLabels, + "amd_xgmi_neighbor_6_response_tx": amdAggregatedLabels, + "amd_xgmi_neighbor_7_response_tx": amdAggregatedLabels, + + "amd_xgmi_neighbor_0_request_tx": amdAggregatedLabels, + "amd_xgmi_neighbor_1_request_tx": amdAggregatedLabels, + "amd_xgmi_neighbor_2_request_tx": amdAggregatedLabels, + "amd_xgmi_neighbor_3_request_tx": amdAggregatedLabels, + "amd_xgmi_neighbor_4_request_tx": amdAggregatedLabels, + "amd_xgmi_neighbor_5_request_tx": amdAggregatedLabels, + "amd_xgmi_neighbor_6_request_tx": amdAggregatedLabels, + "amd_xgmi_neighbor_7_request_tx": amdAggregatedLabels, + + // PCIe bandwidth + "amd_pcie_bandwidth": amdAggregatedLabels, + + "amd_gpu_ecc_uncorrect_total": amdAggregatedLabels, + "amd_pcie_replay_count": amdAggregatedLabels, + "amd_pcie_recovery_count": amdAggregatedLabels, + "amd_pcie_replay_rollover_count": amdAggregatedLabels, + "amd_pcie_max_speed": amdAggregatedLabels, + "amd_pcie_speed": amdAggregatedLabels, + "amd_gpu_prof_cpf_cpf_stat_stall": amdAggregatedLabels, + "amd_gpu_clock": amdAggregatedLabels, + "amd_gpu_violation_proc_hot_residency_accumulated": amdAggregatedLabels, + "amd_gpu_violation_soc_thermal_residency_accumulated": amdAggregatedLabels, + "amd_gpu_violation_ppt_residency_accumulated": amdAggregatedLabels, + "amd_gpu_violation_hbm_thermal_residency_accumulated": amdAggregatedLabels, + "amd_gpu_violation_vr_thermal_tracking_accumulated": amdAggregatedLabels, + "amd_gpu_junction_temperature": amdAggregatedLabels, +} diff --git a/cmd/do-agent/config.go b/cmd/do-agent/config.go index 4da83090..278866fd 100644 --- a/cmd/do-agent/config.go +++ b/cmd/do-agent/config.go @@ -10,11 +10,11 @@ import ( "time" "github.com/alecthomas/kingpin/v2" + 
"github.com/digitalocean/do-agent/internal/flags" "github.com/prometheus/client_golang/prometheus" dto "github.com/prometheus/client_model/go" "github.com/prometheus/common/model" - "github.com/digitalocean/do-agent/internal/flags" "github.com/digitalocean/do-agent/internal/log" "github.com/digitalocean/do-agent/internal/process" "github.com/digitalocean/do-agent/pkg/clients/tsclient" @@ -46,6 +46,7 @@ var ( defaultMaxBatchSize int defaultMaxMetricLength int promAddr string + gpuMetricsPath string topK int scrapeTimeout time.Duration } @@ -123,6 +124,9 @@ func init() { kingpin.Flag("metrics-path", "enable metrics collection from a prometheus endpoint"). StringVar(&config.promAddr) + kingpin.Flag("gpu-metrics-path", "enable GPU metrics collection from a prometheus endpoint (e.g., AMD device-metrics-exporter)"). + StringVar(&config.gpuMetricsPath) + + kingpin.Flag("web.listen", "enable a local endpoint for scrapeable prometheus metrics as well"). Default("false"). BoolVar(&config.webListen) @@ -155,6 +159,7 @@ func initConfig() { // parse all command line flags which are defined across the app kingpin.HelpFlag.Short('h') kingpin.Parse() + } func checkConfig() error { @@ -247,6 +252,12 @@ func initAggregatorSpecs() map[string][]string { } } + if config.gpuMetricsPath != "" { + for k, v := range gpuAggregationSpec { + aggregateSpecs[k] = append(aggregateSpecs[k], v...) 
+ } + } + return aggregateSpecs } @@ -321,6 +332,15 @@ func initCollectors() []prometheus.Collector { } } + if config.gpuMetricsPath != "" { + gpu, err := collector.NewScraper("gpu", config.gpuMetricsPath, nil, gpuWhitelist, collector.WithTimeout(config.scrapeTimeout)) + if err != nil { + log.Error("Failed to initialize GPU metrics collector: %+v", err) + } else { + cols = append(cols, gpu) + } + } + // create the default DO agent to collect metrics about // this device if !config.noNode { diff --git a/cmd/do-agent/whitelist.go b/cmd/do-agent/whitelist.go index 5ff50d69..7341ca6e 100644 --- a/cmd/do-agent/whitelist.go +++ b/cmd/do-agent/whitelist.go @@ -163,3 +163,64 @@ var dbaasWhitelist = map[string]bool{ "opensearch_http_total_opened": true, } + +var gpuWhitelist = map[string]bool{ + // GPU Utilization (this map lists the same metric names as the keys of gpuAggregationSpec) + "amd_gpu_prof_gui_util_percent": true, + "amd_gpu_prof_valu_pipe_issue_util": true, + "amd_gpu_prof_tensor_active_percent": true, + "amd_gpu_prof_occupancy_percent": true, + "amd_gpu_prof_fetch_size": true, + "amd_gpu_prof_write_size": true, + + // GPU VRAM usage + "amd_gpu_used_vram": true, + "amd_gpu_total_vram": true, + "amd_gpu_free_vram": true, + + // XGMI Bandwidth + "amd_xgmi_neighbor_0_tx_throughput": true, + "amd_xgmi_neighbor_1_tx_throughput": true, + "amd_xgmi_neighbor_2_tx_throughput": true, + "amd_xgmi_neighbor_3_tx_throughput": true, + "amd_xgmi_neighbor_4_tx_throughput": true, + "amd_xgmi_neighbor_5_tx_throughput": true, + "amd_xgmi_neighbor_6_tx_throughput": true, + "amd_xgmi_neighbor_7_tx_throughput": true, + + "amd_xgmi_neighbor_0_response_tx": true, + "amd_xgmi_neighbor_1_response_tx": true, + "amd_xgmi_neighbor_2_response_tx": true, + "amd_xgmi_neighbor_3_response_tx": true, + "amd_xgmi_neighbor_4_response_tx": true, + "amd_xgmi_neighbor_5_response_tx": true, + "amd_xgmi_neighbor_6_response_tx": true, + "amd_xgmi_neighbor_7_response_tx": true, + + 
"amd_xgmi_neighbor_0_request_tx": true, + "amd_xgmi_neighbor_1_request_tx": true, + 
"amd_xgmi_neighbor_2_request_tx": true, + "amd_xgmi_neighbor_3_request_tx": true, + "amd_xgmi_neighbor_4_request_tx": true, + "amd_xgmi_neighbor_5_request_tx": true, + "amd_xgmi_neighbor_6_request_tx": true, + "amd_xgmi_neighbor_7_request_tx": true, + + // PCIe bandwidth + "amd_pcie_bandwidth": true, + + "amd_gpu_ecc_uncorrect_total": true, + "amd_pcie_replay_count": true, + "amd_pcie_recovery_count": true, + "amd_pcie_replay_rollover_count": true, + "amd_pcie_max_speed": true, + "amd_pcie_speed": true, + "amd_gpu_prof_cpf_cpf_stat_stall": true, + "amd_gpu_clock": true, + "amd_gpu_violation_proc_hot_residency_accumulated": true, + "amd_gpu_violation_soc_thermal_residency_accumulated": true, + "amd_gpu_violation_ppt_residency_accumulated": true, + "amd_gpu_violation_hbm_thermal_residency_accumulated": true, + "amd_gpu_violation_vr_thermal_tracking_accumulated": true, + "amd_gpu_junction_temperature": true, +}