From 3c15b8602b6fea3fee5b6cd14adb419d1dea4e95 Mon Sep 17 00:00:00 2001 From: Conner Swann <2635475+yourbuddyconner@users.noreply.github.com> Date: Thu, 16 Jan 2025 16:17:12 -0800 Subject: [PATCH] gpu telemetry --- Cargo.lock | 89 +++++++++++++++++++++++++++++++- Cargo.toml | 3 +- src/main.rs | 2 + src/telemetry.rs | 132 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 224 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3d4918a..6802492 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1039,7 +1039,7 @@ checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" dependencies = [ "glob", "libc", - "libloading", + "libloading 0.8.6", ] [[package]] @@ -1297,6 +1297,41 @@ dependencies = [ "cipher", ] +[[package]] +name = "darling" +version = "0.20.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f63b86c8a8826a49b8c21f08a2d07338eec8d900540f8630dc76284be802989" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.20.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95133861a8032aaea082871032f5815eb9e98cef03fa916ab4500513994df9e5" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn 2.0.95", +] + +[[package]] +name = "darling_macro" +version = "0.20.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806" +dependencies = [ + "darling_core", + "quote", + "syn 2.0.95", +] + [[package]] name = "dashu" version = "0.4.2" @@ -3110,6 +3145,12 @@ dependencies = [ "syn 2.0.95", ] +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + [[package]] name = "idna" version = "1.0.3" @@ -3450,6 +3491,16 @@ dependencies = [ "pkg-config", ] +[[package]] +name = "libloading" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b67380fd3b2fbe7527a606e18729d21c6f3951633d0500574c4dc22d2d638b9f" +dependencies = [ + "cfg-if", + "winapi", +] + [[package]] name = "libloading" version = "0.8.6" @@ -3860,6 +3911,29 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" +[[package]] +name = "nvml-wrapper" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7cd21b9f5a1cce3c3515c9ffa85f5c7443e07162dae0ccf4339bb7ca38ad3454" +dependencies = [ + "bitflags 1.3.2", + "libloading 0.7.4", + "nvml-wrapper-sys", + "static_assertions", + "thiserror 1.0.69", + "wrapcenum-derive", +] + +[[package]] +name = "nvml-wrapper-sys" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c961a2ea9e91c59a69b78e69090f6f5b867bb46c0c56de9482da232437c4987e" +dependencies = [ + "libloading 0.7.4", +] + [[package]] name = "objc" version = "0.2.7" @@ -7545,6 +7619,18 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "wrapcenum-derive" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a76ff259533532054cfbaefb115c613203c73707017459206380f03b3b3f266e" +dependencies = [ + "darling", + "proc-macro2", + "quote", + "syn 2.0.95", +] + [[package]] name = "write16" version = "1.0.0" @@ -7734,6 +7820,7 @@ dependencies = [ "ethers 2.0.14 (git+https://github.com/yetanotherco/ethers-rs.git?tag=v2.0.15-fix-reconnections)", "hex", "log", + "nvml-wrapper", "regex", "reqwest 0.11.27", "risc0-zkvm", diff --git a/Cargo.toml b/Cargo.toml index 74045b8..b16152b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -38,4 +38,5 @@ dirs = "5.0.0" serde_json = "1.0" sysinfo = "0.33.1" serde = { version = "1.0", features = ["derive"] } -chrono = "0.4" \ No newline at end of file +chrono = "0.4" +nvml-wrapper = "0.9.0" \ No newline at end of file diff --git a/src/main.rs b/src/main.rs index 3297b1b..9f5113e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -41,6 +41,7 @@ async fn main() -> io::Result<()> { let telemetry = TelemetryCollector::new( "SP1", args.precompiles, + args.gpu, args.enable_telemetry, &args.guest_path, ); @@ -308,6 +309,7 @@ async fn main() -> io::Result<()> { let telemetry = TelemetryCollector::new( "RISC0", args.precompiles, + args.gpu, args.enable_telemetry, &args.guest_path, ); diff --git a/src/telemetry.rs b/src/telemetry.rs index 8f83ea6..d5174db 100644 --- a/src/telemetry.rs +++ b/src/telemetry.rs @@ -1,4 +1,5 @@ use log::{debug, info}; +use nvml_wrapper::Nvml; use serde::Serialize; use std::fs; use std::io::Read; @@ -66,6 +67,13 @@ pub struct ResourceMetrics { samples: usize, } +#[derive(Default, Serialize, Clone)] +pub struct GpuInfo { + pub name: String, + pub memory_total_kb: Option, + pub vendor: String, +} + #[derive(Default, Serialize, Clone)] pub struct SystemInfo { pub os_name: String, @@ -75,8 +83,11 @@ pub struct SystemInfo { pub cpu_brand: String, pub cpu_count: usize, pub cpu_frequency_mhz: u64, + pub gpu_enabled: bool, + pub gpus: Vec, pub is_ec2: bool, pub ec2_instance_type: Option, + pub llvm_version: Option, } #[derive(Default, Serialize, Clone)] @@ -85,6 +96,7 @@ pub struct TelemetryData { pub resources: ResourceMetrics, pub proving_system: String, pub precompiles_enabled: bool, + pub gpu_enabled: bool, pub program: ProgramInfo, pub zk_metrics: ZkMetrics, pub system_info: SystemInfo, @@ -102,6 +114,7 @@ impl TelemetryCollector { pub fn new( proving_system: &str, precompiles_enabled: bool, + gpu_enabled: bool, enabled: bool, guest_path: &str, ) -> Self { @@ -147,6 +160,21 @@ impl TelemetryCollector { // Fetch EC2 metadata let (is_ec2, ec2_instance_type) = Self::fetch_ec2_metadata(); + // Discover GPUs + let gpus = if gpu_enabled { + Self::discover_gpus() + } else { + Vec::new() + }; + + // Get LLVM version + let llvm_version = Self::get_llvm_version(); + if let Some(version) = &llvm_version { + debug!("Detected LLVM version: {}", version); + } else { + debug!("Could not detect LLVM version"); + } + // Collect system information let system_info = SystemInfo { os_name: System::name().unwrap_or_else(|| "unknown".to_string()), @@ -160,13 +188,17 @@ impl TelemetryCollector { .unwrap_or_else(|| "unknown".to_string()), cpu_count: system.cpus().len(), cpu_frequency_mhz: cpu_frequency, + gpu_enabled, + gpus, is_ec2, ec2_instance_type, + llvm_version, }; let metrics = TelemetryData { proving_system: proving_system.to_string(), precompiles_enabled, + gpu_enabled, program: ProgramInfo { file_path: guest_path.to_string(), file_name, @@ -329,6 +361,84 @@ impl TelemetryCollector { } } + fn discover_gpus() -> Vec { + let mut gpus = Vec::new(); + + // Try to initialize NVIDIA Management Library + match Nvml::init() { + Ok(nvml) => { + // Get all NVIDIA devices + match nvml.device_count() { + Ok(device_count) => { + debug!("Found {} NVIDIA GPU(s)", device_count); + for i in 0..device_count { + if let Ok(device) = nvml.device_by_index(i) { + let mut gpu_info = GpuInfo { + vendor: "NVIDIA".to_string(), + name: device + .name() + .unwrap_or_else(|_| "Unknown NVIDIA GPU".to_string()), + memory_total_kb: None, + }; + + // Get memory information + if let Ok(memory) = device.memory_info() { + gpu_info.memory_total_kb = Some(memory.total / 1024); + } + + gpus.push(gpu_info); + } + } + } + Err(e) => { + debug!("Failed to get NVIDIA GPU count: {}", e); + } + } + } + Err(e) => { + debug!("Failed to initialize NVIDIA GPU detection: {}", e); + } + } + + if gpus.is_empty() { + debug!("No GPUs detected"); + } + + gpus + } + + fn get_llvm_version() -> Option { + // Try to get LLVM version using llvm-config + if let Ok(output) = std::process::Command::new("llvm-config") + .arg("--version") + .output() + { + if output.status.success() { + if let Ok(version) = String::from_utf8(output.stdout) { + return Some(version.trim().to_string()); + } + } + } + + // Try to get LLVM version using clang + if let Ok(output) = std::process::Command::new("clang") + .arg("--version") + .output() + { + if output.status.success() { + if let Ok(version) = String::from_utf8(output.stdout) { + if let Some(v) = version.lines().next() { + if let Some(idx) = v.find("version") { + return Some(v[idx..].trim().to_string()); + } + } + } + } + } + + None + } + pub fn record_workspace_setup(&self, duration: Duration) { if !self.enabled { return; @@ -527,6 +637,28 @@ impl TelemetryCollector { "Total Memory: {} KB", final_metrics.system_info.total_memory_kb ); + if let Some(llvm_version) = &final_metrics.system_info.llvm_version { + info!("LLVM Version: {}", llvm_version); + } else { + info!("LLVM Version: Not detected"); + } + info!( + "GPU Acceleration: {}", + if final_metrics.system_info.gpu_enabled { + "Enabled" + } else { + "Disabled" + } + ); + + if !final_metrics.system_info.gpus.is_empty() { + for (i, gpu) in final_metrics.system_info.gpus.iter().enumerate() { + info!("GPU {}: {} ({})", i + 1, gpu.name, gpu.vendor); + if let Some(total) = gpu.memory_total_kb { + info!(" Memory Total: {} KB", total); + } + } + } // Log Guest Cargo metadata let metadata = &final_metrics.program.guest_metadata;