From 6d3b503c6dc86af09661a098b3e159156bc70cd9 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 17 Oct 2025 10:49:17 -0700 Subject: [PATCH 01/18] [support bundle] Refactor into tasks --- .../tasks/support_bundle_collector.rs | 682 ++++++++++++------ 1 file changed, 457 insertions(+), 225 deletions(-) diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 8dc13e7ab42..ee2224edba8 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -59,12 +59,11 @@ use std::future::Future; use std::io::Write; use std::num::NonZeroU64; use std::sync::Arc; -use std::sync::atomic::{AtomicUsize, Ordering}; use tokio::io::AsyncReadExt; use tokio::io::AsyncSeekExt; use tokio::io::AsyncWriteExt; use tokio::io::SeekFrom; -use tokio_util::task::AbortOnDropHandle; +use tokio::sync::OnceCell; use tufaceous_artifact::ArtifactHash; use uuid::Uuid; use zip::ZipArchive; @@ -428,8 +427,6 @@ impl SupportBundleCollector { request: request.clone(), bundle: bundle.clone(), transfer_chunk_size: request.transfer_chunk_size, - host_ereports_collected: AtomicUsize::new(0), - sp_ereports_collected: AtomicUsize::new(0), }); let authz_bundle = authz_support_bundle_from_id(bundle.id.into()); @@ -475,8 +472,60 @@ struct BundleCollection { request: BundleRequest, bundle: SupportBundle, transfer_chunk_size: NonZeroU64, - host_ereports_collected: AtomicUsize, - sp_ereports_collected: AtomicUsize, +} + +type CollectionStepFn = Box< + dyn for<'b> FnOnce( + &'b Arc, + &'b Utf8Path, + ) + -> BoxFuture<'b, anyhow::Result> + + Send, +>; + +enum CollectionStepOutput { + HostEreports(SupportBundleEreportStatus), + SpEreports(SupportBundleEreportStatus), + SavingSpDumps { listed_sps: bool }, + // NOTE: The ditinction between this and "Spawn" is pretty artificial - + // it's just to preserve a part of the report which says "we tried to + // list 
in-service sleds". + // + // If we changed the collection report, this could easily be combined + // with the "Spawn" variant. + SpawnSleds { extra_steps: Vec<(&'static str, CollectionStepFn)> }, + Spawn { extra_steps: Vec<(&'static str, CollectionStepFn)> }, + None, +} + +impl CollectionStepOutput { + // Updates the collection report based on the output of a collection step, + // and possibly extends the set of all steps to be executed. + fn process( + self, + report: &mut SupportBundleCollectionReport, + steps: &mut Vec<(&'static str, CollectionStepFn)>, + ) { + match self { + CollectionStepOutput::HostEreports(status) => { + report.host_ereports = status; + } + CollectionStepOutput::SpEreports(status) => { + report.sp_ereports = status; + } + CollectionStepOutput::SavingSpDumps { listed_sps } => { + report.listed_sps = listed_sps; + } + CollectionStepOutput::SpawnSleds { extra_steps } => { + report.listed_in_service_sleds = true; + steps.extend(extra_steps); + } + CollectionStepOutput::Spawn { extra_steps } => { + steps.extend(extra_steps); + } + CollectionStepOutput::None => (), + } + } } impl BundleCollection { @@ -656,37 +705,72 @@ impl BundleCollection { Ok(()) } - // Perform the work of collecting the support bundle into a temporary directory - // - // - "dir" is a directory where data can be stored. - // - "bundle" is metadata about the bundle being collected. - // - // If a partial bundle can be collected, it should be returned as - // an Ok(SupportBundleCollectionReport). Any failures from this function - // will prevent the support bundle from being collected altogether. - // - // NOTE: The background task infrastructure will periodically check to see - // if the bundle has been cancelled by a user while it is being collected. - // If that happens, this function will be CANCELLED at an await point. - // - // As a result, it is important that this function be implemented as - // cancel-safe. 
- async fn collect_bundle_as_file( + async fn run_collect_bundle_steps( self: &Arc, - dir: &Utf8TempDir, - ) -> anyhow::Result { - let log = &self.log; - - info!(&log, "Collecting bundle as local file"); + output: &Utf8TempDir, + mut steps: Vec<(&'static str, CollectionStepFn)>, + ) -> SupportBundleCollectionReport { let mut report = SupportBundleCollectionReport::new(self.bundle.id.into()); - tokio::fs::write( - dir.path().join("bundle_id.txt"), - self.bundle.id.to_string(), - ) - .await?; + const MAX_CONCURRENT_STEPS: usize = 16; + let mut tasks = + ParallelTaskSet::new_with_parallelism(MAX_CONCURRENT_STEPS); + + loop { + // Process all the currently-planned steps + while let Some((step_name, step)) = steps.pop() { + let previous_result = tasks.spawn({ + let collection = self.clone(); + let dir = output.path().to_path_buf(); + async move { + debug!(collection.log, "Running step"; "name" => &step_name); + step(&collection, dir.as_path()).await.inspect_err(|err| { + warn!( + collection.log, + "Step failed"; + "name" => &step_name, + InlineErrorChain::new(err.as_ref()), + ); + }) + } + }).await; + + if let Some(Ok(output)) = previous_result { + output.process(&mut report, &mut steps); + }; + } + + // If we've run out of tasks to spawn, join all the existing steps. + while let Some(previous_result) = tasks.join_next().await { + if let Ok(output) = previous_result { + output.process(&mut report, &mut steps); + }; + } + + // Executing steps may create additional steps, as follow-up work. + // + // Only finish if we've exhausted all possible steps and joined all spawned work. 
+ if steps.is_empty() { + return report; + } + } + } + async fn collect_bundle_id( + &self, + dir: &Utf8Path, + ) -> anyhow::Result { + tokio::fs::write(dir.join("bundle_id.txt"), self.bundle.id.to_string()) + .await?; + + Ok(CollectionStepOutput::None) + } + + async fn collect_reconfigurator_state( + &self, + dir: &Utf8Path, + ) -> anyhow::Result { // Collect reconfigurator state const NMAX_BLUEPRINTS: usize = 300; match reconfigurator_state_load( @@ -697,7 +781,7 @@ impl BundleCollection { .await { Ok(state) => { - let file_path = dir.path().join("reconfigurator_state.json"); + let file_path = dir.join("reconfigurator_state.json"); let file = std::fs::OpenOptions::new() .create(true) .write(true) @@ -713,7 +797,7 @@ impl BundleCollection { }, )?; info!( - log, + self.log, "Support bundle: collected reconfigurator state"; "target_blueprint" => ?state.target_blueprint, "num_blueprints" => state.blueprints.len(), @@ -722,152 +806,322 @@ impl BundleCollection { } Err(err) => { warn!( - log, + self.log, "Support bundle: failed to collect reconfigurator state"; "err" => ?err, ); } - } + }; + + Ok(CollectionStepOutput::None) + } - let ereport_collection = if let Some(ref ereport_filters) = - self.request.ereport_query + async fn collect_host_ereports( + self: &Arc, + dir: &Utf8Path, + ) -> anyhow::Result { + let Some(ref ereport_filters) = self.request.ereport_query else { + debug!(self.log, "Support bundle: ereports not requested"); + return Ok(CollectionStepOutput::None); + }; + let ereports_dir = dir.join("ereports"); + let status = match self + .save_host_ereports(ereport_filters.clone(), ereports_dir.clone()) + .await { - // If ereports are to be included in the bundle, have someone go do - // that in the background while we're gathering up other stuff. 
Note - // that the `JoinHandle`s for these tasks are wrapped in - // `AbortOnDropHandle`s for cancellation correctness; this ensures - // that if collecting the bundle is cancelled and this future is - // dropped, the tasks that we've spawned to collect ereports are - // aborted as well. - let dir = dir.path().join("ereports"); - let host = AbortOnDropHandle::new(tokio::spawn( - self.clone().collect_host_ereports( - ereport_filters.clone(), - dir.clone(), - ), - )); - let sp = AbortOnDropHandle::new(tokio::spawn( - self.clone().collect_sp_ereports(ereport_filters.clone(), dir), - )); - Some((host, sp)) - } else { - debug!(log, "Support bundle: ereports not requested"); - None + Ok(n_collected) => { + SupportBundleEreportStatus::Collected { n_collected } + } + Err((n_collected, err)) => { + warn!( + &self.log, + "Support bundle: host ereport collection failed \ + ({n_collected} collected successfully)"; + InlineErrorChain::new(err.as_ref()), + ); + + SupportBundleEreportStatus::Failed { + n_collected, + error: err.to_string(), + } + } }; - let all_sleds = self - .datastore - .sled_list_all_batched(&self.opctx, SledFilter::InService) - .await; + Ok(CollectionStepOutput::HostEreports(status)) + } - if let Ok(mgs_client) = self.create_mgs_client().await { - if let Err(e) = write_sled_info( - &self.log, - &mgs_client, - all_sleds.as_deref().ok(), - dir.path(), - ) + async fn collect_sp_ereports( + self: &Arc, + dir: &Utf8Path, + ) -> anyhow::Result { + let Some(ref ereport_filters) = self.request.ereport_query else { + debug!(self.log, "Support bundle: ereports not requested"); + return Ok(CollectionStepOutput::None); + }; + let ereports_dir = dir.join("ereports"); + let status = match self + .save_sp_ereports(ereport_filters.clone(), ereports_dir.clone()) .await - { - error!(log, "Failed to write sled_info.json"; "error" => InlineErrorChain::new(e.as_ref())); + { + Ok(n_collected) => { + SupportBundleEreportStatus::Collected { n_collected } } + Err((n_collected, err)) 
=> { + warn!( + &self.log, + "Support bundle: sp ereport collection failed \ + ({n_collected} collected successfully)"; + InlineErrorChain::new(err.as_ref()), + ); - let sp_dumps_dir = dir.path().join("sp_task_dumps"); - tokio::fs::create_dir_all(&sp_dumps_dir).await.with_context( - || { - format!( - "Failed to create SP task dump directory {sp_dumps_dir}" - ) - }, - )?; + SupportBundleEreportStatus::Failed { + n_collected, + error: err.to_string(), + } + } + }; - if let Err(e) = - save_all_sp_dumps(log, &mgs_client, &sp_dumps_dir).await - { - error!(log, "Failed to capture SP task dumps"; "error" => InlineErrorChain::new(e.as_ref())); - } else { - report.listed_sps = true; - }; - } else { - warn!(log, "No MGS client, skipping SP task dump collection"); - } + Ok(CollectionStepOutput::SpEreports(status)) + } - if let Ok(all_sleds) = all_sleds { - report.listed_in_service_sleds = true; + async fn get_or_initialize_mgs_client<'a>( + &self, + mgs_client: &'a OnceCell>>, + ) -> &'a Arc> { + mgs_client + .get_or_init(|| async { + Arc::new(self.create_mgs_client().await.ok()) + }) + .await + } - const MAX_CONCURRENT_SLED_REQUESTS: usize = 16; - const FAILURE_MESSAGE: &str = - "Failed to fully collect support bundle info from sled"; - let mut set = ParallelTaskSet::new_with_parallelism( - MAX_CONCURRENT_SLED_REQUESTS, + async fn get_or_initialize_all_sleds<'a>( + &self, + all_sleds: &'a OnceCell>>>, + ) -> &'a Arc>> { + all_sleds + .get_or_init(|| async { + Arc::new( + self.datastore + .sled_list_all_batched( + &self.opctx, + SledFilter::InService, + ) + .await + .ok(), + ) + }) + .await + } + + async fn collect_sled_cubby_info( + &self, + all_sleds: &OnceCell>>>, + mgs_client: &OnceCell>>, + dir: &Utf8Path, + ) -> anyhow::Result { + let Some(mgs_client) = + &**self.get_or_initialize_mgs_client(mgs_client).await + else { + warn!( + self.log, + "No MGS client, skipping sled cubby info collection" ); + return Ok(CollectionStepOutput::None); + }; + let nexus_sleds = self + 
.get_or_initialize_all_sleds(all_sleds) + .await + .as_deref() + .unwrap_or_default(); + + write_sled_cubby_info(&self.log, mgs_client, nexus_sleds, dir).await?; + + Ok(CollectionStepOutput::None) + } + + async fn spawn_sp_dump_collection( + &self, + mgs_client: &OnceCell>>, + dir: &Utf8Path, + ) -> anyhow::Result { + let Some(mgs_client) = + &**self.get_or_initialize_mgs_client(mgs_client).await + else { + warn!(self.log, "No MGS client, skipping SP task dump collection"); + return Ok(CollectionStepOutput::None); + }; - for sled in all_sleds { - let prev_result = set - .spawn({ - let collection: Arc = self.clone(); - let dir = dir.path().to_path_buf(); + let sp_dumps_dir = dir.join("sp_task_dumps"); + tokio::fs::create_dir_all(&sp_dumps_dir).await.with_context(|| { + format!("Failed to create SP task dump directory {sp_dumps_dir}") + })?; + + let mut extra_steps: Vec<(&'static str, CollectionStepFn)> = vec![]; + for sp in get_available_sps(&mgs_client).await? { + extra_steps.push(( + "sp dump", + Box::new({ + let mgs_client = mgs_client.clone(); + move |collection, dir| { async move { - collection.collect_data_from_sled(&sled, &dir).await + collection + .collect_sp_dump(&mgs_client, sp, dir) + .await } - }) - .await; - if let Some(Err(err)) = prev_result { - warn!(&self.log, "{FAILURE_MESSAGE}"; "err" => ?err); - } - } - while let Some(result) = set.join_next().await { - if let Err(err) = result { - warn!(&self.log, "{FAILURE_MESSAGE}"; "err" => ?err); - } - } + .boxed() + } + }), + )); } - if let Some((host, sp)) = ereport_collection { - let (host, sp) = tokio::join!(host, sp); - const TASK_FAILURE_MSG: &str = "task failed"; - let n_collected = - self.host_ereports_collected.load(Ordering::Acquire); - report.host_ereports = match host - .map_err(|e| anyhow::anyhow!("{TASK_FAILURE_MSG}: {e}")) - .and_then(|x| x) - { - Ok(_) => SupportBundleEreportStatus::Collected { n_collected }, - Err(err) => { - warn!( - &self.log, - "Support bundle: host ereport collection 
failed \ - ({n_collected} collected successfully)"; - "err" => ?err, - ); - SupportBundleEreportStatus::Failed { - n_collected, - error: err.to_string(), + Ok(CollectionStepOutput::Spawn { extra_steps }) + } + + async fn collect_sp_dump( + &self, + mgs_client: &MgsClient, + sp: SpIdentifier, + dir: &Utf8Path, + ) -> anyhow::Result { + save_sp_dumps(mgs_client, sp, dir) + .await + .with_context(|| format!("SP {} {}", sp.type_, sp.slot))?; + + Ok(CollectionStepOutput::SavingSpDumps { listed_sps: true }) + } + + // Perform the work of collecting the support bundle into a temporary directory + // + // - "dir" is a directory where data can be stored. + // - "bundle" is metadata about the bundle being collected. + // + // If a partial bundle can be collected, it should be returned as + // an Ok(SupportBundleCollectionReport). Any failures from this function + // will prevent the support bundle from being collected altogether. + // + // NOTE: The background task infrastructure will periodically check to see + // if the bundle has been cancelled by a user while it is being collected. + // If that happens, this function will be CANCELLED at an await point. + // + // As a result, it is important that this function be implemented as + // cancel-safe. 
+ async fn collect_bundle_as_file( + self: &Arc, + dir: &Utf8TempDir, + ) -> anyhow::Result { + let log = &self.log; + + info!(&log, "Collecting bundle as local file"); + + // Shared, lazy, fallible initialization for sleds + let all_sleds: OnceCell>>> = OnceCell::new(); + // Shared, lazy, fallible initialization for MGS client + let mgs_client: OnceCell>> = OnceCell::new(); + + let steps: Vec<(&str, CollectionStepFn)> = vec![ + ( + "bundle id", + Box::new(|collection, dir| { + collection.collect_bundle_id(dir).boxed() + }), + ), + ( + "reconfigurator state", + Box::new(|collection, dir| { + collection.collect_reconfigurator_state(dir).boxed() + }), + ), + ( + "host ereports", + Box::new(|collection, dir| { + collection.collect_host_ereports(dir).boxed() + }), + ), + ( + "sp ereports", + Box::new(|collection, dir| { + collection.collect_sp_ereports(dir).boxed() + }), + ), + ( + "sled cubby info", + Box::new({ + let all_sleds = all_sleds.clone(); + let mgs_client = mgs_client.clone(); + move |collection, dir| { + async move { + collection + .collect_sled_cubby_info( + &all_sleds, + &mgs_client, + dir, + ) + .await + } + .boxed() } - } - }; - let n_collected = - self.sp_ereports_collected.load(Ordering::Acquire); - report.sp_ereports = match sp - .map_err(|e| anyhow::anyhow!("{TASK_FAILURE_MSG}: {e}")) - .and_then(|x| x) - { - Ok(_) => SupportBundleEreportStatus::Collected { n_collected }, - Err(err) => { - warn!( - &self.log, - "Support bundle: SP ereport collection failed \ - ({n_collected} collected successfully)"; - "err" => ?err, - ); - SupportBundleEreportStatus::Failed { - n_collected, - error: err.to_string(), + }), + ), + ( + "spawn steps to query all sp dumps", + Box::new({ + let mgs_client = mgs_client.clone(); + move |collection, dir| { + async move { + collection + .spawn_sp_dump_collection(&mgs_client, dir) + .await + } + .boxed() } - } - }; + }), + ), + ( + "spawn steps to query all sleds", + Box::new({ + let all_sleds = all_sleds.clone(); + move 
|collection, _| { + async move { + collection.spawn_query_all_sleds(&all_sleds).await + } + .boxed() + } + }), + ), + ]; + + Ok(self.run_collect_bundle_steps(dir, steps).await) + } + + async fn spawn_query_all_sleds( + &self, + all_sleds: &OnceCell>>>, + ) -> anyhow::Result { + let Some(all_sleds) = + self.get_or_initialize_all_sleds(all_sleds).await.as_deref() + else { + warn!(self.log, "Could not read list of sleds"); + return Ok(CollectionStepOutput::None); + }; + + let mut extra_steps: Vec<(&'static str, CollectionStepFn)> = vec![]; + for sled in all_sleds { + extra_steps.push(( + "sled data", + Box::new({ + let sled = sled.clone(); + move |collection, dir| { + async move { + collection.collect_data_from_sled(&sled, dir).await + } + .boxed() + } + }), + )); } - Ok(report) + + return Ok(CollectionStepOutput::SpawnSleds { extra_steps }); } // Collect data from a sled, storing it into a directory that will @@ -880,7 +1134,7 @@ impl BundleCollection { &self, sled: &nexus_db_model::Sled, dir: &Utf8Path, - ) -> anyhow::Result<()> { + ) -> anyhow::Result { let log = &self.log; info!(&log, "Collecting bundle info from sled"; "sled" => %sled.id()); let sled_path = dir @@ -893,7 +1147,7 @@ impl BundleCollection { .await?; if self.request.skip_sled_info { - return Ok(()); + return Ok(CollectionStepOutput::None); } let Ok(sled_client) = nexus_networking::sled_client( @@ -909,7 +1163,7 @@ impl BundleCollection { "Could not contact sled", ) .await?; - return Ok(()); + return Ok(CollectionStepOutput::None); }; // NB: As new sled-diagnostic commands are added they should @@ -1014,14 +1268,15 @@ impl BundleCollection { error!(&self.log, "failed to write logs output: {e}"); } } - return Ok(()); + return Ok(CollectionStepOutput::None); } - async fn collect_sp_ereports( - self: Arc, + async fn save_host_ereports( + self: &Arc, filters: EreportFilters, dir: Utf8PathBuf, - ) -> anyhow::Result<()> { + ) -> Result { + let mut reports = 0; let mut paginator = Paginator::new( 
datastore::SQL_BATCH_SIZE, dropshot::PaginationOrder::Ascending, @@ -1029,40 +1284,50 @@ impl BundleCollection { while let Some(p) = paginator.next() { let ereports = self .datastore - .sp_ereports_fetch_matching( + .host_ereports_fetch_matching( &self.opctx, &filters, &p.current_pagparams(), ) .await .map_err(|e| { - e.internal_context("failed to query for SP ereports") + ( + reports, + e.internal_context( + "failed to query for host OS ereports", + ) + .into(), + ) })?; paginator = p.found_batch(&ereports, &|ereport| { (ereport.restart_id.into_untyped_uuid(), ereport.ena) }); - let n_ereports = ereports.len(); for ereport in ereports { - write_ereport(ereport.into(), &dir).await?; - self.sp_ereports_collected.fetch_add(1, Ordering::Release); + write_ereport(ereport.into(), &dir) + .await + .map_err(|e| (reports, e))?; + reports += 1; } - debug!(self.log, "Support bundle: added {n_ereports} SP ereports"); + debug!( + self.log, + "Support bundle: added {n_ereports} host OS ereports" + ); } info!( self.log, - "Support bundle: collected {} total SP ereports", - self.sp_ereports_collected.load(Ordering::Relaxed) + "Support bundle: collected {} total host ereports", reports ); - Ok(()) + Ok(reports) } - async fn collect_host_ereports( - self: Arc, + async fn save_sp_ereports( + self: &Arc, filters: EreportFilters, dir: Utf8PathBuf, - ) -> anyhow::Result<()> { + ) -> Result { + let mut reports = 0; let mut paginator = Paginator::new( datastore::SQL_BATCH_SIZE, dropshot::PaginationOrder::Ascending, @@ -1070,35 +1335,37 @@ impl BundleCollection { while let Some(p) = paginator.next() { let ereports = self .datastore - .host_ereports_fetch_matching( + .sp_ereports_fetch_matching( &self.opctx, &filters, &p.current_pagparams(), ) .await .map_err(|e| { - e.internal_context("failed to query for host OS ereports") + ( + reports, + e.internal_context("failed to query for SP ereports") + .into(), + ) })?; paginator = p.found_batch(&ereports, &|ereport| { 
(ereport.restart_id.into_untyped_uuid(), ereport.ena) }); let n_ereports = ereports.len(); for ereport in ereports { - write_ereport(ereport.into(), &dir).await?; - self.host_ereports_collected.fetch_add(1, Ordering::Release); + write_ereport(ereport.into(), &dir) + .await + .map_err(|e| (reports, e))?; + reports += 1; } - debug!( - self.log, - "Support bundle: added {n_ereports} host OS ereports" - ); + debug!(self.log, "Support bundle: added {n_ereports} SP ereports"); } info!( self.log, - "Support bundle: collected {} total host ereports", - self.host_ereports_collected.load(Ordering::Relaxed) + "Support bundle: collected {} total SP ereports", reports ); - Ok(()) + Ok(reports) } async fn create_mgs_client(&self) -> anyhow::Result { @@ -1396,40 +1663,6 @@ where Ok(()) } -/// Collect task dumps from all SPs via MGS and save them to a directory. -async fn save_all_sp_dumps( - log: &slog::Logger, - mgs_client: &MgsClient, - sp_dumps_dir: &Utf8Path, -) -> anyhow::Result<()> { - let available_sps = get_available_sps(&mgs_client).await?; - - let mut tasks = ParallelTaskSet::new(); - for sp in available_sps { - let mgs_client = mgs_client.clone(); - let sp_dumps_dir = sp_dumps_dir.to_owned(); - - tasks - .spawn(async move { - save_sp_dumps(mgs_client, sp, sp_dumps_dir) - .await - .with_context(|| format!("SP {} {}", sp.type_, sp.slot)) - }) - .await; - } - for result in tasks.join_all().await { - if let Err(e) = result { - error!( - log, - "failed to capture task dumps"; - "error" => InlineErrorChain::new(e.as_ref()) - ); - } - } - - Ok(()) -} - /// Use MGS ignition info to find active SPs. async fn get_available_sps( mgs_client: &MgsClient, @@ -1455,9 +1688,9 @@ async fn get_available_sps( /// Fetch and save task dumps from a single SP. 
async fn save_sp_dumps( - mgs_client: MgsClient, + mgs_client: &MgsClient, sp: SpIdentifier, - sp_dumps_dir: Utf8PathBuf, + sp_dumps_dir: &Utf8Path, ) -> anyhow::Result<()> { let dump_count = mgs_client .sp_task_dump_count(&sp.type_, sp.slot) @@ -1488,10 +1721,10 @@ async fn save_sp_dumps( /// Write a file with a JSON mapping of sled serial numbers to cubby and UUIDs for easier /// identification of sleds present in a bundle. -async fn write_sled_info( +async fn write_sled_cubby_info( log: &slog::Logger, mgs_client: &MgsClient, - nexus_sleds: Option<&[Sled]>, + nexus_sleds: &[Sled], dir: &Utf8Path, ) -> anyhow::Result<()> { #[derive(Serialize)] @@ -1506,7 +1739,6 @@ async fn write_sled_info( // We can still get a useful mapping of cubby to serial using just the data from MGS. let mut nexus_map: BTreeMap<_, _> = nexus_sleds - .unwrap_or_default() .into_iter() .map(|sled| (sled.serial_number(), sled)) .collect(); From 5abe573ba3ab32dd68dc74afb29691b9b6d4636e Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 17 Oct 2025 16:06:33 -0700 Subject: [PATCH 02/18] [support bundle] More structured data filtering --- .../tasks/support_bundle_collector.rs | 135 +++++++++++++++--- 1 file changed, 115 insertions(+), 20 deletions(-) diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index ee2224edba8..1e29e05b2d2 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -55,6 +55,7 @@ use serde_json::json; use sha2::{Digest, Sha256}; use slog_error_chain::InlineErrorChain; use std::collections::BTreeMap; +use std::collections::HashSet; use std::future::Future; use std::io::Write; use std::num::NonZeroU64; @@ -82,26 +83,83 @@ fn authz_support_bundle_from_id(id: SupportBundleUuid) -> authz::SupportBundle { authz::SupportBundle::new(authz::FLEET, id, LookupType::by_id(id)) } +// Describes how support bundle data is 
selected. +// +// Multiple values of this enum are joined together into a HashSet. +// Categories should be additive. +#[derive(Debug, Clone, Hash, Eq, PartialEq)] +enum BundleDataCategory { + Reconfigurator, + HostInfo, + SledCubbyInfo, + SpDumps, +} + +// The set of sleds to include +#[derive(Debug, Clone, Hash, Eq, PartialEq)] +enum SledSelection { + All, + Specific(SledUuid), +} + // Specifies the data to be collected within the Support Bundle. #[derive(Clone)] struct BundleRequest { - // If "false": Skip collecting host-specific info from each sled. - skip_sled_info: bool, - // The size of chunks to use when transferring a bundle from Nexus // to a sled agent. // // Typically, this is CHUNK_SIZE, but can be modified for testing. transfer_chunk_size: NonZeroU64, + // The set of data to be included within this bundle. + data_selection: HashSet, + + // The set of sets to be included within this bundle. + // + // NOTE: This selection is only considered if "data_selection" requests + // data from specific sleds. + sled_selection: HashSet, + + // The set of ereports to be included within this bundle. + // + // "None" causes ereports to be skipped. 
ereport_query: Option, } +impl BundleRequest { + fn include_reconfigurator_data(&self) -> bool { + self.data_selection.contains(&BundleDataCategory::Reconfigurator) + } + + fn include_host_info(&self) -> bool { + self.data_selection.contains(&BundleDataCategory::HostInfo) + } + + fn include_sled(&self, id: SledUuid) -> bool { + self.sled_selection.contains(&SledSelection::Specific(id)) + || self.sled_selection.contains(&SledSelection::All) + } + + fn include_sled_cubby_info(&self) -> bool { + self.data_selection.contains(&BundleDataCategory::SledCubbyInfo) + } + + fn include_sp_dumps(&self) -> bool { + self.data_selection.contains(&BundleDataCategory::SpDumps) + } +} + impl Default for BundleRequest { fn default() -> Self { Self { - skip_sled_info: false, transfer_chunk_size: CHUNK_SIZE, + data_selection: HashSet::from([ + BundleDataCategory::Reconfigurator, + BundleDataCategory::HostInfo, + BundleDataCategory::SledCubbyInfo, + BundleDataCategory::SpDumps, + ]), + sled_selection: HashSet::from([SledSelection::All]), ereport_query: Some(EreportFilters { start_time: Some(chrono::Utc::now() - chrono::Days::new(7)), ..EreportFilters::default() @@ -771,6 +829,10 @@ impl BundleCollection { &self, dir: &Utf8Path, ) -> anyhow::Result { + if !self.request.include_reconfigurator_data() { + return Ok(CollectionStepOutput::None); + } + // Collect reconfigurator state const NMAX_BLUEPRINTS: usize = 300; match reconfigurator_state_load( @@ -920,6 +982,10 @@ impl BundleCollection { mgs_client: &OnceCell>>, dir: &Utf8Path, ) -> anyhow::Result { + if !self.request.include_sled_cubby_info() { + return Ok(CollectionStepOutput::None); + } + let Some(mgs_client) = &**self.get_or_initialize_mgs_client(mgs_client).await else { @@ -945,6 +1011,10 @@ impl BundleCollection { mgs_client: &OnceCell>>, dir: &Utf8Path, ) -> anyhow::Result { + if !self.request.include_sp_dumps() { + return Ok(CollectionStepOutput::None); + } + let Some(mgs_client) = 
&**self.get_or_initialize_mgs_client(mgs_client).await else { @@ -984,6 +1054,10 @@ impl BundleCollection { sp: SpIdentifier, dir: &Utf8Path, ) -> anyhow::Result { + if !self.request.include_sp_dumps() { + return Ok(CollectionStepOutput::None); + } + save_sp_dumps(mgs_client, sp, dir) .await .with_context(|| format!("SP {} {}", sp.type_, sp.slot))?; @@ -1098,6 +1172,10 @@ impl BundleCollection { &self, all_sleds: &OnceCell>>>, ) -> anyhow::Result { + if !self.request.include_host_info() { + return Ok(CollectionStepOutput::None); + } + let Some(all_sleds) = self.get_or_initialize_all_sleds(all_sleds).await.as_deref() else { @@ -1107,6 +1185,10 @@ impl BundleCollection { let mut extra_steps: Vec<(&'static str, CollectionStepFn)> = vec![]; for sled in all_sleds { + if !self.request.include_sled(sled.id()) { + continue; + } + extra_steps.push(( "sled data", Box::new({ @@ -1135,6 +1217,12 @@ impl BundleCollection { sled: &nexus_db_model::Sled, dir: &Utf8Path, ) -> anyhow::Result { + if !self.request.include_host_info() + || !self.request.include_sled(sled.id()) + { + return Ok(CollectionStepOutput::None); + } + let log = &self.log; info!(&log, "Collecting bundle info from sled"; "sled" => %sled.id()); let sled_path = dir @@ -1146,10 +1234,6 @@ impl BundleCollection { tokio::fs::write(sled_path.join("sled.txt"), format!("{sled:?}")) .await?; - if self.request.skip_sled_info { - return Ok(CollectionStepOutput::None); - } - let Ok(sled_client) = nexus_networking::sled_client( &self.datastore, &self.opctx, @@ -2266,7 +2350,7 @@ mod test { let request = BundleRequest { // NOTE: The support bundle querying interface isn't supported on // the simulated sled agent (yet?) so we're skipping this step. - skip_sled_info: true, + sled_selection: HashSet::new(), ..Default::default() }; let report = collector @@ -2340,9 +2424,10 @@ mod test { // We're going to use a really small chunk size here to force the bundle // to get split up. 
let request = BundleRequest { - skip_sled_info: true, transfer_chunk_size: NonZeroU64::new(16).unwrap(), + sled_selection: HashSet::new(), ereport_query: None, + ..Default::default() }; let report = collector @@ -2430,8 +2515,10 @@ mod test { ); // Each time we call "collect_bundle", we collect a SINGLE bundle. - let request = - BundleRequest { skip_sled_info: true, ..Default::default() }; + let request = BundleRequest { + sled_selection: HashSet::new(), + ..Default::default() + }; let report = collector .collect_bundle(&opctx, &request) .await @@ -2579,8 +2666,10 @@ mod test { false, nexus.id(), ); - let request = - BundleRequest { skip_sled_info: true, ..Default::default() }; + let request = BundleRequest { + sled_selection: HashSet::new(), + ..Default::default() + }; let report = collector .collect_bundle(&opctx, &request) .await @@ -2726,8 +2815,10 @@ mod test { false, nexus.id(), ); - let request = - BundleRequest { skip_sled_info: true, ..Default::default() }; + let request = BundleRequest { + sled_selection: HashSet::new(), + ..Default::default() + }; let report = collector .collect_bundle(&opctx, &request) .await @@ -2811,8 +2902,10 @@ mod test { false, nexus.id(), ); - let request = - BundleRequest { skip_sled_info: true, ..Default::default() }; + let request = BundleRequest { + sled_selection: HashSet::new(), + ..Default::default() + }; let report = collector .collect_bundle(&opctx, &request) .await @@ -2897,8 +2990,10 @@ mod test { ); // Collect the bundle - let request = - BundleRequest { skip_sled_info: true, ..Default::default() }; + let request = BundleRequest { + sled_selection: HashSet::new(), + ..Default::default() + }; let report = collector .collect_bundle(&opctx, &request) .await From 0cb9a6a303da78f39eedd1d4fd230ca264bb4371 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 1 Dec 2025 16:08:36 -0800 Subject: [PATCH 03/18] docs --- .../src/app/background/tasks/support_bundle_collector.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff 
--git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 9692db353b4..9a1cc86f909 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -90,13 +90,22 @@ fn authz_support_bundle_from_id(id: SupportBundleUuid) -> authz::SupportBundle { // Categories should be additive. #[derive(Debug, Clone, Hash, Eq, PartialEq)] enum BundleDataCategory { + // Collects reconfigurator state (some of the latest blueprints, + // information about the target blueprint). Reconfigurator, + // Collects info from sled agents, running a handful of + // diagnostic commands (e.g., zoneadm, dladm, etc). HostInfo, + // Collects sled serial numbers, cubby numbers, and UUIDs. SledCubbyInfo, + // Saves task dumps from SPs. SpDumps, } // The set of sleds to include +// +// Multiple values of this enum are joined together into a HashSet. +// Therefore "SledSelection::All" overrides specific sleds. #[derive(Debug, Clone, Hash, Eq, PartialEq)] enum SledSelection { All, From 015f2e971641b59692839fcc3ae03f4406316c7e Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 2 Dec 2025 11:00:20 -0800 Subject: [PATCH 04/18] meh --- nexus/src/app/background/tasks/support_bundle_collector.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 9a1cc86f909..0adc94f37e1 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -1095,7 +1095,7 @@ impl BundleCollection { // Perform the work of collecting the support bundle into a temporary directory // - // "dir" is a directory where data can be stored. + // "dir" is an output directory where data can be stored. 
// // If a partial bundle can be collected, it should be returned as // an Ok(SupportBundleCollectionReport). Any failures from this function From 92415e172d4436e920533d548d1b1fa1bea92db1 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 2 Dec 2025 11:24:46 -0800 Subject: [PATCH 05/18] Improve the support bundle report on a step-by-step basis --- dev-tools/omdb/src/bin/omdb/nexus.rs | 15 ++ .../tasks/support_bundle_collector.rs | 180 ++++++++++++------ .../integration_tests/support_bundles.rs | 43 ++--- nexus/types/src/internal_api/background.rs | 31 +++ 4 files changed, 186 insertions(+), 83 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index 6a842a1d7e7..d8d6bab1bc7 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -98,6 +98,7 @@ use std::fs::OpenOptions; use std::os::unix::fs::PermissionsExt; use std::str::FromStr; use std::sync::Arc; +use std::time::Duration; use support_bundle_viewer::LocalFileAccess; use support_bundle_viewer::SupportBundleAccessor; use tabled::Tabled; @@ -2612,6 +2613,7 @@ fn print_task_support_bundle_collector(details: &serde_json::Value) { listed_in_service_sleds, listed_sps, activated_in_db_ok, + mut steps, ereports, }) = collection_report { @@ -2623,6 +2625,19 @@ fn print_task_support_bundle_collector(details: &serde_json::Value) { println!( " Bundle was able to list service processors: {listed_sps}" ); + + steps.sort_unstable_by_key(|s| s.start); + for step in steps { + let duration = (step.end - step.start) + .to_std() + .unwrap_or(Duration::from_millis(0)); + println!( + " Step {} ({}ms): {}", + step.name, + duration.as_millis(), + step.status + ); + } println!( " Bundle was activated in the database: {activated_in_db_ok}" ); diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 0adc94f37e1..2b63ac6e93b 100644 --- 
a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -6,6 +6,7 @@ use crate::app::background::BackgroundTask; use anyhow::Context; +use anyhow::bail; use base64::Engine; use camino::Utf8DirEntry; use camino::Utf8Path; @@ -13,6 +14,8 @@ use camino::Utf8PathBuf; use camino_tempfile::Utf8TempDir; use camino_tempfile::tempdir_in; use camino_tempfile::tempfile_in; +use chrono::DateTime; +use chrono::Utc; use futures::FutureExt; use futures::StreamExt; use futures::future::BoxFuture; @@ -38,6 +41,8 @@ use nexus_types::fm::Ereport; use nexus_types::identity::Asset; use nexus_types::internal_api::background::SupportBundleCleanupReport; use nexus_types::internal_api::background::SupportBundleCollectionReport; +use nexus_types::internal_api::background::SupportBundleCollectionStep; +use nexus_types::internal_api::background::SupportBundleCollectionStepStatus; use nexus_types::internal_api::background::SupportBundleEreportStatus; use omicron_common::api::external::DataPageParams; use omicron_common::api::external::Error; @@ -563,47 +568,115 @@ type CollectionStepFn = Box< + Send, >; -enum CollectionStepOutput { - Ereports(SupportBundleEreportStatus), - SavingSpDumps { listed_sps: bool }, - // NOTE: The distinction between this and "Spawn" is pretty artificial - - // it's just to preserve a part of the report which says "we tried to - // list in-service sleds". - // - // If we changed the collection report, this could easily be combined - // with the "Spawn" variant. 
- SpawnSleds { extra_steps: Vec<(&'static str, CollectionStepFn)> }, - Spawn { extra_steps: Vec<(&'static str, CollectionStepFn)> }, - None, +struct CollectionStep { + name: String, + step_fn: CollectionStepFn, +} + +impl CollectionStep { + fn new(name: impl Into, step_fn: CollectionStepFn) -> Self { + Self { name: name.into(), step_fn } + } + + async fn run( + self, + collection: &Arc, + output: &Utf8Path, + ) -> CompletedCollectionStep { + let start = Utc::now(); + + let output = (self.step_fn)(collection, output) + .await + .inspect_err(|err| { + warn!( + collection.log, + "Step failed"; + "name" => &self.name, + InlineErrorChain::new(err.as_ref()), + ); + }) + .unwrap_or_else(|err| CollectionStepOutput::Failed(err)); + + let end = Utc::now(); + + CompletedCollectionStep { name: self.name, start, end, output } + } +} + +struct CompletedCollectionStep { + name: String, + start: DateTime, + end: DateTime, + output: CollectionStepOutput, } -impl CollectionStepOutput { +impl CompletedCollectionStep { // Updates the collection report based on the output of a collection step, // and possibly extends the set of all steps to be executed. 
fn process( self, report: &mut SupportBundleCollectionReport, - steps: &mut Vec<(&'static str, CollectionStepFn)>, + steps: &mut Vec, ) { - match self { + use SupportBundleCollectionStepStatus as Status; + + let status = match self.output { + CollectionStepOutput::Skipped => Status::Skipped, + CollectionStepOutput::Failed(err) => { + Status::Failed(err.to_string()) + } CollectionStepOutput::Ereports(status) => { report.ereports = Some(status); + Status::Ok } CollectionStepOutput::SavingSpDumps { listed_sps } => { report.listed_sps = listed_sps; + Status::Ok } CollectionStepOutput::SpawnSleds { extra_steps } => { report.listed_in_service_sleds = true; steps.extend(extra_steps); + Status::Ok } CollectionStepOutput::Spawn { extra_steps } => { steps.extend(extra_steps); + Status::Ok } - CollectionStepOutput::None => (), - } + CollectionStepOutput::None => Status::Ok, + }; + + // Add information about this completed step the bundle report. + let step = SupportBundleCollectionStep { + name: self.name, + start: self.start, + end: self.end, + status, + }; + report.steps.push(step); } } +enum CollectionStepOutput { + // The step was not executed intentionally + Skipped, + // The step encountered a fatal error and could not complete. + // + // It may have still saved a partial set of data to the bundle. + Failed(anyhow::Error), + Ereports(SupportBundleEreportStatus), + SavingSpDumps { listed_sps: bool }, + // NOTE: The distinction between this and "Spawn" is pretty artificial - + // it's just to preserve a part of the report which says "we tried to + // list in-service sleds". + // + // If we changed the collection report, this could easily be combined + // with the "Spawn" variant. + SpawnSleds { extra_steps: Vec }, + Spawn { extra_steps: Vec }, + // The step completed with nothing to report, and no follow-up steps + None, +} + impl BundleCollection { // Collect the bundle within Nexus, and store it on a target sled. 
async fn collect_bundle_and_store_on_sled( @@ -856,7 +929,7 @@ impl BundleCollection { async fn run_collect_bundle_steps( self: &Arc, output: &Utf8TempDir, - mut steps: Vec<(&'static str, CollectionStepFn)>, + mut steps: Vec, ) -> SupportBundleCollectionReport { let mut report = SupportBundleCollectionReport::new(self.bundle.id.into()); @@ -867,34 +940,25 @@ impl BundleCollection { loop { // Process all the currently-planned steps - while let Some((step_name, step)) = steps.pop() { + while let Some(step) = steps.pop() { let previous_result = tasks.spawn({ let collection = self.clone(); let dir = output.path().to_path_buf(); async move { - debug!(collection.log, "Running step"; "name" => &step_name); - step(&collection, dir.as_path()).await.inspect_err(|err| { - warn!( - collection.log, - "Step failed"; - "name" => &step_name, - InlineErrorChain::new(err.as_ref()), - ); - }) + debug!(collection.log, "Running step"; "name" => &step.name); + step.run(&collection, dir.as_path()).await } }).await; - if let Some(Ok(output)) = previous_result { + if let Some(output) = previous_result { output.process(&mut report, &mut steps); }; } // If we've run out of tasks to spawn, join any of the previously // spawned tasks, if any exist. - if let Some(previous_result) = tasks.join_next().await { - if let Ok(output) = previous_result { - output.process(&mut report, &mut steps); - }; + if let Some(output) = tasks.join_next().await { + output.process(&mut report, &mut steps); // As soon as any task completes, see if we can spawn more work // immediately. 
This ensures that the ParallelTaskSet is @@ -926,7 +990,7 @@ impl BundleCollection { dir: &Utf8Path, ) -> anyhow::Result { if !self.request.include_reconfigurator_data() { - return Ok(CollectionStepOutput::None); + return Ok(CollectionStepOutput::Skipped); } // Collect reconfigurator state @@ -1011,17 +1075,13 @@ impl BundleCollection { dir: &Utf8Path, ) -> anyhow::Result { if !self.request.include_sled_cubby_info() { - return Ok(CollectionStepOutput::None); + return Ok(CollectionStepOutput::Skipped); } let Some(mgs_client) = &**self.get_or_initialize_mgs_client(mgs_client).await else { - warn!( - self.log, - "No MGS client, skipping sled cubby info collection" - ); - return Ok(CollectionStepOutput::None); + bail!("Could not initialize MGS client"); }; let nexus_sleds = self .get_or_initialize_all_sleds(all_sleds) @@ -1040,14 +1100,13 @@ impl BundleCollection { dir: &Utf8Path, ) -> anyhow::Result { if !self.request.include_sp_dumps() { - return Ok(CollectionStepOutput::None); + return Ok(CollectionStepOutput::Skipped); } let Some(mgs_client) = &**self.get_or_initialize_mgs_client(mgs_client).await else { - warn!(self.log, "No MGS client, skipping SP task dump collection"); - return Ok(CollectionStepOutput::None); + bail!("Could not initialize MGS client"); }; let sp_dumps_dir = dir.join("sp_task_dumps"); @@ -1055,9 +1114,9 @@ impl BundleCollection { format!("Failed to create SP task dump directory {sp_dumps_dir}") })?; - let mut extra_steps: Vec<(&'static str, CollectionStepFn)> = vec![]; + let mut extra_steps: Vec = vec![]; for sp in get_available_sps(&mgs_client).await? 
{ - extra_steps.push(( + extra_steps.push(CollectionStep::new( "SP dump", Box::new({ let mgs_client = mgs_client.clone(); @@ -1083,7 +1142,7 @@ impl BundleCollection { dir: &Utf8Path, ) -> anyhow::Result { if !self.request.include_sp_dumps() { - return Ok(CollectionStepOutput::None); + return Ok(CollectionStepOutput::Skipped); } save_sp_dumps(mgs_client, sp, dir).await.with_context(|| { @@ -1124,26 +1183,26 @@ impl BundleCollection { // Shared, lazy, fallible initialization for MGS client let mgs_client: OnceCell>> = OnceCell::new(); - let steps: Vec<(&str, CollectionStepFn)> = vec![ - ( + let steps: Vec = vec![ + CollectionStep::new( "bundle id", Box::new(|collection, dir| { collection.collect_bundle_id(dir).boxed() }), ), - ( + CollectionStep::new( "reconfigurator state", Box::new(|collection, dir| { collection.collect_reconfigurator_state(dir).boxed() }), ), - ( + CollectionStep::new( "ereports", Box::new(|collection, dir| { collection.collect_ereports(dir).boxed() }), ), - ( + CollectionStep::new( "sled cubby info", Box::new({ let all_sleds = all_sleds.clone(); @@ -1162,7 +1221,7 @@ impl BundleCollection { } }), ), - ( + CollectionStep::new( "spawn steps to query all SP dumps", Box::new({ let mgs_client = mgs_client.clone(); @@ -1176,7 +1235,7 @@ impl BundleCollection { } }), ), - ( + CollectionStep::new( "spawn steps to query all sleds", Box::new({ let all_sleds = all_sleds.clone(); @@ -1198,23 +1257,22 @@ impl BundleCollection { all_sleds: &OnceCell>>>, ) -> anyhow::Result { if !self.request.include_host_info() { - return Ok(CollectionStepOutput::None); + return Ok(CollectionStepOutput::Skipped); } let Some(all_sleds) = self.get_or_initialize_all_sleds(all_sleds).await.as_deref() else { - warn!(self.log, "Could not read list of sleds"); - return Ok(CollectionStepOutput::None); + bail!("Could not read list of sleds"); }; - let mut extra_steps: Vec<(&'static str, CollectionStepFn)> = vec![]; + let mut extra_steps: Vec = vec![]; for sled in all_sleds { if 
!self.request.include_sled(sled.id()) { continue; } - extra_steps.push(( + extra_steps.push(CollectionStep::new( "sled data", Box::new({ let sled = sled.clone(); @@ -1245,7 +1303,7 @@ impl BundleCollection { if !self.request.include_host_info() || !self.request.include_sled(sled.id()) { - return Ok(CollectionStepOutput::None); + return Ok(CollectionStepOutput::Skipped); } let log = &self.log; @@ -1272,7 +1330,7 @@ impl BundleCollection { "Could not contact sled", ) .await?; - return Ok(CollectionStepOutput::None); + bail!("Could not contact sled"); }; // NB: As new sled-diagnostic commands are added they should @@ -1386,7 +1444,7 @@ impl BundleCollection { ) -> anyhow::Result { let Some(ref ereport_filters) = self.request.ereport_query else { debug!(self.log, "Support bundle: ereports not requested"); - return Ok(CollectionStepOutput::None); + return Ok(CollectionStepOutput::Skipped); }; let ereports_dir = dir.join("ereports"); let mut status = SupportBundleEreportStatus::default(); diff --git a/nexus/tests/integration_tests/support_bundles.rs b/nexus/tests/integration_tests/support_bundles.rs index 716bc228ca9..75bb4bfa64d 100644 --- a/nexus/tests/integration_tests/support_bundles.rs +++ b/nexus/tests/integration_tests/support_bundles.rs @@ -486,18 +486,18 @@ async fn test_support_bundle_lifecycle(cptestctx: &ControlPlaneTestContext) { output.cleanup_report, Some(SupportBundleCleanupReport { ..Default::default() }) ); + + let report = output.collection_report.as_ref().expect("Missing report"); + assert_eq!(report.bundle, bundle.id); + assert!(report.listed_in_service_sleds); + assert!(report.listed_sps); + assert!(report.activated_in_db_ok); assert_eq!( - output.collection_report, - Some(SupportBundleCollectionReport { - bundle: bundle.id, - listed_in_service_sleds: true, - listed_sps: true, - activated_in_db_ok: true, - ereports: Some(SupportBundleEreportStatus { - n_collected: 0, - n_found: 0, - errors: Vec::new() - }) + report.ereports, + 
Some(SupportBundleEreportStatus { + n_collected: 0, + n_found: 0, + errors: Vec::new() }) ); let bundle = bundle_get(&client, bundle.id).await.unwrap(); @@ -588,18 +588,17 @@ async fn test_support_bundle_range_requests( // Finish collection, activate the bundle. let output = activate_bundle_collection_background_task(&cptestctx).await; assert_eq!(output.collection_err, None); + let report = output.collection_report.as_ref().expect("Missing report"); + assert_eq!(report.bundle, bundle.id); + assert!(report.listed_in_service_sleds); + assert!(report.listed_sps); + assert!(report.activated_in_db_ok); assert_eq!( - output.collection_report, - Some(SupportBundleCollectionReport { - bundle: bundle.id, - listed_in_service_sleds: true, - listed_sps: true, - activated_in_db_ok: true, - ereports: Some(SupportBundleEreportStatus { - n_collected: 0, - n_found: 0, - errors: Vec::new() - }) + report.ereports, + Some(SupportBundleEreportStatus { + n_collected: 0, + n_found: 0, + errors: Vec::new() }) ); let bundle = bundle_get(&client, bundle.id).await.unwrap(); diff --git a/nexus/types/src/internal_api/background.rs b/nexus/types/src/internal_api/background.rs index 08d343182e6..42264d0411b 100644 --- a/nexus/types/src/internal_api/background.rs +++ b/nexus/types/src/internal_api/background.rs @@ -281,11 +281,41 @@ pub struct SupportBundleCollectionReport { /// True iff the bundle was successfully made 'active' in the database. pub activated_in_db_ok: bool, + /// All steps taken, alongside their timing information, when collecting the + /// bundle. + pub steps: Vec, + /// Status of ereport collection, or `None` if no ereports were requested /// for this support bundle. 
pub ereports: Option, } +#[derive(Debug, Deserialize, Serialize, PartialEq, Eq)] +pub struct SupportBundleCollectionStep { + pub name: String, + pub start: DateTime, + pub end: DateTime, + pub status: SupportBundleCollectionStepStatus, +} + +#[derive(Debug, Deserialize, Serialize, PartialEq, Eq)] +pub enum SupportBundleCollectionStepStatus { + Ok, + Skipped, + Failed(String), +} + +impl std::fmt::Display for SupportBundleCollectionStepStatus { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + use SupportBundleCollectionStepStatus::*; + match self { + Ok => write!(f, "ok"), + Skipped => write!(f, "skipped"), + Failed(why) => write!(f, "failed: {why}"), + } + } +} + #[derive(Debug, Default, Deserialize, Serialize, PartialEq, Eq)] pub struct SupportBundleEreportStatus { /// The total number of ereports found that match the requested filters. @@ -309,6 +339,7 @@ impl SupportBundleCollectionReport { listed_in_service_sleds: false, listed_sps: false, activated_in_db_ok: false, + steps: vec![], ereports: None, } } From 4d59114117248febe5ca8a1aa8981fa9f0d7b289 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 2 Dec 2025 17:11:36 -0800 Subject: [PATCH 06/18] Better table output, step labels --- dev-tools/omdb/src/bin/omdb/nexus.rs | 36 +++++++++++++------ .../tasks/support_bundle_collector.rs | 4 +-- .../integration_tests/support_bundles.rs | 22 ++++++++++++ 3 files changed, 50 insertions(+), 12 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index d8d6bab1bc7..9833a7bf141 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -2626,17 +2626,33 @@ fn print_task_support_bundle_collector(details: &serde_json::Value) { " Bundle was able to list service processors: {listed_sps}" ); + #[derive(Tabled)] + #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] + struct StepRow { + step_name: String, + start_time: String, + duration: String, + status: String, + } + 
steps.sort_unstable_by_key(|s| s.start); - for step in steps { - let duration = (step.end - step.start) - .to_std() - .unwrap_or(Duration::from_millis(0)); - println!( - " Step {} ({}ms): {}", - step.name, - duration.as_millis(), - step.status - ); + let rows: Vec = steps + .into_iter() + .map(|step| { + let duration = (step.end - step.start) + .to_std() + .unwrap_or(Duration::from_millis(0)); + StepRow { + step_name: step.name, + start_time: step.start.to_rfc3339(), + duration: format!("{:.3}s", duration.as_secs_f64()), + status: step.status.to_string(), + } + }) + .collect(); + + if !rows.is_empty() { + println!("\n{}", tabled::Table::new(rows)); } println!( " Bundle was activated in the database: {activated_in_db_ok}" diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 2b63ac6e93b..41babc3838b 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -1117,7 +1117,7 @@ impl BundleCollection { let mut extra_steps: Vec = vec![]; for sp in get_available_sps(&mgs_client).await? 
{ extra_steps.push(CollectionStep::new( - "SP dump", + format!("SP dump for {:?}", sp), Box::new({ let mgs_client = mgs_client.clone(); move |collection, dir| { @@ -1273,7 +1273,7 @@ impl BundleCollection { } extra_steps.push(CollectionStep::new( - "sled data", + format!("sled data for sled {}", sled.id()), Box::new({ let sled = sled.clone(); move |collection, dir| { diff --git a/nexus/tests/integration_tests/support_bundles.rs b/nexus/tests/integration_tests/support_bundles.rs index 75bb4bfa64d..ade2cbdb2c9 100644 --- a/nexus/tests/integration_tests/support_bundles.rs +++ b/nexus/tests/integration_tests/support_bundles.rs @@ -500,6 +500,17 @@ async fn test_support_bundle_lifecycle(cptestctx: &ControlPlaneTestContext) { errors: Vec::new() }) ); + + // Verify that steps were recorded with reasonable timing data + assert!(!report.steps.is_empty(), "Should have recorded some steps"); + for step in &report.steps { + assert!( + step.end >= step.start, + "Step '{}' end time should be >= start time", + step.name + ); + } + let bundle = bundle_get(&client, bundle.id).await.unwrap(); assert_eq!(bundle.state, SupportBundleState::Active); @@ -601,6 +612,17 @@ async fn test_support_bundle_range_requests( errors: Vec::new() }) ); + + // Verify that steps were recorded with reasonable timing data + assert!(!report.steps.is_empty(), "Should have recorded some steps"); + for step in &report.steps { + assert!( + step.end >= step.start, + "Step '{}' end time should be >= start time", + step.name + ); + } + let bundle = bundle_get(&client, bundle.id).await.unwrap(); assert_eq!(bundle.state, SupportBundleState::Active); From e69f392cae63f102cf5ff40752db914eaeda1666 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 3 Dec 2025 12:58:08 -0800 Subject: [PATCH 07/18] [support bundle] Simplify report, relying on new 'steps' infrastructure --- dev-tools/omdb/src/bin/omdb/nexus.rs | 8 -- .../tasks/support_bundle_collector.rs | 94 ++++++++++++------- .../integration_tests/support_bundles.rs 
| 29 +++++- nexus/types/src/internal_api/background.rs | 21 +++-- 4 files changed, 97 insertions(+), 55 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index 9833a7bf141..b17751821a3 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -2610,8 +2610,6 @@ fn print_task_support_bundle_collector(details: &serde_json::Value) { if let Some(SupportBundleCollectionReport { bundle, - listed_in_service_sleds, - listed_sps, activated_in_db_ok, mut steps, ereports, @@ -2619,12 +2617,6 @@ fn print_task_support_bundle_collector(details: &serde_json::Value) { { println!(" Support Bundle Collection Report:"); println!(" Bundle ID: {bundle}"); - println!( - " Bundle was able to list in-service sleds: {listed_in_service_sleds}" - ); - println!( - " Bundle was able to list service processors: {listed_sps}" - ); #[derive(Tabled)] #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 41babc3838b..5eb9b6f6370 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -629,15 +629,6 @@ impl CompletedCollectionStep { report.ereports = Some(status); Status::Ok } - CollectionStepOutput::SavingSpDumps { listed_sps } => { - report.listed_sps = listed_sps; - Status::Ok - } - CollectionStepOutput::SpawnSleds { extra_steps } => { - report.listed_in_service_sleds = true; - steps.extend(extra_steps); - Status::Ok - } CollectionStepOutput::Spawn { extra_steps } => { steps.extend(extra_steps); Status::Ok @@ -664,14 +655,7 @@ enum CollectionStepOutput { // It may have still saved a partial set of data to the bundle. 
Failed(anyhow::Error), Ereports(SupportBundleEreportStatus), - SavingSpDumps { listed_sps: bool }, - // NOTE: The distinction between this and "Spawn" is pretty artificial - - // it's just to preserve a part of the report which says "we tried to - // list in-service sleds". - // - // If we changed the collection report, this could easily be combined - // with the "Spawn" variant. - SpawnSleds { extra_steps: Vec }, + // The step spawned additional steps to execute Spawn { extra_steps: Vec }, // The step completed with nothing to report, and no follow-up steps None, @@ -1149,7 +1133,7 @@ impl BundleCollection { format!("failed to save SP dump from: {} {}", sp.type_, sp.slot) })?; - Ok(CollectionStepOutput::SavingSpDumps { listed_sps: true }) + Ok(CollectionStepOutput::None) } // Perform the work of collecting the support bundle into a temporary directory @@ -1185,25 +1169,25 @@ impl BundleCollection { let steps: Vec = vec![ CollectionStep::new( - "bundle id", + SupportBundleCollectionStep::STEP_BUNDLE_ID, Box::new(|collection, dir| { collection.collect_bundle_id(dir).boxed() }), ), CollectionStep::new( - "reconfigurator state", + SupportBundleCollectionStep::STEP_RECONFIGURATOR_STATE, Box::new(|collection, dir| { collection.collect_reconfigurator_state(dir).boxed() }), ), CollectionStep::new( - "ereports", + SupportBundleCollectionStep::STEP_EREPORTS, Box::new(|collection, dir| { collection.collect_ereports(dir).boxed() }), ), CollectionStep::new( - "sled cubby info", + SupportBundleCollectionStep::STEP_SLED_CUBBY_INFO, Box::new({ let all_sleds = all_sleds.clone(); let mgs_client = mgs_client.clone(); @@ -1222,7 +1206,7 @@ impl BundleCollection { }), ), CollectionStep::new( - "spawn steps to query all SP dumps", + SupportBundleCollectionStep::STEP_SPAWN_SP_DUMPS, Box::new({ let mgs_client = mgs_client.clone(); move |collection, dir| { @@ -1236,7 +1220,7 @@ impl BundleCollection { }), ), CollectionStep::new( - "spawn steps to query all sleds", + 
SupportBundleCollectionStep::STEP_SPAWN_SLEDS, Box::new({ let all_sleds = all_sleds.clone(); move |collection, _| { @@ -1286,7 +1270,7 @@ impl BundleCollection { )); } - return Ok(CollectionStepOutput::SpawnSleds { extra_steps }); + return Ok(CollectionStepOutput::Spawn { extra_steps }); } // Collect data from a sled, storing it into a directory that will @@ -2425,8 +2409,16 @@ mod test { .expect("Collection should have succeeded under test") .expect("Collecting the bundle should have generated a report"); assert_eq!(report.bundle, bundle.id.into()); - assert!(report.listed_in_service_sleds); - assert!(report.listed_sps); + // Verify that we spawned steps to query sleds and SPs + let step_names: Vec<_> = + report.steps.iter().map(|s| s.name.as_str()).collect(); + assert!( + step_names.contains(&SupportBundleCollectionStep::STEP_SPAWN_SLEDS) + ); + assert!( + step_names + .contains(&SupportBundleCollectionStep::STEP_SPAWN_SP_DUMPS) + ); assert!(report.activated_in_db_ok); assert_eq!( report.ereports, @@ -2502,8 +2494,16 @@ mod test { .expect("Collection should have succeeded under test") .expect("Collecting the bundle should have generated a report"); assert_eq!(report.bundle, bundle.id.into()); - assert!(report.listed_in_service_sleds); - assert!(report.listed_sps); + // Verify that we spawned steps to query sleds and SPs + let step_names: Vec<_> = + report.steps.iter().map(|s| s.name.as_str()).collect(); + assert!( + step_names.contains(&SupportBundleCollectionStep::STEP_SPAWN_SLEDS) + ); + assert!( + step_names + .contains(&SupportBundleCollectionStep::STEP_SPAWN_SP_DUMPS) + ); assert!(report.activated_in_db_ok); let observed_bundle = datastore @@ -2591,8 +2591,16 @@ mod test { .expect("Collection should have succeeded under test") .expect("Collecting the bundle should have generated a report"); assert_eq!(report.bundle, bundle1.id.into()); - assert!(report.listed_in_service_sleds); - assert!(report.listed_sps); + // Verify that we spawned steps to query sleds 
and SPs + let step_names: Vec<_> = + report.steps.iter().map(|s| s.name.as_str()).collect(); + assert!( + step_names.contains(&SupportBundleCollectionStep::STEP_SPAWN_SLEDS) + ); + assert!( + step_names + .contains(&SupportBundleCollectionStep::STEP_SPAWN_SP_DUMPS) + ); assert!(report.activated_in_db_ok); // This is observable by checking the state of bundle1 and bundle2: @@ -2614,8 +2622,16 @@ mod test { .expect("Collection should have succeeded under test") .expect("Collecting the bundle should have generated a report"); assert_eq!(report.bundle, bundle2.id.into()); - assert!(report.listed_in_service_sleds); - assert!(report.listed_sps); + // Verify that we spawned steps to query sleds and SPs + let step_names: Vec<_> = + report.steps.iter().map(|s| s.name.as_str()).collect(); + assert!( + step_names.contains(&SupportBundleCollectionStep::STEP_SPAWN_SLEDS) + ); + assert!( + step_names + .contains(&SupportBundleCollectionStep::STEP_SPAWN_SP_DUMPS) + ); assert!(report.activated_in_db_ok); // After another collection request, we'll see that both bundles have @@ -2742,8 +2758,16 @@ mod test { .expect("Collection should have succeeded under test") .expect("Collecting the bundle should have generated a report"); assert_eq!(report.bundle, bundle.id.into()); - assert!(report.listed_in_service_sleds); - assert!(report.listed_sps); + // Verify that we spawned steps to query sleds and SPs + let step_names: Vec<_> = + report.steps.iter().map(|s| s.name.as_str()).collect(); + assert!( + step_names.contains(&SupportBundleCollectionStep::STEP_SPAWN_SLEDS) + ); + assert!( + step_names + .contains(&SupportBundleCollectionStep::STEP_SPAWN_SP_DUMPS) + ); assert!(report.activated_in_db_ok); // Cancel the bundle after collection has completed diff --git a/nexus/tests/integration_tests/support_bundles.rs b/nexus/tests/integration_tests/support_bundles.rs index ade2cbdb2c9..80ec8af191f 100644 --- a/nexus/tests/integration_tests/support_bundles.rs +++ 
b/nexus/tests/integration_tests/support_bundles.rs @@ -19,6 +19,7 @@ use nexus_types::external_api::shared::SupportBundleInfo; use nexus_types::external_api::shared::SupportBundleState; use nexus_types::internal_api::background::SupportBundleCleanupReport; use nexus_types::internal_api::background::SupportBundleCollectionReport; +use nexus_types::internal_api::background::SupportBundleCollectionStep; use nexus_types::internal_api::background::SupportBundleEreportStatus; use omicron_uuid_kinds::SupportBundleUuid; use serde::Deserialize; @@ -489,8 +490,6 @@ async fn test_support_bundle_lifecycle(cptestctx: &ControlPlaneTestContext) { let report = output.collection_report.as_ref().expect("Missing report"); assert_eq!(report.bundle, bundle.id); - assert!(report.listed_in_service_sleds); - assert!(report.listed_sps); assert!(report.activated_in_db_ok); assert_eq!( report.ereports, @@ -511,6 +510,18 @@ async fn test_support_bundle_lifecycle(cptestctx: &ControlPlaneTestContext) { ); } + // Verify that we successfully spawned steps to query sleds and SPs + let step_names: Vec<_> = + report.steps.iter().map(|s| s.name.as_str()).collect(); + assert!( + step_names.contains(&SupportBundleCollectionStep::STEP_SPAWN_SLEDS), + "Should have attempted to list in-service sleds" + ); + assert!( + step_names.contains(&SupportBundleCollectionStep::STEP_SPAWN_SP_DUMPS), + "Should have attempted to list service processors" + ); + let bundle = bundle_get(&client, bundle.id).await.unwrap(); assert_eq!(bundle.state, SupportBundleState::Active); @@ -601,8 +612,6 @@ async fn test_support_bundle_range_requests( assert_eq!(output.collection_err, None); let report = output.collection_report.as_ref().expect("Missing report"); assert_eq!(report.bundle, bundle.id); - assert!(report.listed_in_service_sleds); - assert!(report.listed_sps); assert!(report.activated_in_db_ok); assert_eq!( report.ereports, @@ -623,6 +632,18 @@ async fn test_support_bundle_range_requests( ); } + // Verify that we 
successfully spawned steps to query sleds and SPs + let step_names: Vec<_> = + report.steps.iter().map(|s| s.name.as_str()).collect(); + assert!( + step_names.contains(&SupportBundleCollectionStep::STEP_SPAWN_SLEDS), + "Should have attempted to list in-service sleds" + ); + assert!( + step_names.contains(&SupportBundleCollectionStep::STEP_SPAWN_SP_DUMPS), + "Should have attempted to list service processors" + ); + let bundle = bundle_get(&client, bundle.id).await.unwrap(); assert_eq!(bundle.state, SupportBundleState::Active); diff --git a/nexus/types/src/internal_api/background.rs b/nexus/types/src/internal_api/background.rs index 42264d0411b..dfe008198f9 100644 --- a/nexus/types/src/internal_api/background.rs +++ b/nexus/types/src/internal_api/background.rs @@ -272,12 +272,6 @@ pub struct SupportBundleCleanupReport { pub struct SupportBundleCollectionReport { pub bundle: SupportBundleUuid, - /// True iff we could list in-service sleds - pub listed_in_service_sleds: bool, - - /// True iff we could list the service processors. - pub listed_sps: bool, - /// True iff the bundle was successfully made 'active' in the database. pub activated_in_db_ok: bool, @@ -298,6 +292,19 @@ pub struct SupportBundleCollectionStep { pub status: SupportBundleCollectionStepStatus, } +impl SupportBundleCollectionStep { + /// Step name constants for the main collection steps. + /// + /// These are used both when creating steps and when validating in tests. 
+ pub const STEP_BUNDLE_ID: &'static str = "bundle id"; + pub const STEP_RECONFIGURATOR_STATE: &'static str = "reconfigurator state"; + pub const STEP_EREPORTS: &'static str = "ereports"; + pub const STEP_SLED_CUBBY_INFO: &'static str = "sled cubby info"; + pub const STEP_SPAWN_SP_DUMPS: &'static str = + "spawn steps to query all SP dumps"; + pub const STEP_SPAWN_SLEDS: &'static str = "spawn steps to query all sleds"; +} + #[derive(Debug, Deserialize, Serialize, PartialEq, Eq)] pub enum SupportBundleCollectionStepStatus { Ok, @@ -336,8 +343,6 @@ impl SupportBundleCollectionReport { pub fn new(bundle: SupportBundleUuid) -> Self { Self { bundle, - listed_in_service_sleds: false, - listed_sps: false, activated_in_db_ok: false, steps: vec![], ereports: None, From 1886293ab4d50633bea50058e848bdf70ba1f64c Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 3 Dec 2025 13:25:00 -0800 Subject: [PATCH 08/18] improve comment --- nexus/src/app/background/tasks/support_bundle_collector.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 0adc94f37e1..34aa31862c4 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -124,10 +124,10 @@ struct BundleRequest { // The set of data to be included within this bundle. data_selection: HashSet, - // The set of sets to be included within this bundle. + // The set of sleds to be included within this bundle. // // NOTE: This selection is only considered if "data_selection" requests - // data from specific sleds. + // data from sleds. sled_selection: HashSet, // The set of ereports to be included within this bundle. 
From f60f53fad5ed27a8abd602a12887ab637cc2f025 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 5 Dec 2025 11:46:22 -0800 Subject: [PATCH 09/18] tracing --- .../tasks/support_bundle_collector.rs | 207 ++++++++++++++++++ 1 file changed, 207 insertions(+) diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 32591c84c2c..0a712a89480 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -1037,11 +1037,94 @@ impl BundleCollection { // // Only finish if we've exhausted all possible steps and joined all spawned work. if steps.is_empty() { + // Write trace file before returning + if let Err(err) = self.write_trace_file(output, &report).await { + warn!( + self.log, + "Failed to write trace file"; + "error" => ?err + ); + } return report; } } } + // Write a Perfetto Event format JSON file for visualization + async fn write_trace_file( + &self, + output: &Utf8TempDir, + report: &SupportBundleCollectionReport, + ) -> anyhow::Result<()> { + let meta_dir = output.path().join("meta"); + tokio::fs::create_dir_all(&meta_dir).await.with_context(|| { + format!("Failed to create meta directory {meta_dir}") + })?; + + let trace_path = meta_dir.join("trace.json"); + + // Convert steps to Perfetto Trace Event format. + // Sort steps by start time and assign each a unique sequential ID. + // + // This is necessary because the trace event format does not like + // multiple slices to overlap - so we make each slice distinct. + // + // Ideally we'd be able to correlate these with actual tokio tasks, + // but it's hard to convert tokio::task::Id to a u64 because + // of https://github.com/tokio-rs/tokio/issues/7430 + let mut sorted_steps: Vec<_> = report.steps.iter().collect(); + sorted_steps.sort_by_key(|s| s.start); + + // Generate trace events - each step gets a unique ID (1, 2, 3, ...) 
+ // based on its start time order + let trace_events: Vec<_> = sorted_steps + .iter() + .enumerate() + .map(|(i, step)| { + let start_us = step.start.timestamp_micros(); + let duration_us = (step.end - step.start) + .num_microseconds() + .unwrap_or(0) + .max(0); + let step_id = i + 1; + + json!({ + "name": step.name, + "cat": "bundle_collection", + "ph": "X", // Complete event (has duration) + "ts": start_us, + "dur": duration_us, + "pid": 1, + "tid": step_id, + "args": { + "status": step.status.to_string(), + } + }) + }) + .collect(); + + let trace_json = json!({ + "traceEvents": trace_events, + "displayTimeUnit": "ms", + }); + + let trace_content = serde_json::to_string_pretty(&trace_json) + .context("Failed to serialize trace JSON")?; + + tokio::fs::write(&trace_path, trace_content).await.with_context( + || format!("Failed to write trace file to {trace_path}"), + )?; + + info!( + self.log, + "Wrote trace file"; + "path" => %trace_path, + "num_events" => trace_events.len() + ); + + Ok(()) + } + async fn collect_bundle_id( &self, dir: &Utf8Path, @@ -2528,6 +2611,130 @@ mod test { assert!(report.is_none()); } + #[nexus_test(server = crate::Server)] + async fn test_trace_file_generated(cptestctx: &ControlPlaneTestContext) { + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let resolver = nexus.resolver(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.clone(), + datastore.clone(), + ); + + // Before we can create any bundles, we need to create the + // space for them to be provisioned. 
+ let _datasets = + TestDataset::setup(cptestctx, &datastore, &opctx, 1).await; + + // Create a bundle to collect + let bundle = datastore + .support_bundle_create( + &opctx, + "For trace file testing", + nexus.id(), + None, + ) + .await + .expect("Couldn't allocate a support bundle"); + + let collector = SupportBundleCollector::new( + datastore.clone(), + resolver.clone(), + false, + nexus.id(), + ); + + // Collect the bundle + let mut request = BundleRequest::default(); + request.data_selection.insert(BundleData::HostInfo(HashSet::new())); + let report = collector + .collect_bundle(&opctx, &request) + .await + .expect("Collection should have succeeded") + .expect("Should have generated a report"); + + // Download the trace file from the bundle + let head = false; + let range = None; + let response = nexus + .support_bundle_download( + &opctx, + bundle.id.into(), + SupportBundleQueryType::Path { + file_path: "meta/trace.json".to_string(), + }, + head, + range, + ) + .await + .expect("Should be able to download trace file"); + + // Parse the trace file as JSON + let body_bytes = + response.into_body().collect().await.unwrap().to_bytes(); + let trace_json: serde_json::Value = serde_json::from_slice(&body_bytes) + .expect("Trace file should be valid JSON"); + + // Verify the structure matches Perfetto Trace Event format + let trace_events = trace_json + .get("traceEvents") + .expect("Should have traceEvents field") + .as_array() + .expect("traceEvents should be an array"); + + // We should have at least the main collection steps + assert!( + !trace_events.is_empty(), + "Should have at least one trace event" + ); + + // Verify each event has the expected fields + for event in trace_events { + assert!(event.get("name").is_some(), "Event should have name"); + assert_eq!( + event.get("cat").and_then(|v| v.as_str()), + Some("bundle_collection"), + "Event should have category 'bundle_collection'" + ); + assert_eq!( + event.get("ph").and_then(|v| v.as_str()), + Some("X"), + 
"Event should be Complete event type" + ); + assert!( + event.get("ts").and_then(|v| v.as_i64()).is_some(), + "Event should have timestamp" + ); + assert!( + event.get("dur").and_then(|v| v.as_i64()).is_some(), + "Event should have duration" + ); + assert!( + event.get("args").is_some(), + "Event should have args field" + ); + } + + // Verify we have the same number of events as steps in the report + assert_eq!( + trace_events.len(), + report.steps.len(), + "Number of events should match number of steps" + ); + + // Verify step names match between report and trace + let trace_names: std::collections::HashSet<_> = trace_events + .iter() + .filter_map(|e| e.get("name").and_then(|v| v.as_str())) + .collect(); + let report_names: std::collections::HashSet<_> = + report.steps.iter().map(|s| s.name.as_str()).collect(); + assert_eq!( + trace_names, report_names, + "Trace event names should match report step names" + ); + } + #[nexus_test(server = crate::Server)] async fn test_collect_chunked(cptestctx: &ControlPlaneTestContext) { let nexus = &cptestctx.server.server_context().nexus; From 3873a571a99de54aa2701011ee6b7aef61285607 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 8 Dec 2025 11:30:22 -0800 Subject: [PATCH 10/18] Extract trace structs --- nexus/src/app/background/tasks/mod.rs | 1 + .../background/tasks/support_bundle/mod.rs | 7 ++ .../tasks/support_bundle/perfetto.rs | 51 ++++++++++ .../tasks/support_bundle_collector.rs | 97 ++++++++----------- 4 files changed, 102 insertions(+), 54 deletions(-) create mode 100644 nexus/src/app/background/tasks/support_bundle/mod.rs create mode 100644 nexus/src/app/background/tasks/support_bundle/perfetto.rs diff --git a/nexus/src/app/background/tasks/mod.rs b/nexus/src/app/background/tasks/mod.rs index 64df7770da1..ae27b2d12ca 100644 --- a/nexus/src/app/background/tasks/mod.rs +++ b/nexus/src/app/background/tasks/mod.rs @@ -43,6 +43,7 @@ pub mod region_snapshot_replacement_start; pub mod region_snapshot_replacement_step; 
pub mod saga_recovery; pub mod service_firewall_rules; +pub mod support_bundle; pub mod support_bundle_collector; pub mod sync_service_zone_nat; pub mod sync_switch_configuration; diff --git a/nexus/src/app/background/tasks/support_bundle/mod.rs b/nexus/src/app/background/tasks/support_bundle/mod.rs new file mode 100644 index 00000000000..9b7b4ac4aa0 --- /dev/null +++ b/nexus/src/app/background/tasks/support_bundle/mod.rs @@ -0,0 +1,7 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Support bundle related types and utilities + +pub mod perfetto; diff --git a/nexus/src/app/background/tasks/support_bundle/perfetto.rs b/nexus/src/app/background/tasks/support_bundle/perfetto.rs new file mode 100644 index 00000000000..8653b7b907b --- /dev/null +++ b/nexus/src/app/background/tasks/support_bundle/perfetto.rs @@ -0,0 +1,51 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Perfetto Trace Event format support for visualizing support bundle collection + +use serde::Deserialize; +use serde::Serialize; + +/// Represents a Perfetto Trace Event format JSON file for visualization. +/// +/// This format is used by the Perfetto trace viewer () +/// to visualize timing information for operations. +#[derive(Serialize, Deserialize)] +pub struct Trace { + #[serde(rename = "traceEvents")] + pub trace_events: Vec, + /// Display unit for time values in the UI (e.g., "ms" for milliseconds) + #[serde(rename = "displayTimeUnit")] + pub display_time_unit: String, +} + +/// A single event in the Perfetto Trace Event format. +/// +/// This represents a complete event (duration event) showing when an operation +/// started and how long it took. 
+#[derive(Serialize, Deserialize)] +pub struct TraceEvent { + /// Human-readable name of the event + pub name: String, + /// Category name (abbreviated as "cat" in Perfetto format). + /// Used to group related events together in the trace viewer. + pub cat: String, + /// Phase type (abbreviated as "ph" in Perfetto format). + /// "X" means a "Complete" event with both timestamp and duration. + pub ph: String, + /// Timestamp in microseconds (abbreviated as "ts" in Perfetto format). + /// Represents when the event started, as microseconds since the epoch. + pub ts: i64, + /// Duration in microseconds (abbreviated as "dur" in Perfetto format). + /// How long the event took to complete. + pub dur: i64, + /// Process ID. Used to separate events into different process lanes + /// in the trace viewer. + pub pid: u32, + /// Thread ID. Used to separate events into different thread lanes + /// within a process in the trace viewer. + pub tid: usize, + /// Arbitrary key-value pairs with additional event metadata + pub args: serde_json::Value, +} diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 0a712a89480..0d4939f3acc 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -78,6 +78,8 @@ use zip::ZipArchive; use zip::ZipWriter; use zip::write::FullFileOptions; +use super::support_bundle::perfetto; + // We use "/var/tmp" to use Nexus' filesystem for temporary storage, // rather than "/tmp", which would keep this collected data in-memory. 
const TEMPDIR: &str = "/var/tmp"; @@ -1088,27 +1090,27 @@ impl BundleCollection { .max(0); let step_id = i + 1; - json!({ - "name": step.name, - "cat": "bundle_collection", - "ph": "X", // Complete event (has duration) - "ts": start_us, - "dur": duration_us, - "pid": 1, - "tid": step_id, - "args": { + perfetto::TraceEvent { + name: step.name.clone(), + cat: "bundle_collection".to_string(), + ph: "X".to_string(), + ts: start_us, + dur: duration_us, + pid: 1, + tid: step_id, + args: json!({ "status": step.status.to_string(), - } - }) + }), + } }) .collect(); - let trace_json = json!({ - "traceEvents": trace_events, - "displayTimeUnit": "ms", - }); + let trace = perfetto::Trace { + trace_events, + display_time_unit: "ms".to_string(), + }; - let trace_content = serde_json::to_string_pretty(&trace_json) + let trace_content = serde_json::to_string_pretty(&trace) .context("Failed to serialize trace JSON")?; tokio::fs::write(&trace_path, trace_content).await.with_context( @@ -1119,7 +1121,7 @@ impl BundleCollection { self.log, "Wrote trace file"; "path" => %trace_path, - "num_events" => trace_events.len() + "num_events" => trace.trace_events.len() ); Ok(()) @@ -2669,64 +2671,51 @@ mod test { .await .expect("Should be able to download trace file"); - // Parse the trace file as JSON + // Parse the trace file using our Perfetto structs let body_bytes = response.into_body().collect().await.unwrap().to_bytes(); - let trace_json: serde_json::Value = serde_json::from_slice(&body_bytes) - .expect("Trace file should be valid JSON"); + let trace: perfetto::Trace = serde_json::from_slice(&body_bytes) + .expect("Trace file should be valid Perfetto JSON"); - // Verify the structure matches Perfetto Trace Event format - let trace_events = trace_json - .get("traceEvents") - .expect("Should have traceEvents field") - .as_array() - .expect("traceEvents should be an array"); + // Verify display time unit + assert_eq!( + trace.display_time_unit, "ms", + "Display time unit should be 
milliseconds" + ); // We should have at least the main collection steps assert!( - !trace_events.is_empty(), + !trace.trace_events.is_empty(), "Should have at least one trace event" ); - // Verify each event has the expected fields - for event in trace_events { - assert!(event.get("name").is_some(), "Event should have name"); + // Verify each event has the expected structure + for event in &trace.trace_events { + // Verify category assert_eq!( - event.get("cat").and_then(|v| v.as_str()), - Some("bundle_collection"), + event.cat, "bundle_collection", "Event should have category 'bundle_collection'" ); - assert_eq!( - event.get("ph").and_then(|v| v.as_str()), - Some("X"), - "Event should be Complete event type" - ); - assert!( - event.get("ts").and_then(|v| v.as_i64()).is_some(), - "Event should have timestamp" - ); - assert!( - event.get("dur").and_then(|v| v.as_i64()).is_some(), - "Event should have duration" - ); - assert!( - event.get("args").is_some(), - "Event should have args field" - ); + // Verify phase type + assert_eq!(event.ph, "X", "Event should be Complete event type"); + // Verify timestamps are positive + assert!(event.ts >= 0, "Event timestamp should be non-negative"); + assert!(event.dur >= 0, "Event duration should be non-negative"); + // Verify process and thread IDs are set + assert_eq!(event.pid, 1, "All events should have pid=1"); + assert!(event.tid > 0, "Event thread ID should be positive"); } // Verify we have the same number of events as steps in the report assert_eq!( - trace_events.len(), + trace.trace_events.len(), report.steps.len(), "Number of events should match number of steps" ); // Verify step names match between report and trace - let trace_names: std::collections::HashSet<_> = trace_events - .iter() - .filter_map(|e| e.get("name").and_then(|v| v.as_str())) - .collect(); + let trace_names: std::collections::HashSet<_> = + trace.trace_events.iter().map(|e| e.name.as_str()).collect(); let report_names: std::collections::HashSet<_> = 
report.steps.iter().map(|s| s.name.as_str()).collect(); assert_eq!( From a18d57aeae84fff78e59c39fdbc0df10f1b924a3 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 8 Dec 2025 13:14:07 -0800 Subject: [PATCH 11/18] [support bundles] Split support bundles into modules, add README for devs --- .../background/tasks/support_bundle/README.md | 66 + .../background/tasks/support_bundle/cache.rs | 92 + .../tasks/support_bundle/collection.rs | 623 ++++++ .../background/tasks/support_bundle/mod.rs | 5 + .../tasks/support_bundle/request.rs | 195 ++ .../background/tasks/support_bundle/step.rs | 129 ++ .../tasks/support_bundle/steps/bundle_id.rs | 22 + .../tasks/support_bundle/steps/ereports.rs | 211 ++ .../tasks/support_bundle/steps/host_info.rs | 338 ++++ .../tasks/support_bundle/steps/mod.rs | 89 + .../support_bundle/steps/reconfigurator.rs | 64 + .../tasks/support_bundle/steps/sled_cubby.rs | 146 ++ .../tasks/support_bundle/steps/sp_dumps.rs | 110 + .../tasks/support_bundle_collector.rs | 1782 +---------------- 14 files changed, 2110 insertions(+), 1762 deletions(-) create mode 100644 nexus/src/app/background/tasks/support_bundle/README.md create mode 100644 nexus/src/app/background/tasks/support_bundle/cache.rs create mode 100644 nexus/src/app/background/tasks/support_bundle/collection.rs create mode 100644 nexus/src/app/background/tasks/support_bundle/request.rs create mode 100644 nexus/src/app/background/tasks/support_bundle/step.rs create mode 100644 nexus/src/app/background/tasks/support_bundle/steps/bundle_id.rs create mode 100644 nexus/src/app/background/tasks/support_bundle/steps/ereports.rs create mode 100644 nexus/src/app/background/tasks/support_bundle/steps/host_info.rs create mode 100644 nexus/src/app/background/tasks/support_bundle/steps/mod.rs create mode 100644 nexus/src/app/background/tasks/support_bundle/steps/reconfigurator.rs create mode 100644 nexus/src/app/background/tasks/support_bundle/steps/sled_cubby.rs create mode 100644 
nexus/src/app/background/tasks/support_bundle/steps/sp_dumps.rs diff --git a/nexus/src/app/background/tasks/support_bundle/README.md b/nexus/src/app/background/tasks/support_bundle/README.md new file mode 100644 index 00000000000..e6a52539afd --- /dev/null +++ b/nexus/src/app/background/tasks/support_bundle/README.md @@ -0,0 +1,66 @@ +# Support Bundles + +**Support Bundles** provide a mechanism for extracting information about a +running Oxide system, and giving operators control over the exfiltration of that +data. + +This README is intended for developers trying to add data to the bundle. + +## Step Execution Framework + +Support Bundles are collected using **steps**, which are named functions acting +on the `BundleCollection` that can: + +* Read from the database, or query arbitrary services +* Emit data to the output zipfile +* Produce additional follow-up **steps**, if necessary + +If you're interested in adding data to a support bundle, you will probably be +adding data to an existing **step**, or creating a new one. + +The set of all initial steps is defined in +`nexus/src/app/background/tasks/support_bundle/steps/mod.rs`, within a function +called `all()`. Some of these steps may themselves spawn additional steps, +such as `STEP_SPAWN_SLEDS`, which spawns a per-sled step to query the sled +host OS itself. + +### Tracing + +**Steps** are automatically instrumented, and their durations are emitted to an +output file in the bundle named `meta/trace.json`. These traces are in a format +which can be understood by **Perfetto**, a trace-viewer, and which provides +a browser-based interface at . + +## Filtering Bundle Contents + +Support Bundles are collected by the `support_bundle_collector` +background task. They are collected as zipfiles within a single Nexus instance, +which are then transferred to durable storage. + +The contents of a bundle may be controlled by modifying the **BundleRequest** +structure. 
This request provides filters for controlling the categories of +data which are collected (e.g., "Host OS info") as well as arguments for +more specific constraints (e.g., "Collect info from a specific Sled"). + +Bundle **steps** may query the `BundleRequest` to identify whether or not their +contents should be included. + +## Overview for adding new data + +* **Determine if your data should exist in a new step**. The existing set of + steps exists in `support_bundle/steps`. Adding a new step provides a new unit + of execution (it can be executed concurrently with other steps), and a unit of + tracing (it will be instrumented independently of other steps). +* If you're adding a new step... + * **Add it as a new module**, within `support_bundle/steps`. + * **Ensure it's part of `steps::all()`, or spawned by an existing step**. This + will be necessary for your step to be executed. + * **Provide a way for bundles to opt-out of collecting this data**. Check the + `BundleRequest` to see if your data exists in one of the current filters, or + consider adding a new one if your step involves a new category of data. Either + way, your new step should read `BundleRequest` to decide if it should trigger + before performing any subsequent operations. +* **Consider Caching**. If your new data requires performing any potentially + expensive operations which might be shared with other steps (e.g., reading + from the database, creating and using progenitor clients, etc) consider adding + that data to `support_bundle/cache`. diff --git a/nexus/src/app/background/tasks/support_bundle/cache.rs b/nexus/src/app/background/tasks/support_bundle/cache.rs new file mode 100644 index 00000000000..314345c64b7 --- /dev/null +++ b/nexus/src/app/background/tasks/support_bundle/cache.rs @@ -0,0 +1,92 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. 
If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Cached data or clients which are collected by the bundle +//! +//! This is used to share data which may be used by multiple +//! otherwise independent steps. + +use crate::app::background::tasks::support_bundle::collection::BundleCollection; + +use gateway_client::Client as MgsClient; +use internal_dns_types::names::ServiceName; +use nexus_db_model::Sled; +use nexus_types::deployment::SledFilter; +use slog_error_chain::InlineErrorChain; +use std::sync::Arc; +use tokio::sync::OnceCell; + +/// Caches information which can be derived from the BundleCollection. +/// +/// This is exists as a small optimization for independent steps which may try +/// to read / access similar data, especially when it's fallible: we only need +/// to attempt to look it up once, and all steps can share it. +#[derive(Clone)] +pub struct Cache { + inner: Arc, +} + +struct Inner { + all_sleds: OnceCell>>, + mgs_client: OnceCell>, +} + +impl Cache { + pub fn new() -> Self { + Self { + inner: Arc::new(Inner { + all_sleds: OnceCell::new(), + mgs_client: OnceCell::new(), + }), + } + } + + pub async fn get_or_initialize_all_sleds<'a>( + &'a self, + collection: &BundleCollection, + ) -> Option<&'a Vec> { + self.inner + .all_sleds + .get_or_init(|| async { + collection + .datastore() + .sled_list_all_batched( + &collection.opctx(), + SledFilter::InService, + ) + .await + .ok() + }) + .await + .as_ref() + } + + pub async fn get_or_initialize_mgs_client<'a>( + &'a self, + collection: &BundleCollection, + ) -> Option<&'a MgsClient> { + self.inner + .mgs_client + .get_or_init(|| async { create_mgs_client(collection).await.ok() }) + .await + .as_ref() + } +} + +async fn create_mgs_client( + collection: &BundleCollection, +) -> anyhow::Result { + let log = collection.log(); + collection + .resolver() + .lookup_socket_v6(ServiceName::ManagementGatewayService) + .await + .map(|sockaddr| { 
+ let url = format!("http://{}", sockaddr); + gateway_client::Client::new(&url, log.clone()) + }).map_err(|e| { + error!(log, "failed to resolve MGS address"; "error" => InlineErrorChain::new(&e)); + e.into() + }) +} diff --git a/nexus/src/app/background/tasks/support_bundle/collection.rs b/nexus/src/app/background/tasks/support_bundle/collection.rs new file mode 100644 index 00000000000..1008c85128f --- /dev/null +++ b/nexus/src/app/background/tasks/support_bundle/collection.rs @@ -0,0 +1,623 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! The entrypoint to all support bundle collection. +//! +//! These are the primitives used to look up everything else within the bundle. + +use crate::app::background::tasks::support_bundle::cache::Cache; +use crate::app::background::tasks::support_bundle::perfetto; +use crate::app::background::tasks::support_bundle::request::BundleRequest; +use crate::app::background::tasks::support_bundle::request::TEMPDIR; +use crate::app::background::tasks::support_bundle::step::CollectionStep; +use crate::app::background::tasks::support_bundle::steps; + +use anyhow::Context; +use camino::Utf8DirEntry; +use camino::Utf8Path; +use camino_tempfile::Utf8TempDir; +use camino_tempfile::tempdir_in; +use camino_tempfile::tempfile_in; +use internal_dns_resolver::Resolver; +use nexus_db_model::SupportBundle; +use nexus_db_model::SupportBundleState; +use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::DataStore; +use nexus_types::internal_api::background::SupportBundleCollectionReport; +use omicron_common::api::external::Error; +use omicron_uuid_kinds::DatasetUuid; +use omicron_uuid_kinds::SupportBundleUuid; +use omicron_uuid_kinds::ZpoolUuid; +use parallel_task_set::ParallelTaskSet; +use serde_json::json; +use sha2::Digest; +use sha2::Sha256; +use 
slog_error_chain::InlineErrorChain; +use std::io::Write; +use std::num::NonZeroU64; +use std::sync::Arc; +use tokio::io::AsyncReadExt; +use tokio::io::AsyncSeekExt; +use tokio::io::SeekFrom; +use tufaceous_artifact::ArtifactHash; +use zip::ZipWriter; +use zip::write::FullFileOptions; + +/// Wraps up all arguments to perform a single support bundle collection +pub struct BundleCollection { + datastore: Arc, + resolver: Resolver, + log: slog::Logger, + opctx: OpContext, + request: BundleRequest, + bundle: SupportBundle, + transfer_chunk_size: NonZeroU64, +} + +impl BundleCollection { + pub fn new( + datastore: Arc, + resolver: Resolver, + log: slog::Logger, + opctx: OpContext, + request: BundleRequest, + bundle: SupportBundle, + transfer_chunk_size: NonZeroU64, + ) -> Self { + Self { + datastore, + resolver, + log, + opctx, + request, + bundle, + transfer_chunk_size, + } + } + + pub fn datastore(&self) -> &Arc { + &self.datastore + } + + pub fn resolver(&self) -> &Resolver { + &self.resolver + } + + pub fn log(&self) -> &slog::Logger { + &self.log + } + + pub fn opctx(&self) -> &OpContext { + &self.opctx + } + + pub fn request(&self) -> &BundleRequest { + &self.request + } + + pub fn bundle(&self) -> &SupportBundle { + &self.bundle + } + + /// Collect the bundle within Nexus, and store it on a target sled. + pub async fn collect_bundle_and_store_on_sled( + self: &Arc, + ) -> anyhow::Result { + // Create a temporary directory where we'll store the support bundle + // as it's being collected. + let dir = tempdir_in(TEMPDIR)?; + + let report = self.collect_bundle_locally(&dir).await?; + self.store_bundle_on_sled(dir).await?; + Ok(report) + } + + // Create the support bundle, placing the contents into a user-specified + // directory. + // + // Does not attempt to convert the contents into a zipfile, nor send them + // to any durable storage. 
+ async fn collect_bundle_locally( + self: &Arc, + dir: &Utf8TempDir, + ) -> anyhow::Result { + // TL;DR: This `tokio::select` is allowed to poll multiple futures, but + // should not do any async work within the body of any chosen branch. A + // previous iteration of this code polled the "collection" as "&mut + // collection", and checked the status of the support bundle within a + // branch of the "select" polling "yield_interval.tick()". + // + // We organize this work to "check for cancellation" as a whole future + // for a critical, but subtle reason: After the tick timer yields, + // we may then try to `await` a database function. + // + // This, at a surface-level glance seems innocent enough. However, there + // is something potentially insidious here: if calling a datastore + // function - such as "support_bundle_get" - awaits acquiring access + // to a connection from the connection pool, while creating the + // collection ALSO potentially awaits acquiring access to the + // connection pool, it is possible for: + // + // 1. The `&mut collection` arm to have created a future, currently + // yielded, which wants access to this underlying resource. + // 2. The current operation executing in `support_bundle_get` to + // be awaiting access to this same underlying resource. + // + // In this specific case, the connection pool would be attempting to + // yield to the `&mut collection` arm, which cannot run, if we were + // awaiting in the body of a different async select arm. This would + // result in a deadlock. + // + // In the future, we may attempt to make access to the connection pool + // safer from concurrent asynchronous access - it is unsettling that + // multiple concurrent `.claim()` functions can cause this behavior - + // but in the meantime, we perform this cancellation check in a single + // future that always is polled concurrently with the collection work. 
+ // Because of this separation, each future is polled until one + // completes, at which point we deterministically exit. + // + // For more details, see: + // https://github.com/oxidecomputer/omicron/issues/9259 + + tokio::select! { + // Returns if the bundle should no longer be collected. + why = self.check_for_cancellation() => { + warn!( + &self.log, + "Support Bundle cancelled - stopping collection"; + "bundle" => %self.bundle.id, + "state" => ?self.bundle.state + ); + return Err(why); + }, + // Otherwise, keep making progress on the collection itself. + report = self.collect_bundle_as_file(&dir) => { + info!( + &self.log, + "Bundle Collection completed"; + "bundle" => %self.bundle.id + ); + return report; + }, + } + } + + async fn store_bundle_on_sled( + &self, + dir: Utf8TempDir, + ) -> anyhow::Result<()> { + // Create the zipfile as a temporary file + let mut zipfile = tokio::fs::File::from_std(bundle_to_zipfile(&dir)?); + let total_len = zipfile.metadata().await?.len(); + + // Collect the hash locally before we send it over the network + // + // We'll use this later during finalization to confirm the bundle + // has been stored successfully. + zipfile.seek(SeekFrom::Start(0)).await?; + let hash = sha2_hash(&mut zipfile).await?; + + // Find the sled where we're storing this bundle. + let sled_id = self + .datastore + .zpool_get_sled_if_in_service( + &self.opctx, + self.bundle.zpool_id.into(), + ) + .await?; + let sled_client = nexus_networking::sled_client( + &self.datastore, + &self.opctx, + sled_id, + &self.log, + ) + .await?; + + let zpool = ZpoolUuid::from(self.bundle.zpool_id); + let dataset = DatasetUuid::from(self.bundle.dataset_id); + let support_bundle = SupportBundleUuid::from(self.bundle.id); + + // Tell this sled to create the bundle. 
+ let creation_result = sled_client + .support_bundle_start_creation(&zpool, &dataset, &support_bundle) + .await + .with_context(|| "Support bundle failed to start creation")?; + + if matches!( + creation_result.state, + sled_agent_client::types::SupportBundleState::Complete + ) { + // Early exit case: the bundle was already created -- we must have either + // crashed or failed between "finalizing" and "writing to the database that we + // finished". + info!(&self.log, "Support bundle was already collected"; "bundle" => %self.bundle.id); + return Ok(()); + } + info!(&self.log, "Support bundle creation started"; "bundle" => %self.bundle.id); + + let mut offset = 0; + while offset < total_len { + // Stream the zipfile to the sled where it should be kept + let mut file = zipfile + .try_clone() + .await + .with_context(|| "Failed to clone zipfile")?; + file.seek(SeekFrom::Start(offset)).await.with_context(|| { + format!("Failed to seek to offset {offset} / {total_len} within zipfile") + })?; + + // Only stream at most "transfer_chunk_size" bytes at once + let chunk_size = std::cmp::min( + self.transfer_chunk_size.get(), + total_len - offset, + ); + + let limited_file = file.take(chunk_size); + let stream = tokio_util::io::ReaderStream::new(limited_file); + let body = reqwest::Body::wrap_stream(stream); + + info!( + &self.log, + "Streaming bundle chunk"; + "bundle" => %self.bundle.id, + "offset" => offset, + "length" => chunk_size, + ); + + sled_client.support_bundle_transfer( + &zpool, &dataset, &support_bundle, offset, body + ).await.with_context(|| { + format!("Failed to transfer bundle: {chunk_size}@{offset} of {total_len} to sled") + })?; + + offset += chunk_size; + } + + sled_client + .support_bundle_finalize( + &zpool, + &dataset, + &support_bundle, + &hash.to_string(), + ) + .await + .with_context(|| "Failed to finalize bundle")?; + + // Returning from this method should drop all temporary storage + // allocated locally for this support bundle. 
+ Ok(()) + } + + // Indefinitely perform periodic checks about whether or not we should + // cancel the bundle. + // + // Returns an error if: + // - The bundle state is no longer SupportBundleState::Collecting + // (which happens if the bundle has been explicitly cancelled, or + // if the backing storage has been expunged). + // - The bundle has been deleted + // + // Otherwise, keeps checking indefinitely while polled. + async fn check_for_cancellation(&self) -> anyhow::Error { + let work_duration = tokio::time::Duration::from_secs(5); + let mut yield_interval = tokio::time::interval_at( + tokio::time::Instant::now() + work_duration, + work_duration, + ); + + loop { + // Timer fired mid-collection - check if we should stop. + yield_interval.tick().await; + trace!( + self.log, + "Checking if Bundle Collection cancelled"; + "bundle" => %self.bundle.id + ); + + match self + .datastore + .support_bundle_get(&self.opctx, self.bundle.id.into()) + .await + { + Ok(SupportBundle { + state: SupportBundleState::Collecting, + .. + }) => { + // Bundle still collecting; continue... + continue; + } + Ok(_) => { + // Not collecting, for any reason: Time to exit + return anyhow::anyhow!("Support Bundle Cancelled"); + } + Err(Error::ObjectNotFound { .. } | Error::NotFound { .. 
}) => { + return anyhow::anyhow!("Support Bundle Deleted"); + } + Err(err) => { + warn!( + self.log, + "Database error checking bundle cancellation"; + InlineErrorChain::new(&err) + ); + + // If we cannot contact the database, retry later + continue; + } + } + } + } + + async fn run_collect_bundle_steps( + self: &Arc, + output: &Utf8TempDir, + mut steps: Vec, + ) -> SupportBundleCollectionReport { + let mut report = + SupportBundleCollectionReport::new(self.bundle.id.into()); + + const MAX_CONCURRENT_STEPS: usize = 16; + let mut tasks = + ParallelTaskSet::new_with_parallelism(MAX_CONCURRENT_STEPS); + + loop { + // Process all the currently-planned steps + while let Some(step) = steps.pop() { + let previous_result = tasks + .spawn({ + let collection = self.clone(); + let dir = output.path().to_path_buf(); + let log = self.log.clone(); + async move { + debug!(log, "Running step"; "step" => &step.name); + step.run(&collection, dir.as_path(), &log).await + } + }) + .await; + + if let Some(output) = previous_result { + output.process(&mut report, &mut steps); + }; + } + + // If we've run out of tasks to spawn, join any of the previously + // spawned tasks, if any exist. + if let Some(output) = tasks.join_next().await { + output.process(&mut report, &mut steps); + + // As soon as any task completes, see if we can spawn more work + // immediately. This ensures that the ParallelTaskSet is + // saturated as much as it can be. + continue; + } + + // Executing steps may create additional steps, as follow-up work. + // + // Only finish if we've exhausted all possible steps and joined all spawned work. 
+ if steps.is_empty() { + // Write trace file before returning + if let Err(err) = self.write_trace_file(output, &report).await { + warn!( + self.log, + "Failed to write trace file"; + "error" => ?err + ); + } + return report; + } + } + } + + // Write a Perfetto Event format JSON file for visualization + async fn write_trace_file( + &self, + output: &Utf8TempDir, + report: &SupportBundleCollectionReport, + ) -> anyhow::Result<()> { + let meta_dir = output.path().join("meta"); + tokio::fs::create_dir_all(&meta_dir).await.with_context(|| { + format!("Failed to create meta directory {meta_dir}") + })?; + + let trace_path = meta_dir.join("trace.json"); + + // Convert steps to Perfetto Trace Event format. + // Sort steps by start time and assign each a unique sequential ID. + // + // This is necessary because the trace event format does not like + // multiple slices to overlap - so we make each slice distinct. + // + // Ideally we'd be able to correlate these with actual tokio tasks, + // but it's hard to convert tokio::task::Id to a u64 because + // of https://github.com/tokio-rs/tokio/issues/7430 + let mut sorted_steps: Vec<_> = report.steps.iter().collect(); + sorted_steps.sort_by_key(|s| s.start); + + // Generate trace events - each step gets a unique ID (1, 2, 3, ...) 
+ // based on its start time order + let trace_events: Vec<_> = sorted_steps + .iter() + .enumerate() + .map(|(i, step)| { + let start_us = step.start.timestamp_micros(); + let duration_us = (step.end - step.start) + .num_microseconds() + .unwrap_or(0) + .max(0); + let step_id = i + 1; + + perfetto::TraceEvent { + name: step.name.clone(), + cat: "bundle_collection".to_string(), + ph: "X".to_string(), + ts: start_us, + dur: duration_us, + pid: 1, + tid: step_id, + args: json!({ + "status": step.status.to_string(), + }), + } + }) + .collect(); + + let trace = perfetto::Trace { + trace_events, + display_time_unit: "ms".to_string(), + }; + + let trace_content = serde_json::to_string_pretty(&trace) + .context("Failed to serialize trace JSON")?; + + tokio::fs::write(&trace_path, trace_content).await.with_context( + || format!("Failed to write trace file to {trace_path}"), + )?; + + info!( + self.log, + "Wrote trace file"; + "path" => %trace_path, + "num_events" => trace.trace_events.len() + ); + + Ok(()) + } + + // Perform the work of collecting the support bundle into a temporary directory + // + // "dir" is an output directory where data can be stored. + // + // If a partial bundle can be collected, it should be returned as + // an Ok(SupportBundleCollectionReport). Any failures from this function + // will prevent the support bundle from being collected altogether. + // + // NOTE: The background task infrastructure will periodically check to see + // if the bundle has been cancelled by a user while it is being collected. + // If that happens, this function will be CANCELLED at an await point. + // + // As a result, it is important that this function be implemented as + // cancel-safe. + // + // The "steps" used within this function - passed to + // [`Self::run_collect_bundle_steps`] - are run on a [`ParallelTaskSet`], + // which automatically aborts tasks when it is dropped. 
+ async fn collect_bundle_as_file( + self: &Arc, + dir: &Utf8TempDir, + ) -> anyhow::Result { + let log = &self.log; + + info!(&log, "Collecting bundle as local file"); + + let cache = Cache::new(); + let steps = steps::all(&cache); + Ok(self.run_collect_bundle_steps(dir, steps).await) + } +} + +// Takes a directory "dir", and zips the contents into a single zipfile. +fn bundle_to_zipfile(dir: &Utf8TempDir) -> anyhow::Result { + let tempfile = tempfile_in(TEMPDIR)?; + let mut zip = ZipWriter::new(tempfile); + + recursively_add_directory_to_zipfile(&mut zip, dir.path(), dir.path())?; + + Ok(zip.finish()?) +} + +fn recursively_add_directory_to_zipfile( + zip: &mut ZipWriter, + root_path: &Utf8Path, + dir_path: &Utf8Path, +) -> anyhow::Result<()> { + // Readdir might return entries in a non-deterministic order. + // Let's sort it for the zipfile, to be nice. + let mut entries = dir_path + .read_dir_utf8()? + .filter_map(Result::ok) + .collect::>(); + entries.sort_by(|a, b| a.file_name().cmp(&b.file_name())); + + for entry in &entries { + // Remove the "/tmp/..." prefix from the path when we're storing it in the + // zipfile. 
+ let dst = entry.path().strip_prefix(root_path)?; + + let file_type = entry.file_type()?; + if file_type.is_file() { + let src = entry.path(); + + let zip_time = entry + .path() + .metadata() + .and_then(|m| m.modified()) + .ok() + .and_then(|sys_time| jiff::Zoned::try_from(sys_time).ok()) + .and_then(|zoned| { + zip::DateTime::try_from(zoned.datetime()).ok() + }) + .unwrap_or_else(zip::DateTime::default); + + let opts = FullFileOptions::default() + .last_modified_time(zip_time) + .compression_method(zip::CompressionMethod::Deflated) + .large_file(true); + + zip.start_file_from_path(dst, opts)?; + let mut file = std::fs::File::open(&src)?; + std::io::copy(&mut file, zip)?; + } + if file_type.is_dir() { + let opts = FullFileOptions::default(); + zip.add_directory_from_path(dst, opts)?; + recursively_add_directory_to_zipfile(zip, root_path, entry.path())?; + } + } + Ok(()) +} + +async fn sha2_hash(file: &mut tokio::fs::File) -> anyhow::Result { + let mut buf = vec![0u8; 65536]; + let mut ctx = Sha256::new(); + loop { + let n = file.read(&mut buf).await?; + if n == 0 { + break; + } + ctx.write_all(&buf[0..n])?; + } + + let digest = ctx.finalize(); + Ok(ArtifactHash(digest.as_slice().try_into()?)) +} + +#[cfg(test)] +mod test { + use super::*; + + use camino_tempfile::tempdir; + + // Ensure that we can convert a temporary directory into a zipfile + #[test] + fn test_zipfile_creation() { + let dir = tempdir().unwrap(); + + std::fs::create_dir_all(dir.path().join("dir-a")).unwrap(); + std::fs::create_dir_all(dir.path().join("dir-b")).unwrap(); + std::fs::write(dir.path().join("dir-a").join("file-a"), "some data") + .unwrap(); + std::fs::write(dir.path().join("file-b"), "more data").unwrap(); + + let zipfile = bundle_to_zipfile(&dir) + .expect("Should have been able to bundle zipfile"); + let archive = zip::read::ZipArchive::new(zipfile).unwrap(); + + // We expect the order to be deterministically alphabetical + let mut names = archive.file_names(); + 
assert_eq!(names.next(), Some("dir-a/")); + assert_eq!(names.next(), Some("dir-a/file-a")); + assert_eq!(names.next(), Some("dir-b/")); + assert_eq!(names.next(), Some("file-b")); + assert_eq!(names.next(), None); + } +} diff --git a/nexus/src/app/background/tasks/support_bundle/mod.rs b/nexus/src/app/background/tasks/support_bundle/mod.rs index 9b7b4ac4aa0..0b62e169bd2 100644 --- a/nexus/src/app/background/tasks/support_bundle/mod.rs +++ b/nexus/src/app/background/tasks/support_bundle/mod.rs @@ -4,4 +4,9 @@ //! Support bundle related types and utilities +mod cache; +pub mod collection; pub mod perfetto; +pub mod request; +mod step; +mod steps; diff --git a/nexus/src/app/background/tasks/support_bundle/request.rs b/nexus/src/app/background/tasks/support_bundle/request.rs new file mode 100644 index 00000000000..90bcbbe3679 --- /dev/null +++ b/nexus/src/app/background/tasks/support_bundle/request.rs @@ -0,0 +1,195 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Support bundle request types and data selection + +use nexus_db_queries::db::datastore::EreportFilters; +use omicron_uuid_kinds::SledUuid; +use std::collections::HashMap; +use std::collections::HashSet; +use std::num::NonZeroU64; + +/// We use "/var/tmp" to use Nexus' filesystem for temporary storage, +/// rather than "/tmp", which would keep this collected data in-memory. +pub const TEMPDIR: &str = "/var/tmp"; + +/// The size of piece of a support bundle to transfer to the sled agent +/// within a single streaming request. +pub const CHUNK_SIZE: NonZeroU64 = NonZeroU64::new(1024 * 1024 * 1024).unwrap(); + +/// Describes the category of support bundle data. 
+#[derive(Debug, Clone, Copy, Hash, Eq, PartialEq)] +pub enum BundleDataCategory { + /// Collects reconfigurator state (some of the latest blueprints, + /// information about the target blueprint). + Reconfigurator, + /// Collects info from sled agents, running a handful of + /// diagnostic commands (e.g., zoneadm, dladm, etc). + HostInfo, + /// Collects sled serial numbers, cubby numbers, and UUIDs. + SledCubbyInfo, + /// Saves task dumps from SPs. + SpDumps, + /// Collects ereports + Ereports, +} + +/// Specifies what data to collect for a bundle data category. +/// +/// Each variant corresponds to a BundleDataCategory. +/// For categories without additional parameters, the variant is a unit variant. +/// For categories that can be filtered or configured, the variant contains +/// that configuration data. +#[derive(Debug, Clone, Eq, PartialEq)] +pub enum BundleData { + Reconfigurator, + HostInfo(HashSet), + SledCubbyInfo, + SpDumps, + Ereports(EreportFilters), +} + +impl BundleData { + fn category(&self) -> BundleDataCategory { + match self { + Self::Reconfigurator => BundleDataCategory::Reconfigurator, + Self::HostInfo(_) => BundleDataCategory::HostInfo, + Self::SledCubbyInfo => BundleDataCategory::SledCubbyInfo, + Self::SpDumps => BundleDataCategory::SpDumps, + Self::Ereports(_) => BundleDataCategory::Ereports, + } + } +} + +/// A collection of bundle data specifications. +/// +/// This wrapper ensures that categories and data always match - you can't +/// insert (BundleDataCategory::Reconfigurator, BundleData::SpDumps) +/// because each BundleData determines its own category. +#[derive(Debug, Clone)] +pub struct BundleDataSelection { + data: HashMap, +} + +impl BundleDataSelection { + pub fn new() -> Self { + Self { data: HashMap::new() } + } + + /// Inserts BundleData to be queried for a particular category within the + /// bundle. 
+ /// + /// Each category of data can only be specified once (e.g., inserting + /// BundleData::HostInfo multiple times will only use the most-recently + /// inserted specification) + pub fn insert(&mut self, bundle_data: BundleData) { + self.data.insert(bundle_data.category(), bundle_data); + } + + pub fn contains(&self, category: BundleDataCategory) -> bool { + self.data.contains_key(&category) + } + + pub fn get(&self, category: BundleDataCategory) -> Option<&BundleData> { + self.data.get(&category) + } +} + +impl FromIterator for BundleDataSelection { + fn from_iter>(iter: T) -> Self { + let mut selection = Self::new(); + for bundle_data in iter { + selection.insert(bundle_data); + } + selection + } +} + +impl Default for BundleDataSelection { + fn default() -> Self { + [ + BundleData::Reconfigurator, + BundleData::HostInfo(HashSet::from([SledSelection::All])), + BundleData::SledCubbyInfo, + BundleData::SpDumps, + BundleData::Ereports(EreportFilters { + start_time: Some(chrono::Utc::now() - chrono::Days::new(7)), + ..EreportFilters::default() + }), + ] + .into_iter() + .collect() + } +} + +/// The set of sleds to include +/// +/// Multiple values of this enum are joined together into a HashSet. +/// Therefore "SledSelection::All" overrides specific sleds. +#[derive(Debug, Clone, Hash, Eq, PartialEq)] +pub enum SledSelection { + All, + Specific(SledUuid), +} + +/// Specifies the data to be collected within the Support Bundle. +#[derive(Clone)] +pub struct BundleRequest { + /// The size of chunks to use when transferring a bundle from Nexus + /// to a sled agent. + /// + /// Typically, this is CHUNK_SIZE, but can be modified for testing. + pub transfer_chunk_size: NonZeroU64, + + /// The set of data to be included within this bundle. + /// + /// Maps each category to its filter. If a category is not in the map, + /// it is excluded from the bundle. 
+ pub data_selection: BundleDataSelection, +} + +impl BundleRequest { + pub fn include_reconfigurator_data(&self) -> bool { + self.data_selection.contains(BundleDataCategory::Reconfigurator) + } + + pub fn include_host_info(&self) -> bool { + self.data_selection.contains(BundleDataCategory::HostInfo) + } + + pub fn include_sled_host_info(&self, id: SledUuid) -> bool { + let selection = + match self.data_selection.get(BundleDataCategory::HostInfo) { + Some(BundleData::HostInfo(selection)) => selection, + _ => return false, + }; + + selection.contains(&SledSelection::Specific(id)) + || selection.contains(&SledSelection::All) + } + + pub fn get_ereport_filters(&self) -> Option<&EreportFilters> { + match self.data_selection.get(BundleDataCategory::Ereports) { + Some(BundleData::Ereports(filters)) => Some(filters), + _ => None, + } + } + + pub fn include_sled_cubby_info(&self) -> bool { + self.data_selection.contains(BundleDataCategory::SledCubbyInfo) + } + + pub fn include_sp_dumps(&self) -> bool { + self.data_selection.contains(BundleDataCategory::SpDumps) + } +} + +impl Default for BundleRequest { + fn default() -> Self { + Self { + transfer_chunk_size: CHUNK_SIZE, + data_selection: BundleDataSelection::default(), + } + } +} diff --git a/nexus/src/app/background/tasks/support_bundle/step.rs b/nexus/src/app/background/tasks/support_bundle/step.rs new file mode 100644 index 00000000000..5909265b976 --- /dev/null +++ b/nexus/src/app/background/tasks/support_bundle/step.rs @@ -0,0 +1,129 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! 
Support bundle collection step execution framework + +use crate::app::background::tasks::support_bundle::collection::BundleCollection; + +use camino::Utf8Path; +use chrono::DateTime; +use chrono::Utc; +use futures::future::BoxFuture; +use nexus_types::internal_api::background::SupportBundleCollectionReport; +use nexus_types::internal_api::background::SupportBundleCollectionStep; +use nexus_types::internal_api::background::SupportBundleCollectionStepStatus; +use nexus_types::internal_api::background::SupportBundleEreportStatus; +use slog_error_chain::InlineErrorChain; +use std::sync::Arc; + +// This type describes a single step in the Support Bundle collection. +// +// - All steps have access to the "BundleCollection", which includes +// tools for actually acquiring data. +// - All steps have access to an output directory where they can store +// serialized data to a file. +// - Finally, all steps can emit a "CollectionStepOutput", which can either +// update the collection report, or generate more steps. 
+pub type CollectionStepFn = Box<
+ dyn for<'b> FnOnce(
+ &'b Arc,
+ &'b Utf8Path,
+ )
+ -> BoxFuture<'b, anyhow::Result>
+ + Send,
+>;
+
+pub struct CollectionStep {
+ pub name: String,
+ pub step_fn: CollectionStepFn,
+}
+
+impl CollectionStep {
+ pub fn new(name: impl Into, step_fn: CollectionStepFn) -> Self {
+ Self { name: name.into(), step_fn }
+ }
+
+ pub async fn run(
+ self,
+ collection: &Arc,
+ output: &Utf8Path,
+ log: &slog::Logger,
+ ) -> CompletedCollectionStep {
+ let start = Utc::now();
+
+ let output = (self.step_fn)(collection, output)
+ .await
+ .inspect_err(|err| {
+ warn!(
+ log,
+ "Step failed";
+ "step" => &self.name,
+ InlineErrorChain::new(err.as_ref()),
+ );
+ })
+ .unwrap_or_else(|err| CollectionStepOutput::Failed(err));
+
+ let end = Utc::now();
+
+ CompletedCollectionStep { name: self.name, start, end, output }
+ }
+}
+
+pub struct CompletedCollectionStep {
+ pub name: String,
+ pub start: DateTime,
+ pub end: DateTime,
+ pub output: CollectionStepOutput,
+}
+
+impl CompletedCollectionStep {
+ // Updates the collection report based on the output of a collection step,
+ // and possibly extends the set of all steps to be executed.
+ pub fn process(
+ self,
+ report: &mut SupportBundleCollectionReport,
+ steps: &mut Vec,
+ ) {
+ use SupportBundleCollectionStepStatus as Status;
+
+ let status = match self.output {
+ CollectionStepOutput::Skipped => Status::Skipped,
+ CollectionStepOutput::Failed(err) => {
+ Status::Failed(err.to_string())
+ }
+ CollectionStepOutput::Ereports(status) => {
+ report.ereports = Some(status);
+ Status::Ok
+ }
+ CollectionStepOutput::Spawn { extra_steps } => {
+ steps.extend(extra_steps);
+ Status::Ok
+ }
+ CollectionStepOutput::None => Status::Ok,
+ };
+
+ // Add information about this completed step to the bundle report. 
+ let step = SupportBundleCollectionStep { + name: self.name, + start: self.start, + end: self.end, + status, + }; + report.steps.push(step); + } +} + +pub enum CollectionStepOutput { + // The step was not executed intentionally + Skipped, + // The step encountered a fatal error and could not complete. + // + // It may have still saved a partial set of data to the bundle. + Failed(anyhow::Error), + Ereports(SupportBundleEreportStatus), + // The step spawned additional steps to execute + Spawn { extra_steps: Vec }, + // The step completed with nothing to report, and no follow-up steps + None, +} diff --git a/nexus/src/app/background/tasks/support_bundle/steps/bundle_id.rs b/nexus/src/app/background/tasks/support_bundle/steps/bundle_id.rs new file mode 100644 index 00000000000..392dfc21ea6 --- /dev/null +++ b/nexus/src/app/background/tasks/support_bundle/steps/bundle_id.rs @@ -0,0 +1,22 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Collects metadata about the bundle itself (currently only the ID) + +use crate::app::background::tasks::support_bundle::collection::BundleCollection; +use crate::app::background::tasks::support_bundle::step::CollectionStepOutput; +use camino::Utf8Path; + +pub async fn collect( + collection: &BundleCollection, + dir: &Utf8Path, +) -> anyhow::Result { + tokio::fs::write( + dir.join("bundle_id.txt"), + collection.bundle().id.to_string(), + ) + .await?; + + Ok(CollectionStepOutput::None) +} diff --git a/nexus/src/app/background/tasks/support_bundle/steps/ereports.rs b/nexus/src/app/background/tasks/support_bundle/steps/ereports.rs new file mode 100644 index 00000000000..24d8272aefb --- /dev/null +++ b/nexus/src/app/background/tasks/support_bundle/steps/ereports.rs @@ -0,0 +1,211 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. 
If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Collect ereports for support bundles + +use crate::app::background::tasks::support_bundle::collection::BundleCollection; +use crate::app::background::tasks::support_bundle::step::CollectionStepOutput; + +use anyhow::Context; +use camino::Utf8Path; +use camino::Utf8PathBuf; +use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::DataStore; +use nexus_db_queries::db::datastore; +use nexus_db_queries::db::datastore::EreportFilters; +use nexus_db_queries::db::pagination::Paginator; +use nexus_types::fm::Ereport; +use nexus_types::internal_api::background::SupportBundleEreportStatus; +use omicron_uuid_kinds::GenericUuid; +use slog::Logger; +use slog_error_chain::InlineErrorChain; +use std::sync::Arc; + +pub async fn collect( + collection: &BundleCollection, + dir: &Utf8Path, +) -> anyhow::Result { + let (log, opctx, datastore, request) = ( + collection.log(), + collection.opctx(), + collection.datastore(), + collection.request(), + ); + let ereport_filters = request.get_ereport_filters(); + + let Some(ereport_filters) = ereport_filters else { + debug!(log, "Support bundle: ereports not requested"); + return Ok(CollectionStepOutput::Skipped); + }; + let ereports_dir = dir.join("ereports"); + let mut status = SupportBundleEreportStatus::default(); + if let Err(err) = save_ereports( + log, + opctx, + datastore, + ereport_filters.clone(), + ereports_dir, + &mut status, + ) + .await + { + warn!( + log, + "Support bundle: ereport collection failed \ + ({} collected successfully)", + status.n_collected; + InlineErrorChain::new(err.as_ref()) + ); + status.errors.push(InlineErrorChain::new(err.as_ref()).to_string()); + }; + + Ok(CollectionStepOutput::Ereports(status)) +} + +async fn save_ereports( + log: &Logger, + opctx: &OpContext, + datastore: &Arc, + filters: EreportFilters, + dir: Utf8PathBuf, + status: &mut SupportBundleEreportStatus, +) -> 
anyhow::Result<()> { + let mut paginator = Paginator::new( + datastore::SQL_BATCH_SIZE, + dropshot::PaginationOrder::Ascending, + ); + while let Some(p) = paginator.next() { + let ereports = datastore + .ereport_fetch_matching(&opctx, &filters, &p.current_pagparams()) + .await + .map_err(|e| e.internal_context("failed to query for ereports"))?; + paginator = p.found_batch(&ereports, &|ereport| { + (ereport.restart_id.into_untyped_uuid(), ereport.ena) + }); + + let prev_n_collected = status.n_collected; + let n_ereports = ereports.len(); + status.n_found += n_ereports; + + for ereport in ereports { + match ereport.try_into() { + Ok(ereport) => { + write_ereport(ereport, &dir).await?; + status.n_collected += 1; + } + Err(err) => { + warn!(log, "invalid ereport"; "error" => %err); + status.errors.push(err.to_string()); + } + } + } + debug!( + log, + "Support bundle: added {} ereports ({} found)", + status.n_collected - prev_n_collected, + n_ereports + ); + } + + info!( + log, + "Support bundle: collected {} total ereports", status.n_collected + ); + Ok(()) +} + +async fn write_ereport(ereport: Ereport, dir: &Utf8Path) -> anyhow::Result<()> { + // Here's where we construct the file path for each ereport JSON file, + // given the top-level ereport directory path. Each ereport is stored in a + // subdirectory for the part and serial numbers of the system that produced + // the ereport. Part numbers must be included in addition to serial + // numbers, as the v1 serial scheme only guarantees uniqueness within a + // part number. These paths take the following form: + // + // {part-number}-{serial_number}/{restart_id}/{ENA}.json + // + // We can assume that the restart ID and ENA consist only of + // filesystem-safe characters, as the restart ID is known to be a UUID, and + // the ENA is just an integer. 
For the serial and part numbers, which + // Nexus doesn't have full control over --- it came from the ereport + // metadata --- we must check that it doesn't contain any characters + // unsuitable for use in a filesystem path. + let pn = ereport + .data + .part_number + .as_deref() + // If the part or serial numbers contain any unsavoury characters, it + // goes in the `unknown_serial` hole! Note that the alleged serial + // number from the ereport will still be present in the JSON as a + // string, so we're not *lying* about what was received; we're just + // giving up on using it in the path. + .filter(|&s| is_fs_safe_single_path_component(s)) + .unwrap_or("unknown_part"); + let sn = ereport + .data + .serial_number + .as_deref() + .filter(|&s| is_fs_safe_single_path_component(s)) + .unwrap_or("unknown_serial"); + let id = &ereport.data.id; + + let dir = dir + .join(format!("{pn}-{sn}")) + // N.B. that we call `into_untyped_uuid()` here, as the `Display` + // implementation for a typed UUID appends " (ereporter_restart)", which + // we don't want. + .join(id.restart_id.into_untyped_uuid().to_string()); + tokio::fs::create_dir_all(&dir) + .await + .with_context(|| format!("failed to create directory '{dir}'"))?; + let file_path = dir.join(format!("{}.json", id.ena)); + let json = serde_json::to_vec(&ereport).with_context(|| { + format!("failed to serialize ereport {pn}:{sn}/{id}") + })?; + tokio::fs::write(&file_path, json) + .await + .with_context(|| format!("failed to write '{file_path}'")) +} + +fn is_fs_safe_single_path_component(s: &str) -> bool { + // Might be path traversal... + if s == "." || s == ".." { + return false; + } + + if s == "~" { + return false; + } + + const BANNED_CHARS: &[char] = &[ + // Check for path separators. + // + // Naively, we might reach for `std::path::is_separator()` here. 
+ // However, this function only checks if a path is a permitted + // separator on the *current* platform --- so, running on illumos, we + // will only check for Unix path separators. But, because the support + // bundle may be extracted on a workstation system by Oxide support + // personnel or by the customer, we should also make sure we don't + // allow the use of Windows path separators, which `is_separator()` + // won't check for on Unix systems. + '/', '\\', + // Characters forbidden on Windows, per: + // https://learn.microsoft.com/en-us/windows/win32/fileio/naming-a-file#naming-conventions + '<', '>', ':', '"', '|', '?', '*', + ]; + + // Rather than using `s.contains()`, we do all the checks in one pass. + for c in s.chars() { + if BANNED_CHARS.contains(&c) { + return false; + } + + // Definitely no control characters! + if c.is_control() { + return false; + } + } + + true +} diff --git a/nexus/src/app/background/tasks/support_bundle/steps/host_info.rs b/nexus/src/app/background/tasks/support_bundle/steps/host_info.rs new file mode 100644 index 00000000000..0e47aa5b0b5 --- /dev/null +++ b/nexus/src/app/background/tasks/support_bundle/steps/host_info.rs @@ -0,0 +1,338 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! 
Collect host information from sleds for support bundles + +use crate::app::background::tasks::support_bundle::cache::Cache; +use crate::app::background::tasks::support_bundle::collection::BundleCollection; +use crate::app::background::tasks::support_bundle::step::CollectionStep; +use crate::app::background::tasks::support_bundle::step::CollectionStepOutput; + +use anyhow::Context; +use anyhow::bail; +use camino::Utf8Path; +use futures::FutureExt; +use futures::StreamExt; +use futures::future::Future; +use futures::stream::FuturesUnordered; +use nexus_db_model::Sled; +use nexus_networking; +use nexus_types::identity::Asset; +use tokio::io::AsyncWriteExt; + +pub async fn spawn_query_all_sleds( + collection: &BundleCollection, + cache: &Cache, +) -> anyhow::Result { + let request = collection.request(); + + if !request.include_host_info() { + return Ok(CollectionStepOutput::Skipped); + } + + let all_sleds = cache.get_or_initialize_all_sleds(collection).await; + + let Some(all_sleds) = all_sleds else { + bail!("Could not read list of sleds"); + }; + + let mut extra_steps: Vec = vec![]; + for sled in all_sleds { + if !request.include_sled_host_info(sled.id()) { + continue; + } + + let sled = sled.clone(); + extra_steps.push(CollectionStep::new( + format!("sled data for sled {}", sled.id()), + Box::new({ + move |collection, dir| { + async move { + collect_data_from_sled(collection, sled, dir).await + } + .boxed() + } + }), + )) + } + + Ok(CollectionStepOutput::Spawn { extra_steps }) +} + +// Collect data from a sled, storing it into a directory that will +// be turned into a support bundle. +// +// - "sled" is the sled from which we should collect data. +// - "dir" is a directory where data can be stored, to be turned +// into a bundle after collection completes. 
+async fn collect_data_from_sled( + collection: &BundleCollection, + sled: Sled, + dir: &Utf8Path, +) -> anyhow::Result { + let (log, opctx, datastore, request) = ( + collection.log(), + collection.opctx(), + collection.datastore(), + collection.request(), + ); + + if !request.include_sled_host_info(sled.id()) { + return Ok(CollectionStepOutput::Skipped); + } + + info!(log, "Collecting bundle info from sled"; "sled" => %sled.id()); + let sled_path = dir + .join("rack") + .join(sled.rack_id.to_string()) + .join("sled") + .join(sled.id().to_string()); + tokio::fs::create_dir_all(&sled_path).await?; + tokio::fs::write(sled_path.join("sled.txt"), format!("{sled:?}")).await?; + + let sled_client = match nexus_networking::sled_client( + &datastore, + &opctx, + sled.id(), + log, + ) + .await + { + Ok(client) => client, + Err(err) => { + tokio::fs::write( + sled_path.join("error.txt"), + "Could not contact sled", + ) + .await.with_context(|| { + format!("Failed to save 'error.txt' to bundle when recording error: {err}") + })?; + bail!("Could not contact sled: {err}"); + } + }; + + // NB: As new sled-diagnostic commands are added they should + // be added to this array so that their output can be saved + // within the support bundle. 
+ let mut diag_cmds = futures::stream::iter([
+ save_diag_cmd_output_or_error(
+ &sled_path,
+ "zoneadm",
+ sled_client.support_zoneadm_info(),
+ )
+ .boxed(),
+ save_diag_cmd_output_or_error(
+ &sled_path,
+ "dladm",
+ sled_client.support_dladm_info(),
+ )
+ .boxed(),
+ save_diag_cmd_output_or_error(
+ &sled_path,
+ "ipadm",
+ sled_client.support_ipadm_info(),
+ )
+ .boxed(),
+ save_diag_cmd_output_or_error(
+ &sled_path,
+ "nvmeadm",
+ sled_client.support_nvmeadm_info(),
+ )
+ .boxed(),
+ save_diag_cmd_output_or_error(
+ &sled_path,
+ "pargs",
+ sled_client.support_pargs_info(),
+ )
+ .boxed(),
+ save_diag_cmd_output_or_error(
+ &sled_path,
+ "pfiles",
+ sled_client.support_pfiles_info(),
+ )
+ .boxed(),
+ save_diag_cmd_output_or_error(
+ &sled_path,
+ "pstack",
+ sled_client.support_pstack_info(),
+ )
+ .boxed(),
+ save_diag_cmd_output_or_error(
+ &sled_path,
+ "zfs",
+ sled_client.support_zfs_info(),
+ )
+ .boxed(),
+ save_diag_cmd_output_or_error(
+ &sled_path,
+ "zpool",
+ sled_client.support_zpool_info(),
+ )
+ .boxed(),
+ save_diag_cmd_output_or_error(
+ &sled_path,
+ "health-check",
+ sled_client.support_health_check(),
+ )
+ .boxed(),
+ ])
+ // Currently we execute up to 10 commands concurrently which
+ // might be doing their own concurrent work, for example
+ // collecting `pstack` output of every Oxide process that is
+ // found on a sled.
+ .buffer_unordered(10);
+
+ while let Some(result) = diag_cmds.next().await {
+ // Log that we failed to write the diag command output to a
+ // file but don't return early as we wish to get as much
+ // information as we can.
+ if let Err(e) = result {
+ error!(
+ log,
+ "failed to write diagnostic command output to \
+ file: {e}"
+ );
+ }
+ }
+
+ // For each zone we concurrently fire off a request to its
+ // sled-agent to collect its logs in a zip file and write the
+ // result to the support bundle. 
+ let zones = sled_client.support_logs().await?.into_inner();
+ let mut log_futs: FuturesUnordered<_> = zones
+ .iter()
+ .map(|zone| {
+ save_zone_log_zip_or_error(log, &sled_client, zone, &sled_path)
+ })
+ .collect();
+
+ while let Some(log_collection_result) = log_futs.next().await {
+ // We log any errors saving the zip file to disk and
+ // continue on.
+ if let Err(e) = log_collection_result {
+ error!(log, "failed to write logs output: {e}");
+ }
+ }
+ Ok(CollectionStepOutput::None)
+}
+
+// Run a `sled-diagnostics` future and save its output to a corresponding file.
+async fn save_diag_cmd_output_or_error(
+ path: &Utf8Path,
+ command: &str,
+ future: F,
+) -> anyhow::Result<()>
+where
+ F: Future<
+ Output = Result<
+ sled_agent_client::ResponseValue,
+ sled_agent_client::Error,
+ >,
+ > + Send,
+{
+ let result = future.await;
+ match result {
+ Ok(result) => {
+ let output = result.into_inner();
+ let json = serde_json::to_string(&output).with_context(|| {
+ format!("failed to serialize {command} output as json")
+ })?;
+ tokio::fs::write(path.join(format!("{command}.json")), json)
+ .await
+ .with_context(|| {
+ format!("failed to write output of {command} to file")
+ })?;
+ }
+ Err(err) => {
+ tokio::fs::write(
+ path.join(format!("{command}_err.txt")),
+ err.to_string(),
+ )
+ .await?;
+ }
+ }
+ Ok(())
+}
+
+async fn save_zone_log_zip_or_error(
+ logger: &slog::Logger,
+ client: &sled_agent_client::Client,
+ zone: &str,
+ path: &Utf8Path,
+) -> anyhow::Result<()> {
+ // In the future when support bundle collection exposes tuning parameters
+ // this can turn into a collection parameter.
+ const DEFAULT_MAX_ROTATED_LOGS: u32 = 5;
+
+ match client.support_logs_download(zone, DEFAULT_MAX_ROTATED_LOGS).await {
+ Ok(res) => {
+ let bytestream = res.into_inner();
+ let output_dir = path.join(format!("logs/{zone}"));
+ let output_path = output_dir.join("logs.zip");
+
+ // Ensure the logs output directory exists.
+ tokio::fs::create_dir_all(&output_dir).await.with_context( + || format!("failed to create output directory: {output_dir}"), + )?; + + // Stream the log zip file to disk. + let mut file = + tokio::fs::File::create(&output_path).await.with_context( + || format!("failed to create log zip file: {output_path}"), + )?; + + let stream = bytestream.into_inner().map(|chunk| { + chunk.map_err(|e| std::io::Error::other(e.to_string())) + }); + let mut reader = tokio_util::io::StreamReader::new(stream); + let _nbytes = tokio::io::copy(&mut reader, &mut file).await?; + file.flush().await?; + + // Unzip the log file into the same directory. + let output_path_unzip = output_dir.join("unzipped_logs"); + let zipfile_path = output_path.clone(); + tokio::task::spawn_blocking(move || { + extract_zip_file(&output_path_unzip, &zipfile_path) + }) + .await + .map_err(|join_error| { + anyhow::anyhow!(join_error) + .context("unzipping support bundle logs zip panicked") + })??; + + // Clean up the zip file that was written to disk. 
+ if let Err(e) = tokio::fs::remove_file(&output_path).await { + error!( + logger, + "failed to cleanup temporary logs zip file"; + "error" => %e, + "file" => %output_path, + + ); + } + } + Err(err) => { + tokio::fs::write( + path.join(format!("{zone}.logs.err")), + err.to_string(), + ) + .await?; + } + }; + + Ok(()) +} + +fn extract_zip_file( + output_dir: &Utf8Path, + zip_file: &Utf8Path, +) -> Result<(), anyhow::Error> { + let mut zip = std::fs::File::open(&zip_file) + .with_context(|| format!("failed to open zip file: {zip_file}"))?; + let mut archive = zip::ZipArchive::new(&mut zip)?; + archive.extract(&output_dir).with_context(|| { + format!("failed to extract log zip file to: {output_dir}") + })?; + Ok(()) +} diff --git a/nexus/src/app/background/tasks/support_bundle/steps/mod.rs b/nexus/src/app/background/tasks/support_bundle/steps/mod.rs new file mode 100644 index 00000000000..d2179c74b8c --- /dev/null +++ b/nexus/src/app/background/tasks/support_bundle/steps/mod.rs @@ -0,0 +1,89 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Individual support bundle collection steps + +use crate::app::background::tasks::support_bundle::cache::Cache; +use crate::app::background::tasks::support_bundle::step::CollectionStep; +use futures::FutureExt; +use nexus_types::internal_api::background::SupportBundleCollectionStep; + +mod bundle_id; +mod ereports; +mod host_info; +mod reconfigurator; +mod sled_cubby; +mod sp_dumps; + +/// Returns all steps necessary to collect a bundle. +/// +/// Note that these steps themselves may spawn additional steps while executing +/// (e.g., there is a step to read the set of sleds, from which additional +/// sled-specific steps may be created). 
+pub fn all(cache: &Cache) -> Vec { + vec![ + CollectionStep::new( + SupportBundleCollectionStep::STEP_BUNDLE_ID, + Box::new(|collection, dir| { + bundle_id::collect(collection, dir).boxed() + }), + ), + CollectionStep::new( + SupportBundleCollectionStep::STEP_RECONFIGURATOR_STATE, + Box::new(|collection, dir| { + reconfigurator::collect(collection, dir).boxed() + }), + ), + CollectionStep::new( + SupportBundleCollectionStep::STEP_EREPORTS, + Box::new(|collection, dir| { + ereports::collect(collection, dir).boxed() + }), + ), + CollectionStep::new( + SupportBundleCollectionStep::STEP_SLED_CUBBY_INFO, + Box::new({ + let cache = cache.clone(); + move |collection, dir| { + async move { + sled_cubby::collect( + collection, + &cache, + dir + ).await + } + .boxed() + } + }), + ), + CollectionStep::new( + SupportBundleCollectionStep::STEP_SPAWN_SP_DUMPS, + Box::new({ + let cache = cache.clone(); + move |collection, dir| { + async move { + sp_dumps::spawn_collection_steps( + collection, &cache, dir, + ) + .await + } + .boxed() + } + }), + ), + CollectionStep::new( + SupportBundleCollectionStep::STEP_SPAWN_SLEDS, + Box::new({ + let cache = cache.clone(); + move |collection, _| { + async move { + host_info::spawn_query_all_sleds(collection, &cache) + .await + } + .boxed() + } + }), + ), + ] +} diff --git a/nexus/src/app/background/tasks/support_bundle/steps/reconfigurator.rs b/nexus/src/app/background/tasks/support_bundle/steps/reconfigurator.rs new file mode 100644 index 00000000000..802b9e0953e --- /dev/null +++ b/nexus/src/app/background/tasks/support_bundle/steps/reconfigurator.rs @@ -0,0 +1,64 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! 
Collect reconfigurator state for support bundles + +use crate::app::background::tasks::support_bundle::collection::BundleCollection; +use crate::app::background::tasks::support_bundle::step::CollectionStepOutput; + +use anyhow::Context; +use camino::Utf8Path; +use nexus_reconfigurator_preparation::reconfigurator_state_load; + +pub async fn collect( + collection: &BundleCollection, + dir: &Utf8Path, +) -> anyhow::Result { + let (log, opctx, datastore, request) = ( + collection.log(), + collection.opctx(), + collection.datastore(), + collection.request(), + ); + + if !request.include_reconfigurator_data() { + return Ok(CollectionStepOutput::Skipped); + } + + // Collect reconfigurator state + const NMAX_BLUEPRINTS: usize = 300; + match reconfigurator_state_load(&opctx, &datastore, NMAX_BLUEPRINTS).await { + Ok(state) => { + let file_path = dir.join("reconfigurator_state.json"); + let file = std::fs::OpenOptions::new() + .create(true) + .write(true) + .truncate(true) + .open(&file_path) + .with_context(|| format!("failed to open {}", file_path))?; + serde_json::to_writer_pretty(&file, &state).with_context(|| { + format!( + "failed to serialize reconfigurator state to {}", + file_path + ) + })?; + info!( + log, + "Support bundle: collected reconfigurator state"; + "target_blueprint" => ?state.target_blueprint, + "num_blueprints" => state.blueprints.len(), + "num_collections" => state.collections.len(), + ); + } + Err(err) => { + warn!( + log, + "Support bundle: failed to collect reconfigurator state"; + "err" => ?err, + ); + } + }; + + Ok(CollectionStepOutput::None) +} diff --git a/nexus/src/app/background/tasks/support_bundle/steps/sled_cubby.rs b/nexus/src/app/background/tasks/support_bundle/steps/sled_cubby.rs new file mode 100644 index 00000000000..47755d247af --- /dev/null +++ b/nexus/src/app/background/tasks/support_bundle/steps/sled_cubby.rs @@ -0,0 +1,146 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. 
If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Collect sled cubby information for support bundles + +use crate::app::background::tasks::support_bundle::cache::Cache; +use crate::app::background::tasks::support_bundle::collection::BundleCollection; +use crate::app::background::tasks::support_bundle::step::CollectionStepOutput; + +use anyhow::Context; +use anyhow::bail; +use camino::Utf8Path; +use gateway_client::Client as MgsClient; +use gateway_client::types::SpIdentifier; +use gateway_client::types::SpIgnition; +use gateway_types::component::SpType; +use nexus_db_model::Sled; +use omicron_uuid_kinds::GenericUuid; +use serde::Serialize; +use slog::Logger; +use slog_error_chain::InlineErrorChain; +use std::collections::BTreeMap; +use uuid::Uuid; + +pub async fn collect( + collection: &BundleCollection, + cache: &Cache, + dir: &Utf8Path, +) -> anyhow::Result { + let (log, request) = (collection.log(), collection.request()); + + if !request.include_sled_cubby_info() { + return Ok(CollectionStepOutput::Skipped); + } + + let mgs_client_option = + cache.get_or_initialize_mgs_client(&collection).await; + let nexus_sleds = cache + .get_or_initialize_all_sleds(&collection) + .await + .map_or(&[][..], |v| v.as_slice()); + + let Some(mgs_client) = mgs_client_option else { + bail!("Could not initialize MGS client"); + }; + + write_sled_cubby_info(log, mgs_client, nexus_sleds, dir).await?; + + Ok(CollectionStepOutput::None) +} + +async fn write_sled_cubby_info( + log: &Logger, + mgs_client: &MgsClient, + nexus_sleds: &[Sled], + dir: &Utf8Path, +) -> anyhow::Result<()> { + #[derive(Serialize)] + struct SledInfo { + cubby: Option, + uuid: Option, + } + + let available_sps = get_available_sps(&mgs_client) + .await + .context("failed to get available SPs")?; + + // We can still get a useful mapping of cubby to serial using just the data from MGS. 
+ let mut nexus_map: BTreeMap<_, _> = nexus_sleds + .into_iter() + .map(|sled| (sled.serial_number(), sled)) + .collect(); + + let mut sled_info = BTreeMap::new(); + for sp in + available_sps.into_iter().filter(|sp| matches!(sp.type_, SpType::Sled)) + { + let sp_state = match mgs_client.sp_get(&sp.type_, sp.slot).await { + Ok(s) => s.into_inner(), + Err(e) => { + error!(log, + "Failed to get SP state for sled_info.json"; + "cubby" => sp.slot, + "component" => %sp.type_, + "error" => InlineErrorChain::new(&e) + ); + continue; + } + }; + + if let Some(sled) = nexus_map.remove(sp_state.serial_number.as_str()) { + sled_info.insert( + sp_state.serial_number.to_string(), + SledInfo { + cubby: Some(sp.slot), + uuid: Some(*sled.identity.id.as_untyped_uuid()), + }, + ); + } else { + sled_info.insert( + sp_state.serial_number.to_string(), + SledInfo { cubby: Some(sp.slot), uuid: None }, + ); + } + } + + // Sleds not returned by MGS. + for (serial, sled) in nexus_map { + sled_info.insert( + serial.to_string(), + SledInfo { + cubby: None, + uuid: Some(*sled.identity.id.as_untyped_uuid()), + }, + ); + } + + let json = serde_json::to_string_pretty(&sled_info) + .context("failed to serialize sled info to JSON")?; + tokio::fs::write(dir.join("sled_info.json"), json).await?; + + Ok(()) +} + +pub async fn get_available_sps( + mgs_client: &MgsClient, +) -> anyhow::Result> { + let ignition_info = mgs_client + .ignition_list() + .await + .context("failed to get ignition info from MGS")? + .into_inner(); + + let mut active_sps = Vec::new(); + for info in ignition_info { + if let SpIgnition::Yes { power, flt_sp, .. } = info.details { + // Only return SPs that are powered on and are not in a faulted state. 
+ if power && !flt_sp { + active_sps.push(info.id); + } + } + } + + Ok(active_sps) +} diff --git a/nexus/src/app/background/tasks/support_bundle/steps/sp_dumps.rs b/nexus/src/app/background/tasks/support_bundle/steps/sp_dumps.rs new file mode 100644 index 00000000000..2c745dd7649 --- /dev/null +++ b/nexus/src/app/background/tasks/support_bundle/steps/sp_dumps.rs @@ -0,0 +1,110 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Collect SP task dumps for support bundles + +use crate::app::background::tasks::support_bundle::cache::Cache; +use crate::app::background::tasks::support_bundle::collection::BundleCollection; +use crate::app::background::tasks::support_bundle::step::CollectionStep; +use crate::app::background::tasks::support_bundle::step::CollectionStepOutput; +use crate::app::background::tasks::support_bundle::steps; + +use anyhow::Context; +use anyhow::bail; +use base64::Engine; +use camino::Utf8Path; +use futures::FutureExt; +use gateway_client::Client as MgsClient; +use gateway_client::types::SpIdentifier; + +pub async fn spawn_collection_steps( + collection: &BundleCollection, + cache: &Cache, + dir: &Utf8Path, +) -> anyhow::Result { + let request = collection.request(); + + if !request.include_sp_dumps() { + return Ok(CollectionStepOutput::Skipped); + } + + let Some(mgs_client) = cache.get_or_initialize_mgs_client(collection).await + else { + bail!("Could not initialize MGS client"); + }; + + let sp_dumps_dir = dir.join("sp_task_dumps"); + tokio::fs::create_dir_all(&sp_dumps_dir).await.with_context(|| { + format!("Failed to create SP task dump directory {sp_dumps_dir}") + })?; + + let mut extra_steps: Vec = vec![]; + for sp in steps::sled_cubby::get_available_sps(&mgs_client).await? 
{ + extra_steps.push(CollectionStep::new( + format!("SP dump for {:?}", sp), + Box::new({ + let mgs_client = mgs_client.clone(); + move |collection, dir| { + async move { + collect_sp_dump(collection, &mgs_client, sp, dir).await + } + .boxed() + } + }), + )); + } + + Ok(CollectionStepOutput::Spawn { extra_steps }) +} + +async fn collect_sp_dump( + collection: &BundleCollection, + mgs_client: &MgsClient, + sp: SpIdentifier, + dir: &Utf8Path, +) -> anyhow::Result { + if !collection.request().include_sp_dumps() { + return Ok(CollectionStepOutput::Skipped); + } + + save_sp_dumps(mgs_client, sp, dir).await.with_context(|| { + format!("failed to save SP dump from: {} {}", sp.type_, sp.slot) + })?; + + Ok(CollectionStepOutput::None) +} + +async fn save_sp_dumps( + mgs_client: &MgsClient, + sp: SpIdentifier, + sp_dumps_dir: &Utf8Path, +) -> anyhow::Result<()> { + let dump_count = mgs_client + .sp_task_dump_count(&sp.type_, sp.slot) + .await + .context("failed to get task dump count from SP")? + .into_inner(); + + let output_dir = sp_dumps_dir.join(format!("{}_{}", sp.type_, sp.slot)); + tokio::fs::create_dir_all(&output_dir).await.with_context(|| { + format!("Failed to create output directory {output_dir}") + })?; + + for i in 0..dump_count { + let task_dump = mgs_client + .sp_task_dump_get(&sp.type_, sp.slot, i) + .await + .with_context(|| format!("failed to get task dump {i} from SP"))? 
+ .into_inner(); + + let zip_bytes = base64::engine::general_purpose::STANDARD + .decode(task_dump.base64_zip) + .context("failed to decode base64-encoded SP task dump zip")?; + + tokio::fs::write(output_dir.join(format!("dump-{i}.zip")), zip_bytes) + .await + .context("failed to write SP task dump zip to disk")?; + } + Ok(()) +} diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 0d4939f3acc..887be497a17 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -5,269 +5,37 @@ //! Background task for managing Support Bundles use crate::app::background::BackgroundTask; -use anyhow::Context; -use anyhow::bail; -use base64::Engine; -use camino::Utf8DirEntry; -use camino::Utf8Path; -use camino::Utf8PathBuf; -use camino_tempfile::Utf8TempDir; -use camino_tempfile::tempdir_in; -use camino_tempfile::tempfile_in; -use chrono::DateTime; -use chrono::Utc; use futures::FutureExt; -use futures::StreamExt; use futures::future::BoxFuture; -use futures::stream::FuturesUnordered; -use gateway_client::Client as MgsClient; -use gateway_client::types::SpIdentifier; -use gateway_client::types::SpIgnition; -use gateway_types::component::SpType; use internal_dns_resolver::Resolver; -use internal_dns_types::names::ServiceName; -use nexus_db_model::Sled; use nexus_db_model::SupportBundle; use nexus_db_model::SupportBundleState; use nexus_db_queries::authz; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; -use nexus_db_queries::db::datastore; -use nexus_db_queries::db::datastore::EreportFilters; -use nexus_db_queries::db::pagination::Paginator; -use nexus_reconfigurator_preparation::reconfigurator_state_load; -use nexus_types::deployment::SledFilter; -use nexus_types::fm::Ereport; -use nexus_types::identity::Asset; use nexus_types::internal_api::background::SupportBundleCleanupReport; use 
nexus_types::internal_api::background::SupportBundleCollectionReport; -use nexus_types::internal_api::background::SupportBundleCollectionStep; -use nexus_types::internal_api::background::SupportBundleCollectionStepStatus; -use nexus_types::internal_api::background::SupportBundleEreportStatus; use omicron_common::api::external::DataPageParams; use omicron_common::api::external::Error; use omicron_common::api::external::LookupType; use omicron_common::api::external::ResourceType; use omicron_uuid_kinds::DatasetUuid; -use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::OmicronZoneUuid; use omicron_uuid_kinds::SledUuid; use omicron_uuid_kinds::SupportBundleUuid; use omicron_uuid_kinds::ZpoolUuid; -use parallel_task_set::ParallelTaskSet; -use serde::Serialize; use serde_json::json; -use sha2::{Digest, Sha256}; use sled_agent_types::support_bundle::NESTED_DATASET_NOT_FOUND; use slog_error_chain::InlineErrorChain; -use std::collections::BTreeMap; -use std::collections::HashMap; -use std::collections::HashSet; -use std::future::Future; -use std::io::Write; -use std::num::NonZeroU64; use std::sync::Arc; -use tokio::io::AsyncReadExt; -use tokio::io::AsyncSeekExt; -use tokio::io::AsyncWriteExt; -use tokio::io::SeekFrom; -use tokio::sync::OnceCell; -use tufaceous_artifact::ArtifactHash; -use uuid::Uuid; -use zip::ZipArchive; -use zip::ZipWriter; -use zip::write::FullFileOptions; - -use super::support_bundle::perfetto; - -// We use "/var/tmp" to use Nexus' filesystem for temporary storage, -// rather than "/tmp", which would keep this collected data in-memory. -const TEMPDIR: &str = "/var/tmp"; - -// The size of piece of a support bundle to transfer to the sled agent -// within a single streaming request. 
-const CHUNK_SIZE: NonZeroU64 = NonZeroU64::new(1024 * 1024 * 1024).unwrap(); + +use super::support_bundle::collection::BundleCollection; +use super::support_bundle::request::BundleRequest; fn authz_support_bundle_from_id(id: SupportBundleUuid) -> authz::SupportBundle { authz::SupportBundle::new(authz::FLEET, id, LookupType::by_id(id)) } -// Describes the category of support bundle data. -#[derive(Debug, Clone, Copy, Hash, Eq, PartialEq)] -enum BundleDataCategory { - // Collects reconfigurator state (some of the latest blueprints, - // information about the target blueprint). - Reconfigurator, - // Collects info from sled agents, running a handful of - // diagnostic commands (e.g., zoneadm, dladm, etc). - HostInfo, - // Collects sled serial numbers, cubby numbers, and UUIDs. - SledCubbyInfo, - // Saves task dumps from SPs. - SpDumps, - // Collects ereports - Ereports, -} - -// Specifies what data to collect for a bundle data category. -// -// Each variant corresponds to a BundleDataCategory. -// For categories without additional parameters, the variant is a unit variant. -// For categories that can be filtered or configured, the variant contains -// that configuration data. -#[derive(Debug, Clone, Eq, PartialEq)] -enum BundleData { - Reconfigurator, - HostInfo(HashSet), - SledCubbyInfo, - SpDumps, - Ereports(EreportFilters), -} - -impl BundleData { - fn category(&self) -> BundleDataCategory { - match self { - Self::Reconfigurator => BundleDataCategory::Reconfigurator, - Self::HostInfo(_) => BundleDataCategory::HostInfo, - Self::SledCubbyInfo => BundleDataCategory::SledCubbyInfo, - Self::SpDumps => BundleDataCategory::SpDumps, - Self::Ereports(_) => BundleDataCategory::Ereports, - } - } -} - -// A collection of bundle data specifications. -// -// This wrapper ensures that categories and data always match - you can't -// insert (BundleDataCategory::Reconfigurator, BundleData::SpDumps) -// because each BundleData determines its own category. 
-#[derive(Debug, Clone)] -struct BundleDataSelection { - data: HashMap, -} - -impl BundleDataSelection { - fn new() -> Self { - Self { data: HashMap::new() } - } - - // Inserts BundleData to be queried for a particular category within the - // bundle. - // - // Each category of data can only be specified once (e.g., inserting - // BundleData::HostInfo multiple times will only use the most-recently - // inserted specification) - fn insert(&mut self, bundle_data: BundleData) { - self.data.insert(bundle_data.category(), bundle_data); - } - - fn contains(&self, category: BundleDataCategory) -> bool { - self.data.contains_key(&category) - } - - fn get(&self, category: BundleDataCategory) -> Option<&BundleData> { - self.data.get(&category) - } -} - -impl FromIterator for BundleDataSelection { - fn from_iter>(iter: T) -> Self { - let mut selection = Self::new(); - for bundle_data in iter { - selection.insert(bundle_data); - } - selection - } -} - -impl Default for BundleDataSelection { - fn default() -> Self { - [ - BundleData::Reconfigurator, - BundleData::HostInfo(HashSet::from([SledSelection::All])), - BundleData::SledCubbyInfo, - BundleData::SpDumps, - BundleData::Ereports(EreportFilters { - start_time: Some(chrono::Utc::now() - chrono::Days::new(7)), - ..EreportFilters::default() - }), - ] - .into_iter() - .collect() - } -} - -// The set of sleds to include -// -// Multiple values of this enum are joined together into a HashSet. -// Therefore "SledSelection::All" overrides specific sleds. -#[derive(Debug, Clone, Hash, Eq, PartialEq)] -enum SledSelection { - All, - Specific(SledUuid), -} - -// Specifies the data to be collected within the Support Bundle. -#[derive(Clone)] -struct BundleRequest { - // The size of chunks to use when transferring a bundle from Nexus - // to a sled agent. - // - // Typically, this is CHUNK_SIZE, but can be modified for testing. - transfer_chunk_size: NonZeroU64, - - // The set of data to be included within this bundle. 
- // - // Maps each category to its filter. If a category is not in the map, - // it is excluded from the bundle. - data_selection: BundleDataSelection, -} - -impl BundleRequest { - fn include_reconfigurator_data(&self) -> bool { - self.data_selection.contains(BundleDataCategory::Reconfigurator) - } - - fn include_host_info(&self) -> bool { - self.data_selection.contains(BundleDataCategory::HostInfo) - } - - fn include_sled_host_info(&self, id: SledUuid) -> bool { - let selection = - match self.data_selection.get(BundleDataCategory::HostInfo) { - Some(BundleData::HostInfo(selection)) => selection, - _ => return false, - }; - - selection.contains(&SledSelection::Specific(id)) - || selection.contains(&SledSelection::All) - } - - fn get_ereport_filters(&self) -> Option<&EreportFilters> { - match self.data_selection.get(BundleDataCategory::Ereports) { - Some(BundleData::Ereports(filters)) => Some(filters), - _ => None, - } - } - - fn include_sled_cubby_info(&self) -> bool { - self.data_selection.contains(BundleDataCategory::SledCubbyInfo) - } - - fn include_sp_dumps(&self) -> bool { - self.data_selection.contains(BundleDataCategory::SpDumps) - } -} - -impl Default for BundleRequest { - fn default() -> Self { - Self { - transfer_chunk_size: CHUNK_SIZE, - data_selection: BundleDataSelection::default(), - } - } -} - // Result of asking a sled agent to clean up a bundle enum SledAgentBundleCleanupResult { Deleted, @@ -581,15 +349,15 @@ impl SupportBundleCollector { } }; - let collection = Arc::new(BundleCollection { - datastore: self.datastore.clone(), - resolver: self.resolver.clone(), - log: opctx.log.new(slog::o!("bundle" => bundle.id.to_string())), - opctx: opctx.child(std::collections::BTreeMap::new()), - request: request.clone(), - bundle: bundle.clone(), - transfer_chunk_size: request.transfer_chunk_size, - }); + let collection = Arc::new(BundleCollection::new( + self.datastore.clone(), + self.resolver.clone(), + opctx.log.new(slog::o!("bundle" => 
bundle.id.to_string())), + opctx.child(std::collections::BTreeMap::new()), + request.clone(), + bundle.clone(), + request.transfer_chunk_size, + )); let authz_bundle = authz_support_bundle_from_id(bundle.id.into()); let mut report = collection.collect_bundle_and_store_on_sled().await?; @@ -625,1073 +393,6 @@ impl SupportBundleCollector { } } -// Wraps up all arguments to perform a single support bundle collection -struct BundleCollection { - datastore: Arc, - resolver: Resolver, - log: slog::Logger, - opctx: OpContext, - request: BundleRequest, - bundle: SupportBundle, - transfer_chunk_size: NonZeroU64, -} - -// This type describes a single step in the Support Bundle collection. -// -// - All steps have access to the "BundleCollection", which includes -// tools for actually acquiring data. -// - All steps have access to an output directory where they can store -// serialized data to a file. -// - Finally, all steps can emit a "CollectionStepOutput", which can either -// update the collection report, or generate more steps. 
-type CollectionStepFn = Box< - dyn for<'b> FnOnce( - &'b Arc, - &'b Utf8Path, - ) - -> BoxFuture<'b, anyhow::Result> - + Send, ->; - -struct CollectionStep { - name: String, - step_fn: CollectionStepFn, -} - -impl CollectionStep { - fn new(name: impl Into, step_fn: CollectionStepFn) -> Self { - Self { name: name.into(), step_fn } - } - - async fn run( - self, - collection: &Arc, - output: &Utf8Path, - ) -> CompletedCollectionStep { - let start = Utc::now(); - - let output = (self.step_fn)(collection, output) - .await - .inspect_err(|err| { - warn!( - collection.log, - "Step failed"; - "step" => &self.name, - InlineErrorChain::new(err.as_ref()), - ); - }) - .unwrap_or_else(|err| CollectionStepOutput::Failed(err)); - - let end = Utc::now(); - - CompletedCollectionStep { name: self.name, start, end, output } - } -} - -struct CompletedCollectionStep { - name: String, - start: DateTime, - end: DateTime, - output: CollectionStepOutput, -} - -impl CompletedCollectionStep { - // Updates the collection report based on the output of a collection step, - // and possibly extends the set of all steps to be executed. - fn process( - self, - report: &mut SupportBundleCollectionReport, - steps: &mut Vec, - ) { - use SupportBundleCollectionStepStatus as Status; - - let status = match self.output { - CollectionStepOutput::Skipped => Status::Skipped, - CollectionStepOutput::Failed(err) => { - Status::Failed(err.to_string()) - } - CollectionStepOutput::Ereports(status) => { - report.ereports = Some(status); - Status::Ok - } - CollectionStepOutput::Spawn { extra_steps } => { - steps.extend(extra_steps); - Status::Ok - } - CollectionStepOutput::None => Status::Ok, - }; - - // Add information about this completed step the bundle report. 
- let step = SupportBundleCollectionStep { - name: self.name, - start: self.start, - end: self.end, - status, - }; - report.steps.push(step); - } -} - -enum CollectionStepOutput { - // The step was not executed intentionally - Skipped, - // The step encountered a fatal error and could not complete. - // - // It may have still saved a partial set of data to the bundle. - Failed(anyhow::Error), - Ereports(SupportBundleEreportStatus), - // The step spawned additional steps to execute - Spawn { extra_steps: Vec }, - // The step completed with nothing to report, and no follow-up steps - None, -} - -impl BundleCollection { - // Collect the bundle within Nexus, and store it on a target sled. - async fn collect_bundle_and_store_on_sled( - self: &Arc, - ) -> anyhow::Result { - // Create a temporary directory where we'll store the support bundle - // as it's being collected. - let dir = tempdir_in(TEMPDIR)?; - - let report = self.collect_bundle_locally(&dir).await?; - self.store_bundle_on_sled(dir).await?; - Ok(report) - } - - // Create the support bundle, placing the contents into a user-specified - // directory. - // - // Does not attempt to convert the contents into a zipfile, nor send them - // to any durable storage. - async fn collect_bundle_locally( - self: &Arc, - dir: &Utf8TempDir, - ) -> anyhow::Result { - // TL;DR: This `tokio::select` is allowed to poll multiple futures, but - // should not do any async work within the body of any chosen branch. A - // previous iteration of this code polled the "collection" as "&mut - // collection", and checked the status of the support bundle within a - // branch of the "select" polling "yield_interval.tick()". - // - // We organize this work to "check for cancellation" as a whole future - // for a critical, but subtle reason: After the tick timer yields, - // we may then try to `await` a database function. - // - // This, at a surface-level glance seems innocent enough. 
However, there - // is something potentially insidious here: if calling a datastore - // function - such as "support_bundle_get" - awaits acquiring access - // to a connection from the connection pool, while creating the - // collection ALSO potentially awaits acquiring access to the - // connection pool, it is possible for: - // - // 1. The `&mut collection` arm to have created a future, currently - // yielded, which wants access to this underlying resource. - // 2. The current operation executing in `support_bundle_get` to - // be awaiting access to this same underlying resource. - // - // In this specific case, the connection pool would be attempting to - // yield to the `&mut collection` arm, which cannot run, if we were - // awaiting in the body of a different async select arm. This would - // result in a deadlock. - // - // In the future, we may attempt to make access to the connection pool - // safer from concurrent asynchronous access - it is unsettling that - // multiple concurrent `.claim()` functions can cause this behavior - - // but in the meantime, we perform this cancellation check in a single - // future that always is polled concurrently with the collection work. - // Because of this separation, each future is polled until one - // completes, at which point we deterministically exit. - // - // For more details, see: - // https://github.com/oxidecomputer/omicron/issues/9259 - - tokio::select! { - // Returns if the bundle should no longer be collected. - why = self.check_for_cancellation() => { - warn!( - &self.log, - "Support Bundle cancelled - stopping collection"; - "bundle" => %self.bundle.id, - "state" => ?self.bundle.state - ); - return Err(why); - }, - // Otherwise, keep making progress on the collection itself. 
- report = self.collect_bundle_as_file(&dir) => { - info!( - &self.log, - "Bundle Collection completed"; - "bundle" => %self.bundle.id - ); - return report; - }, - } - } - - async fn store_bundle_on_sled( - &self, - dir: Utf8TempDir, - ) -> anyhow::Result<()> { - // Create the zipfile as a temporary file - let mut zipfile = tokio::fs::File::from_std(bundle_to_zipfile(&dir)?); - let total_len = zipfile.metadata().await?.len(); - - // Collect the hash locally before we send it over the network - // - // We'll use this later during finalization to confirm the bundle - // has been stored successfully. - zipfile.seek(SeekFrom::Start(0)).await?; - let hash = sha2_hash(&mut zipfile).await?; - - // Find the sled where we're storing this bundle. - let sled_id = self - .datastore - .zpool_get_sled_if_in_service( - &self.opctx, - self.bundle.zpool_id.into(), - ) - .await?; - let sled_client = nexus_networking::sled_client( - &self.datastore, - &self.opctx, - sled_id, - &self.log, - ) - .await?; - - let zpool = ZpoolUuid::from(self.bundle.zpool_id); - let dataset = DatasetUuid::from(self.bundle.dataset_id); - let support_bundle = SupportBundleUuid::from(self.bundle.id); - - // Tell this sled to create the bundle. - let creation_result = sled_client - .support_bundle_start_creation(&zpool, &dataset, &support_bundle) - .await - .with_context(|| "Support bundle failed to start creation")?; - - if matches!( - creation_result.state, - sled_agent_client::types::SupportBundleState::Complete - ) { - // Early exit case: the bundle was already created -- we must have either - // crashed or failed between "finalizing" and "writing to the database that we - // finished". 
- info!(&self.log, "Support bundle was already collected"; "bundle" => %self.bundle.id); - return Ok(()); - } - info!(&self.log, "Support bundle creation started"; "bundle" => %self.bundle.id); - - let mut offset = 0; - while offset < total_len { - // Stream the zipfile to the sled where it should be kept - let mut file = zipfile - .try_clone() - .await - .with_context(|| "Failed to clone zipfile")?; - file.seek(SeekFrom::Start(offset)).await.with_context(|| { - format!("Failed to seek to offset {offset} / {total_len} within zipfile") - })?; - - // Only stream at most "transfer_chunk_size" bytes at once - let chunk_size = std::cmp::min( - self.transfer_chunk_size.get(), - total_len - offset, - ); - - let limited_file = file.take(chunk_size); - let stream = tokio_util::io::ReaderStream::new(limited_file); - let body = reqwest::Body::wrap_stream(stream); - - info!( - &self.log, - "Streaming bundle chunk"; - "bundle" => %self.bundle.id, - "offset" => offset, - "length" => chunk_size, - ); - - sled_client.support_bundle_transfer( - &zpool, &dataset, &support_bundle, offset, body - ).await.with_context(|| { - format!("Failed to transfer bundle: {chunk_size}@{offset} of {total_len} to sled") - })?; - - offset += chunk_size; - } - - sled_client - .support_bundle_finalize( - &zpool, - &dataset, - &support_bundle, - &hash.to_string(), - ) - .await - .with_context(|| "Failed to finalize bundle")?; - - // Returning from this method should drop all temporary storage - // allocated locally for this support bundle. - Ok(()) - } - - // Indefinitely perform periodic checks about whether or not we should - // cancel the bundle. - // - // Returns an error if: - // - The bundle state is no longer SupportBundleState::Collecting - // (which happens if the bundle has been explicitly cancelled, or - // if the backing storage has been expunged). - // - The bundle has been deleted - // - // Otherwise, keeps checking indefinitely while polled. 
- async fn check_for_cancellation(&self) -> anyhow::Error { - let work_duration = tokio::time::Duration::from_secs(5); - let mut yield_interval = tokio::time::interval_at( - tokio::time::Instant::now() + work_duration, - work_duration, - ); - - loop { - // Timer fired mid-collection - check if we should stop. - yield_interval.tick().await; - trace!( - self.log, - "Checking if Bundle Collection cancelled"; - "bundle" => %self.bundle.id - ); - - match self - .datastore - .support_bundle_get(&self.opctx, self.bundle.id.into()) - .await - { - Ok(SupportBundle { - state: SupportBundleState::Collecting, - .. - }) => { - // Bundle still collecting; continue... - continue; - } - Ok(_) => { - // Not collecting, for any reason: Time to exit - return anyhow::anyhow!("Support Bundle Cancelled"); - } - Err(Error::ObjectNotFound { .. } | Error::NotFound { .. }) => { - return anyhow::anyhow!("Support Bundle Deleted"); - } - Err(err) => { - warn!( - self.log, - "Database error checking bundle cancellation"; - InlineErrorChain::new(&err) - ); - - // If we cannot contact the database, retry later - continue; - } - } - } - } - - async fn run_collect_bundle_steps( - self: &Arc, - output: &Utf8TempDir, - mut steps: Vec, - ) -> SupportBundleCollectionReport { - let mut report = - SupportBundleCollectionReport::new(self.bundle.id.into()); - - const MAX_CONCURRENT_STEPS: usize = 16; - let mut tasks = - ParallelTaskSet::new_with_parallelism(MAX_CONCURRENT_STEPS); - - loop { - // Process all the currently-planned steps - while let Some(step) = steps.pop() { - let previous_result = tasks.spawn({ - let collection = self.clone(); - let dir = output.path().to_path_buf(); - async move { - debug!(collection.log, "Running step"; "step" => &step.name); - step.run(&collection, dir.as_path()).await - } - }).await; - - if let Some(output) = previous_result { - output.process(&mut report, &mut steps); - }; - } - - // If we've run out of tasks to spawn, join any of the previously - // spawned tasks, if 
any exist. - if let Some(output) = tasks.join_next().await { - output.process(&mut report, &mut steps); - - // As soon as any task completes, see if we can spawn more work - // immediately. This ensures that the ParallelTaskSet is - // saturated as much as it can be. - continue; - } - - // Executing steps may create additional steps, as follow-up work. - // - // Only finish if we've exhausted all possible steps and joined all spawned work. - if steps.is_empty() { - // Write trace file before returning - if let Err(err) = self.write_trace_file(output, &report).await { - warn!( - self.log, - "Failed to write trace file"; - "error" => ?err - ); - } - return report; - } - } - } - - // Write a Perfetto Event format JSON file for visualization - async fn write_trace_file( - &self, - output: &Utf8TempDir, - report: &SupportBundleCollectionReport, - ) -> anyhow::Result<()> { - let meta_dir = output.path().join("meta"); - tokio::fs::create_dir_all(&meta_dir).await.with_context(|| { - format!("Failed to create meta directory {meta_dir}") - })?; - - let trace_path = meta_dir.join("trace.json"); - - // Convert steps to Perfetto Trace Event format. - // Sort steps by start time and assign each a unique sequential ID. - // - // This is necessary because the trace event format does not like - // multiple slices to overlap - so we make each slice distinct. - // - // Ideally we'd be able to correlate these with actual tokio tasks, - // but it's hard to convert tokio::task::Id to a u64 because - // of https://github.com/tokio-rs/tokio/issues/7430 - let mut sorted_steps: Vec<_> = report.steps.iter().collect(); - sorted_steps.sort_by_key(|s| s.start); - - // Generate trace events - each step gets a unique ID (1, 2, 3, ...) 
- // based on its start time order - let trace_events: Vec<_> = sorted_steps - .iter() - .enumerate() - .map(|(i, step)| { - let start_us = step.start.timestamp_micros(); - let duration_us = (step.end - step.start) - .num_microseconds() - .unwrap_or(0) - .max(0); - let step_id = i + 1; - - perfetto::TraceEvent { - name: step.name.clone(), - cat: "bundle_collection".to_string(), - ph: "X".to_string(), - ts: start_us, - dur: duration_us, - pid: 1, - tid: step_id, - args: json!({ - "status": step.status.to_string(), - }), - } - }) - .collect(); - - let trace = perfetto::Trace { - trace_events, - display_time_unit: "ms".to_string(), - }; - - let trace_content = serde_json::to_string_pretty(&trace) - .context("Failed to serialize trace JSON")?; - - tokio::fs::write(&trace_path, trace_content).await.with_context( - || format!("Failed to write trace file to {trace_path}"), - )?; - - info!( - self.log, - "Wrote trace file"; - "path" => %trace_path, - "num_events" => trace.trace_events.len() - ); - - Ok(()) - } - - async fn collect_bundle_id( - &self, - dir: &Utf8Path, - ) -> anyhow::Result { - tokio::fs::write(dir.join("bundle_id.txt"), self.bundle.id.to_string()) - .await?; - - Ok(CollectionStepOutput::None) - } - - async fn collect_reconfigurator_state( - &self, - dir: &Utf8Path, - ) -> anyhow::Result { - if !self.request.include_reconfigurator_data() { - return Ok(CollectionStepOutput::Skipped); - } - - // Collect reconfigurator state - const NMAX_BLUEPRINTS: usize = 300; - match reconfigurator_state_load( - &self.opctx, - &self.datastore, - NMAX_BLUEPRINTS, - ) - .await - { - Ok(state) => { - let file_path = dir.join("reconfigurator_state.json"); - let file = std::fs::OpenOptions::new() - .create(true) - .write(true) - .truncate(true) - .open(&file_path) - .with_context(|| format!("failed to open {}", file_path))?; - serde_json::to_writer_pretty(&file, &state).with_context( - || { - format!( - "failed to serialize reconfigurator state to {}", - file_path - ) - }, - )?; 
- info!( - self.log, - "Support bundle: collected reconfigurator state"; - "target_blueprint" => ?state.target_blueprint, - "num_blueprints" => state.blueprints.len(), - "num_collections" => state.collections.len(), - ); - } - Err(err) => { - warn!( - self.log, - "Support bundle: failed to collect reconfigurator state"; - "err" => ?err, - ); - } - }; - - Ok(CollectionStepOutput::None) - } - - async fn get_or_initialize_mgs_client<'a>( - &self, - mgs_client: &'a OnceCell>>, - ) -> &'a Arc> { - mgs_client - .get_or_init(|| async { - Arc::new(self.create_mgs_client().await.ok()) - }) - .await - } - - async fn get_or_initialize_all_sleds<'a>( - &self, - all_sleds: &'a OnceCell>>>, - ) -> &'a Arc>> { - all_sleds - .get_or_init(|| async { - Arc::new( - self.datastore - .sled_list_all_batched( - &self.opctx, - SledFilter::InService, - ) - .await - .ok(), - ) - }) - .await - } - - async fn collect_sled_cubby_info( - &self, - all_sleds: &OnceCell>>>, - mgs_client: &OnceCell>>, - dir: &Utf8Path, - ) -> anyhow::Result { - if !self.request.include_sled_cubby_info() { - return Ok(CollectionStepOutput::Skipped); - } - - let Some(mgs_client) = - &**self.get_or_initialize_mgs_client(mgs_client).await - else { - bail!("Could not initialize MGS client"); - }; - let nexus_sleds = self - .get_or_initialize_all_sleds(all_sleds) - .await - .as_deref() - .unwrap_or_default(); - - write_sled_cubby_info(&self.log, mgs_client, nexus_sleds, dir).await?; - - Ok(CollectionStepOutput::None) - } - - async fn spawn_sp_dump_collection( - &self, - mgs_client: &OnceCell>>, - dir: &Utf8Path, - ) -> anyhow::Result { - if !self.request.include_sp_dumps() { - return Ok(CollectionStepOutput::Skipped); - } - - let Some(mgs_client) = - &**self.get_or_initialize_mgs_client(mgs_client).await - else { - bail!("Could not initialize MGS client"); - }; - - let sp_dumps_dir = dir.join("sp_task_dumps"); - tokio::fs::create_dir_all(&sp_dumps_dir).await.with_context(|| { - format!("Failed to create SP task dump 
directory {sp_dumps_dir}") - })?; - - let mut extra_steps: Vec = vec![]; - for sp in get_available_sps(&mgs_client).await? { - extra_steps.push(CollectionStep::new( - format!("SP dump for {:?}", sp), - Box::new({ - let mgs_client = mgs_client.clone(); - move |collection, dir| { - async move { - collection - .collect_sp_dump(&mgs_client, sp, dir) - .await - } - .boxed() - } - }), - )); - } - - Ok(CollectionStepOutput::Spawn { extra_steps }) - } - - async fn collect_sp_dump( - &self, - mgs_client: &MgsClient, - sp: SpIdentifier, - dir: &Utf8Path, - ) -> anyhow::Result { - if !self.request.include_sp_dumps() { - return Ok(CollectionStepOutput::Skipped); - } - - save_sp_dumps(mgs_client, sp, dir).await.with_context(|| { - format!("failed to save SP dump from: {} {}", sp.type_, sp.slot) - })?; - - Ok(CollectionStepOutput::None) - } - - // Perform the work of collecting the support bundle into a temporary directory - // - // "dir" is an output directory where data can be stored. - // - // If a partial bundle can be collected, it should be returned as - // an Ok(SupportBundleCollectionReport). Any failures from this function - // will prevent the support bundle from being collected altogether. - // - // NOTE: The background task infrastructure will periodically check to see - // if the bundle has been cancelled by a user while it is being collected. - // If that happens, this function will be CANCELLED at an await point. - // - // As a result, it is important that this function be implemented as - // cancel-safe. - // - // The "steps" used within this function - passed to - // [`Self::run_collect_bundle_steps`] - are run on a [`ParallelTaskSet`], - // which automatically aborts tasks when it is dropped. 
- async fn collect_bundle_as_file( - self: &Arc, - dir: &Utf8TempDir, - ) -> anyhow::Result { - let log = &self.log; - - info!(&log, "Collecting bundle as local file"); - - // Shared, lazy, fallible initialization for sleds - let all_sleds: OnceCell>>> = OnceCell::new(); - // Shared, lazy, fallible initialization for MGS client - let mgs_client: OnceCell>> = OnceCell::new(); - - let steps: Vec = vec![ - CollectionStep::new( - SupportBundleCollectionStep::STEP_BUNDLE_ID, - Box::new(|collection, dir| { - collection.collect_bundle_id(dir).boxed() - }), - ), - CollectionStep::new( - SupportBundleCollectionStep::STEP_RECONFIGURATOR_STATE, - Box::new(|collection, dir| { - collection.collect_reconfigurator_state(dir).boxed() - }), - ), - CollectionStep::new( - SupportBundleCollectionStep::STEP_EREPORTS, - Box::new(|collection, dir| { - collection.collect_ereports(dir).boxed() - }), - ), - CollectionStep::new( - SupportBundleCollectionStep::STEP_SLED_CUBBY_INFO, - Box::new({ - let all_sleds = all_sleds.clone(); - let mgs_client = mgs_client.clone(); - move |collection, dir| { - async move { - collection - .collect_sled_cubby_info( - &all_sleds, - &mgs_client, - dir, - ) - .await - } - .boxed() - } - }), - ), - CollectionStep::new( - SupportBundleCollectionStep::STEP_SPAWN_SP_DUMPS, - Box::new({ - let mgs_client = mgs_client.clone(); - move |collection, dir| { - async move { - collection - .spawn_sp_dump_collection(&mgs_client, dir) - .await - } - .boxed() - } - }), - ), - CollectionStep::new( - SupportBundleCollectionStep::STEP_SPAWN_SLEDS, - Box::new({ - let all_sleds = all_sleds.clone(); - move |collection, _| { - async move { - collection.spawn_query_all_sleds(&all_sleds).await - } - .boxed() - } - }), - ), - ]; - - Ok(self.run_collect_bundle_steps(dir, steps).await) - } - - async fn spawn_query_all_sleds( - &self, - all_sleds: &OnceCell>>>, - ) -> anyhow::Result { - if !self.request.include_host_info() { - return Ok(CollectionStepOutput::Skipped); - } - - let 
Some(all_sleds) = - self.get_or_initialize_all_sleds(all_sleds).await.as_deref() - else { - bail!("Could not read list of sleds"); - }; - - let mut extra_steps: Vec = vec![]; - for sled in all_sleds { - if !self.request.include_sled_host_info(sled.id()) { - continue; - } - - extra_steps.push(CollectionStep::new( - format!("sled data for sled {}", sled.id()), - Box::new({ - let sled = sled.clone(); - move |collection, dir| { - async move { - collection.collect_data_from_sled(&sled, dir).await - } - .boxed() - } - }), - )); - } - - return Ok(CollectionStepOutput::Spawn { extra_steps }); - } - - // Collect data from a sled, storing it into a directory that will - // be turned into a support bundle. - // - // - "sled" is the sled from which we should collect data. - // - "dir" is a directory where data can be stored, to be turned - // into a bundle after collection completes. - async fn collect_data_from_sled( - &self, - sled: &nexus_db_model::Sled, - dir: &Utf8Path, - ) -> anyhow::Result { - if !self.request.include_sled_host_info(sled.id()) { - return Ok(CollectionStepOutput::Skipped); - } - - let log = &self.log; - info!(&log, "Collecting bundle info from sled"; "sled" => %sled.id()); - let sled_path = dir - .join("rack") - .join(sled.rack_id.to_string()) - .join("sled") - .join(sled.id().to_string()); - tokio::fs::create_dir_all(&sled_path).await?; - tokio::fs::write(sled_path.join("sled.txt"), format!("{sled:?}")) - .await?; - - let sled_client = match nexus_networking::sled_client( - &self.datastore, - &self.opctx, - sled.id(), - log, - ) - .await - { - Ok(client) => client, - Err(err) => { - tokio::fs::write( - sled_path.join("error.txt"), - "Could not contact sled", - ) - .await.with_context(|| { - format!("Failed to save 'error.txt' to bundle when recording error: {err}") - })?; - bail!("Could not contact sled: {err}"); - } - }; - - // NB: As new sled-diagnostic commands are added they should - // be added to this array so that their output can be saved - // 
within the support bundle. - let mut diag_cmds = futures::stream::iter([ - save_diag_cmd_output_or_error( - &sled_path, - "zoneadm", - sled_client.support_zoneadm_info(), - ) - .boxed(), - save_diag_cmd_output_or_error( - &sled_path, - "dladm", - sled_client.support_dladm_info(), - ) - .boxed(), - save_diag_cmd_output_or_error( - &sled_path, - "ipadm", - sled_client.support_ipadm_info(), - ) - .boxed(), - save_diag_cmd_output_or_error( - &sled_path, - "nvmeadm", - sled_client.support_nvmeadm_info(), - ) - .boxed(), - save_diag_cmd_output_or_error( - &sled_path, - "pargs", - sled_client.support_pargs_info(), - ) - .boxed(), - save_diag_cmd_output_or_error( - &sled_path, - "pfiles", - sled_client.support_pfiles_info(), - ) - .boxed(), - save_diag_cmd_output_or_error( - &sled_path, - "pstack", - sled_client.support_pstack_info(), - ) - .boxed(), - save_diag_cmd_output_or_error( - &sled_path, - "zfs", - sled_client.support_zfs_info(), - ) - .boxed(), - save_diag_cmd_output_or_error( - &sled_path, - "zpool", - sled_client.support_zpool_info(), - ) - .boxed(), - save_diag_cmd_output_or_error( - &sled_path, - "health-check", - sled_client.support_health_check(), - ) - .boxed(), - ]) - // Currently we execute up to 10 commands concurrently which - // might be doing their own concurrent work, for example - // collectiong `pstack` output of every Oxide process that is - // found on a sled. - .buffer_unordered(10); - - while let Some(result) = diag_cmds.next().await { - // Log that we failed to write the diag command output to a - // file but don't return early as we wish to get as much - // information as we can. - if let Err(e) = result { - error!( - &self.log, - "failed to write diagnostic command output to \ - file: {e}" - ); - } - } - - // For each zone we concurrently fire off a request to its - // sled-agent to collect its logs in a zip file and write the - // result to the support bundle. 
- let zones = sled_client.support_logs().await?.into_inner(); - let mut log_futs: FuturesUnordered<_> = zones - .iter() - .map(|zone| { - save_zone_log_zip_or_error(log, &sled_client, zone, &sled_path) - }) - .collect(); - - while let Some(log_collection_result) = log_futs.next().await { - // We log any errors saving the zip file to disk and - // continue on. - if let Err(e) = log_collection_result { - error!(&self.log, "failed to write logs output: {e}"); - } - } - return Ok(CollectionStepOutput::None); - } - - async fn collect_ereports( - self: &Arc, - dir: &Utf8Path, - ) -> anyhow::Result { - let Some(ereport_filters) = self.request.get_ereport_filters() else { - debug!(self.log, "Support bundle: ereports not requested"); - return Ok(CollectionStepOutput::Skipped); - }; - let ereports_dir = dir.join("ereports"); - let mut status = SupportBundleEreportStatus::default(); - if let Err(err) = self - .save_ereports(ereport_filters.clone(), ereports_dir, &mut status) - .await - { - warn!( - &self.log, - "Support bundle: ereport collection failed \ - ({} collected successfully)", - status.n_collected; - InlineErrorChain::new(err.as_ref()) - ); - status.errors.push(InlineErrorChain::new(err.as_ref()).to_string()); - }; - - Ok(CollectionStepOutput::Ereports(status)) - } - - async fn save_ereports( - self: &Arc, - filters: EreportFilters, - dir: Utf8PathBuf, - status: &mut SupportBundleEreportStatus, - ) -> anyhow::Result<()> { - let mut paginator = Paginator::new( - datastore::SQL_BATCH_SIZE, - dropshot::PaginationOrder::Ascending, - ); - while let Some(p) = paginator.next() { - let ereports = self - .datastore - .ereport_fetch_matching( - &self.opctx, - &filters, - &p.current_pagparams(), - ) - .await - .map_err(|e| { - e.internal_context("failed to query for ereports") - })?; - paginator = p.found_batch(&ereports, &|ereport| { - (ereport.restart_id.into_untyped_uuid(), ereport.ena) - }); - - let prev_n_collected = status.n_collected; - let n_ereports = ereports.len(); 
- status.n_found += n_ereports; - - for ereport in ereports { - match ereport.try_into() { - Ok(ereport) => { - write_ereport(ereport, &dir).await?; - status.n_collected += 1; - } - Err(err) => { - warn!(&self.log, "invalid ereport"; "error" => %err); - status.errors.push(err.to_string()); - } - } - } - debug!( - self.log, - "Support bundle: added {} ereports ({} found)", - status.n_collected - prev_n_collected, - n_ereports - ); - } - - info!( - self.log, - "Support bundle: collected {} total ereports", status.n_collected - ); - Ok(()) - } - - async fn create_mgs_client(&self) -> anyhow::Result { - self - .resolver - .lookup_socket_v6(ServiceName::ManagementGatewayService) - .await - .map(|sockaddr| { - let url = format!("http://{}", sockaddr); - gateway_client::Client::new(&url, self.log.clone()) - }).map_err(|e| { - error!(self.log, "failed to resolve MGS address"; "error" => InlineErrorChain::new(&e)); - e.into() - }) - } -} - impl BackgroundTask for SupportBundleCollector { fn activate<'a>( &'a mut self, @@ -1735,438 +436,13 @@ impl BackgroundTask for SupportBundleCollector { } } -async fn write_ereport(ereport: Ereport, dir: &Utf8Path) -> anyhow::Result<()> { - // Here's where we construct the file path for each ereport JSON file, - // given the top-level ereport directory path. Each ereport is stored in a - // subdirectory for the part and serial numbers of the system that produced - // the ereport. Part numbers must be included in addition to serial - // numbers, as the v1 serial scheme only guarantees uniqueness within a - // part number. These paths take the following form: - // - // {part-number}-{serial_number}/{restart_id}/{ENA}.json - // - // We can assume that the restart ID and ENA consist only of - // filesystem-safe characters, as the restart ID is known to be a UUID, and - // the ENA is just an integer. 
For the serial and part numbers, which - // Nexus doesn't have full control over --- it came from the ereport - // metadata --- we must check that it doesn't contain any characters - // unsuitable for use in a filesystem path. - let pn = ereport - .data - .part_number - .as_deref() - // If the part or serial numbers contain any unsavoury characters, it - // goes in the `unknown_serial` hole! Note that the alleged serial - // number from the ereport will still be present in the JSON as a - // string, so we're not *lying* about what was received; we're just - // giving up on using it in the path. - .filter(|&s| is_fs_safe_single_path_component(s)) - .unwrap_or("unknown_part"); - let sn = ereport - .data - .serial_number - .as_deref() - .filter(|&s| is_fs_safe_single_path_component(s)) - .unwrap_or("unknown_serial"); - let id = &ereport.data.id; - - let dir = dir - .join(format!("{pn}-{sn}")) - // N.B. that we call `into_untyped_uuid()` here, as the `Display` - // implementation for a typed UUID appends " (ereporter_restart)", which - // we don't want. - .join(id.restart_id.into_untyped_uuid().to_string()); - tokio::fs::create_dir_all(&dir) - .await - .with_context(|| format!("failed to create directory '{dir}'"))?; - let file_path = dir.join(format!("{}.json", id.ena)); - let json = serde_json::to_vec(&ereport).with_context(|| { - format!("failed to serialize ereport {pn}:{sn}/{id}") - })?; - tokio::fs::write(&file_path, json) - .await - .with_context(|| format!("failed to write '{file_path}'")) -} - -// Takes a directory "dir", and zips the contents into a single zipfile. -fn bundle_to_zipfile(dir: &Utf8TempDir) -> anyhow::Result { - let tempfile = tempfile_in(TEMPDIR)?; - let mut zip = ZipWriter::new(tempfile); - - recursively_add_directory_to_zipfile(&mut zip, dir.path(), dir.path())?; - - Ok(zip.finish()?) 
-} - -fn recursively_add_directory_to_zipfile( - zip: &mut ZipWriter, - root_path: &Utf8Path, - dir_path: &Utf8Path, -) -> anyhow::Result<()> { - // Readdir might return entries in a non-deterministic order. - // Let's sort it for the zipfile, to be nice. - let mut entries = dir_path - .read_dir_utf8()? - .filter_map(Result::ok) - .collect::>(); - entries.sort_by(|a, b| a.file_name().cmp(&b.file_name())); - - for entry in &entries { - // Remove the "/tmp/..." prefix from the path when we're storing it in the - // zipfile. - let dst = entry.path().strip_prefix(root_path)?; - - let file_type = entry.file_type()?; - if file_type.is_file() { - let src = entry.path(); - - let zip_time = entry - .path() - .metadata() - .and_then(|m| m.modified()) - .ok() - .and_then(|sys_time| jiff::Zoned::try_from(sys_time).ok()) - .and_then(|zoned| { - zip::DateTime::try_from(zoned.datetime()).ok() - }) - .unwrap_or_else(zip::DateTime::default); - - let opts = FullFileOptions::default() - .last_modified_time(zip_time) - .compression_method(zip::CompressionMethod::Deflated) - .large_file(true); - - zip.start_file_from_path(dst, opts)?; - let mut file = std::fs::File::open(&src)?; - std::io::copy(&mut file, zip)?; - } - if file_type.is_dir() { - let opts = FullFileOptions::default(); - zip.add_directory_from_path(dst, opts)?; - recursively_add_directory_to_zipfile(zip, root_path, entry.path())?; - } - } - Ok(()) -} - -async fn sha2_hash(file: &mut tokio::fs::File) -> anyhow::Result { - let mut buf = vec![0u8; 65536]; - let mut ctx = Sha256::new(); - loop { - let n = file.read(&mut buf).await?; - if n == 0 { - break; - } - ctx.write_all(&buf[0..n])?; - } - - let digest = ctx.finalize(); - Ok(ArtifactHash(digest.as_slice().try_into()?)) -} - -/// For a given zone, save its service's logs into the provided destination -/// path. This path should be the location to a per-sled directory that will end -/// up in the final support bundle zip file. 
-async fn save_zone_log_zip_or_error( - logger: &slog::Logger, - client: &sled_agent_client::Client, - zone: &str, - path: &Utf8Path, -) -> anyhow::Result<()> { - // In the future when support bundle collection exposes tuning parameters - // this can turn into a collection parameter. - const DEFAULT_MAX_ROTATED_LOGS: u32 = 5; - - match client.support_logs_download(zone, DEFAULT_MAX_ROTATED_LOGS).await { - Ok(res) => { - let bytestream = res.into_inner(); - let output_dir = path.join(format!("logs/{zone}")); - let output_path = output_dir.join("logs.zip"); - - // Ensure the logs output directory exists. - tokio::fs::create_dir_all(&output_dir).await.with_context( - || format!("failed to create output directory: {output_dir}"), - )?; - - let mut file = - tokio::fs::File::create(&output_path).await.with_context( - || format!("failed to create file: {output_path}"), - )?; - - let stream = bytestream.into_inner().map(|chunk| { - chunk.map_err(|e| std::io::Error::other(e.to_string())) - }); - let mut reader = tokio_util::io::StreamReader::new(stream); - let _nbytes = tokio::io::copy(&mut reader, &mut file).await?; - file.flush().await?; - - // Unpack the zip so we don't end up with zip files inside of our - // final zip - let zipfile_path = output_path.clone(); - tokio::task::spawn_blocking(move || { - extract_zip_file(&output_dir, &zipfile_path) - }) - .await - .map_err(|join_error| { - anyhow::anyhow!(join_error) - .context("unzipping support bundle logs zip panicked") - })??; - - // Cleanup the zip file since we no longer need it - if let Err(e) = tokio::fs::remove_file(&output_path).await { - error!( - logger, - "failed to cleanup temporary logs zip file"; - "error" => %e, - "file" => %output_path, - - ); - } - } - Err(err) => { - tokio::fs::write( - path.join(format!("{zone}.logs.err")), - err.to_string(), - ) - .await?; - } - }; - - Ok(()) -} - -fn extract_zip_file( - output_dir: &Utf8Path, - zip_file: &Utf8Path, -) -> Result<(), anyhow::Error> { - let mut zip = 
std::fs::File::open(&zip_file) - .with_context(|| format!("failed to open zip file: {zip_file}"))?; - let mut archive = ZipArchive::new(&mut zip)?; - archive.extract(&output_dir).with_context(|| { - format!("failed to extract log zip file to: {output_dir}") - })?; - Ok(()) -} - -/// Run a `sled-dianostics` future and save its output to a corresponding file. -async fn save_diag_cmd_output_or_error( - path: &Utf8Path, - command: &str, - future: F, -) -> anyhow::Result<()> -where - F: Future< - Output = Result< - sled_agent_client::ResponseValue, - sled_agent_client::Error, - >, - > + Send, -{ - let result = future.await; - match result { - Ok(result) => { - let output = result.into_inner(); - let json = serde_json::to_string(&output).with_context(|| { - format!("failed to serialize {command} output as json") - })?; - tokio::fs::write(path.join(format!("{command}.json")), json) - .await - .with_context(|| { - format!("failed to write output of {command} to file") - })?; - } - Err(err) => { - tokio::fs::write( - path.join(format!("{command}_err.txt")), - err.to_string(), - ) - .await?; - } - } - Ok(()) -} - -/// Use MGS ignition info to find active SPs. -async fn get_available_sps( - mgs_client: &MgsClient, -) -> anyhow::Result> { - let ignition_info = mgs_client - .ignition_list() - .await - .context("failed to get ignition info from MGS")? - .into_inner(); - - let mut active_sps = Vec::new(); - for info in ignition_info { - if let SpIgnition::Yes { power, flt_sp, .. } = info.details { - // Only return SPs that are powered on and are not in a faulted state. - if power && !flt_sp { - active_sps.push(info.id); - } - } - } - - Ok(active_sps) -} - -/// Fetch and save task dumps from a single SP. -async fn save_sp_dumps( - mgs_client: &MgsClient, - sp: SpIdentifier, - sp_dumps_dir: &Utf8Path, -) -> anyhow::Result<()> { - let dump_count = mgs_client - .sp_task_dump_count(&sp.type_, sp.slot) - .await - .context("failed to get task dump count from SP")? 
- .into_inner(); - - let output_dir = sp_dumps_dir.join(format!("{}_{}", sp.type_, sp.slot)); - tokio::fs::create_dir_all(&output_dir).await.with_context(|| { - format!("Failed to create output directory {output_dir}") - })?; - - for i in 0..dump_count { - let task_dump = mgs_client - .sp_task_dump_get(&sp.type_, sp.slot, i) - .await - .with_context(|| format!("failed to get task dump {i} from SP"))? - .into_inner(); - - let zip_bytes = base64::engine::general_purpose::STANDARD - .decode(task_dump.base64_zip) - .context("failed to decode base64-encoded SP task dump zip")?; - - tokio::fs::write(output_dir.join(format!("dump-{i}.zip")), zip_bytes) - .await - .context("failed to write SP task dump zip to disk")?; - } - Ok(()) -} - -/// Write a file with a JSON mapping of sled serial numbers to cubby and UUIDs for easier -/// identification of sleds present in a bundle. -async fn write_sled_cubby_info( - log: &slog::Logger, - mgs_client: &MgsClient, - nexus_sleds: &[Sled], - dir: &Utf8Path, -) -> anyhow::Result<()> { - #[derive(Serialize)] - struct SledInfo { - cubby: Option, - uuid: Option, - } - - let available_sps = get_available_sps(&mgs_client) - .await - .context("failed to get available SPs")?; - - // We can still get a useful mapping of cubby to serial using just the data from MGS. 
- let mut nexus_map: BTreeMap<_, _> = nexus_sleds - .into_iter() - .map(|sled| (sled.serial_number(), sled)) - .collect(); - - let mut sled_info = BTreeMap::new(); - for sp in - available_sps.into_iter().filter(|sp| matches!(sp.type_, SpType::Sled)) - { - let sp_state = match mgs_client.sp_get(&sp.type_, sp.slot).await { - Ok(s) => s.into_inner(), - Err(e) => { - error!(log, - "Failed to get SP state for sled_info.json"; - "cubby" => sp.slot, - "component" => %sp.type_, - "error" => InlineErrorChain::new(&e) - ); - continue; - } - }; - - if let Some(sled) = nexus_map.remove(sp_state.serial_number.as_str()) { - sled_info.insert( - sp_state.serial_number.to_string(), - SledInfo { - cubby: Some(sp.slot), - uuid: Some(*sled.identity.id.as_untyped_uuid()), - }, - ); - } else { - sled_info.insert( - sp_state.serial_number.to_string(), - SledInfo { cubby: Some(sp.slot), uuid: None }, - ); - } - } - - // Sleds not returned by MGS. - for (serial, sled) in nexus_map { - sled_info.insert( - serial.to_string(), - SledInfo { - cubby: None, - uuid: Some(*sled.identity.id.as_untyped_uuid()), - }, - ); - } - - let json = serde_json::to_string_pretty(&sled_info) - .context("failed to serialize sled info to JSON")?; - tokio::fs::write(dir.join("sled_info.json"), json).await?; - - Ok(()) -} - -fn is_fs_safe_single_path_component(s: &str) -> bool { - // Might be path traversal... - if s == "." || s == ".." { - return false; - } - - if s == "~" { - return false; - } - - const BANNED_CHARS: &[char] = &[ - // Check for path separators. - // - // Naively, we might reach for `std::path::is_separator()` here. - // However, this function only checks if a path is a permitted - // separator on the *current* platform --- so, running on illumos, we - // will only check for Unix path separators. 
But, because the support - // bundle may be extracted on a workstation system by Oxide support - // personnel or by the customer, we should also make sure we don't - // allow the use of Windows path separators, which `is_separator()` - // won't check for on Unix systems. - '/', '\\', - // Characters forbidden on Windows, per: - // https://learn.microsoft.com/en-us/windows/win32/fileio/naming-a-file#naming-conventions - '<', '>', ':', '"', '|', '?', '*', - ]; - - // Rather than using `s.contains()`, we do all the checks in one pass. - for c in s.chars() { - if BANNED_CHARS.contains(&c) { - return false; - } - - // Definitely no control characters! - if c.is_control() { - return false; - } - } - - true -} - #[cfg(test)] mod test { use super::*; + use crate::app::background::tasks::support_bundle::perfetto; + use crate::app::background::tasks::support_bundle::request::BundleData; use crate::app::support_bundles::SupportBundleQueryType; - use camino_tempfile::tempdir; use http_body_util::BodyExt; use nexus_db_model::PhysicalDisk; use nexus_db_model::PhysicalDiskKind; @@ -2175,6 +451,9 @@ mod test { use nexus_test_utils::SLED_AGENT_UUID; use nexus_test_utils_macros::nexus_test; use nexus_types::fm::ereport::{EreportData, EreportId, Reporter}; + use nexus_types::identity::Asset; + use nexus_types::internal_api::background::SupportBundleCollectionStep; + use nexus_types::internal_api::background::SupportBundleEreportStatus; use nexus_types::inventory::SpType; use omicron_common::api::external::ByteCount; use omicron_common::api::internal::shared::DatasetKind; @@ -2183,39 +462,18 @@ mod test { use omicron_common::disk::DatasetsConfig; use omicron_common::disk::SharedDatasetConfig; use omicron_common::zpool_name::ZpoolName; + use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::{ BlueprintUuid, DatasetUuid, EreporterRestartUuid, OmicronZoneUuid, PhysicalDiskUuid, SledUuid, }; + use std::collections::HashSet; + use std::num::NonZeroU64; use uuid::Uuid; type 
ControlPlaneTestContext = nexus_test_utils::ControlPlaneTestContext; - // Ensure that we can convert a temporary directory into a zipfile - #[test] - fn test_zipfile_creation() { - let dir = tempdir().unwrap(); - - std::fs::create_dir_all(dir.path().join("dir-a")).unwrap(); - std::fs::create_dir_all(dir.path().join("dir-b")).unwrap(); - std::fs::write(dir.path().join("dir-a").join("file-a"), "some data") - .unwrap(); - std::fs::write(dir.path().join("file-b"), "more data").unwrap(); - - let zipfile = bundle_to_zipfile(&dir) - .expect("Should have been able to bundle zipfile"); - let archive = zip::read::ZipArchive::new(zipfile).unwrap(); - - // We expect the order to be deterministically alphabetical - let mut names = archive.file_names(); - assert_eq!(names.next(), Some("dir-a/")); - assert_eq!(names.next(), Some("dir-a/file-a")); - assert_eq!(names.next(), Some("dir-b/")); - assert_eq!(names.next(), Some("file-b")); - assert_eq!(names.next(), None); - } - // If we have not populated any bundles needing cleanup, the cleanup // process should succeed with an empty cleanup report. 
#[nexus_test(server = crate::Server)] From 73fe0ec6a609c8e66130b2f15cdc8d6b60a328fe Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 10 Dec 2025 10:46:18 -0800 Subject: [PATCH 12/18] fix integration test --- nexus/tests/integration_tests/support_bundles.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nexus/tests/integration_tests/support_bundles.rs b/nexus/tests/integration_tests/support_bundles.rs index 80ec8af191f..4bd8d1e2258 100644 --- a/nexus/tests/integration_tests/support_bundles.rs +++ b/nexus/tests/integration_tests/support_bundles.rs @@ -530,6 +530,8 @@ async fn test_support_bundle_lifecycle(cptestctx: &ControlPlaneTestContext) { let archive = ZipArchive::new(Cursor::new(&contents)).unwrap(); let mut names = archive.file_names(); assert_eq!(names.next(), Some("bundle_id.txt")); + assert_eq!(names.next(), Some("meta/")); + assert_eq!(names.next(), Some("meta/trace.json")); assert_eq!(names.next(), Some("rack/")); assert!(names.any(|n| n == "sp_task_dumps/")); // There's much more data in the bundle, but validating it isn't the point From 7979e2377f069a9cf20e907c2eb8dc85367a4491 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 12 Dec 2025 10:48:01 -0800 Subject: [PATCH 13/18] Elaborate on directory structure --- .../background/tasks/support_bundle/README.md | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/nexus/src/app/background/tasks/support_bundle/README.md b/nexus/src/app/background/tasks/support_bundle/README.md index e6a52539afd..b36a6911364 100644 --- a/nexus/src/app/background/tasks/support_bundle/README.md +++ b/nexus/src/app/background/tasks/support_bundle/README.md @@ -64,3 +64,23 @@ contents should be included. expensive operations which might be shared with other steps (e.g., reading from the database, creating and using progenitor clients, etc) consider adding that data to `support_bundle/cache`. + +## Bundle Directory Structure + +The following is the convention for Support Bundle files. 
It can, and should, +change over time. However, we list it here to make sure data is located +somewhere consistent and predictable. + +(Please keep this list alphabetized) + +* `bundle_id.txt` - UUID of the bundle itself +* `ereports/` - All requested error reports +* `ereports/{part number}-{serial number}/{id}.json` - Individual reports +* `meta/` - Metadata about the bundle +* `meta/trace.json` - Perfetto-formatted trace of the bundle's collection +* `rack/{rack id}/sled/{sled id}/` - Sled-specific host OS info +* `reconfigurator_state.json` - A dump of all reconfigurator state +* `sled_info.json` - Mapping of sled identifiers to cubby location +* `sp_task_dumps/` - All SP dumps +* `sp_task_dumps/{SP type}_{SP slot}/dump-{id}.zip` - Individual SP dumps + From 8001cbd355de3473c159f7876c531e3d33490476 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 15 Dec 2025 15:24:32 -0800 Subject: [PATCH 14/18] omdb integration for bg task list --- nexus/src/app/background/init.rs | 4 ++ .../tasks/support_bundle/collection.rs | 7 +++ .../tasks/support_bundle/steps/mod.rs | 5 ++ .../tasks/support_bundle/steps/omdb.rs | 46 +++++++++++++++++++ .../tasks/support_bundle_collector.rs | 23 +++++++++- nexus/src/app/mod.rs | 1 + .../integration_tests/support_bundles.rs | 9 +++- nexus/types/src/internal_api/background.rs | 1 + 8 files changed, 94 insertions(+), 2 deletions(-) create mode 100644 nexus/src/app/background/tasks/support_bundle/steps/omdb.rs diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index c56a26970a1..89ee67a7430 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -144,6 +144,7 @@ use nexus_background_task_interface::Activator; use nexus_background_task_interface::BackgroundTasks; use nexus_config::BackgroundTaskConfig; use nexus_config::DnsTasksConfig; +use nexus_config::OmdbConfig; use nexus_db_model::DnsGroup; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; @@ 
-632,6 +633,7 @@ impl BackgroundTasksInitializer { resolver.clone(), config.support_bundle_collector.disable, nexus_id, + args.omdb_config.clone(), ), ), opctx: opctx.child(BTreeMap::new()), @@ -1191,6 +1193,8 @@ pub struct BackgroundTasksData { /// Channel for exposing the latest loaded fault-management sitrep. pub sitrep_load_tx: watch::Sender>>, + /// PATH information for `omdb`, for tasks that want to invoke it directly + pub omdb_config: OmdbConfig, } /// Starts the three DNS-propagation-related background tasks for either diff --git a/nexus/src/app/background/tasks/support_bundle/collection.rs b/nexus/src/app/background/tasks/support_bundle/collection.rs index 1008c85128f..46a8a721e22 100644 --- a/nexus/src/app/background/tasks/support_bundle/collection.rs +++ b/nexus/src/app/background/tasks/support_bundle/collection.rs @@ -53,6 +53,7 @@ pub struct BundleCollection { request: BundleRequest, bundle: SupportBundle, transfer_chunk_size: NonZeroU64, + omdb_config: nexus_config::OmdbConfig, } impl BundleCollection { @@ -64,6 +65,7 @@ impl BundleCollection { request: BundleRequest, bundle: SupportBundle, transfer_chunk_size: NonZeroU64, + omdb_config: nexus_config::OmdbConfig, ) -> Self { Self { datastore, @@ -73,6 +75,7 @@ impl BundleCollection { request, bundle, transfer_chunk_size, + omdb_config, } } @@ -100,6 +103,10 @@ impl BundleCollection { &self.bundle } + pub fn omdb_config(&self) -> &nexus_config::OmdbConfig { + &self.omdb_config + } + /// Collect the bundle within Nexus, and store it on a target sled. 
pub async fn collect_bundle_and_store_on_sled( self: &Arc, diff --git a/nexus/src/app/background/tasks/support_bundle/steps/mod.rs b/nexus/src/app/background/tasks/support_bundle/steps/mod.rs index d2179c74b8c..cade1943ff0 100644 --- a/nexus/src/app/background/tasks/support_bundle/steps/mod.rs +++ b/nexus/src/app/background/tasks/support_bundle/steps/mod.rs @@ -12,6 +12,7 @@ use nexus_types::internal_api::background::SupportBundleCollectionStep; mod bundle_id; mod ereports; mod host_info; +mod omdb; mod reconfigurator; mod sled_cubby; mod sp_dumps; @@ -85,5 +86,9 @@ pub fn all(cache: &Cache) -> Vec { } }), ), + CollectionStep::new( + SupportBundleCollectionStep::STEP_OMDB, + Box::new(|collection, dir| omdb::collect(collection, dir).boxed()), + ), ] } diff --git a/nexus/src/app/background/tasks/support_bundle/steps/omdb.rs b/nexus/src/app/background/tasks/support_bundle/steps/omdb.rs new file mode 100644 index 00000000000..b5ec795b4d3 --- /dev/null +++ b/nexus/src/app/background/tasks/support_bundle/steps/omdb.rs @@ -0,0 +1,46 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! 
Collects output from omdb commands + +use crate::app::background::tasks::support_bundle::collection::BundleCollection; +use crate::app::background::tasks::support_bundle::step::CollectionStepOutput; +use camino::Utf8Path; +use tokio::process::Command; + +pub async fn collect( + collection: &BundleCollection, + dir: &Utf8Path, +) -> anyhow::Result { + // Create the omdb/nexus/background-tasks directory + let omdb_dir = dir.join("omdb/nexus/background-tasks"); + tokio::fs::create_dir_all(&omdb_dir).await?; + + // Run the omdb command + let omdb_path = &collection.omdb_config().bin_path; + let output = Command::new(omdb_path) + .arg("nexus") + .arg("background-tasks") + .arg("list") + .output() + .await?; + + // Write the output to list.txt + let output_path = omdb_dir.join("list.txt"); + let output_text = if output.status.success() { + String::from_utf8_lossy(&output.stdout).to_string() + } else { + // If the command failed, include both stdout and stderr + format!( + "Command failed with exit code: {:?}\n\nSTDOUT:\n{}\n\nSTDERR:\n{}", + output.status.code(), + String::from_utf8_lossy(&output.stdout), + String::from_utf8_lossy(&output.stderr) + ) + }; + + tokio::fs::write(output_path, output_text).await?; + + Ok(CollectionStepOutput::None) +} diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 887be497a17..9c4227babf2 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -56,6 +56,7 @@ pub struct SupportBundleCollector { resolver: Resolver, disable: bool, nexus_id: OmicronZoneUuid, + omdb_config: nexus_config::OmdbConfig, } impl SupportBundleCollector { @@ -64,8 +65,15 @@ impl SupportBundleCollector { resolver: Resolver, disable: bool, nexus_id: OmicronZoneUuid, + omdb_config: nexus_config::OmdbConfig, ) -> Self { - SupportBundleCollector { datastore, resolver, disable, nexus_id } + 
SupportBundleCollector { + datastore, + resolver, + disable, + nexus_id, + omdb_config, + } } // Tells a sled agent to delete a support bundle @@ -357,6 +365,7 @@ impl SupportBundleCollector { request.clone(), bundle.clone(), request.transfer_chunk_size, + self.omdb_config.clone(), )); let authz_bundle = authz_support_bundle_from_id(bundle.id.into()); @@ -490,6 +499,7 @@ mod test { resolver.clone(), false, nexus.id(), + nexus_config::OmdbConfig { bin_path: "/nonexistent/omdb".into() }, ); let report = collector @@ -516,6 +526,7 @@ mod test { resolver.clone(), false, nexus.id(), + nexus_config::OmdbConfig { bin_path: "/nonexistent/omdb".into() }, ); let request = BundleRequest::default(); @@ -823,6 +834,7 @@ mod test { resolver.clone(), false, nexus.id(), + nexus_config::OmdbConfig { bin_path: "/nonexistent/omdb".into() }, ); // The bundle collection should complete successfully. @@ -902,6 +914,7 @@ mod test { resolver.clone(), false, nexus.id(), + nexus_config::OmdbConfig { bin_path: "/nonexistent/omdb".into() }, ); // Collect the bundle @@ -1013,6 +1026,7 @@ mod test { resolver.clone(), false, nexus.id(), + nexus_config::OmdbConfig { bin_path: "/nonexistent/omdb".into() }, ); // The bundle collection should complete successfully. @@ -1121,6 +1135,7 @@ mod test { resolver.clone(), false, nexus.id(), + nexus_config::OmdbConfig { bin_path: "/nonexistent/omdb".into() }, ); // Each time we call "collect_bundle", we collect a SINGLE bundle. 
@@ -1235,6 +1250,7 @@ mod test { resolver.clone(), false, nexus.id(), + nexus_config::OmdbConfig { bin_path: "/nonexistent/omdb".into() }, ); let report = collector @@ -1288,6 +1304,7 @@ mod test { resolver.clone(), false, nexus.id(), + nexus_config::OmdbConfig { bin_path: "/nonexistent/omdb".into() }, ); let mut request = BundleRequest::default(); request.data_selection.insert(BundleData::HostInfo(HashSet::new())); @@ -1387,6 +1404,7 @@ mod test { resolver.clone(), false, nexus.id(), + nexus_config::OmdbConfig { bin_path: "/nonexistent/omdb".into() }, ); let report = collector @@ -1443,6 +1461,7 @@ mod test { resolver.clone(), false, nexus.id(), + nexus_config::OmdbConfig { bin_path: "/nonexistent/omdb".into() }, ); let mut request = BundleRequest::default(); request.data_selection.insert(BundleData::HostInfo(HashSet::new())); @@ -1528,6 +1547,7 @@ mod test { resolver.clone(), false, nexus.id(), + nexus_config::OmdbConfig { bin_path: "/nonexistent/omdb".into() }, ); let mut request = BundleRequest::default(); request.data_selection.insert(BundleData::HostInfo(HashSet::new())); @@ -1612,6 +1632,7 @@ mod test { resolver.clone(), false, nexus.id(), + nexus_config::OmdbConfig { bin_path: "/nonexistent/omdb".into() }, ); // Collect the bundle diff --git a/nexus/src/app/mod.rs b/nexus/src/app/mod.rs index f9a8057958c..42f7750159b 100644 --- a/nexus/src/app/mod.rs +++ b/nexus/src/app/mod.rs @@ -647,6 +647,7 @@ impl Nexus { mgs_updates_tx, blueprint_load_tx, sitrep_load_tx, + omdb_config: task_config.pkg.omdb.clone(), }, ); diff --git a/nexus/tests/integration_tests/support_bundles.rs b/nexus/tests/integration_tests/support_bundles.rs index 4bd8d1e2258..d605168ee18 100644 --- a/nexus/tests/integration_tests/support_bundles.rs +++ b/nexus/tests/integration_tests/support_bundles.rs @@ -528,10 +528,17 @@ async fn test_support_bundle_lifecycle(cptestctx: &ControlPlaneTestContext) { // Now we should be able to download the bundle let contents = bundle_download(&client, 
bundle.id).await.unwrap(); let archive = ZipArchive::new(Cursor::new(&contents)).unwrap(); - let mut names = archive.file_names(); + let mut names = archive.file_names().peekable(); assert_eq!(names.next(), Some("bundle_id.txt")); assert_eq!(names.next(), Some("meta/")); assert_eq!(names.next(), Some("meta/trace.json")); + assert_eq!(names.next(), Some("omdb/")); + while let Some(name) = names.peek() { + if !name.starts_with("omdb/") { + break; + } + let _ = names.next(); + } assert_eq!(names.next(), Some("rack/")); assert!(names.any(|n| n == "sp_task_dumps/")); // There's much more data in the bundle, but validating it isn't the point diff --git a/nexus/types/src/internal_api/background.rs b/nexus/types/src/internal_api/background.rs index dfe008198f9..6661592a192 100644 --- a/nexus/types/src/internal_api/background.rs +++ b/nexus/types/src/internal_api/background.rs @@ -303,6 +303,7 @@ impl SupportBundleCollectionStep { pub const STEP_SPAWN_SP_DUMPS: &'static str = "spawn steps to query all SP dumps"; pub const STEP_SPAWN_SLEDS: &'static str = "spawn steps to query all sleds"; + pub const STEP_OMDB: &'static str = "omdb diagnostic output"; } #[derive(Debug, Deserialize, Serialize, PartialEq, Eq)] From ceb03e4d2e40b84231f8c057f81f64315c98fc10 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 15 Dec 2025 17:00:58 -0800 Subject: [PATCH 15/18] more commands --- .../tasks/support_bundle/collection.rs | 1 + .../tasks/support_bundle/steps/omdb.rs | 106 +++++++++++++++--- 2 files changed, 91 insertions(+), 16 deletions(-) diff --git a/nexus/src/app/background/tasks/support_bundle/collection.rs b/nexus/src/app/background/tasks/support_bundle/collection.rs index 46a8a721e22..b042abf72cf 100644 --- a/nexus/src/app/background/tasks/support_bundle/collection.rs +++ b/nexus/src/app/background/tasks/support_bundle/collection.rs @@ -57,6 +57,7 @@ pub struct BundleCollection { } impl BundleCollection { + #[allow(clippy::too_many_arguments)] pub fn new( datastore: Arc, 
resolver: Resolver, diff --git a/nexus/src/app/background/tasks/support_bundle/steps/omdb.rs b/nexus/src/app/background/tasks/support_bundle/steps/omdb.rs index b5ec795b4d3..e089e5f26db 100644 --- a/nexus/src/app/background/tasks/support_bundle/steps/omdb.rs +++ b/nexus/src/app/background/tasks/support_bundle/steps/omdb.rs @@ -9,38 +9,112 @@ use crate::app::background::tasks::support_bundle::step::CollectionStepOutput; use camino::Utf8Path; use tokio::process::Command; -pub async fn collect( +/// Run an omdb command and write its output to a file within the bundle. +/// +/// This function returns an error if we cannot write to our local filesystem, +/// or cannot run the omdb command at all. However, if the omdb command runs +/// and fails, it returns "Ok()". +/// +/// # Arguments +/// * `collection` - The bundle collection context +/// * `dir` - The root directory of the bundle +/// * `args` - The arguments to pass to omdb (e.g., `&["nexus", "background-tasks", "list"]`) +/// * `output_path` - The relative path within the bundle where output should be written +/// (e.g., `"omdb/nexus/background-tasks/list.txt"`) +async fn run_omdb( collection: &BundleCollection, dir: &Utf8Path, -) -> anyhow::Result { - // Create the omdb/nexus/background-tasks directory - let omdb_dir = dir.join("omdb/nexus/background-tasks"); - tokio::fs::create_dir_all(&omdb_dir).await?; + args: &[&str], + output_path: &str, +) -> anyhow::Result<()> { + let full_output_path = dir.join(output_path); + + // Create parent directories if they don't exist + if let Some(parent) = full_output_path.parent() { + tokio::fs::create_dir_all(parent).await?; + } // Run the omdb command let omdb_path = &collection.omdb_config().bin_path; - let output = Command::new(omdb_path) - .arg("nexus") - .arg("background-tasks") - .arg("list") - .output() - .await?; - - // Write the output to list.txt - let output_path = omdb_dir.join("list.txt"); + let output = Command::new(omdb_path).args(args).output().await?; + + // 
Format the output
     let output_text = if output.status.success() {
         String::from_utf8_lossy(&output.stdout).to_string()
     } else {
         // If the command failed, include both stdout and stderr
         format!(
-            "Command failed with exit code: {:?}\n\nSTDOUT:\n{}\n\nSTDERR:\n{}",
+            "Command {} failed with exit code: {:?}\n\nSTDOUT:\n{}\n\nSTDERR:\n{}",
+            args.join(" "),
             output.status.code(),
             String::from_utf8_lossy(&output.stdout),
             String::from_utf8_lossy(&output.stderr)
         )
     };
 
-    tokio::fs::write(output_path, output_text).await?;
+    tokio::fs::write(full_output_path, output_text).await?;
+    Ok(())
+}
+
+/// Collect diagnostic output from various omdb commands.
+///
+/// This function runs multiple omdb queries and stores their output in the bundle.
+/// To add more omdb queries, simply add another `run_omdb()` call with the
+/// appropriate arguments and output path.
+pub async fn collect(
+    collection: &BundleCollection,
+    dir: &Utf8Path,
+) -> anyhow::Result<CollectionStepOutput> {
+    // NOTE: We could parallelize these commands, if they take a while.
+    //
+    // NOTE: These commands issue queries to "some Nexus", as returned by DNS -
+    // not necessarily our own Nexus. We may want to include queries to
+    // each Nexus instance individually in a future iteration, especially for
+    // "nexus-specific" commands.
+
+    // Run a sequence of omdb commands. If any of these commands fail, we'll
+    // save the stdout and stderr, and proceed to the next one (note that
+    // "run_omdb" does not return an error when the output is not successful).
+ + run_omdb( + collection, + dir, + &["nexus", "background-tasks", "list"], + "omdb/nexus/background-tasks/list.txt", + ) + .await?; + + run_omdb( + collection, + dir, + &["nexus", "quiesce", "show"], + "omdb/nexus/quiesce/show.txt", + ) + .await?; + + run_omdb( + collection, + dir, + &["nexus", "mgs-updates"], + "omdb/nexus/mgs-updates.txt", + ) + .await?; + + run_omdb( + collection, + dir, + &["nexus", "update-status"], + "omdb/nexus/update-status.txt", + ) + .await?; + + run_omdb( + collection, + dir, + &["db", "saga", "running"], + "omdb/db/saga/running", + ) + .await?; Ok(CollectionStepOutput::None) } From 1cb8fcb237cd1e02e776a9b251e1fba902b53bf9 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 17 Dec 2025 13:23:37 -0800 Subject: [PATCH 16/18] integration testing --- .config/nextest.toml | 10 ++++ .../tasks/support_bundle/steps/omdb.rs | 10 +++- nexus/test-utils/src/starter.rs | 27 +++++++++ .../integration_tests/support_bundles.rs | 57 ++++++++++++++++++- 4 files changed, 102 insertions(+), 2 deletions(-) diff --git a/.config/nextest.toml b/.config/nextest.toml index c77c8627f1e..96460bccfab 100644 --- a/.config/nextest.toml +++ b/.config/nextest.toml @@ -15,6 +15,12 @@ experimental = ["setup-scripts"] filter = 'rdeps(nexus-test-utils) - package(omicron-dev) - package(omicron-live-tests)' setup = 'crdb-seed' +[[profile.default.scripts]] +# Build omdb for usage within Nexus integration tests. +# This was initially added for the support bundle integration tests. +filter = 'rdeps(nexus-test-utils) - package(omicron-dev) - package(omicron-live-tests)' +setup = 'omdb-build' + [profile.ci] fail-fast = false @@ -26,6 +32,10 @@ path = "junit.xml" # invocations of nextest happen. 
command = 'cargo run -p crdb-seed --profile test' +[scripts.setup.omdb-build] +# Build omdb binary for usage by integration tests +command = 'cargo build --bin omdb' + [[profile.default.scripts]] filter = 'package(omicron-clickhouse-admin)' setup = 'clickhouse-cluster' diff --git a/nexus/src/app/background/tasks/support_bundle/steps/omdb.rs b/nexus/src/app/background/tasks/support_bundle/steps/omdb.rs index e089e5f26db..8d4d6768464 100644 --- a/nexus/src/app/background/tasks/support_bundle/steps/omdb.rs +++ b/nexus/src/app/background/tasks/support_bundle/steps/omdb.rs @@ -36,7 +36,15 @@ async fn run_omdb( // Run the omdb command let omdb_path = &collection.omdb_config().bin_path; - let output = Command::new(omdb_path).args(args).output().await?; + let output = + Command::new(omdb_path).args(args).output().await.map_err(|e| { + anyhow::anyhow!( + "Failed to execute omdb at {:?} with args {:?}: {}", + omdb_path, + args, + e + ) + })?; // Format the output let output_text = if output.status.success() { diff --git a/nexus/test-utils/src/starter.rs b/nexus/test-utils/src/starter.rs index 412959d3d63..cac54381db1 100644 --- a/nexus/test-utils/src/starter.rs +++ b/nexus/test-utils/src/starter.rs @@ -560,6 +560,33 @@ impl<'a, N: NexusServer> ControlPlaneStarter<'a, N> { .clone(), }; + // Configure the omdb binary path for tests. + // The binary is built by cargo at the workspace root in target//omdb. + // Tests run from the nexus directory, so we need to go up one level. 
+ let workspace_root = std::env::current_dir() + .expect("Failed to get current directory") + .parent() + .expect("Current directory should have a parent") + .to_path_buf(); + let omdb_debug = workspace_root.join("target/debug/omdb"); + let omdb_release = workspace_root.join("target/release/omdb"); + + self.config.pkg.omdb.bin_path = if omdb_release.exists() { + camino::Utf8PathBuf::try_from(omdb_release) + .expect("Failed to convert release path to UTF-8") + } else if omdb_debug.exists() { + camino::Utf8PathBuf::try_from(omdb_debug) + .expect("Failed to convert debug path to UTF-8") + } else { + // omdb hasn't been built yet - use a path that will fail gracefully + // when tests try to use it. + // + // Our rules in ".config/nextest.toml" should prevent this, but this + // acts as a defensive buffer against running without nextest, or + // changing the directory layout. + camino::Utf8PathBuf::from("/nonexistent/omdb") + }; + let nexus_internal = N::start_internal(&self.config, &log).await?; let nexus_internal_addr = nexus_internal.get_http_server_internal_address(); diff --git a/nexus/tests/integration_tests/support_bundles.rs b/nexus/tests/integration_tests/support_bundles.rs index d605168ee18..22b7082e82c 100644 --- a/nexus/tests/integration_tests/support_bundles.rs +++ b/nexus/tests/integration_tests/support_bundles.rs @@ -521,6 +521,10 @@ async fn test_support_bundle_lifecycle(cptestctx: &ControlPlaneTestContext) { step_names.contains(&SupportBundleCollectionStep::STEP_SPAWN_SP_DUMPS), "Should have attempted to list service processors" ); + assert!( + step_names.contains(&SupportBundleCollectionStep::STEP_OMDB), + "Should have run omdb diagnostic commands" + ); let bundle = bundle_get(&client, bundle.id).await.unwrap(); assert_eq!(bundle.state, SupportBundleState::Active); @@ -528,17 +532,68 @@ async fn test_support_bundle_lifecycle(cptestctx: &ControlPlaneTestContext) { // Now we should be able to download the bundle let contents = bundle_download(&client, 
bundle.id).await.unwrap(); let archive = ZipArchive::new(Cursor::new(&contents)).unwrap(); - let mut names = archive.file_names().peekable(); + let mut names = archive.file_names().collect::>(); + names.sort(); + let mut names = names.into_iter().peekable(); + assert_eq!(names.next(), Some("bundle_id.txt")); assert_eq!(names.next(), Some("meta/")); assert_eq!(names.next(), Some("meta/trace.json")); assert_eq!(names.next(), Some("omdb/")); + + // Collect omdb file names and verify they exist + let mut omdb_files = Vec::new(); while let Some(name) = names.peek() { if !name.starts_with("omdb/") { break; } + omdb_files.push(*name); let _ = names.next(); } + + // Verify we have omdb output files + assert!(!omdb_files.is_empty(), "Should have omdb output files"); + + // Verify that none of the omdb output files contain "error: unrecognized + // subcommand" This catches regressions where omdb's command structure + // changes and our hardcoded commands become invalid. + let mut archive = ZipArchive::new(Cursor::new(&contents)).unwrap(); + let mut files_checked = 0; + for file_name in &omdb_files { + // Skip directories + if file_name.ends_with('/') { + continue; + } + + let mut file = archive + .by_name(file_name) + .unwrap_or_else(|_| panic!("Should be able to open {}", file_name)); + let mut content = String::new(); + std::io::Read::read_to_string(&mut file, &mut content) + .unwrap_or_else(|_| panic!("Should be able to read {}", file_name)); + + files_checked += 1; + + // Validate that the omdb command is valid, even if it can't connect + // to a runnine Nexus right now. + assert!( + !content.contains("error: unrecognized subcommand"), + "File {} contains 'error: unrecognized subcommand'.\n\ + This indicates the omdb command is invalid. Content:\n{}", + file_name, + content + ); + } + + // Make sure we actually checked at least one omdb output file. + // If this fails, it means the bundle had omdb directories but no actual + // output files, which would be a bug. 
+ assert!( + files_checked > 0, + "Expected to check at least one omdb output file, but found only directories. Files: {:?}", + omdb_files + ); + assert_eq!(names.next(), Some("rack/")); assert!(names.any(|n| n == "sp_task_dumps/")); // There's much more data in the bundle, but validating it isn't the point From b1d5434b0e3b818217fcf81ffec9707c6b625312 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 17 Dec 2025 14:13:36 -0800 Subject: [PATCH 17/18] typos, docs --- nexus/src/app/background/tasks/support_bundle/README.md | 1 + nexus/tests/integration_tests/support_bundles.rs | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/nexus/src/app/background/tasks/support_bundle/README.md b/nexus/src/app/background/tasks/support_bundle/README.md index 446cab83e83..8eebf508883 100644 --- a/nexus/src/app/background/tasks/support_bundle/README.md +++ b/nexus/src/app/background/tasks/support_bundle/README.md @@ -78,6 +78,7 @@ somewhere consistent and predictable. * `ereports/{part number}-{serial number}/{id}.json` - Individual reports * `meta/` - Metadata about the bundle * `meta/trace.json` - Perfetto-formatted trace of the bundle's collection +* `omdb/` - Output from omdb commands * `rack/{rack id}/sled/{sled id}/` - Sled-specific host OS info * `reconfigurator_state.json` - A dump of all reconfigurator state * `sled_info.json` - Mapping of sled identifiers to cubby location diff --git a/nexus/tests/integration_tests/support_bundles.rs b/nexus/tests/integration_tests/support_bundles.rs index 22b7082e82c..42e93bc522e 100644 --- a/nexus/tests/integration_tests/support_bundles.rs +++ b/nexus/tests/integration_tests/support_bundles.rs @@ -575,7 +575,7 @@ async fn test_support_bundle_lifecycle(cptestctx: &ControlPlaneTestContext) { files_checked += 1; // Validate that the omdb command is valid, even if it can't connect - // to a runnine Nexus right now. + // to a running Nexus right now. 
assert!( !content.contains("error: unrecognized subcommand"), "File {} contains 'error: unrecognized subcommand'.\n\ From adc037a511c8338f3446ba2e67c7143116a60f5a Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Thu, 18 Dec 2025 12:53:43 -0800 Subject: [PATCH 18/18] split omdb into lib/bin, make dup, use it in nexus tests --- .config/nextest.toml | 10 - Cargo.lock | 2 + dev-tools/omdb/src/bin/omdb/main.rs | 309 +---------------- .../omdb/src/{bin/omdb => }/crucible_agent.rs | 0 .../src/{bin/omdb => }/crucible_pantry.rs | 0 dev-tools/omdb/src/{bin/omdb => }/db.rs | 0 dev-tools/omdb/src/{bin/omdb => }/db/alert.rs | 0 .../omdb/src/{bin/omdb => }/db/blueprints.rs | 0 .../omdb/src/{bin/omdb => }/db/db_metadata.rs | 0 .../omdb/src/{bin/omdb => }/db/ereport.rs | 0 dev-tools/omdb/src/{bin/omdb => }/db/saga.rs | 0 .../omdb/src/{bin/omdb => }/db/sitrep.rs | 0 .../src/{bin/omdb => }/db/user_data_export.rs | 0 .../omdb/src/{bin/omdb => }/db/whatis.rs | 0 dev-tools/omdb/src/{bin/omdb => }/helpers.rs | 0 dev-tools/omdb/src/lib.rs | 317 ++++++++++++++++++ dev-tools/omdb/src/{bin/omdb => }/mgs.rs | 0 .../omdb/src/{bin/omdb => }/mgs/dashboard.rs | 0 .../omdb/src/{bin/omdb => }/mgs/sensors.rs | 0 dev-tools/omdb/src/{bin/omdb => }/nexus.rs | 0 .../omdb/src/{bin/omdb => }/nexus/quiesce.rs | 0 .../omdb => }/nexus/reconfigurator_config.rs | 0 .../src/{bin/omdb => }/nexus/update_status.rs | 0 dev-tools/omdb/src/{bin/omdb => }/oximeter.rs | 0 dev-tools/omdb/src/{bin/omdb => }/oxql.rs | 0 .../omdb/src/{bin/omdb => }/reconfigurator.rs | 0 .../omdb/src/{bin/omdb => }/sled_agent.rs | 0 .../omdb/src/{bin/omdb => }/support_bundle.rs | 0 nexus/Cargo.toml | 6 + nexus/src/bin/omdb-dup.rs | 25 ++ nexus/test-utils-macros/src/lib.rs | 15 + nexus/test-utils/src/starter.rs | 27 -- 32 files changed, 373 insertions(+), 338 deletions(-) rename dev-tools/omdb/src/{bin/omdb => }/crucible_agent.rs (100%) rename dev-tools/omdb/src/{bin/omdb => }/crucible_pantry.rs (100%) rename dev-tools/omdb/src/{bin/omdb => 
}/db.rs (100%) rename dev-tools/omdb/src/{bin/omdb => }/db/alert.rs (100%) rename dev-tools/omdb/src/{bin/omdb => }/db/blueprints.rs (100%) rename dev-tools/omdb/src/{bin/omdb => }/db/db_metadata.rs (100%) rename dev-tools/omdb/src/{bin/omdb => }/db/ereport.rs (100%) rename dev-tools/omdb/src/{bin/omdb => }/db/saga.rs (100%) rename dev-tools/omdb/src/{bin/omdb => }/db/sitrep.rs (100%) rename dev-tools/omdb/src/{bin/omdb => }/db/user_data_export.rs (100%) rename dev-tools/omdb/src/{bin/omdb => }/db/whatis.rs (100%) rename dev-tools/omdb/src/{bin/omdb => }/helpers.rs (100%) create mode 100644 dev-tools/omdb/src/lib.rs rename dev-tools/omdb/src/{bin/omdb => }/mgs.rs (100%) rename dev-tools/omdb/src/{bin/omdb => }/mgs/dashboard.rs (100%) rename dev-tools/omdb/src/{bin/omdb => }/mgs/sensors.rs (100%) rename dev-tools/omdb/src/{bin/omdb => }/nexus.rs (100%) rename dev-tools/omdb/src/{bin/omdb => }/nexus/quiesce.rs (100%) rename dev-tools/omdb/src/{bin/omdb => }/nexus/reconfigurator_config.rs (100%) rename dev-tools/omdb/src/{bin/omdb => }/nexus/update_status.rs (100%) rename dev-tools/omdb/src/{bin/omdb => }/oximeter.rs (100%) rename dev-tools/omdb/src/{bin/omdb => }/oxql.rs (100%) rename dev-tools/omdb/src/{bin/omdb => }/reconfigurator.rs (100%) rename dev-tools/omdb/src/{bin/omdb => }/sled_agent.rs (100%) rename dev-tools/omdb/src/{bin/omdb => }/support_bundle.rs (100%) create mode 100644 nexus/src/bin/omdb-dup.rs diff --git a/.config/nextest.toml b/.config/nextest.toml index 96460bccfab..c77c8627f1e 100644 --- a/.config/nextest.toml +++ b/.config/nextest.toml @@ -15,12 +15,6 @@ experimental = ["setup-scripts"] filter = 'rdeps(nexus-test-utils) - package(omicron-dev) - package(omicron-live-tests)' setup = 'crdb-seed' -[[profile.default.scripts]] -# Build omdb for usage within Nexus integration tests. -# This was initially added for the support bundle integration tests. 
-filter = 'rdeps(nexus-test-utils) - package(omicron-dev) - package(omicron-live-tests)' -setup = 'omdb-build' - [profile.ci] fail-fast = false @@ -32,10 +26,6 @@ path = "junit.xml" # invocations of nextest happen. command = 'cargo run -p crdb-seed --profile test' -[scripts.setup.omdb-build] -# Build omdb binary for usage by integration tests -command = 'cargo build --bin omdb' - [[profile.default.scripts]] filter = 'package(omicron-clickhouse-admin)' setup = 'clickhouse-cluster' diff --git a/Cargo.lock b/Cargo.lock index 004917e5a80..f2d89bf7868 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8322,6 +8322,7 @@ dependencies = [ "num-integer", "omicron-cockroach-metrics", "omicron-common", + "omicron-omdb", "omicron-passwords", "omicron-rpaths", "omicron-sled-agent", @@ -8372,6 +8373,7 @@ dependencies = [ "serde_urlencoded", "serde_with", "sha2", + "sigpipe", "similar-asserts", "sled-agent-client", "sled-agent-types", diff --git a/dev-tools/omdb/src/bin/omdb/main.rs b/dev-tools/omdb/src/bin/omdb/main.rs index 61e558daa43..44ceec034ce 100644 --- a/dev-tools/omdb/src/bin/omdb/main.rs +++ b/dev-tools/omdb/src/bin/omdb/main.rs @@ -2,311 +2,18 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -//! CLI for debugging Omicron internal state +//! Omicron debugger (omdb) - binary entrypoint //! -//! GROUND RULES: -//! -//! 1. There aren't a lot of ground rules here. At least for now, this is a -//! place to put any kind of runtime tooling for Omicron that seems useful. -//! You can query the database directly (see notes in db.rs), use internal -//! APIs, etc. To the degree that we can stick to stable interfaces, great. -//! But at this stage we'd rather have tools that work on latest than not -//! have them because we couldn't prioritize keeping them stable. -//! -//! 2. Debuggers should never lie! Documentation and command names should be -//! precise about what they're reporting. 
In a working system, these things -//! might all be the same: -//! -//! - the list of instances with zones and propolis processes running on -//! a sled -//! - the list of instances that sled agent knows about -//! - the list of instances that Nexus or the database reports should be -//! running on a sled -//! -//! But in a broken system, these things might be all different. People use -//! debuggers to understand broken systems. The debugger should say which of -//! these it's reporting, rather than "the list of instances on a sled". -//! -//! 3. Where possible, when the tool encounters something unexpected, it should -//! print what it can (including the error message and bad data) and then -//! continue. It generally shouldn't stop on the first error. (We often -//! find strange things when debugging but we need our tools to tell us as -//! much as they can!) +//! This is a small shim over `lib.rs`, and is structured this way so that other +//! crates can depend on omicron-omdb as a library. 
-use anyhow::Context; -use anyhow::anyhow; -use anyhow::ensure; -use clap::Args; -use clap::ColorChoice; use clap::Parser; -use clap::Subcommand; -use futures::StreamExt; -use internal_dns_types::names::ServiceName; -use omicron_common::address::Ipv6Subnet; -use std::net::SocketAddr; -use std::net::SocketAddrV6; -use tokio::net::TcpSocket; - -mod crucible_agent; -mod crucible_pantry; -mod db; -mod helpers; -mod mgs; -mod nexus; -mod oximeter; -mod oxql; -mod reconfigurator; -mod sled_agent; -mod support_bundle; +use omicron_omdb::Omdb; fn main() -> Result<(), anyhow::Error> { sigpipe::reset(); - oxide_tokio_rt::run(main_impl()) -} - -async fn main_impl() -> Result<(), anyhow::Error> { - let args = Omdb::parse(); - - let log = dropshot::ConfigLogging::StderrTerminal { - level: args.log_level.clone(), - } - .to_logger("omdb") - .context("failed to create logger")?; - - match &args.command { - OmdbCommands::Db(db) => db.run_cmd(&args, &log).await, - OmdbCommands::Mgs(mgs) => mgs.run_cmd(&args, &log).await, - OmdbCommands::Nexus(nexus) => nexus.run_cmd(&args, &log).await, - OmdbCommands::Oximeter(oximeter) => oximeter.run_cmd(&args, &log).await, - OmdbCommands::Oxql(oxql) => oxql.run_cmd(&args, &log).await, - OmdbCommands::Reconfigurator(reconfig) => { - reconfig.run_cmd(&args, &log).await - } - OmdbCommands::SledAgent(sled) => sled.run_cmd(&args, &log).await, - OmdbCommands::CrucibleAgent(crucible) => crucible.run_cmd(&args).await, - OmdbCommands::CruciblePantry(crucible) => crucible.run_cmd(&args).await, - } -} - -/// Omicron debugger (unstable) -/// -/// This tool provides commands for directly querying Omicron components about -/// their internal state using internal APIs. This is a prototype. The -/// commands and output are unstable and may change. 
-#[derive(Debug, Parser)] -struct Omdb { - /// log level filter - #[arg( - env, - long, - value_parser = parse_dropshot_log_level, - default_value = "warn", - global = true, - )] - log_level: dropshot::ConfigLoggingLevel, - - #[arg( - long, - env = "OMDB_DNS_SERVER", - global = true, - help_heading = helpers::CONNECTION_OPTIONS_HEADING, - )] - dns_server: Option, - - /// Allow potentially-destructive subcommands. - #[arg( - short = 'w', - long = "destructive", - global = true, - help_heading = helpers::SAFETY_OPTIONS_HEADING, - )] - allow_destructive: bool, - - #[command(flatten)] - output: OutputOpts, - - #[command(subcommand)] - command: OmdbCommands, -} - -#[derive(Debug, Args)] -struct OutputOpts { - /// Color output - #[arg(long, global = true, value_enum, default_value_t)] - color: ColorChoice, -} - -mod check_allow_destructive { - /// Zero-size type that potentially-destructive functions can accept to - /// ensure `Omdb::check_allow_destructive` has been called. - // This is tucked away inside a module to prevent it from being constructed - // by anything other than `Omdb::check_allow_destructive`. - #[must_use] - pub(crate) struct DestructiveOperationToken(()); - - impl super::Omdb { - pub(crate) fn check_allow_destructive( - &self, - ) -> anyhow::Result { - anyhow::ensure!( - self.allow_destructive, - "This command is potentially destructive. \ - Pass the `-w` / `--destructive` flag to allow it." 
- ); - Ok(DestructiveOperationToken(())) - } - } -} - -impl Omdb { - /// Return the socket addresses of all instances of a service in DNS - async fn dns_lookup_all( - &self, - log: slog::Logger, - service_name: ServiceName, - ) -> Result, anyhow::Error> { - let resolver = self.dns_resolver(log).await?; - resolver - .lookup_all_socket_v6(service_name) - .await - .with_context(|| format!("looking up {:?} in DNS", service_name)) - } - - /// Return the socket address of one instance of a service that we can at - /// least successfully connect to - async fn dns_lookup_one( - &self, - log: slog::Logger, - service_name: ServiceName, - ) -> Result { - let addrs = self.dns_lookup_all(log, service_name).await?; - ensure!( - !addrs.is_empty(), - "expected at least one address from successful DNS lookup for {:?}", - service_name - ); - - // The caller is going to pick one of these addresses to connect to. - // Let's try to pick one that's at least not obviously broken by - // attempting to connect to whatever we found and returning any that we - // successfully connected to. It'd be nice if we could return the - // socket directly, but our callers are creating reqwest clients that - // cannot easily consume a socket directly. - // - // This approach scales poorly and there are many failure modes that - // this does not cover. But in the absence of better connection - // management, and with the risks in `omdb` being pretty low, and the - // value of it working pretty high, here we are. This approach should - // not be replicated elsewhere. - async fn try_connect( - sockaddr_v6: SocketAddrV6, - ) -> Result<(), anyhow::Error> { - let _ = TcpSocket::new_v6() - .context("creating socket")? 
- .connect(SocketAddr::from(sockaddr_v6)) - .await - .with_context(|| format!("connect \"{}\"", sockaddr_v6))?; - Ok(()) - } - - let mut socket_stream = futures::stream::iter(addrs) - .map(async move |sockaddr_v6| { - (sockaddr_v6, try_connect(sockaddr_v6).await) - }) - .buffer_unordered(3); - - while let Some((sockaddr, connect_result)) = socket_stream.next().await - { - match connect_result { - Ok(()) => return Ok(sockaddr), - Err(error) => { - eprintln!( - "warning: failed to connect to {:?} at {}: {:#}", - service_name, sockaddr, error - ); - } - } - } - - Err(anyhow!("failed to connect to any instances of {:?}", service_name)) - } - - async fn dns_resolver( - &self, - log: slog::Logger, - ) -> Result { - match &self.dns_server { - Some(dns_server) => { - internal_dns_resolver::Resolver::new_from_addrs( - log, - &[*dns_server], - ) - .with_context(|| { - format!( - "creating DNS resolver for DNS server {:?}", - dns_server - ) - }) - } - None => { - // In principle, we should look at /etc/resolv.conf to find the - // DNS servers. In practice, this usually isn't populated - // today. See oxidecomputer/omicron#2122. - // - // However, the address selected below should work for most - // existing Omicron deployments today. That's because while the - // base subnet is in principle configurable in config-rss.toml, - // it's very uncommon to change it from the default value used - // here. - // - // Yet another option would be to find a local IP address that - // looks like it's probably on the underlay network and use that - // to find the subnet to use. But again, this is unlikely to be - // wrong and it's easy to override. 
- let subnet = - Ipv6Subnet::new("fd00:1122:3344:0100::".parse().unwrap()); - eprintln!("note: using DNS server for subnet {}", subnet.net()); - eprintln!( - "note: (if this is not right, use --dns-server \ - to specify an alternate DNS server)", - ); - internal_dns_resolver::Resolver::new_from_subnet(log, subnet) - .with_context(|| { - format!( - "creating DNS resolver for subnet {}", - subnet.net() - ) - }) - } - } - } -} - -#[derive(Debug, Subcommand)] -#[allow(clippy::large_enum_variant)] -enum OmdbCommands { - /// Debug a specific crucible-agent - CrucibleAgent(crucible_agent::CrucibleAgentArgs), - /// Query a specific crucible-pantry - CruciblePantry(crucible_pantry::CruciblePantryArgs), - /// Query the control plane database (CockroachDB) - Db(db::DbArgs), - /// Debug a specific Management Gateway Service instance - Mgs(mgs::MgsArgs), - /// Debug a specific Nexus instance - Nexus(nexus::NexusArgs), - /// Query oximeter collector state - Oximeter(oximeter::OximeterArgs), - /// Enter the Oximeter Query Language shell for interactive querying. 
- Oxql(oxql::OxqlArgs), - /// Interact with the Reconfigurator system - Reconfigurator(reconfigurator::ReconfiguratorArgs), - /// Debug a specific Sled - SledAgent(sled_agent::SledAgentArgs), -} - -fn parse_dropshot_log_level( - s: &str, -) -> Result { - serde_json::from_str(&format!("{:?}", s)).context("parsing log level") + oxide_tokio_rt::run(async { + let cmd = Omdb::parse(); + cmd.exec().await + }) } diff --git a/dev-tools/omdb/src/bin/omdb/crucible_agent.rs b/dev-tools/omdb/src/crucible_agent.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/crucible_agent.rs rename to dev-tools/omdb/src/crucible_agent.rs diff --git a/dev-tools/omdb/src/bin/omdb/crucible_pantry.rs b/dev-tools/omdb/src/crucible_pantry.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/crucible_pantry.rs rename to dev-tools/omdb/src/crucible_pantry.rs diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/db.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/db.rs rename to dev-tools/omdb/src/db.rs diff --git a/dev-tools/omdb/src/bin/omdb/db/alert.rs b/dev-tools/omdb/src/db/alert.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/db/alert.rs rename to dev-tools/omdb/src/db/alert.rs diff --git a/dev-tools/omdb/src/bin/omdb/db/blueprints.rs b/dev-tools/omdb/src/db/blueprints.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/db/blueprints.rs rename to dev-tools/omdb/src/db/blueprints.rs diff --git a/dev-tools/omdb/src/bin/omdb/db/db_metadata.rs b/dev-tools/omdb/src/db/db_metadata.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/db/db_metadata.rs rename to dev-tools/omdb/src/db/db_metadata.rs diff --git a/dev-tools/omdb/src/bin/omdb/db/ereport.rs b/dev-tools/omdb/src/db/ereport.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/db/ereport.rs rename to dev-tools/omdb/src/db/ereport.rs diff --git a/dev-tools/omdb/src/bin/omdb/db/saga.rs b/dev-tools/omdb/src/db/saga.rs similarity index 
100% rename from dev-tools/omdb/src/bin/omdb/db/saga.rs rename to dev-tools/omdb/src/db/saga.rs diff --git a/dev-tools/omdb/src/bin/omdb/db/sitrep.rs b/dev-tools/omdb/src/db/sitrep.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/db/sitrep.rs rename to dev-tools/omdb/src/db/sitrep.rs diff --git a/dev-tools/omdb/src/bin/omdb/db/user_data_export.rs b/dev-tools/omdb/src/db/user_data_export.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/db/user_data_export.rs rename to dev-tools/omdb/src/db/user_data_export.rs diff --git a/dev-tools/omdb/src/bin/omdb/db/whatis.rs b/dev-tools/omdb/src/db/whatis.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/db/whatis.rs rename to dev-tools/omdb/src/db/whatis.rs diff --git a/dev-tools/omdb/src/bin/omdb/helpers.rs b/dev-tools/omdb/src/helpers.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/helpers.rs rename to dev-tools/omdb/src/helpers.rs diff --git a/dev-tools/omdb/src/lib.rs b/dev-tools/omdb/src/lib.rs new file mode 100644 index 00000000000..6a4a84c904b --- /dev/null +++ b/dev-tools/omdb/src/lib.rs @@ -0,0 +1,317 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Omicron debugger (omdb) - library interface +//! +//! This module exposes omdb's CLI functionality as a library, allowing other +//! crates to create their own omdb binaries. +//! +//! GROUND RULES: +//! +//! 1. There aren't a lot of ground rules here. At least for now, this is a +//! place to put any kind of runtime tooling for Omicron that seems useful. +//! You can query the database directly (see notes in db.rs), use internal +//! APIs, etc. To the degree that we can stick to stable interfaces, great. +//! But at this stage we'd rather have tools that work on latest than not +//! have them because we couldn't prioritize keeping them stable. 
+//! +//! 2. Debuggers should never lie! Documentation and command names should be +//! precise about what they're reporting. In a working system, these things +//! might all be the same: +//! +//! - the list of instances with zones and propolis processes running on +//! a sled +//! - the list of instances that sled agent knows about +//! - the list of instances that Nexus or the database reports should be +//! running on a sled +//! +//! But in a broken system, these things might be all different. People use +//! debuggers to understand broken systems. The debugger should say which of +//! these it's reporting, rather than "the list of instances on a sled". +//! +//! 3. Where possible, when the tool encounters something unexpected, it should +//! print what it can (including the error message and bad data) and then +//! continue. It generally shouldn't stop on the first error. (We often +//! find strange things when debugging but we need our tools to tell us as +//! much as they can!) + +use anyhow::Context; +use anyhow::anyhow; +use anyhow::ensure; +use clap::Args; +use clap::ColorChoice; +use clap::Parser; +use clap::Subcommand; +use futures::StreamExt; +use internal_dns_types::names::ServiceName; +use omicron_common::address::Ipv6Subnet; +use std::net::SocketAddr; +use std::net::SocketAddrV6; +use tokio::net::TcpSocket; + +mod crucible_agent; +mod crucible_pantry; +mod db; +mod helpers; +mod mgs; +mod nexus; +mod oximeter; +mod oxql; +mod reconfigurator; +mod sled_agent; +mod support_bundle; + +/// Omicron debugger (unstable) +/// +/// This tool provides commands for directly querying Omicron components about +/// their internal state using internal APIs. This is a prototype. The +/// commands and output are unstable and may change. 
+#[derive(Debug, Parser)] +pub struct Omdb { + /// log level filter + #[arg( + env, + long, + value_parser = parse_dropshot_log_level, + default_value = "warn", + global = true, + )] + log_level: dropshot::ConfigLoggingLevel, + + #[arg( + long, + env = "OMDB_DNS_SERVER", + global = true, + help_heading = helpers::CONNECTION_OPTIONS_HEADING, + )] + dns_server: Option<SocketAddr>, + + /// Allow potentially-destructive subcommands. + #[arg( + short = 'w', + long = "destructive", + global = true, + help_heading = helpers::SAFETY_OPTIONS_HEADING, + )] + allow_destructive: bool, + + #[command(flatten)] + output: OutputOpts, + + #[command(subcommand)] + command: OmdbCommands, +} + +impl Omdb { + /// Execute the omdb command. + pub async fn exec(self) -> Result<(), anyhow::Error> { + let log = dropshot::ConfigLogging::StderrTerminal { + level: self.log_level.clone(), + } + .to_logger("omdb") + .context("failed to create logger")?; + + match &self.command { + OmdbCommands::Db(db) => db.run_cmd(&self, &log).await, + OmdbCommands::Mgs(mgs) => mgs.run_cmd(&self, &log).await, + OmdbCommands::Nexus(nexus) => nexus.run_cmd(&self, &log).await, + OmdbCommands::Oximeter(oximeter) => { + oximeter.run_cmd(&self, &log).await + } + OmdbCommands::Oxql(oxql) => oxql.run_cmd(&self, &log).await, + OmdbCommands::Reconfigurator(reconfig) => { + reconfig.run_cmd(&self, &log).await + } + OmdbCommands::SledAgent(sled) => sled.run_cmd(&self, &log).await, + OmdbCommands::CrucibleAgent(crucible) => { + crucible.run_cmd(&self).await + } + OmdbCommands::CruciblePantry(crucible) => { + crucible.run_cmd(&self).await + } + } + } +} + +#[derive(Debug, Args)] +struct OutputOpts { + /// Color output + #[arg(long, global = true, value_enum, default_value_t)] + color: ColorChoice, +} + +mod check_allow_destructive { + /// Zero-size type that potentially-destructive functions can accept to + /// ensure `Omdb::check_allow_destructive` has been called. 
+ // This is tucked away inside a module to prevent it from being constructed + // by anything other than `Omdb::check_allow_destructive`. + #[must_use] + pub(crate) struct DestructiveOperationToken(()); + + impl super::Omdb { + pub(crate) fn check_allow_destructive( + &self, + ) -> anyhow::Result<DestructiveOperationToken> { + anyhow::ensure!( + self.allow_destructive, + "This command is potentially destructive. \ + Pass the `-w` / `--destructive` flag to allow it." + ); + Ok(DestructiveOperationToken(())) + } + } +} + +impl Omdb { + /// Return the socket addresses of all instances of a service in DNS + async fn dns_lookup_all( + &self, + log: slog::Logger, + service_name: ServiceName, + ) -> Result<Vec<SocketAddrV6>, anyhow::Error> { + let resolver = self.dns_resolver(log).await?; + resolver + .lookup_all_socket_v6(service_name) + .await + .with_context(|| format!("looking up {:?} in DNS", service_name)) + } + + /// Return the socket address of one instance of a service that we can at + /// least successfully connect to + async fn dns_lookup_one( + &self, + log: slog::Logger, + service_name: ServiceName, + ) -> Result<SocketAddrV6, anyhow::Error> { + let addrs = self.dns_lookup_all(log, service_name).await?; + ensure!( + !addrs.is_empty(), + "expected at least one address from successful DNS lookup for {:?}", + service_name + ); + + // The caller is going to pick one of these addresses to connect to. + // Let's try to pick one that's at least not obviously broken by + // attempting to connect to whatever we found and returning any that we + // successfully connected to. It'd be nice if we could return the + // socket directly, but our callers are creating reqwest clients that + // cannot easily consume a socket directly. + // + // This approach scales poorly and there are many failure modes that + // this does not cover. But in the absence of better connection + // management, and with the risks in `omdb` being pretty low, and the + // value of it working pretty high, here we are. This approach should + // not be replicated elsewhere. 
+ async fn try_connect( + sockaddr_v6: SocketAddrV6, + ) -> Result<(), anyhow::Error> { + let _ = TcpSocket::new_v6() + .context("creating socket")? + .connect(SocketAddr::from(sockaddr_v6)) + .await + .with_context(|| format!("connect \"{}\"", sockaddr_v6))?; + Ok(()) + } + + let mut socket_stream = futures::stream::iter(addrs) + .map(async move |sockaddr_v6| { + (sockaddr_v6, try_connect(sockaddr_v6).await) + }) + .buffer_unordered(3); + + while let Some((sockaddr, connect_result)) = socket_stream.next().await + { + match connect_result { + Ok(()) => return Ok(sockaddr), + Err(error) => { + eprintln!( + "warning: failed to connect to {:?} at {}: {:#}", + service_name, sockaddr, error + ); + } + } + } + + Err(anyhow!("failed to connect to any instances of {:?}", service_name)) + } + + async fn dns_resolver( + &self, + log: slog::Logger, + ) -> Result<internal_dns_resolver::Resolver, anyhow::Error> { + match &self.dns_server { + Some(dns_server) => { + internal_dns_resolver::Resolver::new_from_addrs( + log, + &[*dns_server], + ) + .with_context(|| { + format!( + "creating DNS resolver for DNS server {:?}", + dns_server + ) + }) + } + None => { + // In principle, we should look at /etc/resolv.conf to find the + // DNS servers. In practice, this usually isn't populated + // today. See oxidecomputer/omicron#2122. + // + // However, the address selected below should work for most + // existing Omicron deployments today. That's because while the + // base subnet is in principle configurable in config-rss.toml, + // it's very uncommon to change it from the default value used + // here. + // + // Yet another option would be to find a local IP address that + // looks like it's probably on the underlay network and use that + // to find the subnet to use. But again, this is unlikely to be + // wrong and it's easy to override. 
+ let subnet = + Ipv6Subnet::new("fd00:1122:3344:0100::".parse().unwrap()); + eprintln!("note: using DNS server for subnet {}", subnet.net()); + eprintln!( + "note: (if this is not right, use --dns-server \ + to specify an alternate DNS server)", + ); + internal_dns_resolver::Resolver::new_from_subnet(log, subnet) + .with_context(|| { + format!( + "creating DNS resolver for subnet {}", + subnet.net() + ) + }) + } + } + } +} + +#[derive(Debug, Subcommand)] +#[allow(clippy::large_enum_variant)] +enum OmdbCommands { + /// Debug a specific crucible-agent + CrucibleAgent(crucible_agent::CrucibleAgentArgs), + /// Query a specific crucible-pantry + CruciblePantry(crucible_pantry::CruciblePantryArgs), + /// Query the control plane database (CockroachDB) + Db(db::DbArgs), + /// Debug a specific Management Gateway Service instance + Mgs(mgs::MgsArgs), + /// Debug a specific Nexus instance + Nexus(nexus::NexusArgs), + /// Query oximeter collector state + Oximeter(oximeter::OximeterArgs), + /// Enter the Oximeter Query Language shell for interactive querying. 
+ Oxql(oxql::OxqlArgs), + /// Interact with the Reconfigurator system + Reconfigurator(reconfigurator::ReconfiguratorArgs), + /// Debug a specific Sled + SledAgent(sled_agent::SledAgentArgs), +} + +fn parse_dropshot_log_level( + s: &str, +) -> Result { + serde_json::from_str(&format!("{:?}", s)).context("parsing log level") +} diff --git a/dev-tools/omdb/src/bin/omdb/mgs.rs b/dev-tools/omdb/src/mgs.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/mgs.rs rename to dev-tools/omdb/src/mgs.rs diff --git a/dev-tools/omdb/src/bin/omdb/mgs/dashboard.rs b/dev-tools/omdb/src/mgs/dashboard.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/mgs/dashboard.rs rename to dev-tools/omdb/src/mgs/dashboard.rs diff --git a/dev-tools/omdb/src/bin/omdb/mgs/sensors.rs b/dev-tools/omdb/src/mgs/sensors.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/mgs/sensors.rs rename to dev-tools/omdb/src/mgs/sensors.rs diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/nexus.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/nexus.rs rename to dev-tools/omdb/src/nexus.rs diff --git a/dev-tools/omdb/src/bin/omdb/nexus/quiesce.rs b/dev-tools/omdb/src/nexus/quiesce.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/nexus/quiesce.rs rename to dev-tools/omdb/src/nexus/quiesce.rs diff --git a/dev-tools/omdb/src/bin/omdb/nexus/reconfigurator_config.rs b/dev-tools/omdb/src/nexus/reconfigurator_config.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/nexus/reconfigurator_config.rs rename to dev-tools/omdb/src/nexus/reconfigurator_config.rs diff --git a/dev-tools/omdb/src/bin/omdb/nexus/update_status.rs b/dev-tools/omdb/src/nexus/update_status.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/nexus/update_status.rs rename to dev-tools/omdb/src/nexus/update_status.rs diff --git a/dev-tools/omdb/src/bin/omdb/oximeter.rs b/dev-tools/omdb/src/oximeter.rs similarity index 100% rename from 
dev-tools/omdb/src/bin/omdb/oximeter.rs rename to dev-tools/omdb/src/oximeter.rs diff --git a/dev-tools/omdb/src/bin/omdb/oxql.rs b/dev-tools/omdb/src/oxql.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/oxql.rs rename to dev-tools/omdb/src/oxql.rs diff --git a/dev-tools/omdb/src/bin/omdb/reconfigurator.rs b/dev-tools/omdb/src/reconfigurator.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/reconfigurator.rs rename to dev-tools/omdb/src/reconfigurator.rs diff --git a/dev-tools/omdb/src/bin/omdb/sled_agent.rs b/dev-tools/omdb/src/sled_agent.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/sled_agent.rs rename to dev-tools/omdb/src/sled_agent.rs diff --git a/dev-tools/omdb/src/bin/omdb/support_bundle.rs b/dev-tools/omdb/src/support_bundle.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/support_bundle.rs rename to dev-tools/omdb/src/support_bundle.rs diff --git a/nexus/Cargo.toml b/nexus/Cargo.toml index 81d4ed9bbfb..0827f1be1fd 100644 --- a/nexus/Cargo.toml +++ b/nexus/Cargo.toml @@ -132,8 +132,10 @@ nexus-reconfigurator-preparation.workspace = true nexus-reconfigurator-rendezvous.workspace = true nexus-types.workspace = true omicron-common.workspace = true +omicron-omdb.workspace = true omicron-passwords.workspace = true oxide-tokio-rt.workspace = true +sigpipe.workspace = true oximeter.workspace = true oximeter-instruments = { workspace = true, features = ["http-instruments"] } oximeter-producer.workspace = true @@ -199,3 +201,7 @@ harness = false [[bin]] name = "nexus" doc = false + +[[bin]] +name = "omdb-dup" +doc = false diff --git a/nexus/src/bin/omdb-dup.rs b/nexus/src/bin/omdb-dup.rs new file mode 100644 index 00000000000..d2596968b6f --- /dev/null +++ b/nexus/src/bin/omdb-dup.rs @@ -0,0 +1,25 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. 
If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! A copy of omdb's `main.rs`. +//! +//! This is a workaround for the fact that Cargo only lets integration tests use +//! binaries defined in the same crate. We'd like two sets of integration tests +//! against omdb: quicker ones that live in that crate, and slower ones that +//! depend on Nexus and live here. +//! +//! The tests don't have to use omdb as a binary. They could also use it as a +//! library, but doing that properly would require stdout and stderr to be +//! redirected to in-memory buffers. This small binary works around that. + +use clap::Parser; +use omicron_omdb::Omdb; + +fn main() -> Result<(), anyhow::Error> { + sigpipe::reset(); + oxide_tokio_rt::run(async { + let cmd = Omdb::parse(); + cmd.exec().await + }) +} diff --git a/nexus/test-utils-macros/src/lib.rs b/nexus/test-utils-macros/src/lib.rs index 767f09b9d39..96408b12701 100644 --- a/nexus/test-utils-macros/src/lib.rs +++ b/nexus/test-utils-macros/src/lib.rs @@ -130,6 +130,21 @@ pub fn nexus_test(attrs: TokenStream, input: TokenStream) -> TokenStream { #func_ident_string, ) .with_extra_sled_agents(#extra_sled_agents) + .customize_nexus_config(&|config| { + // Set omdb binary path from CARGO_BIN_EXE_omdb-dup if available. + // This env var is set by cargo test/nextest for binaries in the + // same package - but it's only accessible to integration tests + // and benchmarks. + // + // We use option_env!() here (which expands in test code) to + // avoid compile errors during cargo check when the binary + // doesn't exist. If the env var isn't set, we leave the path + // unchanged (it uses a default). 
+ if let Some(omdb_path) = option_env!("CARGO_BIN_EXE_omdb-dup") { + config.pkg.omdb.bin_path = + ::camino::Utf8PathBuf::from(omdb_path); + } + }) .start::<#which_nexus>() .await; #func_ident(&ctx).await; diff --git a/nexus/test-utils/src/starter.rs b/nexus/test-utils/src/starter.rs index cac54381db1..412959d3d63 100644 --- a/nexus/test-utils/src/starter.rs +++ b/nexus/test-utils/src/starter.rs @@ -560,33 +560,6 @@ impl<'a, N: NexusServer> ControlPlaneStarter<'a, N> { .clone(), }; - // Configure the omdb binary path for tests. - // The binary is built by cargo at the workspace root in target//omdb. - // Tests run from the nexus directory, so we need to go up one level. - let workspace_root = std::env::current_dir() - .expect("Failed to get current directory") - .parent() - .expect("Current directory should have a parent") - .to_path_buf(); - let omdb_debug = workspace_root.join("target/debug/omdb"); - let omdb_release = workspace_root.join("target/release/omdb"); - - self.config.pkg.omdb.bin_path = if omdb_release.exists() { - camino::Utf8PathBuf::try_from(omdb_release) - .expect("Failed to convert release path to UTF-8") - } else if omdb_debug.exists() { - camino::Utf8PathBuf::try_from(omdb_debug) - .expect("Failed to convert debug path to UTF-8") - } else { - // omdb hasn't been built yet - use a path that will fail gracefully - // when tests try to use it. - // - // Our rules in ".config/nextest.toml" should prevent this, but this - // acts as a defensive buffer against running without nextest, or - // changing the directory layout. - camino::Utf8PathBuf::from("/nonexistent/omdb") - }; - let nexus_internal = N::start_internal(&self.config, &log).await?; let nexus_internal_addr = nexus_internal.get_http_server_internal_address();