From 6d3b503c6dc86af09661a098b3e159156bc70cd9 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 17 Oct 2025 10:49:17 -0700 Subject: [PATCH 01/18] [support bundle] Refactor into tasks --- .../tasks/support_bundle_collector.rs | 682 ++++++++++++------ 1 file changed, 457 insertions(+), 225 deletions(-) diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 8dc13e7ab42..ee2224edba8 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -59,12 +59,11 @@ use std::future::Future; use std::io::Write; use std::num::NonZeroU64; use std::sync::Arc; -use std::sync::atomic::{AtomicUsize, Ordering}; use tokio::io::AsyncReadExt; use tokio::io::AsyncSeekExt; use tokio::io::AsyncWriteExt; use tokio::io::SeekFrom; -use tokio_util::task::AbortOnDropHandle; +use tokio::sync::OnceCell; use tufaceous_artifact::ArtifactHash; use uuid::Uuid; use zip::ZipArchive; @@ -428,8 +427,6 @@ impl SupportBundleCollector { request: request.clone(), bundle: bundle.clone(), transfer_chunk_size: request.transfer_chunk_size, - host_ereports_collected: AtomicUsize::new(0), - sp_ereports_collected: AtomicUsize::new(0), }); let authz_bundle = authz_support_bundle_from_id(bundle.id.into()); @@ -475,8 +472,60 @@ struct BundleCollection { request: BundleRequest, bundle: SupportBundle, transfer_chunk_size: NonZeroU64, - host_ereports_collected: AtomicUsize, - sp_ereports_collected: AtomicUsize, +} + +type CollectionStepFn = Box< + dyn for<'b> FnOnce( + &'b Arc, + &'b Utf8Path, + ) + -> BoxFuture<'b, anyhow::Result> + + Send, +>; + +enum CollectionStepOutput { + HostEreports(SupportBundleEreportStatus), + SpEreports(SupportBundleEreportStatus), + SavingSpDumps { listed_sps: bool }, + // NOTE: The ditinction between this and "Spawn" is pretty artificial - + // it's just to preserve a part of the report which says "we tried to + // list 
in-service sleds". + // + // If we changed the collection report, this could easily be combined + // with the "Spawn" variant. + SpawnSleds { extra_steps: Vec<(&'static str, CollectionStepFn)> }, + Spawn { extra_steps: Vec<(&'static str, CollectionStepFn)> }, + None, +} + +impl CollectionStepOutput { + // Updates the collection report based on the output of a collection step, + // and possibly extends the set of all steps to be executed. + fn process( + self, + report: &mut SupportBundleCollectionReport, + steps: &mut Vec<(&'static str, CollectionStepFn)>, + ) { + match self { + CollectionStepOutput::HostEreports(status) => { + report.host_ereports = status; + } + CollectionStepOutput::SpEreports(status) => { + report.sp_ereports = status; + } + CollectionStepOutput::SavingSpDumps { listed_sps } => { + report.listed_sps = listed_sps; + } + CollectionStepOutput::SpawnSleds { extra_steps } => { + report.listed_in_service_sleds = true; + steps.extend(extra_steps); + } + CollectionStepOutput::Spawn { extra_steps } => { + steps.extend(extra_steps); + } + CollectionStepOutput::None => (), + } + } } impl BundleCollection { @@ -656,37 +705,72 @@ impl BundleCollection { Ok(()) } - // Perform the work of collecting the support bundle into a temporary directory - // - // - "dir" is a directory where data can be stored. - // - "bundle" is metadata about the bundle being collected. - // - // If a partial bundle can be collected, it should be returned as - // an Ok(SupportBundleCollectionReport). Any failures from this function - // will prevent the support bundle from being collected altogether. - // - // NOTE: The background task infrastructure will periodically check to see - // if the bundle has been cancelled by a user while it is being collected. - // If that happens, this function will be CANCELLED at an await point. - // - // As a result, it is important that this function be implemented as - // cancel-safe. 
- async fn collect_bundle_as_file( + async fn run_collect_bundle_steps( self: &Arc, - dir: &Utf8TempDir, - ) -> anyhow::Result { - let log = &self.log; - - info!(&log, "Collecting bundle as local file"); + output: &Utf8TempDir, + mut steps: Vec<(&'static str, CollectionStepFn)>, + ) -> SupportBundleCollectionReport { let mut report = SupportBundleCollectionReport::new(self.bundle.id.into()); - tokio::fs::write( - dir.path().join("bundle_id.txt"), - self.bundle.id.to_string(), - ) - .await?; + const MAX_CONCURRENT_STEPS: usize = 16; + let mut tasks = + ParallelTaskSet::new_with_parallelism(MAX_CONCURRENT_STEPS); + + loop { + // Process all the currently-planned steps + while let Some((step_name, step)) = steps.pop() { + let previous_result = tasks.spawn({ + let collection = self.clone(); + let dir = output.path().to_path_buf(); + async move { + debug!(collection.log, "Running step"; "name" => &step_name); + step(&collection, dir.as_path()).await.inspect_err(|err| { + warn!( + collection.log, + "Step failed"; + "name" => &step_name, + InlineErrorChain::new(err.as_ref()), + ); + }) + } + }).await; + + if let Some(Ok(output)) = previous_result { + output.process(&mut report, &mut steps); + }; + } + + // If we've run out of tasks to spawn, join all the existing steps. + while let Some(previous_result) = tasks.join_next().await { + if let Ok(output) = previous_result { + output.process(&mut report, &mut steps); + }; + } + + // Executing steps may create additional steps, as follow-up work. + // + // Only finish if we've exhausted all possible steps and joined all spawned work. 
+ if steps.is_empty() { + return report; + } + } + } + async fn collect_bundle_id( + &self, + dir: &Utf8Path, + ) -> anyhow::Result { + tokio::fs::write(dir.join("bundle_id.txt"), self.bundle.id.to_string()) + .await?; + + Ok(CollectionStepOutput::None) + } + + async fn collect_reconfigurator_state( + &self, + dir: &Utf8Path, + ) -> anyhow::Result { // Collect reconfigurator state const NMAX_BLUEPRINTS: usize = 300; match reconfigurator_state_load( @@ -697,7 +781,7 @@ impl BundleCollection { .await { Ok(state) => { - let file_path = dir.path().join("reconfigurator_state.json"); + let file_path = dir.join("reconfigurator_state.json"); let file = std::fs::OpenOptions::new() .create(true) .write(true) @@ -713,7 +797,7 @@ impl BundleCollection { }, )?; info!( - log, + self.log, "Support bundle: collected reconfigurator state"; "target_blueprint" => ?state.target_blueprint, "num_blueprints" => state.blueprints.len(), @@ -722,152 +806,322 @@ impl BundleCollection { } Err(err) => { warn!( - log, + self.log, "Support bundle: failed to collect reconfigurator state"; "err" => ?err, ); } - } + }; + + Ok(CollectionStepOutput::None) + } - let ereport_collection = if let Some(ref ereport_filters) = - self.request.ereport_query + async fn collect_host_ereports( + self: &Arc, + dir: &Utf8Path, + ) -> anyhow::Result { + let Some(ref ereport_filters) = self.request.ereport_query else { + debug!(self.log, "Support bundle: ereports not requested"); + return Ok(CollectionStepOutput::None); + }; + let ereports_dir = dir.join("ereports"); + let status = match self + .save_host_ereports(ereport_filters.clone(), ereports_dir.clone()) + .await { - // If ereports are to be included in the bundle, have someone go do - // that in the background while we're gathering up other stuff. 
Note - // that the `JoinHandle`s for these tasks are wrapped in - // `AbortOnDropHandle`s for cancellation correctness; this ensures - // that if collecting the bundle is cancelled and this future is - // dropped, the tasks that we've spawned to collect ereports are - // aborted as well. - let dir = dir.path().join("ereports"); - let host = AbortOnDropHandle::new(tokio::spawn( - self.clone().collect_host_ereports( - ereport_filters.clone(), - dir.clone(), - ), - )); - let sp = AbortOnDropHandle::new(tokio::spawn( - self.clone().collect_sp_ereports(ereport_filters.clone(), dir), - )); - Some((host, sp)) - } else { - debug!(log, "Support bundle: ereports not requested"); - None + Ok(n_collected) => { + SupportBundleEreportStatus::Collected { n_collected } + } + Err((n_collected, err)) => { + warn!( + &self.log, + "Support bundle: host ereport collection failed \ + ({n_collected} collected successfully)"; + InlineErrorChain::new(err.as_ref()), + ); + + SupportBundleEreportStatus::Failed { + n_collected, + error: err.to_string(), + } + } }; - let all_sleds = self - .datastore - .sled_list_all_batched(&self.opctx, SledFilter::InService) - .await; + Ok(CollectionStepOutput::HostEreports(status)) + } - if let Ok(mgs_client) = self.create_mgs_client().await { - if let Err(e) = write_sled_info( - &self.log, - &mgs_client, - all_sleds.as_deref().ok(), - dir.path(), - ) + async fn collect_sp_ereports( + self: &Arc, + dir: &Utf8Path, + ) -> anyhow::Result { + let Some(ref ereport_filters) = self.request.ereport_query else { + debug!(self.log, "Support bundle: ereports not requested"); + return Ok(CollectionStepOutput::None); + }; + let ereports_dir = dir.join("ereports"); + let status = match self + .save_sp_ereports(ereport_filters.clone(), ereports_dir.clone()) .await - { - error!(log, "Failed to write sled_info.json"; "error" => InlineErrorChain::new(e.as_ref())); + { + Ok(n_collected) => { + SupportBundleEreportStatus::Collected { n_collected } } + Err((n_collected, err)) 
=> { + warn!( + &self.log, + "Support bundle: sp ereport collection failed \ + ({n_collected} collected successfully)"; + InlineErrorChain::new(err.as_ref()), + ); - let sp_dumps_dir = dir.path().join("sp_task_dumps"); - tokio::fs::create_dir_all(&sp_dumps_dir).await.with_context( - || { - format!( - "Failed to create SP task dump directory {sp_dumps_dir}" - ) - }, - )?; + SupportBundleEreportStatus::Failed { + n_collected, + error: err.to_string(), + } + } + }; - if let Err(e) = - save_all_sp_dumps(log, &mgs_client, &sp_dumps_dir).await - { - error!(log, "Failed to capture SP task dumps"; "error" => InlineErrorChain::new(e.as_ref())); - } else { - report.listed_sps = true; - }; - } else { - warn!(log, "No MGS client, skipping SP task dump collection"); - } + Ok(CollectionStepOutput::SpEreports(status)) + } - if let Ok(all_sleds) = all_sleds { - report.listed_in_service_sleds = true; + async fn get_or_initialize_mgs_client<'a>( + &self, + mgs_client: &'a OnceCell>>, + ) -> &'a Arc> { + mgs_client + .get_or_init(|| async { + Arc::new(self.create_mgs_client().await.ok()) + }) + .await + } - const MAX_CONCURRENT_SLED_REQUESTS: usize = 16; - const FAILURE_MESSAGE: &str = - "Failed to fully collect support bundle info from sled"; - let mut set = ParallelTaskSet::new_with_parallelism( - MAX_CONCURRENT_SLED_REQUESTS, + async fn get_or_initialize_all_sleds<'a>( + &self, + all_sleds: &'a OnceCell>>>, + ) -> &'a Arc>> { + all_sleds + .get_or_init(|| async { + Arc::new( + self.datastore + .sled_list_all_batched( + &self.opctx, + SledFilter::InService, + ) + .await + .ok(), + ) + }) + .await + } + + async fn collect_sled_cubby_info( + &self, + all_sleds: &OnceCell>>>, + mgs_client: &OnceCell>>, + dir: &Utf8Path, + ) -> anyhow::Result { + let Some(mgs_client) = + &**self.get_or_initialize_mgs_client(mgs_client).await + else { + warn!( + self.log, + "No MGS client, skipping sled cubby info collection" ); + return Ok(CollectionStepOutput::None); + }; + let nexus_sleds = self + 
.get_or_initialize_all_sleds(all_sleds) + .await + .as_deref() + .unwrap_or_default(); + + write_sled_cubby_info(&self.log, mgs_client, nexus_sleds, dir).await?; + + Ok(CollectionStepOutput::None) + } + + async fn spawn_sp_dump_collection( + &self, + mgs_client: &OnceCell>>, + dir: &Utf8Path, + ) -> anyhow::Result { + let Some(mgs_client) = + &**self.get_or_initialize_mgs_client(mgs_client).await + else { + warn!(self.log, "No MGS client, skipping SP task dump collection"); + return Ok(CollectionStepOutput::None); + }; - for sled in all_sleds { - let prev_result = set - .spawn({ - let collection: Arc = self.clone(); - let dir = dir.path().to_path_buf(); + let sp_dumps_dir = dir.join("sp_task_dumps"); + tokio::fs::create_dir_all(&sp_dumps_dir).await.with_context(|| { + format!("Failed to create SP task dump directory {sp_dumps_dir}") + })?; + + let mut extra_steps: Vec<(&'static str, CollectionStepFn)> = vec![]; + for sp in get_available_sps(&mgs_client).await? { + extra_steps.push(( + "sp dump", + Box::new({ + let mgs_client = mgs_client.clone(); + move |collection, dir| { async move { - collection.collect_data_from_sled(&sled, &dir).await + collection + .collect_sp_dump(&mgs_client, sp, dir) + .await } - }) - .await; - if let Some(Err(err)) = prev_result { - warn!(&self.log, "{FAILURE_MESSAGE}"; "err" => ?err); - } - } - while let Some(result) = set.join_next().await { - if let Err(err) = result { - warn!(&self.log, "{FAILURE_MESSAGE}"; "err" => ?err); - } - } + .boxed() + } + }), + )); } - if let Some((host, sp)) = ereport_collection { - let (host, sp) = tokio::join!(host, sp); - const TASK_FAILURE_MSG: &str = "task failed"; - let n_collected = - self.host_ereports_collected.load(Ordering::Acquire); - report.host_ereports = match host - .map_err(|e| anyhow::anyhow!("{TASK_FAILURE_MSG}: {e}")) - .and_then(|x| x) - { - Ok(_) => SupportBundleEreportStatus::Collected { n_collected }, - Err(err) => { - warn!( - &self.log, - "Support bundle: host ereport collection 
failed \ - ({n_collected} collected successfully)"; - "err" => ?err, - ); - SupportBundleEreportStatus::Failed { - n_collected, - error: err.to_string(), + Ok(CollectionStepOutput::Spawn { extra_steps }) + } + + async fn collect_sp_dump( + &self, + mgs_client: &MgsClient, + sp: SpIdentifier, + dir: &Utf8Path, + ) -> anyhow::Result { + save_sp_dumps(mgs_client, sp, dir) + .await + .with_context(|| format!("SP {} {}", sp.type_, sp.slot))?; + + Ok(CollectionStepOutput::SavingSpDumps { listed_sps: true }) + } + + // Perform the work of collecting the support bundle into a temporary directory + // + // - "dir" is a directory where data can be stored. + // - "bundle" is metadata about the bundle being collected. + // + // If a partial bundle can be collected, it should be returned as + // an Ok(SupportBundleCollectionReport). Any failures from this function + // will prevent the support bundle from being collected altogether. + // + // NOTE: The background task infrastructure will periodically check to see + // if the bundle has been cancelled by a user while it is being collected. + // If that happens, this function will be CANCELLED at an await point. + // + // As a result, it is important that this function be implemented as + // cancel-safe. 
+ async fn collect_bundle_as_file( + self: &Arc, + dir: &Utf8TempDir, + ) -> anyhow::Result { + let log = &self.log; + + info!(&log, "Collecting bundle as local file"); + + // Shared, lazy, fallible initialization for sleds + let all_sleds: OnceCell>>> = OnceCell::new(); + // Shared, lazy, fallible initialization for MGS client + let mgs_client: OnceCell>> = OnceCell::new(); + + let steps: Vec<(&str, CollectionStepFn)> = vec![ + ( + "bundle id", + Box::new(|collection, dir| { + collection.collect_bundle_id(dir).boxed() + }), + ), + ( + "reconfigurator state", + Box::new(|collection, dir| { + collection.collect_reconfigurator_state(dir).boxed() + }), + ), + ( + "host ereports", + Box::new(|collection, dir| { + collection.collect_host_ereports(dir).boxed() + }), + ), + ( + "sp ereports", + Box::new(|collection, dir| { + collection.collect_sp_ereports(dir).boxed() + }), + ), + ( + "sled cubby info", + Box::new({ + let all_sleds = all_sleds.clone(); + let mgs_client = mgs_client.clone(); + move |collection, dir| { + async move { + collection + .collect_sled_cubby_info( + &all_sleds, + &mgs_client, + dir, + ) + .await + } + .boxed() } - } - }; - let n_collected = - self.sp_ereports_collected.load(Ordering::Acquire); - report.sp_ereports = match sp - .map_err(|e| anyhow::anyhow!("{TASK_FAILURE_MSG}: {e}")) - .and_then(|x| x) - { - Ok(_) => SupportBundleEreportStatus::Collected { n_collected }, - Err(err) => { - warn!( - &self.log, - "Support bundle: SP ereport collection failed \ - ({n_collected} collected successfully)"; - "err" => ?err, - ); - SupportBundleEreportStatus::Failed { - n_collected, - error: err.to_string(), + }), + ), + ( + "spawn steps to query all sp dumps", + Box::new({ + let mgs_client = mgs_client.clone(); + move |collection, dir| { + async move { + collection + .spawn_sp_dump_collection(&mgs_client, dir) + .await + } + .boxed() } - } - }; + }), + ), + ( + "spawn steps to query all sleds", + Box::new({ + let all_sleds = all_sleds.clone(); + move 
|collection, _| { + async move { + collection.spawn_query_all_sleds(&all_sleds).await + } + .boxed() + } + }), + ), + ]; + + Ok(self.run_collect_bundle_steps(dir, steps).await) + } + + async fn spawn_query_all_sleds( + &self, + all_sleds: &OnceCell>>>, + ) -> anyhow::Result { + let Some(all_sleds) = + self.get_or_initialize_all_sleds(all_sleds).await.as_deref() + else { + warn!(self.log, "Could not read list of sleds"); + return Ok(CollectionStepOutput::None); + }; + + let mut extra_steps: Vec<(&'static str, CollectionStepFn)> = vec![]; + for sled in all_sleds { + extra_steps.push(( + "sled data", + Box::new({ + let sled = sled.clone(); + move |collection, dir| { + async move { + collection.collect_data_from_sled(&sled, dir).await + } + .boxed() + } + }), + )); } - Ok(report) + + return Ok(CollectionStepOutput::SpawnSleds { extra_steps }); } // Collect data from a sled, storing it into a directory that will @@ -880,7 +1134,7 @@ impl BundleCollection { &self, sled: &nexus_db_model::Sled, dir: &Utf8Path, - ) -> anyhow::Result<()> { + ) -> anyhow::Result { let log = &self.log; info!(&log, "Collecting bundle info from sled"; "sled" => %sled.id()); let sled_path = dir @@ -893,7 +1147,7 @@ impl BundleCollection { .await?; if self.request.skip_sled_info { - return Ok(()); + return Ok(CollectionStepOutput::None); } let Ok(sled_client) = nexus_networking::sled_client( @@ -909,7 +1163,7 @@ impl BundleCollection { "Could not contact sled", ) .await?; - return Ok(()); + return Ok(CollectionStepOutput::None); }; // NB: As new sled-diagnostic commands are added they should @@ -1014,14 +1268,15 @@ impl BundleCollection { error!(&self.log, "failed to write logs output: {e}"); } } - return Ok(()); + return Ok(CollectionStepOutput::None); } - async fn collect_sp_ereports( - self: Arc, + async fn save_host_ereports( + self: &Arc, filters: EreportFilters, dir: Utf8PathBuf, - ) -> anyhow::Result<()> { + ) -> Result { + let mut reports = 0; let mut paginator = Paginator::new( 
datastore::SQL_BATCH_SIZE, dropshot::PaginationOrder::Ascending, @@ -1029,40 +1284,50 @@ impl BundleCollection { while let Some(p) = paginator.next() { let ereports = self .datastore - .sp_ereports_fetch_matching( + .host_ereports_fetch_matching( &self.opctx, &filters, &p.current_pagparams(), ) .await .map_err(|e| { - e.internal_context("failed to query for SP ereports") + ( + reports, + e.internal_context( + "failed to query for host OS ereports", + ) + .into(), + ) })?; paginator = p.found_batch(&ereports, &|ereport| { (ereport.restart_id.into_untyped_uuid(), ereport.ena) }); - let n_ereports = ereports.len(); for ereport in ereports { - write_ereport(ereport.into(), &dir).await?; - self.sp_ereports_collected.fetch_add(1, Ordering::Release); + write_ereport(ereport.into(), &dir) + .await + .map_err(|e| (reports, e))?; + reports += 1; } - debug!(self.log, "Support bundle: added {n_ereports} SP ereports"); + debug!( + self.log, + "Support bundle: added {n_ereports} host OS ereports" + ); } info!( self.log, - "Support bundle: collected {} total SP ereports", - self.sp_ereports_collected.load(Ordering::Relaxed) + "Support bundle: collected {} total host ereports", reports ); - Ok(()) + Ok(reports) } - async fn collect_host_ereports( - self: Arc, + async fn save_sp_ereports( + self: &Arc, filters: EreportFilters, dir: Utf8PathBuf, - ) -> anyhow::Result<()> { + ) -> Result { + let mut reports = 0; let mut paginator = Paginator::new( datastore::SQL_BATCH_SIZE, dropshot::PaginationOrder::Ascending, @@ -1070,35 +1335,37 @@ impl BundleCollection { while let Some(p) = paginator.next() { let ereports = self .datastore - .host_ereports_fetch_matching( + .sp_ereports_fetch_matching( &self.opctx, &filters, &p.current_pagparams(), ) .await .map_err(|e| { - e.internal_context("failed to query for host OS ereports") + ( + reports, + e.internal_context("failed to query for SP ereports") + .into(), + ) })?; paginator = p.found_batch(&ereports, &|ereport| { 
(ereport.restart_id.into_untyped_uuid(), ereport.ena) }); let n_ereports = ereports.len(); for ereport in ereports { - write_ereport(ereport.into(), &dir).await?; - self.host_ereports_collected.fetch_add(1, Ordering::Release); + write_ereport(ereport.into(), &dir) + .await + .map_err(|e| (reports, e))?; + reports += 1; } - debug!( - self.log, - "Support bundle: added {n_ereports} host OS ereports" - ); + debug!(self.log, "Support bundle: added {n_ereports} SP ereports"); } info!( self.log, - "Support bundle: collected {} total host ereports", - self.host_ereports_collected.load(Ordering::Relaxed) + "Support bundle: collected {} total SP ereports", reports ); - Ok(()) + Ok(reports) } async fn create_mgs_client(&self) -> anyhow::Result { @@ -1396,40 +1663,6 @@ where Ok(()) } -/// Collect task dumps from all SPs via MGS and save them to a directory. -async fn save_all_sp_dumps( - log: &slog::Logger, - mgs_client: &MgsClient, - sp_dumps_dir: &Utf8Path, -) -> anyhow::Result<()> { - let available_sps = get_available_sps(&mgs_client).await?; - - let mut tasks = ParallelTaskSet::new(); - for sp in available_sps { - let mgs_client = mgs_client.clone(); - let sp_dumps_dir = sp_dumps_dir.to_owned(); - - tasks - .spawn(async move { - save_sp_dumps(mgs_client, sp, sp_dumps_dir) - .await - .with_context(|| format!("SP {} {}", sp.type_, sp.slot)) - }) - .await; - } - for result in tasks.join_all().await { - if let Err(e) = result { - error!( - log, - "failed to capture task dumps"; - "error" => InlineErrorChain::new(e.as_ref()) - ); - } - } - - Ok(()) -} - /// Use MGS ignition info to find active SPs. async fn get_available_sps( mgs_client: &MgsClient, @@ -1455,9 +1688,9 @@ async fn get_available_sps( /// Fetch and save task dumps from a single SP. 
async fn save_sp_dumps( - mgs_client: MgsClient, + mgs_client: &MgsClient, sp: SpIdentifier, - sp_dumps_dir: Utf8PathBuf, + sp_dumps_dir: &Utf8Path, ) -> anyhow::Result<()> { let dump_count = mgs_client .sp_task_dump_count(&sp.type_, sp.slot) @@ -1488,10 +1721,10 @@ async fn save_sp_dumps( /// Write a file with a JSON mapping of sled serial numbers to cubby and UUIDs for easier /// identification of sleds present in a bundle. -async fn write_sled_info( +async fn write_sled_cubby_info( log: &slog::Logger, mgs_client: &MgsClient, - nexus_sleds: Option<&[Sled]>, + nexus_sleds: &[Sled], dir: &Utf8Path, ) -> anyhow::Result<()> { #[derive(Serialize)] @@ -1506,7 +1739,6 @@ async fn write_sled_info( // We can still get a useful mapping of cubby to serial using just the data from MGS. let mut nexus_map: BTreeMap<_, _> = nexus_sleds - .unwrap_or_default() .into_iter() .map(|sled| (sled.serial_number(), sled)) .collect(); From 5abe573ba3ab32dd68dc74afb29691b9b6d4636e Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 17 Oct 2025 16:06:33 -0700 Subject: [PATCH 02/18] [support bundle] More structured data filtering --- .../tasks/support_bundle_collector.rs | 135 +++++++++++++++--- 1 file changed, 115 insertions(+), 20 deletions(-) diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index ee2224edba8..1e29e05b2d2 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -55,6 +55,7 @@ use serde_json::json; use sha2::{Digest, Sha256}; use slog_error_chain::InlineErrorChain; use std::collections::BTreeMap; +use std::collections::HashSet; use std::future::Future; use std::io::Write; use std::num::NonZeroU64; @@ -82,26 +83,83 @@ fn authz_support_bundle_from_id(id: SupportBundleUuid) -> authz::SupportBundle { authz::SupportBundle::new(authz::FLEET, id, LookupType::by_id(id)) } +// Describes how support bundle data is 
selected. +// +// Multiple values of this enum are joined together into a HashSet. +// Categories should be additive. +#[derive(Debug, Clone, Hash, Eq, PartialEq)] +enum BundleDataCategory { + Reconfigurator, + HostInfo, + SledCubbyInfo, + SpDumps, +} + +// The set of sleds to include +#[derive(Debug, Clone, Hash, Eq, PartialEq)] +enum SledSelection { + All, + Specific(SledUuid), +} + // Specifies the data to be collected within the Support Bundle. #[derive(Clone)] struct BundleRequest { - // If "false": Skip collecting host-specific info from each sled. - skip_sled_info: bool, - // The size of chunks to use when transferring a bundle from Nexus // to a sled agent. // // Typically, this is CHUNK_SIZE, but can be modified for testing. transfer_chunk_size: NonZeroU64, + // The set of data to be included within this bundle. + data_selection: HashSet, + + // The set of sets to be included within this bundle. + // + // NOTE: This selection is only considered if "data_selection" requests + // data from specific sleds. + sled_selection: HashSet, + + // The set of ereports to be included within this bundle. + // + // "None" causes ereports to be skipped. 
ereport_query: Option, } +impl BundleRequest { + fn include_reconfigurator_data(&self) -> bool { + self.data_selection.contains(&BundleDataCategory::Reconfigurator) + } + + fn include_host_info(&self) -> bool { + self.data_selection.contains(&BundleDataCategory::HostInfo) + } + + fn include_sled(&self, id: SledUuid) -> bool { + self.sled_selection.contains(&SledSelection::Specific(id)) + || self.sled_selection.contains(&SledSelection::All) + } + + fn include_sled_cubby_info(&self) -> bool { + self.data_selection.contains(&BundleDataCategory::SledCubbyInfo) + } + + fn include_sp_dumps(&self) -> bool { + self.data_selection.contains(&BundleDataCategory::SpDumps) + } +} + impl Default for BundleRequest { fn default() -> Self { Self { - skip_sled_info: false, transfer_chunk_size: CHUNK_SIZE, + data_selection: HashSet::from([ + BundleDataCategory::Reconfigurator, + BundleDataCategory::HostInfo, + BundleDataCategory::SledCubbyInfo, + BundleDataCategory::SpDumps, + ]), + sled_selection: HashSet::from([SledSelection::All]), ereport_query: Some(EreportFilters { start_time: Some(chrono::Utc::now() - chrono::Days::new(7)), ..EreportFilters::default() @@ -771,6 +829,10 @@ impl BundleCollection { &self, dir: &Utf8Path, ) -> anyhow::Result { + if !self.request.include_reconfigurator_data() { + return Ok(CollectionStepOutput::None); + } + // Collect reconfigurator state const NMAX_BLUEPRINTS: usize = 300; match reconfigurator_state_load( @@ -920,6 +982,10 @@ impl BundleCollection { mgs_client: &OnceCell>>, dir: &Utf8Path, ) -> anyhow::Result { + if !self.request.include_sled_cubby_info() { + return Ok(CollectionStepOutput::None); + } + let Some(mgs_client) = &**self.get_or_initialize_mgs_client(mgs_client).await else { @@ -945,6 +1011,10 @@ impl BundleCollection { mgs_client: &OnceCell>>, dir: &Utf8Path, ) -> anyhow::Result { + if !self.request.include_sp_dumps() { + return Ok(CollectionStepOutput::None); + } + let Some(mgs_client) = 
&**self.get_or_initialize_mgs_client(mgs_client).await else { @@ -984,6 +1054,10 @@ impl BundleCollection { sp: SpIdentifier, dir: &Utf8Path, ) -> anyhow::Result { + if !self.request.include_sp_dumps() { + return Ok(CollectionStepOutput::None); + } + save_sp_dumps(mgs_client, sp, dir) .await .with_context(|| format!("SP {} {}", sp.type_, sp.slot))?; @@ -1098,6 +1172,10 @@ impl BundleCollection { &self, all_sleds: &OnceCell>>>, ) -> anyhow::Result { + if !self.request.include_host_info() { + return Ok(CollectionStepOutput::None); + } + let Some(all_sleds) = self.get_or_initialize_all_sleds(all_sleds).await.as_deref() else { @@ -1107,6 +1185,10 @@ impl BundleCollection { let mut extra_steps: Vec<(&'static str, CollectionStepFn)> = vec![]; for sled in all_sleds { + if !self.request.include_sled(sled.id()) { + continue; + } + extra_steps.push(( "sled data", Box::new({ @@ -1135,6 +1217,12 @@ impl BundleCollection { sled: &nexus_db_model::Sled, dir: &Utf8Path, ) -> anyhow::Result { + if !self.request.include_host_info() + || !self.request.include_sled(sled.id()) + { + return Ok(CollectionStepOutput::None); + } + let log = &self.log; info!(&log, "Collecting bundle info from sled"; "sled" => %sled.id()); let sled_path = dir @@ -1146,10 +1234,6 @@ impl BundleCollection { tokio::fs::write(sled_path.join("sled.txt"), format!("{sled:?}")) .await?; - if self.request.skip_sled_info { - return Ok(CollectionStepOutput::None); - } - let Ok(sled_client) = nexus_networking::sled_client( &self.datastore, &self.opctx, @@ -2266,7 +2350,7 @@ mod test { let request = BundleRequest { // NOTE: The support bundle querying interface isn't supported on // the simulated sled agent (yet?) so we're skipping this step. - skip_sled_info: true, + sled_selection: HashSet::new(), ..Default::default() }; let report = collector @@ -2340,9 +2424,10 @@ mod test { // We're going to use a really small chunk size here to force the bundle // to get split up. 
let request = BundleRequest { - skip_sled_info: true, transfer_chunk_size: NonZeroU64::new(16).unwrap(), + sled_selection: HashSet::new(), ereport_query: None, + ..Default::default() }; let report = collector @@ -2430,8 +2515,10 @@ mod test { ); // Each time we call "collect_bundle", we collect a SINGLE bundle. - let request = - BundleRequest { skip_sled_info: true, ..Default::default() }; + let request = BundleRequest { + sled_selection: HashSet::new(), + ..Default::default() + }; let report = collector .collect_bundle(&opctx, &request) .await @@ -2579,8 +2666,10 @@ mod test { false, nexus.id(), ); - let request = - BundleRequest { skip_sled_info: true, ..Default::default() }; + let request = BundleRequest { + sled_selection: HashSet::new(), + ..Default::default() + }; let report = collector .collect_bundle(&opctx, &request) .await @@ -2726,8 +2815,10 @@ mod test { false, nexus.id(), ); - let request = - BundleRequest { skip_sled_info: true, ..Default::default() }; + let request = BundleRequest { + sled_selection: HashSet::new(), + ..Default::default() + }; let report = collector .collect_bundle(&opctx, &request) .await @@ -2811,8 +2902,10 @@ mod test { false, nexus.id(), ); - let request = - BundleRequest { skip_sled_info: true, ..Default::default() }; + let request = BundleRequest { + sled_selection: HashSet::new(), + ..Default::default() + }; let report = collector .collect_bundle(&opctx, &request) .await @@ -2897,8 +2990,10 @@ mod test { ); // Collect the bundle - let request = - BundleRequest { skip_sled_info: true, ..Default::default() }; + let request = BundleRequest { + sled_selection: HashSet::new(), + ..Default::default() + }; let report = collector .collect_bundle(&opctx, &request) .await From 0cb9a6a303da78f39eedd1d4fd230ca264bb4371 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 1 Dec 2025 16:08:36 -0800 Subject: [PATCH 03/18] docs --- .../src/app/background/tasks/support_bundle_collector.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff 
--git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 9692db353b4..9a1cc86f909 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -90,13 +90,22 @@ fn authz_support_bundle_from_id(id: SupportBundleUuid) -> authz::SupportBundle { // Categories should be additive. #[derive(Debug, Clone, Hash, Eq, PartialEq)] enum BundleDataCategory { + // Collects reconfigurator state (some of the latest blueprints, + // information about the target blueprint). Reconfigurator, + // Collects info from sled agents, running a handful of + // diagnostic commands (e.g., zoneadm, dladm, etc). HostInfo, + // Collects sled serial numbers, cubby numbers, and UUIDs. SledCubbyInfo, + // Saves task dumps from SPs. SpDumps, } // The set of sleds to include +// +// Multiple values of this enum are joined together into a HashSet. +// Therefore "SledSelection::All" overrides specific sleds. #[derive(Debug, Clone, Hash, Eq, PartialEq)] enum SledSelection { All, From 015f2e971641b59692839fcc3ae03f4406316c7e Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 2 Dec 2025 11:00:20 -0800 Subject: [PATCH 04/18] meh --- nexus/src/app/background/tasks/support_bundle_collector.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 9a1cc86f909..0adc94f37e1 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -1095,7 +1095,7 @@ impl BundleCollection { // Perform the work of collecting the support bundle into a temporary directory // - // "dir" is a directory where data can be stored. + // "dir" is an output directory where data can be stored. 
// // If a partial bundle can be collected, it should be returned as // an Ok(SupportBundleCollectionReport). Any failures from this function From 92415e172d4436e920533d548d1b1fa1bea92db1 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 2 Dec 2025 11:24:46 -0800 Subject: [PATCH 05/18] Improve the support bundle report on a step-by-step basis --- dev-tools/omdb/src/bin/omdb/nexus.rs | 15 ++ .../tasks/support_bundle_collector.rs | 180 ++++++++++++------ .../integration_tests/support_bundles.rs | 43 ++--- nexus/types/src/internal_api/background.rs | 31 +++ 4 files changed, 186 insertions(+), 83 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index 6a842a1d7e7..d8d6bab1bc7 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -98,6 +98,7 @@ use std::fs::OpenOptions; use std::os::unix::fs::PermissionsExt; use std::str::FromStr; use std::sync::Arc; +use std::time::Duration; use support_bundle_viewer::LocalFileAccess; use support_bundle_viewer::SupportBundleAccessor; use tabled::Tabled; @@ -2612,6 +2613,7 @@ fn print_task_support_bundle_collector(details: &serde_json::Value) { listed_in_service_sleds, listed_sps, activated_in_db_ok, + mut steps, ereports, }) = collection_report { @@ -2623,6 +2625,19 @@ fn print_task_support_bundle_collector(details: &serde_json::Value) { println!( " Bundle was able to list service processors: {listed_sps}" ); + + steps.sort_unstable_by_key(|s| s.start); + for step in steps { + let duration = (step.end - step.start) + .to_std() + .unwrap_or(Duration::from_millis(0)); + println!( + " Step {} ({}ms): {}", + step.name, + duration.as_millis(), + step.status + ); + } println!( " Bundle was activated in the database: {activated_in_db_ok}" ); diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 0adc94f37e1..2b63ac6e93b 100644 --- 
a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -6,6 +6,7 @@ use crate::app::background::BackgroundTask; use anyhow::Context; +use anyhow::bail; use base64::Engine; use camino::Utf8DirEntry; use camino::Utf8Path; @@ -13,6 +14,8 @@ use camino::Utf8PathBuf; use camino_tempfile::Utf8TempDir; use camino_tempfile::tempdir_in; use camino_tempfile::tempfile_in; +use chrono::DateTime; +use chrono::Utc; use futures::FutureExt; use futures::StreamExt; use futures::future::BoxFuture; @@ -38,6 +41,8 @@ use nexus_types::fm::Ereport; use nexus_types::identity::Asset; use nexus_types::internal_api::background::SupportBundleCleanupReport; use nexus_types::internal_api::background::SupportBundleCollectionReport; +use nexus_types::internal_api::background::SupportBundleCollectionStep; +use nexus_types::internal_api::background::SupportBundleCollectionStepStatus; use nexus_types::internal_api::background::SupportBundleEreportStatus; use omicron_common::api::external::DataPageParams; use omicron_common::api::external::Error; @@ -563,47 +568,115 @@ type CollectionStepFn = Box< + Send, >; -enum CollectionStepOutput { - Ereports(SupportBundleEreportStatus), - SavingSpDumps { listed_sps: bool }, - // NOTE: The distinction between this and "Spawn" is pretty artificial - - // it's just to preserve a part of the report which says "we tried to - // list in-service sleds". - // - // If we changed the collection report, this could easily be combined - // with the "Spawn" variant. 
- SpawnSleds { extra_steps: Vec<(&'static str, CollectionStepFn)> }, - Spawn { extra_steps: Vec<(&'static str, CollectionStepFn)> }, - None, +struct CollectionStep { + name: String, + step_fn: CollectionStepFn, +} + +impl CollectionStep { + fn new(name: impl Into, step_fn: CollectionStepFn) -> Self { + Self { name: name.into(), step_fn } + } + + async fn run( + self, + collection: &Arc, + output: &Utf8Path, + ) -> CompletedCollectionStep { + let start = Utc::now(); + + let output = (self.step_fn)(collection, output) + .await + .inspect_err(|err| { + warn!( + collection.log, + "Step failed"; + "name" => &self.name, + InlineErrorChain::new(err.as_ref()), + ); + }) + .unwrap_or_else(|err| CollectionStepOutput::Failed(err)); + + let end = Utc::now(); + + CompletedCollectionStep { name: self.name, start, end, output } + } +} + +struct CompletedCollectionStep { + name: String, + start: DateTime, + end: DateTime, + output: CollectionStepOutput, } -impl CollectionStepOutput { +impl CompletedCollectionStep { // Updates the collection report based on the output of a collection step, // and possibly extends the set of all steps to be executed. 
fn process( self, report: &mut SupportBundleCollectionReport, - steps: &mut Vec<(&'static str, CollectionStepFn)>, + steps: &mut Vec, ) { - match self { + use SupportBundleCollectionStepStatus as Status; + + let status = match self.output { + CollectionStepOutput::Skipped => Status::Skipped, + CollectionStepOutput::Failed(err) => { + Status::Failed(err.to_string()) + } CollectionStepOutput::Ereports(status) => { report.ereports = Some(status); + Status::Ok } CollectionStepOutput::SavingSpDumps { listed_sps } => { report.listed_sps = listed_sps; + Status::Ok } CollectionStepOutput::SpawnSleds { extra_steps } => { report.listed_in_service_sleds = true; steps.extend(extra_steps); + Status::Ok } CollectionStepOutput::Spawn { extra_steps } => { steps.extend(extra_steps); + Status::Ok } - CollectionStepOutput::None => (), - } + CollectionStepOutput::None => Status::Ok, + }; + + // Add information about this completed step the bundle report. + let step = SupportBundleCollectionStep { + name: self.name, + start: self.start, + end: self.end, + status, + }; + report.steps.push(step); } } +enum CollectionStepOutput { + // The step was not executed intentionally + Skipped, + // The step encountered a fatal error and could not complete. + // + // It may have still saved a partial set of data to the bundle. + Failed(anyhow::Error), + Ereports(SupportBundleEreportStatus), + SavingSpDumps { listed_sps: bool }, + // NOTE: The distinction between this and "Spawn" is pretty artificial - + // it's just to preserve a part of the report which says "we tried to + // list in-service sleds". + // + // If we changed the collection report, this could easily be combined + // with the "Spawn" variant. + SpawnSleds { extra_steps: Vec }, + Spawn { extra_steps: Vec }, + // The step completed with nothing to report, and no follow-up steps + None, +} + impl BundleCollection { // Collect the bundle within Nexus, and store it on a target sled. 
async fn collect_bundle_and_store_on_sled( @@ -856,7 +929,7 @@ impl BundleCollection { async fn run_collect_bundle_steps( self: &Arc, output: &Utf8TempDir, - mut steps: Vec<(&'static str, CollectionStepFn)>, + mut steps: Vec, ) -> SupportBundleCollectionReport { let mut report = SupportBundleCollectionReport::new(self.bundle.id.into()); @@ -867,34 +940,25 @@ impl BundleCollection { loop { // Process all the currently-planned steps - while let Some((step_name, step)) = steps.pop() { + while let Some(step) = steps.pop() { let previous_result = tasks.spawn({ let collection = self.clone(); let dir = output.path().to_path_buf(); async move { - debug!(collection.log, "Running step"; "name" => &step_name); - step(&collection, dir.as_path()).await.inspect_err(|err| { - warn!( - collection.log, - "Step failed"; - "name" => &step_name, - InlineErrorChain::new(err.as_ref()), - ); - }) + debug!(collection.log, "Running step"; "name" => &step.name); + step.run(&collection, dir.as_path()).await } }).await; - if let Some(Ok(output)) = previous_result { + if let Some(output) = previous_result { output.process(&mut report, &mut steps); }; } // If we've run out of tasks to spawn, join any of the previously // spawned tasks, if any exist. - if let Some(previous_result) = tasks.join_next().await { - if let Ok(output) = previous_result { - output.process(&mut report, &mut steps); - }; + if let Some(output) = tasks.join_next().await { + output.process(&mut report, &mut steps); // As soon as any task completes, see if we can spawn more work // immediately. 
This ensures that the ParallelTaskSet is @@ -926,7 +990,7 @@ impl BundleCollection { dir: &Utf8Path, ) -> anyhow::Result { if !self.request.include_reconfigurator_data() { - return Ok(CollectionStepOutput::None); + return Ok(CollectionStepOutput::Skipped); } // Collect reconfigurator state @@ -1011,17 +1075,13 @@ impl BundleCollection { dir: &Utf8Path, ) -> anyhow::Result { if !self.request.include_sled_cubby_info() { - return Ok(CollectionStepOutput::None); + return Ok(CollectionStepOutput::Skipped); } let Some(mgs_client) = &**self.get_or_initialize_mgs_client(mgs_client).await else { - warn!( - self.log, - "No MGS client, skipping sled cubby info collection" - ); - return Ok(CollectionStepOutput::None); + bail!("Could not initialize MGS client"); }; let nexus_sleds = self .get_or_initialize_all_sleds(all_sleds) @@ -1040,14 +1100,13 @@ impl BundleCollection { dir: &Utf8Path, ) -> anyhow::Result { if !self.request.include_sp_dumps() { - return Ok(CollectionStepOutput::None); + return Ok(CollectionStepOutput::Skipped); } let Some(mgs_client) = &**self.get_or_initialize_mgs_client(mgs_client).await else { - warn!(self.log, "No MGS client, skipping SP task dump collection"); - return Ok(CollectionStepOutput::None); + bail!("Could not initialize MGS client"); }; let sp_dumps_dir = dir.join("sp_task_dumps"); @@ -1055,9 +1114,9 @@ impl BundleCollection { format!("Failed to create SP task dump directory {sp_dumps_dir}") })?; - let mut extra_steps: Vec<(&'static str, CollectionStepFn)> = vec![]; + let mut extra_steps: Vec = vec![]; for sp in get_available_sps(&mgs_client).await? 
{ - extra_steps.push(( + extra_steps.push(CollectionStep::new( "SP dump", Box::new({ let mgs_client = mgs_client.clone(); @@ -1083,7 +1142,7 @@ impl BundleCollection { dir: &Utf8Path, ) -> anyhow::Result { if !self.request.include_sp_dumps() { - return Ok(CollectionStepOutput::None); + return Ok(CollectionStepOutput::Skipped); } save_sp_dumps(mgs_client, sp, dir).await.with_context(|| { @@ -1124,26 +1183,26 @@ impl BundleCollection { // Shared, lazy, fallible initialization for MGS client let mgs_client: OnceCell>> = OnceCell::new(); - let steps: Vec<(&str, CollectionStepFn)> = vec![ - ( + let steps: Vec = vec![ + CollectionStep::new( "bundle id", Box::new(|collection, dir| { collection.collect_bundle_id(dir).boxed() }), ), - ( + CollectionStep::new( "reconfigurator state", Box::new(|collection, dir| { collection.collect_reconfigurator_state(dir).boxed() }), ), - ( + CollectionStep::new( "ereports", Box::new(|collection, dir| { collection.collect_ereports(dir).boxed() }), ), - ( + CollectionStep::new( "sled cubby info", Box::new({ let all_sleds = all_sleds.clone(); @@ -1162,7 +1221,7 @@ impl BundleCollection { } }), ), - ( + CollectionStep::new( "spawn steps to query all SP dumps", Box::new({ let mgs_client = mgs_client.clone(); @@ -1176,7 +1235,7 @@ impl BundleCollection { } }), ), - ( + CollectionStep::new( "spawn steps to query all sleds", Box::new({ let all_sleds = all_sleds.clone(); @@ -1198,23 +1257,22 @@ impl BundleCollection { all_sleds: &OnceCell>>>, ) -> anyhow::Result { if !self.request.include_host_info() { - return Ok(CollectionStepOutput::None); + return Ok(CollectionStepOutput::Skipped); } let Some(all_sleds) = self.get_or_initialize_all_sleds(all_sleds).await.as_deref() else { - warn!(self.log, "Could not read list of sleds"); - return Ok(CollectionStepOutput::None); + bail!("Could not read list of sleds"); }; - let mut extra_steps: Vec<(&'static str, CollectionStepFn)> = vec![]; + let mut extra_steps: Vec = vec![]; for sled in all_sleds { if 
!self.request.include_sled(sled.id()) { continue; } - extra_steps.push(( + extra_steps.push(CollectionStep::new( "sled data", Box::new({ let sled = sled.clone(); @@ -1245,7 +1303,7 @@ impl BundleCollection { if !self.request.include_host_info() || !self.request.include_sled(sled.id()) { - return Ok(CollectionStepOutput::None); + return Ok(CollectionStepOutput::Skipped); } let log = &self.log; @@ -1272,7 +1330,7 @@ impl BundleCollection { "Could not contact sled", ) .await?; - return Ok(CollectionStepOutput::None); + bail!("Could not contact sled"); }; // NB: As new sled-diagnostic commands are added they should @@ -1386,7 +1444,7 @@ impl BundleCollection { ) -> anyhow::Result { let Some(ref ereport_filters) = self.request.ereport_query else { debug!(self.log, "Support bundle: ereports not requested"); - return Ok(CollectionStepOutput::None); + return Ok(CollectionStepOutput::Skipped); }; let ereports_dir = dir.join("ereports"); let mut status = SupportBundleEreportStatus::default(); diff --git a/nexus/tests/integration_tests/support_bundles.rs b/nexus/tests/integration_tests/support_bundles.rs index 716bc228ca9..75bb4bfa64d 100644 --- a/nexus/tests/integration_tests/support_bundles.rs +++ b/nexus/tests/integration_tests/support_bundles.rs @@ -486,18 +486,18 @@ async fn test_support_bundle_lifecycle(cptestctx: &ControlPlaneTestContext) { output.cleanup_report, Some(SupportBundleCleanupReport { ..Default::default() }) ); + + let report = output.collection_report.as_ref().expect("Missing report"); + assert_eq!(report.bundle, bundle.id); + assert!(report.listed_in_service_sleds); + assert!(report.listed_sps); + assert!(report.activated_in_db_ok); assert_eq!( - output.collection_report, - Some(SupportBundleCollectionReport { - bundle: bundle.id, - listed_in_service_sleds: true, - listed_sps: true, - activated_in_db_ok: true, - ereports: Some(SupportBundleEreportStatus { - n_collected: 0, - n_found: 0, - errors: Vec::new() - }) + report.ereports, + 
Some(SupportBundleEreportStatus { + n_collected: 0, + n_found: 0, + errors: Vec::new() }) ); let bundle = bundle_get(&client, bundle.id).await.unwrap(); @@ -588,18 +588,17 @@ async fn test_support_bundle_range_requests( // Finish collection, activate the bundle. let output = activate_bundle_collection_background_task(&cptestctx).await; assert_eq!(output.collection_err, None); + let report = output.collection_report.as_ref().expect("Missing report"); + assert_eq!(report.bundle, bundle.id); + assert!(report.listed_in_service_sleds); + assert!(report.listed_sps); + assert!(report.activated_in_db_ok); assert_eq!( - output.collection_report, - Some(SupportBundleCollectionReport { - bundle: bundle.id, - listed_in_service_sleds: true, - listed_sps: true, - activated_in_db_ok: true, - ereports: Some(SupportBundleEreportStatus { - n_collected: 0, - n_found: 0, - errors: Vec::new() - }) + report.ereports, + Some(SupportBundleEreportStatus { + n_collected: 0, + n_found: 0, + errors: Vec::new() }) ); let bundle = bundle_get(&client, bundle.id).await.unwrap(); diff --git a/nexus/types/src/internal_api/background.rs b/nexus/types/src/internal_api/background.rs index 08d343182e6..42264d0411b 100644 --- a/nexus/types/src/internal_api/background.rs +++ b/nexus/types/src/internal_api/background.rs @@ -281,11 +281,41 @@ pub struct SupportBundleCollectionReport { /// True iff the bundle was successfully made 'active' in the database. pub activated_in_db_ok: bool, + /// All steps taken, alongside their timing information, when collecting the + /// bundle. + pub steps: Vec, + /// Status of ereport collection, or `None` if no ereports were requested /// for this support bundle. 
pub ereports: Option, } +#[derive(Debug, Deserialize, Serialize, PartialEq, Eq)] +pub struct SupportBundleCollectionStep { + pub name: String, + pub start: DateTime, + pub end: DateTime, + pub status: SupportBundleCollectionStepStatus, +} + +#[derive(Debug, Deserialize, Serialize, PartialEq, Eq)] +pub enum SupportBundleCollectionStepStatus { + Ok, + Skipped, + Failed(String), +} + +impl std::fmt::Display for SupportBundleCollectionStepStatus { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + use SupportBundleCollectionStepStatus::*; + match self { + Ok => write!(f, "ok"), + Skipped => write!(f, "skipped"), + Failed(why) => write!(f, "failed: {why}"), + } + } +} + #[derive(Debug, Default, Deserialize, Serialize, PartialEq, Eq)] pub struct SupportBundleEreportStatus { /// The total number of ereports found that match the requested filters. @@ -309,6 +339,7 @@ impl SupportBundleCollectionReport { listed_in_service_sleds: false, listed_sps: false, activated_in_db_ok: false, + steps: vec![], ereports: None, } } From 4d59114117248febe5ca8a1aa8981fa9f0d7b289 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 2 Dec 2025 17:11:36 -0800 Subject: [PATCH 06/18] Better table output, step labels --- dev-tools/omdb/src/bin/omdb/nexus.rs | 36 +++++++++++++------ .../tasks/support_bundle_collector.rs | 4 +-- .../integration_tests/support_bundles.rs | 22 ++++++++++++ 3 files changed, 50 insertions(+), 12 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index d8d6bab1bc7..9833a7bf141 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -2626,17 +2626,33 @@ fn print_task_support_bundle_collector(details: &serde_json::Value) { " Bundle was able to list service processors: {listed_sps}" ); + #[derive(Tabled)] + #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] + struct StepRow { + step_name: String, + start_time: String, + duration: String, + status: String, + } + 
steps.sort_unstable_by_key(|s| s.start); - for step in steps { - let duration = (step.end - step.start) - .to_std() - .unwrap_or(Duration::from_millis(0)); - println!( - " Step {} ({}ms): {}", - step.name, - duration.as_millis(), - step.status - ); + let rows: Vec = steps + .into_iter() + .map(|step| { + let duration = (step.end - step.start) + .to_std() + .unwrap_or(Duration::from_millis(0)); + StepRow { + step_name: step.name, + start_time: step.start.to_rfc3339(), + duration: format!("{:.3}s", duration.as_secs_f64()), + status: step.status.to_string(), + } + }) + .collect(); + + if !rows.is_empty() { + println!("\n{}", tabled::Table::new(rows)); } println!( " Bundle was activated in the database: {activated_in_db_ok}" diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 2b63ac6e93b..41babc3838b 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -1117,7 +1117,7 @@ impl BundleCollection { let mut extra_steps: Vec = vec![]; for sp in get_available_sps(&mgs_client).await? 
{ extra_steps.push(CollectionStep::new( - "SP dump", + format!("SP dump for {:?}", sp), Box::new({ let mgs_client = mgs_client.clone(); move |collection, dir| { @@ -1273,7 +1273,7 @@ impl BundleCollection { } extra_steps.push(CollectionStep::new( - "sled data", + format!("sled data for sled {}", sled.id()), Box::new({ let sled = sled.clone(); move |collection, dir| { diff --git a/nexus/tests/integration_tests/support_bundles.rs b/nexus/tests/integration_tests/support_bundles.rs index 75bb4bfa64d..ade2cbdb2c9 100644 --- a/nexus/tests/integration_tests/support_bundles.rs +++ b/nexus/tests/integration_tests/support_bundles.rs @@ -500,6 +500,17 @@ async fn test_support_bundle_lifecycle(cptestctx: &ControlPlaneTestContext) { errors: Vec::new() }) ); + + // Verify that steps were recorded with reasonable timing data + assert!(!report.steps.is_empty(), "Should have recorded some steps"); + for step in &report.steps { + assert!( + step.end >= step.start, + "Step '{}' end time should be >= start time", + step.name + ); + } + let bundle = bundle_get(&client, bundle.id).await.unwrap(); assert_eq!(bundle.state, SupportBundleState::Active); @@ -601,6 +612,17 @@ async fn test_support_bundle_range_requests( errors: Vec::new() }) ); + + // Verify that steps were recorded with reasonable timing data + assert!(!report.steps.is_empty(), "Should have recorded some steps"); + for step in &report.steps { + assert!( + step.end >= step.start, + "Step '{}' end time should be >= start time", + step.name + ); + } + let bundle = bundle_get(&client, bundle.id).await.unwrap(); assert_eq!(bundle.state, SupportBundleState::Active); From e69f392cae63f102cf5ff40752db914eaeda1666 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 3 Dec 2025 12:58:08 -0800 Subject: [PATCH 07/18] [support bundle] Simplify report, relying on new 'steps' infrastructure --- dev-tools/omdb/src/bin/omdb/nexus.rs | 8 -- .../tasks/support_bundle_collector.rs | 94 ++++++++++++------- .../integration_tests/support_bundles.rs 
| 29 +++++- nexus/types/src/internal_api/background.rs | 21 +++-- 4 files changed, 97 insertions(+), 55 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index 9833a7bf141..b17751821a3 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -2610,8 +2610,6 @@ fn print_task_support_bundle_collector(details: &serde_json::Value) { if let Some(SupportBundleCollectionReport { bundle, - listed_in_service_sleds, - listed_sps, activated_in_db_ok, mut steps, ereports, @@ -2619,12 +2617,6 @@ fn print_task_support_bundle_collector(details: &serde_json::Value) { { println!(" Support Bundle Collection Report:"); println!(" Bundle ID: {bundle}"); - println!( - " Bundle was able to list in-service sleds: {listed_in_service_sleds}" - ); - println!( - " Bundle was able to list service processors: {listed_sps}" - ); #[derive(Tabled)] #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 41babc3838b..5eb9b6f6370 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -629,15 +629,6 @@ impl CompletedCollectionStep { report.ereports = Some(status); Status::Ok } - CollectionStepOutput::SavingSpDumps { listed_sps } => { - report.listed_sps = listed_sps; - Status::Ok - } - CollectionStepOutput::SpawnSleds { extra_steps } => { - report.listed_in_service_sleds = true; - steps.extend(extra_steps); - Status::Ok - } CollectionStepOutput::Spawn { extra_steps } => { steps.extend(extra_steps); Status::Ok @@ -664,14 +655,7 @@ enum CollectionStepOutput { // It may have still saved a partial set of data to the bundle. 
Failed(anyhow::Error), Ereports(SupportBundleEreportStatus), - SavingSpDumps { listed_sps: bool }, - // NOTE: The distinction between this and "Spawn" is pretty artificial - - // it's just to preserve a part of the report which says "we tried to - // list in-service sleds". - // - // If we changed the collection report, this could easily be combined - // with the "Spawn" variant. - SpawnSleds { extra_steps: Vec }, + // The step spawned additional steps to execute Spawn { extra_steps: Vec }, // The step completed with nothing to report, and no follow-up steps None, @@ -1149,7 +1133,7 @@ impl BundleCollection { format!("failed to save SP dump from: {} {}", sp.type_, sp.slot) })?; - Ok(CollectionStepOutput::SavingSpDumps { listed_sps: true }) + Ok(CollectionStepOutput::None) } // Perform the work of collecting the support bundle into a temporary directory @@ -1185,25 +1169,25 @@ impl BundleCollection { let steps: Vec = vec![ CollectionStep::new( - "bundle id", + SupportBundleCollectionStep::STEP_BUNDLE_ID, Box::new(|collection, dir| { collection.collect_bundle_id(dir).boxed() }), ), CollectionStep::new( - "reconfigurator state", + SupportBundleCollectionStep::STEP_RECONFIGURATOR_STATE, Box::new(|collection, dir| { collection.collect_reconfigurator_state(dir).boxed() }), ), CollectionStep::new( - "ereports", + SupportBundleCollectionStep::STEP_EREPORTS, Box::new(|collection, dir| { collection.collect_ereports(dir).boxed() }), ), CollectionStep::new( - "sled cubby info", + SupportBundleCollectionStep::STEP_SLED_CUBBY_INFO, Box::new({ let all_sleds = all_sleds.clone(); let mgs_client = mgs_client.clone(); @@ -1222,7 +1206,7 @@ impl BundleCollection { }), ), CollectionStep::new( - "spawn steps to query all SP dumps", + SupportBundleCollectionStep::STEP_SPAWN_SP_DUMPS, Box::new({ let mgs_client = mgs_client.clone(); move |collection, dir| { @@ -1236,7 +1220,7 @@ impl BundleCollection { }), ), CollectionStep::new( - "spawn steps to query all sleds", + 
SupportBundleCollectionStep::STEP_SPAWN_SLEDS, Box::new({ let all_sleds = all_sleds.clone(); move |collection, _| { @@ -1286,7 +1270,7 @@ impl BundleCollection { )); } - return Ok(CollectionStepOutput::SpawnSleds { extra_steps }); + return Ok(CollectionStepOutput::Spawn { extra_steps }); } // Collect data from a sled, storing it into a directory that will @@ -2425,8 +2409,16 @@ mod test { .expect("Collection should have succeeded under test") .expect("Collecting the bundle should have generated a report"); assert_eq!(report.bundle, bundle.id.into()); - assert!(report.listed_in_service_sleds); - assert!(report.listed_sps); + // Verify that we spawned steps to query sleds and SPs + let step_names: Vec<_> = + report.steps.iter().map(|s| s.name.as_str()).collect(); + assert!( + step_names.contains(&SupportBundleCollectionStep::STEP_SPAWN_SLEDS) + ); + assert!( + step_names + .contains(&SupportBundleCollectionStep::STEP_SPAWN_SP_DUMPS) + ); assert!(report.activated_in_db_ok); assert_eq!( report.ereports, @@ -2502,8 +2494,16 @@ mod test { .expect("Collection should have succeeded under test") .expect("Collecting the bundle should have generated a report"); assert_eq!(report.bundle, bundle.id.into()); - assert!(report.listed_in_service_sleds); - assert!(report.listed_sps); + // Verify that we spawned steps to query sleds and SPs + let step_names: Vec<_> = + report.steps.iter().map(|s| s.name.as_str()).collect(); + assert!( + step_names.contains(&SupportBundleCollectionStep::STEP_SPAWN_SLEDS) + ); + assert!( + step_names + .contains(&SupportBundleCollectionStep::STEP_SPAWN_SP_DUMPS) + ); assert!(report.activated_in_db_ok); let observed_bundle = datastore @@ -2591,8 +2591,16 @@ mod test { .expect("Collection should have succeeded under test") .expect("Collecting the bundle should have generated a report"); assert_eq!(report.bundle, bundle1.id.into()); - assert!(report.listed_in_service_sleds); - assert!(report.listed_sps); + // Verify that we spawned steps to query sleds 
and SPs + let step_names: Vec<_> = + report.steps.iter().map(|s| s.name.as_str()).collect(); + assert!( + step_names.contains(&SupportBundleCollectionStep::STEP_SPAWN_SLEDS) + ); + assert!( + step_names + .contains(&SupportBundleCollectionStep::STEP_SPAWN_SP_DUMPS) + ); assert!(report.activated_in_db_ok); // This is observable by checking the state of bundle1 and bundle2: @@ -2614,8 +2622,16 @@ mod test { .expect("Collection should have succeeded under test") .expect("Collecting the bundle should have generated a report"); assert_eq!(report.bundle, bundle2.id.into()); - assert!(report.listed_in_service_sleds); - assert!(report.listed_sps); + // Verify that we spawned steps to query sleds and SPs + let step_names: Vec<_> = + report.steps.iter().map(|s| s.name.as_str()).collect(); + assert!( + step_names.contains(&SupportBundleCollectionStep::STEP_SPAWN_SLEDS) + ); + assert!( + step_names + .contains(&SupportBundleCollectionStep::STEP_SPAWN_SP_DUMPS) + ); assert!(report.activated_in_db_ok); // After another collection request, we'll see that both bundles have @@ -2742,8 +2758,16 @@ mod test { .expect("Collection should have succeeded under test") .expect("Collecting the bundle should have generated a report"); assert_eq!(report.bundle, bundle.id.into()); - assert!(report.listed_in_service_sleds); - assert!(report.listed_sps); + // Verify that we spawned steps to query sleds and SPs + let step_names: Vec<_> = + report.steps.iter().map(|s| s.name.as_str()).collect(); + assert!( + step_names.contains(&SupportBundleCollectionStep::STEP_SPAWN_SLEDS) + ); + assert!( + step_names + .contains(&SupportBundleCollectionStep::STEP_SPAWN_SP_DUMPS) + ); assert!(report.activated_in_db_ok); // Cancel the bundle after collection has completed diff --git a/nexus/tests/integration_tests/support_bundles.rs b/nexus/tests/integration_tests/support_bundles.rs index ade2cbdb2c9..80ec8af191f 100644 --- a/nexus/tests/integration_tests/support_bundles.rs +++ 
b/nexus/tests/integration_tests/support_bundles.rs @@ -19,6 +19,7 @@ use nexus_types::external_api::shared::SupportBundleInfo; use nexus_types::external_api::shared::SupportBundleState; use nexus_types::internal_api::background::SupportBundleCleanupReport; use nexus_types::internal_api::background::SupportBundleCollectionReport; +use nexus_types::internal_api::background::SupportBundleCollectionStep; use nexus_types::internal_api::background::SupportBundleEreportStatus; use omicron_uuid_kinds::SupportBundleUuid; use serde::Deserialize; @@ -489,8 +490,6 @@ async fn test_support_bundle_lifecycle(cptestctx: &ControlPlaneTestContext) { let report = output.collection_report.as_ref().expect("Missing report"); assert_eq!(report.bundle, bundle.id); - assert!(report.listed_in_service_sleds); - assert!(report.listed_sps); assert!(report.activated_in_db_ok); assert_eq!( report.ereports, @@ -511,6 +510,18 @@ async fn test_support_bundle_lifecycle(cptestctx: &ControlPlaneTestContext) { ); } + // Verify that we successfully spawned steps to query sleds and SPs + let step_names: Vec<_> = + report.steps.iter().map(|s| s.name.as_str()).collect(); + assert!( + step_names.contains(&SupportBundleCollectionStep::STEP_SPAWN_SLEDS), + "Should have attempted to list in-service sleds" + ); + assert!( + step_names.contains(&SupportBundleCollectionStep::STEP_SPAWN_SP_DUMPS), + "Should have attempted to list service processors" + ); + let bundle = bundle_get(&client, bundle.id).await.unwrap(); assert_eq!(bundle.state, SupportBundleState::Active); @@ -601,8 +612,6 @@ async fn test_support_bundle_range_requests( assert_eq!(output.collection_err, None); let report = output.collection_report.as_ref().expect("Missing report"); assert_eq!(report.bundle, bundle.id); - assert!(report.listed_in_service_sleds); - assert!(report.listed_sps); assert!(report.activated_in_db_ok); assert_eq!( report.ereports, @@ -623,6 +632,18 @@ async fn test_support_bundle_range_requests( ); } + // Verify that we 
successfully spawned steps to query sleds and SPs + let step_names: Vec<_> = + report.steps.iter().map(|s| s.name.as_str()).collect(); + assert!( + step_names.contains(&SupportBundleCollectionStep::STEP_SPAWN_SLEDS), + "Should have attempted to list in-service sleds" + ); + assert!( + step_names.contains(&SupportBundleCollectionStep::STEP_SPAWN_SP_DUMPS), + "Should have attempted to list service processors" + ); + let bundle = bundle_get(&client, bundle.id).await.unwrap(); assert_eq!(bundle.state, SupportBundleState::Active); diff --git a/nexus/types/src/internal_api/background.rs b/nexus/types/src/internal_api/background.rs index 42264d0411b..dfe008198f9 100644 --- a/nexus/types/src/internal_api/background.rs +++ b/nexus/types/src/internal_api/background.rs @@ -272,12 +272,6 @@ pub struct SupportBundleCleanupReport { pub struct SupportBundleCollectionReport { pub bundle: SupportBundleUuid, - /// True iff we could list in-service sleds - pub listed_in_service_sleds: bool, - - /// True iff we could list the service processors. - pub listed_sps: bool, - /// True iff the bundle was successfully made 'active' in the database. pub activated_in_db_ok: bool, @@ -298,6 +292,19 @@ pub struct SupportBundleCollectionStep { pub status: SupportBundleCollectionStepStatus, } +impl SupportBundleCollectionStep { + /// Step name constants for the main collection steps. + /// + /// These are used both when creating steps and when validating in tests. 
+ pub const STEP_BUNDLE_ID: &'static str = "bundle id"; + pub const STEP_RECONFIGURATOR_STATE: &'static str = "reconfigurator state"; + pub const STEP_EREPORTS: &'static str = "ereports"; + pub const STEP_SLED_CUBBY_INFO: &'static str = "sled cubby info"; + pub const STEP_SPAWN_SP_DUMPS: &'static str = + "spawn steps to query all SP dumps"; + pub const STEP_SPAWN_SLEDS: &'static str = "spawn steps to query all sleds"; +} + #[derive(Debug, Deserialize, Serialize, PartialEq, Eq)] pub enum SupportBundleCollectionStepStatus { Ok, @@ -336,8 +343,6 @@ impl SupportBundleCollectionReport { pub fn new(bundle: SupportBundleUuid) -> Self { Self { bundle, - listed_in_service_sleds: false, - listed_sps: false, activated_in_db_ok: false, steps: vec![], ereports: None, From 1886293ab4d50633bea50058e848bdf70ba1f64c Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 3 Dec 2025 13:25:00 -0800 Subject: [PATCH 08/18] improve comment --- nexus/src/app/background/tasks/support_bundle_collector.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 0adc94f37e1..34aa31862c4 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -124,10 +124,10 @@ struct BundleRequest { // The set of data to be included within this bundle. data_selection: HashSet, - // The set of sets to be included within this bundle. + // The set of sleds to be included within this bundle. // // NOTE: This selection is only considered if "data_selection" requests - // data from specific sleds. + // data from sleds. sled_selection: HashSet, // The set of ereports to be included within this bundle. 
From f60f53fad5ed27a8abd602a12887ab637cc2f025 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 5 Dec 2025 11:46:22 -0800 Subject: [PATCH 09/18] tracing --- .../tasks/support_bundle_collector.rs | 207 ++++++++++++++++++ 1 file changed, 207 insertions(+) diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 32591c84c2c..0a712a89480 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -1037,11 +1037,94 @@ impl BundleCollection { // // Only finish if we've exhausted all possible steps and joined all spawned work. if steps.is_empty() { + // Write trace file before returning + if let Err(err) = self.write_trace_file(output, &report).await { + warn!( + self.log, + "Failed to write trace file"; + "error" => ?err + ); + } return report; } } } + // Write a Perfetto Event format JSON file for visualization + async fn write_trace_file( + &self, + output: &Utf8TempDir, + report: &SupportBundleCollectionReport, + ) -> anyhow::Result<()> { + let meta_dir = output.path().join("meta"); + tokio::fs::create_dir_all(&meta_dir).await.with_context(|| { + format!("Failed to create meta directory {meta_dir}") + })?; + + let trace_path = meta_dir.join("trace.json"); + + // Convert steps to Perfetto Trace Event format. + // Sort steps by start time and assign each a unique sequential ID. + // + // This is necessary because the trace event format does not like + // multiple slices to overlap - so we make each slice distinct. + // + // Ideally we'd be able to correlate these with actual tokio tasks, + // but it's hard to convert tokio::task::Id to a u64 because + // of https://github.com/tokio-rs/tokio/issues/7430 + let mut sorted_steps: Vec<_> = report.steps.iter().collect(); + sorted_steps.sort_by_key(|s| s.start); + + // Generate trace events - each step gets a unique ID (1, 2, 3, ...) 
+ // based on its start time order + let trace_events: Vec<_> = sorted_steps + .iter() + .enumerate() + .map(|(i, step)| { + let start_us = step.start.timestamp_micros(); + let duration_us = (step.end - step.start) + .num_microseconds() + .unwrap_or(0) + .max(0); + let step_id = i + 1; + + json!({ + "name": step.name, + "cat": "bundle_collection", + "ph": "X", // Complete event (has duration) + "ts": start_us, + "dur": duration_us, + "pid": 1, + "tid": step_id, + "args": { + "status": step.status.to_string(), + } + }) + }) + .collect(); + + let trace_json = json!({ + "traceEvents": trace_events, + "displayTimeUnit": "ms", + }); + + let trace_content = serde_json::to_string_pretty(&trace_json) + .context("Failed to serialize trace JSON")?; + + tokio::fs::write(&trace_path, trace_content).await.with_context( + || format!("Failed to write trace file to {trace_path}"), + )?; + + info!( + self.log, + "Wrote trace file"; + "path" => %trace_path, + "num_events" => trace_events.len() + ); + + Ok(()) + } + async fn collect_bundle_id( &self, dir: &Utf8Path, @@ -2528,6 +2611,130 @@ mod test { assert!(report.is_none()); } + #[nexus_test(server = crate::Server)] + async fn test_trace_file_generated(cptestctx: &ControlPlaneTestContext) { + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let resolver = nexus.resolver(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.clone(), + datastore.clone(), + ); + + // Before we can create any bundles, we need to create the + // space for them to be provisioned. 
+ let _datasets = + TestDataset::setup(cptestctx, &datastore, &opctx, 1).await; + + // Create a bundle to collect + let bundle = datastore + .support_bundle_create( + &opctx, + "For trace file testing", + nexus.id(), + None, + ) + .await + .expect("Couldn't allocate a support bundle"); + + let collector = SupportBundleCollector::new( + datastore.clone(), + resolver.clone(), + false, + nexus.id(), + ); + + // Collect the bundle + let mut request = BundleRequest::default(); + request.data_selection.insert(BundleData::HostInfo(HashSet::new())); + let report = collector + .collect_bundle(&opctx, &request) + .await + .expect("Collection should have succeeded") + .expect("Should have generated a report"); + + // Download the trace file from the bundle + let head = false; + let range = None; + let response = nexus + .support_bundle_download( + &opctx, + bundle.id.into(), + SupportBundleQueryType::Path { + file_path: "meta/trace.json".to_string(), + }, + head, + range, + ) + .await + .expect("Should be able to download trace file"); + + // Parse the trace file as JSON + let body_bytes = + response.into_body().collect().await.unwrap().to_bytes(); + let trace_json: serde_json::Value = serde_json::from_slice(&body_bytes) + .expect("Trace file should be valid JSON"); + + // Verify the structure matches Perfetto Trace Event format + let trace_events = trace_json + .get("traceEvents") + .expect("Should have traceEvents field") + .as_array() + .expect("traceEvents should be an array"); + + // We should have at least the main collection steps + assert!( + !trace_events.is_empty(), + "Should have at least one trace event" + ); + + // Verify each event has the expected fields + for event in trace_events { + assert!(event.get("name").is_some(), "Event should have name"); + assert_eq!( + event.get("cat").and_then(|v| v.as_str()), + Some("bundle_collection"), + "Event should have category 'bundle_collection'" + ); + assert_eq!( + event.get("ph").and_then(|v| v.as_str()), + Some("X"), + 
"Event should be Complete event type" + ); + assert!( + event.get("ts").and_then(|v| v.as_i64()).is_some(), + "Event should have timestamp" + ); + assert!( + event.get("dur").and_then(|v| v.as_i64()).is_some(), + "Event should have duration" + ); + assert!( + event.get("args").is_some(), + "Event should have args field" + ); + } + + // Verify we have the same number of events as steps in the report + assert_eq!( + trace_events.len(), + report.steps.len(), + "Number of events should match number of steps" + ); + + // Verify step names match between report and trace + let trace_names: std::collections::HashSet<_> = trace_events + .iter() + .filter_map(|e| e.get("name").and_then(|v| v.as_str())) + .collect(); + let report_names: std::collections::HashSet<_> = + report.steps.iter().map(|s| s.name.as_str()).collect(); + assert_eq!( + trace_names, report_names, + "Trace event names should match report step names" + ); + } + #[nexus_test(server = crate::Server)] async fn test_collect_chunked(cptestctx: &ControlPlaneTestContext) { let nexus = &cptestctx.server.server_context().nexus; From 3873a571a99de54aa2701011ee6b7aef61285607 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 8 Dec 2025 11:30:22 -0800 Subject: [PATCH 10/18] Extract trace structs --- nexus/src/app/background/tasks/mod.rs | 1 + .../background/tasks/support_bundle/mod.rs | 7 ++ .../tasks/support_bundle/perfetto.rs | 51 ++++++++++ .../tasks/support_bundle_collector.rs | 97 ++++++++----------- 4 files changed, 102 insertions(+), 54 deletions(-) create mode 100644 nexus/src/app/background/tasks/support_bundle/mod.rs create mode 100644 nexus/src/app/background/tasks/support_bundle/perfetto.rs diff --git a/nexus/src/app/background/tasks/mod.rs b/nexus/src/app/background/tasks/mod.rs index 64df7770da1..ae27b2d12ca 100644 --- a/nexus/src/app/background/tasks/mod.rs +++ b/nexus/src/app/background/tasks/mod.rs @@ -43,6 +43,7 @@ pub mod region_snapshot_replacement_start; pub mod region_snapshot_replacement_step; 
pub mod saga_recovery; pub mod service_firewall_rules; +pub mod support_bundle; pub mod support_bundle_collector; pub mod sync_service_zone_nat; pub mod sync_switch_configuration; diff --git a/nexus/src/app/background/tasks/support_bundle/mod.rs b/nexus/src/app/background/tasks/support_bundle/mod.rs new file mode 100644 index 00000000000..9b7b4ac4aa0 --- /dev/null +++ b/nexus/src/app/background/tasks/support_bundle/mod.rs @@ -0,0 +1,7 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Support bundle related types and utilities + +pub mod perfetto; diff --git a/nexus/src/app/background/tasks/support_bundle/perfetto.rs b/nexus/src/app/background/tasks/support_bundle/perfetto.rs new file mode 100644 index 00000000000..8653b7b907b --- /dev/null +++ b/nexus/src/app/background/tasks/support_bundle/perfetto.rs @@ -0,0 +1,51 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Perfetto Trace Event format support for visualizing support bundle collection + +use serde::Deserialize; +use serde::Serialize; + +/// Represents a Perfetto Trace Event format JSON file for visualization. +/// +/// This format is used by the Perfetto trace viewer () +/// to visualize timing information for operations. +#[derive(Serialize, Deserialize)] +pub struct Trace { + #[serde(rename = "traceEvents")] + pub trace_events: Vec, + /// Display unit for time values in the UI (e.g., "ms" for milliseconds) + #[serde(rename = "displayTimeUnit")] + pub display_time_unit: String, +} + +/// A single event in the Perfetto Trace Event format. +/// +/// This represents a complete event (duration event) showing when an operation +/// started and how long it took. 
+#[derive(Serialize, Deserialize)] +pub struct TraceEvent { + /// Human-readable name of the event + pub name: String, + /// Category name (abbreviated as "cat" in Perfetto format). + /// Used to group related events together in the trace viewer. + pub cat: String, + /// Phase type (abbreviated as "ph" in Perfetto format). + /// "X" means a "Complete" event with both timestamp and duration. + pub ph: String, + /// Timestamp in microseconds (abbreviated as "ts" in Perfetto format). + /// Represents when the event started, as microseconds since the epoch. + pub ts: i64, + /// Duration in microseconds (abbreviated as "dur" in Perfetto format). + /// How long the event took to complete. + pub dur: i64, + /// Process ID. Used to separate events into different process lanes + /// in the trace viewer. + pub pid: u32, + /// Thread ID. Used to separate events into different thread lanes + /// within a process in the trace viewer. + pub tid: usize, + /// Arbitrary key-value pairs with additional event metadata + pub args: serde_json::Value, +} diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 0a712a89480..0d4939f3acc 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -78,6 +78,8 @@ use zip::ZipArchive; use zip::ZipWriter; use zip::write::FullFileOptions; +use super::support_bundle::perfetto; + // We use "/var/tmp" to use Nexus' filesystem for temporary storage, // rather than "/tmp", which would keep this collected data in-memory. 
const TEMPDIR: &str = "/var/tmp"; @@ -1088,27 +1090,27 @@ impl BundleCollection { .max(0); let step_id = i + 1; - json!({ - "name": step.name, - "cat": "bundle_collection", - "ph": "X", // Complete event (has duration) - "ts": start_us, - "dur": duration_us, - "pid": 1, - "tid": step_id, - "args": { + perfetto::TraceEvent { + name: step.name.clone(), + cat: "bundle_collection".to_string(), + ph: "X".to_string(), + ts: start_us, + dur: duration_us, + pid: 1, + tid: step_id, + args: json!({ "status": step.status.to_string(), - } - }) + }), + } }) .collect(); - let trace_json = json!({ - "traceEvents": trace_events, - "displayTimeUnit": "ms", - }); + let trace = perfetto::Trace { + trace_events, + display_time_unit: "ms".to_string(), + }; - let trace_content = serde_json::to_string_pretty(&trace_json) + let trace_content = serde_json::to_string_pretty(&trace) .context("Failed to serialize trace JSON")?; tokio::fs::write(&trace_path, trace_content).await.with_context( @@ -1119,7 +1121,7 @@ impl BundleCollection { self.log, "Wrote trace file"; "path" => %trace_path, - "num_events" => trace_events.len() + "num_events" => trace.trace_events.len() ); Ok(()) @@ -2669,64 +2671,51 @@ mod test { .await .expect("Should be able to download trace file"); - // Parse the trace file as JSON + // Parse the trace file using our Perfetto structs let body_bytes = response.into_body().collect().await.unwrap().to_bytes(); - let trace_json: serde_json::Value = serde_json::from_slice(&body_bytes) - .expect("Trace file should be valid JSON"); + let trace: perfetto::Trace = serde_json::from_slice(&body_bytes) + .expect("Trace file should be valid Perfetto JSON"); - // Verify the structure matches Perfetto Trace Event format - let trace_events = trace_json - .get("traceEvents") - .expect("Should have traceEvents field") - .as_array() - .expect("traceEvents should be an array"); + // Verify display time unit + assert_eq!( + trace.display_time_unit, "ms", + "Display time unit should be 
milliseconds" + ); // We should have at least the main collection steps assert!( - !trace_events.is_empty(), + !trace.trace_events.is_empty(), "Should have at least one trace event" ); - // Verify each event has the expected fields - for event in trace_events { - assert!(event.get("name").is_some(), "Event should have name"); + // Verify each event has the expected structure + for event in &trace.trace_events { + // Verify category assert_eq!( - event.get("cat").and_then(|v| v.as_str()), - Some("bundle_collection"), + event.cat, "bundle_collection", "Event should have category 'bundle_collection'" ); - assert_eq!( - event.get("ph").and_then(|v| v.as_str()), - Some("X"), - "Event should be Complete event type" - ); - assert!( - event.get("ts").and_then(|v| v.as_i64()).is_some(), - "Event should have timestamp" - ); - assert!( - event.get("dur").and_then(|v| v.as_i64()).is_some(), - "Event should have duration" - ); - assert!( - event.get("args").is_some(), - "Event should have args field" - ); + // Verify phase type + assert_eq!(event.ph, "X", "Event should be Complete event type"); + // Verify timestamps are positive + assert!(event.ts >= 0, "Event timestamp should be non-negative"); + assert!(event.dur >= 0, "Event duration should be non-negative"); + // Verify process and thread IDs are set + assert_eq!(event.pid, 1, "All events should have pid=1"); + assert!(event.tid > 0, "Event thread ID should be positive"); } // Verify we have the same number of events as steps in the report assert_eq!( - trace_events.len(), + trace.trace_events.len(), report.steps.len(), "Number of events should match number of steps" ); // Verify step names match between report and trace - let trace_names: std::collections::HashSet<_> = trace_events - .iter() - .filter_map(|e| e.get("name").and_then(|v| v.as_str())) - .collect(); + let trace_names: std::collections::HashSet<_> = + trace.trace_events.iter().map(|e| e.name.as_str()).collect(); let report_names: std::collections::HashSet<_> = 
report.steps.iter().map(|s| s.name.as_str()).collect(); assert_eq!( From a18d57aeae84fff78e59c39fdbc0df10f1b924a3 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 8 Dec 2025 13:14:07 -0800 Subject: [PATCH 11/18] [support bundles] Split support bundles into modules, add README for devs --- .../background/tasks/support_bundle/README.md | 66 + .../background/tasks/support_bundle/cache.rs | 92 + .../tasks/support_bundle/collection.rs | 623 ++++++ .../background/tasks/support_bundle/mod.rs | 5 + .../tasks/support_bundle/request.rs | 195 ++ .../background/tasks/support_bundle/step.rs | 129 ++ .../tasks/support_bundle/steps/bundle_id.rs | 22 + .../tasks/support_bundle/steps/ereports.rs | 211 ++ .../tasks/support_bundle/steps/host_info.rs | 338 ++++ .../tasks/support_bundle/steps/mod.rs | 89 + .../support_bundle/steps/reconfigurator.rs | 64 + .../tasks/support_bundle/steps/sled_cubby.rs | 146 ++ .../tasks/support_bundle/steps/sp_dumps.rs | 110 + .../tasks/support_bundle_collector.rs | 1782 +---------------- 14 files changed, 2110 insertions(+), 1762 deletions(-) create mode 100644 nexus/src/app/background/tasks/support_bundle/README.md create mode 100644 nexus/src/app/background/tasks/support_bundle/cache.rs create mode 100644 nexus/src/app/background/tasks/support_bundle/collection.rs create mode 100644 nexus/src/app/background/tasks/support_bundle/request.rs create mode 100644 nexus/src/app/background/tasks/support_bundle/step.rs create mode 100644 nexus/src/app/background/tasks/support_bundle/steps/bundle_id.rs create mode 100644 nexus/src/app/background/tasks/support_bundle/steps/ereports.rs create mode 100644 nexus/src/app/background/tasks/support_bundle/steps/host_info.rs create mode 100644 nexus/src/app/background/tasks/support_bundle/steps/mod.rs create mode 100644 nexus/src/app/background/tasks/support_bundle/steps/reconfigurator.rs create mode 100644 nexus/src/app/background/tasks/support_bundle/steps/sled_cubby.rs create mode 100644 
nexus/src/app/background/tasks/support_bundle/steps/sp_dumps.rs diff --git a/nexus/src/app/background/tasks/support_bundle/README.md b/nexus/src/app/background/tasks/support_bundle/README.md new file mode 100644 index 00000000000..e6a52539afd --- /dev/null +++ b/nexus/src/app/background/tasks/support_bundle/README.md @@ -0,0 +1,66 @@ +# Support Bundles + +**Support Bundles** provide a mechanism for extracting information about a +running Oxide system, and giving operators control over the exfiltration of that +data. + +This README is intended for developers trying to add data to the bundle. + +## Step Execution Framework + +Support Bundles are collected using **steps**, which are named functions acting +on the `BundleCollection` that can: + +* Read from the database, or query arbitrary services +* Emit data to the output zipfile +* Produce additional follow-up **steps**, if necessary + +If you're interested in adding data to a support bundle, you will probably be +adding data to an existing **step**, or creating a new one. + +The set of all initial steps is defined in +`nexus/src/app/background/tasks/support_bundle/steps/mod.rs`, within a function +called `all()`. Some of these steps may themselves spawn additional steps, +such as `STEP_SPAWN_SLEDS`, which spawns a per-sled step to query the sled +host OS itself. + +### Tracing + +**Steps** are automatically instrumented, and their durations are emitted to an +output file in the bundle named `meta/trace.json`. These traces are in a format +which can be understood by **Perfetto**, a trace-viewer, and which provides +a browser-based interface at . + +## Filtering Bundle Contents + +Support Bundles are collected by the `support_bundle_collector` +background task. They are collected as zipfiles within a single Nexus instance, +which are then transferred to durable storage. + +The contents of a bundle may be controlled by modifying the **BundleRequest** +structure. 
This request provides filters for controlling the categories of +data which are collected (e.g., "Host OS info") as well as arguments for +more specific constraints (e.g., "Collect info from a specific Sled"). + +Bundle **steps** may query the `BundleRequest` to identify whether or not their +contents should be included. + +## Overview for adding new data + +* **Determine if your data should exist in a new step**. The existing set of + steps exists in `support_bundle/steps`. Adding a new step provides a new unit + of execution (it can be executed concurrently with other steps), and a unit of + tracing (it will be instrumented independently of other steps). +* If you're adding a new step... + * **Add it as a new module**, within `support_bundle/steps`. + * **Ensure it's part of `steps::all()`, or spawned by an existing step**. This + will be necessary for your step to be executed. + * **Provide a way for bundles to opt-out of collecting this data**. Check the + `BundleRequest` to see if your data exists in one of the current filters, or + consider adding a new one if your step involves a new category of data. Either + way, your new step should read `BundleRequest` to decide if it should trigger + before performing any subsequent operations. +* **Consider Caching**. If your new data requires performing any potentially + expensive operations which might be shared with other steps (e.g., reading + from the database, creating and using progenitor clients, etc) consider adding + that data to `support_bundle/cache`. diff --git a/nexus/src/app/background/tasks/support_bundle/cache.rs b/nexus/src/app/background/tasks/support_bundle/cache.rs new file mode 100644 index 00000000000..314345c64b7 --- /dev/null +++ b/nexus/src/app/background/tasks/support_bundle/cache.rs @@ -0,0 +1,92 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. 
If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Cached data or clients which are collected by the bundle +//! +//! This is used to share data which may be used by multiple +//! otherwise independent steps. + +use crate::app::background::tasks::support_bundle::collection::BundleCollection; + +use gateway_client::Client as MgsClient; +use internal_dns_types::names::ServiceName; +use nexus_db_model::Sled; +use nexus_types::deployment::SledFilter; +use slog_error_chain::InlineErrorChain; +use std::sync::Arc; +use tokio::sync::OnceCell; + +/// Caches information which can be derived from the BundleCollection. +/// +/// This is exists as a small optimization for independent steps which may try +/// to read / access similar data, especially when it's fallible: we only need +/// to attempt to look it up once, and all steps can share it. +#[derive(Clone)] +pub struct Cache { + inner: Arc, +} + +struct Inner { + all_sleds: OnceCell>>, + mgs_client: OnceCell>, +} + +impl Cache { + pub fn new() -> Self { + Self { + inner: Arc::new(Inner { + all_sleds: OnceCell::new(), + mgs_client: OnceCell::new(), + }), + } + } + + pub async fn get_or_initialize_all_sleds<'a>( + &'a self, + collection: &BundleCollection, + ) -> Option<&'a Vec> { + self.inner + .all_sleds + .get_or_init(|| async { + collection + .datastore() + .sled_list_all_batched( + &collection.opctx(), + SledFilter::InService, + ) + .await + .ok() + }) + .await + .as_ref() + } + + pub async fn get_or_initialize_mgs_client<'a>( + &'a self, + collection: &BundleCollection, + ) -> Option<&'a MgsClient> { + self.inner + .mgs_client + .get_or_init(|| async { create_mgs_client(collection).await.ok() }) + .await + .as_ref() + } +} + +async fn create_mgs_client( + collection: &BundleCollection, +) -> anyhow::Result { + let log = collection.log(); + collection + .resolver() + .lookup_socket_v6(ServiceName::ManagementGatewayService) + .await + .map(|sockaddr| { 
+ let url = format!("http://{}", sockaddr); + gateway_client::Client::new(&url, log.clone()) + }).map_err(|e| { + error!(log, "failed to resolve MGS address"; "error" => InlineErrorChain::new(&e)); + e.into() + }) +} diff --git a/nexus/src/app/background/tasks/support_bundle/collection.rs b/nexus/src/app/background/tasks/support_bundle/collection.rs new file mode 100644 index 00000000000..1008c85128f --- /dev/null +++ b/nexus/src/app/background/tasks/support_bundle/collection.rs @@ -0,0 +1,623 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! The entrypoint to all support bundle collection. +//! +//! These are the primitives used to look up everything else within the bundle. + +use crate::app::background::tasks::support_bundle::cache::Cache; +use crate::app::background::tasks::support_bundle::perfetto; +use crate::app::background::tasks::support_bundle::request::BundleRequest; +use crate::app::background::tasks::support_bundle::request::TEMPDIR; +use crate::app::background::tasks::support_bundle::step::CollectionStep; +use crate::app::background::tasks::support_bundle::steps; + +use anyhow::Context; +use camino::Utf8DirEntry; +use camino::Utf8Path; +use camino_tempfile::Utf8TempDir; +use camino_tempfile::tempdir_in; +use camino_tempfile::tempfile_in; +use internal_dns_resolver::Resolver; +use nexus_db_model::SupportBundle; +use nexus_db_model::SupportBundleState; +use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::DataStore; +use nexus_types::internal_api::background::SupportBundleCollectionReport; +use omicron_common::api::external::Error; +use omicron_uuid_kinds::DatasetUuid; +use omicron_uuid_kinds::SupportBundleUuid; +use omicron_uuid_kinds::ZpoolUuid; +use parallel_task_set::ParallelTaskSet; +use serde_json::json; +use sha2::Digest; +use sha2::Sha256; +use 
slog_error_chain::InlineErrorChain; +use std::io::Write; +use std::num::NonZeroU64; +use std::sync::Arc; +use tokio::io::AsyncReadExt; +use tokio::io::AsyncSeekExt; +use tokio::io::SeekFrom; +use tufaceous_artifact::ArtifactHash; +use zip::ZipWriter; +use zip::write::FullFileOptions; + +/// Wraps up all arguments to perform a single support bundle collection +pub struct BundleCollection { + datastore: Arc, + resolver: Resolver, + log: slog::Logger, + opctx: OpContext, + request: BundleRequest, + bundle: SupportBundle, + transfer_chunk_size: NonZeroU64, +} + +impl BundleCollection { + pub fn new( + datastore: Arc, + resolver: Resolver, + log: slog::Logger, + opctx: OpContext, + request: BundleRequest, + bundle: SupportBundle, + transfer_chunk_size: NonZeroU64, + ) -> Self { + Self { + datastore, + resolver, + log, + opctx, + request, + bundle, + transfer_chunk_size, + } + } + + pub fn datastore(&self) -> &Arc { + &self.datastore + } + + pub fn resolver(&self) -> &Resolver { + &self.resolver + } + + pub fn log(&self) -> &slog::Logger { + &self.log + } + + pub fn opctx(&self) -> &OpContext { + &self.opctx + } + + pub fn request(&self) -> &BundleRequest { + &self.request + } + + pub fn bundle(&self) -> &SupportBundle { + &self.bundle + } + + /// Collect the bundle within Nexus, and store it on a target sled. + pub async fn collect_bundle_and_store_on_sled( + self: &Arc, + ) -> anyhow::Result { + // Create a temporary directory where we'll store the support bundle + // as it's being collected. + let dir = tempdir_in(TEMPDIR)?; + + let report = self.collect_bundle_locally(&dir).await?; + self.store_bundle_on_sled(dir).await?; + Ok(report) + } + + // Create the support bundle, placing the contents into a user-specified + // directory. + // + // Does not attempt to convert the contents into a zipfile, nor send them + // to any durable storage. 
+ async fn collect_bundle_locally( + self: &Arc, + dir: &Utf8TempDir, + ) -> anyhow::Result { + // TL;DR: This `tokio::select` is allowed to poll multiple futures, but + // should not do any async work within the body of any chosen branch. A + // previous iteration of this code polled the "collection" as "&mut + // collection", and checked the status of the support bundle within a + // branch of the "select" polling "yield_interval.tick()". + // + // We organize this work to "check for cancellation" as a whole future + // for a critical, but subtle reason: After the tick timer yields, + // we may then try to `await` a database function. + // + // This, at a surface-level glance seems innocent enough. However, there + // is something potentially insidious here: if calling a datastore + // function - such as "support_bundle_get" - awaits acquiring access + // to a connection from the connection pool, while creating the + // collection ALSO potentially awaits acquiring access to the + // connection pool, it is possible for: + // + // 1. The `&mut collection` arm to have created a future, currently + // yielded, which wants access to this underlying resource. + // 2. The current operation executing in `support_bundle_get` to + // be awaiting access to this same underlying resource. + // + // In this specific case, the connection pool would be attempting to + // yield to the `&mut collection` arm, which cannot run, if we were + // awaiting in the body of a different async select arm. This would + // result in a deadlock. + // + // In the future, we may attempt to make access to the connection pool + // safer from concurrent asynchronous access - it is unsettling that + // multiple concurrent `.claim()` functions can cause this behavior - + // but in the meantime, we perform this cancellation check in a single + // future that always is polled concurrently with the collection work. 
+ // Because of this separation, each future is polled until one + // completes, at which point we deterministically exit. + // + // For more details, see: + // https://github.com/oxidecomputer/omicron/issues/9259 + + tokio::select! { + // Returns if the bundle should no longer be collected. + why = self.check_for_cancellation() => { + warn!( + &self.log, + "Support Bundle cancelled - stopping collection"; + "bundle" => %self.bundle.id, + "state" => ?self.bundle.state + ); + return Err(why); + }, + // Otherwise, keep making progress on the collection itself. + report = self.collect_bundle_as_file(&dir) => { + info!( + &self.log, + "Bundle Collection completed"; + "bundle" => %self.bundle.id + ); + return report; + }, + } + } + + async fn store_bundle_on_sled( + &self, + dir: Utf8TempDir, + ) -> anyhow::Result<()> { + // Create the zipfile as a temporary file + let mut zipfile = tokio::fs::File::from_std(bundle_to_zipfile(&dir)?); + let total_len = zipfile.metadata().await?.len(); + + // Collect the hash locally before we send it over the network + // + // We'll use this later during finalization to confirm the bundle + // has been stored successfully. + zipfile.seek(SeekFrom::Start(0)).await?; + let hash = sha2_hash(&mut zipfile).await?; + + // Find the sled where we're storing this bundle. + let sled_id = self + .datastore + .zpool_get_sled_if_in_service( + &self.opctx, + self.bundle.zpool_id.into(), + ) + .await?; + let sled_client = nexus_networking::sled_client( + &self.datastore, + &self.opctx, + sled_id, + &self.log, + ) + .await?; + + let zpool = ZpoolUuid::from(self.bundle.zpool_id); + let dataset = DatasetUuid::from(self.bundle.dataset_id); + let support_bundle = SupportBundleUuid::from(self.bundle.id); + + // Tell this sled to create the bundle. 
+ let creation_result = sled_client + .support_bundle_start_creation(&zpool, &dataset, &support_bundle) + .await + .with_context(|| "Support bundle failed to start creation")?; + + if matches!( + creation_result.state, + sled_agent_client::types::SupportBundleState::Complete + ) { + // Early exit case: the bundle was already created -- we must have either + // crashed or failed between "finalizing" and "writing to the database that we + // finished". + info!(&self.log, "Support bundle was already collected"; "bundle" => %self.bundle.id); + return Ok(()); + } + info!(&self.log, "Support bundle creation started"; "bundle" => %self.bundle.id); + + let mut offset = 0; + while offset < total_len { + // Stream the zipfile to the sled where it should be kept + let mut file = zipfile + .try_clone() + .await + .with_context(|| "Failed to clone zipfile")?; + file.seek(SeekFrom::Start(offset)).await.with_context(|| { + format!("Failed to seek to offset {offset} / {total_len} within zipfile") + })?; + + // Only stream at most "transfer_chunk_size" bytes at once + let chunk_size = std::cmp::min( + self.transfer_chunk_size.get(), + total_len - offset, + ); + + let limited_file = file.take(chunk_size); + let stream = tokio_util::io::ReaderStream::new(limited_file); + let body = reqwest::Body::wrap_stream(stream); + + info!( + &self.log, + "Streaming bundle chunk"; + "bundle" => %self.bundle.id, + "offset" => offset, + "length" => chunk_size, + ); + + sled_client.support_bundle_transfer( + &zpool, &dataset, &support_bundle, offset, body + ).await.with_context(|| { + format!("Failed to transfer bundle: {chunk_size}@{offset} of {total_len} to sled") + })?; + + offset += chunk_size; + } + + sled_client + .support_bundle_finalize( + &zpool, + &dataset, + &support_bundle, + &hash.to_string(), + ) + .await + .with_context(|| "Failed to finalize bundle")?; + + // Returning from this method should drop all temporary storage + // allocated locally for this support bundle. 
+ Ok(()) + } + + // Indefinitely perform periodic checks about whether or not we should + // cancel the bundle. + // + // Returns an error if: + // - The bundle state is no longer SupportBundleState::Collecting + // (which happens if the bundle has been explicitly cancelled, or + // if the backing storage has been expunged). + // - The bundle has been deleted + // + // Otherwise, keeps checking indefinitely while polled. + async fn check_for_cancellation(&self) -> anyhow::Error { + let work_duration = tokio::time::Duration::from_secs(5); + let mut yield_interval = tokio::time::interval_at( + tokio::time::Instant::now() + work_duration, + work_duration, + ); + + loop { + // Timer fired mid-collection - check if we should stop. + yield_interval.tick().await; + trace!( + self.log, + "Checking if Bundle Collection cancelled"; + "bundle" => %self.bundle.id + ); + + match self + .datastore + .support_bundle_get(&self.opctx, self.bundle.id.into()) + .await + { + Ok(SupportBundle { + state: SupportBundleState::Collecting, + .. + }) => { + // Bundle still collecting; continue... + continue; + } + Ok(_) => { + // Not collecting, for any reason: Time to exit + return anyhow::anyhow!("Support Bundle Cancelled"); + } + Err(Error::ObjectNotFound { .. } | Error::NotFound { .. 
}) => { + return anyhow::anyhow!("Support Bundle Deleted"); + } + Err(err) => { + warn!( + self.log, + "Database error checking bundle cancellation"; + InlineErrorChain::new(&err) + ); + + // If we cannot contact the database, retry later + continue; + } + } + } + } + + async fn run_collect_bundle_steps( + self: &Arc, + output: &Utf8TempDir, + mut steps: Vec, + ) -> SupportBundleCollectionReport { + let mut report = + SupportBundleCollectionReport::new(self.bundle.id.into()); + + const MAX_CONCURRENT_STEPS: usize = 16; + let mut tasks = + ParallelTaskSet::new_with_parallelism(MAX_CONCURRENT_STEPS); + + loop { + // Process all the currently-planned steps + while let Some(step) = steps.pop() { + let previous_result = tasks + .spawn({ + let collection = self.clone(); + let dir = output.path().to_path_buf(); + let log = self.log.clone(); + async move { + debug!(log, "Running step"; "step" => &step.name); + step.run(&collection, dir.as_path(), &log).await + } + }) + .await; + + if let Some(output) = previous_result { + output.process(&mut report, &mut steps); + }; + } + + // If we've run out of tasks to spawn, join any of the previously + // spawned tasks, if any exist. + if let Some(output) = tasks.join_next().await { + output.process(&mut report, &mut steps); + + // As soon as any task completes, see if we can spawn more work + // immediately. This ensures that the ParallelTaskSet is + // saturated as much as it can be. + continue; + } + + // Executing steps may create additional steps, as follow-up work. + // + // Only finish if we've exhausted all possible steps and joined all spawned work. 
+ if steps.is_empty() { + // Write trace file before returning + if let Err(err) = self.write_trace_file(output, &report).await { + warn!( + self.log, + "Failed to write trace file"; + "error" => ?err + ); + } + return report; + } + } + } + + // Write a Perfetto Event format JSON file for visualization + async fn write_trace_file( + &self, + output: &Utf8TempDir, + report: &SupportBundleCollectionReport, + ) -> anyhow::Result<()> { + let meta_dir = output.path().join("meta"); + tokio::fs::create_dir_all(&meta_dir).await.with_context(|| { + format!("Failed to create meta directory {meta_dir}") + })?; + + let trace_path = meta_dir.join("trace.json"); + + // Convert steps to Perfetto Trace Event format. + // Sort steps by start time and assign each a unique sequential ID. + // + // This is necessary because the trace event format does not like + // multiple slices to overlap - so we make each slice distinct. + // + // Ideally we'd be able to correlate these with actual tokio tasks, + // but it's hard to convert tokio::task::Id to a u64 because + // of https://github.com/tokio-rs/tokio/issues/7430 + let mut sorted_steps: Vec<_> = report.steps.iter().collect(); + sorted_steps.sort_by_key(|s| s.start); + + // Generate trace events - each step gets a unique ID (1, 2, 3, ...) 
+ // based on its start time order + let trace_events: Vec<_> = sorted_steps + .iter() + .enumerate() + .map(|(i, step)| { + let start_us = step.start.timestamp_micros(); + let duration_us = (step.end - step.start) + .num_microseconds() + .unwrap_or(0) + .max(0); + let step_id = i + 1; + + perfetto::TraceEvent { + name: step.name.clone(), + cat: "bundle_collection".to_string(), + ph: "X".to_string(), + ts: start_us, + dur: duration_us, + pid: 1, + tid: step_id, + args: json!({ + "status": step.status.to_string(), + }), + } + }) + .collect(); + + let trace = perfetto::Trace { + trace_events, + display_time_unit: "ms".to_string(), + }; + + let trace_content = serde_json::to_string_pretty(&trace) + .context("Failed to serialize trace JSON")?; + + tokio::fs::write(&trace_path, trace_content).await.with_context( + || format!("Failed to write trace file to {trace_path}"), + )?; + + info!( + self.log, + "Wrote trace file"; + "path" => %trace_path, + "num_events" => trace.trace_events.len() + ); + + Ok(()) + } + + // Perform the work of collecting the support bundle into a temporary directory + // + // "dir" is an output directory where data can be stored. + // + // If a partial bundle can be collected, it should be returned as + // an Ok(SupportBundleCollectionReport). Any failures from this function + // will prevent the support bundle from being collected altogether. + // + // NOTE: The background task infrastructure will periodically check to see + // if the bundle has been cancelled by a user while it is being collected. + // If that happens, this function will be CANCELLED at an await point. + // + // As a result, it is important that this function be implemented as + // cancel-safe. + // + // The "steps" used within this function - passed to + // [`Self::run_collect_bundle_steps`] - are run on a [`ParallelTaskSet`], + // which automatically aborts tasks when it is dropped. 
+ async fn collect_bundle_as_file( + self: &Arc, + dir: &Utf8TempDir, + ) -> anyhow::Result { + let log = &self.log; + + info!(&log, "Collecting bundle as local file"); + + let cache = Cache::new(); + let steps = steps::all(&cache); + Ok(self.run_collect_bundle_steps(dir, steps).await) + } +} + +// Takes a directory "dir", and zips the contents into a single zipfile. +fn bundle_to_zipfile(dir: &Utf8TempDir) -> anyhow::Result { + let tempfile = tempfile_in(TEMPDIR)?; + let mut zip = ZipWriter::new(tempfile); + + recursively_add_directory_to_zipfile(&mut zip, dir.path(), dir.path())?; + + Ok(zip.finish()?) +} + +fn recursively_add_directory_to_zipfile( + zip: &mut ZipWriter, + root_path: &Utf8Path, + dir_path: &Utf8Path, +) -> anyhow::Result<()> { + // Readdir might return entries in a non-deterministic order. + // Let's sort it for the zipfile, to be nice. + let mut entries = dir_path + .read_dir_utf8()? + .filter_map(Result::ok) + .collect::>(); + entries.sort_by(|a, b| a.file_name().cmp(&b.file_name())); + + for entry in &entries { + // Remove the "/tmp/..." prefix from the path when we're storing it in the + // zipfile. 
+ let dst = entry.path().strip_prefix(root_path)?; + + let file_type = entry.file_type()?; + if file_type.is_file() { + let src = entry.path(); + + let zip_time = entry + .path() + .metadata() + .and_then(|m| m.modified()) + .ok() + .and_then(|sys_time| jiff::Zoned::try_from(sys_time).ok()) + .and_then(|zoned| { + zip::DateTime::try_from(zoned.datetime()).ok() + }) + .unwrap_or_else(zip::DateTime::default); + + let opts = FullFileOptions::default() + .last_modified_time(zip_time) + .compression_method(zip::CompressionMethod::Deflated) + .large_file(true); + + zip.start_file_from_path(dst, opts)?; + let mut file = std::fs::File::open(&src)?; + std::io::copy(&mut file, zip)?; + } + if file_type.is_dir() { + let opts = FullFileOptions::default(); + zip.add_directory_from_path(dst, opts)?; + recursively_add_directory_to_zipfile(zip, root_path, entry.path())?; + } + } + Ok(()) +} + +async fn sha2_hash(file: &mut tokio::fs::File) -> anyhow::Result { + let mut buf = vec![0u8; 65536]; + let mut ctx = Sha256::new(); + loop { + let n = file.read(&mut buf).await?; + if n == 0 { + break; + } + ctx.write_all(&buf[0..n])?; + } + + let digest = ctx.finalize(); + Ok(ArtifactHash(digest.as_slice().try_into()?)) +} + +#[cfg(test)] +mod test { + use super::*; + + use camino_tempfile::tempdir; + + // Ensure that we can convert a temporary directory into a zipfile + #[test] + fn test_zipfile_creation() { + let dir = tempdir().unwrap(); + + std::fs::create_dir_all(dir.path().join("dir-a")).unwrap(); + std::fs::create_dir_all(dir.path().join("dir-b")).unwrap(); + std::fs::write(dir.path().join("dir-a").join("file-a"), "some data") + .unwrap(); + std::fs::write(dir.path().join("file-b"), "more data").unwrap(); + + let zipfile = bundle_to_zipfile(&dir) + .expect("Should have been able to bundle zipfile"); + let archive = zip::read::ZipArchive::new(zipfile).unwrap(); + + // We expect the order to be deterministically alphabetical + let mut names = archive.file_names(); + 
assert_eq!(names.next(), Some("dir-a/")); + assert_eq!(names.next(), Some("dir-a/file-a")); + assert_eq!(names.next(), Some("dir-b/")); + assert_eq!(names.next(), Some("file-b")); + assert_eq!(names.next(), None); + } +} diff --git a/nexus/src/app/background/tasks/support_bundle/mod.rs b/nexus/src/app/background/tasks/support_bundle/mod.rs index 9b7b4ac4aa0..0b62e169bd2 100644 --- a/nexus/src/app/background/tasks/support_bundle/mod.rs +++ b/nexus/src/app/background/tasks/support_bundle/mod.rs @@ -4,4 +4,9 @@ //! Support bundle related types and utilities +mod cache; +pub mod collection; pub mod perfetto; +pub mod request; +mod step; +mod steps; diff --git a/nexus/src/app/background/tasks/support_bundle/request.rs b/nexus/src/app/background/tasks/support_bundle/request.rs new file mode 100644 index 00000000000..90bcbbe3679 --- /dev/null +++ b/nexus/src/app/background/tasks/support_bundle/request.rs @@ -0,0 +1,195 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Support bundle request types and data selection + +use nexus_db_queries::db::datastore::EreportFilters; +use omicron_uuid_kinds::SledUuid; +use std::collections::HashMap; +use std::collections::HashSet; +use std::num::NonZeroU64; + +/// We use "/var/tmp" to use Nexus' filesystem for temporary storage, +/// rather than "/tmp", which would keep this collected data in-memory. +pub const TEMPDIR: &str = "/var/tmp"; + +/// The size of piece of a support bundle to transfer to the sled agent +/// within a single streaming request. +pub const CHUNK_SIZE: NonZeroU64 = NonZeroU64::new(1024 * 1024 * 1024).unwrap(); + +/// Describes the category of support bundle data. 
+#[derive(Debug, Clone, Copy, Hash, Eq, PartialEq)] +pub enum BundleDataCategory { + /// Collects reconfigurator state (some of the latest blueprints, + /// information about the target blueprint). + Reconfigurator, + /// Collects info from sled agents, running a handful of + /// diagnostic commands (e.g., zoneadm, dladm, etc). + HostInfo, + /// Collects sled serial numbers, cubby numbers, and UUIDs. + SledCubbyInfo, + /// Saves task dumps from SPs. + SpDumps, + /// Collects ereports + Ereports, +} + +/// Specifies what data to collect for a bundle data category. +/// +/// Each variant corresponds to a BundleDataCategory. +/// For categories without additional parameters, the variant is a unit variant. +/// For categories that can be filtered or configured, the variant contains +/// that configuration data. +#[derive(Debug, Clone, Eq, PartialEq)] +pub enum BundleData { + Reconfigurator, + HostInfo(HashSet), + SledCubbyInfo, + SpDumps, + Ereports(EreportFilters), +} + +impl BundleData { + fn category(&self) -> BundleDataCategory { + match self { + Self::Reconfigurator => BundleDataCategory::Reconfigurator, + Self::HostInfo(_) => BundleDataCategory::HostInfo, + Self::SledCubbyInfo => BundleDataCategory::SledCubbyInfo, + Self::SpDumps => BundleDataCategory::SpDumps, + Self::Ereports(_) => BundleDataCategory::Ereports, + } + } +} + +/// A collection of bundle data specifications. +/// +/// This wrapper ensures that categories and data always match - you can't +/// insert (BundleDataCategory::Reconfigurator, BundleData::SpDumps) +/// because each BundleData determines its own category. +#[derive(Debug, Clone)] +pub struct BundleDataSelection { + data: HashMap, +} + +impl BundleDataSelection { + pub fn new() -> Self { + Self { data: HashMap::new() } + } + + /// Inserts BundleData to be queried for a particular category within the + /// bundle. 
+ /// + /// Each category of data can only be specified once (e.g., inserting + /// BundleData::HostInfo multiple times will only use the most-recently + /// inserted specification) + pub fn insert(&mut self, bundle_data: BundleData) { + self.data.insert(bundle_data.category(), bundle_data); + } + + pub fn contains(&self, category: BundleDataCategory) -> bool { + self.data.contains_key(&category) + } + + pub fn get(&self, category: BundleDataCategory) -> Option<&BundleData> { + self.data.get(&category) + } +} + +impl FromIterator for BundleDataSelection { + fn from_iter>(iter: T) -> Self { + let mut selection = Self::new(); + for bundle_data in iter { + selection.insert(bundle_data); + } + selection + } +} + +impl Default for BundleDataSelection { + fn default() -> Self { + [ + BundleData::Reconfigurator, + BundleData::HostInfo(HashSet::from([SledSelection::All])), + BundleData::SledCubbyInfo, + BundleData::SpDumps, + BundleData::Ereports(EreportFilters { + start_time: Some(chrono::Utc::now() - chrono::Days::new(7)), + ..EreportFilters::default() + }), + ] + .into_iter() + .collect() + } +} + +/// The set of sleds to include +/// +/// Multiple values of this enum are joined together into a HashSet. +/// Therefore "SledSelection::All" overrides specific sleds. +#[derive(Debug, Clone, Hash, Eq, PartialEq)] +pub enum SledSelection { + All, + Specific(SledUuid), +} + +/// Specifies the data to be collected within the Support Bundle. +#[derive(Clone)] +pub struct BundleRequest { + /// The size of chunks to use when transferring a bundle from Nexus + /// to a sled agent. + /// + /// Typically, this is CHUNK_SIZE, but can be modified for testing. + pub transfer_chunk_size: NonZeroU64, + + /// The set of data to be included within this bundle. + /// + /// Maps each category to its filter. If a category is not in the map, + /// it is excluded from the bundle. 
+ pub data_selection: BundleDataSelection, +} + +impl BundleRequest { + pub fn include_reconfigurator_data(&self) -> bool { + self.data_selection.contains(BundleDataCategory::Reconfigurator) + } + + pub fn include_host_info(&self) -> bool { + self.data_selection.contains(BundleDataCategory::HostInfo) + } + + pub fn include_sled_host_info(&self, id: SledUuid) -> bool { + let selection = + match self.data_selection.get(BundleDataCategory::HostInfo) { + Some(BundleData::HostInfo(selection)) => selection, + _ => return false, + }; + + selection.contains(&SledSelection::Specific(id)) + || selection.contains(&SledSelection::All) + } + + pub fn get_ereport_filters(&self) -> Option<&EreportFilters> { + match self.data_selection.get(BundleDataCategory::Ereports) { + Some(BundleData::Ereports(filters)) => Some(filters), + _ => None, + } + } + + pub fn include_sled_cubby_info(&self) -> bool { + self.data_selection.contains(BundleDataCategory::SledCubbyInfo) + } + + pub fn include_sp_dumps(&self) -> bool { + self.data_selection.contains(BundleDataCategory::SpDumps) + } +} + +impl Default for BundleRequest { + fn default() -> Self { + Self { + transfer_chunk_size: CHUNK_SIZE, + data_selection: BundleDataSelection::default(), + } + } +} diff --git a/nexus/src/app/background/tasks/support_bundle/step.rs b/nexus/src/app/background/tasks/support_bundle/step.rs new file mode 100644 index 00000000000..5909265b976 --- /dev/null +++ b/nexus/src/app/background/tasks/support_bundle/step.rs @@ -0,0 +1,129 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! 
Support bundle collection step execution framework + +use crate::app::background::tasks::support_bundle::collection::BundleCollection; + +use camino::Utf8Path; +use chrono::DateTime; +use chrono::Utc; +use futures::future::BoxFuture; +use nexus_types::internal_api::background::SupportBundleCollectionReport; +use nexus_types::internal_api::background::SupportBundleCollectionStep; +use nexus_types::internal_api::background::SupportBundleCollectionStepStatus; +use nexus_types::internal_api::background::SupportBundleEreportStatus; +use slog_error_chain::InlineErrorChain; +use std::sync::Arc; + +// This type describes a single step in the Support Bundle collection. +// +// - All steps have access to the "BundleCollection", which includes +// tools for actually acquiring data. +// - All steps have access to an output directory where they can store +// serialized data to a file. +// - Finally, all steps can emit a "CollectionStepOutput", which can either +// update the collection report, or generate more steps. 
+pub type CollectionStepFn = Box<
+ dyn for<'b> FnOnce(
+ &'b Arc,
+ &'b Utf8Path,
+ )
+ -> BoxFuture<'b, anyhow::Result>
+ + Send,
+>;
+
+pub struct CollectionStep {
+ pub name: String,
+ pub step_fn: CollectionStepFn,
+}
+
+impl CollectionStep {
+ pub fn new(name: impl Into, step_fn: CollectionStepFn) -> Self {
+ Self { name: name.into(), step_fn }
+ }
+
+ pub async fn run(
+ self,
+ collection: &Arc,
+ output: &Utf8Path,
+ log: &slog::Logger,
+ ) -> CompletedCollectionStep {
+ let start = Utc::now();
+
+ let output = (self.step_fn)(collection, output)
+ .await
+ .inspect_err(|err| {
+ warn!(
+ log,
+ "Step failed";
+ "step" => &self.name,
+ InlineErrorChain::new(err.as_ref()),
+ );
+ })
+ .unwrap_or_else(|err| CollectionStepOutput::Failed(err));
+
+ let end = Utc::now();
+
+ CompletedCollectionStep { name: self.name, start, end, output }
+ }
+}
+
+pub struct CompletedCollectionStep {
+ pub name: String,
+ pub start: DateTime,
+ pub end: DateTime,
+ pub output: CollectionStepOutput,
+}
+
+impl CompletedCollectionStep {
+ // Updates the collection report based on the output of a collection step,
+ // and possibly extends the set of all steps to be executed.
+ pub fn process(
+ self,
+ report: &mut SupportBundleCollectionReport,
+ steps: &mut Vec,
+ ) {
+ use SupportBundleCollectionStepStatus as Status;
+
+ let status = match self.output {
+ CollectionStepOutput::Skipped => Status::Skipped,
+ CollectionStepOutput::Failed(err) => {
+ Status::Failed(err.to_string())
+ }
+ CollectionStepOutput::Ereports(status) => {
+ report.ereports = Some(status);
+ Status::Ok
+ }
+ CollectionStepOutput::Spawn { extra_steps } => {
+ steps.extend(extra_steps);
+ Status::Ok
+ }
+ CollectionStepOutput::None => Status::Ok,
+ };
+
+ // Add information about this completed step to the bundle report. 
+ let step = SupportBundleCollectionStep { + name: self.name, + start: self.start, + end: self.end, + status, + }; + report.steps.push(step); + } +} + +pub enum CollectionStepOutput { + // The step was not executed intentionally + Skipped, + // The step encountered a fatal error and could not complete. + // + // It may have still saved a partial set of data to the bundle. + Failed(anyhow::Error), + Ereports(SupportBundleEreportStatus), + // The step spawned additional steps to execute + Spawn { extra_steps: Vec }, + // The step completed with nothing to report, and no follow-up steps + None, +} diff --git a/nexus/src/app/background/tasks/support_bundle/steps/bundle_id.rs b/nexus/src/app/background/tasks/support_bundle/steps/bundle_id.rs new file mode 100644 index 00000000000..392dfc21ea6 --- /dev/null +++ b/nexus/src/app/background/tasks/support_bundle/steps/bundle_id.rs @@ -0,0 +1,22 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Collects metadata about the bundle itself (currently only the ID) + +use crate::app::background::tasks::support_bundle::collection::BundleCollection; +use crate::app::background::tasks::support_bundle::step::CollectionStepOutput; +use camino::Utf8Path; + +pub async fn collect( + collection: &BundleCollection, + dir: &Utf8Path, +) -> anyhow::Result { + tokio::fs::write( + dir.join("bundle_id.txt"), + collection.bundle().id.to_string(), + ) + .await?; + + Ok(CollectionStepOutput::None) +} diff --git a/nexus/src/app/background/tasks/support_bundle/steps/ereports.rs b/nexus/src/app/background/tasks/support_bundle/steps/ereports.rs new file mode 100644 index 00000000000..24d8272aefb --- /dev/null +++ b/nexus/src/app/background/tasks/support_bundle/steps/ereports.rs @@ -0,0 +1,211 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. 
If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Collect ereports for support bundles + +use crate::app::background::tasks::support_bundle::collection::BundleCollection; +use crate::app::background::tasks::support_bundle::step::CollectionStepOutput; + +use anyhow::Context; +use camino::Utf8Path; +use camino::Utf8PathBuf; +use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::DataStore; +use nexus_db_queries::db::datastore; +use nexus_db_queries::db::datastore::EreportFilters; +use nexus_db_queries::db::pagination::Paginator; +use nexus_types::fm::Ereport; +use nexus_types::internal_api::background::SupportBundleEreportStatus; +use omicron_uuid_kinds::GenericUuid; +use slog::Logger; +use slog_error_chain::InlineErrorChain; +use std::sync::Arc; + +pub async fn collect( + collection: &BundleCollection, + dir: &Utf8Path, +) -> anyhow::Result { + let (log, opctx, datastore, request) = ( + collection.log(), + collection.opctx(), + collection.datastore(), + collection.request(), + ); + let ereport_filters = request.get_ereport_filters(); + + let Some(ereport_filters) = ereport_filters else { + debug!(log, "Support bundle: ereports not requested"); + return Ok(CollectionStepOutput::Skipped); + }; + let ereports_dir = dir.join("ereports"); + let mut status = SupportBundleEreportStatus::default(); + if let Err(err) = save_ereports( + log, + opctx, + datastore, + ereport_filters.clone(), + ereports_dir, + &mut status, + ) + .await + { + warn!( + log, + "Support bundle: ereport collection failed \ + ({} collected successfully)", + status.n_collected; + InlineErrorChain::new(err.as_ref()) + ); + status.errors.push(InlineErrorChain::new(err.as_ref()).to_string()); + }; + + Ok(CollectionStepOutput::Ereports(status)) +} + +async fn save_ereports( + log: &Logger, + opctx: &OpContext, + datastore: &Arc, + filters: EreportFilters, + dir: Utf8PathBuf, + status: &mut SupportBundleEreportStatus, +) -> 
anyhow::Result<()> { + let mut paginator = Paginator::new( + datastore::SQL_BATCH_SIZE, + dropshot::PaginationOrder::Ascending, + ); + while let Some(p) = paginator.next() { + let ereports = datastore + .ereport_fetch_matching(&opctx, &filters, &p.current_pagparams()) + .await + .map_err(|e| e.internal_context("failed to query for ereports"))?; + paginator = p.found_batch(&ereports, &|ereport| { + (ereport.restart_id.into_untyped_uuid(), ereport.ena) + }); + + let prev_n_collected = status.n_collected; + let n_ereports = ereports.len(); + status.n_found += n_ereports; + + for ereport in ereports { + match ereport.try_into() { + Ok(ereport) => { + write_ereport(ereport, &dir).await?; + status.n_collected += 1; + } + Err(err) => { + warn!(log, "invalid ereport"; "error" => %err); + status.errors.push(err.to_string()); + } + } + } + debug!( + log, + "Support bundle: added {} ereports ({} found)", + status.n_collected - prev_n_collected, + n_ereports + ); + } + + info!( + log, + "Support bundle: collected {} total ereports", status.n_collected + ); + Ok(()) +} + +async fn write_ereport(ereport: Ereport, dir: &Utf8Path) -> anyhow::Result<()> { + // Here's where we construct the file path for each ereport JSON file, + // given the top-level ereport directory path. Each ereport is stored in a + // subdirectory for the part and serial numbers of the system that produced + // the ereport. Part numbers must be included in addition to serial + // numbers, as the v1 serial scheme only guarantees uniqueness within a + // part number. These paths take the following form: + // + // {part-number}-{serial_number}/{restart_id}/{ENA}.json + // + // We can assume that the restart ID and ENA consist only of + // filesystem-safe characters, as the restart ID is known to be a UUID, and + // the ENA is just an integer. 
For the serial and part numbers, which + // Nexus doesn't have full control over --- it came from the ereport + // metadata --- we must check that it doesn't contain any characters + // unsuitable for use in a filesystem path. + let pn = ereport + .data + .part_number + .as_deref() + // If the part or serial numbers contain any unsavoury characters, it + // goes in the `unknown_serial` hole! Note that the alleged serial + // number from the ereport will still be present in the JSON as a + // string, so we're not *lying* about what was received; we're just + // giving up on using it in the path. + .filter(|&s| is_fs_safe_single_path_component(s)) + .unwrap_or("unknown_part"); + let sn = ereport + .data + .serial_number + .as_deref() + .filter(|&s| is_fs_safe_single_path_component(s)) + .unwrap_or("unknown_serial"); + let id = &ereport.data.id; + + let dir = dir + .join(format!("{pn}-{sn}")) + // N.B. that we call `into_untyped_uuid()` here, as the `Display` + // implementation for a typed UUID appends " (ereporter_restart)", which + // we don't want. + .join(id.restart_id.into_untyped_uuid().to_string()); + tokio::fs::create_dir_all(&dir) + .await + .with_context(|| format!("failed to create directory '{dir}'"))?; + let file_path = dir.join(format!("{}.json", id.ena)); + let json = serde_json::to_vec(&ereport).with_context(|| { + format!("failed to serialize ereport {pn}:{sn}/{id}") + })?; + tokio::fs::write(&file_path, json) + .await + .with_context(|| format!("failed to write '{file_path}'")) +} + +fn is_fs_safe_single_path_component(s: &str) -> bool { + // Might be path traversal... + if s == "." || s == ".." { + return false; + } + + if s == "~" { + return false; + } + + const BANNED_CHARS: &[char] = &[ + // Check for path separators. + // + // Naively, we might reach for `std::path::is_separator()` here. 
+ // However, this function only checks if a path is a permitted + // separator on the *current* platform --- so, running on illumos, we + // will only check for Unix path separators. But, because the support + // bundle may be extracted on a workstation system by Oxide support + // personnel or by the customer, we should also make sure we don't + // allow the use of Windows path separators, which `is_separator()` + // won't check for on Unix systems. + '/', '\\', + // Characters forbidden on Windows, per: + // https://learn.microsoft.com/en-us/windows/win32/fileio/naming-a-file#naming-conventions + '<', '>', ':', '"', '|', '?', '*', + ]; + + // Rather than using `s.contains()`, we do all the checks in one pass. + for c in s.chars() { + if BANNED_CHARS.contains(&c) { + return false; + } + + // Definitely no control characters! + if c.is_control() { + return false; + } + } + + true +} diff --git a/nexus/src/app/background/tasks/support_bundle/steps/host_info.rs b/nexus/src/app/background/tasks/support_bundle/steps/host_info.rs new file mode 100644 index 00000000000..0e47aa5b0b5 --- /dev/null +++ b/nexus/src/app/background/tasks/support_bundle/steps/host_info.rs @@ -0,0 +1,338 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! 
Collect host information from sleds for support bundles + +use crate::app::background::tasks::support_bundle::cache::Cache; +use crate::app::background::tasks::support_bundle::collection::BundleCollection; +use crate::app::background::tasks::support_bundle::step::CollectionStep; +use crate::app::background::tasks::support_bundle::step::CollectionStepOutput; + +use anyhow::Context; +use anyhow::bail; +use camino::Utf8Path; +use futures::FutureExt; +use futures::StreamExt; +use futures::future::Future; +use futures::stream::FuturesUnordered; +use nexus_db_model::Sled; +use nexus_networking; +use nexus_types::identity::Asset; +use tokio::io::AsyncWriteExt; + +pub async fn spawn_query_all_sleds( + collection: &BundleCollection, + cache: &Cache, +) -> anyhow::Result { + let request = collection.request(); + + if !request.include_host_info() { + return Ok(CollectionStepOutput::Skipped); + } + + let all_sleds = cache.get_or_initialize_all_sleds(collection).await; + + let Some(all_sleds) = all_sleds else { + bail!("Could not read list of sleds"); + }; + + let mut extra_steps: Vec = vec![]; + for sled in all_sleds { + if !request.include_sled_host_info(sled.id()) { + continue; + } + + let sled = sled.clone(); + extra_steps.push(CollectionStep::new( + format!("sled data for sled {}", sled.id()), + Box::new({ + move |collection, dir| { + async move { + collect_data_from_sled(collection, sled, dir).await + } + .boxed() + } + }), + )) + } + + Ok(CollectionStepOutput::Spawn { extra_steps }) +} + +// Collect data from a sled, storing it into a directory that will +// be turned into a support bundle. +// +// - "sled" is the sled from which we should collect data. +// - "dir" is a directory where data can be stored, to be turned +// into a bundle after collection completes. 
+async fn collect_data_from_sled( + collection: &BundleCollection, + sled: Sled, + dir: &Utf8Path, +) -> anyhow::Result { + let (log, opctx, datastore, request) = ( + collection.log(), + collection.opctx(), + collection.datastore(), + collection.request(), + ); + + if !request.include_sled_host_info(sled.id()) { + return Ok(CollectionStepOutput::Skipped); + } + + info!(log, "Collecting bundle info from sled"; "sled" => %sled.id()); + let sled_path = dir + .join("rack") + .join(sled.rack_id.to_string()) + .join("sled") + .join(sled.id().to_string()); + tokio::fs::create_dir_all(&sled_path).await?; + tokio::fs::write(sled_path.join("sled.txt"), format!("{sled:?}")).await?; + + let sled_client = match nexus_networking::sled_client( + &datastore, + &opctx, + sled.id(), + log, + ) + .await + { + Ok(client) => client, + Err(err) => { + tokio::fs::write( + sled_path.join("error.txt"), + "Could not contact sled", + ) + .await.with_context(|| { + format!("Failed to save 'error.txt' to bundle when recording error: {err}") + })?; + bail!("Could not contact sled: {err}"); + } + }; + + // NB: As new sled-diagnostic commands are added they should + // be added to this array so that their output can be saved + // within the support bundle. 
+ let mut diag_cmds = futures::stream::iter([
+ save_diag_cmd_output_or_error(
+ &sled_path,
+ "zoneadm",
+ sled_client.support_zoneadm_info(),
+ )
+ .boxed(),
+ save_diag_cmd_output_or_error(
+ &sled_path,
+ "dladm",
+ sled_client.support_dladm_info(),
+ )
+ .boxed(),
+ save_diag_cmd_output_or_error(
+ &sled_path,
+ "ipadm",
+ sled_client.support_ipadm_info(),
+ )
+ .boxed(),
+ save_diag_cmd_output_or_error(
+ &sled_path,
+ "nvmeadm",
+ sled_client.support_nvmeadm_info(),
+ )
+ .boxed(),
+ save_diag_cmd_output_or_error(
+ &sled_path,
+ "pargs",
+ sled_client.support_pargs_info(),
+ )
+ .boxed(),
+ save_diag_cmd_output_or_error(
+ &sled_path,
+ "pfiles",
+ sled_client.support_pfiles_info(),
+ )
+ .boxed(),
+ save_diag_cmd_output_or_error(
+ &sled_path,
+ "pstack",
+ sled_client.support_pstack_info(),
+ )
+ .boxed(),
+ save_diag_cmd_output_or_error(
+ &sled_path,
+ "zfs",
+ sled_client.support_zfs_info(),
+ )
+ .boxed(),
+ save_diag_cmd_output_or_error(
+ &sled_path,
+ "zpool",
+ sled_client.support_zpool_info(),
+ )
+ .boxed(),
+ save_diag_cmd_output_or_error(
+ &sled_path,
+ "health-check",
+ sled_client.support_health_check(),
+ )
+ .boxed(),
+ ])
+ // Currently we execute up to 10 commands concurrently which
+ // might be doing their own concurrent work, for example
+ // collecting `pstack` output of every Oxide process that is
+ // found on a sled.
+ .buffer_unordered(10);
+
+ while let Some(result) = diag_cmds.next().await {
+ // Log that we failed to write the diag command output to a
+ // file but don't return early as we wish to get as much
+ // information as we can.
+ if let Err(e) = result {
+ error!(
+ log,
+ "failed to write diagnostic command output to \
+ file: {e}"
+ );
+ }
+ }
+
+ // For each zone we concurrently fire off a request to its
+ // sled-agent to collect its logs in a zip file and write the
+ // result to the support bundle. 
+ let zones = sled_client.support_logs().await?.into_inner();
+ let mut log_futs: FuturesUnordered<_> = zones
+ .iter()
+ .map(|zone| {
+ save_zone_log_zip_or_error(log, &sled_client, zone, &sled_path)
+ })
+ .collect();
+
+ while let Some(log_collection_result) = log_futs.next().await {
+ // We log any errors saving the zip file to disk and
+ // continue on.
+ if let Err(e) = log_collection_result {
+ error!(log, "failed to write logs output: {e}");
+ }
+ }
+ Ok(CollectionStepOutput::None)
+}
+
+// Run a `sled-diagnostics` future and save its output to a corresponding file.
+async fn save_diag_cmd_output_or_error(
+ path: &Utf8Path,
+ command: &str,
+ future: F,
+) -> anyhow::Result<()>
+where
+ F: Future<
+ Output = Result<
+ sled_agent_client::ResponseValue,
+ sled_agent_client::Error,
+ >,
+ > + Send,
+{
+ let result = future.await;
+ match result {
+ Ok(result) => {
+ let output = result.into_inner();
+ let json = serde_json::to_string(&output).with_context(|| {
+ format!("failed to serialize {command} output as json")
+ })?;
+ tokio::fs::write(path.join(format!("{command}.json")), json)
+ .await
+ .with_context(|| {
+ format!("failed to write output of {command} to file")
+ })?;
+ }
+ Err(err) => {
+ tokio::fs::write(
+ path.join(format!("{command}_err.txt")),
+ err.to_string(),
+ )
+ .await?;
+ }
+ }
+ Ok(())
+}
+
+async fn save_zone_log_zip_or_error(
+ logger: &slog::Logger,
+ client: &sled_agent_client::Client,
+ zone: &str,
+ path: &Utf8Path,
+) -> anyhow::Result<()> {
+ // In the future when support bundle collection exposes tuning parameters
+ // this can turn into a collection parameter.
+ const DEFAULT_MAX_ROTATED_LOGS: u32 = 5;
+
+ match client.support_logs_download(zone, DEFAULT_MAX_ROTATED_LOGS).await {
+ Ok(res) => {
+ let bytestream = res.into_inner();
+ let output_dir = path.join(format!("logs/{zone}"));
+ let output_path = output_dir.join("logs.zip");
+
+ // Ensure the logs output directory exists.
+ tokio::fs::create_dir_all(&output_dir).await.with_context( + || format!("failed to create output directory: {output_dir}"), + )?; + + // Stream the log zip file to disk. + let mut file = + tokio::fs::File::create(&output_path).await.with_context( + || format!("failed to create log zip file: {output_path}"), + )?; + + let stream = bytestream.into_inner().map(|chunk| { + chunk.map_err(|e| std::io::Error::other(e.to_string())) + }); + let mut reader = tokio_util::io::StreamReader::new(stream); + let _nbytes = tokio::io::copy(&mut reader, &mut file).await?; + file.flush().await?; + + // Unzip the log file into the same directory. + let output_path_unzip = output_dir.join("unzipped_logs"); + let zipfile_path = output_path.clone(); + tokio::task::spawn_blocking(move || { + extract_zip_file(&output_path_unzip, &zipfile_path) + }) + .await + .map_err(|join_error| { + anyhow::anyhow!(join_error) + .context("unzipping support bundle logs zip panicked") + })??; + + // Clean up the zip file that was written to disk. 
+ if let Err(e) = tokio::fs::remove_file(&output_path).await { + error!( + logger, + "failed to cleanup temporary logs zip file"; + "error" => %e, + "file" => %output_path, + + ); + } + } + Err(err) => { + tokio::fs::write( + path.join(format!("{zone}.logs.err")), + err.to_string(), + ) + .await?; + } + }; + + Ok(()) +} + +fn extract_zip_file( + output_dir: &Utf8Path, + zip_file: &Utf8Path, +) -> Result<(), anyhow::Error> { + let mut zip = std::fs::File::open(&zip_file) + .with_context(|| format!("failed to open zip file: {zip_file}"))?; + let mut archive = zip::ZipArchive::new(&mut zip)?; + archive.extract(&output_dir).with_context(|| { + format!("failed to extract log zip file to: {output_dir}") + })?; + Ok(()) +} diff --git a/nexus/src/app/background/tasks/support_bundle/steps/mod.rs b/nexus/src/app/background/tasks/support_bundle/steps/mod.rs new file mode 100644 index 00000000000..d2179c74b8c --- /dev/null +++ b/nexus/src/app/background/tasks/support_bundle/steps/mod.rs @@ -0,0 +1,89 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Individual support bundle collection steps + +use crate::app::background::tasks::support_bundle::cache::Cache; +use crate::app::background::tasks::support_bundle::step::CollectionStep; +use futures::FutureExt; +use nexus_types::internal_api::background::SupportBundleCollectionStep; + +mod bundle_id; +mod ereports; +mod host_info; +mod reconfigurator; +mod sled_cubby; +mod sp_dumps; + +/// Returns all steps necessary to collect a bundle. +/// +/// Note that these steps themselves may spawn additional steps while executing +/// (e.g., there is a step to read the set of sleds, from which additional +/// sled-specific steps may be created). 
+pub fn all(cache: &Cache) -> Vec { + vec![ + CollectionStep::new( + SupportBundleCollectionStep::STEP_BUNDLE_ID, + Box::new(|collection, dir| { + bundle_id::collect(collection, dir).boxed() + }), + ), + CollectionStep::new( + SupportBundleCollectionStep::STEP_RECONFIGURATOR_STATE, + Box::new(|collection, dir| { + reconfigurator::collect(collection, dir).boxed() + }), + ), + CollectionStep::new( + SupportBundleCollectionStep::STEP_EREPORTS, + Box::new(|collection, dir| { + ereports::collect(collection, dir).boxed() + }), + ), + CollectionStep::new( + SupportBundleCollectionStep::STEP_SLED_CUBBY_INFO, + Box::new({ + let cache = cache.clone(); + move |collection, dir| { + async move { + sled_cubby::collect( + collection, + &cache, + dir + ).await + } + .boxed() + } + }), + ), + CollectionStep::new( + SupportBundleCollectionStep::STEP_SPAWN_SP_DUMPS, + Box::new({ + let cache = cache.clone(); + move |collection, dir| { + async move { + sp_dumps::spawn_collection_steps( + collection, &cache, dir, + ) + .await + } + .boxed() + } + }), + ), + CollectionStep::new( + SupportBundleCollectionStep::STEP_SPAWN_SLEDS, + Box::new({ + let cache = cache.clone(); + move |collection, _| { + async move { + host_info::spawn_query_all_sleds(collection, &cache) + .await + } + .boxed() + } + }), + ), + ] +} diff --git a/nexus/src/app/background/tasks/support_bundle/steps/reconfigurator.rs b/nexus/src/app/background/tasks/support_bundle/steps/reconfigurator.rs new file mode 100644 index 00000000000..802b9e0953e --- /dev/null +++ b/nexus/src/app/background/tasks/support_bundle/steps/reconfigurator.rs @@ -0,0 +1,64 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! 
Collect reconfigurator state for support bundles + +use crate::app::background::tasks::support_bundle::collection::BundleCollection; +use crate::app::background::tasks::support_bundle::step::CollectionStepOutput; + +use anyhow::Context; +use camino::Utf8Path; +use nexus_reconfigurator_preparation::reconfigurator_state_load; + +pub async fn collect( + collection: &BundleCollection, + dir: &Utf8Path, +) -> anyhow::Result { + let (log, opctx, datastore, request) = ( + collection.log(), + collection.opctx(), + collection.datastore(), + collection.request(), + ); + + if !request.include_reconfigurator_data() { + return Ok(CollectionStepOutput::Skipped); + } + + // Collect reconfigurator state + const NMAX_BLUEPRINTS: usize = 300; + match reconfigurator_state_load(&opctx, &datastore, NMAX_BLUEPRINTS).await { + Ok(state) => { + let file_path = dir.join("reconfigurator_state.json"); + let file = std::fs::OpenOptions::new() + .create(true) + .write(true) + .truncate(true) + .open(&file_path) + .with_context(|| format!("failed to open {}", file_path))?; + serde_json::to_writer_pretty(&file, &state).with_context(|| { + format!( + "failed to serialize reconfigurator state to {}", + file_path + ) + })?; + info!( + log, + "Support bundle: collected reconfigurator state"; + "target_blueprint" => ?state.target_blueprint, + "num_blueprints" => state.blueprints.len(), + "num_collections" => state.collections.len(), + ); + } + Err(err) => { + warn!( + log, + "Support bundle: failed to collect reconfigurator state"; + "err" => ?err, + ); + } + }; + + Ok(CollectionStepOutput::None) +} diff --git a/nexus/src/app/background/tasks/support_bundle/steps/sled_cubby.rs b/nexus/src/app/background/tasks/support_bundle/steps/sled_cubby.rs new file mode 100644 index 00000000000..47755d247af --- /dev/null +++ b/nexus/src/app/background/tasks/support_bundle/steps/sled_cubby.rs @@ -0,0 +1,146 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. 
If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Collect sled cubby information for support bundles + +use crate::app::background::tasks::support_bundle::cache::Cache; +use crate::app::background::tasks::support_bundle::collection::BundleCollection; +use crate::app::background::tasks::support_bundle::step::CollectionStepOutput; + +use anyhow::Context; +use anyhow::bail; +use camino::Utf8Path; +use gateway_client::Client as MgsClient; +use gateway_client::types::SpIdentifier; +use gateway_client::types::SpIgnition; +use gateway_types::component::SpType; +use nexus_db_model::Sled; +use omicron_uuid_kinds::GenericUuid; +use serde::Serialize; +use slog::Logger; +use slog_error_chain::InlineErrorChain; +use std::collections::BTreeMap; +use uuid::Uuid; + +pub async fn collect( + collection: &BundleCollection, + cache: &Cache, + dir: &Utf8Path, +) -> anyhow::Result { + let (log, request) = (collection.log(), collection.request()); + + if !request.include_sled_cubby_info() { + return Ok(CollectionStepOutput::Skipped); + } + + let mgs_client_option = + cache.get_or_initialize_mgs_client(&collection).await; + let nexus_sleds = cache + .get_or_initialize_all_sleds(&collection) + .await + .map_or(&[][..], |v| v.as_slice()); + + let Some(mgs_client) = mgs_client_option else { + bail!("Could not initialize MGS client"); + }; + + write_sled_cubby_info(log, mgs_client, nexus_sleds, dir).await?; + + Ok(CollectionStepOutput::None) +} + +async fn write_sled_cubby_info( + log: &Logger, + mgs_client: &MgsClient, + nexus_sleds: &[Sled], + dir: &Utf8Path, +) -> anyhow::Result<()> { + #[derive(Serialize)] + struct SledInfo { + cubby: Option, + uuid: Option, + } + + let available_sps = get_available_sps(&mgs_client) + .await + .context("failed to get available SPs")?; + + // We can still get a useful mapping of cubby to serial using just the data from MGS. 
+ let mut nexus_map: BTreeMap<_, _> = nexus_sleds + .into_iter() + .map(|sled| (sled.serial_number(), sled)) + .collect(); + + let mut sled_info = BTreeMap::new(); + for sp in + available_sps.into_iter().filter(|sp| matches!(sp.type_, SpType::Sled)) + { + let sp_state = match mgs_client.sp_get(&sp.type_, sp.slot).await { + Ok(s) => s.into_inner(), + Err(e) => { + error!(log, + "Failed to get SP state for sled_info.json"; + "cubby" => sp.slot, + "component" => %sp.type_, + "error" => InlineErrorChain::new(&e) + ); + continue; + } + }; + + if let Some(sled) = nexus_map.remove(sp_state.serial_number.as_str()) { + sled_info.insert( + sp_state.serial_number.to_string(), + SledInfo { + cubby: Some(sp.slot), + uuid: Some(*sled.identity.id.as_untyped_uuid()), + }, + ); + } else { + sled_info.insert( + sp_state.serial_number.to_string(), + SledInfo { cubby: Some(sp.slot), uuid: None }, + ); + } + } + + // Sleds not returned by MGS. + for (serial, sled) in nexus_map { + sled_info.insert( + serial.to_string(), + SledInfo { + cubby: None, + uuid: Some(*sled.identity.id.as_untyped_uuid()), + }, + ); + } + + let json = serde_json::to_string_pretty(&sled_info) + .context("failed to serialize sled info to JSON")?; + tokio::fs::write(dir.join("sled_info.json"), json).await?; + + Ok(()) +} + +pub async fn get_available_sps( + mgs_client: &MgsClient, +) -> anyhow::Result> { + let ignition_info = mgs_client + .ignition_list() + .await + .context("failed to get ignition info from MGS")? + .into_inner(); + + let mut active_sps = Vec::new(); + for info in ignition_info { + if let SpIgnition::Yes { power, flt_sp, .. } = info.details { + // Only return SPs that are powered on and are not in a faulted state. 
+ if power && !flt_sp { + active_sps.push(info.id); + } + } + } + + Ok(active_sps) +} diff --git a/nexus/src/app/background/tasks/support_bundle/steps/sp_dumps.rs b/nexus/src/app/background/tasks/support_bundle/steps/sp_dumps.rs new file mode 100644 index 00000000000..2c745dd7649 --- /dev/null +++ b/nexus/src/app/background/tasks/support_bundle/steps/sp_dumps.rs @@ -0,0 +1,110 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Collect SP task dumps for support bundles + +use crate::app::background::tasks::support_bundle::cache::Cache; +use crate::app::background::tasks::support_bundle::collection::BundleCollection; +use crate::app::background::tasks::support_bundle::step::CollectionStep; +use crate::app::background::tasks::support_bundle::step::CollectionStepOutput; +use crate::app::background::tasks::support_bundle::steps; + +use anyhow::Context; +use anyhow::bail; +use base64::Engine; +use camino::Utf8Path; +use futures::FutureExt; +use gateway_client::Client as MgsClient; +use gateway_client::types::SpIdentifier; + +pub async fn spawn_collection_steps( + collection: &BundleCollection, + cache: &Cache, + dir: &Utf8Path, +) -> anyhow::Result { + let request = collection.request(); + + if !request.include_sp_dumps() { + return Ok(CollectionStepOutput::Skipped); + } + + let Some(mgs_client) = cache.get_or_initialize_mgs_client(collection).await + else { + bail!("Could not initialize MGS client"); + }; + + let sp_dumps_dir = dir.join("sp_task_dumps"); + tokio::fs::create_dir_all(&sp_dumps_dir).await.with_context(|| { + format!("Failed to create SP task dump directory {sp_dumps_dir}") + })?; + + let mut extra_steps: Vec = vec![]; + for sp in steps::sled_cubby::get_available_sps(&mgs_client).await? 
{ + extra_steps.push(CollectionStep::new( + format!("SP dump for {:?}", sp), + Box::new({ + let mgs_client = mgs_client.clone(); + move |collection, dir| { + async move { + collect_sp_dump(collection, &mgs_client, sp, dir).await + } + .boxed() + } + }), + )); + } + + Ok(CollectionStepOutput::Spawn { extra_steps }) +} + +async fn collect_sp_dump( + collection: &BundleCollection, + mgs_client: &MgsClient, + sp: SpIdentifier, + dir: &Utf8Path, +) -> anyhow::Result { + if !collection.request().include_sp_dumps() { + return Ok(CollectionStepOutput::Skipped); + } + + save_sp_dumps(mgs_client, sp, dir).await.with_context(|| { + format!("failed to save SP dump from: {} {}", sp.type_, sp.slot) + })?; + + Ok(CollectionStepOutput::None) +} + +async fn save_sp_dumps( + mgs_client: &MgsClient, + sp: SpIdentifier, + sp_dumps_dir: &Utf8Path, +) -> anyhow::Result<()> { + let dump_count = mgs_client + .sp_task_dump_count(&sp.type_, sp.slot) + .await + .context("failed to get task dump count from SP")? + .into_inner(); + + let output_dir = sp_dumps_dir.join(format!("{}_{}", sp.type_, sp.slot)); + tokio::fs::create_dir_all(&output_dir).await.with_context(|| { + format!("Failed to create output directory {output_dir}") + })?; + + for i in 0..dump_count { + let task_dump = mgs_client + .sp_task_dump_get(&sp.type_, sp.slot, i) + .await + .with_context(|| format!("failed to get task dump {i} from SP"))? 
+ .into_inner(); + + let zip_bytes = base64::engine::general_purpose::STANDARD + .decode(task_dump.base64_zip) + .context("failed to decode base64-encoded SP task dump zip")?; + + tokio::fs::write(output_dir.join(format!("dump-{i}.zip")), zip_bytes) + .await + .context("failed to write SP task dump zip to disk")?; + } + Ok(()) +} diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 0d4939f3acc..887be497a17 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -5,269 +5,37 @@ //! Background task for managing Support Bundles use crate::app::background::BackgroundTask; -use anyhow::Context; -use anyhow::bail; -use base64::Engine; -use camino::Utf8DirEntry; -use camino::Utf8Path; -use camino::Utf8PathBuf; -use camino_tempfile::Utf8TempDir; -use camino_tempfile::tempdir_in; -use camino_tempfile::tempfile_in; -use chrono::DateTime; -use chrono::Utc; use futures::FutureExt; -use futures::StreamExt; use futures::future::BoxFuture; -use futures::stream::FuturesUnordered; -use gateway_client::Client as MgsClient; -use gateway_client::types::SpIdentifier; -use gateway_client::types::SpIgnition; -use gateway_types::component::SpType; use internal_dns_resolver::Resolver; -use internal_dns_types::names::ServiceName; -use nexus_db_model::Sled; use nexus_db_model::SupportBundle; use nexus_db_model::SupportBundleState; use nexus_db_queries::authz; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; -use nexus_db_queries::db::datastore; -use nexus_db_queries::db::datastore::EreportFilters; -use nexus_db_queries::db::pagination::Paginator; -use nexus_reconfigurator_preparation::reconfigurator_state_load; -use nexus_types::deployment::SledFilter; -use nexus_types::fm::Ereport; -use nexus_types::identity::Asset; use nexus_types::internal_api::background::SupportBundleCleanupReport; use 
nexus_types::internal_api::background::SupportBundleCollectionReport; -use nexus_types::internal_api::background::SupportBundleCollectionStep; -use nexus_types::internal_api::background::SupportBundleCollectionStepStatus; -use nexus_types::internal_api::background::SupportBundleEreportStatus; use omicron_common::api::external::DataPageParams; use omicron_common::api::external::Error; use omicron_common::api::external::LookupType; use omicron_common::api::external::ResourceType; use omicron_uuid_kinds::DatasetUuid; -use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::OmicronZoneUuid; use omicron_uuid_kinds::SledUuid; use omicron_uuid_kinds::SupportBundleUuid; use omicron_uuid_kinds::ZpoolUuid; -use parallel_task_set::ParallelTaskSet; -use serde::Serialize; use serde_json::json; -use sha2::{Digest, Sha256}; use sled_agent_types::support_bundle::NESTED_DATASET_NOT_FOUND; use slog_error_chain::InlineErrorChain; -use std::collections::BTreeMap; -use std::collections::HashMap; -use std::collections::HashSet; -use std::future::Future; -use std::io::Write; -use std::num::NonZeroU64; use std::sync::Arc; -use tokio::io::AsyncReadExt; -use tokio::io::AsyncSeekExt; -use tokio::io::AsyncWriteExt; -use tokio::io::SeekFrom; -use tokio::sync::OnceCell; -use tufaceous_artifact::ArtifactHash; -use uuid::Uuid; -use zip::ZipArchive; -use zip::ZipWriter; -use zip::write::FullFileOptions; - -use super::support_bundle::perfetto; - -// We use "/var/tmp" to use Nexus' filesystem for temporary storage, -// rather than "/tmp", which would keep this collected data in-memory. -const TEMPDIR: &str = "/var/tmp"; - -// The size of piece of a support bundle to transfer to the sled agent -// within a single streaming request. 
-const CHUNK_SIZE: NonZeroU64 = NonZeroU64::new(1024 * 1024 * 1024).unwrap(); + +use super::support_bundle::collection::BundleCollection; +use super::support_bundle::request::BundleRequest; fn authz_support_bundle_from_id(id: SupportBundleUuid) -> authz::SupportBundle { authz::SupportBundle::new(authz::FLEET, id, LookupType::by_id(id)) } -// Describes the category of support bundle data. -#[derive(Debug, Clone, Copy, Hash, Eq, PartialEq)] -enum BundleDataCategory { - // Collects reconfigurator state (some of the latest blueprints, - // information about the target blueprint). - Reconfigurator, - // Collects info from sled agents, running a handful of - // diagnostic commands (e.g., zoneadm, dladm, etc). - HostInfo, - // Collects sled serial numbers, cubby numbers, and UUIDs. - SledCubbyInfo, - // Saves task dumps from SPs. - SpDumps, - // Collects ereports - Ereports, -} - -// Specifies what data to collect for a bundle data category. -// -// Each variant corresponds to a BundleDataCategory. -// For categories without additional parameters, the variant is a unit variant. -// For categories that can be filtered or configured, the variant contains -// that configuration data. -#[derive(Debug, Clone, Eq, PartialEq)] -enum BundleData { - Reconfigurator, - HostInfo(HashSet), - SledCubbyInfo, - SpDumps, - Ereports(EreportFilters), -} - -impl BundleData { - fn category(&self) -> BundleDataCategory { - match self { - Self::Reconfigurator => BundleDataCategory::Reconfigurator, - Self::HostInfo(_) => BundleDataCategory::HostInfo, - Self::SledCubbyInfo => BundleDataCategory::SledCubbyInfo, - Self::SpDumps => BundleDataCategory::SpDumps, - Self::Ereports(_) => BundleDataCategory::Ereports, - } - } -} - -// A collection of bundle data specifications. -// -// This wrapper ensures that categories and data always match - you can't -// insert (BundleDataCategory::Reconfigurator, BundleData::SpDumps) -// because each BundleData determines its own category. 
-#[derive(Debug, Clone)] -struct BundleDataSelection { - data: HashMap, -} - -impl BundleDataSelection { - fn new() -> Self { - Self { data: HashMap::new() } - } - - // Inserts BundleData to be queried for a particular category within the - // bundle. - // - // Each category of data can only be specified once (e.g., inserting - // BundleData::HostInfo multiple times will only use the most-recently - // inserted specification) - fn insert(&mut self, bundle_data: BundleData) { - self.data.insert(bundle_data.category(), bundle_data); - } - - fn contains(&self, category: BundleDataCategory) -> bool { - self.data.contains_key(&category) - } - - fn get(&self, category: BundleDataCategory) -> Option<&BundleData> { - self.data.get(&category) - } -} - -impl FromIterator for BundleDataSelection { - fn from_iter>(iter: T) -> Self { - let mut selection = Self::new(); - for bundle_data in iter { - selection.insert(bundle_data); - } - selection - } -} - -impl Default for BundleDataSelection { - fn default() -> Self { - [ - BundleData::Reconfigurator, - BundleData::HostInfo(HashSet::from([SledSelection::All])), - BundleData::SledCubbyInfo, - BundleData::SpDumps, - BundleData::Ereports(EreportFilters { - start_time: Some(chrono::Utc::now() - chrono::Days::new(7)), - ..EreportFilters::default() - }), - ] - .into_iter() - .collect() - } -} - -// The set of sleds to include -// -// Multiple values of this enum are joined together into a HashSet. -// Therefore "SledSelection::All" overrides specific sleds. -#[derive(Debug, Clone, Hash, Eq, PartialEq)] -enum SledSelection { - All, - Specific(SledUuid), -} - -// Specifies the data to be collected within the Support Bundle. -#[derive(Clone)] -struct BundleRequest { - // The size of chunks to use when transferring a bundle from Nexus - // to a sled agent. - // - // Typically, this is CHUNK_SIZE, but can be modified for testing. - transfer_chunk_size: NonZeroU64, - - // The set of data to be included within this bundle. 
- // - // Maps each category to its filter. If a category is not in the map, - // it is excluded from the bundle. - data_selection: BundleDataSelection, -} - -impl BundleRequest { - fn include_reconfigurator_data(&self) -> bool { - self.data_selection.contains(BundleDataCategory::Reconfigurator) - } - - fn include_host_info(&self) -> bool { - self.data_selection.contains(BundleDataCategory::HostInfo) - } - - fn include_sled_host_info(&self, id: SledUuid) -> bool { - let selection = - match self.data_selection.get(BundleDataCategory::HostInfo) { - Some(BundleData::HostInfo(selection)) => selection, - _ => return false, - }; - - selection.contains(&SledSelection::Specific(id)) - || selection.contains(&SledSelection::All) - } - - fn get_ereport_filters(&self) -> Option<&EreportFilters> { - match self.data_selection.get(BundleDataCategory::Ereports) { - Some(BundleData::Ereports(filters)) => Some(filters), - _ => None, - } - } - - fn include_sled_cubby_info(&self) -> bool { - self.data_selection.contains(BundleDataCategory::SledCubbyInfo) - } - - fn include_sp_dumps(&self) -> bool { - self.data_selection.contains(BundleDataCategory::SpDumps) - } -} - -impl Default for BundleRequest { - fn default() -> Self { - Self { - transfer_chunk_size: CHUNK_SIZE, - data_selection: BundleDataSelection::default(), - } - } -} - // Result of asking a sled agent to clean up a bundle enum SledAgentBundleCleanupResult { Deleted, @@ -581,15 +349,15 @@ impl SupportBundleCollector { } }; - let collection = Arc::new(BundleCollection { - datastore: self.datastore.clone(), - resolver: self.resolver.clone(), - log: opctx.log.new(slog::o!("bundle" => bundle.id.to_string())), - opctx: opctx.child(std::collections::BTreeMap::new()), - request: request.clone(), - bundle: bundle.clone(), - transfer_chunk_size: request.transfer_chunk_size, - }); + let collection = Arc::new(BundleCollection::new( + self.datastore.clone(), + self.resolver.clone(), + opctx.log.new(slog::o!("bundle" => 
bundle.id.to_string())), + opctx.child(std::collections::BTreeMap::new()), + request.clone(), + bundle.clone(), + request.transfer_chunk_size, + )); let authz_bundle = authz_support_bundle_from_id(bundle.id.into()); let mut report = collection.collect_bundle_and_store_on_sled().await?; @@ -625,1073 +393,6 @@ impl SupportBundleCollector { } } -// Wraps up all arguments to perform a single support bundle collection -struct BundleCollection { - datastore: Arc, - resolver: Resolver, - log: slog::Logger, - opctx: OpContext, - request: BundleRequest, - bundle: SupportBundle, - transfer_chunk_size: NonZeroU64, -} - -// This type describes a single step in the Support Bundle collection. -// -// - All steps have access to the "BundleCollection", which includes -// tools for actually acquiring data. -// - All steps have access to an output directory where they can store -// serialized data to a file. -// - Finally, all steps can emit a "CollectionStepOutput", which can either -// update the collection report, or generate more steps. 
-type CollectionStepFn = Box< - dyn for<'b> FnOnce( - &'b Arc, - &'b Utf8Path, - ) - -> BoxFuture<'b, anyhow::Result> - + Send, ->; - -struct CollectionStep { - name: String, - step_fn: CollectionStepFn, -} - -impl CollectionStep { - fn new(name: impl Into, step_fn: CollectionStepFn) -> Self { - Self { name: name.into(), step_fn } - } - - async fn run( - self, - collection: &Arc, - output: &Utf8Path, - ) -> CompletedCollectionStep { - let start = Utc::now(); - - let output = (self.step_fn)(collection, output) - .await - .inspect_err(|err| { - warn!( - collection.log, - "Step failed"; - "step" => &self.name, - InlineErrorChain::new(err.as_ref()), - ); - }) - .unwrap_or_else(|err| CollectionStepOutput::Failed(err)); - - let end = Utc::now(); - - CompletedCollectionStep { name: self.name, start, end, output } - } -} - -struct CompletedCollectionStep { - name: String, - start: DateTime, - end: DateTime, - output: CollectionStepOutput, -} - -impl CompletedCollectionStep { - // Updates the collection report based on the output of a collection step, - // and possibly extends the set of all steps to be executed. - fn process( - self, - report: &mut SupportBundleCollectionReport, - steps: &mut Vec, - ) { - use SupportBundleCollectionStepStatus as Status; - - let status = match self.output { - CollectionStepOutput::Skipped => Status::Skipped, - CollectionStepOutput::Failed(err) => { - Status::Failed(err.to_string()) - } - CollectionStepOutput::Ereports(status) => { - report.ereports = Some(status); - Status::Ok - } - CollectionStepOutput::Spawn { extra_steps } => { - steps.extend(extra_steps); - Status::Ok - } - CollectionStepOutput::None => Status::Ok, - }; - - // Add information about this completed step the bundle report. 
- let step = SupportBundleCollectionStep { - name: self.name, - start: self.start, - end: self.end, - status, - }; - report.steps.push(step); - } -} - -enum CollectionStepOutput { - // The step was not executed intentionally - Skipped, - // The step encountered a fatal error and could not complete. - // - // It may have still saved a partial set of data to the bundle. - Failed(anyhow::Error), - Ereports(SupportBundleEreportStatus), - // The step spawned additional steps to execute - Spawn { extra_steps: Vec }, - // The step completed with nothing to report, and no follow-up steps - None, -} - -impl BundleCollection { - // Collect the bundle within Nexus, and store it on a target sled. - async fn collect_bundle_and_store_on_sled( - self: &Arc, - ) -> anyhow::Result { - // Create a temporary directory where we'll store the support bundle - // as it's being collected. - let dir = tempdir_in(TEMPDIR)?; - - let report = self.collect_bundle_locally(&dir).await?; - self.store_bundle_on_sled(dir).await?; - Ok(report) - } - - // Create the support bundle, placing the contents into a user-specified - // directory. - // - // Does not attempt to convert the contents into a zipfile, nor send them - // to any durable storage. - async fn collect_bundle_locally( - self: &Arc, - dir: &Utf8TempDir, - ) -> anyhow::Result { - // TL;DR: This `tokio::select` is allowed to poll multiple futures, but - // should not do any async work within the body of any chosen branch. A - // previous iteration of this code polled the "collection" as "&mut - // collection", and checked the status of the support bundle within a - // branch of the "select" polling "yield_interval.tick()". - // - // We organize this work to "check for cancellation" as a whole future - // for a critical, but subtle reason: After the tick timer yields, - // we may then try to `await` a database function. - // - // This, at a surface-level glance seems innocent enough. 
However, there - // is something potentially insidious here: if calling a datastore - // function - such as "support_bundle_get" - awaits acquiring access - // to a connection from the connection pool, while creating the - // collection ALSO potentially awaits acquiring access to the - // connection pool, it is possible for: - // - // 1. The `&mut collection` arm to have created a future, currently - // yielded, which wants access to this underlying resource. - // 2. The current operation executing in `support_bundle_get` to - // be awaiting access to this same underlying resource. - // - // In this specific case, the connection pool would be attempting to - // yield to the `&mut collection` arm, which cannot run, if we were - // awaiting in the body of a different async select arm. This would - // result in a deadlock. - // - // In the future, we may attempt to make access to the connection pool - // safer from concurrent asynchronous access - it is unsettling that - // multiple concurrent `.claim()` functions can cause this behavior - - // but in the meantime, we perform this cancellation check in a single - // future that always is polled concurrently with the collection work. - // Because of this separation, each future is polled until one - // completes, at which point we deterministically exit. - // - // For more details, see: - // https://github.com/oxidecomputer/omicron/issues/9259 - - tokio::select! { - // Returns if the bundle should no longer be collected. - why = self.check_for_cancellation() => { - warn!( - &self.log, - "Support Bundle cancelled - stopping collection"; - "bundle" => %self.bundle.id, - "state" => ?self.bundle.state - ); - return Err(why); - }, - // Otherwise, keep making progress on the collection itself. 
- report = self.collect_bundle_as_file(&dir) => { - info!( - &self.log, - "Bundle Collection completed"; - "bundle" => %self.bundle.id - ); - return report; - }, - } - } - - async fn store_bundle_on_sled( - &self, - dir: Utf8TempDir, - ) -> anyhow::Result<()> { - // Create the zipfile as a temporary file - let mut zipfile = tokio::fs::File::from_std(bundle_to_zipfile(&dir)?); - let total_len = zipfile.metadata().await?.len(); - - // Collect the hash locally before we send it over the network - // - // We'll use this later during finalization to confirm the bundle - // has been stored successfully. - zipfile.seek(SeekFrom::Start(0)).await?; - let hash = sha2_hash(&mut zipfile).await?; - - // Find the sled where we're storing this bundle. - let sled_id = self - .datastore - .zpool_get_sled_if_in_service( - &self.opctx, - self.bundle.zpool_id.into(), - ) - .await?; - let sled_client = nexus_networking::sled_client( - &self.datastore, - &self.opctx, - sled_id, - &self.log, - ) - .await?; - - let zpool = ZpoolUuid::from(self.bundle.zpool_id); - let dataset = DatasetUuid::from(self.bundle.dataset_id); - let support_bundle = SupportBundleUuid::from(self.bundle.id); - - // Tell this sled to create the bundle. - let creation_result = sled_client - .support_bundle_start_creation(&zpool, &dataset, &support_bundle) - .await - .with_context(|| "Support bundle failed to start creation")?; - - if matches!( - creation_result.state, - sled_agent_client::types::SupportBundleState::Complete - ) { - // Early exit case: the bundle was already created -- we must have either - // crashed or failed between "finalizing" and "writing to the database that we - // finished". 
- info!(&self.log, "Support bundle was already collected"; "bundle" => %self.bundle.id); - return Ok(()); - } - info!(&self.log, "Support bundle creation started"; "bundle" => %self.bundle.id); - - let mut offset = 0; - while offset < total_len { - // Stream the zipfile to the sled where it should be kept - let mut file = zipfile - .try_clone() - .await - .with_context(|| "Failed to clone zipfile")?; - file.seek(SeekFrom::Start(offset)).await.with_context(|| { - format!("Failed to seek to offset {offset} / {total_len} within zipfile") - })?; - - // Only stream at most "transfer_chunk_size" bytes at once - let chunk_size = std::cmp::min( - self.transfer_chunk_size.get(), - total_len - offset, - ); - - let limited_file = file.take(chunk_size); - let stream = tokio_util::io::ReaderStream::new(limited_file); - let body = reqwest::Body::wrap_stream(stream); - - info!( - &self.log, - "Streaming bundle chunk"; - "bundle" => %self.bundle.id, - "offset" => offset, - "length" => chunk_size, - ); - - sled_client.support_bundle_transfer( - &zpool, &dataset, &support_bundle, offset, body - ).await.with_context(|| { - format!("Failed to transfer bundle: {chunk_size}@{offset} of {total_len} to sled") - })?; - - offset += chunk_size; - } - - sled_client - .support_bundle_finalize( - &zpool, - &dataset, - &support_bundle, - &hash.to_string(), - ) - .await - .with_context(|| "Failed to finalize bundle")?; - - // Returning from this method should drop all temporary storage - // allocated locally for this support bundle. - Ok(()) - } - - // Indefinitely perform periodic checks about whether or not we should - // cancel the bundle. - // - // Returns an error if: - // - The bundle state is no longer SupportBundleState::Collecting - // (which happens if the bundle has been explicitly cancelled, or - // if the backing storage has been expunged). - // - The bundle has been deleted - // - // Otherwise, keeps checking indefinitely while polled. 
- async fn check_for_cancellation(&self) -> anyhow::Error { - let work_duration = tokio::time::Duration::from_secs(5); - let mut yield_interval = tokio::time::interval_at( - tokio::time::Instant::now() + work_duration, - work_duration, - ); - - loop { - // Timer fired mid-collection - check if we should stop. - yield_interval.tick().await; - trace!( - self.log, - "Checking if Bundle Collection cancelled"; - "bundle" => %self.bundle.id - ); - - match self - .datastore - .support_bundle_get(&self.opctx, self.bundle.id.into()) - .await - { - Ok(SupportBundle { - state: SupportBundleState::Collecting, - .. - }) => { - // Bundle still collecting; continue... - continue; - } - Ok(_) => { - // Not collecting, for any reason: Time to exit - return anyhow::anyhow!("Support Bundle Cancelled"); - } - Err(Error::ObjectNotFound { .. } | Error::NotFound { .. }) => { - return anyhow::anyhow!("Support Bundle Deleted"); - } - Err(err) => { - warn!( - self.log, - "Database error checking bundle cancellation"; - InlineErrorChain::new(&err) - ); - - // If we cannot contact the database, retry later - continue; - } - } - } - } - - async fn run_collect_bundle_steps( - self: &Arc, - output: &Utf8TempDir, - mut steps: Vec, - ) -> SupportBundleCollectionReport { - let mut report = - SupportBundleCollectionReport::new(self.bundle.id.into()); - - const MAX_CONCURRENT_STEPS: usize = 16; - let mut tasks = - ParallelTaskSet::new_with_parallelism(MAX_CONCURRENT_STEPS); - - loop { - // Process all the currently-planned steps - while let Some(step) = steps.pop() { - let previous_result = tasks.spawn({ - let collection = self.clone(); - let dir = output.path().to_path_buf(); - async move { - debug!(collection.log, "Running step"; "step" => &step.name); - step.run(&collection, dir.as_path()).await - } - }).await; - - if let Some(output) = previous_result { - output.process(&mut report, &mut steps); - }; - } - - // If we've run out of tasks to spawn, join any of the previously - // spawned tasks, if 
any exist. - if let Some(output) = tasks.join_next().await { - output.process(&mut report, &mut steps); - - // As soon as any task completes, see if we can spawn more work - // immediately. This ensures that the ParallelTaskSet is - // saturated as much as it can be. - continue; - } - - // Executing steps may create additional steps, as follow-up work. - // - // Only finish if we've exhausted all possible steps and joined all spawned work. - if steps.is_empty() { - // Write trace file before returning - if let Err(err) = self.write_trace_file(output, &report).await { - warn!( - self.log, - "Failed to write trace file"; - "error" => ?err - ); - } - return report; - } - } - } - - // Write a Perfetto Event format JSON file for visualization - async fn write_trace_file( - &self, - output: &Utf8TempDir, - report: &SupportBundleCollectionReport, - ) -> anyhow::Result<()> { - let meta_dir = output.path().join("meta"); - tokio::fs::create_dir_all(&meta_dir).await.with_context(|| { - format!("Failed to create meta directory {meta_dir}") - })?; - - let trace_path = meta_dir.join("trace.json"); - - // Convert steps to Perfetto Trace Event format. - // Sort steps by start time and assign each a unique sequential ID. - // - // This is necessary because the trace event format does not like - // multiple slices to overlap - so we make each slice distinct. - // - // Ideally we'd be able to correlate these with actual tokio tasks, - // but it's hard to convert tokio::task::Id to a u64 because - // of https://github.com/tokio-rs/tokio/issues/7430 - let mut sorted_steps: Vec<_> = report.steps.iter().collect(); - sorted_steps.sort_by_key(|s| s.start); - - // Generate trace events - each step gets a unique ID (1, 2, 3, ...) 
- // based on its start time order - let trace_events: Vec<_> = sorted_steps - .iter() - .enumerate() - .map(|(i, step)| { - let start_us = step.start.timestamp_micros(); - let duration_us = (step.end - step.start) - .num_microseconds() - .unwrap_or(0) - .max(0); - let step_id = i + 1; - - perfetto::TraceEvent { - name: step.name.clone(), - cat: "bundle_collection".to_string(), - ph: "X".to_string(), - ts: start_us, - dur: duration_us, - pid: 1, - tid: step_id, - args: json!({ - "status": step.status.to_string(), - }), - } - }) - .collect(); - - let trace = perfetto::Trace { - trace_events, - display_time_unit: "ms".to_string(), - }; - - let trace_content = serde_json::to_string_pretty(&trace) - .context("Failed to serialize trace JSON")?; - - tokio::fs::write(&trace_path, trace_content).await.with_context( - || format!("Failed to write trace file to {trace_path}"), - )?; - - info!( - self.log, - "Wrote trace file"; - "path" => %trace_path, - "num_events" => trace.trace_events.len() - ); - - Ok(()) - } - - async fn collect_bundle_id( - &self, - dir: &Utf8Path, - ) -> anyhow::Result { - tokio::fs::write(dir.join("bundle_id.txt"), self.bundle.id.to_string()) - .await?; - - Ok(CollectionStepOutput::None) - } - - async fn collect_reconfigurator_state( - &self, - dir: &Utf8Path, - ) -> anyhow::Result { - if !self.request.include_reconfigurator_data() { - return Ok(CollectionStepOutput::Skipped); - } - - // Collect reconfigurator state - const NMAX_BLUEPRINTS: usize = 300; - match reconfigurator_state_load( - &self.opctx, - &self.datastore, - NMAX_BLUEPRINTS, - ) - .await - { - Ok(state) => { - let file_path = dir.join("reconfigurator_state.json"); - let file = std::fs::OpenOptions::new() - .create(true) - .write(true) - .truncate(true) - .open(&file_path) - .with_context(|| format!("failed to open {}", file_path))?; - serde_json::to_writer_pretty(&file, &state).with_context( - || { - format!( - "failed to serialize reconfigurator state to {}", - file_path - ) - }, - )?; 
- info!( - self.log, - "Support bundle: collected reconfigurator state"; - "target_blueprint" => ?state.target_blueprint, - "num_blueprints" => state.blueprints.len(), - "num_collections" => state.collections.len(), - ); - } - Err(err) => { - warn!( - self.log, - "Support bundle: failed to collect reconfigurator state"; - "err" => ?err, - ); - } - }; - - Ok(CollectionStepOutput::None) - } - - async fn get_or_initialize_mgs_client<'a>( - &self, - mgs_client: &'a OnceCell>>, - ) -> &'a Arc> { - mgs_client - .get_or_init(|| async { - Arc::new(self.create_mgs_client().await.ok()) - }) - .await - } - - async fn get_or_initialize_all_sleds<'a>( - &self, - all_sleds: &'a OnceCell>>>, - ) -> &'a Arc>> { - all_sleds - .get_or_init(|| async { - Arc::new( - self.datastore - .sled_list_all_batched( - &self.opctx, - SledFilter::InService, - ) - .await - .ok(), - ) - }) - .await - } - - async fn collect_sled_cubby_info( - &self, - all_sleds: &OnceCell>>>, - mgs_client: &OnceCell>>, - dir: &Utf8Path, - ) -> anyhow::Result { - if !self.request.include_sled_cubby_info() { - return Ok(CollectionStepOutput::Skipped); - } - - let Some(mgs_client) = - &**self.get_or_initialize_mgs_client(mgs_client).await - else { - bail!("Could not initialize MGS client"); - }; - let nexus_sleds = self - .get_or_initialize_all_sleds(all_sleds) - .await - .as_deref() - .unwrap_or_default(); - - write_sled_cubby_info(&self.log, mgs_client, nexus_sleds, dir).await?; - - Ok(CollectionStepOutput::None) - } - - async fn spawn_sp_dump_collection( - &self, - mgs_client: &OnceCell>>, - dir: &Utf8Path, - ) -> anyhow::Result { - if !self.request.include_sp_dumps() { - return Ok(CollectionStepOutput::Skipped); - } - - let Some(mgs_client) = - &**self.get_or_initialize_mgs_client(mgs_client).await - else { - bail!("Could not initialize MGS client"); - }; - - let sp_dumps_dir = dir.join("sp_task_dumps"); - tokio::fs::create_dir_all(&sp_dumps_dir).await.with_context(|| { - format!("Failed to create SP task dump 
directory {sp_dumps_dir}") - })?; - - let mut extra_steps: Vec = vec![]; - for sp in get_available_sps(&mgs_client).await? { - extra_steps.push(CollectionStep::new( - format!("SP dump for {:?}", sp), - Box::new({ - let mgs_client = mgs_client.clone(); - move |collection, dir| { - async move { - collection - .collect_sp_dump(&mgs_client, sp, dir) - .await - } - .boxed() - } - }), - )); - } - - Ok(CollectionStepOutput::Spawn { extra_steps }) - } - - async fn collect_sp_dump( - &self, - mgs_client: &MgsClient, - sp: SpIdentifier, - dir: &Utf8Path, - ) -> anyhow::Result { - if !self.request.include_sp_dumps() { - return Ok(CollectionStepOutput::Skipped); - } - - save_sp_dumps(mgs_client, sp, dir).await.with_context(|| { - format!("failed to save SP dump from: {} {}", sp.type_, sp.slot) - })?; - - Ok(CollectionStepOutput::None) - } - - // Perform the work of collecting the support bundle into a temporary directory - // - // "dir" is an output directory where data can be stored. - // - // If a partial bundle can be collected, it should be returned as - // an Ok(SupportBundleCollectionReport). Any failures from this function - // will prevent the support bundle from being collected altogether. - // - // NOTE: The background task infrastructure will periodically check to see - // if the bundle has been cancelled by a user while it is being collected. - // If that happens, this function will be CANCELLED at an await point. - // - // As a result, it is important that this function be implemented as - // cancel-safe. - // - // The "steps" used within this function - passed to - // [`Self::run_collect_bundle_steps`] - are run on a [`ParallelTaskSet`], - // which automatically aborts tasks when it is dropped. 
- async fn collect_bundle_as_file( - self: &Arc, - dir: &Utf8TempDir, - ) -> anyhow::Result { - let log = &self.log; - - info!(&log, "Collecting bundle as local file"); - - // Shared, lazy, fallible initialization for sleds - let all_sleds: OnceCell>>> = OnceCell::new(); - // Shared, lazy, fallible initialization for MGS client - let mgs_client: OnceCell>> = OnceCell::new(); - - let steps: Vec = vec![ - CollectionStep::new( - SupportBundleCollectionStep::STEP_BUNDLE_ID, - Box::new(|collection, dir| { - collection.collect_bundle_id(dir).boxed() - }), - ), - CollectionStep::new( - SupportBundleCollectionStep::STEP_RECONFIGURATOR_STATE, - Box::new(|collection, dir| { - collection.collect_reconfigurator_state(dir).boxed() - }), - ), - CollectionStep::new( - SupportBundleCollectionStep::STEP_EREPORTS, - Box::new(|collection, dir| { - collection.collect_ereports(dir).boxed() - }), - ), - CollectionStep::new( - SupportBundleCollectionStep::STEP_SLED_CUBBY_INFO, - Box::new({ - let all_sleds = all_sleds.clone(); - let mgs_client = mgs_client.clone(); - move |collection, dir| { - async move { - collection - .collect_sled_cubby_info( - &all_sleds, - &mgs_client, - dir, - ) - .await - } - .boxed() - } - }), - ), - CollectionStep::new( - SupportBundleCollectionStep::STEP_SPAWN_SP_DUMPS, - Box::new({ - let mgs_client = mgs_client.clone(); - move |collection, dir| { - async move { - collection - .spawn_sp_dump_collection(&mgs_client, dir) - .await - } - .boxed() - } - }), - ), - CollectionStep::new( - SupportBundleCollectionStep::STEP_SPAWN_SLEDS, - Box::new({ - let all_sleds = all_sleds.clone(); - move |collection, _| { - async move { - collection.spawn_query_all_sleds(&all_sleds).await - } - .boxed() - } - }), - ), - ]; - - Ok(self.run_collect_bundle_steps(dir, steps).await) - } - - async fn spawn_query_all_sleds( - &self, - all_sleds: &OnceCell>>>, - ) -> anyhow::Result { - if !self.request.include_host_info() { - return Ok(CollectionStepOutput::Skipped); - } - - let 
Some(all_sleds) = - self.get_or_initialize_all_sleds(all_sleds).await.as_deref() - else { - bail!("Could not read list of sleds"); - }; - - let mut extra_steps: Vec = vec![]; - for sled in all_sleds { - if !self.request.include_sled_host_info(sled.id()) { - continue; - } - - extra_steps.push(CollectionStep::new( - format!("sled data for sled {}", sled.id()), - Box::new({ - let sled = sled.clone(); - move |collection, dir| { - async move { - collection.collect_data_from_sled(&sled, dir).await - } - .boxed() - } - }), - )); - } - - return Ok(CollectionStepOutput::Spawn { extra_steps }); - } - - // Collect data from a sled, storing it into a directory that will - // be turned into a support bundle. - // - // - "sled" is the sled from which we should collect data. - // - "dir" is a directory where data can be stored, to be turned - // into a bundle after collection completes. - async fn collect_data_from_sled( - &self, - sled: &nexus_db_model::Sled, - dir: &Utf8Path, - ) -> anyhow::Result { - if !self.request.include_sled_host_info(sled.id()) { - return Ok(CollectionStepOutput::Skipped); - } - - let log = &self.log; - info!(&log, "Collecting bundle info from sled"; "sled" => %sled.id()); - let sled_path = dir - .join("rack") - .join(sled.rack_id.to_string()) - .join("sled") - .join(sled.id().to_string()); - tokio::fs::create_dir_all(&sled_path).await?; - tokio::fs::write(sled_path.join("sled.txt"), format!("{sled:?}")) - .await?; - - let sled_client = match nexus_networking::sled_client( - &self.datastore, - &self.opctx, - sled.id(), - log, - ) - .await - { - Ok(client) => client, - Err(err) => { - tokio::fs::write( - sled_path.join("error.txt"), - "Could not contact sled", - ) - .await.with_context(|| { - format!("Failed to save 'error.txt' to bundle when recording error: {err}") - })?; - bail!("Could not contact sled: {err}"); - } - }; - - // NB: As new sled-diagnostic commands are added they should - // be added to this array so that their output can be saved - // 
within the support bundle. - let mut diag_cmds = futures::stream::iter([ - save_diag_cmd_output_or_error( - &sled_path, - "zoneadm", - sled_client.support_zoneadm_info(), - ) - .boxed(), - save_diag_cmd_output_or_error( - &sled_path, - "dladm", - sled_client.support_dladm_info(), - ) - .boxed(), - save_diag_cmd_output_or_error( - &sled_path, - "ipadm", - sled_client.support_ipadm_info(), - ) - .boxed(), - save_diag_cmd_output_or_error( - &sled_path, - "nvmeadm", - sled_client.support_nvmeadm_info(), - ) - .boxed(), - save_diag_cmd_output_or_error( - &sled_path, - "pargs", - sled_client.support_pargs_info(), - ) - .boxed(), - save_diag_cmd_output_or_error( - &sled_path, - "pfiles", - sled_client.support_pfiles_info(), - ) - .boxed(), - save_diag_cmd_output_or_error( - &sled_path, - "pstack", - sled_client.support_pstack_info(), - ) - .boxed(), - save_diag_cmd_output_or_error( - &sled_path, - "zfs", - sled_client.support_zfs_info(), - ) - .boxed(), - save_diag_cmd_output_or_error( - &sled_path, - "zpool", - sled_client.support_zpool_info(), - ) - .boxed(), - save_diag_cmd_output_or_error( - &sled_path, - "health-check", - sled_client.support_health_check(), - ) - .boxed(), - ]) - // Currently we execute up to 10 commands concurrently which - // might be doing their own concurrent work, for example - // collectiong `pstack` output of every Oxide process that is - // found on a sled. - .buffer_unordered(10); - - while let Some(result) = diag_cmds.next().await { - // Log that we failed to write the diag command output to a - // file but don't return early as we wish to get as much - // information as we can. - if let Err(e) = result { - error!( - &self.log, - "failed to write diagnostic command output to \ - file: {e}" - ); - } - } - - // For each zone we concurrently fire off a request to its - // sled-agent to collect its logs in a zip file and write the - // result to the support bundle. 
- let zones = sled_client.support_logs().await?.into_inner(); - let mut log_futs: FuturesUnordered<_> = zones - .iter() - .map(|zone| { - save_zone_log_zip_or_error(log, &sled_client, zone, &sled_path) - }) - .collect(); - - while let Some(log_collection_result) = log_futs.next().await { - // We log any errors saving the zip file to disk and - // continue on. - if let Err(e) = log_collection_result { - error!(&self.log, "failed to write logs output: {e}"); - } - } - return Ok(CollectionStepOutput::None); - } - - async fn collect_ereports( - self: &Arc, - dir: &Utf8Path, - ) -> anyhow::Result { - let Some(ereport_filters) = self.request.get_ereport_filters() else { - debug!(self.log, "Support bundle: ereports not requested"); - return Ok(CollectionStepOutput::Skipped); - }; - let ereports_dir = dir.join("ereports"); - let mut status = SupportBundleEreportStatus::default(); - if let Err(err) = self - .save_ereports(ereport_filters.clone(), ereports_dir, &mut status) - .await - { - warn!( - &self.log, - "Support bundle: ereport collection failed \ - ({} collected successfully)", - status.n_collected; - InlineErrorChain::new(err.as_ref()) - ); - status.errors.push(InlineErrorChain::new(err.as_ref()).to_string()); - }; - - Ok(CollectionStepOutput::Ereports(status)) - } - - async fn save_ereports( - self: &Arc, - filters: EreportFilters, - dir: Utf8PathBuf, - status: &mut SupportBundleEreportStatus, - ) -> anyhow::Result<()> { - let mut paginator = Paginator::new( - datastore::SQL_BATCH_SIZE, - dropshot::PaginationOrder::Ascending, - ); - while let Some(p) = paginator.next() { - let ereports = self - .datastore - .ereport_fetch_matching( - &self.opctx, - &filters, - &p.current_pagparams(), - ) - .await - .map_err(|e| { - e.internal_context("failed to query for ereports") - })?; - paginator = p.found_batch(&ereports, &|ereport| { - (ereport.restart_id.into_untyped_uuid(), ereport.ena) - }); - - let prev_n_collected = status.n_collected; - let n_ereports = ereports.len(); 
- status.n_found += n_ereports; - - for ereport in ereports { - match ereport.try_into() { - Ok(ereport) => { - write_ereport(ereport, &dir).await?; - status.n_collected += 1; - } - Err(err) => { - warn!(&self.log, "invalid ereport"; "error" => %err); - status.errors.push(err.to_string()); - } - } - } - debug!( - self.log, - "Support bundle: added {} ereports ({} found)", - status.n_collected - prev_n_collected, - n_ereports - ); - } - - info!( - self.log, - "Support bundle: collected {} total ereports", status.n_collected - ); - Ok(()) - } - - async fn create_mgs_client(&self) -> anyhow::Result { - self - .resolver - .lookup_socket_v6(ServiceName::ManagementGatewayService) - .await - .map(|sockaddr| { - let url = format!("http://{}", sockaddr); - gateway_client::Client::new(&url, self.log.clone()) - }).map_err(|e| { - error!(self.log, "failed to resolve MGS address"; "error" => InlineErrorChain::new(&e)); - e.into() - }) - } -} - impl BackgroundTask for SupportBundleCollector { fn activate<'a>( &'a mut self, @@ -1735,438 +436,13 @@ impl BackgroundTask for SupportBundleCollector { } } -async fn write_ereport(ereport: Ereport, dir: &Utf8Path) -> anyhow::Result<()> { - // Here's where we construct the file path for each ereport JSON file, - // given the top-level ereport directory path. Each ereport is stored in a - // subdirectory for the part and serial numbers of the system that produced - // the ereport. Part numbers must be included in addition to serial - // numbers, as the v1 serial scheme only guarantees uniqueness within a - // part number. These paths take the following form: - // - // {part-number}-{serial_number}/{restart_id}/{ENA}.json - // - // We can assume that the restart ID and ENA consist only of - // filesystem-safe characters, as the restart ID is known to be a UUID, and - // the ENA is just an integer. 
For the serial and part numbers, which - // Nexus doesn't have full control over --- it came from the ereport - // metadata --- we must check that it doesn't contain any characters - // unsuitable for use in a filesystem path. - let pn = ereport - .data - .part_number - .as_deref() - // If the part or serial numbers contain any unsavoury characters, it - // goes in the `unknown_serial` hole! Note that the alleged serial - // number from the ereport will still be present in the JSON as a - // string, so we're not *lying* about what was received; we're just - // giving up on using it in the path. - .filter(|&s| is_fs_safe_single_path_component(s)) - .unwrap_or("unknown_part"); - let sn = ereport - .data - .serial_number - .as_deref() - .filter(|&s| is_fs_safe_single_path_component(s)) - .unwrap_or("unknown_serial"); - let id = &ereport.data.id; - - let dir = dir - .join(format!("{pn}-{sn}")) - // N.B. that we call `into_untyped_uuid()` here, as the `Display` - // implementation for a typed UUID appends " (ereporter_restart)", which - // we don't want. - .join(id.restart_id.into_untyped_uuid().to_string()); - tokio::fs::create_dir_all(&dir) - .await - .with_context(|| format!("failed to create directory '{dir}'"))?; - let file_path = dir.join(format!("{}.json", id.ena)); - let json = serde_json::to_vec(&ereport).with_context(|| { - format!("failed to serialize ereport {pn}:{sn}/{id}") - })?; - tokio::fs::write(&file_path, json) - .await - .with_context(|| format!("failed to write '{file_path}'")) -} - -// Takes a directory "dir", and zips the contents into a single zipfile. -fn bundle_to_zipfile(dir: &Utf8TempDir) -> anyhow::Result { - let tempfile = tempfile_in(TEMPDIR)?; - let mut zip = ZipWriter::new(tempfile); - - recursively_add_directory_to_zipfile(&mut zip, dir.path(), dir.path())?; - - Ok(zip.finish()?) 
-} - -fn recursively_add_directory_to_zipfile( - zip: &mut ZipWriter, - root_path: &Utf8Path, - dir_path: &Utf8Path, -) -> anyhow::Result<()> { - // Readdir might return entries in a non-deterministic order. - // Let's sort it for the zipfile, to be nice. - let mut entries = dir_path - .read_dir_utf8()? - .filter_map(Result::ok) - .collect::>(); - entries.sort_by(|a, b| a.file_name().cmp(&b.file_name())); - - for entry in &entries { - // Remove the "/tmp/..." prefix from the path when we're storing it in the - // zipfile. - let dst = entry.path().strip_prefix(root_path)?; - - let file_type = entry.file_type()?; - if file_type.is_file() { - let src = entry.path(); - - let zip_time = entry - .path() - .metadata() - .and_then(|m| m.modified()) - .ok() - .and_then(|sys_time| jiff::Zoned::try_from(sys_time).ok()) - .and_then(|zoned| { - zip::DateTime::try_from(zoned.datetime()).ok() - }) - .unwrap_or_else(zip::DateTime::default); - - let opts = FullFileOptions::default() - .last_modified_time(zip_time) - .compression_method(zip::CompressionMethod::Deflated) - .large_file(true); - - zip.start_file_from_path(dst, opts)?; - let mut file = std::fs::File::open(&src)?; - std::io::copy(&mut file, zip)?; - } - if file_type.is_dir() { - let opts = FullFileOptions::default(); - zip.add_directory_from_path(dst, opts)?; - recursively_add_directory_to_zipfile(zip, root_path, entry.path())?; - } - } - Ok(()) -} - -async fn sha2_hash(file: &mut tokio::fs::File) -> anyhow::Result { - let mut buf = vec![0u8; 65536]; - let mut ctx = Sha256::new(); - loop { - let n = file.read(&mut buf).await?; - if n == 0 { - break; - } - ctx.write_all(&buf[0..n])?; - } - - let digest = ctx.finalize(); - Ok(ArtifactHash(digest.as_slice().try_into()?)) -} - -/// For a given zone, save its service's logs into the provided destination -/// path. This path should be the location to a per-sled directory that will end -/// up in the final support bundle zip file. 
-async fn save_zone_log_zip_or_error( - logger: &slog::Logger, - client: &sled_agent_client::Client, - zone: &str, - path: &Utf8Path, -) -> anyhow::Result<()> { - // In the future when support bundle collection exposes tuning parameters - // this can turn into a collection parameter. - const DEFAULT_MAX_ROTATED_LOGS: u32 = 5; - - match client.support_logs_download(zone, DEFAULT_MAX_ROTATED_LOGS).await { - Ok(res) => { - let bytestream = res.into_inner(); - let output_dir = path.join(format!("logs/{zone}")); - let output_path = output_dir.join("logs.zip"); - - // Ensure the logs output directory exists. - tokio::fs::create_dir_all(&output_dir).await.with_context( - || format!("failed to create output directory: {output_dir}"), - )?; - - let mut file = - tokio::fs::File::create(&output_path).await.with_context( - || format!("failed to create file: {output_path}"), - )?; - - let stream = bytestream.into_inner().map(|chunk| { - chunk.map_err(|e| std::io::Error::other(e.to_string())) - }); - let mut reader = tokio_util::io::StreamReader::new(stream); - let _nbytes = tokio::io::copy(&mut reader, &mut file).await?; - file.flush().await?; - - // Unpack the zip so we don't end up with zip files inside of our - // final zip - let zipfile_path = output_path.clone(); - tokio::task::spawn_blocking(move || { - extract_zip_file(&output_dir, &zipfile_path) - }) - .await - .map_err(|join_error| { - anyhow::anyhow!(join_error) - .context("unzipping support bundle logs zip panicked") - })??; - - // Cleanup the zip file since we no longer need it - if let Err(e) = tokio::fs::remove_file(&output_path).await { - error!( - logger, - "failed to cleanup temporary logs zip file"; - "error" => %e, - "file" => %output_path, - - ); - } - } - Err(err) => { - tokio::fs::write( - path.join(format!("{zone}.logs.err")), - err.to_string(), - ) - .await?; - } - }; - - Ok(()) -} - -fn extract_zip_file( - output_dir: &Utf8Path, - zip_file: &Utf8Path, -) -> Result<(), anyhow::Error> { - let mut zip = 
std::fs::File::open(&zip_file) - .with_context(|| format!("failed to open zip file: {zip_file}"))?; - let mut archive = ZipArchive::new(&mut zip)?; - archive.extract(&output_dir).with_context(|| { - format!("failed to extract log zip file to: {output_dir}") - })?; - Ok(()) -} - -/// Run a `sled-dianostics` future and save its output to a corresponding file. -async fn save_diag_cmd_output_or_error( - path: &Utf8Path, - command: &str, - future: F, -) -> anyhow::Result<()> -where - F: Future< - Output = Result< - sled_agent_client::ResponseValue, - sled_agent_client::Error, - >, - > + Send, -{ - let result = future.await; - match result { - Ok(result) => { - let output = result.into_inner(); - let json = serde_json::to_string(&output).with_context(|| { - format!("failed to serialize {command} output as json") - })?; - tokio::fs::write(path.join(format!("{command}.json")), json) - .await - .with_context(|| { - format!("failed to write output of {command} to file") - })?; - } - Err(err) => { - tokio::fs::write( - path.join(format!("{command}_err.txt")), - err.to_string(), - ) - .await?; - } - } - Ok(()) -} - -/// Use MGS ignition info to find active SPs. -async fn get_available_sps( - mgs_client: &MgsClient, -) -> anyhow::Result> { - let ignition_info = mgs_client - .ignition_list() - .await - .context("failed to get ignition info from MGS")? - .into_inner(); - - let mut active_sps = Vec::new(); - for info in ignition_info { - if let SpIgnition::Yes { power, flt_sp, .. } = info.details { - // Only return SPs that are powered on and are not in a faulted state. - if power && !flt_sp { - active_sps.push(info.id); - } - } - } - - Ok(active_sps) -} - -/// Fetch and save task dumps from a single SP. -async fn save_sp_dumps( - mgs_client: &MgsClient, - sp: SpIdentifier, - sp_dumps_dir: &Utf8Path, -) -> anyhow::Result<()> { - let dump_count = mgs_client - .sp_task_dump_count(&sp.type_, sp.slot) - .await - .context("failed to get task dump count from SP")? 
- .into_inner(); - - let output_dir = sp_dumps_dir.join(format!("{}_{}", sp.type_, sp.slot)); - tokio::fs::create_dir_all(&output_dir).await.with_context(|| { - format!("Failed to create output directory {output_dir}") - })?; - - for i in 0..dump_count { - let task_dump = mgs_client - .sp_task_dump_get(&sp.type_, sp.slot, i) - .await - .with_context(|| format!("failed to get task dump {i} from SP"))? - .into_inner(); - - let zip_bytes = base64::engine::general_purpose::STANDARD - .decode(task_dump.base64_zip) - .context("failed to decode base64-encoded SP task dump zip")?; - - tokio::fs::write(output_dir.join(format!("dump-{i}.zip")), zip_bytes) - .await - .context("failed to write SP task dump zip to disk")?; - } - Ok(()) -} - -/// Write a file with a JSON mapping of sled serial numbers to cubby and UUIDs for easier -/// identification of sleds present in a bundle. -async fn write_sled_cubby_info( - log: &slog::Logger, - mgs_client: &MgsClient, - nexus_sleds: &[Sled], - dir: &Utf8Path, -) -> anyhow::Result<()> { - #[derive(Serialize)] - struct SledInfo { - cubby: Option, - uuid: Option, - } - - let available_sps = get_available_sps(&mgs_client) - .await - .context("failed to get available SPs")?; - - // We can still get a useful mapping of cubby to serial using just the data from MGS. 
- let mut nexus_map: BTreeMap<_, _> = nexus_sleds - .into_iter() - .map(|sled| (sled.serial_number(), sled)) - .collect(); - - let mut sled_info = BTreeMap::new(); - for sp in - available_sps.into_iter().filter(|sp| matches!(sp.type_, SpType::Sled)) - { - let sp_state = match mgs_client.sp_get(&sp.type_, sp.slot).await { - Ok(s) => s.into_inner(), - Err(e) => { - error!(log, - "Failed to get SP state for sled_info.json"; - "cubby" => sp.slot, - "component" => %sp.type_, - "error" => InlineErrorChain::new(&e) - ); - continue; - } - }; - - if let Some(sled) = nexus_map.remove(sp_state.serial_number.as_str()) { - sled_info.insert( - sp_state.serial_number.to_string(), - SledInfo { - cubby: Some(sp.slot), - uuid: Some(*sled.identity.id.as_untyped_uuid()), - }, - ); - } else { - sled_info.insert( - sp_state.serial_number.to_string(), - SledInfo { cubby: Some(sp.slot), uuid: None }, - ); - } - } - - // Sleds not returned by MGS. - for (serial, sled) in nexus_map { - sled_info.insert( - serial.to_string(), - SledInfo { - cubby: None, - uuid: Some(*sled.identity.id.as_untyped_uuid()), - }, - ); - } - - let json = serde_json::to_string_pretty(&sled_info) - .context("failed to serialize sled info to JSON")?; - tokio::fs::write(dir.join("sled_info.json"), json).await?; - - Ok(()) -} - -fn is_fs_safe_single_path_component(s: &str) -> bool { - // Might be path traversal... - if s == "." || s == ".." { - return false; - } - - if s == "~" { - return false; - } - - const BANNED_CHARS: &[char] = &[ - // Check for path separators. - // - // Naively, we might reach for `std::path::is_separator()` here. - // However, this function only checks if a path is a permitted - // separator on the *current* platform --- so, running on illumos, we - // will only check for Unix path separators. 
But, because the support - // bundle may be extracted on a workstation system by Oxide support - // personnel or by the customer, we should also make sure we don't - // allow the use of Windows path separators, which `is_separator()` - // won't check for on Unix systems. - '/', '\\', - // Characters forbidden on Windows, per: - // https://learn.microsoft.com/en-us/windows/win32/fileio/naming-a-file#naming-conventions - '<', '>', ':', '"', '|', '?', '*', - ]; - - // Rather than using `s.contains()`, we do all the checks in one pass. - for c in s.chars() { - if BANNED_CHARS.contains(&c) { - return false; - } - - // Definitely no control characters! - if c.is_control() { - return false; - } - } - - true -} - #[cfg(test)] mod test { use super::*; + use crate::app::background::tasks::support_bundle::perfetto; + use crate::app::background::tasks::support_bundle::request::BundleData; use crate::app::support_bundles::SupportBundleQueryType; - use camino_tempfile::tempdir; use http_body_util::BodyExt; use nexus_db_model::PhysicalDisk; use nexus_db_model::PhysicalDiskKind; @@ -2175,6 +451,9 @@ mod test { use nexus_test_utils::SLED_AGENT_UUID; use nexus_test_utils_macros::nexus_test; use nexus_types::fm::ereport::{EreportData, EreportId, Reporter}; + use nexus_types::identity::Asset; + use nexus_types::internal_api::background::SupportBundleCollectionStep; + use nexus_types::internal_api::background::SupportBundleEreportStatus; use nexus_types::inventory::SpType; use omicron_common::api::external::ByteCount; use omicron_common::api::internal::shared::DatasetKind; @@ -2183,39 +462,18 @@ mod test { use omicron_common::disk::DatasetsConfig; use omicron_common::disk::SharedDatasetConfig; use omicron_common::zpool_name::ZpoolName; + use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::{ BlueprintUuid, DatasetUuid, EreporterRestartUuid, OmicronZoneUuid, PhysicalDiskUuid, SledUuid, }; + use std::collections::HashSet; + use std::num::NonZeroU64; use uuid::Uuid; type 
ControlPlaneTestContext = nexus_test_utils::ControlPlaneTestContext; - // Ensure that we can convert a temporary directory into a zipfile - #[test] - fn test_zipfile_creation() { - let dir = tempdir().unwrap(); - - std::fs::create_dir_all(dir.path().join("dir-a")).unwrap(); - std::fs::create_dir_all(dir.path().join("dir-b")).unwrap(); - std::fs::write(dir.path().join("dir-a").join("file-a"), "some data") - .unwrap(); - std::fs::write(dir.path().join("file-b"), "more data").unwrap(); - - let zipfile = bundle_to_zipfile(&dir) - .expect("Should have been able to bundle zipfile"); - let archive = zip::read::ZipArchive::new(zipfile).unwrap(); - - // We expect the order to be deterministically alphabetical - let mut names = archive.file_names(); - assert_eq!(names.next(), Some("dir-a/")); - assert_eq!(names.next(), Some("dir-a/file-a")); - assert_eq!(names.next(), Some("dir-b/")); - assert_eq!(names.next(), Some("file-b")); - assert_eq!(names.next(), None); - } - // If we have not populated any bundles needing cleanup, the cleanup // process should succeed with an empty cleanup report. 
#[nexus_test(server = crate::Server)] From 73fe0ec6a609c8e66130b2f15cdc8d6b60a328fe Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 10 Dec 2025 10:46:18 -0800 Subject: [PATCH 12/18] fix integration test --- nexus/tests/integration_tests/support_bundles.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nexus/tests/integration_tests/support_bundles.rs b/nexus/tests/integration_tests/support_bundles.rs index 80ec8af191f..4bd8d1e2258 100644 --- a/nexus/tests/integration_tests/support_bundles.rs +++ b/nexus/tests/integration_tests/support_bundles.rs @@ -530,6 +530,8 @@ async fn test_support_bundle_lifecycle(cptestctx: &ControlPlaneTestContext) { let archive = ZipArchive::new(Cursor::new(&contents)).unwrap(); let mut names = archive.file_names(); assert_eq!(names.next(), Some("bundle_id.txt")); + assert_eq!(names.next(), Some("meta/")); + assert_eq!(names.next(), Some("meta/trace.json")); assert_eq!(names.next(), Some("rack/")); assert!(names.any(|n| n == "sp_task_dumps/")); // There's much more data in the bundle, but validating it isn't the point From 7979e2377f069a9cf20e907c2eb8dc85367a4491 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 12 Dec 2025 10:48:01 -0800 Subject: [PATCH 13/18] Elaborate on directory structure --- .../background/tasks/support_bundle/README.md | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/nexus/src/app/background/tasks/support_bundle/README.md b/nexus/src/app/background/tasks/support_bundle/README.md index e6a52539afd..b36a6911364 100644 --- a/nexus/src/app/background/tasks/support_bundle/README.md +++ b/nexus/src/app/background/tasks/support_bundle/README.md @@ -64,3 +64,23 @@ contents should be included. expensive operations which might be shared with other steps (e.g., reading from the database, creating and using progenitor clients, etc) consider adding that data to `support_bundle/cache`. + +## Bundle Directory Structure + +The following is the convention for Support Bundle files. 
It can, and should, +change over time. However, we list it here to make sure data is located +somewhere consistent and predictable. + +(Please keep this list alphabetized) + +* `bundle_id.txt` - UUID of the bundle itself +* `ereports/` - All requested error reports +* `ereports/{part number}-{serial number}/{id}.json` - Individual reports +* `meta/` - Metadata about the bundle +* `meta/trace.json` - Perfetto-formatted trace of the bundle's collection +* `rack/{rack id}/sled/{sled id}/` - Sled-specific host OS info +* `reconfigurator_state.json` - A dump of all reconfigurator state +* `sled_info.json` - Mapping of sled identifiers to cubby location +* `sp_task_dumps/` - All SP dumps +* `sp_task_dumps/{SP type}_{SP slot}/dump-{id}.zip` - Individual SP dumps + From 8001cbd355de3473c159f7876c531e3d33490476 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 15 Dec 2025 15:24:32 -0800 Subject: [PATCH 14/18] omdb integration for bg task list --- nexus/src/app/background/init.rs | 4 ++ .../tasks/support_bundle/collection.rs | 7 +++ .../tasks/support_bundle/steps/mod.rs | 5 ++ .../tasks/support_bundle/steps/omdb.rs | 46 +++++++++++++++++++ .../tasks/support_bundle_collector.rs | 23 +++++++++- nexus/src/app/mod.rs | 1 + .../integration_tests/support_bundles.rs | 9 +++- nexus/types/src/internal_api/background.rs | 1 + 8 files changed, 94 insertions(+), 2 deletions(-) create mode 100644 nexus/src/app/background/tasks/support_bundle/steps/omdb.rs diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index c56a26970a1..89ee67a7430 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -144,6 +144,7 @@ use nexus_background_task_interface::Activator; use nexus_background_task_interface::BackgroundTasks; use nexus_config::BackgroundTaskConfig; use nexus_config::DnsTasksConfig; +use nexus_config::OmdbConfig; use nexus_db_model::DnsGroup; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; @@ 
-632,6 +633,7 @@ impl BackgroundTasksInitializer { resolver.clone(), config.support_bundle_collector.disable, nexus_id, + args.omdb_config.clone(), ), ), opctx: opctx.child(BTreeMap::new()), @@ -1191,6 +1193,8 @@ pub struct BackgroundTasksData { /// Channel for exposing the latest loaded fault-management sitrep. pub sitrep_load_tx: watch::Sender>>, + /// PATH information for `omdb`, for tasks that want to invoke it directly + pub omdb_config: OmdbConfig, } /// Starts the three DNS-propagation-related background tasks for either diff --git a/nexus/src/app/background/tasks/support_bundle/collection.rs b/nexus/src/app/background/tasks/support_bundle/collection.rs index 1008c85128f..46a8a721e22 100644 --- a/nexus/src/app/background/tasks/support_bundle/collection.rs +++ b/nexus/src/app/background/tasks/support_bundle/collection.rs @@ -53,6 +53,7 @@ pub struct BundleCollection { request: BundleRequest, bundle: SupportBundle, transfer_chunk_size: NonZeroU64, + omdb_config: nexus_config::OmdbConfig, } impl BundleCollection { @@ -64,6 +65,7 @@ impl BundleCollection { request: BundleRequest, bundle: SupportBundle, transfer_chunk_size: NonZeroU64, + omdb_config: nexus_config::OmdbConfig, ) -> Self { Self { datastore, @@ -73,6 +75,7 @@ impl BundleCollection { request, bundle, transfer_chunk_size, + omdb_config, } } @@ -100,6 +103,10 @@ impl BundleCollection { &self.bundle } + pub fn omdb_config(&self) -> &nexus_config::OmdbConfig { + &self.omdb_config + } + /// Collect the bundle within Nexus, and store it on a target sled. 
pub async fn collect_bundle_and_store_on_sled( self: &Arc, diff --git a/nexus/src/app/background/tasks/support_bundle/steps/mod.rs b/nexus/src/app/background/tasks/support_bundle/steps/mod.rs index d2179c74b8c..cade1943ff0 100644 --- a/nexus/src/app/background/tasks/support_bundle/steps/mod.rs +++ b/nexus/src/app/background/tasks/support_bundle/steps/mod.rs @@ -12,6 +12,7 @@ use nexus_types::internal_api::background::SupportBundleCollectionStep; mod bundle_id; mod ereports; mod host_info; +mod omdb; mod reconfigurator; mod sled_cubby; mod sp_dumps; @@ -85,5 +86,9 @@ pub fn all(cache: &Cache) -> Vec { } }), ), + CollectionStep::new( + SupportBundleCollectionStep::STEP_OMDB, + Box::new(|collection, dir| omdb::collect(collection, dir).boxed()), + ), ] } diff --git a/nexus/src/app/background/tasks/support_bundle/steps/omdb.rs b/nexus/src/app/background/tasks/support_bundle/steps/omdb.rs new file mode 100644 index 00000000000..b5ec795b4d3 --- /dev/null +++ b/nexus/src/app/background/tasks/support_bundle/steps/omdb.rs @@ -0,0 +1,46 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! 
Collects output from omdb commands + +use crate::app::background::tasks::support_bundle::collection::BundleCollection; +use crate::app::background::tasks::support_bundle::step::CollectionStepOutput; +use camino::Utf8Path; +use tokio::process::Command; + +pub async fn collect( + collection: &BundleCollection, + dir: &Utf8Path, +) -> anyhow::Result { + // Create the omdb/nexus/background-tasks directory + let omdb_dir = dir.join("omdb/nexus/background-tasks"); + tokio::fs::create_dir_all(&omdb_dir).await?; + + // Run the omdb command + let omdb_path = &collection.omdb_config().bin_path; + let output = Command::new(omdb_path) + .arg("nexus") + .arg("background-tasks") + .arg("list") + .output() + .await?; + + // Write the output to list.txt + let output_path = omdb_dir.join("list.txt"); + let output_text = if output.status.success() { + String::from_utf8_lossy(&output.stdout).to_string() + } else { + // If the command failed, include both stdout and stderr + format!( + "Command failed with exit code: {:?}\n\nSTDOUT:\n{}\n\nSTDERR:\n{}", + output.status.code(), + String::from_utf8_lossy(&output.stdout), + String::from_utf8_lossy(&output.stderr) + ) + }; + + tokio::fs::write(output_path, output_text).await?; + + Ok(CollectionStepOutput::None) +} diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 887be497a17..9c4227babf2 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -56,6 +56,7 @@ pub struct SupportBundleCollector { resolver: Resolver, disable: bool, nexus_id: OmicronZoneUuid, + omdb_config: nexus_config::OmdbConfig, } impl SupportBundleCollector { @@ -64,8 +65,15 @@ impl SupportBundleCollector { resolver: Resolver, disable: bool, nexus_id: OmicronZoneUuid, + omdb_config: nexus_config::OmdbConfig, ) -> Self { - SupportBundleCollector { datastore, resolver, disable, nexus_id } + 
SupportBundleCollector { + datastore, + resolver, + disable, + nexus_id, + omdb_config, + } } // Tells a sled agent to delete a support bundle @@ -357,6 +365,7 @@ impl SupportBundleCollector { request.clone(), bundle.clone(), request.transfer_chunk_size, + self.omdb_config.clone(), )); let authz_bundle = authz_support_bundle_from_id(bundle.id.into()); @@ -490,6 +499,7 @@ mod test { resolver.clone(), false, nexus.id(), + nexus_config::OmdbConfig { bin_path: "/nonexistent/omdb".into() }, ); let report = collector @@ -516,6 +526,7 @@ mod test { resolver.clone(), false, nexus.id(), + nexus_config::OmdbConfig { bin_path: "/nonexistent/omdb".into() }, ); let request = BundleRequest::default(); @@ -823,6 +834,7 @@ mod test { resolver.clone(), false, nexus.id(), + nexus_config::OmdbConfig { bin_path: "/nonexistent/omdb".into() }, ); // The bundle collection should complete successfully. @@ -902,6 +914,7 @@ mod test { resolver.clone(), false, nexus.id(), + nexus_config::OmdbConfig { bin_path: "/nonexistent/omdb".into() }, ); // Collect the bundle @@ -1013,6 +1026,7 @@ mod test { resolver.clone(), false, nexus.id(), + nexus_config::OmdbConfig { bin_path: "/nonexistent/omdb".into() }, ); // The bundle collection should complete successfully. @@ -1121,6 +1135,7 @@ mod test { resolver.clone(), false, nexus.id(), + nexus_config::OmdbConfig { bin_path: "/nonexistent/omdb".into() }, ); // Each time we call "collect_bundle", we collect a SINGLE bundle. 
@@ -1235,6 +1250,7 @@ mod test { resolver.clone(), false, nexus.id(), + nexus_config::OmdbConfig { bin_path: "/nonexistent/omdb".into() }, ); let report = collector @@ -1288,6 +1304,7 @@ mod test { resolver.clone(), false, nexus.id(), + nexus_config::OmdbConfig { bin_path: "/nonexistent/omdb".into() }, ); let mut request = BundleRequest::default(); request.data_selection.insert(BundleData::HostInfo(HashSet::new())); @@ -1387,6 +1404,7 @@ mod test { resolver.clone(), false, nexus.id(), + nexus_config::OmdbConfig { bin_path: "/nonexistent/omdb".into() }, ); let report = collector @@ -1443,6 +1461,7 @@ mod test { resolver.clone(), false, nexus.id(), + nexus_config::OmdbConfig { bin_path: "/nonexistent/omdb".into() }, ); let mut request = BundleRequest::default(); request.data_selection.insert(BundleData::HostInfo(HashSet::new())); @@ -1528,6 +1547,7 @@ mod test { resolver.clone(), false, nexus.id(), + nexus_config::OmdbConfig { bin_path: "/nonexistent/omdb".into() }, ); let mut request = BundleRequest::default(); request.data_selection.insert(BundleData::HostInfo(HashSet::new())); @@ -1612,6 +1632,7 @@ mod test { resolver.clone(), false, nexus.id(), + nexus_config::OmdbConfig { bin_path: "/nonexistent/omdb".into() }, ); // Collect the bundle diff --git a/nexus/src/app/mod.rs b/nexus/src/app/mod.rs index f9a8057958c..42f7750159b 100644 --- a/nexus/src/app/mod.rs +++ b/nexus/src/app/mod.rs @@ -647,6 +647,7 @@ impl Nexus { mgs_updates_tx, blueprint_load_tx, sitrep_load_tx, + omdb_config: task_config.pkg.omdb.clone(), }, ); diff --git a/nexus/tests/integration_tests/support_bundles.rs b/nexus/tests/integration_tests/support_bundles.rs index 4bd8d1e2258..d605168ee18 100644 --- a/nexus/tests/integration_tests/support_bundles.rs +++ b/nexus/tests/integration_tests/support_bundles.rs @@ -528,10 +528,17 @@ async fn test_support_bundle_lifecycle(cptestctx: &ControlPlaneTestContext) { // Now we should be able to download the bundle let contents = bundle_download(&client, 
bundle.id).await.unwrap(); let archive = ZipArchive::new(Cursor::new(&contents)).unwrap(); - let mut names = archive.file_names(); + let mut names = archive.file_names().peekable(); assert_eq!(names.next(), Some("bundle_id.txt")); assert_eq!(names.next(), Some("meta/")); assert_eq!(names.next(), Some("meta/trace.json")); + assert_eq!(names.next(), Some("omdb/")); + while let Some(name) = names.peek() { + if !name.starts_with("omdb/") { + break; + } + let _ = names.next(); + } assert_eq!(names.next(), Some("rack/")); assert!(names.any(|n| n == "sp_task_dumps/")); // There's much more data in the bundle, but validating it isn't the point diff --git a/nexus/types/src/internal_api/background.rs b/nexus/types/src/internal_api/background.rs index dfe008198f9..6661592a192 100644 --- a/nexus/types/src/internal_api/background.rs +++ b/nexus/types/src/internal_api/background.rs @@ -303,6 +303,7 @@ impl SupportBundleCollectionStep { pub const STEP_SPAWN_SP_DUMPS: &'static str = "spawn steps to query all SP dumps"; pub const STEP_SPAWN_SLEDS: &'static str = "spawn steps to query all sleds"; + pub const STEP_OMDB: &'static str = "omdb diagnostic output"; } #[derive(Debug, Deserialize, Serialize, PartialEq, Eq)] From ceb03e4d2e40b84231f8c057f81f64315c98fc10 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 15 Dec 2025 17:00:58 -0800 Subject: [PATCH 15/18] more commands --- .../tasks/support_bundle/collection.rs | 1 + .../tasks/support_bundle/steps/omdb.rs | 106 +++++++++++++++--- 2 files changed, 91 insertions(+), 16 deletions(-) diff --git a/nexus/src/app/background/tasks/support_bundle/collection.rs b/nexus/src/app/background/tasks/support_bundle/collection.rs index 46a8a721e22..b042abf72cf 100644 --- a/nexus/src/app/background/tasks/support_bundle/collection.rs +++ b/nexus/src/app/background/tasks/support_bundle/collection.rs @@ -57,6 +57,7 @@ pub struct BundleCollection { } impl BundleCollection { + #[allow(clippy::too_many_arguments)] pub fn new( datastore: Arc, 
resolver: Resolver, diff --git a/nexus/src/app/background/tasks/support_bundle/steps/omdb.rs b/nexus/src/app/background/tasks/support_bundle/steps/omdb.rs index b5ec795b4d3..e089e5f26db 100644 --- a/nexus/src/app/background/tasks/support_bundle/steps/omdb.rs +++ b/nexus/src/app/background/tasks/support_bundle/steps/omdb.rs @@ -9,38 +9,112 @@ use crate::app::background::tasks::support_bundle::step::CollectionStepOutput; use camino::Utf8Path; use tokio::process::Command; -pub async fn collect( +/// Run an omdb command and write its output to a file within the bundle. +/// +/// This function returns an error if we cannot write to our local filesystem, +/// or cannot run the omdb command at all. However, if the omdb command runs +/// and fails, it returns "Ok()". +/// +/// # Arguments +/// * `collection` - The bundle collection context +/// * `dir` - The root directory of the bundle +/// * `args` - The arguments to pass to omdb (e.g., `&["nexus", "background-tasks", "list"]`) +/// * `output_path` - The relative path within the bundle where output should be written +/// (e.g., `"omdb/nexus/background-tasks/list.txt"`) +async fn run_omdb( collection: &BundleCollection, dir: &Utf8Path, -) -> anyhow::Result { - // Create the omdb/nexus/background-tasks directory - let omdb_dir = dir.join("omdb/nexus/background-tasks"); - tokio::fs::create_dir_all(&omdb_dir).await?; + args: &[&str], + output_path: &str, +) -> anyhow::Result<()> { + let full_output_path = dir.join(output_path); + + // Create parent directories if they don't exist + if let Some(parent) = full_output_path.parent() { + tokio::fs::create_dir_all(parent).await?; + } // Run the omdb command let omdb_path = &collection.omdb_config().bin_path; - let output = Command::new(omdb_path) - .arg("nexus") - .arg("background-tasks") - .arg("list") - .output() - .await?; - - // Write the output to list.txt - let output_path = omdb_dir.join("list.txt"); + let output = Command::new(omdb_path).args(args).output().await?; + + // 
Format the output
     let output_text = if output.status.success() {
         String::from_utf8_lossy(&output.stdout).to_string()
     } else {
         // If the command failed, include both stdout and stderr
         format!(
-            "Command failed with exit code: {:?}\n\nSTDOUT:\n{}\n\nSTDERR:\n{}",
+            "Command {} failed with exit code: {:?}\n\nSTDOUT:\n{}\n\nSTDERR:\n{}",
+            args.join(" "),
             output.status.code(),
             String::from_utf8_lossy(&output.stdout),
             String::from_utf8_lossy(&output.stderr)
         )
     };
 
-    tokio::fs::write(output_path, output_text).await?;
+    tokio::fs::write(full_output_path, output_text).await?;
+    Ok(())
+}
+
+/// Collect diagnostic output from various omdb commands.
+///
+/// This function runs multiple omdb queries and stores their output in the bundle.
+/// To add more omdb queries, simply add another `run_omdb()` call with the
+/// appropriate arguments and output path.
+pub async fn collect(
+    collection: &BundleCollection,
+    dir: &Utf8Path,
+) -> anyhow::Result<CollectionStepOutput> {
+    // NOTE: We could parallelize these commands, if they take a while.
+    //
+    // NOTE: These commands issue queries to "some Nexus", as returned by DNS -
+    // not necessarily our own Nexus. We may want to include queries to
+    // each Nexus instance individually in a future iteration, especially for
+    // "nexus-specific" commands.
+
+    // Run a sequence of omdb commands. If any of these commands fail, we'll
+    // save the stdout and stderr, and proceed to the next one (note that
+    // "run_omdb" does not return an error when the output is not successful).
+ + run_omdb( + collection, + dir, + &["nexus", "background-tasks", "list"], + "omdb/nexus/background-tasks/list.txt", + ) + .await?; + + run_omdb( + collection, + dir, + &["nexus", "quiesce", "show"], + "omdb/nexus/quiesce/show.txt", + ) + .await?; + + run_omdb( + collection, + dir, + &["nexus", "mgs-updates"], + "omdb/nexus/mgs-updates.txt", + ) + .await?; + + run_omdb( + collection, + dir, + &["nexus", "update-status"], + "omdb/nexus/update-status.txt", + ) + .await?; + + run_omdb( + collection, + dir, + &["db", "saga", "running"], + "omdb/db/saga/running", + ) + .await?; Ok(CollectionStepOutput::None) } From 1cb8fcb237cd1e02e776a9b251e1fba902b53bf9 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 17 Dec 2025 13:23:37 -0800 Subject: [PATCH 16/18] integration testing --- .config/nextest.toml | 10 ++++ .../tasks/support_bundle/steps/omdb.rs | 10 +++- nexus/test-utils/src/starter.rs | 27 +++++++++ .../integration_tests/support_bundles.rs | 57 ++++++++++++++++++- 4 files changed, 102 insertions(+), 2 deletions(-) diff --git a/.config/nextest.toml b/.config/nextest.toml index c77c8627f1e..96460bccfab 100644 --- a/.config/nextest.toml +++ b/.config/nextest.toml @@ -15,6 +15,12 @@ experimental = ["setup-scripts"] filter = 'rdeps(nexus-test-utils) - package(omicron-dev) - package(omicron-live-tests)' setup = 'crdb-seed' +[[profile.default.scripts]] +# Build omdb for usage within Nexus integration tests. +# This was initially added for the support bundle integration tests. +filter = 'rdeps(nexus-test-utils) - package(omicron-dev) - package(omicron-live-tests)' +setup = 'omdb-build' + [profile.ci] fail-fast = false @@ -26,6 +32,10 @@ path = "junit.xml" # invocations of nextest happen. 
command = 'cargo run -p crdb-seed --profile test' +[scripts.setup.omdb-build] +# Build omdb binary for usage by integration tests +command = 'cargo build --bin omdb' + [[profile.default.scripts]] filter = 'package(omicron-clickhouse-admin)' setup = 'clickhouse-cluster' diff --git a/nexus/src/app/background/tasks/support_bundle/steps/omdb.rs b/nexus/src/app/background/tasks/support_bundle/steps/omdb.rs index e089e5f26db..8d4d6768464 100644 --- a/nexus/src/app/background/tasks/support_bundle/steps/omdb.rs +++ b/nexus/src/app/background/tasks/support_bundle/steps/omdb.rs @@ -36,7 +36,15 @@ async fn run_omdb( // Run the omdb command let omdb_path = &collection.omdb_config().bin_path; - let output = Command::new(omdb_path).args(args).output().await?; + let output = + Command::new(omdb_path).args(args).output().await.map_err(|e| { + anyhow::anyhow!( + "Failed to execute omdb at {:?} with args {:?}: {}", + omdb_path, + args, + e + ) + })?; // Format the output let output_text = if output.status.success() { diff --git a/nexus/test-utils/src/starter.rs b/nexus/test-utils/src/starter.rs index 412959d3d63..cac54381db1 100644 --- a/nexus/test-utils/src/starter.rs +++ b/nexus/test-utils/src/starter.rs @@ -560,6 +560,33 @@ impl<'a, N: NexusServer> ControlPlaneStarter<'a, N> { .clone(), }; + // Configure the omdb binary path for tests. + // The binary is built by cargo at the workspace root in target//omdb. + // Tests run from the nexus directory, so we need to go up one level. 
+ let workspace_root = std::env::current_dir() + .expect("Failed to get current directory") + .parent() + .expect("Current directory should have a parent") + .to_path_buf(); + let omdb_debug = workspace_root.join("target/debug/omdb"); + let omdb_release = workspace_root.join("target/release/omdb"); + + self.config.pkg.omdb.bin_path = if omdb_release.exists() { + camino::Utf8PathBuf::try_from(omdb_release) + .expect("Failed to convert release path to UTF-8") + } else if omdb_debug.exists() { + camino::Utf8PathBuf::try_from(omdb_debug) + .expect("Failed to convert debug path to UTF-8") + } else { + // omdb hasn't been built yet - use a path that will fail gracefully + // when tests try to use it. + // + // Our rules in ".config/nextest.toml" should prevent this, but this + // acts as a defensive buffer against running without nextest, or + // changing the directory layout. + camino::Utf8PathBuf::from("/nonexistent/omdb") + }; + let nexus_internal = N::start_internal(&self.config, &log).await?; let nexus_internal_addr = nexus_internal.get_http_server_internal_address(); diff --git a/nexus/tests/integration_tests/support_bundles.rs b/nexus/tests/integration_tests/support_bundles.rs index d605168ee18..22b7082e82c 100644 --- a/nexus/tests/integration_tests/support_bundles.rs +++ b/nexus/tests/integration_tests/support_bundles.rs @@ -521,6 +521,10 @@ async fn test_support_bundle_lifecycle(cptestctx: &ControlPlaneTestContext) { step_names.contains(&SupportBundleCollectionStep::STEP_SPAWN_SP_DUMPS), "Should have attempted to list service processors" ); + assert!( + step_names.contains(&SupportBundleCollectionStep::STEP_OMDB), + "Should have run omdb diagnostic commands" + ); let bundle = bundle_get(&client, bundle.id).await.unwrap(); assert_eq!(bundle.state, SupportBundleState::Active); @@ -528,17 +532,68 @@ async fn test_support_bundle_lifecycle(cptestctx: &ControlPlaneTestContext) { // Now we should be able to download the bundle let contents = bundle_download(&client, 
bundle.id).await.unwrap(); let archive = ZipArchive::new(Cursor::new(&contents)).unwrap(); - let mut names = archive.file_names().peekable(); + let mut names = archive.file_names().collect::>(); + names.sort(); + let mut names = names.into_iter().peekable(); + assert_eq!(names.next(), Some("bundle_id.txt")); assert_eq!(names.next(), Some("meta/")); assert_eq!(names.next(), Some("meta/trace.json")); assert_eq!(names.next(), Some("omdb/")); + + // Collect omdb file names and verify they exist + let mut omdb_files = Vec::new(); while let Some(name) = names.peek() { if !name.starts_with("omdb/") { break; } + omdb_files.push(*name); let _ = names.next(); } + + // Verify we have omdb output files + assert!(!omdb_files.is_empty(), "Should have omdb output files"); + + // Verify that none of the omdb output files contain "error: unrecognized + // subcommand" This catches regressions where omdb's command structure + // changes and our hardcoded commands become invalid. + let mut archive = ZipArchive::new(Cursor::new(&contents)).unwrap(); + let mut files_checked = 0; + for file_name in &omdb_files { + // Skip directories + if file_name.ends_with('/') { + continue; + } + + let mut file = archive + .by_name(file_name) + .unwrap_or_else(|_| panic!("Should be able to open {}", file_name)); + let mut content = String::new(); + std::io::Read::read_to_string(&mut file, &mut content) + .unwrap_or_else(|_| panic!("Should be able to read {}", file_name)); + + files_checked += 1; + + // Validate that the omdb command is valid, even if it can't connect + // to a runnine Nexus right now. + assert!( + !content.contains("error: unrecognized subcommand"), + "File {} contains 'error: unrecognized subcommand'.\n\ + This indicates the omdb command is invalid. Content:\n{}", + file_name, + content + ); + } + + // Make sure we actually checked at least one omdb output file. + // If this fails, it means the bundle had omdb directories but no actual + // output files, which would be a bug. 
+ assert!( + files_checked > 0, + "Expected to check at least one omdb output file, but found only directories. Files: {:?}", + omdb_files + ); + assert_eq!(names.next(), Some("rack/")); assert!(names.any(|n| n == "sp_task_dumps/")); // There's much more data in the bundle, but validating it isn't the point From b1d5434b0e3b818217fcf81ffec9707c6b625312 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 17 Dec 2025 14:13:36 -0800 Subject: [PATCH 17/18] typos, docs --- nexus/src/app/background/tasks/support_bundle/README.md | 1 + nexus/tests/integration_tests/support_bundles.rs | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/nexus/src/app/background/tasks/support_bundle/README.md b/nexus/src/app/background/tasks/support_bundle/README.md index 446cab83e83..8eebf508883 100644 --- a/nexus/src/app/background/tasks/support_bundle/README.md +++ b/nexus/src/app/background/tasks/support_bundle/README.md @@ -78,6 +78,7 @@ somewhere consistent and predictable. * `ereports/{part number}-{serial number}/{id}.json` - Individual reports * `meta/` - Metadata about the bundle * `meta/trace.json` - Perfetto-formatted trace of the bundle's collection +* `omdb/` - Output from omdb commands * `rack/{rack id}/sled/{sled id}/` - Sled-specific host OS info * `reconfigurator_state.json` - A dump of all reconfigurator state * `sled_info.json` - Mapping of sled identifiers to cubby location diff --git a/nexus/tests/integration_tests/support_bundles.rs b/nexus/tests/integration_tests/support_bundles.rs index 22b7082e82c..42e93bc522e 100644 --- a/nexus/tests/integration_tests/support_bundles.rs +++ b/nexus/tests/integration_tests/support_bundles.rs @@ -575,7 +575,7 @@ async fn test_support_bundle_lifecycle(cptestctx: &ControlPlaneTestContext) { files_checked += 1; // Validate that the omdb command is valid, even if it can't connect - // to a runnine Nexus right now. + // to a running Nexus right now. 
assert!( !content.contains("error: unrecognized subcommand"), "File {} contains 'error: unrecognized subcommand'.\n\ From adc037a511c8338f3446ba2e67c7143116a60f5a Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Thu, 18 Dec 2025 12:53:43 -0800 Subject: [PATCH 18/18] split omdb into lib/bin, make dup, use it in nexus tests --- .config/nextest.toml | 10 - Cargo.lock | 2 + dev-tools/omdb/src/bin/omdb/main.rs | 309 +---------------- .../omdb/src/{bin/omdb => }/crucible_agent.rs | 0 .../src/{bin/omdb => }/crucible_pantry.rs | 0 dev-tools/omdb/src/{bin/omdb => }/db.rs | 0 dev-tools/omdb/src/{bin/omdb => }/db/alert.rs | 0 .../omdb/src/{bin/omdb => }/db/blueprints.rs | 0 .../omdb/src/{bin/omdb => }/db/db_metadata.rs | 0 .../omdb/src/{bin/omdb => }/db/ereport.rs | 0 dev-tools/omdb/src/{bin/omdb => }/db/saga.rs | 0 .../omdb/src/{bin/omdb => }/db/sitrep.rs | 0 .../src/{bin/omdb => }/db/user_data_export.rs | 0 .../omdb/src/{bin/omdb => }/db/whatis.rs | 0 dev-tools/omdb/src/{bin/omdb => }/helpers.rs | 0 dev-tools/omdb/src/lib.rs | 317 ++++++++++++++++++ dev-tools/omdb/src/{bin/omdb => }/mgs.rs | 0 .../omdb/src/{bin/omdb => }/mgs/dashboard.rs | 0 .../omdb/src/{bin/omdb => }/mgs/sensors.rs | 0 dev-tools/omdb/src/{bin/omdb => }/nexus.rs | 0 .../omdb/src/{bin/omdb => }/nexus/quiesce.rs | 0 .../omdb => }/nexus/reconfigurator_config.rs | 0 .../src/{bin/omdb => }/nexus/update_status.rs | 0 dev-tools/omdb/src/{bin/omdb => }/oximeter.rs | 0 dev-tools/omdb/src/{bin/omdb => }/oxql.rs | 0 .../omdb/src/{bin/omdb => }/reconfigurator.rs | 0 .../omdb/src/{bin/omdb => }/sled_agent.rs | 0 .../omdb/src/{bin/omdb => }/support_bundle.rs | 0 nexus/Cargo.toml | 6 + nexus/src/bin/omdb-dup.rs | 25 ++ nexus/test-utils-macros/src/lib.rs | 15 + nexus/test-utils/src/starter.rs | 27 -- 32 files changed, 373 insertions(+), 338 deletions(-) rename dev-tools/omdb/src/{bin/omdb => }/crucible_agent.rs (100%) rename dev-tools/omdb/src/{bin/omdb => }/crucible_pantry.rs (100%) rename dev-tools/omdb/src/{bin/omdb => 
}/db.rs (100%) rename dev-tools/omdb/src/{bin/omdb => }/db/alert.rs (100%) rename dev-tools/omdb/src/{bin/omdb => }/db/blueprints.rs (100%) rename dev-tools/omdb/src/{bin/omdb => }/db/db_metadata.rs (100%) rename dev-tools/omdb/src/{bin/omdb => }/db/ereport.rs (100%) rename dev-tools/omdb/src/{bin/omdb => }/db/saga.rs (100%) rename dev-tools/omdb/src/{bin/omdb => }/db/sitrep.rs (100%) rename dev-tools/omdb/src/{bin/omdb => }/db/user_data_export.rs (100%) rename dev-tools/omdb/src/{bin/omdb => }/db/whatis.rs (100%) rename dev-tools/omdb/src/{bin/omdb => }/helpers.rs (100%) create mode 100644 dev-tools/omdb/src/lib.rs rename dev-tools/omdb/src/{bin/omdb => }/mgs.rs (100%) rename dev-tools/omdb/src/{bin/omdb => }/mgs/dashboard.rs (100%) rename dev-tools/omdb/src/{bin/omdb => }/mgs/sensors.rs (100%) rename dev-tools/omdb/src/{bin/omdb => }/nexus.rs (100%) rename dev-tools/omdb/src/{bin/omdb => }/nexus/quiesce.rs (100%) rename dev-tools/omdb/src/{bin/omdb => }/nexus/reconfigurator_config.rs (100%) rename dev-tools/omdb/src/{bin/omdb => }/nexus/update_status.rs (100%) rename dev-tools/omdb/src/{bin/omdb => }/oximeter.rs (100%) rename dev-tools/omdb/src/{bin/omdb => }/oxql.rs (100%) rename dev-tools/omdb/src/{bin/omdb => }/reconfigurator.rs (100%) rename dev-tools/omdb/src/{bin/omdb => }/sled_agent.rs (100%) rename dev-tools/omdb/src/{bin/omdb => }/support_bundle.rs (100%) create mode 100644 nexus/src/bin/omdb-dup.rs diff --git a/.config/nextest.toml b/.config/nextest.toml index 96460bccfab..c77c8627f1e 100644 --- a/.config/nextest.toml +++ b/.config/nextest.toml @@ -15,12 +15,6 @@ experimental = ["setup-scripts"] filter = 'rdeps(nexus-test-utils) - package(omicron-dev) - package(omicron-live-tests)' setup = 'crdb-seed' -[[profile.default.scripts]] -# Build omdb for usage within Nexus integration tests. -# This was initially added for the support bundle integration tests. 
-filter = 'rdeps(nexus-test-utils) - package(omicron-dev) - package(omicron-live-tests)' -setup = 'omdb-build' - [profile.ci] fail-fast = false @@ -32,10 +26,6 @@ path = "junit.xml" # invocations of nextest happen. command = 'cargo run -p crdb-seed --profile test' -[scripts.setup.omdb-build] -# Build omdb binary for usage by integration tests -command = 'cargo build --bin omdb' - [[profile.default.scripts]] filter = 'package(omicron-clickhouse-admin)' setup = 'clickhouse-cluster' diff --git a/Cargo.lock b/Cargo.lock index 004917e5a80..f2d89bf7868 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8322,6 +8322,7 @@ dependencies = [ "num-integer", "omicron-cockroach-metrics", "omicron-common", + "omicron-omdb", "omicron-passwords", "omicron-rpaths", "omicron-sled-agent", @@ -8372,6 +8373,7 @@ dependencies = [ "serde_urlencoded", "serde_with", "sha2", + "sigpipe", "similar-asserts", "sled-agent-client", "sled-agent-types", diff --git a/dev-tools/omdb/src/bin/omdb/main.rs b/dev-tools/omdb/src/bin/omdb/main.rs index 61e558daa43..44ceec034ce 100644 --- a/dev-tools/omdb/src/bin/omdb/main.rs +++ b/dev-tools/omdb/src/bin/omdb/main.rs @@ -2,311 +2,18 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -//! CLI for debugging Omicron internal state +//! Omicron debugger (omdb) - binary entrypoint //! -//! GROUND RULES: -//! -//! 1. There aren't a lot of ground rules here. At least for now, this is a -//! place to put any kind of runtime tooling for Omicron that seems useful. -//! You can query the database directly (see notes in db.rs), use internal -//! APIs, etc. To the degree that we can stick to stable interfaces, great. -//! But at this stage we'd rather have tools that work on latest than not -//! have them because we couldn't prioritize keeping them stable. -//! -//! 2. Debuggers should never lie! Documentation and command names should be -//! precise about what they're reporting. 
In a working system, these things -//! might all be the same: -//! -//! - the list of instances with zones and propolis processes running on -//! a sled -//! - the list of instances that sled agent knows about -//! - the list of instances that Nexus or the database reports should be -//! running on a sled -//! -//! But in a broken system, these things might be all different. People use -//! debuggers to understand broken systems. The debugger should say which of -//! these it's reporting, rather than "the list of instances on a sled". -//! -//! 3. Where possible, when the tool encounters something unexpected, it should -//! print what it can (including the error message and bad data) and then -//! continue. It generally shouldn't stop on the first error. (We often -//! find strange things when debugging but we need our tools to tell us as -//! much as they can!) +//! This is a small shim over `lib.rs`, and is structured this way so that other +//! crates can depend on omicron-omdb as a library. 
-use anyhow::Context; -use anyhow::anyhow; -use anyhow::ensure; -use clap::Args; -use clap::ColorChoice; use clap::Parser; -use clap::Subcommand; -use futures::StreamExt; -use internal_dns_types::names::ServiceName; -use omicron_common::address::Ipv6Subnet; -use std::net::SocketAddr; -use std::net::SocketAddrV6; -use tokio::net::TcpSocket; - -mod crucible_agent; -mod crucible_pantry; -mod db; -mod helpers; -mod mgs; -mod nexus; -mod oximeter; -mod oxql; -mod reconfigurator; -mod sled_agent; -mod support_bundle; +use omicron_omdb::Omdb; fn main() -> Result<(), anyhow::Error> { sigpipe::reset(); - oxide_tokio_rt::run(main_impl()) -} - -async fn main_impl() -> Result<(), anyhow::Error> { - let args = Omdb::parse(); - - let log = dropshot::ConfigLogging::StderrTerminal { - level: args.log_level.clone(), - } - .to_logger("omdb") - .context("failed to create logger")?; - - match &args.command { - OmdbCommands::Db(db) => db.run_cmd(&args, &log).await, - OmdbCommands::Mgs(mgs) => mgs.run_cmd(&args, &log).await, - OmdbCommands::Nexus(nexus) => nexus.run_cmd(&args, &log).await, - OmdbCommands::Oximeter(oximeter) => oximeter.run_cmd(&args, &log).await, - OmdbCommands::Oxql(oxql) => oxql.run_cmd(&args, &log).await, - OmdbCommands::Reconfigurator(reconfig) => { - reconfig.run_cmd(&args, &log).await - } - OmdbCommands::SledAgent(sled) => sled.run_cmd(&args, &log).await, - OmdbCommands::CrucibleAgent(crucible) => crucible.run_cmd(&args).await, - OmdbCommands::CruciblePantry(crucible) => crucible.run_cmd(&args).await, - } -} - -/// Omicron debugger (unstable) -/// -/// This tool provides commands for directly querying Omicron components about -/// their internal state using internal APIs. This is a prototype. The -/// commands and output are unstable and may change. 
-#[derive(Debug, Parser)] -struct Omdb { - /// log level filter - #[arg( - env, - long, - value_parser = parse_dropshot_log_level, - default_value = "warn", - global = true, - )] - log_level: dropshot::ConfigLoggingLevel, - - #[arg( - long, - env = "OMDB_DNS_SERVER", - global = true, - help_heading = helpers::CONNECTION_OPTIONS_HEADING, - )] - dns_server: Option, - - /// Allow potentially-destructive subcommands. - #[arg( - short = 'w', - long = "destructive", - global = true, - help_heading = helpers::SAFETY_OPTIONS_HEADING, - )] - allow_destructive: bool, - - #[command(flatten)] - output: OutputOpts, - - #[command(subcommand)] - command: OmdbCommands, -} - -#[derive(Debug, Args)] -struct OutputOpts { - /// Color output - #[arg(long, global = true, value_enum, default_value_t)] - color: ColorChoice, -} - -mod check_allow_destructive { - /// Zero-size type that potentially-destructive functions can accept to - /// ensure `Omdb::check_allow_destructive` has been called. - // This is tucked away inside a module to prevent it from being constructed - // by anything other than `Omdb::check_allow_destructive`. - #[must_use] - pub(crate) struct DestructiveOperationToken(()); - - impl super::Omdb { - pub(crate) fn check_allow_destructive( - &self, - ) -> anyhow::Result { - anyhow::ensure!( - self.allow_destructive, - "This command is potentially destructive. \ - Pass the `-w` / `--destructive` flag to allow it." 
- ); - Ok(DestructiveOperationToken(())) - } - } -} - -impl Omdb { - /// Return the socket addresses of all instances of a service in DNS - async fn dns_lookup_all( - &self, - log: slog::Logger, - service_name: ServiceName, - ) -> Result, anyhow::Error> { - let resolver = self.dns_resolver(log).await?; - resolver - .lookup_all_socket_v6(service_name) - .await - .with_context(|| format!("looking up {:?} in DNS", service_name)) - } - - /// Return the socket address of one instance of a service that we can at - /// least successfully connect to - async fn dns_lookup_one( - &self, - log: slog::Logger, - service_name: ServiceName, - ) -> Result { - let addrs = self.dns_lookup_all(log, service_name).await?; - ensure!( - !addrs.is_empty(), - "expected at least one address from successful DNS lookup for {:?}", - service_name - ); - - // The caller is going to pick one of these addresses to connect to. - // Let's try to pick one that's at least not obviously broken by - // attempting to connect to whatever we found and returning any that we - // successfully connected to. It'd be nice if we could return the - // socket directly, but our callers are creating reqwest clients that - // cannot easily consume a socket directly. - // - // This approach scales poorly and there are many failure modes that - // this does not cover. But in the absence of better connection - // management, and with the risks in `omdb` being pretty low, and the - // value of it working pretty high, here we are. This approach should - // not be replicated elsewhere. - async fn try_connect( - sockaddr_v6: SocketAddrV6, - ) -> Result<(), anyhow::Error> { - let _ = TcpSocket::new_v6() - .context("creating socket")? 
- .connect(SocketAddr::from(sockaddr_v6)) - .await - .with_context(|| format!("connect \"{}\"", sockaddr_v6))?; - Ok(()) - } - - let mut socket_stream = futures::stream::iter(addrs) - .map(async move |sockaddr_v6| { - (sockaddr_v6, try_connect(sockaddr_v6).await) - }) - .buffer_unordered(3); - - while let Some((sockaddr, connect_result)) = socket_stream.next().await - { - match connect_result { - Ok(()) => return Ok(sockaddr), - Err(error) => { - eprintln!( - "warning: failed to connect to {:?} at {}: {:#}", - service_name, sockaddr, error - ); - } - } - } - - Err(anyhow!("failed to connect to any instances of {:?}", service_name)) - } - - async fn dns_resolver( - &self, - log: slog::Logger, - ) -> Result { - match &self.dns_server { - Some(dns_server) => { - internal_dns_resolver::Resolver::new_from_addrs( - log, - &[*dns_server], - ) - .with_context(|| { - format!( - "creating DNS resolver for DNS server {:?}", - dns_server - ) - }) - } - None => { - // In principle, we should look at /etc/resolv.conf to find the - // DNS servers. In practice, this usually isn't populated - // today. See oxidecomputer/omicron#2122. - // - // However, the address selected below should work for most - // existing Omicron deployments today. That's because while the - // base subnet is in principle configurable in config-rss.toml, - // it's very uncommon to change it from the default value used - // here. - // - // Yet another option would be to find a local IP address that - // looks like it's probably on the underlay network and use that - // to find the subnet to use. But again, this is unlikely to be - // wrong and it's easy to override. 
- let subnet = - Ipv6Subnet::new("fd00:1122:3344:0100::".parse().unwrap()); - eprintln!("note: using DNS server for subnet {}", subnet.net()); - eprintln!( - "note: (if this is not right, use --dns-server \ - to specify an alternate DNS server)", - ); - internal_dns_resolver::Resolver::new_from_subnet(log, subnet) - .with_context(|| { - format!( - "creating DNS resolver for subnet {}", - subnet.net() - ) - }) - } - } - } -} - -#[derive(Debug, Subcommand)] -#[allow(clippy::large_enum_variant)] -enum OmdbCommands { - /// Debug a specific crucible-agent - CrucibleAgent(crucible_agent::CrucibleAgentArgs), - /// Query a specific crucible-pantry - CruciblePantry(crucible_pantry::CruciblePantryArgs), - /// Query the control plane database (CockroachDB) - Db(db::DbArgs), - /// Debug a specific Management Gateway Service instance - Mgs(mgs::MgsArgs), - /// Debug a specific Nexus instance - Nexus(nexus::NexusArgs), - /// Query oximeter collector state - Oximeter(oximeter::OximeterArgs), - /// Enter the Oximeter Query Language shell for interactive querying. 
- Oxql(oxql::OxqlArgs), - /// Interact with the Reconfigurator system - Reconfigurator(reconfigurator::ReconfiguratorArgs), - /// Debug a specific Sled - SledAgent(sled_agent::SledAgentArgs), -} - -fn parse_dropshot_log_level( - s: &str, -) -> Result { - serde_json::from_str(&format!("{:?}", s)).context("parsing log level") + oxide_tokio_rt::run(async { + let cmd = Omdb::parse(); + cmd.exec().await + }) } diff --git a/dev-tools/omdb/src/bin/omdb/crucible_agent.rs b/dev-tools/omdb/src/crucible_agent.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/crucible_agent.rs rename to dev-tools/omdb/src/crucible_agent.rs diff --git a/dev-tools/omdb/src/bin/omdb/crucible_pantry.rs b/dev-tools/omdb/src/crucible_pantry.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/crucible_pantry.rs rename to dev-tools/omdb/src/crucible_pantry.rs diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/db.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/db.rs rename to dev-tools/omdb/src/db.rs diff --git a/dev-tools/omdb/src/bin/omdb/db/alert.rs b/dev-tools/omdb/src/db/alert.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/db/alert.rs rename to dev-tools/omdb/src/db/alert.rs diff --git a/dev-tools/omdb/src/bin/omdb/db/blueprints.rs b/dev-tools/omdb/src/db/blueprints.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/db/blueprints.rs rename to dev-tools/omdb/src/db/blueprints.rs diff --git a/dev-tools/omdb/src/bin/omdb/db/db_metadata.rs b/dev-tools/omdb/src/db/db_metadata.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/db/db_metadata.rs rename to dev-tools/omdb/src/db/db_metadata.rs diff --git a/dev-tools/omdb/src/bin/omdb/db/ereport.rs b/dev-tools/omdb/src/db/ereport.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/db/ereport.rs rename to dev-tools/omdb/src/db/ereport.rs diff --git a/dev-tools/omdb/src/bin/omdb/db/saga.rs b/dev-tools/omdb/src/db/saga.rs similarity index 
100% rename from dev-tools/omdb/src/bin/omdb/db/saga.rs rename to dev-tools/omdb/src/db/saga.rs diff --git a/dev-tools/omdb/src/bin/omdb/db/sitrep.rs b/dev-tools/omdb/src/db/sitrep.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/db/sitrep.rs rename to dev-tools/omdb/src/db/sitrep.rs diff --git a/dev-tools/omdb/src/bin/omdb/db/user_data_export.rs b/dev-tools/omdb/src/db/user_data_export.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/db/user_data_export.rs rename to dev-tools/omdb/src/db/user_data_export.rs diff --git a/dev-tools/omdb/src/bin/omdb/db/whatis.rs b/dev-tools/omdb/src/db/whatis.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/db/whatis.rs rename to dev-tools/omdb/src/db/whatis.rs diff --git a/dev-tools/omdb/src/bin/omdb/helpers.rs b/dev-tools/omdb/src/helpers.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/helpers.rs rename to dev-tools/omdb/src/helpers.rs diff --git a/dev-tools/omdb/src/lib.rs b/dev-tools/omdb/src/lib.rs new file mode 100644 index 00000000000..6a4a84c904b --- /dev/null +++ b/dev-tools/omdb/src/lib.rs @@ -0,0 +1,317 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Omicron debugger (omdb) - library interface +//! +//! This module exposes omdb's CLI functionality as a library, allowing other +//! crates to create their own omdb binaries. +//! +//! GROUND RULES: +//! +//! 1. There aren't a lot of ground rules here. At least for now, this is a +//! place to put any kind of runtime tooling for Omicron that seems useful. +//! You can query the database directly (see notes in db.rs), use internal +//! APIs, etc. To the degree that we can stick to stable interfaces, great. +//! But at this stage we'd rather have tools that work on latest than not +//! have them because we couldn't prioritize keeping them stable. 
+//! +//! 2. Debuggers should never lie! Documentation and command names should be +//! precise about what they're reporting. In a working system, these things +//! might all be the same: +//! +//! - the list of instances with zones and propolis processes running on +//! a sled +//! - the list of instances that sled agent knows about +//! - the list of instances that Nexus or the database reports should be +//! running on a sled +//! +//! But in a broken system, these things might be all different. People use +//! debuggers to understand broken systems. The debugger should say which of +//! these it's reporting, rather than "the list of instances on a sled". +//! +//! 3. Where possible, when the tool encounters something unexpected, it should +//! print what it can (including the error message and bad data) and then +//! continue. It generally shouldn't stop on the first error. (We often +//! find strange things when debugging but we need our tools to tell us as +//! much as they can!) + +use anyhow::Context; +use anyhow::anyhow; +use anyhow::ensure; +use clap::Args; +use clap::ColorChoice; +use clap::Parser; +use clap::Subcommand; +use futures::StreamExt; +use internal_dns_types::names::ServiceName; +use omicron_common::address::Ipv6Subnet; +use std::net::SocketAddr; +use std::net::SocketAddrV6; +use tokio::net::TcpSocket; + +mod crucible_agent; +mod crucible_pantry; +mod db; +mod helpers; +mod mgs; +mod nexus; +mod oximeter; +mod oxql; +mod reconfigurator; +mod sled_agent; +mod support_bundle; + +/// Omicron debugger (unstable) +/// +/// This tool provides commands for directly querying Omicron components about +/// their internal state using internal APIs. This is a prototype. The +/// commands and output are unstable and may change. 
+#[derive(Debug, Parser)] +pub struct Omdb { + /// log level filter + #[arg( + env, + long, + value_parser = parse_dropshot_log_level, + default_value = "warn", + global = true, + )] + log_level: dropshot::ConfigLoggingLevel, + + #[arg( + long, + env = "OMDB_DNS_SERVER", + global = true, + help_heading = helpers::CONNECTION_OPTIONS_HEADING, + )] + dns_server: Option<SocketAddr>, + + /// Allow potentially-destructive subcommands. + #[arg( + short = 'w', + long = "destructive", + global = true, + help_heading = helpers::SAFETY_OPTIONS_HEADING, + )] + allow_destructive: bool, + + #[command(flatten)] + output: OutputOpts, + + #[command(subcommand)] + command: OmdbCommands, +} + +impl Omdb { + /// Execute the omdb command. + pub async fn exec(self) -> Result<(), anyhow::Error> { + let log = dropshot::ConfigLogging::StderrTerminal { + level: self.log_level.clone(), + } + .to_logger("omdb") + .context("failed to create logger")?; + + match &self.command { + OmdbCommands::Db(db) => db.run_cmd(&self, &log).await, + OmdbCommands::Mgs(mgs) => mgs.run_cmd(&self, &log).await, + OmdbCommands::Nexus(nexus) => nexus.run_cmd(&self, &log).await, + OmdbCommands::Oximeter(oximeter) => { + oximeter.run_cmd(&self, &log).await + } + OmdbCommands::Oxql(oxql) => oxql.run_cmd(&self, &log).await, + OmdbCommands::Reconfigurator(reconfig) => { + reconfig.run_cmd(&self, &log).await + } + OmdbCommands::SledAgent(sled) => sled.run_cmd(&self, &log).await, + OmdbCommands::CrucibleAgent(crucible) => { + crucible.run_cmd(&self).await + } + OmdbCommands::CruciblePantry(crucible) => { + crucible.run_cmd(&self).await + } + } + } +} + +#[derive(Debug, Args)] +struct OutputOpts { + /// Color output + #[arg(long, global = true, value_enum, default_value_t)] + color: ColorChoice, +} + +mod check_allow_destructive { + /// Zero-size type that potentially-destructive functions can accept to + /// ensure `Omdb::check_allow_destructive` has been called. 
+ // This is tucked away inside a module to prevent it from being constructed + // by anything other than `Omdb::check_allow_destructive`. + #[must_use] + pub(crate) struct DestructiveOperationToken(()); + + impl super::Omdb { + pub(crate) fn check_allow_destructive( + &self, + ) -> anyhow::Result<DestructiveOperationToken> { + anyhow::ensure!( + self.allow_destructive, + "This command is potentially destructive. \ + Pass the `-w` / `--destructive` flag to allow it." + ); + Ok(DestructiveOperationToken(())) + } + } +} + +impl Omdb { + /// Return the socket addresses of all instances of a service in DNS + async fn dns_lookup_all( + &self, + log: slog::Logger, + service_name: ServiceName, + ) -> Result<Vec<SocketAddrV6>, anyhow::Error> { + let resolver = self.dns_resolver(log).await?; + resolver + .lookup_all_socket_v6(service_name) + .await + .with_context(|| format!("looking up {:?} in DNS", service_name)) + } + + /// Return the socket address of one instance of a service that we can at + /// least successfully connect to + async fn dns_lookup_one( + &self, + log: slog::Logger, + service_name: ServiceName, + ) -> Result<SocketAddrV6, anyhow::Error> { + let addrs = self.dns_lookup_all(log, service_name).await?; + ensure!( + !addrs.is_empty(), + "expected at least one address from successful DNS lookup for {:?}", + service_name + ); + + // The caller is going to pick one of these addresses to connect to. + // Let's try to pick one that's at least not obviously broken by + // attempting to connect to whatever we found and returning any that we + // successfully connected to. It'd be nice if we could return the + // socket directly, but our callers are creating reqwest clients that + // cannot easily consume a socket directly. + // + // This approach scales poorly and there are many failure modes that + // this does not cover. But in the absence of better connection + // management, and with the risks in `omdb` being pretty low, and the + // value of it working pretty high, here we are. This approach should + // not be replicated elsewhere. 
+ async fn try_connect( + sockaddr_v6: SocketAddrV6, + ) -> Result<(), anyhow::Error> { + let _ = TcpSocket::new_v6() + .context("creating socket")? + .connect(SocketAddr::from(sockaddr_v6)) + .await + .with_context(|| format!("connect \"{}\"", sockaddr_v6))?; + Ok(()) + } + + let mut socket_stream = futures::stream::iter(addrs) + .map(async move |sockaddr_v6| { + (sockaddr_v6, try_connect(sockaddr_v6).await) + }) + .buffer_unordered(3); + + while let Some((sockaddr, connect_result)) = socket_stream.next().await + { + match connect_result { + Ok(()) => return Ok(sockaddr), + Err(error) => { + eprintln!( + "warning: failed to connect to {:?} at {}: {:#}", + service_name, sockaddr, error + ); + } + } + } + + Err(anyhow!("failed to connect to any instances of {:?}", service_name)) + } + + async fn dns_resolver( + &self, + log: slog::Logger, + ) -> Result<internal_dns_resolver::Resolver, anyhow::Error> { + match &self.dns_server { + Some(dns_server) => { + internal_dns_resolver::Resolver::new_from_addrs( + log, + &[*dns_server], + ) + .with_context(|| { + format!( + "creating DNS resolver for DNS server {:?}", + dns_server + ) + }) + } + None => { + // In principle, we should look at /etc/resolv.conf to find the + // DNS servers. In practice, this usually isn't populated + // today. See oxidecomputer/omicron#2122. + // + // However, the address selected below should work for most + // existing Omicron deployments today. That's because while the + // base subnet is in principle configurable in config-rss.toml, + // it's very uncommon to change it from the default value used + // here. + // + // Yet another option would be to find a local IP address that + // looks like it's probably on the underlay network and use that + // to find the subnet to use. But again, this is unlikely to be + // wrong and it's easy to override. 
+ let subnet = + Ipv6Subnet::new("fd00:1122:3344:0100::".parse().unwrap()); + eprintln!("note: using DNS server for subnet {}", subnet.net()); + eprintln!( + "note: (if this is not right, use --dns-server \ + to specify an alternate DNS server)", + ); + internal_dns_resolver::Resolver::new_from_subnet(log, subnet) + .with_context(|| { + format!( + "creating DNS resolver for subnet {}", + subnet.net() + ) + }) + } + } + } +} + +#[derive(Debug, Subcommand)] +#[allow(clippy::large_enum_variant)] +enum OmdbCommands { + /// Debug a specific crucible-agent + CrucibleAgent(crucible_agent::CrucibleAgentArgs), + /// Query a specific crucible-pantry + CruciblePantry(crucible_pantry::CruciblePantryArgs), + /// Query the control plane database (CockroachDB) + Db(db::DbArgs), + /// Debug a specific Management Gateway Service instance + Mgs(mgs::MgsArgs), + /// Debug a specific Nexus instance + Nexus(nexus::NexusArgs), + /// Query oximeter collector state + Oximeter(oximeter::OximeterArgs), + /// Enter the Oximeter Query Language shell for interactive querying. 
+ Oxql(oxql::OxqlArgs), + /// Interact with the Reconfigurator system + Reconfigurator(reconfigurator::ReconfiguratorArgs), + /// Debug a specific Sled + SledAgent(sled_agent::SledAgentArgs), +} + +fn parse_dropshot_log_level( + s: &str, +) -> Result { + serde_json::from_str(&format!("{:?}", s)).context("parsing log level") +} diff --git a/dev-tools/omdb/src/bin/omdb/mgs.rs b/dev-tools/omdb/src/mgs.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/mgs.rs rename to dev-tools/omdb/src/mgs.rs diff --git a/dev-tools/omdb/src/bin/omdb/mgs/dashboard.rs b/dev-tools/omdb/src/mgs/dashboard.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/mgs/dashboard.rs rename to dev-tools/omdb/src/mgs/dashboard.rs diff --git a/dev-tools/omdb/src/bin/omdb/mgs/sensors.rs b/dev-tools/omdb/src/mgs/sensors.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/mgs/sensors.rs rename to dev-tools/omdb/src/mgs/sensors.rs diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/nexus.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/nexus.rs rename to dev-tools/omdb/src/nexus.rs diff --git a/dev-tools/omdb/src/bin/omdb/nexus/quiesce.rs b/dev-tools/omdb/src/nexus/quiesce.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/nexus/quiesce.rs rename to dev-tools/omdb/src/nexus/quiesce.rs diff --git a/dev-tools/omdb/src/bin/omdb/nexus/reconfigurator_config.rs b/dev-tools/omdb/src/nexus/reconfigurator_config.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/nexus/reconfigurator_config.rs rename to dev-tools/omdb/src/nexus/reconfigurator_config.rs diff --git a/dev-tools/omdb/src/bin/omdb/nexus/update_status.rs b/dev-tools/omdb/src/nexus/update_status.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/nexus/update_status.rs rename to dev-tools/omdb/src/nexus/update_status.rs diff --git a/dev-tools/omdb/src/bin/omdb/oximeter.rs b/dev-tools/omdb/src/oximeter.rs similarity index 100% rename from 
dev-tools/omdb/src/bin/omdb/oximeter.rs rename to dev-tools/omdb/src/oximeter.rs diff --git a/dev-tools/omdb/src/bin/omdb/oxql.rs b/dev-tools/omdb/src/oxql.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/oxql.rs rename to dev-tools/omdb/src/oxql.rs diff --git a/dev-tools/omdb/src/bin/omdb/reconfigurator.rs b/dev-tools/omdb/src/reconfigurator.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/reconfigurator.rs rename to dev-tools/omdb/src/reconfigurator.rs diff --git a/dev-tools/omdb/src/bin/omdb/sled_agent.rs b/dev-tools/omdb/src/sled_agent.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/sled_agent.rs rename to dev-tools/omdb/src/sled_agent.rs diff --git a/dev-tools/omdb/src/bin/omdb/support_bundle.rs b/dev-tools/omdb/src/support_bundle.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/support_bundle.rs rename to dev-tools/omdb/src/support_bundle.rs diff --git a/nexus/Cargo.toml b/nexus/Cargo.toml index 81d4ed9bbfb..0827f1be1fd 100644 --- a/nexus/Cargo.toml +++ b/nexus/Cargo.toml @@ -132,8 +132,10 @@ nexus-reconfigurator-preparation.workspace = true nexus-reconfigurator-rendezvous.workspace = true nexus-types.workspace = true omicron-common.workspace = true +omicron-omdb.workspace = true omicron-passwords.workspace = true oxide-tokio-rt.workspace = true +sigpipe.workspace = true oximeter.workspace = true oximeter-instruments = { workspace = true, features = ["http-instruments"] } oximeter-producer.workspace = true @@ -199,3 +201,7 @@ harness = false [[bin]] name = "nexus" doc = false + +[[bin]] +name = "omdb-dup" +doc = false diff --git a/nexus/src/bin/omdb-dup.rs b/nexus/src/bin/omdb-dup.rs new file mode 100644 index 00000000000..d2596968b6f --- /dev/null +++ b/nexus/src/bin/omdb-dup.rs @@ -0,0 +1,25 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. 
If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! A copy of omdb's `main.rs`. +//! +//! This is a workaround for the fact that Cargo only lets integration tests use +//! binaries defined in the same crate. We'd like two sets of integration tests +//! against omdb: quicker ones that live in that crate, and slower ones that +//! depend on Nexus and live here. +//! +//! The tests don't have to use omdb as a binary. They could also use it as a +//! library, but doing that properly would require stdout and stderr to be +//! redirected to in-memory buffers. This small binary works around that. + +use clap::Parser; +use omicron_omdb::Omdb; + +fn main() -> Result<(), anyhow::Error> { + sigpipe::reset(); + oxide_tokio_rt::run(async { + let cmd = Omdb::parse(); + cmd.exec().await + }) +} diff --git a/nexus/test-utils-macros/src/lib.rs b/nexus/test-utils-macros/src/lib.rs index 767f09b9d39..96408b12701 100644 --- a/nexus/test-utils-macros/src/lib.rs +++ b/nexus/test-utils-macros/src/lib.rs @@ -130,6 +130,21 @@ pub fn nexus_test(attrs: TokenStream, input: TokenStream) -> TokenStream { #func_ident_string, ) .with_extra_sled_agents(#extra_sled_agents) + .customize_nexus_config(&|config| { + // Set omdb binary path from CARGO_BIN_EXE_omdb-dup if available. + // This env var is set by cargo test/nextest for binaries in the + // same package - but it's only accessible to integration tests + // and benchmarks. + // + // We use option_env!() here (which expands in test code) to + // avoid compile errors during cargo check when the binary + // doesn't exist. If the env var isn't set, we leave the path + // unchanged (it uses a default). 
+ if let Some(omdb_path) = option_env!("CARGO_BIN_EXE_omdb-dup") { + config.pkg.omdb.bin_path = + ::camino::Utf8PathBuf::from(omdb_path); + } + }) .start::<#which_nexus>() .await; #func_ident(&ctx).await; diff --git a/nexus/test-utils/src/starter.rs b/nexus/test-utils/src/starter.rs index cac54381db1..412959d3d63 100644 --- a/nexus/test-utils/src/starter.rs +++ b/nexus/test-utils/src/starter.rs @@ -560,33 +560,6 @@ impl<'a, N: NexusServer> ControlPlaneStarter<'a, N> { .clone(), }; - // Configure the omdb binary path for tests. - // The binary is built by cargo at the workspace root in target//omdb. - // Tests run from the nexus directory, so we need to go up one level. - let workspace_root = std::env::current_dir() - .expect("Failed to get current directory") - .parent() - .expect("Current directory should have a parent") - .to_path_buf(); - let omdb_debug = workspace_root.join("target/debug/omdb"); - let omdb_release = workspace_root.join("target/release/omdb"); - - self.config.pkg.omdb.bin_path = if omdb_release.exists() { - camino::Utf8PathBuf::try_from(omdb_release) - .expect("Failed to convert release path to UTF-8") - } else if omdb_debug.exists() { - camino::Utf8PathBuf::try_from(omdb_debug) - .expect("Failed to convert debug path to UTF-8") - } else { - // omdb hasn't been built yet - use a path that will fail gracefully - // when tests try to use it. - // - // Our rules in ".config/nextest.toml" should prevent this, but this - // acts as a defensive buffer against running without nextest, or - // changing the directory layout. - camino::Utf8PathBuf::from("/nonexistent/omdb") - }; - let nexus_internal = N::start_internal(&self.config, &log).await?; let nexus_internal_addr = nexus_internal.get_http_server_internal_address();