diff --git a/Cargo.lock b/Cargo.lock index 004917e5a80..f2d89bf7868 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8322,6 +8322,7 @@ dependencies = [ "num-integer", "omicron-cockroach-metrics", "omicron-common", + "omicron-omdb", "omicron-passwords", "omicron-rpaths", "omicron-sled-agent", @@ -8372,6 +8373,7 @@ dependencies = [ "serde_urlencoded", "serde_with", "sha2", + "sigpipe", "similar-asserts", "sled-agent-client", "sled-agent-types", diff --git a/dev-tools/omdb/src/bin/omdb/main.rs b/dev-tools/omdb/src/bin/omdb/main.rs index 61e558daa43..44ceec034ce 100644 --- a/dev-tools/omdb/src/bin/omdb/main.rs +++ b/dev-tools/omdb/src/bin/omdb/main.rs @@ -2,311 +2,18 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -//! CLI for debugging Omicron internal state +//! Omicron debugger (omdb) - binary entrypoint //! -//! GROUND RULES: -//! -//! 1. There aren't a lot of ground rules here. At least for now, this is a -//! place to put any kind of runtime tooling for Omicron that seems useful. -//! You can query the database directly (see notes in db.rs), use internal -//! APIs, etc. To the degree that we can stick to stable interfaces, great. -//! But at this stage we'd rather have tools that work on latest than not -//! have them because we couldn't prioritize keeping them stable. -//! -//! 2. Debuggers should never lie! Documentation and command names should be -//! precise about what they're reporting. In a working system, these things -//! might all be the same: -//! -//! - the list of instances with zones and propolis processes running on -//! a sled -//! - the list of instances that sled agent knows about -//! - the list of instances that Nexus or the database reports should be -//! running on a sled -//! -//! But in a broken system, these things might be all different. People use -//! debuggers to understand broken systems. The debugger should say which of -//! 
these it's reporting, rather than "the list of instances on a sled". -//! -//! 3. Where possible, when the tool encounters something unexpected, it should -//! print what it can (including the error message and bad data) and then -//! continue. It generally shouldn't stop on the first error. (We often -//! find strange things when debugging but we need our tools to tell us as -//! much as they can!) +//! This is a small shim over `lib.rs`, and is structured this way so that other +//! crates can depend on omicron-omdb as a library. -use anyhow::Context; -use anyhow::anyhow; -use anyhow::ensure; -use clap::Args; -use clap::ColorChoice; use clap::Parser; -use clap::Subcommand; -use futures::StreamExt; -use internal_dns_types::names::ServiceName; -use omicron_common::address::Ipv6Subnet; -use std::net::SocketAddr; -use std::net::SocketAddrV6; -use tokio::net::TcpSocket; - -mod crucible_agent; -mod crucible_pantry; -mod db; -mod helpers; -mod mgs; -mod nexus; -mod oximeter; -mod oxql; -mod reconfigurator; -mod sled_agent; -mod support_bundle; +use omicron_omdb::Omdb; fn main() -> Result<(), anyhow::Error> { sigpipe::reset(); - oxide_tokio_rt::run(main_impl()) -} - -async fn main_impl() -> Result<(), anyhow::Error> { - let args = Omdb::parse(); - - let log = dropshot::ConfigLogging::StderrTerminal { - level: args.log_level.clone(), - } - .to_logger("omdb") - .context("failed to create logger")?; - - match &args.command { - OmdbCommands::Db(db) => db.run_cmd(&args, &log).await, - OmdbCommands::Mgs(mgs) => mgs.run_cmd(&args, &log).await, - OmdbCommands::Nexus(nexus) => nexus.run_cmd(&args, &log).await, - OmdbCommands::Oximeter(oximeter) => oximeter.run_cmd(&args, &log).await, - OmdbCommands::Oxql(oxql) => oxql.run_cmd(&args, &log).await, - OmdbCommands::Reconfigurator(reconfig) => { - reconfig.run_cmd(&args, &log).await - } - OmdbCommands::SledAgent(sled) => sled.run_cmd(&args, &log).await, - OmdbCommands::CrucibleAgent(crucible) => crucible.run_cmd(&args).await, - 
OmdbCommands::CruciblePantry(crucible) => crucible.run_cmd(&args).await, - } -} - -/// Omicron debugger (unstable) -/// -/// This tool provides commands for directly querying Omicron components about -/// their internal state using internal APIs. This is a prototype. The -/// commands and output are unstable and may change. -#[derive(Debug, Parser)] -struct Omdb { - /// log level filter - #[arg( - env, - long, - value_parser = parse_dropshot_log_level, - default_value = "warn", - global = true, - )] - log_level: dropshot::ConfigLoggingLevel, - - #[arg( - long, - env = "OMDB_DNS_SERVER", - global = true, - help_heading = helpers::CONNECTION_OPTIONS_HEADING, - )] - dns_server: Option, - - /// Allow potentially-destructive subcommands. - #[arg( - short = 'w', - long = "destructive", - global = true, - help_heading = helpers::SAFETY_OPTIONS_HEADING, - )] - allow_destructive: bool, - - #[command(flatten)] - output: OutputOpts, - - #[command(subcommand)] - command: OmdbCommands, -} - -#[derive(Debug, Args)] -struct OutputOpts { - /// Color output - #[arg(long, global = true, value_enum, default_value_t)] - color: ColorChoice, -} - -mod check_allow_destructive { - /// Zero-size type that potentially-destructive functions can accept to - /// ensure `Omdb::check_allow_destructive` has been called. - // This is tucked away inside a module to prevent it from being constructed - // by anything other than `Omdb::check_allow_destructive`. - #[must_use] - pub(crate) struct DestructiveOperationToken(()); - - impl super::Omdb { - pub(crate) fn check_allow_destructive( - &self, - ) -> anyhow::Result { - anyhow::ensure!( - self.allow_destructive, - "This command is potentially destructive. \ - Pass the `-w` / `--destructive` flag to allow it." 
- ); - Ok(DestructiveOperationToken(())) - } - } -} - -impl Omdb { - /// Return the socket addresses of all instances of a service in DNS - async fn dns_lookup_all( - &self, - log: slog::Logger, - service_name: ServiceName, - ) -> Result, anyhow::Error> { - let resolver = self.dns_resolver(log).await?; - resolver - .lookup_all_socket_v6(service_name) - .await - .with_context(|| format!("looking up {:?} in DNS", service_name)) - } - - /// Return the socket address of one instance of a service that we can at - /// least successfully connect to - async fn dns_lookup_one( - &self, - log: slog::Logger, - service_name: ServiceName, - ) -> Result { - let addrs = self.dns_lookup_all(log, service_name).await?; - ensure!( - !addrs.is_empty(), - "expected at least one address from successful DNS lookup for {:?}", - service_name - ); - - // The caller is going to pick one of these addresses to connect to. - // Let's try to pick one that's at least not obviously broken by - // attempting to connect to whatever we found and returning any that we - // successfully connected to. It'd be nice if we could return the - // socket directly, but our callers are creating reqwest clients that - // cannot easily consume a socket directly. - // - // This approach scales poorly and there are many failure modes that - // this does not cover. But in the absence of better connection - // management, and with the risks in `omdb` being pretty low, and the - // value of it working pretty high, here we are. This approach should - // not be replicated elsewhere. - async fn try_connect( - sockaddr_v6: SocketAddrV6, - ) -> Result<(), anyhow::Error> { - let _ = TcpSocket::new_v6() - .context("creating socket")? 
- .connect(SocketAddr::from(sockaddr_v6)) - .await - .with_context(|| format!("connect \"{}\"", sockaddr_v6))?; - Ok(()) - } - - let mut socket_stream = futures::stream::iter(addrs) - .map(async move |sockaddr_v6| { - (sockaddr_v6, try_connect(sockaddr_v6).await) - }) - .buffer_unordered(3); - - while let Some((sockaddr, connect_result)) = socket_stream.next().await - { - match connect_result { - Ok(()) => return Ok(sockaddr), - Err(error) => { - eprintln!( - "warning: failed to connect to {:?} at {}: {:#}", - service_name, sockaddr, error - ); - } - } - } - - Err(anyhow!("failed to connect to any instances of {:?}", service_name)) - } - - async fn dns_resolver( - &self, - log: slog::Logger, - ) -> Result { - match &self.dns_server { - Some(dns_server) => { - internal_dns_resolver::Resolver::new_from_addrs( - log, - &[*dns_server], - ) - .with_context(|| { - format!( - "creating DNS resolver for DNS server {:?}", - dns_server - ) - }) - } - None => { - // In principle, we should look at /etc/resolv.conf to find the - // DNS servers. In practice, this usually isn't populated - // today. See oxidecomputer/omicron#2122. - // - // However, the address selected below should work for most - // existing Omicron deployments today. That's because while the - // base subnet is in principle configurable in config-rss.toml, - // it's very uncommon to change it from the default value used - // here. - // - // Yet another option would be to find a local IP address that - // looks like it's probably on the underlay network and use that - // to find the subnet to use. But again, this is unlikely to be - // wrong and it's easy to override. 
- let subnet = - Ipv6Subnet::new("fd00:1122:3344:0100::".parse().unwrap()); - eprintln!("note: using DNS server for subnet {}", subnet.net()); - eprintln!( - "note: (if this is not right, use --dns-server \ - to specify an alternate DNS server)", - ); - internal_dns_resolver::Resolver::new_from_subnet(log, subnet) - .with_context(|| { - format!( - "creating DNS resolver for subnet {}", - subnet.net() - ) - }) - } - } - } -} - -#[derive(Debug, Subcommand)] -#[allow(clippy::large_enum_variant)] -enum OmdbCommands { - /// Debug a specific crucible-agent - CrucibleAgent(crucible_agent::CrucibleAgentArgs), - /// Query a specific crucible-pantry - CruciblePantry(crucible_pantry::CruciblePantryArgs), - /// Query the control plane database (CockroachDB) - Db(db::DbArgs), - /// Debug a specific Management Gateway Service instance - Mgs(mgs::MgsArgs), - /// Debug a specific Nexus instance - Nexus(nexus::NexusArgs), - /// Query oximeter collector state - Oximeter(oximeter::OximeterArgs), - /// Enter the Oximeter Query Language shell for interactive querying. 
- Oxql(oxql::OxqlArgs), - /// Interact with the Reconfigurator system - Reconfigurator(reconfigurator::ReconfiguratorArgs), - /// Debug a specific Sled - SledAgent(sled_agent::SledAgentArgs), -} - -fn parse_dropshot_log_level( - s: &str, -) -> Result { - serde_json::from_str(&format!("{:?}", s)).context("parsing log level") + oxide_tokio_rt::run(async { + let cmd = Omdb::parse(); + cmd.exec().await + }) } diff --git a/dev-tools/omdb/src/bin/omdb/crucible_agent.rs b/dev-tools/omdb/src/crucible_agent.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/crucible_agent.rs rename to dev-tools/omdb/src/crucible_agent.rs diff --git a/dev-tools/omdb/src/bin/omdb/crucible_pantry.rs b/dev-tools/omdb/src/crucible_pantry.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/crucible_pantry.rs rename to dev-tools/omdb/src/crucible_pantry.rs diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/db.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/db.rs rename to dev-tools/omdb/src/db.rs diff --git a/dev-tools/omdb/src/bin/omdb/db/alert.rs b/dev-tools/omdb/src/db/alert.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/db/alert.rs rename to dev-tools/omdb/src/db/alert.rs diff --git a/dev-tools/omdb/src/bin/omdb/db/blueprints.rs b/dev-tools/omdb/src/db/blueprints.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/db/blueprints.rs rename to dev-tools/omdb/src/db/blueprints.rs diff --git a/dev-tools/omdb/src/bin/omdb/db/db_metadata.rs b/dev-tools/omdb/src/db/db_metadata.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/db/db_metadata.rs rename to dev-tools/omdb/src/db/db_metadata.rs diff --git a/dev-tools/omdb/src/bin/omdb/db/ereport.rs b/dev-tools/omdb/src/db/ereport.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/db/ereport.rs rename to dev-tools/omdb/src/db/ereport.rs diff --git a/dev-tools/omdb/src/bin/omdb/db/saga.rs b/dev-tools/omdb/src/db/saga.rs similarity index 
100% rename from dev-tools/omdb/src/bin/omdb/db/saga.rs rename to dev-tools/omdb/src/db/saga.rs diff --git a/dev-tools/omdb/src/bin/omdb/db/sitrep.rs b/dev-tools/omdb/src/db/sitrep.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/db/sitrep.rs rename to dev-tools/omdb/src/db/sitrep.rs diff --git a/dev-tools/omdb/src/bin/omdb/db/user_data_export.rs b/dev-tools/omdb/src/db/user_data_export.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/db/user_data_export.rs rename to dev-tools/omdb/src/db/user_data_export.rs diff --git a/dev-tools/omdb/src/bin/omdb/db/whatis.rs b/dev-tools/omdb/src/db/whatis.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/db/whatis.rs rename to dev-tools/omdb/src/db/whatis.rs diff --git a/dev-tools/omdb/src/bin/omdb/helpers.rs b/dev-tools/omdb/src/helpers.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/helpers.rs rename to dev-tools/omdb/src/helpers.rs diff --git a/dev-tools/omdb/src/lib.rs b/dev-tools/omdb/src/lib.rs new file mode 100644 index 00000000000..6a4a84c904b --- /dev/null +++ b/dev-tools/omdb/src/lib.rs @@ -0,0 +1,317 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Omicron debugger (omdb) - library interface +//! +//! This module exposes omdb's CLI functionality as a library, allowing other +//! crates to create their own omdb binaries. +//! +//! GROUND RULES: +//! +//! 1. There aren't a lot of ground rules here. At least for now, this is a +//! place to put any kind of runtime tooling for Omicron that seems useful. +//! You can query the database directly (see notes in db.rs), use internal +//! APIs, etc. To the degree that we can stick to stable interfaces, great. +//! But at this stage we'd rather have tools that work on latest than not +//! have them because we couldn't prioritize keeping them stable. 
+//! +//! 2. Debuggers should never lie! Documentation and command names should be +//! precise about what they're reporting. In a working system, these things +//! might all be the same: +//! +//! - the list of instances with zones and propolis processes running on +//! a sled +//! - the list of instances that sled agent knows about +//! - the list of instances that Nexus or the database reports should be +//! running on a sled +//! +//! But in a broken system, these things might be all different. People use +//! debuggers to understand broken systems. The debugger should say which of +//! these it's reporting, rather than "the list of instances on a sled". +//! +//! 3. Where possible, when the tool encounters something unexpected, it should +//! print what it can (including the error message and bad data) and then +//! continue. It generally shouldn't stop on the first error. (We often +//! find strange things when debugging but we need our tools to tell us as +//! much as they can!) + +use anyhow::Context; +use anyhow::anyhow; +use anyhow::ensure; +use clap::Args; +use clap::ColorChoice; +use clap::Parser; +use clap::Subcommand; +use futures::StreamExt; +use internal_dns_types::names::ServiceName; +use omicron_common::address::Ipv6Subnet; +use std::net::SocketAddr; +use std::net::SocketAddrV6; +use tokio::net::TcpSocket; + +mod crucible_agent; +mod crucible_pantry; +mod db; +mod helpers; +mod mgs; +mod nexus; +mod oximeter; +mod oxql; +mod reconfigurator; +mod sled_agent; +mod support_bundle; + +/// Omicron debugger (unstable) +/// +/// This tool provides commands for directly querying Omicron components about +/// their internal state using internal APIs. This is a prototype. The +/// commands and output are unstable and may change. 
+#[derive(Debug, Parser)] +pub struct Omdb { + /// log level filter + #[arg( + env, + long, + value_parser = parse_dropshot_log_level, + default_value = "warn", + global = true, + )] + log_level: dropshot::ConfigLoggingLevel, + + #[arg( + long, + env = "OMDB_DNS_SERVER", + global = true, + help_heading = helpers::CONNECTION_OPTIONS_HEADING, + )] + dns_server: Option, + + /// Allow potentially-destructive subcommands. + #[arg( + short = 'w', + long = "destructive", + global = true, + help_heading = helpers::SAFETY_OPTIONS_HEADING, + )] + allow_destructive: bool, + + #[command(flatten)] + output: OutputOpts, + + #[command(subcommand)] + command: OmdbCommands, +} + +impl Omdb { + /// Execute the omdb command. + pub async fn exec(self) -> Result<(), anyhow::Error> { + let log = dropshot::ConfigLogging::StderrTerminal { + level: self.log_level.clone(), + } + .to_logger("omdb") + .context("failed to create logger")?; + + match &self.command { + OmdbCommands::Db(db) => db.run_cmd(&self, &log).await, + OmdbCommands::Mgs(mgs) => mgs.run_cmd(&self, &log).await, + OmdbCommands::Nexus(nexus) => nexus.run_cmd(&self, &log).await, + OmdbCommands::Oximeter(oximeter) => { + oximeter.run_cmd(&self, &log).await + } + OmdbCommands::Oxql(oxql) => oxql.run_cmd(&self, &log).await, + OmdbCommands::Reconfigurator(reconfig) => { + reconfig.run_cmd(&self, &log).await + } + OmdbCommands::SledAgent(sled) => sled.run_cmd(&self, &log).await, + OmdbCommands::CrucibleAgent(crucible) => { + crucible.run_cmd(&self).await + } + OmdbCommands::CruciblePantry(crucible) => { + crucible.run_cmd(&self).await + } + } + } +} + +#[derive(Debug, Args)] +struct OutputOpts { + /// Color output + #[arg(long, global = true, value_enum, default_value_t)] + color: ColorChoice, +} + +mod check_allow_destructive { + /// Zero-size type that potentially-destructive functions can accept to + /// ensure `Omdb::check_allow_destructive` has been called. 
+ // This is tucked away inside a module to prevent it from being constructed + // by anything other than `Omdb::check_allow_destructive`. + #[must_use] + pub(crate) struct DestructiveOperationToken(()); + + impl super::Omdb { + pub(crate) fn check_allow_destructive( + &self, + ) -> anyhow::Result { + anyhow::ensure!( + self.allow_destructive, + "This command is potentially destructive. \ + Pass the `-w` / `--destructive` flag to allow it." + ); + Ok(DestructiveOperationToken(())) + } + } +} + +impl Omdb { + /// Return the socket addresses of all instances of a service in DNS + async fn dns_lookup_all( + &self, + log: slog::Logger, + service_name: ServiceName, + ) -> Result, anyhow::Error> { + let resolver = self.dns_resolver(log).await?; + resolver + .lookup_all_socket_v6(service_name) + .await + .with_context(|| format!("looking up {:?} in DNS", service_name)) + } + + /// Return the socket address of one instance of a service that we can at + /// least successfully connect to + async fn dns_lookup_one( + &self, + log: slog::Logger, + service_name: ServiceName, + ) -> Result { + let addrs = self.dns_lookup_all(log, service_name).await?; + ensure!( + !addrs.is_empty(), + "expected at least one address from successful DNS lookup for {:?}", + service_name + ); + + // The caller is going to pick one of these addresses to connect to. + // Let's try to pick one that's at least not obviously broken by + // attempting to connect to whatever we found and returning any that we + // successfully connected to. It'd be nice if we could return the + // socket directly, but our callers are creating reqwest clients that + // cannot easily consume a socket directly. + // + // This approach scales poorly and there are many failure modes that + // this does not cover. But in the absence of better connection + // management, and with the risks in `omdb` being pretty low, and the + // value of it working pretty high, here we are. This approach should + // not be replicated elsewhere. 
+ async fn try_connect( + sockaddr_v6: SocketAddrV6, + ) -> Result<(), anyhow::Error> { + let _ = TcpSocket::new_v6() + .context("creating socket")? + .connect(SocketAddr::from(sockaddr_v6)) + .await + .with_context(|| format!("connect \"{}\"", sockaddr_v6))?; + Ok(()) + } + + let mut socket_stream = futures::stream::iter(addrs) + .map(async move |sockaddr_v6| { + (sockaddr_v6, try_connect(sockaddr_v6).await) + }) + .buffer_unordered(3); + + while let Some((sockaddr, connect_result)) = socket_stream.next().await + { + match connect_result { + Ok(()) => return Ok(sockaddr), + Err(error) => { + eprintln!( + "warning: failed to connect to {:?} at {}: {:#}", + service_name, sockaddr, error + ); + } + } + } + + Err(anyhow!("failed to connect to any instances of {:?}", service_name)) + } + + async fn dns_resolver( + &self, + log: slog::Logger, + ) -> Result { + match &self.dns_server { + Some(dns_server) => { + internal_dns_resolver::Resolver::new_from_addrs( + log, + &[*dns_server], + ) + .with_context(|| { + format!( + "creating DNS resolver for DNS server {:?}", + dns_server + ) + }) + } + None => { + // In principle, we should look at /etc/resolv.conf to find the + // DNS servers. In practice, this usually isn't populated + // today. See oxidecomputer/omicron#2122. + // + // However, the address selected below should work for most + // existing Omicron deployments today. That's because while the + // base subnet is in principle configurable in config-rss.toml, + // it's very uncommon to change it from the default value used + // here. + // + // Yet another option would be to find a local IP address that + // looks like it's probably on the underlay network and use that + // to find the subnet to use. But again, this is unlikely to be + // wrong and it's easy to override. 
+ let subnet = + Ipv6Subnet::new("fd00:1122:3344:0100::".parse().unwrap()); + eprintln!("note: using DNS server for subnet {}", subnet.net()); + eprintln!( + "note: (if this is not right, use --dns-server \ + to specify an alternate DNS server)", + ); + internal_dns_resolver::Resolver::new_from_subnet(log, subnet) + .with_context(|| { + format!( + "creating DNS resolver for subnet {}", + subnet.net() + ) + }) + } + } + } +} + +#[derive(Debug, Subcommand)] +#[allow(clippy::large_enum_variant)] +enum OmdbCommands { + /// Debug a specific crucible-agent + CrucibleAgent(crucible_agent::CrucibleAgentArgs), + /// Query a specific crucible-pantry + CruciblePantry(crucible_pantry::CruciblePantryArgs), + /// Query the control plane database (CockroachDB) + Db(db::DbArgs), + /// Debug a specific Management Gateway Service instance + Mgs(mgs::MgsArgs), + /// Debug a specific Nexus instance + Nexus(nexus::NexusArgs), + /// Query oximeter collector state + Oximeter(oximeter::OximeterArgs), + /// Enter the Oximeter Query Language shell for interactive querying. 
+ Oxql(oxql::OxqlArgs), + /// Interact with the Reconfigurator system + Reconfigurator(reconfigurator::ReconfiguratorArgs), + /// Debug a specific Sled + SledAgent(sled_agent::SledAgentArgs), +} + +fn parse_dropshot_log_level( + s: &str, +) -> Result { + serde_json::from_str(&format!("{:?}", s)).context("parsing log level") +} diff --git a/dev-tools/omdb/src/bin/omdb/mgs.rs b/dev-tools/omdb/src/mgs.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/mgs.rs rename to dev-tools/omdb/src/mgs.rs diff --git a/dev-tools/omdb/src/bin/omdb/mgs/dashboard.rs b/dev-tools/omdb/src/mgs/dashboard.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/mgs/dashboard.rs rename to dev-tools/omdb/src/mgs/dashboard.rs diff --git a/dev-tools/omdb/src/bin/omdb/mgs/sensors.rs b/dev-tools/omdb/src/mgs/sensors.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/mgs/sensors.rs rename to dev-tools/omdb/src/mgs/sensors.rs diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/nexus.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/nexus.rs rename to dev-tools/omdb/src/nexus.rs diff --git a/dev-tools/omdb/src/bin/omdb/nexus/quiesce.rs b/dev-tools/omdb/src/nexus/quiesce.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/nexus/quiesce.rs rename to dev-tools/omdb/src/nexus/quiesce.rs diff --git a/dev-tools/omdb/src/bin/omdb/nexus/reconfigurator_config.rs b/dev-tools/omdb/src/nexus/reconfigurator_config.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/nexus/reconfigurator_config.rs rename to dev-tools/omdb/src/nexus/reconfigurator_config.rs diff --git a/dev-tools/omdb/src/bin/omdb/nexus/update_status.rs b/dev-tools/omdb/src/nexus/update_status.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/nexus/update_status.rs rename to dev-tools/omdb/src/nexus/update_status.rs diff --git a/dev-tools/omdb/src/bin/omdb/oximeter.rs b/dev-tools/omdb/src/oximeter.rs similarity index 100% rename from 
dev-tools/omdb/src/bin/omdb/oximeter.rs rename to dev-tools/omdb/src/oximeter.rs diff --git a/dev-tools/omdb/src/bin/omdb/oxql.rs b/dev-tools/omdb/src/oxql.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/oxql.rs rename to dev-tools/omdb/src/oxql.rs diff --git a/dev-tools/omdb/src/bin/omdb/reconfigurator.rs b/dev-tools/omdb/src/reconfigurator.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/reconfigurator.rs rename to dev-tools/omdb/src/reconfigurator.rs diff --git a/dev-tools/omdb/src/bin/omdb/sled_agent.rs b/dev-tools/omdb/src/sled_agent.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/sled_agent.rs rename to dev-tools/omdb/src/sled_agent.rs diff --git a/dev-tools/omdb/src/bin/omdb/support_bundle.rs b/dev-tools/omdb/src/support_bundle.rs similarity index 100% rename from dev-tools/omdb/src/bin/omdb/support_bundle.rs rename to dev-tools/omdb/src/support_bundle.rs diff --git a/nexus/Cargo.toml b/nexus/Cargo.toml index 81d4ed9bbfb..0827f1be1fd 100644 --- a/nexus/Cargo.toml +++ b/nexus/Cargo.toml @@ -132,8 +132,10 @@ nexus-reconfigurator-preparation.workspace = true nexus-reconfigurator-rendezvous.workspace = true nexus-types.workspace = true omicron-common.workspace = true +omicron-omdb.workspace = true omicron-passwords.workspace = true oxide-tokio-rt.workspace = true +sigpipe.workspace = true oximeter.workspace = true oximeter-instruments = { workspace = true, features = ["http-instruments"] } oximeter-producer.workspace = true @@ -199,3 +201,7 @@ harness = false [[bin]] name = "nexus" doc = false + +[[bin]] +name = "omdb-dup" +doc = false diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index c56a26970a1..89ee67a7430 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -144,6 +144,7 @@ use nexus_background_task_interface::Activator; use nexus_background_task_interface::BackgroundTasks; use nexus_config::BackgroundTaskConfig; use 
nexus_config::DnsTasksConfig; +use nexus_config::OmdbConfig; use nexus_db_model::DnsGroup; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; @@ -632,6 +633,7 @@ impl BackgroundTasksInitializer { resolver.clone(), config.support_bundle_collector.disable, nexus_id, + args.omdb_config.clone(), ), ), opctx: opctx.child(BTreeMap::new()), @@ -1191,6 +1193,8 @@ pub struct BackgroundTasksData { /// Channel for exposing the latest loaded fault-management sitrep. pub sitrep_load_tx: watch::Sender>>, + /// PATH information for `omdb`, for tasks that want to invoke it directly + pub omdb_config: OmdbConfig, } /// Starts the three DNS-propagation-related background tasks for either diff --git a/nexus/src/app/background/tasks/support_bundle/README.md b/nexus/src/app/background/tasks/support_bundle/README.md index e6a52539afd..8eebf508883 100644 --- a/nexus/src/app/background/tasks/support_bundle/README.md +++ b/nexus/src/app/background/tasks/support_bundle/README.md @@ -64,3 +64,23 @@ contents should be included. expensive operations which might be shared with other steps (e.g., reading from the database, creating and using progenitor clients, etc) consider adding that data to `support_bundle/cache`. + +## Bundle Directory Structure + +The following is the convention for Support Bundle files. It can, and should, +change over time. However, we list it here to make sure data is located +somewhere consistent and predictable. 
+ +(Please keep this list alphabetized) + +* `bundle_id.txt` - UUID of the bundle itself +* `ereports/` - All requested error reports +* `ereports/{part number}-{serial number}/{id}.json` - Individual reports +* `meta/` - Metadata about the bundle +* `meta/trace.json` - Perfetto-formatted trace of the bundle's collection +* `omdb/` - Output from omdb commands +* `rack/{rack id}/sled/{sled id}/` - Sled-specific host OS info +* `reconfigurator_state.json` - A dump of all reconfigurator state +* `sled_info.json` - Mapping of sled identifiers to cubby location +* `sp_task_dumps/` - All SP dumps +* `sp_task_dumps/{SP type}_{SP slot}/dump-{id}.zip` - Individual SP dumps diff --git a/nexus/src/app/background/tasks/support_bundle/collection.rs b/nexus/src/app/background/tasks/support_bundle/collection.rs index 1008c85128f..b042abf72cf 100644 --- a/nexus/src/app/background/tasks/support_bundle/collection.rs +++ b/nexus/src/app/background/tasks/support_bundle/collection.rs @@ -53,9 +53,11 @@ pub struct BundleCollection { request: BundleRequest, bundle: SupportBundle, transfer_chunk_size: NonZeroU64, + omdb_config: nexus_config::OmdbConfig, } impl BundleCollection { + #[allow(clippy::too_many_arguments)] pub fn new( datastore: Arc, resolver: Resolver, @@ -64,6 +66,7 @@ impl BundleCollection { request: BundleRequest, bundle: SupportBundle, transfer_chunk_size: NonZeroU64, + omdb_config: nexus_config::OmdbConfig, ) -> Self { Self { datastore, @@ -73,6 +76,7 @@ impl BundleCollection { request, bundle, transfer_chunk_size, + omdb_config, } } @@ -100,6 +104,10 @@ impl BundleCollection { &self.bundle } + pub fn omdb_config(&self) -> &nexus_config::OmdbConfig { + &self.omdb_config + } + /// Collect the bundle within Nexus, and store it on a target sled. 
pub async fn collect_bundle_and_store_on_sled( self: &Arc, diff --git a/nexus/src/app/background/tasks/support_bundle/steps/mod.rs b/nexus/src/app/background/tasks/support_bundle/steps/mod.rs index d2179c74b8c..cade1943ff0 100644 --- a/nexus/src/app/background/tasks/support_bundle/steps/mod.rs +++ b/nexus/src/app/background/tasks/support_bundle/steps/mod.rs @@ -12,6 +12,7 @@ use nexus_types::internal_api::background::SupportBundleCollectionStep; mod bundle_id; mod ereports; mod host_info; +mod omdb; mod reconfigurator; mod sled_cubby; mod sp_dumps; @@ -85,5 +86,9 @@ pub fn all(cache: &Cache) -> Vec { } }), ), + CollectionStep::new( + SupportBundleCollectionStep::STEP_OMDB, + Box::new(|collection, dir| omdb::collect(collection, dir).boxed()), + ), ] } diff --git a/nexus/src/app/background/tasks/support_bundle/steps/omdb.rs b/nexus/src/app/background/tasks/support_bundle/steps/omdb.rs new file mode 100644 index 00000000000..8d4d6768464 --- /dev/null +++ b/nexus/src/app/background/tasks/support_bundle/steps/omdb.rs @@ -0,0 +1,128 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Collects output from omdb commands + +use crate::app::background::tasks::support_bundle::collection::BundleCollection; +use crate::app::background::tasks::support_bundle::step::CollectionStepOutput; +use camino::Utf8Path; +use tokio::process::Command; + +/// Run an omdb command and write its output to a file within the bundle. +/// +/// This function returns an error if we cannot write to our local filesystem, +/// or cannot run the omdb command at all. However, if the omdb command runs +/// and fails, it returns "Ok()". 
+/// +/// # Arguments +/// * `collection` - The bundle collection context +/// * `dir` - The root directory of the bundle +/// * `args` - The arguments to pass to omdb (e.g., `&["nexus", "background-tasks", "list"]`) +/// * `output_path` - The relative path within the bundle where output should be written +/// (e.g., `"omdb/nexus/background-tasks/list.txt"`) +async fn run_omdb( + collection: &BundleCollection, + dir: &Utf8Path, + args: &[&str], + output_path: &str, +) -> anyhow::Result<()> { + let full_output_path = dir.join(output_path); + + // Create parent directories if they don't exist + if let Some(parent) = full_output_path.parent() { + tokio::fs::create_dir_all(parent).await?; + } + + // Run the omdb command + let omdb_path = &collection.omdb_config().bin_path; + let output = + Command::new(omdb_path).args(args).output().await.map_err(|e| { + anyhow::anyhow!( + "Failed to execute omdb at {:?} with args {:?}: {}", + omdb_path, + args, + e + ) + })?; + + // Format the output + let output_text = if output.status.success() { + String::from_utf8_lossy(&output.stdout).to_string() + } else { + // If the command failed, include both stdout and stderr + format!( + "Command {} failed with exit code: {:?}\n\nSTDOUT:\n{}\n\nSTDERR:\n{}", + args.join(" "), + output.status.code(), + String::from_utf8_lossy(&output.stdout), + String::from_utf8_lossy(&output.stderr) + ) + }; + + tokio::fs::write(full_output_path, output_text).await?; + Ok(()) +} + +/// Collect diagnostic output from various omdb commands. +/// +/// This function runs multiple omdb queries and stores their output in the bundle. +/// To add more omdb queries, simply add another `run_omdb()` call with the +/// appropriate arguments and output path. +pub async fn collect( + collection: &BundleCollection, + dir: &Utf8Path, +) -> anyhow::Result { + // NOTE: We could parallelize these commands, if they take a while. 
+ // + // NOTE: These commands issue queries to "some Nexus", as returned by DNS - + not necessarily our own Nexus. We may want to include queries to + each Nexus instance individually in a future iteration, especially for + "nexus-specific" commands. + + // Run a sequence of omdb commands. If any of these commands fail, we'll + save the stdout and stderr, and proceed to the next one (note that + "run_omdb" does not return an error when the output is not successful). + + run_omdb( + collection, + dir, + &["nexus", "background-tasks", "list"], + "omdb/nexus/background-tasks/list.txt", + ) + .await?; + + run_omdb( + collection, + dir, + &["nexus", "quiesce", "show"], + "omdb/nexus/quiesce/show.txt", + ) + .await?; + + run_omdb( + collection, + dir, + &["nexus", "mgs-updates"], + "omdb/nexus/mgs-updates.txt", + ) + .await?; + + run_omdb( + collection, + dir, + &["nexus", "update-status"], + "omdb/nexus/update-status.txt", + ) + .await?; + + run_omdb( + collection, + dir, + &["db", "saga", "running"], + "omdb/db/saga/running", + ) + .await?; + + Ok(CollectionStepOutput::None) +} diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 887be497a17..9c4227babf2 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -56,6 +56,7 @@ pub struct SupportBundleCollector { resolver: Resolver, disable: bool, nexus_id: OmicronZoneUuid, + omdb_config: nexus_config::OmdbConfig, } impl SupportBundleCollector { @@ -64,8 +65,15 @@ pub fn new( datastore: Arc, resolver: Resolver, disable: bool, nexus_id: OmicronZoneUuid, + omdb_config: nexus_config::OmdbConfig, ) -> Self { - SupportBundleCollector { datastore, resolver, disable, nexus_id } + SupportBundleCollector { + datastore, + resolver, + disable, + nexus_id, + omdb_config, + } } // Tells a sled agent to delete a support bundle @@ -357,6 +365,7 @@ 
impl SupportBundleCollector { request.clone(), bundle.clone(), request.transfer_chunk_size, + self.omdb_config.clone(), )); let authz_bundle = authz_support_bundle_from_id(bundle.id.into()); @@ -490,6 +499,7 @@ mod test { resolver.clone(), false, nexus.id(), + nexus_config::OmdbConfig { bin_path: "/nonexistent/omdb".into() }, ); let report = collector @@ -516,6 +526,7 @@ mod test { resolver.clone(), false, nexus.id(), + nexus_config::OmdbConfig { bin_path: "/nonexistent/omdb".into() }, ); let request = BundleRequest::default(); @@ -823,6 +834,7 @@ mod test { resolver.clone(), false, nexus.id(), + nexus_config::OmdbConfig { bin_path: "/nonexistent/omdb".into() }, ); // The bundle collection should complete successfully. @@ -902,6 +914,7 @@ mod test { resolver.clone(), false, nexus.id(), + nexus_config::OmdbConfig { bin_path: "/nonexistent/omdb".into() }, ); // Collect the bundle @@ -1013,6 +1026,7 @@ mod test { resolver.clone(), false, nexus.id(), + nexus_config::OmdbConfig { bin_path: "/nonexistent/omdb".into() }, ); // The bundle collection should complete successfully. @@ -1121,6 +1135,7 @@ mod test { resolver.clone(), false, nexus.id(), + nexus_config::OmdbConfig { bin_path: "/nonexistent/omdb".into() }, ); // Each time we call "collect_bundle", we collect a SINGLE bundle. 
@@ -1235,6 +1250,7 @@ mod test { resolver.clone(), false, nexus.id(), + nexus_config::OmdbConfig { bin_path: "/nonexistent/omdb".into() }, ); let report = collector @@ -1288,6 +1304,7 @@ mod test { resolver.clone(), false, nexus.id(), + nexus_config::OmdbConfig { bin_path: "/nonexistent/omdb".into() }, ); let mut request = BundleRequest::default(); request.data_selection.insert(BundleData::HostInfo(HashSet::new())); @@ -1387,6 +1404,7 @@ mod test { resolver.clone(), false, nexus.id(), + nexus_config::OmdbConfig { bin_path: "/nonexistent/omdb".into() }, ); let report = collector @@ -1443,6 +1461,7 @@ mod test { resolver.clone(), false, nexus.id(), + nexus_config::OmdbConfig { bin_path: "/nonexistent/omdb".into() }, ); let mut request = BundleRequest::default(); request.data_selection.insert(BundleData::HostInfo(HashSet::new())); @@ -1528,6 +1547,7 @@ mod test { resolver.clone(), false, nexus.id(), + nexus_config::OmdbConfig { bin_path: "/nonexistent/omdb".into() }, ); let mut request = BundleRequest::default(); request.data_selection.insert(BundleData::HostInfo(HashSet::new())); @@ -1612,6 +1632,7 @@ mod test { resolver.clone(), false, nexus.id(), + nexus_config::OmdbConfig { bin_path: "/nonexistent/omdb".into() }, ); // Collect the bundle diff --git a/nexus/src/app/mod.rs b/nexus/src/app/mod.rs index f9a8057958c..42f7750159b 100644 --- a/nexus/src/app/mod.rs +++ b/nexus/src/app/mod.rs @@ -647,6 +647,7 @@ impl Nexus { mgs_updates_tx, blueprint_load_tx, sitrep_load_tx, + omdb_config: task_config.pkg.omdb.clone(), }, ); diff --git a/nexus/src/bin/omdb-dup.rs b/nexus/src/bin/omdb-dup.rs new file mode 100644 index 00000000000..d2596968b6f --- /dev/null +++ b/nexus/src/bin/omdb-dup.rs @@ -0,0 +1,25 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! A copy of omdb's `main.rs`. +//! +//! 
This is a workaround for the fact that Cargo only lets integration tests use +//! binaries defined in the same crate. We'd like two sets of integration tests +//! against omdb: quicker ones that live in that crate, and slower ones that +//! depend on Nexus and live here. +//! +//! The tests don't have to use omdb as a binary. They could also use it as a +//! library, but doing that properly would require stdout and stderr to be +//! redirected to in-memory buffers. This small binary works around that. + +use clap::Parser; +use omicron_omdb::Omdb; + +fn main() -> Result<(), anyhow::Error> { + sigpipe::reset(); + oxide_tokio_rt::run(async { + let cmd = Omdb::parse(); + cmd.exec().await + }) +} diff --git a/nexus/test-utils-macros/src/lib.rs b/nexus/test-utils-macros/src/lib.rs index 767f09b9d39..96408b12701 100644 --- a/nexus/test-utils-macros/src/lib.rs +++ b/nexus/test-utils-macros/src/lib.rs @@ -130,6 +130,21 @@ pub fn nexus_test(attrs: TokenStream, input: TokenStream) -> TokenStream { #func_ident_string, ) .with_extra_sled_agents(#extra_sled_agents) + .customize_nexus_config(&|config| { + // Set omdb binary path from CARGO_BIN_EXE_omdb-dup if available. + // This env var is set by cargo test/nextest for binaries in the + // same package - but it's only accessible to integration tests + // and benchmarks. + // + // We use option_env!() here (which expands in test code) to + // avoid compile errors during cargo check when the binary + // doesn't exist. If the env var isn't set, we leave the path + // unchanged (it uses a default). 
+ if let Some(omdb_path) = option_env!("CARGO_BIN_EXE_omdb-dup") { + config.pkg.omdb.bin_path = + ::camino::Utf8PathBuf::from(omdb_path); + } + }) .start::<#which_nexus>() .await; #func_ident(&ctx).await; diff --git a/nexus/tests/integration_tests/support_bundles.rs b/nexus/tests/integration_tests/support_bundles.rs index 4bd8d1e2258..42e93bc522e 100644 --- a/nexus/tests/integration_tests/support_bundles.rs +++ b/nexus/tests/integration_tests/support_bundles.rs @@ -521,6 +521,10 @@ async fn test_support_bundle_lifecycle(cptestctx: &ControlPlaneTestContext) { step_names.contains(&SupportBundleCollectionStep::STEP_SPAWN_SP_DUMPS), "Should have attempted to list service processors" ); + assert!( + step_names.contains(&SupportBundleCollectionStep::STEP_OMDB), + "Should have run omdb diagnostic commands" + ); let bundle = bundle_get(&client, bundle.id).await.unwrap(); assert_eq!(bundle.state, SupportBundleState::Active); @@ -528,10 +532,68 @@ // Now we should be able to download the bundle let contents = bundle_download(&client, bundle.id).await.unwrap(); let archive = ZipArchive::new(Cursor::new(&contents)).unwrap(); - let mut names = archive.file_names(); + let mut names = archive.file_names().collect::<Vec<_>>(); + names.sort(); + let mut names = names.into_iter().peekable(); + assert_eq!(names.next(), Some("bundle_id.txt")); assert_eq!(names.next(), Some("meta/")); assert_eq!(names.next(), Some("meta/trace.json")); + assert_eq!(names.next(), Some("omdb/")); + + // Collect omdb file names and verify they exist + let mut omdb_files = Vec::new(); + while let Some(name) = names.peek() { + if !name.starts_with("omdb/") { + break; + } + omdb_files.push(*name); + let _ = names.next(); + } + + // Verify we have omdb output files + assert!(!omdb_files.is_empty(), "Should have omdb output files"); + + // Verify that none of the omdb output files contain "error: unrecognized + subcommand". This catches
regressions where omdb's command structure + // changes and our hardcoded commands become invalid. + let mut archive = ZipArchive::new(Cursor::new(&contents)).unwrap(); + let mut files_checked = 0; + for file_name in &omdb_files { + // Skip directories + if file_name.ends_with('/') { + continue; + } + + let mut file = archive + .by_name(file_name) + .unwrap_or_else(|_| panic!("Should be able to open {}", file_name)); + let mut content = String::new(); + std::io::Read::read_to_string(&mut file, &mut content) + .unwrap_or_else(|_| panic!("Should be able to read {}", file_name)); + + files_checked += 1; + + // Validate that the omdb command is valid, even if it can't connect + // to a running Nexus right now. + assert!( + !content.contains("error: unrecognized subcommand"), + "File {} contains 'error: unrecognized subcommand'.\n\ + This indicates the omdb command is invalid. Content:\n{}", + file_name, + content + ); + } + + // Make sure we actually checked at least one omdb output file. + // If this fails, it means the bundle had omdb directories but no actual + // output files, which would be a bug. + assert!( + files_checked > 0, + "Expected to check at least one omdb output file, but found only directories. Files: {:?}", + omdb_files + ); + assert_eq!(names.next(), Some("rack/")); assert!(names.any(|n| n == "sp_task_dumps/")); // There's much more data in the bundle, but validating it isn't the point diff --git a/nexus/types/src/internal_api/background.rs b/nexus/types/src/internal_api/background.rs index dfe008198f9..6661592a192 100644 --- a/nexus/types/src/internal_api/background.rs +++ b/nexus/types/src/internal_api/background.rs @@ -303,6 +303,7 @@ impl SupportBundleCollectionStep { pub const STEP_SPAWN_SP_DUMPS: &'static str = "spawn steps to query all SP dumps"; pub const STEP_SPAWN_SLEDS: &'static str = "spawn steps to query all sleds"; + pub const STEP_OMDB: &'static str = "omdb diagnostic output"; } #[derive(Debug, Deserialize, Serialize, PartialEq, Eq)]