From 0d8b945d127a102d675b0062fe090033dd8d5d78 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 19 Dec 2025 20:10:18 -0800 Subject: [PATCH 1/2] DebugCollector: first class file archival --- Cargo.lock | 2 + sled-agent/config-reconciler/Cargo.toml | 2 + .../file_archiver/execution.rs | 269 +++++ .../file_archiver/filesystem.rs | 132 +++ .../src/debug_collector/file_archiver/mod.rs | 16 + .../debug_collector/file_archiver/planning.rs | 1001 +++++++++++++++++ .../debug_collector/file_archiver/rules.rs | 297 +++++ .../file_archiver/test_helpers.rs | 377 +++++++ .../src/debug_collector/mod.rs | 1 + .../src/debug_collector/worker.rs | 265 ++--- .../test-data/debug-files.txt | 93 ++ 11 files changed, 2253 insertions(+), 202 deletions(-) create mode 100644 sled-agent/config-reconciler/src/debug_collector/file_archiver/execution.rs create mode 100644 sled-agent/config-reconciler/src/debug_collector/file_archiver/filesystem.rs create mode 100644 sled-agent/config-reconciler/src/debug_collector/file_archiver/mod.rs create mode 100644 sled-agent/config-reconciler/src/debug_collector/file_archiver/planning.rs create mode 100644 sled-agent/config-reconciler/src/debug_collector/file_archiver/rules.rs create mode 100644 sled-agent/config-reconciler/src/debug_collector/file_archiver/test_helpers.rs create mode 100644 sled-agent/config-reconciler/test-data/debug-files.txt diff --git a/Cargo.lock b/Cargo.lock index 5114b3bb3ca..4933f564b11 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -12999,6 +12999,7 @@ dependencies = [ "omicron-workspace-hack", "proptest", "rand 0.9.2", + "regex", "schemars 0.8.22", "scopeguard", "serde", @@ -13012,6 +13013,7 @@ dependencies = [ "sled-storage", "slog", "slog-error-chain", + "strum 0.27.2", "test-strategy", "thiserror 2.0.17", "tokio", diff --git a/sled-agent/config-reconciler/Cargo.toml b/sled-agent/config-reconciler/Cargo.toml index d66d9cfcef2..55dd4860630 100644 --- a/sled-agent/config-reconciler/Cargo.toml +++ b/sled-agent/config-reconciler/Cargo.toml @@ -29,6 +29,7 @@ ntp-admin-client.workspace = true omicron-common.workspace = true omicron-uuid-kinds.workspace = true rand.workspace = true +regex.workspace = true serde.workspace = true sha2.workspace = true sled-agent-api.workspace = true @@ -37,6 +38,7 @@ sled-hardware.workspace = true sled-storage.workspace = true slog.workspace = true slog-error-chain.workspace = true +strum.workspace = true thiserror.workspace = true tokio.workspace = true tufaceous-artifact.workspace = true diff --git a/sled-agent/config-reconciler/src/debug_collector/file_archiver/execution.rs b/sled-agent/config-reconciler/src/debug_collector/file_archiver/execution.rs new file mode 100644 index 00000000000..118e7f5fe5d --- /dev/null +++ b/sled-agent/config-reconciler/src/debug_collector/file_archiver/execution.rs @@ -0,0 +1,269 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Execution of file archival +//! +//! As much as possible, behavior should **not** live here, but in the planning +//! module instead so that it can be tested without touching the filesystem. 
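+//!
+//! The entry point here is [`execute_archive_step`].  The planning module's
+//! `ArchivePlan::execute` invokes it once per planned step, roughly like this
+//! (a sketch only: `plan_steps`, `log`, and `lister` are placeholders, and in
+//! the real code per-step errors are logged rather than propagated):
+//!
+//! ```ignore
+//! for step in plan_steps {
+//!     if let Ok(step) = step {
+//!         execute_archive_step(&log, step, lister).await?;
+//!     }
+//! }
+//! ```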
+ +use super::filesystem::FileLister; +use super::planning::ArchiveStep; +use anyhow::Context; +use camino::Utf8Path; +use slog::debug; + +pub(crate) async fn execute_archive_step<'a>( + log: &slog::Logger, + step: ArchiveStep<'a>, + lister: &'a (dyn FileLister + Send + Sync), +) -> Result<(), anyhow::Error> { + match step { + ArchiveStep::Mkdir { output_directory } => { + // We assume that the parent of all output directories + // already exists. That's because in practice it should be + // true: all of the output directories are one level below + // the debug dataset itself. (The test suite verifies + // this.) So if we find at runtime that this isn't true, + // that's a bad sign. Maybe somebody has unmounted the + // debug dataset and deleted its mountpoint? We don't want + // to start spewing stuff to the wrong place. That's why we + // don't use create_dir_all() here. + debug!( + log, + "create directory"; + "directory" => %output_directory + ); + tokio::fs::create_dir(&output_directory) + .await + .or_else(|error| { + if error.kind() == std::io::ErrorKind::AlreadyExists { + Ok(()) + } else { + Err(error) + } + }) + .with_context(|| format!("mkdir {output_directory:?}")) + } + ArchiveStep::ArchiveFile(archive_file) => { + match archive_file.choose_filename(lister) { + Err(error) => Err(error), + Ok(output_filename) => { + let input_path = &archive_file.input_path; + let output_path = archive_file + .output_directory + .join(output_filename.as_ref()); + debug!( + log, + "archive file"; + "input_path" => %input_path, + "output_path" => %output_path, + "delete_original" => + archive_file.delete_original, + ); + archive_one( + &input_path, + &output_path, + archive_file.delete_original, + ) + .await + .with_context(|| { + format!("archive {input_path:?} to {output_path:?}") + }) + } + } + } + } +} + +async fn archive_one( + source: &Utf8Path, + dest: &Utf8Path, + delete_original: bool, +) -> tokio::io::Result<()> { + let mut dest_f = tokio::fs::File::create(&dest).await?; + let mut src_f = tokio::fs::File::open(&source).await?; + + tokio::io::copy(&mut src_f, &mut dest_f).await?; + + dest_f.sync_all().await?; + + drop(src_f); + drop(dest_f); + + if delete_original { + tokio::fs::remove_file(source).await?; + } + + Ok(()) +} + +#[cfg(test)] +mod test { + use crate::debug_collector::file_archiver; + use anyhow::Context; + use camino::Utf8Path; + use camino_tempfile::Utf8TempDir; + use chrono::DateTime; + use chrono::Utc; + use file_archiver::planning::ArchiveKind; + use file_archiver::planning::ArchivePlanner; + use omicron_test_utils::dev::test_setup_log; + use slog::info; + + #[tokio::test] + async fn test_real_archival() { + // Set up the test. + let logctx = test_setup_log("test_archiving_basic"); + let log = &logctx.log; + + // Create a temporary directory in which to store some output files. + let tempdir = Utf8TempDir::new().unwrap(); + info!(log, "temporary directory"; "tempdir" => %tempdir.path()); + + // Populate it with a couple of files. + // + // Note that all of the interesting cases around generating archive + // steps are covered elsewhere. We really only need to smoke check + // basic behavior here. 
+ let outdir = tempdir.path().join("out"); + let zone_name = "an-example-zone"; + let zone_root = tempdir.path().join(zone_name); + let logdir = zone_root.join("var/svc/log"); + let file1_live = logdir.join("svc1.log"); + let file2_rotated = logdir.join("svc1.log.0"); + let file3_rotated = logdir.join("svc2.log.0"); + let coredir = tempdir.path().join("crash"); + let file4_core = coredir.join("core.123"); + + let populate_input = |contents: &str| { + std::fs::create_dir_all(&logdir).unwrap(); + std::fs::create_dir_all(&coredir).unwrap(); + for file in + [&file1_live, &file2_rotated, &file3_rotated, &file4_core] + { + let contents = + format!("{}-{contents}", file.file_name().unwrap()); + std::fs::write(&file, contents).unwrap(); + } + }; + + populate_input("first"); + + // Compute the expected filenames. These depend on the mtimes that the + // files wound up with. + let expected_filename = |base: &str, input: &Utf8Path| { + let found_mtime = input.metadata().unwrap().modified().unwrap(); + let mtime: DateTime = DateTime::from(found_mtime); + format!("{base}{}", mtime.timestamp()) + }; + let file1_expected = expected_filename("svc1.", &file1_live); + let file2_expected = expected_filename("svc1.log.", &file2_rotated); + let file3_expected = expected_filename("svc2.log.", &file3_rotated); + + // Run a complete archive. + std::fs::create_dir(&outdir).unwrap(); + let mut planner = ArchivePlanner::new(log, ArchiveKind::Final, &outdir); + planner.include_cores_directory(&coredir); + planner.include_zone(zone_name, &zone_root); + let () = planner.execute().await.expect("successful execution"); + + // Check each of the output log files. This is a little annoying + // because we don't necessarily know what names they were given, since + // it depends on the mtime on the input file. + let verify_logs = |unchanged| { + for (input_path, expected_filename, deleted_original) in [ + (&file1_live, &file1_expected, false), + (&file2_rotated, &file2_expected, true), + (&file3_rotated, &file3_expected, true), + ] { + let expected_path = + outdir.join(zone_name).join(expected_filename); + let contents = std::fs::read_to_string(&expected_path) + .with_context(|| { + format!("read expected output file {expected_path:?}") + }) + .unwrap(); + assert!(contents.starts_with(input_path.file_name().unwrap())); + assert!(contents.ends_with("-first")); + + if deleted_original { + // Check that the original file is gone. + assert!(!input_path.exists()); + } else { + // The input file should exist. It may or may not match + // what it originally did, depending on what the caller + // says. + let input_contents = std::fs::read_to_string(&input_path) + .with_context(|| { + format!("read expected intput file {input_path:?}") + }) + .unwrap(); + if unchanged { + assert_eq!(contents, input_contents); + } + } + } + }; + + verify_logs(true); + + // Check the output core file, too. + let file4_output = outdir.join("core.123"); + let contents = std::fs::read_to_string(&file4_output) + .with_context(|| { + format!("read expected output file {file4_output:?}") + }) + .unwrap(); + assert_eq!(contents, "core.123-first"); + assert!(!file4_core.exists()); + + // Now, check the behavior for file collisions. + // + // First, re-populate the input tree, but with new data so that we can + // tell when things have been clobbered. + populate_input("second"); + + // Run another archive. 
+ let mut planner = ArchivePlanner::new(log, ArchiveKind::Final, &outdir); + planner.include_cores_directory(&coredir); + planner.include_zone(zone_name, &zone_root); + let () = planner.execute().await.expect("successful execution"); + + // The previously archived log file should still exist, still have the + // same (original) contents, and the input files should be gone again. + verify_logs(false); + + // There should now be new versions of the three log files that contain + // the new contents. + for result in outdir.join(zone_name).read_dir_utf8().unwrap() { + let entry = result.unwrap(); + let contents = std::fs::read_to_string(&entry.path()) + .with_context(|| { + format!("read expected intput file {:?}", entry.path()) + }) + .unwrap(); + + if entry.file_name() == &file1_expected + || entry.file_name() == &file2_expected + || entry.file_name() == &file3_expected + { + assert!(contents.ends_with("-first")); + } else { + assert!(contents.ends_with("-second")); + } + } + + // The core file should have been completely overwritten with new + // contents. + assert!(!file4_core.exists()); + let contents = std::fs::read_to_string(&file4_output) + .with_context(|| { + format!("read expected output file {file4_output:?}") + }) + .unwrap(); + assert_eq!(contents, "core.123-second"); + + logctx.cleanup_successful(); + } +} diff --git a/sled-agent/config-reconciler/src/debug_collector/file_archiver/filesystem.rs b/sled-agent/config-reconciler/src/debug_collector/file_archiver/filesystem.rs new file mode 100644 index 00000000000..fbc1c26f311 --- /dev/null +++ b/sled-agent/config-reconciler/src/debug_collector/file_archiver/filesystem.rs @@ -0,0 +1,132 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use anyhow::Context; +use anyhow::anyhow; +use camino::Utf8Path; +use chrono::DateTime; +use chrono::Utc; +use derive_more::AsRef; +use thiserror::Error; + +/// Describes the final component of a path name (that has no `/` in it) +#[derive(AsRef, Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub(crate) struct Filename(String); + +#[derive(Debug, Error)] +#[error("string is not a valid filename (has slashes or is '.' or '..')")] +pub(crate) struct BadFilename; + +impl TryFrom for Filename { + type Error = BadFilename; + fn try_from(value: String) -> Result { + if value == "." || value == ".." || value.contains('/') { + Err(BadFilename) + } else { + Ok(Filename(value)) + } + } +} + +/// Helper trait used to swap out basic filesystem functionality for testing +pub(crate) trait FileLister { + /// List the files within a directory + /// + /// This should return an empty vec when the directory does not exist, + /// rather than an error. 
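+    ///
+    /// Failures to open the directory or to read an individual entry are
+    /// reported as `Err` elements of the returned `Vec` rather than failing
+    /// the listing as a whole.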
+ fn list_files( + &self, + path: &Utf8Path, + ) -> Vec>; + + /// Return the modification time of a file + fn file_mtime( + &self, + path: &Utf8Path, + ) -> Result>, anyhow::Error>; + + /// Return whether a file exists + fn file_exists(&self, path: &Utf8Path) -> Result; +} + +/// `FileLister` implementation that uses the real filesystem +pub(crate) struct FilesystemLister; +impl FileLister for FilesystemLister { + fn list_files( + &self, + path: &Utf8Path, + ) -> Vec> { + let entry_iter = match path.read_dir_utf8() { + Ok(iter) => iter, + Err(error) => { + if error.kind() == std::io::ErrorKind::NotFound { + // This interface is more useful if we swallow ENOTFOUND + // rather than propagate it since the caller will treat + // this the same as an empty directory. + return vec![]; + } else { + return vec![Err( + anyhow!(error).context("readdir {path:?}") + )]; + } + } + }; + + entry_iter + .map(|entry| { + entry.context("reading directory entry").and_then(|entry| { + // It should be impossible for this `try_from()` to fail, + // but it's easy enough to handle gracefully. + Filename::try_from(entry.file_name().to_owned()) + .with_context(|| { + format!( + "processing as a file name: {:?}", + entry.file_name(), + ) + }) + }) + }) + .collect() + } + + fn file_mtime( + &self, + path: &Utf8Path, + ) -> Result>, anyhow::Error> { + let metadata = path + .symlink_metadata() + .with_context(|| format!("loading metadata for {path:?}"))?; + + Ok(metadata + .modified() + // This `ok()` ignores an error fetching the mtime. We could + // probably just handle it, since it shouldn't come up. But this + // preserves historical behavior. + .ok() + .map(|m| m.into())) + } + + fn file_exists(&self, path: &Utf8Path) -> Result { + path.try_exists() + .with_context(|| format!("checking existence of {path:?}")) + } +} + +#[cfg(test)] +mod test { + use super::Filename; + + #[test] + fn test_filename() { + assert_eq!( + Filename::try_from(String::from("foo")).unwrap().as_ref(), + "foo" + ); + assert!(Filename::try_from(String::from(".")).is_err()); + assert!(Filename::try_from(String::from("..")).is_err()); + assert!(Filename::try_from(String::from("foo/bar")).is_err()); + assert!(Filename::try_from(String::from("foo/")).is_err()); + assert!(Filename::try_from(String::from("/bar")).is_err()); + } +} diff --git a/sled-agent/config-reconciler/src/debug_collector/file_archiver/mod.rs b/sled-agent/config-reconciler/src/debug_collector/file_archiver/mod.rs new file mode 100644 index 00000000000..dcc3f20cfd4 --- /dev/null +++ b/sled-agent/config-reconciler/src/debug_collector/file_archiver/mod.rs @@ -0,0 +1,16 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Configuration and implementation for archiving ordinary files as debug data +//! (e.g., log files) + +mod execution; +mod filesystem; +mod planning; +mod rules; +#[cfg(test)] +mod test_helpers; + +pub use planning::ArchiveKind; +pub use planning::ArchivePlanner; diff --git a/sled-agent/config-reconciler/src/debug_collector/file_archiver/planning.rs b/sled-agent/config-reconciler/src/debug_collector/file_archiver/planning.rs new file mode 100644 index 00000000000..61752b7ab76 --- /dev/null +++ b/sled-agent/config-reconciler/src/debug_collector/file_archiver/planning.rs @@ -0,0 +1,1001 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. 
If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! Configuration and implementation for archiving ordinary files as debug data
+//! (e.g., log files)
+//!
+//! This system is designed so that as much as possible is incorporated into
+//! the plan so that it can be tested in simulation without extensive
+//! dependency injection. See also
+//! [the plan-execute pattern](https://mmapped.blog/posts/29-plan-execute).
+
+use super::execution::execute_archive_step;
+use super::filesystem::FileLister;
+use super::filesystem::Filename;
+use super::filesystem::FilesystemLister;
+use super::rules::ALL_RULES;
+use super::rules::ArchiveGroup;
+use super::rules::NamingRule;
+use super::rules::RuleScope;
+use super::rules::Source;
+use anyhow::Context;
+use anyhow::anyhow;
+use camino::Utf8Path;
+use camino::Utf8PathBuf;
+use chrono::DateTime;
+use chrono::Utc;
+use slog::Logger;
+use slog::debug;
+use slog::o;
+use slog::warn;
+use slog_error_chain::InlineErrorChain;
+
+/// Describes what kind of archive operation this is, which affects what debug
+/// data to collect
+#[derive(Debug, Clone, Copy)]
+pub enum ArchiveKind {
+    /// Periodic archive
+    ///
+    /// Periodic archives include immutable files like core files and rotated
+    /// log files, but they ignore live log files since they're still being
+    /// written to. Those will get picked up in a subsequent periodic archive
+    /// (once rotated) or a final archive for this source.
+    Periodic,
+
+    /// Final archive for this source
+    ///
+    /// The final archive for a given source is our last chance to archive
+    /// debug data from it. The source is also generally at rest (or close to
+    /// it). So this includes everything that a periodic archive includes
+    /// *plus* live log files.
+    Final,
+}
+
+/// Used to configure and execute a file archival operation
+pub struct ArchivePlanner<'a> {
+    log: Logger,
+    what: ArchiveKind,
+    debug_dir: Utf8PathBuf,
+    groups: Vec<ArchiveGroup<'static>>,
+    lister: &'a (dyn FileLister + Send + Sync),
+}
+
+impl ArchivePlanner<'static> {
+    /// Begin an archival operation that will store data into `debug_dir`
+    pub fn new(
+        log: &Logger,
+        what: ArchiveKind,
+        debug_dir: &Utf8Path,
+    ) -> ArchivePlanner<'static> {
+        Self::new_with_lister(log, what, debug_dir, &FilesystemLister)
+    }
+}
+
+impl<'a> ArchivePlanner<'a> {
+    // Used by the tests to inject a custom lister.
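+    //
+    // For example, the planning tests construct a planner roughly like this
+    // (a sketch: `TestLister` lives in `test_helpers` and `files` is the
+    // parsed test data):
+    //
+    //     let lister = TestLister::new_for_test_data(&files);
+    //     let mut planner =
+    //         ArchivePlanner::new_with_lister(log, what, output_dir, &lister);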
+ pub(crate) fn new_with_lister( + log: &Logger, + what: ArchiveKind, + debug_dir: &Utf8Path, + lister: &'a (dyn FileLister + Send + Sync), + ) -> ArchivePlanner<'a> { + let log = log.new(o!( + "component" => "DebugCollectorArchiver", + "debug_dir" => debug_dir.to_string(), + "what" => format!("{what:?}"), + )); + debug!(&log, "planning archival"); + + ArchivePlanner { + log, + what, + debug_dir: debug_dir.to_owned(), + groups: Vec::new(), + lister, + } + } + + /// Configure this archive operation to include debug data from the given + /// illumos zone zone + pub fn include_zone(&mut self, zone_name: &str, zone_root: &Utf8Path) { + debug!( + &self.log, + "archiving debug data from zone"; + "zonename" => zone_name, + "zone_root" => %zone_root, + ); + + let source = Source { + input_prefix: zone_root.to_owned(), + output_prefix: self.debug_dir.join(zone_name), + }; + + let rules = + ALL_RULES.iter().filter(|r| match (&r.rule_scope, &self.what) { + (RuleScope::ZoneAlways, _) => true, + (RuleScope::ZoneMutable, ArchiveKind::Final) => true, + (RuleScope::ZoneMutable, ArchiveKind::Periodic) => false, + (RuleScope::CoresDirectory, _) => false, + }); + + for rule in rules { + self.groups.push(ArchiveGroup { source: source.clone(), rule }); + } + } + + /// Configure this archive operation to include debug data from the given + /// cores directory + pub fn include_cores_directory(&mut self, cores_dir: &Utf8Path) { + debug!( + &self.log, + "archiving debug data from cores directory"; + "cores_dir" => %cores_dir, + ); + + let source = Source { + input_prefix: cores_dir.to_owned(), + output_prefix: self.debug_dir.to_owned(), + }; + + let rules = ALL_RULES.iter().filter(|r| match r.rule_scope { + RuleScope::CoresDirectory => true, + RuleScope::ZoneMutable | RuleScope::ZoneAlways => false, + }); + + for rule in rules { + self.groups.push(ArchiveGroup { source: source.clone(), rule }); + } + } + + /// Returns an `ArchivePlan` that describes more specific steps for + /// archiving the requested debug data + pub fn into_plan(self) -> ArchivePlan<'a> { + ArchivePlan { + log: self.log, + groups: self.groups, + debug_dir: self.debug_dir, + lister: self.lister, + } + } + + /// Generates an `ArchivePlan` and immediately executes it, archiving the + /// requested files + /// + /// Returns a single `anyhow::Error` if there are any problems archiving any + /// files. Details are emitted to the log. (It's assumed that consumers + /// don't generally care to act on detailed failures programmatically, just + /// report them to the log.) + pub async fn execute(self) -> Result<(), anyhow::Error> { + if !self.into_plan().execute().await.is_empty() { + Err(anyhow!("one or more archive steps failed (see logs)")) + } else { + Ok(()) + } + } +} + +/// Describes specific steps to carry out an archive operation +/// +/// Constructed with [`ArchivePlanner`]. 
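+///
+/// A typical archival pass looks roughly like this (a sketch: `log`,
+/// `debug_dir`, `cores_dir`, `zone_root`, and the zone name are placeholders):
+///
+/// ```ignore
+/// let mut planner =
+///     ArchivePlanner::new(&log, ArchiveKind::Periodic, &debug_dir);
+/// planner.include_cores_directory(&cores_dir);
+/// planner.include_zone("oxz_example", &zone_root);
+/// let plan = planner.into_plan();
+/// let errors = plan.execute().await; // per-step errors, already logged
+/// ```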
+pub(crate) struct ArchivePlan<'a> { + log: slog::Logger, + debug_dir: Utf8PathBuf, + groups: Vec>, + lister: &'a (dyn FileLister + Send + Sync), +} + +impl ArchivePlan<'_> { + #[cfg(test)] + pub(crate) fn to_steps( + &self, + ) -> impl Iterator, anyhow::Error>> { + Self::to_steps_generic( + &self.log, + &self.groups, + &self.debug_dir, + self.lister, + ) + } + + pub(crate) fn to_steps_generic<'a>( + log: &Logger, + groups: &'a [ArchiveGroup<'static>], + debug_dir: &'a Utf8Path, + lister: &'a (dyn FileLister + Send + Sync), + ) -> impl Iterator, anyhow::Error>> { + // This gigantic combinator iterates the list of archive steps, which + // consist of: + // + // - an `ArchiveStep::Mkdir` for each output directory we need to create + // - an `ArchiveStep::ArchiveFile` for each file that we need to archive + // (all files matching all the rules that have been applied to the + // input sources). + // + // In fact, each item that we iterate is a `Result`: it's either one of + // these archive steps or its an error that was encountered along the + // way. + // + // Being one big expression makes this annoying to read and modify, but + // it has the useful property that it operates in a streaming way: at no + // point are all of the files in all of the matching directories read + // into memory at once. + groups + .iter() + // Start with a `mkdir` for each of the output directories. + .filter_map(move |group| { + let output_directory = group.output_directory(debug_dir); + if output_directory != debug_dir { + Some(Ok(ArchiveStep::Mkdir { output_directory })) + } else { + None + } + }) + // Chain this with a list of all the files we need to archive. + .chain( + groups + .iter() + .flat_map(move |group| { + // Each group essentially identifies one directory that + // we need to scan for files to archive. For each of + // these, list the files in the directory. + let input_directory = group.input_directory(); + + debug!( + log, + "listing directory"; + "input_directory" => %input_directory + ); + lister.list_files(&input_directory).into_iter().map( + move |item| item.map(|filename| (group, filename)), + ) + }) + .filter(move |entry| match entry { + // Errors are passed to the end of this pipeline. + Err(_) => true, + + // Files that we found in an input directory are checked + // against the corresponding rule to see if we should + // include them. + Ok((group, filename)) => { + debug!( + log, + "checking file"; + "file" => %filename.as_ref(), + ); + group.rule.include_file(&filename) + } + }) + .map(|entry| match entry { + // Errors are passed to the end of this pipeline. + Err(error) => Err(error), + + // If we found a matching file, fetch its metadata and + // grab the mtime. This is used for naming the archived + // file. + Ok((group, filename)) => { + let input_path = + group.input_directory().join(filename.as_ref()); + lister + .file_mtime(&input_path) + .map(|mtime| (group, input_path, mtime)) + } + }) + .map(|entry| match entry { + // Errors are passed to the end of this pipeline. + Err(error) => Err(error), + + // If we succeeded so far, we have a matching input + // file, its mtime and the associated group. Construct + // an archive step describing that we need to archive + // this file. 
+ Ok((group, input_path, mtime)) => { + let output_directory = + group.output_directory(debug_dir); + Ok(ArchiveStep::ArchiveFile(ArchiveFile { + input_path, + mtime, + output_directory, + namer: group.rule.naming, + delete_original: group.rule.delete_original, + #[cfg(test)] + rule: group.rule.label, + })) + } + }), + ) + } + + pub(crate) async fn execute(self) -> Vec { + let mut errors = Vec::new(); + let log = &self.log; + let groups = self.groups; + let debug_dir = self.debug_dir; + let lister = self.lister; + for step in Self::to_steps_generic(log, &groups, &debug_dir, lister) { + let result = match step { + Err(error) => Err(error), + Ok(step) => execute_archive_step(log, step, lister).await, + }; + + if let Err(error) = result { + warn!( + log, + "error during archival"; + InlineErrorChain::new(&*error) + ); + errors.push(error); + } + } + + errors + } +} + +pub(crate) enum ArchiveStep<'a> { + Mkdir { output_directory: Utf8PathBuf }, + ArchiveFile(ArchiveFile<'a>), +} + +#[derive(Clone)] +pub(crate) struct ArchiveFile<'a> { + pub(crate) input_path: Utf8PathBuf, + pub(crate) mtime: Option>, + pub(crate) output_directory: Utf8PathBuf, + pub(crate) namer: &'a (dyn NamingRule + Send + Sync), + pub(crate) delete_original: bool, + #[cfg(test)] + pub(crate) rule: &'static str, +} + +impl ArchiveFile<'_> { + pub(crate) fn choose_filename( + &self, + lister: &dyn FileLister, + ) -> Result { + let file_name: Filename = self + .input_path + .file_name() + .ok_or_else(|| { + // This should be impossible, but it's easy enough to handle + // gracefully. + anyhow!( + "file for archival has no filename: {:?}", + &self.input_path + ) + })? + .to_owned() + .try_into() + .context("file_name() returned a non-Filename")?; + self.namer.archived_file_name( + &file_name, + self.mtime, + lister, + &self.output_directory, + ) + } +} + +#[cfg(test)] +mod test { + use crate::debug_collector::file_archiver; + use camino::Utf8Path; + use chrono::DateTime; + use chrono::Timelike; + use chrono::Utc; + use file_archiver::planning::ArchiveFile; + use file_archiver::planning::ArchiveKind; + use file_archiver::planning::ArchiveStep; + use file_archiver::rules::ALL_RULES; + use file_archiver::rules::MAX_COLLIDING_FILENAMES; + use file_archiver::rules::NameRotatedLogFile; + use file_archiver::test_helpers::*; + use omicron_test_utils::dev::test_setup_log; + use slog::debug; + use slog::info; + use slog_error_chain::InlineErrorChain; + use std::collections::BTreeSet; + + /// Fully tests archive planning with a bunch of real-world file paths + #[test] + fn test_archiving_basic() { + // Set up the test. + let logctx = test_setup_log("test_archiving_basic"); + let log = &logctx.log; + + // Load the test data + let files = load_test_files().unwrap(); + + // Run a simulated archive. + let fake_output_dir = Utf8Path::new("/fake-output-directory"); + let lister = TestLister::new_for_test_data(&files); + let plan = test_archive( + log, + &files, + fake_output_dir, + ArchiveKind::Final, + &lister, + ); + + // Now, walk through the archive plan and verify it. + let mut directories_created = BTreeSet::new(); + let mut unarchived_files = files.clone(); + let mut rules_unused: BTreeSet<_> = + ALL_RULES.iter().map(|r| r.label).collect(); + for step in plan.to_steps() { + let step = step.expect("no errors with test lister"); + + match step { + // For a `mkdir`, verify that the parent directory matches our + // output directory. (For more on why, see the code where we + // process this Mkdir.) Then record it. 
We'll use that to + // verify that files are always archived into directories that + // already exist. + ArchiveStep::Mkdir { output_directory } => { + let parent = output_directory + .parent() + .expect("output directory has a parent"); + if parent != fake_output_dir { + panic!( + "archiver created an output directory \ + ({output_directory:?}) whose parent is not the \ + fake debug directory ({fake_output_dir:?}). \ + This is not currently supported." + ); + } + directories_created.insert(output_directory); + } + + ArchiveStep::ArchiveFile(ArchiveFile { + input_path, + delete_original, + output_directory, + rule, + .. + }) => { + println!("archiving: {input_path}"); + + // Check that we have not already archived this file. + // That would imply that two rules matched the same file, + // which would be a bug in the rule definitions. + let test_file = unarchived_files + .remove(input_path.as_path()) + .unwrap_or_else(|| { + panic!( + "attempted to archive the same file multiple \ + times (or it was not in the test dataset): \ + {input_path:?}", + ); + }); + + // Check that we've correctly determined whether to delete + // the original file when archiving it. This is determined + // by the rule that it matched. We check it here against + // what we expect for each kind of file. + match &test_file.kind { + TestFileKind::KernelCrashDump { .. } + | TestFileKind::ProcessCoreDump { .. } + | TestFileKind::LogSmfRotated { .. } + | TestFileKind::LogSyslogRotated { .. } + | TestFileKind::GlobalLogSmfRotated + | TestFileKind::GlobalLogSyslogRotated + | TestFileKind::Ignored => { + assert!( + delete_original, + "expected to delete original file when \ + archiving file of kind {:?}", + test_file.kind, + ); + } + + TestFileKind::LogSmfLive { .. } + | TestFileKind::LogSyslogLive { .. } + | TestFileKind::GlobalLogSmfLive + | TestFileKind::GlobalLogSyslogLive => { + assert!( + !delete_original, + "expected not to delete original file when \ + archiving file of kind {:?}", + test_file.kind, + ); + } + } + + // The output directory must either match the overall output + // directory or else be one of the directories created by a + // Mkdir that we've already processed. + if output_directory != fake_output_dir + && !directories_created.contains(&output_directory) + { + panic!( + "file was archived into a non-existent \ + directory: {}", + test_file.path + ); + } + + // Mark that we've used this rule. It's not a problem if + // we've already done so. + let _ = rules_unused.remove(rule); + } + }; + } + + if !rules_unused.is_empty() { + panic!( + "one or more rules was not covered by the tests: \ + {rules_unused:?}" + ); + } + + println!("files that were not archived: {}", unarchived_files.len()); + for test_file in unarchived_files { + println!(" {}", test_file.path); + if !matches!(test_file.kind, TestFileKind::Ignored) { + panic!( + "non-ignored test file was not archived: {:?}", + test_file.path + ); + } + } + + logctx.cleanup_successful(); + } + + // Tests that when we archive "immutable-only" files: + // - we do archive the stuff we expect + // - we don't archive the stuff that we don't expect + #[test] + fn test_archiving_immutable_only() { + // Set up the test. + let logctx = test_setup_log("test_archiving_immutable_only"); + let log = &logctx.log; + + // Load the test data + let files = load_test_files().unwrap(); + + // Run a simulated archive. 
+ let fake_output_dir = Utf8Path::new("/fake-output-directory"); + let lister = TestLister::new_for_test_data(&files); + let plan = test_archive( + log, + &files, + fake_output_dir, + ArchiveKind::Periodic, + &lister, + ); + + let mut expected_unarchived: BTreeSet<_> = files + .iter() + .filter_map(|test_file| { + let expected = match test_file.kind { + TestFileKind::KernelCrashDump { .. } + | TestFileKind::ProcessCoreDump { .. } + | TestFileKind::LogSmfRotated { .. } + | TestFileKind::LogSyslogRotated { .. } + | TestFileKind::GlobalLogSmfRotated + | TestFileKind::GlobalLogSyslogRotated => true, + TestFileKind::LogSmfLive { .. } + | TestFileKind::LogSyslogLive { .. } + | TestFileKind::GlobalLogSmfLive + | TestFileKind::GlobalLogSyslogLive + | TestFileKind::Ignored => false, + }; + + expected.then_some(&test_file.path) + }) + .collect(); + + // Check that precisely the expected files were collected. + // We do not check all the other expected behaviors around archiving + // here. That's tested in `test_archive_basic` for all files. + for step in plan.to_steps() { + let step = step.expect("no errors with test lister"); + let ArchiveStep::ArchiveFile(archive_file) = step else { + continue; + }; + + let input_path = archive_file.input_path; + let test_file = files.get(input_path.as_path()).expect( + "unexpectedly archived file that was not in the test data", + ); + if matches!(test_file.kind, TestFileKind::Ignored) { + // We don't care whether "ignored" files get archived or not. + continue; + } + + if !expected_unarchived.remove(&input_path) { + panic!( + "unexpectedly archived file (either it should not have \ + been at all or it was archived more than once): \ + {input_path:?}", + ); + } + + // This is technically checked in the other test, but since it's + // related to the file being immutable, we may as well check it + // again here. + assert!( + archive_file.delete_original, + "expected to delete the original when archiving immutable files" + ); + } + + if !expected_unarchived.is_empty() { + panic!( + "did not archive some of the files we expected: {:?}", + expected_unarchived + ); + } + + logctx.cleanup_successful(); + } + + /// Verifies that the archive plan streams rather than pre-computing all the + /// steps it has to do at once + /// + /// This property is important for scalability and memory usage. + #[test] + fn test_archiving_is_streaming() { + // Set up the test. + let logctx = test_setup_log("test_archiving_is_streaming"); + let log = &logctx.log; + + // Load the test data + let files = load_test_files().unwrap(); + + // Begin a simulated archive. + let fake_output_dir = Utf8Path::new("/fake-output-directory"); + let lister = TestLister::new_for_test_data(&files); + let plan = test_archive( + log, + &files, + fake_output_dir, + ArchiveKind::Final, + &lister, + ); + + // Verify that the archiver operates in a streaming way by checking that + // each archived file is contained in the most-recently-listed + // directory. If it's not, then it must have come from some previously + // listed directory, which means that the archiver should have returned + // it before listing the next directory. In other words, that would + // mean that the archiver read ahead of the directory whose files it's + // currently archiving, which is the thing we're trying to check + // doesn't happen. 
+ for step in plan.to_steps() { + let step = step.expect("test lister does not produce errors"); + let ArchiveStep::ArchiveFile(archive_file) = &step else { + continue; + }; + + let last_listed = lister.last_listed(); + let last = last_listed + .as_ref() + .expect("listed a directory before archiving any files"); + assert!( + archive_file.input_path.starts_with(last), + "archived file is not in the most-recently-listed directory", + ); + } + + logctx.cleanup_successful(); + } + + /// Verifies that failure to list a directory does not affect archiving + /// other directories + #[test] + fn test_directory_list_error() { + // Set up the test. + let logctx = test_setup_log("test_directory_list_error"); + let log = &logctx.log; + + // Load the test data + let files = load_test_files().unwrap(); + + // Choose a directory for which to inject an error. + let fail_dir = files + .iter() + .find_map(|test_file| { + if matches!(&test_file.kind, TestFileKind::Ignored) { + None + } else { + let parent = test_file.path.parent().unwrap(); + Some(Utf8Path::new(parent)) + } + }) + .expect("at least one non-ignored file in test data"); + info!( + log, + "injecting error for directory"; + "directory" => fail_dir.as_str(), + ); + + // Begin a simulated archive. Configure the lister to inject an error + // for the directory that we chose. + let fake_output_dir = Utf8Path::new("/fake-output-directory"); + let mut lister = TestLister::new_for_test_data(&files); + lister.inject_error(fail_dir); + let plan = test_archive( + log, + &files, + fake_output_dir, + ArchiveKind::Final, + &lister, + ); + + // Now walk through the archive plan and make sure: + // (1) Everything that's not in this directory gets archived. + // (2) There's an error produced for this directory. + // (3) Nothing is archived within this directory. + let mut unarchived_files = files.clone(); + let mut nerrors = 0; + for step in plan.to_steps() { + let step = match step { + Err(error) => { + let error = InlineErrorChain::new(&*error); + let error_str = error.to_string(); + debug!(log, "found error"; error); + assert!(error_str.contains(fail_dir.as_str())); + assert!(error_str.contains("injected error")); + nerrors += 1; + continue; + } + Ok(step) => step, + }; + + let ArchiveStep::ArchiveFile(archive_file) = &step else { + continue; + }; + + assert!( + !archive_file.input_path.starts_with(fail_dir), + "archived file in the directory where we injected an error" + ); + + let _ = unarchived_files + .remove(archive_file.input_path.as_path()) + .expect("archived file was in list of test files"); + } + + // We should see one error for each time the directory that we chose was + // listed. That should always be at least once. It could be more than + // once, depending on how rules are configured. For example, with two + // rules for syslog (/var/adm/messages.* and /var/adm/messages), there + // would be two errors for /var/adm. + assert_ne!( + nerrors, 0, + "expected at least one error after injecting one" + ); + + for file in unarchived_files { + assert!( + file.path.starts_with(fail_dir), + "missed file: {:?}", + file.path + ); + } + + logctx.cleanup_successful(); + } + + /// Verifies that failure to fetch file details does not affect archiving + /// other files + #[test] + fn test_file_metadata_error() { + // Set up the test. + let logctx = test_setup_log("test_file_metadata_error"); + let log = &logctx.log; + + // Load the test data + let files = load_test_files().unwrap(); + + // Find a directory that contains at least two files. 
We'll inject an + // error for one of those files. + let mut fail_file = None; + { + let mut dirs_with_files: BTreeSet<_> = BTreeSet::new(); + for test_file in &files { + if matches!(&test_file.kind, TestFileKind::Ignored) { + continue; + } + let file = &test_file.path; + let dir = Utf8Path::new( + file.parent().expect("test file has parent directory"), + ); + if dirs_with_files.contains(dir) { + fail_file = Some(file); + break; + } + + dirs_with_files.insert(dir); + } + }; + let Some(fail_file) = fail_file else { + panic!( + "test data had no directory with multiple non-ignored files" + ); + }; + + // Begin a simulated archive. Configure the lister to inject an error + // on the path that we selected above. + let fake_output_dir = Utf8Path::new("/fake-output-directory"); + let mut lister = TestLister::new_for_test_data(&files); + lister.inject_error(fail_file); + let plan = test_archive( + log, + &files, + fake_output_dir, + ArchiveKind::Final, + &lister, + ); + + // Run through the archive plan and verify: + // + // (1) We get exactly one error and it's for the path we injected an + // error for. + // (2) That file does not get archived. + // (2) Every other file gets archived. + let mut unarchived_files = files.clone(); + let mut nerrors = 0; + for step in plan.to_steps() { + let step = match step { + Err(error) => { + let error = InlineErrorChain::new(&*error); + let error_str = error.to_string(); + debug!(log, "found error"; error); + assert!(error_str.contains(fail_file.as_str())); + assert!(error_str.contains("injected error")); + nerrors += 1; + continue; + } + Ok(step) => step, + }; + + let ArchiveStep::ArchiveFile(ArchiveFile { input_path, .. }) = + &step + else { + continue; + }; + + assert!( + input_path != fail_file, + "unexpectedly archived file for which we injected an error" + ); + + let _ = unarchived_files + .remove(input_path.as_path()) + .expect("archived file was in list of test files"); + } + + // There should be exactly one error. + assert_eq!( + nerrors, 1, + "expected exatcly one error after injecting only one error \ + on a file path", + ); + + // There should be exactly one file that was not archived. + assert_eq!(unarchived_files.len(), 1); + assert!(unarchived_files.contains_key(fail_file.as_path())); + + logctx.cleanup_successful(); + } + + #[test] + fn test_naming_logs() { + // template used for other tests + let template = ArchiveFile { + input_path: Utf8Path::new("/nonexistent/one/two.log.0").to_owned(), + mtime: Some("2025-12-12T16:51:00-07:00".parse().unwrap()), + output_directory: Utf8Path::new("/nonexistent/out").to_owned(), + namer: &NameRotatedLogFile, + delete_original: true, + rule: "dummy rule", + }; + + let empty_lister = TestLister::empty(); + + // ordinary case of a rotated log file name: output filename generated + // based on input and mtime + let input = ArchiveFile { + input_path: Utf8Path::new("/nonexistent/one/two.log.0").to_owned(), + ..template.clone() + }; + let filename = input.choose_filename(&empty_lister).unwrap(); + assert_eq!(filename.as_ref(), "two.log.1765583460"); + + // ordinary case with a live log file name + let input = ArchiveFile { + input_path: Utf8Path::new("/nonexistent/one/two.log").to_owned(), + ..template.clone() + }; + let filename = input.choose_filename(&empty_lister).unwrap(); + assert_eq!(filename.as_ref(), "two.1765583460"); + + // case: rotated log file, no mtime available + // (this may never happen in practice) + // + // The current mtime should be used instead. 
+ let input = ArchiveFile { + input_path: Utf8Path::new("/nonexistent/one/two.log.0").to_owned(), + mtime: None, + ..template.clone() + }; + let before = Utc::now().with_nanosecond(0).unwrap(); + let filename = input.choose_filename(&empty_lister).unwrap(); + let after = Utc::now(); + assert!(before <= after); + // The resulting filename should be "two.log.MTIME". + let (prefix, mtime) = + filename.as_ref().rsplit_once(".").expect("unexpected filename"); + assert_eq!(prefix, "two.log"); + let parsed: DateTime = DateTime::from_timestamp( + mtime.parse().expect("expected Unix timestamp in filename"), + 0, + ) + .unwrap(); + assert!(before <= parsed); + assert!(parsed <= after); + + // case: live log file, no mtime available + // (this may never happen in practice) + // + // The current mtime should be used instead. + let input = ArchiveFile { + input_path: Utf8Path::new("/nonexistent/one/two.log").to_owned(), + mtime: None, + ..template.clone() + }; + let before = Utc::now().with_nanosecond(0).unwrap(); + let filename = input.choose_filename(&empty_lister).unwrap(); + let after = Utc::now(); + assert!(before <= after); + // The resulting filename should be "two.MTIME". + let (prefix, mtime) = + filename.as_ref().rsplit_once(".").expect("unexpected filename"); + assert_eq!(prefix, "two"); + let parsed: DateTime = DateTime::from_timestamp( + mtime.parse().expect("expected Unix timestamp in filename"), + 0, + ) + .unwrap(); + assert!(before <= parsed); + assert!(parsed <= after); + + // case: the normal output filename already exists + // expected behavior: the "mtime" in the filename is incremented + let input = ArchiveFile { + input_path: Utf8Path::new("/nonexistent/one/two.log.0").to_owned(), + ..template.clone() + }; + let lister = TestLister::new(["/nonexistent/out/two.log.1765583460"]); + let filename = input.choose_filename(&lister).unwrap(); + assert_eq!(filename.as_ref(), "two.log.1765583461"); + + // case: several closely-named output filenames also exist + let lister = TestLister::new([ + "/nonexistent/out/two.log.1765583460", + "/nonexistent/out/two.log.1765583461", + "/nonexistent/out/two.log.1765583462", + "/nonexistent/out/two.log.1765583464", + ]); + let filename = input.choose_filename(&lister).unwrap(); + assert_eq!(filename.as_ref(), "two.log.1765583463"); + + // case: too many closely-named output files also exist + let colliding_filenames: Vec<_> = (0..=MAX_COLLIDING_FILENAMES) + .map(|i| { + format!( + "/nonexistent/out/two.log.{}", + 1765583460u64 + u64::from(i) + ) + }) + .collect(); + let lister = TestLister::new(colliding_filenames.iter()); + let error = input.choose_filename(&lister).unwrap_err(); + assert!( + error.to_string().contains("too many files with colliding names") + ); + } +} diff --git a/sled-agent/config-reconciler/src/debug_collector/file_archiver/rules.rs b/sled-agent/config-reconciler/src/debug_collector/file_archiver/rules.rs new file mode 100644 index 00000000000..480697fbc0c --- /dev/null +++ b/sled-agent/config-reconciler/src/debug_collector/file_archiver/rules.rs @@ -0,0 +1,297 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! 
Rules used for determining what debug data to collect + +use super::filesystem::FileLister; +use super::filesystem::Filename; +use anyhow::anyhow; +use camino::Utf8Path; +use camino::Utf8PathBuf; +use chrono::DateTime; +use chrono::Utc; +use iddqd::IdOrdItem; +use iddqd::IdOrdMap; +use iddqd::id_upcast; +use regex::Regex; +use std::sync::LazyLock; + +/// Describes a source of debug data +/// +/// In practice, this corresponds to either: +/// +/// * the root filesystem of an illumos **zone** +/// * a **cores** (also called **crash**) dataset where user process core dumps +/// and kernel crash dumps are initially stored +#[derive(Clone)] +pub(crate) struct Source { + pub(crate) input_prefix: Utf8PathBuf, + pub(crate) output_prefix: Utf8PathBuf, +} + +/// Describes debug data to be archived from within some `Source`. +/// +/// Rules specify a path within the source where the files are found (e.g., +/// "var/svc/log") and a pattern for specifying files within that directory that +/// should be covered by the rule (e.g., "*.log"). The rule is applied across +/// several sources (in this case: illumos zones). A rule might cover "all the +/// files in a given cores dataset" or "the rotated SMF log files for a given +/// zone". +/// +/// It may be easiest to understand this by example. See [`ALL_RULES`] for all +/// of the rules. +/// +/// There are basically two kinds of rules: +/// +/// * **Zone** rules are applied to root filesystems of illumos zones, +/// including the global zone and non-global zones. These have scope +/// `RuleScope::ZoneMutable` or `RuleScope::ZoneAlways`. These describe how +/// to find the zone's log files. +/// +/// * **Cores** rules are applied to cores datasets (also known as "crash +/// datasets"), which contain kernel crash dumps and process core dumps. +pub(crate) struct Rule { + /// human-readable description of the rule + pub label: &'static str, + /// identifies what types of sources this rule is supposed to be applied to + pub rule_scope: RuleScope, + /// identifies the path to a directory within a source's input directory + /// that contains the data described by this rule + pub directory: Utf8PathBuf, + /// describes which files within `directory` are identified by this rule + regex: Regex, + /// configures whether the original files associated with this rule should + /// be deleted once they're archived + /// + /// For example, rotated log files are deleted when archived. Live log + /// files are not. 
+    pub delete_original: bool,
+    /// Describes how to construct the name of a file that's being archived
+    pub naming: &'static (dyn NamingRule + Send + Sync),
+}
+
+impl Rule {
+    /// Returns true if this rule specifies that the given `filename` should be
+    /// archived
+    pub(crate) fn include_file(&self, filename: &Filename) -> bool {
+        self.regex.is_match(filename.as_ref())
+    }
+}
+
+impl IdOrdItem for Rule {
+    type Key<'a> = &'static str;
+    fn key(&self) -> Self::Key<'_> {
+        self.label
+    }
+    id_upcast!();
+}
+
+/// Describes what Sources a rule can be applied to
+pub(crate) enum RuleScope {
+    /// this rule applies to all cores directories
+    CoresDirectory,
+    /// this rule applies to zone roots for "everything" collections, but not
+    /// "immutable" ones
+    ZoneMutable,
+    /// this rule applies to zone roots always, regardless of whether or not
+    /// we're collecting immutable data only
+    ZoneAlways,
+}
+
+/// path within a zone's root filesystem to its SMF logs
+static VAR_SVC_LOG: &str = "var/svc/log";
+/// path within a zone's root filesystem to its syslog
+static VAR_ADM: &str = "var/adm";
+
+/// List of all archive rules in the system
+///
+/// **NOTE:** If you change these rules, you may also need to update the testing
+/// data used by the test suite. The test suite uses path names from real
+/// systems to test various properties about these rules:
+///
+/// * that all files in the test data are covered by exactly one rule
+///   (rules should not specify overlapping files)
+/// * that all rules are covered by the test data
+pub(crate) static ALL_RULES: LazyLock<IdOrdMap<Rule>> = LazyLock::new(|| {
+    let rules = [
+        Rule {
+            label: "process core files and kernel crash dumps",
+            rule_scope: RuleScope::CoresDirectory,
+            directory: ".".parse().unwrap(),
+            regex: "^.*$".parse().unwrap(),
+            delete_original: true,
+            naming: &NameIdentity,
+        },
+        Rule {
+            label: "live SMF log files",
+            rule_scope: RuleScope::ZoneMutable,
+            directory: VAR_SVC_LOG.parse().unwrap(),
+            regex: "^.*\\.log$".parse().unwrap(),
+            delete_original: false,
+            naming: &NameLiveLogFile,
+        },
+        Rule {
+            label: "live syslog files",
+            rule_scope: RuleScope::ZoneMutable,
+            directory: VAR_ADM.parse().unwrap(),
+            regex: "^messages$".parse().unwrap(),
+            delete_original: false,
+            naming: &NameLiveLogFile,
+        },
+        Rule {
+            label: "rotated SMF log files",
+            rule_scope: RuleScope::ZoneAlways,
+            directory: VAR_SVC_LOG.parse().unwrap(),
+            regex: "^.*\\.log\\.[0-9]+$".parse().unwrap(),
+            delete_original: true,
+            naming: &NameRotatedLogFile,
+        },
+        Rule {
+            label: "rotated syslog files",
+            rule_scope: RuleScope::ZoneAlways,
+            directory: VAR_ADM.parse().unwrap(),
+            regex: "^messages\\.[0-9]+$".parse().unwrap(),
+            delete_original: true,
+            naming: &NameRotatedLogFile,
+        },
+    ];
+
+    // We could do this more concisely with a `collect()` or `IdOrdMap::from`,
+    // but those would silently discard duplicates. We want to detect these and
+    // provide a clear error message.
+    let mut rv = IdOrdMap::new();
+    for rule in rules {
+        let label = rule.label;
+        if let Err(_) = rv.insert_unique(rule) {
+            panic!("found multiple rules with the same label: {:?}", label);
+        }
+    }
+
+    rv
+});
+
+/// Describes a combination of `source` and `rule`
+///
+/// This essentially takes a `Rule` and applies it to a specific source. For
+/// example, a rule might say how to find the log files within a given zone. A
+/// specific zone will be its own `Source`. An `ArchiveGroup` puts these
+/// together to represent the collection of log files from a specific zone.
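+///
+/// For example (hypothetical paths): applying the "rotated SMF log files" rule
+/// to the `Source` built for a zone named `oxz_example` yields a group whose
+/// `input_directory()` is `<zone root>/var/svc/log` and whose
+/// `output_directory(debug_dir)` is `<debug_dir>/oxz_example`.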
+pub(crate) struct ArchiveGroup<'a> {
+    pub(crate) source: Source,
+    pub(crate) rule: &'a Rule,
+}
+
+impl<'a> ArchiveGroup<'a> {
+    pub(crate) fn input_directory(&self) -> Utf8PathBuf {
+        self.source.input_prefix.join(&self.rule.directory)
+    }
+
+    pub(crate) fn output_directory(&self, debug_dir: &Utf8Path) -> Utf8PathBuf {
+        debug_dir.join(&self.source.output_prefix)
+    }
+}
+
+/// Describes how to construct an archived file's final name based on its
+/// original name and mtime
+///
+/// `archived_file_name` is provided with `lister`, which can be used to
+/// determine if the desired output filename already exists and choose another
+/// name. **If the name of an existing file is returned, that file will be
+/// overwritten by the file that's being archived.**
+pub(crate) trait NamingRule {
+    fn archived_file_name(
+        &self,
+        source_file_name: &Filename,
+        source_file_mtime: Option<DateTime<Utc>>,
+        lister: &dyn FileLister,
+        output_directory: &Utf8Path,
+    ) -> Result<Filename, anyhow::Error>;
+}
+
+pub(crate) const MAX_COLLIDING_FILENAMES: u16 = 30;
+
+/// `NamingRule` that's used for rotated log files
+///
+/// These files are typically named `foo.0`, `foo.1`, etc. The integer at the
+/// end is provided by logadm(8) and has no meaning for us. This implementation
+/// replaces that integer with the file's `mtime` as a Unix timestamp. When
+/// that would collide with an existing filename, it increments the `mtime`
+/// until it gets a unique value (up to `MAX_COLLIDING_FILENAMES` tries). This
+/// behavior is historical and should potentially be revisited.
+pub(crate) struct NameRotatedLogFile;
+impl NamingRule for NameRotatedLogFile {
+    fn archived_file_name(
+        &self,
+        source_file_name: &Filename,
+        source_file_mtime: Option<DateTime<Utc>>,
+        lister: &dyn FileLister,
+        output_directory: &Utf8Path,
+    ) -> Result<Filename, anyhow::Error> {
+        let filename_base = match source_file_name.as_ref().rsplit_once('.') {
+            Some((base, _extension)) => base,
+            None => source_file_name.as_ref(),
+        };
+
+        let mtime_as_seconds =
+            source_file_mtime.unwrap_or_else(|| Utc::now()).timestamp();
+        for i in 0..MAX_COLLIDING_FILENAMES {
+            let rv =
+                format!("{filename_base}.{}", mtime_as_seconds + i64::from(i));
+            let dest = output_directory.join(&rv);
+            if !lister.file_exists(&dest)? {
+                // unwrap(): we started with a valid `Filename` and did not add
+                // any slashes here.
+                return Ok(Filename::try_from(rv).unwrap());
+            }
+        }
+
+        Err(anyhow!(
+            "failed to choose archive file name for file {source_file_name:?} \
+            because there are too many files with colliding names (at least \
+            {MAX_COLLIDING_FILENAMES})"
+        ))
+    }
+}
+
+/// `NamingRule` that's used for live log files
+///
+/// These files can have an arbitrary name `foo`. (SMF log files have a `.log`
+/// suffix, but syslog files do not.) For historical reasons, this uses the
+/// same implementation as `NameRotatedLogFile`. This behavior should probably
+/// be revisited.
+struct NameLiveLogFile;
+impl NamingRule for NameLiveLogFile {
+    fn archived_file_name(
+        &self,
+        source_file_name: &Filename,
+        source_file_mtime: Option<DateTime<Utc>>,
+        lister: &dyn FileLister,
+        output_directory: &Utf8Path,
+    ) -> Result<Filename, anyhow::Error> {
+        NameRotatedLogFile.archived_file_name(
+            source_file_name,
+            source_file_mtime,
+            lister,
+            output_directory,
+        )
+    }
+}
+
+/// `NamingRule` that's used for files whose names get preserved across archival
+///
+/// This includes kernel crash dumps, process core dumps, etc. This behavior is
+/// historical. It does not account for cases where the output filename already
+/// exists, which means those files may be overwritten.
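+/// For example, a core file named `core.123` is archived as `core.123`; if a
+/// later pass archives another file with the same name, it replaces the
+/// earlier copy.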
+struct NameIdentity; +impl NamingRule for NameIdentity { + fn archived_file_name( + &self, + source_file_name: &Filename, + _source_file_mtime: Option>, + _lister: &dyn FileLister, + _output_directory: &Utf8Path, + ) -> Result { + Ok(source_file_name.clone()) + } +} diff --git a/sled-agent/config-reconciler/src/debug_collector/file_archiver/test_helpers.rs b/sled-agent/config-reconciler/src/debug_collector/file_archiver/test_helpers.rs new file mode 100644 index 00000000000..7720cd635a2 --- /dev/null +++ b/sled-agent/config-reconciler/src/debug_collector/file_archiver/test_helpers.rs @@ -0,0 +1,377 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Helpers for working with the testing data used in the test suite + +use super::filesystem::FileLister; +use super::filesystem::Filename; +use super::planning::ArchiveKind; +use super::planning::ArchivePlan; +use super::planning::ArchivePlanner; +use anyhow::Context; +use anyhow::anyhow; +use anyhow::bail; +use camino::Utf8Path; +use camino::Utf8PathBuf; +use chrono::DateTime; +use chrono::Utc; +use iddqd::IdOrdItem; +use iddqd::IdOrdMap; +use iddqd::id_upcast; +use regex::Regex; +use slog::Logger; +use slog::debug; +use std::collections::BTreeSet; +use std::sync::LazyLock; +use std::sync::Mutex; +use strum::Display; +use strum::EnumDiscriminants; +use strum::EnumIter; +use strum::IntoDiscriminant; +use strum::IntoEnumIterator; + +/// Loads the filenames in the test data +pub(crate) fn load_test_files() -> anyhow::Result> { + load_test_data_paths()? + .into_iter() + .map(|path| { + TestFileKind::try_from(path.as_ref()) + .context("may need to update load_test_files()?") + .map(|kind| TestFile { path, kind }) + }) + .collect() +} + +fn load_test_data_paths() -> anyhow::Result> { + let path = "test-data/debug-files.txt"; + std::fs::read_to_string(&path) + .with_context(|| format!("read {path:?}"))? + .lines() + .enumerate() + .map(|(i, l)| (i, l.trim())) + .filter(|(_i, l)| !l.is_empty() && !l.starts_with("#")) + .map(|(i, l)| { + Utf8PathBuf::try_from(l).map_err(|_err| { + anyhow!("{path:?} line {}: non-UTF8 file path", i + 1) + }) + }) + .collect() +} + +/// Test that our test data includes all the kinds of things that we expect. +/// If you see this test failing, presumably you updated the test data and +/// you'll need to make sure it's still representative. +#[test] +fn test_test_data() { + // Load the test data and determine what kind of file each one is. + let files = load_test_files().unwrap(); + + // Create a set of all the kinds of test files that we have not seen so + // far. We'll remove from this set as we find files of this kind. Any + // kinds left over at the end are missing from our test data. + let mut all_kinds: BTreeSet<_> = + TestFileKindDiscriminants::iter().collect(); + // We don't care about finding the "ignored" kind. 
+    all_kinds.remove(&TestFileKindDiscriminants::Ignored);
+    for test_file in files {
+        println!("{} {}", test_file.kind, test_file.path);
+        all_kinds.remove(&test_file.kind.discriminant());
+    }
+
+    if !all_kinds.is_empty() {
+        panic!("missing file in test data for kinds: {:?}", all_kinds);
+    }
+}
+
+/// Plan an archive operation based on the testing data
+pub(crate) fn test_archive<'a>(
+    log: &Logger,
+    test_files: &IdOrdMap<TestFile>,
+    output_dir: &Utf8Path,
+    what: ArchiveKind,
+    lister: &'a TestLister,
+) -> ArchivePlan<'a> {
+    // Construct sources that correspond with the test data.
+    let cores_datasets: BTreeSet<_> = test_files
+        .iter()
+        .filter_map(|test_file| test_file.kind.cores_directory())
+        .collect();
+    let zone_infos: BTreeSet<_> = test_files
+        .iter()
+        .filter_map(|test_file| test_file.kind.zone_info())
+        .collect();
+
+    // Plan an archival pass.
+    let mut planner =
+        ArchivePlanner::new_with_lister(log, what, output_dir, lister);
+
+    for cores_dir in cores_datasets {
+        debug!(log, "including cores directory"; "cores_dir" => %cores_dir);
+        planner.include_cores_directory(cores_dir);
+    }
+
+    for (zone_name, zone_root) in zone_infos {
+        debug!(
+            log,
+            "including zone";
+            "zone_name" => zone_name,
+            "zone_root" => %zone_root,
+        );
+        planner.include_zone(zone_name, zone_root);
+    }
+
+    planner.into_plan()
+}
+
+/// Describes one file path in the testing data
+#[derive(Clone)]
+pub(crate) struct TestFile {
+    /// path to the file
+    pub path: Utf8PathBuf,
+    /// what kind of file we determined it to be, based on its path
+    pub kind: TestFileKind,
+}
+
+impl IdOrdItem for TestFile {
+    type Key<'a> = &'a Utf8Path;
+
+    fn key(&self) -> Self::Key<'_> {
+        &self.path
+    }
+
+    id_upcast!();
+}
+
+/// Describes what kind of file we're looking at and what source it's in
+#[derive(Clone, Debug, Display, EnumIter, EnumDiscriminants)]
+#[strum_discriminants(derive(EnumIter, Ord, PartialOrd))]
+pub(crate) enum TestFileKind {
+    KernelCrashDump {
+        cores_directory: String,
+    },
+    ProcessCoreDump {
+        cores_directory: String,
+    },
+    LogSmfRotated {
+        zone_name: String,
+        zone_root: String,
+    },
+    LogSmfLive {
+        zone_name: String,
+        zone_root: String,
+    },
+    LogSyslogRotated {
+        zone_name: String,
+        zone_root: String,
+    },
+    LogSyslogLive {
+        zone_name: String,
+        zone_root: String,
+    },
+    GlobalLogSmfRotated,
+    GlobalLogSmfLive,
+    GlobalLogSyslogRotated,
+    GlobalLogSyslogLive,
+    /// files we don't especially care about, but are in the test data to
+    /// ensure that they don't create a problem
+    Ignored,
+}
+
+impl TestFileKind {
+    /// Returns information about the cores directory this file is in, if any
+    pub fn cores_directory(&self) -> Option<&Utf8Path> {
+        match self {
+            TestFileKind::KernelCrashDump { cores_directory }
+            | TestFileKind::ProcessCoreDump { cores_directory } => {
+                Some(Utf8Path::new(cores_directory))
+            }
+            TestFileKind::LogSmfRotated { .. }
+            | TestFileKind::LogSmfLive { .. }
+            | TestFileKind::LogSyslogRotated { .. }
+            | TestFileKind::LogSyslogLive { .. }
+            | TestFileKind::GlobalLogSmfRotated
+            | TestFileKind::GlobalLogSmfLive
+            | TestFileKind::GlobalLogSyslogRotated
+            | TestFileKind::GlobalLogSyslogLive
+            | TestFileKind::Ignored => None,
+        }
+    }
+
+    /// Returns information about the zone this file is in, if any
+    pub fn zone_info(&self) -> Option<(&str, &Utf8Path)> {
+        match self {
+            TestFileKind::KernelCrashDump { .. }
+            | TestFileKind::ProcessCoreDump { .. }
+            | TestFileKind::Ignored => None,
+            TestFileKind::LogSmfRotated { zone_name, zone_root }
+            | TestFileKind::LogSmfLive { zone_name, zone_root }
+            | TestFileKind::LogSyslogRotated { zone_name, zone_root }
+            | TestFileKind::LogSyslogLive { zone_name, zone_root } => {
+                Some((zone_name, Utf8Path::new(zone_root)))
+            }
+            TestFileKind::GlobalLogSmfRotated
+            | TestFileKind::GlobalLogSmfLive
+            | TestFileKind::GlobalLogSyslogRotated
+            | TestFileKind::GlobalLogSyslogLive => {
+                Some(("global", Utf8Path::new("/")))
+            }
+        }
+    }
+}
+
+static RE_CORES_DATASET: LazyLock<Regex> =
+    LazyLock::new(|| Regex::new("^(/pool/int/[^/]+/crash)/[^/]+$").unwrap());
+
+static RE_NONGLOBAL_ZONE: LazyLock<Regex> = LazyLock::new(|| {
+    Regex::new("^/pool/ext/[^/]+/crypt/zone/([^/]+)/root").unwrap()
+});
+
+impl TryFrom<&Utf8Path> for TestFileKind {
+    type Error = anyhow::Error;
+
+    fn try_from(value: &Utf8Path) -> Result<Self, Self::Error> {
+        let s = value.as_str();
+
+        if let Some(c) = RE_CORES_DATASET.captures(s) {
+            let (_, [cores_directory]) = c.extract();
+            let cores_directory = cores_directory.to_owned();
+            if s.ends_with("bounds") {
+                Ok(TestFileKind::Ignored)
+            } else if s.contains("/vmdump.") {
+                Ok(TestFileKind::KernelCrashDump { cores_directory })
+            } else if s.contains("/core.") {
+                Ok(TestFileKind::ProcessCoreDump { cores_directory })
+            } else {
+                Err(anyhow!("unknown cores dataset test file kind"))
+            }
+        } else if let Some(c) = RE_NONGLOBAL_ZONE.captures(s) {
+            let (zone_root, [zone_name]) = c.extract();
+            let zone_root = zone_root.to_owned();
+            let zone_name = zone_name.to_owned();
+            if s.ends_with("/messages") {
+                Ok(TestFileKind::LogSyslogLive { zone_name, zone_root })
+            } else if s.contains("/messages.") {
+                Ok(TestFileKind::LogSyslogRotated { zone_name, zone_root })
+            } else if s.contains("/var/svc/log") {
+                if s.ends_with(".log") {
+                    Ok(TestFileKind::LogSmfLive { zone_name, zone_root })
+                } else {
+                    Ok(TestFileKind::LogSmfRotated { zone_name, zone_root })
+                }
+            } else {
+                Err(anyhow!("unknown non-global zone test file kind"))
+            }
+        } else {
+            if s == "/var/adm/messages" {
+                Ok(TestFileKind::GlobalLogSyslogLive)
+            } else if s.starts_with("/var/adm") && s.contains("/messages.") {
+                Ok(TestFileKind::GlobalLogSyslogRotated)
+            } else if s.starts_with("/var/svc/log") {
+                if s.ends_with(".log") {
+                    Ok(TestFileKind::GlobalLogSmfLive)
+                } else {
+                    Ok(TestFileKind::GlobalLogSmfRotated)
+                }
+            } else {
+                Err(anyhow!("unknown test file kind"))
+            }
+        }
+    }
+}
+
+/// Implementation of `FileLister` built atop the testing data
+pub(crate) struct TestLister<'a> {
+    /// files in our fake filesystem
+    files: BTreeSet<&'a Utf8Path>,
+    /// describes the last path listed (used in tests to verify behavior)
+    last_listed: Mutex<Option<Utf8PathBuf>>,
+    /// inject errors when operating on this path
+    injected_error: Option<&'a Utf8Path>,
+}
+
+impl<'a> TestLister<'a> {
+    /// Returns a lister that reports no files
+    pub fn empty() -> Self {
+        Self::new::<_, &'a str>(std::iter::empty())
+    }
+
+    /// Returns a lister for the test data
+    pub fn new_for_test_data(files: &'a IdOrdMap<TestFile>) -> Self {
+        Self::new(files.iter().map(|test_file| test_file.path.as_path()))
+    }
+
+    /// Returns a lister backed by the specified files
+    pub fn new<I, P>(files: I) -> Self
+    where
+        I: IntoIterator<Item = &'a P>,
+        P: AsRef<Utf8Path> + ?Sized + 'a,
+    {
+        Self {
+            files: files.into_iter().map(|p| p.as_ref()).collect(),
+            last_listed: Mutex::new(None),
+            injected_error: None,
+        }
+    }
+
+    /// Configure this lister to inject errors when accessing this path
+    ///
+    /// Clears any previously injected error.
+    pub fn inject_error(&mut self, fail_path: &'a Utf8Path) {
+        self.injected_error = Some(fail_path);
+    }
+
+    pub fn last_listed(&self) -> Option<Utf8PathBuf> {
+        self.last_listed.lock().unwrap().clone()
+    }
+}
+
+impl FileLister for TestLister<'_> {
+    fn list_files(
+        &self,
+        path: &Utf8Path,
+    ) -> Vec<Result<Filename, anyhow::Error>> {
+        // Keep track of the last path that was listed.
+        *self.last_listed.lock().unwrap() = Some(path.to_owned());
+
+        // Inject any errors we've been configured to inject.
+        if let Some(fail_path) = self.injected_error {
+            if path == fail_path {
+                return vec![Err(anyhow!("injected error for {fail_path:?}"))];
+            }
+        }
+
+        // Create a directory listing from the files in our test data.
+        self.files
+            .iter()
+            .filter_map(|file_path| {
+                let directory =
+                    file_path.parent().expect("test file has a parent");
+                (directory == path).then(|| {
+                    let filename = file_path
+                        .file_name()
+                        .expect("test file has a filename");
+                    Ok(Filename::try_from(filename.to_owned())
+                        .expect("filename has no slashes"))
+                })
+            })
+            .collect()
+    }
+
+    fn file_mtime(
+        &self,
+        path: &Utf8Path,
+    ) -> Result<Option<DateTime<Utc>>, anyhow::Error> {
+        if let Some(fail_path) = self.injected_error {
+            if path == fail_path {
+                bail!("injected error for {fail_path:?}");
+            }
+        }
+
+        Ok(Some("2025-12-12T16:51:00-07:00".parse().unwrap()))
+    }
+
+    fn file_exists(&self, path: &Utf8Path) -> Result<bool, anyhow::Error> {
+        Ok(self.files.contains(path))
+    }
+}
diff --git a/sled-agent/config-reconciler/src/debug_collector/mod.rs b/sled-agent/config-reconciler/src/debug_collector/mod.rs
index 367b75029ed..c1764d65708 100644
--- a/sled-agent/config-reconciler/src/debug_collector/mod.rs
+++ b/sled-agent/config-reconciler/src/debug_collector/mod.rs
@@ -84,6 +84,7 @@
 //!    +----------------------+
 //!    ```
 
+mod file_archiver;
 mod handle;
 mod helpers;
 mod task;
diff --git a/sled-agent/config-reconciler/src/debug_collector/worker.rs b/sled-agent/config-reconciler/src/debug_collector/worker.rs
index c092a5a4843..e1b2cf1f408 100644
--- a/sled-agent/config-reconciler/src/debug_collector/worker.rs
+++ b/sled-agent/config-reconciler/src/debug_collector/worker.rs
@@ -186,6 +186,8 @@
 //! the _live_ log files are also archived, since they will not have a chance
 //! to get rotated and so would otherwise be lost.
+use super::file_archiver::ArchiveKind; +use super::file_archiver::ArchivePlanner; use super::helpers::CoreDumpAdmInvoker; use super::helpers::ZFS_PROP_AVAILABLE; use super::helpers::ZFS_PROP_USED; @@ -210,12 +212,10 @@ use slog::warn; use slog_error_chain::InlineErrorChain; use std::collections::HashSet; use std::ffi::OsString; -use std::path::{Path, PathBuf}; use std::sync::Arc; -use std::time::{Duration, SystemTime, SystemTimeError, UNIX_EPOCH}; +use std::time::{Duration, SystemTimeError, UNIX_EPOCH}; use tokio::sync::mpsc::Receiver; use tokio::sync::oneshot; -use zone::ZoneError; // Parameters related to management of storage on debug datasets @@ -884,110 +884,62 @@ impl DebugCollectorWorker { } } - async fn archive_files(&self) -> tokio::io::Result<()> { - if let Some(debug_dir) = &self.chosen_debug_dir { - if self.known_core_dirs.is_empty() { - info!(self.log, "No core dump locations yet known."); - } - for core_dir in &self.known_core_dirs { - if let Ok(dir) = core_dir.as_ref().read_dir() { - for entry in dir.flatten() { - if let Some(path) = entry.file_name().to_str() { - let dest = debug_dir.as_ref().join(path); - - if let Err(err) = - Self::copy_sync_and_remove(&entry.path(), &dest) - .await - { - error!( - self.log, - "Failed to archive {entry:?}: {err:?}" - ); - } else { - info!( - self.log, - "Relocated {entry:?} to {dest:?}" - ); - } - } else { + async fn archive_files(&self) -> Result<(), anyhow::Error> { + let log = &self.log; + let Some(debug_dir) = &self.chosen_debug_dir else { + error!( + &log, + "Archiving skipped: no archival destination available" + ); + return Ok(()); + }; + + info!(&log, "Archiving files"); + let mut archiver = ArchivePlanner::new( + log, + ArchiveKind::Periodic, + &debug_dir.as_ref(), + ); + if self.known_core_dirs.is_empty() { + warn!(self.log, "No core dump locations yet known."); + } + for core_dir in &self.known_core_dirs { + archiver.include_cores_directory(core_dir.as_ref()); + } + + match self.zone_invoker.get_zones().await { + Ok(zones) => { + for zone in zones { + let zone_path: &Utf8Path = match zone.path().try_into() { + Ok(zone_path) => zone_path, + Err(error) => { + // This should be impossible in practice. + let error = InlineErrorChain::new(&error); error!( - self.log, - "Non-UTF8 path found while archiving core \ - dumps: {entry:?}" + log, + "Cannot archive zone because its path is \ + not UTF-8"; + "zone_name" => zone.name(), + error ); + continue; } - } + }; + let zone_root = if zone.global() { + zone_path.to_owned() + } else { + zone_path.join("root") + }; + archiver.include_zone(zone.name(), &zone_root); } } - } else { - info!( - self.log, - "No archival destination for crash dumps yet chosen." 
-            );
-        }
-
-        if let Err(err) = self.archive_logs_from_running_zones().await {
-            if !matches!(err, ArchiveLogsError::NoDebugDirYet) {
-                error!(
-                    self.log,
-                    "Failure while trying to archive logs to debug dataset: \
-                    {err:?}"
-                );
+            Err(error) => {
+                let error = InlineErrorChain::new(&error);
+                warn!(log, "Failed to list running zones"; error);
             }
-        }
-
-        Ok(())
-    }
-
-    async fn copy_sync_and_remove(
-        source: impl AsRef<Path>,
-        dest: impl AsRef<Path>,
-    ) -> tokio::io::Result<()> {
-        let source = source.as_ref();
-        let dest = dest.as_ref();
-        let mut dest_f = tokio::fs::File::create(&dest).await?;
-        let mut src_f = tokio::fs::File::open(&source).await?;
-
-        tokio::io::copy(&mut src_f, &mut dest_f).await?;
-
-        dest_f.sync_all().await?;
-
-        drop(src_f);
-        drop(dest_f);
-
-        tokio::fs::remove_file(source).await?;
-        Ok(())
-    }
-
-    async fn archive_logs_from_running_zones(
-        &self,
-    ) -> Result<(), ArchiveLogsError> {
-        let debug_dir = self
-            .chosen_debug_dir
-            .as_ref()
-            .ok_or(ArchiveLogsError::NoDebugDirYet)?;
-        let oxz_zones = self.zone_invoker.get_zones().await?;
-
-        for zone in oxz_zones {
-            let zone_root = if zone.global() {
-                zone.path().to_owned()
-            } else {
-                zone.path().join("root")
-            };
-            let logdir = zone_root.join("var/svc/log");
-            let zone_name = zone.name();
-            self.archive_logs_from_zone_path(
-                debug_dir, logdir, "*.log", zone_name, false,
-            )
-            .await?;
+        };
 
-            let adm_logdir = zone_root.join("var/adm");
-            self.archive_logs_from_zone_path(
-                debug_dir, adm_logdir, "messages", zone_name, false,
-            )
-            .await?;
-        }
-        Ok(())
+
+        archiver.execute().await
     }
 
     async fn do_archive_former_zone_root(
@@ -1000,16 +952,13 @@ impl DebugCollectorWorker {
             .chosen_debug_dir
             .as_ref()
             .ok_or(ArchiveLogsError::NoDebugDirYet)?;
-        let logdir = zone_root.join("root/var/svc/log");
-        let rv = self
-            .archive_logs_from_zone_path(
-                debug_dir,
-                logdir.into(),
-                "*.log",
-                zone_name,
-                true,
-            )
-            .await;
+        let mut archiver = ArchivePlanner::new(
+            &self.log,
+            ArchiveKind::Final,
+            &debug_dir.as_ref(),
+        );
+        archiver.include_zone(zone_name, zone_root);
+        archiver.execute().await.map_err(ArchiveLogsError::Archiver)?;
         if let Err(()) = completion_tx.send(()) {
             // In practice, it would be surprising for our caller to have
             // dropped this channel.  Make a note.
@@ -1019,90 +968,6 @@ impl DebugCollectorWorker {
                 "error" => "completion channel closed",
             );
         }
-        rv
-    }
-
-    // Archives log files found in `logdir` for zone `zone_name` to the
-    // destination debug dataset.
-    //
-    // `log_name_pattern` should be a glob pattern that matches against file
-    // names, e.g., `*.log`, `mylog`.  If `include_live` is `true`, this will
-    // archive all logs, matching on `{log_name_pattern}*`.  If it is `false`,
-    // only rotated logs will be archived, matching on
-    // `{log_name_pattern}.[0-9]`.
-    async fn archive_logs_from_zone_path(
-        &self,
-        debug_dir: &DebugDataset,
-        logdir: PathBuf,
-        log_name_pattern: &str,
-        zone_name: &str,
-        include_live: bool,
-    ) -> Result<(), ArchiveLogsError> {
-        let mut rotated_log_files = Vec::new();
-        if include_live {
-            let pattern = logdir
-                .join(format!("{log_name_pattern}*"))
-                .to_str()
-                .ok_or_else(|| ArchiveLogsError::Utf8(zone_name.to_string()))?
-                .to_string();
-            rotated_log_files.extend(glob::glob(&pattern)?.flatten());
-        } else {
-            // patterns matching archived logs, e.g.
foo.log.3 - // keep checking for greater numbers of digits until we don't find - // any - for n in 1..9 { - let pattern = logdir - .join(format!("{log_name_pattern}.{}", "[0-9]".repeat(n))) - .to_str() - .ok_or_else(|| { - ArchiveLogsError::Utf8(zone_name.to_string()) - })? - .to_string(); - rotated_log_files.extend(glob::glob(&pattern)?.flatten()); - } - } - let dest_dir = debug_dir.as_ref().join(zone_name).into_std_path_buf(); - if !rotated_log_files.is_empty() { - tokio::fs::create_dir_all(&dest_dir).await?; - let count = rotated_log_files.len(); - info!( - self.log, - "Archiving {count} log files from {zone_name} zone" - ); - } else if include_live { - warn!( - self.log, - "Found no log files from {zone_name} zone, including live \ - log files" - ); - } - for entry in rotated_log_files { - let src_name = entry.file_name().unwrap(); - // as we archive them, logadm will keep resetting to .log.0, - // so we need to maintain our own numbering in the dest dataset. - // we'll use the modified date of the rotated log file, or try - // falling back to the time of archival if that fails, and - // falling back to counting up from 0 if *that* somehow fails. - let mut n = entry - .metadata() - .and_then(|m| m.modified()) - .unwrap_or_else(|_| SystemTime::now()) - .duration_since(UNIX_EPOCH) - .map(|d| d.as_secs()) - .unwrap_or(0); - let mut dest; - loop { - dest = dest_dir.join(src_name).with_extension(format!("{n}")); - if dest.exists() { - n += 1; - } else { - break; - } - } - if let Err(err) = Self::copy_sync_and_remove(&entry, dest).await { - warn!(self.log, "Failed to archive {entry:?}: {err:?}"); - } - } Ok(()) } @@ -1275,18 +1140,12 @@ impl DebugCollectorWorker { #[derive(thiserror::Error, Debug)] pub enum ArchiveLogsError { - #[error("I/O error: {0}")] - IoError(#[from] tokio::io::Error), - #[error("Error calling zoneadm: {0}")] - Zoneadm(#[from] ZoneError), - #[error("Non-UTF8 zone path for zone {0}")] - Utf8(String), - #[error("Glob pattern invalid: {0}")] - Glob(#[from] glob::PatternError), #[error( "No debug dir into which we should archive logs has yet been chosen" )] NoDebugDirYet, + #[error("Archive error")] + Archiver(#[source] anyhow::Error), } #[derive(thiserror::Error, Debug)] @@ -1324,8 +1183,10 @@ mod tests { use sled_storage::dataset::{CRASH_DATASET, DUMP_DATASET}; use std::collections::HashMap; use std::str::FromStr; + use std::time::SystemTime; use tokio::io::AsyncWriteExt; use zone::Zone; + use zone::ZoneError; impl Clone for ZfsGetError { fn clone(&self) -> Self { diff --git a/sled-agent/config-reconciler/test-data/debug-files.txt b/sled-agent/config-reconciler/test-data/debug-files.txt new file mode 100644 index 00000000000..cc146e4b1f4 --- /dev/null +++ b/sled-agent/config-reconciler/test-data/debug-files.txt @@ -0,0 +1,93 @@ +# In this file, blank lines and lines starting with '#' are ignored. +# +# This file contains the full paths to files seen on deployed systems +# (like the dogfood environment) that the debug collector is responsible for +# archiving. There are tests that use these paths to verify that the debug +# collector's path rules correctly collect files from production systems. +# +# If you need to update or regenerate this, you'll want to use something like: +# +# find PATHS -type f +# +# for each of several different paths. Using `find` like this makes sure that +# these are real paths from real systems. If you hand-construct a path here to +# match what the test expects, that defeats the point! 
+# +# Here are paths that we include: +# +# # Not-yet-archived kernel crash dumps and core files +# /pool/int/*/crash +# +# # Pick some zone and get its un-archived SMF logs and syslog. +# # You'll want to make sure to find one with recently rotated +# # log files (which is a little tricky since logadm only rotates +# # them every hour and the archiver picks them up within 5 +# # minutes). +# /pool/ext/*/crypt/zone/$PICK_A_ZONE/root/var/svc/log/* +# /pool/ext/*/crypt/zone/$PICK_A_ZONE/root/var/adm/messages* +# +# though you should beware that this will produce an enormous amount of output +# on systems that have been deployed for a long time. You will probably want to +# prune most of the log files out. +# +# The output here has been constructed from similar invocations on several +# systems in order to assemble a representative set. + +# Kernel crash dumps and related files in the "crash" dataset +/pool/int/35dcb885-18cf-4842-a17f-fb63e76a5f2c/crash/vmdump.0 +/pool/int/35dcb885-18cf-4842-a17f-fb63e76a5f2c/crash/bounds +/pool/int/35dcb885-18cf-4842-a17f-fb63e76a5f2c/crash/vmdump.1 + +# User process core files in the "crash" dataset +/pool/int/5a058adc-8208-4bff-b4f3-44e2651435b0/crash/core.oxz_switch.cat.27751.1765577436 +/pool/int/7c377f39-95bf-4074-8dda-7a7a102b9d2c/crash/core.oxz_propolis-server_081c9d2b-2d89-4830-b3c5-ff4439013794.propolis-server.5047.1765579111 +/pool/int/41eda85a-0820-4c91-8067-7f28af0cd408/crash/core.oxz_propolis-server_1182aa44-0367-4ea5-be41-f207a95b52bf.propolis-server.28517.1765579115 + +# Live SMF log files +/pool/ext/0c4ef358-5533-43db-ad38-a8eff716e53a/crypt/zone/oxz_crucible_5cc7c840-8e6b-48c8-ac4b-f4297f8cf61a/root/var/svc/log/application-management-net-snmp:default.log +/pool/ext/0c4ef358-5533-43db-ad38-a8eff716e53a/crypt/zone/oxz_crucible_5cc7c840-8e6b-48c8-ac4b-f4297f8cf61a/root/var/svc/log/application-security-tcsd:default.log +/pool/ext/0c4ef358-5533-43db-ad38-a8eff716e53a/crypt/zone/oxz_crucible_5cc7c840-8e6b-48c8-ac4b-f4297f8cf61a/root/var/svc/log/milestone-devices:default.log +/pool/ext/0c4ef358-5533-43db-ad38-a8eff716e53a/crypt/zone/oxz_crucible_5cc7c840-8e6b-48c8-ac4b-f4297f8cf61a/root/var/svc/log/network-tcpkey:default.log +/pool/ext/0c4ef358-5533-43db-ad38-a8eff716e53a/crypt/zone/oxz_crucible_5cc7c840-8e6b-48c8-ac4b-f4297f8cf61a/root/var/svc/log/oxide-crucible-agent:default.log +/pool/ext/0c4ef358-5533-43db-ad38-a8eff716e53a/crypt/zone/oxz_crucible_5cc7c840-8e6b-48c8-ac4b-f4297f8cf61a/root/var/svc/log/oxide-crucible-downstairs:default.log +/pool/ext/0c4ef358-5533-43db-ad38-a8eff716e53a/crypt/zone/oxz_crucible_5cc7c840-8e6b-48c8-ac4b-f4297f8cf61a/root/var/svc/log/oxide-crucible-downstairs:downstairs-5275dc20-1d32-4304-8b61-a62a575839ad.log +/pool/ext/0c4ef358-5533-43db-ad38-a8eff716e53a/crypt/zone/oxz_crucible_5cc7c840-8e6b-48c8-ac4b-f4297f8cf61a/root/var/svc/log/svc.startd.log +/pool/ext/0c4ef358-5533-43db-ad38-a8eff716e53a/crypt/zone/oxz_ntp_0460bff0-4cef-487f-aa5c-fd7e1ecef3e0/root/var/svc/log/oxide-chrony-setup:default.log +/pool/ext/0c4ef358-5533-43db-ad38-a8eff716e53a/crypt/zone/oxz_ntp_0460bff0-4cef-487f-aa5c-fd7e1ecef3e0/root/var/svc/log/oxide-ntp-admin:default.log +/pool/ext/0c4ef358-5533-43db-ad38-a8eff716e53a/crypt/zone/oxz_ntp_0460bff0-4cef-487f-aa5c-fd7e1ecef3e0/root/var/svc/log/oxide-ntp:default.log + +# Live syslog files +/pool/ext/0c4ef358-5533-43db-ad38-a8eff716e53a/crypt/zone/oxz_crucible_5cc7c840-8e6b-48c8-ac4b-f4297f8cf61a/root/var/adm/messages 
+/pool/ext/0c4ef358-5533-43db-ad38-a8eff716e53a/crypt/zone/oxz_ntp_0460bff0-4cef-487f-aa5c-fd7e1ecef3e0/root/var/adm/messages +/pool/ext/34dadf3f-f60c-4acc-b82b-4b0c82224222/crypt/zone/oxz_crucible_b12aa520-a769-4eac-b56b-09960550a831/root/var/adm/messages +/pool/ext/34dadf3f-f60c-4acc-b82b-4b0c82224222/crypt/zone/oxz_propolis-server_c78103ae-b241-4655-93fb-5f2f8bf50041/root/var/adm/messages +/pool/ext/416232c1-bc8f-403f-bacb-28403dd8fced/crypt/zone/oxz_cockroachdb_e86845b5-eabd-49f5-9a10-6dfef9066209/root/var/adm/messages +/pool/ext/416232c1-bc8f-403f-bacb-28403dd8fced/crypt/zone/oxz_crucible_85bd9bdb-1ec5-4a8d-badb-8b5d502546a1/root/var/adm/messages + +# Rotated SMF log files +/pool/ext/2115b084-be0f-4fba-941b-33a659798a9e/crypt/zone/oxz_ntp_a700528f-f600-4908-94ac-9c06442ef6b4/root/var/svc/log/application-management-net-snmp:default.log.0 +/pool/ext/2115b084-be0f-4fba-941b-33a659798a9e/crypt/zone/oxz_ntp_a700528f-f600-4908-94ac-9c06442ef6b4/root/var/svc/log/oxide-ntp:default.log.0 + +# Rotated syslog files +/pool/ext/2115b084-be0f-4fba-941b-33a659798a9e/crypt/zone/oxz_ntp_a700528f-f600-4908-94ac-9c06442ef6b4/root/var/adm/messages.0 + +# Global zone: live syslog +/var/adm/messages + +# Global zone: rotated syslog +/var/adm/messages.0 +/var/adm/messages.1 +/var/adm/messages.2 +/var/adm/messages.3 + +# Global zone: live SMF logs +/var/svc/log/site-postboot:default.log +/var/svc/log/oxide-pumpkind:default.log +/var/svc/log/oxide-sled-agent:default.log + +# Global zone: rotated SMF logs +/var/svc/log/oxide-mg-ddm:default.log.0 +/var/svc/log/oxide-mg-ddm:default.log.1 +/var/svc/log/oxide-sled-agent:default.log.0 +/var/svc/log/oxide-sled-agent:default.log.1 +/var/svc/log/oxide-sled-agent:default.log.2 From 372db0ad65a709768b974f3e53f262390e787032 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Mon, 22 Dec 2025 09:14:48 -0800 Subject: [PATCH 2/2] fix link --- .../src/debug_collector/file_archiver/planning.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sled-agent/config-reconciler/src/debug_collector/file_archiver/planning.rs b/sled-agent/config-reconciler/src/debug_collector/file_archiver/planning.rs index 61752b7ab76..2c906c0f050 100644 --- a/sled-agent/config-reconciler/src/debug_collector/file_archiver/planning.rs +++ b/sled-agent/config-reconciler/src/debug_collector/file_archiver/planning.rs @@ -7,8 +7,8 @@ //! //! This system is designed so that as much possible is incorporated into the //! plan so that it can be tested in simulation without extensive dependency -//! injection. See also [https://mmapped.blog/posts/29-plan-execute](the -//! plan-execute pattern). +//! injection. See also [the plan-execute +//! pattern](https://mmapped.blog/posts/29-plan-execute). use super::execution::execute_archive_step; use super::filesystem::FileLister;