diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 6ee2555..2572029 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -25,10 +25,10 @@ jobs: rust: [stable] task: - name: Test (default-tls) - run: cargo test --features build-binary,default-tls + run: cargo test --features build-binary,lzma,default-tls - name: Test (rustls-tls) - run: cargo test --features build-binary,rustls-tls + run: cargo test --features build-binary,lzma,rustls-tls include: - os: ubuntu-latest diff --git a/CHANGELOG.md b/CHANGELOG.md index 0cbbc6a..0f19a70 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased +### Added + +- Support `.tar.xz` and `.tar.lzma` archives + +### Fixed + +- Infer archive type from bytes instead of filename extension + ## [v0.7.0](https://github.com/epwalsh/rust-cached-path/releases/tag/v0.7.0) - 2025-05-14 ### Added diff --git a/Cargo.toml b/Cargo.toml index 7fd7dbd..bd7c368 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -41,12 +41,15 @@ indicatif = "0.16" env_logger = { version = "0.10", optional = true } structopt = { version = "0.3", optional = true } color-eyre = { version = "0.6", optional = true } +infer = "0.19.0" +lzma-rs = { version = "0.3", optional = true } [features] default = ["default-tls"] build-binary = ["env_logger", "structopt", "color-eyre"] rustls-tls = ["reqwest/rustls-tls"] default-tls = ["reqwest/default-tls"] +lzma = ["lzma-rs"] [dev-dependencies] httpmock = "0.7" diff --git a/src/archives.rs b/src/archives.rs index 3095a2a..1816ef1 100644 --- a/src/archives.rs +++ b/src/archives.rs @@ -1,24 +1,88 @@ use crate::error::Error; use flate2::read::GzDecoder; use std::fs::{self, File}; +use std::io::Read; use std::path::Path; use tempfile::tempdir_in; /// Supported archive types. pub(crate) enum ArchiveFormat { TarGz, + #[cfg(feature = "lzma")] + TarXz, + #[cfg(feature = "lzma")] + TarLzma, Zip, } +// see https://github.com/bojand/infer/issues/91 +#[allow(clippy::nonminimal_bool)] +fn is_lzma(buf: &[u8]) -> bool { + buf.len() > 4 + && buf[0] == 0x5D + && buf[1] == 0x00 + && buf[2] == 0x00 + && (buf[3] == 0x80 + || buf[3] == 0x01 + || buf[3] == 0x10 + || buf[3] == 0x08 + || buf[3] == 0x20 + || buf[3] == 0x40 + || buf[3] == 0x80 + || buf[3] == 0x00) + && (buf[4] == 0x00 || buf[4] == 0x01 || buf[4] == 0x02) +} + +fn infer() -> infer::Infer { + let mut infer = infer::Infer::new(); + infer.add("application/x-lzma", "lzma", is_lzma); + infer +} + impl ArchiveFormat { + fn is_tar(read: &mut R) -> bool { + let mut buf = [0; 262]; + read.read_exact(&mut buf) + .is_ok_and(|_| infer::archive::is_tar(&buf)) + } + /// Parse archive type from resource extension. - pub(crate) fn parse_from_extension(resource: &str) -> Result { - if resource.ends_with(".tar.gz") { - Ok(Self::TarGz) - } else if resource.ends_with(".zip") { - Ok(Self::Zip) + pub(crate) fn parse_from_extension(resource: &Path) -> Result { + if let Some(file_type) = infer().get_from_path(resource)? { + let archive_type = match file_type.mime_type() { + "application/gzip" if Self::is_tar(&mut GzDecoder::new(File::open(resource)?)) => { + Self::TarGz + } + #[cfg(feature = "lzma")] + "application/x-xz" + if Self::is_tar(&mut lzma::LzmaDecoder::new( + lzma::Codec::Xz, + File::open(resource)?, + )?) => + { + Self::TarXz + } + #[cfg(feature = "lzma")] + "application/x-lzma" + if Self::is_tar(&mut lzma::LzmaDecoder::new( + lzma::Codec::Lzma, + File::open(resource)?, + )?) => + { + Self::TarLzma + } + "application/zip" => Self::Zip, + tpe => { + return Err(Error::ExtractionError(format!( + "unsupported file format: {tpe}" + ))) + } + }; + Ok(archive_type) } else { - Err(Error::ExtractionError("unsupported archive format".into())) + Err(Error::ExtractionError( + "cannot determine archive file type".into(), + )) } } } @@ -39,6 +103,18 @@ pub(crate) fn extract_archive>( let mut archive = tar::Archive::new(tar); archive.unpack(&temp_target)?; } + #[cfg(feature = "lzma")] + ArchiveFormat::TarXz => { + let xz_decoder = lzma::LzmaDecoder::new(lzma::Codec::Xz, File::open(path)?)?; + let mut archive = tar::Archive::new(xz_decoder); + archive.unpack(&temp_target)?; + } + #[cfg(feature = "lzma")] + ArchiveFormat::TarLzma => { + let lzma_decoder = lzma::LzmaDecoder::new(lzma::Codec::Lzma, File::open(path)?)?; + let mut archive = tar::Archive::new(lzma_decoder); + archive.unpack(&temp_target)?; + } ArchiveFormat::Zip => { let file = File::open(path)?; let mut archive = @@ -54,3 +130,114 @@ pub(crate) fn extract_archive>( Ok(()) } + +#[cfg(feature = "lzma")] +mod lzma { + use std::io::Read; + use std::thread::JoinHandle; + + #[derive(Clone, Copy)] + pub(super) enum Codec { + Lzma, + Xz, + } + + impl std::fmt::Display for Codec { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Codec::Lzma => write!(f, "lzma"), + Codec::Xz => write!(f, "xz"), + } + } + } + + pub(super) struct LzmaDecoder { + codec: Codec, + decoder_handle: Option>>, + pipe_reader: std::io::PipeReader, + } + + impl LzmaDecoder { + pub(super) fn new( + codec: Codec, + reader: R, + ) -> std::io::Result { + let (pipe_reader, mut pipe_writer) = std::io::pipe()?; + let decoder_handle = std::thread::spawn(move || { + let mut reader = std::io::BufReader::new(reader); + match codec { + Codec::Lzma => lzma_rs::lzma_decompress(&mut reader, &mut pipe_writer), + Codec::Xz => lzma_rs::xz_decompress(&mut reader, &mut pipe_writer), + } + }); + Ok(Self { + codec, + decoder_handle: Some(decoder_handle), + pipe_reader, + }) + } + } + + impl Read for LzmaDecoder { + fn read(&mut self, buf: &mut [u8]) -> Result { + let size = self.pipe_reader.read(buf); + if let Some(handle) = self.decoder_handle.take_if(|h| h.is_finished()) { + handle + .join() + .map_err(|_| { + std::io::Error::other(format!( + "{} decompression thread panicked", + self.codec + )) + })? + .map_err(|e| { + std::io::Error::other(format!("{} decompression error: {e}", self.codec)) + })?; + } + // handle 0-byte read edge case + match size { + Ok(0) if self.decoder_handle.is_some() => { + // we read nothing, but the thread is still running, most likely a race condition, retry + self.read(buf) + } + other => other, + } + } + } + + #[cfg(test)] + mod test { + + use super::*; + + #[test] + #[should_panic(expected = "xz decompression error")] + fn test_xz_decoder_empty() { + let mut decoder = LzmaDecoder::new(Codec::Xz, std::io::empty()).unwrap(); + std::io::copy(&mut decoder, &mut Vec::new()).unwrap(); + } + + #[test] + #[should_panic(expected = "xz decompression error")] + fn test_xz_decoder_bad() { + let bad: &[u8] = &[0x42u8; 1024]; + let mut decoder = LzmaDecoder::new(Codec::Xz, bad).unwrap(); + std::io::copy(&mut decoder, &mut Vec::new()).unwrap(); + } + + #[test] + #[should_panic(expected = "lzma decompression error")] + fn test_lzma_decoder_empty() { + let mut decoder = LzmaDecoder::new(Codec::Lzma, std::io::empty()).unwrap(); + std::io::copy(&mut decoder, &mut Vec::new()).unwrap(); + } + + #[test] + #[should_panic(expected = "lzma decompression error")] + fn test_lzma_decoder_bad() { + let bad: &[u8] = &[0x42u8; 1024]; + let mut decoder = LzmaDecoder::new(Codec::Lzma, bad).unwrap(); + std::io::copy(&mut decoder, &mut Vec::new()).unwrap(); + } + } +} diff --git a/src/cache.rs b/src/cache.rs index b667e00..ca260c9 100644 --- a/src/cache.rs +++ b/src/cache.rs @@ -330,7 +330,7 @@ impl Cache { if !dirpath.is_dir() { info!("Extracting {} to {:?}", resource, dirpath); - let format = ArchiveFormat::parse_from_extension(resource)?; + let format = ArchiveFormat::parse_from_extension(&cached_path)?; extract_archive(&cached_path, &dirpath, &format)?; } diff --git a/src/test.rs b/src/test.rs index 24e40b2..3d9bb53 100644 --- a/src/test.rs +++ b/src/test.rs @@ -193,8 +193,7 @@ fn test_cached_path_remote_file_in_subdir() { assert!(Meta::meta_path(&path).is_file()); } -#[test] -fn test_extract_tar_gz() { +fn assert_extract_archive(filename: &str) { let cache_dir = tempdir().unwrap(); let cache = Cache::builder() .dir(cache_dir.path().to_owned()) @@ -202,15 +201,9 @@ fn test_extract_tar_gz() { .build() .unwrap(); - let resource: PathBuf = [ - ".", - "test_fixtures", - "utf-8_sample", - "archives", - "utf-8.tar.gz", - ] - .iter() - .collect(); + let resource: PathBuf = [".", "test_fixtures", "utf-8_sample", "archives", filename] + .iter() + .collect(); let path = cache .cached_path_with_options(resource.to_str().unwrap(), &Options::default().extract()) @@ -226,35 +219,25 @@ fn test_extract_tar_gz() { } #[test] -fn test_extract_zip() { - let cache_dir = tempdir().unwrap(); - let cache = Cache::builder() - .dir(cache_dir.path().to_owned()) - .progress_bar(None) - .build() - .unwrap(); +fn test_extract_tar_gz() { + assert_extract_archive("utf-8.tar.gz"); +} - let resource: PathBuf = [ - ".", - "test_fixtures", - "utf-8_sample", - "archives", - "utf-8.zip", - ] - .iter() - .collect(); +#[cfg(feature = "lzma")] +#[test] +fn test_extract_tar_xz() { + assert_extract_archive("utf-8.tar.xz"); +} - let path = cache - .cached_path_with_options(resource.to_str().unwrap(), &Options::default().extract()) - .unwrap(); - assert!(path.is_dir()); - assert!(path.to_str().unwrap().ends_with("-extracted")); - assert!(path - .to_str() - .unwrap() - .starts_with(cache_dir.path().to_str().unwrap())); - let sample_file_path = path.join("dummy.txt"); - assert!(sample_file_path.is_file()); +#[cfg(feature = "lzma")] +#[test] +fn test_extract_tar_lzma() { + assert_extract_archive("utf-8.tar.lzma"); +} + +#[test] +fn test_extract_zip() { + assert_extract_archive("utf-8.zip"); } #[test] diff --git a/test_fixtures/utf-8_sample/archives/utf-8.tar.xz b/test_fixtures/utf-8_sample/archives/utf-8.tar.xz new file mode 100644 index 0000000..9db8100 Binary files /dev/null and b/test_fixtures/utf-8_sample/archives/utf-8.tar.xz differ