From 5e8ff4b3ad92b1bbd4f2f789f4f0f45b21ae08cf Mon Sep 17 00:00:00 2001 From: Joshua Klein Date: Mon, 1 Dec 2025 17:57:04 -0500 Subject: [PATCH] feature: support latin1 characters for identifiers in spectra and chromatograms for mzML --- Cargo.lock | 1 + Cargo.toml | 3 ++- src/io/mzml/reader.rs | 14 +++++++++++--- src/io/mzml/reading_shared.rs | 10 ++++++++-- 4 files changed, 22 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 34f84f7..012ee1c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2071,6 +2071,7 @@ dependencies = [ "chrono", "clap", "criterion", + "encoding_rs", "env_logger", "filename", "flate2", diff --git a/Cargo.toml b/Cargo.toml index bd2162a..c5eaa9b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -60,7 +60,7 @@ default = ["zlib-ng-compat", "mgf", "mzml"] checksum = ["dep:md5", "dep:sha1", "dep:base16ct"] mgf = [] -mzml = ["dep:quick-xml", "checksum", "dep:memchr"] +mzml = ["dep:quick-xml", "checksum", "dep:memchr", "dep:encoding_rs"] imzml = ["mzml", "dep:uuid"] # mzsignal's main functionality requires a linear algebra backend. @@ -201,6 +201,7 @@ pin-project-lite = { version = "0.2.16", optional = true } memchr = { version = "2.7.4", optional = true } libloading = { version = "0.8.6", optional = true } zstd = { version = "0.13.3", optional = true } +encoding_rs = { version = "0.8.35", optional = true } [dev-dependencies] diff --git a/src/io/mzml/reader.rs b/src/io/mzml/reader.rs index 87ff9b3..e16a3e5 100644 --- a/src/io/mzml/reader.rs +++ b/src/io/mzml/reader.rs @@ -738,7 +738,12 @@ impl match attr.key.as_ref() { b"id" => { - self.entry_id = match attr.unescape_value() { + self.entry_id = match attr.unescape_value() + .map(|v| v.to_string()) + .or_else(|_| -> Result { + log::warn!("Detected non-UTF8 character in spectrum id"); + Ok(quick_xml::escape::escape(encoding_rs::mem::decode_latin1(&attr.value).as_ref()).into()) + }) { Ok(value) => value.to_string(), Err(e) => { return Err(xml_error!( @@ -898,8 +903,11 @@ impl { self.entry_id = attr .unescape_value() - .expect("Error decoding id") - .to_string(); + .map(|v| v.to_string()) + .or_else(|_| -> Result { + log::warn!("Detected non-UTF8 character in chromatogram id"); + Ok(quick_xml::escape::escape(encoding_rs::mem::decode_latin1(&attr.value).as_ref()).into()) + }).unwrap(); trace!("Stored chromatogram id = {}", self.entry_id); } b"index" => { diff --git a/src/io/mzml/reading_shared.rs b/src/io/mzml/reading_shared.rs index dd3b077..e38bad3 100644 --- a/src/io/mzml/reading_shared.rs +++ b/src/io/mzml/reading_shared.rs @@ -461,8 +461,14 @@ impl IndexedMzMLIndexExtractor { if attr.key.as_ref() == b"idRef" { self.last_id = attr .unescape_value() - .expect("Error decoding idRef") - .to_string(); + .map(|v| v.to_string()) + .or_else(|_| -> Result { + log::warn!("Detected non-UTF8 character in idRef"); + Ok(quick_xml::escape::escape(encoding_rs::mem::decode_latin1(&attr.value).as_ref()).into()) + }) + .unwrap_or_else(|e| { + panic!("Error decoding idRef on offset {e} from bytes {:?}", attr.value) + }); } } Err(err) => {