From a90fd25656a92f64046616b0355c140ea72c98b4 Mon Sep 17 00:00:00 2001 From: gpas-github-bot Date: Tue, 9 Sep 2025 14:09:54 +0100 Subject: [PATCH 1/7] feat: functional local versions of filtering --- .gitignore | 3 + Cargo.lock | 1264 +++++++++++++++++++++++++++++++++- Cargo.toml | 8 + src/filter.rs | 1111 +----------------------------- src/filter_common.rs | 344 ++++++++++ src/index.rs | 124 ++-- src/lib.rs | 33 +- src/local_filter.rs | 874 +++++++++++++++++++++++ src/main.rs | 3 +- src/minimizers.rs | 1 - src/remote_filter.rs | 1561 ++++++++++++++++++++++++++++++++++++++++++ src/server.rs | 136 ++++ src/server_common.rs | 81 +++ 13 files changed, 4378 insertions(+), 1165 deletions(-) create mode 100644 src/filter_common.rs create mode 100644 src/local_filter.rs create mode 100644 src/remote_filter.rs create mode 100644 src/server.rs create mode 100644 src/server_common.rs diff --git a/.gitignore b/.gitignore index 3e8f542..5db8dd0 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,6 @@ /data .DS_Store *DE.md + +*.fastq +*.fastq.gz \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index 9fb825b..71e833b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,15 @@ # It is not intended for manual editing. version = 4 +[[package]] +name = "addr2line" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfbe277e56a376000877090da837660b4427aad530e3028d44e0bffe4f89a1c1" +dependencies = [ + "gimli", +] + [[package]] name = "adler2" version = "2.0.1" @@ -89,12 +98,93 @@ dependencies = [ "wait-timeout", ] +[[package]] +name = "atomic-waker" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" + [[package]] name = "autocfg" version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" +[[package]] +name = "axum" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "021e862c184ae977658b36c4500f7feac3221ca5da43e3f25bd04ab6c79a29b5" +dependencies = [ + "axum-core", + "bytes", + "form_urlencoded", + "futures-util", + "http", + "http-body", + "http-body-util", + "hyper", + "hyper-util", + "itoa", + "matchit", + "memchr", + "mime", + "percent-encoding", + "pin-project-lite", + "rustversion", + "serde", + "serde_json", + "serde_path_to_error", + "serde_urlencoded", + "sync_wrapper", + "tokio", + "tower", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "axum-core" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68464cd0412f486726fb3373129ef5d2993f90c34bc2bc1c1e9943b2f4fc7ca6" +dependencies = [ + "bytes", + "futures-core", + "http", + "http-body", + "http-body-util", + "mime", + "pin-project-lite", + "rustversion", + "sync_wrapper", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "backtrace" +version = "0.3.75" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6806a6321ec58106fea15becdad98371e28d92ccbc7c8f1b3b6dd724fe8f1002" +dependencies = [ + "addr2line", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", + "windows-targets 0.52.6", +] + +[[package]] +name = "base64" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + [[package]] name 
= "bgzip" version = "0.3.1" @@ -171,6 +261,12 @@ version = "1.23.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3995eaeebcdf32f91f980d360f78732ddc061097ab4e39991ae7a6ace9194677" +[[package]] +name = "bytes" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" + [[package]] name = "bzip2" version = "0.4.4" @@ -276,6 +372,22 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "core-foundation" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "core-foundation-sys" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" + [[package]] name = "crc32fast" version = "1.5.0" @@ -325,6 +437,7 @@ version = "0.10.0" dependencies = [ "anyhow", "assert_cmd", + "axum", "bincode", "clap", "flate2", @@ -337,6 +450,7 @@ dependencies = [ "parking_lot", "predicates", "rayon", + "reqwest", "rstest", "rustc-hash", "serde", @@ -344,6 +458,8 @@ dependencies = [ "simd-minimizers", "tempfile", "thiserror 2.0.14", + "tokio", + "tracing-subscriber", "xxhash-rust", "zstd", ] @@ -354,6 +470,17 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6184e33543162437515c2e2b48714794e37845ec9851711914eec9d308f6ebe8" +[[package]] +name = "displaydoc" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "doc-comment" version = "0.3.3" @@ -372,6 +499,15 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" +[[package]] +name = "encoding_rs" +version = "0.8.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" +dependencies = [ + "cfg-if", +] + [[package]] name = "equivalent" version = "1.0.2" @@ -414,12 +550,58 @@ dependencies = [ "num-traits", ] +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + +[[package]] +name = "form_urlencoded" +version = "1.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb4cb245038516f5f85277875cdaa4f7d2c9a0fa0468de06ed190163b1581fcf" +dependencies = [ + "percent-encoding", +] + +[[package]] +name = "futures-channel" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" +dependencies = 
[ + "futures-core", + "futures-sink", +] + [[package]] name = "futures-core" version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" +[[package]] +name = "futures-io" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" + [[package]] name = "futures-macro" version = "0.3.31" @@ -431,6 +613,12 @@ dependencies = [ "syn", ] +[[package]] +name = "futures-sink" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" + [[package]] name = "futures-task" version = "0.3.31" @@ -450,13 +638,27 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" dependencies = [ "futures-core", + "futures-io", "futures-macro", + "futures-sink", "futures-task", + "memchr", "pin-project-lite", "pin-utils", "slab", ] +[[package]] +name = "getrandom" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592" +dependencies = [ + "cfg-if", + "libc", + "wasi 0.11.1+wasi-snapshot-preview1", +] + [[package]] name = "getrandom" version = "0.3.3" @@ -466,15 +668,40 @@ dependencies = [ "cfg-if", "libc", "r-efi", - "wasi", + "wasi 0.14.2+wasi-0.2.4", ] +[[package]] +name = "gimli" +version = "0.31.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" + [[package]] name = "glob" version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" +[[package]] +name = "h2" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3c0b69cfcb4e1b9f1bf2f53f95f766e4661169728ec61cd3fe5a0166f2d1386" +dependencies = [ + "atomic-waker", + "bytes", + "fnv", + "futures-core", + "futures-sink", + "http", + "indexmap", + "slab", + "tokio", + "tokio-util", + "tracing", +] + [[package]] name = "hashbrown" version = "0.15.5" @@ -487,6 +714,240 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "http" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4a85d31aea989eead29a3aaf9e1115a180df8282431156e533de47660892565" +dependencies = [ + "bytes", + "fnv", + "itoa", +] + +[[package]] +name = "http-body" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" +dependencies = [ + "bytes", + "http", +] + +[[package]] +name = "http-body-util" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a" +dependencies = [ + "bytes", + "futures-core", + "http", + "http-body", + "pin-project-lite", +] + +[[package]] +name = "httparse" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" + +[[package]] +name = "httpdate" 
+version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" + +[[package]] +name = "hyper" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb3aa54a13a0dfe7fbe3a59e0c76093041720fdc77b110cc0fc260fafb4dc51e" +dependencies = [ + "atomic-waker", + "bytes", + "futures-channel", + "futures-core", + "h2", + "http", + "http-body", + "httparse", + "httpdate", + "itoa", + "pin-project-lite", + "pin-utils", + "smallvec", + "tokio", + "want", +] + +[[package]] +name = "hyper-rustls" +version = "0.27.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3c93eb611681b207e1fe55d5a71ecf91572ec8a6705cdb6857f7d8d5242cf58" +dependencies = [ + "http", + "hyper", + "hyper-util", + "rustls", + "rustls-pki-types", + "tokio", + "tokio-rustls", + "tower-service", +] + +[[package]] +name = "hyper-tls" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0" +dependencies = [ + "bytes", + "http-body-util", + "hyper", + "hyper-util", + "native-tls", + "tokio", + "tokio-native-tls", + "tower-service", +] + +[[package]] +name = "hyper-util" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d9b05277c7e8da2c93a568989bb6207bef0112e8d17df7a6eda4a3cf143bc5e" +dependencies = [ + "base64", + "bytes", + "futures-channel", + "futures-core", + "futures-util", + "http", + "http-body", + "hyper", + "ipnet", + "libc", + "percent-encoding", + "pin-project-lite", + "socket2", + "system-configuration", + "tokio", + "tower-service", + "tracing", + "windows-registry", +] + +[[package]] +name = "icu_collections" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "200072f5d0e3614556f94a9930d5dc3e0662a652823904c3a75dc3b0af7fee47" +dependencies = [ + "displaydoc", + "potential_utf", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_locale_core" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cde2700ccaed3872079a65fb1a78f6c0a36c91570f28755dda67bc8f7d9f00a" +dependencies = [ + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_normalizer" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "436880e8e18df4d7bbc06d58432329d6458cc84531f7ac5f024e93deadb37979" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_normalizer_data", + "icu_properties", + "icu_provider", + "smallvec", + "zerovec", +] + +[[package]] +name = "icu_normalizer_data" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00210d6893afc98edb752b664b8890f0ef174c8adbb8d0be9710fa66fbbf72d3" + +[[package]] +name = "icu_properties" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "016c619c1eeb94efb86809b015c58f479963de65bdb6253345c1a1276f22e32b" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_locale_core", + "icu_properties_data", + "icu_provider", + "potential_utf", + "zerotrie", + "zerovec", +] + +[[package]] +name = "icu_properties_data" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "298459143998310acd25ffe6810ed544932242d3f07083eee1084d83a71bd632" + +[[package]] +name = 
"icu_provider" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03c80da27b5f4187909049ee2d72f276f0d9f99a42c306bd0131ecfe04d8e5af" +dependencies = [ + "displaydoc", + "icu_locale_core", + "stable_deref_trait", + "tinystr", + "writeable", + "yoke", + "zerofrom", + "zerotrie", + "zerovec", +] + +[[package]] +name = "idna" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b0875f23caa03898994f6ddc501886a45c7d3d62d04d2d90788d47be1b1e4de" +dependencies = [ + "idna_adapter", + "smallvec", + "utf8_iter", +] + +[[package]] +name = "idna_adapter" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3acae9609540aa318d1bc588455225fb2085b9ed0c4f6bd0d9d5bcd86f1a0344" +dependencies = [ + "icu_normalizer", + "icu_properties", +] + [[package]] name = "indexmap" version = "2.10.0" @@ -510,6 +971,33 @@ dependencies = [ "web-time", ] +[[package]] +name = "io-uring" +version = "0.7.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "046fa2d4d00aea763528b4950358d0ead425372445dc8ff86312b3c69ff7727b" +dependencies = [ + "bitflags", + "cfg-if", + "libc", +] + +[[package]] +name = "ipnet" +version = "2.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130" + +[[package]] +name = "iri-string" +version = "0.7.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbc5ebe9c3a1a7a5127f920a418f7585e9e758e911d0466ed004f393b0e380b2" +dependencies = [ + "memchr", + "serde", +] + [[package]] name = "is_terminal_polyfill" version = "1.70.1" @@ -537,7 +1025,7 @@ version = "0.1.33" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "38f262f097c174adebe41eb73d66ae9c06b2844fb0da69969647bbddd9b0538a" dependencies = [ - "getrandom", + "getrandom 0.3.3", "libc", ] @@ -551,6 +1039,12 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + [[package]] name = "libc" version = "0.2.175" @@ -592,6 +1086,12 @@ version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cd945864f07fe9f5371a27ad7b52a172b4b499999f1d97574c9fa68373937e12" +[[package]] +name = "litemap" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "241eaef5fd12c88705a01fc1066c48c4b36e0dd4377dcdc7ec3942cea7a69956" + [[package]] name = "lock_api" version = "0.4.13" @@ -608,6 +1108,12 @@ version = "0.4.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" +[[package]] +name = "matchit" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47e1ffaa40ddd1f3ed91f717a33c8c0ee23fff369e3aa8772b9605cc1d22f4c3" + [[package]] name = "mem_dbg" version = "0.3.0" @@ -635,6 +1141,12 @@ version = "2.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0" +[[package]] +name = "mime" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" + [[package]] name = "miniz_oxide" version = 
"0.8.9" @@ -645,10 +1157,38 @@ dependencies = [ ] [[package]] -name = "needletail" -version = "0.6.3" +name = "mio" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6aa22e1ae8bce4ecf257e2475ef2046026caea08d66b1848d073fe7bc77e4351" +checksum = "78bed444cc8a2160f01cbcf811ef18cac863ad68ae8ca62092e8db51d51c761c" +dependencies = [ + "libc", + "wasi 0.11.1+wasi-snapshot-preview1", + "windows-sys 0.59.0", +] + +[[package]] +name = "native-tls" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87de3442987e9dbec73158d5c715e7ad9072fda936bb03d19d7fa10e00520f0e" +dependencies = [ + "libc", + "log", + "openssl", + "openssl-probe", + "openssl-sys", + "schannel", + "security-framework", + "security-framework-sys", + "tempfile", +] + +[[package]] +name = "needletail" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6aa22e1ae8bce4ecf257e2475ef2046026caea08d66b1848d073fe7bc77e4351" dependencies = [ "buffer-redux", "bytecount", @@ -680,6 +1220,15 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "61807f77802ff30975e01f4f071c8ba10c022052f98b3294119f3e615d13e5be" +[[package]] +name = "nu-ansi-term" +version = "0.50.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4a28e057d01f97e61255210fcff094d74ed0466038633e95017f5beb68e4399" +dependencies = [ + "windows-sys 0.52.0", +] + [[package]] name = "num-traits" version = "0.2.19" @@ -695,6 +1244,15 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" +[[package]] +name = "object" +version = "0.36.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62948e14d923ea95ea2c7c86c71013138b66525b86bdc08d2dcc262bdb497b87" +dependencies = [ + "memchr", +] + [[package]] name = "once_cell" version = "1.21.3" @@ -707,6 +1265,50 @@ version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad" +[[package]] +name = "openssl" +version = "0.10.73" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8505734d46c8ab1e19a1dce3aef597ad87dcb4c37e7188231769bd6bd51cebf8" +dependencies = [ + "bitflags", + "cfg-if", + "foreign-types", + "libc", + "once_cell", + "openssl-macros", + "openssl-sys", +] + +[[package]] +name = "openssl-macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "openssl-probe" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e" + +[[package]] +name = "openssl-sys" +version = "0.9.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90096e2e47630d78b7d1c20952dc621f957103f8bc2c8359ec81290d75238571" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + [[package]] name = "packed-seq" version = "3.2.1" @@ -757,6 +1359,12 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "percent-encoding" +version = "2.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" + [[package]] name = "pin-project-lite" version = "0.2.16" @@ -781,6 +1389,15 @@ version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f84267b20a16ea918e43c6a88433c2d54fa145c92a811b5b047ccbe153674483" +[[package]] +name = "potential_utf" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "84df19adbe5b5a0782edcab45899906947ab039ccf4573713735ee7de1e6b08a" +dependencies = [ + "zerovec", +] + [[package]] name = "ppv-lite86" version = "0.2.21" @@ -879,7 +1496,7 @@ version = "0.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38" dependencies = [ - "getrandom", + "getrandom 0.3.3", ] [[package]] @@ -946,6 +1563,62 @@ version = "1.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ba39f3699c378cd8970968dcbff9c43159ea4cfbd88d43c00b22f2ef10a435d2" +[[package]] +name = "reqwest" +version = "0.12.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d429f34c8092b2d42c7c93cec323bb4adeb7c67698f70839adec842ec10c7ceb" +dependencies = [ + "base64", + "bytes", + "encoding_rs", + "futures-channel", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "http-body-util", + "hyper", + "hyper-rustls", + "hyper-tls", + "hyper-util", + "js-sys", + "log", + "mime", + "native-tls", + "percent-encoding", + "pin-project-lite", + "rustls-pki-types", + "serde", + "serde_json", + "serde_urlencoded", + "sync_wrapper", + "tokio", + "tokio-native-tls", + "tower", + "tower-http", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + +[[package]] +name = "ring" +version = "0.17.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7" +dependencies = [ + "cc", + "cfg-if", + "getrandom 0.2.16", + "libc", + "untrusted", + "windows-sys 0.52.0", +] + [[package]] name = "rstest" version = "0.25.0" @@ -976,6 +1649,12 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "rustc-demangle" +version = "0.1.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56f7d92ca342cea22a06f2121d944b4fd82af56988c270852495420f961d4ace" + [[package]] name = "rustc-hash" version = "2.1.1" @@ -1004,6 +1683,45 @@ dependencies = [ "windows-sys 0.60.2", ] +[[package]] +name = "rustls" +version = "0.23.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0ebcbd2f03de0fc1122ad9bb24b127a5a6cd51d72604a3f3c50ac459762b6cc" +dependencies = [ + "once_cell", + "rustls-pki-types", + "rustls-webpki", + "subtle", + "zeroize", +] + +[[package]] +name = "rustls-pki-types" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "229a4a4c221013e7e1f1a043678c5cc39fe5171437c88fb47151a21e6f5b5c79" +dependencies = [ + "zeroize", +] + +[[package]] +name = "rustls-webpki" +version = "0.103.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a17884ae0c1b773f1ccd2bd4a8c72f16da897310a98b0e84bf349ad5ead92fc" +dependencies = [ + "ring", + "rustls-pki-types", + "untrusted", +] + +[[package]] +name = "rustversion" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" + [[package]] name = 
"ryu" version = "1.0.20" @@ -1019,12 +1737,44 @@ dependencies = [ "bytemuck", ] +[[package]] +name = "schannel" +version = "0.1.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f29ebaa345f945cec9fbbc532eb307f0fdad8161f281b6369539c8d84876b3d" +dependencies = [ + "windows-sys 0.59.0", +] + [[package]] name = "scopeguard" version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" +[[package]] +name = "security-framework" +version = "2.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" +dependencies = [ + "bitflags", + "core-foundation", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + +[[package]] +name = "security-framework-sys" +version = "2.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49db231d56a190491cb4aeda9527f1ad45345af50b0851622a7adb8c03b01c32" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "semver" version = "1.0.26" @@ -1063,12 +1813,52 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_path_to_error" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59fab13f937fa393d08645bf3a84bdfe86e296747b506ada67bb15f10f218b2a" +dependencies = [ + "itoa", + "serde", +] + +[[package]] +name = "serde_urlencoded" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" +dependencies = [ + "form_urlencoded", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "sharded-slab" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6" +dependencies = [ + "lazy_static", +] + [[package]] name = "shlex" version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" +[[package]] +name = "signal-hook-registry" +version = "1.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2a4719bff48cee6b39d12c020eeb490953ad2443b7055bd0b21fca26bd8c28b" +dependencies = [ + "libc", +] + [[package]] name = "simd-minimizers" version = "1.3.0" @@ -1092,12 +1882,34 @@ version = "1.15.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" +[[package]] +name = "socket2" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "233504af464074f9d066d7b5416c5f9b894a5862a6506e306f7b816cdd6f1807" +dependencies = [ + "libc", + "windows-sys 0.59.0", +] + +[[package]] +name = "stable_deref_trait" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" + [[package]] name = "strsim" version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" +[[package]] +name = "subtle" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" + [[package]] name = "syn" version = "2.0.105" @@ -1109,6 
+1921,47 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "sync_wrapper" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263" +dependencies = [ + "futures-core", +] + +[[package]] +name = "synstructure" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "system-configuration" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b" +dependencies = [ + "bitflags", + "core-foundation", + "system-configuration-sys", +] + +[[package]] +name = "system-configuration-sys" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e1d1b10ced5ca923a1fcb8d03e96b8d3268065d724548c0211415ff6ac6bac4" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "tempfile" version = "3.20.0" @@ -1116,7 +1969,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e8a64e3985349f2441a1a9ef0b853f869006c3855f2cda6862a94d26ebb9d6a1" dependencies = [ "fastrand", - "getrandom", + "getrandom 0.3.3", "once_cell", "rustix", "windows-sys 0.59.0", @@ -1168,6 +2021,25 @@ dependencies = [ "syn", ] +[[package]] +name = "thread_local" +version = "1.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f60246a4944f24f6e018aa17cdeffb7818b76356965d03b07d6a9886e8962185" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "tinystr" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d4f6d1145dcb577acf783d4e601bc1d76a13337bb54e6233add580b07344c8b" +dependencies = [ + "displaydoc", + "zerovec", +] + [[package]] name = "tinyvec" version = "1.9.0" @@ -1183,6 +2055,70 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" +[[package]] +name = "tokio" +version = "1.47.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89e49afdadebb872d3145a5638b59eb0691ea23e46ca484037cfab3b76b95038" +dependencies = [ + "backtrace", + "bytes", + "io-uring", + "libc", + "mio", + "parking_lot", + "pin-project-lite", + "signal-hook-registry", + "slab", + "socket2", + "tokio-macros", + "windows-sys 0.59.0", +] + +[[package]] +name = "tokio-macros" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tokio-native-tls" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" +dependencies = [ + "native-tls", + "tokio", +] + +[[package]] +name = "tokio-rustls" +version = "0.26.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e727b36a1a0e8b74c376ac2211e40c2c8af09fb4013c60d910495810f008e9b" +dependencies = [ + "rustls", + "tokio", +] + +[[package]] +name = "tokio-util" +version = "0.7.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"14307c986784f72ef81c89db7d9e28d6ac26d16213b109ea501696195e6e3ce5" +dependencies = [ + "bytes", + "futures-core", + "futures-sink", + "pin-project-lite", + "tokio", +] + [[package]] name = "toml_datetime" version = "0.6.11" @@ -1200,6 +2136,117 @@ dependencies = [ "winnow", ] +[[package]] +name = "tower" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9" +dependencies = [ + "futures-core", + "futures-util", + "pin-project-lite", + "sync_wrapper", + "tokio", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tower-http" +version = "0.6.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adc82fd73de2a9722ac5da747f12383d2bfdb93591ee6c58486e0097890f05f2" +dependencies = [ + "bitflags", + "bytes", + "futures-util", + "http", + "http-body", + "iri-string", + "pin-project-lite", + "tower", + "tower-layer", + "tower-service", +] + +[[package]] +name = "tower-layer" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" + +[[package]] +name = "tower-service" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" + +[[package]] +name = "tracing" +version = "0.1.41" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0" +dependencies = [ + "log", + "pin-project-lite", + "tracing-core", +] + +[[package]] +name = "tracing-core" +version = "0.1.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9d12581f227e93f094d3af2ae690a574abb8a2b9b7a96e7cfe9647b2b617678" +dependencies = [ + "once_cell", + "valuable", +] + +[[package]] +name = "tracing-log" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3" +dependencies = [ + "log", + "once_cell", + "tracing-core", +] + +[[package]] +name = "tracing-serde" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "704b1aeb7be0d0a84fc9828cae51dab5970fee5088f83d1dd7ee6f6246fc6ff1" +dependencies = [ + "serde", + "tracing-core", +] + +[[package]] +name = "tracing-subscriber" +version = "0.3.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2054a14f5307d601f88daf0553e1cbf472acc4f2c51afab632431cdcd72124d5" +dependencies = [ + "nu-ansi-term", + "serde", + "serde_json", + "sharded-slab", + "smallvec", + "thread_local", + "tracing-core", + "tracing-log", + "tracing-serde", +] + +[[package]] +name = "try-lock" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" + [[package]] name = "unicode-ident" version = "1.0.18" @@ -1212,18 +2259,54 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4a1a07cc7db3810833284e8d372ccdc6da29741639ecc70c9ec107df0fa6154c" +[[package]] +name = "untrusted" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" + [[package]] name = "unty" version = "0.0.4" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "6d49784317cd0d1ee7ec5c716dd598ec5b4483ea832a2dced265471cc0f690ae" +[[package]] +name = "url" +version = "2.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08bc136a29a3d1758e07a9cca267be308aeebf5cfd5a10f3f67ab2097683ef5b" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", + "serde", +] + +[[package]] +name = "utf8_iter" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" + [[package]] name = "utf8parse" version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" +[[package]] +name = "valuable" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" + +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + [[package]] name = "virtue" version = "0.0.18" @@ -1239,6 +2322,21 @@ dependencies = [ "libc", ] +[[package]] +name = "want" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" +dependencies = [ + "try-lock", +] + +[[package]] +name = "wasi" +version = "0.11.1+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" + [[package]] name = "wasi" version = "0.14.2+wasi-0.2.4" @@ -1256,6 +2354,7 @@ checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5" dependencies = [ "cfg-if", "once_cell", + "rustversion", "wasm-bindgen-macro", ] @@ -1273,6 +2372,19 @@ dependencies = [ "wasm-bindgen-shared", ] +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.50" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "555d470ec0bc3bb57890405e5d4322cc9ea83cebb085523ced7be4144dac1e61" +dependencies = [ + "cfg-if", + "js-sys", + "once_cell", + "wasm-bindgen", + "web-sys", +] + [[package]] name = "wasm-bindgen-macro" version = "0.2.100" @@ -1305,6 +2417,16 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "web-sys" +version = "0.3.77" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33b6dd2ef9186f1f2072e409e99cd22a975331a6b3591b12c764e0e55c60d5d2" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + [[package]] name = "web-time" version = "1.1.0" @@ -1331,6 +2453,44 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e6ad25900d524eaabdbbb96d20b4311e1e7ae1699af4fb28c17ae66c80d798a" +[[package]] +name = "windows-registry" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b8a9ed28765efc97bbc954883f4e6796c33a06546ebafacbabee9696967499e" +dependencies = [ + "windows-link", + "windows-result", + "windows-strings", +] + +[[package]] +name = "windows-result" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56f42bd332cc6c8eac5af113fc0c1fd6a8fd2aa08a0119358686e5160d0586c6" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-strings" +version = "0.4.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "56e6c93f3a0c3b36176cb1327a4958a0353d5d166c2a35cb268ace15e91d3b57" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.6", +] + [[package]] name = "windows-sys" version = "0.59.0" @@ -1496,12 +2656,42 @@ dependencies = [ "bitflags", ] +[[package]] +name = "writeable" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea2f10b9bb0928dfb1b42b65e1f9e36f7f54dbdf08457afefb38afcdec4fa2bb" + [[package]] name = "xxhash-rust" version = "0.8.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3" +[[package]] +name = "yoke" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f41bb01b8226ef4bfd589436a297c53d118f65921786300e427be8d487695cc" +dependencies = [ + "serde", + "stable_deref_trait", + "yoke-derive", + "zerofrom", +] + +[[package]] +name = "yoke-derive" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38da3c9736e16c5d3c8c597a9aaa5d1fa565d0532ae05e27c24aa62fb32c0ab6" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + [[package]] name = "zerocopy" version = "0.8.26" @@ -1522,6 +2712,66 @@ dependencies = [ "syn", ] +[[package]] +name = "zerofrom" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5" +dependencies = [ + "zerofrom-derive", +] + +[[package]] +name = "zerofrom-derive" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + +[[package]] +name = "zeroize" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde" + +[[package]] +name = "zerotrie" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36f0bbd478583f79edad978b407914f61b2972f5af6fa089686016be8f9af595" +dependencies = [ + "displaydoc", + "yoke", + "zerofrom", +] + +[[package]] +name = "zerovec" +version = "0.11.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7aa2bd55086f1ab526693ecbe444205da57e25f4489879da80635a46d90e73b" +dependencies = [ + "yoke", + "zerofrom", + "zerovec-derive", +] + +[[package]] +name = "zerovec-derive" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b96237efa0c878c64bd89c436f661be4e46b2f3eff1ebb976f7ef2321d2f58f" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "zlib-rs" version = "0.5.1" diff --git a/Cargo.toml b/Cargo.toml index d496c89..89fbbc4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -35,6 +35,14 @@ parking_lot = "0.12" zstd = "0.13" indicatif = "0.17" liblzma = "0.3.1" +tokio = { version = "1.47.0", features = ["full"], optional = true } +axum = { version = "0.8.4", optional = true } +tracing-subscriber = { version = "0.3.19", features = ["fmt", "registry", "json"], optional = true } +reqwest 
= { version = "0.12.22", features = ["blocking", "json"], optional = true } + +[features] +# default = ["server"] +server = ["dep:tokio", "dep:axum", "dep:tracing-subscriber", "dep:reqwest"] [lints.clippy] too_many_arguments = "allow" diff --git a/src/filter.rs b/src/filter.rs index e95e378..ba971b9 100644 --- a/src/filter.rs +++ b/src/filter.rs @@ -1,1107 +1,4 @@ -use crate::{FilterConfig, index::load_minimizer_hashes}; -use anyhow::{Context, Result}; -use flate2::write::GzEncoder; -use indicatif::{ProgressBar, ProgressDrawTarget, ProgressStyle}; -use liblzma::write::XzEncoder; -use packed_seq::SeqVec; -use paraseq::Record; -use paraseq::fastx::Reader; -use paraseq::parallel::{ - InterleavedParallelProcessor, InterleavedParallelReader, PairedParallelProcessor, - PairedParallelReader, ParallelProcessor, ParallelReader, -}; -use parking_lot::Mutex; -use rustc_hash::FxHashSet; -use serde::{Deserialize, Serialize}; -use simd_minimizers; -use std::fs::{File, OpenOptions}; -use std::io::{self, BufWriter, Write}; -use std::sync::Arc; -use std::time::Instant; -use xxhash_rust; -use zstd::stream::write::Encoder as ZstdEncoder; - -const OUTPUT_BUFFER_SIZE: usize = 8 * 1024 * 1024; // Opt: 8MB output buffer -const DEFAULT_BUFFER_SIZE: usize = 64 * 1024; - -type BoxedWriter = Box; - -/// Config for FilterProcessor -struct FilterProcessorConfig { - abs_threshold: usize, - rel_threshold: f64, - prefix_length: usize, - deplete: bool, - rename: bool, - debug: bool, -} - -/// Create a paraseq reader from optional path (stdin if None or "-") -fn create_paraseq_reader(path: Option<&str>) -> Result>> { - match path { - None | Some("-") => { - let stdin_reader = Box::new(std::io::stdin()) as Box; - Reader::new(stdin_reader) - .map_err(|e| anyhow::anyhow!("Failed to create stdin reader: {}", e)) - } - Some(p) => { - let (reader, _format) = niffler::send::from_path(p) - .map_err(|e| anyhow::anyhow!("Failed to open file {}: {}", p, e))?; - Reader::new(reader) - .map_err(|e| anyhow::anyhow!("Failed to create reader for {}: {}", p, e)) - } - } -} - -/// Format a single record into a buffer (FASTA/FASTQ format) -/// -/// `seq` is the newline-free sequence corresponding to the record, obtained from `record.seq()`. -fn format_record_to_buffer( - record: &R, - seq: &[u8], - counter: u64, - rename: bool, - buffer: &mut Vec, -) -> Result<()> { - let is_fasta = record.qual().is_none(); - - // Header line - buffer.write_all(if is_fasta { b">" } else { b"@" })?; - if rename { - buffer.extend_from_slice(counter.to_string().as_bytes()); - } else { - buffer.extend_from_slice(record.id()); - } - buffer.write_all(b"\n")?; - - // Sequence line - buffer.extend_from_slice(seq); - - if is_fasta { - buffer.write_all(b"\n")?; - } else { - // FASTQ: plus line and quality - buffer.write_all(b"\n+\n")?; - if let Some(qual) = record.qual() { - buffer.extend_from_slice(qual); - } - buffer.write_all(b"\n")?; - } - Ok(()) -} - -/// Validate compression level for the given format -fn validate_compression_level(level: u8, min: u8, max: u8, format: &str) -> Result<()> { - if level < min || level > max { - Err(anyhow::anyhow!( - "Invalid {} compression level {}. 
Must be between {} and {}.", - format, - level, - min, - max - )) - } else { - Ok(()) - } -} - -// Return a file writer appropriate for the output path extension -fn get_writer(output_path: &str, compression_level: u8) -> Result { - if output_path == "-" { - return Ok(Box::new(BufWriter::with_capacity( - OUTPUT_BUFFER_SIZE, - io::stdout(), - ))); - } - - let file = OpenOptions::new() - .write(true) - .create(true) - .truncate(true) - .open(output_path) - .context(format!("Failed to create output file: {}", output_path))?; - - let buffered_file = BufWriter::with_capacity(OUTPUT_BUFFER_SIZE, file); - - match output_path { - p if p.ends_with(".gz") => { - validate_compression_level(compression_level, 1, 9, "gzip")?; - Ok(Box::new(GzEncoder::new( - buffered_file, - flate2::Compression::new(compression_level as u32), - ))) - } - p if p.ends_with(".zst") => { - validate_compression_level(compression_level, 1, 22, "zstd")?; - Ok(Box::new(ZstdEncoder::new( - buffered_file, - compression_level as i32, - )?)) - } - p if p.ends_with(".xz") => { - validate_compression_level(compression_level, 0, 9, "xz")?; - Ok(Box::new(XzEncoder::new( - buffered_file, - compression_level as u32, - ))) - } - _ => Ok(Box::new(buffered_file)), - } -} - -// JSON summary structure -#[derive(Serialize, Deserialize)] -pub struct FilterSummary { - version: String, - index: String, - input: String, - input2: Option, - output: String, - output2: Option, - k: u8, - w: u8, - abs_threshold: usize, - rel_threshold: f64, - prefix_length: usize, - deplete: bool, - rename: bool, - seqs_in: u64, - seqs_out: u64, - seqs_out_proportion: f64, - seqs_removed: u64, - seqs_removed_proportion: f64, - bp_in: u64, - bp_out: u64, - bp_out_proportion: f64, - bp_removed: u64, - bp_removed_proportion: f64, - time: f64, - seqs_per_second: u64, - bp_per_second: u64, -} - -#[derive(Clone)] -struct FilterProcessor { - // Minimizer matching parameters - minimizer_hashes: Arc>, - kmer_length: u8, - window_size: u8, - abs_threshold: usize, - rel_threshold: f64, - prefix_length: usize, - deplete: bool, - rename: bool, - debug: bool, - - // Local buffers - local_buffer: Vec, - local_buffer2: Vec, // Second buffer for paired output - local_stats: ProcessingStats, - filter_buffers: FilterBuffers, - - // Global state - global_writer: Arc>, - global_writer2: Option>>, - global_stats: Arc>, - spinner: Option>>, - filtering_start_time: Instant, -} - -#[derive(Clone, Default)] -struct ProcessingStats { - total_seqs: u64, - filtered_seqs: u64, - total_bp: u64, - output_bp: u64, - filtered_bp: u64, - output_seq_counter: u64, -} - -#[derive(Default, Clone)] -struct FilterBuffers { - packed_seq: packed_seq::PackedSeqVec, - invalid_mask: Vec, - positions: Vec, - minimizer_values: Vec, -} - -impl FilterProcessor { - /// Calculate required hits based on absolute and relative thresholds - fn calculate_required_hits(&self, total_minimizers: usize) -> usize { - let abs_required = self.abs_threshold; - let rel_required = if total_minimizers == 0 { - 0 - } else { - ((self.rel_threshold * total_minimizers as f64).round() as usize).max(1) - }; - abs_required.max(rel_required) - } - - /// Check if sequence meets filtering criteria - fn meets_filtering_criteria(&self, hit_count: usize, total_minimizers: usize) -> bool { - let required = self.calculate_required_hits(total_minimizers); - if self.deplete { - hit_count < required - } else { - hit_count >= required - } - } - fn new( - minimizer_hashes: Arc>, - kmer_length: u8, - window_size: u8, - config: &FilterProcessorConfig, - 
writer: BoxedWriter, - writer2: Option, - spinner: Option>>, - filtering_start_time: Instant, - ) -> Self { - Self { - minimizer_hashes, - kmer_length, - window_size, - abs_threshold: config.abs_threshold, - rel_threshold: config.rel_threshold, - prefix_length: config.prefix_length, - deplete: config.deplete, - rename: config.rename, - debug: config.debug, - local_buffer: Vec::with_capacity(DEFAULT_BUFFER_SIZE), - local_buffer2: Vec::with_capacity(DEFAULT_BUFFER_SIZE), - local_stats: ProcessingStats::default(), - filter_buffers: FilterBuffers::default(), - global_writer: Arc::new(Mutex::new(writer)), - global_writer2: writer2.map(|w| Arc::new(Mutex::new(w))), - global_stats: Arc::new(Mutex::new(ProcessingStats::default())), - spinner, - filtering_start_time, - } - } - - fn should_keep_sequence(&mut self, seq: &[u8]) -> (bool, usize, usize, Vec) { - if seq.len() < self.kmer_length as usize { - return (self.deplete, 0, 0, Vec::new()); // If too short, keep if in deplete mode - } - - // Apply prefix length limit if specified - let effective_seq = if self.prefix_length > 0 && seq.len() > self.prefix_length { - &seq[..self.prefix_length] - } else { - seq - }; - - // Trim the last newline character from `effective_seq` if it has one. - let effective_seq = effective_seq.strip_suffix(b"\n").unwrap_or(effective_seq); - - let FilterBuffers { - packed_seq, - invalid_mask, - positions, - minimizer_values, - } = &mut self.filter_buffers; - - packed_seq.clear(); - minimizer_values.clear(); - positions.clear(); - invalid_mask.clear(); - - // Pack the sequence into 2-bit representation. - // Any non-ACGT characters are silently converted to 2-bit ACGT as well. - packed_seq.push_ascii(effective_seq); - // let packed_seq = packed_seq::PackedSeqVec::from_ascii(effective_seq); - - // TODO: Extract this to some nicer helper function in packed_seq? - // TODO: Use SIMD? - // TODO: Should probably add some test for this. - // +2: one to round up, and one buffer. - invalid_mask.resize(packed_seq.len() / 64 + 2, 0); - // let mut invalid_mask = vec![0u64; packed_seq.len() / 64 + 2]; - for i in (0..effective_seq.len()).step_by(64) { - let mut mask = 0; - for (j, b) in effective_seq[i..(i + 64).min(effective_seq.len())] - .iter() - .enumerate() - { - mask |= ((!matches!(b, b'A' | b'C' | b'G' | b'T' | b'a' | b'c' | b'g' | b't')) - as u64) - << j; - } - - invalid_mask[i / 64] = mask; - } - - // let mut positions = Vec::new(); - simd_minimizers::canonical_minimizer_positions( - packed_seq.as_slice(), - self.kmer_length as usize, - self.window_size as usize, - positions, - ); - - assert!( - self.kmer_length <= 57, - "Indexing the bitmask of invalid characters requires k<=57, but it is {}", - self.kmer_length - ); - - // Filter positions to only include k-mers with ACGT bases - positions.retain(|&pos| { - // Extract bits pos .. pos+k from the bitmask. - - // mask of k ones in low positions. - let mask = u64::MAX >> (64 - self.kmer_length); - let byte = pos as usize / 8; - let offset = pos as usize % 8; - // The unaligned u64 read is OK, because we ensure that the underlying `Vec` always - // has at least 8 bytes of padding at the end. 
- let x = - (unsafe { invalid_mask.as_ptr().byte_add(byte).read_unaligned() } >> offset) & mask; - x == 0 - }); - - // Hash valid positions - if self.kmer_length > 32 { - minimizer_values.extend( - simd_minimizers::iter_canonical_minimizer_values_u128( - packed_seq.as_slice(), - self.kmer_length as usize, - positions, - ) - .map(|kmer| xxhash_rust::xxh3::xxh3_64(&kmer.to_le_bytes())), - ); - } else { - minimizer_values.extend( - simd_minimizers::iter_canonical_minimizer_values( - packed_seq.as_slice(), - self.kmer_length as usize, - positions, - ) - .map(|kmer| xxhash_rust::xxh3::xxh3_64(&kmer.to_le_bytes())), - ); - } - - let num_minimizers = minimizer_values.len(); - - // Count distinct minimizer hits and collect matching k-mers - let mut seen_hits = FxHashSet::default(); - let mut hit_count = 0; - let mut hit_kmers = Vec::new(); - - for (i, &hash) in minimizer_values.iter().enumerate() { - if self.minimizer_hashes.contains(&hash) && seen_hits.insert(hash) { - hit_count += 1; - // Extract the k-mer sequence at this position - if self.debug && i < positions.len() { - let pos = positions[i] as usize; - let kmer = &effective_seq[pos..pos + self.kmer_length as usize]; - hit_kmers.push(String::from_utf8_lossy(kmer).to_string()); - } - } - } - - ( - self.meets_filtering_criteria(hit_count, num_minimizers), - hit_count, - num_minimizers, - hit_kmers, - ) - } - - fn get_minimizer_hashes_and_positions(&self, seq: &[u8]) -> (Vec, Vec) { - // Canonicalise sequence - let canonical_seq = seq - .iter() - .map(|&b| match b { - b'A' | b'a' => b'A', - b'C' | b'c' => b'C', - b'G' | b'g' => b'G', - b'T' | b't' => b'T', - _ => b'C', - }) - .collect::>(); - - let mut positions = Vec::new(); - simd_minimizers::canonical_minimizer_positions( - packed_seq::AsciiSeq(&canonical_seq), - self.kmer_length as usize, - self.window_size as usize, - &mut positions, - ); - - // Filter to valid positions - let valid_positions: Vec = positions - .into_iter() - .filter(|&pos| { - let pos_usize = pos as usize; - if pos_usize + self.kmer_length as usize <= seq.len() { - let kmer = &seq[pos_usize..pos_usize + self.kmer_length as usize]; - kmer.iter().all(|&b| { - matches!(b, b'A' | b'C' | b'G' | b'T' | b'a' | b'c' | b'g' | b't') - }) - } else { - false - } - }) - .collect(); - - // Get hashes - let hashes: Vec = simd_minimizers::iter_canonical_minimizer_values( - packed_seq::AsciiSeq(&canonical_seq), - self.kmer_length as usize, - &valid_positions, - ) - .map(|kmer| xxhash_rust::xxh3::xxh3_64(&kmer.to_le_bytes())) - .collect(); - - (hashes, valid_positions) - } - - fn should_keep_pair(&self, seq1: &[u8], seq2: &[u8]) -> (bool, usize, usize, Vec) { - let mut all_hashes = Vec::new(); - let mut all_positions = Vec::new(); - let mut all_sequences = Vec::new(); - let mut seen_hits_pair = FxHashSet::default(); - let mut pair_hit_count = 0; - let mut hit_kmers = Vec::new(); - - // Process read 1 - if seq1.len() >= self.kmer_length as usize { - let effective_seq = if self.prefix_length > 0 && seq1.len() > self.prefix_length { - &seq1[..self.prefix_length] - } else { - seq1 - }; - - let (hashes, positions) = self.get_minimizer_hashes_and_positions(effective_seq); - all_hashes.extend(hashes); - all_positions.extend(positions); - all_sequences.extend(vec![effective_seq; all_hashes.len()]); - } - - // Process read 2 - if seq2.len() >= self.kmer_length as usize { - let effective_seq = if self.prefix_length > 0 && seq2.len() > self.prefix_length { - &seq2[..self.prefix_length] - } else { - seq2 - }; - - let (hashes, positions) = 
self.get_minimizer_hashes_and_positions(effective_seq); - let start_idx = all_hashes.len(); - all_hashes.extend(hashes); - all_positions.extend(positions); - all_sequences.extend(vec![effective_seq; all_hashes.len() - start_idx]); - } - - // Count hits and collect k-mers - for (i, &hash) in all_hashes.iter().enumerate() { - if self.minimizer_hashes.contains(&hash) && seen_hits_pair.insert(hash) { - pair_hit_count += 1; - if self.debug && i < all_positions.len() && i < all_sequences.len() { - let pos = all_positions[i] as usize; - let seq = all_sequences[i]; - if pos + self.kmer_length as usize <= seq.len() { - let kmer = &seq[pos..pos + self.kmer_length as usize]; - hit_kmers.push(String::from_utf8_lossy(kmer).to_string()); - } - } - } - } - - let total_minimizers = all_hashes.len(); - ( - self.meets_filtering_criteria(pair_hit_count, total_minimizers), - pair_hit_count, - total_minimizers, - hit_kmers, - ) - } - - fn write_record(&mut self, record: &Rf, seq: &[u8]) -> Result<()> { - self.local_stats.output_seq_counter += 1; - format_record_to_buffer( - record, - seq, - self.local_stats.output_seq_counter, - self.rename, - &mut self.local_buffer, - ) - } - - fn write_record_to_buffer2(&mut self, record: &Rf, seq: &[u8]) -> Result<()> { - self.local_stats.output_seq_counter += 1; - format_record_to_buffer( - record, - seq, - self.local_stats.output_seq_counter, - self.rename, - &mut self.local_buffer2, - ) - } - - fn update_spinner(&self) { - if let Some(ref spinner) = self.spinner { - let stats = self.global_stats.lock(); - let elapsed = self.filtering_start_time.elapsed(); - let seqs_per_sec = stats.total_seqs as f64 / elapsed.as_secs_f64(); - let bp_per_sec = stats.total_bp as f64 / elapsed.as_secs_f64(); - let mbp_per_sec = bp_per_sec / 1_000_000.0; - - let output_seqs = stats.total_seqs - stats.filtered_seqs; - let output_proportion = if stats.total_seqs > 0 { - output_seqs as f64 / stats.total_seqs as f64 - } else { - 0.0 - }; - - let output_bp_proportion = if stats.total_bp > 0 { - stats.output_bp as f64 / stats.total_bp as f64 - } else { - 0.0 - }; - - spinner.lock().set_message(format!( - "Retained {}/{} sequences ({:.2}%), {}/{} bp ({:.2}%). 
{:.0} seqs/s ({:.1} Mbp/s)", - output_seqs, - stats.total_seqs, - output_proportion * 100.0, - stats.output_bp, - stats.total_bp, - output_bp_proportion * 100.0, - seqs_per_sec, - mbp_per_sec - )); - } - } -} - -impl ParallelProcessor for FilterProcessor { - fn process_record(&mut self, record: Rf) -> paraseq::parallel::Result<()> { - let seq = record.seq(); - self.local_stats.total_seqs += 1; - self.local_stats.total_bp += seq.len() as u64; - - let (should_keep, hit_count, total_minimizers, hit_kmers) = self.should_keep_sequence(&seq); - - // Show debug info for sequences with hits - if self.debug { - eprintln!( - "DEBUG: {} hits={}/{} keep={} kmers=[{}]", - String::from_utf8_lossy(record.id()), - hit_count, - total_minimizers, - should_keep, - hit_kmers.join(",") - ); - } - - if should_keep { - self.local_stats.output_bp += seq.len() as u64; - self.write_record(&record, &seq)?; - } else { - self.local_stats.filtered_seqs += 1; - self.local_stats.filtered_bp += seq.len() as u64; - } - - Ok(()) - } - - fn on_batch_complete(&mut self) -> paraseq::parallel::Result<()> { - // Write buffer to output - if !self.local_buffer.is_empty() { - let mut global_writer = self.global_writer.lock(); - global_writer.write_all(&self.local_buffer)?; - global_writer.flush()?; - } - - // Clear buffer after releasing the lock - self.local_buffer.clear(); - - // Update global stats - { - let mut stats = self.global_stats.lock(); - stats.total_seqs += self.local_stats.total_seqs; - stats.filtered_seqs += self.local_stats.filtered_seqs; - stats.total_bp += self.local_stats.total_bp; - stats.output_bp += self.local_stats.output_bp; - stats.filtered_bp += self.local_stats.filtered_bp; - stats.output_seq_counter += self.local_stats.output_seq_counter; - } - - // Update spinner - self.update_spinner(); - - // Reset local stats - self.local_stats = ProcessingStats::default(); - - Ok(()) - } -} - -impl InterleavedParallelProcessor for FilterProcessor { - fn process_interleaved_pair( - &mut self, - record1: Rf, - record2: Rf, - ) -> paraseq::parallel::Result<()> { - let seq1 = record1.seq(); - let seq2 = record2.seq(); - - self.local_stats.total_seqs += 2; - self.local_stats.total_bp += (seq1.len() + seq2.len()) as u64; - - let (should_keep, hit_count, total_minimizers, hit_kmers) = - self.should_keep_pair(&seq1, &seq2); - - // Debug info for interleaved pairs - if self.debug && hit_count > 0 { - eprintln!( - "DEBUG: {}/{} hits={}/{} keep={} kmers=[{}]", - String::from_utf8_lossy(record1.id()), - String::from_utf8_lossy(record2.id()), - hit_count, - total_minimizers, - should_keep, - hit_kmers.join(",") - ); - } - - if should_keep { - self.local_stats.output_bp += (seq1.len() + seq2.len()) as u64; - - // Write both records to output - self.write_record(&record1, &seq1)?; - self.write_record(&record2, &seq2)?; - } else { - self.local_stats.filtered_seqs += 2; - self.local_stats.filtered_bp += (seq1.len() + seq2.len()) as u64; - } - - Ok(()) - } - - fn on_batch_complete(&mut self) -> paraseq::parallel::Result<()> { - // Write buffer to output - if !self.local_buffer.is_empty() { - let mut global_writer = self.global_writer.lock(); - global_writer.write_all(&self.local_buffer)?; - global_writer.flush()?; - } - - // Clear buffer after releasing the lock for better performance - self.local_buffer.clear(); - - // Update global stats - { - let mut stats = self.global_stats.lock(); - stats.total_seqs += self.local_stats.total_seqs; - stats.filtered_seqs += self.local_stats.filtered_seqs; - stats.total_bp += 
self.local_stats.total_bp; - stats.output_bp += self.local_stats.output_bp; - stats.filtered_bp += self.local_stats.filtered_bp; - stats.output_seq_counter += self.local_stats.output_seq_counter; - } - - // Update spinner - self.update_spinner(); - - // Reset local stats - self.local_stats = ProcessingStats::default(); - - Ok(()) - } -} - -impl PairedParallelProcessor for FilterProcessor { - fn process_record_pair( - &mut self, - record1: Rf, - record2: Rf, - ) -> paraseq::parallel::Result<()> { - let seq1 = record1.seq(); - let seq2 = record2.seq(); - self.local_stats.total_seqs += 2; - self.local_stats.total_bp += (seq1.len() + seq2.len()) as u64; - - let (should_keep, hit_count, total_minimizers, hit_kmers) = - self.should_keep_pair(&seq1, &seq2); - - // Debug info for paired reads - if self.debug && hit_count > 0 { - eprintln!( - "DEBUG: {}/{} hits={}/{} keep={} kmers=[{}]", - String::from_utf8_lossy(record1.id()), - String::from_utf8_lossy(record2.id()), - hit_count, - total_minimizers, - should_keep, - hit_kmers.join(",") - ); - } - - if should_keep { - self.local_stats.output_bp += (seq1.len() + seq2.len()) as u64; - - // Write to appropriate writers - if self.global_writer2.is_some() { - // Separate outputs - self.write_record(&record1, &seq1)?; - self.write_record_to_buffer2(&record2, &seq2)?; - } else { - // Interleaved output - self.write_record(&record1, &seq1)?; - self.write_record(&record2, &seq2)?; - } - } else { - self.local_stats.filtered_seqs += 2; - self.local_stats.filtered_bp += (seq1.len() + seq2.len()) as u64; - } - - Ok(()) - } - - fn on_batch_complete(&mut self) -> paraseq::parallel::Result<()> { - if let Some(ref writer2) = self.global_writer2 { - // Atomic paired batch writing - if !self.local_buffer.is_empty() || !self.local_buffer2.is_empty() { - let mut writer1 = self.global_writer.lock(); - let mut writer2 = writer2.lock(); - - writer1.write_all(&self.local_buffer)?; - writer1.flush()?; - writer2.write_all(&self.local_buffer2)?; - writer2.flush()?; - } - } else { - // Interleaved output - if !self.local_buffer.is_empty() { - let mut writer = self.global_writer.lock(); - writer.write_all(&self.local_buffer)?; - writer.flush()?; - } - } - - self.local_buffer.clear(); - self.local_buffer2.clear(); - - // Update global stats - { - let mut stats = self.global_stats.lock(); - stats.total_seqs += self.local_stats.total_seqs; - stats.filtered_seqs += self.local_stats.filtered_seqs; - stats.total_bp += self.local_stats.total_bp; - stats.output_bp += self.local_stats.output_bp; - stats.filtered_bp += self.local_stats.filtered_bp; - stats.output_seq_counter += self.local_stats.output_seq_counter; - } - - // Update spinner - self.update_spinner(); - - // Reset local stats - self.local_stats = ProcessingStats::default(); - - Ok(()) - } -} - -pub fn run(config: &FilterConfig) -> Result<()> { - let start_time = Instant::now(); - let version: String = env!("CARGO_PKG_VERSION").to_string(); - let tool_version = format!("deacon {}", version); - - // Enable quiet mode when debug enabled - let quiet = config.quiet || config.debug; - - // Configure thread pool if nonzero - if config.threads > 0 { - rayon::ThreadPoolBuilder::new() - .num_threads(config.threads) - .build_global() - .context("Failed to initialize thread pool")?; - } - - let mode = if config.deplete { "deplete" } else { "search" }; - - let mut input_type = String::new(); - let mut options = Vec::::new(); - let paired_stdin = config.input_path == "-" - && config.input2_path.is_some() - && config.input2_path.unwrap() 
== "-"; - if paired_stdin { - input_type.push_str("interleaved"); - } else if config.input2_path.is_some() { - input_type.push_str("paired"); - } else { - input_type.push_str("single"); - } - options.push(format!( - "abs_threshold={}, rel_threshold={}", - config.abs_threshold, config.rel_threshold - )); - if config.prefix_length > 0 { - options.push(format!("prefix_length={}", config.prefix_length)); - } - if config.rename { - options.push("rename".to_string()); - } - if config.threads > 0 { - options.push(format!("threads={}", config.threads)); - } - - if !quiet { - eprintln!( - "Deacon v{}; mode: {}; input: {}; options: {}", - version, - mode, - input_type, - options.join(", ") - ); - } - - // Load minimizer hashes and parse header - let (minimizer_hashes, header) = load_minimizer_hashes(&config.minimizers_path)?; - let minimizer_hashes = Arc::new(minimizer_hashes); - - let kmer_length = header.kmer_length(); - let window_size = header.window_size(); - - let load_time = start_time.elapsed(); - if !quiet { - eprintln!( - "Loaded index (k={}, w={}) in {:.2?}", - kmer_length, window_size, load_time - ); - } - - // Create appropriate writer(s) based on output path(s) - let writer = get_writer(config.output_path, config.compression_level)?; - let writer2 = if let (Some(output2), Some(_)) = (config.output2_path, config.input2_path) { - Some(get_writer(output2, config.compression_level)?) - } else { - None - }; - - // Progress bar setup if not quiet - let spinner = if !quiet { - let pb = ProgressBar::with_draw_target(None, ProgressDrawTarget::stderr()); - pb.set_style( - ProgressStyle::default_spinner() - .tick_strings(&["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"]) - .template("{msg}")?, - ); - pb.set_message("Filtering"); - Some(Arc::new(Mutex::new(pb))) - } else { - None - }; - - // Start timer for rate calculation - let filtering_start_time = Instant::now(); - - // Create processor - let processor_config = FilterProcessorConfig { - abs_threshold: config.abs_threshold, - rel_threshold: config.rel_threshold, - prefix_length: config.prefix_length, - deplete: config.deplete, - rename: config.rename, - debug: config.debug, - }; - let processor = FilterProcessor::new( - minimizer_hashes, - kmer_length, - window_size, - &processor_config, - writer, - writer2, - spinner.clone(), - filtering_start_time, - ); - - // Process based on input type - let num_threads = if config.threads == 0 { - rayon::current_num_threads() - } else { - config.threads - }; - - if paired_stdin { - // Interleaved paired from stdin - use native interleaved processor - let reader = create_paraseq_reader(Some("-"))?; - reader.process_parallel_interleaved(processor.clone(), num_threads)?; - } else if let Some(input2_path) = config.input2_path { - // Paired files - let r1_reader = create_paraseq_reader(Some(config.input_path))?; - let r2_reader = create_paraseq_reader(Some(input2_path))?; - r1_reader.process_parallel_paired(r2_reader, processor.clone(), num_threads)?; - } else { - // Single file or stdin - let reader = create_paraseq_reader(Some(config.input_path))?; - reader.process_parallel(processor.clone(), num_threads)?; - } - - let final_stats = processor.global_stats.lock(); - let total_seqs = final_stats.total_seqs; - let filtered_seqs = final_stats.filtered_seqs; - let total_bp = final_stats.total_bp; - let output_bp = final_stats.output_bp; - let filtered_bp = final_stats.filtered_bp; - - drop(final_stats); // Release lock - - // Flush writers - they should auto-flush on drop - drop(processor.global_writer); - if 
let Some(w2) = processor.global_writer2 { - drop(w2); - } - - let total_time = start_time.elapsed(); - let seqs_per_sec = total_seqs as f64 / total_time.as_secs_f64(); - let bp_per_sec = total_bp as f64 / total_time.as_secs_f64(); - let mbp_per_sec = bp_per_sec / 1_000_000.0; - - // Calculate proportions - let filtered_proportion = if total_seqs > 0 { - filtered_seqs as f64 / total_seqs as f64 - } else { - 0.0 - }; - - let filtered_bp_proportion = if total_bp > 0 { - filtered_bp as f64 / total_bp as f64 - } else { - 0.0 - }; - - let output_seqs = total_seqs - filtered_seqs; - let output_seq_proportion = if total_seqs > 0 { - output_seqs as f64 / total_seqs as f64 - } else { - 0.0 - }; - - let output_bp_proportion = if total_bp > 0 { - output_bp as f64 / total_bp as f64 - } else { - 0.0 - }; - - // Finish and clear spinner - disable it completely - if let Some(ref spinner) = spinner { - let pb = spinner.lock(); - pb.set_draw_target(ProgressDrawTarget::hidden()); - pb.finish_and_clear(); - } - - if !quiet { - eprintln!( - "Retained {}/{} sequences ({:.3}%), {}/{} bp ({:.3}%) in {:.2?}. Speed: {:.0} seqs/s ({:.1} Mbp/s)", - output_seqs, - total_seqs, - output_seq_proportion * 100.0, - output_bp, - total_bp, - output_bp_proportion * 100.0, - total_time, - seqs_per_sec, - mbp_per_sec - ); - } - - // Build and write JSON summary if path provided - if let Some(summary_file) = config.summary_path { - let seqs_out = total_seqs - filtered_seqs; - - let summary = FilterSummary { - version: tool_version, - index: config.minimizers_path.to_string_lossy().to_string(), - input: config.input_path.to_string(), - input2: config.input2_path.map(|s| s.to_string()), - output: config.output_path.to_string(), - output2: config.output2_path.map(|s| s.to_string()), - k: kmer_length, - w: window_size, - abs_threshold: config.abs_threshold, - rel_threshold: config.rel_threshold, - prefix_length: config.prefix_length, - deplete: config.deplete, - rename: config.rename, - seqs_in: total_seqs as u64, - seqs_out: seqs_out as u64, - seqs_out_proportion: output_seq_proportion, - seqs_removed: filtered_seqs as u64, - seqs_removed_proportion: filtered_proportion, - bp_in: total_bp as u64, - bp_out: output_bp as u64, - bp_out_proportion: output_bp_proportion, - bp_removed: filtered_bp as u64, - bp_removed_proportion: filtered_bp_proportion, - time: total_time.as_secs_f64(), - seqs_per_second: seqs_per_sec as u64, - bp_per_second: bp_per_sec as u64, - }; - - let file = File::create(summary_file) - .context(format!("Failed to create summary: {:?}", summary_file))?; - let writer = BufWriter::new(file); - - serde_json::to_writer_pretty(writer, &summary).context("Failed to write summary")?; - - if !quiet { - eprintln!("Summary saved to {:?}", summary_file); - } - } - - Ok(()) -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_filter_summary() { - let summary = FilterSummary { - version: "deacon 0.1.0".to_string(), - index: "test.idx".to_string(), - input: "test.fastq".to_string(), - input2: Some("test2.fastq".to_string()), - output: "output.fastq".to_string(), - output2: Some("output2.fastq".to_string()), - k: 31, - w: 21, - abs_threshold: 1, - rel_threshold: 0.01, - prefix_length: 0, - deplete: false, - rename: false, - seqs_in: 100, - seqs_out: 90, - seqs_out_proportion: 0.9, - seqs_removed: 10, - seqs_removed_proportion: 0.1, - bp_in: 10000, - bp_out: 9000, - bp_out_proportion: 0.9, - bp_removed: 1000, - bp_removed_proportion: 0.1, - time: 1.5, - seqs_per_second: 66, - bp_per_second: 6666, - }; - - let json 
= serde_json::to_string(&summary).unwrap(); - let parsed: FilterSummary = serde_json::from_str(&json).unwrap(); - - assert_eq!(parsed.version, "deacon 0.1.0"); - assert_eq!(parsed.seqs_in, 100); - assert_eq!(parsed.seqs_removed_proportion, 0.1); - assert_eq!(parsed.seqs_out_proportion, 0.9); - assert_eq!(parsed.bp_out_proportion, 0.9); - assert_eq!(parsed.input, "test.fastq"); - assert_eq!(parsed.input2, Some("test2.fastq".to_string())); - assert_eq!(parsed.output, "output.fastq"); - assert_eq!(parsed.output2, Some("output2.fastq".to_string())); - } -} +#[cfg(not(feature = "server"))] +pub use crate::local_filter::run; +#[cfg(feature = "server")] +pub use crate::remote_filter::run; diff --git a/src/filter_common.rs b/src/filter_common.rs new file mode 100644 index 0000000..2a045fe --- /dev/null +++ b/src/filter_common.rs @@ -0,0 +1,344 @@ +use packed_seq::SeqVec; +use rustc_hash::FxHashSet; +use serde::{Deserialize, Serialize}; +use std::path::PathBuf; + +#[cfg(feature = "server")] +use reqwest::blocking::Client; + +// JSON summary structure +#[derive(Serialize, Deserialize)] +pub struct FilterSummary { + pub version: String, + pub index: String, + pub input: String, + pub input2: Option, + pub output: String, + pub output2: Option, + pub k: u8, + pub w: u8, + pub abs_threshold: usize, + pub rel_threshold: f64, + pub prefix_length: usize, + pub deplete: bool, + pub rename: bool, + pub seqs_in: u64, + pub seqs_out: u64, + pub seqs_out_proportion: f64, + pub seqs_removed: u64, + pub seqs_removed_proportion: f64, + pub bp_in: u64, + pub bp_out: u64, + pub bp_out_proportion: f64, + pub bp_removed: u64, + pub bp_removed_proportion: f64, + pub time: f64, + pub seqs_per_second: u64, + pub bp_per_second: u64, +} + +/// Get a summary string for the index used, either from the local path or by querying the server. +/// # Args: +/// * `minimizers_path`: Optional path to the local minimizer index. +/// * `server_address`: Optional server address to query for index version. +/// # Returns: +/// * A string summarizing the index used. If local, it's the path; if from server, it's "address:filename@hash". 
+pub fn get_summary_index( + minimizers_path: &Option<&PathBuf>, + server_address: &Option, +) -> String { + let index = match minimizers_path { + Some(path) => path.to_string_lossy().to_string(), + None => match &server_address { + None => "No index or server specified".to_string(), + Some(_addr) => { + #[cfg(feature = "server")] + { + let client = Client::new(); + let response = client + .get(_addr.to_owned() + "/index_version") + .send() + .unwrap_or_else(|e| { + panic!("Failed to contact server at {}: {e}", _addr); + }); + if response.status().is_success() { + _addr.to_owned() + + ":" + + &response.text().unwrap_or_else(|e| { + panic!("Failed to parse server response: {e}"); + }) + } else { + panic!("Server returned error: {}", response.status()) + } + } + #[cfg(not(feature = "server"))] + { + panic!("Server feature not enabled, cannot use server address"); + } + } + }, + }; + index +} + +/// Calculate required hits based on absolute and relative thresholds +pub fn calculate_required_hits( + abs_threshold: usize, + rel_threshold: f64, + total_minimizers: usize, +) -> usize { + let abs_required = abs_threshold; + let rel_required = if total_minimizers == 0 { + 0 + } else { + ((rel_threshold * total_minimizers as f64).round() as usize).max(1) + }; + abs_required.max(rel_required) +} + +/// Check if sequence meets filtering criteria +pub fn meets_filtering_criteria( + hit_count: usize, + total_minimizers: usize, + abs_threshold: usize, + rel_threshold: f64, + deplete: bool, +) -> bool { + let required = calculate_required_hits(abs_threshold, rel_threshold, total_minimizers); + if deplete { + hit_count < required + } else { + hit_count >= required + } +} + +/// Check how many minimizers from the sequence are in the set of minimizer hashes +/// and optionally collect the matching k-mers for debugging. Unpaired equivalent of `pair_matches`. +/// Used in both local and remote filtering. +/// +/// # Args: +/// * `minimizer_hashes`: Set of minimizer hashes to check against. +/// * `minimizer_values`: Minimizer hashes from the sequence. +/// * `positions`: Positions of the minimizers in the sequence. +/// * `effective_seq`: The effective sequence used for minimizer calculation. +/// * `kmer_length`: Length of the k-mers. +/// * `debug`: If true, collect matching k-mers. +/// # Returns: +/// * A tuple containing: +/// - The count of distinct minimizer hits. +/// - A vector of matching k-mers as strings (if debug is true). +pub fn sequence_matches( + minimizer_hashes: &FxHashSet, + minimizer_values: &[u64], + positions: &[u32], + effective_seq: &[u8], + kmer_length: u8, + debug: bool, +) -> (usize, Vec) { + // Count distinct minimizer hits and collect matching k-mers + let mut seen_hits = FxHashSet::default(); + let mut hit_count = 0; + let mut hit_kmers = Vec::new(); + + // Should keep sequence if it meets filtering criteria + for (i, &hash) in minimizer_values.iter().enumerate() { + if minimizer_hashes.contains(&hash) && seen_hits.insert(hash) { + hit_count += 1; + // Extract the k-mer sequence at this position + if debug && i < positions.len() { + let pos = positions[i] as usize; + let kmer = &effective_seq[pos..pos + kmer_length as usize]; + hit_kmers.push(String::from_utf8_lossy(kmer).to_string()); + } + } + } + (hit_count, hit_kmers) +} + +/// Check how many minimizers from both sequences in a pair are in the set of minimizer hashes +/// and optionally collect the matching k-mers for debugging. Paired equivalent of `sequence_matches`. +/// Used in both local and remote filtering. 
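The interplay of the two thresholds above is easiest to see with concrete numbers. A test-style sketch, assuming calculate_required_hits and meets_filtering_criteria are in scope; the figures are invented:

#[test]
fn threshold_interplay() {
    // abs_threshold = 2, rel_threshold = 0.01 (1% of minimizers, rounded, floor 1).
    assert_eq!(calculate_required_hits(2, 0.01, 1000), 10); // relative rule dominates: max(2, 10)
    assert_eq!(calculate_required_hits(2, 0.01, 40), 2); // absolute floor dominates: max(2, 1)
    // Search mode keeps reads at or above the requirement; deplete inverts the comparison.
    assert!(meets_filtering_criteria(12, 1000, 2, 0.01, false)); // search: retained
    assert!(!meets_filtering_criteria(12, 1000, 2, 0.01, true)); // deplete: removed
}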
+///
+/// # Args:
+/// * `all_hashes`: Combined minimizer hashes from both sequences.
+/// * `all_positions`: Combined positions of the minimizers in both sequences.
+/// * `all_sequences`: Combined effective sequences from both sequences.
+/// * `minimizer_hashes`: Set of minimizer hashes to check against.
+/// * `kmer_length`: Length of the k-mers.
+/// * `debug`: If true, collect matching k-mers.
+/// # Returns:
+/// * A tuple containing:
+///   - The count of distinct minimizer hits across both sequences.
+///   - A vector of matching k-mers as strings (if debug is true).
+pub fn pair_matches(
+    all_hashes: &Vec<u64>,
+    all_positions: &Vec<u32>,
+    all_sequences: &Vec<&[u8]>,
+    minimizer_hashes: &FxHashSet<u64>,
+    kmer_length: u8,
+    debug: bool,
+) -> (usize, Vec<String>) {
+    let mut seen_hits_pair = FxHashSet::default();
+    let mut pair_hit_count = 0;
+    let mut hit_kmers = Vec::new();
+    // Count hits and collect k-mers
+    for (i, &hash) in all_hashes.iter().enumerate() {
+        if minimizer_hashes.contains(&hash) && seen_hits_pair.insert(hash) {
+            pair_hit_count += 1;
+            if debug && i < all_positions.len() && i < all_sequences.len() {
+                let pos = all_positions[i] as usize;
+                let seq = all_sequences[i];
+                if pos + kmer_length as usize <= seq.len() {
+                    let kmer = &seq[pos..pos + kmer_length as usize];
+                    hit_kmers.push(String::from_utf8_lossy(kmer).to_string());
+                }
+            }
+        }
+    }
+    (pair_hit_count, hit_kmers)
+}
+
+/// Given a sequence, compute the minimizer hashes and positions.
+/// # Args:
+/// * `seq`: The input sequence as a byte slice.
+/// * `prefix_length`: If >0, only consider the first `prefix_length` bases of the sequence.
+/// * `kmer_length`: The length of k-mers to consider for minimizers.
+/// * `window_size`: The size of the sliding window to find minimizers.
+/// # Returns:
+/// * A tuple containing:
+///   - A vector of minimizer hash values (u64).
+///   - A vector of positions (u32) where each minimizer occurs in the sequence.
+///   - A slice of the effective sequence used for minimizer calculation (after applying prefix length and trimming).
+pub fn get_minimizer_hashes_and_positions(
+    seq: &[u8],
+    prefix_length: usize,
+    kmer_length: u8,
+    window_size: u8,
+) -> (Vec<u64>, Vec<u32>, &[u8]) {
+    if seq.len() < kmer_length as usize {
+        return (Vec::new(), Vec::new(), &[]); // If too short, return nothing
+    }
+
+    // Apply prefix length limit if specified
+    let effective_seq = if prefix_length > 0 && seq.len() > prefix_length {
+        &seq[..prefix_length]
+    } else {
+        seq
+    };
+
+    // Trim the last newline character from `effective_seq` if it has one.
+    let effective_seq = effective_seq.strip_suffix(b"\n").unwrap_or(effective_seq);
+
+    let mut invalid_mask = Vec::new();
+    let mut positions = Vec::new();
+    let mut minimizer_values = Vec::new();
+
+    // Pack the sequence into 2-bit representation.
+    // Any non-ACGT characters are silently converted to 2-bit ACGT as well.
+    let packed_seq = packed_seq::PackedSeqVec::from_ascii(effective_seq);
+
+    // TODO: Extract this to some nicer helper function in packed_seq?
+    // TODO: Use SIMD?
+    // TODO: Should probably add some test for this.
+    // +2: one to round up, and one buffer.
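+    // Layout: one bit per input base (1 = non-ACGT), packed into u64 words.
+    // A k-mer starting at `pos` is valid iff bits pos..pos+k are all zero;
+    // positions.retain() below checks that with a single unaligned 8-byte read,
+    // and the extra buffer word guarantees the read never runs past the Vec.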
+    invalid_mask.resize(packed_seq.len() / 64 + 2, 0);
+    for i in (0..effective_seq.len()).step_by(64) {
+        let mut mask = 0;
+        for (j, b) in effective_seq[i..(i + 64).min(effective_seq.len())]
+            .iter()
+            .enumerate()
+        {
+            mask |=
+                ((!matches!(b, b'A' | b'C' | b'G' | b'T' | b'a' | b'c' | b'g' | b't')) as u64) << j;
+        }
+
+        invalid_mask[i / 64] = mask;
+    }
+
+    simd_minimizers::canonical_minimizer_positions(
+        packed_seq.as_slice(),
+        kmer_length as usize,
+        window_size as usize,
+        &mut positions,
+    );
+
+    assert!(
+        kmer_length <= 56,
+        "Indexing the bitmask of invalid characters requires k<=56, but it is {}",
+        kmer_length
+    );
+
+    // Filter positions to only include k-mers with ACGT bases
+    positions.retain(|&pos| {
+        // Extract bits pos .. pos+k from the bitmask.
+
+        // mask of k ones in low positions.
+        let mask = u64::MAX >> (64 - kmer_length);
+        let byte = pos as usize / 8;
+        let offset = pos as usize % 8;
+        // The unaligned u64 read is OK, because we ensure that the underlying `Vec` always
+        // has at least 8 bytes of padding at the end.
+        let x = (unsafe { invalid_mask.as_ptr().byte_add(byte).read_unaligned() } >> offset) & mask;
+        x == 0
+    });
+
+    // Hash valid positions
+    if kmer_length > 32 {
+        minimizer_values.extend(
+            simd_minimizers::iter_canonical_minimizer_values_u128(
+                packed_seq.as_slice(),
+                kmer_length as usize,
+                &positions,
+            )
+            .map(|kmer| xxhash_rust::xxh3::xxh3_64(&kmer.to_le_bytes())),
+        );
+    } else {
+        minimizer_values.extend(
+            simd_minimizers::iter_canonical_minimizer_values(
+                packed_seq.as_slice(),
+                kmer_length as usize,
+                &positions,
+            )
+            .map(|kmer| xxhash_rust::xxh3::xxh3_64(&kmer.to_le_bytes())),
+        );
+    }
+
+    (minimizer_values, positions, effective_seq)
+}
+
+pub fn get_paired_minimizer_hashes_and_positions<'a>(
+    seq1: &'a [u8],
+    seq2: &'a [u8],
+    prefix_length: usize,
+    kmer_length: u8,
+    window_size: u8,
+) -> (Vec<u64>, Vec<u32>, Vec<&'a [u8]>) {
+    let mut all_hashes = Vec::new();
+    let mut all_positions = Vec::new();
+    let mut all_sequences = Vec::new();
+
+    // Process read 1
+    if seq1.len() >= kmer_length as usize {
+        let (hashes, positions, effective_seq1) =
+            get_minimizer_hashes_and_positions(seq1, prefix_length, kmer_length, window_size);
+        // One effective-sequence entry per minimizer; take the count before the
+        // Vecs are moved (all_hashes.len() - all_positions.len() is always zero).
+        let n = hashes.len();
+        all_hashes.extend(hashes);
+        all_positions.extend(positions);
+        all_sequences.extend(std::iter::repeat(effective_seq1).take(n));
+    }
+
+    // Process read 2
+    if seq2.len() >= kmer_length as usize {
+        let (hashes, positions, effective_seq2) =
+            get_minimizer_hashes_and_positions(seq2, prefix_length, kmer_length, window_size);
+        let n = hashes.len();
+        all_hashes.extend(hashes);
+        all_positions.extend(positions);
+        all_sequences.extend(std::iter::repeat(effective_seq2).take(n));
+    }
+
+    (all_hashes, all_positions, all_sequences)
+}
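For concreteness, a usage sketch of the two helpers above; the sequence, k and w values are illustrative, and `index_hashes` stands in for a loaded minimizer index:

use rustc_hash::FxHashSet;

fn demo(index_hashes: &FxHashSet<u64>) {
    let read = b"ACGTACGTTAGCACGTACGTACGTACGTACGTACGTACGT";
    // prefix_length = 0 means the whole read is used; k = 31, w = 15.
    let (hashes, positions, effective) =
        get_minimizer_hashes_and_positions(read, 0, 31, 15);
    // Exactly one hash per retained position; k-mers containing non-ACGT
    // bases have already been masked out.
    assert_eq!(hashes.len(), positions.len());
    let (hits, kmers) =
        sequence_matches(index_hashes, &hashes, &positions, effective, 31, true);
    eprintln!("{hits} distinct index hits; matching k-mers: {kmers:?}");
}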
diff --git a/src/index.rs b/src/index.rs
index 6a2fbbf..7332026 100644
--- a/src/index.rs
+++ b/src/index.rs
@@ -1,4 +1,6 @@
 use crate::IndexConfig;
+#[cfg(feature = "server")]
+use crate::server_common::get_server_index_header;
 use anyhow::{Context, Result};
 use bincode::serde::{decode_from_std_read, encode_into_std_write};
 use rayon::prelude::*;
@@ -12,7 +14,7 @@ use std::time::Instant;
 use needletail::{parse_fastx_file, parse_fastx_stdin};
 
 /// Serialisable header for the index file
-#[derive(Serialize, Deserialize, Debug)]
+#[derive(Serialize, Deserialize, Debug, Clone)]
 pub struct IndexHeader {
     pub format_version: u8,
     pub kmer_length: u8,
@@ -69,32 +71,59 @@ pub fn load_header_and_count<P: AsRef<Path>>(path: &P) -> Result<(IndexHeader, usize)> {
     Ok((header, count))
 }
 
-/// Load the hashes without spiking memory usage with an extra vec
-pub fn load_minimizer_hashes<P: AsRef<Path>>(path: &P) -> Result<(FxHashSet<u64>, IndexHeader)> {
-    let file =
-        File::open(path).context(format!("Failed to open index file {:?}", path.as_ref()))?;
-    let mut reader = BufReader::new(file);
-
-    // Deserialise header
-    let header: IndexHeader = decode_from_std_read(&mut reader, bincode::config::standard())
-        .context("Failed to deserialise index header")?;
-    header.validate()?;
-
-    // Deserialise the count of minimizers so we can init a FxHashSet with the right capacity
-    let count: usize = decode_from_std_read(&mut reader, bincode::config::standard())
-        .context("Failed to deserialise minimizer count")?;
-
-    // Pre-allocate FxHashSet with correct capacity
-    let mut minimizers = FxHashSet::with_capacity_and_hasher(count, Default::default());
+/// Load the hashes without spiking memory usage with an extra vec.
+/// If a path is provided, it will read the minimizers from the file and return them in a FxHashSet.
+///
+/// If no path is provided, it will try to load the header from a server, returning the minimizers as None.
+/// This is used for server mode, where the header is important, but local minimizers are not needed.
+/// If the server feature is not enabled, it will return an error.
+pub fn load_minimizer_hashes(
+    path_option: &Option<&PathBuf>,
+    _server_address_option: &Option<String>,
+) -> Result<(Option<FxHashSet<u64>>, IndexHeader)> {
+    if let Some(path) = path_option {
+        let file = File::open(path).context(format!("Failed to open index file {path:?}"))?;
+        let mut reader = BufReader::new(file);
+
+        // Deserialise header
+        let header: IndexHeader = decode_from_std_read(&mut reader, bincode::config::standard())
+            .context("Failed to deserialise index header")?;
+        header.validate()?;
+
+        // Deserialise the count of minimizers so we can init a FxHashSet with the right capacity
+        let count: usize = decode_from_std_read(&mut reader, bincode::config::standard())
+            .context("Failed to deserialise minimizer count")?;
+
+        // Pre-allocate FxHashSet with correct capacity
+        let mut minimizers = FxHashSet::with_capacity_and_hasher(count, Default::default());
+
+        // Populate FxHashSet
+        for _ in 0..count {
+            let hash: u64 = decode_from_std_read(&mut reader, bincode::config::standard())
+                .context("Failed to deserialise minimizer hash")?;
+            minimizers.insert(hash);
+        }
 
-    // Populate FxHashSet
-    for _ in 0..count {
-        let hash: u64 = decode_from_std_read(&mut reader, bincode::config::standard())
-            .context("Failed to deserialise minimizer hash")?;
-        minimizers.insert(hash);
+        Ok((Some(minimizers), header))
+    } else {
+        // If no path is provided, check if a server address was given to populate the header
+        #[cfg(feature = "server")]
+        {
+            if let Some(server) = _server_address_option {
+                Ok((None, get_server_index_header(&server.to_string())?))
+            } else {
+                Err(anyhow::anyhow!(
+                    "No server address provided for running in server mode"
+                ))
+            }
+        }
+        #[cfg(not(feature = "server"))]
+        {
+            return Err(anyhow::anyhow!(
+                "Server feature is not enabled. Cannot run without an index."
+ )); + } } - - Ok((minimizers, header)) } /// Helper function to write minimizers to output file or stdout @@ -123,12 +152,12 @@ pub fn write_minimizers( // Serialise the count of minimizers first let count = minimizers.len(); - encode_into_std_write(count, &mut writer, bincode::config::standard()) + encode_into_std_write(&count, &mut writer, bincode::config::standard()) .context("Failed to serialise minimizer count")?; // Serialise each minimizer directly for &hash in minimizers { - encode_into_std_write(hash, &mut writer, bincode::config::standard()) + encode_into_std_write(&hash, &mut writer, bincode::config::standard()) .context("Failed to serialise minimizer hash")?; } Ok(()) @@ -395,13 +424,13 @@ fn stream_diff_fastx>( seq_count, total_bp ); - Ok((seq_count as usize, total_bp)) + Ok((seq_count as usize, total_bp as usize)) } /// Compute the set difference between two minimizer indexes (A - B) -pub fn diff>( - first: P, - second: P, +pub fn diff( + first: &PathBuf, + second: &PathBuf, kmer_length: Option, window_size: Option, output: Option<&PathBuf>, @@ -409,7 +438,11 @@ pub fn diff>( let start_time = Instant::now(); // Load first file (always an index) - let (mut first_minimizers, header) = load_minimizer_hashes(&first)?; + let (first_minimizers, header) = load_minimizer_hashes(&Some(first), &None)?; + if first_minimizers.is_none() { + return Err(anyhow::anyhow!("Failed to load first index file")); + } + let mut first_minimizers = first_minimizers.unwrap(); eprintln!("First index: loaded {} minimizers", first_minimizers.len()); // Guess if second file is an index or FASTX file @@ -434,7 +467,12 @@ pub fn diff>( return Ok(()); } else { // Try to load as index file first - if let Ok((second_minimizers, second_header)) = load_minimizer_hashes(&second) { + if let Ok((second_minimizers, second_header)) = load_minimizer_hashes(&Some(second), &None) + { + if second_minimizers.is_none() { + return Err(anyhow::anyhow!("Failed to load second index file")); + } + let second_minimizers = second_minimizers.unwrap(); // Second file is an index file eprintln!( "Second index: loaded {} minimizers", @@ -508,11 +546,15 @@ pub fn diff>( } /// Show info about an index -pub fn info>(index_path: P) -> Result<()> { +pub fn info(index_path: &PathBuf) -> Result<()> { let start_time = Instant::now(); // Load index file - let (minimizers, header) = load_minimizer_hashes(&index_path)?; + let (minimizers, header) = load_minimizer_hashes(&Some(index_path), &None)?; + if minimizers.is_none() { + return Err(anyhow::anyhow!("Failed to load index file")); + } + let minimizers = minimizers.unwrap(); // Show index info eprintln!("Index information:"); @@ -528,8 +570,8 @@ pub fn info>(index_path: P) -> Result<()> { } /// Combine minimizer indexes (set union) -pub fn union>( - inputs: &[P], +pub fn union( + inputs: &[PathBuf], output: Option<&PathBuf>, capacity_millions: Option, ) -> Result<()> { @@ -601,7 +643,11 @@ pub fn union>( // Now load and merge all indexes for (i, path) in inputs.iter().enumerate() { - let (minimizers, _) = load_minimizer_hashes(path)?; + let (minimizers, _) = load_minimizer_hashes(&Some(path), &None)?; + if minimizers.is_none() { + return Err(anyhow::anyhow!("Failed to load index file: {:?}", path)); + } + let minimizers = minimizers.unwrap(); let before_count = all_minimizers.len(); // Merge minimizers (set union) @@ -617,7 +663,7 @@ pub fn union>( ); } - write_minimizers(&all_minimizers, header, output)?; + write_minimizers(&all_minimizers, &header, output)?; let total_time = 
start_time.elapsed(); eprintln!( diff --git a/src/lib.rs b/src/lib.rs index 021a27a..bc1d73b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -11,9 +11,18 @@ pub mod filter; pub mod index; pub mod minimizers; +#[cfg(feature = "server")] +pub mod server_common; + +mod filter_common; +#[cfg(not(feature = "server"))] +mod local_filter; +#[cfg(feature = "server")] +mod remote_filter; // Re-export the important structures and functions for library users -pub use filter::{FilterSummary, run as run_filter}; +pub use filter::run as run_filter; +pub use filter_common::FilterSummary; pub use index::{ IndexHeader, build as build_index, diff as diff_index, info as index_info, union as union_index, }; @@ -23,11 +32,14 @@ pub use minimizers::{ use anyhow::Result; use rustc_hash::FxHashSet; -use std::path::{Path, PathBuf}; +use std::path::PathBuf; pub struct FilterConfig<'a> { /// Minimizer index file path - pub minimizers_path: &'a Path, + pub minimizers_path: Option<&'a PathBuf>, + + /// Server address (if any) + pub server_address: Option, /// Path to input fastx file (or - for stdin) pub input_path: &'a str, @@ -73,9 +85,10 @@ pub struct FilterConfig<'a> { } impl<'a> FilterConfig<'a> { - pub fn new(minimizers_path: &'a Path) -> Self { + pub fn new(minimizers_path: Option<&'a PathBuf>, server_address: Option) -> Self { Self { minimizers_path, + server_address, input_path: "-", input2_path: None, output_path: "-", @@ -197,9 +210,9 @@ pub struct IndexConfig { impl IndexConfig { /// Create a new index configuration with the specified input path - pub fn new>(input_path: P) -> Self { + pub fn new(input_path: PathBuf) -> Self { Self { - input_path: input_path.as_ref().to_path_buf(), + input_path: input_path, kmer_length: DEFAULT_KMER_LENGTH, window_size: DEFAULT_WINDOW_SIZE, output_path: None, @@ -223,8 +236,8 @@ impl IndexConfig { } /// Set output path - pub fn with_output>(mut self, output_path: P) -> Self { - self.output_path = Some(output_path.as_ref().to_path_buf()); + pub fn with_output(mut self, output_path: PathBuf) -> Self { + self.output_path = Some(output_path); self } @@ -258,8 +271,8 @@ impl IndexConfig { } } -pub fn load_minimizers>(path: P) -> Result<(FxHashSet, index::IndexHeader)> { - index::load_minimizer_hashes(&path) +pub fn load_minimizers(path: &PathBuf) -> Result<(Option>, index::IndexHeader)> { + index::load_minimizer_hashes(&Some(path), &None) } pub fn write_minimizers( diff --git a/src/local_filter.rs b/src/local_filter.rs new file mode 100644 index 0000000..6a4b708 --- /dev/null +++ b/src/local_filter.rs @@ -0,0 +1,874 @@ +use crate::FilterSummary; +use crate::filter_common::{ + get_minimizer_hashes_and_positions, get_paired_minimizer_hashes_and_positions, + get_summary_index, meets_filtering_criteria, pair_matches, sequence_matches, +}; +use crate::{FilterConfig, index::load_minimizer_hashes}; +use anyhow::{Context, Result}; +use flate2::write::GzEncoder; +use indicatif::{ProgressBar, ProgressDrawTarget, ProgressStyle}; +use liblzma::write::XzEncoder; +use paraseq::Record; +use paraseq::fastx::Reader; +use paraseq::parallel::{ + InterleavedParallelProcessor, InterleavedParallelReader, PairedParallelProcessor, + PairedParallelReader, ParallelProcessor, ParallelReader, +}; +use parking_lot::Mutex; +use rustc_hash::FxHashSet; +use std::fs::{File, OpenOptions}; +use std::io::{self, BufWriter, Write}; +use std::sync::Arc; +use std::time::Instant; +use zstd::stream::write::Encoder as ZstdEncoder; + +const OUTPUT_BUFFER_SIZE: usize = 8 * 1024 * 1024; // Opt: 8MB output buffer +const 
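With the constructor change above, library callers choose local or server mode up front. A minimal sketch assuming the crate is consumed as the `deacon` library; the index path and server URL are invented, and both runs read from stdin and write to stdout by default:

use std::path::PathBuf;
use deacon::{FilterConfig, run_filter};

fn main() -> anyhow::Result<()> {
    // Local mode: filter stdin against an on-disk index.
    let index = PathBuf::from("ref.idx"); // hypothetical
    run_filter(&FilterConfig::new(Some(&index), None))?;

    // Server mode (server feature builds): no local index, just an address.
    run_filter(&FilterConfig::new(None, Some("http://localhost:8080".into())))
}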
DEFAULT_BUFFER_SIZE: usize = 64 * 1024; + +type BoxedWriter = Box; + +/// Config for FilterProcessor +struct FilterProcessorConfig { + abs_threshold: usize, + rel_threshold: f64, + prefix_length: usize, + deplete: bool, + rename: bool, + debug: bool, +} + +/// Create a paraseq reader from optional path (stdin if None or "-") +fn create_paraseq_reader(path: Option<&str>) -> Result>> { + match path { + None | Some("-") => { + let stdin_reader = Box::new(std::io::stdin()) as Box; + Reader::new(stdin_reader) + .map_err(|e| anyhow::anyhow!("Failed to create stdin reader: {}", e)) + } + Some(p) => { + let (reader, _format) = niffler::send::from_path(p) + .map_err(|e| anyhow::anyhow!("Failed to open file {}: {}", p, e))?; + Reader::new(reader) + .map_err(|e| anyhow::anyhow!("Failed to create reader for {}: {}", p, e)) + } + } +} + +/// Format a single record into a buffer (FASTA/FASTQ format) +/// +/// `seq` is the newline-free sequence corresponding to the record, obtained from `record.seq()`. +fn format_record_to_buffer( + record: &R, + seq: &[u8], + counter: u64, + rename: bool, + buffer: &mut Vec, +) -> Result<()> { + let is_fasta = record.qual().is_none(); + + // Header line + buffer.write_all(if is_fasta { b">" } else { b"@" })?; + if rename { + buffer.extend_from_slice(counter.to_string().as_bytes()); + } else { + buffer.extend_from_slice(record.id()); + } + buffer.write_all(b"\n")?; + + // Sequence line + buffer.extend_from_slice(seq); + + if is_fasta { + buffer.write_all(b"\n")?; + } else { + // FASTQ: plus line and quality + buffer.write_all(b"\n+\n")?; + if let Some(qual) = record.qual() { + buffer.extend_from_slice(qual); + } + buffer.write_all(b"\n")?; + } + Ok(()) +} + +/// Validate compression level for the given format +fn validate_compression_level(level: u8, min: u8, max: u8, format: &str) -> Result<()> { + if level < min || level > max { + Err(anyhow::anyhow!( + "Invalid {} compression level {}. 
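For reference, format_record_to_buffer above emits minimal four-line FASTQ (or two-line FASTA) records. A hand-built equivalent of one FASTQ record, with invented id, bases and qualities:

#[test]
fn fastq_record_layout() {
    let mut buf: Vec<u8> = Vec::new();
    buf.extend_from_slice(b"@read_1\n"); // '>' instead of '@' when qual() is None
    buf.extend_from_slice(b"ACGTACGT\n"); // record sequence
    buf.extend_from_slice(b"+\nIIIIIIII\n"); // separator and qualities (FASTQ only)
    assert_eq!(buf, b"@read_1\nACGTACGT\n+\nIIIIIIII\n");
}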
Must be between {} and {}.", + format, + level, + min, + max + )) + } else { + Ok(()) + } +} + +// Return a file writer appropriate for the output path extension +fn get_writer(output_path: &str, compression_level: u8) -> Result { + if output_path == "-" { + return Ok(Box::new(BufWriter::with_capacity( + OUTPUT_BUFFER_SIZE, + io::stdout(), + ))); + } + + let file = OpenOptions::new() + .write(true) + .create(true) + .truncate(true) + .open(output_path) + .context(format!("Failed to create output file: {}", output_path))?; + + let buffered_file = BufWriter::with_capacity(OUTPUT_BUFFER_SIZE, file); + + match output_path { + p if p.ends_with(".gz") => { + validate_compression_level(compression_level, 1, 9, "gzip")?; + Ok(Box::new(GzEncoder::new( + buffered_file, + flate2::Compression::new(compression_level as u32), + ))) + } + p if p.ends_with(".zst") => { + validate_compression_level(compression_level, 1, 22, "zstd")?; + Ok(Box::new(ZstdEncoder::new( + buffered_file, + compression_level as i32, + )?)) + } + p if p.ends_with(".xz") => { + validate_compression_level(compression_level, 0, 9, "xz")?; + Ok(Box::new(XzEncoder::new( + buffered_file, + compression_level as u32, + ))) + } + _ => Ok(Box::new(buffered_file)), + } +} + +#[derive(Clone)] +struct FilterProcessor { + // Minimizer matching parameters + minimizer_hashes: Arc>, + kmer_length: u8, + window_size: u8, + abs_threshold: usize, + rel_threshold: f64, + prefix_length: usize, + deplete: bool, + rename: bool, + debug: bool, + + // Local buffers + local_buffer: Vec, + local_buffer2: Vec, // Second buffer for paired output + local_stats: ProcessingStats, + + // Global state + global_writer: Arc>, + global_writer2: Option>>, + global_stats: Arc>, + spinner: Option>>, + filtering_start_time: Instant, +} + +#[derive(Clone, Default)] +struct ProcessingStats { + total_seqs: u64, + filtered_seqs: u64, + total_bp: u64, + output_bp: u64, + filtered_bp: u64, + output_seq_counter: u64, +} + +impl FilterProcessor { + fn new( + minimizer_hashes: Arc>, + kmer_length: u8, + window_size: u8, + config: &FilterProcessorConfig, + writer: BoxedWriter, + writer2: Option, + spinner: Option>>, + filtering_start_time: Instant, + ) -> Self { + Self { + minimizer_hashes, + kmer_length, + window_size, + abs_threshold: config.abs_threshold, + rel_threshold: config.rel_threshold, + prefix_length: config.prefix_length, + deplete: config.deplete, + rename: config.rename, + debug: config.debug, + local_buffer: Vec::with_capacity(DEFAULT_BUFFER_SIZE), + local_buffer2: Vec::with_capacity(DEFAULT_BUFFER_SIZE), + local_stats: ProcessingStats::default(), + global_writer: Arc::new(Mutex::new(writer)), + global_writer2: writer2.map(|w| Arc::new(Mutex::new(w))), + global_stats: Arc::new(Mutex::new(ProcessingStats::default())), + spinner, + filtering_start_time, + } + } + + fn should_keep_sequence(&mut self, seq: &[u8]) -> (bool, usize, usize, Vec) { + let (minimizer_values, positions, effective_seq) = get_minimizer_hashes_and_positions( + seq, + self.prefix_length, + self.kmer_length, + self.window_size, + ); + + let num_minimizers = minimizer_values.len(); + + let (hit_count, hit_kmers) = sequence_matches( + &self.minimizer_hashes, + &minimizer_values, + &positions, + effective_seq, + self.kmer_length, + self.debug, + ); + + ( + meets_filtering_criteria( + hit_count, + num_minimizers, + self.abs_threshold, + self.rel_threshold, + self.deplete, + ), + hit_count, + num_minimizers, + hit_kmers, + ) + } + + fn should_keep_pair(&mut self, seq1: &[u8], seq2: &[u8]) -> (bool, 
usize, usize, Vec) { + let (all_hashes, all_positions, all_sequences) = get_paired_minimizer_hashes_and_positions( + seq1, + seq2, + self.prefix_length, + self.kmer_length, + self.window_size, + ); + + let total_minimizers = all_hashes.len(); + let (pair_hit_count, hit_kmers) = pair_matches( + &all_hashes, + &all_positions, + &all_sequences, + &self.minimizer_hashes, + self.kmer_length, + self.debug, + ); + + ( + meets_filtering_criteria( + pair_hit_count, + total_minimizers, + self.abs_threshold, + self.rel_threshold, + self.deplete, + ), + pair_hit_count, + total_minimizers, + hit_kmers, + ) + } + + fn write_record(&mut self, record: &Rf, seq: &[u8]) -> Result<()> { + self.local_stats.output_seq_counter += 1; + format_record_to_buffer( + record, + seq, + self.local_stats.output_seq_counter, + self.rename, + &mut self.local_buffer, + ) + } + + fn write_record_to_buffer2(&mut self, record: &Rf, seq: &[u8]) -> Result<()> { + self.local_stats.output_seq_counter += 1; + format_record_to_buffer( + record, + seq, + self.local_stats.output_seq_counter, + self.rename, + &mut self.local_buffer2, + ) + } + + fn update_spinner(&self) { + if let Some(ref spinner) = self.spinner { + let stats = self.global_stats.lock(); + let elapsed = self.filtering_start_time.elapsed(); + let seqs_per_sec = stats.total_seqs as f64 / elapsed.as_secs_f64(); + let bp_per_sec = stats.total_bp as f64 / elapsed.as_secs_f64(); + let mbp_per_sec = bp_per_sec / 1_000_000.0; + + let output_seqs = stats.total_seqs - stats.filtered_seqs; + let output_proportion = if stats.total_seqs > 0 { + output_seqs as f64 / stats.total_seqs as f64 + } else { + 0.0 + }; + + let output_bp_proportion = if stats.total_bp > 0 { + stats.output_bp as f64 / stats.total_bp as f64 + } else { + 0.0 + }; + + spinner.lock().set_message(format!( + "Retained {}/{} sequences ({:.2}%), {}/{} bp ({:.2}%). 
{:.0} seqs/s ({:.1} Mbp/s)", + output_seqs, + stats.total_seqs, + output_proportion * 100.0, + stats.output_bp, + stats.total_bp, + output_bp_proportion * 100.0, + seqs_per_sec, + mbp_per_sec + )); + } + } +} + +impl ParallelProcessor for FilterProcessor { + fn process_record(&mut self, record: Rf) -> paraseq::parallel::Result<()> { + let seq = record.seq(); + self.local_stats.total_seqs += 1; + self.local_stats.total_bp += seq.len() as u64; + + let (should_keep, hit_count, total_minimizers, hit_kmers) = self.should_keep_sequence(&seq); + + // Show debug info for sequences with hits + if self.debug { + eprintln!( + "DEBUG: {} hits={}/{} keep={} kmers=[{}]", + String::from_utf8_lossy(record.id()), + hit_count, + total_minimizers, + should_keep, + hit_kmers.join(",") + ); + } + + if should_keep { + self.local_stats.output_bp += seq.len() as u64; + self.write_record(&record, &seq)?; + } else { + self.local_stats.filtered_seqs += 1; + self.local_stats.filtered_bp += seq.len() as u64; + } + + Ok(()) + } + + fn on_batch_complete(&mut self) -> paraseq::parallel::Result<()> { + // Write buffer to output + if !self.local_buffer.is_empty() { + let mut global_writer = self.global_writer.lock(); + global_writer.write_all(&self.local_buffer)?; + global_writer.flush()?; + } + + // Clear buffer after releasing the lock + self.local_buffer.clear(); + + // Update global stats + { + let mut stats = self.global_stats.lock(); + stats.total_seqs += self.local_stats.total_seqs; + stats.filtered_seqs += self.local_stats.filtered_seqs; + stats.total_bp += self.local_stats.total_bp; + stats.output_bp += self.local_stats.output_bp; + stats.filtered_bp += self.local_stats.filtered_bp; + stats.output_seq_counter += self.local_stats.output_seq_counter; + } + + // Update spinner + self.update_spinner(); + + // Reset local stats + self.local_stats = ProcessingStats::default(); + + Ok(()) + } +} + +impl InterleavedParallelProcessor for FilterProcessor { + fn process_interleaved_pair( + &mut self, + record1: Rf, + record2: Rf, + ) -> paraseq::parallel::Result<()> { + let seq1 = record1.seq(); + let seq2 = record2.seq(); + + self.local_stats.total_seqs += 2; + self.local_stats.total_bp += (seq1.len() + seq2.len()) as u64; + + let (should_keep, hit_count, total_minimizers, hit_kmers) = + self.should_keep_pair(&seq1, &seq2); + + // Debug info for interleaved pairs + if self.debug && hit_count > 0 { + eprintln!( + "DEBUG: {}/{} hits={}/{} keep={} kmers=[{}]", + String::from_utf8_lossy(record1.id()), + String::from_utf8_lossy(record2.id()), + hit_count, + total_minimizers, + should_keep, + hit_kmers.join(",") + ); + } + + if should_keep { + self.local_stats.output_bp += (seq1.len() + seq2.len()) as u64; + + // Write both records to output + self.write_record(&record1, &seq1)?; + self.write_record(&record2, &seq2)?; + } else { + self.local_stats.filtered_seqs += 2; + self.local_stats.filtered_bp += (seq1.len() + seq2.len()) as u64; + } + + Ok(()) + } + + fn on_batch_complete(&mut self) -> paraseq::parallel::Result<()> { + // Write buffer to output + if !self.local_buffer.is_empty() { + let mut global_writer = self.global_writer.lock(); + global_writer.write_all(&self.local_buffer)?; + global_writer.flush()?; + } + + // Clear buffer after releasing the lock for better performance + self.local_buffer.clear(); + + // Update global stats + { + let mut stats = self.global_stats.lock(); + stats.total_seqs += self.local_stats.total_seqs; + stats.filtered_seqs += self.local_stats.filtered_seqs; + stats.total_bp += 
self.local_stats.total_bp; + stats.output_bp += self.local_stats.output_bp; + stats.filtered_bp += self.local_stats.filtered_bp; + stats.output_seq_counter += self.local_stats.output_seq_counter; + } + + // Update spinner + self.update_spinner(); + + // Reset local stats + self.local_stats = ProcessingStats::default(); + + Ok(()) + } +} + +impl PairedParallelProcessor for FilterProcessor { + fn process_record_pair( + &mut self, + record1: Rf, + record2: Rf, + ) -> paraseq::parallel::Result<()> { + let seq1 = record1.seq(); + let seq2 = record2.seq(); + self.local_stats.total_seqs += 2; + self.local_stats.total_bp += (seq1.len() + seq2.len()) as u64; + + let (should_keep, hit_count, total_minimizers, hit_kmers) = + self.should_keep_pair(&seq1, &seq2); + + // Debug info for paired reads + if self.debug && hit_count > 0 { + eprintln!( + "DEBUG: {}/{} hits={}/{} keep={} kmers=[{}]", + String::from_utf8_lossy(record1.id()), + String::from_utf8_lossy(record2.id()), + hit_count, + total_minimizers, + should_keep, + hit_kmers.join(",") + ); + } + + if should_keep { + self.local_stats.output_bp += (seq1.len() + seq2.len()) as u64; + + // Write to appropriate writers + if self.global_writer2.is_some() { + // Separate outputs + self.write_record(&record1, &seq1)?; + self.write_record_to_buffer2(&record2, &seq2)?; + } else { + // Interleaved output + self.write_record(&record1, &seq1)?; + self.write_record(&record2, &seq2)?; + } + } else { + self.local_stats.filtered_seqs += 2; + self.local_stats.filtered_bp += (seq1.len() + seq2.len()) as u64; + } + + Ok(()) + } + + fn on_batch_complete(&mut self) -> paraseq::parallel::Result<()> { + if let Some(ref writer2) = self.global_writer2 { + // Atomic paired batch writing + if !self.local_buffer.is_empty() || !self.local_buffer2.is_empty() { + let mut writer1 = self.global_writer.lock(); + let mut writer2 = writer2.lock(); + + writer1.write_all(&self.local_buffer)?; + writer1.flush()?; + writer2.write_all(&self.local_buffer2)?; + writer2.flush()?; + } + } else { + // Interleaved output + if !self.local_buffer.is_empty() { + let mut writer = self.global_writer.lock(); + writer.write_all(&self.local_buffer)?; + writer.flush()?; + } + } + + self.local_buffer.clear(); + self.local_buffer2.clear(); + + // Update global stats + { + let mut stats = self.global_stats.lock(); + stats.total_seqs += self.local_stats.total_seqs; + stats.filtered_seqs += self.local_stats.filtered_seqs; + stats.total_bp += self.local_stats.total_bp; + stats.output_bp += self.local_stats.output_bp; + stats.filtered_bp += self.local_stats.filtered_bp; + stats.output_seq_counter += self.local_stats.output_seq_counter; + } + + // Update spinner + self.update_spinner(); + + // Reset local stats + self.local_stats = ProcessingStats::default(); + + Ok(()) + } +} + +pub fn run(config: &FilterConfig) -> Result<()> { + let start_time = Instant::now(); + let version: String = env!("CARGO_PKG_VERSION").to_string(); + let tool_version = format!("deacon {}", version); + + // Enable quiet mode when debug enabled + let quiet = config.quiet || config.debug; + + // Configure thread pool if nonzero + if config.threads > 0 { + rayon::ThreadPoolBuilder::new() + .num_threads(config.threads) + .build_global() + .context("Failed to initialize thread pool")?; + } + + let mode = if config.deplete { "deplete" } else { "search" }; + + let mut input_type = String::new(); + let mut options = Vec::::new(); + let paired_stdin = config.input_path == "-" + && config.input2_path.is_some() + && config.input2_path.unwrap() 
== "-"; + if paired_stdin { + input_type.push_str("interleaved"); + } else if config.input2_path.is_some() { + input_type.push_str("paired"); + } else { + input_type.push_str("single"); + } + options.push(format!( + "abs_threshold={}, rel_threshold={}", + config.abs_threshold, config.rel_threshold + )); + if config.prefix_length > 0 { + options.push(format!("prefix_length={}", config.prefix_length)); + } + if config.rename { + options.push("rename".to_string()); + } + if config.threads > 0 { + options.push(format!("threads={}", config.threads)); + } + + if !quiet { + eprintln!( + "Deacon v{}; mode: {}; input: {}; options: {}", + version, + mode, + input_type, + options.join(", ") + ); + } + + // Load minimizer hashes and parse header + let (minimizer_hashes, header) = load_minimizer_hashes(&config.minimizers_path, &None)?; + let minimizer_hashes = Arc::new(minimizer_hashes.unwrap()); + + let kmer_length = header.kmer_length(); + let window_size = header.window_size(); + + let load_time = start_time.elapsed(); + if !quiet { + eprintln!( + "Loaded index (k={}, w={}) in {:.2?}", + kmer_length, window_size, load_time + ); + } + + // Create appropriate writer(s) based on output path(s) + let writer = get_writer(config.output_path, config.compression_level)?; + let writer2 = if let (Some(output2), Some(_)) = (config.output2_path, config.input2_path) { + Some(get_writer(output2, config.compression_level)?) + } else { + None + }; + + // Progress bar setup if not quiet + let spinner = if !quiet { + let pb = ProgressBar::with_draw_target(None, ProgressDrawTarget::stderr()); + pb.set_style( + ProgressStyle::default_spinner() + .tick_strings(&["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"]) + .template("{msg}")?, + ); + pb.set_message("Filtering"); + Some(Arc::new(Mutex::new(pb))) + } else { + None + }; + + // Start timer for rate calculation + let filtering_start_time = Instant::now(); + + // Create processor + let processor_config = FilterProcessorConfig { + abs_threshold: config.abs_threshold, + rel_threshold: config.rel_threshold, + prefix_length: config.prefix_length, + deplete: config.deplete, + rename: config.rename, + debug: config.debug, + }; + let processor = FilterProcessor::new( + minimizer_hashes, + kmer_length, + window_size, + &processor_config, + writer, + writer2, + spinner.clone(), + filtering_start_time, + ); + + // Process based on input type + let num_threads = if config.threads == 0 { + rayon::current_num_threads() + } else { + config.threads + }; + + if paired_stdin { + // Interleaved paired from stdin - use native interleaved processor + let reader = create_paraseq_reader(Some("-"))?; + reader.process_parallel_interleaved(processor.clone(), num_threads)?; + } else if let Some(input2_path) = config.input2_path { + // Paired files + let r1_reader = create_paraseq_reader(Some(config.input_path))?; + let r2_reader = create_paraseq_reader(Some(input2_path))?; + r1_reader.process_parallel_paired(r2_reader, processor.clone(), num_threads)?; + } else { + // Single file or stdin + let reader = create_paraseq_reader(Some(config.input_path))?; + reader.process_parallel(processor.clone(), num_threads)?; + } + + let final_stats = processor.global_stats.lock(); + let total_seqs = final_stats.total_seqs; + let filtered_seqs = final_stats.filtered_seqs; + let total_bp = final_stats.total_bp; + let output_bp = final_stats.output_bp; + let filtered_bp = final_stats.filtered_bp; + + drop(final_stats); // Release lock + + // Flush writers - they should auto-flush on drop + 
drop(processor.global_writer); + if let Some(w2) = processor.global_writer2 { + drop(w2); + } + + let total_time = start_time.elapsed(); + let seqs_per_sec = total_seqs as f64 / total_time.as_secs_f64(); + let bp_per_sec = total_bp as f64 / total_time.as_secs_f64(); + let mbp_per_sec = bp_per_sec / 1_000_000.0; + + // Calculate proportions + let filtered_proportion = if total_seqs > 0 { + filtered_seqs as f64 / total_seqs as f64 + } else { + 0.0 + }; + + let filtered_bp_proportion = if total_bp > 0 { + filtered_bp as f64 / total_bp as f64 + } else { + 0.0 + }; + + let output_seqs = total_seqs - filtered_seqs; + let output_seq_proportion = if total_seqs > 0 { + output_seqs as f64 / total_seqs as f64 + } else { + 0.0 + }; + + let output_bp_proportion = if total_bp > 0 { + output_bp as f64 / total_bp as f64 + } else { + 0.0 + }; + + // Finish and clear spinner - disable it completely + if let Some(ref spinner) = spinner { + let pb = spinner.lock(); + pb.set_draw_target(ProgressDrawTarget::hidden()); + pb.finish_and_clear(); + } + + if !quiet { + eprintln!( + "Retained {}/{} sequences ({:.3}%), {}/{} bp ({:.3}%) in {:.2?}. Speed: {:.0} seqs/s ({:.1} Mbp/s)", + output_seqs, + total_seqs, + output_seq_proportion * 100.0, + output_bp, + total_bp, + output_bp_proportion * 100.0, + total_time, + seqs_per_sec, + mbp_per_sec + ); + } + + // Build and write JSON summary if path provided + if let Some(summary_file) = config.summary_path { + let seqs_out = total_seqs - filtered_seqs; + + let summary = FilterSummary { + version: tool_version, + index: get_summary_index(&config.minimizers_path, &config.server_address), + input: config.input_path.to_string(), + input2: config.input2_path.map(|s| s.to_string()), + output: config.output_path.to_string(), + output2: config.output2_path.map(|s| s.to_string()), + k: kmer_length, + w: window_size, + abs_threshold: config.abs_threshold, + rel_threshold: config.rel_threshold, + prefix_length: config.prefix_length, + deplete: config.deplete, + rename: config.rename, + seqs_in: total_seqs as u64, + seqs_out: seqs_out as u64, + seqs_out_proportion: output_seq_proportion, + seqs_removed: filtered_seqs as u64, + seqs_removed_proportion: filtered_proportion, + bp_in: total_bp as u64, + bp_out: output_bp as u64, + bp_out_proportion: output_bp_proportion, + bp_removed: filtered_bp as u64, + bp_removed_proportion: filtered_bp_proportion, + time: total_time.as_secs_f64(), + seqs_per_second: seqs_per_sec as u64, + bp_per_second: bp_per_sec as u64, + }; + + let file = File::create(summary_file) + .context(format!("Failed to create summary: {:?}", summary_file))?; + let writer = BufWriter::new(file); + + serde_json::to_writer_pretty(writer, &summary).context("Failed to write summary")?; + + if !quiet { + eprintln!("Summary saved to {:?}", summary_file); + } + } + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_filter_summary() { + let summary = FilterSummary { + version: "deacon 0.1.0".to_string(), + index: "test.idx".to_string(), + input: "test.fastq".to_string(), + input2: Some("test2.fastq".to_string()), + output: "output.fastq".to_string(), + output2: Some("output2.fastq".to_string()), + k: 31, + w: 21, + abs_threshold: 1, + rel_threshold: 0.01, + prefix_length: 0, + deplete: false, + rename: false, + seqs_in: 100, + seqs_out: 90, + seqs_out_proportion: 0.9, + seqs_removed: 10, + seqs_removed_proportion: 0.1, + bp_in: 10000, + bp_out: 9000, + bp_out_proportion: 0.9, + bp_removed: 1000, + bp_removed_proportion: 0.1, + time: 1.5, + 
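Since FilterSummary derives Serialize without renaming attributes, the summary JSON keys are the struct field names verbatim. The following fragment could be appended to the round-trip test being built here (serde_json::to_string emits compact output with no spaces):

// Keys match the struct fields one-to-one:
// assert!(json.contains("\"version\":\"deacon 0.1.0\""));
// assert!(json.contains("\"seqs_in\":100"));
// assert!(json.contains("\"bp_out_proportion\":0.9"));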
seqs_per_second: 66,
+            bp_per_second: 6666,
+        };
+
+        let json = serde_json::to_string(&summary).unwrap();
+        let parsed: FilterSummary = serde_json::from_str(&json).unwrap();
+
+        assert_eq!(parsed.version, "deacon 0.1.0");
+        assert_eq!(parsed.seqs_in, 100);
+        assert_eq!(parsed.seqs_removed_proportion, 0.1);
+        assert_eq!(parsed.seqs_out_proportion, 0.9);
+        assert_eq!(parsed.bp_out_proportion, 0.9);
+        assert_eq!(parsed.input, "test.fastq");
+        assert_eq!(parsed.input2, Some("test2.fastq".to_string()));
+        assert_eq!(parsed.output, "output.fastq");
+        assert_eq!(parsed.output2, Some("output2.fastq".to_string()));
+    }
+}
diff --git a/src/main.rs b/src/main.rs
index 3d56a97..60ba24a 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -251,7 +251,8 @@ fn main() -> Result<()> {
             }
 
             let config = FilterConfig {
-                minimizers_path: minimizers,
+                minimizers_path: Some(minimizers),
+                server_address: None,
                 input_path: input,
                 input2_path: input2.as_deref(),
                 output_path: output,
diff --git a/src/minimizers.rs b/src/minimizers.rs
index 677100a..3f66314 100644
--- a/src/minimizers.rs
+++ b/src/minimizers.rs
@@ -189,7 +189,6 @@ pub fn fill_minimizer_hashes(
         );
     }
 }
-
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/src/remote_filter.rs b/src/remote_filter.rs
new file mode 100644
index 0000000..bfb54db
--- /dev/null
+++ b/src/remote_filter.rs
@@ -0,0 +1,1561 @@
+use crate::FilterConfig;
+use crate::FilterSummary;
+use crate::filter_common::get_minimizer_hashes_and_positions;
+use crate::filter_common::get_paired_minimizer_hashes_and_positions;
+use crate::filter_common::pair_matches;
+use crate::filter_common::sequence_matches;
+use crate::filter_common::{get_summary_index, meets_filtering_criteria};
+use crate::index::load_minimizer_hashes;
+use crate::minimizers::fill_minimizer_hashes;
+#[cfg(feature = "server")]
+use crate::server_common::{FilterResponse, PairedFilterRequest, UnpairedFilterRequest};
+use anyhow::{Context, Result};
+use flate2::Compression;
+use flate2::write::GzEncoder;
+use indicatif::{ProgressBar, ProgressDrawTarget, ProgressStyle};
+use liblzma::write::XzEncoder;
+use needletail::parse_fastx_file;
+use needletail::parse_fastx_stdin;
+use needletail::parser::Format;
+use rayon::prelude::*;
+#[cfg(feature = "server")]
+use reqwest::blocking::Client;
+use rustc_hash::FxHashSet;
+use std::fs::{File, OpenOptions};
+use std::io::{self, BufWriter, Write};
+use std::time::Instant;
+use zstd::stream::write::Encoder as ZstdEncoder;
+
+const OUTPUT_BUFFER_SIZE: usize = 8 * 1024 * 1024; // Opt: 8MB output buffer
+
+/// Data structure to hold a fastq record
+struct RecordData {
+    id: Vec<u8>,
+    seq: Vec<u8>,
+    qual: Option<Vec<u8>>,
+    format: Format,
+}
+trait FastxWriter: Write {
+    fn flush_all(&mut self) -> io::Result<()>;
+}
+
+trait CompressionEncoder: Write {
+    fn finish(self: Box<Self>) -> io::Result<()>;
+}
+
+#[derive(Debug, Clone, Copy)]
+enum CompressionFormat {
+    None,
+    Gzip,
+    Zstd,
+    Xz,
+}
+
+impl CompressionFormat {
+    fn from_extension(path: &str) -> Self {
+        if path.ends_with(".gz") {
+            Self::Gzip
+        } else if path.ends_with(".zst") {
+            Self::Zstd
+        } else if path.ends_with(".xz") {
+            Self::Xz
+        } else {
+            Self::None
+        }
+    }
+
+    fn validate_compression_level(&self, level: u8) -> Result<()> {
+        match self {
+            Self::None => Ok(()),
+            Self::Gzip => {
+                if !(1..=9).contains(&level) {
+                    Err(anyhow::anyhow!(
+                        "Invalid gzip compression level {}. Must be between 1 and 9.",
+                        level
+                    ))
+                } else {
+                    Ok(())
+                }
+            }
+            Self::Zstd => {
+                if !(1..=22).contains(&level) {
+                    Err(anyhow::anyhow!(
+                        "Invalid zstd compression level {}. Must be between 1 and 22.",
+                        level
+                    ))
+                } else {
+                    Ok(())
+                }
+            }
+            Self::Xz => {
+                if level > 9 {
+                    Err(anyhow::anyhow!(
+                        "Invalid xz compression level {}. Must be between 0 and 9.",
+                        level
+                    ))
+                } else {
+                    Ok(())
+                }
+            }
+        }
+    }
+}
+
+impl<W: Write> CompressionEncoder for GzEncoder<W> {
+    fn finish(mut self: Box<Self>) -> io::Result<()> {
+        self.try_finish()
+    }
+}
+
+impl<W: Write> CompressionEncoder for ZstdEncoder<'static, W> {
+    fn finish(self: Box<Self>) -> io::Result<()> {
+        (*self).finish().map(|_| ())
+    }
+}
+
+impl<W: Write> CompressionEncoder for XzEncoder<W> {
+    fn finish(self: Box<Self>) -> io::Result<()> {
+        (*self).finish().map(|_| ())
+    }
+}
+
+struct CompressedWriter {
+    encoder: Option<Box<dyn CompressionEncoder>>,
+}
+
+impl CompressedWriter {
+    fn new(encoder: Box<dyn CompressionEncoder>) -> Self {
+        Self {
+            encoder: Some(encoder),
+        }
+    }
+
+    fn uncompressed<W: Write>(writer: W) -> StandardWriter<W> {
+        StandardWriter(writer)
+    }
+}
+
+impl Write for CompressedWriter {
+    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
+        if let Some(encoder) = &mut self.encoder {
+            encoder.write(buf)
+        } else {
+            Err(io::Error::new(
+                io::ErrorKind::BrokenPipe,
+                "Writer has been closed",
+            ))
+        }
+    }
+
+    fn flush(&mut self) -> io::Result<()> {
+        if let Some(encoder) = &mut self.encoder {
+            encoder.flush()
+        } else {
+            Err(io::Error::new(
+                io::ErrorKind::BrokenPipe,
+                "Writer has been closed",
+            ))
+        }
+    }
+}
+
+impl FastxWriter for CompressedWriter {
+    fn flush_all(&mut self) -> io::Result<()> {
+        if let Some(encoder) = self.encoder.take() {
+            encoder.finish()?;
+        }
+        Ok(())
+    }
+}
+
+struct StandardWriter<W: Write>(W);
+
+impl<W: Write> Write for StandardWriter<W> {
+    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
+        self.0.write(buf)
+    }
+
+    fn flush(&mut self) -> io::Result<()> {
+        self.0.flush()
+    }
+}
+
+impl<W: Write> FastxWriter for StandardWriter<W> {
+    fn flush_all(&mut self) -> io::Result<()> {
+        self.flush()
+    }
+}
+
+// Return a file writer appropriate for the output path extension
+fn get_writer(output_path: &str, compression_level: u8) -> Result<Box<dyn FastxWriter>> {
+    if output_path == "-" {
+        // Write to stdout
+        let stdout = io::stdout();
+        let writer = BufWriter::with_capacity(OUTPUT_BUFFER_SIZE, stdout);
+        Ok(Box::new(CompressedWriter::uncompressed(writer)))
+    } else {
+        // Write to file with extension-appropriate encoder
+        let file = OpenOptions::new()
+            .write(true)
+            .create(true)
+            .truncate(true)
+            .open(output_path)
+            .context(format!("Failed to create output file: {output_path}"))?;
+
+        let buffered_file = BufWriter::with_capacity(OUTPUT_BUFFER_SIZE, file);
+        let format = CompressionFormat::from_extension(output_path);
+
+        // Validate compression level for the format
+        format.validate_compression_level(compression_level)?;
+
+        match format {
+            CompressionFormat::None => Ok(Box::new(CompressedWriter::uncompressed(buffered_file))),
+            CompressionFormat::Gzip => {
+                let encoder =
+                    GzEncoder::new(buffered_file, Compression::new(compression_level as u32));
+                Ok(Box::new(CompressedWriter::new(Box::new(encoder))))
+            }
+            CompressionFormat::Zstd => {
+                let encoder = ZstdEncoder::new(buffered_file, compression_level as i32)
+                    .context("Failed to create zstd encoder")?;
+                Ok(Box::new(CompressedWriter::new(Box::new(encoder))))
+            }
+            CompressionFormat::Xz => {
+                let encoder = XzEncoder::new(buffered_file, compression_level as u32);
+                Ok(Box::new(CompressedWriter::new(Box::new(encoder))))
+            }
+        }
+    }
+}
+
+fn unpaired_should_keep(
+    input_minimizers_and_positions: &Vec<(Vec<u64>, Vec<u32>, Vec<u8>)>,
+    kmer_length: u8,
+    index_minimizers: &FxHashSet<u64>,
+    abs_threshold: usize,
+    rel_threshold: f64,
+    deplete: bool,
+    debug: bool,
+) -> Vec<(bool, usize, usize, Vec<String>)> {
+    input_minimizers_and_positions
+        .par_iter()
+        .map(|(minimizers, positions, seq)| {
+            let (hit_count, hit_kmers) = sequence_matches(
+                index_minimizers,
+                minimizers,
+                positions,
+                seq,
+                kmer_length,
+                debug,
+            );
+            (
+                meets_filtering_criteria(
+                    hit_count,
+                    minimizers.len(),
+                    abs_threshold,
+                    rel_threshold,
+                    deplete,
+                ),
+                hit_count,
+                minimizers.len(),
+                hit_kmers,
+            )
+        })
+        .collect()
+}
+
+fn paired_should_keep(
+    input_minimizers_and_positions: &Vec<(Vec<u64>, Vec<u32>, Vec<&[u8]>)>,
+    kmer_length: u8,
+    index_minimizers: &FxHashSet<u64>,
+    abs_threshold: usize,
+    rel_threshold: f64,
+    deplete: bool,
+    debug: bool,
+) -> Vec<(bool, usize, usize, Vec<String>)> {
+    input_minimizers_and_positions
+        .par_iter()
+        .map(|(minimizers, positions, seq)| {
+            let (pair_hit_count, hit_kmers) = pair_matches(
+                minimizers,
+                positions,
+                seq,
+                index_minimizers,
+                kmer_length,
+                debug,
+            );
+
+            (
+                meets_filtering_criteria(
+                    pair_hit_count,
+                    minimizers.len(),
+                    abs_threshold,
+                    rel_threshold,
+                    deplete,
+                ),
+                pair_hit_count,
+                minimizers.len(),
+                hit_kmers,
+            )
+        })
+        .collect()
+}
+
+/// Given a set of input minimizers from unpaired reads, check if they should be output
+/// If index minimizers are provided, check locally.
+/// If not, send to server for checking. Requires the `server` feature to be enabled.
+pub fn check_single_inputs_should_be_output(
+    index_minimizers: &Option<FxHashSet<u64>>,
+    input_minimizers_and_positions: &Vec<(Vec<u64>, Vec<u32>, Vec<u8>)>,
+    _server_address: &Option<String>,
+    deplete: bool,
+    kmer_length: u8,
+    debug: bool,
+    abs_threshold: usize,
+    rel_threshold: f64,
+) -> Vec<(bool, usize, usize, Vec<String>)> {
+    // If index minimizers are provided, check if input matches locally
+    if let Some(index_minimizers) = index_minimizers {
+        unpaired_should_keep(
+            input_minimizers_and_positions,
+            kmer_length,
+            index_minimizers,
+            abs_threshold,
+            rel_threshold,
+            deplete,
+            debug,
+        )
+    } else {
+        // Else, send the input minimizers to the server for checking
+        #[cfg(feature = "server")]
+        {
+            if _server_address.is_none() {
+                panic!("Server address is required when using the server feature.");
+            }
+            let server_address = _server_address.as_ref().map(String::as_str).unwrap();
+            // Create a client to send the minimizers to the server
+            let client = Client::new();
+
+            // Send the minimizers as a POST request
+            let response = client
+                .post(server_address.to_owned() + "/should_output_unpaired")
+                .json(&UnpairedFilterRequest {
+                    input: input_minimizers_and_positions.to_vec(),
+                    abs_threshold,
+                    rel_threshold,
+                    deplete,
+                    kmer_length,
+                    debug,
+                })
+                .send()
+                .unwrap();
+
+            // Check if the response indicates a match
+            if response.status().is_success() {
+                response.json::<FilterResponse>().unwrap().should_output
+            } else {
+                panic!("Server returned an error: {}", response.status())
+            }
+        }
+        #[cfg(not(feature = "server"))]
+        {
+            panic!("Server feature is not enabled. Cannot check input against index.");
+        }
+    }
+}
+
+/// Given a set of input minimizers from paired reads, check if they should be output
+/// If index minimizers are provided, check locally.
+/// If not, send to server for checking. Requires the `server` feature to be enabled.
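+/// Both mates contribute to a single pooled minimizer set, so each pair is kept
+/// or discarded as a unit against the same abs/rel thresholds used for single reads.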
+pub fn check_paired_inputs_should_be_output(
+    index_minimizers: &Option<FxHashSet<u64>>,
+    input_minimizers_and_positions: &Vec<(Vec<u64>, Vec<u32>, Vec<&[u8]>)>,
+    _server_address: &Option<String>,
+    deplete: bool,
+    kmer_length: u8,
+    debug: bool,
+    abs_threshold: usize,
+    rel_threshold: f64,
+) -> Vec<(bool, usize, usize, Vec<String>)> {
+    // If index minimizers are provided, check if input matches locally
+    if let Some(index_minimizers) = index_minimizers {
+        paired_should_keep(
+            input_minimizers_and_positions,
+            kmer_length,
+            index_minimizers,
+            abs_threshold,
+            rel_threshold,
+            deplete,
+            debug,
+        )
+    } else {
+        // Else, send the input minimizers to the server for checking
+        #[cfg(feature = "server")]
+        {
+            use simd_minimizers::private::minimizers;
+
+            if _server_address.is_none() {
+                panic!("Server address is required when using the server feature.");
+            }
+            let server_address = _server_address.as_ref().map(String::as_str).unwrap();
+
+            // Quickly wrangle the seqs into vecs instead of slices so serde can cope
+            // Not perfect, but if it has to happen anywhere, here is the best
+            let input_minimizers_and_positions: Vec<(Vec<u64>, Vec<u32>, Vec<Vec<u8>>)> =
+                input_minimizers_and_positions
+                    .iter()
+                    .map(|(minimizers, positions, seqs)| {
+                        (
+                            minimizers.to_vec(),
+                            positions.to_vec(),
+                            seqs.iter().map(|s| s.to_vec()).collect(),
+                        )
+                    })
+                    .collect();
+
+            // Create a client to send the minimizers to the server
+            let client = Client::new();
+
+            // Send the minimizers as a POST request
+            let response = client
+                .post(server_address.to_owned() + "/should_output_unpaired")
+                .json(&PairedFilterRequest {
+                    input: input_minimizers_and_positions.to_vec(),
+                    abs_threshold,
+                    rel_threshold,
+                    deplete,
+                    kmer_length,
+                    debug,
+                })
+                .send()
+                .unwrap();
+
+            // Check if the response indicates a match
+            if response.status().is_success() {
+                response.json::<FilterResponse>().unwrap().should_output
+            } else {
+                panic!("Server returned an error: {}", response.status())
+            }
+        }
+        #[cfg(not(feature = "server"))]
+        {
+            panic!("Server feature is not enabled. Cannot check input against index.");
+        }
+    }
+}
+
+/// Run deacon filter with the provided parameters.
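+///
+/// Flow sketch: records are read in batches, minimizer hashes and positions are
+/// computed locally, and keep/discard decisions come either from an in-memory
+/// index or from a remote deacon server, depending on whether an index path or
+/// a server address was supplied in the config.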
+pub fn run(config: &FilterConfig) -> Result<()> {
+    let start_time = Instant::now();
+    let version: String = env!("CARGO_PKG_VERSION").to_string();
+    let tool_version = format!("deacon {version}");
+
+    // Configure thread pool if nonzero
+    if config.threads > 0 {
+        rayon::ThreadPoolBuilder::new()
+            .num_threads(config.threads)
+            .build_global()
+            .context("Failed to initialize thread pool")?;
+    }
+
+    let mode = if config.deplete { "deplete" } else { "search" };
+
+    let mut input_type = String::new();
+    let mut options = Vec::<String>::new();
+    let paired_stdin = config.input_path == "-"
+        && config.input2_path.is_some()
+        && config.input2_path.unwrap() == "-";
+    if paired_stdin {
+        input_type.push_str("interleaved");
+    } else if config.input2_path.is_some() {
+        input_type.push_str("paired");
+    } else {
+        input_type.push_str("single");
+    }
+    options.push(format!(
+        "abs_threshold={}, rel_threshold={}",
+        config.abs_threshold, config.rel_threshold
+    ));
+    if config.prefix_length > 0 {
+        options.push(format!("prefix_length={}", config.prefix_length));
+    }
+    if config.rename {
+        options.push("rename".to_string());
+    }
+    if config.threads > 0 {
+        options.push(format!("threads={}", config.threads));
+    }
+
+    eprintln!(
+        "Deacon v{}; mode: {}; input: {}; options: {}",
+        version,
+        mode,
+        input_type,
+        options.join(", ")
+    );
+
+    // Load minimizer hashes and parse header
+    let (minimizer_hashes, header) =
+        load_minimizer_hashes(&config.minimizers_path, &config.server_address)?;
+
+    let kmer_length = header.kmer_length();
+    let window_size = header.window_size();
+
+    let load_time = start_time.elapsed();
+    eprintln!("Loaded index (k={kmer_length}, w={window_size}) in {load_time:.2?}");
+
+    // Create the appropriate writer(s) based on the output path(s)
+    let mut writer = get_writer(config.output_path, config.compression_level)?;
+    let mut writer2 = if let (Some(output2), Some(_)) = (config.output2_path, config.input2_path) {
+        // Only create second writer if both output2 and input2 are specified
+        Some(get_writer(output2, config.compression_level)?)
+    } else {
+        None
+    };
+
+    // A progress bar would require a denominator, so let's spin
+    let spinner = ProgressBar::with_draw_target(None, ProgressDrawTarget::stderr());
+    spinner.set_style(
+        ProgressStyle::default_spinner()
+            .tick_strings(&[". ", ".. 
", "...", " ..", " .", " "]) + .template("{msg}{spinner} ")?, + ); + spinner.set_message("Filtering"); + + // Init counters + let mut total_seqs = 0; + let mut filtered_seqs = 0; + let mut total_bp = 0; + let mut output_bp = 0; + let mut filtered_bp = 0; + let mut output_seq_counter = 0; + + // Start timer for filtering rate calculation (excludes index loading time) + let filtering_start_time = Instant::now(); + + if paired_stdin { + process_interleaved_paired_seqs( + &minimizer_hashes, + &mut writer, + writer2.as_mut(), + config.abs_threshold, + config.rel_threshold, + config.prefix_length as u8, + kmer_length, + window_size, + config.deplete, + config.rename, + &mut total_seqs, + &mut filtered_seqs, + &mut total_bp, + &mut output_bp, + &mut filtered_bp, + &mut output_seq_counter, + &spinner, + filtering_start_time, + &config.server_address, + config.debug, + )?; + } else if let Some(input2_path) = config.input2_path { + process_paired_seqs( + &minimizer_hashes, + config.input_path, + input2_path, + &mut writer, + writer2.as_mut(), + config.abs_threshold, + config.rel_threshold, + config.prefix_length as u8, + kmer_length, + window_size, + config.deplete, + config.rename, + &mut total_seqs, + &mut filtered_seqs, + &mut total_bp, + &mut output_bp, + &mut filtered_bp, + &mut output_seq_counter, + &spinner, + filtering_start_time, + &config.server_address, + config.debug, + )?; + } else { + process_single_seqs( + &minimizer_hashes, + config.input_path, + &mut writer, + config.abs_threshold, + config.rel_threshold, + config.prefix_length as u8, + kmer_length, + window_size, + config.deplete, + config.rename, + &mut total_seqs, + &mut filtered_seqs, + &mut total_bp, + &mut output_bp, + &mut filtered_bp, + &mut output_seq_counter, + &spinner, + filtering_start_time, + &config.server_address, + config.debug, + )?; + } + + writer.flush_all()?; + if let Some(ref mut w2) = writer2 { + w2.flush_all()?; + } + + let total_time = start_time.elapsed(); + let seqs_per_sec = total_seqs as f64 / total_time.as_secs_f64(); + let bp_per_sec = total_bp as f64 / total_time.as_secs_f64(); + let mbp_per_sec = bp_per_sec / 1_000_000.0; + + // Calculate filtered proportion directly + let filtered_proportion = if total_seqs > 0 { + filtered_seqs as f64 / total_seqs as f64 + } else { + 0.0 + }; + + // Calculate filtered base pair proportion + let filtered_bp_proportion = if total_bp > 0 { + filtered_bp as f64 / total_bp as f64 + } else { + 0.0 + }; + + // Calculate output proportions + let output_seqs = total_seqs - filtered_seqs; + let output_seq_proportion = if total_seqs > 0 { + output_seqs as f64 / total_seqs as f64 + } else { + 0.0 + }; + + let output_bp_proportion = if total_bp > 0 { + output_bp as f64 / total_bp as f64 + } else { + 0.0 + }; + + // Finish and clear spinner, print final message + spinner.finish_and_clear(); + eprintln!( + "Retained {}/{} sequences ({:.3}%), {}/{} bp ({:.3}%)", + output_seqs, + total_seqs, + output_seq_proportion * 100.0, + output_bp, + total_bp, + output_bp_proportion * 100.0 + ); + + // Print completion message with speed + eprintln!( + "Completed in {total_time:.2?}. 
Speed: {seqs_per_sec:.0} seqs/s ({mbp_per_sec:.1} Mbp/s)" + ); + + // Build and write a JSON summary if path provided + if let Some(summary_file) = config.summary_path { + // Get number of sequences passing filter + let seqs_out = total_seqs - filtered_seqs; + + let summary = FilterSummary { + version: tool_version, + index: get_summary_index(&config.minimizers_path, &config.server_address), + input: config.input_path.to_string(), + input2: config.input2_path.map(|s| s.to_string()), + output: config.output_path.to_string(), + output2: config.output2_path.map(|s| s.to_string()), + k: kmer_length, + w: window_size, + abs_threshold: config.abs_threshold, + rel_threshold: config.rel_threshold, + prefix_length: config.prefix_length, + deplete: config.deplete, + rename: config.rename, + seqs_in: total_seqs, + seqs_out, + seqs_out_proportion: output_seq_proportion, + seqs_removed: filtered_seqs, + seqs_removed_proportion: filtered_proportion, + bp_in: total_bp, + bp_out: output_bp, + bp_out_proportion: output_bp_proportion, + bp_removed: filtered_bp, + bp_removed_proportion: filtered_bp_proportion, + time: total_time.as_secs_f64(), + seqs_per_second: seqs_per_sec as u64, + bp_per_second: bp_per_sec as u64, + }; + + // Write summary file + let file = File::create(summary_file) + .context(format!("Failed to create summary: {summary_file:?}"))?; + let writer = BufWriter::new(file); + + // Serialise and write the summary JSON + serde_json::to_writer_pretty(writer, &summary).context("Failed to write summary")?; + + eprintln!("Summary saved to {summary_file:?}"); + } + + Ok(()) +} + +/// Filter a single (unpaired) sequence. +#[allow(clippy::too_many_arguments)] +fn process_single_seqs( + minimizer_hashes: &Option>, + input_path: &str, + writer: &mut Box, + abs_threshold: usize, + rel_threshold: f64, + prefix_length: u8, + kmer_length: u8, + window_size: u8, + deplete: bool, + rename: bool, + total_seqs: &mut u64, + filtered_seqs: &mut u64, + total_bp: &mut u64, + output_bp: &mut u64, + filtered_bp: &mut u64, + output_seq_counter: &mut u64, + spinner: &ProgressBar, + filtering_start_time: Instant, + server_address: &Option, + debug: bool, +) -> Result<()> { + // Create a reader based on the input source + let mut reader = if input_path == "-" { + parse_fastx_stdin()? + } else { + parse_fastx_file(input_path)? 
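+        // needletail auto-detects FASTA vs FASTQ here, and (with the matching
+        // cargo features enabled) transparently decompresses compressed input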
+ }; + + // Process in batches + let batch_size = 10000; + let mut output_record_buffer = Vec::with_capacity(1024); + + // Process batches + loop { + // Collect a batch of records with owned data + let mut batch: Vec = Vec::with_capacity(batch_size); + let mut reached_end = false; + + // Fill the batch (sequential read from either stdin or file) + for _ in 0..batch_size { + if let Some(record_result) = reader.next() { + match record_result { + Ok(record) => { + let record_data = RecordData { + id: record.id().to_vec(), + seq: record.seq().to_vec(), + qual: record.qual().map(|q| q.to_vec()), + format: record.format(), + }; + batch.push(record_data); + } + Err(e) => return Err(e.into()), + } + } else { + reached_end = true; + break; + } + } + + if batch.is_empty() { + break; + } + + // Get batch minimizers in parallel + let batch_result: Vec<(Vec, Vec, Vec)> = batch + .par_iter() + .map(|record_data| { + let (hashes, positions, seqs) = get_minimizer_hashes_and_positions( + &record_data.seq, + prefix_length as usize, + kmer_length, + window_size, + ); + (hashes, positions, seqs.to_vec()) + // get_hashes_from_record(record_data, kmer_length, prefix_length, window_size) + }) + .collect(); + + // let (batch_minimizers, batch_positons, effective_seqs): (Vec>, Vec>, Vec<&[u8]>) = + // batch_result.into_iter().multiunzip(); + + // Check if minimizers match the index + // Separated from initial par_iter to allow flexibility with local/server processing + let batch_should_outputs = check_single_inputs_should_be_output( + minimizer_hashes, + &batch_result, + server_address, + deplete, + kmer_length, + debug, + abs_threshold, + rel_threshold, + ); + + // Process results sequentially to maintain order + for (i, (should_output, hit_count, total_minimizers, hit_kmers)) in + batch_should_outputs.into_iter().enumerate() + { + let record_data = &batch[i]; + let seq_len = record_data.seq.len(); + *total_seqs += 1; + *total_bp += seq_len as u64; + + if debug { + eprintln!( + "DEBUG: {} hits={}/{} keep={} kmers=[{}]", + String::from_utf8_lossy(&record_data.id), + hit_count, + total_minimizers, + should_output, + hit_kmers.join(",") + ); + } + + if should_output { + // Track output base pairs + *output_bp += seq_len as u64; + + // Increment output sequence counter + *output_seq_counter += 1; + + // Format as FASTX and write + output_record_buffer.clear(); + output_fastx_record_from_parts( + &record_data.id, + &record_data.seq, + record_data.qual.as_deref(), + record_data.format, + &mut output_record_buffer, + rename, + *output_seq_counter, + ); + writer.write_all(&output_record_buffer)?; + } else { + *filtered_seqs += 1; + *filtered_bp += seq_len as u64; + } + } + + // Update spinner and flush periodically + let elapsed = filtering_start_time.elapsed(); + let seqs_per_sec = *total_seqs as f64 / elapsed.as_secs_f64(); + let bp_per_sec = *total_bp as f64 / elapsed.as_secs_f64(); + let mbp_per_sec = bp_per_sec / 1_000_000.0; + + // Calculate output proportion + let output_seqs = *total_seqs - *filtered_seqs; + let output_proportion = if *total_seqs > 0 { + output_seqs as f64 / *total_seqs as f64 + } else { + 0.0 + }; + + // Calculate output base pair proportion + let output_bp_proportion = if *total_bp > 0 { + *output_bp as f64 / *total_bp as f64 + } else { + 0.0 + }; + + // Update spinner message + spinner.set_message(format!( + "Retained {}/{} sequences ({:.2}%), {}/{} bp ({:.2}%). 
{:.0} seqs/s ({:.1} Mbp/s)", + output_seqs, + total_seqs, + output_proportion * 100.0, + output_bp, + total_bp, + output_bp_proportion * 100.0, + seqs_per_sec, + mbp_per_sec + )); + + // Flush writer periodically + writer.flush()?; + + // Check if we've reached the end of the file/stdin + if reached_end { + break; + } + } + + Ok(()) +} + +/// Filter a pair of sequences +#[allow(clippy::too_many_arguments)] +fn process_paired_seqs( + minimizer_hashes: &Option>, + input1_path: &str, + input2_path: &str, + writer: &mut Box, + mut writer2: Option<&mut Box>, + abs_threshold: usize, + rel_threshold: f64, + prefix_length: u8, + kmer_length: u8, + window_size: u8, + deplete: bool, + rename: bool, + total_seqs: &mut u64, + filtered_seqs: &mut u64, + total_bp: &mut u64, + output_bp: &mut u64, + filtered_bp: &mut u64, + output_seq_counter: &mut u64, + spinner: &ProgressBar, + filtering_start_time: Instant, + server_address: &Option, + debug: bool, +) -> Result<()> { + // Open both input files + let mut reader1 = if input1_path == "-" { + parse_fastx_stdin()? + } else { + parse_fastx_file(input1_path)? + }; + + let mut reader2 = parse_fastx_file(input2_path)?; + + // Process in batches + let batch_size = 10000; + let mut output_record_buffer = Vec::with_capacity(1024); + + // Process batches + loop { + // Collect a batch of read pairs with owned data + let mut batch1: Vec = Vec::with_capacity(batch_size); + let mut batch2: Vec = Vec::with_capacity(batch_size); + let mut reached_end = false; + + // Fill the batch (sequential read from files) + for _ in 0..batch_size { + if let (Some(record1_res), Some(record2_res)) = (reader1.next(), reader2.next()) { + match (record1_res, record2_res) { + (Ok(record1), Ok(record2)) => { + let record_data1 = RecordData { + id: record1.id().to_vec(), + seq: record1.seq().to_vec(), + qual: record1.qual().map(|q| q.to_vec()), + format: record1.format(), + }; + let record_data2 = RecordData { + id: record2.id().to_vec(), + seq: record2.seq().to_vec(), + qual: record2.qual().map(|q| q.to_vec()), + format: record2.format(), + }; + batch1.push(record_data1); + batch2.push(record_data2); + } + (Err(e), _) => return Err(e.into()), + (_, Err(e)) => return Err(e.into()), + } + } else { + reached_end = true; + break; + } + } + + if batch1.is_empty() { + break; + } + + // Get batch minimizers in parallel + let batch_result: Vec<(Vec, Vec, Vec<&[u8]>)> = batch1 + .par_iter() + .zip(batch2.par_iter()) + .map(|(record_data1, record_data2)| { + get_paired_minimizer_hashes_and_positions( + &record_data1.seq, + &record_data2.seq, + prefix_length.into(), + kmer_length, + window_size, + ) + }) + .collect(); + + // let batch_result: Vec<(Vec, u8, u8)> = batch1 + // .par_iter() + // .zip(batch2.par_iter()) + // .map(|(record_data1, record_data2)| { + // get_hashes_from_record_pair( + // record_data1, + // record_data2, + // kmer_length, + // prefix_length, + // window_size, + // ) + // }) + // .collect(); + + // let (batch_minimizers, seq_lens1, seq_lens2): (Vec>, Vec, Vec) = + // batch_result.into_iter().multiunzip(); + + // Check if minimizers match the index + // Separated from initial par_iter to allow flexibility with local/server processing + // let batch_should_outputs = check_inputs_should_be_output( + // minimizer_hashes, + // &batch_minimizers, + // abs_threshold, + // rel_threshold, + // server_address, + // deplete, + // ); + + let batch_should_outputs = check_paired_inputs_should_be_output( + minimizer_hashes, + &batch_result, + server_address, + deplete, + kmer_length, + debug, 
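+            // (The abs/rel thresholds below are applied identically whether the
+            // decision is made locally or by a remote deacon server)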
+ abs_threshold, + rel_threshold, + ); + + // Process results sequentially to maintain order + for (i, (should_output, hit_count, total_minimizers, hit_kmers)) in + batch_should_outputs.into_iter().enumerate() + { + let record_data1 = &batch1[i]; + let record_data2 = &batch2[i]; + let seq1_len = record_data1.seq.len(); + let seq2_len = record_data2.seq.len(); + + *total_seqs += 2; + *total_bp += (seq1_len + seq2_len) as u64; + + if debug && hit_count > 0 { + eprintln!( + "DEBUG: {}/{} hits={}/{} keep={} kmers=[{}]", + String::from_utf8_lossy(&record_data1.id), + String::from_utf8_lossy(&record_data2.id), + hit_count, + total_minimizers, + should_output, + hit_kmers.join(",") + ); + } + + if should_output { + // Track output base pairs + *output_bp += (seq1_len + seq2_len) as u64; + + // Increment output sequence counter (twice, once for each read) + *output_seq_counter += 2; + + // Format s1 as FASTX to byte buffer and write to appropriate writer + output_record_buffer.clear(); + output_fastx_record_from_parts( + &record_data1.id, + &record_data1.seq, + record_data1.qual.as_deref(), + record_data1.format, + &mut output_record_buffer, + rename, + *output_seq_counter - 1, + ); + + if let Some(ref mut w2) = writer2 { + // Write read 1 to primary writer + writer.write_all(&output_record_buffer)?; + + // Format s2 as FASTX to byte buffer and write to second writer + output_record_buffer.clear(); + output_fastx_record_from_parts( + &record_data2.id, + &record_data2.seq, + record_data2.qual.as_deref(), + record_data2.format, + &mut output_record_buffer, + rename, + *output_seq_counter, + ); + w2.write_all(&output_record_buffer)?; + } else { + // Interleaved output + writer.write_all(&output_record_buffer)?; + + // Format s2 as FASTX to byte buffer + output_record_buffer.clear(); + output_fastx_record_from_parts( + &record_data2.id, + &record_data2.seq, + record_data2.qual.as_deref(), + record_data2.format, + &mut output_record_buffer, + rename, + *output_seq_counter, + ); + writer.write_all(&output_record_buffer)?; + } + } else { + *filtered_seqs += 2; // Both seqs filtered out + *filtered_bp += (seq1_len + seq2_len) as u64; // Track filtered base pairs + } + } + + // Update spinner and flush periodically + let elapsed = filtering_start_time.elapsed(); + let seqs_per_sec = *total_seqs as f64 / elapsed.as_secs_f64(); + let bp_per_sec = *total_bp as f64 / elapsed.as_secs_f64(); + let mbp_per_sec = bp_per_sec / 1_000_000.0; + + // Calculate output proportion directly + let output_seqs = *total_seqs - *filtered_seqs; + let output_proportion = if *total_seqs > 0 { + output_seqs as f64 / *total_seqs as f64 + } else { + 0.0 + }; + + // Calculate output base pair proportion + let output_bp_proportion = if *total_bp > 0 { + *output_bp as f64 / *total_bp as f64 + } else { + 0.0 + }; + + // Update spinner message with detailed stats + spinner.set_message(format!( + "Retained {}/{} sequences ({:.2}%), {}/{} bp ({:.2}%). 
{:.0} seqs/s ({:.1} Mbp/s)", + output_seqs, + total_seqs, + output_proportion * 100.0, + output_bp, + total_bp, + output_bp_proportion * 100.0, + seqs_per_sec, + mbp_per_sec + )); + + // Flush writer periodically + writer.flush()?; + if let Some(ref mut w2) = writer2 { + w2.flush()?; + } + + // Check if we've reached the end of the files + if reached_end { + break; + } + } + + Ok(()) +} + +/// Filter a pair of interleaved sequences +/// Functionally very similar to `process_paired_seqs`, but handles interleaved input +#[allow(clippy::too_many_arguments)] +fn process_interleaved_paired_seqs( + minimizer_hashes: &Option>, + writer: &mut Box, + mut writer2: Option<&mut Box>, + abs_threshold: usize, + rel_threshold: f64, + prefix_length: u8, + kmer_length: u8, + window_size: u8, + deplete: bool, + rename: bool, + total_seqs: &mut u64, + filtered_seqs: &mut u64, + total_bp: &mut u64, + output_bp: &mut u64, + filtered_bp: &mut u64, + output_seq_counter: &mut u64, + spinner: &ProgressBar, + filtering_start_time: Instant, + server_address: &Option, + debug: bool, +) -> Result<()> { + // Parse FASTX from stdin + let mut reader = parse_fastx_stdin()?; + let mut output_record_buffer = Vec::with_capacity(1024); + let mut record_counter = 0; + + // Process in batches + let batch_size = 10000; + + loop { + // Collect a batch of read pairs with owned data + let mut batch_pairs = Vec::with_capacity(batch_size); + let mut reached_end = false; + + // Fill the batch with interleaved pairs + for _ in 0..batch_size { + // Read the first record of the pair + let (record1_id, record1_seq, record1_qual, record1_format) = match reader.next() { + Some(result) => { + record_counter += 1; + let record = result?; + // Extract all data we need from the record + let id = record.id().to_vec(); + let seq = record.seq().to_vec(); + let qual = record.qual().map(|q| q.to_vec()); + let format = record.format(); + (id, seq, qual, format) + } + None => { + reached_end = true; + break; // End of input + } + }; + + // Read the second record of the pair + let (record2_id, record2_seq, record2_qual, record2_format) = match reader.next() { + Some(result) => { + record_counter += 1; + let record = result?; + let id = record.id().to_vec(); + let seq = record.seq().to_vec(); + let qual = record.qual().map(|q| q.to_vec()); + let format = record.format(); + (id, seq, qual, format) + } + None => { + // Check if we have record1 but no record2 (mispaired) + return Err(anyhow::anyhow!( + "Uneven number of interleaved sequence pairs. 
Found {} records.", + record_counter + )); + } + }; + + // Store the pair in the batch + batch_pairs.push(( + RecordData { + id: record1_id, + seq: record1_seq, + qual: record1_qual, + format: record1_format, + }, + RecordData { + id: record2_id, + seq: record2_seq, + qual: record2_qual, + format: record2_format, + }, + )); + } + + if batch_pairs.is_empty() { + break; + } + + // Get batch minimizers in parallel + let batch_result: Vec<(Vec, Vec, Vec<&[u8]>)> = batch_pairs + .par_iter() + .map(|(record_data1, record_data2)| { + get_paired_minimizer_hashes_and_positions( + &record_data1.seq, + &record_data2.seq, + prefix_length.into(), + kmer_length, + window_size, + ) + }) + .collect(); + + let batch_should_outputs = check_paired_inputs_should_be_output( + minimizer_hashes, + &batch_result, + server_address, + deplete, + kmer_length, + debug, + abs_threshold, + rel_threshold, + ); + // let batch_result: Vec<(Vec, u8, u8)> = batch_pairs + // .par_iter() + // .map(|(record_data1, record_data2)| { + // get_hashes_from_record_pair( + // record_data1, + // record_data2, + // kmer_length, + // prefix_length, + // window_size, + // ) + // }) + // .collect(); + + // let (batch_minimizers, seq_lens1, seq_lens2): (Vec>, Vec, Vec) = + // batch_result.into_iter().multiunzip(); + + // // Check if minimizers match the index + // // Separated from initial par_iter to allow flexibility with local/server processing + // let batch_should_outputs = check_inputs_should_be_output( + // minimizer_hashes, + // &batch_minimizers, + // abs_threshold, + // rel_threshold, + // server_address, + // deplete, + // ); + + // Process results sequentially to maintain order + for (i, (should_output, hit_count, total_minimizers, hit_kmers)) in + batch_should_outputs.into_iter().enumerate() + { + // for (i, result) in batch_results.into_iter().enumerate() { + let (record1, record2) = &batch_pairs[i]; + let seq1_len = record1.seq.len(); + let seq2_len = record2.seq.len(); + + *total_seqs += 2; + *total_bp += (seq1_len + seq2_len) as u64; + + if debug && hit_count > 0 { + eprintln!( + "DEBUG: {}/{} hits={}/{} keep={} kmers=[{}]", + String::from_utf8_lossy(&record1.id), + String::from_utf8_lossy(&record2.id), + hit_count, + total_minimizers, + should_output, + hit_kmers.join(",") + ); + } + + if should_output { + // Track output base pairs + *output_bp += (seq1_len + seq2_len) as u64; + + // Increment output sequence counter (twice, once for each seq) + *output_seq_counter += 2; + + // Format and write record 1 + output_record_buffer.clear(); + output_fastx_record_from_parts( + &record1.id, + &record1.seq, + record1.qual.as_deref(), + record1.format, + &mut output_record_buffer, + rename, + *output_seq_counter - 1, + ); + + if let Some(ref mut w2) = writer2 { + // Write read 1 to primary writer + writer.write_all(&output_record_buffer)?; + + // Format and write record 2 to second writer + output_record_buffer.clear(); + output_fastx_record_from_parts( + &record2.id, + &record2.seq, + record2.qual.as_deref(), + record2.format, + &mut output_record_buffer, + rename, + *output_seq_counter, + ); + w2.write_all(&output_record_buffer)?; + } else { + // Interleaved output (existing behavior) + writer.write_all(&output_record_buffer)?; + + // Format and write record 2 + output_record_buffer.clear(); + output_fastx_record_from_parts( + &record2.id, + &record2.seq, + record2.qual.as_deref(), + record2.format, + &mut output_record_buffer, + rename, + *output_seq_counter, + ); + writer.write_all(&output_record_buffer)?; + } + } else { + 
*filtered_seqs += 2; // Both seqs filtered out + *filtered_bp += (seq1_len + seq2_len) as u64; // Track filtered base pairs + } + } + + // Update spinner and flush periodically + let elapsed = filtering_start_time.elapsed(); + let seqs_per_sec = *total_seqs as f64 / elapsed.as_secs_f64(); + let bp_per_sec = *total_bp as f64 / elapsed.as_secs_f64(); + let mbp_per_sec = bp_per_sec / 1_000_000.0; + + // Calculate output proportion directly + let output_seqs = *total_seqs - *filtered_seqs; + let output_proportion = if *total_seqs > 0 { + output_seqs as f64 / *total_seqs as f64 + } else { + 0.0 + }; + + // Calculate output base pair proportion + let output_bp_proportion = if *total_bp > 0 { + *output_bp as f64 / *total_bp as f64 + } else { + 0.0 + }; + + // Update spinner message with detailed stats + spinner.set_message(format!( + "Retained {}/{} seqs ({:.2}%), {}/{} bp ({:.2}%). {:.0} seqs/s ({:.1} Mbp/s)", + output_seqs, + total_seqs, + output_proportion * 100.0, + output_bp, + total_bp, + output_bp_proportion * 100.0, + seqs_per_sec, + mbp_per_sec + )); + + // Flush writer periodically + writer.flush()?; + if let Some(ref mut w2) = writer2 { + w2.flush()?; + } + + // Check if we've reached the end of input + if reached_end { + break; + } + } + + Ok(()) +} + +/// Push FASTA or FASTQ record to output buffer from component parts +/// Workaround for borrowing misery with interleaved pairs from stdin +fn output_fastx_record_from_parts( + id: &[u8], + seq: &[u8], + qual: Option<&[u8]>, + format: Format, + buffer: &mut Vec, + rename: bool, + seq_number: u64, +) { + match format { + Format::Fasta => { + buffer.push(b'>'); + if rename { + // Use sequential numbering for sequence ID + buffer.extend_from_slice(seq_number.to_string().as_bytes()); + } else { + // Use original sequence ID + buffer.extend_from_slice(id); + } + buffer.push(b'\n'); + buffer.extend_from_slice(seq); + buffer.push(b'\n'); + } + Format::Fastq => { + buffer.push(b'@'); + if rename { + // Use sequential numbering for sequence ID + buffer.extend_from_slice(seq_number.to_string().as_bytes()); + } else { + // Use original sequence ID + buffer.extend_from_slice(id); + } + buffer.push(b'\n'); + buffer.extend_from_slice(seq); + buffer.extend_from_slice(b"\n+\n"); + if let Some(qual_data) = qual { + buffer.extend_from_slice(qual_data); + } + buffer.push(b'\n'); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::index::IndexHeader; + use crate::index::write_minimizers; + use std::path::PathBuf; + use tempfile::TempDir; + + #[allow(dead_code)] // Suppress unused warnings + fn create_test_index() -> (PathBuf, IndexHeader, TempDir) { + // Create a temporary directory + let temp_dir = TempDir::new().unwrap(); + let index_path = temp_dir.path().join("test.idx"); + + // Create dummy minimizers + let minimizers: FxHashSet = [1, 2, 3, 4, 5].iter().cloned().collect(); + let header = IndexHeader::new(5, 3); + + write_minimizers(&minimizers, &header, Some(&index_path)).unwrap(); + + // Return the TempDir along with the other values to keep it in scope + (index_path, header, temp_dir) + } + + #[test] + fn test_filter_summary() { + // Create a sample summary + let summary = FilterSummary { + version: "deacon 0.1.0".to_string(), + index: "test.idx".to_string(), + input: "test.fastq".to_string(), + input2: Some("test2.fastq".to_string()), + output: "output.fastq".to_string(), + output2: Some("output2.fastq".to_string()), + k: 31, + w: 21, + abs_threshold: 1, + rel_threshold: 0.01, + prefix_length: 0, + deplete: false, + rename: false, 
+            seqs_in: 100,
+            seqs_out: 90,
+            seqs_out_proportion: 0.9,
+            seqs_removed: 10,
+            seqs_removed_proportion: 0.1,
+            bp_in: 10000,
+            bp_out: 9000,
+            bp_out_proportion: 0.9,
+            bp_removed: 1000,
+            bp_removed_proportion: 0.1,
+            time: 1.5,
+            seqs_per_second: 66,
+            bp_per_second: 6666,
+        };
+
+        // Test JSON ser+de
+        let json = serde_json::to_string(&summary).unwrap();
+        let parsed: FilterSummary = serde_json::from_str(&json).unwrap();
+
+        // Check values
+        assert_eq!(parsed.version, "deacon 0.1.0");
+        assert_eq!(parsed.seqs_in, 100);
+        assert_eq!(parsed.seqs_removed_proportion, 0.1);
+        assert_eq!(parsed.seqs_out_proportion, 0.9);
+        assert_eq!(parsed.bp_out_proportion, 0.9);
+        assert_eq!(parsed.input, "test.fastq");
+        assert_eq!(parsed.input2, Some("test2.fastq".to_string()));
+        assert_eq!(parsed.output, "output.fastq");
+        assert_eq!(parsed.output2, Some("output2.fastq".to_string()));
+    }
+}
diff --git a/src/server.rs b/src/server.rs
new file mode 100644
index 0000000..3b0a38b
--- /dev/null
+++ b/src/server.rs
@@ -0,0 +1,136 @@
+//! Functionality to create a server endpoint which can be used to filter based on a pre-loaded index
+use std::path::PathBuf;
+use std::sync::{Mutex, OnceLock};
+
+use crate::filter::inputs_should_be_output;
+use crate::index::{IndexHeader, load_minimizer_hashes};
+use crate::server_common::{FilterRequest, FilterResponse};
+use axum::{
+    Json, Router,
+    extract::DefaultBodyLimit,
+    routing::{get, post},
+};
+use rustc_hash::FxHashSet;
+
+/// Shared index file between endpoint calls.
+/// Annoyingly, we have to use an Option as the default/empty FxHashSet is not static
+static INDEX: Mutex<Option<FxHashSet<u64>>> = Mutex::new(None);
+
+/// Shared index header between endpoint calls.
+/// Initialised to a dummy value, which will be replaced when the index is loaded.
+static INDEX_HEADER: Mutex<IndexHeader> = Mutex::new(IndexHeader {
+    format_version: 0,
+    kmer_length: 0,
+    window_size: 0,
+});
+
+/// Shared index hash between endpoint calls.
+static INDEX_HASH: Mutex<Option<String>> = Mutex::new(None);
+
+/// Just for ensuring we get a single tracing setup.
+/// Mostly needed as tests otherwise try to spawn multiple subscribers
+static TRACING: OnceLock<()> = OnceLock::new();
+
+/// Starts the server with the given index path and port.
+/// To log the server's connections, set `RUST_LOG=trace` in your environment variables.
+pub async fn run_server(index_path: PathBuf, port: u16) {
+    // initialize tracing
+    TRACING.get_or_init(|| {
+        tracing_subscriber::fmt::init();
+    });
+
+    eprintln!("Loading index from: {}", index_path.display());
+    // Load the index before starting the server to ensure it's available for requests
+    load_index(index_path);
+
+    // build our application with a route
+    let app = Router::new()
+        // `GET /` goes to `root`
+        .route("/", get(root))
+        // `GET /index_header` returns the index header
+        .route("/index_header", get(index_header))
+        // `GET /index_version` returns the index version (hash)
+        .route("/index_version", get(index_version))
+        // `POST /filter` goes to `filter`
+        .route("/should_output", post(should_output))
+        // Increase the body limit to 2GB to ensure we don't error on large payloads
+        .layer(DefaultBodyLimit::max(2147483648));
+
+    // run our app with hyper, listening globally
+    let listener = tokio::net::TcpListener::bind("0.0.0.0:".to_owned() + &port.to_string())
+        .await
+        .unwrap();
+    axum::serve(listener, app).await.unwrap();
+}
+
+/// Load the index from the specified path.
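+/// Also records `{path}@{sha256}` of the index file, which the `/index_version`
+/// endpoint reports so clients can verify exactly which index is loaded.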
+fn load_index(index_path: PathBuf) {
+    // Load the hash as well as the file contents for returning as an ugly (but reliable) version
+    let bytes = std::fs::read(index_path.clone()).unwrap();
+    let hash = sha256::digest(&bytes);
+    *INDEX_HASH.lock().unwrap() =
+        Some(index_path.clone().into_os_string().into_string().unwrap() + "@" + &hash);
+
+    let result = load_minimizer_hashes(&Some(&index_path), &None);
+    match result {
+        Ok((minimizers, header)) => {
+            *INDEX.lock().unwrap() = minimizers;
+            *INDEX_HEADER.lock().unwrap() = header;
+        }
+        Err(e) => {
+            eprintln!("Failed to load index: {e}");
+            std::process::exit(1);
+        }
+    }
+}
+
+/// Basic root, returning a message indicating the index is loaded
+/// Endpoint is `/`
+pub async fn root() -> String {
+    let index = INDEX.lock();
+    match index {
+        Ok(index) => {
+            let index = index.as_ref().expect("Index not loaded");
+            let header = INDEX_HEADER.lock().unwrap();
+            format!(
+                "Index loaded with {} minimizers and header: {:?}",
+                index.len(),
+                header
+            )
+        }
+        Err(e) => format!("Error accessing index: {e}"),
+    }
+}
+
+/// Endpoint to return the header of the loaded index
+/// Endpoint is `/index_header`
+pub async fn index_header() -> Json<IndexHeader> {
+    let header = INDEX_HEADER.lock().unwrap();
+    Json(header.clone())
+}
+
+/// Endpoint to return the loaded index version
+/// Endpoint is `/index_version`
+pub async fn index_version() -> String {
+    let hash = INDEX_HASH.lock().unwrap();
+    hash.clone().unwrap()
+}
+
+/// Endpoint which takes a set of hashes, returning whether they match the index
+/// Endpoint is `/should_output`
+pub async fn should_output(Json(request): Json<FilterRequest>) -> Json<FilterResponse> {
+    let index = INDEX.lock();
+    match index {
+        Ok(index) => {
+            let index = index.as_ref().expect("Index not loaded");
+            Json(FilterResponse {
+                should_output: inputs_should_be_output(
+                    index,
+                    &request.input,
+                    &request.match_threshold,
+                    request.deplete,
+                ),
+            })
+        }
+        Err(e) => panic!("Error accessing index: {e}"),
+    }
+}
diff --git a/src/server_common.rs b/src/server_common.rs
new file mode 100644
index 0000000..b072387
--- /dev/null
+++ b/src/server_common.rs
@@ -0,0 +1,81 @@
+//! Common structures and types used in the client and server
+use crate::IndexHeader;
+use anyhow::Result;
+use reqwest::blocking::Client;
+use serde::{Deserialize, Serialize};
+
+/// Request structure for filtering minimizers from unpaired reads
+#[derive(Serialize, Deserialize)]
+pub struct UnpairedFilterRequest {
+    /// Prehashed minimizers for input
+    pub input: Vec<(Vec<u64>, Vec<u32>, Vec<u8>)>,
+
+    /// Minimum number (integer) of minimizer hits for a match
+    pub abs_threshold: usize,
+
+    /// Minimum proportion (float) of minimizer hits for a match
+    pub rel_threshold: f64,
+
+    /// Whether running in deplete mode
+    pub deplete: bool,
+
+    /// kmer length used to compute minimizers
+    pub kmer_length: u8,
+
+    /// Whether running in debug mode
+    pub debug: bool,
+}
+
+/// Request structure for filtering minimizers from paired reads
+#[derive(Serialize, Deserialize)]
+pub struct PairedFilterRequest {
+    /// Prehashed minimizers for input
+    pub input: Vec<(Vec<u64>, Vec<u32>, Vec<Vec<u8>>)>,
+
+    /// Minimum number (integer) of minimizer hits for a match
+    pub abs_threshold: usize,
+
+    /// Minimum proportion (float) of minimizer hits for a match
+    pub rel_threshold: f64,
+
+    /// Whether running in deplete mode
+    pub deplete: bool,
+
+    /// kmer length used to compute minimizers
+    pub kmer_length: u8,
+
+    /// Whether running in debug mode
+    pub debug: bool,
+}
+
+/// Response structure for filter results
+/// Returns whether this set of minimizers should be output
+#[derive(Serialize, Deserialize)]
+pub struct FilterResponse {
+    /// Indicates whether this set of minimizers should be output
+    /// Tuple of (should_keep, hit_count, total_minimizers, hit_kmers)
+    pub should_output: Vec<(bool, usize, usize, Vec<String>)>,
+}
+
+/// Get the header of the index loaded into a remote server
+/// Required in order to ensure that the locally computed minimizers match
+/// the kmer length and window size
+pub fn get_server_index_header(server_address: &str) -> Result<IndexHeader> {
+    // Create a client to query the server
+    let client = Client::new();
+
+    // Request the index header with a GET
+    let response = client
+        .get(server_address.to_owned() + "/index_header")
+        .send()?;
+
+    // Check if the response indicates success
+    if response.status().is_success() {
+        Ok(response.json::<IndexHeader>()?)
+ } else { + Err(anyhow::anyhow!( + "Server returned an error: {}", + response.status() + )) + } +} From b73d6e37d14c23c33c5ef6c3c7eb56bd3e8b0998 Mon Sep 17 00:00:00 2001 From: gpas-github-bot Date: Tue, 9 Sep 2025 14:56:47 +0100 Subject: [PATCH 2/7] feat: functional server mode --- Cargo.lock | 102 +++++++++++++++++++++++++++ Cargo.toml | 5 +- src/lib.rs | 2 + src/main.rs | 161 +++++++++++++++++++++++++++++++++++++++++++ src/remote_filter.rs | 8 +-- src/server.rs | 67 +++++++++++++----- 6 files changed, 321 insertions(+), 24 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 71e833b..f658a71 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -98,6 +98,17 @@ dependencies = [ "wait-timeout", ] +[[package]] +name = "async-trait" +version = "0.1.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "atomic-waker" version = "1.1.2" @@ -223,6 +234,15 @@ version = "2.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b8e56985ec62d17e9c1001dc89c88ecd7dc08e47eba5ec7c29c7b5eeecde967" +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + [[package]] name = "bstr" version = "1.12.0" @@ -388,6 +408,15 @@ version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" +[[package]] +name = "cpufeatures" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" +dependencies = [ + "libc", +] + [[package]] name = "crc32fast" version = "1.5.0" @@ -431,6 +460,16 @@ version = "0.8.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" +[[package]] +name = "crypto-common" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +dependencies = [ + "generic-array", + "typenum", +] + [[package]] name = "deacon" version = "0.10.0" @@ -455,6 +494,7 @@ dependencies = [ "rustc-hash", "serde", "serde_json", + "sha256", "simd-minimizers", "tempfile", "thiserror 2.0.14", @@ -470,6 +510,16 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6184e33543162437515c2e2b48714794e37845ec9851711914eec9d308f6ebe8" +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", +] + [[package]] name = "displaydoc" version = "0.2.5" @@ -648,6 +698,16 @@ dependencies = [ "slab", ] +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + [[package]] name = "getrandom" version = "0.2.16" @@ -714,6 +774,12 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + [[package]] name = "http" version = "1.3.1" @@ -1835,6 +1901,30 @@ dependencies = [ "serde", ] +[[package]] +name = "sha2" +version = "0.10.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "sha256" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f880fc8562bdeb709793f00eb42a2ad0e672c4f883bbe59122b926eca935c8f6" +dependencies = [ + "async-trait", + "bytes", + "hex", + "sha2", + "tokio", +] + [[package]] name = "sharded-slab" version = "0.1.7" @@ -2247,6 +2337,12 @@ version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" +[[package]] +name = "typenum" +version = "1.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f" + [[package]] name = "unicode-ident" version = "1.0.18" @@ -2307,6 +2403,12 @@ version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + [[package]] name = "virtue" version = "0.0.18" diff --git a/Cargo.toml b/Cargo.toml index 89fbbc4..1987d04 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -39,10 +39,11 @@ tokio = { version = "1.47.0", features = ["full"], optional = true } axum = { version = "0.8.4", optional = true } tracing-subscriber = { version = "0.3.19", features = ["fmt", "registry", "json"], optional = true } reqwest = { version = "0.12.22", features = ["blocking", "json"], optional = true } +sha256 = { version = "1.6.0", optional = true } [features] -# default = ["server"] -server = ["dep:tokio", "dep:axum", "dep:tracing-subscriber", "dep:reqwest"] +default = ["server"] +server = ["dep:tokio", "dep:axum", "dep:tracing-subscriber", "dep:reqwest", "dep:sha256"] [lints.clippy] too_many_arguments = "allow" diff --git a/src/lib.rs b/src/lib.rs index bc1d73b..a32bdaf 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -19,6 +19,8 @@ mod filter_common; mod local_filter; #[cfg(feature = "server")] mod remote_filter; +#[cfg(feature = "server")] +pub mod server; // Re-export the important structures and functions for library users pub use filter::run as run_filter; diff --git a/src/main.rs b/src/main.rs index 60ba24a..15e87b1 100644 --- a/src/main.rs +++ b/src/main.rs @@ -76,6 +76,79 @@ enum Commands { #[arg(long = "debug", default_value_t = false)] debug: bool, + /// Suppress progress reporting + #[arg(short = 'q', long = "quiet", default_value_t = false)] + quiet: bool, + }, + /// Run a server to hold a pre-loaded minimizer index in memory for filtering + /// with the Client command. Saves time for filtering short sequences with large indexes + /// but will inevitably be slower than local filtering. 
+ Server { + /// Path to minimizer index file + index: PathBuf, + + /// Port to run the server on + #[arg(short = 'p', long = "port", default_value_t = 8888)] + port: u16, + }, + /// Alternate version of Filter, swapping local compute for passing to a server + /// which has the index pre-loaded. Will inevitably be slower than local filtering, + /// but saves on index loading. Better used for cases of small input + large index + Client { + /// Server address to connect to (including port) + server_address: String, + + /// Optional path to fastx file (or - for stdin) + #[arg(default_value = "-")] + input: String, + + /// Optional path to second paired fastx file (or - for interleaved stdin) + input2: Option, + + /// Path to output fastx file (or - for stdout; detects .gz and .zst) + #[arg(short = 'o', long = "output", default_value = "-")] + output: String, + + /// Optional path to second paired output fastx file (detects .gz and .zst) + #[arg(short = 'O', long = "output2")] + output2: Option, + + /// Minimum absolute number of minimizer hits for a match + #[arg(short = 'a', long = "abs-threshold", default_value_t = 2, value_parser = clap::value_parser!(u16).range(1..))] + abs_threshold: u16, + + /// Minimum relative proportion (0.0-1.0) of minimizer hits for a match + #[arg(short = 'r', long = "rel-threshold", default_value_t = 0.01)] + rel_threshold: f64, + + /// Search only the first N nucleotides per sequence (0 = entire sequence) + #[arg(short = 'p', long = "prefix-length", default_value_t = 0)] + prefix_length: usize, + + /// Discard matching sequences (invert filtering behaviour) + #[arg(short = 'd', long = "deplete", default_value_t = false)] + deplete: bool, + + /// Replace sequence headers with incrementing numbers + #[arg(short = 'R', long = "rename", default_value_t = false)] + rename: bool, + + /// Path to JSON summary output file + #[arg(short = 's', long = "summary")] + summary: Option, + + /// Number of execution threads (0 = auto) + #[arg(short = 't', long = "threads", default_value_t = 8)] + threads: usize, + + /// Output compression level (1-9 for gz & xz; 1-22 for zstd) + #[arg(long = "compression-level", default_value_t = 2)] + compression_level: u8, + + /// Output sequences with minimizer hits to stderr + #[arg(long = "debug", default_value_t = false)] + debug: bool, + /// Suppress progress reporting #[arg(short = 'q', long = "quiet", default_value_t = false)] quiet: bool, @@ -270,6 +343,94 @@ fn main() -> Result<()> { }; config.execute().context("Failed to run filter command")?; } + Commands::Server { index, port } => { + #[cfg(feature = "server")] + { + // Server needs to run async, so spawn an async runtime to run it + let rt = tokio::runtime::Runtime::new().unwrap(); + rt.block_on(async { + eprintln!("Loading server!"); + deacon::server::run_server(index.clone(), *port).await; + }); + } + #[cfg(not(feature = "server"))] + { + eprintln!( + "Server functionality is not enabled in this build. 
+                    "Server functionality is not enabled in this build. Please compile with the 'server' feature: `cargo build --features server`"
+                );
+                // Suppress dead code warning so this compiles without issue when server is not enabled
+                let _ = (index, port);
+                std::process::exit(1);
+            }
+        }
+        Commands::Client {
+            server_address,
+            input,
+            input2,
+            output,
+            output2,
+            abs_threshold,
+            rel_threshold,
+            prefix_length,
+            summary,
+            deplete,
+            rename,
+            threads,
+            compression_level,
+            debug,
+            quiet,
+        } => {
+            #[cfg(feature = "server")]
+            {
+                // Validate output2 usage
+                if output2.is_some() && input2.is_none() {
+                    eprintln!(
+                        "Warning: --output2 specified but no second input file provided. --output2 will be ignored."
+                    );
+                }
+                let config = FilterConfig {
+                    minimizers_path: None,
+                    server_address: Some(server_address.to_string()),
+                    input_path: input,
+                    input2_path: input2.as_deref(),
+                    output_path: output,
+                    output2_path: output2.as_deref(),
+                    abs_threshold: *abs_threshold as usize,
+                    rel_threshold: *rel_threshold,
+                    prefix_length: *prefix_length,
+                    summary_path: summary.as_ref(),
+                    deplete: *deplete,
+                    rename: *rename,
+                    threads: *threads,
+                    compression_level: *compression_level,
+                    debug: *debug,
+                    quiet: *quiet,
+                };
+                config.execute().context("Failed to run filter command")?;
+            }
+            #[cfg(not(feature = "server"))]
+            {
+                eprintln!(
+                    "Client functionality is not enabled in this build. Please compile with the 'server' feature: `cargo build --features server`"
+                );
+                // Suppress dead code warning so this compiles without issue when server is not enabled
+                let _ = (
+                    server_address,
+                    input,
+                    input2,
+                    output,
+                    output2,
+                    match_threshold,
+                    prefix_length,
+                    summary,
+                    deplete,
+                    rename,
+                    threads,
+                    compression_level,
+                );
+                std::process::exit(1);
+            }
+        }
     }
 
     Ok(())
diff --git a/src/remote_filter.rs b/src/remote_filter.rs
index bfb54db..b59abe7 100644
--- a/src/remote_filter.rs
+++ b/src/remote_filter.rs
@@ -228,7 +228,7 @@ fn get_writer(output_path: &str, compression_level: u8) -> Result<Box<dyn Write
-fn unpaired_should_keep(
+pub fn unpaired_should_keep(
     input_minimizers_and_positions: &Vec<(Vec<u64>, Vec<u32>, Vec<u8>)>,
     kmer_length: u8,
     index_minimizers: &FxHashSet<u64>,
@@ -264,7 +264,7 @@ fn unpaired_should_keep(
         .collect()
 }
 
-fn paired_should_keep(
+pub fn paired_should_keep(
     input_minimizers_and_positions: &Vec<(Vec<u64>, Vec<u32>, Vec<&[u8]>)>,
     kmer_length: u8,
     index_minimizers: &FxHashSet<u64>,
@@ -392,8 +392,6 @@ pub fn check_paired_inputs_should_be_output(
     // Else, send the input minimizers to the server for checking
     #[cfg(feature = "server")]
     {
-        use simd_minimizers::private::minimizers;
-
         if _server_address.is_none() {
             panic!("Server address is required when using the server feature.");
         }
@@ -418,7 +416,7 @@ pub fn check_paired_inputs_should_be_output(
         // Send the minimizers as a POST request
         let response = client
-            .post(server_address.to_owned() + "/should_output_unpaired")
+            .post(server_address.to_owned() + "/should_output_paired")
             .json(&PairedFilterRequest {
                 input: input_minimizers_and_positions.to_vec(),
                 abs_threshold,
diff --git a/src/server.rs b/src/server.rs
index 3b0a38b..3c3e9dc 100644
--- a/src/server.rs
+++ b/src/server.rs
@@ -1,10 +1,10 @@
 //! Functionality to create a server endpoint which can be used to filter based on a pre-loaded index
 use std::path::PathBuf;
-use std::sync::{Mutex, OnceLock};
+use std::sync::{Arc, Mutex, OnceLock};
 
-use crate::filter::inputs_should_be_output;
 use crate::index::{IndexHeader, load_minimizer_hashes};
-use crate::server_common::{FilterRequest, FilterResponse};
+use crate::remote_filter::{paired_should_keep, unpaired_should_keep};
+use crate::server_common::{UnpairedFilterRequest, PairedFilterRequest, FilterResponse};
 use axum::{
     Json, Router,
     extract::DefaultBodyLimit,
@@ -42,6 +42,7 @@ pub async fn run_server(index_path: PathBuf, port: u16) {
     eprintln!("Loading index from: {}", index_path.display());
     // Load the index before starting the server to ensure it's available for requests
     load_index(index_path);
+    eprintln!("Loaded index!");
 
     // build our application with a route
     let app = Router::new()
@@ -51,8 +52,8 @@ pub async fn run_server(index_path: PathBuf, port: u16) {
         .route("/index_header", get(index_header))
         // `GET /index_version` returns the index version (hash)
         .route("/index_version", get(index_version))
-        // `POST /filter` goes to `filter`
-        .route("/should_output", post(should_output))
+        .route("/should_output_paired", post(should_output_paired))
+        .route("/should_output_unpaired", post(should_output_unpaired))
         // Increase the body limit to 2GB to ensure we don't error on large payloads
         .layer(DefaultBodyLimit::max(2147483648));
@@ -115,22 +116,54 @@ pub async fn index_version() -> String {
     hash.clone().unwrap()
 }
 
-/// Endpoint which takes a set of hashes, returning whether they match the index
-/// Endpoint is `/should_output`
-pub async fn should_output(Json(request): Json<FilterRequest>) -> Json<FilterResponse> {
+async fn should_output_paired(Json(request): Json<PairedFilterRequest>) -> Json<FilterResponse> {
+    // Quickly wrangle the seqs into slices from vecs as serde can't do it directly
+    let input_minimizers_and_positions: Vec<(Vec<u64>, Vec<u32>, Vec<&[u8]>)> =
+        request.input
+        .iter()
+        .map(|(minimizers, positions, seqs)| {
+            (
+                minimizers.to_vec(),
+                positions.to_vec(),
+                seqs.iter().map(|s| s.as_slice()).collect(),
+            )
+        })
+        .collect();
     let index = INDEX.lock();
     match index {
         Ok(index) => {
             let index = index.as_ref().expect("Index not loaded");
-            Json(FilterResponse {
-                should_output: inputs_should_be_output(
-                    index,
-                    &request.input,
-                    &request.match_threshold,
-                    request.deplete,
-                ),
-            })
-        }
+            let result = paired_should_keep(
+                &input_minimizers_and_positions,
+                request.kmer_length,
+                &index,
+                request.abs_threshold,
+                request.rel_threshold,
+                request.deplete,
+                request.debug,
+            );
+            Json(FilterResponse { should_output: result})
+        },
         Err(e) => panic!("Error accessing index: {e}"),
     }
 }
+
+async fn should_output_unpaired(Json(request): Json<UnpairedFilterRequest>) -> Json<FilterResponse> {
+    let index = INDEX.lock();
+    match index {
+        Ok(index) => {
+            let index = index.as_ref().expect("Index not loaded");
+            let result = unpaired_should_keep(
+                &request.input,
+                request.kmer_length,
+                &index,
+                request.abs_threshold,
+                request.rel_threshold,
+                request.deplete,
+                request.debug,
+            );
+            Json(FilterResponse { should_output: result})
+        }
+        Err(e) => panic!("Error accessing index: {e}"),
+    }
+}
\ No newline at end of file

From a87585d7d175ae4ad4899b7f78fe9230aa496c30 Mon Sep 17 00:00:00 2001
From: gpas-github-bot
Date: Tue, 9 Sep 2025 15:29:13 +0100
Subject: [PATCH 3/7] chore: clippy/formatting fixes

---
 src/filter_common.rs | 10 ++++-----
 src/index.rs | 45 ++++++++++++++------------------
 src/lib.rs | 2 +-
 src/minimizers.rs | 48 +++++++++++++++---------------------
src/remote_filter.rs | 1 - src/server.rs | 49 ++++++++++++++++++++++++------------------- tests/filter_tests.rs | 6 ++---- tests/index_tests.rs | 22 ++++++------------- 8 files changed, 74 insertions(+), 109 deletions(-) diff --git a/src/filter_common.rs b/src/filter_common.rs index 2a045fe..5a3c16e 100644 --- a/src/filter_common.rs +++ b/src/filter_common.rs @@ -47,7 +47,7 @@ pub fn get_summary_index( minimizers_path: &Option<&PathBuf>, server_address: &Option, ) -> String { - let index = match minimizers_path { + match minimizers_path { Some(path) => path.to_string_lossy().to_string(), None => match &server_address { None => "No index or server specified".to_string(), @@ -59,7 +59,7 @@ pub fn get_summary_index( .get(_addr.to_owned() + "/index_version") .send() .unwrap_or_else(|e| { - panic!("Failed to contact server at {}: {e}", _addr); + panic!("Failed to contact server at {_addr}: {e}"); }); if response.status().is_success() { _addr.to_owned() @@ -77,8 +77,7 @@ pub fn get_summary_index( } } }, - }; - index + } } /// Calculate required hits based on absolute and relative thresholds @@ -269,8 +268,7 @@ pub fn get_minimizer_hashes_and_positions( assert!( kmer_length <= 56, - "Indexing the bitmask of invalid characters requires k<=56, but it is {}", - kmer_length + "Indexing the bitmask of invalid characters requires k<=56, but it is {kmer_length}" ); // Filter positions to only include k-mers with ACGT bases diff --git a/src/index.rs b/src/index.rs index 7332026..69ae479 100644 --- a/src/index.rs +++ b/src/index.rs @@ -152,12 +152,12 @@ pub fn write_minimizers( // Serialise the count of minimizers first let count = minimizers.len(); - encode_into_std_write(&count, &mut writer, bincode::config::standard()) + encode_into_std_write(count, &mut writer, bincode::config::standard()) .context("Failed to serialise minimizer count")?; // Serialise each minimizer directly for &hash in minimizers { - encode_into_std_write(&hash, &mut writer, bincode::config::standard()) + encode_into_std_write(hash, &mut writer, bincode::config::standard()) .context("Failed to serialise minimizer hash")?; } Ok(()) @@ -302,7 +302,7 @@ pub fn build(config: &IndexConfig) -> Result<()> { write_minimizers(&all_minimizers, &header, config.output_path.as_ref())?; let total_time = start_time.elapsed(); - eprintln!("Completed in {:.2?}", total_time); + eprintln!("Completed in {total_time:.2?}"); Ok(()) } @@ -329,15 +329,9 @@ fn stream_diff_fastx>( } if path.to_string_lossy() == "-" { - eprintln!( - "Second index: processing FASTX from stdin (k={}, w={})…", - kmer_length, window_size - ); + eprintln!("Second index: processing FASTX from stdin (k={kmer_length}, w={window_size})…"); } else { - eprintln!( - "Second index: processing FASTX from file (k={}, w={})…", - kmer_length, window_size - ); + eprintln!("Second index: processing FASTX from file (k={kmer_length}, w={window_size})…",); } // Use needletail for parsing @@ -406,8 +400,7 @@ fn stream_diff_fastx>( let current_gb = total_bp / 1_000_000_000; if current_gb > last_reported_gb { eprintln!( - " Processed {} sequences ({}bp), removed {} minimizers", - seq_count, total_bp, removed_count + " Processed {seq_count} sequences ({total_bp}bp), removed {removed_count} minimizers" ); last_reported_gb = current_gb; } @@ -419,12 +412,9 @@ fn stream_diff_fastx>( } } - eprintln!( - "Processed {} sequences ({}bp) from FASTX file", - seq_count, total_bp - ); + eprintln!("Processed {seq_count} sequences ({total_bp}bp) from FASTX file"); - Ok((seq_count as usize, total_bp as usize)) + 
Ok((seq_count as usize, total_bp)) } /// Compute the set difference between two minimizer indexes (A - B) @@ -450,7 +440,7 @@ pub fn diff( // Second file is a FASTX file - stream diff with provided k, w let before_count = first_minimizers.len(); let (_seq_count, _total_bp) = - stream_diff_fastx(&second, k, w, &header, &mut first_minimizers)?; + stream_diff_fastx(second, k, w, &header, &mut first_minimizers)?; // Report results eprintln!( @@ -462,7 +452,7 @@ pub fn diff( write_minimizers(&first_minimizers, &header, output)?; let total_time = start_time.elapsed(); - eprintln!("Completed difference operation in {:.2?}", total_time); + eprintln!("Completed difference operation in {total_time:.2?}"); return Ok(()); } else { @@ -503,7 +493,7 @@ pub fn diff( let before_count = first_minimizers.len(); let (_seq_count, _total_bp) = - stream_diff_fastx(&second, k, w, &header, &mut first_minimizers)?; + stream_diff_fastx(second, k, w, &header, &mut first_minimizers)?; // Report results eprintln!( @@ -515,7 +505,7 @@ pub fn diff( write_minimizers(&first_minimizers, &header, output)?; let total_time = start_time.elapsed(); - eprintln!("Completed difference operation in {:.2?}", total_time); + eprintln!("Completed difference operation in {total_time:.2?}"); return Ok(()); } @@ -540,7 +530,7 @@ pub fn diff( write_minimizers(&first_minimizers, &header, output)?; let total_time = start_time.elapsed(); - eprintln!("Completed diff operation in {:.2?}", total_time); + eprintln!("Completed diff operation in {total_time:.2?}"); Ok(()) } @@ -564,7 +554,7 @@ pub fn info(index_path: &PathBuf) -> Result<()> { eprintln!(" Distinct minimizer count: {}", minimizers.len()); let total_time = start_time.elapsed(); - eprintln!("Retrieved index info in {:.2?}", total_time); + eprintln!("Retrieved index info in {total_time:.2?}"); Ok(()) } @@ -609,10 +599,7 @@ pub fn union( header.window_size() ); if capacity_millions.is_some() { - eprintln!( - "Pre-allocating user-specified capacity for {} minimizers", - total_capacity - ); + eprintln!("Pre-allocating user-specified capacity for {total_capacity} minimizers"); } else { eprintln!( "No capacity specified, pre-allocating worst-case capacity for {} minimizers from {} indexes", @@ -663,7 +650,7 @@ pub fn union( ); } - write_minimizers(&all_minimizers, &header, output)?; + write_minimizers(&all_minimizers, header, output)?; let total_time = start_time.elapsed(); eprintln!( diff --git a/src/lib.rs b/src/lib.rs index a32bdaf..03dfc7c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -214,7 +214,7 @@ impl IndexConfig { /// Create a new index configuration with the specified input path pub fn new(input_path: PathBuf) -> Self { Self { - input_path: input_path, + input_path, kmer_length: DEFAULT_KMER_LENGTH, window_size: DEFAULT_WINDOW_SIZE, output_path: None, diff --git a/src/minimizers.rs b/src/minimizers.rs index 3f66314..6d1ffe2 100644 --- a/src/minimizers.rs +++ b/src/minimizers.rs @@ -258,15 +258,14 @@ mod tests { // Test minimum entropy (homopolymer, 10bp) let min_entropy_kmer = b"AAAAAAAAAA"; let entropy = calculate_scaled_entropy(min_entropy_kmer, 10); - assert!(entropy < 0.1, "Expected very low entropy, got {}", entropy); + assert!(entropy < 0.1, "Expected very low entropy, got {entropy}"); // Test moderate entropy (alternating pattern, 10bp) let alt_entropy_kmer = b"ATATATATAT"; let entropy = calculate_scaled_entropy(alt_entropy_kmer, 10); assert!( (0.5..1.0).contains(&entropy), - "Expected moderate entropy, got {}", - entropy + "Expected moderate entropy, got {entropy}" ); // 
Test maximum entropy (diverse 10bp) @@ -274,8 +273,7 @@ mod tests { let entropy = calculate_scaled_entropy(max_entropy_kmer, 10); assert!( entropy > 0.9, - "Expected high entropy for diverse 10-mer, got {}", - entropy + "Expected high entropy for diverse 10-mer, got {entropy}" ); // Test realistic k-mer (31bp, default k) @@ -283,8 +281,7 @@ mod tests { let entropy = calculate_scaled_entropy(realistic_kmer, 31); assert!( entropy > 0.9, - "Expected high entropy for diverse 31-mer, got {}", - entropy + "Expected high entropy for diverse 31-mer, got {entropy}" ); } @@ -295,15 +292,14 @@ mod tests { // Homopolymer - lowest entropy (31 A's) let homopolymer = b"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; let entropy = calculate_scaled_entropy(homopolymer, 31); - assert!(entropy < 0.01, "Homopolymer entropy = {}", entropy); + assert!(entropy < 0.01, "Homopolymer entropy = {entropy}"); // Mostly one base with minimal variation - low entropy let mostly_a = b"AAAAAAAAAAACAAAAAGAAAAATAAAAAAA"; let entropy = calculate_scaled_entropy(mostly_a, 31); assert!( (0.25..=0.35).contains(&entropy), - "Mostly A entropy = {}", - entropy + "Mostly A entropy = {entropy}" ); // GC alternating - moderate entropy (2 bases, equal distribution) @@ -311,8 +307,7 @@ mod tests { let entropy = calculate_scaled_entropy(gc_alternating, 31); assert!( (0.45..=0.55).contains(&entropy), - "GC alternating entropy = {}", - entropy + "GC alternating entropy = {entropy}" ); // AT with G ending - moderate entropy (mostly 2 bases) @@ -320,8 +315,7 @@ mod tests { let entropy = calculate_scaled_entropy(dinuc_repeat, 31); assert!( (0.55..=0.65).contains(&entropy), - "AT+G repeat entropy = {}", - entropy + "AT+G repeat entropy = {entropy}" ); // Trinucleotide repeat - high entropy (ACG repeated) @@ -329,8 +323,7 @@ mod tests { let entropy = calculate_scaled_entropy(trinuc_repeat, 31); assert!( (0.75..=0.85).contains(&entropy), - "ACG repeat entropy = {}", - entropy + "ACG repeat entropy = {entropy}" ); // Four bases uneven distribution - high entropy @@ -338,19 +331,18 @@ mod tests { let entropy = calculate_scaled_entropy(four_uneven, 31); assert!( (0.8..=1.0).contains(&entropy), - "Four bases uneven entropy = {}", - entropy + "Four bases uneven entropy = {entropy}" ); // Complex pattern with all 4 bases - very high entropy let complex_repeat = b"AACCGGTTAACCGGTTAACCGGTTAACCGGT"; let entropy = calculate_scaled_entropy(complex_repeat, 31); - assert!(entropy >= 0.95, "Complex pattern entropy = {}", entropy); + assert!(entropy >= 0.95, "Complex pattern entropy = {entropy}"); // Four bases perfectly balanced - maximum entropy let four_balanced = b"ACGTACGTACGTACGTACGTACGTACGTACG"; let entropy = calculate_scaled_entropy(four_balanced, 31); - assert!(entropy >= 0.95, "Four bases balanced entropy = {}", entropy); + assert!(entropy >= 0.95, "Four bases balanced entropy = {entropy}"); // Verify entropy ordering makes sense let one_base = calculate_scaled_entropy(b"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", 31); @@ -361,21 +353,15 @@ mod tests { // Entropy should increase with base diversity assert!( one_base < two_bases_even, - "1 base ({}) < 2 bases even ({})", - one_base, - two_bases_even + "1 base ({one_base}) < 2 bases even ({two_bases_even})" ); assert!( two_bases_even < three_bases, - "2 bases even ({}) < 3 bases ({})", - two_bases_even, - three_bases + "2 bases even ({two_bases_even}) < 3 bases ({three_bases})" ); assert!( three_bases < four_bases, - "3 bases ({}) < 4 bases ({})", - three_bases, - four_bases + "3 bases ({three_bases}) < 4 bases ({four_bases})" 
); // Verify threshold behavior: common thresholds like 0.01 should filter appropriately @@ -395,7 +381,7 @@ mod tests { // 30 A's + 1 T, entropy ~0.1028 let near_homopolymer = b"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAT"; let entropy = calculate_scaled_entropy(near_homopolymer, 31); - assert!(entropy < 0.5, "Entropy {:.4} should be < 0.5", entropy); - assert!(entropy < 0.15, "Entropy {:.4} should be < 0.15", entropy); + assert!(entropy < 0.5, "Entropy {entropy:.4} should be < 0.5"); + assert!(entropy < 0.15, "Entropy {entropy:.4} should be < 0.15"); } } diff --git a/src/remote_filter.rs b/src/remote_filter.rs index b59abe7..3007b94 100644 --- a/src/remote_filter.rs +++ b/src/remote_filter.rs @@ -6,7 +6,6 @@ use crate::filter_common::pair_matches; use crate::filter_common::sequence_matches; use crate::filter_common::{get_summary_index, meets_filtering_criteria}; use crate::index::load_minimizer_hashes; -use crate::minimizers::fill_minimizer_hashes; #[cfg(feature = "server")] use crate::server_common::{FilterResponse, PairedFilterRequest, UnpairedFilterRequest}; use anyhow::{Context, Result}; diff --git a/src/server.rs b/src/server.rs index 3c3e9dc..a73cc55 100644 --- a/src/server.rs +++ b/src/server.rs @@ -1,10 +1,10 @@ //! Functionality to create a server endpoint which can be used to filter based on a pre-loaded index use std::path::PathBuf; -use std::sync::{Arc, Mutex, OnceLock}; +use std::sync::{Mutex, OnceLock}; use crate::index::{IndexHeader, load_minimizer_hashes}; use crate::remote_filter::{paired_should_keep, unpaired_should_keep}; -use crate::server_common::{UnpairedFilterRequest, PairedFilterRequest, FilterResponse}; +use crate::server_common::{FilterResponse, PairedFilterRequest, UnpairedFilterRequest}; use axum::{ Json, Router, extract::DefaultBodyLimit, @@ -69,7 +69,8 @@ fn load_index(index_path: PathBuf) { // Load the hash as well as the file contents for returning as an ugly (but reliable) version let bytes = std::fs::read(index_path.clone()).unwrap(); let hash = sha256::digest(&bytes); - *INDEX_HASH.lock().unwrap() = Some(index_path.clone().into_os_string().into_string().unwrap() + "@" + &hash); + *INDEX_HASH.lock().unwrap() = + Some(index_path.clone().into_os_string().into_string().unwrap() + "@" + &hash); let result = load_minimizer_hashes(&Some(&index_path), &None); match result { @@ -118,17 +119,17 @@ pub async fn index_version() -> String { async fn should_output_paired(Json(request): Json) -> Json { // Quickly wrangle the seqs into slices from vecs as serde can't do it directly - let input_minimizers_and_positions: Vec<(Vec, Vec, Vec<&[u8]>)> = - request.input - .iter() - .map(|(minimizers, positions, seqs)| { - ( - minimizers.to_vec(), - positions.to_vec(), - seqs.iter().map(|s| s.as_slice()).collect(), - ) - }) - .collect(); + let input_minimizers_and_positions: Vec<(Vec, Vec, Vec<&[u8]>)> = request + .input + .iter() + .map(|(minimizers, positions, seqs)| { + ( + minimizers.to_vec(), + positions.to_vec(), + seqs.iter().map(|s| s.as_slice()).collect(), + ) + }) + .collect(); let index = INDEX.lock(); match index { Ok(index) => { @@ -136,19 +137,23 @@ async fn should_output_paired(Json(request): Json) -> Json< let result = paired_should_keep( &input_minimizers_and_positions, request.kmer_length, - &index, + index, request.abs_threshold, request.rel_threshold, request.deplete, request.debug, ); - Json(FilterResponse { should_output: result}) - }, + Json(FilterResponse { + should_output: result, + }) + } Err(e) => panic!("Error accessing index: {e}"), } } -async fn 
should_output_unpaired(Json(request): Json) -> Json { +async fn should_output_unpaired( + Json(request): Json, +) -> Json { let index = INDEX.lock(); match index { Ok(index) => { @@ -156,14 +161,16 @@ async fn should_output_unpaired(Json(request): Json) -> J let result = unpaired_should_keep( &request.input, request.kmer_length, - &index, + index, request.abs_threshold, request.rel_threshold, request.deplete, request.debug, ); - Json(FilterResponse { should_output: result}) + Json(FilterResponse { + should_output: result, + }) } Err(e) => panic!("Error accessing index: {e}"), } -} \ No newline at end of file +} diff --git a/tests/filter_tests.rs b/tests/filter_tests.rs index 6887d7e..3b39597 100644 --- a/tests/filter_tests.rs +++ b/tests/filter_tests.rs @@ -1010,8 +1010,7 @@ fn test_shared_minimizer_counted_once() { assert_eq!( seqs_out, 2, "Expected 2 sequences in output (both reads of the pair should be kept) \ - but got {}. This indicates shared minimizers were double-counted.", - seqs_out + but got {seqs_out}. This indicates shared minimizers were double-counted." ); } @@ -1244,8 +1243,7 @@ fn test_newline_mapping_bug() { // With the bug, we'd expect a match. Without the bug, no match. if output_str.contains(">query") { panic!( - "BUG DETECTED: Query matched due to newlines being mapped to 'C'. Output: {}", - output_str + "BUG DETECTED: Query matched due to newlines being mapped to 'C'. Output: {output_str}" ); } diff --git a/tests/index_tests.rs b/tests/index_tests.rs index a1a32d4..c7aee5d 100644 --- a/tests/index_tests.rs +++ b/tests/index_tests.rs @@ -33,8 +33,7 @@ fn build_index(fasta_path: &Path, bin_path: &Path) { // Check file exists and isn't empty assert!( bin_path.exists(), - "Index file wasn't created at {:?}", - bin_path + "Index file wasn't created at {bin_path:?}" ); assert!( fs::metadata(bin_path).unwrap().len() > 0, @@ -120,9 +119,7 @@ fn test_index_union() { let max_individual_size = std::cmp::max(bin1_size, bin2_size); assert!( combined_size >= max_individual_size, - "Combined index size {} should be at least as large as the largest individual index size {}", - combined_size, - max_individual_size + "Combined index size {combined_size} should be at least as large as the largest individual index size {max_individual_size}" ); } @@ -163,9 +160,7 @@ fn test_index_diff() { assert!( result_size <= bin1_size, - "Result index size {} should be less than or equal to the first index size {}", - result_size, - bin1_size + "Result index size {result_size} should be less than or equal to the first index size {bin1_size}" ); } @@ -257,10 +252,7 @@ fn test_index_diff_three_methods() { } } } - panic!( - "Could not extract remaining minimizer count from stderr: {}", - stderr_str - ); + panic!("Could not extract remaining minimizer count from stderr: {stderr_str}"); } let remaining1 = extract_remaining_count(&output1.stderr); @@ -270,13 +262,11 @@ fn test_index_diff_three_methods() { // All three methods should produce the same number of remaining minimizers assert_eq!( remaining1, remaining2, - "Index+Index ({}) and Index+FASTX ({}) should have same remaining count", - remaining1, remaining2 + "Index+Index ({remaining1}) and Index+FASTX ({remaining2}) should have same remaining count" ); assert_eq!( remaining1, remaining3, - "Index+Index ({}) and Index+FASTX stdin ({}) should have same remaining count", - remaining1, remaining3 + "Index+Index ({remaining1}) and Index+FASTX stdin ({remaining3}) should have same remaining count" ); // Verify all result files have the same size (they 
should be identical) From 96dd6a53daed9671092d9244076cd0b6dda1f6fc Mon Sep 17 00:00:00 2001 From: gpas-github-bot Date: Tue, 9 Sep 2025 15:42:23 +0100 Subject: [PATCH 4/7] fix: avoid needless vec/slice conversions for remote filtering --- Cargo.toml | 2 +- src/filter_common.rs | 10 ++++----- src/main.rs | 19 +++++++++-------- src/remote_filter.rs | 50 ++++---------------------------------------- src/server.rs | 14 +------------ 5 files changed, 21 insertions(+), 74 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 1987d04..d13ad74 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -42,7 +42,7 @@ reqwest = { version = "0.12.22", features = ["blocking", "json"], optional = tru sha256 = { version = "1.6.0", optional = true } [features] -default = ["server"] +# default = ["server"] server = ["dep:tokio", "dep:axum", "dep:tracing-subscriber", "dep:reqwest", "dep:sha256"] [lints.clippy] diff --git a/src/filter_common.rs b/src/filter_common.rs index 5a3c16e..90b6326 100644 --- a/src/filter_common.rs +++ b/src/filter_common.rs @@ -172,7 +172,7 @@ pub fn sequence_matches( pub fn pair_matches( all_hashes: &Vec, all_positions: &Vec, - all_sequences: &Vec<&[u8]>, + all_sequences: &Vec>, minimizer_hashes: &FxHashSet, kmer_length: u8, debug: bool, @@ -186,7 +186,7 @@ pub fn pair_matches( pair_hit_count += 1; if debug && i < all_positions.len() && i < all_sequences.len() { let pos = all_positions[i] as usize; - let seq = all_sequences[i]; + let seq = &all_sequences[i]; if pos + kmer_length as usize <= seq.len() { let kmer = &seq[pos..pos + kmer_length as usize]; hit_kmers.push(String::from_utf8_lossy(kmer).to_string()); @@ -315,7 +315,7 @@ pub fn get_paired_minimizer_hashes_and_positions<'a>( prefix_length: usize, kmer_length: u8, window_size: u8, -) -> (Vec, Vec, Vec<&'a [u8]>) { +) -> (Vec, Vec, Vec>) { let mut all_hashes = Vec::new(); let mut all_positions = Vec::new(); let mut all_sequences = Vec::new(); @@ -326,7 +326,7 @@ pub fn get_paired_minimizer_hashes_and_positions<'a>( get_minimizer_hashes_and_positions(seq1, prefix_length, kmer_length, window_size); all_hashes.extend(hashes); all_positions.extend(positions); - all_sequences.extend(vec![effective_seq1; all_hashes.len() - all_positions.len()]); + all_sequences.extend(vec![effective_seq1.to_vec(); all_hashes.len() - all_positions.len()]); } // Process read 2 @@ -335,7 +335,7 @@ pub fn get_paired_minimizer_hashes_and_positions<'a>( get_minimizer_hashes_and_positions(seq2, prefix_length, kmer_length, window_size); all_hashes.extend(hashes); all_positions.extend(positions); - all_sequences.extend(vec![effective_seq2; all_hashes.len() - all_positions.len()]); + all_sequences.extend(vec![effective_seq2.to_vec(); all_hashes.len() - all_positions.len()]); } (all_hashes, all_positions, all_sequences) diff --git a/src/main.rs b/src/main.rs index 15e87b1..8c81d25 100644 --- a/src/main.rs +++ b/src/main.rs @@ -355,12 +355,11 @@ fn main() -> Result<()> { } #[cfg(not(feature = "server"))] { - eprintln!( - "Server functionality is not enabled in this build. Please compile with the 'server' feature: `cargo build --features server`" - ); // Suppress dead code warning so this compiles without issue when server is not enabled let _ = (index, port); - std::process::exit(1); + panic!( + "Server functionality is not enabled in this build. 
Please compile with the 'server' feature: `cargo build --features server`" + ); } } Commands::Client { @@ -410,9 +409,6 @@ fn main() -> Result<()> { } #[cfg(not(feature = "server"))] { - eprintln!( - "Client functionality is not enabled in this build. Please compile with the 'server' feature: `cargo build --features server`" - ); // Suppress dead code warning so this compiles without issue when server is not enabled let _ = ( server_address, @@ -420,15 +416,20 @@ fn main() -> Result<()> { input2, output, output2, - match_threshold, + abs_threshold, + rel_threshold, prefix_length, summary, deplete, rename, threads, compression_level, + debug, + quiet, + ); + panic!( + "Client functionality is not enabled in this build. Please compile with the 'server' feature: `cargo build --features server`" ); - std::process::exit(1); } } } diff --git a/src/remote_filter.rs b/src/remote_filter.rs index 3007b94..916bfd3 100644 --- a/src/remote_filter.rs +++ b/src/remote_filter.rs @@ -264,7 +264,7 @@ pub fn unpaired_should_keep( } pub fn paired_should_keep( - input_minimizers_and_positions: &Vec<(Vec, Vec, Vec<&[u8]>)>, + input_minimizers_and_positions: &Vec<(Vec, Vec, Vec>)>, kmer_length: u8, index_minimizers: &FxHashSet, abs_threshold: usize, @@ -368,7 +368,7 @@ pub fn check_single_inputs_should_be_output( /// If not, send to server for checking. Requires the `server` feature to be enabled. pub fn check_paired_inputs_should_be_output( index_minimizers: &Option>, - input_minimizers_and_positions: &Vec<(Vec, Vec, Vec<&[u8]>)>, + input_minimizers_and_positions: &Vec<(Vec, Vec, Vec>)>, _server_address: &Option, deplete: bool, kmer_length: u8, @@ -396,20 +396,6 @@ pub fn check_paired_inputs_should_be_output( } let server_address = _server_address.as_ref().map(String::as_str).unwrap(); - // Quickly wrangle the seqs into vecs instead of slices so serde can cope - // Not perfect, but if it has to happen anywhere, here is the best - let input_minimizers_and_positions: Vec<(Vec, Vec, Vec>)> = - input_minimizers_and_positions - .iter() - .map(|(minimizers, positions, seqs)| { - ( - minimizers.to_vec(), - positions.to_vec(), - seqs.iter().map(|s| s.to_vec()).collect(), - ) - }) - .collect(); - // Create a client to send the minimizers to the server let client = Client::new(); @@ -974,7 +960,7 @@ fn process_paired_seqs( } // Get batch minimizers in parallel - let batch_result: Vec<(Vec, Vec, Vec<&[u8]>)> = batch1 + let batch_result: Vec<(Vec, Vec, Vec>)> = batch1 .par_iter() .zip(batch2.par_iter()) .map(|(record_data1, record_data2)| { @@ -988,34 +974,6 @@ fn process_paired_seqs( }) .collect(); - // let batch_result: Vec<(Vec, u8, u8)> = batch1 - // .par_iter() - // .zip(batch2.par_iter()) - // .map(|(record_data1, record_data2)| { - // get_hashes_from_record_pair( - // record_data1, - // record_data2, - // kmer_length, - // prefix_length, - // window_size, - // ) - // }) - // .collect(); - - // let (batch_minimizers, seq_lens1, seq_lens2): (Vec>, Vec, Vec) = - // batch_result.into_iter().multiunzip(); - - // Check if minimizers match the index - // Separated from initial par_iter to allow flexibility with local/server processing - // let batch_should_outputs = check_inputs_should_be_output( - // minimizer_hashes, - // &batch_minimizers, - // abs_threshold, - // rel_threshold, - // server_address, - // deplete, - // ); - let batch_should_outputs = check_paired_inputs_should_be_output( minimizer_hashes, &batch_result, @@ -1258,7 +1216,7 @@ fn process_interleaved_paired_seqs( } // Get batch minimizers in parallel - let 
batch_result: Vec<(Vec<u64>, Vec<u32>, Vec<&[u8]>)> = batch_pairs
+        let batch_result: Vec<(Vec<u64>, Vec<u32>, Vec<Vec<u8>>)> = batch_pairs
             .par_iter()
             .map(|(record_data1, record_data2)| {
                 get_paired_minimizer_hashes_and_positions(
diff --git a/src/server.rs b/src/server.rs
index a73cc55..ead5680 100644
--- a/src/server.rs
+++ b/src/server.rs
@@ -118,24 +118,12 @@ pub async fn index_version() -> String {
 }
 
 async fn should_output_paired(Json(request): Json<PairedFilterRequest>) -> Json<FilterResponse> {
-    // Quickly wrangle the seqs into slices from vecs as serde can't do it directly
-    let input_minimizers_and_positions: Vec<(Vec<u64>, Vec<u32>, Vec<&[u8]>)> = request
-        .input
-        .iter()
-        .map(|(minimizers, positions, seqs)| {
-            (
-                minimizers.to_vec(),
-                positions.to_vec(),
-                seqs.iter().map(|s| s.as_slice()).collect(),
-            )
-        })
-        .collect();
     let index = INDEX.lock();
     match index {
         Ok(index) => {
             let index = index.as_ref().expect("Index not loaded");
             let result = paired_should_keep(
-                &input_minimizers_and_positions,
+                &request.input,
                 request.kmer_length,
                 index,
                 request.abs_threshold,

From 13190974f7ad95b10467042e091c87595a0e0cbf Mon Sep 17 00:00:00 2001
From: gpas-github-bot
Date: Wed, 10 Sep 2025 11:06:46 +0100
Subject: [PATCH 5/7] test: add server tests + docs in the readme

---
 .github/workflows/test.yml | 6 +-
 Cargo.lock | 1 +
 Cargo.toml | 5 +
 README.md | 115 +++
 src/filter_common.rs | 10 +-
 src/main.rs | 4 +-
 tests/server_tests.rs | 1401 ++++++++++++++++++++++++++++++++++++
 7 files changed, 1537 insertions(+), 5 deletions(-)
 create mode 100644 tests/server_tests.rs

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 090e6a5..dc3b850 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -30,4 +30,8 @@ jobs:
           key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
 
       - name: Run tests
-        run: cargo test --verbose
\ No newline at end of file
+        run: cargo test --verbose
+
+      - name: Run tests (server)
+        # Use --test-threads 1 to avoid issues with parallel testing of server functionality
+        run: cargo test --verbose --features server -- --test-threads 1
\ No newline at end of file
diff --git a/Cargo.lock b/Cargo.lock
index f658a71..8abc890 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -488,6 +488,7 @@ dependencies = [
  "paraseq",
  "parking_lot",
  "predicates",
+ "rand",
  "rayon",
  "reqwest",
  "rstest",
diff --git a/Cargo.toml b/Cargo.toml
index d13ad74..6032e72 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -53,6 +53,7 @@ assert_cmd = "2.0"
 predicates = "3.0"
 tempfile = "3.20"
 rstest = "0.25"
+rand = "0.9.2"
 
 [[test]]
 name = "cli_tests"
@@ -66,6 +67,10 @@ path = "tests/index_tests.rs"
 name = "filter_tests"
 path = "tests/filter_tests.rs"
 
+[[test]]
+name = "server_tests"
+path = "tests/server_tests.rs"
+
 [profile.release]
 opt-level = 3
 lto = true
diff --git a/README.md b/README.md
index ea596ac..5a02643 100644
--- a/README.md
+++ b/README.md
@@ -233,6 +233,121 @@ Use `-s summary.json` to save detailed filtering statistics:
 }
 ```
+## Features
+There is an optional `server` feature which can be enabled when building with `cargo build --features server`.
+
+This enables running a server which pre-loads the index, allowing filtering to be performed remotely rather than locally. In most local use cases this will inevitably be slower than local `filter`, but for cases where lots of small inputs would otherwise each load the index, it is expected to be faster.
+
+> [!NOTE]
+> Compiling with the server feature swaps the filtering engine for a slower one in order to enable faster client performance. For optimal local filtering, **do not** use this feature. The filtering results are scientifically identical between the two modes.
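+
+Under the hood, the client and server communicate over plain HTTP with JSON bodies, so a running server can also be probed directly. As a quick sketch (assuming the default port and a server started as described below):
+
+```bash
+# Returns the header of the loaded index
+curl http://0.0.0.0:8888/index_header
+
+# Returns a version identifier for the loaded index (its path plus a SHA-256 digest of the file)
+curl http://0.0.0.0:8888/index_version
+```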
+
+### Testing
+Unit tests covering all filter functionality (but exercised through the server/client rather than locally) are enabled when compiling with `--features server`. Due to concurrency issues, these tests must be run single-threaded, and will fail otherwise:
+```bash
+cargo test --features server -- --test-threads 1
+```
+
+### Server
+Start up a server with a specific index loaded. Note that by default this runs in the foreground, so it may appear as if nothing is happening after the message about loading your index. To display incoming connection logs, set `RUST_LOG=trace` in your environment variables.
+
+
+#### Run on default port
+Starts up the server on port `8888`
+```bash
+deacon server index.idx
+```
+#### Run on custom port
+Starts up the server on port `12345`
+```bash
+deacon server index.idx -p 12345
+```
+
+### Client
+Almost identical to `deacon filter`, but taking a server address in place of an index path.
+
+
+#### Usage
+```
+Requires feature `server`. Alternate version of Filter, swapping local compute for passing to a server which has the index pre-loaded. Will inevitably be slower than local filtering, but saves on index loading. Better used for cases of small input + large index
+
+Usage: deacon client [OPTIONS] <SERVER_ADDRESS> [INPUT] [INPUT2]
+
+Arguments:
+  <SERVER_ADDRESS>  Server address to connect to (including port)
+  [INPUT]           Optional path to fastx file (or - for stdin) [default: -]
+  [INPUT2]          Optional path to second paired fastx file (or - for interleaved stdin)
+
+Options:
+  -o, --output <OUTPUT>
+          Path to output fastx file (or - for stdout; detects .gz and .zst) [default: -]
+  -O, --output2 <OUTPUT2>
+          Optional path to second paired output fastx file (detects .gz and .zst)
+  -a, --abs-threshold <ABS_THRESHOLD>
+          Minimum absolute number of minimizer hits for a match [default: 2]
+  -r, --rel-threshold <REL_THRESHOLD>
+          Minimum relative proportion (0.0-1.0) of minimizer hits for a match [default: 0.01]
+  -p, --prefix-length <PREFIX_LENGTH>
+          Search only the first N nucleotides per sequence (0 = entire sequence) [default: 0]
+  -d, --deplete
+          Discard matching sequences (invert filtering behaviour)
+  -R, --rename
+          Replace sequence headers with incrementing numbers
+  -s, --summary <SUMMARY>
+          Path to JSON summary output file
+  -t, --threads <THREADS>
+          Number of execution threads (0 = auto) [default: 8]
+      --compression-level <COMPRESSION_LEVEL>
+          Output compression level (1-9 for gz & xz; 1-22 for zstd) [default: 2]
+      --debug
+          Output sequences with minimizer hits to stderr
+  -q, --quiet
+          Suppress progress reporting
+  -h, --help
+          Print help
+```
+
+**Examples**
+
+```bash
+# Keep only sequences matching the index loaded in the server
+deacon client http://0.0.0.0:8888 reads.fq.gz > filt.fq
+
+# Host depletion using the index loaded in the server and default thresholds
+deacon client -d http://0.0.0.0:8888 reads.fq.gz -o filt.fq.gz
+
+# Max sensitivity with absolute threshold of 1 and no relative threshold
+deacon client -d -a 1 -r 0 http://0.0.0.0:8888 reads.fq.gz -o filt.fq.gz
+
+# More specific 10% relative match threshold
+deacon client -d -r 0.1 http://0.0.0.0:8888 reads.fq.gz > filt.fq.gz
+
+# Stdin and stdout
+zcat reads.fq.gz | deacon client -d http://0.0.0.0:8888 > filt.fq
+
+# Faster Zstandard compression
+deacon client -d http://0.0.0.0:8888 reads.fq.zst -o filt.fq.zst
+
+# Fast gzip with pigz
+deacon client -d http://0.0.0.0:8888 reads.fq.gz | pigz > filt.fq.gz
+
+# Paired reads
+deacon client -d http://0.0.0.0:8888 r1.fq.gz r2.fq.gz > filt12.fq
+deacon client -d
http://0.0.0.0:8888 r1.fq.gz r2.fq.gz -o filt.r1.fq.gz -O filt.r2.fq.gz +zcat r12.fq.gz | deacon client -d http://0.0.0.0:8888 - - > filt12.fq + +# Save summary JSON +deacon client -d http://0.0.0.0:8888 reads.fq.gz -o filt.fq.gz -s summary.json + +# Replace read headers with incrementing integers +deacon client -d -R http://0.0.0.0:8888 reads.fq.gz > filt.fq + +# Only look for minimizer hits inside the first 1000bp per record +deacon client -d -p 1000 http://0.0.0.0:8888 reads.fq.gz > filt.fq + +# Debug mode: see sequences with minimizer hits in stderr +deacon client -d --debug http://0.0.0.0:8888 reads.fq.gz > filt.fq +``` + ## Citation [![biorXiv preprint](https://img.shields.io/badge/biorXiv-10.1101/2025.06.09.658732-red?&style=flat-square)](https://doi.org/10.1101/2025.06.09.658732) diff --git a/src/filter_common.rs b/src/filter_common.rs index 90b6326..cf9519b 100644 --- a/src/filter_common.rs +++ b/src/filter_common.rs @@ -326,7 +326,10 @@ pub fn get_paired_minimizer_hashes_and_positions<'a>( get_minimizer_hashes_and_positions(seq1, prefix_length, kmer_length, window_size); all_hashes.extend(hashes); all_positions.extend(positions); - all_sequences.extend(vec![effective_seq1.to_vec(); all_hashes.len() - all_positions.len()]); + all_sequences.extend(vec![ + effective_seq1.to_vec(); + all_hashes.len() - all_positions.len() + ]); } // Process read 2 @@ -335,7 +338,10 @@ pub fn get_paired_minimizer_hashes_and_positions<'a>( get_minimizer_hashes_and_positions(seq2, prefix_length, kmer_length, window_size); all_hashes.extend(hashes); all_positions.extend(positions); - all_sequences.extend(vec![effective_seq2.to_vec(); all_hashes.len() - all_positions.len()]); + all_sequences.extend(vec![ + effective_seq2.to_vec(); + all_hashes.len() - all_positions.len() + ]); } (all_hashes, all_positions, all_sequences) diff --git a/src/main.rs b/src/main.rs index 8c81d25..af685de 100644 --- a/src/main.rs +++ b/src/main.rs @@ -80,7 +80,7 @@ enum Commands { #[arg(short = 'q', long = "quiet", default_value_t = false)] quiet: bool, }, - /// Run a server to hold a pre-loaded minimizer index in memory for filtering + /// Requires feature `server`. Run a server to hold a pre-loaded minimizer index in memory for filtering /// with the Client command. Saves time for filtering short sequences with large indexes /// but will inevitably be slower than local filtering. Server { @@ -91,7 +91,7 @@ enum Commands { #[arg(short = 'p', long = "port", default_value_t = 8888)] port: u16, }, - /// Alternate version of Filter, swapping local compute for passing to a server + /// Requires feature `server`. Alternate version of Filter, swapping local compute for passing to a server /// which has the index pre-loaded. Will inevitably be slower than local filtering, /// but saves on index loading. Better used for cases of small input + large index Client { diff --git a/tests/server_tests.rs b/tests/server_tests.rs new file mode 100644 index 0000000..5cc1071 --- /dev/null +++ b/tests/server_tests.rs @@ -0,0 +1,1401 @@ +#![cfg(feature = "server")] +//! Tests for the Deacon server functionality +//! Basically a duplicate of the contents of `filter_tests.rs`, +//! but swapping to use the server filtering instead of local. +use assert_cmd::Command; +use std::fs; +use std::fs::File; +use std::path::Path; +use std::process::Command as StdCommand; +use tempfile::tempdir; + +/// Run a given command with a server running on a random port. +/// The server will be started in a separate thread, and killed after the command completes. 
+/// The server will be started with the provided index path and a random port. +/// +/// # Arguments: +/// * `$index_path` - The path to the index file to be used by the server. +/// * `$cmd` - A closure that takes a string slice representing the port number. +macro_rules! run_with_server { + ($index_path: expr, $cmd: expr) => { + let random_port = rand::random_range(3000..10000); + let rt = tokio::runtime::Runtime::new().unwrap(); + let server_thread = rt.spawn(async move { + deacon::server::run_server($index_path, random_port).await; + }); + let random_port = random_port.to_string(); + let mut retries = 0; + // Ensure the server is actually running + while retries < 10 { + let client = reqwest::blocking::Client::new(); + + let response = client + .get("http://0.0.0.0:".to_owned() + &random_port + "/") + .send(); + match response { + Ok(response) => { + // Check if the response indicates a match + if response.status().is_success() { + break; + } else { + // Request worked, but returned an error, so wait and retry + std::thread::sleep(std::time::Duration::from_millis(100)); + } + } + Err(_) => { + // If the request fails, wait and retry + std::thread::sleep(std::time::Duration::from_millis(100)); + } + } + retries += 1; + } + if retries == 10 { + panic!("Server did not start in time"); + } + eprintln!("Started server on port {random_port}"); + // Run the provided command after starting the server + $cmd(&random_port); + + // Kill the server thread + server_thread.abort(); + }; +} + +fn create_test_fasta(path: &Path) { + let fasta_content = ">seq1\nACGTGCATAGCTGCATGCATGCATGCATGCATGCATGCAATGCAACGTGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCA\n>seq2\nTGCAGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATTGCAGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGC\n"; + fs::write(path, fasta_content).unwrap(); +} + +fn create_test_fasta_aaa(path: &Path) { + let fasta_content = ">seq1\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n"; + fs::write(path, fasta_content).unwrap(); +} + +fn create_test_fastq(path: &Path) { + let fastq_content = "@seq1\nACGTGCATAGCTGCATGCATGCATGCATGCATGCATGCAATGCAACGTGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCA\n+\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n@seq2\nTGCAGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATTGCAGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGC\n+\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"; + fs::write(path, fastq_content).unwrap(); +} + +fn create_test_paired_fastq(path1: &Path, path2: &Path) { + let fastq_content1 = "@read1\nACGTGCATAGCTGCATGCATGCATGCATGCATGCATGCATGCAATGCAACGTGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCA\n+\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n@read2\nTGCAGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATTGCAGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGC\n+\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"; + let fastq_content2 = 
"@read1\nACGTGCATAGCTGCATGCATGCATGCATGCATGCATGCATGCAATGCAACGTGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCA\n+\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n@read2\nTGCAGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATTGCAGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGC\n+\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"; + + fs::write(path1, fastq_content1).unwrap(); + fs::write(path2, fastq_content2).unwrap(); +} + +fn build_index(fasta_path: &Path, bin_path: &Path) { + let output = StdCommand::new(assert_cmd::cargo::cargo_bin("deacon")) + .arg("index") + .arg("build") + .arg(fasta_path) + .output() + .expect("Failed to execute command"); + + fs::write(bin_path, output.stdout).expect("Failed to write index file"); + assert!(output.status.success(), "Index build command failed"); +} + +fn create_test_fasta_sc2(path: &Path) { + let fasta_content = + ">mn908947.3_0:60\nATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCT\n"; + fs::write(path, fasta_content).unwrap(); +} + +fn create_test_fastq_sc2_fwd(path: &Path) { + let fastq_content = "@mn908947.3_0:60_fwd\n\ + ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCT\n\ + +\n\ + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"; + fs::write(path, fastq_content).unwrap(); +} + +fn create_test_fastq_sc2_rev(path: &Path) { + let fastq_content = "@mn908947.3_0:60_rev\n\ + AGATCTACAAGAGATCGAAAGTTGGTTGGTTTGTTACCTGGGAAGGTATAAACCTTTAAT\n\ + +\n\ + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"; + fs::write(path, fastq_content).unwrap(); +} + +fn create_test_paired_fastq_sc2_fwd(path1: &Path, path2: &Path) { + let fastq_content1 = "@mn908947.3_0:60_fwd\n\ + ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCT\n\ + +\n\ + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"; + let fastq_content2 = "@mn908947.3_60:120_fwd\n\ + GTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACT\n\ + +\n\ + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"; + fs::write(path1, fastq_content1).unwrap(); + fs::write(path2, fastq_content2).unwrap(); +} + +fn create_test_paired_fastq_sc2_rev(path1: &Path, path2: &Path) { + let fastq_content1 = "@mn908947.3_0:60_rev\n\ + AGATCTACAAGAGATCGAAAGTTGGTTGGTTTGTTACCTGGGAAGGTATAAACCTTTAAT\n\ + +\n\ + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"; + let fastq_content2 = "@mn908947.3_60:120_rev\n\ + AGTGCACTAAGCATGCAGCCGAGTGACAGCCACACAGATTTTAAAGTTCGTTTAGAGAAC\n\ + +\n\ + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"; + fs::write(path1, fastq_content1).unwrap(); + fs::write(path2, fastq_content2).unwrap(); +} + +#[test] +fn test_filter_to_file() { + let temp_dir = tempdir().unwrap(); + let fasta_path = temp_dir.path().join("ref.fasta"); + let fastq_path = temp_dir.path().join("reads.fastq"); + let bin_path = temp_dir.path().join("ref.bin"); + let output_path = temp_dir.path().join("filtered.fastq"); + let summary_path = temp_dir.path().join("summary.json"); + + create_test_fasta_aaa(&fasta_path); + create_test_fastq(&fastq_path); + + build_index(&fasta_path, &bin_path); + assert!(bin_path.exists(), "Index file wasn't created"); + + // Run filtering command + run_with_server!(bin_path.clone(), |port: &str| { + let mut cmd = 
Command::cargo_bin("deacon").unwrap(); + cmd.arg("client") + .arg("http://0.0.0.0:".to_owned() + port) + .arg(&fastq_path) + .arg("--output") + .arg(&output_path) + .arg("--summary") + .arg(&summary_path) + .assert() + .success(); + }); + + // Check output and report creation + assert!(output_path.exists(), "Output file wasn't created"); + assert!(summary_path.exists(), "Summary file wasn't created"); + + // With new default behavior: sequences without matches are filtered out (sequences too short for k=31) + let output_content = fs::read_to_string(&output_path).unwrap(); + assert!( + output_content.is_empty(), + "Output file should be empty - sequences too short for minimizers" + ); +} + +#[test] +fn test_filter_to_file_gzip() { + let temp_dir = tempdir().unwrap(); + let fasta_path = temp_dir.path().join("ref.fasta"); + let fastq_path = temp_dir.path().join("reads.fastq"); + let bin_path = temp_dir.path().join("ref.bin"); + let output_path = temp_dir.path().join("filtered.fastq.gz"); + + create_test_fasta(&fasta_path); + create_test_fastq(&fastq_path); + build_index(&fasta_path, &bin_path); + + run_with_server!(bin_path.clone(), |port: &str| { + let mut cmd = Command::cargo_bin("deacon").unwrap(); + cmd.arg("client") + .arg("http://0.0.0.0:".to_owned() + port) + .arg(&fastq_path) + .arg("--output") + .arg(&output_path) + .assert() + .success(); + }); + + // Check gzipped output file creation + assert!(output_path.exists(), "Gzipped output file wasn't created"); + assert!( + fs::metadata(&output_path).unwrap().len() > 0, + "Gzipped output file is empty" + ); +} + +#[test] +fn test_filter_to_file_zstd() { + let temp_dir = tempdir().unwrap(); + let fasta_path = temp_dir.path().join("ref.fasta"); + let fastq_path = temp_dir.path().join("reads.fastq"); + let bin_path = temp_dir.path().join("ref.bin"); + let output_path = temp_dir.path().join("filtered.fastq.zst"); + + create_test_fasta(&fasta_path); + create_test_fastq(&fastq_path); + build_index(&fasta_path, &bin_path); + + run_with_server!(bin_path.clone(), |port: &str| { + let mut cmd = Command::cargo_bin("deacon").unwrap(); + cmd.arg("client") + .arg("http://0.0.0.0:".to_owned() + port) + .arg(&fastq_path) + .arg("--output") + .arg(&output_path) + .assert() + .success(); + }); + + // Check that zstd output file was created + assert!(output_path.exists(), "Zstd output file wasn't created"); + assert!( + fs::metadata(&output_path).unwrap().len() > 0, + "Zstd output file is empty" + ); +} + +#[test] +fn test_filter_to_file_xz() { + let temp_dir = tempdir().unwrap(); + let fasta_path = temp_dir.path().join("ref.fasta"); + let fastq_path = temp_dir.path().join("reads.fastq"); + let bin_path = temp_dir.path().join("ref.bin"); + let output_path = temp_dir.path().join("filtered.fastq.xz"); + + create_test_fasta(&fasta_path); + create_test_fastq(&fastq_path); + build_index(&fasta_path, &bin_path); + + run_with_server!(bin_path.clone(), |port: &str| { + let mut cmd = Command::cargo_bin("deacon").unwrap(); + cmd.arg("client") + .arg("http://0.0.0.0:".to_owned() + port) + .arg(&fastq_path) + .arg("--output") + .arg(&output_path) + .assert() + .success(); + }); + + // Check that xz output file was created + assert!(output_path.exists(), "XZ output file wasn't created"); + assert!( + fs::metadata(&output_path).unwrap().len() > 0, + "XZ output file is empty" + ); +} + +#[test] +fn test_filter_deplete_flag() { + let temp_dir = tempdir().unwrap(); + let fasta_path = temp_dir.path().join("ref.fasta"); + let fastq_path = temp_dir.path().join("reads.fastq"); 
+ let bin_path = temp_dir.path().join("ref.bin"); + let output_path = temp_dir.path().join("filtered_depleted.fastq"); + + create_test_fasta(&fasta_path); + create_test_fastq(&fastq_path); + build_index(&fasta_path, &bin_path); + + run_with_server!(bin_path.clone(), |port: &str| { + let mut cmd = Command::cargo_bin("deacon").unwrap(); + cmd.arg("client") + .arg("--deplete") + .arg("http://0.0.0.0:".to_owned() + port) + .arg(&fastq_path) + .arg("--output") + .arg(&output_path) + .assert() + .success(); + }); + + assert!( + output_path.exists(), + "Output file with deplete flag wasn't created" + ); +} + +#[test] +fn test_filter_rename() { + let temp_dir = tempdir().unwrap(); + let fasta_path = temp_dir.path().join("ref.fasta"); + let fastq_path = temp_dir.path().join("reads.fastq"); + let bin_path = temp_dir.path().join("ref.bin"); + let output_path = temp_dir.path().join("filtered_renamed.fastq"); + + create_test_fasta(&fasta_path); + create_test_fastq(&fastq_path); + build_index(&fasta_path, &bin_path); + run_with_server!(bin_path.clone(), |port: &str| { + let mut cmd = Command::cargo_bin("deacon").unwrap(); + cmd.arg("client") + .arg("--rename") + .arg("-a") + .arg("1") + .arg("-r") + .arg("0.0") + .arg("http://0.0.0.0:".to_owned() + port) + .arg(&fastq_path) + .arg("--output") + .arg(&output_path) + .assert() + .success(); + }); + + assert!( + output_path.exists(), + "Output file with rename flag wasn't created" + ); + + let output_content = fs::read_to_string(&output_path).unwrap(); + assert!( + output_content.contains("@1\n") || output_content.contains("@2\n"), + "Output does not contain renamed sequences" + ); +} + +#[test] +fn test_filter_min_matches() { + let temp_dir = tempdir().unwrap(); + let fasta_path = temp_dir.path().join("ref.fasta"); + let fastq_path = temp_dir.path().join("reads.fastq"); + let bin_path = temp_dir.path().join("ref.bin"); + let output_path = temp_dir.path().join("filtered_min_matches.fastq"); + + create_test_fasta(&fasta_path); + create_test_fastq(&fastq_path); + build_index(&fasta_path, &bin_path); + run_with_server!(bin_path.clone(), |port: &str| { + let mut cmd = Command::cargo_bin("deacon").unwrap(); + cmd.arg("client") + .arg("--abs-threshold") + .arg("2") + .arg("--rel-threshold") + .arg("0.01") + .arg("http://0.0.0.0:".to_owned() + port) + .arg(&fastq_path) + .arg("--output") + .arg(&output_path) + .assert() + .success(); + }); + + assert!( + output_path.exists(), + "Output file with min_matches parameter wasn't created" + ); +} + +#[test] +fn test_filter_prefix_length() { + let temp_dir = tempdir().unwrap(); + let fasta_path = temp_dir.path().join("ref.fasta"); + let fastq_path = temp_dir.path().join("reads.fastq"); + let bin_path = temp_dir.path().join("ref.bin"); + let output_path = temp_dir.path().join("filtered_prefix.fastq"); + + create_test_fasta(&fasta_path); + create_test_fastq(&fastq_path); + build_index(&fasta_path, &bin_path); + run_with_server!(bin_path.clone(), |port: &str| { + let mut cmd = Command::cargo_bin("deacon").unwrap(); + cmd.arg("client") + .arg("--prefix-length") + .arg("6") + .arg("http://0.0.0.0:".to_owned() + port) + .arg(&fastq_path) + .arg("--output") + .arg(&output_path) + .assert() + .success(); + }); + + assert!( + output_path.exists(), + "Output file with prefix_length parameter wasn't created" + ); +} + +#[test] +fn test_filter_paired() { + let temp_dir = tempdir().unwrap(); + let fasta_path = temp_dir.path().join("ref.fasta"); + let fastq_path1 = temp_dir.path().join("reads_1.fastq"); + let fastq_path2 = 
temp_dir.path().join("reads_2.fastq"); + let bin_path = temp_dir.path().join("ref.bin"); + let output_path = temp_dir.path().join("filtered.fastq"); + + create_test_fasta(&fasta_path); + create_test_paired_fastq(&fastq_path1, &fastq_path2); + + build_index(&fasta_path, &bin_path); + assert!(bin_path.exists(), "Index file wasn't created"); + + // Run filtering command with paired-end reads (using -a 1 so short sequences pass through) + run_with_server!(bin_path.clone(), |port: &str| { + let mut cmd = Command::cargo_bin("deacon").unwrap(); + cmd.arg("client") + .arg("-a") + .arg("1") + .arg("-r") + .arg("0.0") + .arg("http://0.0.0.0:".to_owned() + port) + .arg(&fastq_path1) + .arg(&fastq_path2) + .arg("--output") + .arg(&output_path) + .assert() + .success(); + }); + + // Check output creation + assert!(output_path.exists(), "Output file wasn't created"); + + // Validate output content (should be interleaved) + let output_content = fs::read_to_string(&output_path).unwrap(); + assert!(!output_content.is_empty(), "Output file is empty"); +} + +#[test] +fn test_filter_paired_with_deplete() { + let temp_dir = tempdir().unwrap(); + let fasta_path = temp_dir.path().join("ref.fasta"); + let fastq_path1 = temp_dir.path().join("reads_1.fastq"); + let fastq_path2 = temp_dir.path().join("reads_2.fastq"); + let bin_path = temp_dir.path().join("ref.bin"); + let output_path = temp_dir.path().join("filtered_depleted.fastq"); + + create_test_fasta(&fasta_path); + create_test_paired_fastq(&fastq_path1, &fastq_path2); + build_index(&fasta_path, &bin_path); + run_with_server!(bin_path.clone(), |port: &str| { + let mut cmd = Command::cargo_bin("deacon").unwrap(); + cmd.arg("client") + .arg("--deplete") + .arg("http://0.0.0.0:".to_owned() + port) + .arg(&fastq_path1) + .arg(&fastq_path2) + .arg("--output") + .arg(&output_path) + .assert() + .success(); + }); + + assert!( + output_path.exists(), + "Output file with deplete flag wasn't created" + ); +} + +#[test] +fn test_filter_paired_with_rename() { + let temp_dir = tempdir().unwrap(); + let fasta_path = temp_dir.path().join("ref.fasta"); + let fastq_path1 = temp_dir.path().join("reads_1.fastq"); + let fastq_path2 = temp_dir.path().join("reads_2.fastq"); + let bin_path = temp_dir.path().join("ref.bin"); + let output_path = temp_dir.path().join("filtered_renamed.fastq"); + + create_test_fasta(&fasta_path); + create_test_paired_fastq(&fastq_path1, &fastq_path2); + build_index(&fasta_path, &bin_path); + run_with_server!(bin_path.clone(), |port: &str| { + let mut cmd = Command::cargo_bin("deacon").unwrap(); + cmd.arg("client") + .arg("--rename") + .arg("-a") + .arg("1") + .arg("-r") + .arg("0.0") + .arg("http://0.0.0.0:".to_owned() + port) + .arg(&fastq_path1) + .arg(&fastq_path2) + .arg("--output") + .arg(&output_path) + .assert() + .success(); + }); + + assert!( + output_path.exists(), + "Output file with rename flag wasn't created" + ); + + let output_content = fs::read_to_string(&output_path).unwrap(); + assert!( + output_content.contains("@1\n") && output_content.contains("@2\n"), + "Output does not contain renamed sequences" + ); +} + +#[test] +fn test_filter_paired_with_min_matches() { + let temp_dir = tempdir().unwrap(); + let fasta_path = temp_dir.path().join("ref.fasta"); + let fastq_path1 = temp_dir.path().join("reads_1.fastq"); + let fastq_path2 = temp_dir.path().join("reads_2.fastq"); + let bin_path = temp_dir.path().join("ref.bin"); + let output_path = temp_dir.path().join("filtered_min_matches.fastq"); + + create_test_fasta(&fasta_path); + 
create_test_paired_fastq(&fastq_path1, &fastq_path2); + build_index(&fasta_path, &bin_path); + run_with_server!(bin_path.clone(), |port: &str| { + let mut cmd = Command::cargo_bin("deacon").unwrap(); + cmd.arg("client") + .arg("--abs-threshold") + .arg("2") + .arg("--rel-threshold") + .arg("0.01") + .arg("http://0.0.0.0:".to_owned() + port) + .arg(&fastq_path1) + .arg(&fastq_path2) + .arg("--output") + .arg(&output_path) + .assert() + .success(); + }); + + assert!( + output_path.exists(), + "Output file with min_matches parameter wasn't created" + ); +} + +#[test] +fn test_interleaved_paired_reads_stdin() { + let temp_dir = tempdir().unwrap(); + let fasta_path = temp_dir.path().join("ref.fasta"); + let interleaved_fastq_path = temp_dir.path().join("interleaved_reads.fastq"); + let bin_path = temp_dir.path().join("ref.bin"); + let output_path = temp_dir.path().join("filtered.fastq"); + + // Create test files + create_test_fasta(&fasta_path); + + let interleaved_content = + "@read1/1\nACGTGCATAGCTGCATGCATGCATGCATGCATGCATGCAATGCAACGTGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCA\n+\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n@read1/2\nACGTGCATAGCTGCATGCATGCATGCATGCATGCATGCAATGCAACGTGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCA\n+\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + .to_owned() + + "@read2/1\nTGCAGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATTGCAGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGC\n+\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n@read2/2\nTGCAGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATTGCAGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGC\n+\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"; + fs::write(&interleaved_fastq_path, interleaved_content).unwrap(); + + build_index(&fasta_path, &bin_path); + assert!(bin_path.exists(), "Index file wasn't created"); + + // Test piping interleaved file to stdin for processing + run_with_server!(bin_path.clone(), |port: &str| { + let mut cmd = StdCommand::new(assert_cmd::cargo::cargo_bin("deacon")); + let output = cmd + .arg("client") + .arg("-a") + .arg("1") + .arg("-r") + .arg("0.0") + .arg("http://0.0.0.0:".to_owned() + port) + .arg("-") // stdin for input + .arg("-") // stdin for input2 (signals interleaved mode) + .arg("--output") + .arg(&output_path) + .stdin(File::open(&interleaved_fastq_path).unwrap()) + .output() + .expect("Failed to execute command"); + + assert!(output.status.success(), "Command failed"); + assert!(output_path.exists(), "Output file wasn't created"); + }); + + // Validate output content (should contain processed reads) + let output_content = fs::read_to_string(&output_path).unwrap(); + assert!(!output_content.is_empty(), "Output file is empty"); +} + +#[test] +fn test_single_read_stdin() { + let temp_dir = tempdir().unwrap(); + let fasta_path = temp_dir.path().join("ref.fasta"); + let fastq_path = temp_dir.path().join("reads.fastq"); + let bin_path = temp_dir.path().join("ref.bin"); + let output_path = temp_dir.path().join("filtered.fastq"); + + 
create_test_fasta(&fasta_path); + + let fastq_content = "@read1\nACGTGCATAGCTGCATGCATGCATGCATGCATGCATGCAATGCAACGTGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCA\n+\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n@read2\nTGCAGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATTGCAGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGC\n+\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"; + fs::write(&fastq_path, fastq_content).unwrap(); + + build_index(&fasta_path, &bin_path); + assert!(bin_path.exists(), "Index file wasn't created"); + + // Test single-end stdin + run_with_server!(bin_path.clone(), |port: &str| { + let mut cmd = StdCommand::new(assert_cmd::cargo::cargo_bin("deacon")); + let output = cmd + .arg("client") + .arg("-a") + .arg("1") + .arg("-r") + .arg("0.0") + .arg("http://0.0.0.0:".to_owned() + port) + .arg("-") // stdin + .arg("--output") + .arg(&output_path) + .stdin(File::open(&fastq_path).unwrap()) + .output() + .expect("Failed to execute command"); + + assert!( + output.status.success(), + "Command failed for single-read stdin" + ); + }); + assert!( + output_path.exists(), + "Output file wasn't created for single-read stdin" + ); + + let output_content = fs::read_to_string(&output_path).unwrap(); + assert!( + !output_content.is_empty(), + "Output file is empty for single-read stdin" + ); + assert!( + output_content.contains("read1"), + "read1 not found in output" + ); + assert!( + output_content.contains("read2"), + "read2 not found in output" + ); +} + +#[test] +fn test_filter_filtration_fwd() { + // Tests filtering with forward reads from SC2 + let temp_dir = tempdir().unwrap(); + let fasta_path = temp_dir.path().join("ref.fasta"); + let fastq_path = temp_dir.path().join("reads.fastq"); + let bin_path = temp_dir.path().join("ref.bin"); + let output_path = temp_dir.path().join("filtered.fastq"); + let summary_path = temp_dir.path().join("summary.json"); + + create_test_fasta_sc2(&fasta_path); + create_test_fastq_sc2_fwd(&fastq_path); + + build_index(&fasta_path, &bin_path); + assert!(bin_path.exists(), "Index file wasn't created"); + run_with_server!(bin_path.clone(), |port: &str| { + let mut cmd = Command::cargo_bin("deacon").unwrap(); + cmd.arg("client") + .arg("--deplete") + .arg("http://0.0.0.0:".to_owned() + port) + .arg(&fastq_path) + .arg("--output") + .arg(&output_path) + .arg("--summary") + .arg(&summary_path) + .arg("--abs-threshold") + .arg("1") + .arg("--rel-threshold") + .arg("0.01") + .assert() + .success(); + }); + + assert!(output_path.exists(), "Output file wasn't created"); + assert!(summary_path.exists(), "Summary file wasn't created"); + + let output_content = fs::read_to_string(&output_path).unwrap(); + assert!(output_content.is_empty(), "Output file is not empty"); +} + +#[test] +fn test_filter_filtration_rev() { + // Tests filtering with reverse read from SC2 + let temp_dir = tempdir().unwrap(); + let fasta_path = temp_dir.path().join("ref.fasta"); + let fastq_path = temp_dir.path().join("reads.fastq"); + let bin_path = temp_dir.path().join("ref.bin"); + let output_path = temp_dir.path().join("filtered.fastq"); + let summary_path = temp_dir.path().join("summary.json"); + + create_test_fasta_sc2(&fasta_path); + create_test_fastq_sc2_rev(&fastq_path); + + build_index(&fasta_path, &bin_path); + 
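+
+    // Why the reverse read is still expected to hit: minimizers are assumed to be
+    // computed over canonical (strand-independent) k-mers, so a reverse-complemented
+    // SC2 read should yield the same minimizer hashes as the forward reference.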
assert!(bin_path.exists(), "Index file wasn't created"); + run_with_server!(bin_path.clone(), |port: &str| { + let mut cmd = Command::cargo_bin("deacon").unwrap(); + cmd.arg("client") + .arg("--deplete") + .arg("http://0.0.0.0:".to_owned() + port) + .arg(&fastq_path) + .arg("--output") + .arg(&output_path) + .arg("--summary") + .arg(&summary_path) + .assert() + .success(); + }); + + assert!(output_path.exists(), "Output file wasn't created"); + assert!(summary_path.exists(), "Summary file wasn't created"); + + let output_content = fs::read_to_string(&output_path).unwrap(); + assert!(output_content.is_empty(), "Output file is not empty"); +} + +#[test] +fn test_filter_paired_filtration_fwd() { + // Tests that both reads are filtered when a forward read matches the SC2 ref + let temp_dir = tempdir().unwrap(); + let fasta_path = temp_dir.path().join("ref.fasta"); + let fastq_path1 = temp_dir.path().join("reads_1.fastq"); + let fastq_path2 = temp_dir.path().join("reads_2.fastq"); + let bin_path = temp_dir.path().join("ref.bin"); + let output_path = temp_dir.path().join("filtered.fastq"); + + create_test_fasta_sc2(&fasta_path); + create_test_paired_fastq_sc2_fwd(&fastq_path1, &fastq_path2); + + build_index(&fasta_path, &bin_path); + assert!(bin_path.exists(), "Index file wasn't created"); + run_with_server!(bin_path.clone(), |port: &str| { + let mut cmd = Command::cargo_bin("deacon").unwrap(); + cmd.arg("client") + .arg("--deplete") + .arg("http://0.0.0.0:".to_owned() + port) + .arg(&fastq_path1) + .arg(&fastq_path2) + .arg("--output") + .arg(&output_path) + .assert() + .success(); + }); + + assert!(output_path.exists(), "Output file wasn't created"); + + let output_content = fs::read_to_string(&output_path).unwrap(); + assert!(output_content.is_empty(), "Output file is not empty"); +} + +#[test] +fn test_filter_paired_filtration_rev() { + // Tests that both reads are filtered when a reverse read matches the SC2 ref + let temp_dir = tempdir().unwrap(); + let fasta_path = temp_dir.path().join("ref.fasta"); + let fastq_path1 = temp_dir.path().join("reads_1.fastq"); + let fastq_path2 = temp_dir.path().join("reads_2.fastq"); + let bin_path = temp_dir.path().join("ref.bin"); + let output_path = temp_dir.path().join("filtered.fastq"); + + create_test_fasta_sc2(&fasta_path); + create_test_paired_fastq_sc2_rev(&fastq_path1, &fastq_path2); + + build_index(&fasta_path, &bin_path); + assert!(bin_path.exists(), "Index file wasn't created"); + run_with_server!(bin_path.clone(), |port: &str| { + let mut cmd = Command::cargo_bin("deacon").unwrap(); + cmd.arg("client") + .arg("--deplete") + .arg("http://0.0.0.0:".to_owned() + port) + .arg(&fastq_path1) + .arg(&fastq_path2) + .arg("--output") + .arg(&output_path) + .assert() + .success(); + }); + + assert!(output_path.exists(), "Output file wasn't created"); + + let output_content = fs::read_to_string(&output_path).unwrap(); + assert!(output_content.is_empty(), "Output file is not empty"); +} + +#[cfg(test)] +mod output2_tests { + use assert_cmd::Command; + use std::fs; + use std::path::Path; + use std::process::Command as StdCommand; + use tempfile::tempdir; + + fn create_test_fasta(path: &Path) { + let fasta_content = ">seq1\nATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAA\n>seq2\nCGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAAC\n"; + fs::write(path, fasta_content).unwrap(); + } + + fn create_test_paired_fastq(path1: &Path, path2: &Path) { + let fastq_content1 = 
"@read1\nATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAA\n+\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n@read2\nCGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAAC\n+\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"; + let fastq_content2 = "@read1\nTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTG\n+\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n@read2\nTTGCAGCCGATCATCAGCACATCTAGGTTTCGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTC\n+\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"; + + fs::write(path1, fastq_content1).unwrap(); + fs::write(path2, fastq_content2).unwrap(); + } + + fn build_index(fasta_path: &Path, bin_path: &Path) { + let output = StdCommand::new(assert_cmd::cargo::cargo_bin("deacon")) + .arg("index") + .arg("build") + .arg(fasta_path) + .output() + .expect("Failed to execute command"); + + fs::write(bin_path, output.stdout).expect("Failed to write index file"); + assert!(output.status.success(), "Index build command failed"); + } + + #[test] + fn test_filter_paired_with_output2() { + let temp_dir = tempdir().unwrap(); + let fasta_path = temp_dir.path().join("ref.fasta"); + let fastq_path1 = temp_dir.path().join("reads_1.fastq"); + let fastq_path2 = temp_dir.path().join("reads_2.fastq"); + let bin_path = temp_dir.path().join("ref.bin"); + let output_path1 = temp_dir.path().join("filtered_1.fastq.gz"); + let output_path2 = temp_dir.path().join("filtered_2.fastq.gz"); + + create_test_fasta(&fasta_path); + create_test_paired_fastq(&fastq_path1, &fastq_path2); + + build_index(&fasta_path, &bin_path); + assert!(bin_path.exists(), "Index file wasn't created"); + + // Run filtering command with separate output files + run_with_server!(bin_path.clone(), |port: &str| { + let mut cmd = Command::cargo_bin("deacon").unwrap(); + cmd.arg("client") + .arg("http://0.0.0.0:".to_owned() + port) + .arg(&fastq_path1) + .arg(&fastq_path2) + .arg("--output") + .arg(&output_path1) + .arg("--output2") + .arg(&output_path2) + .assert() + .success(); + }); + + // Check both output files were created + assert!(output_path1.exists(), "First output file wasn't created"); + assert!(output_path2.exists(), "Second output file wasn't created"); + + // Validate output content + assert!( + fs::metadata(&output_path1).unwrap().len() > 0, + "First gzipped output file is empty" + ); + assert!( + fs::metadata(&output_path2).unwrap().len() > 0, + "Second gzipped output file is empty" + ); + + // Actually decompress and check if there are reads + use flate2::read::GzDecoder; + use std::fs::File; + use std::io::Read; + + let file1 = File::open(&output_path1).unwrap(); + let mut gz1 = GzDecoder::new(file1); + let mut contents1 = String::new(); + gz1.read_to_string(&mut contents1).unwrap(); + + let file2 = File::open(&output_path2).unwrap(); + let mut gz2 = GzDecoder::new(file2); + let mut contents2 = String::new(); + gz2.read_to_string(&mut contents2).unwrap(); + + println!( + "Output2 test - Output1 length: {}, Output2 length: {}", + contents1.len(), + contents2.len() + ); + println!( + "Output2 test - Output1 preview: {:?}", + &contents1.chars().take(100).collect::() + ); + println!( + "Output2 test - Output2 preview: {:?}", + &contents2.chars().take(100).collect::() + ); + } + + #[test] + fn test_filter_paired_with_output2_gzip() { + let temp_dir = tempdir().unwrap(); + let fasta_path = temp_dir.path().join("ref.fasta"); + let fastq_path1 = 
+
+    #[test]
+    fn test_filter_paired_with_output2_gzip() {
+        let temp_dir = tempdir().unwrap();
+        let fasta_path = temp_dir.path().join("ref.fasta");
+        let fastq_path1 = temp_dir.path().join("reads_1.fastq");
+        let fastq_path2 = temp_dir.path().join("reads_2.fastq");
+        let bin_path = temp_dir.path().join("ref.bin");
+        let output_path1 = temp_dir.path().join("filtered_1.fastq.gz");
+        let output_path2 = temp_dir.path().join("filtered_2.fastq.gz");
+
+        create_test_fasta(&fasta_path);
+        create_test_paired_fastq(&fastq_path1, &fastq_path2);
+        build_index(&fasta_path, &bin_path);
+        run_with_server!(bin_path.clone(), |port: &str| {
+            let mut cmd = Command::cargo_bin("deacon").unwrap();
+            cmd.arg("client")
+                .arg("http://0.0.0.0:".to_owned() + port)
+                .arg(&fastq_path1)
+                .arg(&fastq_path2)
+                .arg("--output")
+                .arg(&output_path1)
+                .arg("--output2")
+                .arg(&output_path2)
+                .assert()
+                .success();
+        });
+
+        // Check both gzipped output files were created
+        assert!(
+            output_path1.exists(),
+            "First gzipped output file wasn't created"
+        );
+        assert!(
+            output_path2.exists(),
+            "Second gzipped output file wasn't created"
+        );
+
+        assert!(
+            fs::metadata(&output_path1).unwrap().len() > 0,
+            "First gzipped output file is empty"
+        );
+        assert!(
+            fs::metadata(&output_path2).unwrap().len() > 0,
+            "Second gzipped output file is empty"
+        );
+
+        // Actually decompress and check if there are reads
+        use flate2::read::GzDecoder;
+        use std::fs::File;
+        use std::io::Read;
+
+        let file1 = File::open(&output_path1).unwrap();
+        let mut gz1 = GzDecoder::new(file1);
+        let mut contents1 = String::new();
+        gz1.read_to_string(&mut contents1).unwrap();
+
+        let file2 = File::open(&output_path2).unwrap();
+        let mut gz2 = GzDecoder::new(file2);
+        let mut contents2 = String::new();
+        gz2.read_to_string(&mut contents2).unwrap();
+
+        println!(
+            "Gzip test - Output1 length: {}, Output2 length: {}",
+            contents1.len(),
+            contents2.len()
+        );
+        println!(
+            "Gzip test - Output1 preview: {:?}",
+            &contents1.chars().take(100).collect::<String>()
+        );
+        println!(
+            "Gzip test - Output2 preview: {:?}",
+            &contents2.chars().take(100).collect::<String>()
+        );
+    }
+
+    #[test]
+    fn test_filter_single_input_with_output2_warning() {
+        let temp_dir = tempdir().unwrap();
+        let fasta_path = temp_dir.path().join("ref.fasta");
+        let fastq_path = temp_dir.path().join("reads.fastq");
+        let bin_path = temp_dir.path().join("ref.bin");
+        let output_path = temp_dir.path().join("filtered.fastq");
+        let output_path2 = temp_dir.path().join("filtered_2.fastq");
+
+        create_test_fasta(&fasta_path);
+        let fastq_content = "@seq1\nACGTACGTACGT\n+\n~~~~~~~~~~~~\n";
+        fs::write(&fastq_path, fastq_content).unwrap();
+        build_index(&fasta_path, &bin_path);
+
+        // Run filtering command with output2 but no second input (should warn)
+        run_with_server!(bin_path.clone(), |port: &str| {
+            let mut cmd = Command::cargo_bin("deacon").unwrap();
+            cmd.arg("client")
+                .arg("http://0.0.0.0:".to_owned() + port)
+                .arg(&fastq_path)
+                .arg("--output")
+                .arg(&output_path)
+                .arg("-O")
+                .arg(&output_path2)
+                .assert()
+                .success()
+                .stderr(predicates::str::contains("Warning"));
+        });
+
+        // Check only the first output file was created
+        assert!(output_path.exists(), "First output file wasn't created");
+        assert!(
+            !output_path2.exists(),
+            "Second output file shouldn't be created for single input"
+        );
+    }
+}
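+
+// A minimal sketch of the counting rule that the next test pins down. This is a
+// hypothetical helper, not the shipped implementation: hits for a pair are taken
+// over the union of both mates' minimizer hashes, so a hash shared by the two
+// mates can contribute at most once.
+#[allow(dead_code)]
+fn count_pair_hits_sketch(
+    index: &std::collections::HashSet<u64>,
+    mate1_hashes: &[u64],
+    mate2_hashes: &[u64],
+) -> usize {
+    // Deduplicate across both mates before comparing against the index
+    let unique: std::collections::HashSet<u64> = mate1_hashes
+        .iter()
+        .chain(mate2_hashes.iter())
+        .copied()
+        .collect();
+    unique.iter().filter(|hash| index.contains(hash)).count()
+}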
+
+#[test]
+fn test_shared_minimizer_counted_once() {
+    // Catch bug where the same minimizer in different paired mates is counted twice
+    let temp_dir = tempdir().unwrap();
+    let fasta_path = temp_dir.path().join("ref.fasta");
+    let fasta_path1 = temp_dir.path().join("reads_1.fasta");
+    let fasta_path2 = temp_dir.path().join("reads_2.fasta");
+    let bin_path = temp_dir.path().join("ref.bin");
+    let output_path = temp_dir.path().join("filtered.fasta");
+    let summary_path = temp_dir.path().join("summary.json");
+
+    // Create 120bp ref
+    let ref_content = ">reference\nACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT\n";
+    fs::write(&fasta_path, ref_content).unwrap();
+
+    // Create paired reads (80bp each) where both contain the same 60bp region from the reference
+    // Shared region: first 60bp of reference (ACGT repeated 15 times)
+    let fasta_content1 = ">read1/1\n\
+    AAAAAAAAAACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTAAAAAAAAAA\n";
+
+    let fasta_content2 = ">read1/2\n\
+    TTTTTTTTTTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTTTTTTTTTTT\n";
+
+    fs::write(&fasta_path1, fasta_content1).unwrap();
+    fs::write(&fasta_path2, fasta_content2).unwrap();
+
+    build_index(&fasta_path, &bin_path);
+    assert!(bin_path.exists(), "Index file wasn't created");
+
+    // If shared minimizers are counted once (correct): total hits = 1, pair kept (1 < 2)
+    // If shared minimizers are counted twice (bug): total hits = 2+, pair filtered (2+ >= 2)
+    // Using --deplete to restore original behavior for this bug test
+    run_with_server!(bin_path.clone(), |port: &str| {
+        let mut cmd = Command::cargo_bin("deacon").unwrap();
+        cmd.arg("client")
+            .arg("--deplete")
+            .arg("http://0.0.0.0:".to_owned() + port)
+            .arg(&fasta_path1)
+            .arg(&fasta_path2)
+            .arg("--output")
+            .arg(&output_path)
+            .arg("--summary")
+            .arg(&summary_path)
+            .arg("--abs-threshold")
+            .arg("2")
+            .arg("--rel-threshold")
+            .arg("0.01") // Critical parameter: any pair with 2+ hits gets filtered
+            .assert()
+            .success();
+    });
+
+    assert!(output_path.exists(), "Output file wasn't created");
+    assert!(summary_path.exists(), "Summary file wasn't created");
+
+    let output_content = fs::read_to_string(&output_path).unwrap();
+    let summary_content = fs::read_to_string(&summary_path).unwrap();
+    let summary: serde_json::Value = serde_json::from_str(&summary_content).unwrap();
+
+    // The reads should be kept because shared minimizers should only count once
+    assert!(
+        !output_content.is_empty(),
+        "Read pair should be kept in output because shared minimizers should only count once. \
+        Current implementation incorrectly counts them multiple times and filters the pair."
+    );
+
+    // Additional verification using the JSON summary
+    let seqs_out = summary["seqs_out"].as_u64().unwrap();
+    assert_eq!(
+        seqs_out, 2,
+        "Expected 2 sequences in output (both reads of the pair should be kept) \
+        but got {seqs_out}. This indicates shared minimizers were double-counted."
+ ); +} + +#[test] +fn test_filter_proportional_threshold() { + let temp_dir = tempdir().unwrap(); + let fasta_path = temp_dir.path().join("ref.fasta"); + let fastq_path = temp_dir.path().join("reads.fastq"); + let bin_path = temp_dir.path().join("ref.bin"); + let output_path = temp_dir.path().join("filtered_proportional.fastq"); + + create_test_fasta(&fasta_path); + create_test_fastq(&fastq_path); + build_index(&fasta_path, &bin_path); + run_with_server!(bin_path.clone(), |port: &str| { + let mut cmd = Command::cargo_bin("deacon").unwrap(); + cmd.arg("client") + .arg("--abs-threshold") + .arg("1") + .arg("--rel-threshold") + .arg("0.5") // 50% proportional threshold + .arg("http://0.0.0.0:".to_owned() + port) + .arg(&fastq_path) + .arg("--output") + .arg(&output_path) + .assert() + .success(); + }); + + assert!( + output_path.exists(), + "Output file with proportional threshold wasn't created" + ); +} + +#[test] +fn test_filter_proportional_paired() { + let temp_dir = tempdir().unwrap(); + let fasta_path = temp_dir.path().join("ref.fasta"); + let fastq_path1 = temp_dir.path().join("reads_1.fastq"); + let fastq_path2 = temp_dir.path().join("reads_2.fastq"); + let bin_path = temp_dir.path().join("ref.bin"); + let output_path = temp_dir.path().join("filtered_proportional_paired.fastq"); + + create_test_fasta(&fasta_path); + create_test_paired_fastq(&fastq_path1, &fastq_path2); + build_index(&fasta_path, &bin_path); + run_with_server!(bin_path.clone(), |port: &str| { + let mut cmd = Command::cargo_bin("deacon").unwrap(); + cmd.arg("client") + .arg("--abs-threshold") + .arg("1") + .arg("--rel-threshold") + .arg("0.3") // 30% proportional threshold + .arg("http://0.0.0.0:".to_owned() + port) + .arg(&fastq_path1) + .arg(&fastq_path2) + .arg("--output") + .arg(&output_path) + .assert() + .success(); + }); + + assert!( + output_path.exists(), + "Output file with proportional threshold for paired reads wasn't created" + ); +} + +#[test] +fn test_filter_edge_case_proportional_values() { + let temp_dir = tempdir().unwrap(); + let fasta_path = temp_dir.path().join("ref.fasta"); + let fastq_path = temp_dir.path().join("reads.fastq"); + let bin_path = temp_dir.path().join("ref.bin"); + let output_path = temp_dir.path().join("filtered_edge.fastq"); + + create_test_fasta(&fasta_path); + create_test_fastq(&fastq_path); + build_index(&fasta_path, &bin_path.clone()); + + // Work around moving values into closures for server running + let bin_path1 = bin_path.clone(); + let bin_path2 = bin_path.clone(); + + // Test with 0.0 (should pass everything) + run_with_server!(bin_path1, |port: &str| { + let mut cmd = Command::cargo_bin("deacon").unwrap(); + cmd.arg("client") + .arg("--abs-threshold") + .arg("1") + .arg("--rel-threshold") + .arg("0.0") + .arg("http://0.0.0.0:".to_owned() + port) + .arg(&fastq_path) + .arg("--output") + .arg(&output_path) + .assert() + .success(); + }); + + // Test with 1.0 (very strict) + let output_path_strict = temp_dir.path().join("filtered_strict.fastq"); + run_with_server!(bin_path2, |port: &str| { + let mut cmd = Command::cargo_bin("deacon").unwrap(); + cmd.arg("client") + .arg("--abs-threshold") + .arg("1") + .arg("--rel-threshold") + .arg("1.0") + .arg("http://0.0.0.0:".to_owned() + port) + .arg(&fastq_path) + .arg("--output") + .arg(&output_path_strict) + .assert() + .success(); + }); + + assert!( + output_path.exists(), + "Output with 0.0 threshold wasn't created" + ); + assert!( + output_path_strict.exists(), + "Output with 1.0 threshold wasn't created" + ); +} + +#[test] 
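+// Context for the two newline-handling tests below: with the common 2-bit base
+// packing `(byte >> 1) & 3`, 'A' -> 0, 'C' -> 1, 'T' -> 2, 'G' -> 3, and a stray
+// '\n' (0x0A) also encodes as 1, i.e. it aliases to 'C'. That aliasing is exactly
+// the false-match mode test_newline_mapping_bug guards against. (Assumed encoding;
+// the indexing crate's actual mapping may differ.)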
+fn test_multiline_fasta_matching() { + let temp_dir = tempdir().unwrap(); + let ref_path = temp_dir.path().join("ref.fasta"); + let query_path = temp_dir.path().join("query.fasta"); + let bin_path = temp_dir.path().join("ref.bin"); + let output_path = temp_dir.path().join("output.fasta"); + + let reference_fasta = ">ref\nACGTTTAAGGCCAACCACACACACACACATT\n"; + let query_fasta = ">query\nACGTTTAAGGCCAACC\nACACACACACACATT\n"; + + fs::write(&ref_path, reference_fasta).unwrap(); + fs::write(&query_path, query_fasta).unwrap(); + + // Build index with k=31, w=1 + let output = StdCommand::new(assert_cmd::cargo::cargo_bin("deacon")) + .arg("index") + .arg("build") + .arg("-k") + .arg("31") + .arg("-w") + .arg("1") + .arg(&ref_path) + .output() + .expect("Failed to execute index command"); + + fs::write(&bin_path, output.stdout).expect("Failed to write index file"); + assert!(output.status.success(), "Index build command failed"); + + // client with -a 1 + run_with_server!(bin_path.clone(), |port: &str| { + let mut cmd = Command::cargo_bin("deacon").unwrap(); + cmd.arg("client") + .arg("-a") + .arg("1") + .arg("http://0.0.0.0:".to_owned() + port) + .arg(&query_path) + .arg("-o") + .arg(&output_path) + .assert() + .success(); + }); + + // Verify that mid record newline doesn't break match + let output_content = fs::read_to_string(&output_path).unwrap(); + assert!( + !output_content.is_empty(), + "Multiline FASTA should match indexed sequence" + ); + assert!( + output_content.contains(">query"), + "Output should contain query header" + ); + assert!( + output_content.contains("ACGTTTAAGGCCAACCACACACACACACATT"), + "Output should contain the full sequence" + ); +} + +#[test] +fn test_newline_mapping_bug() { + let temp_dir = tempdir().unwrap(); + let ref_path = temp_dir.path().join("reference.fa"); + let query_path = temp_dir.path().join("query.fa"); + let bin_path = temp_dir.path().join("ref.bin"); + let output_path = temp_dir.path().join("output.fa"); + + // Create reference file with sequence split across lines + // The newlines should be stripped but if they're not, they'll be mapped to 'C' + let ref_content = ">reference\nAAAAA\nAAAAA\nAAAAA\nAAAAA\n"; + fs::write(&ref_path, ref_content).unwrap(); + + // Create query file with Cs where newlines would be + let query_content = ">query\nAAAAACAAAAACAAAAACAAAAA\n"; + fs::write(&query_path, query_content).unwrap(); + + // Build index with k=5, w=5 (k+w-1 must be odd: 5+5-1=9, odd ✓) + let output = StdCommand::new(assert_cmd::cargo::cargo_bin("deacon")) + .arg("index") + .arg("build") + .arg("-k") + .arg("5") + .arg("-w") + .arg("5") + .arg(&ref_path) + .output() + .expect("Failed to execute index command"); + + fs::write(&bin_path, output.stdout).expect("Failed to write index file"); + assert!(output.status.success(), "Index build command failed"); + + // client query against index + run_with_server!(bin_path.clone(), |port: &str| { + let mut cmd = Command::cargo_bin("deacon").unwrap(); + cmd.arg("client") + .arg("-a") + .arg("1") + .arg("-r") + .arg("0.0") + .arg("http://0.0.0.0:".to_owned() + port) + .arg(&query_path) + .arg("-o") + .arg(&output_path) + .assert() + .success(); + }); + + // Read filtered output + let output_str = fs::read_to_string(&output_path).unwrap(); + + // If newlines are being mapped to C, the query would match + // The bug would cause the reference "AAAAA\nAAAAA\nAAAAA\nAAAAA" to become + // "AAAAACAAAAACAAAAACAAAAA" after mapping newlines to C + // So if the bug exists, the query would match and be filtered (kept with 
deplete=false)
+
+    // With the bug, we'd expect a match. Without the bug, no match.
+    if output_str.contains(">query") {
+        panic!(
+            "BUG DETECTED: Query matched due to newlines being mapped to 'C'. Output: {output_str}"
+        );
+    }
+
+    println!("Test passed - no false matches from newline mapping");
+}
+
+#[test]
+fn test_large_kmer_filter() {
+    let temp_dir = tempdir().unwrap();
+    let fasta_path = temp_dir.path().join("test.fasta");
+    let bin_path = temp_dir.path().join("test.bin");
+    let fastq_path = temp_dir.path().join("test.fastq");
+
+    create_test_fasta(&fasta_path);
+    create_test_fastq(&fastq_path);
+
+    // Index with k=41 (u128 code path)
+    let mut cmd = Command::cargo_bin("deacon").unwrap();
+    cmd.arg("index")
+        .arg("build")
+        .arg("-k")
+        .arg("41")
+        .arg("-w")
+        .arg("15")
+        .arg(&fasta_path)
+        .arg("-o")
+        .arg(&bin_path)
+        .assert()
+        .success();
+
+    // Test filtering with our k=41 index
+    run_with_server!(bin_path.clone(), |port: &str| {
+        let output = Command::cargo_bin("deacon")
+            .unwrap()
+            .arg("client")
+            .arg("http://0.0.0.0:".to_owned() + port)
+            .arg(&fastq_path)
+            .arg("-a")
+            .arg("1")
+            .arg("-r")
+            .arg("0.0")
+            .output()
+            .unwrap();
+
+        assert!(output.status.success(), "client command failed");
+
+        // Should retain both seqs
+        let stdout = String::from_utf8_lossy(&output.stdout);
+        let num_sequences = stdout.lines().filter(|line| line.starts_with('@')).count();
+        assert_eq!(num_sequences, 2, "Should retain both sequences");
+    });
+}

From e224760fbf0f2a863dad6e1fc4d50e37181cf314 Mon Sep 17 00:00:00 2001
From: gpas-github-bot
Date: Wed, 10 Sep 2025 11:08:17 +0100
Subject: [PATCH 6/7] fix: default to not compile with server

---
 Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Cargo.toml b/Cargo.toml
index 6032e72..04e6128 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -42,7 +42,7 @@ reqwest = { version = "0.12.22", features = ["blocking", "json"], optional = tru
 sha256 = { version = "1.6.0", optional = true }
 
 [features]
-# default = ["server"]
+default = []
 server = ["dep:tokio", "dep:axum", "dep:tracing-subscriber", "dep:reqwest", "dep:sha256"]
 
 [lints.clippy]

From 154945ae9cc6e4ed2e2555f2fd37698289f2dfbf Mon Sep 17 00:00:00 2001
From: gpas-github-bot
Date: Wed, 10 Sep 2025 11:31:30 +0100
Subject: [PATCH 7/7] docs: expand some docstrings

---
 README.md            |  8 ++++++--
 src/filter.rs        |  4 ++++
 src/filter_common.rs | 27 +++++++++++++++++++++++----
 src/local_filter.rs  |  4 ++++
 src/main.rs          | 10 +++++++---
 src/remote_filter.rs |  9 +++++++--
 src/server.rs        |  8 ++++++++
 src/server_common.rs |  2 ++
 8 files changed, 61 insertions(+), 11 deletions(-)

diff --git a/README.md b/README.md
index 5a02643..b445535 100644
--- a/README.md
+++ b/README.md
@@ -247,6 +247,8 @@ Unit tests covering all filter functionality (but with the server/client) are en
 cargo test --features server -- --test-threads 1
 ```
+This also tests the remote filtering engine in local mode to ensure that results are identical between the two filtering implementations.
+
 ### Server
 
 Start up a server with a specific index loaded. Note that this by default runs in the foreground, so it may appear as if nothing is happening after a message about loading your index. To display incoming connection logs, set `RUST_LOG=trace` in your environment variables.
@@ -267,8 +269,10 @@ Almost exactly identical to the `deacon filter` reference, but swapping index pa
 #### Usage
 
-```
-Requires feature `server`. Alternate version of Filter, swapping local compute for passing to a server which has the index pre-loaded. Will inevitably be slower than local filtering, but saves on index loading. Better used for cases of small input + large index
+```bash
+Alternate version of Filter, swapping local compute for passing to a server which has the index pre-loaded. Will inevitably be slower than local filtering, but saves on index loading. Better used for cases of small input + large index
+
+Requires "server" feature to be enabled at compile time.
 
 Usage: deacon client [OPTIONS] [INPUT] [INPUT2]
diff --git a/src/filter.rs b/src/filter.rs
index ba971b9..f13ab8f 100644
--- a/src/filter.rs
+++ b/src/filter.rs
@@ -1,3 +1,7 @@
+//! This module provides filtering functionality for processing data.
+//! Basic switching of which `run` to use based on whether the executable is compiled with the
+//! `server` feature or not.
+//!
 #[cfg(not(feature = "server"))]
 pub use crate::local_filter::run;
 #[cfg(feature = "server")]
diff --git a/src/filter_common.rs b/src/filter_common.rs
index cf9519b..48acb63 100644
--- a/src/filter_common.rs
+++ b/src/filter_common.rs
@@ -1,3 +1,8 @@
+//! Common functions for filtering sequences based on minimizer indices.
+//! Used by both local and remote filtering implementations.
+//!
+//! Includes functions for calculating minimizers, checking filtering criteria,
+//! and generating summaries of filtering operations.
 use packed_seq::SeqVec;
 use rustc_hash::FxHashSet;
 use serde::{Deserialize, Serialize};
@@ -6,7 +11,7 @@ use std::path::PathBuf;
 #[cfg(feature = "server")]
 use reqwest::blocking::Client;
 
-// JSON summary structure
+/// JSON filter summary structure
 #[derive(Serialize, Deserialize)]
 pub struct FilterSummary {
     pub version: String,
@@ -309,9 +314,23 @@ pub fn get_minimizer_hashes_and_positions(
     (minimizer_values, positions, effective_seq)
 }
 
+/// Given two sequences (e.g., paired reads), compute the combined minimizer hashes and positions
+/// from both sequences.
+///
+/// # Args:
+/// * `seq1`: The first input sequence as a byte slice.
+/// * `seq2`: The second input sequence as a byte slice.
+/// * `prefix_length`: If >0, only consider the first `prefix_length` bases of each sequence.
+/// * `kmer_length`: The length of k-mers to consider for minimizers.
+/// * `window_size`: The size of the sliding window to find minimizers.
+/// # Returns:
+/// * A tuple containing:
+///   - A vector of combined minimizer hash values (u64) from both sequences.
+///   - A vector of combined positions (u32) where each minimizer occurs in the sequences.
+///   - A vector of effective sequences (Vec<u8>) used for minimizer calculation from both sequences.
-pub fn get_paired_minimizer_hashes_and_positions<'a>(
-    seq1: &'a [u8],
-    seq2: &'a [u8],
+pub fn get_paired_minimizer_hashes_and_positions(
+    seq1: &[u8],
+    seq2: &[u8],
     prefix_length: usize,
     kmer_length: u8,
     window_size: u8,
diff --git a/src/local_filter.rs b/src/local_filter.rs
index 6a4b708..86f3a82 100644
--- a/src/local_filter.rs
+++ b/src/local_filter.rs
@@ -1,3 +1,7 @@
+//! Deacon filtering functionality, including single and paired read support.
+//! Uses paraseq for parallel processing of FASTA/FASTQ files.
+//!
+//! Includes *only* the local filtering implementation; for remote filtering see the `remote_filter` module.
 use crate::FilterSummary;
 use crate::filter_common::{
     get_minimizer_hashes_and_positions, get_paired_minimizer_hashes_and_positions,
diff --git a/src/main.rs b/src/main.rs
index af685de..844b16a 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -80,9 +80,11 @@ enum Commands {
         #[arg(short = 'q', long = "quiet", default_value_t = false)]
         quiet: bool,
     },
-    /// Requires feature `server`. Run a server to hold a pre-loaded minimizer index in memory for filtering
+    /// Run a server to hold a pre-loaded minimizer index in memory for filtering
     /// with the Client command. Saves time for filtering short sequences with large indexes
     /// but will inevitably be slower than local filtering.
+    ///
+    /// Requires "server" feature to be enabled at compile time.
     Server {
         /// Path to minimizer index file
         index: PathBuf,
@@ -91,9 +93,11 @@ enum Commands {
         #[arg(short = 'p', long = "port", default_value_t = 8888)]
         port: u16,
     },
-    /// Requires feature `server`. Alternate version of Filter, swapping local compute for passing to a server
+    /// Alternate version of Filter, swapping local compute for passing to a server
     /// which has the index pre-loaded. Will inevitably be slower than local filtering,
-    /// but saves on index loading. Better used for cases of small input + large index
+    /// but saves on index loading. Better used for cases of small input + large index.
+    ///
+    /// Requires "server" feature to be enabled at compile time.
     Client {
         /// Server address to connect to (including port)
         server_address: String,
diff --git a/src/remote_filter.rs b/src/remote_filter.rs
index 916bfd3..a4c10bd 100644
--- a/src/remote_filter.rs
+++ b/src/remote_filter.rs
@@ -1,3 +1,8 @@
+//! Deacon filtering functionality, optimised for remote operation.
+//! For faster local filtering without a remote option, see the `local_filter` module.
+//!
+//! Functionally both filter modules use the same core logic, differing only in
+//! file parsing and multithreading orchestration.
 use crate::FilterConfig;
 use crate::FilterSummary;
 use crate::filter_common::get_minimizer_hashes_and_positions;
@@ -302,7 +307,7 @@ pub fn paired_should_keep(
 
 /// Given a set of input minimizers from unpaired reads, check if they should be output
 /// If index minimizers are provided, check locally.
-/// If not, send to server for checking. Requires the `server` feature to be enabled.
+/// If not, send to server for checking. Requires the "server" feature to be enabled.
 pub fn check_single_inputs_should_be_output(
     index_minimizers: &Option<FxHashSet<u64>>,
     input_minimizers_and_positions: &Vec<(Vec<u64>, Vec<u32>, Vec<u8>)>,
@@ -365,7 +370,7 @@
 
 /// Given a set of input minimizers from paired reads, check if they should be output
 /// If index minimizers are provided, check locally.
-/// If not, send to server for checking. Requires the `server` feature to be enabled.
+/// If not, send to server for checking. Requires the "server" feature to be enabled.
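+///
+/// A rough sketch of the dispatch (illustrative, not the exact code):
+/// `match index_minimizers { Some(index) => /* compare each pair's hashes against `index` locally */,
+/// None => /* POST the prehashed minimizers to the server's /should_output_paired endpoint */ }`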
 pub fn check_paired_inputs_should_be_output(
     index_minimizers: &Option<FxHashSet<u64>>,
     input_minimizers_and_positions: &Vec<(Vec<u64>, Vec<u32>, Vec<Vec<u8>>)>,
diff --git a/src/server.rs b/src/server.rs
index ead5680..1888468 100644
--- a/src/server.rs
+++ b/src/server.rs
@@ -117,6 +117,10 @@ pub async fn index_version() -> String {
     hash.clone().unwrap()
 }
 
+/// Endpoint to determine whether to output paired reads based on their minimizers
+/// Endpoint is `/should_output_paired`
+///
+/// Simply forwards to `paired_should_keep` in the `remote_filter` module
 async fn should_output_paired(Json(request): Json<PairedFilterRequest>) -> Json {
     let index = INDEX.lock();
     match index {
@@ -139,6 +143,10 @@
     }
 }
 
+/// Endpoint to determine whether to output unpaired reads based on their minimizers
+/// Endpoint is `/should_output_unpaired`
+///
+/// Simply forwards to `unpaired_should_keep` in the `remote_filter` module
 async fn should_output_unpaired(
     Json(request): Json<UnpairedFilterRequest>,
 ) -> Json {
diff --git a/src/server_common.rs b/src/server_common.rs
index b072387..d1c2ab6 100644
--- a/src/server_common.rs
+++ b/src/server_common.rs
@@ -8,6 +8,7 @@ use serde::{Deserialize, Serialize};
 #[derive(Serialize, Deserialize)]
 pub struct UnpairedFilterRequest {
     /// Prehashed minimizers for input
+    /// Tuple of (minimizer hashes, positions, effective sequences)
     pub input: Vec<(Vec<u64>, Vec<u32>, Vec<u8>)>,
 
     /// Minimum number (integer) of minimizer hits for a match
@@ -30,6 +31,7 @@ pub struct PairedFilterRequest {
     /// Prehashed minimizers for input
+    /// Tuple of (minimizer hashes, positions, effective sequences)
     pub input: Vec<(Vec<u64>, Vec<u32>, Vec<Vec<u8>>)>,
 
     /// Minimum number (integer) of minimizer hits for a match