diff --git a/.cargo/config.toml b/.cargo/config.toml index e757e115..262a07a0 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -1,3 +1,10 @@ # This enables KaTex in docs, but requires running `cargo doc --no-deps`. [build] rustdocflags = "--html-in-header .cargo/katex-header.html" + +[target.wasm32-wasip2] +rustflags = ["-C", "target-feature=+simd128,+relaxed-simd"] + +[target.wasm32-wasip1] +runner = "wasmtime run --dir . " +rustflags = ["-C", "target-feature=+simd128,+relaxed-simd"] diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index c9c4bf6a..a7a18c56 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -18,6 +18,10 @@ jobs: steps: - uses: actions/checkout@v4 + - name: Replace divan with codspeed-divan-compat + run: | + sed -i 's/^divan = .*/divan = { package = "codspeed-divan-compat", version = "3.0.1" }/' Cargo.toml + - name: Setup Rust toolchain, cache and cargo-codspeed binary uses: moonrepo/setup-rust@v1 with: diff --git a/.gitignore b/.gitignore index f770c0ae..165e92b5 100644 --- a/.gitignore +++ b/.gitignore @@ -43,4 +43,4 @@ Cargo.lock # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ -circuit_stats_examples/ \ No newline at end of file +circuit_stats_examples/ diff --git a/Cargo.toml b/Cargo.toml index 97664360..e7b31656 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,8 +3,8 @@ resolver = "2" members = [ "skyscraper/fp-rounding", "skyscraper/hla", - "skyscraper/block-multiplier", - "skyscraper/block-multiplier-codegen", + "skyscraper/bn254-multiplier", + "skyscraper/bn254-multiplier-codegen", "skyscraper/core", "provekit/common", "provekit/r1cs-compiler", @@ -40,6 +40,9 @@ license = "MIT" homepage = "https://github.com/worldfnd/ProveKit" repository = "https://github.com/worldfnd/ProveKit" +[workspace.lints.rust] +unexpected_cfgs = { level = "warn", check-cfg = ['cfg(kani)'] } + [workspace.lints.clippy] cargo = "warn" perf = "warn" @@ -70,8 +73,8 @@ opt-level = 3 [workspace.dependencies] # Workspace members - Skyscraper -block-multiplier = { path = "skyscraper/block-multiplier" } -block-multiplier-codegen = { path = "skyscraper/block-multiplier-codegen" } +bn254-multiplier = { path = "skyscraper/bn254-multiplier" } +bn254-multiplier-codegen = { path = "skyscraper/bn254-multiplier-codegen" } fp-rounding = { path = "skyscraper/fp-rounding" } hla = { path = "skyscraper/hla" } skyscraper = { path = "skyscraper/core" } @@ -94,7 +97,9 @@ axum = "0.8.4" base64 = "0.22.1" bytes = "1.10.1" chrono = "0.4.41" -divan = { package = "codspeed-divan-compat", version = "3.0.1" } +# On CI divan get replaced by divan = { package = "codspeed-divan-compat", version = "3.0.1" } for benchmark tracking. +# This is a workaround because different package selection based on target does not mix well with workspace dependencies. +divan = "0.1.21" hex = "0.4.3" itertools = "0.14.0" paste = "1.0.15" diff --git a/skyscraper/block-multiplier/benches/bench.rs b/skyscraper/block-multiplier/benches/bench.rs deleted file mode 100644 index 3e5c6f17..00000000 --- a/skyscraper/block-multiplier/benches/bench.rs +++ /dev/null @@ -1,217 +0,0 @@ -#![feature(portable_simd)] - -use { - core::{array, simd::u64x2}, - divan::Bencher, - fp_rounding::with_rounding_mode, - rand::{rng, Rng}, -}; - -// #[divan::bench_group] -mod mul { - use super::*; - - #[divan::bench] - fn scalar_mul(bencher: Bencher) { - bencher - //.counter(ItemsCount::new(1usize)) - .with_inputs(|| rng().random()) - .bench_local_values(|(a, b)| block_multiplier::scalar_mul(a, b)); - } - - #[divan::bench] - fn ark_ff(bencher: Bencher) { - use {ark_bn254::Fr, ark_ff::BigInt}; - bencher - //.counter(ItemsCount::new(1usize)) - .with_inputs(|| { - ( - Fr::new(BigInt(rng().random())), - Fr::new(BigInt(rng().random())), - ) - }) - .bench_local_values(|(a, b)| a * b); - } - - #[divan::bench] - fn simd_mul(bencher: Bencher) { - bencher - //.counter(ItemsCount::new(2usize)) - .with_inputs(|| rng().random()) - .bench_local_values(|(a, b, c, d)| block_multiplier::simd_mul(a, b, c, d)); - } - - #[divan::bench] - fn block_mul(bencher: Bencher) { - let bencher = bencher - //.counter(ItemsCount::new(3usize)) - .with_inputs(|| rng().random()); - unsafe { - with_rounding_mode((), |guard, _| { - bencher.bench_local_values(|(a, b, c, d, e, f)| { - block_multiplier::block_mul(guard, a, b, c, d, e, f) - }); - }); - } - } - - #[divan::bench] - fn montgomery_interleaved_3(bencher: Bencher) { - let bencher = bencher - //.counter(ItemsCount::new(3usize)) - .with_inputs(|| { - ( - rng().random(), - rng().random(), - array::from_fn(|_| u64x2::from_array(rng().random())), - array::from_fn(|_| u64x2::from_array(rng().random())), - ) - }); - unsafe { - with_rounding_mode((), |mode_guard, _| { - bencher.bench_local_values(|(a, b, c, d)| { - block_multiplier::montgomery_interleaved_3(mode_guard, a, b, c, d) - }); - }); - } - } - - #[divan::bench] - fn montgomery_interleaved_4(bencher: Bencher) { - let bencher = bencher - //.counter(ItemsCount::new(4usize)) - .with_inputs(|| { - ( - rng().random(), - rng().random(), - rng().random(), - rng().random(), - array::from_fn(|_| u64x2::from_array(rng().random())), - array::from_fn(|_| u64x2::from_array(rng().random())), - ) - }); - unsafe { - with_rounding_mode((), |mode_guard, _| { - bencher.bench_local_values(|(a, b, c, d, e, f)| { - block_multiplier::montgomery_interleaved_4(mode_guard, a, b, c, d, e, f) - }); - }); - } - } -} - -// #[divan::bench_group] -mod sqr { - use {super::*, ark_ff::Field}; - - #[divan::bench] - fn scalar_sqr(bencher: Bencher) { - bencher - //.counter(ItemsCount::new(1usize)) - .with_inputs(|| rng().random()) - .bench_local_values(block_multiplier::scalar_sqr); - } - - #[divan::bench] - fn ark_ff(bencher: Bencher) { - use {ark_bn254::Fr, ark_ff::BigInt}; - bencher - //.counter(ItemsCount::new(1usize)) - .with_inputs(|| Fr::new(BigInt(rng().random()))) - .bench_local_values(|a: Fr| a.square()); - } - - #[divan::bench] - fn montgomery_square_log_interleaved_3(bencher: Bencher) { - let bencher = bencher.with_inputs(|| { - ( - rng().random(), - array::from_fn(|_| u64x2::from_array(rng().random())), - ) - }); - unsafe { - with_rounding_mode((), |mode_guard, _| { - bencher.bench_local_values(|(a, b)| { - block_multiplier::montgomery_square_log_interleaved_3(mode_guard, a, b) - }); - }); - } - } - - #[divan::bench] - fn montgomery_square_log_interleaved_4(bencher: Bencher) { - let bencher = bencher.with_inputs(|| { - ( - rng().random(), - rng().random(), - array::from_fn(|_| u64x2::from_array(rng().random())), - ) - }); - unsafe { - with_rounding_mode((), |mode_guard, _| { - bencher.bench_local_values(|(a, b, c)| { - block_multiplier::montgomery_square_log_interleaved_4(mode_guard, a, b, c) - }); - }); - } - - #[divan::bench] - fn montgomery_square_interleaved_3(bencher: Bencher) { - let bencher = bencher.with_inputs(|| { - ( - rng().random(), - array::from_fn(|_| u64x2::from_array(rng().random())), - ) - }); - unsafe { - with_rounding_mode((), |mode_guard, _| { - bencher.bench_local_values(|(a, b)| { - block_multiplier::montgomery_square_interleaved_3(mode_guard, a, b) - }); - }); - } - } - - #[divan::bench] - fn montgomery_square_interleaved_4(bencher: Bencher) { - let bencher = bencher.with_inputs(|| { - ( - rng().random(), - rng().random(), - array::from_fn(|_| u64x2::from_array(rng().random())), - ) - }); - unsafe { - with_rounding_mode((), |mode_guard, _| { - bencher.bench_local_values(|(a, b, c)| { - block_multiplier::montgomery_square_interleaved_4(mode_guard, a, b, c) - }); - }); - } - } - } - - #[divan::bench] - fn simd_sqr(bencher: Bencher) { - bencher - //.counter(ItemsCount::new(2usize)) - .with_inputs(|| rng().random()) - .bench_local_values(|(a, b)| block_multiplier::simd_sqr(a, b)); - } - - #[divan::bench] - fn block_sqr(bencher: Bencher) { - let bencher = bencher - //.counter(ItemsCount::new(3usize)) - .with_inputs(|| rng().random()); - unsafe { - with_rounding_mode((), |guard, _| { - bencher.bench_local_values(|(a, b, c)| block_multiplier::block_sqr(guard, a, b, c)); - }); - } - } -} - -fn main() { - divan::main(); -} diff --git a/skyscraper/block-multiplier/proptest-regressions/scalar.txt b/skyscraper/block-multiplier/proptest-regressions/scalar.txt deleted file mode 100644 index 4715d78f..00000000 --- a/skyscraper/block-multiplier/proptest-regressions/scalar.txt +++ /dev/null @@ -1,8 +0,0 @@ -# Seeds for failure cases proptest has generated in the past. It is -# automatically read and these particular cases re-run before any -# novel cases are generated. -# -# It is recommended to check this file in to source control so that -# everyone who runs the test benefits from these saved cases. -cc 46acc9f3c07fefb126b59a0edec37c56f92c16c1468989ed132bf42ef54ffe86 # shrinks to l = [0, 0, 0, 1], r = [0, 0, 0, 1] -cc e629632cdf5eb4aefd4fdb2da29bdbd7b2a177a69dd74f99f70683f11c942da7 # shrinks to l = [0, 887, 0, 15778841185528309819], r = [458854615557053794, 8784556235901218364, 1751211468174275388, 16873806747226852460] diff --git a/skyscraper/block-multiplier/src/aarch64/generate_montgomery_table.py b/skyscraper/block-multiplier/src/aarch64/generate_montgomery_table.py deleted file mode 100644 index bf8d78d3..00000000 --- a/skyscraper/block-multiplier/src/aarch64/generate_montgomery_table.py +++ /dev/null @@ -1,112 +0,0 @@ -p = 21888242871839275222246405745257275088696311157297823662689037894645226208583 - -U52_i1 = [ - 0x82e644ee4c3d2, - 0xf93893c98b1de, - 0xd46fe04d0a4c7, - 0x8f0aad55e2a1f, - 0x005ed0447de83, -] - -U52_i2 = [ - 0x74eccce9a797a, - 0x16ddcc30bd8a4, - 0x49ecd3539499e, - 0xb23a6fcc592b8, - 0x00e3bd49f6ee5, -] - -U52_i3 = [ - 0x0E8C656567D77, - 0x430D05713AE61, - 0xEA3BA6B167128, - 0xA7DAE55C5A296, - 0x01B4AFD513572, -] - -U52_i4 = [ - 0x22E2400E2F27D, - 0x323B46EA19686, - 0xE6C43F0DF672D, - 0x7824014C39E8B, - 0x00C6B48AFE1B8, -] - -U64_I1 = [ - 0x2d3e8053e396ee4d, - 0xca478dbeab3c92cd, - 0xb2d8f06f77f52a93, - 0x24d6ba07f7aa8f04, -] - -U64_I2 = [ - 0x18ee753c76f9dc6f, - 0x54ad7e14a329e70f, - 0x2b16366f4f7684df, - 0x133100d71fdf3579, -] - -U64_I3 = [ - 0x9BACB016127CBE4E, - 0x0B2051FA31944124, - 0xB064EEA46091C76C, - 0x2B062AAA49F80C7D, -] - -def limbs_to_int(size, xs): - total = 0 - for (i, x) in enumerate(xs): - total += x << (size*i) - - return total - -u64_i1 = limbs_to_int(64, U64_I1) -u64_i2 = limbs_to_int(64, U64_I2) -u64_i3 = limbs_to_int(64, U64_I3) - -u52_i1 = limbs_to_int(52, U52_i1) -u52_i2 = limbs_to_int(52, U52_i2) -u52_i3 = limbs_to_int(52, U52_i3) -u52_i4 = limbs_to_int(52, U52_i4) - - -def log_jump(single_input_bound): - - product_bound = single_input_bound**2 - - first_round = (product_bound>>2*64) + u64_i2 * (2**128-1) - second_round = (first_round >> 64) + u64_i1 * (2**64-1) - mont_round = second_round + p*(2**64-1) - final = mont_round >> 64 - return final - -def single_step(single_input_bound): - product_bound = single_input_bound**2 - - first_round = (product_bound>>3*64) + (u64_i3 + u64_i2 + u64_i1) * (2**64-1) - mont_round = first_round + p*(2**64-1) - final = mont_round >> 64 - return final - -def single_step_simd(single_input_bound): - product_bound = (single_input_bound<<2)**2 - - first_round = (product_bound>>4*52) + (u52_i4 + u52_i3 + u52_i2 + u52_i1) * (2**52-1) - mont_round = first_round + p*(2**52-1) - final = mont_round >> 52 - return final - -if __name__ == "__main__": - # Test bounds for different input sizes - test_bounds = [("p", p),("2p", 2*p), ("3p", 3*p), ("2ˆ256-2p",2**256-2*p)] - print("Input Size | single_step | single_step_simd | log_jump") - print("-----------|-------------|------------------|---------") - for name, bound in test_bounds: - single = single_step(bound)/p - simd = single_step_simd(bound)/p - log = log_jump(bound)/p - single_space = (2**256-1-single_step(bound))/p - simd_space = (2**256-1-single_step_simd(bound))/p - log_space = (2**256-1-log_jump(bound))/p - print(f"{name:10} | {single:4.2f} [{single_space:4.2f}] | {simd:7.2f} [{simd_space:.4f}] | {log:4.2f} [{log_space:.2f}]") - diff --git a/skyscraper/block-multiplier/src/constants.rs b/skyscraper/block-multiplier/src/constants.rs deleted file mode 100644 index 171273f5..00000000 --- a/skyscraper/block-multiplier/src/constants.rs +++ /dev/null @@ -1,151 +0,0 @@ -pub const U64_NP0: u64 = 0xc2e1f593efffffff; - -pub const U64_P: [u64; 4] = [ - 0x43e1f593f0000001, - 0x2833e84879b97091, - 0xb85045b68181585d, - 0x30644e72e131a029, -]; - -pub const U64_2P: [u64; 4] = [ - 0x87c3eb27e0000002, - 0x5067d090f372e122, - 0x70a08b6d0302b0ba, - 0x60c89ce5c2634053, -]; - -// R mod P -pub const U64_R: [u64; 4] = [ - 0xac96341c4ffffffb, - 0x36fc76959f60cd29, - 0x666ea36f7879462e, - 0x0e0a77c19a07df2f, -]; - -// R^2 mod P -pub const U64_R2: [u64; 4] = [ - 0x1bb8e645ae216da7, - 0x53fe3ab1e35c59e3, - 0x8c49833d53bb8085, - 0x0216d0b17f4e44a5, -]; - -// R^-1 mod P -pub const U64_R_INV: [u64; 4] = [ - 0xdc5ba0056db1194e, - 0x090ef5a9e111ec87, - 0xc8260de4aeb85d5d, - 0x15ebf95182c5551c, -]; - -pub const U52_NP0: u64 = 0x1f593efffffff; -pub const U52_R2: [u64; 5] = [ - 0x0b852d16da6f5, - 0xc621620cddce3, - 0xaf1b95343ffb6, - 0xc3c15e103e7c2, - 0x00281528fa122, -]; - -pub const U52_P: [u64; 5] = [ - 0x1f593f0000001, - 0x4879b9709143e, - 0x181585d2833e8, - 0xa029b85045b68, - 0x030644e72e131, -]; - -pub const U52_2P: [u64; 5] = [ - 0x3eb27e0000002, - 0x90f372e12287c, - 0x302b0ba5067d0, - 0x405370a08b6d0, - 0x060c89ce5c263, -]; - -pub const F52_P: [f64; 5] = [ - 0x1f593f0000001_u64 as f64, - 0x4879b9709143e_u64 as f64, - 0x181585d2833e8_u64 as f64, - 0xa029b85045b68_u64 as f64, - 0x030644e72e131_u64 as f64, -]; - -pub const MASK52: u64 = 2_u64.pow(52) - 1; -pub const MASK48: u64 = 2_u64.pow(48) - 1; - -pub const U64_I1: [u64; 4] = [ - 0x2d3e8053e396ee4d, - 0xca478dbeab3c92cd, - 0xb2d8f06f77f52a93, - 0x24d6ba07f7aa8f04, -]; -pub const U64_I2: [u64; 4] = [ - 0x18ee753c76f9dc6f, - 0x54ad7e14a329e70f, - 0x2b16366f4f7684df, - 0x133100d71fdf3579, -]; - -pub const U64_I3: [u64; 4] = [ - 0x9bacb016127cbe4e, - 0x0b2051fa31944124, - 0xb064eea46091c76c, - 0x2b062aaa49f80c7d, -]; -pub const U64_MU0: u64 = 0xc2e1f593efffffff; - -// -- [FP SIMD CONSTANTS] -// -------------------------------------------------------------------------- -pub const RHO_1: [u64; 5] = [ - 0x82e644ee4c3d2, - 0xf93893c98b1de, - 0xd46fe04d0a4c7, - 0x8f0aad55e2a1f, - 0x005ed0447de83, -]; - -pub const RHO_2: [u64; 5] = [ - 0x74eccce9a797a, - 0x16ddcc30bd8a4, - 0x49ecd3539499e, - 0xb23a6fcc592b8, - 0x00e3bd49f6ee5, -]; - -pub const RHO_3: [u64; 5] = [ - 0x0e8c656567d77, - 0x430d05713ae61, - 0xea3ba6b167128, - 0xa7dae55c5a296, - 0x01b4afd513572, -]; - -pub const RHO_4: [u64; 5] = [ - 0x22e2400e2f27d, - 0x323b46ea19686, - 0xe6c43f0df672d, - 0x7824014c39e8b, - 0x00c6b48afe1b8, -]; - -pub const C1: f64 = pow_2(104); // 2.0^104 -pub const C2: f64 = pow_2(104) + pow_2(52); // 2.0^104 + 2.0^52 - // const C3: f64 = pow_2(52); // 2.0^52 - // ------------------------------------------------------------------------------------------------- - -const fn pow_2(n: u32) -> f64 { - // Unfortunately we can't use f64::powi in const fn yet - // This is a workaround that creates the bit pattern directly - let exp = ((n as u64 + 1023) & 0x7ff) << 52; - f64::from_bits(exp) -} - -// BOUNDS -/// Upper bound of 2**256-2p -pub const OUTPUT_MAX: [u64; 4] = [ - 0x783c14d81ffffffe, - 0xaf982f6f0c8d1edd, - 0x8f5f7492fcfd4f45, - 0x9f37631a3d9cbfac, -]; diff --git a/skyscraper/block-multiplier/src/lib.rs b/skyscraper/block-multiplier/src/lib.rs deleted file mode 100644 index fe54fa53..00000000 --- a/skyscraper/block-multiplier/src/lib.rs +++ /dev/null @@ -1,33 +0,0 @@ -#![feature(portable_simd)] -#![feature(bigint_helper_methods)] -//#![no_std] This crate can technically be no_std. However this requires -// replacing StdFloat.mul_add with intrinsics. - -#[cfg(target_arch = "aarch64")] -mod aarch64; - -// These can be made to work on x86, -// but for now it uses an ARM NEON intrinsic. -#[cfg(target_arch = "aarch64")] -mod block_simd; -#[cfg(target_arch = "aarch64")] -mod portable_simd; -#[cfg(target_arch = "aarch64")] -mod simd_utils; - -pub mod constants; -mod scalar; -mod test_utils; -mod utils; - -pub use crate::scalar::{scalar_mul, scalar_sqr}; -#[cfg(target_arch = "aarch64")] -pub use crate::{ - aarch64::{ - montgomery_interleaved_3, montgomery_interleaved_4, montgomery_square_interleaved_3, - montgomery_square_interleaved_4, montgomery_square_log_interleaved_3, - montgomery_square_log_interleaved_4, - }, - block_simd::{block_mul, block_sqr}, - portable_simd::{simd_mul, simd_sqr}, -}; diff --git a/skyscraper/block-multiplier-codegen/.gitignore b/skyscraper/bn254-multiplier-codegen/.gitignore similarity index 63% rename from skyscraper/block-multiplier-codegen/.gitignore rename to skyscraper/bn254-multiplier-codegen/.gitignore index ab9cdb40..8e3e5af3 100644 --- a/skyscraper/block-multiplier-codegen/.gitignore +++ b/skyscraper/bn254-multiplier-codegen/.gitignore @@ -1,2 +1,2 @@ -# We don't include the inline rust generated files as they will be part of block-multiplier-sys -asm/ \ No newline at end of file +# We don't include the inline rust generated files as they will be part of bn254-multiplier-sys +asm/ diff --git a/skyscraper/block-multiplier-codegen/Cargo.toml b/skyscraper/bn254-multiplier-codegen/Cargo.toml similarity index 88% rename from skyscraper/block-multiplier-codegen/Cargo.toml rename to skyscraper/bn254-multiplier-codegen/Cargo.toml index 946f023d..d8a7b8f1 100644 --- a/skyscraper/block-multiplier-codegen/Cargo.toml +++ b/skyscraper/bn254-multiplier-codegen/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "block-multiplier-codegen" +name = "bn254-multiplier-codegen" version = "0.1.0" edition.workspace = true rust-version.workspace = true diff --git a/skyscraper/block-multiplier-codegen/README.md b/skyscraper/bn254-multiplier-codegen/README.md similarity index 71% rename from skyscraper/block-multiplier-codegen/README.md rename to skyscraper/bn254-multiplier-codegen/README.md index f929636d..270d99d1 100644 --- a/skyscraper/block-multiplier-codegen/README.md +++ b/skyscraper/bn254-multiplier-codegen/README.md @@ -6,12 +6,12 @@ This crate contains a binary that generates optimized assembly code for block mu 1. **Run the binary:** ```bash - cargo run --package block-multiplier-codegen + cargo run --package bn254-multiplier-codegen ``` This will execute the `main` function in `src/main.rs`. 2. **Generated File:** The binary will generate an assembly file named `asm/montgomery_interleaved.s` within this crate's directory. -3. **Integrate into `block-multiplier-sys`:** - Copy the contents of the generated `asm/montgomery_interleaved.s` file. Paste this assembly code into the appropriate location within the `block-multiplier-sys` crate, likely inside a specific function designed to use this inline assembly. \ No newline at end of file +3. **Integrate into `bn254-multiplier-sys`:** + Copy the contents of the generated `asm/montgomery_interleaved.s` file. Paste this assembly code into the appropriate location within the `bn254-multiplier-sys` crate, likely inside a specific function designed to use this inline assembly. diff --git a/skyscraper/block-multiplier-codegen/src/constants.rs b/skyscraper/bn254-multiplier-codegen/src/constants.rs similarity index 100% rename from skyscraper/block-multiplier-codegen/src/constants.rs rename to skyscraper/bn254-multiplier-codegen/src/constants.rs diff --git a/skyscraper/block-multiplier-codegen/src/lib.rs b/skyscraper/bn254-multiplier-codegen/src/lib.rs similarity index 100% rename from skyscraper/block-multiplier-codegen/src/lib.rs rename to skyscraper/bn254-multiplier-codegen/src/lib.rs diff --git a/skyscraper/block-multiplier-codegen/src/load_store.rs b/skyscraper/bn254-multiplier-codegen/src/load_store.rs similarity index 100% rename from skyscraper/block-multiplier-codegen/src/load_store.rs rename to skyscraper/bn254-multiplier-codegen/src/load_store.rs diff --git a/skyscraper/block-multiplier-codegen/src/main.rs b/skyscraper/bn254-multiplier-codegen/src/main.rs similarity index 97% rename from skyscraper/block-multiplier-codegen/src/main.rs rename to skyscraper/bn254-multiplier-codegen/src/main.rs index 7437e321..b467bbfa 100644 --- a/skyscraper/block-multiplier-codegen/src/main.rs +++ b/skyscraper/bn254-multiplier-codegen/src/main.rs @@ -1,5 +1,5 @@ use { - block_multiplier_codegen::{scalar, simd}, + bn254_multiplier_codegen::{scalar, simd}, hla::builder::{build_includable, Interleaving}, }; diff --git a/skyscraper/block-multiplier-codegen/src/scalar.rs b/skyscraper/bn254-multiplier-codegen/src/scalar.rs similarity index 100% rename from skyscraper/block-multiplier-codegen/src/scalar.rs rename to skyscraper/bn254-multiplier-codegen/src/scalar.rs diff --git a/skyscraper/block-multiplier-codegen/src/simd.rs b/skyscraper/bn254-multiplier-codegen/src/simd.rs similarity index 100% rename from skyscraper/block-multiplier-codegen/src/simd.rs rename to skyscraper/bn254-multiplier-codegen/src/simd.rs diff --git a/skyscraper/block-multiplier/Cargo.toml b/skyscraper/bn254-multiplier/Cargo.toml similarity index 84% rename from skyscraper/block-multiplier/Cargo.toml rename to skyscraper/bn254-multiplier/Cargo.toml index ab66b0aa..ddd49133 100644 --- a/skyscraper/block-multiplier/Cargo.toml +++ b/skyscraper/bn254-multiplier/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "block-multiplier" +name = "bn254-multiplier" version = "0.1.0" edition.workspace = true rust-version.workspace = true @@ -24,12 +24,14 @@ ark-ff.workspace = true # 3rd party divan.workspace = true primitive-types.workspace = true -proptest.workspace = true rand.workspace = true +[target.'cfg(not(target_arch = "wasm32"))'.dev-dependencies] +proptest.workspace = true + [build-dependencies] # Workspace crates -block-multiplier-codegen.workspace = true +bn254-multiplier-codegen.workspace = true hla.workspace = true [lints] diff --git a/skyscraper/bn254-multiplier/benches/bench.rs b/skyscraper/bn254-multiplier/benches/bench.rs new file mode 100644 index 00000000..7d27d256 --- /dev/null +++ b/skyscraper/bn254-multiplier/benches/bench.rs @@ -0,0 +1,261 @@ +#![feature(portable_simd)] + +use { + divan::Bencher, + rand::{rng, Rng}, +}; + +// #[divan::bench_group] +mod mul { + use super::*; + + #[divan::bench] + fn scalar_mul(bencher: Bencher) { + bencher + //.counter(ItemsCount::new(1usize)) + .with_inputs(|| rng().random()) + .bench_local_values(|(a, b)| bn254_multiplier::scalar_mul(a, b)); + } + + #[divan::bench] + fn ark_ff(bencher: Bencher) { + use {ark_bn254::Fr, ark_ff::BigInt}; + bencher + //.counter(ItemsCount::new(1usize)) + .with_inputs(|| { + ( + Fr::new(BigInt(rng().random())), + Fr::new(BigInt(rng().random())), + ) + }) + .bench_local_values(|(a, b)| a * b); + } + + #[divan::bench] + fn simd_mul_51b(bencher: Bencher) { + bencher + //.counter(ItemsCount::new(2usize)) + .with_inputs(|| rng().random()) + .bench_local_values(|(a, b, c, d)| { + bn254_multiplier::rne::portable_simd::simd_mul(a, b, c, d) + }); + } + + #[cfg(target_arch = "aarch64")] + mod aarch64 { + use { + super::*, + core::{array, simd::u64x2}, + fp_rounding::with_rounding_mode, + }; + + #[divan::bench] + fn simd_mul_rtz(bencher: Bencher) { + let bencher = bencher.with_inputs(|| rng().random()); + unsafe { + with_rounding_mode((), |mode_guard, _| { + bencher.bench_local_values(|(a, b, c, d)| { + bn254_multiplier::rtz::simd_mul(mode_guard, a, b, c, d) + }); + }); + } + } + + #[divan::bench] + fn block_mul(bencher: Bencher) { + let bencher = bencher + //.counter(ItemsCount::new(3usize)) + .with_inputs(|| rng().random()); + unsafe { + with_rounding_mode((), |guard, _| { + bencher.bench_local_values(|(a, b, c, d, e, f)| { + bn254_multiplier::rtz::block_mul(guard, a, b, c, d, e, f) + }); + }); + } + } + + #[divan::bench] + fn montgomery_interleaved_3(bencher: Bencher) { + let bencher = bencher + //.counter(ItemsCount::new(3usize)) + .with_inputs(|| { + ( + rng().random(), + rng().random(), + array::from_fn(|_| u64x2::from_array(rng().random())), + array::from_fn(|_| u64x2::from_array(rng().random())), + ) + }); + unsafe { + with_rounding_mode((), |mode_guard, _| { + bencher.bench_local_values(|(a, b, c, d)| { + bn254_multiplier::montgomery_interleaved_3(mode_guard, a, b, c, d) + }); + }); + } + } + + #[divan::bench] + fn montgomery_interleaved_4(bencher: Bencher) { + let bencher = bencher + //.counter(ItemsCount::new(4usize)) + .with_inputs(|| { + ( + rng().random(), + rng().random(), + rng().random(), + rng().random(), + array::from_fn(|_| u64x2::from_array(rng().random())), + array::from_fn(|_| u64x2::from_array(rng().random())), + ) + }); + unsafe { + with_rounding_mode((), |mode_guard, _| { + bencher.bench_local_values(|(a, b, c, d, e, f)| { + bn254_multiplier::montgomery_interleaved_4(mode_guard, a, b, c, d, e, f) + }); + }); + } + } + } +} + +// #[divan::bench_group] +mod sqr { + use {super::*, ark_ff::Field, bn254_multiplier::rne}; + + #[divan::bench] + fn scalar_sqr(bencher: Bencher) { + bencher + //.counter(ItemsCount::new(1usize)) + .with_inputs(|| rng().random()) + .bench_local_values(bn254_multiplier::scalar_sqr); + } + + #[divan::bench] + fn simd_sqr_b51(bencher: Bencher) { + bencher + //.counter(ItemsCount::new(1usize)) + .with_inputs(|| rng().random()) + .bench_local_values(|(a, b)| rne::simd_sqr(a, b)); + } + + #[divan::bench] + fn ark_ff(bencher: Bencher) { + use {ark_bn254::Fr, ark_ff::BigInt}; + bencher + //.counter(ItemsCount::new(1usize)) + .with_inputs(|| Fr::new(BigInt(rng().random()))) + .bench_local_values(|a: Fr| a.square()); + } + + #[cfg(target_arch = "aarch64")] + mod aarch64 { + use { + super::*, + core::{array, simd::u64x2}, + fp_rounding::with_rounding_mode, + }; + + #[divan::bench] + fn montgomery_square_log_interleaved_3(bencher: Bencher) { + let bencher = bencher.with_inputs(|| { + ( + rng().random(), + array::from_fn(|_| u64x2::from_array(rng().random())), + ) + }); + unsafe { + with_rounding_mode((), |mode_guard, _| { + bencher.bench_local_values(|(a, b)| { + bn254_multiplier::montgomery_square_log_interleaved_3(mode_guard, a, b) + }); + }); + } + } + + #[divan::bench] + fn montgomery_square_log_interleaved_4(bencher: Bencher) { + let bencher = bencher.with_inputs(|| { + ( + rng().random(), + rng().random(), + array::from_fn(|_| u64x2::from_array(rng().random())), + ) + }); + unsafe { + with_rounding_mode((), |mode_guard, _| { + bencher.bench_local_values(|(a, b, c)| { + bn254_multiplier::montgomery_square_log_interleaved_4(mode_guard, a, b, c) + }); + }); + } + } + + #[divan::bench] + fn montgomery_square_interleaved_3(bencher: Bencher) { + let bencher = bencher.with_inputs(|| { + ( + rng().random(), + array::from_fn(|_| u64x2::from_array(rng().random())), + ) + }); + unsafe { + with_rounding_mode((), |mode_guard, _| { + bencher.bench_local_values(|(a, b)| { + bn254_multiplier::montgomery_square_interleaved_3(mode_guard, a, b) + }); + }); + } + } + + #[divan::bench] + fn montgomery_square_interleaved_4(bencher: Bencher) { + let bencher = bencher.with_inputs(|| { + ( + rng().random(), + rng().random(), + array::from_fn(|_| u64x2::from_array(rng().random())), + ) + }); + unsafe { + with_rounding_mode((), |mode_guard, _| { + bencher.bench_local_values(|(a, b, c)| { + bn254_multiplier::montgomery_square_interleaved_4(mode_guard, a, b, c) + }); + }); + } + } + + #[divan::bench] + fn simd_sqr(bencher: Bencher) { + let bencher = bencher.with_inputs(|| rng().random()); + unsafe { + with_rounding_mode((), |mode_guard, _| { + bencher.bench_local_values(|(a, b)| { + bn254_multiplier::rtz::simd_sqr(mode_guard, a, b) + }); + }); + } + } + + #[divan::bench] + fn block_sqr(bencher: Bencher) { + let bencher = bencher + //.counter(ItemsCount::new(3usize)) + .with_inputs(|| rng().random()); + unsafe { + with_rounding_mode((), |guard, _| { + bencher.bench_local_values(|(a, b, c)| { + bn254_multiplier::rtz::block_sqr(guard, a, b, c) + }); + }); + } + } + } +} + +fn main() { + divan::main(); +} diff --git a/skyscraper/block-multiplier/build.rs b/skyscraper/bn254-multiplier/build.rs similarity index 97% rename from skyscraper/block-multiplier/build.rs rename to skyscraper/bn254-multiplier/build.rs index 7623a247..8d2137a5 100644 --- a/skyscraper/block-multiplier/build.rs +++ b/skyscraper/bn254-multiplier/build.rs @@ -1,5 +1,5 @@ use { - block_multiplier_codegen::{scalar, simd}, + bn254_multiplier_codegen::{scalar, simd}, hla::builder::{build_includable, Interleaving}, std::path::Path, }; diff --git a/skyscraper/bn254-multiplier/src/aarch64/generate_montgomery_table.py b/skyscraper/bn254-multiplier/src/aarch64/generate_montgomery_table.py new file mode 100644 index 00000000..1e066e69 --- /dev/null +++ b/skyscraper/bn254-multiplier/src/aarch64/generate_montgomery_table.py @@ -0,0 +1,185 @@ +from math import log2 + +p = 0x30644E72E131A029B85045B68181585D2833E84879B9709143E1F593F0000001 + +U52_i1 = [ + 0x82E644EE4C3D2, + 0xF93893C98B1DE, + 0xD46FE04D0A4C7, + 0x8F0AAD55E2A1F, + 0x005ED0447DE83, +] + +U52_i2 = [ + 0x74ECCCE9A797A, + 0x16DDCC30BD8A4, + 0x49ECD3539499E, + 0xB23A6FCC592B8, + 0x00E3BD49F6EE5, +] + +U52_i3 = [ + 0x0E8C656567D77, + 0x430D05713AE61, + 0xEA3BA6B167128, + 0xA7DAE55C5A296, + 0x01B4AFD513572, +] + +U52_i4 = [ + 0x22E2400E2F27D, + 0x323B46EA19686, + 0xE6C43F0DF672D, + 0x7824014C39E8B, + 0x00C6B48AFE1B8, +] + +U64_I1 = [ + 0x2D3E8053E396EE4D, + 0xCA478DBEAB3C92CD, + 0xB2D8F06F77F52A93, + 0x24D6BA07F7AA8F04, +] + +U64_I2 = [ + 0x18EE753C76F9DC6F, + 0x54AD7E14A329E70F, + 0x2B16366F4F7684DF, + 0x133100D71FDF3579, +] + +U64_I3 = [ + 0x9BACB016127CBE4E, + 0x0B2051FA31944124, + 0xB064EEA46091C76C, + 0x2B062AAA49F80C7D, +] + + +U51_i1 = pow( + 2**51, + -1, + p, +) +U51_i2 = pow( + 2**51, + -2, + p, +) +U51_i3 = pow( + 2**51, + -3, + p, +) +U51_i4 = pow( + 2**51, + -4, + p, +) + + +def int_to_limbs(size, i): + mask = 2**size - 1 + limbs = [] + while i != 0: + limbs.append(i & mask) + i = i >> size + + return limbs + + +def format_limbs(limbs): + return map(lambda x: hex(x), limbs) + + +def limbs_to_int(size, xs): + total = 0 + for i, x in enumerate(xs): + total += x << (size * i) + + return total + + +u64_i1 = limbs_to_int(64, U64_I1) +u64_i2 = limbs_to_int(64, U64_I2) +u64_i3 = limbs_to_int(64, U64_I3) + +u52_i1 = limbs_to_int(52, U52_i1) +u52_i2 = limbs_to_int(52, U52_i2) +u52_i3 = limbs_to_int(52, U52_i3) +u52_i4 = limbs_to_int(52, U52_i4) + + +def log_jump(single_input_bound): + product_bound = single_input_bound**2 + + first_round = (product_bound >> 2 * 64) + u64_i2 * (2**128 - 1) + second_round = (first_round >> 64) + u64_i1 * (2**64 - 1) + mont_round = second_round + p * (2**64 - 1) + final = mont_round >> 64 + return final + + +def single_step(single_input_bound): + product_bound = single_input_bound**2 + + first_round = (product_bound >> 3 * 64) + (u64_i3 + u64_i2 + u64_i1) * (2**64 - 1) + mont_round = first_round + p * (2**64 - 1) + final = mont_round >> 64 + # print(log2(final)) + + return final + + +def single_step_simd(single_input_bound): + product_bound = (single_input_bound << 2) ** 2 + + first_round = (product_bound >> 4 * 52) + (u52_i4 + u52_i3 + u52_i2 + u52_i1) * ( + 2**52 - 1 + ) + mont_round = first_round + p * (2**52 - 1) + final = mont_round >> 52 + # print(log2(final)) + return final + + +def single_step_simd_wasm(single_input_bound): + product_bound = (single_input_bound) ** 2 + + first_round = (product_bound >> 4 * 51) + (U51_i1 + U51_i2 + U51_i3 + U51_i4) * ( + 2**51 - 1 + ) + mont_round = first_round + p * (2**51 - 1) + final = mont_round >> 51 + # print(log2(final)) + # print(log2(final + p)) + + reduced = (final + p) >> 1 if final & 1 else final >> 1 + # print(log2(reduced)) + return reduced + + +if __name__ == "__main__": + print(hex(pow(-p, -1, 2**51))) + # Test bounds for different input sizes + test_bounds = [ + ("p", p), + ("2p", 2 * p), + ("2ˆ255", 2**255), + ("3p", 3 * p), + ("2ˆ256-2p", 2**256 - 2 * p), + ] + print("Input Size | single_step | single_step_simd | log_jump| single_step_wasm ") + print("-----------|-------------|------------------|---------|-----------------|") + for name, bound in test_bounds: + single = single_step(bound) / p + simd = single_step_simd(bound) / p + simd_wasm = single_step_simd_wasm(bound) / p + log = log_jump(bound) / p + single_space = (2**256 - 1 - single_step(bound)) / p + simd_space = (2**256 - 1 - single_step_simd(bound)) / p + simd_wasm_space = (2**256 - 1 - single_step_simd_wasm(bound)) / p + log_space = (2**256 - 1 - log_jump(bound)) / p + print( + f"{name:10} | {single:4.2f} [{single_space:4.2f}] | {simd:7.2f} [{simd_space:.4f}] | {log:4.2f} [{log_space:.2f}] | {simd_wasm:4.2f} [{simd_wasm_space:.2f}]" + ) diff --git a/skyscraper/block-multiplier/src/aarch64/mod.rs b/skyscraper/bn254-multiplier/src/aarch64/mod.rs similarity index 100% rename from skyscraper/block-multiplier/src/aarch64/mod.rs rename to skyscraper/bn254-multiplier/src/aarch64/mod.rs diff --git a/skyscraper/block-multiplier/src/aarch64/montgomery_interleaved_3.s b/skyscraper/bn254-multiplier/src/aarch64/montgomery_interleaved_3.s similarity index 100% rename from skyscraper/block-multiplier/src/aarch64/montgomery_interleaved_3.s rename to skyscraper/bn254-multiplier/src/aarch64/montgomery_interleaved_3.s diff --git a/skyscraper/block-multiplier/src/aarch64/montgomery_interleaved_4.s b/skyscraper/bn254-multiplier/src/aarch64/montgomery_interleaved_4.s similarity index 100% rename from skyscraper/block-multiplier/src/aarch64/montgomery_interleaved_4.s rename to skyscraper/bn254-multiplier/src/aarch64/montgomery_interleaved_4.s diff --git a/skyscraper/block-multiplier/src/aarch64/montgomery_square_interleaved_3.s b/skyscraper/bn254-multiplier/src/aarch64/montgomery_square_interleaved_3.s similarity index 100% rename from skyscraper/block-multiplier/src/aarch64/montgomery_square_interleaved_3.s rename to skyscraper/bn254-multiplier/src/aarch64/montgomery_square_interleaved_3.s diff --git a/skyscraper/block-multiplier/src/aarch64/montgomery_square_interleaved_4.s b/skyscraper/bn254-multiplier/src/aarch64/montgomery_square_interleaved_4.s similarity index 100% rename from skyscraper/block-multiplier/src/aarch64/montgomery_square_interleaved_4.s rename to skyscraper/bn254-multiplier/src/aarch64/montgomery_square_interleaved_4.s diff --git a/skyscraper/block-multiplier/src/aarch64/montgomery_square_log_interleaved_3.s b/skyscraper/bn254-multiplier/src/aarch64/montgomery_square_log_interleaved_3.s similarity index 100% rename from skyscraper/block-multiplier/src/aarch64/montgomery_square_log_interleaved_3.s rename to skyscraper/bn254-multiplier/src/aarch64/montgomery_square_log_interleaved_3.s diff --git a/skyscraper/block-multiplier/src/aarch64/montgomery_square_log_interleaved_4.s b/skyscraper/bn254-multiplier/src/aarch64/montgomery_square_log_interleaved_4.s similarity index 100% rename from skyscraper/block-multiplier/src/aarch64/montgomery_square_log_interleaved_4.s rename to skyscraper/bn254-multiplier/src/aarch64/montgomery_square_log_interleaved_4.s diff --git a/skyscraper/bn254-multiplier/src/constants.rs b/skyscraper/bn254-multiplier/src/constants.rs new file mode 100644 index 00000000..b4997113 --- /dev/null +++ b/skyscraper/bn254-multiplier/src/constants.rs @@ -0,0 +1,69 @@ +pub const U64_NP0: u64 = 0xc2e1f593efffffff; + +pub const U64_P: [u64; 4] = [ + 0x43e1f593f0000001, + 0x2833e84879b97091, + 0xb85045b68181585d, + 0x30644e72e131a029, +]; + +pub const U64_2P: [u64; 4] = [ + 0x87c3eb27e0000002, + 0x5067d090f372e122, + 0x70a08b6d0302b0ba, + 0x60c89ce5c2634053, +]; + +// R mod P +pub const U64_R: [u64; 4] = [ + 0xac96341c4ffffffb, + 0x36fc76959f60cd29, + 0x666ea36f7879462e, + 0x0e0a77c19a07df2f, +]; + +// R^2 mod P +pub const U64_R2: [u64; 4] = [ + 0x1bb8e645ae216da7, + 0x53fe3ab1e35c59e3, + 0x8c49833d53bb8085, + 0x0216d0b17f4e44a5, +]; + +// R^-1 mod P +pub const U64_R_INV: [u64; 4] = [ + 0xdc5ba0056db1194e, + 0x090ef5a9e111ec87, + 0xc8260de4aeb85d5d, + 0x15ebf95182c5551c, +]; + +pub const U64_I1: [u64; 4] = [ + 0x2d3e8053e396ee4d, + 0xca478dbeab3c92cd, + 0xb2d8f06f77f52a93, + 0x24d6ba07f7aa8f04, +]; +pub const U64_I2: [u64; 4] = [ + 0x18ee753c76f9dc6f, + 0x54ad7e14a329e70f, + 0x2b16366f4f7684df, + 0x133100d71fdf3579, +]; + +pub const U64_I3: [u64; 4] = [ + 0x9bacb016127cbe4e, + 0x0b2051fa31944124, + 0xb064eea46091c76c, + 0x2b062aaa49f80c7d, +]; +pub const U64_MU0: u64 = 0xc2e1f593efffffff; + +// BOUNDS +/// Upper bound of 2**256-2p +pub const OUTPUT_MAX: [u64; 4] = [ + 0x783c14d81ffffffe, + 0xaf982f6f0c8d1edd, + 0x8f5f7492fcfd4f45, + 0x9f37631a3d9cbfac, +]; diff --git a/skyscraper/bn254-multiplier/src/lib.rs b/skyscraper/bn254-multiplier/src/lib.rs new file mode 100644 index 00000000..b8c33b08 --- /dev/null +++ b/skyscraper/bn254-multiplier/src/lib.rs @@ -0,0 +1,36 @@ +#![feature(portable_simd)] +#![feature(bigint_helper_methods)] +//#![no_std] This crate can technically be no_std. However this requires +// replacing StdFloat.mul_add with intrinsics. + +#[cfg(target_arch = "aarch64")] +mod aarch64; + +// These can be made to work on x86, +// but for now it uses an ARM NEON intrinsic. +#[cfg(target_arch = "aarch64")] +pub mod rtz; + +pub mod constants; +pub mod rne; +mod scalar; +mod utils; + +#[cfg(not(target_arch = "wasm32"))] // Proptest not supported on WASI +mod test_utils; + +#[cfg(target_arch = "aarch64")] +pub use crate::aarch64::{ + montgomery_interleaved_3, montgomery_interleaved_4, montgomery_square_interleaved_3, + montgomery_square_interleaved_4, montgomery_square_log_interleaved_3, + montgomery_square_log_interleaved_4, +}; +pub use crate::scalar::{scalar_mul, scalar_sqr}; + +const fn pow_2(n: u32) -> f64 { + assert!(n <= 1023); + // Unfortunately we can't use f64::powi in const fn yet + // This is a workaround that creates the bit pattern directly + let exp = (n as u64 + 1023) << 52; + f64::from_bits(exp) +} diff --git a/skyscraper/bn254-multiplier/src/rne/constants.rs b/skyscraper/bn254-multiplier/src/rne/constants.rs new file mode 100644 index 00000000..6f320cf5 --- /dev/null +++ b/skyscraper/bn254-multiplier/src/rne/constants.rs @@ -0,0 +1,55 @@ +//! Constants for RNE Montgomery multiplication over the BN254 scalar field. + +use crate::pow_2; + +/// Montgomery reduction constant: `-p⁻¹ mod 2⁵¹` +pub const U51_NP0: u64 = 0x1f593efffffff; + +/// The BN254 scalar field prime in 51-bit limb representation. +pub const U51_P: [u64; 5] = [ + 0x1f593f0000001, + 0x10f372e12287c, + 0x6056174a0cfa1, + 0x014dc2822db40, + 0x30644e72e131a, +]; + +/// Bit mask for 51-bit limbs. +pub const MASK51: u64 = 2_u64.pow(51) - 1; + +/// Reduction constants: `RHO_i = 2^(51*i) * 2^255 mod p` in 51-bit limbs. +pub const RHO_1: [u64; 5] = [ + 0x05cc89dc987a4, + 0x64e24f262c77a, + 0x237f02685263f, + 0x70aad55e2a1fd, + 0x0bda088fbd071, +]; + +pub const RHO_2: [u64; 5] = [ + 0x3459f4a69e5e7, + 0x25faeea4c9ca7, + 0x3e771def3ca40, + 0x46003708f7bc8, + 0x088b040ada652, +]; + +pub const RHO_3: [u64; 5] = [ + 0x76fe2f2b3ebb4, + 0x6d028b8f2441f, + 0x461c7904ae683, + 0x71824d0dd38b7, + 0x18c6b0be26ceb, +]; + +pub const RHO_4: [u64; 5] = [ + 0x30bf04e2f27cc, + 0x039b11bea2ed3, + 0x2fb7665568cc8, + 0x0cc99c143d8f0, + 0x0523513296c10, +]; + +pub const C1: f64 = pow_2(103); +pub const C2: f64 = pow_2(103) + pow_2(52) + pow_2(51); +pub const C3: f64 = pow_2(52) + pow_2(51); diff --git a/skyscraper/bn254-multiplier/src/rne/mod.rs b/skyscraper/bn254-multiplier/src/rne/mod.rs new file mode 100644 index 00000000..415090bd --- /dev/null +++ b/skyscraper/bn254-multiplier/src/rne/mod.rs @@ -0,0 +1,29 @@ +//! # RNE - Round-to-Nearest-Even Montgomery Multiplication +//! +//! This module implements Montgomery multiplication over the BN254 scalar field +//! using floating-point arithmetic with round-to-nearest-even (RNE) rounding +//! mode. +//! +//! ## Why Floating-Point? +//! +//! On WASM and ARM Cortex, integer multiplication has lower throughput +//! than floating-point FMA (fused multiply-add). By encoding +//! 51-bit limbs into the mantissa of f64 values we can perform integer +//! multiplication using FMA. +//! +//! ## Representation +//! +//! Field elements are stored in a 5-limb redundant form with 51 bits per limb +//! (5 × 51 = 255 bits), allowing representation of values up to 2²⁵⁵ - 1. +//! +//! ## References +//! +//! Variation of "Faster Modular Exponentiation using Double Precision Floating +//! Point Arithmetic on the GPU, 2018 IEEE 25th Symposium on Computer Arithmetic +//! (ARITH) by Emmart, Zheng and Weems; which uses RTZ. + +pub mod constants; +pub mod portable_simd; +pub mod simd_utils; + +pub use {constants::*, portable_simd::*, simd_utils::*}; diff --git a/skyscraper/bn254-multiplier/src/rne/portable_simd.rs b/skyscraper/bn254-multiplier/src/rne/portable_simd.rs new file mode 100644 index 00000000..dcaeaa52 --- /dev/null +++ b/skyscraper/bn254-multiplier/src/rne/portable_simd.rs @@ -0,0 +1,379 @@ +//! Portable SIMD Montgomery multiplication and squaring. +//! +//! Processes two independent field multiplications in parallel using 2-lane +//! SIMD. + +use { + crate::rne::{ + constants::*, + simd_utils::{ + addv_simd, fma, i2f, make_initial, reduce_ct_simd, smult_noinit_simd, + transpose_simd_to_u256, transpose_u256_to_simd, u255_to_u256_shr_1_simd, + u256_to_u255_simd, + }, + }, + core::{ + ops::BitAnd, + simd::{num::SimdFloat, Simd}, + }, + std::simd::num::{SimdInt, SimdUint}, +}; + +/// Two parallel Montgomery squarings: `(v0², v1²)`. +/// input must fit in 2^255-1; no runtime checking +#[inline] +pub fn simd_sqr(v0_a: [u64; 4], v1_a: [u64; 4]) -> ([u64; 4], [u64; 4]) { + let v0_a = u256_to_u255_simd(transpose_u256_to_simd([v0_a, v1_a])); + + let mut t: [Simd; 10] = [Simd::splat(0); 10]; + + for i in 0..5 { + let avi: Simd = i2f(v0_a[i]); + for j in (i + 1)..5 { + let bvj: Simd = i2f(v0_a[j]); + let p_hi = fma(avi, bvj, Simd::splat(C1)); + let p_lo = fma(avi, bvj, Simd::splat(C2) - p_hi); + t[i + j + 1] += p_hi.to_bits().cast(); + t[i + j] += p_lo.to_bits().cast(); + } + } + + // Most shifting operations are more expensive addition thus for multiplying by + // 2 we use addition. + for i in 1..=8 { + t[i] += t[i]; + } + + for i in 0..5 { + let avi: Simd = i2f(v0_a[i]); + let p_hi = fma(avi, avi, Simd::splat(C1)); + let p_lo = fma(avi, avi, Simd::splat(C2) - p_hi); + t[i + i + 1] += p_hi.to_bits().cast(); + t[i + i] += p_lo.to_bits().cast(); + } + + t[0] += Simd::splat(make_initial(1, 0)); + t[9] += Simd::splat(make_initial(0, 6)); + t[1] += Simd::splat(make_initial(2, 1)); + t[8] += Simd::splat(make_initial(6, 7)); + t[2] += Simd::splat(make_initial(3, 2)); + t[7] += Simd::splat(make_initial(7, 8)); + t[3] += Simd::splat(make_initial(4, 3)); + t[6] += Simd::splat(make_initial(8, 9)); + t[4] += Simd::splat(make_initial(10, 4)); + t[5] += Simd::splat(make_initial(9, 10)); + + t[1] += t[0] >> 51; + t[2] += t[1] >> 51; + t[3] += t[2] >> 51; + t[4] += t[3] >> 51; + + let r0 = smult_noinit_simd(t[0].cast().bitand(Simd::splat(MASK51)), RHO_4); + let r1 = smult_noinit_simd(t[1].cast().bitand(Simd::splat(MASK51)), RHO_3); + let r2 = smult_noinit_simd(t[2].cast().bitand(Simd::splat(MASK51)), RHO_2); + let r3 = smult_noinit_simd(t[3].cast().bitand(Simd::splat(MASK51)), RHO_1); + + let s = [ + r0[0] + r1[0] + r2[0] + r3[0] + t[4], + r0[1] + r1[1] + r2[1] + r3[1] + t[5], + r0[2] + r1[2] + r2[2] + r3[2] + t[6], + r0[3] + r1[3] + r2[3] + r3[3] + t[7], + r0[4] + r1[4] + r2[4] + r3[4] + t[8], + r0[5] + r1[5] + r2[5] + r3[5] + t[9], + ]; + + // The upper bits of s will not affect the lower 51 bits of the product and + // therefore we only have to bitmask once. + let m = (s[0].cast() * Simd::splat(U51_NP0)).bitand(Simd::splat(MASK51)); + let mp = smult_noinit_simd(m, U51_P); + + let mut addi = addv_simd(s, mp); + // Apply carries before dropping the last limb + addi[1] += addi[0] >> 51; + let addi = [addi[1], addi[2], addi[3], addi[4], addi[5]]; + + // 1 bit reduction to go from R^-255 to R^-256. reduce_ct does the preparation + // and the final shift is done as part of the conversion back to u256 + let reduced = reduce_ct_simd(addi); + let reduced = redundant_carry(reduced); + let u256_result = u255_to_u256_shr_1_simd(reduced); + let v = transpose_simd_to_u256(u256_result); + (v[0], v[1]) +} + +/// Move redundant carries from lower limbs to the higher limbs such that all +/// limbs except the last one is 51 bits. The most significant limb can be +/// larger than 51 bits as the input can be bigger 2^255-1. +#[inline(always)] +fn redundant_carry(t: [Simd; N]) -> [Simd; N] +where + std::simd::LaneCount: std::simd::SupportedLaneCount, +{ + let mut borrow = Simd::splat(0); + let mut res = [Simd::splat(0); N]; + for i in 0..t.len() - 1 { + let tmp = t[i] + borrow; + res[i] = (tmp.cast()).bitand(Simd::splat(MASK51)); + borrow = tmp >> 51; + } + + res[N - 1] = (t[N - 1] + borrow).cast(); + res +} + +/// Two parallel Montgomery multiplications: `(v0_a*v0_b, v1_a*v1_b)`. +/// input must fit in 2^255-1; no runtime checking +#[inline(always)] +pub fn simd_mul( + v0_a: [u64; 4], + v0_b: [u64; 4], + v1_a: [u64; 4], + v1_b: [u64; 4], +) -> ([u64; 4], [u64; 4]) { + let v0_a = u256_to_u255_simd(transpose_u256_to_simd([v0_a, v1_a])); + let v0_b = u256_to_u255_simd(transpose_u256_to_simd([v0_b, v1_b])); + + let mut t: [Simd<_, 2>; 10] = [Simd::splat(0); 10]; + t[0] = Simd::splat(make_initial(1, 0)); + t[9] = Simd::splat(make_initial(0, 6)); + t[1] = Simd::splat(make_initial(2, 1)); + t[8] = Simd::splat(make_initial(6, 7)); + t[2] = Simd::splat(make_initial(3, 2)); + t[7] = Simd::splat(make_initial(7, 8)); + t[3] = Simd::splat(make_initial(4, 3)); + t[6] = Simd::splat(make_initial(8, 9)); + t[4] = Simd::splat(make_initial(10, 4)); + t[5] = Simd::splat(make_initial(9, 10)); + + let avi: Simd = i2f(v0_a[0]); + let bvj: Simd = i2f(v0_b[0]); + let p_hi = fma(avi, bvj, Simd::splat(C1)); + let p_lo = fma(avi, bvj, Simd::splat(C2) - p_hi); + t[1] += p_hi.to_bits().cast(); + t[0] += p_lo.to_bits().cast(); + let bvj: Simd = i2f(v0_b[1]); + let p_hi = fma(avi, bvj, Simd::splat(C1)); + let p_lo = fma(avi, bvj, Simd::splat(C2) - p_hi); + t[1 + 1] += p_hi.to_bits().cast(); + t[1] += p_lo.to_bits().cast(); + let bvj: Simd = i2f(v0_b[2]); + let p_hi = fma(avi, bvj, Simd::splat(C1)); + let p_lo = fma(avi, bvj, Simd::splat(C2) - p_hi); + t[2 + 1] += p_hi.to_bits().cast(); + t[2] += p_lo.to_bits().cast(); + let bvj: Simd = i2f(v0_b[3]); + let p_hi = fma(avi, bvj, Simd::splat(C1)); + let p_lo = fma(avi, bvj, Simd::splat(C2) - p_hi); + t[3 + 1] += p_hi.to_bits().cast(); + t[3] += p_lo.to_bits().cast(); + let bvj: Simd = i2f(v0_b[4]); + let p_hi = fma(avi, bvj, Simd::splat(C1)); + let p_lo = fma(avi, bvj, Simd::splat(C2) - p_hi); + t[4 + 1] += p_hi.to_bits().cast(); + t[4] += p_lo.to_bits().cast(); + + let avi: Simd = i2f(v0_a[1]); + let bvj: Simd = i2f(v0_b[0]); + let p_hi = fma(avi, bvj, Simd::splat(C1)); + let p_lo = fma(avi, bvj, Simd::splat(C2) - p_hi); + t[1 + 1] += p_hi.to_bits().cast(); + t[1] += p_lo.to_bits().cast(); + let bvj: Simd = i2f(v0_b[1]); + let p_hi = fma(avi, bvj, Simd::splat(C1)); + let p_lo = fma(avi, bvj, Simd::splat(C2) - p_hi); + t[1 + 1 + 1] += p_hi.to_bits().cast(); + t[1 + 1] += p_lo.to_bits().cast(); + let bvj: Simd = i2f(v0_b[2]); + let p_hi = fma(avi, bvj, Simd::splat(C1)); + let p_lo = fma(avi, bvj, Simd::splat(C2) - p_hi); + t[1 + 2 + 1] += p_hi.to_bits().cast(); + t[1 + 2] += p_lo.to_bits().cast(); + let bvj: Simd = i2f(v0_b[3]); + let p_hi = fma(avi, bvj, Simd::splat(C1)); + let p_lo = fma(avi, bvj, Simd::splat(C2) - p_hi); + t[1 + 3 + 1] += p_hi.to_bits().cast(); + t[1 + 3] += p_lo.to_bits().cast(); + let bvj: Simd = i2f(v0_b[4]); + let p_hi = fma(avi, bvj, Simd::splat(C1)); + let p_lo = fma(avi, bvj, Simd::splat(C2) - p_hi); + t[1 + 4 + 1] += p_hi.to_bits().cast(); + t[1 + 4] += p_lo.to_bits().cast(); + + let avi: Simd = i2f(v0_a[2]); + let bvj: Simd = i2f(v0_b[0]); + let p_hi = fma(avi, bvj, Simd::splat(C1)); + let p_lo = fma(avi, bvj, Simd::splat(C2) - p_hi); + t[2 + 1] += p_hi.to_bits().cast(); + t[2] += p_lo.to_bits().cast(); + let bvj: Simd = i2f(v0_b[1]); + let p_hi = fma(avi, bvj, Simd::splat(C1)); + let p_lo = fma(avi, bvj, Simd::splat(C2) - p_hi); + t[2 + 1 + 1] += p_hi.to_bits().cast(); + t[2 + 1] += p_lo.to_bits().cast(); + let bvj: Simd = i2f(v0_b[2]); + let p_hi = fma(avi, bvj, Simd::splat(C1)); + let p_lo = fma(avi, bvj, Simd::splat(C2) - p_hi); + t[2 + 2 + 1] += p_hi.to_bits().cast(); + t[2 + 2] += p_lo.to_bits().cast(); + let bvj: Simd = i2f(v0_b[3]); + let p_hi = fma(avi, bvj, Simd::splat(C1)); + let p_lo = fma(avi, bvj, Simd::splat(C2) - p_hi); + t[2 + 3 + 1] += p_hi.to_bits().cast(); + t[2 + 3] += p_lo.to_bits().cast(); + let bvj: Simd = i2f(v0_b[4]); + let p_hi = fma(avi, bvj, Simd::splat(C1)); + let p_lo = fma(avi, bvj, Simd::splat(C2) - p_hi); + t[2 + 4 + 1] += p_hi.to_bits().cast(); + t[2 + 4] += p_lo.to_bits().cast(); + + let avi: Simd = i2f(v0_a[3]); + let bvj: Simd = i2f(v0_b[0]); + let p_hi = fma(avi, bvj, Simd::splat(C1)); + let p_lo = fma(avi, bvj, Simd::splat(C2) - p_hi); + t[3 + 1] += p_hi.to_bits().cast(); + t[3] += p_lo.to_bits().cast(); + let bvj: Simd = i2f(v0_b[1]); + let p_hi = fma(avi, bvj, Simd::splat(C1)); + let p_lo = fma(avi, bvj, Simd::splat(C2) - p_hi); + t[3 + 1 + 1] += p_hi.to_bits().cast(); + t[3 + 1] += p_lo.to_bits().cast(); + let bvj: Simd = i2f(v0_b[2]); + let p_hi = fma(avi, bvj, Simd::splat(C1)); + let p_lo = fma(avi, bvj, Simd::splat(C2) - p_hi); + t[3 + 2 + 1] += p_hi.to_bits().cast(); + t[3 + 2] += p_lo.to_bits().cast(); + let bvj: Simd = i2f(v0_b[3]); + let p_hi = fma(avi, bvj, Simd::splat(C1)); + let p_lo = fma(avi, bvj, Simd::splat(C2) - p_hi); + t[3 + 3 + 1] += p_hi.to_bits().cast(); + t[3 + 3] += p_lo.to_bits().cast(); + let bvj: Simd = i2f(v0_b[4]); + let p_hi = fma(avi, bvj, Simd::splat(C1)); + let p_lo = fma(avi, bvj, Simd::splat(C2) - p_hi); + t[3 + 4 + 1] += p_hi.to_bits().cast(); + t[3 + 4] += p_lo.to_bits().cast(); + + let avi: Simd = i2f(v0_a[4]); + let bvj: Simd = i2f(v0_b[0]); + let p_hi = fma(avi, bvj, Simd::splat(C1)); + let p_lo = fma(avi, bvj, Simd::splat(C2) - p_hi); + t[4 + 1] += p_hi.to_bits().cast(); + t[4] += p_lo.to_bits().cast(); + let bvj: Simd = i2f(v0_b[1]); + let p_hi = fma(avi, bvj, Simd::splat(C1)); + let p_lo = fma(avi, bvj, Simd::splat(C2) - p_hi); + t[4 + 1 + 1] += p_hi.to_bits().cast(); + t[4 + 1] += p_lo.to_bits().cast(); + let bvj: Simd = i2f(v0_b[2]); + let p_hi = fma(avi, bvj, Simd::splat(C1)); + let p_lo = fma(avi, bvj, Simd::splat(C2) - p_hi); + t[4 + 2 + 1] += p_hi.to_bits().cast(); + t[4 + 2] += p_lo.to_bits().cast(); + let bvj: Simd = i2f(v0_b[3]); + let p_hi = fma(avi, bvj, Simd::splat(C1)); + let p_lo = fma(avi, bvj, Simd::splat(C2) - p_hi); + t[4 + 3 + 1] += p_hi.to_bits().cast(); + t[4 + 3] += p_lo.to_bits().cast(); + let bvj: Simd = i2f(v0_b[4]); + let p_hi = fma(avi, bvj, Simd::splat(C1)); + let p_lo = fma(avi, bvj, Simd::splat(C2) - p_hi); + t[4 + 4 + 1] += p_hi.to_bits().cast(); + t[4 + 4] += p_lo.to_bits().cast(); + + // sign extend redundant carries + t[1] += t[0] >> 51; + t[2] += t[1] >> 51; + t[3] += t[2] >> 51; + t[4] += t[3] >> 51; + + let r0 = smult_noinit_simd(t[0].cast().bitand(Simd::splat(MASK51)), RHO_4); + let r1 = smult_noinit_simd(t[1].cast().bitand(Simd::splat(MASK51)), RHO_3); + let r2 = smult_noinit_simd(t[2].cast().bitand(Simd::splat(MASK51)), RHO_2); + let r3 = smult_noinit_simd(t[3].cast().bitand(Simd::splat(MASK51)), RHO_1); + + let s = [ + r0[0] + r1[0] + r2[0] + r3[0] + t[4], + r0[1] + r1[1] + r2[1] + r3[1] + t[5], + r0[2] + r1[2] + r2[2] + r3[2] + t[6], + r0[3] + r1[3] + r2[3] + r3[3] + t[7], + r0[4] + r1[4] + r2[4] + r3[4] + t[8], + r0[5] + r1[5] + r2[5] + r3[5] + t[9], + ]; + + let m = (s[0].cast() * Simd::splat(U51_NP0)).bitand(Simd::splat(MASK51)); + let mp = smult_noinit_simd(m, U51_P); + + let mut addi = addv_simd(s, mp); + addi[1] += addi[0] >> 51; + let addi = [addi[1], addi[2], addi[3], addi[4], addi[5]]; + + // 1 bit reduction to go from R^-255 to R^-256. reduce_ct does the preparation + // and the final shift is done as part of the conversion back to u256 + let reduced = reduce_ct_simd(addi); + let reduced = redundant_carry(reduced); + let u256_result = u255_to_u256_shr_1_simd(reduced); + let v = transpose_simd_to_u256(u256_result); + (v[0], v[1]) +} + +#[cfg(not(target_arch = "wasm32"))] +#[cfg(test)] +mod tests { + use { + super::*, + crate::{rne::simd_utils::u255_to_u256_simd, test_utils::ark_ff_reference}, + ark_bn254::Fr, + ark_ff::{BigInt, PrimeField}, + proptest::{ + prelude::{prop, Strategy}, + prop_assert_eq, proptest, + }, + }; + + #[test] + fn test_simd_mul() { + proptest!(|( + a in limbs5_51(), + b in limbs5_51(), + c in limbs5_51(), + )| { + let a: [Simd;_] = a.map(Simd::splat); + let b: [Simd;_] = b.map(Simd::splat); + let c: [Simd;_] = c.map(Simd::splat); + let a = u255_to_u256_simd(a).map(|x|x[0]); + let b = u255_to_u256_simd(b).map(|x|x[0]); + let c = u255_to_u256_simd(c).map(|x|x[0]); + let (ab, bc) = simd_mul(a, b,b,c); + let ab_ref = ark_ff_reference(a, b); + let bc_ref = ark_ff_reference(b, c); + let ab = Fr::new(BigInt(ab)); + let bc = Fr::new(BigInt(bc)); + prop_assert_eq!(ab_ref, ab, "mismatch: l = {:X}, b = {:X}", ab_ref.into_bigint(), ab.into_bigint()); + prop_assert_eq!(bc_ref, bc, "mismatch: l = {:X}, b = {:X}", bc_ref.into_bigint(), bc.into_bigint()); + }) + } + + #[test] + fn test_simd_sqr() { + proptest!(|( + a in limbs5_51(), + b in limbs5_51(), + )| { + let a: [Simd;_] = a.map(Simd::splat); + let b: [Simd;_] = b.map(Simd::splat); + let a = u255_to_u256_simd(a).map(|x|x[0]); + let b = u255_to_u256_simd(b).map(|x|x[0]); + let (a2, _b2) = simd_mul(a, a, b, b); + let (a2s, _b2s) = simd_sqr(a, b); + prop_assert_eq!(a2, a2s); + }) + } + + fn limb51() -> impl Strategy { + 0u64..(1u64 << 51) + } + + fn limbs5_51() -> impl Strategy { + prop::array::uniform5(limb51()) + } +} diff --git a/skyscraper/bn254-multiplier/src/rne/simd_utils.rs b/skyscraper/bn254-multiplier/src/rne/simd_utils.rs new file mode 100644 index 00000000..b0054b08 --- /dev/null +++ b/skyscraper/bn254-multiplier/src/rne/simd_utils.rs @@ -0,0 +1,244 @@ +//! SIMD utilities for RNE Montgomery multiplication. + +use { + crate::rne::constants::{C1, C2, C3, MASK51, U51_P}, + core::{ + array, + ops::BitAnd, + simd::{ + cmp::SimdPartialEq, + num::{SimdFloat, SimdInt, SimdUint}, + Simd, + }, + }, + std::simd::{LaneCount, SupportedLaneCount}, +}; +#[inline(always)] +/// On WASM there is no single specialised instruction to cast an integer to a +/// float. Since we are only interested in 52 bits, we can emulate it with fewer +/// instructions. +/// +/// Warning: due to Rust's limitations this can not be a const function. +/// Therefore check your dependency path as this will not be optimised out. +pub fn i2f(a: Simd) -> Simd +where + LaneCount: SupportedLaneCount, +{ + // This function has no target gating as we want to verify this function with + // kani and proptest on a different platform than wasm + + // By adding 2^52 represented as float (0x1p52) -> 0x433 << 52, we align the + // 52bit number fully in the mantissa. This can be done with a simple or. Then + // to convert a to it's floating point number we subtract this again. This way + // we only pay for the conversion of the lower bits and not the full 64 bits. + let exponent = Simd::splat(0x433 << 52); + let a: Simd = Simd::::from_bits(a | exponent); + let b: Simd = Simd::::from_bits(exponent); + a - b +} + +/// Fused multiply-add: `a * b + c`. +#[inline(always)] +pub fn fma(a: Simd, b: Simd, c: Simd) -> Simd { + #[cfg(not(target_arch = "wasm32"))] + { + use std::simd::StdFloat; + + a.mul_add(b, c) + } + #[cfg(target_arch = "wasm32")] + { + use core::arch::wasm32::*; + f64x2_relaxed_madd(a.into(), b.into(), c.into()).into() + } +} + +/// Computes bias compensation for accumulator limbs. +/// +/// - `low_count`: number of p_lo contributions +/// - `high_count`: number of p_hi contributions +#[inline(always)] +pub const fn make_initial(low_count: u64, high_count: u64) -> i64 { + let val = high_count + .wrapping_mul(C1.to_bits()) + .wrapping_add(low_count.wrapping_mul(C3.to_bits())); + -(val as i64) +} + +/// Transpose two 4-limb values into 4 SIMD vectors. +#[inline(always)] +pub fn transpose_u256_to_simd(limbs: [[u64; 4]; 2]) -> [Simd; 4] { + [ + Simd::from_array([limbs[0][0], limbs[1][0]]), + Simd::from_array([limbs[0][1], limbs[1][1]]), + Simd::from_array([limbs[0][2], limbs[1][2]]), + Simd::from_array([limbs[0][3], limbs[1][3]]), + ] +} + +/// Transpose 4 SIMD vectors back to two 4-limb values. +#[inline(always)] +pub fn transpose_simd_to_u256(limbs: [Simd; 4]) -> [[u64; 4]; 2] { + let tmp0 = limbs[0].to_array(); + let tmp1 = limbs[1].to_array(); + let tmp2 = limbs[2].to_array(); + let tmp3 = limbs[3].to_array(); + [[tmp0[0], tmp1[0], tmp2[0], tmp3[0]], [ + tmp0[1], tmp1[1], tmp2[1], tmp3[1], + ]] +} + +/// Convert 4×64-bit to 5×51-bit limb representation. +/// Input must fit in 255 bits; no runtime checking. +#[inline(always)] +pub fn u256_to_u255_simd(limbs: [Simd; 4]) -> [Simd; 5] +where + LaneCount: SupportedLaneCount, +{ + let [l0, l1, l2, l3] = limbs; + [ + (l0) & Simd::splat(MASK51), + ((l0 >> 51) | (l1 << 13)) & Simd::splat(MASK51), + ((l1 >> 38) | (l2 << 26)) & Simd::splat(MASK51), + ((l2 >> 25) | (l3 << 39)) & Simd::splat(MASK51), + l3 >> 12 & Simd::splat(MASK51), + ] +} + +/// Convert 5×51-bit back to 4×64-bit limb representation. +#[inline(always)] +pub fn u255_to_u256_simd(limbs: [Simd; 5]) -> [Simd; 4] +where + LaneCount: SupportedLaneCount, +{ + let [l0, l1, l2, l3, l4] = limbs; + [ + l0 | (l1 << 51), + (l1 >> 13) | (l2 << 38), + (l2 >> 26) | (l3 << 25), + (l3 >> 39) | (l4 << 12), + ] +} + +/// Convert 5×51-bit to 4×64-bit with simultaneous division by 2. +#[inline(always)] +pub fn u255_to_u256_shr_1_simd(limbs: [Simd; 5]) -> [Simd; 4] +where + LaneCount: SupportedLaneCount, +{ + let [l0, l1, l2, l3, l4] = limbs; + [ + (l0 >> 1) | (l1 << 50), + (l1 >> 14) | (l2 << 37), + (l2 >> 27) | (l3 << 24), + (l3 >> 40) | (l4 << 11), + ] +} + +/// Multiply SIMD scalar by 5-limb constant using FMA splitting. +/// Returns 6-limb result in redundant signed form. +#[inline(always)] +pub fn smult_noinit_simd(s: Simd, v: [u64; 5]) -> [Simd; 6] { + let mut t = [Simd::splat(0); 6]; + let s: Simd = i2f(s); + + let p_hi_0 = fma(s, Simd::splat(v[0] as f64), Simd::splat(C1)); + let p_lo_0 = fma(s, Simd::splat(v[0] as f64), Simd::splat(C2) - p_hi_0); + t[1] += p_hi_0.to_bits().cast(); + t[0] += p_lo_0.to_bits().cast(); + + let p_hi_1 = fma(s, Simd::splat(v[1] as f64), Simd::splat(C1)); + let p_lo_1 = fma(s, Simd::splat(v[1] as f64), Simd::splat(C2) - p_hi_1); + t[2] += p_hi_1.to_bits().cast(); + t[1] += p_lo_1.to_bits().cast(); + + let p_hi_2 = fma(s, Simd::splat(v[2] as f64), Simd::splat(C1)); + let p_lo_2 = fma(s, Simd::splat(v[2] as f64), Simd::splat(C2) - p_hi_2); + t[3] += p_hi_2.to_bits().cast(); + t[2] += p_lo_2.to_bits().cast(); + + let p_hi_3 = fma(s, Simd::splat(v[3] as f64), Simd::splat(C1)); + let p_lo_3 = fma(s, Simd::splat(v[3] as f64), Simd::splat(C2) - p_hi_3); + t[4] += p_hi_3.to_bits().cast(); + t[3] += p_lo_3.to_bits().cast(); + + let p_hi_4 = fma(s, Simd::splat(v[4] as f64), Simd::splat(C1)); + let p_lo_4 = fma(s, Simd::splat(v[4] as f64), Simd::splat(C2) - p_hi_4); + t[5] += p_hi_4.to_bits().cast(); + t[4] += p_lo_4.to_bits().cast(); + + t +} + +/// Constant-time conditional add of p to prepare for final bit reduction by +/// making the result even. +#[inline(always)] +pub fn reduce_ct_simd(a: [Simd; 5]) -> [Simd; 5] { + let mut c = [Simd::splat(0); 5]; + let tmp = a[0]; + + // To reduce Check whether the least significant bit is set + let mask = (tmp).bitand(Simd::splat(1)).simd_eq(Simd::splat(1)); + + // Select values based on the mask: if mask lane is true, add p, else add + // zero + let zeros = [Simd::splat(0); 5]; + let p = U51_P.map(|x| Simd::splat(x as i64)); + let b: [_; 5] = array::from_fn(|i| mask.select(p[i], zeros[i])); + + for i in 0..c.len() { + c[i] = a[i] + b[i]; + } + + // Check that final result is even + debug_assert!(c[0][0] & 1 == 0); + debug_assert!(c[0][1] & 1 == 0); + + c +} + +/// Element-wise vector addition in redundant form. +#[inline(always)] +pub fn addv_simd( + va: [Simd; N], + vb: [Simd; N], +) -> [Simd; N] { + let mut vc = [Simd::splat(0); N]; + for i in 0..va.len() { + vc[i] = va[i].cast() + vb[i]; + } + vc +} + +#[cfg(kani)] +mod tests { + use { + crate::rne::simd_utils::{i2f, u255_to_u256_simd, u256_to_u255_simd}, + std::simd::Simd, + }; + + #[kani::proof] + fn u256_to_u255_kani_roundtrip() { + let u: [u64; 4] = [ + kani::any(), + kani::any(), + kani::any(), + kani::any::() & 0x7fffffffffffffff, + ]; + let u255 = u256_to_u255_simd::<1>(u.map(Simd::splat)); + let roundtrip = u255_to_u256_simd::<1>(u255).map(|v| v[0]); + assert_eq!(u, roundtrip) + } + + /// Verify that i2f correctly converts integers in the valid range [0, + /// 2^52). + #[kani::proof] + fn i2f_kani_correctness() { + let val: u64 = kani::any(); + kani::assume(val < (1u64 << 52)); + + let result = i2f(Simd::from_array([val])); + + assert_eq!(result[0], val as f64); + } +} diff --git a/skyscraper/block-multiplier/src/block_simd.rs b/skyscraper/bn254-multiplier/src/rtz/block_simd.rs similarity index 98% rename from skyscraper/block-multiplier/src/block_simd.rs rename to skyscraper/bn254-multiplier/src/rtz/block_simd.rs index e770f557..b261cb45 100644 --- a/skyscraper/block-multiplier/src/block_simd.rs +++ b/skyscraper/bn254-multiplier/src/rtz/block_simd.rs @@ -1,9 +1,12 @@ use { crate::{ constants::*, - simd_utils::{ - addv_simd, make_initial, reduce_ct_simd, smult_noinit_simd, transpose_simd_to_u256, - transpose_u256_to_simd, u256_to_u260_shl2_simd, u260_to_u256_simd, + rtz::{ + constants::*, + simd_utils::{ + addv_simd, make_initial, reduce_ct_simd, smult_noinit_simd, transpose_simd_to_u256, + transpose_u256_to_simd, u256_to_u260_shl2_simd, u260_to_u256_simd, + }, }, subarray, utils::{addv, carrying_mul_add, reduce_ct}, diff --git a/skyscraper/bn254-multiplier/src/rtz/constants.rs b/skyscraper/bn254-multiplier/src/rtz/constants.rs new file mode 100644 index 00000000..2d8cbe29 --- /dev/null +++ b/skyscraper/bn254-multiplier/src/rtz/constants.rs @@ -0,0 +1,71 @@ +use crate::pow_2; + +pub const U52_NP0: u64 = 0x1f593efffffff; +pub const U52_R2: [u64; 5] = [ + 0x0b852d16da6f5, + 0xc621620cddce3, + 0xaf1b95343ffb6, + 0xc3c15e103e7c2, + 0x00281528fa122, +]; + +pub const U52_P: [u64; 5] = [ + 0x1f593f0000001, + 0x4879b9709143e, + 0x181585d2833e8, + 0xa029b85045b68, + 0x030644e72e131, +]; + +pub const U52_2P: [u64; 5] = [ + 0x3eb27e0000002, + 0x90f372e12287c, + 0x302b0ba5067d0, + 0x405370a08b6d0, + 0x060c89ce5c263, +]; + +pub const F52_P: [f64; 5] = [ + 0x1f593f0000001_u64 as f64, + 0x4879b9709143e_u64 as f64, + 0x181585d2833e8_u64 as f64, + 0xa029b85045b68_u64 as f64, + 0x030644e72e131_u64 as f64, +]; + +pub const MASK52: u64 = 2_u64.pow(52) - 1; + +pub const RHO_1: [u64; 5] = [ + 0x82e644ee4c3d2, + 0xf93893c98b1de, + 0xd46fe04d0a4c7, + 0x8f0aad55e2a1f, + 0x005ed0447de83, +]; + +pub const RHO_2: [u64; 5] = [ + 0x74eccce9a797a, + 0x16ddcc30bd8a4, + 0x49ecd3539499e, + 0xb23a6fcc592b8, + 0x00e3bd49f6ee5, +]; + +pub const RHO_3: [u64; 5] = [ + 0x0e8c656567d77, + 0x430d05713ae61, + 0xea3ba6b167128, + 0xa7dae55c5a296, + 0x01b4afd513572, +]; + +pub const RHO_4: [u64; 5] = [ + 0x22e2400e2f27d, + 0x323b46ea19686, + 0xe6c43f0df672d, + 0x7824014c39e8b, + 0x00c6b48afe1b8, +]; + +pub const C1: f64 = pow_2(104); // 2.0^104 +pub const C2: f64 = pow_2(104) + pow_2(52); // 2.0^104 + 2.0^52 diff --git a/skyscraper/bn254-multiplier/src/rtz/mod.rs b/skyscraper/bn254-multiplier/src/rtz/mod.rs new file mode 100644 index 00000000..8f8dc1a0 --- /dev/null +++ b/skyscraper/bn254-multiplier/src/rtz/mod.rs @@ -0,0 +1,6 @@ +pub mod block_simd; +pub mod constants; +pub mod portable_simd; +pub mod simd_utils; + +pub use {block_simd::*, constants::*, portable_simd::*, simd_utils::*}; diff --git a/skyscraper/block-multiplier/src/portable_simd.rs b/skyscraper/bn254-multiplier/src/rtz/portable_simd.rs similarity index 93% rename from skyscraper/block-multiplier/src/portable_simd.rs rename to skyscraper/bn254-multiplier/src/rtz/portable_simd.rs index 39ca34f2..a41c77de 100644 --- a/skyscraper/block-multiplier/src/portable_simd.rs +++ b/skyscraper/bn254-multiplier/src/rtz/portable_simd.rs @@ -1,5 +1,5 @@ use { - crate::{ + crate::rtz::{ constants::*, simd_utils::{ addv_simd, make_initial, reduce_ct_simd, smult_noinit_simd, transpose_simd_to_u256, @@ -11,11 +11,16 @@ use { ops::BitAnd, simd::{num::SimdFloat, Simd}, }, + fp_rounding::{RoundingGuard, Zero}, std::simd::StdFloat, }; #[inline] -pub fn simd_sqr(v0_a: [u64; 4], v1_a: [u64; 4]) -> ([u64; 4], [u64; 4]) { +pub fn simd_sqr( + _rtz: &RoundingGuard, + v0_a: [u64; 4], + v1_a: [u64; 4], +) -> ([u64; 4], [u64; 4]) { let v0_a = u256_to_u260_shl2_simd(transpose_u256_to_simd([v0_a, v1_a])); let mut t: [Simd; 10] = [Simd::splat(0); 10]; @@ -195,6 +200,7 @@ pub fn simd_sqr(v0_a: [u64; 4], v1_a: [u64; 4]) -> ([u64; 4], [u64; 4]) { #[inline] pub fn simd_mul( + _rtz: &RoundingGuard, v0_a: [u64; 4], v0_b: [u64; 4], v1_a: [u64; 4], @@ -377,3 +383,36 @@ pub fn simd_mul( let v = transpose_simd_to_u256(u256_result); (v[0], v[1]) } + +#[cfg(test)] +mod tests { + use { + super::*, + crate::test_utils::{ark_ff_reference, safe_bn254_montgomery_input}, + ark_bn254::Fr, + ark_ff::BigInt, + fp_rounding::{with_rounding_mode, Zero}, + proptest::proptest, + }; + + #[test] + fn test_simd_mul() { + proptest!(|( + a in safe_bn254_montgomery_input(), + b in safe_bn254_montgomery_input(), + c in safe_bn254_montgomery_input(), + )| { + unsafe { + with_rounding_mode((), |rtz : &fp_rounding::RoundingGuard, _| { + + let (ab, bc) = simd_mul(&rtz, a, b, b,c); + let ab_ref = ark_ff_reference(a, b); + let bc_ref = ark_ff_reference(b, c); + let ab = Fr::new(BigInt(ab)); + let bc = Fr::new(BigInt(bc)); + assert_eq!(ab_ref, ab); + assert_eq!(bc_ref, bc); + });} + }); + } +} diff --git a/skyscraper/block-multiplier/src/simd_utils.rs b/skyscraper/bn254-multiplier/src/rtz/simd_utils.rs similarity index 98% rename from skyscraper/block-multiplier/src/simd_utils.rs rename to skyscraper/bn254-multiplier/src/rtz/simd_utils.rs index 9ce3b4f6..144951ff 100644 --- a/skyscraper/block-multiplier/src/simd_utils.rs +++ b/skyscraper/bn254-multiplier/src/rtz/simd_utils.rs @@ -1,5 +1,5 @@ use { - crate::constants::{C1, C2, MASK52, U52_2P}, + crate::rtz::constants::{C1, C2, MASK52, U52_2P}, core::{ arch::aarch64::vcvtq_f64_u64, array, diff --git a/skyscraper/block-multiplier/src/scalar.rs b/skyscraper/bn254-multiplier/src/scalar.rs similarity index 99% rename from skyscraper/block-multiplier/src/scalar.rs rename to skyscraper/bn254-multiplier/src/scalar.rs index ff7250ec..93bb5c48 100644 --- a/skyscraper/block-multiplier/src/scalar.rs +++ b/skyscraper/bn254-multiplier/src/scalar.rs @@ -131,6 +131,7 @@ pub fn scalar_mul(a: [u64; 4], b: [u64; 4]) -> [u64; 4] { reduce_ct(subarray!(addv(s, mp), 1, 4)) } +#[cfg(not(target_arch = "wasm32"))] // Proptest not supported on WASI #[cfg(test)] mod tests { use { diff --git a/skyscraper/block-multiplier/src/test_utils.rs b/skyscraper/bn254-multiplier/src/test_utils.rs similarity index 97% rename from skyscraper/block-multiplier/src/test_utils.rs rename to skyscraper/bn254-multiplier/src/test_utils.rs index e46b3f25..bfbdaab3 100644 --- a/skyscraper/block-multiplier/src/test_utils.rs +++ b/skyscraper/bn254-multiplier/src/test_utils.rs @@ -13,7 +13,7 @@ use { /// Given a multiprecision integer in little-endian format, returns a /// `Strategy` that generates values uniformly in the range `0..=max`. -fn max_multiprecision(max: Vec) -> impl Strategy> { +pub fn max_multiprecision(max: Vec) -> impl Strategy> { // Takes ownership of a vector rather to deal with the 'static // requirement of boxed() let size = max.len(); diff --git a/skyscraper/block-multiplier/src/utils.rs b/skyscraper/bn254-multiplier/src/utils.rs similarity index 66% rename from skyscraper/block-multiplier/src/utils.rs rename to skyscraper/bn254-multiplier/src/utils.rs index b4e92777..ee3ac57b 100644 --- a/skyscraper/block-multiplier/src/utils.rs +++ b/skyscraper/bn254-multiplier/src/utils.rs @@ -14,7 +14,7 @@ use crate::constants::U64_2P; /// # Example /// /// ``` -/// use block_multiplier::subarray; +/// use bn254_multiplier::subarray; /// let array = [1, 2, 3, 4, 5]; /// let sub = subarray!(array, 1, 3); // Creates [2, 3, 4] /// ``` @@ -68,7 +68,32 @@ pub fn sub(a: [u64; N], b: [u64; N]) -> [u64; N] { } #[inline(always)] -pub fn carrying_mul_add(a: u64, b: u64, add: u64, carry: u64) -> (u64, u64) { - let c: u128 = a as u128 * b as u128 + carry as u128 + add as u128; +// Based on ark-ff +// On WASM first doing a widening on the operands will cause __multi3 called +// which is u128xu128 -> u128 causing unnecessary multiplications +pub const fn widening_mul(a: u64, b: u64) -> u128 { + #[cfg(not(target_family = "wasm"))] + { + a as u128 * b as u128 + } + #[cfg(target_family = "wasm")] + { + let a0 = a as u32 as u64; + let a1 = a >> 32; + let b0 = b as u32 as u64; + let b1 = b >> 32; + + let c00 = (a0 * b0) as u128; + let c01 = (a0 * b1) as u128; + let c10 = (a1 * b0) as u128; + let cxx = (c01 + c10) << 32; + let c11 = ((a1 * b1) as u128) << 64; + (c00 | c11) + cxx + } +} + +#[inline(always)] +pub const fn carrying_mul_add(a: u64, b: u64, add: u64, carry: u64) -> (u64, u64) { + let c: u128 = widening_mul(a, b) + carry as u128 + add as u128; (c as u64, (c >> 64) as u64) } diff --git a/skyscraper/core/Cargo.toml b/skyscraper/core/Cargo.toml index aa14dee4..cbbc5f92 100644 --- a/skyscraper/core/Cargo.toml +++ b/skyscraper/core/Cargo.toml @@ -10,7 +10,7 @@ repository.workspace = true [dependencies] # Workspace crates -block-multiplier.workspace = true +bn254-multiplier.workspace = true # Cryptography and proof systems ark-bn254.workspace = true diff --git a/skyscraper/core/benches/bench.rs b/skyscraper/core/benches/bench.rs index a5537148..bf37a2de 100644 --- a/skyscraper/core/benches/bench.rs +++ b/skyscraper/core/benches/bench.rs @@ -185,7 +185,7 @@ mod parts { use skyscraper::reduce::reduce_partial; bencher .with_inputs(|| reduce_partial(array::from_fn(|_| rng().random()))) - .bench_values(block_multiplier::scalar_sqr) + .bench_values(bn254_multiplier::scalar_sqr) } } diff --git a/skyscraper/core/src/block3.rs b/skyscraper/core/src/block3.rs index 285dd521..81974244 100644 --- a/skyscraper/core/src/block3.rs +++ b/skyscraper/core/src/block3.rs @@ -21,7 +21,7 @@ fn compress(guard: &RoundingGuard, input: [[[u64; 4]; 2]; 3]) -> [[u64; 4] fn square(guard: &RoundingGuard, n: [[u64; 4]; 3]) -> [[u64; 4]; 3] { let [a, b, c] = n; let v = array::from_fn(|i| std::simd::u64x2::from_array([b[i], c[i]])); - let (a, v) = block_multiplier::montgomery_square_log_interleaved_3(guard, a, v); + let (a, v) = bn254_multiplier::montgomery_square_log_interleaved_3(guard, a, v); let b = v.map(|e| e[0]); let c = v.map(|e| e[1]); [a, b, c] diff --git a/skyscraper/core/src/block4.rs b/skyscraper/core/src/block4.rs index 5ac239b1..24a388d5 100644 --- a/skyscraper/core/src/block4.rs +++ b/skyscraper/core/src/block4.rs @@ -21,7 +21,7 @@ fn compress(guard: &RoundingGuard, input: [[[u64; 4]; 2]; 4]) -> [[u64; 4] fn square(guard: &RoundingGuard, n: [[u64; 4]; 4]) -> [[u64; 4]; 4] { let [a, b, c, d] = n; let v = array::from_fn(|i| std::simd::u64x2::from_array([c[i], d[i]])); - let (a, b, v) = block_multiplier::montgomery_square_log_interleaved_4(guard, a, b, v); + let (a, b, v) = bn254_multiplier::montgomery_square_log_interleaved_4(guard, a, b, v); let c = v.map(|e| e[0]); let d = v.map(|e| e[1]); [a, b, c, d] diff --git a/skyscraper/core/src/simple.rs b/skyscraper/core/src/simple.rs index c1e530bb..f822c6ad 100644 --- a/skyscraper/core/src/simple.rs +++ b/skyscraper/core/src/simple.rs @@ -1,4 +1,4 @@ -use {crate::generic, block_multiplier::scalar_sqr as square}; +use {crate::generic, bn254_multiplier::scalar_sqr as square}; pub fn compress_many(messages: &[u8], hashes: &mut [u8]) { generic::compress_many( diff --git a/skyscraper/core/src/v1.rs b/skyscraper/core/src/v1.rs index 7f31f1cc..512d2bd1 100644 --- a/skyscraper/core/src/v1.rs +++ b/skyscraper/core/src/v1.rs @@ -5,7 +5,7 @@ use { generic, reduce::{reduce, reduce_partial, reduce_partial_add_rc}, }, - block_multiplier::scalar_sqr as square, + bn254_multiplier::scalar_sqr as square, }; pub fn compress_many(messages: &[u8], hashes: &mut [u8]) { diff --git a/tooling/provekit-bench/Cargo.toml b/tooling/provekit-bench/Cargo.toml index 5c6aaddc..b90f5c9a 100644 --- a/tooling/provekit-bench/Cargo.toml +++ b/tooling/provekit-bench/Cargo.toml @@ -34,4 +34,4 @@ workspace = true [[bench]] name = "bench" -harness = false \ No newline at end of file +harness = false