diff --git a/benches/base62.rs b/benches/base62.rs index 19ad1a5..bd9f271 100644 --- a/benches/base62.rs +++ b/benches/base62.rs @@ -97,7 +97,14 @@ pub fn criterion_benchmark(c: &mut Criterion) { }); group.bench_function("standard_buf_fixed", |b| { - b.iter(|| encode_buf(black_box(u128::MAX), black_box(&mut String::new()))) + b.iter_batched_ref( + || String::with_capacity(22), + |buf| { + buf.clear(); + encode_buf(black_box(u128::MAX), black_box(buf)) + }, + BatchSize::SmallInput, + ) }); group.bench_function("standard_buf_random", |b| { @@ -176,7 +183,14 @@ pub fn criterion_benchmark(c: &mut Criterion) { }); group.bench_function("alternative_buf_fixed", |b| { - b.iter(|| encode_alternative_buf(black_box(u128::MAX), black_box(&mut String::new()))) + b.iter_batched_ref( + || String::with_capacity(22), + |buf| { + buf.clear(); + encode_alternative_buf(black_box(u128::MAX), black_box(buf)) + }, + BatchSize::SmallInput, + ) }); group.bench_function("alternative_buf_random", |b| { diff --git a/src/lib.rs b/src/lib.rs index 0dd4306..483c92c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -22,7 +22,7 @@ extern crate alloc; #[cfg(feature = "std")] extern crate std; -use core::{convert::TryInto, fmt}; +use core::{convert::TryInto, fmt, num::NonZeroU128}; const BASE: u64 = 62; const BASE_TO_2: u64 = BASE * BASE; @@ -51,15 +51,13 @@ const BASE_TO_21: u128 = BASE_TO_20 * BASE as u128; const DIV_BASE_TO_10_MULTIPLY: u128 = 233718071534448225491982379416108680074; const DIV_BASE_TO_10_SHIFT: u8 = 59; -#[repr(align(64))] struct StandardTables { - encode: [u8; 62], + encode_pairs: [[u8; 2]; BASE_TO_2 as usize], decode: [u8; 128], } -#[repr(align(64))] struct AlternativeTables { - encode: [u8; 62], + encode_pairs: [[u8; 2]; BASE_TO_2 as usize], decode: [u8; 128], } @@ -74,6 +72,16 @@ impl StandardTables { b'u', b'v', b'w', b'x', b'y', b'z', ]; + // Generate pair table: index i represents (i / 62, i % 62) + let mut encode_pairs = [[0u8; 2]; BASE_TO_2 as usize]; + let mut i = 0usize; + 
while i < BASE_TO_2 as usize { + let hi = i / 62; + let lo = i % 62; + encode_pairs[i] = [ENCODE[hi], ENCODE[lo]]; + i += 1; + } + let mut decode = [255u8; 128]; // Populate decode table @@ -94,7 +102,7 @@ impl StandardTables { } Self { - encode: ENCODE, + encode_pairs, decode, } } @@ -111,6 +119,16 @@ impl AlternativeTables { b'U', b'V', b'W', b'X', b'Y', b'Z', ]; + // Generate pair table: index i represents (i / 62, i % 62) + let mut encode_pairs = [[0u8; 2]; BASE_TO_2 as usize]; + let mut i = 0usize; + while i < BASE_TO_2 as usize { + let hi = i / 62; + let lo = i % 62; + encode_pairs[i] = [ENCODE[hi], ENCODE[lo]]; + i += 1; + } + let mut decode = [255u8; 128]; // Populate decode table @@ -131,7 +149,7 @@ impl AlternativeTables { } Self { - encode: ENCODE, + encode_pairs, decode, } } @@ -323,35 +341,47 @@ pub fn encode_alternative_io, W: std::io::Write + ?Sized>( // Internal functions used by both no_std and alloc features pub(crate) fn digit_count(n: u128) -> usize { - const POWERS: [u128; 22] = [ + const THRESHOLDS: [u128; 23] = [ 0, - BASE as u128, - BASE_TO_2 as u128, - BASE_TO_3 as u128, - BASE_TO_4 as u128, - BASE_TO_5 as u128, - BASE_TO_6 as u128, - BASE_TO_7 as u128, - BASE_TO_8 as u128, - BASE_TO_9 as u128, - BASE_TO_10, - BASE_TO_11, - BASE_TO_12, - BASE_TO_13, - BASE_TO_14, - BASE_TO_15, - BASE_TO_16, - BASE_TO_17, - BASE_TO_18, - BASE_TO_19, - BASE_TO_20, - BASE_TO_21, + BASE as u128 - 1, + BASE_TO_2 as u128 - 1, + BASE_TO_3 as u128 - 1, + BASE_TO_4 as u128 - 1, + BASE_TO_5 as u128 - 1, + BASE_TO_6 as u128 - 1, + BASE_TO_7 as u128 - 1, + BASE_TO_8 as u128 - 1, + BASE_TO_9 as u128 - 1, + BASE_TO_10 - 1, + BASE_TO_11 - 1, + BASE_TO_12 - 1, + BASE_TO_13 - 1, + BASE_TO_14 - 1, + BASE_TO_15 - 1, + BASE_TO_16 - 1, + BASE_TO_17 - 1, + BASE_TO_18 - 1, + BASE_TO_19 - 1, + BASE_TO_20 - 1, + BASE_TO_21 - 1, + u128::MAX, // sentinel, u128 cannot be larger than this value ]; - match POWERS.binary_search(&n) { - Ok(n) => n.wrapping_add(1), - Err(n) => n, - } + let 
Some(n) = NonZeroU128::new(n) else { + return 1; + }; + // We want to find floor(log62(n)) + 1 = floor(log2(n) / log2(62)) + 1 + // First, approximate log2(n) with ilog2 = floor(log2(n)), underestimating by 0 <= err < 1 + let ilog2 = n.ilog2() as usize; + + // Next, we find floor(ilog2/log2(62)), which is exactly equal to floor(ilog2 * 43/256) for all ilog2 in [0, 127] + // The result is an underestimate by up to 1, purely because ilog2 is an underestimate + let estimate = ((ilog2 * 43) >> 8) + 1; + + // SAFETY: estimate is in [1,22] since ilog2 is in [0,127] and (127*43)>>8 + 1 = 22 + let threshold = unsafe { *THRESHOLDS.get_unchecked(estimate) }; + let bump = (n.get() > threshold) as usize; + estimate + bump } #[inline(always)] @@ -571,22 +601,27 @@ pub fn decode_alternative>(input: T) -> Result } // Common encoding function -unsafe fn encode_impl(num: u128, digits: usize, buf: &mut [u8], encode_table: &[u8; 62]) -> usize { +unsafe fn encode_impl( + num: u128, + digits: usize, + buf: &mut [u8], + encode_pairs: &[[u8; 2]; BASE_TO_2 as usize], +) -> usize { unsafe { if let Ok(num) = TryInto::<u64>::try_into(num) { - encode_impl_u64(num, digits, buf, encode_table) + encode_impl_u64(num, digits, buf, encode_pairs) } else if digits > 20 { - encode_impl_over_20_digits(num, digits, buf, encode_table) + encode_impl_over_20_digits(num, digits, buf, encode_pairs) } else if digits == 20 { // (AAAAAAAAAA, BBBBBBBBBB) let (first_u64, second_u64) = div_base_to_10(num); // AAAAAAAAAA let first_u64 = first_u64 as u64; - encode_impl_20_digits(first_u64, second_u64, buf, encode_table) + encode_impl_20_digits(first_u64, second_u64, buf, encode_pairs) } else { // digits between 11 and 20 (10 digits would always fit into a u64, which we checked first) - encode_impl_over_10_under_20_digits(num, digits, buf, encode_table) + encode_impl_over_10_under_20_digits(num, digits, buf, encode_pairs) } } } @@ -596,29 +631,26 @@ unsafe fn encode_impl_over_20_digits( num: u128, digits: usize, buf: &mut 
[u8], - encode_table: &[u8; 62], + encode_pairs: &[[u8; 2]; BASE_TO_2 as usize], ) -> usize { - // input: AABBBBBBBBBBCCCCCCCCCC + // input: [A]BCCCCCCCCCCDDDDDDDDDD // - // (AABBBBBBBBBB, CCCCCCCCCC) + // ([A]BCCCCCCCCCC, DDDDDDDDDD) let (num, third_u64) = div_base_to_10(num); - // (AA, BBBBBBBBBB) + // ([A]B, CCCCCCCCCC) let (first_u64, second_u64) = div_base_to_10(num); - // AA - no more than two digits as num was 22 digits + // [A]B - no more than two digits as num was 22 digits let first_u64 = first_u64 as u64; - // encode the first one or two digits - if digits == 21 { - unsafe { - *buf.get_unchecked_mut(0) = *encode_table.get_unchecked(first_u64 as usize); - } - } else { - let second_digit = first_u64 % BASE; - let first_digit = first_u64 / BASE; - unsafe { - *buf.get_unchecked_mut(1) = *encode_table.get_unchecked(second_digit as usize); - *buf.get_unchecked_mut(0) = *encode_table.get_unchecked(first_digit as usize); - } + // Branchless 21/22 digit handling of [A]B + // For 21 digits: write 0 then overwrite with B at position 0 + // For 22 digits: write A at position 0, B at position 1 + unsafe { + // [A, B] in 22 digit case or [0, B] in 21 digit case + let [c1, c2] = *encode_pairs.get_unchecked(first_u64 as usize); + let is_22 = digits - 21; // 0 or 1 + *buf.get_unchecked_mut(0) = c1; + *buf.get_unchecked_mut(is_22) = c2; } // encode the last 20 digits @@ -627,7 +659,7 @@ unsafe fn encode_impl_over_20_digits( second_u64, third_u64, &mut buf[(digits - 20)..], - encode_table, + encode_pairs, ); } @@ -639,32 +671,47 @@ unsafe fn encode_impl_20_digits( first_u64: u64, second_u64: u64, buf: &mut [u8], - encode_table: &[u8; 62], + encode_pairs: &[[u8; 2]; BASE_TO_2 as usize], ) -> usize { let first_u32 = (first_u64 / BASE_TO_5) as u32; let second_u32 = (first_u64 % BASE_TO_5) as u32; let third_u32 = (second_u64 / BASE_TO_5) as u32; let fourth_u32 = (second_u64 % BASE_TO_5) as u32; - // [AAAAA, BBBBB, CCCCC, DDDDD] + // [ABCDE, FGHIJ, KLMNO, PQRST] let mut nums = 
[first_u32, second_u32, third_u32, fourth_u32]; - const STARTING_WRITE_IDXS: [usize; 4] = [5, 10, 15, 20]; - - for i in 0..5 { - nums.iter_mut() - .zip(STARTING_WRITE_IDXS) - .for_each(|(num, starting_write_idx)| { - let quotient = num.wrapping_div(BASE as u32); - let remainder = (*num - (BASE as u32) * quotient) as usize; - *num = quotient; - - unsafe { - *buf.get_unchecked_mut(starting_write_idx - i - 1) = - *encode_table.get_unchecked(remainder) - } - }); + const BASE_POSITIONS: [usize; 4] = [0, 5, 10, 15]; + + for pair_idx in 0..2 { + nums.iter_mut().zip(BASE_POSITIONS).for_each(|(num, base)| { + // pair_idx 0: ABC, FGH, KLM, PQR + // pair_idx 1: A, F, K, P + let quotient = *num / BASE_TO_2 as u32; + // pair_idx 0: DE, IJ, NO, ST + // pair_idx 1: BC, GH, LM, QR + let pair = (*num - BASE_TO_2 as u32 * quotient) as usize; + *num = quotient; + + // Write positions: base+3,base+4 for pair_idx=0; base+1,base+2 for pair_idx=1 + let pos = base + 4 - 2 * pair_idx; + unsafe { + // pair_idx 0: [D, E], [I, J], [N, O], [S, T] + // pair_idx 1: [B, C], [G, H], [L, M], [Q, R] + let [c1, c2] = *encode_pairs.get_unchecked(pair); + *buf.get_unchecked_mut(pos - 1) = c1; + *buf.get_unchecked_mut(pos) = c2; + } + }); } + // A, F, K, P + nums.iter() + .zip(BASE_POSITIONS) + .for_each(|(num, base)| unsafe { + // num is now a single digit (0-61), use second byte of pair entry (the low digit) + *buf.get_unchecked_mut(base) = encode_pairs.get_unchecked(*num as usize)[1]; + }); + 20 } @@ -673,7 +720,7 @@ unsafe fn encode_impl_over_10_under_20_digits( num: u128, digits: usize, buf: &mut [u8], - encode_table: &[u8; 62], + encode_pairs: &[[u8; 2]; BASE_TO_2 as usize], ) -> usize { let mut write_idx = digits; let mut digit_index = 0_usize; @@ -689,7 +736,7 @@ unsafe fn encode_impl_over_10_under_20_digits( num /= BASE; unsafe { - *buf.get_unchecked_mut(write_idx) = *encode_table.get_unchecked(remainder as usize); + *buf.get_unchecked_mut(write_idx) = encode_pairs.get_unchecked(remainder as usize)[1]; } 
digit_index = digit_index.wrapping_add(1); @@ -706,26 +753,25 @@ unsafe fn encode_impl_u64( num: u64, digits: usize, buf: &mut [u8], - encode_table: &[u8; 62], + encode_pairs: &[[u8; 2]; BASE_TO_2 as usize], ) -> usize { - if digits == 11 { - // ABBBBBBBBBB - - // A - let first_u64 = num / (BASE_TO_10 as u64); - // BBBBBBBBBB - let second_u64 = num % (BASE_TO_10 as u64); + if digits >= 10 { + // Branchless 10/11 digit handling + // For 10 digits: BBBBBBBBBB -> first_digit=0, remainder=BBBBBBBBBB, offset=0 (buf[0] gets overwritten) + // For 11 digits: ABBBBBBBBBB -> first_digit=A, remainder=BBBBBBBBBB, offset=1 + let first_digit = num / (BASE_TO_10 as u64); + let remainder = num % (BASE_TO_10 as u64); + let offset = digits - 10; // 0 or 1 unsafe { - *buf.get_unchecked_mut(0) = *encode_table.get_unchecked(first_u64 as usize); + // This is unnecessary work for the 10 digit case, but it's very cheap work and allows us to avoid a branch + *buf.get_unchecked_mut(0) = encode_pairs.get_unchecked(first_digit as usize)[1]; - encode_impl_u64_10_digits(second_u64, &mut buf[1..], encode_table); + encode_impl_u64_10_digits(remainder, &mut buf[offset..], encode_pairs); } digits - } else if digits == 10 { - unsafe { encode_impl_u64_10_digits(num, buf, encode_table) } } else { - unsafe { encode_impl_u64_under_10_digits(num, digits, buf, encode_table) } + unsafe { encode_impl_u64_under_10_digits(num, digits, buf, encode_pairs) } } } @@ -733,7 +779,7 @@ unsafe fn encode_impl_u64_under_10_digits( mut num: u64, digits: usize, buf: &mut [u8], - encode_table: &[u8; 62], + encode_pairs: &[[u8; 2]; BASE_TO_2 as usize], ) -> usize { let mut write_idx = digits; let mut digit_index = 0_usize; @@ -745,7 +791,7 @@ unsafe fn encode_impl_u64_under_10_digits( num /= BASE; unsafe { - *buf.get_unchecked_mut(write_idx) = *encode_table.get_unchecked(remainder as usize); + *buf.get_unchecked_mut(write_idx) = encode_pairs.get_unchecked(remainder as usize)[1]; } digit_index = digit_index.wrapping_add(1); 
@@ -754,29 +800,48 @@ unsafe fn encode_impl_u64_under_10_digits( digits } -unsafe fn encode_impl_u64_10_digits(num: u64, buf: &mut [u8], encode_table: &[u8; 62]) -> usize { +unsafe fn encode_impl_u64_10_digits( + num: u64, + buf: &mut [u8], + encode_pairs: &[[u8; 2]; BASE_TO_2 as usize], +) -> usize { let first_u32 = (num / BASE_TO_5) as u32; let second_u32 = (num % BASE_TO_5) as u32; - // [AAAAA, BBBBB] + // [ABCDE, FGHIJ] let mut nums = [first_u32, second_u32]; - const STARTING_WRITE_IDXS: [usize; 2] = [5, 10]; - - for i in 0..5 { - nums.iter_mut() - .zip(STARTING_WRITE_IDXS) - .for_each(|(num, starting_write_idx)| { - let quotient = num.wrapping_div(BASE as u32); - let remainder = (*num - (BASE as u32) * quotient) as usize; - *num = quotient; - - unsafe { - *buf.get_unchecked_mut(starting_write_idx - i - 1) = - *encode_table.get_unchecked(remainder) - } - }); + const BASE_POSITIONS: [usize; 2] = [0, 5]; + + for pair_idx in 0..2 { + nums.iter_mut().zip(BASE_POSITIONS).for_each(|(num, base)| { + // pair_idx 0: ABC, FGH + // pair_idx 1: A, F + let quotient = *num / BASE_TO_2 as u32; + // pair_idx 0: DE, IJ + // pair_idx 1: BC, GH + let pair = (*num - BASE_TO_2 as u32 * quotient) as usize; + *num = quotient; + + // Write positions: base+3,base+4 for pair_idx=0; base+1,base+2 for pair_idx=1 + let pos = base + 4 - 2 * pair_idx; + unsafe { + // pair_idx 0: [D, E], [I, J] + // pair_idx 1: [B, C], [G, H] + let [c1, c2] = *encode_pairs.get_unchecked(pair); + *buf.get_unchecked_mut(pos - 1) = c1; + *buf.get_unchecked_mut(pos) = c2; + } + }); } + // A, F + nums.iter() + .zip(BASE_POSITIONS) + .for_each(|(num, base)| unsafe { + // num is now a single digit (0-61), use second byte of pair entry (the low digit) + *buf.get_unchecked_mut(base) = encode_pairs.get_unchecked(*num as usize)[1]; + }); + 10 } @@ -808,11 +873,11 @@ const fn mulh(x: u128, y: u128) -> u128 { } unsafe fn _encode_buf(num: u128, digits: usize, buf: &mut [u8]) -> usize { - unsafe { encode_impl(num, digits, 
buf, &STANDARD_TABLES.encode) } + unsafe { encode_impl(num, digits, buf, &STANDARD_TABLES.encode_pairs) } } unsafe fn _encode_alternative_buf(num: u128, digits: usize, buf: &mut [u8]) -> usize { - unsafe { encode_impl(num, digits, buf, &ALTERNATIVE_TABLES.encode) } + unsafe { encode_impl(num, digits, buf, &ALTERNATIVE_TABLES.encode_pairs) } } #[cfg(feature = "alloc")]