diff --git a/src/handles.rs b/src/handles.rs index 94cecbd..fd1be45 100644 --- a/src/handles.rs +++ b/src/handles.rs @@ -16,6 +16,8 @@ //! the plan is to replace the internals with unsafe code that omits the //! bound check at the read/write time. +use core::mem::MaybeUninit; + #[cfg(all( feature = "simd-accel", any( @@ -25,6 +27,8 @@ ) ))] use crate::simd_funcs::*; +use crate::MaybeUninitSliceInitFromSlice; +use crate::PointerStripMaybeUninit; #[cfg(all( feature = "simd-accel", @@ -153,7 +157,7 @@ impl UnalignedU16Slice { #[cfg(feature = "simd-accel")] #[inline(always)] - pub fn copy_bmp_to(&self, other: &mut [u16]) -> Option<(u16, usize)> { + pub fn copy_bmp_to(&self, other: &mut [MaybeUninit]) -> Option<(u16, usize)> { assert!(self.len <= other.len()); let mut offset = 0; // Safety: SIMD_STRIDE_SIZE is measured in bytes, whereas len is in u16s. We check we can @@ -167,7 +171,7 @@ impl UnalignedU16Slice { } // Safety: we have enough space on the other side to write this unsafe { - store8_unaligned(other.as_mut_ptr().add(offset), simd); + store8_unaligned(other.as_mut_ptr().add(offset).strip_maybeuninit(), simd); } if contains_surrogates(simd) { break; @@ -192,11 +196,11 @@ impl UnalignedU16Slice { #[cfg(not(feature = "simd-accel"))] #[inline(always)] - fn copy_bmp_to(&self, other: &mut [u16]) -> Option<(u16, usize)> { + fn copy_bmp_to(&self, other: &mut [MaybeUninit]) -> Option<(u16, usize)> { assert!(self.len <= other.len()); for (i, target) in other.iter_mut().enumerate().take(self.len) { let unit = swap_if_opposite_endian::(self.at(i)); - *target = unit; + *target = MaybeUninit::new(unit); if super::in_range16(unit, 0xD800, 0xE000) { return Some((unit, i)); } @@ -208,7 +212,7 @@ impl UnalignedU16Slice { #[inline(always)] fn copy_unaligned_basic_latin_to_ascii_alu( src: UnalignedU16Slice, - dst: &mut [u8], + dst: &mut [MaybeUninit], offset: usize, ) -> CopyAsciiResult { let len = ::core::cmp::min(src.len(), dst.len()); @@ -221,7 +225,7 @@ fn 
copy_unaligned_basic_latin_to_ascii_alu( if unit > 0x7F { return CopyAsciiResult::GoOn((unit, i + offset)); } - dst[i] = unit as u8; + dst[i] = MaybeUninit::new(unit as u8); i += 1; } } @@ -239,7 +243,7 @@ fn swap_if_opposite_endian(unit: u16) -> u16 { #[inline(always)] fn copy_unaligned_basic_latin_to_ascii( src: UnalignedU16Slice, - dst: &mut [u8], + dst: &mut [MaybeUninit], ) -> CopyAsciiResult { copy_unaligned_basic_latin_to_ascii_alu::(src, dst, 0) } @@ -248,7 +252,7 @@ fn copy_unaligned_basic_latin_to_ascii( #[inline(always)] fn copy_unaligned_basic_latin_to_ascii( src: UnalignedU16Slice, - dst: &mut [u8], + dst: &mut [MaybeUninit], ) -> CopyAsciiResult { let len = ::core::cmp::min(src.len(), dst.len()); let mut offset = 0; @@ -284,7 +288,7 @@ fn copy_unaligned_basic_latin_to_ascii( #[inline(always)] fn convert_unaligned_utf16_to_utf8( src: UnalignedU16Slice, - dst: &mut [u8], + dst: &mut [MaybeUninit], ) -> (usize, usize, bool) { if dst.len() < 4 { return (0, 0, false); @@ -317,16 +321,16 @@ fn convert_unaligned_utf16_to_utf8( let non_ascii_minus_surrogate_start = non_ascii.wrapping_sub(0xD800); if non_ascii_minus_surrogate_start > (0xDFFF - 0xD800) { if non_ascii < 0x800 { - dst[dst_pos] = ((non_ascii >> 6) | 0xC0) as u8; + dst[dst_pos] = MaybeUninit::new(((non_ascii >> 6) | 0xC0) as u8); dst_pos += 1; - dst[dst_pos] = ((non_ascii & 0x3F) | 0x80) as u8; + dst[dst_pos] = MaybeUninit::new(((non_ascii & 0x3F) | 0x80) as u8); dst_pos += 1; } else { - dst[dst_pos] = ((non_ascii >> 12) | 0xE0) as u8; + dst[dst_pos] = MaybeUninit::new(((non_ascii >> 12) | 0xE0) as u8); dst_pos += 1; - dst[dst_pos] = (((non_ascii & 0xFC0) >> 6) | 0x80) as u8; + dst[dst_pos] = MaybeUninit::new((((non_ascii & 0xFC0) >> 6) | 0x80) as u8); dst_pos += 1; - dst[dst_pos] = ((non_ascii & 0x3F) | 0x80) as u8; + dst[dst_pos] = MaybeUninit::new(((non_ascii & 0x3F) | 0x80) as u8); dst_pos += 1; } } else if non_ascii_minus_surrogate_start <= (0xDBFF - 0xD800) { @@ -340,13 +344,15 @@ fn 
convert_unaligned_utf16_to_utf8( let point = (u32::from(non_ascii) << 10) + u32::from(second) - (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32); - dst[dst_pos] = ((point >> 18) | 0xF0u32) as u8; + dst[dst_pos] = MaybeUninit::new(((point >> 18) | 0xF0u32) as u8); dst_pos += 1; - dst[dst_pos] = (((point & 0x3F000u32) >> 12) | 0x80u32) as u8; + dst[dst_pos] = + MaybeUninit::new((((point & 0x3F000u32) >> 12) | 0x80u32) as u8); dst_pos += 1; - dst[dst_pos] = (((point & 0xFC0u32) >> 6) | 0x80u32) as u8; + dst[dst_pos] = + MaybeUninit::new((((point & 0xFC0u32) >> 6) | 0x80u32) as u8); dst_pos += 1; - dst[dst_pos] = ((point & 0x3Fu32) | 0x80u32) as u8; + dst[dst_pos] = MaybeUninit::new(((point & 0x3Fu32) | 0x80u32) as u8); dst_pos += 1; } else { // The next code unit is not a low surrogate. Don't advance @@ -370,7 +376,7 @@ fn convert_unaligned_utf16_to_utf8( non_ascii = unit; continue 'inner; } - dst[dst_pos] = unit as u8; + dst[dst_pos] = MaybeUninit::new(unit as u8); dst_pos += 1; continue 'outer; } @@ -588,13 +594,13 @@ where } pub struct Utf16Destination<'a> { - slice: &'a mut [u16], + slice: &'a mut [MaybeUninit], pos: usize, } impl<'a> Utf16Destination<'a> { #[inline(always)] - pub fn new(dst: &mut [u16]) -> Utf16Destination { + pub fn new(dst: &'a mut [MaybeUninit]) -> Utf16Destination<'a> { Utf16Destination { slice: dst, pos: 0 } } #[inline(always)] @@ -619,10 +625,8 @@ impl<'a> Utf16Destination<'a> { } #[inline(always)] fn write_code_unit(&mut self, u: u16) { - unsafe { - // OK, because we checked before handing out a handle. - *(self.slice.get_unchecked_mut(self.pos)) = u; - } + // SAFETY: OK, because we checked before handing out a handle. 
+ *unsafe { self.slice.get_unchecked_mut(self.pos) } = MaybeUninit::new(u); self.pos += 1; } #[inline(always)] @@ -683,7 +687,11 @@ impl<'a> Utf16Destination<'a> { // Safety: This function is documented as needing valid pointers for src/dest and len, which // is true since we've passed the minumum length of the two match unsafe { - ascii_to_basic_latin(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length) + ascii_to_basic_latin( + src_remaining.as_ptr(), + dst_remaining.as_mut_ptr().strip_maybeuninit(), + length, + ) } { None => { source.pos += length; @@ -721,7 +729,11 @@ impl<'a> Utf16Destination<'a> { // Safety: This function is documented as needing valid pointers for src/dest and len, which // is true since we've passed the minumum length of the two match unsafe { - ascii_to_basic_latin(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length) + ascii_to_basic_latin( + src_remaining.as_ptr(), + dst_remaining.as_mut_ptr().strip_maybeuninit(), + length, + ) } { None => { source.pos += length; @@ -804,7 +816,7 @@ impl<'a> Utf16Destination<'a> { return Some((source.pos, self.pos)); } // `surrogate` was already speculatively written - dst_remaining[second_pos] = second; + dst_remaining[second_pos] = MaybeUninit::new(second); offset += 2; continue; } else { @@ -933,13 +945,13 @@ where } pub struct Utf8Destination<'a> { - slice: &'a mut [u8], + slice: &'a mut [MaybeUninit], pos: usize, } impl<'a> Utf8Destination<'a> { #[inline(always)] - pub fn new(dst: &mut [u8]) -> Utf8Destination { + pub fn new(dst: &'a mut [MaybeUninit]) -> Utf8Destination<'a> { Utf8Destination { slice: dst, pos: 0 } } #[inline(always)] @@ -964,10 +976,9 @@ impl<'a> Utf8Destination<'a> { } #[inline(always)] fn write_code_unit(&mut self, u: u8) { - unsafe { - // OK, because we checked before handing out a handle. - *(self.slice.get_unchecked_mut(self.pos)) = u; - } + // SAFETY: OK, because we checked before handing out a handle. 
+ *unsafe { self.slice.get_unchecked_mut(self.pos) } = MaybeUninit::new(u); + self.pos += 1; } #[inline(always)] @@ -1043,7 +1054,11 @@ impl<'a> Utf8Destination<'a> { (DecoderResult::InputEmpty, src_remaining.len()) }; match unsafe { - ascii_to_ascii(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length) + ascii_to_ascii( + src_remaining.as_ptr(), + dst_remaining.as_mut_ptr().strip_maybeuninit(), + length, + ) } { None => { source.pos += length; @@ -1083,7 +1098,11 @@ impl<'a> Utf8Destination<'a> { (DecoderResult::InputEmpty, src_remaining.len()) }; match unsafe { - ascii_to_ascii(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length) + ascii_to_ascii( + src_remaining.as_ptr(), + dst_remaining.as_mut_ptr().strip_maybeuninit(), + length, + ) } { None => { source.pos += length; @@ -1116,7 +1135,7 @@ impl<'a> Utf8Destination<'a> { // Validate first, then memcpy to let memcpy do its thing even for // non-ASCII. (And potentially do something better than SSE2 for ASCII.) let valid_len = utf8_valid_up_to(&src_remaining[..min_len]); - (&mut dst_remaining[..valid_len]).copy_from_slice(&src_remaining[..valid_len]); + dst_remaining[..valid_len].init_from_slice(&src_remaining[..valid_len]); source.pos += valid_len; self.pos += valid_len; } @@ -1262,7 +1281,11 @@ impl<'a> Utf16Source<'a> { (EncoderResult::InputEmpty, src_remaining.len()) }; match unsafe { - basic_latin_to_ascii(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length) + basic_latin_to_ascii( + src_remaining.as_ptr(), + dst_remaining.as_mut_ptr().strip_maybeuninit(), + length, + ) } { None => { self.pos += length; @@ -1331,7 +1354,11 @@ impl<'a> Utf16Source<'a> { (EncoderResult::InputEmpty, src_remaining.len()) }; match unsafe { - basic_latin_to_ascii(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length) + basic_latin_to_ascii( + src_remaining.as_ptr(), + dst_remaining.as_mut_ptr().strip_maybeuninit(), + length, + ) } { None => { self.pos += length; @@ -1554,7 +1581,11 @@ impl<'a> 
Utf8Source<'a> { (EncoderResult::InputEmpty, src_remaining.len()) }; match unsafe { - ascii_to_ascii(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length) + ascii_to_ascii( + src_remaining.as_ptr(), + dst_remaining.as_mut_ptr().strip_maybeuninit(), + length, + ) } { None => { self.pos += length; @@ -1604,7 +1635,11 @@ impl<'a> Utf8Source<'a> { (EncoderResult::InputEmpty, src_remaining.len()) }; match unsafe { - ascii_to_ascii(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length) + ascii_to_ascii( + src_remaining.as_ptr(), + dst_remaining.as_mut_ptr().strip_maybeuninit(), + length, + ) } { None => { self.pos += length; @@ -1660,7 +1695,11 @@ impl<'a> Utf8Source<'a> { (EncoderResult::InputEmpty, src_remaining.len()) }; match unsafe { - ascii_to_ascii(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length) + ascii_to_ascii( + src_remaining.as_ptr(), + dst_remaining.as_mut_ptr().strip_maybeuninit(), + length, + ) } { None => { self.pos += length; @@ -1915,22 +1954,22 @@ where } pub struct ByteDestination<'a> { - slice: &'a mut [u8], + slice: &'a mut [MaybeUninit], /// Pointer to the original start of the slice. It's never dereferenced. 
- start: *const u8, + start: *const MaybeUninit, } impl<'a> ByteDestination<'a> { #[inline(always)] - pub fn new(dst: &mut [u8]) -> ByteDestination { + pub fn new(dst: &'a mut [MaybeUninit]) -> ByteDestination<'a> { ByteDestination { start: dst.as_ptr(), slice: dst, } } #[inline(always)] - pub fn remaining(&mut self) -> &mut [u8] { - &mut self.slice + pub fn remaining(&mut self) -> &mut [MaybeUninit] { + self.slice } #[inline(always)] pub fn check_space_one<'b>(&'b mut self) -> Space> { @@ -1975,24 +2014,24 @@ impl<'a> ByteDestination<'a> { let (dst, rest) = core::mem::take(&mut self.slice).split_first_mut().unwrap(); self.slice = rest; - *dst = first; + *dst = MaybeUninit::new(first); } #[inline(always)] fn write_two(&mut self, first: u8, second: u8) { let (dst, rest) = core::mem::take(&mut self.slice).split_at_mut(2); self.slice = rest; - dst[0] = first; - dst[1] = second; + dst[0] = MaybeUninit::new(first); + dst[1] = MaybeUninit::new(second); } #[inline(always)] fn write_three(&mut self, first: u8, second: u8, third: u8) { let (dst, rest) = core::mem::take(&mut self.slice).split_at_mut(3); self.slice = rest; - dst[0] = first; - dst[1] = second; - dst[2] = third; + dst[0] = MaybeUninit::new(first); + dst[1] = MaybeUninit::new(second); + dst[2] = MaybeUninit::new(third); } #[inline(always)] fn write_four(&mut self, first: u8, second: u8, third: u8, fourth: u8) { @@ -2000,10 +2039,10 @@ impl<'a> ByteDestination<'a> { let (dst, rest) = core::mem::take(&mut self.slice).split_at_mut(4); self.slice = rest; - dst[0] = first; - dst[1] = second; - dst[2] = third; - dst[3] = fourth; + dst[0] = MaybeUninit::new(first); + dst[1] = MaybeUninit::new(second); + dst[2] = MaybeUninit::new(third); + dst[3] = MaybeUninit::new(fourth); } /// Assume this many bytes have been written #[inline(always)] diff --git a/src/lib.rs b/src/lib.rs index 78e9849..e0dae58 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -793,6 +793,7 @@ use alloc::vec::Vec; use core::cmp::Ordering; use 
core::hash::Hash; use core::hash::Hasher; +use core::mem::MaybeUninit; #[cfg(feature = "serde")] use serde::de::Visitor; @@ -3131,8 +3132,8 @@ impl Encoding { ); unsafe { let vec = string.as_mut_vec(); - vec.set_len(valid_up_to); core::ptr::copy_nonoverlapping(bytes.as_ptr(), vec.as_mut_ptr(), valid_up_to); + vec.set_len(valid_up_to); } (decoder, string, valid_up_to) } else { @@ -3232,8 +3233,8 @@ impl Encoding { ); unsafe { let vec = string.as_mut_vec(); - vec.set_len(valid_up_to); core::ptr::copy_nonoverlapping(bytes.as_ptr(), vec.as_mut_ptr(), valid_up_to); + vec.set_len(valid_up_to); } (decoder, string, &bytes[valid_up_to..]) } else { @@ -3323,8 +3324,8 @@ impl Encoding { .next_power_of_two(), ); unsafe { - vec.set_len(valid_up_to); core::ptr::copy_nonoverlapping(bytes.as_ptr(), vec.as_mut_ptr(), valid_up_to); + vec.set_len(valid_up_to); } let mut total_read = valid_up_to; let mut total_had_errors = false; @@ -3959,17 +3960,28 @@ impl Decoder { /// methods collectively. /// /// Available via the C wrapper. + #[inline] pub fn decode_to_utf8( &mut self, src: &[u8], dst: &mut [u8], last: bool, + ) -> (CoderResult, usize, usize, bool) { + // SAFETY: we only write initialized values to the slice. + let dst = unsafe { as_slice_of_maybeuninit(dst) }; + self.decode_to_utf8_maybeuninit(src, dst, last) + } + fn decode_to_utf8_maybeuninit( + &mut self, + src: &[u8], + dst: &mut [MaybeUninit], + last: bool, ) -> (CoderResult, usize, usize, bool) { let mut had_errors = false; let mut total_read = 0usize; let mut total_written = 0usize; loop { - let (result, read, written) = self.decode_to_utf8_without_replacement( + let (result, read, written) = self.decode_to_utf8_maybeuninit_without_replacement( &src[total_read..], &mut dst[total_written..], last, @@ -3999,11 +4011,11 @@ impl Decoder { // otherwise we'd have gotten OutputFull already. // XXX: is the above comment actually true for UTF-8 itself? // TODO: Consider having fewer bound checks here. 
- dst[total_written] = 0xEFu8; + dst[total_written] = MaybeUninit::new(0xEFu8); total_written += 1; - dst[total_written] = 0xBFu8; + dst[total_written] = MaybeUninit::new(0xBFu8); total_written += 1; - dst[total_written] = 0xBDu8; + dst[total_written] = MaybeUninit::new(0xBDu8); total_written += 1; } } @@ -4074,16 +4086,12 @@ impl Decoder { dst: &mut String, last: bool, ) -> (CoderResult, usize, bool) { - unsafe { - let vec = dst.as_mut_vec(); - let old_len = vec.len(); - let capacity = vec.capacity(); - vec.set_len(capacity); - let (result, read, written, replaced) = - self.decode_to_utf8(src, &mut vec[old_len..], last); - vec.set_len(old_len + written); - (result, read, replaced) - } + let vec = unsafe { dst.as_mut_vec() }; + let old_len = vec.len(); + let spare = spare_capacity_mut(vec); + let (result, read, written, replaced) = self.decode_to_utf8_maybeuninit(src, spare, last); + unsafe { vec.set_len(old_len + written) }; + (result, read, replaced) } public_decode_function!(/// Incrementally decode a byte stream into UTF-8 @@ -4096,6 +4104,7 @@ impl Decoder { /// Available via the C wrapper. 
, decode_to_utf8_without_replacement, + decode_to_utf8_maybeuninit_without_replacement, decode_to_utf8_raw, decode_to_utf8_checking_end, decode_to_utf8_after_one_potential_bom_byte, @@ -4164,16 +4173,13 @@ impl Decoder { dst: &mut String, last: bool, ) -> (DecoderResult, usize) { - unsafe { - let vec = dst.as_mut_vec(); - let old_len = vec.len(); - let capacity = vec.capacity(); - vec.set_len(capacity); - let (result, read, written) = - self.decode_to_utf8_without_replacement(src, &mut vec[old_len..], last); - vec.set_len(old_len + written); - (result, read) - } + let vec = unsafe { dst.as_mut_vec() }; + let old_len = vec.len(); + let spare = spare_capacity_mut(vec); + let (result, read, written) = + self.decode_to_utf8_maybeuninit_without_replacement(src, spare, last); + unsafe { vec.set_len(old_len + written) }; + (result, read) } /// Query the worst-case UTF-16 output size (with or without replacement). @@ -4326,6 +4332,7 @@ impl Decoder { /// Available via the C wrapper. , decode_to_utf16_without_replacement, + decode_to_utf16_maybeuninit_without_replacement, decode_to_utf16_raw, decode_to_utf16_checking_end, decode_to_utf16_after_one_potential_bom_byte, @@ -4575,11 +4582,22 @@ impl Encoder { /// methods collectively. /// /// Available via the C wrapper. + #[inline] pub fn encode_from_utf8( &mut self, src: &str, dst: &mut [u8], last: bool, + ) -> (CoderResult, usize, usize, bool) { + // SAFETY: we only write initialized values to the slice. 
+ let dst = unsafe { as_slice_of_maybeuninit(dst) }; + self.encode_from_utf8_maybeuninit(src, dst, last) + } + pub fn encode_from_utf8_maybeuninit( + &mut self, + src: &str, + dst: &mut [MaybeUninit], + last: bool, ) -> (CoderResult, usize, usize, bool) { let dst_len = dst.len(); let effective_dst_len = if self.encoding().can_encode_everything() { @@ -4597,7 +4615,7 @@ impl Encoder { let mut total_read = 0usize; let mut total_written = 0usize; loop { - let (result, read, written) = self.encode_from_utf8_without_replacement( + let (result, read, written) = self.variant.encode_from_utf8_raw( &src[total_read..], &mut dst[total_written..effective_dst_len], last, @@ -4665,15 +4683,11 @@ impl Encoder { dst: &mut Vec, last: bool, ) -> (CoderResult, usize, bool) { - unsafe { - let old_len = dst.len(); - let capacity = dst.capacity(); - dst.set_len(capacity); - let (result, read, written, replaced) = - self.encode_from_utf8(src, &mut dst[old_len..], last); - dst.set_len(old_len + written); - (result, read, replaced) - } + let old_len = dst.len(); + let spare = spare_capacity_mut(dst); + let (result, read, written, replaced) = self.encode_from_utf8_maybeuninit(src, spare, last); + unsafe { dst.set_len(old_len + written) }; + (result, read, replaced) } /// Incrementally encode into byte stream from UTF-8 _without replacement_. @@ -4688,6 +4702,8 @@ impl Encoder { dst: &mut [u8], last: bool, ) -> (EncoderResult, usize, usize) { + // SAFETY: we only write initialized values to the slice. 
+ let dst = unsafe { as_slice_of_maybeuninit(dst) }; self.variant.encode_from_utf8_raw(src, dst, last) } @@ -4705,15 +4721,11 @@ impl Encoder { dst: &mut Vec, last: bool, ) -> (EncoderResult, usize) { - unsafe { - let old_len = dst.len(); - let capacity = dst.capacity(); - dst.set_len(capacity); - let (result, read, written) = - self.encode_from_utf8_without_replacement(src, &mut dst[old_len..], last); - dst.set_len(old_len + written); - (result, read) - } + let old_len = dst.len(); + let spare = spare_capacity_mut(dst); + let (result, read, written) = self.variant.encode_from_utf8_raw(src, spare, last); + unsafe { dst.set_len(old_len + written) }; + (result, read) } /// Query the worst-case output size when encoding from UTF-16 with @@ -4811,6 +4823,8 @@ impl Encoder { EncoderResult::Unmappable(unmappable) => { had_unmappables = true; debug_assert!(dst.len() - total_written >= NCR_EXTRA); + // SAFETY: we only write initialized values to the slice. + let dst_maybeuninit = unsafe { as_slice_of_maybeuninit(dst) }; // There are no UTF-16 encoders and even if there were, // they'd never have unmappables. debug_assert_ne!(self.encoding(), UTF_16BE); @@ -4821,7 +4835,7 @@ impl Encoder { // ISO-2022-JP and come here, the encoder is in either the // ASCII or the Roman state. We are allowed to generate any // printable ASCII excluding \ and ~. - total_written += write_ncr(unmappable, &mut dst[total_written..]); + total_written += write_ncr(unmappable, &mut dst_maybeuninit[total_written..]); if total_written >= effective_dst_len { if total_read == src.len() && !(last && self.has_pending_state()) { return ( @@ -4855,12 +4869,14 @@ impl Encoder { dst: &mut [u8], last: bool, ) -> (EncoderResult, usize, usize) { + // SAFETY: we only write initialized values to the slice. + let dst = unsafe { as_slice_of_maybeuninit(dst) }; self.variant.encode_from_utf16_raw(src, dst, last) } } /// Format an unmappable as NCR without heap allocation. 
-fn write_ncr(unmappable: char, dst: &mut [u8]) -> usize { +fn write_ncr(unmappable: char, dst: &mut [MaybeUninit]) -> usize { // len is the number of decimal digits needed to represent unmappable plus // 3 (the length of "&#" and ";"). let mut number = unmappable as u32; @@ -4882,19 +4898,19 @@ fn write_ncr(unmappable: char, dst: &mut [u8]) -> usize { debug_assert!(number >= 10u32); debug_assert!(len <= dst.len()); let mut pos = len - 1; - dst[pos] = b';'; + dst[pos] = MaybeUninit::new(b';'); pos -= 1; loop { let rightmost = number % 10; - dst[pos] = rightmost as u8 + b'0'; + dst[pos] = MaybeUninit::new(rightmost as u8 + b'0'); pos -= 1; if number < 10 { break; } number /= 10; } - dst[1] = b'#'; - dst[0] = b'&'; + dst[1] = MaybeUninit::new(b'#'); + dst[0] = MaybeUninit::new(b'&'); len } @@ -4984,6 +5000,62 @@ fn checked_min(one: Option, other: Option) -> Option { } } +/// like slice::copy_from_slice, but with a [MaybeUninit] destination. +pub(crate) trait MaybeUninitSliceInitFromSlice { + fn init_from_slice(&mut self, slice: &[T]); +} + +impl MaybeUninitSliceInitFromSlice for [MaybeUninit] { + fn init_from_slice(&mut self, slice: &[T]) { + // SAFETY: `MaybeUninit` has the same layout as `T`. + // Note also that both `T` and `MaybeUninit` are `Copy`. + let slice = unsafe { core::mem::transmute::<&[T], &[MaybeUninit]>(slice) }; + self.copy_from_slice(slice); + } +} + +/// Returns the remaining spare capacity of the vector as a slice of +/// `MaybeUninit`. +/// +/// The returned slice can be used to fill the vector with data (e.g. by +/// reading from a file) before marking the data as initialized using the +/// [`set_len`] method. +/// +/// [`set_len`]: Vec::set_len +#[inline] +fn spare_capacity_mut(vec: &mut Vec) -> &mut [MaybeUninit] { + // Note: this function is copy-pasted from Rust std, as it is not yet stable. + // Note: + // This method is not implemented in terms of `split_at_spare_mut`, + // to prevent invalidation of pointers to the buffer. 
+ unsafe { + core::slice::from_raw_parts_mut( + vec.as_mut_ptr().add(vec.len()) as *mut MaybeUninit, + vec.capacity() - vec.len(), + ) + } +} + +/// Helper trait to make `*mut MaybeUninit<T>` to `*mut T` conversions less error prone than +/// using the `as` operator directly, which requires writing the `T` out explicitly (and correctly!) +pub(crate) trait PointerStripMaybeUninit { + /// *mut T + type P; + fn strip_maybeuninit(self) -> Self::P; +} +impl PointerStripMaybeUninit for *mut MaybeUninit { + type P = *mut T; + fn strip_maybeuninit(self) -> Self::P { + self as Self::P + } +} + +/// # Safety +/// Caller must only write valid, initialized values into the slice. +unsafe fn as_slice_of_maybeuninit(slice: &mut [T]) -> &mut [MaybeUninit] { + core::mem::transmute::<&mut [T], &mut [MaybeUninit]>(slice) +} + // ############## TESTS ############### #[cfg(all(test, feature = "serde"))] diff --git a/src/macros.rs b/src/macros.rs index 2038129..7d1178d 100644 --- a/src/macros.rs +++ b/src/macros.rs @@ -25,7 +25,7 @@ macro_rules! decoder_function { $dest_struct:ident) => ( pub fn $name(&mut $slf, src: &[u8], - dst: &mut [$code_unit], + dst: &mut [core::mem::MaybeUninit<$code_unit>], last: bool) -> (DecoderResult, usize, usize) { let mut $source = ByteSource::new(src); @@ -141,7 +141,7 @@ macro_rules! ascii_compatible_two_byte_decoder_function { $ascii_punctuation:expr) => ( pub fn $name(&mut $slf, src: &[u8], - dst: &mut [$code_unit], + dst: &mut [core::mem::MaybeUninit<$code_unit>], last: bool) -> (DecoderResult, usize, usize) { let mut $source = ByteSource::new(src); @@ -365,7 +365,7 @@ macro_rules! gb18030_decoder_function { #[allow(clippy::never_loop)] pub fn $name(&mut $slf, src: &[u8], - dst: &mut [$code_unit], + dst: &mut [core::mem::MaybeUninit<$code_unit>], last: bool) -> (DecoderResult, usize, usize) { let mut $source = ByteSource::new(src); @@ -691,7 +691,7 @@ macro_rules! 
euc_jp_decoder_function { #[allow(clippy::never_loop)] pub fn $name(&mut $slf, src: &[u8], - dst: &mut [$code_unit], + dst: &mut [core::mem::MaybeUninit<$code_unit>], last: bool) -> (DecoderResult, usize, usize) { let mut $source = ByteSource::new(src); @@ -976,7 +976,7 @@ macro_rules! encoder_function { $source_struct:ident) => ( pub fn $name(&mut $slf, src: &$input, - dst: &mut [u8], + dst: &mut [core::mem::MaybeUninit], last: bool) -> (EncoderResult, usize, usize) { let mut $source = $source_struct::new(src); @@ -1073,7 +1073,7 @@ macro_rules! ascii_compatible_encoder_function { $ascii_punctuation:expr) => ( pub fn $name(&mut $slf, src: &$input, - dst: &mut [u8], + dst: &mut [core::mem::MaybeUninit], _last: bool) -> (EncoderResult, usize, usize) { let mut $source = $source_struct::new(src); @@ -1287,18 +1287,28 @@ macro_rules! ascii_compatible_bmp_encoder_functions { macro_rules! public_decode_function{ ($(#[$meta:meta])*, $decode_to_utf:ident, + $decode_to_utf_maybeuninit:ident, $decode_to_utf_raw:ident, $decode_to_utf_checking_end:ident, $decode_to_utf_after_one_potential_bom_byte:ident, $decode_to_utf_after_two_potential_bom_bytes:ident, $decode_to_utf_checking_end_with_offset:ident, $code_unit:ty) => ( - $(#[$meta])* pub fn $decode_to_utf(&mut self, - src: &[u8], - dst: &mut [$code_unit], - last: bool) - -> (DecoderResult, usize, usize) { + src: &[u8], + dst: &mut [$code_unit], + last: bool) + -> (DecoderResult, usize, usize) { + // SAFETY: we only write initialized values to the slice. + let dst = unsafe { as_slice_of_maybeuninit(dst) }; + self.$decode_to_utf_maybeuninit(src, dst, last) + } + $(#[$meta])* + pub(crate) fn $decode_to_utf_maybeuninit(&mut self, + src: &[u8], + dst: &mut [MaybeUninit<$code_unit>], + last: bool) + -> (DecoderResult, usize, usize) { let mut offset = 0usize; loop { match self.life_cycle { @@ -1507,7 +1517,7 @@ macro_rules! 
public_decode_function{ fn $decode_to_utf_after_one_potential_bom_byte(&mut self, src: &[u8], - dst: &mut [$code_unit], + dst: &mut [MaybeUninit<$code_unit>], last: bool, offset: usize, first_byte: u8) @@ -1544,7 +1554,7 @@ macro_rules! public_decode_function{ fn $decode_to_utf_after_two_potential_bom_bytes(&mut self, src: &[u8], - dst: &mut [$code_unit], + dst: &mut [MaybeUninit<$code_unit>], last: bool, offset: usize) -> (DecoderResult, usize, usize) { @@ -1597,7 +1607,7 @@ macro_rules! public_decode_function{ /// as having been consumed. fn $decode_to_utf_checking_end_with_offset(&mut self, src: &[u8], - dst: &mut [$code_unit], + dst: &mut [MaybeUninit<$code_unit>], last: bool, offset: usize) -> (DecoderResult, usize, usize) { @@ -1610,7 +1620,7 @@ macro_rules! public_decode_function{ /// `true` and result is `DecoderResult::InputEmpty`. fn $decode_to_utf_checking_end(&mut self, src: &[u8], - dst: &mut [$code_unit], + dst: &mut [MaybeUninit<$code_unit>], last: bool) -> (DecoderResult, usize, usize) { debug_assert_eq!(self.life_cycle, DecoderLifeCycle::Converting); diff --git a/src/mem.rs b/src/mem.rs index 92941bb..cc00478 100644 --- a/src/mem.rs +++ b/src/mem.rs @@ -24,6 +24,8 @@ //! The FFI binding for this module are in the //! [encoding_c_mem crate](https://github.com/hsivonen/encoding_c_mem). +use core::mem::MaybeUninit; + #[cfg(feature = "alloc")] use alloc::borrow::Cow; #[cfg(feature = "alloc")] @@ -37,6 +39,7 @@ use super::in_inclusive_range8; use super::in_range16; use super::in_range32; use super::DecoderResult; +use crate::as_slice_of_maybeuninit; use crate::ascii::*; use crate::utf_8::*; @@ -1488,6 +1491,12 @@ pub fn check_utf16_for_latin1_and_bidi(buffer: &[u16]) -> Latin1Bidi { /// /// Panics if the destination buffer is shorter than stated above. 
pub fn convert_utf8_to_utf16(src: &[u8], dst: &mut [u16]) -> usize { + // SAFETY: `MaybeUninit<u16>` has the same layout as `u16`, and we only write initialized values to the slice. + let dst = unsafe { core::mem::transmute::<&mut [u16], &mut [MaybeUninit]>(dst) }; + convert_utf8_to_utf16_maybeuninit(src, dst) +} + +pub fn convert_utf8_to_utf16_maybeuninit(src: &[u8], dst: &mut [MaybeUninit]) -> usize { // TODO: Can the requirement for dst to be at least one unit longer // be eliminated? assert!(dst.len() > src.len()); @@ -1509,7 +1518,7 @@ pub fn convert_utf8_to_utf16(src: &[u8], dst: &mut [u16]) -> usize { DecoderResult::Malformed(_, _) => { // There should always be space for the U+FFFD, because // otherwise we'd have gotten OutputFull already. - dst[total_written] = 0xFFFD; + dst[total_written] = MaybeUninit::new(0xFFFD); total_written += 1; } } @@ -1625,7 +1634,15 @@ pub fn convert_str_to_utf16(src: &str, dst: &mut [u16]) -> usize { /// # Panics /// /// Panics if the destination buffer is shorter than stated above. +#[inline] pub fn convert_utf8_to_utf16_without_replacement(src: &[u8], dst: &mut [u16]) -> Option { + let dst = unsafe { core::mem::transmute::<&mut [u16], &mut [MaybeUninit]>(dst) }; + convert_utf8_to_utf16_without_replacement_internal(src, dst) +} +fn convert_utf8_to_utf16_without_replacement_internal( + src: &[u8], + dst: &mut [MaybeUninit], +) -> Option { assert!( dst.len() >= src.len(), "Destination must not be shorter than the source." ); @@ -1664,6 +1681,16 @@ pub fn convert_utf8_to_utf16_without_replacement(src: &[u8], dst: &mut [u16]) -> /// together with the `unsafe` method `as_bytes_mut()` on `&mut str`. #[inline(always)] pub fn convert_utf16_to_utf8_partial(src: &[u16], dst: &mut [u8]) -> (usize, usize) { + // SAFETY: we only write initialized values to the slice. 
+ let dst = unsafe { as_slice_of_maybeuninit(dst) }; + convert_utf16_to_utf8_partial_maybeuninit(src, dst) +} + +#[inline(always)] +pub(crate) fn convert_utf16_to_utf8_partial_maybeuninit( + src: &[u16], + dst: &mut [MaybeUninit], +) -> (usize, usize) { // The two functions called below are marked `inline(never)` to make // transitions from the hot part (first function) into the cold part // (second function) go through a return and another call to discouge @@ -1787,11 +1814,26 @@ pub fn convert_latin1_to_utf16(src: &[u8], dst: &mut [u16]) { /// If you want to convert into a `&mut str`, use /// `convert_utf16_to_str_partial()` instead of using this function /// together with the `unsafe` method `as_bytes_mut()` on `&mut str`. +#[inline] pub fn convert_latin1_to_utf8_partial(src: &[u8], dst: &mut [u8]) -> (usize, usize) { - let src_len = src.len(); let src_ptr = src.as_ptr(); + let src_len = src.len(); let dst_ptr = dst.as_mut_ptr(); let dst_len = dst.len(); + unsafe { convert_latin1_to_utf8_partial_raw(src_ptr, src_len, dst_ptr, dst_len) } +} + +/// # Safety +/// src_ptr must be valid for reads at offsets `0..src_len`. +/// dst_ptr must be valid for writes at offsets `0..dst_len`. +/// +/// NOTE: this method does not read values from `dst_ptr`, so `dst_ptr` can point to uninitialized memory. 
+unsafe fn convert_latin1_to_utf8_partial_raw( + src_ptr: *const u8, + src_len: usize, + dst_ptr: *mut u8, + dst_len: usize, +) -> (usize, usize) { let mut total_read = 0usize; let mut total_written = 0usize; loop { @@ -1814,9 +1856,9 @@ pub fn convert_latin1_to_utf8_partial(src: &[u8], dst: &mut [u8]) -> (usize, usi total_read += 1; // consume `non_ascii` - dst[total_written] = (non_ascii >> 6) | 0xC0; + dst_ptr.add(total_written).write((non_ascii >> 6) | 0xC0); total_written += 1; - dst[total_written] = (non_ascii & 0x3F) | 0x80; + dst_ptr.add(total_written).write((non_ascii & 0x3F) | 0x80); total_written += 1; continue; } @@ -1844,12 +1886,28 @@ pub fn convert_latin1_to_utf8_partial(src: &[u8], dst: &mut [u8]) -> (usize, usi /// a `&mut str`, use `convert_utf16_to_str()` instead of this function. #[inline] pub fn convert_latin1_to_utf8(src: &[u8], dst: &mut [u8]) -> usize { + // SAFETY: `src` is valid for reads within it, as is `dst` for writes within it. + unsafe { convert_latin1_to_utf8_raw(src.as_ptr(), src.len(), dst.as_mut_ptr(), dst.len()) } +} + +/// # Safety +/// src_ptr must be valid for reads at offsets `0..src_len`. +/// dst_ptr must be valid for writes at offsets `0..dst_len`. +/// +/// NOTE: this method does not read values from `dst_ptr`, so `dst_ptr` can point to uninitialized memory. +#[inline] +unsafe fn convert_latin1_to_utf8_raw( + src_ptr: *const u8, + src_len: usize, + dst_ptr: *mut u8, + dst_len: usize, +) -> usize { assert!( - dst.len() >= src.len() * 2, + dst_len >= src_len * 2, "Destination must not be shorter than the source times two." 
); - let (read, written) = convert_latin1_to_utf8_partial(src, dst); - debug_assert_eq!(read, src.len()); + let (read, written) = convert_latin1_to_utf8_partial_raw(src_ptr, src_len, dst_ptr, dst_len); + debug_assert_eq!(read, src_len); written } @@ -1925,15 +1983,24 @@ pub fn convert_latin1_to_str(src: &[u8], dst: &mut str) -> usize { /// /// If debug assertions are enabled (and not fuzzing) and the input is /// not in the range U+0000 to U+00FF, inclusive. +#[inline] pub fn convert_utf8_to_latin1_lossy(src: &[u8], dst: &mut [u8]) -> usize { + // SAFETY: `dst.as_mut_ptr()` can be written to within its length. + unsafe { convert_utf8_to_latin1_lossy_raw(src, dst.as_mut_ptr(), dst.len()) } +} + +/// # Safety +/// dst_ptr must be valid for writes at offsets `0..dst_len`. +/// +/// NOTE: this method does not read values from `dst_ptr`, so `dst_ptr` can point to uninitialized memory. +unsafe fn convert_utf8_to_latin1_lossy_raw(src: &[u8], dst_ptr: *mut u8, dst_len: usize) -> usize { assert!( - dst.len() >= src.len(), + dst_len >= src.len(), "Destination must not be shorter than the source." 
); non_fuzz_debug_assert!(is_utf8_latin1(src)); let src_len = src.len(); let src_ptr = src.as_ptr(); - let dst_ptr = dst.as_mut_ptr(); let mut total_read = 0usize; let mut total_written = 0usize; loop { @@ -1956,7 +2023,9 @@ pub fn convert_utf8_to_latin1_lossy(src: &[u8], dst: &mut [u8]) -> usize { let trail = src[total_read]; total_read += 1; - dst[total_written] = ((non_ascii & 0x1F) << 6) | (trail & 0x3F); + dst_ptr + .add(total_written) + .write(((non_ascii & 0x1F) << 6) | (trail & 0x3F)); total_written += 1; continue; } @@ -2016,13 +2085,20 @@ pub fn decode_latin1<'a>(bytes: &'a [u8]) -> Cow<'a, str> { } let (head, tail) = bytes.split_at(up_to); let capacity = head.len() + tail.len() * 2; - let mut vec = Vec::with_capacity(capacity); - unsafe { - vec.set_len(capacity); - } - (&mut vec[..up_to]).copy_from_slice(head); - let written = convert_latin1_to_utf8(tail, &mut vec[up_to..]); - vec.truncate(up_to + written); + let mut vec = Vec::::with_capacity(capacity); + vec.extend(head); + // SAFETY: these pointers and lengths are valid for the required reads and writes. + let written = unsafe { + convert_latin1_to_utf8_raw( + tail.as_ptr(), + tail.len(), + vec.as_mut_ptr().add(up_to), + capacity - up_to, + ) + }; + // SAFETY: convert_latin1_to_utf8_raw initialized `written` valid values into the `vec`. 
+ unsafe { vec.set_len(up_to + written) }; + // SAFETY: `vec` contains only valid UTF-8 Cow::Owned(unsafe { String::from_utf8_unchecked(vec) }) } @@ -2053,13 +2129,13 @@ pub fn encode_latin1_lossy<'a>(string: &'a str) -> Cow<'a, [u8]> { } let (head, tail) = bytes.split_at(up_to); let capacity = bytes.len(); - let mut vec = Vec::with_capacity(capacity); - unsafe { - vec.set_len(capacity); - } - (&mut vec[..up_to]).copy_from_slice(head); - let written = convert_utf8_to_latin1_lossy(tail, &mut vec[up_to..]); - vec.truncate(up_to + written); + let mut vec = Vec::::with_capacity(capacity); + vec.extend(head); + // SAFETY: these pointers and lengths are valid for the required reads and writes. + let written = unsafe { + convert_utf8_to_latin1_lossy_raw(tail, vec.as_mut_ptr().add(up_to), capacity - up_to) + }; + unsafe { vec.set_len(up_to + written) }; Cow::Owned(vec) } diff --git a/src/replacement.rs b/src/replacement.rs index 3d1bcc1..75f48d6 100644 --- a/src/replacement.rs +++ b/src/replacement.rs @@ -34,7 +34,7 @@ impl ReplacementDecoder { pub fn decode_to_utf16_raw( &mut self, src: &[u8], - dst: &mut [u16], + dst: &mut [MaybeUninit], _last: bool, ) -> (DecoderResult, usize, usize) { // Don't err if the input stream is empty. See @@ -53,7 +53,7 @@ impl ReplacementDecoder { pub fn decode_to_utf8_raw( &mut self, src: &[u8], - dst: &mut [u8], + dst: &mut [MaybeUninit], _last: bool, ) -> (DecoderResult, usize, usize) { // Don't err if the input stream is empty. See diff --git a/src/single_byte.rs b/src/single_byte.rs index 49f099e..37efb47 100644 --- a/src/single_byte.rs +++ b/src/single_byte.rs @@ -7,6 +7,8 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. 
+use core::mem::MaybeUninit; + use super::*; use crate::ascii::*; use crate::data::position; @@ -37,7 +39,7 @@ impl SingleByteDecoder { pub fn decode_to_utf8_raw( &mut self, src: &[u8], - dst: &mut [u8], + dst: &mut [core::mem::MaybeUninit], _last: bool, ) -> (DecoderResult, usize, usize) { let mut source = ByteSource::new(src); @@ -147,7 +149,7 @@ impl SingleByteDecoder { pub fn decode_to_utf16_raw( &mut self, src: &[u8], - dst: &mut [u16], + dst: &mut [MaybeUninit], _last: bool, ) -> (DecoderResult, usize, usize) { let (pending, length) = if dst.len() < src.len() { @@ -163,7 +165,7 @@ impl SingleByteDecoder { // Safety: length is the minimum length, `src/dst + x` will always be valid for reads/writes of `len - x` ascii_to_basic_latin( src.as_ptr().add(converted), - dst.as_mut_ptr().add(converted), + dst.as_mut_ptr().add(converted).strip_maybeuninit(), length - converted, ) } { @@ -198,10 +200,9 @@ impl SingleByteDecoder { converted, ); } - unsafe { - // Safety: As mentioned above, `converted < length` - *(dst.get_unchecked_mut(converted)) = mapped; - } + // Safety: As mentioned above, `converted < length` + *unsafe { dst.get_unchecked_mut(converted) } = MaybeUninit::new(mapped); + // Safety: `converted <= length` upheld, since `converted < length` before this converted += 1; // Next, handle ASCII punctuation and non-ASCII without @@ -227,10 +228,10 @@ impl SingleByteDecoder { // Testing on Haswell says that we should write the // byte unconditionally instead of trying to unread it // to make it part of the next SIMD stride. - unsafe { - // Safety: `converted < length` is true for this loop - *(dst.get_unchecked_mut(converted)) = u16::from(b); - } + // Safety: `converted < length` is true for this loop + *unsafe { dst.get_unchecked_mut(converted) } = + MaybeUninit::new(u16::from(b)); + // Safety: We are now at `converted <= length`. 
We should *not* `continue` // the loop without reverifying converted += 1; @@ -403,7 +404,7 @@ impl SingleByteEncoder { pub fn encode_from_utf16_raw( &mut self, src: &[u16], - dst: &mut [u8], + dst: &mut [MaybeUninit], _last: bool, ) -> (EncoderResult, usize, usize) { let (pending, length) = if dst.len() < src.len() { @@ -419,7 +420,7 @@ impl SingleByteEncoder { // Safety: length is the minimum length, `src/dst + x` will always be valid for reads/writes of `len - x` basic_latin_to_ascii( src.as_ptr().add(converted), - dst.as_mut_ptr().add(converted), + dst.as_mut_ptr().add(converted).strip_maybeuninit(), length - converted, ) } { @@ -438,10 +439,10 @@ impl SingleByteEncoder { // `converted` doesn't count the reading of `non_ascii` yet. match self.encode_u16(non_ascii) { Some(byte) => { - unsafe { - // Safety: we're allowed this access since `converted < length` - *(dst.get_unchecked_mut(converted)) = byte; - } + // Safety: we're allowed this access since `converted < length` + *unsafe { dst.get_unchecked_mut(converted) } = + MaybeUninit::new(byte); + converted += 1; // `converted <= length` now } @@ -535,10 +536,10 @@ impl SingleByteEncoder { // Testing on Haswell says that we should write the // byte unconditionally instead of trying to unread it // to make it part of the next SIMD stride. 
- unsafe { - // Safety: Can rely on converted < length - *(dst.get_unchecked_mut(converted)) = unit as u8; - } + // Safety: Can rely on converted < length + *unsafe { dst.get_unchecked_mut(converted) } = + MaybeUninit::new(unit as u8); + converted += 1; // `converted <= length` here if unit < 60 { diff --git a/src/utf_8.rs b/src/utf_8.rs index 8d836c2..5b20571 100644 --- a/src/utf_8.rs +++ b/src/utf_8.rs @@ -12,8 +12,9 @@ use crate::ascii::ascii_to_basic_latin; use crate::ascii::basic_latin_to_ascii; use crate::ascii::validate_ascii; use crate::handles::*; -use crate::mem::convert_utf16_to_utf8_partial; +use crate::mem::convert_utf16_to_utf8_partial_maybeuninit; use crate::variant::*; +use core::mem::MaybeUninit; cfg_if! { if #[cfg(feature = "simd-accel")] { @@ -227,7 +228,10 @@ pub fn utf8_valid_up_to(src: &[u8]) -> usize { } #[allow(clippy::never_loop, clippy::cognitive_complexity)] -pub fn convert_utf8_to_utf16_up_to_invalid(src: &[u8], dst: &mut [u16]) -> (usize, usize) { +pub fn convert_utf8_to_utf16_up_to_invalid( + src: &[u8], + dst: &mut [MaybeUninit], +) -> (usize, usize) { let mut read = 0; let mut written = 0; 'outer: loop { @@ -236,7 +240,11 @@ pub fn convert_utf8_to_utf16_up_to_invalid(src: &[u8], dst: &mut [u16]) -> (usiz let dst_remaining = &mut dst[written..]; let length = ::core::cmp::min(src_remaining.len(), dst_remaining.len()); match unsafe { - ascii_to_basic_latin(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length) + ascii_to_basic_latin( + src_remaining.as_ptr(), + dst_remaining.as_mut_ptr().strip_maybeuninit(), + length, + ) } { None => { read += length; @@ -272,10 +280,9 @@ pub fn convert_utf8_to_utf16_up_to_invalid(src: &[u8], dst: &mut [u16]) -> (usiz if !in_inclusive_range8(second, 0x80, 0xBF) { break 'outer; } - unsafe { - *(dst.get_unchecked_mut(written)) = - ((u16::from(byte) & 0x1F) << 6) | (u16::from(second) & 0x3F) - }; + *unsafe { dst.get_unchecked_mut(written) } = MaybeUninit::new( + ((u16::from(byte) & 0x1F) << 6) | 
(u16::from(second) & 0x3F), + ); read += 2; written += 1; @@ -286,7 +293,8 @@ pub fn convert_utf8_to_utf16_up_to_invalid(src: &[u8], dst: &mut [u16]) -> (usiz if likely(read + 4 <= src.len()) { byte = unsafe { *(src.get_unchecked(read)) }; if byte < 0x80 { - unsafe { *(dst.get_unchecked_mut(written)) = u16::from(byte) }; + *unsafe { dst.get_unchecked_mut(written) } = + MaybeUninit::new(u16::from(byte)); read += 1; written += 1; continue 'outer; @@ -310,7 +318,7 @@ pub fn convert_utf8_to_utf16_up_to_invalid(src: &[u8], dst: &mut [u16]) -> (usiz let point = ((u16::from(byte) & 0xF) << 12) | ((u16::from(second) & 0x3F) << 6) | (u16::from(third) & 0x3F); - unsafe { *(dst.get_unchecked_mut(written)) = point }; + *unsafe { dst.get_unchecked_mut(written) } = MaybeUninit::new(point); read += 3; written += 1; @@ -324,7 +332,8 @@ pub fn convert_utf8_to_utf16_up_to_invalid(src: &[u8], dst: &mut [u16]) -> (usiz continue 'three; } if likely(byte < 0x80) { - unsafe { *(dst.get_unchecked_mut(written)) = u16::from(byte) }; + *unsafe { dst.get_unchecked_mut(written) } = + MaybeUninit::new(u16::from(byte)); read += 1; written += 1; continue 'outer; @@ -354,10 +363,10 @@ pub fn convert_utf8_to_utf16_up_to_invalid(src: &[u8], dst: &mut [u16]) -> (usiz | ((u32::from(second) & 0x3F) << 12) | ((u32::from(third) & 0x3F) << 6) | (u32::from(fourth) & 0x3F); - unsafe { *(dst.get_unchecked_mut(written)) = (0xD7C0 + (point >> 10)) as u16 }; - unsafe { - *(dst.get_unchecked_mut(written + 1)) = (0xDC00 + (point & 0x3FF)) as u16 - }; + *unsafe { dst.get_unchecked_mut(written) } = + MaybeUninit::new((0xD7C0 + (point >> 10)) as u16); + *unsafe { dst.get_unchecked_mut(written + 1) } = + MaybeUninit::new((0xDC00 + (point & 0x3FF)) as u16); read += 4; written += 2; @@ -368,7 +377,8 @@ pub fn convert_utf8_to_utf16_up_to_invalid(src: &[u8], dst: &mut [u16]) -> (usiz if likely(read + 4 <= src.len()) { byte = unsafe { *(src.get_unchecked(read)) }; if byte < 0x80 { - unsafe { 
*(dst.get_unchecked_mut(written)) = u16::from(byte) }; + *unsafe { dst.get_unchecked_mut(written) } = + MaybeUninit::new(u16::from(byte)); read += 1; written += 1; continue 'outer; @@ -392,7 +402,7 @@ pub fn convert_utf8_to_utf16_up_to_invalid(src: &[u8], dst: &mut [u16]) -> (usiz // Inspecting the lead byte directly is faster than what the // std lib does! if byte < 0x80 { - dst[written] = u16::from(byte); + dst[written] = MaybeUninit::new(u16::from(byte)); read += 1; written += 1; continue 'tail; @@ -407,7 +417,8 @@ pub fn convert_utf8_to_utf16_up_to_invalid(src: &[u8], dst: &mut [u16]) -> (usiz if !in_inclusive_range8(second, 0x80, 0xBF) { break 'outer; } - dst[written] = ((u16::from(byte) & 0x1F) << 6) | (u16::from(second) & 0x3F); + dst[written] = + MaybeUninit::new(((u16::from(byte) & 0x1F) << 6) | (u16::from(second) & 0x3F)); read += 2; written += 1; continue 'tail; @@ -432,7 +443,7 @@ pub fn convert_utf8_to_utf16_up_to_invalid(src: &[u8], dst: &mut [u16]) -> (usiz let point = ((u16::from(byte) & 0xF) << 12) | ((u16::from(second) & 0x3F) << 6) | (u16::from(third) & 0x3F); - dst[written] = point; + dst[written] = MaybeUninit::new(point); read += 3; written += 1; // `'tail` handles sequences shorter than 4, so @@ -605,7 +616,10 @@ impl Utf8Decoder { #[allow(clippy::never_loop)] #[inline(never)] -pub fn convert_utf16_to_utf8_partial_inner(src: &[u16], dst: &mut [u8]) -> (usize, usize) { +pub fn convert_utf16_to_utf8_partial_inner( + src: &[u16], + dst: &mut [MaybeUninit], +) -> (usize, usize) { let mut read = 0; let mut written = 0; 'outer: loop { @@ -618,7 +632,11 @@ pub fn convert_utf16_to_utf8_partial_inner(src: &[u16], dst: &mut [u8]) -> (usiz src_remaining.len() }; match unsafe { - basic_latin_to_ascii(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length) + basic_latin_to_ascii( + src_remaining.as_ptr(), + dst_remaining.as_mut_ptr().strip_maybeuninit(), + length, + ) } { None => { read += length; @@ -644,9 +662,11 @@ pub fn 
convert_utf16_to_utf8_partial_inner(src: &[u16], dst: &mut [u8]) -> (usiz read += 1; if unit < 0x800 { unsafe { - *(dst.get_unchecked_mut(written)) = (unit >> 6) as u8 | 0xC0u8; + *dst.get_unchecked_mut(written) = + MaybeUninit::new((unit >> 6) as u8 | 0xC0u8); written += 1; - *(dst.get_unchecked_mut(written)) = (unit & 0x3F) as u8 | 0x80u8; + *dst.get_unchecked_mut(written) = + MaybeUninit::new((unit & 0x3F) as u8 | 0x80u8); written += 1; } break; @@ -654,11 +674,14 @@ pub fn convert_utf16_to_utf8_partial_inner(src: &[u16], dst: &mut [u8]) -> (usiz let unit_minus_surrogate_start = unit.wrapping_sub(0xD800); if likely(unit_minus_surrogate_start > (0xDFFF - 0xD800)) { unsafe { - *(dst.get_unchecked_mut(written)) = (unit >> 12) as u8 | 0xE0u8; + *dst.get_unchecked_mut(written) = + MaybeUninit::new((unit >> 12) as u8 | 0xE0u8); written += 1; - *(dst.get_unchecked_mut(written)) = ((unit & 0xFC0) >> 6) as u8 | 0x80u8; + *dst.get_unchecked_mut(written) = + MaybeUninit::new(((unit & 0xFC0) >> 6) as u8 | 0x80u8); written += 1; - *(dst.get_unchecked_mut(written)) = (unit & 0x3F) as u8 | 0x80u8; + *dst.get_unchecked_mut(written) = + MaybeUninit::new((unit & 0x3F) as u8 | 0x80u8); written += 1; } break; @@ -671,11 +694,11 @@ pub fn convert_utf16_to_utf8_partial_inner(src: &[u16], dst: &mut [u8]) -> (usiz debug_assert_eq!(read, src.len()); // Unpaired surrogate at the end of the buffer. 
unsafe { - *(dst.get_unchecked_mut(written)) = 0xEFu8; + *dst.get_unchecked_mut(written) = MaybeUninit::new(0xEFu8); written += 1; - *(dst.get_unchecked_mut(written)) = 0xBFu8; + *dst.get_unchecked_mut(written) = MaybeUninit::new(0xBFu8); written += 1; - *(dst.get_unchecked_mut(written)) = 0xBDu8; + *dst.get_unchecked_mut(written) = MaybeUninit::new(0xBDu8); written += 1; } return (read, written); @@ -688,15 +711,17 @@ pub fn convert_utf16_to_utf8_partial_inner(src: &[u16], dst: &mut [u8]) -> (usiz let astral = (u32::from(unit) << 10) + u32::from(second) - (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32); unsafe { - *(dst.get_unchecked_mut(written)) = (astral >> 18) as u8 | 0xF0u8; + *dst.get_unchecked_mut(written) = + MaybeUninit::new((astral >> 18) as u8 | 0xF0u8); written += 1; - *(dst.get_unchecked_mut(written)) = - ((astral & 0x3F000u32) >> 12) as u8 | 0x80u8; + *dst.get_unchecked_mut(written) = + MaybeUninit::new(((astral & 0x3F000u32) >> 12) as u8 | 0x80u8); written += 1; - *(dst.get_unchecked_mut(written)) = - ((astral & 0xFC0u32) >> 6) as u8 | 0x80u8; + *dst.get_unchecked_mut(written) = + MaybeUninit::new(((astral & 0xFC0u32) >> 6) as u8 | 0x80u8); written += 1; - *(dst.get_unchecked_mut(written)) = (astral & 0x3F) as u8 | 0x80u8; + *dst.get_unchecked_mut(written) = + MaybeUninit::new((astral & 0x3F) as u8 | 0x80u8); written += 1; } break; @@ -707,11 +732,11 @@ pub fn convert_utf16_to_utf8_partial_inner(src: &[u16], dst: &mut [u8]) -> (usiz } // Unpaired low surrogate unsafe { - *(dst.get_unchecked_mut(written)) = 0xEFu8; + *dst.get_unchecked_mut(written) = MaybeUninit::new(0xEFu8); written += 1; - *(dst.get_unchecked_mut(written)) = 0xBFu8; + *dst.get_unchecked_mut(written) = MaybeUninit::new(0xBFu8); written += 1; - *(dst.get_unchecked_mut(written)) = 0xBDu8; + *dst.get_unchecked_mut(written) = MaybeUninit::new(0xBDu8); written += 1; } break; @@ -731,7 +756,7 @@ pub fn convert_utf16_to_utf8_partial_inner(src: &[u16], dst: &mut [u8]) -> (usiz 
debug_assert_eq!(written, dst.len()); return (read, written); } - dst[written] = unit as u8; + dst[written] = MaybeUninit::new(unit as u8); read += 1; written += 1; // Mysteriously, adding a punctuation check here makes @@ -744,7 +769,10 @@ pub fn convert_utf16_to_utf8_partial_inner(src: &[u16], dst: &mut [u8]) -> (usiz } #[inline(never)] -pub fn convert_utf16_to_utf8_partial_tail(src: &[u16], dst: &mut [u8]) -> (usize, usize) { +pub fn convert_utf16_to_utf8_partial_tail( + src: &[u16], + dst: &mut [MaybeUninit], +) -> (usize, usize) { // Everything below is cold code! let mut read = 0; let mut written = 0; @@ -758,16 +786,16 @@ pub fn convert_utf16_to_utf8_partial_tail(src: &[u16], dst: &mut [u8]) -> (usize return (read, written); } read += 1; - dst[written] = unit as u8; + dst[written] = MaybeUninit::new(unit as u8); written += 1; } else if unit < 0x800 { if written + 2 > dst.len() { return (read, written); } read += 1; - dst[written] = (unit >> 6) as u8 | 0xC0u8; + dst[written] = MaybeUninit::new((unit >> 6) as u8 | 0xC0u8); written += 1; - dst[written] = (unit & 0x3F) as u8 | 0x80u8; + dst[written] = MaybeUninit::new((unit & 0x3F) as u8 | 0x80u8); written += 1; } else { return (read, written); @@ -810,11 +838,11 @@ pub fn convert_utf16_to_utf8_partial_tail(src: &[u16], dst: &mut [u8]) -> (usize unit = 0xFFFD; } } - dst[written] = (unit >> 12) as u8 | 0xE0u8; + dst[written] = MaybeUninit::new((unit >> 12) as u8 | 0xE0u8); written += 1; - dst[written] = ((unit & 0xFC0) >> 6) as u8 | 0x80u8; + dst[written] = MaybeUninit::new(((unit & 0xFC0) >> 6) as u8 | 0x80u8); written += 1; - dst[written] = (unit & 0x3F) as u8 | 0x80u8; + dst[written] = MaybeUninit::new((unit & 0x3F) as u8 | 0x80u8); written += 1; debug_assert_eq!(written, dst.len()); (read, written) @@ -844,10 +872,10 @@ impl Utf8Encoder { pub fn encode_from_utf16_raw( &mut self, src: &[u16], - dst: &mut [u8], + dst: &mut [MaybeUninit], _last: bool, ) -> (EncoderResult, usize, usize) { - let (read, written) = 
convert_utf16_to_utf8_partial(src, dst); + let (read, written) = convert_utf16_to_utf8_partial_maybeuninit(src, dst); ( if read == src.len() { EncoderResult::InputEmpty @@ -862,13 +890,13 @@ impl Utf8Encoder { pub fn encode_from_utf8_raw( &mut self, src: &str, - dst: &mut [u8], + dst: &mut [MaybeUninit], _last: bool, ) -> (EncoderResult, usize, usize) { let bytes = src.as_bytes(); let mut to_write = bytes.len(); if to_write <= dst.len() { - (&mut dst[..to_write]).copy_from_slice(bytes); + dst[..to_write].init_from_slice(bytes); return (EncoderResult::InputEmpty, to_write, to_write); } to_write = dst.len(); @@ -876,7 +904,7 @@ impl Utf8Encoder { while (bytes[to_write] & 0xC0) == 0x80 { to_write -= 1; } - (&mut dst[..to_write]).copy_from_slice(&bytes[..to_write]); + dst[..to_write].init_from_slice(&bytes[..to_write]); (EncoderResult::OutputFull, to_write, to_write) } } diff --git a/src/variant.rs b/src/variant.rs index dffaf05..87b42af 100644 --- a/src/variant.rs +++ b/src/variant.rs @@ -19,6 +19,8 @@ //! The purpose of making `Decoder` and `Encoder` `Sized` is to allow stack //! allocation in Rust code, including the convenience methods on `Encoding`. 
+use core::mem::MaybeUninit; + use super::*; use big5::*; use euc_jp::*; @@ -120,7 +122,7 @@ impl VariantDecoder { pub fn decode_to_utf16_raw( &mut self, src: &[u8], - dst: &mut [u16], + dst: &mut [MaybeUninit], last: bool, ) -> (DecoderResult, usize, usize) { match *self { @@ -141,7 +143,7 @@ impl VariantDecoder { pub fn decode_to_utf8_raw( &mut self, src: &[u8], - dst: &mut [u8], + dst: &mut [MaybeUninit], last: bool, ) -> (DecoderResult, usize, usize) { match *self { @@ -301,7 +303,7 @@ impl VariantEncoder { pub fn encode_from_utf16_raw( &mut self, src: &[u16], - dst: &mut [u8], + dst: &mut [MaybeUninit], last: bool, ) -> (EncoderResult, usize, usize) { match *self { @@ -320,7 +322,7 @@ impl VariantEncoder { pub fn encode_from_utf8_raw( &mut self, src: &str, - dst: &mut [u8], + dst: &mut [MaybeUninit], last: bool, ) -> (EncoderResult, usize, usize) { match *self { diff --git a/src/x_user_defined.rs b/src/x_user_defined.rs index cd87b9a..f4bae8a 100644 --- a/src/x_user_defined.rs +++ b/src/x_user_defined.rs @@ -76,7 +76,7 @@ impl UserDefinedDecoder { pub fn decode_to_utf16_raw( &mut self, src: &[u8], - dst: &mut [u16], + dst: &mut [MaybeUninit], _last: bool, ) -> (DecoderResult, usize, usize) { let (pending, length) = if dst.len() < src.len() { @@ -90,14 +90,14 @@ impl UserDefinedDecoder { .iter() .zip(dst_trim.iter_mut()) .for_each(|(from, to)| { - *to = { + *to = MaybeUninit::new({ let unit = *from; if unit < 0x80 { u16::from(unit) } else { u16::from(unit) + 0xF700 } - } + }); }); (pending, length, length) }