From 8c7b0d42eeb7cd17faf4c966bcddc0c3ec0e5c48 Mon Sep 17 00:00:00 2001 From: "Marty B." Date: Fri, 30 Jan 2026 09:36:03 +0100 Subject: [PATCH 1/2] Fix clippy warnings --- src/big5.rs | 64 ++++++++++++++--------------- src/euc_jp.rs | 7 +--- src/euc_kr.rs | 4 +- src/gb18030.rs | 9 ++--- src/handles.rs | 19 +++++---- src/iso_2022_jp.rs | 13 +++--- src/lib.rs | 9 +++-- src/mem.rs | 94 ++++++++++++++++--------------------------- src/shift_jis.rs | 2 +- src/simd_funcs.rs | 3 +- src/single_byte.rs | 34 +++++----------- src/utf_16.rs | 2 +- src/utf_8.rs | 4 +- src/x_user_defined.rs | 3 +- 14 files changed, 113 insertions(+), 154 deletions(-) diff --git a/src/big5.rs b/src/big5.rs index 3d161b5..2ae9d67 100644 --- a/src/big5.rs +++ b/src/big5.rs @@ -279,82 +279,82 @@ mod tests { #[test] fn test_big5_decode() { // Empty - decode_big5(b"", &""); + decode_big5(b"", ""); // ASCII - decode_big5(&[0x61u8, 0x62u8], &"\u{0061}\u{0062}"); + decode_big5(&[0x61u8, 0x62u8], "\u{0061}\u{0062}"); // Edge cases - decode_big5(&[0x87u8, 0x40u8], &"\u{43F0}"); - decode_big5(&[0xFEu8, 0xFEu8], &"\u{79D4}"); - decode_big5(&[0xFEu8, 0xFDu8], &"\u{2910D}"); - decode_big5(&[0x88u8, 0x62u8], &"\u{00CA}\u{0304}"); - decode_big5(&[0x88u8, 0x64u8], &"\u{00CA}\u{030C}"); - decode_big5(&[0x88u8, 0x66u8], &"\u{00CA}"); - decode_big5(&[0x88u8, 0xA3u8], &"\u{00EA}\u{0304}"); - decode_big5(&[0x88u8, 0xA5u8], &"\u{00EA}\u{030C}"); - decode_big5(&[0x88u8, 0xA7u8], &"\u{00EA}"); - decode_big5(&[0x99u8, 0xD4u8], &"\u{8991}"); - decode_big5(&[0x99u8, 0xD5u8], &"\u{27967}"); - decode_big5(&[0x99u8, 0xD6u8], &"\u{8A29}"); + decode_big5(&[0x87u8, 0x40u8], "\u{43F0}"); + decode_big5(&[0xFEu8, 0xFEu8], "\u{79D4}"); + decode_big5(&[0xFEu8, 0xFDu8], "\u{2910D}"); + decode_big5(&[0x88u8, 0x62u8], "\u{00CA}\u{0304}"); + decode_big5(&[0x88u8, 0x64u8], "\u{00CA}\u{030C}"); + decode_big5(&[0x88u8, 0x66u8], "\u{00CA}"); + decode_big5(&[0x88u8, 0xA3u8], "\u{00EA}\u{0304}"); + decode_big5(&[0x88u8, 0xA5u8], "\u{00EA}\u{030C}"); + decode_big5(&[0x88u8, 0xA7u8], "\u{00EA}"); + decode_big5(&[0x99u8, 0xD4u8], "\u{8991}"); + decode_big5(&[0x99u8, 0xD5u8], "\u{27967}"); + decode_big5(&[0x99u8, 0xD6u8], "\u{8A29}"); // Edge cases surrounded with ASCII decode_big5( &[0x61u8, 0x87u8, 0x40u8, 0x62u8], - &"\u{0061}\u{43F0}\u{0062}", + "\u{0061}\u{43F0}\u{0062}", ); decode_big5( &[0x61u8, 0xFEu8, 0xFEu8, 0x62u8], - &"\u{0061}\u{79D4}\u{0062}", + "\u{0061}\u{79D4}\u{0062}", ); decode_big5( &[0x61u8, 0xFEu8, 0xFDu8, 0x62u8], - &"\u{0061}\u{2910D}\u{0062}", + "\u{0061}\u{2910D}\u{0062}", ); decode_big5( &[0x61u8, 0x88u8, 0x62u8, 0x62u8], - &"\u{0061}\u{00CA}\u{0304}\u{0062}", + "\u{0061}\u{00CA}\u{0304}\u{0062}", ); decode_big5( &[0x61u8, 0x88u8, 0x64u8, 0x62u8], - &"\u{0061}\u{00CA}\u{030C}\u{0062}", + "\u{0061}\u{00CA}\u{030C}\u{0062}", ); decode_big5( &[0x61u8, 0x88u8, 0x66u8, 0x62u8], - &"\u{0061}\u{00CA}\u{0062}", + "\u{0061}\u{00CA}\u{0062}", ); decode_big5( &[0x61u8, 0x88u8, 0xA3u8, 0x62u8], - &"\u{0061}\u{00EA}\u{0304}\u{0062}", + "\u{0061}\u{00EA}\u{0304}\u{0062}", ); decode_big5( &[0x61u8, 0x88u8, 0xA5u8, 0x62u8], - &"\u{0061}\u{00EA}\u{030C}\u{0062}", + "\u{0061}\u{00EA}\u{030C}\u{0062}", ); decode_big5( &[0x61u8, 0x88u8, 0xA7u8, 0x62u8], - &"\u{0061}\u{00EA}\u{0062}", + "\u{0061}\u{00EA}\u{0062}", ); decode_big5( &[0x61u8, 0x99u8, 0xD4u8, 0x62u8], - &"\u{0061}\u{8991}\u{0062}", + "\u{0061}\u{8991}\u{0062}", ); decode_big5( &[0x61u8, 0x99u8, 0xD5u8, 0x62u8], - &"\u{0061}\u{27967}\u{0062}", + "\u{0061}\u{27967}\u{0062}", ); decode_big5( &[0x61u8, 0x99u8, 0xD6u8, 0x62u8], - &"\u{0061}\u{8A29}\u{0062}", + "\u{0061}\u{8A29}\u{0062}", ); // Bad sequences - decode_big5(&[0x80u8, 0x61u8], &"\u{FFFD}\u{0061}"); - decode_big5(&[0xFFu8, 0x61u8], &"\u{FFFD}\u{0061}"); - decode_big5(&[0xFEu8, 0x39u8], &"\u{FFFD}\u{0039}"); - decode_big5(&[0x87u8, 0x66u8], &"\u{FFFD}\u{0066}"); - decode_big5(&[0x81u8, 0x40u8], &"\u{FFFD}\u{0040}"); - decode_big5(&[0x61u8, 0x81u8], &"\u{0061}\u{FFFD}"); + decode_big5(&[0x80u8, 0x61u8], "\u{FFFD}\u{0061}"); + decode_big5(&[0xFFu8, 0x61u8], "\u{FFFD}\u{0061}"); + decode_big5(&[0xFEu8, 0x39u8], "\u{FFFD}\u{0039}"); + decode_big5(&[0x87u8, 0x66u8], "\u{FFFD}\u{0066}"); + decode_big5(&[0x81u8, 0x40u8], "\u{FFFD}\u{0040}"); + decode_big5(&[0x61u8, 0x81u8], "\u{0061}\u{FFFD}"); } #[test] diff --git a/src/euc_jp.rs b/src/euc_jp.rs index bf95a1f..6eb946a 100644 --- a/src/euc_jp.rs +++ b/src/euc_jp.rs @@ -24,10 +24,7 @@ enum EucJpPending { impl EucJpPending { fn is_none(&self) -> bool { - match *self { - EucJpPending::None => true, - _ => false, - } + matches!(*self, EucJpPending::None) } fn count(&self) -> usize { @@ -362,7 +359,7 @@ mod tests { #[test] fn test_euc_jp_decode() { // Empty - decode_euc_jp(b"", &""); + decode_euc_jp(b"", ""); // ASCII decode_euc_jp(b"\x61\x62", "\u{0061}\u{0062}"); diff --git a/src/euc_kr.rs b/src/euc_kr.rs index ab92b0f..23d3251 100644 --- a/src/euc_kr.rs +++ b/src/euc_kr.rs @@ -255,7 +255,7 @@ fn ksx1001_encode_hangul(bmp: u16, _: u16) -> (u8, u8) { } else { 0x41 }; - (lead as u8, (cp949_trail + offset) as u8) + (lead, cp949_trail + offset) } } } @@ -378,7 +378,7 @@ mod tests { #[test] fn test_euc_kr_decode() { // Empty - decode_euc_kr(b"", &""); + decode_euc_kr(b"", ""); // ASCII decode_euc_kr(b"\x61\x62", "\u{0061}\u{0062}"); diff --git a/src/gb18030.rs b/src/gb18030.rs index 5cfd97e..e9e1cba 100644 --- a/src/gb18030.rs +++ b/src/gb18030.rs @@ -25,10 +25,7 @@ enum Gb18030Pending { impl Gb18030Pending { fn is_none(&self) -> bool { - match *self { - Gb18030Pending::None => true, - _ => false, - } + matches!(*self, Gb18030Pending::None) } fn count(&self) -> usize { @@ -270,7 +267,7 @@ impl Gb18030Decoder { } else { handle.write_bmp_excl_ascii(gb18030_range_decode(pointer as u16)) } - } else if pointer >= 189_000 && pointer <= 1_237_575 { + } else if (189_000..=1_237_575).contains(&pointer) { // Astral handle.write_astral((pointer - (189_000usize - 0x1_0000usize)) as u32) } else { @@ -602,7 +599,7 @@ mod tests { #[test] fn test_gb18030_decode() { // Empty - decode_gb18030(b"", &""); + decode_gb18030(b"", ""); // ASCII decode_gb18030(b"\x61\x62", "\u{0061}\u{0062}"); diff --git a/src/handles.rs b/src/handles.rs index 94cecbd..ca3eaa0 100644 --- a/src/handles.rs +++ b/src/handles.rs @@ -387,7 +387,7 @@ pub struct ByteSource<'a> { impl<'a> ByteSource<'a> { #[inline(always)] - pub fn new(src: &[u8]) -> ByteSource { + pub fn new(src: &'a [u8]) -> ByteSource<'a> { ByteSource { slice: src, pos: 0 } } #[inline(always)] @@ -594,7 +594,7 @@ pub struct Utf16Destination<'a> { impl<'a> Utf16Destination<'a> { #[inline(always)] - pub fn new(dst: &mut [u16]) -> Utf16Destination { + pub fn new(dst: &'a mut [u16]) -> Utf16Destination<'a> { Utf16Destination { slice: dst, pos: 0 } } #[inline(always)] @@ -939,7 +939,7 @@ pub struct Utf8Destination<'a> { impl<'a> Utf8Destination<'a> { #[inline(always)] - pub fn new(dst: &mut [u8]) -> Utf8Destination { + pub fn new(dst: &mut [u8]) -> Utf8Destination<'_> { Utf8Destination { slice: dst, pos: 0 } } #[inline(always)] @@ -1116,7 +1116,7 @@ impl<'a> Utf8Destination<'a> { // Validate first, then memcpy to let memcpy do its thing even for // non-ASCII. (And potentially do something better than SSE2 for ASCII.) let valid_len = utf8_valid_up_to(&src_remaining[..min_len]); - (&mut dst_remaining[..valid_len]).copy_from_slice(&src_remaining[..valid_len]); + dst_remaining[..valid_len].copy_from_slice(&src_remaining[..valid_len]); source.pos += valid_len; self.pos += valid_len; } @@ -1164,7 +1164,7 @@ pub struct Utf16Source<'a> { impl<'a> Utf16Source<'a> { #[inline(always)] - pub fn new(src: &[u16]) -> Utf16Source { + pub fn new(src: &[u16]) -> Utf16Source<'_> { Utf16Source { slice: src, pos: 0 } } #[inline(always)] @@ -1272,6 +1272,7 @@ impl<'a> Utf16Source<'a> { Some((non_ascii, consumed)) => { self.pos += consumed; dest.advance(consumed); + #[allow(clippy::len_zero)] if dest.remaining().len() >= 1 { self.pos += 1; // commit to reading `non_ascii` let unit = non_ascii; @@ -1466,7 +1467,7 @@ pub struct Utf8Source<'a> { impl<'a> Utf8Source<'a> { #[inline(always)] - pub fn new(src: &str) -> Utf8Source { + pub fn new(src: &str) -> Utf8Source<'_> { Utf8Source { slice: src.as_bytes(), pos: 0, @@ -1614,6 +1615,7 @@ impl<'a> Utf8Source<'a> { Some((non_ascii, consumed)) => { self.pos += consumed; dest.advance(consumed); + #[allow(clippy::len_zero)] if dest.remaining().len() >= 1 { if non_ascii < 0xE0 { let point = ((u16::from(non_ascii) & 0x1F) << 6) @@ -1922,7 +1924,7 @@ pub struct ByteDestination<'a> { impl<'a> ByteDestination<'a> { #[inline(always)] - pub fn new(dst: &mut [u8]) -> ByteDestination { + pub fn new(dst: &mut [u8]) -> ByteDestination<'_> { ByteDestination { start: dst.as_ptr(), slice: dst, @@ -1930,10 +1932,11 @@ impl<'a> ByteDestination<'a> { } #[inline(always)] pub fn remaining(&mut self) -> &mut [u8] { - &mut self.slice + self.slice } #[inline(always)] pub fn check_space_one<'b>(&'b mut self) -> Space> { + #[allow(clippy::len_zero)] if self.slice.len() >= 1 { Space::Available(ByteOneHandle::new(self)) } else { diff --git a/src/iso_2022_jp.rs b/src/iso_2022_jp.rs index 39bb38c..dac1a32 100644 --- a/src/iso_2022_jp.rs +++ b/src/iso_2022_jp.rs @@ -190,7 +190,7 @@ impl Iso2022JpDecoder { continue; } self.output_flag = false; - if b >= 0x21u8 && b <= 0x5Fu8 { + if (0x21u8..=0x5Fu8).contains(&b) { destination_handle.write_upper_bmp(u16::from(b) - 0x21u16 + 0xFF61u16); continue; } @@ -206,7 +206,7 @@ impl Iso2022JpDecoder { continue; } self.output_flag = false; - if b >= 0x21u8 && b <= 0x7Eu8 { + if (0x21u8..=0x7Eu8).contains(&b) { self.lead = b; self.decoder_state = Iso2022JpDecoderState::TrailByte; continue; @@ -376,6 +376,7 @@ fn is_kanji_mapped(bmp: u16) -> bool { #[allow(clippy::redundant_pattern_matching, clippy::if_same_then_else)] #[inline(always)] fn is_kanji_mapped(bmp: u16) -> bool { + #[allow(clippy::match_like_matches_macro)] if 0x4EDD == bmp { true } else if let Some(_) = jis0208_level1_kanji_shift_jis_encode(bmp) { @@ -407,6 +408,7 @@ fn is_mapped_for_two_byte_encode(bmp: u16) -> bool { true } else { let bmp_minus_space = bmp.wrapping_sub(0x3000); + #[allow(clippy::match_like_matches_macro)] if bmp_minus_space < 3 { // fast-track common punctuation true @@ -477,10 +479,7 @@ impl Iso2022JpEncoder { } pub fn has_pending_state(&self) -> bool { - match self.state { - Iso2022JpEncoderState::Ascii => false, - _ => true, - } + !matches!(self.state, Iso2022JpEncoderState::Ascii) } pub fn max_buffer_length_from_utf16_without_replacement( @@ -764,7 +763,7 @@ mod tests { #[test] fn test_iso_2022_jp_decode() { // Empty - decode_iso_2022_jp(b"", &""); + decode_iso_2022_jp(b"", ""); // ASCII decode_iso_2022_jp(b"\x61\x62", "\u{0061}\u{0062}"); diff --git a/src/lib.rs b/src/lib.rs index 3239db5..78e0065 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2746,7 +2746,7 @@ impl Encoding { pub fn for_label(label: &[u8]) -> Option<&'static Encoding> { let mut trimmed = [0u8; LONGEST_LABEL_LENGTH]; let mut trimmed_pos = 0usize; - let mut iter = label.into_iter(); + let mut iter = label.iter(); // before loop { match iter.next() { @@ -3322,6 +3322,7 @@ impl Encoding { .unwrap() .next_power_of_two(), ); + #[allow(clippy::uninit_vec)] unsafe { vec.set_len(valid_up_to); core::ptr::copy_nonoverlapping(bytes.as_ptr(), vec.as_mut_ptr(), valid_up_to); @@ -3452,7 +3453,7 @@ impl Encoding { impl PartialEq for Encoding { #[inline] fn eq(&self, other: &Encoding) -> bool { - (self as *const Encoding) == (other as *const Encoding) + core::ptr::eq(self, other) } } @@ -3461,7 +3462,7 @@ impl Eq for Encoding {} #[cfg(test)] impl PartialOrd for Encoding { fn partial_cmp(&self, other: &Self) -> Option { - (self as *const Encoding as usize).partial_cmp(&(other as *const Encoding as usize)) + Some(self.cmp(other)) } } @@ -4353,7 +4354,7 @@ impl Decoder { pub fn latin1_byte_compatible_up_to(&self, bytes: &[u8]) -> Option { match self.life_cycle { DecoderLifeCycle::Converting => { - return self.variant.latin1_byte_compatible_up_to(bytes); + self.variant.latin1_byte_compatible_up_to(bytes) } DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."), _ => None, diff --git a/src/mem.rs b/src/mem.rs index 92941bb..c6a2d4a 100644 --- a/src/mem.rs +++ b/src/mem.rs @@ -229,7 +229,7 @@ macro_rules! by_unit_check_simd { let mut simd_accu = $splat; while offset <= len_minus_stride { // Safety: the above check lets us perform one $simd_ty read. - simd_accu = simd_accu | unsafe { *(src.add(offset) as *const $simd_ty) }; + simd_accu |= unsafe { *(src.add(offset) as *const $simd_ty) }; offset += SIMD_STRIDE_SIZE / unit_size; } if !$func(simd_accu) { @@ -566,7 +566,7 @@ cfg_if! { } } } - let mut iter = (&buffer[offset..]).iter(); + let mut iter = buffer[offset..].iter(); loop { if let Some(&u) = iter.next() { if u > 0xFF { @@ -623,7 +623,7 @@ cfg_if! { } } } - let mut iter = (&buffer[offset..]).iter(); + let mut iter = buffer[offset..].iter(); loop { if let Some(&u) = iter.next() { if u > 0xFF { @@ -2017,10 +2017,11 @@ pub fn decode_latin1<'a>(bytes: &'a [u8]) -> Cow<'a, str> { let (head, tail) = bytes.split_at(up_to); let capacity = head.len() + tail.len() * 2; let mut vec = Vec::with_capacity(capacity); + #[allow(clippy::uninit_vec)] unsafe { vec.set_len(capacity); } - (&mut vec[..up_to]).copy_from_slice(head); + vec[..up_to].copy_from_slice(head); let written = convert_latin1_to_utf8(tail, &mut vec[up_to..]); vec.truncate(up_to + written); Cow::Owned(unsafe { String::from_utf8_unchecked(vec) }) @@ -2054,10 +2055,11 @@ pub fn encode_latin1_lossy<'a>(string: &'a str) -> Cow<'a, [u8]> { let (head, tail) = bytes.split_at(up_to); let capacity = bytes.len(); let mut vec = Vec::with_capacity(capacity); + #[allow(clippy::uninit_vec)] unsafe { vec.set_len(capacity); } - (&mut vec[..up_to]).copy_from_slice(head); + vec[..up_to].copy_from_slice(head); let written = convert_utf8_to_latin1_lossy(tail, &mut vec[up_to..]); vec.truncate(up_to + written); Cow::Owned(vec) @@ -2079,7 +2081,7 @@ pub fn utf8_latin1_up_to(buffer: &[u8]) -> usize { /// Returns the index of first byte that starts a non-Latin1 byte /// sequence, or the length of the string if there are none. pub fn str_latin1_up_to(buffer: &str) -> usize { - is_str_latin1_impl(buffer).unwrap_or_else(|| buffer.len()) + is_str_latin1_impl(buffer).unwrap_or(buffer.len()) } /// Replaces unpaired surrogates in the input with the REPLACEMENT CHARACTER. @@ -2182,8 +2184,7 @@ mod tests { #[test] fn test_is_ascii_success() { - let mut src: Vec = Vec::with_capacity(128); - src.resize(128, 0); + let mut src: Vec = vec![0; 128]; for i in 0..src.len() { src[i] = i as u8; } @@ -2194,8 +2195,7 @@ mod tests { #[test] fn test_is_ascii_fail() { - let mut src: Vec = Vec::with_capacity(128); - src.resize(128, 0); + let mut src: Vec = vec![0; 128]; for i in 0..src.len() { src[i] = i as u8; } @@ -2210,8 +2210,7 @@ mod tests { #[test] fn test_is_basic_latin_success() { - let mut src: Vec = Vec::with_capacity(128); - src.resize(128, 0); + let mut src: Vec = vec![0; 128]; for i in 0..src.len() { src[i] = i as u16; } @@ -2222,8 +2221,7 @@ mod tests { #[test] fn test_is_basic_latin_fail() { - let mut src: Vec = Vec::with_capacity(128); - src.resize(128, 0); + let mut src: Vec = vec![0; 128]; for i in 0..src.len() { src[i] = i as u16; } @@ -2238,8 +2236,7 @@ mod tests { #[test] fn test_is_utf16_latin1_success() { - let mut src: Vec = Vec::with_capacity(256); - src.resize(256, 0); + let mut src: Vec = vec![0; 256]; for i in 0..src.len() { src[i] = i as u16; } @@ -2255,8 +2252,7 @@ mod tests { #[test] fn test_is_utf16_latin1_fail() { let len = if cfg!(miri) { 64 } else { 256 }; // Miri is too slow - let mut src: Vec = Vec::with_capacity(len); - src.resize(len, 0); + let mut src: Vec = vec![0; len]; for i in 0..src.len() { src[i] = i as u16; } @@ -2273,8 +2269,7 @@ mod tests { #[test] fn test_is_str_latin1_success() { let len = if cfg!(miri) { 64 } else { 256 }; // Miri is too slow - let mut src: Vec = Vec::with_capacity(len); - src.resize(len, 0); + let mut src: Vec = vec![0; len]; for i in 0..src.len() { src[i] = i as u16; } @@ -2288,8 +2283,7 @@ mod tests { #[test] fn test_is_str_latin1_fail() { let len = if cfg!(miri) { 32 } else { 256 }; // Miri is too slow - let mut src: Vec = Vec::with_capacity(len); - src.resize(len, 0); + let mut src: Vec = vec![0; len]; for i in 0..src.len() { src[i] = i as u16; } @@ -2307,8 +2301,7 @@ mod tests { #[test] fn test_is_utf8_latin1_success() { let len = if cfg!(miri) { 64 } else { 256 }; // Miri is too slow - let mut src: Vec = Vec::with_capacity(len); - src.resize(len, 0); + let mut src: Vec = vec![0; len]; for i in 0..src.len() { src[i] = i as u16; } @@ -2325,8 +2318,7 @@ mod tests { #[test] fn test_is_utf8_latin1_fail() { let len = if cfg!(miri) { 32 } else { 256 }; // Miri is too slow - let mut src: Vec = Vec::with_capacity(len); - src.resize(len, 0); + let mut src: Vec = vec![0; len]; for i in 0..src.len() { src[i] = i as u16; } @@ -2357,8 +2349,7 @@ mod tests { #[test] fn test_convert_utf8_to_utf16() { let src = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz"; - let mut dst: Vec = Vec::with_capacity(src.len() + 1); - dst.resize(src.len() + 1, 0); + let mut dst: Vec = vec![0; src.len() + 1]; let len = convert_utf8_to_utf16(src.as_bytes(), &mut dst[..]); dst.truncate(len); let reference: Vec = src.encode_utf16().collect(); @@ -2368,8 +2359,7 @@ mod tests { #[test] fn test_convert_str_to_utf16() { let src = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz"; - let mut dst: Vec = Vec::with_capacity(src.len()); - dst.resize(src.len(), 0); + let mut dst: Vec = vec![0; src.len()]; let len = convert_str_to_utf16(src, &mut dst[..]); dst.truncate(len); let reference: Vec = src.encode_utf16().collect(); @@ -2380,8 +2370,7 @@ mod tests { fn test_convert_utf16_to_utf8_partial() { let reference = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz"; let src: Vec = reference.encode_utf16().collect(); - let mut dst: Vec = Vec::with_capacity(src.len() * 3 + 1); - dst.resize(src.len() * 3 + 1, 0); + let mut dst: Vec = vec![0; src.len() * 3 + 1]; let (read, written) = convert_utf16_to_utf8_partial(&src[..], &mut dst[..24]); let len = written + convert_utf16_to_utf8(&src[read..], &mut dst[written..]); dst.truncate(len); @@ -2392,8 +2381,7 @@ mod tests { fn test_convert_utf16_to_utf8() { let reference = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz"; let src: Vec = reference.encode_utf16().collect(); - let mut dst: Vec = Vec::with_capacity(src.len() * 3 + 1); - dst.resize(src.len() * 3 + 1, 0); + let mut dst: Vec = vec![0; src.len() * 3 + 1]; let len = convert_utf16_to_utf8(&src[..], &mut dst[..]); dst.truncate(len); assert_eq!(dst, reference.as_bytes()); @@ -2401,16 +2389,13 @@ mod tests { #[test] fn test_convert_latin1_to_utf16() { - let mut src: Vec = Vec::with_capacity(256); - src.resize(256, 0); - let mut reference: Vec = Vec::with_capacity(256); - reference.resize(256, 0); + let mut src: Vec = vec![0; 256]; + let mut reference: Vec = vec![0; 256]; for i in 0..256 { src[i] = i as u8; reference[i] = i as u16; } - let mut dst: Vec = Vec::with_capacity(src.len()); - dst.resize(src.len(), 0); + let mut dst: Vec = vec![0; src.len()]; convert_latin1_to_utf16(&src[..], &mut dst[..]); assert_eq!(dst, reference); } @@ -2425,17 +2410,14 @@ mod tests { #[test] fn test_convert_latin1_to_utf8() { - let mut src: Vec = Vec::with_capacity(256); - src.resize(256, 0); - let mut reference: Vec = Vec::with_capacity(256); - reference.resize(256, 0); + let mut src: Vec = vec![0; 256]; + let mut reference: Vec = vec![0; 256]; for i in 0..256 { src[i] = i as u8; reference[i] = i as u16; } let s = String::from_utf16(&reference[..]).unwrap(); - let mut dst: Vec = Vec::with_capacity(src.len() * 2); - dst.resize(src.len() * 2, 0); + let mut dst: Vec = vec![0; src.len() * 2]; let len = convert_latin1_to_utf8(&src[..], &mut dst[..]); dst.truncate(len); assert_eq!(&dst[..], s.as_bytes()); @@ -2443,17 +2425,14 @@ mod tests { #[test] fn test_convert_utf8_to_latin1_lossy() { - let mut reference: Vec = Vec::with_capacity(256); - reference.resize(256, 0); - let mut src16: Vec = Vec::with_capacity(256); - src16.resize(256, 0); + let mut reference: Vec = vec![0; 256]; + let mut src16: Vec = vec![0; 256]; for i in 0..256 { src16[i] = i as u16; reference[i] = i as u8; } let src = String::from_utf16(&src16[..]).unwrap(); - let mut dst: Vec = Vec::with_capacity(src.len()); - dst.resize(src.len(), 0); + let mut dst: Vec = vec![0; src.len()]; let len = convert_utf8_to_latin1_lossy(src.as_bytes(), &mut dst[..]); dst.truncate(len); assert_eq!(dst, reference); @@ -2469,16 +2448,13 @@ mod tests { #[test] fn test_convert_utf16_to_latin1_lossy() { - let mut src: Vec = Vec::with_capacity(256); - src.resize(256, 0); - let mut reference: Vec = Vec::with_capacity(256); - reference.resize(256, 0); + let mut src: Vec = vec![0; 256]; + let mut reference: Vec = vec![0; 256]; for i in 0..256 { src[i] = i as u16; reference[i] = i as u8; } - let mut dst: Vec = Vec::with_capacity(src.len()); - dst.resize(src.len(), 0); + let mut dst: Vec = vec![0; src.len()]; convert_utf16_to_latin1_lossy(&src[..], &mut dst[..]); assert_eq!(dst, reference); } @@ -2487,7 +2463,7 @@ mod tests { // #[should_panic] fn test_convert_utf16_to_latin1_lossy_panics() { let mut dst = [0u8; 16]; - let _ = convert_utf16_to_latin1_lossy(&[0x0100u16], &mut dst[..]); + convert_utf16_to_latin1_lossy(&[0x0100u16], &mut dst[..]); } #[test] diff --git a/src/shift_jis.rs b/src/shift_jis.rs index b201ae4..cb481a8 100644 --- a/src/shift_jis.rs +++ b/src/shift_jis.rs @@ -321,7 +321,7 @@ mod tests { #[test] fn test_shift_jis_decode() { // Empty - decode_shift_jis(b"", &""); + decode_shift_jis(b"", ""); // ASCII decode_shift_jis(b"\x61\x62", "\u{0061}\u{0062}"); diff --git a/src/simd_funcs.rs b/src/simd_funcs.rs index d082418..10fdb77 100644 --- a/src/simd_funcs.rs +++ b/src/simd_funcs.rs @@ -388,8 +388,7 @@ mod tests { ]; let first = unsafe { load8_unaligned(basic_latin.as_ptr()) }; let second = unsafe { load8_unaligned(basic_latin.as_ptr().add(8)) }; - let mut vec = Vec::with_capacity(16); - vec.resize(16, 0u8); + let mut vec = vec![0; 16]; let ptr = vec.as_mut_ptr(); assert!(simd_is_basic_latin(first | second)); unsafe { diff --git a/src/single_byte.rs b/src/single_byte.rs index 49f099e..303f795 100644 --- a/src/single_byte.rs +++ b/src/single_byte.rs @@ -646,18 +646,11 @@ mod tests { fn decode_single_byte(encoding: &'static Encoding, data: &'static [u16; 128]) { let mut with_replacement = [0u16; 128]; let mut it = data.iter().enumerate(); - loop { - match it.next() { - Some((i, code_point)) => { - if *code_point == 0 { - with_replacement[i] = 0xFFFD; - } else { - with_replacement[i] = *code_point; - } - } - None => { - break; - } + while let Some((i, code_point)) = it.next() { + if *code_point == 0 { + with_replacement[i] = 0xFFFD; + } else { + with_replacement[i] = *code_point; } } @@ -667,18 +660,11 @@ mod tests { fn encode_single_byte(encoding: &'static Encoding, data: &'static [u16; 128]) { let mut with_zeros = [0u8; 128]; let mut it = data.iter().enumerate(); - loop { - match it.next() { - Some((i, code_point)) => { - if *code_point == 0 { - with_zeros[i] = 0; - } else { - with_zeros[i] = HIGH_BYTES[i]; - } - } - None => { - break; - } + while let Some((i, code_point)) = it.next() { + if *code_point == 0 { + with_zeros[i] = 0; + } else { + with_zeros[i] = HIGH_BYTES[i]; } } diff --git a/src/utf_16.rs b/src/utf_16.rs index f24806d..de402cf 100644 --- a/src/utf_16.rs +++ b/src/utf_16.rs @@ -145,7 +145,7 @@ impl Utf16Decoder { // The previous high surrogate was in // error and this one becomes the new // pending one. - self.lead_surrogate = code_unit as u16; + self.lead_surrogate = code_unit; return ( DecoderResult::Malformed(2, 2), unread_handle.consumed(), diff --git a/src/utf_8.rs b/src/utf_8.rs index 8d836c2..93cfb0f 100644 --- a/src/utf_8.rs +++ b/src/utf_8.rs @@ -868,7 +868,7 @@ impl Utf8Encoder { let bytes = src.as_bytes(); let mut to_write = bytes.len(); if to_write <= dst.len() { - (&mut dst[..to_write]).copy_from_slice(bytes); + dst[..to_write].copy_from_slice(bytes); return (EncoderResult::InputEmpty, to_write, to_write); } to_write = dst.len(); @@ -876,7 +876,7 @@ impl Utf8Encoder { while (bytes[to_write] & 0xC0) == 0x80 { to_write -= 1; } - (&mut dst[..to_write]).copy_from_slice(&bytes[..to_write]); + dst[..to_write].copy_from_slice(&bytes[..to_write]); (EncoderResult::OutputFull, to_write, to_write) } } diff --git a/src/x_user_defined.rs b/src/x_user_defined.rs index cd87b9a..bd2ffa8 100644 --- a/src/x_user_defined.rs +++ b/src/x_user_defined.rs @@ -16,6 +16,7 @@ cfg_if! { use simd_funcs::*; use core::simd::u16x8; use core::simd::cmp::SimdPartialOrd; + use core::simd::Select; #[inline(always)] fn shift_upper(unpacked: u16x8) -> u16x8 { @@ -180,7 +181,7 @@ impl UserDefinedEncoder { destination_handle.write_one(c as u8); continue; } - if c < '\u{F780}' || c > '\u{F7FF}' { + if !('\u{F780}'..='\u{F7FF}').contains(&c) { return ( EncoderResult::Unmappable(c), unread_handle.consumed(), From 67812b96995925149503cb2fb9d9d2ab06d9b14a Mon Sep 17 00:00:00 2001 From: "Marty B." Date: Fri, 30 Jan 2026 09:36:15 +0100 Subject: [PATCH 2/2] Further fixes --- .vscode/settings.json | 20 +++++ Cargo.toml | 8 +- src/ascii.rs | 202 +++++++++++++++++++++++------------------- src/variant.rs | 10 +-- 4 files changed, 141 insertions(+), 99 deletions(-) create mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..fe2db73 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,20 @@ +{ + "git.ignoreLimitWarning": true, + "files.eol": "\n", + "editor.formatOnSave": true, + "files.exclude": { + "tmp/**": true + }, + "[python]": { + "editor.defaultFormatter": "ms-python.black-formatter" + }, + "[markdown]": { + "editor.formatOnSave": true, + "editor.formatOnPaste": true + }, + "markdownlint.config": { + "MD013": false, + "MD024": false + }, + "cSpell.diagnosticLevel": "Hint", +} diff --git a/Cargo.toml b/Cargo.toml index f862dce..995b4d7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,8 +1,8 @@ [package] name = "encoding_rs" description = "A Gecko-oriented implementation of the Encoding Standard" -version = "0.8.35" -edition = '2018' +version = "0.8.36" +edition = '2024' authors = ["Henri Sivonen "] license = "(Apache-2.0 OR MIT) AND BSD-3-Clause" include = ["src/**/*.rs", "/data", "Cargo.toml", "COPYRIGHT", "LICENSE*", "README.md"] @@ -12,7 +12,7 @@ homepage = "https://docs.rs/encoding_rs/" repository = "https://github.com/hsivonen/encoding_rs" keywords = ["encoding", "web", "unicode", "charset"] categories = ["text-processing", "encoding", "web-programming", "internationalization"] -rust-version = "1.40" +rust-version = "1.86" [features] default = ["alloc"] @@ -39,7 +39,7 @@ any_all_workaround = { version = "0.1.0" , optional = true } [dev-dependencies] serde_derive = "1.0" -bincode = "1.0" +bincode = "2.0" serde_json = "1.0" [profile.release] diff --git a/src/ascii.rs b/src/ascii.rs index 2f54360..a257461 100644 --- a/src/ascii.rs +++ b/src/ascii.rs @@ -153,31 +153,33 @@ macro_rules! ascii_alu { // // Safety: This is the naïve code once again, for `until_alignment` bytes while until_alignment != 0 { - let code_unit = *(src.add(offset)); + let code_unit = unsafe { *(src.add(offset)) }; if code_unit > 127 { // Safety: Upholds safety-usable invariant here return Some((code_unit, offset)); } - *(dst.add(offset)) = code_unit as $dst_unit; + unsafe { *(dst.add(offset)) = code_unit as $dst_unit }; // Safety: offset is the number of bytes copied so far offset += 1; until_alignment -= 1; } let len_minus_stride = len - ALU_STRIDE_SIZE; loop { - // Safety: num_ascii is known to be a byte index of a non-ascii byte due to stride_fn's invariant - if let Some(num_ascii) = $stride_fn( - // Safety: These are known to be valid and aligned since we have at - // least ALU_STRIDE_SIZE data in these buffers, and offset is the - // number of elements copied so far, which according to the - // until_alignment calculation above will cause both src and dst to be - // aligned to usize after this add - src.add(offset) as *const usize, - dst.add(offset) as *mut usize, - ) { - offset += num_ascii; - // Safety: Upholds safety-usable invariant here by indexing into non-ascii byte - return Some((*(src.add(offset)), offset)); + unsafe { + // Safety: num_ascii is known to be a byte index of a non-ascii byte due to stride_fn's invariant + if let Some(num_ascii) = $stride_fn( + // Safety: These are known to be valid and aligned since we have at + // least ALU_STRIDE_SIZE data in these buffers, and offset is the + // number of elements copied so far, which according to the + // until_alignment calculation above will cause both src and dst to be + // aligned to usize after this add + src.add(offset) as *const usize, + dst.add(offset) as *mut usize, + ) { + offset += num_ascii; + // Safety: Upholds safety-usable invariant here by indexing into non-ascii byte + return Some((*(src.add(offset)), offset)); + } } // Safety: offset continues to be the number of bytes copied so far, and // maintains usize alignment for the next loop iteration @@ -196,13 +198,13 @@ macro_rules! ascii_alu { // other than src/dst being valid for the the right lens while offset < len { // Safety: len invariant used here - let code_unit = *(src.add(offset)); + let code_unit = unsafe { *(src.add(offset)) }; if code_unit > 127 { // Safety: Upholds safety-usable invariant here return Some((code_unit, offset)); } // Safety: len invariant used here - *(dst.add(offset)) = code_unit as $dst_unit; + unsafe { *(dst.add(offset)) = code_unit as $dst_unit }; offset += 1; } None @@ -283,28 +285,30 @@ macro_rules! basic_latin_alu { // // Safety: This is the naïve code once again, for `until_alignment` bytes while until_alignment != 0 { - let code_unit = *(src.add(offset)); + let code_unit = unsafe { *(src.add(offset)) }; if code_unit > 127 { // Safety: Upholds safety-usable invariant here return Some((code_unit, offset)); } - *(dst.add(offset)) = code_unit as $dst_unit; + unsafe { *(dst.add(offset)) = code_unit as $dst_unit }; // Safety: offset is the number of bytes copied so far offset += 1; until_alignment -= 1; } let len_minus_stride = len - ALU_STRIDE_SIZE; loop { - if !$stride_fn( - // Safety: These are known to be valid and aligned since we have at - // least ALU_STRIDE_SIZE data in these buffers, and offset is the - // number of elements copied so far, which according to the - // until_alignment calculation above will cause both src and dst to be - // aligned to usize after this add - src.add(offset) as *const usize, - dst.add(offset) as *mut usize, - ) { - break; + unsafe { + if !$stride_fn( + // Safety: These are known to be valid and aligned since we have at + // least ALU_STRIDE_SIZE data in these buffers, and offset is the + // number of elements copied so far, which according to the + // until_alignment calculation above will cause both src and dst to be + // aligned to usize after this add + src.add(offset) as *const usize, + dst.add(offset) as *mut usize, + ) { + break; + } } // Safety: offset continues to be the number of bytes copied so far, and // maintains usize alignment for the next loop iteration @@ -321,13 +325,13 @@ macro_rules! basic_latin_alu { // Safety: This is the naïve code once again, for leftover bytes while offset < len { // Safety: len invariant used here - let code_unit = *(src.add(offset)); + let code_unit = unsafe { *(src.add(offset)) }; if code_unit > 127 { // Safety: Upholds safety-usable invariant here return Some((code_unit, offset)); } // Safety: len invariant used here - *(dst.add(offset)) = code_unit as $dst_unit; + unsafe { *(dst.add(offset)) = code_unit as $dst_unit }; offset += 1; } None @@ -378,23 +382,25 @@ macro_rules! latin1_alu { if until_alignment + ALU_STRIDE_SIZE <= len { // Safety: This is the naïve code once again, for `until_alignment` bytes while until_alignment != 0 { - let code_unit = *(src.add(offset)); - *(dst.add(offset)) = code_unit as $dst_unit; + let code_unit = unsafe { *(src.add(offset)) }; + unsafe { *(dst.add(offset)) = code_unit as $dst_unit }; // Safety: offset is the number of bytes copied so far offset += 1; until_alignment -= 1; } let len_minus_stride = len - ALU_STRIDE_SIZE; loop { - $stride_fn( - // Safety: These are known to be valid and aligned since we have at - // least ALU_STRIDE_SIZE data in these buffers, and offset is the - // number of elements copied so far, which according to the - // until_alignment calculation above will cause both src and dst to be - // aligned to usize after this add - src.add(offset) as *const usize, - dst.add(offset) as *mut usize, - ); + unsafe { + $stride_fn( + // Safety: These are known to be valid and aligned since we have at + // least ALU_STRIDE_SIZE data in these buffers, and offset is the + // number of elements copied so far, which according to the + // until_alignment calculation above will cause both src and dst to be + // aligned to usize after this add + src.add(offset) as *const usize, + dst.add(offset) as *mut usize, + ); + } // Safety: offset continues to be the number of bytes copied so far, and // maintains usize alignment for the next loop iteration offset += ALU_STRIDE_SIZE; @@ -410,8 +416,8 @@ macro_rules! latin1_alu { // Safety: This is the naïve code once again, for leftover bytes while offset < len { // Safety: len invariant used here - let code_unit = *(src.add(offset)); - *(dst.add(offset)) = code_unit as $dst_unit; + let code_unit = unsafe { *(src.add(offset)) }; + unsafe { *(dst.add(offset)) = code_unit as $dst_unit }; offset += 1; } } @@ -1278,10 +1284,12 @@ cfg_if! { ((0x0000_FF00_0000_0000usize & second_word) >> 24) | ((0x0000_00FF_0000_0000usize & second_word) >> 32); // Safety: fn invariant used here - *dst = first; - *(dst.add(1)) = second; - *(dst.add(2)) = third; - *(dst.add(3)) = fourth; + unsafe { + *dst = first; + *(dst.add(1)) = second; + *(dst.add(2)) = third; + *(dst.add(3)) = fourth; + } } /// Safety: dst must point to valid space for writing two `usize`s @@ -1304,8 +1312,10 @@ cfg_if! { ((0x0000_0000_00FF_0000usize & third) >> 8) | (0x0000_0000_0000_00FFusize & third); // Safety: fn invariant used here - *dst = word; - *(dst.add(1)) = second_word; + unsafe { + *dst = word; + *(dst.add(1)) = second_word; + } } } else if #[cfg(all(target_endian = "little", target_pointer_width = "32"))] { // Aligned ALU word, little-endian, 32-bit @@ -1712,9 +1722,11 @@ cfg_if! { /// Safety-usable invariant: will return byte index of first non-ascii byte #[inline(always)] unsafe fn validate_ascii_stride(src: *const usize) -> Option { - let word = *src; - let second_word = *(src.add(1)); - find_non_ascii(word, second_word) + unsafe { + let word = *src; + let second_word = *(src.add(1)); + find_non_ascii(word, second_word) + } } /// Safety-usable invariant: will return Some() when it encounters non-ASCII, with the first element in the Some being @@ -1785,69 +1797,79 @@ cfg_if! { } else { // Safety: src points to two valid `usize`s, dst points to four valid `usize`s #[inline(always)] - unsafe fn unpack_latin1_stride_alu(src: *const usize, dst: *mut usize) { + fn unpack_latin1_stride_alu(src: *const usize, dst: *mut usize) { // Safety: src safety invariant used here - let word = *src; - let second_word = *(src.add(1)); - // Safety: dst safety invariant passed down - unpack_alu(word, second_word, dst); + unsafe { + let word = *src; + let second_word = *(src.add(1)); + // Safety: dst safety invariant passed down + unpack_alu(word, second_word, dst); + } } // Safety: src points to four valid `usize`s, dst points to two valid `usize`s #[inline(always)] - unsafe fn pack_latin1_stride_alu(src: *const usize, dst: *mut usize) { + fn pack_latin1_stride_alu(src: *const usize, dst: *mut usize) { // Safety: src safety invariant used here - let first = *src; - let second = *(src.add(1)); - let third = *(src.add(2)); - let fourth = *(src.add(3)); - // Safety: dst safety invariant passed down - pack_alu(first, second, third, fourth, dst); + unsafe { + let first = *src; + let second = *(src.add(1)); + let third = *(src.add(2)); + let fourth = *(src.add(3)); + // Safety: dst safety invariant passed down + pack_alu(first, second, third, fourth, dst); + } } // Safety: src points to two valid `usize`s, dst points to four valid `usize`s #[inline(always)] - unsafe fn ascii_to_basic_latin_stride_alu(src: *const usize, dst: *mut usize) -> bool { + fn ascii_to_basic_latin_stride_alu(src: *const usize, dst: *mut usize) -> bool { // Safety: src safety invariant used here - let word = *src; - let second_word = *(src.add(1)); - // Check if the words contains non-ASCII - if (word & ASCII_MASK) | (second_word & ASCII_MASK) != 0 { - return false; + unsafe { + let word = *src; + let second_word = *(src.add(1)); + // Check if the words contains non-ASCII + if (word & ASCII_MASK) | (second_word & ASCII_MASK) != 0 { + return false; + } + // Safety: dst safety invariant passed down + unpack_alu(word, second_word, dst); } - // Safety: dst safety invariant passed down - unpack_alu(word, second_word, dst); true } // Safety: src points four valid `usize`s, dst points to two valid `usize`s #[inline(always)] - unsafe fn basic_latin_to_ascii_stride_alu(src: *const usize, dst: *mut usize) -> bool { + fn basic_latin_to_ascii_stride_alu(src: *const usize, dst: *mut usize) -> bool { // Safety: src safety invariant used here - let first = *src; - let second = *(src.add(1)); - let third = *(src.add(2)); - let fourth = *(src.add(3)); - if (first & BASIC_LATIN_MASK) | (second & BASIC_LATIN_MASK) | (third & BASIC_LATIN_MASK) | (fourth & BASIC_LATIN_MASK) != 0 { - return false; + unsafe { + let first = *src; + let second = *(src.add(1)); + let third = *(src.add(2)); + let fourth = *(src.add(3)); + if (first & BASIC_LATIN_MASK) | (second & BASIC_LATIN_MASK) | (third & BASIC_LATIN_MASK) | (fourth & BASIC_LATIN_MASK) != 0 { + return false; + } + // Safety: dst safety invariant passed down + pack_alu(first, second, third, fourth, dst); } - // Safety: dst safety invariant passed down - pack_alu(first, second, third, fourth, dst); true } // Safety: src, dst both point to two valid `usize`s each // Safety-usable invariant: Will return byte index of first non-ascii byte. #[inline(always)] - unsafe fn ascii_to_ascii_stride(src: *const usize, dst: *mut usize) -> Option { + fn ascii_to_ascii_stride(src: *const usize, dst: *mut usize) -> Option { // Safety: src safety invariant used here - let word = *src; - let second_word = *(src.add(1)); - // Safety: src safety invariant used here - *dst = word; - *(dst.add(1)) = second_word; - // Relies on safety-usable invariant here - find_non_ascii(word, second_word) + unsafe { + let word = *src; + let second_word = *(src.add(1)); + // Safety: src safety invariant used here + *dst = word; + *(dst.add(1)) = second_word; + // Relies on safety-usable invariant here + find_non_ascii(word, second_word) + } } basic_latin_alu!(ascii_to_basic_latin, u8, u16, ascii_to_basic_latin_stride_alu); diff --git a/src/variant.rs b/src/variant.rs index dffaf05..bdcbc9e 100644 --- a/src/variant.rs +++ b/src/variant.rs @@ -28,8 +28,8 @@ use iso_2022_jp::*; use replacement::*; use shift_jis::*; use single_byte::*; -use utf_16::*; use utf_8::*; +use utf_16::*; use x_user_defined::*; pub enum VariantDecoder { @@ -392,9 +392,9 @@ impl VariantEncoding { } pub fn is_single_byte(&self) -> bool { - match *self { - VariantEncoding::SingleByte(_, _, _, _) | VariantEncoding::UserDefined => true, - _ => false, - } + matches!( + *self, + VariantEncoding::SingleByte(_, _, _, _) | VariantEncoding::UserDefined + ) } }