G4Vi · G4Vi · Sep 5, 2025 · Sep 5, 2025 · Sep 5, 2025
diff --git a/App-MHFS/lib/MHFS/Util.pm b/App-MHFS/lib/MHFS/Util.pm
@@ -343,27 +343,35 @@ sub surrogatecodepointpairtochar {
 }
 
 # returns the byte length and the codepoint
-sub peek_utf8_codepoint {
+sub _peek_utf8_codepoint {
     my ($octets) = @_;
     my @rules = (
+        [0x80, 0x00, 1], # 1 byte sequence
         [0xE0, 0xC0, 2], # 2 byte sequence
         [0xF0, 0xE0, 3], # 3 byte sequence
         [0XF8, 0xF0, 4]  # 4 byte sequence
     );
-
-    length($$octets) >= 1 or return;
-    my $byte = substr($$octets, 0, 1);
-    my $byteval = ord($byte);
-    my $charlen = 1;
+    my $byteval = ord(substr($octets, 0, 1));
+    my $charlen;
     foreach my $rule (@rules) {
         if(($byteval & $rule->[0]) == $rule->[1]) {
             $charlen = $rule->[2];
             last;
         }
     }
-    length($octets) >= $charlen or return;
-    my $char = decode("utf8", substr($$octets, 0, $charlen));
+    $charlen or return {'codepoint' => 0xFFFD, 'bytelength' => 1};
+    my $valid_bytes = 1;
+    for my $i (1 .. $charlen - 1) {
+        # a byte missing past the end reads as ord('') == 0, so truncated input (length($octets) < $charlen) fails the check below
+        my $cont_byte = ord(substr($octets, $i, 1));
+        if (($cont_byte & 0xC0) != 0x80) {
+            return {'codepoint' => 0xFFFD, 'bytelength' => $valid_bytes};
+        }
+        $valid_bytes++;
+    }
+    my $char = decode("utf8", substr($octets, 0, $charlen));
     if(length($char) > 1) {
+        warnings::warnif "impossible situation, decode returned more than one char";
         return {'codepoint' => 0xFFFD, 'bytelength' => 1};
     }
     return { 'codepoint' => ord($char), 'bytelength' => $charlen};
@@ -377,15 +385,16 @@ sub get_printable_utf8 {
         last if(!length($octets));
 
         # by default replace with the replacement char
-        my $chardata = peek_utf8_codepoint(\$octets);
+        my $char = _peek_utf8_codepoint($octets);
         my $toappend = chr(0xFFFD);
-        my $toremove = $chardata->{'bytelength'};
+        my $toremove = $char->{bytelength};
 
         # if we find a surrogate pair, make the actual codepoint
-        if(($chardata->{'bytelength'} == 3) && ($chardata->{'codepoint'}  >= 0xD800) && ($chardata->{'codepoint'} <= 0xDBFF)) {
-            my $secondchar = peek_utf8_codepoint(\substr($octets, 3, 3));
-            if($secondchar && ($secondchar->{'bytelength'} == 3) && ($secondchar->{'codepoint'}  >= 0xDC00) && ($secondchar->{'codepoint'} <= 0xDFFF)) {
-                $toappend = surrogatecodepointpairtochar($chardata->{'codepoint'}, $secondchar->{'codepoint'});
+        my $mask = ~0 << 16 | 0xFC00;
+        if (length($octets) >= 6 && ($char->{bytelength} == 3) && (($char->{codepoint} & $mask) == 0xD800)) {
+            my $secondchar = _peek_utf8_codepoint(substr($octets, 3, 3));
+            if(($secondchar->{bytelength} == 3) && (($secondchar->{codepoint} & $mask) == 0xDC00)) {
+                $toappend = surrogatecodepointpairtochar($char->{codepoint}, $secondchar->{codepoint});
                 $toremove += 3;
             }
         }

diff --git a/App-MHFS/t/01-util.t b/App-MHFS/t/01-util.t
@@ -7,7 +7,7 @@ use Feature::Compat::Try;
 use Encode qw(decode encode);
 use MHFS::Util qw(space2us escape_html escape_html_noquote shell_escape get_printable_utf8 read_text_file_lossy read_text_file write_text_file write_text_file_lossy decode_utf_8 parse_ipv4 write_file read_file fold_case);
 
-plan 27;
+plan 103;
 
 is(space2us('hello world'), 'hello_world');
 
@@ -42,6 +42,77 @@ is(shell_escape(q|it's|), q|it'"'"'s|);
     }
 }
 
+{
+    my @valid = (
+        # valid range in 1-4 byte sequences
+        "\x00",
+        "\x7F",
+        "\xC2\x80",
+        "\xDF\xBF",
+        "\xE0\xA0\x80",
+        "\xEF\xBF\xBF",
+        "\xF0\x90\x80\x80",
+        "\xF7\xBF\xBF\xBF",
+        # high and low surrogate ranges
+        "\xED\xA0\x80",
+        "\xED\xAF\xBF",
+        "\xED\xB0\x80",
+        "\xED\xBF\xBF",
+    );
+    my @overlong = (
+        # overlong ranges
+        "\xC0\x80",
+        "\xC1\xBF",
+        "\xE0\x80\x80",
+        "\xE0\x9F\xBF",
+        "\xF0\x80\x80\x80",
+        "\xF0\x8F\xBF\xBF",
+    );
+    my @toolong = (
+        "\xF8\x80\x80\x80\x80",
+        "\xF8\x88\x80\x80\x80"
+    );
+    # truncated, without and with additional input
+    my @truncated = (
+        # bytes, expected bytelength
+        ["\xC2", 1],
+        ["\xC2A", 1],
+        ["\xE0\xA0", 2],
+        ["\xE0\xA0A", 2],
+        ["\xF0\x90\x80", 3],
+        ["\xF0\x90\x80A", 3],
+    );
+
+    # test format:
+    # [bytes, bytelength, is_replacement [, codepoint]]
+    my @tests;
+    foreach my $valid (@valid) {
+        push @tests, [$valid, length($valid), 0, ord(decode('utf8', $valid, Encode::LEAVE_SRC))];
+    }
+    foreach my $overlong (@overlong) {
+        push @tests, [$overlong, length($overlong), 1, ord(decode('utf8', $overlong, Encode::LEAVE_SRC))];
+    }
+    foreach my $toolong (@toolong) {
+        push @tests, [$toolong, 1, 1];
+    }
+    foreach my $truncated (@truncated) {
+        my @test = (@$truncated, 1);
+        push @tests, \@test;
+    }
+
+    foreach my $test (@tests) {
+        my ($bytes, $bytelength, $is_replacement, $codepoint) = @$test;
+        my $display = uc(unpack("H*", $bytes));
+        my $peeked = MHFS::Util::_peek_utf8_codepoint($bytes);
+        my $message = "_peek_utf8_codepoint $display ". ($is_replacement ? 'is' : 'isnt'). ' U+FFFD';
+        $is_replacement ? is($peeked->{codepoint}, 0xFFFD, $message) : isnt($peeked->{codepoint}, 0xFFFD, $message);
+        is($peeked->{bytelength}, $bytelength, "_peek_utf8_codepoint $display bytelength is as expected");
+        if (defined $codepoint) {
+            is($peeked->{codepoint}, $codepoint, "_peek_utf8_codepoint $display codepoint matches decode(utf8)");
+        }
+    }
+}
+
 {
     my $result = MHFS::Util::surrogatepairtochar("\x{D83C}", "\x{DF84}");
     is(ord($result), 0x1F384, "Converting surrogate pair for $result (U+1F384)");
@@ -76,6 +147,21 @@ is(shell_escape(q|it's|), q|it'"'"'s|);
     my $result = get_printable_utf8("A\xED\xA0\xBC\xED\xBE\x84B");
     is($result, 'A'.chr(0x1F384).'B', 'Valid low surrogate high surrogate valid');
 }
+{
+    # verify truncated sequences (alone, or followed by an ASCII byte) are each replaced by a single U+FFFD
+    my @tests = (
+        ["\xC2", chr(0xFFFD)],
+        ["\xC2A", chr(0xFFFD).'A'],
+        ["\xE0\xA0", chr(0xFFFD)],
+        ["\xE0\xA0A", chr(0xFFFD).'A'],
+        ["\xF0\x90\x80", chr(0xFFFD)],
+        ["\xF0\x90\x80A", chr(0xFFFD).'A'],
+    );
+    foreach my $test (@tests) {
+        my $display = uc(unpack("H*", $test->[0])); # hex dump of the input bytes, so a failing case names its input
+        is(get_printable_utf8($test->[0]), $test->[1], "get_printable_utf8 $display truncated sequence decodes as expected");
+    }
+}
 
 {
     my $fname = 'test_read_text_file.txt';