diff --git a/App-MHFS/lib/MHFS/Util.pm b/App-MHFS/lib/MHFS/Util.pm index b10c6f5..625cfb5 100644 --- a/App-MHFS/lib/MHFS/Util.pm +++ b/App-MHFS/lib/MHFS/Util.pm @@ -343,27 +343,35 @@ sub surrogatecodepointpairtochar { } # returns the byte length and the codepoint -sub peek_utf8_codepoint { +sub _peek_utf8_codepoint { my ($octets) = @_; my @rules = ( + [0x80, 0x00, 1], # 1 byte sequence [0xE0, 0xC0, 2], # 2 byte sequence [0xF0, 0xE0, 3], # 3 byte sequence [0XF8, 0xF0, 4] # 4 byte sequence ); - - length($$octets) >= 1 or return; - my $byte = substr($$octets, 0, 1); - my $byteval = ord($byte); - my $charlen = 1; + my $byteval = ord(substr($octets, 0, 1)); + my $charlen; foreach my $rule (@rules) { if(($byteval & $rule->[0]) == $rule->[1]) { $charlen = $rule->[2]; last; } } - length($octets) >= $charlen or return; - my $char = decode("utf8", substr($$octets, 0, $charlen)); + $charlen or return {'codepoint' => 0xFFFD, 'bytelength' => 1}; + my $valid_bytes = 1; + for my $i (1 .. $charlen - 1) { + # this handles length($octets) < $charlen properly + my $cont_byte = ord(substr($octets, $i, 1)); + if (($cont_byte & 0xC0) != 0x80) { + return {'codepoint' => 0xFFFD, 'bytelength' => $valid_bytes}; + } + $valid_bytes++; + } + my $char = decode("utf8", substr($octets, 0, $charlen)); if(length($char) > 1) { + warnings::warnif "impossible situation, decode returned more than one char"; return {'codepoint' => 0xFFFD, 'bytelength' => 1}; } return { 'codepoint' => ord($char), 'bytelength' => $charlen}; @@ -377,15 +385,16 @@ sub get_printable_utf8 { last if(!length($octets)); # by default replace with the replacement char - my $chardata = peek_utf8_codepoint(\$octets); + my $char = _peek_utf8_codepoint($octets); my $toappend = chr(0xFFFD); - my $toremove = $chardata->{'bytelength'}; + my $toremove = $char->{bytelength}; # if we find a surrogate pair, make the actual codepoint - if(($chardata->{'bytelength'} == 3) && ($chardata->{'codepoint'} >= 0xD800) && ($chardata->{'codepoint'} <= 0xDBFF)) { - my $secondchar = peek_utf8_codepoint(\substr($octets, 3, 3)); - if($secondchar && ($secondchar->{'bytelength'} == 3) && ($secondchar->{'codepoint'} >= 0xDC00) && ($secondchar->{'codepoint'} <= 0xDFFF)) { - $toappend = surrogatecodepointpairtochar($chardata->{'codepoint'}, $secondchar->{'codepoint'}); + my $mask = ~0 << 16 | 0xFC00; + if (length($octets) >= 6 && ($char->{bytelength} == 3) && (($char->{codepoint} & $mask) == 0xD800)) { + my $secondchar = _peek_utf8_codepoint(substr($octets, 3, 3)); + if(($secondchar->{bytelength} == 3) && (($secondchar->{codepoint} & $mask) == 0xDC00)) { + $toappend = surrogatecodepointpairtochar($char->{codepoint}, $secondchar->{codepoint}); $toremove += 3; } } diff --git a/App-MHFS/t/01-util.t b/App-MHFS/t/01-util.t index 5a54fd1..392714f 100644 --- a/App-MHFS/t/01-util.t +++ b/App-MHFS/t/01-util.t @@ -7,7 +7,7 @@ use Feature::Compat::Try; use Encode qw(decode encode); use MHFS::Util qw(space2us escape_html escape_html_noquote shell_escape get_printable_utf8 read_text_file_lossy read_text_file write_text_file write_text_file_lossy decode_utf_8 parse_ipv4 write_file read_file fold_case); -plan 27; +plan 103; is(space2us('hello world'), 'hello_world'); @@ -42,6 +42,77 @@ is(shell_escape(q|it's|), q|it'"'"'s|); } } +{ + my @valid = ( + # valid range in 1-4 byte sequences + "\x00", + "\x7F", + "\xC2\x80", + "\xDF\xBF", + "\xE0\xA0\x80", + "\xEF\xBF\xBF", + "\xF0\x90\x80\x80", + "\xF7\xBF\xBF\xBF", + # high and low surrogate ranges + "\xED\xA0\x80", + "\xED\xAF\xBF", + "\xED\xB0\x80", + "\xED\xBF\xBF", + ); + my @overlong = ( + # overlong ranges + "\xC0\x80", + "\xC1\xBF", + "\xE0\x80\x80", + "\xE0\x9F\xBF", + "\xF0\x80\x80\x80", + "\xF0\x8F\xBF\xBF", + ); + my @toolong = ( + "\xF8\x80\x80\x80\x80", + "\xF8\x88\x80\x80\x80" + ); + # truncated, without and with additional input + my @truncated = ( + # bytes, expected bytelength + ["\xC2", 1], + ["\xC2A", 1], + ["\xE0\xA0", 2], + ["\xE0\xA0A", 2], + ["\xF0\x90\x80", 3], + ["\xF0\x90\x80A", 3], + ); + + # test format: + # [bytes, bytelength, is_replacement, [, codepoint]] + my @tests; + foreach my $valid (@valid) { + push @tests, [$valid, length($valid), 0, ord(decode('utf8', $valid, Encode::LEAVE_SRC))]; + } + foreach my $overlong (@overlong) { + push @tests, [$overlong, length($overlong), 1, ord(decode('utf8', $overlong, Encode::LEAVE_SRC))]; + } + foreach my $toolong (@toolong) { + push @tests, [$toolong, 1, 1]; + } + foreach my $truncated (@truncated) { + my @test = (@$truncated, 1); + push @tests, \@test; + } + + foreach my $test (@tests) { + my ($bytes, $bytelength, $is_replacement, $codepoint) = @$test; + my $display = uc(unpack("H*", $bytes)); + my $peeked = MHFS::Util::_peek_utf8_codepoint($bytes); + my $message = "_peek_utf8_codepoint $display ". ($is_replacement ? 'is' : 'isnt'). ' U+FFFD'; + $is_replacement ? is($peeked->{codepoint}, 0xFFFD, $message) : isnt($peeked->{codepoint}, 0xFFFD, $message); + is($peeked->{bytelength}, $bytelength, "_peek_utf8_codepoint $display bytelength is as expected"); + if (defined $codepoint) { + is($peeked->{codepoint}, $codepoint, "_peek_utf8_codepoint $display codepoint matches decode(utf8)"); + } + } +} + { my $result = MHFS::Util::surrogatepairtochar("\x{D83C}", "\x{DF84}"); is(ord($result), 0x1F384, "Converting surrogate pair for $result (U+1F384)"); @@ -76,6 +147,21 @@ is(shell_escape(q|it's|), q|it'"'"'s|); my $result = get_printable_utf8("A\xED\xA0\xBC\xED\xBE\x84B"); is($result, 'A'.chr(0x1F384).'B', 'Valid low surrogate high surrogate valid'); } +{ + # verify truncated sequences are replaced as expected + my @tests = ( + ["\xC2", chr(0xFFFD)], + ["\xC2A", chr(0xFFFD).'A'], + ["\xE0\xA0", chr(0xFFFD)], + ["\xE0\xA0A", chr(0xFFFD).'A'], + ["\xF0\x90\x80", chr(0xFFFD)], + ["\xF0\x90\x80A", chr(0xFFFD).'A'], + ); + foreach my $test (@tests) { + my $display = uc(unpack("H*", $test->[0])); + is(get_printable_utf8($test->[0]), $test->[1], "get_printable_utf8 truncated sequences decode as expected"); + } +} { my $fname = 'test_read_text_file.txt';