Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 23 additions & 14 deletions App-MHFS/lib/MHFS/Util.pm
Original file line number Diff line number Diff line change
Expand Up @@ -343,27 +343,35 @@ sub surrogatecodepointpairtochar {
}

# returns the byte length and the codepoint
sub peek_utf8_codepoint {
sub _peek_utf8_codepoint {
my ($octets) = @_;
my @rules = (
[0x80, 0x00, 1], # 1 byte sequence
[0xE0, 0xC0, 2], # 2 byte sequence
[0xF0, 0xE0, 3], # 3 byte sequence
[0XF8, 0xF0, 4] # 4 byte sequence
);

length($$octets) >= 1 or return;
my $byte = substr($$octets, 0, 1);
my $byteval = ord($byte);
my $charlen = 1;
my $byteval = ord(substr($octets, 0, 1));
my $charlen;
foreach my $rule (@rules) {
if(($byteval & $rule->[0]) == $rule->[1]) {
$charlen = $rule->[2];
last;
}
}
length($octets) >= $charlen or return;
my $char = decode("utf8", substr($$octets, 0, $charlen));
$charlen or return {'codepoint' => 0xFFFD, 'bytelength' => 1};
my $valid_bytes = 1;
for my $i (1 .. $charlen - 1) {
# this handles length($octets) < $charlen properly
my $cont_byte = ord(substr($octets, $i, 1));
if (($cont_byte & 0xC0) != 0x80) {
return {'codepoint' => 0xFFFD, 'bytelength' => $valid_bytes};
}
$valid_bytes++;
}
my $char = decode("utf8", substr($octets, 0, $charlen));
if(length($char) > 1) {
warnings::warnif "impossible situation, decode returned more than one char";
return {'codepoint' => 0xFFFD, 'bytelength' => 1};
}
return { 'codepoint' => ord($char), 'bytelength' => $charlen};
Expand All @@ -377,15 +385,16 @@ sub get_printable_utf8 {
last if(!length($octets));

# by default replace with the replacement char
my $chardata = peek_utf8_codepoint(\$octets);
my $char = _peek_utf8_codepoint($octets);
my $toappend = chr(0xFFFD);
my $toremove = $chardata->{'bytelength'};
my $toremove = $char->{bytelength};

# if we find a surrogate pair, make the actual codepoint
if(($chardata->{'bytelength'} == 3) && ($chardata->{'codepoint'} >= 0xD800) && ($chardata->{'codepoint'} <= 0xDBFF)) {
my $secondchar = peek_utf8_codepoint(\substr($octets, 3, 3));
if($secondchar && ($secondchar->{'bytelength'} == 3) && ($secondchar->{'codepoint'} >= 0xDC00) && ($secondchar->{'codepoint'} <= 0xDFFF)) {
$toappend = surrogatecodepointpairtochar($chardata->{'codepoint'}, $secondchar->{'codepoint'});
my $mask = ~0 << 16 | 0xFC00;
if (length($octets) >= 6 && ($char->{bytelength} == 3) && (($char->{codepoint} & $mask) == 0xD800)) {
my $secondchar = _peek_utf8_codepoint(substr($octets, 3, 3));
if(($secondchar->{bytelength} == 3) && (($secondchar->{codepoint} & $mask) == 0xDC00)) {
$toappend = surrogatecodepointpairtochar($char->{codepoint}, $secondchar->{codepoint});
$toremove += 3;
}
}
Expand Down
88 changes: 87 additions & 1 deletion App-MHFS/t/01-util.t
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use Feature::Compat::Try;
use Encode qw(decode encode);
use MHFS::Util qw(space2us escape_html escape_html_noquote shell_escape get_printable_utf8 read_text_file_lossy read_text_file write_text_file write_text_file_lossy decode_utf_8 parse_ipv4 write_file read_file fold_case);

plan 27;
plan 103;

is(space2us('hello world'), 'hello_world');

Expand Down Expand Up @@ -42,6 +42,77 @@ is(shell_escape(q|it's|), q|it'"'"'s|);
}
}

{
    # Sequences expected to decode to a real codepoint: the first and last
    # encodings of each 1-4 byte form, then the edges of the high and low
    # surrogate ranges.
    my @valid = (
        "\x00",             "\x7F",
        "\xC2\x80",         "\xDF\xBF",
        "\xE0\xA0\x80",     "\xEF\xBF\xBF",
        "\xF0\x90\x80\x80", "\xF7\xBF\xBF\xBF",
        "\xED\xA0\x80",     "\xED\xAF\xBF",
        "\xED\xB0\x80",     "\xED\xBF\xBF",
    );

    # Overlong encodings: the first and last overlong form for 2, 3 and 4
    # byte sequences.
    my @overlong = (
        "\xC0\x80",         "\xC1\xBF",
        "\xE0\x80\x80",     "\xE0\x9F\xBF",
        "\xF0\x80\x80\x80", "\xF0\x8F\xBF\xBF",
    );

    # Lead bytes announcing a 5 byte sequence, which no rule accepts.
    my @toolong = (
        "\xF8\x80\x80\x80\x80",
        "\xF8\x88\x80\x80\x80",
    );

    # Truncated sequences, without and with additional input, as
    # [bytes, expected bytelength].
    my @truncated = (
        ["\xC2",             1],
        ["\xC2A",            1],
        ["\xE0\xA0",         2],
        ["\xE0\xA0A",        2],
        ["\xF0\x90\x80",     3],
        ["\xF0\x90\x80A",    3],
    );

    # Reference codepoint as reported by Encode; LEAVE_SRC keeps decode from
    # consuming the input string.
    my $decoded_ord = sub {
        my ($octets) = @_;
        return ord(decode('utf8', $octets, Encode::LEAVE_SRC));
    };

    # Each case is [bytes, bytelength, is_replacement [, codepoint]].
    # The codepoint is only present when there is a decode() result to
    # compare against.
    my @tests = (
        (map { [$_, length($_), 0, $decoded_ord->($_)] } @valid),
        (map { [$_, length($_), 1, $decoded_ord->($_)] } @overlong),
        (map { [$_, 1, 1] } @toolong),
        (map { [@$_, 1] } @truncated),
    );

    for my $case (@tests) {
        my ($bytes, $want_length, $want_replacement, $want_codepoint) = @$case;
        my $hex = uc(unpack("H*", $bytes));
        my $got = MHFS::Util::_peek_utf8_codepoint($bytes);

        my $verb  = $want_replacement ? 'is' : 'isnt';
        my $label = "_peek_utf8_codepoint $hex $verb U+FFFD";
        if ($want_replacement) {
            is($got->{codepoint}, 0xFFFD, $label);
        }
        else {
            isnt($got->{codepoint}, 0xFFFD, $label);
        }

        is($got->{bytelength}, $want_length, "_peek_utf8_codepoint $hex bytelength is as expected");

        # Cases without a reference codepoint stop after the length check.
        next unless defined $want_codepoint;
        is($got->{codepoint}, $want_codepoint, "_peek_utf8_codepoint $hex codepoint matches decode(utf8)");
    }
}

{
my $result = MHFS::Util::surrogatepairtochar("\x{D83C}", "\x{DF84}");
is(ord($result), 0x1F384, "Converting surrogate pair for $result (U+1F384)");
Expand Down Expand Up @@ -76,6 +147,21 @@ is(shell_escape(q|it's|), q|it'"'"'s|);
my $result = get_printable_utf8("A\xED\xA0\xBC\xED\xBE\x84B");
is($result, 'A'.chr(0x1F384).'B', 'Valid low surrogate high surrogate valid');
}
{
    # Verify truncated sequences are replaced as expected: each truncated
    # sequence becomes a single U+FFFD, and a trailing 'A' is kept after it.
    # Each case is [input bytes, expected output].
    my @tests = (
        ["\xC2", chr(0xFFFD)],
        ["\xC2A", chr(0xFFFD).'A'],
        ["\xE0\xA0", chr(0xFFFD)],
        ["\xE0\xA0A", chr(0xFFFD).'A'],
        ["\xF0\x90\x80", chr(0xFFFD)],
        ["\xF0\x90\x80A", chr(0xFFFD).'A'],
    );
    foreach my $test (@tests) {
        my ($bytes, $expected) = @$test;
        # Put the hex dump of the input in the test name so a failure points
        # at the exact sequence instead of six identically named tests.
        my $display = uc(unpack("H*", $bytes));
        is(get_printable_utf8($bytes), $expected, "get_printable_utf8 truncated sequence $display decodes as expected");
    }
}

{
my $fname = 'test_read_text_file.txt';
Expand Down
Loading