From 0cb869782215f319e2697be717d7f88bd81fdbf8 Mon Sep 17 00:00:00 2001 From: Neok Date: Mon, 7 Mar 2022 08:55:14 +0100 Subject: [PATCH] fix PHP Deprecated fix PHP Deprecated: Implicit conversion from float to int loses precision adding floor() on lines 201, 210, 219, 224, 232 for compat PHP 8.1 --- src/ForceUTF8/Encoding.php | 409 +++++++++++++++++++------------------ 1 file changed, 215 insertions(+), 194 deletions(-) diff --git a/src/ForceUTF8/Encoding.php b/src/ForceUTF8/Encoding.php index 2031592..c8d17f1 100644 --- a/src/ForceUTF8/Encoding.php +++ b/src/ForceUTF8/Encoding.php @@ -39,13 +39,13 @@ namespace ForceUTF8; -class Encoding { +class Encoding +{ + public const ICONV_TRANSLIT = 'TRANSLIT'; + public const ICONV_IGNORE = 'IGNORE'; + public const WITHOUT_ICONV = ''; - const ICONV_TRANSLIT = "TRANSLIT"; - const ICONV_IGNORE = "IGNORE"; - const WITHOUT_ICONV = ""; - - protected static $win1252ToUtf8 = array( + protected static array $win1252ToUtf8 = [ 128 => "\xe2\x82\xac", 130 => "\xe2\x80\x9a", @@ -62,7 +62,6 @@ class Encoding { 142 => "\xc5\xbd", - 145 => "\xe2\x80\x98", 146 => "\xe2\x80\x99", 147 => "\xe2\x80\x9c", @@ -77,10 +76,10 @@ class Encoding { 156 => "\xc5\x93", 158 => "\xc5\xbe", - 159 => "\xc5\xb8" - ); + 159 => "\xc5\xb8", + ]; - protected static $brokenUtf8ToUtf8 = array( + protected static array $brokenUtf8ToUtf8 = [ "\xc2\x80" => "\xe2\x82\xac", "\xc2\x82" => "\xe2\x80\x9a", @@ -97,7 +96,6 @@ class Encoding { "\xc2\x8e" => "\xc5\xbd", - "\xc2\x91" => "\xe2\x80\x98", "\xc2\x92" => "\xe2\x80\x99", "\xc2\x93" => "\xe2\x80\x9c", @@ -112,26 +110,25 @@ class Encoding { "\xc2\x9c" => "\xc5\x93", "\xc2\x9e" => "\xc5\xbe", - "\xc2\x9f" => "\xc5\xb8" - ); + "\xc2\x9f" => "\xc5\xb8", + ]; - protected static $utf8ToWin1252 = array( + protected static array $utf8ToWin1252 = [ "\xe2\x82\xac" => "\x80", "\xe2\x80\x9a" => "\x82", - "\xc6\x92" => "\x83", + "\xc6\x92" => "\x83", "\xe2\x80\x9e" => "\x84", "\xe2\x80\xa6" => "\x85", "\xe2\x80\xa0" => "\x86", "\xe2\x80\xa1" => "\x87", - "\xcb\x86" => "\x88", + "\xcb\x86" => "\x88", "\xe2\x80\xb0" => "\x89", - "\xc5\xa0" => "\x8a", + "\xc5\xa0" => "\x8a", "\xe2\x80\xb9" => "\x8b", - "\xc5\x92" => "\x8c", - - "\xc5\xbd" => "\x8e", + "\xc5\x92" => "\x8c", + "\xc5\xbd" => "\x8e", "\xe2\x80\x98" => "\x91", "\xe2\x80\x99" => "\x92", @@ -140,212 +137,236 @@ class Encoding { "\xe2\x80\xa2" => "\x95", "\xe2\x80\x93" => "\x96", "\xe2\x80\x94" => "\x97", - "\xcb\x9c" => "\x98", + "\xcb\x9c" => "\x98", "\xe2\x84\xa2" => "\x99", - "\xc5\xa1" => "\x9a", + "\xc5\xa1" => "\x9a", "\xe2\x80\xba" => "\x9b", - "\xc5\x93" => "\x9c", - - "\xc5\xbe" => "\x9e", - "\xc5\xb8" => "\x9f" - ); - - static function toUTF8($text){ - /** - * Function \ForceUTF8\Encoding::toUTF8 - * - * This function leaves UTF8 characters alone, while converting almost all non-UTF8 to UTF8. - * - * It assumes that the encoding of the original string is either Windows-1252 or ISO 8859-1. - * - * It may fail to convert characters to UTF-8 if they fall into one of these scenarios: - * - * 1) when any of these characters: ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞß - * are followed by any of these: ("group B") - * ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶•¸¹º»¼½¾¿ - * For example: %ABREPRESENT%C9%BB. «REPRESENTÉ» - * The "«" (%AB) character will be converted, but the "É" followed by "»" (%C9%BB) - * is also a valid unicode character, and will be left unchanged. - * - * 2) when any of these: àáâãäåæçèéêëìíîï are followed by TWO chars from group B, - * 3) when any of these: ðñòó are followed by THREE chars from group B. - * - * @name toUTF8 - * @param string $text Any string. - * @return string The same string, UTF8 encoded - * - */ - - if(is_array($text)) + "\xc5\x93" => "\x9c", + + "\xc5\xbe" => "\x9e", + "\xc5\xb8" => "\x9f", + ]; + + public static function toUTF8(array|string $text): array|string { - foreach($text as $k => $v) - { - $text[$k] = self::toUTF8($v); - } - return $text; - } + /** + * Function \ForceUTF8\Encoding::toUTF8. + * + * This function leaves UTF8 characters alone, while converting almost all non-UTF8 to UTF8. + * + * It assumes that the encoding of the original string is either Windows-1252 or ISO 8859-1. + * + * It may fail to convert characters to UTF-8 if they fall into one of these scenarios: + * + * 1) when any of these characters: ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞß + * are followed by any of these: ("group B") + * ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶•¸¹º»¼½¾¿ + * For example: %ABREPRESENT%C9%BB. «REPRESENTÉ» + * The "«" (%AB) character will be converted, but the "É" followed by "»" (%C9%BB) + * is also a valid unicode character, and will be left unchanged. + * + * 2) when any of these: àáâãäåæçèéêëìíîï are followed by TWO chars from group B, + * 3) when any of these: ðñòó are followed by THREE chars from group B. + * + * @param string $text any string + * + * @return string The same string, UTF8 encoded + */ + if (is_array($text)) { + foreach ($text as $k => $v) { + $text[$k] = self::toUTF8($v); + } - if(!is_string($text)) { - return $text; - } + return $text; + } + + if (!is_string($text)) { + return $text; + } - $max = self::strlen($text); - - $buf = ""; - for($i = 0; $i < $max; $i++){ - $c1 = $text[$i]; - if($c1>="\xc0"){ //Should be converted to UTF8, if it's not UTF8 already - $c2 = $i+1 >= $max? "\x00" : $text[$i+1]; - $c3 = $i+2 >= $max? "\x00" : $text[$i+2]; - $c4 = $i+3 >= $max? "\x00" : $text[$i+3]; - if($c1 >= "\xc0" & $c1 <= "\xdf"){ //looks like 2 bytes UTF8 - if($c2 >= "\x80" && $c2 <= "\xbf"){ //yeah, almost sure it's UTF8 already - $buf .= $c1 . $c2; - $i++; + $max = self::strlen($text); + + $buf = ''; + for ($i = 0; $i < $max; ++$i) { + $c1 = $text[$i]; + if ($c1 >= "\xc0") { //Should be converted to UTF8, if it's not UTF8 already + $c2 = $i + 1 >= $max ? "\x00" : $text[$i + 1]; + $c3 = $i + 2 >= $max ? "\x00" : $text[$i + 2]; + $c4 = $i + 3 >= $max ? "\x00" : $text[$i + 3]; + if ($c1 >= "\xc0" & $c1 <= "\xdf") { //looks like 2 bytes UTF8 + if ($c2 >= "\x80" && $c2 <= "\xbf") { //yeah, almost sure it's UTF8 already + $buf .= $c1.$c2; + ++$i; } else { //not valid UTF8. Convert it. - $cc1 = (chr(ord($c1) / 64) | "\xc0"); + $cc1 = chr(floor(ord($c1) / 64)) | "\xc0"; $cc2 = ($c1 & "\x3f") | "\x80"; - $buf .= $cc1 . $cc2; + $buf .= $cc1.$cc2; } - } elseif($c1 >= "\xe0" & $c1 <= "\xef"){ //looks like 3 bytes UTF8 - if($c2 >= "\x80" && $c2 <= "\xbf" && $c3 >= "\x80" && $c3 <= "\xbf"){ //yeah, almost sure it's UTF8 already - $buf .= $c1 . $c2 . $c3; + } elseif ($c1 >= "\xe0" & $c1 <= "\xef") { //looks like 3 bytes UTF8 + if ($c2 >= "\x80" && $c2 <= "\xbf" && $c3 >= "\x80" && $c3 <= "\xbf") { //yeah, almost sure it's UTF8 already + $buf .= $c1.$c2.$c3; $i = $i + 2; } else { //not valid UTF8. Convert it. - $cc1 = (chr(ord($c1) / 64) | "\xc0"); + $cc1 = chr(floor(ord($c1) / 64)) | "\xc0"; $cc2 = ($c1 & "\x3f") | "\x80"; - $buf .= $cc1 . $cc2; + $buf .= $cc1.$cc2; } - } elseif($c1 >= "\xf0" & $c1 <= "\xf7"){ //looks like 4 bytes UTF8 - if($c2 >= "\x80" && $c2 <= "\xbf" && $c3 >= "\x80" && $c3 <= "\xbf" && $c4 >= "\x80" && $c4 <= "\xbf"){ //yeah, almost sure it's UTF8 already - $buf .= $c1 . $c2 . $c3 . $c4; + } elseif ($c1 >= "\xf0" & $c1 <= "\xf7") { //looks like 4 bytes UTF8 + if ($c2 >= "\x80" && $c2 <= "\xbf" && $c3 >= "\x80" && $c3 <= "\xbf" && $c4 >= "\x80" && $c4 <= "\xbf") { //yeah, almost sure it's UTF8 already + $buf .= $c1.$c2.$c3.$c4; $i = $i + 3; } else { //not valid UTF8. Convert it. - $cc1 = (chr(ord($c1) / 64) | "\xc0"); + $cc1 = chr(floor(ord($c1) / 64)) | "\xc0"; $cc2 = ($c1 & "\x3f") | "\x80"; - $buf .= $cc1 . $cc2; + $buf .= $cc1.$cc2; } - } else { //doesn't look like UTF8, but should be converted - $cc1 = (chr(ord($c1) / 64) | "\xc0"); - $cc2 = (($c1 & "\x3f") | "\x80"); - $buf .= $cc1 . $cc2; - } - } elseif(($c1 & "\xc0") === "\x80"){ // needs conversion - if(isset(self::$win1252ToUtf8[ord($c1)])) { //found in Windows-1252 special cases + } else { //doesn't look like UTF8, but should be converted + $cc1 = chr(floor(ord($c1) / 64)) | "\xc0"; + $cc2 = ($c1 & "\x3f") | "\x80"; + $buf .= $cc1.$cc2; + } + } elseif (($c1 & "\xc0") === "\x80") { // needs conversion + if (isset(self::$win1252ToUtf8[ord($c1)])) { //found in Windows-1252 special cases $buf .= self::$win1252ToUtf8[ord($c1)]; } else { - $cc1 = (chr(ord($c1) / 64) | "\xc0"); - $cc2 = (($c1 & "\x3f") | "\x80"); - $buf .= $cc1 . $cc2; + $cc1 = chr(floor(ord($c1) / 64)) | "\xc0"; + $cc2 = ($c1 & "\x3f") | "\x80"; + $buf .= $cc1.$cc2; } - } else { // it doesn't need conversion - $buf .= $c1; + } else { // it doesn't need conversion + $buf .= $c1; + } + } + + return $buf; + } + + public static function toWin1252($text, ?string $option = self::WITHOUT_ICONV): bool|array|string + { + if (is_array($text)) { + foreach ($text as $k => $v) { + $text[$k] = self::toWin1252($v, $option); + } + + return $text; + } elseif (is_string($text)) { + + return static::utf8_decode($text, $option); + } else { + + return $text; } } - return $buf; - } - - static function toWin1252($text, $option = self::WITHOUT_ICONV) { - if(is_array($text)) { - foreach($text as $k => $v) { - $text[$k] = self::toWin1252($v, $option); - } - return $text; - } elseif(is_string($text)) { - return static::utf8_decode($text, $option); - } else { - return $text; + + public static function toISO8859($text, ?string $option = self::WITHOUT_ICONV): bool|array|string + { + return self::toWin1252($text, $option); + } + + public static function toLatin1($text, ?string $option = self::WITHOUT_ICONV): bool|array|string + { + return self::toWin1252($text, $option); } - } - - static function toISO8859($text, $option = self::WITHOUT_ICONV) { - return self::toWin1252($text, $option); - } - - static function toLatin1($text, $option = self::WITHOUT_ICONV) { - return self::toWin1252($text, $option); - } - - static function fixUTF8($text, $option = self::WITHOUT_ICONV){ - if(is_array($text)) { - foreach($text as $k => $v) { - $text[$k] = self::fixUTF8($v, $option); - } - return $text; + + public static function fixUTF8($text, ?string $option = self::WITHOUT_ICONV): bool|array|string + { + if (is_array($text)) { + foreach ($text as $k => $v) { + $text[$k] = self::fixUTF8($v, $option); + } + + return $text; + } + + if (!is_string($text)) { + + return $text; + } + + $last = ''; + while ($last != $text) { + $last = $text; + $text = self::toUTF8(static::utf8_decode($text, $option)); + } + + return self::toUTF8(static::utf8_decode($text, $option)); } - if(!is_string($text)) { - return $text; + public static function UTF8FixWin1252Chars($text): array|string + { + // If you received an UTF-8 string that was converted from Windows-1252 as it was ISO8859-1 + // (ignoring Windows-1252 chars from 80 to 9F) use this function to fix it. + // See: http://en.wikipedia.org/wiki/Windows-1252 + + return str_replace(array_keys(self::$brokenUtf8ToUtf8), array_values(self::$brokenUtf8ToUtf8), $text); } - $last = ""; - while($last <> $text){ - $last = $text; - $text = self::toUTF8(static::utf8_decode($text, $option)); + public static function removeBOM(?string $str = ''): string + { + if (substr($str, 0, 3) === pack('CCC', 0xEF, 0xBB, 0xBF)) { + $str = substr($str, 3); + } + + return $str; } - $text = self::toUTF8(static::utf8_decode($text, $option)); - return $text; - } - static function UTF8FixWin1252Chars($text){ - // If you received an UTF-8 string that was converted from Windows-1252 as it was ISO8859-1 - // (ignoring Windows-1252 chars from 80 to 9F) use this function to fix it. - // See: http://en.wikipedia.org/wiki/Windows-1252 + protected static function strlen(string $text): bool|int + { + return (function_exists('mb_strlen') && ((int) ini_get('mbstring.func_overload')) & 2) + ? mb_strlen($text, '8bit') + : strlen($text); + } - return str_replace(array_keys(self::$brokenUtf8ToUtf8), array_values(self::$brokenUtf8ToUtf8), $text); - } + public static function normalizeEncoding(string $encodingLabel): string + { + $encoding = strtoupper($encodingLabel); + $encoding = preg_replace('/[^a-zA-Z0-9\s]/', '', $encoding); + $equivalences = [ + 'ISO88591' => 'ISO-8859-1', + 'ISO8859' => 'ISO-8859-1', + 'ISO' => 'ISO-8859-1', + 'LATIN1' => 'ISO-8859-1', + 'LATIN' => 'ISO-8859-1', + 'UTF8' => 'UTF-8', + 'UTF' => 'UTF-8', + 'WIN1252' => 'ISO-8859-1', + 'WINDOWS1252' => 'ISO-8859-1', + ]; + + if (empty($equivalences[$encoding])) { + + return 'UTF-8'; + } - static function removeBOM($str=""){ - if(substr($str, 0,3) === pack("CCC",0xef,0xbb,0xbf)) { - $str=substr($str, 3); + return $equivalences[$encoding]; } - return $str; - } - - protected static function strlen($text){ - return (function_exists('mb_strlen') && ((int) ini_get('mbstring.func_overload')) & 2) ? - mb_strlen($text,'8bit') : strlen($text); - } - - public static function normalizeEncoding($encodingLabel) - { - $encoding = strtoupper($encodingLabel); - $encoding = preg_replace('/[^a-zA-Z0-9\s]/', '', $encoding); - $equivalences = array( - 'ISO88591' => 'ISO-8859-1', - 'ISO8859' => 'ISO-8859-1', - 'ISO' => 'ISO-8859-1', - 'LATIN1' => 'ISO-8859-1', - 'LATIN' => 'ISO-8859-1', - 'UTF8' => 'UTF-8', - 'UTF' => 'UTF-8', - 'WIN1252' => 'ISO-8859-1', - 'WINDOWS1252' => 'ISO-8859-1' - ); - - if(empty($equivalences[$encoding])){ - return 'UTF-8'; + + public static function encode(string $encodingLabel, array|string $text): bool|array|string + { + $encodingLabel = self::normalizeEncoding($encodingLabel); + if ('ISO-8859-1' === $encodingLabel) { + + return self::toLatin1($text); + } + + return self::toUTF8($text); } - return $equivalences[$encoding]; - } - - public static function encode($encodingLabel, $text) - { - $encodingLabel = self::normalizeEncoding($encodingLabel); - if($encodingLabel === 'ISO-8859-1') return self::toLatin1($text); - return self::toUTF8($text); - } - - protected static function utf8_decode($text, $option = self::WITHOUT_ICONV) - { - if ($option == self::WITHOUT_ICONV || !function_exists('iconv')) { - $o = utf8_decode( - str_replace(array_keys(self::$utf8ToWin1252), array_values(self::$utf8ToWin1252), self::toUTF8($text)) - ); - } else { - $o = iconv("UTF-8", "Windows-1252" . ($option === self::ICONV_TRANSLIT ? '//TRANSLIT' : ($option === self::ICONV_IGNORE ? '//IGNORE' : '')), $text); + protected static function utf8_decode($text, ?string $option = self::WITHOUT_ICONV): bool|string + { + if (self::WITHOUT_ICONV == $option || !function_exists('iconv')) { + $o = utf8_decode( + str_replace(array_keys(self::$utf8ToWin1252), array_values(self::$utf8ToWin1252), self::toUTF8($text)) + ); + } else { + $o = iconv( + 'UTF-8', + 'Windows-1252'.(self::ICONV_TRANSLIT === $option ? + '//TRANSLIT' : (self::ICONV_IGNORE === $option ? + '//IGNORE' : '')), $text + ); + } + + return $o; } - return $o; - } }