From ebe54c1bcdf29588f6e95b3e7b2f96f953216268 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hans-J=C3=BCrgen=20Tappe?= Date: Wed, 25 Mar 2015 23:05:30 +0100 Subject: [PATCH 1/4] Add a test file demonstrating UTF-8 BOM handling for a hungarian translation file in the widely used ckeditor. --- tests/Resources/minify/expected/utf8-with-bom.js | 3 +++ tests/Resources/minify/input/utf8-with-bom.js | Bin 0 -> 524 bytes 2 files changed, 3 insertions(+) create mode 100644 tests/Resources/minify/expected/utf8-with-bom.js create mode 100644 tests/Resources/minify/input/utf8-with-bom.js diff --git a/tests/Resources/minify/expected/utf8-with-bom.js b/tests/Resources/minify/expected/utf8-with-bom.js new file mode 100644 index 0000000..3206cb0 --- /dev/null +++ b/tests/Resources/minify/expected/utf8-with-bom.js @@ -0,0 +1,3 @@ +MyClass._A=new Array +("One","Two","Three");MyClass._B=new Array +("Ten","Twenty","Thirty"); \ No newline at end of file diff --git a/tests/Resources/minify/input/utf8-with-bom.js b/tests/Resources/minify/input/utf8-with-bom.js new file mode 100644 index 0000000000000000000000000000000000000000..e151dd9e9b3af638b91b3f840a94957abcf526db GIT binary patch literal 524 zcmaivOH0E*6ot>)UlFphVhwJ(5f@c(QG6_nYl%s;0W%Fvr0t(qzk8=GS{E`5!#(%B z=Ib-li4r}kP^(Vws+nfYiC)!g=SG3~mU-Y9XUq+%KnmYIDc23?xh6V;d&Du{Tyr@6 zFSkxhEzo|b#HC^vJ!-8T>Bwkc1e|t0_b^v5EwyyDb$SD-zy@x)S4BVb?R{r%#@)48 zvl;fiVw)HLRlCj0{Jqmg*Wd-po-WUtqP*k=e67utpDfvLoPVrT557xv>PP~T!(8zG sMCPb!PO6bEoS7-ZuZ3wF`}N=StAEpTwBaCY<=8;S!wwBu5sxjt0UHWXNdN!< literal 0 HcmV?d00001 From 4a447e0c9d93f9bd5f9321488131b83327d57e75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hans-J=C3=BCrgen=20Tappe?= Date: Wed, 25 Mar 2015 23:07:49 +0100 Subject: [PATCH 2/4] Fix parsing of the UTF-8 BOM to retrieve the correctly encoded content. 
--- src/JSMin/JSMin.php | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/src/JSMin/JSMin.php b/src/JSMin/JSMin.php index 0c2aeb7..bbe54c2 100644 --- a/src/JSMin/JSMin.php +++ b/src/JSMin/JSMin.php @@ -111,9 +111,34 @@ public function min() mb_internal_encoding('8bit'); } - if (isset($this->input[0]) && $this->input[0] === "\xef") { - $this->input = substr($this->input, 3); - } + // Remove the utf-8 BOM to save transfer bytes. + // Otherwise, line breaks before the 2nd comment are kept and + // lots of zero bytes stay, leading to additional waste and parsing + // exceptions. + $first2 = substr($this->input, 0, 2); + $first3 = substr($this->input, 0, 3); + $first4 = substr($this->input, 0, 4); + $encoding = 'UTF-8'; + // Unicode BOM is U+FEFF, but after encoded, it will look like this. + if ($first3 == chr(0xEF).chr(0xBB).chr(0xBF)) { + $this->input = substr($this->input, 3); + } elseif ($first4 == chr(0x00).chr(0x00).chr(0xFE).chr(0xFF)) { + $encoding = 'UTF-32BE'; + $this->input = substr($this->input, 4); + } elseif ($first4 == chr(0xFF).chr(0xFE).chr(0x00).chr(0x00)) { + $encoding = 'UTF-32LE'; + $this->input = substr($this->input, 4); + } elseif ($first2 == chr(0xFE).chr(0xFF)) { + $encoding = 'UTF-16BE'; + $this->input = substr($this->input, 2); + } elseif ($first2 == chr(0xFF).chr(0xFE)) { + $encoding = 'UTF-16LE'; + $this->input = substr($this->input, 2); + } + // Convert only non-8-bit files. + if ($encoding != 'UTF-8') { + $this->input = mb_convert_encoding($this->input, 'UTF-8', $encoding); + } $this->input = str_replace("\r\n", "\n", $this->input); $this->inputLength = strlen($this->input); From 74717eaf6a7dffb7d9a8c89bfa83a7bfb7d75a1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hans-J=C3=BCrgen=20Tappe?= Date: Sat, 28 Mar 2015 21:52:02 +0100 Subject: [PATCH 3/4] Correct coding style: indent by whitespace. 
--- src/JSMin/JSMin.php | 80 ++++++++++++++++++++++----------------------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/src/JSMin/JSMin.php b/src/JSMin/JSMin.php index bbe54c2..88eb3a2 100644 --- a/src/JSMin/JSMin.php +++ b/src/JSMin/JSMin.php @@ -111,34 +111,34 @@ public function min() mb_internal_encoding('8bit'); } - // Remove the utf-8 BOM to save transfer bytes. - // Otherwise, line breaks before the 2nd comment are kept and - // lots of zero bytes stay, leading to additional waste and parsing - // exceptions. - $first2 = substr($this->input, 0, 2); - $first3 = substr($this->input, 0, 3); - $first4 = substr($this->input, 0, 4); - $encoding = 'UTF-8'; - // Unicode BOM is U+FEFF, but after encoded, it will look like this. - if ($first3 == chr(0xEF).chr(0xBB).chr(0xBF)) { - $this->input = substr($this->input, 3); - } elseif ($first4 == chr(0x00).chr(0x00).chr(0xFE).chr(0xFF)) { - $encoding = 'UTF-32BE'; - $this->input = substr($this->input, 4); - } elseif ($first4 == chr(0xFF).chr(0xFE).chr(0x00).chr(0x00)) { - $encoding = 'UTF-32LE'; - $this->input = substr($this->input, 4); - } elseif ($first2 == chr(0xFE).chr(0xFF)) { - $encoding = 'UTF-16BE'; - $this->input = substr($this->input, 2); - } elseif ($first2 == chr(0xFF).chr(0xFE)) { - $encoding = 'UTF-16LE'; - $this->input = substr($this->input, 2); - } - // Convert only non-8-bit files. - if ($encoding != 'UTF-8') { - $this->input = mb_convert_encoding($this->input, 'UTF-8', $encoding); - } + // Remove the utf-8 BOM to save transfer bytes. + // Otherwise, line breaks before the 2nd comment are kept and + // lots of zero bytes stay, leading to additional waste and parsing + // exceptions. + $first2 = substr($this->input, 0, 2); + $first3 = substr($this->input, 0, 3); + $first4 = substr($this->input, 0, 4); + $encoding = 'UTF-8'; + // Unicode BOM is U+FEFF, but after encoded, it will look like this. 
+ if ($first3 == chr(0xEF).chr(0xBB).chr(0xBF)) { + $this->input = substr($this->input, 3); + } elseif ($first4 == chr(0x00).chr(0x00).chr(0xFE).chr(0xFF)) { + $encoding = 'UTF-32BE'; + $this->input = substr($this->input, 4); + } elseif ($first4 == chr(0xFF).chr(0xFE).chr(0x00).chr(0x00)) { + $encoding = 'UTF-32LE'; + $this->input = substr($this->input, 4); + } elseif ($first2 == chr(0xFE).chr(0xFF)) { + $encoding = 'UTF-16BE'; + $this->input = substr($this->input, 2); + } elseif ($first2 == chr(0xFF).chr(0xFE)) { + $encoding = 'UTF-16LE'; + $this->input = substr($this->input, 2); + } + // Convert only non-8-bit files. + if ($encoding != 'UTF-8') { + $this->input = mb_convert_encoding($this->input, 'UTF-8', $encoding); + } $this->input = str_replace("\r\n", "\n", $this->input); $this->inputLength = strlen($this->input); @@ -307,12 +307,12 @@ protected function isRegexpLiteral() return true; } - // we have to check for a preceding keyword, and we don't need to pattern - // match over the whole output. - $recentOutput = substr($this->output, -10); + // we have to check for a preceding keyword, and we don't need to pattern + // match over the whole output. 
+ $recentOutput = substr($this->output, -10); - // check if return/typeof directly precede a pattern without a space - foreach (array('return', 'typeof') as $keyword) { + // check if return/typeof directly precede a pattern without a space + foreach (array('return', 'typeof') as $keyword) { if ($this->a !== substr($keyword, -1)) { // certainly wasn't keyword continue; @@ -324,13 +324,13 @@ protected function isRegexpLiteral() } } - // check all keywords - if ($this->a === ' ' || $this->a === "\n") { - if (preg_match('~(^|[\\s\\S])(?:case|else|in|return|typeof)$~', $recentOutput, $m)) { - if ($m[1] === '' || !$this->isAlphaNum($m[1])) { - return true; - } - } + // check all keywords + if ($this->a === ' ' || $this->a === "\n") { + if (preg_match('~(^|[\\s\\S])(?:case|else|in|return|typeof)$~', $recentOutput, $m)) { + if ($m[1] === '' || !$this->isAlphaNum($m[1])) { + return true; + } + } } return false; From c77c3014c3b9ff638410b105ac02826e91e361b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hans-J=C3=BCrgen=20Tappe?= Date: Wed, 8 Apr 2015 11:41:48 +0200 Subject: [PATCH 4/4] Implementation of the UTF-8 BOM Encoding class. 
--- src/JSMin/EncodingDetector.php | 139 ++++++++++++++++++++++++++++++++ src/JSMin/EncodingException.php | 6 ++ src/JSMin/JSMin.php | 30 +------ 3 files changed, 147 insertions(+), 28 deletions(-) create mode 100644 src/JSMin/EncodingDetector.php create mode 100644 src/JSMin/EncodingException.php diff --git a/src/JSMin/EncodingDetector.php b/src/JSMin/EncodingDetector.php new file mode 100644 index 0000000..6747690 --- /dev/null +++ b/src/JSMin/EncodingDetector.php @@ -0,0 +1,139 @@ + + * @license http://opensource.org/licenses/mit-license.php MIT License + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of + * this software and associated documentation files (the "Software"), to deal in + * the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is furnished to do + * so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * The Software shall be used for Good, not Evil. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +class EncodingDetector +{ + /** + * Default Target Encoding + */ + const DEFAULT_ENCODING = 'UTF-8'; + + /** + * Debug output switch + */ + const DEBUG = false; + + + /** + * Decode input file content to UTF-8, taking BOM into account. 
+ * + * @param string $input Input string + * @throws EncodingException + */ + public static function decode($input, $targetEncoding = NULL) + { + // Check for the target encoding. + if (is_null($targetEncoding)) { + $targetEncoding = self::get_encoding(); + } + + // Remove the utf-8 BOM to save transfer bytes. + // Otherwise, line breaks before the 2nd comment are kept and + // lots of zero bytes stay, leading to additional waste and + // parsing + // exceptions. + $first2 = substr($input, 0, 2); + $first3 = substr($input, 0, 3); + $first4 = substr($input, 0, 4); + + // Unicode BOM is U+FEFF, but after encoded, it will look + // like this. + if ($first3 == chr(0xEF).chr(0xBB).chr(0xBF)) { + $encoding = 'UTF-8'; + $input = substr($input, 3); + } elseif ($first4 == chr(0x00).chr(0x00).chr(0xFE).chr(0xFF)) { + $encoding = 'UTF-32BE'; + $input = substr($input, 4); + } elseif ($first4 == chr(0xFF).chr(0xFE).chr(0x00).chr(0x00)) { + $encoding = 'UTF-32LE'; + $input = substr($input, 4); + } elseif ($first2 == chr(0xFE).chr(0xFF)) { + $encoding = 'UTF-16BE'; + $input = substr($input, 2); + } elseif ($first2 == chr(0xFF).chr(0xFE)) { + $encoding = 'UTF-16LE'; + $input = substr($input, 2); + } else { + $encoding = 'UTF-8'; + // No BOM + } + + // Convert only convertible files. + if (strtoupper($encoding) != strtoupper($targetEncoding)) { + if (self::check_mbstring()) { + $input = mb_convert_encoding($input, $targetEncoding, $encoding); + } else { + $result = iconv( + $in_charset = $encoding, + $out_charset = $targetEncoding, + $input); + if (false === $result) + { + throw new EncodingException('Input string could not be converted.'); + } else { + $input = $result; + } + } + } + + return $input; + } + + /** + * Check the availability of mbstring. 
+ * @returns boolean + */ + private static function check_mbstring() + { + if (function_exists('mb_strlen') && + ((int)ini_get('mbstring.func_overload') & 2)) { + self::DEBUG && error_log(__CLASS__.'::'.__FUNCTION__.': true'); + return true; + } else { + self::DEBUG && error_log(__CLASS__.'::'.__FUNCTION__.': false'); + return false; + } + } + + /** + * Get the target encoding. + * @returns string + */ + private static function get_encoding() + { + $iniValue = ini_get('default_encoding'); + if (!is_null($iniValue) && $iniValue != '') { + self::DEBUG && error_log(__CLASS__.'::'.__FUNCTION__.': '.$iniValue.' (INI VALUE)'); + return $iniValue; + } else { + self::DEBUG && error_log(__CLASS__.'::'.__FUNCTION__.': '.self::DEFAULT_ENCODING.' (DEFAULT)'); + return self::DEFAULT_ENCODING; + } + } +} diff --git a/src/JSMin/EncodingException.php b/src/JSMin/EncodingException.php new file mode 100644 index 0000000..a429225 --- /dev/null +++ b/src/JSMin/EncodingException.php @@ -0,0 +1,6 @@ +input, 0, 2); - $first3 = substr($this->input, 0, 3); - $first4 = substr($this->input, 0, 4); - $encoding = 'UTF-8'; - // Unicode BOM is U+FEFF, but after encoded, it will look like this. - if ($first3 == chr(0xEF).chr(0xBB).chr(0xBF)) { - $this->input = substr($this->input, 3); - } elseif ($first4 == chr(0x00).chr(0x00).chr(0xFE).chr(0xFF)) { - $encoding = 'UTF-32BE'; - $this->input = substr($this->input, 4); - } elseif ($first4 == chr(0xFF).chr(0xFE).chr(0x00).chr(0x00)) { - $encoding = 'UTF-32LE'; - $this->input = substr($this->input, 4); - } elseif ($first2 == chr(0xFE).chr(0xFF)) { - $encoding = 'UTF-16BE'; - $this->input = substr($this->input, 2); - } elseif ($first2 == chr(0xFF).chr(0xFE)) { - $encoding = 'UTF-16LE'; - $this->input = substr($this->input, 2); - } - // Convert only non-8-bit files. 
- if ($encoding != 'UTF-8') { - $this->input = mb_convert_encoding($this->input, 'UTF-8', $encoding); - } + // Encode the input string, taking the UTF-8 BOM into account + $this->input = EncodingDetector::decode($this->input); $this->input = str_replace("\r\n", "\n", $this->input); $this->inputLength = strlen($this->input);