From ebe54c1bcdf29588f6e95b3e7b2f96f953216268 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Hans-J=C3=BCrgen=20Tappe?=
 <hjtappe@users.noreply.github.com>
Date: Wed, 25 Mar 2015 23:05:30 +0100
Subject: [PATCH 1/4] Add a test file demonstrating UTF-8 BOM handling for a
 Hungarian translation file in the widely used CKEditor.

---
 tests/Resources/minify/expected/utf8-with-bom.js |   3 +++
 tests/Resources/minify/input/utf8-with-bom.js    | Bin 0 -> 524 bytes
 2 files changed, 3 insertions(+)
 create mode 100644 tests/Resources/minify/expected/utf8-with-bom.js
 create mode 100644 tests/Resources/minify/input/utf8-with-bom.js

diff --git a/tests/Resources/minify/expected/utf8-with-bom.js b/tests/Resources/minify/expected/utf8-with-bom.js
new file mode 100644
index 0000000..3206cb0
--- /dev/null
+++ b/tests/Resources/minify/expected/utf8-with-bom.js
@@ -0,0 +1,3 @@
+MyClass._A=new Array
+("One","Two","Three");MyClass._B=new Array
+("Ten","Twenty","Thirty");
\ No newline at end of file
diff --git a/tests/Resources/minify/input/utf8-with-bom.js b/tests/Resources/minify/input/utf8-with-bom.js
new file mode 100644
index 0000000000000000000000000000000000000000..e151dd9e9b3af638b91b3f840a94957abcf526db
GIT binary patch
literal 524
zcmaivOH0E*6ot>)UlFphVhwJ(5f@c(QG6_nYl%s;0W%Fvr0t(qzk8=GS{E`5!#(%B
z=Ib-li4r}kP^(Vws+nfYiC)!g=SG3~mU-Y9XUq+%KnmYIDc23?xh6V;d&Du{Tyr@6
zFSkxhEzo|b#HC^vJ!-8T>Bwkc1e|t0_b^v5EwyyDb$SD-zy@x)S4BVb?R{r%#@)48
zvl;fiVw)HLRlCj0{Jqmg*Wd-po-WUtqP*k=e67utpDfvLoPVrT557xv>PP~T!(8zG
sMCPb!PO6bEoS7-ZuZ3wF`}N=StAEpTwBaCY<=8;S!wwBu5sxjt0UHWXNdN!<

literal 0
HcmV?d00001


From 4a447e0c9d93f9bd5f9321488131b83327d57e75 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Hans-J=C3=BCrgen=20Tappe?=
 <hjtappe@users.noreply.github.com>
Date: Wed, 25 Mar 2015 23:07:49 +0100
Subject: [PATCH 2/4] Fix parsing of the UTF-8 BOM to retrieve the correctly
 encoded content.

---
 src/JSMin/JSMin.php | 31 ++++++++++++++++++++++++++++---
 1 file changed, 28 insertions(+), 3 deletions(-)

diff --git a/src/JSMin/JSMin.php b/src/JSMin/JSMin.php
index 0c2aeb7..bbe54c2 100644
--- a/src/JSMin/JSMin.php
+++ b/src/JSMin/JSMin.php
@@ -111,9 +111,34 @@ public function min()
             mb_internal_encoding('8bit');
         }
 
-        if (isset($this->input[0]) && $this->input[0] === "\xef") {
-            $this->input = substr($this->input, 3);
-        }
+    	// Remove the utf-8 BOM to save transfer bytes.
+		// Otherwise, line breaks before the 2nd comment are kept and
+		// lots of zero bytes stay, leading to additional waste and parsing
+		// exceptions.
+		$first2 = substr($this->input, 0, 2);
+		$first3 = substr($this->input, 0, 3);
+		$first4 = substr($this->input, 0, 4);
+		$encoding = 'UTF-8';
+		// Unicode BOM is U+FEFF, but after encoded, it will look like this.
+		if ($first3 == chr(0xEF).chr(0xBB).chr(0xBF)) {
+			$this->input = substr($this->input, 3);
+		} elseif ($first4 == chr(0x00).chr(0x00).chr(0xFE).chr(0xFF)) {
+			$encoding = 'UTF-32BE';
+			$this->input = substr($this->input, 4);
+		} elseif ($first4 == chr(0xFF).chr(0xFE).chr(0x00).chr(0x00)) {
+			$encoding = 'UTF-32LE';
+			$this->input = substr($this->input, 4);
+		} elseif ($first2 == chr(0xFE).chr(0xFF)) {
+			$encoding = 'UTF-16BE';
+			$this->input = substr($this->input, 2);
+		} elseif ($first2 == chr(0xFF).chr(0xFE)) {
+			$encoding = 'UTF-16LE';
+			$this->input = substr($this->input, 2);
+		}
+		// Convert only non-8-bit files.
+		if ($encoding != 'UTF-8') {
+			$this->input = mb_convert_encoding($this->input, 'UTF-8', $encoding);
+		}
 
         $this->input = str_replace("\r\n", "\n", $this->input);
         $this->inputLength = strlen($this->input);

From 74717eaf6a7dffb7d9a8c89bfa83a7bfb7d75a1f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Hans-J=C3=BCrgen=20Tappe?=
 <hjtappe@users.noreply.github.com>
Date: Sat, 28 Mar 2015 21:52:02 +0100
Subject: [PATCH 3/4] Correct coding style: indent with spaces instead of tabs.

---
 src/JSMin/JSMin.php | 80 ++++++++++++++++++++++-----------------------
 1 file changed, 40 insertions(+), 40 deletions(-)

diff --git a/src/JSMin/JSMin.php b/src/JSMin/JSMin.php
index bbe54c2..88eb3a2 100644
--- a/src/JSMin/JSMin.php
+++ b/src/JSMin/JSMin.php
@@ -111,34 +111,34 @@ public function min()
             mb_internal_encoding('8bit');
         }
 
-    	// Remove the utf-8 BOM to save transfer bytes.
-		// Otherwise, line breaks before the 2nd comment are kept and
-		// lots of zero bytes stay, leading to additional waste and parsing
-		// exceptions.
-		$first2 = substr($this->input, 0, 2);
-		$first3 = substr($this->input, 0, 3);
-		$first4 = substr($this->input, 0, 4);
-		$encoding = 'UTF-8';
-		// Unicode BOM is U+FEFF, but after encoded, it will look like this.
-		if ($first3 == chr(0xEF).chr(0xBB).chr(0xBF)) {
-			$this->input = substr($this->input, 3);
-		} elseif ($first4 == chr(0x00).chr(0x00).chr(0xFE).chr(0xFF)) {
-			$encoding = 'UTF-32BE';
-			$this->input = substr($this->input, 4);
-		} elseif ($first4 == chr(0xFF).chr(0xFE).chr(0x00).chr(0x00)) {
-			$encoding = 'UTF-32LE';
-			$this->input = substr($this->input, 4);
-		} elseif ($first2 == chr(0xFE).chr(0xFF)) {
-			$encoding = 'UTF-16BE';
-			$this->input = substr($this->input, 2);
-		} elseif ($first2 == chr(0xFF).chr(0xFE)) {
-			$encoding = 'UTF-16LE';
-			$this->input = substr($this->input, 2);
-		}
-		// Convert only non-8-bit files.
-		if ($encoding != 'UTF-8') {
-			$this->input = mb_convert_encoding($this->input, 'UTF-8', $encoding);
-		}
+        // Remove the utf-8 BOM to save transfer bytes.
+        // Otherwise, line breaks before the 2nd comment are kept and
+        // lots of zero bytes stay, leading to additional waste and parsing
+        // exceptions.
+        $first2 = substr($this->input, 0, 2);
+        $first3 = substr($this->input, 0, 3);
+        $first4 = substr($this->input, 0, 4);
+        $encoding = 'UTF-8';
+        // Unicode BOM is U+FEFF, but after encoded, it will look like this.
+        if ($first3 == chr(0xEF).chr(0xBB).chr(0xBF)) {
+            $this->input = substr($this->input, 3);
+        } elseif ($first4 == chr(0x00).chr(0x00).chr(0xFE).chr(0xFF)) {
+            $encoding = 'UTF-32BE';
+            $this->input = substr($this->input, 4);
+        } elseif ($first4 == chr(0xFF).chr(0xFE).chr(0x00).chr(0x00)) {
+            $encoding = 'UTF-32LE';
+            $this->input = substr($this->input, 4);
+        } elseif ($first2 == chr(0xFE).chr(0xFF)) {
+            $encoding = 'UTF-16BE';
+            $this->input = substr($this->input, 2);
+        } elseif ($first2 == chr(0xFF).chr(0xFE)) {
+            $encoding = 'UTF-16LE';
+            $this->input = substr($this->input, 2);
+        }
+        // Convert only non-8-bit files.
+        if ($encoding != 'UTF-8') {
+            $this->input = mb_convert_encoding($this->input, 'UTF-8', $encoding);
+        }
 
         $this->input = str_replace("\r\n", "\n", $this->input);
         $this->inputLength = strlen($this->input);
@@ -307,12 +307,12 @@ protected function isRegexpLiteral()
             return true;
         }
 
-		// we have to check for a preceding keyword, and we don't need to pattern
-		// match over the whole output.
-		$recentOutput = substr($this->output, -10);
+        // we have to check for a preceding keyword, and we don't need to pattern
+        // match over the whole output.
+        $recentOutput = substr($this->output, -10);
 
-		// check if return/typeof directly precede a pattern without a space
-		foreach (array('return', 'typeof') as $keyword) {
+        // check if return/typeof directly precede a pattern without a space
+        foreach (array('return', 'typeof') as $keyword) {
             if ($this->a !== substr($keyword, -1)) {
                 // certainly wasn't keyword
                 continue;
@@ -324,13 +324,13 @@ protected function isRegexpLiteral()
             }
         }
 
-		// check all keywords
-		if ($this->a === ' ' || $this->a === "\n") {
-			if (preg_match('~(^|[\\s\\S])(?:case|else|in|return|typeof)$~', $recentOutput, $m)) {
-				if ($m[1] === '' || !$this->isAlphaNum($m[1])) {
-					return true;
-				}
-			}
+        // check all keywords
+        if ($this->a === ' ' || $this->a === "\n") {
+            if (preg_match('~(^|[\\s\\S])(?:case|else|in|return|typeof)$~', $recentOutput, $m)) {
+                if ($m[1] === '' || !$this->isAlphaNum($m[1])) {
+                    return true;
+                }
+            }
         }
 
         return false;

From c77c3014c3b9ff638410b105ac02826e91e361b6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Hans-J=C3=BCrgen=20Tappe?=
 <hjtappe@users.noreply.github.com>
Date: Wed, 8 Apr 2015 11:41:48 +0200
Subject: [PATCH 4/4] Move BOM detection and decoding into an EncodingDetector class.

---
 src/JSMin/EncodingDetector.php  | 139 ++++++++++++++++++++++++++++++++
 src/JSMin/EncodingException.php |   6 ++
 src/JSMin/JSMin.php             |  30 +------
 3 files changed, 147 insertions(+), 28 deletions(-)
 create mode 100644 src/JSMin/EncodingDetector.php
 create mode 100644 src/JSMin/EncodingException.php

diff --git a/src/JSMin/EncodingDetector.php b/src/JSMin/EncodingDetector.php
new file mode 100644
index 0000000..6747690
--- /dev/null
+++ b/src/JSMin/EncodingDetector.php
@@ -0,0 +1,139 @@
+<?php
+
+namespace JSMin;
+
+/**
+ * @package JSMin
+ * @copyright 2015 Hans-Jürgen Tappe <hjtappe@users.noreply.github.com>
+ * @license http://opensource.org/licenses/mit-license.php MIT License
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is furnished to do
+ * so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * The Software shall be used for Good, not Evil.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+class EncodingDetector
+{
+    /**
+     * Encoding used when decode() is called without a target encoding.
+     */
+    const DEFAULT_ENCODING = 'UTF-8';
+
+    /**
+     * When true, the helper methods report their decisions via error_log().
+     */
+    const DEBUG = false;
+
+    /**
+     * Strip a leading Unicode BOM and convert the input to the target encoding.
+     *
+     * Recognised BOMs: UTF-8, UTF-16 (BE/LE) and UTF-32 (BE/LE). Input without a
+     * BOM is treated as UTF-8.
+     *
+     * @param string      $input          Raw file content
+     * @param string|null $targetEncoding Desired encoding; null selects get_encoding()
+     * @return string Input without BOM, converted to the target encoding
+     * @throws EncodingException If no converter is available or conversion fails
+     */
+    public static function decode($input, $targetEncoding = null)
+    {
+        if ($targetEncoding === null) {
+            $targetEncoding = self::get_encoding();
+        }
+
+        // U+FEFF as encoded by each UTF. UTF-32LE must be tested before UTF-16LE
+        // because both signatures start with the bytes FF FE.
+        $signatures = array(
+            "\xEF\xBB\xBF"     => 'UTF-8',
+            "\x00\x00\xFE\xFF" => 'UTF-32BE',
+            "\xFF\xFE\x00\x00" => 'UTF-32LE',
+            "\xFE\xFF"         => 'UTF-16BE',
+            "\xFF\xFE"         => 'UTF-16LE',
+        );
+
+        $encoding = 'UTF-8'; // assumed when no BOM is present
+        foreach ($signatures as $bom => $name) {
+            if (strncmp($input, $bom, strlen($bom)) === 0) {
+                $encoding = $name;
+                // Cast: before PHP 8 substr() returns false for a BOM-only input.
+                $input = (string) substr($input, strlen($bom));
+                break;
+            }
+        }
+
+        // Nothing to convert when source and target encodings already match.
+        if (strcasecmp($encoding, $targetEncoding) === 0) {
+            return $input;
+        }
+
+        // Prefer mbstring, fall back to iconv, and fail with an exception rather
+        // than an undefined-function error when neither extension is loaded.
+        if (self::check_mbstring()) {
+            $result = mb_convert_encoding($input, $targetEncoding, $encoding);
+        } elseif (function_exists('iconv')) {
+            $result = iconv($encoding, $targetEncoding, $input);
+        } else {
+            throw new EncodingException('Neither mbstring nor iconv is available.');
+        }
+
+        if (false === $result) {
+            throw new EncodingException('Input string could not be converted.');
+        }
+
+        return $result;
+    }
+
+    /**
+     * Check whether mbstring can perform the conversion.
+     *
+     * Only mb_convert_encoding() is required; the mbstring.func_overload setting
+     * plays no role in whether that function can be called.
+     *
+     * @return boolean
+     */
+    private static function check_mbstring()
+    {
+        $available = function_exists('mb_convert_encoding');
+        self::DEBUG && error_log(__CLASS__.'::'.__FUNCTION__.': '.($available ? 'true' : 'false'));
+
+        return $available;
+    }
+
+    /**
+     * Get the target encoding used when the caller passes none.
+     *
+     * NOTE(review): "default_encoding" is not a registered php.ini directive, so
+     * ini_get() returns false and DEFAULT_ENCODING is used. The real directive is
+     * "default_charset"; switching to it would make the output depend on php.ini
+     * and could convert lossily, so confirm the intent before renaming.
+     *
+     * @return string
+     */
+    private static function get_encoding()
+    {
+        $iniValue = ini_get('default_encoding');
+        if (is_string($iniValue) && $iniValue !== '') {
+            self::DEBUG && error_log(__CLASS__.'::'.__FUNCTION__.': '.$iniValue.' (INI VALUE)');
+            return $iniValue;
+        }
+
+        self::DEBUG && error_log(__CLASS__.'::'.__FUNCTION__.': '.self::DEFAULT_ENCODING.' (DEFAULT)');
+
+        return self::DEFAULT_ENCODING;
+    }
+}
diff --git a/src/JSMin/EncodingException.php b/src/JSMin/EncodingException.php
new file mode 100644
index 0000000..a429225
--- /dev/null
+++ b/src/JSMin/EncodingException.php
@@ -0,0 +1,6 @@
+<?php
+
+namespace JSMin;
+
+/** Signals that EncodingDetector::decode() could not convert its input. */
+class EncodingException extends \Exception {}
diff --git a/src/JSMin/JSMin.php b/src/JSMin/JSMin.php
index 88eb3a2..9a8550b 100644
--- a/src/JSMin/JSMin.php
+++ b/src/JSMin/JSMin.php
@@ -111,34 +111,8 @@ public function min()
             mb_internal_encoding('8bit');
         }
 
-        // Remove the utf-8 BOM to save transfer bytes.
-        // Otherwise, line breaks before the 2nd comment are kept and
-        // lots of zero bytes stay, leading to additional waste and parsing
-        // exceptions.
-        $first2 = substr($this->input, 0, 2);
-        $first3 = substr($this->input, 0, 3);
-        $first4 = substr($this->input, 0, 4);
-        $encoding = 'UTF-8';
-        // Unicode BOM is U+FEFF, but after encoded, it will look like this.
-        if ($first3 == chr(0xEF).chr(0xBB).chr(0xBF)) {
-            $this->input = substr($this->input, 3);
-        } elseif ($first4 == chr(0x00).chr(0x00).chr(0xFE).chr(0xFF)) {
-            $encoding = 'UTF-32BE';
-            $this->input = substr($this->input, 4);
-        } elseif ($first4 == chr(0xFF).chr(0xFE).chr(0x00).chr(0x00)) {
-            $encoding = 'UTF-32LE';
-            $this->input = substr($this->input, 4);
-        } elseif ($first2 == chr(0xFE).chr(0xFF)) {
-            $encoding = 'UTF-16BE';
-            $this->input = substr($this->input, 2);
-        } elseif ($first2 == chr(0xFF).chr(0xFE)) {
-            $encoding = 'UTF-16LE';
-            $this->input = substr($this->input, 2);
-        }
-        // Convert only non-8-bit files.
-        if ($encoding != 'UTF-8') {
-            $this->input = mb_convert_encoding($this->input, 'UTF-8', $encoding);
-        }
+        // Decode the input string, stripping any Unicode BOM (UTF-8/16/32)
+        $this->input = EncodingDetector::decode($this->input);
 
         $this->input = str_replace("\r\n", "\n", $this->input);
         $this->inputLength = strlen($this->input);