Fix a problem with PHP4 and UTF-8 byte order marks (BOM) in t3lib_div::xml2array()
author Martin Kutschker <martin.t.kutschker@blackbox.net>
Tue, 21 Nov 2006 08:53:40 +0000 (08:53 +0000)
committer Martin Kutschker <martin.t.kutschker@blackbox.net>
Tue, 21 Nov 2006 08:53:40 +0000 (08:53 +0000)
git-svn-id: https://svn.typo3.org/TYPO3v4/Core/branches/TYPO3_4-0@1805 709f56b5-9817-0410-a4d7-c38de5d9e867

ChangeLog
t3lib/class.t3lib_div.php

index 4a1089a..a8eea28 100755 (executable)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
+2006-11-21  Martin Kutschker  <martin.t.kutschker@blackbox.net>
+
+       * Fix a problem with PHP4 and UTF-8 byte order marks (BOM) in t3lib_div::xml2array()
+
 2006-11-17  Michael Stucki  <michael@typo3.org>
 
        * Fixed bug #4059: options.hideRecords.pages doesn't work for mounted pages - Thanks to Marc Bastian Heinrichs for providing a fix!
index eb3767b..0222af6 100755 (executable)
--- a/t3lib/class.t3lib_div.php
+++ b/t3lib/class.t3lib_div.php
@@ -2131,14 +2131,18 @@ class t3lib_div {
                xml_parser_set_option($parser, XML_OPTION_CASE_FOLDING, 0);
                xml_parser_set_option($parser, XML_OPTION_SKIP_WHITE, 0);
 
-                       // PHP5 fix of charset awareness:
-                       // Problem is: PHP5 apparently detects the charset of the XML file (or defaults to utf-8) and will AUTOMATICALLY convert the content to either utf-8, iso-8859-1 or us-ascii. PHP4 just passed the content through without taking action regarding the charset.
-                       // In TYPO3 we expect that the charset of XML content is NOT handled in the parser but internally in TYPO3 instead. Therefore it would be very nice if PHP5 could be configured to NOT process the charset of the files. But this is not possible for now.
-                       // What we do here fixes the problem but ONLY if the charset is utf-8, iso-8859-1 or us-ascii. That should work for most TYPO3 installations, in particular if people use utf-8 which we highly recommend.
-               if ((double)phpversion()>=5)    {
-                       $ereg_result = array();
-                       ereg('^[[:space:]]*<\?xml[^>]*encoding[[:space:]]*=[[:space:]]*"([^"]*)"',substr($string,0,200),$ereg_result);
-                       $theCharset = $ereg_result[1] ? $ereg_result[1] : ($TYPO3_CONF_VARS['BE']['forceCharset'] ? $TYPO3_CONF_VARS['BE']['forceCharset'] : 'iso-8859-1');
+                       //  PHP4 doesn't like Unicode byte order marks (BOM), so we have to check for them
+                       // The BOM check comes first, so that the PHP5 preg_match() below doesn't have to check for it
+               if(substr($string,0,3)=="\xEF\xBB\xBF") {
+                       xml_parser_set_option($parser, XML_OPTION_TARGET_ENCODING, 'utf-8');
+               }
+                       // PHP 4.x: output charset is the same as the input charset, charsets are handled transparently if not specified in xml_parser_create()
+                       // PHP 5.0.0 & 5.0.1: default output charset is ISO-8859-1, only ASCII, ISO-8859-1 and UTF-8 are supported!!!
+                       // PHP 5.0.2+: default output charset is UTF-8  , only ASCII, ISO-8859-1 and UTF-8 are supported!!!
+               elseif ((double)phpversion()>=5)        {
+                       $match = array();
+                       preg_match('/^[[:space:]]*<\?xml[^>]*encoding[[:space:]]*=[[:space:]]*"([^"]*)"/',substr($string,0,200),$match);
+                       $theCharset = $match[1] ? $match[1] : ($TYPO3_CONF_VARS['BE']['forceCharset'] ? $TYPO3_CONF_VARS['BE']['forceCharset'] : 'iso-8859-1');
                        xml_parser_set_option($parser, XML_OPTION_TARGET_ENCODING, $theCharset);  // us-ascii / utf-8 / iso-8859-1
                }