[TASK] Change various functions to use utf-8 by default
author Steffen Ritter <info@rs-websystems.de>
Sun, 19 Feb 2012 11:40:55 +0000 (12:40 +0100)
committer Steffen Ritter <info@rs-websystems.de>
Sun, 26 Feb 2012 19:34:16 +0000 (20:34 +0100)
Many functions in the TYPO3 core expect a character set parameter to be
defined. Their default used to be iso-8859-1 and is now changed to
utf-8.

Change-Id: I9c228821e95167b67811c8475880707d5c77bdb7
Resolves: #34094
Releases: 4.7
Reviewed-on: http://review.typo3.org/9101
Reviewed-by: Michael Stucki
Tested-by: Michael Stucki
13 files changed:
t3lib/class.t3lib_cs.php
t3lib/class.t3lib_div.php
t3lib/l10n/parser/class.t3lib_l10n_parser_llphp.php
tests/t3lib/class.t3lib_pagerendererTest.php
typo3/sysext/cms/tslib/class.tslib_fe.php
typo3/sysext/impexp/class.tx_impexp.php
typo3/sysext/indexed_search/class.external_parser.php
typo3/sysext/rtehtmlarea/class.tx_rtehtmlarea_base.php
typo3/sysext/rtehtmlarea/mod3/class.tx_rtehtmlarea_browse_links.php
typo3/sysext/rtehtmlarea/mod6/class.tx_rtehtmlarea_parse_html.php
typo3/sysext/rtehtmlarea/pi2/class.tx_rtehtmlarea_pi2.php
typo3/sysext/t3editor/res/tsref/tsref.xml
typo3/template.php

index a49a9ea..712c500 100644 (file)
@@ -572,7 +572,7 @@ class t3lib_cs {
                if (TYPO3_OS == 'WIN') {
                        $cs = $this->script_to_charset_windows[$script] ? $this->script_to_charset_windows[$script] : 'windows-1252';
                } else {
-                       $cs = $this->script_to_charset_unix[$script] ? $this->script_to_charset_unix[$script] : 'iso-8859-1';
+                       $cs = $this->script_to_charset_unix[$script] ? $this->script_to_charset_unix[$script] : 'utf-8';
                }
 
                return $cs;
@@ -814,26 +814,32 @@ class t3lib_cs {
         * @param       boolean         If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
         * @return      string          Output string
         */
-       function entities_to_utf8($str, $alsoStdHtmlEnt = 0) {
+       function entities_to_utf8($str, $alsoStdHtmlEnt = FALSE) {
                if ($alsoStdHtmlEnt) {
-                       $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES)); // Getting them in iso-8859-1 - but thats ok since this is observed below.
+                       $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8'));
                }
 
                $token = md5(microtime());
                $parts = explode($token, preg_replace('/(&([#[:alnum:]]*);)/', $token . '${2}' . $token, $str));
                foreach ($parts as $k => $v) {
-                       if ($k % 2) {
-                               if (substr($v, 0, 1) == '#') { // Dec or hex entities:
-                                       if (substr($v, 1, 1) == 'x') {
-                                               $parts[$k] = $this->UnumberToChar(hexdec(substr($v, 2)));
-                                       } else {
-                                               $parts[$k] = $this->UnumberToChar(substr($v, 1));
-                                       }
-                               } elseif ($alsoStdHtmlEnt && $trans_tbl['&' . $v . ';']) { // Other entities:
-                                       $parts[$k] = $this->utf8_encode($trans_tbl['&' . $v . ';'], 'iso-8859-1');
-                               } else { // No conversion:
-                                       $parts[$k] = '&' . $v . ';';
+                               // only take every second element
+                       if ($k % 2 === 0) {
+                               continue;
+                       }
+
+                       $position = 0;
+                       if (substr($v, $position, 1) == '#') { // Dec or hex entities:
+                               $position++;
+                               if (substr($v, $position, 1) == 'x') {
+                                       $v = hexdec(substr($v, ++$position));
+                               } else {
+                                       $v = substr($v, $position);
                                }
+                               $parts[$k] = $this->UnumberToChar($v);
+                       } elseif ($alsoStdHtmlEnt && isset($trans_tbl['&' . $v . ';'])) { // Other entities:
+                               $parts[$k] = $trans_tbl['&' . $v . ';'];
+                       } else { // No conversion:
+                               $parts[$k] = '&' . $v . ';';
                        }
                }
 
@@ -2346,4 +2352,4 @@ if (defined('TYPO3_MODE') && isset($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLA
        include_once($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
 }
 
-?>
\ No newline at end of file
+?>
index 86186d8..67d6be1 100644 (file)
@@ -375,7 +375,7 @@ final class t3lib_div {
                } else {
                                // this case should not happen
                        $csConvObj = self::makeInstance('t3lib_cs');
-                       return $csConvObj->crop('iso-8859-1', $string, $chars, $appendString);
+                       return $csConvObj->crop('utf-8', $string, $chars, $appendString);
                }
        }
 
@@ -4189,7 +4189,7 @@ final class t3lib_div {
                if (@is_file($fileRef) && $langKey) {
 
                                // Set charsets:
-                       $sourceCharset = $csConvObj->parse_charset($csConvObj->charSetArray[$langKey] ? $csConvObj->charSetArray[$langKey] : 'iso-8859-1');
+                       $sourceCharset = $csConvObj->parse_charset($csConvObj->charSetArray[$langKey] ? $csConvObj->charSetArray[$langKey] : 'utf-8');
                        if ($charset) {
                                $targetCharset = $csConvObj->parse_charset($charset);
                        } else {
@@ -4216,9 +4216,9 @@ final class t3lib_div {
 
                                        // converting the default language (English)
                                        // this needs to be done for a few accented loan words and extension names
-                               if (is_array($LOCAL_LANG['default']) && $targetCharset != 'iso-8859-1') {
+                               if (is_array($LOCAL_LANG['default']) && $targetCharset != 'utf-8') {
                                        foreach ($LOCAL_LANG['default'] as &$labelValue) {
-                                               $labelValue = $csConvObj->conv($labelValue, 'iso-8859-1', $targetCharset);
+                                               $labelValue = $csConvObj->conv($labelValue, 'utf-8', $targetCharset);
                                        }
                                        unset($labelValue);
                                }
@@ -5190,7 +5190,7 @@ final class t3lib_div {
         * @param string $charset Charset used for encoding
         * @return string The encoded string
         */
-       public static function encodeHeader($line, $enc = 'quoted-printable', $charset = 'iso-8859-1') {
+       public static function encodeHeader($line, $enc = 'quoted-printable', $charset = 'utf-8') {
                        // Avoid problems if "###" is found in $line (would conflict with the placeholder which is used below)
                if (strpos($line, '###') !== FALSE) {
                        return $line;
index 953f495..497c7ac 100644 (file)
@@ -140,9 +140,9 @@ class t3lib_l10n_parser_Llphp implements t3lib_l10n_parser {
 
                        // Converting the default language (English)
                        // This needs to be done for a few accented loan words and extension names
-               if (is_array($LOCAL_LANG['default']) && $this->targetCharset !== 'iso-8859-1') {
+               if (is_array($LOCAL_LANG['default']) && $this->targetCharset !== 'utf-8') {
                        foreach ($LOCAL_LANG['default'] as &$labelValue) {
-                               $labelValue = $this->csConvObj->conv($labelValue, 'iso-8859-1', $this->targetCharset);
+                               $labelValue = $this->csConvObj->conv($labelValue, 'utf-8', $this->targetCharset);
                        }
                        unset($labelValue);
                }
@@ -211,7 +211,7 @@ class t3lib_l10n_parser_Llphp implements t3lib_l10n_parser {
         */
        protected function setCharsets($languageKey, $charset) {
                $this->sourceCharset = $this->csConvObj->parse_charset($this->csConvObj->charSetArray[$languageKey]
-                       ? $this->csConvObj->charSetArray[$languageKey] : 'iso-8859-1');
+                       ? $this->csConvObj->charSetArray[$languageKey] : 'utf-8');
                if ($charset) {
                        $this->targetCharset = $this->csConvObj->parse_charset($charset);
                } else {
index 60290ca..775b4bc 100644 (file)
@@ -766,7 +766,7 @@ class t3lib_PageRendererTest extends tx_phpunit_testcase {
        public function isInlineLanguageLabelDeliveredWithNonUTF8() {
                $testPrefix = uniqid('test');
                $this->fixture->loadExtCore();
-               $this->fixture->setCharSet('iso-8859-1');
+               $this->fixture->setCharSet('utf-8');
                $this->fixture->addInlineLanguageLabel($testPrefix, $testPrefix . "_\xd8");
 
                $out = $this->fixture->render();
index d5f2fd5..446f0f9 100644 (file)
         * @var t3lib_cs
         */
        var $csConvObj;
-       var $defaultCharSet = 'iso-8859-1';     // The default charset used in the frontend if nothing else is set.
+       var $defaultCharSet = 'utf-8';  // The default charset used in the frontend if nothing else is set.
        var $renderCharset='';                          // Internal charset of the frontend during rendering. (Default: UTF-8)
        var $metaCharset='';                            // Output charset of the websites content. This is the charset found in the header, meta tag etc. If different from $renderCharset a conversion happens before output to browser. Defaults to ->renderCharset if not set.
        var $localeCharset='';                          // Assumed charset of locale strings.
@@ -4788,7 +4788,7 @@ if (version == "n3") {
 
        /**
         * Converts the charset of the input string if applicable.
-        * The "to" charset is determined by the currently used charset for the page which is "iso-8859-1" by default or set by $GLOBALS['TSFE']->config['config']['renderCharset']
+        * The "to" charset is determined by the currently used charset for the page which is "utf-8" by default or set by $GLOBALS['TSFE']->config['config']['renderCharset']
         * Only if there is a difference between the two charsets will a conversion be made
         * The conversion is done real-time - no caching for performance at this point!
         *
index 7626571..c67a145 100755 (executable)
@@ -965,7 +965,7 @@ class tx_impexp {
                );
 
                        // Creating XML file from $outputArray:
-               $charset = $this->dat['header']['charset'] ? $this->dat['header']['charset'] : 'iso-8859-1';
+               $charset = $this->dat['header']['charset'] ? $this->dat['header']['charset'] : 'utf-8';
                $XML = '<?xml version="1.0" encoding="'.$charset.'" standalone="yes" ?>'.LF;
                $XML.= t3lib_div::array2xml($this->dat,'',0,'T3RecordDocument',0,$options);
 
index b37f045..118d237 100755 (executable)
@@ -493,8 +493,9 @@ class tx_indexed_search_extparse {
                        case 'txt':
                        case 'csv':             // Raw text
                                $content = t3lib_div::getUrl($absFile);
-                                       // TODO: Auto-registration of charset???? -> utf-8 (Current assuming western europe...)
-                               $content = $this->pObj->convertHTMLToUtf8($content, 'iso-8859-1');
+                                       // TODO: Implement auto detection of charset (currently assuming utf-8)
+                               $contentCharset = 'utf-8';
+                               $content = $this->pObj->convertHTMLToUtf8($content, $contentCharset);
                                $contentArr = $this->pObj->splitRegularContent($content);
                                $contentArr['title'] = basename($absFile);      // Make sure the title doesn't expose the absolute path!
                        break;
index 7172785..74c12c1 100644 (file)
@@ -328,7 +328,7 @@ class tx_rtehtmlarea_base extends t3lib_rteapi {
                        $this->OutputCharset = $this->charset;
 
                        $this->contentCharset = $LANG->csConvObj->charSetArray[$this->contentTypo3Language];
-                       $this->contentCharset = $this->contentCharset ? $this->contentCharset : 'iso-8859-1';
+                       $this->contentCharset = $this->contentCharset ? $this->contentCharset : 'utf-8';
                        $this->origContentCharSet = $this->contentCharset;
                        $this->contentCharset = 'utf-8';
 
index 4012ae0..e8ea029 100644 (file)
@@ -1139,7 +1139,7 @@ class tx_rtehtmlarea_browse_links extends browse_links {
 
                $LANG->lang = $this->contentTypo3Language;
                $LANG->origCharSet = $LANG->csConvObj->charSetArray[$this->contentTypo3Language];
-               $LANG->origCharSet = $LANG->origCharSet ? $LANG->origCharSet : 'iso-8859-1';
+               $LANG->origCharSet = $LANG->origCharSet ? $LANG->origCharSet : 'utf-8';
                $LANG->charSet = $this->contentTypo3Charset;
                $LLString = $LANG->sL($string);
 
index 82a0656..20d03ae 100644 (file)
@@ -78,7 +78,7 @@ class tx_rtehtmlarea_parse_html {
 
                $clientInfo = t3lib_div::clientInfo();
                        // the charset of the content element, possibly overidden by forceCharset
-               $toCharSet = t3lib_div::_GP('charset')?t3lib_div::_GP('charset'):'iso-8859-1';
+               $toCharSet = t3lib_div::_GP('charset')?t3lib_div::_GP('charset'):'utf-8';
                        // IE wants it back in utf-8
                if ( $clientInfo['BROWSER']= 'msie') {
                        $toCharSet = 'utf-8';
index 9de5fe3..87edeec 100644 (file)
@@ -169,7 +169,7 @@ class tx_rtehtmlarea_pi2 extends tx_rtehtmlarea_base {
 
                        // Set the charset of the content
                $this->contentCharset = $TSFE->csConvObj->charSetArray[$this->contentTypo3Language];
-               $this->contentCharset = $this->contentCharset ? $this->contentCharset : 'iso-8859-1';
+               $this->contentCharset = $this->contentCharset ? $this->contentCharset : 'utf-8';
                $this->contentCharset = trim($TSFE->config['config']['metaCharset']) ? trim($TSFE->config['config']['metaCharset']) : $this->contentCharset;
 
                /* =======================================
index 866497b..d42e4ec 100644 (file)
@@ -774,7 +774,7 @@ If this property is set, images are not allowed to be scaled up in size. This pa
                </property>
                <property name="notification_email_charset" type="string">
                        <description><![CDATA[Alternative charset for the notification mails.]]></description>
-                       <default><![CDATA[ISO-8859-1]]></default>
+                       <default><![CDATA[utf-8]]></default>
                </property>
                <property name="notification_email_encoding" type="string">
                        <description><![CDATA[This sets the encoding of plaintext emails (notification messages). The default encoding is "quoted-printable". But setting this to eg. "base64" will encode the content with base64 encoding.
index 42116aa..8b64af3 100644 (file)
@@ -128,7 +128,7 @@ class template {
        var $parseTimeFlag = 0;                 // Will output the parsetime of the scripts in milliseconds (for admin-users). Set this to FALSE when releasing TYPO3. Only for dev.
 
                // INTERNAL
-       var $charset = 'iso-8859-1';    // Default charset. see function initCharset()
+       var $charset = 'utf-8'; // Default charset. see function initCharset()
 
        var $sectionFlag=0;                             // Internal: Indicates if a <div>-output section is open
        var $divClass = '';                             // (Default) Class for wrapping <DIV>-tag of page. Is set in class extensions.