Fixed issue #13670: Performance optimization: change while(list() to foreach() (thank...
[Packages/TYPO3.CMS.git] / t3lib / class.t3lib_cs.php
old mode 100755 (executable)
new mode 100644 (file)
index c253e25..bdacd5d
@@ -2,7 +2,7 @@
 /***************************************************************
 *  Copyright notice
 *
-*  (c) 2003-2004 Kasper Skaarhoj (kasper@typo3.com)
+*  (c) 2003-2009 Kasper Skaarhoj (kasperYYYY@typo3.com)
 *  All rights reserved
 *
 *  This script is part of the Typo3 project. The Typo3 project is
@@ -26,7 +26,7 @@
  *
  * $Id$
  *
- * @author     Kasper Skaarhoj <kasper@typo3.com>
+ * @author     Kasper Skaarhoj <kasperYYYY@typo3.com>
  * @author     Martin Kutschker <martin.t.kutschker@blackbox.net>
  */
 /**
  *
  *
  *
- *  128: class t3lib_cs
- *  442:     function parse_charset($charset)
- *  460:     function get_locale_charset($locale)
- *  492:     function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
- *  529:     function utf8_encode($str,$charset)
- *  576:     function utf8_decode($str,$charset,$useEntityForNoChar=0)
- *  619:     function utf8_to_entities($str)
- *  652:     function entities_to_utf8($str,$alsoStdHtmlEnt=0)
- *  686:     function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
- *  736:     function UnumberToChar($cbyte)
- *  781:     function utf8CharToUnumber($str,$hex=0)
+ *  136: class t3lib_cs
+ *  488:     function parse_charset($charset)
+ *  507:     function get_locale_charset($locale)
+ *
+ *              SECTION: Charset Conversion functions
+ *  560:     function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
+ *  600:     function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0)
+ *  617:     function utf8_encode($str,$charset)
+ *  663:     function utf8_decode($str,$charset,$useEntityForNoChar=0)
+ *  706:     function utf8_to_entities($str)
+ *  739:     function entities_to_utf8($str,$alsoStdHtmlEnt=0)
+ *  773:     function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
+ *  823:     function UnumberToChar($cbyte)
+ *  868:     function utf8CharToUnumber($str,$hex=0)
  *
  *              SECTION: Init functions
- *  824:     function initCharset($charset)
- *  885:     function initCaseFoldingUTF8()
- *  973:     function initCaseFolding($charset)
+ *  911:     function initCharset($charset)
+ *  973:     function initUnicodeData($mode=null)
+ * 1198:     function initCaseFolding($charset)
+ * 1260:     function initToASCII($charset)
  *
  *              SECTION: String operation functions
- * 1058:     function substr($charset,$string,$start,$len=null)
- * 1096:     function strlen($charset,$string)
- * 1124:     function crop($charset,$string,$len,$crop='')
- * 1165:     function strtrunc($charset,$string,$len)
- * 1197:     function conv_case($charset,$string,$case)
+ * 1331:     function substr($charset,$string,$start,$len=null)
+ * 1384:     function strlen($charset,$string)
+ * 1414:     function crop($charset,$string,$len,$crop='')
+ * 1467:     function strtrunc($charset,$string,$len)
+ * 1501:     function conv_case($charset,$string,$case)
+ * 1527:     function specCharsToASCII($charset,$string)
+ *
+ *              SECTION: Internal string operation functions
+ * 1567:     function sb_char_mapping($str,$charset,$mode,$opt='')
  *
  *              SECTION: Internal UTF-8 string operation functions
- * 1264:     function utf8_substr($str,$start,$len=null)
- * 1297:     function utf8_strlen($str)
- * 1318:     function utf8_strtrunc($str,$len)
- * 1340:     function utf8_strpos($haystack,$needle,$offset=0)
- * 1363:     function utf8_strrpos($haystack,$needle)
- * 1383:     function utf8_char2byte_pos($str,$pos)
- * 1424:     function utf8_byte2char_pos($str,$pos)
- * 1448:     function utf8_conv_case($str,$case)
+ * 1622:     function utf8_substr($str,$start,$len=null)
+ * 1655:     function utf8_strlen($str)
+ * 1676:     function utf8_strtrunc($str,$len)
+ * 1698:     function utf8_strpos($haystack,$needle,$offset=0)
+ * 1723:     function utf8_strrpos($haystack,$needle)
+ * 1745:     function utf8_char2byte_pos($str,$pos)
+ * 1786:     function utf8_byte2char_pos($str,$pos)
+ * 1809:     function utf8_char_mapping($str,$mode,$opt='')
  *
  *              SECTION: Internal EUC string operation functions
- * 1514:     function euc_strtrunc($str,$len,$charset)
- * 1543:     function euc_substr($str,$start,$charset,$len=null)
- * 1568:     function euc_strlen($str,$charset)
- * 1595:     function euc_char2byte_pos($str,$pos,$charset)
- * 1636:     function euc_conv_case($str,$case,$charset)
+ * 1885:     function euc_strtrunc($str,$len,$charset)
+ * 1914:     function euc_substr($str,$start,$charset,$len=null)
+ * 1939:     function euc_strlen($str,$charset)
+ * 1966:     function euc_char2byte_pos($str,$pos,$charset)
+ * 2007:     function euc_char_mapping($str,$charset,$mode,$opt='')
  *
- * TOTAL FUNCTIONS: 31
+ * TOTAL FUNCTIONS: 35
  * (This index is automatically created/updated by the extension "extdeveval")
  *
  */
  *
  * Functions nearly working on UTF-8 strings:
  *
- * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf_strlen
- * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained 7-bit ASCII
+ * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
+ * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
  * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
  * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
+ * - preg_*: UTF-8 support is compiled into PHP by default nowadays, but could be unavailable; the "u" pattern modifier must be used
  *
  * Functions NOT working on UTF-8 strings:
  *
  * - stripos
  * - substr
  * - strrev
- * - ereg/eregi
  * - split/spliti
- * - preg_*
  * - ...
  *
  */
 /**
  * Class for conversion between charsets
  *
- * @author     Kasper Skaarhoj <kasper@typo3.com>
+ * @author     Kasper Skaarhoj <kasperYYYY@typo3.com>
  * @author     Martin Kutschker <martin.t.kutschker@blackbox.net>
  * @package TYPO3
  * @subpackage t3lib
@@ -134,6 +141,9 @@ class t3lib_cs {
                // An array where case folding data will be stored (cached)
        var $caseFolding=array();
 
+               // An array where charset-to-ASCII mappings are stored (cached)
+       var $toASCII=array();
+
                // This tells the converter which charsets has two bytes per char:
        var $twoByteSets=array(
                'ucs-2'=>1,     // 2-byte Unicode
@@ -148,7 +158,8 @@ class t3lib_cs {
                // This tells the converter which charsets use a scheme like the Extended Unix Code:
        var $eucBasedSets=array(
                'gb2312'=>1,            // Chinese, simplified.
-               'big5'=>1,                      // Chinese, traditional.
+               'big5'=>1,              // Chinese, traditional.
+               'euc-kr'=>1,            // Korean
                'shift_jis'=>1,         // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
        );
 
@@ -160,8 +171,16 @@ class t3lib_cs {
                'cp819' => 'iso-8859-1',
                'ibm819' => 'iso-8859-1',
                'iso-ir-100' => 'iso-8859-1',
-               'iso-ir-109' => 'iso-8859-2',
+               'iso-ir-101' => 'iso-8859-2',
+               'iso-ir-109' => 'iso-8859-3',
+               'iso-ir-110' => 'iso-8859-4',
+               'iso-ir-144' => 'iso-8859-5',
+               'iso-ir-127' => 'iso-8859-6',
+               'iso-ir-126' => 'iso-8859-7',
+               'iso-ir-138' => 'iso-8859-8',
                'iso-ir-148' => 'iso-8859-9',
+               'iso-ir-157' => 'iso-8859-10',
+               'iso-ir-179' => 'iso-8859-13',
                'iso-ir-199' => 'iso-8859-14',
                'iso-ir-203' => 'iso-8859-15',
                'csisolatin1' => 'iso-8859-1',
@@ -222,6 +241,7 @@ class t3lib_cs {
                'sjis' => 'shift_jis',
                'shift-jis' => 'shift_jis',
                'cp932' => 'shift_jis',
+               'cp949' => 'euc-kr',
                'utf7' => 'utf-7',
                'utf8' => 'utf-8',
                'utf16' => 'utf-16',
@@ -231,85 +251,120 @@ class t3lib_cs {
                'ucs4' => 'ucs-4',
        );
 
-               // mapping of iso-639:2 language codes to language (family) names
-       var $lang_to_langfamily=array(
-                       // iso-639:2 language codes, see:
-                       //  http://www.w3.org/WAI/ER/IG/ert/iso639.htm
-                       //  http://www.unicode.org/onlinedat/languages.html
+               // mapping of iso-639-1 language codes to script names
+       var $lang_to_script=array(
+                       // iso-639-1 language codes, see http://www.loc.gov/standards/iso639-2/php/code_list.php
                'ar' => 'arabic',
-               'bg' => 'cyrillic',
-               'cs' => 'east_european',
-               'da' => 'west_european',
-               'de' => 'west_european',
-               'es' => 'west_european',
+               'bg' => 'cyrillic',             // Bulgarian
+               'bs' => 'east_european',        // Bosnian
+               'cs' => 'east_european',        // Czech
+               'da' => 'west_european',        // Danish
+               'de' => 'west_european',        // German
+               'es' => 'west_european',        // Spanish
                'et' => 'estonian',
-               'eu' => 'west_european',
-               'fi' => 'west_european',
-               'fr' => 'west_european',
+               'eo' => 'unicode',              // Esperanto
+               'eu' => 'west_european',        // Basque
+               'fa' => 'arabic',       // Persian
+               'fi' => 'west_european',        // Finnish
+               'fo' => 'west_european',        // Faroese
+               'fr' => 'west_european',        // French
+               'ga' => 'west_european',        // Galician
+               'ge' => 'unicode',                      // Georgian
                'gr' => 'greek',
-               'hr' => 'east_european',
-               'hu' => 'east_european',
-               'iw' => 'hebrew',
-               'is' => 'west_european',
-               'it' => 'west_european',
+               'he' => 'hebrew',               // Hebrew (since 1998)
+               'hi' => 'unicode',              // Hindi
+               'hr' => 'east_european',        // Croatian
+               'hu' => 'east_european',        // Hungarian
+               'iw' => 'hebrew',               // Hebrew (until 1998)
+               'is' => 'west_european',        // Icelandic
+               'it' => 'west_european',        // Italian
                'ja' => 'japanese',
-               'kl' => 'west_european',
+               'kl' => 'west_european',        // Greenlandic
                'ko' => 'korean',
                'lt' => 'lithuanian',
-               'lv' => 'west_european', // Latvian/Lettish
-               'nl' => 'west_european',
-               'no' => 'west_european',
-               'pl' => 'east_european',
-               'pt' => 'west_european',
-               'ro' => 'east_european',
-               'ru' => 'cyrillic',
-               'sk' => 'east_european',
-               'sl' => 'east_european',
-               'sv' => 'west_european',
+               'lv' => 'west_european',        // Latvian/Lettish
+               'nl' => 'west_european',        // Dutch
+               'no' => 'west_european',        // Norwegian
+               'nb' => 'west_european',        // Norwegian Bokmal
+               'nn' => 'west_european',        // Norwegian Nynorsk
+               'pl' => 'east_european',        // Polish
+               'pt' => 'west_european',        // Portuguese
+               'ro' => 'east_european',        // Romanian
+               'ru' => 'cyrillic',             // Russian
+               'sk' => 'east_european',        // Slovak
+               'sl' => 'east_european',        // Slovenian
+               'sr' => 'cyrillic',             // Serbian
+               'sv' => 'west_european',        // Swedish
+               'sq' => 'albanian',             // Albanian
                'th' => 'thai',
-               'uk' => 'cyrillic',
+               'uk' => 'cyrillic',             // Ukrainian
                'vi' => 'vietnamese',
                'zh' => 'chinese',
                        // MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
+                       // http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
+               'ara' => 'arabic',
+               'bgr' => 'cyrillic',            // Bulgarian
+               'cat' => 'west_european',       // Catalan
                'chs' => 'simpl_chinese',
                'cht' => 'trad_chinese',
-               'csy' => 'east_european',
-               'dan' => 'west_european',
-               'deu' => 'west_european',
-               'dea' => 'west_european',
-               'des' => 'west_european',
-               'ena' => 'west_european',
-               'enc' => 'west_european',
-               'eng' => 'west_european',
-               'enz' => 'west_european',
-               'enu' => 'west_european',
-               'nld' => 'west_european',
-               'nlb' => 'west_european',
-               'fin' => 'west_european',
-               'fra' => 'west_european',
-               'frb' => 'west_european',
-               'frc' => 'west_european',
-               'frs' => 'west_european',
+               'csy' => 'east_european',       // Czech
+               'dan' => 'west_european',       // Danish
+               'deu' => 'west_european',       // German
+               'dea' => 'west_european',       // German (Austrian)
+               'des' => 'west_european',       // German (Swiss)
+               'ena' => 'west_european',       // English (Australian)
+               'enc' => 'west_european',       // English (Canadian)
+               'eng' => 'west_european',       // English
+               'enz' => 'west_european',       // English (New Zealand)
+               'enu' => 'west_european',       // English (United States)
+               'euq' => 'west_european',       // Basque
+               'fos' => 'west_european',       // Faroese
+               'far' => 'arabic',      // Persian
+               'fin' => 'west_european',       // Finnish
+               'fra' => 'west_european',       // French
+               'frb' => 'west_european',       // French (Belgian)
+               'frc' => 'west_european',       // French (Canadian)
+               'frs' => 'west_european',       // French (Swiss)
+               'geo' => 'unicode',                     // Georgian
+               'glg' => 'west_european',       // Galician
                'ell' => 'greek',
-               'hun' => 'east_european',
-               'isl' => 'west_euorpean',
-               'ita' => 'west_european',
-               'its' => 'west_european',
+               'heb' => 'hebrew',
+               'hin' => 'unicode',     // Hindi
+               'hun' => 'east_european',       // Hungarian
+               'isl' => 'west_euorpean',       // Icelandic
+               'ita' => 'west_european',       // Italian
+               'its' => 'west_european',       // Italian (Swiss)
                'jpn' => 'japanese',
                'kor' => 'korean',
-               'nor' => 'west_european',
-               'non' => 'west_european',
-               'plk' => 'east_european',
-               'ptg' => 'west_european',
-               'ptb' => 'west_european',
-               'rus' => 'east_european',
-               'sky' => 'east_european',
-               'esp' => 'west_european',
-               'esm' => 'west_european',
-               'esn' => 'west_european',
-               'sve' => 'west_european',
+               'lth' => 'lithuanian',
+               'lvi' => 'west_european',       // Latvian/Lettish
+               'msl' => 'west_european',       // Malay
+               'nlb' => 'west_european',       // Dutch (Belgian)
+               'nld' => 'west_european',       // Dutch
+               'nor' => 'west_european',       // Norwegian (bokmal)
+               'non' => 'west_european',       // Norwegian (nynorsk)
+               'plk' => 'east_european',       // Polish
+               'ptg' => 'west_european',       // Portuguese
+               'ptb' => 'west_european',       // Portuguese (Brazil)
+               'rom' => 'east_european',       // Romanian
+               'rus' => 'cyrillic',            // Russian
+               'slv' => 'east_european',       // Slovenian
+               'sky' => 'east_european',       // Slovak
+               'srl' => 'east_european',       // Serbian (Latin)
+               'srb' => 'cyrillic',            // Serbian (Cyrillic)
+               'esp' => 'west_european',       // Spanish (trad. sort)
+               'esm' => 'west_european',       // Spanish (Mexican)
+               'esn' => 'west_european',       // Spanish (internat. sort)
+               'sve' => 'west_european',       // Swedish
+               'sqi' => 'albanian',            // Albanian
+               'tha' => 'thai',
                'trk' => 'turkish',
+               'ukr' => 'cyrillic',    // Ukrainian
                        // English language names
+               'albanian' => 'albanian',
+               'arabic' => 'arabic',
+               'basque' => 'west_european',
+               'bosnian' => 'east_european',
                'bulgarian' => 'east_european',
                'catalan' => 'west_european',
                'croatian' => 'east_european',
@@ -317,30 +372,44 @@ class t3lib_cs {
                'danish' => 'west_european',
                'dutch' => 'west_european',
                'english' => 'west_european',
+               'esperanto' => 'unicode',
+               'estonian' => 'estonian',
+               'faroese' => 'west_european',
+               'farsi' => 'arabic',
                'finnish' => 'west_european',
                'french' => 'west_european',
                'galician' => 'west_european',
+               'georgian' => 'unicode',
                'german' => 'west_european',
+               'greek' => 'greek',
+               'greenlandic' => 'west_european',
+               'hebrew' => 'hebrew',
+               'hindi' => 'unicode',
                'hungarian' => 'east_european',
                'icelandic' => 'west_european',
                'italian' => 'west_european',
                'latvian' => 'west_european',
                'lettish' => 'west_european',
+               'lithuanian' => 'lithuanian',
+               'malay' => 'west_european',
                'norwegian' => 'west_european',
+               'persian' => 'arabic',
                'polish' => 'east_european',
                'portuguese' => 'west_european',
                'russian' => 'cyrillic',
                'romanian' => 'east_european',
+               'serbian' => 'cyrillic',
                'slovak' => 'east_european',
                'slovenian' => 'east_european',
                'spanish' => 'west_european',
                'svedish' => 'west_european',
-               'turkish' => 'east_european',
+               'that' => 'thai',
+               'turkish' => 'turkish',
                'ukrainian' => 'cyrillic',
        );
 
                // mapping of language (family) names to charsets on Unix
-       var $lang_to_charset_unix=array(
+       var $script_to_charset_unix=array(
                'west_european' => 'iso-8859-1',
                'estonian' => 'iso-8859-1',
                'east_european' => 'iso-8859-2',
@@ -358,10 +427,12 @@ class t3lib_cs {
                'simpl_chinese' => 'gb2312',
                'trad_chinese' => 'big5',
                'vietnamese' => '',
+               'unicode' => 'utf-8',
+               'albanian' => 'utf-8'
        );
 
                // mapping of language (family) names to charsets on Windows
-       var $lang_to_charset_windows=array(
+       var $script_to_charset_windows=array(
                'east_european' => 'windows-1250',
                'cyrillic' => 'windows-1251',
                'west_european' => 'windows-1252',
@@ -374,11 +445,13 @@ class t3lib_cs {
                'lithuanian' => 'windows-1257',
                'vietnamese' => 'windows-1258',
                'thai' => 'cp874',
-               'korean' => 'cp950',
+               'korean' => 'cp949',
                'chinese' => 'gb2312',
                'japanese' => 'shift_jis',
                'simpl_chinese' => 'gb2312',
                'trad_chinese' => 'big5',
+               'albanian' => 'windows-1250',
+               'unicode' => 'utf-8'
        );
 
                // mapping of locale names to charsets
@@ -386,6 +459,7 @@ class t3lib_cs {
                'japanese.euc' => 'euc-jp',
                'ja_jp.ujis' => 'euc-jp',
                'korean.euc' => 'euc-kr',
+               'sr@Latn' => 'iso-8859-2',
                'zh_cn' => 'gb2312',
                'zh_hk' => 'big5',
                'zh_tw' => 'big5',
@@ -430,6 +504,37 @@ class t3lib_cs {
                'jp' => 'shift_jis',
                'lv' => 'utf-8',
                'vn' => 'utf-8',
+               'ca' => 'iso-8859-15',
+               'ba' => 'iso-8859-2',
+               'kr' => 'euc-kr',
+               'eo' => 'utf-8',
+               'my' => '',
+               'hi' => 'utf-8',
+               'fo' => 'utf-8',
+               'fa' => 'utf-8',
+               'sr' => 'utf-8',
+               'sq' => 'utf-8',
+               'ge' => 'utf-8',
+               'ga' => '',
+       );
+
+               // TYPO3 specific: Array with the iso names used for each system language in TYPO3:
+               // A missing key means: the ISO name is the same as the TYPO3 language key
+       var $isoArray = array(
+               'ba' => 'bs',
+               'br' => 'pt_BR',
+               'ch' => 'zh_CN',
+               'cz' => 'cs',
+               'dk' => 'da',
+               'si' => 'sl',
+               'se' => 'sv',
+               'gl' => 'kl',
+               'gr' => 'el',
+               'hk' => 'zh_HK',
+               'kr' => 'ko',
+               'ua' => 'uk',
+               'jp' => 'ja',
+               'vn' => 'vi',
        );
 
        /**
@@ -440,7 +545,7 @@ class t3lib_cs {
         * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
         */
        function parse_charset($charset)        {
-               $charset = strtolower($charset);
+               $charset = trim(strtolower($charset));
                if (isset($this->synonyms[$charset]))   $charset = $this->synonyms[$charset];
 
                return $charset;
@@ -449,9 +554,10 @@ class t3lib_cs {
        /**
         * Get the charset of a locale.
         *
-        * ln        language
-        * ln_CN     language / country
-        * ln_CN.cs  language / country / charset
+        * ln            language
+        * ln_CN         language / country
+        * ln_CN.cs      language / country / charset
+        * ln_CN.cs@mod  language / country / charset / modifier
         *
         * @param       string          Locale string
         * @return      string          Charset resolved for locale string
@@ -463,23 +569,43 @@ class t3lib_cs {
                        // exact locale specific charset?
                if (isset($this->locale_to_charset[$locale]))   return $this->locale_to_charset[$locale];
 
+                       // get modifier
+               list($locale,$modifier) = explode('@',$locale);
+
                        // locale contains charset: use it
                list($locale,$charset) = explode('.',$locale);
                if ($charset)   return $this->parse_charset($charset);
 
+                       // modifier is 'euro' (after charset check, because of xx.utf-8@euro)
+               if ($modifier == 'euro')        return 'iso-8859-15';
+
                        // get language
                list($language,$country) = explode('_',$locale);
-               if (isset($this->lang_to_langfamily[$language]))        $language = $this->lang_to_langfamily[$language];
+               if (isset($this->lang_to_script[$language]))    $script = $this->lang_to_script[$language];
 
                if (TYPO3_OS == 'WIN')  {
-                       $cs = $this->lang_to_charset_windows[$language];
+                       $cs = $this->script_to_charset_windows[$script] ? $this->script_to_charset_windows[$script] : 'windows-1252';
                } else {
-                       $cs = $this->lang_to_charset_unix[$language];
+                       $cs = $this->script_to_charset_unix[$script] ? $this->script_to_charset_unix[$script] : 'iso-8859-1';
                }
 
-               return $cs ? $cs : 'iso-8859-1';
+               return $cs;
        }
 
+
+
+
+
+
+
+
+
+       /********************************************
+        *
+        * Charset Conversion functions
+        *
+        ********************************************/
+
        /**
         * Convert from one charset to another charset.
         *
@@ -488,6 +614,7 @@ class t3lib_cs {
         * @param       string          To charset (the output charset wanted)
         * @param       boolean         If set, then characters that are not available in the destination character set will be encoded as numeric entities
         * @return      string          Converted string
+        * @see convArray()
         */
        function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) {
                if ($fromCS==$toCS)     return $str;
@@ -501,12 +628,12 @@ class t3lib_cs {
                                break;
 
                        case 'iconv':
-                               $conv_str = iconv($str,$fromCS,$toCS.'//TRANSLIT');
+                               $conv_str = iconv($fromCS,$toCS.'//TRANSLIT',$str);
                                if (false !== $conv_str)        return $conv_str;
                                break;
 
                        case 'recode':
-                               $conv_str = recode_string($toCS.'..'.$fromCS,$str);
+                               $conv_str = recode_string($fromCS.'..'.$toCS,$str);
                                if (false !== $conv_str)        return $conv_str;
                                break;
                        }
@@ -518,6 +645,26 @@ class t3lib_cs {
                return $str;
        }
 
+       /**
+        * Converts all string elements of an array from one charset to another; nested arrays are handled recursively, all other values are left untouched.
+        * NOTICE: Array is passed by reference and modified in place!
+        *
+        * @param       array           Input array, possibly multidimensional
+        * @param       string          From charset (the current charset of the strings)
+        * @param       string          To charset (the output charset wanted)
+        * @param       boolean         If set, then characters that are not available in the destination character set will be encoded as numeric entities (passed on to conv())
+        * @return      void
+        * @see conv()
+        */
+       function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0) {
+               foreach($array as $key => $value)       {       // $value is not used; elements are accessed through $array[$key] so the referenced array itself is updated
+                       if (is_array($array[$key]))     {       // Sub-array: recurse, passing the element by reference
+                               $this->convArray($array[$key],$fromCS,$toCS,$useEntityForNoChar);
+                       } elseif (is_string($array[$key])) {    // String: replace with the converted value
+                               $array[$key] = $this->conv($array[$key],$fromCS,$toCS,$useEntityForNoChar);
+                       }
+               }
+       }
 
        /**
         * Converts $str from $charset to UTF-8
@@ -528,6 +675,8 @@ class t3lib_cs {
         */
        function utf8_encode($str,$charset)     {
 
+               if ($charset === 'utf-8')       return $str;
+
                        // Charset is case-insensitive.
                if ($this->initCharset($charset))       {       // Parse conv. table if not already...
                        $strLen = strlen($str);
@@ -536,30 +685,27 @@ class t3lib_cs {
                        for ($a=0;$a<$strLen;$a++)      {       // Traverse each char in string.
                                $chr=substr($str,$a,1);
                                $ord=ord($chr);
-                               if ($this->twoByteSets[$charset])       {       // If the charset has two bytes per char
+                               if (isset($this->twoByteSets[$charset]))        {       // If the charset has two bytes per char
                                        $ord2 = ord($str{$a+1});
-                                       $ord = $ord<<8 & $ord2; // assume big endian
+                                       $ord = $ord<<8 | $ord2; // assume big endian
 
                                        if (isset($this->parsedCharsets[$charset]['local'][$ord]))      {       // If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?)
                                                $outStr.=$this->parsedCharsets[$charset]['local'][$ord];
                                        } else $outStr.=chr($this->noCharByteVal);      // No char exists
                                        $a++;
                                } elseif ($ord>127)     {       // If char has value over 127 it's a multibyte char in UTF-8
-                                       if ($this->eucBasedSets[$charset])      {       // EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
-                                               $a++;
-                                               $ord2=ord(substr($str,$a,1));
-                                               $ord = $ord*256+$ord2;
-                                       }
-                                       elseif ($charset == 'shift_jis' && ($ord <160 || $ord>223))     {       // Shift-JIS is like EUC, but chars between 160 and 223 are single byte
-                                               $a++;
-                                               $ord2=ord(substr($str,$a,1));
-                                               $ord = $ord*256+$ord2;
+                                       if (isset($this->eucBasedSets[$charset]))       {       // EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
+                                               if ($charset != 'shift_jis' || ($ord < 0xA0 || $ord > 0xDF))    {       // Shift-JIS: chars between 160 and 223 are single byte
+                                                       $a++;
+                                                       $ord2=ord(substr($str,$a,1));
+                                                       $ord = $ord*256+$ord2;
+                                               }
                                        }
 
                                        if (isset($this->parsedCharsets[$charset]['local'][$ord]))      {       // If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?)
-                                               $outStr.=$this->parsedCharsets[$charset]['local'][$ord];
-                                       } else $outStr.=chr($this->noCharByteVal);      // No char exists
-                               } else $outStr.=$chr;   // ... otherwise it's just ASCII 0-127 and one byte. Transparent
+                                               $outStr.= $this->parsedCharsets[$charset]['local'][$ord];
+                                       } else $outStr.= chr($this->noCharByteVal);     // No char exists
+                               } else $outStr.= $chr;  // ... otherwise it's just ASCII 0-127 and one byte. Transparent
                        }
                        return $outStr;
                }
@@ -575,6 +721,10 @@ class t3lib_cs {
         */
        function utf8_decode($str,$charset,$useEntityForNoChar=0)       {
 
+               if ($charset === 'utf-8') {
+                       return $str;
+               }
+
                        // Charset is case-insensitive.
                if ($this->initCharset($charset))       {       // Parse conv. table if not already...
                        $strLen = strlen($str);
@@ -655,7 +805,7 @@ class t3lib_cs {
                }
 
                $token = md5(microtime());
-               $parts = explode($token,ereg_replace('(&([#[:alnum:]]*);)',$token.'\2'.$token,$str));
+               $parts = explode($token, preg_replace('/(&([#[:alnum:]]*);)/', $token . '${2}' . $token, $str));
                foreach($parts as $k => $v)     {
                        if ($k%2)       {
                                if (substr($v,0,1)=='#')        {       // Dec or hex entities:
@@ -832,12 +982,12 @@ class t3lib_cs {
                        if ($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile))      {
                                        // Cache file for charsets:
                                        // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
-                               $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/charset_'.$charset.'.tbl');
+                               $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_'.$charset.'.tbl');
                                if ($cacheFile && @is_file($cacheFile)) {
                                        $this->parsedCharsets[$charset]=unserialize(t3lib_div::getUrl($cacheFile));
                                } else {
                                                // Parse conversion table into lines:
-                                       $lines=t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1);
+                                       $lines=t3lib_div::trimExplode(LF,t3lib_div::getUrl($charsetConvTableFile),1);
                                                // Initialize the internal variable holding the conv. table:
                                        $this->parsedCharsets[$charset]=array('local'=>array(),'utf8'=>array());
                                                // traverse the lines:
@@ -847,13 +997,13 @@ class t3lib_cs {
 
                                                                // Detect type if not done yet: (Done on first real line)
                                                                // The "whitespaced" type is on the syntax      "0x0A   0x000A  #LINE FEED"     while   "ms-token" is like              "B9 = U+00B9 : SUPERSCRIPT ONE"
-                                                       if (!$detectedType)             $detectedType = ereg('[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+',$value) ? 'whitespaced' : 'ms-token';
+                                                       if (!$detectedType)             $detectedType = preg_match('/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/',$value) ? 'whitespaced' : 'ms-token';
 
                                                        if ($detectedType=='ms-token')  {
-                                                               list($hexbyte,$utf8) = split('=|:',$value,3);
+                                                               list($hexbyte, $utf8) = preg_split('/[=:]/', $value, 3);
                                                        } elseif ($detectedType=='whitespaced') {
                                                                $regA=array();
-                                                               ereg('[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+',$value,$regA);
+                                                               preg_match('/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/',$value,$regA);
                                                                $hexbyte = $regA[1];
                                                                $utf8 = 'U+'.$regA[2];
                                                        }
@@ -866,7 +1016,7 @@ class t3lib_cs {
                                                }
                                        }
                                        if ($cacheFile) {
-                                               t3lib_div::writeFile($cacheFile,serialize($this->parsedCharsets[$charset]));
+                                               t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->parsedCharsets[$charset]));
                                        }
                                }
                                return 2;
@@ -875,29 +1025,47 @@ class t3lib_cs {
        }
 
        /**
-        * This function initializes the UTF-8 case folding table.
+        * This function initializes all UTF-8 character data tables.
         *
         * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
         *
+        * @param       string          Mode ("case", "ascii", ...)
         * @return      integer         Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
         * @access private
         */
-       function initCaseFoldingUTF8()  {
-                       // Only process if the case table is not yet loaded:
-               if (is_array($this->caseFolding['utf-8']))      return 1;
+       function initUnicodeData($mode=null)    {
+                       // cache files
+               $cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
+               $cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');
+
+                       // Only process if the tables are not yet loaded
+               switch($mode)   {
+                       case 'case':
+                               if (is_array($this->caseFolding['utf-8']))      return 1;
+
+                                       // Use cached version if possible
+                               if ($cacheFileCase && @is_file($cacheFileCase)) {
+                                       $this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase));
+                                       return 2;
+                               }
+                               break;
 
-                       // Use cached version if possible
-               $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cscase_utf-8.tbl');
-               if ($cacheFile && @is_file($cacheFile)) {
-                       $this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFile));
-                       return 2;
+                       case 'ascii':
+                               if (is_array($this->toASCII['utf-8']))  return 1;
+
+                                       // Use cached version if possible
+                               if ($cacheFileASCII && @is_file($cacheFileASCII))       {
+                                       $this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII));
+                                       return 2;
+                               }
+                               break;
                }
 
                        // process main Unicode data file
                $unicodeDataFile = PATH_t3lib.'unidata/UnicodeData.txt';
                if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) return false;
 
-               $fh = fopen($unicodeDataFile,'r');
+               $fh = fopen($unicodeDataFile,'rb');
                if (!$fh)       return false;
 
                        // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
@@ -908,45 +1076,99 @@ class t3lib_cs {
                $utf8CaseFolding['toLower'] = array();
                $utf8CaseFolding['toTitle'] = array();
 
+               $decomposition = array();       // array of temp. decompositions
+               $mark = array();                // array of chars that are marks (eg. composing accents)
+               $number = array();              // array of chars that are numbers (eg. digits)
+               $omit = array();                // array of chars to be omitted (eg. Russian hard sign)
+
                while (!feof($fh))      {
-                       $line = fgets($fh);
-                               // has also other info like character class (digit, white space, etc.) and more
-                       list($char,,,,,,,,,,,,$upper,$lower,$title,) = split(';', rtrim($line));
-                       $char = $this->UnumberToChar(hexdec($char));
-                       if ($upper)     $utf8CaseFolding['toUpper'][$char] = $this->UnumberToChar(hexdec($upper));
-                       if ($lower)     $utf8CaseFolding['toLower'][$char] = $this->UnumberToChar(hexdec($lower));
+                       $line = fgets($fh,4096);
+                               // has a lot of info
+                       list($char,$name,$cat,,,$decomp,,,$num,,,,$upper,$lower,$title,) = explode(';', rtrim($line));
+
+                       $ord = hexdec($char);
+                       if ($ord > 0xFFFF)      break;  // only process the BMP
+
+                       $utf8_char = $this->UnumberToChar($ord);
+
+                       if ($upper)     $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
+                       if ($lower)     $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
                                // store "title" only when different from "upper" (only a few)
-                       if ($title && $title != $upper) $utf8CaseFolding['toTitle'][$char] = $this->UnumberToChar(hexdec($title));
+                       if ($title && $title != $upper) $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
+
+                       switch ($cat{0})        {
+                               case 'M':       // mark (accent, umlaut, ...)
+                                       $mark["U+$char"] = 1;
+                                       break;
+
+                               case 'N':       // numeric value
+                                       if ($ord > 0x80 && $num != '')  $number["U+$char"] = $num;
+                       }
+
+                               // accented Latin letters without "official" decomposition
+                       $match = array();
+                       if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/',$name,$match) && !$decomp)        {
+                               $c = ord($match[2]);
+                               if ($match[1] == 'SMALL')       $c += 32;
+
+                               $decomposition["U+$char"] = array(dechex($c));
+                               continue;
+                       }
+
+                       $match = array();
+                       if (preg_match('/(<.*>)? *(.+)/',$decomp,$match))       {
+                               switch($match[1])       {
+                                       case '<circle>':        // add parenthesis as circle replacement, eg (1)
+                                               $match[2] = '0028 '.$match[2].' 0029';
+                                               break;
+
+                                       case '<square>':        // add square brackets as square replacement, eg [1]
+                                               $match[2] = '005B '.$match[2].' 005D';
+                                               break;
+
+                                       case '<compat>':        // ignore multi char decompositions that start with a space
+                                               if (preg_match('/^0020 /',$match[2]))   continue 2;
+                                               break;
+
+                                               // ignore Arabic and vertical layout presentation decomposition
+                                       case '<initial>':
+                                       case '<medial>':
+                                       case '<final>':
+                                       case '<isolated>':
+                                       case '<vertical>':
+                                               continue 2;
+                               }
+                               $decomposition["U+$char"] = explode(' ', $match[2]);
+                       }
                }
                fclose($fh);
 
                        // process additional Unicode data for casing (allow folded characters to expand into a sequence)
                $specialCasingFile = PATH_t3lib.'unidata/SpecialCasing.txt';
                if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile))        {
-
-                       $fh = fopen($specialCasingFile,'r');
+                       $fh = fopen($specialCasingFile,'rb');
                        if ($fh)        {
                                while (!feof($fh))      {
-                                       $line = fgets($fh);
+                                       $line = fgets($fh,4096);
                                        if ($line{0} != '#' && trim($line) != '')       {
 
                                                list($char,$lower,$title,$upper,$cond) = t3lib_div::trimExplode(';', $line);
                                                if ($cond == '' || $cond{0} == '#')     {
                                                        $utf8_char = $this->UnumberToChar(hexdec($char));
                                                        if ($char != $lower)    {
-                                                               $arr = split(' ',$lower);
+                                                               $arr = explode(' ', $lower);
                                                                for ($i=0; isset($arr[$i]); $i++)       $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
-                                                               $utf8CaseFolding['toLower'][$utf8_char] = implode($arr);
+                                                               $utf8CaseFolding['toLower'][$utf8_char] = implode('',$arr);
                                                        }
                                                        if ($char != $title && $title != $upper)        {
-                                                               $arr = split(' ',$title);
+                                                               $arr = explode(' ', $title);
                                                                for ($i=0; isset($arr[$i]); $i++)       $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
-                                                               $utf8CaseFolding['toTitle'][$utf8_char] = implode($arr);
+                                                               $utf8CaseFolding['toTitle'][$utf8_char] = implode('',$arr);
                                                        }
                                                        if ($char != $upper)    {
-                                                                       $arr = split(' ',$upper);
+                                                                       $arr = explode(' ', $upper);
                                                                for ($i=0; isset($arr[$i]); $i++)       $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
-                                                               $utf8CaseFolding['toUpper'][$utf8_char] = implode($arr);
+                                                               $utf8CaseFolding['toUpper'][$utf8_char] = implode('',$arr);
                                                        }
                                                }
                                        }
@@ -955,8 +1177,74 @@ class t3lib_cs {
                        }
                }
 
-               if ($cacheFile) {
-                               t3lib_div::writeFile($cacheFile,serialize($utf8CaseFolding));
+                       // process custom decompositions
+               $customTranslitFile = PATH_t3lib.'unidata/Translit.txt';
+               if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile))      {
+                       $fh = fopen($customTranslitFile,'rb');
+                       if ($fh)        {
+                               while (!feof($fh))      {
+                                       $line = fgets($fh,4096);
+                                       if ($line{0} != '#' && trim($line) != '')       {
+                                               list($char,$translit) = t3lib_div::trimExplode(';', $line);
+                                               if (!$translit) $omit["U+$char"] = 1;
+                                               $decomposition["U+$char"] = explode(' ', $translit);
+
+                                       }
+                               }
+                               fclose($fh);
+                       }
+               }
+
+                       // decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
+               foreach($decomposition as $from => $to) {
+                       $code_decomp = array();
+
+                       while ($code_value = array_shift($to))  {
+                               if (isset($decomposition["U+$code_value"]))     {       // do recursive decomposition
+                                       foreach(array_reverse($decomposition["U+$code_value"]) as $cv)  {
+                                               array_unshift($to, $cv);
+                                       }
+                               } elseif (!isset($mark["U+$code_value"])) {     // remove mark
+                                       array_push($code_decomp, $code_value);
+                               }
+                       }
+                       if (count($code_decomp) || isset($omit[$from])) {
+                               $decomposition[$from] = $code_decomp;
+                       } else {
+                               unset($decomposition[$from]);
+                       }
+               }
+
+                       // create ascii only mapping
+               $this->toASCII['utf-8'] = array();
+               $ascii =& $this->toASCII['utf-8'];
+
+               foreach($decomposition as $from => $to) {
+                       $code_decomp = array();
+                       while ($code_value = array_shift($to))  {
+                               $ord = hexdec($code_value);
+                               if ($ord > 127)
+                                       continue 2;     // skip decompositions containing non-ASCII chars
+                               else
+                                       array_push($code_decomp,chr($ord));
+                       }
+                       $ascii[$this->UnumberToChar(hexdec($from))] = join('',$code_decomp);
+               }
+
+                       // add numeric decompositions
+               foreach($number as $from => $to)        {
+                       $utf8_char = $this->UnumberToChar(hexdec($from));
+                       if (!isset($ascii[$utf8_char])) {
+                               $ascii[$utf8_char] = $to;
+                       }
+               }
+
+               if ($cacheFileCase)     {
+                               t3lib_div::writeFileToTypo3tempDir($cacheFileCase,serialize($utf8CaseFolding));
+               }
+
+               if ($cacheFileASCII)    {
+                               t3lib_div::writeFileToTypo3tempDir($cacheFileASCII,serialize($ascii));
                }
 
                return 3;
@@ -975,7 +1263,7 @@ class t3lib_cs {
                if (is_array($this->caseFolding[$charset]))     return 1;
 
                        // Use cached version if possible
-               $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cscase_'.$charset.'.tbl');
+               $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_'.$charset.'.tbl');
                if ($cacheFile && @is_file($cacheFile)) {
                        $this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
                        return 2;
@@ -987,23 +1275,26 @@ class t3lib_cs {
                }
 
                        // UTF-8 case folding is used as the base conversion table
-               if (!$this->initCaseFoldingUTF8())      {
+               if (!$this->initUnicodeData('case'))    {
                        return false;
                }
 
                $nochar = chr($this->noCharByteVal);
                foreach ($this->parsedCharsets[$charset]['local'] as $ci => $utf8)      {
                                // reconvert to charset (don't use chr() of numeric value, might be multi-byte)
-                       $c = $this->conv($utf8, 'utf-8', $charset);
+                       $c = $this->utf8_decode($utf8, $charset);
 
-                       $cc = $this->conv($this->caseFolding['utf-8']['toUpper'][$utf8], 'utf-8', $charset);
-                       if ($cc && $cc != $nochar)      $this->caseFolding[$charset]['toUpper'][$c] = $cc;
+                               // $cc = $this->conv($this->caseFolding['utf-8']['toUpper'][$utf8], 'utf-8', $charset);
+                       $cc = $this->utf8_decode($this->caseFolding['utf-8']['toUpper'][$utf8], $charset);
+                       if ($cc != '' && $cc != $nochar)        $this->caseFolding[$charset]['toUpper'][$c] = $cc;
 
-                       $cc = $this->conv($this->caseFolding['utf-8']['toLower'][$utf8], 'utf-8', $charset);
-                       if ($cc && $cc != $nochar)      $this->caseFolding[$charset]['toLower'][$c] = $cc;
+                               // $cc = $this->conv($this->caseFolding['utf-8']['toLower'][$utf8], 'utf-8', $charset);
+                       $cc = $this->utf8_decode($this->caseFolding['utf-8']['toLower'][$utf8], $charset);
+                       if ($cc != '' && $cc != $nochar)        $this->caseFolding[$charset]['toLower'][$c] = $cc;
 
-                       $cc = $this->conv($this->caseFolding['utf-8']['toTitle'][$utf8], 'utf-8', $charset);
-                       if ($cc && $cc != $nochar)      $this->caseFolding[$charset]['toTitle'][$c] = $cc;
+                               // $cc = $this->conv($this->caseFolding['utf-8']['toTitle'][$utf8], 'utf-8', $charset);
+                       $cc = $this->utf8_decode($this->caseFolding['utf-8']['toTitle'][$utf8], $charset);
+                       if ($cc != '' && $cc != $nochar)        $this->caseFolding[$charset]['toTitle'][$c] = $cc;
                }
 
                        // add the ASCII case table
@@ -1015,12 +1306,57 @@ class t3lib_cs {
                }
 
                if ($cacheFile) {
-                               t3lib_div::writeFile($cacheFile,serialize($this->caseFolding[$charset]));
+                               t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->caseFolding[$charset]));
                }
 
                return 3;
        }
 
+       /**
+        * Initializes the to-ASCII conversion table for a charset other than UTF-8 (stored in $this->toASCII[$charset]).
+        * The table is built from the UTF-8 to-ASCII table; this function is automatically called by the ASCII transliteration functions.
+        *
+        * @param       string          Charset for which to initialize conversion.
+        * @return      integer         Returns FALSE on error, otherwise a TRUE value: 1 = table already loaded, 2 = loaded from the cache file, 3 = table built (and cached).
+        * @access private
+        */
+       function initToASCII($charset)  {
+                       // Only process if the to-ASCII table for this charset is not yet loaded:
+               if (is_array($this->toASCII[$charset])) return 1;
+
+                       // Use cached version if possible (serialized table in typo3temp/cs/)
+               $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_'.$charset.'.tbl');
+               if ($cacheFile && @is_file($cacheFile)) {
+                       $this->toASCII[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
+                       return 2;
+               }
+
+                       // init UTF-8 conversion for this charset (the loop below reads $this->parsedCharsets[$charset]['local'])
+               if (!$this->initCharset($charset))      {
+                       return false;
+               }
+
+                       // UTF-8/ASCII transliteration is used as the base conversion table
+               if (!$this->initUnicodeData('ascii'))   {
+                       return false;
+               }
+
+               $nochar = chr($this->noCharByteVal);    // NOTE(review): $nochar is not used anywhere below in this function
+               foreach ($this->parsedCharsets[$charset]['local'] as $ci => $utf8)      {
+                               // reconvert to charset (don't use chr() of numeric value, might be multi-byte)
+                       $c = $this->utf8_decode($utf8, $charset);
+
+                       if (isset($this->toASCII['utf-8'][$utf8]))      {       // copy the ASCII replacement only for characters that have an entry in the UTF-8 table
+                               $this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
+                       }
+               }
+
+               if ($cacheFile) {       // NOTE(review): if no character matched above, $this->toASCII[$charset] was never assigned before being serialized here
+                               t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->toASCII[$charset]));
+               }
+
+               return 3;
+       }
 
 
 
@@ -1062,13 +1398,28 @@ class t3lib_cs {
                                // cannot omit $len, when specifying charset
                        if ($len==null) {
                                $enc = mb_internal_encoding();  // save internal encoding
-                               mb_internal_encoding('utf-8');
+                               mb_internal_encoding($charset);
                                $str = mb_substr($string,$start);
                                mb_internal_encoding($enc);     // restore internal encoding
 
                                return $str;
                        }
-                       else    return mb_substr($string,$start,$len,'utf-8');
+                       else {
+                               return mb_substr($string,$start,$len,$charset);
+                       }
+               } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv')      {
+                               // cannot omit $len, when specifying charset
+                       if ($len==null) {
+                               $enc = iconv_get_encoding('internal_encoding'); // save internal encoding
+                               iconv_set_encoding('internal_encoding',$charset);
+                               $str = iconv_substr($string,$start);
+                               iconv_set_encoding('internal_encoding',$enc);   // restore internal encoding
+
+                               return $str;
+                       }
+                       else {
+                               return iconv_substr($string,$start,$len,$charset);
+                       }
                } elseif ($charset == 'utf-8')  {
                        return $this->utf8_substr($string,$start,$len);
                } elseif ($this->eucBasedSets[$charset])        {
@@ -1096,6 +1447,8 @@ class t3lib_cs {
        function strlen($charset,$string)       {
                if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
                        return mb_strlen($string,$charset);
+               } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv')      {
+                       return iconv_strlen($string,$charset);
                } elseif ($charset == 'utf-8')  {
                        return $this->utf8_strlen($string);
                } elseif ($this->eucBasedSets[$charset])        {
@@ -1110,6 +1463,30 @@ class t3lib_cs {
        }
 
        /**
+        * Crops a string to a given number of characters using the mbstring functions.
+        * A positive length keeps the start of the string, a negative length keeps its end; the crop signifier is added on the cut side.
+        * @param  string               The character set
+        * @param  string               String to be cropped
+        * @param  integer              Crop length (in characters); if 0, or if the string is not longer than its absolute value, the string is returned unchanged
+        * @param  string               Crop signifier (appended for a positive length, prepended for a negative one)
+        * @return string               The shortened string
+        * @see mb_strlen(), mb_substr()
+        */
+       protected function cropMbstring($charset, $string, $len, $crop = '') {
+               if (intval($len) === 0 || mb_strlen($string, $charset) <= abs($len)) { // nothing to crop
+                       return $string;
+               }
+
+               if ($len > 0) { // keep the first $len characters and append the signifier
+                       $string = mb_substr($string, 0, $len, $charset) . $crop;
+               } else { // negative $len: mb_substr() counts the start offset from the end of the string
+                       $string = $crop . mb_substr($string, $len, mb_strlen($string, $charset), $charset);
+               }
+
+               return $string;
+       }
+
+       /**
         * Truncates a string and pre-/appends a string.
         * Unit tested by Kasper
         *
@@ -1122,7 +1499,11 @@ class t3lib_cs {
         * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
         */
        function crop($charset,$string,$len,$crop='')   {
-               if ($len == 0)  return $crop;
+               if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
+                       return $this->cropMbstring($charset, $string, $len, $crop);
+               }
+
+               if (intval($len) == 0)  return $string;
 
                if ($charset == 'utf-8')        {
                        $i = $this->utf8_char2byte_pos($string,$len);
@@ -1140,6 +1521,18 @@ class t3lib_cs {
                if ($i === false)       {       // $len outside actual string length
                        return $string;
                } else  {
+                       if ($len > 0)   {
+                               if (strlen($string{$i}))        {
+                                       return substr($string,0,$i).$crop;
+
+                               }
+                       } else {
+                               if (strlen($string{$i-1}))      {
+                                       return $crop.substr($string,$i);
+                               }
+                       }
+
+/*
                        if (abs($len)<$this->strlen($charset,$string))  {       // Has to use ->strlen() - otherwise multibyte strings ending with a multibyte char will return true here (which is not a catastrophe, but...)
                                if ($len > 0)   {
                                        return substr($string,0,$i).$crop;
@@ -1147,6 +1540,7 @@ class t3lib_cs {
                                        return $crop.substr($string,$i);
                                }
                        }
+*/
                }
                return $string;
        }
@@ -1183,6 +1577,8 @@ class t3lib_cs {
        /**
         * Translates all characters of a string into their respective case values.
         * Unlike strtolower() and strtoupper() this method is locale independent.
+        * Note that the string length may change!
+        * e.g. lower case German "ß" (sharp S) becomes upper case "SS"
         * Unit-tested by Kasper
         * Real case folding is language dependent, this method ignores this fact.
         *
@@ -1194,40 +1590,104 @@ class t3lib_cs {
         * @see strtolower(), strtoupper()
         */
        function conv_case($charset,$string,$case)      {
-               if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring' &&
-                       float(phpversion()) >= 4.3)     {
+               if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
                        if ($case == 'toLower') {
-                               return mb_strtolower($str,'utf-8');
+                               $string = mb_strtolower($string,$charset);
                        } else {
-                               return mb_strtoupper($str,'utf-8');
+                               $string = mb_strtoupper($string,$charset);
                        }
                } elseif ($charset == 'utf-8')  {
-                       return $this->utf8_conv_case($string,$case);
-               } elseif ($this->eucBasedSets[$charset])        {
-                       return $this->euc_conv_case($string,$case,$charset);
+                       $string = $this->utf8_char_mapping($string,'case',$case);
+               } elseif (isset($this->eucBasedSets[$charset])) {
+                       $string = $this->euc_char_mapping($string,$charset,'case',$case);
+               } else {
+                               // treat everything else as single-byte encoding
+                       $string = $this->sb_char_mapping($string,$charset,'case',$case);
                }
 
-               // treat everything else as single-byte encoding
-               if (!$this->initCaseFolding($charset))  return $string; // do nothing
-               $out = '';
-               $caseConv =& $this->caseFolding[$charset][$case];
+               return $string;
+       }
 
-               for($i=0; strlen($string{$i}); $i++)    {
-                       $c = $string{$i};
-                       $cc = $caseConv[$c];
-                       if ($cc)        {
-                               $out .= $cc;
-                       } else {
-                               $out .= $c;
+       /**
+        * Transliterates special chars (like æøå, umlauts etc.) to ASCII equivalents
+        * (usually two letters, e.g. æ => ae). The mapping routine is chosen by charset.
+        *
+        * @param       string          Character set of string
+        * @param       string          Input string to convert
+        * @return      string          The converted string
+        */
+       function specCharsToASCII($charset,$string)     {
+               if ($charset == 'utf-8')        {
+                       return $this->utf8_char_mapping($string,'ascii');
+               }
+               if (isset($this->eucBasedSets[$charset]))       {
+                       return $this->euc_char_mapping($string,$charset,'ascii');
+               }
+
+                       // any other charset is treated as a single-byte encoding
+               return $this->sb_char_mapping($string,$charset,'ascii');
+       }
+
+
+       /**
+        * Converts the language codes that we get from the client (usually HTTP_ACCEPT_LANGUAGE)
+        * into a TYPO3-readable language code. Tags are compared case-insensitively.
+        * @param       $languageCodesList      list of language codes. something like 'de,en-us;q=0.9,de-de;q=0.7,es-cl;q=0.6,en;q=0.4,es;q=0.3,zh;q=0.1'
+        *                      see http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4
+        * @return      string  a preferred language that TYPO3 supports, or "default" if none found
+        * @author      Benjamin Mack (benni.typo3.org)
+        */
+       public function getPreferredClientLanguage($languageCodesList) {
+               $allLanguageCodes = array();
+               $selectedLanguage = 'default';
+
+               // get all languages where TYPO3 code is the same as the ISO code
+               foreach ($this->charSetArray as $typo3Lang => $charSet) {
+                       $allLanguageCodes[$typo3Lang] = $typo3Lang;
+               }
+
+               // get all languages where TYPO3 code differs from ISO code
+               // or needs the country part
+               // the iso code replaces the TYPO3 code stored as value for the same key
+               foreach ($this->isoArray as $typo3Lang => $isoLang) {
+                       $isoLang = join('-', explode('_', $isoLang));
+                       $allLanguageCodes[$typo3Lang] = $isoLang;
+               }
+
+               // move the iso codes to the keys (they are looked up with "isset" later on);
+               // keys are lowercased because language tags are case-insensitive (RFC 2616, section 3.10)
+               $allLanguageCodes = array_change_key_case(array_flip($allLanguageCodes), CASE_LOWER);
+
+               $preferredLanguages = t3lib_div::trimExplode(',', $languageCodesList);
+               // map each preferred language (lowercased) to its quality value; a missing "q" means 1.0
+               $sortedPreferredLanguages = array();
+               foreach ($preferredLanguages as $preferredLanguage) {
+                       $quality = 1.0;
+                       if (preg_match('/^([^;]*);\s*q\s*=\s*([\d.]*)/i', $preferredLanguage, $match)) {
+                               list(, $preferredLanguage, $quality) = $match;
                        }
+                       $sortedPreferredLanguages[strtolower(trim($preferredLanguage))] = $quality;
                }
 
-               // is a simple strtr() faster or slower than the code above?
-               // perhaps faster for small single-byte tables but slower for large multi-byte tables?
-               //
-               // return strtr($string,$this->caseFolding[$charset][$case]);
+               // loop through the languages, with the highest priority first
+               arsort($sortedPreferredLanguages, SORT_NUMERIC);
+               foreach ($sortedPreferredLanguages as $preferredLanguage => $quality) {
+                       if (isset($allLanguageCodes[$preferredLanguage])) {
+                               $selectedLanguage = $allLanguageCodes[$preferredLanguage];
+                               break;
+                       }
 
-               return $out;
+                       // strip the country code from the end; a tag without "-" stays as it is
+                       list($preferredLanguage) = explode('-', $preferredLanguage, 2);
+                       if (isset($allLanguageCodes[$preferredLanguage])) {
+                               $selectedLanguage = $allLanguageCodes[$preferredLanguage];
+                               break;
+                       }
+               }
+               if (!$selectedLanguage || $selectedLanguage == 'en') {
+                       $selectedLanguage = 'default';
+               }
+               return $selectedLanguage;
        }
 
 
@@ -1239,6 +1699,56 @@ class t3lib_cs {
 
 
 
+       /********************************************
+        *
+        * Internal string operation functions
+        *
+        ********************************************/
+
+       /**
+        * Maps all characters of a string in a single byte charset.
+        * Bytes without an entry in the mapping table are copied unchanged.
+        *
+        * @param       string          the string
+        * @param       string          the charset
+        * @param       string          mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
+        * @param       string          'case': conversion 'toLower' or 'toUpper'
+        * @return      string          the converted string (the unchanged input if the table is unavailable or the mode is unknown)
+        * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
+        */
+       function sb_char_mapping($str,$charset,$mode,$opt='')   {
+               switch($mode)   {
+                       case 'case':
+                               if (!$this->initCaseFolding($charset))  return $str;    // do nothing
+                               $map =& $this->caseFolding[$charset][$opt];
+                               break;
+
+                       case 'ascii':
+                               if (!$this->initToASCII($charset))      return $str;    // do nothing
+                               $map =& $this->toASCII[$charset];
+                               break;
+
+                       default:
+                               return $str;
+               }
+
+               $out = '';
+                       // bound the loop by the byte length instead of probing the offset behind the
+                       // last byte, which raises an "uninitialized string offset" notice on every call
+               $length = strlen($str);
+               for ($i = 0; $i < $length; $i++)        {
+                       $c = $str[$i];
+                       $out .= isset($map[$c]) ? $map[$c] : $c;
+               }
+
+               return $out;
+       }
+
+
+
+
+
+
 
 
 
@@ -1321,9 +1831,9 @@ class t3lib_cs {
                        if ($i <= 0)    return ''; // sanity check
                        for ($bc=0, $mbs=ord($str{$i}); $mbs & 0x80; $mbs = $mbs << 1)  $bc++;  // calculate number of bytes
                        if ($bc+$i > $len)      return substr($str,0,$i);
-                        // fallthru: multibyte char fits into length
+                       // fallthru: multibyte char fits into length
                }
-               return substr($str,$len);
+               return substr($str,0,$len);
        }
 
        /**
@@ -1338,7 +1848,9 @@ class t3lib_cs {
         */
        function utf8_strpos($haystack,$needle,$offset=0)       {
                if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
-                       return mb_strpos($haystack,$needle,'utf-8');
+                       return mb_strpos($haystack,$needle,$offset,'utf-8');
+               } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv')      {
+                       return iconv_strpos($haystack,$needle,$offset,'utf-8');
                }
 
                $byte_offset = $this->utf8_char2byte_pos($haystack,$offset);
@@ -1362,6 +1874,8 @@ class t3lib_cs {
        function utf8_strrpos($haystack,$needle)        {
                if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
                        return mb_strrpos($haystack,$needle,'utf-8');
+               } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv')      {
+                       return iconv_strrpos($haystack,$needle,'utf-8');
                }
 
                $byte_pos = strrpos($haystack,$needle);
@@ -1435,20 +1949,30 @@ class t3lib_cs {
        }
 
        /**
-        * Translates all characters of an UTF-8 string into their respective case values.
-        * Unit-tested by Kasper
+        * Maps all characters of an UTF-8 string.
         *
         * @param       string          UTF-8 string
-        * @param       string          conversion: 'toLower' or 'toUpper'
+        * @param       string          mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
+        * @param       string          'case': conversion 'toLower' or 'toUpper'
         * @return      string          the converted string
         * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
-        * @see strtolower(), strtoupper(), mb_convert_case()
         */
-       function utf8_conv_case($str,$case)     {
-               if (!$this->initCaseFoldingUTF8())      return $str;    // do nothing
+       function utf8_char_mapping($str,$mode,$opt='')  {
+               if (!$this->initUnicodeData($mode))     return $str;    // do nothing
 
                $out = '';
-               $caseConv =& $this->caseFolding['utf-8'][$case];
+               switch($mode)   {
+                       case 'case':
+                               $map =& $this->caseFolding['utf-8'][$opt];
+                               break;
+
+                       case 'ascii':
+                               $map =& $this->toASCII['utf-8'];
+                               break;
+
+                       default:
+                               return $str;
+               }
 
                for($i=0; strlen($str{$i}); $i++)       {
                        $c = ord($str{$i});
@@ -1460,9 +1984,8 @@ class t3lib_cs {
                                $i += $bc-1;
                        }
 
-                       $cc = $caseConv[$mbc];
-                       if ($cc)        {
-                               $out .= $cc;
+                       if (isset($map[$mbc]))  {
+                               $out .= $map[$mbc];
                        } else {
                                $out .= $mbc;
                        }
@@ -1523,11 +2046,12 @@ class t3lib_cs {
                }
                if (!strlen($str{$i}))  return $str;    // string shorter than supplied length
 
-               if ($i>$len)
+               if ($i>$len) {
                        return substr($str,0,$len-1);   // we ended on a first byte
-               else
+               } else {
                        return substr($str,0,$len);
-        }
+               }
+       }
 
        /**
         * Returns a part of a string in the EUC charset family.
@@ -1623,23 +2147,36 @@ class t3lib_cs {
        }
 
        /**
-        * Translates all characters of a string in the EUC charset family into their respective case values.
+        * Maps all characters of a string in the EUC charset family.
         *
         * @param       string          EUC multibyte character string
-        * @param       string          conversion: 'toLower' or 'toUpper'
         * @param       string          the charset
+        * @param       string          mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
+        * @param       string          'case': conversion 'toLower' or 'toUpper'
         * @return      string          the converted string
         * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
-        * @see strtolower(), strtoupper(), mb_convert_case()
         */
-       function euc_conv_case($str,$case,$charset)     {
-               if (!$this->initCaseFolding($charset))  return $str;    // do nothing
+       function euc_char_mapping($str,$charset,$mode,$opt='')  {
+               switch($mode)   {
+                       case 'case':
+                               if (!$this->initCaseFolding($charset))  return $str;    // do nothing
+                               $map =& $this->caseFolding[$charset][$opt];
+                               break;
+
+                       case 'ascii':
+                               if (!$this->initToASCII($charset))      return $str;    // do nothing
+                               $map =& $this->toASCII[$charset];
+                               break;
+
+                       default:
+                               return $str;
+               }
 
                $sjis = ($charset == 'shift_jis');
                $out = '';
-               $caseConv =& $this->caseFolding[$charset][$case];
-               for($i=0; $mbc=$str{$i}; $i++)  {
-                       $c = ord($str{$i});
+               for($i=0; strlen($str{$i}); $i++)       {
+                       $mbc = $str{$i};
+                       $c = ord($mbc);
 
                        if ($sjis)      {
                                if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))  {       // a double-byte char
@@ -1654,9 +2191,8 @@ class t3lib_cs {
                                }
                        }
 
-                       $cc = $caseConv[$mbc];
-                       if ($cc)        {
-                               $out .= $cc;
+                       if (isset($map[$mbc]))  {
+                               $out .= $map[$mbc];
                        } else {
                                $out .= $mbc;
                        }
@@ -1664,9 +2200,11 @@ class t3lib_cs {
 
                return $out;
        }
+
 }
 
 if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php'])       {
        include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
 }
-?>
+
+?>
\ No newline at end of file