Fixed issue #13670: Performance optimization: change while(list() to foreach() (thank...
[Packages/TYPO3.CMS.git] / t3lib / class.t3lib_cs.php
old mode 100755 (executable)
new mode 100644 (file)
index fb66ce8..bdacd5d
@@ -2,7 +2,7 @@
 /***************************************************************
 *  Copyright notice
 *
-*  (c) 2003-2004 Kasper Skaarhoj (kasper@typo3.com)
+*  (c) 2003-2009 Kasper Skaarhoj (kasperYYYY@typo3.com)
 *  All rights reserved
 *
 *  This script is part of the Typo3 project. The Typo3 project is
@@ -26,7 +26,7 @@
  *
  * $Id$
  *
- * @author     Kasper Skaarhoj <kasper@typo3.com>
+ * @author     Kasper Skaarhoj <kasperYYYY@typo3.com>
  * @author     Martin Kutschker <martin.t.kutschker@blackbox.net>
  */
 /**
  *
  *
  *
- *  119: class t3lib_cs
- *  261:     function parse_charset($charset)
- *  278:     function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
- *  312:     function utf8_encode($str,$charset)
- *  359:     function utf8_decode($str,$charset,$useEntityForNoChar=0)
- *  407:     function utf8_to_entities($str)
- *  440:     function entities_to_utf8($str,$alsoStdHtmlEnt=0)
- *  474:     function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
- *  515:     function initCharset($charset)
- *  586:     function UnumberToChar($cbyte)
- *  630:     function utf8CharToUnumber($str,$hex=0)
+ *  136: class t3lib_cs
+ *  488:     function parse_charset($charset)
+ *  507:     function get_locale_charset($locale)
+ *
+ *              SECTION: Charset Conversion functions
+ *  560:     function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
+ *  600:     function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0)
+ *  617:     function utf8_encode($str,$charset)
+ *  663:     function utf8_decode($str,$charset,$useEntityForNoChar=0)
+ *  706:     function utf8_to_entities($str)
+ *  739:     function entities_to_utf8($str,$alsoStdHtmlEnt=0)
+ *  773:     function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
+ *  823:     function UnumberToChar($cbyte)
+ *  868:     function utf8CharToUnumber($str,$hex=0)
+ *
+ *              SECTION: Init functions
+ *  911:     function initCharset($charset)
+ *  973:     function initUnicodeData($mode=null)
+ * 1198:     function initCaseFolding($charset)
+ * 1260:     function initToASCII($charset)
  *
  *              SECTION: String operation functions
- *  682:     function strtrunc($charset,$string,$len)
- *  716:     function substr($charset,$str,$start,$len=null)
- *  755:     function strlen($charset,$string)
+ * 1331:     function substr($charset,$string,$start,$len=null)
+ * 1384:     function strlen($charset,$string)
+ * 1414:     function crop($charset,$string,$len,$crop='')
+ * 1467:     function strtrunc($charset,$string,$len)
+ * 1501:     function conv_case($charset,$string,$case)
+ * 1527:     function specCharsToASCII($charset,$string)
  *
- *              SECTION: UTF-8 String operation functions
- *  803:     function utf8_strtrunc($str,$len)
- *  831:     function utf8_substr($str,$start,$len=null)
- *  857:     function utf8_strlen($str)
- *  879:     function utf8_strpos($haystack,$needle,$offset=0)
- *  902:     function utf8_strrpos($haystack,$needle)
- *  921:     function utf8_char2byte_pos($str,$pos)
- *  946:     function utf8_byte2char_pos($str,$pos)
+ *              SECTION: Internal string operation functions
+ * 1567:     function sb_char_mapping($str,$charset,$mode,$opt='')
  *
- *              SECTION: EUC String operation functions
- *  994:     function euc_strtrunc($str,$len,$charset)
- * 1028:     function euc_substr($str,$start,$charset,$len=null)
- * 1055:     function euc_strlen($str,$charset)
- * 1082:     function euc_char2byte_pos($str,$pos,$charset)
+ *              SECTION: Internal UTF-8 string operation functions
+ * 1622:     function utf8_substr($str,$start,$len=null)
+ * 1655:     function utf8_strlen($str)
+ * 1676:     function utf8_strtrunc($str,$len)
+ * 1698:     function utf8_strpos($haystack,$needle,$offset=0)
+ * 1723:     function utf8_strrpos($haystack,$needle)
+ * 1745:     function utf8_char2byte_pos($str,$pos)
+ * 1786:     function utf8_byte2char_pos($str,$pos)
+ * 1809:     function utf8_char_mapping($str,$mode,$opt='')
  *
- * TOTAL FUNCTIONS: 24
+ *              SECTION: Internal EUC string operation functions
+ * 1885:     function euc_strtrunc($str,$len,$charset)
+ * 1914:     function euc_substr($str,$start,$charset,$len=null)
+ * 1939:     function euc_strlen($str,$charset)
+ * 1966:     function euc_char2byte_pos($str,$pos,$charset)
+ * 2007:     function euc_char_mapping($str,$charset,$mode,$opt='')
+ *
+ * TOTAL FUNCTIONS: 35
  * (This index is automatically created/updated by the extension "extdeveval")
  *
  */
  *
  * Functions nearly working on UTF-8 strings:
  *
- * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf_strlen
- * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained 7-bit ASCII
+ * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
+ * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
  * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
  * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
+ * - preg_*: UTF-8 support is compiled into PHP's PCRE by default nowadays, but could be unavailable; patterns need the "u" modifier to be treated as UTF-8
  *
  * Functions NOT working on UTF-8 strings:
  *
  * - stripos
  * - substr
  * - strrev
- * - ereg/eregi
  * - split/spliti
- * - preg_*
  * - ...
  *
  */
 /**
  * Class for conversion between charsets
  *
- * @author     Kasper Skaarhoj <kasper@typo3.com>
+ * @author     Kasper Skaarhoj <kasperYYYY@typo3.com>
  * @author     Martin Kutschker <martin.t.kutschker@blackbox.net>
  * @package TYPO3
  * @subpackage t3lib
  */
 class t3lib_cs {
-       var $noCharByteVal=127;         // ASCII Value for chars with no equalent.
+       var $noCharByteVal=63;          // ASCII Value for chars with no equivalent.
 
                // This is the array where parsed conversion tables are stored (cached)
        var $parsedCharsets=array();
@@ -125,6 +141,9 @@ class t3lib_cs {
                // An array where case folding data will be stored (cached)
        var $caseFolding=array();
 
+               // An array where charset-to-ASCII mappings are stored (cached)
+       var $toASCII=array();
+
                // This tells the converter which charsets has two bytes per char:
        var $twoByteSets=array(
                'ucs-2'=>1,     // 2-byte Unicode
@@ -138,9 +157,10 @@ class t3lib_cs {
 
                // This tells the converter which charsets use a scheme like the Extended Unix Code:
        var $eucBasedSets=array(
-               'gb2312'=>1,    // Chinese, simplified.
-               'big5'=>1,      // Chinese, traditional.
-               'shift_jis'=>1, // Japanes - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
+               'gb2312'=>1,            // Chinese, simplified.
+               'big5'=>1,              // Chinese, traditional.
+               'euc-kr'=>1,            // Korean
+               'shift_jis'=>1,         // Japanese - WARNING: Shift-JIS includes half-width katakana single-byte characters above 0x80!
        );
 
                // see  http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
@@ -151,8 +171,16 @@ class t3lib_cs {
                'cp819' => 'iso-8859-1',
                'ibm819' => 'iso-8859-1',
                'iso-ir-100' => 'iso-8859-1',
-               'iso-ir-109' => 'iso-8859-2',
+               'iso-ir-101' => 'iso-8859-2',
+               'iso-ir-109' => 'iso-8859-3',
+               'iso-ir-110' => 'iso-8859-4',
+               'iso-ir-144' => 'iso-8859-5',
+               'iso-ir-127' => 'iso-8859-6',
+               'iso-ir-126' => 'iso-8859-7',
+               'iso-ir-138' => 'iso-8859-8',
                'iso-ir-148' => 'iso-8859-9',
+               'iso-ir-157' => 'iso-8859-10',
+               'iso-ir-179' => 'iso-8859-13',
                'iso-ir-199' => 'iso-8859-14',
                'iso-ir-203' => 'iso-8859-15',
                'csisolatin1' => 'iso-8859-1',
@@ -213,6 +241,7 @@ class t3lib_cs {
                'sjis' => 'shift_jis',
                'shift-jis' => 'shift_jis',
                'cp932' => 'shift_jis',
+               'cp949' => 'euc-kr',
                'utf7' => 'utf-7',
                'utf8' => 'utf-8',
                'utf16' => 'utf-16',
@@ -222,85 +251,120 @@ class t3lib_cs {
                'ucs4' => 'ucs-4',
        );
 
-               // mapping of iso-639:2 language codes to language (family) names
-       var $lang_to_langfamily=array(
-                       // iso-639:2 language codes, see:
-                       //  http://www.w3.org/WAI/ER/IG/ert/iso639.htm
-                       //  http://www.unicode.org/onlinedat/languages.html
+               // mapping of iso-639-1 language codes to script names
+       var $lang_to_script=array(
+                       // iso-639-1 language codes, see http://www.loc.gov/standards/iso639-2/php/code_list.php
                'ar' => 'arabic',
-               'bg' => 'cyrillic',
-               'cs' => 'east_european',
-               'da' => 'west_european',
-               'de' => 'west_european',
-               'es' => 'west_european',
+               'bg' => 'cyrillic',             // Bulgarian
+               'bs' => 'east_european',        // Bosnian
+               'cs' => 'east_european',        // Czech
+               'da' => 'west_european',        // Danish
+               'de' => 'west_european',        // German
+               'es' => 'west_european',        // Spanish
                'et' => 'estonian',
-               'eu' => 'west_european',
-               'fi' => 'west_european',
-               'fr' => 'west_european',
+               'eo' => 'unicode',              // Esperanto
+               'eu' => 'west_european',        // Basque
+               'fa' => 'arabic',       // Persian
+               'fi' => 'west_european',        // Finnish
+               'fo' => 'west_european',        // Faroese
+               'fr' => 'west_european',        // French
+               'ga' => 'west_european',        // Galician
+               'ge' => 'unicode',                      // Georgian
                'gr' => 'greek',
-               'hr' => 'east_european',
-               'hu' => 'east_european',
-               'iw' => 'hebrew',
-               'is' => 'west_european',
-               'it' => 'west_european',
+               'he' => 'hebrew',               // Hebrew (since 1998)
+               'hi' => 'unicode',              // Hindi
+               'hr' => 'east_european',        // Croatian
+               'hu' => 'east_european',        // Hungarian
+               'iw' => 'hebrew',               // Hebrew (until 1998)
+               'is' => 'west_european',        // Icelandic
+               'it' => 'west_european',        // Italian
                'ja' => 'japanese',
-               'kl' => 'west_european',
+               'kl' => 'west_european',        // Greenlandic
                'ko' => 'korean',
                'lt' => 'lithuanian',
-               'lv' => 'west_european', // Latvian/Lettish
-               'nl' => 'west_european',
-               'no' => 'west_european',
-               'pl' => 'east_european',
-               'pt' => 'west_european',
-               'ro' => 'east_european',
-               'ru' => 'cyrillic',
-               'sk' => 'east_european',
-               'sl' => 'east_european',
-               'sv' => 'west_european',
+               'lv' => 'west_european',        // Latvian/Lettish
+               'nl' => 'west_european',        // Dutch
+               'no' => 'west_european',        // Norwegian
+               'nb' => 'west_european',        // Norwegian Bokmal
+               'nn' => 'west_european',        // Norwegian Nynorsk
+               'pl' => 'east_european',        // Polish
+               'pt' => 'west_european',        // Portuguese
+               'ro' => 'east_european',        // Romanian
+               'ru' => 'cyrillic',             // Russian
+               'sk' => 'east_european',        // Slovak
+               'sl' => 'east_european',        // Slovenian
+               'sr' => 'cyrillic',             // Serbian
+               'sv' => 'west_european',        // Swedish
+               'sq' => 'albanian',             // Albanian
                'th' => 'thai',
-               'uk' => 'cyrillic',
+               'uk' => 'cyrillic',             // Ukrainian
                'vi' => 'vietnamese',
                'zh' => 'chinese',
                        // MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
+                       // http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
+               'ara' => 'arabic',
+               'bgr' => 'cyrillic',            // Bulgarian
+               'cat' => 'west_european',       // Catalan
                'chs' => 'simpl_chinese',
                'cht' => 'trad_chinese',
-               'csy' => 'east_european',
-               'dan' => 'west_european',
-               'deu' => 'west_european',
-               'dea' => 'west_european',
-               'des' => 'west_european',
-               'ena' => 'west_european',
-               'enc' => 'west_european',
-               'eng' => 'west_european',
-               'enz' => 'west_european',
-               'enu' => 'west_european',
-               'nld' => 'west_european',
-               'nlb' => 'west_european',
-               'fin' => 'west_european',
-               'fra' => 'west_european',
-               'frb' => 'west_european',
-               'frc' => 'west_european',
-               'frs' => 'west_european',
+               'csy' => 'east_european',       // Czech
+               'dan' => 'west_european',       // Danish
+               'deu' => 'west_european',       // German
+               'dea' => 'west_european',       // German (Austrian)
+               'des' => 'west_european',       // German (Swiss)
+               'ena' => 'west_european',       // English (Australian)
+               'enc' => 'west_european',       // English (Canadian)
+               'eng' => 'west_european',       // English
+               'enz' => 'west_european',       // English (New Zealand)
+               'enu' => 'west_european',       // English (United States)
+               'euq' => 'west_european',       // Basque
+               'fos' => 'west_european',       // Faroese
+               'far' => 'arabic',      // Persian
+               'fin' => 'west_european',       // Finnish
+               'fra' => 'west_european',       // French
+               'frb' => 'west_european',       // French (Belgian)
+               'frc' => 'west_european',       // French (Canadian)
+               'frs' => 'west_european',       // French (Swiss)
+               'geo' => 'unicode',                     // Georgian
+               'glg' => 'west_european',       // Galician
                'ell' => 'greek',
-               'hun' => 'east_european',
-               'isl' => 'west_euorpean',
-               'ita' => 'west_european',
-               'its' => 'west_european',
+               'heb' => 'hebrew',
+               'hin' => 'unicode',     // Hindi
+               'hun' => 'east_european',       // Hungarian
+               'isl' => 'west_euorpean',       // Icelandic
+               'ita' => 'west_european',       // Italian
+               'its' => 'west_european',       // Italian (Swiss)
                'jpn' => 'japanese',
                'kor' => 'korean',
-               'nor' => 'west_european',
-               'non' => 'west_european',
-               'plk' => 'east_european',
-               'ptg' => 'west_european',
-               'ptb' => 'west_european',
-               'rus' => 'east_european',
-               'sky' => 'east_european',
-               'esp' => 'west_european',
-               'esm' => 'west_european',
-               'esn' => 'west_european',
-               'sve' => 'west_european',
+               'lth' => 'lithuanian',
+               'lvi' => 'west_european',       // Latvian/Lettish
+               'msl' => 'west_european',       // Malay
+               'nlb' => 'west_european',       // Dutch (Belgian)
+               'nld' => 'west_european',       // Dutch
+               'nor' => 'west_european',       // Norwegian (bokmal)
+               'non' => 'west_european',       // Norwegian (nynorsk)
+               'plk' => 'east_european',       // Polish
+               'ptg' => 'west_european',       // Portuguese
+               'ptb' => 'west_european',       // Portuguese (Brazil)
+               'rom' => 'east_european',       // Romanian
+               'rus' => 'cyrillic',            // Russian
+               'slv' => 'east_european',       // Slovenian
+               'sky' => 'east_european',       // Slovak
+               'srl' => 'east_european',       // Serbian (Latin)
+               'srb' => 'cyrillic',            // Serbian (Cyrillic)
+               'esp' => 'west_european',       // Spanish (trad. sort)
+               'esm' => 'west_european',       // Spanish (Mexican)
+               'esn' => 'west_european',       // Spanish (internat. sort)
+               'sve' => 'west_european',       // Swedish
+               'sqi' => 'albanian',            // Albanian
+               'tha' => 'thai',
                'trk' => 'turkish',
+               'ukr' => 'cyrillic',    // Ukrainian
                        // English language names
+               'albanian' => 'albanian',
+               'arabic' => 'arabic',
+               'basque' => 'west_european',
+               'bosnian' => 'east_european',
                'bulgarian' => 'east_european',
                'catalan' => 'west_european',
                'croatian' => 'east_european',
@@ -308,30 +372,44 @@ class t3lib_cs {
                'danish' => 'west_european',
                'dutch' => 'west_european',
                'english' => 'west_european',
+               'esperanto' => 'unicode',
+               'estonian' => 'estonian',
+               'faroese' => 'west_european',
+               'farsi' => 'arabic',
                'finnish' => 'west_european',
                'french' => 'west_european',
                'galician' => 'west_european',
+               'georgian' => 'unicode',
                'german' => 'west_european',
+               'greek' => 'greek',
+               'greenlandic' => 'west_european',
+               'hebrew' => 'hebrew',
+               'hindi' => 'unicode',
                'hungarian' => 'east_european',
                'icelandic' => 'west_european',
                'italian' => 'west_european',
                'latvian' => 'west_european',
                'lettish' => 'west_european',
+               'lithuanian' => 'lithuanian',
+               'malay' => 'west_european',
                'norwegian' => 'west_european',
+               'persian' => 'arabic',
                'polish' => 'east_european',
                'portuguese' => 'west_european',
                'russian' => 'cyrillic',
                'romanian' => 'east_european',
+               'serbian' => 'cyrillic',
                'slovak' => 'east_european',
                'slovenian' => 'east_european',
                'spanish' => 'west_european',
                'svedish' => 'west_european',
-               'turkish' => 'east_european',
+               'that' => 'thai',
+               'turkish' => 'turkish',
                'ukrainian' => 'cyrillic',
        );
 
                // mapping of language (family) names to charsets on Unix
-       var $lang_to_charset_unix=array(
+       var $script_to_charset_unix=array(
                'west_european' => 'iso-8859-1',
                'estonian' => 'iso-8859-1',
                'east_european' => 'iso-8859-2',
@@ -349,10 +427,12 @@ class t3lib_cs {
                'simpl_chinese' => 'gb2312',
                'trad_chinese' => 'big5',
                'vietnamese' => '',
+               'unicode' => 'utf-8',
+               'albanian' => 'utf-8'
        );
 
                // mapping of language (family) names to charsets on Windows
-       var $lang_to_charset_windows=array(
+       var $script_to_charset_windows=array(
                'east_european' => 'windows-1250',
                'cyrillic' => 'windows-1251',
                'west_european' => 'windows-1252',
@@ -365,18 +445,21 @@ class t3lib_cs {
                'lithuanian' => 'windows-1257',
                'vietnamese' => 'windows-1258',
                'thai' => 'cp874',
-               'korean' => 'cp950',
+               'korean' => 'cp949',
                'chinese' => 'gb2312',
                'japanese' => 'shift_jis',
                'simpl_chinese' => 'gb2312',
                'trad_chinese' => 'big5',
+               'albanian' => 'windows-1250',
+               'unicode' => 'utf-8'
        );
 
                // mapping of locale names to charsets
        var $locale_to_charset=array(
                'japanese.euc' => 'euc-jp',
-               'ja_JP.ujis' => 'euc-jp',
+               'ja_jp.ujis' => 'euc-jp',
                'korean.euc' => 'euc-kr',
+               'sr@Latn' => 'iso-8859-2',
                'zh_cn' => 'gb2312',
                'zh_hk' => 'big5',
                'zh_tw' => 'big5',
@@ -421,6 +504,37 @@ class t3lib_cs {
                'jp' => 'shift_jis',
                'lv' => 'utf-8',
                'vn' => 'utf-8',
+               'ca' => 'iso-8859-15',
+               'ba' => 'iso-8859-2',
+               'kr' => 'euc-kr',
+               'eo' => 'utf-8',
+               'my' => '',
+               'hi' => 'utf-8',
+               'fo' => 'utf-8',
+               'fa' => 'utf-8',
+               'sr' => 'utf-8',
+               'sq' => 'utf-8',
+               'ge' => 'utf-8',
+               'ga' => '',
+       );
+
+               // TYPO3 specific: Array with the iso names used for each system language in TYPO3:
+               // A missing key means: the ISO name is the same as the TYPO3 language key
+       var $isoArray = array(
+               'ba' => 'bs',
+               'br' => 'pt_BR',
+               'ch' => 'zh_CN',
+               'cz' => 'cs',
+               'dk' => 'da',
+               'si' => 'sl',
+               'se' => 'sv',
+               'gl' => 'kl',
+               'gr' => 'el',
+               'hk' => 'zh_HK',
+               'kr' => 'ko',
+               'ua' => 'uk',
+               'jp' => 'ja',
+               'vn' => 'vi',
        );
 
        /**
@@ -431,7 +545,7 @@ class t3lib_cs {
         * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
         */
        function parse_charset($charset)        {
-               $charset = strtolower($charset);
+               $charset = trim(strtolower($charset));
                if (isset($this->synonyms[$charset]))   $charset = $this->synonyms[$charset];
 
                return $charset;
@@ -440,12 +554,13 @@ class t3lib_cs {
        /**
         * Get the charset of a locale.
         *
-        * ln        language
-        * ln_CN     language / country
-        * ln_CN.cs  language / country / charset
+        * ln            language
+        * ln_CN         language / country
+        * ln_CN.cs      language / country / charset
+        * ln_CN.cs@mod  language / country / charset / modifier
         *
-        * @param       string          Locale
-        * @return      string          Charset
+        * @param       string          Locale string
+        * @return      string          Charset resolved for locale string
         * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
         */
        function get_locale_charset($locale)    {
@@ -454,23 +569,43 @@ class t3lib_cs {
                        // exact locale specific charset?
                if (isset($this->locale_to_charset[$locale]))   return $this->locale_to_charset[$locale];
 
+                       // get modifier
+               list($locale,$modifier) = explode('@',$locale);
+
                        // locale contains charset: use it
                list($locale,$charset) = explode('.',$locale);
                if ($charset)   return $this->parse_charset($charset);
 
+                       // modifier is 'euro' (after charset check, because of xx.utf-8@euro)
+               if ($modifier == 'euro')        return 'iso-8859-15';
+
                        // get language
                list($language,$country) = explode('_',$locale);
-               if (isset($this->lang_to_langfamily[$language]))        $language = $this->lang_to_langfamily[$language];
+               if (isset($this->lang_to_script[$language]))    $script = $this->lang_to_script[$language];
 
                if (TYPO3_OS == 'WIN')  {
-                       $cs = $this->lang_to_charset_windows[$language];
+                       $cs = $this->script_to_charset_windows[$script] ? $this->script_to_charset_windows[$script] : 'windows-1252';
                } else {
-                       $cs = $this->lang_to_charset_unix[$language];
+                       $cs = $this->script_to_charset_unix[$script] ? $this->script_to_charset_unix[$script] : 'iso-8859-1';
                }
 
-               return $cs ? $cs : 'iso-8859-1';
+               return $cs;
        }
 
+
+
+
+
+
+
+
+
+       /********************************************
+        *
+        * Charset Conversion functions
+        *
+        ********************************************/
+
        /**
         * Convert from one charset to another charset.
         *
@@ -479,6 +614,7 @@ class t3lib_cs {
         * @param       string          To charset (the output charset wanted)
         * @param       boolean         If set, then characters that are not available in the destination character set will be encoded as numeric entities
         * @return      string          Converted string
+        * @see convArray()
         */
        function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) {
                if ($fromCS==$toCS)     return $str;
@@ -492,12 +628,12 @@ class t3lib_cs {
                                break;
 
                        case 'iconv':
-                               $conv_str = iconv($str,$fromCS,$toCS.'//TRANSLIT');
+                               $conv_str = iconv($fromCS,$toCS.'//TRANSLIT',$str);
                                if (false !== $conv_str)        return $conv_str;
                                break;
 
                        case 'recode':
-                               $conv_str = recode_string($toCS.'..'.$fromCS,$str);
+                               $conv_str = recode_string($fromCS.'..'.$toCS,$str);
                                if (false !== $conv_str)        return $conv_str;
                                break;
                        }
@@ -509,6 +645,26 @@ class t3lib_cs {
                return $str;
        }
 
+       /**
+        * Converts every string element of an array from one charset to another via conv(), recursing into nested arrays.
+        * NOTICE: The array is passed by reference and modified in place! Values that are neither arrays nor strings are left untouched.
+        *
+        * @param       array           Input array, possibly multidimensional (passed by reference)
+        * @param       string          From charset (the current charset of the strings)
+        * @param       string          To charset (the output charset wanted)
+        * @param       boolean         Passed on to conv(): if set, then characters that are not available in the destination character set will be encoded as numeric entities
+        * @return      void
+        * @see conv()
+        */
+       function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0) {
+               foreach($array as $key => $value)       {
+                       if (is_array($array[$key]))     {
+                               $this->convArray($array[$key],$fromCS,$toCS,$useEntityForNoChar);
+                       } elseif (is_string($array[$key])) {
+                               $array[$key] = $this->conv($array[$key],$fromCS,$toCS,$useEntityForNoChar);
+                       }
+               }
+       }
 
        /**
         * Converts $str from $charset to UTF-8
@@ -519,6 +675,8 @@ class t3lib_cs {
         */
        function utf8_encode($str,$charset)     {
 
+               if ($charset === 'utf-8')       return $str;
+
                        // Charset is case-insensitive.
                if ($this->initCharset($charset))       {       // Parse conv. table if not already...
                        $strLen = strlen($str);
@@ -527,30 +685,27 @@ class t3lib_cs {
                        for ($a=0;$a<$strLen;$a++)      {       // Traverse each char in string.
                                $chr=substr($str,$a,1);
                                $ord=ord($chr);
-                               if ($this->twoByteSets[$charset])       {       // If the charset has two bytes per char
+                               if (isset($this->twoByteSets[$charset]))        {       // If the charset has two bytes per char
                                        $ord2 = ord($str{$a+1});
-                                       $ord = $ord<<8 & $ord2; // assume big endian
+                                       $ord = $ord<<8 | $ord2; // assume big endian
 
                                        if (isset($this->parsedCharsets[$charset]['local'][$ord]))      {       // If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?)
                                                $outStr.=$this->parsedCharsets[$charset]['local'][$ord];
                                        } else $outStr.=chr($this->noCharByteVal);      // No char exists
                                        $a++;
                                } elseif ($ord>127)     {       // If char has value over 127 it's a multibyte char in UTF-8
-                                       if ($this->eucBasedSets[$charset])      {       // EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
-                                               $a++;
-                                               $ord2=ord(substr($str,$a,1));
-                                               $ord = $ord*256+$ord2;
-                                       }
-                                       elseif ($charset == 'shift_jis' && ($ord <160 || $ord>223))     {       // Shift-JIS is like EUC, but chars between 160 and 223 are single byte
-                                               $a++;
-                                               $ord2=ord(substr($str,$a,1));
-                                               $ord = $ord*256+$ord2;
+                                       if (isset($this->eucBasedSets[$charset]))       {       // EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
+                                               if ($charset != 'shift_jis' || ($ord < 0xA0 || $ord > 0xDF))    {       // Shift-JIS: chars between 160 and 223 are single byte
+                                                       $a++;
+                                                       $ord2=ord(substr($str,$a,1));
+                                                       $ord = $ord*256+$ord2;
+                                               }
                                        }
 
                                        if (isset($this->parsedCharsets[$charset]['local'][$ord]))      {       // If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?)
-                                               $outStr.=$this->parsedCharsets[$charset]['local'][$ord];
-                                       } else $outStr.=chr($this->noCharByteVal);      // No char exists
-                               } else $outStr.=$chr;   // ... otherwise it's just ASCII 0-127 and one byte. Transparent
+                                               $outStr.= $this->parsedCharsets[$charset]['local'][$ord];
+                                       } else $outStr.= chr($this->noCharByteVal);     // No char exists
+                               } else $outStr.= $chr;  // ... otherwise it's just ASCII 0-127 and one byte. Transparent
                        }
                        return $outStr;
                }
@@ -566,6 +721,10 @@ class t3lib_cs {
         */
        function utf8_decode($str,$charset,$useEntityForNoChar=0)       {
 
+               if ($charset === 'utf-8') {
+                       return $str;
+               }
+
                        // Charset is case-insensitive.
                if ($this->initCharset($charset))       {       // Parse conv. table if not already...
                        $strLen = strlen($str);
@@ -642,11 +801,11 @@ class t3lib_cs {
         */
        function entities_to_utf8($str,$alsoStdHtmlEnt=0)       {
                if ($alsoStdHtmlEnt)    {
-                       $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES));
+                       $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES));             // Getting them in iso-8859-1 - but thats ok since this is observed below.
                }
 
                $token = md5(microtime());
-               $parts = explode($token,ereg_replace('(&([#[:alnum:]]*);)',$token.'\2'.$token,$str));
+               $parts = explode($token, preg_replace('/(&([#[:alnum:]]*);)/', $token . '${2}' . $token, $str));
                foreach($parts as $k => $v)     {
                        if ($k%2)       {
                                if (substr($v,0,1)=='#')        {       // Dec or hex entities:
@@ -706,71 +865,9 @@ class t3lib_cs {
        }
 
        /**
-        * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
-        * This function is automatically called by the conversion functions
-        *
-        * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
-        *
-        * @param       string          The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
-        * @return      integer         Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed.
-        * @access private
-        */
-       function initCharset($charset)  {
-                       // Only process if the charset is not yet loaded:
-               if (!is_array($this->parsedCharsets[$charset])) {
-
-                               // Conversion table filename:
-                       $charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';
-
-                               // If the conversion table is found:
-                       if ($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile))      {
-                                       // Cache file for charsets:
-                                       // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
-                               $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/charset_'.$charset.'.tbl');
-                               if ($cacheFile && @is_file($cacheFile)) {
-                                       $this->parsedCharsets[$charset]=unserialize(t3lib_div::getUrl($cacheFile));
-                               } else {
-                                               // Parse conversion table into lines:
-                                       $lines=t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1);
-                                               // Initialize the internal variable holding the conv. table:
-                                       $this->parsedCharsets[$charset]=array('local'=>array(),'utf8'=>array());
-                                               // traverse the lines:
-                                       $detectedType='';
-                                       foreach($lines as $value)       {
-                                               if (trim($value) && substr($value,0,1)!='#')    {       // Comment line or blanks are ignored.
-
-                                                               // Detect type if not done yet: (Done on first real line)
-                                                               // The "whitespaced" type is on the syntax      "0x0A   0x000A  #LINE FEED"     while   "ms-token" is like              "B9 = U+00B9 : SUPERSCRIPT ONE"
-                                                       if (!$detectedType)             $detectedType = ereg('[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+',$value) ? 'whitespaced' : 'ms-token';
-
-                                                       if ($detectedType=='ms-token')  {
-                                                               list($hexbyte,$utf8) = split('=|:',$value,3);
-                                                       } elseif ($detectedType=='whitespaced') {
-                                                               $regA=array();
-                                                               ereg('[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+',$value,$regA);
-                                                               $hexbyte = $regA[1];
-                                                               $utf8 = 'U+'.$regA[2];
-                                                       }
-                                                       $decval = hexdec(trim($hexbyte));
-                                                       if ($decval>127)        {
-                                                               $utf8decval = hexdec(substr(trim($utf8),2));
-                                                               $this->parsedCharsets[$charset]['local'][$decval]=$this->UnumberToChar($utf8decval);
-                                                               $this->parsedCharsets[$charset]['utf8'][$this->parsedCharsets[$charset]['local'][$decval]]=$decval;
-                                                       }
-                                               }
-                                       }
-                                       if ($cacheFile) {
-                                               t3lib_div::writeFile($cacheFile,serialize($this->parsedCharsets[$charset]));
-                                       }
-                               }
-                               return 2;
-                       } else return false;
-               } else return 1;
-       }
-
-       /**
         * Converts a UNICODE number to a UTF-8 multibyte character
         * Algorithm based on script found at From: http://czyborra.com/utf/
+        * Unit-tested by Kasper
         *
         * The binary representation of the character's integer value is thus simply spread across the bytes and the number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
         *
@@ -824,6 +921,7 @@ class t3lib_cs {
 
        /**
         * Converts a UTF-8 Multibyte character to a UNICODE number
+        * Unit-tested by Kasper
         *
         * @param       string          UTF-8 multibyte character string
         * @param       boolean         If set, then a hex. number is returned.
@@ -849,30 +947,125 @@ class t3lib_cs {
                return $hex ? 'x'.dechex($int) : $int;
        }
 
+
+
+
+
+
+
+
+
+       /********************************************
+        *
+        * Init functions
+        *
+        ********************************************/
+
+       /**
+        * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
+        * This function is automatically called by the conversion functions
+        *
+        * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
+        *
+        * @param       string          The charset to be initialized. Always use a lowercase charset name; it must match exactly a filename in the csconvtbl/ folder ([charset].tbl).
+        * @return      integer         Returns '1' if already loaded. Returns FALSE if the charset conversion table was not found. Returns '2' if the conversion table was found and loaded (either from the cache file in typo3temp/cs/ or by parsing the .tbl file).
+        * @access private
+        */
+       function initCharset($charset)  {
+                       // Only process if the charset is not yet loaded (an array in $this->parsedCharsets[$charset] means it is):
+               if (!is_array($this->parsedCharsets[$charset])) {
+
+                               // Conversion table filename: [PATH_t3lib]csconvtbl/[charset].tbl
+                       $charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';
+
+                               // Continue only for a non-empty charset whose table path validates and points to an existing file:
+                       if ($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile))      {
+                                       // Cache file holding the serialized, already parsed table for this charset:
+                                       // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
+                               $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_'.$charset.'.tbl');
+                               if ($cacheFile && @is_file($cacheFile)) {
+                                       $this->parsedCharsets[$charset]=unserialize(t3lib_div::getUrl($cacheFile));     // Cache hit: reuse the serialized table instead of parsing
+                               } else {
+                                               // Read the conversion table and split it into lines:
+                                       $lines=t3lib_div::trimExplode(LF,t3lib_div::getUrl($charsetConvTableFile),1);
+                                               // Initialize the internal variable holding the conv. table ('local' and 'utf8' are the two lookup directions):
+                                       $this->parsedCharsets[$charset]=array('local'=>array(),'utf8'=>array());
+                                               // Traverse the lines; the line format is detected once and then reused for all lines:
+                                       $detectedType='';
+                                       foreach($lines as $value)       {
+                                               if (trim($value) && substr($value,0,1)!='#')    {       // Blank lines and comment lines (starting with '#') are ignored.
+
+                                                               // Detect type if not done yet (done on the first real line):
+                                                               // "whitespaced" lines look like "0x0A 0x000A #LINE FEED", while "ms-token" lines look like "B9 = U+00B9 : SUPERSCRIPT ONE".
+                                                       if (!$detectedType)             $detectedType = preg_match('/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/',$value) ? 'whitespaced' : 'ms-token';
+
+                                                       if ($detectedType=='ms-token')  {
+                                                               list($hexbyte, $utf8) = preg_split('/[=:]/', $value, 3);        // First field: local hex value; second field: "U+XXXX"
+                                                       } elseif ($detectedType=='whitespaced') {
+                                                               $regA=array();  // NOTE(review): the preg_match() result below is not checked; on a non-matching line $regA[1] and $regA[2] are unset
+                                                               preg_match('/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/',$value,$regA);
+                                                               $hexbyte = $regA[1];
+                                                               $utf8 = 'U+'.$regA[2];  // Normalized to the same "U+XXXX" form the ms-token format uses
+                                                       }
+                                                       $decval = hexdec(trim($hexbyte));
+                                                       if ($decval>127)        {       // Only local values above 127 are put into the table
+                                                               $utf8decval = hexdec(substr(trim($utf8),2));    // Strip the two leading "U+" characters before converting the hex number
+                                                               $this->parsedCharsets[$charset]['local'][$decval]=$this->UnumberToChar($utf8decval);    // local value => UTF-8 character
+                                                               $this->parsedCharsets[$charset]['utf8'][$this->parsedCharsets[$charset]['local'][$decval]]=$decval;     // UTF-8 character => local value
+                                                       }
+                                               }
+                                       }
+                                       if ($cacheFile) {       // Store the freshly parsed table, provided a cache file path could be resolved
+                                               t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->parsedCharsets[$charset]));
+                                       }
+                               }
+                               return 2;       // Table is now loaded, either from the cache file or by parsing
+                       } else return false;    // Empty charset, invalid path or no conversion table file
+               } else return 1;        // Charset was already loaded earlier
+       }
+
        /**
-        * This function initializes the UTF-8 case folding table.
+        * This function initializes all UTF-8 character data tables.
         *
         * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
         *
+        * @param       string          Mode ("case", "ascii", ...)
         * @return      integer         Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
         * @access private
         */
-       function initCaseFoldingUTF8()  {
-                       // Only process if the case table is not yet loaded:
-               if (is_array($this->caseFolding['utf-8']))      return 1;
+       function initUnicodeData($mode=null)    {
+                       // cache files
+               $cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
+               $cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');
+
+                       // Only process if the tables are not yet loaded
+               switch($mode)   {
+                       case 'case':
+                               if (is_array($this->caseFolding['utf-8']))      return 1;
+
+                                       // Use cached version if possible
+                               if ($cacheFileCase && @is_file($cacheFileCase)) {
+                                       $this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase));
+                                       return 2;
+                               }
+                               break;
 
-                       // Use cached version if possible
-               $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cscase_utf-8.tbl');
-               if ($cacheFile && @is_file($cacheFile)) {
-                       $this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFile));
-                       return 2;
+                       case 'ascii':
+                               if (is_array($this->toASCII['utf-8']))  return 1;
+
+                                       // Use cached version if possible
+                               if ($cacheFileASCII && @is_file($cacheFileASCII))       {
+                                       $this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII));
+                                       return 2;
+                               }
+                               break;
                }
 
                        // process main Unicode data file
                $unicodeDataFile = PATH_t3lib.'unidata/UnicodeData.txt';
                if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) return false;
 
-               $fh = fopen($unicodeDataFile,'r');
+               $fh = fopen($unicodeDataFile,'rb');
                if (!$fh)       return false;
 
                        // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
@@ -883,45 +1076,99 @@ class t3lib_cs {
                $utf8CaseFolding['toLower'] = array();
                $utf8CaseFolding['toTitle'] = array();
 
+               $decomposition = array();       // array of temp. decompositions
+               $mark = array();                // array of chars that are marks (eg. composing accents)
+               $number = array();              // array of chars that are numbers (eg. digits)
+               $omit = array();                // array of chars to be omitted (eg. Russian hard sign)
+
                while (!feof($fh))      {
-                       $line = fgets($fh);
-                               // has also other info like character class (digit, white space, etc.) and more
-                       list($char,,,,,,,,,,,,$upper,$lower,$title,) = split(';', rtrim($line));
-                       $char = $this->UnumberToChar(hexdec($char));
-                       if ($upper)     $utf8CaseFolding['toUpper'][$char] = $this->UnumberToChar(hexdec($upper));
-                       if ($lower)     $utf8CaseFolding['toLower'][$char] = $this->UnumberToChar(hexdec($lower));
+                       $line = fgets($fh,4096);
+                               // has a lot of info
+                       list($char,$name,$cat,,,$decomp,,,$num,,,,$upper,$lower,$title,) = explode(';', rtrim($line));
+
+                       $ord = hexdec($char);
+                       if ($ord > 0xFFFF)      break;  // only process the BMP
+
+                       $utf8_char = $this->UnumberToChar($ord);
+
+                       if ($upper)     $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
+                       if ($lower)     $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
                                // store "title" only when different from "upper" (only a few)
-                       if ($title && $title != $upper) $utf8CaseFolding['toTitle'][$char] = $this->UnumberToChar(hexdec($title));
+                       if ($title && $title != $upper) $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
+
+                       switch ($cat{0})        {
+                               case 'M':       // mark (accent, umlaut, ...)
+                                       $mark["U+$char"] = 1;
+                                       break;
+
+                               case 'N':       // numeric value
+                                       if ($ord > 0x80 && $num != '')  $number["U+$char"] = $num;
+                       }
+
+                               // accented Latin letters without "official" decomposition
+                       $match = array();
+                       if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/',$name,$match) && !$decomp)        {
+                               $c = ord($match[2]);
+                               if ($match[1] == 'SMALL')       $c += 32;
+
+                               $decomposition["U+$char"] = array(dechex($c));
+                               continue;
+                       }
+
+                       $match = array();
+                       if (preg_match('/(<.*>)? *(.+)/',$decomp,$match))       {
+                               switch($match[1])       {
+                                       case '<circle>':        // add parenthesis as circle replacement, eg (1)
+                                               $match[2] = '0028 '.$match[2].' 0029';
+                                               break;
+
+                                       case '<square>':        // add square brackets as square replacement, eg [1]
+                                               $match[2] = '005B '.$match[2].' 005D';
+                                               break;
+
+                                       case '<compat>':        // ignore multi char decompositions that start with a space
+                                               if (preg_match('/^0020 /',$match[2]))   continue 2;
+                                               break;
+
+                                               // ignore Arabic and vertical layout presentation decomposition
+                                       case '<initial>':
+                                       case '<medial>':
+                                       case '<final>':
+                                       case '<isolated>':
+                                       case '<vertical>':
+                                               continue 2;
+                               }
+                               $decomposition["U+$char"] = explode(' ', $match[2]);
+                       }
                }
                fclose($fh);
 
                        // process additional Unicode data for casing (allow folded characters to expand into a sequence)
                $specialCasingFile = PATH_t3lib.'unidata/SpecialCasing.txt';
                if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile))        {
-
-                       $fh = fopen($specialCasingFile,'r');
+                       $fh = fopen($specialCasingFile,'rb');
                        if ($fh)        {
                                while (!feof($fh))      {
-                                       $line = fgets($fh);
+                                       $line = fgets($fh,4096);
                                        if ($line{0} != '#' && trim($line) != '')       {
 
                                                list($char,$lower,$title,$upper,$cond) = t3lib_div::trimExplode(';', $line);
                                                if ($cond == '' || $cond{0} == '#')     {
                                                        $utf8_char = $this->UnumberToChar(hexdec($char));
                                                        if ($char != $lower)    {
-                                                               $arr = split(' ',$lower);
+                                                               $arr = explode(' ', $lower);
                                                                for ($i=0; isset($arr[$i]); $i++)       $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
-                                                               $utf8CaseFolding['toLower'][$utf8_char] = implode($arr);
+                                                               $utf8CaseFolding['toLower'][$utf8_char] = implode('',$arr);
                                                        }
                                                        if ($char != $title && $title != $upper)        {
-                                                               $arr = split(' ',$title);
+                                                               $arr = explode(' ', $title);
                                                                for ($i=0; isset($arr[$i]); $i++)       $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
-                                                               $utf8CaseFolding['toTitle'][$utf8_char] = implode($arr);
+                                                               $utf8CaseFolding['toTitle'][$utf8_char] = implode('',$arr);
                                                        }
                                                        if ($char != $upper)    {
-                                                                       $arr = split(' ',$upper);
+                                                                       $arr = explode(' ', $upper);
                                                                for ($i=0; isset($arr[$i]); $i++)       $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
-                                                               $utf8CaseFolding['toUpper'][$utf8_char] = implode($arr);
+                                                               $utf8CaseFolding['toUpper'][$utf8_char] = implode('',$arr);
                                                        }
                                                }
                                        }
@@ -930,8 +1177,74 @@ class t3lib_cs {
                        }
                }
 
-               if ($cacheFile) {
-                               t3lib_div::writeFile($cacheFile,serialize($utf8CaseFolding));
+                       // process custom decompositions
+               $customTranslitFile = PATH_t3lib.'unidata/Translit.txt';
+               if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile))      {
+                       $fh = fopen($customTranslitFile,'rb');
+                       if ($fh)        {
+                               while (!feof($fh))      {
+                                       $line = fgets($fh,4096);
+                                       if ($line{0} != '#' && trim($line) != '')       {
+                                               list($char,$translit) = t3lib_div::trimExplode(';', $line);
+                                               if (!$translit) $omit["U+$char"] = 1;
+                                               $decomposition["U+$char"] = explode(' ', $translit);
+
+                                       }
+                               }
+                               fclose($fh);
+                       }
+               }
+
+                       // decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
+               foreach($decomposition as $from => $to) {
+                       $code_decomp = array();
+
+                       while ($code_value = array_shift($to))  {
+                               if (isset($decomposition["U+$code_value"]))     {       // do recursive decomposition
+                                       foreach(array_reverse($decomposition["U+$code_value"]) as $cv)  {
+                                               array_unshift($to, $cv);
+                                       }
+                               } elseif (!isset($mark["U+$code_value"])) {     // remove mark
+                                       array_push($code_decomp, $code_value);
+                               }
+                       }
+                       if (count($code_decomp) || isset($omit[$from])) {
+                               $decomposition[$from] = $code_decomp;
+                       } else {
+                               unset($decomposition[$from]);
+                       }
+               }
+
+                       // create ascii only mapping
+               $this->toASCII['utf-8'] = array();
+               $ascii =& $this->toASCII['utf-8'];
+
+               foreach($decomposition as $from => $to) {
+                       $code_decomp = array();
+                       while ($code_value = array_shift($to))  {
+                               $ord = hexdec($code_value);
+                               if ($ord > 127)
+                                       continue 2;     // skip decompositions containing non-ASCII chars
+                               else
+                                       array_push($code_decomp,chr($ord));
+                       }
+                       $ascii[$this->UnumberToChar(hexdec($from))] = join('',$code_decomp);
+               }
+
+                       // add numeric decompositions
+               foreach($number as $from => $to)        {
+                       $utf8_char = $this->UnumberToChar(hexdec($from));
+                       if (!isset($ascii[$utf8_char])) {
+                               $ascii[$utf8_char] = $to;
+                       }
+               }
+
+               if ($cacheFileCase)     {
+                               t3lib_div::writeFileToTypo3tempDir($cacheFileCase,serialize($utf8CaseFolding));
+               }
+
+               if ($cacheFileASCII)    {
+                               t3lib_div::writeFileToTypo3tempDir($cacheFileASCII,serialize($ascii));
                }
 
                return 3;
@@ -941,6 +1254,7 @@ class t3lib_cs {
         * This function initializes the folding table for a charset other than UTF-8.
         * This function is automatically called by the case folding functions.
         *
+        * @param       string          Charset for which to initialize case folding.
         * @return      integer         Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
         * @access private
         */
@@ -949,7 +1263,7 @@ class t3lib_cs {
                if (is_array($this->caseFolding[$charset]))     return 1;
 
                        // Use cached version if possible
-               $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cscase_'.$charset.'.tbl');
+               $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_'.$charset.'.tbl');
                if ($cacheFile && @is_file($cacheFile)) {
                        $this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
                        return 2;
@@ -961,23 +1275,26 @@ class t3lib_cs {
                }
 
                        // UTF-8 case folding is used as the base conversion table
-               if (!$this->initCaseFoldingUTF8())      {
+               if (!$this->initUnicodeData('case'))    {
                        return false;
                }
 
                $nochar = chr($this->noCharByteVal);
                foreach ($this->parsedCharsets[$charset]['local'] as $ci => $utf8)      {
                                // reconvert to charset (don't use chr() of numeric value, might be muli-byte)
-                       $c = $this->conv($utf8, 'utf-8', $charset);
+                       $c = $this->utf8_decode($utf8, $charset);
 
-                       $cc = $this->conv($this->caseFolding['utf-8']['toUpper'][$utf8], 'utf-8', $charset);
-                       if ($cc && $cc != $nochar)      $this->caseFolding[$charset]['toUpper'][$c] = $cc;
+                               // $cc = $this->conv($this->caseFolding['utf-8']['toUpper'][$utf8], 'utf-8', $charset);
+                       $cc = $this->utf8_decode($this->caseFolding['utf-8']['toUpper'][$utf8], $charset);
+                       if ($cc != '' && $cc != $nochar)        $this->caseFolding[$charset]['toUpper'][$c] = $cc;
 
-                       $cc = $this->conv($this->caseFolding['utf-8']['toLower'][$utf8], 'utf-8', $charset);
-                       if ($cc && $cc != $nochar)      $this->caseFolding[$charset]['toLower'][$c] = $cc;
+                               // $cc = $this->conv($this->caseFolding['utf-8']['toLower'][$utf8], 'utf-8', $charset);
+                       $cc = $this->utf8_decode($this->caseFolding['utf-8']['toLower'][$utf8], $charset);
+                       if ($cc != '' && $cc != $nochar)        $this->caseFolding[$charset]['toLower'][$c] = $cc;
 
-                       $cc = $this->conv($this->caseFolding['utf-8']['toTitle'][$utf8], 'utf-8', $charset);
-                       if ($cc && $cc != $nochar)      $this->caseFolding[$charset]['toTitle'][$c] = $cc;
+                               // $cc = $this->conv($this->caseFolding['utf-8']['toTitle'][$utf8], 'utf-8', $charset);
+                       $cc = $this->utf8_decode($this->caseFolding['utf-8']['toTitle'][$utf8], $charset);
+                       if ($cc != '' && $cc != $nochar)        $this->caseFolding[$charset]['toTitle'][$c] = $cc;
                }
 
                        // add the ASCII case table
@@ -989,12 +1306,57 @@ class t3lib_cs {
                }
 
                if ($cacheFile) {
-                               t3lib_div::writeFile($cacheFile,serialize($this->caseFolding[$charset]));
+                               t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->caseFolding[$charset]));
                }
 
                return 3;
        }
 
+       /**
+        * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
+        * This function is automatically called by the ASCII transliteration functions.
+        *
+        * @param       string          Charset for which to initialize conversion.
+        * @return      integer         Returns FALSE on error, a TRUE value on success: 1 = table already loaded, 2 = cached version loaded, 3 = table parsed (and cached).
+        * @access private
+        */
+       function initToASCII($charset)  {
+                       // Only process if the to-ASCII table is not yet loaded:
+               if (is_array($this->toASCII[$charset])) return 1;
+
+                       // Use cached version if possible
+               $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_'.$charset.'.tbl');
+               if ($cacheFile && @is_file($cacheFile)) {
+                       $this->toASCII[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
+                       return 2;
+               }
+
+                       // init UTF-8 conversion for this charset
+               if (!$this->initCharset($charset))      {
+                       return false;
+               }
+
+                       // UTF-8/ASCII transliteration is used as the base conversion table
+               if (!$this->initUnicodeData('ascii'))   {
+                       return false;
+               }
+
+               $nochar = chr($this->noCharByteVal);
+               foreach ($this->parsedCharsets[$charset]['local'] as $ci => $utf8)      {
+                               // reconvert to charset (don't use chr() of numeric value, might be multi-byte)
+                       $c = $this->utf8_decode($utf8, $charset);
+
+                       if (isset($this->toASCII['utf-8'][$utf8]))      {
+                               $this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
+                       }
+               }
+
+               if ($cacheFile) {
+                               t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->toASCII[$charset]));
+               }
+
+               return 3;
+       }
 
 
 
@@ -1018,42 +1380,14 @@ class t3lib_cs {
         ********************************************/
 
        /**
-        * Cuts a string short at a given byte length.
-        *
-        * @param       string          the character set
-        * @param       string          character string
-        * @param       integer         the byte length
-        * @return      string          the shortened string
-        * @see mb_strcut()
-        * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
-        */
-       function strtrunc($charset,$string,$len)        {
-               if ($len <= 0)  return '';
-
-               if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
-                       return mb_strcut($string,0,$len,$charset);
-               } elseif ($charset == 'utf-8')  {
-                       return $this->utf8_strtrunc($string);
-               } elseif ($this->eucBasedSets[$charset])        {
-                       return $this->euc_strtrunc($string,$charset);
-               } elseif ($this->twoByteSets[$charset]) {
-                       if ($len % 2)   $len--;         // don't cut at odd positions
-               } elseif ($this->fourByteSets[$charset])        {
-                       $x = $len % 4;
-                       $len -= $x;     // realign to position dividable by four
-               }
-               // treat everything else as single-byte encoding
-               return substr($string,0,$len);
-       }
-
-       /**
         * Returns a part of a string.
+        * Unit-tested by Kasper (single byte charsets only)
         *
-        * @param       string          the character set
-        * @param       string          character string
-        * @param       int             start position (character position)
-        * @param       int             length (in characters)
-        * @return      string          the substring
+        * @param       string          The character set
+        * @param       string          Character string
+        * @param       integer         Start position (character position)
+        * @param       integer         Length (in characters)
+        * @return      string          The substring
         * @see substr(), mb_substr()
         * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
         */
@@ -1064,13 +1398,28 @@ class t3lib_cs {
                                // cannot omit $len, when specifying charset
                        if ($len==null) {
                                $enc = mb_internal_encoding();  // save internal encoding
-                               mb_internal_encoding('utf-8');
+                               mb_internal_encoding($charset);
                                $str = mb_substr($string,$start);
                                mb_internal_encoding($enc);     // restore internal encoding
 
                                return $str;
                        }
-                       else    return mb_substr($string,$start,$len,'utf-8');
+                       else {
+                               return mb_substr($string,$start,$len,$charset);
+                       }
+               } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv')      {
+                               // cannot omit $len, when specifying charset
+                       if ($len==null) {
+                               $enc = iconv_get_encoding('internal_encoding'); // save internal encoding
+                               iconv_set_encoding('internal_encoding',$charset);
+                               $str = iconv_substr($string,$start);
+                               iconv_set_encoding('internal_encoding',$enc);   // restore internal encoding
+
+                               return $str;
+                       }
+                       else {
+                               return iconv_substr($string,$start,$len,$charset);
+                       }
                } elseif ($charset == 'utf-8')  {
                        return $this->utf8_substr($string,$start,$len);
                } elseif ($this->eucBasedSets[$charset])        {
@@ -1082,22 +1431,79 @@ class t3lib_cs {
                }
 
                // treat everything else as single-byte encoding
-               return substr($string,$start,$len);
+               return $len === NULL ? substr($string,$start) : substr($string,$start,$len);
+       }
+
+       /**
+        * Counts the number of characters.
+        * Unit-tested by Kasper (single byte charsets only)
+        *
+        * @param       string          The character set
+        * @param       string          Character string
+        * @return      integer         The number of characters
+        * @see strlen()
+        * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
+        */
+       function strlen($charset,$string)       {
+               if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
+                       return mb_strlen($string,$charset);
+               } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv')      {
+                       return iconv_strlen($string,$charset);
+               } elseif ($charset == 'utf-8')  {
+                       return $this->utf8_strlen($string);
+               } elseif ($this->eucBasedSets[$charset])        {
+                       return $this->euc_strlen($string,$charset);
+               } elseif ($this->twoByteSets[$charset]) {
+                       return strlen($string)/2;
+               } elseif ($this->fourByteSets[$charset])        {
+                       return strlen($string)/4;
+               }
+               // treat everything else as single-byte encoding
+               return strlen($string);
+       }
+
+       /**
+        * Method to crop strings using the mb_substr function.
+        *
+        * @param  string               The character set
+        * @param  string               String to be cropped
+        * @param  integer              Crop length (in characters)
+        * @param  string               Crop signifier
+        * @return string               The shortened string
+        * @see mb_strlen(), mb_substr()
+        */
+       protected function cropMbstring($charset, $string, $len, $crop = '') {
+               if (intval($len) === 0 || mb_strlen($string, $charset) <= abs($len)) {
+                       return $string;
+               }
+
+               if ($len > 0) {
+                       $string = mb_substr($string, 0, $len, $charset) . $crop;
+               } else {
+                       $string = $crop . mb_substr($string, $len, mb_strlen($string, $charset), $charset);
+               }
+
+               return $string;
        }
 
        /**
         * Truncates a string and pre-/appends a string.
+        * Unit tested by Kasper
         *
-        * @param       string          the character set
-        * @param       string          character string
-        * @param       int             length (in characters)
-        * @param       string          crop signifier
-        * @return      string          the shortened string
+        * @param       string          The character set
+        * @param       string          Character string
+        * @param       integer         Length (in characters)
+        * @param       string          Crop signifier
+        * @return      string          The shortened string
         * @see substr(), mb_strimwidth()
         * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
         */
        function crop($charset,$string,$len,$crop='')   {
-               if ($len == 0)  return $crop;
+               if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
+                       return $this->cropMbstring($charset, $string, $len, $crop);
+               }
+
+               if (intval($len) == 0)  return $string;
 
                if ($charset == 'utf-8')        {
                        $i = $this->utf8_char2byte_pos($string,$len);
@@ -1116,94 +1522,173 @@ class t3lib_cs {
                        return $string;
                } else  {
                        if ($len > 0)   {
-                               if ($string{$i+1})      {
+                               if (strlen($string{$i}))        {
                                        return substr($string,0,$i).$crop;
+
                                }
                        } else {
-                               if ($string{$i-1})      {
+                               if (strlen($string{$i-1}))      {
                                        return $crop.substr($string,$i);
                                }
                        }
-               }
 
+/*
+                       if (abs($len)<$this->strlen($charset,$string))  {       // Has to use ->strlen() - otherwise multibyte strings ending with a multibyte char will return true here (which is not a catastrophe, but...)
+                               if ($len > 0)   {
+                                       return substr($string,0,$i).$crop;
+                               } else {
+                                       return $crop.substr($string,$i);
+                               }
+                       }
+*/
+               }
                return $string;
        }
 
        /**
-        * Counts the number of characters.
+        * Cuts a string short at a given byte length.
         *
-        * @param       string          the character set
-        * @param       string          character string
-        * @return      integer         the number of characters
-        * @see strlen()
+        * @param       string          The character set
+        * @param       string          Character string
+        * @param       integer         The byte length
+        * @return      string          The shortened string
+        * @see mb_strcut()
         * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
         */
-       function strlen($charset,$string)       {
+       function strtrunc($charset,$string,$len)        {
+               if ($len <= 0)  return '';
+
                if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
-                       return mb_strlen($string,$charset);
+                       return mb_strcut($string,0,$len,$charset);
                } elseif ($charset == 'utf-8')  {
-                       return $this->utf8_strlen($string);
+                       return $this->utf8_strtrunc($string,$len);
                } elseif ($this->eucBasedSets[$charset])        {
-                       return $this->euc_strlen($string,$charset);
+                       return $this->euc_strtrunc($string,$len,$charset);      // byte length must be passed on, as for the UTF-8 branch
                } elseif ($this->twoByteSets[$charset]) {
-                       return strlen($string)/2;
+                       if ($len % 2)   $len--;         // don't cut at odd positions
                } elseif ($this->fourByteSets[$charset])        {
-                       return strlen($string)/4;
+                       $x = $len % 4;
+                       $len -= $x;     // realign to position divisible by four
                }
                // treat everything else as single-byte encoding
-               return strlen($string);
+               return substr($string,0,$len);
        }
 
        /**
         * Translates all characters of a string into their respective case values.
         * Unlike strtolower() and strtoupper() this method is locale independent.
-        *
+        * Note that the string length may change!
+        * e.g. lower case German ß (sharp S) becomes upper case "SS"
+        * Unit-tested by Kasper
         * Real case folding is language dependent, this method ignores this fact.
         *
-        * @param       string          string
-        * @return      string          the converted string
+        * @param       string          Character set of string
+        * @param       string          Input string to convert case for
+        * @param       string          Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
+        * @return      string          The converted string
         * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
         * @see strtolower(), strtoupper()
         */
        function conv_case($charset,$string,$case)      {
-               if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring' &&
-                       float(phpversion()) >= 4.3)     {
+               if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
                        if ($case == 'toLower') {
-                               return mb_strtolower($str,'utf-8');
+                               $string = mb_strtolower($string,$charset);
                        } else {
-                               return mb_strtoupper($str,'utf-8');
+                               $string = mb_strtoupper($string,$charset);
                        }
                } elseif ($charset == 'utf-8')  {
-                       return $this->utf8_conv_case($string,$case);
-               } elseif ($this->eucBasedSets[$charset])        {
-                       return $this->euc_conv_case($string,$case,$charset);
+                       $string = $this->utf8_char_mapping($string,'case',$case);
+               } elseif (isset($this->eucBasedSets[$charset])) {
+                       $string = $this->euc_char_mapping($string,$charset,'case',$case);
+               } else {
+                               // treat everything else as single-byte encoding
+                       $string = $this->sb_char_mapping($string,$charset,'case',$case);
                }
 
-               // treat everything else as single-byte encoding
-               if (!$this->initCaseFolding($charset))  return $string; // do nothing
+               return $string;
+       }
 
-               $out = '';
-               $caseConv =& $this->caseFolding[$charset][$case];
-               for($i=0; $c=$string{$i}; $i++) {
-                       $cc = $caseConv[$c];
-                       if ($cc)        {
-                               $out .= $cc;
-                       } else {
-                               $out .= $c;
-                       }
+       /**
+        * Converts special chars (like æøå, umlauts etc) to ascii equivalents (usually double-bytes, like æ => ae etc.)
+        *
+        * @param       string          Character set of string
+        * @param       string          Input string to convert
+        * @return      string          The converted string
+        */
+       function specCharsToASCII($charset,$string)     {
+               if ($charset == 'utf-8')        {
+                       $string = $this->utf8_char_mapping($string,'ascii');
+               } elseif (isset($this->eucBasedSets[$charset])) {
+                       $string = $this->euc_char_mapping($string,$charset,'ascii');
+               } else {
+                               // treat everything else as single-byte encoding
+                       $string = $this->sb_char_mapping($string,$charset,'ascii');
                }
 
-               // is a simple strtr() faster or slower than the code above?
-               // perhaps faster for small single-byte tables but slower for large multi-byte tables?
-               //
-               // return strtr($string,$this->caseFolding[$charset][$case]);
-
-               return $out;
+               return $string;
        }
 
 
+       /**
+        * converts the language codes that we get from the client (usually HTTP_ACCEPT_LANGUAGE)
+        * into a TYPO3-readable language code
+        * @param       $languageCodesList      list of language codes. something like 'de,en-us;q=0.9,de-de;q=0.7,es-cl;q=0.6,en;q=0.4,es;q=0.3,zh;q=0.1'
+        *                      see http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4
+        * @return      string  a preferred language that TYPO3 supports, or "default" if none found
+        * @author      Benjamin Mack (benni.typo3.org)
+        */
+       public function getPreferredClientLanguage($languageCodesList) {
+               $allLanguageCodes = array();
+               $selectedLanguage = 'default';
+
+               // get all languages where TYPO3 code is the same as the ISO code
+               foreach ($this->charSetArray as $typo3Lang => $charSet) {
+                       $allLanguageCodes[$typo3Lang] = $typo3Lang;
+               }
+
+               // get all languages where TYPO3 code differs from ISO code
+               // or needs the country part
+               // the iso codes will here overwrite the default typo3 language in the key
+               foreach ($this->isoArray as $typo3Lang => $isoLang) {
+                       $isoLang = join('-', explode('_', $isoLang));
+                       $allLanguageCodes[$typo3Lang] = $isoLang;
+               }
+
+               // move the ISO codes to the keys (because we're comparing the keys with "isset" later on)
+               $allLanguageCodes = array_flip($allLanguageCodes);
 
 
+               $preferredLanguages = t3lib_div::trimExplode(',', $languageCodesList);
+               // collect the preferred languages as language code => quality, so they can be ordered by quality
+               $sortedPreferredLanguages = array();
+               foreach ($preferredLanguages as $preferredLanguage) {
+                       $quality = 1.0;
+                       if (strpos($preferredLanguage, ';q=') !== false) {
+                               list($preferredLanguage, $quality) = explode(';q=', $preferredLanguage);
+                       }
+                       $sortedPreferredLanguages[$preferredLanguage] = $quality;
+               }
+
+               // loop through the languages, with the highest priority first
+               arsort($sortedPreferredLanguages, SORT_NUMERIC);
+               foreach ($sortedPreferredLanguages as $preferredLanguage => $quality) {
+                       if (isset($allLanguageCodes[$preferredLanguage])) {
+                               $selectedLanguage = $allLanguageCodes[$preferredLanguage];
+                               break;
+                       }
+
+                       // strip the country code from the end
+                       list($preferredLanguage, $preferredCountry) = explode('-', $preferredLanguage);
+                       if (isset($allLanguageCodes[$preferredLanguage])) {
+                               $selectedLanguage = $allLanguageCodes[$preferredLanguage];
+                               break;
+                       }
+               }
+               if (!$selectedLanguage || $selectedLanguage == 'en') {
+                       $selectedLanguage = 'default';
+               }
+               return $selectedLanguage;
+       }
 
 
 
@@ -1216,51 +1701,93 @@ class t3lib_cs {
 
        /********************************************
         *
-        * Internal UTF-8 string operation functions
+        * Internal string operation functions
         *
         ********************************************/
 
        /**
-        * Truncates a string in UTF-8 short at a given byte length.
+        * Maps all characters of a string in a single byte charset.
         *
-        * @param       string          UTF-8 multibyte character string
-        * @param       integer         the byte length
-        * @return      string          the shortened string
-        * @see mb_strcut()
+        * @param       string          the string
+        * @param       string          the charset
+        * @param       string          mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
+        * @param       string          'case': conversion 'toLower' or 'toUpper'
+        * @return      string          the converted string
         * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
         */
-       function utf8_strtrunc($str,$len)       {
-               $i = $len-1;
-               if (ord($str{$i}) & 0x80) { // part of a multibyte sequence
-                       for (; $i>0 && !(ord($str{$i}) & 0x40); $i--)   ;       // find the first byte
-                       if ($i <= 0)    return ''; // sanity check
-                       for ($bc=0, $mbs=ord($str{$i}); $mbs & 0x80; $mbs = $mbs << 1)  $bc++;  // calculate number of bytes
-                       if ($bc+$i > $len)      return substr($str,0,$i);
-                        // fallthru: multibyte char fits into length
+       function sb_char_mapping($str,$charset,$mode,$opt='')   {
+               switch($mode)   {
+                       case 'case':
+                               if (!$this->initCaseFolding($charset))  return $str;    // do nothing
+                               $map =& $this->caseFolding[$charset][$opt];
+                               break;
+
+                       case 'ascii':
+                               if (!$this->initToASCII($charset))      return $str;    // do nothing
+                               $map =& $this->toASCII[$charset];
+                               break;
+
+                       default:
+                               return $str;
                }
-               return substr($str,$len);
+
+               $out = '';
+               for($i=0; strlen($str{$i}); $i++)       {
+                       $c = $str{$i};
+                       if (isset($map[$c]))    {
+                               $out .= $map[$c];
+                       } else {
+                               $out .= $c;
+                       }
+               }
+
+               return $out;
        }
 
+
+
+
+
+
+
+
+
+
+       /********************************************
+        *
+        * Internal UTF-8 string operation functions
+        *
+        ********************************************/
+
        /**
         * Returns a part of a UTF-8 string.
+        * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
         *
-        * @param       string          $str    UTF-8 string
-        * @param       int             $start  start position (character position)
-        * @param       int             $len    length (in characters)
-        * @return      string          the substring
+        * @param       string          UTF-8 string
+        * @param       integer         Start position (character position)
+        * @param       integer         Length (in characters)
+        * @return      string          The substring
         * @see substr()
         * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
         */
        function utf8_substr($str,$start,$len=null)     {
+               if (!strcmp($len,'0'))  return '';
+
                $byte_start = $this->utf8_char2byte_pos($str,$start);
-               if ($byte_start === false)      return false;   // $start outside string length
+               if ($byte_start === false)      {
+                       if ($start > 0) {
+                               return false;   // $start outside string length
+                       } else {
+                               $start = 0;
+                       }
+               }
 
                $str = substr($str,$byte_start);
 
                if ($len!=null) {
                        $byte_end = $this->utf8_char2byte_pos($str,$len);
                        if ($byte_end === false)        // $len outside actual string length
-                               return $str;
+                               return $len<0 ? '' : $str;      // When length is less than zero and exceeds, then we return blank string.
                        else
                                return substr($str,0,$byte_end);
                }
@@ -1269,15 +1796,16 @@ class t3lib_cs {
 
        /**
         * Counts the number of characters of a string in UTF-8.
+        * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
         *
         * @param       string          UTF-8 multibyte character string
-        * @return      int             the number of characters
+        * @return      integer         The number of characters
         * @see strlen()
         * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
         */
        function utf8_strlen($str)      {
                $n=0;
-               for($i=0; $str{$i}; $i++)       {
+               for($i=0; strlen($str{$i}); $i++)       {
                        $c = ord($str{$i});
                        if (!($c & 0x80))       // single-byte (0xxxxxx)
                                $n++;
@@ -1288,18 +1816,41 @@ class t3lib_cs {
        }
 
        /**
+        * Truncates a string in UTF-8 short at a given byte length.
+        *
+        * @param       string          UTF-8 multibyte character string
+        * @param       integer         the byte length
+        * @return      string          the shortened string
+        * @see mb_strcut()
+        * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
+        */
+       function utf8_strtrunc($str,$len)       {
+               $i = $len-1;
+               if (ord($str{$i}) & 0x80) { // part of a multibyte sequence
+                       for (; $i>0 && !(ord($str{$i}) & 0x40); $i--)   ;       // find the first byte
+                       if ($i <= 0)    return ''; // sanity check
+                       for ($bc=0, $mbs=ord($str{$i}); $mbs & 0x80; $mbs = $mbs << 1)  $bc++;  // calculate number of bytes
+                       if ($bc+$i > $len)      return substr($str,0,$i);
+                       // fallthru: multibyte char fits into length
+               }
+               return substr($str,0,$len);
+       }
+
+       /**
         * Find position of first occurrence of a string, both arguments are in UTF-8.
         *
         * @param       string          UTF-8 string to search in
         * @param       string          UTF-8 string to search for
-        * @param       int             positition to start the search
-        * @return      int             the character position
+        * @param       integer         Position to start the search
+        * @return      integer         The character position
         * @see strpos()
         * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
         */
        function utf8_strpos($haystack,$needle,$offset=0)       {
                if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
-                       return mb_strpos($haystack,$needle,'utf-8');
+                       return mb_strpos($haystack,$needle,$offset,'utf-8');
+               } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv')      {
+                       return iconv_strpos($haystack,$needle,$offset,'utf-8');
                }
 
                $byte_offset = $this->utf8_char2byte_pos($haystack,$offset);
@@ -1315,14 +1866,16 @@ class t3lib_cs {
         * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
         *
         * @param       string          UTF-8 string to search in
-        * @param       char            UTF-8 character to search for
-        * @return      int             the character position
+        * @param       string          UTF-8 character to search for (single character)
+        * @return      integer         The character position
         * @see strrpos()
         * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
         */
        function utf8_strrpos($haystack,$needle)        {
                if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
                        return mb_strrpos($haystack,$needle,'utf-8');
+               } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv')      {
+                       return iconv_strrpos($haystack,$needle,'utf-8');
                }
 
                $byte_pos = strrpos($haystack,$needle);
@@ -1333,15 +1886,16 @@ class t3lib_cs {
 
        /**
         * Translates a character position into an 'absolute' byte position.
+        * Unit tested by Kasper.
         *
         * @param       string          UTF-8 string
-        * @param       int             character position (negative values start from the end)
-        * @return      int             byte position
+        * @param       integer         Character position (negative values start from the end)
+        * @return      integer         Byte position
         * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
         */
        function utf8_char2byte_pos($str,$pos)  {
-               $n = 0;         // number of characters found
-               $p = abs($pos); // number of characters wanted
+               $n = 0;                         // number of characters found
+               $p = abs($pos);         // number of characters wanted
 
                if ($pos >= 0)  {
                        $i = 0;
@@ -1351,14 +1905,14 @@ class t3lib_cs {
                        $d = -1;
                }
 
-               for( ; $str{$i} && $n<$p; $i+=d)        {
+               for( ; strlen($str{$i}) && $n<$p; $i+=$d)       {
                        $c = (int)ord($str{$i});
                        if (!($c & 0x80))       // single-byte (0xxxxxx)
                                $n++;
                        elseif (($c & 0xC0) == 0xC0)    // multi-byte starting byte (11xxxxxx)
                                $n++;
                }
-               if (!$str{$i})  return false; // offset beyond string length
+               if (!strlen($str{$i}))  return false; // offset beyond string length
 
                if ($pos >= 0)  {
                                // skip trailing multi-byte data bytes
@@ -1373,10 +1927,11 @@ class t3lib_cs {
 
        /**
         * Translates an 'absolute' byte position into a character position.
+        * Unit tested by Kasper.
         *
         * @param       string          UTF-8 string
-        * @param       int             byte position
-        * @return      int             character position
+        * @param       integer         byte position
+        * @return      integer         character position
         * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
         */
        function utf8_byte2char_pos($str,$pos)  {
@@ -1388,26 +1943,38 @@ class t3lib_cs {
                        elseif (($c & 0xC0) == 0xC0)    // multi-byte starting byte (11xxxxxx)
                                $n++;
                }
-               if (!$str{$i})  return false; // offset beyond string length
+               if (!strlen($str{$i}))  return false; // offset beyond string length
 
                return $n;
        }
 
        /**
-        * Translates all characters of an UTF-8 string into their respective case values.
+        * Maps all characters of a UTF-8 string.
         *
         * @param       string          UTF-8 string
-        * @param       string          conversion: 'toLower' or 'toUpper'
+        * @param       string          mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
+        * @param       string          option for mode 'case': conversion direction, 'toLower' or 'toUpper'
         * @return      string          the converted string
         * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
-        * @see strtolower(), strtoupper(), mb_convert_case()
         */
-       function utf8_conv_case($str,$case)     {
-               if (!$this->initCaseFoldingUTF8())      return $str;    // do nothing
+       function utf8_char_mapping($str,$mode,$opt='')  {
+               if (!$this->initUnicodeData($mode))     return $str;    // do nothing
 
                $out = '';
-               $caseConv =& $this->caseFolding['utf-8'][$case];
-               for($i=0; $str{$i}; $i++)       {
+               switch($mode)   {
+                       case 'case':
+                               $map =& $this->caseFolding['utf-8'][$opt];
+                               break;
+
+                       case 'ascii':
+                               $map =& $this->toASCII['utf-8'];
+                               break;
+
+                       default:
+                               return $str;
+               }
+
+               for($i=0; strlen($str{$i}); $i++)       {
                        $c = ord($str{$i});
                        if (!($c & 0x80))       // single-byte (0xxxxxx)
                                $mbc = $str{$i};
@@ -1417,9 +1984,8 @@ class t3lib_cs {
                                $i += $bc-1;
                        }
 
-                       $cc = $caseConv[$mbc];
-                       if ($cc)        {
-                               $out .= $cc;
+                       if (isset($map[$mbc]))  {
+                               $out .= $map[$mbc];
                        } else {
                                $out .= $mbc;
                        }
@@ -1469,7 +2035,7 @@ class t3lib_cs {
         */
        function euc_strtrunc($str,$len,$charset)        {
                $sjis = ($charset == 'shift_jis');
-               for ($i=0; $str{$i} && $i<$len; $i++) {
+               for ($i=0; strlen($str{$i}) && $i<$len; $i++) {
                        $c = ord($str{$i});
                        if ($sjis)      {
                                if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))  $i++;   // advance a double-byte char
@@ -1478,21 +2044,22 @@ class t3lib_cs {
                                if ($c >= 0x80) $i++;   // advance a double-byte char
                        }
                }
-               if (!$str{$i})  return $str;    // string shorter than supplied length
+               if (!strlen($str{$i}))  return $str;    // string shorter than supplied length
 
-               if ($i>$len)
+               if ($i>$len) {
                        return substr($str,0,$len-1);   // we ended on a first byte
-               else
+               } else {
                        return substr($str,0,$len);
-        }
+               }
+       }
 
        /**
         * Returns a part of a string in the EUC charset family.
         *
         * @param       string          EUC multibyte character string
-        * @param       int             start position (character position)
+        * @param       integer         start position (character position)
         * @param       string          the charset
-        * @param       int             length (in characters)
+        * @param       integer         length (in characters)
         * @return      string          the substring
         * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
         */
@@ -1517,14 +2084,14 @@ class t3lib_cs {
         *
         * @param       string          EUC multibyte character string
         * @param       string          the charset
-        * @return      int             the number of characters
+        * @return      integer         the number of characters
         * @see strlen()
         * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
         */
        function euc_strlen($str,$charset)       {
                $sjis = ($charset == 'shift_jis');
                $n=0;
-               for ($i=0; $str{$i}; $i++) {
+               for ($i=0; strlen($str{$i}); $i++) {
                        $c = ord($str{$i});
                        if ($sjis)      {
                                if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))  $i++;   // advance a double-byte char
@@ -1537,15 +2104,15 @@ class t3lib_cs {
                }
 
                return $n;
-        }
+       }
 
        /**
         * Translates a character position into an 'absolute' byte position.
         *
         * @param       string          EUC multibyte character string
-        * @param       int             character position (negative values start from the end)
+        * @param       integer         character position (negative values start from the end)
         * @param       string          the charset
-        * @return      int             byte position
+        * @return      integer         byte position
         * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
         */
        function euc_char2byte_pos($str,$pos,$charset)  {
@@ -1561,7 +2128,7 @@ class t3lib_cs {
                        $d = -1;
                }
 
-               for ( ; $str{$i} && $n<$p; $i+=$d) {
+               for ( ; strlen($str{$i}) && $n<$p; $i+=$d) {
                        $c = ord($str{$i});
                        if ($sjis)      {
                                if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))  $i+=$d; // advance a double-byte char
@@ -1572,7 +2139,7 @@ class t3lib_cs {
 
                        $n++;
                }
-               if (!$str{$i})  return false; // offset beyond string length
+               if (!strlen($str{$i}))  return false; // offset beyond string length
 
                if ($pos < 0)   $i++;   // correct offset
 
@@ -1580,23 +2147,36 @@ class t3lib_cs {
        }
 
        /**
-        * Translates all characters of a string in the EUC charset family into their respective case values.
+        * Maps all characters of a string in the EUC charset family.
         *
         * @param       string          EUC multibyte character string
-        * @param       string          conversion: 'toLower' or 'toUpper'
         * @param       string          the charset
+        * @param       string          mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
+        * @param       string          option for mode 'case': conversion direction, 'toLower' or 'toUpper'
         * @return      string          the converted string
         * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
-        * @see strtolower(), strtoupper(), mb_convert_case()
         */
-       function euc_conv_case($str,$case,$charset)     {
-               if (!$this->initCaseFolding($charset))  return $str;    // do nothing
+       function euc_char_mapping($str,$charset,$mode,$opt='')  {
+               switch($mode)   {
+                       case 'case':
+                               if (!$this->initCaseFolding($charset))  return $str;    // do nothing
+                               $map =& $this->caseFolding[$charset][$opt];
+                               break;
+
+                       case 'ascii':
+                               if (!$this->initToASCII($charset))      return $str;    // do nothing
+                               $map =& $this->toASCII[$charset];
+                               break;
+
+                       default:
+                               return $str;
+               }
 
                $sjis = ($charset == 'shift_jis');
                $out = '';
-               $caseConv =& $this->caseFolding[$charset][$case];
-               for($i=0; $mbc=$str{$i}; $i++)  {
-                       $c = ord($str{$i});
+               for($i=0; strlen($str{$i}); $i++)       {
+                       $mbc = $str{$i};
+                       $c = ord($mbc);
 
                        if ($sjis)      {
                                if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))  {       // a double-byte char
@@ -1611,9 +2191,8 @@ class t3lib_cs {
                                }
                        }
 
-                       $cc = $caseConv[$mbc];
-                       if ($cc)        {
-                               $out .= $cc;
+                       if (isset($map[$mbc]))  {
+                               $out .= $map[$mbc];
                        } else {
                                $out .= $mbc;
                        }
@@ -1627,4 +2206,5 @@ class t3lib_cs {
 if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php'])       {
        include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
 }
-?>
+
+?>
\ No newline at end of file