This commit was manufactured by cvs2svn to create tag
[Packages/TYPO3.CMS.git] / t3lib / class.t3lib_cs.php
index 165155e..4b7cc70 100755 (executable)
@@ -1,19 +1,19 @@
 <?php
 /***************************************************************
 *  Copyright notice
-*  
-*  (c) 2003 Kasper Skårhøj (kasper@typo3.com)
+*
+*  (c) 2003-2004 Kasper Skaarhoj (kasper@typo3.com)
 *  All rights reserved
 *
-*  This script is part of the Typo3 project. The Typo3 project is 
+*  This script is part of the Typo3 project. The Typo3 project is
 *  free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
-* 
+*
 *  The GNU General Public License can be found at
 *  http://www.gnu.org/copyleft/gpl.html.
-* 
+*
 *  This script is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *
 *  This copyright notice MUST APPEAR in all copies of the script!
 ***************************************************************/
-/** 
- * Class for conversion between charsets. 
+/**
+ * Class for conversion between charsets.
  *
- * @author     Kasper Skårhøj <kasper@typo3.com>
+ * $Id$
+ *
+ * @author     Kasper Skaarhoj <kasper@typo3.com>
  * @author     Martin Kutschker <martin.t.kutschker@blackbox.net>
  */
 /**
  *
  *
  *
- *  102: class t3lib_cs 
- *  194:     function parse_charset($charset)  
- *  211:     function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)   
- *  245:     function utf8_encode($str,$charset)       
- *  286:     function utf8_decode($str,$charset,$useEntityForNoChar=0) 
- *  341:     function utf8_to_entities($str)   
- *  374:     function entities_to_utf8($str,$alsoStdHtmlEnt=0) 
- *  405:     function utf8_to_numberarray($str,$convEntities=0,$retChar=0)     
- *  446:     function initCharset($charset)    
- *  517:     function UnumberToChar($cbyte)    
- *  561:     function utf8CharToUnumber($str,$hex=0)   
- *  590:     function utf8_strtrunc($str,$len) 
- *  612:     function utf_strlen($str) 
- *  625:     function utf_substr($str,$start,$len=0)   
- *  639:     function utf_strpos($haystack,$needle,$offset=0)  
- *  652:     function utf_strrpos($haystack,$needle,$offset=0) 
+ *  119: class t3lib_cs
+ *  261:     function parse_charset($charset)
+ *  278:     function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
+ *  312:     function utf8_encode($str,$charset)
+ *  359:     function utf8_decode($str,$charset,$useEntityForNoChar=0)
+ *  407:     function utf8_to_entities($str)
+ *  440:     function entities_to_utf8($str,$alsoStdHtmlEnt=0)
+ *  474:     function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
+ *  515:     function initCharset($charset)
+ *  586:     function UnumberToChar($cbyte)
+ *  630:     function utf8CharToUnumber($str,$hex=0)
+ *
+ *              SECTION: String operation functions
+ *  682:     function strtrunc($charset,$string,$len)
+ *  716:     function substr($charset,$str,$start,$len=null)
+ *  755:     function strlen($charset,$string)
+ *
+ *              SECTION: UTF-8 String operation functions
+ *  803:     function utf8_strtrunc($str,$len)
+ *  831:     function utf8_substr($str,$start,$len=null)
+ *  857:     function utf8_strlen($str)
+ *  879:     function utf8_strpos($haystack,$needle,$offset=0)
+ *  902:     function utf8_strrpos($haystack,$needle)
+ *  921:     function utf8_char2byte_pos($str,$pos)
+ *  946:     function utf8_byte2char_pos($str,$pos)
  *
- * TOTAL FUNCTIONS: 15
+ *              SECTION: EUC String operation functions
+ *  994:     function euc_strtrunc($str,$len,$charset)
+ * 1028:     function euc_substr($str,$start,$charset,$len=null)
+ * 1055:     function euc_strlen($str,$charset)
+ * 1082:     function euc_char2byte_pos($str,$pos,$charset)
+ *
+ * TOTAL FUNCTIONS: 24
  * (This index is automatically created/updated by the extension "extdeveval")
  *
  */
  */
 /**
  * Class for conversion between charsets.
- * 
- * @author     Kasper Skårhøj <kasper@typo3.com>
+ *
+ * @author     Kasper Skaarhoj <kasper@typo3.com>
  * @author     Martin Kutschker <martin.t.kutschker@blackbox.net>
  * @package TYPO3
  * @subpackage t3lib
  */
 class t3lib_cs {
        var $noCharByteVal=127;         // ASCII Value for chars with no equalent.
-               
-               // This is the array where parsed conversion tables are stored (cached) 
+
+               // This is the array where parsed conversion tables are stored (cached)
        var $parsedCharsets=array();
 
+               // An array where case folding data will be stored (cached)
+       var $caseFolding=array();
+
                // This tells the converter which charsets has two bytes per char:
        var $twoByteSets=array(
                'ucs-2'=>1,     // 2-byte Unicode
-               'utf-16'=>1     // 2-byte Unicode with surrogates
        );
 
-               // This tells the converter which charset use the Extended Unix Code scheme:
+               // This tells the converter which charsets use four bytes per char:
+       var $fourByteSets=array(
+               'ucs-4'=>1,     // 4-byte Unicode
+               'utf-32'=>1,    // 4-byte Unicode (limited to the 21-bits of UTF-16)
+       );
+
+               // This tells the converter which charsets use a scheme like the Extended Unix Code:
        var $eucBasedSets=array(
                'gb2312'=>1,    // Chinese, simplified.
+               'big5'=>1,      // Chinese, traditional.
+               'shift_jis'=>1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-byte characters above 0x80! NOTE(review): utf8_encode() tests this table before its dedicated shift_jis branch, so that branch looks unreachable - confirm.
        );
 
                // see  http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
@@ -152,6 +179,7 @@ class t3lib_cs {
                'l9' => 'iso-8859-15',
                'cyrillic' => 'iso-8859-5',
                'arabic' => 'iso-8859-6',
+               'tis-620' => 'iso-8859-11',
                'win874' => 'windows-874',
                'win1250' => 'windows-1250',
                'win1251' => 'windows-1251',
@@ -163,45 +191,289 @@ class t3lib_cs {
                'win1257' => 'windows-1257',
                'win1258' => 'windows-1258',
                'cp1250' => 'windows-1250',
+               'cp1251' => 'windows-1251',
                'cp1252' => 'windows-1252',
                'ms-ee' => 'windows-1250',
                'ms-ansi' => 'windows-1252',
                'ms-greek' => 'windows-1253',
                'ms-turk' => 'windows-1254',
                'winbaltrim' => 'windows-1257',
-               'mac' => 'macRoman',
-               'macintosh' => 'macRoman',
+               'koi-8ru' => 'koi-8r',
+               'koi8r' => 'koi-8r',
+               'cp878' => 'koi-8r',
+               'mac' => 'macroman',
+               'macintosh' => 'macroman',
                'euc-cn' => 'gb2312',
                'x-euc-cn' => 'gb2312',
+               'euccn' => 'gb2312',
+               'cp936' => 'gb2312',
+               'big-5' => 'big5',
+               'cp950' => 'big5',
+               'eucjp' => 'euc-jp',
+               'sjis' => 'shift_jis',
+               'shift-jis' => 'shift_jis',
+               'cp932' => 'shift_jis',
+               'utf7' => 'utf-7',
+               'utf8' => 'utf-8',
+               'utf16' => 'utf-16',
+               'utf32' => 'utf-32',
                'utf8' => 'utf-8',
-               'utf-2' => 'utf-8',
-               'utf2' => 'utf-8',
+               'ucs2' => 'ucs-2',
+               'ucs4' => 'ucs-4',
+       );
+
+               // mapping of language codes and language names to language (family) names; the family name is the key used in the $lang_to_charset_* tables
+       var $lang_to_langfamily=array(
+                       // iso-639:2 language codes, see:
+                       //  http://www.w3.org/WAI/ER/IG/ert/iso639.htm
+                       //  http://www.unicode.org/onlinedat/languages.html
+               'ar' => 'arabic',
+               'bg' => 'cyrillic',
+               'cs' => 'east_european',
+               'da' => 'west_european',
+               'de' => 'west_european',
+               'es' => 'west_european',
+               'et' => 'estonian',
+               'eu' => 'west_european',
+               'fi' => 'west_european',
+               'fr' => 'west_european',
+               'gr' => 'greek',
+               'hr' => 'east_european',
+               'hu' => 'east_european',
+               'iw' => 'hebrew',
+               'is' => 'west_european',
+               'it' => 'west_european',
+               'ja' => 'japanese',
+               'kl' => 'west_european',
+               'ko' => 'korean',
+               'lt' => 'lithuanian',
+               'lv' => 'west_european', // Latvian/Lettish
+               'nl' => 'west_european',
+               'no' => 'west_european',
+               'pl' => 'east_european',
+               'pt' => 'west_european',
+               'ro' => 'east_european',
+               'ru' => 'cyrillic',
+               'sk' => 'east_european',
+               'sl' => 'east_european',
+               'sv' => 'west_european',
+               'th' => 'thai',
+               'uk' => 'cyrillic',
+               'vi' => 'vietnamese',
+               'zh' => 'chinese',
+                       // MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
+               'chs' => 'simpl_chinese',
+               'cht' => 'trad_chinese',
+               'csy' => 'east_european',
+               'dan' => 'west_european',
+               'deu' => 'west_european',
+               'dea' => 'west_european',
+               'des' => 'west_european',
+               'ena' => 'west_european',
+               'enc' => 'west_european',
+               'eng' => 'west_european',
+               'enz' => 'west_european',
+               'enu' => 'west_european',
+               'nld' => 'west_european',
+               'nlb' => 'west_european',
+               'fin' => 'west_european',
+               'fra' => 'west_european',
+               'frb' => 'west_european',
+               'frc' => 'west_european',
+               'frs' => 'west_european',
+               'ell' => 'greek',
+               'hun' => 'east_european',
+               'isl' => 'west_european',
+               'ita' => 'west_european',
+               'its' => 'west_european',
+               'jpn' => 'japanese',
+               'kor' => 'korean',
+               'nor' => 'west_european',
+               'non' => 'west_european',
+               'plk' => 'east_european',
+               'ptg' => 'west_european',
+               'ptb' => 'west_european',
+               'rus' => 'cyrillic',    // same family as 'ru' and 'russian'
+               'sky' => 'east_european',
+               'esp' => 'west_european',
+               'esm' => 'west_european',
+               'esn' => 'west_european',
+               'sve' => 'west_european',
+               'trk' => 'turkish',
+                       // English language names
+               'bulgarian' => 'cyrillic',      // same family as 'bg'
+               'catalan' => 'west_european',
+               'croatian' => 'east_european',
+               'czech' => 'east_european',
+               'danish' => 'west_european',
+               'dutch' => 'west_european',
+               'english' => 'west_european',
+               'finnish' => 'west_european',
+               'french' => 'west_european',
+               'galician' => 'west_european',
+               'german' => 'west_european',
+               'hungarian' => 'east_european',
+               'icelandic' => 'west_european',
+               'italian' => 'west_european',
+               'latvian' => 'west_european',
+               'lettish' => 'west_european',
+               'norwegian' => 'west_european',
+               'polish' => 'east_european',
+               'portuguese' => 'west_european',
+               'russian' => 'cyrillic',
+               'romanian' => 'east_european',
+               'slovak' => 'east_european',
+               'slovenian' => 'east_european',
+               'spanish' => 'west_european',
+               'swedish' => 'west_european',
+               'turkish' => 'turkish', // same family as 'trk'; both charset tables have a 'turkish' entry
+               'ukrainian' => 'cyrillic',
+       );
+
+               // mapping of language (family) names to charsets on Unix; get_locale_charset() reads it when TYPO3_OS is not 'WIN'
+       var $lang_to_charset_unix=array(
+               'west_european' => 'iso-8859-1',
+               'estonian' => 'iso-8859-1',
+               'east_european' => 'iso-8859-2',
+               'baltic' => 'iso-8859-4',
+               'cyrillic' => 'iso-8859-5',
+               'arabic' => 'iso-8859-6',
+               'greek' => 'iso-8859-7',
+               'hebrew' => 'iso-8859-8',
+               'turkish' => 'iso-8859-9',
+               'thai' => 'iso-8859-11', // = TIS-620
+               'lithuanian' => 'iso-8859-13',
+               'chinese' => 'gb2312', // = euc-cn
+               'japanese' => 'euc-jp',
+               'korean' => 'euc-kr',
+               'simpl_chinese' => 'gb2312',
+               'trad_chinese' => 'big5',
+               'vietnamese' => '',     // empty value: get_locale_charset() falls back to 'iso-8859-1'
+       );
+
+               // mapping of language (family) names to charsets on Windows; get_locale_charset() reads it when TYPO3_OS is 'WIN'
+       var $lang_to_charset_windows=array(
+               'east_european' => 'windows-1250',
+               'cyrillic' => 'windows-1251',
+               'west_european' => 'windows-1252',
+               'greek' => 'windows-1253',
+               'turkish' => 'windows-1254',
+               'hebrew' => 'windows-1255',
+               'arabic' => 'windows-1256',
+               'baltic' => 'windows-1257',
+               'estonian' => 'windows-1257',
+               'lithuanian' => 'windows-1257',
+               'vietnamese' => 'windows-1258',
+               'thai' => 'cp874',
+               'korean' => 'cp949',    // Korean is code page 949; cp950 is Traditional Chinese (Big5)
+               'chinese' => 'gb2312',
+               'japanese' => 'shift_jis',
+               'simpl_chinese' => 'gb2312',
+               'trad_chinese' => 'big5',
+       );
+
+               // mapping of locale names to charsets; keys must be lowercase because get_locale_charset() lowercases the locale before the lookup
+       var $locale_to_charset=array(
+               'japanese.euc' => 'euc-jp',
+               'ja_jp.ujis' => 'euc-jp',
+               'korean.euc' => 'euc-kr',
+               'zh_cn' => 'gb2312',
+               'zh_hk' => 'big5',
+               'zh_tw' => 'big5',
        );
-       /*
-               JIS X 0208 (euc-jp)
-               CNS 11643 (EUC-TW)
-               KS C 5601 (EUC-KR) 
-       */
 
+               // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
+               // An empty value means "iso-8859-1"
+       var $charSetArray = array(
+               'dk' => '',
+               'de' => '',
+               'no' => '',
+               'it' => '',
+               'fr' => '',
+               'es' => '',
+               'nl' => '',
+               'cz' => 'windows-1250',
+               'pl' => 'iso-8859-2',
+               'si' => 'windows-1250',
+               'fi' => '',
+               'tr' => 'iso-8859-9',
+               'se' => '',
+               'pt' => '',
+               'ru' => 'windows-1251',
+               'ro' => 'iso-8859-2',
+               'ch' => 'gb2312',
+               'sk' => 'windows-1250',
+               'lt' => 'windows-1257',
+               'is' => 'utf-8',
+               'hr' => 'windows-1250',
+               'hu' => 'iso-8859-2',
+               'gl' => '',
+               'th' => 'iso-8859-11',
+               'gr' => 'iso-8859-7',
+               'hk' => 'big5',
+               'eu' => '',
+               'bg' => 'windows-1251',
+               'br' => '',
+               'et' => 'iso-8859-4',
+               'ar' => 'iso-8859-6',
+               'he' => 'utf-8',
+               'ua' => 'windows-1251',
+               'jp' => 'shift_jis',
+               'lv' => 'utf-8',
+               'vn' => 'utf-8',
+       );
 
        /**
-        * Normalize
-        * 
+        * Normalize - converts the input charset name to lowercase and replaces a known synonym with its canonical name.
+        *
         * @param       string          Input charset
         * @return      string          Normalized charset
         * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
         */
        function parse_charset($charset)        {
-               $encoding = strtolower($charset);
-               if (isset($synonyms[$charset])) $encoding = $synonyms[$charset];
+               $charset = strtolower($charset);        // lowercase first: the synonyms table is keyed by lowercase names
+               if (isset($this->synonyms[$charset]))   $charset = $this->synonyms[$charset];   // replace a known alias with its canonical name
 
                return $charset;
        }
 
+       /**
+        * Get the charset of a locale.
+        *
+        * ln        language
+        * ln_CN     language / country
+        * ln_CN.cs  language / country / charset
+        *
+        * @param       string          Locale
+        * @return      string          Charset
+        * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
+        */
+       function get_locale_charset($locale)    {
+               $locale = strtolower($locale);
+
+                       // exact locale specific charset?
+               if (isset($this->locale_to_charset[$locale]))   return $this->locale_to_charset[$locale];
+
+                       // locale contains charset ("ln_CN.cs"): use it. A locale without "." has no second part, so test before reading it
+               $localeParts = explode('.',$locale);
+               if (!empty($localeParts[1]))    return $this->parse_charset($localeParts[1]);
+
+                       // get language (the part before "_") and map it to its language family, if known
+               list($language) = explode('_',$localeParts[0]);
+               if (isset($this->lang_to_langfamily[$language]))        $language = $this->lang_to_langfamily[$language];
+
+               if (TYPO3_OS == 'WIN')  {       // unknown languages yield '' and fall through to the default below
+                       $cs = isset($this->lang_to_charset_windows[$language]) ? $this->lang_to_charset_windows[$language] : '';
+               } else {
+                       $cs = isset($this->lang_to_charset_unix[$language]) ? $this->lang_to_charset_unix[$language] : '';
+               }
+
+               return $cs ? $cs : 'iso-8859-1';
+       }
 
        /**
         * Convert from one charset to another charset.
-        * 
+        *
         * @param       string          Input string
         * @param       string          From charset (the current charset of the string)
         * @param       string          To charset (the output charset wanted)
@@ -209,62 +481,71 @@ class t3lib_cs {
         * @return      string          Converted string
         */
        function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) {
-               global $TYPO3_CONF_VARS;
-
                if ($fromCS==$toCS)     return $str;
 
-               if (!$useEntityForNoChar)       { // iconv and recode don't support fallback to SGML entities
-                       if ($TYPO3_CONF_VARS['SYS']['t3lib_cs_convMethod'] == 'iconv')  {
+                       // PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
+               if ($toCS=='utf-8' || !$useEntityForNoChar)     {
+                       switch($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'])       {
+                       case 'mbstring':
+                               $conv_str = mb_convert_encoding($str,$toCS,$fromCS);
+                               if (false !== $conv_str)        return $conv_str; // returns false for unsupported charsets
+                               break;
+
+                       case 'iconv':
                                $conv_str = iconv($str,$fromCS,$toCS.'//TRANSLIT');
                                if (false !== $conv_str)        return $conv_str;
-                       }
-                       elseif ($TYPO3_CONF_VARS['SYS']['t3lib_cs_convMethod'] == 'recode')     {
+                               break;
+
+                       case 'recode':
                                $conv_str = recode_string($toCS.'..'.$fromCS,$str);
                                if (false !== $conv_str)        return $conv_str;
-                       }
-                       elseif ($TYPO3_CONF_VARS['SYS']['t3lib_cs_convMethod'] == 'mbstring')   {
-                               $conv_str = mb_convert_encoding($str,$toCS,$fromCS);
-                               if (false !== $conv_str)        return $conv_str; // returns false for unsupported charsets
+                               break;
                        }
                        // fallback to TYPO3 conversion
                }
 
                if ($fromCS!='utf-8')   $str=$this->utf8_encode($str,$fromCS);
-               if ($toCS!='utf-8')             $str=$this->utf8_decode($str,$toCS,$useEntityForNoChar);
+               if ($toCS!='utf-8')     $str=$this->utf8_decode($str,$toCS,$useEntityForNoChar);
                return $str;
        }
 
 
        /**
         * Converts $str from $charset to UTF-8
-        * 
+        *
         * @param       string          String in local charset to convert to UTF-8
         * @param       string          Charset, lowercase. Must be found in csconvtbl/ folder.
         * @return      string          Output string, converted to UTF-8
         */
        function utf8_encode($str,$charset)     {
 
-                       // Charset is case-insensitive. 
+                       // Charset is case-insensitive.
                if ($this->initCharset($charset))       {       // Parse conv. table if not already...
                        $strLen = strlen($str);
                        $outStr='';
 
-                       for ($a=0,$i;$a<$strLen;$a++,$i++)      {       // Traverse each char in string.
+                       for ($a=0;$a<$strLen;$a++)      {       // Traverse each char in string.
                                $chr=substr($str,$a,1);
                                $ord=ord($chr);
                                if ($this->twoByteSets[$charset])       {       // If the charset has two bytes per char
-                                       $ord2 = ord($str[$i+1]);
+                                       $ord2 = ord($str{$a+1});
                                        $ord = $ord<<8 & $ord2; // assume big endian
 
                                        if (isset($this->parsedCharsets[$charset]['local'][$ord]))      {       // If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?)
                                                $outStr.=$this->parsedCharsets[$charset]['local'][$ord];
                                        } else $outStr.=chr($this->noCharByteVal);      // No char exists
+                                       $a++;
                                } elseif ($ord>127)     {       // If char has value over 127 it's a multibyte char in UTF-8
                                        if ($this->eucBasedSets[$charset])      {       // EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
                                                $a++;
                                                $ord2=ord(substr($str,$a,1));
                                                $ord = $ord*256+$ord2;
                                        }
+                                       elseif ($charset == 'shift_jis' && ($ord <160 || $ord>223))     {       // Shift-JIS is like EUC, but chars between 160 and 223 are single byte
+                                               $a++;
+                                               $ord2=ord(substr($str,$a,1));
+                                               $ord = $ord*256+$ord2;
+                                       }
 
                                        if (isset($this->parsedCharsets[$charset]['local'][$ord]))      {       // If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?)
                                                $outStr.=$this->parsedCharsets[$charset]['local'][$ord];
@@ -277,7 +558,7 @@ class t3lib_cs {
 
        /**
         * Converts $str from UTF-8 to $charset
-        * 
+        *
         * @param       string          String in UTF-8 to convert to local charset
         * @param       string          Charset, lowercase. Must be found in csconvtbl/ folder.
         * @param       boolean         If set, then characters that are not available in the destination character set will be encoded as numeric entities
@@ -285,7 +566,7 @@ class t3lib_cs {
         */
        function utf8_decode($str,$charset,$useEntityForNoChar=0)       {
 
-                       // Charset is case-insensitive. 
+                       // Charset is case-insensitive.
                if ($this->initCharset($charset))       {       // Parse conv. table if not already...
                        $strLen = strlen($str);
                        $outStr='';
@@ -296,12 +577,6 @@ class t3lib_cs {
                                if ($ord>127)   {       // This means multibyte! (first byte!)
                                        if ($ord & 64)  {       // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
 
-
-/*
- *
- *
- *     alternative code which fewers substrs and concatenations: see below
- *
                                                $buf=$chr;      // Add first byte
                                                for ($b=0;$b<8;$b++)    {       // for each byte in multibyte string...
                                                        $ord = $ord << 1;       // Shift it left and ...
@@ -310,18 +585,17 @@ class t3lib_cs {
                                                                $buf.=substr($str,$a,1);        // ... and add the next char.
                                                        } else break;
                                                }
-*
-*/
 
-                                               for ($bc=0; $ord & 0x80; $ord = $ord << 1) { $bc++; }   // calculate number of bytes
-                                               $buf.=substr($str,$i,$bc);
-                                               $i+=$bc-1;
+# Martin Kutschker...! this does not work! With russian UTF-8 converted back to windows-1251 it failed... So the old code is re-inserted.
+#                                              for ($bc=0; $ord & 0x80; $ord = $ord << 1) { $bc++; }   // calculate number of bytes
+#                                              $buf.=substr($str,$i,$bc);
+#                                              $i+=$bc-1;
 
                                                if (isset($this->parsedCharsets[$charset]['utf8'][$buf]))       {       // If the UTF-8 char-sequence is found then...
                                                        $mByte = $this->parsedCharsets[$charset]['utf8'][$buf]; // The local number
                                                        if ($mByte>255) {       // If the local number is greater than 255 we will need to split the byte (16bit word assumed) in two chars.
                                                                $outStr.= chr(($mByte >> 8) & 255).chr($mByte & 255);
-                                                       } else $outStr.= chr($mByte);   
+                                                       } else $outStr.= chr($mByte);
                                                } elseif ($useEntityForNoChar) {        // Create num entity:
                                                        $outStr.='&#'.$this->utf8CharToUnumber($buf,1).';';
                                                } else $outStr.=chr($this->noCharByteVal);      // No char exists
@@ -334,7 +608,7 @@ class t3lib_cs {
 
        /**
         * Converts all chars > 127 to numeric entities.
-        * 
+        *
         * @param       string          Input string
         * @return      string          Output string
         */
@@ -355,18 +629,18 @@ class t3lib_cs {
                                                        $buf.=substr($str,$a,1);        // ... and add the next char.
                                                } else break;
                                        }
-                                       
+
                                        $outStr.='&#'.$this->utf8CharToUnumber($buf,1).';';
                                } else $outStr.=chr($this->noCharByteVal);      // No char exists (MIDDLE of MB sequence!)
                        } else $outStr.=$chr;   // ... otherwise it's just ASCII 0-127 and one byte. Transparent
                }
-               
+
                return $outStr;
        }
-       
+
        /**
         * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
-        * 
+        *
         * @param       string          Input string, UTF-8
         * @param       boolean         If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
         * @return      string          Output string
@@ -381,8 +655,11 @@ class t3lib_cs {
                foreach($parts as $k => $v)     {
                        if ($k%2)       {
                                if (substr($v,0,1)=='#')        {       // Dec or hex entities:
-                                       if (substr($v,1,1)=='x')        $v=hexdec(substr($v,2));
-                                       $parts[$k] = $this->UnumberToChar(substr($v,1));
+                                       if (substr($v,1,1)=='x')        {
+                                               $parts[$k] = $this->UnumberToChar(hexdec(substr($v,2)));
+                                       } else {
+                                               $parts[$k] = $this->UnumberToChar(substr($v,1));
+                                       }
                                } elseif ($alsoStdHtmlEnt && $trans_tbl['&'.$v.';']) {  // Other entities:
                                        $parts[$k] = $this->utf8_encode($trans_tbl['&'.$v.';'],'iso-8859-1');
                                } else {        // No conversion:
@@ -390,13 +667,13 @@ class t3lib_cs {
                                }
                        }
                }
-               
+
                return implode('',$parts);
        }
-       
+
        /**
         * Converts all chars in the input UTF-8 string into integer numbers returned in an array
-        * 
+        *
         * @param       string          Input string, UTF-8
         * @param       boolean         If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
         * @param       boolean         If set, then instead of integer numbers the real UTF-8 char is returned.
@@ -424,21 +701,21 @@ class t3lib_cs {
                                                        $buf.=substr($str,$a,1);        // ... and add the next char.
                                                } else break;
                                        }
-                                       
+
                                        $outArr[]=$retChar?$buf:$this->utf8CharToUnumber($buf);
                                } else $outArr[]=$retChar?chr($this->noCharByteVal):$this->noCharByteVal;       // No char exists (MIDDLE of MB sequence!)
                        } else $outArr[]=$retChar?chr($ord):$ord;       // ... otherwise it's just ASCII 0-127 and one byte. Transparent
                }
-               
+
                return $outArr;
        }
 
        /**
         * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
         * This function is automatically called by the conversion functions
-        * 
+        *
         * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
-        * 
+        *
         * @param       string          The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
         * @return      integer         Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed.
         * @access private
@@ -449,7 +726,7 @@ class t3lib_cs {
 
                                // Conversion table filename:
                        $charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';
-                       
+
                                // If the conversion table is found:
                        if ($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile))      {
                                        // Cache file for charsets:
@@ -459,18 +736,18 @@ class t3lib_cs {
                                        $this->parsedCharsets[$charset]=unserialize(t3lib_div::getUrl($cacheFile));
                                } else {
                                                // Parse conversion table into lines:
-                                       $lines=t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1);      
+                                       $lines=t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1);
                                                // Initialize the internal variable holding the conv. table:
-                                       $this->parsedCharsets[$charset]=array('local'=>array(),'utf8'=>array());        
+                                       $this->parsedCharsets[$charset]=array('local'=>array(),'utf8'=>array());
                                                // traverse the lines:
                                        $detectedType='';
                                        foreach($lines as $value)       {
                                                if (trim($value) && substr($value,0,1)!='#')    {       // Comment line or blanks are ignored.
-                                                       
+
                                                                // Detect type if not done yet: (Done on first real line)
                                                                // The "whitespaced" type is on the syntax      "0x0A   0x000A  #LINE FEED"     while   "ms-token" is like              "B9 = U+00B9 : SUPERSCRIPT ONE"
                                                        if (!$detectedType)             $detectedType = ereg('[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+',$value) ? 'whitespaced' : 'ms-token';
-                                                       
+
                                                        if ($detectedType=='ms-token')  {
                                                                list($hexbyte,$utf8) = split('=|:',$value,3);
                                                        } elseif ($detectedType=='whitespaced') {
@@ -499,9 +776,9 @@ class t3lib_cs {
        /**
         * Converts a UNICODE number to a UTF-8 multibyte character
         * Algorithm based on script found at From: http://czyborra.com/utf/
-        * 
-        * The binary representation of the character's integer value is thus simply spread across the bytes and the number of high bits set in the lead byte announces the number of bytes in the multibyte sequence: 
-        * 
+        *
+        * The binary representation of the character's integer value is thus simply spread across the bytes and the number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
+        *
         *  bytes | bits | representation
         *      1 |    7 | 0vvvvvvv
         *      2 |   11 | 110vvvvv 10vvvvvv
@@ -509,14 +786,14 @@ class t3lib_cs {
         *      4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
         *      5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
         *      6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
-        * 
+        *
         * @param       integer         UNICODE integer
         * @return      string          UTF-8 multibyte character string
         * @see utf8CharToUnumber()
         */
        function UnumberToChar($cbyte)  {
                $str='';
-               
+
                if ($cbyte < 0x80) {
                        $str.=chr($cbyte);
                } else if ($cbyte < 0x800) {
@@ -549,10 +826,10 @@ class t3lib_cs {
                }
                return $str;
        }
-       
+
        /**
         * Converts a UTF-8 Multibyte character to a UNICODE number
-        * 
+        *
         * @param       string          UTF-8 multibyte character string
         * @param       boolean         If set, then a hex. number is returned.
         * @return      integer         UNICODE integer
@@ -560,7 +837,7 @@ class t3lib_cs {
         */
        function utf8CharToUnumber($str,$hex=0) {
                $ord=ord(substr($str,0,1));     // First char
-               
+
                if (($ord & 192) == 192)        {       // This verifyes that it IS a multi byte string
                        $binBuf='';
                        for ($b=0;$b<8;$b++)    {       // for each byte in multibyte string...
@@ -573,90 +850,786 @@ class t3lib_cs {
 
                        $int = bindec($binBuf);
                } else $int = $ord;
-               
+
                return $hex ? 'x'.dechex($int) : $int;
        }
 
+       /**
+        * This function initializes the UTF-8 case folding table.
+        *
+        * The table is loaded from the cache file typo3temp/cscase_utf-8.tbl when that file exists. Otherwise it is
+        * built from unidata/UnicodeData.txt, extended with unidata/SpecialCasing.txt (if present) and written to the cache.
+        *
+        * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
+        *
+        * @return      integer         Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
+        * @access private
+        */
+       function initCaseFoldingUTF8()  {
+                       // Only process if the case table is not yet loaded:
+               if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8']))        return 1;
+
+                       // Use cached version if possible
+               $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cscase_utf-8.tbl');
+               if ($cacheFile && @is_file($cacheFile)) {
+                       $this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFile));
+                       return 2;
+               }
+
+                       // process main Unicode data file
+               $unicodeDataFile = PATH_t3lib.'unidata/UnicodeData.txt';
+               if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) return false;
+
+               $fh = fopen($unicodeDataFile,'r');
+               if (!$fh)       return false;
+
+                       // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
+                       // note: we use the UTF-8 characters here and not the Unicode numbers to avoid a conversion roundtrip in utf8_conv_case()
+               $this->caseFolding['utf-8'] = array();
+               $utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand
+               $utf8CaseFolding['toUpper'] = array();
+               $utf8CaseFolding['toLower'] = array();
+               $utf8CaseFolding['toTitle'] = array();
+
+               while (!feof($fh))      {
+                       $line = fgets($fh);
+                       if ($line === false)    break;  // read error or end of file
+
+                               // semicolon separated fields: 0 = code point, 12 = upper, 13 = lower, 14 = title mapping
+                               // (the other fields hold info like character class (digit, white space, etc.) and more)
+                       $fields = explode(';', rtrim($line));
+                       if (count($fields) < 15)        continue;       // blank or malformed line
+
+                       $char = $this->UnumberToChar(hexdec($fields[0]));
+                       $upper = $fields[12];
+                       $lower = $fields[13];
+                       $title = $fields[14];
+                       if ($upper)     $utf8CaseFolding['toUpper'][$char] = $this->UnumberToChar(hexdec($upper));
+                       if ($lower)     $utf8CaseFolding['toLower'][$char] = $this->UnumberToChar(hexdec($lower));
+                               // store "title" only when different from "upper" (only a few)
+                       if ($title && $title != $upper) $utf8CaseFolding['toTitle'][$char] = $this->UnumberToChar(hexdec($title));
+               }
+               fclose($fh);
+
+                       // process additional Unicode data for casing (allow folded characters to expand into a sequence)
+               $specialCasingFile = PATH_t3lib.'unidata/SpecialCasing.txt';
+               if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile))        {
+
+                       $fh = fopen($specialCasingFile,'r');
+                       if ($fh)        {
+                               while (!feof($fh))      {
+                                       $line = fgets($fh);
+                                       if ($line === false)    break;  // read error or end of file
+                                       if (substr($line,0,1) == '#' || trim($line) == '')      continue;       // comment or blank line
+
+                                       list($char,$lower,$title,$upper,$cond) = t3lib_div::trimExplode(';', $line);
+                                               // only unconditional mappings are used (condition field empty or just the trailing comment)
+                                       if ($cond == '' || substr($cond,0,1) == '#')    {
+                                               $utf8_char = $this->UnumberToChar(hexdec($char));
+                                               if ($char != $lower)    {
+                                                       $seq = '';
+                                                       foreach (explode(' ',$lower) as $hex)   $seq .= $this->UnumberToChar(hexdec($hex));
+                                                       $utf8CaseFolding['toLower'][$utf8_char] = $seq;
+                                               }
+                                               if ($char != $title && $title != $upper)        {
+                                                       $seq = '';
+                                                       foreach (explode(' ',$title) as $hex)   $seq .= $this->UnumberToChar(hexdec($hex));
+                                                       $utf8CaseFolding['toTitle'][$utf8_char] = $seq;
+                                               }
+                                               if ($char != $upper)    {
+                                                       $seq = '';
+                                                       foreach (explode(' ',$upper) as $hex)   $seq .= $this->UnumberToChar(hexdec($hex));
+                                                       $utf8CaseFolding['toUpper'][$utf8_char] = $seq;
+                                               }
+                                       }
+                               }
+                               fclose($fh);
+                       }
+               }
+
+               if ($cacheFile) {
+                       t3lib_div::writeFile($cacheFile,serialize($utf8CaseFolding));
+               }
+
+               return 3;
+       }
+
+       /**
+        * This function initializes the folding table for a charset other than UTF-8.
+        * This function is automatically called by the case folding functions.
+        *
+        * The table is derived from the UTF-8 folding table: every character of the charset's conversion table
+        * is looked up there and the result is converted back into the charset.
+        *
+        * @param       string          the character set (must be usable with initCharset())
+        * @return      integer         Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
+        * @access private
+        */
+       function initCaseFolding($charset)      {
+                       // Only process if the case table is not yet loaded:
+               if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset]))      return 1;
+
+                       // Use cached version if possible
+               $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cscase_'.$charset.'.tbl');
+               if ($cacheFile && @is_file($cacheFile)) {
+                       $this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
+                       return 2;
+               }
+
+                       // init UTF-8 conversion for this charset
+               if (!$this->initCharset($charset))      {
+                       return false;
+               }
+
+                       // UTF-8 case folding is used as the base conversion table
+               if (!$this->initCaseFoldingUTF8())      {
+                       return false;
+               }
+
+               $nochar = chr($this->noCharByteVal);
+               foreach ($this->parsedCharsets[$charset]['local'] as $utf8)     {
+                               // reconvert to charset (don't use chr() of numeric value, might be multi-byte)
+                       $c = $this->conv($utf8, 'utf-8', $charset);
+
+                       foreach (array('toUpper','toLower','toTitle') as $case) {
+                                       // most characters have no folding at all: skip them without a lookup notice or a useless conversion
+                               if (!isset($this->caseFolding['utf-8'][$case][$utf8]))  continue;
+
+                               $cc = $this->conv($this->caseFolding['utf-8'][$case][$utf8], 'utf-8', $charset);
+                                       // keep the mapping only if the folded character exists in the charset
+                               if ($cc && $cc != $nochar)      $this->caseFolding[$charset][$case][$c] = $cc;
+                       }
+               }
+
+                       // add the ASCII case table
+               for ($i=ord('a'); $i<=ord('z'); $i++)   {
+                       $this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i-32);
+               }
+               for ($i=ord('A'); $i<=ord('Z'); $i++)   {
+                       $this->caseFolding[$charset]['toLower'][chr($i)] = chr($i+32);
+               }
+
+               if ($cacheFile) {
+                       t3lib_div::writeFile($cacheFile,serialize($this->caseFolding[$charset]));
+               }
+
+               return 3;
+       }
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+       /********************************************
+        *
+        * String operation functions
+        *
+        ********************************************/
 
        /**
-        * Truncates a string in UTF-8 short at a given byte length
-        * 
-        * @param       string          UTF-8 multibyte character string
+        * Cuts a string short at a given byte length.
+        *
+        * @param       string          the character set
+        * @param       string          character string
         * @param       integer         the byte length
         * @return      string          the shortened string
-        * @see strcut()
+        * @see mb_strcut()
         * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
         */
-       function utf8_strtrunc($str,$len)       {
+       function strtrunc($charset,$string,$len)        {
+               if ($len <= 0)  return '';
+
+               if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
+                       return mb_strcut($string,0,$len,$charset);
+               } elseif ($charset == 'utf-8')  {
+                               // the byte length must be handed on, otherwise nothing is truncated
+                       return $this->utf8_strtrunc($string,$len);
+               } elseif ($this->eucBasedSets[$charset])        {
+                               // NOTE(review): euc_strtrunc() is not visible in this part of the file; the argument order
+                               // (string, byte length, charset) is assumed - confirm against its definition
+                       return $this->euc_strtrunc($string,$len,$charset);
+               } elseif ($this->twoByteSets[$charset]) {
+                       if ($len % 2)   $len--;         // don't cut at odd positions
+               } elseif ($this->fourByteSets[$charset])        {
+                       $x = $len % 4;
+                       $len -= $x;     // realign to position divisible by four
+               }
+               // treat everything else as single-byte encoding
+               return substr($string,0,$len);
+       }
+
+       /**
+        * Returns a part of a string.
+        *
+        * @param       string          the character set
+        * @param       string          character string
+        * @param       int             start position (character position)
+        * @param       int             length (in characters); if omitted the rest of the string is returned
+        * @return      string          the substring
+        * @see substr(), mb_substr()
+        * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
+        */
+       function substr($charset,$string,$start,$len=null)      {
+               if ($len===0)   return '';
+
+                       // loose comparison on purpose: null, '' and false all mean "up to the end" (integer 0 was handled above)
+               $omitLen = ($len==null);
+
+               if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
+                               // cannot omit $len, when specifying charset
+                       if ($omitLen)   {
+                               $enc = mb_internal_encoding();  // save internal encoding
+                               mb_internal_encoding($charset);
+                               $str = mb_substr($string,$start);
+                               mb_internal_encoding($enc);     // restore internal encoding
+
+                               return $str;
+                       }
+                       else    return mb_substr($string,$start,$len,$charset);
+               } elseif ($charset == 'utf-8')  {
+                       return $this->utf8_substr($string,$start,$len);
+               } elseif ($this->eucBasedSets[$charset])        {
+                       return $this->euc_substr($string,$start,$charset,$len);
+               } elseif ($this->twoByteSets[$charset]) {
+                       $width = 2;
+               } elseif ($this->fourByteSets[$charset])        {
+                       $width = 4;
+               } else {
+                               // treat everything else as single-byte encoding
+                       $width = 1;
+               }
+
+                       // fixed-width encodings: scale character positions to byte positions.
+                       // An omitted length must not be passed on to substr(): null would be taken as 0 and yield an empty string.
+               if ($omitLen)   return substr($string,$start*$width);
+               return substr($string,$start*$width,$len*$width);
+       }
+
+       /**
+        * Truncates a string and pre-/appends a string.
+        *
+        * @param       string          the character set
+        * @param       string          character string
+        * @param       int             length (in characters); a positive value keeps the beginning, a negative value keeps the end of the string
+        * @param       string          crop signifier, appended (positive length) or prepended (negative length) if something was cut off
+        * @return      string          the shortened string
+        * @see substr(), mb_strimwidth()
+        * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
+        */
+       function crop($charset,$string,$len,$crop='')   {
+               if ($len == 0)  return $crop;
+
+                       // find the byte position where the string has to be cut
+               if ($charset == 'utf-8')        {
+                       $i = $this->utf8_char2byte_pos($string,$len);
+               } elseif ($this->eucBasedSets[$charset])        {
+                       $i = $this->euc_char2byte_pos($string,$len,$charset);
+               } else {
+                       if ($len > 0)   {
+                               $i = $len;
+                       } else {
+                               $i = strlen($string)+$len;
+                               if ($i<=0)      $i = false;
+                       }
+               }
+
+               if ($i === false)       {       // $len outside actual string length
+                       return $string;
+               }
+
+               $bytes = strlen($string);
+               if ($len > 0)   {
+                               // crop only if at least one byte follows the cut position
+                               // (compare positions instead of testing the byte itself: a '0' character is "false" in PHP)
+                       if ($i < $bytes)        {
+                               return substr($string,0,$i).$crop;
+                       }
+               } else {
+                               // crop only if at least one byte precedes the cut position
+                       if ($i > 0 && $i <= $bytes)     {
+                               return $crop.substr($string,$i);
+                       }
+               }
+
+               return $string;
+       }
+
+       /**
+        * Counts the number of characters.
+        *
+        * @param       string          the character set
+        * @param       string          character string
+        * @return      integer         the number of characters
+        * @see strlen()
+        * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
+        */
+       function strlen($charset,$string)       {
+                       // use the mbstring extension if it is the configured utility backend
                if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
-                       return mb_strcut($str,0,$len,'utf-8');
+                       return mb_strlen($string,$charset);
+               } elseif ($charset == 'utf-8')  {
+                       return $this->utf8_strlen($string);
+               } elseif ($this->eucBasedSets[$charset])        {
+                       return $this->euc_strlen($string,$charset);
+               } elseif ($this->twoByteSets[$charset]) {
+                               // fixed width of two bytes per character (a byte count that is not a multiple of two gives a fractional result)
+                       return strlen($string)/2;
+               } elseif ($this->fourByteSets[$charset])        {
+                               // fixed width of four bytes per character
+                       return strlen($string)/4;
+               }
+               // treat everything else as single-byte encoding
+               return strlen($string);
+       }
+
+       /**
+        * Translates all characters of a string into their respective case values.
+        * Unlike strtolower() and strtoupper() this method is locale independent.
+        *
+        * Real case folding is language dependent, this method ignores this fact.
+        *
+        * @param       string          the character set
+        * @param       string          string
+        * @param       string          conversion: 'toLower' or 'toUpper'
+        * @return      string          the converted string
+        * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
+        * @see strtolower(), strtoupper()
+        */
+       function conv_case($charset,$string,$case)      {
+                       // the mbstring case functions are only used for PHP 4.3 and later
+               if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring' &&
+                       version_compare(phpversion(),'4.3.0','>='))     {
+                       if ($case == 'toLower') {
+                               return mb_strtolower($string,$charset);
+                       } else {
+                               return mb_strtoupper($string,$charset);
+                       }
+               } elseif ($charset == 'utf-8')  {
+                       return $this->utf8_conv_case($string,$case);
+               } elseif ($this->eucBasedSets[$charset])        {
+                       return $this->euc_conv_case($string,$case,$charset);
+               }
+
+               // treat everything else as single-byte encoding
+               if (!$this->initCaseFolding($charset))  return $string; // do nothing
+
+               $out = '';
+               $caseConv =& $this->caseFolding[$charset][$case];
+               $bytes = strlen($string);
+                       // iterate by length: testing the character itself would stop at the first '0'
+               for($i=0; $i<$bytes; $i++)      {
+                       $c = $string[$i];
+                       if (isset($caseConv[$c]) && $caseConv[$c])      {
+                               $out .= $caseConv[$c];
+                       } else {
+                               $out .= $c;     // no folding known: keep the character
+                       }
                }
 
+               // is a simple strtr() faster or slower than the code above?
+               // perhaps faster for small single-byte tables but slower for large multi-byte tables?
+               //
+               // return strtr($string,$this->caseFolding[$charset][$case]);
+
+               return $out;
+       }
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+       /********************************************
+        *
+        * Internal UTF-8 string operation functions
+        *
+        ********************************************/
+
+       /**
+        * Truncates a string in UTF-8 short at a given byte length.
+        * The string is never cut in the middle of a multibyte character: a character that does not fit completely is dropped.
+        *
+        * @param       string          UTF-8 multibyte character string
+        * @param       integer         the byte length
+        * @return      string          the shortened string
+        * @see mb_strcut()
+        * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
+        */
+       function utf8_strtrunc($str,$len)       {
+               if ($len <= 0)  return '';      // nothing fits
+               if ($len >= strlen($str))       return $str;    // nothing to cut
                $i = $len-1;
-               if (ord($str[$i]) & 0x80) { // part of a mulitbyte sequence
-                       for (; !(ord($str[$i]) & 0x40); $i--)   ;       // find the first byte
-                       for ($bc=0, $mbs=ord($str[$i]); $mbs & 0x80; $mbs = $mbs << 1)  $bc++;  // calculate number of bytes
+               if (ord($str[$i]) & 0x80) { // part of a multibyte sequence
+                       for (; $i>0 && !(ord($str[$i]) & 0x40); $i--)   ;       // find the first byte
+                       if (!(ord($str[$i]) & 0x40))    return ''; // sanity check: no lead byte found (malformed input)
+                       for ($bc=0, $mbs=ord($str[$i]); $mbs & 0x80; $mbs = $mbs << 1)  $bc++;  // calculate number of bytes
                        if ($bc+$i > $len)      return substr($str,0,$i);
                         // fallthru: multibyte char fits into length
                }
-               return substr($str,$len);
+               return substr($str,0,$len);
        }
 
-       // a few stubs of possibly useful functions, which may be impmeneted in PHP
-
        /**
-        * @param       [type]          $str: ...
-        * @return      [type]          ...
+        * Returns a part of a UTF-8 string.
+        *
+        * @param       string          $str    UTF-8 string
+        * @param       int             $start  start position (character position)
+        * @param       int             $len    length (in characters)
+        * @return      string          the substring
+        * @see substr()
         * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
         */
-       function utf_strlen($str)       {
-               if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
-                       return mb_strlen($str,'utf-8');
+       function utf8_substr($str,$start,$len=null)     {
+                       // translate the character position of the start into a byte position
+               $byte_start = $this->utf8_char2byte_pos($str,$start);
+               if ($byte_start === false)      return false;   // $start outside string length
+
+                       // an explicit length of zero is not the same as an omitted length
+               if ($len === 0) return '';
+
+               $str = substr($str,$byte_start);
+
+               if ($len!=null) {
+                               // the length is counted in characters relative to the new start of the string
+                       $byte_end = $this->utf8_char2byte_pos($str,$len);
+                       if ($byte_end === false)        // $len outside actual string length
+                               return $str;
+                       else
+                               return substr($str,0,$byte_end);
                }
+               else    return $str;
        }
 
        /**
-        * @param       [type]          $str: ...
-        * @param       [type]          $start: ...
-        * @param       [type]          $len: ...
-        * @return      [type]          ...
+        * Counts the number of characters of a string in UTF-8.
+        *
+        * @param       string          UTF-8 multibyte character string
+        * @return      int             the number of characters
+        * @see strlen()
         * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
         */
-       function utf_substr($str,$start,$len=0) {
-               if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
-               // how to omit $len when you specify a charset?!?!
-                       return mb_substr($str,$start,$len,'utf-8');
+       function utf8_strlen($str)      {
+               $n=0;
+               $bytes = strlen($str);
+                       // iterate by byte count: testing the byte itself would stop at the first '0' character
+               for($i=0; $i<$bytes; $i++)      {
+                       $c = ord($str[$i]);
+                               // count single bytes (0xxxxxxx) and multi-byte starting bytes (11xxxxxx),
+                               // i.e. everything but continuation bytes (10xxxxxx)
+                       if (($c & 0xC0) != 0x80)
+                               $n++;
                }
+               return $n;
        }
 
        /**
-        * @param       [type]          $haystack: ...
-        * @param       [type]          $needle: ...
-        * @param       [type]          $offset: ...
-        * @return      [type]          ...
+        * Find position of first occurrence of a string, both arguments are in UTF-8.
+        *
+        * @param       string          UTF-8 string to search in
+        * @param       string          UTF-8 string to search for
+        * @param       int             position to start the search
+        * @return      int             the character position
+        * @see strpos()
         * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
         */
-       function utf_strpos($haystack,$needle,$offset=0)        {
+       function utf8_strpos($haystack,$needle,$offset=0)       {
                if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
-                       return mb_strpos($haystack,$needle,'utf-8');
+                               // the third argument of mb_strpos() is the offset, the encoding comes fourth
+                       return mb_strpos($haystack,$needle,$offset,'utf-8');
                }
+
+                       // search on byte level: translate the character offset into a byte offset first ...
+               $byte_offset = $this->utf8_char2byte_pos($haystack,$offset);
+               if ($byte_offset === false)     return false; // offset beyond string length
+
+               $byte_pos = strpos($haystack,$needle,$byte_offset);
+               if ($byte_pos === false)        return false; // needle not found
+
+                       // ... and the byte position of the match back into a character position
+               return $this->utf8_byte2char_pos($haystack,$byte_pos);
        }
 
        /**
-        * @param       [type]          $haystack: ...
-        * @param       [type]          $needle: ...
-        * @param       [type]          $offset: ...
-        * @return      [type]          ...
+        * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
+        *
+        * @param       string          UTF-8 string to search in
+        * @param       char            UTF-8 character to search for
+        * @return      int             the character position
+        * @see strrpos()
         * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
         */
-       function utf_strrpos($haystack,$needle,$offset=0)       {
+       function utf8_strrpos($haystack,$needle)        {
+                       // NOTE(review): the encoding is passed as third argument, which is the mb_strrpos() signature
+                       // of older PHP versions (later versions expect an offset there) - confirm for the targeted PHP version
                if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
                        return mb_strrpos($haystack,$needle,'utf-8');
                }
+
+                       // search on byte level ...
+               $byte_pos = strrpos($haystack,$needle);
+               if ($byte_pos === false)        return false; // needle not found
+
+                       // ... and translate the byte position of the match into a character position
+               return $this->utf8_byte2char_pos($haystack,$byte_pos);
+       }
+
+       /**
+        * Translates a character position into an 'absolute' byte position.
+        *
+        * @param       string          UTF-8 string
+        * @param       int             character position (negative values start from the end)
+        * @return      int             byte position, FALSE if the character position lies outside the string
+        * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
+        */
+       function utf8_char2byte_pos($str,$pos)  {
+               $n = 0;         // number of characters found
+               $p = abs($pos); // number of characters wanted
+               $bytes = strlen($str);
+
+               if ($pos >= 0)  {
+                       $i = 0;
+                       $d = 1;
+               } else {
+                       $i = $bytes-1;
+                       $d = -1;
+               }
+
+                       // walk byte by byte in direction $d and count the bytes that start a character.
+                       // The index is checked against the string bounds: testing the byte itself would stop at a '0' character.
+               for( ; $i>=0 && $i<$bytes && $n<$p; $i+=$d)     {
+                       $c = ord($str[$i]);
+                       if (($c & 0xC0) != 0x80)        // single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
+                               $n++;
+               }
+               if ($i<0 || $i>=$bytes) return false; // offset beyond string length
+
+               if ($pos >= 0)  {
+                               // skip trailing multi-byte data bytes
+                       while ($i<$bytes && (ord($str[$i]) & 0xC0) == 0x80)     { $i++; }
+                       if ($i>=$bytes) return false; // the wanted position is the end of the string
+               } else {
+                               // correct offset
+                       $i++;
+               }
+
+               return $i;
+       }
+
+       /**
+        * Translates an 'absolute' byte position into a character position.
+        *
+        * @param       string          UTF-8 string
+        * @param       int             byte position
+        * @return      int             character position, FALSE if the byte position lies outside the string
+        * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
+        */
+       function utf8_byte2char_pos($str,$pos)  {
+                       // check the position itself (not the content of a byte, a '0' character is "false" in PHP)
+               if ($pos < 0 || $pos >= strlen($str))   return false; // offset beyond string length
+
+               $n = 0; // number of characters
+                       // count the characters starting at the byte positions 1 to $pos: this is the number of
+                       // characters in front of the character at $pos (the first character always starts at byte 0)
+               for($i=$pos; $i>0; $i--)        {
+                       $c = ord($str[$i]);
+                       if (($c & 0xC0) != 0x80)        // single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
+                               $n++;
+               }
+
+               return $n;
+       }
+
+       /**
+        * Translates all characters of an UTF-8 string into their respective case values.
+        *
+        * The string is split into characters (single bytes and multi-byte sequences);
+        * each character is replaced by its entry in the case folding table for $case,
+        * or copied unchanged if the table has no (non-empty) entry for it.
+        * If the case folding table cannot be initialized the string is returned as is.
+        *
+        * @param       string          UTF-8 string
+        * @param       string          conversion: 'toLower' or 'toUpper'
+        * @return      string          the converted string
+        * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
+        * @see strtolower(), strtoupper(), mb_convert_case()
+        */
+       function utf8_conv_case($str,$case)     {
+               if (!$this->initCaseFoldingUTF8())      return $str;    // do nothing
+
+               $out = '';
+               $caseConv =& $this->caseFolding['utf-8'][$case];
+               $byteLen = strlen($str);
+
+                       // iterate by length: testing the byte for truthiness would stop at the character "0"
+               for ($i=0; $i<$byteLen; $i++)   {
+                       $c = ord($str[$i]);
+                       if (($c & 0xC0) == 0xC0)        {       // multi-byte starting byte (11xxxxxx)
+                               for ($bc=0; $c & 0x80; $c = $c << 1)    { $bc++; }      // calculate number of bytes
+                               $mbc = substr($str,$i,$bc);
+                               $i += $bc-1;
+                       } else {
+                                       // single-byte (0xxxxxx), or a stray continuation byte which is copied as is
+                               $mbc = $str[$i];
+                       }
+
+                               // empty() avoids reading an undefined index and keeps the "no usable mapping" fallback
+                       if (!empty($caseConv[$mbc]))    {
+                               $out .= $caseConv[$mbc];
+                       } else {
+                               $out .= $mbc;
+                       }
+               }
+
+               return $out;
+       }
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+       /********************************************
+        *
+        * Internal EUC string operation functions
+        *
+        * Extended Unix Code:
+        *  ASCII-compatible 7-bit single-byte chars
+        *  8-bit two-byte chars
+        *
+        * Shift-JIS is treated as a special case.
+        *
+        ********************************************/
+
+       /**
+        * Cuts a string in the EUC charset family short at a given byte length.
+        *
+        * The cut never splits a double-byte character: if byte $len would fall
+        * between the two bytes of a character, that character is dropped entirely.
+        *
+        * @param       string          EUC multibyte character string
+        * @param       integer         the byte length
+        * @param       string          the charset
+        * @return      string          the shortened string
+        * @see mb_strcut()
+        * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
+        */
+       function euc_strtrunc($str,$len,$charset)       {
+               if (strlen($str) <= $len)       return $str;    // string not longer than supplied length
+
+               $sjis = ($charset == 'shift_jis');
+
+                       // walk the characters up to $len; every index read is below strlen($str)
+               for ($i=0; $i<$len; $i++)       {
+                       $c = ord($str[$i]);
+                       if ($sjis)      {
+                               if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))  $i++;   // advance a double-byte char
+                       }
+                       else    {
+                               if ($c >= 0x80) $i++;   // advance a double-byte char
+                       }
+               }
+
+               if ($i>$len)
+                       return substr($str,0,$len-1);   // we ended on a first byte
+               else
+                       return substr($str,0,$len);
+       }
+
+       /**
+        * Returns a part of a string in the EUC charset family.
+        *
+        * $start and $len are counted in characters; both are translated into byte
+        * offsets with euc_char2byte_pos() before the string is cut with substr().
+        *
+        * @param       string          EUC multibyte character string
+        * @param       int             start position (character position)
+        * @param       string          the charset
+        * @param       int             length (in characters); null (the default) means "up to the end of the string"
+        * @return      string          the substring, or false if $start lies outside the string
+        * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
+        */
+       function euc_substr($str,$start,$charset,$len=null)     {
+               $byte_start = $this->euc_char2byte_pos($str,$start,$charset);
+               if ($byte_start === false)      return false;   // $start outside string length
+
+               $str = substr($str,$byte_start);
+
+                       // strict check: a length of 0 is a real length and must not be treated like "no length given"
+               if ($len === null)      return $str;
+
+               $byte_end = $this->euc_char2byte_pos($str,$len,$charset);
+               if ($byte_end === false)        return $str;    // $len outside actual string length
+
+               return substr($str,0,$byte_end);
+       }
+
+       /**
+        * Counts the number of characters of a string in the EUC charset family.
+        *
+        * A byte recognized as the first byte of a double-byte character is counted
+        * together with the byte following it; every other byte counts as one character.
+        *
+        * @param       string          EUC multibyte character string
+        * @param       string          the charset
+        * @return      int             the number of characters
+        * @see strlen()
+        * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
+        */
+       function euc_strlen($str,$charset)      {
+               $sjis = ($charset == 'shift_jis');
+               $byteLen = strlen($str);
+               $n = 0;
+
+                       // iterate by length: testing the byte for truthiness would stop at the character "0"
+               for ($i=0; $i<$byteLen; $i++)   {
+                       $c = ord($str[$i]);
+                       if ($sjis)      {
+                               if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))  $i++;   // advance a double-byte char
+                       }
+                       else    {
+                               if ($c >= 0x80) $i++;   // advance a double-byte char
+                       }
+
+                       $n++;
+               }
+
+               return $n;
+       }
+
+       /**
+        * Translates a character position into an 'absolute' byte position.
+        *
+        * The string is always scanned from its beginning, because the first byte of
+        * a double-byte character can only be told apart reliably in reading direction
+        * (the second byte of a Shift-JIS character may lie in the ASCII range).
+        *
+        * @param       string          EUC multibyte character string
+        * @param       int             character position (negative values start from the end)
+        * @param       string          the charset
+        * @return      int             byte position, or false if the position lies outside the string
+        * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
+        */
+       function euc_char2byte_pos($str,$pos,$charset)  {
+               $sjis = ($charset == 'shift_jis');
+               $byteLen = strlen($str);
+
+               if ($pos >= 0)  {
+                       $n = 0; // number of characters seen
+                       for ($i=0; $i<$byteLen && $n<$pos; $i++)        {
+                               $c = ord($str[$i]);
+                               if ($sjis)      {
+                                       if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))  $i++;   // advance a double-byte char
+                               }
+                               else    {
+                                       if ($c >= 0x80) $i++;   // advance a double-byte char
+                               }
+
+                               $n++;
+                       }
+                       if ($i >= $byteLen)     return false;   // offset beyond string length
+
+                       return $i;
+               }
+
+                       // negative position: record the byte offset of every character, then count back from the end
+               $starts = array();
+               for ($i=0; $i<$byteLen; $i++)   {
+                       $starts[] = $i;
+                       $c = ord($str[$i]);
+                       if ($sjis)      {
+                               if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))  $i++;   // advance a double-byte char
+                       }
+                       else    {
+                               if ($c >= 0x80) $i++;   // advance a double-byte char
+                       }
+               }
+
+               $index = count($starts) + $pos;
+               if ($index < 0) return false;   // offset beyond string length
+
+               return $starts[$index];
+       }
+
+       /**
+        * Translates all characters of a string in the EUC charset family into their respective case values.
+        *
+        * The string is split into single-byte and double-byte characters; each
+        * character is replaced by its entry in the case folding table of $charset
+        * for $case, or copied unchanged if the table has no (non-empty) entry for it.
+        * If the case folding table cannot be initialized the string is returned as is.
+        *
+        * @param       string          EUC multibyte character string
+        * @param       string          conversion: 'toLower' or 'toUpper'
+        * @param       string          the charset
+        * @return      string          the converted string
+        * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
+        * @see strtolower(), strtoupper(), mb_convert_case()
+        */
+       function euc_conv_case($str,$case,$charset)     {
+               if (!$this->initCaseFolding($charset))  return $str;    // do nothing
+
+               $sjis = ($charset == 'shift_jis');
+               $out = '';
+               $caseConv =& $this->caseFolding[$charset][$case];
+               $byteLen = strlen($str);
+
+                       // iterate by length: testing the byte for truthiness would stop at the character "0"
+               for ($i=0; $i<$byteLen; $i++)   {
+                       $mbc = $str[$i];
+                       $c = ord($mbc);
+
+                       if ($sjis)      {
+                               if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))  {       // a double-byte char
+                                       $mbc = substr($str,$i,2);
+                                       $i++;
+                               }
+                       }
+                       else    {
+                               if ($c >= 0x80) {       // a double-byte char
+                                       $mbc = substr($str,$i,2);
+                                       $i++;
+                               }
+                       }
+
+                               // empty() avoids reading an undefined index and keeps the "no usable mapping" fallback
+                       if (!empty($caseConv[$mbc]))    {
+                               $out .= $caseConv[$mbc];
+                       } else {
+                               $out .= $mbc;
+                       }
+               }
+
+               return $out;
        }
+
 }
 
 // If TYPO3_MODE is defined and an XCLASS entry for this file is configured for that mode,
 // include the file the entry points to (at most once).
 if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php'])       {
        include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
 }
-?>
\ No newline at end of file
+?>