fixed bugs for array data usage of the tree class
[Packages/TYPO3.CMS.git] / t3lib / class.t3lib_cs.php
index c253e25..a808e22 100755 (executable)
@@ -134,6 +134,9 @@ class t3lib_cs {
                // An array where case folding data will be stored (cached)
        var $caseFolding=array();
 
+               // An array where charset-to-ASCII mappings are stored (cached)
+       var $toASCII=array();
+
                // This tells the converter which charsets has two bytes per char:
        var $twoByteSets=array(
                'ucs-2'=>1,     // 2-byte Unicode
@@ -148,7 +151,7 @@ class t3lib_cs {
                // This tells the converter which charsets use a scheme like the Extended Unix Code:
        var $eucBasedSets=array(
                'gb2312'=>1,            // Chinese, simplified.
-               'big5'=>1,                      // Chinese, traditional.
+               'big5'=>1,              // Chinese, traditional.
                'shift_jis'=>1,         // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
        );
 
@@ -480,6 +483,20 @@ class t3lib_cs {
                return $cs ? $cs : 'iso-8859-1';
        }
 
+
+
+
+
+
+
+
+
+       /********************************************
+        *
+        * Charset Conversion functions
+        *
+        ********************************************/
+
        /**
         * Convert from one charset to another charset.
         *
@@ -501,12 +518,12 @@ class t3lib_cs {
                                break;
 
                        case 'iconv':
-                               $conv_str = iconv($str,$fromCS,$toCS.'//TRANSLIT');
+                               $conv_str = iconv($fromCS,$toCS.'//TRANSLIT',$str);
                                if (false !== $conv_str)        return $conv_str;
                                break;
 
                        case 'recode':
-                               $conv_str = recode_string($toCS.'..'.$fromCS,$str);
+                               $conv_str = recode_string($fromCS.'..'.$toCS,$str);
                                if (false !== $conv_str)        return $conv_str;
                                break;
                        }
@@ -536,7 +553,7 @@ class t3lib_cs {
                        for ($a=0;$a<$strLen;$a++)      {       // Traverse each char in string.
                                $chr=substr($str,$a,1);
                                $ord=ord($chr);
-                               if ($this->twoByteSets[$charset])       {       // If the charset has two bytes per char
+                               if (isset($this->twoByteSets[$charset]))        {       // If the charset has two bytes per char
                                        $ord2 = ord($str{$a+1});
                                        $ord = $ord<<8 & $ord2; // assume big endian
 
@@ -545,7 +562,7 @@ class t3lib_cs {
                                        } else $outStr.=chr($this->noCharByteVal);      // No char exists
                                        $a++;
                                } elseif ($ord>127)     {       // If char has value over 127 it's a multibyte char in UTF-8
-                                       if ($this->eucBasedSets[$charset])      {       // EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
+                                       if (isset($this->eucBasedSets[$charset]))       {       // EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
                                                $a++;
                                                $ord2=ord(substr($str,$a,1));
                                                $ord = $ord*256+$ord2;
@@ -832,7 +849,7 @@ class t3lib_cs {
                        if ($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile))      {
                                        // Cache file for charsets:
                                        // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
-                               $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/charset_'.$charset.'.tbl');
+                               $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_'.$charset.'.tbl');
                                if ($cacheFile && @is_file($cacheFile)) {
                                        $this->parsedCharsets[$charset]=unserialize(t3lib_div::getUrl($cacheFile));
                                } else {
@@ -866,7 +883,7 @@ class t3lib_cs {
                                                }
                                        }
                                        if ($cacheFile) {
-                                               t3lib_div::writeFile($cacheFile,serialize($this->parsedCharsets[$charset]));
+                                               t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->parsedCharsets[$charset]));
                                        }
                                }
                                return 2;
@@ -875,29 +892,47 @@
        }
 
        /**
-        * This function initializes the UTF-8 case folding table.
+        * This function initializes all UTF-8 character data tables.
         *
         * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
         *
+        * @param       string          Table to look up before parsing: 'case' (case folding) or 'ascii' (ASCII transliteration). For these the loaded/cached table is used if present; any other value goes straight to parsing the Unicode data files.
         * @return      integer         Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
         * @access private
         */
-       function initCaseFoldingUTF8()  {
-                       // Only process if the case table is not yet loaded:
-               if (is_array($this->caseFolding['utf-8']))      return 1;
+       function initUnicodeData($mode=null)    {
+                       // cache files
+               $cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
+               $cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');
+
+                       // Only process if the tables are not yet loaded
+               switch($mode)   {
+                       case 'case':
+                               if (is_array($this->caseFolding['utf-8']))      return 1;
+
+                                       // Use cached version if possible
+                               if ($cacheFileCase && @is_file($cacheFileCase)) {
+                                       $this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase));
+                                       return 2;
+                               }
+                               break;
 
-                       // Use cached version if possible
-               $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cscase_utf-8.tbl');
-               if ($cacheFile && @is_file($cacheFile)) {
-                       $this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFile));
-                       return 2;
+                       case 'ascii':
+                               if (is_array($this->toASCII['utf-8']))  return 1;
+
+                                       // Use cached version if possible
+                               if ($cacheFileASCII && @is_file($cacheFileASCII))       {
+                                       $this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII));
+                                       return 2;
+                               }
+                               break;
                }
 
                        // process main Unicode data file
                $unicodeDataFile = PATH_t3lib.'unidata/UnicodeData.txt';
                if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) return false;
 
-               $fh = fopen($unicodeDataFile,'r');
+               $fh = fopen($unicodeDataFile,'rb');
                if (!$fh)       return false;
 
                        // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
@@ -908,26 +943,79 @@ class t3lib_cs {
                $utf8CaseFolding['toLower'] = array();
                $utf8CaseFolding['toTitle'] = array();
 
+               $decomposition = array();       // array of temp. decompositions
+               $mark = array();                // array of chars that are marks (eg. composing accents)
+               $number = array();              // array of chars that are numbers (eg. digits)
+
                while (!feof($fh))      {
-                       $line = fgets($fh);
-                               // has also other info like character class (digit, white space, etc.) and more
-                       list($char,,,,,,,,,,,,$upper,$lower,$title,) = split(';', rtrim($line));
-                       $char = $this->UnumberToChar(hexdec($char));
-                       if ($upper)     $utf8CaseFolding['toUpper'][$char] = $this->UnumberToChar(hexdec($upper));
-                       if ($lower)     $utf8CaseFolding['toLower'][$char] = $this->UnumberToChar(hexdec($lower));
+                       $line = fgets($fh,4096);
+                               // has a lot of info
+                       list($char,$name,$cat,,,$decomp,,,$num,,,,$upper,$lower,$title,) = split(';', rtrim($line));
+
+                       $ord = hexdec($char);
+                       if ($ord > 0xFFFF)      break;  // only process the BMP
+
+                       $utf8_char = $this->UnumberToChar($ord);
+
+                       if ($upper)     $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
+                       if ($lower)     $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
                                // store "title" only when different from "upper" (only a few)
-                       if ($title && $title != $upper) $utf8CaseFolding['toTitle'][$char] = $this->UnumberToChar(hexdec($title));
+                       if ($title && $title != $upper) $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
+
+                       switch ($cat{0})        {
+                               case 'M':       // mark (accent, umlaut, ...)
+                                       $mark["U+$char"] = 1;
+                                       break;
+
+                               case 'N':       // numeric value
+                                       if ($ord > 0x80 && $num != '')  $number["U+$char"] = $num;
+                       }
+
+                               // accented Latin letters without "official" decomposition
+                       $match = array();
+                       if (ereg('^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH',$name,$match) && !$decomp)        {
+                               $c = ord($match[2]);
+                               if ($match[1] == 'SMALL')       $c += 32;
+
+                               $decomposition["U+$char"] = array(dechex($c));
+                               continue;
+                       }
+
+                       $match = array();
+                       if (ereg('(<.*>)? *(.+)',$decomp,$match))       {
+                               switch($match[1])       {
+                                       case '<circle>':        // add parenthesis as circle replacement, eg (1)
+                                               $match[2] = '0028 '.$match[2].' 0029';
+                                               break;
+
+                                       case '<square>':        // add square brackets as square replacement, eg [1]
+                                               $match[2] = '005B '.$match[2].' 005D';
+                                               break;
+
+                                       case '<compat>':        // ignore multi char decompositions that start with a space
+                                               if (ereg('^0020 ',$match[2]))   continue 2;
+                                               break;
+
+                                               // ignore Arabic and vertical layout presentation decomposition
+                                       case '<initial>':
+                                       case '<medial>':
+                                       case '<final>':
+                                       case '<isolated>':
+                                       case '<vertical>':
+                                               continue 2;
+                               }
+                               $decomposition["U+$char"] = split(' ',$match[2]);
+                       }
                }
                fclose($fh);
 
                        // process additional Unicode data for casing (allow folded characters to expand into a sequence)
                $specialCasingFile = PATH_t3lib.'unidata/SpecialCasing.txt';
                if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile))        {
-
-                       $fh = fopen($specialCasingFile,'r');
+                       $fh = fopen($specialCasingFile,'rb');
                        if ($fh)        {
                                while (!feof($fh))      {
-                                       $line = fgets($fh);
+                                       $line = fgets($fh,4096);
                                        if ($line{0} != '#' && trim($line) != '')       {
 
                                                list($char,$lower,$title,$upper,$cond) = t3lib_div::trimExplode(';', $line);
@@ -936,17 +1024,17 @@ class t3lib_cs {
                                                        if ($char != $lower)    {
                                                                $arr = split(' ',$lower);
                                                                for ($i=0; isset($arr[$i]); $i++)       $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
-                                                               $utf8CaseFolding['toLower'][$utf8_char] = implode($arr);
+                                                               $utf8CaseFolding['toLower'][$utf8_char] = implode('',$arr);
                                                        }
                                                        if ($char != $title && $title != $upper)        {
                                                                $arr = split(' ',$title);
                                                                for ($i=0; isset($arr[$i]); $i++)       $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
-                                                               $utf8CaseFolding['toTitle'][$utf8_char] = implode($arr);
+                                                               $utf8CaseFolding['toTitle'][$utf8_char] = implode('',$arr);
                                                        }
                                                        if ($char != $upper)    {
                                                                        $arr = split(' ',$upper);
                                                                for ($i=0; isset($arr[$i]); $i++)       $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
-                                                               $utf8CaseFolding['toUpper'][$utf8_char] = implode($arr);
+                                                               $utf8CaseFolding['toUpper'][$utf8_char] = implode('',$arr);
                                                        }
                                                }
                                        }
@@ -955,8 +1043,72 @@ class t3lib_cs {
                        }
                }
 
-               if ($cacheFile) {
-                               t3lib_div::writeFile($cacheFile,serialize($utf8CaseFolding));
+                       // process custom decompositions
+               $customTranslitFile = PATH_t3lib.'unidata/Translit.txt';
+               if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile))      {
+                       $fh = fopen($customTranslitFile,'rb');
+                       if ($fh)        {
+                               while (!feof($fh))      {
+                                       $line = fgets($fh,4096);
+                                       if ($line{0} != '#' && trim($line) != '')       {
+                                               list($char,$translit) = t3lib_div::trimExplode(';', $line);
+                                               $decomposition["U+$char"] = split(' ', $translit);
+                                       }
+                               }
+                               fclose($fh);
+                       }
+               }
+
+                       // decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
+               foreach($decomposition as $from => $to) {
+                       $code_decomp = array();
+
+                       while ($code_value = array_shift($to))  {
+                               if (isset($decomposition["U+$code_value"]))     {       // do recursive decomposition
+                                       foreach(array_reverse($decomposition["U+$code_value"]) as $cv)  {
+                                               array_unshift($to, $cv);
+                                       }
+                               } elseif (!isset($mark["U+$code_value"])) {     // remove mark
+                                       array_push($code_decomp, $code_value);
+                               }
+                       }
+                       if (count($code_decomp)) {
+                               $decomposition[$from] = $code_decomp;
+                       } else {
+                               unset($decomposition[$from]);
+                       }
+               }
+
+                       // create ascii only mapping
+               $this->toASCII['utf-8'] = array();
+               $ascii =& $this->toASCII['utf-8'];
+
+               foreach($decomposition as $from => $to) {
+                       $code_decomp = array();
+                       while ($code_value = array_shift($to))  {
+                               $ord = hexdec($code_value);
+                               if ($ord > 127)
+                                       continue 2;     // skip decompositions containing non-ASCII chars
+                               else
+                                       array_push($code_decomp,chr($ord));
+                       }
+                       $ascii[$this->UnumberToChar(hexdec($from))] = join('',$code_decomp);
+               }
+
+                       // add numeric decompositions
+               foreach($number as $from => $to)        {
+                       $utf8_char = $this->UnumberToChar(hexdec($from));
+                       if (!isset($ascii[$utf8_char])) {
+                               $ascii[$utf8_char] = $to;
+                       }
+               }
+
+               if ($cacheFileCase)     {
+                               t3lib_div::writeFileToTypo3tempDir($cacheFileCase,serialize($utf8CaseFolding));
+               }
+
+               if ($cacheFileASCII)    {
+                               t3lib_div::writeFileToTypo3tempDir($cacheFileASCII,serialize($ascii));
                }
 
                return 3;
@@ -975,7 +1127,7 @@ class t3lib_cs {
                if (is_array($this->caseFolding[$charset]))     return 1;
 
                        // Use cached version if possible
-               $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cscase_'.$charset.'.tbl');
+               $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_'.$charset.'.tbl');
                if ($cacheFile && @is_file($cacheFile)) {
                        $this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
                        return 2;
@@ -987,23 +1139,26 @@ class t3lib_cs {
                }
 
                        // UTF-8 case folding is used as the base conversion table
-               if (!$this->initCaseFoldingUTF8())      {
+               if (!$this->initUnicodeData('case'))    {
                        return false;
                }
 
                $nochar = chr($this->noCharByteVal);
                foreach ($this->parsedCharsets[$charset]['local'] as $ci => $utf8)      {
                                // reconvert to charset (don't use chr() of numeric value, might be muli-byte)
-                       $c = $this->conv($utf8, 'utf-8', $charset);
+                       $c = $this->utf8_decode($utf8, $charset);
 
-                       $cc = $this->conv($this->caseFolding['utf-8']['toUpper'][$utf8], 'utf-8', $charset);
-                       if ($cc && $cc != $nochar)      $this->caseFolding[$charset]['toUpper'][$c] = $cc;
+                               // $cc = $this->conv($this->caseFolding['utf-8']['toUpper'][$utf8], 'utf-8', $charset);
+                       $cc = $this->utf8_decode($this->caseFolding['utf-8']['toUpper'][$utf8], $charset);
+                       if ($cc != '' && $cc != $nochar)        $this->caseFolding[$charset]['toUpper'][$c] = $cc;
 
-                       $cc = $this->conv($this->caseFolding['utf-8']['toLower'][$utf8], 'utf-8', $charset);
-                       if ($cc && $cc != $nochar)      $this->caseFolding[$charset]['toLower'][$c] = $cc;
+                               // $cc = $this->conv($this->caseFolding['utf-8']['toLower'][$utf8], 'utf-8', $charset);
+                       $cc = $this->utf8_decode($this->caseFolding['utf-8']['toLower'][$utf8], $charset);
+                       if ($cc != '' && $cc != $nochar)        $this->caseFolding[$charset]['toLower'][$c] = $cc;
 
-                       $cc = $this->conv($this->caseFolding['utf-8']['toTitle'][$utf8], 'utf-8', $charset);
-                       if ($cc && $cc != $nochar)      $this->caseFolding[$charset]['toTitle'][$c] = $cc;
+                               // $cc = $this->conv($this->caseFolding['utf-8']['toTitle'][$utf8], 'utf-8', $charset);
+                       $cc = $this->utf8_decode($this->caseFolding['utf-8']['toTitle'][$utf8], $charset);
+                       if ($cc != '' && $cc != $nochar)        $this->caseFolding[$charset]['toTitle'][$c] = $cc;
                }
 
                        // add the ASCII case table
@@ -1015,12 +1170,57 @@ class t3lib_cs {
                }
 
                if ($cacheFile) {
-                               t3lib_div::writeFile($cacheFile,serialize($this->caseFolding[$charset]));
+                               t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->caseFolding[$charset]));
                }
 
                return 3;
        }
 
+       /**
+        * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
+        * This function is automatically called by the ASCII transliteration functions.
+        *
+        * @param       string          Charset for which to initialize conversion.
+        * @return      integer         Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
+        * @access private
+        */
+       function initToASCII($charset)  {
+                       // Only process if the to-ASCII table is not yet loaded:
+               if (is_array($this->toASCII[$charset])) return 1;
+
+                       // Use cached version if possible
+               $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_'.$charset.'.tbl');
+               if ($cacheFile && @is_file($cacheFile)) {
+                       $this->toASCII[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
+                       return 2;
+               }
+
+                       // init UTF-8 conversion for this charset
+               if (!$this->initCharset($charset))      {
+                       return false;
+               }
+
+                       // UTF-8/ASCII transliteration is used as the base conversion table
+               if (!$this->initUnicodeData('ascii'))   {
+                       return false;
+               }
+                       // start from an empty table, so a charset without any mapping still counts as loaded and is cached as an array
+               $this->toASCII[$charset] = array();
+               foreach ($this->parsedCharsets[$charset]['local'] as $utf8)     {
+                               // reconvert to charset (don't use chr() of numeric value, might be multi-byte)
+                       $c = $this->utf8_decode($utf8, $charset);
+
+                       if (isset($this->toASCII['utf-8'][$utf8]))      {
+                               $this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
+                       }
+               }
+
+               if ($cacheFile) {
+                               t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->toASCII[$charset]));
+               }
+
+               return 3;
+       }
 
 
 
@@ -1122,7 +1322,7 @@ class t3lib_cs {
         * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
         */
        function crop($charset,$string,$len,$crop='')   {
-               if ($len == 0)  return $crop;
+               if (intval($len) == 0)  return $string;
 
                if ($charset == 'utf-8')        {
                        $i = $this->utf8_char2byte_pos($string,$len);
@@ -1140,6 +1340,17 @@ class t3lib_cs {
                if ($i === false)       {       // $len outside actual string length
                        return $string;
                } else  {
+                       if ($len > 0)   {
+                               if (isset($string{$i})) {
+                                       return substr($string,0,$i).$crop;
+                               }
+                       } else {
+                               if (isset($string{$i-1}))       {
+                                       return $crop.substr($string,$i);
+                               }
+                       }
+
+/*
                        if (abs($len)<$this->strlen($charset,$string))  {       // Has to use ->strlen() - otherwise multibyte strings ending with a multibyte char will return true here (which is not a catastrophe, but...)
                                if ($len > 0)   {
                                        return substr($string,0,$i).$crop;
@@ -1147,6 +1358,7 @@ class t3lib_cs {
                                        return $crop.substr($string,$i);
                                }
                        }
+*/
                }
                return $string;
        }
@@ -1183,6 +1395,8 @@ class t3lib_cs {
        /**
         * Translates all characters of a string into their respective case values.
         * Unlike strtolower() and strtoupper() this method is locale independent.
+        * Note that the string length may change!
+        * eg. lower case German ß (sharp S) becomes upper case "SS"
         * Unit-tested by Kasper
         * Real case folding is language dependent, this method ignores this fact.
         *
@@ -1202,32 +1416,35 @@ class t3lib_cs {
                                return mb_strtoupper($str,'utf-8');
                        }
                } elseif ($charset == 'utf-8')  {
-                       return $this->utf8_conv_case($string,$case);
-               } elseif ($this->eucBasedSets[$charset])        {
-                       return $this->euc_conv_case($string,$case,$charset);
+                       return $this->utf8_char_mapping($string,'case',$case);
+               } elseif (isset($this->eucBasedSets[$charset])) {
+                       return $this->euc_char_mapping($string,$charset,'case',$case);
+               } else {
+                               // treat everything else as single-byte encoding
+                       return $this->sb_char_mapping($string,$charset,'case',$case);
                }
 
-               // treat everything else as single-byte encoding
-               if (!$this->initCaseFolding($charset))  return $string; // do nothing
-               $out = '';
-               $caseConv =& $this->caseFolding[$charset][$case];
+               return $string;
+       }
 
-               for($i=0; strlen($string{$i}); $i++)    {
-                       $c = $string{$i};
-                       $cc = $caseConv[$c];
-                       if ($cc)        {
-                               $out .= $cc;
-                       } else {
-                               $out .= $c;
-                       }
+       /**
+        * Converts special chars (like æøåÆØÅ, umlauts etc) to ascii equivalents (usually double-bytes, like æ => ae etc.)
+        *
+        * @param       string          Character set of string
+        * @param       string          Input string to convert
+        * @return      string          The converted string
+        */
+       function specCharsToASCII($charset,$string)     {
+               if ($charset == 'utf-8')        {
+                       return $this->utf8_char_mapping($string,'ascii');
+               } elseif (isset($this->eucBasedSets[$charset])) {
+                       return $this->euc_char_mapping($string,$charset,'ascii');
+               } else {
+                               // treat everything else as single-byte encoding
+                       return $this->sb_char_mapping($string,$charset,'ascii');
                }
 
-               // is a simple strtr() faster or slower than the code above?
-               // perhaps faster for small single-byte tables but slower for large multi-byte tables?
-               //
-               // return strtr($string,$this->caseFolding[$charset][$case]);
-
-               return $out;
+               return $string;
        }
 
 
@@ -1241,6 +1458,58 @@ class t3lib_cs {
 
 
 
+       /********************************************
+        *
+        * Internal string operation functions
+        *
+        ********************************************/
+
+       /**
+        * Applies a per-byte translation table to a string in a single-byte charset.
+        * Bytes without an entry in the selected table are copied through unchanged.
+        *
+        * @param       string          String to translate
+        * @param       string          Charset of the string
+        * @param       string          Mapping to apply: 'case' (case folding) or 'ascii' (ASCII transliteration)
+        * @param       string          For 'case' only: table to use, 'toLower' or 'toUpper'
+        * @return      string          Translated string; the input is returned unchanged for an unknown mode or when the table's init function reports failure
+        * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
+        */
+       function sb_char_mapping($str,$charset,$mode,$opt='')   {
+                       // Select the translation table (loose comparison, exactly as a switch statement would do)
+               if ($mode == 'case')    {
+                       if (!$this->initCaseFolding($charset))  return $str;
+                       $table =& $this->caseFolding[$charset][$opt];
+               } elseif ($mode == 'ascii')     {
+                       if (!$this->initToASCII($charset))      return $str;
+                       $table =& $this->toASCII[$charset];
+               } else {
+                       return $str;
+               }
+
+                       // Walk the string byte by byte and translate where the table has an entry
+               $result = '';
+               $pos = 0;
+               while (isset($str{$pos}))       {
+                       $byte = $str{$pos};
+                       if (isset($table[$byte]))       {
+                               $result .= $table[$byte];
+                       } else {
+                               $result .= $byte;
+                       }
+                       $pos++;
+               }
+
+               return $result;
+       }
+
+
+
+
+
+
+
+
 
 
        /********************************************
@@ -1295,7 +1564,7 @@ class t3lib_cs {
         */
        function utf8_strlen($str)      {
                $n=0;
-               for($i=0; strlen($str{$i}); $i++)       {
+               for($i=0; isset($str{$i}); $i++)        {
                        $c = ord($str{$i});
                        if (!($c & 0x80))       // single-byte (0xxxxxx)
                                $n++;
@@ -1391,14 +1660,14 @@ class t3lib_cs {
                        $d = -1;
                }
 
-               for( ; strlen($str{$i}) && $n<$p; $i+=$d)       {
+               for( ; isset($str{$i}) && $n<$p; $i+=$d)        {
                        $c = (int)ord($str{$i});
                        if (!($c & 0x80))       // single-byte (0xxxxxx)
                                $n++;
                        elseif (($c & 0xC0) == 0xC0)    // multi-byte starting byte (11xxxxxx)
                                $n++;
                }
-               if (!strlen($str{$i}))  return false; // offset beyond string length
+               if (!isset($str{$i}))   return false; // offset beyond string length
 
                if ($pos >= 0)  {
                                // skip trailing multi-byte data bytes
@@ -1429,28 +1698,38 @@ class t3lib_cs {
                        elseif (($c & 0xC0) == 0xC0)    // multi-byte starting byte (11xxxxxx)
                                $n++;
                }
-               if (!strlen($str{$i}))  return false; // offset beyond string length
+               if (!isset($str{$i}))   return false; // offset beyond string length
 
                return $n;
        }
 
        /**
-        * Translates all characters of an UTF-8 string into their respective case values.
-        * Unit-tested by Kasper
+        * Maps all characters of a UTF-8 string.
         *
         * @param       string          UTF-8 string
-        * @param       string          conversion: 'toLower' or 'toUpper'
+        * @param       string          mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
+        * @param       string          'case': conversion 'toLower' or 'toUpper'
         * @return      string          the converted string
         * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
-        * @see strtolower(), strtoupper(), mb_convert_case()
         */
-       function utf8_conv_case($str,$case)     {
-               if (!$this->initCaseFoldingUTF8())      return $str;    // do nothing
+       function utf8_char_mapping($str,$mode,$opt='')  {
+               if (!$this->initUnicodeData($mode))     return $str;    // do nothing
 
                $out = '';
-               $caseConv =& $this->caseFolding['utf-8'][$case];
+               switch($mode)   {
+                       case 'case':
+                               $map =& $this->caseFolding['utf-8'][$opt];
+                               break;
 
-               for($i=0; strlen($str{$i}); $i++)       {
+                       case 'ascii':
+                               $map =& $this->toASCII['utf-8'];
+                               break;
+
+                       default:
+                               return $str;
+               }
+
+               for($i=0; isset($str{$i}); $i++)        {
                        $c = ord($str{$i});
                        if (!($c & 0x80))       // single-byte (0xxxxxx)
                                $mbc = $str{$i};
@@ -1460,9 +1739,8 @@ class t3lib_cs {
                                $i += $bc-1;
                        }
 
-                       $cc = $caseConv[$mbc];
-                       if ($cc)        {
-                               $out .= $cc;
+                       if (isset($map[$mbc]))  {
+                               $out .= $map[$mbc];
                        } else {
                                $out .= $mbc;
                        }
@@ -1512,7 +1790,7 @@ class t3lib_cs {
         */
        function euc_strtrunc($str,$len,$charset)        {
                $sjis = ($charset == 'shift_jis');
-               for ($i=0; strlen($str{$i}) && $i<$len; $i++) {
+               for ($i=0; isset($str{$i}) && $i<$len; $i++) {
                        $c = ord($str{$i});
                        if ($sjis)      {
                                if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))  $i++;   // advance a double-byte char
@@ -1521,7 +1799,7 @@ class t3lib_cs {
                                if ($c >= 0x80) $i++;   // advance a double-byte char
                        }
                }
-               if (!strlen($str{$i}))  return $str;    // string shorter than supplied length
+               if (!isset($str{$i}))   return $str;    // string shorter than supplied length
 
                if ($i>$len)
                        return substr($str,0,$len-1);   // we ended on a first byte
@@ -1567,7 +1845,7 @@ class t3lib_cs {
        function euc_strlen($str,$charset)       {
                $sjis = ($charset == 'shift_jis');
                $n=0;
-               for ($i=0; strlen($str{$i}); $i++) {
+               for ($i=0; isset($str{$i}); $i++) {
                        $c = ord($str{$i});
                        if ($sjis)      {
                                if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))  $i++;   // advance a double-byte char
@@ -1604,7 +1882,7 @@ class t3lib_cs {
                        $d = -1;
                }
 
-               for ( ; strlen($str{$i}) && $n<$p; $i+=$d) {
+               for ( ; isset($str{$i}) && $n<$p; $i+=$d) {
                        $c = ord($str{$i});
                        if ($sjis)      {
                                if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))  $i+=$d; // advance a double-byte char
@@ -1615,7 +1893,7 @@ class t3lib_cs {
 
                        $n++;
                }
-               if (!strlen($str{$i}))  return false; // offset beyond string length
+               if (!isset($str{$i}))   return false; // offset beyond string length
 
                if ($pos < 0)   $i++;   // correct offset
 
@@ -1623,23 +1901,36 @@ class t3lib_cs {
        }
 
        /**
-        * Translates all characters of a string in the EUC charset family into their respective case values.
+        * Maps all characters of a string in the EUC charset family.
         *
         * @param       string          EUC multibyte character string
-        * @param       string          conversion: 'toLower' or 'toUpper'
         * @param       string          the charset
+        * @param       string          mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
+        * @param       string          'case': conversion 'toLower' or 'toUpper'
         * @return      string          the converted string
         * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
-        * @see strtolower(), strtoupper(), mb_convert_case()
         */
-       function euc_conv_case($str,$case,$charset)     {
-               if (!$this->initCaseFolding($charset))  return $str;    // do nothing
+       function euc_char_mapping($str,$charset,$mode,$opt='')  {
+               switch($mode)   {
+                       case 'case':
+                               if (!$this->initCaseFolding($charset))  return $str;    // do nothing
+                               $map =& $this->caseFolding[$charset][$opt];
+                               break;
+
+                       case 'ascii':
+                               if (!$this->initToASCII($charset))      return $str;    // do nothing
+                               $map =& $this->toASCII[$charset];
+                               break;
+
+                       default:
+                               return $str;
+               }
 
                $sjis = ($charset == 'shift_jis');
                $out = '';
-               $caseConv =& $this->caseFolding[$charset][$case];
-               for($i=0; $mbc=$str{$i}; $i++)  {
-                       $c = ord($str{$i});
+               for($i=0; isset($str{$i}); $i++)        {
+                       $mbc = $str{$i};
+                       $c = ord($mbc);
 
                        if ($sjis)      {
                                if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))  {       // a double-byte char
@@ -1654,9 +1945,8 @@ class t3lib_cs {
                                }
                        }
 
-                       $cc = $caseConv[$mbc];
-                       if ($cc)        {
-                               $out .= $cc;
+                       if (isset($map[$mbc]))  {
+                               $out .= $map[$mbc];
                        } else {
                                $out .= $mbc;
                        }
@@ -1664,6 +1954,7 @@ class t3lib_cs {
 
                return $out;
        }
+
 }
 
 if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php'])       {