fixed bugs for array data usage of the tree class
[Packages/TYPO3.CMS.git] / t3lib / class.t3lib_cs.php
index 3cdc145..a808e22 100755 (executable)
@@ -932,7 +932,7 @@ class t3lib_cs {
                $unicodeDataFile = PATH_t3lib.'unidata/UnicodeData.txt';
                if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) return false;
 
-               $fh = fopen($unicodeDataFile,'r');
+               $fh = fopen($unicodeDataFile,'rb');
                if (!$fh)       return false;
 
                        // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
@@ -948,7 +948,7 @@ class t3lib_cs {
                $number = array();              // array of chars that are numbers (eg. digits)
 
                while (!feof($fh))      {
-                       $line = fgets($fh);
+                       $line = fgets($fh,4096);
                                // has a lot of info
                        list($char,$name,$cat,,,$decomp,,,$num,,,,$upper,$lower,$title,) = split(';', rtrim($line));
 
@@ -1012,11 +1012,10 @@ class t3lib_cs {
                        // process additional Unicode data for casing (allow folded characters to expand into a sequence)
                $specialCasingFile = PATH_t3lib.'unidata/SpecialCasing.txt';
                if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile))        {
-
-                       $fh = fopen($specialCasingFile,'r');
+                       $fh = fopen($specialCasingFile,'rb');
                        if ($fh)        {
                                while (!feof($fh))      {
-                                       $line = fgets($fh);
+                                       $line = fgets($fh,4096);
                                        if ($line{0} != '#' && trim($line) != '')       {
 
                                                list($char,$lower,$title,$upper,$cond) = t3lib_div::trimExplode(';', $line);
@@ -1025,17 +1024,17 @@ class t3lib_cs {
                                                        if ($char != $lower)    {
                                                                $arr = split(' ',$lower);
                                                                for ($i=0; isset($arr[$i]); $i++)       $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
-                                                               $utf8CaseFolding['toLower'][$utf8_char] = implode($arr);
+                                                               $utf8CaseFolding['toLower'][$utf8_char] = implode('',$arr);
                                                        }
                                                        if ($char != $title && $title != $upper)        {
                                                                $arr = split(' ',$title);
                                                                for ($i=0; isset($arr[$i]); $i++)       $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
-                                                               $utf8CaseFolding['toTitle'][$utf8_char] = implode($arr);
+                                                               $utf8CaseFolding['toTitle'][$utf8_char] = implode('',$arr);
                                                        }
                                                        if ($char != $upper)    {
                                                                        $arr = split(' ',$upper);
                                                                for ($i=0; isset($arr[$i]); $i++)       $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
-                                                               $utf8CaseFolding['toUpper'][$utf8_char] = implode($arr);
+                                                               $utf8CaseFolding['toUpper'][$utf8_char] = implode('',$arr);
                                                        }
                                                }
                                        }
@@ -1044,50 +1043,21 @@ class t3lib_cs {
                        }
                }
 
-                       // custom decompositions
-               $decomposition['U+00A5'] = array('0079','0065','006E'); // YEN SIGN => yen
-               $decomposition['U+00A6'] = array('007C');               // BROKEN BAR => |
-               $decomposition['U+00AB'] = array('003C','003C');        // LEFT-POINTING DOUBLE ANGLE QUOTATION MARK => <<
-               $decomposition['U+00A9'] = array('0028','0063','0029'); // COPYRIGHT SIGN => (c)
-               $decomposition['U+00AE'] = array('0028','0052','0029'); // REGISTERED SIGN => (R)
-               $decomposition['U+00B1'] = array('002B','002F','002D'); // PLUS-MINUS SIGN => +/-
-               $decomposition['U+00B5'] = array('0075');               // MICRO SIGN => u
-               $decomposition['U+00B7'] = array('002A');               // MIDDLE DOT => *
-               $decomposition['U+00BB'] = array('003E','003E');        // RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK => <<
-               $decomposition['U+00C4'] = array('0041','0045');        // LATIN CAPITAL LETTER A WITH DIAERESIS => AE (German)
-               $decomposition['U+00C5'] = array('0041','0041');        // LATIN CAPITAL LETTER A WITH RING ABOVE => AA (Danish)
-               $decomposition['U+00C6'] = array('0041','0045');        // LATIN CAPITAL LETTER AE => AE (Danish)
-               $decomposition['U+00D6'] = array('004F','0045');        // LATIN CAPITAL LETTER O WITH DIAERESIS => OE (German)
-               $decomposition['U+00D7'] = array('002A');               // MULTIPLICATION SIGN => *
-               $decomposition['U+00D8'] = array('004F','0045');        // LATIN CAPITAL LETTER O WITH STROKE => OE (Danish)
-               $decomposition['U+00DC'] = array('0055','0045');        // LATIN CAPITAL LETTER U WITH DIAERESIS => UE (German)
-               $decomposition['U+00E4'] = array('0061','0065');        // LATIN SMALL LETTER A WITH DIAERESIS => ae (German)
-               $decomposition['U+00E5'] = array('0061','0061');        // LATIN SMALL LETTER A WITH RING ABOVE => aa (Danish)
-               $decomposition['U+00DF'] = array('0073','0073');        // LATIN SMALL LETTER SHARP S => ss (German)
-               $decomposition['U+00E6'] = array('0061','0065');        // LATIN SMALL LETTER AE => ae (Danish)
-               $decomposition['U+00F6'] = array('006F','0065');        // LATIN SMALL LETTER O WITH DIAERESIS => oe (German)
-               $decomposition['U+00F7'] = array('002F');               // DIVISION SIGN => /
-               $decomposition['U+00F8'] = array('006F','0065');        // LATIN SMALL LETTER O WITH STROKE => oe (Danish)
-               $decomposition['U+00FC'] = array('0075','0065');        // LATIN SMALL LETTER U WITH DIAERESIS => ue (German)
-               $decomposition['U+0152'] = array('004F','0045');        // LATIN CAPITAL LETTER OE => OE
-               $decomposition['U+0153'] = array('006F','0065');        // LATIN SMALL LETTER OE => oe
-               $decomposition['U+0192'] = array('0066');               // LATIN SMALL LETTER F WITH HOOK => f
-               $decomposition['U+02BC'] = array('0027');               // MODIFIER LETTER APOSTROPHE => '
-               $decomposition['U+02CA'] = array('0027');               // MODIFIER LETTER ACUTE ACCENT => '
-               $decomposition['U+2010'] = array('002D');               // HYPHEN => -
-               $decomposition['U+2013'] = array('002D');               // EN DASH => -
-               $decomposition['U+2014'] = array('002D');               // EM DASH => -
-               $decomposition['U+2018'] = array('0060');               // LEFT SINGLE QUOTATION MARK => `
-               $decomposition['U+2019'] = array('0027');               // RIGHT SINGLE QUOTATION MARK >= '
-               $decomposition['U+201C'] = array('0022');               // LEFT DOUBLE QUOTATION MARK => "
-               $decomposition['U+201D'] = array('0022');               // RIGHT DOUBLE QUOTATION MARK => "
-               $decomposition['U+201E'] = array('0022');               // DOUBLE LOW-9 QUOTATION MARK => "
-               $decomposition['U+2022'] = array('002A');               // BULLET => *
-               $decomposition['U+2039'] = array('003C');               // SINGLE LEFT-POINTING ANGLE QUOTATION MARK => <
-               $decomposition['U+203A'] = array('003E');               // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK => >
-               $decomposition['U+2044'] = array('002F');               // FRACTION SLASH => /
-               $decomposition['U+20A0'] = array('0045','0055','0052'); // EURO-CURRENCY SIGN => EUR
-               $decomposition['U+20AC'] = array('0045','0055','0052'); // EURO-CURRENCY SIGN => EUR
+                       // process custom decompositions
+               $customTranslitFile = PATH_t3lib.'unidata/Translit.txt';
+               if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile))      {
+                       $fh = fopen($customTranslitFile,'rb');
+                       if ($fh)        {
+                               while (!feof($fh))      {
+                                       $line = fgets($fh,4096);
+                                       if ($line{0} != '#' && trim($line) != '')       {
+                                               list($char,$translit) = t3lib_div::trimExplode(';', $line);
+                                               $decomposition["U+$char"] = split(' ', $translit);
+                                       }
+                               }
+                               fclose($fh);
+                       }
+               }
 
                        // decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
                foreach($decomposition as $from => $to) {
@@ -1426,7 +1396,7 @@ class t3lib_cs {
         * Translates all characters of a string into their respective case values.
         * Unlike strtolower() and strtoupper() this method is locale independent.
         * Note that the string length may change!
-        * eg. lower case German "ß" (sharp S) becomes uper case "SS"
+        * e.g. lower case German "ß" (sharp S) becomes upper case "SS"
         * Unit-tested by Kasper
         * Real case folding is language dependent, this method ignores this fact.
         *
@@ -1446,36 +1416,19 @@ class t3lib_cs {
                                return mb_strtoupper($str,'utf-8');
                        }
                } elseif ($charset == 'utf-8')  {
-                       return $this->utf8_conv_case($string,$case);
+                       return $this->utf8_char_mapping($string,'case',$case);
                } elseif (isset($this->eucBasedSets[$charset])) {
-                       return $this->euc_conv_case($string,$case,$charset);
-               }
-
-               // treat everything else as single-byte encoding
-               if (!$this->initCaseFolding($charset))  return $string; // do nothing
-               $out = '';
-               $caseConv =& $this->caseFolding[$charset][$case];
-
-               for($i=0; isset($string{$i}); $i++)     {
-                       $c = $string{$i};
-                       $cc = $caseConv[$c];
-                       if ($cc)        {
-                               $out .= $cc;
-                       } else {
-                               $out .= $c;
-                       }
+                       return $this->euc_char_mapping($string,$charset,'case',$case);
+               } else {
+                               // treat everything else as single-byte encoding
+                       return $this->sb_char_mapping($string,$charset,'case',$case);
                }
 
-               // is a simple strtr() faster or slower than the code above?
-               // perhaps faster for small single-byte tables but slower for large multi-byte tables?
-               //
-               // return strtr($string,$this->caseFolding[$charset][$case]);
-
-               return $out;
+               return $string;
        }
 
        /**
-        * Converts special chars (like ÆØÅæøå, umlauts etc) to ascii equivalents (usually double-bytes, like æ => ae etc.)
+        * Converts special chars (like æøåÆØÅ, umlauts etc.) to ASCII equivalents (usually two characters, like æ => ae etc.)
         *
         * @param       string          Character set of string
         * @param       string          Input string to convert
@@ -1483,20 +1436,65 @@ class t3lib_cs {
         */
        function specCharsToASCII($charset,$string)     {
                if ($charset == 'utf-8')        {
-                       return $this->utf8_toASCII($string);
+                       return $this->utf8_char_mapping($string,'ascii');
                } elseif (isset($this->eucBasedSets[$charset])) {
-                       return $this->euc_toASCII($string,$charset);
+                       return $this->euc_char_mapping($string,$charset,'ascii');
+               } else {
+                               // treat everything else as single-byte encoding
+                       return $this->sb_char_mapping($string,$charset,'ascii');
                }
 
-               // treat everything else as single-byte encoding
-               if (!$this->initToASCII($charset))      return $string; // do nothing
-               $out = '';
-               $ascii =& $this->toASCII[$charset];
+               return $string;
+       }
+
+
+
+
+
+
+
+
 
-               for($i=0; isset($string{$i}); $i++)     {
-                       $c = $string{$i};
-                       if (isset($ascii[$c]))  {
-                               $out .= $ascii[$c];
+
+
+
+       /********************************************
+        *
+        * Internal string operation functions
+        *
+        ********************************************/
+
+       /**
+        * Maps each character of a string in a single-byte charset through a case folding or ASCII transliteration table.
+        *
+        * @param       string          the string
+        * @param       string          the charset
+        * @param       string          mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
+        * @param       string          conversion for mode 'case': 'toLower' or 'toUpper' (not used in mode 'ascii')
+        * @return      string          the converted string
+        * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
+        */
+       function sb_char_mapping($str,$charset,$mode,$opt='')   {
+               switch($mode)   {
+                       case 'case':
+                               if (!$this->initCaseFolding($charset))  return $str;    // do nothing
+                               $map =& $this->caseFolding[$charset][$opt];
+                               break;
+
+                       case 'ascii':
+                               if (!$this->initToASCII($charset))      return $str;    // do nothing
+                               $map =& $this->toASCII[$charset];
+                               break;
+
+                       default:
+                               return $str;
+               }
+
+               $out = '';
+               for($i=0; isset($str{$i}); $i++)        {
+                       $c = $str{$i};
+                       if (isset($map[$c]))    {
+                               $out .= $map[$c];
                        } else {
                                $out .= $c;
                        }
@@ -1514,8 +1512,6 @@ class t3lib_cs {
 
 
 
-
-
        /********************************************
         *
         * Internal UTF-8 string operation functions
@@ -1708,54 +1704,31 @@ class t3lib_cs {
        }
 
        /**
-        * Translates all characters of an UTF-8 string into their respective case values.
-        * Unit-tested by Kasper
+        * Maps each character of a UTF-8 string through a case folding or ASCII transliteration table.
         *
         * @param       string          UTF-8 string
-        * @param       string          conversion: 'toLower' or 'toUpper'
+        * @param       string          mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
+        * @param       string          conversion for mode 'case': 'toLower' or 'toUpper' (not used in mode 'ascii')
         * @return      string          the converted string
         * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
-        * @see strtolower(), strtoupper(), mb_convert_case()
         */
-       function utf8_conv_case($str,$case)     {
-               if (!$this->initUnicodeData('case'))    return $str;    // do nothing
+       function utf8_char_mapping($str,$mode,$opt='')  {
+               if (!$this->initUnicodeData($mode))     return $str;    // do nothing
 
                $out = '';
-               $caseConv =& $this->caseFolding['utf-8'][$case];
+               switch($mode)   {
+                       case 'case':
+                               $map =& $this->caseFolding['utf-8'][$opt];
+                               break;
 
-               for($i=0; isset($str{$i}); $i++)        {
-                       $c = ord($str{$i});
-                       if (!($c & 0x80))       // single-byte (0xxxxxx)
-                               $mbc = $str{$i};
-                       elseif (($c & 0xC0) == 0xC0)    {       // multi-byte starting byte (11xxxxxx)
-                               for ($bc=0; $c & 0x80; $c = $c << 1) { $bc++; } // calculate number of bytes
-                               $mbc = substr($str,$i,$bc);
-                               $i += $bc-1;
-                       }
+                       case 'ascii':
+                               $map =& $this->toASCII['utf-8'];
+                               break;
 
-                       if (isset($caseConv[$mbc]))     {
-                               $out .= $caseConv[$mbc];
-                       } else {
-                               $out .= $mbc;
-                       }
+                       default:
+                               return $str;
                }
 
-               return $out;
-       }
-
-       /**
-        * Converts chars with accents, umlauts or composed to ASCII equivalents.
-        *
-        * @param       string          Input string to convert
-        * @return      string          The converted string
-        * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
-        */
-       function utf8_toASCII($str)     {
-               if (!$this->initUnicodeData('ascii'))   return $str;    // do nothing
-
-               $out = '';
-               $toASCII =& $this->toASCII['utf-8'];
-
                for($i=0; isset($str{$i}); $i++)        {
                        $c = ord($str{$i});
                        if (!($c & 0x80))       // single-byte (0xxxxxx)
@@ -1766,8 +1739,8 @@ class t3lib_cs {
                                $i += $bc-1;
                        }
 
-                       if (isset($toASCII[$mbc]))      {
-                               $out .= $toASCII[$mbc];
+                       if (isset($map[$mbc]))  {
+                               $out .= $map[$mbc];
                        } else {
                                $out .= $mbc;
                        }
@@ -1792,6 +1765,7 @@ class t3lib_cs {
 
 
 
+
        /********************************************
         *
         * Internal EUC string operation functions
@@ -1927,66 +1901,37 @@ class t3lib_cs {
        }
 
        /**
-        * Translates all characters of a string in the EUC charset family into their respective case values.
+        * Maps each character of a string in the EUC charset family through a case folding or ASCII transliteration table.
         *
         * @param       string          EUC multibyte character string
-        * @param       string          conversion: 'toLower' or 'toUpper'
         * @param       string          the charset
+        * @param       string          mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
+        * @param       string          conversion for mode 'case': 'toLower' or 'toUpper' (not used in mode 'ascii')
         * @return      string          the converted string
         * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
-        * @see strtolower(), strtoupper(), mb_convert_case()
         */
-       function euc_conv_case($str,$case,$charset)     {
-               if (!$this->initCaseFolding($charset))  return $str;    // do nothing
-
-               $sjis = ($charset == 'shift_jis');
-               $out = '';
-               $caseConv =& $this->caseFolding[$charset][$case];
-               for($i=0; isset($str{$i}); $i++)        {
-                       $mbc = $str{$i};
-                       $c = ord($mbc);
+       function euc_char_mapping($str,$charset,$mode,$opt='')  {
+               switch($mode)   {
+                       case 'case':
+                               if (!$this->initCaseFolding($charset))  return $str;    // do nothing
+                               $map =& $this->caseFolding[$charset][$opt];
+                               break;
 
-                       if ($sjis)      {
-                               if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))  {       // a double-byte char
-                                       $mbc = substr($str,$i,2);
-                                       $i++;
-                               }
-                       }
-                       else    {
-                               if ($c >= 0x80) {       // a double-byte char
-                                       $mbc = substr($str,$i,2);
-                                       $i++;
-                               }
-                       }
+                       case 'ascii':
+                               if (!$this->initToASCII($charset))      return $str;    // do nothing
+                               $map =& $this->toASCII[$charset];
+                               break;
 
-                       if (isset($caseConv[$mbc]))     {
-                               $out .= $caseConv[$mbc];
-                       } else {
-                               $out .= $mbc;
-                       }
+                       default:
+                               return $str;
                }
 
-               return $out;
-       }
-
-       /**
-        * Converts chars with accents, umlauts or composed to ASCII equivalents.
-        *
-        * @param       string          Input string to convert
-        * @param       string          The charset
-        * @return      string          The converted string
-        * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
-        */
-       function euc_toASCII($str,$charset)     {
-               if (!$this->initToASCII($charset))      return $str;    // do nothing
-
                $sjis = ($charset == 'shift_jis');
                $out = '';
-               $toASCII =& $this->toASCII[$charset];
-
                for($i=0; isset($str{$i}); $i++)        {
                        $mbc = $str{$i};
                        $c = ord($mbc);
+
                        if ($sjis)      {
                                if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))  {       // a double-byte char
                                        $mbc = substr($str,$i,2);
@@ -2000,8 +1945,8 @@ class t3lib_cs {
                                }
                        }
 
-                       if (isset($toASCII[$mbc]))      {
-                               $out .= $toASCII[$mbc];
+                       if (isset($map[$mbc]))  {
+                               $out .= $map[$mbc];
                        } else {
                                $out .= $mbc;
                        }