specCharsToASCII for UTF-8
[Packages/TYPO3.CMS.git] / t3lib / class.t3lib_cs.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2003-2004 Kasper Skaarhoj (kasper@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the Typo3 project. The Typo3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 *
17 * This script is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * This copyright notice MUST APPEAR in all copies of the script!
23 ***************************************************************/
24 /**
25 * Class for conversion between charsets.
26 *
27 * $Id$
28 *
29 * @author Kasper Skaarhoj <kasper@typo3.com>
30 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
31 */
32 /**
33 * [CLASS/FUNCTION INDEX of SCRIPT]
34 *
35 *
36 *
37 * 128: class t3lib_cs
38 * 442: function parse_charset($charset)
39 * 460: function get_locale_charset($locale)
40 * 492: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
41 * 529: function utf8_encode($str,$charset)
42 * 576: function utf8_decode($str,$charset,$useEntityForNoChar=0)
43 * 619: function utf8_to_entities($str)
44 * 652: function entities_to_utf8($str,$alsoStdHtmlEnt=0)
45 * 686: function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
46 * 736: function UnumberToChar($cbyte)
47 * 781: function utf8CharToUnumber($str,$hex=0)
48 *
49 * SECTION: Init functions
50 * 824: function initCharset($charset)
51 * 885: function initCaseFoldingUTF8()
52 * 973: function initCaseFolding($charset)
53 *
54 * SECTION: String operation functions
55 * 1058: function substr($charset,$string,$start,$len=null)
56 * 1096: function strlen($charset,$string)
57 * 1124: function crop($charset,$string,$len,$crop='')
58 * 1165: function strtrunc($charset,$string,$len)
59 * 1197: function conv_case($charset,$string,$case)
60 *
61 * SECTION: Internal UTF-8 string operation functions
62 * 1264: function utf8_substr($str,$start,$len=null)
63 * 1297: function utf8_strlen($str)
64 * 1318: function utf8_strtrunc($str,$len)
65 * 1340: function utf8_strpos($haystack,$needle,$offset=0)
66 * 1363: function utf8_strrpos($haystack,$needle)
67 * 1383: function utf8_char2byte_pos($str,$pos)
68 * 1424: function utf8_byte2char_pos($str,$pos)
69 * 1448: function utf8_conv_case($str,$case)
70 *
71 * SECTION: Internal EUC string operation functions
72 * 1514: function euc_strtrunc($str,$len,$charset)
73 * 1543: function euc_substr($str,$start,$charset,$len=null)
74 * 1568: function euc_strlen($str,$charset)
75 * 1595: function euc_char2byte_pos($str,$pos,$charset)
76 * 1636: function euc_conv_case($str,$case,$charset)
77 *
78 * TOTAL FUNCTIONS: 31
79 * (This index is automatically created/updated by the extension "extdeveval")
80 *
81 */
82
83
84
85
86
87
88
89
90 /**
91 * Notes on UTF-8
92 *
93 * Functions working on UTF-8 strings:
94 *
95 * - strchr/strstr
96 * - strrchr
97 * - substr_count
98 * - implode/explode/join
99 *
100 * Functions nearly working on UTF-8 strings:
101 *
102 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
103 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
104 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
105 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
106 *
107 * Functions NOT working on UTF-8 strings:
108 *
109 * - str*cmp
110 * - stristr
111 * - stripos
112 * - substr
113 * - strrev
114 * - ereg/eregi
115 * - split/spliti
116 * - preg_*
117 * - ...
118 *
119 */
120 /**
121 * Class for conversion between charsets
122 *
123 * @author Kasper Skaarhoj <kasper@typo3.com>
124 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
125 * @package TYPO3
126 * @subpackage t3lib
127 */
128 class t3lib_cs {
// Byte value (ASCII 63 = "?") that is written for characters without an equivalent.
var $noCharByteVal = 63;

// Cache of parsed conversion tables, keyed by charset name.
var $parsedCharsets = array();

// Cache of case folding data, keyed by charset name.
var $caseFolding = array();

// Cache of charset-to-ASCII mappings, keyed by charset name.
var $toASCII = array();

// Charsets that use two bytes per character.
var $twoByteSets = array(
    'ucs-2' => 1,       // 2-byte Unicode
);

// Charsets that use four bytes per character.
var $fourByteSets = array(
    'ucs-4' => 1,       // 4-byte Unicode
    'utf-32' => 1,      // 4-byte Unicode (limited to the 21-bits of UTF-16)
);

// Charsets that use a scheme like the Extended Unix Code.
var $eucBasedSets = array(
    'gb2312' => 1,      // Chinese, simplified.
    'big5' => 1,        // Chinese, traditional.
    'shift_jis' => 1,   // Japanese - WARNING: Shift-JIS includes half-width katakana single-byte characters above 0x80!
);
157
// Alternative charset names mapped to the canonical (lowercase) name used in this class.
// see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
// http://czyborra.com/charsets/iso8859.html
// Every key must occur only once: PHP silently keeps the last value of a repeated key.
var $synonyms=array(
    'us' => 'ascii',
    'us-ascii'=> 'ascii',
    'cp819' => 'iso-8859-1',
    'ibm819' => 'iso-8859-1',
    'iso-ir-100' => 'iso-8859-1',
    'iso-ir-109' => 'iso-8859-2',
    'iso-ir-148' => 'iso-8859-9',
    'iso-ir-199' => 'iso-8859-14',
    'iso-ir-203' => 'iso-8859-15',
    'csisolatin1' => 'iso-8859-1',
    'csisolatin2' => 'iso-8859-2',
    'csisolatin3' => 'iso-8859-3',
    'csisolatin5' => 'iso-8859-9',
    'csisolatin8' => 'iso-8859-14',
    'csisolatin9' => 'iso-8859-15',
    'csisolatingreek' => 'iso-8859-7',
    'iso-celtic' => 'iso-8859-14',
    'latin1' => 'iso-8859-1',
    'latin2' => 'iso-8859-2',
    'latin3' => 'iso-8859-3',
    'latin5' => 'iso-8859-9',
    'latin6' => 'iso-8859-10',
    'latin8' => 'iso-8859-14',
    'latin9' => 'iso-8859-15',
    'l1' => 'iso-8859-1',
    'l2' => 'iso-8859-2',
    'l3' => 'iso-8859-3',
    'l5' => 'iso-8859-9',
    'l6' => 'iso-8859-10',
    'l8' => 'iso-8859-14',
    'l9' => 'iso-8859-15',
    'cyrillic' => 'iso-8859-5',
    'arabic' => 'iso-8859-6',
    'tis-620' => 'iso-8859-11',
    'win874' => 'windows-874',
    'win1250' => 'windows-1250',
    'win1251' => 'windows-1251',
    'win1252' => 'windows-1252',
    'win1253' => 'windows-1253',
    'win1254' => 'windows-1254',
    'win1255' => 'windows-1255',
    'win1256' => 'windows-1256',
    'win1257' => 'windows-1257',
    'win1258' => 'windows-1258',
    'cp1250' => 'windows-1250',
    'cp1251' => 'windows-1251',
    'cp1252' => 'windows-1252',
    'ms-ee' => 'windows-1250',
    'ms-ansi' => 'windows-1252',
    'ms-greek' => 'windows-1253',
    'ms-turk' => 'windows-1254',
    'winbaltrim' => 'windows-1257',
    'koi-8ru' => 'koi-8r',
    'koi8r' => 'koi-8r',
    'cp878' => 'koi-8r',
    'mac' => 'macroman',
    'macintosh' => 'macroman',
    'euc-cn' => 'gb2312',
    'x-euc-cn' => 'gb2312',
    'euccn' => 'gb2312',
    'cp936' => 'gb2312',
    'big-5' => 'big5',
    'cp950' => 'big5',
    'eucjp' => 'euc-jp',
    'sjis' => 'shift_jis',
    'shift-jis' => 'shift_jis',
    'cp932' => 'shift_jis',
    'utf7' => 'utf-7',
    'utf8' => 'utf-8',
    'utf16' => 'utf-16',
    'utf32' => 'utf-32',
    'ucs2' => 'ucs-2',
    'ucs4' => 'ucs-4',
);
236
// Mapping of language identifiers to language (family) names.
// The family names are the keys of $lang_to_charset_unix / $lang_to_charset_windows;
// a family name that exists in neither map makes get_locale_charset() fall back to iso-8859-1.
var $lang_to_langfamily=array(
        // iso-639:2 language codes, see:
        // http://www.w3.org/WAI/ER/IG/ert/iso639.htm
        // http://www.unicode.org/onlinedat/languages.html
    'ar' => 'arabic',
    'bg' => 'cyrillic',
    'cs' => 'east_european',
    'da' => 'west_european',
    'de' => 'west_european',
    'el' => 'greek',            // official ISO 639 code for Greek ('gr' below is kept for compatibility)
    'es' => 'west_european',
    'et' => 'estonian',
    'eu' => 'west_european',
    'fi' => 'west_european',
    'fr' => 'west_european',
    'gr' => 'greek',
    'he' => 'hebrew',           // current ISO 639 code for Hebrew ('iw' below is the withdrawn one)
    'hr' => 'east_european',
    'hu' => 'east_european',
    'iw' => 'hebrew',
    'is' => 'west_european',
    'it' => 'west_european',
    'ja' => 'japanese',
    'kl' => 'west_european',
    'ko' => 'korean',
    'lt' => 'lithuanian',
    'lv' => 'west_european',    // Latvian/Lettish
    'nl' => 'west_european',
    'no' => 'west_european',
    'pl' => 'east_european',
    'pt' => 'west_european',
    'ro' => 'east_european',
    'ru' => 'cyrillic',
    'sk' => 'east_european',
    'sl' => 'east_european',
    'sv' => 'west_european',
    'th' => 'thai',
    'tr' => 'turkish',
    'uk' => 'cyrillic',
    'vi' => 'vietnamese',
    'zh' => 'chinese',
        // MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
    'chs' => 'simpl_chinese',
    'cht' => 'trad_chinese',
    'csy' => 'east_european',
    'dan' => 'west_european',
    'deu' => 'west_european',
    'dea' => 'west_european',
    'des' => 'west_european',
    'ena' => 'west_european',
    'enc' => 'west_european',
    'eng' => 'west_european',
    'enz' => 'west_european',
    'enu' => 'west_european',
    'nld' => 'west_european',
    'nlb' => 'west_european',
    'fin' => 'west_european',
    'fra' => 'west_european',
    'frb' => 'west_european',
    'frc' => 'west_european',
    'frs' => 'west_european',
    'ell' => 'greek',
    'hun' => 'east_european',
    'isl' => 'west_european',   // was misspelt 'west_euorpean', which matches no charset map entry
    'ita' => 'west_european',
    'its' => 'west_european',
    'jpn' => 'japanese',
    'kor' => 'korean',
    'nor' => 'west_european',
    'non' => 'west_european',
    'plk' => 'east_european',
    'ptg' => 'west_european',
    'ptb' => 'west_european',
    'rus' => 'cyrillic',        // same family as 'ru' and 'russian'
    'sky' => 'east_european',
    'esp' => 'west_european',
    'esm' => 'west_european',
    'esn' => 'west_european',
    'sve' => 'west_european',
    'trk' => 'turkish',
        // English language names
    'bulgarian' => 'cyrillic',  // same family as 'bg'
    'catalan' => 'west_european',
    'croatian' => 'east_european',
    'czech' => 'east_european',
    'danish' => 'west_european',
    'dutch' => 'west_european',
    'english' => 'west_european',
    'finnish' => 'west_european',
    'french' => 'west_european',
    'galician' => 'west_european',
    'german' => 'west_european',
    'hungarian' => 'east_european',
    'icelandic' => 'west_european',
    'italian' => 'west_european',
    'latvian' => 'west_european',
    'lettish' => 'west_european',
    'norwegian' => 'west_european',
    'polish' => 'east_european',
    'portuguese' => 'west_european',
    'russian' => 'cyrillic',
    'romanian' => 'east_european',
    'slovak' => 'east_european',
    'slovenian' => 'east_european',
    'spanish' => 'west_european',
    'swedish' => 'west_european',
    'svedish' => 'west_european',   // historic misspelling, kept so existing configurations keep working
    'turkish' => 'turkish',     // same family as 'trk'
    'ukrainian' => 'cyrillic',
);
344
// mapping of language (family) names to charsets on Unix
// An empty value makes get_locale_charset() fall back to iso-8859-1.
var $lang_to_charset_unix=array(
    'west_european' => 'iso-8859-1',
    'estonian' => 'iso-8859-1',
    'east_european' => 'iso-8859-2',
    'baltic' => 'iso-8859-4',
    'cyrillic' => 'iso-8859-5',
    'arabic' => 'iso-8859-6',
    'greek' => 'iso-8859-7',
    'hebrew' => 'iso-8859-8',
    'turkish' => 'iso-8859-9',
    'thai' => 'iso-8859-11',    // = TIS-620
    'lithuanian' => 'iso-8859-13',
    'chinese' => 'gb2312',      // = euc-cn
    'japanese' => 'euc-jp',
    'korean' => 'euc-kr',
    'simpl_chinese' => 'gb2312',
    'trad_chinese' => 'big5',
    'vietnamese' => '',
);

// mapping of language (family) names to charsets on Windows
var $lang_to_charset_windows=array(
    'east_european' => 'windows-1250',
    'cyrillic' => 'windows-1251',
    'west_european' => 'windows-1252',
    'greek' => 'windows-1253',
    'turkish' => 'windows-1254',
    'hebrew' => 'windows-1255',
    'arabic' => 'windows-1256',
    'baltic' => 'windows-1257',
    'estonian' => 'windows-1257',
    'lithuanian' => 'windows-1257',
    'vietnamese' => 'windows-1258',
    'thai' => 'cp874',
        // Was 'cp950', which is the Big5 (traditional Chinese) code page - see 'cp950' => 'big5' in $synonyms.
        // Korean uses the same charset name as in the Unix map.
    'korean' => 'euc-kr',
    'chinese' => 'gb2312',
    'japanese' => 'shift_jis',
    'simpl_chinese' => 'gb2312',
    'trad_chinese' => 'big5',
);
386
// Locale names (lowercase) whose charset is fixed, regardless of language family.
var $locale_to_charset = array(
    'japanese.euc' => 'euc-jp',
    'ja_jp.ujis'   => 'euc-jp',
    'korean.euc'   => 'euc-kr',
    'zh_cn'        => 'gb2312',
    'zh_hk'        => 'big5',
    'zh_tw'        => 'big5',
);

// TYPO3 specific: system charset used for each TYPO3 system language key.
// An empty value means "iso-8859-1".
var $charSetArray = array(
    'dk' => '',
    'de' => '',
    'no' => '',
    'it' => '',
    'fr' => '',
    'es' => '',
    'nl' => '',
    'cz' => 'windows-1250',
    'pl' => 'iso-8859-2',
    'si' => 'windows-1250',
    'fi' => '',
    'tr' => 'iso-8859-9',
    'se' => '',
    'pt' => '',
    'ru' => 'windows-1251',
    'ro' => 'iso-8859-2',
    'ch' => 'gb2312',
    'sk' => 'windows-1250',
    'lt' => 'windows-1257',
    'is' => 'utf-8',
    'hr' => 'windows-1250',
    'hu' => 'iso-8859-2',
    'gl' => '',
    'th' => 'iso-8859-11',
    'gr' => 'iso-8859-7',
    'hk' => 'big5',
    'eu' => '',
    'bg' => 'windows-1251',
    'br' => '',
    'et' => 'iso-8859-4',
    'ar' => 'iso-8859-6',
    'he' => 'utf-8',
    'ua' => 'windows-1251',
    'jp' => 'shift_jis',
    'lv' => 'utf-8',
    'vn' => 'utf-8',
);
437
/**
 * Normalizes a charset name: lowercases it and replaces a known synonym
 * (see $this->synonyms) with the canonical name.
 *
 * @param   string      Input charset
 * @return  string      Normalized charset
 * @author  Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function parse_charset($charset) {
    $normalized = strtolower($charset);

    return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
}
451
/**
 * Get the charset of a locale.
 *
 * Accepted forms (case-insensitive):
 *   ln              language
 *   ln_CN           language / country
 *   ln_CN.cs        language / country / charset
 *   ...@modifier    a trailing modifier (eg. "@euro") is ignored
 *
 * Resolution order: exact match in $this->locale_to_charset, explicit charset
 * part of the locale, language family map for the current OS, 'iso-8859-1'.
 *
 * @param   string      Locale string
 * @return  string      Charset resolved for locale string
 * @author  Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function get_locale_charset($locale) {
    $locale = strtolower($locale);

        // exact locale specific charset?
    if (isset($this->locale_to_charset[$locale])) return $this->locale_to_charset[$locale];

        // a modifier is neither part of the charset nor of the language code
    $modifierPos = strpos($locale, '@');
    if ($modifierPos !== false) $locale = substr($locale, 0, $modifierPos);

        // locale contains charset: use it
        // (parts are read by index because list() on a too short array reads undefined offsets)
    $localeParts = explode('.', $locale);
    if (isset($localeParts[1]) && $localeParts[1]) return $this->parse_charset($localeParts[1]);

        // get language
    $languageParts = explode('_', $localeParts[0]);
    $language = $languageParts[0];
    if (isset($this->lang_to_langfamily[$language])) $language = $this->lang_to_langfamily[$language];

    $familyToCharset = (TYPO3_OS == 'WIN') ? $this->lang_to_charset_windows : $this->lang_to_charset_unix;

        // unknown families and families with an empty charset fall back to the default
    return !empty($familyToCharset[$language]) ? $familyToCharset[$language] : 'iso-8859-1';
}
485
486
487
488
489
490
491
492
493
494 /********************************************
495 *
496 * Charset Conversion functions
497 *
498 ********************************************/
499
/**
 * Convert from one charset to another charset.
 *
 * If configured in $TYPO3_CONF_VARS['SYS']['t3lib_cs_convMethod'], one of the PHP
 * extensions mbstring, iconv or recode does the work; when that is not possible
 * the string is converted by the TYPO3 routines with UTF-8 as intermediate charset.
 *
 * @param   string      Input string
 * @param   string      From charset (the current charset of the string)
 * @param   string      To charset (the output charset wanted)
 * @param   boolean     If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return  string      Converted string
 */
function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) {
        // Nothing to do:
    if ($fromCS==$toCS) return $str;

        // The PHP libraries cannot fall back to SGML entities, so they may only be used
        // when no entity fallback is requested or when the target (UTF-8) can hold everything.
    $extensionAllowed = ($toCS=='utf-8' || !$useEntityForNoChar);
    if ($extensionAllowed) {
        $method = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'];
        $converted = false;

        if ($method == 'mbstring') {
            $converted = mb_convert_encoding($str,$toCS,$fromCS);   // returns false for unsupported charsets
        } elseif ($method == 'iconv') {
            $converted = iconv($fromCS,$toCS.'//TRANSLIT',$str);
        } elseif ($method == 'recode') {
            $converted = recode_string($fromCS.'..'.$toCS,$str);
        }

        if (false !== $converted) return $converted;
            // otherwise: fallback to TYPO3 conversion
    }

        // TYPO3 conversion, going through UTF-8:
    if ($fromCS!='utf-8') {
        $str = $this->utf8_encode($str,$fromCS);
    }
    if ($toCS!='utf-8') {
        $str = $this->utf8_decode($str,$toCS,$useEntityForNoChar);
    }

    return $str;
}
537
538
/**
 * Converts $str from $charset to UTF-8
 *
 * Bytes 0-127 are passed through unchanged for all charsets except the two-byte sets.
 * Characters that are missing in the conversion table are replaced by chr($this->noCharByteVal).
 *
 * @param   string      String in local charset to convert to UTF-8
 * @param   string      Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return  string      Output string, converted to UTF-8. Nothing (NULL) is returned if the conversion table for the charset could not be initialized.
 */
function utf8_encode($str,$charset) {

        // Parse conv. table if not already done. Without a table there is nothing to return.
    if (!$this->initCharset($charset)) return;

    $localToUtf8 = $this->parsedCharsets[$charset]['local'];
    $isTwoByteSet = isset($this->twoByteSets[$charset]);
    $isEucBased = isset($this->eucBasedSets[$charset]);
        // 'shift_jis' is listed in $eucBasedSets as well, so it has to be tested before the generic EUC rule.
    $isShiftJis = ($charset == 'shift_jis');
    $noChar = chr($this->noCharByteVal);

    $strLen = strlen($str);
    $outStr = '';

    for ($a=0;$a<$strLen;$a++) {    // Traverse each char in string.
        $chr = substr($str,$a,1);
        $ord = ord($chr);

        if ($isTwoByteSet) {    // If the charset has two bytes per char
            $a++;
                // Big endian assumed. The bytes are combined with OR; the former "&" always resulted in 0.
            $ord = ($ord << 8) | ord(substr($str,$a,1));
        } elseif ($ord>127) {   // Not ASCII: either a single byte above 127 or the lead byte of a two-byte char
            if ($isShiftJis) {
                    // Shift-JIS is like EUC, but chars between 160 and 223 are single byte
                $hasTrailByte = ($ord<160 || $ord>223);
            } else {
                    // EUC uses two bytes above 127
                $hasTrailByte = $isEucBased;
            }

            if ($hasTrailByte) {    // Get the second byte, advance the pointer and make $ord a 16bit int.
                $a++;
                $ord = $ord*256 + ord(substr($str,$a,1));
            }
        } else {    // ... otherwise it's just ASCII 0-127 and one byte. Transparent
            $outStr .= $chr;
            continue;
        }

            // If the local char-number was found in the parsed conv. table we use that, otherwise the "no char" byte
        $outStr .= isset($localToUtf8[$ord]) ? $localToUtf8[$ord] : $noChar;
    }

    return $outStr;
}
584
/**
 * Converts $str from UTF-8 to $charset
 *
 * @param   string      String in UTF-8 to convert to local charset
 * @param   string      Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param   boolean     If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return  string      Output string, converted to local charset. Nothing (NULL) is returned if the conversion table for the charset could not be initialized.
 */
function utf8_decode($str,$charset,$useEntityForNoChar=0) {

        // Parse conv. table if not already done; give up if there is none.
    if (!$this->initCharset($charset)) return;

    $outStr = '';
    $strLen = strlen($str);
    $pos = 0;

    while ($pos < $strLen) {    // Traverse each char in UTF-8 string.
        $chr = substr($str,$pos,1);
        $ord = ord($chr);

        if ($ord <= 127) {
                // ASCII 0-127 is one byte and passed through.
            $outStr .= $chr;
        } elseif (!($ord & 64)) {
                // A lead byte must have the 7th bit set, so this byte is from the MIDDLE of a sequence: no char.
            $outStr .= chr($this->noCharByteVal);
        } else {
                // Lead byte: every further leading 1-bit announces one more byte of the sequence.
            $buf = $chr;
            for ($b=0;$b<8;$b++) {
                $ord = $ord << 1;
                if (!($ord & 128)) break;
                $pos++;
                $buf .= substr($str,$pos,1);
            }

            if (isset($this->parsedCharsets[$charset]['utf8'][$buf])) {
                $mByte = $this->parsedCharsets[$charset]['utf8'][$buf];     // The local number
                if ($mByte>255) {
                        // Local numbers above 255 are written as two bytes (16bit word assumed).
                    $outStr .= chr(($mByte >> 8) & 255).chr($mByte & 255);
                } else {
                    $outStr .= chr($mByte);
                }
            } elseif ($useEntityForNoChar) {
                    // Not in the table: numeric entity
                $outStr .= '&#'.$this->utf8CharToUnumber($buf,1).';';
            } else {
                    // Not in the table: no char
                $outStr .= chr($this->noCharByteVal);
            }
        }

        $pos++;
    }

    return $outStr;
}
629
/**
 * Converts all chars > 127 to numeric entities.
 * The entities are written in hexadecimal notation (&#x...;).
 *
 * @param   string      Input string
 * @return  string      Output string
 */
function utf8_to_entities($str) {
    $outStr = '';
    $strLen = strlen($str);
    $pos = 0;

    while ($pos < $strLen) {    // Traverse each char in UTF-8 string.
        $chr = substr($str,$pos,1);
        $ord = ord($chr);

        if ($ord <= 127) {
                // ASCII 0-127 is one byte and passed through.
            $outStr .= $chr;
        } elseif (!($ord & 64)) {
                // A lead byte must have the 7th bit set, so this byte is from the MIDDLE of a sequence: no char.
            $outStr .= chr($this->noCharByteVal);
        } else {
                // Lead byte: every further leading 1-bit announces one more byte of the sequence.
            $buf = $chr;
            for ($b=0;$b<8;$b++) {
                $ord = $ord << 1;
                if (!($ord & 128)) break;
                $pos++;
                $buf .= substr($str,$pos,1);
            }

            $outStr .= '&#'.$this->utf8CharToUnumber($buf,1).';';
        }

        $pos++;
    }

    return $outStr;
}
661
/**
 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
 *
 * Only well-formed numeric entities are converted ("x" may be upper or lower case);
 * everything else that looks like an entity is left untouched.
 *
 * @param   string      Input string, UTF-8
 * @param   boolean     If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
 * @return  string      Output string
 */
function entities_to_utf8($str,$alsoStdHtmlEnt=0) {
    $trans_tbl = array();
    if ($alsoStdHtmlEnt) {
            // The table is needed in iso-8859-1 because the characters are passed to utf8_encode() below.
            // The charset can be requested since PHP 5.3.4; older versions always deliver iso-8859-1.
        if (version_compare(PHP_VERSION,'5.3.4','>=')) {
            $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES,ENT_COMPAT,'ISO-8859-1'));
        } else {
            $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES));
        }
    }

        // Split at everything that looks like an entity.
        // With the delimiter captured, all odd keys hold an entity body (without "&" and ";"), all even keys plain text.
    $parts = preg_split('/&([#[:alnum:]]*);/',$str,-1,PREG_SPLIT_DELIM_CAPTURE);
    foreach($parts as $k => $v) {
        if (!($k%2)) continue;

        $reg = array();
        if (preg_match('/^#[xX]([[:xdigit:]]+)$/',$v,$reg)) {   // Hex entities:
            $parts[$k] = $this->UnumberToChar(hexdec($reg[1]));
        } elseif (preg_match('/^#([0-9]+)$/',$v,$reg)) {    // Dec entities:
                // "+0" instead of intval() so that numbers beyond the integer range are not clipped to a valid looking value
            $parts[$k] = $this->UnumberToChar($reg[1]+0);
        } elseif (isset($trans_tbl['&'.$v.';'])) {  // Other entities:
            $parts[$k] = $this->utf8_encode($trans_tbl['&'.$v.';'],'iso-8859-1');
        } else {    // No conversion:
            $parts[$k] = '&'.$v.';';
        }
    }

    return implode('',$parts);
}
694
/**
 * Converts all chars in the input UTF-8 string into integer numbers returned in an array
 *
 * @param   string      Input string, UTF-8
 * @param   boolean     If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
 * @param   boolean     If set, then instead of integer numbers the real UTF-8 char is returned.
 * @return  array       Output array with the char numbers
 */
function utf8_to_numberarray($str,$convEntities=0,$retChar=0) {
        // Entities have to be turned into real characters first if they shall count as characters:
    if ($convEntities) {
        $str = $this->entities_to_utf8($str,1);
    }

    $outArr = array();
    $strLen = strlen($str);
    $pos = 0;

    while ($pos < $strLen) {    // Traverse each char in UTF-8 string.
        $chr = substr($str,$pos,1);
        $ord = ord($chr);

        if ($ord <= 127) {
                // ASCII 0-127 is one byte and passed through.
            $outArr[] = $retChar ? chr($ord) : $ord;
        } elseif (!($ord & 64)) {
                // A lead byte must have the 7th bit set, so this byte is from the MIDDLE of a sequence: no char.
            $outArr[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
        } else {
                // Lead byte: every further leading 1-bit announces one more byte of the sequence.
            $buf = $chr;
            for ($b=0;$b<8;$b++) {
                $ord = $ord << 1;
                if (!($ord & 128)) break;
                $pos++;
                $buf .= substr($str,$pos,1);
            }

            $outArr[] = $retChar ? $buf : $this->utf8CharToUnumber($buf);
        }

        $pos++;
    }

    return $outArr;
}
733
/**
 * Converts a UNICODE number to a UTF-8 multibyte character
 * Algorithm based on script found at From: http://czyborra.com/utf/
 *
 * The bits of the number are spread across the bytes; the number of high bits set
 * in the lead byte announces the number of bytes in the multibyte sequence:
 *
 *  bytes | bits | representation
 *      1 |    7 | 0vvvvvvv
 *      2 |   11 | 110vvvvv 10vvvvvv
 *      3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
 *      4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * @param   integer     UNICODE integer
 * @return  string      UTF-8 multibyte character string
 * @see utf8CharToUnumber()
 */
function UnumberToChar($cbyte) {
        // One byte: the number itself
    if ($cbyte < 0x80) return chr($cbyte);

        // Find the marker bits of the lead byte and the number of trailing bytes:
    if ($cbyte < 0x800) {
        $leadMarker = 0xC0;
        $trailBytes = 1;
    } elseif ($cbyte < 0x10000) {
        $leadMarker = 0xE0;
        $trailBytes = 2;
    } elseif ($cbyte < 0x200000) {
        $leadMarker = 0xF0;
        $trailBytes = 3;
    } elseif ($cbyte < 0x4000000) {
        $leadMarker = 0xF8;
        $trailBytes = 4;
    } elseif ($cbyte < 0x80000000) {
        $leadMarker = 0xFC;
        $trailBytes = 5;
    } else {    // Cannot express a 32-bit character in UTF-8
        return chr($this->noCharByteVal);
    }

        // Lead byte: marker plus the bits above all trailing bytes
    $char = chr($leadMarker | ($cbyte >> (6*$trailBytes)));

        // Trailing bytes: 6 bits each, most significant group first
    for ($shift=6*($trailBytes-1); $shift>=0; $shift-=6) {
        $char .= chr(0x80 | (($cbyte >> $shift) & 0x3F));
    }

    return $char;
}
788
/**
 * Converts a UTF-8 Multibyte character to a UNICODE number
 *
 * @param   string      UTF-8 multibyte character string
 * @param   boolean     If set, then a hex. number is returned (a string prefixed with "x").
 * @return  integer     UNICODE integer
 * @see UnumberToChar()
 */
function utf8CharToUnumber($str,$hex=0) {
    $leadByte = ord(substr($str,0,1));  // First char

    if (($leadByte & 192) != 192) {
            // Not the lead byte of a multi byte string: the byte value is the number
        $int = $leadByte;
    } else {
        $shifted = $leadByte;
        $trailBits = '';
        $b = 0;
        while ($b < 8) {
                // Every further leading 1-bit of the lead byte announces one more byte in the sequence ...
            $shifted = $shifted << 1;
            if (!($shifted & 128)) break;
                // ... which contributes its lower 6 bits.
            $trailBits .= substr('00000000'.decbin(ord(substr($str,$b+1,1))),-6);
            $b++;
        }
            // $b is the number of trailing bytes now; the lead byte contributes its lower (6-$b) bits.
        $leadBits = substr('00000000'.decbin($leadByte),-(6-$b));

        $int = bindec($leadBits.$trailBits);
    }

    return $hex ? 'x'.dechex($int) : $int;
}
816
817
818
819
820
821
822
823
824
825 /********************************************
826 *
827 * Init functions
828 *
829 ********************************************/
830
/**
 * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
 * This function is automatically called by the conversion functions
 *
 * The parsed table is stored in $this->parsedCharsets[$charset] with the sub-arrays
 * 'local' (local char number => UTF-8 char) and 'utf8' (UTF-8 char => local char number).
 * Only local char numbers above 127 are stored.
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param   string      The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return  integer     Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was loaded (from the cache file or by parsing the table file).
 * @access private
 */
function initCharset($charset) {
        // Only process if the charset is not yet loaded:
    if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) return 1;

        // Conversion table filename:
    $charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';

        // Give up if the conversion table is not found:
    if (!($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile))) return false;

        // Cache file for charsets:
        // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
    $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/charset_'.$charset.'.tbl');
    $table = false;
    if ($cacheFile && @is_file($cacheFile)) {
        $table = unserialize(t3lib_div::getUrl($cacheFile));
    }

        // No cache file, or one that does not contain a table (eg. truncated): parse the conversion table.
    if (!is_array($table)) {
        $table = array('local'=>array(),'utf8'=>array());

            // The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
            // (PCRE functions are used because the POSIX regex functions ereg/split do not exist any more since PHP 7.)
        $whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
        $detectedType = '';

        $lines = t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1);
        foreach($lines as $value) {
                // Comment line or blanks are ignored.
            if (!trim($value) || substr($value,0,1)=='#') continue;

                // Detect type if not done yet: (Done on first real line)
            if (!$detectedType) $detectedType = preg_match($whitespacedPattern,$value) ? 'whitespaced' : 'ms-token';

            if ($detectedType=='ms-token') {
                $fields = preg_split('/=|:/',$value,3);
                if (count($fields)<2) continue;     // not a mapping line
                $hexbyte = $fields[0];
                $utf8 = $fields[1];
            } else {
                $regA = array();
                if (!preg_match($whitespacedPattern,$value,$regA)) continue;    // not a mapping line (eg. undefined code)
                $hexbyte = $regA[1];
                $utf8 = 'U+'.$regA[2];
            }

            $decval = hexdec(trim($hexbyte));
            if ($decval>127) {
                    // Strip the leading "U+" and store the mapping in both directions
                $utf8decval = hexdec(substr(trim($utf8),2));
                $utf8Char = $this->UnumberToChar($utf8decval);
                $table['local'][$decval] = $utf8Char;
                $table['utf8'][$utf8Char] = $decval;
            }
        }

        if ($cacheFile) {
            t3lib_div::writeFile($cacheFile,serialize($table));
        }
    }

    $this->parsedCharsets[$charset] = $table;
    return 2;
}
893
/**
 * This function initializes all UTF-8 character data tables
 * (case folding in $this->caseFolding['utf-8'], ASCII transliteration in $this->toASCII['utf-8']).
 *
 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
 *
 * @param	string		Table needed by the caller: "case" (case folding) or "ascii" (ASCII transliteration). For these modes the table is taken from memory or from its cache file if available. With any other value (default) both tables are required.
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initUnicodeData($mode=null)	{
		// cache files
	$cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cscase_utf-8.tbl');
	$cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/csascii_utf-8.tbl');

		// Only process if the tables are not yet loaded
	switch($mode)	{
		case 'case':
			if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8']))	return 1;

				// Use cached version if possible
			if ($cacheFileCase && @is_file($cacheFileCase))	{
				$this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase));
				return 2;
			}
		break;

		case 'ascii':
			if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8']))	return 1;

				// Use cached version if possible
			if ($cacheFileASCII && @is_file($cacheFileASCII))	{
				$this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII));
				return 2;
			}
		break;

		default:
				// No specific table requested: nothing to do if both are already in memory
			if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8']) &&
				isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8']))	{
				return 1;
			}
		break;
	}

		// process main Unicode data file
	$unicodeDataFile = PATH_t3lib.'unidata/UnicodeData.txt';
	if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile)))	return false;

	$fh = fopen($unicodeDataFile,'r');
	if (!$fh)	return false;

		// key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
		// note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
	$this->caseFolding['utf-8'] = array();
	$utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand
	$utf8CaseFolding['toUpper'] = array();
	$utf8CaseFolding['toLower'] = array();
	$utf8CaseFolding['toTitle'] = array();

	$decomposition = array();	// array of temp. decompositions
	$mark = array();		// array of chars that are marks (eg. composing accents)
	$number = array();		// array of chars that are numbers (eg. digits)

	while (!feof($fh))	{
		$line = rtrim(fgets($fh));
		if ($line === '')	continue;	// blank line, or failed read at the end of the file

			// a line has a lot of info; pad so that short lines do not produce undefined offsets
		$fields = array_pad(explode(';', $line), 15, '');
		$char = $fields[0];	// code point (hex)
		$cat = $fields[2];	// general category
		$decomp = $fields[5];	// decomposition mapping
		$num = $fields[8];	// numeric value
		$upper = $fields[12];
		$lower = $fields[13];
		$title = $fields[14];

		$ord = hexdec($char);
		if ($ord > 0xFFFF)	break;	// only process the BMP

		$utf8_char = $this->UnumberToChar($ord);

		if ($upper)	$utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
		if ($lower)	$utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
			// store "title" only when different from "upper" (only a few)
			// strict comparison: hex strings such as "1E02" must not be compared as numbers
		if ($title && $title !== $upper)	$utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));

		switch (substr($cat,0,1))	{
			case 'M':	// mark (accent, umlaut, ...)
				$mark["U+$char"] = 1;
			break;

			case 'N':	// numeric value
				if ($ord > 0x80 && $num != '')	$number["U+$char"] = $num;
			break;
		}

			// $match[1] = optional decomposition tag (eg. "<compat>"), $match[2] = list of code points
		$match = array();
		if (preg_match('/(<.*>)? *(.+)/',$decomp,$match))	{
			switch($match[1])	{
				case '<circle>':	// add parenthesis as circle replacement, eg (1)
					$match[2] = '0028 '.$match[2].' 0029';
				break;

				case '<square>':	// add square brackets as square replacement, eg [1]
					$match[2] = '005B '.$match[2].' 005D';
				break;

				case '<compat>':	// ignore multi char decompositions that start with a space
					if (strncmp($match[2],'0020 ',5) == 0)	continue 2;
				break;

					// ignore Arabic and vertical layout presentation decomposition
				case '<initial>':
				case '<medial>':
				case '<final>':
				case '<isolated>':
				case '<vertical>':
					continue 2;
			}
			$decomposition["U+$char"] = explode(' ',$match[2]);
		}
	}
	fclose($fh);

		// process additional Unicode data for casing (allow folded characters to expand into a sequence)
	$specialCasingFile = PATH_t3lib.'unidata/SpecialCasing.txt';
	if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile))	{

		$fh = fopen($specialCasingFile,'r');
		if ($fh)	{
			while (!feof($fh))	{
				$line = fgets($fh);
				if (trim($line) == '' || substr($line,0,1) == '#')	continue;	// blank or comment line

				list($char,$lower,$title,$upper,$cond) = array_pad(t3lib_div::trimExplode(';', $line), 5, '');
					// only unconditional mappings are used (5th field empty or a trailing comment)
				if ($cond == '' || substr($cond,0,1) == '#')	{
					$utf8_char = $this->UnumberToChar(hexdec($char));
					$targets = array('toLower' => $lower, 'toTitle' => $title, 'toUpper' => $upper);

					foreach ($targets as $case => $codes)	{
						if ($char === $codes)	continue;	// maps to itself
						if ($case == 'toTitle' && $title === $upper)	continue;	// title is only stored when different from upper

						$sequence = '';
						foreach (explode(' ',$codes) as $code)	{
							$sequence .= $this->UnumberToChar(hexdec($code));
						}
						$utf8CaseFolding[$case][$utf8_char] = $sequence;
					}
				}
			}
			fclose($fh);
		}
	}

		// custom decompositions
	$decomposition['U+00A5'] = array('0079','0065','006E');	// YEN SIGN => yen
	$decomposition['U+00A9'] = array('0028','0063','0029');	// COPYRIGHT SIGN => (c)
	$decomposition['U+00AE'] = array('0028','0072','0029');	// REGISTERED SIGN => (r)
	$decomposition['U+00B1'] = array('002B','002F','002D');	// PLUS-MINUS SIGN => +/-
	$decomposition['U+00B5'] = array('0075');		// MICRO SIGN => u
	$decomposition['U+00C4'] = array('0041','0045');	// LATIN CAPITAL LETTER A WITH DIAERESIS => AE
	$decomposition['U+00C5'] = array('0041','0041');	// LATIN CAPITAL LETTER A WITH RING ABOVE => AA (Danish)
	$decomposition['U+00C6'] = array('0041','0045');	// LATIN CAPITAL LETTER AE => AE
	$decomposition['U+00D6'] = array('004F','0045');	// LATIN CAPITAL LETTER O WITH DIAERESIS => OE
	$decomposition['U+00D8'] = array('004F','0045');	// LATIN CAPITAL LETTER O WITH STROKE => OE (Danish)
	$decomposition['U+00DC'] = array('0055','0045');	// LATIN CAPITAL LETTER U WITH DIAERESIS => UE
	$decomposition['U+00E4'] = array('0061','0065');	// LATIN SMALL LETTER A WITH DIAERESIS => ae
	$decomposition['U+00E5'] = array('0061','0061');	// LATIN SMALL LETTER A WITH RING ABOVE => aa
	$decomposition['U+00DF'] = array('0073','0073');	// LATIN SMALL LETTER SHARP S => ss (German)
	$decomposition['U+00E6'] = array('0061','0065');	// LATIN SMALL LETTER AE => ae
	$decomposition['U+00F6'] = array('006F','0065');	// LATIN SMALL LETTER O WITH DIAERESIS => oe
	$decomposition['U+00F8'] = array('006F','0065');	// LATIN SMALL LETTER O WITH STROKE => oe (Danish)
	$decomposition['U+00FC'] = array('0075','0065');	// LATIN SMALL LETTER U WITH DIAERESIS => ue
	$decomposition['U+0152'] = array('004F','0045');	// LATIN CAPITAL LETTER OE => OE
	$decomposition['U+0153'] = array('006F','0065');	// LATIN SMALL LETTER OE => oe
	$decomposition['U+02BC'] = array('0027');		// MODIFIER LETTER APOSTROPHE => '
	$decomposition['U+02CA'] = array('0027');		// MODIFIER LETTER ACUTE ACCENT => '
	$decomposition['U+2044'] = array('002F');		// FRACTION SLASH => /
	$decomposition['U+20A0'] = array('0045','0055','0052');	// EURO-CURRENCY SIGN => EUR
	$decomposition['U+20AC'] = array('0045','0055','0052');	// EURO SIGN => EUR

		// decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
	foreach($decomposition as $from => $to)	{
		$code_decomp = array();

		while ($code_value = array_shift($to))	{
			if (isset($decomposition["U+$code_value"]))	{	// do recursive decomposition
				foreach(array_reverse($decomposition["U+$code_value"]) as $cv)	{
					array_unshift($to, $cv);
				}
			} elseif (!isset($mark["U+$code_value"]))	{	// remove mark
				array_push($code_decomp, $code_value);
			}
		}
		if (count($code_decomp))	{
			$decomposition[$from] = $code_decomp;
		} else {
			unset($decomposition[$from]);
		}
	}

		// create ascii only mapping
	$this->toASCII['utf-8'] = array();
	$ascii =& $this->toASCII['utf-8'];

	foreach($decomposition as $from => $to)	{
		$code_decomp = array();
		while ($code_value = array_shift($to))	{
			$ord = hexdec($code_value);
			if ($ord > 127)	{
				continue 2;	// skip decompositions containing non-ASCII chars
			} else {
				array_push($code_decomp,chr($ord));
			}
		}
		$ascii[$this->UnumberToChar(hexdec($from))] = implode('',$code_decomp);
	}

		// add numeric decompositions
	foreach($number as $from => $to)	{
		$utf8_char = $this->UnumberToChar(hexdec($from));
		if (!isset($ascii[$utf8_char]))	{
			$ascii[$utf8_char] = $to;
		}
	}

	if ($cacheFileCase)	{
		t3lib_div::writeFile($cacheFileCase,serialize($utf8CaseFolding));
	}

	if ($cacheFileASCII)	{
		t3lib_div::writeFile($cacheFileASCII,serialize($ascii));
	}

	return 3;
}
1118
/**
 * This function initializes the folding table for a charset other than UTF-8.
 * This function is automatically called by the case folding functions.
 * The table is derived from the UTF-8 case folding table by converting every
 * character of the charset's conversion table to and from UTF-8.
 *
 * @param	string		Charset for which to initialize case folding.
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initCaseFolding($charset)	{
		// Only process if the case table is not yet loaded:
	if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset]))	return 1;

		// Use cached version if possible
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cscase_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile))	{
		$this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

		// init UTF-8 conversion for this charset
	if (!$this->initCharset($charset))	{
		return false;
	}

		// UTF-8 case folding is used as the base conversion table
		// (only (re)built when it is not already in memory)
	$utf8Loaded = isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8']);
	if (!$utf8Loaded && !$this->initUnicodeData())	{
		return false;
	}

	$nochar = chr($this->noCharByteVal);
	$foldings = array('toUpper','toLower','toTitle');
	foreach ($this->parsedCharsets[$charset]['local'] as $ci => $utf8)	{
			// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->conv($utf8, 'utf-8', $charset);

		foreach ($foldings as $folding)	{
				// characters without a mapping for this folding are skipped without a conversion round trip
			if (!isset($this->caseFolding['utf-8'][$folding][$utf8]))	continue;

			$cc = $this->conv($this->caseFolding['utf-8'][$folding][$utf8], 'utf-8', $charset);
				// keep the mapping only if the folded character exists in the target charset
			if ($cc && $cc != $nochar)	$this->caseFolding[$charset][$folding][$c] = $cc;
		}
	}

		// add the ASCII case table
	for ($i=ord('a'); $i<=ord('z'); $i++)	{
		$this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i-32);
	}
	for ($i=ord('A'); $i<=ord('Z'); $i++)	{
		$this->caseFolding[$charset]['toLower'][chr($i)] = chr($i+32);
	}

	if ($cacheFile)	{
		t3lib_div::writeFile($cacheFile,serialize($this->caseFolding[$charset]));
	}

	return 3;
}
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194 /********************************************
1195 *
1196 * String operation functions
1197 *
1198 ********************************************/
1199
/**
 * Returns a part of a string.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * Dispatches to mbstring (if configured in TYPO3_CONF_VARS[SYS][t3lib_cs_utils]),
 * the internal UTF-8 / EUC implementations, fixed-width handling for two- and
 * four-byte charsets, or plain substr() for everything else.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Start position (character position)
 * @param	integer		Length (in characters); NULL (default) means "to the end of the string"
 * @return	string		The substring
 * @see substr(), mb_substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function substr($charset,$string,$start,$len=null)	{
	if ($len===0)	return '';

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
			// cannot omit $len, when specifying charset
		if ($len==null)	{
			$enc = mb_internal_encoding();	// save internal encoding
			mb_internal_encoding($charset);
			$str = mb_substr($string,$start);
			mb_internal_encoding($enc);	// restore internal encoding

			return $str;
		}
		return mb_substr($string,$start,$len,$charset);
	} elseif ($charset == 'utf-8')	{
		return $this->utf8_substr($string,$start,$len);
	} elseif (!empty($this->eucBasedSets[$charset]))	{
		return $this->euc_substr($string,$start,$charset,$len);
	} elseif (!empty($this->twoByteSets[$charset]))	{
			// an omitted length must not be multiplied (NULL*2 would be a length of 0)
		return $len === null ? substr($string,$start*2) : substr($string,$start*2,$len*2);
	} elseif (!empty($this->fourByteSets[$charset]))	{
		return $len === null ? substr($string,$start*4) : substr($string,$start*4,$len*4);
	}

		// treat everything else as single-byte encoding
	return $len === null ? substr($string,$start) : substr($string,$start,$len);
}
1239
/**
 * Counts the number of characters.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * Uses mbstring if configured, otherwise the internal UTF-8 / EUC counters;
 * two- and four-byte charsets are counted by dividing the byte length.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @return	integer		The number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strlen($charset,$string)	{
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
		return mb_strlen($string,$charset);
	}
	if ($charset == 'utf-8')	{
		return $this->utf8_strlen($string);
	}
	if ($this->eucBasedSets[$charset])	{
		return $this->euc_strlen($string,$charset);
	}
	if ($this->twoByteSets[$charset])	{
		return strlen($string)/2;
	}
	if ($this->fourByteSets[$charset])	{
		return strlen($string)/4;
	}

		// treat everything else as single-byte encoding
	return strlen($string);
}
1265
/**
 * Truncates a string and pre-/appends a string.
 * Unit tested by Kasper
 *
 * A positive length keeps the beginning of the string and appends the crop
 * signifier, a negative length keeps the end and prepends it. A length of zero
 * returns the crop signifier alone. For charsets other than UTF-8 and the EUC
 * family the length is applied as a byte count.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Length (in characters)
 * @param	string		Crop signifier
 * @return	string		The shortened string
 * @see substr(), mb_strimwidth()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function crop($charset,$string,$len,$crop='')	{
	if ($len == 0)	return $crop;

		// Translate the character count into a byte position
	if ($charset == 'utf-8')	{
		$bytePos = $this->utf8_char2byte_pos($string,$len);
	} elseif ($this->eucBasedSets[$charset])	{
		$bytePos = $this->euc_char2byte_pos($string,$len,$charset);
	} elseif ($len > 0)	{
		$bytePos = $len;
	} else {
		$bytePos = strlen($string)+$len;
		if ($bytePos<=0)	$bytePos = false;
	}

		// $len outside actual string length: nothing to crop
	if ($bytePos === false)	return $string;

		// The byte position may point at or behind the end of the string (eg. when the string
		// ends with a multibyte char); the crop signifier is only added if something is cut off.
	if ($len > 0)	{
		return isset($string[$bytePos]) ? substr($string,0,$bytePos).$crop : $string;
	}
	return isset($string[$bytePos-1]) ? $crop.substr($string,$bytePos) : $string;
}
1319
/**
 * Cuts a string short at a given byte length.
 * The cut is moved backwards where needed so that no multi-byte character is split.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		The byte length
 * @return	string		The shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strtrunc($charset,$string,$len)	{
	if ($len <= 0)	return '';

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
		return mb_strcut($string,0,$len,$charset);
	} elseif ($charset == 'utf-8')	{
		return $this->utf8_strtrunc($string,$len);
	} elseif (!empty($this->eucBasedSets[$charset]))	{
			// euc_strtrunc() expects (string, byte length, charset)
		return $this->euc_strtrunc($string,$len,$charset);
	} elseif (!empty($this->twoByteSets[$charset]))	{
		if ($len % 2)	$len--;		// don't cut at odd positions
	} elseif (!empty($this->fourByteSets[$charset]))	{
		$len -= $len % 4;	// realign to position dividable by four
	}

		// treat everything else as single-byte encoding
	return substr($string,0,$len);
}
1348
/**
 * Translates all characters of a string into their respective case values.
 * Unlike strtolower() and strtoupper() this method is locale independent.
 * Note that the string length may change!
 * eg. lower case German "ß" (scharfes S) becomes upper case "SS"
 * Unit-tested by Kasper
 * Real case folding is language dependent, this method ignores this fact.
 *
 * @param	string		Character set of string
 * @param	string		Input string to convert case for
 * @param	string		Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
 * @return	string		The converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper()
 */
function conv_case($charset,$string,$case)	{
		// mb_strtolower()/mb_strtoupper() are only used on PHP 4.3 and later
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring' &&
		(float)phpversion() >= 4.3)	{
		if ($case == 'toLower')	{
			return mb_strtolower($string,$charset);
		} else {
			return mb_strtoupper($string,$charset);
		}
	} elseif ($charset == 'utf-8')	{
		return $this->utf8_conv_case($string,$case);
	} elseif (!empty($this->eucBasedSets[$charset]))	{
		return $this->euc_conv_case($string,$case,$charset);
	}

		// treat everything else as single-byte encoding
	if (!$this->initCaseFolding($charset))	return $string;	// do nothing
	$out = '';
	$caseConv =& $this->caseFolding[$charset][$case];

	for ($i=0; isset($string[$i]); $i++)	{
		$c = $string[$i];
			// bytes without a (non-empty) mapping are copied unchanged
		$out .= !empty($caseConv[$c]) ? $caseConv[$c] : $c;
	}

		// is a simple strtr() faster or slower than the code above?
		// perhaps faster for small single-byte tables but slower for large multi-byte tables?
		//
		// return strtr($string,$this->caseFolding[$charset][$case]);

	return $out;
}
1400
/**
 * Converts special chars (like ÆØÅæøå, umlauts etc) to ascii equivalents (usually double-bytes, like æ => ae etc.)
 * CURRENTLY IT IS FULLY IMPLEMENTED ONLY FOR UTF-8!!!
 * All other charsets are handed over to t3lib_div::convUmlauts().
 *
 * @param	string		Character set of string
 * @param	string		Input string to convert
 * @return	string		The converted string
 */
function specCharsToASCII($charset,$string)	{
	if ($charset != 'utf-8')	{
		return t3lib_div::convUmlauts($string);
	}

	return $this->utf8_toASCII($string);
}
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430 /********************************************
1431 *
1432 * Internal UTF-8 string operation functions
1433 *
1434 ********************************************/
1435
/**
 * Returns a part of a UTF-8 string.
 * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
 *
 * @param	string		UTF-8 string
 * @param	integer		Start position (character position)
 * @param	integer		Length (in characters)
 * @return	string		The substring; FALSE if a positive $start lies outside the string
 * @see substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_substr($str,$start,$len=null)	{
		// a length of zero (integer or string) always yields an empty string
	if ((string)$len === '0')	return '';

	$byte_start = $this->utf8_char2byte_pos($str,$start);
	if ($byte_start === false)	{
		if ($start > 0)	{
			return false;	// $start outside string length
		}
			// zero or negative $start that could not be resolved: begin at the first byte
		$byte_start = 0;
	}

	$str = substr($str,$byte_start);

	if ($len == null)	return $str;	// no length given: everything from $start on

	$byte_end = $this->utf8_char2byte_pos($str,$len);
	if ($byte_end === false)	{	// $len outside actual string length
		return $len<0 ? '' : $str;	// When length is less than zero and exceeds, then we return blank string.
	}
	return substr($str,0,$byte_end);
}
1470
/**
 * Counts the number of characters of a string in UTF-8.
 * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
 *
 * @param	string		UTF-8 multibyte character string
 * @return	integer		The number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strlen($str)	{
	$count = 0;
	$pos = 0;
	while (isset($str[$pos]))	{
			// single bytes (0xxxxxxx) and multi-byte starting bytes (11xxxxxx) each begin a
			// character; only continuation bytes (10xxxxxx) are not counted
		if ((ord($str[$pos]) & 0xC0) != 0x80)	{
			$count++;
		}
		$pos++;
	}
	return $count;
}
1491
/**
 * Truncates a string in UTF-8 short at a given byte length.
 * The result is never longer than the given number of bytes and never ends
 * in the middle of a multi-byte sequence.
 *
 * @param	string		UTF-8 multibyte character string
 * @param	integer		the byte length
 * @return	string		the shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strtrunc($str,$len)	{
	if ($len <= 0)	return '';
	if ($len >= strlen($str))	return $str;	// nothing to cut off

		// $i is the index of the first byte that is cut off. As long as that byte is a
		// continuation byte (10xxxxxx) the cut would split a character, so move it back
		// to the starting byte of that character.
	$i = $len;
	while ($i > 0 && (ord($str[$i]) & 0xC0) == 0x80)	{
		$i--;
	}

	return substr($str,0,$i);
}
1512
/**
 * Find position of first occurrence of a string, both arguments are in UTF-8.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 string to search for
 * @param	integer		Position to start the search (character position)
 * @return	integer		The character position; FALSE if the needle is not found or the offset is beyond the string
 * @see strpos()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strpos($haystack,$needle,$offset=0)	{
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
			// third argument is the offset, the encoding is the fourth
		return mb_strpos($haystack,$needle,$offset,'utf-8');
	}

	$byte_offset = $this->utf8_char2byte_pos($haystack,$offset);
	if ($byte_offset === false)	return false;	// offset beyond string length

	$byte_pos = strpos($haystack,$needle,$byte_offset);
	if ($byte_pos === false)	return false;	// needle not found

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
1536
/**
 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 character to search for (single character)
 * @return	integer		The character position; FALSE if the needle is not found
 * @see strrpos()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strrpos($haystack,$needle)	{
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
			// NOTE(review): the encoding is passed as third argument, which is the mb_strrpos()
			// signature of PHP versions before 5.2; later versions expect (haystack, needle, offset, encoding).
		return mb_strrpos($haystack,$needle,'utf-8');
	}

		// search on byte level, then translate the byte position into a character position
	$byte_pos = strrpos($haystack,$needle);

	return $byte_pos === false ? false : $this->utf8_byte2char_pos($haystack,$byte_pos);
}
1556
/**
 * Translates a character position into an 'absolute' byte position.
 * Unit tested by Kasper.
 *
 * For a positive position the string is scanned from the start, for a negative
 * one from the end. FALSE is returned when the scan leaves the string; for
 * negative positions this includes the position of the very first character.
 *
 * @param	string		UTF-8 string
 * @param	integer		Character position (negative values start from the end)
 * @return	integer		Byte position, FALSE if the position is outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char2byte_pos($str,$pos)	{
	$n = 0;			// number of characters found
	$p = abs($pos);		// number of characters wanted
	$size = strlen($str);

	if ($pos >= 0)	{
		$i = 0;
		$d = 1;
	} else {
		$i = $size-1;
		$d = -1;
	}

		// Explicit bounds are used instead of isset() on the string offset, because
		// negative string offsets are valid (counted from the end) as of PHP 7.1.
	for ( ; $i>=0 && $i<$size && $n<$p; $i+=$d)	{
			// single bytes (0xxxxxxx) and multi-byte starting bytes (11xxxxxx) begin a character
		if ((ord($str[$i]) & 0xC0) != 0x80)	$n++;
	}
	if ($i<0 || $i>=$size)	return false;	// offset beyond string length

	if ($pos >= 0)	{
			// skip trailing multi-byte data bytes
		while ($i<$size && (ord($str[$i]) & 0xC0) == 0x80)	{ $i++; }
	} else {
			// correct offset
		$i++;
	}

	return $i;
}
1597
/**
 * Translates an 'absolute' byte position into a character position.
 * Unit tested by Kasper.
 *
 * @param	string		UTF-8 string
 * @param	integer		byte position (expected to point at the first byte of a character)
 * @return	integer		character position, FALSE if the byte position is outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_byte2char_pos($str,$pos)	{
		// The range check has to happen before counting; after the loop the index is always 0.
	if ($pos < 0 || $pos >= strlen($str))	return false;	// offset beyond string length

	$n = 0;	// number of characters
	for ($i=$pos; $i>0; $i--)	{
			// count every byte that begins a character: single bytes (0xxxxxxx)
			// and multi-byte starting bytes (11xxxxxx)
		if ((ord($str[$i]) & 0xC0) != 0x80)	$n++;
	}

	return $n;
}
1620
/**
 * Translates all characters of an UTF-8 string into their respective case values.
 * Unit-tested by Kasper
 *
 * @param	string		UTF-8 string
 * @param	string		conversion: 'toLower' or 'toUpper'
 * @return	string		the converted string (the unchanged input if the case folding table cannot be initialized)
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper(), mb_convert_case()
 */
function utf8_conv_case($str,$case)	{
		// Build the case folding table only if it is not in memory yet
	$loaded = isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8']);
	if (!$loaded && !$this->initUnicodeData())	return $str;	// do nothing

	$out = '';
	$caseConv =& $this->caseFolding['utf-8'][$case];

	for ($i=0; isset($str[$i]); $i++)	{
		$c = ord($str[$i]);
			// single-byte (0xxxxxxx); a stray continuation byte (10xxxxxx) is passed through as it is
		$mbc = $str[$i];
		if (($c & 0xC0) == 0xC0)	{	// multi-byte starting byte (11xxxxxx)
			for ($bc=0; $c & 0x80; $c = $c << 1)	{ $bc++; }	// calculate number of bytes
			$mbc = substr($str,$i,$bc);
			$i += $bc-1;
		}

		if (isset($caseConv[$mbc]))	{
			$out .= $caseConv[$mbc];
		} else {
			$out .= $mbc;
		}
	}

	return $out;
}
1656
/**
 * Converts chars with accents, umlauts or composed to ASCII equivalents.
 * Characters without an entry in the ASCII mapping table are copied unchanged.
 *
 * @param	string		Input string to convert (taken by reference, but not modified)
 * @return	string		The converted string (the unchanged input if the mapping table cannot be initialized)
 */
function utf8_toASCII(&$str)	{
	if (!$this->initUnicodeData('ascii'))	return $str;	// do nothing

	$out = '';
	$toASCII =& $this->toASCII['utf-8'];

	for ($i=0; isset($str[$i]); $i++)	{
		$c = ord($str[$i]);
			// single-byte (0xxxxxxx); a stray continuation byte (10xxxxxx) is passed through as it is
		$mbc = $str[$i];
		if (($c & 0xC0) == 0xC0)	{	// multi-byte starting byte (11xxxxxx)
			for ($bc=0; $c & 0x80; $c = $c << 1)	{ $bc++; }	// calculate number of bytes
			$mbc = substr($str,$i,$bc);
			$i += $bc-1;
		}

		if (isset($toASCII[$mbc]))	{
			$out .= $toASCII[$mbc];
		} else {
			$out .= $mbc;
		}
	}

	return $out;
}
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705 /********************************************
1706 *
1707 * Internal EUC string operation functions
1708 *
1709 * Extended Unix Code:
1710 * ASCII compatible 7bit single bytes chars
1711 * 8bit two byte chars
1712 *
1713 * Shift-JIS is treated as a special case.
1714 *
1715 ********************************************/
1716
/**
 * Cuts a string in the EUC charset family short at a given byte length.
 * The result is never longer than the given number of bytes; if the cut would
 * fall between the two bytes of a double-byte character, that character is dropped.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		the byte length
 * @param	string		the charset
 * @return	string		the shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strtrunc($str,$len,$charset)	{
	if ($len <= 0)	return '';
	if (strlen($str) <= $len)	return $str;	// string not longer than supplied length

	$sjis = ($charset == 'shift_jis');
		// walk character by character until the cut position is reached or stepped over
	for ($i=0; isset($str[$i]) && $i<$len; $i++)	{
		$c = ord($str[$i]);
		if ($sjis)	{
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))	$i++;	// advance a double-byte char
		} else {
			if ($c >= 0x80)	$i++;	// advance a double-byte char
		}
	}

	if ($i>$len)	{
		return substr($str,0,$len-1);	// we ended on a first byte
	}
	return substr($str,0,$len);
}
1745
/**
 * Returns a part of a string in the EUC charset family.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		start position (character position)
 * @param	string		the charset
 * @param	integer		length (in characters)
 * @return	string		the substring; FALSE if $start cannot be resolved to a byte position
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_substr($str,$start,$charset,$len=null)	{
		// a length of zero (integer or string) always yields an empty string
	if ((string)$len === '0')	return '';

	$byte_start = $this->euc_char2byte_pos($str,$start,$charset);
	if ($byte_start === false)	return false;	// $start outside string length

	$str = substr($str,$byte_start);

	if ($len == null)	return $str;	// no length given: everything from $start on

	$byte_end = $this->euc_char2byte_pos($str,$len,$charset);
	if ($byte_end === false)	{	// $len outside actual string length
			// same convention as utf8_substr(): a negative length that exceeds the string leaves nothing
		return $len<0 ? '' : $str;
	}
	return substr($str,0,$byte_end);
}
1771
/**
 * Counts the number of characters of a string in the EUC charset family.
 * Bytes from 0x80 upwards start a double-byte character; for "shift_jis" only
 * the ranges 0x80-0x9F and 0xE0-0xFF do.
 *
 * @param	string		EUC multibyte character string
 * @param	string		the charset
 * @return	integer		the number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strlen($str,$charset)	{
	$isShiftJis = ($charset == 'shift_jis');
	$count = 0;
	$pos = 0;

	while (isset($str[$pos]))	{
		$byte = ord($str[$pos]);
		if ($isShiftJis)	{
			$isDoubleByte = ($byte >= 0x80 && $byte < 0xA0) || ($byte >= 0xE0);
		} else {
			$isDoubleByte = ($byte >= 0x80);
		}

			// step over one character (one or two bytes)
		$pos += $isDoubleByte ? 2 : 1;
		$count++;
	}

	return $count;
}
1798
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * For a positive position the string is scanned from the start, for a negative
 * one from the end. FALSE is returned when the scan leaves the string; for
 * negative positions this includes the position of the very first character.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		character position (negative values start from the end)
 * @param	string		the charset
 * @return	integer		byte position, FALSE if the position is outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char2byte_pos($str,$pos,$charset)	{
	$sjis = ($charset == 'shift_jis');
	$n = 0;			// number of characters seen
	$p = abs($pos);		// number of characters wanted
	$size = strlen($str);

	if ($pos >= 0)	{
		$i = 0;
		$d = 1;
	} else {
		$i = $size-1;
		$d = -1;
	}

		// Explicit bounds are used instead of isset() on the string offset, because
		// negative string offsets are valid (counted from the end) as of PHP 7.1.
		// NOTE(review): when scanning backwards the byte tested is the second byte of a
		// double-byte char; this presumably only works if second bytes are in the same range as first bytes - verify for shift_jis.
	for ( ; $i>=0 && $i<$size && $n<$p; $i+=$d)	{
		$c = ord($str[$i]);
		if ($sjis)	{
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))	$i+=$d;	// advance a double-byte char
		} else {
			if ($c >= 0x80)	$i+=$d;	// advance a double-byte char
		}

		$n++;
	}
	if ($i<0 || $i>=$size)	return false;	// offset beyond string length

	if ($pos < 0)	$i++;	// correct offset

	return $i;
}
1838
/**
 * Translates all characters of a string in the EUC charset family into their respective case values.
 *
 * @param	string		EUC multibyte character string
 * @param	string		conversion: 'toLower' or 'toUpper'
 * @param	string		the charset
 * @return	string		the converted string (the unchanged input if the case folding table cannot be initialized)
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper(), mb_convert_case()
 */
function euc_conv_case($str,$case,$charset)	{
	if (!$this->initCaseFolding($charset))	return $str;	// do nothing

	$sjis = ($charset == 'shift_jis');
	$out = '';
	$caseConv =& $this->caseFolding[$charset][$case];

		// The loop must test for the existence of the byte, not its value:
		// the character "0" evaluates to FALSE and would end the conversion early.
	for ($i=0; isset($str[$i]); $i++)	{
		$mbc = $str[$i];
		$c = ord($mbc);

		if ($sjis)	{
			$doubleByte = ($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0);
		} else {
			$doubleByte = ($c >= 0x80);
		}
		if ($doubleByte)	{	// a double-byte char
			$mbc = substr($str,$i,2);
			$i++;
		}

			// characters without a (non-empty) mapping are copied unchanged
		$out .= !empty($caseConv[$mbc]) ? $caseConv[$mbc] : $mbc;
	}

	return $out;
}
1881 }
1882
// Include the XCLASS file configured for this script, if any (only when TYPO3_MODE is defined)
if (defined('TYPO3_MODE'))	{
	if ($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php'])	{
		include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
	}
}
1886 ?>