Fix PHP version and platform issues
[Packages/TYPO3.CMS.git] / t3lib / class.t3lib_cs.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2003-2004 Kasper Skaarhoj (kasper@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the Typo3 project. The Typo3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 *
17 * This script is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * This copyright notice MUST APPEAR in all copies of the script!
23 ***************************************************************/
24 /**
25 * Class for conversion between charsets.
26 *
27 * $Id$
28 *
29 * @author Kasper Skaarhoj <kasper@typo3.com>
30 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
31 */
32 /**
33 * [CLASS/FUNCTION INDEX of SCRIPT]
34 *
35 *
36 *
37 * 128: class t3lib_cs
38 * 442: function parse_charset($charset)
39 * 460: function get_locale_charset($locale)
40 * 492: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
41 * 529: function utf8_encode($str,$charset)
42 * 576: function utf8_decode($str,$charset,$useEntityForNoChar=0)
43 * 619: function utf8_to_entities($str)
44 * 652: function entities_to_utf8($str,$alsoStdHtmlEnt=0)
45 * 686: function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
46 * 736: function UnumberToChar($cbyte)
47 * 781: function utf8CharToUnumber($str,$hex=0)
48 *
49 * SECTION: Init functions
50 * 824: function initCharset($charset)
51 * 885: function initCaseFoldingUTF8()
52 * 973: function initCaseFolding($charset)
53 *
54 * SECTION: String operation functions
55 * 1058: function substr($charset,$string,$start,$len=null)
56 * 1096: function strlen($charset,$string)
57 * 1124: function crop($charset,$string,$len,$crop='')
58 * 1165: function strtrunc($charset,$string,$len)
59 * 1197: function conv_case($charset,$string,$case)
60 *
61 * SECTION: Internal UTF-8 string operation functions
62 * 1264: function utf8_substr($str,$start,$len=null)
63 * 1297: function utf8_strlen($str)
64 * 1318: function utf8_strtrunc($str,$len)
65 * 1340: function utf8_strpos($haystack,$needle,$offset=0)
66 * 1363: function utf8_strrpos($haystack,$needle)
67 * 1383: function utf8_char2byte_pos($str,$pos)
68 * 1424: function utf8_byte2char_pos($str,$pos)
69 * 1448: function utf8_conv_case($str,$case)
70 *
71 * SECTION: Internal EUC string operation functions
72 * 1514: function euc_strtrunc($str,$len,$charset)
73 * 1543: function euc_substr($str,$start,$charset,$len=null)
74 * 1568: function euc_strlen($str,$charset)
75 * 1595: function euc_char2byte_pos($str,$pos,$charset)
76 * 1636: function euc_conv_case($str,$case,$charset)
77 *
78 * TOTAL FUNCTIONS: 31
79 * (This index is automatically created/updated by the extension "extdeveval")
80 *
81 */
82
83
84
85
86
87
88
89
90 /**
91 * Notes on UTF-8
92 *
93 * Functions working on UTF-8 strings:
94 *
95 * - strchr/strstr
96 * - strrchr
97 * - substr_count
98 * - implode/explode/join
99 *
100 * Functions nearly working on UTF-8 strings:
101 *
102 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
103 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
104 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
105 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
106 *
107 * Functions NOT working on UTF-8 strings:
108 *
109 * - str*cmp
110 * - stristr
111 * - stripos
112 * - substr
113 * - strrev
114 * - ereg/eregi
115 * - split/spliti
116 * - preg_*
117 * - ...
118 *
119 */
120 /**
121 * Class for conversion between charsets
122 *
123 * @author Kasper Skaarhoj <kasper@typo3.com>
124 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
125 * @package TYPO3
126 * @subpackage t3lib
127 */
128 class t3lib_cs {
// Byte value (ASCII 63, "?") written for characters that have no equivalent.
var $noCharByteVal = 63;

// Cache for parsed conversion tables, filled per charset by initCharset().
var $parsedCharsets = array();

// Cache for case folding data.
var $caseFolding = array();

// Cache for charset-to-ASCII mappings.
var $toASCII = array();

// Charsets that store every character in two bytes:
var $twoByteSets = array(
	'ucs-2' => 1,	// 2-byte Unicode
);

// Charsets that store every character in four bytes:
var $fourByteSets = array(
	'ucs-4' => 1,	// 4-byte Unicode
	'utf-32' => 1,	// 4-byte Unicode (limited to the 21-bits of UTF-16)
);

// Charsets that use a scheme like the Extended Unix Code:
var $eucBasedSets = array(
	'gb2312' => 1,		// Chinese, simplified.
	'big5' => 1,		// Chinese, traditional.
	'shift_jis' => 1,	// Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
);
157
158 // see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
159 // http://czyborra.com/charsets/iso8859.html
// Alternative charset names (lowercase) mapped to the name used internally.
// parse_charset() looks names up here after lowercasing them.
// Note: every key must appear only once ('utf8' used to be listed twice).
var $synonyms = array(
	'us' => 'ascii',
	'us-ascii' => 'ascii',
	'cp819' => 'iso-8859-1',
	'ibm819' => 'iso-8859-1',
	'iso-ir-100' => 'iso-8859-1',
	'iso-ir-109' => 'iso-8859-2',
	'iso-ir-148' => 'iso-8859-9',
	'iso-ir-199' => 'iso-8859-14',
	'iso-ir-203' => 'iso-8859-15',
	'csisolatin1' => 'iso-8859-1',
	'csisolatin2' => 'iso-8859-2',
	'csisolatin3' => 'iso-8859-3',
	'csisolatin5' => 'iso-8859-9',
	'csisolatin8' => 'iso-8859-14',
	'csisolatin9' => 'iso-8859-15',
	'csisolatingreek' => 'iso-8859-7',
	'iso-celtic' => 'iso-8859-14',
	'latin1' => 'iso-8859-1',
	'latin2' => 'iso-8859-2',
	'latin3' => 'iso-8859-3',
	'latin5' => 'iso-8859-9',
	'latin6' => 'iso-8859-10',
	'latin8' => 'iso-8859-14',
	'latin9' => 'iso-8859-15',
	'l1' => 'iso-8859-1',
	'l2' => 'iso-8859-2',
	'l3' => 'iso-8859-3',
	'l5' => 'iso-8859-9',
	'l6' => 'iso-8859-10',
	'l8' => 'iso-8859-14',
	'l9' => 'iso-8859-15',
	'cyrillic' => 'iso-8859-5',
	'arabic' => 'iso-8859-6',
	'tis-620' => 'iso-8859-11',
	'win874' => 'windows-874',
	'win1250' => 'windows-1250',
	'win1251' => 'windows-1251',
	'win1252' => 'windows-1252',
	'win1253' => 'windows-1253',
	'win1254' => 'windows-1254',
	'win1255' => 'windows-1255',
	'win1256' => 'windows-1256',
	'win1257' => 'windows-1257',
	'win1258' => 'windows-1258',
	'cp1250' => 'windows-1250',
	'cp1251' => 'windows-1251',
	'cp1252' => 'windows-1252',
	'ms-ee' => 'windows-1250',
	'ms-ansi' => 'windows-1252',
	'ms-greek' => 'windows-1253',
	'ms-turk' => 'windows-1254',
	'winbaltrim' => 'windows-1257',
	'koi-8ru' => 'koi-8r',
	'koi8r' => 'koi-8r',
	'cp878' => 'koi-8r',
	'mac' => 'macroman',
	'macintosh' => 'macroman',
	'euc-cn' => 'gb2312',
	'x-euc-cn' => 'gb2312',
	'euccn' => 'gb2312',
	'cp936' => 'gb2312',
	'big-5' => 'big5',
	'cp950' => 'big5',
	'eucjp' => 'euc-jp',
	'sjis' => 'shift_jis',
	'shift-jis' => 'shift_jis',
	'cp932' => 'shift_jis',
	'utf7' => 'utf-7',
	'utf8' => 'utf-8',
	'utf16' => 'utf-16',
	'utf32' => 'utf-32',
	'ucs2' => 'ucs-2',
	'ucs4' => 'ucs-4',
);
236
237 // mapping of iso-639:2 language codes to language (family) names
// Maps a language identifier (lowercase) to a language family name.
// The family names are the keys used by $lang_to_charset_unix and
// $lang_to_charset_windows, so every value here must be spelled exactly
// like a key of those arrays to have any effect.
var $lang_to_langfamily = array(
		// two-letter ISO 639 language codes, see:
		// http://www.w3.org/WAI/ER/IG/ert/iso639.htm
		// http://www.unicode.org/onlinedat/languages.html
	'ar' => 'arabic',
	'bg' => 'cyrillic',
	'cs' => 'east_european',
	'da' => 'west_european',
	'de' => 'west_european',
	'es' => 'west_european',
	'et' => 'estonian',
	'eu' => 'west_european',
	'fi' => 'west_european',
	'fr' => 'west_european',
	'gr' => 'greek',
	'hr' => 'east_european',
	'hu' => 'east_european',
	'iw' => 'hebrew',
	'is' => 'west_european',
	'it' => 'west_european',
	'ja' => 'japanese',
	'kl' => 'west_european',
	'ko' => 'korean',
	'lt' => 'lithuanian',
	'lv' => 'west_european',	// Latvian/Lettish
	'nl' => 'west_european',
	'no' => 'west_european',
	'pl' => 'east_european',
	'pt' => 'west_european',
	'ro' => 'east_european',
	'ru' => 'cyrillic',
	'sk' => 'east_european',
	'sl' => 'east_european',
	'sv' => 'west_european',
	'th' => 'thai',
	'uk' => 'cyrillic',
	'vi' => 'vietnamese',
	'zh' => 'chinese',
		// MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
	'chs' => 'simpl_chinese',
	'cht' => 'trad_chinese',
	'csy' => 'east_european',
	'dan' => 'west_european',
	'deu' => 'west_european',
	'dea' => 'west_european',
	'des' => 'west_european',
	'ena' => 'west_european',
	'enc' => 'west_european',
	'eng' => 'west_european',
	'enz' => 'west_european',
	'enu' => 'west_european',
	'nld' => 'west_european',
	'nlb' => 'west_european',
	'fin' => 'west_european',
	'fra' => 'west_european',
	'frb' => 'west_european',
	'frc' => 'west_european',
	'frs' => 'west_european',
	'ell' => 'greek',
	'hun' => 'east_european',
	'isl' => 'west_european',	// was misspelt 'west_euorpean', which matched no charset entry
	'ita' => 'west_european',
	'its' => 'west_european',
	'jpn' => 'japanese',
	'kor' => 'korean',
	'nor' => 'west_european',
	'non' => 'west_european',
	'plk' => 'east_european',
	'ptg' => 'west_european',
	'ptb' => 'west_european',
	'rus' => 'cyrillic',		// same family as 'ru' and 'russian'
	'sky' => 'east_european',
	'esp' => 'west_european',
	'esm' => 'west_european',
	'esn' => 'west_european',
	'sve' => 'west_european',
	'trk' => 'turkish',
		// English language names
	'bulgarian' => 'cyrillic',	// same family as 'bg'
	'catalan' => 'west_european',
	'croatian' => 'east_european',
	'czech' => 'east_european',
	'danish' => 'west_european',
	'dutch' => 'west_european',
	'english' => 'west_european',
	'finnish' => 'west_european',
	'french' => 'west_european',
	'galician' => 'west_european',
	'german' => 'west_european',
	'hungarian' => 'east_european',
	'icelandic' => 'west_european',
	'italian' => 'west_european',
	'latvian' => 'west_european',
	'lettish' => 'west_european',
	'norwegian' => 'west_european',
	'polish' => 'east_european',
	'portuguese' => 'west_european',
	'russian' => 'cyrillic',
	'romanian' => 'east_european',
	'slovak' => 'east_european',
	'slovenian' => 'east_european',
	'spanish' => 'west_european',
	'swedish' => 'west_european',
	'svedish' => 'west_european',	// historic misspelling, kept for backward compatibility
	'turkish' => 'turkish',		// same family as 'trk'
	'ukrainian' => 'cyrillic',
);
344
345 // mapping of language (family) names to charsets on Unix
// Charset used for each language family when not running on Windows
// (get_locale_charset() reads this array when TYPO3_OS is not 'WIN').
var $lang_to_charset_unix = array(
	'west_european'	=> 'iso-8859-1',
	'estonian'	=> 'iso-8859-1',
	'east_european'	=> 'iso-8859-2',
	'baltic'	=> 'iso-8859-4',
	'cyrillic'	=> 'iso-8859-5',
	'arabic'	=> 'iso-8859-6',
	'greek'		=> 'iso-8859-7',
	'hebrew'	=> 'iso-8859-8',
	'turkish'	=> 'iso-8859-9',
	'thai'		=> 'iso-8859-11',	// = TIS-620
	'lithuanian'	=> 'iso-8859-13',
	'chinese'	=> 'gb2312',		// = euc-cn
	'japanese'	=> 'euc-jp',
	'korean'	=> 'euc-kr',
	'simpl_chinese'	=> 'gb2312',
	'trad_chinese'	=> 'big5',
	'vietnamese'	=> '',			// empty: get_locale_charset() then answers 'iso-8859-1'
);
365
366 // mapping of language (family) names to charsets on Windows
// Charset used for each language family on Windows
// (get_locale_charset() reads this array when TYPO3_OS is 'WIN').
var $lang_to_charset_windows = array(
	'east_european' => 'windows-1250',
	'cyrillic' => 'windows-1251',
	'west_european' => 'windows-1252',
	'greek' => 'windows-1253',
	'turkish' => 'windows-1254',
	'hebrew' => 'windows-1255',
	'arabic' => 'windows-1256',
	'baltic' => 'windows-1257',
	'estonian' => 'windows-1257',
	'lithuanian' => 'windows-1257',
	'vietnamese' => 'windows-1258',
	'thai' => 'cp874',
	'korean' => 'cp949',	// was 'cp950', which is Big5 / traditional Chinese (see the 'cp950' entry in $synonyms)
	'chinese' => 'gb2312',
	'japanese' => 'shift_jis',
	'simpl_chinese' => 'gb2312',
	'trad_chinese' => 'big5',
);
386
387 // mapping of locale names to charsets
// Charsets for complete (lowercase) locale strings; get_locale_charset()
// checks this table before it tries anything else.
var $locale_to_charset = array(
	'japanese.euc'	=> 'euc-jp',
	'ja_jp.ujis'	=> 'euc-jp',
	'korean.euc'	=> 'euc-kr',
	'zh_cn'		=> 'gb2312',
	'zh_hk'		=> 'big5',
	'zh_tw'		=> 'big5',
);
396
397 // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
398 // Empty values means "iso-8859-1"
// TYPO3 system language key => system charset of that language.
// An empty string stands for "iso-8859-1".
var $charSetArray = array(
	'dk'	=> '',
	'de'	=> '',
	'no'	=> '',
	'it'	=> '',
	'fr'	=> '',
	'es'	=> '',
	'nl'	=> '',
	'cz'	=> 'windows-1250',
	'pl'	=> 'iso-8859-2',
	'si'	=> 'windows-1250',
	'fi'	=> '',
	'tr'	=> 'iso-8859-9',
	'se'	=> '',
	'pt'	=> '',
	'ru'	=> 'windows-1251',
	'ro'	=> 'iso-8859-2',
	'ch'	=> 'gb2312',
	'sk'	=> 'windows-1250',
	'lt'	=> 'windows-1257',
	'is'	=> 'utf-8',
	'hr'	=> 'windows-1250',
	'hu'	=> 'iso-8859-2',
	'gl'	=> '',
	'th'	=> 'iso-8859-11',
	'gr'	=> 'iso-8859-7',
	'hk'	=> 'big5',
	'eu'	=> '',
	'bg'	=> 'windows-1251',
	'br'	=> '',
	'et'	=> 'iso-8859-4',
	'ar'	=> 'iso-8859-6',
	'he'	=> 'utf-8',
	'ua'	=> 'windows-1251',
	'jp'	=> 'shift_jis',
	'lv'	=> 'utf-8',
	'vn'	=> 'utf-8',
);
437
438 /**
439 * Normalize - changes input character set to lowercase letters.
440 *
441 * @param string Input charset
442 * @return string Normalized charset
443 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
444 */
function parse_charset($charset) {
		// Lowercase first, because the synonym table only has lowercase keys.
	$normalized = strtolower($charset);

		// A known synonym is replaced by the name it stands for; anything else passes through lowercased.
	return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
}
451
452 /**
453 * Get the charset of a locale.
454 *
455 * ln language
456 * ln_CN language / country
457 * ln_CN.cs language / country / charset
458 *
459 * @param string Locale string
460 * @return string Charset resolved for locale string
461 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
462 */
function get_locale_charset($locale) {
	$locale = strtolower($locale);

		// A charset registered for exactly this locale string wins.
	if (isset($this->locale_to_charset[$locale])) return $this->locale_to_charset[$locale];

		// Cut off a trailing "@modifier" (e.g. "de_DE.ISO-8859-15@euro").
		// Otherwise the modifier would end up inside the charset name.
	$pieces = explode('@', $locale);
	$locale = $pieces[0];
	if (isset($this->locale_to_charset[$locale])) return $this->locale_to_charset[$locale];

		// Locale contains a charset ("ln_CN.cs"): use it.
		// The parts are read by index so that a locale without "." does not touch an undefined offset.
	$pieces = explode('.', $locale);
	$locale = $pieces[0];
	if (!empty($pieces[1])) return $this->parse_charset($pieces[1]);

		// Language is whatever stands in front of the first "_"; translate it to its family if known.
	$pieces = explode('_', $locale);
	$language = $pieces[0];
	if (isset($this->lang_to_langfamily[$language])) $language = $this->lang_to_langfamily[$language];

		// Pick the platform specific table.
	if (TYPO3_OS == 'WIN') {
		$charsets = $this->lang_to_charset_windows;
	} else {
		$charsets = $this->lang_to_charset_unix;
	}

		// Unknown languages and empty entries fall back to iso-8859-1.
	return !empty($charsets[$language]) ? $charsets[$language] : 'iso-8859-1';
}
485
486
487
488
489
490
491
492
493
494 /********************************************
495 *
496 * Charset Conversion functions
497 *
498 ********************************************/
499
500 /**
501 * Convert from one charset to another charset.
502 *
503 * @param string Input string
504 * @param string From charset (the current charset of the string)
505 * @param string To charset (the output charset wanted)
506 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
507 * @return string Converted string
508 */
function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) {
		// Nothing to do if source and target charset are the same.
	if ($fromCS==$toCS) return $str;

		// PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything.
	if ($toCS=='utf-8' || !$useEntityForNoChar) {
			// Read the configured method defensively: the setting may be absent.
			// The string cast keeps a non-string value such as 0 from loosely matching a case label.
		$method = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']) ? (string)$GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'] : '';

			// Each library is only called if its function exists.
			// A configured but missing extension thus falls through to the TYPO3 conversion instead of ending in a fatal error.
		switch($method) {
			case 'mbstring':
				if (function_exists('mb_convert_encoding')) {
					$conv_str = mb_convert_encoding($str,$toCS,$fromCS);
					if (false !== $conv_str) return $conv_str;	// returns false for unsupported charsets
				}
			break;

			case 'iconv':
				if (function_exists('iconv')) {
					$conv_str = iconv($fromCS,$toCS.'//TRANSLIT',$str);
					if (false !== $conv_str) return $conv_str;
				}
			break;

			case 'recode':
				if (function_exists('recode_string')) {
					$conv_str = recode_string($fromCS.'..'.$toCS,$str);
					if (false !== $conv_str) return $conv_str;
				}
			break;
		}
			// fallback to TYPO3 conversion
	}

		// TYPO3 conversion: always via UTF-8 as the intermediate charset.
	if ($fromCS!='utf-8') $str=$this->utf8_encode($str,$fromCS);
	if ($toCS!='utf-8') $str=$this->utf8_decode($str,$toCS,$useEntityForNoChar);
	return $str;
}
537
538
539 /**
540 * Converts $str from $charset to UTF-8
541 *
542 * @param string String in local charset to convert to UTF-8
543 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
544 * @return string Output string, converted to UTF-8
545 */
function utf8_encode($str,$charset) {

		// Parse conv. table if not already done.
		// If the table cannot be initialized nothing is returned (NULL).
	if ($this->initCharset($charset)) {
		$strLen = strlen($str);
		$outStr='';

		for ($a=0;$a<$strLen;$a++) {	// Traverse each char in string.
			$chr=substr($str,$a,1);
			$ord=ord($chr);
			if (isset($this->twoByteSets[$charset])) {	// If the charset has two bytes per char
					// Combine both bytes, assuming big endian.
					// The bytes must be OR'ed: "$ord<<8 & $ord2" always evaluates to 0.
					// substr() rather than a string offset, so a missing second byte at the end of the string counts as 0.
				$a++;
				$ord2 = ord((string)substr($str,$a,1));
				$ord = ($ord << 8) | $ord2;

				if (isset($this->parsedCharsets[$charset]['local'][$ord])) {	// If the local char-number was found in parsed conv. table then we use that...
					$outStr.=$this->parsedCharsets[$charset]['local'][$ord];
				} else $outStr.=chr($this->noCharByteVal);	// ... otherwise the "no char" byte
			} elseif ($ord>127) {	// Bytes over 127 are not ASCII and must be looked up in the conversion table
				if (isset($this->eucBasedSets[$charset])) {	// EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
						// Shift-JIS is listed as EUC based too, but its bytes 160-223 (half-width katakana) are single byte chars.
						// This check has to sit inside the EUC branch; as a separate elseif it was never reached.
					if ($charset != 'shift_jis' || ($ord < 0xA0 || $ord > 0xDF)) {
						$a++;
						$ord2=ord((string)substr($str,$a,1));
						$ord = $ord*256+$ord2;
					}
				}

				if (isset($this->parsedCharsets[$charset]['local'][$ord])) {	// If the local char-number was found in parsed conv. table then we use that...
					$outStr.=$this->parsedCharsets[$charset]['local'][$ord];
				} else $outStr.=chr($this->noCharByteVal);	// ... otherwise the "no char" byte
			} else $outStr.=$chr;	// ... otherwise it's just ASCII 0-127 and one byte. Transparent
		}
		return $outStr;
	}
}
584
585 /**
586 * Converts $str from UTF-8 to $charset
587 *
588 * @param string String in UTF-8 to convert to local charset
589 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
590 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
591 * @return string Output string, converted to local charset
592 */
function utf8_decode($str,$charset,$useEntityForNoChar=0) {

		// Without a conversion table there is nothing to return (NULL).
	if (!$this->initCharset($charset)) return;

	$result = '';
	$total = strlen($str);
	$pos = 0;
	while ($pos < $total) {
		$byte = substr($str,$pos,1);
		$code = ord($byte);
		$pos++;

			// ASCII 0-127 is one byte and passes through unchanged.
		if ($code <= 127) {
			$result .= $byte;
			continue;
		}

			// A byte above 127 without bit 64 set cannot start a sequence: we are in the MIDDLE of one.
		if (!($code & 64)) {
			$result .= chr($this->noCharByteVal);
			continue;
		}

			// Collect the sequence: every further leading 1-bit of the first byte announces one more byte.
		$sequence = $byte;
		for ($probe = $code << 1, $n = 0; $n < 8 && ($probe & 128); $probe = $probe << 1, $n++) {
			$sequence .= substr($str,$pos,1);
			$pos++;
		}

		if (isset($this->parsedCharsets[$charset]['utf8'][$sequence])) {
				// Known sequence: write the local number, split in two bytes if it exceeds 255 (16bit word assumed).
			$local = $this->parsedCharsets[$charset]['utf8'][$sequence];
			if ($local > 255) {
				$result .= chr(($local >> 8) & 255).chr($local & 255);
			} else {
				$result .= chr($local);
			}
		} elseif ($useEntityForNoChar) {
				// Unknown sequence, entities requested: numeric (hex) entity.
			$result .= '&#'.$this->utf8CharToUnumber($sequence,1).';';
		} else {
				// Unknown sequence: "no char" byte.
			$result .= chr($this->noCharByteVal);
		}
	}

	return $result;
}
629
630 /**
631 * Converts all chars > 127 to numeric entities.
632 *
633 * @param string Input string
634 * @return string Output string
635 */
function utf8_to_entities($str) {
	$result = '';
	$total = strlen($str);
	$pos = 0;
	while ($pos < $total) {
		$byte = substr($str,$pos,1);
		$code = ord($byte);
		$pos++;

			// ASCII 0-127 is one byte and passes through unchanged.
		if ($code <= 127) {
			$result .= $byte;
			continue;
		}

			// A byte above 127 without bit 64 set cannot start a sequence: we are in the MIDDLE of one.
		if (!($code & 64)) {
			$result .= chr($this->noCharByteVal);
			continue;
		}

			// Collect the sequence: every further leading 1-bit of the first byte announces one more byte.
		$sequence = $byte;
		for ($probe = $code << 1, $n = 0; $n < 8 && ($probe & 128); $probe = $probe << 1, $n++) {
			$sequence .= substr($str,$pos,1);
			$pos++;
		}

			// The whole sequence becomes one numeric (hex) entity.
		$result .= '&#'.$this->utf8CharToUnumber($sequence,1).';';
	}

	return $result;
}
661
662 /**
663 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
664 *
665 * @param string Input string, UTF-8
666 * @param boolean If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
667 * @return string Output string
668 */
function entities_to_utf8($str,$alsoStdHtmlEnt=0) {
	$trans_tbl = array();
	if ($alsoStdHtmlEnt) {
			// Ask for the table in iso-8859-1 explicitly, because the values are converted from iso-8859-1 below.
			// Relying on the default encoding would double-encode them where that default is UTF-8.
			// NOTE: the encoding argument needs PHP 5.3.4 or newer.
		$trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'ISO-8859-1'));
	}

		// Split at every "&...;" and keep the captured entity name.
		// Even keys then hold plain text, odd keys hold entity names without "&" and ";".
		// (preg_split replaces the former ereg_replace()/explode() token trick; ereg_* is gone from PHP 7.)
	$parts = preg_split('/&([#[:alnum:]]*);/', $str, -1, PREG_SPLIT_DELIM_CAPTURE);
	foreach($parts as $k => $v) {
		if (!($k%2)) continue;	// plain text

		$converted = false;
		if (substr($v,0,1)=='#') {	// Dec or hex entities:
			$number = substr($v,1);
			$reg = array();
			if (preg_match('/^[xX]([0-9a-fA-F]+)$/', $number, $reg)) {
				$parts[$k] = $this->UnumberToChar(hexdec($reg[1]));
				$converted = true;
			} elseif (preg_match('/^[0-9]+$/', $number)) {
				$parts[$k] = $this->UnumberToChar(intval($number));
				$converted = true;
			}
				// Anything else (e.g. "&#zz;") is no valid numeric entity and is restored unchanged below.
		} elseif ($alsoStdHtmlEnt && isset($trans_tbl['&'.$v.';'])) {	// Other entities:
			$parts[$k] = $this->utf8_encode($trans_tbl['&'.$v.';'],'iso-8859-1');
			$converted = true;
		}

		if (!$converted) {	// No conversion: put the entity back as it was
			$parts[$k] = '&'.$v.';';
		}
	}

	return implode('',$parts);
}
694
695 /**
696 * Converts all chars in the input UTF-8 string into integer numbers returned in an array
697 *
698 * @param string Input string, UTF-8
699 * @param boolean If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
700 * @param boolean If set, then instead of integer numbers the real UTF-8 char is returned.
701 * @return array Output array with the char numbers
702 */
function utf8_to_numberarray($str,$convEntities=0,$retChar=0) {
		// Entities are turned into real UTF-8 chars first, if requested.
	if ($convEntities) $str = $this->entities_to_utf8($str,1);

	$result = array();
	$total = strlen($str);
	$pos = 0;
	while ($pos < $total) {
		$byte = substr($str,$pos,1);
		$code = ord($byte);
		$pos++;

			// ASCII 0-127: one byte, one entry.
		if ($code <= 127) {
			$result[] = $retChar ? chr($code) : $code;
			continue;
		}

			// A byte above 127 without bit 64 set cannot start a sequence: we are in the MIDDLE of one.
		if (!($code & 64)) {
			$result[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
			continue;
		}

			// Collect the sequence: every further leading 1-bit of the first byte announces one more byte.
		$sequence = $byte;
		for ($probe = $code << 1, $n = 0; $n < 8 && ($probe & 128); $probe = $probe << 1, $n++) {
			$sequence .= substr($str,$pos,1);
			$pos++;
		}

			// Either the char itself or its UNICODE number.
		$result[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
	}

	return $result;
}
733
734 /**
735 * Converts a UNICODE number to a UTF-8 multibyte character
736 * Algorithm based on a script found at http://czyborra.com/utf/
737 * Unit-tested by Kasper
738 *
739 * The binary representation of the character's integer value is thus simply spread across the bytes and the number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
740 *
741 * bytes | bits | representation
742 * 1 | 7 | 0vvvvvvv
743 * 2 | 11 | 110vvvvv 10vvvvvv
744 * 3 | 16 | 1110vvvv 10vvvvvv 10vvvvvv
745 * 4 | 21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
746 * 5 | 26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
747 * 6 | 31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
748 *
749 * @param integer UNICODE integer
750 * @return string UTF-8 multibyte character string
751 * @see utf8CharToUnumber()
752 */
function UnumberToChar($cbyte) {
		// 7 bit: plain single byte.
	if ($cbyte < 0x80) {
		return chr($cbyte);
	}

		// Choose the marker bits of the lead byte and the number of continuation bytes by value range.
	if ($cbyte < 0x800) {
		$leadMarker = 0xC0;
		$trailing = 1;
	} elseif ($cbyte < 0x10000) {
		$leadMarker = 0xE0;
		$trailing = 2;
	} elseif ($cbyte < 0x200000) {
		$leadMarker = 0xF0;
		$trailing = 3;
	} elseif ($cbyte < 0x4000000) {
		$leadMarker = 0xF8;
		$trailing = 4;
	} elseif ($cbyte < 0x80000000) {
		$leadMarker = 0xFC;
		$trailing = 5;
	} else {	// Cannot express a 32-bit character in UTF-8
		return chr($this->noCharByteVal);
	}

		// Lead byte: marker plus the bits above all continuation bytes.
	$char = chr($leadMarker | ($cbyte >> (6 * $trailing)));

		// Continuation bytes: 10vvvvvv, six value bits each, highest group first.
	for ($shift = 6 * ($trailing - 1); $shift >= 0; $shift -= 6) {
		$char .= chr(0x80 | (($cbyte >> $shift) & 0x3F));
	}

	return $char;
}
788
789 /**
790 * Converts a UTF-8 Multibyte character to a UNICODE number
791 * Unit-tested by Kasper
792 *
793 * @param string UTF-8 multibyte character string
794 * @param boolean If set, then a hex. number is returned.
795 * @return integer UNICODE integer
796 * @see UnumberToChar()
797 */
function utf8CharToUnumber($str,$hex=0) {
	$lead = ord(substr($str,0,1));	// First char

		// Default for everything that is no lead byte of a multi byte sequence: the byte value itself.
	$int = $lead;

	if (($lead & 192) == 192) {	// Both top bits set: lead byte of a multi byte sequence
		$trailingBits = '';
		$count = 0;
		$probe = $lead;
		while ($count < 8) {
				// Every further leading 1-bit of the lead byte announces one more byte.
			$probe = $probe << 1;
			if (!($probe & 128)) break;
			$count++;
				// Each following byte contributes its lowest six bits.
			$trailingBits .= substr('00000000'.decbin(ord(substr($str,$count,1))),-6);
		}
			// The lead byte contributes its lowest (6 - number of following bytes) bits.
		$leadBits = substr('00000000'.decbin($lead),-(6-$count));

		$int = bindec($leadBits.$trailingBits);
	}

	return $hex ? 'x'.dechex($int) : $int;
}
816
817
818
819
820
821
822
823
824
825 /********************************************
826 *
827 * Init functions
828 *
829 ********************************************/
830
831 /**
832 * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
833 * This function is automatically called by the conversion functions
834 *
835 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
836 *
837 * @param string The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
838 * @return integer Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed.
839 * @access private
840 */
function initCharset($charset) {
		// Only process if the charset is not yet loaded.
		// isset() first, so an unknown charset does not read an undefined index.
	if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) return 1;

		// Conversion table filename:
	$charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';

		// No conversion table, no charset:
	if (!($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile))) return false;

		// Cache file for charsets:
		// Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$cached = unserialize(t3lib_div::getUrl($cacheFile));
			// Only accept a cache that really holds a table.
			// A broken cache file falls through to a fresh parse (which rewrites it) instead of storing FALSE as the table.
		if (is_array($cached)) {
			$this->parsedCharsets[$charset] = $cached;
			return 2;
		}
	}

		// Parse conversion table into lines:
	$lines=t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1);
		// Initialize the internal variable holding the conv. table:
	$this->parsedCharsets[$charset]=array('local'=>array(),'utf8'=>array());

		// The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
		// (PCRE versions of the former ereg()/split() calls, which are gone from PHP 7.)
	$whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
	$detectedType='';

	foreach($lines as $value) {
		if (!trim($value) || substr($value,0,1)=='#') continue;	// Comment line or blanks are ignored.

			// Detect type if not done yet: (Done on first real line)
		if (!$detectedType) $detectedType = preg_match($whitespacedPattern,$value) ? 'whitespaced' : 'ms-token';

		if ($detectedType=='ms-token') {
			$fields = preg_split('/=|:/',$value,3);
			if (count($fields) < 2) continue;	// no "=" or ":" in line: nothing to map
			$hexbyte = $fields[0];
			$utf8 = $fields[1];
		} else {
			$regA=array();
			if (!preg_match($whitespacedPattern,$value,$regA)) continue;	// e.g. a local byte without Unicode value: nothing to map
			$hexbyte = $regA[1];
			$utf8 = 'U+'.$regA[2];
		}

			// Only values over 127 are stored; ASCII is passed through by the conversion functions.
		$decval = hexdec(trim($hexbyte));
		if ($decval>127) {
			$utf8decval = hexdec(substr(trim($utf8),2));	// strip "U+"
			$utf8Char = $this->UnumberToChar($utf8decval);
			$this->parsedCharsets[$charset]['local'][$decval]=$utf8Char;
			$this->parsedCharsets[$charset]['utf8'][$utf8Char]=$decval;
		}
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->parsedCharsets[$charset]));
	}

	return 2;
}
893
/**
 * This function initializes all UTF-8 character data tables.
 *
 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
 *
 * @param	string		Table the caller needs: 'case' (case folding) or 'ascii' (to-ASCII transliteration). It only selects which loaded/cached table may short-cut the call; when the data files are parsed, both tables are always built and cached.
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initUnicodeData($mode=null)	{
		// cache files
	$cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
	$cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');

		// Only process if the requested table is not yet loaded
	switch($mode)	{
		case 'case':
			if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8']))	return 1;

				// Use cached version if possible
			if ($cacheFileCase && @is_file($cacheFileCase))	{
				$this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase));
				return 2;
			}
		break;

		case 'ascii':
			if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8']))	return 1;

				// Use cached version if possible
			if ($cacheFileASCII && @is_file($cacheFileASCII))	{
				$this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII));
				return 2;
			}
		break;
	}

		// process main Unicode data file
	$unicodeDataFile = PATH_t3lib.'unidata/UnicodeData.txt';
	if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile)))	return false;

	$fh = fopen($unicodeDataFile,'rb');
	if (!$fh)	return false;

		// key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
		// note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
	$this->caseFolding['utf-8'] = array();
	$utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand
	$utf8CaseFolding['toUpper'] = array();
	$utf8CaseFolding['toLower'] = array();
	$utf8CaseFolding['toTitle'] = array();

	$decomposition = array();	// array of temp. decompositions
	$mark = array();		// array of chars that are marks (eg. composing accents)
	$number = array();		// array of chars that are numbers (eg. digits)

	while (!feof($fh))	{
		$line = fgets($fh,4096);
		if ($line === false)	break;		// read error or EOF without data
		if (trim($line) == '')	continue;	// blank lines carry no record

			// Semicolon separated record; pad it so short lines do not raise undefined-offset notices.
			// Used fields: 0 code point, 1 name, 2 category, 5 decomposition, 8 numeric value, 12 upper, 13 lower, 14 title
		$fields = array_pad(explode(';', rtrim($line)), 15, '');
		$char = $fields[0];
		$name = $fields[1];
		$cat = $fields[2];
		$decomp = $fields[5];
		$num = $fields[8];
		$upper = $fields[12];
		$lower = $fields[13];
		$title = $fields[14];

		$ord = hexdec($char);
		if ($ord > 0xFFFF)	break;	// only process the BMP

		$utf8_char = $this->UnumberToChar($ord);

		if ($upper)	$utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
		if ($lower)	$utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
			// store "title" only when different from "upper" (only a few)
		if ($title && $title !== $upper)	$utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));

			// first letter of the general category
		$catClass = isset($cat[0]) ? $cat[0] : '';
		switch ($catClass)	{
			case 'M':	// mark (accent, umlaut, ...)
				$mark["U+$char"] = 1;
			break;

			case 'N':	// numeric value
				if ($ord > 0x80 && $num != '')	$number["U+$char"] = $num;
			break;
		}

			// accented Latin letters without "official" decomposition
		$match = array();
		if (!$decomp && preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/',$name,$match))	{
			$c = ord($match[2]);
			if ($match[1] == 'SMALL')	$c += 32;

			$decomposition["U+$char"] = array(dechex($c));
			continue;
		}

			// optional <tag> followed by the code point sequence
		$match = array();
		if (preg_match('/(<.*>)? *(.+)/',$decomp,$match))	{
			switch($match[1])	{
				case '<circle>':	// add parenthesis as circle replacement, eg (1)
					$match[2] = '0028 '.$match[2].' 0029';
				break;

				case '<square>':	// add square brackets as square replacement, eg [1]
					$match[2] = '005B '.$match[2].' 005D';
				break;

				case '<compat>':	// ignore multi char decompositions that start with a space
					if (substr($match[2],0,5) == '0020 ')	continue 2;
				break;

					// ignore Arabic and vertical layout presentation decomposition
				case '<initial>':
				case '<medial>':
				case '<final>':
				case '<isolated>':
				case '<vertical>':
					continue 2;
			}
			$decomposition["U+$char"] = explode(' ',$match[2]);
		}
	}
	fclose($fh);

		// process additional Unicode data for casing (allow folded characters to expand into a sequence)
	$specialCasingFile = PATH_t3lib.'unidata/SpecialCasing.txt';
	if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile))	{
		$fh = fopen($specialCasingFile,'rb');
		if ($fh)	{
			while (!feof($fh))	{
				$line = fgets($fh,4096);
				if ($line === false)	break;
				if ($line[0] == '#' || trim($line) == '')	continue;	// comment or blank line

				$parts = array_pad(t3lib_div::trimExplode(';', $line), 5, '');
				$char = $parts[0];
				$lower = $parts[1];
				$title = $parts[2];
				$upper = $parts[3];
				$cond = $parts[4];

					// only unconditional mappings: 5th field is empty or already the trailing comment
				if ($cond === '' || $cond[0] == '#')	{
					$utf8_char = $this->UnumberToChar(hexdec($char));
					$targets = array('toLower' => $lower, 'toTitle' => $title, 'toUpper' => $upper);
					foreach ($targets as $direction => $sequence)	{
						if ($char === $sequence)	continue;				// maps to itself
						if ($direction == 'toTitle' && $title === $upper)	continue;	// title is stored only when it differs from upper

						$folded = '';
						foreach (explode(' ',$sequence) as $hex)	{
							$folded .= $this->UnumberToChar(hexdec($hex));
						}
						$utf8CaseFolding[$direction][$utf8_char] = $folded;
					}
				}
			}
			fclose($fh);
		}
	}

		// process custom decompositions
	$customTranslitFile = PATH_t3lib.'unidata/Translit.txt';
	if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile))	{
		$fh = fopen($customTranslitFile,'rb');
		if ($fh)	{
			while (!feof($fh))	{
				$line = fgets($fh,4096);
				if ($line === false)	break;
				if ($line[0] == '#' || trim($line) == '')	continue;	// comment or blank line

				$parts = array_pad(t3lib_div::trimExplode(';', $line), 2, '');
				$decomposition['U+'.$parts[0]] = explode(' ', $parts[1]);
			}
			fclose($fh);
		}
	}

		// decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
	foreach($decomposition as $from => $to)	{
		$code_decomp = array();

		while ($code_value = array_shift($to))	{
			if (isset($decomposition["U+$code_value"]))	{	// do recursive decomposition
				foreach(array_reverse($decomposition["U+$code_value"]) as $cv)	{
					array_unshift($to, $cv);
				}
			} elseif (!isset($mark["U+$code_value"]))	{	// remove mark
				array_push($code_decomp, $code_value);
			}
		}
		if (count($code_decomp))	{
			$decomposition[$from] = $code_decomp;
		} else {
			unset($decomposition[$from]);
		}
	}

		// create ascii only mapping
	$this->toASCII['utf-8'] = array();
	$ascii =& $this->toASCII['utf-8'];

	foreach($decomposition as $from => $to)	{
		$asciiString = '';
		while ($code_value = array_shift($to))	{
			$ord = hexdec($code_value);
			if ($ord > 127)	continue 2;	// skip decompositions containing non-ASCII chars
			$asciiString .= chr($ord);
		}
			// keys look like "U+00C0": strip the prefix before converting the hex number
		$ascii[$this->UnumberToChar(hexdec(substr($from,2)))] = $asciiString;
	}

		// add numeric decompositions
	foreach($number as $from => $to)	{
		$utf8_char = $this->UnumberToChar(hexdec(substr($from,2)));
		if (!isset($ascii[$utf8_char]))	{
			$ascii[$utf8_char] = $to;
		}
	}

	if ($cacheFileCase)	{
		t3lib_div::writeFileToTypo3tempDir($cacheFileCase,serialize($utf8CaseFolding));
	}

	if ($cacheFileASCII)	{
		t3lib_div::writeFileToTypo3tempDir($cacheFileASCII,serialize($ascii));
	}

	return 3;
}
1116
/**
 * Builds the case folding table of a charset other than UTF-8 from the UTF-8 folding table.
 * Called automatically by the case folding functions.
 *
 * @param	string		Charset for which to initialize case folding.
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initCaseFolding($charset)	{
		// Table already in memory?
	if (is_array($this->caseFolding[$charset]))	return 1;

		// Table available from the cache file?
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile))	{
		$this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

		// Both the charset <-> UTF-8 table and the UTF-8 folding table are required
	if (!$this->initCharset($charset))	return false;
	if (!$this->initUnicodeData('case'))	return false;

	$nochar = chr($this->noCharByteVal);
	$directions = array('toUpper', 'toLower', 'toTitle');
	foreach ($this->parsedCharsets[$charset]['local'] as $utf8)	{
			// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);

		foreach ($directions as $direction)	{
				// fold in UTF-8, then convert the result back; keep it only if the charset can express it
			$cc = $this->utf8_decode($this->caseFolding['utf-8'][$direction][$utf8], $charset);
			if ($cc != '' && $cc != $nochar)	$this->caseFolding[$charset][$direction][$c] = $cc;
		}
	}

		// add the ASCII case table
	foreach (range('a','z') as $letter)	{
		$this->caseFolding[$charset]['toUpper'][$letter] = chr(ord($letter)-32);
	}
	foreach (range('A','Z') as $letter)	{
		$this->caseFolding[$charset]['toLower'][$letter] = chr(ord($letter)+32);
	}

	if ($cacheFile)	{
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->caseFolding[$charset]));
	}

	return 3;
}
1178
/**
 * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
 * This function is automatically called by the ASCII transliteration functions.
 *
 * @param	string		Charset for which to initialize conversion.
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initToASCII($charset)	{
		// Only process if the transliteration table is not yet loaded:
	if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset]))	return 1;

		// Use cached version if possible
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile))	{
		$this->toASCII[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

		// init UTF-8 conversion for this charset
	if (!$this->initCharset($charset))	{
		return false;
	}

		// UTF-8/ASCII transliteration is used as the base conversion table
	if (!$this->initUnicodeData('ascii'))	{
		return false;
	}

		// Start from an empty table so that an array (not an unset value) is stored and cached
		// even when none of the charset's characters has a transliteration.
	$this->toASCII[$charset] = array();
	foreach ($this->parsedCharsets[$charset]['local'] as $utf8)	{
		if (isset($this->toASCII['utf-8'][$utf8]))	{
				// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
			$c = $this->utf8_decode($utf8, $charset);
			$this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
		}
	}

	if ($cacheFile)	{
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->toASCII[$charset]));
	}

	return 3;
}
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240 /********************************************
1241 *
1242 * String operation functions
1243 *
1244 ********************************************/
1245
/**
 * Returns a part of a string.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Start position (character position)
 * @param	integer		Length (in characters); NULL means "up to the end of the string"
 * @return	string		The substring
 * @see substr(), mb_substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function substr($charset,$string,$start,$len=null)	{
	if ($len===0)	return '';

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
			// cannot omit $len, when specifying charset
		if ($len==null)	{
			$enc = mb_internal_encoding();	// save internal encoding
			mb_internal_encoding($charset);	// work in the charset of the string, not a fixed one
			$str = mb_substr($string,$start);
			mb_internal_encoding($enc);	// restore internal encoding

			return $str;
		}
		return mb_substr($string,$start,$len,$charset);
	} elseif ($charset == 'utf-8')	{
		return $this->utf8_substr($string,$start,$len);
	} elseif (!empty($this->eucBasedSets[$charset]))	{
		return $this->euc_substr($string,$start,$charset,$len);
	} elseif (!empty($this->twoByteSets[$charset]))	{
			// an omitted length must not be scaled: NULL*2 would be 0 and yield an empty result
		return $len === null ? substr($string,$start*2) : substr($string,$start*2,$len*2);
	} elseif (!empty($this->fourByteSets[$charset]))	{
		return $len === null ? substr($string,$start*4) : substr($string,$start*4,$len*4);
	}

		// treat everything else as single-byte encoding
	return $len === null ? substr($string,$start) : substr($string,$start,$len);
}
1285
/**
 * Returns the length of a string measured in characters of the given charset.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * @param	string		The character set
 * @param	string		Character string
 * @return	integer		The number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strlen($charset,$string)	{
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
		return mb_strlen($string,$charset);
	}
	if ($charset == 'utf-8')	{
		return $this->utf8_strlen($string);
	}
	if ($this->eucBasedSets[$charset])	{
		return $this->euc_strlen($string,$charset);
	}

		// fixed-width charsets: derive the character count from the byte count
	$bytes = strlen($string);
	if ($this->twoByteSets[$charset])	{
		return $bytes/2;
	}
	if ($this->fourByteSets[$charset])	{
		return $bytes/4;
	}

		// anything else is handled as a single-byte encoding
	return $bytes;
}
1311
/**
 * Truncates a string and pre-/appends a string.
 * Unit tested by Kasper
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Length (in characters). Positive: keep that many characters from the start and append the crop signifier. Negative: keep that many characters from the end and prepend it. Zero: return the string unchanged.
 * @param	string		Crop signifier
 * @return	string		The shortened string (the unchanged string if it is not longer than the requested length)
 * @see substr(), mb_strimwidth()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function crop($charset,$string,$len,$crop='')	{
	if (intval($len) == 0)	return $string;

		// translate the character length into a byte position
	if ($charset == 'utf-8')	{
		$i = $this->utf8_char2byte_pos($string,$len);
	} elseif (!empty($this->eucBasedSets[$charset]))	{
		$i = $this->euc_char2byte_pos($string,$len,$charset);
	} else {
		if ($len > 0)	{
			$i = $len;
		} else {
			$i = strlen($string)+$len;
			if ($i<=0)	$i = false;
		}
	}

	if ($i === false)	return $string;	// $len outside actual string length

		// Explicit bounds instead of isset($string{...}): the curly offset syntax no longer exists
		// in current PHP and negative string offsets count as "set" since PHP 7.1.
	$size = strlen($string);
	if ($len > 0)	{
		if ($i < $size)	{
			return substr($string,0,$i).$crop;
		}
	} else {
		if ($i >= 1 && $i <= $size)	{
			return $crop.substr($string,$i);
		}
	}

	return $string;
}
1365
/**
 * Cuts a string short at a given byte length.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		The byte length
 * @return	string		The shortened string; empty if the byte length is zero or negative
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strtrunc($charset,$string,$len)	{
	if ($len <= 0)	return '';

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
		return mb_strcut($string,0,$len,$charset);
	} elseif ($charset == 'utf-8')	{
		return $this->utf8_strtrunc($string,$len);
	} elseif (!empty($this->eucBasedSets[$charset]))	{
			// argument order of euc_strtrunc() is (string, byte length, charset)
		return $this->euc_strtrunc($string,$len,$charset);
	} elseif (!empty($this->twoByteSets[$charset]))	{
		if ($len % 2)	$len--;		// don't cut at odd positions
	} elseif (!empty($this->fourByteSets[$charset]))	{
		$len -= $len % 4;		// realign to position dividable by four
	}

		// treat everything else as single-byte encoding
	return substr($string,0,$len);
}
1394
/**
 * Translates all characters of a string into their respective case values.
 * Unlike strtolower() and strtoupper() this method is locale independent.
 * Note that the string length may change!
 * eg. lower case German "ß" (sharp S) becomes upper case "SS"
 * Unit-tested by Kasper
 * Real case folding is language dependent, this method ignores this fact.
 *
 * @param	string		Character set of string
 * @param	string		Input string to convert case for
 * @param	string		Case keyword: "toLower" means lowercase conversion, use "toUpper" for uppercase. (With mbstring every other value is treated as uppercase; otherwise the keyword selects the folding table.)
 * @return	string		The converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper()
 */
function conv_case($charset,$string,$case)	{
		// the mbstring branch is only taken on PHP 4.3.0 or later
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring' &&
		version_compare(phpversion(),'4.3.0','>='))	{
		if ($case == 'toLower')	{
			return mb_strtolower($string,$charset);
		}
		return mb_strtoupper($string,$charset);
	}

	if ($charset == 'utf-8')	{
		return $this->utf8_char_mapping($string,'case',$case);
	}
	if (isset($this->eucBasedSets[$charset]))	{
		return $this->euc_char_mapping($string,$charset,'case',$case);
	}

		// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string,$charset,'case',$case);
}
1429
/**
 * Transliterates special chars (like ÆØÅæøå, umlauts etc) to ASCII equivalents (often two letters, like æ => ae).
 *
 * @param	string		Character set of string
 * @param	string		Input string to convert
 * @return	string		The converted string
 */
function specCharsToASCII($charset,$string)	{
	if ($charset != 'utf-8')	{
		if (!isset($this->eucBasedSets[$charset]))	{
				// neither UTF-8 nor EUC based: handle as single-byte encoding
			return $this->sb_char_mapping($string,$charset,'ascii');
		}
		return $this->euc_char_mapping($string,$charset,'ascii');
	}

	return $this->utf8_char_mapping($string,'ascii');
}
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461 /********************************************
1462 *
1463 * Internal string operation functions
1464 *
1465 ********************************************/
1466
/**
 * Maps all characters of a string in a single byte charset.
 *
 * @param	string		the string
 * @param	string		the charset
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string (the unchanged input if the mode is unknown or the table cannot be initialized)
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function sb_char_mapping($str,$charset,$mode,$opt='')	{
	switch($mode)	{
		case 'case':
			if (!$this->initCaseFolding($charset))	return $str;	// do nothing
				// no reference here: binding one to a missing key would create an empty entry in the table
			$map = isset($this->caseFolding[$charset][$opt]) ? $this->caseFolding[$charset][$opt] : array();
		break;

		case 'ascii':
			if (!$this->initToASCII($charset))	return $str;	// do nothing
			$map = isset($this->toASCII[$charset]) ? $this->toASCII[$charset] : array();
		break;

		default:
			return $str;
	}

	$out = '';
	$size = strlen($str);
	for ($i=0; $i<$size; $i++)	{
		$c = $str[$i];
			// bytes without a table entry are copied unchanged
		$out .= isset($map[$c]) ? $map[$c] : $c;
	}

	return $out;
}
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515 /********************************************
1516 *
1517 * Internal UTF-8 string operation functions
1518 *
1519 ********************************************/
1520
/**
 * Returns a part of a UTF-8 string.
 * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
 *
 * @param	string		UTF-8 string
 * @param	integer		Start position (character position)
 * @param	integer		Length (in characters)
 * @return	string		The substring; FALSE if a positive $start lies outside the string
 * @see substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_substr($str,$start,$len=null)	{
		// a length of 0 or "0" always yields an empty string; NULL (no length) is handled further down
	if ($len !== null && !strcmp($len,'0'))	return '';

	$byte_start = $this->utf8_char2byte_pos($str,$start);
	if ($byte_start === false)	{
		if ($start > 0)	{
			return false;	// $start outside string length
		}
			// zero or negative start reaching beyond the beginning: begin at the first byte
		$byte_start = 0;
	}

	$str = substr($str,$byte_start);

	if ($len!=null)	{
		$byte_end = $this->utf8_char2byte_pos($str,$len);
		if ($byte_end === false)	{	// $len outside actual string length
			return $len<0 ? '' : $str;	// When length is less than zero and exceeds, then we return blank string.
		}
		return substr($str,0,$byte_end);
	}

	return $str;
}
1555
/**
 * Counts the number of characters of a string in UTF-8.
 * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
 *
 * @param	string		UTF-8 multibyte character string
 * @return	integer		The number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strlen($str)	{
	$n = 0;
	$size = strlen($str);
	for ($i=0; $i<$size; $i++)	{
			// Count single bytes (0xxxxxxx) and multi-byte starting bytes (11xxxxxx),
			// i.e. everything except continuation bytes (10xxxxxx).
		if ((ord($str[$i]) & 0xC0) != 0x80)	$n++;
	}
	return $n;
}
1576
/**
 * Truncates a string in UTF-8 short at a given byte length.
 * A multibyte character that does not fit completely into the byte length is dropped.
 *
 * @param	string		UTF-8 multibyte character string
 * @param	integer		the byte length
 * @return	string		the shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strtrunc($str,$len)	{
	$len = intval($len);
	if ($len <= 0)	return '';
	if ($len >= strlen($str))	return $str;	// nothing to cut off

	$i = $len-1;
	if (ord($str[$i]) & 0x80)	{	// part of a multibyte sequence
			// walk back over continuation bytes (10xxxxxx) to the first byte of the sequence
		while ($i > 0 && (ord($str[$i]) & 0xC0) == 0x80)	$i--;

		$lead = ord($str[$i]);
		if (($lead & 0xC0) != 0xC0)	return '';	// sanity check: no starting byte found (malformed input)

			// the number of leading 1-bits of the starting byte is the length of the sequence
		for ($bc=0; $lead & 0x80; $lead = ($lead << 1) & 0xFF)	$bc++;

		if ($i+$bc > $len)	return substr($str,0,$i);	// sequence would be cut: drop it completely
			// fallthru: multibyte char fits into length
	}
	return substr($str,0,$len);
}
1597
/**
 * Find position of first occurrence of a string, both arguments are in UTF-8.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 string to search for
 * @param	integer		Position to start the search (character position)
 * @return	integer		The character position; FALSE if the needle is not found or the offset is beyond the string
 * @see strpos()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strpos($haystack,$needle,$offset=0)	{
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
			// third argument is the offset, the encoding comes fourth
		return mb_strpos($haystack,$needle,$offset,'utf-8');
	}

	$byte_offset = $this->utf8_char2byte_pos($haystack,$offset);
	if ($byte_offset === false)	return false;	// offset beyond string length

	$byte_pos = strpos($haystack,$needle,$byte_offset);
	if ($byte_pos === false)	return false;	// needle not found

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
1621
/**
 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 character to search for (single character)
 * @return	integer		The character position; FALSE if the needle is not found
 * @see strrpos()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strrpos($haystack,$needle)	{
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
			// The third parameter of mb_strrpos() changed from "encoding" to "offset" between PHP versions.
			// Switching the internal encoding for the call works with either signature.
		$enc = mb_internal_encoding();	// save internal encoding
		mb_internal_encoding('utf-8');
		$pos = mb_strrpos($haystack,$needle);
		mb_internal_encoding($enc);	// restore internal encoding

		return $pos;
	}

	$byte_pos = strrpos($haystack,$needle);
	if ($byte_pos === false)	return false;	// needle not found

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
1641
/**
 * Translates a character position into an 'absolute' byte position.
 * Unit tested by Kasper.
 *
 * @param	string		UTF-8 string
 * @param	integer		Character position (negative values start from the end)
 * @return	integer		Byte position; FALSE if the position lies outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char2byte_pos($str,$pos)	{
	$size = strlen($str);
	$n = 0;			// number of characters found
	$p = abs($pos);		// number of characters wanted

	if ($pos >= 0)	{
		$i = 0;
		$d = 1;
	} else {
		$i = $size-1;
		$d = -1;
	}

		// Explicit bounds are used instead of isset() on a string offset: since PHP 7.1 a negative
		// offset addresses bytes from the end of the string and would wrap the backward scan around.
	for ( ; $i>=0 && $i<$size && $n<$p; $i+=$d)	{
			// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		if ((ord($str[$i]) & 0xC0) != 0x80)	$n++;
	}
	if ($i<0 || $i>=$size)	return false;	// offset beyond string length

	if ($pos >= 0)	{
			// skip trailing multi-byte data bytes (10xxxxxx)
		while ($i<$size && (ord($str[$i]) & 0xC0) == 0x80)	$i++;
	} else {
			// correct offset: the backward scan stopped one byte before the character
		$i++;
	}

	return $i;
}
1682
/**
 * Translates an 'absolute' byte position into a character position.
 * Unit tested by Kasper.
 *
 * @param	string		UTF-8 string
 * @param	integer		byte position
 * @return	integer		character position; FALSE if the byte position lies outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_byte2char_pos($str,$pos)	{
		// check the requested position itself (also covers the empty string)
	if ($pos < 0 || $pos >= strlen($str))	return false;	// offset beyond string length

	$n = 0;	// number of characters
		// every character start between byte 1 and $pos stands for one character before $pos
	for ($i=$pos; $i>0; $i--)	{
			// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		if ((ord($str[$i]) & 0xC0) != 0x80)	$n++;
	}

	return $n;
}
1705
/**
 * Maps all characters of an UTF-8 string.
 *
 * @param	string		UTF-8 string
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string (the unchanged input if the mode is unknown or the tables cannot be initialized)
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char_mapping($str,$mode,$opt='')	{
	if (!$this->initUnicodeData($mode))	return $str;	// do nothing

	switch($mode)	{
		case 'case':
				// no reference here: binding one to a missing key would create an empty entry in the table
			$map = isset($this->caseFolding['utf-8'][$opt]) ? $this->caseFolding['utf-8'][$opt] : array();
		break;

		case 'ascii':
			$map = isset($this->toASCII['utf-8']) ? $this->toASCII['utf-8'] : array();
		break;

		default:
			return $str;
	}

	$out = '';
	$size = strlen($str);
	for ($i=0; $i<$size; $i++)	{
		$c = ord($str[$i]);
		if (($c & 0xC0) == 0xC0)	{	// multi-byte starting byte (11xxxxxx)
			for ($bc=0; $c & 0x80; $c = $c << 1)	{ $bc++; }	// calculate number of bytes
			$mbc = substr($str,$i,$bc);
			$i += $bc-1;
		} else {
				// single-byte (0xxxxxxx); a stray continuation byte (10xxxxxx) is passed through as it is
			$mbc = $str[$i];
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769 /********************************************
1770 *
1771 * Internal EUC string operation functions
1772 *
1773 * Extended Unix Code:
1774 * ASCII compatible 7bit single bytes chars
1775 * 8bit two byte chars
1776 *
1777 * Shift-JIS is treated as a special case.
1778 *
1779 ********************************************/
1780
/**
 * Cuts a string in the EUC charset family short at a given byte length.
 * A double-byte character that does not fit completely into the byte length is dropped.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		the byte length
 * @param	string		the charset
 * @return	string		the shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strtrunc($str,$len,$charset)	{
	if ($len <= 0)	return '';
	$size = strlen($str);
	if ($len >= $size)	return $str;	// string not longer than supplied length

	$sjis = ($charset == 'shift_jis');
	for ($i=0; $i<$size && $i<$len; $i++)	{
		$c = ord($str[$i]);
		if ($sjis)	{
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))	$i++;	// advance a double-byte char
		} else {
			if ($c >= 0x80)	$i++;	// advance a double-byte char
		}
	}

		// The scan stops on a character boundary. If that boundary lies beyond $len, the byte at
		// position $len-1 is the first byte of a double-byte char and must be dropped as well.
	if ($i > $len)	{
		return substr($str,0,$len-1);
	}
	return substr($str,0,$len);
}
1809
/**
 * Returns a part of a string in the EUC charset family.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		start position (character position)
 * @param	string		the charset
 * @param	integer		length (in characters)
 * @return	string		the substring; FALSE if $start lies outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_substr($str,$start,$charset,$len=null)	{
		// a length of 0 or "0" yields an empty string (the loose NULL test below would mistake 0 for "no length")
	if ($len !== null && !strcmp($len,'0'))	return '';

	$byte_start = $this->euc_char2byte_pos($str,$start,$charset);
	if ($byte_start === false)	return false;	// $start outside string length

	$str = substr($str,$byte_start);

	if ($len!=null)	{
		$byte_end = $this->euc_char2byte_pos($str,$len,$charset);
		if ($byte_end === false)	{	// $len outside actual string length
			return $str;
		}
		return substr($str,0,$byte_end);
	}

	return $str;
}
1835
/**
 * Counts the number of characters of a string in the EUC charset family.
 *
 * @param	string		EUC multibyte character string
 * @param	string		the charset
 * @return	integer		the number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strlen($str,$charset)	{
	$sjis = ($charset == 'shift_jis');
	$size = strlen($str);
	$n = 0;
	for ($i=0; $i<$size; $i++)	{
		$c = ord($str[$i]);
		if ($sjis)	{
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))	$i++;	// advance a double-byte char
		} else {
			if ($c >= 0x80)	$i++;	// advance a double-byte char
		}

		$n++;
	}

	return $n;
}
1862
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		character position (negative values start from the end)
 * @param	string		the charset
 * @return	integer		byte position; FALSE if the position lies outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char2byte_pos($str,$pos,$charset)	{
	$sjis = ($charset == 'shift_jis');
	$size = strlen($str);
	$n = 0;			// number of characters seen
	$p = abs($pos);		// number of characters wanted

	if ($pos >= 0)	{
		$i = 0;
		$d = 1;
	} else {
		$i = $size-1;
		$d = -1;
	}

		// Explicit bounds are used instead of isset() on a string offset: since PHP 7.1 a negative
		// offset addresses bytes from the end of the string and would wrap the backward scan around.
	for ( ; $i>=0 && $i<$size && $n<$p; $i+=$d)	{
		$c = ord($str[$i]);
		if ($sjis)	{
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))	$i+=$d;	// advance a double-byte char
		} else {
			if ($c >= 0x80)	$i+=$d;	// advance a double-byte char
		}

		$n++;
	}
	if ($i<0 || $i>=$size)	return false;	// offset beyond string length

	if ($pos < 0)	$i++;	// correct offset

	return $i;
}
1902
/**
 * Maps all characters of a string in the EUC charset family.
 *
 * @param	string		EUC multibyte character string
 * @param	string		the charset
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string (the unchanged input if the mode is unknown or the table cannot be initialized)
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char_mapping($str,$charset,$mode,$opt='')	{
	switch($mode)	{
		case 'case':
			if (!$this->initCaseFolding($charset))	return $str;	// do nothing
				// no reference here: binding one to a missing key would create an empty entry in the table
			$map = isset($this->caseFolding[$charset][$opt]) ? $this->caseFolding[$charset][$opt] : array();
		break;

		case 'ascii':
			if (!$this->initToASCII($charset))	return $str;	// do nothing
			$map = isset($this->toASCII[$charset]) ? $this->toASCII[$charset] : array();
		break;

		default:
			return $str;
	}

	$sjis = ($charset == 'shift_jis');
	$out = '';
	$size = strlen($str);
	for ($i=0; $i<$size; $i++)	{
		$mbc = $str[$i];
		$c = ord($mbc);

			// first byte of a double-byte char? (Shift-JIS uses different lead byte ranges)
		if ($sjis)	{
			$isDoubleByte = (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0));
		} else {
			$isDoubleByte = ($c >= 0x80);
		}
		if ($isDoubleByte)	{
			$mbc = substr($str,$i,2);
			$i++;
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
1957
1958 }
1959
// Include the extension class file registered for this script in the XCLASS configuration, if any.
if (defined('TYPO3_MODE'))	{
	if ($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php'])	{
		include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
	}
}
1963 ?>