41defbc502114edc27c57de9936f099e678fb8ce
[Packages/TYPO3.CMS.git] / t3lib / class.t3lib_cs.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2003-2004 Kasper Skaarhoj (kasper@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the Typo3 project. The Typo3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 *
17 * This script is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * This copyright notice MUST APPEAR in all copies of the script!
23 ***************************************************************/
24 /**
25 * Class for conversion between charsets.
26 *
27 * $Id$
28 *
29 * @author Kasper Skaarhoj <kasper@typo3.com>
30 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
31 */
32 /**
33 * [CLASS/FUNCTION INDEX of SCRIPT]
34 *
35 *
36 *
37 * 128: class t3lib_cs
38 * 442: function parse_charset($charset)
39 * 460: function get_locale_charset($locale)
40 * 492: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
41 * 529: function utf8_encode($str,$charset)
42 * 576: function utf8_decode($str,$charset,$useEntityForNoChar=0)
43 * 619: function utf8_to_entities($str)
44 * 652: function entities_to_utf8($str,$alsoStdHtmlEnt=0)
45 * 686: function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
46 * 736: function UnumberToChar($cbyte)
47 * 781: function utf8CharToUnumber($str,$hex=0)
48 *
49 * SECTION: Init functions
50 * 824: function initCharset($charset)
51 * 885: function initCaseFoldingUTF8()
52 * 973: function initCaseFolding($charset)
53 *
54 * SECTION: String operation functions
55 * 1058: function substr($charset,$string,$start,$len=null)
56 * 1096: function strlen($charset,$string)
57 * 1124: function crop($charset,$string,$len,$crop='')
58 * 1165: function strtrunc($charset,$string,$len)
59 * 1197: function conv_case($charset,$string,$case)
60 *
61 * SECTION: Internal UTF-8 string operation functions
62 * 1264: function utf8_substr($str,$start,$len=null)
63 * 1297: function utf8_strlen($str)
64 * 1318: function utf8_strtrunc($str,$len)
65 * 1340: function utf8_strpos($haystack,$needle,$offset=0)
66 * 1363: function utf8_strrpos($haystack,$needle)
67 * 1383: function utf8_char2byte_pos($str,$pos)
68 * 1424: function utf8_byte2char_pos($str,$pos)
69 * 1448: function utf8_conv_case($str,$case)
70 *
71 * SECTION: Internal EUC string operation functions
72 * 1514: function euc_strtrunc($str,$len,$charset)
73 * 1543: function euc_substr($str,$start,$charset,$len=null)
74 * 1568: function euc_strlen($str,$charset)
75 * 1595: function euc_char2byte_pos($str,$pos,$charset)
76 * 1636: function euc_conv_case($str,$case,$charset)
77 *
78 * TOTAL FUNCTIONS: 31
79 * (This index is automatically created/updated by the extension "extdeveval")
80 *
81 */
82
83
84
85
86
87
88
89
90 /**
91 * Notes on UTF-8
92 *
93 * Functions working on UTF-8 strings:
94 *
95 * - strchr/strstr
96 * - strrchr
97 * - substr_count
98 * - implode/explode/join
99 *
100 * Functions nearly working on UTF-8 strings:
101 *
102 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
103 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
104 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
105 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
106 *
107 * Functions NOT working on UTF-8 strings:
108 *
109 * - str*cmp
110 * - stristr
111 * - stripos
112 * - substr
113 * - strrev
114 * - ereg/eregi
115 * - split/spliti
116 * - preg_*
117 * - ...
118 *
119 */
120 /**
121 * Class for conversion between charsets
122 *
123 * @author Kasper Skaarhoj <kasper@typo3.com>
124 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
125 * @package TYPO3
126 * @subpackage t3lib
127 */
128 class t3lib_cs {
129 var $noCharByteVal=63; // ASCII Value for chars with no equivalent.
130
131 // This is the array where parsed conversion tables are stored (cached)
132 var $parsedCharsets=array();
133
134 // An array where case folding data will be stored (cached)
135 var $caseFolding=array();
136
137 // An array where charset-to-ASCII mappings are stored (cached)
138 var $toASCII=array();
139
140 // This tells the converter which charsets have two bytes per char:
141 var $twoByteSets=array(
142 'ucs-2'=>1, // 2-byte Unicode
143 );
144
145 // This tells the converter which charsets have four bytes per char:
146 var $fourByteSets=array(
147 'ucs-4'=>1, // 4-byte Unicode
148 'utf-32'=>1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
149 );
150
151 // This tells the converter which charsets use a scheme like the Extended Unix Code:
152 var $eucBasedSets=array(
153 'gb2312'=>1, // Chinese, simplified.
154 'big5'=>1, // Chinese, traditional.
155 'shift_jis'=>1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
156 );
157
158 // see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
159 // http://czyborra.com/charsets/iso8859.html
160 var $synonyms=array(
161 'us' => 'ascii',
162 'us-ascii'=> 'ascii',
163 'cp819' => 'iso-8859-1',
164 'ibm819' => 'iso-8859-1',
165 'iso-ir-100' => 'iso-8859-1',
166 'iso-ir-109' => 'iso-8859-2',
167 'iso-ir-148' => 'iso-8859-9',
168 'iso-ir-199' => 'iso-8859-14',
169 'iso-ir-203' => 'iso-8859-15',
170 'csisolatin1' => 'iso-8859-1',
171 'csisolatin2' => 'iso-8859-2',
172 'csisolatin3' => 'iso-8859-3',
173 'csisolatin5' => 'iso-8859-9',
174 'csisolatin8' => 'iso-8859-14',
175 'csisolatin9' => 'iso-8859-15',
176 'csisolatingreek' => 'iso-8859-7',
177 'iso-celtic' => 'iso-8859-14',
178 'latin1' => 'iso-8859-1',
179 'latin2' => 'iso-8859-2',
180 'latin3' => 'iso-8859-3',
181 'latin5' => 'iso-8859-9',
182 'latin6' => 'iso-8859-10',
183 'latin8' => 'iso-8859-14',
184 'latin9' => 'iso-8859-15',
185 'l1' => 'iso-8859-1',
186 'l2' => 'iso-8859-2',
187 'l3' => 'iso-8859-3',
188 'l5' => 'iso-8859-9',
189 'l6' => 'iso-8859-10',
190 'l8' => 'iso-8859-14',
191 'l9' => 'iso-8859-15',
192 'cyrillic' => 'iso-8859-5',
193 'arabic' => 'iso-8859-6',
194 'tis-620' => 'iso-8859-11',
195 'win874' => 'windows-874',
196 'win1250' => 'windows-1250',
197 'win1251' => 'windows-1251',
198 'win1252' => 'windows-1252',
199 'win1253' => 'windows-1253',
200 'win1254' => 'windows-1254',
201 'win1255' => 'windows-1255',
202 'win1256' => 'windows-1256',
203 'win1257' => 'windows-1257',
204 'win1258' => 'windows-1258',
205 'cp1250' => 'windows-1250',
206 'cp1251' => 'windows-1251',
207 'cp1252' => 'windows-1252',
208 'ms-ee' => 'windows-1250',
209 'ms-ansi' => 'windows-1252',
210 'ms-greek' => 'windows-1253',
211 'ms-turk' => 'windows-1254',
212 'winbaltrim' => 'windows-1257',
213 'koi-8ru' => 'koi-8r',
214 'koi8r' => 'koi-8r',
215 'cp878' => 'koi-8r',
216 'mac' => 'macroman',
217 'macintosh' => 'macroman',
218 'euc-cn' => 'gb2312',
219 'x-euc-cn' => 'gb2312',
220 'euccn' => 'gb2312',
221 'cp936' => 'gb2312',
222 'big-5' => 'big5',
223 'cp950' => 'big5',
224 'eucjp' => 'euc-jp',
225 'sjis' => 'shift_jis',
226 'shift-jis' => 'shift_jis',
227 'cp932' => 'shift_jis',
228 'utf7' => 'utf-7',
229 'utf8' => 'utf-8',
230 'utf16' => 'utf-16',
231 'utf32' => 'utf-32',
232 'utf8' => 'utf-8',
233 'ucs2' => 'ucs-2',
234 'ucs4' => 'ucs-4',
235 );
236
237 // mapping of iso-639:2 language codes to language (family) names
238 var $lang_to_langfamily=array(
239 // iso-639:2 language codes, see:
240 // http://www.w3.org/WAI/ER/IG/ert/iso639.htm
241 // http://www.unicode.org/onlinedat/languages.html
242 'ar' => 'arabic',
243 'bg' => 'cyrillic',
244 'cs' => 'east_european',
245 'da' => 'west_european',
246 'de' => 'west_european',
247 'es' => 'west_european',
248 'et' => 'estonian',
249 'eu' => 'west_european',
250 'fi' => 'west_european',
251 'fr' => 'west_european',
252 'gr' => 'greek',
253 'hr' => 'east_european',
254 'hu' => 'east_european',
255 'iw' => 'hebrew',
256 'is' => 'west_european',
257 'it' => 'west_european',
258 'ja' => 'japanese',
259 'kl' => 'west_european',
260 'ko' => 'korean',
261 'lt' => 'lithuanian',
262 'lv' => 'west_european', // Latvian/Lettish
263 'nl' => 'west_european',
264 'no' => 'west_european',
265 'pl' => 'east_european',
266 'pt' => 'west_european',
267 'ro' => 'east_european',
268 'ru' => 'cyrillic',
269 'sk' => 'east_european',
270 'sl' => 'east_european',
271 'sv' => 'west_european',
272 'th' => 'thai',
273 'uk' => 'cyrillic',
274 'vi' => 'vietnamese',
275 'zh' => 'chinese',
276 // MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
277 'chs' => 'simpl_chinese',
278 'cht' => 'trad_chinese',
279 'csy' => 'east_european',
280 'dan' => 'west_european',
281 'deu' => 'west_european',
282 'dea' => 'west_european',
283 'des' => 'west_european',
284 'ena' => 'west_european',
285 'enc' => 'west_european',
286 'eng' => 'west_european',
287 'enz' => 'west_european',
288 'enu' => 'west_european',
289 'nld' => 'west_european',
290 'nlb' => 'west_european',
291 'fin' => 'west_european',
292 'fra' => 'west_european',
293 'frb' => 'west_european',
294 'frc' => 'west_european',
295 'frs' => 'west_european',
296 'ell' => 'greek',
297 'hun' => 'east_european',
298 'isl' => 'west_euorpean',
299 'ita' => 'west_european',
300 'its' => 'west_european',
301 'jpn' => 'japanese',
302 'kor' => 'korean',
303 'nor' => 'west_european',
304 'non' => 'west_european',
305 'plk' => 'east_european',
306 'ptg' => 'west_european',
307 'ptb' => 'west_european',
308 'rus' => 'east_european',
309 'sky' => 'east_european',
310 'esp' => 'west_european',
311 'esm' => 'west_european',
312 'esn' => 'west_european',
313 'sve' => 'west_european',
314 'trk' => 'turkish',
315 // English language names
316 'bulgarian' => 'east_european',
317 'catalan' => 'west_european',
318 'croatian' => 'east_european',
319 'czech' => 'east_european',
320 'danish' => 'west_european',
321 'dutch' => 'west_european',
322 'english' => 'west_european',
323 'finnish' => 'west_european',
324 'french' => 'west_european',
325 'galician' => 'west_european',
326 'german' => 'west_european',
327 'hungarian' => 'east_european',
328 'icelandic' => 'west_european',
329 'italian' => 'west_european',
330 'latvian' => 'west_european',
331 'lettish' => 'west_european',
332 'norwegian' => 'west_european',
333 'polish' => 'east_european',
334 'portuguese' => 'west_european',
335 'russian' => 'cyrillic',
336 'romanian' => 'east_european',
337 'slovak' => 'east_european',
338 'slovenian' => 'east_european',
339 'spanish' => 'west_european',
340 'svedish' => 'west_european',
341 'turkish' => 'east_european',
342 'ukrainian' => 'cyrillic',
343 );
344
345 // mapping of language (family) names to charsets on Unix
346 var $lang_to_charset_unix=array(
347 'west_european' => 'iso-8859-1',
348 'estonian' => 'iso-8859-1',
349 'east_european' => 'iso-8859-2',
350 'baltic' => 'iso-8859-4',
351 'cyrillic' => 'iso-8859-5',
352 'arabic' => 'iso-8859-6',
353 'greek' => 'iso-8859-7',
354 'hebrew' => 'iso-8859-8',
355 'turkish' => 'iso-8859-9',
356 'thai' => 'iso-8859-11', // = TIS-620
357 'lithuanian' => 'iso-8859-13',
358 'chinese' => 'gb2312', // = euc-cn
359 'japanese' => 'euc-jp',
360 'korean' => 'euc-kr',
361 'simpl_chinese' => 'gb2312',
362 'trad_chinese' => 'big5',
363 'vietnamese' => '',
364 );
365
366 // mapping of language (family) names to charsets on Windows
367 var $lang_to_charset_windows=array(
368 'east_european' => 'windows-1250',
369 'cyrillic' => 'windows-1251',
370 'west_european' => 'windows-1252',
371 'greek' => 'windows-1253',
372 'turkish' => 'windows-1254',
373 'hebrew' => 'windows-1255',
374 'arabic' => 'windows-1256',
375 'baltic' => 'windows-1257',
376 'estonian' => 'windows-1257',
377 'lithuanian' => 'windows-1257',
378 'vietnamese' => 'windows-1258',
379 'thai' => 'cp874',
380 'korean' => 'cp950',
381 'chinese' => 'gb2312',
382 'japanese' => 'shift_jis',
383 'simpl_chinese' => 'gb2312',
384 'trad_chinese' => 'big5',
385 );
386
387 // mapping of locale names to charsets
388 var $locale_to_charset=array(
389 'japanese.euc' => 'euc-jp',
390 'ja_jp.ujis' => 'euc-jp',
391 'korean.euc' => 'euc-kr',
392 'zh_cn' => 'gb2312',
393 'zh_hk' => 'big5',
394 'zh_tw' => 'big5',
395 );
396
397 // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
398 // Empty values means "iso-8859-1"
399 var $charSetArray = array(
400 'dk' => '',
401 'de' => '',
402 'no' => '',
403 'it' => '',
404 'fr' => '',
405 'es' => '',
406 'nl' => '',
407 'cz' => 'windows-1250',
408 'pl' => 'iso-8859-2',
409 'si' => 'windows-1250',
410 'fi' => '',
411 'tr' => 'iso-8859-9',
412 'se' => '',
413 'pt' => '',
414 'ru' => 'windows-1251',
415 'ro' => 'iso-8859-2',
416 'ch' => 'gb2312',
417 'sk' => 'windows-1250',
418 'lt' => 'windows-1257',
419 'is' => 'utf-8',
420 'hr' => 'windows-1250',
421 'hu' => 'iso-8859-2',
422 'gl' => '',
423 'th' => 'iso-8859-11',
424 'gr' => 'iso-8859-7',
425 'hk' => 'big5',
426 'eu' => '',
427 'bg' => 'windows-1251',
428 'br' => '',
429 'et' => 'iso-8859-4',
430 'ar' => 'iso-8859-6',
431 'he' => 'utf-8',
432 'ua' => 'windows-1251',
433 'jp' => 'shift_jis',
434 'lv' => 'utf-8',
435 'vn' => 'utf-8',
436 'ca' => 'iso-8859-15',
437 'ba' => 'iso-8859-2',
438 'kr' => 'euc-kr',
439 );
440
/**
 * Normalizes a charset name: the name is lowercased and, if it is a known
 * alias (a key of $this->synonyms), replaced by its canonical name.
 *
 * @param	string		Input charset, any letter case
 * @return	string		Normalized charset
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function parse_charset($charset) {
	$normalized = strtolower($charset);

	return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
}
454
/**
 * Get the charset of a locale.
 *
 * Accepted locale forms:
 *
 * ln				language
 * ln_CN			language / country
 * ln_CN.cs			language / country / charset
 * ln_CN.cs@mod		as above, followed by a modifier (the modifier is ignored)
 *
 * Resolution order: exact match in $this->locale_to_charset, then an explicit
 * charset part, then the language mapped via $this->lang_to_langfamily and the
 * OS specific family-to-charset table. 'iso-8859-1' is the final fallback.
 *
 * @param	string		Locale string
 * @return	string		Charset resolved for locale string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function get_locale_charset($locale) {
	$locale = strtolower($locale);

		// exact locale specific charset?
	if (isset($this->locale_to_charset[$locale])) return $this->locale_to_charset[$locale];

		// strip a trailing "@modifier" so it does not end up inside the charset name
	$modifierPos = strpos($locale,'@');
	if ($modifierPos !== false) {
		$locale = (string)substr($locale,0,$modifierPos);
		if (isset($this->locale_to_charset[$locale])) return $this->locale_to_charset[$locale];
	}

		// locale contains charset: use it
	$localeParts = explode('.',$locale);
	if (isset($localeParts[1]) && $localeParts[1]) return $this->parse_charset($localeParts[1]);

		// get language (the part in front of the first underscore)
	$languageParts = explode('_',$localeParts[0]);
	$language = $languageParts[0];
	if (isset($this->lang_to_langfamily[$language])) $language = $this->lang_to_langfamily[$language];

		// pick the table for the current operating system
	$familyToCharset = (TYPO3_OS == 'WIN') ? $this->lang_to_charset_windows : $this->lang_to_charset_unix;

		// unknown families and families with an empty charset fall back to iso-8859-1
	return !empty($familyToCharset[$language]) ? $familyToCharset[$language] : 'iso-8859-1';
}
488
489
490
491
492
493
494
495
496
497 /********************************************
498 *
499 * Charset Conversion functions
500 *
501 ********************************************/
502
/**
 * Convert a string from one charset to another charset.
 *
 * If the method configured in $TYPO3_CONF_VARS['SYS']['t3lib_cs_convMethod']
 * ('mbstring', 'iconv' or 'recode') can be used and succeeds, its result is
 * returned. Otherwise the string is converted with the TYPO3 conversion
 * tables, using UTF-8 as the intermediate charset.
 *
 * @param	string		Input string
 * @param	string		From charset (the current charset of the string)
 * @param	string		To charset (the output charset wanted)
 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return	string		Converted string
 * @see convArray()
 */
function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) {
	if ($fromCS==$toCS) return $str;

		// The PHP libraries cannot fall back to SGML entities, so they are only
		// tried when no entity fallback is wanted or the target is UTF-8 (which can express everything)
	if ($toCS=='utf-8' || !$useEntityForNoChar) {
		$method = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'];
		$converted = false;

		if ($method == 'mbstring') {
			$converted = mb_convert_encoding($str,$toCS,$fromCS);	// returns false for unsupported charsets
		} elseif ($method == 'iconv') {
			$converted = iconv($fromCS,$toCS.'//TRANSLIT',$str);
		} elseif ($method == 'recode') {
			$converted = recode_string($fromCS.'..'.$toCS,$str);
		}

		if (false !== $converted) return $converted;
			// ... otherwise fall back to the TYPO3 conversion below
	}

	$utf8Str = ($fromCS!='utf-8') ? $this->utf8_encode($str,$fromCS) : $str;

	return ($toCS!='utf-8') ? $this->utf8_decode($utf8Str,$toCS,$useEntityForNoChar) : $utf8Str;
}
541
/**
 * Convert all string elements of an array from one charset to another charset.
 * Nested arrays are processed recursively.
 * NOTICE: Array is passed by reference and modified in place!
 *
 * @param	array		Input array, possibly multidimensional
 * @param	string		From charset (the current charset of the string)
 * @param	string		To charset (the output charset wanted)
 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return	void
 * @see conv()
 */
function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0) {
	foreach ($array as $key => $value) {
		if (!is_array($value)) {
			$array[$key] = $this->conv($value,$fromCS,$toCS,$useEntityForNoChar);
			continue;
		}
			// Recurse on the element itself (not on the loop copy) so the change is kept
		$this->convArray($array[$key],$fromCS,$toCS,$useEntityForNoChar);
	}
}
562
/**
 * Converts $str from $charset to UTF-8
 *
 * Bytes 0-127 are passed through unchanged (except for two-byte charsets,
 * where every character is read as a big endian 16 bit word). All other
 * characters are looked up in the parsed conversion table of the charset;
 * characters without a table entry become chr($this->noCharByteVal).
 *
 * @param	string		String in local charset to convert to UTF-8
 * @param	string		Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return	string		Output string, converted to UTF-8. Nothing (NULL) is returned if the charset could not be initialized.
 */
function utf8_encode($str,$charset) {
		// Parse conv. table if not already...
	if (!$this->initCharset($charset)) return;

	$strLen = strlen($str);
	$outStr = '';
	$isTwoByteSet = isset($this->twoByteSets[$charset]);
	$isEucBasedSet = isset($this->eucBasedSets[$charset]);

	for ($a=0;$a<$strLen;$a++) {	// Traverse each char in string.
		$chr = substr($str,$a,1);
		$ord = ord($chr);

		if ($isTwoByteSet) {	// If the charset has two bytes per char
			$a++;
			$ord2 = ord(substr($str,$a,1));	// a missing second byte counts as 0
			$ord = ($ord << 8) | $ord2;	// assume big endian; the bytes must be OR'ed (AND always gave 0)

			if (isset($this->parsedCharsets[$charset]['local'][$ord])) {
				$outStr .= $this->parsedCharsets[$charset]['local'][$ord];
			} else $outStr .= chr($this->noCharByteVal);	// No char exists
		} elseif ($ord>127) {	// Not ASCII: one or two bytes of the local charset
				// EUC based sets use two bytes above 127: read both and make $ord a 16 bit int.
				// Shift-JIS is the exception: bytes 160-223 are single byte characters
				// and must not swallow the byte that follows.
			if ($isEucBasedSet && ($charset != 'shift_jis' || $ord<160 || $ord>223)) {
				$a++;
				$ord2 = ord(substr($str,$a,1));
				$ord = $ord*256+$ord2;
			}

			if (isset($this->parsedCharsets[$charset]['local'][$ord])) {
				$outStr .= $this->parsedCharsets[$charset]['local'][$ord];
			} else $outStr .= chr($this->noCharByteVal);	// No char exists
		} else $outStr .= $chr;	// ... otherwise it's just ASCII 0-127 and one byte. Transparent
	}

	return $outStr;
}
608
/**
 * Converts $str from UTF-8 to $charset
 *
 * @param	string		String in UTF-8 to convert to local charset
 * @param	string		Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return	string		Output string, converted to local charset. Nothing (NULL) is returned if the charset could not be initialized.
 */
function utf8_decode($str,$charset,$useEntityForNoChar=0) {
		// Parse conv. table if not already...
	if (!$this->initCharset($charset)) return;

	$byteCount = strlen($str);
	$result = '';

	for ($pos=0;$pos<$byteCount;$pos++) {
		$byte = substr($str,$pos,1);
		$code = ord($byte);

			// ASCII 0-127 is a single byte and passed through
		if ($code<=127) {
			$result .= $byte;
			continue;
		}

			// A lead byte has its 7th bit set; if not we are in the MIDDLE of a sequence
		if (!($code & 64)) {
			$result .= chr($this->noCharByteVal);
			continue;
		}

			// Collect the sequence: every further high bit of the lead byte announces one more byte
		$sequence = $byte;
		for ($n=0;$n<8;$n++) {
			$code = $code << 1;
			if (!($code & 128)) break;
			$pos++;
			$sequence .= substr($str,$pos,1);
		}

		if (isset($this->parsedCharsets[$charset]['utf8'][$sequence])) {
			$localNumber = $this->parsedCharsets[$charset]['utf8'][$sequence];
				// Numbers above 255 are written as two bytes (16 bit word assumed)
			$result .= ($localNumber>255)
				? chr(($localNumber >> 8) & 255).chr($localNumber & 255)
				: chr($localNumber);
		} elseif ($useEntityForNoChar) {
			$result .= '&#'.$this->utf8CharToUnumber($sequence,1).';';	// Create num entity
		} else {
			$result .= chr($this->noCharByteVal);	// No char exists
		}
	}

	return $result;
}
653
/**
 * Converts all multibyte characters (bytes > 127) of a UTF-8 string to
 * hexadecimal numeric entities (&#x...;). ASCII bytes are left untouched.
 *
 * @param	string		Input string
 * @return	string		Output string
 */
function utf8_to_entities($str) {
	$byteCount = strlen($str);
	$result = '';

	for ($pos=0;$pos<$byteCount;$pos++) {
		$byte = substr($str,$pos,1);
		$code = ord($byte);

			// ASCII 0-127 is a single byte and passed through
		if ($code<=127) {
			$result .= $byte;
			continue;
		}

			// A lead byte has its 7th bit set; if not we are in the MIDDLE of a sequence
		if (!($code & 64)) {
			$result .= chr($this->noCharByteVal);
			continue;
		}

			// Collect the sequence: every further high bit of the lead byte announces one more byte
		$sequence = $byte;
		for ($n=0;$n<8;$n++) {
			$code = $code << 1;
			if (!($code & 128)) break;
			$pos++;
			$sequence .= substr($str,$pos,1);
		}

		$result .= '&#'.$this->utf8CharToUnumber($sequence,1).';';
	}

	return $result;
}
685
/**
 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
 *
 * Malformed numeric entities (like "&#;" or "&#zz;") and unknown named
 * entities are left in the string unchanged.
 *
 * @param	string		Input string, UTF-8
 * @param	boolean		If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
 * @return	string		Output string
 */
function entities_to_utf8($str,$alsoStdHtmlEnt=0) {
	$trans_tbl = array();
	if ($alsoStdHtmlEnt) {
		$trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES));
	}

		// Split on "&...;" and keep the entity name: even keys hold plain text, odd keys the name between "&" and ";"
	$parts = preg_split('/&([#[:alnum:]]*);/',$str,-1,PREG_SPLIT_DELIM_CAPTURE);
	foreach ($parts as $k => $v) {
		if (!($k%2)) continue;

		$match = array();
		if (substr($v,0,1)=='#') {	// Dec or hex entities:
			if (preg_match('/^#[xX]([0-9a-fA-F]+)$/',$v,$match)) {
				$parts[$k] = $this->UnumberToChar(hexdec($match[1]));
			} elseif (preg_match('/^#([0-9]+)$/',$v,$match)) {
				$parts[$k] = $this->UnumberToChar($match[1]+0);
			} else {	// Malformed number: no conversion
				$parts[$k] = '&'.$v.';';
			}
		} elseif ($alsoStdHtmlEnt && isset($trans_tbl['&'.$v.';'])) {	// Other entities:
			$char = $trans_tbl['&'.$v.';'];
				// Older PHP versions return the table in iso-8859-1 (one byte per char), newer ones in UTF-8.
				// A value longer than one byte is therefore taken to be UTF-8 already and must not be encoded again.
			$parts[$k] = (strlen($char)>1) ? $char : $this->utf8_encode($char,'iso-8859-1');
		} else {	// No conversion:
			$parts[$k] = '&'.$v.';';
		}
	}

	return implode('',$parts);
}
718
/**
 * Splits a UTF-8 string into its characters and returns them in an array,
 * either as integer UNICODE numbers or as the UTF-8 characters themselves.
 *
 * @param	string		Input string, UTF-8
 * @param	boolean		If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
 * @param	boolean		If set, then instead of integer numbers the real UTF-8 char is returned.
 * @return	array		Output array with the char numbers
 */
function utf8_to_numberarray($str,$convEntities=0,$retChar=0) {
		// If entities must be registered as well...:
	if ($convEntities) $str = $this->entities_to_utf8($str,1);

	$byteCount = strlen($str);
	$collected = array();

	for ($pos=0;$pos<$byteCount;$pos++) {
		$byte = substr($str,$pos,1);
		$code = ord($byte);

			// ASCII 0-127 is a single byte
		if ($code<=127) {
			$collected[] = $retChar ? chr($code) : $code;
			continue;
		}

			// A lead byte has its 7th bit set; if not we are in the MIDDLE of a sequence
		if (!($code & 64)) {
			$collected[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
			continue;
		}

			// Collect the sequence: every further high bit of the lead byte announces one more byte
		$sequence = $byte;
		for ($n=0;$n<8;$n++) {
			$code = $code << 1;
			if (!($code & 128)) break;
			$pos++;
			$sequence .= substr($str,$pos,1);
		}

		$collected[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
	}

	return $collected;
}
757
/**
 * Converts a UNICODE number to a UTF-8 multibyte character
 * Algorithm based on script found at From: http://czyborra.com/utf/
 *
 * The bits of the number are spread across the bytes; the number of high
 * bits set in the lead byte announces the number of bytes in the sequence:
 *
 *  bytes | bits | representation
 *      1 |    7 | 0vvvvvvv
 *      2 |   11 | 110vvvvv 10vvvvvv
 *      3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
 *      4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * @param	integer		UNICODE integer
 * @return	string		UTF-8 multibyte character string
 * @see utf8CharToUnumber()
 */
function UnumberToChar($cbyte) {
		// 7 bit: plain single byte
	if ($cbyte < 0x80) return chr($cbyte);

		// Select the lead byte marker and the number of trailing (10vvvvvv) bytes
	if ($cbyte < 0x800) {
		$leadMarker = 0xC0;
		$trailingBytes = 1;
	} elseif ($cbyte < 0x10000) {
		$leadMarker = 0xE0;
		$trailingBytes = 2;
	} elseif ($cbyte < 0x200000) {
		$leadMarker = 0xF0;
		$trailingBytes = 3;
	} elseif ($cbyte < 0x4000000) {
		$leadMarker = 0xF8;
		$trailingBytes = 4;
	} elseif ($cbyte < 0x80000000) {
		$leadMarker = 0xFC;
		$trailingBytes = 5;
	} else {	// Cannot express a 32-bit character in UTF-8
		return chr($this->noCharByteVal);
	}

		// Lead byte carries the topmost bits, each trailing byte the next 6 bits
	$char = chr($leadMarker | ($cbyte >> (6*$trailingBytes)));
	for ($shift=6*($trailingBytes-1);$shift>=0;$shift-=6) {
		$char .= chr(0x80 | (($cbyte >> $shift) & 0x3F));
	}

	return $char;
}
812
/**
 * Converts a UTF-8 Multibyte character to a UNICODE number
 *
 * Only the first character of $str is evaluated. If the first byte is not a
 * multibyte lead byte its plain byte value is used as the number.
 *
 * @param	string		UTF-8 multibyte character string
 * @param	boolean		If set, then a hex. number is returned.
 * @return	integer		UNICODE integer (or, with $hex set, a string "x" followed by the hexadecimal number)
 * @see UnumberToChar()
 */
function utf8CharToUnumber($str,$hex=0) {
	$leadByte = ord(substr($str,0,1));

	if (($leadByte & 192) != 192) {
			// Not the start of a multibyte sequence
		$number = $leadByte;
	} else {
			// Each further high bit of the lead byte announces one trailing byte,
			// which contributes its lower 6 bits
		$shifted = $leadByte;
		$payloadBits = '';
		$trailingBytes = 0;
		while ($trailingBytes<8) {
			$shifted = $shifted << 1;
			if (!($shifted & 128)) break;
			$trailingBytes++;
			$payloadBits .= substr('00000000'.decbin(ord(substr($str,$trailingBytes,1))),-6);
		}

			// The lead byte contributes the bits that are left after its length marker
		$leadBits = substr('00000000'.decbin($leadByte),-(6-$trailingBytes));
		$number = bindec($leadBits.$payloadBits);
	}

	return $hex ? 'x'.dechex($number) : $number;
}
840
841
842
843
844
845
846
847
848
849 /********************************************
850 *
851 * Init functions
852 *
853 ********************************************/
854
/**
 * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
 * This function is automatically called by the conversion functions
 *
 * The parsed table is stored in $this->parsedCharsets[$charset] with two sub arrays:
 * 'local' (local char number => UTF-8 char) and 'utf8' (UTF-8 char => local char number).
 * Only local char numbers above 127 are stored. The parsed table is also written
 * to a cache file in typo3temp/cs/ which is used on later requests. A cache file
 * that does not unserialize to an array is ignored and the table is parsed again.
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param	string		The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return	integer		Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed.
 * @access private
 */
function initCharset($charset) {
		// Only process if the charset is not yet loaded:
	if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) return 1;

		// Conversion table filename:
	$charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';

		// Give up if the conversion table is not found:
	if (!($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile))) return false;

		// Cache file for charsets:
		// Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$cachedTable = unserialize(t3lib_div::getUrl($cacheFile));
		if (is_array($cachedTable)) {
			$this->parsedCharsets[$charset] = $cachedTable;
			return 2;
		}
			// Unusable cache file: fall through, parse the table again and rewrite the cache
	}

		// The "whitespaced" type is on the syntax "0x0A	0x000A	#LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
	$whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';

		// Parse conversion table into lines:
	$lines = t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1);
		// Initialize the internal variable holding the conv. table:
	$this->parsedCharsets[$charset] = array('local'=>array(),'utf8'=>array());

	$detectedType = '';
	foreach ($lines as $value) {
			// Comment line or blanks are ignored.
		if (!trim($value) || substr($value,0,1)=='#') continue;

			// Detect type if not done yet: (Done on first real line)
		if (!$detectedType) $detectedType = preg_match($whitespacedPattern,$value) ? 'whitespaced' : 'ms-token';

		if ($detectedType=='ms-token') {
			$tokens = preg_split('/=|:/',$value,3);
			if (count($tokens)<2) continue;	// no "=" or ":" on the line, nothing to map
			$hexbyte = $tokens[0];
			$utf8 = $tokens[1];
		} else {
			$regA = array();
			if (!preg_match($whitespacedPattern,$value,$regA)) continue;	// e.g. a byte listed without a Unicode number
			$hexbyte = $regA[1];
			$utf8 = 'U+'.$regA[2];
		}

		$decval = hexdec(trim($hexbyte));
		if ($decval>127) {	// 0-127 is passed through by the conversion functions and needs no table entry
			$utf8decval = hexdec(substr(trim($utf8),2));	// strip the leading "U+"
			$utf8Char = $this->UnumberToChar($utf8decval);
			$this->parsedCharsets[$charset]['local'][$decval] = $utf8Char;
			$this->parsedCharsets[$charset]['utf8'][$utf8Char] = $decval;
		}
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->parsedCharsets[$charset]));
	}

	return 2;
}
917
918 /**
919 * This function initializes all UTF-8 character data tables.
920 *
921 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
922 *
923 * @param string Mode ("case", "ascii", ...)
924 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
925 * @access private
926 */
function initUnicodeData($mode=null) {
		// Cache files for the two tables built by this function:
	$cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
	$cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');

		// Only process if the requested table is not yet loaded (any other $mode value always rebuilds both tables):
	switch($mode) {
		case 'case':
			if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8'])) return 1;

				// Use cached version if possible
			if ($cacheFileCase && @is_file($cacheFileCase)) {
				$this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase));
				return 2;
			}
		break;

		case 'ascii':
			if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8'])) return 1;

				// Use cached version if possible
			if ($cacheFileASCII && @is_file($cacheFileASCII)) {
				$this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII));
				return 2;
			}
		break;
	}

		// Process main Unicode data file
	$unicodeDataFile = PATH_t3lib.'unidata/UnicodeData.txt';
	if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) return false;

	$fh = fopen($unicodeDataFile,'rb');
	if (!$fh) return false;

		// key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
		// note: the UTF-8 characters are used here and not the Unicode numbers to avoid a conversion roundtrip when mapping strings
	$this->caseFolding['utf-8'] = array();
	$utf8CaseFolding =& $this->caseFolding['utf-8'];	// a shorthand
	$utf8CaseFolding['toUpper'] = array();
	$utf8CaseFolding['toLower'] = array();
	$utf8CaseFolding['toTitle'] = array();

	$decomposition = array();	// temporary decompositions, key "U+XXXX", value = array of hex code values
	$mark = array();		// chars that are marks (eg. composing accents)
	$number = array();		// chars that are numbers (eg. digits), value = numeric value field

	while (!feof($fh)) {
		$line = fgets($fh,4096);
		if ($line === false) break;	// read error or end of file
		$line = rtrim($line);
		if ($line === '') continue;	// blank lines carry no data

			// Fields used: 0=code point, 1=name, 2=general category, 5=decomposition, 8=numeric value, 12/13/14=upper/lower/title mapping.
			// Padding keeps short/malformed lines from producing undefined offsets.
		$fields = array_pad(explode(';', $line), 15, '');
		$char = $fields[0];
		$name = $fields[1];
		$cat = $fields[2];
		$decomp = $fields[5];
		$num = $fields[8];
		$upper = $fields[12];
		$lower = $fields[13];
		$title = $fields[14];

		$ord = hexdec($char);
		if ($ord > 0xFFFF) break;	// only process the BMP

		$utf8_char = $this->UnumberToChar($ord);

		if ($upper) $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
		if ($lower) $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
			// store "title" only when different from "upper" (only a few)
		if ($title && $title !== $upper) $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));

		switch (substr($cat,0,1)) {
			case 'M':	// mark (accent, umlaut, ...)
				$mark["U+$char"] = 1;
			break;

			case 'N':	// numeric value
				if ($ord > 0x80 && $num != '') $number["U+$char"] = $num;
			break;
		}

			// accented Latin letters without "official" decomposition: fall back to the base letter taken from the name
		$match = array();
		if (!$decomp && preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/',$name,$match)) {
			$c = ord($match[2]);
			if ($match[1] == 'SMALL') $c += 32;

			$decomposition["U+$char"] = array(dechex($c));
			continue;
		}

			// optional "<tag>" followed by the space separated code values
		$match = array();
		if (preg_match('/(<.*>)? *(.+)/',$decomp,$match)) {
			switch($match[1]) {
				case '<circle>':	// add parenthesis as circle replacement, eg (1)
					$match[2] = '0028 '.$match[2].' 0029';
				break;

				case '<square>':	// add square brackets as square replacement, eg [1]
					$match[2] = '005B '.$match[2].' 005D';
				break;

				case '<compat>':	// ignore multi char decompositions that start with a space
					if (substr($match[2],0,5) == '0020 ') continue 2;	// "2": leave the switch AND skip to the next line
				break;

					// ignore Arabic and vertical layout presentation decomposition
				case '<initial>':
				case '<medial>':
				case '<final>':
				case '<isolated>':
				case '<vertical>':
					continue 2;
			}
			$decomposition["U+$char"] = explode(' ',$match[2]);
		}
	}
	fclose($fh);

		// Process additional Unicode data for casing (allow folded characters to expand into a sequence)
	$specialCasingFile = PATH_t3lib.'unidata/SpecialCasing.txt';
	if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
		$fh = fopen($specialCasingFile,'rb');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh,4096);
				if ($line === false) break;
				if (substr($line,0,1) == '#' || trim($line) == '') continue;	// comment or blank line

				$parts = array_pad(t3lib_div::trimExplode(';', $line), 5, '');
				$char = $parts[0];
				$lower = $parts[1];
				$title = $parts[2];
				$upper = $parts[3];
				$cond = $parts[4];

					// Entries with a condition are skipped; a field starting with "#" is only the trailing comment.
				if ($cond != '' && substr($cond,0,1) != '#') continue;

				$utf8_char = $this->UnumberToChar(hexdec($char));
				$targets = array('toLower' => $lower, 'toTitle' => $title, 'toUpper' => $upper);
				foreach ($targets as $table => $sequence) {
					if ($sequence === $char) continue;	// maps to itself
					if ($table == 'toTitle' && $title === $upper) continue;	// title is only stored when it differs from upper

					$mapped = '';
					foreach (explode(' ',$sequence) as $hex) {
						$mapped .= $this->UnumberToChar(hexdec($hex));
					}
					$utf8CaseFolding[$table][$utf8_char] = $mapped;
				}
			}
			fclose($fh);
		}
	}

		// Process custom decompositions
	$customTranslitFile = PATH_t3lib.'unidata/Translit.txt';
	if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
		$fh = fopen($customTranslitFile,'rb');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh,4096);
				if ($line === false) break;
				if (substr($line,0,1) == '#' || trim($line) == '') continue;	// comment or blank line

				$parts = array_pad(t3lib_div::trimExplode(';', $line), 2, '');
				$decomposition['U+'.$parts[0]] = explode(' ', $parts[1]);
			}
			fclose($fh);
		}
	}

		// Decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
	foreach($decomposition as $from => $to) {
		$code_decomp = array();

		while ($code_value = array_shift($to)) {
			if (isset($decomposition["U+$code_value"])) {	// do recursive decomposition: push the nested values back in front of the queue
				foreach(array_reverse($decomposition["U+$code_value"]) as $cv) {
					array_unshift($to, $cv);
				}
			} elseif (!isset($mark["U+$code_value"])) {	// keep everything that is not a mark
				array_push($code_decomp, $code_value);
			}
		}
		if (count($code_decomp)) {
			$decomposition[$from] = $code_decomp;
		} else {
			unset($decomposition[$from]);
		}
	}

		// Create ascii only mapping
	$this->toASCII['utf-8'] = array();
	$ascii =& $this->toASCII['utf-8'];

	foreach($decomposition as $from => $to) {
		$code_decomp = array();
		while ($code_value = array_shift($to)) {
			$ord = hexdec($code_value);
			if ($ord > 127) continue 2;	// skip decompositions containing non-ASCII chars
			array_push($code_decomp,chr($ord));
		}
			// keys look like "U+XXXX": strip the prefix before converting the hex number
		$ascii[$this->UnumberToChar(hexdec(substr($from,2)))] = join('',$code_decomp);
	}

		// Add numeric decompositions (never overriding a decomposition found above)
	foreach($number as $from => $to) {
		$utf8_char = $this->UnumberToChar(hexdec(substr($from,2)));
		if (!isset($ascii[$utf8_char])) {
			$ascii[$utf8_char] = $to;
		}
	}

	if ($cacheFileCase) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileCase,serialize($utf8CaseFolding));
	}

	if ($cacheFileASCII) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileASCII,serialize($ascii));
	}

	return 3;
}
1140
1141 /**
1142 * This function initializes the folding table for a charset other than UTF-8.
1143 * This function is automatically called by the case folding functions.
1144 *
1145 * @param string Charset for which to initialize case folding.
1146 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
1147 * @access private
1148 */
function initCaseFolding($charset) {
		// Only process if the case table is not yet loaded:
	if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset])) return 1;

		// Use cached version if possible
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

		// Init UTF-8 conversion for this charset
	if (!$this->initCharset($charset)) {
		return false;
	}

		// UTF-8 case folding is used as the base conversion table
	if (!$this->initUnicodeData('case')) {
		return false;
	}

	$nochar = chr($this->noCharByteVal);
	$utf8Folding =& $this->caseFolding['utf-8'];
	$folding = array('toUpper' => array(), 'toLower' => array(), 'toTitle' => array());
	$tables = array('toUpper', 'toLower', 'toTitle');

	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
			// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);

		foreach ($tables as $table) {
				// characters without a mapping in the UTF-8 table are left out
			if (!isset($utf8Folding[$table][$utf8])) continue;

			$cc = $this->utf8_decode($utf8Folding[$table][$utf8], $charset);
				// only keep mappings whose result is not empty and not the "no character" byte
			if ($cc != '' && $cc != $nochar) $folding[$table][$c] = $cc;
		}
	}

		// Add the ASCII case table
	for ($i=ord('a'); $i<=ord('z'); $i++) {
		$folding['toUpper'][chr($i)] = chr($i-32);
	}
	for ($i=ord('A'); $i<=ord('Z'); $i++) {
		$folding['toLower'][chr($i)] = chr($i+32);
	}

	$this->caseFolding[$charset] = $folding;

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->caseFolding[$charset]));
	}

	return 3;
}
1202
1203 /**
1204 * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
1205 * This function is automatically called by the ASCII transliteration functions.
1206 *
1207 * @param string Charset for which to initialize conversion.
1208 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
1209 * @access private
1210 */
function initToASCII($charset) {
		// Only process if the conversion table is not yet loaded:
	if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) return 1;

		// Use cached version if possible
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->toASCII[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

		// Init UTF-8 conversion for this charset
	if (!$this->initCharset($charset)) {
		return false;
	}

		// UTF-8/ASCII transliteration is used as the base conversion table
	if (!$this->initUnicodeData('ascii')) {
		return false;
	}

		// Start from an empty table so that a charset without any mappable character still
		// ends up with an array (otherwise the "already loaded" check above never succeeds
		// and a serialized null is written to the cache).
	$table = array();
	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
		if (isset($this->toASCII['utf-8'][$utf8])) {
				// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
			$c = $this->utf8_decode($utf8, $charset);
			$table[$c] = $this->toASCII['utf-8'][$utf8];
		}
	}
	$this->toASCII[$charset] = $table;

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->toASCII[$charset]));
	}

	return 3;
}
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264 /********************************************
1265 *
1266 * String operation functions
1267 *
1268 ********************************************/
1269
1270 /**
1271 * Returns a part of a string.
1272 * Unit-tested by Kasper (single byte charsets only)
1273 *
1274 * @param string The character set
1275 * @param string Character string
1276 * @param integer Start position (character position)
1277 * @param integer Length (in characters)
1278 * @return string The substring
1279 * @see substr(), mb_substr()
1280 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1281 */
function substr($charset,$string,$start,$len=null) {
	if ($len===0) return '';

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
			// mb_substr() cannot take a charset without a length, so the internal
			// encoding is switched temporarily when $len is omitted.
		if ($len==null) {
			$enc = mb_internal_encoding();	// save internal encoding
			mb_internal_encoding($charset);
			$str = mb_substr($string,$start);
			mb_internal_encoding($enc);	// restore internal encoding

			return $str;
		}
		return mb_substr($string,$start,$len,$charset);
	}

	if ($charset == 'utf-8') {
		return $this->utf8_substr($string,$start,$len);
	}
	if (!empty($this->eucBasedSets[$charset])) {
		return $this->euc_substr($string,$start,$charset,$len);
	}

		// Fixed-width encodings: character positions translate to byte positions by a constant factor.
		// Everything else is treated as single-byte encoding.
	$width = 1;
	if (!empty($this->twoByteSets[$charset])) {
		$width = 2;
	} elseif (!empty($this->fourByteSets[$charset])) {
		$width = 4;
	}

		// An omitted length must not be multiplied (null*2 would be 0 and yield an empty string)
	if ($len === null) {
		return substr($string,$start*$width);
	}
	return substr($string,$start*$width,$len*$width);
}
1309
1310 /**
1311 * Counts the number of characters.
1312 * Unit-tested by Kasper (single byte charsets only)
1313 *
1314 * @param string The character set
1315 * @param string Character string
1316 * @return integer The number of characters
1317 * @see strlen()
1318 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1319 */
function strlen($charset,$string) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strlen($string,$charset);
	}
	if ($charset == 'utf-8') {
		return $this->utf8_strlen($string);
	}
	if (!empty($this->eucBasedSets[$charset])) {
		return $this->euc_strlen($string,$charset);
	}
		// Fixed-width encodings: an incomplete trailing character is not counted,
		// so the result is always an integer.
	if (!empty($this->twoByteSets[$charset])) {
		return (int)floor(strlen($string)/2);
	}
	if (!empty($this->fourByteSets[$charset])) {
		return (int)floor(strlen($string)/4);
	}
		// treat everything else as single-byte encoding
	return strlen($string);
}
1335
1336 /**
1337 * Truncates a string and pre-/appends a string.
1338 * Unit tested by Kasper
1339 *
1340 * @param string The character set
1341 * @param string Character string
1342 * @param integer Length (in characters)
1343 * @param string Crop signifier
1344 * @return string The shortened string
1345 * @see substr(), mb_strimwidth()
1346 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1347 */
function crop($charset,$string,$len,$crop='') {
	if (intval($len) == 0) return $string;

		// Translate the character count into a byte position ($i); FALSE means "no cropping needed".
		// Note: only UTF-8 and EUC based sets get a real translation, all other sets are handled byte-wise.
	if ($charset == 'utf-8') {
		$i = $this->utf8_char2byte_pos($string,$len);
	} elseif (!empty($this->eucBasedSets[$charset])) {
		$i = $this->euc_char2byte_pos($string,$len,$charset);
	} else {
		if ($len > 0) {
			$i = $len;
		} else {
			$i = strlen($string)+$len;
			if ($i<=0) $i = false;
		}
	}

	if ($i === false) {	// $len outside actual string length
		return $string;
	}

	$size = strlen($string);
	if ($len > 0) {
			// crop from the end: only if there is something left after byte position $i
		if ($i < $size) {
			return substr($string,0,$i).$crop;
		}
	} else {
			// crop from the start: only if there is something in front of byte position $i
		if ($i >= 1 && $i-1 < $size) {
			return $crop.substr($string,$i);
		}
	}

	return $string;
}
1390
1391 /**
1392 * Cuts a string short at a given byte length.
1393 *
1394 * @param string The character set
1395 * @param string Character string
1396 * @param integer The byte length
1397 * @return string The shortened string
1398 * @see mb_strcut()
1399 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1400 */
function strtrunc($charset,$string,$len) {
	if ($len <= 0) return '';

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strcut($string,0,$len,$charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_strtrunc($string,$len);
	} elseif (!empty($this->eucBasedSets[$charset])) {
			// euc_strtrunc() takes (string, byte length, charset)
		return $this->euc_strtrunc($string,$len,$charset);
	} elseif (!empty($this->twoByteSets[$charset])) {
		if ($len % 2) $len--;	// don't cut at odd positions
	} elseif (!empty($this->fourByteSets[$charset])) {
		$len -= $len % 4;	// realign to position dividable by four
	}
		// fixed-width sets fall through with the realigned length; everything else is treated as single-byte encoding
	return substr($string,0,$len);
}
1419
1420 /**
1421 * Translates all characters of a string into their respective case values.
1422 * Unlike strtolower() and strtoupper() this method is locale independent.
1423 * Note that the string length may change!
 * eg. lower case German sharp S (U+00DF) becomes upper case "SS"
1425 * Unit-tested by Kasper
1426 * Real case folding is language dependent, this method ignores this fact.
1427 *
1428 * @param string Character set of string
1429 * @param string Input string to convert case for
1430 * @param string Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
1431 * @return string The converted string
1432 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1433 * @see strtolower(), strtoupper()
1434 */
function conv_case($charset,$string,$case) {
		// mb_strtolower()/mb_strtoupper() only exist from PHP 4.3 on, so check for them directly
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring' && function_exists('mb_strtolower')) {
		if ($case == 'toLower') {
			return mb_strtolower($string,$charset);
		}
		return mb_strtoupper($string,$charset);
	}

	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string,'case',$case);
	}
	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string,$charset,'case',$case);
	}

		// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string,$charset,'case',$case);
}
1453
1454 /**
 * Converts special chars (like the ae ligature, o with stroke, a with ring, umlauts etc) to ascii equivalents (usually two letters, eg. ae ligature => ae)
1456 *
1457 * @param string Character set of string
1458 * @param string Input string to convert
1459 * @return string The converted string
1460 */
function specCharsToASCII($charset,$string) {
		// Pick the mapper matching the encoding family; anything that is neither
		// UTF-8 nor EUC based is treated as single-byte encoding.
	if ($charset == 'utf-8') {
		$converted = $this->utf8_char_mapping($string,'ascii');
	} else {
		$converted = isset($this->eucBasedSets[$charset])
			? $this->euc_char_mapping($string,$charset,'ascii')
			: $this->sb_char_mapping($string,$charset,'ascii');
	}

	return $converted;
}
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485 /********************************************
1486 *
1487 * Internal string operation functions
1488 *
1489 ********************************************/
1490
1491 /**
1492 * Maps all characters of a string in a single byte charset.
1493 *
1494 * @param string the string
1495 * @param string the charset
1496 * @param string mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
1497 * @param string 'case': conversion 'toLower' or 'toUpper'
1498 * @return string the converted string
1499 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1500 */
function sb_char_mapping($str,$charset,$mode,$opt='') {
		// Select the mapping table; a missing table (eg. unknown $opt) maps nothing.
		// The table is read by value so that a lookup never creates entries in it.
	switch($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) return $str;	// do nothing
			$map = isset($this->caseFolding[$charset][$opt]) ? $this->caseFolding[$charset][$opt] : array();
		break;

		case 'ascii':
			if (!$this->initToASCII($charset)) return $str;	// do nothing
			$map = isset($this->toASCII[$charset]) ? $this->toASCII[$charset] : array();
		break;

		default:
			return $str;
	}

		// One byte is one character: replace mapped bytes, copy all others
	$out = '';
	$size = strlen($str);
	for ($i=0; $i<$size; $i++) {
		$c = $str[$i];
		$out .= isset($map[$c]) ? $map[$c] : $c;
	}

	return $out;
}
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539 /********************************************
1540 *
1541 * Internal UTF-8 string operation functions
1542 *
1543 ********************************************/
1544
1545 /**
1546 * Returns a part of a UTF-8 string.
1547 * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
1548 *
1549 * @param string UTF-8 string
1550 * @param integer Start position (character position)
1551 * @param integer Length (in characters)
1552 * @return string The substring
1553 * @see substr()
1554 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1555 */
function utf8_substr($str,$start,$len=null) {
		// An explicit zero length (integer or string) always gives an empty string
	if ($len !== null && !strcmp($len,'0')) return '';

	$byte_start = $this->utf8_char2byte_pos($str,$start);
	if ($byte_start === false) {
		if ($start > 0) {
			return false;	// $start outside string length
		}
			// negative (or zero) start reaching beyond the beginning: start at the first byte
		$byte_start = 0;
	}

	$str = substr($str,$byte_start);

	if ($len == null) return $str;	// no length given: rest of the string

	$byte_end = $this->utf8_char2byte_pos($str,$len);
	if ($byte_end === false) {	// $len outside actual string length
		return $len<0 ? '' : $str;	// When length is less than zero and exceeds, then we return blank string.
	}
	return substr($str,0,$byte_end);
}
1579
1580 /**
1581 * Counts the number of characters of a string in UTF-8.
1582 * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
1583 *
1584 * @param string UTF-8 multibyte character string
1585 * @return integer The number of characters
1586 * @see strlen()
1587 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1588 */
function utf8_strlen($str) {
	$n = 0;
	$size = strlen($str);
	for ($i=0; $i<$size; $i++) {
			// Count single bytes (0xxxxxxx) and multi-byte starting bytes (11xxxxxx),
			// ie. everything except continuation bytes (10xxxxxx)
		if ((ord($str[$i]) & 0xC0) != 0x80) $n++;
	}
	return $n;
}
1600
1601 /**
1602 * Truncates a string in UTF-8 short at a given byte length.
1603 *
1604 * @param string UTF-8 multibyte character string
1605 * @param integer the byte length
1606 * @return string the shortened string
1607 * @see mb_strcut()
1608 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1609 */
function utf8_strtrunc($str,$len) {
	if ($len <= 0) return '';
	if ($len >= strlen($str)) return $str;	// nothing to cut

	$i = $len-1;	// last byte that would be kept
	if (ord($str[$i]) & 0x80) {	// part of a multibyte sequence
			// walk back over continuation bytes (10xxxxxx) to the first byte of the sequence
		while ($i > 0 && (ord($str[$i]) & 0xC0) == 0x80) $i--;

		$lead = ord($str[$i]);
		if (($lead & 0xC0) != 0xC0) {
				// malformed input: continuation bytes without a starting byte are dropped
			return substr($str,0,($lead & 0x80) ? $i : $i+1);
		}

			// the number of leading 1-bits of the starting byte is the length of the sequence
		for ($bc=0; $lead & 0x80; $lead = ($lead << 1) & 0xFF) $bc++;

		if ($i+$bc > $len) return substr($str,0,$i);	// sequence does not fit: cut in front of it
			// fallthru: multibyte char fits into length
	}
	return substr($str,0,$len);
}
1621
1622 /**
1623 * Find position of first occurrence of a string, both arguments are in UTF-8.
1624 *
1625 * @param string UTF-8 string to search in
1626 * @param string UTF-8 string to search for
1627 * @param integer Positition to start the search
1628 * @return integer The character position
1629 * @see strpos()
1630 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1631 */
function utf8_strpos($haystack,$needle,$offset=0) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
			// third argument is the offset, the encoding comes fourth
		return mb_strpos($haystack,$needle,$offset,'utf-8');
	}

		// Search byte-wise, starting at the byte position of the character offset ...
	$byte_offset = $this->utf8_char2byte_pos($haystack,$offset);
	if ($byte_offset === false) return false;	// offset beyond string length

	$byte_pos = strpos($haystack,$needle,$byte_offset);
	if ($byte_pos === false) return false;	// needle not found

		// ... and translate the byte position found back into a character position
	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
1645
1646 /**
1647 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
1648 *
1649 * @param string UTF-8 string to search in
1650 * @param string UTF-8 character to search for (single character)
1651 * @return integer The character position
1652 * @see strrpos()
1653 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1654 */
function utf8_strrpos($haystack,$needle) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
			// The position of the encoding argument of mb_strrpos() differs between PHP versions,
			// so the internal encoding is switched temporarily instead of passing it.
		$enc = mb_internal_encoding();	// save internal encoding
		mb_internal_encoding('utf-8');
		$pos = mb_strrpos($haystack,$needle);
		mb_internal_encoding($enc);	// restore internal encoding

		return $pos;
	}

	$byte_pos = strrpos($haystack,$needle);
	if ($byte_pos === false) return false;	// needle not found

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
1665
1666 /**
1667 * Translates a character position into an 'absolute' byte position.
1668 * Unit tested by Kasper.
1669 *
1670 * @param string UTF-8 string
1671 * @param integer Character position (negative values start from the end)
1672 * @return integer Byte position
1673 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1674 */
function utf8_char2byte_pos($str,$pos) {
	$size = strlen($str);
	$n = 0;			// number of characters found
	$p = abs($pos);		// number of characters wanted

		// Positive positions scan forward from the first byte, negative ones backward from the last byte
	if ($pos >= 0) {
		$i = 0;
		$d = 1;
	} else {
		$i = $size-1;
		$d = -1;
	}

		// Explicit bounds: reading a string offset outside 0..$size-1 must never be used as loop terminator
	for( ; $i>=0 && $i<$size && $n<$p; $i+=$d) {
		$c = ord($str[$i]);
			// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		if (!($c & 0x80) || ($c & 0xC0) == 0xC0) $n++;
	}

	if ($pos >= 0) {
			// skip trailing multi-byte data bytes (10xxxxxx) of the last character counted
		while ($i<$size && (ord($str[$i]) & 0xC0) == 0x80) $i++;
		if ($i >= $size) return false;	// offset beyond string length
	} else {
		if ($i < 0) return false;	// offset beyond string length
			// correct offset: the scan stopped one byte in front of the character start
		$i++;
	}

	return $i;
}
1706
1707 /**
1708 * Translates an 'absolute' byte position into a character position.
1709 * Unit tested by Kasper.
1710 *
1711 * @param string UTF-8 string
1712 * @param integer byte position
1713 * @return integer character position
1714 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1715 */
function utf8_byte2char_pos($str,$pos) {
	$size = strlen($str);
		// empty string or byte position outside the string (the position right behind the last byte is accepted)
	if ($size == 0 || $pos < 0 || $pos > $size) return false;

	$n = 0;	// number of characters
	for($i=$pos; $i>0; $i--) {
			// Count the end-of-string position and every byte that starts a character:
			// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		if ($i == $size || (ord($str[$i]) & 0xC0) != 0x80) $n++;
	}

	return $n;
}
1729
1730 /**
1731 * Maps all characters of an UTF-8 string.
1732 *
1733 * @param string UTF-8 string
1734 * @param string mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
1735 * @param string 'case': conversion 'toLower' or 'toUpper'
1736 * @return string the converted string
1737 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1738 */
function utf8_char_mapping($str,$mode,$opt='') {
	if (!$this->initUnicodeData($mode)) return $str;	// do nothing

		// Select the mapping table; a missing table (eg. unknown $opt) maps nothing.
		// The table is read by value so that a lookup never creates entries in it.
	switch($mode) {
		case 'case':
			$map = isset($this->caseFolding['utf-8'][$opt]) ? $this->caseFolding['utf-8'][$opt] : array();
		break;

		case 'ascii':
			$map = isset($this->toASCII['utf-8']) ? $this->toASCII['utf-8'] : array();
		break;

		default:
			return $str;
	}

	$out = '';
	$size = strlen($str);
	for($i=0; $i<$size; $i++) {
		$c = ord($str[$i]);
		if (($c & 0xC0) == 0xC0) {	// multi-byte starting byte (11xxxxxx)
			for ($bc=0; $c & 0x80; $c = $c << 1) { $bc++; }	// calculate number of bytes
			$mbc = substr($str,$i,$bc);
			$i += $bc-1;
		} else {
				// single-byte (0xxxxxxx); a stray continuation byte (malformed input) is also passed on as it is
			$mbc = $str[$i];
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793 /********************************************
1794 *
1795 * Internal EUC string operation functions
1796 *
1797 * Extended Unix Code:
1798 * ASCII compatible 7bit single bytes chars
1799 * 8bit two byte chars
1800 *
1801 * Shift-JIS is treated as a special case.
1802 *
1803 ********************************************/
1804
1805 /**
1806 * Cuts a string in the EUC charset family short at a given byte length.
1807 *
1808 * @param string EUC multibyte character string
1809 * @param integer the byte length
1810 * @param string the charset
1811 * @return string the shortened string
1812 * @see mb_strcut()
1813 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1814 */
function euc_strtrunc($str,$len,$charset) {
	if ($len <= 0) return '';
	if (strlen($str) <= $len) return $str;	// string not longer than supplied length

	$sjis = ($charset == 'shift_jis');
		// Walk character by character until the byte length is reached or passed.
		// $i stays below the string length here because $len is smaller than it.
	for ($i=0; $i<$len; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++;	// advance a double-byte char
		} else {
			if ($c >= 0x80) $i++;	// advance a double-byte char
		}
	}

	if ($i > $len) {
		return substr($str,0,$len-1);	// byte $len-1 is the first byte of a double-byte char: drop it
	}
	return substr($str,0,$len);
}
1833
1834 /**
1835 * Returns a part of a string in the EUC charset family.
1836 *
1837 * @param string EUC multibyte character string
1838 * @param integer start position (character position)
1839 * @param string the charset
1840 * @param integer length (in characters)
1841 * @return string the substring
1842 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1843 */
function euc_substr($str,$start,$charset,$len=null) {
		// An explicit zero length (integer or string) always gives an empty string
	if ($len !== null && !strcmp($len,'0')) return '';

	$byte_start = $this->euc_char2byte_pos($str,$start,$charset);
	if ($byte_start === false) {
		if ($start > 0) {
			return false;	// $start outside string length
		}
			// negative (or zero) start reaching the beginning: start at the first byte (same as utf8_substr())
		$byte_start = 0;
	}

	$str = substr($str,$byte_start);

	if ($len == null) return $str;	// no length given: rest of the string

	$byte_end = $this->euc_char2byte_pos($str,$len,$charset);
	if ($byte_end === false) {	// $len outside actual string length
		return $len<0 ? '' : $str;	// a negative length that exceeds the string leaves nothing (same as utf8_substr())
	}
	return substr($str,0,$byte_end);
}
1859
1860 /**
1861 * Counts the number of characters of a string in the EUC charset family.
1862 *
1863 * @param string EUC multibyte character string
1864 * @param string the charset
1865 * @return integer the number of characters
1866 * @see strlen()
1867 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1868 */
function euc_strlen($str,$charset) {
	$sjis = ($charset == 'shift_jis');
	$size = strlen($str);
	$n = 0;
	for ($i=0; $i<$size; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++;	// advance a double-byte char
		} else {
			if ($c >= 0x80) $i++;	// advance a double-byte char
		}

		$n++;
	}

	return $n;
}
1886
1887 /**
1888 * Translates a character position into an 'absolute' byte position.
1889 *
1890 * @param string EUC multibyte character string
1891 * @param integer character position (negative values start from the end)
1892 * @param string the charset
1893 * @return integer byte position
1894 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1895 */
function euc_char2byte_pos($str,$pos,$charset) {
	$sjis = ($charset == 'shift_jis');
	$size = strlen($str);
	$p = abs($pos);	// number of characters wanted

	if ($pos >= 0) {
			// Step over $p characters from the start
		$i = 0;
		for ($n=0; $i<$size && $n<$p; $n++) {
			$c = ord($str[$i]);
			$wide = $sjis ? (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) : ($c >= 0x80);
			$i += $wide ? 2 : 1;	// advance a double-byte or a single-byte char
		}
		if ($i >= $size) return false;	// offset beyond string length

		return $i;
	}

		// Negative position: the string is still scanned forward, because the second byte of a
		// double-byte char cannot be told apart reliably when reading backward (in Shift-JIS it
		// may lie in the ASCII range). The byte position of every character start is recorded.
	$starts = array();
	for ($i=0; $i<$size; ) {
		$starts[] = $i;
		$c = ord($str[$i]);
		$wide = $sjis ? (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) : ($c >= 0x80);
		$i += $wide ? 2 : 1;
	}

	$index = count($starts)-$p;
	if ($index <= 0) return false;	// offset reaches (or passes) the start of the string

	return $starts[$index];
}
1926
1927 /**
1928 * Maps all characters of a string in the EUC charset family.
1929 *
1930 * @param string EUC multibyte character string
1931 * @param string the charset
1932 * @param string mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
1933 * @param string 'case': conversion 'toLower' or 'toUpper'
1934 * @return string the converted string
1935 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1936 */
function euc_char_mapping($str,$charset,$mode,$opt='') {
		// Select the lookup table for the requested mode. The init methods are
		// defined elsewhere in this class; a false-like result from them, or an
		// unknown $mode, returns the input unchanged.
	switch($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) return $str;	// do nothing
			$map =& $this->caseFolding[$charset][$opt];
		break;

		case 'ascii':
			if (!$this->initToASCII($charset)) return $str;	// do nothing
			$map =& $this->toASCII[$charset];
		break;

		default:
			return $str;
	}

	$str = (string)$str;
	$sjis = ($charset == 'shift_jis');
	$byteLen = strlen($str);	// loop bound; avoids reading offsets past the end of the string
	$out = '';

	for ($i=0; $i<$byteLen; $i++) {
		$mbc = $str[$i];
		$c = ord($mbc);

		if ($sjis) {
				// Shift_JIS lead bytes: 0x80-0x9F and 0xE0 upwards
			$isDoubleByte = (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0));
		} else {
				// all other charsets handled here: any byte with the high bit set starts a pair
			$isDoubleByte = ($c >= 0x80);
		}

		if ($isDoubleByte) {	// a double-byte char: look up both bytes together
			$mbc = substr($str,$i,2);
			$i++;
		}

			// Characters without an entry in the table are copied as they are.
		if (isset($map[$mbc])) {
			$out .= $map[$mbc];
		} else {
			$out .= $mbc;
		}
	}

	return $out;
}
1981
1982 }
1983
// Extension hook: when TYPO3_MODE is defined and the configuration array names
// a file under the XCLASS key for this script, that file is included once.
if (defined('TYPO3_MODE')) {
	if ($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']) {
		include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
	}
}
1987 ?>