More CSH content added
[Packages/TYPO3.CMS.git] / t3lib / class.t3lib_cs.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2003-2004 Kasper Skaarhoj (kasper@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the Typo3 project. The Typo3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 *
17 * This script is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * This copyright notice MUST APPEAR in all copies of the script!
23 ***************************************************************/
24 /**
25 * Class for conversion between charsets.
26 *
27 * $Id$
28 *
29 * @author Kasper Skaarhoj <kasper@typo3.com>
30 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
31 */
32 /**
33 * [CLASS/FUNCTION INDEX of SCRIPT]
34 *
35 *
36 *
37 * 128: class t3lib_cs
38 * 442: function parse_charset($charset)
39 * 460: function get_locale_charset($locale)
40 * 492: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
41 * 529: function utf8_encode($str,$charset)
42 * 576: function utf8_decode($str,$charset,$useEntityForNoChar=0)
43 * 619: function utf8_to_entities($str)
44 * 652: function entities_to_utf8($str,$alsoStdHtmlEnt=0)
45 * 686: function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
46 * 736: function UnumberToChar($cbyte)
47 * 781: function utf8CharToUnumber($str,$hex=0)
48 *
49 * SECTION: Init functions
50 * 824: function initCharset($charset)
51 * 885: function initCaseFoldingUTF8()
52 * 973: function initCaseFolding($charset)
53 *
54 * SECTION: String operation functions
55 * 1058: function substr($charset,$string,$start,$len=null)
56 * 1096: function strlen($charset,$string)
57 * 1124: function crop($charset,$string,$len,$crop='')
58 * 1165: function strtrunc($charset,$string,$len)
59 * 1197: function conv_case($charset,$string,$case)
60 *
61 * SECTION: Internal UTF-8 string operation functions
62 * 1264: function utf8_substr($str,$start,$len=null)
63 * 1297: function utf8_strlen($str)
64 * 1318: function utf8_strtrunc($str,$len)
65 * 1340: function utf8_strpos($haystack,$needle,$offset=0)
66 * 1363: function utf8_strrpos($haystack,$needle)
67 * 1383: function utf8_char2byte_pos($str,$pos)
68 * 1424: function utf8_byte2char_pos($str,$pos)
69 * 1448: function utf8_conv_case($str,$case)
70 *
71 * SECTION: Internal EUC string operation functions
72 * 1514: function euc_strtrunc($str,$len,$charset)
73 * 1543: function euc_substr($str,$start,$charset,$len=null)
74 * 1568: function euc_strlen($str,$charset)
75 * 1595: function euc_char2byte_pos($str,$pos,$charset)
76 * 1636: function euc_conv_case($str,$case,$charset)
77 *
78 * TOTAL FUNCTIONS: 31
79 * (This index is automatically created/updated by the extension "extdeveval")
80 *
81 */
82
83
84
85
86
87
88
89
90 /**
91 * Notes on UTF-8
92 *
93 * Functions working on UTF-8 strings:
94 *
95 * - strchr/strstr
96 * - strrchr
97 * - substr_count
98 * - implode/explode/join
99 *
100 * Functions nearly working on UTF-8 strings:
101 *
102 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
103 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
104 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
105 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
106 *
107 * Functions NOT working on UTF-8 strings:
108 *
109 * - str*cmp
110 * - stristr
111 * - stripos
112 * - substr
113 * - strrev
114 * - ereg/eregi
115 * - split/spliti
116 * - preg_*
117 * - ...
118 *
119 */
120 /**
121 * Class for conversion between charsets
122 *
123 * @author Kasper Skaarhoj <kasper@typo3.com>
124 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
125 * @package TYPO3
126 * @subpackage t3lib
127 */
128 class t3lib_cs {
129 var $noCharByteVal=63; // ASCII Value for chars with no equivalent.
130
131 // This is the array where parsed conversion tables are stored (cached)
132 var $parsedCharsets=array();
133
134 // An array where case folding data will be stored (cached)
135 var $caseFolding=array();
136
137 // An array where charset-to-ASCII mappings are stored (cached)
138 var $toASCII=array();
139
140 // This tells the converter which charsets has two bytes per char:
141 var $twoByteSets=array(
142 'ucs-2'=>1, // 2-byte Unicode
143 );
144
145 // This tells the converter which charsets has four bytes per char:
146 var $fourByteSets=array(
147 'ucs-4'=>1, // 4-byte Unicode
148 'utf-32'=>1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
149 );
150
151 // This tells the converter which charsets use a scheme like the Extended Unix Code:
152 var $eucBasedSets=array(
153 'gb2312'=>1, // Chinese, simplified.
154 'big5'=>1, // Chinese, traditional.
155 'shift_jis'=>1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
156 );
157
158 // see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
159 // http://czyborra.com/charsets/iso8859.html
160 var $synonyms=array(
161 'us' => 'ascii',
162 'us-ascii'=> 'ascii',
163 'cp819' => 'iso-8859-1',
164 'ibm819' => 'iso-8859-1',
165 'iso-ir-100' => 'iso-8859-1',
166 'iso-ir-109' => 'iso-8859-2',
167 'iso-ir-148' => 'iso-8859-9',
168 'iso-ir-199' => 'iso-8859-14',
169 'iso-ir-203' => 'iso-8859-15',
170 'csisolatin1' => 'iso-8859-1',
171 'csisolatin2' => 'iso-8859-2',
172 'csisolatin3' => 'iso-8859-3',
173 'csisolatin5' => 'iso-8859-9',
174 'csisolatin8' => 'iso-8859-14',
175 'csisolatin9' => 'iso-8859-15',
176 'csisolatingreek' => 'iso-8859-7',
177 'iso-celtic' => 'iso-8859-14',
178 'latin1' => 'iso-8859-1',
179 'latin2' => 'iso-8859-2',
180 'latin3' => 'iso-8859-3',
181 'latin5' => 'iso-8859-9',
182 'latin6' => 'iso-8859-10',
183 'latin8' => 'iso-8859-14',
184 'latin9' => 'iso-8859-15',
185 'l1' => 'iso-8859-1',
186 'l2' => 'iso-8859-2',
187 'l3' => 'iso-8859-3',
188 'l5' => 'iso-8859-9',
189 'l6' => 'iso-8859-10',
190 'l8' => 'iso-8859-14',
191 'l9' => 'iso-8859-15',
192 'cyrillic' => 'iso-8859-5',
193 'arabic' => 'iso-8859-6',
194 'tis-620' => 'iso-8859-11',
195 'win874' => 'windows-874',
196 'win1250' => 'windows-1250',
197 'win1251' => 'windows-1251',
198 'win1252' => 'windows-1252',
199 'win1253' => 'windows-1253',
200 'win1254' => 'windows-1254',
201 'win1255' => 'windows-1255',
202 'win1256' => 'windows-1256',
203 'win1257' => 'windows-1257',
204 'win1258' => 'windows-1258',
205 'cp1250' => 'windows-1250',
206 'cp1251' => 'windows-1251',
207 'cp1252' => 'windows-1252',
208 'ms-ee' => 'windows-1250',
209 'ms-ansi' => 'windows-1252',
210 'ms-greek' => 'windows-1253',
211 'ms-turk' => 'windows-1254',
212 'winbaltrim' => 'windows-1257',
213 'koi-8ru' => 'koi-8r',
214 'koi8r' => 'koi-8r',
215 'cp878' => 'koi-8r',
216 'mac' => 'macroman',
217 'macintosh' => 'macroman',
218 'euc-cn' => 'gb2312',
219 'x-euc-cn' => 'gb2312',
220 'euccn' => 'gb2312',
221 'cp936' => 'gb2312',
222 'big-5' => 'big5',
223 'cp950' => 'big5',
224 'eucjp' => 'euc-jp',
225 'sjis' => 'shift_jis',
226 'shift-jis' => 'shift_jis',
227 'cp932' => 'shift_jis',
228 'utf7' => 'utf-7',
229 'utf8' => 'utf-8',
230 'utf16' => 'utf-16',
231 'utf32' => 'utf-32',
232 'utf8' => 'utf-8',
233 'ucs2' => 'ucs-2',
234 'ucs4' => 'ucs-4',
235 );
236
// Mapping of language codes / language names to language (family) names.
// The family names on the right-hand side must match the keys used in
// $lang_to_charset_unix and $lang_to_charset_windows, otherwise the lookup in
// get_locale_charset() silently falls back to iso-8859-1.
var $lang_to_langfamily=array(
	// iso-639:2 language codes, see:
	// http://www.w3.org/WAI/ER/IG/ert/iso639.htm
	// http://www.unicode.org/onlinedat/languages.html
	'ar' => 'arabic',
	'bg' => 'cyrillic',
	'cs' => 'east_european',
	'da' => 'west_european',
	'de' => 'west_european',
	'es' => 'west_european',
	'et' => 'estonian',
	'eu' => 'west_european',
	'fi' => 'west_european',
	'fr' => 'west_european',
	'gr' => 'greek',
	'hr' => 'east_european',
	'hu' => 'east_european',
	'iw' => 'hebrew',
	'is' => 'west_european',
	'it' => 'west_european',
	'ja' => 'japanese',
	'kl' => 'west_european',
	'ko' => 'korean',
	'lt' => 'lithuanian',
	'lv' => 'west_european', // Latvian/Lettish
	'nl' => 'west_european',
	'no' => 'west_european',
	'pl' => 'east_european',
	'pt' => 'west_european',
	'ro' => 'east_european',
	'ru' => 'cyrillic',
	'sk' => 'east_european',
	'sl' => 'east_european',
	'sv' => 'west_european',
	'th' => 'thai',
	'uk' => 'cyrillic',
	'vi' => 'vietnamese',
	'zh' => 'chinese',
	// MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
	'chs' => 'simpl_chinese',
	'cht' => 'trad_chinese',
	'csy' => 'east_european',
	'dan' => 'west_european',
	'deu' => 'west_european',
	'dea' => 'west_european',
	'des' => 'west_european',
	'ena' => 'west_european',
	'enc' => 'west_european',
	'eng' => 'west_european',
	'enz' => 'west_european',
	'enu' => 'west_european',
	'nld' => 'west_european',
	'nlb' => 'west_european',
	'fin' => 'west_european',
	'fra' => 'west_european',
	'frb' => 'west_european',
	'frc' => 'west_european',
	'frs' => 'west_european',
	'ell' => 'greek',
	'hun' => 'east_european',
	'isl' => 'west_european', // was misspelled 'west_euorpean', which matched no charset table
	'ita' => 'west_european',
	'its' => 'west_european',
	'jpn' => 'japanese',
	'kor' => 'korean',
	'nor' => 'west_european',
	'non' => 'west_european',
	'plk' => 'east_european',
	'ptg' => 'west_european',
	'ptb' => 'west_european',
	'rus' => 'cyrillic', // consistent with 'ru' and 'russian'
	'sky' => 'east_european',
	'esp' => 'west_european',
	'esm' => 'west_european',
	'esn' => 'west_european',
	'sve' => 'west_european',
	'trk' => 'turkish',
	// English language names
	'bulgarian' => 'cyrillic', // consistent with 'bg'
	'catalan' => 'west_european',
	'croatian' => 'east_european',
	'czech' => 'east_european',
	'danish' => 'west_european',
	'dutch' => 'west_european',
	'english' => 'west_european',
	'finnish' => 'west_european',
	'french' => 'west_european',
	'galician' => 'west_european',
	'german' => 'west_european',
	'hungarian' => 'east_european',
	'icelandic' => 'west_european',
	'italian' => 'west_european',
	'latvian' => 'west_european',
	'lettish' => 'west_european',
	'norwegian' => 'west_european',
	'polish' => 'east_european',
	'portuguese' => 'west_european',
	'russian' => 'cyrillic',
	'romanian' => 'east_european',
	'slovak' => 'east_european',
	'slovenian' => 'east_european',
	'spanish' => 'west_european',
	'svedish' => 'west_european', // historic misspelling, kept for backwards compatibility
	'swedish' => 'west_european',
	'turkish' => 'turkish', // consistent with 'trk'
	'ukrainian' => 'cyrillic',
);
344
345 // mapping of language (family) names to charsets on Unix
346 var $lang_to_charset_unix=array(
347 'west_european' => 'iso-8859-1',
348 'estonian' => 'iso-8859-1',
349 'east_european' => 'iso-8859-2',
350 'baltic' => 'iso-8859-4',
351 'cyrillic' => 'iso-8859-5',
352 'arabic' => 'iso-8859-6',
353 'greek' => 'iso-8859-7',
354 'hebrew' => 'iso-8859-8',
355 'turkish' => 'iso-8859-9',
356 'thai' => 'iso-8859-11', // = TIS-620
357 'lithuanian' => 'iso-8859-13',
358 'chinese' => 'gb2312', // = euc-cn
359 'japanese' => 'euc-jp',
360 'korean' => 'euc-kr',
361 'simpl_chinese' => 'gb2312',
362 'trad_chinese' => 'big5',
363 'vietnamese' => '',
364 );
365
// Mapping of language (family) names to the charsets used on Windows.
// The keys are the family names produced by $lang_to_langfamily.
var $lang_to_charset_windows=array(
	'east_european' => 'windows-1250',
	'cyrillic' => 'windows-1251',
	'west_european' => 'windows-1252',
	'greek' => 'windows-1253',
	'turkish' => 'windows-1254',
	'hebrew' => 'windows-1255',
	'arabic' => 'windows-1256',
	'baltic' => 'windows-1257',
	'estonian' => 'windows-1257',
	'lithuanian' => 'windows-1257',
	'vietnamese' => 'windows-1258',
	'thai' => 'cp874',
	'korean' => 'cp949', // was 'cp950', which is Big5 (Traditional Chinese)
	'chinese' => 'gb2312',
	'japanese' => 'shift_jis',
	'simpl_chinese' => 'gb2312',
	'trad_chinese' => 'big5',
);
386
387 // mapping of locale names to charsets
388 var $locale_to_charset=array(
389 'japanese.euc' => 'euc-jp',
390 'ja_jp.ujis' => 'euc-jp',
391 'korean.euc' => 'euc-kr',
392 'zh_cn' => 'gb2312',
393 'zh_hk' => 'big5',
394 'zh_tw' => 'big5',
395 );
396
397 // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
398 // Empty values means "iso-8859-1"
399 var $charSetArray = array(
400 'dk' => '',
401 'de' => '',
402 'no' => '',
403 'it' => '',
404 'fr' => '',
405 'es' => '',
406 'nl' => '',
407 'cz' => 'windows-1250',
408 'pl' => 'iso-8859-2',
409 'si' => 'windows-1250',
410 'fi' => '',
411 'tr' => 'iso-8859-9',
412 'se' => '',
413 'pt' => '',
414 'ru' => 'windows-1251',
415 'ro' => 'iso-8859-2',
416 'ch' => 'gb2312',
417 'sk' => 'windows-1250',
418 'lt' => 'windows-1257',
419 'is' => 'utf-8',
420 'hr' => 'windows-1250',
421 'hu' => 'iso-8859-2',
422 'gl' => '',
423 'th' => 'iso-8859-11',
424 'gr' => 'iso-8859-7',
425 'hk' => 'big5',
426 'eu' => '',
427 'bg' => 'windows-1251',
428 'br' => '',
429 'et' => 'iso-8859-4',
430 'ar' => 'iso-8859-6',
431 'he' => 'utf-8',
432 'ua' => 'windows-1251',
433 'jp' => 'shift_jis',
434 'lv' => 'utf-8',
435 'vn' => 'utf-8',
436 );
437
/**
 * Normalizes a charset name: the name is lowercased and, if it is a known
 * alias in $this->synonyms, replaced by the canonical name stored there.
 *
 * @param	string		Input charset
 * @return	string		Normalized charset
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function parse_charset($charset) {
	$normalized = strtolower($charset);

	return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
}
451
/**
 * Get the charset of a locale.
 *
 * Accepted locale formats:
 *   ln            language
 *   ln_CN         language / country
 *   ln_CN.cs      language / country / charset
 *   ln_CN.cs@mod  as above, followed by a modifier (the modifier is ignored)
 *
 * Resolution order: exact match in $this->locale_to_charset, then an explicit
 * charset part in the locale, then the language (family) looked up in the
 * OS specific table. If nothing matches, 'iso-8859-1' is returned.
 *
 * @param	string		Locale string
 * @return	string		Charset resolved for locale string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function get_locale_charset($locale) {
	$locale = strtolower($locale);

		// exact locale specific charset?
	if (isset($this->locale_to_charset[$locale])) return $this->locale_to_charset[$locale];

		// a "@modifier" suffix is not part of the charset name: drop it and retry the exact lookup
	$modifierPos = strpos($locale,'@');
	if ($modifierPos !== false) {
		$locale = substr($locale,0,$modifierPos);
		if (isset($this->locale_to_charset[$locale])) return $this->locale_to_charset[$locale];
	}

		// locale contains charset: use it
	$localeParts = explode('.',$locale);
	if (!empty($localeParts[1])) return $this->parse_charset($localeParts[1]);

		// get language (the country part, if any, is not needed)
	$languageParts = explode('_',$localeParts[0]);
	$language = $languageParts[0];
	if (isset($this->lang_to_langfamily[$language])) $language = $this->lang_to_langfamily[$language];

		// pick the OS specific table; unknown languages and empty entries fall back to the default
	$charsetMap = (TYPO3_OS == 'WIN') ? $this->lang_to_charset_windows : $this->lang_to_charset_unix;

	return !empty($charsetMap[$language]) ? $charsetMap[$language] : 'iso-8859-1';
}
485
486
487
488
489
490
491
492
493
494 /********************************************
495 *
496 * Charset Conversion functions
497 *
498 ********************************************/
499
/**
 * Convert from one charset to another charset.
 *
 * If $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'] names one of
 * 'mbstring', 'iconv' or 'recode' and the corresponding PHP function is
 * available, that library is tried first. This is skipped when entities are
 * requested for a non-UTF-8 target. If the library is missing or reports
 * failure, the conversion falls back to the built-in table based routines
 * (via UTF-8 as intermediate charset).
 *
 * @param	string		Input string
 * @param	string		From charset (the current charset of the string)
 * @param	string		To charset (the output charset wanted)
 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return	string		Converted string
 */
function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) {
	if ($fromCS==$toCS) return $str;

		// PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
	if ($toCS=='utf-8' || !$useEntityForNoChar) {
		$convMethod = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']) ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'] : '';

			// Each library is only used if its function really exists; calling a
			// function of a missing extension would be a fatal error.
		switch($convMethod) {
			case 'mbstring':
				if (function_exists('mb_convert_encoding')) {
					$conv_str = mb_convert_encoding($str,$toCS,$fromCS);
					if (false !== $conv_str) return $conv_str; // returns false for unsupported charsets
				}
			break;

			case 'iconv':
				if (function_exists('iconv')) {
					$conv_str = iconv($fromCS,$toCS.'//TRANSLIT',$str);
					if (false !== $conv_str) return $conv_str;
				}
			break;

			case 'recode':
				if (function_exists('recode_string')) {
					$conv_str = recode_string($fromCS.'..'.$toCS,$str);
					if (false !== $conv_str) return $conv_str;
				}
			break;
		}
			// fallback to TYPO3 conversion
	}

	if ($fromCS!='utf-8') $str=$this->utf8_encode($str,$fromCS);
	if ($toCS!='utf-8') $str=$this->utf8_decode($str,$toCS,$useEntityForNoChar);
	return $str;
}
537
538
/**
 * Converts $str from $charset to UTF-8
 *
 * Bytes 0-127 are copied unchanged (except for charsets listed in
 * $this->twoByteSets, where every character is read as a big endian 16 bit
 * word). All other characters are looked up in the parsed conversion table;
 * characters without an entry are replaced by chr($this->noCharByteVal).
 *
 * If the conversion table for $charset cannot be initialized, nothing is
 * returned (NULL).
 *
 * @param	string		String in local charset to convert to UTF-8
 * @param	string		Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return	string		Output string, converted to UTF-8
 */
function utf8_encode($str,$charset) {

		// Charset is case-insensitive.
	if ($this->initCharset($charset)) { // Parse conv. table if not already...
		$strLen = strlen($str);
		$outStr = '';
		$isTwoByteSet = isset($this->twoByteSets[$charset]);
		$isEucBasedSet = isset($this->eucBasedSets[$charset]);

		for ($a=0;$a<$strLen;$a++) { // Traverse each char in string.
			$chr = substr($str,$a,1);
			$ord = ord($chr);
			if ($isTwoByteSet) { // If the charset has two bytes per char
				$ord2 = ord(substr($str,$a+1,1));
				$ord = ($ord << 8) | $ord2; // assume big endian; combine with OR (AND would always give 0)

				if (isset($this->parsedCharsets[$charset]['local'][$ord])) { // If the local char-number was found in parsed conv. table then we use that...
					$outStr.=$this->parsedCharsets[$charset]['local'][$ord];
				} else $outStr.=chr($this->noCharByteVal); // No char exists
				$a++;
			} elseif ($ord>127) { // If char has value over 127 it's not plain ASCII and must be looked up
				if ($isEucBasedSet) { // EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
						// Shift-JIS is like EUC, but chars between 160 and 223 are single byte
						// (half-width katakana), so they must not consume the following byte.
					if ($charset != 'shift_jis' || ($ord < 160 || $ord > 223)) {
						$a++;
						$ord2 = ord(substr($str,$a,1));
						$ord = $ord*256+$ord2;
					}
				}

				if (isset($this->parsedCharsets[$charset]['local'][$ord])) { // If the local char-number was found in parsed conv. table then we use that...
					$outStr.=$this->parsedCharsets[$charset]['local'][$ord];
				} else $outStr.=chr($this->noCharByteVal); // No char exists
			} else $outStr.=$chr; // ... otherwise it's just ASCII 0-127 and one byte. Transparent
		}
		return $outStr;
	}
}
584
/**
 * Converts $str from UTF-8 to $charset
 *
 * ASCII bytes are copied as they are. For every multibyte sequence the local
 * character number is looked up in the parsed conversion table; numbers above
 * 255 are written as two bytes (high byte first). Sequences without a table
 * entry become a numeric entity (if $useEntityForNoChar is set) or
 * chr($this->noCharByteVal). A stray continuation byte also becomes
 * chr($this->noCharByteVal).
 *
 * Nothing (NULL) is returned if the conversion table cannot be initialized.
 *
 * @param	string		String in UTF-8 to convert to local charset
 * @param	string		Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return	string		Output string, converted to local charset
 */
function utf8_decode($str,$charset,$useEntityForNoChar=0) {

		// Parse the conversion table if that has not happened yet; give up if there is none
	if (!$this->initCharset($charset)) return;

	$length = strlen($str);
	$result = '';
	$pos = 0;

	while ($pos < $length) {
		$byte = substr($str,$pos,1);
		$code = ord($byte);

		if ($code <= 127) {
				// Plain ASCII: transparent
			$result .= $byte;
		} elseif (!($code & 64)) {
				// A byte >127 without bit 6 is a continuation byte: we are in the MIDDLE of a sequence
			$result .= chr($this->noCharByteVal);
		} else {
				// Lead byte: every set bit from bit 6 downwards announces one following byte
			$sequence = $byte;
			for ($mask=64; $mask>0 && ($code & $mask); $mask>>=1) {
				$pos++;
				$sequence .= substr($str,$pos,1);
			}

			if (isset($this->parsedCharsets[$charset]['utf8'][$sequence])) {
				$localNumber = $this->parsedCharsets[$charset]['utf8'][$sequence];
				if ($localNumber > 255) {
						// 16 bit word assumed: split into two chars
					$result .= chr(($localNumber >> 8) & 255).chr($localNumber & 255);
				} else {
					$result .= chr($localNumber);
				}
			} elseif ($useEntityForNoChar) {
					// Create numeric entity
				$result .= '&#'.$this->utf8CharToUnumber($sequence,1).';';
			} else {
					// No char exists
				$result .= chr($this->noCharByteVal);
			}
		}
		$pos++;
	}

	return $result;
}
629
/**
 * Converts all chars > 127 to numeric entities.
 *
 * Each UTF-8 multibyte sequence is replaced by a hexadecimal entity built
 * from utf8CharToUnumber(). ASCII is left untouched and a stray continuation
 * byte is replaced by chr($this->noCharByteVal).
 *
 * @param	string		Input string
 * @return	string		Output string
 */
function utf8_to_entities($str) {
	$length = strlen($str);
	$result = '';
	$pos = 0;

	while ($pos < $length) {
		$byte = substr($str,$pos,1);
		$code = ord($byte);

		if ($code <= 127) {
				// Plain ASCII: transparent
			$result .= $byte;
		} elseif ($code & 64) {
				// Lead byte: every set bit from bit 6 downwards announces one following byte
			$sequence = $byte;
			for ($mask=64; $mask>0 && ($code & $mask); $mask>>=1) {
				$pos++;
				$sequence .= substr($str,$pos,1);
			}
			$result .= '&#'.$this->utf8CharToUnumber($sequence,1).';';
		} else {
				// Continuation byte without lead byte (MIDDLE of a sequence)
			$result .= chr($this->noCharByteVal);
		}
		$pos++;
	}

	return $result;
}
661
/**
 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
 *
 * The string is split around every "&...;" entity. Numeric entities are
 * always converted; named entities only if $alsoStdHtmlEnt is set and the
 * name is known to PHP's HTML entity table. Everything else is put back
 * unchanged.
 *
 * @param	string		Input string, UTF-8
 * @param	boolean		If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
 * @return	string		Output string
 */
function entities_to_utf8($str,$alsoStdHtmlEnt=0) {
	$trans_tbl = array();
	if ($alsoStdHtmlEnt) {
			// The table must be in iso-8859-1 because the values are converted from
			// that charset below. Since PHP 5.4 the default table charset is UTF-8,
			// so the charset is requested explicitly where the parameter exists.
		if (version_compare(PHP_VERSION,'5.3.4','>=')) {
			$trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES,ENT_COMPAT,'ISO-8859-1'));
		} else {
			$trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES));
		}
	}

		// Even indexes hold plain text, odd indexes the entity names (without "&" and ";")
	$parts = preg_split('/&([#[:alnum:]]*);/',$str,-1,PREG_SPLIT_DELIM_CAPTURE);
	foreach($parts as $k => $v) {
		if ($k%2) {
			if (substr($v,0,1)=='#') { // Dec or hex entities:
				if (strtolower(substr($v,1,1))=='x') { // "&#x..;" and "&#X..;"
					$parts[$k] = $this->UnumberToChar(hexdec(substr($v,2)));
				} else {
						// explicit integer conversion: the captured text may be empty or not purely numeric
					$parts[$k] = $this->UnumberToChar(intval(substr($v,1)));
				}
			} elseif ($alsoStdHtmlEnt && isset($trans_tbl['&'.$v.';'])) { // Other entities:
				$parts[$k] = $this->utf8_encode($trans_tbl['&'.$v.';'],'iso-8859-1');
			} else { // No conversion:
				$parts[$k] ='&'.$v.';';
			}
		}
	}

	return implode('',$parts);
}
694
/**
 * Converts all chars in the input UTF-8 string into integer numbers returned in an array
 *
 * One array element is produced per character: ASCII bytes, complete
 * multibyte sequences, and (as $this->noCharByteVal) stray continuation bytes.
 *
 * @param	string		Input string, UTF-8
 * @param	boolean		If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
 * @param	boolean		If set, then instead of integer numbers the real UTF-8 char is returned.
 * @return	array		Output array with the char numbers
 */
function utf8_to_numberarray($str,$convEntities=0,$retChar=0) {
		// Entities are turned into real characters first, if requested
	if ($convEntities) {
		$str = $this->entities_to_utf8($str,1);
	}

	$length = strlen($str);
	$numbers = array();
	$pos = 0;

	while ($pos < $length) {
		$byte = substr($str,$pos,1);
		$code = ord($byte);

		if ($code <= 127) {
				// Plain ASCII: transparent
			$numbers[] = $retChar ? chr($code) : $code;
		} elseif ($code & 64) {
				// Lead byte: every set bit from bit 6 downwards announces one following byte
			$sequence = $byte;
			for ($mask=64; $mask>0 && ($code & $mask); $mask>>=1) {
				$pos++;
				$sequence .= substr($str,$pos,1);
			}
			$numbers[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
		} else {
				// Continuation byte without lead byte (MIDDLE of a sequence)
			$numbers[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
		}
		$pos++;
	}

	return $numbers;
}
733
/**
 * Converts a UNICODE number to a UTF-8 multibyte character
 * Algorithm based on script found at From: http://czyborra.com/utf/
 *
 * The bits of the number are spread across the bytes; the number of high bits
 * set in the lead byte announces the length of the sequence:
 *
 *  bytes | bits | representation
 *      1 |    7 | 0vvvvvvv
 *      2 |   11 | 110vvvvv 10vvvvvv
 *      3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
 *      4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * Numbers of 0x80000000 and above yield chr($this->noCharByteVal).
 *
 * @param	integer		UNICODE integer
 * @return	string		UTF-8 multibyte character string
 * @see utf8CharToUnumber()
 */
function UnumberToChar($cbyte) {
		// 7 bit values are a single byte
	if ($cbyte < 0x80) return chr($cbyte);

		// Each entry: exclusive upper limit, marker bits of the lead byte, number of continuation bytes
	$layouts = array(
		array(0x800, 0xC0, 1),
		array(0x10000, 0xE0, 2),
		array(0x200000, 0xF0, 3),
		array(0x4000000, 0xF8, 4),
		array(0x80000000, 0xFC, 5),
	);

	foreach($layouts as $layout) {
		list($limit,$leadMarker,$tailBytes) = $layout;
		if ($cbyte < $limit) {
				// Lead byte carries the topmost bits...
			$char = chr($leadMarker | ($cbyte >> (6*$tailBytes)));
				// ... and every continuation byte the next 6 bits, most significant group first
			for ($shift=6*($tailBytes-1); $shift>=0; $shift-=6) {
				$char .= chr(0x80 | (($cbyte >> $shift) & 0x3F));
			}
			return $char;
		}
	}

		// Cannot express a 32-bit character in UTF-8
	return chr($this->noCharByteVal);
}
788
/**
 * Converts a UTF-8 Multibyte character to a UNICODE number
 *
 * Only the first character of $str is evaluated. A first byte that does not
 * have its two highest bits set is returned as its plain byte value.
 *
 * @param	string		UTF-8 multibyte character string
 * @param	boolean		If set, then a hex. number is returned (as string, prefixed with "x").
 * @return	integer		UNICODE integer
 * @see UnumberToChar()
 */
function utf8CharToUnumber($str,$hex=0) {
	$leadByte = ord(substr($str,0,1));

	if (($leadByte & 192) != 192) {
			// Not the start of a multibyte sequence: the byte value is the number
		$number = $leadByte;
	} else {
			// Every set bit from bit 6 downwards announces one following byte,
			// each of which contributes its lowest 6 bits
		$tailBits = '';
		$tailCount = 0;
		for ($mask=64; $mask>0 && ($leadByte & $mask); $mask>>=1) {
			$tailCount++;
			$tailBits .= substr('00000000'.decbin(ord(substr($str,$tailCount,1))),-6);
		}
			// The lead byte contributes the bits that remain after its length marker
		$leadBits = substr('00000000'.decbin($leadByte),-(6-$tailCount));

		$number = bindec($leadBits.$tailBits);
	}

	return $hex ? 'x'.dechex($number) : $number;
}
816
817
818
819
820
821
822
823
824
825 /********************************************
826 *
827 * Init functions
828 *
829 ********************************************/
830
/**
 * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
 * This function is automatically called by the conversion functions
 *
 * The parsed table is stored in $this->parsedCharsets[$charset] with two
 * sub-arrays: 'local' (local char number => UTF-8 string) and 'utf8'
 * (UTF-8 string => local char number). Only local numbers above 127 are stored.
 * A serialized copy is kept in typo3temp/cs/ and used on later requests; an
 * unreadable cache file is ignored and the table is parsed again.
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param	string		The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return	integer		Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and loaded (from cache or by parsing).
 * @access private
 */
function initCharset($charset) {
		// Only process if the charset is not yet loaded:
	if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) return 1;

		// Conversion table filename:
	$charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';

		// Without a conversion table there is nothing to initialize:
	if (!($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile))) return false;

		// Cache file for charsets:
		// Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$cachedTable = unserialize(t3lib_div::getUrl($cacheFile));
		if (is_array($cachedTable)) {
			$this->parsedCharsets[$charset] = $cachedTable;
			return 2;
		}
			// Cache content is not a valid table: fall through and parse the source table again
	}

		// Parse conversion table into lines:
	$lines = t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1);
		// Initialize the internal variable holding the conv. table:
	$this->parsedCharsets[$charset] = array('local'=>array(),'utf8'=>array());

		// The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
	$whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
	$detectedType = '';

	foreach($lines as $value) {
		if (trim($value) && substr($value,0,1)!='#') { // Comment line or blanks are ignored.

				// Detect type if not done yet: (Done on first real line)
			if (!$detectedType) $detectedType = preg_match($whitespacedPattern,$value) ? 'whitespaced' : 'ms-token';

			$hexbyte = '';
			$utf8 = '';
			if ($detectedType=='ms-token') {
				$tokens = preg_split('/[=:]/',$value,3);
				$hexbyte = $tokens[0];
				$utf8 = isset($tokens[1]) ? $tokens[1] : '';
			} elseif ($detectedType=='whitespaced') {
				$regA = array();
				if (preg_match($whitespacedPattern,$value,$regA)) {
					$hexbyte = $regA[1];
					$utf8 = 'U+'.$regA[2];
				}
			}

				// Lines that yielded no byte value end up as 0 here and are skipped by the >127 check
			$decval = hexdec(trim($hexbyte));
			if ($decval>127) {
				$utf8decval = hexdec(substr(trim($utf8),2)); // strip the leading "U+"
				$utf8Char = $this->UnumberToChar($utf8decval);
				$this->parsedCharsets[$charset]['local'][$decval] = $utf8Char;
				$this->parsedCharsets[$charset]['utf8'][$utf8Char] = $decval;
			}
		}
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->parsedCharsets[$charset]));
	}

	return 2;
}
893
/**
 * This function initializes all UTF-8 character data tables
 * (case folding in $this->caseFolding['utf-8'] and transliteration in $this->toASCII['utf-8']).
 * Both tables are built in one pass over the Unicode data files and both are written to their cache files.
 *
 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
 *
 * @param	string		Table the caller needs: 'case' or 'ascii'. Only selects which "already loaded"/"cached" shortcut is tried; any other value always re-parses the data files.
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initUnicodeData($mode=null) {
	// cache files
	$cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
	$cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');

	// Only process if the tables are not yet loaded
	switch($mode) {
		case 'case':
			if (is_array($this->caseFolding['utf-8'])) return 1;

			// Use cached version if possible
			if ($cacheFileCase && @is_file($cacheFileCase)) {
				$this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase));
				return 2;
			}
		break;

		case 'ascii':
			if (is_array($this->toASCII['utf-8'])) return 1;

			// Use cached version if possible
			if ($cacheFileASCII && @is_file($cacheFileASCII)) {
				$this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII));
				return 2;
			}
		break;
	}

	// process main Unicode data file
	$unicodeDataFile = PATH_t3lib.'unidata/UnicodeData.txt';
	if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) return false;

	$fh = fopen($unicodeDataFile,'r');
	if (!$fh) return false;

	// key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
	// note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
	$this->caseFolding['utf-8'] = array();
	$utf8CaseFolding =& $this->caseFolding['utf-8'];	// a shorthand
	$utf8CaseFolding['toUpper'] = array();
	$utf8CaseFolding['toLower'] = array();
	$utf8CaseFolding['toTitle'] = array();

	$decomposition = array();	// array of temp. decompositions
	$mark = array();		// array of chars that are marks (eg. composing accents)
	$number = array();		// array of chars that are numbers (eg. digits)

	while (!feof($fh)) {
		$line = fgets($fh);
		if (trim($line) == '') continue;	// blank line or failed read at EOF carries no data

		// has a lot of info
		list($char,$name,$cat,,,$decomp,,,$num,,,,$upper,$lower,$title) = explode(';', rtrim($line));

		$ord = hexdec($char);
		if ($ord > 0xFFFF) break;	// only process the BMP

		$utf8_char = $this->UnumberToChar($ord);

		if ($upper) $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
		if ($lower) $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
		// store "title" only when different from "upper" (only a few)
		if ($title && $title != $upper) $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));

		// first letter of the general category
		switch (substr($cat,0,1)) {
			case 'M':	// mark (accent, umlaut, ...)
				$mark["U+$char"] = 1;
			break;

			case 'N':	// numeric value
				if ($ord > 0x80 && $num != '') $number["U+$char"] = $num;
		}

		// accented Latin letters without "official" decomposition
		$match = array();
		if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/',$name,$match) && !$decomp) {
			$c = ord($match[2]);
			if ($match[1] == 'SMALL') $c += 32;

			$decomposition["U+$char"] = array(dechex($c));
			continue;
		}

		$match = array();
		if (preg_match('/(<.*>)? *(.+)/',$decomp,$match)) {
			switch($match[1]) {
				case '<circle>':	// add parenthesis as circle replacement, eg (1)
					$match[2] = '0028 '.$match[2].' 0029';
				break;

				case '<square>':	// add square brackets as square replacement, eg [1]
					$match[2] = '005B '.$match[2].' 005D';
				break;

				case '<compat>':	// ignore multi char decompositions that start with a space
					if (strncmp($match[2],'0020 ',5) == 0) continue 2;
				break;

				// ignore Arabic and vertical layout presentation decomposition
				case '<initial>':
				case '<medial>':
				case '<final>':
				case '<isolated>':
				case '<vertical>':
					continue 2;
			}
			$decomposition["U+$char"] = explode(' ',$match[2]);
		}
	}
	fclose($fh);

	// process additional Unicode data for casing (allow folded characters to expand into a sequence)
	$specialCasingFile = PATH_t3lib.'unidata/SpecialCasing.txt';
	if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {

		$fh = fopen($specialCasingFile,'r');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh);
				if (substr($line,0,1) != '#' && trim($line) != '') {

					list($char,$lower,$title,$upper,$cond) = t3lib_div::trimExplode(';', $line);
					// only unconditional mappings are used (condition field empty or just the trailing comment)
					if ($cond == '' || substr($cond,0,1) == '#') {
						$utf8_char = $this->UnumberToChar(hexdec($char));
						if ($char != $lower) {
							$arr = explode(' ',$lower);
							for ($i=0; isset($arr[$i]); $i++) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
							$utf8CaseFolding['toLower'][$utf8_char] = implode('',$arr);
						}
						if ($char != $title && $title != $upper) {
							$arr = explode(' ',$title);
							for ($i=0; isset($arr[$i]); $i++) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
							$utf8CaseFolding['toTitle'][$utf8_char] = implode('',$arr);
						}
						if ($char != $upper) {
							$arr = explode(' ',$upper);
							for ($i=0; isset($arr[$i]); $i++) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
							$utf8CaseFolding['toUpper'][$utf8_char] = implode('',$arr);
						}
					}
				}
			}
			fclose($fh);
		}
	}

	// custom decompositions
	$decomposition['U+00A5'] = array('0079','0065','006E');	// YEN SIGN => yen
	$decomposition['U+00A6'] = array('007C');		// BROKEN BAR => |
	$decomposition['U+00AB'] = array('003C','003C');	// LEFT-POINTING DOUBLE ANGLE QUOTATION MARK => <<
	$decomposition['U+00A9'] = array('0028','0063','0029');	// COPYRIGHT SIGN => (c)
	$decomposition['U+00AE'] = array('0028','0052','0029');	// REGISTERED SIGN => (R)
	$decomposition['U+00B1'] = array('002B','002F','002D');	// PLUS-MINUS SIGN => +/-
	$decomposition['U+00B5'] = array('0075');		// MICRO SIGN => u
	$decomposition['U+00B7'] = array('002A');		// MIDDLE DOT => *
	$decomposition['U+00BB'] = array('003E','003E');	// RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK => >>
	$decomposition['U+00C4'] = array('0041','0045');	// LATIN CAPITAL LETTER A WITH DIAERESIS => AE (German)
	$decomposition['U+00C5'] = array('0041','0041');	// LATIN CAPITAL LETTER A WITH RING ABOVE => AA (Danish)
	$decomposition['U+00C6'] = array('0041','0045');	// LATIN CAPITAL LETTER AE => AE (Danish)
	$decomposition['U+00D6'] = array('004F','0045');	// LATIN CAPITAL LETTER O WITH DIAERESIS => OE (German)
	$decomposition['U+00D7'] = array('002A');		// MULTIPLICATION SIGN => *
	$decomposition['U+00D8'] = array('004F','0045');	// LATIN CAPITAL LETTER O WITH STROKE => OE (Danish)
	$decomposition['U+00DC'] = array('0055','0045');	// LATIN CAPITAL LETTER U WITH DIAERESIS => UE (German)
	$decomposition['U+00E4'] = array('0061','0065');	// LATIN SMALL LETTER A WITH DIAERESIS => ae (German)
	$decomposition['U+00E5'] = array('0061','0061');	// LATIN SMALL LETTER A WITH RING ABOVE => aa (Danish)
	$decomposition['U+00DF'] = array('0073','0073');	// LATIN SMALL LETTER SHARP S => ss (German)
	$decomposition['U+00E6'] = array('0061','0065');	// LATIN SMALL LETTER AE => ae (Danish)
	$decomposition['U+00F6'] = array('006F','0065');	// LATIN SMALL LETTER O WITH DIAERESIS => oe (German)
	$decomposition['U+00F7'] = array('002F');		// DIVISION SIGN => /
	$decomposition['U+00F8'] = array('006F','0065');	// LATIN SMALL LETTER O WITH STROKE => oe (Danish)
	$decomposition['U+00FC'] = array('0075','0065');	// LATIN SMALL LETTER U WITH DIAERESIS => ue (German)
	$decomposition['U+0152'] = array('004F','0045');	// LATIN CAPITAL LETTER OE => OE
	$decomposition['U+0153'] = array('006F','0065');	// LATIN SMALL LETTER OE => oe
	$decomposition['U+0192'] = array('0066');		// LATIN SMALL LETTER F WITH HOOK => f
	$decomposition['U+02BC'] = array('0027');		// MODIFIER LETTER APOSTROPHE => '
	$decomposition['U+02CA'] = array('0027');		// MODIFIER LETTER ACUTE ACCENT => '
	$decomposition['U+2010'] = array('002D');		// HYPHEN => -
	$decomposition['U+2013'] = array('002D');		// EN DASH => -
	$decomposition['U+2014'] = array('002D');		// EM DASH => -
	$decomposition['U+2018'] = array('0060');		// LEFT SINGLE QUOTATION MARK => `
	$decomposition['U+2019'] = array('0027');		// RIGHT SINGLE QUOTATION MARK => '
	$decomposition['U+201C'] = array('0022');		// LEFT DOUBLE QUOTATION MARK => "
	$decomposition['U+201D'] = array('0022');		// RIGHT DOUBLE QUOTATION MARK => "
	$decomposition['U+201E'] = array('0022');		// DOUBLE LOW-9 QUOTATION MARK => "
	$decomposition['U+2022'] = array('002A');		// BULLET => *
	$decomposition['U+2039'] = array('003C');		// SINGLE LEFT-POINTING ANGLE QUOTATION MARK => <
	$decomposition['U+203A'] = array('003E');		// SINGLE RIGHT-POINTING ANGLE QUOTATION MARK => >
	$decomposition['U+2044'] = array('002F');		// FRACTION SLASH => /
	$decomposition['U+20A0'] = array('0045','0055','0052');	// EURO-CURRENCY SIGN => EUR
	$decomposition['U+20AC'] = array('0045','0055','0052');	// EURO SIGN => EUR

	// decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
	foreach($decomposition as $from => $to) {
		$code_decomp = array();

		while ($code_value = array_shift($to)) {
			if (isset($decomposition["U+$code_value"])) {	// do recursive decomposition
				foreach(array_reverse($decomposition["U+$code_value"]) as $cv) {
					array_unshift($to, $cv);
				}
			} elseif (!isset($mark["U+$code_value"])) {	// remove mark
				array_push($code_decomp, $code_value);
			}
		}
		if (count($code_decomp)) {
			$decomposition[$from] = $code_decomp;
		} else {
			unset($decomposition[$from]);
		}
	}

	// create ascii only mapping
	$this->toASCII['utf-8'] = array();
	$ascii =& $this->toASCII['utf-8'];

	foreach($decomposition as $from => $to) {
		$code_decomp = array();
		while ($code_value = array_shift($to)) {
			$ord = hexdec($code_value);
			if ($ord > 127)
				continue 2;	// skip decompositions containing non-ASCII chars
			else
				array_push($code_decomp,chr($ord));
		}
		// keys look like "U+00A5"; strip the "U+" prefix before converting the hex number
		$ascii[$this->UnumberToChar(hexdec(substr($from,2)))] = implode('',$code_decomp);
	}

	// add numeric decompositions
	foreach($number as $from => $to) {
		$utf8_char = $this->UnumberToChar(hexdec(substr($from,2)));
		if (!isset($ascii[$utf8_char])) {
			$ascii[$utf8_char] = $to;
		}
	}

	if ($cacheFileCase) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileCase,serialize($utf8CaseFolding));
	}

	if ($cacheFileASCII) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileASCII,serialize($ascii));
	}

	return 3;
}
1146
/**
 * Builds the case folding table for a charset other than UTF-8.
 * The table is derived from the UTF-8 case folding table via the charset's conversion table.
 * This function is automatically called by the case folding functions.
 *
 * @param	string		Charset for which to initialize case folding.
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initCaseFolding($charset) {
	// Nothing to do when the table is already in memory
	if (is_array($this->caseFolding[$charset])) {
		return 1;
	}

	// Try the cache file first
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

	// Both the charset conversion table and the UTF-8 case folding table are required
	if (!$this->initCharset($charset)) {
		return false;
	}
	if (!$this->initUnicodeData('case')) {
		return false;
	}

	$nochar = chr($this->noCharByteVal);
	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
		// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);

		// translate each UTF-8 mapping back into the charset; skip characters without a usable mapping
		foreach (array('toUpper','toLower','toTitle') as $direction) {
			$cc = $this->utf8_decode($this->caseFolding['utf-8'][$direction][$utf8], $charset);
			if ($cc != '' && $cc != $nochar) {
				$this->caseFolding[$charset][$direction][$c] = $cc;
			}
		}
	}

	// add the ASCII case table (a-z <-> A-Z, 32 code points apart)
	for ($offset=0; $offset<26; $offset++) {
		$lowerChar = chr(ord('a')+$offset);
		$upperChar = chr(ord('A')+$offset);
		$this->caseFolding[$charset]['toUpper'][$lowerChar] = $upperChar;
		$this->caseFolding[$charset]['toLower'][$upperChar] = $lowerChar;
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->caseFolding[$charset]));
	}

	return 3;
}
1208
/**
 * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
 * The table is derived from the UTF-8 to-ASCII table via the charset's conversion table.
 * This function is automatically called by the ASCII transliteration functions.
 *
 * @param	string		Charset for which to initialize conversion.
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initToASCII($charset) {
	// Only process if the conversion table is not yet loaded:
	if (is_array($this->toASCII[$charset])) return 1;

	// Use cached version if possible
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->toASCII[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

	// init UTF-8 conversion for this charset
	if (!$this->initCharset($charset)) {
		return false;
	}

	// UTF-8/ASCII transliteration is used as the base conversion table
	if (!$this->initUnicodeData('ascii')) {
		return false;
	}

	// Start from an empty array so that a charset without any mapping is still stored (and cached)
	// as an array; otherwise the "already loaded" check above can never succeed for it.
	$this->toASCII[$charset] = array();

	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
		// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);

		if (isset($this->toASCII['utf-8'][$utf8])) {
			$this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
		}
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->toASCII[$charset]));
	}

	return 3;
}
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270 /********************************************
1271 *
1272 * String operation functions
1273 *
1274 ********************************************/
1275
/**
 * Returns a part of a string.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Start position (character position)
 * @param	integer		Length (in characters); when omitted the rest of the string is returned
 * @return	string		The substring
 * @see substr(), mb_substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function substr($charset,$string,$start,$len=null) {
	if ($len===0) return '';

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		// cannot omit $len, when specifying charset
		if ($len==null) {
			$enc = mb_internal_encoding();	// save internal encoding
			mb_internal_encoding($charset);	// operate in the charset of the string, not a fixed one
			$str = mb_substr($string,$start);
			mb_internal_encoding($enc);	// restore internal encoding

			return $str;
		}
		return mb_substr($string,$start,$len,$charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_substr($string,$start,$len);
	} elseif ($this->eucBasedSets[$charset]) {
		return $this->euc_substr($string,$start,$charset,$len);
	} elseif ($this->twoByteSets[$charset]) {
		// an omitted length must not be multiplied (null*2 is 0 and would yield an empty string)
		return $len === null ? substr($string,$start*2) : substr($string,$start*2,$len*2);
	} elseif ($this->fourByteSets[$charset]) {
		return $len === null ? substr($string,$start*4) : substr($string,$start*4,$len*4);
	}

	// treat everything else as single-byte encoding
	return $len === null ? substr($string,$start) : substr($string,$start,$len);
}
1315
/**
 * Returns the length of a string measured in characters (not bytes).
 * Unit-tested by Kasper (single byte charsets only)
 *
 * @param	string		The character set
 * @param	string		Character string
 * @return	integer		The number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strlen($charset,$string) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strlen($string,$charset);
	}
	if ($charset == 'utf-8') {
		return $this->utf8_strlen($string);
	}
	if ($this->eucBasedSets[$charset]) {
		return $this->euc_strlen($string,$charset);
	}

	// fixed-width encodings: byte count divided by the width of one character
	$byteLength = strlen($string);
	if ($this->twoByteSets[$charset]) {
		return $byteLength/2;
	}
	if ($this->fourByteSets[$charset]) {
		return $byteLength/4;
	}

	// treat everything else as single-byte encoding
	return $byteLength;
}
1341
/**
 * Shortens a string to a number of characters and adds a crop signifier.
 * A positive length keeps the beginning of the string (signifier appended),
 * a negative length keeps the end of the string (signifier prepended).
 * Unit tested by Kasper
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Length (in characters); negative values count from the end, 0 returns the string unchanged
 * @param	string		Crop signifier
 * @return	string		The shortened string
 * @see substr(), mb_strimwidth()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function crop($charset,$string,$len,$crop='') {
	if (intval($len) == 0) {
		return $string;
	}

	// Translate the character count into a byte offset
	if ($charset == 'utf-8') {
		$cutAt = $this->utf8_char2byte_pos($string,$len);
	} elseif ($this->eucBasedSets[$charset]) {
		$cutAt = $this->euc_char2byte_pos($string,$len,$charset);
	} elseif ($len > 0) {
		// everything else is handled as one byte per character
		$cutAt = $len;
	} else {
		$cutAt = strlen($string)+$len;
		if ($cutAt<=0) {
			$cutAt = false;
		}
	}

	// $len outside actual string length
	if ($cutAt === false) {
		return $string;
	}

	if ($len > 0) {
		// only crop when there is something after the cut position
		if (isset($string[$cutAt])) {
			return substr($string,0,$cutAt).$crop;
		}
	} elseif (isset($string[$cutAt-1])) {
		// only crop when there is something before the cut position
		return $crop.substr($string,$cutAt);
	}

	return $string;
}
1395
/**
 * Cuts a string short at a given byte length.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		The byte length; values <= 0 yield an empty string
 * @return	string		The shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strtrunc($charset,$string,$len) {
	if ($len <= 0) return '';

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strcut($string,0,$len,$charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_strtrunc($string,$len);
	} elseif ($this->eucBasedSets[$charset]) {
		// euc_strtrunc() expects (string, byte length, charset)
		return $this->euc_strtrunc($string,$len,$charset);
	} elseif ($this->twoByteSets[$charset]) {
		if ($len % 2) $len--;	// don't cut at odd positions
	} elseif ($this->fourByteSets[$charset]) {
		$x = $len % 4;
		$len -= $x;	// realign to position dividable by four
	}
	// treat everything else as single-byte encoding
	return substr($string,0,$len);
}
1424
/**
 * Translates all characters of a string into their respective case values.
 * Unlike strtolower() and strtoupper() this method is locale independent.
 * Note that the string length may change!
 * eg. lower case German "ß" (sharp S) becomes upper case "SS"
 * Unit-tested by Kasper
 * Real case folding is language dependent, this method ignores this fact.
 *
 * @param	string		Character set of string
 * @param	string		Input string to convert case for
 * @param	string		Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
 * @return	string		The converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper()
 */
function conv_case($charset,$string,$case) {
	// the mbstring case functions are only used from PHP 4.3 on (version check kept from the original code)
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring' &&
		version_compare(phpversion(),'4.3.0','>=')) {
		if ($case == 'toLower') {
			return mb_strtolower($string,$charset);
		} else {
			return mb_strtoupper($string,$charset);
		}
	} elseif ($charset == 'utf-8') {
		return $this->utf8_conv_case($string,$case);
	} elseif (isset($this->eucBasedSets[$charset])) {
		return $this->euc_conv_case($string,$case,$charset);
	}

	// treat everything else as single-byte encoding
	if (!$this->initCaseFolding($charset)) return $string;	// do nothing
	$out = '';
	$caseConv =& $this->caseFolding[$charset][$case];

	for($i=0; isset($string[$i]); $i++) {
		$c = $string[$i];
		// characters without an entry in the folding table are copied unchanged
		if (isset($caseConv[$c])) {
			$out .= $caseConv[$c];
		} else {
			$out .= $c;
		}
	}

	// is a simple strtr() faster or slower than the code above?
	// perhaps faster for small single-byte tables but slower for large multi-byte tables?
	//
	// return strtr($string,$this->caseFolding[$charset][$case]);

	return $out;
}
1476
/**
 * Replaces special chars (like ÆØÅæøå, umlauts etc) with their ASCII equivalents (often two characters, like æ => ae).
 *
 * @param	string		Character set of string
 * @param	string		Input string to convert
 * @return	string		The converted string
 */
function specCharsToASCII($charset,$string) {
	// multi-byte charsets have their own implementations
	if ($charset == 'utf-8') {
		return $this->utf8_toASCII($string);
	}
	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_toASCII($string,$charset);
	}

	// treat everything else as single-byte encoding
	if (!$this->initToASCII($charset)) {
		return $string;	// no table available: leave the string untouched
	}

	$table =& $this->toASCII[$charset];
	$result = '';
	$pos = 0;
	while (isset($string[$pos])) {
		$byte = $string[$pos];
		// bytes without a transliteration are copied as they are
		$result .= isset($table[$byte]) ? $table[$byte] : $byte;
		$pos++;
	}

	return $result;
}
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519 /********************************************
1520 *
1521 * Internal UTF-8 string operation functions
1522 *
1523 ********************************************/
1524
/**
 * Returns a part of a UTF-8 string.
 * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
 *
 * @param	string		UTF-8 string
 * @param	integer		Start position (character position)
 * @param	integer		Length (in characters)
 * @return	string		The substring; FALSE if a positive $start lies outside the string
 * @see substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_substr($str,$start,$len=null) {
	// a length of zero (integer or string "0") always gives an empty string
	if ($len !== null && !strcmp($len,'0')) return '';

	$byte_start = $this->utf8_char2byte_pos($str,$start);
	if ($byte_start === false) {
		if ($start > 0) {
			return false;	// $start outside string length
		}
		// zero or negative start that could not be resolved: start at the beginning of the string
		$byte_start = 0;
	}

	$str = substr($str,$byte_start);

	if ($len!=null) {
		$byte_end = $this->utf8_char2byte_pos($str,$len);
		if ($byte_end === false)	// $len outside actual string length
			return $len<0 ? '' : $str;	// When length is less than zero and exceeds, then we return blank string.
		else
			return substr($str,0,$byte_end);
	}

	return $str;
}
1559
/**
 * Returns the number of characters in a UTF-8 string.
 * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
 *
 * @param	string		UTF-8 multibyte character string
 * @return	integer		The number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strlen($str) {
	$chars = 0;
	$pos = 0;
	while (isset($str[$pos])) {
		// Count single bytes (0xxxxxxx) and multi-byte start bytes (11xxxxxx),
		// i.e. everything that is not a continuation byte (10xxxxxx)
		if ((ord($str[$pos]) & 0xC0) != 0x80) {
			$chars++;
		}
		$pos++;
	}
	return $chars;
}
1580
/**
 * Truncates a string in UTF-8 short at a given byte length.
 * A multi-byte character that would be cut in half is dropped completely.
 *
 * @param	string		UTF-8 multibyte character string
 * @param	integer		the byte length
 * @return	string		the shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strtrunc($str,$len) {
	if ($len <= 0) return '';
	if (!isset($str[$len])) return $str;	// string is not longer than $len bytes, nothing to cut

	$i = $len-1;
	if (ord($str[$i]) & 0x80) {	// part of a multibyte sequence
		for (; $i>0 && !(ord($str[$i]) & 0x40); $i--) ;	// find the first byte
		if (!(ord($str[$i]) & 0x40)) return '';	// sanity check: no start byte found at all
		for ($bc=0, $mbs=ord($str[$i]); $mbs & 0x80; $mbs = $mbs << 1) $bc++;	// calculate number of bytes
		if ($bc+$i > $len) return substr($str,0,$i);	// character does not fit, cut before it
		// fallthru: multibyte char fits into length
	}
	return substr($str,0,$len);
}
1601
/**
 * Find position of first occurrence of a string, both arguments are in UTF-8.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 string to search for
 * @param	integer		Position to start the search (character position)
 * @return	integer		The character position; FALSE if the needle is not found or the offset is beyond the string
 * @see strpos()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strpos($haystack,$needle,$offset=0) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		// third argument is the offset, the encoding comes fourth
		return mb_strpos($haystack,$needle,$offset,'utf-8');
	}

	$byte_offset = $this->utf8_char2byte_pos($haystack,$offset);
	if ($byte_offset === false) return false;	// offset beyond string length

	$byte_pos = strpos($haystack,$needle,$byte_offset);
	if ($byte_pos === false) return false;	// needle not found

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
1625
/**
 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 character to search for (single character)
 * @return	integer		The character position; FALSE if the needle is not found
 * @see strrpos()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strrpos($haystack,$needle) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		// The position of the encoding argument of mb_strrpos() differs between PHP versions
		// (newer versions expect an offset as third argument), so the internal encoding is
		// switched temporarily instead - the same technique substr() uses in this class.
		$enc = mb_internal_encoding();	// save internal encoding
		mb_internal_encoding('utf-8');
		$pos = mb_strrpos($haystack,$needle);
		mb_internal_encoding($enc);	// restore internal encoding

		return $pos;
	}

	$byte_pos = strrpos($haystack,$needle);
	if ($byte_pos === false) return false;	// needle not found

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
1645
/**
 * Translates a character position into an 'absolute' byte position.
 * Unit tested by Kasper.
 *
 * @param	string		UTF-8 string
 * @param	integer		Character position (negative values start from the end)
 * @return	integer		Byte position; FALSE if the position lies outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char2byte_pos($str,$pos) {
	$n = 0;			// number of characters found
	$p = abs($pos);		// number of characters wanted

	if ($pos >= 0) {
		$i = 0;
		$d = 1;
	} else {
		$i = strlen($str)-1;
		$d = -1;
	}

	// The explicit $i>=0 test is needed because newer PHP versions accept negative string
	// offsets (counted from the end), which would let the backward scan wrap around.
	for( ; $i>=0 && isset($str[$i]) && $n<$p; $i+=$d) {
		// count single bytes (0xxxxxxx) and multi-byte starting bytes (11xxxxxx)
		if ((ord($str[$i]) & 0xC0) != 0x80) {
			$n++;
		}
	}
	if ($i<0 || !isset($str[$i])) return false;	// offset beyond string length

	if ($pos >= 0) {
		// skip trailing multi-byte data bytes (10xxxxxx)
		while (isset($str[$i]) && (ord($str[$i]) & 0xC0) == 0x80) { $i++; }
	} else {
		// correct offset
		$i++;
	}

	return $i;
}
1686
/**
 * Translates an 'absolute' byte position into a character position.
 * Unit tested by Kasper.
 *
 * @param	string		UTF-8 string
 * @param	integer		byte position
 * @return	integer		character position; FALSE if the byte position lies outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_byte2char_pos($str,$pos) {
	// Check the requested position itself before counting; bytes outside the string
	// must not be counted as characters.
	if ($pos < 0 || !isset($str[$pos])) return false;	// offset beyond string length

	$n = 0;	// number of characters
	for($i=$pos; $i>0; $i--) {
		// count single bytes (0xxxxxxx) and multi-byte starting bytes (11xxxxxx)
		if ((ord($str[$i]) & 0xC0) != 0x80) {
			$n++;
		}
	}

	return $n;
}
1709
/**
 * Translates all characters of an UTF-8 string into their respective case values.
 * Unit-tested by Kasper
 *
 * @param	string		UTF-8 string
 * @param	string		conversion: 'toLower' or 'toUpper'
 * @return	string		the converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper(), mb_convert_case()
 */
function utf8_conv_case($str,$case) {
	if (!$this->initUnicodeData('case')) return $str;	// do nothing

	$out = '';
	$caseConv =& $this->caseFolding['utf-8'][$case];

	for($i=0; isset($str[$i]); $i++) {
		$c = ord($str[$i]);
		if (($c & 0xC0) == 0xC0) {	// multi-byte starting byte (11xxxxxx)
			for ($bc=0; $c & 0x80; $c = $c << 1) { $bc++; }	// calculate number of bytes
			$mbc = substr($str,$i,$bc);
			$i += $bc-1;
		} else {
			// single-byte (0xxxxxxx); a stray continuation byte (10xxxxxx) is passed through
			// as it is instead of repeating the previously processed character
			$mbc = $str[$i];
		}

		if (isset($caseConv[$mbc])) {
			$out .= $caseConv[$mbc];
		} else {
			$out .= $mbc;
		}
	}

	return $out;
}
1745
/**
 * Converts chars with accents, umlauts or composed to ASCII equivalents.
 *
 * @param	string		Input string to convert
 * @return	string		The converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_toASCII($str) {
	if (!$this->initUnicodeData('ascii')) return $str;	// do nothing

	$out = '';
	$toASCII =& $this->toASCII['utf-8'];

	for($i=0; isset($str[$i]); $i++) {
		$c = ord($str[$i]);
		if (($c & 0xC0) == 0xC0) {	// multi-byte starting byte (11xxxxxx)
			for ($bc=0; $c & 0x80; $c = $c << 1) { $bc++; }	// calculate number of bytes
			$mbc = substr($str,$i,$bc);
			$i += $bc-1;
		} else {
			// single-byte (0xxxxxxx); a stray continuation byte (10xxxxxx) is passed through
			// as it is instead of repeating the previously processed character
			$mbc = $str[$i];
		}

		if (isset($toASCII[$mbc])) {
			$out .= $toASCII[$mbc];
		} else {
			$out .= $mbc;
		}
	}

	return $out;
}
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795 /********************************************
1796 *
1797 * Internal EUC string operation functions
1798 *
1799 * Extended Unix Code:
1800 * ASCII compatible 7bit single bytes chars
1801 * 8bit two byte chars
1802 *
1803 * Shift-JIS is treated as a special case.
1804 *
1805 ********************************************/
1806
/**
 * Cuts a string in the EUC charset family short at a given byte length.
 * A double-byte character that would be cut in half is dropped completely.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		the byte length
 * @param	string		the charset
 * @return	string		the shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strtrunc($str,$len,$charset) {
	if ($len <= 0) return '';

	$sjis = ($charset == 'shift_jis');
	for ($i=0; isset($str[$i]) && $i<$len; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++;	// advance a double-byte char
		}
		else {
			if ($c >= 0x80) $i++;	// advance a double-byte char
		}
	}

	// This test has to come first: when the last double-byte character overshoots $len
	// at the very end of the string, $i also points behind the string.
	if ($i>$len) return substr($str,0,$len-1);	// we ended on a first byte

	if (!isset($str[$i])) return $str;	// string not longer than supplied length

	return substr($str,0,$len);
}
1835
/**
 * Returns a part of a string in the EUC charset family.
 * Character positions are translated to byte positions with
 * euc_char2byte_pos() and the string is then cut byte-wise.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		start position (character position)
 * @param	string		the charset
 * @param	integer		length (in characters); null means "up to the end of the string"
 * @return	string		the substring, or false if $start could not be resolved to a byte position
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_substr($str,$start,$charset,$len=null) {
	$byte_start = $this->euc_char2byte_pos($str,$start,$charset);
	if ($byte_start === false) return false;	// $start outside string length

	$str = substr($str,$byte_start);

		// An explicit length of zero yields an empty string, as substr() does.
		// The loose null test below cannot tell 0 from "no length given".
	if ($len !== null && (string)$len === '0') return '';

	if ($len == null) return $str;	// no length given: rest of the string

	$byte_end = $this->euc_char2byte_pos($str,$len,$charset);
	if ($byte_end === false) {	// $len outside actual string length
		return $str;
	}
	return substr($str,0,$byte_end);
}
1861
/**
 * Counts the number of characters of a string in the EUC charset family.
 * Bytes recognized as the first byte of a double-byte character make the
 * scan skip the following byte, so such a pair is counted once.
 *
 * @param	string		EUC multibyte character string
 * @param	string		the charset ('shift_jis' is treated specially, everything else as EUC)
 * @return	integer		the number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strlen($str,$charset) {
	$sjis = ($charset == 'shift_jis');
	$n = 0;

	for ($i=0; isset($str[$i]); $i++) {
		$c = ord($str[$i]);

			// Shift-JIS has single-byte chars in the 8bit range (0xA0-0xDF);
			// for the other charsets every 8bit byte starts a double-byte char.
		if ($sjis) {
			$isLeadByte = (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0));
		} else {
			$isLeadByte = ($c >= 0x80);
		}
		if ($isLeadByte) $i++;	// advance a double-byte char

		$n++;
	}

	return $n;
}
1888
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * Negative positions count characters from the end of the string. They are
 * resolved by counting all characters first and then scanning forward, because
 * the first byte of a double-byte char can only be recognized reliably when
 * reading from the start (a Shift-JIS second byte may look like a single-byte char).
 *
 * @param	string		EUC multibyte character string
 * @param	integer		character position (negative values start from the end)
 * @param	string		the charset ('shift_jis' is treated specially, everything else as EUC)
 * @return	integer		byte position of the first byte of that character; false if the position lies outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char2byte_pos($str,$pos,$charset) {
	$sjis = ($charset == 'shift_jis');

	if ($pos < 0) {
			// Count the characters of the whole string ...
		$total = 0;
		for ($i=0; isset($str[$i]); $i++) {
			$c = ord($str[$i]);
			if ($sjis) {
				if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++;	// advance a double-byte char
			}
			else {
				if ($c >= 0x80) $i++;	// advance a double-byte char
			}
			$total++;
		}

			// ... and turn the offset from the end into an offset from the start.
		$p = $total + $pos;
		if ($p < 0) return false;	// offset before the start of the string
	} else {
		$p = $pos;	// number of characters wanted
	}

	$n = 0;	// number of characters seen
	for ($i=0; isset($str[$i]) && $n<$p; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++;	// advance a double-byte char
		}
		else {
			if ($c >= 0x80) $i++;	// advance a double-byte char
		}

		$n++;
	}
	if (!isset($str[$i])) return false;	// offset beyond string length

	return $i;
}
1928
/**
 * Translates all characters of a string in the EUC charset family into their respective case values.
 * Each single- or double-byte character is looked up in the case folding
 * table of the charset; characters without an entry are copied unchanged.
 *
 * @param	string		EUC multibyte character string
 * @param	string		conversion: 'toLower' or 'toUpper'
 * @param	string		the charset ('shift_jis' is treated specially, everything else as EUC)
 * @return	string		the converted string; the unchanged input if the case folding data could not be initialized
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper(), mb_convert_case()
 */
function euc_conv_case($str,$case,$charset) {
	if (!$this->initCaseFolding($charset)) return $str;	// do nothing

	$sjis = ($charset == 'shift_jis');
	$out = '';
	$caseConv =& $this->caseFolding[$charset][$case];

	for ($i=0; isset($str[$i]); $i++) {
		$mbc = $str[$i];
		$c = ord($mbc);

			// Shift-JIS has single-byte chars in the 8bit range (0xA0-0xDF);
			// for the other charsets every 8bit byte starts a double-byte char.
		if ($sjis) {
			$isLeadByte = (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0));
		} else {
			$isLeadByte = ($c >= 0x80);
		}

		if ($isLeadByte) {	// a double-byte char
			$mbc = substr($str,$i,2);
			$i++;
		}

		if (isset($caseConv[$mbc])) {
			$out .= $caseConv[$mbc];
		} else {
			$out .= $mbc;
		}
	}

	return $out;
}
1971
/**
 * Converts chars with accents, umlauts or composed to ASCII equivalents.
 * Each single- or double-byte character is looked up in the to-ASCII table
 * of the charset; characters without an entry are copied unchanged.
 *
 * @param	string		Input string to convert
 * @param	string		The charset ('shift_jis' is treated specially, everything else as EUC)
 * @return	string		The converted string; the unchanged input if the mapping data could not be initialized
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_toASCII($str,$charset) {
	if (!$this->initToASCII($charset)) return $str;	// do nothing

	$sjis = ($charset == 'shift_jis');
	$out = '';
	$toASCII =& $this->toASCII[$charset];

	for ($i=0; isset($str[$i]); $i++) {
		$mbc = $str[$i];
		$c = ord($mbc);

			// Shift-JIS has single-byte chars in the 8bit range (0xA0-0xDF);
			// for the other charsets every 8bit byte starts a double-byte char.
		if ($sjis) {
			$isLeadByte = (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0));
		} else {
			$isLeadByte = ($c >= 0x80);
		}

		if ($isLeadByte) {	// a double-byte char
			$mbc = substr($str,$i,2);
			$i++;
		}

		if (isset($toASCII[$mbc])) {
			$out .= $toASCII[$mbc];
		} else {
			$out .= $mbc;
		}
	}

	return $out;
}
2012
2013 }
2014
// Include the file registered under the 'XCLASS' key for this script, if any.
// !empty() keeps the original truthiness test but does not raise a notice or
// warning when no entry has been configured.
if (defined('TYPO3_MODE') && !empty($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php'])) {
	include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
}
2018 ?>