Added euc-kr/cp949.
[Packages/TYPO3.CMS.git] / t3lib / class.t3lib_cs.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2003-2004 Kasper Skaarhoj (kasper@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the Typo3 project. The Typo3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 *
17 * This script is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * This copyright notice MUST APPEAR in all copies of the script!
23 ***************************************************************/
24 /**
25 * Class for conversion between charsets.
26 *
27 * $Id$
28 *
29 * @author Kasper Skaarhoj <kasper@typo3.com>
30 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
31 */
32 /**
33 * [CLASS/FUNCTION INDEX of SCRIPT]
34 *
35 *
36 *
37 * 128: class t3lib_cs
38 * 442: function parse_charset($charset)
39 * 460: function get_locale_charset($locale)
40 * 492: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
41 * 529: function utf8_encode($str,$charset)
42 * 576: function utf8_decode($str,$charset,$useEntityForNoChar=0)
43 * 619: function utf8_to_entities($str)
44 * 652: function entities_to_utf8($str,$alsoStdHtmlEnt=0)
45 * 686: function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
46 * 736: function UnumberToChar($cbyte)
47 * 781: function utf8CharToUnumber($str,$hex=0)
48 *
49 * SECTION: Init functions
50 * 824: function initCharset($charset)
51 * 885: function initCaseFoldingUTF8()
52 * 973: function initCaseFolding($charset)
53 *
54 * SECTION: String operation functions
55 * 1058: function substr($charset,$string,$start,$len=null)
56 * 1096: function strlen($charset,$string)
57 * 1124: function crop($charset,$string,$len,$crop='')
58 * 1165: function strtrunc($charset,$string,$len)
59 * 1197: function conv_case($charset,$string,$case)
60 *
61 * SECTION: Internal UTF-8 string operation functions
62 * 1264: function utf8_substr($str,$start,$len=null)
63 * 1297: function utf8_strlen($str)
64 * 1318: function utf8_strtrunc($str,$len)
65 * 1340: function utf8_strpos($haystack,$needle,$offset=0)
66 * 1363: function utf8_strrpos($haystack,$needle)
67 * 1383: function utf8_char2byte_pos($str,$pos)
68 * 1424: function utf8_byte2char_pos($str,$pos)
69 * 1448: function utf8_conv_case($str,$case)
70 *
71 * SECTION: Internal EUC string operation functions
72 * 1514: function euc_strtrunc($str,$len,$charset)
73 * 1543: function euc_substr($str,$start,$charset,$len=null)
74 * 1568: function euc_strlen($str,$charset)
75 * 1595: function euc_char2byte_pos($str,$pos,$charset)
76 * 1636: function euc_conv_case($str,$case,$charset)
77 *
78 * TOTAL FUNCTIONS: 31
79 * (This index is automatically created/updated by the extension "extdeveval")
80 *
81 */
82
83
84
85
86
87
88
89
90 /**
91 * Notes on UTF-8
92 *
93 * Functions working on UTF-8 strings:
94 *
95 * - strchr/strstr
96 * - strrchr
97 * - substr_count
98 * - implode/explode/join
99 *
100 * Functions nearly working on UTF-8 strings:
101 *
102 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
103 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained 7-bit ASCII
104 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
105 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
106 *
107 * Functions NOT working on UTF-8 strings:
108 *
109 * - str*cmp
110 * - stristr
111 * - stripos
112 * - substr
113 * - strrev
114 * - ereg/eregi
115 * - split/spliti
116 * - preg_*
117 * - ...
118 *
119 */
120 /**
121 * Class for conversion between charsets
122 *
123 * @author Kasper Skaarhoj <kasper@typo3.com>
124 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
125 * @package TYPO3
126 * @subpackage t3lib
127 */
128 class t3lib_cs {
129 var $noCharByteVal=63; // ASCII Value for chars with no equivalent.
130
131 // This is the array where parsed conversion tables are stored (cached)
132 var $parsedCharsets=array();
133
134 // An array where case folding data will be stored (cached)
135 var $caseFolding=array();
136
137 // An array where charset-to-ASCII mappings are stored (cached)
138 var $toASCII=array();
139
140 // This tells the converter which charsets has two bytes per char:
141 var $twoByteSets=array(
142 'ucs-2'=>1, // 2-byte Unicode
143 );
144
145 // This tells the converter which charsets has four bytes per char:
146 var $fourByteSets=array(
147 'ucs-4'=>1, // 4-byte Unicode
148 'utf-32'=>1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
149 );
150
151 // This tells the converter which charsets use a scheme like the Extended Unix Code:
152 var $eucBasedSets=array(
153 'gb2312'=>1, // Chinese, simplified.
154 'big5'=>1, // Chinese, traditional.
155 'euc-kr'=>1, // Korean
156 'shift_jis'=>1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
157 );
158
159 // see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
160 // http://czyborra.com/charsets/iso8859.html
161 var $synonyms=array(
162 'us' => 'ascii',
163 'us-ascii'=> 'ascii',
164 'cp819' => 'iso-8859-1',
165 'ibm819' => 'iso-8859-1',
166 'iso-ir-100' => 'iso-8859-1',
167 'iso-ir-109' => 'iso-8859-2',
168 'iso-ir-148' => 'iso-8859-9',
169 'iso-ir-199' => 'iso-8859-14',
170 'iso-ir-203' => 'iso-8859-15',
171 'csisolatin1' => 'iso-8859-1',
172 'csisolatin2' => 'iso-8859-2',
173 'csisolatin3' => 'iso-8859-3',
174 'csisolatin5' => 'iso-8859-9',
175 'csisolatin8' => 'iso-8859-14',
176 'csisolatin9' => 'iso-8859-15',
177 'csisolatingreek' => 'iso-8859-7',
178 'iso-celtic' => 'iso-8859-14',
179 'latin1' => 'iso-8859-1',
180 'latin2' => 'iso-8859-2',
181 'latin3' => 'iso-8859-3',
182 'latin5' => 'iso-8859-9',
183 'latin6' => 'iso-8859-10',
184 'latin8' => 'iso-8859-14',
185 'latin9' => 'iso-8859-15',
186 'l1' => 'iso-8859-1',
187 'l2' => 'iso-8859-2',
188 'l3' => 'iso-8859-3',
189 'l5' => 'iso-8859-9',
190 'l6' => 'iso-8859-10',
191 'l8' => 'iso-8859-14',
192 'l9' => 'iso-8859-15',
193 'cyrillic' => 'iso-8859-5',
194 'arabic' => 'iso-8859-6',
195 'tis-620' => 'iso-8859-11',
196 'win874' => 'windows-874',
197 'win1250' => 'windows-1250',
198 'win1251' => 'windows-1251',
199 'win1252' => 'windows-1252',
200 'win1253' => 'windows-1253',
201 'win1254' => 'windows-1254',
202 'win1255' => 'windows-1255',
203 'win1256' => 'windows-1256',
204 'win1257' => 'windows-1257',
205 'win1258' => 'windows-1258',
206 'cp1250' => 'windows-1250',
207 'cp1251' => 'windows-1251',
208 'cp1252' => 'windows-1252',
209 'ms-ee' => 'windows-1250',
210 'ms-ansi' => 'windows-1252',
211 'ms-greek' => 'windows-1253',
212 'ms-turk' => 'windows-1254',
213 'winbaltrim' => 'windows-1257',
214 'koi-8ru' => 'koi-8r',
215 'koi8r' => 'koi-8r',
216 'cp878' => 'koi-8r',
217 'mac' => 'macroman',
218 'macintosh' => 'macroman',
219 'euc-cn' => 'gb2312',
220 'x-euc-cn' => 'gb2312',
221 'euccn' => 'gb2312',
222 'cp936' => 'gb2312',
223 'big-5' => 'big5',
224 'cp950' => 'big5',
225 'eucjp' => 'euc-jp',
226 'sjis' => 'shift_jis',
227 'shift-jis' => 'shift_jis',
228 'cp932' => 'shift_jis',
229 'cp949' => 'euc-kr',
230 'utf7' => 'utf-7',
231 'utf8' => 'utf-8',
232 'utf16' => 'utf-16',
233 'utf32' => 'utf-32',
234 'utf8' => 'utf-8',
235 'ucs2' => 'ucs-2',
236 'ucs4' => 'ucs-4',
237 );
238
239 // mapping of iso-639:2 language codes to language (family) names
240 var $lang_to_langfamily=array(
241 // iso-639:2 language codes, see:
242 // http://www.w3.org/WAI/ER/IG/ert/iso639.htm
243 // http://www.unicode.org/onlinedat/languages.html
244 'ar' => 'arabic',
245 'bg' => 'cyrillic',
246 'cs' => 'east_european',
247 'da' => 'west_european',
248 'de' => 'west_european',
249 'es' => 'west_european',
250 'et' => 'estonian',
251 'eu' => 'west_european',
252 'fi' => 'west_european',
253 'fr' => 'west_european',
254 'gr' => 'greek',
255 'hr' => 'east_european',
256 'hu' => 'east_european',
257 'iw' => 'hebrew',
258 'is' => 'west_european',
259 'it' => 'west_european',
260 'ja' => 'japanese',
261 'kl' => 'west_european',
262 'ko' => 'korean',
263 'lt' => 'lithuanian',
264 'lv' => 'west_european', // Latvian/Lettish
265 'nl' => 'west_european',
266 'no' => 'west_european',
267 'pl' => 'east_european',
268 'pt' => 'west_european',
269 'ro' => 'east_european',
270 'ru' => 'cyrillic',
271 'sk' => 'east_european',
272 'sl' => 'east_european',
273 'sv' => 'west_european',
274 'th' => 'thai',
275 'uk' => 'cyrillic',
276 'vi' => 'vietnamese',
277 'zh' => 'chinese',
278 // MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
279 'chs' => 'simpl_chinese',
280 'cht' => 'trad_chinese',
281 'csy' => 'east_european',
282 'dan' => 'west_european',
283 'deu' => 'west_european',
284 'dea' => 'west_european',
285 'des' => 'west_european',
286 'ena' => 'west_european',
287 'enc' => 'west_european',
288 'eng' => 'west_european',
289 'enz' => 'west_european',
290 'enu' => 'west_european',
291 'nld' => 'west_european',
292 'nlb' => 'west_european',
293 'fin' => 'west_european',
294 'fra' => 'west_european',
295 'frb' => 'west_european',
296 'frc' => 'west_european',
297 'frs' => 'west_european',
298 'ell' => 'greek',
299 'hun' => 'east_european',
300 'isl' => 'west_euorpean',
301 'ita' => 'west_european',
302 'its' => 'west_european',
303 'jpn' => 'japanese',
304 'kor' => 'korean',
305 'nor' => 'west_european',
306 'non' => 'west_european',
307 'plk' => 'east_european',
308 'ptg' => 'west_european',
309 'ptb' => 'west_european',
310 'rus' => 'east_european',
311 'sky' => 'east_european',
312 'esp' => 'west_european',
313 'esm' => 'west_european',
314 'esn' => 'west_european',
315 'sve' => 'west_european',
316 'trk' => 'turkish',
317 // English language names
318 'bulgarian' => 'east_european',
319 'catalan' => 'west_european',
320 'croatian' => 'east_european',
321 'czech' => 'east_european',
322 'danish' => 'west_european',
323 'dutch' => 'west_european',
324 'english' => 'west_european',
325 'finnish' => 'west_european',
326 'french' => 'west_european',
327 'galician' => 'west_european',
328 'german' => 'west_european',
329 'hungarian' => 'east_european',
330 'icelandic' => 'west_european',
331 'italian' => 'west_european',
332 'latvian' => 'west_european',
333 'lettish' => 'west_european',
334 'norwegian' => 'west_european',
335 'polish' => 'east_european',
336 'portuguese' => 'west_european',
337 'russian' => 'cyrillic',
338 'romanian' => 'east_european',
339 'slovak' => 'east_european',
340 'slovenian' => 'east_european',
341 'spanish' => 'west_european',
342 'svedish' => 'west_european',
343 'turkish' => 'east_european',
344 'ukrainian' => 'cyrillic',
345 );
346
347 // mapping of language (family) names to charsets on Unix
348 var $lang_to_charset_unix=array(
349 'west_european' => 'iso-8859-1',
350 'estonian' => 'iso-8859-1',
351 'east_european' => 'iso-8859-2',
352 'baltic' => 'iso-8859-4',
353 'cyrillic' => 'iso-8859-5',
354 'arabic' => 'iso-8859-6',
355 'greek' => 'iso-8859-7',
356 'hebrew' => 'iso-8859-8',
357 'turkish' => 'iso-8859-9',
358 'thai' => 'iso-8859-11', // = TIS-620
359 'lithuanian' => 'iso-8859-13',
360 'chinese' => 'gb2312', // = euc-cn
361 'japanese' => 'euc-jp',
362 'korean' => 'euc-kr',
363 'simpl_chinese' => 'gb2312',
364 'trad_chinese' => 'big5',
365 'vietnamese' => '',
366 );
367
368 // mapping of language (family) names to charsets on Windows
369 var $lang_to_charset_windows=array(
370 'east_european' => 'windows-1250',
371 'cyrillic' => 'windows-1251',
372 'west_european' => 'windows-1252',
373 'greek' => 'windows-1253',
374 'turkish' => 'windows-1254',
375 'hebrew' => 'windows-1255',
376 'arabic' => 'windows-1256',
377 'baltic' => 'windows-1257',
378 'estonian' => 'windows-1257',
379 'lithuanian' => 'windows-1257',
380 'vietnamese' => 'windows-1258',
381 'thai' => 'cp874',
382 'korean' => 'cp949',
383 'chinese' => 'gb2312',
384 'japanese' => 'shift_jis',
385 'simpl_chinese' => 'gb2312',
386 'trad_chinese' => 'big5',
387 );
388
389 // mapping of locale names to charsets
390 var $locale_to_charset=array(
391 'japanese.euc' => 'euc-jp',
392 'ja_jp.ujis' => 'euc-jp',
393 'korean.euc' => 'euc-kr',
394 'zh_cn' => 'gb2312',
395 'zh_hk' => 'big5',
396 'zh_tw' => 'big5',
397 );
398
399 // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
400 // Empty values means "iso-8859-1"
401 var $charSetArray = array(
402 'dk' => '',
403 'de' => '',
404 'no' => '',
405 'it' => '',
406 'fr' => '',
407 'es' => '',
408 'nl' => '',
409 'cz' => 'windows-1250',
410 'pl' => 'iso-8859-2',
411 'si' => 'windows-1250',
412 'fi' => '',
413 'tr' => 'iso-8859-9',
414 'se' => '',
415 'pt' => '',
416 'ru' => 'windows-1251',
417 'ro' => 'iso-8859-2',
418 'ch' => 'gb2312',
419 'sk' => 'windows-1250',
420 'lt' => 'windows-1257',
421 'is' => 'utf-8',
422 'hr' => 'windows-1250',
423 'hu' => 'iso-8859-2',
424 'gl' => '',
425 'th' => 'iso-8859-11',
426 'gr' => 'iso-8859-7',
427 'hk' => 'big5',
428 'eu' => '',
429 'bg' => 'windows-1251',
430 'br' => '',
431 'et' => 'iso-8859-4',
432 'ar' => 'iso-8859-6',
433 'he' => 'utf-8',
434 'ua' => 'windows-1251',
435 'jp' => 'shift_jis',
436 'lv' => 'utf-8',
437 'vn' => 'utf-8',
438 'ca' => 'iso-8859-15',
439 'ba' => 'iso-8859-2',
440 'kr' => 'euc-kr',
441 );
442
443 // TYPO3 specific: Array with the iso names used for each system language in TYPO3:
444 // Empty values mean that the ISO name is the same as the TYPO3 key
445 var $isoArray = array(
446 'dk' => 'da',
447 'de' => '',
448 'no' => '',
449 'it' => '',
450 'fr' => '',
451 'es' => '',
452 'nl' => '',
453 'cz' => 'cs',
454 'pl' => '',
455 'si' => 'sl',
456 'fi' => '',
457 'tr' => '',
458 'se' => 'sv',
459 'pt' => '',
460 'ru' => '',
461 'ro' => '',
462 'ch' => 'zh_CN',
463 'sk' => '',
464 'lt' => '',
465 'is' => '',
466 'hr' => '',
467 'hu' => '',
468 'gl' => '', // Greenlandic
469 'th' => '',
470 'gr' => 'el',
471 'hk' => 'zh_HK',
472 'eu' => '',
473 'bg' => '',
474 'br' => 'pt_BR',
475 'et' => '',
476 'ar' => '',
477 'he' => 'iw',
478 'ua' => 'uk',
479 'jp' => 'ja',
480 'lv' => '',
481 'vn' => 'vi',
482 'ca' => '',
483 'ba' => '', // Bosnian
484 'kr' => '',
485 );
486
487 /**
488 * Normalize a charset name: converts it to lowercase letters and resolves known synonyms to their canonical name.
489 *
490 * @param string Input charset
491 * @return string Normalized charset
492 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
493 */
function parse_charset($charset) {
		// Lookup keys in $this->synonyms are lower case, so fold the name first.
	$normalized = strtolower($charset);

		// A known alias is replaced by the name registered for it; any other name is returned lower-cased.
	return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
}
500
501 /**
502 * Get the charset of a locale.
503 *
504 * ln language
505 * ln_CN language / country
506 * ln_CN.cs language / country / charset
507 * ln_CN.cs@mod language / country / charset / modifier
508 *
509 * @param string Locale string
510 * @return string Charset resolved for locale string
511 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
512 */
function get_locale_charset($locale) {
		// Resolves a locale string of the form "ln[_CN][.charset][@modifier]" to a charset name.
		// Order of precedence: exact entry in $this->locale_to_charset, explicit charset part,
		// "euro" modifier, language family table for the current OS, and finally 'iso-8859-1'.
	$locale = strtolower($locale);

		// exact locale specific charset?
	if (isset($this->locale_to_charset[$locale])) return $this->locale_to_charset[$locale];

		// get modifier (array_pad() makes sure both list() targets exist when there is no "@")
	list($locale,$modifier) = array_pad(explode('@',$locale),2,'');

		// locale contains charset: use it
	list($locale,$charset) = array_pad(explode('.',$locale),2,'');
	if ($charset) return $this->parse_charset($charset);

		// modifier is 'euro' (after charset check, because of xx.utf-8@euro)
	if ($modifier == 'euro') return 'iso-8859-15';

		// get language (the part before the first "_"); the country part is not used
	$localeParts = explode('_',$locale);
	$language = $localeParts[0];
	if (isset($this->lang_to_langfamily[$language])) $language = $this->lang_to_langfamily[$language];

		// pick the family-to-charset table for the current operating system
	$familyToCharset = (TYPO3_OS == 'WIN') ? $this->lang_to_charset_windows : $this->lang_to_charset_unix;

		// unknown families and families with an empty charset both fall back to iso-8859-1
	$cs = isset($familyToCharset[$language]) ? $familyToCharset[$language] : '';

	return $cs ? $cs : 'iso-8859-1';
}
541
542
543
544
545
546
547
548
549
550 /********************************************
551 *
552 * Charset Conversion functions
553 *
554 ********************************************/
555
556 /**
557 * Convert from one charset to another charset.
558 *
559 * @param string Input string
560 * @param string From charset (the current charset of the string)
561 * @param string To charset (the output charset wanted)
562 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
563 * @return string Converted string
564 * @see convArray()
565 */
function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) {
		// Nothing to do when source and target charset are the same.
	if ($fromCS==$toCS) return $str;

		// The PHP extensions cannot fall back to numeric entities, so they are only tried
		// when the target is UTF-8 or when no entity fallback was requested.
	$extensionAllowed = ($toCS=='utf-8' || !$useEntityForNoChar);
	if ($extensionAllowed) {
		$method = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'];
		$converted = false;

		if ($method == 'mbstring') {
			$converted = mb_convert_encoding($str,$toCS,$fromCS);
		} elseif ($method == 'iconv') {
			$converted = iconv($fromCS,$toCS.'//TRANSLIT',$str);
		} elseif ($method == 'recode') {
			$converted = recode_string($fromCS.'..'.$toCS,$str);
		}

			// FALSE means: no method configured or the extension could not convert - use the TYPO3 conversion below.
		if (false !== $converted) return $converted;
	}

		// TYPO3 conversion: go through UTF-8 as intermediate format.
	if ($fromCS!='utf-8') {
		$str = $this->utf8_encode($str,$fromCS);
	}
	if ($toCS!='utf-8') {
		$str = $this->utf8_decode($str,$toCS,$useEntityForNoChar);
	}

	return $str;
}
594
595 /**
596 * Convert all elements in ARRAY from one charset to another charset.
597 * NOTICE: Array is passed by reference!
598 *
599 * @param array Input array, possibly multidimensional
600 * @param string From charset (the current charset of the string)
601 * @param string To charset (the output charset wanted)
602 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
603 * @return void
604 * @see conv()
605 */
function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0) {
		// Walks the array and converts every non-array element in place; nested arrays are handled recursively.
	foreach ($array as $key => $element) {
		if (!is_array($element)) {
			$array[$key] = $this->conv($element,$fromCS,$toCS,$useEntityForNoChar);
			continue;
		}

			// Pass the original slot (not the loop copy) so that the recursion modifies $array itself.
		$this->convArray($array[$key],$fromCS,$toCS,$useEntityForNoChar);
	}
}
615
616 /**
617 * Converts $str from $charset to UTF-8
618 *
619 * @param string String in local charset to convert to UTF-8
620 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
621 * @return string Output string, converted to UTF-8
622 */
function utf8_encode($str,$charset) {
		// Converts $str from $charset to UTF-8 using the table loaded by initCharset().
		// Bytes 0-127 are passed through unchanged (except for charsets listed in $this->twoByteSets).
		// Characters without an entry in the table become chr($this->noCharByteVal).
		// If the conversion table cannot be initialized nothing is returned (NULL).
	if (!$this->initCharset($charset)) return;

	$strLen = strlen($str);
	$outStr = '';

		// These only depend on the charset, so they are determined once:
	$isTwoByteSet = isset($this->twoByteSets[$charset]);
	$isEucBased = isset($this->eucBasedSets[$charset]);
	$isShiftJis = ($charset == 'shift_jis');

	for ($a=0;$a<$strLen;$a++) {	// Traverse each char in string.
		$chr = substr($str,$a,1);
		$ord = ord($chr);

		if ($isTwoByteSet) {	// Every character consists of two bytes
			$a++;
				// Combine both bytes, assuming big endian. This has to be a bitwise OR:
				// the low byte of ($ord << 8) is always zero, so an AND would always give 0.
			$ord = ($ord << 8) | ord(substr($str,$a,1));
		} elseif ($ord>127) {	// Byte above 127: may be the first byte of a two-byte character
			if ($isShiftJis) {
					// Shift-JIS must be tested before the generic EUC rule because it is also listed in $this->eucBasedSets:
					// bytes between 160 and 223 are single-byte characters, all other bytes above 127 start a two-byte character.
				$hasSecondByte = ($ord<160 || $ord>223);
			} else {
					// EUC based sets use two bytes for everything above 127.
				$hasSecondByte = $isEucBased;
			}

			if ($hasSecondByte) {	// Get the second byte, advance the pointer and make $ord a 16bit int.
				$a++;
				$ord = $ord*256 + ord(substr($str,$a,1));
			}
		} else {	// ASCII 0-127, one byte. Transparent
			$outStr .= $chr;
			continue;
		}

			// Use the UTF-8 sequence from the parsed conversion table, or the "no char" byte if there is none.
		if (isset($this->parsedCharsets[$charset]['local'][$ord])) {
			$outStr .= $this->parsedCharsets[$charset]['local'][$ord];
		} else {
			$outStr .= chr($this->noCharByteVal);
		}
	}

	return $outStr;
}
661
662 /**
663 * Converts $str from UTF-8 to $charset
664 *
665 * @param string String in UTF-8 to convert to local charset
666 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
667 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
668 * @return string Output string, converted to local charset
669 */
function utf8_decode($str,$charset,$useEntityForNoChar=0) {
		// Converts the UTF-8 string $str to $charset using the table loaded by initCharset().
		// If the conversion table cannot be initialized nothing is returned (NULL).
	if (!$this->initCharset($charset)) return;

	$noChar = chr($this->noCharByteVal);
	$length = strlen($str);
	$result = '';
	$pos = 0;

	while ($pos < $length) {
		$byte = substr($str,$pos,1);
		$code = ord($byte);

		if ($code <= 127) {
				// ASCII 0-127, one byte. Transparent
			$result .= $byte;
		} elseif (!($code & 64)) {
				// A first byte must have its 7th bit set; otherwise we are in the MIDDLE of a byte sequence.
			$result .= $noChar;
		} else {
				// Collect the whole sequence: every further leading 1-bit of the first byte announces one more byte.
			$sequence = $byte;
			for ($n=0;$n<8;$n++) {
				$code = $code << 1;
				if (!($code & 128)) break;
				$pos++;
				$sequence .= substr($str,$pos,1);
			}

			if (isset($this->parsedCharsets[$charset]['utf8'][$sequence])) {
				$local = $this->parsedCharsets[$charset]['utf8'][$sequence];	// The local number
					// Numbers above 255 are written as two bytes (16bit word assumed), high byte first.
				$result .= ($local > 255) ? chr(($local >> 8) & 255).chr($local & 255) : chr($local);
			} elseif ($useEntityForNoChar) {
					// Not available in the target charset: create a numeric entity
				$result .= '&#'.$this->utf8CharToUnumber($sequence,1).';';
			} else {
				$result .= $noChar;
			}
		}

		$pos++;
	}

	return $result;
}
706
707 /**
708 * Converts all chars > 127 to numeric entities.
709 *
710 * @param string Input string
711 * @return string Output string
712 */
function utf8_to_entities($str) {
		// Replaces every multibyte UTF-8 character by a numeric entity (hexadecimal form, as produced by utf8CharToUnumber()).
	$noChar = chr($this->noCharByteVal);
	$length = strlen($str);
	$result = '';

	for ($pos=0;$pos<$length;$pos++) {
		$byte = substr($str,$pos,1);
		$code = ord($byte);

			// ASCII 0-127, one byte. Transparent
		if ($code <= 127) {
			$result .= $byte;
			continue;
		}

			// A first byte must have its 7th bit set; otherwise we are in the MIDDLE of a byte sequence.
		if (!($code & 64)) {
			$result .= $noChar;
			continue;
		}

			// Collect the whole sequence: every further leading 1-bit of the first byte announces one more byte.
		$sequence = $byte;
		for ($n=0;$n<8;$n++) {
			$code <<= 1;
			if (!($code & 128)) break;
			$pos++;
			$sequence .= substr($str,$pos,1);
		}

		$result .= '&#'.$this->utf8CharToUnumber($sequence,1).';';
	}

	return $result;
}
738
739 /**
740 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
741 *
742 * @param string Input string, UTF-8
743 * @param boolean If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
744 * @return string Output string
745 */
function entities_to_utf8($str,$alsoStdHtmlEnt=0) {
		// Converts numeric entities (decimal and hexadecimal) in $str to UTF-8 characters.
		// If $alsoStdHtmlEnt is set, named HTML entities are converted as well.
		// Anything that looks like an entity but is not recognized is left untouched.
	$trans_tbl = array();
	if ($alsoStdHtmlEnt) {
			// The table values are treated as iso-8859-1 below.
			// NOTE(review): newer PHP versions return this table in another charset by default (UTF-8 as of PHP 5.4) - confirm and pass the charset explicitly if the supported PHP versions allow it.
		$trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES));
	}

		// Split at "&...;" and keep what is between "&" and ";" as separate elements:
		// even keys hold plain text, odd keys hold the entity names.
	$parts = preg_split('/&([#[:alnum:]]*);/',$str,-1,PREG_SPLIT_DELIM_CAPTURE);

	foreach($parts as $k => $v) {
		if (!($k%2)) continue;	// plain text

		if (substr($v,0,1)=='#') {	// Dec or hex entities:
			if (strtolower(substr($v,1,1))=='x') {	// "x" and "X" both introduce a hex number
				$parts[$k] = $this->UnumberToChar(hexdec(substr($v,2)));
			} else {
					// intval() makes sure a number (and never a string) is handed on
				$parts[$k] = $this->UnumberToChar(intval(substr($v,1)));
			}
		} elseif ($alsoStdHtmlEnt && isset($trans_tbl['&'.$v.';'])) {	// Other entities:
			$parts[$k] = $this->utf8_encode($trans_tbl['&'.$v.';'],'iso-8859-1');
		} else {	// No conversion:
			$parts[$k] = '&'.$v.';';
		}
	}

	return implode('',$parts);
}
771
772 /**
773 * Converts all chars in the input UTF-8 string into integer numbers returned in an array
774 *
775 * @param string Input string, UTF-8
776 * @param boolean If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
777 * @param boolean If set, then instead of integer numbers the real UTF-8 char is returned.
778 * @return array Output array with the char numbers
779 */
function utf8_to_numberarray($str,$convEntities=0,$retChar=0) {
		// Splits the UTF-8 string into its characters and returns one array element per character:
		// the character number, or the character itself if $retChar is set.

		// If entities must be registered as well, they are turned into real characters first:
	$input = $convEntities ? $this->entities_to_utf8($str,1) : $str;

	$noCharCode = $this->noCharByteVal;
	$length = strlen($input);
	$items = array();

	for ($pos=0;$pos<$length;$pos++) {
		$byte = substr($input,$pos,1);
		$code = ord($byte);

			// ASCII 0-127, one byte. Transparent
		if ($code <= 127) {
			$items[] = $retChar ? chr($code) : $code;
			continue;
		}

			// A first byte must have its 7th bit set; otherwise we are in the MIDDLE of a byte sequence.
		if (!($code & 64)) {
			$items[] = $retChar ? chr($noCharCode) : $noCharCode;
			continue;
		}

			// Collect the whole sequence: every further leading 1-bit of the first byte announces one more byte.
		$sequence = $byte;
		for ($n=0;$n<8;$n++) {
			$code <<= 1;
			if (!($code & 128)) break;
			$pos++;
			$sequence .= substr($input,$pos,1);
		}

		$items[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
	}

	return $items;
}
810
811 /**
812 * Converts a UNICODE number to a UTF-8 multibyte character
813 * Algorithm based on script found at From: http://czyborra.com/utf/
814 * Unit-tested by Kasper
815 *
816 * The binary representation of the character's integer value is thus simply spread across the bytes and the number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
817 *
818 * bytes | bits | representation
819 * 1 | 7 | 0vvvvvvv
820 * 2 | 11 | 110vvvvv 10vvvvvv
821 * 3 | 16 | 1110vvvv 10vvvvvv 10vvvvvv
822 * 4 | 21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
823 * 5 | 26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
824 * 6 | 31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
825 *
826 * @param integer UNICODE integer
827 * @return string UTF-8 multibyte character string
828 * @see utf8CharToUnumber()
829 */
function UnumberToChar($cbyte) {
		// Numbers below 0x80 are a single byte with the same value.
	if ($cbyte < 0x80) return chr($cbyte);

		// Find out how many trailing bytes are needed and which marker bits the first byte carries.
	if ($cbyte < 0x800) {
		$trailing = 1;
		$marker = 0xC0;
	} elseif ($cbyte < 0x10000) {
		$trailing = 2;
		$marker = 0xE0;
	} elseif ($cbyte < 0x200000) {
		$trailing = 3;
		$marker = 0xF0;
	} elseif ($cbyte < 0x4000000) {
		$trailing = 4;
		$marker = 0xF8;
	} elseif ($cbyte < 0x80000000) {
		$trailing = 5;
		$marker = 0xFC;
	} else {	// Cannot express a 32-bit character in UTF-8
		return chr($this->noCharByteVal);
	}

		// First byte: marker bits plus the topmost value bits.
	$char = chr($marker | ($cbyte >> (6 * $trailing)));

		// Trailing bytes: 10vvvvvv, six value bits each, most significant group first.
	for ($shift = 6 * ($trailing - 1); $shift >= 0; $shift -= 6) {
		$char .= chr(0x80 | (($cbyte >> $shift) & 0x3F));
	}

	return $char;
}
865
866 /**
867 * Converts a UTF-8 Multibyte character to a UNICODE number
868 * Unit-tested by Kasper
869 *
870 * @param string UTF-8 multibyte character string
871 * @param boolean If set, then a hex. number is returned.
872 * @return integer UNICODE integer
873 * @see UnumberToChar()
874 */
function utf8CharToUnumber($str,$hex=0) {
	$lead = ord(substr($str,0,1));	// First char

	if (($lead & 192) != 192) {
			// Not the first byte of a multibyte sequence: the number is the byte value itself.
		$number = $lead;
	} else {
			// Every further leading 1-bit of the first byte announces one trailing byte;
			// each trailing byte contributes its lower six bits.
		$trailBits = '';
		$trailing = 0;
		$probe = $lead << 1;
		while ($probe & 128) {
			$trailBits .= sprintf('%06b', ord(substr($str,$trailing+1,1)) & 63);
			$trailing++;
			$probe = $probe << 1;
		}

			// The first byte contributes the bits that remain after its length marker.
		$leadBits = substr('00000000'.decbin($lead),-(6-$trailing));

		$number = bindec($leadBits.$trailBits);
	}

	return $hex ? 'x'.dechex($number) : $number;
}
893
894
895
896
897
898
899
900
901
902 /********************************************
903 *
904 * Init functions
905 *
906 ********************************************/
907
908 /**
909 * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
910 * This function is automatically called by the conversion functions
911 *
912 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
913 *
914 * @param string The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
915 * @return integer Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed.
916 * @access private
917 */
function initCharset($charset) {
		// Makes sure $this->parsedCharsets[$charset] holds the conversion table for $charset.
		// The table has two parts: 'local' (local char number => UTF-8 sequence) and 'utf8' (UTF-8 sequence => local char number).
		// Returns 1 if the table was loaded before, 2 if it was loaded now (from the cache file or by parsing), FALSE if no table file exists.

		// Only process if the charset is not yet loaded:
	if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) return 1;

		// Conversion table filename:
	$charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';

		// Stop if the conversion table is not found:
	if (!($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile))) return false;

		// Cache file for charsets:
		// Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_'.$charset.'.tbl');
	$table = false;
	if ($cacheFile && @is_file($cacheFile)) {
		$table = unserialize(t3lib_div::getUrl($cacheFile));
	}

		// No cache file, or its content could not be turned back into an array: parse the table file (again).
	if (!is_array($table)) {
			// Parse conversion table into lines:
		$lines = t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1);
		$table = array('local'=>array(),'utf8'=>array());

			// The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
		$whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
		$detectedType = '';

		foreach($lines as $value) {
				// Comment lines or blanks are ignored.
			if (!trim($value) || substr($value,0,1)=='#') continue;

				// Detect type if not done yet: (Done on first real line)
			if (!$detectedType) $detectedType = preg_match($whitespacedPattern,$value) ? 'whitespaced' : 'ms-token';

			if ($detectedType=='ms-token') {
					// array_pad() makes sure both list() targets exist for lines without "=" or ":"
				list($hexbyte,$utf8) = array_pad(preg_split('/=|:/',$value,3),2,'');
			} else {
				$regA = array();
				if (!preg_match($whitespacedPattern,$value,$regA)) continue;	// Line without a mapping
				$hexbyte = $regA[1];
				$utf8 = 'U+'.$regA[2];
			}

				// Only chars above 127 are stored; the conversion functions pass 0-127 through as they are.
			$decval = hexdec(trim($hexbyte));
			if ($decval>127) {
				$utf8decval = hexdec(substr(trim($utf8),2));	// Strip the "U+" prefix
				$utf8char = $this->UnumberToChar($utf8decval);
				$table['local'][$decval] = $utf8char;
				$table['utf8'][$utf8char] = $decval;
			}
		}

		if ($cacheFile) {
			t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($table));
		}
	}

	$this->parsedCharsets[$charset] = $table;
	return 2;
}
970
/**
 * This function initializes the UTF-8 character data tables:
 * $this->caseFolding['utf-8'] (sub-tables 'toUpper', 'toLower', 'toTitle') and $this->toASCII['utf-8'].
 * If $mode is "case" or "ascii" and that table is already in memory or available as cache file in typo3temp/cs/, only that table is provided.
 * Otherwise BOTH tables are rebuilt from PATH_t3lib.'unidata/UnicodeData.txt' (required) plus SpecialCasing.txt and Translit.txt (used if present), and both cache files are written.
 *
 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
 *
 * @param	string		Mode ("case", "ascii", ...)
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initUnicodeData($mode=null) {
		// cache files
	$cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
	$cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');

		// Only process if the tables are not yet loaded
	switch($mode) {
		case 'case':
			if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8'])) return 1;

				// Use cached version if possible (a cache file that does not unserialize to an array is ignored)
			if ($cacheFileCase && @is_file($cacheFileCase)) {
				$cached = unserialize(t3lib_div::getUrl($cacheFileCase));
				if (is_array($cached)) {
					$this->caseFolding['utf-8'] = $cached;
					return 2;
				}
			}
			break;

		case 'ascii':
			if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8'])) return 1;

				// Use cached version if possible (a cache file that does not unserialize to an array is ignored)
			if ($cacheFileASCII && @is_file($cacheFileASCII)) {
				$cached = unserialize(t3lib_div::getUrl($cacheFileASCII));
				if (is_array($cached)) {
					$this->toASCII['utf-8'] = $cached;
					return 2;
				}
			}
			break;
	}

		// process main Unicode data file
	$unicodeDataFile = PATH_t3lib.'unidata/UnicodeData.txt';
	if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) return false;

	$fh = fopen($unicodeDataFile,'rb');
	if (!$fh) return false;

		// key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
		// note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
	$this->caseFolding['utf-8'] = array();
	$utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand
	$utf8CaseFolding['toUpper'] = array();
	$utf8CaseFolding['toLower'] = array();
	$utf8CaseFolding['toTitle'] = array();

	$decomposition = array();	// array of temp. decompositions
	$mark = array();		// array of chars that are marks (eg. composing accents)
	$number = array();		// array of chars that are numbers (eg. digits)
	$omit = array();		// array of chars to be omitted (eg. Russian hard sign)

	while (!feof($fh)) {
		$line = fgets($fh,4096);
		if ($line === false) break;		// nothing more to read
		if (trim($line) == '') continue;	// blank lines carry no record

			// has a lot of info; pad so that list() always finds all 15 fields
		$fields = array_pad(explode(';', rtrim($line)), 15, '');
		list($char,$name,$cat,,,$decomp,,,$num,,,,$upper,$lower,$title) = $fields;

		$ord = hexdec($char);
		if ($ord > 0xFFFF) break;	// only process the BMP

		$utf8_char = $this->UnumberToChar($ord);

		if ($upper) $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
		if ($lower) $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
			// store "title" only when different from "upper" (only a few)
		if ($title && $title != $upper) $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));

			// The first letter of the general category tells marks and numbers apart
		switch (substr($cat,0,1)) {
			case 'M':	// mark (accent, umlaut, ...)
				$mark["U+$char"] = 1;
				break;

			case 'N':	// numeric value
				if ($ord > 0x80 && $num != '') $number["U+$char"] = $num;
				break;
		}

			// accented Latin letters without "official" decomposition
		$match = array();
		if (!$decomp && preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/',$name,$match)) {
			$c = ord($match[2]);
			if ($match[1] == 'SMALL') $c += 32;	// ASCII lower case letters are 32 above the capitals

			$decomposition["U+$char"] = array(dechex($c));
			continue;
		}

		$match = array();
		if (preg_match('/(<.*>)? *(.+)/',$decomp,$match)) {
			switch($match[1]) {
				case '<circle>':	// add parenthesis as circle replacement, eg (1)
					$match[2] = '0028 '.$match[2].' 0029';
					break;

				case '<square>':	// add square brackets as square replacement, eg [1]
					$match[2] = '005B '.$match[2].' 005D';
					break;

				case '<compat>':	// ignore multi char decompositions that start with a space
					if (preg_match('/^0020 /',$match[2])) continue 2;
					break;

					// ignore Arabic and vertical layout presentation decomposition
				case '<initial>':
				case '<medial>':
				case '<final>':
				case '<isolated>':
				case '<vertical>':
					continue 2;
			}
			$decomposition["U+$char"] = explode(' ',$match[2]);
		}
	}
	fclose($fh);

		// process additional Unicode data for casing (allow folded characters to expand into a sequence)
	$specialCasingFile = PATH_t3lib.'unidata/SpecialCasing.txt';
	if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
		$fh = fopen($specialCasingFile,'rb');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh,4096);
				if ($line === false) break;
				if (substr($line,0,1) == '#' || trim($line) == '') continue;	// comment or blank line

				list($char,$lower,$title,$upper,$cond) = array_pad(t3lib_div::trimExplode(';', $line), 5, '');
					// Only unconditional mappings are used (fifth field empty or already the trailing comment)
				if ($cond != '' && substr($cond,0,1) != '#') continue;

				$utf8_char = $this->UnumberToChar(hexdec($char));
				$variants = array('toLower' => $lower, 'toTitle' => $title, 'toUpper' => $upper);
				foreach ($variants as $table => $sequence) {
					if ($char == $sequence) continue;				// maps to itself
					if ($table == 'toTitle' && $title == $upper) continue;		// "title" is only stored when different from "upper"

					$folded = '';
					foreach (explode(' ',$sequence) as $codePoint) {
						$folded .= $this->UnumberToChar(hexdec($codePoint));
					}
					$utf8CaseFolding[$table][$utf8_char] = $folded;
				}
			}
			fclose($fh);
		}
	}

		// process custom decompositions
	$customTranslitFile = PATH_t3lib.'unidata/Translit.txt';
	if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
		$fh = fopen($customTranslitFile,'rb');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh,4096);
				if ($line === false) break;
				if (substr($line,0,1) == '#' || trim($line) == '') continue;	// comment or blank line

				list($char,$translit) = array_pad(t3lib_div::trimExplode(';', $line), 2, '');
				if (!$translit) $omit["U+$char"] = 1;	// empty transliteration: the character is to be dropped
				$decomposition["U+$char"] = explode(' ', $translit);
			}
			fclose($fh);
		}
	}

		// decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
	foreach($decomposition as $from => $to) {
		$code_decomp = array();

		while ($code_value = array_shift($to)) {
			if (isset($decomposition["U+$code_value"])) {	// do recursive decomposition
				foreach(array_reverse($decomposition["U+$code_value"]) as $cv) {
					array_unshift($to, $cv);
				}
			} elseif (!isset($mark["U+$code_value"])) {	// remove mark
				array_push($code_decomp, $code_value);
			}
		}
		if (count($code_decomp) || isset($omit[$from])) {
			$decomposition[$from] = $code_decomp;
		} else {
			unset($decomposition[$from]);
		}
	}

		// create ascii only mapping
	$this->toASCII['utf-8'] = array();
	$ascii =& $this->toASCII['utf-8'];

	foreach($decomposition as $from => $to) {
		$code_decomp = array();
		while ($code_value = array_shift($to)) {
			$ord = hexdec($code_value);
			if ($ord > 127) {
				continue 2;	// skip decompositions containing non-ASCII chars
			}
			array_push($code_decomp,chr($ord));
		}
		$ascii[$this->UnumberToChar(hexdec($from))] = implode('',$code_decomp);
	}

		// add numeric decompositions
	foreach($number as $from => $to) {
		$utf8_char = $this->UnumberToChar(hexdec($from));
		if (!isset($ascii[$utf8_char])) {
			$ascii[$utf8_char] = $to;
		}
	}

	if ($cacheFileCase) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileCase,serialize($utf8CaseFolding));
	}

	if ($cacheFileASCII) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileASCII,serialize($ascii));
	}

	return 3;
}
1196
/**
 * This function initializes the folding table for a charset other than UTF-8.
 * The table ($this->caseFolding[$charset] with sub-tables 'toUpper', 'toLower', 'toTitle') is derived from the
 * charset's conversion table and the UTF-8 case folding table, then extended with the plain ASCII letters.
 * This function is automatically called by the case folding functions.
 *
 * @param	string		Charset for which to initialize case folding.
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initCaseFolding($charset) {
		// Only process if the case table is not yet loaded:
	if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset])) return 1;

		// Use cached version if possible (a cache file that does not unserialize to an array is ignored)
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$cached = unserialize(t3lib_div::getUrl($cacheFile));
		if (is_array($cached)) {
			$this->caseFolding[$charset] = $cached;
			return 2;
		}
	}

		// init UTF-8 conversion for this charset
	if (!$this->initCharset($charset)) {
		return false;
	}

		// UTF-8 case folding is used as the base conversion table
	if (!$this->initUnicodeData('case')) {
		return false;
	}

		// Start from empty sub-tables so the result always has the same shape
	$this->caseFolding[$charset] = array('toUpper' => array(), 'toLower' => array(), 'toTitle' => array());

	$nochar = chr($this->noCharByteVal);
	$directions = array('toUpper', 'toLower', 'toTitle');
	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
			// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);

		foreach ($directions as $direction) {
				// Characters without a folding in this direction are simply left out
			if (!isset($this->caseFolding['utf-8'][$direction][$utf8])) continue;

			$cc = $this->utf8_decode($this->caseFolding['utf-8'][$direction][$utf8], $charset);
				// Keep the mapping only if the folded character exists in this charset
			if ($cc != '' && $cc != $nochar) $this->caseFolding[$charset][$direction][$c] = $cc;
		}
	}

		// add the ASCII case table
	for ($i=ord('a'); $i<=ord('z'); $i++) {
		$this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i-32);
	}
	for ($i=ord('A'); $i<=ord('Z'); $i++) {
		$this->caseFolding[$charset]['toLower'][chr($i)] = chr($i+32);
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->caseFolding[$charset]));
	}

	return 3;
}
1258
/**
 * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
 * The table ($this->toASCII[$charset]) is derived from the charset's conversion table and the UTF-8 to-ASCII table.
 * This function is automatically called by the ASCII transliteration functions.
 *
 * @param	string		Charset for which to initialize conversion.
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initToASCII($charset) {
		// Only process if the table is not yet loaded:
	if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) return 1;

		// Use cached version if possible (a cache file that does not unserialize to an array is ignored)
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$cached = unserialize(t3lib_div::getUrl($cacheFile));
		if (is_array($cached)) {
			$this->toASCII[$charset] = $cached;
			return 2;
		}
	}

		// init UTF-8 conversion for this charset
	if (!$this->initCharset($charset)) {
		return false;
	}

		// UTF-8/ASCII transliteration is used as the base conversion table
	if (!$this->initUnicodeData('ascii')) {
		return false;
	}

		// Always end up with an array, even if no character of this charset has a transliteration;
		// otherwise the "already loaded" check above could never succeed for such a charset.
	$this->toASCII[$charset] = array();

	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
		if (!isset($this->toASCII['utf-8'][$utf8])) continue;

			// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);
		$this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->toASCII[$charset]));
	}

	return 3;
}
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320 /********************************************
1321 *
1322 * String operation functions
1323 *
1324 ********************************************/
1325
/**
 * Returns a part of a string.
 * Dispatches on the configured utility library and on the charset: mbstring (if configured in
 * $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils']), UTF-8, EUC based, fixed two/four byte sets, else single-byte.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Start position (character position)
 * @param	integer		Length (in characters). If omitted (NULL) the rest of the string is returned.
 * @return	string		The substring
 * @see substr(), mb_substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function substr($charset,$string,$start,$len=null) {
	if ($len===0) return '';

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
			// cannot omit $len, when specifying charset
		if ($len==null) {
			$enc = mb_internal_encoding();	// save internal encoding
			mb_internal_encoding($charset);
			$str = mb_substr($string,$start);
			mb_internal_encoding($enc);	// restore internal encoding

			return $str;
		}
		return mb_substr($string,$start,$len,$charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_substr($string,$start,$len);
	} elseif ($this->eucBasedSets[$charset]) {
		return $this->euc_substr($string,$start,$charset,$len);
	} elseif ($this->twoByteSets[$charset]) {
			// every character occupies exactly two bytes
		return $len === null ? substr($string,$start*2) : substr($string,$start*2,$len*2);
	} elseif ($this->fourByteSets[$charset]) {
			// every character occupies exactly four bytes
		return $len === null ? substr($string,$start*4) : substr($string,$start*4,$len*4);
	}

		// treat everything else as single-byte encoding
	return $len === null ? substr($string,$start) : substr($string,$start,$len);
}
1365
/**
 * Counts the number of characters.
 * Uses mbstring if configured in $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'], otherwise the
 * charset specific counting of this class; unknown charsets are counted byte by byte.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * @param	string		The character set
 * @param	string		Character string
 * @return	integer		The number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strlen($charset,$string) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') return mb_strlen($string,$charset);
	if ($charset == 'utf-8') return $this->utf8_strlen($string);
	if ($this->eucBasedSets[$charset]) return $this->euc_strlen($string,$charset);

	$byteCount = strlen($string);

		// fixed width encodings: byte count divided by the character width
	if ($this->twoByteSets[$charset]) return $byteCount/2;
	if ($this->fourByteSets[$charset]) return $byteCount/4;

		// treat everything else as single-byte encoding
	return $byteCount;
}
1391
/**
 * Truncates a string and pre-/appends a string.
 * A positive length keeps the first $len characters and appends $crop; a negative length keeps the last
 * abs($len) characters and prepends $crop. If the string is not longer than abs($len) it is returned unchanged.
 * Unit tested by Kasper
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Length (in characters). Zero returns the string unchanged.
 * @param	string		Crop signifier
 * @return	string		The shortened string
 * @see substr(), mb_strimwidth()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function crop($charset,$string,$len,$crop='') {
	$len = intval($len);
	if ($len == 0) return $string;

		// Translate the character count into a byte position
	if ($charset == 'utf-8') {
		$i = $this->utf8_char2byte_pos($string,$len);
	} elseif ($this->eucBasedSets[$charset]) {
		$i = $this->euc_char2byte_pos($string,$len,$charset);
	} else {
			// everything else is treated as single-byte encoding
		if ($len > 0) {
			$i = $len;
		} else {
			$i = strlen($string)+$len;
			if ($i<=0) $i = false;
		}
	}

		// $len outside actual string length
	if ($i === false) return $string;

		// The byte position is checked against the byte length (and not only against FALSE) because the
		// position may equal the byte length, in which case there is nothing to cut off.
	$byteLength = strlen($string);
	if ($len > 0) {
		if ($i < $byteLength) return substr($string,0,$i).$crop;
	} else {
		if ($i >= 1 && $i <= $byteLength) return $crop.substr($string,$i);
	}

	return $string;
}
1446
/**
 * Cuts a string short at a given byte length.
 * The cut never happens inside a character: for multi-byte charsets the result may be shorter than $len bytes.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		The byte length. Zero or a negative value gives an empty string.
 * @return	string		The shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strtrunc($charset,$string,$len) {
	if ($len <= 0) return '';

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strcut($string,0,$len,$charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_strtrunc($string,$len);
	} elseif ($this->eucBasedSets[$charset]) {
			// argument order of euc_strtrunc() is string, byte length, charset
		return $this->euc_strtrunc($string,$len,$charset);
	} elseif ($this->twoByteSets[$charset]) {
		if ($len % 2) $len--;		// don't cut at odd positions
	} elseif ($this->fourByteSets[$charset]) {
		$len -= $len % 4;		// realign to position dividable by four
	}

		// treat everything else as single-byte encoding
	return substr($string,0,$len);
}
1475
/**
 * Translates all characters of a string into their respective case values.
 * Unlike strtolower() and strtoupper() this method is locale independent.
 * Note that the string length may change!
 * eg. lower case German sharp s (U+00DF) becomes upper case "SS"
 * Unit-tested by Kasper
 * Real case folding is language dependent, this method ignores this fact.
 *
 * @param	string		Character set of string
 * @param	string		Input string to convert case for
 * @param	string		Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
 * @return	string		The converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper()
 */
function conv_case($charset,$string,$case) {
		// mb_strtolower()/mb_strtoupper() only exist in newer PHP versions, so their presence is checked directly
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring' && function_exists('mb_strtolower')) {
		if ($case == 'toLower') {
			return mb_strtolower($string,$charset);
		}
		return mb_strtoupper($string,$charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string,'case',$case);
	} elseif (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string,$charset,'case',$case);
	}

		// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string,$charset,'case',$case);
}
1509
/**
 * Converts special chars (like the Danish ae/oe/aa letters, umlauts etc) to ascii equivalents (usually double-bytes, like a-umlaut => ae etc.)
 * The work is delegated to the character mapping function matching the charset, always in 'ascii' mode.
 *
 * @param	string		Character set of string
 * @param	string		Input string to convert
 * @return	string		The converted string
 */
function specCharsToASCII($charset,$string) {
	if ($charset == 'utf-8') return $this->utf8_char_mapping($string,'ascii');

	if (isset($this->eucBasedSets[$charset])) return $this->euc_char_mapping($string,$charset,'ascii');

		// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string,$charset,'ascii');
}
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541 /********************************************
1542 *
1543 * Internal string operation functions
1544 *
1545 ********************************************/
1546
/**
 * Maps all characters of a string in a single byte charset.
 * Every byte found as key in the mapping table is replaced by the table's value, all other bytes are copied unchanged.
 * If the table for the charset cannot be initialized, or the mode is unknown, the string is returned unchanged.
 *
 * @param	string		the string
 * @param	string		the charset
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function sb_char_mapping($str,$charset,$mode,$opt='') {
	switch($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) return $str;	// do nothing
			$map =& $this->caseFolding[$charset][$opt];
			break;

		case 'ascii':
			if (!$this->initToASCII($charset)) return $str;	// do nothing
			$map =& $this->toASCII[$charset];
			break;

		default:
			return $str;
	}

	$out = '';
	$byteLength = strlen($str);
	for ($i=0; $i<$byteLength; $i++) {
		$c = $str[$i];
		$out .= isset($map[$c]) ? $map[$c] : $c;
	}

	return $out;
}
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595 /********************************************
1596 *
1597 * Internal UTF-8 string operation functions
1598 *
1599 ********************************************/
1600
/**
 * Returns a part of a UTF-8 string.
 * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
 *
 * @param	string		UTF-8 string
 * @param	integer		Start position (character position). A negative value counts from the end of the string.
 * @param	integer		Length (in characters). If omitted (NULL) the rest of the string is returned.
 * @return	string		The substring. FALSE if a positive $start lies outside the string.
 * @see substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_substr($str,$start,$len=null) {
		// A length of zero (integer or string) always gives an empty string
	if (!strcmp((string)$len,'0')) return '';

	$byte_start = $this->utf8_char2byte_pos($str,$start);
	if ($byte_start === false) {
		if ($start > 0) {
			return false;	// $start outside string length
		}
			// a negative $start reaching beyond the beginning means "from the first character"
		$byte_start = 0;
	}

	$str = substr($str,$byte_start);

		// No length given: everything from the start position on (loose comparison on purpose, zero was handled above)
	if ($len == null) return $str;

	$byte_end = $this->utf8_char2byte_pos($str,$len);
	if ($byte_end === false) {	// $len outside actual string length
		return $len<0 ? '' : $str;	// When length is less than zero and exceeds, then we return blank string.
	}

	return substr($str,0,$byte_end);
}
1635
/**
 * Counts the number of characters of a string in UTF-8.
 * Every byte that is not a continuation byte (10xxxxxx) is counted as the start of a character.
 * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
 *
 * @param	string		UTF-8 multibyte character string
 * @return	integer		The number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strlen($str) {
	$n = 0;
	$byteLength = strlen($str);
	for ($i=0; $i<$byteLength; $i++) {
			// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		if ((ord($str[$i]) & 0xC0) != 0x80) $n++;
	}
	return $n;
}
1656
/**
 * Truncates a string in UTF-8 short at a given byte length.
 * The result holds at most $len bytes; a multi-byte character that would be cut in two is dropped completely.
 * If the bytes around the cut do not form a proper multi-byte sequence, the string is cut at exactly $len bytes.
 *
 * @param	string		UTF-8 multibyte character string
 * @param	integer		the byte length
 * @return	string		the shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strtrunc($str,$len) {
	if ($len <= 0) return '';
	if ($len >= strlen($str)) return $str;	// string is short enough already

	$i = $len-1;
	if (ord($str[$i]) & 0x80) {	// part of a multibyte sequence
			// find the first byte of the sequence by stepping back over continuation bytes (10xxxxxx)
		while ($i>0 && (ord($str[$i]) & 0xC0) == 0x80) $i--;

		$lead = ord($str[$i]);
		if (($lead & 0xC0) == 0xC0) {
				// the number of leading 1-bits of the first byte is the number of bytes of the sequence
			for ($bc=0, $mbs=$lead; $mbs & 0x80; $mbs = $mbs << 1) $bc++;
				// character reaches beyond the limit: cut in front of it
			if ($bc+$i > $len) return substr($str,0,$i);
		}
			// fallthru: multibyte char fits into length
	}
	return substr($str,0,$len);
}
1677
/**
 * Find position of first occurrence of a string, both arguments are in UTF-8.
 * Uses mb_strpos() if mbstring is configured in $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'].
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 string to search for
 * @param	integer		Position to start the search (character position)
 * @return	integer		The character position. FALSE if the needle was not found or the offset lies outside the string.
 * @see strpos()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strpos($haystack,$needle,$offset=0) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
			// third argument is the offset, the encoding comes fourth
		return mb_strpos($haystack,$needle,$offset,'utf-8');
	}

	$byte_offset = $this->utf8_char2byte_pos($haystack,$offset);
	if ($byte_offset === false) return false;	// offset beyond string length

	$byte_pos = strpos($haystack,$needle,$byte_offset);
	if ($byte_pos === false) return false;	// needle not found

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
1701
/**
 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
 * Uses mb_strrpos() if mbstring is configured in $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'].
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 character to search for (single character)
 * @return	integer		The character position. FALSE if the needle was not found.
 * @see strrpos()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strrpos($haystack,$needle) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
			// The position of the encoding argument of mb_strrpos() is not the same in all PHP versions,
			// so the internal encoding is switched temporarily instead of passing the encoding.
		$enc = mb_internal_encoding();	// save internal encoding
		mb_internal_encoding('utf-8');
		$pos = mb_strrpos($haystack,$needle);
		mb_internal_encoding($enc);	// restore internal encoding

		return $pos;
	}

	$byte_pos = strrpos($haystack,$needle);
	if ($byte_pos === false) return false;	// needle not found

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
1721
/**
 * Translates a character position into an 'absolute' byte position.
 * For a positive position the string is scanned from the start, for a negative one from the end.
 * FALSE is returned when the scan runs off the string before or exactly when the wanted number of
 * characters has been seen. Note: for a positive position equal to the number of characters the result
 * is FALSE if the last character is a single byte, but the byte length of the string if it is a multi-byte character.
 * Unit tested by Kasper.
 *
 * @param	string		UTF-8 string
 * @param	integer		Character position (negative values start from the end)
 * @return	integer		Byte position, or FALSE if the position is outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char2byte_pos($str,$pos) {
	$byteLength = strlen($str);
	$n = 0;			// number of characters found
	$p = abs($pos);		// number of characters wanted

	if ($pos >= 0) {
		$i = 0;
		$d = 1;
	} else {
		$i = $byteLength-1;
		$d = -1;
	}

		// Step through the bytes (in direction $d) until $p character starts have been passed
	for ( ; $i>=0 && $i<$byteLength && $n<$p; $i+=$d) {
		$c = ord($str[$i]);
			// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		if ((($c & 0xC0) != 0x80)) $n++;
	}
	if ($i<0 || $i>=$byteLength) return false;	// offset beyond string length

	if ($pos >= 0) {
			// skip trailing multi-byte data bytes
		while ($i<$byteLength && (ord($str[$i]) & 0xC0) == 0x80) { $i++; }
	} else {
			// correct offset
		$i++;
	}

	return $i;
}
1762
/**
 * Translates an 'absolute' byte position into a character position.
 * The result is the number of character starts found in the bytes 1 to $pos, which for a byte position at the
 * start of a character equals the number of characters in front of it. The position directly behind the last
 * byte is accepted and gives the number of characters of the string.
 * Unit tested by Kasper.
 *
 * @param	string		UTF-8 string
 * @param	integer		byte position
 * @return	integer		character position, or FALSE if the string is empty or the byte position is outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_byte2char_pos($str,$pos) {
	$byteLength = strlen($str);
	if ($byteLength == 0 || $pos < 0 || $pos > $byteLength) return false;	// offset beyond string length

	$n = 0;	// number of characters
	if ($pos == $byteLength) {
			// the position behind the last byte counts as a character boundary of its own
		$n++;
		$pos--;
	}
	for ($i=$pos; $i>0; $i--) {
			// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		if ((ord($str[$i]) & 0xC0) != 0x80) $n++;
	}

	return $n;
}
1785
/**
 * Maps all characters of an UTF-8 string.
 * Every character found as key in the mapping table is replaced by the table's value, all other characters
 * (and bytes that do not start a character) are copied unchanged.
 * If the Unicode data cannot be initialized, or the mode is unknown, the string is returned unchanged.
 *
 * @param	string		UTF-8 string
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char_mapping($str,$mode,$opt='') {
	if (!$this->initUnicodeData($mode)) return $str;	// do nothing

	switch($mode) {
		case 'case':
			$map =& $this->caseFolding['utf-8'][$opt];
			break;

		case 'ascii':
			$map =& $this->toASCII['utf-8'];
			break;

		default:
			return $str;
	}

	$out = '';
	$byteLength = strlen($str);
	for ($i=0; $i<$byteLength; $i++) {
		$c = ord($str[$i]);

			// single-byte (0xxxxxxx); a stray continuation byte (10xxxxxx) is passed through as it is, too
		$mbc = $str[$i];
		if (($c & 0xC0) == 0xC0) {	// multi-byte starting byte (11xxxxxx)
			for ($bc=0; $c & 0x80; $c = $c << 1) { $bc++; }	// calculate number of bytes
			$mbc = substr($str,$i,$bc);
			$i += $bc-1;	// continue behind the sequence
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849 /********************************************
1850 *
1851 * Internal EUC string operation functions
1852 *
1853 * Extended Unix Code:
1854 * ASCII compatible 7bit single bytes chars
1855 * 8bit two byte chars
1856 *
1857 * Shift-JIS is treated as a special case.
1858 *
1859 ********************************************/
1860
/**
 * Cuts a string in the EUC charset family short at a given byte length.
 * The result holds at most $len bytes; a double-byte character that would be cut in two is dropped completely.
 * For 'shift_jis' the bytes 0x80-0x9F and 0xE0-0xFF are taken as first byte of a double-byte character,
 * for all other charsets every byte from 0x80 on.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		the byte length
 * @param	string		the charset
 * @return	string		the shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strtrunc($str,$len,$charset) {
		// string not longer than supplied length
	if (strlen($str) <= $len) return $str;

	$sjis = ($charset == 'shift_jis');
		// Walk character by character; $len is smaller than the byte length here, so $i stays inside the string
	for ($i=0; $i<$len; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++;	// advance a double-byte char
		} else {
			if ($c >= 0x80) $i++;	// advance a double-byte char
		}
	}

	if ($i>$len) {
		return substr($str,0,$len-1);	// we ended on a first byte
	}
	return substr($str,0,$len);
}
1889
/**
 * Returns a part of a string in the EUC charset family.
 * Zero length, a negative start reaching beyond the beginning and a negative length exceeding the string
 * are handled the same way as in utf8_substr().
 *
 * @param	string		EUC multibyte character string
 * @param	integer		start position (character position)
 * @param	string		the charset
 * @param	integer		length (in characters). If omitted (NULL) the rest of the string is returned.
 * @return	string		the substring. FALSE if a positive start position lies outside the string.
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_substr($str,$start,$charset,$len=null) {
		// A length of zero (integer or string) always gives an empty string
	if (!strcmp((string)$len,'0')) return '';

	$byte_start = $this->euc_char2byte_pos($str,$start,$charset);
	if ($byte_start === false) {
		if ($start > 0) {
			return false;	// $start outside string length
		}
			// a negative $start reaching beyond the beginning means "from the first character"
		$byte_start = 0;
	}

	$str = substr($str,$byte_start);

		// No length given: everything from the start position on (loose comparison on purpose, zero was handled above)
	if ($len == null) return $str;

	$byte_end = $this->euc_char2byte_pos($str,$len,$charset);
	if ($byte_end === false) {	// $len outside actual string length
		return $len<0 ? '' : $str;
	}

	return substr($str,0,$byte_end);
}
1915
1916 /**
1917 * Counts the number of characters of a string in the EUC charset family.
1918 *
1919 * @param string EUC multibyte character string
1920 * @param string the charset
1921 * @return integer the number of characters
1922 * @see strlen()
1923 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1924 */
function euc_strlen($str,$charset) {
	// Counts characters by walking the bytes and skipping the second byte
	// of every double-byte character.
	$sjis = ($charset == 'shift_jis');
	$total = strlen($str);	// byte length, bounds the loop instead of probing offsets
	$n = 0;

	for ($i=0; $i<$total; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			// Shift-JIS lead bytes: 0x80-0x9F and 0xE0 upwards
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++;	// advance a double-byte char
		} else {
			if ($c >= 0x80) $i++;	// advance a double-byte char
		}

		$n++;
	}

	return $n;
}
1942
1943 /**
1944 * Translates a character position into an 'absolute' byte position.
1945 *
1946 * @param string EUC multibyte character string
1947 * @param integer character position (negative values start from the end)
1948 * @param string the charset
1949 * @return integer byte position, or false if the position lies beyond the string
1950 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1951 */
function euc_char2byte_pos($str,$pos,$charset) {
	// Character boundaries are always determined by scanning forward from
	// the start of the string: only the first byte of a character tells
	// whether it is double-byte, so a backward scan would misread trail
	// bytes (Shift-JIS trail bytes may lie in the ASCII range).
	$sjis = ($charset == 'shift_jis');
	$total = strlen($str);

	if ($pos >= 0) {
		// Skip $pos characters and report where the next one starts.
		$i = 0;
		for ($n=0; $n<$pos && $i<$total; $n++) {
			$c = ord($str[$i]);
			if ($sjis) {
				$i += (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) ? 2 : 1;
			} else {
				$i += ($c >= 0x80) ? 2 : 1;
			}
		}
		return ($i < $total) ? $i : false;	// false: offset beyond string length
	}

	// Negative position: record the start offset of every character, then
	// count back from the end.
	$offsets = array();
	for ($i=0; $i<$total; ) {
		$offsets[] = $i;
		$c = ord($str[$i]);
		if ($sjis) {
			$i += (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) ? 2 : 1;
		} else {
			$i += ($c >= 0x80) ? 2 : 1;
		}
	}

	$index = count($offsets) + $pos;
	if ($index < 0) return false;	// more characters requested than the string holds

	return $offsets[$index];
}
1982
1983 /**
1984 * Maps all characters of a string in the EUC charset family.
1985 *
1986 * @param string EUC multibyte character string
1987 * @param string the charset
1988 * @param string mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
1989 * @param string 'case': conversion 'toLower' or 'toUpper'
1990 * @return string the converted string
1991 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1992 */
function euc_char_mapping($str,$charset,$mode,$opt='') {
	// Select the mapping table for the requested mode; if the table cannot
	// be initialised or the mode is unknown the string is returned unchanged.
	switch($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) return $str;	// do nothing
			$map =& $this->caseFolding[$charset][$opt];
		break;

		case 'ascii':
			if (!$this->initToASCII($charset)) return $str;	// do nothing
			$map =& $this->toASCII[$charset];
		break;

		default:
			return $str;
	}

	$sjis = ($charset == 'shift_jis');
	$total = strlen($str);	// byte length, bounds the loop instead of probing offsets
	$out = '';

	for ($i=0; $i<$total; $i++) {
		$mbc = $str[$i];
		$c = ord($mbc);

		// Does this byte start a double-byte char?
		if ($sjis) {
			$double = (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0));
		} else {
			$double = ($c >= 0x80);
		}

		if ($double) {
			$mbc = substr($str,$i,2);	// take both bytes as one character
			$i++;
		}

		// Characters without a mapping entry are copied through unchanged.
		if (isset($map[$mbc])) {
			$out .= $map[$mbc];
		} else {
			$out .= $mbc;
		}
	}

	return $out;
}
2037
2038 }
2039
// Include the file registered under the XCLASS key for this script, if one is configured.
// !empty() has the same truthiness as the plain lookup but raises no notice when the key is unset.
if (defined('TYPO3_MODE') && !empty($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php'])) {
	include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
}
2043 ?>