* Added Hindi language
[Packages/TYPO3.CMS.git] / t3lib / class.t3lib_cs.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2003-2005 Kasper Skaarhoj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the Typo3 project. The Typo3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 *
17 * This script is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * This copyright notice MUST APPEAR in all copies of the script!
23 ***************************************************************/
24 /**
25 * Class for conversion between charsets.
26 *
27 * $Id$
28 *
29 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
30 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
31 */
32 /**
33 * [CLASS/FUNCTION INDEX of SCRIPT]
34 *
35 *
36 *
37 * 136: class t3lib_cs
38 * 503: function parse_charset($charset)
39 * 522: function get_locale_charset($locale)
40 *
41 * SECTION: Charset Conversion functions
42 * 575: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
43 * 615: function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0)
44 * 632: function utf8_encode($str,$charset)
45 * 678: function utf8_decode($str,$charset,$useEntityForNoChar=0)
46 * 721: function utf8_to_entities($str)
47 * 754: function entities_to_utf8($str,$alsoStdHtmlEnt=0)
48 * 788: function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
49 * 838: function UnumberToChar($cbyte)
50 * 883: function utf8CharToUnumber($str,$hex=0)
51 *
52 * SECTION: Init functions
53 * 926: function initCharset($charset)
54 * 988: function initUnicodeData($mode=null)
55 * 1213: function initCaseFolding($charset)
56 * 1275: function initToASCII($charset)
57 *
58 * SECTION: String operation functions
59 * 1346: function substr($charset,$string,$start,$len=null)
60 * 1384: function strlen($charset,$string)
61 * 1412: function crop($charset,$string,$len,$crop='')
62 * 1465: function strtrunc($charset,$string,$len)
63 * 1499: function conv_case($charset,$string,$case)
64 * 1525: function specCharsToASCII($charset,$string)
65 *
66 * SECTION: Internal string operation functions
67 * 1565: function sb_char_mapping($str,$charset,$mode,$opt='')
68 *
69 * SECTION: Internal UTF-8 string operation functions
70 * 1620: function utf8_substr($str,$start,$len=null)
71 * 1653: function utf8_strlen($str)
72 * 1674: function utf8_strtrunc($str,$len)
73 * 1696: function utf8_strpos($haystack,$needle,$offset=0)
74 * 1719: function utf8_strrpos($haystack,$needle)
75 * 1739: function utf8_char2byte_pos($str,$pos)
76 * 1780: function utf8_byte2char_pos($str,$pos)
77 * 1803: function utf8_char_mapping($str,$mode,$opt='')
78 *
79 * SECTION: Internal EUC string operation functions
80 * 1879: function euc_strtrunc($str,$len,$charset)
81 * 1908: function euc_substr($str,$start,$charset,$len=null)
82 * 1933: function euc_strlen($str,$charset)
83 * 1960: function euc_char2byte_pos($str,$pos,$charset)
84 * 2001: function euc_char_mapping($str,$charset,$mode,$opt='')
85 *
86 * TOTAL FUNCTIONS: 35
87 * (This index is automatically created/updated by the extension "extdeveval")
88 *
89 */
90
91
92
93
94
95
96
97
98 /**
99 * Notes on UTF-8
100 *
101 * Functions working on UTF-8 strings:
102 *
103 * - strchr/strstr
104 * - strrchr
105 * - substr_count
106 * - implode/explode/join
107 *
108 * Functions nearly working on UTF-8 strings:
109 *
 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
112 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
113 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
114 *
115 * Functions NOT working on UTF-8 strings:
116 *
117 * - str*cmp
118 * - stristr
119 * - stripos
120 * - substr
121 * - strrev
122 * - ereg/eregi
123 * - split/spliti
124 * - preg_*
125 * - ...
126 *
127 */
128 /**
129 * Class for conversion between charsets
130 *
131 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
132 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
133 * @package TYPO3
134 * @subpackage t3lib
135 */
136 class t3lib_cs {
137 var $noCharByteVal=63; // ASCII Value for chars with no equivalent.
138
139 // This is the array where parsed conversion tables are stored (cached)
140 var $parsedCharsets=array();
141
142 // An array where case folding data will be stored (cached)
143 var $caseFolding=array();
144
145 // An array where charset-to-ASCII mappings are stored (cached)
146 var $toASCII=array();
147
148 // This tells the converter which charsets has two bytes per char:
149 var $twoByteSets=array(
150 'ucs-2'=>1, // 2-byte Unicode
151 );
152
153 // This tells the converter which charsets has four bytes per char:
154 var $fourByteSets=array(
155 'ucs-4'=>1, // 4-byte Unicode
156 'utf-32'=>1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
157 );
158
159 // This tells the converter which charsets use a scheme like the Extended Unix Code:
160 var $eucBasedSets=array(
161 'gb2312'=>1, // Chinese, simplified.
162 'big5'=>1, // Chinese, traditional.
163 'euc-kr'=>1, // Korean
164 'shift_jis'=>1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
165 );
166
// Maps alternative charset names (lowercase) to the canonical names used by this class.
// Looked up by parse_charset() after lowercasing the input.
// see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
// http://czyborra.com/charsets/iso8859.html
var $synonyms=array(
	'us' => 'ascii',
	'us-ascii'=> 'ascii',
	'cp819' => 'iso-8859-1',
	'ibm819' => 'iso-8859-1',
	'iso-ir-100' => 'iso-8859-1',
	'iso-ir-109' => 'iso-8859-2',
	'iso-ir-148' => 'iso-8859-9',
	'iso-ir-199' => 'iso-8859-14',
	'iso-ir-203' => 'iso-8859-15',
	'csisolatin1' => 'iso-8859-1',
	'csisolatin2' => 'iso-8859-2',
	'csisolatin3' => 'iso-8859-3',
	'csisolatin5' => 'iso-8859-9',
	'csisolatin8' => 'iso-8859-14',
	'csisolatin9' => 'iso-8859-15',
	'csisolatingreek' => 'iso-8859-7',
	'iso-celtic' => 'iso-8859-14',
	'latin1' => 'iso-8859-1',
	'latin2' => 'iso-8859-2',
	'latin3' => 'iso-8859-3',
	'latin5' => 'iso-8859-9',
	'latin6' => 'iso-8859-10',
	'latin8' => 'iso-8859-14',
	'latin9' => 'iso-8859-15',
	'l1' => 'iso-8859-1',
	'l2' => 'iso-8859-2',
	'l3' => 'iso-8859-3',
	'l5' => 'iso-8859-9',
	'l6' => 'iso-8859-10',
	'l8' => 'iso-8859-14',
	'l9' => 'iso-8859-15',
	'cyrillic' => 'iso-8859-5',
	'arabic' => 'iso-8859-6',
	'tis-620' => 'iso-8859-11',
	'win874' => 'windows-874',
	'win1250' => 'windows-1250',
	'win1251' => 'windows-1251',
	'win1252' => 'windows-1252',
	'win1253' => 'windows-1253',
	'win1254' => 'windows-1254',
	'win1255' => 'windows-1255',
	'win1256' => 'windows-1256',
	'win1257' => 'windows-1257',
	'win1258' => 'windows-1258',
	'cp1250' => 'windows-1250',
	'cp1251' => 'windows-1251',
	'cp1252' => 'windows-1252',
	'ms-ee' => 'windows-1250',
	'ms-ansi' => 'windows-1252',
	'ms-greek' => 'windows-1253',
	'ms-turk' => 'windows-1254',
	'winbaltrim' => 'windows-1257',
	'koi-8ru' => 'koi-8r',
	'koi8r' => 'koi-8r',
	'cp878' => 'koi-8r',
	'mac' => 'macroman',
	'macintosh' => 'macroman',
	'euc-cn' => 'gb2312',
	'x-euc-cn' => 'gb2312',
	'euccn' => 'gb2312',
	'cp936' => 'gb2312',
	'big-5' => 'big5',
	'cp950' => 'big5',
	'eucjp' => 'euc-jp',
	'sjis' => 'shift_jis',
	'shift-jis' => 'shift_jis',
	'cp932' => 'shift_jis',
	'cp949' => 'euc-kr',
	'utf7' => 'utf-7',
	'utf8' => 'utf-8',
	'utf16' => 'utf-16',
	'utf32' => 'utf-32',
	'ucs2' => 'ucs-2',
	'ucs4' => 'ucs-4',
);
246
// Mapping of language identifiers to language (family) names.
// The family names are the keys of $lang_to_charset_unix / $lang_to_charset_windows;
// a value that is not a key there makes get_locale_charset() fall back to iso-8859-1.
var $lang_to_langfamily=array(
		// iso-639:2 language codes, see:
		// http://www.w3.org/WAI/ER/IG/ert/iso639.htm
		// http://www.unicode.org/onlinedat/languages.html
	'ar' => 'arabic',
	'bg' => 'cyrillic',
	'cs' => 'east_european',
	'da' => 'west_european',
	'de' => 'west_european',
	'es' => 'west_european',
	'et' => 'estonian',
	'eu' => 'west_european',
	'fi' => 'west_european',
	'fr' => 'west_european',
	'gr' => 'greek',
	'hr' => 'east_european',
	'hu' => 'east_european',
	'iw' => 'hebrew',
	'is' => 'west_european',
	'it' => 'west_european',
	'ja' => 'japanese',
	'kl' => 'west_european',
	'ko' => 'korean',
	'lt' => 'lithuanian',
	'lv' => 'west_european',	// Latvian/Lettish
	'nl' => 'west_european',
	'no' => 'west_european',
	'pl' => 'east_european',
	'pt' => 'west_european',
	'ro' => 'east_european',
	'ru' => 'cyrillic',
	'sk' => 'east_european',
	'sl' => 'east_european',
	'sv' => 'west_european',
	'th' => 'thai',
	'uk' => 'cyrillic',
	'vi' => 'vietnamese',
	'zh' => 'chinese',
		// MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
	'chs' => 'simpl_chinese',
	'cht' => 'trad_chinese',
	'csy' => 'east_european',
	'dan' => 'west_european',
	'deu' => 'west_european',
	'dea' => 'west_european',
	'des' => 'west_european',
	'ena' => 'west_european',
	'enc' => 'west_european',
	'eng' => 'west_european',
	'enz' => 'west_european',
	'enu' => 'west_european',
	'nld' => 'west_european',
	'nlb' => 'west_european',
	'fin' => 'west_european',
	'fra' => 'west_european',
	'frb' => 'west_european',
	'frc' => 'west_european',
	'frs' => 'west_european',
	'ell' => 'greek',
	'hun' => 'east_european',
	'isl' => 'west_european',	// was misspelt 'west_euorpean', which is not a key of the charset tables
	'ita' => 'west_european',
	'its' => 'west_european',
	'jpn' => 'japanese',
	'kor' => 'korean',
	'nor' => 'west_european',
	'non' => 'west_european',
	'plk' => 'east_european',
	'ptg' => 'west_european',
	'ptb' => 'west_european',
	'rus' => 'cyrillic',	// consistent with 'ru' and 'russian'
	'sky' => 'east_european',
	'esp' => 'west_european',
	'esm' => 'west_european',
	'esn' => 'west_european',
	'sve' => 'west_european',
	'trk' => 'turkish',
		// English language names
	'bulgarian' => 'cyrillic',	// consistent with 'bg'
	'catalan' => 'west_european',
	'croatian' => 'east_european',
	'czech' => 'east_european',
	'danish' => 'west_european',
	'dutch' => 'west_european',
	'english' => 'west_european',
	'finnish' => 'west_european',
	'french' => 'west_european',
	'galician' => 'west_european',
	'german' => 'west_european',
	'hungarian' => 'east_european',
	'icelandic' => 'west_european',
	'italian' => 'west_european',
	'latvian' => 'west_european',
	'lettish' => 'west_european',
	'norwegian' => 'west_european',
	'polish' => 'east_european',
	'portuguese' => 'west_european',
	'russian' => 'cyrillic',
	'romanian' => 'east_european',
	'slovak' => 'east_european',
	'slovenian' => 'east_european',
	'spanish' => 'west_european',
	'svedish' => 'west_european',	// historic misspelling, kept for backward compatibility
	'swedish' => 'west_european',
	'turkish' => 'turkish',	// consistent with 'trk'
	'ukrainian' => 'cyrillic',
);
354
355 // mapping of language (family) names to charsets on Unix
356 var $lang_to_charset_unix=array(
357 'west_european' => 'iso-8859-1',
358 'estonian' => 'iso-8859-1',
359 'east_european' => 'iso-8859-2',
360 'baltic' => 'iso-8859-4',
361 'cyrillic' => 'iso-8859-5',
362 'arabic' => 'iso-8859-6',
363 'greek' => 'iso-8859-7',
364 'hebrew' => 'iso-8859-8',
365 'turkish' => 'iso-8859-9',
366 'thai' => 'iso-8859-11', // = TIS-620
367 'lithuanian' => 'iso-8859-13',
368 'chinese' => 'gb2312', // = euc-cn
369 'japanese' => 'euc-jp',
370 'korean' => 'euc-kr',
371 'simpl_chinese' => 'gb2312',
372 'trad_chinese' => 'big5',
373 'vietnamese' => '',
374 );
375
376 // mapping of language (family) names to charsets on Windows
377 var $lang_to_charset_windows=array(
378 'east_european' => 'windows-1250',
379 'cyrillic' => 'windows-1251',
380 'west_european' => 'windows-1252',
381 'greek' => 'windows-1253',
382 'turkish' => 'windows-1254',
383 'hebrew' => 'windows-1255',
384 'arabic' => 'windows-1256',
385 'baltic' => 'windows-1257',
386 'estonian' => 'windows-1257',
387 'lithuanian' => 'windows-1257',
388 'vietnamese' => 'windows-1258',
389 'thai' => 'cp874',
390 'korean' => 'cp949',
391 'chinese' => 'gb2312',
392 'japanese' => 'shift_jis',
393 'simpl_chinese' => 'gb2312',
394 'trad_chinese' => 'big5',
395 );
396
397 // mapping of locale names to charsets
398 var $locale_to_charset=array(
399 'japanese.euc' => 'euc-jp',
400 'ja_jp.ujis' => 'euc-jp',
401 'korean.euc' => 'euc-kr',
402 'zh_cn' => 'gb2312',
403 'zh_hk' => 'big5',
404 'zh_tw' => 'big5',
405 );
406
407 // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
408 // Empty values means "iso-8859-1"
409 var $charSetArray = array(
410 'dk' => '',
411 'de' => '',
412 'no' => '',
413 'it' => '',
414 'fr' => '',
415 'es' => '',
416 'nl' => '',
417 'cz' => 'windows-1250',
418 'pl' => 'iso-8859-2',
419 'si' => 'windows-1250',
420 'fi' => '',
421 'tr' => 'iso-8859-9',
422 'se' => '',
423 'pt' => '',
424 'ru' => 'windows-1251',
425 'ro' => 'iso-8859-2',
426 'ch' => 'gb2312',
427 'sk' => 'windows-1250',
428 'lt' => 'windows-1257',
429 'is' => 'utf-8',
430 'hr' => 'windows-1250',
431 'hu' => 'iso-8859-2',
432 'gl' => '',
433 'th' => 'iso-8859-11',
434 'gr' => 'iso-8859-7',
435 'hk' => 'big5',
436 'eu' => '',
437 'bg' => 'windows-1251',
438 'br' => '',
439 'et' => 'iso-8859-4',
440 'ar' => 'iso-8859-6',
441 'he' => 'utf-8',
442 'ua' => 'windows-1251',
443 'jp' => 'shift_jis',
444 'lv' => 'utf-8',
445 'vn' => 'utf-8',
446 'ca' => 'iso-8859-15',
447 'ba' => 'iso-8859-2',
448 'kr' => 'euc-kr',
449 'eo' => 'utf-8',
450 'my' => '',
451 'hi' => 'utf-8',
452 );
453
454 // TYPO3 specific: Array with the iso names used for each system language in TYPO3:
// Empty values mean the ISO code is the same as the TYPO3 language key
456 var $isoArray = array(
457 'dk' => 'da',
458 'de' => '',
459 'no' => '',
460 'it' => '',
461 'fr' => '',
462 'es' => '',
463 'nl' => '',
464 'cz' => 'cs',
465 'pl' => '',
466 'si' => 'sl',
467 'fi' => '',
468 'tr' => '',
469 'se' => 'sv',
470 'pt' => '',
471 'ru' => '',
472 'ro' => '',
473 'ch' => 'zh_CN',
474 'sk' => '',
475 'lt' => '',
476 'is' => '',
477 'hr' => '',
478 'hu' => '',
479 'gl' => '', // Greenlandic
480 'th' => '',
481 'gr' => 'el',
482 'hk' => 'zh_HK',
483 'eu' => '',
484 'bg' => '',
485 'br' => 'pt_BR',
486 'et' => '',
487 'ar' => '',
488 'he' => 'iw',
489 'ua' => 'uk',
490 'jp' => 'ja',
491 'lv' => '',
492 'vn' => 'vi',
493 'ca' => '',
494 'ba' => '', // Bosnian
495 'kr' => '',
496 );
497
/**
 * Normalizes a charset name: the name is lowercased and, if it is listed
 * in $this->synonyms, replaced by the canonical name stored there.
 *
 * @param string Input charset
 * @return string Normalized charset
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function parse_charset($charset) {
	$normalized = strtolower($charset);

	return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
}
511
/**
 * Get the charset of a locale.
 *
 * Accepted locale formats:
 * ln language
 * ln_CN language / country
 * ln_CN.cs language / country / charset
 * ln_CN.cs@mod language / country / charset / modifier
 *
 * Resolution order: exact match in $this->locale_to_charset, explicit charset
 * part, the 'euro' modifier, then the language (family) tables for the
 * current OS. If nothing matches, 'iso-8859-1' is returned.
 *
 * @param string Locale string
 * @return string Charset resolved for locale string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function get_locale_charset($locale) {
	$locale = strtolower($locale);

		// exact locale specific charset?
	if (isset($this->locale_to_charset[$locale])) return $this->locale_to_charset[$locale];

		// get modifier (array_pad guarantees both list() targets exist when there is no '@')
	list($locale,$modifier) = array_pad(explode('@',$locale),2,'');

		// locale contains charset: use it
	list($locale,$charset) = array_pad(explode('.',$locale),2,'');
	if ($charset) return $this->parse_charset($charset);

		// modifier is 'euro' (after charset check, because of xx.utf-8@euro)
	if ($modifier == 'euro') return 'iso-8859-15';

		// get language (the country part is not needed for the lookup)
	list($language) = explode('_',$locale);
	if (isset($this->lang_to_langfamily[$language])) $language = $this->lang_to_langfamily[$language];

		// pick the table for the current OS; unknown languages resolve to '' instead of raising a notice
	$table = (TYPO3_OS == 'WIN') ? $this->lang_to_charset_windows : $this->lang_to_charset_unix;
	$cs = isset($table[$language]) ? $table[$language] : '';

	return $cs ? $cs : 'iso-8859-1';
}
552
553
554
555
556
557
558
559
560
561 /********************************************
562 *
563 * Charset Conversion functions
564 *
565 ********************************************/
566
/**
 * Converts a string from one charset to another.
 *
 * If the target is UTF-8, or entity fallback is not requested, the conversion
 * library selected in $TYPO3_CONF_VARS['SYS']['t3lib_cs_convMethod']
 * (mbstring, iconv or recode) is tried first. When that yields FALSE, or no
 * library is configured, the built-in table conversion via UTF-8 is used.
 *
 * @param string Input string
 * @param string From charset (the current charset of the string)
 * @param string To charset (the output charset wanted)
 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Converted string
 * @see convArray()
 */
function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) {
	if ($fromCS==$toCS) {
		return $str;
	}

		// PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
	if ($toCS=='utf-8' || !$useEntityForNoChar) {
		$converted = false;

		switch($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']) {
			case 'mbstring':
				$converted = mb_convert_encoding($str,$toCS,$fromCS);	// returns false for unsupported charsets
			break;
			case 'iconv':
				$converted = iconv($fromCS,$toCS.'//TRANSLIT',$str);
			break;
			case 'recode':
				$converted = recode_string($fromCS.'..'.$toCS,$str);
			break;
		}

		if (false !== $converted) {
			return $converted;
		}
			// otherwise: fallback to TYPO3 conversion
	}

		// TYPO3 conversion always goes through UTF-8 as the intermediate form
	if ($fromCS!='utf-8') {
		$str = $this->utf8_encode($str,$fromCS);
	}
	if ($toCS!='utf-8') {
		$str = $this->utf8_decode($str,$toCS,$useEntityForNoChar);
	}

	return $str;
}
605
/**
 * Converts every non-array element of an array from one charset to another,
 * descending recursively into nested arrays.
 * NOTICE: Array is passed by reference and modified in place!
 *
 * @param array Input array, possibly multidimensional
 * @param string From charset (the current charset of the string)
 * @param string To charset (the output charset wanted)
 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return void
 * @see conv()
 */
function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0) {
	foreach($array as $key => $unused) {
			// Always work on $array[$key] so the referenced array itself is updated
		if (!is_array($array[$key])) {
			$array[$key] = $this->conv($array[$key],$fromCS,$toCS,$useEntityForNoChar);
			continue;
		}

		$this->convArray($array[$key],$fromCS,$toCS,$useEntityForNoChar);
	}
}
626
/**
 * Converts $str from $charset to UTF-8
 *
 * Bytes 0-127 are passed through unchanged, except for the charsets listed in
 * $this->twoByteSets, where every character is read as two bytes (big endian).
 * For charsets in $this->eucBasedSets a byte above 127 is combined with the
 * following byte into a 16 bit number (except Shift-JIS single-byte katakana).
 * Characters without an entry in the conversion table are replaced by
 * chr($this->noCharByteVal).
 *
 * @param string String in local charset to convert to UTF-8
 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return string Output string, converted to UTF-8. Nothing (NULL) is returned if the charset could not be initialized.
 */
function utf8_encode($str,$charset) {

	if ($charset === 'utf-8') return $str;

		// Parse conv. table if not already done; without a table no conversion is possible
	if (!$this->initCharset($charset)) return;

	$noChar = chr($this->noCharByteVal);
	$isTwoByteSet = isset($this->twoByteSets[$charset]);
	$isEucBasedSet = isset($this->eucBasedSets[$charset]);

	$strLen = strlen($str);
	$outStr = '';

	for ($a=0;$a<$strLen;$a++) {	// Traverse each char in string.
		$chr = substr($str,$a,1);
		$ord = ord($chr);

		if ($isTwoByteSet) {	// If the charset has two bytes per char
				// substr() yields an empty value past the end, so a truncated trailing byte counts as 0
			$ord2 = ord(substr($str,$a+1,1));
			$ord = ($ord << 8) | $ord2;	// assume big endian; was "$ord<<8 & $ord2", which always evaluated to 0

			$outStr.= isset($this->parsedCharsets[$charset]['local'][$ord]) ? $this->parsedCharsets[$charset]['local'][$ord] : $noChar;
			$a++;
		} elseif ($ord>127) {	// Values over 127 have to be looked up in the conversion table
				// EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
				// Shift-JIS: chars between 160 and 223 are single byte
			if ($isEucBasedSet && ($charset != 'shift_jis' || $ord < 0xA0 || $ord > 0xDF)) {
				$a++;
				$ord2 = ord(substr($str,$a,1));
				$ord = $ord*256+$ord2;
			}

			$outStr.= isset($this->parsedCharsets[$charset]['local'][$ord]) ? $this->parsedCharsets[$charset]['local'][$ord] : $noChar;
		} else {
			$outStr.= $chr;	// ... otherwise it's just ASCII 0-127 and one byte. Transparent
		}
	}

	return $outStr;
}
671
/**
 * Converts $str from UTF-8 to $charset
 *
 * Bytes 0-127 are passed through unchanged. Each multibyte sequence is looked
 * up in the conversion table of the charset; local numbers above 255 are
 * written as two bytes (high byte first). Sequences without a table entry
 * become a numeric entity or chr($this->noCharByteVal), depending on
 * $useEntityForNoChar.
 *
 * @param string String in UTF-8 to convert to local charset
 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Output string, converted to local charset. Nothing (NULL) is returned if the charset could not be initialized.
 */
function utf8_decode($str,$charset,$useEntityForNoChar=0) {

		// Parse conv. table if not already done; without a table no conversion is possible
	if (!$this->initCharset($charset)) return;

	$length = strlen($str);
	$result = '';

	for ($pos=0;$pos<$length;$pos++) {	// Traverse each byte in UTF-8 string.
		$byte = substr($str,$pos,1);
		$code = ord($byte);

		if ($code<=127) {	// ASCII 0-127 and one byte. Transparent
			$result.= $byte;
			continue;
		}
		if (!($code & 64)) {	// Lead bytes have the 7th bit set; otherwise we are in the MIDDLE of a MB sequence
			$result.= chr($this->noCharByteVal);
			continue;
		}

			// Collect the whole sequence: every additional high bit in the lead byte announces one more byte
		$sequence = $byte;
		for ($n=0;$n<8;$n++) {
			$code = $code << 1;
			if (!($code & 128)) break;
			$pos++;
			$sequence.= substr($str,$pos,1);
		}

		if (isset($this->parsedCharsets[$charset]['utf8'][$sequence])) {	// If the UTF-8 char-sequence is found then...
			$local = $this->parsedCharsets[$charset]['utf8'][$sequence];	// The local number
				// Numbers greater than 255 are split in two chars (16bit word assumed)
			$result.= ($local>255) ? chr(($local >> 8) & 255).chr($local & 255) : chr($local);
		} elseif ($useEntityForNoChar) {	// Create num entity:
			$result.= '&#'.$this->utf8CharToUnumber($sequence,1).';';
		} else {
			$result.= chr($this->noCharByteVal);	// No char exists
		}
	}

	return $result;
}
716
/**
 * Converts all chars > 127 to numeric (hexadecimal) entities.
 * A byte that looks like the middle of a multibyte sequence is replaced by
 * chr($this->noCharByteVal).
 *
 * @param string Input string, UTF-8
 * @return string Output string
 */
function utf8_to_entities($str) {
	$length = strlen($str);
	$result = '';
	$pos = 0;

	while ($pos<$length) {	// Traverse each byte in UTF-8 string.
		$byte = substr($str,$pos,1);
		$code = ord($byte);

		if ($code<=127) {	// ASCII 0-127 and one byte. Transparent
			$result.= $byte;
		} elseif (!($code & 64)) {	// Lead bytes have the 7th bit set; otherwise we are in the MIDDLE of a MB sequence
			$result.= chr($this->noCharByteVal);
		} else {
				// Collect the whole sequence: every additional high bit in the lead byte announces one more byte
			$sequence = $byte;
			for ($n=0;$n<8;$n++) {
				$code = $code << 1;
				if (!($code & 128)) break;
				$pos++;
				$sequence.= substr($str,$pos,1);
			}

			$result.= '&#'.$this->utf8CharToUnumber($sequence,1).';';
		}

		$pos++;
	}

	return $result;
}
748
/**
 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
 *
 * Numeric entities that are not well-formed (eg. "&#;" or "&#12ab;") and
 * unknown named entities are left untouched.
 *
 * @param string Input string, UTF-8
 * @param boolean If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
 * @return string Output string
 */
function entities_to_utf8($str,$alsoStdHtmlEnt=0) {
	$trans_tbl = array();
	if ($alsoStdHtmlEnt) {
			// The table is needed in iso-8859-1 because the values are passed through utf8_encode() below.
			// Since PHP 5.4 the default table encoding is UTF-8, so the charset is requested explicitly
			// where the third argument is supported (PHP >= 5.3.4).
		if (version_compare(PHP_VERSION,'5.3.4','>=')) {
			$trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES,ENT_COMPAT,'ISO-8859-1'));
		} else {
			$trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES));
		}
	}

		// Split on entities; with PREG_SPLIT_DELIM_CAPTURE every odd element holds the entity name (without "&" and ";")
	$parts = preg_split('/&([#[:alnum:]]*);/',$str,-1,PREG_SPLIT_DELIM_CAPTURE);
	$count = count($parts);

	for ($k=1;$k<$count;$k+=2) {
		$v = $parts[$k];
		$match = array();

		if (preg_match('/^#[xX]([0-9a-fA-F]+)$/',$v,$match)) {	// Hex entity
			$parts[$k] = $this->UnumberToChar(hexdec($match[1]));
		} elseif (preg_match('/^#([0-9]+)$/',$v,$match)) {	// Dec entity
			$parts[$k] = $this->UnumberToChar(intval($match[1]));
		} elseif (substr($v,0,1)!='#' && isset($trans_tbl['&'.$v.';'])) {	// Other entities:
			$parts[$k] = $this->utf8_encode($trans_tbl['&'.$v.';'],'iso-8859-1');
		} else {	// No conversion:
			$parts[$k] = '&'.$v.';';
		}
	}

	return implode('',$parts);
}
781
/**
 * Splits the input UTF-8 string into its characters and returns them in an
 * array, either as integer numbers or as the UTF-8 characters themselves.
 *
 * @param string Input string, UTF-8
 * @param boolean If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
 * @param boolean If set, then instead of integer numbers the real UTF-8 char is returned.
 * @return array Output array with the char numbers
 */
function utf8_to_numberarray($str,$convEntities=0,$retChar=0) {
		// If entities must be registered as well...:
	if ($convEntities) {
		$str = $this->entities_to_utf8($str,1);
	}

		// Do conversion:
	$length = strlen($str);
	$result = array();
	$pos = 0;

	while ($pos<$length) {	// Traverse each byte in UTF-8 string.
		$byte = substr($str,$pos,1);
		$code = ord($byte);

		if ($code<=127) {	// ASCII 0-127 and one byte. Transparent
			$result[] = $retChar ? chr($code) : $code;
		} elseif (!($code & 64)) {	// Lead bytes have the 7th bit set; otherwise we are in the MIDDLE of a MB sequence
			$result[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
		} else {
				// Collect the whole sequence: every additional high bit in the lead byte announces one more byte
			$sequence = $byte;
			for ($n=0;$n<8;$n++) {
				$code = $code << 1;
				if (!($code & 128)) break;
				$pos++;
				$sequence.= substr($str,$pos,1);
			}

			$result[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
		}

		$pos++;
	}

	return $result;
}
820
/**
 * Converts a UNICODE number to a UTF-8 multibyte character
 * Algorithm based on script found at From: http://czyborra.com/utf/
 * Unit-tested by Kasper
 *
 * The bits of the number are spread across the bytes; the number of high bits
 * set in the lead byte announces the number of bytes in the sequence:
 *
 * bytes | bits | representation
 * 1 | 7 | 0vvvvvvv
 * 2 | 11 | 110vvvvv 10vvvvvv
 * 3 | 16 | 1110vvvv 10vvvvvv 10vvvvvv
 * 4 | 21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 * 5 | 26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 * 6 | 31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * @param integer UNICODE integer
 * @return string UTF-8 multibyte character string
 * @see utf8CharToUnumber()
 */
function UnumberToChar($cbyte) {
		// Single byte: plain 7-bit value
	if ($cbyte < 0x80) {
		return chr($cbyte);
	}

		// Pick the lead byte marker and the number of continuation bytes
	if ($cbyte < 0x800) {
		$leadMark = 0xC0;
		$trailing = 1;
	} elseif ($cbyte < 0x10000) {
		$leadMark = 0xE0;
		$trailing = 2;
	} elseif ($cbyte < 0x200000) {
		$leadMark = 0xF0;
		$trailing = 3;
	} elseif ($cbyte < 0x4000000) {
		$leadMark = 0xF8;
		$trailing = 4;
	} elseif ($cbyte < 0x80000000) {
		$leadMark = 0xFC;
		$trailing = 5;
	} else {	// Cannot express a 32-bit character in UTF-8
		return chr($this->noCharByteVal);
	}

		// Lead byte carries the topmost bits, each continuation byte the next 6 bits
	$str = chr($leadMark | ($cbyte >> (6 * $trailing)));
	for ($shift = 6 * ($trailing - 1); $shift >= 0; $shift -= 6) {
		$str.= chr(0x80 | (($cbyte >> $shift) & 0x3F));
	}

	return $str;
}
875
/**
 * Converts a UTF-8 Multibyte character to a UNICODE number
 * Unit-tested by Kasper
 *
 * Only the first character of the input is evaluated. A first byte that is
 * not a multibyte lead byte is returned as its own byte value.
 *
 * @param string UTF-8 multibyte character string
 * @param boolean If set, then a hex. number is returned.
 * @return mixed UNICODE integer, or the string "x" followed by the hexadecimal number if $hex is set
 * @see UnumberToChar()
 */
function utf8CharToUnumber($str,$hex=0) {
	$lead = ord(substr($str,0,1));	// First char

	if (($lead & 192) != 192) {	// Not a multi byte lead byte: the value is the byte itself
		$int = $lead;
	} else {
		$shifted = $lead;
		$tailBits = '';
		$b = 0;

			// Every additional high bit in the lead byte announces one continuation byte carrying 6 bits
		while ($b<8) {
			$shifted = $shifted << 1;
			if (!($shifted & 128)) break;
			$tailBits.= substr('00000000'.decbin(ord(substr($str,$b+1,1))),-6);
			$b++;
		}

			// The lead byte contributes its lowest (6 - number of continuation bytes) bits
		$leadBits = substr('00000000'.decbin($lead),-(6-$b));

		$int = bindec($leadBits.$tailBits);
	}

	return $hex ? 'x'.dechex($int) : $int;
}
903
904
905
906
907
908
909
910
911
912 /********************************************
913 *
914 * Init functions
915 *
916 ********************************************/
917
/**
 * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
 * This function is automatically called by the conversion functions
 *
 * The parsed table is stored in $this->parsedCharsets[$charset] with two maps:
 * 'local' (local char number => UTF-8 char) and 'utf8' (UTF-8 char => local char number).
 * Only local numbers above 127 are stored. A serialized copy is cached below typo3temp/cs/.
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param string The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return integer Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed.
 * @access private
 */
function initCharset($charset) {
		// Only process if the charset is not yet loaded:
	if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) return 1;

		// Conversion table filename:
	$charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';

		// If the conversion table is not found:
	if (!$charset || !t3lib_div::validPathStr($charsetConvTableFile) || !@is_file($charsetConvTableFile)) return false;

		// Cache file for charsets:
		// Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$cached = unserialize(t3lib_div::getUrl($cacheFile));
		if (is_array($cached)) {
			$this->parsedCharsets[$charset] = $cached;
			return 2;
		}
			// Unreadable or corrupt cache file: fall through, parse the table again and rewrite the cache
	}

		// Parse conversion table into lines:
	$lines = t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1);
		// Initialize the internal variable holding the conv. table:
	$this->parsedCharsets[$charset] = array('local'=>array(),'utf8'=>array());

		// The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
	$whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
	$detectedType = '';

		// traverse the lines:
	foreach($lines as $value) {
			// Comment line or blanks are ignored.
		if (!trim($value) || substr($value,0,1)=='#') continue;

			// Detect type if not done yet: (Done on first real line)
		if (!$detectedType) $detectedType = preg_match($whitespacedPattern,$value) ? 'whitespaced' : 'ms-token';

		if ($detectedType=='ms-token') {
			$tokens = preg_split('/=|:/',$value,3);
			if (count($tokens)<2) continue;	// no "=" or ":" separator, nothing to map
			$hexbyte = $tokens[0];
			$utf8 = $tokens[1];
		} else {
			$regA = array();
			if (!preg_match($whitespacedPattern,$value,$regA)) continue;	// line does not follow the table syntax
			$hexbyte = $regA[1];
			$utf8 = 'U+'.$regA[2];
		}

		$decval = hexdec(trim($hexbyte));
		if ($decval>127) {	// 0-127 is treated as plain ASCII by the conversion functions
			$utf8Char = $this->UnumberToChar(hexdec(substr(trim($utf8),2)));	// strip the "U+" prefix
			$this->parsedCharsets[$charset]['local'][$decval] = $utf8Char;
			$this->parsedCharsets[$charset]['utf8'][$utf8Char] = $decval;
		}
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->parsedCharsets[$charset]));
	}

	return 2;
}
980
/**
 * This function initializes all UTF-8 character data tables.
 *
 * If the requested table is neither in memory nor in its cache file, BOTH tables
 * ($this->caseFolding['utf-8'] and $this->toASCII['utf-8']) are rebuilt from the files
 * UnicodeData.txt, SpecialCasing.txt and Translit.txt in PATH_t3lib.'unidata/' and written to the cache files.
 *
 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
 *
 * @param	string		Mode ("case", "ascii", ...)
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initUnicodeData($mode=null) {
		// Cache files
	$cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
	$cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');

		// Only process if the tables are not yet loaded
	switch($mode) {
		case 'case':
			if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8'])) return 1;

				// Use cached version if possible
			if ($cacheFileCase && @is_file($cacheFileCase)) {
				$this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase));
				return 2;
			}
		break;

		case 'ascii':
			if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8'])) return 1;

				// Use cached version if possible
			if ($cacheFileASCII && @is_file($cacheFileASCII)) {
				$this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII));
				return 2;
			}
		break;
	}

		// Process main Unicode data file
	$unicodeDataFile = PATH_t3lib.'unidata/UnicodeData.txt';
	if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) return false;

	$fh = fopen($unicodeDataFile,'rb');
	if (!$fh) return false;

		// key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
		// note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
	$this->caseFolding['utf-8'] = array();
	$utf8CaseFolding =& $this->caseFolding['utf-8'];	// a shorthand
	$utf8CaseFolding['toUpper'] = array();
	$utf8CaseFolding['toLower'] = array();
	$utf8CaseFolding['toTitle'] = array();

	$decomposition = array();	// array of temp. decompositions
	$mark = array();		// array of chars that are marks (eg. composing accents)
	$number = array();		// array of chars that are numbers (eg. digits)
	$omit = array();		// array of chars to be omitted (eg. Russian hard sign)

		// Reading until fgets() fails (instead of testing feof() first) avoids processing a bogus empty line at the end of the file:
	while (($line = fgets($fh,4096)) !== false) {
		if (trim($line) === '') continue;

			// The line has a lot of info; only these fields are used:
			// 0 = code point, 1 = name, 2 = general category, 5 = decomposition, 8 = numeric value, 12/13/14 = upper/lower/title case mapping
		$fields = array_pad(explode(';', rtrim($line)), 15, '');
		$char = $fields[0];
		$name = $fields[1];
		$cat = $fields[2];
		$decomp = $fields[5];
		$num = $fields[8];
		$upper = $fields[12];
		$lower = $fields[13];
		$title = $fields[14];

		$ord = hexdec($char);
		if ($ord > 0xFFFF) break;	// only process the BMP

		$utf8_char = $this->UnumberToChar($ord);

		if ($upper) $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
		if ($lower) $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
			// store "title" only when different from "upper" (only a few)
			// (strict comparison: hex strings like "00E9" would otherwise be compared as numbers)
		if ($title && $title !== $upper) $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));

		switch (substr($cat,0,1)) {
			case 'M':	// mark (accent, umlaut, ...)
				$mark["U+$char"] = 1;
			break;

			case 'N':	// numeric value
				if ($ord > 0x80 && $num != '') $number["U+$char"] = $num;
			break;
		}

			// accented Latin letters without "official" decomposition
		$match = array();
		if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/',$name,$match) && !$decomp) {
			$c = ord($match[2]);
			if ($match[1] == 'SMALL') $c += 32;

			$decomposition["U+$char"] = array(dechex($c));
			continue;
		}

		$match = array();
		if (preg_match('/(<.*>)? *(.+)/',$decomp,$match)) {
			switch($match[1]) {
				case '<circle>':	// add parenthesis as circle replacement, eg (1)
					$match[2] = '0028 '.$match[2].' 0029';
				break;

				case '<square>':	// add square brackets as square replacement, eg [1]
					$match[2] = '005B '.$match[2].' 005D';
				break;

				case '<compat>':	// ignore multi char decompositions that start with a space
					if (strncmp($match[2],'0020 ',5) === 0) continue 2;
				break;

					// ignore Arabic and vertical layout presentation decomposition
				case '<initial>':
				case '<medial>':
				case '<final>':
				case '<isolated>':
				case '<vertical>':
					continue 2;
			}
			$decomposition["U+$char"] = explode(' ',$match[2]);
		}
	}
	fclose($fh);

		// Process additional Unicode data for casing (allow folded characters to expand into a sequence)
	$specialCasingFile = PATH_t3lib.'unidata/SpecialCasing.txt';
	if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
		$fh = fopen($specialCasingFile,'rb');
		if ($fh) {
			while (($line = fgets($fh,4096)) !== false) {
				if (substr($line,0,1) == '#' || trim($line) === '') continue;

				$fields = array_pad(t3lib_div::trimExplode(';', $line), 5, '');
				$char = $fields[0];
				$cond = (string)$fields[4];

					// Only unconditional mappings are used (5th field is empty or already the trailing comment):
				if ($cond !== '' && substr($cond,0,1) != '#') continue;

				$utf8_char = $this->UnumberToChar(hexdec($char));
				$targets = array(
					'toLower' => (string)$fields[1],
					'toTitle' => (string)$fields[2],
					'toUpper' => (string)$fields[3]
				);
				foreach($targets as $direction => $sequence) {
					if ((string)$char === $sequence) continue;	// maps to itself
					if ($direction == 'toTitle' && $sequence === $targets['toUpper']) continue;	// title case is only stored when it differs from upper case

					$folded = '';
					foreach(explode(' ',$sequence) as $codePoint) {
						$folded .= $this->UnumberToChar(hexdec($codePoint));
					}
					$utf8CaseFolding[$direction][$utf8_char] = $folded;
				}
			}
			fclose($fh);
		}
	}

		// Process custom decompositions
	$customTranslitFile = PATH_t3lib.'unidata/Translit.txt';
	if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
		$fh = fopen($customTranslitFile,'rb');
		if ($fh) {
			while (($line = fgets($fh,4096)) !== false) {
				if (substr($line,0,1) == '#' || trim($line) === '') continue;

				$fields = array_pad(t3lib_div::trimExplode(';', $line), 2, '');
				$char = $fields[0];
				$translit = $fields[1];
				if (!$translit) $omit["U+$char"] = 1;	// an empty transliteration means: drop the character
				$decomposition["U+$char"] = explode(' ', $translit);
			}
			fclose($fh);
		}
	}

		// Decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
	foreach($decomposition as $from => $to) {
		$code_decomp = array();

		while ($code_value = array_shift($to)) {
			if (isset($decomposition["U+$code_value"])) {	// do recursive decomposition
				foreach(array_reverse($decomposition["U+$code_value"]) as $cv) {
					array_unshift($to, $cv);
				}
			} elseif (!isset($mark["U+$code_value"])) {	// remove mark
				array_push($code_decomp, $code_value);
			}
		}
		if (count($code_decomp) || isset($omit[$from])) {
			$decomposition[$from] = $code_decomp;
		} else {
			unset($decomposition[$from]);
		}
	}

		// Create ascii only mapping
	$this->toASCII['utf-8'] = array();
	$ascii =& $this->toASCII['utf-8'];

	foreach($decomposition as $from => $to) {
		$code_decomp = array();
		while ($code_value = array_shift($to)) {
			$ord = hexdec($code_value);
			if ($ord > 127) {
				continue 2;	// skip decompositions containing non-ASCII chars
			}
			array_push($code_decomp,chr($ord));
		}
			// $from has the form "U+XXXX"; strip the prefix before converting
		$ascii[$this->UnumberToChar(hexdec(substr($from,2)))] = join('',$code_decomp);
	}

		// Add numeric decompositions
	foreach($number as $from => $to) {
		$utf8_char = $this->UnumberToChar(hexdec(substr($from,2)));
		if (!isset($ascii[$utf8_char])) {
			$ascii[$utf8_char] = $to;
		}
	}

	if ($cacheFileCase) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileCase,serialize($utf8CaseFolding));
	}

	if ($cacheFileASCII) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileASCII,serialize($ascii));
	}

	return 3;
}
1206
/**
 * This function initializes the folding table for a charset other than UTF-8.
 * This function is automatically called by the case folding functions.
 *
 * The table is derived from the UTF-8 case folding table: every character of the charset's
 * conversion table is looked up there and the result is converted back to the charset.
 *
 * @param	string		Charset for which to initialize case folding.
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initCaseFolding($charset) {
		// Only process if the case table is not yet loaded:
	if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset])) return 1;

		// Use cached version if possible
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

		// Init UTF-8 conversion for this charset
	if (!$this->initCharset($charset)) {
		return false;
	}

		// UTF-8 case folding is used as the base conversion table
	if (!$this->initUnicodeData('case')) {
		return false;
	}

	$this->caseFolding[$charset] = array();
	$nochar = chr($this->noCharByteVal);
	$directions = array('toUpper','toLower','toTitle');

	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
			// Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);

		foreach ($directions as $direction) {
				// Most characters have no mapping in a given direction; skip them without touching undefined keys
			if (!isset($this->caseFolding['utf-8'][$direction][$utf8])) continue;

			$cc = $this->utf8_decode($this->caseFolding['utf-8'][$direction][$utf8], $charset);
				// Only keep mappings whose result can be expressed in the charset
			if ($cc != '' && $cc != $nochar) $this->caseFolding[$charset][$direction][$c] = $cc;
		}
	}

		// Add the ASCII case table
	for ($i=ord('a'); $i<=ord('z'); $i++) {
		$this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i-32);
	}
	for ($i=ord('A'); $i<=ord('Z'); $i++) {
		$this->caseFolding[$charset]['toLower'][chr($i)] = chr($i+32);
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->caseFolding[$charset]));
	}

	return 3;
}
1268
/**
 * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
 * This function is automatically called by the ASCII transliteration functions.
 *
 * The table is derived from the UTF-8 to-ASCII table: every character of the charset's
 * conversion table that has an entry there is stored under its local representation.
 *
 * @param	string		Charset for which to initialize conversion.
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initToASCII($charset) {
		// Only process if the conversion table is not yet loaded:
	if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) return 1;

		// Use cached version if possible
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->toASCII[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

		// Init UTF-8 conversion for this charset
	if (!$this->initCharset($charset)) {
		return false;
	}

		// UTF-8/ASCII transliteration is used as the base conversion table
	if (!$this->initUnicodeData('ascii')) {
		return false;
	}

		// Start with an empty table so that a charset without any transliterable character
		// is still stored (and cached) as an array and recognized as "loaded" next time
	$this->toASCII[$charset] = array();

	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
		if (isset($this->toASCII['utf-8'][$utf8])) {
				// Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
			$c = $this->utf8_decode($utf8, $charset);
			$this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
		}
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->toASCII[$charset]));
	}

	return 3;
}
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330 /********************************************
1331 *
1332 * String operation functions
1333 *
1334 ********************************************/
1335
/**
 * Returns a part of a string.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Start position (character position)
 * @param	integer		Length (in characters). If omitted (NULL) the rest of the string is returned.
 * @return	string		The substring
 * @see substr(), mb_substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function substr($charset,$string,$start,$len=null) {
	if ($len===0) return '';

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
			// The length cannot be omitted when a charset is passed to mb_substr();
			// the full character count has the same effect as "until the end of the string"
		if ($len==null) {
			$len = mb_strlen($string,$charset);
		}
			// Use the charset of the string (not a fixed 'utf-8'), like strlen() does
		return mb_substr($string,$start,$len,$charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_substr($string,$start,$len);
	} elseif (!empty($this->eucBasedSets[$charset])) {
		return $this->euc_substr($string,$start,$charset,$len);
	} elseif (!empty($this->twoByteSets[$charset])) {
			// Without the NULL check the length would become 0 and an empty string would be returned
		return $len === NULL ? substr($string,$start*2) : substr($string,$start*2,$len*2);
	} elseif (!empty($this->fourByteSets[$charset])) {
		return $len === NULL ? substr($string,$start*4) : substr($string,$start*4,$len*4);
	}

		// Treat everything else as single-byte encoding
	return $len === NULL ? substr($string,$start) : substr($string,$start,$len);
}
1375
/**
 * Returns the length of a string measured in characters of the given charset.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * @param	string		The character set
 * @param	string		Character string
 * @return	integer		The number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strlen($charset,$string) {
		// Let mbstring do the work if it is configured:
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strlen($string,$charset);
	}

		// Variable width encodings need a real count:
	if ($charset == 'utf-8') {
		return $this->utf8_strlen($string);
	}
	if ($this->eucBasedSets[$charset]) {
		return $this->euc_strlen($string,$charset);
	}

		// Fixed width encodings: divide the byte count by the character width
	$byteCount = strlen($string);
	if ($this->twoByteSets[$charset]) {
		return $byteCount/2;
	}
	if ($this->fourByteSets[$charset]) {
		return $byteCount/4;
	}

		// Everything else is treated as a single-byte encoding
	return $byteCount;
}
1401
/**
 * Truncates a string and pre-/appends a string.
 * Unit tested by Kasper
 *
 * A positive length keeps the first $len characters and appends the crop signifier,
 * a negative length keeps the last abs($len) characters and prepends it.
 * The string is returned unchanged if it is not longer than the requested length.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Length (in characters)
 * @param	string		Crop signifier
 * @return	string		The shortened string
 * @see substr(), mb_strimwidth()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function crop($charset,$string,$len,$crop='') {
	if (intval($len) == 0) return $string;

		// Translate the character position into a byte position:
	if ($charset == 'utf-8') {
		$i = $this->utf8_char2byte_pos($string,$len);
	} elseif (!empty($this->eucBasedSets[$charset])) {
		$i = $this->euc_char2byte_pos($string,$len,$charset);
	} else {
		if ($len > 0) {
			$i = $len;
		} else {
			$i = strlen($string)+$len;
			if ($i<=0) $i = false;
		}
	}

	if ($i === false) {	// $len outside actual string length
		return $string;
	}

		// Cropping only takes place if there really is a byte on the other side of the cut.
		// The check uses explicit bounds: a negative string offset would address the END of the string in PHP 7.1+.
	$byteLen = strlen($string);
	if ($len > 0) {
		if ($i >= 0 && $i < $byteLen) {
			return substr($string,0,$i).$crop;
		}
	} else {
		if ($i-1 >= 0 && $i-1 < $byteLen) {
			return $crop.substr($string,$i);
		}
	}

	return $string;
}
1456
/**
 * Cuts a string short at a given byte length.
 * The cut never splits a multi byte character; the result may therefore be shorter than $len bytes.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		The byte length
 * @return	string		The shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strtrunc($charset,$string,$len) {
	if ($len <= 0) return '';

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strcut($string,0,$len,$charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_strtrunc($string,$len);
	} elseif (!empty($this->eucBasedSets[$charset])) {
			// euc_strtrunc() expects the byte length as second and the charset as third argument
		return $this->euc_strtrunc($string,$len,$charset);
	} elseif (!empty($this->twoByteSets[$charset])) {
		if ($len % 2) $len--;	// don't cut at odd positions
	} elseif (!empty($this->fourByteSets[$charset])) {
		$len -= $len % 4;	// realign to position dividable by four
	}

		// Treat everything else as single-byte encoding
	return substr($string,0,$len);
}
1485
/**
 * Translates all characters of a string into their respective case values.
 * Unlike strtolower() and strtoupper() this method is locale independent.
 * Note that the string length may change!
 * eg. lower case German ß (sharp S) becomes upper case "SS"
 * Unit-tested by Kasper
 * Real case folding is language dependent, this method ignores this fact.
 *
 * @param	string		Character set of string
 * @param	string		Input string to convert case for
 * @param	string		Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
 * @return	string		The converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper()
 */
function conv_case($charset,$string,$case) {
		// mb_strtolower()/mb_strtoupper() were introduced with PHP 4.3; testing for the function
		// itself covers both the version requirement and a missing mbstring extension
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring' && function_exists('mb_strtolower')) {
			// The string is in $charset, so that (and not a fixed 'utf-8') has to be passed on
		if ($case == 'toLower') {
			return mb_strtolower($string,$charset);
		} else {
			return mb_strtoupper($string,$charset);
		}
	} elseif ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string,'case',$case);
	} elseif (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string,$charset,'case',$case);
	}

		// Treat everything else as single-byte encoding
	return $this->sb_char_mapping($string,$charset,'case',$case);
}
1519
/**
 * Transliterates special characters (like æ, ø, å, umlauts etc) to their ASCII equivalents (often two characters, like æ => ae etc.)
 * The work is delegated to the character mapping function that fits the charset.
 *
 * @param	string		Character set of string
 * @param	string		Input string to convert
 * @return	string		The converted string
 */
function specCharsToASCII($charset,$string) {
	if ($charset != 'utf-8') {
			// EUC based charsets have their own mapper, all remaining charsets are treated as single-byte encodings
		return isset($this->eucBasedSets[$charset])
			? $this->euc_char_mapping($string,$charset,'ascii')
			: $this->sb_char_mapping($string,$charset,'ascii');
	}

	return $this->utf8_char_mapping($string,'ascii');
}
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551 /********************************************
1552 *
1553 * Internal string operation functions
1554 *
1555 ********************************************/
1556
/**
 * Maps all characters of a string in a single byte charset.
 * Bytes without an entry in the selected mapping table are copied unchanged.
 *
 * @param	string		the string
 * @param	string		the charset
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function sb_char_mapping($str,$charset,$mode,$opt='') {
	switch($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) return $str;	// do nothing
			$map =& $this->caseFolding[$charset][$opt];
		break;

		case 'ascii':
			if (!$this->initToASCII($charset)) return $str;	// do nothing
			$map =& $this->toASCII[$charset];
		break;

		default:
			return $str;
	}

	$str = (string)$str;
	$out = '';
		// Walk the string byte by byte; the length is known in advance so no out-of-range offset is ever read
	$byteLen = strlen($str);
	for ($i=0; $i<$byteLen; $i++) {
		$c = $str[$i];
		$out .= isset($map[$c]) ? $map[$c] : $c;
	}

	return $out;
}
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605 /********************************************
1606 *
1607 * Internal UTF-8 string operation functions
1608 *
1609 ********************************************/
1610
/**
 * Returns a part of a UTF-8 string.
 * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
 *
 * @param	string		UTF-8 string
 * @param	integer		Start position (character position)
 * @param	integer		Length (in characters). If omitted (NULL) the rest of the string is returned.
 * @return	string		The substring; FALSE if a positive $start lies outside the string
 * @see substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_substr($str,$start,$len=null) {
		// A length of zero (integer or string) always yields an empty string:
	if ($len !== null && !strcmp($len,'0')) return '';

	$byte_start = $this->utf8_char2byte_pos($str,$start);
	if ($byte_start === false) {
		if ($start > 0) {
			return false;	// $start outside string length
		}
			// A negative (or zero) start reaching beyond the beginning means: start at the first byte
		$byte_start = 0;
	}

	$str = substr($str,$byte_start);

	if ($len!=null) {
		$byte_end = $this->utf8_char2byte_pos($str,$len);
		if ($byte_end === false) {	// $len outside actual string length
			return $len<0 ? '' : $str;	// When length is less than zero and exceeds, then we return blank string.
		}
		return substr($str,0,$byte_end);
	}

	return $str;
}
1645
/**
 * Counts the number of characters of a string in UTF-8.
 * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
 *
 * @param	string		UTF-8 multibyte character string
 * @return	integer		The number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strlen($str) {
	$str = (string)$str;
	$n = 0;
	$byteLen = strlen($str);
	for ($i=0; $i<$byteLen; $i++) {
			// Every byte except a continuation byte (10xxxxxx) starts a character:
			// either a single-byte char (0xxxxxxx) or a multi-byte starting byte (11xxxxxx)
		if ((ord($str[$i]) & 0xC0) != 0x80) {
			$n++;
		}
	}
	return $n;
}
1666
/**
 * Truncates a string in UTF-8 short at a given byte length.
 * A multi byte character that does not fit completely into the length is dropped as a whole.
 *
 * @param	string		UTF-8 multibyte character string
 * @param	integer		the byte length
 * @return	string		the shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strtrunc($str,$len) {
	$str = (string)$str;

		// Nothing to inspect if the cut lies outside of the string:
	if ($len <= 0 || $len >= strlen($str)) {
		return substr($str,0,$len);
	}

	$i = $len-1;
	if (ord($str[$i]) & 0x80) {	// part of a multibyte sequence
			// Walk back over continuation bytes (10xxxxxx) to find the first byte of the character
		while ($i>0 && (ord($str[$i]) & 0xC0) == 0x80) {
			$i--;
		}

		$mbs = ord($str[$i]);
		if (($mbs & 0xC0) == 0xC0) {
				// The number of leading 1-bits of the first byte is the number of bytes of the character
			for ($bc=0; $mbs & 0x80; $mbs = $mbs << 1) $bc++;
				// Drop the character if it reaches beyond the cut (this also works for a character starting at byte 0)
			if ($bc+$i > $len) return substr($str,0,$i);
		}
			// fallthru: multibyte char fits into length
	}
	return substr($str,0,$len);
}
1687
/**
 * Find position of first occurrence of a string, both arguments are in UTF-8.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 string to search for
 * @param	integer		Position to start the search (character position)
 * @return	integer		The character position; FALSE if the needle was not found or the offset lies beyond the string
 * @see strpos()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strpos($haystack,$needle,$offset=0) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
			// The third argument of mb_strpos() is the offset, the encoding comes fourth
		return mb_strpos($haystack,$needle,$offset,'utf-8');
	}

	$byte_offset = $this->utf8_char2byte_pos($haystack,$offset);
	if ($byte_offset === false) return false;	// offset beyond string length

	$byte_pos = strpos($haystack,$needle,$byte_offset);
	if ($byte_pos === false) return false;	// needle not found

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
1711
/**
 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 character to search for (single character)
 * @return	integer		The character position; FALSE if the needle was not found
 * @see strrpos()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strrpos($haystack,$needle) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
			// The position of the encoding argument of mb_strrpos() differs between PHP versions
			// (third argument in old versions, fourth behind the offset in newer ones).
			// Setting the internal encoding temporarily works with all of them.
		$enc = mb_internal_encoding();	// save internal encoding
		mb_internal_encoding('utf-8');
		$pos = mb_strrpos($haystack,$needle);
		mb_internal_encoding($enc);	// restore internal encoding

		return $pos;
	}

	$byte_pos = strrpos($haystack,$needle);
	if ($byte_pos === false) return false;	// needle not found

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
1731
/**
 * Translates a character position into an 'absolute' byte position.
 * Unit tested by Kasper.
 *
 * For a negative position the characters are counted from the end of the string.
 *
 * @param	string		UTF-8 string
 * @param	integer		Character position (negative values start from the end)
 * @return	integer		Byte position; FALSE if the position lies outside of the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char2byte_pos($str,$pos) {
	$str = (string)$str;
	$byteLen = strlen($str);
	$n = 0;			// number of characters found
	$p = abs($pos);		// number of characters wanted

	if ($pos >= 0) {
		$i = 0;
		$d = 1;
	} else {
		$i = $byteLen-1;
		$d = -1;
	}

		// Explicit bounds are used in both directions: a negative string offset
		// would address the END of the string in PHP 7.1+ instead of being "no byte"
	for ( ; $i>=0 && $i<$byteLen && $n<$p; $i+=$d) {
			// Count every byte that starts a character, ie. single-byte (0xxxxxxx) and multi-byte starting bytes (11xxxxxx)
		if ((ord($str[$i]) & 0xC0) != 0x80) {
			$n++;
		}
	}
	if ($i<0 || $i>=$byteLen) return false;	// offset beyond string length

	if ($pos >= 0) {
			// skip trailing multi-byte data bytes
		while ($i<$byteLen && (ord($str[$i]) & 0xC0) == 0x80) {
			$i++;
		}
	} else {
			// correct offset
		$i++;
	}

	return $i;
}
1772
/**
 * Translates an 'absolute' byte position into a character position.
 * Unit tested by Kasper.
 *
 * @param	string		UTF-8 string
 * @param	integer		byte position
 * @return	integer		character position; FALSE if the byte position lies outside of the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_byte2char_pos($str,$pos) {
	$str = (string)$str;
	if ($pos < 0 || $pos >= strlen($str)) return false;	// offset beyond string length

	$n = 0;	// number of characters
		// Count the characters that start between byte 1 and the given byte; byte 0 belongs to character 0
	for ($i=$pos; $i>0; $i--) {
			// single-byte (0xxxxxxx) and multi-byte starting bytes (11xxxxxx) start a character
		if ((ord($str[$i]) & 0xC0) != 0x80) {
			$n++;
		}
	}

	return $n;
}
1795
/**
 * Maps all characters of an UTF-8 string.
 * Characters without an entry in the selected mapping table are copied unchanged.
 *
 * @param	string		UTF-8 string
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char_mapping($str,$mode,$opt='') {
	if (!$this->initUnicodeData($mode)) return $str;	// do nothing

	switch($mode) {
		case 'case':
			$map =& $this->caseFolding['utf-8'][$opt];
		break;

		case 'ascii':
			$map =& $this->toASCII['utf-8'];
		break;

		default:
			return $str;
	}

	$str = (string)$str;
	$out = '';
	$byteLen = strlen($str);
	for ($i=0; $i<$byteLen; $i++) {
		$c = ord($str[$i]);
		if (($c & 0xC0) == 0xC0) {	// multi-byte starting byte (11xxxxxx)
			for ($bc=0; $c & 0x80; $c = $c << 1) { $bc++; }	// calculate number of bytes
			$mbc = substr($str,$i,$bc);
			$i += $bc-1;
		} else {
				// single-byte (0xxxxxxx); a stray continuation byte (10xxxxxx) is passed through as it is
				// instead of repeating the previously processed character
			$mbc = $str[$i];
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859 /********************************************
1860 *
1861 * Internal EUC string operation functions
1862 *
1863 * Extended Unix Code:
1864 * ASCII compatible 7bit single bytes chars
1865 * 8bit two byte chars
1866 *
1867 * Shift-JIS is treated as a special case.
1868 *
1869 ********************************************/
1870
/**
 * Cuts a string in the EUC charset family short at a given byte length.
 * A double-byte character that would be split by the cut is dropped as a whole.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		the byte length
 * @param	string		the charset
 * @return	string		the shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strtrunc($str,$len,$charset) {
	$str = (string)$str;
	$byteLen = strlen($str);
	if ($byteLen <= $len) return $str;	// string not longer than supplied length

	$sjis = ($charset == 'shift_jis');
	for ($i=0; $i<$byteLen && $i<$len; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++;	// advance a double-byte char
		} else {
			if ($c >= 0x80) $i++;	// advance a double-byte char
		}
	}

		// If the loop stepped over the cut, the byte before the cut is the first byte of a double-byte char
	if ($i>$len) {
		return substr($str,0,$len-1);
	}
	return substr($str,0,$len);
}
1899
/**
 * Returns a part of a string in the EUC charset family.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		start position (character position)
 * @param	string		the charset
 * @param	integer		length (in characters). If omitted (NULL) the rest of the string is returned.
 * @return	string		the substring; FALSE if $start lies outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_substr($str,$start,$charset,$len=null) {
		// A length of zero (integer or string) always yields an empty string, like in utf8_substr().
		// Without this check it would be taken for an omitted length by the comparison with NULL below.
	if ($len !== null && !strcmp($len,'0')) return '';

	$byte_start = $this->euc_char2byte_pos($str,$start,$charset);
	if ($byte_start === false) return false;	// $start outside string length

	$str = substr($str,$byte_start);

	if ($len!=null) {
		$byte_end = $this->euc_char2byte_pos($str,$len,$charset);
		if ($byte_end === false) {	// $len outside actual string length
			return $str;
		}
		return substr($str,0,$byte_end);
	}

	return $str;
}
1925
/**
 * Counts the number of characters of a string in the EUC charset family.
 *
 * For 'shift_jis' a byte in 0x80-0x9F or >= 0xE0 is taken as the first byte of
 * a double-byte character; for every other charset any byte >= 0x80 is.
 * All remaining bytes count as one character each.
 *
 * @param	string		EUC multibyte character string
 * @param	string		the charset
 * @return	integer		the number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strlen($str,$charset) {
	$sjis = ($charset == 'shift_jis');
	$strLen = strlen($str);	// byte length, used as explicit loop bound
	$n = 0;

	for ($i=0; $i<$strLen; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++;	// advance a double-byte char
		}
		else {
			if ($c >= 0x80) $i++;	// advance a double-byte char
		}

		$n++;
	}

	return $n;
}
1952
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * The string is always scanned from its beginning, because character
 * boundaries cannot be detected reliably when reading backwards (the second
 * byte of a Shift-JIS character may lie in the ASCII range). For a negative
 * $pos the start offsets of all characters are collected and the wanted one
 * is picked counting from the end.
 *
 * For 'shift_jis' a byte in 0x80-0x9F or >= 0xE0 is taken as the first byte of
 * a double-byte character; for every other charset any byte >= 0x80 is.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		character position (negative values start from the end)
 * @param	string		the charset
 * @return	integer		byte position, or false if the position is at or beyond the end of the string (or, for negative values, before its start)
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char2byte_pos($str,$pos,$charset) {
	$sjis = ($charset == 'shift_jis');
	$strLen = strlen($str);
	$fromEnd = ($pos < 0);
	$starts = array();	// byte offsets of all character starts (only collected for negative $pos)
	$n = 0;	// number of characters seen

	for ($i=0; $i<$strLen; $i++) {
		if ($fromEnd) {
			$starts[] = $i;
		} elseif ($n >= $pos) {
			return $i;	// reached the wanted character
		}

		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++;	// advance a double-byte char
		}
		else {
			if ($c >= 0x80) $i++;	// advance a double-byte char
		}

		$n++;
	}

	if (!$fromEnd) return false;	// offset beyond string length

		// $n now holds the total number of characters; -1 addresses the last one, -$n the first
	$idx = $n + $pos;
	if ($idx < 0) return false;	// offset before start of string

	return $starts[$idx];
}
1992
/**
 * Maps all characters of a string in the EUC charset family.
 *
 * Every character (single byte or double-byte) is looked up in a mapping
 * table; characters without an entry are copied unchanged. The table is
 * $this->caseFolding[$charset][$opt] for mode 'case' and
 * $this->toASCII[$charset] for mode 'ascii'. If the corresponding init
 * method (initCaseFolding() / initToASCII()) reports failure, or the mode is
 * unknown, the input string is returned untouched.
 *
 * For 'shift_jis' a byte in 0x80-0x9F or >= 0xE0 is taken as the first byte of
 * a double-byte character; for every other charset any byte >= 0x80 is.
 *
 * @param	string		EUC multibyte character string
 * @param	string		the charset
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char_mapping($str,$charset,$mode,$opt='') {
	switch($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) return $str;	// do nothing
			$map =& $this->caseFolding[$charset][$opt];
			break;

		case 'ascii':
			if (!$this->initToASCII($charset)) return $str;	// do nothing
			$map =& $this->toASCII[$charset];
			break;

		default:
			return $str;
	}

	$sjis = ($charset == 'shift_jis');
	$strLen = strlen($str);	// byte length, used as explicit loop bound
	$out = '';

	for ($i=0; $i<$strLen; $i++) {
		$mbc = $str[$i];
		$c = ord($mbc);

		if ($sjis) {
			$isDoubleByte = (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0));
		}
		else {
			$isDoubleByte = ($c >= 0x80);
		}

		if ($isDoubleByte) {
				// take both bytes as the lookup key and skip the second byte
			$mbc = substr($str,$i,2);
			$i++;
		}

		if (isset($map[$mbc])) {
			$out .= $map[$mbc];
		} else {
			$out .= $mbc;
		}
	}

	return $out;
}
2047
2048 }
2049
// Include an extension class (XCLASS) for this file if one is registered in
// $TYPO3_CONF_VARS for the current TYPO3_MODE. !empty() is the same truthiness
// test as before but does not raise a notice when no XCLASS entry is configured.
if (defined('TYPO3_MODE') && !empty($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php'])) {
	include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
}
2053 ?>