See Changelog: Updates to Indexed Search (mainly), t3lib_cs (bug), t3lib_tcemain...
[Packages/TYPO3.CMS.git] / t3lib / class.t3lib_cs.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2003-2004 Kasper Skaarhoj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the Typo3 project. The Typo3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 *
17 * This script is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * This copyright notice MUST APPEAR in all copies of the script!
23 ***************************************************************/
24 /**
25 * Class for conversion between charsets.
26 *
27 * $Id$
28 *
29 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
30 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
31 */
32 /**
33 * [CLASS/FUNCTION INDEX of SCRIPT]
34 *
35 *
36 *
37 * 136: class t3lib_cs
38 * 502: function parse_charset($charset)
39 * 521: function get_locale_charset($locale)
40 *
41 * SECTION: Charset Conversion functions
42 * 574: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
43 * 614: function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0)
44 * 631: function utf8_encode($str,$charset)
45 * 678: function utf8_decode($str,$charset,$useEntityForNoChar=0)
46 * 721: function utf8_to_entities($str)
47 * 754: function entities_to_utf8($str,$alsoStdHtmlEnt=0)
48 * 788: function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
49 * 838: function UnumberToChar($cbyte)
50 * 883: function utf8CharToUnumber($str,$hex=0)
51 *
52 * SECTION: Init functions
53 * 926: function initCharset($charset)
54 * 988: function initUnicodeData($mode=null)
55 * 1213: function initCaseFolding($charset)
56 * 1275: function initToASCII($charset)
57 *
58 * SECTION: String operation functions
59 * 1346: function substr($charset,$string,$start,$len=null)
60 * 1384: function strlen($charset,$string)
61 * 1412: function crop($charset,$string,$len,$crop='')
62 * 1465: function strtrunc($charset,$string,$len)
63 * 1499: function conv_case($charset,$string,$case)
64 * 1525: function specCharsToASCII($charset,$string)
65 *
66 * SECTION: Internal string operation functions
67 * 1565: function sb_char_mapping($str,$charset,$mode,$opt='')
68 *
69 * SECTION: Internal UTF-8 string operation functions
70 * 1620: function utf8_substr($str,$start,$len=null)
71 * 1653: function utf8_strlen($str)
72 * 1674: function utf8_strtrunc($str,$len)
73 * 1696: function utf8_strpos($haystack,$needle,$offset=0)
74 * 1719: function utf8_strrpos($haystack,$needle)
75 * 1739: function utf8_char2byte_pos($str,$pos)
76 * 1780: function utf8_byte2char_pos($str,$pos)
77 * 1803: function utf8_char_mapping($str,$mode,$opt='')
78 *
79 * SECTION: Internal EUC string operation functions
80 * 1879: function euc_strtrunc($str,$len,$charset)
81 * 1908: function euc_substr($str,$start,$charset,$len=null)
82 * 1933: function euc_strlen($str,$charset)
83 * 1960: function euc_char2byte_pos($str,$pos,$charset)
84 * 2001: function euc_char_mapping($str,$charset,$mode,$opt='')
85 *
86 * TOTAL FUNCTIONS: 35
87 * (This index is automatically created/updated by the extension "extdeveval")
88 *
89 */
90
91
92
93
94
95
96
97
98 /**
99 * Notes on UTF-8
100 *
101 * Functions working on UTF-8 strings:
102 *
103 * - strchr/strstr
104 * - strrchr
105 * - substr_count
106 * - implode/explode/join
107 *
108 * Functions nearly working on UTF-8 strings:
109 *
110 * - strlen: returns the length in BYTES; if you need the length in CHARACTERS use utf8_strlen
111 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained 7-bit ASCII
112 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
113 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
114 *
115 * Functions NOT working on UTF-8 strings:
116 *
117 * - str*cmp
118 * - stristr
119 * - stripos
120 * - substr
121 * - strrev
122 * - ereg/eregi
123 * - split/spliti
124 * - preg_*
125 * - ...
126 *
127 */
128 /**
129 * Class for conversion between charsets
130 *
131 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
132 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
133 * @package TYPO3
134 * @subpackage t3lib
135 */
136 class t3lib_cs {
137 var $noCharByteVal=63; // ASCII Value for chars with no equivalent.
138
139 // This is the array where parsed conversion tables are stored (cached)
140 var $parsedCharsets=array();
141
142 // An array where case folding data will be stored (cached)
143 var $caseFolding=array();
144
145 // An array where charset-to-ASCII mappings are stored (cached)
146 var $toASCII=array();
147
148 // This tells the converter which charsets have two bytes per char:
149 var $twoByteSets=array(
150 'ucs-2'=>1, // 2-byte Unicode
151 );
152
153 // This tells the converter which charsets have four bytes per char:
154 var $fourByteSets=array(
155 'ucs-4'=>1, // 4-byte Unicode
156 'utf-32'=>1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
157 );
158
159 // This tells the converter which charsets use a scheme like the Extended Unix Code:
160 var $eucBasedSets=array(
161 'gb2312'=>1, // Chinese, simplified.
162 'big5'=>1, // Chinese, traditional.
163 'euc-kr'=>1, // Korean
164 'shift_jis'=>1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
165 );
166
167 // see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
168 // http://czyborra.com/charsets/iso8859.html
169 var $synonyms=array(
170 'us' => 'ascii',
171 'us-ascii'=> 'ascii',
172 'cp819' => 'iso-8859-1',
173 'ibm819' => 'iso-8859-1',
174 'iso-ir-100' => 'iso-8859-1',
175 'iso-ir-109' => 'iso-8859-2',
176 'iso-ir-148' => 'iso-8859-9',
177 'iso-ir-199' => 'iso-8859-14',
178 'iso-ir-203' => 'iso-8859-15',
179 'csisolatin1' => 'iso-8859-1',
180 'csisolatin2' => 'iso-8859-2',
181 'csisolatin3' => 'iso-8859-3',
182 'csisolatin5' => 'iso-8859-9',
183 'csisolatin8' => 'iso-8859-14',
184 'csisolatin9' => 'iso-8859-15',
185 'csisolatingreek' => 'iso-8859-7',
186 'iso-celtic' => 'iso-8859-14',
187 'latin1' => 'iso-8859-1',
188 'latin2' => 'iso-8859-2',
189 'latin3' => 'iso-8859-3',
190 'latin5' => 'iso-8859-9',
191 'latin6' => 'iso-8859-10',
192 'latin8' => 'iso-8859-14',
193 'latin9' => 'iso-8859-15',
194 'l1' => 'iso-8859-1',
195 'l2' => 'iso-8859-2',
196 'l3' => 'iso-8859-3',
197 'l5' => 'iso-8859-9',
198 'l6' => 'iso-8859-10',
199 'l8' => 'iso-8859-14',
200 'l9' => 'iso-8859-15',
201 'cyrillic' => 'iso-8859-5',
202 'arabic' => 'iso-8859-6',
203 'tis-620' => 'iso-8859-11',
204 'win874' => 'windows-874',
205 'win1250' => 'windows-1250',
206 'win1251' => 'windows-1251',
207 'win1252' => 'windows-1252',
208 'win1253' => 'windows-1253',
209 'win1254' => 'windows-1254',
210 'win1255' => 'windows-1255',
211 'win1256' => 'windows-1256',
212 'win1257' => 'windows-1257',
213 'win1258' => 'windows-1258',
214 'cp1250' => 'windows-1250',
215 'cp1251' => 'windows-1251',
216 'cp1252' => 'windows-1252',
217 'ms-ee' => 'windows-1250',
218 'ms-ansi' => 'windows-1252',
219 'ms-greek' => 'windows-1253',
220 'ms-turk' => 'windows-1254',
221 'winbaltrim' => 'windows-1257',
222 'koi-8ru' => 'koi-8r',
223 'koi8r' => 'koi-8r',
224 'cp878' => 'koi-8r',
225 'mac' => 'macroman',
226 'macintosh' => 'macroman',
227 'euc-cn' => 'gb2312',
228 'x-euc-cn' => 'gb2312',
229 'euccn' => 'gb2312',
230 'cp936' => 'gb2312',
231 'big-5' => 'big5',
232 'cp950' => 'big5',
233 'eucjp' => 'euc-jp',
234 'sjis' => 'shift_jis',
235 'shift-jis' => 'shift_jis',
236 'cp932' => 'shift_jis',
237 'cp949' => 'euc-kr',
238 'utf7' => 'utf-7',
239 'utf8' => 'utf-8',
240 'utf16' => 'utf-16',
241 'utf32' => 'utf-32',
242 'utf8' => 'utf-8',
243 'ucs2' => 'ucs-2',
244 'ucs4' => 'ucs-4',
245 );
246
247 // mapping of iso-639:2 language codes to language (family) names
248 var $lang_to_langfamily=array(
249 // iso-639:2 language codes, see:
250 // http://www.w3.org/WAI/ER/IG/ert/iso639.htm
251 // http://www.unicode.org/onlinedat/languages.html
252 'ar' => 'arabic',
253 'bg' => 'cyrillic',
254 'cs' => 'east_european',
255 'da' => 'west_european',
256 'de' => 'west_european',
257 'es' => 'west_european',
258 'et' => 'estonian',
259 'eu' => 'west_european',
260 'fi' => 'west_european',
261 'fr' => 'west_european',
262 'gr' => 'greek',
263 'hr' => 'east_european',
264 'hu' => 'east_european',
265 'iw' => 'hebrew',
266 'is' => 'west_european',
267 'it' => 'west_european',
268 'ja' => 'japanese',
269 'kl' => 'west_european',
270 'ko' => 'korean',
271 'lt' => 'lithuanian',
272 'lv' => 'west_european', // Latvian/Lettish
273 'nl' => 'west_european',
274 'no' => 'west_european',
275 'pl' => 'east_european',
276 'pt' => 'west_european',
277 'ro' => 'east_european',
278 'ru' => 'cyrillic',
279 'sk' => 'east_european',
280 'sl' => 'east_european',
281 'sv' => 'west_european',
282 'th' => 'thai',
283 'uk' => 'cyrillic',
284 'vi' => 'vietnamese',
285 'zh' => 'chinese',
286 // MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
287 'chs' => 'simpl_chinese',
288 'cht' => 'trad_chinese',
289 'csy' => 'east_european',
290 'dan' => 'west_european',
291 'deu' => 'west_european',
292 'dea' => 'west_european',
293 'des' => 'west_european',
294 'ena' => 'west_european',
295 'enc' => 'west_european',
296 'eng' => 'west_european',
297 'enz' => 'west_european',
298 'enu' => 'west_european',
299 'nld' => 'west_european',
300 'nlb' => 'west_european',
301 'fin' => 'west_european',
302 'fra' => 'west_european',
303 'frb' => 'west_european',
304 'frc' => 'west_european',
305 'frs' => 'west_european',
306 'ell' => 'greek',
307 'hun' => 'east_european',
308 'isl' => 'west_euorpean',
309 'ita' => 'west_european',
310 'its' => 'west_european',
311 'jpn' => 'japanese',
312 'kor' => 'korean',
313 'nor' => 'west_european',
314 'non' => 'west_european',
315 'plk' => 'east_european',
316 'ptg' => 'west_european',
317 'ptb' => 'west_european',
318 'rus' => 'east_european',
319 'sky' => 'east_european',
320 'esp' => 'west_european',
321 'esm' => 'west_european',
322 'esn' => 'west_european',
323 'sve' => 'west_european',
324 'trk' => 'turkish',
325 // English language names
326 'bulgarian' => 'east_european',
327 'catalan' => 'west_european',
328 'croatian' => 'east_european',
329 'czech' => 'east_european',
330 'danish' => 'west_european',
331 'dutch' => 'west_european',
332 'english' => 'west_european',
333 'finnish' => 'west_european',
334 'french' => 'west_european',
335 'galician' => 'west_european',
336 'german' => 'west_european',
337 'hungarian' => 'east_european',
338 'icelandic' => 'west_european',
339 'italian' => 'west_european',
340 'latvian' => 'west_european',
341 'lettish' => 'west_european',
342 'norwegian' => 'west_european',
343 'polish' => 'east_european',
344 'portuguese' => 'west_european',
345 'russian' => 'cyrillic',
346 'romanian' => 'east_european',
347 'slovak' => 'east_european',
348 'slovenian' => 'east_european',
349 'spanish' => 'west_european',
350 'svedish' => 'west_european',
351 'turkish' => 'east_european',
352 'ukrainian' => 'cyrillic',
353 );
354
355 // mapping of language (family) names to charsets on Unix
356 var $lang_to_charset_unix=array(
357 'west_european' => 'iso-8859-1',
358 'estonian' => 'iso-8859-1',
359 'east_european' => 'iso-8859-2',
360 'baltic' => 'iso-8859-4',
361 'cyrillic' => 'iso-8859-5',
362 'arabic' => 'iso-8859-6',
363 'greek' => 'iso-8859-7',
364 'hebrew' => 'iso-8859-8',
365 'turkish' => 'iso-8859-9',
366 'thai' => 'iso-8859-11', // = TIS-620
367 'lithuanian' => 'iso-8859-13',
368 'chinese' => 'gb2312', // = euc-cn
369 'japanese' => 'euc-jp',
370 'korean' => 'euc-kr',
371 'simpl_chinese' => 'gb2312',
372 'trad_chinese' => 'big5',
373 'vietnamese' => '',
374 );
375
376 // mapping of language (family) names to charsets on Windows
377 var $lang_to_charset_windows=array(
378 'east_european' => 'windows-1250',
379 'cyrillic' => 'windows-1251',
380 'west_european' => 'windows-1252',
381 'greek' => 'windows-1253',
382 'turkish' => 'windows-1254',
383 'hebrew' => 'windows-1255',
384 'arabic' => 'windows-1256',
385 'baltic' => 'windows-1257',
386 'estonian' => 'windows-1257',
387 'lithuanian' => 'windows-1257',
388 'vietnamese' => 'windows-1258',
389 'thai' => 'cp874',
390 'korean' => 'cp949',
391 'chinese' => 'gb2312',
392 'japanese' => 'shift_jis',
393 'simpl_chinese' => 'gb2312',
394 'trad_chinese' => 'big5',
395 );
396
397 // mapping of locale names to charsets
398 var $locale_to_charset=array(
399 'japanese.euc' => 'euc-jp',
400 'ja_jp.ujis' => 'euc-jp',
401 'korean.euc' => 'euc-kr',
402 'zh_cn' => 'gb2312',
403 'zh_hk' => 'big5',
404 'zh_tw' => 'big5',
405 );
406
407 // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
408 // An empty value means "iso-8859-1"
409 var $charSetArray = array(
410 'dk' => '',
411 'de' => '',
412 'no' => '',
413 'it' => '',
414 'fr' => '',
415 'es' => '',
416 'nl' => '',
417 'cz' => 'windows-1250',
418 'pl' => 'iso-8859-2',
419 'si' => 'windows-1250',
420 'fi' => '',
421 'tr' => 'iso-8859-9',
422 'se' => '',
423 'pt' => '',
424 'ru' => 'windows-1251',
425 'ro' => 'iso-8859-2',
426 'ch' => 'gb2312',
427 'sk' => 'windows-1250',
428 'lt' => 'windows-1257',
429 'is' => 'utf-8',
430 'hr' => 'windows-1250',
431 'hu' => 'iso-8859-2',
432 'gl' => '',
433 'th' => 'iso-8859-11',
434 'gr' => 'iso-8859-7',
435 'hk' => 'big5',
436 'eu' => '',
437 'bg' => 'windows-1251',
438 'br' => '',
439 'et' => 'iso-8859-4',
440 'ar' => 'iso-8859-6',
441 'he' => 'utf-8',
442 'ua' => 'windows-1251',
443 'jp' => 'shift_jis',
444 'lv' => 'utf-8',
445 'vn' => 'utf-8',
446 'ca' => 'iso-8859-15',
447 'ba' => 'iso-8859-2',
448 'kr' => 'euc-kr',
449 );
450
451 // TYPO3 specific: Array with the iso names used for each system language in TYPO3:
452 // An empty value means the ISO name is the same as the TYPO3 language key
453 var $isoArray = array(
454 'dk' => 'da',
455 'de' => '',
456 'no' => '',
457 'it' => '',
458 'fr' => '',
459 'es' => '',
460 'nl' => '',
461 'cz' => 'cs',
462 'pl' => '',
463 'si' => 'sl',
464 'fi' => '',
465 'tr' => '',
466 'se' => 'sv',
467 'pt' => '',
468 'ru' => '',
469 'ro' => '',
470 'ch' => 'zh_CN',
471 'sk' => '',
472 'lt' => '',
473 'is' => '',
474 'hr' => '',
475 'hu' => '',
476 'gl' => '', // Greenlandic
477 'th' => '',
478 'gr' => 'el',
479 'hk' => 'zh_HK',
480 'eu' => '',
481 'bg' => '',
482 'br' => 'pt_BR',
483 'et' => '',
484 'ar' => '',
485 'he' => 'iw',
486 'ua' => 'uk',
487 'jp' => 'ja',
488 'lv' => '',
489 'vn' => 'vi',
490 'ca' => '',
491 'ba' => '', // Bosnian
492 'kr' => '',
493 );
494
495 /**
496 * Normalize - changes input character set to lowercase letters.
497 *
498 * @param string Input charset
499 * @return string Normalized charset
500 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
501 */
function parse_charset($charset) {
		// Charset names are compared in lowercase only.
	$normalized = strtolower($charset);

		// Replace a known alias by its canonical name; anything else passes through unchanged.
	return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
}
508
509 /**
510 * Get the charset of a locale.
511 *
512 * ln language
513 * ln_CN language / country
514 * ln_CN.cs language / country / charset
515 * ln_CN.cs@mod language / country / charset / modifier
516 *
517 * @param string Locale string
518 * @return string Charset resolved for locale string
519 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
520 */
function get_locale_charset($locale) {
	$locale = strtolower($locale);

		// An exact match in the locale table wins over everything else.
	if (isset($this->locale_to_charset[$locale])) {
		return $this->locale_to_charset[$locale];
	}

		// Split off the "@modifier" part. The modifier is optional, so the
		// second element must not be assumed to exist.
	$pieces = explode('@',$locale);
	$locale = $pieces[0];
	$modifier = isset($pieces[1]) ? $pieces[1] : '';

		// Split off the ".charset" part; an explicit charset is used as given (after alias resolution).
	$pieces = explode('.',$locale);
	$locale = $pieces[0];
	$charset = isset($pieces[1]) ? $pieces[1] : '';
	if ($charset) return $this->parse_charset($charset);

		// Modifier 'euro' is checked after the charset, because of xx.utf-8@euro
	if ($modifier == 'euro') return 'iso-8859-15';

		// The language is whatever precedes the optional "_COUNTRY" suffix.
	$pieces = explode('_',$locale);
	$language = $pieces[0];
	if (isset($this->lang_to_langfamily[$language])) {
		$language = $this->lang_to_langfamily[$language];
	}

		// Pick the family-to-charset table for the platform; unknown families yield no charset.
	if (TYPO3_OS == 'WIN') {
		$cs = isset($this->lang_to_charset_windows[$language]) ? $this->lang_to_charset_windows[$language] : '';
	} else {
		$cs = isset($this->lang_to_charset_unix[$language]) ? $this->lang_to_charset_unix[$language] : '';
	}

		// Fall back to Latin-1 when nothing (or an empty entry) was found.
	return $cs ? $cs : 'iso-8859-1';
}
549
550
551
552
553
554
555
556
557
558 /********************************************
559 *
560 * Charset Conversion functions
561 *
562 ********************************************/
563
564 /**
565 * Convert from one charset to another charset.
566 *
567 * @param string Input string
568 * @param string From charset (the current charset of the string)
569 * @param string To charset (the output charset wanted)
570 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
571 * @return string Converted string
572 * @see convArray()
573 */
function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) {
		// Nothing to do when source and target charset are the same.
	if ($fromCS==$toCS) return $str;

		// The PHP libraries cannot fall back to SGML entities, so they are only
		// tried when entities are not requested or when the target is UTF-8
		// (which can represent every character anyway).
	if ($toCS=='utf-8' || !$useEntityForNoChar) {
		$method = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'];
		$converted = false;

		if ($method == 'mbstring') {
			$converted = mb_convert_encoding($str,$toCS,$fromCS);	// false for unsupported charsets
		} elseif ($method == 'iconv') {
			$converted = iconv($fromCS,$toCS.'//TRANSLIT',$str);
		} elseif ($method == 'recode') {
			$converted = recode_string($fromCS.'..'.$toCS,$str);
		}

			// A library result is final; on failure the built-in conversion below takes over.
		if (false !== $converted) return $converted;
	}

		// Built-in conversion: UTF-8 is the pivot format between any two charsets.
	if ($fromCS!='utf-8') {
		$str = $this->utf8_encode($str,$fromCS);
	}
	if ($toCS!='utf-8') {
		$str = $this->utf8_decode($str,$toCS,$useEntityForNoChar);
	}
	return $str;
}
602
603 /**
604 * Convert all elements in ARRAY from one charset to another charset.
605 * NOTICE: Array is passed by reference!
606 *
607 * @param string Input array, possibly multidimensional
608 * @param string From charset (the current charset of the string)
609 * @param string To charset (the output charset wanted)
610 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
611 * @return void
612 * @see conv()
613 */
function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0) {
		// Only the keys are needed from the iteration; values are read and
		// written through $array itself so the caller's array is modified.
	foreach ($array as $key => $unused) {
		if (!is_array($array[$key])) {
				// Leaf value: convert in place.
			$array[$key] = $this->conv($array[$key],$fromCS,$toCS,$useEntityForNoChar);
			continue;
		}

			// Nested array: descend, again by reference.
		$this->convArray($array[$key],$fromCS,$toCS,$useEntityForNoChar);
	}
}
623
624 /**
625 * Converts $str from $charset to UTF-8
626 *
627 * @param string String in local charset to convert to UTF-8
628 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
629 * @return string Output string, converted to UTF-8
630 */
function utf8_encode($str,$charset) {

		// Parse the conversion table if not already done. When the charset
		// cannot be initialised nothing is returned (NULL), as before.
	if ($this->initCharset($charset)) {
		$strLen = strlen($str);
		$outStr = '';
		$isTwoByteSet = isset($this->twoByteSets[$charset]);
		$isEucBasedSet = isset($this->eucBasedSets[$charset]);

		for ($a=0;$a<$strLen;$a++) {	// Traverse each byte in the string.
			$chr = substr($str,$a,1);
			$ord = ord($chr);

			if ($isTwoByteSet) {	// Every character occupies two bytes, big endian assumed.
				$a++;
				$ord2 = ord(substr($str,$a,1));	// yields 0 if the string ends in the middle of a character
				$ord = ($ord << 8) | $ord2;	// combine high and low byte (the former '&' masked the value instead)

				if ($ord < 128) {	// The conversion tables only hold values above 127; ASCII is transparent.
					$outStr.= chr($ord);
				} elseif (isset($this->parsedCharsets[$charset]['local'][$ord])) {
					$outStr.= $this->parsedCharsets[$charset]['local'][$ord];
				} else {
					$outStr.= chr($this->noCharByteVal);	// No char exists
				}
			} elseif ($ord>127) {	// Above 127: needs the conversion table (and possibly a second byte)
				if ($isEucBasedSet) {
						// EUC-like sets use two bytes above 127. Shift-JIS is listed in
						// $eucBasedSets as well, but its bytes 160-223 (half-width
						// katakana) are single-byte characters and must not consume
						// the following byte.
					if ($charset != 'shift_jis' || $ord<160 || $ord>223) {
						$a++;
						$ord2 = ord(substr($str,$a,1));
						$ord = $ord*256+$ord2;	// make $ord a 16 bit integer
					}
				}

				if (isset($this->parsedCharsets[$charset]['local'][$ord])) {	// Known local char number: use its UTF-8 sequence
					$outStr.= $this->parsedCharsets[$charset]['local'][$ord];
				} else {
					$outStr.= chr($this->noCharByteVal);	// No char exists
				}
			} else {
				$outStr.= $chr;	// ASCII 0-127, one byte. Transparent
			}
		}
		return $outStr;
	}
}
669
670 /**
671 * Converts $str from UTF-8 to $charset
672 *
673 * @param string String in UTF-8 to convert to local charset
674 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
675 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
676 * @return string Output string, converted to local charset
677 */
function utf8_decode($str,$charset,$useEntityForNoChar=0) {

		// Without a parsed conversion table nothing can be converted; nothing (NULL) is returned then.
	if (!$this->initCharset($charset)) return;

	$byteCount = strlen($str);
	$result = '';

	for ($pos=0;$pos<$byteCount;$pos++) {	// Walk through the UTF-8 string byte by byte.
		$byte = substr($str,$pos,1);
		$code = ord($byte);

			// ASCII 0-127 is a single byte and identical in every supported charset.
		if ($code<=127) {
			$result.= $byte;
			continue;
		}

			// A lead byte has bit 7 (value 64) set. If it is not set we are in
			// the MIDDLE of a multibyte sequence: no char exists.
		if (!($code & 64)) {
			$result.= chr($this->noCharByteVal);
			continue;
		}

			// Collect the sequence: every further high bit set in the lead byte
			// announces one more byte.
		$sequence = $byte;
		for ($n=0;$n<8;$n++) {
			$code = $code << 1;
			if (!($code & 128)) break;
			$pos++;
			$sequence.= substr($str,$pos,1);
		}

		if (isset($this->parsedCharsets[$charset]['utf8'][$sequence])) {
			$local = $this->parsedCharsets[$charset]['utf8'][$sequence];	// The local char number
				// Numbers above 255 are written as two bytes (16 bit word assumed).
			$result.= ($local>255) ? chr(($local >> 8) & 255).chr($local & 255) : chr($local);
		} elseif ($useEntityForNoChar) {
			$result.= '&#'.$this->utf8CharToUnumber($sequence,1).';';	// Numeric entity instead
		} else {
			$result.= chr($this->noCharByteVal);	// No char exists
		}
	}

	return $result;
}
714
715 /**
716 * Converts all chars > 127 to numeric entities.
717 *
718 * @param string Input string
719 * @return string Output string
720 */
function utf8_to_entities($str) {
	$byteCount = strlen($str);
	$result = '';
	$pos = 0;

	while ($pos<$byteCount) {	// Walk through the UTF-8 string byte by byte.
		$byte = substr($str,$pos,1);
		$code = ord($byte);

		if ($code<=127) {
				// ASCII 0-127: one byte, passed through unchanged.
			$result.= $byte;
		} elseif (!($code & 64)) {
				// Bit 7 (value 64) missing: this is the MIDDLE of a multibyte sequence, no char exists.
			$result.= chr($this->noCharByteVal);
		} else {
				// Lead byte: every further high bit set announces one more byte of the sequence.
			$sequence = $byte;
			for ($n=0;$n<8;$n++) {
				$code = $code << 1;
				if (!($code & 128)) break;
				$pos++;
				$sequence.= substr($str,$pos,1);
			}

				// The whole character becomes one hexadecimal numeric entity.
			$result.= '&#'.$this->utf8CharToUnumber($sequence,1).';';
		}
		$pos++;
	}

	return $result;
}
746
747 /**
748 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
749 *
750 * @param string Input string, UTF-8
751 * @param boolean If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
752 * @return string Output string
753 */
function entities_to_utf8($str,$alsoStdHtmlEnt=0) {
		// Table "entity => character", only needed for named entities. The
		// characters are requested in iso-8859-1 explicitly because they are
		// converted from iso-8859-1 to UTF-8 further down.
	$trans_tbl = array();
	if ($alsoStdHtmlEnt) {
		$trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES,ENT_COMPAT,'ISO-8859-1'));
	}

		// Split on "&...;" and keep the captured entity body: even indexes hold
		// plain text, odd indexes hold the entity body without "&" and ";".
	$parts = preg_split('/&([#[:alnum:]]*);/',$str,-1,PREG_SPLIT_DELIM_CAPTURE);

	foreach ($parts as $k => $v) {
		if (!($k%2)) continue;	// Plain text, leave untouched

		$match = array();
		if (preg_match('/^#[xX]([0-9a-fA-F]+)$/',$v,$match)) {	// Hexadecimal entity
			$parts[$k] = $this->UnumberToChar(hexdec($match[1]));
		} elseif (preg_match('/^#([0-9]+)$/',$v,$match)) {	// Decimal entity
			$parts[$k] = $this->UnumberToChar(intval($match[1]));
		} elseif ($alsoStdHtmlEnt && isset($trans_tbl['&'.$v.';'])) {	// Named entity
			$parts[$k] = $this->utf8_encode($trans_tbl['&'.$v.';'],'iso-8859-1');
		} else {	// Unknown or malformed (e.g. "&#zz;"): restore the original text
			$parts[$k] = '&'.$v.';';
		}
	}

	return implode('',$parts);
}
779
780 /**
781 * Converts all chars in the input UTF-8 string into integer numbers returned in an array
782 *
783 * @param string Input string, UTF-8
784 * @param boolean If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
785 * @param boolean If set, then instead of integer numbers the real UTF-8 char is returned.
786 * @return array Output array with the char numbers
787 */
function utf8_to_numberarray($str,$convEntities=0,$retChar=0) {
		// Entities are turned into real UTF-8 characters first, so they are counted as one character each.
	if ($convEntities) {
		$str = $this->entities_to_utf8($str,1);
	}

	$byteCount = strlen($str);
	$collected = array();
	$pos = 0;

	while ($pos<$byteCount) {	// Walk through the UTF-8 string byte by byte.
		$byte = substr($str,$pos,1);
		$code = ord($byte);

		if ($code<=127) {
				// ASCII 0-127: one byte.
			$collected[] = $retChar ? chr($code) : $code;
		} elseif (!($code & 64)) {
				// Bit 7 (value 64) missing: MIDDLE of a multibyte sequence, no char exists.
			$collected[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
		} else {
				// Lead byte: every further high bit set announces one more byte of the sequence.
			$sequence = $byte;
			for ($n=0;$n<8;$n++) {
				$code = $code << 1;
				if (!($code & 128)) break;
				$pos++;
				$sequence.= substr($str,$pos,1);
			}

			$collected[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
		}
		$pos++;
	}

	return $collected;
}
818
819 /**
820 * Converts a UNICODE number to a UTF-8 multibyte character
821 * Algorithm based on script found at From: http://czyborra.com/utf/
822 * Unit-tested by Kasper
823 *
824 * The binary representation of the character's integer value is thus simply spread across the bytes and the number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
825 *
826 * bytes | bits | representation
827 * 1 | 7 | 0vvvvvvv
828 * 2 | 11 | 110vvvvv 10vvvvvv
829 * 3 | 16 | 1110vvvv 10vvvvvv 10vvvvvv
830 * 4 | 21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
831 * 5 | 26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
832 * 6 | 31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
833 *
834 * @param integer UNICODE integer
835 * @return string UTF-8 multibyte character string
836 * @see utf8CharToUnumber()
837 */
function UnumberToChar($cbyte) {
		// 7 bit values are plain ASCII and need a single byte.
	if ($cbyte < 0x80) return chr($cbyte);

		// Select the lead byte marker and the number of trailing bytes from the magnitude of the value.
	if ($cbyte < 0x800) {
		$leadMarker = 0xC0;
		$trailing = 1;
	} elseif ($cbyte < 0x10000) {
		$leadMarker = 0xE0;
		$trailing = 2;
	} elseif ($cbyte < 0x200000) {
		$leadMarker = 0xF0;
		$trailing = 3;
	} elseif ($cbyte < 0x4000000) {
		$leadMarker = 0xF8;
		$trailing = 4;
	} elseif ($cbyte < 0x80000000) {
		$leadMarker = 0xFC;
		$trailing = 5;
	} else {
			// Cannot express a 32-bit character in UTF-8
		return chr($this->noCharByteVal);
	}

		// Lead byte: marker plus the highest bits of the value ...
	$char = chr($leadMarker | ($cbyte >> (6*$trailing)));

		// ... followed by the remaining bits in groups of six, each prefixed with "10".
	for ($shift=6*($trailing-1);$shift>=0;$shift-=6) {
		$char.= chr(0x80 | (($cbyte >> $shift) & 0x3F));
	}

	return $char;
}
873
874 /**
875 * Converts a UTF-8 Multibyte character to a UNICODE number
876 * Unit-tested by Kasper
877 *
878 * @param string UTF-8 multibyte character string
879 * @param boolean If set, then a hex. number is returned.
880 * @return integer UNICODE integer
881 * @see UnumberToChar()
882 */
function utf8CharToUnumber($str,$hex=0) {
	$lead = ord(substr($str,0,1));	// First byte

	if (($lead & 192) == 192) {	// Both top bits set: this IS the lead byte of a multibyte sequence
			// Count the bytes that follow: one for every further high bit set in the lead byte.
		$shifted = $lead;
		for ($trailing=0;$trailing<8;$trailing++) {
			$shifted = $shifted << 1;
			if (!($shifted & 128)) break;
		}

		if ($trailing>5) {
				// 0xFE and 0xFF never start a sequence (at most five bytes may
				// follow), so report the "no character" value instead of bits
				// taken from a meaningless layout.
			$int = $this->noCharByteVal;
		} else {
				// The lead byte contributes its lowest (6 - $trailing) bits ...
			$int = $lead & ((1 << (6-$trailing)) - 1);
				// ... and every following byte its lowest six bits. Bytes missing
				// at the end of a truncated string count as zero bits.
			for ($i=1;$i<=$trailing;$i++) {
				$int = ($int << 6) | (ord(substr($str,$i,1)) & 0x3F);
			}
		}
	} else {
		$int = $lead;	// Single byte: its value is the number
	}

		// Hex output carries an 'x' prefix, ready to be used inside '&#...;'
	return $hex ? 'x'.dechex($int) : $int;
}
901
902
903
904
905
906
907
908
909
910 /********************************************
911 *
912 * Init functions
913 *
914 ********************************************/
915
916 /**
917 * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
918 * This function is automatically called by the conversion functions
919 *
920 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
921 *
922 * @param string The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
923 * @return integer Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed.
924 * @access private
925 */
function initCharset($charset) {
		// Already loaded: nothing to do.
	if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
		return 1;
	}

		// Conversion table filename:
	$charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';

		// No (valid) conversion table for this charset: give up.
	if (!($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile))) {
		return false;
	}

		// Cache file for charsets:
		// Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
			// NOTE(review): unserialize() trusts the cache file content - the cache directory must not be writable by untrusted parties.
		$cached = unserialize(t3lib_div::getUrl($cacheFile));
		if (is_array($cached)) {
			$this->parsedCharsets[$charset] = $cached;
			return 2;
		}
			// Unreadable or corrupt cache: fall through and parse the table again.
	}

		// Parse conversion table into lines:
	$lines = t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1);
		// Initialize the internal variable holding the conv. table:
	$this->parsedCharsets[$charset] = array('local'=>array(),'utf8'=>array());

		// The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
	$whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
	$detectedType = '';

	foreach ($lines as $value) {
			// Comment lines and blanks are ignored.
		if (!trim($value) || substr($value,0,1)=='#') continue;

			// Detect type if not done yet: (Done on first real line)
		if (!$detectedType) {
			$detectedType = preg_match($whitespacedPattern,$value) ? 'whitespaced' : 'ms-token';
		}

		if ($detectedType=='ms-token') {
			$fields = preg_split('/=|:/',$value,3);
			if (count($fields)<2) continue;	// No "byte = U+xxxx" pair on this line
			$hexbyte = $fields[0];
			$utf8 = $fields[1];
		} else {
			$regA = array();
			if (!preg_match($whitespacedPattern,$value,$regA)) continue;	// Line does not carry a mapping
			$hexbyte = $regA[1];
			$utf8 = 'U+'.$regA[2];
		}

			// Only values above 127 are stored; ASCII is passed through by the converters.
		$decval = hexdec(trim($hexbyte));
		if ($decval>127) {
			$utf8decval = hexdec(substr(trim($utf8),2));	// Strip the "U+" prefix
			$utf8char = $this->UnumberToChar($utf8decval);
			$this->parsedCharsets[$charset]['local'][$decval] = $utf8char;	// local number => UTF-8 sequence
			$this->parsedCharsets[$charset]['utf8'][$utf8char] = $decval;	// UTF-8 sequence => local number
		}
	}

		// Store the parsed table for the next request.
	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->parsedCharsets[$charset]));
	}

	return 2;
}
978
/**
 * This function initializes all UTF-8 character data tables:
 * $this->caseFolding['utf-8'] (toUpper/toLower/toTitle) and $this->toASCII['utf-8'].
 * Unless a cached table for the requested mode exists, both tables are built from
 * unidata/UnicodeData.txt (required) plus SpecialCasing.txt and Translit.txt (optional)
 * and written to typo3temp/cs/.
 *
 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
 *
 * @param	string		Mode ("case", "ascii", ...)
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initUnicodeData($mode=null) {
		// cache files
	$cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
	$cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');

		// Only process if the tables are not yet loaded
	switch($mode) {
		case 'case':
			if (is_array($this->caseFolding['utf-8'])) return 1;

				// Use cached version if possible
			if ($cacheFileCase && @is_file($cacheFileCase)) {
				$this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase));
				return 2;
			}
		break;

		case 'ascii':
			if (is_array($this->toASCII['utf-8'])) return 1;

				// Use cached version if possible
			if ($cacheFileASCII && @is_file($cacheFileASCII)) {
				$this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII));
				return 2;
			}
		break;
	}

		// process main Unicode data file
	$unicodeDataFile = PATH_t3lib.'unidata/UnicodeData.txt';
	if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) return false;

	$fh = fopen($unicodeDataFile,'rb');
	if (!$fh) return false;

		// key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
		// note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
	$this->caseFolding['utf-8'] = array();
	$utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand
	$utf8CaseFolding['toUpper'] = array();
	$utf8CaseFolding['toLower'] = array();
	$utf8CaseFolding['toTitle'] = array();

	$decomposition = array();	// array of temp. decompositions
	$mark = array();		// array of chars that are marks (eg. composing accents)
	$number = array();		// array of chars that are numbers (eg. digits)
	$omit = array();		// array of chars to be omitted (eg. Russian hard sign)

	while (!feof($fh)) {
		$line = fgets($fh,4096);
		if ($line === false) break;	// nothing more to read

			// A data line has 15 semicolon separated fields; anything shorter carries no usable record.
		$fields = explode(';', rtrim($line));
		if (count($fields) < 15) continue;

		$char = $fields[0];
		$name = $fields[1];
		$cat = $fields[2];
		$decomp = $fields[5];
		$num = $fields[8];
		$upper = $fields[12];
		$lower = $fields[13];
		$title = $fields[14];

		$ord = hexdec($char);
		if ($ord > 0xFFFF) break;	// only process the BMP

		$utf8_char = $this->UnumberToChar($ord);

		if ($upper) $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
		if ($lower) $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
			// store "title" only when different from "upper" (only a few)
			// (strict comparison: hex strings like "0E01" and "0E02" are equal as numbers)
		if ($title && $title !== $upper) $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));

		switch (substr($cat,0,1)) {
			case 'M':	// mark (accent, umlaut, ...)
				$mark["U+$char"] = 1;
			break;

			case 'N':	// numeric value
				if ($ord > 0x80 && $num !== '') $number["U+$char"] = $num;
			break;
		}

			// accented Latin letters without "official" decomposition
		$match = array();
		if (!$decomp && preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/',$name,$match)) {
			$c = ord($match[2]);
			if ($match[1] == 'SMALL') $c += 32;

			$decomposition["U+$char"] = array(dechex($c));
			continue;
		}

		$match = array();
		if (preg_match('/(<.*>)? *(.+)/',$decomp,$match)) {
			switch($match[1]) {
				case '<circle>':	// add parenthesis as circle replacement, eg (1)
					$match[2] = '0028 '.$match[2].' 0029';
				break;

				case '<square>':	// add square brackets as square replacement, eg [1]
					$match[2] = '005B '.$match[2].' 005D';
				break;

				case '<compat>':	// ignore multi char decompositions that start with a space
					if (strncmp($match[2],'0020 ',5) == 0) continue 2;
				break;

					// ignore Arabic and vertical layout presentation decomposition
				case '<initial>':
				case '<medial>':
				case '<final>':
				case '<isolated>':
				case '<vertical>':
					continue 2;
			}
			$decomposition["U+$char"] = explode(' ',$match[2]);
		}
	}
	fclose($fh);

		// process additional Unicode data for casing (allow folded characters to expand into a sequence)
	$specialCasingFile = PATH_t3lib.'unidata/SpecialCasing.txt';
	if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
		$fh = fopen($specialCasingFile,'rb');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh,4096);
				if ($line === false) break;
				if (substr($line,0,1) == '#' || trim($line) == '') continue;

				list($char,$lower,$title,$upper,$cond) = array_pad(t3lib_div::trimExplode(';', $line),5,'');
					// Only entries whose fifth field is empty or a comment are used
				if ($cond != '' && substr($cond,0,1) != '#') continue;

				$utf8_char = $this->UnumberToChar(hexdec($char));

					// Collect the foldings that differ from the character itself (same order as the tables are filled)
				$variants = array();
				if ($char !== $lower) $variants['toLower'] = $lower;
				if ($char !== $title && $title !== $upper) $variants['toTitle'] = $title;
				if ($char !== $upper) $variants['toUpper'] = $upper;

				foreach($variants as $direction => $sequence) {
					$converted = '';
					foreach(explode(' ',$sequence) as $hexCode) {
						$converted .= $this->UnumberToChar(hexdec($hexCode));
					}
					$utf8CaseFolding[$direction][$utf8_char] = $converted;
				}
			}
			fclose($fh);
		}
	}

		// process custom decompositions
	$customTranslitFile = PATH_t3lib.'unidata/Translit.txt';
	if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
		$fh = fopen($customTranslitFile,'rb');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh,4096);
				if ($line === false) break;
				if (substr($line,0,1) == '#' || trim($line) == '') continue;

				list($char,$translit) = array_pad(t3lib_div::trimExplode(';', $line),2,'');
				if (!$translit) $omit["U+$char"] = 1;
				$decomposition["U+$char"] = explode(' ', $translit);
			}
			fclose($fh);
		}
	}

		// decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
	foreach($decomposition as $from => $to) {
		$code_decomp = array();

		while ($code_value = array_shift($to)) {
			if (isset($decomposition["U+$code_value"])) {	// do recursive decomposition
				foreach(array_reverse($decomposition["U+$code_value"]) as $cv) {
					array_unshift($to, $cv);
				}
			} elseif (!isset($mark["U+$code_value"])) {	// remove mark
				array_push($code_decomp, $code_value);
			}
		}
		if (count($code_decomp) || isset($omit[$from])) {
			$decomposition[$from] = $code_decomp;
		} else {
			unset($decomposition[$from]);
		}
	}

		// create ascii only mapping
	$this->toASCII['utf-8'] = array();
	$ascii =& $this->toASCII['utf-8'];

	foreach($decomposition as $from => $to) {
		$code_decomp = array();
		while ($code_value = array_shift($to)) {
			$ord = hexdec($code_value);
			if ($ord > 127) {
				continue 2;	// skip decompositions containing non-ASCII chars
			}
			array_push($code_decomp,chr($ord));
		}
			// keys look like "U+00C0": strip the prefix before converting the hex number
		$ascii[$this->UnumberToChar(hexdec(substr($from,2)))] = join('',$code_decomp);
	}

		// add numeric decompositions
	foreach($number as $from => $to) {
		$utf8_char = $this->UnumberToChar(hexdec(substr($from,2)));
		if (!isset($ascii[$utf8_char])) {
			$ascii[$utf8_char] = $to;
		}
	}

	if ($cacheFileCase) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileCase,serialize($utf8CaseFolding));
	}

	if ($cacheFileASCII) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileASCII,serialize($ascii));
	}

	return 3;
}
1204
/**
 * Builds the case folding table ($this->caseFolding[$charset]) for a charset other than UTF-8.
 * The table is derived from the charset's conversion table and the UTF-8 case folding table;
 * the plain ASCII letters are added on top. Called automatically by the case folding functions.
 *
 * @param	string		Charset for which to initialize case folding.
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initCaseFolding($charset) {
		// Table already in memory?
	if (is_array($this->caseFolding[$charset])) return 1;

		// A cached table saves all the work below
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

		// Both the charset's conversion table and the UTF-8 case folding table are needed
	if (!$this->initCharset($charset)) return false;
	if (!$this->initUnicodeData('case')) return false;

	$nochar = chr($this->noCharByteVal);
	$directions = array('toUpper','toLower','toTitle');

	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
			// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);

		foreach ($directions as $direction) {
				// Fold in UTF-8, then bring the result back into the charset
			$cc = $this->utf8_decode($this->caseFolding['utf-8'][$direction][$utf8], $charset);
			if (!($cc == '' || $cc == $nochar)) {
				$this->caseFolding[$charset][$direction][$c] = $cc;
			}
		}
	}

		// add the ASCII case table
	$firstLower = ord('a');
	$lastLower = ord('z');
	for ($code=$firstLower; $code<=$lastLower; $code++) {
		$this->caseFolding[$charset]['toUpper'][chr($code)] = chr($code-32);
	}
	$firstUpper = ord('A');
	$lastUpper = ord('Z');
	for ($code=$firstUpper; $code<=$lastUpper; $code++) {
		$this->caseFolding[$charset]['toLower'][chr($code)] = chr($code+32);
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->caseFolding[$charset]));
	}

	return 3;
}
1266
/**
 * Builds the to-ASCII conversion table ($this->toASCII[$charset]) for a charset other than UTF-8.
 * The table is derived from the charset's conversion table and the UTF-8 to-ASCII table.
 * Called automatically by the ASCII transliteration functions.
 *
 * @param	string		Charset for which to initialize conversion.
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initToASCII($charset) {
		// Table already in memory?
	if (is_array($this->toASCII[$charset])) return 1;

		// A cached table saves all the work below
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->toASCII[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

		// Both the charset's conversion table and the UTF-8 to-ASCII table are needed
	if (!$this->initCharset($charset)) return false;
	if (!$this->initUnicodeData('ascii')) return false;

	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
			// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);

			// Take over the transliteration of the corresponding UTF-8 character, if there is one
		if (!isset($this->toASCII['utf-8'][$utf8])) continue;
		$this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->toASCII[$charset]));
	}

	return 3;
}
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328 /********************************************
1329 *
1330 * String operation functions
1331 *
1332 ********************************************/
1333
/**
 * Returns a part of a string.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * With $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'] set to "mbstring" the work is delegated to mb_substr()
 * using the given charset; otherwise the charset specific implementation of this class is used.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Start position (character position)
 * @param	integer		Length (in characters). If omitted (NULL), the rest of the string is returned.
 * @return	string		The substring
 * @see substr(), mb_substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function substr($charset,$string,$start,$len=null) {
	if ($len===0) return '';

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
			// cannot omit $len, when specifying charset
		if ($len==null) {
			$enc = mb_internal_encoding();	// save internal encoding
			mb_internal_encoding($charset);	// the string is in $charset, not necessarily in utf-8
			$str = mb_substr($string,$start);
			mb_internal_encoding($enc);	// restore internal encoding

			return $str;
		}
		return mb_substr($string,$start,$len,$charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_substr($string,$start,$len);
	} elseif ($this->eucBasedSets[$charset]) {
		return $this->euc_substr($string,$start,$charset,$len);
	} elseif ($this->twoByteSets[$charset]) {
			// an omitted length must not be multiplied into "0 bytes"
		return $len === null ? substr($string,$start*2) : substr($string,$start*2,$len*2);
	} elseif ($this->fourByteSets[$charset]) {
		return $len === null ? substr($string,$start*4) : substr($string,$start*4,$len*4);
	}

		// treat everything else as single-byte encoding
	return $len === null ? substr($string,$start) : substr($string,$start,$len);
}
1373
/**
 * Returns the length of a string, counted in characters of the given charset.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * @param	string		The character set
 * @param	string		Character string
 * @return	integer		The number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strlen($charset,$string) {
		// Let mbstring do the job if configured
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strlen($string,$charset);
	}

		// Variable width encodings need to be scanned
	if ($charset == 'utf-8') {
		return $this->utf8_strlen($string);
	}
	if ($this->eucBasedSets[$charset]) {
		return $this->euc_strlen($string,$charset);
	}

		// Fixed width encodings: derive the count from the byte length
	$bytes = strlen($string);
	if ($this->twoByteSets[$charset]) {
		return $bytes/2;
	}
	if ($this->fourByteSets[$charset]) {
		return $bytes/4;
	}

		// treat everything else as single-byte encoding
	return $bytes;
}
1399
/**
 * Shortens a string to a number of characters and adds a crop signifier.
 * A positive length keeps the beginning of the string and appends the signifier,
 * a negative length keeps the end of the string and prepends it.
 * The string is returned untouched if the length is zero or if nothing has to be cut off.
 * Unit tested by Kasper
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Length (in characters)
 * @param	string		Crop signifier
 * @return	string		The shortened string
 * @see substr(), mb_strimwidth()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function crop($charset,$string,$len,$crop='') {
	if (intval($len) == 0) return $string;

		// Find the byte position where the string has to be cut:
	if ($charset == 'utf-8') {
		$cut = $this->utf8_char2byte_pos($string,$len);
	} elseif ($this->eucBasedSets[$charset]) {
		$cut = $this->euc_char2byte_pos($string,$len,$charset);
	} elseif ($len > 0) {
			// everything else is handled as single-byte encoding
		$cut = $len;
	} else {
		$cut = strlen($string)+$len;
		if ($cut<=0) $cut = false;
	}

		// $len outside actual string length
	if ($cut === false) return $string;

		// Only crop if there really is a byte at the probed position; otherwise nothing would be cut off.
	$bytes = strlen($string);
	if ($len > 0) {
		$probe = intval($cut);
		if ($probe >= 0 && $probe < $bytes) {
			return substr($string,0,$cut).$crop;
		}
	} else {
		$probe = intval($cut-1);
		if ($probe >= 0 && $probe < $bytes) {
			return $crop.substr($string,$cut);
		}
	}

	return $string;
}
1454
/**
 * Cuts a string short at a given byte length.
 * For the multibyte charsets known to this class the cut is moved so that no character is split.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		The byte length
 * @return	string		The shortened string (empty if the length is zero or negative)
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strtrunc($charset,$string,$len) {
	if ($len <= 0) return '';

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strcut($string,0,$len,$charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_strtrunc($string,$len);
	} elseif ($this->eucBasedSets[$charset]) {
			// euc_strtrunc() expects (string, byte length, charset)
		return $this->euc_strtrunc($string,$len,$charset);
	} elseif ($this->twoByteSets[$charset]) {
		if ($len % 2) $len--;		// don't cut at odd positions
	} elseif ($this->fourByteSets[$charset]) {
		$len -= $len % 4;		// realign to position dividable by four
	}

		// treat everything else as single-byte encoding
	return substr($string,0,$len);
}
1483
/**
 * Translates all characters of a string into their respective case values.
 * Unlike strtolower() and strtoupper() this method is locale independent.
 * Note that the string length may change!
 * eg. lower case German ß (sharp S) becomes upper case "SS"
 * Unit-tested by Kasper
 * Real case folding is language dependent, this method ignores this fact.
 *
 * With $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'] set to "mbstring" (and the mbstring case functions available)
 * the conversion is delegated to mb_strtolower()/mb_strtoupper(); otherwise the folding tables of this class are used.
 *
 * @param	string		Character set of string
 * @param	string		Input string to convert case for
 * @param	string		Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
 * @return	string		The converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper()
 */
function conv_case($charset,$string,$case) {
		// mb_strtolower()/mb_strtoupper() only exist in newer PHP versions, so check for them directly
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring' && function_exists('mb_strtolower')) {
		if ($case == 'toLower') {
			return mb_strtolower($string,$charset);
		}
		return mb_strtoupper($string,$charset);
	}

	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string,'case',$case);
	}
	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string,$charset,'case',$case);
	}

		// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string,$charset,'case',$case);
}
1517
/**
 * Converts special chars (like æøå, umlauts etc) to ascii equivalents (usually double-bytes, like æ => ae etc.)
 * The conversion is done by the character mapping function matching the charset, in 'ascii' mode.
 *
 * @param	string		Character set of string
 * @param	string		Input string to convert
 * @return	string		The converted string
 */
function specCharsToASCII($charset,$string) {
	if ($charset == 'utf-8') {
		$converted = $this->utf8_char_mapping($string,'ascii');
	} elseif (isset($this->eucBasedSets[$charset])) {
		$converted = $this->euc_char_mapping($string,$charset,'ascii');
	} else {
			// treat everything else as single-byte encoding
		$converted = $this->sb_char_mapping($string,$charset,'ascii');
	}

	return $converted;
}
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549 /********************************************
1550 *
1551 * Internal string operation functions
1552 *
1553 ********************************************/
1554
/**
 * Maps all characters of a string in a single byte charset.
 * Every byte found in the mapping table of the selected mode is replaced by its table value,
 * all other bytes are copied unchanged. The string is returned as is if the table cannot be
 * initialized or if the mode is unknown.
 *
 * @param	string		the string
 * @param	string		the charset
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function sb_char_mapping($str,$charset,$mode,$opt='') {
		// Select the mapping table:
	if ($mode == 'case') {
		if (!$this->initCaseFolding($charset)) return $str;	// do nothing
		$map =& $this->caseFolding[$charset][$opt];
	} elseif ($mode == 'ascii') {
		if (!$this->initToASCII($charset)) return $str;	// do nothing
		$map =& $this->toASCII[$charset];
	} else {
		return $str;
	}

		// Translate byte by byte:
	$out = '';
	$bytes = strlen($str);
	for ($i=0; $i<$bytes; $i++) {
		$c = $str[$i];
		$out .= isset($map[$c]) ? $map[$c] : $c;
	}

	return $out;
}
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603 /********************************************
1604 *
1605 * Internal UTF-8 string operation functions
1606 *
1607 ********************************************/
1608
/**
 * Returns a part of a UTF-8 string.
 * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
 *
 * @param	string		UTF-8 string
 * @param	integer		Start position (character position)
 * @param	integer		Length (in characters). If omitted, the rest of the string is returned.
 * @return	string		The substring. FALSE if a positive $start lies outside the string.
 * @see substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_substr($str,$start,$len=null) {
		// A length of zero always gives an empty string
	if ($len !== null && !strcmp($len,'0')) return '';

	$byte_start = $this->utf8_char2byte_pos($str,$start);
	if ($byte_start === false) {
		if ($start > 0) {
			return false;	// $start outside string length
		}
			// A zero or negative start that cannot be resolved means: take the string from its beginning
		$byte_start = 0;
	}

	$str = substr($str,$byte_start);

	if ($len!=null) {
		$byte_end = $this->utf8_char2byte_pos($str,$len);
		if ($byte_end === false) {	// $len outside actual string length
			return $len<0 ? '' : $str;	// When length is less than zero and exceeds, then we return blank string.
		}
		return substr($str,0,$byte_end);
	}

	return $str;
}
1643
/**
 * Counts the number of characters of a string in UTF-8.
 * Every byte that is not a continuation byte (10xxxxxx) starts a character and is counted.
 * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
 *
 * @param	string		UTF-8 multibyte character string
 * @return	integer		The number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strlen($str) {
	$chars = 0;
	$bytes = strlen($str);
	for ($pos=0; $pos<$bytes; $pos++) {
			// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		if ((ord($str[$pos]) & 0xC0) != 0x80) $chars++;
	}
	return $chars;
}
1664
/**
 * Truncates a string in UTF-8 short at a given byte length.
 * If the cut would split a multibyte character, the whole character is dropped.
 * A multibyte character that fits completely into the length is kept, also if it is the first character of the string.
 *
 * @param	string		UTF-8 multibyte character string
 * @param	integer		the byte length
 * @return	string		the shortened string (empty if the length is zero or negative)
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strtrunc($str,$len) {
	$len = intval($len);
	if ($len <= 0) return '';
	if ($len >= strlen($str)) return $str;	// nothing to cut off

	$i = $len-1;
	if (ord($str[$i]) & 0x80) {	// part of a multibyte sequence
			// find the first byte of the sequence
		while ($i > 0 && (ord($str[$i]) & 0xC0) == 0x80) $i--;

		$lead = ord($str[$i]);
		if (($lead & 0xC0) == 0x80) return '';	// malformed: nothing but continuation bytes up to here
		if (($lead & 0xC0) != 0xC0) return substr($str,0,$i+1);	// malformed: continuation bytes without a starting byte are dropped

			// the number of leading 1-bits is the number of bytes of the character
		for ($bc=0; $lead & 0x80; $lead = $lead << 1) $bc++;
		if ($bc+$i > $len) return substr($str,0,$i);	// character does not fit: cut before it
			// fallthru: multibyte char fits into length
	}
	return substr($str,0,$len);
}
1685
/**
 * Find position of first occurrence of a string, both arguments are in UTF-8.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 string to search for
 * @param	integer		Position to start the search (character position)
 * @return	integer		The character position, FALSE if the string was not found or the offset is beyond the string
 * @see strpos()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strpos($haystack,$needle,$offset=0) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
			// third argument is the offset, the encoding comes fourth
		return mb_strpos($haystack,$needle,$offset,'utf-8');
	}

		// The byte oriented strpos() is used, so the offset has to be translated into bytes ...
	$byte_offset = $this->utf8_char2byte_pos($haystack,$offset);
	if ($byte_offset === false) return false;	// offset beyond string length

	$byte_pos = strpos($haystack,$needle,$byte_offset);
	if ($byte_pos === false) return false;	// needle not found

		// ... and the result back into characters
	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
1709
/**
 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 character to search for (single character)
 * @return	integer		The character position, FALSE if the character was not found
 * @see strrpos()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strrpos($haystack,$needle) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
			// The position of the encoding argument of mb_strrpos() differs between PHP versions,
			// so the internal encoding is switched temporarily instead of passing it.
		$enc = mb_internal_encoding();	// save internal encoding
		mb_internal_encoding('utf-8');
		$pos = mb_strrpos($haystack,$needle);
		mb_internal_encoding($enc);	// restore internal encoding

		return $pos;
	}

	$byte_pos = strrpos($haystack,$needle);
	if ($byte_pos === false) return false;	// needle not found

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
1729
/**
 * Translates a character position into an 'absolute' byte position.
 * Unit tested by Kasper.
 *
 * A position of zero or above is counted from the start of the string. A negative position is counted
 * from the end and gives the byte position where the character of that position starts.
 * FALSE is returned if the scan runs over an end of the string. Note: for a string ending with a
 * multibyte character the position right behind the last character gives the byte length instead of FALSE.
 *
 * @param	string		UTF-8 string
 * @param	integer		Character position (negative values start from the end)
 * @return	integer		Byte position, FALSE if the position is outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char2byte_pos($str,$pos) {
	$size = strlen($str);
	$found = 0;		// number of characters found
	$wanted = abs($pos);	// number of characters wanted

		// All string offsets are checked against the string bounds explicitly:
		// negative offsets are not "empty" in every PHP version, they may address bytes from the end.
	if ($pos >= 0) {
		$i = 0;
		while ($i < $size && $found < $wanted) {
				// every byte but a continuation byte (10xxxxxx) starts a character
			if ((ord($str[$i]) & 0xC0) != 0x80) $found++;
			$i++;
		}
		if ($i >= $size) return false;	// offset beyond string length

			// skip trailing multi-byte data bytes
		while ($i < $size && (ord($str[$i]) & 0xC0) == 0x80) $i++;

		return $i;
	}

	$i = $size-1;
	while ($i >= 0 && $found < $wanted) {
		if ((ord($str[$i]) & 0xC0) != 0x80) $found++;
		$i--;
	}
	if ($i < 0) return false;	// offset beyond string length

		// correct offset: the scan stopped one byte in front of the character
	return $i+1;
}
1770
/**
 * Translates an 'absolute' byte position into a character position.
 * Unit tested by Kasper.
 *
 * The byte position is expected to lie inside the string (the position right behind the last byte is accepted, too).
 *
 * @param	string		UTF-8 string
 * @param	integer		byte position
 * @return	integer		character position, FALSE if the string is empty or the byte position lies outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_byte2char_pos($str,$pos) {
	$size = strlen($str);
	if ($size == 0 || $pos < 0 || $pos > $size) return false;	// offset beyond string length

	$n = 0;	// number of characters
		// Count the characters starting at the bytes 1..$pos. The first byte of the string is left out,
		// so the result is the zero based index of the character the byte at $pos belongs to.
	for ($i=1; $i<=$pos; $i++) {
			// the end of the string and every byte but a continuation byte (10xxxxxx) count as start of a character
		if ($i == $size || (ord($str[$i]) & 0xC0) != 0x80) $n++;
	}

	return $n;
}
1793
/**
 * Maps all characters of an UTF-8 string.
 * Every character found in the mapping table of the selected mode is replaced by its table value,
 * all other characters are copied unchanged. The string is returned as is if the tables cannot be
 * initialized or if the mode is unknown.
 *
 * @param	string		UTF-8 string
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char_mapping($str,$mode,$opt='') {
	if (!$this->initUnicodeData($mode)) return $str;	// do nothing

	switch($mode) {
		case 'case':
			$map =& $this->caseFolding['utf-8'][$opt];
		break;

		case 'ascii':
			$map =& $this->toASCII['utf-8'];
		break;

		default:
			return $str;
	}

	$out = '';
	$bytes = strlen($str);
	for ($i=0; $i<$bytes; $i++) {
		$c = ord($str[$i]);
		if (($c & 0xC0) == 0xC0) {	// multi-byte starting byte (11xxxxxx)
			for ($bc=0; $c & 0x80; $c = $c << 1) { $bc++; }	// calculate number of bytes
			$mbc = substr($str,$i,$bc);
			$i += $bc-1;
		} else {
				// single-byte (0xxxxxxx). A stray continuation byte (malformed input) is taken
				// as it is, too, instead of repeating the character seen before.
			$mbc = $str[$i];
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857 /********************************************
1858 *
1859 * Internal EUC string operation functions
1860 *
1861 * Extended Unix Code:
1862 * ASCII compatible 7bit single bytes chars
1863 * 8bit two byte chars
1864 *
1865 * Shift-JIS is treated as a special case.
1866 *
1867 ********************************************/
1868
/**
 * Cuts a string in the EUC charset family short at a given byte length.
 * If the cut would split a double-byte character, the whole character is dropped,
 * so the result is never longer than the given byte length.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		the byte length
 * @param	string		the charset
 * @return	string		the shortened string (empty if the length is zero or negative)
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strtrunc($str,$len,$charset) {
	if ($len <= 0) return '';
	if ($len >= strlen($str)) return $str;	// string not longer than supplied length

	$sjis = ($charset == 'shift_jis');

		// Walk along the characters until the byte length is reached or exceeded.
		// $len is smaller than the string length here, so every byte read exists.
	$i = 0;
	while ($i < $len) {
		$c = ord($str[$i]);
		if ($sjis) {
			$doubleByte = (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0));
		} else {
			$doubleByte = ($c >= 0x80);
		}
		$i += $doubleByte ? 2 : 1;	// advance a double-byte or a single-byte char
	}

	if ($i > $len) {
		return substr($str,0,$len-1);	// we ended on a first byte
	}
	return substr($str,0,$len);
}
1897
/**
 * Returns a part of a string in the EUC charset family.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		start position (character position)
 * @param	string		the charset
 * @param	integer		length (in characters). If omitted, the rest of the string is returned.
 * @return	string		the substring, FALSE if $start lies outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_substr($str,$start,$charset,$len=null) {
		// A length of zero always gives an empty string (like utf8_substr() does)
	if ($len !== null && !strcmp($len,'0')) return '';

	$byte_start = $this->euc_char2byte_pos($str,$start,$charset);
	if ($byte_start === false) return false;	// $start outside string length

	$str = substr($str,$byte_start);

	if ($len!=null) {
		$byte_end = $this->euc_char2byte_pos($str,$len,$charset);
		if ($byte_end === false) {	// $len outside actual string length
			return $str;
		}
		return substr($str,0,$byte_end);
	}

	return $str;
}
1923
/**
 * Counts the number of characters of a string in the EUC charset family.
 * Every lead byte together with the byte following it counts as one character;
 * every other byte counts as one character on its own.
 *
 * @param	string		EUC multibyte character string
 * @param	string		the charset ('shift_jis' selects the Shift-JIS lead byte ranges, anything else treats every byte >= 0x80 as a lead byte)
 * @return	integer		the number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strlen($str,$charset) {
	$sjis = ($charset == 'shift_jis');
	$byteLen = strlen($str);	// explicit bound instead of probing an out-of-range offset
	$n = 0;

	for ($i=0; $i<$byteLen; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			$isLead = (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0));
		} else {
			$isLead = ($c >= 0x80);
		}
		if ($isLead) $i++;	// advance a double-byte char

		$n++;
	}

	return $n;
}
1950
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * The string is always scanned from its beginning, also for negative
 * positions: whether a byte is the second half of a double-byte char can
 * only be decided from the lead byte in front of it, so a scan running
 * backwards from the end may land in the middle of a character.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		character position (negative values start from the end)
 * @param	string		the charset ('shift_jis' selects the Shift-JIS lead byte ranges, anything else treats every byte >= 0x80 as a lead byte)
 * @return	integer		byte position; false if a non-negative position lies at or behind the end of the string, or if a negative position reaches further back than the string has characters
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char2byte_pos($str,$pos,$charset) {
	$sjis = ($charset == 'shift_jis');
	$byteLen = strlen($str);
	$p = abs($pos);	// number of characters wanted

	if ($pos >= 0) {
		$n = 0;	// number of characters seen
		for ($i=0; $i<$byteLen && $n<$p; $i++) {
			$c = ord($str[$i]);
			if ($sjis) {
				$isLead = (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0));
			} else {
				$isLead = ($c >= 0x80);
			}
			if ($isLead) $i++;	// advance a double-byte char

			$n++;
		}
		if ($i >= $byteLen) return false;	// offset beyond string length

		return $i;
	}

		// Negative position: remember the byte offset at which every character starts,
		// then pick the $p-th one counted from the end.
	$starts = array();
	for ($i=0; $i<$byteLen; $i++) {
		$starts[] = $i;
		$c = ord($str[$i]);
		if ($sjis) {
			$isLead = (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0));
		} else {
			$isLead = ($c >= 0x80);
		}
		if ($isLead) $i++;	// advance a double-byte char
	}

	$count = count($starts);
	if ($p > $count) return false;	// offset beyond string length

	return $starts[$count-$p];
}
1990
/**
 * Maps all characters of a string in the EUC charset family.
 * Each character (one byte, or a lead byte plus the byte following it) is
 * looked up in the selected mapping table; characters without an entry are
 * copied to the output unchanged.
 *
 * @param	string		EUC multibyte character string
 * @param	string		the charset
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration); any other value returns the string unchanged
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string (the unchanged input if the mapping table could not be initialized)
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char_mapping($str,$charset,$mode,$opt='') {
	switch($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) return $str;	// do nothing
			$map =& $this->caseFolding[$charset][$opt];
		break;

		case 'ascii':
			if (!$this->initToASCII($charset)) return $str;	// do nothing
			$map =& $this->toASCII[$charset];
		break;

		default:
			return $str;
	}

	$sjis = ($charset == 'shift_jis');
	$byteLen = strlen($str);	// explicit bound instead of probing an out-of-range offset
	$out = '';

	for ($i=0; $i<$byteLen; $i++) {
		$mbc = $str[$i];
		$c = ord($mbc);

		if ($sjis) {
			$isLead = (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0));
		} else {
			$isLead = ($c >= 0x80);
		}
		if ($isLead) {	// a double-byte char: look up both bytes as one key
			$mbc = substr($str,$i,2);
			$i++;
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
2045
2046 }
2047
2048 if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']) {
2049 include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
2050 }
2051 ?>