Fixed a few small bugs in indexed-search
[Packages/TYPO3.CMS.git] / t3lib / class.t3lib_cs.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2003-2004 Kasper Skaarhoj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the Typo3 project. The Typo3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 *
17 * This script is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * This copyright notice MUST APPEAR in all copies of the script!
23 ***************************************************************/
24 /**
25 * Class for conversion between charsets.
26 *
27 * $Id$
28 *
29 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
30 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
31 */
32 /**
33 * [CLASS/FUNCTION INDEX of SCRIPT]
34 *
35 *
36 *
37 * 136: class t3lib_cs
38 * 502: function parse_charset($charset)
39 * 521: function get_locale_charset($locale)
40 *
41 * SECTION: Charset Conversion functions
42 * 574: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
43 * 614: function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0)
44 * 631: function utf8_encode($str,$charset)
45 * 678: function utf8_decode($str,$charset,$useEntityForNoChar=0)
46 * 721: function utf8_to_entities($str)
47 * 754: function entities_to_utf8($str,$alsoStdHtmlEnt=0)
48 * 788: function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
49 * 838: function UnumberToChar($cbyte)
50 * 883: function utf8CharToUnumber($str,$hex=0)
51 *
52 * SECTION: Init functions
53 * 926: function initCharset($charset)
54 * 988: function initUnicodeData($mode=null)
55 * 1213: function initCaseFolding($charset)
56 * 1275: function initToASCII($charset)
57 *
58 * SECTION: String operation functions
59 * 1346: function substr($charset,$string,$start,$len=null)
60 * 1384: function strlen($charset,$string)
61 * 1412: function crop($charset,$string,$len,$crop='')
62 * 1465: function strtrunc($charset,$string,$len)
63 * 1499: function conv_case($charset,$string,$case)
64 * 1525: function specCharsToASCII($charset,$string)
65 *
66 * SECTION: Internal string operation functions
67 * 1565: function sb_char_mapping($str,$charset,$mode,$opt='')
68 *
69 * SECTION: Internal UTF-8 string operation functions
70 * 1620: function utf8_substr($str,$start,$len=null)
71 * 1653: function utf8_strlen($str)
72 * 1674: function utf8_strtrunc($str,$len)
73 * 1696: function utf8_strpos($haystack,$needle,$offset=0)
74 * 1719: function utf8_strrpos($haystack,$needle)
75 * 1739: function utf8_char2byte_pos($str,$pos)
76 * 1780: function utf8_byte2char_pos($str,$pos)
77 * 1803: function utf8_char_mapping($str,$mode,$opt='')
78 *
79 * SECTION: Internal EUC string operation functions
80 * 1879: function euc_strtrunc($str,$len,$charset)
81 * 1908: function euc_substr($str,$start,$charset,$len=null)
82 * 1933: function euc_strlen($str,$charset)
83 * 1960: function euc_char2byte_pos($str,$pos,$charset)
84 * 2001: function euc_char_mapping($str,$charset,$mode,$opt='')
85 *
86 * TOTAL FUNCTIONS: 35
87 * (This index is automatically created/updated by the extension "extdeveval")
88 *
89 */
90
91
92
93
94
95
96
97
98 /**
99 * Notes on UTF-8
100 *
101 * Functions working on UTF-8 strings:
102 *
103 * - strchr/strstr
104 * - strrchr
105 * - substr_count
106 * - implode/explode/join
107 *
108 * Functions nearly working on UTF-8 strings:
109 *
110 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
111 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
112 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
113 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
114 *
115 * Functions NOT working on UTF-8 strings:
116 *
117 * - str*cmp
118 * - stristr
119 * - stripos
120 * - substr
121 * - strrev
122 * - ereg/eregi
123 * - split/spliti
124 * - preg_*
125 * - ...
126 *
127 */
128 /**
129 * Class for conversion between charsets
130 *
131 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
132 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
133 * @package TYPO3
134 * @subpackage t3lib
135 */
136 class t3lib_cs {
137 var $noCharByteVal=63; // ASCII Value for chars with no equivalent.
138
139 // This is the array where parsed conversion tables are stored (cached)
140 var $parsedCharsets=array();
141
142 // An array where case folding data will be stored (cached)
143 var $caseFolding=array();
144
145 // An array where charset-to-ASCII mappings are stored (cached)
146 var $toASCII=array();
147
148 // This tells the converter which charsets has two bytes per char:
149 var $twoByteSets=array(
150 'ucs-2'=>1, // 2-byte Unicode
151 );
152
153 // This tells the converter which charsets has four bytes per char:
154 var $fourByteSets=array(
155 'ucs-4'=>1, // 4-byte Unicode
156 'utf-32'=>1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
157 );
158
159 // This tells the converter which charsets use a scheme like the Extended Unix Code:
160 var $eucBasedSets=array(
161 'gb2312'=>1, // Chinese, simplified.
162 'big5'=>1, // Chinese, traditional.
163 'euc-kr'=>1, // Korean
164 'shift_jis'=>1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
165 );
166
// Charset synonyms: alias (lowercase) => charset name as used by the rest of this class.
// Looked up by parse_charset() after lowercasing the input.
// see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
// http://czyborra.com/charsets/iso8859.html
var $synonyms=array(
	'us' => 'ascii',
	'us-ascii'=> 'ascii',
	'cp819' => 'iso-8859-1',
	'ibm819' => 'iso-8859-1',
	'iso-ir-100' => 'iso-8859-1',
	'iso-ir-109' => 'iso-8859-2',
	'iso-ir-148' => 'iso-8859-9',
	'iso-ir-199' => 'iso-8859-14',
	'iso-ir-203' => 'iso-8859-15',
	'csisolatin1' => 'iso-8859-1',
	'csisolatin2' => 'iso-8859-2',
	'csisolatin3' => 'iso-8859-3',
	'csisolatin5' => 'iso-8859-9',
	'csisolatin8' => 'iso-8859-14',
	'csisolatin9' => 'iso-8859-15',
	'csisolatingreek' => 'iso-8859-7',
	'iso-celtic' => 'iso-8859-14',
	'latin1' => 'iso-8859-1',
	'latin2' => 'iso-8859-2',
	'latin3' => 'iso-8859-3',
	'latin5' => 'iso-8859-9',
	'latin6' => 'iso-8859-10',
	'latin8' => 'iso-8859-14',
	'latin9' => 'iso-8859-15',
	'l1' => 'iso-8859-1',
	'l2' => 'iso-8859-2',
	'l3' => 'iso-8859-3',
	'l5' => 'iso-8859-9',
	'l6' => 'iso-8859-10',
	'l8' => 'iso-8859-14',
	'l9' => 'iso-8859-15',
	'cyrillic' => 'iso-8859-5',
	'arabic' => 'iso-8859-6',
	'tis-620' => 'iso-8859-11',
	'win874' => 'windows-874',
	'win1250' => 'windows-1250',
	'win1251' => 'windows-1251',
	'win1252' => 'windows-1252',
	'win1253' => 'windows-1253',
	'win1254' => 'windows-1254',
	'win1255' => 'windows-1255',
	'win1256' => 'windows-1256',
	'win1257' => 'windows-1257',
	'win1258' => 'windows-1258',
	'cp1250' => 'windows-1250',
	'cp1251' => 'windows-1251',
	'cp1252' => 'windows-1252',
	'ms-ee' => 'windows-1250',
	'ms-ansi' => 'windows-1252',
	'ms-greek' => 'windows-1253',
	'ms-turk' => 'windows-1254',
	'winbaltrim' => 'windows-1257',
	'koi-8ru' => 'koi-8r',
	'koi8r' => 'koi-8r',
	'cp878' => 'koi-8r',
	'mac' => 'macroman',
	'macintosh' => 'macroman',
	'euc-cn' => 'gb2312',
	'x-euc-cn' => 'gb2312',
	'euccn' => 'gb2312',
	'cp936' => 'gb2312',
	'big-5' => 'big5',
	'cp950' => 'big5',
	'eucjp' => 'euc-jp',
	'sjis' => 'shift_jis',
	'shift-jis' => 'shift_jis',
	'cp932' => 'shift_jis',
	'cp949' => 'euc-kr',
	'utf7' => 'utf-7',
	'utf8' => 'utf-8',
	'utf16' => 'utf-16',
	'utf32' => 'utf-32',
	'ucs2' => 'ucs-2',
	'ucs4' => 'ucs-4',
);
246
// Mapping of language identifiers (as found in the language part of a locale string)
// to language (family) names. The family names are the keys of
// $lang_to_charset_unix / $lang_to_charset_windows. Used by get_locale_charset().
var $lang_to_langfamily=array(
		// two-letter ISO 639 language codes, see:
		// http://www.w3.org/WAI/ER/IG/ert/iso639.htm
		// http://www.unicode.org/onlinedat/languages.html
	'ar' => 'arabic',
	'bg' => 'cyrillic',
	'cs' => 'east_european',
	'da' => 'west_european',
	'de' => 'west_european',
	'es' => 'west_european',
	'et' => 'estonian',
	'eu' => 'west_european',
	'fi' => 'west_european',
	'fr' => 'west_european',
	'gr' => 'greek',
	'hr' => 'east_european',
	'hu' => 'east_european',
	'iw' => 'hebrew',
	'is' => 'west_european',
	'it' => 'west_european',
	'ja' => 'japanese',
	'kl' => 'west_european',
	'ko' => 'korean',
	'lt' => 'lithuanian',
	'lv' => 'west_european', // Latvian/Lettish
	'nl' => 'west_european',
	'no' => 'west_european',
	'pl' => 'east_european',
	'pt' => 'west_european',
	'ro' => 'east_european',
	'ru' => 'cyrillic',
	'sk' => 'east_european',
	'sl' => 'east_european',
	'sv' => 'west_european',
	'th' => 'thai',
	'uk' => 'cyrillic',
	'vi' => 'vietnamese',
	'zh' => 'chinese',
		// MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
	'chs' => 'simpl_chinese',
	'cht' => 'trad_chinese',
	'csy' => 'east_european',
	'dan' => 'west_european',
	'deu' => 'west_european',
	'dea' => 'west_european',
	'des' => 'west_european',
	'ena' => 'west_european',
	'enc' => 'west_european',
	'eng' => 'west_european',
	'enz' => 'west_european',
	'enu' => 'west_european',
	'nld' => 'west_european',
	'nlb' => 'west_european',
	'fin' => 'west_european',
	'fra' => 'west_european',
	'frb' => 'west_european',
	'frc' => 'west_european',
	'frs' => 'west_european',
	'ell' => 'greek',
	'hun' => 'east_european',
	'isl' => 'west_european',	// was misspelt 'west_euorpean', which matches no charset table key
	'ita' => 'west_european',
	'its' => 'west_european',
	'jpn' => 'japanese',
	'kor' => 'korean',
	'nor' => 'west_european',
	'non' => 'west_european',
	'plk' => 'east_european',
	'ptg' => 'west_european',
	'ptb' => 'west_european',
	'rus' => 'cyrillic',		// kept consistent with 'ru' and 'russian'
	'sky' => 'east_european',
	'esp' => 'west_european',
	'esm' => 'west_european',
	'esn' => 'west_european',
	'sve' => 'west_european',
	'trk' => 'turkish',
		// English language names
	'bulgarian' => 'cyrillic',	// kept consistent with 'bg'
	'catalan' => 'west_european',
	'croatian' => 'east_european',
	'czech' => 'east_european',
	'danish' => 'west_european',
	'dutch' => 'west_european',
	'english' => 'west_european',
	'finnish' => 'west_european',
	'french' => 'west_european',
	'galician' => 'west_european',
	'german' => 'west_european',
	'hungarian' => 'east_european',
	'icelandic' => 'west_european',
	'italian' => 'west_european',
	'latvian' => 'west_european',
	'lettish' => 'west_european',
	'norwegian' => 'west_european',
	'polish' => 'east_european',
	'portuguese' => 'west_european',
	'russian' => 'cyrillic',
	'romanian' => 'east_european',
	'slovak' => 'east_european',
	'slovenian' => 'east_european',
	'spanish' => 'west_european',
	'svedish' => 'west_european',	// historic misspelling, kept for backwards compatibility
	'swedish' => 'west_european',
	'turkish' => 'turkish',		// kept consistent with 'trk'
	'ukrainian' => 'cyrillic',
);
354
355 // mapping of language (family) names to charsets on Unix
356 var $lang_to_charset_unix=array(
357 'west_european' => 'iso-8859-1',
358 'estonian' => 'iso-8859-1',
359 'east_european' => 'iso-8859-2',
360 'baltic' => 'iso-8859-4',
361 'cyrillic' => 'iso-8859-5',
362 'arabic' => 'iso-8859-6',
363 'greek' => 'iso-8859-7',
364 'hebrew' => 'iso-8859-8',
365 'turkish' => 'iso-8859-9',
366 'thai' => 'iso-8859-11', // = TIS-620
367 'lithuanian' => 'iso-8859-13',
368 'chinese' => 'gb2312', // = euc-cn
369 'japanese' => 'euc-jp',
370 'korean' => 'euc-kr',
371 'simpl_chinese' => 'gb2312',
372 'trad_chinese' => 'big5',
373 'vietnamese' => '',
374 );
375
376 // mapping of language (family) names to charsets on Windows
377 var $lang_to_charset_windows=array(
378 'east_european' => 'windows-1250',
379 'cyrillic' => 'windows-1251',
380 'west_european' => 'windows-1252',
381 'greek' => 'windows-1253',
382 'turkish' => 'windows-1254',
383 'hebrew' => 'windows-1255',
384 'arabic' => 'windows-1256',
385 'baltic' => 'windows-1257',
386 'estonian' => 'windows-1257',
387 'lithuanian' => 'windows-1257',
388 'vietnamese' => 'windows-1258',
389 'thai' => 'cp874',
390 'korean' => 'cp949',
391 'chinese' => 'gb2312',
392 'japanese' => 'shift_jis',
393 'simpl_chinese' => 'gb2312',
394 'trad_chinese' => 'big5',
395 );
396
397 // mapping of locale names to charsets
398 var $locale_to_charset=array(
399 'japanese.euc' => 'euc-jp',
400 'ja_jp.ujis' => 'euc-jp',
401 'korean.euc' => 'euc-kr',
402 'zh_cn' => 'gb2312',
403 'zh_hk' => 'big5',
404 'zh_tw' => 'big5',
405 );
406
407 // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
408 // Empty values means "iso-8859-1"
409 var $charSetArray = array(
410 'dk' => '',
411 'de' => '',
412 'no' => '',
413 'it' => '',
414 'fr' => '',
415 'es' => '',
416 'nl' => '',
417 'cz' => 'windows-1250',
418 'pl' => 'iso-8859-2',
419 'si' => 'windows-1250',
420 'fi' => '',
421 'tr' => 'iso-8859-9',
422 'se' => '',
423 'pt' => '',
424 'ru' => 'windows-1251',
425 'ro' => 'iso-8859-2',
426 'ch' => 'gb2312',
427 'sk' => 'windows-1250',
428 'lt' => 'windows-1257',
429 'is' => 'utf-8',
430 'hr' => 'windows-1250',
431 'hu' => 'iso-8859-2',
432 'gl' => '',
433 'th' => 'iso-8859-11',
434 'gr' => 'iso-8859-7',
435 'hk' => 'big5',
436 'eu' => '',
437 'bg' => 'windows-1251',
438 'br' => '',
439 'et' => 'iso-8859-4',
440 'ar' => 'iso-8859-6',
441 'he' => 'utf-8',
442 'ua' => 'windows-1251',
443 'jp' => 'shift_jis',
444 'lv' => 'utf-8',
445 'vn' => 'utf-8',
446 'ca' => 'iso-8859-15',
447 'ba' => 'iso-8859-2',
448 'kr' => 'euc-kr',
449 );
450
451 // TYPO3 specific: Array with the iso names used for each system language in TYPO3:
452 // An empty value means the ISO code is the same as the TYPO3 language key
453 var $isoArray = array(
454 'dk' => 'da',
455 'de' => '',
456 'no' => '',
457 'it' => '',
458 'fr' => '',
459 'es' => '',
460 'nl' => '',
461 'cz' => 'cs',
462 'pl' => '',
463 'si' => 'sl',
464 'fi' => '',
465 'tr' => '',
466 'se' => 'sv',
467 'pt' => '',
468 'ru' => '',
469 'ro' => '',
470 'ch' => 'zh_CN',
471 'sk' => '',
472 'lt' => '',
473 'is' => '',
474 'hr' => '',
475 'hu' => '',
476 'gl' => '', // Greenlandic
477 'th' => '',
478 'gr' => 'el',
479 'hk' => 'zh_HK',
480 'eu' => '',
481 'bg' => '',
482 'br' => 'pt_BR',
483 'et' => '',
484 'ar' => '',
485 'he' => 'iw',
486 'ua' => 'uk',
487 'jp' => 'ja',
488 'lv' => '',
489 'vn' => 'vi',
490 'ca' => '',
491 'ba' => '', // Bosnian
492 'kr' => '',
493 );
494
/**
 * Normalizes a charset name: the name is lowercased and, if it is a known
 * alias (see $this->synonyms), replaced by the name registered for that alias.
 *
 * @param	string		Input charset
 * @return	string		Normalized charset
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function parse_charset($charset) {
	$normalized = strtolower($charset);

	return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
}
508
/**
 * Get the charset of a locale.
 *
 * Accepted locale formats:
 * ln			language
 * ln_CN			language / country
 * ln_CN.cs		language / country / charset
 * ln_CN.cs@mod		language / country / charset / modifier
 *
 * Resolution order: exact match in $this->locale_to_charset, explicit charset
 * part, the "euro" modifier, then the language (family) looked up in the
 * Windows or Unix table depending on TYPO3_OS. Falls back to 'iso-8859-1'.
 *
 * @param	string		Locale string
 * @return	string		Charset resolved for locale string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function get_locale_charset($locale) {
	$locale = strtolower($locale);

		// exact locale specific charset?
	if (isset($this->locale_to_charset[$locale])) return $this->locale_to_charset[$locale];

		// get modifier (optional part; explicit defaults avoid reading undefined offsets)
	$parts = explode('@',$locale);
	$locale = $parts[0];
	$modifier = isset($parts[1]) ? $parts[1] : '';

		// locale contains charset: use it
	$parts = explode('.',$locale);
	$locale = $parts[0];
	$charset = isset($parts[1]) ? $parts[1] : '';
	if ($charset) return $this->parse_charset($charset);

		// modifier is 'euro' (after charset check, because of xx.utf-8@euro)
	if ($modifier == 'euro') return 'iso-8859-15';

		// get language (the country part is not needed for the lookup)
	$parts = explode('_',$locale);
	$language = $parts[0];
	if (isset($this->lang_to_langfamily[$language])) $language = $this->lang_to_langfamily[$language];

		// pick the table for the platform; unknown languages yield no charset
	if (TYPO3_OS == 'WIN') {
		$cs = isset($this->lang_to_charset_windows[$language]) ? $this->lang_to_charset_windows[$language] : '';
	} else {
		$cs = isset($this->lang_to_charset_unix[$language]) ? $this->lang_to_charset_unix[$language] : '';
	}

	return $cs ? $cs : 'iso-8859-1';
}
549
550
551
552
553
554
555
556
557
558 /********************************************
559 *
560 * Charset Conversion functions
561 *
562 ********************************************/
563
/**
 * Convert from one charset to another charset.
 * If a PHP conversion library is configured in
 * $TYPO3_CONF_VARS['SYS']['t3lib_cs_convMethod'] ("mbstring", "iconv" or "recode")
 * it is tried first; when it is not used or reports failure, the conversion is
 * done by this class via UTF-8 as intermediate charset.
 *
 * @param	string		Input string
 * @param	string		From charset (the current charset of the string)
 * @param	string		To charset (the output charset wanted)
 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return	string		Converted string
 * @see convArray()
 */
function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) {
	if ($fromCS==$toCS) return $str;

		// PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
	if ($toCS=='utf-8' || !$useEntityForNoChar) {
		$method = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'];
		$converted = false;

		if ($method == 'mbstring') {
			$converted = mb_convert_encoding($str,$toCS,$fromCS);	// returns false for unsupported charsets
		} elseif ($method == 'iconv') {
			$converted = iconv($fromCS,$toCS.'//TRANSLIT',$str);
		} elseif ($method == 'recode') {
			$converted = recode_string($fromCS.'..'.$toCS,$str);
		}

		if (false !== $converted) return $converted;
			// otherwise: fallback to TYPO3 conversion
	}

		// TYPO3 conversion: local charset -> UTF-8 -> local charset
	if ($fromCS!='utf-8') {
		$str = $this->utf8_encode($str,$fromCS);
	}
	if ($toCS!='utf-8') {
		$str = $this->utf8_decode($str,$toCS,$useEntityForNoChar);
	}
	return $str;
}
602
/**
 * Convert all elements in ARRAY from one charset to another charset.
 * Nested arrays are processed recursively; every other value is passed to conv().
 * NOTICE: Array is passed by reference and modified in place!
 *
 * @param	array		Input array, possibly multidimensional
 * @param	string		From charset (the current charset of the string)
 * @param	string		To charset (the output charset wanted)
 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return	void
 * @see conv()
 */
function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0) {
	foreach($array as $key => $unused) {
			// Leaf value: convert it and go on with the next element
		if (!is_array($array[$key])) {
			$array[$key] = $this->conv($array[$key],$fromCS,$toCS,$useEntityForNoChar);
			continue;
		}

			// Sub-array: descend
		$this->convArray($array[$key],$fromCS,$toCS,$useEntityForNoChar);
	}
}
623
/**
 * Converts $str from $charset to UTF-8
 *
 * Bytes 0-127 are passed through unchanged (except for two-byte charsets, where
 * every character is read as a 16 bit big-endian value). Characters that are not
 * found in the conversion table are replaced by chr($this->noCharByteVal).
 *
 * @param	string		String in local charset to convert to UTF-8
 * @param	string		Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return	string		Output string, converted to UTF-8. NULL if the conversion table for the charset could not be initialized.
 */
function utf8_encode($str,$charset) {

	if ($charset === 'utf-8') return $str;

		// Charset is case-insensitive.
	if ($this->initCharset($charset)) {	// Parse conv. table if not already...
		$strLen = strlen($str);
		$outStr = '';
		$isTwoByteSet = isset($this->twoByteSets[$charset]);
		$isEucBasedSet = isset($this->eucBasedSets[$charset]);

		for ($a=0;$a<$strLen;$a++) {	// Traverse each char in string.
			$chr = substr($str,$a,1);
			$ord = ord($chr);

			if ($isTwoByteSet) {	// If the charset has two bytes per char
				$a++;
				$ord2 = ord(substr($str,$a,1));	// yields 0 if the string ends in the middle of a character
				$ord = ($ord << 8) | $ord2;	// assume big endian; must be OR - AND would always give 0 here

				if (isset($this->parsedCharsets[$charset]['local'][$ord])) {	// If the local char-number was found in parsed conv. table then we use that...
					$outStr.= $this->parsedCharsets[$charset]['local'][$ord];
				} else $outStr.= chr($this->noCharByteVal);	// No char exists
			} elseif ($ord>127) {	// Not plain ASCII: one or two bytes in the local charset
					// EUC-like charsets use two bytes above 127; we get both, advance the pointer and make $ord a 16bit int.
					// Shift-JIS is like EUC, but chars between 160 and 223 are single byte and must NOT consume a second byte.
				if ($isEucBasedSet && ($charset != 'shift_jis' || $ord<160 || $ord>223)) {
					$a++;
					$ord2 = ord(substr($str,$a,1));
					$ord = $ord*256+$ord2;
				}

				if (isset($this->parsedCharsets[$charset]['local'][$ord])) {	// If the local char-number was found in parsed conv. table then we use that...
					$outStr.= $this->parsedCharsets[$charset]['local'][$ord];
				} else $outStr.= chr($this->noCharByteVal);	// No char exists
			} else $outStr.= $chr;	// ... otherwise it's just ASCII 0-127 and one byte. Transparent
		}
		return $outStr;
	}
}
670
/**
 * Converts $str from UTF-8 to $charset
 *
 * ASCII bytes are copied unchanged. Each UTF-8 multibyte sequence is looked up
 * in the conversion table of the charset; unknown sequences become either a
 * numeric entity or chr($this->noCharByteVal).
 *
 * @param	string		String in UTF-8 to convert to local charset
 * @param	string		Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return	string		Output string, converted to local charset. NULL if the conversion table for the charset could not be initialized.
 */
function utf8_decode($str,$charset,$useEntityForNoChar=0) {

		// Charset is case-insensitive. Parse conv. table if not already...
	if (!$this->initCharset($charset)) return;

	$noChar = chr($this->noCharByteVal);
	$total = strlen($str);
	$result = '';
	$pos = 0;

	while ($pos < $total) {	// Traverse each char in UTF-8 string.
		$byte = substr($str,$pos,1);
		$code = ord($byte);

		if ($code <= 127) {
				// Plain ASCII 0-127, one byte. Transparent
			$result.= $byte;
		} elseif (!($code & 64)) {
				// A first byte must have the 7th bit set - otherwise we are in the MIDDLE of a MB sequence.
			$result.= $noChar;
		} else {
				// Collect the whole sequence: every further high bit set in the first byte announces one more byte.
			$sequence = $byte;
			for ($n=0; $n<8; $n++) {
				$code = $code << 1;
				if (!($code & 128)) break;
				$pos++;
				$sequence.= substr($str,$pos,1);
			}

			if (isset($this->parsedCharsets[$charset]['utf8'][$sequence])) {	// If the UTF-8 char-sequence is found then...
				$local = $this->parsedCharsets[$charset]['utf8'][$sequence];	// The local number
					// Local numbers above 255 are written as two bytes (16bit word assumed)
				$result.= ($local > 255) ? chr(($local >> 8) & 255).chr($local & 255) : chr($local);
			} elseif ($useEntityForNoChar) {	// Create num entity:
				$result.= '&#'.$this->utf8CharToUnumber($sequence,1).';';
			} else {
				$result.= $noChar;	// No char exists
			}
		}
		$pos++;
	}

	return $result;
}
715
/**
 * Converts all chars > 127 to numeric entities.
 * Every UTF-8 multibyte sequence becomes a hexadecimal entity (&#x...;);
 * a stray continuation byte becomes chr($this->noCharByteVal).
 *
 * @param	string		Input string
 * @return	string		Output string
 */
function utf8_to_entities($str) {
	$total = strlen($str);
	$result = '';

	for ($pos=0; $pos<$total; $pos++) {	// Traverse each char in UTF-8 string.
		$byte = substr($str,$pos,1);
		$code = ord($byte);

			// ASCII 0-127: one byte. Transparent
		if ($code <= 127) {
			$result.= $byte;
			continue;
		}

			// A first byte must have the 7th bit set - otherwise we are in the MIDDLE of a MB sequence.
		if (!($code & 64)) {
			$result.= chr($this->noCharByteVal);
			continue;
		}

			// Collect the sequence: every further high bit set in the first byte announces one more byte.
		$sequence = $byte;
		for ($n=0; $n<8; $n++) {
			$code = $code << 1;
			if (!($code & 128)) break;
			$pos++;
			$sequence.= substr($str,$pos,1);
		}

		$result.= '&#'.$this->utf8CharToUnumber($sequence,1).';';
	}

	return $result;
}
747
/**
 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
 * Entities that are not converted are left in the string unchanged.
 *
 * @param	string		Input string, UTF-8
 * @param	boolean		If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
 * @return	string		Output string
 */
function entities_to_utf8($str,$alsoStdHtmlEnt=0) {
	$trans_tbl = array();
	if ($alsoStdHtmlEnt) {
			// Map "&name;" => character. The characters are treated as iso-8859-1 below.
			// NOTE(review): newer PHP versions return this table in UTF-8 by default instead of iso-8859-1 - confirm against the PHP version in use.
		$trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES));
	}

		// Split on entities while keeping their inner part: even indexes hold plain text,
		// odd indexes hold what was found between "&" and ";".
	$parts = preg_split('/&([#[:alnum:]]*);/',$str,-1,PREG_SPLIT_DELIM_CAPTURE);
	foreach($parts as $k => $v) {
		if ($k%2) {
			if (substr($v,0,1)=='#') {	// Dec or hex entities:
				$marker = substr($v,1,1);
				if ($marker=='x' || $marker=='X') {	// the hex marker may be written in either case
					$parts[$k] = $this->UnumberToChar(hexdec(substr($v,2)));
				} else {
					$parts[$k] = $this->UnumberToChar(intval(substr($v,1)));
				}
			} elseif ($alsoStdHtmlEnt && !empty($trans_tbl['&'.$v.';'])) {	// Other entities:
				$parts[$k] = $this->utf8_encode($trans_tbl['&'.$v.';'],'iso-8859-1');
			} else {	// No conversion:
				$parts[$k] ='&'.$v.';';
			}
		}
	}

	return implode('',$parts);
}
780
/**
 * Converts all chars in the input UTF-8 string into integer numbers returned in an array
 * A stray continuation byte is reported as $this->noCharByteVal.
 *
 * @param	string		Input string, UTF-8
 * @param	boolean		If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
 * @param	boolean		If set, then instead of integer numbers the real UTF-8 char is returned.
 * @return	array		Output array with the char numbers
 */
function utf8_to_numberarray($str,$convEntities=0,$retChar=0) {
		// If entities must be registered as well...:
	if ($convEntities) {
		$str = $this->entities_to_utf8($str,1);
	}

		// Do conversion:
	$total = strlen($str);
	$chars = array();

	for ($pos=0; $pos<$total; $pos++) {	// Traverse each char in UTF-8 string.
		$byte = substr($str,$pos,1);
		$code = ord($byte);

			// ASCII 0-127: one byte. Transparent
		if ($code <= 127) {
			$chars[] = $retChar ? chr($code) : $code;
			continue;
		}

			// A first byte must have the 7th bit set - otherwise we are in the MIDDLE of a MB sequence.
		if (!($code & 64)) {
			$chars[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
			continue;
		}

			// Collect the sequence: every further high bit set in the first byte announces one more byte.
		$sequence = $byte;
		for ($n=0; $n<8; $n++) {
			$code = $code << 1;
			if (!($code & 128)) break;
			$pos++;
			$sequence.= substr($str,$pos,1);
		}

		$chars[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
	}

	return $chars;
}
819
/**
 * Converts a UNICODE number to a UTF-8 multibyte character
 * Algorithm based on script found at From: http://czyborra.com/utf/
 *
 * The bits of the number are spread across the bytes; the number of high bits
 * set in the lead byte announces the number of bytes in the sequence:
 *
 *  bytes | bits | representation
 *      1 |    7 | 0vvvvvvv
 *      2 |   11 | 110vvvvv 10vvvvvv
 *      3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
 *      4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * Numbers that need more than 31 bits yield chr($this->noCharByteVal).
 *
 * @param	integer		UNICODE integer
 * @return	string		UTF-8 multibyte character string
 * @see utf8CharToUnumber()
 */
function UnumberToChar($cbyte) {
		// One byte: plain 7 bit value
	if ($cbyte < 0x80) return chr($cbyte);

		// Find the number of continuation bytes and the marker bits of the lead byte:
	if ($cbyte < 0x800) {
		$trailing = 1;
		$leadMarker = 0xC0;
	} elseif ($cbyte < 0x10000) {
		$trailing = 2;
		$leadMarker = 0xE0;
	} elseif ($cbyte < 0x200000) {
		$trailing = 3;
		$leadMarker = 0xF0;
	} elseif ($cbyte < 0x4000000) {
		$trailing = 4;
		$leadMarker = 0xF8;
	} elseif ($cbyte < 0x80000000) {
		$trailing = 5;
		$leadMarker = 0xFC;
	} else {	// Cannot express a 32-bit character in UTF-8
		return chr($this->noCharByteVal);
	}

		// Lead byte carries the topmost bits, then each continuation byte carries 6 bits:
	$char = chr($leadMarker | ($cbyte >> (6*$trailing)));
	for ($shift=6*($trailing-1); $shift>=0; $shift-=6) {
		$char.= chr(0x80 | (($cbyte >> $shift) & 0x3F));
	}

	return $char;
}
874
/**
 * Converts a UTF-8 Multibyte character to a UNICODE number
 * Only the first character of the string is evaluated. If the first byte is not
 * the lead byte of a multibyte sequence its byte value is used as is.
 *
 * @param	string		UTF-8 multibyte character string
 * @param	boolean		If set, then a hex. number is returned (as string prefixed with "x").
 * @return	integer		UNICODE integer
 * @see UnumberToChar()
 */
function utf8CharToUnumber($str,$hex=0) {
	$lead = ord(substr($str,0,1));	// First char
	$int = $lead;	// Result for anything that is not a multibyte lead byte

	if (($lead & 192) == 192) {	// This verifies that it IS a multi byte string
		$shifted = $lead;
		$payload = '';
		$count = 0;	// Number of continuation bytes found

		while ($count < 8) {
			$shifted = $shifted << 1;
				// As long as the shifted lead byte has its 8th bit set there are still bytes in the sequence.
			if (!($shifted & 128)) break;
				// Each continuation byte contributes its lower 6 bits
			$payload.= substr('00000000'.decbin(ord(substr($str,$count+1,1))),-6);
			$count++;
		}

			// The lead byte contributes the bits that are left after its length marker
		$leadBits = substr('00000000'.decbin($lead),-(6-$count));
		$int = bindec($leadBits.$payload);
	}

	if ($hex) return 'x'.dechex($int);
	return $int;
}
902
903
904
905
906
907
908
909
910
911 /********************************************
912 *
913 * Init functions
914 *
915 ********************************************/
916
/**
 * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
 * This function is automatically called by the conversion functions
 *
 * The parsed table is stored in $this->parsedCharsets[$charset] with the keys
 * 'local' (local char number => UTF-8 char) and 'utf8' (UTF-8 char => local char number).
 * Only local char numbers above 127 are registered. A serialized copy of the table
 * is kept in typo3temp/cs/ and used instead of parsing when it can be read.
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param	string		The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return	integer		Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed.
 * @access private
 */
function initCharset($charset) {
		// Only process if the charset is not yet loaded (isset() avoids reading an undefined index on first use):
	if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) return 1;

		// Conversion table filename:
	$charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';

		// Stop if the conversion table is not found:
	if (!$charset || !t3lib_div::validPathStr($charsetConvTableFile) || !@is_file($charsetConvTableFile)) return false;

		// Cache file for charsets:
		// Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_'.$charset.'.tbl');
	$parsed = false;
	if ($cacheFile && @is_file($cacheFile)) {
			// NOTE(review): unserialize() trusts the content of the cache file - make sure typo3temp/cs/ is only writable by TYPO3 itself.
		$parsed = unserialize(t3lib_div::getUrl($cacheFile));
	}

		// No cache file, or one that could not be unserialized into an array: parse the table.
	if (!is_array($parsed)) {
			// Parse conversion table into lines:
		$lines = t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1);
			// Initialize the variable holding the conv. table:
		$parsed = array('local'=>array(),'utf8'=>array());
			// Matches lines of the "whitespaced" type, eg. "0x0A 0x000A #LINE FEED"
		$whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';

			// traverse the lines:
		$detectedType = '';
		foreach($lines as $value) {
			if (trim($value) && substr($value,0,1)!='#') {	// Comment line or blanks are ignored.

					// Detect type if not done yet: (Done on first real line)
					// The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
				if (!$detectedType) $detectedType = preg_match($whitespacedPattern,$value) ? 'whitespaced' : 'ms-token';

					// Lines that do not carry a mapping leave both values empty and are skipped by the ">127" check below.
				$hexbyte = '';
				$utf8 = '';
				if ($detectedType=='ms-token') {
					$tokens = preg_split('/=|:/',$value,3);
					$hexbyte = $tokens[0];
					$utf8 = isset($tokens[1]) ? $tokens[1] : '';
				} elseif ($detectedType=='whitespaced') {
					$regA = array();
					if (preg_match($whitespacedPattern,$value,$regA)) {
						$hexbyte = $regA[1];
						$utf8 = 'U+'.$regA[2];
					}
				}

				$decval = hexdec(trim($hexbyte));
				if ($decval>127) {
						// Strip the "U+" prefix and register the mapping in both directions
					$utf8decval = hexdec(substr(trim($utf8),2));
					$utf8Char = $this->UnumberToChar($utf8decval);
					$parsed['local'][$decval] = $utf8Char;
					$parsed['utf8'][$utf8Char] = $decval;
				}
			}
		}

		if ($cacheFile) {
			t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($parsed));
		}
	}

	$this->parsedCharsets[$charset] = $parsed;
	return 2;
}
979
980 /**
981 * This function initializes all UTF-8 character data tables.
982 *
983 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
984 *
985 * @param string Mode ("case", "ascii", ...)
986 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
987 * @access private
988 */
function initUnicodeData($mode=null) {
	// Builds the UTF-8 case folding table ($this->caseFolding['utf-8']) and the UTF-8 to ASCII
	// transliteration table ($this->toASCII['utf-8']) from the files in PATH_t3lib.'unidata/'.
	// For $mode 'case' / 'ascii' an already loaded or cached table is used if available.
	// Returns 1 (already loaded), 2 (read from cache), 3 (parsed and cached) or FALSE on error.

		// Cache files
	$cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
	$cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');

		// Only process if the tables are not yet loaded
	switch($mode) {
		case 'case':
			if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8'])) return 1;

				// Use cached version if possible (an unreadable cache file is ignored and rebuilt below)
			if ($cacheFileCase && @is_file($cacheFileCase)) {
				$cached = unserialize(t3lib_div::getUrl($cacheFileCase));
				if (is_array($cached)) {
					$this->caseFolding['utf-8'] = $cached;
					return 2;
				}
			}
		break;

		case 'ascii':
			if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8'])) return 1;

				// Use cached version if possible (an unreadable cache file is ignored and rebuilt below)
			if ($cacheFileASCII && @is_file($cacheFileASCII)) {
				$cached = unserialize(t3lib_div::getUrl($cacheFileASCII));
				if (is_array($cached)) {
					$this->toASCII['utf-8'] = $cached;
					return 2;
				}
			}
		break;
	}

		// Process main Unicode data file
	$unicodeDataFile = PATH_t3lib.'unidata/UnicodeData.txt';
	if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) return false;

	$fh = fopen($unicodeDataFile,'rb');
	if (!$fh) return false;

		// key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
		// note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
	$this->caseFolding['utf-8'] = array();
	$utf8CaseFolding =& $this->caseFolding['utf-8'];	// a shorthand
	$utf8CaseFolding['toUpper'] = array();
	$utf8CaseFolding['toLower'] = array();
	$utf8CaseFolding['toTitle'] = array();

	$decomposition = array();	// array of temp. decompositions
	$mark = array();	// array of chars that are marks (eg. composing accents)
	$number = array();	// array of chars that are numbers (eg. digits)
	$omit = array();	// array of chars to be omitted (eg. Russian hard sign)

	while (!feof($fh)) {
		$line = fgets($fh,4096);
		if ($line === false || trim($line) == '') continue;	// Nothing to read from an empty line

			// Has a lot of info; only the fields used below are picked (missing fields read as '')
		$fields = array_pad(explode(';', rtrim($line)), 15, '');
		$char = $fields[0];
		$name = $fields[1];
		$cat = $fields[2];
		$decomp = $fields[5];
		$num = $fields[8];
		$upper = $fields[12];
		$lower = $fields[13];
		$title = $fields[14];

		$ord = hexdec($char);
		if ($ord > 0xFFFF) break;	// only process the BMP

		$utf8_char = $this->UnumberToChar($ord);

		if ($upper) $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
		if ($lower) $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
			// store "title" only when different from "upper" (only a few)
		if ($title && $title != $upper) $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));

		switch (substr($cat,0,1)) {
			case 'M':	// mark (accent, umlaut, ...)
				$mark["U+$char"] = 1;
			break;

			case 'N':	// numeric value
				if ($ord > 0x80 && $num != '') $number["U+$char"] = $num;
			break;
		}

			// accented Latin letters without "official" decomposition
		$match = array();
		if (!$decomp && preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/',$name,$match)) {
			$c = ord($match[2]);
			if ($match[1] == 'SMALL') $c += 32;	// lower case ASCII letter

			$decomposition["U+$char"] = array(dechex($c));
			continue;
		}

		$match = array();
		if (preg_match('/(<.*>)? *(.+)/',$decomp,$match)) {
			switch($match[1]) {
				case '<circle>':	// add parenthesis as circle replacement, eg (1)
					$match[2] = '0028 '.$match[2].' 0029';
				break;

				case '<square>':	// add square brackets as square replacement, eg [1]
					$match[2] = '005B '.$match[2].' 005D';
				break;

				case '<compat>':	// ignore multi char decompositions that start with a space
					if (substr($match[2],0,5) == '0020 ') continue 2;
				break;

					// ignore Arabic and vertical layout presentation decomposition
				case '<initial>':
				case '<medial>':
				case '<final>':
				case '<isolated>':
				case '<vertical>':
					continue 2;
			}
			$decomposition["U+$char"] = explode(' ',$match[2]);
		}
	}
	fclose($fh);

		// Process additional Unicode data for casing (allow folded characters to expand into a sequence)
	$specialCasingFile = PATH_t3lib.'unidata/SpecialCasing.txt';
	if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
		$fh = fopen($specialCasingFile,'rb');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh,4096);
				if ($line === false || substr($line,0,1) == '#' || trim($line) == '') continue;

				list($char,$lower,$title,$upper,$cond) = array_pad(t3lib_div::trimExplode(';', $line), 5, '');
					// Only unconditional mappings are used: the condition field is empty or already the trailing comment
				if ($cond != '' && substr($cond,0,1) != '#') continue;

				$utf8_char = $this->UnumberToChar(hexdec($char));
				if ($char != $lower) {
					$arr = explode(' ',$lower);
					foreach ($arr as $k => $hexCode) $arr[$k] = $this->UnumberToChar(hexdec($hexCode));
					$utf8CaseFolding['toLower'][$utf8_char] = implode('',$arr);
				}
				if ($char != $title && $title != $upper) {
					$arr = explode(' ',$title);
					foreach ($arr as $k => $hexCode) $arr[$k] = $this->UnumberToChar(hexdec($hexCode));
					$utf8CaseFolding['toTitle'][$utf8_char] = implode('',$arr);
				}
				if ($char != $upper) {
					$arr = explode(' ',$upper);
					foreach ($arr as $k => $hexCode) $arr[$k] = $this->UnumberToChar(hexdec($hexCode));
					$utf8CaseFolding['toUpper'][$utf8_char] = implode('',$arr);
				}
			}
			fclose($fh);
		}
	}

		// Process custom decompositions
	$customTranslitFile = PATH_t3lib.'unidata/Translit.txt';
	if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
		$fh = fopen($customTranslitFile,'rb');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh,4096);
				if ($line === false || substr($line,0,1) == '#' || trim($line) == '') continue;

				list($char,$translit) = array_pad(t3lib_div::trimExplode(';', $line), 2, '');
				if (!$translit) $omit["U+$char"] = 1;	// An empty transliteration means: drop the character
				$decomposition["U+$char"] = explode(' ', $translit);
			}
			fclose($fh);
		}
	}

		// Decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
	foreach($decomposition as $from => $to) {
		$code_decomp = array();

		while ($code_value = array_shift($to)) {
			if (isset($decomposition["U+$code_value"])) {	// do recursive decomposition
				foreach(array_reverse($decomposition["U+$code_value"]) as $cv) {
					array_unshift($to, $cv);
				}
			} elseif (!isset($mark["U+$code_value"])) {	// remove mark
				array_push($code_decomp, $code_value);
			}
		}
		if (count($code_decomp) || isset($omit[$from])) {
			$decomposition[$from] = $code_decomp;
		} else {
			unset($decomposition[$from]);
		}
	}

		// Create ascii only mapping
	$this->toASCII['utf-8'] = array();
	$ascii =& $this->toASCII['utf-8'];

	foreach($decomposition as $from => $to) {
		$code_decomp = array();
		while ($code_value = array_shift($to)) {
			$ord = hexdec($code_value);
			if ($ord > 127) continue 2;	// skip decompositions containing non-ASCII chars
			array_push($code_decomp,chr($ord));
		}
			// $from has the form "U+XXXX"; the prefix is stripped before the hex conversion
		$ascii[$this->UnumberToChar(hexdec(substr($from,2)))] = implode('',$code_decomp);
	}

		// Add numeric decompositions
	foreach($number as $from => $to) {
		$utf8_char = $this->UnumberToChar(hexdec(substr($from,2)));
		if (!isset($ascii[$utf8_char])) {
			$ascii[$utf8_char] = $to;
		}
	}

	if ($cacheFileCase) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileCase,serialize($utf8CaseFolding));
	}

	if ($cacheFileASCII) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileASCII,serialize($ascii));
	}

	return 3;
}
1205
1206 /**
1207 * This function initializes the folding table for a charset other than UTF-8.
1208 * This function is automatically called by the case folding functions.
1209 *
1210 * @param string Charset for which to initialize case folding.
1211 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
1212 * @access private
1213 */
function initCaseFolding($charset) {
	// Derives the case folding table of $charset ($this->caseFolding[$charset]) from the UTF-8 case folding table.
	// Returns 1 (already loaded), 2 (read from cache), 3 (built and cached) or FALSE on error.

		// Only process if the case table is not yet loaded:
	if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset])) return 1;

		// Use cached version if possible (an unreadable cache file is ignored and rebuilt below)
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$cached = unserialize(t3lib_div::getUrl($cacheFile));
		if (is_array($cached)) {
			$this->caseFolding[$charset] = $cached;
			return 2;
		}
	}

		// Init UTF-8 conversion for this charset
	if (!$this->initCharset($charset)) {
		return false;
	}

		// UTF-8 case folding is used as the base conversion table
	if (!$this->initUnicodeData('case')) {
		return false;
	}

	$this->caseFolding[$charset] = array('toUpper'=>array(), 'toLower'=>array(), 'toTitle'=>array());
	$directions = array('toUpper','toLower','toTitle');
	$nochar = chr($this->noCharByteVal);

	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
			// Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);

		foreach ($directions as $direction) {
				// Characters without a mapping in the UTF-8 table are left alone
			if (!isset($this->caseFolding['utf-8'][$direction][$utf8])) continue;

			$cc = $this->utf8_decode($this->caseFolding['utf-8'][$direction][$utf8], $charset);
				// Only keep mappings whose target can be represented in the charset
			if ($cc != '' && $cc != $nochar) $this->caseFolding[$charset][$direction][$c] = $cc;
		}
	}

		// Add the ASCII case table
	for ($i=ord('a'); $i<=ord('z'); $i++) {
		$this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i-32);
	}
	for ($i=ord('A'); $i<=ord('Z'); $i++) {
		$this->caseFolding[$charset]['toLower'][chr($i)] = chr($i+32);
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->caseFolding[$charset]));
	}

	return 3;
}
1267
1268 /**
1269 * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
1270 * This function is automatically called by the ASCII transliteration functions.
1271 *
1272 * @param string Charset for which to initialize conversion.
1273 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
1274 * @access private
1275 */
function initToASCII($charset) {
	// Derives the to-ASCII transliteration table of $charset ($this->toASCII[$charset]) from the UTF-8 table.
	// Returns 1 (already loaded), 2 (read from cache), 3 (built and cached) or FALSE on error.

		// Only process if the table is not yet loaded:
	if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) return 1;

		// Use cached version if possible (an unreadable cache file is ignored and rebuilt below)
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$cached = unserialize(t3lib_div::getUrl($cacheFile));
		if (is_array($cached)) {
			$this->toASCII[$charset] = $cached;
			return 2;
		}
	}

		// Init UTF-8 conversion for this charset
	if (!$this->initCharset($charset)) {
		return false;
	}

		// UTF-8/ASCII transliteration is used as the base conversion table
	if (!$this->initUnicodeData('ascii')) {
		return false;
	}

		// Always start from an array, so that a charset without any mapping is still regarded as loaded
		// (and is not cached as NULL):
	$this->toASCII[$charset] = array();

	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
		if (isset($this->toASCII['utf-8'][$utf8])) {
				// Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
			$c = $this->utf8_decode($utf8, $charset);
			$this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
		}
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->toASCII[$charset]));
	}

	return 3;
}
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329 /********************************************
1330 *
1331 * String operation functions
1332 *
1333 ********************************************/
1334
1335 /**
1336 * Returns a part of a string.
1337 * Unit-tested by Kasper (single byte charsets only)
1338 *
1339 * @param string The character set
1340 * @param string Character string
1341 * @param integer Start position (character position)
1342 * @param integer Length (in characters)
1343 * @return string The substring
1344 * @see substr(), mb_substr()
1345 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1346 */
function substr($charset,$string,$start,$len=null) {
	// Returns the part of $string (encoded in $charset) that starts at character position $start
	// and is $len characters long (up to the end of the string if $len is omitted).
	if ($len===0) return '';

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
			// cannot omit $len, when specifying charset
		if ($len==null) {
			$enc = mb_internal_encoding();	// save internal encoding
			mb_internal_encoding($charset);	// the charset of the string, not a fixed 'utf-8'
			$str = mb_substr($string,$start);
			mb_internal_encoding($enc);	// restore internal encoding

			return $str;
		}
		return mb_substr($string,$start,$len,$charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_substr($string,$start,$len);
	} elseif ($this->eucBasedSets[$charset]) {
		return $this->euc_substr($string,$start,$charset,$len);
	} elseif ($this->twoByteSets[$charset]) {
			// An omitted length must not be multiplied (NULL*2 is 0 and would return an empty string)
		return $len === null ? substr($string,$start*2) : substr($string,$start*2,$len*2);
	} elseif ($this->fourByteSets[$charset]) {
		return $len === null ? substr($string,$start*4) : substr($string,$start*4,$len*4);
	}

		// treat everything else as single-byte encoding
	return $len === null ? substr($string,$start) : substr($string,$start,$len);
}
1374
1375 /**
1376 * Counts the number of characters.
1377 * Unit-tested by Kasper (single byte charsets only)
1378 *
1379 * @param string The character set
1380 * @param string Character string
1381 * @return integer The number of characters
1382 * @see strlen()
1383 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1384 */
function strlen($charset,$string) {
	// Counts the characters of $string, which is encoded in $charset.

		// mbstring handles the charset itself
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') return mb_strlen($string,$charset);

		// Variable width encodings need to be scanned
	if ($charset == 'utf-8') return $this->utf8_strlen($string);
	if ($this->eucBasedSets[$charset]) return $this->euc_strlen($string,$charset);

		// Fixed width encodings: divide the byte count by the character width
	$byteCount = strlen($string);
	if ($this->twoByteSets[$charset]) return $byteCount/2;
	if ($this->fourByteSets[$charset]) return $byteCount/4;

		// treat everything else as single-byte encoding
	return $byteCount;
}
1400
1401 /**
1402 * Truncates a string and pre-/appends a string.
1403 * Unit tested by Kasper
1404 *
1405 * @param string The character set
1406 * @param string Character string
1407 * @param integer Length (in characters)
1408 * @param string Crop signifier
1409 * @return string The shortened string
1410 * @see substr(), mb_strimwidth()
1411 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1412 */
function crop($charset,$string,$len,$crop='') {
	// Shortens $string (encoded in $charset) to $len characters and appends $crop.
	// A negative $len keeps the last abs($len) characters and prepends $crop instead.
	// The string is returned unchanged if it is not longer than abs($len) characters.
	if (intval($len) == 0) return $string;

		// Find the byte position to cut at (FALSE: $len is outside the actual string length)
	if ($charset == 'utf-8') {
		$i = $this->utf8_char2byte_pos($string,$len);
	} elseif ($this->eucBasedSets[$charset]) {
		$i = $this->euc_char2byte_pos($string,$len,$charset);
	} else {
			// Single-byte encoding: character position equals byte position
		if ($len > 0) {
			$i = (int)$len;
		} else {
			$i = (int)(strlen($string)+$len);
			if ($i<=0) $i = false;
		}
	}

	if ($i === false) return $string;

	if ($len > 0) {
			// Only crop if there is at least one byte at the cut position
		if (isset($string[$i])) return substr($string,0,$i).$crop;
	} else {
			// Only crop if there is at least one byte in front of the cut position
		if ($i > 0 && isset($string[$i-1])) return $crop.substr($string,$i);
	}

	return $string;
}
1455
1456 /**
1457 * Cuts a string short at a given byte length.
1458 *
1459 * @param string The character set
1460 * @param string Character string
1461 * @param integer The byte length
1462 * @return string The shortened string
1463 * @see mb_strcut()
1464 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1465 */
function strtrunc($charset,$string,$len) {
	// Cuts $string (encoded in $charset) at a maximum of $len bytes without splitting a multibyte character.
	if ($len <= 0) return '';

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strcut($string,0,$len,$charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_strtrunc($string,$len);
	} elseif ($this->eucBasedSets[$charset]) {
			// Argument order of euc_strtrunc() is ($str,$len,$charset); the length must be passed on
		return $this->euc_strtrunc($string,$len,$charset);
	} elseif ($this->twoByteSets[$charset]) {
		if ($len % 2) $len--;	// don't cut at odd positions
	} elseif ($this->fourByteSets[$charset]) {
		$len -= $len % 4;	// realign to position dividable by four
	}

		// treat everything else as single-byte encoding
	return substr($string,0,$len);
}
1484
1485 /**
1486 * Translates all characters of a string into their respective case values.
1487 * Unlike strtolower() and strtoupper() this method is locale independent.
1488 * Note that the string length may change!
1489 * eg. lower case German ß (sharp S) becomes upper case "SS"
1490 * Unit-tested by Kasper
1491 * Real case folding is language dependent, this method ignores this fact.
1492 *
1493 * @param string Character set of string
1494 * @param string Input string to convert case for
1495 * @param string Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
1496 * @return string The converted string
1497 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1498 * @see strtolower(), strtoupper()
1499 */
function conv_case($charset,$string,$case) {
	// Converts $string (encoded in $charset) to lower case if $case is 'toLower', otherwise to upper case.

		// mb_strtolower()/mb_strtoupper() are available from PHP 4.3.0
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring' && version_compare(phpversion(),'4.3.0','>=')) {
		if ($case == 'toLower') {
			return mb_strtolower($string,$charset);
		}
		return mb_strtoupper($string,$charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string,'case',$case);
	} elseif (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string,$charset,'case',$case);
	}

		// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string,$charset,'case',$case);
}
1518
1519 /**
1520 * Converts special chars (like æøå, umlauts etc) to ascii equivalents (usually double-bytes, like æ => ae etc.)
1521 *
1522 * @param string Character set of string
1523 * @param string Input string to convert
1524 * @return string The converted string
1525 */
function specCharsToASCII($charset,$string) {
	// Transliterates $string (encoded in $charset) to ASCII by delegating to the mapper for the encoding family.

		// UTF-8 has its own mapper
	if ($charset == 'utf-8') return $this->utf8_char_mapping($string,'ascii');

		// EUC based multibyte charsets
	if (isset($this->eucBasedSets[$charset])) return $this->euc_char_mapping($string,$charset,'ascii');

		// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string,$charset,'ascii');
}
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550 /********************************************
1551 *
1552 * Internal string operation functions
1553 *
1554 ********************************************/
1555
1556 /**
1557 * Maps all characters of a string in a single byte charset.
1558 *
1559 * @param string the string
1560 * @param string the charset
1561 * @param string mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
1562 * @param string 'case': conversion 'toLower' or 'toUpper'
1563 * @return string the converted string
1564 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1565 */
function sb_char_mapping($str,$charset,$mode,$opt='') {
	// Replaces every byte of $str that has an entry in the selected table:
	// mode 'case' uses $this->caseFolding[$charset][$opt], mode 'ascii' uses $this->toASCII[$charset].
	// The string is returned unchanged for unknown modes or if the table cannot be initialized.
	switch($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) return $str;	// do nothing
			$map =& $this->caseFolding[$charset][$opt];
		break;

		case 'ascii':
			if (!$this->initToASCII($charset)) return $str;	// do nothing
			$map =& $this->toASCII[$charset];
		break;

		default:
			return $str;
	}

	$out = '';
	$size = strlen($str);
	for ($i=0; $i<$size; $i++) {
		$c = $str[$i];
		$out .= isset($map[$c]) ? $map[$c] : $c;
	}

	return $out;
}
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604 /********************************************
1605 *
1606 * Internal UTF-8 string operation functions
1607 *
1608 ********************************************/
1609
1610 /**
1611 * Returns a part of a UTF-8 string.
1612 * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
1613 *
1614 * @param string UTF-8 string
1615 * @param integer Start position (character position)
1616 * @param integer Length (in characters)
1617 * @return string The substring
1618 * @see substr()
1619 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1620 */
function utf8_substr($str,$start,$len=null) {
	// Returns the part of the UTF-8 string $str starting at character position $start, $len characters long.
	// Returns FALSE if a positive $start is outside the string.
	if ($len !== null && !strcmp($len,'0')) return '';

	$byte_start = $this->utf8_char2byte_pos($str,$start);
	if ($byte_start === false) {
		if ($start > 0) {
			return false;	// $start outside string length
		}
			// A negative $start reaching beyond the beginning starts at the first byte
		$byte_start = 0;
	}

	$str = substr($str,$byte_start);

	if ($len != null) {
		$byte_end = $this->utf8_char2byte_pos($str,$len);
		if ($byte_end === false) {	// $len outside actual string length
			return $len<0 ? '' : $str;	// When length is less than zero and exceeds, then we return blank string.
		}
		return substr($str,0,$byte_end);
	}

	return $str;
}
1644
1645 /**
1646 * Counts the number of characters of a string in UTF-8.
1647 * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
1648 *
1649 * @param string UTF-8 multibyte character string
1650 * @return integer The number of characters
1651 * @see strlen()
1652 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1653 */
function utf8_strlen($str) {
	// Counts the characters of a UTF-8 string: every byte that is not a
	// continuation byte (10xxxxxx) starts a new character.
	$n = 0;
	$size = strlen($str);
	for ($i=0; $i<$size; $i++) {
		if ((ord($str[$i]) & 0xC0) != 0x80) {	// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
			$n++;
		}
	}
	return $n;
}
1665
1666 /**
1667 * Truncates a string in UTF-8 short at a given byte length.
1668 *
1669 * @param string UTF-8 multibyte character string
1670 * @param integer the byte length
1671 * @return string the shortened string
1672 * @see mb_strcut()
1673 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1674 */
function utf8_strtrunc($str,$len) {
	// Cuts the UTF-8 string $str at a maximum of $len bytes without splitting a multibyte character.
	$len = intval($len);
	if ($len <= 0) return '';
	if ($len > strlen($str)) return $str;	// nothing to cut

	$i = $len-1;
	if (ord($str[$i]) & 0x80) {	// part of a multibyte sequence
			// Walk back over continuation bytes (10xxxxxx) to find the first byte
		while ($i > 0 && (ord($str[$i]) & 0xC0) == 0x80) {
			$i--;
		}
		$lead = ord($str[$i]);

			// Sanity check: nothing but continuation bytes up to the cut position
		if (($lead & 0xC0) == 0x80) return '';

		if (($lead & 0xC0) == 0xC0) {
				// The number of leading set bits is the number of bytes of the character
			for ($bc=0; $lead & 0x80; $lead = $lead << 1) $bc++;
				// Drop the character if it does not fit completely (also applies to a character starting at byte 0)
			if ($bc+$i > $len) return substr($str,0,$i);
		}
			// fallthru: multibyte char fits into length
	}
	return substr($str,0,$len);
}
1686
1687 /**
1688 * Find position of first occurrence of a string, both arguments are in UTF-8.
1689 *
1690 * @param string UTF-8 string to search in
1691 * @param string UTF-8 string to search for
1692 * @param integer Position to start the search
1693 * @return integer The character position
1694 * @see strpos()
1695 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1696 */
function utf8_strpos($haystack,$needle,$offset=0) {
	// Returns the character position of the first occurrence of $needle in $haystack (both UTF-8),
	// searching from character position $offset. Returns FALSE if not found.
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
			// The third argument of mb_strpos() is the offset, the encoding comes fourth
		return mb_strpos($haystack,$needle,$offset,'utf-8');
	}

	$byte_offset = $this->utf8_char2byte_pos($haystack,$offset);
	if ($byte_offset === false) return false;	// offset beyond string length

	$byte_pos = strpos($haystack,$needle,$byte_offset);
	if ($byte_pos === false) return false;	// needle not found

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
1710
1711 /**
1712 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
1713 *
1714 * @param string UTF-8 string to search in
1715 * @param string UTF-8 character to search for (single character)
1716 * @return integer The character position
1717 * @see strrpos()
1718 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1719 */
function utf8_strrpos($haystack,$needle) {
	// Returns the character position of the last occurrence of $needle in $haystack (both UTF-8).
	// Returns FALSE if not found.
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
			// The position of the encoding argument of mb_strrpos() differs between PHP versions,
			// so the internal encoding is switched temporarily instead of passing it along.
		$enc = mb_internal_encoding();	// save internal encoding
		mb_internal_encoding('utf-8');
		$pos = mb_strrpos($haystack,$needle);
		mb_internal_encoding($enc);	// restore internal encoding

		return $pos;
	}

	$byte_pos = strrpos($haystack,$needle);
	if ($byte_pos === false) return false;	// needle not found

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
1730
1731 /**
1732 * Translates a character position into an 'absolute' byte position.
1733 * Unit tested by Kasper.
1734 *
1735 * @param string UTF-8 string
1736 * @param integer Character position (negative values start from the end)
1737 * @return integer Byte position
1738 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1739 */
function utf8_char2byte_pos($str,$pos) {
	// Translates the character position $pos of the UTF-8 string $str into a byte position.
	// Negative positions are counted from the end of the string.
	// Returns FALSE if the position is at or beyond the end (for negative values: at or beyond the start).
	$n = 0;	// number of characters found
	$p = abs($pos);	// number of characters wanted
	$size = strlen($str);

	if ($pos >= 0) {
		$i = 0;
		$d = 1;
	} else {
		$i = $size-1;
		$d = -1;
	}

		// The bounds are checked explicitly: reading $str[-1] does not yield an empty value on all PHP versions
	for ( ; $i>=0 && $i<$size && $n<$p; $i+=$d) {
		if ((ord($str[$i]) & 0xC0) != 0x80) {	// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
			$n++;
		}
	}
	if ($i<0 || $i>=$size) return false;	// offset beyond string length

	if ($pos >= 0) {
			// skip trailing multi-byte data bytes
		while ($i<$size && (ord($str[$i]) & 0xC0) == 0x80) { $i++; }
	} else {
			// correct offset: the loop stepped one byte past the first byte of the wanted character
		$i++;
	}

	return $i;
}
1771
1772 /**
1773 * Translates an 'absolute' byte position into a character position.
1774 * Unit tested by Kasper.
1775 *
1776 * @param string UTF-8 string
1777 * @param integer byte position
1778 * @return integer character position
1779 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1780 */
function utf8_byte2char_pos($str,$pos) {
	// Translates the byte position $pos of the UTF-8 string $str into a character position.
	// Returns FALSE if $pos is not a byte position inside the string.
	if ($pos < 0 || $pos >= strlen($str)) return false;	// offset beyond string length

		// Byte 0 is not inspected by the loop, so the count of character starts in 1..$pos
		// is the zero based index of the character at $pos.
	$n = 0;	// number of characters
	for ($i=$pos; $i>0; $i--) {
		if ((ord($str[$i]) & 0xC0) != 0x80) {	// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
			$n++;
		}
	}

	return $n;
}
1794
1795 /**
1796 * Maps all characters of an UTF-8 string.
1797 *
1798 * @param string UTF-8 string
1799 * @param string mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
1800 * @param string 'case': conversion 'toLower' or 'toUpper'
1801 * @return string the converted string
1802 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1803 */
function utf8_char_mapping($str,$mode,$opt='') {
	// Replaces every character of the UTF-8 string $str that has an entry in the selected table:
	// mode 'case' uses $this->caseFolding['utf-8'][$opt], mode 'ascii' uses $this->toASCII['utf-8'].
	// The string is returned unchanged for unknown modes or if the tables cannot be initialized.
	if (!$this->initUnicodeData($mode)) return $str;	// do nothing

	switch($mode) {
		case 'case':
			$map =& $this->caseFolding['utf-8'][$opt];
		break;

		case 'ascii':
			$map =& $this->toASCII['utf-8'];
		break;

		default:
			return $str;
	}

	$out = '';
	$size = strlen($str);
	for ($i=0; $i<$size; $i++) {
		$c = ord($str[$i]);
		if (($c & 0xC0) == 0xC0) {	// multi-byte starting byte (11xxxxxx)
			for ($bc=0; $c & 0x80; $c = $c << 1) { $bc++; }	// calculate number of bytes
			$mbc = substr($str,$i,$bc);
			$i += $bc-1;
		} else {
				// single-byte (0xxxxxxx); a stray continuation byte is passed through as it is
				// instead of repeating the previous character
			$mbc = $str[$i];
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858 /********************************************
1859 *
1860 * Internal EUC string operation functions
1861 *
1862 * Extended Unix Code:
1863 * ASCII compatible 7bit single bytes chars
1864 * 8bit two byte chars
1865 *
1866 * Shift-JIS is treated as a special case.
1867 *
1868 ********************************************/
1869
1870 /**
1871 * Cuts a string in the EUC charset family short at a given byte length.
1872 *
1873 * @param string EUC multibyte character string
1874 * @param integer the byte length
1875 * @param string the charset
1876 * @return string the shortened string
1877 * @see mb_strcut()
1878 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1879 */
function euc_strtrunc($str,$len,$charset) {
	// Cuts a string of the EUC charset family (or Shift-JIS) at a maximum of $len bytes
	// without splitting a double-byte character.
	if ($len <= 0) return '';
	if ($len >= strlen($str)) return $str;	// string not longer than supplied length

	$sjis = ($charset == 'shift_jis');
		// Step through the characters until the cut position is reached or passed
	for ($i=0; $i<$len; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++;	// advance a double-byte char
		} else {
			if ($c >= 0x80) $i++;	// advance a double-byte char
		}
	}

		// $i is either $len (clean cut) or $len+1 (the cut position is the second byte of a double-byte char)
	if ($i>$len) {
		return substr($str,0,$len-1);	// we ended on a first byte
	}
	return substr($str,0,$len);
}
1898
1899 /**
1900 * Returns a part of a string in the EUC charset family.
1901 *
1902 * @param string EUC multibyte character string
1903 * @param integer start position (character position)
1904 * @param string the charset
1905 * @param integer length (in characters)
1906 * @return string the substring, or FALSE if the start position lies outside the string
1907 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1908 */
function euc_substr($str,$start,$charset,$len=null) {
		// Returns the part of $str that begins at character position $start and is
		// $len characters long. Both values are translated into byte offsets by
		// euc_char2byte_pos(). Returns FALSE if $start lies outside the string.
	$byte_start = $this->euc_char2byte_pos($str,$start,$charset);
	if ($byte_start === false) return false;	// $start outside string length

	$str = substr($str,$byte_start);

		// No length given: return everything from $start on. Only NULL, '' and FALSE count
		// as "not given"; the former loose "!= null" test also matched 0, so a requested
		// length of zero wrongly returned the whole rest of the string.
	if ($len === null || $len === '' || $len === false) return $str;

	$byte_end = $this->euc_char2byte_pos($str,$len,$charset);
	if ($byte_end === false)	// $len outside actual string length
		return $str;

	return substr($str,0,$byte_end);
}
1924
1925 /**
1926 * Counts the number of characters of a string in the EUC charset family.
1927 *
1928 * @param string EUC multibyte character string
1929 * @param string the charset
1930 * @return integer the number of characters
1931 * @see strlen()
1932 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1933 */
function euc_strlen($str,$charset) {
		// Counts characters, not bytes: a byte recognised as the first byte of a
		// double-byte character makes the scan skip the byte that follows it.
	$sjis = ($charset == 'shift_jis');
	$size = strlen($str);	// explicit bound instead of reading past the end of the string
	$n=0;
	for ($i=0; $i<$size; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
				// Shift-JIS: bytes 0xA0-0xDF are treated as single-byte characters
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++;	// advance a double-byte char
		}
		else {
			if ($c >= 0x80) $i++;	// advance a double-byte char
		}

		$n++;
	}

	return $n;
}
1951
1952 /**
1953 * Translates a character position into an 'absolute' byte position.
1954 *
1955 * @param string EUC multibyte character string
1956 * @param integer character position (negative values start from the end)
1957 * @param string the charset
1958 * @return mixed byte position (integer), or FALSE if the character position lies outside the string
1959 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1960 */
function euc_char2byte_pos($str,$pos,$charset) {
		// Translates the character position $pos into the byte offset at which that
		// character starts. A negative $pos counts characters from the end of the string.
		// Returns FALSE if the position lies outside the string (this includes the
		// position directly behind the last character).
	$sjis = ($charset == 'shift_jis');
	$size = strlen($str);

	if ($pos < 0) {
			// The string is always segmented from the front: the second byte of a Shift-JIS
			// character may look like a single-byte character, so scanning backwards cannot
			// find character boundaries reliably. Count all characters and turn the negative
			// position into the equivalent position from the start.
		$total = 0;
		for ($i=0; $i<$size; $i++) {
			$c = ord($str[$i]);
			if ($sjis) {
				if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++;	// advance a double-byte char
			}
			else {
				if ($c >= 0x80) $i++;	// advance a double-byte char
			}
			$total++;
		}

		$pos += $total;
		if ($pos < 0) return false;	// more characters requested than the string has
	}

	$n = 0;	// number of characters seen
	for ($i=0; $i<$size && $n<$pos; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++;	// advance a double-byte char
		}
		else {
			if ($c >= 0x80) $i++;	// advance a double-byte char
		}

		$n++;
	}
	if ($i >= $size) return false;	// offset beyond string length

	return $i;
}
1991
1992 /**
1993 * Maps all characters of a string in the EUC charset family.
1994 *
1995 * @param string EUC multibyte character string
1996 * @param string the charset
1997 * @param string mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
1998 * @param string 'case': conversion 'toLower' or 'toUpper'
1999 * @return string the converted string
2000 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
2001 */
function euc_char_mapping($str,$charset,$mode,$opt='') {
		// Replaces every character of $str that has an entry in the selected mapping
		// table; characters without an entry are copied unchanged.
		// $mode 'case' uses $this->caseFolding[$charset][$opt], $mode 'ascii' uses
		// $this->toASCII[$charset]. Any other $mode returns $str untouched.
	switch($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) return $str;	// do nothing
				// Check first: taking a reference to a missing entry (e.g. for the default
				// $opt '') would silently create an empty entry in the table.
			if (!isset($this->caseFolding[$charset][$opt])) return $str;	// unknown conversion, nothing to map
			$map =& $this->caseFolding[$charset][$opt];
			break;

		case 'ascii':
			if (!$this->initToASCII($charset)) return $str;	// do nothing
			if (!isset($this->toASCII[$charset])) return $str;	// no table, nothing to map
			$map =& $this->toASCII[$charset];
			break;

		default:
			return $str;
	}

	$sjis = ($charset == 'shift_jis');
	$size = strlen($str);	// explicit bound instead of reading past the end of the string
	$out = '';
	for($i=0; $i<$size; $i++) {
		$mbc = $str[$i];
		$c = ord($mbc);

		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) {	// a double-byte char
				$mbc = substr($str,$i,2);
				$i++;
			}
		}
		else {
			if ($c >= 0x80) {	// a double-byte char
				$mbc = substr($str,$i,2);
				$i++;
			}
		}

		if (isset($map[$mbc])) {
			$out .= $map[$mbc];
		} else {
			$out .= $mbc;
		}
	}

	return $out;
}
2046
2047 }
2048
// Includes the file configured for this script under $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS'], if any.
// !empty() keeps the former truthiness test but does not raise a notice when nothing is configured.
if (defined('TYPO3_MODE') && !empty($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php'])) {
	include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
}
2052 ?>