Fix utf8_encode for shift_jis (problem with chars between 160 and 223)
[Packages/TYPO3.CMS.git] / t3lib / class.t3lib_cs.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2003-2004 Kasper Skaarhoj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the Typo3 project. The Typo3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 *
17 * This script is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * This copyright notice MUST APPEAR in all copies of the script!
23 ***************************************************************/
24 /**
25 * Class for conversion between charsets.
26 *
27 * $Id$
28 *
29 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
30 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
31 */
32 /**
33 * [CLASS/FUNCTION INDEX of SCRIPT]
34 *
35 *
36 *
37 * 136: class t3lib_cs
38 * 502: function parse_charset($charset)
39 * 521: function get_locale_charset($locale)
40 *
41 * SECTION: Charset Conversion functions
42 * 574: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
43 * 614: function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0)
44 * 631: function utf8_encode($str,$charset)
45 * 678: function utf8_decode($str,$charset,$useEntityForNoChar=0)
46 * 721: function utf8_to_entities($str)
47 * 754: function entities_to_utf8($str,$alsoStdHtmlEnt=0)
48 * 788: function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
49 * 838: function UnumberToChar($cbyte)
50 * 883: function utf8CharToUnumber($str,$hex=0)
51 *
52 * SECTION: Init functions
53 * 926: function initCharset($charset)
54 * 988: function initUnicodeData($mode=null)
55 * 1213: function initCaseFolding($charset)
56 * 1275: function initToASCII($charset)
57 *
58 * SECTION: String operation functions
59 * 1346: function substr($charset,$string,$start,$len=null)
60 * 1384: function strlen($charset,$string)
61 * 1412: function crop($charset,$string,$len,$crop='')
62 * 1465: function strtrunc($charset,$string,$len)
63 * 1499: function conv_case($charset,$string,$case)
64 * 1525: function specCharsToASCII($charset,$string)
65 *
66 * SECTION: Internal string operation functions
67 * 1565: function sb_char_mapping($str,$charset,$mode,$opt='')
68 *
69 * SECTION: Internal UTF-8 string operation functions
70 * 1620: function utf8_substr($str,$start,$len=null)
71 * 1653: function utf8_strlen($str)
72 * 1674: function utf8_strtrunc($str,$len)
73 * 1696: function utf8_strpos($haystack,$needle,$offset=0)
74 * 1719: function utf8_strrpos($haystack,$needle)
75 * 1739: function utf8_char2byte_pos($str,$pos)
76 * 1780: function utf8_byte2char_pos($str,$pos)
77 * 1803: function utf8_char_mapping($str,$mode,$opt='')
78 *
79 * SECTION: Internal EUC string operation functions
80 * 1879: function euc_strtrunc($str,$len,$charset)
81 * 1908: function euc_substr($str,$start,$charset,$len=null)
82 * 1933: function euc_strlen($str,$charset)
83 * 1960: function euc_char2byte_pos($str,$pos,$charset)
84 * 2001: function euc_char_mapping($str,$charset,$mode,$opt='')
85 *
86 * TOTAL FUNCTIONS: 35
87 * (This index is automatically created/updated by the extension "extdeveval")
88 *
89 */
90
91
92
93
94
95
96
97
98 /**
99 * Notes on UTF-8
100 *
101 * Functions working on UTF-8 strings:
102 *
103 * - strchr/strstr
104 * - strrchr
105 * - substr_count
106 * - implode/explode/join
107 *
108 * Functions nearly working on UTF-8 strings:
109 *
110 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
111 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained 7-bit ASCII
112 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
113 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
114 *
115 * Functions NOT working on UTF-8 strings:
116 *
117 * - str*cmp
118 * - stristr
119 * - stripos
120 * - substr
121 * - strrev
122 * - ereg/eregi
123 * - split/spliti
124 * - preg_*
125 * - ...
126 *
127 */
128 /**
129 * Class for conversion between charsets
130 *
131 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
132 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
133 * @package TYPO3
134 * @subpackage t3lib
135 */
136 class t3lib_cs {
137 var $noCharByteVal=63; // ASCII Value for chars with no equivalent.
138
139 // This is the array where parsed conversion tables are stored (cached)
140 var $parsedCharsets=array();
141
142 // An array where case folding data will be stored (cached)
143 var $caseFolding=array();
144
145 // An array where charset-to-ASCII mappings are stored (cached)
146 var $toASCII=array();
147
148 // This tells the converter which charsets has two bytes per char:
149 var $twoByteSets=array(
150 'ucs-2'=>1, // 2-byte Unicode
151 );
152
153 // This tells the converter which charsets has four bytes per char:
154 var $fourByteSets=array(
155 'ucs-4'=>1, // 4-byte Unicode
156 'utf-32'=>1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
157 );
158
159 // This tells the converter which charsets use a scheme like the Extended Unix Code:
160 var $eucBasedSets=array(
161 'gb2312'=>1, // Chinese, simplified.
162 'big5'=>1, // Chinese, traditional.
163 'euc-kr'=>1, // Korean
164 'shift_jis'=>1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
165 );
166
167 // see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
168 // http://czyborra.com/charsets/iso8859.html
169 var $synonyms=array(
170 'us' => 'ascii',
171 'us-ascii'=> 'ascii',
172 'cp819' => 'iso-8859-1',
173 'ibm819' => 'iso-8859-1',
174 'iso-ir-100' => 'iso-8859-1',
175 'iso-ir-109' => 'iso-8859-2',
176 'iso-ir-148' => 'iso-8859-9',
177 'iso-ir-199' => 'iso-8859-14',
178 'iso-ir-203' => 'iso-8859-15',
179 'csisolatin1' => 'iso-8859-1',
180 'csisolatin2' => 'iso-8859-2',
181 'csisolatin3' => 'iso-8859-3',
182 'csisolatin5' => 'iso-8859-9',
183 'csisolatin8' => 'iso-8859-14',
184 'csisolatin9' => 'iso-8859-15',
185 'csisolatingreek' => 'iso-8859-7',
186 'iso-celtic' => 'iso-8859-14',
187 'latin1' => 'iso-8859-1',
188 'latin2' => 'iso-8859-2',
189 'latin3' => 'iso-8859-3',
190 'latin5' => 'iso-8859-9',
191 'latin6' => 'iso-8859-10',
192 'latin8' => 'iso-8859-14',
193 'latin9' => 'iso-8859-15',
194 'l1' => 'iso-8859-1',
195 'l2' => 'iso-8859-2',
196 'l3' => 'iso-8859-3',
197 'l5' => 'iso-8859-9',
198 'l6' => 'iso-8859-10',
199 'l8' => 'iso-8859-14',
200 'l9' => 'iso-8859-15',
201 'cyrillic' => 'iso-8859-5',
202 'arabic' => 'iso-8859-6',
203 'tis-620' => 'iso-8859-11',
204 'win874' => 'windows-874',
205 'win1250' => 'windows-1250',
206 'win1251' => 'windows-1251',
207 'win1252' => 'windows-1252',
208 'win1253' => 'windows-1253',
209 'win1254' => 'windows-1254',
210 'win1255' => 'windows-1255',
211 'win1256' => 'windows-1256',
212 'win1257' => 'windows-1257',
213 'win1258' => 'windows-1258',
214 'cp1250' => 'windows-1250',
215 'cp1251' => 'windows-1251',
216 'cp1252' => 'windows-1252',
217 'ms-ee' => 'windows-1250',
218 'ms-ansi' => 'windows-1252',
219 'ms-greek' => 'windows-1253',
220 'ms-turk' => 'windows-1254',
221 'winbaltrim' => 'windows-1257',
222 'koi-8ru' => 'koi-8r',
223 'koi8r' => 'koi-8r',
224 'cp878' => 'koi-8r',
225 'mac' => 'macroman',
226 'macintosh' => 'macroman',
227 'euc-cn' => 'gb2312',
228 'x-euc-cn' => 'gb2312',
229 'euccn' => 'gb2312',
230 'cp936' => 'gb2312',
231 'big-5' => 'big5',
232 'cp950' => 'big5',
233 'eucjp' => 'euc-jp',
234 'sjis' => 'shift_jis',
235 'shift-jis' => 'shift_jis',
236 'cp932' => 'shift_jis',
237 'cp949' => 'euc-kr',
238 'utf7' => 'utf-7',
239 'utf8' => 'utf-8',
240 'utf16' => 'utf-16',
241 'utf32' => 'utf-32',
242 'utf8' => 'utf-8',
243 'ucs2' => 'ucs-2',
244 'ucs4' => 'ucs-4',
245 );
246
247 // mapping of iso-639:2 language codes to language (family) names
var $lang_to_langfamily=array(
		// iso-639:2 language codes, see:
		//  http://www.w3.org/WAI/ER/IG/ert/iso639.htm
		//  http://www.unicode.org/onlinedat/languages.html
	'ar' => 'arabic',
	'bg' => 'cyrillic',
	'cs' => 'east_european',
	'da' => 'west_european',
	'de' => 'west_european',
	'es' => 'west_european',
	'et' => 'estonian',
	'eu' => 'west_european',
	'fi' => 'west_european',
	'fr' => 'west_european',
	'gr' => 'greek',
	'hr' => 'east_european',
	'hu' => 'east_european',
	'iw' => 'hebrew',
	'is' => 'west_european',
	'it' => 'west_european',
	'ja' => 'japanese',
	'kl' => 'west_european',
	'ko' => 'korean',
	'lt' => 'lithuanian',
	'lv' => 'west_european',	// Latvian/Lettish
	'nl' => 'west_european',
	'no' => 'west_european',
	'pl' => 'east_european',
	'pt' => 'west_european',
	'ro' => 'east_european',
	'ru' => 'cyrillic',
	'sk' => 'east_european',
	'sl' => 'east_european',
	'sv' => 'west_european',
	'th' => 'thai',
	'uk' => 'cyrillic',
	'vi' => 'vietnamese',
	'zh' => 'chinese',
		// MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
	'chs' => 'simpl_chinese',
	'cht' => 'trad_chinese',
	'csy' => 'east_european',
	'dan' => 'west_european',
	'deu' => 'west_european',
	'dea' => 'west_european',
	'des' => 'west_european',
	'ena' => 'west_european',
	'enc' => 'west_european',
	'eng' => 'west_european',
	'enz' => 'west_european',
	'enu' => 'west_european',
	'nld' => 'west_european',
	'nlb' => 'west_european',
	'fin' => 'west_european',
	'fra' => 'west_european',
	'frb' => 'west_european',
	'frc' => 'west_european',
	'frs' => 'west_european',
	'ell' => 'greek',
	'hun' => 'east_european',
	'isl' => 'west_european',	// was misspelled 'west_euorpean', which matches no key of the charset maps
	'ita' => 'west_european',
	'its' => 'west_european',
	'jpn' => 'japanese',
	'kor' => 'korean',
	'nor' => 'west_european',
	'non' => 'west_european',
	'plk' => 'east_european',
	'ptg' => 'west_european',
	'ptb' => 'west_european',
	'rus' => 'cyrillic',	// same family as 'ru' and 'russian'
	'sky' => 'east_european',
	'esp' => 'west_european',
	'esm' => 'west_european',
	'esn' => 'west_european',
	'sve' => 'west_european',
	'trk' => 'turkish',
		// English language names
	'bulgarian' => 'cyrillic',	// same family as 'bg'
	'catalan' => 'west_european',
	'croatian' => 'east_european',
	'czech' => 'east_european',
	'danish' => 'west_european',
	'dutch' => 'west_european',
	'english' => 'west_european',
	'finnish' => 'west_european',
	'french' => 'west_european',
	'galician' => 'west_european',
	'german' => 'west_european',
	'hungarian' => 'east_european',
	'icelandic' => 'west_european',
	'italian' => 'west_european',
	'latvian' => 'west_european',
	'lettish' => 'west_european',
	'norwegian' => 'west_european',
	'polish' => 'east_european',
	'portuguese' => 'west_european',
	'russian' => 'cyrillic',
	'romanian' => 'east_european',
	'slovak' => 'east_european',
	'slovenian' => 'east_european',
	'spanish' => 'west_european',
	'swedish' => 'west_european',
	'svedish' => 'west_european',	// historic misspelling of 'swedish', kept for backwards compatibility
	'turkish' => 'turkish',	// same family as 'trk'
	'ukrainian' => 'cyrillic',
);
354
355 // mapping of language (family) names to charsets on Unix
356 var $lang_to_charset_unix=array(
357 'west_european' => 'iso-8859-1',
358 'estonian' => 'iso-8859-1',
359 'east_european' => 'iso-8859-2',
360 'baltic' => 'iso-8859-4',
361 'cyrillic' => 'iso-8859-5',
362 'arabic' => 'iso-8859-6',
363 'greek' => 'iso-8859-7',
364 'hebrew' => 'iso-8859-8',
365 'turkish' => 'iso-8859-9',
366 'thai' => 'iso-8859-11', // = TIS-620
367 'lithuanian' => 'iso-8859-13',
368 'chinese' => 'gb2312', // = euc-cn
369 'japanese' => 'euc-jp',
370 'korean' => 'euc-kr',
371 'simpl_chinese' => 'gb2312',
372 'trad_chinese' => 'big5',
373 'vietnamese' => '',
374 );
375
376 // mapping of language (family) names to charsets on Windows
377 var $lang_to_charset_windows=array(
378 'east_european' => 'windows-1250',
379 'cyrillic' => 'windows-1251',
380 'west_european' => 'windows-1252',
381 'greek' => 'windows-1253',
382 'turkish' => 'windows-1254',
383 'hebrew' => 'windows-1255',
384 'arabic' => 'windows-1256',
385 'baltic' => 'windows-1257',
386 'estonian' => 'windows-1257',
387 'lithuanian' => 'windows-1257',
388 'vietnamese' => 'windows-1258',
389 'thai' => 'cp874',
390 'korean' => 'cp949',
391 'chinese' => 'gb2312',
392 'japanese' => 'shift_jis',
393 'simpl_chinese' => 'gb2312',
394 'trad_chinese' => 'big5',
395 );
396
397 // mapping of locale names to charsets
398 var $locale_to_charset=array(
399 'japanese.euc' => 'euc-jp',
400 'ja_jp.ujis' => 'euc-jp',
401 'korean.euc' => 'euc-kr',
402 'zh_cn' => 'gb2312',
403 'zh_hk' => 'big5',
404 'zh_tw' => 'big5',
405 );
406
407 // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
408 // Empty values means "iso-8859-1"
409 var $charSetArray = array(
410 'dk' => '',
411 'de' => '',
412 'no' => '',
413 'it' => '',
414 'fr' => '',
415 'es' => '',
416 'nl' => '',
417 'cz' => 'windows-1250',
418 'pl' => 'iso-8859-2',
419 'si' => 'windows-1250',
420 'fi' => '',
421 'tr' => 'iso-8859-9',
422 'se' => '',
423 'pt' => '',
424 'ru' => 'windows-1251',
425 'ro' => 'iso-8859-2',
426 'ch' => 'gb2312',
427 'sk' => 'windows-1250',
428 'lt' => 'windows-1257',
429 'is' => 'utf-8',
430 'hr' => 'windows-1250',
431 'hu' => 'iso-8859-2',
432 'gl' => '',
433 'th' => 'iso-8859-11',
434 'gr' => 'iso-8859-7',
435 'hk' => 'big5',
436 'eu' => '',
437 'bg' => 'windows-1251',
438 'br' => '',
439 'et' => 'iso-8859-4',
440 'ar' => 'iso-8859-6',
441 'he' => 'utf-8',
442 'ua' => 'windows-1251',
443 'jp' => 'shift_jis',
444 'lv' => 'utf-8',
445 'vn' => 'utf-8',
446 'ca' => 'iso-8859-15',
447 'ba' => 'iso-8859-2',
448 'kr' => 'euc-kr',
449 );
450
451 // TYPO3 specific: Array with the iso names used for each system language in TYPO3:
452 // Empty values mean the ISO code is the same as the TYPO3 language key
453 var $isoArray = array(
454 'dk' => 'da',
455 'de' => '',
456 'no' => '',
457 'it' => '',
458 'fr' => '',
459 'es' => '',
460 'nl' => '',
461 'cz' => 'cs',
462 'pl' => '',
463 'si' => 'sl',
464 'fi' => '',
465 'tr' => '',
466 'se' => 'sv',
467 'pt' => '',
468 'ru' => '',
469 'ro' => '',
470 'ch' => 'zh_CN',
471 'sk' => '',
472 'lt' => '',
473 'is' => '',
474 'hr' => '',
475 'hu' => '',
476 'gl' => '', // Greenlandic
477 'th' => '',
478 'gr' => 'el',
479 'hk' => 'zh_HK',
480 'eu' => '',
481 'bg' => '',
482 'br' => 'pt_BR',
483 'et' => '',
484 'ar' => '',
485 'he' => 'iw',
486 'ua' => 'uk',
487 'jp' => 'ja',
488 'lv' => '',
489 'vn' => 'vi',
490 'ca' => '',
491 'ba' => '', // Bosnian
492 'kr' => '',
493 );
494
495 /**
496 * Normalize - changes input character set to lowercase letters and replaces known synonyms with the canonical charset name.
497 *
498 * @param string Input charset
499 * @return string Normalized charset
500 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
501 */
function parse_charset($charset) {
		// Charset names are matched in their lowercase form.
	$normalized = strtolower($charset);

		// A known synonym is replaced by its canonical name; any other name is returned lowercased.
	return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
}
508
509 /**
510 * Get the charset of a locale.
511 *
512 * ln language
513 * ln_CN language / country
514 * ln_CN.cs language / country / charset
515 * ln_CN.cs@mod language / country / charset / modifier
516 *
517 * @param string Locale string
518 * @return string Charset resolved for locale string
519 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
520 */
function get_locale_charset($locale) {
	$locale = strtolower($locale);

		// exact locale specific charset?
	if (isset($this->locale_to_charset[$locale])) return $this->locale_to_charset[$locale];

		// Split off the optional "@modifier" part.
		// array_pad() makes sure both list() targets exist even when the separator is missing,
		// which would otherwise raise an "undefined offset" notice.
	list($locale,$modifier) = array_pad(explode('@',$locale),2,'');

		// locale contains charset: use it
	list($locale,$charset) = array_pad(explode('.',$locale),2,'');
	if ($charset) return $this->parse_charset($charset);

		// modifier is 'euro' (after charset check, because of xx.utf-8@euro)
	if ($modifier == 'euro') return 'iso-8859-15';

		// get language (the country part after "_" is not needed)
	list($language) = explode('_',$locale);
	if (isset($this->lang_to_langfamily[$language])) $language = $this->lang_to_langfamily[$language];

		// Pick the charset map of the current platform and look the language (family) up in it.
	$charsetMap = (TYPO3_OS == 'WIN') ? $this->lang_to_charset_windows : $this->lang_to_charset_unix;
	$cs = isset($charsetMap[$language]) ? $charsetMap[$language] : '';

		// Unknown languages and empty map entries fall back to iso-8859-1.
	return $cs ? $cs : 'iso-8859-1';
}
549
550
551
552
553
554
555
556
557
558 /********************************************
559 *
560 * Charset Conversion functions
561 *
562 ********************************************/
563
564 /**
565 * Convert from one charset to another charset.
566 *
567 * @param string Input string
568 * @param string From charset (the current charset of the string)
569 * @param string To charset (the output charset wanted)
570 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
571 * @return string Converted string
572 * @see convArray()
573 */
function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) {
		// Same charset on both sides: the string is returned untouched.
	if ($fromCS==$toCS) {
		return $str;
	}

		// The PHP extensions cannot fall back to SGML entities for unmappable characters.
		// They are therefore only tried when the target is UTF-8 or no entity fallback was requested.
	$mayUseExtension = ($toCS=='utf-8' || !$useEntityForNoChar);
	if ($mayUseExtension) {
		$method = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'];
		$result = false;

		if ($method=='mbstring') {
			$result = mb_convert_encoding($str,$toCS,$fromCS);	// returns false for unsupported charsets
		} elseif ($method=='iconv') {
			$result = iconv($fromCS,$toCS.'//TRANSLIT',$str);
		} elseif ($method=='recode') {
			$result = recode_string($fromCS.'..'.$toCS,$str);
		}

		if (false !== $result) {
			return $result;
		}
			// No method configured or the extension failed: fall back to the TYPO3 conversion below.
	}

		// TYPO3 conversion: go through UTF-8 as the intermediate charset.
	if ($fromCS!='utf-8') {
		$str = $this->utf8_encode($str,$fromCS);
	}
	if ($toCS!='utf-8') {
		$str = $this->utf8_decode($str,$toCS,$useEntityForNoChar);
	}

	return $str;
}
602
603 /**
604 * Convert all elements in ARRAY from one charset to another charset.
605 * NOTICE: Array is passed by reference!
606 *
607 * @param string Input array, possibly multidimensional
608 * @param string From charset (the current charset of the string)
609 * @param string To charset (the output charset wanted)
610 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
611 * @return void
612 * @see conv()
613 */
function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0) {
	foreach ($array as $idx => $unused) {
			// Scalars are converted in place ...
		if (!is_array($array[$idx])) {
			$array[$idx] = $this->conv($array[$idx],$fromCS,$toCS,$useEntityForNoChar);
			continue;
		}

			// ... nested arrays are processed recursively (by reference).
		$this->convArray($array[$idx],$fromCS,$toCS,$useEntityForNoChar);
	}
}
623
624 /**
625 * Converts $str from $charset to UTF-8
626 *
627 * @param string String in local charset to convert to UTF-8
628 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
629 * @return string Output string, converted to UTF-8
630 */
function utf8_encode($str,$charset) {

		// UTF-8 input needs no conversion.
	if ($charset === 'utf-8') return $str;

		// Parse conv. table if not already done. Without a table no conversion is possible and,
		// as before, nothing (NULL) is returned.
	if (!$this->initCharset($charset)) return;

		// These properties of the charset do not change while looping, so look them up once.
	$isTwoByteSet = isset($this->twoByteSets[$charset]);
	$isEucBased = isset($this->eucBasedSets[$charset]);

	$strLen = strlen($str);
	$outStr = '';

	for ($a=0;$a<$strLen;$a++) {	// Traverse each char in string.
		$chr = substr($str,$a,1);
		$ord = ord($chr);

		if ($isTwoByteSet) {	// If the charset has two bytes per char
				// Combine both bytes into one 16 bit number, assuming big endian.
				// The bytes must be OR'ed: "$ord<<8 & $ord2" is always 0.
				// substr() is used since it is safe when the string ends after the first byte.
			$a++;
			$ord = ($ord << 8) | ord(substr($str,$a,1));
		} elseif ($ord>127) {	// Value over 127: not ASCII, must be looked up in the conv. table
				// EUC uses two bytes above 127; we get both, advance the pointer and make $ord a 16bit int.
				// Shift-JIS: chars between 160 and 223 are single byte.
			if ($isEucBased && ($charset != 'shift_jis' || $ord < 0xA0 || $ord > 0xDF)) {
				$a++;
				$ord = $ord*256 + ord(substr($str,$a,1));
			}
		} else {	// ... otherwise it's just ASCII 0-127 and one byte. Transparent
			$outStr.= $chr;
			continue;
		}

			// If the local char-number is found in the parsed conv. table we use its UTF-8 sequence,
			// otherwise the "no char" replacement byte.
		if (isset($this->parsedCharsets[$charset]['local'][$ord])) {
			$outStr.= $this->parsedCharsets[$charset]['local'][$ord];
		} else {
			$outStr.= chr($this->noCharByteVal);	// No char exists
		}
	}

	return $outStr;
}
668
669 /**
670 * Converts $str from UTF-8 to $charset
671 *
672 * @param string String in UTF-8 to convert to local charset
673 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
674 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
675 * @return string Output string, converted to local charset
676 */
function utf8_decode($str,$charset,$useEntityForNoChar=0) {

		// Parse conv. table if not already done; without a table nothing (NULL) is returned.
	if (!$this->initCharset($charset)) {
		return;
	}

	$byteCount = strlen($str);
	$result = '';

	for ($pos=0;$pos<$byteCount;$pos++) {	// Traverse each byte in the UTF-8 string.
		$byte = substr($str,$pos,1);
		$code = ord($byte);

			// ASCII 0-127 is one byte and passed through transparently.
		if ($code<=127) {
			$result.= $byte;
			continue;
		}

			// A first byte must have the 7th bit set. If not, we are in the MIDDLE of a byte sequence.
		if (!($code & 64)) {
			$result.= chr($this->noCharByteVal);
			continue;
		}

			// Collect the sequence: shifting the first byte left exposes one marker bit
			// for every byte that follows.
		$sequence = $byte;
		for ($n=0;$n<8;$n++) {
			$code = $code << 1;
			if (!($code & 128)) {
				break;
			}
			$pos++;
			$sequence.= substr($str,$pos,1);
		}

		if (isset($this->parsedCharsets[$charset]['utf8'][$sequence])) {
				// The local number; values above 255 are written as two bytes (16bit word assumed).
			$localNumber = $this->parsedCharsets[$charset]['utf8'][$sequence];
			if ($localNumber>255) {
				$result.= chr(($localNumber >> 8) & 255).chr($localNumber & 255);
			} else {
				$result.= chr($localNumber);
			}
		} elseif ($useEntityForNoChar) {
				// Not available in the local charset: create numeric entity
			$result.= '&#'.$this->utf8CharToUnumber($sequence,1).';';
		} else {
				// Not available in the local charset: no char exists
			$result.= chr($this->noCharByteVal);
		}
	}

	return $result;
}
713
714 /**
715 * Converts all chars > 127 to numeric entities.
716 *
717 * @param string Input string
718 * @return string Output string
719 */
function utf8_to_entities($str) {
	$byteCount = strlen($str);
	$result = '';

	for ($pos=0;$pos<$byteCount;$pos++) {	// Traverse each byte in the UTF-8 string.
		$byte = substr($str,$pos,1);
		$code = ord($byte);

			// ASCII 0-127 is one byte and passed through transparently.
		if ($code<=127) {
			$result.= $byte;
			continue;
		}

			// A first byte must have the 7th bit set. If not, we are in the MIDDLE of a byte sequence.
		if (!($code & 64)) {
			$result.= chr($this->noCharByteVal);
			continue;
		}

			// Collect the sequence: shifting the first byte left exposes one marker bit
			// for every byte that follows.
		$sequence = $byte;
		for ($n=0;$n<8;$n++) {
			$code = $code << 1;
			if (!($code & 128)) {
				break;
			}
			$pos++;
			$sequence.= substr($str,$pos,1);
		}

			// Write the character as a hexadecimal numeric entity.
		$result.= '&#'.$this->utf8CharToUnumber($sequence,1).';';
	}

	return $result;
}
745
746 /**
747 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
748 *
749 * @param string Input string, UTF-8
750 * @param boolean If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
751 * @return string Output string
752 */
function entities_to_utf8($str,$alsoStdHtmlEnt=0) {
	$trans_tbl = array();
	if ($alsoStdHtmlEnt) {
			// The table is needed in iso-8859-1 because its values are passed through utf8_encode(...,'iso-8859-1') below.
			// Newer PHP versions may deliver it in another charset by default, so iso-8859-1 is requested
			// explicitly wherever the parameter is available (PHP 5.3.4+).
		if (version_compare(PHP_VERSION,'5.3.4','>=')) {
			$trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES,ENT_COMPAT,'ISO-8859-1'));
		} else {
			$trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES));
		}
	}

		// Split the string at every "&...;" and keep the entity names:
		// even keys hold plain text, odd keys hold the name between "&" and ";".
	$parts = preg_split('/&([#[:alnum:]]*);/',$str,-1,PREG_SPLIT_DELIM_CAPTURE);
	foreach($parts as $k => $v) {
		if (!($k%2)) continue;	// Plain text between entities stays as it is.

		$match = array();
		if (preg_match('/^#[xX]([0-9a-fA-F]+)$/',$v,$match)) {	// Hex entity ("x" in either case):
			$parts[$k] = $this->UnumberToChar(hexdec($match[1]));
		} elseif (preg_match('/^#([0-9]+)$/',$v,$match)) {	// Dec entity:
			$parts[$k] = $this->UnumberToChar(intval($match[1]));
		} elseif (isset($trans_tbl['&'.$v.';'])) {	// Other entities:
			$parts[$k] = $this->utf8_encode($trans_tbl['&'.$v.';'],'iso-8859-1');
		} else {	// No conversion. This includes malformed numeric entities like "&#;", which are kept instead of becoming a NUL byte.
			$parts[$k] = '&'.$v.';';
		}
	}

	return implode('',$parts);
}
778
779 /**
780 * Converts all chars in the input UTF-8 string into integer numbers returned in an array
781 *
782 * @param string Input string, UTF-8
783 * @param boolean If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
784 * @param boolean If set, then instead of integer numbers the real UTF-8 char is returned.
785 * @return array Output array with the char numbers
786 */
function utf8_to_numberarray($str,$convEntities=0,$retChar=0) {
		// If entities must be registered as well, they are turned into real UTF-8 chars first.
	if ($convEntities) {
		$str = $this->entities_to_utf8($str,1);
	}

	$byteCount = strlen($str);
	$collected = array();

	for ($pos=0;$pos<$byteCount;$pos++) {	// Traverse each byte in the UTF-8 string.
		$byte = substr($str,$pos,1);
		$code = ord($byte);

			// ASCII 0-127 is one byte and taken over as it is.
		if ($code<=127) {
			$collected[] = $retChar ? chr($code) : $code;
			continue;
		}

			// A first byte must have the 7th bit set. If not, we are in the MIDDLE of a byte sequence.
		if (!($code & 64)) {
			$collected[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
			continue;
		}

			// Collect the sequence: shifting the first byte left exposes one marker bit
			// for every byte that follows.
		$sequence = $byte;
		for ($n=0;$n<8;$n++) {
			$code = $code << 1;
			if (!($code & 128)) {
				break;
			}
			$pos++;
			$sequence.= substr($str,$pos,1);
		}

			// Either the char itself or its UNICODE number is returned.
		$collected[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
	}

	return $collected;
}
817
818 /**
819 * Converts a UNICODE number to a UTF-8 multibyte character
820 * Algorithm based on script found at From: http://czyborra.com/utf/
821 * Unit-tested by Kasper
822 *
823 * The binary representation of the character's integer value is thus simply spread across the bytes and the number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
824 *
825 * bytes | bits | representation
826 * 1 | 7 | 0vvvvvvv
827 * 2 | 11 | 110vvvvv 10vvvvvv
828 * 3 | 16 | 1110vvvv 10vvvvvv 10vvvvvv
829 * 4 | 21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
830 * 5 | 26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
831 * 6 | 31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
832 *
833 * @param integer UNICODE integer
834 * @return string UTF-8 multibyte character string
835 * @see utf8CharToUnumber()
836 */
function UnumberToChar($cbyte) {
		// 7 bits: plain single byte.
	if ($cbyte < 0x80) {
		return chr($cbyte);
	}

		// Pick the marker bits of the lead byte and the number of following bytes by value range.
	if ($cbyte < 0x800) {
		$leadMarker = 0xC0;
		$followers = 1;
	} elseif ($cbyte < 0x10000) {
		$leadMarker = 0xE0;
		$followers = 2;
	} elseif ($cbyte < 0x200000) {
		$leadMarker = 0xF0;
		$followers = 3;
	} elseif ($cbyte < 0x4000000) {
		$leadMarker = 0xF8;
		$followers = 4;
	} elseif ($cbyte < 0x80000000) {
		$leadMarker = 0xFC;
		$followers = 5;
	} else {	// Cannot express a 32-bit character in UTF-8
		return chr($this->noCharByteVal);
	}

		// The lead byte carries the highest bits of the value ...
	$str = chr($leadMarker | ($cbyte >> (6*$followers)));

		// ... and every following byte carries the next 6 bits, prefixed with the bits "10".
	for ($shift=6*($followers-1);$shift>=0;$shift-=6) {
		$str.= chr(0x80 | (($cbyte >> $shift) & 0x3F));
	}

	return $str;
}
872
873 /**
874 * Converts a UTF-8 Multibyte character to a UNICODE number
875 * Unit-tested by Kasper
876 *
877 * @param string UTF-8 multibyte character string
878 * @param boolean If set, then a hex. number is returned.
879 * @return integer UNICODE integer
880 * @see UnumberToChar()
881 */
function utf8CharToUnumber($str,$hex=0) {
	$leadByte = ord(substr($str,0,1));	// First char

		// Unless the first byte starts a multi byte sequence, its own value is the number.
	$number = $leadByte;

	if (($leadByte & 192) == 192) {	// This verifies that it IS a multi byte string
		$tailBits = '';
		$followers = 0;
		$probe = $leadByte;

			// Shifting the first byte left exposes one marker bit for every byte that follows.
			// The lower 6 bits of each following byte are collected as a binary string.
		while ($followers < 8) {
			$probe = $probe << 1;
			if (!($probe & 128)) {
				break;
			}
			$tailBits.= substr('00000000'.decbin(ord(substr($str,$followers+1,1))),-6);
			$followers++;
		}

			// The first byte contributes the bits that are left after its length marker.
		$leadBits = substr('00000000'.decbin($leadByte),-(6-$followers));
		$number = bindec($leadBits.$tailBits);
	}

	return $hex ? 'x'.dechex($number) : $number;
}
900
901
902
903
904
905
906
907
908
909 /********************************************
910 *
911 * Init functions
912 *
913 ********************************************/
914
915 /**
916 * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
917 * This function is automatically called by the conversion functions
918 *
919 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
920 *
921 * @param string The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
922 * @return integer Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed.
923 * @access private
924 */
function initCharset($charset) {
		// Only process if the charset is not yet loaded.
		// isset() avoids an "undefined index" notice on the first call for a charset.
	if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
		return 1;
	}

		// Conversion table filename:
	$charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';

		// Without a valid conversion table file there is nothing to initialize:
	if (!$charset || !t3lib_div::validPathStr($charsetConvTableFile) || !@is_file($charsetConvTableFile)) {
		return false;
	}

		// Cache file for charsets:
		// Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$cached = unserialize(t3lib_div::getUrl($cacheFile));
			// Only accept a cache with the expected structure. A truncated or corrupt cache file
			// is ignored and the conversion table is parsed again below.
		if (is_array($cached) && isset($cached['local']) && isset($cached['utf8'])) {
			$this->parsedCharsets[$charset] = $cached;
			return 2;
		}
	}

		// Parse conversion table into lines:
	$lines = t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1);

		// 'local' maps the local char number to the UTF-8 sequence, 'utf8' maps the other way around:
	$table = array('local'=>array(),'utf8'=>array());

		// The "whitespaced" type is on the syntax "0x0A	0x000A	#LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
		// The second number may also be the end of the line, so lines without a trailing comment are parsed as well.
	$whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)(?:[[:space:]]|$)/';
	$detectedType = '';

	foreach($lines as $value) {
			// Comment lines or blanks are ignored.
		if (!trim($value) || substr($value,0,1)=='#') continue;

			// Detect type if not done yet: (Done on first real line)
		if (!$detectedType) {
			$detectedType = preg_match($whitespacedPattern,$value) ? 'whitespaced' : 'ms-token';
		}

		if ($detectedType=='ms-token') {
			$fields = preg_split('/=|:/',$value,3);
			if (count($fields)<2) continue;	// Not a mapping line
			$hexbyte = $fields[0];
			$unicodeHex = substr(trim($fields[1]),2);	// Strip the leading "U+"
		} else {
			$regA = array();
			if (!preg_match($whitespacedPattern,$value,$regA)) continue;	// Not a mapping line (eg. undefined code point)
			$hexbyte = $regA[1];
			$unicodeHex = $regA[2];
		}

			// Only chars above 127 are registered; 0-127 are treated as ASCII by the conversion functions.
		$decval = hexdec(trim($hexbyte));
		if ($decval>127) {
			$utf8Char = $this->UnumberToChar(hexdec($unicodeHex));
			$table['local'][$decval] = $utf8Char;
			$table['utf8'][$utf8Char] = $decval;
		}
	}

	$this->parsedCharsets[$charset] = $table;

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->parsedCharsets[$charset]));
	}

	return 2;
}
977
/**
 * This function initializes all UTF-8 character data tables.
 * It fills $this->caseFolding['utf-8'] (keys 'toUpper', 'toLower', 'toTitle') and $this->toASCII['utf-8'].
 * Both tables are built in one pass from PATH_t3lib.'unidata/UnicodeData.txt', optionally refined by
 * 'unidata/SpecialCasing.txt' and 'unidata/Translit.txt', and then written to cache files in typo3temp/cs/.
 *
 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
 *
 * @param	string		Mode ("case", "ascii", ...). Selects which table is checked for being loaded/cached already; a full build always creates both tables.
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initUnicodeData($mode=null) {
	// cache files
	$cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
	$cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');

	// Only process if the tables are not yet loaded
	switch ($mode) {
		case 'case':
			if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8'])) return 1;

			// Use cached version if possible (a cache that does not unserialize to an array is ignored)
			if ($cacheFileCase && @is_file($cacheFileCase)) {
				$cached = unserialize(t3lib_div::getUrl($cacheFileCase));
				if (is_array($cached)) {
					$this->caseFolding['utf-8'] = $cached;
					return 2;
				}
			}
		break;

		case 'ascii':
			if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8'])) return 1;

			// Use cached version if possible (a cache that does not unserialize to an array is ignored)
			if ($cacheFileASCII && @is_file($cacheFileASCII)) {
				$cached = unserialize(t3lib_div::getUrl($cacheFileASCII));
				if (is_array($cached)) {
					$this->toASCII['utf-8'] = $cached;
					return 2;
				}
			}
		break;
	}

	// process main Unicode data file
	$unicodeDataFile = PATH_t3lib.'unidata/UnicodeData.txt';
	if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) return false;

	$fh = fopen($unicodeDataFile,'rb');
	if (!$fh) return false;

	// key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
	// note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
	$this->caseFolding['utf-8'] = array();
	$utf8CaseFolding =& $this->caseFolding['utf-8'];	// a shorthand
	$utf8CaseFolding['toUpper'] = array();
	$utf8CaseFolding['toLower'] = array();
	$utf8CaseFolding['toTitle'] = array();

	$decomposition = array();	// array of temp. decompositions
	$mark = array();		// array of chars that are marks (eg. composing accents)
	$number = array();		// array of chars that are numbers (eg. digits)
	$omit = array();		// array of chars to be omitted (eg. Russian hard sign)

	while (!feof($fh)) {
		$line = fgets($fh,4096);
		// fgets() returns FALSE at the end of the file; blank lines carry no data either
		if ($line === false || trim($line) === '') continue;

		// has a lot of info; pad so that short lines do not cause reads of missing fields
		$fields = array_pad(explode(';', rtrim($line)), 15, '');
		$char = $fields[0];
		$name = $fields[1];
		$cat = $fields[2];
		$decomp = $fields[5];
		$num = $fields[8];
		$upper = $fields[12];
		$lower = $fields[13];
		$title = $fields[14];

		$ord = hexdec($char);
		if ($ord > 0xFFFF) break;	// only process the BMP

		$utf8_char = $this->UnumberToChar($ord);

		if ($upper) $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
		if ($lower) $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
		// store "title" only when different from "upper" (only a few)
		if ($title && $title != $upper) $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));

		switch (substr($cat,0,1)) {
			case 'M':	// mark (accent, umlaut, ...)
				$mark["U+$char"] = 1;
			break;

			case 'N':	// numeric value
				if ($ord > 0x80 && $num != '') $number["U+$char"] = $num;
			break;
		}

		// accented Latin letters without "official" decomposition
		$match = array();
		if (!$decomp && preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/',$name,$match)) {
			$c = ord($match[2]);
			if ($match[1] == 'SMALL') $c += 32;

			$decomposition["U+$char"] = array(dechex($c));
			continue;
		}

		$match = array();
		if (preg_match('/(<.*>)? *(.+)/',$decomp,$match)) {
			$tag = $match[1];
			if ($tag == '<circle>') {	// add parenthesis as circle replacement, eg (1)
				$match[2] = '0028 '.$match[2].' 0029';
			} elseif ($tag == '<square>') {	// add square brackets as square replacement, eg [1]
				$match[2] = '005B '.$match[2].' 005D';
			} elseif ($tag == '<compat>') {	// ignore multi char decompositions that start with a space
				if (strncmp($match[2],'0020 ',5) == 0) continue;
			} elseif (in_array($tag, array('<initial>','<medial>','<final>','<isolated>','<vertical>'))) {
				// ignore Arabic and vertical layout presentation decomposition
				continue;
			}
			$decomposition["U+$char"] = explode(' ',$match[2]);
		}
	}
	fclose($fh);

	// process additional Unicode data for casing (allow folded characters to expand into a sequence)
	$specialCasingFile = PATH_t3lib.'unidata/SpecialCasing.txt';
	if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
		$fh = fopen($specialCasingFile,'rb');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh,4096);
				if ($line === false || substr($line,0,1) == '#' || trim($line) == '') continue;

				list($char,$lower,$title,$upper,$cond) = array_pad(t3lib_div::trimExplode(';', $line), 5, '');
				// only unconditional mappings are used (the condition field is empty or already the trailing comment)
				if ($cond == '' || substr($cond,0,1) == '#') {
					$utf8_char = $this->UnumberToChar(hexdec($char));

					$targets = array();
					if ($char != $lower) $targets['toLower'] = $lower;
					if ($char != $title && $title != $upper) $targets['toTitle'] = $title;
					if ($char != $upper) $targets['toUpper'] = $upper;

					// each target is a space separated list of code points which becomes one UTF-8 string
					foreach ($targets as $direction => $codes) {
						$sequence = '';
						foreach (explode(' ',$codes) as $code) {
							$sequence .= $this->UnumberToChar(hexdec($code));
						}
						$utf8CaseFolding[$direction][$utf8_char] = $sequence;
					}
				}
			}
			fclose($fh);
		}
	}

	// process custom decompositions
	$customTranslitFile = PATH_t3lib.'unidata/Translit.txt';
	if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
		$fh = fopen($customTranslitFile,'rb');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh,4096);
				if ($line === false || substr($line,0,1) == '#' || trim($line) == '') continue;

				list($char,$translit) = array_pad(t3lib_div::trimExplode(';', $line), 2, '');
				if (!$translit) $omit["U+$char"] = 1;
				$decomposition["U+$char"] = explode(' ', $translit);
			}
			fclose($fh);
		}
	}

	// decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
	foreach ($decomposition as $from => $to) {
		$code_decomp = array();

		while ($code_value = array_shift($to)) {
			if (isset($decomposition["U+$code_value"])) {	// do recursive decomposition
				foreach (array_reverse($decomposition["U+$code_value"]) as $cv) {
					array_unshift($to, $cv);
				}
			} elseif (!isset($mark["U+$code_value"])) {	// remove mark
				array_push($code_decomp, $code_value);
			}
		}
		if (count($code_decomp) || isset($omit[$from])) {
			$decomposition[$from] = $code_decomp;
		} else {
			unset($decomposition[$from]);
		}
	}

	// create ascii only mapping
	$this->toASCII['utf-8'] = array();
	$ascii =& $this->toASCII['utf-8'];

	foreach ($decomposition as $from => $to) {
		$code_decomp = array();
		while ($code_value = array_shift($to)) {
			$ord = hexdec($code_value);
			if ($ord > 127) {
				continue 2;	// skip decompositions containing non-ASCII chars
			}
			array_push($code_decomp,chr($ord));
		}
		// keys look like "U+00E4": strip the prefix before reading the hex number
		$ascii[$this->UnumberToChar(hexdec(substr($from,2)))] = implode('',$code_decomp);
	}

	// add numeric decompositions
	foreach ($number as $from => $to) {
		$utf8_char = $this->UnumberToChar(hexdec(substr($from,2)));
		if (!isset($ascii[$utf8_char])) {
			$ascii[$utf8_char] = $to;
		}
	}

	if ($cacheFileCase) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileCase,serialize($utf8CaseFolding));
	}

	if ($cacheFileASCII) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileASCII,serialize($ascii));
	}

	return 3;
}
1203
/**
 * Builds the case folding table ($this->caseFolding[$charset]) for a charset other than UTF-8.
 * The table is derived from the UTF-8 case folding data and the charset's conversion table, then cached in typo3temp/cs/.
 * This function is automatically called by the case folding functions.
 *
 * @param	string		Charset for which to initialize case folding.
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initCaseFolding($charset) {
	// Nothing to do when the table is in memory already:
	if (is_array($this->caseFolding[$charset])) return 1;

	// Prefer the cached table:
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

	// Both the charset conversion table and the UTF-8 case folding table (the base table) are required:
	if (!$this->initCharset($charset) || !$this->initUnicodeData('case')) {
		return false;
	}

	$nochar = chr($this->noCharByteVal);
	$directions = array('toUpper','toLower','toTitle');
	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
		// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);

		foreach ($directions as $direction) {
			// Translate the UTF-8 mapping back into the charset; keep it only if the charset can represent it.
			$cc = $this->utf8_decode($this->caseFolding['utf-8'][$direction][$utf8], $charset);
			if ($cc != '' && $cc != $nochar) $this->caseFolding[$charset][$direction][$c] = $cc;
		}
	}

	// add the ASCII case table
	$lowerA = ord('a');
	$upperA = ord('A');
	for ($offset=0; $offset<26; $offset++) {
		$this->caseFolding[$charset]['toUpper'][chr($lowerA+$offset)] = chr($upperA+$offset);
	}
	for ($offset=0; $offset<26; $offset++) {
		$this->caseFolding[$charset]['toLower'][chr($upperA+$offset)] = chr($lowerA+$offset);
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->caseFolding[$charset]));
	}

	return 3;
}
1265
/**
 * This function initializes the to-ASCII conversion table ($this->toASCII[$charset]) for a charset other than UTF-8.
 * The table is derived from the UTF-8 transliteration data and the charset's conversion table, then cached in typo3temp/cs/.
 * This function is automatically called by the ASCII transliteration functions.
 *
 * @param	string		Charset for which to initialize conversion.
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initToASCII($charset) {
	// Only process if the table is not yet loaded:
	if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) return 1;

	// Use cached version if possible (a cache that does not unserialize to an array is ignored and rebuilt)
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$cached = unserialize(t3lib_div::getUrl($cacheFile));
		if (is_array($cached)) {
			$this->toASCII[$charset] = $cached;
			return 2;
		}
	}

	// init UTF-8 conversion for this charset
	if (!$this->initCharset($charset)) {
		return false;
	}

	// UTF-8/ASCII transliteration is used as the base conversion table
	if (!$this->initUnicodeData('ascii')) {
		return false;
	}

	// Always start from an array so that a charset without any mapping is still recognised as loaded (and cached as such).
	$this->toASCII[$charset] = array();
	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
		// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);

		if (isset($this->toASCII['utf-8'][$utf8])) {
			$this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
		}
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->toASCII[$charset]));
	}

	return 3;
}
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327 /********************************************
1328 *
1329 * String operation functions
1330 *
1331 ********************************************/
1332
/**
 * Returns a part of a string.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * With the mbstring utilities enabled the work is done by mb_substr() in the given charset;
 * otherwise the charset decides between the UTF-8, EUC, fixed two-/four-byte and single-byte implementations.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Start position (character position)
 * @param	integer		Length (in characters). If omitted, the rest of the string is returned.
 * @return	string		The substring
 * @see substr(), mb_substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function substr($charset,$string,$start,$len=null) {
	if ($len===0) return '';

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		// cannot omit $len, when specifying charset
		if ($len==null) {
			$enc = mb_internal_encoding();	// save internal encoding
			mb_internal_encoding($charset);
			$str = mb_substr($string,$start);
			mb_internal_encoding($enc);	// restore internal encoding

			return $str;
		}
		return mb_substr($string,$start,$len,$charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_substr($string,$start,$len);
	} elseif ($this->eucBasedSets[$charset]) {
		return $this->euc_substr($string,$start,$charset,$len);
	} elseif ($this->twoByteSets[$charset]) {
		// an omitted length must not be scaled: null*2 would be 0 and yield an empty string
		return $len === null ? substr($string,$start*2) : substr($string,$start*2,$len*2);
	} elseif ($this->fourByteSets[$charset]) {
		return $len === null ? substr($string,$start*4) : substr($string,$start*4,$len*4);
	}

	// treat everything else as single-byte encoding
	return $len === null ? substr($string,$start) : substr($string,$start,$len);
}
1372
/**
 * Counts the number of characters.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * Uses mb_strlen() when the mbstring utilities are enabled; otherwise the charset selects the
 * UTF-8, EUC, fixed two-/four-byte or single-byte way of counting.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @return	integer		The number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strlen($charset,$string) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strlen($string,$charset);
	}
	if ($charset == 'utf-8') {
		return $this->utf8_strlen($string);
	}
	if ($this->eucBasedSets[$charset]) {
		return $this->euc_strlen($string,$charset);
	}

	$byteLength = strlen($string);
	if ($this->twoByteSets[$charset]) {
		return $byteLength/2;
	}
	if ($this->fourByteSets[$charset]) {
		return $byteLength/4;
	}

	// treat everything else as single-byte encoding
	return $byteLength;
}
1398
/**
 * Truncates a string and pre-/appends a string.
 * Unit tested by Kasper
 *
 * A positive length keeps the beginning of the string and appends the crop signifier,
 * a negative length keeps the end of the string and prepends it.
 * The string is returned unchanged if the length is zero or reaches beyond the string.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Length (in characters)
 * @param	string		Crop signifier
 * @return	string		The shortened string
 * @see substr(), mb_strimwidth()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function crop($charset,$string,$len,$crop='') {
	if (intval($len) == 0) return $string;

	// Translate the character length into the byte position of the cut:
	if ($charset == 'utf-8') {
		$cutPos = $this->utf8_char2byte_pos($string,$len);
	} elseif ($this->eucBasedSets[$charset]) {
		$cutPos = $this->euc_char2byte_pos($string,$len,$charset);
	} elseif ($len > 0) {
		$cutPos = $len;
	} else {
		$cutPos = strlen($string)+$len;
		if ($cutPos<=0) $cutPos = false;
	}

	// $len outside actual string length
	if ($cutPos === false) return $string;

	// Crop only if there is a byte at the cut position; checking the byte directly (instead of comparing
	// with ->strlen()) is enough to tell whether anything would be cut off.
	$byteLength = strlen($string);
	if ($len > 0) {
		if ($cutPos < $byteLength) {
			return substr($string,0,$cutPos).$crop;
		}
	} elseif (($cutPos-1) < $byteLength) {
		return $crop.substr($string,$cutPos);
	}

	return $string;
}
1453
/**
 * Cuts a string short at a given byte length.
 * The cut is moved backwards where needed so that no multi-byte character is split.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		The byte length
 * @return	string		The shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strtrunc($charset,$string,$len) {
	if ($len <= 0) return '';

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strcut($string,0,$len,$charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_strtrunc($string,$len);
	} elseif ($this->eucBasedSets[$charset]) {
		// euc_strtrunc() expects (string, byte length, charset)
		return $this->euc_strtrunc($string,$len,$charset);
	} elseif ($this->twoByteSets[$charset]) {
		if ($len % 2) $len--;		// don't cut at odd positions
	} elseif ($this->fourByteSets[$charset]) {
		$x = $len % 4;
		$len -= $x;	// realign to position dividable by four
	}

	// treat everything else as single-byte encoding
	return substr($string,0,$len);
}
1482
/**
 * Translates all characters of a string into their respective case values.
 * Unlike strtolower() and strtoupper() this method is locale independent.
 * Note that the string length may change!
 * eg. lower case German "ß" (sharp S) becomes upper case "SS"
 * Unit-tested by Kasper
 * Real case folding is language dependent, this method ignores this fact.
 *
 * @param	string		Character set of string
 * @param	string		Input string to convert case for
 * @param	string		Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
 * @return	string		The converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper()
 */
function conv_case($charset,$string,$case) {
	// The original guard required PHP 4.3 or later for the mbstring case functions.
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring' && version_compare(phpversion(),'4.3.0','>=')) {
		if ($case == 'toLower') {
			return mb_strtolower($string,$charset);
		}
		return mb_strtoupper($string,$charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string,'case',$case);
	} elseif (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string,$charset,'case',$case);
	}

	// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string,$charset,'case',$case);
}
1516
/**
 * Converts special chars (like "æ", "ø", "å", umlauts etc) to ascii equivalents (usually double-bytes, like "æ" => "ae" etc.)
 * The charset decides whether the UTF-8, the EUC or the single-byte mapping is used.
 *
 * @param	string		Character set of string
 * @param	string		Input string to convert
 * @return	string		The converted string
 */
function specCharsToASCII($charset,$string) {
	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string,'ascii');
	}
	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string,$charset,'ascii');
	}

	// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string,$charset,'ascii');
}
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548 /********************************************
1549 *
1550 * Internal string operation functions
1551 *
1552 ********************************************/
1553
/**
 * Maps all characters of a string in a single byte charset.
 * Every byte is looked up in the selected table; bytes without an entry are copied unchanged.
 * The string is returned unchanged if the mode is unknown or the table could not be initialized.
 *
 * @param	string		the string
 * @param	string		the charset
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function sb_char_mapping($str,$charset,$mode,$opt='') {
	// Select (and, if needed, build) the mapping table:
	if ($mode == 'case') {
		if (!$this->initCaseFolding($charset)) return $str;	// do nothing
		$map =& $this->caseFolding[$charset][$opt];
	} elseif ($mode == 'ascii') {
		if (!$this->initToASCII($charset)) return $str;	// do nothing
		$map =& $this->toASCII[$charset];
	} else {
		return $str;
	}

	$out = '';
	$byteLength = strlen($str);
	for ($pos=0; $pos<$byteLength; $pos++) {
		$byte = $str[$pos];
		$out .= isset($map[$byte]) ? $map[$byte] : $byte;
	}

	return $out;
}
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602 /********************************************
1603 *
1604 * Internal UTF-8 string operation functions
1605 *
1606 ********************************************/
1607
/**
 * Returns a part of a UTF-8 string.
 * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
 *
 * @param	string		UTF-8 string
 * @param	integer		Start position (character position)
 * @param	integer		Length (in characters)
 * @return	string		The substring. FALSE if a positive start position is outside the string.
 * @see substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_substr($str,$start,$len=null) {
	if (!strcmp($len,'0')) return '';

	$byte_start = $this->utf8_char2byte_pos($str,$start);
	if ($byte_start === false) {
		// $start outside string length
		if ($start > 0) return false;

		// a start at or before the beginning of the string means: from the first byte
		$byte_start = 0;
	}

	$str = substr($str,$byte_start);
	if ($len==null) return $str;

	$byte_end = $this->utf8_char2byte_pos($str,$len);
	if ($byte_end !== false) {
		return substr($str,0,$byte_end);
	}

	// $len outside actual string length:
	// When length is less than zero and exceeds, then we return blank string.
	return $len<0 ? '' : $str;
}
1642
/**
 * Counts the number of characters of a string in UTF-8.
 * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
 *
 * @param	string		UTF-8 multibyte character string
 * @return	integer		The number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strlen($str) {
	$chars = 0;
	$byteLength = strlen($str);
	for ($pos=0; $pos<$byteLength; $pos++) {
		// Every byte except a continuation byte (10xxxxxx) starts a character:
		// single bytes (0xxxxxxx) and multi-byte starting bytes (11xxxxxx).
		if ((ord($str[$pos]) & 0xC0) != 0x80) {
			$chars++;
		}
	}
	return $chars;
}
1663
/**
 * Truncates a string in UTF-8 short at a given byte length.
 * If the cut would split a multi-byte character, that character is dropped completely.
 *
 * @param	string		UTF-8 multibyte character string
 * @param	integer		the byte length
 * @return	string		the shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strtrunc($str,$len) {
	$i = $len-1;
	if ($i >= 0 && $i < strlen($str) && (ord($str[$i]) & 0x80)) {	// part of a multibyte sequence
		// walk back to the first byte of the sequence (any byte with bit 6 set ends the search)
		for (; $i>0 && !(ord($str[$i]) & 0x40); $i--) ;

		$first = ord($str[$i]);
		if (($first & 0xC0) != 0xC0) {
			// no starting byte found: malformed input. Reaching the start of the string gives an empty result (sanity check).
			return $i > 0 ? substr($str,0,$len) : '';
		}

		// calculate number of bytes of the sequence from the leading 1-bits of its first byte
		for ($bc=0, $mbs=$first; $mbs & 0x80; $mbs = $mbs << 1) $bc++;

		// the character does not fit into the length: cut in front of it
		if ($bc+$i > $len) return substr($str,0,$i);
		// fallthru: multibyte char fits into length
	}
	return substr($str,0,$len);
}
1684
/**
 * Find position of first occurrence of a string, both arguments are in UTF-8.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 string to search for
 * @param	integer		Position to start the search (character position)
 * @return	integer		The character position. FALSE if the needle was not found or the offset is beyond the string.
 * @see strpos()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strpos($haystack,$needle,$offset=0) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		// third argument is the offset, the encoding comes fourth
		return mb_strpos($haystack,$needle,$offset,'utf-8');
	}

	$byte_offset = $this->utf8_char2byte_pos($haystack,$offset);
	if ($byte_offset === false) return false; // offset beyond string length

	$byte_pos = strpos($haystack,$needle,$byte_offset);
	if ($byte_pos === false) return false; // needle not found

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
1708
/**
 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 character to search for (single character)
 * @return	integer		The character position. FALSE if the needle was not found.
 * @see strrpos()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strrpos($haystack,$needle) {
	$useMbstring = ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring');
	if ($useMbstring) {
		// NOTE(review): the encoding is passed as third argument; newer PHP versions expect an offset there - verify against the supported PHP versions.
		return mb_strrpos($haystack,$needle,'utf-8');
	}

	// Search on byte level, then translate the byte position into a character position.
	$bytePosition = strrpos($haystack,$needle);
	if ($bytePosition !== false) {
		return $this->utf8_byte2char_pos($haystack,$bytePosition);
	}

	// needle not found
	return false;
}
1728
/**
 * Translates a character position into an 'absolute' byte position.
 * Unit tested by Kasper.
 *
 * The string is scanned from the start (positive position) or from the end (negative position) until the
 * wanted number of characters has been seen. The end of the string is detected with explicit bounds checks.
 *
 * @param	string		UTF-8 string
 * @param	integer		Character position (negative values start from the end)
 * @return	integer		Byte position. FALSE if the scan runs off the string before reaching the position.
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char2byte_pos($str,$pos) {
	$size = strlen($str);
	$n = 0;			// number of characters found
	$p = abs($pos);		// number of characters wanted

	if ($pos >= 0) {
		$i = 0;
		$d = 1;
	} else {
		$i = $size-1;
		$d = -1;
	}

	for ( ; $i>=0 && $i<$size && $n<$p; $i+=$d) {
		// single-byte (0xxxxxxx) and multi-byte starting byte (11xxxxxx) count, continuation bytes (10xxxxxx) do not
		if ((ord($str[$i]) & 0xC0) != 0x80) {
			$n++;
		}
	}
	if ($i<0 || $i>=$size) return false; // offset beyond string length

	if ($pos >= 0) {
		// skip trailing multi-byte data bytes
		while ($i<$size && (ord($str[$i]) & 0xC0) == 0x80) { $i++; }
	} else {
		// correct offset
		$i++;
	}

	return $i;
}
1769
/**
 * Translates an 'absolute' byte position into a character position.
 * Unit tested by Kasper.
 *
 * Counts the characters that start at the byte offsets 1 up to the given position.
 * The position directly behind the last byte is accepted and gives the number of characters in the string.
 *
 * @param	string		UTF-8 string
 * @param	integer		byte position
 * @return	integer		character position. FALSE if the string is empty or the position is outside the string.
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_byte2char_pos($str,$pos) {
	$size = strlen($str);
	if ($size == 0 || $pos < 0 || $pos > $size) return false; // offset beyond string length

	$n = 0;	// number of characters
	for ($i=$pos; $i>0; $i--) {
		// The end position counts once; otherwise only single-byte (0xxxxxxx) and multi-byte starting bytes (11xxxxxx) count.
		if ($i == $size || (ord($str[$i]) & 0xC0) != 0x80) {
			$n++;
		}
	}

	return $n;
}
1792
/**
 * Maps all characters of an UTF-8 string.
 * Every character is looked up in the selected table; characters without an entry are copied unchanged.
 * Bytes that do not start a character (stray continuation bytes in malformed input) are copied unchanged as well.
 *
 * @param	string		UTF-8 string
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char_mapping($str,$mode,$opt='') {
	if (!$this->initUnicodeData($mode)) return $str;	// do nothing

	switch ($mode) {
		case 'case':
			$map =& $this->caseFolding['utf-8'][$opt];
		break;

		case 'ascii':
			$map =& $this->toASCII['utf-8'];
		break;

		default:
			return $str;
	}

	$out = '';
	$size = strlen($str);
	for ($i=0; $i<$size; $i++) {
		$c = ord($str[$i]);

		// default: a single byte (0xxxxxxx); a stray continuation byte (10xxxxxx) is passed through the same way
		$mbc = $str[$i];
		if (($c & 0xC0) == 0xC0) {	// multi-byte starting byte (11xxxxxx)
			for ($bc=0; $c & 0x80; $c = $c << 1) { $bc++; }	// calculate number of bytes
			$mbc = substr($str,$i,$bc);
			$i += $bc-1;
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856 /********************************************
1857 *
1858 * Internal EUC string operation functions
1859 *
1860 * Extended Unix Code:
1861 * ASCII compatible 7bit single bytes chars
1862 * 8bit two byte chars
1863 *
1864 * Shift-JIS is treated as a special case.
1865 *
1866 ********************************************/
1867
/**
 * Cuts a string in the EUC charset family short at a given byte length.
 * A double-byte character that would be split by the cut is dropped completely.
 * For 'shift_jis' only bytes from 0x80 to 0x9F and from 0xE0 upwards start a double-byte character, for all other charsets every byte from 0x80 upwards does.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		the byte length
 * @param	string		the charset
 * @return	string		the shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strtrunc($str,$len,$charset) {
	$size = strlen($str);
	if ($size <= $len) return $str;	// string not longer than supplied length

	$sjis = ($charset == 'shift_jis');
	for ($i=0; $i<$size && $i<$len; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++;	// advance a double-byte char
		} else {
			if ($c >= 0x80) $i++;	// advance a double-byte char
		}
	}

	if ($i>$len) {
		return substr($str,0,$len-1);	// we ended on a first byte
	}
	return substr($str,0,$len);
}
1896
/**
 * Returns a part of a string in the EUC charset family.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		start position (character position)
 * @param	string		the charset
 * @param	integer		length (in characters). If omitted, the rest of the string is returned; a length of zero gives an empty string.
 * @return	string		the substring. FALSE if the start position is outside the string.
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_substr($str,$start,$charset,$len=null) {
	// an explicit zero length is not the same as an omitted length
	if ($len !== null && !strcmp($len,'0')) return '';

	$byte_start = $this->euc_char2byte_pos($str,$start,$charset);
	if ($byte_start === false) return false;	// $start outside string length

	$str = substr($str,$byte_start);
	if ($len == null) return $str;

	$byte_end = $this->euc_char2byte_pos($str,$len,$charset);
	if ($byte_end === false) {	// $len outside actual string length
		return $str;
	}
	return substr($str,0,$byte_end);
}
1922
/**
 * Counts the number of characters of a string in the EUC charset family.
 * A lead byte and the byte following it are counted as one character;
 * every other byte counts as one character on its own.
 *
 * @param	string		EUC multibyte character string
 * @param	string		the charset
 * @return	integer		the number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strlen($str,$charset)	{
	$sjis = ($charset == 'shift_jis');
	$size = strlen($str);		// loop bound: never read an offset past the end of the string
	$n = 0;

	for ($i=0; $i<$size; $i++)	{
		$c = ord($str[$i]);
		if ($sjis)	{
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))	$i++;	// advance a double-byte char
		} else {
			if ($c >= 0x80)	$i++;	// advance a double-byte char
		}

		$n++;
	}

	return $n;
}
1949
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * The string is always scanned from its beginning, also for negative
 * positions: a trail byte cannot be told apart from a lead byte by its value
 * alone (Shift-JIS trail bytes may be below 0x80), so character boundaries
 * are only known when walking forward.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		character position (negative values start from the end)
 * @param	string		the charset
 * @return	integer		byte position, or false if the position lies beyond the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char2byte_pos($str,$pos,$charset)	{
	$sjis = ($charset == 'shift_jis');
	$size = strlen($str);
	$p = abs($pos);		// number of characters wanted

	if ($pos >= 0)	{
		$i = 0;
		for ($n=0; $i<$size && $n<$p; $n++)	{
			$c = ord($str[$i]);
			if ($sjis)	{
				$i += (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) ? 2 : 1;
			} else {
				$i += ($c >= 0x80) ? 2 : 1;
			}
		}
		if ($i >= $size)	return false;	// offset beyond string length

		return $i;
	}

		// Negative position: record the byte offset at which every character starts ...
	$starts = array();
	for ($i=0; $i<$size; )	{
		$starts[] = $i;
		$c = ord($str[$i]);
		if ($sjis)	{
			$i += (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) ? 2 : 1;
		} else {
			$i += ($c >= 0x80) ? 2 : 1;
		}
	}

		// ... and pick the start of the $p-th character counted from the end.
	$count = count($starts);
	if ($p > $count)	return false;	// offset beyond string length

	return $starts[$count-$p];
}
1989
/**
 * Maps all characters of a string in the EUC charset family.
 *
 * The string is split into single- and double-byte characters and each one
 * is looked up in the selected map ($this->caseFolding[$charset][$opt] for
 * mode 'case', $this->toASCII[$charset] for mode 'ascii'). Characters without
 * a map entry are copied unchanged. The input is returned untouched if the
 * mode is unknown or the init function for the map returns a false value.
 *
 * @param	string		EUC multibyte character string
 * @param	string		the charset
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char_mapping($str,$charset,$mode,$opt='')	{
	switch($mode)	{
		case 'case':
			if (!$this->initCaseFolding($charset))	return $str;	// do nothing
			$map =& $this->caseFolding[$charset][$opt];
		break;

		case 'ascii':
			if (!$this->initToASCII($charset))	return $str;	// do nothing
			$map =& $this->toASCII[$charset];
		break;

		default:
			return $str;
	}

	$sjis = ($charset == 'shift_jis');
	$size = strlen($str);		// loop bound: never read an offset past the end of the string
	$out = '';

	for ($i=0; $i<$size; $i++)	{
		$mbc = $str[$i];
		$c = ord($mbc);

		if ($sjis)	{
			$double = (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0));
		} else {
			$double = ($c >= 0x80);
		}

		if ($double)	{	// a double-byte char: take lead and trail byte together
			$mbc = substr($str,$i,2);
			$i++;
		}

		if (isset($map[$mbc]))	{
			$out .= $map[$mbc];
		} else {
			$out .= $mbc;
		}
	}

	return $out;
}
2044
2045 }
2046
2047 if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']) {
2048 include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
2049 }
2050 ?>