* (trivial) Cleanup of NEWS.txt
[Packages/TYPO3.CMS.git] / t3lib / class.t3lib_cs.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2003-2007 Kasper Skaarhoj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the Typo3 project. The Typo3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 *
17 * This script is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * This copyright notice MUST APPEAR in all copies of the script!
23 ***************************************************************/
24 /**
25 * Class for conversion between charsets.
26 *
27 * $Id$
28 *
29 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
30 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
31 */
32 /**
33 * [CLASS/FUNCTION INDEX of SCRIPT]
34 *
35 *
36 *
37 * 136: class t3lib_cs
38 * 488: function parse_charset($charset)
39 * 507: function get_locale_charset($locale)
40 *
41 * SECTION: Charset Conversion functions
42 * 560: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
43 * 600: function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0)
44 * 617: function utf8_encode($str,$charset)
45 * 663: function utf8_decode($str,$charset,$useEntityForNoChar=0)
46 * 706: function utf8_to_entities($str)
47 * 739: function entities_to_utf8($str,$alsoStdHtmlEnt=0)
48 * 773: function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
49 * 823: function UnumberToChar($cbyte)
50 * 868: function utf8CharToUnumber($str,$hex=0)
51 *
52 * SECTION: Init functions
53 * 911: function initCharset($charset)
54 * 973: function initUnicodeData($mode=null)
55 * 1198: function initCaseFolding($charset)
56 * 1260: function initToASCII($charset)
57 *
58 * SECTION: String operation functions
59 * 1331: function substr($charset,$string,$start,$len=null)
60 * 1384: function strlen($charset,$string)
61 * 1414: function crop($charset,$string,$len,$crop='')
62 * 1467: function strtrunc($charset,$string,$len)
63 * 1501: function conv_case($charset,$string,$case)
64 * 1527: function specCharsToASCII($charset,$string)
65 *
66 * SECTION: Internal string operation functions
67 * 1567: function sb_char_mapping($str,$charset,$mode,$opt='')
68 *
69 * SECTION: Internal UTF-8 string operation functions
70 * 1622: function utf8_substr($str,$start,$len=null)
71 * 1655: function utf8_strlen($str)
72 * 1676: function utf8_strtrunc($str,$len)
73 * 1698: function utf8_strpos($haystack,$needle,$offset=0)
74 * 1723: function utf8_strrpos($haystack,$needle)
75 * 1745: function utf8_char2byte_pos($str,$pos)
76 * 1786: function utf8_byte2char_pos($str,$pos)
77 * 1809: function utf8_char_mapping($str,$mode,$opt='')
78 *
79 * SECTION: Internal EUC string operation functions
80 * 1885: function euc_strtrunc($str,$len,$charset)
81 * 1914: function euc_substr($str,$start,$charset,$len=null)
82 * 1939: function euc_strlen($str,$charset)
83 * 1966: function euc_char2byte_pos($str,$pos,$charset)
84 * 2007: function euc_char_mapping($str,$charset,$mode,$opt='')
85 *
86 * TOTAL FUNCTIONS: 35
87 * (This index is automatically created/updated by the extension "extdeveval")
88 *
89 */
90
91
92
93
94
95
96
97
98 /**
99 * Notes on UTF-8
100 *
101 * Functions working on UTF-8 strings:
102 *
103 * - strchr/strstr
104 * - strrchr
105 * - substr_count
106 * - implode/explode/join
107 *
108 * Functions nearly working on UTF-8 strings:
109 *
110 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
111 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
112 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
113 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
114 *
115 * Functions NOT working on UTF-8 strings:
116 *
117 * - str*cmp
118 * - stristr
119 * - stripos
120 * - substr
121 * - strrev
122 * - ereg/eregi
123 * - split/spliti
124 * - preg_*
125 * - ...
126 *
127 */
128 /**
129 * Class for conversion between charsets
130 *
131 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
132 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
133 * @package TYPO3
134 * @subpackage t3lib
135 */
136 class t3lib_cs {
	// Byte value written in place of a character that has no equivalent (63 is '?' in ASCII).
var $noCharByteVal=63;		// ASCII Value for chars with no equivalent.

	// This is the array where parsed conversion tables are stored (cached).
	// The conversion methods read it as $parsedCharsets[charset]['local'][number] => UTF-8 sequence
	// and $parsedCharsets[charset]['utf8'][UTF-8 sequence] => local number.
var $parsedCharsets=array();

	// An array where case folding data will be stored (cached)
var $caseFolding=array();

	// An array where charset-to-ASCII mappings are stored (cached)
var $toASCII=array();

	// This tells the converter which charsets have two bytes per char
	// (utf8_encode() reads them as big-endian 16 bit values):
var $twoByteSets=array(
	'ucs-2'=>1,	// 2-byte Unicode
);

	// This tells the converter which charsets have four bytes per char:
var $fourByteSets=array(
	'ucs-4'=>1,	// 4-byte Unicode
	'utf-32'=>1,	// 4-byte Unicode (limited to the 21-bits of UTF-16)
);

	// This tells the converter which charsets use a scheme like the Extended Unix Code
	// (utf8_encode() combines a byte above 127 with the following byte into one 16 bit number):
var $eucBasedSets=array(
	'gb2312'=>1,		// Chinese, simplified.
	'big5'=>1,		// Chinese, traditional.
	'euc-kr'=>1,		// Korean
	'shift_jis'=>1,		// Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
);
166
167 // see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
168 // http://czyborra.com/charsets/iso8859.html
169 var $synonyms=array(
170 'us' => 'ascii',
171 'us-ascii'=> 'ascii',
172 'cp819' => 'iso-8859-1',
173 'ibm819' => 'iso-8859-1',
174 'iso-ir-100' => 'iso-8859-1',
175 'iso-ir-109' => 'iso-8859-2',
176 'iso-ir-148' => 'iso-8859-9',
177 'iso-ir-199' => 'iso-8859-14',
178 'iso-ir-203' => 'iso-8859-15',
179 'csisolatin1' => 'iso-8859-1',
180 'csisolatin2' => 'iso-8859-2',
181 'csisolatin3' => 'iso-8859-3',
182 'csisolatin5' => 'iso-8859-9',
183 'csisolatin8' => 'iso-8859-14',
184 'csisolatin9' => 'iso-8859-15',
185 'csisolatingreek' => 'iso-8859-7',
186 'iso-celtic' => 'iso-8859-14',
187 'latin1' => 'iso-8859-1',
188 'latin2' => 'iso-8859-2',
189 'latin3' => 'iso-8859-3',
190 'latin5' => 'iso-8859-9',
191 'latin6' => 'iso-8859-10',
192 'latin8' => 'iso-8859-14',
193 'latin9' => 'iso-8859-15',
194 'l1' => 'iso-8859-1',
195 'l2' => 'iso-8859-2',
196 'l3' => 'iso-8859-3',
197 'l5' => 'iso-8859-9',
198 'l6' => 'iso-8859-10',
199 'l8' => 'iso-8859-14',
200 'l9' => 'iso-8859-15',
201 'cyrillic' => 'iso-8859-5',
202 'arabic' => 'iso-8859-6',
203 'tis-620' => 'iso-8859-11',
204 'win874' => 'windows-874',
205 'win1250' => 'windows-1250',
206 'win1251' => 'windows-1251',
207 'win1252' => 'windows-1252',
208 'win1253' => 'windows-1253',
209 'win1254' => 'windows-1254',
210 'win1255' => 'windows-1255',
211 'win1256' => 'windows-1256',
212 'win1257' => 'windows-1257',
213 'win1258' => 'windows-1258',
214 'cp1250' => 'windows-1250',
215 'cp1251' => 'windows-1251',
216 'cp1252' => 'windows-1252',
217 'ms-ee' => 'windows-1250',
218 'ms-ansi' => 'windows-1252',
219 'ms-greek' => 'windows-1253',
220 'ms-turk' => 'windows-1254',
221 'winbaltrim' => 'windows-1257',
222 'koi-8ru' => 'koi-8r',
223 'koi8r' => 'koi-8r',
224 'cp878' => 'koi-8r',
225 'mac' => 'macroman',
226 'macintosh' => 'macroman',
227 'euc-cn' => 'gb2312',
228 'x-euc-cn' => 'gb2312',
229 'euccn' => 'gb2312',
230 'cp936' => 'gb2312',
231 'big-5' => 'big5',
232 'cp950' => 'big5',
233 'eucjp' => 'euc-jp',
234 'sjis' => 'shift_jis',
235 'shift-jis' => 'shift_jis',
236 'cp932' => 'shift_jis',
237 'cp949' => 'euc-kr',
238 'utf7' => 'utf-7',
239 'utf8' => 'utf-8',
240 'utf16' => 'utf-16',
241 'utf32' => 'utf-32',
242 'utf8' => 'utf-8',
243 'ucs2' => 'ucs-2',
244 'ucs4' => 'ucs-4',
245 );
246
247 // mapping of iso-639:2 language codes to script names
248 var $lang_to_script=array(
249 // iso-639:2 language codes, see:
250 // http://www.w3.org/WAI/ER/IG/ert/iso639.htm
251 // http://www.loc.gov/standards/iso639-2/langcodes.html
252 // http://www.unicode.org/onlinedat/languages.html
253 'ar' => 'arabic',
254 'bg' => 'cyrillic', // Bulgarian
255 'bs' => 'east_european', // Bosnian
256 'cs' => 'east_european', // Czech
257 'da' => 'west_european', // Danish
258 'de' => 'west_european', // German
259 'es' => 'west_european', // Spanish
260 'et' => 'estonian',
261 'eo' => 'unicode', // Esperanto
262 'eu' => 'west_european', // Basque
263 'fa' => 'arabic', // Persian
264 'fi' => 'west_european', // Finish
265 'fo' => 'west_european', // Faroese
266 'fr' => 'west_european', // French
267 'gr' => 'greek',
268 'he' => 'hebrew', // Hebrew (since 1998)
269 'hi' => 'unicode', // Hindi
270 'hr' => 'east_european', // Croatian
271 'hu' => 'east_european', // Hungarian
272 'iw' => 'hebrew', // Hebrew (til 1998)
273 'is' => 'west_european', // Icelandic
274 'it' => 'west_european', // Italian
275 'ja' => 'japanese',
276 'kl' => 'west_european', // Greenlandic
277 'ko' => 'korean',
278 'lt' => 'lithuanian',
279 'lv' => 'west_european', // Latvian/Lettish
280 'nl' => 'west_european', // Dutch
281 'no' => 'west_european', // Norwegian
282 'pl' => 'east_european', // Polish
283 'pt' => 'west_european', // Portuguese
284 'ro' => 'east_european', // Romanian
285 'ru' => 'cyrillic', // Russian
286 'sk' => 'east_european', // Slovak
287 'sl' => 'east_european', // Slovenian
288 'sr' => 'cyrillic', // Serbian
289 'sv' => 'west_european', // Swedish
290 'sq' => 'albanian', // Albanian
291 'th' => 'thai',
292 'uk' => 'cyrillic', // Ukranian
293 'vi' => 'vietnamese',
294 'zh' => 'chinese',
295 // MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
296 // http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
297 'ara' => 'arabic',
298 'bgr' => 'cyrillic', // Bulgarian
299 'cat' => 'west_european', // Catalan
300 'chs' => 'simpl_chinese',
301 'cht' => 'trad_chinese',
302 'csy' => 'east_european', // Czech
303 'dan' => 'west_european', // Danisch
304 'deu' => 'west_european', // German
305 'dea' => 'west_european', // German (Austrian)
306 'des' => 'west_european', // German (Swiss)
307 'ena' => 'west_european', // English (Australian)
308 'enc' => 'west_european', // English (Canadian)
309 'eng' => 'west_european', // English
310 'enz' => 'west_european', // English (New Zealand)
311 'enu' => 'west_european', // English (United States)
312 'euq' => 'west_european', // Basque
313 'fos' => 'west_european', // Faroese
314 'far' => 'arabic', // Persian
315 'fin' => 'west_european', // Finish
316 'fra' => 'west_european', // French
317 'frb' => 'west_european', // French (Belgian)
318 'frc' => 'west_european', // French (Canadian)
319 'frs' => 'west_european', // French (Swiss)
320 'ell' => 'greek',
321 'heb' => 'hebrew',
322 'hin' => 'unicode', // Hindi
323 'hun' => 'east_european', // Hungarian
324 'isl' => 'west_euorpean', // Icelandic
325 'ita' => 'west_european', // Italian
326 'its' => 'west_european', // Italian (Swiss)
327 'jpn' => 'japanese',
328 'kor' => 'korean',
329 'lth' => 'lithuanian',
330 'lvi' => 'west_european', // Latvian/Lettish
331 'msl' => 'west_european', // Malay
332 'nlb' => 'west_european', // Dutch (Belgian)
333 'nld' => 'west_european', // Dutch
334 'nor' => 'west_european', // Norwegian (bokmal)
335 'non' => 'west_european', // Norwegian (nynorsk)
336 'plk' => 'east_european', // Polish
337 'ptg' => 'west_european', // Portuguese
338 'ptb' => 'west_european', // Portuguese (Brazil)
339 'rom' => 'east_european', // Romanian
340 'rus' => 'cyrillic', // Russian
341 'slv' => 'east_european', // Slovenian
342 'sky' => 'east_european', // Slovak
343 'srl' => 'east_european', // Serbian (Latin)
344 'srb' => 'cyrillic', // Serbian (Cyrillic)
345 'esp' => 'west_european', // Spanish (trad. sort)
346 'esm' => 'west_european', // Spanish (Mexican)
347 'esn' => 'west_european', // Spanish (internat. sort)
348 'sve' => 'west_european', // Swedish
349 'sqi' => 'albanian', // Albanian
350 'tha' => 'thai',
351 'trk' => 'turkish',
352 'ukr' => 'cyrillic', // Ukrainian
353 // English language names
354 'albanian' => 'albanian',
355 'arabic' => 'arabic',
356 'basque' => 'west_european',
357 'bosnian' => 'east_european',
358 'bulgarian' => 'east_european',
359 'catalan' => 'west_european',
360 'croatian' => 'east_european',
361 'czech' => 'east_european',
362 'danish' => 'west_european',
363 'dutch' => 'west_european',
364 'english' => 'west_european',
365 'esperanto' => 'unicode',
366 'estonian' => 'estonian',
367 'faroese' => 'west_european',
368 'farsi' => 'arabic',
369 'finnish' => 'west_european',
370 'french' => 'west_european',
371 'galician' => 'west_european',
372 'german' => 'west_european',
373 'greek' => 'greek',
374 'greenlandic' => 'west_european',
375 'hebrew' => 'hebrew',
376 'hindi' => 'unicode',
377 'hungarian' => 'east_european',
378 'icelandic' => 'west_european',
379 'italian' => 'west_european',
380 'latvian' => 'west_european',
381 'lettish' => 'west_european',
382 'lithuanian' => 'lithuanian',
383 'malay' => 'west_european',
384 'norwegian' => 'west_european',
385 'persian' => 'arabic',
386 'polish' => 'east_european',
387 'portuguese' => 'west_european',
388 'russian' => 'cyrillic',
389 'romanian' => 'east_european',
390 'serbian' => 'cyrillic',
391 'slovak' => 'east_european',
392 'slovenian' => 'east_european',
393 'spanish' => 'west_european',
394 'svedish' => 'west_european',
395 'that' => 'thai',
396 'turkish' => 'turkish',
397 'ukrainian' => 'cyrillic',
398 );
399
	// mapping of language (family) names to charsets on Unix
	// Used by get_locale_charset() when TYPO3_OS is not 'WIN'; a missing or empty value
	// (see 'vietnamese') makes that method fall back to 'iso-8859-1'.
var $script_to_charset_unix=array(
	'west_european' => 'iso-8859-1',
	'estonian' => 'iso-8859-1',
	'east_european' => 'iso-8859-2',
	'baltic' => 'iso-8859-4',
	'cyrillic' => 'iso-8859-5',
	'arabic' => 'iso-8859-6',
	'greek' => 'iso-8859-7',
	'hebrew' => 'iso-8859-8',
	'turkish' => 'iso-8859-9',
	'thai' => 'iso-8859-11', // = TIS-620
	'lithuanian' => 'iso-8859-13',
	'chinese' => 'gb2312', // = euc-cn
	'japanese' => 'euc-jp',
	'korean' => 'euc-kr',
	'simpl_chinese' => 'gb2312',
	'trad_chinese' => 'big5',
	'vietnamese' => '',
	'unicode' => 'utf-8',
	'albanian' => 'utf-8'
);
422
	// mapping of language (family) names to charsets on Windows
	// Used by get_locale_charset() when TYPO3_OS is 'WIN'; scripts without an entry here
	// (e.g. 'unicode') get that method's built-in default instead.
var $script_to_charset_windows=array(
	'east_european' => 'windows-1250',
	'cyrillic' => 'windows-1251',
	'west_european' => 'windows-1252',
	'greek' => 'windows-1253',
	'turkish' => 'windows-1254',
	'hebrew' => 'windows-1255',
	'arabic' => 'windows-1256',
	'baltic' => 'windows-1257',
	'estonian' => 'windows-1257',
	'lithuanian' => 'windows-1257',
	'vietnamese' => 'windows-1258',
	'thai' => 'cp874',
	'korean' => 'cp949',
	'chinese' => 'gb2312',
	'japanese' => 'shift_jis',
	'simpl_chinese' => 'gb2312',
	'trad_chinese' => 'big5',
	'albanian' => 'windows-1250'
);
444
445 // mapping of locale names to charsets
446 var $locale_to_charset=array(
447 'japanese.euc' => 'euc-jp',
448 'ja_jp.ujis' => 'euc-jp',
449 'korean.euc' => 'euc-kr',
450 'sr@Latn' => 'iso-8859-2',
451 'zh_cn' => 'gb2312',
452 'zh_hk' => 'big5',
453 'zh_tw' => 'big5',
454 );
455
	// TYPO3 specific: Array with the system charsets used for each system language in TYPO3.
	// Keys are TYPO3 system language keys (not necessarily ISO codes, see $isoArray).
	// An empty value means "iso-8859-1".
var $charSetArray = array(
	'dk' => '',
	'de' => '',
	'no' => '',
	'it' => '',
	'fr' => '',
	'es' => '',
	'nl' => '',
	'cz' => 'windows-1250',
	'pl' => 'iso-8859-2',
	'si' => 'windows-1250',
	'fi' => '',
	'tr' => 'iso-8859-9',
	'se' => '',
	'pt' => '',
	'ru' => 'windows-1251',
	'ro' => 'iso-8859-2',
	'ch' => 'gb2312',
	'sk' => 'windows-1250',
	'lt' => 'windows-1257',
	'is' => 'utf-8',
	'hr' => 'windows-1250',
	'hu' => 'iso-8859-2',
	'gl' => '',
	'th' => 'iso-8859-11',
	'gr' => 'iso-8859-7',
	'hk' => 'big5',
	'eu' => '',
	'bg' => 'windows-1251',
	'br' => '',
	'et' => 'iso-8859-4',
	'ar' => 'iso-8859-6',
	'he' => 'utf-8',
	'ua' => 'windows-1251',
	'jp' => 'shift_jis',
	'lv' => 'utf-8',
	'vn' => 'utf-8',
	'ca' => 'iso-8859-15',
	'ba' => 'iso-8859-2',
	'kr' => 'euc-kr',
	'eo' => 'utf-8',
	'my' => '',
	'hi' => 'utf-8',
	'fo' => 'utf-8',
	'fa' => 'utf-8',
	'sr' => 'utf-8',
	'sq' => 'utf-8'
);
506
	// TYPO3 specific: Array with the ISO names used for each system language in TYPO3.
	// Only languages whose TYPO3 key differs from the ISO name are listed;
	// a missing key means the ISO name is the same as the TYPO3 key.
var $isoArray = array(
	'ba' => 'bs',
	'br' => 'pt_BR',
	'ch' => 'zh_CN',
	'cz' => 'cs',
	'dk' => 'da',
	'si' => 'sl',
	'se' => 'sv',
	'gl' => 'kl',
	'gr' => 'el',
	'hk' => 'zh_HK',
	'kr' => 'ko',
	'ua' => 'uk',
	'jp' => 'ja',
	'vn' => 'vi',
);
525
526 /**
527 * Normalize - changes input character set to lowercase letters.
528 *
529 * @param string Input charset
530 * @return string Normalized charset
531 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
532 */
function parse_charset($charset) {
		// Lowercase first, then strip surrounding whitespace.
	$normalized = trim(strtolower($charset));

		// A known alias is replaced by its canonical name, anything else is passed through.
	return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
}
539
540 /**
541 * Get the charset of a locale.
542 *
543 * ln language
544 * ln_CN language / country
545 * ln_CN.cs language / country / charset
546 * ln_CN.cs@mod language / country / charset / modifier
547 *
548 * @param string Locale string
549 * @return string Charset resolved for locale string
550 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
551 */
function get_locale_charset($locale) {
	$locale = strtolower($locale);

		// exact locale specific charset?
	if (isset($this->locale_to_charset[$locale])) return $this->locale_to_charset[$locale];

		// get modifier (the part after '@'); empty string when the locale has none
	$pieces = explode('@',$locale);
	$locale = $pieces[0];
	$modifier = isset($pieces[1]) ? $pieces[1] : '';

		// locale contains charset: use it
	$pieces = explode('.',$locale);
	$locale = $pieces[0];
	$charset = isset($pieces[1]) ? $pieces[1] : '';
	if ($charset) return $this->parse_charset($charset);

		// modifier is 'euro' (after charset check, because of xx.utf-8@euro)
	if ($modifier == 'euro') return 'iso-8859-15';

		// get language (the part before '_'; the country part is not needed)
	$pieces = explode('_',$locale);
	$language = $pieces[0];

		// Unknown languages yield an empty script name, which selects the OS default below.
	$script = isset($this->lang_to_script[$language]) ? $this->lang_to_script[$language] : '';

	if (TYPO3_OS == 'WIN') {
			// Default was misspelled 'window-1252', which is not a valid charset name.
		$cs = !empty($this->script_to_charset_windows[$script]) ? $this->script_to_charset_windows[$script] : 'windows-1252';
	} else {
		$cs = !empty($this->script_to_charset_unix[$script]) ? $this->script_to_charset_unix[$script] : 'iso-8859-1';
	}

	return $cs;
}
580
581
582
583
584
585
586
587
588
589 /********************************************
590 *
591 * Charset Conversion functions
592 *
593 ********************************************/
594
595 /**
596 * Convert from one charset to another charset.
597 *
598 * @param string Input string
599 * @param string From charset (the current charset of the string)
600 * @param string To charset (the output charset wanted)
601 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
602 * @return string Converted string
603 * @see convArray()
604 */
function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) {
		// Nothing to do when source and target charset are the same.
	if ($fromCS==$toCS) return $str;

		// PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
	if ($toCS=='utf-8' || !$useEntityForNoChar) {
		$method = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'];
		$converted = false;

		if ($method == 'mbstring') {
			$converted = mb_convert_encoding($str,$toCS,$fromCS);	// returns false for unsupported charsets
		} elseif ($method == 'iconv') {
			$converted = iconv($fromCS,$toCS.'//TRANSLIT',$str);
		} elseif ($method == 'recode') {
			$converted = recode_string($fromCS.'..'.$toCS,$str);
		}

			// Use the library result if there is one; otherwise fall back to TYPO3 conversion.
		if (false !== $converted) return $converted;
	}

		// TYPO3 conversion: go through UTF-8 as the intermediate representation.
	if ($fromCS!='utf-8') {
		$str = $this->utf8_encode($str,$fromCS);
	}
	if ($toCS!='utf-8') {
		$str = $this->utf8_decode($str,$toCS,$useEntityForNoChar);
	}
	return $str;
}
633
634 /**
635 * Convert all elements in ARRAY from one charset to another charset.
636 * NOTICE: Array is passed by reference!
637 *
638 * @param string Input array, possibly multidimensional
639 * @param string From charset (the current charset of the string)
640 * @param string To charset (the output charset wanted)
641 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
642 * @return void
643 * @see conv()
644 */
function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0) {
		// The array is modified in place through the reference parameter.
	foreach ($array as $idx => $unused) {
		if (!is_array($array[$idx])) {
				// Leaf value: convert it directly.
			$array[$idx] = $this->conv($array[$idx],$fromCS,$toCS,$useEntityForNoChar);
			continue;
		}
			// Nested array: descend recursively.
		$this->convArray($array[$idx],$fromCS,$toCS,$useEntityForNoChar);
	}
}
654
655 /**
656 * Converts $str from $charset to UTF-8
657 *
658 * @param string String in local charset to convert to UTF-8
659 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
660 * @return string Output string, converted to UTF-8
661 */
function utf8_encode($str,$charset) {

		// Input that is already UTF-8 is returned untouched.
	if ($charset === 'utf-8') return $str;

		// Charset is case-insensitive.
		// NOTE: when the charset cannot be initialised nothing is returned (NULL); callers rely on that.
	if ($this->initCharset($charset)) {	// Parse conv. table if not already...
		$strLen = strlen($str);
		$outStr='';

		for ($a=0;$a<$strLen;$a++) {	// Traverse each byte in string.
			$chr=substr($str,$a,1);
			$ord=ord($chr);
			if (isset($this->twoByteSets[$charset])) {	// If the charset has two bytes per char
					// substr() replaces the former "$str{...}" offset syntax, which PHP 8 refuses to parse.
					// Past the end of the string this yields 0, as before.
				$ord2 = ord(substr($str,$a+1,1));
				$ord = $ord<<8 | $ord2;	// assume big endian

				if (isset($this->parsedCharsets[$charset]['local'][$ord])) {	// If the local char-number was found in parsed conv. table then we use that...
					$outStr.=$this->parsedCharsets[$charset]['local'][$ord];
				} else $outStr.=chr($this->noCharByteVal);	// ... otherwise the "no char" substitute byte
				$a++;	// The second byte has been consumed.
			} elseif ($ord>127) {	// Values above 127 are not plain ASCII and must be looked up in the table
				if (isset($this->eucBasedSets[$charset])) {	// EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
					if ($charset != 'shift_jis' || ($ord < 0xA0 || $ord > 0xDF)) {	// Shift-JIS: chars between 160 and 223 are single byte
						$a++;
						$ord2=ord(substr($str,$a,1));
						$ord = $ord*256+$ord2;
					}
				}

				if (isset($this->parsedCharsets[$charset]['local'][$ord])) {	// If the local char-number was found in parsed conv. table then we use that...
					$outStr.= $this->parsedCharsets[$charset]['local'][$ord];
				} else $outStr.= chr($this->noCharByteVal);	// ... otherwise the "no char" substitute byte
			} else $outStr.= $chr;	// ... otherwise it's just ASCII 0-127 and one byte. Transparent
		}
		return $outStr;
	}
}
699
700 /**
701 * Converts $str from UTF-8 to $charset
702 *
703 * @param string String in UTF-8 to convert to local charset
704 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
705 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
706 * @return string Output string, converted to local charset
707 */
function utf8_decode($str,$charset,$useEntityForNoChar=0) {

		// Charset is case-insensitive.
		// NOTE: when the charset cannot be initialised nothing is returned (NULL).
	if ($this->initCharset($charset)) {	// Parse conv. table if not already...
		$strLen = strlen($str);
		$outStr='';
		$buf='';
		for ($a=0;$a<$strLen;$a++) {	// Traverse each byte in UTF-8 string. (An unused second counter was removed here.)
			$chr=substr($str,$a,1);
			$ord=ord($chr);
			if ($ord>127) {	// This means multibyte! (first byte!)
				if ($ord & 64) {	// Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.

					$buf=$chr;	// Add first byte
					for ($b=0;$b<8;$b++) {	// The number of leading 1-bits in the first byte tells how many bytes follow...
						$ord = $ord << 1;	// Shift it left and ...
						if ($ord & 128) {	// ... test the 8th bit - if that is set, then there are still bytes in sequence.
							$a++;	// Increase pointer...
							$buf.=substr($str,$a,1);	// ... and add the next byte.
						} else break;
					}

					if (isset($this->parsedCharsets[$charset]['utf8'][$buf])) {	// If the UTF-8 char-sequence is found then...
						$mByte = $this->parsedCharsets[$charset]['utf8'][$buf];	// The local number
						if ($mByte>255) {	// If the local number is greater than 255 we will need to split the byte (16bit word assumed) in two chars.
							$outStr.= chr(($mByte >> 8) & 255).chr($mByte & 255);
						} else $outStr.= chr($mByte);
					} elseif ($useEntityForNoChar) {	// Create num entity (hexadecimal form, e.g. &#x20ac;):
						$outStr.='&#'.$this->utf8CharToUnumber($buf,1).';';
					} else $outStr.=chr($this->noCharByteVal);	// No char exists
				} else $outStr.=chr($this->noCharByteVal);	// No char exists (MIDDLE of MB sequence!)
			} else $outStr.=$chr;	// ... otherwise it's just ASCII 0-127 and one byte. Transparent
		}
		return $outStr;
	}
}
744
745 /**
746 * Converts all chars > 127 to numeric entities.
747 *
748 * @param string Input string
749 * @return string Output string
750 */
function utf8_to_entities($str) {
	$byteCount = strlen($str);
	$result = '';

	for ($pos=0; $pos<$byteCount; $pos++) {	// Walk through the string byte by byte
		$byte = substr($str,$pos,1);
		$code = ord($byte);

			// Plain ASCII (0-127) is copied as it is.
		if ($code <= 127) {
			$result .= $byte;
			continue;
		}

			// A lead byte must have its 7th bit set; otherwise we are in the middle of a sequence.
		if (!($code & 64)) {
			$result .= chr($this->noCharByteVal);
			continue;
		}

			// Collect the complete multibyte sequence: one more byte for every further leading 1-bit.
		$sequence = $byte;
		for ($n=0; $n<8; $n++) {
			$code = $code << 1;
			if (!($code & 128)) break;
			$pos++;
			$sequence .= substr($str,$pos,1);
		}

			// Emit the character as a hexadecimal numeric entity.
		$result .= '&#'.$this->utf8CharToUnumber($sequence,1).';';
	}

	return $result;
}
776
777 /**
778 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
779 *
780 * @param string Input string, UTF-8
781 * @param boolean If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
782 * @return string Output string
783 */
function entities_to_utf8($str,$alsoStdHtmlEnt=0) {
	$trans_tbl = array();
	if ($alsoStdHtmlEnt) {
			// Ask explicitly for the ISO-8859-1 table: the values are converted from iso-8859-1 below,
			// and newer PHP versions would otherwise hand out UTF-8 (causing double encoding).
		$trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'ISO-8859-1'));
	}

		// Split on "&name;" / "&#number;" and keep the captured name: even indexes hold plain text,
		// odd indexes hold the entity body (without "&" and ";").
		// (Replaces ereg_replace(), which no longer exists since PHP 7, and the random-token trick.)
	$parts = preg_split('/&([#[:alnum:]]*);/', $str, -1, PREG_SPLIT_DELIM_CAPTURE);
	$match = array();

	foreach($parts as $k => $v) {
		if ($k%2) {
			if (preg_match('/^#[xX]([0-9a-fA-F]+)$/', $v, $match)) {	// Hexadecimal entity:
				$parts[$k] = $this->UnumberToChar(hexdec($match[1]));
			} elseif (preg_match('/^#([0-9]+)$/', $v, $match)) {	// Decimal entity:
				$parts[$k] = $this->UnumberToChar($match[1]);
			} elseif ($alsoStdHtmlEnt && isset($trans_tbl['&'.$v.';'])) {	// Other entities:
				$parts[$k] = $this->utf8_encode($trans_tbl['&'.$v.';'],'iso-8859-1');
			} else {	// No conversion (unknown name or malformed number): restore the original text
				$parts[$k] ='&'.$v.';';
			}
		}
	}

	return implode('',$parts);
}
809
810 /**
811 * Converts all chars in the input UTF-8 string into integer numbers returned in an array
812 *
813 * @param string Input string, UTF-8
814 * @param boolean If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
815 * @param boolean If set, then instead of integer numbers the real UTF-8 char is returned.
816 * @return array Output array with the char numbers
817 */
function utf8_to_numberarray($str,$convEntities=0,$retChar=0) {
		// Optionally turn all entities into real UTF-8 characters first:
	if ($convEntities) {
		$str = $this->entities_to_utf8($str,1);
	}

	$byteCount = strlen($str);
	$result = array();

	for ($pos=0; $pos<$byteCount; $pos++) {	// Walk through the string byte by byte
		$byte = substr($str,$pos,1);
		$code = ord($byte);

			// Plain ASCII (0-127): one byte, one character.
		if ($code <= 127) {
			$result[] = $retChar ? chr($code) : $code;
			continue;
		}

			// A lead byte must have its 7th bit set; otherwise we are in the middle of a sequence.
		if (!($code & 64)) {
			$result[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
			continue;
		}

			// Collect the complete multibyte sequence: one more byte for every further leading 1-bit.
		$sequence = $byte;
		for ($n=0; $n<8; $n++) {
			$code = $code << 1;
			if (!($code & 128)) break;
			$pos++;
			$sequence .= substr($str,$pos,1);
		}

		$result[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
	}

	return $result;
}
848
849 /**
850 * Converts a UNICODE number to a UTF-8 multibyte character
851 * Algorithm based on script found at From: http://czyborra.com/utf/
852 * Unit-tested by Kasper
853 *
854 * The binary representation of the character's integer value is thus simply spread across the bytes and the number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
855 *
856 * bytes | bits | representation
857 * 1 | 7 | 0vvvvvvv
858 * 2 | 11 | 110vvvvv 10vvvvvv
859 * 3 | 16 | 1110vvvv 10vvvvvv 10vvvvvv
860 * 4 | 21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
861 * 5 | 26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
862 * 6 | 31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
863 *
864 * @param integer UNICODE integer
865 * @return string UTF-8 multibyte character string
866 * @see utf8CharToUnumber()
867 */
function UnumberToChar($cbyte) {
		// 7 bit values are a single byte.
	if ($cbyte < 0x80) {
		return chr($cbyte);
	}

		// Pick the lead byte marker and the number of continuation bytes for the value range.
	if ($cbyte < 0x800) {
		$leadMarker = 0xC0;
		$trailCount = 1;
	} elseif ($cbyte < 0x10000) {
		$leadMarker = 0xE0;
		$trailCount = 2;
	} elseif ($cbyte < 0x200000) {
		$leadMarker = 0xF0;
		$trailCount = 3;
	} elseif ($cbyte < 0x4000000) {
		$leadMarker = 0xF8;
		$trailCount = 4;
	} elseif ($cbyte < 0x80000000) {
		$leadMarker = 0xFC;
		$trailCount = 5;
	} else {	// Cannot express a 32-bit character in UTF-8
		return chr($this->noCharByteVal);
	}

		// Lead byte carries the topmost bits ...
	$sequence = chr($leadMarker | ($cbyte >> (6 * $trailCount)));

		// ... and every continuation byte the next 6 bits, most significant group first.
	for ($shift = 6 * ($trailCount - 1); $shift >= 0; $shift -= 6) {
		$sequence .= chr(0x80 | (($cbyte >> $shift) & 0x3F));
	}

	return $sequence;
}
903
/**
 * Decodes the first UTF-8 character of a string into its UNICODE number.
 * Only the lead byte decides how many following bytes are read; the following bytes are not validated.
 *
 * @param	string		UTF-8 multibyte character string
 * @param	boolean		If set, the number is returned as a hex string prefixed with "x" (eg. "x20ac")
 * @return	integer		UNICODE integer (or the "x..." string if $hex is set)
 * @see UnumberToChar()
 */
function utf8CharToUnumber($str,$hex=0) {
	$leadByte = ord(substr($str,0,1));

	if (($leadByte & 0xC0) != 0xC0) {
		// Not the start of a multibyte sequence: the byte value itself is the result.
		$codePoint = $leadByte;
	} else {
		$shifted = $leadByte;
		$tailBits = '';
		$followers = 0;	// number of continuation bytes announced by the lead byte

		// Every additional high bit set in the lead byte announces one more byte in the sequence.
		while ($followers < 8) {
			$shifted <<= 1;
			if (!($shifted & 0x80)) {
				break;
			}
			// Take the six payload bits of the next byte.
			$tailBits .= substr('00000000'.decbin(ord(substr($str,$followers+1,1))),-6);
			$followers++;
		}

		// The lead byte contributes its lowest (6 - number of continuation bytes) bits.
		$headBits = substr('00000000'.decbin($leadByte),-(6-$followers));
		$codePoint = bindec($headBits.$tailBits);
	}

	return $hex ? 'x'.dechex($codePoint) : $codePoint;
}
931
932
933
934
935
936
937
938
939
940 /********************************************
941 *
942 * Init functions
943 *
944 ********************************************/
945
/**
 * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
 * This function is automatically called by the conversion functions
 *
 * The parsed table is stored in $this->parsedCharsets[$charset] as two maps:
 * 'local' (byte value => UTF-8 character) and 'utf8' (UTF-8 character => byte value).
 * Only byte values above 127 are stored.
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param	string		The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return	integer		Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and loaded (from cache or by parsing).
 * @access private
 */
function initCharset($charset) {
		// Only process if the charset is not yet loaded:
	if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
		return 1;
	}

		// Conversion table filename:
	$charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';

		// Give up if the conversion table is not found:
	if (!($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile))) {
		return false;
	}

		// Cache file for charsets:
		// Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->parsedCharsets[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
			// A cache file that does not unserialize to an array is ignored and the table is parsed again below.
		if (is_array($this->parsedCharsets[$charset])) {
			return 2;
		}
	}

		// Parse conversion table into lines:
	$lines = t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1);
		// Initialize the internal variable holding the conv. table:
	$this->parsedCharsets[$charset] = array('local'=>array(),'utf8'=>array());

		// The "whitespaced" type is on the syntax "0x0A	0x000A	#LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
	$whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
	$detectedType = '';

	foreach ($lines as $value) {
			// Comment line or blanks are ignored.
		if (!trim($value) || substr($value,0,1)=='#') {
			continue;
		}

			// Detect type if not done yet: (Done on first real line)
		if (!$detectedType) {
			$detectedType = preg_match($whitespacedPattern,$value) ? 'whitespaced' : 'ms-token';
		}

		if ($detectedType=='ms-token') {
			$parts = preg_split('/=|:/',$value,3);
			$hexbyte = $parts[0];
			$utf8 = isset($parts[1]) ? $parts[1] : '';
		} else {
			$regA = array();
				// A line that does not follow the detected syntax carries no mapping.
			if (!preg_match($whitespacedPattern,$value,$regA)) {
				continue;
			}
			$hexbyte = $regA[1];
			$utf8 = 'U+'.$regA[2];
		}

		$decval = hexdec(trim($hexbyte));
		if ($decval>127) {
				// Strip the leading "U+" before converting the Unicode number.
			$utf8decval = hexdec(substr(trim($utf8),2));
			$utf8Char = $this->UnumberToChar($utf8decval);
			$this->parsedCharsets[$charset]['local'][$decval] = $utf8Char;
			$this->parsedCharsets[$charset]['utf8'][$utf8Char] = $decval;
		}
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->parsedCharsets[$charset]));
	}

	return 2;
}
1008
/**
 * This function initializes all UTF-8 character data tables.
 *
 * For the modes "case" and "ascii" an already loaded table or a cached table is used if available.
 * Otherwise both tables ($this->caseFolding['utf-8'] and $this->toASCII['utf-8']) are rebuilt from
 * the files in PATH_t3lib.'unidata/' and written to their cache files.
 *
 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
 *
 * @param	string		Mode ("case", "ascii", ...)
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initUnicodeData($mode=null) {
		// cache files
	$cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
	$cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');

		// Only process if the tables are not yet loaded
	switch($mode) {
		case 'case':
			if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8'])) return 1;

				// Use cached version if possible
			if ($cacheFileCase && @is_file($cacheFileCase)) {
				$this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase));
				return 2;
			}
		break;

		case 'ascii':
			if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8'])) return 1;

				// Use cached version if possible
			if ($cacheFileASCII && @is_file($cacheFileASCII)) {
				$this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII));
				return 2;
			}
		break;
	}

		// process main Unicode data file
	$unicodeDataFile = PATH_t3lib.'unidata/UnicodeData.txt';
	if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) return false;

	$fh = fopen($unicodeDataFile,'rb');
	if (!$fh) return false;

		// key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
		// note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
	$this->caseFolding['utf-8'] = array();
	$utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand
	$utf8CaseFolding['toUpper'] = array();
	$utf8CaseFolding['toLower'] = array();
	$utf8CaseFolding['toTitle'] = array();

	$decomposition = array();	// array of temp. decompositions
	$mark = array();		// array of chars that are marks (eg. composing accents)
	$number = array();		// array of chars that are numbers (eg. digits)
	$omit = array();		// array of chars to be omitted (eg. Russian hard sign)

	while (!feof($fh)) {
		$line = fgets($fh,4096);
		if ($line === false) break;	// nothing more to read

		$line = rtrim($line);
		if ($line === '') continue;	// blank lines carry no data

			// has a lot of info; the fields are addressed by their position in the line
		$fields = array_pad(explode(';',$line),15,'');
		$char = $fields[0];
		$name = $fields[1];
		$cat = $fields[2];
		$decomp = $fields[5];
		$num = $fields[8];
		$upper = $fields[12];
		$lower = $fields[13];
		$title = $fields[14];

		$ord = hexdec($char);
		if ($ord > 0xFFFF) break;	// only process the BMP

		$utf8_char = $this->UnumberToChar($ord);

		if ($upper) $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
		if ($lower) $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
			// store "title" only when different from "upper" (only a few)
		if ($title && $title != $upper) $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));

			// The first letter of the category field decides between mark and number
		$catClass = substr($cat,0,1);
		if ($catClass === 'M') {	// mark (accent, umlaut, ...)
			$mark["U+$char"] = 1;
		} elseif ($catClass === 'N') {	// numeric value
			if ($ord > 0x80 && $num != '') $number["U+$char"] = $num;
		}

			// accented Latin letters without "official" decomposition
		$match = array();
		if (!$decomp && preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/',$name,$match)) {
			$c = ord($match[2]);
			if ($match[1] == 'SMALL') $c += 32;

			$decomposition["U+$char"] = array(dechex($c));
			continue;
		}

		$match = array();
		if (preg_match('/(<.*>)? *(.+)/',$decomp,$match)) {
			switch($match[1]) {
				case '<circle>':	// add parenthesis as circle replacement, eg (1)
					$match[2] = '0028 '.$match[2].' 0029';
				break;

				case '<square>':	// add square brackets as square replacement, eg [1]
					$match[2] = '005B '.$match[2].' 005D';
				break;

				case '<compat>':	// ignore multi char decompositions that start with a space
					if (strncmp($match[2],'0020 ',5) == 0) continue 2;
				break;

					// ignore Arabic and vertical layout presentation decomposition
				case '<initial>':
				case '<medial>':
				case '<final>':
				case '<isolated>':
				case '<vertical>':
					continue 2;
			}
			$decomposition["U+$char"] = explode(' ',$match[2]);
		}
	}
	fclose($fh);

		// process additional Unicode data for casing (allow folded characters to expand into a sequence)
	$specialCasingFile = PATH_t3lib.'unidata/SpecialCasing.txt';
	if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
		$fh = fopen($specialCasingFile,'rb');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh,4096);
				if ($line === false) break;
					// skip blank lines and comment lines
				if (trim($line) == '' || $line[0] == '#') continue;

				list($char,$lower,$title,$upper,$cond) = array_pad(t3lib_div::trimExplode(';',$line),5,'');
					// Only unconditional mappings are used (fifth field empty or already the trailing comment)
				if ($cond != '' && $cond[0] != '#') continue;

				$utf8_char = $this->UnumberToChar(hexdec($char));

					// Collect the mappings that differ from the character itself
				$targets = array();
				if ($char != $lower) $targets['toLower'] = $lower;
				if ($char != $title && $title != $upper) $targets['toTitle'] = $title;
				if ($char != $upper) $targets['toUpper'] = $upper;

				foreach ($targets as $table => $sequence) {
						// A mapping may expand into a sequence of several codepoints
					$utf8Sequence = '';
					foreach (explode(' ',$sequence) as $hexCode) {
						$utf8Sequence .= $this->UnumberToChar(hexdec($hexCode));
					}
					$utf8CaseFolding[$table][$utf8_char] = $utf8Sequence;
				}
			}
			fclose($fh);
		}
	}

		// process custom decompositions
	$customTranslitFile = PATH_t3lib.'unidata/Translit.txt';
	if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
		$fh = fopen($customTranslitFile,'rb');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh,4096);
				if ($line === false) break;
					// skip blank lines and comment lines
				if (trim($line) == '' || $line[0] == '#') continue;

				list($char,$translit) = array_pad(t3lib_div::trimExplode(';',$line),2,'');
				if (!$translit) $omit["U+$char"] = 1;
				$decomposition["U+$char"] = explode(' ',$translit);
			}
			fclose($fh);
		}
	}

		// decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
	foreach($decomposition as $from => $to) {
		$code_decomp = array();

		while ($code_value = array_shift($to)) {
			if (isset($decomposition["U+$code_value"])) {	// do recursive decomposition
				foreach(array_reverse($decomposition["U+$code_value"]) as $cv) {
					array_unshift($to,$cv);
				}
			} elseif (!isset($mark["U+$code_value"])) {	// remove mark
				array_push($code_decomp,$code_value);
			}
		}
		if (count($code_decomp) || isset($omit[$from])) {
			$decomposition[$from] = $code_decomp;
		} else {
			unset($decomposition[$from]);
		}
	}

		// create ascii only mapping
	$this->toASCII['utf-8'] = array();
	$ascii =& $this->toASCII['utf-8'];

	foreach($decomposition as $from => $to) {
		$code_decomp = array();
		while ($code_value = array_shift($to)) {
			$ord = hexdec($code_value);
			if ($ord > 127)
				continue 2;	// skip decompositions containing non-ASCII chars
			else
				array_push($code_decomp,chr($ord));
		}
			// $from has the form "U+XXXX"; strip the prefix before converting the hex number
		$ascii[$this->UnumberToChar(hexdec(substr($from,2)))] = join('',$code_decomp);
	}

		// add numeric decompositions
	foreach($number as $from => $to) {
		$utf8_char = $this->UnumberToChar(hexdec(substr($from,2)));
		if (!isset($ascii[$utf8_char])) {
			$ascii[$utf8_char] = $to;
		}
	}

	if ($cacheFileCase) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileCase,serialize($utf8CaseFolding));
	}

	if ($cacheFileASCII) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileASCII,serialize($ascii));
	}

	return 3;
}
1234
/**
 * Builds the case folding table of a charset other than UTF-8.
 * Called automatically by the case folding functions.
 * The table is derived from the UTF-8 case folding table by converting every character of the
 * charset's conversion table (and its case mappings) back to the charset.
 *
 * @param	string		Charset for which to initialize case folding.
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initCaseFolding($charset) {
		// Nothing to do when the table is in memory already:
	if (is_array($this->caseFolding[$charset])) {
		return 1;
	}

		// Prefer the cached table:
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

		// Both the charset's UTF-8 conversion table and the UTF-8 case folding (base table) are required:
	if (!$this->initCharset($charset) || !$this->initUnicodeData('case')) {
		return false;
	}

	$nochar = chr($this->noCharByteVal);
	$directions = array('toUpper','toLower','toTitle');

	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
			// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8,$charset);

		foreach ($directions as $direction) {
			$cc = $this->utf8_decode($this->caseFolding['utf-8'][$direction][$utf8],$charset);
				// Keep only mappings that can be expressed in the charset
			if (!($cc == '' || $cc == $nochar)) {
				$this->caseFolding[$charset][$direction][$c] = $cc;
			}
		}
	}

		// add the ASCII case table
	$lowerA = ord('a');
	$lowerZ = ord('z');
	for ($code = $lowerA; $code <= $lowerZ; $code++) {
		$this->caseFolding[$charset]['toUpper'][chr($code)] = chr($code-32);
	}
	$upperA = ord('A');
	$upperZ = ord('Z');
	for ($code = $upperA; $code <= $upperZ; $code++) {
		$this->caseFolding[$charset]['toLower'][chr($code)] = chr($code+32);
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->caseFolding[$charset]));
	}

	return 3;
}
1296
/**
 * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
 * This function is automatically called by the ASCII transliteration functions.
 * The table is derived from the UTF-8 to-ASCII table: every character of the charset's conversion
 * table that has a UTF-8 transliteration gets the same transliteration.
 *
 * @param	string		Charset for which to initialize conversion.
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initToASCII($charset) {
		// Only process if the to-ASCII table is not yet loaded:
	if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) return 1;

		// Use cached version if possible
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->toASCII[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

		// init UTF-8 conversion for this charset
	if (!$this->initCharset($charset)) {
		return false;
	}

		// UTF-8/ASCII transliteration is used as the base conversion table
	if (!$this->initUnicodeData('ascii')) {
		return false;
	}

		// Start from an empty table so that the result is an array even when no character has a transliteration
		// (otherwise NULL would be cached and the "already loaded" check above could never succeed)
	$this->toASCII[$charset] = array();

	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
			// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8,$charset);

		if (isset($this->toASCII['utf-8'][$utf8])) {
			$this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
		}
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->toASCII[$charset]));
	}

	return 3;
}
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358 /********************************************
1359 *
1360 * String operation functions
1361 *
1362 ********************************************/
1363
/**
 * Returns a part of a string.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * Depending on $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'] the work is delegated to mbstring or iconv.
 * Otherwise UTF-8 and EUC based charsets are handled by the internal functions, two- and four-byte
 * charsets by scaling the positions, and everything else is treated as a single-byte charset.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Start position (character position)
 * @param	integer		Length (in characters). If omitted (NULL), the rest of the string is returned.
 * @return	string		The substring
 * @see substr(), mb_substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function substr($charset,$string,$start,$len=null) {
	if ($len===0) return '';

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
			// cannot omit $len, when specifying charset
		if ($len==null) {
			$enc = mb_internal_encoding();	// save internal encoding
			mb_internal_encoding($charset);
			$str = mb_substr($string,$start);
			mb_internal_encoding($enc);	// restore internal encoding

			return $str;
		}
		return mb_substr($string,$start,$len,$charset);
	}

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
			// cannot omit $len, when specifying charset
		if ($len==null) {
			$enc = iconv_get_encoding('internal_encoding');	// save internal encoding
			iconv_set_encoding('internal_encoding',$charset);
			$str = iconv_substr($string,$start);
			iconv_set_encoding('internal_encoding',$enc);	// restore internal encoding

			return $str;
		}
		return iconv_substr($string,$start,$len,$charset);
	}

	if ($charset == 'utf-8') {
		return $this->utf8_substr($string,$start,$len);
	}
	if (!empty($this->eucBasedSets[$charset])) {
		return $this->euc_substr($string,$start,$charset,$len);
	}

		// Fixed-width multibyte charsets: scale character positions to byte positions
	if (!empty($this->twoByteSets[$charset])) {
		$width = 2;
	} elseif (!empty($this->fourByteSets[$charset])) {
		$width = 4;
	} else {
			// treat everything else as single-byte encoding
		return $len === NULL ? substr($string,$start) : substr($string,$start,$len);
	}

		// An omitted length must not be scaled (NULL*$width is 0, which would yield an empty string)
	return $len === NULL ? substr($string,$start*$width) : substr($string,$start*$width,$len*$width);
}
1418
/**
 * Returns the length of a string in characters.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * mbstring or iconv are used when configured in $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'];
 * otherwise the internal UTF-8/EUC counters or the byte length (scaled for two- and four-byte
 * charsets) are used.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @return	integer		The number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strlen($charset,$string) {
	$utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];

	if ($utils == 'mbstring') {
		return mb_strlen($string,$charset);
	}
	if ($utils == 'iconv') {
		return iconv_strlen($string,$charset);
	}
	if ($charset == 'utf-8') {
		return $this->utf8_strlen($string);
	}
	if ($this->eucBasedSets[$charset]) {
		return $this->euc_strlen($string,$charset);
	}

	$byteCount = strlen($string);
	if ($this->twoByteSets[$charset]) {
		return $byteCount/2;
	}
	if ($this->fourByteSets[$charset]) {
		return $byteCount/4;
	}

		// everything else counts as single-byte encoding
	return $byteCount;
}
1446
/**
 * Truncates a string and pre-/appends a string.
 * Unit tested by Kasper
 *
 * A positive length keeps the first $len characters and appends the crop signifier,
 * a negative length keeps the last abs($len) characters and prepends it.
 * The string is returned unchanged if $len is zero or if nothing would be cut off.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Length (in characters)
 * @param	string		Crop signifier
 * @return	string		The shortened string
 * @see substr(), mb_strimwidth()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function crop($charset,$string,$len,$crop='') {
	if (intval($len) == 0) return $string;

		// Translate the character position into a byte position ($i)
	if ($charset == 'utf-8') {
		$i = $this->utf8_char2byte_pos($string,$len);
	} elseif (!empty($this->eucBasedSets[$charset])) {
		$i = $this->euc_char2byte_pos($string,$len,$charset);
	} else {
		if ($len > 0) {
			$i = $len;
		} else {
			$i = strlen($string)+$len;
			if ($i<=0) $i = false;
		}
	}

	if ($i === false) {	// $len outside actual string length
		return $string;
	}

	$size = strlen($string);
	if ($len > 0) {
			// Crop only if there is at least one byte at the cut position, ie. something gets removed
		if ($i >= 0 && $i < $size) {
			return substr($string,0,$i).$crop;
		}
	} else {
			// Crop only if there is a byte right before the cut position
		if ($i >= 1 && $i <= $size) {
			return $crop.substr($string,$i);
		}
	}

	return $string;
}
1501
/**
 * Cuts a string short at a given byte length.
 * For multibyte charsets the cut position is moved back so that no character is split.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		The byte length
 * @return	string		The shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strtrunc($charset,$string,$len) {
	if ($len <= 0) return '';

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strcut($string,0,$len,$charset);
	}
	if ($charset == 'utf-8') {
		return $this->utf8_strtrunc($string,$len);
	}
	if (!empty($this->eucBasedSets[$charset])) {
			// The byte length has to be passed on, otherwise the EUC function cannot know where to cut.
			// NOTE(review): argument order ($str,$len,$charset) follows euc_char2byte_pos() as called in crop() - confirm against euc_strtrunc()
		return $this->euc_strtrunc($string,$len,$charset);
	}

	if (!empty($this->twoByteSets[$charset])) {
		if ($len % 2) $len--;	// don't cut at odd positions
	} elseif (!empty($this->fourByteSets[$charset])) {
		$len -= $len % 4;	// realign to position dividable by four
	}

		// treat everything else as single-byte encoding
	return substr($string,0,$len);
}
1530
/**
 * Converts all characters of a string to lower or upper case.
 * In contrast to strtolower() and strtoupper() the result does not depend on the locale.
 * Note that the string length may change!
 * eg. lower case German sharp S (U+00DF) becomes upper case "SS"
 * Unit-tested by Kasper
 * Real case folding is language dependent, this method ignores this fact.
 *
 * @param	string		Character set of string
 * @param	string		Input string to convert case for
 * @param	string		Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
 * @return	string		The converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper()
 */
function conv_case($charset,$string,$case) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return $case == 'toLower'
			? mb_strtolower($string,$charset)
			: mb_strtoupper($string,$charset);
	}

	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string,'case',$case);
	}

	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string,$charset,'case',$case);
	}

		// all remaining charsets are handled as single-byte encodings
	return $this->sb_char_mapping($string,$charset,'case',$case);
}
1564
/**
 * Replaces special characters (accented letters, umlauts etc.) with ASCII equivalents.
 * The replacement is often longer than the original, eg. a-umlaut (U+00E4) => "ae".
 *
 * @param	string		Character set of string
 * @param	string		Input string to convert
 * @return	string		The converted string
 */
function specCharsToASCII($charset,$string) {
	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string,'ascii');
	}

	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string,$charset,'ascii');
	}

		// all remaining charsets are handled as single-byte encodings
	return $this->sb_char_mapping($string,$charset,'ascii');
}
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596 /********************************************
1597 *
1598 * Internal string operation functions
1599 *
1600 ********************************************/
1601
/**
 * Maps all characters of a string in a single byte charset.
 * Bytes without an entry in the mapping table are copied unchanged.
 *
 * @param	string		the string
 * @param	string		the charset
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string (the unchanged input if the mode is unknown or the table cannot be initialized)
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function sb_char_mapping($str,$charset,$mode,$opt='') {
	switch($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) return $str;	// do nothing
			$map =& $this->caseFolding[$charset][$opt];
		break;

		case 'ascii':
			if (!$this->initToASCII($charset)) return $str;	// do nothing
			$map =& $this->toASCII[$charset];
		break;

		default:
			return $str;
	}

	$str = (string)$str;
	$size = strlen($str);
	$out = '';

		// Iterate over the actual byte length instead of probing for an offset beyond the end of the string
	for ($i=0; $i<$size; $i++) {
		$c = $str[$i];
		$out .= isset($map[$c]) ? $map[$c] : $c;
	}

	return $out;
}
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650 /********************************************
1651 *
1652 * Internal UTF-8 string operation functions
1653 *
1654 ********************************************/
1655
/**
 * Returns a part of a UTF-8 string.
 * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
 *
 * @param	string		UTF-8 string
 * @param	integer		Start position (character position)
 * @param	integer		Length (in characters). If omitted (NULL), the rest of the string is returned.
 * @return	string		The substring. FALSE if a positive $start lies outside the string.
 * @see substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_substr($str,$start,$len=null) {
		// A length of zero (0 or "0") always yields an empty string
	if ($len !== null && !strcmp($len,'0')) return '';

	$byte_start = $this->utf8_char2byte_pos($str,$start);
	if ($byte_start === false) {
		if ($start > 0) {
			return false;	// $start outside string length
		}
			// A zero or negative start that cannot be resolved means: begin at the first byte
		$byte_start = 0;
	}

	$str = substr($str,$byte_start);

	if ($len != null) {
		$byte_end = $this->utf8_char2byte_pos($str,$len);
		if ($byte_end === false) {	// $len outside actual string length
			return $len<0 ? '' : $str;	// When length is less than zero and exceeds, then we return blank string.
		}
		return substr($str,0,$byte_end);
	}

	return $str;
}
1690
/**
 * Counts the number of characters of a string in UTF-8.
 * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
 *
 * Every byte that is not a continuation byte (10xxxxxx) starts a character, ie. single bytes
 * (0xxxxxxx) and multi-byte starting bytes (11xxxxxx) are counted.
 *
 * @param	string		UTF-8 multibyte character string
 * @return	integer		The number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strlen($str) {
	$str = (string)$str;
	$size = strlen($str);
	$n = 0;

		// Iterate over the actual byte length instead of probing for an offset beyond the end of the string
	for ($i=0; $i<$size; $i++) {
		if ((ord($str[$i]) & 0xC0) != 0x80) {
			$n++;
		}
	}

	return $n;
}
1711
/**
 * Truncates a string in UTF-8 short at a given byte length.
 * If the cut would split a multibyte character, the whole character is dropped.
 *
 * @param	string		UTF-8 multibyte character string
 * @param	integer		the byte length
 * @return	string		the shortened string (empty if the byte length is zero or negative)
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strtrunc($str,$len) {
	$str = (string)$str;
	$size = strlen($str);

	if ($len <= 0) return '';
	if ($len > $size) return $str;	// nothing to cut

	$i = $len-1;
	if (ord($str[$i]) & 0x80) {	// part of a multibyte sequence
			// walk back to the starting byte of the sequence (the first byte having bit 0x40 set)
		for (; $i>0 && !(ord($str[$i]) & 0x40); $i--) ;

			// sanity check: reached the start of the string without finding a starting byte.
			// A starting byte located at offset 0 is valid and must not be rejected.
		if (!(ord($str[$i]) & 0x40)) return '';

		for ($bc=0, $mbs=ord($str[$i]); $mbs & 0x80; $mbs = $mbs << 1) $bc++;	// calculate number of bytes
		if ($bc+$i > $len) return substr($str,0,$i);
			// fallthru: multibyte char fits into length
	}

	return substr($str,0,$len);
}
1732
/**
 * Finds the first occurrence of a string within another string; both are UTF-8 encoded.
 * mbstring or iconv are used when configured in $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'],
 * otherwise a byte search is done and the byte position is translated into a character position.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 string to search for
 * @param	integer		Position (in characters) to start the search
 * @return	integer		The character position, FALSE if the offset is beyond the string or the needle was not found
 * @see strpos()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strpos($haystack,$needle,$offset=0) {
	switch ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) {
		case 'mbstring':
			return mb_strpos($haystack,$needle,$offset,'utf-8');
		case 'iconv':
			return iconv_strpos($haystack,$needle,$offset,'utf-8');
	}

	$startByte = $this->utf8_char2byte_pos($haystack,$offset);
	if ($startByte !== false) {
		$foundByte = strpos($haystack,$needle,$startByte);
		if ($foundByte !== false) {
			return $this->utf8_byte2char_pos($haystack,$foundByte);
		}
	}

		// offset beyond string length or needle not found
	return false;
}
1758
/**
 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
 * mbstring or iconv are used when configured in $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'],
 * otherwise a byte search is done and the byte position is translated into a character position.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 character to search for (single character)
 * @return	integer		The character position, FALSE if the needle was not found
 * @see strrpos()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strrpos($haystack,$needle) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
			// Since PHP 5.2 the third parameter of mb_strrpos() is the offset; the encoding is the fourth one.
			// Older versions only know the three parameter form with the encoding in third position.
		if (version_compare(PHP_VERSION,'5.2.0','>=')) {
			return mb_strrpos($haystack,$needle,0,'utf-8');
		}
		return mb_strrpos($haystack,$needle,'utf-8');
	} elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
		return iconv_strrpos($haystack,$needle,'utf-8');
	}

	$byte_pos = strrpos($haystack,$needle);
	if ($byte_pos === false) return false;	// needle not found

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
1780
/**
 * Translates a character position into an 'absolute' byte position.
 * Unit tested by Kasper.
 *
 * For a positive position the string is scanned forwards from its start, for a negative
 * position backwards from its end.
 *
 * @param	string		UTF-8 string
 * @param	integer		Character position (negative values start from the end)
 * @return	integer		Byte position, FALSE if the scan leaves the string (offset beyond string length)
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char2byte_pos($str,$pos) {
	$size = strlen($str);
	$n = 0;			// number of characters found
	$p = abs($pos);		// number of characters wanted

	if ($pos >= 0) {
		$i = 0;
		$d = 1;
	} else {
		$i = $size-1;
		$d = -1;
	}

		// The bounds are checked explicitly: a negative string offset must never be read,
		// because it addresses bytes counted from the end of the string since PHP 7.1.
	for ( ; $i>=0 && $i<$size && $n<$p; $i+=$d) {
			// single bytes (0xxxxxxx) and multi-byte starting bytes (11xxxxxx) start a character
		if ((ord($str[$i]) & 0xC0) != 0x80) {
			$n++;
		}
	}
	if ($i<0 || $i>=$size) return false;	// offset beyond string length

	if ($pos >= 0) {
			// skip trailing multi-byte data bytes
		while ($i<$size && (ord($str[$i]) & 0xC0) == 0x80) { $i++; }
	} else {
			// correct offset
		$i++;
	}

	return $i;
}
1821
1822 /**
1823 * Translates an 'absolute' byte position into a character position.
1824 * Unit tested by Kasper.
1825 *
1826 * @param string UTF-8 string
1827 * @param integer byte position
1828 * @return integer character position
1829 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1830 */
function utf8_byte2char_pos($str,$pos) {
		// Translates the byte position $pos of the UTF-8 string $str into a character
		// position by counting the character start bytes in the range 1..$pos.
		// Returns false for an empty string or a position outside 0..strlen($str).
	$len = strlen($str);
	if ($len == 0 || $pos < 0 || $pos > $len) return false;	// offset outside the string

	$n = 0;	// number of characters
	$i = $pos;
	if ($i == $len) {
			// The position right behind the last byte is a character boundary, too.
		$n++;
		$i--;
	}
	for ( ; $i>0; $i--) {
			// every byte that is not a continuation byte (10xxxxxx) starts a character
		if ((ord($str[$i]) & 0xC0) != 0x80) $n++;
	}

	return $n;
}
1844
1845 /**
1846 * Maps all characters of an UTF-8 string.
1847 *
1848 * @param string UTF-8 string
1849 * @param string mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
1850 * @param string 'case': conversion 'toLower' or 'toUpper'
1851 * @return string the converted string
1852 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1853 */
function utf8_char_mapping($str,$mode,$opt='') {
		// Replaces every character of the UTF-8 string $str that has an entry in the
		// selected mapping table ('case' folding or 'ascii' transliteration).
		// Characters without an entry are copied unchanged.
	if (!$this->initUnicodeData($mode)) return $str;	// do nothing

	switch($mode) {
		case 'case':
			$map =& $this->caseFolding['utf-8'][$opt];
			break;

		case 'ascii':
			$map =& $this->toASCII['utf-8'];
			break;

		default:
			return $str;
	}

	$out = '';
	$len = strlen($str);
	for ($i=0; $i<$len; $i++) {
		$c = ord($str[$i]);
		if (($c & 0xC0) == 0xC0) {	// multi-byte starting byte (11xxxxxx)
			for ($bc=0; $c & 0x80; $c = $c << 1) { $bc++; }	// calculate number of bytes
			$mbc = substr($str,$i,$bc);
			$i += $bc-1;
		} else {
				// Single-byte character (0xxxxxxx), or a stray continuation byte in malformed
				// input. The latter is passed through as is rather than repeating the previous character.
			$mbc = $str[$i];
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908 /********************************************
1909 *
1910 * Internal EUC string operation functions
1911 *
1912 * Extended Unix Code:
1913 * ASCII-compatible 7-bit single-byte chars
1914 * 8-bit two-byte chars
1915 *
1916 * Shift-JIS is treated as a special case.
1917 *
1918 ********************************************/
1919
1920 /**
1921 * Cuts a string in the EUC charset family short at a given byte length.
1922 *
1923 * @param string EUC multibyte character string
1924 * @param integer the byte length
1925 * @param string the charset
1926 * @return string the shortened string
1927 * @see mb_strcut()
1928 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1929 */
function euc_strtrunc($str,$len,$charset) {
		// Cuts $str to at most $len bytes without splitting a double-byte character.
		// Strings that already fit are returned unchanged.
	$byteLen = strlen($str);
	if ($byteLen == 0 || $len >= $byteLen) return $str;	// string not longer than supplied length

	$sjis = ($charset == 'shift_jis');
		// Step through the string character by character until $len is reached or passed.
		// $len < $byteLen holds here, so $i always stays inside the string.
	for ($i=0; $i<$len; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++;	// advance a double-byte char
		} else {
			if ($c >= 0x80) $i++;	// advance a double-byte char
		}
	}

	if ($i > $len) {
			// Byte $len-1 is the first byte of a double-byte character: drop it as well.
		return substr($str,0,$len-1);
	}
	return substr($str,0,$len);
}
1948
1949 /**
1950 * Returns a part of a string in the EUC charset family.
1951 *
1952 * @param string EUC multibyte character string
1953 * @param integer start position (character position)
1954 * @param string the charset
1955 * @param integer length (in characters)
1956 * @return string the substring, or false if the start position lies outside the string
1957 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1958 */
function euc_substr($str,$start,$charset,$len=null) {
		// Returns the part of $str that begins at character position $start and is at
		// most $len characters long; without $len everything up to the end is returned.
	$byte_start = $this->euc_char2byte_pos($str,$start,$charset);
	if ($byte_start === false) return false;	// $start outside string length

	$str = substr($str,$byte_start);

		// Only a missing length means "up to the end". An explicit length of 0 must
		// yield an empty string, so the check has to be strict. '' and false are
		// still treated as "not given", as before.
	if ($len === null || $len === '' || $len === false) return $str;

	$byte_end = $this->euc_char2byte_pos($str,$len,$charset);
	if ($byte_end === false) return $str;	// $len outside actual string length

	return substr($str,0,$byte_end);
}
1974
1975 /**
1976 * Counts the number of characters of a string in the EUC charset family.
1977 *
1978 * @param string EUC multibyte character string
1979 * @param string the charset
1980 * @return integer the number of characters
1981 * @see strlen()
1982 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1983 */
function euc_strlen($str,$charset) {
		// Counts the characters of $str: a lead byte together with the byte that
		// follows it counts as one character, every other byte counts on its own.
	$sjis = ($charset == 'shift_jis');
	$byteLen = strlen($str);	// explicit bound instead of probing offsets behind the end of the string
	$n = 0;
	for ($i=0; $i<$byteLen; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++;	// advance a double-byte char
		} else {
			if ($c >= 0x80) $i++;	// advance a double-byte char
		}

		$n++;
	}

	return $n;
}
2001
2002 /**
2003 * Translates a character position into an 'absolute' byte position.
2004 *
2005 * @param string EUC multibyte character string
2006 * @param integer character position (negative values start from the end)
2007 * @param string the charset
2008 * @return integer byte position, or false if the character position lies outside the string
2009 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
2010 */
function euc_char2byte_pos($str,$pos,$charset) {
		// Translates the character position $pos of $str into a byte position.
		// Negative positions count characters from the end of the string.
		// Returns false if the position lies outside the string.
	$sjis = ($charset == 'shift_jis');
	$byteLen = strlen($str);

	if ($pos < 0) {
			// The string is never scanned backward: a Shift-JIS trail byte can have the
			// value of a lead byte or of an ASCII character, so character boundaries can
			// only be found reliably from the start. Count all characters and turn the
			// negative position into the equivalent position from the beginning.
		$total = 0;
		for ($i=0; $i<$byteLen; $i++) {
			$c = ord($str[$i]);
			if ($sjis) {
				if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++;	// advance a double-byte char
			} else {
				if ($c >= 0x80) $i++;	// advance a double-byte char
			}
			$total++;
		}

		$p = $total + $pos;	// number of characters to skip from the beginning
		if ($p < 0) return false;	// offset before the beginning of the string
		if ($p == 0) return 0;		// exactly the first character
	} else {
		$p = $pos;
	}

	$n = 0;	// number of characters seen
	for ($i=0; $i<$byteLen && $n<$p; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++;	// advance a double-byte char
		} else {
			if ($c >= 0x80) $i++;	// advance a double-byte char
		}

		$n++;
	}
	if ($i >= $byteLen) return false;	// offset beyond string length

	return $i;
}
2041
2042 /**
2043 * Maps all characters of a string in the EUC charset family.
2044 *
2045 * @param string EUC multibyte character string
2046 * @param string the charset
2047 * @param string mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
2048 * @param string 'case': conversion 'toLower' or 'toUpper'
2049 * @return string the converted string
2050 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
2051 */
function euc_char_mapping($str,$charset,$mode,$opt='') {
		// Replaces every character of $str that has an entry in the mapping table
		// selected by $mode ('case' folding or 'ascii' transliteration) for $charset.
		// Characters without an entry are copied unchanged.
	switch($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) return $str;	// do nothing
			$map =& $this->caseFolding[$charset][$opt];
			break;

		case 'ascii':
			if (!$this->initToASCII($charset)) return $str;	// do nothing
			$map =& $this->toASCII[$charset];
			break;

		default:
			return $str;
	}

	$sjis = ($charset == 'shift_jis');
	$byteLen = strlen($str);	// explicit bound instead of probing offsets behind the end of the string
	$out = '';
	for ($i=0; $i<$byteLen; $i++) {
		$mbc = $str[$i];
		$c = ord($mbc);

			// A lead byte forms one character together with the byte that follows it.
		if ($sjis) {
			$isLead = (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0));
		} else {
			$isLead = ($c >= 0x80);
		}
		if ($isLead) {	// a double-byte char
			$mbc = substr($str,$i,2);
			$i++;
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
2096
2097 }
2098
2099 if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']) {
2100 include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
2101 }
2102 ?>