Fixed bug #10266: No user authentication for >1 TYPO3 installation under one domain...
[Packages/TYPO3.CMS.git] / t3lib / class.t3lib_cs.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2003-2009 Kasper Skaarhoj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the Typo3 project. The Typo3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 *
17 * This script is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * This copyright notice MUST APPEAR in all copies of the script!
23 ***************************************************************/
24 /**
25 * Class for conversion between charsets.
26 *
27 * $Id$
28 *
29 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
30 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
31 */
32 /**
33 * [CLASS/FUNCTION INDEX of SCRIPT]
34 *
35 *
36 *
37 * 136: class t3lib_cs
38 * 488: function parse_charset($charset)
39 * 507: function get_locale_charset($locale)
40 *
41 * SECTION: Charset Conversion functions
42 * 560: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
43 * 600: function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0)
44 * 617: function utf8_encode($str,$charset)
45 * 663: function utf8_decode($str,$charset,$useEntityForNoChar=0)
46 * 706: function utf8_to_entities($str)
47 * 739: function entities_to_utf8($str,$alsoStdHtmlEnt=0)
48 * 773: function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
49 * 823: function UnumberToChar($cbyte)
50 * 868: function utf8CharToUnumber($str,$hex=0)
51 *
52 * SECTION: Init functions
53 * 911: function initCharset($charset)
54 * 973: function initUnicodeData($mode=null)
55 * 1198: function initCaseFolding($charset)
56 * 1260: function initToASCII($charset)
57 *
58 * SECTION: String operation functions
59 * 1331: function substr($charset,$string,$start,$len=null)
60 * 1384: function strlen($charset,$string)
61 * 1414: function crop($charset,$string,$len,$crop='')
62 * 1467: function strtrunc($charset,$string,$len)
63 * 1501: function conv_case($charset,$string,$case)
64 * 1527: function specCharsToASCII($charset,$string)
65 *
66 * SECTION: Internal string operation functions
67 * 1567: function sb_char_mapping($str,$charset,$mode,$opt='')
68 *
69 * SECTION: Internal UTF-8 string operation functions
70 * 1622: function utf8_substr($str,$start,$len=null)
71 * 1655: function utf8_strlen($str)
72 * 1676: function utf8_strtrunc($str,$len)
73 * 1698: function utf8_strpos($haystack,$needle,$offset=0)
74 * 1723: function utf8_strrpos($haystack,$needle)
75 * 1745: function utf8_char2byte_pos($str,$pos)
76 * 1786: function utf8_byte2char_pos($str,$pos)
77 * 1809: function utf8_char_mapping($str,$mode,$opt='')
78 *
79 * SECTION: Internal EUC string operation functions
80 * 1885: function euc_strtrunc($str,$len,$charset)
81 * 1914: function euc_substr($str,$start,$charset,$len=null)
82 * 1939: function euc_strlen($str,$charset)
83 * 1966: function euc_char2byte_pos($str,$pos,$charset)
84 * 2007: function euc_char_mapping($str,$charset,$mode,$opt='')
85 *
86 * TOTAL FUNCTIONS: 35
87 * (This index is automatically created/updated by the extension "extdeveval")
88 *
89 */
90
91
92
93
94
95
96
97
98 /**
99 * Notes on UTF-8
100 *
101 * Functions working on UTF-8 strings:
102 *
103 * - strchr/strstr
104 * - strrchr
105 * - substr_count
106 * - implode/explode/join
107 *
108 * Functions nearly working on UTF-8 strings:
109 *
110 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
111 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
112 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
113 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
114 *
115 * Functions NOT working on UTF-8 strings:
116 *
117 * - str*cmp
118 * - stristr
119 * - stripos
120 * - substr
121 * - strrev
122 * - ereg/eregi
123 * - split/spliti
124 * - preg_*
125 * - ...
126 *
127 */
128 /**
129 * Class for conversion between charsets
130 *
131 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
132 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
133 * @package TYPO3
134 * @subpackage t3lib
135 */
136 class t3lib_cs {
137 var $noCharByteVal=63; // ASCII Value for chars with no equivalent.
138
139 // This is the array where parsed conversion tables are stored (cached)
140 var $parsedCharsets=array();
141
142 // An array where case folding data will be stored (cached)
143 var $caseFolding=array();
144
145 // An array where charset-to-ASCII mappings are stored (cached)
146 var $toASCII=array();
147
148 // This tells the converter which charsets have two bytes per char:
149 var $twoByteSets=array(
150 'ucs-2'=>1, // 2-byte Unicode
151 );
152
153 // This tells the converter which charsets have four bytes per char:
154 var $fourByteSets=array(
155 'ucs-4'=>1, // 4-byte Unicode
156 'utf-32'=>1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
157 );
158
159 // This tells the converter which charsets use a scheme like the Extended Unix Code:
160 var $eucBasedSets=array(
161 'gb2312'=>1, // Chinese, simplified.
162 'big5'=>1, // Chinese, traditional.
163 'euc-kr'=>1, // Korean
164 'shift_jis'=>1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
165 );
166
// Charset aliases: lowercase alias => canonical charset name as used by this class.
// Looked up by parse_charset() after the input has been lowercased and trimmed.
// see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
// http://czyborra.com/charsets/iso8859.html
var $synonyms=array(
	'us' => 'ascii',
	'us-ascii'=> 'ascii',
	'cp819' => 'iso-8859-1',
	'ibm819' => 'iso-8859-1',
	'iso-ir-100' => 'iso-8859-1',
	'iso-ir-109' => 'iso-8859-2',
	'iso-ir-148' => 'iso-8859-9',
	'iso-ir-199' => 'iso-8859-14',
	'iso-ir-203' => 'iso-8859-15',
	'csisolatin1' => 'iso-8859-1',
	'csisolatin2' => 'iso-8859-2',
	'csisolatin3' => 'iso-8859-3',
	'csisolatin5' => 'iso-8859-9',
	'csisolatin8' => 'iso-8859-14',
	'csisolatin9' => 'iso-8859-15',
	'csisolatingreek' => 'iso-8859-7',
	'iso-celtic' => 'iso-8859-14',
	'latin1' => 'iso-8859-1',
	'latin2' => 'iso-8859-2',
	'latin3' => 'iso-8859-3',
	'latin5' => 'iso-8859-9',
	'latin6' => 'iso-8859-10',
	'latin8' => 'iso-8859-14',
	'latin9' => 'iso-8859-15',
	'l1' => 'iso-8859-1',
	'l2' => 'iso-8859-2',
	'l3' => 'iso-8859-3',
	'l5' => 'iso-8859-9',
	'l6' => 'iso-8859-10',
	'l8' => 'iso-8859-14',
	'l9' => 'iso-8859-15',
	'cyrillic' => 'iso-8859-5',
	'arabic' => 'iso-8859-6',
	'tis-620' => 'iso-8859-11',
	'win874' => 'windows-874',
	'win1250' => 'windows-1250',
	'win1251' => 'windows-1251',
	'win1252' => 'windows-1252',
	'win1253' => 'windows-1253',
	'win1254' => 'windows-1254',
	'win1255' => 'windows-1255',
	'win1256' => 'windows-1256',
	'win1257' => 'windows-1257',
	'win1258' => 'windows-1258',
	'cp1250' => 'windows-1250',
	'cp1251' => 'windows-1251',
	'cp1252' => 'windows-1252',
	'ms-ee' => 'windows-1250',
	'ms-ansi' => 'windows-1252',
	'ms-greek' => 'windows-1253',
	'ms-turk' => 'windows-1254',
	'winbaltrim' => 'windows-1257',
	'koi-8ru' => 'koi-8r',
	'koi8r' => 'koi-8r',
	'cp878' => 'koi-8r',
	'mac' => 'macroman',
	'macintosh' => 'macroman',
	'euc-cn' => 'gb2312',
	'x-euc-cn' => 'gb2312',
	'euccn' => 'gb2312',
	'cp936' => 'gb2312',
	'big-5' => 'big5',
	'cp950' => 'big5',
	'eucjp' => 'euc-jp',
	'sjis' => 'shift_jis',
	'shift-jis' => 'shift_jis',
	'cp932' => 'shift_jis',
	'cp949' => 'euc-kr',
	'utf7' => 'utf-7',
	'utf8' => 'utf-8',	// was listed twice; a repeated key only overwrites itself
	'utf16' => 'utf-16',
	'utf32' => 'utf-32',
	'ucs2' => 'ucs-2',
	'ucs4' => 'ucs-4',
);
246
// Mapping of language identifiers (lowercase) to script / language-family names.
// The script names are the keys of $script_to_charset_unix / $script_to_charset_windows.
var $lang_to_script=array(
	// iso-639:2 language codes, see:
	// http://www.w3.org/WAI/ER/IG/ert/iso639.htm
	// http://www.loc.gov/standards/iso639-2/langcodes.html
	// http://www.unicode.org/onlinedat/languages.html
	'ar' => 'arabic',
	'bg' => 'cyrillic', // Bulgarian
	'bs' => 'east_european', // Bosnian
	'cs' => 'east_european', // Czech
	'da' => 'west_european', // Danish
	'de' => 'west_european', // German
	'es' => 'west_european', // Spanish
	'et' => 'estonian',
	'eo' => 'unicode', // Esperanto
	'eu' => 'west_european', // Basque
	'fa' => 'arabic', // Persian
	'fi' => 'west_european', // Finnish
	'fo' => 'west_european', // Faroese
	'fr' => 'west_european', // French
	'ga' => 'west_european', // Galician
	'ge' => 'unicode', // Georgian
	'gr' => 'greek',
	'he' => 'hebrew', // Hebrew (since 1998)
	'hi' => 'unicode', // Hindi
	'hr' => 'east_european', // Croatian
	'hu' => 'east_european', // Hungarian
	'iw' => 'hebrew', // Hebrew (til 1998)
	'is' => 'west_european', // Icelandic
	'it' => 'west_european', // Italian
	'ja' => 'japanese',
	'kl' => 'west_european', // Greenlandic
	'ko' => 'korean',
	'lt' => 'lithuanian',
	'lv' => 'west_european', // Latvian/Lettish
	'nl' => 'west_european', // Dutch
	'no' => 'west_european', // Norwegian
	'pl' => 'east_european', // Polish
	'pt' => 'west_european', // Portuguese
	'ro' => 'east_european', // Romanian
	'ru' => 'cyrillic', // Russian
	'sk' => 'east_european', // Slovak
	'sl' => 'east_european', // Slovenian
	'sr' => 'cyrillic', // Serbian
	'sv' => 'west_european', // Swedish
	'sq' => 'albanian', // Albanian
	'th' => 'thai',
	'uk' => 'cyrillic', // Ukrainian
	'vi' => 'vietnamese',
	'zh' => 'chinese',
	// MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
	// http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
	'ara' => 'arabic',
	'bgr' => 'cyrillic', // Bulgarian
	'cat' => 'west_european', // Catalan
	'chs' => 'simpl_chinese',
	'cht' => 'trad_chinese',
	'csy' => 'east_european', // Czech
	'dan' => 'west_european', // Danish
	'deu' => 'west_european', // German
	'dea' => 'west_european', // German (Austrian)
	'des' => 'west_european', // German (Swiss)
	'ena' => 'west_european', // English (Australian)
	'enc' => 'west_european', // English (Canadian)
	'eng' => 'west_european', // English
	'enz' => 'west_european', // English (New Zealand)
	'enu' => 'west_european', // English (United States)
	'euq' => 'west_european', // Basque
	'fos' => 'west_european', // Faroese
	'far' => 'arabic', // Persian
	'fin' => 'west_european', // Finnish
	'fra' => 'west_european', // French
	'frb' => 'west_european', // French (Belgian)
	'frc' => 'west_european', // French (Canadian)
	'frs' => 'west_european', // French (Swiss)
	'geo' => 'unicode', // Georgian
	'glg' => 'west_european', // Galician
	'ell' => 'greek',
	'heb' => 'hebrew',
	'hin' => 'unicode', // Hindi
	'hun' => 'east_european', // Hungarian
	'isl' => 'west_european', // Icelandic (script name was misspelt 'west_euorpean')
	'ita' => 'west_european', // Italian
	'its' => 'west_european', // Italian (Swiss)
	'jpn' => 'japanese',
	'kor' => 'korean',
	'lth' => 'lithuanian',
	'lvi' => 'west_european', // Latvian/Lettish
	'msl' => 'west_european', // Malay
	'nlb' => 'west_european', // Dutch (Belgian)
	'nld' => 'west_european', // Dutch
	'nor' => 'west_european', // Norwegian (bokmal)
	'non' => 'west_european', // Norwegian (nynorsk)
	'plk' => 'east_european', // Polish
	'ptg' => 'west_european', // Portuguese
	'ptb' => 'west_european', // Portuguese (Brazil)
	'rom' => 'east_european', // Romanian
	'rus' => 'cyrillic', // Russian
	'slv' => 'east_european', // Slovenian
	'sky' => 'east_european', // Slovak
	'srl' => 'east_european', // Serbian (Latin)
	'srb' => 'cyrillic', // Serbian (Cyrillic)
	'esp' => 'west_european', // Spanish (trad. sort)
	'esm' => 'west_european', // Spanish (Mexican)
	'esn' => 'west_european', // Spanish (internat. sort)
	'sve' => 'west_european', // Swedish
	'sqi' => 'albanian', // Albanian
	'tha' => 'thai',
	'trk' => 'turkish',
	'ukr' => 'cyrillic', // Ukrainian
	// English language names
	'albanian' => 'albanian',
	'arabic' => 'arabic',
	'basque' => 'west_european',
	'bosnian' => 'east_european',
	'bulgarian' => 'cyrillic', // was 'east_european', contradicting 'bg' / 'bgr' above
	'catalan' => 'west_european',
	'croatian' => 'east_european',
	'czech' => 'east_european',
	'danish' => 'west_european',
	'dutch' => 'west_european',
	'english' => 'west_european',
	'esperanto' => 'unicode',
	'estonian' => 'estonian',
	'faroese' => 'west_european',
	'farsi' => 'arabic',
	'finnish' => 'west_european',
	'french' => 'west_european',
	'galician' => 'west_european',
	'georgian' => 'unicode',
	'german' => 'west_european',
	'greek' => 'greek',
	'greenlandic' => 'west_european',
	'hebrew' => 'hebrew',
	'hindi' => 'unicode',
	'hungarian' => 'east_european',
	'icelandic' => 'west_european',
	'italian' => 'west_european',
	'latvian' => 'west_european',
	'lettish' => 'west_european',
	'lithuanian' => 'lithuanian',
	'malay' => 'west_european',
	'norwegian' => 'west_european',
	'persian' => 'arabic',
	'polish' => 'east_european',
	'portuguese' => 'west_european',
	'russian' => 'cyrillic',
	'romanian' => 'east_european',
	'serbian' => 'cyrillic',
	'slovak' => 'east_european',
	'slovenian' => 'east_european',
	'spanish' => 'west_european',
	'swedish' => 'west_european',
	'thai' => 'thai',
	'turkish' => 'turkish',
	'ukrainian' => 'cyrillic',
	// Misspelt keys from earlier versions, kept so existing lookups keep working
	'svedish' => 'west_european',
	'that' => 'thai',
);
404
405 // mapping of language (family) names to charsets on Unix
406 var $script_to_charset_unix=array(
407 'west_european' => 'iso-8859-1',
408 'estonian' => 'iso-8859-1',
409 'east_european' => 'iso-8859-2',
410 'baltic' => 'iso-8859-4',
411 'cyrillic' => 'iso-8859-5',
412 'arabic' => 'iso-8859-6',
413 'greek' => 'iso-8859-7',
414 'hebrew' => 'iso-8859-8',
415 'turkish' => 'iso-8859-9',
416 'thai' => 'iso-8859-11', // = TIS-620
417 'lithuanian' => 'iso-8859-13',
418 'chinese' => 'gb2312', // = euc-cn
419 'japanese' => 'euc-jp',
420 'korean' => 'euc-kr',
421 'simpl_chinese' => 'gb2312',
422 'trad_chinese' => 'big5',
423 'vietnamese' => '',
424 'unicode' => 'utf-8',
425 'albanian' => 'utf-8'
426 );
427
428 // mapping of language (family) names to charsets on Windows
429 var $script_to_charset_windows=array(
430 'east_european' => 'windows-1250',
431 'cyrillic' => 'windows-1251',
432 'west_european' => 'windows-1252',
433 'greek' => 'windows-1253',
434 'turkish' => 'windows-1254',
435 'hebrew' => 'windows-1255',
436 'arabic' => 'windows-1256',
437 'baltic' => 'windows-1257',
438 'estonian' => 'windows-1257',
439 'lithuanian' => 'windows-1257',
440 'vietnamese' => 'windows-1258',
441 'thai' => 'cp874',
442 'korean' => 'cp949',
443 'chinese' => 'gb2312',
444 'japanese' => 'shift_jis',
445 'simpl_chinese' => 'gb2312',
446 'trad_chinese' => 'big5',
447 'albanian' => 'windows-1250',
448 'unicode' => 'utf-8'
449 );
450
// Mapping of complete locale names to charsets.
// get_locale_charset() lowercases the locale before looking it up here, so keys
// must be lowercase to be reachable through that method.
var $locale_to_charset=array(
	'japanese.euc' => 'euc-jp',
	'ja_jp.ujis' => 'euc-jp',
	'korean.euc' => 'euc-kr',
	'sr@latn' => 'iso-8859-2',
	'sr@Latn' => 'iso-8859-2',	// mixed-case key kept for code that reads this array directly
	'zh_cn' => 'gb2312',
	'zh_hk' => 'big5',
	'zh_tw' => 'big5',
);
461
462 // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
463 // Empty values means "iso-8859-1"
464 var $charSetArray = array(
465 'dk' => '',
466 'de' => '',
467 'no' => '',
468 'it' => '',
469 'fr' => '',
470 'es' => '',
471 'nl' => '',
472 'cz' => 'windows-1250',
473 'pl' => 'iso-8859-2',
474 'si' => 'windows-1250',
475 'fi' => '',
476 'tr' => 'iso-8859-9',
477 'se' => '',
478 'pt' => '',
479 'ru' => 'windows-1251',
480 'ro' => 'iso-8859-2',
481 'ch' => 'gb2312',
482 'sk' => 'windows-1250',
483 'lt' => 'windows-1257',
484 'is' => 'utf-8',
485 'hr' => 'windows-1250',
486 'hu' => 'iso-8859-2',
487 'gl' => '',
488 'th' => 'iso-8859-11',
489 'gr' => 'iso-8859-7',
490 'hk' => 'big5',
491 'eu' => '',
492 'bg' => 'windows-1251',
493 'br' => '',
494 'et' => 'iso-8859-4',
495 'ar' => 'iso-8859-6',
496 'he' => 'utf-8',
497 'ua' => 'windows-1251',
498 'jp' => 'shift_jis',
499 'lv' => 'utf-8',
500 'vn' => 'utf-8',
501 'ca' => 'iso-8859-15',
502 'ba' => 'iso-8859-2',
503 'kr' => 'euc-kr',
504 'eo' => 'utf-8',
505 'my' => '',
506 'hi' => 'utf-8',
507 'fo' => 'utf-8',
508 'fa' => 'utf-8',
509 'sr' => 'utf-8',
510 'sq' => 'utf-8',
511 'ge' => 'utf-8',
512 'ga' => '',
513 );
514
515 // TYPO3 specific: Array with the iso names used for each system language in TYPO3:
516 // Missing keys means: same as Typo3
517 var $isoArray = array(
518 'ba' => 'bs',
519 'br' => 'pt_BR',
520 'ch' => 'zh_CN',
521 'cz' => 'cs',
522 'dk' => 'da',
523 'si' => 'sl',
524 'se' => 'sv',
525 'gl' => 'kl',
526 'gr' => 'el',
527 'hk' => 'zh_HK',
528 'kr' => 'ko',
529 'ua' => 'uk',
530 'jp' => 'ja',
531 'vn' => 'vi',
532 );
533
/**
 * Normalizes a charset name: the name is lowercased, surrounding whitespace
 * is stripped and, if the result is a key of $this->synonyms, the canonical
 * name stored there is returned instead.
 *
 * @param	string		Input charset
 * @return	string		Normalized charset
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function parse_charset($charset) {
	$normalized = strtolower($charset);
	$normalized = trim($normalized);

	return isset($this->synonyms[$normalized])
		? $this->synonyms[$normalized]
		: $normalized;
}
547
/**
 * Get the charset of a locale.
 *
 * ln			language
 * ln_CN		language / country
 * ln_CN.cs		language / country / charset
 * ln_CN.cs@mod	language / country / charset / modifier
 *
 * Resolution order (the locale is lowercased first):
 *  1. exact match in $this->locale_to_charset
 *  2. explicit charset part (normalized through parse_charset())
 *  3. modifier "euro" => iso-8859-15
 *  4. language => script ($this->lang_to_script) => charset, using the
 *     Windows table if TYPO3_OS is "WIN" and the Unix table otherwise.
 *     Unknown languages/scripts (and scripts mapped to an empty string)
 *     yield windows-1252 resp. iso-8859-1.
 *
 * @param	string		Locale string
 * @return	string		Charset resolved for locale string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function get_locale_charset($locale) {
	$locale = strtolower($locale);

		// exact locale specific charset?
	if (isset($this->locale_to_charset[$locale])) {
		return $this->locale_to_charset[$locale];
	}

		// get modifier (absent parts default to '' instead of reading an undefined offset)
	$parts = explode('@', $locale);
	$locale = $parts[0];
	$modifier = isset($parts[1]) ? $parts[1] : '';

		// locale contains charset: use it
	$parts = explode('.', $locale);
	$locale = $parts[0];
	$charset = isset($parts[1]) ? $parts[1] : '';
	if ($charset) {
		return $this->parse_charset($charset);
	}

		// modifier is 'euro' (after charset check, because of xx.utf-8@euro)
	if ($modifier === 'euro') {
		return 'iso-8859-15';
	}

		// get language (the country part is not needed for the lookup)
	$parts = explode('_', $locale);
	$language = $parts[0];
	$script = isset($this->lang_to_script[$language]) ? $this->lang_to_script[$language] : '';

	if (TYPO3_OS == 'WIN') {
		$map = $this->script_to_charset_windows;
		$default = 'windows-1252';
	} else {
		$map = $this->script_to_charset_unix;
		$default = 'iso-8859-1';
	}

		// empty() keeps the original truthiness test: scripts mapped to '' use the default too
	return !empty($map[$script]) ? $map[$script] : $default;
}
588
589
590
591
592
593
594
595
596
597 /********************************************
598 *
599 * Charset Conversion functions
600 *
601 ********************************************/
602
/**
 * Convert a string from one charset to another charset.
 *
 * If the target is utf-8, or no entity fallback is requested, the conversion is first
 * attempted with the library named in $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']
 * ("mbstring", "iconv" or "recode"). If no library is selected or the library
 * returns false, the class' own utf8_encode() / utf8_decode() are used.
 *
 * @param	string		Input string
 * @param	string		From charset (the current charset of the string)
 * @param	string		To charset (the output charset wanted)
 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return	string		Converted string
 * @see convArray()
 */
function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) {
	if ($fromCS==$toCS) {
		return $str;
	}

		// PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
	if ($toCS=='utf-8' || !$useEntityForNoChar) {
		$method = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'];
		$converted = false;

		if ($method == 'mbstring') {
			$converted = mb_convert_encoding($str,$toCS,$fromCS);	// returns false for unsupported charsets
		} elseif ($method == 'iconv') {
			$converted = iconv($fromCS,$toCS.'//TRANSLIT',$str);
		} elseif ($method == 'recode') {
			$converted = recode_string($fromCS.'..'.$toCS,$str);
		}

		if (false !== $converted) {
			return $converted;
		}
			// otherwise: fallback to TYPO3 conversion
	}

		// TYPO3 conversion goes via UTF-8 as the intermediate format
	if ($fromCS!='utf-8') {
		$str = $this->utf8_encode($str,$fromCS);
	}
	if ($toCS!='utf-8') {
		$str = $this->utf8_decode($str,$toCS,$useEntityForNoChar);
	}

	return $str;
}
641
/**
 * Convert all elements in ARRAY from one charset to another charset.
 * Nested arrays are processed recursively; every other value is passed through conv().
 * NOTICE: Array is passed by reference and modified in place!
 *
 * @param	array		Input array, possibly multidimensional
 * @param	string		From charset (the current charset of the string)
 * @param	string		To charset (the output charset wanted)
 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return	void
 * @see conv()
 */
function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0) {
	foreach ($array as $idx => $unused) {
		if (!is_array($array[$idx])) {
			$array[$idx] = $this->conv($array[$idx],$fromCS,$toCS,$useEntityForNoChar);
			continue;
		}

		$this->convArray($array[$idx],$fromCS,$toCS,$useEntityForNoChar);
	}
}
662
/**
 * Converts $str from $charset to UTF-8
 *
 * The conversion table is loaded through initCharset(). Local characters that have no
 * entry in the table are replaced by the character with byte value $this->noCharByteVal.
 * If initCharset() does not succeed, nothing (NULL) is returned.
 *
 * @param	string		String in local charset to convert to UTF-8
 * @param	string		Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return	string		Output string, converted to UTF-8
 */
function utf8_encode($str,$charset) {

	if ($charset === 'utf-8') return $str;

		// Charset is case-insensitive.
	if ($this->initCharset($charset)) {	// Parse conv. table if not already...
		$strLen = strlen($str);
		$outStr = '';

		for ($a=0; $a<$strLen; $a++) {	// Traverse each char in string.
			$chr = substr($str,$a,1);
			$ord = ord($chr);

			if (isset($this->twoByteSets[$charset])) {	// If the charset has two bytes per char
					// substr() instead of the removed $str{..} offset syntax; a missing
					// second byte at the end of the string simply counts as 0.
				$ord2 = ord(substr($str,$a+1,1));
				$ord = $ord<<8 | $ord2;	// assume big endian
				$a++;
			} elseif ($ord>127) {	// Byte above 127: not plain ASCII, must be looked up
				if (isset($this->eucBasedSets[$charset])) {	// EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
					if ($charset != 'shift_jis' || ($ord < 0xA0 || $ord > 0xDF)) {	// Shift-JIS: chars between 160 and 223 are single byte
						$a++;
						$ord2 = ord(substr($str,$a,1));
						$ord = $ord*256+$ord2;
					}
				}
			} else {	// ... otherwise it's just ASCII 0-127 and one byte. Transparent
				$outStr .= $chr;
				continue;
			}

				// Use the UTF-8 sequence from the parsed conv. table if the local char-number is known,
				// otherwise the "no char" replacement ($this->noCharByteVal).
			if (isset($this->parsedCharsets[$charset]['local'][$ord])) {
				$outStr .= $this->parsedCharsets[$charset]['local'][$ord];
			} else {
				$outStr .= chr($this->noCharByteVal);
			}
		}
		return $outStr;
	}
}
707
/**
 * Converts $str from UTF-8 to $charset
 *
 * ASCII bytes are copied unchanged. Each multibyte sequence is looked up in the parsed
 * conversion table of the charset; sequences without an entry become a numeric entity
 * (if $useEntityForNoChar is set) or the character with byte value $this->noCharByteVal.
 * If initCharset() does not succeed, nothing (NULL) is returned.
 *
 * @param	string		String in UTF-8 to convert to local charset
 * @param	string		Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return	string		Output string, converted to local charset
 */
function utf8_decode($str,$charset,$useEntityForNoChar=0) {

	if ($charset === 'utf-8') {
		return $str;
	}

		// Charset is case-insensitive.
	if ($this->initCharset($charset)) {	// Parse conv. table if not already...
		$total = strlen($str);
		$result = '';
		$pos = 0;

		while ($pos < $total) {	// Traverse the UTF-8 string byte by byte
			$byte = substr($str,$pos,1);
			$code = ord($byte);

			if ($code <= 127) {
					// Plain ASCII: transparent
				$result .= $byte;
			} elseif (!($code & 64)) {
					// Bit 7 not set: this is a continuation byte without a leading byte
				$result .= chr($this->noCharByteVal);
			} else {
					// Leading byte: every further high bit announces one more byte of the sequence
				$seq = $byte;
				for ($n=0; $n<8; $n++) {
					$code = $code << 1;
					if (!($code & 128)) {
						break;
					}
					$pos++;
					$seq .= substr($str,$pos,1);
				}

				if (isset($this->parsedCharsets[$charset]['utf8'][$seq])) {
					$local = $this->parsedCharsets[$charset]['utf8'][$seq];	// The local number
					if ($local > 255) {
							// 16bit word assumed: emit high byte, then low byte
						$result .= chr(($local >> 8) & 255).chr($local & 255);
					} else {
						$result .= chr($local);
					}
				} elseif ($useEntityForNoChar) {
					$result .= '&#'.$this->utf8CharToUnumber($seq,1).';';
				} else {
					$result .= chr($this->noCharByteVal);
				}
			}

			$pos++;
		}
		return $result;
	}
}
756
/**
 * Converts all chars > 127 to numeric (hexadecimal) entities.
 * A continuation byte that is not preceded by a leading byte is replaced by the
 * character with byte value $this->noCharByteVal.
 *
 * @param	string		Input string
 * @return	string		Output string
 */
function utf8_to_entities($str) {
	$total = strlen($str);
	$result = '';
	$pos = 0;

	while ($pos < $total) {	// Traverse the UTF-8 string byte by byte
		$byte = substr($str,$pos,1);
		$code = ord($byte);

		if ($code <= 127) {
				// Plain ASCII: transparent
			$result .= $byte;
		} elseif (!($code & 64)) {
				// Bit 7 not set: we are in the MIDDLE of a multibyte sequence
			$result .= chr($this->noCharByteVal);
		} else {
				// Leading byte: collect the bytes announced by its high bits
			$seq = $byte;
			for ($n=0; $n<8; $n++) {
				$code = $code << 1;
				if (!($code & 128)) {
					break;
				}
				$pos++;
				$seq .= substr($str,$pos,1);
			}
			$result .= '&#'.$this->utf8CharToUnumber($seq,1).';';
		}

		$pos++;
	}

	return $result;
}
788
/**
 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
 *
 * Anything of the form "&...;" that is not a well-formed numeric entity (and, with
 * $alsoStdHtmlEnt, not a named entity of the iso-8859-1 HTML table) is left untouched.
 *
 * @param	string		Input string, UTF-8
 * @param	boolean		If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
 * @return	string		Output string
 */
function entities_to_utf8($str,$alsoStdHtmlEnt=0) {
	$trans_tbl = array();
	if ($alsoStdHtmlEnt) {
			// Request the table explicitly in iso-8859-1: the default encoding of this function
			// depends on the PHP version, and the values are converted from iso-8859-1 below.
		$trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'ISO-8859-1'));
	}

		// Split on entities and keep their inner part: even indexes hold plain text,
		// odd indexes hold what was found between "&" and ";".
	$parts = preg_split('/&([#[:alnum:]]*);/', $str, -1, PREG_SPLIT_DELIM_CAPTURE);
	if ($parts === false) {
		return $str;
	}

	foreach ($parts as $k => $v) {
		if (!($k % 2)) {
			continue;
		}

		$entity = '&'.$v.';';
		if (substr($v,0,1) == '#') {	// Dec or hex entities:
			$number = substr($v,1);
			if (preg_match('/^[xX]([0-9a-fA-F]+)$/', $number, $match)) {
				$parts[$k] = $this->UnumberToChar(hexdec($match[1]));
			} elseif (preg_match('/^[0-9]+$/', $number)) {
				$parts[$k] = $this->UnumberToChar((int)$number);
			} else {	// Malformed number (e.g. "&#abc;"): keep as is instead of emitting a NUL byte
				$parts[$k] = $entity;
			}
		} elseif (isset($trans_tbl[$entity])) {	// Other entities:
			$parts[$k] = $this->utf8_encode($trans_tbl[$entity],'iso-8859-1');
		} else {	// No conversion:
			$parts[$k] = $entity;
		}
	}

	return implode('',$parts);
}
821
/**
 * Converts all chars in the input UTF-8 string into integer numbers returned in an array
 * A continuation byte that is not preceded by a leading byte is reported as
 * $this->noCharByteVal (resp. the character with that byte value).
 *
 * @param	string		Input string, UTF-8
 * @param	boolean		If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
 * @param	boolean		If set, then instead of integer numbers the real UTF-8 char is returned.
 * @return	array		Output array with the char numbers
 */
function utf8_to_numberarray($str,$convEntities=0,$retChar=0) {
		// If entities must be registered as well...:
	if ($convEntities) {
		$str = $this->entities_to_utf8($str,1);
	}

		// Do conversion:
	$result = array();
	$total = strlen($str);
	$pos = 0;

	while ($pos < $total) {	// Traverse the UTF-8 string byte by byte
		$byte = substr($str,$pos,1);
		$code = ord($byte);

		if ($code <= 127) {
				// Plain ASCII: one byte, one character
			$result[] = $retChar ? chr($code) : $code;
		} elseif (!($code & 64)) {
				// Bit 7 not set: we are in the MIDDLE of a multibyte sequence
			$result[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
		} else {
				// Leading byte: collect the bytes announced by its high bits
			$seq = $byte;
			for ($n=0; $n<8; $n++) {
				$code = $code << 1;
				if (!($code & 128)) {
					break;
				}
				$pos++;
				$seq .= substr($str,$pos,1);
			}
			$result[] = $retChar ? $seq : $this->utf8CharToUnumber($seq);
		}

		$pos++;
	}

	return $result;
}
860
/**
 * Converts a UNICODE number to a UTF-8 multibyte character
 * Algorithm based on script found at From: http://czyborra.com/utf/
 * Unit-tested by Kasper
 *
 * The bits of the number are spread across the bytes; the number of high bits set
 * in the lead byte announces the number of bytes in the multibyte sequence:
 *
 *  bytes | bits | representation
 *      1 |    7 | 0vvvvvvv
 *      2 |   11 | 110vvvvv 10vvvvvv
 *      3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
 *      4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * Numbers of 0x80000000 and above yield the character with byte value $this->noCharByteVal.
 *
 * @param	integer		UNICODE integer
 * @return	string		UTF-8 multibyte character string
 * @see utf8CharToUnumber()
 */
function UnumberToChar($cbyte) {
		// One byte: the number is the byte value
	if ($cbyte < 0x80) {
		return chr($cbyte);
	}

		// Multibyte layouts: exclusive upper limit, marker bits of the lead byte, number of trailing bytes
	$layouts = array(
		array(0x800, 0xC0, 1),
		array(0x10000, 0xE0, 2),
		array(0x200000, 0xF0, 3),
		array(0x4000000, 0xF8, 4),
		array(0x80000000, 0xFC, 5),
	);

	foreach ($layouts as $layout) {
		list($limit, $leadMarker, $trailing) = $layout;
		if ($cbyte < $limit) {
				// Lead byte carries the topmost bits ...
			$char = chr($leadMarker | ($cbyte >> (6 * $trailing)));
				// ... and each trailing byte the next 6 bits, most significant first
			for ($n = $trailing - 1; $n >= 0; $n--) {
				$char .= chr(0x80 | (($cbyte >> (6 * $n)) & 0x3F));
			}
			return $char;
		}
	}

		// Cannot express a 32-bit character in UTF-8
	return chr($this->noCharByteVal);
}
915
/**
 * Converts a UTF-8 Multibyte character to a UNICODE number
 * Unit-tested by Kasper
 *
 * Only the first character of the input is looked at. A first byte that does not have its two
 * highest bits set is returned as its plain byte value.
 *
 * @param	string		UTF-8 multibyte character string
 * @param	boolean		If set, then a hex. number is returned (as string, prefixed with "x").
 * @return	integer		UNICODE integer
 * @see UnumberToChar()
 */
function utf8CharToUnumber($str,$hex=0) {
	$leadByte = ord(substr($str,0,1));

	if (($leadByte & 0xC0) != 0xC0) {
			// Not the start of a multibyte sequence: the byte value is the number
		$number = $leadByte;
	} else {
		$shifted = $leadByte;
		$trailBits = '';
		$trailCount = 0;
			// Every further high bit set in the lead byte announces one trailing byte
		while ($trailCount < 8) {
			$shifted = $shifted << 1;
			if (!($shifted & 0x80)) {
				break;
			}
				// Each trailing byte contributes its lower six bits
			$trailByte = ord(substr($str,$trailCount+1,1));
			$trailBits .= substr('00000000'.decbin($trailByte),-6);
			$trailCount++;
		}
			// The lead byte contributes the bits which are left below its length marker
		$leadBits = substr('00000000'.decbin($leadByte),-(6-$trailCount));

		$number = bindec($leadBits.$trailBits);
	}

	if ($hex) {
		return 'x'.dechex($number);
	}
	return $number;
}
943
944
945
946
947
948
949
950
951
952 /********************************************
953 *
954 * Init functions
955 *
956 ********************************************/
957
/**
 * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
 * This function is automatically called by the conversion functions
 *
 * The parsed table is stored in $this->parsedCharsets[$charset] with the two sub arrays
 * 'local' (byte value => UTF-8 character) and 'utf8' (UTF-8 character => byte value).
 * Only byte values above 127 are stored. The parsed table is cached in typo3temp/cs/.
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param	string		The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return	integer		Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed.
 * @access private
 */
function initCharset($charset) {
		// Only process if the charset is not yet loaded:
	if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
		return 1;
	}

		// Conversion table filename:
	$charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';
	if (!($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile))) {
		return false;
	}

		// Cache file for charsets:
		// Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->parsedCharsets[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

		// The "whitespaced" syntax is "0x0A 0x000A #LINE FEED" (PCRE version of the former ereg() pattern)
	$whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';

		// Parse conversion table into lines:
	$lines = t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1);
		// Initialize the internal variable holding the conv. table:
	$this->parsedCharsets[$charset] = array('local'=>array(),'utf8'=>array());

	$detectedType = '';
	foreach ($lines as $value) {
			// Comment line or blanks are ignored.
		if (!trim($value) || substr($value,0,1)=='#') {
			continue;
		}

			// Detect type if not done yet: (Done on first real line)
			// Everything which is not "whitespaced" is taken as "ms-token", like "B9 = U+00B9 : SUPERSCRIPT ONE"
		if (!$detectedType) {
			$detectedType = preg_match($whitespacedPattern,$value) ? 'whitespaced' : 'ms-token';
		}

		if ($detectedType=='ms-token') {
				// Padding keeps list() from reading a missing element on lines without "=" or ":"
			list($hexbyte, $utf8) = array_pad(preg_split('/[=:]/', $value, 3), 2, '');
		} else {
			$regA = array();
			if (!preg_match($whitespacedPattern,$value,$regA)) {
					// A line not following the detected syntax never yielded a byte value above 127 anyway
				continue;
			}
			$hexbyte = $regA[1];
			$utf8 = 'U+'.$regA[2];
		}

		$decval = hexdec(trim($hexbyte));
		if ($decval>127) {
				// Strip the leading "U+" before reading the hex number
			$utf8decval = hexdec(substr(trim($utf8),2));
			$utf8Char = $this->UnumberToChar($utf8decval);
			$this->parsedCharsets[$charset]['local'][$decval] = $utf8Char;
			$this->parsedCharsets[$charset]['utf8'][$utf8Char] = $decval;
		}
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->parsedCharsets[$charset]));
	}

	return 2;
}
1020
/**
 * This function initializes all UTF-8 character data tables.
 *
 * For the requested mode the table already held in memory or its cache file in typo3temp/cs/ is used if available.
 * Otherwise unidata/UnicodeData.txt (BMP only) and - if present - unidata/SpecialCasing.txt and unidata/Translit.txt
 * are parsed and BOTH tables, $this->caseFolding['utf-8'] and $this->toASCII['utf-8'], are rebuilt and cached.
 *
 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
 *
 * @param	string		Mode ("case", "ascii", ...)
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initUnicodeData($mode=null) {
		// cache files
	$cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
	$cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');

		// Only process if the tables are not yet loaded
	switch($mode) {
		case 'case':
			if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8'])) return 1;

				// Use cached version if possible
			if ($cacheFileCase && @is_file($cacheFileCase)) {
				$this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase));
				return 2;
			}
		break;

		case 'ascii':
			if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8'])) return 1;

				// Use cached version if possible
			if ($cacheFileASCII && @is_file($cacheFileASCII)) {
				$this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII));
				return 2;
			}
		break;
	}

		// process main Unicode data file
	$unicodeDataFile = PATH_t3lib.'unidata/UnicodeData.txt';
	if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) return false;

	$fh = fopen($unicodeDataFile,'rb');
	if (!$fh) return false;

		// key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
		// note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
	$this->caseFolding['utf-8'] = array();
	$utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand
	$utf8CaseFolding['toUpper'] = array();
	$utf8CaseFolding['toLower'] = array();
	$utf8CaseFolding['toTitle'] = array();

	$decomposition = array();	// array of temp. decompositions
	$mark = array();		// array of chars that are marks (eg. composing accents)
	$number = array();		// array of chars that are numbers (eg. digits)
	$omit = array();		// array of chars to be omitted (eg. Russian hard sign)

	while (!feof($fh)) {
		$line = fgets($fh,4096);
		if ($line === false) break;	// nothing more to read
		$line = rtrim($line);
		if ($line === '') continue;	// a blank line holds no character data

			// has a lot of info; padded so that short lines do not lead to missing fields
			// fields used: 0 = code, 1 = name, 2 = category, 5 = decomposition, 8 = numeric value, 12/13/14 = upper/lower/title case mapping
		$fields = array_pad(explode(';', $line), 15, '');
		$char = $fields[0];
		$name = $fields[1];
		$cat = $fields[2];
		$decomp = $fields[5];
		$num = $fields[8];
		$upper = $fields[12];
		$lower = $fields[13];
		$title = $fields[14];

		$ord = hexdec($char);
		if ($ord > 0xFFFF) break;	// only process the BMP

		$utf8_char = $this->UnumberToChar($ord);

		if ($upper) $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
		if ($lower) $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
			// store "title" only when different from "upper" (only a few)
			// (strict comparison: hex codes like "00E0" and "00E8" must not be compared as numbers)
		if ($title && $title !== $upper) $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));

		switch (substr($cat,0,1)) {
			case 'M':	// mark (accent, umlaut, ...)
				$mark["U+$char"] = 1;
			break;

			case 'N':	// numeric value
				if ($ord > 0x80 && $num != '') $number["U+$char"] = $num;
			break;
		}

			// accented Latin letters without "official" decomposition
		$match = array();
		if (!$decomp && preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/',$name,$match)) {
			$c = ord($match[2]);
			if ($match[1] == 'SMALL') $c += 32;

			$decomposition["U+$char"] = array(dechex($c));
			continue;
		}

			// Decomposition field: optional "<tag>" followed by a space separated list of codes
		$match = array();
		if (preg_match('/(<.*>)? *(.+)/',$decomp,$match)) {
			switch($match[1]) {
				case '<circle>':	// add parenthesis as circle replacement, eg (1)
					$match[2] = '0028 '.$match[2].' 0029';
				break;

				case '<square>':	// add square brackets as square replacement, eg [1]
					$match[2] = '005B '.$match[2].' 005D';
				break;

				case '<compat>':	// ignore multi char decompositions that start with a space
					if (strncmp($match[2],'0020 ',5) == 0) continue 2;
				break;

					// ignore Arabic and vertical layout presentation decomposition
				case '<initial>':
				case '<medial>':
				case '<final>':
				case '<isolated>':
				case '<vertical>':
					continue 2;
			}
			$decomposition["U+$char"] = explode(' ', $match[2]);
		}
	}
	fclose($fh);

		// process additional Unicode data for casing (allow folded characters to expand into a sequence)
	$specialCasingFile = PATH_t3lib.'unidata/SpecialCasing.txt';
	if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
		$fh = fopen($specialCasingFile,'rb');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh,4096);
				if ($line === false) break;
				if (substr($line,0,1) == '#' || trim($line) == '') continue;

				list($char,$lower,$title,$upper,$cond) = array_pad(t3lib_div::trimExplode(';', $line), 5, '');
					// Only unconditional mappings are used (5th field empty or already the trailing comment)
				if ($cond != '' && substr($cond,0,1) != '#') continue;

				$utf8_char = $this->UnumberToChar(hexdec($char));
				$mappings = array('toLower' => $lower, 'toTitle' => $title, 'toUpper' => $upper);
				foreach ($mappings as $table => $codes) {
						// Skip mappings to the character itself; "title" is only stored when different from "upper"
					if ($char === $codes) continue;
					if ($table == 'toTitle' && $title === $upper) continue;

					$sequence = '';
					foreach (explode(' ', $codes) as $code) {
						$sequence .= $this->UnumberToChar(hexdec($code));
					}
					$utf8CaseFolding[$table][$utf8_char] = $sequence;
				}
			}
			fclose($fh);
		}
	}

		// process custom decompositions
	$customTranslitFile = PATH_t3lib.'unidata/Translit.txt';
	if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
		$fh = fopen($customTranslitFile,'rb');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh,4096);
				if ($line === false) break;
				if (substr($line,0,1) == '#' || trim($line) == '') continue;

				list($char,$translit) = array_pad(t3lib_div::trimExplode(';', $line), 2, '');
					// An empty transliteration means: drop the character
				if (!$translit) $omit["U+$char"] = 1;
				$decomposition["U+$char"] = explode(' ', $translit);
			}
			fclose($fh);
		}
	}

		// decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
	foreach($decomposition as $from => $to) {
		$code_decomp = array();

		while ($code_value = array_shift($to)) {
			if (isset($decomposition["U+$code_value"])) {	// do recursive decomposition
				foreach(array_reverse($decomposition["U+$code_value"]) as $cv) {
					array_unshift($to, $cv);
				}
			} elseif (!isset($mark["U+$code_value"])) {	// remove mark
				array_push($code_decomp, $code_value);
			}
		}
		if (count($code_decomp) || isset($omit[$from])) {
			$decomposition[$from] = $code_decomp;
		} else {
			unset($decomposition[$from]);
		}
	}

		// create ascii only mapping
	$this->toASCII['utf-8'] = array();
	$ascii =& $this->toASCII['utf-8'];

	foreach($decomposition as $from => $to) {
		$code_decomp = array();
		while ($code_value = array_shift($to)) {
			$ord = hexdec($code_value);
			if ($ord > 127) {
				continue 2;	// skip decompositions containing non-ASCII chars
			}
			array_push($code_decomp,chr($ord));
		}
			// $from is "U+XXXX": hand only the hex digits to hexdec()
		$ascii[$this->UnumberToChar(hexdec(substr($from,2)))] = join('',$code_decomp);
	}

		// add numeric decompositions
	foreach($number as $from => $to) {
		$utf8_char = $this->UnumberToChar(hexdec(substr($from,2)));
		if (!isset($ascii[$utf8_char])) {
			$ascii[$utf8_char] = $to;
		}
	}

	if ($cacheFileCase) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileCase,serialize($utf8CaseFolding));
	}

	if ($cacheFileASCII) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileASCII,serialize($ascii));
	}

	return 3;
}
1246
/**
 * This function initializes the folding table for a charset other than UTF-8.
 * This function is automatically called by the case folding functions.
 *
 * The table is derived from the UTF-8 case folding table: every character of the charset's conversion
 * table is looked up there and the result is converted back to the charset. The plain ASCII letters are added at the end.
 *
 * @param	string		Charset for which to initialize case folding.
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initCaseFolding($charset) {
		// Table already in memory: nothing to do
	if (is_array($this->caseFolding[$charset])) {
		return 1;
	}

		// A cache file from an earlier run is taken as is
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

		// Needed for building: the conversion table of the charset, then the UTF-8 case folding as base table
	if (!$this->initCharset($charset) || !$this->initUnicodeData('case')) {
		return false;
	}

	$noChar = chr($this->noCharByteVal);
	$foldingTables = array('toUpper', 'toLower', 'toTitle');
	foreach ($this->parsedCharsets[$charset]['local'] as $utf8Char) {
			// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$localChar = $this->utf8_decode($utf8Char, $charset);

		foreach ($foldingTables as $table) {
			$folded = $this->utf8_decode($this->caseFolding['utf-8'][$table][$utf8Char], $charset);
				// Keep the mapping only if the folded character exists in the charset
			if ($folded != '' && $folded != $noChar) {
				$this->caseFolding[$charset][$table][$localChar] = $folded;
			}
		}
	}

		// add the ASCII case table ("a" and "A" are 32 positions apart)
	for ($code = ord('a'); $code <= ord('z'); $code++) {
		$this->caseFolding[$charset]['toUpper'][chr($code)] = chr($code-32);
	}
	for ($code = ord('A'); $code <= ord('Z'); $code++) {
		$this->caseFolding[$charset]['toLower'][chr($code)] = chr($code+32);
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->caseFolding[$charset]));
	}

	return 3;
}
1308
/**
 * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
 * This function is automatically called by the ASCII transliteration functions.
 *
 * The table is derived from the UTF-8 to-ASCII table: for every character of the charset's conversion table
 * which has an entry there, the entry is stored under the character in the charset's own encoding.
 *
 * @param	string		Charset for which to initialize conversion.
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initToASCII($charset) {
		// Only process if the table is not yet loaded:
	if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) return 1;

		// Use cached version if possible
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->toASCII[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

		// init UTF-8 conversion for this charset
	if (!$this->initCharset($charset)) {
		return false;
	}

		// UTF-8/ASCII transliteration is used as the base conversion table
	if (!$this->initUnicodeData('ascii')) {
		return false;
	}

		// Start with an empty table, so that a charset without any mapping is still stored (and cached) as array
	$this->toASCII[$charset] = array();
	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
			// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);

		if (isset($this->toASCII['utf-8'][$utf8])) {
			$this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
		}
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->toASCII[$charset]));
	}

	return 3;
}
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370 /********************************************
1371 *
1372 * String operation functions
1373 *
1374 ********************************************/
1375
/**
 * Returns a part of a string.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * Depending on $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] the work is done by mbstring or iconv;
 * otherwise by the internal functions for UTF-8, EUC based, two-byte, four-byte and (as fallback) single-byte charsets.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Start position (character position)
 * @param	integer		Length (in characters). If omitted, everything from the start position to the end is returned.
 * @return	string		The substring
 * @see substr(), mb_substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function substr($charset,$string,$start,$len=null) {
	if ($len===0) return '';

	$utils = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] : '';

	if ($utils == 'mbstring') {
			// cannot omit $len, when specifying charset
		if ($len==null) {
			$enc = mb_internal_encoding();	// save internal encoding
			mb_internal_encoding($charset);
			$str = mb_substr($string,$start);
			mb_internal_encoding($enc);	// restore internal encoding

			return $str;
		}
		return mb_substr($string,$start,$len,$charset);
	} elseif ($utils == 'iconv') {
			// cannot omit $len, when specifying charset
		if ($len==null) {
			$enc = iconv_get_encoding('internal_encoding');	// save internal encoding
			iconv_set_encoding('internal_encoding',$charset);
			$str = iconv_substr($string,$start);
			iconv_set_encoding('internal_encoding',$enc);	// restore internal encoding

			return $str;
		}
		return iconv_substr($string,$start,$len,$charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_substr($string,$start,$len);
	} elseif (!empty($this->eucBasedSets[$charset])) {
		return $this->euc_substr($string,$start,$charset,$len);
	}

		// Fixed-width charsets: character positions are translated to byte positions by the character width
	$width = 0;
	if (!empty($this->twoByteSets[$charset])) {
		$width = 2;
	} elseif (!empty($this->fourByteSets[$charset])) {
		$width = 4;
	}
	if ($width) {
			// An omitted length must not be multiplied (null*$width is 0 and would cut everything off)
		return $len === NULL ? substr($string,$start*$width) : substr($string,$start*$width,$len*$width);
	}

		// treat everything else as single-byte encoding
	return $len === NULL ? substr($string,$start) : substr($string,$start,$len);
}
1430
/**
 * Counts the number of characters.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * mbstring or iconv do the counting if configured in $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'],
 * otherwise the internal functions for the family of the charset are used.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @return	integer		The number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strlen($charset,$string) {
	$utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];

		// PHP extensions first
	if ($utils == 'mbstring') {
		return mb_strlen($string,$charset);
	}
	if ($utils == 'iconv') {
		return iconv_strlen($string,$charset);
	}

		// Charsets with a varying number of bytes per character
	if ($charset == 'utf-8') {
		return $this->utf8_strlen($string);
	}
	if ($this->eucBasedSets[$charset]) {
		return $this->euc_strlen($string,$charset);
	}

		// Charsets with a fixed number of bytes per character
	if ($this->twoByteSets[$charset]) {
		return strlen($string)/2;
	}
	if ($this->fourByteSets[$charset]) {
		return strlen($string)/4;
	}

		// treat everything else as single-byte encoding
	return strlen($string);
}
1458
/**
 * Truncates a string and pre-/appends a string.
 * Unit tested by Kasper
 *
 * A positive length keeps the first characters and appends the crop signifier, a negative length keeps the
 * last characters and prepends it. A string which is not longer than the length is returned unchanged,
 * as is any string if the length is zero.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Length (in characters)
 * @param	string		Crop signifier
 * @return	string		The shortened string
 * @see substr(), mb_strimwidth()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function crop($charset,$string,$len,$crop='') {
	if (intval($len) == 0) return $string;

		// Find the byte position $i where the string is to be cut
	if ($charset == 'utf-8') {
		$i = $this->utf8_char2byte_pos($string,$len);
	} elseif (!empty($this->eucBasedSets[$charset])) {
		$i = $this->euc_char2byte_pos($string,$len,$charset);
	} else {
		if ($len > 0) {
			$i = $len;
		} else {
			$i = strlen($string)+$len;
			if ($i<=0) $i = false;
		}
	}

	if ($i === false) {	// $len outside actual string length
		return $string;
	}

	$byteLength = strlen($string);
	if ($len > 0) {
			// Crop only if there is at least one byte at/behind the cut position
		if ($i < $byteLength) {
			return substr($string,0,$i).$crop;
		}
	} else {
			// Crop only if there is at least one byte in front of the cut position
		if ($i >= 1 && $i <= $byteLength) {
			return $crop.substr($string,$i);
		}
	}

	return $string;
}
1513
/**
 * Cuts a string short at a given byte length.
 *
 * The cut never ends inside a character: for multibyte charsets the length is reduced until it falls on a character boundary.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		The byte length
 * @return	string		The shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strtrunc($charset,$string,$len) {
	if ($len <= 0) return '';

	$utils = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] : '';

	if ($utils == 'mbstring') {
		return mb_strcut($string,0,$len,$charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_strtrunc($string,$len);
	} elseif (!empty($this->eucBasedSets[$charset])) {
			// The byte length has to be handed over as well (it was missing in this call before).
			// NOTE(review): argument order ($str,$len,$charset) as for euc_char2byte_pos() - confirm against euc_strtrunc()
		return $this->euc_strtrunc($string,$len,$charset);
	} elseif (!empty($this->twoByteSets[$charset])) {
		$len -= $len % 2;	// don't cut at odd positions
	} elseif (!empty($this->fourByteSets[$charset])) {
		$len -= $len % 4;	// realign to position dividable by four
	}

		// treat everything else as single-byte encoding
	return substr($string,0,$len);
}
1542
/**
 * Translates all characters of a string into their respective case values.
 * Unlike strtolower() and strtoupper() this method is locale independent.
 * Note that the string length may change!
 * eg. lower case German ß (sharp S) becomes upper case "SS"
 * Unit-tested by Kasper
 * Real case folding is language dependent, this method ignores this fact.
 *
 * @param	string		Character set of string
 * @param	string		Input string to convert case for
 * @param	string		Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
 * @return	string		The converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper()
 */
function conv_case($charset,$string,$case) {
		// mbstring does the job for any charset if configured
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return $case == 'toLower' ? mb_strtolower($string,$charset) : mb_strtoupper($string,$charset);
	}

	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string,'case',$case);
	}

	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string,$charset,'case',$case);
	}

		// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string,$charset,'case',$case);
}
1576
/**
 * Converts special chars (umlauts etc) to ascii equivalents (usually double-bytes, one character becoming e.g. "ae")
 *
 * @param	string		Character set of string
 * @param	string		Input string to convert
 * @return	string		The converted string
 */
function specCharsToASCII($charset,$string) {
	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string,'ascii');
	}

	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string,$charset,'ascii');
	}

		// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string,$charset,'ascii');
}
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608 /********************************************
1609 *
1610 * Internal string operation functions
1611 *
1612 ********************************************/
1613
/**
 * Maps all characters of a string in a single byte charset.
 *
 * Every byte of the string is looked up in the case folding or to-ASCII table of the charset;
 * bytes without an entry are copied unchanged. If the table can't be initialized the string is returned as it is.
 *
 * @param	string		the string
 * @param	string		the charset
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function sb_char_mapping($str,$charset,$mode,$opt='') {
	switch($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) return $str;	// do nothing
			$map =& $this->caseFolding[$charset][$opt];
		break;

		case 'ascii':
			if (!$this->initToASCII($charset)) return $str;	// do nothing
			$map =& $this->toASCII[$charset];
		break;

		default:
			return $str;
	}

		// Walk the string byte by byte, bounded by its length instead of probing offsets beyond its end
	$str = (string)$str;
	$byteLength = strlen($str);
	$out = '';
	for ($i=0; $i<$byteLength; $i++) {
		$c = $str[$i];
		$out .= isset($map[$c]) ? $map[$c] : $c;
	}

	return $out;
}
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662 /********************************************
1663 *
1664 * Internal UTF-8 string operation functions
1665 *
1666 ********************************************/
1667
/**
 * Returns a part of a UTF-8 string.
 * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
 *
 * @param	string		UTF-8 string
 * @param	integer		Start position (character position)
 * @param	integer		Length (in characters)
 * @return	string		The substring. FALSE if a positive $start lies outside of the string.
 * @see substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_substr($str,$start,$len=null) {
		// A length of zero (0 or "0") always gives an empty string; an omitted length is not handed to strcmp()
	if ($len !== null && !strcmp($len,'0')) return '';

	$byte_start = $this->utf8_char2byte_pos($str,$start);
	if ($byte_start === false) {
		if ($start > 0) {
			return false;	// $start outside string length
		}
			// A negative $start reaching (at least) back to the first character: begin at the first byte
		$byte_start = 0;
	}

	$str = substr($str,$byte_start);

	if ($len!=null) {
		$byte_end = $this->utf8_char2byte_pos($str,$len);
		if ($byte_end === false) {	// $len outside actual string length
			return $len<0 ? '' : $str;	// When length is less than zero and exceeds, then we return blank string.
		}
		return substr($str,0,$byte_end);
	}

	return $str;
}
1702
/**
 * Counts the number of characters of a string in UTF-8.
 * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
 *
 * Every byte which is not a trailing byte of a multibyte sequence (10xxxxxx) starts a character and is counted.
 *
 * @param	string		UTF-8 multibyte character string
 * @return	integer		The number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strlen($str) {
	$str = (string)$str;
	$byteLength = strlen($str);

	$n = 0;
	for ($i=0; $i<$byteLength; $i++) {
			// single-byte (0xxxxxxx) and multi-byte starting byte (11xxxxxx) count, 10xxxxxx doesn't
		if ((ord($str[$i]) & 0xC0) != 0x80) {
			$n++;
		}
	}
	return $n;
}
1723
/**
 * Truncates a string in UTF-8 short at a given byte length.
 *
 * If the cut would fall into a multibyte character, the string is cut in front of that character instead.
 *
 * @param	string		UTF-8 multibyte character string
 * @param	integer		the byte length
 * @return	string		the shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strtrunc($str,$len) {
	$i = $len-1;

		// Last wanted byte lies outside of the string: there is no character to keep intact
	if ($i < 0 || $i >= strlen($str)) {
		return substr($str,0,$len);
	}

	if (ord($str[$i]) & 0x80) {	// part of a multibyte sequence
		for (; $i>0 && !(ord($str[$i]) & 0x40); $i--)	;	// find the first byte
			// sanity check: only give up if no starting byte was found at all
			// (a starting byte at the very beginning of the string is fine)
		if (!(ord($str[$i]) & 0x40)) return '';
		for ($bc=0, $mbs=ord($str[$i]); $mbs & 0x80; $mbs = $mbs << 1)	$bc++;	// calculate number of bytes
		if ($bc+$i > $len) return substr($str,0,$i);
			// fallthru: multibyte char fits into length
	}
	return substr($str,0,$len);
}
1744
/**
 * Find position of first occurrence of a string, both arguments are in UTF-8.
 *
 * mbstring or iconv are used if configured in $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']. Otherwise the
 * search is done on byte level and the byte positions are translated from/to character positions.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 string to search for
 * @param	integer		Position to start the search
 * @return	integer		The character position (FALSE if not found or if the start position is outside of the string)
 * @see strpos()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strpos($haystack,$needle,$offset=0) {
	switch ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) {
		case 'mbstring':
			return mb_strpos($haystack,$needle,$offset,'utf-8');
		case 'iconv':
			return iconv_strpos($haystack,$needle,$offset,'utf-8');
	}

		// Character offset => byte offset (FALSE: offset beyond string length)
	$startByte = $this->utf8_char2byte_pos($haystack,$offset);
	if ($startByte !== false) {
			// Byte position of the needle (FALSE: needle not found) => character position
		$foundByte = strpos($haystack,$needle,$startByte);
		if ($foundByte !== false) {
			return $this->utf8_byte2char_pos($haystack,$foundByte);
		}
	}

	return false;
}
1770
/**
 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
 *
 * mbstring or iconv are used if configured in $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']. Otherwise the
 * search is done on byte level and the byte position is translated to a character position.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 character to search for (single character)
 * @return	integer		The character position (FALSE if not found)
 * @see strrpos()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strrpos($haystack,$needle) {
	$utils = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] : '';

	if ($utils == 'mbstring') {
			// The third argument of mb_strrpos() is the offset; the encoding has to go into the fourth one
		return mb_strrpos($haystack,$needle,0,'utf-8');
	} elseif ($utils == 'iconv') {
		return iconv_strrpos($haystack,$needle,'utf-8');
	}

	$byte_pos = strrpos($haystack,$needle);
	if ($byte_pos === false) return false;	// needle not found

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
1792
/**
 * Translates a character position into an 'absolute' byte position.
 * Unit tested by Kasper.
 *
 * For a positive position the string is scanned from its beginning, for a negative one from its end.
 * FALSE is returned if the scan leaves the string, which includes a negative position pointing
 * exactly at the first character and any position in an empty string.
 *
 * @param	string		UTF-8 string
 * @param	integer		Character position (negative values start from the end)
 * @return	integer		Byte position
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char2byte_pos($str,$pos) {
	$str = (string)$str;
	$byteLength = strlen($str);

	$n = 0;			// number of characters found
	$p = abs($pos);		// number of characters wanted

	if ($pos >= 0) {
		$i = 0;
		$d = 1;
	} else {
		$i = $byteLength-1;
		$d = -1;
	}

		// The index is checked against both ends of the string explicitly:
		// a negative string offset does not read "nothing" but counts from the end (as of PHP 7.1)
	for ( ; $i>=0 && $i<$byteLength && $n<$p; $i+=$d) {
			// single-byte (0xxxxxxx) and multi-byte starting byte (11xxxxxx) count, 10xxxxxx doesn't
		if ((ord($str[$i]) & 0xC0) != 0x80) {
			$n++;
		}
	}
	if ($i<0 || $i>=$byteLength) return false;	// offset beyond string length

	if ($pos >= 0) {
			// skip trailing multi-byte data bytes
		while ($i<$byteLength && (ord($str[$i]) & 0xC0) == 0x80) {
			$i++;
		}
	} else {
			// correct offset
		$i++;
	}

	return $i;
}
1833
1834 /**
1835 * Translates an 'absolute' byte position into a character position.
1836 * Unit tested by Kasper.
1837 *
1838 * @param string UTF-8 string
1839 * @param integer byte position
1840 * @return integer character position
1841 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1842 */
function utf8_byte2char_pos($str,$pos) {
		// Converts the byte offset $pos into a character offset within $str.
		// Returns false for an empty string and for offsets outside 0..strlen($str).
	$len = strlen($str);
	if ($len == 0 || $pos < 0 || $pos > $len)	return false;	// offset beyond string length

	$n = 0;		// number of characters
	$i = $pos;
	if ($i == $len)	{
			// The position directly behind the last byte is treated as a character
			// boundary, so it maps to the number of characters in the string.
		$n = 1;
		$i--;
	}

		// Count the character starts in bytes $pos..1; for a $pos that is itself a
		// character start this equals the number of characters in front of it.
	for ( ; $i>0; $i--)	{
			// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		if ((ord($str[$i]) & 0xC0) != 0x80)	{
			$n++;
		}
	}

	return $n;
}
1856
1857 /**
1858 * Maps all characters of an UTF-8 string.
1859 *
1860 * @param string UTF-8 string
1861 * @param string mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
1862 * @param string 'case': conversion 'toLower' or 'toUpper'
1863 * @return string the converted string
1864 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1865 */
function utf8_char_mapping($str,$mode,$opt='') {
		// Replaces every character of $str that has an entry in the selected mapping
		// table; characters without an entry are copied unchanged.
	if (!$this->initUnicodeData($mode))	return $str;	// do nothing

	switch($mode)	{
		case 'case':
			$map =& $this->caseFolding['utf-8'][$opt];
		break;

		case 'ascii':
			$map =& $this->toASCII['utf-8'];
		break;

		default:
			return $str;
	}

	$out = '';
	$len = strlen($str);
	for ($i=0; $i<$len; $i++)	{
		$c = ord($str[$i]);
		if (($c & 0xC0) == 0xC0)	{	// multi-byte starting byte (11xxxxxx)
				// the number of leading 1-bits is the byte length of the sequence
			for ($bc=0; $c & 0x80; $c = $c << 1)	{ $bc++; }
			$mbc = substr($str,$i,$bc);
			$i += $bc-1;
		} else {
				// single-byte (0xxxxxxx). A stray continuation byte (10xxxxxx) is also
				// handled here, so it is passed through instead of repeating the
				// previously mapped character.
			$mbc = $str[$i];
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920 /********************************************
1921 *
1922 * Internal EUC string operation functions
1923 *
1924 * Extended Unix Code:
1925 * ASCII compatible 7bit single bytes chars
1926 * 8bit two byte chars
1927 *
1928 * Shift-JIS is treated as a special case.
1929 *
1930 ********************************************/
1931
1932 /**
1933 * Cuts a string in the EUC charset family short at a given byte length.
1934 *
1935 * @param string EUC multibyte character string
1936 * @param integer the byte length
1937 * @param string the charset
1938 * @return string the shortened string
1939 * @see mb_strcut()
1940 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1941 */
function euc_strtrunc($str,$len,$charset) {
		// Cuts $str to at most $len bytes without splitting a double-byte character.
	$strLen = strlen($str);
	if ($strLen <= $len)	return $str;	// string not longer than supplied length

	$sjis = ($charset == 'shift_jis');

		// $len is smaller than the string length here, so $i stays inside the string
	for ($i=0; $i<$len; $i++)	{
		$c = ord($str[$i]);
		if ($sjis)	{
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))	$i++;	// advance a double-byte char
		} else {
			if ($c >= 0x80)	$i++;	// advance a double-byte char
		}
	}

	if ($i>$len)	{
			// the byte at $len-1 is the first byte of a double-byte char: drop it
		return substr($str,0,$len-1);
	}
	return substr($str,0,$len);
}
1960
1961 /**
1962 * Returns a part of a string in the EUC charset family.
1963 *
1964 * @param string EUC multibyte character string
1965 * @param integer start position (character position)
1966 * @param string the charset
1967 * @param integer length (in characters)
1968 * @return string the substring
1969 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1970 */
function euc_substr($str,$start,$charset,$len=null) {
		// Returns the part of $str that begins at character $start and is $len
		// characters long; false if euc_char2byte_pos() cannot resolve $start.
	$byte_start = $this->euc_char2byte_pos($str,$start,$charset);
	if ($byte_start === false)	return false;	// $start outside string length

	$str = substr($str,$byte_start);

		// Only null, '' and false mean "no length given". A length of 0 is a real
		// length and has to yield an empty string, so no loose comparison with null.
	if ($len === null || $len === '' || $len === false)	{
		return $str;
	}

	$byte_end = $this->euc_char2byte_pos($str,$len,$charset);
	if ($byte_end === false)	{	// $len outside actual string length
		return $str;
	}

	return substr($str,0,$byte_end);
}
1986
1987 /**
1988 * Counts the number of characters of a string in the EUC charset family.
1989 *
1990 * @param string EUC multibyte character string
1991 * @param string the charset
1992 * @return integer the number of characters
1993 * @see strlen()
1994 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1995 */
function euc_strlen($str,$charset) {
		// Counts characters: a lead byte together with the byte following it counts once.
	$sjis = ($charset == 'shift_jis');
	$len = strlen($str);
	$n = 0;
	for ($i=0; $i<$len; $i++)	{
		$c = ord($str[$i]);
		if ($sjis)	{
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))	$i++;	// advance a double-byte char
		} else {
			if ($c >= 0x80)	$i++;	// advance a double-byte char
		}

		$n++;
	}

	return $n;
}
2013
2014 /**
2015 * Translates a character position into an 'absolute' byte position.
2016 *
2017 * @param string EUC multibyte character string
2018 * @param integer character position (negative values start from the end)
2019 * @param string the charset
2020 * @return integer byte position
2021 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
2022 */
function euc_char2byte_pos($str,$pos,$charset) {
		// Converts the character offset $pos into a byte offset within $str.
		// A negative $pos counts characters from the end of the string.
		// Returns false if the walk leaves the string before or while reaching $pos.
	$sjis = ($charset == 'shift_jis');
	$len = strlen($str);
	$n = 0;				// number of characters seen
	$p = abs($pos);		// number of characters wanted

	if ($pos >= 0)	{
		$i = 0;
		$d = 1;
	} else {
		$i = $len-1;
		$d = -1;
	}

		// The bounds are tested explicitly: reading an offset outside the string must
		// end the walk, and negative offsets must never wrap around to the string end.
	for ( ; $i>=0 && $i<$len && $n<$p; $i+=$d)	{
		$c = ord($str[$i]);
		if ($sjis)	{
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))	$i+=$d;	// advance a double-byte char
		} else {
			if ($c >= 0x80)	$i+=$d;	// advance a double-byte char
		}

		$n++;
	}
	if ($i<0 || $i>=$len)	return false;	// offset beyond string length

	if ($pos < 0)	$i++;	// correct offset: the backward walk stopped one byte too early

	return $i;
}
2053
2054 /**
2055 * Maps all characters of a string in the EUC charset family.
2056 *
2057 * @param string EUC multibyte character string
2058 * @param string the charset
2059 * @param string mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
2060 * @param string 'case': conversion 'toLower' or 'toUpper'
2061 * @return string the converted string
2062 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
2063 */
function euc_char_mapping($str,$charset,$mode,$opt='') {
		// Replaces every character of $str that has an entry in the mapping table
		// selected by $mode; characters without an entry are copied unchanged.
	switch($mode)	{
		case 'case':
			if (!$this->initCaseFolding($charset))	return $str;	// do nothing
			$map =& $this->caseFolding[$charset][$opt];
		break;

		case 'ascii':
			if (!$this->initToASCII($charset))	return $str;	// do nothing
			$map =& $this->toASCII[$charset];
		break;

		default:
			return $str;
	}

	$sjis = ($charset == 'shift_jis');
	$out = '';
	$len = strlen($str);
	for ($i=0; $i<$len; $i++)	{
		$mbc = $str[$i];
		$c = ord($mbc);

			// decide whether this byte opens a double-byte char
		if ($sjis)	{
			$isLead = (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0));
		} else {
			$isLead = ($c >= 0x80);
		}

		if ($isLead)	{	// a double-byte char: take this byte and the next one
			$mbc = substr($str,$i,2);
			$i++;
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
2108
2109 }
2110
2111 if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']) {
2112 include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
2113 }
2114
2115 ?>