Fixed issue #17284: Formprotection persistToken method is called too often, causing...
[Packages/TYPO3.CMS.git] / t3lib / class.t3lib_cs.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2003-2011 Kasper Skårhøj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the Typo3 project. The Typo3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 *
17 * This script is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * This copyright notice MUST APPEAR in all copies of the script!
23 ***************************************************************/
24 /**
25 * Class for conversion between charsets.
26 *
27 * $Id$
28 *
29 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
30 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
31 */
32 /**
33 * [CLASS/FUNCTION INDEX of SCRIPT]
34 *
35 *
36 *
37 * 136: class t3lib_cs
38 * 488: function parse_charset($charset)
39 * 507: function get_locale_charset($locale)
40 *
41 * SECTION: Charset Conversion functions
42 * 560: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
43 * 600: function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0)
44 * 617: function utf8_encode($str,$charset)
45 * 663: function utf8_decode($str,$charset,$useEntityForNoChar=0)
46 * 706: function utf8_to_entities($str)
47 * 739: function entities_to_utf8($str,$alsoStdHtmlEnt=0)
48 * 773: function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
49 * 823: function UnumberToChar($cbyte)
50 * 868: function utf8CharToUnumber($str,$hex=0)
51 *
52 * SECTION: Init functions
53 * 911: function initCharset($charset)
54 * 973: function initUnicodeData($mode=null)
55 * 1198: function initCaseFolding($charset)
56 * 1260: function initToASCII($charset)
57 *
58 * SECTION: String operation functions
59 * 1331: function substr($charset,$string,$start,$len=null)
60 * 1384: function strlen($charset,$string)
61 * 1414: function crop($charset,$string,$len,$crop='')
62 * 1467: function strtrunc($charset,$string,$len)
63 * 1501: function conv_case($charset,$string,$case)
64 * 1527: function specCharsToASCII($charset,$string)
65 *
66 * SECTION: Internal string operation functions
67 * 1567: function sb_char_mapping($str,$charset,$mode,$opt='')
68 *
69 * SECTION: Internal UTF-8 string operation functions
70 * 1622: function utf8_substr($str,$start,$len=null)
71 * 1655: function utf8_strlen($str)
72 * 1676: function utf8_strtrunc($str,$len)
73 * 1698: function utf8_strpos($haystack,$needle,$offset=0)
74 * 1723: function utf8_strrpos($haystack,$needle)
75 * 1745: function utf8_char2byte_pos($str,$pos)
76 * 1786: function utf8_byte2char_pos($str,$pos)
77 * 1809: function utf8_char_mapping($str,$mode,$opt='')
78 *
79 * SECTION: Internal EUC string operation functions
80 * 1885: function euc_strtrunc($str,$len,$charset)
81 * 1914: function euc_substr($str,$start,$charset,$len=null)
82 * 1939: function euc_strlen($str,$charset)
83 * 1966: function euc_char2byte_pos($str,$pos,$charset)
84 * 2007: function euc_char_mapping($str,$charset,$mode,$opt='')
85 *
86 * TOTAL FUNCTIONS: 35
87 * (This index is automatically created/updated by the extension "extdeveval")
88 *
89 */
90
91
92 /**
93 * Notes on UTF-8
94 *
95 * Functions working on UTF-8 strings:
96 *
97 * - strchr/strstr
98 * - strrchr
99 * - substr_count
100 * - implode/explode/join
101 *
102 * Functions nearly working on UTF-8 strings:
103 *
104 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
105 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
106 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
107 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
108 * - preg_*: UTF-8 support is compiled into PHP by default nowadays, but could be unavailable; patterns must use the "u" modifier
109 *
110 * Functions NOT working on UTF-8 strings:
111 *
112 * - str*cmp
113 * - stristr
114 * - stripos
115 * - substr
116 * - strrev
117 * - split/spliti
118 * - ...
119 *
120 */
121 /**
122 * Class for conversion between charsets
123 *
124 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
125 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
126 * @package TYPO3
127 * @subpackage t3lib
128 */
129 class t3lib_cs {
130 var $noCharByteVal = 63; // ASCII Value for chars with no equivalent.
131
132 // This is the array where parsed conversion tables are stored (cached)
133 var $parsedCharsets = array();
134
135 // An array where case folding data will be stored (cached)
136 var $caseFolding = array();
137
138 // An array where charset-to-ASCII mappings are stored (cached)
139 var $toASCII = array();
140
141 // This tells the converter which charsets has two bytes per char:
142 var $twoByteSets = array(
143 'ucs-2' => 1, // 2-byte Unicode
144 );
145
146 // This tells the converter which charsets has four bytes per char:
147 var $fourByteSets = array(
148 'ucs-4' => 1, // 4-byte Unicode
149 'utf-32' => 1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
150 );
151
152 // This tells the converter which charsets use a scheme like the Extended Unix Code:
153 var $eucBasedSets = array(
154 'gb2312' => 1, // Chinese, simplified.
155 'big5' => 1, // Chinese, traditional.
156 'euc-kr' => 1, // Korean
157 'shift_jis' => 1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
158 );
159
160 // see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
161 // http://czyborra.com/charsets/iso8859.html
162 var $synonyms = array(
163 'us' => 'ascii',
164 'us-ascii' => 'ascii',
165 'cp819' => 'iso-8859-1',
166 'ibm819' => 'iso-8859-1',
167 'iso-ir-100' => 'iso-8859-1',
168 'iso-ir-101' => 'iso-8859-2',
169 'iso-ir-109' => 'iso-8859-3',
170 'iso-ir-110' => 'iso-8859-4',
171 'iso-ir-144' => 'iso-8859-5',
172 'iso-ir-127' => 'iso-8859-6',
173 'iso-ir-126' => 'iso-8859-7',
174 'iso-ir-138' => 'iso-8859-8',
175 'iso-ir-148' => 'iso-8859-9',
176 'iso-ir-157' => 'iso-8859-10',
177 'iso-ir-179' => 'iso-8859-13',
178 'iso-ir-199' => 'iso-8859-14',
179 'iso-ir-203' => 'iso-8859-15',
180 'csisolatin1' => 'iso-8859-1',
181 'csisolatin2' => 'iso-8859-2',
182 'csisolatin3' => 'iso-8859-3',
183 'csisolatin5' => 'iso-8859-9',
184 'csisolatin8' => 'iso-8859-14',
185 'csisolatin9' => 'iso-8859-15',
186 'csisolatingreek' => 'iso-8859-7',
187 'iso-celtic' => 'iso-8859-14',
188 'latin1' => 'iso-8859-1',
189 'latin2' => 'iso-8859-2',
190 'latin3' => 'iso-8859-3',
191 'latin5' => 'iso-8859-9',
192 'latin6' => 'iso-8859-10',
193 'latin8' => 'iso-8859-14',
194 'latin9' => 'iso-8859-15',
195 'l1' => 'iso-8859-1',
196 'l2' => 'iso-8859-2',
197 'l3' => 'iso-8859-3',
198 'l5' => 'iso-8859-9',
199 'l6' => 'iso-8859-10',
200 'l8' => 'iso-8859-14',
201 'l9' => 'iso-8859-15',
202 'cyrillic' => 'iso-8859-5',
203 'arabic' => 'iso-8859-6',
204 'tis-620' => 'iso-8859-11',
205 'win874' => 'windows-874',
206 'win1250' => 'windows-1250',
207 'win1251' => 'windows-1251',
208 'win1252' => 'windows-1252',
209 'win1253' => 'windows-1253',
210 'win1254' => 'windows-1254',
211 'win1255' => 'windows-1255',
212 'win1256' => 'windows-1256',
213 'win1257' => 'windows-1257',
214 'win1258' => 'windows-1258',
215 'cp1250' => 'windows-1250',
216 'cp1251' => 'windows-1251',
217 'cp1252' => 'windows-1252',
218 'ms-ee' => 'windows-1250',
219 'ms-ansi' => 'windows-1252',
220 'ms-greek' => 'windows-1253',
221 'ms-turk' => 'windows-1254',
222 'winbaltrim' => 'windows-1257',
223 'koi-8ru' => 'koi-8r',
224 'koi8r' => 'koi-8r',
225 'cp878' => 'koi-8r',
226 'mac' => 'macroman',
227 'macintosh' => 'macroman',
228 'euc-cn' => 'gb2312',
229 'x-euc-cn' => 'gb2312',
230 'euccn' => 'gb2312',
231 'cp936' => 'gb2312',
232 'big-5' => 'big5',
233 'cp950' => 'big5',
234 'eucjp' => 'euc-jp',
235 'sjis' => 'shift_jis',
236 'shift-jis' => 'shift_jis',
237 'cp932' => 'shift_jis',
238 'cp949' => 'euc-kr',
239 'utf7' => 'utf-7',
240 'utf8' => 'utf-8',
241 'utf16' => 'utf-16',
242 'utf32' => 'utf-32',
243 'utf8' => 'utf-8',
244 'ucs2' => 'ucs-2',
245 'ucs4' => 'ucs-4',
246 );
247
248 // mapping of iso-639-1 language codes to script names
249 var $lang_to_script = array(
250 // iso-639-1 language codes, see http://www.loc.gov/standards/iso639-2/php/code_list.php
251 'ar' => 'arabic',
252 'bg' => 'cyrillic', // Bulgarian
253 'bs' => 'east_european', // Bosnian
254 'cs' => 'east_european', // Czech
255 'da' => 'west_european', // Danish
256 'de' => 'west_european', // German
257 'es' => 'west_european', // Spanish
258 'et' => 'estonian',
259 'eo' => 'unicode', // Esperanto
260 'eu' => 'west_european', // Basque
261 'fa' => 'arabic', // Persian
262 'fi' => 'west_european', // Finish
263 'fo' => 'west_european', // Faroese
264 'fr' => 'west_european', // French
265 'ga' => 'west_european', // Galician
266 'ge' => 'unicode', // Georgian
267 'gr' => 'greek',
268 'he' => 'hebrew', // Hebrew (since 1998)
269 'hi' => 'unicode', // Hindi
270 'hr' => 'east_european', // Croatian
271 'hu' => 'east_european', // Hungarian
272 'iw' => 'hebrew', // Hebrew (til 1998)
273 'is' => 'west_european', // Icelandic
274 'it' => 'west_european', // Italian
275 'ja' => 'japanese',
276 'kl' => 'west_european', // Greenlandic
277 'km' => 'unicode', // Khmer
278 'ko' => 'korean',
279 'lt' => 'lithuanian',
280 'lv' => 'west_european', // Latvian/Lettish
281 'nl' => 'west_european', // Dutch
282 'no' => 'west_european', // Norwegian
283 'nb' => 'west_european', // Norwegian Bokmal
284 'nn' => 'west_european', // Norwegian Nynorsk
285 'pl' => 'east_european', // Polish
286 'pt' => 'west_european', // Portuguese
287 'ro' => 'east_european', // Romanian
288 'ru' => 'cyrillic', // Russian
289 'sk' => 'east_european', // Slovak
290 'sl' => 'east_european', // Slovenian
291 'sr' => 'cyrillic', // Serbian
292 'sv' => 'west_european', // Swedish
293 'sq' => 'albanian', // Albanian
294 'th' => 'thai',
295 'uk' => 'cyrillic', // Ukranian
296 'vi' => 'vietnamese',
297 'zh' => 'chinese',
298 // MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
299 // http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
300 'ara' => 'arabic',
301 'bgr' => 'cyrillic', // Bulgarian
302 'cat' => 'west_european', // Catalan
303 'chs' => 'simpl_chinese',
304 'cht' => 'trad_chinese',
305 'csy' => 'east_european', // Czech
306 'dan' => 'west_european', // Danisch
307 'deu' => 'west_european', // German
308 'dea' => 'west_european', // German (Austrian)
309 'des' => 'west_european', // German (Swiss)
310 'ena' => 'west_european', // English (Australian)
311 'enc' => 'west_european', // English (Canadian)
312 'eng' => 'west_european', // English
313 'enz' => 'west_european', // English (New Zealand)
314 'enu' => 'west_european', // English (United States)
315 'euq' => 'west_european', // Basque
316 'fos' => 'west_european', // Faroese
317 'far' => 'arabic', // Persian
318 'fin' => 'west_european', // Finish
319 'fra' => 'west_european', // French
320 'frb' => 'west_european', // French (Belgian)
321 'frc' => 'west_european', // French (Canadian)
322 'frs' => 'west_european', // French (Swiss)
323 'geo' => 'unicode', // Georgian
324 'glg' => 'west_european', // Galician
325 'ell' => 'greek',
326 'heb' => 'hebrew',
327 'hin' => 'unicode', // Hindi
328 'hun' => 'east_european', // Hungarian
329 'isl' => 'west_euorpean', // Icelandic
330 'ita' => 'west_european', // Italian
331 'its' => 'west_european', // Italian (Swiss)
332 'jpn' => 'japanese',
333 'khm' => 'unicode', // Khmer
334 'kor' => 'korean',
335 'lth' => 'lithuanian',
336 'lvi' => 'west_european', // Latvian/Lettish
337 'msl' => 'west_european', // Malay
338 'nlb' => 'west_european', // Dutch (Belgian)
339 'nld' => 'west_european', // Dutch
340 'nor' => 'west_european', // Norwegian (bokmal)
341 'non' => 'west_european', // Norwegian (nynorsk)
342 'plk' => 'east_european', // Polish
343 'ptg' => 'west_european', // Portuguese
344 'ptb' => 'west_european', // Portuguese (Brazil)
345 'rom' => 'east_european', // Romanian
346 'rus' => 'cyrillic', // Russian
347 'slv' => 'east_european', // Slovenian
348 'sky' => 'east_european', // Slovak
349 'srl' => 'east_european', // Serbian (Latin)
350 'srb' => 'cyrillic', // Serbian (Cyrillic)
351 'esp' => 'west_european', // Spanish (trad. sort)
352 'esm' => 'west_european', // Spanish (Mexican)
353 'esn' => 'west_european', // Spanish (internat. sort)
354 'sve' => 'west_european', // Swedish
355 'sqi' => 'albanian', // Albanian
356 'tha' => 'thai',
357 'trk' => 'turkish',
358 'ukr' => 'cyrillic', // Ukrainian
359 // English language names
360 'albanian' => 'albanian',
361 'arabic' => 'arabic',
362 'basque' => 'west_european',
363 'bosnian' => 'east_european',
364 'bulgarian' => 'east_european',
365 'catalan' => 'west_european',
366 'croatian' => 'east_european',
367 'czech' => 'east_european',
368 'danish' => 'west_european',
369 'dutch' => 'west_european',
370 'english' => 'west_european',
371 'esperanto' => 'unicode',
372 'estonian' => 'estonian',
373 'faroese' => 'west_european',
374 'farsi' => 'arabic',
375 'finnish' => 'west_european',
376 'french' => 'west_european',
377 'galician' => 'west_european',
378 'georgian' => 'unicode',
379 'german' => 'west_european',
380 'greek' => 'greek',
381 'greenlandic' => 'west_european',
382 'hebrew' => 'hebrew',
383 'hindi' => 'unicode',
384 'hungarian' => 'east_european',
385 'icelandic' => 'west_european',
386 'italian' => 'west_european',
387 'khmer' => 'unicode',
388 'latvian' => 'west_european',
389 'lettish' => 'west_european',
390 'lithuanian' => 'lithuanian',
391 'malay' => 'west_european',
392 'norwegian' => 'west_european',
393 'persian' => 'arabic',
394 'polish' => 'east_european',
395 'portuguese' => 'west_european',
396 'russian' => 'cyrillic',
397 'romanian' => 'east_european',
398 'serbian' => 'cyrillic',
399 'slovak' => 'east_european',
400 'slovenian' => 'east_european',
401 'spanish' => 'west_european',
402 'svedish' => 'west_european',
403 'that' => 'thai',
404 'turkish' => 'turkish',
405 'ukrainian' => 'cyrillic',
406 );
407
408 // mapping of language (family) names to charsets on Unix
409 var $script_to_charset_unix = array(
410 'west_european' => 'iso-8859-1',
411 'estonian' => 'iso-8859-1',
412 'east_european' => 'iso-8859-2',
413 'baltic' => 'iso-8859-4',
414 'cyrillic' => 'iso-8859-5',
415 'arabic' => 'iso-8859-6',
416 'greek' => 'iso-8859-7',
417 'hebrew' => 'iso-8859-8',
418 'turkish' => 'iso-8859-9',
419 'thai' => 'iso-8859-11', // = TIS-620
420 'lithuanian' => 'iso-8859-13',
421 'chinese' => 'gb2312', // = euc-cn
422 'japanese' => 'euc-jp',
423 'korean' => 'euc-kr',
424 'simpl_chinese' => 'gb2312',
425 'trad_chinese' => 'big5',
426 'vietnamese' => '',
427 'unicode' => 'utf-8',
428 'albanian' => 'utf-8'
429 );
430
431 // mapping of language (family) names to charsets on Windows
432 var $script_to_charset_windows = array(
433 'east_european' => 'windows-1250',
434 'cyrillic' => 'windows-1251',
435 'west_european' => 'windows-1252',
436 'greek' => 'windows-1253',
437 'turkish' => 'windows-1254',
438 'hebrew' => 'windows-1255',
439 'arabic' => 'windows-1256',
440 'baltic' => 'windows-1257',
441 'estonian' => 'windows-1257',
442 'lithuanian' => 'windows-1257',
443 'vietnamese' => 'windows-1258',
444 'thai' => 'cp874',
445 'korean' => 'cp949',
446 'chinese' => 'gb2312',
447 'japanese' => 'shift_jis',
448 'simpl_chinese' => 'gb2312',
449 'trad_chinese' => 'big5',
450 'albanian' => 'windows-1250',
451 'unicode' => 'utf-8'
452 );
453
454 // mapping of locale names to charsets
455 var $locale_to_charset = array(
456 'japanese.euc' => 'euc-jp',
457 'ja_jp.ujis' => 'euc-jp',
458 'korean.euc' => 'euc-kr',
459 'sr@Latn' => 'iso-8859-2',
460 'zh_cn' => 'gb2312',
461 'zh_hk' => 'big5',
462 'zh_tw' => 'big5',
463 );
464
465 // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
466 // Empty values means "iso-8859-1"
467 var $charSetArray = array(
468 'dk' => '',
469 'de' => '',
470 'no' => '',
471 'it' => '',
472 'fr' => '',
473 'es' => '',
474 'nl' => '',
475 'cz' => 'windows-1250',
476 'pl' => 'iso-8859-2',
477 'si' => 'windows-1250',
478 'fi' => '',
479 'tr' => 'iso-8859-9',
480 'se' => '',
481 'pt' => '',
482 'ru' => 'windows-1251',
483 'ro' => 'iso-8859-2',
484 'ch' => 'gb2312',
485 'sk' => 'windows-1250',
486 'lt' => 'windows-1257',
487 'is' => 'utf-8',
488 'hr' => 'windows-1250',
489 'hu' => 'iso-8859-2',
490 'gl' => '',
491 'th' => 'iso-8859-11',
492 'gr' => 'iso-8859-7',
493 'hk' => 'big5',
494 'eu' => '',
495 'bg' => 'windows-1251',
496 'br' => '',
497 'et' => 'iso-8859-4',
498 'ar' => 'iso-8859-6',
499 'he' => 'utf-8',
500 'ua' => 'windows-1251',
501 'jp' => 'shift_jis',
502 'lv' => 'utf-8',
503 'vn' => 'utf-8',
504 'ca' => 'iso-8859-15',
505 'ba' => 'iso-8859-2',
506 'kr' => 'euc-kr',
507 'eo' => 'utf-8',
508 'my' => '',
509 'hi' => 'utf-8',
510 'fo' => 'utf-8',
511 'fa' => 'utf-8',
512 'sr' => 'utf-8',
513 'sq' => 'utf-8',
514 'ge' => 'utf-8',
515 'ga' => '',
516 'km' => 'utf-8',
517 'qc' => '',
518 );
519
520 // TYPO3 specific: Array with the iso names used for each system language in TYPO3:
521 // Missing keys means: same as Typo3
522 var $isoArray = array(
523 'ba' => 'bs',
524 'br' => 'pt_BR',
525 'ch' => 'zh_CN',
526 'cz' => 'cs',
527 'dk' => 'da',
528 'si' => 'sl',
529 'se' => 'sv',
530 'gl' => 'kl',
531 'gr' => 'el',
532 'hk' => 'zh_HK',
533 'kr' => 'ko',
534 'ua' => 'uk',
535 'jp' => 'ja',
536 'qc' => 'fr_CA',
537 'vn' => 'vi',
538 );
539
540 /**
541 * Normalize - changes input character set to lowercase letters.
542 *
543 * @param string Input charset
544 * @return string Normalized charset
545 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
546 */
547 function parse_charset($charset) {
548 $charset = trim(strtolower($charset));
549 if (isset($this->synonyms[$charset])) {
550 $charset = $this->synonyms[$charset];
551 }
552
553 return $charset;
554 }
555
556 /**
557 * Get the charset of a locale.
558 *
559 * ln language
560 * ln_CN language / country
561 * ln_CN.cs language / country / charset
562 * ln_CN.cs@mod language / country / charset / modifier
563 *
564 * @param string Locale string
565 * @return string Charset resolved for locale string
566 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
567 */
568 function get_locale_charset($locale) {
569 $locale = strtolower($locale);
570
571 // exact locale specific charset?
572 if (isset($this->locale_to_charset[$locale])) {
573 return $this->locale_to_charset[$locale];
574 }
575
576 // get modifier
577 list($locale, $modifier) = explode('@', $locale);
578
579 // locale contains charset: use it
580 list($locale, $charset) = explode('.', $locale);
581 if ($charset) {
582 return $this->parse_charset($charset);
583 }
584
585 // modifier is 'euro' (after charset check, because of xx.utf-8@euro)
586 if ($modifier == 'euro') {
587 return 'iso-8859-15';
588 }
589
590 // get language
591 list($language, $country) = explode('_', $locale);
592 if (isset($this->lang_to_script[$language])) {
593 $script = $this->lang_to_script[$language];
594 }
595
596 if (TYPO3_OS == 'WIN') {
597 $cs = $this->script_to_charset_windows[$script] ? $this->script_to_charset_windows[$script] : 'windows-1252';
598 } else {
599 $cs = $this->script_to_charset_unix[$script] ? $this->script_to_charset_unix[$script] : 'iso-8859-1';
600 }
601
602 return $cs;
603 }
604
605
606 /********************************************
607 *
608 * Charset Conversion functions
609 *
610 ********************************************/
611
612 /**
613 * Convert from one charset to another charset.
614 *
615 * @param string Input string
616 * @param string From charset (the current charset of the string)
617 * @param string To charset (the output charset wanted)
618 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
619 * @return string Converted string
620 * @see convArray()
621 */
622 function conv($str, $fromCS, $toCS, $useEntityForNoChar = 0) {
623 if ($fromCS == $toCS) {
624 return $str;
625 }
626
627 // PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
628 if ($toCS == 'utf-8' || !$useEntityForNoChar) {
629 switch ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']) {
630 case 'mbstring':
631 $conv_str = mb_convert_encoding($str, $toCS, $fromCS);
632 if (FALSE !== $conv_str) {
633 return $conv_str;
634 } // returns false for unsupported charsets
635 break;
636
637 case 'iconv':
638 $conv_str = iconv($fromCS, $toCS . '//TRANSLIT', $str);
639 if (FALSE !== $conv_str) {
640 return $conv_str;
641 }
642 break;
643
644 case 'recode':
645 $conv_str = recode_string($fromCS . '..' . $toCS, $str);
646 if (FALSE !== $conv_str) {
647 return $conv_str;
648 }
649 break;
650 }
651 // fallback to TYPO3 conversion
652 }
653
654 if ($fromCS != 'utf-8') {
655 $str = $this->utf8_encode($str, $fromCS);
656 }
657 if ($toCS != 'utf-8') {
658 $str = $this->utf8_decode($str, $toCS, $useEntityForNoChar);
659 }
660 return $str;
661 }
662
663 /**
664 * Convert all elements in ARRAY with type string from one charset to another charset.
665 * NOTICE: Array is passed by reference!
666 *
667 * @param string Input array, possibly multidimensional
668 * @param string From charset (the current charset of the string)
669 * @param string To charset (the output charset wanted)
670 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
671 * @return void
672 * @see conv()
673 */
674 function convArray(&$array, $fromCS, $toCS, $useEntityForNoChar = 0) {
675 foreach ($array as $key => $value) {
676 if (is_array($array[$key])) {
677 $this->convArray($array[$key], $fromCS, $toCS, $useEntityForNoChar);
678 } elseif (is_string($array[$key])) {
679 $array[$key] = $this->conv($array[$key], $fromCS, $toCS, $useEntityForNoChar);
680 }
681 }
682 }
683
684 /**
685 * Converts $str from $charset to UTF-8
686 *
687 * @param string String in local charset to convert to UTF-8
688 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
689 * @return string Output string, converted to UTF-8
690 */
691 function utf8_encode($str, $charset) {
692
693 if ($charset === 'utf-8') {
694 return $str;
695 }
696
697 // Charset is case-insensitive.
698 if ($this->initCharset($charset)) { // Parse conv. table if not already...
699 $strLen = strlen($str);
700 $outStr = '';
701
702 for ($a = 0; $a < $strLen; $a++) { // Traverse each char in string.
703 $chr = substr($str, $a, 1);
704 $ord = ord($chr);
705 if (isset($this->twoByteSets[$charset])) { // If the charset has two bytes per char
706 $ord2 = ord($str{$a + 1});
707 $ord = $ord << 8 | $ord2; // assume big endian
708
709 if (isset($this->parsedCharsets[$charset]['local'][$ord])) { // If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?)
710 $outStr .= $this->parsedCharsets[$charset]['local'][$ord];
711 } else {
712 $outStr .= chr($this->noCharByteVal);
713 } // No char exists
714 $a++;
715 } elseif ($ord > 127) { // If char has value over 127 it's a multibyte char in UTF-8
716 if (isset($this->eucBasedSets[$charset])) { // EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
717 if ($charset != 'shift_jis' || ($ord < 0xA0 || $ord > 0xDF)) { // Shift-JIS: chars between 160 and 223 are single byte
718 $a++;
719 $ord2 = ord(substr($str, $a, 1));
720 $ord = $ord * 256 + $ord2;
721 }
722 }
723
724 if (isset($this->parsedCharsets[$charset]['local'][$ord])) { // If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?)
725 $outStr .= $this->parsedCharsets[$charset]['local'][$ord];
726 } else {
727 $outStr .= chr($this->noCharByteVal);
728 } // No char exists
729 } else {
730 $outStr .= $chr;
731 } // ... otherwise it's just ASCII 0-127 and one byte. Transparent
732 }
733 return $outStr;
734 }
735 }
736
737 /**
738 * Converts $str from UTF-8 to $charset
739 *
740 * @param string String in UTF-8 to convert to local charset
741 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
742 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
743 * @return string Output string, converted to local charset
744 */
745 function utf8_decode($str, $charset, $useEntityForNoChar = 0) {
746
747 if ($charset === 'utf-8') {
748 return $str;
749 }
750
751 // Charset is case-insensitive.
752 if ($this->initCharset($charset)) { // Parse conv. table if not already...
753 $strLen = strlen($str);
754 $outStr = '';
755 $buf = '';
756 for ($a = 0, $i = 0; $a < $strLen; $a++, $i++) { // Traverse each char in UTF-8 string.
757 $chr = substr($str, $a, 1);
758 $ord = ord($chr);
759 if ($ord > 127) { // This means multibyte! (first byte!)
760 if ($ord & 64) { // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
761
762 $buf = $chr; // Add first byte
763 for ($b = 0; $b < 8; $b++) { // for each byte in multibyte string...
764 $ord = $ord << 1; // Shift it left and ...
765 if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence.
766 $a++; // Increase pointer...
767 $buf .= substr($str, $a, 1); // ... and add the next char.
768 } else {
769 break;
770 }
771 }
772
773 if (isset($this->parsedCharsets[$charset]['utf8'][$buf])) { // If the UTF-8 char-sequence is found then...
774 $mByte = $this->parsedCharsets[$charset]['utf8'][$buf]; // The local number
775 if ($mByte > 255) { // If the local number is greater than 255 we will need to split the byte (16bit word assumed) in two chars.
776 $outStr .= chr(($mByte >> 8) & 255) . chr($mByte & 255);
777 } else {
778 $outStr .= chr($mByte);
779 }
780 } elseif ($useEntityForNoChar) { // Create num entity:
781 $outStr .= '&#' . $this->utf8CharToUnumber($buf, 1) . ';';
782 } else {
783 $outStr .= chr($this->noCharByteVal);
784 } // No char exists
785 } else {
786 $outStr .= chr($this->noCharByteVal);
787 } // No char exists (MIDDLE of MB sequence!)
788 } else {
789 $outStr .= $chr;
790 } // ... otherwise it's just ASCII 0-127 and one byte. Transparent
791 }
792 return $outStr;
793 }
794 }
795
796 /**
797 * Converts all chars > 127 to numeric entities.
798 *
799 * @param string Input string
800 * @return string Output string
801 */
802 function utf8_to_entities($str) {
803 $strLen = strlen($str);
804 $outStr = '';
805 $buf = '';
806 for ($a = 0; $a < $strLen; $a++) { // Traverse each char in UTF-8 string.
807 $chr = substr($str, $a, 1);
808 $ord = ord($chr);
809 if ($ord > 127) { // This means multibyte! (first byte!)
810 if ($ord & 64) { // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
811 $buf = $chr; // Add first byte
812 for ($b = 0; $b < 8; $b++) { // for each byte in multibyte string...
813 $ord = $ord << 1; // Shift it left and ...
814 if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence.
815 $a++; // Increase pointer...
816 $buf .= substr($str, $a, 1); // ... and add the next char.
817 } else {
818 break;
819 }
820 }
821
822 $outStr .= '&#' . $this->utf8CharToUnumber($buf, 1) . ';';
823 } else {
824 $outStr .= chr($this->noCharByteVal);
825 } // No char exists (MIDDLE of MB sequence!)
826 } else {
827 $outStr .= $chr;
828 } // ... otherwise it's just ASCII 0-127 and one byte. Transparent
829 }
830
831 return $outStr;
832 }
833
834 /**
835 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
836 *
837 * @param string Input string, UTF-8
838 * @param boolean If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
839 * @return string Output string
840 */
841 function entities_to_utf8($str, $alsoStdHtmlEnt = 0) {
842 if ($alsoStdHtmlEnt) {
843 $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES)); // Getting them in iso-8859-1 - but thats ok since this is observed below.
844 }
845
846 $token = md5(microtime());
847 $parts = explode($token, preg_replace('/(&([#[:alnum:]]*);)/', $token . '${2}' . $token, $str));
848 foreach ($parts as $k => $v) {
849 if ($k % 2) {
850 if (substr($v, 0, 1) == '#') { // Dec or hex entities:
851 if (substr($v, 1, 1) == 'x') {
852 $parts[$k] = $this->UnumberToChar(hexdec(substr($v, 2)));
853 } else {
854 $parts[$k] = $this->UnumberToChar(substr($v, 1));
855 }
856 } elseif ($alsoStdHtmlEnt && $trans_tbl['&' . $v . ';']) { // Other entities:
857 $parts[$k] = $this->utf8_encode($trans_tbl['&' . $v . ';'], 'iso-8859-1');
858 } else { // No conversion:
859 $parts[$k] = '&' . $v . ';';
860 }
861 }
862 }
863
864 return implode('', $parts);
865 }
866
867 /**
868 * Converts all chars in the input UTF-8 string into integer numbers returned in an array
869 *
870 * @param string Input string, UTF-8
871 * @param boolean If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
872 * @param boolean If set, then instead of integer numbers the real UTF-8 char is returned.
873 * @return array Output array with the char numbers
874 */
875 function utf8_to_numberarray($str, $convEntities = 0, $retChar = 0) {
876 // If entities must be registered as well...:
877 if ($convEntities) {
878 $str = $this->entities_to_utf8($str, 1);
879 }
880 // Do conversion:
881 $strLen = strlen($str);
882 $outArr = array();
883 $buf = '';
884 for ($a = 0; $a < $strLen; $a++) { // Traverse each char in UTF-8 string.
885 $chr = substr($str, $a, 1);
886 $ord = ord($chr);
887 if ($ord > 127) { // This means multibyte! (first byte!)
888 if ($ord & 64) { // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
889 $buf = $chr; // Add first byte
890 for ($b = 0; $b < 8; $b++) { // for each byte in multibyte string...
891 $ord = $ord << 1; // Shift it left and ...
892 if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence.
893 $a++; // Increase pointer...
894 $buf .= substr($str, $a, 1); // ... and add the next char.
895 } else {
896 break;
897 }
898 }
899
900 $outArr[] = $retChar ? $buf : $this->utf8CharToUnumber($buf);
901 } else {
902 $outArr[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
903 } // No char exists (MIDDLE of MB sequence!)
904 } else {
905 $outArr[] = $retChar ? chr($ord) : $ord;
906 } // ... otherwise it's just ASCII 0-127 and one byte. Transparent
907 }
908
909 return $outArr;
910 }
911
/**
 * Encodes a Unicode code point as a UTF-8 byte sequence.
 * Algorithm based on script found at http://czyborra.com/utf/
 *
 * The bits of the code point are spread over the bytes; the number of
 * leading one-bits in the first byte tells how many bytes the sequence has:
 *
 *  bytes | bits | representation
 *      1 |    7 | 0vvvvvvv
 *      2 |   11 | 110vvvvv 10vvvvvv
 *      3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
 *      4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * @param integer $cbyte Unicode code point
 * @return string UTF-8 byte sequence; the "no char" byte if the value needs more than 31 bits
 * @see utf8CharToUnumber()
 */
function UnumberToChar($cbyte) {
		// One byte: the value is the byte itself
	if ($cbyte < 0x80) {
		return chr($cbyte);
	}

		// Pick the lead-byte marker and the number of continuation bytes for the value range
	if ($cbyte < 0x800) {
		$leadMarker = 0xC0;
		$trailCount = 1;
	} elseif ($cbyte < 0x10000) {
		$leadMarker = 0xE0;
		$trailCount = 2;
	} elseif ($cbyte < 0x200000) {
		$leadMarker = 0xF0;
		$trailCount = 3;
	} elseif ($cbyte < 0x4000000) {
		$leadMarker = 0xF8;
		$trailCount = 4;
	} elseif ($cbyte < 0x80000000) {
		$leadMarker = 0xFC;
		$trailCount = 5;
	} else {
			// Cannot express a 32-bit character in UTF-8
		return chr($this->noCharByteVal);
	}

		// Lead byte carries the topmost bits, each continuation byte the next six
	$char = chr($leadMarker | ($cbyte >> (6 * $trailCount)));
	for ($shift = 6 * ($trailCount - 1); $shift >= 0; $shift -= 6) {
		$char .= chr(0x80 | (($cbyte >> $shift) & 0x3F));
	}

	return $char;
}
976
/**
 * Decodes one UTF-8 character into its Unicode code point.
 *
 * Only the first character of $str is looked at. A first byte that does not
 * have its two top bits set is returned as its plain byte value.
 *
 * @param string $str UTF-8 multibyte character string
 * @param boolean $hex If set, the number is returned as "x" followed by its hex digits
 * @return mixed Unicode number, or a string like "x20ac" when $hex is set
 * @see UnumberToChar()
 */
function utf8CharToUnumber($str, $hex = 0) {
	$lead = ord(substr($str, 0, 1));

	if (($lead & 192) != 192) {
			// Not a multibyte lead byte: the byte value is the result
		$number = $lead;
	} else {
			// Collect six payload bits (as binary digits) per announced continuation byte
		$trailBits = '';
		$trailCount = 0;
		$probe = $lead;
		while ($trailCount < 8) {
			$probe = $probe << 1;
			if (!($probe & 128)) {
				break;
			}
			$trailBits .= sprintf('%06b', ord(substr($str, $trailCount + 1, 1)) & 0x3F);
			$trailCount++;
		}

			// The lead byte contributes its lowest (6 - number of continuation bytes) bits
		$leadBits = substr('00000000' . decbin($lead), -(6 - $trailCount));
		$number = bindec($leadBits . $trailBits);
	}

	return $hex ? 'x' . dechex($number) : $number;
}
1008
1009
1010 /********************************************
1011 *
1012 * Init functions
1013 *
1014 ********************************************/
1015
/**
 * Loads the conversion table of a charset from PATH_t3lib . 'csconvtbl/' into
 * $this->parsedCharsets[$charset] ('local' => byte value to UTF-8 char,
 * 'utf8' => UTF-8 char to byte value). Only byte values above 127 are stored.
 * This function is automatically called by the conversion functions.
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param string $charset The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return integer Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and loaded (from cache or by parsing).
 * @access private
 */
function initCharset($charset) {
		// Only process if the charset is not yet loaded (isset() avoids an undefined index on first use)
	if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
		return 1;
	}

		// Conversion table filename
	$charsetConvTableFile = PATH_t3lib . 'csconvtbl/' . $charset . '.tbl';
	if (!$charset || !t3lib_div::validPathStr($charsetConvTableFile) || !@is_file($charsetConvTableFile)) {
		return FALSE;
	}

		// Cache file for charsets:
		// Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_' . $charset . '.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
			// NOTE(review): unserialize() of a file from typo3temp/ - assumes that directory is only writable by trusted code
		$cachedTable = unserialize(t3lib_div::getUrl($cacheFile));
		if (is_array($cachedTable)) {
			$this->parsedCharsets[$charset] = $cachedTable;
			return 2;
		}
			// Unreadable or corrupt cache: fall through, re-parse the table and rewrite the cache
	}

		// The "whitespaced" type is on the syntax "0x0A	0x000A	#LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
	$whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';

	$this->parsedCharsets[$charset] = array('local' => array(), 'utf8' => array());
	$detectedType = '';
	$lines = t3lib_div::trimExplode(LF, t3lib_div::getUrl($charsetConvTableFile), 1);
	foreach ($lines as $value) {
			// Comment lines and blanks are ignored
		if (!trim($value) || substr($value, 0, 1) == '#') {
			continue;
		}

			// The file format is detected once, on the first real line
		if (!$detectedType) {
			$detectedType = preg_match($whitespacedPattern, $value) ? 'whitespaced' : 'ms-token';
		}

		if ($detectedType == 'ms-token') {
				// Pad so that a line without "=" does not read an undefined offset
			list($hexbyte, $utf8) = array_pad(preg_split('/[=:]/', $value, 3), 2, '');
		} else {
			$regA = array();
			if (!preg_match($whitespacedPattern, $value, $regA)) {
					// Line does not follow the detected format: nothing to register
				continue;
			}
			$hexbyte = $regA[1];
			$utf8 = 'U+' . $regA[2];
		}

		$decval = hexdec(trim($hexbyte));
		if ($decval > 127) {
				// Strip the leading "U+" before converting the code point
			$utf8decval = hexdec(substr(trim($utf8), 2));
			$utf8Char = $this->UnumberToChar($utf8decval);
			$this->parsedCharsets[$charset]['local'][$decval] = $utf8Char;
			$this->parsedCharsets[$charset]['utf8'][$utf8Char] = $decval;
		}
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->parsedCharsets[$charset]));
	}

	return 2;
}
1084
/**
 * This function initializes all UTF-8 character data tables:
 * $this->caseFolding['utf-8'] (toUpper / toLower / toTitle) and
 * $this->toASCII['utf-8'] (transliteration to ASCII).
 *
 * With $mode given, an already loaded or cached table of that kind is used.
 * Otherwise BOTH tables are rebuilt from the files in PATH_t3lib . 'unidata/'
 * (UnicodeData.txt, and if present SpecialCasing.txt and Translit.txt) and
 * written to the cache. Only the Basic Multilingual Plane is processed.
 *
 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
 *
 * @param string $mode Mode ("case", "ascii", ...)
 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initUnicodeData($mode = NULL) {
		// cache files
	$cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
	$cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');

		// Only process if the tables are not yet loaded
	switch ($mode) {
		case 'case':
			if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8'])) {
				return 1;
			}

				// Use cached version if possible; a cache that does not unserialize to an array is ignored and rebuilt
			if ($cacheFileCase && @is_file($cacheFileCase)) {
				$cachedTable = unserialize(t3lib_div::getUrl($cacheFileCase));
				if (is_array($cachedTable)) {
					$this->caseFolding['utf-8'] = $cachedTable;
					return 2;
				}
			}
			break;

		case 'ascii':
			if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8'])) {
				return 1;
			}

				// Use cached version if possible; a cache that does not unserialize to an array is ignored and rebuilt
			if ($cacheFileASCII && @is_file($cacheFileASCII)) {
				$cachedTable = unserialize(t3lib_div::getUrl($cacheFileASCII));
				if (is_array($cachedTable)) {
					$this->toASCII['utf-8'] = $cachedTable;
					return 2;
				}
			}
			break;
	}

		// process main Unicode data file
	$unicodeDataFile = PATH_t3lib . 'unidata/UnicodeData.txt';
	if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
		return FALSE;
	}

	$fh = fopen($unicodeDataFile, 'rb');
	if (!$fh) {
		return FALSE;
	}

		// key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
		// note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
	$this->caseFolding['utf-8'] = array();
	$utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand
	$utf8CaseFolding['toUpper'] = array();
	$utf8CaseFolding['toLower'] = array();
	$utf8CaseFolding['toTitle'] = array();

	$decomposition = array(); // array of temp. decompositions
	$mark = array(); // array of chars that are marks (eg. composing accents)
	$number = array(); // array of chars that are numbers (eg. digits)
	$omit = array(); // array of chars to be omitted (eg. Russian hard sign)

		// Read until fgets() reports the end; testing feof() first would process one bogus FALSE line
	while (($line = fgets($fh, 4096)) !== FALSE) {
			// has a lot of info; pad so that short or blank lines do not read undefined offsets
		list($char, $name, $cat, , , $decomp, , , $num, , , , $upper, $lower, $title) = array_pad(explode(';', rtrim($line)), 15, '');

		if ($char === '') {
				// Blank line: carries no data
			continue;
		}

		$ord = hexdec($char);
		if ($ord > 0xFFFF) {
			break;
		} // only process the BMP

		$utf8_char = $this->UnumberToChar($ord);

		if ($upper) {
			$utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
		}
		if ($lower) {
			$utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
		}
			// store "title" only when different from "upper" (only a few)
		if ($title && $title != $upper) {
			$utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
		}

			// First letter of the general category (substr() is safe on an empty field)
		switch (substr($cat, 0, 1)) {
			case 'M': // mark (accent, umlaut, ...)
				$mark["U+$char"] = 1;
				break;

			case 'N': // numeric value
				if ($ord > 0x80 && $num != '') {
					$number["U+$char"] = $num;
				}
				break;
		}

			// accented Latin letters without "official" decomposition
		$match = array();
		if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
			$c = ord($match[2]);
			if ($match[1] == 'SMALL') {
				$c += 32;
			}

			$decomposition["U+$char"] = array(dechex($c));
			continue;
		}

		$match = array();
		if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
			switch ($match[1]) {
				case '<circle>': // add parenthesis as circle replacement, eg (1)
					$match[2] = '0028 ' . $match[2] . ' 0029';
					break;

				case '<square>': // add square brackets as square replacement, eg [1]
					$match[2] = '005B ' . $match[2] . ' 005D';
					break;

				case '<compat>': // ignore multi char decompositions that start with a space
					if (preg_match('/^0020 /', $match[2])) {
						continue 2;
					}
					break;

					// ignore Arabic and vertical layout presentation decomposition
				case '<initial>':
				case '<medial>':
				case '<final>':
				case '<isolated>':
				case '<vertical>':
					continue 2;
			}
			$decomposition["U+$char"] = explode(' ', $match[2]);
		}
	}
	fclose($fh);

		// process additional Unicode data for casing (allow folded characters to expand into a sequence)
	$specialCasingFile = PATH_t3lib . 'unidata/SpecialCasing.txt';
	if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
		$fh = fopen($specialCasingFile, 'rb');
		if ($fh) {
			while (($line = fgets($fh, 4096)) !== FALSE) {
					// Skip blank lines and lines starting with "#"
				if (trim($line) == '' || $line[0] == '#') {
					continue;
				}

				list($char, $lower, $title, $upper, $cond) = array_pad(t3lib_div::trimExplode(';', $line), 5, '');
					// Only unconditional mappings are used (5th field empty or already the trailing comment)
				if ($cond != '' && $cond[0] != '#') {
					continue;
				}

				$utf8_char = $this->UnumberToChar(hexdec($char));
				$mappings = array('toLower' => $lower, 'toTitle' => $title, 'toUpper' => $upper);
				foreach ($mappings as $direction => $sequence) {
						// Identity mappings are not stored; "title" only when different from "upper"
					if ($char == $sequence || ($direction == 'toTitle' && $title == $upper)) {
						continue;
					}
					$utf8Sequence = '';
					foreach (explode(' ', $sequence) as $hexCode) {
						$utf8Sequence .= $this->UnumberToChar(hexdec($hexCode));
					}
					$utf8CaseFolding[$direction][$utf8_char] = $utf8Sequence;
				}
			}
			fclose($fh);
		}
	}

		// process custom decompositions
	$customTranslitFile = PATH_t3lib . 'unidata/Translit.txt';
	if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
		$fh = fopen($customTranslitFile, 'rb');
		if ($fh) {
			while (($line = fgets($fh, 4096)) !== FALSE) {
				if (trim($line) == '' || $line[0] == '#') {
					continue;
				}

				list($char, $translit) = array_pad(t3lib_div::trimExplode(';', $line), 2, '');
				if (!$translit) {
					$omit["U+$char"] = 1;
				}
				$decomposition["U+$char"] = explode(' ', $translit);
			}
			fclose($fh);
		}
	}

		// decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
	foreach ($decomposition as $from => $to) {
		$code_decomp = array();

			// Note: the loop deliberately stops at the first empty/falsy code value
		while ($code_value = array_shift($to)) {
			if (isset($decomposition["U+$code_value"])) { // do recursive decomposition
				foreach (array_reverse($decomposition["U+$code_value"]) as $cv) {
					array_unshift($to, $cv);
				}
			} elseif (!isset($mark["U+$code_value"])) { // remove mark
				array_push($code_decomp, $code_value);
			}
		}
		if (count($code_decomp) || isset($omit[$from])) {
			$decomposition[$from] = $code_decomp;
		} else {
			unset($decomposition[$from]);
		}
	}

		// create ascii only mapping
	$this->toASCII['utf-8'] = array();
	$ascii =& $this->toASCII['utf-8'];

	foreach ($decomposition as $from => $to) {
		$code_decomp = array();
		while ($code_value = array_shift($to)) {
			$ord = hexdec($code_value);
			if ($ord > 127) {
				continue 2;
			} // skip decompositions containing non-ASCII chars
			array_push($code_decomp, chr($ord));
		}
			// $from is "U+XXXX": drop the prefix so hexdec() only sees hex digits
		$ascii[$this->UnumberToChar(hexdec(substr($from, 2)))] = implode('', $code_decomp);
	}

		// add numeric decompositions
	foreach ($number as $from => $to) {
		$utf8_char = $this->UnumberToChar(hexdec(substr($from, 2)));
		if (!isset($ascii[$utf8_char])) {
			$ascii[$utf8_char] = $to;
		}
	}

	if ($cacheFileCase) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
	}

	if ($cacheFileASCII) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
	}

	return 3;
}
1337
/**
 * This function initializes the folding table for a charset other than UTF-8.
 * The table is derived from the UTF-8 case folding table: every character of
 * the charset's conversion table is looked up there and the result converted
 * back to the charset. The ASCII letters a-z / A-Z are added at the end.
 * This function is automatically called by the case folding functions.
 *
 * @param string $charset Charset for which to initialize case folding.
 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initCaseFolding($charset) {
		// Only process if the case table is not yet loaded
	if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset])) {
		return 1;
	}

		// Use cached version if possible; a cache that does not unserialize to an array is ignored and rebuilt
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_' . $charset . '.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$cachedTable = unserialize(t3lib_div::getUrl($cacheFile));
		if (is_array($cachedTable)) {
			$this->caseFolding[$charset] = $cachedTable;
			return 2;
		}
	}

		// init UTF-8 conversion for this charset
	if (!$this->initCharset($charset)) {
		return FALSE;
	}

		// UTF-8 case folding is used as the base conversion table
	if (!$this->initUnicodeData('case')) {
		return FALSE;
	}

	$nochar = chr($this->noCharByteVal);
	$directions = array('toUpper', 'toLower', 'toTitle');
	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
			// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);

		foreach ($directions as $direction) {
				// Characters without a mapping in this direction are skipped (avoids undefined-index reads)
			if (!isset($this->caseFolding['utf-8'][$direction][$utf8])) {
				continue;
			}
			$cc = $this->utf8_decode($this->caseFolding['utf-8'][$direction][$utf8], $charset);
				// Keep the mapping only if the target exists in this charset
			if ($cc != '' && $cc != $nochar) {
				$this->caseFolding[$charset][$direction][$c] = $cc;
			}
		}
	}

		// add the ASCII case table
	for ($i = ord('a'); $i <= ord('z'); $i++) {
		$this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i - 32);
	}
	for ($i = ord('A'); $i <= ord('Z'); $i++) {
		$this->caseFolding[$charset]['toLower'][chr($i)] = chr($i + 32);
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->caseFolding[$charset]));
	}

	return 3;
}
1407
/**
 * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
 * The table is derived from the UTF-8 to-ASCII table for every character of
 * the charset's conversion table.
 * This function is automatically called by the ASCII transliteration functions.
 *
 * @param string $charset Charset for which to initialize conversion.
 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initToASCII($charset) {
		// Only process if the table is not yet loaded
	if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) {
		return 1;
	}

		// Use cached version if possible; a cache that does not unserialize to an array is ignored and rebuilt
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_' . $charset . '.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$cachedTable = unserialize(t3lib_div::getUrl($cacheFile));
		if (is_array($cachedTable)) {
			$this->toASCII[$charset] = $cachedTable;
			return 2;
		}
	}

		// init UTF-8 conversion for this charset
	if (!$this->initCharset($charset)) {
		return FALSE;
	}

		// UTF-8/ASCII transliteration is used as the base conversion table
	if (!$this->initUnicodeData('ascii')) {
		return FALSE;
	}

		// Start from an empty table so that a charset without any transliterable
		// character is still registered as loaded and cached as an array
	$this->toASCII[$charset] = array();
	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
		if (!isset($this->toASCII['utf-8'][$utf8])) {
			continue;
		}
			// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);
		$this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->toASCII[$charset]));
	}

	return 3;
}
1455
1456
1457 /********************************************
1458 *
1459 * String operation functions
1460 *
1461 ********************************************/
1462
/**
 * Returns a part of a string.
 *
 * Depending on $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] the work
 * is done by mbstring or iconv; otherwise by the internal UTF-8 / EUC
 * functions or by byte arithmetic for fixed-width charsets.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param integer $start Start position (character position)
 * @param integer $len Length (in characters); NULL means "up to the end of the string"
 * @return string The substring
 * @see substr(), mb_substr()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function substr($charset, $string, $start, $len = NULL) {
	if ($len === 0 || $string === '') {
		return '';
	}

	$utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];

	if ($utils == 'mbstring') {
			// cannot omit $len, when specifying charset
		if ($len == NULL) {
			$enc = mb_internal_encoding(); // save internal encoding
			mb_internal_encoding($charset);
			$str = mb_substr($string, $start);
			mb_internal_encoding($enc); // restore internal encoding

			return $str;
		}
		return mb_substr($string, $start, $len, $charset);
	}

	if ($utils == 'iconv') {
			// cannot omit $len, when specifying charset
		if ($len == NULL) {
			$enc = iconv_get_encoding('internal_encoding'); // save internal encoding
			iconv_set_encoding('internal_encoding', $charset);
			$str = iconv_substr($string, $start);
			iconv_set_encoding('internal_encoding', $enc); // restore internal encoding

			return $str;
		}
		return iconv_substr($string, $start, $len, $charset);
	}

	if ($charset == 'utf-8') {
		return $this->utf8_substr($string, $start, $len);
	}
	if (!empty($this->eucBasedSets[$charset])) {
		return $this->euc_substr($string, $start, $charset, $len);
	}

		// Fixed-width charsets: character positions translate to byte positions by a constant factor
	$bytesPerChar = 1;
	if (!empty($this->twoByteSets[$charset])) {
		$bytesPerChar = 2;
	} elseif (!empty($this->fourByteSets[$charset])) {
		$bytesPerChar = 4;
	}

		// An omitted length must not be multiplied: NULL * 2 is 0, which would yield an empty string
	if ($len === NULL) {
		return substr($string, $start * $bytesPerChar);
	}
	return substr($string, $start * $bytesPerChar, $len * $bytesPerChar);
}
1519
/**
 * Counts the number of characters.
 *
 * Delegates to mbstring or iconv when configured in
 * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'], otherwise to the
 * internal UTF-8 / EUC functions. For two- and four-byte charsets the byte
 * length is divided by 2 resp. 4; everything else counts as single-byte.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @return integer The number of characters
 * @see strlen()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strlen($charset, $string) {
	$backend = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];

	if ($backend == 'mbstring') {
		return mb_strlen($string, $charset);
	}
	if ($backend == 'iconv') {
		return iconv_strlen($string, $charset);
	}
	if ($charset == 'utf-8') {
		return $this->utf8_strlen($string);
	}
	if ($this->eucBasedSets[$charset]) {
		return $this->euc_strlen($string, $charset);
	}

	$byteLength = strlen($string);
	if ($this->twoByteSets[$charset]) {
		return $byteLength / 2;
	}
	if ($this->fourByteSets[$charset]) {
		return $byteLength / 4;
	}

		// treat everything else as single-byte encoding
	return $byteLength;
}
1547
/**
 * Crops a string with the mbstring functions.
 *
 * A positive $len keeps the first $len characters and appends $crop, a
 * negative $len keeps the last abs($len) characters and prepends $crop.
 * The string is returned unchanged if $len is 0 or the string is not
 * longer than abs($len).
 *
 * @param string $charset The character set
 * @param string $string String to be cropped
 * @param integer $len Crop length (in characters)
 * @param string $crop Crop signifier
 * @return string The shortened string
 * @see mb_strlen(), mb_substr()
 */
protected function cropMbstring($charset, $string, $len, $crop = '') {
		// A zero length means "do not crop"; checked first so that no mbstring call is made
	if (intval($len) === 0) {
		return $string;
	}

	$charCount = mb_strlen($string, $charset);
	if ($charCount <= abs($len)) {
		return $string;
	}

	return $len > 0
		? mb_substr($string, 0, $len, $charset) . $crop
		: $crop . mb_substr($string, $len, $charCount, $charset);
}
1571
/**
 * Truncates a string and pre-/appends a string.
 *
 * A positive $len keeps the first $len characters and appends $crop, a
 * negative $len keeps the last abs($len) characters and prepends $crop.
 * The string is returned unchanged if $len is 0 or reaches beyond the string.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param integer $len Length (in characters)
 * @param string $crop Crop signifier
 * @return string The shortened string
 * @see substr(), mb_strimwidth()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function crop($charset, $string, $len, $crop = '') {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return $this->cropMbstring($charset, $string, $len, $crop);
	}

	if (intval($len) == 0) {
		return $string;
	}

		// Translate the character position into the byte position $i of the cut
	if ($charset == 'utf-8') {
		$i = $this->utf8_char2byte_pos($string, $len);
	} elseif (!empty($this->eucBasedSets[$charset])) {
		$i = $this->euc_char2byte_pos($string, $len, $charset);
	} elseif ($len > 0) {
		$i = $len;
	} else {
		$i = strlen($string) + $len;
		if ($i <= 0) {
			$i = FALSE;
		}
	}

	if ($i === FALSE) { // $len outside actual string length
		return $string;
	}

	$byteLength = strlen($string);
	if ($len > 0) {
			// Crop only if there is at least one byte at the cut position, i.e. something is cut off
		if ($i >= 0 && $i < $byteLength) {
			return substr($string, 0, $i) . $crop;
		}
	} elseif ($i >= 1 && $i <= $byteLength) {
			// Crop only if there is at least one byte before the cut position
		return $crop . substr($string, $i);
	}

	return $string;
}
1634
/**
 * Cuts a string short at a given byte length without splitting a character.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param integer $len The byte length
 * @return string The shortened string (empty if $len is not positive)
 * @see mb_strcut()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strtrunc($charset, $string, $len) {
	if ($len <= 0) {
		return '';
	}

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strcut($string, 0, $len, $charset);
	}
	if ($charset == 'utf-8') {
		return $this->utf8_strtrunc($string, $len);
	}
	if ($this->eucBasedSets[$charset]) {
		return $this->euc_strtrunc($string, $len, $charset);
	}

	if ($this->twoByteSets[$charset]) {
			// don't cut at odd positions
		$len -= $len % 2;
	} elseif ($this->fourByteSets[$charset]) {
			// realign to position dividable by four
		$len -= $len % 4;
	}

		// everything else is treated as single-byte encoding
	return substr($string, 0, $len);
}
1667
/**
 * Translates all characters of a string into their respective case values.
 * Unlike strtolower() and strtoupper() this method is locale independent.
 * Note that the string length may change!
 * eg. lower case German "ß" (sharp S) becomes upper case "SS"
 * Real case folding is language dependent, this method ignores this fact.
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert case for
 * @param string $case Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
 * @return string The converted string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper()
 */
function conv_case($charset, $string, $case) {
		// mbstring does the whole job when configured
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return $case == 'toLower'
			? mb_strtolower($string, $charset)
			: mb_strtoupper($string, $charset);
	}

	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string, 'case', $case);
	}

	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string, $charset, 'case', $case);
	}

		// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string, $charset, 'case', $case);
}
1701
/**
 * Converts special chars (like æøåÆØÅ, umlauts etc) to ascii equivalents (usually double-bytes, like æ => ae etc.)
 * The mapping function is chosen by charset: UTF-8, EUC based, or single-byte for everything else.
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert
 * @return string The converted string
 */
function specCharsToASCII($charset, $string) {
	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string, 'ascii');
	}

	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string, $charset, 'ascii');
	}

		// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string, $charset, 'ascii');
}
1721
1722
/**
 * Converts the language codes that we get from the client (usually HTTP_ACCEPT_LANGUAGE)
 * into a TYPO3-readable language code.
 *
 * Language ranges are tried by descending quality value; ranges with the same
 * quality keep the order in which the client sent them. Ranges with a quality
 * of 0 ("not acceptable") are ignored. If a range with country part is not
 * known, the part before the first "-" is tried as well.
 *
 * @param string $languageCodesList List of language codes. something like 'de,en-us;q=0.9,de-de;q=0.7,es-cl;q=0.6,en;q=0.4,es;q=0.3,zh;q=0.1'
 *		 see http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4
 * @return string A preferred language that TYPO3 supports, or "default" if none found
 * @author Benjamin Mack (benni.typo3.org)
 */
public function getPreferredClientLanguage($languageCodesList) {
	$allLanguageCodes = array();
	$selectedLanguage = 'default';

		// get all languages where TYPO3 code is the same as the ISO code
	foreach ($this->charSetArray as $typo3Lang => $charSet) {
		$allLanguageCodes[$typo3Lang] = $typo3Lang;
	}

		// get all languages where TYPO3 code differs from ISO code
		// or needs the country part
		// the iso codes will here overwrite the default typo3 language in the key
	foreach ($this->isoArray as $typo3Lang => $isoLang) {
		$allLanguageCodes[$typo3Lang] = str_replace('_', '-', $isoLang);
	}

		// move the iso codes to the keys (because we're comparing the keys with "isset" later on)
	$allLanguageCodes = array_flip($allLanguageCodes);

		// Split every range into language and quality; a repeated language keeps
		// its first position but takes the last quality given
	$qualityByLanguage = array();
	foreach (t3lib_div::trimExplode(',', $languageCodesList) as $languageRange) {
			// Whitespace around ";" and "=" is tolerated, e.g. "en; q=0.8"
		$parts = preg_split('/\s*;\s*q\s*=\s*/i', $languageRange, 2);
		$qualityByLanguage[$parts[0]] = isset($parts[1]) ? $parts[1] : 1.0;
	}

	$languages = array();
	$qualities = array();
	$positions = array();
	foreach ($qualityByLanguage as $language => $quality) {
			// A quality of 0 marks the language as not acceptable
		if (is_numeric($quality) && $quality <= 0) {
			continue;
		}
		$languages[] = (string) $language;
		$qualities[] = (float) $quality;
		$positions[] = count($positions);
	}

		// Highest quality first; the original position breaks ties so the client's order is kept
	if (count($languages)) {
		array_multisort($qualities, SORT_DESC, SORT_NUMERIC, $positions, SORT_ASC, SORT_NUMERIC, $languages);
	}

	foreach ($languages as $preferredLanguage) {
		if (isset($allLanguageCodes[$preferredLanguage])) {
			$selectedLanguage = $allLanguageCodes[$preferredLanguage];
			break;
		}

			// strip the country code from the end
		list($primaryLanguage) = explode('-', $preferredLanguage);
		if (isset($allLanguageCodes[$primaryLanguage])) {
			$selectedLanguage = $allLanguageCodes[$primaryLanguage];
			break;
		}
	}

	if (!$selectedLanguage || $selectedLanguage == 'en') {
		$selectedLanguage = 'default';
	}

	return $selectedLanguage;
}
1783
1784
1785 /********************************************
1786 *
1787 * Internal string operation functions
1788 *
1789 ********************************************/
1790
/**
 * Maps all characters of a string in a single byte charset.
 * Bytes without an entry in the mapping table are copied unchanged. The
 * string is returned untouched if the table cannot be initialized or $mode
 * is unknown.
 *
 * @param string $str The string
 * @param string $charset The charset
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function sb_char_mapping($str, $charset, $mode, $opt = '') {
	switch ($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) {
				return $str;
			} // do nothing
				// Read-only lookup: an unknown $opt must not create an entry in the table
			$map = isset($this->caseFolding[$charset][$opt]) ? $this->caseFolding[$charset][$opt] : array();
			break;

		case 'ascii':
			if (!$this->initToASCII($charset)) {
				return $str;
			} // do nothing
			$map = isset($this->toASCII[$charset]) ? $this->toASCII[$charset] : array();
			break;

		default:
			return $str;
	}

	if (!is_array($map)) {
		$map = array();
	}

	$str = (string) $str;
	$byteLength = strlen($str);
	$out = '';
	for ($i = 0; $i < $byteLength; $i++) {
		$c = $str[$i];
		$out .= isset($map[$c]) ? $map[$c] : $c;
	}

	return $out;
}
1833
1834
1835 /********************************************
1836 *
1837 * Internal UTF-8 string operation functions
1838 *
1839 ********************************************/
1840
/**
 * Returns a part of a UTF-8 string.
 * Character positions are translated to byte positions with utf8_char2byte_pos().
 *
 * @param string $str UTF-8 string
 * @param integer $start Start position (character position)
 * @param integer $len Length (in characters); NULL means "up to the end of the string"
 * @return string The substring; FALSE if a positive $start lies outside the string
 * @see substr()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_substr($str, $start, $len = NULL) {
		// A length of zero (0 or "0") always gives an empty string
	if ($len !== NULL && !strcmp($len, '0')) {
		return '';
	}

	$byte_start = $this->utf8_char2byte_pos($str, $start);
	if ($byte_start === FALSE) {
		if ($start > 0) {
			return FALSE; // $start outside string length
		}
			// Negative $start reaching before the string: begin at the first byte
		$byte_start = 0;
	}

	$str = substr($str, $byte_start);

		// No length given: everything from the start position
	if ($len == NULL) {
		return $str;
	}

	$byte_end = $this->utf8_char2byte_pos($str, $len);
	if ($byte_end === FALSE) { // $len outside actual string length
			// When length is less than zero and exceeds, then we return blank string.
		return $len < 0 ? '' : $str;
	}

	return substr($str, 0, $byte_end);
}
1883
/**
 * Counts the number of characters of a string in UTF-8.
 * Intended to behave like strlen() / mb_strlen().
 *
 * @param string $str UTF-8 multibyte character string
 * @return integer The number of characters
 * @see strlen()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strlen($str) {
	$str = (string) $str;
	$byteCount = strlen($str);
	$n = 0;
	for ($i = 0; $i < $byteCount; $i++) {
		// Single-byte characters (0xxxxxxx) and multi-byte start bytes (11xxxxxx)
		// each begin a character; continuation bytes (10xxxxxx) do not.
		if ((ord($str[$i]) & 0xC0) != 0x80) {
			$n++;
		}
	}
	return $n;
}
1908
/**
 * Truncates a string in UTF-8 short at a given byte length.
 *
 * The result never ends in the middle of a multi-byte character: if the
 * character at the cut position does not fit completely into $len bytes,
 * it is dropped as a whole.
 *
 * @param string $str UTF-8 multibyte character string
 * @param integer $len The byte length; values of zero or less yield an empty string
 * @return string The shortened string
 * @see mb_strcut()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strtrunc($str, $len) {
	$str = (string) $str;
	$len = (int) $len;
	if ($len <= 0) {
		return '';
	}
	if ($len > strlen($str)) {
		return $str; // nothing to cut
	}

	$i = $len - 1;
	if (ord($str[$i]) & 0x80) { // part of a multibyte sequence
		// Walk back to the first byte of the sequence (bit 6 set).
		while ($i > 0 && !(ord($str[$i]) & 0x40)) {
			$i--;
		}
		$lead = ord($str[$i]);
		if (!($lead & 0x40)) {
			// Sanity check: reached the start of the string without finding a
			// start byte. The original tested "$i <= 0" here, which also threw
			// away a perfectly valid multi-byte character starting at byte 0.
			return '';
		}
		// The number of leading 1-bits of the start byte is the length of the sequence.
		for ($bc = 0; $lead & 0x80; $lead = $lead << 1) {
			$bc++;
		}
		if ($bc + $i > $len) {
			return substr($str, 0, $i); // character does not fit: cut before it
		}
		// fallthru: multibyte char fits into length
	}
	return substr($str, 0, $len);
}
1933
/**
 * Find position of first occurrence of a string, both arguments are in UTF-8.
 *
 * Delegates to mb_strpos() or iconv_strpos() when
 * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] is set to 'mbstring'
 * or 'iconv'; otherwise a byte-wise search is done and the byte position is
 * translated back into a character position.
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 string to search for
 * @param integer $offset Position to start the search (character position)
 * @return integer The character position, or FALSE if the needle was not found or $offset lies beyond the string
 * @see strpos()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strpos($haystack, $needle, $offset = 0) {
	// Read the setting once and guard it with isset() so that an unconfigured
	// environment does not trigger an undefined-index notice/warning.
	$utils = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'])
		? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']
		: '';

	if ($utils == 'mbstring') {
		return mb_strpos($haystack, $needle, $offset, 'utf-8');
	} elseif ($utils == 'iconv') {
		return iconv_strpos($haystack, $needle, $offset, 'utf-8');
	}

	$byte_offset = $this->utf8_char2byte_pos($haystack, $offset);
	if ($byte_offset === FALSE) {
		return FALSE; // offset beyond string length
	}

	$byte_pos = strpos($haystack, $needle, $byte_offset);
	if ($byte_pos === FALSE) {
		return FALSE; // needle not found
	}

	return $this->utf8_byte2char_pos($haystack, $byte_pos);
}
1963
/**
 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
 *
 * Delegates to mb_strrpos() or iconv_strrpos() when
 * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] is set to 'mbstring'
 * or 'iconv'; otherwise a byte-wise search is done and the byte position is
 * translated back into a character position.
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 character to search for (single character)
 * @return integer The character position, or FALSE if the needle was not found
 * @see strrpos()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strrpos($haystack, $needle) {
	// Read the setting once and guard it with isset() so that an unconfigured
	// environment does not trigger an undefined-index notice/warning.
	$utils = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'])
		? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']
		: '';

	if ($utils == 'mbstring') {
		// The third argument of mb_strrpos() is the offset; the encoding is the
		// fourth. Passing the encoding in third position is rejected by PHP 8.
		return mb_strrpos($haystack, $needle, 0, 'utf-8');
	} elseif ($utils == 'iconv') {
		return iconv_strrpos($haystack, $needle, 'utf-8');
	}

	$byte_pos = strrpos($haystack, $needle);
	if ($byte_pos === FALSE) {
		return FALSE; // needle not found
	}

	return $this->utf8_byte2char_pos($haystack, $byte_pos);
}
1987
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * For a non-negative $pos the string is scanned from the start, for a
 * negative $pos from the end. FALSE is returned when the scan leaves the
 * string before arriving at a byte inside it.
 *
 * @param string $str UTF-8 string
 * @param integer $pos Character position (negative values start from the end)
 * @return integer Byte position, or FALSE if the position lies outside the string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char2byte_pos($str, $pos) {
	$str = (string) $str;
	$length = strlen($str);
	$n = 0; // number of characters found
	$p = abs($pos); // number of characters wanted

	if ($pos >= 0) {
		$i = 0;
		$d = 1;
	} else {
		$i = $length - 1;
		$d = -1;
	}

	// Explicit bounds instead of probing "$str{$i}": since PHP 7.1 a negative
	// string offset reads from the end of the string, so the backward scan
	// would no longer stop at the beginning of the string.
	while ($i >= 0 && $i < $length && $n < $p) {
		// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		if ((ord($str[$i]) & 0xC0) != 0x80) {
			$n++;
		}
		$i += $d;
	}
	if ($i < 0 || $i >= $length) {
		return FALSE; // offset beyond string length
	}

	if ($pos >= 0) {
		// skip trailing multi-byte data bytes
		while ($i < $length && (ord($str[$i]) & 0xC0) == 0x80) {
			$i++;
		}
	} else {
		// correct offset: the backward scan stopped one byte before the character
		$i++;
	}

	return $i;
}
2036
/**
 * Translates an 'absolute' byte position into a character position.
 *
 * @param string $str UTF-8 string
 * @param integer $pos Byte position
 * @return integer Character position, or FALSE if $pos does not address a byte inside the string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_byte2char_pos($str, $pos) {
	$str = (string) $str;
	$pos = (int) $pos;

	// Validate the position up front. The original checked byte 0 after the
	// loop (the loop counter has reached 0 by then), so a position beyond the
	// string produced an inflated count instead of FALSE.
	if ($pos < 0 || $pos >= strlen($str)) {
		return FALSE; // offset beyond string length
	}

	$n = 0; // number of characters
	for ($i = $pos; $i > 0; $i--) {
		// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		if ((ord($str[$i]) & 0xC0) != 0x80) {
			$n++;
		}
	}

	return $n;
}
2065
/**
 * Maps all characters of an UTF-8 string.
 *
 * The string is split into single-byte characters and multi-byte sequences;
 * each unit is looked up in the mapping table selected by $mode and copied
 * unchanged if the table has no entry for it.
 *
 * @param string $str UTF-8 string
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt For mode 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string. $str is returned untouched if initUnicodeData() returns a false value or $mode is unknown.
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char_mapping($str, $mode, $opt = '') {
	if (!$this->initUnicodeData($mode)) {
		return $str; // do nothing
	}

	switch ($mode) {
		case 'case':
			$map =& $this->caseFolding['utf-8'][$opt];
			break;

		case 'ascii':
			$map =& $this->toASCII['utf-8'];
			break;

		default:
			return $str;
	}

	$str = (string) $str;
	$byteCount = strlen($str);
	$out = '';
	for ($i = 0; $i < $byteCount; $i++) {
		$c = ord($str[$i]);
		if (($c & 0xC0) == 0xC0) { // multi-byte starting byte (11xxxxxx)
			// The number of leading 1-bits is the number of bytes of the sequence.
			for ($bc = 0; $c & 0x80; $c = $c << 1) {
				$bc++;
			}
			$mbc = substr($str, $i, $bc);
			$i += $bc - 1;
		} else {
			// Single-byte character (0xxxxxxx), or a stray continuation byte
			// (10xxxxxx) in malformed input. The stray byte is handled as a unit
			// of its own; the original left $mbc untouched in that case and
			// emitted the previous character a second time.
			$mbc = $str[$i];
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
2117
2118
2119 /********************************************
2120 *
2121 * Internal EUC string operation functions
2122 *
2123 * Extended Unix Code:
2124 * ASCII compatible 7bit single bytes chars
2125 * 8bit two byte chars
2126 *
2127 * Shift-JIS is treated as a special case.
2128 *
2129 ********************************************/
2130
/**
 * Cuts a string in the EUC charset family short at a given byte length.
 *
 * The result never ends on the first byte of a double-byte character: if
 * such a character straddles the cut position it is dropped as a whole.
 *
 * @param string $str EUC multibyte character string
 * @param integer $len The byte length; values of zero or less yield an empty string
 * @param string $charset The charset ('shift_jis' is treated as a special case)
 * @return string The shortened string
 * @see mb_strcut()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strtrunc($str, $len, $charset) {
	$str = (string) $str;
	$len = (int) $len;
	if ($len <= 0) {
		return '';
	}
	if ($len >= strlen($str)) {
		return $str; // string not longer than supplied length
	}

	// From here on $len is smaller than the byte length, so every offset read
	// in the loop is inside the string. Deciding "string is short enough" up
	// front also fixes the case where a double-byte character straddling $len
	// is the last one in the string: the original then returned the whole
	// string, which is longer than $len.
	$sjis = ($charset == 'shift_jis');
	for ($i = 0; $i < $len; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			$isFirstByte = ($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0);
		} else {
			$isFirstByte = ($c >= 0x80);
		}
		if ($isFirstByte) {
			$i++; // advance a double-byte char
		}
	}

	if ($i > $len) {
		return substr($str, 0, $len - 1); // we ended on a first byte
	}
	return substr($str, 0, $len);
}
2166
/**
 * Returns a part of a string in the EUC charset family.
 *
 * @param string $str EUC multibyte character string
 * @param integer $start Start position (character position)
 * @param string $charset The charset
 * @param integer $len Length (in characters); NULL means "up to the end of the string"
 * @return string The substring, or FALSE if $start cannot be resolved to a byte position inside the string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_substr($str, $start, $charset, $len = NULL) {
	// An explicit length of zero (integer 0 or string '0') yields an empty
	// string, as in utf8_substr(). Without this check a zero length falls
	// through the loose "!= NULL" test below and the whole rest is returned.
	if ($len !== NULL && !strcmp($len, '0')) {
		return '';
	}

	$byte_start = $this->euc_char2byte_pos($str, $start, $charset);
	if ($byte_start === FALSE) {
		return FALSE; // $start outside string length
	}

	$str = substr($str, $byte_start);

	if ($len != NULL) {
		$byte_end = $this->euc_char2byte_pos($str, $len, $charset);
		if ($byte_end === FALSE) {
			// $len outside actual string length: a negative length that exceeds
			// the string yields a blank string (as in utf8_substr()), a positive
			// one yields the whole rest.
			return $len < 0 ? '' : $str;
		}
		return substr($str, 0, $byte_end);
	}

	return $str;
}
2200
/**
 * Counts the number of characters of a string in the EUC charset family.
 *
 * @param string $str EUC multibyte character string
 * @param string $charset The charset ('shift_jis' is treated as a special case)
 * @return integer The number of characters
 * @see strlen()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strlen($str, $charset) {
	$str = (string) $str;
	$byteCount = strlen($str);
	$sjis = ($charset == 'shift_jis');
	$n = 0;
	for ($i = 0; $i < $byteCount; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			$isFirstByte = ($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0);
		} else {
			$isFirstByte = ($c >= 0x80);
		}
		if ($isFirstByte) {
			$i++; // advance a double-byte char
		}

		$n++;
	}

	return $n;
}
2231
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * For a non-negative $pos the string is scanned from the start, for a
 * negative $pos from the end. FALSE is returned when the scan leaves the
 * string before arriving at a byte inside it.
 *
 * @param string $str EUC multibyte character string
 * @param integer $pos Character position (negative values start from the end)
 * @param string $charset The charset ('shift_jis' is treated as a special case)
 * @return integer Byte position, or FALSE if the position lies outside the string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char2byte_pos($str, $pos, $charset) {
	$str = (string) $str;
	$length = strlen($str);
	$sjis = ($charset == 'shift_jis');
	$n = 0; // number of characters seen
	$p = abs($pos); // number of characters wanted

	if ($pos >= 0) {
		$i = 0;
		$d = 1;
	} else {
		$i = $length - 1;
		$d = -1;
	}

	// Explicit bounds instead of probing "$str{$i}": since PHP 7.1 a negative
	// string offset reads from the end of the string, so the backward scan
	// would no longer stop at the beginning of the string.
	while ($i >= 0 && $i < $length && $n < $p) {
		$c = ord($str[$i]);
		if ($sjis) {
			$isDoubleByte = ($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0);
		} else {
			$isDoubleByte = ($c >= 0x80);
		}
		if ($isDoubleByte) {
			$i += $d; // advance a double-byte char
		}

		$n++;
		$i += $d;
	}
	if ($i < 0 || $i >= $length) {
		return FALSE; // offset beyond string length
	}

	if ($pos < 0) {
		$i++; // correct offset: the backward scan stopped one byte before the character
	}

	return $i;
}
2279
/**
 * Maps all characters of a string in the EUC charset family.
 *
 * The string is split into single-byte and double-byte characters; each one
 * is looked up in the mapping table selected by $mode and copied unchanged
 * if the table has no entry for it.
 *
 * @param string $str EUC multibyte character string
 * @param string $charset The charset ('shift_jis' is treated as a special case)
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt For mode 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string. $str is returned untouched if $mode is unknown or the init method for the mapping table returns a false value.
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char_mapping($str, $charset, $mode, $opt = '') {
	switch ($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) {
				return $str; // do nothing
			}
			$map =& $this->caseFolding[$charset][$opt];
			break;

		case 'ascii':
			if (!$this->initToASCII($charset)) {
				return $str; // do nothing
			}
			$map =& $this->toASCII[$charset];
			break;

		default:
			return $str;
	}

	// Iterate with an explicit length bound and square-bracket offsets: the
	// curly-brace offset syntax no longer parses on PHP 8, and reading past
	// the end of the string to detect termination raises a notice/warning.
	$str = (string) $str;
	$byteCount = strlen($str);
	$sjis = ($charset == 'shift_jis');
	$out = '';
	for ($i = 0; $i < $byteCount; $i++) {
		$mbc = $str[$i];
		$c = ord($mbc);

		if ($sjis) {
			$isDoubleByte = ($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0);
		} else {
			$isDoubleByte = ($c >= 0x80);
		}
		if ($isDoubleByte) { // a double-byte char
			$mbc = substr($str, $i, 2);
			$i++;
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
2338
2339 }
2340
// XCLASS hook: when running inside TYPO3 (TYPO3_MODE is defined) and an
// extension file is registered for this script, load it once.
if (defined('TYPO3_MODE')) {
	if (isset($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php'])) {
		include_once($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
	}
}
2344
2345 ?>