[FEATURE] Allow .ts file extension for static typoscript templates
[Packages/TYPO3.CMS.git] / t3lib / class.t3lib_cs.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2003-2011 Kasper Skårhøj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the Typo3 project. The Typo3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 *
17 * This script is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * This copyright notice MUST APPEAR in all copies of the script!
23 ***************************************************************/
24 /**
25 * Class for conversion between charsets.
26 *
27 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
28 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
29 */
30
31 /**
32 * Notes on UTF-8
33 *
34 * Functions working on UTF-8 strings:
35 *
36 * - strchr/strstr
37 * - strrchr
38 * - substr_count
39 * - implode/explode/join
40 *
41 * Functions nearly working on UTF-8 strings:
42 *
43 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
44 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
45 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
46 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
47 * - preg_*: Support compiled into PHP by default nowadays, but could be unavailable, need to use modifier
48 *
49 * Functions NOT working on UTF-8 strings:
50 *
51 * - str*cmp
52 * - stristr
53 * - stripos
54 * - substr
55 * - strrev
56 * - split/spliti
57 * - ...
58 *
59 */
60 /**
61 * Class for conversion between charsets
62 *
63 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
64 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
65 * @package TYPO3
66 * @subpackage t3lib
67 */
68 class t3lib_cs {
69
70 /**
71 * @var t3lib_l10n_Locales
72 */
73 protected $locales;
74
75 // ASCII Value for chars with no equivalent.
76 var $noCharByteVal = 63;
77
78 // This is the array where parsed conversion tables are stored (cached)
79 var $parsedCharsets = array();
80
81 // An array where case folding data will be stored (cached)
82 var $caseFolding = array();
83
84 // An array where charset-to-ASCII mappings are stored (cached)
85 var $toASCII = array();
86
87 // This tells the converter which charsets has two bytes per char:
88 var $twoByteSets = array(
89 'ucs-2' => 1, // 2-byte Unicode
90 );
91
92 // This tells the converter which charsets has four bytes per char:
93 var $fourByteSets = array(
94 'ucs-4' => 1, // 4-byte Unicode
95 'utf-32' => 1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
96 );
97
98 // This tells the converter which charsets use a scheme like the Extended Unix Code:
99 var $eucBasedSets = array(
100 'gb2312' => 1, // Chinese, simplified.
101 'big5' => 1, // Chinese, traditional.
102 'euc-kr' => 1, // Korean
103 'shift_jis' => 1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
104 );
105
106 // See http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
107 // http://czyborra.com/charsets/iso8859.html
108 var $synonyms = array(
109 'us' => 'ascii',
110 'us-ascii' => 'ascii',
111 'cp819' => 'iso-8859-1',
112 'ibm819' => 'iso-8859-1',
113 'iso-ir-100' => 'iso-8859-1',
114 'iso-ir-101' => 'iso-8859-2',
115 'iso-ir-109' => 'iso-8859-3',
116 'iso-ir-110' => 'iso-8859-4',
117 'iso-ir-144' => 'iso-8859-5',
118 'iso-ir-127' => 'iso-8859-6',
119 'iso-ir-126' => 'iso-8859-7',
120 'iso-ir-138' => 'iso-8859-8',
121 'iso-ir-148' => 'iso-8859-9',
122 'iso-ir-157' => 'iso-8859-10',
123 'iso-ir-179' => 'iso-8859-13',
124 'iso-ir-199' => 'iso-8859-14',
125 'iso-ir-203' => 'iso-8859-15',
126 'csisolatin1' => 'iso-8859-1',
127 'csisolatin2' => 'iso-8859-2',
128 'csisolatin3' => 'iso-8859-3',
129 'csisolatin5' => 'iso-8859-9',
130 'csisolatin8' => 'iso-8859-14',
131 'csisolatin9' => 'iso-8859-15',
132 'csisolatingreek' => 'iso-8859-7',
133 'iso-celtic' => 'iso-8859-14',
134 'latin1' => 'iso-8859-1',
135 'latin2' => 'iso-8859-2',
136 'latin3' => 'iso-8859-3',
137 'latin5' => 'iso-8859-9',
138 'latin6' => 'iso-8859-10',
139 'latin8' => 'iso-8859-14',
140 'latin9' => 'iso-8859-15',
141 'l1' => 'iso-8859-1',
142 'l2' => 'iso-8859-2',
143 'l3' => 'iso-8859-3',
144 'l5' => 'iso-8859-9',
145 'l6' => 'iso-8859-10',
146 'l8' => 'iso-8859-14',
147 'l9' => 'iso-8859-15',
148 'cyrillic' => 'iso-8859-5',
149 'arabic' => 'iso-8859-6',
150 'tis-620' => 'iso-8859-11',
151 'win874' => 'windows-874',
152 'win1250' => 'windows-1250',
153 'win1251' => 'windows-1251',
154 'win1252' => 'windows-1252',
155 'win1253' => 'windows-1253',
156 'win1254' => 'windows-1254',
157 'win1255' => 'windows-1255',
158 'win1256' => 'windows-1256',
159 'win1257' => 'windows-1257',
160 'win1258' => 'windows-1258',
161 'cp1250' => 'windows-1250',
162 'cp1251' => 'windows-1251',
163 'cp1252' => 'windows-1252',
164 'ms-ee' => 'windows-1250',
165 'ms-ansi' => 'windows-1252',
166 'ms-greek' => 'windows-1253',
167 'ms-turk' => 'windows-1254',
168 'winbaltrim' => 'windows-1257',
169 'koi-8ru' => 'koi-8r',
170 'koi8r' => 'koi-8r',
171 'cp878' => 'koi-8r',
172 'mac' => 'macroman',
173 'macintosh' => 'macroman',
174 'euc-cn' => 'gb2312',
175 'x-euc-cn' => 'gb2312',
176 'euccn' => 'gb2312',
177 'cp936' => 'gb2312',
178 'big-5' => 'big5',
179 'cp950' => 'big5',
180 'eucjp' => 'euc-jp',
181 'sjis' => 'shift_jis',
182 'shift-jis' => 'shift_jis',
183 'cp932' => 'shift_jis',
184 'cp949' => 'euc-kr',
185 'utf7' => 'utf-7',
186 'utf8' => 'utf-8',
187 'utf16' => 'utf-16',
188 'utf32' => 'utf-32',
189 'utf8' => 'utf-8',
190 'ucs2' => 'ucs-2',
191 'ucs4' => 'ucs-4',
192 );
193
// Mapping of language identifiers to script names.
// Keys are ISO 639-1 codes, MS language codes or English language names; they are
// looked up in lowercase by get_locale_charset().
var $lang_to_script = array(
	// iso-639-1 language codes, see http://www.loc.gov/standards/iso639-2/php/code_list.php
	'af' => 'west_european', // Afrikaans
	'ar' => 'arabic',
	'bg' => 'cyrillic', // Bulgarian
	'bs' => 'east_european', // Bosnian
	'cs' => 'east_european', // Czech
	'da' => 'west_european', // Danish
	'de' => 'west_european', // German
	'el' => 'greek', // Greek (official ISO 639-1 code)
	'es' => 'west_european', // Spanish
	'et' => 'estonian',
	'eo' => 'unicode', // Esperanto
	'eu' => 'west_european', // Basque
	'fa' => 'arabic', // Persian
	'fi' => 'west_european', // Finnish
	'fo' => 'west_european', // Faroese
	'fr' => 'west_european', // French
	'ga' => 'west_european', // Irish
	'gl' => 'west_european', // Galician
	'gr' => 'greek', // Non-standard code for Greek, kept for backward compatibility
	'he' => 'hebrew', // Hebrew (since 1998)
	'hi' => 'unicode', // Hindi
	'hr' => 'east_european', // Croatian
	'hu' => 'east_european', // Hungarian
	'iw' => 'hebrew', // Hebrew (til 1998)
	'is' => 'west_european', // Icelandic
	'it' => 'west_european', // Italian
	'ja' => 'japanese',
	'ka' => 'unicode', // Georgian
	'kl' => 'west_european', // Greenlandic
	'km' => 'unicode', // Khmer
	'ko' => 'korean',
	'lt' => 'lithuanian',
	'lv' => 'west_european', // Latvian/Lettish
	'nl' => 'west_european', // Dutch
	'no' => 'west_european', // Norwegian
	'nb' => 'west_european', // Norwegian Bokmal
	'nn' => 'west_european', // Norwegian Nynorsk
	'pl' => 'east_european', // Polish
	'pt' => 'west_european', // Portuguese
	'ro' => 'east_european', // Romanian
	'ru' => 'cyrillic', // Russian
	'sk' => 'east_european', // Slovak
	'sl' => 'east_european', // Slovenian
	'sr' => 'cyrillic', // Serbian
	'sv' => 'west_european', // Swedish
	'sq' => 'albanian', // Albanian
	'th' => 'thai',
	'tr' => 'turkish', // Turkish
	'uk' => 'cyrillic', // Ukrainian
	'vi' => 'vietnamese',
	'zh' => 'chinese',
	// MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
	// http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
	'afk' => 'west_european', // Afrikaans
	'ara' => 'arabic',
	'bgr' => 'cyrillic', // Bulgarian
	'cat' => 'west_european', // Catalan
	'chs' => 'simpl_chinese',
	'cht' => 'trad_chinese',
	'csy' => 'east_european', // Czech
	'dan' => 'west_european', // Danish
	'deu' => 'west_european', // German
	'dea' => 'west_european', // German (Austrian)
	'des' => 'west_european', // German (Swiss)
	'ena' => 'west_european', // English (Australian)
	'enc' => 'west_european', // English (Canadian)
	'eng' => 'west_european', // English
	'enz' => 'west_european', // English (New Zealand)
	'enu' => 'west_european', // English (United States)
	'euq' => 'west_european', // Basque
	'fos' => 'west_european', // Faroese
	'far' => 'arabic', // Persian
	'fin' => 'west_european', // Finnish
	'fra' => 'west_european', // French
	'frb' => 'west_european', // French (Belgian)
	'frc' => 'west_european', // French (Canadian)
	'frs' => 'west_european', // French (Swiss)
	'geo' => 'unicode', // Georgian
	'glg' => 'west_european', // Galician
	'ell' => 'greek',
	'heb' => 'hebrew',
	'hin' => 'unicode', // Hindi
	'hun' => 'east_european', // Hungarian
	'isl' => 'west_european', // Icelandic
	'ita' => 'west_european', // Italian
	'its' => 'west_european', // Italian (Swiss)
	'jpn' => 'japanese',
	'khm' => 'unicode', // Khmer
	'kor' => 'korean',
	'lth' => 'lithuanian',
	'lvi' => 'west_european', // Latvian/Lettish
	'msl' => 'west_european', // Malay
	'nlb' => 'west_european', // Dutch (Belgian)
	'nld' => 'west_european', // Dutch
	'nor' => 'west_european', // Norwegian (bokmal)
	'non' => 'west_european', // Norwegian (nynorsk)
	'plk' => 'east_european', // Polish
	'ptg' => 'west_european', // Portuguese
	'ptb' => 'west_european', // Portuguese (Brazil)
	'rom' => 'east_european', // Romanian
	'rus' => 'cyrillic', // Russian
	'slv' => 'east_european', // Slovenian
	'sky' => 'east_european', // Slovak
	'srl' => 'east_european', // Serbian (Latin)
	'srb' => 'cyrillic', // Serbian (Cyrillic)
	'esp' => 'west_european', // Spanish (trad. sort)
	'esm' => 'west_european', // Spanish (Mexican)
	'esn' => 'west_european', // Spanish (internat. sort)
	'sve' => 'west_european', // Swedish
	'sqi' => 'albanian', // Albanian
	'tha' => 'thai',
	'trk' => 'turkish',
	'ukr' => 'cyrillic', // Ukrainian
	// English language names
	'afrikaans' => 'west_european',
	'albanian' => 'albanian',
	'arabic' => 'arabic',
	'basque' => 'west_european',
	'bosnian' => 'east_european',
	'bulgarian' => 'cyrillic', // Same script as the 'bg' / 'bgr' codes
	'catalan' => 'west_european',
	'croatian' => 'east_european',
	'czech' => 'east_european',
	'danish' => 'west_european',
	'dutch' => 'west_european',
	'english' => 'west_european',
	'esperanto' => 'unicode',
	'estonian' => 'estonian',
	'faroese' => 'west_european',
	'farsi' => 'arabic',
	'finnish' => 'west_european',
	'french' => 'west_european',
	'galician' => 'west_european',
	'georgian' => 'unicode',
	'german' => 'west_european',
	'greek' => 'greek',
	'greenlandic' => 'west_european',
	'hebrew' => 'hebrew',
	'hindi' => 'unicode',
	'hungarian' => 'east_european',
	'icelandic' => 'west_european',
	'italian' => 'west_european',
	'khmer' => 'unicode',
	'latvian' => 'west_european',
	'lettish' => 'west_european',
	'lithuanian' => 'lithuanian',
	'malay' => 'west_european',
	'norwegian' => 'west_european',
	'persian' => 'arabic',
	'polish' => 'east_european',
	'portuguese' => 'west_european',
	'russian' => 'cyrillic',
	'romanian' => 'east_european',
	'serbian' => 'cyrillic',
	'slovak' => 'east_european',
	'slovenian' => 'east_european',
	'spanish' => 'west_european',
	'swedish' => 'west_european',
	'svedish' => 'west_european', // Misspelled key, kept for backward compatibility
	'thai' => 'thai',
	'that' => 'thai', // Misspelled key, kept for backward compatibility
	'turkish' => 'turkish',
	'ukrainian' => 'cyrillic',
);
357
358 // Mapping of language (family) names to charsets on Unix
359 var $script_to_charset_unix = array(
360 'west_european' => 'iso-8859-1',
361 'estonian' => 'iso-8859-1',
362 'east_european' => 'iso-8859-2',
363 'baltic' => 'iso-8859-4',
364 'cyrillic' => 'iso-8859-5',
365 'arabic' => 'iso-8859-6',
366 'greek' => 'iso-8859-7',
367 'hebrew' => 'iso-8859-8',
368 'turkish' => 'iso-8859-9',
369 'thai' => 'iso-8859-11', // = TIS-620
370 'lithuanian' => 'iso-8859-13',
371 'chinese' => 'gb2312', // = euc-cn
372 'japanese' => 'euc-jp',
373 'korean' => 'euc-kr',
374 'simpl_chinese' => 'gb2312',
375 'trad_chinese' => 'big5',
376 'vietnamese' => '',
377 'unicode' => 'utf-8',
378 'albanian' => 'utf-8'
379 );
380
381 // Mapping of language (family) names to charsets on Windows
382 var $script_to_charset_windows = array(
383 'east_european' => 'windows-1250',
384 'cyrillic' => 'windows-1251',
385 'west_european' => 'windows-1252',
386 'greek' => 'windows-1253',
387 'turkish' => 'windows-1254',
388 'hebrew' => 'windows-1255',
389 'arabic' => 'windows-1256',
390 'baltic' => 'windows-1257',
391 'estonian' => 'windows-1257',
392 'lithuanian' => 'windows-1257',
393 'vietnamese' => 'windows-1258',
394 'thai' => 'cp874',
395 'korean' => 'cp949',
396 'chinese' => 'gb2312',
397 'japanese' => 'shift_jis',
398 'simpl_chinese' => 'gb2312',
399 'trad_chinese' => 'big5',
400 'albanian' => 'windows-1250',
401 'unicode' => 'utf-8'
402 );
403
404 // Mapping of locale names to charsets
405 var $locale_to_charset = array(
406 'japanese.euc' => 'euc-jp',
407 'ja_jp.ujis' => 'euc-jp',
408 'korean.euc' => 'euc-kr',
409 'sr@Latn' => 'iso-8859-2',
410 'zh_cn' => 'gb2312',
411 'zh_hk' => 'big5',
412 'zh_tw' => 'big5',
413 );
414
415 // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
416 // Empty values means "iso-8859-1"
417 var $charSetArray = array(
418 'af' => '',
419 'ar' => 'iso-8859-6',
420 'ba' => 'iso-8859-2',
421 'bg' => 'windows-1251',
422 'br' => '',
423 'ca' => 'iso-8859-15',
424 'ch' => 'gb2312',
425 'cs' => 'windows-1250',
426 'cz' => 'windows-1250',
427 'da' => '',
428 'de' => '',
429 'dk' => '',
430 'el' => 'iso-8859-7',
431 'eo' => 'utf-8',
432 'es' => '',
433 'et' => 'iso-8859-4',
434 'eu' => '',
435 'fa' => 'utf-8',
436 'fi' => '',
437 'fo' => 'utf-8',
438 'fr' => '',
439 'fr_CA' => '',
440 'ga' => '',
441 'ge' => 'utf-8',
442 'gl' => '',
443 'gr' => 'iso-8859-7',
444 'he' => 'utf-8',
445 'hi' => 'utf-8',
446 'hk' => 'big5',
447 'hr' => 'windows-1250',
448 'hu' => 'iso-8859-2',
449 'is' => 'utf-8',
450 'it' => '',
451 'ja' => 'shift_jis',
452 'jp' => 'shift_jis',
453 'ka' => 'utf-8',
454 'kl' => 'utf-8',
455 'km' => 'utf-8',
456 'ko' => 'euc-kr',
457 'kr' => 'euc-kr',
458 'lt' => 'windows-1257',
459 'lv' => 'utf-8',
460 'ms' => '',
461 'my' => '',
462 'nl' => '',
463 'no' => '',
464 'pl' => 'iso-8859-2',
465 'pt' => '',
466 'pt_BR' => '',
467 'qc' => '',
468 'ro' => 'iso-8859-2',
469 'ru' => 'windows-1251',
470 'se' => '',
471 'si' => 'windows-1250',
472 'sk' => 'windows-1250',
473 'sl' => 'windows-1250',
474 'sq' => 'utf-8',
475 'sr' => 'utf-8',
476 'sv' => '',
477 'th' => 'iso-8859-11',
478 'tr' => 'iso-8859-9',
479 'ua' => 'windows-1251',
480 'uk' => 'windows-1251',
481 'vi' => 'utf-8',
482 'vn' => 'utf-8',
483 'zh' => 'big5',
484 );
485
/**
 * Constructor: obtains a t3lib_l10n_Locales instance through
 * t3lib_div::makeInstance() and keeps it in $this->locales.
 */
public function __construct() {
	$localesInstance = t3lib_div::makeInstance('t3lib_l10n_Locales');
	$this->locales = $localesInstance;
}
492
/**
 * Normalizes a charset name: the input is lowercased and trimmed, and if the
 * result is a known alias in $this->synonyms the canonical name is returned.
 *
 * @param string $charset Input charset
 * @return string Normalized charset
 */
function parse_charset($charset) {
	$normalized = trim(strtolower($charset));

	return isset($this->synonyms[$normalized])
		? $this->synonyms[$normalized]
		: $normalized;
}
507
/**
 * Get the charset of a locale.
 *
 * Accepted locale formats:
 *   ln            language
 *   ln_CN         language / country
 *   ln_CN.cs      language / country / charset
 *   ln_CN.cs@mod  language / country / charset / modifier
 *
 * Resolution order:
 *   1. an entry in $this->locale_to_charset (compared case-insensitively),
 *   2. the charset embedded in the locale string,
 *   3. the 'euro' modifier (iso-8859-15),
 *   4. the script of the language, mapped through the OS specific table,
 *   5. the OS default ('windows-1252' on Windows, 'utf-8' otherwise).
 *
 * @param string $locale Locale string
 * @return string Charset resolved for locale string
 */
function get_locale_charset($locale) {
	$locale = strtolower($locale);

	// Exact locale specific charset? The table contains mixed-case keys
	// (e.g. 'sr@Latn'), so the keys are lowercased for the comparison as well.
	foreach ($this->locale_to_charset as $knownLocale => $knownCharset) {
		if (strtolower($knownLocale) === $locale) {
			return $knownCharset;
		}
	}

	// Split off the modifier; array_pad() guarantees both list() targets exist
	list($locale, $modifier) = array_pad(explode('@', $locale), 2, '');

	// Locale contains charset: use it
	list($locale, $charset) = array_pad(explode('.', $locale), 2, '');
	if ($charset) {
		return $this->parse_charset($charset);
	}

	// Modifier is 'euro' (after charset check, because of xx.utf-8@euro)
	if ($modifier === 'euro') {
		return 'iso-8859-15';
	}

	// Only the language part (before the first underscore) is relevant from here on
	$localeParts = explode('_', $locale);
	$language = $localeParts[0];
	$script = isset($this->lang_to_script[$language]) ? $this->lang_to_script[$language] : '';

	if (TYPO3_OS == 'WIN') {
		$scriptToCharset = $this->script_to_charset_windows;
		$fallbackCharset = 'windows-1252';
	} else {
		$scriptToCharset = $this->script_to_charset_unix;
		$fallbackCharset = 'utf-8';
	}

	// Unknown scripts and empty table values (e.g. 'vietnamese' on Unix) use the fallback
	return !empty($scriptToCharset[$script]) ? $scriptToCharset[$script] : $fallbackCharset;
}
555
556 /********************************************
557 *
558 * Charset Conversion functions
559 *
560 ********************************************/
561
/**
 * Convert a string from one charset to another.
 *
 * If the configured conversion method ($TYPO3_CONF_VARS['SYS']['t3lib_cs_convMethod'])
 * is 'mbstring', 'iconv' or 'recode', that library is tried first. It is skipped
 * when entity fallback is requested for a non-UTF-8 target. If the library call
 * returns FALSE, or no library applies, the built-in table based conversion is
 * used with UTF-8 as the intermediate charset.
 *
 * @param string $str Input string
 * @param string $fromCS From charset (the current charset of the string)
 * @param string $toCS To charset (the output charset wanted)
 * @param boolean $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Converted string
 * @see convArray()
 */
function conv($str, $fromCS, $toCS, $useEntityForNoChar = 0) {
	// Same charset on both sides: nothing to convert
	if ($fromCS == $toCS) {
		return $str;
	}

	// PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
	$nativeLibraryAllowed = ($toCS == 'utf-8' || !$useEntityForNoChar);
	if ($nativeLibraryAllowed) {
		$method = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'];
		$converted = FALSE;

		if ($method == 'mbstring') {
			// Returns FALSE for unsupported charsets
			$converted = mb_convert_encoding($str, $toCS, $fromCS);
		} elseif ($method == 'iconv') {
			$converted = iconv($fromCS, $toCS . '//TRANSLIT', $str);
		} elseif ($method == 'recode') {
			$converted = recode_string($fromCS . '..' . $toCS, $str);
		}

		if ($converted !== FALSE) {
			return $converted;
		}
		// Otherwise fall through to the TYPO3 conversion below
	}

	// Built-in conversion: source charset -> UTF-8 -> target charset
	$result = $str;
	if ($fromCS != 'utf-8') {
		$result = $this->utf8_encode($result, $fromCS);
	}
	if ($toCS != 'utf-8') {
		$result = $this->utf8_decode($result, $toCS, $useEntityForNoChar);
	}

	return $result;
}
612
/**
 * Converts every string element of an array from one charset to another.
 * Nested arrays are processed recursively; elements of any other type are left alone.
 * NOTICE: The array is passed by reference and modified in place!
 *
 * @param array $array Input array, possibly multidimensional
 * @param string $fromCS From charset (the current charset of the string)
 * @param string $toCS To charset (the output charset wanted)
 * @param boolean $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return void
 * @see conv()
 */
function convArray(&$array, $fromCS, $toCS, $useEntityForNoChar = 0) {
	foreach ($array as $key => $element) {
		if (is_string($element)) {
			$array[$key] = $this->conv($element, $fromCS, $toCS, $useEntityForNoChar);
		} elseif (is_array($element)) {
			// Recurse on the original slot so that nested changes are written back
			$this->convArray($array[$key], $fromCS, $toCS, $useEntityForNoChar);
		}
	}
}
633
/**
 * Converts $str from $charset to UTF-8
 *
 * Bytes 0-127 are copied unchanged. All other characters are looked up in the
 * parsed conversion table of the charset; characters without a mapping are
 * replaced by the placeholder byte $this->noCharByteVal.
 *
 * @param string $str String in local charset to convert to UTF-8
 * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return string Output string, converted to UTF-8. Empty string if the charset could not be initialized.
 */
function utf8_encode($str, $charset) {
	if ($charset === 'utf-8') {
		return $str;
	}

	// Parse conv. table if not already; without a table nothing can be converted
	if (!$this->initCharset($charset)) {
		return '';
	}

	// Loop invariants
	$localToUtf8 = isset($this->parsedCharsets[$charset]['local'])
		? $this->parsedCharsets[$charset]['local']
		: array();
	$isTwoByteSet = isset($this->twoByteSets[$charset]);
	$isEucBasedSet = isset($this->eucBasedSets[$charset]);
	$noChar = chr($this->noCharByteVal);

	$strLen = strlen($str);
	$outStr = '';

	// Traverse each char in string
	for ($a = 0; $a < $strLen; $a++) {
		$chr = substr($str, $a, 1);
		$ord = ord($chr);

		if ($isTwoByteSet) {
			// Two bytes per char, big endian assumed. substr() yields an empty
			// value past the end of the string, so a truncated pair reads as 0.
			$a++;
			$ord = ($ord << 8) | ord(substr($str, $a, 1));
		} elseif ($ord > 127) {
			// EUC uses two bytes above 127; we get both, advance the pointer and make $ord a 16 bit int.
			// Shift-JIS: chars between 160 and 223 are single byte.
			if ($isEucBasedSet && ($charset != 'shift_jis' || $ord < 0xA0 || $ord > 0xDF)) {
				$a++;
				$ord = $ord * 256 + ord(substr($str, $a, 1));
			}
		} else {
			// ASCII 0-127 is one byte and passed through transparently
			$outStr .= $chr;
			continue;
		}

		// Use the mapped UTF-8 sequence if the local char number is known, otherwise the placeholder
		$outStr .= isset($localToUtf8[$ord]) ? $localToUtf8[$ord] : $noChar;
	}

	return $outStr;
}
694
/**
 * Converts $str from UTF-8 to $charset
 *
 * Bytes 0-127 are copied unchanged. Each multibyte UTF-8 sequence is looked up
 * in the parsed conversion table of the charset. Sequences without a mapping
 * become a numeric entity (if $useEntityForNoChar is set) or the placeholder
 * byte $this->noCharByteVal.
 *
 * @param string $str String in UTF-8 to convert to local charset
 * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param boolean $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Output string, converted to local charset. Empty string if the charset could not be initialized.
 */
function utf8_decode($str, $charset, $useEntityForNoChar = 0) {
	if ($charset === 'utf-8') {
		return $str;
	}

	// Parse conv. table if not already; without a table nothing can be converted
	if (!$this->initCharset($charset)) {
		return '';
	}

	$noChar = chr($this->noCharByteVal);
	$strLen = strlen($str);
	$outStr = '';

	// Traverse each char in UTF-8 string
	for ($a = 0; $a < $strLen; $a++) {
		$chr = substr($str, $a, 1);
		$ord = ord($chr);

		// ASCII 0-127 is one byte and passed through transparently
		if ($ord <= 127) {
			$outStr .= $chr;
			continue;
		}

		// A first byte must have the 7th bit set. Otherwise we are in the MIDDLE of a byte sequence.
		if (!($ord & 64)) {
			$outStr .= $noChar;
			continue;
		}

		// Collect the whole sequence: every further leading 1-bit of the first byte announces one more byte
		$buf = $chr;
		for ($b = 0; $b < 8; $b++) {
			$ord = $ord << 1;
			if (!($ord & 128)) {
				break;
			}
			$a++;
			$buf .= substr($str, $a, 1);
		}

		if (isset($this->parsedCharsets[$charset]['utf8'][$buf])) {
			// The local number
			$mByte = $this->parsedCharsets[$charset]['utf8'][$buf];
			// If the local number is greater than 255 it is split into two chars (16 bit word assumed)
			$outStr .= $mByte > 255
				? chr(($mByte >> 8) & 255) . chr($mByte & 255)
				: chr($mByte);
		} elseif ($useEntityForNoChar) {
			// Create num entity
			$outStr .= '&#' . $this->utf8CharToUnumber($buf, 1) . ';';
		} else {
			// No char exists
			$outStr .= $noChar;
		}
	}

	return $outStr;
}
765
/**
 * Converts all chars > 127 of a UTF-8 string to numeric (hexadecimal) entities.
 * Bytes that cannot start a UTF-8 sequence are replaced by the placeholder
 * byte $this->noCharByteVal.
 *
 * @param string $str Input string
 * @return string Output string
 */
function utf8_to_entities($str) {
	$length = strlen($str);
	$result = '';
	$pos = 0;

	// Walk through the string byte by byte
	while ($pos < $length) {
		$byte = substr($str, $pos, 1);
		$code = ord($byte);

		if ($code <= 127) {
			// Plain ASCII: copied as is
			$result .= $byte;
		} elseif (!($code & 64)) {
			// Continuation byte without a first byte: no char exists
			$result .= chr($this->noCharByteVal);
		} else {
			// First byte of a multibyte sequence: gather the bytes that follow
			$sequence = $byte;
			for ($shift = 0; $shift < 8; $shift++) {
				$code = $code << 1;
				// As long as the 8th bit is set after shifting, there are still bytes in the sequence
				if (!($code & 128)) {
					break;
				}
				$pos++;
				$sequence .= substr($str, $pos, 1);
			}
			$result .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
		}

		$pos++;
	}

	return $result;
}
812
/**
 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
 *
 * Only well-formed numeric entities are converted. Anything else that merely
 * looks like an entity (e.g. "&#;", "&#abc;" or an unknown name) is returned
 * unchanged.
 *
 * @param string $str Input string, UTF-8
 * @param boolean $alsoStdHtmlEnt If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
 * @return string Output string
 */
function entities_to_utf8($str, $alsoStdHtmlEnt = FALSE) {
	// Lookup table "entity => character"; stays empty if named entities are not wanted
	$trans_tbl = $alsoStdHtmlEnt
		? array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8'))
		: array();

	// Split at every entity candidate and keep the part between "&" and ";":
	// even indexes hold plain text, odd indexes hold the entity bodies.
	$parts = preg_split('/&([#[:alnum:]]*);/', $str, -1, PREG_SPLIT_DELIM_CAPTURE);

	foreach ($parts as $k => $v) {
		// Only take every second element
		if ($k % 2 === 0) {
			continue;
		}

		$match = array();
		if (preg_match('/^#[xX]([0-9a-fA-F]+)$/', $v, $match)) {
			// Hexadecimal entity
			$parts[$k] = $this->UnumberToChar(hexdec($match[1]));
		} elseif (preg_match('/^#([0-9]+)$/', $v, $match)) {
			// Decimal entity
			$parts[$k] = $this->UnumberToChar((int) $match[1]);
		} elseif (isset($trans_tbl['&' . $v . ';'])) {
			// Named entity
			$parts[$k] = $trans_tbl['&' . $v . ';'];
		} else {
			// No conversion: restore the original text
			$parts[$k] = '&' . $v . ';';
		}
	}

	return implode('', $parts);
}
852
/**
 * Splits a UTF-8 string into its characters and returns them as an array,
 * either as UNICODE integer numbers or as the UTF-8 characters themselves.
 * Bytes that cannot start a UTF-8 sequence yield the placeholder $this->noCharByteVal.
 *
 * @param string $str Input string, UTF-8
 * @param boolean $convEntities If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
 * @param boolean $retChar If set, then instead of integer numbers the real UTF-8 char is returned.
 * @return array Output array with the char numbers
 */
function utf8_to_numberarray($str, $convEntities = 0, $retChar = 0) {
	// Resolve entities first if they are to be counted as characters
	if ($convEntities) {
		$str = $this->entities_to_utf8($str, 1);
	}

	$length = strlen($str);
	$result = array();
	$pos = 0;

	// Walk through the string byte by byte
	while ($pos < $length) {
		$byte = substr($str, $pos, 1);
		$code = ord($byte);

		if ($code <= 127) {
			// Plain ASCII
			$result[] = $retChar ? chr($code) : $code;
		} elseif (!($code & 64)) {
			// Continuation byte without a first byte: no char exists
			$result[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
		} else {
			// First byte of a multibyte sequence: gather the bytes that follow
			$sequence = $byte;
			for ($shift = 0; $shift < 8; $shift++) {
				$code = $code << 1;
				// As long as the 8th bit is set after shifting, there are still bytes in the sequence
				if (!($code & 128)) {
					break;
				}
				$pos++;
				$sequence .= substr($str, $pos, 1);
			}
			$result[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
		}

		$pos++;
	}

	return $result;
}
905
/**
 * Converts a UNICODE number to a UTF-8 multibyte character
 * Algorithm based on script found at From: http://czyborra.com/utf/
 *
 * The bits of the number are spread over the bytes of the sequence; the count
 * of leading 1-bits in the first byte tells how many bytes the sequence has:
 *
 *  bytes | bits | representation
 *      1 |    7 | 0vvvvvvv
 *      2 |   11 | 110vvvvv 10vvvvvv
 *      3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
 *      4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * Numbers of 0x80000000 and above cannot be expressed and yield the
 * placeholder byte $this->noCharByteVal.
 *
 * @param integer $cbyte UNICODE integer
 * @return string UTF-8 multibyte character string
 * @see utf8CharToUnumber()
 */
function UnumberToChar($cbyte) {
	// 1 byte
	if ($cbyte < 0x80) {
		return '' . chr($cbyte);
	}
	// 2 bytes
	if ($cbyte < 0x800) {
		return chr(0xC0 | ($cbyte >> 6))
			. chr(0x80 | ($cbyte & 0x3F));
	}
	// 3 bytes
	if ($cbyte < 0x10000) {
		return chr(0xE0 | ($cbyte >> 12))
			. chr(0x80 | (($cbyte >> 6) & 0x3F))
			. chr(0x80 | ($cbyte & 0x3F));
	}
	// 4 bytes
	if ($cbyte < 0x200000) {
		return chr(0xF0 | ($cbyte >> 18))
			. chr(0x80 | (($cbyte >> 12) & 0x3F))
			. chr(0x80 | (($cbyte >> 6) & 0x3F))
			. chr(0x80 | ($cbyte & 0x3F));
	}
	// 5 bytes
	if ($cbyte < 0x4000000) {
		return chr(0xF8 | ($cbyte >> 24))
			. chr(0x80 | (($cbyte >> 18) & 0x3F))
			. chr(0x80 | (($cbyte >> 12) & 0x3F))
			. chr(0x80 | (($cbyte >> 6) & 0x3F))
			. chr(0x80 | ($cbyte & 0x3F));
	}
	// 6 bytes
	if ($cbyte < 0x80000000) {
		return chr(0xFC | ($cbyte >> 30))
			. chr(0x80 | (($cbyte >> 24) & 0x3F))
			. chr(0x80 | (($cbyte >> 18) & 0x3F))
			. chr(0x80 | (($cbyte >> 12) & 0x3F))
			. chr(0x80 | (($cbyte >> 6) & 0x3F))
			. chr(0x80 | ($cbyte & 0x3F));
	}

	// Cannot express a 32-bit character in UTF-8
	return '' . chr($this->noCharByteVal);
}
971
/**
 * Converts a UTF-8 multibyte character to a UNICODE number
 *
 * Only the first character of $str is evaluated. If the first byte does not
 * start a multibyte sequence, its byte value is returned.
 *
 * @param string $str UTF-8 multibyte character string
 * @param boolean $hex If set, then a hex. number is returned (as string, prefixed with "x").
 * @return integer UNICODE integer
 * @see UnumberToChar()
 */
function utf8CharToUnumber($str, $hex = 0) {
	$firstByte = ord(substr($str, 0, 1));
	$number = $firstByte;

	// Both top bits set: this IS the first byte of a multibyte sequence
	if (($firstByte & 192) == 192) {
		$shifted = $firstByte;
		$payloadBits = '';

		// Every further leading 1-bit of the first byte announces one following byte,
		// each of which contributes its lower six bits.
		for ($b = 0; $b < 8; $b++) {
			$shifted = $shifted << 1;
			if (!($shifted & 128)) {
				break;
			}
			$followingByte = ord(substr($str, $b + 1, 1));
			$payloadBits .= substr('00000000' . decbin($followingByte), -6);
		}

		// The first byte contributes its lowest (6 - $b) bits
		$leadBits = substr('00000000' . decbin($firstByte), -(6 - $b));
		$number = bindec($leadBits . $payloadBits);
	}

	if ($hex) {
		return 'x' . dechex($number);
	}

	return $number;
}
1008
1009 /********************************************
1010 *
1011 * Init functions
1012 *
1013 ********************************************/
1014
/**
 * Loads the conversion table of a charset into $this->parsedCharsets[$charset].
 * The table is read from PATH_t3lib . 'csconvtbl/[charset].tbl'. A serialized copy of the
 * parsed table is kept in typo3temp/cs/charset_[charset].tbl and preferred when it exists.
 * Only local values above 127 are stored (both directions: 'local' and 'utf8').
 * This function is automatically called by the conversion functions
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param string $charset The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return integer Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and loaded (from cache or by parsing).
 * @access private
 */
function initCharset($charset) {
	// Nothing to do if the charset has been loaded before:
	if (is_array($this->parsedCharsets[$charset])) {
		return 1;
	}

	// Conversion table filename:
	$charsetConvTableFile = PATH_t3lib . 'csconvtbl/' . $charset . '.tbl';
	$tableAvailable = $charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile);
	if (!$tableAvailable) {
		return FALSE;
	}

	// Cache file for charsets:
	// Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_' . $charset . '.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->parsedCharsets[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

	// The "whitespaced" syntax looks like "0x0A 0x000A #LINE FEED"
	$whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';

	// Parse conversion table into lines and reset the internal table:
	$tableLines = t3lib_div::trimExplode(LF, t3lib_div::getUrl($charsetConvTableFile), 1);
	$this->parsedCharsets[$charset] = array('local' => array(), 'utf8' => array());

	$syntax = '';
	foreach ($tableLines as $tableLine) {
		// Comment line or blanks are ignored.
		if (!trim($tableLine) || substr($tableLine, 0, 1) == '#') {
			continue;
		}

		// The syntax is detected once, on the first real line.
		// Everything not "whitespaced" is taken as "ms-token", which is like "B9 = U+00B9 : SUPERSCRIPT ONE"
		if (!$syntax) {
			$syntax = preg_match($whitespacedPattern, $tableLine) ? 'whitespaced' : 'ms-token';
		}

		if ($syntax == 'ms-token') {
			list($hexbyte, $utf8) = preg_split('/[=:]/', $tableLine, 3);
		} elseif ($syntax == 'whitespaced') {
			$regA = array();
			preg_match($whitespacedPattern, $tableLine, $regA);
			$hexbyte = $regA[1];
			$utf8 = 'U+' . $regA[2];
		}

		$localValue = hexdec(trim($hexbyte));
		if ($localValue > 127) {
			// Strip the leading "U+" before converting the Unicode number
			$utf8Char = $this->UnumberToChar(hexdec(substr(trim($utf8), 2)));
			$this->parsedCharsets[$charset]['local'][$localValue] = $utf8Char;
			$this->parsedCharsets[$charset]['utf8'][$utf8Char] = $localValue;
		}
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->parsedCharsets[$charset]));
	}

	return 2;
}
1084
/**
 * This function initializes all UTF-8 character data tables.
 *
 * For the modes "case" and "ascii" an already loaded table or a cached table from
 * typo3temp/cs/ is reused. Otherwise PATH_t3lib . 'unidata/UnicodeData.txt' (plus
 * SpecialCasing.txt and Translit.txt from the same folder, if present) is parsed,
 * BOTH tables ($this->caseFolding['utf-8'] and $this->toASCII['utf-8']) are rebuilt
 * and written to their cache files.
 *
 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
 *
 * @param string $mode Mode ("case", "ascii", ...)
 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initUnicodeData($mode = NULL) {
	// Cache files
	$cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
	$cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');

	// Only process if the tables are not yet loaded
	switch ($mode) {
		case 'case':
			if (is_array($this->caseFolding['utf-8'])) {
				return 1;
			}

			// Use cached version if possible
			if ($cacheFileCase && @is_file($cacheFileCase)) {
				$this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase));
				return 2;
			}
			break;

		case 'ascii':
			if (is_array($this->toASCII['utf-8'])) {
				return 1;
			}

			// Use cached version if possible
			if ($cacheFileASCII && @is_file($cacheFileASCII)) {
				$this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII));
				return 2;
			}
			break;
	}

	// Process main Unicode data file
	$unicodeDataFile = PATH_t3lib . 'unidata/UnicodeData.txt';
	if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
		return FALSE;
	}

	$fh = fopen($unicodeDataFile, 'rb');
	if (!$fh) {
		return FALSE;
	}

	// key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
	// Note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
	$this->caseFolding['utf-8'] = array();
	$utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand
	$utf8CaseFolding['toUpper'] = array();
	$utf8CaseFolding['toLower'] = array();
	$utf8CaseFolding['toTitle'] = array();

	// Array of temp. decompositions
	$decomposition = array();
	// Array of chars that are marks (eg. composing accents)
	$mark = array();
	// Array of chars that are numbers (eg. digits)
	$number = array();
	// Array of chars to be omitted (eg. Russian hard sign)
	$omit = array();

	while (!feof($fh)) {
		$line = fgets($fh, 4096);
		if ($line === FALSE) {
			// Nothing more to read (EOF or read error)
			break;
		}
		if (trim($line) === '') {
			// A blank line carries no record
			continue;
		}

		// Has a lot of info; pad so that short lines do not trigger undefined offsets
		list($char, $name, $cat, , , $decomp, , , $num, , , , $upper, $lower, $title) = array_pad(explode(';', rtrim($line)), 15, '');

		$ord = hexdec($char);
		if ($ord > 0xFFFF) {
			// Only process the BMP
			break;
		}

		$utf8_char = $this->UnumberToChar($ord);

		if ($upper) {
			$utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
		}
		if ($lower) {
			$utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
		}
		// Store "title" only when different from "upper" (only a few)
		if ($title && $title != $upper) {
			$utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
		}

		// The first letter of the general category decides
		switch (substr($cat, 0, 1)) {
			case 'M': // mark (accent, umlaut, ...)
				$mark['U+' . $char] = 1;
				break;

			case 'N': // numeric value
				if ($ord > 0x80 && $num != '') {
					$number['U+' . $char] = $num;
				}
		}

		// Accented Latin letters without "official" decomposition
		$match = array();
		if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
			$c = ord($match[2]);
			if ($match[1] == 'SMALL') {
				$c += 32;
			}

			$decomposition['U+' . $char] = array(dechex($c));
			continue;
		}

		$match = array();
		if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
			switch ($match[1]) {
				case '<circle>': // add parenthesis as circle replacement, eg (1)
					$match[2] = '0028 ' . $match[2] . ' 0029';
					break;

				case '<square>': // add square brackets as square replacement, eg [1]
					$match[2] = '005B ' . $match[2] . ' 005D';
					break;

				case '<compat>': // ignore multi char decompositions that start with a space
					if (preg_match('/^0020 /', $match[2])) {
						continue 2;
					}
					break;

				// Ignore Arabic and vertical layout presentation decomposition
				case '<initial>':
				case '<medial>':
				case '<final>':
				case '<isolated>':
				case '<vertical>':
					continue 2;
			}
			$decomposition['U+' . $char] = explode(' ', $match[2]);
		}
	}
	fclose($fh);

	// Process additional Unicode data for casing (allow folded characters to expand into a sequence)
	$specialCasingFile = PATH_t3lib . 'unidata/SpecialCasing.txt';
	if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
		$fh = fopen($specialCasingFile, 'rb');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh, 4096);
				if ($line === FALSE) {
					break;
				}
				// Skip comments and blank lines
				if (substr($line, 0, 1) == '#' || trim($line) == '') {
					continue;
				}

				list($char, $lower, $title, $upper, $cond) = array_pad(t3lib_div::trimExplode(';', $line), 5, '');
				// Only unconditional mappings are used: the 5th field is empty or just the trailing comment
				if (!($cond == '' || substr($cond, 0, 1) == '#')) {
					continue;
				}

				$utf8_char = $this->UnumberToChar(hexdec($char));
				if ($char != $lower) {
					$arr = explode(' ', $lower);
					foreach ($arr as $i => $code) {
						$arr[$i] = $this->UnumberToChar(hexdec($code));
					}
					$utf8CaseFolding['toLower'][$utf8_char] = implode('', $arr);
				}
				if ($char != $title && $title != $upper) {
					$arr = explode(' ', $title);
					foreach ($arr as $i => $code) {
						$arr[$i] = $this->UnumberToChar(hexdec($code));
					}
					$utf8CaseFolding['toTitle'][$utf8_char] = implode('', $arr);
				}
				if ($char != $upper) {
					$arr = explode(' ', $upper);
					foreach ($arr as $i => $code) {
						$arr[$i] = $this->UnumberToChar(hexdec($code));
					}
					$utf8CaseFolding['toUpper'][$utf8_char] = implode('', $arr);
				}
			}
			fclose($fh);
		}
	}

	// Process custom decompositions
	$customTranslitFile = PATH_t3lib . 'unidata/Translit.txt';
	if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
		$fh = fopen($customTranslitFile, 'rb');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh, 4096);
				if ($line === FALSE) {
					break;
				}
				// Skip comments and blank lines
				if (substr($line, 0, 1) == '#' || trim($line) == '') {
					continue;
				}

				list($char, $translit) = array_pad(t3lib_div::trimExplode(';', $line), 2, '');
				// An empty transliteration means: drop this character
				if (!$translit) {
					$omit['U+' . $char] = 1;
				}
				$decomposition['U+' . $char] = explode(' ', $translit);
			}
			fclose($fh);
		}
	}

	// Decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
	foreach ($decomposition as $from => $to) {
		$code_decomp = array();

		while ($code_value = array_shift($to)) {
			// Do recursive decomposition
			if (isset($decomposition['U+' . $code_value])) {
				foreach (array_reverse($decomposition['U+' . $code_value]) as $cv) {
					array_unshift($to, $cv);
				}
			} elseif (!isset($mark['U+' . $code_value])) { // remove mark
				array_push($code_decomp, $code_value);
			}
		}
		if (count($code_decomp) || isset($omit[$from])) {
			$decomposition[$from] = $code_decomp;
		} else {
			unset($decomposition[$from]);
		}
	}

	// Create ascii only mapping
	$this->toASCII['utf-8'] = array();
	$ascii =& $this->toASCII['utf-8'];

	foreach ($decomposition as $from => $to) {
		$code_decomp = array();
		while ($code_value = array_shift($to)) {
			$ord = hexdec($code_value);
			if ($ord > 127) {
				// Skip decompositions containing non-ASCII chars
				continue 2;
			}
			array_push($code_decomp, chr($ord));
		}
		// Keys look like "U+00E9": strip the prefix before converting the hex number
		$ascii[$this->UnumberToChar(hexdec(substr($from, 2)))] = join('', $code_decomp);
	}

	// Add numeric decompositions
	foreach ($number as $from => $to) {
		$utf8_char = $this->UnumberToChar(hexdec(substr($from, 2)));
		if (!isset($ascii[$utf8_char])) {
			$ascii[$utf8_char] = $to;
		}
	}

	if ($cacheFileCase) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
	}

	if ($cacheFileASCII) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
	}

	return 3;
}
1347
/**
 * Builds the case folding table ($this->caseFolding[$charset]) for a charset other than UTF-8.
 * The table is derived from the UTF-8 case folding table by converting every character of
 * the charset's conversion table back into the charset; the plain ASCII letters are added last.
 * This function is automatically called by the case folding functions.
 *
 * @param string $charset Charset for which to initialize case folding.
 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initCaseFolding($charset) {
	// Table is in memory already
	if (is_array($this->caseFolding[$charset])) {
		return 1;
	}

	// Table has been cached by an earlier request
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_' . $charset . '.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

	// Both the charset <-> UTF-8 table and the UTF-8 case folding table (the base table) are required
	if (!$this->initCharset($charset) || !$this->initUnicodeData('case')) {
		return FALSE;
	}

	$nochar = chr($this->noCharByteVal);
	$foldings = array('toUpper', 'toLower', 'toTitle');
	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
		// Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);

		foreach ($foldings as $folding) {
			$cc = $this->utf8_decode($this->caseFolding['utf-8'][$folding][$utf8], $charset);
			// Keep only mappings that have a representation in the charset
			if ($cc != '' && $cc != $nochar) {
				$this->caseFolding[$charset][$folding][$c] = $cc;
			}
		}
	}

	// Add the ASCII case table
	foreach (range(ord('a'), ord('z')) as $code) {
		$this->caseFolding[$charset]['toUpper'][chr($code)] = chr($code - 32);
	}
	foreach (range(ord('A'), ord('Z')) as $code) {
		$this->caseFolding[$charset]['toLower'][chr($code)] = chr($code + 32);
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->caseFolding[$charset]));
	}

	return 3;
}
1414
/**
 * Builds the to-ASCII conversion table ($this->toASCII[$charset]) for a charset other than UTF-8.
 * Every character of the charset's conversion table that has an entry in the UTF-8 to-ASCII
 * table is taken over, keyed by its representation in the charset.
 * This function is automatically called by the ASCII transliteration functions.
 *
 * @param string $charset Charset for which to initialize conversion.
 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initToASCII($charset) {
	// Table is in memory already
	if (is_array($this->toASCII[$charset])) {
		return 1;
	}

	// Table has been cached by an earlier request
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_' . $charset . '.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->toASCII[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

	// Both the charset <-> UTF-8 table and the UTF-8/ASCII transliteration table (the base table) are required
	if (!$this->initCharset($charset) || !$this->initUnicodeData('ascii')) {
		return FALSE;
	}

	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
		// Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);

		if (!isset($this->toASCII['utf-8'][$utf8])) {
			continue;
		}
		$this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->toASCII[$charset]));
	}

	return 3;
}
1462
1463 /********************************************
1464 *
1465 * String operation functions
1466 *
1467 ********************************************/
1468
/**
 * Returns a part of a string.
 * Depending on $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] the work is done by
 * mbstring or iconv; otherwise by the internal implementations for UTF-8, EUC based sets
 * and fixed width (two/four byte) sets. Everything else is treated as single-byte.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param integer $start Start position (character position)
 * @param integer $len Length (in characters). If NULL, the string is not limited in length.
 * @return string The substring
 * @see substr(), mb_substr()
 */
function substr($charset, $string, $start, $len = NULL) {
	if ($len === 0 || $string === '') {
		return '';
	}

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		// Cannot omit $len, when specifying charset
		if ($len == NULL) {
			// Save internal encoding
			$enc = mb_internal_encoding();
			mb_internal_encoding($charset);
			$str = mb_substr($string, $start);
			// Restore internal encoding
			mb_internal_encoding($enc);

			return $str;
		}
		return mb_substr($string, $start, $len, $charset);
	} elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
		// Cannot omit $len, when specifying charset
		if ($len == NULL) {
			// Save internal encoding
			$enc = iconv_get_encoding('internal_encoding');
			iconv_set_encoding('internal_encoding', $charset);
			$str = iconv_substr($string, $start);
			// Restore internal encoding
			iconv_set_encoding('internal_encoding', $enc);

			return $str;
		}
		return iconv_substr($string, $start, $len, $charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_substr($string, $start, $len);
	} elseif (!empty($this->eucBasedSets[$charset])) {
		return $this->euc_substr($string, $start, $charset, $len);
	} elseif (!empty($this->twoByteSets[$charset])) {
		// A missing length must not be multiplied: NULL * 2 is 0 and would yield an empty string
		return $len === NULL ? substr($string, $start * 2) : substr($string, $start * 2, $len * 2);
	} elseif (!empty($this->fourByteSets[$charset])) {
		return $len === NULL ? substr($string, $start * 4) : substr($string, $start * 4, $len * 4);
	}

	// Treat everything else as single-byte encoding
	return $len === NULL ? substr($string, $start) : substr($string, $start, $len);
}
1528
/**
 * Counts the number of characters.
 * mbstring or iconv are used if configured in $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];
 * otherwise UTF-8, EUC based and fixed width (two/four byte) sets are handled internally.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @return integer The number of characters
 * @see strlen()
 */
function strlen($charset, $string) {
	$utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];

	if ($utils == 'mbstring') {
		return mb_strlen($string, $charset);
	}
	if ($utils == 'iconv') {
		return iconv_strlen($string, $charset);
	}
	if ($charset == 'utf-8') {
		return $this->utf8_strlen($string);
	}
	if ($this->eucBasedSets[$charset]) {
		return $this->euc_strlen($string, $charset);
	}

	$byteLength = strlen($string);
	// Fixed width sets: every character takes the same number of bytes
	if ($this->twoByteSets[$charset]) {
		return $byteLength / 2;
	}
	if ($this->fourByteSets[$charset]) {
		return $byteLength / 4;
	}

	// Treat everything else as single-byte encoding
	return $byteLength;
}
1555
/**
 * Crops a string with the help of mb_strlen()/mb_substr().
 * A positive $len keeps the first $len characters and appends $crop, a negative $len keeps
 * the last abs($len) characters and prepends $crop. The string is returned unchanged if
 * $len is 0 or if it is not longer than abs($len) characters.
 *
 * @param string $charset The character set
 * @param string $string String to be cropped
 * @param integer $len Crop length (in characters)
 * @param string $crop Crop signifier
 * @return string The shortened string
 * @see mb_strlen(), mb_substr()
 */
protected function cropMbstring($charset, $string, $len, $crop = '') {
	if (intval($len) === 0) {
		return $string;
	}

	$charCount = mb_strlen($string, $charset);
	if ($charCount <= abs($len)) {
		// Short enough, nothing to crop
		return $string;
	}

	if ($len > 0) {
		return mb_substr($string, 0, $len, $charset) . $crop;
	}
	return $crop . mb_substr($string, $len, $charCount, $charset);
}
1579
/**
 * Truncates a string and pre-/appends a string.
 * A positive $len keeps the beginning of the string and appends $crop, a negative $len keeps
 * the end of the string and prepends $crop. The string is returned unchanged if $len is 0
 * or if the string is too short to be cropped.
 * Unit tested by Kasper
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param integer $len Length (in characters)
 * @param string $crop Crop signifier
 * @return string The shortened string
 * @see substr(), mb_strimwidth()
 */
function crop($charset, $string, $len, $crop = '') {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return $this->cropMbstring($charset, $string, $len, $crop);
	}

	if (intval($len) == 0) {
		return $string;
	}

	// Find the byte position of the cut
	if ($charset == 'utf-8') {
		$i = $this->utf8_char2byte_pos($string, $len);
	} elseif (!empty($this->eucBasedSets[$charset])) {
		$i = $this->euc_char2byte_pos($string, $len, $charset);
	} else {
		// Single-byte: character position equals byte position
		if ($len > 0) {
			$i = $len;
		} else {
			$i = strlen($string) + $len;
			if ($i <= 0) {
				$i = FALSE;
			}
		}
	}

	// $len outside actual string length
	if ($i === FALSE) {
		return $string;
	}

	$byteLength = strlen($string);
	if ($len > 0) {
		// Only crop if there is at least one byte at/after the cut position
		if ($i >= 0 && $i < $byteLength) {
			return substr($string, 0, $i) . $crop;
		}
	} else {
		// Only crop if there is at least one byte in front of the cut position
		if ($i >= 1 && $i <= $byteLength) {
			return $crop . substr($string, $i);
		}
	}
	return $string;
}
1632
/**
 * Cuts a string short at a given byte length.
 * For multibyte charsets the cut is moved so that no character is split.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param integer $len The byte length
 * @return string The shortened string (empty if $len is zero or negative)
 * @see mb_strcut()
 */
function strtrunc($charset, $string, $len) {
	if ($len <= 0) {
		return '';
	}

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strcut($string, 0, $len, $charset);
	}
	if ($charset == 'utf-8') {
		return $this->utf8_strtrunc($string, $len);
	}
	if ($this->eucBasedSets[$charset]) {
		return $this->euc_strtrunc($string, $len, $charset);
	}

	if ($this->twoByteSets[$charset]) {
		// Don't cut at odd positions
		$len -= $len % 2;
	} elseif ($this->fourByteSets[$charset]) {
		// Realign to position dividable by four
		$len -= $len % 4;
	}

	// Treat everything else as single-byte encoding
	return substr($string, 0, $len);
}
1665
/**
 * Translates all characters of a string into their respective case values.
 * Unlike strtolower() and strtoupper() this method is locale independent.
 * Note that the string length may change!
 * eg. lower case German "ß" (sharp S) becomes upper case "SS"
 * Unit-tested by Kasper
 * Real case folding is language dependent, this method ignores this fact.
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert case for
 * @param string $case Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
 * @return string The converted string
 * @see strtolower(), strtoupper()
 */
function conv_case($charset, $string, $case) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return $case == 'toLower'
			? mb_strtolower($string, $charset)
			: mb_strtoupper($string, $charset);
	}

	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string, 'case', $case);
	}

	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string, $charset, 'case', $case);
	}

	// Treat everything else as single-byte encoding
	return $this->sb_char_mapping($string, $charset, 'case', $case);
}
1698
/**
 * Equivalent of lcfirst/ucfirst but using character set:
 * only the first character of the string is case converted.
 *
 * @param string $charset Character set of string
 * @param string $string Input string
 * @param string $case Case keyword, handed over to conv_case()
 * @return string The string with its first character converted
 * @see t3lib_cs::conv_case()
 */
public function convCaseFirst($charset, $string, $case) {
	$convertedFirstChar = $this->conv_case(
		$charset,
		$this->substr($charset, $string, 0, 1),
		$case
	);
	return $convertedFirstChar . $this->substr($charset, $string, 1);
}
1714
/**
 * Converts special chars (like æøåÆØÅ, umlauts etc) to ascii equivalents (usually double-bytes, like æ => ae etc.)
 * The mapping is done by the "ascii" mode of the character mapping function matching the charset.
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert
 * @return string The converted string
 */
function specCharsToASCII($charset, $string) {
	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string, 'ascii');
	}

	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string, $charset, 'ascii');
	}

	// Treat everything else as single-byte encoding
	return $this->sb_char_mapping($string, $charset, 'ascii');
}
1734
/**
 * Converts the language codes that we get from the client (usually HTTP_ACCEPT_LANGUAGE)
 * into a TYPO3-readable language code
 *
 * The codes are tried in the order of their quality value (highest first); for every code
 * the full code (eg. "de-at") is tried first, then the code without its country part ("de").
 *
 * @param string $languageCodesList List of language codes. something like 'de,en-us;q=0.9,de-de;q=0.7,es-cl;q=0.6,en;q=0.4,es;q=0.3,zh;q=0.1'
 *				 see http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4
 * @return string A preferred language that TYPO3 supports, or "default" if none found
 */
public function getPreferredClientLanguage($languageCodesList) {
	$allLanguageCodes = array();
	$selectedLanguage = 'default';

	// Get all languages where TYPO3 code is the same as the ISO code
	foreach ($this->charSetArray as $typo3Lang => $charSet) {
		$allLanguageCodes[$typo3Lang] = $typo3Lang;
	}

	// Get all languages where TYPO3 code differs from ISO code
	// or needs the country part
	// the iso codes will here overwrite the default typo3 language in the key
	foreach ($this->locales->getIsoMapping() as $typo3Lang => $isoLang) {
		$isoLang = join('-', explode('_', $isoLang));
		$allLanguageCodes[$typo3Lang] = $isoLang;
	}

	// Move the iso codes to the key (because we're comparing the keys with "isset" later on)
	$allLanguageCodes = array_flip($allLanguageCodes);

	$preferredLanguages = t3lib_div::trimExplode(',', $languageCodesList);
	// Collect the preferred languages with their quality value
	$sortedPreferredLanguages = array();
	foreach ($preferredLanguages as $preferredLanguage) {
		// Split "code;q=0.9"; optional whitespace around ";" and "=" is tolerated
		$codeAndQuality = preg_split('/\s*;\s*q\s*=\s*/', $preferredLanguage, 2);
		$quality = isset($codeAndQuality[1]) ? $codeAndQuality[1] : 1.0;
		$sortedPreferredLanguages[$codeAndQuality[0]] = $quality;
	}

	// Loop through the languages, with the highest priority first
	arsort($sortedPreferredLanguages, SORT_NUMERIC);
	foreach ($sortedPreferredLanguages as $preferredLanguage => $quality) {
		if (isset($allLanguageCodes[$preferredLanguage])) {
			$selectedLanguage = $allLanguageCodes[$preferredLanguage];
			break;
		}

		// Strip the country code from the end (codes without country part stay as they are)
		$languageParts = explode('-', (string) $preferredLanguage, 2);
		$primaryLanguage = $languageParts[0];
		if (isset($allLanguageCodes[$primaryLanguage])) {
			$selectedLanguage = $allLanguageCodes[$primaryLanguage];
			break;
		}
	}
	if (!$selectedLanguage || $selectedLanguage == 'en') {
		$selectedLanguage = 'default';
	}
	return $selectedLanguage;
}
1794
1795 /********************************************
1796 *
1797 * Internal string operation functions
1798 *
1799 ********************************************/
1800
/**
 * Maps all characters of a string in a single byte charset.
 * Every byte that has an entry in the mapping table is replaced by that entry,
 * all other bytes are copied unchanged. The string is returned untouched if the
 * mode is unknown or the table for the charset cannot be initialized.
 *
 * @param string $str The string
 * @param string $charset The charset
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string
 */
function sb_char_mapping($str, $charset, $mode, $opt = '') {
	switch ($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) {
				// Do nothing
				return $str;
			}
			$map =& $this->caseFolding[$charset][$opt];
			break;

		case 'ascii':
			if (!$this->initToASCII($charset)) {
				// Do nothing
				return $str;
			}
			$map =& $this->toASCII[$charset];
			break;

		default:
			return $str;
	}

	$out = '';
	// Walk the bytes by length instead of reading past the end of the string
	$byteLength = strlen($str);
	for ($i = 0; $i < $byteLength; $i++) {
		$c = $str[$i];
		$out .= isset($map[$c]) ? $map[$c] : $c;
	}

	return $out;
}
1842
1843 /********************************************
1844 *
1845 * Internal UTF-8 string operation functions
1846 *
1847 ********************************************/
1848
/**
 * Returns a part of a UTF-8 string.
 * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
 *
 * @param string $str UTF-8 string
 * @param integer $start Start position (character position)
 * @param integer $len Length (in characters); if NULL the string is returned from $start to its end
 * @return string The substring; FALSE if a positive $start lies outside the string
 * @see substr()
 */
function utf8_substr($str, $start, $len = NULL) {
	// A length of zero (integer or string "0") always gives an empty string
	if ($len !== NULL && !strcmp($len, '0')) {
		return '';
	}

	$byte_start = $this->utf8_char2byte_pos($str, $start);
	if ($byte_start === FALSE) {
		if ($start > 0) {
			// $start outside string length
			return FALSE;
		}
		// NOTE(review): presumably a negative $start reaching in front of the string - begin at the first byte
		$byte_start = 0;
	}

	$str = substr($str, $byte_start);

	if ($len == NULL) {
		return $str;
	}

	$byte_end = $this->utf8_char2byte_pos($str, $len);
	if ($byte_end === FALSE) {
		// $len outside actual string length:
		// When length is less than zero and exceeds, then we return blank string.
		return $len < 0 ? '' : $str;
	}
	return substr($str, 0, $byte_end);
}
1888
/**
 * Counts the number of characters of a string in UTF-8.
 * Every byte that is not a continuation byte (10xxxxxx) counts as one character.
 * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
 *
 * @param string $str UTF-8 multibyte character string
 * @return integer The number of characters
 * @see strlen()
 */
function utf8_strlen($str) {
	$str = (string) $str;
	$n = 0;
	$byteLength = strlen($str);
	for ($i = 0; $i < $byteLength; $i++) {
		// Single-byte (0xxxxxxx) and multi-byte starting bytes (11xxxxxx) count,
		// continuation bytes (10xxxxxx) do not
		if ((ord($str[$i]) & 0xC0) != 0x80) {
			$n++;
		}
	}
	return $n;
}
1910
/**
 * Truncates a string in UTF-8 short at a given byte length.
 * If the cut would split a multibyte character, the whole character is dropped.
 *
 * @param string $str UTF-8 multibyte character string
 * @param integer $len The byte length
 * @return string The shortened string
 * @see mb_strcut()
 */
function utf8_strtrunc($str, $len) {
	$i = $len - 1;
	// Cut position outside of the string: no character can be split
	if ($i < 0 || $i >= strlen($str)) {
		return substr($str, 0, $len);
	}

	// Part of a multibyte sequence
	if (ord($str[$i]) & 0x80) {
		for (; $i > 0 && !(ord($str[$i]) & 0x40); $i--) {
			// find the first byte
		}
		// Sanity check: the string must not begin in the middle of a sequence
		if ($i <= 0 && (ord($str[0]) & 0xC0) != 0xC0) {
			return '';
		}
		for ($bc = 0, $mbs = ord($str[$i]); $mbs & 0x80; $mbs = $mbs << 1) {
			// Calculate number of bytes
			$bc++;
		}
		// The character does not fit completely: cut in front of it
		if ($bc + $i > $len) {
			return substr($str, 0, $i);
		}
		// Fallthru: multibyte char fits into length
	}
	return substr($str, 0, $len);
}
1940
/**
 * Finds the position of the first occurrence of a string; both arguments are in UTF-8.
 *
 * Delegates to mb_strpos() or iconv_strpos() when the 't3lib_cs_utils' setting
 * selects one of them, otherwise the position is computed on byte level and
 * translated back into a character position.
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 string to search for
 * @param integer $offset Position to start the search
 * @return integer The character position (FALSE if the offset is out of range or the needle was not found)
 * @see strpos()
 */
function utf8_strpos($haystack, $needle, $offset = 0) {
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
        return mb_strpos($haystack, $needle, $offset, 'utf-8');
    }
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
        return iconv_strpos($haystack, $needle, $offset, 'utf-8');
    }

    // Native fallback: search on bytes, report in characters.
    $startByte = $this->utf8_char2byte_pos($haystack, $offset);
    if ($startByte !== FALSE) {
        $foundByte = strpos($haystack, $needle, $startByte);
        if ($foundByte !== FALSE) {
            return $this->utf8_byte2char_pos($haystack, $foundByte);
        }
    }

    // Offset beyond string length, or needle not found
    return FALSE;
}
1971
/**
 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
 *
 * Delegates to mb_strrpos() or iconv_strrpos() when the 't3lib_cs_utils' setting
 * selects one of them, otherwise the byte position found by strrpos() is
 * translated into a character position.
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 character to search for (single character)
 * @return integer The character position (FALSE if the needle was not found)
 * @see strrpos()
 */
function utf8_strrpos($haystack, $needle) {
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
        // The third argument of mb_strrpos() is the offset; the encoding is the fourth.
        return mb_strrpos($haystack, $needle, 0, 'utf-8');
    } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
        // iconv_strrpos() has no offset argument, the encoding really is the third one.
        return iconv_strrpos($haystack, $needle, 'utf-8');
    }

    $byte_pos = strrpos($haystack, $needle);
    if ($byte_pos === FALSE) {
        // Needle not found
        return FALSE;
    }

    return $this->utf8_byte2char_pos($haystack, $byte_pos);
}
1995
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * For a positive position the string is scanned from the start, for a negative
 * one from the end. FALSE is returned when the scan leaves the string, which
 * includes a negative position that reaches exactly the first character.
 *
 * @param string $str UTF-8 string
 * @param integer $pos Character position (negative values start from the end)
 * @return integer Byte position (FALSE if the position lies outside the string)
 */
function utf8_char2byte_pos($str, $pos) {
    $str = (string) $str;
    $byteLength = strlen($str);
    // Number of characters found
    $n = 0;
    // Number of characters wanted
    $p = abs($pos);

    if ($pos >= 0) {
        $i = 0;
        $d = 1;
    } else {
        $i = $byteLength - 1;
        $d = -1;
    }

    // Explicit bounds on both sides: a negative index must end the scan and
    // must not be interpreted as an offset counted from the end of the string.
    for (; $i >= 0 && $i < $byteLength && $n < $p; $i += $d) {
        $c = ord($str[$i]);
        // Single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
        if (!($c & 0x80) || ($c & 0xC0) == 0xC0) {
            $n++;
        }
    }
    if ($i < 0 || $i >= $byteLength) {
        // Offset beyond string length
        return FALSE;
    }

    if ($pos >= 0) {
        // Skip trailing multi-byte data bytes (10xxxxxx)
        while ($i < $byteLength && (ord($str[$i]) & 0xC0) == 0x80) {
            $i++;
        }
    } else {
        // Correct offset: the backward scan stopped one byte in front of the character
        $i++;
    }

    return $i;
}
2044
/**
 * Translates an 'absolute' byte position into a character position.
 *
 * Counts the character starting bytes between byte 1 and the given byte
 * position (inclusive). For a position that addresses the first byte of a
 * character this is the index of that character.
 *
 * @param string $str UTF-8 string
 * @param integer $pos Byte position
 * @return integer Character position (FALSE if the byte position does not address a byte of the string)
 */
function utf8_byte2char_pos($str, $pos) {
    $str = (string) $str;
    $pos = (int) $pos;

    // Validate up front; this also covers the empty string.
    if ($pos < 0 || $pos >= strlen($str)) {
        // Offset beyond string length
        return FALSE;
    }

    // Number of characters
    $n = 0;
    for ($i = $pos; $i > 0; $i--) {
        $c = ord($str[$i]);
        // Single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
        if (!($c & 0x80) || ($c & 0xC0) == 0xC0) {
            $n++;
        }
    }

    return $n;
}
2072
/**
 * Maps all characters of an UTF-8 string.
 *
 * Each character is looked up in the table selected by $mode; characters
 * without an entry are copied unchanged. The string is returned untouched if
 * initUnicodeData() does not succeed or the mode is unknown.
 *
 * @param string $str UTF-8 string
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string
 */
function utf8_char_mapping($str, $mode, $opt = '') {
    if (!$this->initUnicodeData($mode)) {
        // Do nothing
        return $str;
    }

    switch ($mode) {
        case 'case':
            $map =& $this->caseFolding['utf-8'][$opt];
            break;

        case 'ascii':
            $map =& $this->toASCII['utf-8'];
            break;

        default:
            return $str;
    }

    $out = '';
    $byteLength = strlen($str);
    for ($i = 0; $i < $byteLength; $i++) {
        $c = ord($str[$i]);
        if (($c & 0xC0) == 0xC0) {
            // Multi-byte starting byte (11xxxxxx): the number of leading
            // 1-bits is the number of bytes of the sequence
            for ($bc = 0; $c & 0x80; $c = $c << 1) {
                $bc++;
            }
            $mbc = substr($str, $i, $bc);
            $i += $bc - 1;
        } else {
            // Single-byte (0xxxxxxx). A stray continuation byte (10xxxxxx) is
            // handled the same way, so it never re-emits the previous character.
            $mbc = $str[$i];
        }

        $out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
    }

    return $out;
}
2123
2124 /********************************************
2125 *
2126 * Internal EUC string operation functions
2127 *
2128 * Extended Unix Code:
2129 * ASCII compatible 7bit single bytes chars
2130 * 8bit two byte chars
2131 *
2132 * Shift-JIS is treated as a special case.
2133 *
2134 ********************************************/
2135
/**
 * Cuts a string in the EUC charset family short at a given byte length.
 *
 * The result never ends on the first byte of a double-byte character: if the
 * limit falls inside such a character, the character is dropped.
 *
 * @param string $str EUC multibyte character string
 * @param integer $len The byte length
 * @param string $charset The charset ('shift_jis' is scanned with its own lead byte ranges)
 * @return string The shortened string
 * @see mb_strcut()
 */
function euc_strtrunc($str, $len, $charset) {
    $byteLength = strlen($str);
    // String not longer than supplied length: nothing to cut. Deciding this
    // before the scan keeps a cut inside the last character from being
    // mistaken for "string is short enough".
    if ($byteLength === 0 || $byteLength <= $len) {
        return $str;
    }

    $sjis = ($charset == 'shift_jis');
    for ($i = 0; $i < $byteLength && $i < $len; $i++) {
        $c = ord($str[$i]);
        $isDoubleByte = $sjis
            ? (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))
            : ($c >= 0x80);
        if ($isDoubleByte) {
            // advance a double-byte char
            $i++;
        }
    }

    if ($i > $len) {
        // We ended on a first byte
        return substr($str, 0, $len - 1);
    }

    return substr($str, 0, $len);
}
2171
/**
 * Returns a part of a string in the EUC charset family.
 *
 * Handles the border cases the same way utf8_substr() does: a length of zero
 * gives an empty string, a start that cannot be resolved and is not positive
 * means "from the beginning", and a negative length that removes everything
 * gives an empty string.
 *
 * @param string $str EUC multibyte character string
 * @param integer $start Start position (character position)
 * @param string $charset The charset
 * @param integer $len Length (in characters)
 * @return string the substring (FALSE if a positive $start lies outside the string)
 */
function euc_substr($str, $start, $charset, $len = NULL) {
    // An explicit length of zero (0 or '0') always yields an empty string.
    if ($len !== NULL && (string) $len === '0') {
        return '';
    }

    $byte_start = $this->euc_char2byte_pos($str, $start, $charset);
    if ($byte_start === FALSE) {
        if ($start > 0) {
            // $start outside string length
            return FALSE;
        }
        // Not resolvable and not positive: start with the first byte
        $byte_start = 0;
    }

    $str = substr($str, $byte_start);

    // Loose comparison on purpose: NULL and '' both mean "no length given".
    if ($len == NULL) {
        return $str;
    }

    // The length is resolved relative to the already shortened string.
    $byte_end = $this->euc_char2byte_pos($str, $len, $charset);
    if ($byte_end === FALSE) {
        // $len outside actual string length
        return $len < 0 ? '' : $str;
    }

    return substr($str, 0, $byte_end);
}
2202
/**
 * Counts the number of characters of a string in the EUC charset family.
 *
 * A lead byte makes the scan skip the following byte, so a double-byte
 * character is counted once.
 *
 * @param string $str EUC multibyte character string
 * @param string $charset The charset ('shift_jis' is scanned with its own lead byte ranges)
 * @return integer The number of characters
 * @see strlen()
 */
function euc_strlen($str, $charset) {
    $str = (string) $str;
    $byteLength = strlen($str);
    $sjis = ($charset == 'shift_jis');
    $n = 0;
    // Iterate with an explicit bound instead of probing past the end of the string.
    for ($i = 0; $i < $byteLength; $i++) {
        $c = ord($str[$i]);
        $isDoubleByte = $sjis
            ? (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))
            : ($c >= 0x80);
        if ($isDoubleByte) {
            // advance a double-byte char
            $i++;
        }

        $n++;
    }

    return $n;
}
2231
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * The string is always scanned from its start, because a byte can only be
 * classified as lead or trail byte when the bytes in front of it are known.
 * FALSE is returned when the position lies outside the string, which includes
 * a negative position that reaches exactly the first character.
 *
 * @param string $str EUC multibyte character string
 * @param integer $pos Character position (negative values start from the end)
 * @param string $charset The charset ('shift_jis' is scanned with its own lead byte ranges)
 * @return integer Byte position (FALSE if the position lies outside the string)
 */
function euc_char2byte_pos($str, $pos, $charset) {
    $str = (string) $str;
    $pos = (int) $pos;
    $byteLength = strlen($str);
    $sjis = ($charset == 'shift_jis');

    if ($pos >= 0) {
        // Step over $pos characters and report where the next one starts
        $i = 0;
        for ($n = 0; $n < $pos && $i < $byteLength; $n++) {
            $c = ord($str[$i]);
            $isDoubleByte = $sjis
                ? (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))
                : ($c >= 0x80);
            $i += $isDoubleByte ? 2 : 1;
        }
        // offset beyond string length
        return $i < $byteLength ? $i : FALSE;
    }

    // Negative position: collect the byte offset of every character start,
    // then pick the wanted one counted from the end.
    $starts = array();
    for ($i = 0; $i < $byteLength;) {
        $starts[] = $i;
        $c = ord($str[$i]);
        $isDoubleByte = $sjis
            ? (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))
            : ($c >= 0x80);
        $i += $isDoubleByte ? 2 : 1;
    }

    $index = count($starts) + $pos;
    // An index of zero or below means the position reaches the start of the
    // string or lies in front of it; callers receive FALSE for both.
    return $index > 0 ? $starts[$index] : FALSE;
}
2279
/**
 * Maps all characters of a string in the EUC charset family.
 *
 * Each character is looked up in the table selected by $mode; characters
 * without an entry are copied unchanged. The string is returned untouched if
 * the table for the charset cannot be initialized or the mode is unknown.
 *
 * @param string $str EUC multibyte character string
 * @param string $charset The charset ('shift_jis' is scanned with its own lead byte ranges)
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string
 */
function euc_char_mapping($str, $charset, $mode, $opt = '') {
    switch ($mode) {
        case 'case':
            if (!$this->initCaseFolding($charset)) {
                // do nothing
                return $str;
            }
            $map =& $this->caseFolding[$charset][$opt];
            break;

        case 'ascii':
            if (!$this->initToASCII($charset)) {
                // do nothing
                return $str;
            }
            $map =& $this->toASCII[$charset];
            break;

        default:
            return $str;
    }

    $sjis = ($charset == 'shift_jis');
    $out = '';
    $byteLength = strlen($str);
    // Iterate with an explicit bound instead of probing past the end of the string.
    for ($i = 0; $i < $byteLength; $i++) {
        $mbc = $str[$i];
        $c = ord($mbc);

        $isDoubleByte = $sjis
            ? (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))
            : ($c >= 0x80);
        if ($isDoubleByte) {
            // A double-byte char: look up both bytes together
            $mbc = substr($str, $i, 2);
            $i++;
        }

        $out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
    }

    return $out;
}
2338 }
2339
2340 ?>