e7a1a114142bfde0c3e3e567e340b58c07b71528
[Packages/TYPO3.CMS.git] / typo3 / sysext / core / Classes / Charset / CharsetConverter.php
1 <?php
2 namespace TYPO3\CMS\Core\Charset;
3
4 /***************************************************************
5 * Copyright notice
6 *
7 * (c) 2003-2011 Kasper Skårhøj (kasperYYYY@typo3.com)
8 * All rights reserved
9 *
10 * This script is part of the Typo3 project. The Typo3 project is
11 * free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * The GNU General Public License can be found at
17 * http://www.gnu.org/copyleft/gpl.html.
18 *
19 * This script is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 * GNU General Public License for more details.
23 *
24 * This copyright notice MUST APPEAR in all copies of the script!
25 ***************************************************************/
26 /**
27 * Class for conversion between charsets.
28 *
29 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
30 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
31 */
32 /**
33 * Notes on UTF-8
34 *
35 * Functions working on UTF-8 strings:
36 *
37 * - strchr/strstr
38 * - strrchr
39 * - substr_count
40 * - implode/explode/join
41 *
42 * Functions nearly working on UTF-8 strings:
43 *
44 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
45 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
46 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
47 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
48 * - preg_*: UTF-8 support is compiled into PHP by default nowadays, but could be unavailable; patterns need the "u" modifier to be treated as UTF-8
49 *
50 * Functions NOT working on UTF-8 strings:
51 *
52 * - str*cmp
53 * - stristr
54 * - stripos
55 * - substr
56 * - strrev
57 * - split/spliti
58 * - ...
59 */
60 /**
61 * Class for conversion between charsets
62 *
63 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
64 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
65 * @package TYPO3
66 * @subpackage t3lib
67 */
68 class CharsetConverter {
69
/**
 * Locales service instance; assigned in the constructor.
 *
 * @var \TYPO3\CMS\Core\Localization\Locales
 */
protected $locales;

/**
 * ASCII value (63 = "?") of the byte emitted for chars with no equivalent.
 *
 * @todo Define visibility
 */
public $noCharByteVal = 63;

/**
 * This is the array where parsed conversion tables are stored (cached).
 * The conversion methods read it as [charset]['local'][number] and [charset]['utf8'][UTF-8 sequence].
 *
 * @todo Define visibility
 */
public $parsedCharsets = array();

/**
 * An array where case folding data will be stored (cached).
 *
 * @todo Define visibility
 */
public $caseFolding = array();

/**
 * An array where charset-to-ASCII mappings are stored (cached).
 *
 * @todo Define visibility
 */
public $toASCII = array();

/**
 * This tells the converter which charsets have two bytes per char.
 * Only the keys matter; lookups are done with isset().
 *
 * @todo Define visibility
 */
public $twoByteSets = array(
	'ucs-2' => 1
);

/**
 * This tells the converter which charsets have four bytes per char.
 *
 * @todo Define visibility
 */
public $fourByteSets = array(
	'ucs-4' => 1, // 4-byte Unicode
	'utf-32' => 1
);

/**
 * This tells the converter which charsets use a scheme like the Extended Unix Code
 * (a byte above 127 starts a two-byte character).
 *
 * @todo Define visibility
 */
public $eucBasedSets = array(
	'gb2312' => 1, // Chinese, simplified.
	'big5' => 1, // Chinese, traditional.
	'euc-kr' => 1, // Korean
	'shift_jis' => 1
);
130
/**
 * Charset aliases mapped to the canonical (lowercase) charset name used by this class.
 * Looked up by parse_charset() after the input has been lowercased and trimmed.
 *
 * See http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
 * and http://czyborra.com/charsets/iso8859.html
 *
 * @todo Define visibility
 */
public $synonyms = array(
	'us' => 'ascii',
	'us-ascii' => 'ascii',
	'cp819' => 'iso-8859-1',
	'ibm819' => 'iso-8859-1',
	'iso-ir-100' => 'iso-8859-1',
	'iso-ir-101' => 'iso-8859-2',
	'iso-ir-109' => 'iso-8859-3',
	'iso-ir-110' => 'iso-8859-4',
	'iso-ir-144' => 'iso-8859-5',
	'iso-ir-127' => 'iso-8859-6',
	'iso-ir-126' => 'iso-8859-7',
	'iso-ir-138' => 'iso-8859-8',
	'iso-ir-148' => 'iso-8859-9',
	'iso-ir-157' => 'iso-8859-10',
	'iso-ir-179' => 'iso-8859-13',
	'iso-ir-199' => 'iso-8859-14',
	'iso-ir-203' => 'iso-8859-15',
	'csisolatin1' => 'iso-8859-1',
	'csisolatin2' => 'iso-8859-2',
	'csisolatin3' => 'iso-8859-3',
	'csisolatin5' => 'iso-8859-9',
	'csisolatin8' => 'iso-8859-14',
	'csisolatin9' => 'iso-8859-15',
	'csisolatingreek' => 'iso-8859-7',
	'iso-celtic' => 'iso-8859-14',
	'latin1' => 'iso-8859-1',
	'latin2' => 'iso-8859-2',
	'latin3' => 'iso-8859-3',
	'latin5' => 'iso-8859-9',
	'latin6' => 'iso-8859-10',
	'latin8' => 'iso-8859-14',
	'latin9' => 'iso-8859-15',
	'l1' => 'iso-8859-1',
	'l2' => 'iso-8859-2',
	'l3' => 'iso-8859-3',
	'l5' => 'iso-8859-9',
	'l6' => 'iso-8859-10',
	'l8' => 'iso-8859-14',
	'l9' => 'iso-8859-15',
	'cyrillic' => 'iso-8859-5',
	'arabic' => 'iso-8859-6',
	'tis-620' => 'iso-8859-11',
	'win874' => 'windows-874',
	'win1250' => 'windows-1250',
	'win1251' => 'windows-1251',
	'win1252' => 'windows-1252',
	'win1253' => 'windows-1253',
	'win1254' => 'windows-1254',
	'win1255' => 'windows-1255',
	'win1256' => 'windows-1256',
	'win1257' => 'windows-1257',
	'win1258' => 'windows-1258',
	'cp1250' => 'windows-1250',
	'cp1251' => 'windows-1251',
	'cp1252' => 'windows-1252',
	'ms-ee' => 'windows-1250',
	'ms-ansi' => 'windows-1252',
	'ms-greek' => 'windows-1253',
	'ms-turk' => 'windows-1254',
	'winbaltrim' => 'windows-1257',
	'koi-8ru' => 'koi-8r',
	'koi8r' => 'koi-8r',
	'cp878' => 'koi-8r',
	'mac' => 'macroman',
	'macintosh' => 'macroman',
	'euc-cn' => 'gb2312',
	'x-euc-cn' => 'gb2312',
	'euccn' => 'gb2312',
	'cp936' => 'gb2312',
	'big-5' => 'big5',
	'cp950' => 'big5',
	'eucjp' => 'euc-jp',
	'sjis' => 'shift_jis',
	'shift-jis' => 'shift_jis',
	'cp932' => 'shift_jis',
	'cp949' => 'euc-kr',
	'utf7' => 'utf-7',
	// 'utf8' used to be listed twice with the same value; the duplicate key was removed.
	'utf8' => 'utf-8',
	'utf16' => 'utf-16',
	'utf32' => 'utf-32',
	'ucs2' => 'ucs-2',
	'ucs4' => 'ucs-4'
);
221
/**
 * Mapping of language identifiers to script names.
 *
 * Three kinds of (lowercase) keys are listed: iso-639-1 language codes, MS language codes and
 * English language names. The script names are the keys of $script_to_charset_unix and
 * $script_to_charset_windows; get_locale_charset() chains the two lookups.
 *
 * @todo Define visibility
 */
public $lang_to_script = array(
	// iso-639-1 language codes, see http://www.loc.gov/standards/iso639-2/php/code_list.php
	'af' => 'west_european', // Afrikaans
	'ar' => 'arabic',
	'bg' => 'cyrillic', // Bulgarian
	'bs' => 'east_european', // Bosnian
	'cs' => 'east_european', // Czech
	'da' => 'west_european', // Danish
	'de' => 'west_european', // German
	'es' => 'west_european', // Spanish
	'et' => 'estonian',
	'eo' => 'unicode', // Esperanto
	'eu' => 'west_european', // Basque
	'fa' => 'arabic', // Persian
	'fi' => 'west_european', // Finnish
	'fo' => 'west_european', // Faroese
	'fr' => 'west_european', // French
	'ga' => 'west_european', // Irish
	'gl' => 'west_european', // Galician
	'gr' => 'greek',
	'he' => 'hebrew', // Hebrew (since 1998)
	'hi' => 'unicode', // Hindi
	'hr' => 'east_european', // Croatian
	'hu' => 'east_european', // Hungarian
	'iw' => 'hebrew', // Hebrew (til 1998)
	'is' => 'west_european', // Icelandic
	'it' => 'west_european', // Italian
	'ja' => 'japanese',
	'ka' => 'unicode', // Georgian
	'kl' => 'west_european', // Greenlandic
	'km' => 'unicode', // Khmer
	'ko' => 'korean',
	'lt' => 'lithuanian',
	'lv' => 'west_european', // Latvian/Lettish
	'nl' => 'west_european', // Dutch
	'no' => 'west_european', // Norwegian
	'nb' => 'west_european', // Norwegian Bokmal
	'nn' => 'west_european', // Norwegian Nynorsk
	'pl' => 'east_european', // Polish
	'pt' => 'west_european', // Portuguese
	'ro' => 'east_european', // Romanian
	'ru' => 'cyrillic', // Russian
	'sk' => 'east_european', // Slovak
	'sl' => 'east_european', // Slovenian
	'sr' => 'cyrillic', // Serbian
	'sv' => 'west_european', // Swedish
	'sq' => 'albanian', // Albanian
	'th' => 'thai',
	'uk' => 'cyrillic', // Ukrainian
	'vi' => 'vietnamese',
	'zh' => 'chinese',
	// MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
	// http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
	'afk' => 'west_european', // Afrikaans
	'ara' => 'arabic',
	'bgr' => 'cyrillic', // Bulgarian
	'cat' => 'west_european', // Catalan
	'chs' => 'simpl_chinese',
	'cht' => 'trad_chinese',
	'csy' => 'east_european', // Czech
	'dan' => 'west_european', // Danish
	'deu' => 'west_european', // German
	'dea' => 'west_european', // German (Austrian)
	'des' => 'west_european', // German (Swiss)
	'ena' => 'west_european', // English (Australian)
	'enc' => 'west_european', // English (Canadian)
	'eng' => 'west_european', // English
	'enz' => 'west_european', // English (New Zealand)
	'enu' => 'west_european', // English (United States)
	'euq' => 'west_european', // Basque
	'fos' => 'west_european', // Faroese
	'far' => 'arabic', // Persian
	'fin' => 'west_european', // Finnish
	'fra' => 'west_european', // French
	'frb' => 'west_european', // French (Belgian)
	'frc' => 'west_european', // French (Canadian)
	'frs' => 'west_european', // French (Swiss)
	'geo' => 'unicode', // Georgian
	'glg' => 'west_european', // Galician
	'ell' => 'greek',
	'heb' => 'hebrew',
	'hin' => 'unicode', // Hindi
	'hun' => 'east_european', // Hungarian
	'isl' => 'west_european', // Icelandic
	'ita' => 'west_european', // Italian
	'its' => 'west_european', // Italian (Swiss)
	'jpn' => 'japanese',
	'khm' => 'unicode', // Khmer
	'kor' => 'korean',
	'lth' => 'lithuanian',
	'lvi' => 'west_european', // Latvian/Lettish
	'msl' => 'west_european', // Malay
	'nlb' => 'west_european', // Dutch (Belgian)
	'nld' => 'west_european', // Dutch
	'nor' => 'west_european', // Norwegian (bokmal)
	'non' => 'west_european', // Norwegian (nynorsk)
	'plk' => 'east_european', // Polish
	'ptg' => 'west_european', // Portuguese
	'ptb' => 'west_european', // Portuguese (Brazil)
	'rom' => 'east_european', // Romanian
	'rus' => 'cyrillic', // Russian
	'slv' => 'east_european', // Slovenian
	'sky' => 'east_european', // Slovak
	'srl' => 'east_european', // Serbian (Latin)
	'srb' => 'cyrillic', // Serbian (Cyrillic)
	'esp' => 'west_european', // Spanish (trad. sort)
	'esm' => 'west_european', // Spanish (Mexican)
	'esn' => 'west_european', // Spanish (internat. sort)
	'sve' => 'west_european', // Swedish
	'sqi' => 'albanian', // Albanian
	'tha' => 'thai',
	'trk' => 'turkish',
	'ukr' => 'cyrillic', // Ukrainian
	// English language names
	'afrikaans' => 'west_european',
	'albanian' => 'albanian',
	'arabic' => 'arabic',
	'basque' => 'west_european',
	'bosnian' => 'east_european',
	// Bulgarian is written in Cyrillic; this now agrees with the 'bg' and 'bgr' entries above.
	'bulgarian' => 'cyrillic',
	'catalan' => 'west_european',
	'croatian' => 'east_european',
	'czech' => 'east_european',
	'danish' => 'west_european',
	'dutch' => 'west_european',
	'english' => 'west_european',
	'esperanto' => 'unicode',
	'estonian' => 'estonian',
	'faroese' => 'west_european',
	'farsi' => 'arabic',
	'finnish' => 'west_european',
	'french' => 'west_european',
	'galician' => 'west_european',
	'georgian' => 'unicode',
	'german' => 'west_european',
	'greek' => 'greek',
	'greenlandic' => 'west_european',
	'hebrew' => 'hebrew',
	'hindi' => 'unicode',
	'hungarian' => 'east_european',
	'icelandic' => 'west_european',
	'italian' => 'west_european',
	'khmer' => 'unicode',
	'latvian' => 'west_european',
	'lettish' => 'west_european',
	'lithuanian' => 'lithuanian',
	'malay' => 'west_european',
	'norwegian' => 'west_european',
	'persian' => 'arabic',
	'polish' => 'east_european',
	'portuguese' => 'west_european',
	'russian' => 'cyrillic',
	'romanian' => 'east_european',
	'serbian' => 'cyrillic',
	'slovak' => 'east_european',
	'slovenian' => 'east_european',
	'spanish' => 'west_european',
	'swedish' => 'west_european',
	// Misspelled legacy key, kept so that existing lookups keep working.
	'svedish' => 'west_european',
	'thai' => 'thai',
	// Misspelled legacy key, kept so that existing lookups keep working.
	'that' => 'thai',
	'turkish' => 'turkish',
	'ukrainian' => 'cyrillic'
);
478
/**
 * Mapping of language (family) names to charsets on Unix.
 * get_locale_charset() reads this table when TYPO3_OS is not 'WIN'; an empty or missing
 * entry makes that method fall back to 'utf-8'.
 *
 * @todo Define visibility
 */
public $script_to_charset_unix = array(
	'west_european' => 'iso-8859-1',
	'estonian' => 'iso-8859-1',
	'east_european' => 'iso-8859-2',
	'baltic' => 'iso-8859-4',
	'cyrillic' => 'iso-8859-5',
	'arabic' => 'iso-8859-6',
	'greek' => 'iso-8859-7',
	'hebrew' => 'iso-8859-8',
	'turkish' => 'iso-8859-9',
	'thai' => 'iso-8859-11', // = TIS-620
	'lithuanian' => 'iso-8859-13',
	'chinese' => 'gb2312', // = euc-cn
	'japanese' => 'euc-jp',
	'korean' => 'euc-kr',
	'simpl_chinese' => 'gb2312',
	'trad_chinese' => 'big5',
	'vietnamese' => '',
	'unicode' => 'utf-8',
	'albanian' => 'utf-8'
);

/**
 * Mapping of language (family) names to charsets on Windows.
 * get_locale_charset() reads this table when TYPO3_OS is 'WIN'; an empty or missing
 * entry makes that method fall back to 'windows-1252'.
 *
 * @todo Define visibility
 */
public $script_to_charset_windows = array(
	'east_european' => 'windows-1250',
	'cyrillic' => 'windows-1251',
	'west_european' => 'windows-1252',
	'greek' => 'windows-1253',
	'turkish' => 'windows-1254',
	'hebrew' => 'windows-1255',
	'arabic' => 'windows-1256',
	'baltic' => 'windows-1257',
	'estonian' => 'windows-1257',
	'lithuanian' => 'windows-1257',
	'vietnamese' => 'windows-1258',
	'thai' => 'cp874',
	'korean' => 'cp949',
	'chinese' => 'gb2312',
	'japanese' => 'shift_jis',
	'simpl_chinese' => 'gb2312',
	'trad_chinese' => 'big5',
	'albanian' => 'windows-1250',
	'unicode' => 'utf-8'
);
532
/**
 * Mapping of locale names to charsets.
 *
 * get_locale_charset() lowercases the locale before looking it up here, so keys have to be
 * lowercase to be reachable.
 *
 * @todo Define visibility
 */
public $locale_to_charset = array(
	'japanese.euc' => 'euc-jp',
	'ja_jp.ujis' => 'euc-jp',
	'korean.euc' => 'euc-kr',
	// Lowercase variant that actually matches the lowercased lookup key
	'sr@latn' => 'iso-8859-2',
	// Original mixed-case key, kept for code that reads this array directly
	'sr@Latn' => 'iso-8859-2',
	'zh_cn' => 'gb2312',
	'zh_hk' => 'big5',
	'zh_tw' => 'big5'
);
546
/**
 * TYPO3 specific: Array with the system charsets used for each system language in TYPO3.
 * Keys are TYPO3 language keys, values are charset names.
 * Empty values means "iso-8859-1".
 *
 * @todo Define visibility
 */
public $charSetArray = array(
	'af' => '',
	'ar' => 'iso-8859-6',
	'ba' => 'iso-8859-2',
	'bg' => 'windows-1251',
	'br' => '',
	'ca' => 'iso-8859-15',
	'ch' => 'gb2312',
	'cs' => 'windows-1250',
	'cz' => 'windows-1250',
	'da' => '',
	'de' => '',
	'dk' => '',
	'el' => 'iso-8859-7',
	'eo' => 'utf-8',
	'es' => '',
	'et' => 'iso-8859-4',
	'eu' => '',
	'fa' => 'utf-8',
	'fi' => '',
	'fo' => 'utf-8',
	'fr' => '',
	'fr_CA' => '',
	'ga' => '',
	'ge' => 'utf-8',
	'gl' => '',
	'gr' => 'iso-8859-7',
	'he' => 'utf-8',
	'hi' => 'utf-8',
	'hk' => 'big5',
	'hr' => 'windows-1250',
	'hu' => 'iso-8859-2',
	'is' => 'utf-8',
	'it' => '',
	'ja' => 'shift_jis',
	'jp' => 'shift_jis',
	'ka' => 'utf-8',
	'kl' => 'utf-8',
	'km' => 'utf-8',
	'ko' => 'euc-kr',
	'kr' => 'euc-kr',
	'lt' => 'windows-1257',
	'lv' => 'utf-8',
	'ms' => '',
	'my' => '',
	'nl' => '',
	'no' => '',
	'pl' => 'iso-8859-2',
	'pt' => '',
	'pt_BR' => '',
	'qc' => '',
	'ro' => 'iso-8859-2',
	'ru' => 'windows-1251',
	'se' => '',
	'si' => 'windows-1250',
	'sk' => 'windows-1250',
	'sl' => 'windows-1250',
	'sq' => 'utf-8',
	'sr' => 'utf-8',
	'sv' => '',
	'th' => 'iso-8859-11',
	'tr' => 'iso-8859-9',
	'ua' => 'windows-1251',
	'uk' => 'windows-1251',
	'vi' => 'utf-8',
	'vn' => 'utf-8',
	'zh' => 'big5'
);
620
/**
 * Default constructor: obtains the Locales service through GeneralUtility::makeInstance()
 * and stores it in $this->locales.
 */
public function __construct() {
	$localesClassName = 'TYPO3\\CMS\\Core\\Localization\\Locales';
	$this->locales = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance($localesClassName);
}
627
/**
 * Normalizes a charset name: lowercases it, trims surrounding whitespace and
 * replaces known aliases (see $this->synonyms) by their canonical name.
 *
 * @param string $charset Input charset
 * @return string Normalized charset
 * @todo Define visibility
 */
public function parse_charset($charset) {
	$normalized = trim(strtolower($charset));
	return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
}
642
/**
 * Get the charset of a locale.
 *
 * ln language
 * ln_CN language / country
 * ln_CN.cs language / country / charset
 * ln_CN.cs@mod language / country / charset / modifier
 *
 * Resolution order: exact match in $this->locale_to_charset (case-insensitive), explicit
 * charset part of the locale, the "euro" modifier, and finally the language looked up via
 * $this->lang_to_script and the OS specific script-to-charset table. Unknown languages fall
 * back to 'windows-1252' on Windows and 'utf-8' elsewhere.
 *
 * @param string $locale Locale string
 * @return string Charset resolved for locale string
 * @todo Define visibility
 */
public function get_locale_charset($locale) {
	$locale = strtolower($locale);
	// Exact locale specific charset? The locale was lowercased above, so the table keys are
	// lowercased as well; otherwise mixed-case keys such as 'sr@Latn' could never match.
	$exactMatches = array_change_key_case($this->locale_to_charset, CASE_LOWER);
	if (isset($exactMatches[$locale])) {
		return $exactMatches[$locale];
	}
	// Get modifier; array_pad() supplies an empty value when the separator is absent
	list($locale, $modifier) = array_pad(explode('@', $locale), 2, '');
	// Locale contains charset: use it
	list($locale, $charset) = array_pad(explode('.', $locale), 2, '');
	if ($charset) {
		return $this->parse_charset($charset);
	}
	// Modifier is 'euro' (after charset check, because of xx.utf-8@euro)
	if ($modifier == 'euro') {
		return 'iso-8859-15';
	}
	// Get language; the country part is not needed
	list($language) = explode('_', $locale);
	// Unknown languages get an empty script name, which selects the OS default below
	$script = isset($this->lang_to_script[$language]) ? $this->lang_to_script[$language] : '';
	if (TYPO3_OS == 'WIN') {
		return !empty($this->script_to_charset_windows[$script]) ? $this->script_to_charset_windows[$script] : 'windows-1252';
	}
	return !empty($this->script_to_charset_unix[$script]) ? $this->script_to_charset_unix[$script] : 'utf-8';
}
684
685 /********************************************
686 *
687 * Charset Conversion functions
688 *
689 ********************************************/
/**
 * Convert a string from one charset to another charset.
 *
 * When the target is UTF-8 or no entity fallback is requested, the conversion library named
 * in $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'] ('mbstring', 'iconv' or
 * 'recode') is tried first. If that is not configured or returns FALSE, the string is
 * converted with the class' own tables, using UTF-8 as the intermediate charset.
 *
 * @param string $str Input string
 * @param string $fromCS From charset (the current charset of the string)
 * @param string $toCS To charset (the output charset wanted)
 * @param boolean $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Converted string
 * @see convArray()
 * @todo Define visibility
 */
public function conv($str, $fromCS, $toCS, $useEntityForNoChar = 0) {
	// Nothing to do for identical charsets
	if ($fromCS == $toCS) {
		return $str;
	}
	// PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
	$mayUsePhpLibrary = $toCS == 'utf-8' || !$useEntityForNoChar;
	if ($mayUsePhpLibrary) {
		$method = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'];
		// Stays FALSE when no library is configured; the libraries also return FALSE on failure
		$converted = FALSE;
		if ($method == 'mbstring') {
			$converted = mb_convert_encoding($str, $toCS, $fromCS);
		} elseif ($method == 'iconv') {
			$converted = iconv($fromCS, $toCS . '//TRANSLIT', $str);
		} elseif ($method == 'recode') {
			$converted = recode_string($fromCS . '..' . $toCS, $str);
		}
		if ($converted !== FALSE) {
			return $converted;
		}
	}
	// Table based conversion: source charset -> UTF-8 -> target charset
	$utf8 = $fromCS != 'utf-8' ? $this->utf8_encode($str, $fromCS) : $str;
	return $toCS != 'utf-8' ? $this->utf8_decode($utf8, $toCS, $useEntityForNoChar) : $utf8;
}
737
/**
 * Convert all string elements of an array from one charset to another charset.
 * Nested arrays are processed recursively; values of any other type are left untouched.
 * NOTICE: Array is passed by reference and modified in place!
 *
 * @param array $array Input array, possibly multidimensional
 * @param string $fromCS From charset (the current charset of the string)
 * @param string $toCS To charset (the output charset wanted)
 * @param boolean $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return void
 * @see conv()
 * @todo Define visibility
 */
public function convArray(&$array, $fromCS, $toCS, $useEntityForNoChar = 0) {
	foreach ($array as $key => $value) {
		if (is_string($value)) {
			$array[$key] = $this->conv($value, $fromCS, $toCS, $useEntityForNoChar);
			continue;
		}
		if (is_array($value)) {
			// Recurse on the original element (not the copy) so the change is kept
			$this->convArray($array[$key], $fromCS, $toCS, $useEntityForNoChar);
		}
	}
}
759
/**
 * Converts $str from $charset to UTF-8
 *
 * Bytes up to 127 are copied unchanged (except for two-byte charsets); everything else is
 * translated through the parsed conversion table of the charset. Characters missing from
 * the table are replaced by chr($this->noCharByteVal).
 *
 * @param string $str String in local charset to convert to UTF-8
 * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return string|NULL Output string, converted to UTF-8; NULL if initCharset() reports that the charset could not be initialized
 * @todo Define visibility
 */
public function utf8_encode($str, $charset) {
	if ($charset === 'utf-8') {
		return $str;
	}
	// Charset is case-insensitive
	// Parse conv. table if not already
	if (!$this->initCharset($charset)) {
		return NULL;
	}
	// These lookups do not depend on the position in the string
	$isTwoByteSet = isset($this->twoByteSets[$charset]);
	$isEucBasedSet = isset($this->eucBasedSets[$charset]);
	$strLen = strlen($str);
	$outStr = '';
	// Traverse each char in string
	for ($a = 0; $a < $strLen; $a++) {
		$chr = substr($str, $a, 1);
		$ord = ord($chr);
		if ($isTwoByteSet) {
			// The charset has two bytes per char, assume big endian.
			// A dangling last byte (odd string length) is combined with a zero low byte
			// instead of reading beyond the end of the string.
			$a++;
			$ord2 = $a < $strLen ? ord(substr($str, $a, 1)) : 0;
			$ord = $ord << 8 | $ord2;
		} elseif ($ord > 127) {
			// EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
			// Shift-JIS: chars between 160 and 223 are single byte
			if ($isEucBasedSet && ($charset != 'shift_jis' || $ord < 160 || $ord > 223)) {
				$a++;
				$ord2 = $a < $strLen ? ord(substr($str, $a, 1)) : 0;
				$ord = $ord * 256 + $ord2;
			}
		} else {
			// 7-bit ASCII is identical in UTF-8
			$outStr .= $chr;
			continue;
		}
		// If the local char-number was found in parsed conv. table then we use that, otherwise the "no char" byte
		if (isset($this->parsedCharsets[$charset]['local'][$ord])) {
			$outStr .= $this->parsedCharsets[$charset]['local'][$ord];
		} else {
			$outStr .= chr($this->noCharByteVal);
		}
	}
	return $outStr;
}
818
/**
 * Converts $str from UTF-8 to $charset
 *
 * ASCII bytes are copied unchanged. Each multibyte UTF-8 sequence is looked up in the parsed
 * conversion table of the charset; sequences without a mapping become either a numeric
 * entity or chr($this->noCharByteVal). A byte above 127 that cannot start a sequence is
 * also replaced by chr($this->noCharByteVal).
 *
 * @param string $str String in UTF-8 to convert to local charset
 * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param boolean $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string|NULL Output string, converted to local charset; NULL if initCharset() reports that the charset could not be initialized
 * @todo Define visibility
 */
public function utf8_decode($str, $charset, $useEntityForNoChar = 0) {
	if ($charset === 'utf-8') {
		return $str;
	}
	// Charset is case-insensitive.
	// Parse conv. table if not already
	if (!$this->initCharset($charset)) {
		return NULL;
	}
	$length = strlen($str);
	$result = '';
	// Traverse each byte of the UTF-8 string
	for ($pos = 0; $pos < $length; $pos++) {
		$byte = substr($str, $pos, 1);
		$code = ord($byte);
		// Plain ASCII
		if ($code <= 127) {
			$result .= $byte;
			continue;
		}
		// A first byte must have the 7th bit set. Otherwise we are in the middle of a byte sequence.
		if (!($code & 64)) {
			$result .= chr($this->noCharByteVal);
			continue;
		}
		// Collect the sequence: every further leading 1-bit of the first byte announces one more byte
		$sequence = $byte;
		for ($n = 0; $n < 8; $n++) {
			$code = $code << 1;
			if (!($code & 128)) {
				break;
			}
			$pos++;
			$sequence .= substr($str, $pos, 1);
		}
		if (isset($this->parsedCharsets[$charset]['utf8'][$sequence])) {
			// The local number
			$localNumber = $this->parsedCharsets[$charset]['utf8'][$sequence];
			if ($localNumber > 255) {
				// Greater than 255: split the number (16bit word assumed) in two chars.
				$result .= chr(($localNumber >> 8 & 255)) . chr(($localNumber & 255));
			} else {
				$result .= chr($localNumber);
			}
		} elseif ($useEntityForNoChar) {
			// Create num entity:
			$result .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
		} else {
			$result .= chr($this->noCharByteVal);
		}
	}
	return $result;
}
887
/**
 * Converts all chars > 127 to numeric entities.
 *
 * Each multibyte UTF-8 sequence becomes a hexadecimal entity (&#x...;). A byte above 127
 * that cannot start a sequence is replaced by chr($this->noCharByteVal).
 *
 * @param string $str Input string
 * @return string Output string
 * @todo Define visibility
 */
public function utf8_to_entities($str) {
	$length = strlen($str);
	$result = '';
	// Traverse each byte of the UTF-8 string.
	for ($pos = 0; $pos < $length; $pos++) {
		$byte = substr($str, $pos, 1);
		$code = ord($byte);
		// Plain ASCII
		if ($code <= 127) {
			$result .= $byte;
			continue;
		}
		// A first byte must have the 7th bit set. Otherwise we are in the middle of a byte sequence.
		if (!($code & 64)) {
			$result .= chr($this->noCharByteVal);
			continue;
		}
		// Collect the sequence: every further leading 1-bit of the first byte announces one more byte
		$sequence = $byte;
		for ($n = 0; $n < 8; $n++) {
			$code = $code << 1;
			if (!($code & 128)) {
				break;
			}
			$pos++;
			$sequence .= substr($str, $pos, 1);
		}
		$result .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
	}
	return $result;
}
932
/**
 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
 *
 * Only well-formed numeric entities are converted: "&#" followed by decimal digits, or "&#x" /
 * "&#X" followed by hexadecimal digits. Anything else that merely looks like an entity
 * (e.g. "&#;" or "&#12ab;") is left in the string unchanged.
 *
 * @param string $str Input string, UTF-8
 * @param boolean $alsoStdHtmlEnt If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
 * @return string Output string
 * @todo Define visibility
 */
public function entities_to_utf8($str, $alsoStdHtmlEnt = FALSE) {
	$trans_tbl = array();
	if ($alsoStdHtmlEnt) {
		$trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8'));
	}
	// Split at entities and keep the entity body (the text between "&" and ";") as
	// every second element of the result.
	$parts = preg_split('/&([#[:alnum:]]*);/', $str, -1, PREG_SPLIT_DELIM_CAPTURE);
	foreach ($parts as $k => $v) {
		// Only take every second element
		if ($k % 2 === 0) {
			continue;
		}
		$match = array();
		if (preg_match('/^#[xX]([0-9a-fA-F]+)$/', $v, $match)) {
			// Hexadecimal entity
			$parts[$k] = $this->UnumberToChar(hexdec($match[1]));
		} elseif (preg_match('/^#([0-9]+)$/', $v, $match)) {
			// Decimal entity
			$parts[$k] = $this->UnumberToChar((int) $match[1]);
		} elseif ($alsoStdHtmlEnt && isset($trans_tbl['&' . $v . ';'])) {
			// Other entities:
			$parts[$k] = $trans_tbl['&' . $v . ';'];
		} else {
			// No conversion: restore the original text
			$parts[$k] = '&' . $v . ';';
		}
	}
	return implode('', $parts);
}
972
/**
 * Converts all chars in the input UTF-8 string into integer numbers returned in an array
 *
 * A byte above 127 that cannot start a UTF-8 sequence is reported as $this->noCharByteVal
 * (or the corresponding char when $retChar is set).
 *
 * @param string $str Input string, UTF-8
 * @param boolean $convEntities If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
 * @param boolean $retChar If set, then instead of integer numbers the real UTF-8 char is returned.
 * @return array Output array with the char numbers
 * @todo Define visibility
 */
public function utf8_to_numberarray($str, $convEntities = 0, $retChar = 0) {
	// If entities must be registered as well...:
	if ($convEntities) {
		$str = $this->entities_to_utf8($str, 1);
	}
	$length = strlen($str);
	$numbers = array();
	// Traverse each byte of the UTF-8 string.
	for ($pos = 0; $pos < $length; $pos++) {
		$byte = substr($str, $pos, 1);
		$code = ord($byte);
		// Plain ASCII
		if ($code <= 127) {
			$numbers[] = $retChar ? chr($code) : $code;
			continue;
		}
		// A first byte must have the 7th bit set. Otherwise we are in the middle of a byte sequence.
		if (!($code & 64)) {
			$numbers[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
			continue;
		}
		// Collect the sequence: every further leading 1-bit of the first byte announces one more byte
		$sequence = $byte;
		for ($n = 0; $n < 8; $n++) {
			$code = $code << 1;
			if (!($code & 128)) {
				break;
			}
			$pos++;
			$sequence .= substr($str, $pos, 1);
		}
		$numbers[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
	}
	return $numbers;
}
1024
/**
 * Converts a UNICODE number to a UTF-8 multibyte character
 * Algorithm based on script found at From: http://czyborra.com/utf/
 *
 * The binary representation of the character's integer value is spread across the bytes
 * and the number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
 *
 * bytes | bits | representation
 * 1 | 7 | 0vvvvvvv
 * 2 | 11 | 110vvvvv 10vvvvvv
 * 3 | 16 | 1110vvvv 10vvvvvv 10vvvvvv
 * 4 | 21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 * 5 | 26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 * 6 | 31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * Numbers of 2^31 and above cannot be expressed and yield chr($this->noCharByteVal).
 *
 * @param integer $cbyte UNICODE integer
 * @return string UTF-8 multibyte character string
 * @see utf8CharToUnumber()
 * @todo Define visibility
 */
public function UnumberToChar($cbyte) {
	// 1 byte (7 bits)
	if ($cbyte < 128) {
		return chr($cbyte);
	}
	// 2 bytes (11 bits)
	if ($cbyte < 2048) {
		return chr(192 | $cbyte >> 6)
			. chr(128 | $cbyte & 63);
	}
	// 3 bytes (16 bits)
	if ($cbyte < 65536) {
		return chr(224 | $cbyte >> 12)
			. chr(128 | $cbyte >> 6 & 63)
			. chr(128 | $cbyte & 63);
	}
	// 4 bytes (21 bits)
	if ($cbyte < 2097152) {
		return chr(240 | $cbyte >> 18)
			. chr(128 | $cbyte >> 12 & 63)
			. chr(128 | $cbyte >> 6 & 63)
			. chr(128 | $cbyte & 63);
	}
	// 5 bytes (26 bits)
	if ($cbyte < 67108864) {
		return chr(248 | $cbyte >> 24)
			. chr(128 | $cbyte >> 18 & 63)
			. chr(128 | $cbyte >> 12 & 63)
			. chr(128 | $cbyte >> 6 & 63)
			. chr(128 | $cbyte & 63);
	}
	// 6 bytes (31 bits)
	if ($cbyte < 2147483648) {
		return chr(252 | $cbyte >> 30)
			. chr(128 | $cbyte >> 24 & 63)
			. chr(128 | $cbyte >> 18 & 63)
			. chr(128 | $cbyte >> 12 & 63)
			. chr(128 | $cbyte >> 6 & 63)
			. chr(128 | $cbyte & 63);
	}
	// Cannot express a 32-bit character in UTF-8
	return chr($this->noCharByteVal);
}
1091
/**
 * Decodes the first UTF-8 character of a string into its UNICODE number.
 *
 * Only the lead byte decides how many continuation bytes are read; the
 * continuation bytes themselves are not validated.
 *
 * @param string $str UTF-8 multibyte character string
 * @param boolean $hex If set, the number is returned as a string: "x" followed by its hexadecimal notation.
 * @return integer UNICODE integer (or the "x..." string described above when $hex is set)
 * @see UnumberToChar()
 * @todo Define visibility
 */
public function utf8CharToUnumber($str, $hex = 0) {
    $leadByte = ord(substr($str, 0, 1));
    if (($leadByte & 192) != 192) {
        // Not the start of a multibyte sequence: the byte value is the number
        $codePoint = $leadByte;
    } else {
        $continuationBits = '';
        $shifted = $leadByte;
        $continuationCount = 0;
        while ($continuationCount < 8) {
            // Each further high bit set in the lead byte announces one more byte
            $shifted = $shifted << 1;
            if (!($shifted & 128)) {
                break;
            }
            // Collect the six payload bits of the next continuation byte
            $nextByte = ord(substr($str, $continuationCount + 1, 1));
            $continuationBits .= substr('00000000' . decbin($nextByte), -6);
            $continuationCount++;
        }
        // The lead byte contributes the bits left over after its length marker
        $leadBits = substr('00000000' . decbin($leadByte), -(6 - $continuationCount));
        $codePoint = bindec($leadBits . $continuationBits);
    }
    if ($hex) {
        return 'x' . dechex($codePoint);
    }
    return $codePoint;
}
1126
1127 /********************************************
1128 *
1129 * Init functions
1130 *
1131 ********************************************/
/**
 * Initializes a charset for use if a conversion table for it exists in the
 * PATH_t3lib . 'csconvtbl/' folder.
 * This function is automatically called by the conversion functions.
 *
 * The parsed table is kept in $this->parsedCharsets[$charset] with two maps:
 * 'local' (byte value => UTF-8 character) and 'utf8' (UTF-8 character => byte
 * value). Only byte values above 127 are stored. The parsed result is cached
 * as a serialized file below typo3temp/cs/.
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param string $charset The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return integer Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and loaded (from cache or by parsing).
 * @access private
 * @todo Define visibility
 */
public function initCharset($charset) {
    // Only process if the charset is not yet loaded (isset() avoids an undefined index notice)
    if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
        return 1;
    }
    // Conversion table filename:
    $charsetConvTableFile = PATH_t3lib . 'csconvtbl/' . $charset . '.tbl';
    if (!$charset || !\TYPO3\CMS\Core\Utility\GeneralUtility::validPathStr($charsetConvTableFile) || !@is_file($charsetConvTableFile)) {
        return FALSE;
    }
    // Cache file for charsets:
    // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
    $cacheFile = \TYPO3\CMS\Core\Utility\GeneralUtility::getFileAbsFileName('typo3temp/cs/charset_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $cachedTable = unserialize(\TYPO3\CMS\Core\Utility\GeneralUtility::getUrl($cacheFile));
        if (is_array($cachedTable)) {
            $this->parsedCharsets[$charset] = $cachedTable;
            return 2;
        }
        // Unreadable or corrupt cache file: fall through, parse again and rewrite it
    }
    // Parse conversion table into lines:
    $lines = \TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode(LF, \TYPO3\CMS\Core\Utility\GeneralUtility::getUrl($charsetConvTableFile), 1);
    $table = array('local' => array(), 'utf8' => array());
    // The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
    $whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
    $detectedType = '';
    foreach ($lines as $value) {
        // Comment line or blanks are ignored.
        if (!trim($value) || substr($value, 0, 1) == '#') {
            continue;
        }
        // Detect type if not done yet: (Done on first real line)
        if (!$detectedType) {
            $detectedType = preg_match($whitespacedPattern, $value) ? 'whitespaced' : 'ms-token';
        }
        if ($detectedType == 'ms-token') {
            $parts = preg_split('/[=:]/', $value, 3);
            if (count($parts) < 2) {
                // Line without separator carries no mapping
                continue;
            }
            $hexbyte = $parts[0];
            $utf8 = $parts[1];
        } else {
            $regA = array();
            if (!preg_match($whitespacedPattern, $value, $regA)) {
                // Line does not follow the detected syntax
                continue;
            }
            $hexbyte = $regA[1];
            $utf8 = 'U+' . $regA[2];
        }
        $decval = hexdec(trim($hexbyte));
        // The lower half is plain ASCII and needs no mapping
        if ($decval > 127) {
            // Strip the leading "U+" before converting the code point
            $utf8Char = $this->UnumberToChar(hexdec(substr(trim($utf8), 2)));
            $table['local'][$decval] = $utf8Char;
            $table['utf8'][$utf8Char] = $decval;
        }
    }
    $this->parsedCharsets[$charset] = $table;
    if ($cacheFile) {
        \TYPO3\CMS\Core\Utility\GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($table));
    }
    return 2;
}
1198
/**
 * This function initializes all UTF-8 character data tables.
 *
 * For $mode "case" or "ascii" the requested table is taken from memory or from
 * its cache file below typo3temp/cs/ when available. Otherwise BOTH tables
 * ($this->caseFolding['utf-8'] and $this->toASCII['utf-8']) are rebuilt from
 * PATH_t3lib . 'unidata/UnicodeData.txt' (plus SpecialCasing.txt and
 * Translit.txt when present) and written back to the cache files.
 *
 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
 *
 * @param string $mode Mode ("case", "ascii", ...)
 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 * @todo Define visibility
 */
public function initUnicodeData($mode = NULL) {
    // Cache files
    $cacheFileCase = \TYPO3\CMS\Core\Utility\GeneralUtility::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
    $cacheFileASCII = \TYPO3\CMS\Core\Utility\GeneralUtility::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');
    // Only process if the tables are not yet loaded
    switch ($mode) {
        case 'case':
            if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8'])) {
                return 1;
            }
            // Use cached version if possible (a corrupt cache file triggers a rebuild)
            if ($cacheFileCase && @is_file($cacheFileCase)) {
                $cachedTable = unserialize(\TYPO3\CMS\Core\Utility\GeneralUtility::getUrl($cacheFileCase));
                if (is_array($cachedTable)) {
                    $this->caseFolding['utf-8'] = $cachedTable;
                    return 2;
                }
            }
            break;
        case 'ascii':
            if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8'])) {
                return 1;
            }
            // Use cached version if possible (a corrupt cache file triggers a rebuild)
            if ($cacheFileASCII && @is_file($cacheFileASCII)) {
                $cachedTable = unserialize(\TYPO3\CMS\Core\Utility\GeneralUtility::getUrl($cacheFileASCII));
                if (is_array($cachedTable)) {
                    $this->toASCII['utf-8'] = $cachedTable;
                    return 2;
                }
            }
            break;
    }
    // Process main Unicode data file
    $unicodeDataFile = PATH_t3lib . 'unidata/UnicodeData.txt';
    if (!(\TYPO3\CMS\Core\Utility\GeneralUtility::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
        return FALSE;
    }
    $fh = fopen($unicodeDataFile, 'rb');
    if (!$fh) {
        return FALSE;
    }
    // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
    // Note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
    $this->caseFolding['utf-8'] = array();
    // a shorthand
    $utf8CaseFolding =& $this->caseFolding['utf-8'];
    $utf8CaseFolding['toUpper'] = array();
    $utf8CaseFolding['toLower'] = array();
    $utf8CaseFolding['toTitle'] = array();
    // Array of temp. decompositions
    $decomposition = array();
    // Array of chars that are marks (eg. composing accents)
    $mark = array();
    // Array of chars that are numbers (eg. digits)
    $number = array();
    // Array of chars to be omitted (eg. Russian hard sign)
    $omit = array();
    // Reading with fgets() as loop condition avoids processing a bogus FALSE "line" at EOF
    while (($line = fgets($fh, 4096)) !== FALSE) {
        // Has a lot of info; pad so that short lines do not raise undefined offsets
        $fields = array_pad(explode(';', rtrim($line)), 15, '');
        $char = $fields[0];
        if ($char === '') {
            // Blank line
            continue;
        }
        $name = $fields[1];
        $cat = $fields[2];
        $decomp = $fields[5];
        $num = $fields[8];
        $upper = $fields[12];
        $lower = $fields[13];
        $title = $fields[14];
        $ord = hexdec($char);
        if ($ord > 65535) {
            // Only process the BMP
            break;
        }
        $utf8_char = $this->UnumberToChar($ord);
        if ($upper) {
            $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
        }
        if ($lower) {
            $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
        }
        // Store "title" only when different from "upper" (only a few)
        if ($title && $title != $upper) {
            $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
        }
        // First letter of the general category
        switch (substr($cat, 0, 1)) {
            case 'M':
                // mark (accent, umlaut, ...)
                $mark['U+' . $char] = 1;
                break;
            case 'N':
                // numeric value
                if ($ord > 128 && $num != '') {
                    $number['U+' . $char] = $num;
                }
                break;
        }
        // Accented Latin letters without "official" decomposition
        $match = array();
        if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
            $c = ord($match[2]);
            if ($match[1] == 'SMALL') {
                $c += 32;
            }
            $decomposition['U+' . $char] = array(dechex($c));
            continue;
        }
        $match = array();
        if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
            switch ($match[1]) {
                case '<circle>':
                    // add parenthesis as circle replacement, eg (1)
                    $match[2] = '0028 ' . $match[2] . ' 0029';
                    break;
                case '<square>':
                    // add square brackets as square replacement, eg [1]
                    $match[2] = '005B ' . $match[2] . ' 005D';
                    break;
                case '<compat>':
                    // ignore multi char decompositions that start with a space
                    if (preg_match('/^0020 /', $match[2])) {
                        continue 2;
                    }
                    break;
                case '<initial>':
                case '<medial>':
                case '<final>':
                case '<isolated>':
                case '<vertical>':
                    // These compatibility forms are not decomposed at all
                    continue 2;
            }
            $decomposition['U+' . $char] = explode(' ', $match[2]);
        }
    }
    fclose($fh);
    // Process additional Unicode data for casing (allow folded characters to expand into a sequence)
    $specialCasingFile = PATH_t3lib . 'unidata/SpecialCasing.txt';
    if (\TYPO3\CMS\Core\Utility\GeneralUtility::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
        $fh = fopen($specialCasingFile, 'rb');
        if ($fh) {
            while (($line = fgets($fh, 4096)) !== FALSE) {
                if ($line[0] == '#' || trim($line) == '') {
                    continue;
                }
                $fields = array_pad(\TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode(';', $line), 5, '');
                $char = $fields[0];
                $lower = $fields[1];
                $title = $fields[2];
                $upper = $fields[3];
                $cond = $fields[4];
                // Only unconditional mappings are used (fifth field empty or a comment)
                if (!($cond == '' || $cond[0] == '#')) {
                    continue;
                }
                $utf8_char = $this->UnumberToChar(hexdec($char));
                $targets = array('toLower' => $lower);
                // Store "title" only when different from "upper"
                if ($title != $upper) {
                    $targets['toTitle'] = $title;
                }
                $targets['toUpper'] = $upper;
                foreach ($targets as $tableName => $codePointList) {
                    if ($char == $codePointList) {
                        // Character maps to itself
                        continue;
                    }
                    $sequence = '';
                    foreach (explode(' ', $codePointList) as $codePoint) {
                        $sequence .= $this->UnumberToChar(hexdec($codePoint));
                    }
                    $utf8CaseFolding[$tableName][$utf8_char] = $sequence;
                }
            }
            fclose($fh);
        }
    }
    // Process custom decompositions
    $customTranslitFile = PATH_t3lib . 'unidata/Translit.txt';
    if (\TYPO3\CMS\Core\Utility\GeneralUtility::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
        $fh = fopen($customTranslitFile, 'rb');
        if ($fh) {
            while (($line = fgets($fh, 4096)) !== FALSE) {
                if ($line[0] == '#' || trim($line) == '') {
                    continue;
                }
                $fields = array_pad(\TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode(';', $line), 2, '');
                $char = $fields[0];
                $translit = $fields[1];
                if (!$translit) {
                    $omit['U+' . $char] = 1;
                }
                $decomposition['U+' . $char] = explode(' ', $translit);
            }
            fclose($fh);
        }
    }
    // Decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
    foreach ($decomposition as $from => $to) {
        $code_decomp = array();
        while ($code_value = array_shift($to)) {
            // Do recursive decomposition
            if (isset($decomposition['U+' . $code_value])) {
                foreach (array_reverse($decomposition['U+' . $code_value]) as $cv) {
                    array_unshift($to, $cv);
                }
            } elseif (!isset($mark[('U+' . $code_value)])) {
                // Keep everything that is not a mark
                array_push($code_decomp, $code_value);
            }
        }
        if (count($code_decomp) || isset($omit[$from])) {
            $decomposition[$from] = $code_decomp;
        } else {
            unset($decomposition[$from]);
        }
    }
    // Create ascii only mapping
    $this->toASCII['utf-8'] = array();
    $ascii =& $this->toASCII['utf-8'];
    foreach ($decomposition as $from => $to) {
        $code_decomp = array();
        while ($code_value = array_shift($to)) {
            $ord = hexdec($code_value);
            if ($ord > 127) {
                // Skip decompositions containing non-ASCII chars
                continue 2;
            }
            array_push($code_decomp, chr($ord));
        }
        // Keys carry a "U+" prefix which must not be passed to hexdec()
        $ascii[$this->UnumberToChar(hexdec(substr($from, 2)))] = join('', $code_decomp);
    }
    // Add numeric decompositions
    foreach ($number as $from => $to) {
        $utf8_char = $this->UnumberToChar(hexdec(substr($from, 2)));
        if (!isset($ascii[$utf8_char])) {
            $ascii[$utf8_char] = $to;
        }
    }
    if ($cacheFileCase) {
        \TYPO3\CMS\Core\Utility\GeneralUtility::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
    }
    if ($cacheFileASCII) {
        \TYPO3\CMS\Core\Utility\GeneralUtility::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
    }
    return 3;
}
1441
/**
 * This function initializes the folding table for a charset other than UTF-8.
 * This function is automatically called by the case folding functions.
 *
 * The table is derived from the UTF-8 case folding table: every character of
 * the charset's conversion table is looked up there and the result converted
 * back to the charset. The ASCII letters a-z / A-Z are always added. The
 * result is stored in $this->caseFolding[$charset] and cached below typo3temp/cs/.
 *
 * @param string $charset Charset for which to initialize case folding.
 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 * @todo Define visibility
 */
public function initCaseFolding($charset) {
    // Only process if the case table is not yet loaded:
    if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset])) {
        return 1;
    }
    // Use cached version if possible (a corrupt cache file triggers a rebuild)
    $cacheFile = \TYPO3\CMS\Core\Utility\GeneralUtility::getFileAbsFileName('typo3temp/cs/cscase_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $cachedTable = unserialize(\TYPO3\CMS\Core\Utility\GeneralUtility::getUrl($cacheFile));
        if (is_array($cachedTable)) {
            $this->caseFolding[$charset] = $cachedTable;
            return 2;
        }
    }
    // init UTF-8 conversion for this charset
    if (!$this->initCharset($charset)) {
        return FALSE;
    }
    // UTF-8 case folding is used as the base conversion table
    if (!$this->initUnicodeData('case')) {
        return FALSE;
    }
    $nochar = chr($this->noCharByteVal);
    $folding = array('toUpper' => array(), 'toLower' => array(), 'toTitle' => array());
    foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
        // Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
        $c = $this->utf8_decode($utf8, $charset);
        foreach (array('toUpper', 'toLower', 'toTitle') as $tableName) {
            // Most characters have no mapping at all; skip them without touching undefined indexes
            if (!isset($this->caseFolding['utf-8'][$tableName][$utf8])) {
                continue;
            }
            $cc = $this->utf8_decode($this->caseFolding['utf-8'][$tableName][$utf8], $charset);
            // Ignore results that cannot be represented in the charset
            if ($cc != '' && $cc != $nochar) {
                $folding[$tableName][$c] = $cc;
            }
        }
    }
    // Add the ASCII case table (upper and lower case letters are 32 apart)
    for ($i = ord('a'); $i <= ord('z'); $i++) {
        $folding['toUpper'][chr($i)] = chr($i - 32);
    }
    for ($i = ord('A'); $i <= ord('Z'); $i++) {
        $folding['toLower'][chr($i)] = chr($i + 32);
    }
    $this->caseFolding[$charset] = $folding;
    if ($cacheFile) {
        \TYPO3\CMS\Core\Utility\GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($folding));
    }
    return 3;
}
1503
/**
 * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
 * This function is automatically called by the ASCII transliteration functions.
 *
 * The table is derived from the UTF-8 transliteration table and stored in
 * $this->toASCII[$charset] (always an array, possibly empty) and cached below
 * typo3temp/cs/.
 *
 * @param string $charset Charset for which to initialize conversion.
 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 * @todo Define visibility
 */
public function initToASCII($charset) {
    // Only process if the transliteration table is not yet loaded:
    if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) {
        return 1;
    }
    // Use cached version if possible (a corrupt cache file triggers a rebuild)
    $cacheFile = \TYPO3\CMS\Core\Utility\GeneralUtility::getFileAbsFileName('typo3temp/cs/csascii_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $cachedTable = unserialize(\TYPO3\CMS\Core\Utility\GeneralUtility::getUrl($cacheFile));
        if (is_array($cachedTable)) {
            $this->toASCII[$charset] = $cachedTable;
            return 2;
        }
    }
    // Init UTF-8 conversion for this charset
    if (!$this->initCharset($charset)) {
        return FALSE;
    }
    // UTF-8/ASCII transliteration is used as the base conversion table
    if (!$this->initUnicodeData('ascii')) {
        return FALSE;
    }
    // Start from an empty array so that a charset without any transliterable
    // character still counts as loaded and is cached as an array
    $table = array();
    foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
        if (isset($this->toASCII['utf-8'][$utf8])) {
            // Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
            $c = $this->utf8_decode($utf8, $charset);
            $table[$c] = $this->toASCII['utf-8'][$utf8];
        }
    }
    $this->toASCII[$charset] = $table;
    if ($cacheFile) {
        \TYPO3\CMS\Core\Utility\GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($table));
    }
    return 3;
}
1545
1546 /********************************************
1547 *
1548 * String operation functions
1549 *
1550 ********************************************/
/**
 * Returns a part of a string.
 *
 * Depending on $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] the work is
 * delegated to mbstring or iconv; otherwise the charset decides which internal
 * implementation is used. Charsets not known as UTF-8, EUC based, two-byte or
 * four-byte are treated as single-byte encodings.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param integer $start Start position (character position)
 * @param integer $len Length (in characters); NULL returns everything from $start to the end
 * @return string The substring
 * @see substr(), mb_substr()
 * @todo Define visibility
 */
public function substr($charset, $string, $start, $len = NULL) {
    if ($len === 0 || $string === '') {
        return '';
    }
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
        // Cannot omit $len, when specifying charset
        if ($len == NULL) {
            // Save internal encoding
            $enc = mb_internal_encoding();
            mb_internal_encoding($charset);
            $str = mb_substr($string, $start);
            // Restore internal encoding
            mb_internal_encoding($enc);
            return $str;
        }
        return mb_substr($string, $start, $len, $charset);
    }
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
        // Cannot omit $len, when specifying charset
        if ($len == NULL) {
            // Save internal encoding
            $enc = iconv_get_encoding('internal_encoding');
            iconv_set_encoding('internal_encoding', $charset);
            $str = iconv_substr($string, $start);
            // Restore internal encoding
            iconv_set_encoding('internal_encoding', $enc);
            return $str;
        }
        return iconv_substr($string, $start, $len, $charset);
    }
    if ($charset == 'utf-8') {
        return $this->utf8_substr($string, $start, $len);
    }
    if (!empty($this->eucBasedSets[$charset])) {
        return $this->euc_substr($string, $start, $charset, $len);
    }
    // Fixed-width encodings: character positions translate directly into byte positions
    $bytesPerChar = 1;
    if (!empty($this->twoByteSets[$charset])) {
        $bytesPerChar = 2;
    } elseif (!empty($this->fourByteSets[$charset])) {
        $bytesPerChar = 4;
    }
    // An omitted length must not be multiplied (NULL * 2 would be 0 and yield an empty string)
    if ($len === NULL) {
        return substr($string, $start * $bytesPerChar);
    }
    return substr($string, $start * $bytesPerChar, $len * $bytesPerChar);
}
1605
/**
 * Counts the number of characters of a string in the given charset.
 *
 * Uses mbstring or iconv when configured in
 * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'], otherwise an internal
 * implementation chosen by charset. Unknown charsets are treated as
 * single-byte encodings.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @return integer The number of characters
 * @see strlen()
 * @todo Define visibility
 */
public function strlen($charset, $string) {
    $backend = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];
    if ($backend == 'mbstring') {
        return mb_strlen($string, $charset);
    }
    if ($backend == 'iconv') {
        return iconv_strlen($string, $charset);
    }
    if ($charset == 'utf-8') {
        return $this->utf8_strlen($string);
    }
    if ($this->eucBasedSets[$charset]) {
        return $this->euc_strlen($string, $charset);
    }
    if ($this->twoByteSets[$charset]) {
        // Every character occupies two bytes
        return strlen($string) / 2;
    }
    if ($this->fourByteSets[$charset]) {
        // Every character occupies four bytes
        return strlen($string) / 4;
    }
    // Treat everything else as single-byte encoding
    return strlen($string);
}
1633
/**
 * Crops a string with the help of the mbstring functions.
 *
 * A positive $len keeps the first $len characters and appends $crop, a
 * negative $len keeps the last abs($len) characters and prepends $crop.
 * Strings that are not longer than abs($len) characters, and a $len of zero,
 * leave the string untouched.
 *
 * @param string $charset The character set
 * @param string $string String to be cropped
 * @param integer $len Crop length (in characters)
 * @param string $crop Crop signifier
 * @return string The shortened string
 * @see mb_strlen(), mb_substr()
 */
protected function cropMbstring($charset, $string, $len, $crop = '') {
    if (intval($len) === 0) {
        return $string;
    }
    $characterCount = mb_strlen($string, $charset);
    if ($characterCount <= abs($len)) {
        // Nothing to cut off
        return $string;
    }
    if ($len > 0) {
        return mb_substr($string, 0, $len, $charset) . $crop;
    }
    // Negative length: keep the tail of the string
    return $crop . mb_substr($string, $len, $characterCount, $charset);
}
1655
/**
 * Truncates a string and pre-/appends a string.
 *
 * A positive $len keeps the first $len characters and appends $crop, a
 * negative $len keeps the last abs($len) characters and prepends $crop. If the
 * string is not longer than abs($len), or $len is zero, it is returned
 * unchanged (without $crop).
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param integer $len Length (in characters)
 * @param string $crop Crop signifier
 * @return string The shortened string
 * @see substr(), mb_strimwidth()
 * @todo Define visibility
 */
public function crop($charset, $string, $len, $crop = '') {
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
        return $this->cropMbstring($charset, $string, $len, $crop);
    }
    if (intval($len) == 0) {
        return $string;
    }
    // Translate the character position into a byte position ($i)
    if ($charset == 'utf-8') {
        $i = $this->utf8_char2byte_pos($string, $len);
    } elseif (!empty($this->eucBasedSets[$charset])) {
        $i = $this->euc_char2byte_pos($string, $len, $charset);
    } elseif ($len > 0) {
        $i = $len;
    } else {
        $i = strlen($string) + $len;
        if ($i <= 0) {
            $i = FALSE;
        }
    }
    // $len outside actual string length
    if ($i === FALSE) {
        return $string;
    }
    // Compare against the byte length instead of probing string offsets that may not exist
    $byteLength = strlen($string);
    if ($len > 0) {
        // Only crop if there is at least one byte behind the cut
        if ($i < $byteLength) {
            return substr($string, 0, $i) . $crop;
        }
    } elseif ($i >= 1 && $i <= $byteLength) {
        // Only crop if there is at least one byte in front of the cut
        return $crop . substr($string, $i);
    }
    return $string;
}
1705
/**
 * Cuts a string short at a given byte length without splitting a character.
 *
 * For two- and four-byte charsets the length is rounded down to a multiple of
 * the character width; charsets not otherwise known are treated as single-byte
 * encodings.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param integer $len The byte length
 * @return string The shortened string
 * @see mb_strcut()
 * @todo Define visibility
 */
public function strtrunc($charset, $string, $len) {
    if ($len <= 0) {
        return '';
    }
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
        return mb_strcut($string, 0, $len, $charset);
    }
    if ($charset == 'utf-8') {
        return $this->utf8_strtrunc($string, $len);
    }
    if ($this->eucBasedSets[$charset]) {
        return $this->euc_strtrunc($string, $len, $charset);
    }
    if ($this->twoByteSets[$charset]) {
        // Realign to position dividable by two
        $len -= $len % 2;
    } elseif ($this->fourByteSets[$charset]) {
        // Realign to position dividable by four
        $len -= $len % 4;
    }
    // Treat everything else as single-byte encoding
    return substr($string, 0, $len);
}
1738
/**
 * Translates all characters of a string into their respective case values.
 * Unlike strtolower() and strtoupper() this method is locale independent.
 * Note that the string length may change!
 * eg. lower case German "ß" (sharp S) becomes upper case "SS"
 * Real case folding is language dependent, this method ignores this fact.
 *
 * With mbstring configured in $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']
 * the mb_* functions do the work; otherwise the internal mapping tables for
 * UTF-8, EUC based or single-byte charsets are used.
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert case for
 * @param string $case Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
 * @return string The converted string
 * @see strtolower(), strtoupper()
 * @todo Define visibility
 */
public function conv_case($charset, $string, $case) {
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
        return $case == 'toLower'
            ? mb_strtolower($string, $charset)
            : mb_strtoupper($string, $charset);
    }
    if ($charset == 'utf-8') {
        return $this->utf8_char_mapping($string, 'case', $case);
    }
    if (isset($this->eucBasedSets[$charset])) {
        return $this->euc_char_mapping($string, $charset, 'case', $case);
    }
    // Treat everything else as single-byte encoding
    return $this->sb_char_mapping($string, $charset, 'case', $case);
}
1771
/**
 * Equivalent of lcfirst/ucfirst but using character set.
 * Only the first character is case converted, the remainder is appended as it is.
 *
 * @param string $charset Character set of string
 * @param string $string Input string
 * @param string $case Case keyword as accepted by conv_case() ("toLower" or "toUpper")
 * @return string The string with its first character converted
 * @see conv_case()
 */
public function convCaseFirst($charset, $string, $case) {
    $convertedFirstChar = $this->conv_case($charset, $this->substr($charset, $string, 0, 1), $case);
    return $convertedFirstChar . $this->substr($charset, $string, 1);
}
1787
/**
 * Converts special chars (like æøåÆØÅ, umlauts etc) to ascii equivalents (usually double-bytes, like æ => ae etc.)
 *
 * The charset selects the mapping implementation: UTF-8, EUC based, or - for
 * everything else - single-byte.
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert
 * @return string The converted string
 * @todo Define visibility
 */
public function specCharsToASCII($charset, $string) {
    if ($charset == 'utf-8') {
        return $this->utf8_char_mapping($string, 'ascii');
    }
    if (isset($this->eucBasedSets[$charset])) {
        return $this->euc_char_mapping($string, $charset, 'ascii');
    }
    // Treat everything else as single-byte encoding
    return $this->sb_char_mapping($string, $charset, 'ascii');
}
1807
/**
 * Converts the language codes that we get from the client (usually HTTP_ACCEPT_LANGUAGE)
 * into a TYPO3-readable language code
 *
 * The candidates are tried in order of descending quality value ("q"). Each
 * candidate is first matched as given (eg. "pt-br") and then without its
 * country part (eg. "pt").
 *
 * @param string $languageCodesList List of language codes. something like 'de,en-us;q=0.9,de-de;q=0.7,es-cl;q=0.6,en;q=0.4,es;q=0.3,zh;q=0.1'
 * @return string A preferred language that TYPO3 supports, or "default" if none found
 */
public function getPreferredClientLanguage($languageCodesList) {
    $allLanguageCodes = array();
    $selectedLanguage = 'default';
    // Get all languages where TYPO3 code is the same as the ISO code
    foreach ($this->charSetArray as $typo3Lang => $charSet) {
        $allLanguageCodes[$typo3Lang] = $typo3Lang;
    }
    // Get all languages where TYPO3 code differs from ISO code
    // or needs the country part
    // the iso codes will here overwrite the default typo3 language in the key
    foreach ($this->locales->getIsoMapping() as $typo3Lang => $isoLang) {
        $allLanguageCodes[$typo3Lang] = join('-', explode('_', $isoLang));
    }
    // Move the iso codes to the keys (because we're comparing the keys with "isset" later on)
    $allLanguageCodes = array_flip($allLanguageCodes);
    $preferredLanguages = \TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode(',', $languageCodesList);
    // Collect the preferred languages with their quality value
    $sortedPreferredLanguages = array();
    foreach ($preferredLanguages as $preferredLanguage) {
        $quality = 1.0;
        // The weight may be written with optional whitespace, eg. "en-us; q=0.8"
        $parts = preg_split('/\s*;\s*q\s*=\s*/i', $preferredLanguage, 2);
        if (count($parts) === 2) {
            $preferredLanguage = $parts[0];
            $quality = (float) $parts[1];
        }
        $sortedPreferredLanguages[$preferredLanguage] = $quality;
    }
    // Loop through the languages, with the highest priority first
    arsort($sortedPreferredLanguages, SORT_NUMERIC);
    foreach ($sortedPreferredLanguages as $preferredLanguage => $quality) {
        // Array keys that look like integers are converted by PHP
        $preferredLanguage = (string) $preferredLanguage;
        if (isset($allLanguageCodes[$preferredLanguage])) {
            $selectedLanguage = $allLanguageCodes[$preferredLanguage];
            break;
        }
        // Strip the country code from the end and try again
        $dashPosition = strpos($preferredLanguage, '-');
        if ($dashPosition !== FALSE) {
            $languageOnly = substr($preferredLanguage, 0, $dashPosition);
            if (isset($allLanguageCodes[$languageOnly])) {
                $selectedLanguage = $allLanguageCodes[$languageOnly];
                break;
            }
        }
    }
    // English is TYPO3's default language
    if (!$selectedLanguage || $selectedLanguage == 'en') {
        $selectedLanguage = 'default';
    }
    return $selectedLanguage;
}
1860
1861 /********************************************
1862 *
1863 * Internal string operation functions
1864 *
1865 ********************************************/
/**
 * Maps all characters of a string in a single byte charset.
 *
 * Every byte of the string is looked up in the mapping table selected by
 * $mode; bytes without an entry are copied unchanged. If the table cannot be
 * initialized, or $mode is unknown, the input is returned as it is.
 *
 * @param string $str The string
 * @param string $charset The charset
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string
 * @todo Define visibility
 */
public function sb_char_mapping($str, $charset, $mode, $opt = '') {
    $map = array();
    switch ($mode) {
        case 'case':
            if (!$this->initCaseFolding($charset)) {
                return $str;
            }
            // Copy instead of reference: a reference would create a NULL entry for an unknown $opt
            if (isset($this->caseFolding[$charset][$opt])) {
                $map = $this->caseFolding[$charset][$opt];
            }
            break;
        case 'ascii':
            if (!$this->initToASCII($charset)) {
                return $str;
            }
            if (isset($this->toASCII[$charset])) {
                $map = $this->toASCII[$charset];
            }
            break;
        default:
            return $str;
    }
    // Work on a real string so that numeric input is mapped instead of being dropped
    $str = (string) $str;
    $byteLength = strlen($str);
    $out = '';
    for ($i = 0; $i < $byteLength; $i++) {
        $c = $str[$i];
        $out .= isset($map[$c]) ? $map[$c] : $c;
    }
    return $out;
}
1906
1907 /********************************************
1908 *
1909 * Internal UTF-8 string operation functions
1910 *
1911 ********************************************/
1912 /**
1913 * Returns a part of a UTF-8 string.
1914 * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
1915 *
1916 * @param string $str UTF-8 string
1917 * @param integer $start Start position (character position)
1918 * @param integer $len Length (in characters)
1919 * @return string The substring
1920 * @see substr()
1921 * @todo Define visibility
1922 */
1923 public function utf8_substr($str, $start, $len = NULL) {
1924 if (!strcmp($len, '0')) {
1925 return '';
1926 }
1927 $byte_start = $this->utf8_char2byte_pos($str, $start);
1928 if ($byte_start === FALSE) {
1929 if ($start > 0) {
1930 // $start outside string length
1931 return FALSE;
1932 } else {
1933 $start = 0;
1934 }
1935 }
1936 $str = substr($str, $byte_start);
1937 if ($len != NULL) {
1938 $byte_end = $this->utf8_char2byte_pos($str, $len);
1939 // $len outside actual string length
1940 if ($byte_end === FALSE) {
1941 return $len < 0 ? '' : $str;
1942 } else {
1943 // When length is less than zero and exceeds, then we return blank string.
1944 return substr($str, 0, $byte_end);
1945 }
1946 } else {
1947 return $str;
1948 }
1949 }
1950
/**
 * Counts the number of characters of a string in UTF-8.
 * Every byte that is not a continuation byte (10xxxxxx) is counted as the
 * start of one character.
 *
 * @param string $str UTF-8 multibyte character string
 * @return integer The number of characters
 * @see strlen()
 * @todo Define visibility
 */
public function utf8_strlen($str) {
	$str = (string) $str;
	// Loop against the byte length; reading $str[$i] past the end to detect
	// the end of the string raises an "uninitialized string offset" error.
	$length = strlen($str);
	$n = 0;
	for ($i = 0; $i < $length; $i++) {
		// Single-byte (0xxxxxxx) and multi-byte starting bytes (11xxxxxx) are
		// counted, continuation bytes (10xxxxxx) are skipped.
		if ((ord($str[$i]) & 192) != 128) {
			$n++;
		}
	}
	return $n;
}
1974
/**
 * Truncates a string in UTF-8 short at a given byte length.
 * The cut is moved backwards if it would otherwise split a multi-byte
 * character, so the result never ends in an incomplete sequence.
 *
 * @param string $str UTF-8 multibyte character string
 * @param integer $len The byte length
 * @return string The shortened string (at most $len bytes)
 * @see mb_strcut()
 * @todo Define visibility
 */
public function utf8_strtrunc($str, $len) {
	$str = (string) $str;
	$len = (int) $len;
	if ($len <= 0) {
		return '';
	}
	if ($len > strlen($str)) {
		// Nothing to cut, and $str[$len - 1] would be out of range
		return $str;
	}
	$i = $len - 1;
	// Last byte to keep is part of a multibyte sequence
	if (ord($str[$i]) & 128) {
		// Walk back to the starting byte (11xxxxxx) of that sequence
		while ($i > 0 && !(ord($str[$i]) & 64)) {
			$i--;
		}
		if (!(ord($str[$i]) & 64)) {
			// Only continuation bytes in front of the cut: no complete character
			return '';
		}
		// The number of leading 1-bits of the starting byte is the sequence length
		for ($bc = 0, $mbs = ord($str[$i]); $mbs & 128; $mbs = $mbs << 1) {
			$bc++;
		}
		if ($bc + $i > $len) {
			// The sequence does not fit completely, drop it
			return substr($str, 0, $i);
		}
	}
	return substr($str, 0, $len);
}
2005
/**
 * Locates the first occurrence of $needle in $haystack (both UTF-8) and
 * reports it as a character position.
 * Depending on $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] the work
 * is handed to mbstring or iconv; otherwise the byte based strpos() is used
 * and its result is converted to a character position.
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 string to search for
 * @param integer $offset Position to start the search (character position)
 * @return integer The character position, FALSE if the offset cannot be resolved or the needle is not found
 * @see strpos()
 * @todo Define visibility
 */
public function utf8_strpos($haystack, $needle, $offset = 0) {
	$backend = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];
	if ($backend == 'mbstring') {
		return mb_strpos($haystack, $needle, $offset, 'utf-8');
	}
	if ($backend == 'iconv') {
		return iconv_strpos($haystack, $needle, $offset, 'utf-8');
	}
	// Native fallback: search on byte level, starting at the byte that
	// corresponds to the requested character offset.
	$startByte = $this->utf8_char2byte_pos($haystack, $offset);
	if ($startByte === FALSE) {
		// Offset beyond string length
		return FALSE;
	}
	$foundByte = strpos($haystack, $needle, $startByte);
	// FALSE means the needle was not found
	return $foundByte === FALSE ? FALSE : $this->utf8_byte2char_pos($haystack, $foundByte);
}
2034
/**
 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
 * Depending on $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] the work
 * is handed to mbstring or iconv; otherwise the byte based strrpos() is used
 * and its result is converted to a character position.
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 character to search for (single character)
 * @return integer The character position, FALSE if the needle is not found
 * @see strrpos()
 * @todo Define visibility
 */
public function utf8_strrpos($haystack, $needle) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		// The third parameter of mb_strrpos() is the offset, the encoding is
		// the fourth one. Passing the encoding in the offset slot is no longer
		// accepted by current PHP versions.
		return mb_strrpos($haystack, $needle, 0, 'utf-8');
	} elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
		return iconv_strrpos($haystack, $needle, 'utf-8');
	}
	$byte_pos = strrpos($haystack, $needle);
	if ($byte_pos === FALSE) {
		// Needle not found
		return FALSE;
	}
	return $this->utf8_byte2char_pos($haystack, $byte_pos);
}
2057
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * For $pos >= 0 the string is scanned forwards and the byte offset of the
 * character with that index is returned. For $pos < 0 the string is scanned
 * backwards and the byte offset of the |$pos|-th character counted from the
 * end is returned.
 *
 * @param string $str UTF-8 string
 * @param integer $pos Character position (negative values start from the end)
 * @return integer Byte position, FALSE if the string does not contain enough characters
 * @todo Define visibility
 */
public function utf8_char2byte_pos($str, $pos) {
	$str = (string) $str;
	// All bounds are checked against the byte length. Probing $str[$i] outside
	// the string raises an error, and negative offsets silently address bytes
	// from the end of the string, which must not happen while scanning backwards.
	$length = strlen($str);
	// Number of characters found
	$n = 0;
	// Number of characters wanted
	$p = abs($pos);
	if ($pos >= 0) {
		for ($i = 0; $i < $length && $n < $p; $i++) {
			// Single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
			if ((ord($str[$i]) & 192) != 128) {
				$n++;
			}
		}
		if ($i >= $length) {
			// Offset beyond string length
			return FALSE;
		}
		// $i points behind the starting byte of the last counted character:
		// skip its trailing multi-byte data bytes (10xxxxxx)
		while ($i < $length && (ord($str[$i]) & 192) == 128) {
			$i++;
		}
		return $i;
	}
	for ($i = $length - 1; $i >= 0 && $n < $p; $i--) {
		if ((ord($str[$i]) & 192) != 128) {
			$n++;
		}
	}
	if ($n < $p) {
		// The string holds fewer than |$pos| characters
		return FALSE;
	}
	// The loop stepped one byte in front of the wanted character: correct offset
	return $i + 1;
}
2104
/**
 * Translates an 'absolute' byte position into a character position.
 *
 * The character position is the number of character starting bytes found in
 * the byte range 1..$pos. The position directly behind the last byte is
 * accepted and yields the position following the last character.
 *
 * @param string $str UTF-8 string
 * @param integer $pos Byte position
 * @return integer Character position, FALSE if $str is empty or $pos lies outside the string
 * @todo Define visibility
 */
public function utf8_byte2char_pos($str, $pos) {
	$str = (string) $str;
	$pos = (int) $pos;
	$length = strlen($str);
	if ($length === 0 || $pos < 0 || $pos > $length) {
		// Offset beyond string length. This has to be checked up front: bytes
		// outside the string would otherwise be counted as characters.
		return FALSE;
	}
	// Number of characters. The position right behind the string counts as
	// one further character start.
	$n = $pos === $length ? 1 : 0;
	for ($i = min($pos, $length - 1); $i > 0; $i--) {
		// Single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		if ((ord($str[$i]) & 192) != 128) {
			$n++;
		}
	}
	return $n;
}
2134 /**
2135 * Maps all characters of an UTF-8 string.
2136 *
2137 * @param string $str UTF-8 string
2138 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
2139 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
2140 * @return string The converted string
2141 * @todo Define visibility
2142 */
2143 public function utf8_char_mapping($str, $mode, $opt = '') {
2144 if (!$this->initUnicodeData($mode)) {
2145 // Do nothing
2146 return $str;
2147 }
2148 $out = '';
2149 switch ($mode) {
2150 case 'case':
2151 $map =& $this->caseFolding['utf-8'][$opt];
2152 break;
2153 case 'ascii':
2154 $map =& $this->toASCII['utf-8'];
2155 break;
2156 default:
2157 return $str;
2158 }
2159 for ($i = 0; strlen($str[$i]); $i++) {
2160 $c = ord($str[$i]);
2161 // single-byte (0xxxxxx)
2162 if (!($c & 128)) {
2163 $mbc = $str[$i];
2164 } elseif (($c & 192) == 192) {
2165 // multi-byte starting byte (11xxxxxx)
2166 for ($bc = 0; $c & 128; $c = $c << 1) {
2167 $bc++;
2168 }
2169 // calculate number of bytes
2170 $mbc = substr($str, $i, $bc);
2171 $i += $bc - 1;
2172 }
2173 if (isset($map[$mbc])) {
2174 $out .= $map[$mbc];
2175 } else {
2176 $out .= $mbc;
2177 }
2178 }
2179 return $out;
2180 }
2181
2182 /********************************************
2183 *
2184 * Internal EUC string operation functions
2185 *
2186 * Extended Unix Code:
2187 * ASCII compatible 7bit single bytes chars
2188 * 8bit two byte chars
2189 *
2190 * Shift-JIS is treated as a special case.
2191 *
2192 ********************************************/
/**
 * Cuts a string in the EUC charset family short at a given byte length.
 * If the cut would split a double-byte character, that character is dropped
 * completely.
 *
 * @param string $str EUC multibyte character string
 * @param integer $len The byte length
 * @param string $charset The charset ('shift_jis' selects the Shift-JIS lead byte ranges)
 * @return string The shortened string (at most $len bytes)
 * @see mb_strcut()
 * @todo Define visibility
 */
public function euc_strtrunc($str, $len, $charset) {
	$str = (string) $str;
	if ($len >= strlen($str)) {
		// String not longer than the supplied length
		return $str;
	}
	if ($len <= 0) {
		return '';
	}
	$sjis = $charset == 'shift_jis';
	// $len is smaller than the byte length here, so $str[$i] is always valid
	for ($i = 0; $i < $len; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			$doubleByte = $c >= 128 && $c < 160 || $c >= 224;
		} else {
			$doubleByte = $c >= 128;
		}
		if ($doubleByte) {
			// Step over the second byte of the character
			$i++;
		}
	}
	if ($i > $len) {
		// We ended on a first byte: the character at $len - 1 does not fit.
		// This must be decided before any end-of-string shortcut, otherwise a
		// cut inside the last character would return the string untruncated.
		return substr($str, 0, $len - 1);
	}
	return substr($str, 0, $len);
}
2228
/**
 * Returns a part of a string in the EUC charset family.
 * Handles the boundary cases the same way as utf8_substr() does.
 *
 * @param string $str EUC multibyte character string
 * @param integer $start Start position (character position), negative values count from the end
 * @param string $charset The charset
 * @param integer $len Length (in characters), negative values count from the end; NULL returns everything up to the end
 * @return string The substring, or FALSE if a positive $start lies beyond the end of the string
 * @todo Define visibility
 */
public function euc_substr($str, $start, $charset, $len = NULL) {
	// A length of zero always yields an empty string. Without this guard the
	// loose NULL comparison below would treat 0 like "no length given".
	if ($len !== NULL && (string) $len === '0') {
		return '';
	}
	$byte_start = $this->euc_char2byte_pos($str, $start, $charset);
	if ($byte_start === FALSE) {
		if ($start > 0) {
			// $start outside string length
			return FALSE;
		}
		// Zero or negative start that cannot be resolved: begin at the first byte
		$byte_start = 0;
	}
	$str = substr($str, $byte_start);
	// Loose comparison on purpose: NULL and '' both mean "no length given"
	if ($len == NULL) {
		return $str;
	}
	$byte_end = $this->euc_char2byte_pos($str, $len, $charset);
	if ($byte_end === FALSE) {
		// $len outside actual string length: a negative length that exceeds
		// the string leaves nothing, a positive one returns the whole rest.
		return $len < 0 ? '' : $str;
	}
	return substr($str, 0, $byte_end);
}
2258
/**
 * Counts the number of characters of a string in the EUC charset family.
 * A byte that qualifies as first byte of a double-byte character consumes
 * the following byte as well; every other byte is one character.
 *
 * @param string $str EUC multibyte character string
 * @param string $charset The charset ('shift_jis' selects the Shift-JIS lead byte ranges)
 * @return integer The number of characters
 * @see strlen()
 * @todo Define visibility
 */
public function euc_strlen($str, $charset) {
	$str = (string) $str;
	$sjis = $charset == 'shift_jis';
	// Loop against the byte length instead of probing $str[$i] past the end
	$length = strlen($str);
	$n = 0;
	for ($i = 0; $i < $length; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			$doubleByte = $c >= 128 && $c < 160 || $c >= 224;
		} else {
			$doubleByte = $c >= 128;
		}
		if ($doubleByte) {
			// Step over the second byte of the character
			$i++;
		}
		$n++;
	}
	return $n;
}
2286
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * For $pos >= 0 the string is scanned forwards, for $pos < 0 backwards from
 * the last byte. In both directions a byte in the double-byte range makes the
 * scan step over one additional byte.
 * NOTE(review): scanning backwards classifies the trailing byte of a
 * character with the lead byte test; this presumably only holds for charsets
 * whose two bytes share the same range - confirm for Shift-JIS.
 *
 * @param string $str EUC multibyte character string
 * @param integer $pos Character position (negative values start from the end)
 * @param string $charset The charset ('shift_jis' selects the Shift-JIS lead byte ranges)
 * @return integer Byte position, FALSE if the string does not contain enough characters
 * @todo Define visibility
 */
public function euc_char2byte_pos($str, $pos, $charset) {
	$str = (string) $str;
	$sjis = $charset == 'shift_jis';
	// All bounds are checked against the byte length. Probing $str[$i] outside
	// the string raises an error, and negative offsets silently address bytes
	// from the end of the string, which must not happen while scanning backwards.
	$length = strlen($str);
	// Number of characters seen
	$n = 0;
	// Number of characters wanted
	$p = abs($pos);
	if ($pos >= 0) {
		$i = 0;
		$d = 1;
	} else {
		$i = $length - 1;
		$d = -1;
	}
	for (; $i >= 0 && $i < $length && $n < $p; $i += $d) {
		$c = ord($str[$i]);
		if ($sjis) {
			$doubleByte = $c >= 128 && $c < 160 || $c >= 224;
		} else {
			$doubleByte = $c >= 128;
		}
		if ($doubleByte) {
			// Step over the other byte of the character
			$i += $d;
		}
		$n++;
	}
	if ($pos >= 0) {
		// Running off the end means the offset is beyond the string length
		return $i < $length ? $i : FALSE;
	}
	if ($n < $p) {
		// The string holds fewer than |$pos| characters
		return FALSE;
	}
	// The loop stepped in front of the wanted character: correct the offset,
	// but never point in front of the first byte
	return max(0, $i + 1);
}
2332
/**
 * Maps all characters of a string in the EUC charset family.
 * The string is split into single-byte and double-byte characters, each
 * character is looked up in the table selected by $mode and replaced if an
 * entry exists.
 *
 * @param string $str EUC multibyte character string
 * @param string $charset The charset ('shift_jis' selects the Shift-JIS lead byte ranges)
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string
 * @todo Define visibility
 */
public function euc_char_mapping($str, $charset, $mode, $opt = '') {
	// Pick the translation table for the requested mode. If the matching
	// init method returns a false value the string is returned untouched.
	switch ($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) {
				return $str;
			}
			$map =& $this->caseFolding[$charset][$opt];
			break;
		case 'ascii':
			if (!$this->initToASCII($charset)) {
				return $str;
			}
			$map =& $this->toASCII[$charset];
			break;
		default:
			return $str;
	}
	$str = (string) $str;
	$sjis = $charset == 'shift_jis';
	// Loop against the byte length instead of probing $str[$i] past the end
	$length = strlen($str);
	$out = '';
	for ($i = 0; $i < $length; $i++) {
		$mbc = $str[$i];
		$c = ord($mbc);
		if ($sjis) {
			$doubleByte = $c >= 128 && $c < 160 || $c >= 224;
		} else {
			$doubleByte = $c >= 128;
		}
		if ($doubleByte) {
			// A double-byte char: take both bytes and step over the second one
			$mbc = substr($str, $i, 2);
			$i++;
		}
		// Characters without an entry in the table are copied unchanged
		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}
	return $out;
}
2388
2389 }
2390
2391
2392 ?>