ac1335e64f75fb3a921a17179c94b33b4fdfe6c8
[Packages/TYPO3.CMS.git] / typo3 / sysext / core / Classes / Charset / CharsetConverter.php
1 <?php
2 namespace TYPO3\CMS\Core\Charset;
3
4 /*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use TYPO3\CMS\Core\Utility\GeneralUtility;
18 use TYPO3\CMS\Core\Utility\ExtensionManagementUtility;
19
20 /**
21 * Notes on UTF-8
22 *
23 * Functions working on UTF-8 strings:
24 *
25 * - strchr/strstr
26 * - strrchr
27 * - substr_count
28 * - implode/explode/join
29 *
30 * Functions nearly working on UTF-8 strings:
31 *
32 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
33 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
34 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
35 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
36 * - preg_*: UTF-8 support is compiled into PCRE by default nowadays, but could be unavailable; patterns need the "u" modifier
37 *
38 * Functions NOT working on UTF-8 strings:
39 *
40 * - str*cmp
41 * - stristr
42 * - stripos
43 * - substr
44 * - strrev
45 * - split/spliti
46 * - ...
47 */
48
49 /**
50 * Class for conversion between charsets
51 *
52 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
53 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
54 */
55 class CharsetConverter {
56
57 /**
58 * @var \TYPO3\CMS\Core\Localization\Locales
59 */
60 protected $locales;
61
62 /**
63 * ASCII Value for chars with no equivalent.
64 *
65 * @var int
66 */
67 public $noCharByteVal = 63;
68
69 /**
70 * This is the array where parsed conversion tables are stored (cached)
71 *
72 * @var array
73 */
74 public $parsedCharsets = array();
75
76 /**
77 * An array where case folding data will be stored (cached)
78 *
79 * @var array
80 */
81 public $caseFolding = array();
82
83 /**
84 * An array where charset-to-ASCII mappings are stored (cached)
85 *
86 * @var array
87 */
88 public $toASCII = array();
89
90 /**
91 * This tells the converter which charsets have two bytes per char:
92 *
93 * @var array
94 */
95 public $twoByteSets = array(
96 'ucs-2' => 1
97 );
98
99 /**
100 * This tells the converter which charsets have four bytes per char:
101 *
102 * @var array
103 */
104 public $fourByteSets = array(
105 'ucs-4' => 1,
106 // 4-byte Unicode
107 'utf-32' => 1
108 );
109
110 /**
111 * This tells the converter which charsets use a scheme like the Extended Unix Code:
112 *
113 * @var array
114 */
115 public $eucBasedSets = array(
116 'gb2312' => 1,
117 // Chinese, simplified.
118 'big5' => 1,
119 // Chinese, traditional.
120 'euc-kr' => 1,
121 // Korean
122 'shift_jis' => 1
123 );
124
/**
 * Charset aliases mapped to the canonical (lowercase) charset name.
 * parse_charset() looks up the lowercased, trimmed input in this table.
 *
 * @link http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
 * @link http://czyborra.com/charsets/iso8859.html
 *
 * @var array
 */
public $synonyms = array(
	'us' => 'ascii',
	'us-ascii' => 'ascii',
	'cp819' => 'iso-8859-1',
	'ibm819' => 'iso-8859-1',
	'iso-ir-100' => 'iso-8859-1',
	'iso-ir-101' => 'iso-8859-2',
	'iso-ir-109' => 'iso-8859-3',
	'iso-ir-110' => 'iso-8859-4',
	'iso-ir-144' => 'iso-8859-5',
	'iso-ir-127' => 'iso-8859-6',
	'iso-ir-126' => 'iso-8859-7',
	'iso-ir-138' => 'iso-8859-8',
	'iso-ir-148' => 'iso-8859-9',
	'iso-ir-157' => 'iso-8859-10',
	'iso-ir-179' => 'iso-8859-13',
	'iso-ir-199' => 'iso-8859-14',
	'iso-ir-203' => 'iso-8859-15',
	'csisolatin1' => 'iso-8859-1',
	'csisolatin2' => 'iso-8859-2',
	'csisolatin3' => 'iso-8859-3',
	'csisolatin5' => 'iso-8859-9',
	'csisolatin8' => 'iso-8859-14',
	'csisolatin9' => 'iso-8859-15',
	'csisolatingreek' => 'iso-8859-7',
	'iso-celtic' => 'iso-8859-14',
	'latin1' => 'iso-8859-1',
	'latin2' => 'iso-8859-2',
	'latin3' => 'iso-8859-3',
	'latin5' => 'iso-8859-9',
	'latin6' => 'iso-8859-10',
	'latin8' => 'iso-8859-14',
	'latin9' => 'iso-8859-15',
	'l1' => 'iso-8859-1',
	'l2' => 'iso-8859-2',
	'l3' => 'iso-8859-3',
	'l5' => 'iso-8859-9',
	'l6' => 'iso-8859-10',
	'l8' => 'iso-8859-14',
	'l9' => 'iso-8859-15',
	'cyrillic' => 'iso-8859-5',
	'arabic' => 'iso-8859-6',
	'tis-620' => 'iso-8859-11',
	'win874' => 'windows-874',
	'win1250' => 'windows-1250',
	'win1251' => 'windows-1251',
	'win1252' => 'windows-1252',
	'win1253' => 'windows-1253',
	'win1254' => 'windows-1254',
	'win1255' => 'windows-1255',
	'win1256' => 'windows-1256',
	'win1257' => 'windows-1257',
	'win1258' => 'windows-1258',
	'cp1250' => 'windows-1250',
	'cp1251' => 'windows-1251',
	'cp1252' => 'windows-1252',
	'ms-ee' => 'windows-1250',
	'ms-ansi' => 'windows-1252',
	'ms-greek' => 'windows-1253',
	'ms-turk' => 'windows-1254',
	'winbaltrim' => 'windows-1257',
	'koi-8ru' => 'koi-8r',
	'koi8r' => 'koi-8r',
	'cp878' => 'koi-8r',
	'mac' => 'macroman',
	'macintosh' => 'macroman',
	'euc-cn' => 'gb2312',
	'x-euc-cn' => 'gb2312',
	'euccn' => 'gb2312',
	'cp936' => 'gb2312',
	'big-5' => 'big5',
	'cp950' => 'big5',
	'eucjp' => 'euc-jp',
	'sjis' => 'shift_jis',
	'shift-jis' => 'shift_jis',
	'cp932' => 'shift_jis',
	'cp949' => 'euc-kr',
	'utf7' => 'utf-7',
	// 'utf8' was listed twice in this table; one entry is enough
	'utf8' => 'utf-8',
	'utf16' => 'utf-16',
	'utf32' => 'utf-32',
	'ucs2' => 'ucs-2',
	'ucs4' => 'ucs-4'
);
216
/**
 * Mapping of language identifiers to script names.
 *
 * Three kinds of keys are supported: iso-639-1 codes, three-letter MS language
 * codes and English language names. get_locale_charset() uses the resulting
 * script name as key into $script_to_charset_unix / $script_to_charset_windows.
 *
 * @var array
 */
public $lang_to_script = array(
	// iso-639-1 language codes, see http://www.loc.gov/standards/iso639-2/php/code_list.php
	'af' => 'west_european', // Afrikaans
	'ar' => 'arabic',
	'bg' => 'cyrillic', // Bulgarian
	'bs' => 'east_european', // Bosnian
	'cs' => 'east_european', // Czech
	'da' => 'west_european', // Danish
	'de' => 'west_european', // German
	'es' => 'west_european', // Spanish
	'et' => 'estonian',
	'eo' => 'unicode', // Esperanto
	'eu' => 'west_european', // Basque
	'fa' => 'arabic', // Persian
	'fi' => 'west_european', // Finnish
	'fo' => 'west_european', // Faroese
	'fr' => 'west_european', // French
	'ga' => 'west_european', // Irish
	'gl' => 'west_european', // Galician
	'gr' => 'greek',
	'he' => 'hebrew', // Hebrew (since 1998)
	'hi' => 'unicode', // Hindi
	'hr' => 'east_european', // Croatian
	'hu' => 'east_european', // Hungarian
	'iw' => 'hebrew', // Hebrew (til 1998)
	'is' => 'west_european', // Icelandic
	'it' => 'west_european', // Italian
	'ja' => 'japanese',
	'ka' => 'unicode', // Georgian
	'kl' => 'west_european', // Greenlandic
	'km' => 'unicode', // Khmer
	'ko' => 'korean',
	'lt' => 'lithuanian',
	'lv' => 'west_european', // Latvian/Lettish
	'nl' => 'west_european', // Dutch
	'no' => 'west_european', // Norwegian
	'nb' => 'west_european', // Norwegian Bokmal
	'nn' => 'west_european', // Norwegian Nynorsk
	'pl' => 'east_european', // Polish
	'pt' => 'west_european', // Portuguese
	'ro' => 'east_european', // Romanian
	'ru' => 'cyrillic', // Russian
	'sk' => 'east_european', // Slovak
	'sl' => 'east_european', // Slovenian
	'sr' => 'cyrillic', // Serbian
	'sv' => 'west_european', // Swedish
	'sq' => 'albanian', // Albanian
	'th' => 'thai',
	'uk' => 'cyrillic', // Ukrainian
	'vi' => 'vietnamese',
	'zh' => 'chinese',
	// MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
	// http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
	'afk' => 'west_european', // Afrikaans
	'ara' => 'arabic',
	'bgr' => 'cyrillic', // Bulgarian
	'cat' => 'west_european', // Catalan
	'chs' => 'simpl_chinese',
	'cht' => 'trad_chinese',
	'csy' => 'east_european', // Czech
	'dan' => 'west_european', // Danish
	'deu' => 'west_european', // German
	'dea' => 'west_european', // German (Austrian)
	'des' => 'west_european', // German (Swiss)
	'ena' => 'west_european', // English (Australian)
	'enc' => 'west_european', // English (Canadian)
	'eng' => 'west_european', // English
	'enz' => 'west_european', // English (New Zealand)
	'enu' => 'west_european', // English (United States)
	'euq' => 'west_european', // Basque
	'fos' => 'west_european', // Faroese
	'far' => 'arabic', // Persian
	'fin' => 'west_european', // Finnish
	'fra' => 'west_european', // French
	'frb' => 'west_european', // French (Belgian)
	'frc' => 'west_european', // French (Canadian)
	'frs' => 'west_european', // French (Swiss)
	'geo' => 'unicode', // Georgian
	'glg' => 'west_european', // Galician
	'ell' => 'greek',
	'heb' => 'hebrew',
	'hin' => 'unicode', // Hindi
	'hun' => 'east_european', // Hungarian
	'isl' => 'west_european', // Icelandic
	'ita' => 'west_european', // Italian
	'its' => 'west_european', // Italian (Swiss)
	'jpn' => 'japanese',
	'khm' => 'unicode', // Khmer
	'kor' => 'korean',
	'lth' => 'lithuanian',
	'lvi' => 'west_european', // Latvian/Lettish
	'msl' => 'west_european', // Malay
	'nlb' => 'west_european', // Dutch (Belgian)
	'nld' => 'west_european', // Dutch
	'nor' => 'west_european', // Norwegian (bokmal)
	'non' => 'west_european', // Norwegian (nynorsk)
	'plk' => 'east_european', // Polish
	'ptg' => 'west_european', // Portuguese
	'ptb' => 'west_european', // Portuguese (Brazil)
	'rom' => 'east_european', // Romanian
	'rus' => 'cyrillic', // Russian
	'slv' => 'east_european', // Slovenian
	'sky' => 'east_european', // Slovak
	'srl' => 'east_european', // Serbian (Latin)
	'srb' => 'cyrillic', // Serbian (Cyrillic)
	'esp' => 'west_european', // Spanish (trad. sort)
	'esm' => 'west_european', // Spanish (Mexican)
	'esn' => 'west_european', // Spanish (internat. sort)
	'sve' => 'west_european', // Swedish
	'sqi' => 'albanian', // Albanian
	'tha' => 'thai',
	'trk' => 'turkish',
	'ukr' => 'cyrillic', // Ukrainian
	// English language names
	'afrikaans' => 'west_european',
	'albanian' => 'albanian',
	'arabic' => 'arabic',
	'basque' => 'west_european',
	'bosnian' => 'east_european',
	// Bulgarian is written in Cyrillic, consistent with 'bg' and 'bgr' above
	'bulgarian' => 'cyrillic',
	'catalan' => 'west_european',
	'croatian' => 'east_european',
	'czech' => 'east_european',
	'danish' => 'west_european',
	'dutch' => 'west_european',
	'english' => 'west_european',
	'esperanto' => 'unicode',
	'estonian' => 'estonian',
	'faroese' => 'west_european',
	'farsi' => 'arabic',
	'finnish' => 'west_european',
	'french' => 'west_european',
	'galician' => 'west_european',
	'georgian' => 'unicode',
	'german' => 'west_european',
	'greek' => 'greek',
	'greenlandic' => 'west_european',
	'hebrew' => 'hebrew',
	'hindi' => 'unicode',
	'hungarian' => 'east_european',
	'icelandic' => 'west_european',
	'italian' => 'west_european',
	'khmer' => 'unicode',
	'latvian' => 'west_european',
	'lettish' => 'west_european',
	'lithuanian' => 'lithuanian',
	'malay' => 'west_european',
	'norwegian' => 'west_european',
	'persian' => 'arabic',
	'polish' => 'east_european',
	'portuguese' => 'west_european',
	'russian' => 'cyrillic',
	'romanian' => 'east_european',
	'serbian' => 'cyrillic',
	'slovak' => 'east_european',
	'slovenian' => 'east_european',
	'spanish' => 'west_european',
	'swedish' => 'west_european',
	// Misspelled key kept so existing lookups keep working
	'svedish' => 'west_european',
	'thai' => 'thai',
	// Misspelled key kept so existing lookups keep working
	'that' => 'thai',
	'turkish' => 'turkish',
	'ukrainian' => 'cyrillic'
);
474
475 /**
476 * Mapping of language (family) names to charsets on Unix
477 *
478 * @var array
479 */
480 public $script_to_charset_unix = array(
481 'west_european' => 'iso-8859-1',
482 'estonian' => 'iso-8859-1',
483 'east_european' => 'iso-8859-2',
484 'baltic' => 'iso-8859-4',
485 'cyrillic' => 'iso-8859-5',
486 'arabic' => 'iso-8859-6',
487 'greek' => 'iso-8859-7',
488 'hebrew' => 'iso-8859-8',
489 'turkish' => 'iso-8859-9',
490 'thai' => 'iso-8859-11',
491 // = TIS-620
492 'lithuanian' => 'iso-8859-13',
493 'chinese' => 'gb2312',
494 // = euc-cn
495 'japanese' => 'euc-jp',
496 'korean' => 'euc-kr',
497 'simpl_chinese' => 'gb2312',
498 'trad_chinese' => 'big5',
499 'vietnamese' => '',
500 'unicode' => 'utf-8',
501 'albanian' => 'utf-8'
502 );
503
504 /**
505 * Mapping of language (family) names to charsets on Windows
506 *
507 * @var array
508 */
509 public $script_to_charset_windows = array(
510 'east_european' => 'windows-1250',
511 'cyrillic' => 'windows-1251',
512 'west_european' => 'windows-1252',
513 'greek' => 'windows-1253',
514 'turkish' => 'windows-1254',
515 'hebrew' => 'windows-1255',
516 'arabic' => 'windows-1256',
517 'baltic' => 'windows-1257',
518 'estonian' => 'windows-1257',
519 'lithuanian' => 'windows-1257',
520 'vietnamese' => 'windows-1258',
521 'thai' => 'cp874',
522 'korean' => 'cp949',
523 'chinese' => 'gb2312',
524 'japanese' => 'shift_jis',
525 'simpl_chinese' => 'gb2312',
526 'trad_chinese' => 'big5',
527 'albanian' => 'windows-1250',
528 'unicode' => 'utf-8'
529 );
530
/**
 * Mapping of locale names to charsets.
 *
 * get_locale_charset() lowercases the locale string before looking it up here,
 * so only all-lowercase keys can be matched by that method.
 *
 * @var array
 */
public $locale_to_charset = array(
	'japanese.euc' => 'euc-jp',
	'ja_jp.ujis' => 'euc-jp',
	'korean.euc' => 'euc-kr',
	// Mixed-case key is unreachable via get_locale_charset(); kept for code reading the array directly
	'sr@Latn' => 'iso-8859-2',
	'sr@latn' => 'iso-8859-2',
	'zh_cn' => 'gb2312',
	'zh_hk' => 'big5',
	'zh_tw' => 'big5'
);
545
546 /**
547 * TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
548 * Empty values means "iso-8859-1"
549 *
550 * @var array
551 */
552 public $charSetArray = array(
553 'af' => '',
554 'ar' => 'iso-8859-6',
555 'ba' => 'iso-8859-2',
556 'bg' => 'windows-1251',
557 'br' => '',
558 'ca' => 'iso-8859-15',
559 'ch' => 'gb2312',
560 'cs' => 'windows-1250',
561 'cz' => 'windows-1250',
562 'da' => '',
563 'de' => '',
564 'dk' => '',
565 'el' => 'iso-8859-7',
566 'eo' => 'utf-8',
567 'es' => '',
568 'et' => 'iso-8859-4',
569 'eu' => '',
570 'fa' => 'utf-8',
571 'fi' => '',
572 'fo' => 'utf-8',
573 'fr' => '',
574 'fr_CA' => '',
575 'ga' => '',
576 'ge' => 'utf-8',
577 'gl' => '',
578 'gr' => 'iso-8859-7',
579 'he' => 'utf-8',
580 'hi' => 'utf-8',
581 'hk' => 'big5',
582 'hr' => 'windows-1250',
583 'hu' => 'iso-8859-2',
584 'is' => 'utf-8',
585 'it' => '',
586 'ja' => 'shift_jis',
587 'jp' => 'shift_jis',
588 'ka' => 'utf-8',
589 'kl' => 'utf-8',
590 'km' => 'utf-8',
591 'ko' => 'euc-kr',
592 'kr' => 'euc-kr',
593 'lt' => 'windows-1257',
594 'lv' => 'utf-8',
595 'ms' => '',
596 'my' => '',
597 'nl' => '',
598 'no' => '',
599 'pl' => 'iso-8859-2',
600 'pt' => '',
601 'pt_BR' => '',
602 'qc' => '',
603 'ro' => 'iso-8859-2',
604 'ru' => 'windows-1251',
605 'se' => '',
606 'si' => 'windows-1250',
607 'sk' => 'windows-1250',
608 'sl' => 'windows-1250',
609 'sq' => 'utf-8',
610 'sr' => 'utf-8',
611 'sv' => '',
612 'th' => 'iso-8859-11',
613 'tr' => 'iso-8859-9',
614 'ua' => 'windows-1251',
615 'uk' => 'windows-1251',
616 'vi' => 'utf-8',
617 'vn' => 'utf-8',
618 'zh' => 'big5'
619 );
620
/**
 * Creates the converter and stores an instance of the Locales class,
 * obtained through GeneralUtility::makeInstance(), in $this->locales.
 */
public function __construct() {
	$localesClassName = \TYPO3\CMS\Core\Localization\Locales::class;
	$this->locales = GeneralUtility::makeInstance($localesClassName);
}
627
/**
 * Normalizes a charset name: the input is lowercased, surrounding whitespace
 * is stripped and a known alias (see $this->synonyms) is replaced by the
 * name it maps to.
 *
 * @param string $charset Input charset
 * @return string Normalized charset
 */
public function parse_charset($charset) {
	$normalized = trim(strtolower($charset));
	return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
}
641
/**
 * Get the charset of a locale.
 *
 * Accepted locale formats:
 *   ln            language
 *   ln_CN         language / country
 *   ln_CN.cs      language / country / charset
 *   ln_CN.cs@mod  language / country / charset / modifier
 *
 * Resolution order:
 *   1. exact match of the lowercased locale in $this->locale_to_charset
 *   2. explicit charset part of the locale (normalized via parse_charset())
 *   3. modifier "euro" => iso-8859-15
 *   4. script of the language, mapped to an OS specific charset; unknown
 *      languages or empty mappings fall back to "windows-1252" on Windows
 *      and "utf-8" elsewhere
 *
 * @param string $locale Locale string
 * @return string Charset resolved for locale string
 */
public function get_locale_charset($locale) {
	$locale = strtolower($locale);
	// Exact locale specific charset?
	if (isset($this->locale_to_charset[$locale])) {
		return $this->locale_to_charset[$locale];
	}
	// Split off modifier and charset. array_pad() guarantees two elements, so
	// list() never reads an undefined offset when a part is missing.
	list($locale, $modifier) = array_pad(explode('@', $locale), 2, '');
	list($locale, $charset) = array_pad(explode('.', $locale), 2, '');
	// Locale contains charset: use it
	if ($charset) {
		return $this->parse_charset($charset);
	}
	// Modifier is 'euro' (after charset check, because of xx.utf-8@euro)
	if ($modifier === 'euro') {
		return 'iso-8859-15';
	}
	// Only the language part (before the country) selects the script
	list($language) = explode('_', $locale);
	$script = isset($this->lang_to_script[$language]) ? $this->lang_to_script[$language] : '';
	if (TYPO3_OS === 'WIN') {
		$charsetsByScript = $this->script_to_charset_windows;
		$fallbackCharset = 'windows-1252';
	} else {
		$charsetsByScript = $this->script_to_charset_unix;
		$fallbackCharset = 'utf-8';
	}
	// Unknown scripts and empty mappings (e.g. "vietnamese" on Unix) use the fallback
	return !empty($charsetsByScript[$script]) ? $charsetsByScript[$script] : $fallbackCharset;
}
682
683 /********************************************
684 *
685 * Charset Conversion functions
686 *
687 ********************************************/
/**
 * Convert a string from one charset to another.
 *
 * When the target is UTF-8, or no entity fallback is requested, the conversion
 * library selected by $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']
 * (mbstring, iconv or recode) is tried first. If that yields FALSE, or no
 * library is selected, the string is converted via UTF-8 using
 * utf8_encode() / utf8_decode().
 *
 * @param string $str Input string
 * @param string $fromCS From charset (the current charset of the string)
 * @param string $toCS To charset (the output charset wanted)
 * @param bool $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Converted string
 * @see convArray()
 */
public function conv($str, $fromCS, $toCS, $useEntityForNoChar = 0) {
	if ($fromCS == $toCS) {
		return $str;
	}
	// PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
	if ($toCS === 'utf-8' || !$useEntityForNoChar) {
		// Stays FALSE when no library is configured or the library rejects the charsets
		$nativeResult = FALSE;
		switch ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']) {
			case 'mbstring':
				$nativeResult = mb_convert_encoding($str, $toCS, $fromCS);
				break;
			case 'iconv':
				$nativeResult = iconv($fromCS, $toCS . '//TRANSLIT', $str);
				break;
			case 'recode':
				$nativeResult = recode_string($fromCS . '..' . $toCS, $str);
				break;
		}
		if ($nativeResult !== FALSE) {
			return $nativeResult;
		}
	}
	// Table based conversion with UTF-8 as the intermediate charset
	$utf8String = $fromCS !== 'utf-8' ? $this->utf8_encode($str, $fromCS) : $str;
	return $toCS !== 'utf-8' ? $this->utf8_decode($utf8String, $toCS, $useEntityForNoChar) : $utf8String;
}
734
/**
 * Convert all string elements of an array from one charset to another.
 * Nested arrays are processed recursively; values of any other type are left untouched.
 * NOTICE: Array is passed by reference and modified in place!
 *
 * @param array $array Input array, possibly multidimensional
 * @param string $fromCS From charset (the current charset of the string)
 * @param string $toCS To charset (the output charset wanted)
 * @param bool $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return void
 * @see conv()
 */
public function convArray(&$array, $fromCS, $toCS, $useEntityForNoChar = 0) {
	foreach ($array as $key => $element) {
		if (is_string($element)) {
			$array[$key] = $this->conv($element, $fromCS, $toCS, $useEntityForNoChar);
			continue;
		}
		if (is_array($element)) {
			// Recurse on the original slot so the nested array is changed in place
			$this->convArray($array[$key], $fromCS, $toCS, $useEntityForNoChar);
		}
	}
}
755
/**
 * Converts $str from $charset to UTF-8 using the parsed conversion table of the charset.
 *
 * Characters without an entry in the table are replaced by chr($this->noCharByteVal).
 * If the conversion table cannot be initialized an empty string is returned.
 *
 * @param string $str String in local charset to convert to UTF-8
 * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return string Output string, converted to UTF-8
 */
public function utf8_encode($str, $charset) {
	if ($charset === 'utf-8') {
		return $str;
	}
	// Parse conv. table if not already; without a table nothing can be converted
	if (!$this->initCharset($charset)) {
		return '';
	}
	$isTwoByteSet = isset($this->twoByteSets[$charset]);
	$isEucBasedSet = isset($this->eucBasedSets[$charset]);
	$noChar = chr($this->noCharByteVal);
	$strLen = strlen($str);
	$outStr = '';
	// Traverse each char in string
	for ($a = 0; $a < $strLen; $a++) {
		$chr = substr($str, $a, 1);
		$ord = ord($chr);
		if ($isTwoByteSet) {
			// Two bytes per char, big endian assumed. substr() yields nothing past the
			// end of the string, so a dangling last byte is combined with 0.
			$a++;
			$ord = ($ord << 8) | ord(substr($str, $a, 1));
		} elseif ($ord > 127) {
			// EUC uses two bytes above 127; get both, advance the pointer and make $ord a 16bit int.
			// Shift-JIS: chars between 160 and 223 are single byte.
			if ($isEucBasedSet && ($charset !== 'shift_jis' || $ord < 160 || $ord > 223)) {
				$a++;
				$ord = $ord * 256 + ord(substr($str, $a, 1));
			}
		} else {
			// 7-bit ASCII is passed through unchanged
			$outStr .= $chr;
			continue;
		}
		// Use the UTF-8 sequence from the parsed conv. table, otherwise the "no char" replacement
		if (isset($this->parsedCharsets[$charset]['local'][$ord])) {
			$outStr .= $this->parsedCharsets[$charset]['local'][$ord];
		} else {
			$outStr .= $noChar;
		}
	}
	return $outStr;
}
813
/**
 * Converts $str from UTF-8 to $charset using the parsed conversion table of the charset.
 *
 * UTF-8 sequences without an entry in the table become a numeric entity when
 * $useEntityForNoChar is set, otherwise chr($this->noCharByteVal).
 * If the conversion table cannot be initialized an empty string is returned.
 *
 * @param string $str String in UTF-8 to convert to local charset
 * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param bool $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Output string, converted to local charset
 */
public function utf8_decode($str, $charset, $useEntityForNoChar = 0) {
	if ($charset === 'utf-8') {
		return $str;
	}
	// Parse conv. table if not already; without a table nothing can be converted
	if (!$this->initCharset($charset)) {
		return '';
	}
	$strLen = strlen($str);
	$outStr = '';
	// Traverse each char in UTF-8 string
	for ($a = 0; $a < $strLen; $a++) {
		$chr = substr($str, $a, 1);
		$ord = ord($chr);
		if ($ord <= 127) {
			// Single byte char
			$outStr .= $chr;
			continue;
		}
		// A first byte must have the 7th bit set. Otherwise we are in the middle of a byte sequence.
		if (!($ord & 64)) {
			$outStr .= chr($this->noCharByteVal);
			continue;
		}
		// Collect the whole sequence: every further leading 1-bit of the first byte announces one more byte
		$buf = $chr;
		for ($b = 0; $b < 8; $b++) {
			$ord = $ord << 1;
			if (!($ord & 128)) {
				break;
			}
			$a++;
			$buf .= substr($str, $a, 1);
		}
		if (isset($this->parsedCharsets[$charset]['utf8'][$buf])) {
			// The local number
			$mByte = $this->parsedCharsets[$charset]['utf8'][$buf];
			// A local number greater than 255 is split into two chars (16bit word assumed)
			if ($mByte > 255) {
				$outStr .= chr(($mByte >> 8) & 255) . chr($mByte & 255);
			} else {
				$outStr .= chr($mByte);
			}
		} elseif ($useEntityForNoChar) {
			// Create num entity:
			$outStr .= '&#' . $this->utf8CharToUnumber($buf, 1) . ';';
		} else {
			$outStr .= chr($this->noCharByteVal);
		}
	}
	return $outStr;
}
881
/**
 * Converts all chars > 127 of a UTF-8 string to numeric (hexadecimal) entities.
 * A byte > 127 that cannot start a sequence is replaced by chr($this->noCharByteVal).
 *
 * @param string $str Input string
 * @return string Output string
 */
public function utf8_to_entities($str) {
	$length = strlen($str);
	$result = '';
	$position = 0;
	while ($position < $length) {
		$byte = substr($str, $position, 1);
		$code = ord($byte);
		if ($code <= 127) {
			// Plain ASCII
			$result .= $byte;
		} elseif (!($code & 64)) {
			// 7th bit not set: not a valid first byte, we are in the middle of a sequence
			$result .= chr($this->noCharByteVal);
		} else {
			// Gather the sequence: each further leading 1-bit of the first byte means one more byte
			$sequence = $byte;
			for ($bit = 0; $bit < 8; $bit++) {
				$code = $code << 1;
				if (!($code & 128)) {
					break;
				}
				$position++;
				$sequence .= substr($str, $position, 1);
			}
			$result .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
		}
		$position++;
	}
	return $result;
}
925
/**
 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars.
 *
 * Hexadecimal entities are accepted with "x" or "X". Anything that looks like
 * an entity but is neither a well-formed numeric entity nor (with
 * $alsoStdHtmlEnt) a known named entity is left untouched.
 *
 * @param string $str Input string, UTF-8
 * @param bool $alsoStdHtmlEnt If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
 * @return string Output string
 */
public function entities_to_utf8($str, $alsoStdHtmlEnt = FALSE) {
	$namedEntities = array();
	if ($alsoStdHtmlEnt) {
		$namedEntities = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8'));
	}
	// With PREG_SPLIT_DELIM_CAPTURE every odd element is the text between "&" and ";"
	$parts = preg_split('/&([#[:alnum:]]*);/', $str, -1, PREG_SPLIT_DELIM_CAPTURE);
	if ($parts === FALSE) {
		// Regex engine failure: return the input unchanged rather than losing it
		return $str;
	}
	foreach ($parts as $index => $entity) {
		// Even elements are plain text
		if ($index % 2 === 0) {
			continue;
		}
		$literal = '&' . $entity . ';';
		$match = array();
		if (preg_match('/^#[xX]([0-9a-fA-F]+)$/', $entity, $match)) {
			// Hexadecimal entity
			$parts[$index] = $this->UnumberToChar(hexdec($match[1]));
		} elseif (preg_match('/^#([0-9]+)$/', $entity, $match)) {
			// Decimal entity
			$parts[$index] = $this->UnumberToChar((int)$match[1]);
		} elseif (isset($namedEntities[$literal])) {
			// Named entity
			$parts[$index] = $namedEntities[$literal];
		} else {
			// No conversion: restore the original text
			$parts[$index] = $literal;
		}
	}
	return implode('', $parts);
}
965
/**
 * Splits a UTF-8 string into its characters and returns them as an array,
 * either as UNICODE integer numbers or as the UTF-8 characters themselves.
 * A byte > 127 that cannot start a sequence is reported as $this->noCharByteVal.
 *
 * @param string $str Input string, UTF-8
 * @param bool $convEntities If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
 * @param bool $retChar If set, then instead of integer numbers the real UTF-8 char is returned.
 * @return array Output array with the char numbers
 */
public function utf8_to_numberarray($str, $convEntities = 0, $retChar = 0) {
	// If entities must be registered as well, resolve them first
	if ($convEntities) {
		$str = $this->entities_to_utf8($str, 1);
	}
	$length = strlen($str);
	$result = array();
	$position = 0;
	while ($position < $length) {
		$byte = substr($str, $position, 1);
		$code = ord($byte);
		if ($code <= 127) {
			// Plain ASCII
			$result[] = $retChar ? chr($code) : $code;
		} elseif (!($code & 64)) {
			// 7th bit not set: not a valid first byte, we are in the middle of a sequence
			$result[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
		} else {
			// Gather the sequence: each further leading 1-bit of the first byte means one more byte
			$sequence = $byte;
			for ($bit = 0; $bit < 8; $bit++) {
				$code = $code << 1;
				if (!($code & 128)) {
					break;
				}
				$position++;
				$sequence .= substr($str, $position, 1);
			}
			$result[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
		}
		$position++;
	}
	return $result;
}
1016
/**
 * Converts a UNICODE number to a UTF-8 multibyte character
 * Algorithm based on script found at From: http://czyborra.com/utf/
 *
 * The bits of the integer value are spread across the bytes; the number of
 * high bits set in the lead byte announces the length of the sequence:
 *
 *  bytes | bits | representation
 *      1 |    7 | 0vvvvvvv
 *      2 |   11 | 110vvvvv 10vvvvvv
 *      3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
 *      4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * Numbers of 2^31 and above yield chr($this->noCharByteVal).
 *
 * @param int $cbyte UNICODE integer
 * @return string UTF-8 multibyte character string
 * @see utf8CharToUnumber()
 */
public function UnumberToChar($cbyte) {
	// 1 byte
	if ($cbyte < 128) {
		return chr($cbyte);
	}
	// 2 bytes
	if ($cbyte < 2048) {
		return chr(192 | ($cbyte >> 6))
			. chr(128 | ($cbyte & 63));
	}
	// 3 bytes
	if ($cbyte < 65536) {
		return chr(224 | ($cbyte >> 12))
			. chr(128 | (($cbyte >> 6) & 63))
			. chr(128 | ($cbyte & 63));
	}
	// 4 bytes
	if ($cbyte < 2097152) {
		return chr(240 | ($cbyte >> 18))
			. chr(128 | (($cbyte >> 12) & 63))
			. chr(128 | (($cbyte >> 6) & 63))
			. chr(128 | ($cbyte & 63));
	}
	// 5 bytes
	if ($cbyte < 67108864) {
		return chr(248 | ($cbyte >> 24))
			. chr(128 | (($cbyte >> 18) & 63))
			. chr(128 | (($cbyte >> 12) & 63))
			. chr(128 | (($cbyte >> 6) & 63))
			. chr(128 | ($cbyte & 63));
	}
	// 6 bytes
	if ($cbyte < 2147483648) {
		return chr(252 | ($cbyte >> 30))
			. chr(128 | (($cbyte >> 24) & 63))
			. chr(128 | (($cbyte >> 18) & 63))
			. chr(128 | (($cbyte >> 12) & 63))
			. chr(128 | (($cbyte >> 6) & 63))
			. chr(128 | ($cbyte & 63));
	}
	// Cannot express a 32-bit character in UTF-8
	return chr($this->noCharByteVal);
}
1082
/**
 * Converts a UTF-8 multibyte character to a UNICODE number.
 *
 * The number is assembled as a string of binary digits: the value bits of the
 * lead byte followed by the low six bits of every continuation byte.
 *
 * @param string $str UTF-8 multibyte character string
 * @param bool $hex If set, the number is returned as hexadecimal string prefixed with "x".
 * @return int|string UNICODE integer, or "x" followed by its hexadecimal digits if $hex is set
 * @see UnumberToChar()
 */
public function utf8CharToUnumber($str, $hex = 0) {
	$leadByte = ord($str[0]);
	if (($leadByte & 192) != 192) {
		// Not the start of a multibyte sequence: the byte value is the number
		$number = $leadByte;
	} else {
		$shifted = $leadByte;
		$continuationBits = '';
		$continuationCount = 0;
		// Every further leading 1-bit of the lead byte announces one continuation byte
		while ($continuationCount < 8) {
			$shifted = $shifted << 1;
			if (!($shifted & 128)) {
				break;
			}
			$nextByte = ord(substr($str, $continuationCount + 1, 1));
			// Low six bits of the continuation byte
			$continuationBits .= substr('00000000' . decbin($nextByte), -6);
			$continuationCount++;
		}
		// The lead byte carries (6 - number of continuation bytes) value bits
		$leadBits = substr('00000000' . decbin($leadByte), -(6 - $continuationCount));
		$number = bindec($leadBits . $continuationBits);
	}
	return $hex ? 'x' . dechex($number) : $number;
}
1116
1117 /********************************************
1118 *
1119 * Init functions
1120 *
1121 ********************************************/
/**
 * Loads the conversion table of a charset into $this->parsedCharsets.
 *
 * The table is read from 'typo3/sysext/core/Resources/Private/Charsets/csconvtbl/[charset].tbl'
 * and parsed into two maps: 'local' (byte value => UTF-8 character) and 'utf8'
 * (UTF-8 character => byte value). Only byte values above 127 are stored.
 * A serialized copy of the parsed table is written to typo3temp/cs/ and is
 * preferred over parsing on later calls.
 * This function is automatically called by the conversion functions
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param string $charset The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return int|bool Returns 1 if already loaded. Returns FALSE if charset conversion table was not found. Returns 2 if the charset conversion table was loaded (from cache or by parsing).
 * @access private
 */
public function initCharset($charset) {
    // Only process if the charset is not yet loaded:
    if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
        return 1;
    }
    // Conversion table filename:
    $charsetConvTableFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/csconvtbl/' . $charset . '.tbl';
    if (!$charset || !GeneralUtility::validPathStr($charsetConvTableFile) || !@is_file($charsetConvTableFile)) {
        return FALSE;
    }
    // Cache file for charsets:
    // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
    $cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/cs/charset_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        // NOTE(review): unserialize() runs on a file below typo3temp/; that directory must not be writable by untrusted parties.
        $cached = unserialize(GeneralUtility::getUrl($cacheFile));
        if (is_array($cached) && isset($cached['local']) && is_array($cached['local']) && isset($cached['utf8']) && is_array($cached['utf8'])) {
            $this->parsedCharsets[$charset] = $cached;
            return 2;
        }
        // Unreadable or damaged cache: fall through, parse the table again and rewrite the cache
    }
    // Parse conversion table into lines:
    $lines = GeneralUtility::trimExplode(LF, GeneralUtility::getUrl($charsetConvTableFile), TRUE);
    // The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
    $whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
    $table = array('local' => array(), 'utf8' => array());
    $detectedType = '';
    foreach ($lines as $value) {
        // Comment line or blanks are ignored.
        if (!trim($value) || $value[0] === '#') {
            continue;
        }
        // Detect type if not done yet: (Done on first real line)
        if (!$detectedType) {
            $detectedType = preg_match($whitespacedPattern, $value) ? 'whitespaced' : 'ms-token';
        }
        if ($detectedType === 'ms-token') {
            $parts = preg_split('/[=:]/', $value, 3);
            if (count($parts) < 2) {
                // No "byte = U+xxxx" pair on this line
                continue;
            }
            $hexbyte = $parts[0];
            $utf8 = $parts[1];
        } else {
            $regA = array();
            if (!preg_match($whitespacedPattern, $value, $regA)) {
                // Line does not carry a "0x.. 0x.." pair (eg. undefined code position)
                continue;
            }
            $hexbyte = $regA[1];
            $utf8 = 'U+' . $regA[2];
        }
        $decval = hexdec(trim($hexbyte));
        if ($decval > 127) {
            // Strip the leading "U+" before converting the code point
            $utf8decval = hexdec(substr(trim($utf8), 2));
            $utf8Char = $this->UnumberToChar($utf8decval);
            $table['local'][$decval] = $utf8Char;
            $table['utf8'][$utf8Char] = $decval;
        }
    }
    $this->parsedCharsets[$charset] = $table;
    if ($cacheFile) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($table));
    }
    return 2;
}
1187
/**
 * This function initializes all UTF-8 character data tables.
 *
 * Parses UnicodeData.txt, SpecialCasing.txt and Translit.txt from
 * 'Resources/Private/Charsets/unidata/' of the core extension and builds
 * $this->caseFolding['utf-8'] (toUpper / toLower / toTitle maps) as well as
 * $this->toASCII['utf-8'] (transliteration map). Both tables are always built
 * together and written to typo3temp/cs/; $mode only selects which "already
 * loaded" check and which cache file is consulted first.
 * Parsing of UnicodeData.txt stops at the first code point above U+FFFF.
 *
 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
 *
 * @param string $mode Mode ("case", "ascii", ...)
 * @return int|bool Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
public function initUnicodeData($mode = NULL) {
    // Cache files
    $cacheFileCase = GeneralUtility::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
    $cacheFileASCII = GeneralUtility::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');
    // Only process if the tables are not yet loaded
    switch ($mode) {
        case 'case':
            if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8'])) {
                return 1;
            }
            // Use cached version if possible; a damaged cache falls through to a fresh parse
            if ($cacheFileCase && @is_file($cacheFileCase)) {
                $cached = unserialize(GeneralUtility::getUrl($cacheFileCase));
                if (is_array($cached)) {
                    $this->caseFolding['utf-8'] = $cached;
                    return 2;
                }
            }
            break;
        case 'ascii':
            if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8'])) {
                return 1;
            }
            // Use cached version if possible; a damaged cache falls through to a fresh parse
            if ($cacheFileASCII && @is_file($cacheFileASCII)) {
                $cached = unserialize(GeneralUtility::getUrl($cacheFileASCII));
                if (is_array($cached)) {
                    $this->toASCII['utf-8'] = $cached;
                    return 2;
                }
            }
            break;
    }
    // Process main Unicode data file
    $unicodeDataFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/UnicodeData.txt';
    if (!(GeneralUtility::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
        return FALSE;
    }
    $fh = fopen($unicodeDataFile, 'rb');
    if (!$fh) {
        return FALSE;
    }
    // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
    // Note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
    $this->caseFolding['utf-8'] = array();
    // a shorthand
    $utf8CaseFolding = &$this->caseFolding['utf-8'];
    $utf8CaseFolding['toUpper'] = array();
    $utf8CaseFolding['toLower'] = array();
    $utf8CaseFolding['toTitle'] = array();
    // Array of temp. decompositions
    $decomposition = array();
    // Array of chars that are marks (eg. composing accents)
    $mark = array();
    // Array of chars that are numbers (eg. digits)
    $number = array();
    // Array of chars to be omitted (eg. Russian hard sign)
    $omit = array();
    // Reading until fgets() reports FALSE avoids processing a bogus record at end of file
    while (($line = fgets($fh, 4096)) !== FALSE) {
        // Has a lot of info
        $fields = explode(';', rtrim($line));
        if (count($fields) < 15) {
            // Blank or incomplete record
            continue;
        }
        $char = $fields[0];
        $name = $fields[1];
        $cat = $fields[2];
        $decomp = $fields[5];
        $num = $fields[8];
        $upper = $fields[12];
        $lower = $fields[13];
        $title = $fields[14];
        $ord = hexdec($char);
        if ($ord > 65535) {
            // Only process the BMP
            break;
        }
        $utf8_char = $this->UnumberToChar($ord);
        if ($upper) {
            $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
        }
        if ($lower) {
            $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
        }
        // Store "title" only when different from "upper" (only a few).
        // Strict comparison: the hex strings must not be compared as numbers.
        if ($title && $title !== $upper) {
            $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
        }
        switch (substr($cat, 0, 1)) {
            case 'M':
                // mark (accent, umlaut, ...)
                $mark['U+' . $char] = 1;
                break;
            case 'N':
                // numeric value
                if ($ord > 128 && $num !== '') {
                    $number['U+' . $char] = $num;
                }
        }
        // Accented Latin letters without "official" decomposition
        $match = array();
        if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
            $c = ord($match[2]);
            if ($match[1] === 'SMALL') {
                $c += 32;
            }
            $decomposition['U+' . $char] = array(dechex($c));
            continue;
        }
        $match = array();
        if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
            switch ($match[1]) {
                case '<circle>':
                    // add parenthesis as circle replacement, eg (1)
                    $match[2] = '0028 ' . $match[2] . ' 0029';
                    break;
                case '<square>':
                    // add square brackets as square replacement, eg [1]
                    $match[2] = '005B ' . $match[2] . ' 005D';
                    break;
                case '<compat>':
                    // ignore multi char decompositions that start with a space
                    if (preg_match('/^0020 /', $match[2])) {
                        continue 2;
                    }
                    break;
                case '<initial>':
                case '<medial>':
                case '<final>':
                case '<isolated>':
                case '<vertical>':
                    // positional / presentation forms are not decomposed
                    continue 2;
            }
            $decomposition['U+' . $char] = explode(' ', $match[2]);
        }
    }
    fclose($fh);
    // Process additional Unicode data for casing (allow folded characters to expand into a sequence)
    $specialCasingFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/SpecialCasing.txt';
    if (GeneralUtility::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
        $fh = fopen($specialCasingFile, 'rb');
        if ($fh) {
            while (($line = fgets($fh, 4096)) !== FALSE) {
                if ($line[0] === '#' || trim($line) === '') {
                    continue;
                }
                $fields = GeneralUtility::trimExplode(';', $line);
                if (count($fields) < 5) {
                    continue;
                }
                $char = $fields[0];
                $lower = $fields[1];
                $title = $fields[2];
                $upper = $fields[3];
                $cond = $fields[4];
                // Only unconditional mappings are used: the fifth field is empty or already the trailing comment
                if ($cond !== '' && $cond[0] !== '#') {
                    continue;
                }
                $utf8_char = $this->UnumberToChar(hexdec($char));
                $foldings = array();
                if ($char !== $lower) {
                    $foldings['toLower'] = $lower;
                }
                if ($char !== $title && $title !== $upper) {
                    $foldings['toTitle'] = $title;
                }
                if ($char !== $upper) {
                    $foldings['toUpper'] = $upper;
                }
                foreach ($foldings as $direction => $sequence) {
                    // Convert the space separated code points into one UTF-8 string
                    $folded = '';
                    foreach (explode(' ', $sequence) as $codePoint) {
                        $folded .= $this->UnumberToChar(hexdec($codePoint));
                    }
                    $utf8CaseFolding[$direction][$utf8_char] = $folded;
                }
            }
            fclose($fh);
        }
    }
    // Process custom decompositions
    $customTranslitFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/Translit.txt';
    if (GeneralUtility::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
        $fh = fopen($customTranslitFile, 'rb');
        if ($fh) {
            while (($line = fgets($fh, 4096)) !== FALSE) {
                if ($line[0] === '#' || trim($line) === '') {
                    continue;
                }
                $fields = GeneralUtility::trimExplode(';', $line);
                $char = $fields[0];
                $translit = isset($fields[1]) ? $fields[1] : '';
                if (!$translit) {
                    // An empty transliteration means the character is dropped
                    $omit['U+' . $char] = 1;
                }
                $decomposition['U+' . $char] = explode(' ', $translit);
            }
            fclose($fh);
        }
    }
    // Decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
    foreach ($decomposition as $from => $to) {
        $code_decomp = array();
        while ($code_value = array_shift($to)) {
            // Do recursive decomposition
            if (isset($decomposition['U+' . $code_value])) {
                foreach (array_reverse($decomposition['U+' . $code_value]) as $cv) {
                    array_unshift($to, $cv);
                }
            } elseif (!isset($mark['U+' . $code_value])) {
                // keep everything that is not a mark
                array_push($code_decomp, $code_value);
            }
        }
        if (count($code_decomp) || isset($omit[$from])) {
            $decomposition[$from] = $code_decomp;
        } else {
            unset($decomposition[$from]);
        }
    }
    // Create ascii only mapping
    $this->toASCII['utf-8'] = array();
    $ascii = &$this->toASCII['utf-8'];
    foreach ($decomposition as $from => $to) {
        $code_decomp = array();
        while ($code_value = array_shift($to)) {
            $ord = hexdec($code_value);
            if ($ord > 127) {
                // Skip decompositions containing non-ASCII chars
                continue 2;
            }
            array_push($code_decomp, chr($ord));
        }
        // Keys have the form "U+XXXX"; strip the prefix before converting
        $ascii[$this->UnumberToChar(hexdec(substr($from, 2)))] = join('', $code_decomp);
    }
    // Add numeric decompositions
    foreach ($number as $from => $to) {
        $utf8_char = $this->UnumberToChar(hexdec(substr($from, 2)));
        if (!isset($ascii[$utf8_char])) {
            $ascii[$utf8_char] = $to;
        }
    }
    if ($cacheFileCase) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
    }
    if ($cacheFileASCII) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
    }
    return 3;
}
1425
/**
 * This function initializes the folding table for a charset other than UTF-8.
 * This function is automatically called by the case folding functions.
 *
 * The table is derived from the UTF-8 case folding data: every character of the
 * charset's conversion table is looked up there and the result is converted
 * back into the charset. The plain ASCII letters a-z / A-Z are added afterwards.
 * The result is stored in $this->caseFolding[$charset] and cached in typo3temp/cs/.
 *
 * @param string $charset Charset for which to initialize case folding.
 * @return int|bool Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
public function initCaseFolding($charset) {
    // Only process if the case table is not yet loaded:
    if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset])) {
        return 1;
    }
    // Use cached version if possible; a damaged cache falls through to a rebuild
    $cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/cs/cscase_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $cached = unserialize(GeneralUtility::getUrl($cacheFile));
        if (is_array($cached)) {
            $this->caseFolding[$charset] = $cached;
            return 2;
        }
    }
    // init UTF-8 conversion for this charset
    if (!$this->initCharset($charset)) {
        return FALSE;
    }
    // UTF-8 case folding is used as the base conversion table
    if (!$this->initUnicodeData('case')) {
        return FALSE;
    }
    $nochar = chr($this->noCharByteVal);
    $directions = array('toUpper', 'toLower', 'toTitle');
    // Start with all three maps present so that later lookups never hit a missing key
    $folding = array_fill_keys($directions, array());
    foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
        // Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
        $c = $this->utf8_decode($utf8, $charset);
        foreach ($directions as $direction) {
            if (!isset($this->caseFolding['utf-8'][$direction][$utf8])) {
                // No folding defined for this character.
                // NOTE(review): assumes utf8_decode() of an empty value yields '' (which was skipped before as well) - confirm.
                continue;
            }
            $cc = $this->utf8_decode($this->caseFolding['utf-8'][$direction][$utf8], $charset);
            // Skip foldings that cannot be expressed in the target charset
            if ($cc !== '' && $cc !== $nochar) {
                $folding[$direction][$c] = $cc;
            }
        }
    }
    // Add the ASCII case table
    foreach (range('a', 'z') as $lowerChar) {
        $upperChar = chr(ord($lowerChar) - 32);
        $folding['toUpper'][$lowerChar] = $upperChar;
        $folding['toLower'][$upperChar] = $lowerChar;
    }
    $this->caseFolding[$charset] = $folding;
    if ($cacheFile) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($folding));
    }
    return 3;
}
1486
/**
 * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
 * This function is automatically called by the ASCII transliteration functions.
 *
 * Every character of the charset's conversion table that has an entry in the
 * UTF-8 transliteration table gets that entry, keyed by the character in the
 * charset's own encoding. The result is stored in $this->toASCII[$charset]
 * (an empty array if nothing maps) and cached in typo3temp/cs/.
 *
 * @param string $charset Charset for which to initialize conversion.
 * @return int|bool Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
public function initToASCII($charset) {
    // Only process if the table is not yet loaded:
    if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) {
        return 1;
    }
    // Use cached version if possible; a damaged cache falls through to a rebuild
    $cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/cs/csascii_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $cached = unserialize(GeneralUtility::getUrl($cacheFile));
        if (is_array($cached)) {
            $this->toASCII[$charset] = $cached;
            return 2;
        }
    }
    // Init UTF-8 conversion for this charset
    if (!$this->initCharset($charset)) {
        return FALSE;
    }
    // UTF-8/ASCII transliteration is used as the base conversion table
    if (!$this->initUnicodeData('ascii')) {
        return FALSE;
    }
    // Always end up with an array, even if no character of the charset has a transliteration
    $table = array();
    foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
        // Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
        $c = $this->utf8_decode($utf8, $charset);
        if (isset($this->toASCII['utf-8'][$utf8])) {
            $table[$c] = $this->toASCII['utf-8'][$utf8];
        }
    }
    $this->toASCII[$charset] = $table;
    if ($cacheFile) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($table));
    }
    return 3;
}
1527
1528 /********************************************
1529 *
1530 * String operation functions
1531 *
1532 ********************************************/
/**
 * Returns a part of a string.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * Depending on $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] the work is
 * done by mbstring, iconv or the built-in implementations of this class.
 * Charsets that are neither UTF-8, EUC based nor registered as two- or four-byte
 * sets are treated as single-byte encodings.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param int $start Start position (character position)
 * @param int $len Length (in characters); NULL returns everything from $start to the end
 * @return string The substring
 * @see substr(), mb_substr()
 */
public function substr($charset, $string, $start, $len = NULL) {
    if ($len === 0 || $string === '') {
        return '';
    }
    $backend = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] : '';
    if ($backend === 'mbstring') {
        if ($len === NULL) {
            // The charset can only be passed after a length, so span the whole
            // string instead of switching the global internal encoding
            $len = mb_strlen($string, $charset);
        }
        return mb_substr($string, $start, $len, $charset);
    }
    if ($backend === 'iconv') {
        if ($len === NULL) {
            // Same approach as for mbstring; avoids iconv_set_encoding()
            $len = iconv_strlen($string, $charset);
        }
        return iconv_substr($string, $start, $len, $charset);
    }
    if ($charset === 'utf-8') {
        return $this->utf8_substr($string, $start, $len);
    }
    if (!empty($this->eucBasedSets[$charset])) {
        return $this->euc_substr($string, $start, $charset, $len);
    }
    // Fixed-width encodings: translate character positions into byte positions.
    // Everything not registered is treated as single-byte encoding.
    $bytesPerChar = 1;
    if (!empty($this->twoByteSets[$charset])) {
        $bytesPerChar = 2;
    } elseif (!empty($this->fourByteSets[$charset])) {
        $bytesPerChar = 4;
    }
    if ($len === NULL) {
        // Must not be multiplied: NULL * n is 0 and would yield an empty result
        return substr($string, $start * $bytesPerChar);
    }
    return substr($string, $start * $bytesPerChar, $len * $bytesPerChar);
}
1586
/**
 * Counts the number of characters.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * Depending on $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] the work is
 * done by mbstring, iconv or the built-in implementations of this class.
 * For two- and four-byte charsets an incomplete trailing character is not counted.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @return int The number of characters
 * @see strlen()
 */
public function strlen($charset, $string) {
    $backend = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] : '';
    if ($backend === 'mbstring') {
        return mb_strlen($string, $charset);
    }
    if ($backend === 'iconv') {
        return iconv_strlen($string, $charset);
    }
    if ($charset === 'utf-8') {
        return $this->utf8_strlen($string);
    }
    if (!empty($this->eucBasedSets[$charset])) {
        return $this->euc_strlen($string, $charset);
    }
    if (!empty($this->twoByteSets[$charset])) {
        // Cast keeps the result an integer for odd byte counts
        return (int)(strlen($string) / 2);
    }
    if (!empty($this->fourByteSets[$charset])) {
        return (int)(strlen($string) / 4);
    }
    // Treat everything else as single-byte encoding
    return strlen($string);
}
1613
/**
 * Crops a string with the help of the mbstring functions.
 *
 * A positive $len keeps the first $len characters and appends $crop, a negative
 * $len keeps the last abs($len) characters and prepends $crop. The string is
 * returned untouched if $len is 0 or the string is not longer than abs($len).
 *
 * @param string $charset The character set
 * @param string $string String to be cropped
 * @param int $len Crop length (in characters)
 * @param string $crop Crop signifier
 * @return string The shortened string
 * @see mb_strlen(), mb_substr()
 */
protected function cropMbstring($charset, $string, $len, $crop = '') {
    if ((int)$len === 0) {
        return $string;
    }
    $charCount = mb_strlen($string, $charset);
    if ($charCount <= abs($len)) {
        // Nothing to cut away
        return $string;
    }
    return $len > 0
        ? mb_substr($string, 0, $len, $charset) . $crop
        : $crop . mb_substr($string, $len, $charCount, $charset);
}
1635
/**
 * Truncates a string and pre-/appends a string.
 * Unit tested by Kasper
 *
 * A positive $len keeps the first $len characters and appends $crop, a negative
 * $len keeps the last abs($len) characters and prepends $crop. The string is
 * returned untouched if $len is 0 or reaches beyond the string.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param int $len Length (in characters)
 * @param string $crop Crop signifier
 * @return string The shortened string
 * @see substr(), mb_strimwidth()
 */
public function crop($charset, $string, $len, $crop = '') {
    $backend = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] : '';
    if ($backend === 'mbstring') {
        return $this->cropMbstring($charset, $string, $len, $crop);
    }
    $len = (int)$len;
    if ($len === 0) {
        return $string;
    }
    // Determine the byte position at which the string has to be cut
    if ($charset === 'utf-8') {
        $i = $this->utf8_char2byte_pos($string, $len);
    } elseif (!empty($this->eucBasedSets[$charset])) {
        $i = $this->euc_char2byte_pos($string, $len, $charset);
    } elseif ($len > 0) {
        $i = $len;
    } else {
        $i = strlen($string) + $len;
        if ($i <= 0) {
            $i = FALSE;
        }
    }
    // $len outside actual string length
    if ($i === FALSE) {
        return $string;
    }
    $byteCount = strlen($string);
    if ($len > 0) {
        // Only crop if there is something left behind the cut position
        if ($i < $byteCount) {
            return substr($string, 0, $i) . $crop;
        }
    } elseif ($i > 0 && $i <= $byteCount) {
        // Only crop if there is something in front of the cut position
        return $crop . substr($string, $i);
    }
    return $string;
}
1684
/**
 * Cuts a string short at a given byte length.
 *
 * The cut never splits a character: for multi-byte charsets the length is
 * reduced to the nearest character boundary.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param int $len The byte length
 * @return string The shortened string
 * @see mb_strcut()
 */
public function strtrunc($charset, $string, $len) {
    if ($len <= 0) {
        return '';
    }
    $backend = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] : '';
    if ($backend === 'mbstring') {
        return mb_strcut($string, 0, $len, $charset);
    }
    if ($charset === 'utf-8') {
        return $this->utf8_strtrunc($string, $len);
    }
    if (!empty($this->eucBasedSets[$charset])) {
        return $this->euc_strtrunc($string, $len, $charset);
    }
    if (!empty($this->twoByteSets[$charset])) {
        // Realign to position dividable by two
        $len -= $len % 2;
    } elseif (!empty($this->fourByteSets[$charset])) {
        // Realign to position dividable by four
        $len -= $len % 4;
    }
    // Treat everything else as single-byte encoding
    return substr($string, 0, $len);
}
1716
/**
 * Converts every character of a string to lower or upper case.
 * In contrast to strtolower() and strtoupper() the result does not depend on the locale.
 * The length of the string may change,
 * eg. lower case German "ß" (sharp S) becomes upper case "SS"
 * Unit-tested by Kasper
 * Real case folding is language dependent, this method ignores this fact.
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert case for
 * @param string $case Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
 * @return string The converted string
 * @see strtolower(), strtoupper()
 */
public function conv_case($charset, $string, $case) {
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] === 'mbstring') {
        return $case === 'toLower'
            ? mb_strtolower($string, $charset)
            : mb_strtoupper($string, $charset);
    }
    if ($charset === 'utf-8') {
        return $this->utf8_char_mapping($string, 'case', $case);
    }
    if (isset($this->eucBasedSets[$charset])) {
        return $this->euc_char_mapping($string, $charset, 'case', $case);
    }
    // Any other charset is handled as single-byte encoding
    return $this->sb_char_mapping($string, $charset, 'case', $case);
}
1748
/**
 * Charset aware counterpart of lcfirst() / ucfirst(): converts the case of the
 * first character only and leaves the rest of the string untouched.
 *
 * @param string $charset Character set of string
 * @param string $string Input string
 * @param string $case Case keyword, passed on to conv_case()
 * @return string The string with its first character converted
 * @see \TYPO3\CMS\Core\Charset\CharsetConverter::conv_case()
 */
public function convCaseFirst($charset, $string, $case) {
    $convertedHead = $this->conv_case($charset, $this->substr($charset, $string, 0, 1), $case);
    return $convertedHead . $this->substr($charset, $string, 1);
}
1764
/**
 * Transliterates special chars (like æøåÆØÅ, umlauts etc) to ASCII equivalents (usually double-bytes, like æ => ae etc.)
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert
 * @return string The converted string
 */
public function specCharsToASCII($charset, $string) {
    if ($charset === 'utf-8') {
        return $this->utf8_char_mapping($string, 'ascii');
    }
    if (isset($this->eucBasedSets[$charset])) {
        return $this->euc_char_mapping($string, $charset, 'ascii');
    }
    // Any other charset is handled as single-byte encoding
    return $this->sb_char_mapping($string, $charset, 'ascii');
}
1783
/**
 * Converts the language codes that we get from the client (usually HTTP_ACCEPT_LANGUAGE)
 * into a TYPO3-readable language code
 *
 * The client languages are tried in order of descending quality ("q") value;
 * languages of equal quality keep the order in which the client listed them.
 * For each language the full code (eg. "pt-br") is tried first, then the code
 * without its country part.
 *
 * @param string $languageCodesList List of language codes. something like 'de,en-us;q=0.9,de-de;q=0.7,es-cl;q=0.6,en;q=0.4,es;q=0.3,zh;q=0.1'
 * @return string A preferred language that TYPO3 supports, or "default" if none found
 */
public function getPreferredClientLanguage($languageCodesList) {
    $allLanguageCodes = array();
    $selectedLanguage = 'default';
    // Get all languages where TYPO3 code is the same as the ISO code
    foreach (array_keys($this->charSetArray) as $typo3Lang) {
        $allLanguageCodes[$typo3Lang] = $typo3Lang;
    }
    // Get all languages where TYPO3 code differs from ISO code
    // or needs the country part
    // the iso codes will here overwrite the default typo3 language in the key
    foreach ($this->locales->getIsoMapping() as $typo3Lang => $isoLang) {
        $allLanguageCodes[$typo3Lang] = str_replace('_', '-', $isoLang);
    }
    // Move the iso codes to the keys (because we're comparing the keys with "isset" later on)
    $allLanguageCodes = array_flip($allLanguageCodes);
    // Collect quality and list position for each client language
    $qualities = array();
    $positions = array();
    foreach (GeneralUtility::trimExplode(',', $languageCodesList) as $preferredLanguage) {
        $quality = 1.0;
        if (strpos($preferredLanguage, ';q=') !== FALSE) {
            list($preferredLanguage, $quality) = explode(';q=', $preferredLanguage);
        }
        if (!isset($positions[$preferredLanguage])) {
            $positions[$preferredLanguage] = count($positions);
        }
        $qualities[$preferredLanguage] = (float)$quality;
    }
    // Highest quality first; the list position breaks ties so that the result
    // does not depend on the sort implementation
    uksort($qualities, function ($a, $b) use ($qualities, $positions) {
        if ($qualities[$a] != $qualities[$b]) {
            return $qualities[$a] < $qualities[$b] ? 1 : -1;
        }
        return $positions[$a] - $positions[$b];
    });
    foreach (array_keys($qualities) as $preferredLanguage) {
        if (isset($allLanguageCodes[$preferredLanguage])) {
            $selectedLanguage = $allLanguageCodes[$preferredLanguage];
            break;
        }
        // Strip the country code from the end
        $codeParts = explode('-', $preferredLanguage);
        $languagePart = $codeParts[0];
        if (isset($allLanguageCodes[$languagePart])) {
            $selectedLanguage = $allLanguageCodes[$languagePart];
            break;
        }
    }
    if (!$selectedLanguage || $selectedLanguage === 'en') {
        $selectedLanguage = 'default';
    }
    return $selectedLanguage;
}
1836
1837 /********************************************
1838 *
1839 * Internal string operation functions
1840 *
1841 ********************************************/
/**
 * Maps all characters of a string in a single byte charset.
 *
 * Every byte of $str that has an entry in the selected table is replaced by
 * that entry, all other bytes are copied unchanged. The string is returned
 * untouched if the table cannot be initialized or $mode is unknown.
 *
 * @param string $str The string
 * @param string $charset The charset
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string
 */
public function sb_char_mapping($str, $charset, $mode, $opt = '') {
    switch ($mode) {
        case 'case':
            if (!$this->initCaseFolding($charset)) {
                return $str;
            }
            // A missing table (eg. unknown $opt) maps nothing
            $map = isset($this->caseFolding[$charset][$opt]) ? $this->caseFolding[$charset][$opt] : array();
            break;
        case 'ascii':
            if (!$this->initToASCII($charset)) {
                return $str;
            }
            $map = isset($this->toASCII[$charset]) ? $this->toASCII[$charset] : array();
            break;
        default:
            return $str;
    }
    $str = (string)$str;
    $out = '';
    // Iterate up to the byte length instead of probing offsets beyond the end
    $byteCount = strlen($str);
    for ($i = 0; $i < $byteCount; $i++) {
        $c = $str[$i];
        $out .= isset($map[$c]) ? $map[$c] : $c;
    }
    return $out;
}
1881
1882 /********************************************
1883 *
1884 * Internal UTF-8 string operation functions
1885 *
1886 ********************************************/
/**
 * Returns a part of a UTF-8 string.
 * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
 *
 * @param string $str UTF-8 string
 * @param int $start Start position (character position)
 * @param int $len Length (in characters)
 * @return string|bool The substring, FALSE if a positive $start lies outside the string
 * @see substr()
 */
public function utf8_substr($str, $start, $len = NULL) {
    if ((string)$len === '0') {
        return '';
    }
    $byte_start = $this->utf8_char2byte_pos($str, $start);
    if ($byte_start === FALSE) {
        if ($start > 0) {
            // $start outside string length
            return FALSE;
        }
        // No byte position for a non-positive $start: begin at the first byte
        $byte_start = 0;
    }
    $str = substr($str, $byte_start);
    if ($len != NULL) {
        $byte_end = $this->utf8_char2byte_pos($str, $len);
        if ($byte_end === FALSE) {
            // $len outside actual string length:
            // when length is less than zero and exceeds, then we return blank string.
            return $len < 0 ? '' : $str;
        }
        return substr($str, 0, $byte_end);
    }
    return $str;
}
1924
/**
 * Counts the number of characters of a string in UTF-8.
 * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
 *
 * @param string $str UTF-8 multibyte character string
 * @return int The number of characters
 * @see strlen()
 */
public function utf8_strlen($str) {
    $str = (string)$str;
    $n = 0;
    // Iterate up to the byte length instead of probing offsets beyond the end
    $byteCount = strlen($str);
    for ($i = 0; $i < $byteCount; $i++) {
        // Single bytes (0xxxxxxx) and multi-byte starting bytes (11xxxxxx) begin
        // a character, continuation bytes (10xxxxxx) do not
        if ((ord($str[$i]) & 192) !== 128) {
            $n++;
        }
    }
    return $n;
}
1947
/**
 * Truncates a string in UTF-8 short at a given byte length.
 *
 * The result never ends with an incomplete multi-byte character: if the cut
 * would fall inside a character, that character is dropped completely.
 *
 * @param string $str UTF-8 multibyte character string
 * @param int $len The byte length
 * @return string The shortened string
 * @see mb_strcut()
 */
public function utf8_strtrunc($str, $len) {
    $str = (string)$str;
    $len = (int)$len;
    if ($len <= 0) {
        return '';
    }
    if ($len > strlen($str)) {
        // Nothing to cut away
        return $str;
    }
    $i = $len - 1;
    // Part of a multibyte sequence
    if (ord($str[$i]) & 128) {
        // Walk back to the byte that starts the sequence
        for (; $i > 0 && !(ord($str[$i]) & 64); $i--) {

        }
        $lead = ord($str[$i]);
        if (!($lead & 64)) {
            // Reached the beginning without finding a starting byte
            return '';
        }
        // Sanity check: calculate number of bytes announced by the starting byte
        for ($bc = 0, $mbs = $lead; $mbs & 128; $mbs = $mbs << 1) {
            $bc++;
        }
        if ($bc + $i > $len) {
            // The last character does not fit completely, drop it
            return substr($str, 0, $i);
        }
    }
    return substr($str, 0, $len);
}
1977
1978 /**
1979 * Find position of first occurrence of a string, both arguments are in UTF-8.
1980 *
1981 * @param string $haystack UTF-8 string to search in
1982 * @param string $needle UTF-8 string to search for
1983 * @param int $offset Position to start the search
1984 * @return int|bool The character position, or FALSE if the needle was not found or the offset lies beyond the string
1985 * @see strpos()
1986 */
public function utf8_strpos($haystack, $needle, $offset = 0) {
	// Delegate to the configured multi-byte extension when there is one.
	$backend = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];
	if ($backend === 'mbstring') {
		return mb_strpos($haystack, $needle, $offset, 'utf-8');
	}
	if ($backend === 'iconv') {
		return iconv_strpos($haystack, $needle, $offset, 'utf-8');
	}
	// Native fallback: search on byte level, converting the character
	// offset to a byte offset first and the byte result back afterwards.
	$startByte = $this->utf8_char2byte_pos($haystack, $offset);
	// FALSE here means the offset lies beyond the string length
	$foundByte = $startByte === FALSE ? FALSE : strpos($haystack, $needle, $startByte);
	// FALSE here means: offset invalid or needle not found
	return $foundByte === FALSE ? FALSE : $this->utf8_byte2char_pos($haystack, $foundByte);
}
2005
2006 /**
2007 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
2008 *
2009 * @param string $haystack UTF-8 string to search in
2010 * @param string $needle UTF-8 character to search for (single character)
2011 * @return int|bool The character position, or FALSE if the needle was not found
2012 * @see strrpos()
2013 */
public function utf8_strrpos($haystack, $needle) {
	// Delegate to the configured multi-byte extension when there is one.
	$backend = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];
	if ($backend === 'mbstring') {
		// mb_strrpos() takes the offset as third and the encoding as fourth
		// argument. Passing the encoding in third position is a deprecated
		// legacy form that fails with a TypeError on PHP 8.
		return mb_strrpos($haystack, $needle, 0, 'utf-8');
	} elseif ($backend === 'iconv') {
		// iconv_strrpos() has no offset argument; the encoding is third.
		return iconv_strrpos($haystack, $needle, 'utf-8');
	}
	// Native fallback: find the last occurrence on byte level ...
	$byte_pos = strrpos($haystack, $needle);
	if ($byte_pos === FALSE) {
		// Needle not found
		return FALSE;
	}
	// ... and translate the byte position into a character position
	return $this->utf8_byte2char_pos($haystack, $byte_pos);
}
2027
2028 /**
2029 * Translates a character position into an 'absolute' byte position.
2030 * Unit tested by Kasper.
2031 *
2032 * @param string $str UTF-8 string
2033 * @param int $pos Character position (negative values start from the end)
2034 * @return int|bool Byte position, or FALSE if the character position lies outside the string
2035 */
public function utf8_char2byte_pos($str, $pos) {
	// Walks the string byte by byte (forwards for $pos >= 0, backwards for
	// $pos < 0) until abs($pos) character start bytes have been passed.
	// Returns FALSE when the walk leaves the string before that.
	$str = (string)$str;
	$byteLength = strlen($str);
	// Number of characters found
	$n = 0;
	// Number of characters wanted
	$p = abs($pos);
	if ($pos >= 0) {
		$i = 0;
		$d = 1;
	} else {
		$i = $byteLength - 1;
		$d = -1;
	}
	// Explicit bounds instead of probing $str[$i]: an out-of-range offset
	// raises a notice/warning, and a negative offset addresses bytes from
	// the END of the string as of PHP 7.1, which corrupts the backward walk.
	for (; $i >= 0 && $i < $byteLength && $n < $p; $i += $d) {
		// Count single-byte chars (0xxxxxxx) and multi-byte lead bytes
		// (11xxxxxx); continuation bytes (10xxxxxx) are not counted
		if ((ord($str[$i]) & 192) !== 128) {
			$n++;
		}
	}
	if ($i < 0 || $i >= $byteLength) {
		// Offset beyond string length
		return FALSE;
	}
	if ($pos >= 0) {
		// Skip trailing multi-byte data bytes
		while ($i < $byteLength && (ord($str[$i]) & 192) === 128) {
			$i++;
		}
	} else {
		// Correct offset: $i is one step before the last counted start byte
		$i++;
	}
	return $i;
}
2073
2074 /**
2075 * Translates an 'absolute' byte position into a character position.
2076 * Unit tested by Kasper.
2077 *
2078 * @param string $str UTF-8 string
2079 * @param int $pos Byte position
2080 * @return int Character position
2081 */
public function utf8_byte2char_pos($str, $pos) {
	// The character position of byte $pos is obtained by counting the
	// character start bytes at byte indexes 1..$pos (index 0 is never
	// counted, which makes the result zero-based).
	$str = (string)$str;
	$pos = (int)$pos;
	$byteLength = strlen($str);
	if ($byteLength === 0 || $pos < 0 || $pos > $byteLength) {
		// Offset beyond string length. Without this guard, bytes outside
		// the string would be read as NUL and counted as characters.
		return FALSE;
	}
	// Number of characters
	$n = 0;
	$i = $pos;
	if ($i === $byteLength) {
		// The position right behind the last byte is a character boundary
		// as well; count it without reading outside the string.
		$n = 1;
		$i--;
	}
	for (; $i > 0; $i--) {
		// Single-byte chars (0xxxxxxx) and multi-byte lead bytes (11xxxxxx)
		// start a character; continuation bytes (10xxxxxx) do not
		if ((ord($str[$i]) & 192) !== 128) {
			$n++;
		}
	}
	return $n;
}
2101
2102 /**
2103 * Maps all characters of an UTF-8 string.
2104 *
2105 * @param string $str UTF-8 string
2106 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
2107 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
2108 * @return string The converted string
2109 */
public function utf8_char_mapping($str, $mode, $opt = '') {
	// Replaces every character of $str that has an entry in the selected
	// mapping table; characters without an entry are copied unchanged.
	if (!$this->initUnicodeData($mode)) {
		// Mapping data not available: do nothing
		return $str;
	}
	switch ($mode) {
		case 'case':
			$map = &$this->caseFolding['utf-8'][$opt];
			break;
		case 'ascii':
			$map = &$this->toASCII['utf-8'];
			break;
		default:
			return $str;
	}
	$str = (string)$str;
	// Bound the loop by the byte length instead of probing $str[$i] past
	// the end of the string (raises "Uninitialized string offset").
	$byteLength = strlen($str);
	$out = '';
	for ($i = 0; $i < $byteLength; $i++) {
		$c = ord($str[$i]);
		if (($c & 192) === 192) {
			// Multi-byte starting byte (11xxxxxx): the number of leading
			// 1 bits is the number of bytes of the sequence
			for ($bc = 0; $c & 128; $c = $c << 1) {
				$bc++;
			}
			$mbc = substr($str, $i, $bc);
			$i += $bc - 1;
		} else {
			// Single-byte char (0xxxxxxx), or a stray continuation byte
			// (10xxxxxx) in malformed input. The stray byte is passed
			// through as is; previously it left $mbc untouched, so the
			// PREVIOUS character was emitted a second time.
			$mbc = $str[$i];
		}
		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}
	return $out;
}
2148
2149 /********************************************
2150 *
2151 * Internal EUC string operation functions
2152 *
2153 * Extended Unix Code:
2154 * ASCII compatible 7bit single bytes chars
2155 * 8bit two byte chars
2156 *
2157 * Shift-JIS is treated as a special case.
2158 *
2159 ********************************************/
2160 /**
2161 * Cuts a string in the EUC charset family short at a given byte length.
2162 *
2163 * @param string $str EUC multibyte character string
2164 * @param int $len The byte length
2165 * @param string $charset The charset
2166 * @return string The shortened string
2167 * @see mb_strcut()
2168 */
public function euc_strtrunc($str, $len, $charset) {
	// Cuts $str to at most $len BYTES without splitting a double-byte
	// character of the EUC family (or Shift-JIS).
	$str = (string)$str;
	if (strlen($str) <= $len) {
		// String not longer than the supplied length: nothing to cut.
		// This must be decided up front. Detecting it by "the scan ran off
		// the end" also triggers when a double-byte character straddles
		// $len and is the last character, returning more than $len bytes.
		return $str;
	}
	$sjis = $charset === 'shift_jis';
	// $i < $len < strlen($str) holds inside the loop, so $str[$i] is valid
	for ($i = 0; $i < $len; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			// Shift-JIS lead bytes: 0x80-0x9F and 0xE0-0xFF
			if ($c >= 128 && $c < 160 || $c >= 224) {
				$i++;
			}
		} else {
			// EUC: every 8-bit byte starts a double-byte character
			if ($c >= 128) {
				$i++;
			}
		}
	}
	if ($i > $len) {
		// Byte $len - 1 is the first byte of a double-byte char: drop it
		return substr($str, 0, $len - 1);
	}
	return substr($str, 0, $len);
}
2194
2195 /**
2196 * Returns a part of a string in the EUC charset family.
2197 *
2198 * @param string $str EUC multibyte character string
2199 * @param int $start Start position (character position)
2200 * @param string $charset The charset
2201 * @param int $len Length (in characters)
2202 * @return string the substring
2203 */
public function euc_substr($str, $start, $charset, $len = NULL) {
	// Character-based substr() for the EUC family: both $start and $len
	// are translated into byte positions before the byte-wise cut.
	$byte_start = $this->euc_char2byte_pos($str, $start, $charset);
	if ($byte_start === FALSE) {
		// $start outside string length
		return FALSE;
	}
	$str = substr($str, $byte_start);
	if ($len === NULL) {
		// No length given: everything from $start on
		return $str;
	}
	$len = (int)$len;
	if ($len === 0) {
		// A length of zero means an empty result (as with substr()).
		// The former loose "$len != NULL" test treated 0 like "no length".
		return '';
	}
	$byte_end = $this->euc_char2byte_pos($str, $len, $charset);
	if ($byte_end === FALSE) {
		// $len outside actual string length: a positive length keeps the
		// rest of the string; a negative length removes more characters
		// than there are, which leaves nothing (same rule as utf8_substr).
		return $len < 0 ? '' : $str;
	}
	return substr($str, 0, $byte_end);
}
2223
2224 /**
2225 * Counts the number of characters of a string in the EUC charset family.
2226 *
2227 * @param string $str EUC multibyte character string
2228 * @param string $charset The charset
2229 * @return int The number of characters
2230 * @see strlen()
2231 */
public function euc_strlen($str, $charset) {
	// Counts characters: a lead byte consumes the following byte as well,
	// every other byte is a character of its own.
	$str = (string)$str;
	// Bound the loop by the byte length instead of probing $str[$i] past
	// the end of the string (raises "Uninitialized string offset").
	$byteLength = strlen($str);
	$sjis = $charset === 'shift_jis';
	$n = 0;
	for ($i = 0; $i < $byteLength; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			// Shift-JIS lead bytes: 0x80-0x9F and 0xE0-0xFF
			$isLeadByte = $c >= 128 && $c < 160 || $c >= 224;
		} else {
			// EUC: every 8-bit byte starts a double-byte character
			$isLeadByte = $c >= 128;
		}
		if ($isLeadByte) {
			// Skip the second byte of the character
			$i++;
		}
		$n++;
	}
	return $n;
}
2250
2251 /**
2252 * Translates a character position into an 'absolute' byte position.
2253 *
2254 * @param string $str EUC multibyte character string
2255 * @param int $pos Character position (negative values start from the end)
2256 * @param string $charset The charset
2257 * @return int Byte position
2258 */
public function euc_char2byte_pos($str, $pos, $charset) {
	// Walks the string (forwards for $pos >= 0, backwards for $pos < 0)
	// until abs($pos) characters have been passed. Returns FALSE when the
	// walk leaves the string before that.
	$str = (string)$str;
	$byteLength = strlen($str);
	$sjis = $charset === 'shift_jis';
	// Number of characters seen
	$n = 0;
	// Number of characters wanted
	$p = abs($pos);
	if ($pos >= 0) {
		$i = 0;
		$d = 1;
	} else {
		$i = $byteLength - 1;
		$d = -1;
	}
	// Explicit bounds instead of probing $str[$i]: an out-of-range offset
	// raises a notice/warning, and a negative offset addresses bytes from
	// the END of the string as of PHP 7.1, which corrupts the backward walk.
	for (; $i >= 0 && $i < $byteLength && $n < $p; $i += $d) {
		$c = ord($str[$i]);
		if ($sjis) {
			// Shift-JIS lead bytes: 0x80-0x9F and 0xE0-0xFF
			if ($c >= 128 && $c < 160 || $c >= 224) {
				$i += $d;
			}
		} else {
			// EUC: an 8-bit byte belongs to a double-byte character
			if ($c >= 128) {
				$i += $d;
			}
		}
		$n++;
	}
	if ($i < 0 || $i >= $byteLength) {
		// Offset beyond string length
		return FALSE;
	}
	if ($pos < 0) {
		// Correct offset: $i is one step before the last counted character
		$i++;
	}
	return $i;
}
2295
2296 /**
2297 * Maps all characters of a string in the EUC charset family.
2298 *
2299 * @param string $str EUC multibyte character string
2300 * @param string $charset The charset
2301 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
2302 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
2303 * @return string The converted string
2304 */
public function euc_char_mapping($str, $charset, $mode, $opt = '') {
	// Replaces every character of $str that has an entry in the selected
	// mapping table; characters without an entry are copied unchanged.
	switch ($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) {
				// Table not available: do nothing
				return $str;
			}
			$map = &$this->caseFolding[$charset][$opt];
			break;
		case 'ascii':
			if (!$this->initToASCII($charset)) {
				// Table not available: do nothing
				return $str;
			}
			$map = &$this->toASCII[$charset];
			break;
		default:
			return $str;
	}
	$str = (string)$str;
	// Bound the loop by the byte length instead of probing $str[$i] past
	// the end of the string (raises "Uninitialized string offset").
	$byteLength = strlen($str);
	$sjis = $charset === 'shift_jis';
	$out = '';
	for ($i = 0; $i < $byteLength; $i++) {
		$mbc = $str[$i];
		$c = ord($mbc);
		if ($sjis) {
			// Shift-JIS lead bytes: 0x80-0x9F and 0xE0-0xFF
			$isLeadByte = $c >= 128 && $c < 160 || $c >= 224;
		} else {
			// EUC: every 8-bit byte starts a double-byte character
			$isLeadByte = $c >= 128;
		}
		if ($isLeadByte) {
			// A double-byte char: take both bytes and skip the second one
			$mbc = substr($str, $i, 2);
			$i++;
		}
		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}
	return $out;
}
2350
2351 }