[CLEANUP] Ensure variables are initialized and fix code smells
[Packages/TYPO3.CMS.git] / typo3 / sysext / core / Classes / Charset / CharsetConverter.php
1 <?php
2 namespace TYPO3\CMS\Core\Charset;
3
4 /*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use TYPO3\CMS\Core\Localization\Locales;
18 use TYPO3\CMS\Core\SingletonInterface;
19 use TYPO3\CMS\Core\Utility\ExtensionManagementUtility;
20 use TYPO3\CMS\Core\Utility\GeneralUtility;
21
22 /**
23 * Notes on UTF-8
24 *
25 * Functions working on UTF-8 strings:
26 *
27 * - strchr/strstr
28 * - strrchr
29 * - substr_count
30 * - implode/explode/join
31 *
32 * Functions nearly working on UTF-8 strings:
33 *
34 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
35 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
36 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
37 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
38 * - preg_*: UTF-8 support is compiled into PHP by default nowadays, but could be unavailable; the "u" pattern modifier must be used
39 *
40 * Functions NOT working on UTF-8 strings:
41 *
42 * - str*cmp
43 * - stristr
44 * - stripos
45 * - substr
46 * - strrev
47 * - split/spliti
48 * - ...
49 */
50
51 /**
52 * Class for conversion between charsets
53 */
54 class CharsetConverter implements SingletonInterface
55 {
56 /**
57 * ASCII Value for chars with no equivalent.
58 *
59 * @var int
60 */
61 public $noCharByteVal = 63;
62
63 /**
64 * This is the array where parsed conversion tables are stored (cached)
65 *
66 * @var array
67 */
68 public $parsedCharsets = [];
69
70 /**
71 * An array where case folding data will be stored (cached)
72 *
73 * @var array
74 */
75 public $caseFolding = [];
76
77 /**
78 * An array where charset-to-ASCII mappings are stored (cached)
79 *
80 * @var array
81 */
82 public $toASCII = [];
83
84 /**
85 * This tells the converter which charsets have two bytes per char:
86 *
87 * @var array
88 */
89 public $twoByteSets = [
90 'ucs-2' => 1
91 ];
92
93 /**
94 * This tells the converter which charsets have four bytes per char:
95 *
96 * @var array
97 * @deprecated since TYPO3 v8, will be removed with TYPO3 v9
98 */
99 public $fourByteSets = [
100 'ucs-4' => 1, // 4-byte Unicode
101 'utf-32' => 1
102 ];
103
104 /**
105 * This tells the converter which charsets use a scheme like the Extended Unix Code:
106 *
107 * @var array
108 */
109 public $eucBasedSets = [
110 'gb2312' => 1, // Chinese, simplified.
111 'big5' => 1, // Chinese, traditional.
112 'euc-kr' => 1, // Korean
113 'shift_jis' => 1
114 ];
115
116 /**
117 * @link http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
118 * @link http://czyborra.com/charsets/iso8859.html
119 *
120 * @var array
121 */
122 public $synonyms = [
123 'us' => 'ascii',
124 'us-ascii' => 'ascii',
125 'cp819' => 'iso-8859-1',
126 'ibm819' => 'iso-8859-1',
127 'iso-ir-100' => 'iso-8859-1',
128 'iso-ir-101' => 'iso-8859-2',
129 'iso-ir-109' => 'iso-8859-3',
130 'iso-ir-110' => 'iso-8859-4',
131 'iso-ir-144' => 'iso-8859-5',
132 'iso-ir-127' => 'iso-8859-6',
133 'iso-ir-126' => 'iso-8859-7',
134 'iso-ir-138' => 'iso-8859-8',
135 'iso-ir-148' => 'iso-8859-9',
136 'iso-ir-157' => 'iso-8859-10',
137 'iso-ir-179' => 'iso-8859-13',
138 'iso-ir-199' => 'iso-8859-14',
139 'iso-ir-203' => 'iso-8859-15',
140 'csisolatin1' => 'iso-8859-1',
141 'csisolatin2' => 'iso-8859-2',
142 'csisolatin3' => 'iso-8859-3',
143 'csisolatin5' => 'iso-8859-9',
144 'csisolatin8' => 'iso-8859-14',
145 'csisolatin9' => 'iso-8859-15',
146 'csisolatingreek' => 'iso-8859-7',
147 'iso-celtic' => 'iso-8859-14',
148 'latin1' => 'iso-8859-1',
149 'latin2' => 'iso-8859-2',
150 'latin3' => 'iso-8859-3',
151 'latin5' => 'iso-8859-9',
152 'latin6' => 'iso-8859-10',
153 'latin8' => 'iso-8859-14',
154 'latin9' => 'iso-8859-15',
155 'l1' => 'iso-8859-1',
156 'l2' => 'iso-8859-2',
157 'l3' => 'iso-8859-3',
158 'l5' => 'iso-8859-9',
159 'l6' => 'iso-8859-10',
160 'l8' => 'iso-8859-14',
161 'l9' => 'iso-8859-15',
162 'cyrillic' => 'iso-8859-5',
163 'arabic' => 'iso-8859-6',
164 'tis-620' => 'iso-8859-11',
165 'win874' => 'windows-874',
166 'win1250' => 'windows-1250',
167 'win1251' => 'windows-1251',
168 'win1252' => 'windows-1252',
169 'win1253' => 'windows-1253',
170 'win1254' => 'windows-1254',
171 'win1255' => 'windows-1255',
172 'win1256' => 'windows-1256',
173 'win1257' => 'windows-1257',
174 'win1258' => 'windows-1258',
175 'cp1250' => 'windows-1250',
176 'cp1251' => 'windows-1251',
177 'cp1252' => 'windows-1252',
178 'ms-ee' => 'windows-1250',
179 'ms-ansi' => 'windows-1252',
180 'ms-greek' => 'windows-1253',
181 'ms-turk' => 'windows-1254',
182 'winbaltrim' => 'windows-1257',
183 'koi-8ru' => 'koi-8r',
184 'koi8r' => 'koi-8r',
185 'cp878' => 'koi-8r',
186 'mac' => 'macroman',
187 'macintosh' => 'macroman',
188 'euc-cn' => 'gb2312',
189 'x-euc-cn' => 'gb2312',
190 'euccn' => 'gb2312',
191 'cp936' => 'gb2312',
192 'big-5' => 'big5',
193 'cp950' => 'big5',
194 'eucjp' => 'euc-jp',
195 'sjis' => 'shift_jis',
196 'shift-jis' => 'shift_jis',
197 'cp932' => 'shift_jis',
198 'cp949' => 'euc-kr',
199 'utf7' => 'utf-7',
200 'utf8' => 'utf-8',
201 'utf16' => 'utf-16',
202 'utf32' => 'utf-32',
203 'ucs2' => 'ucs-2',
204 'ucs4' => 'ucs-4'
205 ];
206
207 /**
208 * TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
209 * Empty values mean "utf-8"
210 *
211 * @var array
212 * @deprecated since TYPO3 v8, will be removed in TYPO3 v9, use Locales
213 */
214 public $charSetArray = [
215 'af' => '',
216 'ar' => 'iso-8859-6',
217 'ba' => 'iso-8859-2',
218 'bg' => 'windows-1251',
219 'br' => '',
220 'ca' => 'iso-8859-15',
221 'ch' => 'gb2312',
222 'cs' => 'windows-1250',
223 'cz' => 'windows-1250',
224 'da' => '',
225 'de' => '',
226 'dk' => '',
227 'el' => 'iso-8859-7',
228 'eo' => 'utf-8',
229 'es' => '',
230 'et' => 'iso-8859-4',
231 'eu' => '',
232 'fa' => 'utf-8',
233 'fi' => '',
234 'fo' => 'utf-8',
235 'fr' => '',
236 'fr_CA' => '',
237 'ga' => '',
238 'ge' => 'utf-8',
239 'gl' => '',
240 'gr' => 'iso-8859-7',
241 'he' => 'utf-8',
242 'hi' => 'utf-8',
243 'hk' => 'big5',
244 'hr' => 'windows-1250',
245 'hu' => 'iso-8859-2',
246 'is' => 'utf-8',
247 'it' => '',
248 'ja' => 'shift_jis',
249 'jp' => 'shift_jis',
250 'ka' => 'utf-8',
251 'kl' => 'utf-8',
252 'km' => 'utf-8',
253 'ko' => 'euc-kr',
254 'kr' => 'euc-kr',
255 'lt' => 'windows-1257',
256 'lv' => 'utf-8',
257 'ms' => '',
258 'my' => '',
259 'nl' => '',
260 'no' => '',
261 'pl' => 'iso-8859-2',
262 'pt' => '',
263 'pt_BR' => '',
264 'qc' => '',
265 'ro' => 'iso-8859-2',
266 'ru' => 'windows-1251',
267 'se' => '',
268 'si' => 'windows-1250',
269 'sk' => 'windows-1250',
270 'sl' => 'windows-1250',
271 'sq' => 'utf-8',
272 'sr' => 'utf-8',
273 'sv' => '',
274 'th' => 'iso-8859-11',
275 'tr' => 'iso-8859-9',
276 'ua' => 'windows-1251',
277 'uk' => 'windows-1251',
278 'vi' => 'utf-8',
279 'vn' => 'utf-8',
280 'zh' => 'big5'
281 ];
282
/**
 * Normalizes a charset identifier.
 *
 * The name is lowercased and stripped of surrounding whitespace; if the result
 * is a known alias in $this->synonyms, the canonical name is returned instead.
 *
 * @param string $charset Input charset
 * @return string Normalized charset
 */
public function parse_charset($charset)
{
    $normalized = trim(strtolower($charset));
    return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
}
297
298 /********************************************
299 *
300 * Charset Conversion functions
301 *
302 ********************************************/
/**
 * Convert from one charset to another charset.
 *
 * The mbstring extension is tried first whenever its result is acceptable,
 * i.e. when the target is UTF-8 or no entity fallback was requested. If
 * mbstring cannot handle one of the charsets, the conversion falls back to
 * the table based utf8_encode() / utf8_decode() round trip.
 *
 * @param string $inputString Input string
 * @param string $fromCharset From charset (the current charset of the string)
 * @param string $toCharset To charset (the output charset wanted)
 * @param bool $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Converted string
 * @see convArray()
 */
public function conv($inputString, $fromCharset, $toCharset, $useEntityForNoChar = false)
{
    if ($fromCharset === $toCharset) {
        return $inputString;
    }
    // PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
    if ($toCharset === 'utf-8' || !$useEntityForNoChar) {
        try {
            // Returns FALSE for unsupported charsets on PHP < 8 ...
            $convertedString = mb_convert_encoding($inputString, $toCharset, $fromCharset);
        } catch (\ValueError $e) {
            // ... and throws on PHP >= 8; treat both the same so the fallback below is still reached
            $convertedString = false;
        }
        if ($convertedString !== false) {
            return $convertedString;
        }
    }
    if ($fromCharset !== 'utf-8') {
        $inputString = $this->utf8_encode($inputString, $fromCharset);
    }
    if ($toCharset !== 'utf-8') {
        $inputString = $this->utf8_decode($inputString, $toCharset, $useEntityForNoChar);
    }
    return $inputString;
}
334
/**
 * Converts every string value of an array from one charset to another, in place.
 * Nested arrays are processed recursively; values of any other type are left untouched.
 * NOTICE: Array is passed by reference!
 *
 * @param array $array Input array, possibly multidimensional
 * @param string $fromCharset From charset (the current charset of the string)
 * @param string $toCharset To charset (the output charset wanted)
 * @param bool $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return void
 * @see conv()
 */
public function convArray(&$array, $fromCharset, $toCharset, $useEntityForNoChar = false)
{
    foreach ($array as $key => $value) {
        if (is_string($array[$key])) {
            $array[$key] = $this->conv($array[$key], $fromCharset, $toCharset, $useEntityForNoChar);
            continue;
        }
        if (is_array($array[$key])) {
            // Recurse into the original element (not the loop copy) so changes are written back
            $this->convArray($array[$key], $fromCharset, $toCharset, $useEntityForNoChar);
        }
    }
}
356
/**
 * Converts $str from $charset to UTF-8
 *
 * Bytes up to 127 are copied as-is (unless the charset is a two-byte set, where
 * every character consists of two bytes). All other characters are looked up in
 * the parsed conversion table; characters without a mapping are replaced by the
 * byte configured in $this->noCharByteVal. A multi-byte character that is cut
 * off at the end of the input is completed with a zero byte.
 *
 * @param string $str String in local charset to convert to UTF-8
 * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return string Output string, converted to UTF-8. Empty string if the charset could not be initialized.
 */
public function utf8_encode($str, $charset)
{
    if ($charset === 'utf-8') {
        return $str;
    }
    // Parse conv. table if not already
    if (!$this->initCharset($charset)) {
        return '';
    }
    $str = (string)$str;
    $localToUtf8 = isset($this->parsedCharsets[$charset]['local']) ? $this->parsedCharsets[$charset]['local'] : [];
    $isTwoByteSet = isset($this->twoByteSets[$charset]);
    $isEucBasedSet = isset($this->eucBasedSets[$charset]);
    $noChar = chr($this->noCharByteVal);
    $strLen = strlen($str);
    $outStr = '';
    // Traverse each char in string
    for ($a = 0; $a < $strLen; $a++) {
        $ord = ord($str[$a]);
        if ($isTwoByteSet) {
            // Two bytes per char, big endian assumed. Guard against a truncated trailing character.
            $a++;
            $ord = $ord << 8 | ($a < $strLen ? ord($str[$a]) : 0);
        } elseif ($ord > 127) {
            // EUC uses two bytes above 127; we get both, advance the pointer and make $ord a 16bit int.
            // Shift-JIS: chars between 160 and 223 are single byte
            if ($isEucBasedSet && ($charset !== 'shift_jis' || $ord < 160 || $ord > 223)) {
                $a++;
                $ord = $ord * 256 + ($a < $strLen ? ord($str[$a]) : 0);
            }
        } else {
            // 7-bit ASCII is identical in UTF-8
            $outStr .= $str[$a];
            continue;
        }
        // Use the mapping from the conversion table, otherwise the "no char" replacement byte
        $outStr .= isset($localToUtf8[$ord]) ? $localToUtf8[$ord] : $noChar;
    }
    return $outStr;
}
416
/**
 * Converts $str from UTF-8 to $charset
 *
 * ASCII bytes are copied as-is. Each multi-byte UTF-8 sequence is looked up in
 * the parsed conversion table of the target charset. A sequence without a
 * mapping becomes a numeric entity (if $useEntityForNoChar is set) or the byte
 * configured in $this->noCharByteVal. A stray continuation byte is replaced by
 * that byte as well.
 *
 * @param string $str String in UTF-8 to convert to local charset
 * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param bool $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Output string, converted to local charset. Empty string if the charset could not be initialized.
 */
public function utf8_decode($str, $charset, $useEntityForNoChar = false)
{
    if ($charset === 'utf-8') {
        return $str;
    }
    // Parse conv. table if not already
    if (!$this->initCharset($charset)) {
        return '';
    }
    $noChar = chr($this->noCharByteVal);
    $strLen = strlen($str);
    $outStr = '';
    // Traverse each char in UTF-8 string
    for ($a = 0; $a < $strLen; $a++) {
        $chr = substr($str, $a, 1);
        $ord = ord($chr);
        if ($ord <= 127) {
            $outStr .= $chr;
            continue;
        }
        // A lead byte must have the 7th bit set. Otherwise we are in the middle of a byte sequence.
        if (!($ord & 64)) {
            $outStr .= $noChar;
            continue;
        }
        // Collect the complete multi-byte sequence, starting with the lead byte
        $buf = $chr;
        for ($b = 0; $b < 8; $b++) {
            // Shift left; as long as the 8th bit is set there are still bytes in the sequence
            $ord = $ord << 1;
            if (!($ord & 128)) {
                break;
            }
            $a++;
            $buf .= substr($str, $a, 1);
        }
        if (isset($this->parsedCharsets[$charset]['utf8'][$buf])) {
            // The local number
            $mByte = $this->parsedCharsets[$charset]['utf8'][$buf];
            if ($mByte > 255) {
                // Split the number (16bit word assumed) into two chars
                $outStr .= chr($mByte >> 8 & 255) . chr($mByte & 255);
            } else {
                $outStr .= chr($mByte);
            }
        } elseif ($useEntityForNoChar) {
            // Create num entity:
            $outStr .= '&#' . $this->utf8CharToUnumber($buf, true) . ';';
        } else {
            $outStr .= $noChar;
        }
    }
    return $outStr;
}
485
/**
 * Replaces every multi-byte UTF-8 character by a hexadecimal numeric entity.
 * ASCII bytes are kept; a stray continuation byte is replaced by the byte
 * configured in $this->noCharByteVal.
 *
 * @param string $str Input string
 * @return string Output string
 */
public function utf8_to_entities($str)
{
    $length = strlen($str);
    $result = '';
    $position = 0;
    while ($position < $length) {
        $byte = substr($str, $position, 1);
        $value = ord($byte);
        if ($value <= 127) {
            // Plain ASCII
            $result .= $byte;
        } elseif (!($value & 64)) {
            // Not a valid lead byte: we are in the middle of a byte sequence
            $result .= chr($this->noCharByteVal);
        } else {
            // Gather the lead byte and all continuation bytes it announces
            $sequence = $byte;
            for ($n = 0; $n < 8; $n++) {
                $value = $value << 1;
                if (!($value & 128)) {
                    break;
                }
                $position++;
                $sequence .= substr($str, $position, 1);
            }
            $result .= '&#' . $this->utf8CharToUnumber($sequence, true) . ';';
        }
        $position++;
    }
    return $result;
}
529
/**
 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars.
 * All string-HTML entities (like &amp; or &pound;) will be converted as well.
 *
 * Anything that looks like an entity but is neither a well-formed numeric
 * entity nor a known named entity is left untouched. If the regular expression
 * engine fails, the input is returned unchanged.
 *
 * @param string $str Input string, UTF-8
 * @return string Output string
 */
public function entities_to_utf8($str)
{
    $entityToChar = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT));
    $converted = preg_replace_callback(
        '/&([#[:alnum:]]*);/',
        function (array $match) use ($entityToChar) {
            $numeric = [];
            // Dec or hex entities; only digits valid for the respective base are accepted
            if (preg_match('/^#(?:[xX]([0-9a-fA-F]+)|([0-9]+))$/', $match[1], $numeric)) {
                $codePoint = $numeric[1] !== '' ? hexdec($numeric[1]) : (int)$numeric[2];
                return $this->UnumberToChar($codePoint);
            }
            // Other entities:
            if (isset($entityToChar[$match[0]])) {
                return $entityToChar[$match[0]];
            }
            // No conversion:
            return $match[0];
        },
        $str
    );
    return $converted === null ? $str : $converted;
}
567
/**
 * Splits a UTF-8 string into an array of its characters.
 * All HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) are resolved
 * first and therefore count as a single character each.
 * Despite the method name, the array holds the UTF-8 characters themselves, not integer code points.
 *
 * @param string $str Input string, UTF-8
 * @return array Output array with one UTF-8 character per element
 */
public function utf8_to_numberarray($str)
{
    // Entities must be registered as well
    $str = $this->entities_to_utf8($str);

    $length = strlen($str);
    $characters = [];
    $position = 0;
    while ($position < $length) {
        $byte = substr($str, $position, 1);
        $value = ord($byte);
        if ($value <= 127) {
            // Plain ASCII
            $characters[] = $byte;
        } elseif (!($value & 64)) {
            // Not a valid lead byte: we are in the middle of a byte sequence
            $characters[] = chr($this->noCharByteVal);
        } else {
            // Gather the lead byte and all continuation bytes it announces
            $sequence = $byte;
            for ($n = 0; $n < 8; $n++) {
                $value = $value << 1;
                if (!($value & 128)) {
                    break;
                }
                $position++;
                $sequence .= substr($str, $position, 1);
            }
            $characters[] = $sequence;
        }
        $position++;
    }
    return $characters;
}
617
/**
 * Converts a UNICODE number to a UTF-8 multibyte character
 * Algorithm based on script found at From: http://czyborra.com/utf/
 *
 * The bits of the integer are spread across the bytes; the number of high bits
 * set in the lead byte announces the length of the sequence:
 *
 * bytes | bits | representation
 * 1 | 7 | 0vvvvvvv
 * 2 | 11 | 110vvvvv 10vvvvvv
 * 3 | 16 | 1110vvvv 10vvvvvv 10vvvvvv
 * 4 | 21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 * 5 | 26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 * 6 | 31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * Numbers that do not fit into 31 bits yield the byte configured in $this->noCharByteVal.
 *
 * @param int $unicodeInteger UNICODE integer
 * @return string UTF-8 multibyte character string
 * @see utf8CharToUnumber()
 */
public function UnumberToChar($unicodeInteger)
{
    // 7-bit values are represented by a single byte
    if ($unicodeInteger < 128) {
        return chr($unicodeInteger);
    }
    // Each entry: exclusive upper bound, lead byte marker, number of continuation bytes
    $layouts = [
        [2048, 192, 1],
        [65536, 224, 2],
        [2097152, 240, 3],
        [67108864, 248, 4],
        [2147483648, 252, 5],
    ];
    foreach ($layouts as $layout) {
        list($upperBound, $leadMarker, $tailLength) = $layout;
        if ($unicodeInteger < $upperBound) {
            // Lead byte carries the topmost bits ...
            $char = chr($leadMarker | ($unicodeInteger >> (6 * $tailLength)));
            // ... followed by 6 payload bits per continuation byte, most significant first
            for ($shift = 6 * ($tailLength - 1); $shift >= 0; $shift -= 6) {
                $char .= chr(128 | (($unicodeInteger >> $shift) & 63));
            }
            return $char;
        }
    }
    // Cannot express a 32-bit character in UTF-8
    return chr($this->noCharByteVal);
}
674
/**
 * Converts a UTF-8 Multibyte character to a UNICODE number
 *
 * Only the first character of $str is evaluated. If the first byte is not a
 * multi-byte lead byte, its byte value is returned. Continuation bytes that
 * are announced by the lead byte but missing in $str count as zero. An empty
 * string yields 0.
 *
 * @param string $str UTF-8 multibyte character string
 * @param bool $hex If set, then a hex. number is returned.
 * @return int|string UNICODE integer, or with $hex set the string "x" followed by the hexadecimal number
 * @see UnumberToChar()
 */
public function utf8CharToUnumber($str, $hex = false)
{
    $str = (string)$str;
    // First char
    $lead = $str === '' ? 0 : ord($str[0]);
    $int = $lead;
    // This verifies that it IS a multi byte string
    if (($lead & 192) === 192) {
        // Each set bit following the topmost one announces one continuation byte
        $tailLength = 0;
        for ($probe = $lead << 1; $probe & 128; $probe = $probe << 1) {
            $tailLength++;
        }
        // The lead byte contributes the bits below its length marker (none for 0xFE / 0xFF)
        $int = $tailLength < 6 ? $lead & ((1 << (6 - $tailLength)) - 1) : 0;
        // Every continuation byte contributes its lower 6 bits
        for ($i = 1; $i <= $tailLength; $i++) {
            $int = $int << 6 | (isset($str[$i]) ? ord($str[$i]) & 63 : 0);
        }
    }
    return $hex ? 'x' . dechex($int) : $int;
}
710
711 /********************************************
712 *
713 * Init functions
714 *
715 ********************************************/
/**
 * This will initialize a charset for use if it's defined in the 'typo3/sysext/core/Resources/Private/Charsets/csconvtbl/' folder
 * This function is automatically called by the conversion functions
 *
 * The parsed table is stored in $this->parsedCharsets[$charset] with two maps:
 * 'local' (local char number => UTF-8 char) and 'utf8' (UTF-8 char => local
 * char number). Only local char numbers above 127 are stored. A serialized copy
 * is kept in typo3temp/var/charset/; a cache file that does not unserialize to
 * an array is ignored and the table is parsed again.
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param string $charset The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return int|bool Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and loaded (from cache or by parsing).
 * @access private
 */
public function initCharset($charset)
{
    // Only process if the charset is not yet loaded:
    if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
        return 1;
    }
    // Conversion table filename:
    $charsetConvTableFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/csconvtbl/' . $charset . '.tbl';
    if (!($charset && GeneralUtility::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile))) {
        return false;
    }
    // Cache file for charsets:
    // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
    $cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/var/charset/charset_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        // NOTE(review): unserialize() trusts the cache file content; the file is expected to be written only below.
        $cachedTable = unserialize(file_get_contents($cacheFile));
        if (is_array($cachedTable)) {
            $this->parsedCharsets[$charset] = $cachedTable;
            return 2;
        }
    }
    // The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
    $whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
    // Parse conversion table into lines:
    $lines = GeneralUtility::trimExplode(LF, file_get_contents($charsetConvTableFile), true);
    $table = ['local' => [], 'utf8' => []];
    $detectedType = '';
    foreach ($lines as $value) {
        // Comment line or blanks are ignored.
        if (!trim($value) || $value[0] === '#') {
            continue;
        }
        // Detect type if not done yet: (Done on first real line)
        if (!$detectedType) {
            $detectedType = preg_match($whitespacedPattern, $value) ? 'whitespaced' : 'ms-token';
        }
        if ($detectedType === 'ms-token') {
            $tokens = preg_split('/[=:]/', $value, 3);
            if (!is_array($tokens) || count($tokens) < 2) {
                // Line carries no mapping
                continue;
            }
            list($hexbyte, $utf8) = $tokens;
        } else {
            $regA = [];
            if (!preg_match($whitespacedPattern, $value, $regA)) {
                // Line carries no mapping
                continue;
            }
            $hexbyte = $regA[1];
            $utf8 = 'U+' . $regA[2];
        }
        $decval = hexdec(trim($hexbyte));
        if ($decval > 127) {
            // Strip the leading "U+" before reading the code point
            $utf8Char = $this->UnumberToChar(hexdec(substr(trim($utf8), 2)));
            $table['local'][$decval] = $utf8Char;
            $table['utf8'][$utf8Char] = $decval;
        }
    }
    $this->parsedCharsets[$charset] = $table;
    if ($cacheFile) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($table));
    }
    return 2;
}
784
/**
 * This function initializes all UTF-8 character data tables.
 *
 * It fills $this->caseFolding['utf-8'] (maps 'toUpper', 'toLower', 'toTitle')
 * and $this->toASCII['utf-8'] (UTF-8 char => ASCII replacement string) from
 * UnicodeData.txt, SpecialCasing.txt and Translit.txt and writes both tables
 * to cache files in typo3temp/var/charset/. With $mode "case" or "ascii" the
 * respective table is taken from memory or from its cache file if possible;
 * a cache file that does not unserialize to an array is ignored.
 *
 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
 *
 * @param string $mode Mode ("case", "ascii", ...)
 * @return int|bool Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
public function initUnicodeData($mode = null)
{
    // Cache files
    $cacheFileCase = GeneralUtility::getFileAbsFileName('typo3temp/var/charset/cscase_utf-8.tbl');
    $cacheFileASCII = GeneralUtility::getFileAbsFileName('typo3temp/var/charset/csascii_utf-8.tbl');
    // Only process if the tables are not yet loaded
    switch ($mode) {
        case 'case':
            if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8'])) {
                return 1;
            }
            // Use cached version if possible
            if ($cacheFileCase && @is_file($cacheFileCase)) {
                $cachedTable = unserialize(file_get_contents($cacheFileCase));
                if (is_array($cachedTable)) {
                    $this->caseFolding['utf-8'] = $cachedTable;
                    return 2;
                }
            }
            break;
        case 'ascii':
            if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8'])) {
                return 1;
            }
            // Use cached version if possible
            if ($cacheFileASCII && @is_file($cacheFileASCII)) {
                $cachedTable = unserialize(file_get_contents($cacheFileASCII));
                if (is_array($cachedTable)) {
                    $this->toASCII['utf-8'] = $cachedTable;
                    return 2;
                }
            }
            break;
    }
    // Process main Unicode data file
    $unicodeDataFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/UnicodeData.txt';
    if (!(GeneralUtility::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
        return false;
    }
    $fh = fopen($unicodeDataFile, 'rb');
    if (!$fh) {
        return false;
    }
    // Converts a space separated list of hex code points into a UTF-8 string
    $hexSequenceToUtf8 = function ($hexSequence) {
        $chars = '';
        foreach (explode(' ', $hexSequence) as $hexCodePoint) {
            $chars .= $this->UnumberToChar(hexdec($hexCodePoint));
        }
        return $chars;
    };
    // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
    // Note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
    $this->caseFolding['utf-8'] = [];
    // a shorthand
    $utf8CaseFolding = &$this->caseFolding['utf-8'];
    $utf8CaseFolding['toUpper'] = [];
    $utf8CaseFolding['toLower'] = [];
    $utf8CaseFolding['toTitle'] = [];
    // Array of temp. decompositions
    $decomposition = [];
    // Array of chars that are marks (eg. composing accents)
    $mark = [];
    // Array of chars that are numbers (eg. digits)
    $number = [];
    // Array of chars to be omitted (eg. Russian hard sign)
    $omit = [];
    while (($line = fgets($fh, 4096)) !== false) {
        $line = rtrim($line);
        if ($line === '') {
            continue;
        }
        // Has a lot of info; pad so that short lines do not produce undefined offsets
        $fields = array_pad(explode(';', $line), 15, '');
        $char = $fields[0];
        $name = $fields[1];
        $cat = $fields[2];
        $decomp = $fields[5];
        $num = $fields[8];
        $upper = $fields[12];
        $lower = $fields[13];
        $title = $fields[14];
        $ord = hexdec($char);
        if ($ord > 65535) {
            // Only process the BMP
            break;
        }
        $utf8_char = $this->UnumberToChar($ord);
        if ($upper) {
            $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
        }
        if ($lower) {
            $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
        }
        // Store "title" only when different from "upper" (only a few)
        if ($title && $title !== $upper) {
            $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
        }
        switch (isset($cat[0]) ? $cat[0] : '') {
            case 'M':
                // mark (accent, umlaut, ...)
                $mark['U+' . $char] = 1;
                break;
            case 'N':
                // numeric value
                if ($ord > 128 && $num !== '') {
                    $number['U+' . $char] = $num;
                }
                break;
        }
        // Accented Latin letters without "official" decomposition
        $match = [];
        if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
            $c = ord($match[2]);
            if ($match[1] === 'SMALL') {
                $c += 32;
            }
            $decomposition['U+' . $char] = [dechex($c)];
            continue;
        }
        $match = [];
        if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
            switch ($match[1]) {
                case '<circle>':
                    // add parenthesis as circle replacement, eg (1)
                    $match[2] = '0028 ' . $match[2] . ' 0029';
                    break;
                case '<square>':
                    // add square brackets as square replacement, eg [1]
                    $match[2] = '005B ' . $match[2] . ' 005D';
                    break;
                case '<compat>':
                    // ignore multi char decompositions that start with a space
                    if (preg_match('/^0020 /', $match[2])) {
                        continue 2;
                    }
                    break;
                case '<initial>':
                case '<medial>':
                case '<final>':
                case '<isolated>':
                case '<vertical>':
                    // positional / layout variants are not decomposed; go on with the next line
                    continue 2;
            }
            $decomposition['U+' . $char] = explode(' ', $match[2]);
        }
    }
    fclose($fh);
    // Process additional Unicode data for casing (allow folded characters to expand into a sequence)
    $specialCasingFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/SpecialCasing.txt';
    if (GeneralUtility::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
        $fh = fopen($specialCasingFile, 'rb');
        if ($fh) {
            while (($line = fgets($fh, 4096)) !== false) {
                if ($line[0] === '#' || trim($line) === '') {
                    continue;
                }
                list($char, $lower, $title, $upper, $cond) = array_pad(GeneralUtility::trimExplode(';', $line), 5, null);
                // Only unconditional mappings are used: the condition field is empty or already the trailing comment
                if ($cond !== '' && ($cond === null || $cond[0] !== '#')) {
                    continue;
                }
                $utf8_char = $this->UnumberToChar(hexdec($char));
                if ($char !== $lower) {
                    $utf8CaseFolding['toLower'][$utf8_char] = $hexSequenceToUtf8($lower);
                }
                if ($char !== $title && $title !== $upper) {
                    $utf8CaseFolding['toTitle'][$utf8_char] = $hexSequenceToUtf8($title);
                }
                if ($char !== $upper) {
                    $utf8CaseFolding['toUpper'][$utf8_char] = $hexSequenceToUtf8($upper);
                }
            }
            fclose($fh);
        }
    }
    // Process custom decompositions
    $customTranslitFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/Translit.txt';
    if (GeneralUtility::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
        $fh = fopen($customTranslitFile, 'rb');
        if ($fh) {
            while (($line = fgets($fh, 4096)) !== false) {
                if ($line[0] === '#' || trim($line) === '') {
                    continue;
                }
                list($char, $translit) = array_pad(GeneralUtility::trimExplode(';', $line), 2, '');
                if (!$translit) {
                    $omit['U+' . $char] = 1;
                }
                $decomposition['U+' . $char] = explode(' ', $translit);
            }
            fclose($fh);
        }
    }
    // Decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
    foreach ($decomposition as $from => $to) {
        $code_decomp = [];
        while ($code_value = array_shift($to)) {
            // Do recursive decomposition
            if (isset($decomposition['U+' . $code_value])) {
                foreach (array_reverse($decomposition['U+' . $code_value]) as $cv) {
                    array_unshift($to, $cv);
                }
            } elseif (!isset($mark['U+' . $code_value])) {
                // keep everything that is not a mark
                $code_decomp[] = $code_value;
            }
        }
        if (!empty($code_decomp) || isset($omit[$from])) {
            $decomposition[$from] = $code_decomp;
        } else {
            unset($decomposition[$from]);
        }
    }
    // Create ascii only mapping
    $this->toASCII['utf-8'] = [];
    $ascii = &$this->toASCII['utf-8'];
    foreach ($decomposition as $from => $to) {
        $code_decomp = [];
        while ($code_value = array_shift($to)) {
            $ord = hexdec($code_value);
            if ($ord > 127) {
                // Skip decompositions containing non-ASCII chars
                continue 2;
            }
            $code_decomp[] = chr($ord);
        }
        // Keys are "U+" followed by the hex code point; strip the prefix before conversion
        $ascii[$this->UnumberToChar(hexdec(substr($from, 2)))] = implode('', $code_decomp);
    }
    // Add numeric decompositions
    foreach ($number as $from => $to) {
        $utf8_char = $this->UnumberToChar(hexdec(substr($from, 2)));
        if (!isset($ascii[$utf8_char])) {
            $ascii[$utf8_char] = $to;
        }
    }
    if ($cacheFileCase) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
    }
    if ($cacheFileASCII) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
    }
    return 3;
}
1023
/**
 * Initializes the case folding table for a charset other than UTF-8.
 * This function is automatically called by the case folding functions.
 *
 * The table is derived from the UTF-8 case folding data: each character listed in
 * $this->parsedCharsets[$charset]['local'] is looked up in the UTF-8 table and the
 * result is converted back to the target charset. The ASCII a-z / A-Z mappings are
 * added afterwards.
 *
 * @param string $charset Charset for which to initialize case folding.
 * @return int|bool FALSE on error, a TRUE value on success: 1 table already loaded, 2 cached version, 3 table parsed (and cached).
 * @access private
 */
public function initCaseFolding($charset)
{
    // Only process if the case table is not yet loaded (isset() avoids an undefined index notice)
    if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset])) {
        return 1;
    }
    // Use cached version if possible
    $cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/var/charset/cscase_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $cachedTable = unserialize(file_get_contents($cacheFile));
        // An unreadable or corrupt cache file must not be stored as the table; rebuild instead
        if (is_array($cachedTable)) {
            $this->caseFolding[$charset] = $cachedTable;
            return 2;
        }
    }
    // Init UTF-8 conversion for this charset
    if (!$this->initCharset($charset)) {
        return false;
    }
    // UTF-8 case folding is used as the base conversion table
    if (!$this->initUnicodeData('case')) {
        return false;
    }
    $this->caseFolding[$charset] = [];
    $nochar = chr($this->noCharByteVal);
    foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
        // Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
        $c = $this->utf8_decode($utf8, $charset);
        foreach (['toUpper', 'toLower', 'toTitle'] as $direction) {
            // No UTF-8 folding entry for this character: nothing to convert
            if (!isset($this->caseFolding['utf-8'][$direction][$utf8])) {
                continue;
            }
            $cc = $this->utf8_decode($this->caseFolding['utf-8'][$direction][$utf8], $charset);
            // Skip results that are empty or equal to the "no equivalent" placeholder byte
            if ($cc !== '' && $cc !== $nochar) {
                $this->caseFolding[$charset][$direction][$c] = $cc;
            }
        }
    }
    // Add the ASCII case table
    $start = ord('a');
    $end = ord('z');
    for ($i = $start; $i <= $end; $i++) {
        $this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i - 32);
    }
    $start = ord('A');
    $end = ord('Z');
    for ($i = $start; $i <= $end; $i++) {
        $this->caseFolding[$charset]['toLower'][chr($i)] = chr($i + 32);
    }
    if ($cacheFile) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->caseFolding[$charset]));
    }
    return 3;
}
1085
/**
 * Initializes the to-ASCII conversion table for a charset other than UTF-8.
 * This function is automatically called by the ASCII transliteration functions.
 *
 * The table is derived from the UTF-8 to-ASCII table: each character listed in
 * $this->parsedCharsets[$charset]['local'] that has a UTF-8 transliteration is
 * stored under its representation in the target charset.
 *
 * @param string $charset Charset for which to initialize conversion.
 * @return int|bool FALSE on error, a TRUE value on success: 1 table already loaded, 2 cached version, 3 table parsed (and cached).
 * @access private
 */
public function initToASCII($charset)
{
    // Only process if the table is not yet loaded (isset() avoids an undefined index notice)
    if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) {
        return 1;
    }
    // Use cached version if possible
    $cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/var/charset/csascii_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $cachedTable = unserialize(file_get_contents($cacheFile));
        // An unreadable or corrupt cache file must not be stored as the table; rebuild instead
        if (is_array($cachedTable)) {
            $this->toASCII[$charset] = $cachedTable;
            return 2;
        }
    }
    // Init UTF-8 conversion for this charset
    if (!$this->initCharset($charset)) {
        return false;
    }
    // UTF-8/ASCII transliteration is used as the base conversion table
    if (!$this->initUnicodeData('ascii')) {
        return false;
    }
    // Make sure the table exists even if no character has a transliteration, so that
    // it is serialized as an array and recognized as "already loaded" on the next call
    if (!isset($this->toASCII[$charset]) || !is_array($this->toASCII[$charset])) {
        $this->toASCII[$charset] = [];
    }
    foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
        // Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
        $c = $this->utf8_decode($utf8, $charset);
        if (isset($this->toASCII['utf-8'][$utf8])) {
            $this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
        }
    }
    if ($cacheFile) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->toASCII[$charset]));
    }
    return 3;
}
1126
1127 /********************************************
1128 *
1129 * String operation functions
1130 *
1131 ********************************************/
/**
 * Extracts a portion of a string, counted in characters of the given charset.
 * Logs a deprecation entry and delegates to mb_substr().
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param int $start Start position (character position)
 * @param int $len Length (in characters)
 * @return string The substring
 * @see substr(), mb_substr()
 * @deprecated since TYPO3 v8, will be removed with TYPO3 v9, use mb_substr() directly
 */
public function substr($charset, $string, $start, $len = null)
{
    GeneralUtility::logDeprecatedFunction();
    $part = mb_substr($string, $start, $len, $charset);

    return $part;
}
1149
/**
 * Returns the length of a string in characters of the given charset.
 * Logs a deprecation entry and delegates to mb_strlen().
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @return int The number of characters
 * @see strlen()
 * @deprecated since TYPO3 v8, will be removed with TYPO3 v9, use mb_strlen() directly
 */
public function strlen($charset, $string)
{
    GeneralUtility::logDeprecatedFunction();
    $characterCount = mb_strlen($string, $charset);

    return $characterCount;
}
1165
/**
 * Shortens a string to a number of characters and adds a crop signifier.
 *
 * A positive $len keeps the first $len characters and appends $crop; a negative
 * $len keeps the last abs($len) characters and prepends $crop. The string is
 * returned untouched when $len is 0 or the string is not longer than abs($len).
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param int $len Length (in characters)
 * @param string $crop Crop signifier
 * @return string The shortened string
 * @see substr(), mb_strimwidth()
 */
public function crop($charset, $string, $len, $crop = '')
{
    // A length of zero means "do not crop"
    if ((int)$len === 0) {
        return $string;
    }
    $characterCount = mb_strlen($string, $charset);
    // Nothing to cut off
    if ($characterCount <= abs($len)) {
        return $string;
    }
    if ($len > 0) {
        // Keep the head of the string
        return mb_substr($string, 0, $len, $charset) . $crop;
    }
    // Keep the tail of the string
    return $crop . mb_substr($string, $len, $characterCount, $charset);
}
1189
/**
 * Shortens a string to at most the given number of bytes.
 * Logs a deprecation entry; a length of zero or less yields an empty string,
 * everything else is delegated to mb_strcut().
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param int $len The byte length
 * @return string The shortened string
 * @see mb_strcut()
 * @deprecated since TYPO3 v8, will be removed with TYPO3 v9, use mb_strcut() directly
 */
public function strtrunc($charset, $string, $len)
{
    GeneralUtility::logDeprecatedFunction();

    return $len > 0 ? mb_strcut($string, 0, $len, $charset) : '';
}
1208
/**
 * Converts a whole string to lower or upper case.
 * Logs a deprecation entry and delegates to mb_strtolower() / mb_strtoupper().
 * Note that the string length may change,
 * eg. lower case German "ß" (sharp S) becomes upper case "SS".
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert case for
 * @param string $case Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
 * @return string The converted string
 * @see strtolower(), strtoupper()
 * @deprecated since TYPO3 v8, will be removed with TYPO3 v9, use mb_strtolower() or mb_strtoupper() directly
 */
public function conv_case($charset, $string, $case)
{
    GeneralUtility::logDeprecatedFunction();
    if ($case === 'toLower') {
        return mb_strtolower($string, $charset);
    }
    // Every other keyword is treated as an upper case conversion
    return mb_strtoupper($string, $charset);
}
1231
/**
 * Charset aware counterpart of lcfirst() / ucfirst(): changes the case of the
 * first character only and leaves the rest of the string untouched.
 *
 * @param string $charset Character set of string
 * @param string $string Input string
 * @param string $case 'toLower' lowers the first character, any other value uppercases it (use 'toUpper')
 * @return string The string with its first character converted
 */
public function convCaseFirst($charset, $string, $case)
{
    $head = mb_substr($string, 0, 1, $charset);
    $tail = mb_substr($string, 1, null, $charset);
    if ($case === 'toLower') {
        $head = mb_strtolower($head, $charset);
    } else {
        $head = mb_strtoupper($head, $charset);
    }
    return $head . $tail;
}
1249
/**
 * Converts the given string to title case.
 * Logs a deprecation entry and delegates to mb_convert_case() with MB_CASE_TITLE.
 *
 * @param string $charset Character set of string
 * @param string $string Input string
 * @return string The capitalized string
 * @deprecated since TYPO3 v8, will be removed with TYPO3 v9, use mb_convert_case() directly
 */
public function convCapitalize($charset, $string)
{
    GeneralUtility::logDeprecatedFunction();
    $capitalized = mb_convert_case($string, MB_CASE_TITLE, $charset);

    return $capitalized;
}
1263
/**
 * Transliterates special chars (like æøåÆØÅ, umlauts etc) to ASCII equivalents
 * (usually double-bytes, like æ => ae etc.)
 *
 * Dispatches to the UTF-8, EUC or single-byte mapping routine depending on $charset.
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert
 * @return string The converted string
 */
public function specCharsToASCII($charset, $string)
{
    if ($charset === 'utf-8') {
        return $this->utf8_char_mapping($string, 'ascii');
    }
    if (isset($this->eucBasedSets[$charset])) {
        return $this->euc_char_mapping($string, $charset, 'ascii');
    }
    // Treat everything else as single-byte encoding
    return $this->sb_char_mapping($string, $charset, 'ascii');
}
1283
/**
 * Determines a TYPO3-readable language code from the language codes sent by the
 * client (usually HTTP_ACCEPT_LANGUAGE).
 * Logs a deprecation entry and delegates to Locales::getPreferredClientLanguage().
 *
 * @param string $languageCodesList List of language codes. something like 'de,en-us;q=0.9,de-de;q=0.7,es-cl;q=0.6,en;q=0.4,es;q=0.3,zh;q=0.1'
 * @return string A preferred language that TYPO3 supports, or "default" if none found
 * @deprecated since TYPO3 v8, will be removed in TYPO3 v9, use Locales::getPreferredClientLanguage() for usage
 */
public function getPreferredClientLanguage($languageCodesList)
{
    GeneralUtility::logDeprecatedFunction();
    /** @var Locales $localeService */
    $localeService = GeneralUtility::makeInstance(Locales::class);
    $preferredLanguage = $localeService->getPreferredClientLanguage($languageCodesList);

    return $preferredLanguage;
}
1299
1300 /********************************************
1301 *
1302 * Internal string operation functions
1303 *
1304 ********************************************/
/**
 * Maps all characters of a string in a single byte charset.
 *
 * The string is returned unchanged when the mode is unknown, when the required
 * table cannot be initialized, or when no table exists for the requested $opt.
 *
 * @param string $str The string
 * @param string $charset The charset
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string
 */
public function sb_char_mapping($str, $charset, $mode, $opt = '')
{
    switch ($mode) {
        case 'case':
            if (!$this->initCaseFolding($charset)) {
                // Table not available: do nothing
                return $str;
            }
            // Read the table by value: binding it by reference would create a NULL entry
            // in the shared table whenever $opt names a conversion that does not exist
            $map = isset($this->caseFolding[$charset][$opt]) ? $this->caseFolding[$charset][$opt] : [];
            break;
        case 'ascii':
            if (!$this->initToASCII($charset)) {
                // Table not available: do nothing
                return $str;
            }
            $map = isset($this->toASCII[$charset]) ? $this->toASCII[$charset] : [];
            break;
        default:
            return $str;
    }
    $out = '';
    // Every byte is one character; bytes without a mapping are copied as they are
    for ($i = 0; isset($str[$i]); $i++) {
        $c = $str[$i];
        $out .= isset($map[$c]) ? $map[$c] : $c;
    }
    return $out;
}
1345
1346 /********************************************
1347 *
1348 * Internal UTF-8 string operation functions
1349 *
1350 ********************************************/
/**
 * Extracts a portion of a UTF-8 string, counted in characters.
 * Logs a deprecation entry; character positions are translated to byte
 * positions with utf8_char2byte_pos() and the cut is done with substr().
 *
 * @param string $str UTF-8 string
 * @param int $start Start position (character position)
 * @param int $len Length (in characters)
 * @return string The substring
 * @see substr()
 * @deprecated since TYPO3 v8, will be removed with TYPO3 v9, use mb_substr() directly
 */
public function utf8_substr($str, $start, $len = null)
{
    GeneralUtility::logDeprecatedFunction();
    // A length of exactly zero always yields an empty string
    if ((string)$len === '0') {
        return '';
    }
    $startByte = $this->utf8_char2byte_pos($str, $start);
    if ($startByte === false && $start > 0) {
        // $start outside string length
        return false;
    }
    // When $startByte is FALSE here ($start <= 0) substr() treats it like offset 0
    $str = substr($str, $startByte);
    if ($len == null) {
        // No length requested: return everything from the start position
        return $str;
    }
    $endByte = $this->utf8_char2byte_pos($str, $len);
    if ($endByte !== false) {
        return substr($str, 0, $endByte);
    }
    // $len outside actual string length: a negative length that exceeds the
    // string gives an empty string, a positive one gives the whole remainder
    return $len < 0 ? '' : $str;
}
1389
/**
 * Returns the number of characters of a UTF-8 string.
 * Logs a deprecation entry, then counts every byte that starts a character,
 * i.e. every byte that is not a continuation byte.
 *
 * @param string $str UTF-8 multibyte character string
 * @return int The number of characters
 * @see strlen()
 * @deprecated since TYPO3 v8, will be removed with TYPO3 v9, use mb_strlen() directly
 */
public function utf8_strlen($str)
{
    GeneralUtility::logDeprecatedFunction();
    $characterCount = 0;
    $offset = 0;
    while (isset($str[$offset])) {
        $byte = ord($str[$offset]);
        // Single-byte (0xxxxxxx) and multi-byte starting bytes (11xxxxxx) count,
        // continuation bytes (10xxxxxx) do not
        if (($byte & 192) !== 128) {
            $characterCount++;
        }
        $offset++;
    }
    return $characterCount;
}
1415
/**
 * Truncates a string in UTF-8 short at a given byte length.
 *
 * The cut never splits a multi-byte character: if the byte at the limit belongs to
 * a sequence that does not fit completely, the whole sequence is dropped.
 * A length of zero or less yields an empty string (same as strtrunc()).
 *
 * @param string $str UTF-8 multibyte character string
 * @param int $len The byte length
 * @return string The shortened string
 * @see mb_strcut()
 * @deprecated since TYPO3 v8, will be removed with TYPO3 v9, use mb_strcut() directly
 */
public function utf8_strtrunc($str, $len)
{
    GeneralUtility::logDeprecatedFunction();
    if ($len <= 0) {
        return '';
    }
    $i = $len - 1;
    if (!isset($str[$i])) {
        // String is shorter than the limit: nothing to inspect, avoid reading a missing offset
        return substr($str, 0, $len);
    }
    // Last byte within the limit is part of a multibyte sequence
    if (ord($str[$i]) & 128) {
        // Walk back over continuation bytes (10xxxxxx) to the starting byte of the sequence.
        // Offset 0 must be inspected as well, otherwise a sequence starting the string is lost.
        for (; $i >= 0 && (ord($str[$i]) & 192) === 128; $i--) {
        }
        if ($i < 0) {
            // Only continuation bytes found: no valid character to keep
            return '';
        }
        // Calculate the number of bytes announced by the starting byte (count of leading 1 bits)
        $bc = 0;
        for ($mbs = ord($str[$i]); $mbs & 128; $mbs = $mbs << 1) {
            $bc++;
        }
        // Sanity check: the sequence does not fit into the limit, cut before it
        if ($bc + $i > $len) {
            return substr($str, 0, $i);
        }
    }
    return substr($str, 0, $len);
}
1448
/**
 * Finds the character position of the first occurrence of a string; both
 * arguments are in UTF-8.
 * Logs a deprecation entry and delegates to mb_strpos().
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 string to search for
 * @param int $offset Position to start the search
 * @return int The character position
 * @see strpos()
 * @deprecated since TYPO3 v8, will be removed with TYPO3 v9, use mb_strpos() directly
 */
public function utf8_strpos($haystack, $needle, $offset = 0)
{
    GeneralUtility::logDeprecatedFunction();
    $position = mb_strpos($haystack, $needle, $offset, 'utf-8');

    return $position;
}
1464
/**
 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
 * Logs a deprecation entry and delegates to mb_strrpos().
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 character to search for (single character)
 * @return int|bool The character position, or whatever mb_strrpos() returns when the needle is not found
 * @see strrpos()
 * @deprecated since TYPO3 v8, will be removed with TYPO3 v9, use mb_strrpos() directly
 */
public function utf8_strrpos($haystack, $needle)
{
    GeneralUtility::logDeprecatedFunction();
    // The third parameter of mb_strrpos() is the offset; the encoding has to be passed as the fourth one
    return mb_strrpos($haystack, $needle, 0, 'utf-8');
}
1479
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * Positive positions are counted from the start of the string, negative ones from
 * its end. Bounds are checked explicitly instead of with isset($str[$i]), because
 * negative string offsets are readable since PHP 7.1 and would let the backward
 * scan wrap around to the end of the string.
 *
 * @param string $str UTF-8 string
 * @param int $pos Character position (negative values start from the end)
 * @return int|bool Byte position, FALSE if the position lies outside the string
 */
public function utf8_char2byte_pos($str, $pos)
{
    $str = (string)$str;
    $byteLength = strlen($str);
    // Number of characters found
    $n = 0;
    // Number of characters wanted
    $p = abs($pos);
    if ($pos >= 0) {
        $i = 0;
        $d = 1;
    } else {
        $i = $byteLength - 1;
        $d = -1;
    }
    for (; $i >= 0 && $i < $byteLength && $n < $p; $i += $d) {
        // Single-byte (0xxxxxxx) and multi-byte starting bytes (11xxxxxx) begin a character,
        // continuation bytes (10xxxxxx) do not
        if ((ord($str[$i]) & 192) !== 128) {
            $n++;
        }
    }
    if ($i < 0 || $i >= $byteLength) {
        // Offset beyond string length
        return false;
    }
    if ($pos >= 0) {
        // Skip trailing multi-byte data bytes, without reading past the end of the string
        while ($i < $byteLength && (ord($str[$i]) & 192) === 128) {
            $i++;
        }
    } else {
        // Correct offset: the backward scan stopped one byte before the wanted character
        $i++;
    }
    return $i;
}
1526
/**
 * Translates an 'absolute' byte position into a character position.
 *
 * The character position is the number of characters that start at a byte offset
 * greater than zero and up to (including) $pos.
 *
 * @param string $str UTF-8 string
 * @param int $pos Byte position
 * @return int|bool Character position, FALSE if $pos lies outside the string
 * @deprecated since TYPO3 v8, will be removed with TYPO3 v9, former internal function only
 */
public function utf8_byte2char_pos($str, $pos)
{
    GeneralUtility::logDeprecatedFunction();
    $str = (string)$str;
    // Validate the offset before scanning: checking after the loop (when the counter has
    // reached zero) can never detect an offset beyond the string length
    if ($pos < 0 || !isset($str[$pos])) {
        return false;
    }
    // Number of characters
    $n = 0;
    for ($i = $pos; $i > 0; $i--) {
        // Single-byte (0xxxxxxx) and multi-byte starting bytes (11xxxxxx) begin a character,
        // continuation bytes (10xxxxxx) do not
        if ((ord($str[$i]) & 192) !== 128) {
            $n++;
        }
    }
    return $n;
}
1558
/**
 * Maps all characters of an UTF-8 string.
 *
 * The string is returned unchanged when the Unicode data cannot be initialized or
 * the mode is unknown. Characters without an entry in the table are copied as they
 * are; continuation bytes that do not follow a starting byte are dropped.
 *
 * @param string $str UTF-8 string
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string
 */
public function utf8_char_mapping($str, $mode, $opt = '')
{
    if (!$this->initUnicodeData($mode)) {
        // Do nothing
        return $str;
    }
    // The tables are read by value: binding them by reference would create a NULL entry
    // in the shared table whenever $opt names a conversion that does not exist
    switch ($mode) {
        case 'case':
            $map = isset($this->caseFolding['utf-8'][$opt]) ? $this->caseFolding['utf-8'][$opt] : [];
            break;
        case 'ascii':
            $map = isset($this->toASCII['utf-8']) ? $this->toASCII['utf-8'] : [];
            break;
        default:
            return $str;
    }
    $out = '';
    for ($i = 0; isset($str[$i]); $i++) {
        $c = ord($str[$i]);
        $mbc = '';
        if (!($c & 128)) {
            // Single-byte (0xxxxxxx)
            $mbc = $str[$i];
        } elseif (($c & 192) === 192) {
            // Multi-byte starting byte (11xxxxxx): the number of leading 1 bits
            // is the number of bytes of the sequence
            $bc = 0;
            for (; $c & 128; $c = $c << 1) {
                $bc++;
            }
            $mbc = substr($str, $i, $bc);
            // Continue after the sequence (the loop itself advances one more byte)
            $i += $bc - 1;
        }
        $out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
    }
    return $out;
}
1608
1609 /********************************************
1610 *
1611 * Internal EUC string operation functions
1612 *
1613 * Extended Unix Code:
1614 * ASCII compatible 7bit single bytes chars
1615 * 8bit two byte chars
1616 *
1617 * Shift-JIS is treated as a special case.
1618 *
1619 ********************************************/
/**
 * Cuts a string in the EUC charset family short at a given byte length.
 *
 * The cut never splits a double-byte character: when the character at the limit
 * does not fit completely, it is dropped. A length of zero or less yields an empty
 * string (same as strtrunc()).
 *
 * @param string $str EUC multibyte character string
 * @param int $len The byte length
 * @param string $charset The charset
 * @return string The shortened string
 * @see mb_strcut()
 * @deprecated since TYPO3 v8, will be removed with TYPO3 v9, use mb_strcut() directly
 */
public function euc_strtrunc($str, $len, $charset)
{
    GeneralUtility::logDeprecatedFunction();
    if ($len <= 0) {
        return '';
    }
    $shiftJis = $charset === 'shift_jis';
    $i = 0;
    for (; isset($str[$i]) && $i < $len; $i++) {
        $c = ord($str[$i]);
        // Shift-JIS uses the ranges 128-159 and 224-255 for first bytes,
        // all other sets of the family treat every byte >= 128 as a first byte
        $isFirstByte = $shiftJis ? ($c >= 128 && $c < 160 || $c >= 224) : $c >= 128;
        if ($isFirstByte) {
            // Skip the second byte of the character
            $i++;
        }
    }
    // This has to be checked before the "string shorter" case: when the straddling
    // character is the last one of the string, the offset is past the end as well
    if ($i > $len) {
        // We ended on a first byte: the last character straddles the limit
        return substr($str, 0, $len - 1);
    }
    if (!isset($str[$i])) {
        // String not longer than supplied length
        return $str;
    }
    return substr($str, 0, $len);
}
1658
/**
 * Extracts a portion of a string in the EUC charset family, counted in characters.
 * Logs a deprecation entry; character positions are translated to byte positions
 * with euc_char2byte_pos() and the cut is done with substr().
 *
 * @param string $str EUC multibyte character string
 * @param int $start Start position (character position)
 * @param string $charset The charset
 * @param int $len Length (in characters)
 * @return string the substring
 * @deprecated since TYPO3 v8, will be removed with TYPO3 v9, use mb_substr() directly
 */
public function euc_substr($str, $start, $charset, $len = null)
{
    GeneralUtility::logDeprecatedFunction();
    $startByte = $this->euc_char2byte_pos($str, $start, $charset);
    if ($startByte === false) {
        // $start outside string length
        return false;
    }
    $str = substr($str, $startByte);
    if ($len == null) {
        // No length requested: return everything from the start position
        return $str;
    }
    $endByte = $this->euc_char2byte_pos($str, $len, $charset);
    // $len outside actual string length: return the whole remainder
    return $endByte === false ? $str : substr($str, 0, $endByte);
}
1690
/**
 * Returns the number of characters of a string in the EUC charset family.
 * Logs a deprecation entry, then walks the string: a first byte of a double-byte
 * character advances two bytes, every other byte advances one.
 *
 * @param string $str EUC multibyte character string
 * @param string $charset The charset
 * @return int The number of characters
 * @see strlen()
 * @deprecated since TYPO3 v8, will be removed with TYPO3 v9, use mb_strlen() directly
 */
public function euc_strlen($str, $charset)
{
    GeneralUtility::logDeprecatedFunction();
    $isShiftJis = $charset === 'shift_jis';
    $characterCount = 0;
    $offset = 0;
    while (isset($str[$offset])) {
        $byte = ord($str[$offset]);
        if ($isShiftJis) {
            $isFirstByte = ($byte >= 128 && $byte < 160) || $byte >= 224;
        } else {
            $isFirstByte = $byte >= 128;
        }
        $offset += $isFirstByte ? 2 : 1;
        $characterCount++;
    }
    return $characterCount;
}
1720
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * Positive positions are counted from the start of the string, negative ones from
 * its end. Bounds are checked explicitly instead of with isset($str[$i]), because
 * negative string offsets are readable since PHP 7.1 and would let the backward
 * scan wrap around to the end of the string.
 *
 * @param string $str EUC multibyte character string
 * @param int $pos Character position (negative values start from the end)
 * @param string $charset The charset
 * @return int|bool Byte position, FALSE if the position lies outside the string
 * @deprecated since TYPO3 v8, will be removed with TYPO3 v9, former internal function only
 */
public function euc_char2byte_pos($str, $pos, $charset)
{
    GeneralUtility::logDeprecatedFunction();
    $str = (string)$str;
    $byteLength = strlen($str);
    $sjis = $charset === 'shift_jis';
    // Number of characters seen
    $n = 0;
    // Number of characters wanted
    $p = abs($pos);
    if ($pos >= 0) {
        $i = 0;
        $d = 1;
    } else {
        $i = $byteLength - 1;
        $d = -1;
    }
    for (; $i >= 0 && $i < $byteLength && $n < $p; $i += $d) {
        $c = ord($str[$i]);
        // Shift-JIS uses the ranges 128-159 and 224-255 for double-byte characters,
        // all other sets of the family treat every byte >= 128 as part of one
        $isDoubleByte = $sjis ? ($c >= 128 && $c < 160 || $c >= 224) : $c >= 128;
        if ($isDoubleByte) {
            // Step over the other byte of the character
            $i += $d;
        }
        $n++;
    }
    if ($i < 0 || $i >= $byteLength) {
        // Offset beyond string length
        return false;
    }
    if ($pos < 0) {
        // Correct offset: the backward scan stopped one byte before the wanted character
        $i++;
    }
    return $i;
}
1768
/**
 * Maps all characters of a string in the EUC charset family.
 *
 * The string is returned unchanged when the mode is unknown, when the required
 * table cannot be initialized, or when no table exists for the requested $opt.
 *
 * @param string $str EUC multibyte character string
 * @param string $charset The charset
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string
 */
public function euc_char_mapping($str, $charset, $mode, $opt = '')
{
    switch ($mode) {
        case 'case':
            if (!$this->initCaseFolding($charset)) {
                // Table not available: do nothing
                return $str;
            }
            // Read the table by value: binding it by reference would create a NULL entry
            // in the shared table whenever $opt names a conversion that does not exist
            $map = isset($this->caseFolding[$charset][$opt]) ? $this->caseFolding[$charset][$opt] : [];
            break;
        case 'ascii':
            if (!$this->initToASCII($charset)) {
                // Table not available: do nothing
                return $str;
            }
            $map = isset($this->toASCII[$charset]) ? $this->toASCII[$charset] : [];
            break;
        default:
            return $str;
    }
    $sjis = $charset === 'shift_jis';
    $out = '';
    for ($i = 0; isset($str[$i]); $i++) {
        $mbc = $str[$i];
        $c = ord($mbc);
        // Shift-JIS uses the ranges 128-159 and 224-255 for first bytes,
        // all other sets of the family treat every byte >= 128 as a first byte
        $isDoubleByte = $sjis ? ($c >= 128 && $c < 160 || $c >= 224) : $c >= 128;
        if ($isDoubleByte) {
            // A double-byte char: take both bytes and step over the second one
            $mbc = substr($str, $i, 2);
            $i++;
        }
        $out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
    }
    return $out;
}
1824 }