[TASK] Add more fixers for php-cs-fixer
[Packages/TYPO3.CMS.git] / typo3 / sysext / core / Classes / Charset / CharsetConverter.php
1 <?php
2 namespace TYPO3\CMS\Core\Charset;
3
4 /*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use TYPO3\CMS\Core\Localization\Locales;
18 use TYPO3\CMS\Core\SingletonInterface;
19 use TYPO3\CMS\Core\Utility\ExtensionManagementUtility;
20 use TYPO3\CMS\Core\Utility\GeneralUtility;
21
22 /**
23 * Notes on UTF-8
24 *
25 * Functions working on UTF-8 strings:
26 *
27 * - strchr/strstr
28 * - strrchr
29 * - substr_count
30 * - implode/explode/join
31 *
32 * Functions nearly working on UTF-8 strings:
33 *
34 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
35 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
36 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
37 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
38 * - preg_*: UTF-8 support is compiled into PHP by default nowadays, but could be unavailable; patterns need to use the "u" modifier
39 *
40 * Functions NOT working on UTF-8 strings:
41 *
42 * - str*cmp
43 * - stristr
44 * - stripos
45 * - substr
46 * - strrev
47 * - split/spliti
48 * - ...
49 */
50
51 /**
52 * Class for conversion between charsets
53 */
54 class CharsetConverter implements SingletonInterface
55 {
56 /**
57 * Possible strategies for handling multi-byte data
58 * Only used for internal purpose
59 * @internal
60 */
61 const STRATEGY_MBSTRING = 'mbstring';
62 const STRATEGY_ICONV = 'iconv';
63 const STRATEGY_FALLBACK = 'fallback';
64
65 /**
66 * Set to one of the strategies above, based on the availability of the environment.
67 *
68 * @var string|null
69 */
70 protected $conversionStrategy = null;
71
72 /**
73 * ASCII Value for chars with no equivalent.
74 *
75 * @var int
76 */
77 public $noCharByteVal = 63;
78
79 /**
80 * This is the array where parsed conversion tables are stored (cached)
81 *
82 * @var array
83 */
84 public $parsedCharsets = [];
85
86 /**
87 * An array where case folding data will be stored (cached)
88 *
89 * @var array
90 */
91 public $caseFolding = [];
92
93 /**
94 * An array where charset-to-ASCII mappings are stored (cached)
95 *
96 * @var array
97 */
98 public $toASCII = [];
99
100 /**
101 * This tells the converter which charsets have two bytes per char:
102 *
103 * @var array
104 */
105 public $twoByteSets = [
106 'ucs-2' => 1
107 ];
108
109 /**
110 * This tells the converter which charsets have four bytes per char:
111 *
112 * @var array
113 */
114 public $fourByteSets = [
115 'ucs-4' => 1, // 4-byte Unicode
116 'utf-32' => 1
117 ];
118
119 /**
120 * This tells the converter which charsets use a scheme like the Extended Unix Code:
121 *
122 * @var array
123 */
124 public $eucBasedSets = [
125 'gb2312' => 1, // Chinese, simplified.
126 'big5' => 1, // Chinese, traditional.
127 'euc-kr' => 1, // Korean
128 'shift_jis' => 1
129 ];
130
131 /**
132 * @link http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
133 * @link http://czyborra.com/charsets/iso8859.html
134 *
135 * @var array
136 */
137 public $synonyms = [
138 'us' => 'ascii',
139 'us-ascii' => 'ascii',
140 'cp819' => 'iso-8859-1',
141 'ibm819' => 'iso-8859-1',
142 'iso-ir-100' => 'iso-8859-1',
143 'iso-ir-101' => 'iso-8859-2',
144 'iso-ir-109' => 'iso-8859-3',
145 'iso-ir-110' => 'iso-8859-4',
146 'iso-ir-144' => 'iso-8859-5',
147 'iso-ir-127' => 'iso-8859-6',
148 'iso-ir-126' => 'iso-8859-7',
149 'iso-ir-138' => 'iso-8859-8',
150 'iso-ir-148' => 'iso-8859-9',
151 'iso-ir-157' => 'iso-8859-10',
152 'iso-ir-179' => 'iso-8859-13',
153 'iso-ir-199' => 'iso-8859-14',
154 'iso-ir-203' => 'iso-8859-15',
155 'csisolatin1' => 'iso-8859-1',
156 'csisolatin2' => 'iso-8859-2',
157 'csisolatin3' => 'iso-8859-3',
158 'csisolatin5' => 'iso-8859-9',
159 'csisolatin8' => 'iso-8859-14',
160 'csisolatin9' => 'iso-8859-15',
161 'csisolatingreek' => 'iso-8859-7',
162 'iso-celtic' => 'iso-8859-14',
163 'latin1' => 'iso-8859-1',
164 'latin2' => 'iso-8859-2',
165 'latin3' => 'iso-8859-3',
166 'latin5' => 'iso-8859-9',
167 'latin6' => 'iso-8859-10',
168 'latin8' => 'iso-8859-14',
169 'latin9' => 'iso-8859-15',
170 'l1' => 'iso-8859-1',
171 'l2' => 'iso-8859-2',
172 'l3' => 'iso-8859-3',
173 'l5' => 'iso-8859-9',
174 'l6' => 'iso-8859-10',
175 'l8' => 'iso-8859-14',
176 'l9' => 'iso-8859-15',
177 'cyrillic' => 'iso-8859-5',
178 'arabic' => 'iso-8859-6',
179 'tis-620' => 'iso-8859-11',
180 'win874' => 'windows-874',
181 'win1250' => 'windows-1250',
182 'win1251' => 'windows-1251',
183 'win1252' => 'windows-1252',
184 'win1253' => 'windows-1253',
185 'win1254' => 'windows-1254',
186 'win1255' => 'windows-1255',
187 'win1256' => 'windows-1256',
188 'win1257' => 'windows-1257',
189 'win1258' => 'windows-1258',
190 'cp1250' => 'windows-1250',
191 'cp1251' => 'windows-1251',
192 'cp1252' => 'windows-1252',
193 'ms-ee' => 'windows-1250',
194 'ms-ansi' => 'windows-1252',
195 'ms-greek' => 'windows-1253',
196 'ms-turk' => 'windows-1254',
197 'winbaltrim' => 'windows-1257',
198 'koi-8ru' => 'koi-8r',
199 'koi8r' => 'koi-8r',
200 'cp878' => 'koi-8r',
201 'mac' => 'macroman',
202 'macintosh' => 'macroman',
203 'euc-cn' => 'gb2312',
204 'x-euc-cn' => 'gb2312',
205 'euccn' => 'gb2312',
206 'cp936' => 'gb2312',
207 'big-5' => 'big5',
208 'cp950' => 'big5',
209 'eucjp' => 'euc-jp',
210 'sjis' => 'shift_jis',
211 'shift-jis' => 'shift_jis',
212 'cp932' => 'shift_jis',
213 'cp949' => 'euc-kr',
214 'utf7' => 'utf-7',
215 'utf8' => 'utf-8',
216 'utf16' => 'utf-16',
217 'utf32' => 'utf-32',
218 'ucs2' => 'ucs-2',
219 'ucs4' => 'ucs-4'
220 ];
221
222 /**
223 * TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
224 * Empty values mean "utf-8"
225 *
226 * @var array
227 * @deprecated since TYPO3 v8, will be removed in TYPO3 v9, use Locales
228 */
229 public $charSetArray = [
230 'af' => '',
231 'ar' => 'iso-8859-6',
232 'ba' => 'iso-8859-2',
233 'bg' => 'windows-1251',
234 'br' => '',
235 'ca' => 'iso-8859-15',
236 'ch' => 'gb2312',
237 'cs' => 'windows-1250',
238 'cz' => 'windows-1250',
239 'da' => '',
240 'de' => '',
241 'dk' => '',
242 'el' => 'iso-8859-7',
243 'eo' => 'utf-8',
244 'es' => '',
245 'et' => 'iso-8859-4',
246 'eu' => '',
247 'fa' => 'utf-8',
248 'fi' => '',
249 'fo' => 'utf-8',
250 'fr' => '',
251 'fr_CA' => '',
252 'ga' => '',
253 'ge' => 'utf-8',
254 'gl' => '',
255 'gr' => 'iso-8859-7',
256 'he' => 'utf-8',
257 'hi' => 'utf-8',
258 'hk' => 'big5',
259 'hr' => 'windows-1250',
260 'hu' => 'iso-8859-2',
261 'is' => 'utf-8',
262 'it' => '',
263 'ja' => 'shift_jis',
264 'jp' => 'shift_jis',
265 'ka' => 'utf-8',
266 'kl' => 'utf-8',
267 'km' => 'utf-8',
268 'ko' => 'euc-kr',
269 'kr' => 'euc-kr',
270 'lt' => 'windows-1257',
271 'lv' => 'utf-8',
272 'ms' => '',
273 'my' => '',
274 'nl' => '',
275 'no' => '',
276 'pl' => 'iso-8859-2',
277 'pt' => '',
278 'pt_BR' => '',
279 'qc' => '',
280 'ro' => 'iso-8859-2',
281 'ru' => 'windows-1251',
282 'se' => '',
283 'si' => 'windows-1250',
284 'sk' => 'windows-1250',
285 'sl' => 'windows-1250',
286 'sq' => 'utf-8',
287 'sr' => 'utf-8',
288 'sv' => '',
289 'th' => 'iso-8859-11',
290 'tr' => 'iso-8859-9',
291 'ua' => 'windows-1251',
292 'uk' => 'windows-1251',
293 'vi' => 'utf-8',
294 'vn' => 'utf-8',
295 'zh' => 'big5'
296 ];
297
/**
 * Normalizes a charset name: the name is lowercased, surrounding whitespace
 * is stripped and a known alias is replaced by its canonical name.
 *
 * @param string $charset Charset name as supplied by the caller
 * @return string Canonical charset name, or the lowercased and trimmed input if no alias is registered
 */
public function parse_charset($charset)
{
    $normalized = strtolower($charset);
    $normalized = trim($normalized);
    // Aliases (e.g. "latin1", "utf8") are looked up in $this->synonyms
    return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
}
312
313 /********************************************
314 *
315 * Charset Conversion functions
316 *
317 ********************************************/
/**
 * Converts a string from one charset into another.
 *
 * Depending on the strategy reported by getConversionStrategy(), mbstring or
 * iconv is tried first; if neither produces a result, the string is converted
 * with the internal conversion tables, using UTF-8 as intermediate encoding.
 *
 * @param string $inputString Input string
 * @param string $fromCharset From charset (the current charset of the string)
 * @param string $toCharset To charset (the output charset wanted)
 * @param bool $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Converted string
 * @see convArray()
 */
public function conv($inputString, $fromCharset, $toCharset, $useEntityForNoChar = false)
{
    // Identical source and target charset: nothing to convert
    if ($fromCharset === $toCharset) {
        return $inputString;
    }
    // The PHP extensions cannot fall back to SGML entities. They are therefore only
    // used if no entity fallback is requested, or if the target is UTF-8, which
    // handles every character anyway.
    $nativeConversionAllowed = !$useEntityForNoChar || $toCharset === 'utf-8';
    if ($nativeConversionAllowed) {
        $nativeResult = false;
        switch ($this->getConversionStrategy()) {
            case self::STRATEGY_MBSTRING:
                // Yields FALSE for unsupported charsets
                $nativeResult = mb_convert_encoding($inputString, $toCharset, $fromCharset);
                break;
            case self::STRATEGY_ICONV:
                $nativeResult = iconv($fromCharset, $toCharset . '//TRANSLIT', $inputString);
                break;
        }
        if ($nativeResult !== false) {
            return $nativeResult;
        }
    }
    // Table based conversion: local charset -> UTF-8 -> target charset
    $converted = $inputString;
    if ($fromCharset !== 'utf-8') {
        $converted = $this->utf8_encode($converted, $fromCharset);
    }
    if ($toCharset !== 'utf-8') {
        $converted = $this->utf8_decode($converted, $toCharset, $useEntityForNoChar);
    }
    return $converted;
}
359
/**
 * Converts every string found in an array (including nested arrays) from one
 * charset to another. Values that are neither strings nor arrays are left untouched.
 * NOTICE: The array is passed by reference and modified in place!
 *
 * @param array $array Input array, possibly multidimensional
 * @param string $fromCharset From charset (the current charset of the string)
 * @param string $toCharset To charset (the output charset wanted)
 * @param bool $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return void
 * @see conv()
 */
public function convArray(&$array, $fromCharset, $toCharset, $useEntityForNoChar = false)
{
    foreach ($array as $index => $entry) {
        if (is_string($entry)) {
            $array[$index] = $this->conv($entry, $fromCharset, $toCharset, $useEntityForNoChar);
        } elseif (is_array($entry)) {
            // Recurse on the original element so that the nested array is changed in place
            $this->convArray($array[$index], $fromCharset, $toCharset, $useEntityForNoChar);
        }
    }
}
381
/**
 * Converts $str from $charset to UTF-8 using the parsed conversion table of that charset.
 *
 * Bytes up to 127 are copied unchanged (except for two-byte charsets, where every
 * byte pair is looked up). Characters missing from the conversion table are
 * replaced by the byte $this->noCharByteVal.
 *
 * @param string $str String in local charset to convert to UTF-8
 * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return string Output string, converted to UTF-8. Empty string if no conversion table could be initialized for $charset.
 */
public function utf8_encode($str, $charset)
{
    if ($charset === 'utf-8') {
        return $str;
    }
    // Parse the conversion table if not done already; without a table nothing can be converted
    if (!$this->initCharset($charset)) {
        return '';
    }
    $isTwoByteSet = isset($this->twoByteSets[$charset]);
    $isEucBasedSet = isset($this->eucBasedSets[$charset]);
    $noChar = chr($this->noCharByteVal);
    $strLen = strlen($str);
    $outStr = '';
    // Traverse each char in string
    for ($a = 0; $a < $strLen; $a++) {
        $chr = substr($str, $a, 1);
        $ord = ord($chr);
        if ($isTwoByteSet) {
            // Two bytes per char, big endian assumed. substr() is used for the second byte
            // so that a truncated trailing byte counts as 0 instead of reading an undefined offset.
            $a++;
            $ord = ($ord << 8) | ord(substr($str, $a, 1));
        } elseif ($ord > 127) {
            // EUC based sets use two bytes above 127; fetch both, advance the pointer and build a 16 bit number.
            // Shift-JIS: bytes between 160 and 223 are single byte characters.
            if ($isEucBasedSet && ($charset !== 'shift_jis' || $ord < 160 || $ord > 223)) {
                $a++;
                $ord = $ord * 256 + ord(substr($str, $a, 1));
            }
        } else {
            // 7-bit ASCII is identical in UTF-8
            $outStr .= $chr;
            continue;
        }
        // Use the UTF-8 sequence from the conversion table, otherwise the "no char" byte
        $outStr .= isset($this->parsedCharsets[$charset]['local'][$ord])
            ? $this->parsedCharsets[$charset]['local'][$ord]
            : $noChar;
    }
    return $outStr;
}
440
/**
 * Converts $str from UTF-8 to $charset using the parsed conversion table of that charset.
 *
 * Bytes up to 127 are copied unchanged. A UTF-8 sequence without an equivalent in
 * the conversion table becomes either a hexadecimal numeric entity (if
 * $useEntityForNoChar is set) or the byte $this->noCharByteVal.
 *
 * @param string $str String in UTF-8 to convert to local charset
 * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param bool $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Output string, converted to local charset. Empty string if no conversion table could be initialized for $charset.
 */
public function utf8_decode($str, $charset, $useEntityForNoChar = false)
{
    if ($charset === 'utf-8') {
        return $str;
    }
    // Parse the conversion table if not done already; without a table nothing can be converted
    if (!$this->initCharset($charset)) {
        return '';
    }
    $noChar = chr($this->noCharByteVal);
    $strLen = strlen($str);
    $outStr = '';
    // Traverse each char in UTF-8 string
    for ($a = 0; $a < $strLen; $a++) {
        $chr = substr($str, $a, 1);
        $ord = ord($chr);
        if ($ord <= 127) {
            // Plain ASCII
            $outStr .= $chr;
            continue;
        }
        // A first byte must have the 7th bit set. Otherwise we are in the middle of a byte sequence.
        if (!($ord & 64)) {
            $outStr .= $noChar;
            continue;
        }
        // Collect the whole sequence: each further leading 1-bit of the first byte announces one more byte
        $buf = $chr;
        for ($b = 0; $b < 8; $b++) {
            $ord = $ord << 1;
            if (!($ord & 128)) {
                break;
            }
            $a++;
            $buf .= substr($str, $a, 1);
        }
        if (isset($this->parsedCharsets[$charset]['utf8'][$buf])) {
            // The local number
            $mByte = $this->parsedCharsets[$charset]['utf8'][$buf];
            // A local number greater than 255 is split into two bytes (16 bit word assumed)
            $outStr .= $mByte > 255
                ? chr(($mByte >> 8) & 255) . chr($mByte & 255)
                : chr($mByte);
        } elseif ($useEntityForNoChar) {
            // Create hexadecimal numeric entity, e.g. &#x20ac;
            $outStr .= '&#' . $this->utf8CharToUnumber($buf, true) . ';';
        } else {
            $outStr .= $noChar;
        }
    }
    return $outStr;
}
508
/**
 * Replaces every multibyte UTF-8 sequence by a hexadecimal numeric entity
 * (e.g. "&#xe9;"). ASCII bytes are kept; a byte above 127 that cannot start
 * a sequence is replaced by the byte $this->noCharByteVal.
 *
 * @param string $str Input string
 * @return string Output string
 */
public function utf8_to_entities($str)
{
    $length = strlen($str);
    $result = '';
    $position = 0;
    while ($position < $length) {
        $byte = substr($str, $position, 1);
        $code = ord($byte);
        if ($code <= 127) {
            // Plain ASCII
            $result .= $byte;
        } elseif (!($code & 64)) {
            // 7th bit not set: this is a continuation byte without a leading byte
            $result .= chr($this->noCharByteVal);
        } else {
            // Leading byte: each further leading 1-bit announces one more byte of the sequence
            $sequence = $byte;
            for ($n = 0; $n < 8; $n++) {
                $code = $code << 1;
                if (!($code & 128)) {
                    break;
                }
                $position++;
                $sequence .= substr($str, $position, 1);
            }
            $result .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
        }
        $position++;
    }
    return $result;
}
552
/**
 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars.
 * All named HTML entities known to PHP's HTML_ENTITIES translation table (like &amp; or &pound;)
 * are converted as well.
 *
 * Entities that are neither a well-formed numeric reference nor a known name
 * (e.g. "&#;", "&#xZZ;", "&foo;") are left in the string unchanged.
 *
 * @param string $str Input string, UTF-8
 * @return string Output string
 */
public function entities_to_utf8($str)
{
    // Maps "&name;" to the character it represents
    $namedEntities = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT));
    $result = preg_replace_callback(
        '/&([#[:alnum:]]*);/',
        function (array $match) use ($namedEntities) {
            $entity = $match[0];
            $reference = $match[1];
            if (substr($reference, 0, 1) !== '#') {
                // Named entity: convert if known, otherwise keep as is
                return isset($namedEntities[$entity]) ? $namedEntities[$entity] : $entity;
            }
            $radixMarker = substr($reference, 1, 1);
            if ($radixMarker === 'x' || $radixMarker === 'X') {
                // Hexadecimal reference; the prefix may be written in either case
                $digits = (string)substr($reference, 2);
                if (ctype_xdigit($digits)) {
                    return $this->UnumberToChar(hexdec($digits));
                }
            } else {
                // Decimal reference
                $digits = (string)substr($reference, 1);
                if (ctype_digit($digits)) {
                    return $this->UnumberToChar((int)$digits);
                }
            }
            // Malformed numeric reference: do not turn it into a bogus byte
            return $entity;
        },
        $str
    );
    // preg_replace_callback() yields NULL on a PCRE error; hand back the input in that case
    return $result === null ? $str : $result;
}
590
/**
 * Splits a UTF-8 string into an array of its characters.
 * HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) are resolved first,
 * so each of them counts as one character.
 * Despite the method name, the array holds the UTF-8 characters themselves, not their numbers.
 * A byte above 127 that cannot start a sequence is represented by the byte $this->noCharByteVal.
 *
 * @param string $str Input string, UTF-8
 * @return array Output array with one UTF-8 character per element
 */
public function utf8_to_numberarray($str)
{
    // Entities must be registered as characters as well
    $decoded = $this->entities_to_utf8($str);

    $length = strlen($decoded);
    $characters = [];
    $offset = 0;
    while ($offset < $length) {
        $byte = substr($decoded, $offset, 1);
        $code = ord($byte);
        if ($code <= 127) {
            // Plain ASCII
            $characters[] = chr($code);
        } elseif ($code & 64) {
            // Leading byte: each further leading 1-bit announces one more byte of the sequence
            $sequence = $byte;
            for ($n = 0; $n < 8; $n++) {
                $code = $code << 1;
                if (!($code & 128)) {
                    break;
                }
                $offset++;
                $sequence .= substr($decoded, $offset, 1);
            }
            $characters[] = $sequence;
        } else {
            // Continuation byte without a leading byte
            $characters[] = chr($this->noCharByteVal);
        }
        $offset++;
    }
    return $characters;
}
640
/**
 * Encodes a UNICODE number as a UTF-8 multibyte character.
 *
 * The bits of the number are spread across the bytes; the number of high bits
 * set in the lead byte announces the length of the sequence:
 *
 * bytes | bits | representation
 * 1 | 7 | 0vvvvvvv
 * 2 | 11 | 110vvvvv 10vvvvvv
 * 3 | 16 | 1110vvvv 10vvvvvv 10vvvvvv
 * 4 | 21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 * 5 | 26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 * 6 | 31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * Numbers that need more than 31 bits yield the byte $this->noCharByteVal.
 *
 * @param int $unicodeInteger UNICODE integer
 * @return string UTF-8 multibyte character string
 * @see utf8CharToUnumber()
 */
public function UnumberToChar($unicodeInteger)
{
    // Single byte: the number is the byte value
    if ($unicodeInteger < 128) {
        return chr($unicodeInteger);
    }
    // Pick the lead byte marker and the number of continuation bytes
    if ($unicodeInteger < 2048) {
        $leadMarker = 192;
        $continuationBytes = 1;
    } elseif ($unicodeInteger < 65536) {
        $leadMarker = 224;
        $continuationBytes = 2;
    } elseif ($unicodeInteger < 2097152) {
        $leadMarker = 240;
        $continuationBytes = 3;
    } elseif ($unicodeInteger < 67108864) {
        $leadMarker = 248;
        $continuationBytes = 4;
    } elseif ($unicodeInteger < 2147483648) {
        $leadMarker = 252;
        $continuationBytes = 5;
    } else {
        // Cannot express a 32-bit character in UTF-8
        return chr($this->noCharByteVal);
    }
    // Lead byte carries the topmost bits, each continuation byte the next 6 bits
    $char = chr($leadMarker | ($unicodeInteger >> (6 * $continuationBytes)));
    for ($shift = 6 * ($continuationBytes - 1); $shift >= 0; $shift -= 6) {
        $char .= chr(128 | (($unicodeInteger >> $shift) & 63));
    }
    return $char;
}
697
/**
 * Converts a UTF-8 multibyte character to its UNICODE number.
 *
 * Only the first character of $str is evaluated. An empty string yields 0
 * (or "x0" in hex mode).
 *
 * @param string $str UTF-8 multibyte character string
 * @param bool $hex If set, the number is returned as a string consisting of "x" followed by the lowercase hexadecimal digits (e.g. "x20ac")
 * @return int|string UNICODE integer, or the "x…" hex string if $hex is set
 * @see UnumberToChar()
 */
public function utf8CharToUnumber($str, $hex = false)
{
    // First byte; an empty input is treated as byte value 0 instead of reading an undefined offset
    $ord = isset($str[0]) ? ord($str[0]) : 0;
    // This verifies that it IS a multi byte string
    if (($ord & 192) === 192) {
        $leadByte = $ord;
        $binBuf = '';
        // Each further leading 1-bit of the first byte announces one continuation byte
        for ($b = 0; $b < 8; $b++) {
            $ord = $ord << 1;
            if (!($ord & 128)) {
                break;
            }
            // A continuation byte contributes its lowest 6 bits
            $binBuf .= substr('00000000' . decbin(ord(substr($str, $b + 1, 1))), -6);
        }
        // The lead byte contributes the bits that remain after its length marker
        $binBuf = substr('00000000' . decbin($leadByte), -(6 - $b)) . $binBuf;
        $int = bindec($binBuf);
    } else {
        $int = $ord;
    }
    return $hex ? 'x' . dechex($int) : $int;
}
732
733 /********************************************
734 *
735 * Init functions
736 *
737 ********************************************/
/**
 * Initializes a charset for use if a conversion table for it exists in the
 * 'typo3/sysext/core/Resources/Private/Charsets/csconvtbl/' folder.
 * This function is automatically called by the conversion functions.
 *
 * The parsed table is stored in $this->parsedCharsets[$charset] with the keys
 * 'local' (local char number => UTF-8 char) and 'utf8' (UTF-8 char => local char number).
 * Only local char numbers above 127 are stored. A serialized copy is written to
 * typo3temp/var/charset/ and reused on later requests; an unreadable cache file
 * is ignored and the table is parsed again.
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param string $charset The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return int|bool Returns 1 if already loaded. Returns FALSE if charset conversion table was not found. Returns 2 if the charset conversion table was found and loaded.
 * @access private
 */
public function initCharset($charset)
{
    // Only process if the charset is not yet loaded
    if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
        return 1;
    }
    if (!$charset) {
        return false;
    }
    // Conversion table filename
    $charsetConvTableFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/csconvtbl/' . $charset . '.tbl';
    if (!GeneralUtility::validPathStr($charsetConvTableFile) || !@is_file($charsetConvTableFile)) {
        return false;
    }
    // Cache file for charsets:
    // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
    $cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/var/charset/charset_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        // NOTE(review): unserialize() is applied to a file below typo3temp/ - presumably only written by TYPO3 itself; confirm
        $cachedTable = unserialize(file_get_contents($cacheFile));
        if (is_array($cachedTable)) {
            $this->parsedCharsets[$charset] = $cachedTable;
            return 2;
        }
        // Unreadable or corrupt cache: parse the table again and rewrite the cache below
    }
    // The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
    $whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
    $parsedTable = ['local' => [], 'utf8' => []];
    $detectedType = '';
    // Parse conversion table line by line
    $lines = GeneralUtility::trimExplode(LF, file_get_contents($charsetConvTableFile), true);
    foreach ($lines as $value) {
        // Comment lines and blanks are ignored
        if (!trim($value) || $value[0] === '#') {
            continue;
        }
        // Detect the type on the first real line
        if (!$detectedType) {
            $detectedType = preg_match($whitespacedPattern, $value) ? 'whitespaced' : 'ms-token';
        }
        if ($detectedType === 'ms-token') {
            $parts = preg_split('/[=:]/', $value, 3);
            if (count($parts) < 2) {
                // Line without a mapping
                continue;
            }
            list($hexbyte, $utf8) = $parts;
        } else {
            $regA = [];
            if (!preg_match($whitespacedPattern, $value, $regA)) {
                // Line does not follow the detected syntax
                continue;
            }
            $hexbyte = $regA[1];
            $utf8 = 'U+' . $regA[2];
        }
        $decval = hexdec(trim($hexbyte));
        if ($decval > 127) {
            // Strip the "U+" prefix and store the mapping in both directions
            $utf8Char = $this->UnumberToChar(hexdec(substr(trim($utf8), 2)));
            $parsedTable['local'][$decval] = $utf8Char;
            $parsedTable['utf8'][$utf8Char] = $decval;
        }
    }
    $this->parsedCharsets[$charset] = $parsedTable;
    if ($cacheFile) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($parsedTable));
    }
    return 2;
}
804
805 /**
806 * This function initializes all UTF-8 character data tables.
807 *
808 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
809 *
810 * @param string $mode Mode ("case", "ascii", ...)
811 * @return int Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
812 * @access private
813 */
814 public function initUnicodeData($mode = null)
815 {
816 // Cache files
817 $cacheFileCase = GeneralUtility::getFileAbsFileName('typo3temp/var/charset/cscase_utf-8.tbl');
818 $cacheFileASCII = GeneralUtility::getFileAbsFileName('typo3temp/var/charset/csascii_utf-8.tbl');
819 // Only process if the tables are not yet loaded
820 switch ($mode) {
821 case 'case':
822 if (is_array($this->caseFolding['utf-8'])) {
823 return 1;
824 }
825 // Use cached version if possible
826 if ($cacheFileCase && @is_file($cacheFileCase)) {
827 $this->caseFolding['utf-8'] = unserialize(file_get_contents($cacheFileCase));
828 return 2;
829 }
830 break;
831 case 'ascii':
832 if (is_array($this->toASCII['utf-8'])) {
833 return 1;
834 }
835 // Use cached version if possible
836 if ($cacheFileASCII && @is_file($cacheFileASCII)) {
837 $this->toASCII['utf-8'] = unserialize(file_get_contents($cacheFileASCII));
838 return 2;
839 }
840 break;
841 }
842 // Process main Unicode data file
843 $unicodeDataFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/UnicodeData.txt';
844 if (!(GeneralUtility::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
845 return false;
846 }
847 $fh = fopen($unicodeDataFile, 'rb');
848 if (!$fh) {
849 return false;
850 }
851 // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
852 // Note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
853 $this->caseFolding['utf-8'] = [];
854 $utf8CaseFolding = &$this->caseFolding['utf-8'];
855 // a shorthand
856 $utf8CaseFolding['toUpper'] = [];
857 $utf8CaseFolding['toLower'] = [];
858 $utf8CaseFolding['toTitle'] = [];
859 // Array of temp. decompositions
860 $decomposition = [];
861 // Array of chars that are marks (eg. composing accents)
862 $mark = [];
863 // Array of chars that are numbers (eg. digits)
864 $number = [];
865 // Array of chars to be omitted (eg. Russian hard sign)
866 $omit = [];
867 while (!feof($fh)) {
868 $line = fgets($fh, 4096);
869 // Has a lot of info
870 list($char, $name, $cat, , , $decomp, , , $num, , , , $upper, $lower, $title, ) = explode(';', rtrim($line));
871 $ord = hexdec($char);
872 if ($ord > 65535) {
873 // Only process the BMP
874 break;
875 }
876 $utf8_char = $this->UnumberToChar($ord);
877 if ($upper) {
878 $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
879 }
880 if ($lower) {
881 $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
882 }
883 // Store "title" only when different from "upper" (only a few)
884 if ($title && $title !== $upper) {
885 $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
886 }
887 switch ($cat[0]) {
888 case 'M':
889 // mark (accent, umlaut, ...)
890 $mark['U+' . $char] = 1;
891 break;
892 case 'N':
893 // numeric value
894 if ($ord > 128 && $num !== '') {
895 $number['U+' . $char] = $num;
896 }
897 }
898 // Accented Latin letters without "official" decomposition
899 $match = [];
900 if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
901 $c = ord($match[2]);
902 if ($match[1] === 'SMALL') {
903 $c += 32;
904 }
905 $decomposition['U+' . $char] = [dechex($c)];
906 continue;
907 }
908 $match = [];
909 if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
910 switch ($match[1]) {
911 case '<circle>':
912 // add parenthesis as circle replacement, eg (1)
913 $match[2] = '0028 ' . $match[2] . ' 0029';
914 break;
915 case '<square>':
916 // add square brackets as square replacement, eg [1]
917 $match[2] = '005B ' . $match[2] . ' 005D';
918 break;
919 case '<compat>':
920 // ignore multi char decompositions that start with a space
921 if (preg_match('/^0020 /', $match[2])) {
922 continue 2;
923 }
924 break;
925 case '<initial>':
926 case '<medial>':
927 case '<final>':
928 case '<isolated>':
929 case '<vertical>':
930 continue 2;
931 }
932 $decomposition['U+' . $char] = explode(' ', $match[2]);
933 }
934 }
935 fclose($fh);
936 // Process additional Unicode data for casing (allow folded characters to expand into a sequence)
937 $specialCasingFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/SpecialCasing.txt';
938 if (GeneralUtility::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
939 $fh = fopen($specialCasingFile, 'rb');
940 if ($fh) {
941 while (!feof($fh)) {
942 $line = fgets($fh, 4096);
943 if ($line[0] !== '#' && trim($line) !== '') {
944 list($char, $lower, $title, $upper, $cond) = GeneralUtility::trimExplode(';', $line);
945 if ($cond === '' || $cond[0] === '#') {
946 $utf8_char = $this->UnumberToChar(hexdec($char));
947 if ($char !== $lower) {
948 $arr = explode(' ', $lower);
949 for ($i = 0; isset($arr[$i]); $i++) {
950 $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
951 }
952 $utf8CaseFolding['toLower'][$utf8_char] = implode('', $arr);
953 }
954 if ($char !== $title && $title !== $upper) {
955 $arr = explode(' ', $title);
956 for ($i = 0; isset($arr[$i]); $i++) {
957 $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
958 }
959 $utf8CaseFolding['toTitle'][$utf8_char] = implode('', $arr);
960 }
961 if ($char !== $upper) {
962 $arr = explode(' ', $upper);
963 for ($i = 0; isset($arr[$i]); $i++) {
964 $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
965 }
966 $utf8CaseFolding['toUpper'][$utf8_char] = implode('', $arr);
967 }
968 }
969 }
970 }
971 fclose($fh);
972 }
973 }
974 // Process custom decompositions
975 $customTranslitFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/Translit.txt';
976 if (GeneralUtility::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
977 $fh = fopen($customTranslitFile, 'rb');
978 if ($fh) {
979 while (!feof($fh)) {
980 $line = fgets($fh, 4096);
981 if ($line[0] !== '#' && trim($line) !== '') {
982 list($char, $translit) = GeneralUtility::trimExplode(';', $line);
983 if (!$translit) {
984 $omit['U+' . $char] = 1;
985 }
986 $decomposition['U+' . $char] = explode(' ', $translit);
987 }
988 }
989 fclose($fh);
990 }
991 }
992 // Decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
993 foreach ($decomposition as $from => $to) {
994 $code_decomp = [];
995 while ($code_value = array_shift($to)) {
996 // Do recursive decomposition
997 if (isset($decomposition['U+' . $code_value])) {
998 foreach (array_reverse($decomposition['U+' . $code_value]) as $cv) {
999 array_unshift($to, $cv);
1000 }
1001 } elseif (!isset($mark['U+' . $code_value])) {
1002 // remove mark
1003 array_push($code_decomp, $code_value);
1004 }
1005 }
1006 if (!empty($code_decomp) || isset($omit[$from])) {
1007 $decomposition[$from] = $code_decomp;
1008 } else {
1009 unset($decomposition[$from]);
1010 }
1011 }
1012 // Create ascii only mapping
1013 $this->toASCII['utf-8'] = [];
1014 $ascii = &$this->toASCII['utf-8'];
1015 foreach ($decomposition as $from => $to) {
1016 $code_decomp = [];
1017 while ($code_value = array_shift($to)) {
1018 $ord = hexdec($code_value);
1019 if ($ord > 127) {
1020 continue 2;
1021 } else {
1022 // Skip decompositions containing non-ASCII chars
1023 array_push($code_decomp, chr($ord));
1024 }
1025 }
1026 $ascii[$this->UnumberToChar(hexdec($from))] = implode('', $code_decomp);
1027 }
1028 // Add numeric decompositions
1029 foreach ($number as $from => $to) {
1030 $utf8_char = $this->UnumberToChar(hexdec($from));
1031 if (!isset($ascii[$utf8_char])) {
1032 $ascii[$utf8_char] = $to;
1033 }
1034 }
1035 if ($cacheFileCase) {
1036 GeneralUtility::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
1037 }
1038 if ($cacheFileASCII) {
1039 GeneralUtility::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
1040 }
1041 return 3;
1042 }
1043
/**
 * Initializes the case folding table for a charset other than UTF-8.
 * This function is automatically called by the case folding functions.
 *
 * The table is taken from memory when already loaded, otherwise from the cache file
 * below typo3temp/var/charset/, otherwise it is derived from the UTF-8 case folding
 * table and written to that cache file.
 *
 * @param string $charset Charset for which to initialize case folding.
 * @return int|bool FALSE on error, otherwise 1 (table already loaded), 2 (cached version used) or 3 (table parsed and cached).
 * @access private
 */
public function initCaseFolding($charset)
{
    // Only process if the case table is not yet loaded
    if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset])) {
        return 1;
    }
    // Use cached version if possible. A cache file that does not unserialize
    // to an array is ignored, the table is rebuilt and the file rewritten below.
    $cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/var/charset/cscase_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $cachedTable = unserialize(file_get_contents($cacheFile));
        if (is_array($cachedTable)) {
            $this->caseFolding[$charset] = $cachedTable;
            return 2;
        }
    }
    // Init UTF-8 conversion for this charset
    if (!$this->initCharset($charset)) {
        return false;
    }
    // UTF-8 case folding is used as the base conversion table
    if (!$this->initUnicodeData('case')) {
        return false;
    }
    $nochar = chr($this->noCharByteVal);
    foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
        // Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
        $c = $this->utf8_decode($utf8, $charset);
        foreach (['toUpper', 'toLower', 'toTitle'] as $direction) {
            // Characters without an entry in the UTF-8 table have no mapping to transfer
            if (!isset($this->caseFolding['utf-8'][$direction][$utf8])) {
                continue;
            }
            $cc = $this->utf8_decode($this->caseFolding['utf-8'][$direction][$utf8], $charset);
            if ($cc !== '' && $cc !== $nochar) {
                $this->caseFolding[$charset][$direction][$c] = $cc;
            }
        }
    }
    // Add the ASCII case table: lower case letters first, then upper case letters
    for ($i = ord('a'); $i <= ord('z'); $i++) {
        $this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i - 32);
    }
    for ($i = ord('A'); $i <= ord('Z'); $i++) {
        $this->caseFolding[$charset]['toLower'][chr($i)] = chr($i + 32);
    }
    if ($cacheFile) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->caseFolding[$charset]));
    }
    return 3;
}
1105
/**
 * Initializes the to-ASCII conversion table for a charset other than UTF-8.
 * This function is automatically called by the ASCII transliteration functions.
 *
 * The table is taken from memory when already loaded, otherwise from the cache file
 * below typo3temp/var/charset/, otherwise it is derived from the UTF-8 to-ASCII
 * table and written to that cache file.
 *
 * @param string $charset Charset for which to initialize conversion.
 * @return int|bool FALSE on error, otherwise 1 (table already loaded), 2 (cached version used) or 3 (table parsed and cached).
 * @access private
 */
public function initToASCII($charset)
{
    // Only process if the conversion table is not yet loaded
    if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) {
        return 1;
    }
    // Use cached version if possible. A cache file that does not unserialize
    // to an array is ignored, the table is rebuilt and the file rewritten below.
    $cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/var/charset/csascii_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $cachedTable = unserialize(file_get_contents($cacheFile));
        if (is_array($cachedTable)) {
            $this->toASCII[$charset] = $cachedTable;
            return 2;
        }
    }
    // Init UTF-8 conversion for this charset
    if (!$this->initCharset($charset)) {
        return false;
    }
    // UTF-8/ASCII transliteration is used as the base conversion table
    if (!$this->initUnicodeData('ascii')) {
        return false;
    }
    // Start from an empty table so that a charset without any transliterable
    // character still ends up with an array (and a valid cache file)
    $this->toASCII[$charset] = [];
    foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
        // Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
        $c = $this->utf8_decode($utf8, $charset);
        if (isset($this->toASCII['utf-8'][$utf8])) {
            $this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
        }
    }
    if ($cacheFile) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->toASCII[$charset]));
    }
    return 3;
}
1146
1147 /********************************************
1148 *
1149 * String operation functions
1150 *
1151 ********************************************/
/**
 * Returns a part of a string.
 *
 * Uses mbstring or iconv when that is the active conversion strategy, otherwise
 * the internal UTF-8 / EUC implementations or plain byte arithmetic for
 * fixed-width and single-byte charsets.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param int $start Start position (character position)
 * @param int|null $len Length (in characters), NULL means "up to the end of the string"
 * @return string|bool The substring; the underlying implementation may return FALSE when $start is out of range
 * @see substr(), mb_substr()
 */
public function substr($charset, $string, $start, $len = null)
{
    if ($len === 0 || $string === '') {
        return '';
    }
    $strategy = $this->getConversionStrategy();
    if ($strategy === self::STRATEGY_MBSTRING) {
        // mb_substr() treats a NULL length as "up to the end of the string",
        // so the global internal encoding does not have to be swapped
        return mb_substr($string, $start, $len, $charset);
    }
    if ($strategy === self::STRATEGY_ICONV) {
        if ($len === null) {
            // Pass a length that covers the whole string instead of switching the
            // (deprecated) iconv internal encoding setting
            $len = iconv_strlen($string, $charset);
            if ($len === false) {
                return false;
            }
        }
        return iconv_substr($string, $start, $len, $charset);
    }
    if ($charset === 'utf-8') {
        return $this->utf8_substr($string, $start, $len);
    }
    if (!empty($this->eucBasedSets[$charset])) {
        return $this->euc_substr($string, $start, $charset, $len);
    }
    // Fixed-width encodings; everything else is treated as single-byte encoding
    $bytesPerChar = 1;
    if (!empty($this->twoByteSets[$charset])) {
        $bytesPerChar = 2;
    } elseif (!empty($this->fourByteSets[$charset])) {
        $bytesPerChar = 4;
    }
    // A NULL length must not be multiplied: it would become 0 and yield an empty result
    return $len === null
        ? substr($string, $start * $bytesPerChar)
        : substr($string, $start * $bytesPerChar, $len * $bytesPerChar);
}
1206
/**
 * Counts the number of characters.
 *
 * Uses mbstring or iconv when that is the active conversion strategy, otherwise
 * the internal UTF-8 / EUC implementations or byte arithmetic for fixed-width
 * and single-byte charsets.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @return int The number of characters
 * @see strlen()
 */
public function strlen($charset, $string)
{
    $strategy = $this->getConversionStrategy();
    if ($strategy === self::STRATEGY_MBSTRING) {
        return mb_strlen($string, $charset);
    }
    if ($strategy === self::STRATEGY_ICONV) {
        return iconv_strlen($string, $charset);
    }
    if ($charset === 'utf-8') {
        return $this->utf8_strlen($string);
    }
    if (!empty($this->eucBasedSets[$charset])) {
        return $this->euc_strlen($string, $charset);
    }
    $byteLength = strlen($string);
    // Fixed-width encodings: only complete characters are counted, so the
    // result stays an integer even for a truncated trailing character
    if (!empty($this->twoByteSets[$charset])) {
        return (int)($byteLength / 2);
    }
    if (!empty($this->fourByteSets[$charset])) {
        return (int)($byteLength / 4);
    }
    // Treat everything else as single-byte encoding
    return $byteLength;
}
1234
/**
 * Crops a string with the help of mb_strlen() and mb_substr().
 *
 * A positive $len keeps the first $len characters and appends $crop, a negative
 * $len keeps the last abs($len) characters and prepends $crop. The string is
 * returned unchanged when $len is 0 or the string is not longer than abs($len).
 *
 * @param string $charset The character set
 * @param string $string String to be cropped
 * @param int $len Crop length (in characters)
 * @param string $crop Crop signifier
 * @return string The shortened string
 * @see mb_strlen(), mb_substr()
 */
protected function cropMbstring($charset, $string, $len, $crop = '')
{
    if ((int)$len === 0) {
        return $string;
    }
    $charCount = mb_strlen($string, $charset);
    if ($charCount <= abs($len)) {
        // Nothing to cut off
        return $string;
    }
    return $len > 0
        ? mb_substr($string, 0, $len, $charset) . $crop
        : $crop . mb_substr($string, $len, $charCount, $charset);
}
1257
/**
 * Truncates a string and pre-/appends a string.
 *
 * A positive $len keeps the first $len characters and appends $crop, a negative
 * $len keeps the last abs($len) characters and prepends $crop. The string is
 * returned unchanged when $len is 0 or lies outside the actual string length.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param int $len Length (in characters)
 * @param string $crop Crop signifier
 * @return string The shortened string
 * @see substr(), mb_strimwidth()
 */
public function crop($charset, $string, $len, $crop = '')
{
    if ($this->getConversionStrategy() === self::STRATEGY_MBSTRING) {
        return $this->cropMbstring($charset, $string, $len, $crop);
    }
    if ((int)$len === 0) {
        return $string;
    }
    // Translate the character count into the byte position of the cut
    if ($charset === 'utf-8') {
        $i = $this->utf8_char2byte_pos($string, $len);
    } elseif (!empty($this->eucBasedSets[$charset])) {
        $i = $this->euc_char2byte_pos($string, $len, $charset);
    } elseif ($len > 0) {
        $i = $len;
    } else {
        $i = strlen($string) + $len;
        if ($i <= 0) {
            $i = false;
        }
    }
    // $len outside actual string length
    if ($i === false) {
        return $string;
    }
    if ($len > 0) {
        if (isset($string[$i])) {
            return substr($string, 0, $i) . $crop;
        }
    } elseif ($i > 0 && isset($string[$i - 1])) {
        // $i > 0 is required explicitly: isset() accepts negative string offsets
        // on PHP >= 7.1, which would prepend $crop to a string that is not cut
        return $crop . substr($string, $i);
    }
    return $string;
}
1307
/**
 * Cuts a string short at a given byte length without splitting a character.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param int $len The byte length
 * @return string The shortened string, empty when $len is not positive
 * @see mb_strcut()
 */
public function strtrunc($charset, $string, $len)
{
    if ($len <= 0) {
        return '';
    }
    if ($this->getConversionStrategy() === self::STRATEGY_MBSTRING) {
        return mb_strcut($string, 0, $len, $charset);
    }
    if ($charset === 'utf-8') {
        return $this->utf8_strtrunc($string, $len);
    }
    if (!empty($this->eucBasedSets[$charset])) {
        return $this->euc_strtrunc($string, $len, $charset);
    }
    // Fixed-width encodings: realign the length to a character boundary
    if (!empty($this->twoByteSets[$charset])) {
        $len -= $len % 2;
    } elseif (!empty($this->fourByteSets[$charset])) {
        $len -= $len % 4;
    }
    // Treat everything else as single-byte encoding
    return substr($string, 0, $len);
}
1340
/**
 * Translates all characters of a string into their respective case values.
 * Unlike strtolower() and strtoupper() this method is locale independent.
 * Note that the string length may change!
 * eg. lower case German "ß" (sharp S) becomes upper case "SS"
 * Real case folding is language dependent, this method ignores this fact.
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert case for
 * @param string $case Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
 * @return string The converted string
 * @see strtolower(), strtoupper()
 */
public function conv_case($charset, $string, $case)
{
    if ($this->getConversionStrategy() === self::STRATEGY_MBSTRING) {
        return $case === 'toLower'
            ? mb_strtolower($string, $charset)
            : mb_strtoupper($string, $charset);
    }
    if ($charset === 'utf-8') {
        return $this->utf8_char_mapping($string, 'case', $case);
    }
    if (isset($this->eucBasedSets[$charset])) {
        return $this->euc_char_mapping($string, $charset, 'case', $case);
    }
    // Treat everything else as single-byte encoding
    return $this->sb_char_mapping($string, $charset, 'case', $case);
}
1373
/**
 * Equivalent of lcfirst/ucfirst but using character set.
 *
 * Only the first character of the string is run through conv_case(),
 * the remainder is appended unchanged.
 *
 * @param string $charset Character set of string
 * @param string $string Input string
 * @param string $case Case keyword as accepted by conv_case()
 * @return string The string with its first character converted
 * @see \TYPO3\CMS\Core\Charset\CharsetConverter::conv_case()
 */
public function convCaseFirst($charset, $string, $case)
{
    $convertedHead = $this->conv_case($charset, $this->substr($charset, $string, 0, 1), $case);
    return $convertedHead . $this->substr($charset, $string, 1);
}
1390
/**
 * Converts special chars (like æøåÆØÅ, umlauts etc) to ascii equivalents (usually double-bytes, like æ => ae etc.)
 *
 * Dispatches to the UTF-8, EUC or single-byte character mapping depending on $charset.
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert
 * @return string The converted string
 */
public function specCharsToASCII($charset, $string)
{
    if ($charset === 'utf-8') {
        return $this->utf8_char_mapping($string, 'ascii');
    }
    if (isset($this->eucBasedSets[$charset])) {
        return $this->euc_char_mapping($string, $charset, 'ascii');
    }
    // Treat everything else as single-byte encoding
    return $this->sb_char_mapping($string, $charset, 'ascii');
}
1410
/**
 * Converts the language codes that we get from the client (usually HTTP_ACCEPT_LANGUAGE)
 * into a TYPO3-readable language code.
 *
 * Logs a deprecation entry and forwards the call to Locales::getPreferredClientLanguage().
 *
 * @param string $languageCodesList List of language codes. something like 'de,en-us;q=0.9,de-de;q=0.7,es-cl;q=0.6,en;q=0.4,es;q=0.3,zh;q=0.1'
 * @return string A preferred language that TYPO3 supports, or "default" if none found
 * @deprecated since TYPO3 v8, will be removed in TYPO3 v9, use Locales::getPreferredClientLanguage() for usage
 */
public function getPreferredClientLanguage($languageCodesList)
{
    GeneralUtility::logDeprecatedFunction();
    return GeneralUtility::makeInstance(Locales::class)->getPreferredClientLanguage($languageCodesList);
}
1426
1427 /********************************************
1428 *
1429 * Internal string operation functions
1430 *
1431 ********************************************/
/**
 * Maps all characters of a string in a single byte charset.
 *
 * Every byte that has an entry in the selected table is replaced by that entry,
 * all other bytes are copied unchanged. The string is returned untouched when
 * the table cannot be initialized or $mode is unknown.
 *
 * @param string $str The string
 * @param string $charset The charset
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string
 */
public function sb_char_mapping($str, $charset, $mode, $opt = '')
{
    // Select the mapping table (by reference, the tables may be large)
    switch ($mode) {
        case 'case':
            if (!$this->initCaseFolding($charset)) {
                return $str;
            }
            $map = &$this->caseFolding[$charset][$opt];
            break;
        case 'ascii':
            if (!$this->initToASCII($charset)) {
                return $str;
            }
            $map = &$this->toASCII[$charset];
            break;
        default:
            return $str;
    }
    $out = '';
    $i = 0;
    while (isset($str[$i])) {
        $byte = $str[$i++];
        $out .= isset($map[$byte]) ? $map[$byte] : $byte;
    }
    return $out;
}
1472
1473 /********************************************
1474 *
1475 * Internal UTF-8 string operation functions
1476 *
1477 ********************************************/
/**
 * Returns a part of a UTF-8 string.
 *
 * @param string $str UTF-8 string
 * @param int $start Start position (character position)
 * @param int $len Length (in characters)
 * @return string|bool The substring, FALSE when a positive $start lies outside the string
 * @see substr()
 */
public function utf8_substr($str, $start, $len = null)
{
    if ((string)$len === '0') {
        return '';
    }
    $startByte = $this->utf8_char2byte_pos($str, $start);
    if ($startByte === false && $start > 0) {
        // $start outside string length
        return false;
    }
    // A FALSE position from a non-positive $start makes substr() begin at byte 0
    $tail = substr($str, $startByte);
    if ($len == null) {
        return $tail;
    }
    $endByte = $this->utf8_char2byte_pos($tail, $len);
    if ($endByte !== false) {
        return substr($tail, 0, $endByte);
    }
    // $len outside actual string length: a negative length that exceeds
    // the string yields a blank string, a positive one the whole remainder
    return $len < 0 ? '' : $tail;
}
1514
/**
 * Counts the number of characters of a string in UTF-8.
 *
 * Every byte that is not a continuation byte (10xxxxxx) starts a character,
 * so counting those bytes yields the character count.
 *
 * @param string $str UTF-8 multibyte character string
 * @return int The number of characters
 * @see strlen()
 */
public function utf8_strlen($str)
{
    $count = 0;
    $i = 0;
    while (isset($str[$i])) {
        // Single-byte (0xxxxxxx) and multi-byte starting bytes (11xxxxxx) are counted
        if ((ord($str[$i]) & 192) !== 128) {
            $count++;
        }
        $i++;
    }
    return $count;
}
1538
/**
 * Truncates a string in UTF-8 short at a given byte length.
 *
 * The cut never splits a multi-byte character: when byte $len - 1 belongs to a
 * character that does not completely fit into $len bytes, the string is cut in
 * front of that character instead.
 *
 * @param string $str UTF-8 multibyte character string
 * @param int $len The byte length
 * @return string The shortened string, empty when $len is not positive
 * @see mb_strcut()
 */
public function utf8_strtrunc($str, $len)
{
    $str = (string)$str;
    $len = (int)$len;
    if ($len <= 0) {
        return '';
    }
    if ($len > strlen($str)) {
        // Nothing to cut, and byte $len - 1 does not exist
        return $str;
    }
    $i = $len - 1;
    // Part of a multibyte sequence
    if (ord($str[$i]) & 128) {
        // Walk back to the starting byte (11xxxxxx) of the sequence
        while ($i > 0 && !(ord($str[$i]) & 64)) {
            $i--;
        }
        if (!(ord($str[$i]) & 64)) {
            // Only continuation bytes up to the beginning of the string
            return '';
        }
        // Sanity check: calculate the number of bytes the starting byte announces
        for ($bc = 0, $mbs = ord($str[$i]); $mbs & 128; $mbs = $mbs << 1) {
            $bc++;
        }
        if ($bc + $i > $len) {
            // The character does not fit completely, cut in front of it
            return substr($str, 0, $i);
        }
    }
    return substr($str, 0, $len);
}
1568
/**
 * Find position of first occurrence of a string, both arguments are in UTF-8.
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 string to search for
 * @param int $offset Position to start the search
 * @return int|bool The character position, FALSE when the needle is not found or $offset lies beyond the string
 * @see strpos()
 */
public function utf8_strpos($haystack, $needle, $offset = 0)
{
    $strategy = $this->getConversionStrategy();
    if ($strategy === self::STRATEGY_MBSTRING) {
        return mb_strpos($haystack, $needle, $offset, 'utf-8');
    }
    if ($strategy === self::STRATEGY_ICONV) {
        return iconv_strpos($haystack, $needle, $offset, 'utf-8');
    }
    // Fallback: search on byte level, then translate the hit back into a character position
    $offsetByte = $this->utf8_char2byte_pos($haystack, $offset);
    if ($offsetByte === false) {
        // Offset beyond string length
        return false;
    }
    $hitByte = strpos($haystack, $needle, $offsetByte);
    // FALSE here means: needle not found
    return $hitByte === false ? false : $this->utf8_byte2char_pos($haystack, $hitByte);
}
1597
/**
 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 character to search for (single character)
 * @return int|bool The character position, FALSE when the needle is not found
 * @see strrpos()
 */
public function utf8_strrpos($haystack, $needle)
{
    $strategy = $this->getConversionStrategy();
    if ($strategy === self::STRATEGY_MBSTRING) {
        // The third argument of mb_strrpos() is the offset, the encoding comes fourth
        return mb_strrpos($haystack, $needle, 0, 'utf-8');
    }
    if ($strategy === self::STRATEGY_ICONV) {
        return iconv_strrpos($haystack, $needle, 'utf-8');
    }
    // Fallback: search on byte level, then translate the hit back into a character position
    $byte_pos = strrpos($haystack, $needle);
    if ($byte_pos === false) {
        // Needle not found
        return false;
    }
    return $this->utf8_byte2char_pos($haystack, $byte_pos);
}
1620
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * For a non-negative $pos the string is scanned forwards and the byte offset
 * behind the first $pos characters is returned. For a negative $pos the string
 * is scanned backwards and the byte offset of the abs($pos)-th character from
 * the end is returned.
 *
 * @param string $str UTF-8 string
 * @param int $pos Character position (negative values start from the end)
 * @return int|bool Byte position, FALSE when the scan leaves the string before reaching it
 */
public function utf8_char2byte_pos($str, $pos)
{
    $str = (string)$str;
    $length = strlen($str);
    // Number of characters found
    $n = 0;
    // Number of characters wanted
    $p = abs($pos);
    if ($pos >= 0) {
        $i = 0;
        $d = 1;
    } else {
        $i = $length - 1;
        $d = -1;
    }
    // Explicit bounds instead of isset($str[$i]): isset() accepts negative
    // string offsets on PHP >= 7.1 and would let the backward scan wrap around
    for (; $i >= 0 && $i < $length && $n < $p; $i += $d) {
        // Single-byte (0xxxxxxx) and multi-byte starting bytes (11xxxxxx) count as a character
        if ((ord($str[$i]) & 192) !== 128) {
            $n++;
        }
    }
    if ($i < 0 || $i >= $length) {
        // Offset beyond string length
        return false;
    }
    if ($pos >= 0) {
        // Skip trailing multi-byte data bytes, without reading past the end of the string
        while ($i < $length && (ord($str[$i]) & 192) === 128) {
            $i++;
        }
    } else {
        // Correct offset: the scan stopped one byte in front of the wanted character
        $i++;
    }
    return $i;
}
1667
/**
 * Translates an 'absolute' byte position into a character position.
 *
 * Counts the character boundaries in the byte range 1..$pos. The position
 * directly behind the last byte is accepted and counts as a boundary.
 *
 * @param string $str UTF-8 string
 * @param int $pos Byte position
 * @return int|bool Character position, FALSE for an empty string or a position outside the string
 */
public function utf8_byte2char_pos($str, $pos)
{
    $str = (string)$str;
    $pos = (int)$pos;
    $length = strlen($str);
    if ($length === 0 || $pos < 0 || $pos > $length) {
        // Offset beyond string length
        return false;
    }
    // Number of characters
    $n = 0;
    for ($i = $pos; $i > 0; $i--) {
        // Single-byte (0xxxxxxx) and multi-byte starting bytes (11xxxxxx) mark a boundary;
        // the end-of-string position has no byte to inspect and is a boundary by definition
        if ($i === $length || (ord($str[$i]) & 192) !== 128) {
            $n++;
        }
    }
    return $n;
}
1696
/**
 * Maps all characters of an UTF-8 string.
 *
 * Every character that has an entry in the selected table is replaced by that
 * entry, all other characters are copied unchanged. The string is returned
 * untouched when the Unicode data cannot be initialized or $mode is unknown.
 *
 * @param string $str UTF-8 string
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string
 */
public function utf8_char_mapping($str, $mode, $opt = '')
{
    if (!$this->initUnicodeData($mode)) {
        // Do nothing
        return $str;
    }
    switch ($mode) {
        case 'case':
            $map = &$this->caseFolding['utf-8'][$opt];
            break;
        case 'ascii':
            $map = &$this->toASCII['utf-8'];
            break;
        default:
            return $str;
    }
    $out = '';
    for ($i = 0; isset($str[$i]); $i++) {
        $c = ord($str[$i]);
        if (($c & 192) === 192) {
            // Multi-byte starting byte (11xxxxxx): calculate number of bytes
            for ($bc = 0; $c & 128; $c = $c << 1) {
                $bc++;
            }
            $mbc = substr($str, $i, $bc);
            $i += $bc - 1;
        } else {
            // Single-byte (0xxxxxxx), or a stray continuation byte (10xxxxxx) of
            // malformed input which is handled as a unit of its own instead of
            // repeating the previous character
            $mbc = $str[$i];
        }
        $out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
    }
    return $out;
}
1744
1745 /********************************************
1746 *
1747 * Internal EUC string operation functions
1748 *
1749 * Extended Unix Code:
1750 * ASCII compatible 7bit single bytes chars
1751 * 8bit two byte chars
1752 *
1753 * Shift-JIS is treated as a special case.
1754 *
1755 ********************************************/
/**
 * Cuts a string in the EUC charset family short at a given byte length.
 *
 * The cut never splits a double-byte character: when byte $len - 1 is the first
 * byte of a double-byte character, the string is cut in front of it.
 *
 * @param string $str EUC multibyte character string
 * @param int $len The byte length
 * @param string $charset The charset
 * @return string The shortened string, empty when $len is not positive
 * @see mb_strcut()
 */
public function euc_strtrunc($str, $len, $charset)
{
    $str = (string)$str;
    $len = (int)$len;
    if ($len <= 0) {
        return '';
    }
    if (strlen($str) <= $len) {
        // String not longer than supplied length
        return $str;
    }
    $shiftJis = $charset === 'shift_jis';
    // Walk over whole characters until the byte length is reached or exceeded
    for ($i = 0; $i < $len; $i++) {
        $c = ord($str[$i]);
        if ($shiftJis ? ($c >= 128 && $c < 160 || $c >= 224) : $c >= 128) {
            // A double-byte char
            $i++;
        }
    }
    // $i > $len: we ended on a first byte, so that character has to be dropped
    return substr($str, 0, $i > $len ? $len - 1 : $len);
}
1791
/**
 * Returns a part of a string in the EUC charset family.
 *
 * Out-of-range arguments are handled like in utf8_substr(): a zero length gives
 * an empty string, a negative start reaching beyond the beginning starts at the
 * beginning, a negative length exceeding the remainder gives an empty string.
 *
 * @param string $str EUC multibyte character string
 * @param int $start Start position (character position)
 * @param string $charset The charset
 * @param int $len Length (in characters)
 * @return string|bool The substring, FALSE when a positive $start lies outside the string
 */
public function euc_substr($str, $start, $charset, $len = null)
{
    if ((string)$len === '0') {
        return '';
    }
    $byte_start = $this->euc_char2byte_pos($str, $start, $charset);
    if ($byte_start === false || $byte_start < 0) {
        if ($start > 0) {
            // $start outside string length
            return false;
        }
        // Negative $start reaching beyond the beginning: start at the first byte
        $byte_start = 0;
    }
    $str = substr($str, $byte_start);
    if ($len == null) {
        return $str;
    }
    $byte_end = $this->euc_char2byte_pos($str, $len, $charset);
    if ($byte_end === false || $byte_end < 0) {
        // $len outside actual string length
        return $len < 0 ? '' : $str;
    }
    return substr($str, 0, $byte_end);
}
1821
/**
 * Counts the number of characters of a string in the EUC charset family.
 *
 * Each lead byte of a double-byte character consumes two bytes, every other
 * byte one; the lead byte ranges differ for 'shift_jis'.
 *
 * @param string $str EUC multibyte character string
 * @param string $charset The charset
 * @return int The number of characters
 * @see strlen()
 */
public function euc_strlen($str, $charset)
{
    $isShiftJis = $charset === 'shift_jis';
    $count = 0;
    $i = 0;
    while (isset($str[$i])) {
        $byte = ord($str[$i]);
        $isDoubleByte = $isShiftJis
            ? ($byte >= 128 && $byte < 160 || $byte >= 224)
            : $byte >= 128;
        $i += $isDoubleByte ? 2 : 1;
        $count++;
    }
    return $count;
}
1849
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * NOTE(review): the bounds checks rely on isset($str[$i]); PHP >= 7.1 accepts
 * negative string offsets there, so a negative $pos that reaches beyond the
 * beginning of the string may not yield FALSE on those versions - confirm
 * against the callers before changing it.
 *
 * @param string $str EUC multibyte character string
 * @param int $pos Character position (negative values start from the end)
 * @param string $charset The charset
 * @return int|bool Byte position, FALSE when the offset lies beyond the string length
 */
public function euc_char2byte_pos($str, $pos, $charset)
{
    $isShiftJis = $charset === 'shift_jis';
    // Number of characters wanted
    $wanted = abs($pos);
    // Scan direction and first byte to look at
    $step = $pos >= 0 ? 1 : -1;
    $i = $pos >= 0 ? 0 : strlen($str) - 1;
    // Number of characters seen
    $seen = 0;
    while ($seen < $wanted && isset($str[$i])) {
        $byte = ord($str[$i]);
        $isDoubleByte = $isShiftJis
            ? ($byte >= 128 && $byte < 160 || $byte >= 224)
            : $byte >= 128;
        $i += $isDoubleByte ? 2 * $step : $step;
        $seen++;
    }
    if (!isset($str[$i])) {
        // Offset beyond string length
        return false;
    }
    // Correct offset for the backward scan
    return $pos < 0 ? $i + 1 : $i;
}
1895
/**
 * Maps all characters of a string in the EUC charset family.
 *
 * Every single- or double-byte character that has an entry in the selected
 * table is replaced by that entry, all other characters are copied unchanged.
 * The string is returned untouched when the table cannot be initialized or
 * $mode is unknown.
 *
 * @param string $str EUC multibyte character string
 * @param string $charset The charset
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string
 */
public function euc_char_mapping($str, $charset, $mode, $opt = '')
{
    // Select the mapping table (by reference)
    switch ($mode) {
        case 'case':
            if (!$this->initCaseFolding($charset)) {
                return $str;
            }
            $map = &$this->caseFolding[$charset][$opt];
            break;
        case 'ascii':
            if (!$this->initToASCII($charset)) {
                return $str;
            }
            $map = &$this->toASCII[$charset];
            break;
        default:
            return $str;
    }
    $isShiftJis = $charset === 'shift_jis';
    $out = '';
    $i = 0;
    while (isset($str[$i])) {
        $byte = ord($str[$i]);
        $isDoubleByte = $isShiftJis
            ? ($byte >= 128 && $byte < 160 || $byte >= 224)
            : $byte >= 128;
        if ($isDoubleByte) {
            // A double-byte char
            $char = substr($str, $i, 2);
            $i += 2;
        } else {
            $char = $str[$i];
            $i++;
        }
        $out .= isset($map[$char]) ? $map[$char] : $char;
    }
    return $out;
}
1951
/**
 * Returns the strategy for handling multi-byte data, determined once from the
 * PHP extensions that are loaded and then kept in $this->conversionStrategy.
 * "mbstring" takes precedence over "iconv".
 * See http://stackoverflow.com/questions/8233517/what-is-the-difference-between-iconv-and-mb-convert-encoding-in-php
 *
 * @return string One of the STRATEGY_* constants of this class
 */
protected function getConversionStrategy()
{
    if ($this->conversionStrategy !== null) {
        // Already determined by an earlier call
        return $this->conversionStrategy;
    }
    if (extension_loaded('mbstring')) {
        $detectedStrategy = self::STRATEGY_MBSTRING;
    } elseif (extension_loaded('iconv')) {
        $detectedStrategy = self::STRATEGY_ICONV;
    } else {
        $detectedStrategy = self::STRATEGY_FALLBACK;
    }
    $this->conversionStrategy = $detectedStrategy;
    return $this->conversionStrategy;
}
1972 }