a09e8e93e6dcb9854ed862519baa86815d3c7323
[Packages/TYPO3.CMS.git] / typo3 / sysext / core / Classes / Charset / CharsetConverter.php
1 <?php
2 namespace TYPO3\CMS\Core\Charset;
3
4 /*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use TYPO3\CMS\Core\Core\Environment;
18 use TYPO3\CMS\Core\SingletonInterface;
19 use TYPO3\CMS\Core\Utility\ExtensionManagementUtility;
20 use TYPO3\CMS\Core\Utility\GeneralUtility;
21
22 /**
23 * Notes on UTF-8
24 *
25 * Functions working on UTF-8 strings:
26 *
27 * - strchr/strstr
28 * - strrchr
29 * - substr_count
30 * - implode/explode/join
31 *
32 * Functions nearly working on UTF-8 strings:
33 *
34 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
35 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
36 * - preg_*: UTF-8 support is compiled into PCRE by default nowadays, but could be unavailable; patterns need the "u" modifier
37 *
38 * Functions NOT working on UTF-8 strings:
39 *
40 * - str*cmp
41 * - stristr
42 * - stripos
43 * - substr
44 * - strrev
45 * - split/spliti
46 * - ...
47 */
48
49 /**
50 * Class for conversion between charsets
51 */
52 class CharsetConverter implements SingletonInterface
53 {
54 /**
55 * ASCII Value for chars with no equivalent.
56 *
57 * @var int
58 */
59 public $noCharByteVal = 63;
60
61 /**
62 * This is the array where parsed conversion tables are stored (cached)
63 *
64 * @var array
65 */
66 public $parsedCharsets = [];
67
68 /**
69 * An array where charset-to-ASCII mappings are stored (cached)
70 *
71 * @var array
72 */
73 public $toASCII = [];
74
75 /**
76 * This tells the converter which charsets have two bytes per char:
77 *
78 * @var array
79 */
80 public $twoByteSets = [
81 'ucs-2' => 1
82 ];
83
84 /**
85 * This tells the converter which charsets use a scheme like the Extended Unix Code:
86 *
87 * @var array
88 */
89 public $eucBasedSets = [
90 'gb2312' => 1, // Chinese, simplified.
91 'big5' => 1, // Chinese, traditional.
92 'euc-kr' => 1, // Korean
93 'shift_jis' => 1
94 ];
95
96 /**
97 * @link http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
98 * @link http://czyborra.com/charsets/iso8859.html
99 *
100 * @var array
101 */
102 public $synonyms = [
103 'us' => 'ascii',
104 'us-ascii' => 'ascii',
105 'cp819' => 'iso-8859-1',
106 'ibm819' => 'iso-8859-1',
107 'iso-ir-100' => 'iso-8859-1',
108 'iso-ir-101' => 'iso-8859-2',
109 'iso-ir-109' => 'iso-8859-3',
110 'iso-ir-110' => 'iso-8859-4',
111 'iso-ir-144' => 'iso-8859-5',
112 'iso-ir-127' => 'iso-8859-6',
113 'iso-ir-126' => 'iso-8859-7',
114 'iso-ir-138' => 'iso-8859-8',
115 'iso-ir-148' => 'iso-8859-9',
116 'iso-ir-157' => 'iso-8859-10',
117 'iso-ir-179' => 'iso-8859-13',
118 'iso-ir-199' => 'iso-8859-14',
119 'iso-ir-203' => 'iso-8859-15',
120 'csisolatin1' => 'iso-8859-1',
121 'csisolatin2' => 'iso-8859-2',
122 'csisolatin3' => 'iso-8859-3',
123 'csisolatin5' => 'iso-8859-9',
124 'csisolatin8' => 'iso-8859-14',
125 'csisolatin9' => 'iso-8859-15',
126 'csisolatingreek' => 'iso-8859-7',
127 'iso-celtic' => 'iso-8859-14',
128 'latin1' => 'iso-8859-1',
129 'latin2' => 'iso-8859-2',
130 'latin3' => 'iso-8859-3',
131 'latin5' => 'iso-8859-9',
132 'latin6' => 'iso-8859-10',
133 'latin8' => 'iso-8859-14',
134 'latin9' => 'iso-8859-15',
135 'l1' => 'iso-8859-1',
136 'l2' => 'iso-8859-2',
137 'l3' => 'iso-8859-3',
138 'l5' => 'iso-8859-9',
139 'l6' => 'iso-8859-10',
140 'l8' => 'iso-8859-14',
141 'l9' => 'iso-8859-15',
142 'cyrillic' => 'iso-8859-5',
143 'arabic' => 'iso-8859-6',
144 'tis-620' => 'iso-8859-11',
145 'win874' => 'windows-874',
146 'win1250' => 'windows-1250',
147 'win1251' => 'windows-1251',
148 'win1252' => 'windows-1252',
149 'win1253' => 'windows-1253',
150 'win1254' => 'windows-1254',
151 'win1255' => 'windows-1255',
152 'win1256' => 'windows-1256',
153 'win1257' => 'windows-1257',
154 'win1258' => 'windows-1258',
155 'cp1250' => 'windows-1250',
156 'cp1251' => 'windows-1251',
157 'cp1252' => 'windows-1252',
158 'ms-ee' => 'windows-1250',
159 'ms-ansi' => 'windows-1252',
160 'ms-greek' => 'windows-1253',
161 'ms-turk' => 'windows-1254',
162 'winbaltrim' => 'windows-1257',
163 'koi-8ru' => 'koi-8r',
164 'koi8r' => 'koi-8r',
165 'cp878' => 'koi-8r',
166 'mac' => 'macroman',
167 'macintosh' => 'macroman',
168 'euc-cn' => 'gb2312',
169 'x-euc-cn' => 'gb2312',
170 'euccn' => 'gb2312',
171 'cp936' => 'gb2312',
172 'big-5' => 'big5',
173 'cp950' => 'big5',
174 'eucjp' => 'euc-jp',
175 'sjis' => 'shift_jis',
176 'shift-jis' => 'shift_jis',
177 'cp932' => 'shift_jis',
178 'cp949' => 'euc-kr',
179 'utf7' => 'utf-7',
180 'utf8' => 'utf-8',
181 'utf16' => 'utf-16',
182 'utf32' => 'utf-32',
183 'ucs2' => 'ucs-2',
184 'ucs4' => 'ucs-4'
185 ];
186
/**
 * Normalizes a charset name.
 *
 * The name is lowercased, surrounding whitespace is stripped and, if the
 * result is a known alias in $this->synonyms, the canonical name is returned.
 *
 * @param string $charset Input charset
 * @return string Normalized charset
 */
public function parse_charset($charset)
{
    $normalized = trim(strtolower($charset));

    return isset($this->synonyms[$normalized])
        ? $this->synonyms[$normalized]
        : $normalized;
}
201
202 /********************************************
203 *
204 * Charset Conversion functions
205 *
206 ********************************************/
/**
 * Convert from one charset to another charset.
 *
 * mbstring is tried first whenever it can produce the requested result (the
 * target is UTF-8, or no entity fallback was requested). If mbstring does not
 * know one of the charsets, the table based converters utf8_encode() and
 * utf8_decode() are used instead.
 *
 * @param string $inputString Input string
 * @param string $fromCharset From charset (the current charset of the string)
 * @param string $toCharset To charset (the output charset wanted)
 * @param bool $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Converted string
 * @see convArray()
 */
public function conv($inputString, $fromCharset, $toCharset, $useEntityForNoChar = false)
{
    if ($fromCharset === $toCharset) {
        return $inputString;
    }
    // PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
    if ($toCharset === 'utf-8' || !$useEntityForNoChar) {
        try {
            // PHP < 8 returns FALSE for unsupported charsets
            $convertedString = mb_convert_encoding($inputString, $toCharset, $fromCharset);
        } catch (\ValueError $e) {
            // PHP >= 8 throws instead of returning FALSE; treat it the same way
            // so the table based fallback below is still reached.
            $convertedString = false;
        }
        if ($convertedString !== false) {
            return $convertedString;
        }
    }
    // Table based fallback: go through UTF-8 as the intermediate representation
    if ($fromCharset !== 'utf-8') {
        $inputString = $this->utf8_encode($inputString, $fromCharset);
    }
    if ($toCharset !== 'utf-8') {
        $inputString = $this->utf8_decode($inputString, $toCharset, $useEntityForNoChar);
    }
    return $inputString;
}
238
/**
 * Converts every string found in an array from one charset to another.
 * Nested arrays are processed recursively; values of any other type are left alone.
 * NOTICE: The array is passed by reference and modified in place!
 *
 * @param array $array Input array, possibly multidimensional
 * @param string $fromCharset From charset (the current charset of the string)
 * @param string $toCharset To charset (the output charset wanted)
 * @param bool $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @see conv()
 */
public function convArray(&$array, $fromCharset, $toCharset, $useEntityForNoChar = false)
{
    foreach ($array as $index => $unused) {
        if (is_array($array[$index])) {
            // Descend into the nested array (modified in place)
            $this->convArray($array[$index], $fromCharset, $toCharset, $useEntityForNoChar);
            continue;
        }
        if (is_string($array[$index])) {
            $array[$index] = $this->conv($array[$index], $fromCharset, $toCharset, $useEntityForNoChar);
        }
    }
}
259
/**
 * Converts $str from $charset to UTF-8 using the parsed conversion table.
 *
 * Bytes below 128 are copied unchanged for single-byte and EUC-like charsets.
 * Every other character is looked up in the 'local' part of the conversion
 * table; characters without a mapping are replaced by chr($this->noCharByteVal).
 *
 * @param string $str String in local charset to convert to UTF-8
 * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return string Output string, converted to UTF-8
 */
public function utf8_encode($str, $charset)
{
    if ($charset === 'utf-8') {
        return $str;
    }
    // Parse conv. table if not already done
    if (!$this->initCharset($charset)) {
        return '';
    }
    $str = (string)$str;
    $strLen = strlen($str);
    $outStr = '';
    $isTwoByteSet = isset($this->twoByteSets[$charset]);
    $isEucBasedSet = isset($this->eucBasedSets[$charset]);
    $localTable = $this->parsedCharsets[$charset]['local'];
    // Traverse each char in string
    for ($a = 0; $a < $strLen; $a++) {
        $chr = $str[$a];
        $ord = ord($chr);
        if ($isTwoByteSet) {
            // Two bytes per char, big endian assumed. A truncated trailing
            // char is completed with a zero low byte instead of reading
            // beyond the end of the string.
            $a++;
            $ord = $ord << 8 | ($a < $strLen ? ord($str[$a]) : 0);
        } elseif ($ord > 127) {
            // EUC uses two bytes above 127; fetch both and make $ord a 16 bit int.
            // Shift-JIS: chars between 160 and 223 are single byte.
            if ($isEucBasedSet && ($charset !== 'shift_jis' || $ord < 160 || $ord > 223)) {
                $a++;
                $ord = $ord * 256 + ($a < $strLen ? ord($str[$a]) : 0);
            }
        } else {
            // 7-bit ASCII is identical in UTF-8
            $outStr .= $chr;
            continue;
        }
        // Use the table entry if there is one, otherwise the "no char" placeholder
        $outStr .= isset($localTable[$ord]) ? $localTable[$ord] : chr($this->noCharByteVal);
    }
    return $outStr;
}
319
/**
 * Converts $str from UTF-8 to $charset using the parsed conversion table.
 *
 * 7-bit ASCII is copied unchanged. Multibyte sequences are looked up in the
 * 'utf8' part of the conversion table. A sequence without a mapping becomes a
 * hexadecimal numeric entity if $useEntityForNoChar is set, otherwise
 * chr($this->noCharByteVal). A stray continuation byte is always replaced by
 * chr($this->noCharByteVal).
 *
 * @param string $str String in UTF-8 to convert to local charset
 * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param bool $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Output string, converted to local charset
 */
public function utf8_decode($str, $charset, $useEntityForNoChar = false)
{
    if ($charset === 'utf-8') {
        return $str;
    }
    // Parse conv. table if not already done
    if (!$this->initCharset($charset)) {
        return '';
    }
    $strLen = strlen($str);
    $outStr = '';
    // Traverse each byte in the UTF-8 string
    for ($a = 0; $a < $strLen; $a++) {
        $chr = substr($str, $a, 1);
        $ord = ord($chr);
        if ($ord <= 127) {
            // Plain ASCII
            $outStr .= $chr;
            continue;
        }
        // A lead byte must have the 7th bit set. Otherwise we are in the middle of a byte sequence.
        if (!($ord & 64)) {
            $outStr .= chr($this->noCharByteVal);
            continue;
        }
        // Collect the whole sequence: every further leading 1-bit of the
        // first byte announces one more byte.
        $buf = $chr;
        for ($b = 0; $b < 8; $b++) {
            $ord = $ord << 1;
            if (!($ord & 128)) {
                break;
            }
            $a++;
            $buf .= substr($str, $a, 1);
        }
        if (isset($this->parsedCharsets[$charset]['utf8'][$buf])) {
            // The local number
            $mByte = $this->parsedCharsets[$charset]['utf8'][$buf];
            // Numbers above 255 are split in two chars (16 bit word assumed)
            if ($mByte > 255) {
                $outStr .= chr($mByte >> 8 & 255) . chr($mByte & 255);
            } else {
                $outStr .= chr($mByte);
            }
        } elseif ($useEntityForNoChar) {
            // Create numeric entity
            $outStr .= '&#' . $this->utf8CharToUnumber($buf, true) . ';';
        } else {
            $outStr .= chr($this->noCharByteVal);
        }
    }
    return $outStr;
}
388
/**
 * Replaces every multibyte UTF-8 character by a hexadecimal numeric entity.
 * ASCII characters are kept; a stray continuation byte is replaced by
 * chr($this->noCharByteVal).
 *
 * @param string $str Input string
 * @return string Output string
 */
public function utf8_to_entities($str)
{
    $result = '';
    $length = strlen($str);
    for ($pos = 0; $pos < $length; $pos++) {
        $byte = substr($str, $pos, 1);
        $code = ord($byte);
        if ($code <= 127) {
            // Plain ASCII passes through
            $result .= $byte;
            continue;
        }
        if (!($code & 64)) {
            // Not a lead byte: we are in the middle of a sequence
            $result .= chr($this->noCharByteVal);
            continue;
        }
        // Gather the sequence: each additional leading 1-bit of the lead
        // byte stands for one more byte to read.
        $sequence = $byte;
        for ($count = 0; $count < 8; $count++) {
            $code = $code << 1;
            if (!($code & 128)) {
                break;
            }
            $pos++;
            $sequence .= substr($str, $pos, 1);
        }
        $result .= '&#' . $this->utf8CharToUnumber($sequence, true) . ';';
    }
    return $result;
}
432
/**
 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars.
 * All string-HTML entities (like &amp; or &pound;) will be converted as well.
 *
 * Anything that looks like an entity but is neither a well-formed numeric
 * entity nor a named entity known to get_html_translation_table() (for
 * example "&#;", "&#xzz;" or "&bogus;") is left untouched.
 *
 * @param string $str Input string, UTF-8
 * @return string Output string
 */
public function entities_to_utf8($str)
{
    // Maps "&name;" => character
    $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT));
    return preg_replace_callback(
        '/&([#[:alnum:]]*);/',
        function (array $match) use ($trans_tbl) {
            $number = [];
            // Dec or hex entities; only digits valid for the given base are accepted
            if (preg_match('/^#(?:[xX]([0-9a-fA-F]+)|([0-9]+))$/', $match[1], $number)) {
                $codePoint = $number[1] !== '' ? hexdec($number[1]) : (int)$number[2];
                return $this->UnumberToChar($codePoint);
            }
            // Named entities; unknown ones are returned unchanged
            return isset($trans_tbl[$match[0]]) ? $trans_tbl[$match[0]] : $match[0];
        },
        $str
    );
}
470
/**
 * Splits a UTF-8 string into an array of its characters.
 * HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) are resolved
 * first, so each of them counts as one character.
 * Despite the method name, the array holds the UTF-8 characters themselves,
 * not their integer numbers. A stray continuation byte yields
 * chr($this->noCharByteVal).
 *
 * @param string $str Input string, UTF-8
 * @return array Output array with one UTF-8 character per element
 */
public function utf8_to_numberarray($str)
{
    // Entities must be registered as well
    $decoded = $this->entities_to_utf8($str);

    $characters = [];
    $length = strlen($decoded);
    for ($pos = 0; $pos < $length; $pos++) {
        $byte = substr($decoded, $pos, 1);
        $code = ord($byte);
        if ($code <= 127) {
            // Plain ASCII
            $characters[] = chr($code);
            continue;
        }
        if (!($code & 64)) {
            // Not a lead byte: we are in the middle of a sequence
            $characters[] = chr($this->noCharByteVal);
            continue;
        }
        // Gather the sequence: each additional leading 1-bit of the lead
        // byte stands for one more byte to read.
        $sequence = $byte;
        for ($count = 0; $count < 8; $count++) {
            $code = $code << 1;
            if (!($code & 128)) {
                break;
            }
            $pos++;
            $sequence .= substr($decoded, $pos, 1);
        }
        $characters[] = $sequence;
    }
    return $characters;
}
520
/**
 * Converts a UNICODE number to a UTF-8 multibyte character
 * Algorithm based on script found at From: http://czyborra.com/utf/
 *
 * The bits of the number are spread across the bytes; the number of high bits
 * set in the lead byte announces the length of the sequence:
 *
 * bytes | bits | representation
 *     1 |    7 | 0vvvvvvv
 *     2 |   11 | 110vvvvv 10vvvvvv
 *     3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
 *     4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 *     5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *     6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * Numbers that need more than 31 bits yield chr($this->noCharByteVal).
 *
 * @param int $unicodeInteger UNICODE integer
 * @return string UTF-8 multibyte character string
 * @see utf8CharToUnumber()
 */
public function UnumberToChar($unicodeInteger)
{
    if ($unicodeInteger < 128) {
        return chr($unicodeInteger);
    }
    // [exclusive upper bound, marker bits of the lead byte, number of continuation bytes]
    $layouts = [
        [2048, 192, 1],
        [65536, 224, 2],
        [2097152, 240, 3],
        [67108864, 248, 4],
        [2147483648, 252, 5],
    ];
    foreach ($layouts as list($upperBound, $leadMarker, $continuationBytes)) {
        if ($unicodeInteger >= $upperBound) {
            continue;
        }
        // Lead byte carries the topmost bits ...
        $char = chr($leadMarker | ($unicodeInteger >> (6 * $continuationBytes)));
        // ... each continuation byte the next lower group of six bits
        for ($shift = 6 * ($continuationBytes - 1); $shift >= 0; $shift -= 6) {
            $char .= chr(128 | (($unicodeInteger >> $shift) & 63));
        }
        return $char;
    }
    // Cannot express a 32-bit character in UTF-8
    return chr($this->noCharByteVal);
}
577
/**
 * Converts a UTF-8 multibyte character to a UNICODE number.
 * Only the first character of $str is evaluated. If the first byte is not a
 * multibyte lead byte, its byte value is used as the number.
 *
 * @param string $str UTF-8 multibyte character string
 * @param bool $hex If set, the number is returned as a hex string prefixed with "x" (e.g. "x20ac")
 * @return int|string UNICODE integer, or "x" followed by the hex number if $hex is set
 * @see UnumberToChar()
 */
public function utf8CharToUnumber($str, $hex = false)
{
    $leadByte = ord($str[0]);
    $codePoint = $leadByte;
    // Both top bits set: this IS the start of a multibyte sequence
    if (($leadByte & 192) === 192) {
        $shifted = $leadByte;
        $continuationBits = '';
        $continuationCount = 0;
        while ($continuationCount < 8) {
            // Every further leading 1-bit of the lead byte announces one continuation byte
            $shifted = $shifted << 1;
            if (!($shifted & 128)) {
                break;
            }
            // The lower six bits of each continuation byte are payload
            $continuationBits .= substr('00000000' . decbin(ord(substr($str, $continuationCount + 1, 1))), -6);
            $continuationCount++;
        }
        // The lead byte contributes its lowest (6 - count) bits
        $leadBits = substr('00000000' . decbin($leadByte), -(6 - $continuationCount));
        $codePoint = bindec($leadBits . $continuationBits);
    }
    return $hex ? 'x' . dechex($codePoint) : $codePoint;
}
613
614 /********************************************
615 *
616 * Init functions
617 *
618 ********************************************/
/**
 * This will initialize a charset for use if it's defined in the 'typo3/sysext/core/Resources/Private/Charsets/csconvtbl/' folder
 * This function is automatically called by the conversion functions
 *
 * The parsed table is stored in $this->parsedCharsets[$charset] with two maps:
 * 'local' (local char number => UTF-8 char) and 'utf8' (UTF-8 char => local
 * char number). Only local char numbers above 127 are stored. The parsed
 * table is cached as a serialized file below the var path; an unreadable
 * cache file is ignored and the table is parsed again.
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param string $charset The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return int Returns '1' if already loaded, '2' if the charset conversion table was found and loaded (from cache or parsed).
 * @throws UnknownCharsetException if no charset table was found
 */
protected function initCharset($charset)
{
    // Only process if the charset is not yet loaded
    if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
        return 1;
    }
    // Conversion table filename
    $charsetConvTableFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/csconvtbl/' . $charset . '.tbl';
    if (!$charset || !GeneralUtility::validPathStr($charsetConvTableFile) || !@is_file($charsetConvTableFile)) {
        throw new UnknownCharsetException(sprintf('Unknown charset "%s"', $charset), 1508916031);
    }
    // Cache file for charsets:
    // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
    $cacheFile = Environment::getVarPath() . '/charset/charset_' . $charset . '.tbl';
    if (@is_file($cacheFile)) {
        // The cache only ever contains nested arrays of scalars, so no objects need to be instantiated
        $cachedTable = unserialize((string)file_get_contents($cacheFile), ['allowed_classes' => false]);
        if (is_array($cachedTable)) {
            $this->parsedCharsets[$charset] = $cachedTable;
            return 2;
        }
        // Unreadable cache: fall through and rebuild it from the source table
    }
    // Parse conversion table into lines
    $lines = GeneralUtility::trimExplode(LF, file_get_contents($charsetConvTableFile), true);
    $parsedTable = ['local' => [], 'utf8' => []];
    $whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
    $detectedType = '';
    foreach ($lines as $value) {
        // Comment lines or blanks are ignored
        if (!trim($value) || $value[0] === '#') {
            continue;
        }
        // Detect type if not done yet: (Done on first real line)
        // The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
        if (!$detectedType) {
            $detectedType = preg_match($whitespacedPattern, $value) ? 'whitespaced' : 'ms-token';
        }
        if ($detectedType === 'ms-token') {
            // Pad so that a line without separator does not produce undefined offsets
            list($hexbyte, $utf8) = array_pad(preg_split('/[=:]/', $value, 3), 2, '');
        } else {
            $regA = [];
            if (!preg_match($whitespacedPattern, $value, $regA)) {
                // Line does not follow the detected syntax
                continue;
            }
            $hexbyte = $regA[1];
            $utf8 = 'U+' . $regA[2];
        }
        $decval = hexdec(trim($hexbyte));
        if ($decval > 127) {
            // Strip the leading "U+" to get the hex UNICODE number
            $utf8Char = $this->UnumberToChar(hexdec((string)substr(trim($utf8), 2)));
            $parsedTable['local'][$decval] = $utf8Char;
            $parsedTable['utf8'][$utf8Char] = $decval;
        }
    }
    $this->parsedCharsets[$charset] = $parsedTable;
    GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($parsedTable));
    return 2;
}
685
/**
 * This function initializes the UTF-8 to ASCII transliteration table
 * ($this->toASCII['utf-8']) from UnicodeData.txt and the custom Translit.txt.
 *
 * Only characters of the Basic Multilingual Plane are processed. The result
 * is cached as a serialized file below the var path; an unreadable cache
 * file is ignored and the table is built again.
 *
 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
 *
 * @return int|bool Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 */
protected function initUnicodeData()
{
    // Only process if the tables are not yet loaded
    if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8'])) {
        return 1;
    }
    // Use cached version if possible
    $cacheFileASCII = Environment::getVarPath() . '/charset/csascii_utf-8.tbl';
    if (@is_file($cacheFileASCII)) {
        // The cache only ever contains an array of strings, so no objects need to be instantiated
        $cachedTable = unserialize((string)file_get_contents($cacheFileASCII), ['allowed_classes' => false]);
        if (is_array($cachedTable)) {
            $this->toASCII['utf-8'] = $cachedTable;
            return 2;
        }
        // Unreadable cache: fall through and rebuild it
    }
    // Process main Unicode data file
    $unicodeDataFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/UnicodeData.txt';
    if (!(GeneralUtility::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
        return false;
    }
    $fh = fopen($unicodeDataFile, 'rb');
    if (!$fh) {
        return false;
    }
    // Array of temp. decompositions
    $decomposition = [];
    // Array of chars that are marks (eg. composing accents)
    $mark = [];
    // Array of chars that are numbers (eg. digits)
    $number = [];
    // Array of chars to be omitted (eg. Russian hard sign)
    $omit = [];
    // fgets() returns FALSE at the end of the file, which ends the loop
    // without processing a bogus empty record.
    while (($line = fgets($fh, 4096)) !== false) {
        $line = rtrim($line);
        if ($line === '') {
            continue;
        }
        // Fields used: code point, name, general category, decomposition, numeric value
        list($char, $name, $cat, , , $decomp, , , $num) = array_pad(explode(';', $line), 9, '');
        $ord = hexdec($char);
        if ($ord > 65535) {
            // Only process the BMP
            break;
        }
        switch (isset($cat[0]) ? $cat[0] : '') {
            case 'M':
                // mark (accent, umlaut, ...)
                $mark['U+' . $char] = 1;
                break;
            case 'N':
                // numeric value
                if ($ord > 128 && $num !== '') {
                    $number['U+' . $char] = $num;
                }
                break;
        }
        // Accented Latin letters without "official" decomposition
        $match = [];
        if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
            $c = ord($match[2]);
            if ($match[1] === 'SMALL') {
                // Lowercase ASCII letters are 32 above their uppercase counterpart
                $c += 32;
            }
            $decomposition['U+' . $char] = [dechex($c)];
            continue;
        }
        $match = [];
        if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
            switch ($match[1]) {
                case '<circle>':
                    // add parenthesis as circle replacement, eg (1)
                    $match[2] = '0028 ' . $match[2] . ' 0029';
                    break;
                case '<square>':
                    // add square brackets as square replacement, eg [1]
                    $match[2] = '005B ' . $match[2] . ' 005D';
                    break;
                case '<compat>':
                    // ignore multi char decompositions that start with a space
                    if (preg_match('/^0020 /', $match[2])) {
                        continue 2;
                    }
                    break;
                case '<initial>':
                case '<medial>':
                case '<final>':
                case '<isolated>':
                case '<vertical>':
                    // These presentation forms are not decomposed at all
                    continue 2;
            }
            $decomposition['U+' . $char] = explode(' ', $match[2]);
        }
    }
    fclose($fh);
    // Process custom decompositions
    $customTranslitFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/Translit.txt';
    if (GeneralUtility::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
        $fh = fopen($customTranslitFile, 'rb');
        if ($fh) {
            while (($line = fgets($fh, 4096)) !== false) {
                if ($line[0] !== '#' && trim($line) !== '') {
                    // Pad so that a line without ";" is treated as "no transliteration"
                    list($char, $translit) = array_pad(GeneralUtility::trimExplode(';', $line), 2, '');
                    if (!$translit) {
                        $omit['U+' . $char] = 1;
                    }
                    $decomposition['U+' . $char] = explode(' ', $translit);
                }
            }
            fclose($fh);
        }
    }
    // Decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
    foreach ($decomposition as $from => $to) {
        $code_decomp = [];
        while ($code_value = array_shift($to)) {
            // Do recursive decomposition
            if (isset($decomposition['U+' . $code_value])) {
                foreach (array_reverse($decomposition['U+' . $code_value]) as $cv) {
                    array_unshift($to, $cv);
                }
            } elseif (!isset($mark['U+' . $code_value])) {
                // Keep everything that is not a mark
                $code_decomp[] = $code_value;
            }
        }
        if (!empty($code_decomp) || isset($omit[$from])) {
            $decomposition[$from] = $code_decomp;
        } else {
            unset($decomposition[$from]);
        }
    }
    // Create ascii only mapping
    $this->toASCII['utf-8'] = [];
    foreach ($decomposition as $from => $to) {
        $code_decomp = [];
        while ($code_value = array_shift($to)) {
            $ord = hexdec($code_value);
            if ($ord > 127) {
                // Skip decompositions containing non-ASCII chars
                continue 2;
            }
            $code_decomp[] = chr($ord);
        }
        $this->toASCII['utf-8'][$this->UnumberToChar(hexdec($from))] = implode('', $code_decomp);
    }
    // Add numeric decompositions
    foreach ($number as $from => $to) {
        $utf8_char = $this->UnumberToChar(hexdec($from));
        if (!isset($this->toASCII['utf-8'][$utf8_char])) {
            $this->toASCII['utf-8'][$utf8_char] = $to;
        }
    }
    GeneralUtility::writeFileToTypo3tempDir($cacheFileASCII, serialize($this->toASCII['utf-8']));
    return 3;
}
845
/**
 * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
 * This function is automatically called by the ASCII transliteration functions.
 *
 * The table maps each character of the charset (as a byte string in that
 * charset) to its ASCII transliteration, derived from the UTF-8 table built
 * by initUnicodeData(). It is cached as a serialized file below the var
 * path; an unreadable cache file is ignored and the table is built again.
 *
 * @param string $charset Charset for which to initialize conversion.
 * @return int|bool Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 */
protected function initToASCII($charset)
{
    // Only process if the table is not yet loaded
    if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) {
        return 1;
    }
    // Use cached version if possible
    $cacheFile = Environment::getVarPath() . '/charset/csascii_' . $charset . '.tbl';
    if (@is_file($cacheFile)) {
        // The cache only ever contains an array of strings, so no objects need to be instantiated
        $cachedTable = unserialize((string)file_get_contents($cacheFile), ['allowed_classes' => false]);
        if (is_array($cachedTable)) {
            $this->toASCII[$charset] = $cachedTable;
            return 2;
        }
        // Unreadable cache: fall through and rebuild it
    }
    // Init UTF-8 conversion for this charset
    if (!$this->initCharset($charset)) {
        return false;
    }
    // UTF-8/ASCII transliteration is used as the base conversion table
    if (!$this->initUnicodeData()) {
        return false;
    }
    // Start with an empty table so the charset counts as loaded even if
    // none of its characters has a transliteration.
    $asciiMap = [];
    foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
        if (isset($this->toASCII['utf-8'][$utf8])) {
            // Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
            $asciiMap[$this->utf8_decode($utf8, $charset)] = $this->toASCII['utf-8'][$utf8];
        }
    }
    $this->toASCII[$charset] = $asciiMap;
    GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($asciiMap));
    return 3;
}
885
886 /********************************************
887 *
888 * String operation functions
889 *
890 ********************************************/
891
/**
 * Shortens a string to a number of characters and adds a crop signifier.
 *
 * A positive $len keeps the first $len characters and appends $crop; a
 * negative $len keeps the last abs($len) characters and prepends $crop.
 * The string is returned unchanged if $len is 0 or the string is not longer
 * than abs($len) characters.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param int $len Length (in characters)
 * @param string $crop Crop signifier
 * @return string The shortened string
 * @see substr(), mb_strimwidth()
 */
public function crop($charset, $string, $len, $crop = '')
{
    if ((int)$len === 0) {
        return $string;
    }
    $characterCount = mb_strlen($string, $charset);
    if ($characterCount <= abs($len)) {
        // Nothing to cut off
        return $string;
    }
    return $len > 0
        ? mb_substr($string, 0, $len, $charset) . $crop
        : $crop . mb_substr($string, $len, $characterCount, $charset);
}
915
/**
 * Charset aware equivalent of lcfirst/ucfirst.
 * The first character is lowercased if $case is 'toLower' and uppercased for
 * any other value; the rest of the string is left as it is.
 *
 * @param string $charset
 * @param string $string
 * @param string $case can be 'toLower' or 'toUpper'
 * @return string
 */
public function convCaseFirst($charset, $string, $case)
{
    $head = mb_substr($string, 0, 1, $charset);
    $tail = mb_substr($string, 1, null, $charset);
    if ($case === 'toLower') {
        return mb_strtolower($head, $charset) . $tail;
    }
    return mb_strtoupper($head, $charset) . $tail;
}
933
/**
 * Converts special chars (like æøåÆØÅ, umlauts etc) to ascii equivalents (usually double-bytes, like æ => ae etc.)
 * The work is delegated to the mapping function matching the charset family:
 * UTF-8, EUC-like (see $eucBasedSets) or single-byte for everything else.
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert
 * @return string The converted string
 */
public function specCharsToASCII($charset, $string)
{
    if ($charset === 'utf-8') {
        return $this->utf8_char_mapping($string);
    }
    if (isset($this->eucBasedSets[$charset])) {
        return $this->euc_char_mapping($string, $charset);
    }
    // Treat everything else as single-byte encoding
    return $this->sb_char_mapping($string, $charset);
}
953
954 /********************************************
955 *
956 * Internal string operation functions
957 *
958 ********************************************/
/**
 * Replaces every byte of a string in a single byte charset by its entry in
 * the to-ASCII table of that charset. Bytes without an entry are kept.
 *
 * @param string $str The string
 * @param string $charset The charset
 * @return string The converted string
 */
public function sb_char_mapping($str, $charset)
{
    if (!$this->initToASCII($charset)) {
        // No table available: do nothing
        return $str;
    }
    $map = &$this->toASCII[$charset];
    $converted = '';
    $length = strlen($str);
    for ($pos = 0; $pos < $length; $pos++) {
        $byte = $str[$pos];
        $converted .= isset($map[$byte]) ? $map[$byte] : $byte;
    }
    return $converted;
}
984
985 /********************************************
986 *
987 * Internal UTF-8 string operation functions
988 *
989 ********************************************/
990
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * For a position >= 0 the byte offset of the character with that index is
 * returned. For a negative position the byte offset of the abs($pos)-th
 * character counted from the end is returned. FALSE is returned if the
 * string does not contain enough characters.
 *
 * @param string $str UTF-8 string
 * @param int $pos Character position (negative values start from the end)
 * @return int|bool Byte position, FALSE if the position lies outside the string
 */
public function utf8_char2byte_pos($str, $pos)
{
    $len = strlen($str);
    // Number of characters found
    $n = 0;
    // Number of characters wanted
    $p = abs($pos);
    if ($pos >= 0) {
        $i = 0;
        $d = 1;
    } else {
        $i = $len - 1;
        $d = -1;
    }
    // Explicit bounds are used instead of isset($str[$i]), because negative
    // string offsets are valid and would wrap around to the end of the string.
    for (; $i >= 0 && $i < $len && $n < $p; $i += $d) {
        $c = ord($str[$i]);
        // Count single-byte chars (0xxxxxxx) and multi-byte starting bytes
        // (11xxxxxx); continuation bytes (10xxxxxx) belong to a counted char.
        if (!($c & 128) || ($c & 192) === 192) {
            $n++;
        }
    }
    if ($pos >= 0) {
        if ($i >= $len) {
            // Offset beyond string length
            return false;
        }
        // Skip trailing multi-byte data bytes, without running past the end
        while ($i < $len && (ord($str[$i]) & 192) === 128) {
            $i++;
        }
        return $i;
    }
    if ($n < $p) {
        // Fewer characters in the string than requested
        return false;
    }
    // The loop stepped one byte beyond the start of the character found
    return $i + 1;
}
1037
/**
 * Replaces every character of a UTF-8 string by its entry in the UTF-8
 * to-ASCII table. Characters without an entry are kept; continuation bytes
 * that do not follow a lead byte are dropped.
 *
 * @param string $str UTF-8 string
 * @return string The converted string
 */
public function utf8_char_mapping($str)
{
    if (!$this->initUnicodeData()) {
        // No table available: do nothing
        return $str;
    }
    $map = &$this->toASCII['utf-8'];
    $converted = '';
    $length = strlen($str);
    $pos = 0;
    while ($pos < $length) {
        $byte = ord($str[$pos]);
        if (!($byte & 128)) {
            // single-byte (0xxxxxxx)
            $char = $str[$pos];
            $pos++;
        } elseif (($byte & 192) === 192) {
            // multi-byte starting byte (11xxxxxx): the number of leading
            // 1-bits is the number of bytes of the character
            $byteCount = 0;
            while ($byte & 128) {
                $byteCount++;
                $byte = $byte << 1;
            }
            $char = substr($str, $pos, $byteCount);
            $pos += $byteCount;
        } else {
            // continuation byte without lead byte
            $char = '';
            $pos++;
        }
        $converted .= isset($map[$char]) ? $map[$char] : $char;
    }
    return $converted;
}
1076
1077 /********************************************
1078 *
1079 * Internal EUC string operation functions
1080 *
1081 * Extended Unix Code:
1082 * ASCII compatible 7bit single bytes chars
1083 * 8bit two byte chars
1084 *
1085 * Shift-JIS is treated as a special case.
1086 *
1087 ********************************************/
1088
/**
 * Replaces every character of a string in a charset of the EUC family by its
 * entry in the to-ASCII table of that charset. Characters without an entry
 * are kept.
 *
 * Bytes from 128 upwards start a double-byte character. For Shift-JIS this
 * only applies to bytes from 128 to 159 and from 224 upwards.
 *
 * @param string $str EUC multibyte character string
 * @param string $charset The charset
 * @return string The converted string
 */
public function euc_char_mapping($str, $charset)
{
    if (!$this->initToASCII($charset)) {
        // No table available: do nothing
        return $str;
    }
    $map = &$this->toASCII[$charset];
    $isShiftJis = $charset === 'shift_jis';
    $converted = '';
    $length = strlen($str);
    for ($pos = 0; $pos < $length; $pos++) {
        $byte = ord($str[$pos]);
        $isDoubleByte = $isShiftJis
            ? ($byte >= 128 && $byte < 160) || $byte >= 224
            : $byte >= 128;
        if ($isDoubleByte) {
            // Take this byte and the next one, and step over the second byte
            $char = substr($str, $pos, 2);
            $pos++;
        } else {
            $char = $str[$pos];
        }
        $converted .= isset($map[$char]) ? $map[$char] : $char;
    }
    return $converted;
}
1128 }