6f2c6cf9edf4f8b306bde9505e5506397145f09b
[Packages/TYPO3.CMS.git] / typo3 / sysext / core / Classes / Charset / CharsetConverter.php
1 <?php
2 namespace TYPO3\CMS\Core\Charset;
3
4 /*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use TYPO3\CMS\Core\SingletonInterface;
18 use TYPO3\CMS\Core\Utility\ExtensionManagementUtility;
19 use TYPO3\CMS\Core\Utility\GeneralUtility;
20
21 /**
22 * Notes on UTF-8
23 *
24 * Functions working on UTF-8 strings:
25 *
26 * - strchr/strstr
27 * - strrchr
28 * - substr_count
29 * - implode/explode/join
30 *
31 * Functions nearly working on UTF-8 strings:
32 *
33 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
34 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
35 * - preg_*: Support compiled into PHP by default nowadays, but could be unavailable, need to use modifier
36 *
37 * Functions NOT working on UTF-8 strings:
38 *
39 * - str*cmp
40 * - stristr
41 * - stripos
42 * - substr
43 * - strrev
44 * - split/spliti
45 * - ...
46 */
47
48 /**
49 * Class for conversion between charsets
50 */
51 class CharsetConverter implements SingletonInterface
52 {
53 /**
54 * ASCII Value for chars with no equivalent.
55 *
56 * @var int
57 */
58 public $noCharByteVal = 63;
59
60 /**
61 * This is the array where parsed conversion tables are stored (cached)
62 *
63 * @var array
64 */
65 public $parsedCharsets = [];
66
67 /**
68 * An array where case folding data will be stored (cached)
69 *
70 * @var array
71 */
72 public $caseFolding = [];
73
74 /**
75 * An array where charset-to-ASCII mappings are stored (cached)
76 *
77 * @var array
78 */
79 public $toASCII = [];
80
81 /**
82 * This tells the converter which charsets have two bytes per char:
83 *
84 * @var array
85 */
86 public $twoByteSets = [
87 'ucs-2' => 1
88 ];
89
90 /**
91 * This tells the converter which charsets use a scheme like the Extended Unix Code:
92 *
93 * @var array
94 */
95 public $eucBasedSets = [
96 'gb2312' => 1, // Chinese, simplified.
97 'big5' => 1, // Chinese, traditional.
98 'euc-kr' => 1, // Korean
99 'shift_jis' => 1
100 ];
101
102 /**
103 * @link http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
104 * @link http://czyborra.com/charsets/iso8859.html
105 *
106 * @var array
107 */
108 public $synonyms = [
109 'us' => 'ascii',
110 'us-ascii' => 'ascii',
111 'cp819' => 'iso-8859-1',
112 'ibm819' => 'iso-8859-1',
113 'iso-ir-100' => 'iso-8859-1',
114 'iso-ir-101' => 'iso-8859-2',
115 'iso-ir-109' => 'iso-8859-3',
116 'iso-ir-110' => 'iso-8859-4',
117 'iso-ir-144' => 'iso-8859-5',
118 'iso-ir-127' => 'iso-8859-6',
119 'iso-ir-126' => 'iso-8859-7',
120 'iso-ir-138' => 'iso-8859-8',
121 'iso-ir-148' => 'iso-8859-9',
122 'iso-ir-157' => 'iso-8859-10',
123 'iso-ir-179' => 'iso-8859-13',
124 'iso-ir-199' => 'iso-8859-14',
125 'iso-ir-203' => 'iso-8859-15',
126 'csisolatin1' => 'iso-8859-1',
127 'csisolatin2' => 'iso-8859-2',
128 'csisolatin3' => 'iso-8859-3',
129 'csisolatin5' => 'iso-8859-9',
130 'csisolatin8' => 'iso-8859-14',
131 'csisolatin9' => 'iso-8859-15',
132 'csisolatingreek' => 'iso-8859-7',
133 'iso-celtic' => 'iso-8859-14',
134 'latin1' => 'iso-8859-1',
135 'latin2' => 'iso-8859-2',
136 'latin3' => 'iso-8859-3',
137 'latin5' => 'iso-8859-9',
138 'latin6' => 'iso-8859-10',
139 'latin8' => 'iso-8859-14',
140 'latin9' => 'iso-8859-15',
141 'l1' => 'iso-8859-1',
142 'l2' => 'iso-8859-2',
143 'l3' => 'iso-8859-3',
144 'l5' => 'iso-8859-9',
145 'l6' => 'iso-8859-10',
146 'l8' => 'iso-8859-14',
147 'l9' => 'iso-8859-15',
148 'cyrillic' => 'iso-8859-5',
149 'arabic' => 'iso-8859-6',
150 'tis-620' => 'iso-8859-11',
151 'win874' => 'windows-874',
152 'win1250' => 'windows-1250',
153 'win1251' => 'windows-1251',
154 'win1252' => 'windows-1252',
155 'win1253' => 'windows-1253',
156 'win1254' => 'windows-1254',
157 'win1255' => 'windows-1255',
158 'win1256' => 'windows-1256',
159 'win1257' => 'windows-1257',
160 'win1258' => 'windows-1258',
161 'cp1250' => 'windows-1250',
162 'cp1251' => 'windows-1251',
163 'cp1252' => 'windows-1252',
164 'ms-ee' => 'windows-1250',
165 'ms-ansi' => 'windows-1252',
166 'ms-greek' => 'windows-1253',
167 'ms-turk' => 'windows-1254',
168 'winbaltrim' => 'windows-1257',
169 'koi-8ru' => 'koi-8r',
170 'koi8r' => 'koi-8r',
171 'cp878' => 'koi-8r',
172 'mac' => 'macroman',
173 'macintosh' => 'macroman',
174 'euc-cn' => 'gb2312',
175 'x-euc-cn' => 'gb2312',
176 'euccn' => 'gb2312',
177 'cp936' => 'gb2312',
178 'big-5' => 'big5',
179 'cp950' => 'big5',
180 'eucjp' => 'euc-jp',
181 'sjis' => 'shift_jis',
182 'shift-jis' => 'shift_jis',
183 'cp932' => 'shift_jis',
184 'cp949' => 'euc-kr',
185 'utf7' => 'utf-7',
186 'utf8' => 'utf-8',
187 'utf16' => 'utf-16',
188 'utf32' => 'utf-32',
189 'ucs2' => 'ucs-2',
190 'ucs4' => 'ucs-4'
191 ];
192
/**
 * Normalizes a charset name: the name is lowercased, surrounding whitespace
 * is stripped and known aliases (see $synonyms) are mapped to their canonical name.
 *
 * @param string $charset Input charset
 * @return string Normalized charset
 */
public function parse_charset($charset)
{
    $normalized = trim(strtolower($charset));
    return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
}
207
208 /********************************************
209 *
210 * Charset Conversion functions
211 *
212 ********************************************/
/**
 * Convert from one charset to another charset.
 *
 * mbstring is tried first (unless entity fallback is requested for a non-UTF-8 target);
 * if mbstring does not know one of the charsets, the conversion is done via UTF-8
 * using the conversion tables loaded by initCharset().
 *
 * @param string $inputString Input string
 * @param string $fromCharset From charset (the current charset of the string)
 * @param string $toCharset To charset (the output charset wanted)
 * @param bool $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Converted string
 * @see convArray()
 */
public function conv($inputString, $fromCharset, $toCharset, $useEntityForNoChar = false)
{
    if ($fromCharset === $toCharset) {
        return $inputString;
    }
    // PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
    if ($toCharset === 'utf-8' || !$useEntityForNoChar) {
        try {
            // Returns FALSE for unsupported charsets on PHP < 8.0
            $convertedString = mb_convert_encoding($inputString, $toCharset, $fromCharset);
        } catch (\ValueError $e) {
            // PHP >= 8.0 throws for unsupported charsets instead of returning FALSE;
            // treat it the same way so the table based conversion below is used
            $convertedString = false;
        }
        if ($convertedString !== false) {
            return $convertedString;
        }
    }
    if ($fromCharset !== 'utf-8') {
        $inputString = $this->utf8_encode($inputString, $fromCharset);
    }
    if ($toCharset !== 'utf-8') {
        $inputString = $this->utf8_decode($inputString, $toCharset, $useEntityForNoChar);
    }
    return $inputString;
}
244
/**
 * Converts every string element of an array from one charset to another.
 * Nested arrays are processed recursively; values of any other type are left untouched.
 * NOTICE: Array is passed by reference and modified in place!
 *
 * @param array $array Input array, possibly multidimensional
 * @param string $fromCharset From charset (the current charset of the string)
 * @param string $toCharset To charset (the output charset wanted)
 * @param bool $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @see conv()
 */
public function convArray(&$array, $fromCharset, $toCharset, $useEntityForNoChar = false)
{
    foreach ($array as $key => $value) {
        if (is_array($value)) {
            // Recurse on the element itself so the changes end up in $array
            $this->convArray($array[$key], $fromCharset, $toCharset, $useEntityForNoChar);
            continue;
        }
        if (is_string($value)) {
            $array[$key] = $this->conv($value, $fromCharset, $toCharset, $useEntityForNoChar);
        }
    }
}
265
/**
 * Converts $str from $charset to UTF-8
 *
 * Bytes up to 127 are copied unchanged (except for two-byte charsets); everything else is
 * looked up in the conversion table of the charset. Characters without a table entry are
 * replaced by the character with the byte value $noCharByteVal.
 *
 * @param string $str String in local charset to convert to UTF-8
 * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return string Output string, converted to UTF-8. Empty string if the conversion table could not be loaded.
 */
public function utf8_encode($str, $charset)
{
    if ($charset === 'utf-8') {
        return $str;
    }
    // Parse conv. table if not already done; without a table nothing can be converted
    if (!$this->initCharset($charset)) {
        return '';
    }
    $str = (string)$str;
    $localToUtf8 = isset($this->parsedCharsets[$charset]['local']) ? $this->parsedCharsets[$charset]['local'] : [];
    $isTwoByteSet = isset($this->twoByteSets[$charset]);
    $isEucBasedSet = isset($this->eucBasedSets[$charset]);
    $noChar = chr($this->noCharByteVal);
    $strLen = strlen($str);
    $outStr = '';
    // Traverse each char in string
    for ($a = 0; $a < $strLen; $a++) {
        $chr = $str[$a];
        $ord = ord($chr);
        if ($isTwoByteSet) {
            // The charset has two bytes per char, big endian is assumed.
            // A truncated trailing char is read with a zero low byte instead of reading past the end.
            $ord2 = $a + 1 < $strLen ? ord($str[$a + 1]) : 0;
            $ord = $ord << 8 | $ord2;
            // Use the table entry for the local char number, otherwise the "no char" replacement
            $outStr .= isset($localToUtf8[$ord]) ? $localToUtf8[$ord] : $noChar;
            // Skip the second byte
            $a++;
        } elseif ($ord > 127) {
            // EUC uses two bytes above 127; we get both, advance the pointer and make $ord a 16bit int.
            // Shift-JIS: chars between 160 and 223 are single byte
            if ($isEucBasedSet && ($charset !== 'shift_jis' || $ord < 160 || $ord > 223)) {
                $a++;
                $ord2 = $a < $strLen ? ord($str[$a]) : 0;
                $ord = $ord * 256 + $ord2;
            }
            // Use the table entry for the local char number, otherwise the "no char" replacement
            $outStr .= isset($localToUtf8[$ord]) ? $localToUtf8[$ord] : $noChar;
        } else {
            $outStr .= $chr;
        }
    }
    return $outStr;
}
325
/**
 * Converts $str from UTF-8 to $charset
 *
 * Bytes up to 127 are copied unchanged. Multibyte sequences are looked up in the conversion
 * table of the charset; sequences without a table entry become a numeric entity
 * (if $useEntityForNoChar is set) or the character with the byte value $noCharByteVal.
 *
 * @param string $str String in UTF-8 to convert to local charset
 * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param bool $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Output string, converted to local charset. Empty string if the conversion table could not be loaded.
 */
public function utf8_decode($str, $charset, $useEntityForNoChar = false)
{
    if ($charset === 'utf-8') {
        return $str;
    }
    // Parse conv. table if not already done; without a table nothing can be converted
    if (!$this->initCharset($charset)) {
        return '';
    }
    // A missing (null) value is treated like an empty string
    $str = (string)$str;
    $noChar = chr($this->noCharByteVal);
    $strLen = strlen($str);
    $outStr = '';
    // Traverse each char in UTF-8 string
    for ($a = 0; $a < $strLen; $a++) {
        $chr = substr($str, $a, 1);
        $ord = ord($chr);
        if ($ord <= 127) {
            // Single byte char, identical in the local charset
            $outStr .= $chr;
            continue;
        }
        // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
        if (!($ord & 64)) {
            $outStr .= $noChar;
            continue;
        }
        // Add first byte
        $buf = $chr;
        // For each byte in multibyte string
        for ($b = 0; $b < 8; $b++) {
            // Shift it left and check the 8th bit - if that is set, then there are still bytes in sequence.
            $ord = $ord << 1;
            if (!($ord & 128)) {
                break;
            }
            $a++;
            // ... and add the next char.
            $buf .= substr($str, $a, 1);
        }
        if (isset($this->parsedCharsets[$charset]['utf8'][$buf])) {
            // The local number
            $mByte = $this->parsedCharsets[$charset]['utf8'][$buf];
            // If the local number is greater than 255 we will need to split the byte (16bit word assumed) in two chars.
            if ($mByte > 255) {
                $outStr .= chr($mByte >> 8 & 255) . chr($mByte & 255);
            } else {
                $outStr .= chr($mByte);
            }
        } elseif ($useEntityForNoChar) {
            // Create num entity:
            $outStr .= '&#' . $this->utf8CharToUnumber($buf, true) . ';';
        } else {
            $outStr .= $noChar;
        }
    }
    return $outStr;
}
394
/**
 * Replaces every multibyte UTF-8 character (byte values above 127) by a hexadecimal
 * numeric entity. Bytes that cannot start a sequence are replaced by the character
 * with the byte value $noCharByteVal.
 *
 * @param string $str Input string
 * @return string Output string
 */
public function utf8_to_entities($str)
{
    $length = strlen($str);
    $result = '';
    $pos = 0;
    // Walk through the string byte by byte
    while ($pos < $length) {
        $byte = substr($str, $pos, 1);
        $code = ord($byte);
        if ($code <= 127) {
            // Plain single byte char
            $result .= $byte;
        } elseif (!($code & 64)) {
            // A first byte must have the 7th bit set; otherwise we are in the middle of a byte sequence.
            $result .= chr($this->noCharByteVal);
        } else {
            // Collect the first byte and all following bytes of the sequence
            $sequence = $byte;
            for ($shift = 0; $shift < 8; $shift++) {
                // Shift left and test the 8th bit - while it is set there are still bytes in the sequence.
                $code = $code << 1;
                if (!($code & 128)) {
                    break;
                }
                $pos++;
                $sequence .= substr($str, $pos, 1);
            }
            $result .= '&#' . $this->utf8CharToUnumber($sequence, true) . ';';
        }
        $pos++;
    }
    return $result;
}
438
/**
 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars.
 * All string-HTML entities (like &amp; or &pound;) will be converted as well.
 * Anything that looks like an entity but is neither a well-formed numeric entity nor a
 * known named entity is left unchanged.
 *
 * @param string $str Input string, UTF-8
 * @return string Output string
 */
public function entities_to_utf8($str)
{
    $entityToChar = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT));
    return preg_replace_callback(
        '/&([#[:alnum:]]*);/',
        function (array $match) use ($entityToChar) {
            $entity = $match[0];
            $name = $match[1];
            // Dec or hex entities
            if (substr($name, 0, 1) === '#') {
                $number = (string)substr($name, 1);
                if ($number !== '' && ($number[0] === 'x' || $number[0] === 'X')) {
                    $hexDigits = (string)substr($number, 1);
                    // Only well-formed hex numbers are converted
                    return ctype_xdigit($hexDigits) ? $this->UnumberToChar(hexdec($hexDigits)) : $entity;
                }
                // Only well-formed decimal numbers are converted
                return ctype_digit($number) ? $this->UnumberToChar((int)$number) : $entity;
            }
            // Other entities; no conversion if the name is unknown
            return isset($entityToChar[$entity]) ? $entityToChar[$entity] : $entity;
        },
        $str
    );
}
476
/**
 * Splits the input UTF-8 string into an array with one element per character.
 * All HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
 * Despite the method name the elements are the real UTF-8 characters, not integer numbers.
 *
 * @param string $str Input string, UTF-8
 * @return array Output array with one UTF-8 character per element
 */
public function utf8_to_numberarray($str)
{
    // Entities must be registered as well
    $decoded = $this->entities_to_utf8($str);

    $length = strlen($decoded);
    $chars = [];
    $pos = 0;
    // Walk through the string byte by byte
    while ($pos < $length) {
        $byte = substr($decoded, $pos, 1);
        $code = ord($byte);
        if ($code <= 127) {
            // Plain single byte char
            $chars[] = chr($code);
        } elseif (!($code & 64)) {
            // A first byte must have the 7th bit set; otherwise we are in the middle of a byte sequence.
            $chars[] = chr($this->noCharByteVal);
        } else {
            // Collect the first byte and all following bytes of the sequence
            $sequence = $byte;
            for ($shift = 0; $shift < 8; $shift++) {
                // Shift left and test the 8th bit - while it is set there are still bytes in the sequence.
                $code = $code << 1;
                if (!($code & 128)) {
                    break;
                }
                $pos++;
                $sequence .= substr($decoded, $pos, 1);
            }
            $chars[] = $sequence;
        }
        $pos++;
    }
    return $chars;
}
526
/**
 * Converts a UNICODE number to a UTF-8 multibyte character
 * Algorithm based on script found at From: http://czyborra.com/utf/
 *
 * The bits of the number are spread across the bytes of the sequence; the number of
 * high bits set in the lead byte announces the number of bytes in the sequence:
 *
 * bytes | bits | representation
 * 1 | 7 | 0vvvvvvv
 * 2 | 11 | 110vvvvv 10vvvvvv
 * 3 | 16 | 1110vvvv 10vvvvvv 10vvvvvv
 * 4 | 21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 * 5 | 26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 * 6 | 31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * Numbers that need more than 31 bits yield the character with the byte value $noCharByteVal.
 *
 * @param int $unicodeInteger UNICODE integer
 * @return string UTF-8 multibyte character string
 * @see utf8CharToUnumber()
 */
public function UnumberToChar($unicodeInteger)
{
    if ($unicodeInteger < 128) {
        return chr($unicodeInteger);
    }
    // Pick the lead byte marker and the number of continuation bytes by magnitude
    if ($unicodeInteger < 2048) {
        $leadMarker = 192;
        $continuationBytes = 1;
    } elseif ($unicodeInteger < 65536) {
        $leadMarker = 224;
        $continuationBytes = 2;
    } elseif ($unicodeInteger < 2097152) {
        $leadMarker = 240;
        $continuationBytes = 3;
    } elseif ($unicodeInteger < 67108864) {
        $leadMarker = 248;
        $continuationBytes = 4;
    } elseif ($unicodeInteger < 2147483648) {
        $leadMarker = 252;
        $continuationBytes = 5;
    } else {
        // Cannot express a 32-bit character in UTF-8
        return chr($this->noCharByteVal);
    }
    // The lead byte carries the topmost bits ...
    $char = chr($leadMarker | ($unicodeInteger >> (6 * $continuationBytes)));
    // ... and every continuation byte the next six bits, most significant group first
    for ($group = $continuationBytes - 1; $group >= 0; $group--) {
        $char .= chr(128 | (($unicodeInteger >> (6 * $group)) & 63));
    }
    return $char;
}
583
/**
 * Converts a UTF-8 Multibyte character to a UNICODE number
 *
 * Only the first character of $str is evaluated. An empty string yields 0.
 *
 * @param string $str UTF-8 multibyte character string
 * @param bool $hex If set, then a hex. number is returned.
 * @return int|string UNICODE integer, or the string "x" followed by the hexadecimal number if $hex is set
 * @see UnumberToChar()
 */
public function utf8CharToUnumber($str, $hex = false)
{
    $str = (string)$str;
    // First char; ord() yields 0 for an empty string, whereas accessing $str[0] would
    // raise an "uninitialized string offset" notice
    $firstByte = ord($str);
    $ord = $firstByte;
    // This verifies that it IS a multi byte string
    if (($ord & 192) === 192) {
        $binBuf = '';
        $b = 0;
        // For each byte in multibyte string...
        for (; $b < 8; $b++) {
            // Shift it left and check the 8th bit - if that is set, then there are still bytes in sequence.
            $ord = $ord << 1;
            if (!($ord & 128)) {
                break;
            }
            // Every following byte contributes its lower six bits
            $binBuf .= substr('00000000' . decbin(ord(substr($str, $b + 1, 1))), -6);
        }
        // The first byte contributes the bits that are left after the length marker
        $binBuf = substr('00000000' . decbin($firstByte), -(6 - $b)) . $binBuf;
        $int = bindec($binBuf);
    } else {
        $int = $ord;
    }
    return $hex ? 'x' . dechex($int) : $int;
}
619
620 /********************************************
621 *
622 * Init functions
623 *
624 ********************************************/
/**
 * This will initialize a charset for use if it's defined in the 'typo3/sysext/core/Resources/Private/Charsets/csconvtbl/' folder
 * This function is automatically called by the conversion functions
 *
 * The parsed table is stored in $this->parsedCharsets[$charset] with the keys "local"
 * (local char number => UTF-8 char) and "utf8" (UTF-8 char => local char number).
 * Only local char numbers above 127 are stored.
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param string $charset The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return int|bool Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and loaded (from cache or by parsing).
 * @access private
 */
public function initCharset($charset)
{
    // Only process if the charset is not yet loaded:
    if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
        return 1;
    }
    // Conversion table filename:
    $charsetConvTableFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/csconvtbl/' . $charset . '.tbl';
    // Give up if the conversion table is not found:
    if (!$charset || !GeneralUtility::validPathStr($charsetConvTableFile) || !@is_file($charsetConvTableFile)) {
        return false;
    }
    // Cache file for charsets:
    // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
    $cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/var/charset/charset_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $cachedTable = unserialize(file_get_contents($cacheFile));
        // An unreadable cache file is ignored; the table is parsed and cached again below
        if (is_array($cachedTable)) {
            $this->parsedCharsets[$charset] = $cachedTable;
            return 2;
        }
    }
    // Parse conversion table into lines:
    $lines = GeneralUtility::trimExplode(LF, file_get_contents($charsetConvTableFile), true);
    // Holds the conv. table while it is built:
    $table = ['local' => [], 'utf8' => []];
    $whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
    $detectedType = '';
    foreach ($lines as $value) {
        // Comment line or blanks are ignored.
        if (!trim($value) || $value[0] === '#') {
            continue;
        }
        // Detect type if not done yet: (Done on first real line)
        // The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
        if (!$detectedType) {
            $detectedType = preg_match($whitespacedPattern, $value) ? 'whitespaced' : 'ms-token';
        }
        if ($detectedType === 'ms-token') {
            $parts = preg_split('/[=:]/', $value, 3);
            $hexbyte = $parts[0];
            $utf8 = isset($parts[1]) ? $parts[1] : '';
        } else {
            $regA = [];
            // Lines that do not follow the "whitespaced" syntax carry no mapping
            if (!preg_match($whitespacedPattern, $value, $regA)) {
                continue;
            }
            $hexbyte = $regA[1];
            $utf8 = 'U+' . $regA[2];
        }
        $decval = hexdec(trim($hexbyte));
        if ($decval > 127) {
            // Strip the leading "U+" and convert the code point to its UTF-8 char
            $utf8decval = hexdec(substr(trim($utf8), 2));
            $utf8Char = $this->UnumberToChar($utf8decval);
            $table['local'][$decval] = $utf8Char;
            $table['utf8'][$utf8Char] = $decval;
        }
    }
    $this->parsedCharsets[$charset] = $table;
    if ($cacheFile) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($table));
    }
    return 2;
}
693
/**
 * This function initializes all UTF-8 character data tables: case folding
 * ($this->caseFolding['utf-8']) and to-ASCII transliteration ($this->toASCII['utf-8']).
 *
 * For the modes "case" and "ascii" an already loaded table or a readable cache file is used
 * if available. Otherwise UnicodeData.txt, SpecialCasing.txt and Translit.txt are parsed,
 * both tables are rebuilt and written to the cache files.
 *
 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
 *
 * @param string $mode Mode ("case", "ascii", ...)
 * @return int|bool Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
public function initUnicodeData($mode = null)
{
    // Cache files
    $cacheFileCase = GeneralUtility::getFileAbsFileName('typo3temp/var/charset/cscase_utf-8.tbl');
    $cacheFileASCII = GeneralUtility::getFileAbsFileName('typo3temp/var/charset/csascii_utf-8.tbl');
    // Only process if the tables are not yet loaded
    switch ($mode) {
        case 'case':
            if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8'])) {
                return 1;
            }
            // Use cached version if possible; an unreadable cache file leads to re-parsing
            if ($cacheFileCase && @is_file($cacheFileCase)) {
                $cachedTable = unserialize(file_get_contents($cacheFileCase));
                if (is_array($cachedTable)) {
                    $this->caseFolding['utf-8'] = $cachedTable;
                    return 2;
                }
            }
            break;
        case 'ascii':
            if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8'])) {
                return 1;
            }
            // Use cached version if possible; an unreadable cache file leads to re-parsing
            if ($cacheFileASCII && @is_file($cacheFileASCII)) {
                $cachedTable = unserialize(file_get_contents($cacheFileASCII));
                if (is_array($cachedTable)) {
                    $this->toASCII['utf-8'] = $cachedTable;
                    return 2;
                }
            }
            break;
    }
    // Process main Unicode data file
    $unicodeDataFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/UnicodeData.txt';
    if (!(GeneralUtility::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
        return false;
    }
    $fh = fopen($unicodeDataFile, 'rb');
    if (!$fh) {
        return false;
    }
    // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
    // Note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
    $this->caseFolding['utf-8'] = [];
    // a shorthand
    $utf8CaseFolding = &$this->caseFolding['utf-8'];
    $utf8CaseFolding['toUpper'] = [];
    $utf8CaseFolding['toLower'] = [];
    $utf8CaseFolding['toTitle'] = [];
    // Array of temp. decompositions
    $decomposition = [];
    // Array of chars that are marks (eg. composing accents)
    $mark = [];
    // Array of chars that are numbers (eg. digits)
    $number = [];
    // Array of chars to be omitted (eg. Russian hard sign)
    $omit = [];
    while (($line = fgets($fh, 4096)) !== false) {
        $line = rtrim($line);
        // Blank lines carry no data
        if ($line === '') {
            continue;
        }
        // Has a lot of info; missing trailing fields are treated as empty
        list($char, $name, $cat, , , $decomp, , , $num, , , , $upper, $lower, $title) = array_pad(explode(';', $line), 15, '');
        $ord = hexdec($char);
        if ($ord > 65535) {
            // Only process the BMP
            break;
        }
        $utf8_char = $this->UnumberToChar($ord);
        if ($upper) {
            $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
        }
        if ($lower) {
            $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
        }
        // Store "title" only when different from "upper" (only a few)
        if ($title && $title !== $upper) {
            $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
        }
        // The first letter of the category decides
        switch (isset($cat[0]) ? $cat[0] : '') {
            case 'M':
                // mark (accent, umlaut, ...)
                $mark['U+' . $char] = 1;
                break;
            case 'N':
                // numeric value
                if ($ord > 128 && $num !== '') {
                    $number['U+' . $char] = $num;
                }
                break;
        }
        // Accented Latin letters without "official" decomposition
        $match = [];
        if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
            $c = ord($match[2]);
            if ($match[1] === 'SMALL') {
                $c += 32;
            }
            $decomposition['U+' . $char] = [dechex($c)];
            continue;
        }
        $match = [];
        if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
            switch ($match[1]) {
                case '<circle>':
                    // add parenthesis as circle replacement, eg (1)
                    $match[2] = '0028 ' . $match[2] . ' 0029';
                    break;
                case '<square>':
                    // add square brackets as square replacement, eg [1]
                    $match[2] = '005B ' . $match[2] . ' 005D';
                    break;
                case '<compat>':
                    // ignore multi char decompositions that start with a space
                    if (preg_match('/^0020 /', $match[2])) {
                        continue 2;
                    }
                    break;
                case '<initial>':
                case '<medial>':
                case '<final>':
                case '<isolated>':
                case '<vertical>':
                    // these decomposition types are not stored
                    continue 2;
            }
            $decomposition['U+' . $char] = explode(' ', $match[2]);
        }
    }
    fclose($fh);
    // Process additional Unicode data for casing (allow folded characters to expand into a sequence)
    $specialCasingFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/SpecialCasing.txt';
    if (GeneralUtility::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
        $fh = fopen($specialCasingFile, 'rb');
        if ($fh) {
            // Converts a space separated list of hex code points into a UTF-8 string
            $hexSequenceToUtf8 = function ($hexSequence) {
                $utf8String = '';
                foreach (explode(' ', $hexSequence) as $hexCode) {
                    $utf8String .= $this->UnumberToChar(hexdec($hexCode));
                }
                return $utf8String;
            };
            while (($line = fgets($fh, 4096)) !== false) {
                // Comment lines and blanks are ignored
                if ($line[0] === '#' || trim($line) === '') {
                    continue;
                }
                list($char, $lower, $title, $upper, $cond) = array_pad(GeneralUtility::trimExplode(';', $line), 5, '');
                // Only unconditional mappings are used (the fifth field is empty or already the trailing comment)
                if ($cond !== '' && $cond[0] !== '#') {
                    continue;
                }
                $utf8_char = $this->UnumberToChar(hexdec($char));
                if ($char !== $lower) {
                    $utf8CaseFolding['toLower'][$utf8_char] = $hexSequenceToUtf8($lower);
                }
                if ($char !== $title && $title !== $upper) {
                    $utf8CaseFolding['toTitle'][$utf8_char] = $hexSequenceToUtf8($title);
                }
                if ($char !== $upper) {
                    $utf8CaseFolding['toUpper'][$utf8_char] = $hexSequenceToUtf8($upper);
                }
            }
            fclose($fh);
        }
    }
    // Process custom decompositions
    $customTranslitFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/Translit.txt';
    if (GeneralUtility::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
        $fh = fopen($customTranslitFile, 'rb');
        if ($fh) {
            while (($line = fgets($fh, 4096)) !== false) {
                // Comment lines and blanks are ignored
                if ($line[0] === '#' || trim($line) === '') {
                    continue;
                }
                list($char, $translit) = array_pad(GeneralUtility::trimExplode(';', $line), 2, '');
                if (!$translit) {
                    $omit['U+' . $char] = 1;
                }
                $decomposition['U+' . $char] = explode(' ', $translit);
            }
            fclose($fh);
        }
    }
    // Decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
    foreach ($decomposition as $from => $to) {
        $code_decomp = [];
        while ($code_value = array_shift($to)) {
            if (isset($decomposition['U+' . $code_value])) {
                // Do recursive decomposition
                foreach (array_reverse($decomposition['U+' . $code_value]) as $cv) {
                    array_unshift($to, $cv);
                }
            } elseif (!isset($mark['U+' . $code_value])) {
                // Keep everything that is not a mark
                $code_decomp[] = $code_value;
            }
        }
        if (!empty($code_decomp) || isset($omit[$from])) {
            $decomposition[$from] = $code_decomp;
        } else {
            unset($decomposition[$from]);
        }
    }
    // Create ascii only mapping
    $this->toASCII['utf-8'] = [];
    $ascii = &$this->toASCII['utf-8'];
    foreach ($decomposition as $from => $to) {
        $code_decomp = [];
        while ($code_value = array_shift($to)) {
            $ord = hexdec($code_value);
            if ($ord > 127) {
                // Skip decompositions containing non-ASCII chars
                continue 2;
            }
            $code_decomp[] = chr($ord);
        }
        $ascii[$this->UnumberToChar(hexdec($from))] = implode('', $code_decomp);
    }
    // Add numeric decompositions
    foreach ($number as $from => $to) {
        $utf8_char = $this->UnumberToChar(hexdec($from));
        if (!isset($ascii[$utf8_char])) {
            $ascii[$utf8_char] = $to;
        }
    }
    if ($cacheFileCase) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
    }
    if ($cacheFileASCII) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
    }
    return 3;
}
932
/**
 * This function initializes the folding table for a charset other than UTF-8.
 * This function is automatically called by the case folding functions.
 *
 * The table is derived from the UTF-8 case folding data: every character of the charset's
 * conversion table is folded in UTF-8 and converted back. Mappings whose result cannot be
 * expressed in the charset are left out. The ASCII letters are always added.
 *
 * @param string $charset Charset for which to initialize case folding.
 * @return int|bool Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
public function initCaseFolding($charset)
{
    // Only process if the case table is not yet loaded:
    if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset])) {
        return 1;
    }
    // Use cached version if possible; an unreadable cache file leads to a rebuild
    $cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/var/charset/cscase_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $cachedTable = unserialize(file_get_contents($cacheFile));
        if (is_array($cachedTable)) {
            $this->caseFolding[$charset] = $cachedTable;
            return 2;
        }
    }
    // init UTF-8 conversion for this charset
    if (!$this->initCharset($charset)) {
        return false;
    }
    // UTF-8 case folding is used as the base conversion table
    if (!$this->initUnicodeData('case')) {
        return false;
    }
    $nochar = chr($this->noCharByteVal);
    $folding = ['toUpper' => [], 'toLower' => [], 'toTitle' => []];
    foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
        // Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
        $c = $this->utf8_decode($utf8, $charset);
        foreach (['toUpper', 'toLower', 'toTitle'] as $case) {
            // Most characters have no mapping for a given case
            if (!isset($this->caseFolding['utf-8'][$case][$utf8])) {
                continue;
            }
            $cc = $this->utf8_decode($this->caseFolding['utf-8'][$case][$utf8], $charset);
            // Only keep mappings that can be expressed in the charset
            if ($cc !== '' && $cc !== $nochar) {
                $folding[$case][$c] = $cc;
            }
        }
    }
    // Add the ASCII case table
    for ($i = ord('a'), $end = ord('z'); $i <= $end; $i++) {
        $folding['toUpper'][chr($i)] = chr($i - 32);
        $folding['toLower'][chr($i - 32)] = chr($i);
    }
    $this->caseFolding[$charset] = $folding;
    if ($cacheFile) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($folding));
    }
    return 3;
}
994
/**
 * Initializes the to-ASCII conversion table for a charset other than UTF-8.
 * This function is automatically called by the ASCII transliteration functions.
 *
 * The table is derived from the UTF-8 to-ASCII table: every character of the
 * charset's conversion table that has a UTF-8 transliteration gets the same
 * transliteration, keyed by the character in the target charset.
 *
 * @param string $charset Charset for which to initialize conversion.
 * @return int|bool FALSE on error, otherwise 1 (table already loaded), 2 (loaded from cache file) or 3 (table built and cached).
 * @access private
 */
public function initToASCII($charset)
{
    // Only process if the to-ASCII table is not yet loaded:
    if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) {
        return 1;
    }
    // Use cached version if possible
    $cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/var/charset/csascii_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $cachedTable = unserialize(file_get_contents($cacheFile));
        if (is_array($cachedTable)) {
            $this->toASCII[$charset] = $cachedTable;
            return 2;
        }
        // Unreadable or corrupt cache file: fall through and rebuild the table
    }
    // Init UTF-8 conversion for this charset
    if (!$this->initCharset($charset)) {
        return false;
    }
    // UTF-8/ASCII transliteration is used as the base conversion table
    if (!$this->initUnicodeData('ascii')) {
        return false;
    }
    // Make sure the table is an array even if no character has a transliteration,
    // so that it is recognised as loaded and an array (not NULL) is cached.
    if (!isset($this->toASCII[$charset]) || !is_array($this->toASCII[$charset])) {
        $this->toASCII[$charset] = [];
    }
    foreach ($this->parsedCharsets[$charset]['local'] as $ci => $utf8) {
        // Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
        $c = $this->utf8_decode($utf8, $charset);
        if (isset($this->toASCII['utf-8'][$utf8])) {
            $this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
        }
    }
    if ($cacheFile) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->toASCII[$charset]));
    }
    return 3;
}
1035
1036 /********************************************
1037 *
1038 * String operation functions
1039 *
1040 ********************************************/
1041
/**
 * Shortens a string to a number of characters and adds a crop marker.
 *
 * A positive $len keeps the first $len characters and appends $crop; a negative
 * $len keeps the last abs($len) characters and prepends $crop. The string is
 * returned unchanged if $len is 0 or the string is not longer than abs($len).
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param int $len Length (in characters)
 * @param string $crop Crop signifier
 * @return string The shortened string
 * @see substr(), mb_strimwidth()
 */
public function crop($charset, $string, $len, $crop = '')
{
    // A length of zero means "do not crop"
    if ((int)$len === 0) {
        return $string;
    }
    $charCount = mb_strlen($string, $charset);
    // Nothing to cut off
    if ($charCount <= abs($len)) {
        return $string;
    }
    return $len > 0
        ? mb_substr($string, 0, $len, $charset) . $crop
        : $crop . mb_substr($string, $len, $charCount, $charset);
}
1065
/**
 * Charset-aware equivalent of lcfirst()/ucfirst().
 *
 * Only the first character of the string is converted; the rest is returned as is.
 *
 * @param string $charset The character set of the string
 * @param string $string The string to convert
 * @param string $case 'toLower' lowercases the first character, any other value (normally 'toUpper') uppercases it
 * @return string The string with its first character converted
 */
public function convCaseFirst($charset, $string, $case)
{
    $head = mb_substr($string, 0, 1, $charset);
    $tail = mb_substr($string, 1, null, $charset);
    if ($case === 'toLower') {
        $head = mb_strtolower($head, $charset);
    } else {
        $head = mb_strtoupper($head, $charset);
    }
    return $head . $tail;
}
1083
/**
 * Converts special chars (like æøåÆØÅ, umlauts etc) to ascii equivalents (usually double-bytes, like æ => ae etc.)
 *
 * Dispatches to the mapping function matching the charset: UTF-8, a charset
 * listed in $this->eucBasedSets, or (for everything else) single-byte handling.
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert
 * @return string The converted string
 */
public function specCharsToASCII($charset, $string)
{
    if ($charset === 'utf-8') {
        return $this->utf8_char_mapping($string);
    }
    if (isset($this->eucBasedSets[$charset])) {
        return $this->euc_char_mapping($string, $charset);
    }
    // Treat everything else as single-byte encoding
    return $this->sb_char_mapping($string, $charset);
}
1103
1104 /********************************************
1105 *
1106 * Internal string operation functions
1107 *
1108 ********************************************/
/**
 * Maps all characters of a string in a single byte charset to their
 * to-ASCII equivalents. Bytes without a mapping are copied unchanged.
 *
 * @param string $str The string
 * @param string $charset The charset
 * @return string The converted string
 */
public function sb_char_mapping($str, $charset)
{
    if (!$this->initToASCII($charset)) {
        // No conversion table available: do nothing
        return $str;
    }
    $table = &$this->toASCII[$charset];
    $result = '';
    $pos = 0;
    while (isset($str[$pos])) {
        $byte = $str[$pos];
        $result .= isset($table[$byte]) ? $table[$byte] : $byte;
        $pos++;
    }
    return $result;
}
1134
1135 /********************************************
1136 *
1137 * Internal UTF-8 string operation functions
1138 *
1139 ********************************************/
1140
/**
 * Translates a character position into an 'absolute' byte position.
 * Unit tested by Kasper.
 *
 * For $pos >= 0 the string is scanned forwards and the byte offset of the
 * character at that position is returned. For $pos < 0 it is scanned backwards
 * and the byte offset of the abs($pos)-th character counted from the end is
 * returned. Only single-byte characters (0xxxxxxx) and multi-byte start bytes
 * (11xxxxxx) are counted as characters.
 *
 * All index checks are explicit bounds checks: isset() on a string accepts
 * negative offsets since PHP 7.1, which would let a backwards scan wrap around
 * to the end of the string.
 *
 * @param string $str UTF-8 string
 * @param int $pos Character position (negative values start from the end)
 * @return int|bool Byte position, or FALSE if the position lies outside the string
 */
public function utf8_char2byte_pos($str, $pos)
{
    $str = (string)$str;
    $length = strlen($str);
    // Number of characters found
    $n = 0;
    // Number of characters wanted
    $p = abs($pos);
    if ($pos >= 0) {
        $i = 0;
        $d = 1;
    } else {
        $i = $length - 1;
        $d = -1;
    }
    for (; $i >= 0 && $i < $length && $n < $p; $i += $d) {
        $c = ord($str[$i]);
        // Count single-byte chars (0xxxxxxx) and multi-byte starting bytes (11xxxxxx);
        // continuation bytes (10xxxxxx) belong to an already counted character.
        if (!($c & 128) || ($c & 192) === 192) {
            $n++;
        }
    }
    if ($pos >= 0) {
        if ($i >= $length) {
            // Offset beyond string length
            return false;
        }
        // Skip trailing multi-byte data bytes, without reading past the end of the string
        while ($i < $length && (ord($str[$i]) & 192) === 128) {
            $i++;
        }
        return $i;
    }
    if ($n < $p) {
        // Fewer than abs($pos) characters in the string
        return false;
    }
    // The loop stepped one byte before the start byte of the wanted character
    return $i + 1;
}
1187
/**
 * Maps all characters of an UTF-8 string to their to-ASCII equivalents.
 *
 * Characters without an entry in the UTF-8 to-ASCII table are copied unchanged.
 * Bytes that are neither single-byte characters nor multi-byte start bytes
 * (i.e. stray continuation bytes) contribute nothing to the output.
 *
 * @param string $str UTF-8 string
 * @return string The converted string
 */
public function utf8_char_mapping($str)
{
    if (!$this->initUnicodeData('ascii')) {
        // No conversion table available: do nothing
        return $str;
    }
    $asciiMap = &$this->toASCII['utf-8'];
    $result = '';
    $offset = 0;
    while (isset($str[$offset])) {
        $byte = ord($str[$offset]);
        $char = '';
        if (($byte & 128) === 0) {
            // Single-byte (0xxxxxxx)
            $char = $str[$offset];
        } elseif (($byte & 192) === 192) {
            // Multi-byte starting byte (11xxxxxx): the number of leading 1-bits
            // is the number of bytes of the character
            $byteCount = 0;
            while ($byte & 128) {
                $byteCount++;
                $byte = $byte << 1;
            }
            $char = substr($str, $offset, $byteCount);
            // Jump to the last byte of this character
            $offset += $byteCount - 1;
        }
        $result .= isset($asciiMap[$char]) ? $asciiMap[$char] : $char;
        $offset++;
    }
    return $result;
}
1226
1227 /********************************************
1228 *
1229 * Internal EUC string operation functions
1230 *
1231 * Extended Unix Code:
1232 * ASCII compatible 7bit single bytes chars
1233 * 8bit two byte chars
1234 *
1235 * Shift-JIS is treated as a special case.
1236 *
1237 ********************************************/
1238
/**
 * Maps all characters of a string in the EUC charset family to their
 * to-ASCII equivalents. Characters without a mapping are copied unchanged.
 *
 * Bytes >= 128 are taken as the first byte of a two-byte character; for
 * 'shift_jis' only bytes in the ranges 128-159 and 224-255 are.
 *
 * @param string $str EUC multibyte character string
 * @param string $charset The charset
 * @return string The converted string
 */
public function euc_char_mapping($str, $charset)
{
    if (!$this->initToASCII($charset)) {
        // No conversion table available: do nothing
        return $str;
    }
    $table = &$this->toASCII[$charset];
    $isShiftJis = $charset === 'shift_jis';
    $result = '';
    $pos = 0;
    while (isset($str[$pos])) {
        $char = $str[$pos];
        $byte = ord($char);
        if ($isShiftJis) {
            $isLeadByte = ($byte >= 128 && $byte < 160) || $byte >= 224;
        } else {
            $isLeadByte = $byte >= 128;
        }
        if ($isLeadByte) {
            // A double-byte char: take this byte and the next one together
            $char = substr($str, $pos, 2);
            $pos++;
        }
        $result .= isset($table[$char]) ? $table[$char] : $char;
        $pos++;
    }
    return $result;
}
1278 }