[TASK] Change visibility of CharsetConverter init methods
[Packages/TYPO3.CMS.git] / typo3 / sysext / core / Classes / Charset / CharsetConverter.php
1 <?php
2 namespace TYPO3\CMS\Core\Charset;
3
4 /*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use TYPO3\CMS\Core\Core\Environment;
18 use TYPO3\CMS\Core\SingletonInterface;
19 use TYPO3\CMS\Core\Utility\ExtensionManagementUtility;
20 use TYPO3\CMS\Core\Utility\GeneralUtility;
21
22 /**
23 * Notes on UTF-8
24 *
25 * Functions working on UTF-8 strings:
26 *
27 * - strchr/strstr
28 * - strrchr
29 * - substr_count
30 * - implode/explode/join
31 *
32 * Functions nearly working on UTF-8 strings:
33 *
34 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
35 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
36 * - preg_*: UTF-8 support is compiled into PHP by default nowadays, but could be unavailable; the "u" pattern modifier must be used
37 *
38 * Functions NOT working on UTF-8 strings:
39 *
40 * - str*cmp
41 * - stristr
42 * - stripos
43 * - substr
44 * - strrev
45 * - split/spliti
46 * - ...
47 */
48
49 /**
50 * Class for conversion between charsets
51 */
52 class CharsetConverter implements SingletonInterface
53 {
54 /**
55 * ASCII Value for chars with no equivalent.
56 *
57 * @var int
58 */
59 public $noCharByteVal = 63;
60
61 /**
62 * This is the array where parsed conversion tables are stored (cached)
63 *
64 * @var array
65 */
66 public $parsedCharsets = [];
67
68 /**
69 * An array where case folding data will be stored (cached)
70 *
71 * @var array
72 */
73 public $caseFolding = [];
74
75 /**
76 * An array where charset-to-ASCII mappings are stored (cached)
77 *
78 * @var array
79 */
80 public $toASCII = [];
81
82 /**
83 * This tells the converter which charsets have two bytes per char:
84 *
85 * @var array
86 */
87 public $twoByteSets = [
88 'ucs-2' => 1
89 ];
90
91 /**
92 * This tells the converter which charsets use a scheme like the Extended Unix Code:
93 *
94 * @var array
95 */
96 public $eucBasedSets = [
97 'gb2312' => 1, // Chinese, simplified.
98 'big5' => 1, // Chinese, traditional.
99 'euc-kr' => 1, // Korean
100 'shift_jis' => 1
101 ];
102
103 /**
104 * @link http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
105 * @link http://czyborra.com/charsets/iso8859.html
106 *
107 * @var array
108 */
109 public $synonyms = [
110 'us' => 'ascii',
111 'us-ascii' => 'ascii',
112 'cp819' => 'iso-8859-1',
113 'ibm819' => 'iso-8859-1',
114 'iso-ir-100' => 'iso-8859-1',
115 'iso-ir-101' => 'iso-8859-2',
116 'iso-ir-109' => 'iso-8859-3',
117 'iso-ir-110' => 'iso-8859-4',
118 'iso-ir-144' => 'iso-8859-5',
119 'iso-ir-127' => 'iso-8859-6',
120 'iso-ir-126' => 'iso-8859-7',
121 'iso-ir-138' => 'iso-8859-8',
122 'iso-ir-148' => 'iso-8859-9',
123 'iso-ir-157' => 'iso-8859-10',
124 'iso-ir-179' => 'iso-8859-13',
125 'iso-ir-199' => 'iso-8859-14',
126 'iso-ir-203' => 'iso-8859-15',
127 'csisolatin1' => 'iso-8859-1',
128 'csisolatin2' => 'iso-8859-2',
129 'csisolatin3' => 'iso-8859-3',
130 'csisolatin5' => 'iso-8859-9',
131 'csisolatin8' => 'iso-8859-14',
132 'csisolatin9' => 'iso-8859-15',
133 'csisolatingreek' => 'iso-8859-7',
134 'iso-celtic' => 'iso-8859-14',
135 'latin1' => 'iso-8859-1',
136 'latin2' => 'iso-8859-2',
137 'latin3' => 'iso-8859-3',
138 'latin5' => 'iso-8859-9',
139 'latin6' => 'iso-8859-10',
140 'latin8' => 'iso-8859-14',
141 'latin9' => 'iso-8859-15',
142 'l1' => 'iso-8859-1',
143 'l2' => 'iso-8859-2',
144 'l3' => 'iso-8859-3',
145 'l5' => 'iso-8859-9',
146 'l6' => 'iso-8859-10',
147 'l8' => 'iso-8859-14',
148 'l9' => 'iso-8859-15',
149 'cyrillic' => 'iso-8859-5',
150 'arabic' => 'iso-8859-6',
151 'tis-620' => 'iso-8859-11',
152 'win874' => 'windows-874',
153 'win1250' => 'windows-1250',
154 'win1251' => 'windows-1251',
155 'win1252' => 'windows-1252',
156 'win1253' => 'windows-1253',
157 'win1254' => 'windows-1254',
158 'win1255' => 'windows-1255',
159 'win1256' => 'windows-1256',
160 'win1257' => 'windows-1257',
161 'win1258' => 'windows-1258',
162 'cp1250' => 'windows-1250',
163 'cp1251' => 'windows-1251',
164 'cp1252' => 'windows-1252',
165 'ms-ee' => 'windows-1250',
166 'ms-ansi' => 'windows-1252',
167 'ms-greek' => 'windows-1253',
168 'ms-turk' => 'windows-1254',
169 'winbaltrim' => 'windows-1257',
170 'koi-8ru' => 'koi-8r',
171 'koi8r' => 'koi-8r',
172 'cp878' => 'koi-8r',
173 'mac' => 'macroman',
174 'macintosh' => 'macroman',
175 'euc-cn' => 'gb2312',
176 'x-euc-cn' => 'gb2312',
177 'euccn' => 'gb2312',
178 'cp936' => 'gb2312',
179 'big-5' => 'big5',
180 'cp950' => 'big5',
181 'eucjp' => 'euc-jp',
182 'sjis' => 'shift_jis',
183 'shift-jis' => 'shift_jis',
184 'cp932' => 'shift_jis',
185 'cp949' => 'euc-kr',
186 'utf7' => 'utf-7',
187 'utf8' => 'utf-8',
188 'utf16' => 'utf-16',
189 'utf32' => 'utf-32',
190 'ucs2' => 'ucs-2',
191 'ucs4' => 'ucs-4'
192 ];
193
/**
 * Normalizes a charset name: lowercases it, strips surrounding whitespace
 * and maps known aliases (see $synonyms) to their canonical name.
 *
 * @param string $charset Input charset
 * @return string Normalized charset
 */
public function parse_charset($charset)
{
    $normalized = trim(strtolower($charset));
    // Aliases such as "latin1" or "utf8" resolve to the canonical spelling
    return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
}
208
209 /********************************************
210 *
211 * Charset Conversion functions
212 *
213 ********************************************/
/**
 * Converts a string from one charset to another.
 *
 * mbstring is tried first whenever its result is acceptable (target is UTF-8 or no
 * entity fallback was requested). If mbstring reports failure, or entities are wanted
 * for a non-UTF-8 target, the conversion goes through UTF-8 using the parsed tables.
 *
 * @param string $inputString Input string
 * @param string $fromCharset From charset (the current charset of the string)
 * @param string $toCharset To charset (the output charset wanted)
 * @param bool $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Converted string
 * @see convArray()
 */
public function conv($inputString, $fromCharset, $toCharset, $useEntityForNoChar = false)
{
    // Nothing to do when source and target are the same
    if ($fromCharset === $toCharset) {
        return $inputString;
    }
    // PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
    $mbstringIsAcceptable = !$useEntityForNoChar || $toCharset === 'utf-8';
    if ($mbstringIsAcceptable) {
        // FALSE signals an unsupported charset; fall through to the table based conversion then
        $viaMbstring = mb_convert_encoding($inputString, $toCharset, $fromCharset);
        if ($viaMbstring !== false) {
            return $viaMbstring;
        }
    }
    // Table based conversion, using UTF-8 as the intermediate representation
    $utf8String = $fromCharset === 'utf-8'
        ? $inputString
        : $this->utf8_encode($inputString, $fromCharset);
    return $toCharset === 'utf-8'
        ? $utf8String
        : $this->utf8_decode($utf8String, $toCharset, $useEntityForNoChar);
}
245
/**
 * Converts every string found in an array (recursing into nested arrays) from one
 * charset to another. Non-string, non-array values are left untouched.
 * NOTICE: The array is passed by reference and modified in place!
 *
 * @param array $array Input array, possibly multidimensional
 * @param string $fromCharset From charset (the current charset of the string)
 * @param string $toCharset To charset (the output charset wanted)
 * @param bool $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @see conv()
 */
public function convArray(&$array, $fromCharset, $toCharset, $useEntityForNoChar = false)
{
    foreach ($array as $key => $value) {
        if (is_array($value)) {
            // Recurse on the original element so the nested array is changed in place
            $this->convArray($array[$key], $fromCharset, $toCharset, $useEntityForNoChar);
            continue;
        }
        if (is_string($value)) {
            $array[$key] = $this->conv($value, $fromCharset, $toCharset, $useEntityForNoChar);
        }
    }
}
266
/**
 * Converts $str from $charset to UTF-8 using the parsed conversion table of that charset.
 *
 * Bytes up to 127 are copied unchanged. Characters without an entry in the conversion
 * table are replaced by the byte $noCharByteVal.
 *
 * @param string $str String in local charset to convert to UTF-8
 * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return string Output string, converted to UTF-8
 * @throws UnknownCharsetException if no conversion table exists for $charset (raised by initCharset())
 */
public function utf8_encode($str, $charset)
{
    if ($charset === 'utf-8') {
        return $str;
    }
    // Parse conv. table if not already done
    if ($this->initCharset($charset)) {
        $strLen = strlen($str);
        $outStr = '';
        // Traverse each byte in string
        for ($a = 0; $a < $strLen; $a++) {
            $chr = substr($str, $a, 1);
            $ord = ord($chr);
            if (isset($this->twoByteSets[$charset])) {
                // The charset has two bytes per char.
                // substr() is used instead of $str[$a + 1] so that a truncated trailing
                // character (odd byte count) reads as 0 without an "uninitialized string offset" error.
                $ord2 = ord((string)substr($str, $a + 1, 1));
                // Assume big endian
                $ord = $ord << 8 | $ord2;
                // Use the table entry for the local char number, otherwise the "no char" replacement
                if (isset($this->parsedCharsets[$charset]['local'][$ord])) {
                    $outStr .= $this->parsedCharsets[$charset]['local'][$ord];
                } else {
                    $outStr .= chr($this->noCharByteVal);
                }
                // The second byte has been consumed
                $a++;
            } elseif ($ord > 127) {
                // Values above 127 need a table lookup.
                // EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
                if (isset($this->eucBasedSets[$charset])) {
                    // Shift-JIS: chars between 160 and 223 are single byte
                    if ($charset !== 'shift_jis' || ($ord < 160 || $ord > 223)) {
                        $a++;
                        $ord2 = ord(substr($str, $a, 1));
                        $ord = $ord * 256 + $ord2;
                    }
                }
                // Use the table entry for the local char number, otherwise the "no char" replacement
                if (isset($this->parsedCharsets[$charset]['local'][$ord])) {
                    $outStr .= $this->parsedCharsets[$charset]['local'][$ord];
                } else {
                    $outStr .= chr($this->noCharByteVal);
                }
            } else {
                // 7-bit byte, copied unchanged
                $outStr .= $chr;
            }
        }
        return $outStr;
    }
    return '';
}
326
/**
 * Converts $str from UTF-8 to $charset using the parsed conversion table of that charset.
 *
 * Bytes up to 127 are copied unchanged. A multibyte sequence without a table entry becomes
 * a numeric entity (if $useEntityForNoChar is set) or the byte $noCharByteVal. A stray
 * continuation byte also becomes $noCharByteVal.
 *
 * @param string $str String in UTF-8 to convert to local charset
 * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param bool $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Output string, converted to local charset
 */
public function utf8_decode($str, $charset, $useEntityForNoChar = false)
{
    if ($charset === 'utf-8') {
        return $str;
    }
    // Parse conv. table if not already done
    if (!$this->initCharset($charset)) {
        return '';
    }
    $replacement = chr($this->noCharByteVal);
    $length = strlen($str);
    $result = '';
    for ($pos = 0; $pos < $length; $pos++) {
        $byte = substr($str, $pos, 1);
        $code = ord($byte);
        if ($code <= 127) {
            // Plain 7-bit byte
            $result .= $byte;
            continue;
        }
        if (!($code & 64)) {
            // Bit 7 set but bit 6 clear: we are in the middle of a sequence, not at its start
            $result .= $replacement;
            continue;
        }
        // Collect the whole sequence: every further leading 1-bit of the first byte announces one more byte
        $sequence = $byte;
        for ($shift = 0; $shift < 8; $shift++) {
            $code = $code << 1;
            if (!($code & 128)) {
                break;
            }
            $pos++;
            $sequence .= substr($str, $pos, 1);
        }
        if (isset($this->parsedCharsets[$charset]['utf8'][$sequence])) {
            $localNumber = $this->parsedCharsets[$charset]['utf8'][$sequence];
            // Local numbers above 255 are emitted as two bytes (16 bit word assumed)
            $result .= $localNumber > 255
                ? chr($localNumber >> 8 & 255) . chr($localNumber & 255)
                : chr($localNumber);
        } elseif ($useEntityForNoChar) {
            // Create num entity:
            $result .= '&#' . $this->utf8CharToUnumber($sequence, true) . ';';
        } else {
            $result .= $replacement;
        }
    }
    return $result;
}
395
/**
 * Replaces every multibyte UTF-8 character by a hexadecimal numeric entity (&#x...;).
 * Bytes up to 127 are kept. A stray continuation byte becomes the byte $noCharByteVal.
 *
 * @param string $str Input string
 * @return string Output string
 */
public function utf8_to_entities($str)
{
    $length = strlen($str);
    $result = '';
    $pos = 0;
    while ($pos < $length) {
        $byte = substr($str, $pos, 1);
        $code = ord($byte);
        if ($code <= 127) {
            // Plain 7-bit byte
            $result .= $byte;
        } elseif (($code & 64) === 0) {
            // Bit 7 set but bit 6 clear: not a valid first byte of a sequence
            $result .= chr($this->noCharByteVal);
        } else {
            // Collect the sequence: each further leading 1-bit of the first byte announces one more byte
            $sequence = $byte;
            $step = 0;
            while ($step < 8) {
                $code = $code << 1;
                if (($code & 128) === 0) {
                    break;
                }
                $pos++;
                $sequence .= substr($str, $pos, 1);
                $step++;
            }
            $result .= '&#' . $this->utf8CharToUnumber($sequence, true) . ';';
        }
        $pos++;
    }
    return $result;
}
439
/**
 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars.
 * All string-HTML entities (like &amp; or &pound;) will be converted as well.
 *
 * The hexadecimal marker is accepted in both cases (&#x41; and &#X41;). Entities that are
 * neither a well-formed number nor a known named entity are left in the string unchanged.
 *
 * @param string $str Input string, UTF-8
 * @return string Output string
 */
public function entities_to_utf8($str)
{
    // Maps '&name;' => character
    $entityTable = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT));
    $converted = preg_replace_callback(
        '/(&([#[:alnum:]]*);)/',
        function (array $match) use ($entityTable) {
            $name = $match[2];
            // Dec or hex entities
            if ($name !== '' && $name[0] === '#') {
                $digits = (string)substr($name, 1);
                if ($digits !== '' && ($digits[0] === 'x' || $digits[0] === 'X')) {
                    $hexDigits = (string)substr($digits, 1);
                    if (preg_match('/^[0-9a-fA-F]+$/D', $hexDigits)) {
                        return $this->UnumberToChar(hexdec($hexDigits));
                    }
                } elseif (preg_match('/^[0-9]+$/D', $digits)) {
                    return $this->UnumberToChar((int)$digits);
                }
                // Malformed numeric entity: keep it instead of feeding garbage to UnumberToChar()
                return $match[0];
            }
            // Named entities; unknown names are kept as they are
            $entity = '&' . $name . ';';
            return isset($entityTable[$entity]) ? $entityTable[$entity] : $entity;
        },
        $str
    );
    // preg_replace_callback() yields NULL on a PCRE error; hand back the input untouched then
    return $converted === null ? $str : $converted;
}
477
/**
 * Splits a UTF-8 string into an array with one element per character.
 * HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) are resolved first, so each
 * of them counts as a single character. The elements are the UTF-8 characters themselves
 * (not integer code points). A stray continuation byte yields the byte $noCharByteVal.
 *
 * @param string $str Input string, UTF-8
 * @return array Output array with one UTF-8 character per element
 */
public function utf8_to_numberarray($str)
{
    // Entities must be registered as well
    $decoded = $this->entities_to_utf8($str);

    $length = strlen($decoded);
    $characters = [];
    $pos = 0;
    while ($pos < $length) {
        $byte = substr($decoded, $pos, 1);
        $code = ord($byte);
        if ($code <= 127) {
            // Plain 7-bit byte
            $characters[] = $byte;
        } elseif (($code & 64) === 0) {
            // Bit 7 set but bit 6 clear: not a valid first byte of a sequence
            $characters[] = chr($this->noCharByteVal);
        } else {
            // Collect the sequence: each further leading 1-bit of the first byte announces one more byte
            $sequence = $byte;
            $step = 0;
            while ($step < 8) {
                $code = $code << 1;
                if (($code & 128) === 0) {
                    break;
                }
                $pos++;
                $sequence .= substr($decoded, $pos, 1);
                $step++;
            }
            $characters[] = $sequence;
        }
        $pos++;
    }
    return $characters;
}
527
/**
 * Converts a UNICODE number to a UTF-8 multibyte character
 * Algorithm based on script found at From: http://czyborra.com/utf/
 *
 * The bits of the number are spread across the bytes; the count of leading 1-bits in the
 * first byte announces the number of bytes in the sequence:
 *
 * bytes | bits | representation
 * 1 | 7 | 0vvvvvvv
 * 2 | 11 | 110vvvvv 10vvvvvv
 * 3 | 16 | 1110vvvv 10vvvvvv 10vvvvvv
 * 4 | 21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 * 5 | 26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 * 6 | 31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * Numbers of 2147483648 and above cannot be expressed and yield the byte $noCharByteVal.
 *
 * @param int $unicodeInteger UNICODE integer
 * @return string UTF-8 multibyte character string
 * @see utf8CharToUnumber()
 */
public function UnumberToChar($unicodeInteger)
{
    // Single byte: 0vvvvvvv
    if ($unicodeInteger < 128) {
        return chr($unicodeInteger);
    }
    // Each row: exclusive upper limit, marker bits of the lead byte, number of continuation bytes
    $layouts = [
        [2048, 192, 1],
        [65536, 224, 2],
        [2097152, 240, 3],
        [67108864, 248, 4],
        [2147483648, 252, 5],
    ];
    foreach ($layouts as $layout) {
        if ($unicodeInteger < $layout[0]) {
            // Lead byte carries the topmost bits ...
            $char = chr($layout[1] | ($unicodeInteger >> (6 * $layout[2])));
            // ... and every continuation byte (10vvvvvv) the next six bits, highest first
            for ($shift = 6 * ($layout[2] - 1); $shift >= 0; $shift -= 6) {
                $char .= chr(128 | (($unicodeInteger >> $shift) & 63));
            }
            return $char;
        }
    }
    // Cannot express a 32-bit character in UTF-8
    return chr($this->noCharByteVal);
}
584
/**
 * Converts a UTF-8 multibyte character to a UNICODE number.
 *
 * Only the first character of $str is evaluated. An empty string yields 0. The bytes
 * 0xFE and 0xFF, which never start a valid sequence, contribute no payload bits.
 *
 * @param string $str UTF-8 multibyte character string
 * @param bool $hex If set, then a hex. number is returned.
 * @return int|string UNICODE integer, or the string "x" followed by the lowercase hex number if $hex is set
 * @see UnumberToChar()
 */
public function utf8CharToUnumber($str, $hex = false)
{
    $str = (string)$str;
    // Guard against the empty string instead of reading an undefined offset
    $lead = $str === '' ? 0 : ord($str[0]);
    $int = $lead;
    // Both top bits set means this byte starts a multibyte sequence
    if (($lead & 192) === 192) {
        // Every leading 1-bit after the first announces one continuation byte
        $continuationBytes = 0;
        for ($probe = $lead << 1; $probe & 128; $probe = $probe << 1) {
            $continuationBytes++;
        }
        // The lead byte keeps its lowest (6 - n) bits as payload
        $payloadBits = max(0, 6 - $continuationBytes);
        $int = $lead & ((1 << $payloadBits) - 1);
        // Each continuation byte (10vvvvvv) adds six bits; missing bytes count as zero
        for ($index = 1; $index <= $continuationBytes; $index++) {
            $int = ($int << 6) | (ord((string)substr($str, $index, 1)) & 63);
        }
    }
    return $hex ? 'x' . dechex($int) : $int;
}
620
621 /********************************************
622 *
623 * Init functions
624 *
625 ********************************************/
/**
 * This will initialize a charset for use if it's defined in the 'typo3/sysext/core/Resources/Private/Charsets/csconvtbl/' folder
 * This function is automatically called by the conversion functions
 *
 * The parsed table is stored in $parsedCharsets[$charset] with two maps: 'local' (local char
 * number => UTF-8 character) and 'utf8' (UTF-8 character => local char number). Only local
 * values above 127 are stored.
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param string $charset The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return int Returns '1' if already loaded, '2' if the table was read from the cache file or parsed from the conversion table.
 * @throws UnknownCharsetException if no charset table was found
 */
protected function initCharset($charset)
{
    // Only process if the charset is not yet loaded.
    // isset() first: on the first call for a charset the key does not exist yet.
    if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
        return 1;
    }
    // Conversion table filename:
    $charsetConvTableFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/csconvtbl/' . $charset . '.tbl';
    if (!$charset || !GeneralUtility::validPathStr($charsetConvTableFile) || !@is_file($charsetConvTableFile)) {
        throw new UnknownCharsetException(sprintf('Unknown charset "%s"', $charset), 1508916031);
    }
    // Cache file for charsets:
    // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
    $cacheFile = Environment::getVarPath() . '/charset/charset_' . $charset . '.tbl';
    if ($cacheFile && @is_file($cacheFile)) {
        $this->parsedCharsets[$charset] = unserialize(file_get_contents($cacheFile));
        return 2;
    }
    // Parse conversion table into lines:
    $lines = GeneralUtility::trimExplode(LF, file_get_contents($charsetConvTableFile), true);
    // Initialize the internal variable holding the conv. table:
    $this->parsedCharsets[$charset] = ['local' => [], 'utf8' => []];
    // The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
    $whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
    $detectedType = '';
    foreach ($lines as $value) {
        // Comment line or blanks are ignored.
        if (trim($value) && $value[0] !== '#') {
            // Detect type if not done yet: (Done on first real line)
            if (!$detectedType) {
                $detectedType = preg_match($whitespacedPattern, $value) ? 'whitespaced' : 'ms-token';
            }
            $hexbyte = '';
            $utf8 = '';
            if ($detectedType === 'ms-token') {
                list($hexbyte, $utf8) = preg_split('/[=:]/', $value, 3);
            } elseif ($detectedType === 'whitespaced') {
                $regA = [];
                preg_match($whitespacedPattern, $value, $regA);
                $hexbyte = $regA[1];
                $utf8 = 'U+' . $regA[2];
            }
            $decval = hexdec(trim($hexbyte));
            // Values up to 127 are never looked up by the conversion functions, so they are not stored
            if ($decval > 127) {
                // Strip the leading "U+" before reading the hex code point
                $utf8decval = hexdec(substr(trim($utf8), 2));
                $this->parsedCharsets[$charset]['local'][$decval] = $this->UnumberToChar($utf8decval);
                $this->parsedCharsets[$charset]['utf8'][$this->parsedCharsets[$charset]['local'][$decval]] = $decval;
            }
        }
    }
    if ($cacheFile) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->parsedCharsets[$charset]));
    }
    return 2;
}
692
/**
 * This function initializes all UTF-8 character data tables: the case folding table
 * ($caseFolding['utf-8']) and the to-ASCII transliteration table ($toASCII['utf-8']).
 *
 * For the modes "case" and "ascii" an already loaded table or a cache file is used if
 * available. Otherwise both tables are built from UnicodeData.txt, SpecialCasing.txt and
 * Translit.txt and written to their cache files.
 *
 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
 *
 * @param string $mode Mode ("case", "ascii", ...)
 * @return int Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 */
protected function initUnicodeData($mode = null)
{
    // Cache files
    $cacheFileCase = Environment::getVarPath() . '/charset/cscase_utf-8.tbl';
    $cacheFileASCII = Environment::getVarPath() . '/charset/csascii_utf-8.tbl';
    // Only process if the tables are not yet loaded
    switch ($mode) {
        case 'case':
            // isset() first: the key does not exist before the first load
            if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8'])) {
                return 1;
            }
            // Use cached version if possible
            if ($cacheFileCase && @is_file($cacheFileCase)) {
                $this->caseFolding['utf-8'] = unserialize(file_get_contents($cacheFileCase));
                return 2;
            }
            break;
        case 'ascii':
            if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8'])) {
                return 1;
            }
            // Use cached version if possible
            if ($cacheFileASCII && @is_file($cacheFileASCII)) {
                $this->toASCII['utf-8'] = unserialize(file_get_contents($cacheFileASCII));
                return 2;
            }
            break;
    }
    // Process main Unicode data file
    $unicodeDataFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/UnicodeData.txt';
    if (!(GeneralUtility::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
        return false;
    }
    $fh = fopen($unicodeDataFile, 'rb');
    if (!$fh) {
        return false;
    }
    // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
    // Note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
    $this->caseFolding['utf-8'] = [];
    // a shorthand
    $utf8CaseFolding = &$this->caseFolding['utf-8'];
    $utf8CaseFolding['toUpper'] = [];
    $utf8CaseFolding['toLower'] = [];
    $utf8CaseFolding['toTitle'] = [];
    // Array of temp. decompositions
    $decomposition = [];
    // Array of chars that are marks (eg. composing accents)
    $mark = [];
    // Array of chars that are numbers (eg. digits)
    $number = [];
    // Array of chars to be omitted (eg. Russian hard sign)
    $omit = [];
    while (!feof($fh)) {
        $line = fgets($fh, 4096);
        // fgets() yields FALSE once nothing is left to read
        if ($line === false) {
            break;
        }
        // A blank line has no fields to destructure
        if (trim($line) === '') {
            continue;
        }
        // Has a lot of info
        list($char, $name, $cat, , , $decomp, , , $num, , , , $upper, $lower, $title, ) = explode(';', rtrim($line));
        $ord = hexdec($char);
        if ($ord > 65535) {
            // Only process the BMP
            break;
        }
        $utf8_char = $this->UnumberToChar($ord);
        if ($upper) {
            $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
        }
        if ($lower) {
            $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
        }
        // Store "title" only when different from "upper" (only a few)
        if ($title && $title !== $upper) {
            $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
        }
        switch ($cat[0]) {
            case 'M':
                // mark (accent, umlaut, ...)
                $mark['U+' . $char] = 1;
                break;
            case 'N':
                // numeric value
                if ($ord > 128 && $num !== '') {
                    $number['U+' . $char] = $num;
                }
        }
        // Accented Latin letters without "official" decomposition
        $match = [];
        if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
            $c = ord($match[2]);
            if ($match[1] === 'SMALL') {
                // ASCII lowercase letters are 32 above their uppercase counterparts
                $c += 32;
            }
            $decomposition['U+' . $char] = [dechex($c)];
            continue;
        }
        $match = [];
        if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
            switch ($match[1]) {
                case '<circle>':
                    // add parenthesis as circle replacement, eg (1)
                    $match[2] = '0028 ' . $match[2] . ' 0029';
                    break;
                case '<square>':
                    // add square brackets as square replacement, eg [1]
                    $match[2] = '005B ' . $match[2] . ' 005D';
                    break;
                case '<compat>':
                    // ignore multi char decompositions that start with a space
                    if (preg_match('/^0020 /', $match[2])) {
                        continue 2;
                    }
                    break;
                case '<initial>':
                case '<medial>':
                case '<final>':
                case '<isolated>':
                case '<vertical>':
                    // These decomposition types are not used
                    continue 2;
            }
            $decomposition['U+' . $char] = explode(' ', $match[2]);
        }
    }
    fclose($fh);
    // Process additional Unicode data for casing (allow folded characters to expand into a sequence)
    $specialCasingFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/SpecialCasing.txt';
    if (GeneralUtility::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
        $fh = fopen($specialCasingFile, 'rb');
        if ($fh) {
            while (!feof($fh)) {
                $line = fgets($fh, 4096);
                if ($line === false) {
                    break;
                }
                // Skip blank lines and comment lines
                if (trim($line) !== '' && $line[0] !== '#') {
                    list($char, $lower, $title, $upper, $cond) = GeneralUtility::trimExplode(';', $line);
                    // Only entries whose fifth field is empty or a comment are used
                    if ($cond === '' || $cond[0] === '#') {
                        $utf8_char = $this->UnumberToChar(hexdec($char));
                        if ($char !== $lower) {
                            $arr = explode(' ', $lower);
                            for ($i = 0; isset($arr[$i]); $i++) {
                                $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
                            }
                            $utf8CaseFolding['toLower'][$utf8_char] = implode('', $arr);
                        }
                        if ($char !== $title && $title !== $upper) {
                            $arr = explode(' ', $title);
                            for ($i = 0; isset($arr[$i]); $i++) {
                                $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
                            }
                            $utf8CaseFolding['toTitle'][$utf8_char] = implode('', $arr);
                        }
                        if ($char !== $upper) {
                            $arr = explode(' ', $upper);
                            for ($i = 0; isset($arr[$i]); $i++) {
                                $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
                            }
                            $utf8CaseFolding['toUpper'][$utf8_char] = implode('', $arr);
                        }
                    }
                }
            }
            fclose($fh);
        }
    }
    // Process custom decompositions
    $customTranslitFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/Translit.txt';
    if (GeneralUtility::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
        $fh = fopen($customTranslitFile, 'rb');
        if ($fh) {
            while (!feof($fh)) {
                $line = fgets($fh, 4096);
                if ($line === false) {
                    break;
                }
                // Skip blank lines and comment lines
                if (trim($line) !== '' && $line[0] !== '#') {
                    list($char, $translit) = GeneralUtility::trimExplode(';', $line);
                    // An empty transliteration marks the char as one to be omitted
                    if (!$translit) {
                        $omit['U+' . $char] = 1;
                    }
                    $decomposition['U+' . $char] = explode(' ', $translit);
                }
            }
            fclose($fh);
        }
    }
    // Decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
    foreach ($decomposition as $from => $to) {
        $code_decomp = [];
        while ($code_value = array_shift($to)) {
            // Do recursive decomposition
            if (isset($decomposition['U+' . $code_value])) {
                foreach (array_reverse($decomposition['U+' . $code_value]) as $cv) {
                    array_unshift($to, $cv);
                }
            } elseif (!isset($mark['U+' . $code_value])) {
                // Keep everything that is not a mark
                $code_decomp[] = $code_value;
            }
        }
        if (!empty($code_decomp) || isset($omit[$from])) {
            $decomposition[$from] = $code_decomp;
        } else {
            unset($decomposition[$from]);
        }
    }
    // Create ascii only mapping
    $this->toASCII['utf-8'] = [];
    $ascii = &$this->toASCII['utf-8'];
    foreach ($decomposition as $from => $to) {
        $code_decomp = [];
        while ($code_value = array_shift($to)) {
            $ord = hexdec($code_value);
            if ($ord > 127) {
                // Skip decompositions containing non-ASCII chars
                continue 2;
            }
            $code_decomp[] = chr($ord);
        }
        $ascii[$this->UnumberToChar(hexdec($from))] = implode('', $code_decomp);
    }
    // Add numeric decompositions
    foreach ($number as $from => $to) {
        $utf8_char = $this->UnumberToChar(hexdec($from));
        if (!isset($ascii[$utf8_char])) {
            $ascii[$utf8_char] = $to;
        }
    }
    if ($cacheFileCase) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
    }
    if ($cacheFileASCII) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
    }
    return 3;
}
929
/**
 * This function initializes the folding table for a charset other than UTF-8.
 * This function is automatically called by the case folding functions.
 *
 * The table is derived from the UTF-8 case folding data: every character of the charset's
 * conversion table is mapped through the UTF-8 table and converted back to the charset.
 * The ASCII letters a-z / A-Z are added afterwards.
 *
 * @param string $charset Charset for which to initialize case folding.
 * @return int Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @throws UnknownCharsetException if no conversion table exists for $charset (raised by initCharset())
 */
protected function initCaseFolding($charset)
{
    // Only process if the case table is not yet loaded.
    // isset() first: the key does not exist before the first load.
    if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset])) {
        return 1;
    }
    // Use cached version if possible
    $cacheFile = Environment::getVarPath() . '/charset/cscase_' . $charset . '.tbl';
    if ($cacheFile && @is_file($cacheFile)) {
        $this->caseFolding[$charset] = unserialize(file_get_contents($cacheFile));
        return 2;
    }
    // init UTF-8 conversion for this charset
    if (!$this->initCharset($charset)) {
        return false;
    }
    // UTF-8 case folding is used as the base conversion table
    if (!$this->initUnicodeData('case')) {
        return false;
    }
    $nochar = chr($this->noCharByteVal);
    foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
        // Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
        $c = $this->utf8_decode($utf8, $charset);
        foreach (['toUpper', 'toLower', 'toTitle'] as $direction) {
            // Most characters have no mapping in a given direction; skip them instead of reading an undefined index
            if (!isset($this->caseFolding['utf-8'][$direction][$utf8])) {
                continue;
            }
            $cc = $this->utf8_decode($this->caseFolding['utf-8'][$direction][$utf8], $charset);
            // Keep the mapping only if the folded character exists in the charset
            if ($cc !== '' && $cc !== $nochar) {
                $this->caseFolding[$charset][$direction][$c] = $cc;
            }
        }
    }
    // Add the ASCII case table
    $start = ord('a');
    $end = ord('z');
    for ($i = $start; $i <= $end; $i++) {
        $this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i - 32);
    }
    $start = ord('A');
    $end = ord('Z');
    for ($i = $start; $i <= $end; $i++) {
        $this->caseFolding[$charset]['toLower'][chr($i)] = chr($i + 32);
    }
    if ($cacheFile) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->caseFolding[$charset]));
    }
    return 3;
}
990
/**
 * Initializes the to-ASCII conversion table for a charset other than UTF-8.
 * This function is automatically called by the ASCII transliteration functions.
 *
 * The table is taken from the in-memory cache, then from the serialized cache
 * file below the var path, and is otherwise derived from the UTF-8 to-ASCII
 * table by re-encoding every character of the charset's conversion table.
 *
 * @param string $charset Charset for which to initialize conversion.
 * @return int|bool FALSE on error, otherwise 1 (table already loaded), 2 (cached version loaded) or 3 (table parsed and cached).
 */
protected function initToASCII($charset)
{
    // Only process if the table is not yet loaded.
    // isset() avoids an "undefined index" notice on the first call for a charset.
    if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) {
        return 1;
    }
    // Use cached version if possible
    $cacheFile = Environment::getVarPath() . '/charset/csascii_' . $charset . '.tbl';
    if (@is_file($cacheFile)) {
        // The cache only ever holds an array of strings, so object instantiation is disabled.
        $cachedTable = unserialize((string)file_get_contents($cacheFile), ['allowed_classes' => false]);
        if (is_array($cachedTable)) {
            $this->toASCII[$charset] = $cachedTable;
            return 2;
        }
        // Unreadable or corrupt cache file: fall through and rebuild the table.
    }
    // Init UTF-8 conversion for this charset
    if (!$this->initCharset($charset)) {
        return false;
    }
    // UTF-8/ASCII transliteration is used as the base conversion table
    if (!$this->initUnicodeData('ascii')) {
        return false;
    }
    // Start with an empty table so that a charset without any transliterable
    // character still counts as "loaded" and serializes as an array.
    $this->toASCII[$charset] = [];
    foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
        if (isset($this->toASCII['utf-8'][$utf8])) {
            // Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
            $c = $this->utf8_decode($utf8, $charset);
            $this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
        }
    }
    GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->toASCII[$charset]));
    return 3;
}
1030
1031 /********************************************
1032 *
1033 * String operation functions
1034 *
1035 ********************************************/
1036
/**
 * Shortens a string to a number of characters and adds a crop signifier.
 *
 * A positive length keeps the beginning of the string and appends the signifier,
 * a negative length keeps the end of the string and prepends it. The string is
 * returned unchanged if the length is 0 or the string is not longer than the
 * absolute length.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param int $len Length (in characters)
 * @param string $crop Crop signifier
 * @return string The shortened string
 * @see substr(), mb_strimwidth()
 */
public function crop($charset, $string, $len, $crop = '')
{
    $needsNoCropping = (int)$len === 0 || mb_strlen($string, $charset) <= abs($len);
    if ($needsNoCropping) {
        return $string;
    }
    if ($len > 0) {
        // Keep the leading characters
        return mb_substr($string, 0, $len, $charset) . $crop;
    }
    // Keep the trailing characters
    $totalLength = mb_strlen($string, $charset);
    return $crop . mb_substr($string, $len, $totalLength, $charset);
}
1060
/**
 * Charset-aware counterpart of lcfirst()/ucfirst().
 *
 * Only the first character is converted; the rest of the string is left as is.
 *
 * @param string $charset
 * @param string $string
 * @param string $case 'toLower' lower-cases the first character, any other value upper-cases it
 * @return string
 */
public function convCaseFirst($charset, $string, $case)
{
    $head = mb_substr($string, 0, 1, $charset);
    $tail = mb_substr($string, 1, null, $charset);
    if ($case === 'toLower') {
        return mb_strtolower($head, $charset) . $tail;
    }
    return mb_strtoupper($head, $charset) . $tail;
}
1078
/**
 * Converts special chars (like æøåÆØÅ, umlauts etc) to ascii equivalents (usually double-bytes, like æ => ae etc.)
 *
 * Dispatches to the UTF-8, EUC or single-byte mapping function depending on the charset.
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert
 * @return string The converted string
 */
public function specCharsToASCII($charset, $string)
{
    if ($charset === 'utf-8') {
        return $this->utf8_char_mapping($string);
    }
    if (isset($this->eucBasedSets[$charset])) {
        return $this->euc_char_mapping($string, $charset);
    }
    // Treat everything else as single-byte encoding
    return $this->sb_char_mapping($string, $charset);
}
1098
1099 /********************************************
1100 *
1101 * Internal string operation functions
1102 *
1103 ********************************************/
/**
 * Maps all characters of a string in a single byte charset to their ASCII equivalents.
 *
 * Bytes without an entry in the to-ASCII table are copied unchanged.
 *
 * @param string $str The string
 * @param string $charset The charset
 * @return string The converted string
 */
public function sb_char_mapping($str, $charset)
{
    if (!$this->initToASCII($charset)) {
        // No conversion table available: do nothing
        return $str;
    }
    $map = &$this->toASCII[$charset];
    $converted = '';
    $index = 0;
    while (isset($str[$index])) {
        $byte = $str[$index];
        $converted .= isset($map[$byte]) ? $map[$byte] : $byte;
        $index++;
    }
    return $converted;
}
1129
1130 /********************************************
1131 *
1132 * Internal UTF-8 string operation functions
1133 *
1134 ********************************************/
1135
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * For a position >= 0 the string is walked forward and the byte offset of the
 * character at that position is returned. For a negative position the string
 * is walked backwards from its end; a position pointing exactly at the first
 * character yields 0.
 *
 * @param string $str UTF-8 string
 * @param int $pos Character position (negative values start from the end)
 * @return int|bool Byte position, or FALSE if the position lies outside the string
 */
public function utf8_char2byte_pos($str, $pos)
{
    // Number of characters found
    $n = 0;
    // Number of characters wanted
    $p = abs($pos);
    if ($pos >= 0) {
        for ($i = 0; isset($str[$i]) && $n < $p; $i++) {
            $c = ord($str[$i]);
            // Count single-byte chars (0xxxxxxx) and multi-byte start bytes (11xxxxxx)
            if (!($c & 128) || ($c & 192) === 192) {
                $n++;
            }
        }
        if (!isset($str[$i])) {
            // Offset beyond string length
            return false;
        }
        // Skip trailing multi-byte data bytes (10xxxxxx). The isset() guard stops
        // at the end of the string instead of reading an uninitialized offset.
        while (isset($str[$i]) && (ord($str[$i]) & 192) === 128) {
            $i++;
        }
        return $i;
    }
    // Walk backwards. The explicit lower bound is required because negative
    // string offsets are valid since PHP 7.1 and would wrap around to the end.
    for ($i = strlen($str) - 1; $i >= 0 && $n < $p; $i--) {
        $c = ord($str[$i]);
        if (!($c & 128) || ($c & 192) === 192) {
            $n++;
        }
    }
    if ($n < $p) {
        // The string holds fewer characters than requested
        return false;
    }
    // $i points one byte before the start of the wanted character: correct offset
    return $i + 1;
}
1182
/**
 * Maps all characters of a UTF-8 string to their ASCII equivalents.
 *
 * Characters without an entry in the UTF-8 to-ASCII table are copied unchanged.
 * Bytes that are neither single-byte characters nor multi-byte start bytes
 * contribute nothing to the output unless the table has an entry for the empty string.
 *
 * @param string $str UTF-8 string
 * @return string The converted string
 */
public function utf8_char_mapping($str)
{
    if (!$this->initUnicodeData('ascii')) {
        // No conversion table available: do nothing
        return $str;
    }
    $map = &$this->toASCII['utf-8'];
    $converted = '';
    $offset = 0;
    while (isset($str[$offset])) {
        $byte = ord($str[$offset]);
        $char = '';
        if (($byte & 128) === 0) {
            // Single-byte (0xxxxxxx)
            $char = $str[$offset];
        } elseif (($byte & 192) === 192) {
            // Multi-byte starting byte (11xxxxxx): the number of leading
            // 1-bits is the number of bytes of the character
            $byteCount = 0;
            while ($byte & 128) {
                $byteCount++;
                $byte = $byte << 1;
            }
            $char = substr($str, $offset, $byteCount);
            $offset += $byteCount - 1;
        }
        $converted .= isset($map[$char]) ? $map[$char] : $char;
        $offset++;
    }
    return $converted;
}
1221
1222 /********************************************
1223 *
1224 * Internal EUC string operation functions
1225 *
1226 * Extended Unix Code:
1227 * ASCII compatible 7bit single bytes chars
1228 * 8bit two byte chars
1229 *
1230 * Shift-JIS is treated as a special case.
1231 *
1232 ********************************************/
1233
/**
 * Maps all characters of a string in the EUC charset family to their ASCII equivalents.
 *
 * A byte >= 128 is taken as the first byte of a two-byte character; for
 * 'shift_jis' only the bytes 128-159 and >= 224 are. Characters without an
 * entry in the to-ASCII table are copied unchanged.
 *
 * @param string $str EUC multibyte character string
 * @param string $charset The charset
 * @return string The converted string
 */
public function euc_char_mapping($str, $charset)
{
    if (!$this->initToASCII($charset)) {
        // No conversion table available: do nothing
        return $str;
    }
    $map = &$this->toASCII[$charset];
    $isShiftJis = $charset === 'shift_jis';
    $converted = '';
    $offset = 0;
    while (isset($str[$offset])) {
        $char = $str[$offset];
        $byte = ord($char);
        if ($isShiftJis) {
            $startsDoubleByte = ($byte >= 128 && $byte < 160) || $byte >= 224;
        } else {
            $startsDoubleByte = $byte >= 128;
        }
        if ($startsDoubleByte) {
            // A double-byte char: take the following byte along
            $char = substr($str, $offset, 2);
            $offset++;
        }
        $converted .= isset($map[$char]) ? $map[$char] : $char;
        $offset++;
    }
    return $converted;
}
1273 }