[!!!][TASK] Remove deprecated code from charset converter
[Packages/TYPO3.CMS.git] / typo3 / sysext / core / Classes / Charset / CharsetConverter.php
1 <?php
2 namespace TYPO3\CMS\Core\Charset;
3
4 /*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use TYPO3\CMS\Core\Core\Environment;
18 use TYPO3\CMS\Core\SingletonInterface;
19 use TYPO3\CMS\Core\Utility\ExtensionManagementUtility;
20 use TYPO3\CMS\Core\Utility\GeneralUtility;
21
22 /**
23 * Notes on UTF-8
24 *
25 * Functions working on UTF-8 strings:
26 *
27 * - strchr/strstr
28 * - strrchr
29 * - substr_count
30 * - implode/explode/join
31 *
32 * Functions nearly working on UTF-8 strings:
33 *
34 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
35 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
36 * - preg_*: UTF-8 support is compiled into PCRE by default nowadays, but could be unavailable; patterns need the "u" modifier
37 *
38 * Functions NOT working on UTF-8 strings:
39 *
40 * - str*cmp
41 * - stristr
42 * - stripos
43 * - substr
44 * - strrev
45 * - split/spliti
46 * - ...
47 */
48
49 /**
50 * Class for conversion between charsets
51 */
52 class CharsetConverter implements SingletonInterface
53 {
/**
 * Byte value written for characters that have no equivalent in the
 * target charset (63 is the ASCII question mark).
 *
 * @var int
 */
protected $noCharByteVal = 63;

/**
 * Runtime cache of parsed conversion tables, keyed by charset name.
 * Each entry holds a 'local' (local code => UTF-8 sequence) and a
 * 'utf8' (UTF-8 sequence => local code) map.
 *
 * @var array
 */
protected $parsedCharsets = [];

/**
 * Runtime cache of charset-to-ASCII transliteration maps, keyed by charset name.
 *
 * @var array
 */
protected $toASCII = [];

/**
 * Charsets in which every character occupies exactly two bytes.
 *
 * @var array
 */
protected $twoByteSets = ['ucs-2' => 1];

/**
 * Charsets that use a scheme like the Extended Unix Code
 * (7-bit single-byte characters plus 8-bit double-byte characters).
 *
 * @var array
 */
protected $eucBasedSets = [
    // Chinese, simplified
    'gb2312' => 1,
    // Chinese, traditional
    'big5' => 1,
    // Korean
    'euc-kr' => 1,
    // Handled as a special case by the conversion methods
    'shift_jis' => 1
];
95
96 /********************************************
97 *
98 * Charset Conversion functions
99 *
100 ********************************************/
/**
 * Convert a string from one charset to another.
 *
 * Conversions to UTF-8 are first attempted with mbstring. If mbstring does
 * not know the source charset, or for any other target charset, the string
 * is routed through UTF-8 using the parsed conversion tables. Characters
 * that do not exist in the target charset become numeric entities.
 *
 * @param string $inputString Input string
 * @param string $fromCharset From charset (the current charset of the string)
 * @param string $toCharset To charset (the output charset wanted)
 * @return string Converted string
 */
public function conv($inputString, $fromCharset, $toCharset)
{
    if ($fromCharset === $toCharset) {
        return $inputString;
    }
    // PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
    if ($toCharset === 'utf-8') {
        try {
            // PHP 7 returns FALSE for unsupported charsets ...
            $convertedString = mb_convert_encoding($inputString, $toCharset, $fromCharset);
        } catch (\ValueError $e) {
            // ... while PHP 8 throws; either way fall back to the conversion tables
            $convertedString = false;
        }
        if ($convertedString !== false) {
            return $convertedString;
        }
    }
    if ($fromCharset !== 'utf-8') {
        $inputString = $this->utf8_encode($inputString, $fromCharset);
    }
    if ($toCharset !== 'utf-8') {
        $inputString = $this->utf8_decode($inputString, $toCharset, true);
    }
    return $inputString;
}
130
/**
 * Converts $str from $charset to UTF-8 using the parsed conversion table
 * of that charset.
 *
 * Bytes up to 127 are copied unchanged (except in two-byte charsets, where
 * every byte pair is looked up). Local characters missing from the table
 * are replaced by the "no char" byte.
 *
 * @param string $str String in local charset to convert to UTF-8
 * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return string Output string, converted to UTF-8
 * @throws UnknownCharsetException if no conversion table exists for $charset
 */
public function utf8_encode($str, $charset)
{
    if ($charset === 'utf-8') {
        return $str;
    }
    // Parse conv. table if not already
    if (!$this->initCharset($charset)) {
        return '';
    }
    // These do not change while the string is traversed
    $isTwoByteSet = isset($this->twoByteSets[$charset]);
    $isEucBasedSet = isset($this->eucBasedSets[$charset]);
    $noChar = chr($this->noCharByteVal);
    $strLen = strlen($str);
    $outStr = '';
    // Traverse each char in string
    for ($a = 0; $a < $strLen; $a++) {
        $chr = substr($str, $a, 1);
        $ord = ord($chr);
        if ($isTwoByteSet) {
            // Two bytes per char, big endian assumed. substr() is used instead of
            // a string offset so that a dangling last byte of an odd-length input
            // does not read past the end of the string; the missing byte counts as 0.
            $a++;
            $ord = $ord << 8 | ord(substr($str, $a, 1));
        } elseif ($ord > 127) {
            // EUC uses two bytes above 127; fetch both and make $ord a 16 bit int.
            // Shift-JIS: chars between 160 and 223 are single byte.
            if ($isEucBasedSet && ($charset !== 'shift_jis' || $ord < 160 || $ord > 223)) {
                $a++;
                $ord = $ord * 256 + ord(substr($str, $a, 1));
            }
        } else {
            // 7-bit char, identical in UTF-8
            $outStr .= $chr;
            continue;
        }
        // Use the table entry if the local char number is known, the "no char" byte otherwise
        $outStr .= isset($this->parsedCharsets[$charset]['local'][$ord])
            ? $this->parsedCharsets[$charset]['local'][$ord]
            : $noChar;
    }
    return $outStr;
}
190
/**
 * Converts $str from UTF-8 to $charset using the parsed conversion table
 * of that charset.
 *
 * Bytes up to 127 are copied unchanged. A stray continuation byte is
 * replaced by the "no char" byte. A multibyte sequence missing from the
 * table becomes either a hexadecimal numeric entity (&#x...;) or the
 * "no char" byte, depending on $useEntityForNoChar.
 *
 * @param string $str String in UTF-8 to convert to local charset
 * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param bool $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Output string, converted to local charset
 * @throws UnknownCharsetException if no conversion table exists for $charset
 */
public function utf8_decode($str, $charset, $useEntityForNoChar = false)
{
    if ($charset === 'utf-8') {
        return $str;
    }
    // Parse conv. table if not already
    if (!$this->initCharset($charset)) {
        return '';
    }
    $noChar = chr($this->noCharByteVal);
    $strLen = strlen($str);
    $outStr = '';
    // Traverse each byte in UTF-8 string
    for ($a = 0; $a < $strLen; $a++) {
        $chr = substr($str, $a, 1);
        $ord = ord($chr);
        if ($ord <= 127) {
            // 7-bit char, copy as is
            $outStr .= $chr;
            continue;
        }
        // A lead byte must have the 7th bit set, otherwise we are in the middle of a byte sequence
        if (!($ord & 64)) {
            $outStr .= $noChar;
            continue;
        }
        // Collect the whole sequence, starting with the lead byte
        $buf = $chr;
        for ($b = 0; $b < 8; $b++) {
            // Every further high bit set in the lead byte announces one more byte
            $ord = $ord << 1;
            if (!($ord & 128)) {
                break;
            }
            $a++;
            $buf .= substr($str, $a, 1);
        }
        if (isset($this->parsedCharsets[$charset]['utf8'][$buf])) {
            // The local number; values above 255 are split into two bytes (16 bit word assumed)
            $mByte = $this->parsedCharsets[$charset]['utf8'][$buf];
            $outStr .= $mByte > 255
                ? chr($mByte >> 8 & 255) . chr($mByte & 255)
                : chr($mByte);
        } elseif ($useEntityForNoChar) {
            // Create num entity
            $outStr .= '&#' . $this->utf8CharToUnumber($buf, true) . ';';
        } else {
            $outStr .= $noChar;
        }
    }
    return $outStr;
}
259
/**
 * Splits a UTF-8 string into an array of its characters.
 *
 * HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) are decoded
 * first, so each of them counts as one character. Every array element is
 * the character itself (its UTF-8 byte sequence), not its code point.
 * A stray continuation byte is reported as the "no char" byte.
 *
 * @param string $str Input string, UTF-8
 * @return array Output array with one UTF-8 character per element
 */
public function utf8_to_numberarray($str)
{
    // Entities must be registered as characters as well
    $decoded = html_entity_decode($str, ENT_COMPAT, 'utf-8');

    $byteCount = strlen($decoded);
    $characters = [];
    $position = 0;
    while ($position < $byteCount) {
        $byte = substr($decoded, $position, 1);
        $value = ord($byte);
        if ($value <= 127) {
            // Plain 7-bit character
            $characters[] = chr($value);
        } elseif (!($value & 64)) {
            // Bit 7 is missing, so this is not a lead byte but the middle of a sequence
            $characters[] = chr($this->noCharByteVal);
        } else {
            // Lead byte: each further high bit announces one more byte of the sequence
            $sequence = $byte;
            for ($step = 0; $step < 8; $step++) {
                $value = $value << 1;
                if (!($value & 128)) {
                    break;
                }
                $position++;
                $sequence .= substr($decoded, $position, 1);
            }
            $characters[] = $sequence;
        }
        $position++;
    }
    return $characters;
}
309
/**
 * Encodes a UNICODE number as a UTF-8 multibyte character.
 * Algorithm based on script found at: http://czyborra.com/utf/
 *
 * The bits of the number are spread across the bytes; the number of high
 * bits set in the lead byte tells how many bytes the sequence has:
 *
 * bytes | bits | representation
 * 1 | 7 | 0vvvvvvv
 * 2 | 11 | 110vvvvv 10vvvvvv
 * 3 | 16 | 1110vvvv 10vvvvvv 10vvvvvv
 * 4 | 21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 * 5 | 26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 * 6 | 31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * Numbers of 2^31 and above yield the "no char" byte.
 *
 * @param int $unicodeInteger UNICODE integer
 * @return string UTF-8 multibyte character string
 * @see utf8CharToUnumber()
 */
public function UnumberToChar($unicodeInteger)
{
    if ($unicodeInteger < 128) {
        return chr($unicodeInteger);
    }
    // Per sequence length: exclusive upper bound, lead byte marker, number of continuation bytes
    $layouts = [
        [2048, 192, 1],
        [65536, 224, 2],
        [2097152, 240, 3],
        [67108864, 248, 4],
        [2147483648, 252, 5],
    ];
    foreach ($layouts as $layout) {
        if ($unicodeInteger >= $layout[0]) {
            continue;
        }
        $continuationBytes = $layout[2];
        // The lead byte carries the marker plus the topmost value bits
        $char = chr($layout[1] | $unicodeInteger >> (6 * $continuationBytes));
        // Every continuation byte carries the next six bits, most significant first
        for ($shift = 6 * ($continuationBytes - 1); $shift >= 0; $shift -= 6) {
            $char .= chr(128 | $unicodeInteger >> $shift & 63);
        }
        return $char;
    }
    // Cannot express a 32-bit character in UTF-8
    return chr($this->noCharByteVal);
}
366
/**
 * Converts a UTF-8 multibyte character to a UNICODE number.
 *
 * Only the first character of $str is evaluated. A single-byte character
 * yields its byte value; an empty string yields 0.
 *
 * @param string $str UTF-8 multibyte character string
 * @param bool $hex If set, then a hex. number is returned.
 * @return int|string UNICODE integer, or the string "x" followed by the lowercase hex number if $hex is set
 * @see UnumberToChar()
 */
public function utf8CharToUnumber($str, $hex = false)
{
    // First byte; guarded so that an empty string does not trigger an
    // "uninitialized string offset" notice/warning (it counts as 0, as before)
    $firstByte = isset($str[0]) ? ord($str[0]) : 0;
    $ord = $firstByte;
    // This verifies that it IS a multi byte string
    if (($ord & 192) === 192) {
        $binBuf = '';
        $b = 0;
        // For each byte in multibyte string...
        for (; $b < 8; $b++) {
            // Shift it left and ...
            $ord = $ord << 1;
            // ... AND with 8th bit - if that is set, then there are still bytes in sequence.
            if (!($ord & 128)) {
                break;
            }
            // Every continuation byte contributes its lower six bits
            $binBuf .= substr('00000000' . decbin(ord(substr($str, $b + 1, 1))), -6);
        }
        // After the loop $b is the number of continuation bytes, so the lead
        // byte contributes its lower (6 - $b) bits
        $binBuf = substr('00000000' . decbin($firstByte), -(6 - $b)) . $binBuf;
        $int = bindec($binBuf);
    } else {
        $int = $ord;
    }
    return $hex ? 'x' . dechex($int) : $int;
}
402
403 /********************************************
404 *
405 * Init functions
406 *
407 ********************************************/
/**
 * Initializes a charset for use if a conversion table for it exists in the
 * 'typo3/sysext/core/Resources/Private/Charsets/csconvtbl/' folder.
 * This function is automatically called by the conversion functions.
 *
 * The parsed table is kept in $this->parsedCharsets[$charset] as two maps:
 * 'local' (local char number => UTF-8 sequence) and 'utf8' (the reverse).
 * Only local char numbers above 127 are stored. The parsed table is also
 * written to a cache file below the var path and reused on later requests.
 * An unreadable or malformed cache file is ignored and rebuilt.
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param string $charset The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return int Returns '1' if already loaded, '2' if the charset conversion table was found and parsed (or read from cache).
 * @throws UnknownCharsetException if no charset table was found
 */
protected function initCharset($charset)
{
    // Only process if the charset is not yet loaded
    if (!empty($this->parsedCharsets[$charset])) {
        return 1;
    }
    // Conversion table filename
    $charsetConvTableFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/csconvtbl/' . $charset . '.tbl';
    if (!$charset || !GeneralUtility::validPathStr($charsetConvTableFile) || !@is_file($charsetConvTableFile)) {
        throw new UnknownCharsetException(sprintf('Unknown charset "%s"', $charset), 1508916031);
    }
    // Cache file for charsets:
    // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
    $cacheFile = Environment::getVarPath() . '/charset/charset_' . $charset . '.tbl';
    if ($cacheFile && @is_file($cacheFile)) {
        // The cache only ever contains arrays of scalars, so never instantiate objects from it
        $cachedTable = unserialize((string)file_get_contents($cacheFile), ['allowed_classes' => false]);
        if (is_array($cachedTable) && isset($cachedTable['local'], $cachedTable['utf8'])) {
            $this->parsedCharsets[$charset] = $cachedTable;
            return 2;
        }
        // Unusable cache file: fall through, parse the table and rewrite the cache
    }
    // The "whitespaced" syntax is "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
    $whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
    // Parse conversion table into lines
    $lines = GeneralUtility::trimExplode(LF, file_get_contents($charsetConvTableFile), true);
    // Initialize the internal variable holding the conv. table
    $this->parsedCharsets[$charset] = ['local' => [], 'utf8' => []];
    $detectedType = '';
    foreach ($lines as $value) {
        // Comment line or blanks are ignored
        if (!trim($value) || $value[0] === '#') {
            continue;
        }
        // Detect the syntax once, on the first real line
        if (!$detectedType) {
            $detectedType = preg_match($whitespacedPattern, $value) ? 'whitespaced' : 'ms-token';
        }
        if ($detectedType === 'ms-token') {
            $parts = preg_split('/[=:]/', $value, 3);
            if (count($parts) < 2) {
                // Not a "byte = U+xxxx" line, nothing to map
                continue;
            }
            $hexbyte = $parts[0];
            $utf8 = $parts[1];
        } else {
            $regA = [];
            if (!preg_match($whitespacedPattern, $value, $regA)) {
                // Line does not follow the detected syntax, nothing to map
                continue;
            }
            $hexbyte = $regA[1];
            $utf8 = 'U+' . $regA[2];
        }
        $decval = hexdec(trim($hexbyte));
        // 7-bit values are identical in UTF-8 and need no table entry
        if ($decval > 127) {
            // Strip the "U+" prefix before reading the code point
            $utf8decval = hexdec(substr(trim($utf8), 2));
            $utf8Char = $this->UnumberToChar($utf8decval);
            $this->parsedCharsets[$charset]['local'][$decval] = $utf8Char;
            $this->parsedCharsets[$charset]['utf8'][$utf8Char] = $decval;
        }
    }
    if ($cacheFile) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->parsedCharsets[$charset]));
    }
    return 2;
}
474
/**
 * Initializes the UTF-8 to ASCII transliteration table
 * ($this->toASCII['utf-8']) from the Unicode character database.
 *
 * The table is built from UnicodeData.txt (characters of the BMP only) and
 * the custom Translit.txt, then written to a cache file below the var path.
 * An unreadable or malformed cache file is ignored and rebuilt.
 *
 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
 *
 * @return int|bool Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 */
protected function initUnicodeData()
{
    // Cache file
    $cacheFileASCII = Environment::getVarPath() . '/charset/csascii_utf-8.tbl';
    // Only process if the tables are not yet loaded
    if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8'])) {
        return 1;
    }
    // Use cached version if possible
    if ($cacheFileASCII && @is_file($cacheFileASCII)) {
        // The cache only ever contains an array of strings, so never instantiate objects from it
        $cachedTable = unserialize((string)file_get_contents($cacheFileASCII), ['allowed_classes' => false]);
        if (is_array($cachedTable)) {
            $this->toASCII['utf-8'] = $cachedTable;
            return 2;
        }
        // Unusable cache file: fall through, parse the data files and rewrite the cache
    }
    // Process main Unicode data file
    $unicodeDataFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/UnicodeData.txt';
    if (!(GeneralUtility::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
        return false;
    }
    $fh = fopen($unicodeDataFile, 'rb');
    if (!$fh) {
        return false;
    }
    // Array of temp. decompositions
    $decomposition = [];
    // Array of chars that are marks (eg. composing accents)
    $mark = [];
    // Array of chars that are numbers (eg. digits)
    $number = [];
    // Array of chars to be omitted (eg. Russian hard sign)
    $omit = [];
    // Loop on the result of fgets() rather than on feof(): this neither processes
    // a bogus line at the end of the file nor spins forever on a read error
    while (($line = fgets($fh, 4096)) !== false) {
        $fields = explode(';', rtrim($line));
        if (count($fields) < 9) {
            // Blank or malformed line
            continue;
        }
        // Has a lot of info
        list($char, $name, $cat, , , $decomp, , , $num) = $fields;
        $ord = hexdec($char);
        if ($ord > 65535) {
            // Only process the BMP
            break;
        }
        switch (isset($cat[0]) ? $cat[0] : '') {
            case 'M':
                // mark (accent, umlaut, ...)
                $mark['U+' . $char] = 1;
                break;
            case 'N':
                // numeric value
                if ($ord > 128 && $num !== '') {
                    $number['U+' . $char] = $num;
                }
                break;
        }
        // Accented Latin letters without "official" decomposition
        $match = [];
        if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
            $c = ord($match[2]);
            if ($match[1] === 'SMALL') {
                // Lower case ASCII letters are 32 above their upper case counterparts
                $c += 32;
            }
            $decomposition['U+' . $char] = [dechex($c)];
            continue;
        }
        $match = [];
        if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
            switch ($match[1]) {
                case '<circle>':
                    // add parenthesis as circle replacement, eg (1)
                    $match[2] = '0028 ' . $match[2] . ' 0029';
                    break;
                case '<square>':
                    // add square brackets as square replacement, eg [1]
                    $match[2] = '005B ' . $match[2] . ' 005D';
                    break;
                case '<compat>':
                    // ignore multi char decompositions that start with a space
                    if (preg_match('/^0020 /', $match[2])) {
                        continue 2;
                    }
                    break;
                case '<initial>':
                case '<medial>':
                case '<final>':
                case '<isolated>':
                case '<vertical>':
                    // ignore these decomposition types
                    continue 2;
            }
            $decomposition['U+' . $char] = explode(' ', $match[2]);
        }
    }
    fclose($fh);
    // Process custom decompositions
    $customTranslitFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/Translit.txt';
    if (GeneralUtility::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
        $fh = fopen($customTranslitFile, 'rb');
        if ($fh) {
            while (($line = fgets($fh, 4096)) !== false) {
                if ($line[0] !== '#' && trim($line) !== '') {
                    // A line without ";" counts as a character with empty transliteration
                    list($char, $translit) = array_pad(GeneralUtility::trimExplode(';', $line), 2, '');
                    if (!$translit) {
                        $omit['U+' . $char] = 1;
                    }
                    $decomposition['U+' . $char] = explode(' ', $translit);
                }
            }
            fclose($fh);
        }
    }
    // Decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
    foreach ($decomposition as $from => $to) {
        $code_decomp = [];
        while ($code_value = array_shift($to)) {
            if (isset($decomposition['U+' . $code_value])) {
                // Do recursive decomposition: put the parts back at the front of the queue, in order
                foreach (array_reverse($decomposition['U+' . $code_value]) as $cv) {
                    array_unshift($to, $cv);
                }
            } elseif (!isset($mark['U+' . $code_value])) {
                // Keep everything that is not a mark (marks are removed)
                $code_decomp[] = $code_value;
            }
        }
        if (!empty($code_decomp) || isset($omit[$from])) {
            $decomposition[$from] = $code_decomp;
        } else {
            unset($decomposition[$from]);
        }
    }
    // Create ascii only mapping
    $this->toASCII['utf-8'] = [];
    foreach ($decomposition as $from => $to) {
        $code_decomp = [];
        while ($code_value = array_shift($to)) {
            $ord = hexdec($code_value);
            if ($ord > 127) {
                // Skip decompositions containing non-ASCII chars
                continue 2;
            }
            $code_decomp[] = chr($ord);
        }
        $this->toASCII['utf-8'][$this->UnumberToChar(hexdec($from))] = implode('', $code_decomp);
    }
    // Add numeric decompositions
    foreach ($number as $from => $to) {
        $utf8_char = $this->UnumberToChar(hexdec($from));
        if (!isset($this->toASCII['utf-8'][$utf8_char])) {
            $this->toASCII['utf-8'][$utf8_char] = $to;
        }
    }
    if ($cacheFileASCII) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFileASCII, serialize($this->toASCII['utf-8']));
    }
    return 3;
}
634
/**
 * Initializes the to-ASCII conversion table for a charset other than UTF-8.
 * This function is automatically called by the ASCII transliteration functions.
 *
 * The table maps every local character that has a UTF-8 transliteration to
 * that transliteration. It is written to a cache file below the var path.
 * An unreadable or malformed cache file is ignored and rebuilt.
 *
 * @param string $charset Charset for which to initialize conversion.
 * @return int|bool Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @throws UnknownCharsetException if no conversion table exists for $charset
 */
protected function initToASCII($charset)
{
    // Only process if the table is not yet loaded
    if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) {
        return 1;
    }
    // Use cached version if possible
    $cacheFile = Environment::getVarPath() . '/charset/csascii_' . $charset . '.tbl';
    if ($cacheFile && @is_file($cacheFile)) {
        // The cache only ever contains an array of strings, so never instantiate objects from it
        $cachedTable = unserialize((string)file_get_contents($cacheFile), ['allowed_classes' => false]);
        if (is_array($cachedTable)) {
            $this->toASCII[$charset] = $cachedTable;
            return 2;
        }
        // Unusable cache file: fall through, build the table and rewrite the cache
    }
    // Init UTF-8 conversion for this charset
    if (!$this->initCharset($charset)) {
        return false;
    }
    // UTF-8/ASCII transliteration is used as the base conversion table
    if (!$this->initUnicodeData()) {
        return false;
    }
    // Start from an empty table so that a charset without any transliterable
    // character still ends up with a (cacheable) array instead of an undefined entry
    $this->toASCII[$charset] = [];
    foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
        // Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
        $c = $this->utf8_decode($utf8, $charset);
        if (isset($this->toASCII['utf-8'][$utf8])) {
            $this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
        }
    }
    if ($cacheFile) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->toASCII[$charset]));
    }
    return 3;
}
674
675 /********************************************
676 *
677 * String operation functions
678 *
679 ********************************************/
680
/**
 * Transliterates special chars (like æøåÆØÅ, umlauts etc) to their ASCII
 * equivalents (often two letters, like æ => ae).
 *
 * Dispatches to the mapping routine matching the charset family: UTF-8,
 * EUC based, or - for everything else - single byte.
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert
 * @return string The converted string
 */
public function specCharsToASCII($charset, $string)
{
    if ($charset === 'utf-8') {
        return $this->utf8_char_mapping($string);
    }
    if (isset($this->eucBasedSets[$charset])) {
        return $this->euc_char_mapping($string, $charset);
    }
    // Treat everything else as single-byte encoding
    return $this->sb_char_mapping($string, $charset);
}
700
701 /********************************************
702 *
703 * Internal string operation functions
704 *
705 ********************************************/
/**
 * Transliterates a string of a single byte charset to ASCII, byte by byte.
 * Bytes without an entry in the to-ASCII table are kept as they are.
 *
 * @param string $str The string
 * @param string $charset The charset
 * @return string The converted string
 */
public function sb_char_mapping($str, $charset)
{
    if (!$this->initToASCII($charset)) {
        // No table available, do nothing
        return $str;
    }
    $table = &$this->toASCII[$charset];
    $result = '';
    $offset = 0;
    while (isset($str[$offset])) {
        $byte = $str[$offset];
        $result .= isset($table[$byte]) ? $table[$byte] : $byte;
        $offset++;
    }
    return $result;
}
731
732 /********************************************
733 *
734 * Internal UTF-8 string operation functions
735 *
736 ********************************************/
737
/**
 * Transliterates a UTF-8 string to ASCII, character by character.
 *
 * Characters without an entry in the to-ASCII table are kept as they are.
 * A byte that is neither a 7-bit char nor a lead byte (a stray continuation
 * byte) contributes nothing to the result.
 *
 * @param string $str UTF-8 string
 * @return string The converted string
 */
public function utf8_char_mapping($str)
{
    if (!$this->initUnicodeData()) {
        // No table available, do nothing
        return $str;
    }
    $table = &$this->toASCII['utf-8'];
    $result = '';
    $offset = 0;
    while (isset($str[$offset])) {
        $lead = ord($str[$offset]);
        $character = '';
        if (($lead & 128) === 0) {
            // single-byte (0xxxxxxx)
            $character = $str[$offset];
        } elseif (($lead & 192) === 192) {
            // multi-byte starting byte (11xxxxxx): the number of leading
            // one-bits is the number of bytes in the sequence
            $length = 0;
            while ($lead & 128) {
                $length++;
                $lead = $lead << 1;
            }
            $character = substr($str, $offset, $length);
            // Skip the continuation bytes that were just consumed
            $offset += $length - 1;
        }
        $result .= isset($table[$character]) ? $table[$character] : $character;
        $offset++;
    }
    return $result;
}
776
777 /********************************************
778 *
779 * Internal EUC string operation functions
780 *
781 * Extended Unix Code:
782 * ASCII compatible 7bit single bytes chars
783 * 8bit two byte chars
784 *
785 * Shift-JIS is treated as a special case.
786 *
787 ********************************************/
788
/**
 * Transliterates a string of the EUC charset family to ASCII, character
 * by character. Characters without an entry in the to-ASCII table are kept
 * as they are.
 *
 * @param string $str EUC multibyte character string
 * @param string $charset The charset
 * @return string The converted string
 */
public function euc_char_mapping($str, $charset)
{
    if (!$this->initToASCII($charset)) {
        // No table available, do nothing
        return $str;
    }
    $table = &$this->toASCII[$charset];
    $isShiftJis = $charset === 'shift_jis';
    $result = '';
    $offset = 0;
    while (isset($str[$offset])) {
        $character = $str[$offset];
        $value = ord($character);
        // EUC: every byte from 128 on starts a double-byte char.
        // Shift-JIS: only 128-159 and 224-255 do, 160-223 are single byte.
        $isDoubleByte = $isShiftJis
            ? (($value >= 128 && $value < 160) || $value >= 224)
            : ($value >= 128);
        if ($isDoubleByte) {
            $character = substr($str, $offset, 2);
            $offset++;
        }
        $result .= isset($table[$character]) ? $table[$character] : $character;
        $offset++;
    }
    return $result;
}
828 }