[!!!][TASK] Remove charset functionality for locales
[Packages/TYPO3.CMS.git] / typo3 / sysext / core / Classes / Charset / CharsetConverter.php
1 <?php
2 namespace TYPO3\CMS\Core\Charset;
3
4 /*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use TYPO3\CMS\Core\Localization\Locales;
18 use TYPO3\CMS\Core\SingletonInterface;
19 use TYPO3\CMS\Core\Utility\ExtensionManagementUtility;
20 use TYPO3\CMS\Core\Utility\GeneralUtility;
21
22 /**
23 * Notes on UTF-8
24 *
25 * Functions working on UTF-8 strings:
26 *
27 * - strchr/strstr
28 * - strrchr
29 * - substr_count
30 * - implode/explode/join
31 *
32 * Functions nearly working on UTF-8 strings:
33 *
34 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
35 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
36 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
37 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
38 * - preg_*: Support compiled into PHP by default nowadays, but could be unavailable, need to use modifier
39 *
40 * Functions NOT working on UTF-8 strings:
41 *
42 * - str*cmp
43 * - stristr
44 * - stripos
45 * - substr
46 * - strrev
47 * - split/spliti
48 * - ...
49 */
50
51 /**
52 * Class for conversion between charsets
53 */
54 class CharsetConverter implements SingletonInterface
55 {
56
57 /**
58 * Possible strategies for handling multi-byte data
59 * Only used for internal purpose
60 * @internal
61 */
62 const STRATEGY_MBSTRING = 'mbstring';
63 const STRATEGY_ICONV = 'iconv';
64 const STRATEGY_FALLBACK = 'fallback';
65
66 /**
67 * Set to one of the strategies above, based on the availability of the environment.
68 *
69 * @var string
70 */
71 protected $conversionStrategy = null;
72
73 /**
74 * ASCII Value for chars with no equivalent.
75 *
76 * @var int
77 */
78 public $noCharByteVal = 63;
79
80 /**
81 * This is the array where parsed conversion tables are stored (cached)
82 *
83 * @var array
84 */
85 public $parsedCharsets = array();
86
87 /**
88 * An array where case folding data will be stored (cached)
89 *
90 * @var array
91 */
92 public $caseFolding = array();
93
94 /**
95 * An array where charset-to-ASCII mappings are stored (cached)
96 *
97 * @var array
98 */
99 public $toASCII = array();
100
101 /**
102 * This tells the converter which charsets have two bytes per char:
103 *
104 * @var array
105 */
106 public $twoByteSets = array(
107 'ucs-2' => 1
108 );
109
110 /**
111 * This tells the converter which charsets have four bytes per char:
112 *
113 * @var array
114 */
115 public $fourByteSets = array(
116 'ucs-4' => 1, // 4-byte Unicode
117 'utf-32' => 1
118 );
119
120 /**
121 * This tells the converter which charsets use a scheme like the Extended Unix Code:
122 *
123 * @var array
124 */
125 public $eucBasedSets = array(
126 'gb2312' => 1, // Chinese, simplified.
127 'big5' => 1, // Chinese, traditional.
128 'euc-kr' => 1, // Korean
129 'shift_jis' => 1
130 );
131
132 /**
133 * @link http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
134 * @link http://czyborra.com/charsets/iso8859.html
135 *
136 * @var array
137 */
138 public $synonyms = array(
139 'us' => 'ascii',
140 'us-ascii' => 'ascii',
141 'cp819' => 'iso-8859-1',
142 'ibm819' => 'iso-8859-1',
143 'iso-ir-100' => 'iso-8859-1',
144 'iso-ir-101' => 'iso-8859-2',
145 'iso-ir-109' => 'iso-8859-3',
146 'iso-ir-110' => 'iso-8859-4',
147 'iso-ir-144' => 'iso-8859-5',
148 'iso-ir-127' => 'iso-8859-6',
149 'iso-ir-126' => 'iso-8859-7',
150 'iso-ir-138' => 'iso-8859-8',
151 'iso-ir-148' => 'iso-8859-9',
152 'iso-ir-157' => 'iso-8859-10',
153 'iso-ir-179' => 'iso-8859-13',
154 'iso-ir-199' => 'iso-8859-14',
155 'iso-ir-203' => 'iso-8859-15',
156 'csisolatin1' => 'iso-8859-1',
157 'csisolatin2' => 'iso-8859-2',
158 'csisolatin3' => 'iso-8859-3',
159 'csisolatin5' => 'iso-8859-9',
160 'csisolatin8' => 'iso-8859-14',
161 'csisolatin9' => 'iso-8859-15',
162 'csisolatingreek' => 'iso-8859-7',
163 'iso-celtic' => 'iso-8859-14',
164 'latin1' => 'iso-8859-1',
165 'latin2' => 'iso-8859-2',
166 'latin3' => 'iso-8859-3',
167 'latin5' => 'iso-8859-9',
168 'latin6' => 'iso-8859-10',
169 'latin8' => 'iso-8859-14',
170 'latin9' => 'iso-8859-15',
171 'l1' => 'iso-8859-1',
172 'l2' => 'iso-8859-2',
173 'l3' => 'iso-8859-3',
174 'l5' => 'iso-8859-9',
175 'l6' => 'iso-8859-10',
176 'l8' => 'iso-8859-14',
177 'l9' => 'iso-8859-15',
178 'cyrillic' => 'iso-8859-5',
179 'arabic' => 'iso-8859-6',
180 'tis-620' => 'iso-8859-11',
181 'win874' => 'windows-874',
182 'win1250' => 'windows-1250',
183 'win1251' => 'windows-1251',
184 'win1252' => 'windows-1252',
185 'win1253' => 'windows-1253',
186 'win1254' => 'windows-1254',
187 'win1255' => 'windows-1255',
188 'win1256' => 'windows-1256',
189 'win1257' => 'windows-1257',
190 'win1258' => 'windows-1258',
191 'cp1250' => 'windows-1250',
192 'cp1251' => 'windows-1251',
193 'cp1252' => 'windows-1252',
194 'ms-ee' => 'windows-1250',
195 'ms-ansi' => 'windows-1252',
196 'ms-greek' => 'windows-1253',
197 'ms-turk' => 'windows-1254',
198 'winbaltrim' => 'windows-1257',
199 'koi-8ru' => 'koi-8r',
200 'koi8r' => 'koi-8r',
201 'cp878' => 'koi-8r',
202 'mac' => 'macroman',
203 'macintosh' => 'macroman',
204 'euc-cn' => 'gb2312',
205 'x-euc-cn' => 'gb2312',
206 'euccn' => 'gb2312',
207 'cp936' => 'gb2312',
208 'big-5' => 'big5',
209 'cp950' => 'big5',
210 'eucjp' => 'euc-jp',
211 'sjis' => 'shift_jis',
212 'shift-jis' => 'shift_jis',
213 'cp932' => 'shift_jis',
214 'cp949' => 'euc-kr',
215 'utf7' => 'utf-7',
216 'utf8' => 'utf-8',
217 'utf16' => 'utf-16',
218 'utf32' => 'utf-32',
219 'ucs2' => 'ucs-2',
220 'ucs4' => 'ucs-4'
221 );
222
223 /**
224 * TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
225 * Empty values means "utf-8"
226 *
227 * @var array
228 */
229 public $charSetArray = array(
230 'af' => '',
231 'ar' => 'iso-8859-6',
232 'ba' => 'iso-8859-2',
233 'bg' => 'windows-1251',
234 'br' => '',
235 'ca' => 'iso-8859-15',
236 'ch' => 'gb2312',
237 'cs' => 'windows-1250',
238 'cz' => 'windows-1250',
239 'da' => '',
240 'de' => '',
241 'dk' => '',
242 'el' => 'iso-8859-7',
243 'eo' => 'utf-8',
244 'es' => '',
245 'et' => 'iso-8859-4',
246 'eu' => '',
247 'fa' => 'utf-8',
248 'fi' => '',
249 'fo' => 'utf-8',
250 'fr' => '',
251 'fr_CA' => '',
252 'ga' => '',
253 'ge' => 'utf-8',
254 'gl' => '',
255 'gr' => 'iso-8859-7',
256 'he' => 'utf-8',
257 'hi' => 'utf-8',
258 'hk' => 'big5',
259 'hr' => 'windows-1250',
260 'hu' => 'iso-8859-2',
261 'is' => 'utf-8',
262 'it' => '',
263 'ja' => 'shift_jis',
264 'jp' => 'shift_jis',
265 'ka' => 'utf-8',
266 'kl' => 'utf-8',
267 'km' => 'utf-8',
268 'ko' => 'euc-kr',
269 'kr' => 'euc-kr',
270 'lt' => 'windows-1257',
271 'lv' => 'utf-8',
272 'ms' => '',
273 'my' => '',
274 'nl' => '',
275 'no' => '',
276 'pl' => 'iso-8859-2',
277 'pt' => '',
278 'pt_BR' => '',
279 'qc' => '',
280 'ro' => 'iso-8859-2',
281 'ru' => 'windows-1251',
282 'se' => '',
283 'si' => 'windows-1250',
284 'sk' => 'windows-1250',
285 'sl' => 'windows-1250',
286 'sq' => 'utf-8',
287 'sr' => 'utf-8',
288 'sv' => '',
289 'th' => 'iso-8859-11',
290 'tr' => 'iso-8859-9',
291 'ua' => 'windows-1251',
292 'uk' => 'windows-1251',
293 'vi' => 'utf-8',
294 'vn' => 'utf-8',
295 'zh' => 'big5'
296 );
297
/**
 * Normalizes a charset name.
 *
 * The name is lowercased, surrounding whitespace is stripped and, if the result
 * is a known alias listed in $this->synonyms, the canonical name is returned instead.
 *
 * @param string $charset Charset name as supplied by the caller
 * @return string Normalized (lowercase, alias-resolved) charset name
 */
public function parse_charset($charset)
{
    $normalized = trim(strtolower($charset));
    return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
}
312
313 /********************************************
314 *
315 * Charset Conversion functions
316 *
317 ********************************************/
/**
 * Converts a string from one charset to another.
 *
 * When both charsets are identical the input is returned untouched. Otherwise the
 * native strategy reported by getConversionStrategy() (mbstring or iconv) is tried
 * first, but only if the target is UTF-8 or no entity fallback was requested, because
 * the PHP libraries cannot substitute SGML entities for unmappable characters.
 * If the native conversion is not used or returns FALSE, the string is routed through
 * the table-based utf8_encode() / utf8_decode() pair with UTF-8 as the pivot charset.
 *
 * @param string $inputString Input string
 * @param string $fromCharset Charset the input string is currently encoded in
 * @param string $toCharset Charset the result should be encoded in
 * @param bool $useEntityForNoChar If set, characters missing in the destination charset are written as numeric entities
 * @return string Converted string
 * @see convArray()
 */
public function conv($inputString, $fromCharset, $toCharset, $useEntityForNoChar = false)
{
    if ($fromCharset === $toCharset) {
        return $inputString;
    }

    $nativeConversionAllowed = $toCharset === 'utf-8' || !$useEntityForNoChar;
    if ($nativeConversionAllowed) {
        // Stays FALSE for the fallback strategy and when the library rejects the charsets
        $nativeResult = false;
        switch ($this->getConversionStrategy()) {
            case self::STRATEGY_MBSTRING:
                $nativeResult = mb_convert_encoding($inputString, $toCharset, $fromCharset);
                break;
            case self::STRATEGY_ICONV:
                $nativeResult = iconv($fromCharset, $toCharset . '//TRANSLIT', $inputString);
                break;
        }
        if ($nativeResult !== false) {
            return $nativeResult;
        }
    }

    // Table based conversion, using UTF-8 as the intermediate representation
    $asUtf8 = $fromCharset === 'utf-8'
        ? $inputString
        : $this->utf8_encode($inputString, $fromCharset);

    return $toCharset === 'utf-8'
        ? $asUtf8
        : $this->utf8_decode($asUtf8, $toCharset, $useEntityForNoChar);
}
359
/**
 * Converts every string found in an array (recursing into nested arrays) from one
 * charset to another. Values that are neither arrays nor strings are left alone.
 * NOTICE: The array is passed by reference and modified in place!
 *
 * @param array $array Input array, possibly multidimensional
 * @param string $fromCharset Charset the strings are currently encoded in
 * @param string $toCharset Charset the strings should be encoded in
 * @param bool $useEntityForNoChar If set, characters missing in the destination charset are written as numeric entities
 * @return void
 * @see conv()
 */
public function convArray(&$array, $fromCharset, $toCharset, $useEntityForNoChar = false)
{
    foreach ($array as &$item) {
        if (is_array($item)) {
            $this->convArray($item, $fromCharset, $toCharset, $useEntityForNoChar);
            continue;
        }
        if (is_string($item)) {
            $item = $this->conv($item, $fromCharset, $toCharset, $useEntityForNoChar);
        }
    }
    // Break the reference so later writes to $item cannot touch the last element
    unset($item);
}
381
/**
 * Converts $str from $charset to UTF-8 using the parsed conversion table of the charset.
 *
 * Bytes up to 127 are copied unchanged (except for charsets listed in $twoByteSets,
 * where every character is read as a big endian 16 bit unit). Characters that have no
 * entry in the conversion table are replaced by chr($this->noCharByteVal).
 *
 * @param string $str String in local charset to convert to UTF-8
 * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return string|null Output string, converted to UTF-8. NULL if the charset could not be initialized.
 */
public function utf8_encode($str, $charset)
{
    if ($charset === 'utf-8') {
        return $str;
    }
    // Parse the conversion table if that has not happened yet
    if (!$this->initCharset($charset)) {
        return null;
    }

    $isTwoByteSet = isset($this->twoByteSets[$charset]);
    $isEucBasedSet = isset($this->eucBasedSets[$charset]);
    $strLen = strlen($str);
    $outStr = '';
    for ($a = 0; $a < $strLen; $a++) {
        $chr = substr($str, $a, 1);
        $ord = ord($chr);
        if ($isTwoByteSet) {
            // Two bytes per char, big endian assumed. substr() is used instead of a string
            // offset so that a truncated (odd length) input reads the missing byte as 0
            // instead of raising an "uninitialized string offset" error.
            $a++;
            $ord = $ord << 8 | ord((string)substr($str, $a, 1));
        } elseif ($ord > 127) {
            // EUC based charsets use two bytes for values above 127: fetch the second byte
            // and build a 16 bit number. Shift-JIS bytes between 160 and 223 are single byte.
            if ($isEucBasedSet && ($charset !== 'shift_jis' || $ord < 160 || $ord > 223)) {
                $a++;
                $ord = $ord * 256 + ord((string)substr($str, $a, 1));
            }
        } else {
            // Plain 7 bit byte, identical in UTF-8
            $outStr .= $chr;
            continue;
        }
        // Use the UTF-8 sequence from the conversion table, otherwise the "no char" placeholder
        if (isset($this->parsedCharsets[$charset]['local'][$ord])) {
            $outStr .= $this->parsedCharsets[$charset]['local'][$ord];
        } else {
            $outStr .= chr($this->noCharByteVal);
        }
    }
    return $outStr;
}
440
/**
 * Converts $str from UTF-8 to $charset using the parsed conversion table of the charset.
 *
 * Bytes up to 127 are copied unchanged. Multibyte sequences are looked up in the
 * conversion table; sequences without an entry become a numeric entity (if
 * $useEntityForNoChar is set) or chr($this->noCharByteVal). A byte above 127 that
 * cannot start a sequence is replaced by chr($this->noCharByteVal) as well.
 *
 * @param string $str String in UTF-8 to convert to local charset
 * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param bool $useEntityForNoChar If set, characters missing in the destination charset are written as numeric entities
 * @return string|null Output string, converted to local charset. NULL if the charset could not be initialized.
 */
public function utf8_decode($str, $charset, $useEntityForNoChar = false)
{
    if ($charset === 'utf-8') {
        return $str;
    }
    // Parse the conversion table if that has not happened yet
    if (!$this->initCharset($charset)) {
        return null;
    }

    $length = strlen($str);
    $result = '';
    for ($pos = 0; $pos < $length; $pos++) {
        $byte = substr($str, $pos, 1);
        $code = ord($byte);
        if ($code <= 127) {
            // 7 bit byte, taken over as is
            $result .= $byte;
            continue;
        }
        if (!($code & 64)) {
            // A lead byte must have bit 6 set; otherwise we are in the middle of a sequence
            $result .= chr($this->noCharByteVal);
            continue;
        }
        // Collect the whole sequence: every further high bit set in the lead byte
        // announces one more byte.
        $sequence = $byte;
        for ($n = 0; $n < 8; $n++) {
            $code = $code << 1;
            if (!($code & 128)) {
                break;
            }
            $pos++;
            $sequence .= substr($str, $pos, 1);
        }
        if (isset($this->parsedCharsets[$charset]['utf8'][$sequence])) {
            $localNumber = $this->parsedCharsets[$charset]['utf8'][$sequence];
            if ($localNumber > 255) {
                // Numbers above 255 are written as two bytes (16 bit word assumed)
                $result .= chr(($localNumber >> 8 & 255)) . chr(($localNumber & 255));
            } else {
                $result .= chr($localNumber);
            }
        } elseif ($useEntityForNoChar) {
            $result .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
        } else {
            $result .= chr($this->noCharByteVal);
        }
    }
    return $result;
}
508
/**
 * Replaces every multibyte UTF-8 character by a hexadecimal numeric entity (&#x...;).
 *
 * Bytes up to 127 are copied unchanged. A byte above 127 that cannot start a
 * sequence is replaced by chr($this->noCharByteVal).
 *
 * @param string $str Input string
 * @return string Output string
 */
public function utf8_to_entities($str)
{
    $length = strlen($str);
    $result = '';
    for ($pos = 0; $pos < $length; $pos++) {
        $byte = substr($str, $pos, 1);
        $code = ord($byte);
        if ($code <= 127) {
            // 7 bit byte, taken over as is
            $result .= $byte;
            continue;
        }
        if (!($code & 64)) {
            // A lead byte must have bit 6 set; otherwise we are in the middle of a sequence
            $result .= chr($this->noCharByteVal);
            continue;
        }
        // Collect the whole sequence: every further high bit set in the lead byte
        // announces one more byte.
        $sequence = $byte;
        for ($n = 0; $n < 8; $n++) {
            $code = $code << 1;
            if (!($code & 128)) {
                break;
            }
            $pos++;
            $sequence .= substr($str, $pos, 1);
        }
        $result .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
    }
    return $result;
}
552
/**
 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
 *
 * Hexadecimal entities are accepted with a lowercase or uppercase "x". Numeric
 * entities that are malformed (eg. "&#;", "&#x;" or "&#abc;") and named entities
 * that are unknown or not requested are left in the string exactly as they were.
 *
 * @param string $str Input string, UTF-8
 * @param bool $alsoStdHtmlEnt If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
 * @return string Output string
 */
public function entities_to_utf8($str, $alsoStdHtmlEnt = false)
{
    // Lookup table "&name;" => character, only needed when named entities are requested
    $namedEntities = $alsoStdHtmlEnt
        ? array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8'))
        : array();
    $converter = $this;

    return preg_replace_callback(
        '/&([#[:alnum:]]*);/',
        function (array $match) use ($converter, $namedEntities) {
            $entity = $match[1];
            if ($entity !== '' && $entity[0] === '#') {
                $number = substr($entity, 1);
                $hexMatch = array();
                if (preg_match('/^[xX]([0-9a-fA-F]+)$/D', $number, $hexMatch)) {
                    return $converter->UnumberToChar(hexdec($hexMatch[1]));
                }
                if (preg_match('/^[0-9]+$/D', $number)) {
                    return $converter->UnumberToChar((int)$number);
                }
                // Malformed numeric entity: keep the original text
                return $match[0];
            }
            if (isset($namedEntities[$match[0]])) {
                return $namedEntities[$match[0]];
            }
            // No conversion
            return $match[0];
        },
        $str
    );
}
593
/**
 * Splits a UTF-8 string into its characters and returns them as an array, either as
 * UNICODE integer numbers or (with $retChar) as the UTF-8 characters themselves.
 *
 * A byte above 127 that cannot start a sequence is reported as
 * $this->noCharByteVal (or the matching character with $retChar).
 *
 * @param string $str Input string, UTF-8
 * @param bool $convEntities If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
 * @param bool $retChar If set, then instead of integer numbers the real UTF-8 char is returned.
 * @return array Output array with the char numbers
 */
public function utf8_to_numberarray($str, $convEntities = false, $retChar = false)
{
    if ($convEntities) {
        // Resolve entities first so that they count as single characters
        $str = $this->entities_to_utf8($str, 1);
    }

    $length = strlen($str);
    $characters = array();
    for ($pos = 0; $pos < $length; $pos++) {
        $byte = substr($str, $pos, 1);
        $code = ord($byte);
        if ($code <= 127) {
            // 7 bit byte
            $characters[] = $retChar ? chr($code) : $code;
            continue;
        }
        if (!($code & 64)) {
            // A lead byte must have bit 6 set; otherwise we are in the middle of a sequence
            $characters[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
            continue;
        }
        // Collect the whole sequence: every further high bit set in the lead byte
        // announces one more byte.
        $sequence = $byte;
        for ($n = 0; $n < 8; $n++) {
            $code = $code << 1;
            if (!($code & 128)) {
                break;
            }
            $pos++;
            $sequence .= substr($str, $pos, 1);
        }
        $characters[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
    }
    return $characters;
}
644
/**
 * Converts a UNICODE number to a UTF-8 multibyte character
 * Algorithm based on script found at From: http://czyborra.com/utf/
 *
 * The bits of the number are spread across the bytes of the sequence; the number of
 * high bits set in the lead byte announces the length of the sequence:
 *
 * bytes | bits | representation
 * 1 | 7 | 0vvvvvvv
 * 2 | 11 | 110vvvvv 10vvvvvv
 * 3 | 16 | 1110vvvv 10vvvvvv 10vvvvvv
 * 4 | 21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 * 5 | 26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 * 6 | 31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * Numbers that need more than 31 bits yield chr($this->noCharByteVal).
 *
 * @param int $unicodeInteger UNICODE integer
 * @return string UTF-8 multibyte character string
 * @see utf8CharToUnumber()
 */
public function UnumberToChar($unicodeInteger)
{
    if ($unicodeInteger < 128) {
        return chr($unicodeInteger);
    }

    // Pick the lead byte marker and the number of continuation bytes
    if ($unicodeInteger < 2048) {
        $leadMarker = 192;
        $continuationBytes = 1;
    } elseif ($unicodeInteger < 65536) {
        $leadMarker = 224;
        $continuationBytes = 2;
    } elseif ($unicodeInteger < 2097152) {
        $leadMarker = 240;
        $continuationBytes = 3;
    } elseif ($unicodeInteger < 67108864) {
        $leadMarker = 248;
        $continuationBytes = 4;
    } elseif ($unicodeInteger < 2147483648) {
        $leadMarker = 252;
        $continuationBytes = 5;
    } else {
        // Cannot express a 32-bit character in UTF-8
        return chr($this->noCharByteVal);
    }

    // Lead byte carries the topmost bits, each continuation byte the next 6 bits
    $char = chr($leadMarker | ($unicodeInteger >> (6 * $continuationBytes)));
    for ($shift = 6 * ($continuationBytes - 1); $shift >= 0; $shift -= 6) {
        $char .= chr(128 | (($unicodeInteger >> $shift) & 63));
    }
    return $char;
}
701
/**
 * Converts a UTF-8 Multibyte character to a UNICODE number
 *
 * Only the first character of $str is evaluated. The number of bytes to read is taken
 * from the high bits of the lead byte; bytes missing at the end of a truncated
 * sequence count as 0. If the first byte is not a multibyte lead byte its byte value
 * is returned. An empty string yields 0.
 *
 * @param string $str UTF-8 multibyte character string
 * @param bool $hex If set, then a hex. number is returned.
 * @return int|string UNICODE integer, or with $hex the string "x" followed by the lowercase hexadecimal number
 * @see UnumberToChar()
 */
public function utf8CharToUnumber($str, $hex = false)
{
    $str = (string)$str;
    // Guard against an empty string instead of reading an uninitialized offset
    $lead = $str === '' ? 0 : ord($str[0]);
    // Both top bits set: this IS the lead byte of a multibyte sequence
    if (($lead & 192) === 192) {
        $int = 0;
        $probe = $lead;
        // Every further high bit set in the lead byte announces one continuation
        // byte, each of which contributes its lower 6 bits.
        for ($count = 0; $count < 8; $count++) {
            $probe = $probe << 1;
            if (!($probe & 128)) {
                break;
            }
            $int = ($int << 6) | (ord((string)substr($str, $count + 1, 1)) & 63);
        }
        // The lead byte holds the topmost payload bits: 5 for a two byte sequence
        // down to 1 for a six byte sequence. The invalid lead bytes 0xFE and 0xFF
        // have no payload bits left.
        $leadPayloadBits = 6 - $count;
        if ($leadPayloadBits > 0) {
            $int |= ($lead & ((1 << $leadPayloadBits) - 1)) << (6 * $count);
        }
    } else {
        $int = $lead;
    }
    return $hex ? 'x' . dechex($int) : $int;
}
736
737 /********************************************
738 *
739 * Init functions
740 *
741 ********************************************/
/**
 * This will initialize a charset for use if it's defined in the 'typo3/sysext/core/Resources/Private/Charsets/csconvtbl/' folder
 * This function is automatically called by the conversion functions
 *
 * The parsed table is stored in $this->parsedCharsets[$charset] with two maps:
 * 'local' (local char number => UTF-8 sequence) and 'utf8' (UTF-8 sequence => local
 * char number). Only local char numbers above 127 are stored. A serialized copy is
 * kept in typo3temp/var/charset/ and used on later requests; if that copy cannot be
 * unserialized into an array, the conversion table is parsed again.
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param string $charset The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return int|bool Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed.
 * @access private
 */
public function initCharset($charset)
{
    // Nothing to do if the charset is already loaded. isset() avoids reading an
    // undefined array key on first use.
    if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
        return 1;
    }
    // Conversion table filename:
    $charsetConvTableFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/csconvtbl/' . $charset . '.tbl';
    if (!$charset || !GeneralUtility::validPathStr($charsetConvTableFile) || !@is_file($charsetConvTableFile)) {
        return false;
    }
    // Cache file for charsets:
    // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
    $cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/var/charset/charset_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        // NOTE(review): unserialize() on file content; the cache directory must not be writable by untrusted parties.
        $cachedTable = unserialize(GeneralUtility::getUrl($cacheFile));
        if (is_array($cachedTable)) {
            $this->parsedCharsets[$charset] = $cachedTable;
            return 2;
        }
        // Unreadable or corrupt cache: fall through and rebuild it from the table
    }
    // The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
    $whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
    // Parse conversion table into lines:
    $lines = GeneralUtility::trimExplode(LF, GeneralUtility::getUrl($charsetConvTableFile), true);
    // Initialize the internal variable holding the conv. table:
    $this->parsedCharsets[$charset] = array('local' => array(), 'utf8' => array());
    $detectedType = '';
    foreach ($lines as $value) {
        // Comment line or blanks are ignored.
        if (!trim($value) || $value[0] === '#') {
            continue;
        }
        // Detect type if not done yet: (Done on first real line)
        if (!$detectedType) {
            $detectedType = preg_match($whitespacedPattern, $value) ? 'whitespaced' : 'ms-token';
        }
        if ($detectedType === 'ms-token') {
            // array_pad() keeps list() from reading a missing offset on lines without a separator
            list($hexbyte, $utf8) = array_pad(preg_split('/[=:]/', $value, 3), 2, '');
        } else {
            $regA = array();
            if (!preg_match($whitespacedPattern, $value, $regA)) {
                // Line does not follow the detected syntax; it carries no mapping
                continue;
            }
            $hexbyte = $regA[1];
            $utf8 = 'U+' . $regA[2];
        }
        $decval = hexdec(trim($hexbyte));
        if ($decval > 127) {
            // Strip the leading "U+" and store the mapping in both directions
            $utf8decval = hexdec(substr(trim($utf8), 2));
            $utf8Char = $this->UnumberToChar($utf8decval);
            $this->parsedCharsets[$charset]['local'][$decval] = $utf8Char;
            $this->parsedCharsets[$charset]['utf8'][$utf8Char] = $decval;
        }
    }
    if ($cacheFile) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->parsedCharsets[$charset]));
    }
    return 2;
}
808
809 /**
810 * This function initializes all UTF-8 character data tables.
811 *
812 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
813 *
814 * @param string $mode Mode ("case", "ascii", ...)
815 * @return int Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
816 * @access private
817 */
818 public function initUnicodeData($mode = null)
819 {
820 // Cache files
821 $cacheFileCase = GeneralUtility::getFileAbsFileName('typo3temp/var/charset/cscase_utf-8.tbl');
822 $cacheFileASCII = GeneralUtility::getFileAbsFileName('typo3temp/var/charset/csascii_utf-8.tbl');
823 // Only process if the tables are not yet loaded
824 switch ($mode) {
825 case 'case':
826 if (is_array($this->caseFolding['utf-8'])) {
827 return 1;
828 }
829 // Use cached version if possible
830 if ($cacheFileCase && @is_file($cacheFileCase)) {
831 $this->caseFolding['utf-8'] = unserialize(GeneralUtility::getUrl($cacheFileCase));
832 return 2;
833 }
834 break;
835 case 'ascii':
836 if (is_array($this->toASCII['utf-8'])) {
837 return 1;
838 }
839 // Use cached version if possible
840 if ($cacheFileASCII && @is_file($cacheFileASCII)) {
841 $this->toASCII['utf-8'] = unserialize(GeneralUtility::getUrl($cacheFileASCII));
842 return 2;
843 }
844 break;
845 }
846 // Process main Unicode data file
847 $unicodeDataFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/UnicodeData.txt';
848 if (!(GeneralUtility::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
849 return false;
850 }
851 $fh = fopen($unicodeDataFile, 'rb');
852 if (!$fh) {
853 return false;
854 }
855 // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
856 // Note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
857 $this->caseFolding['utf-8'] = array();
858 $utf8CaseFolding = &$this->caseFolding['utf-8'];
859 // a shorthand
860 $utf8CaseFolding['toUpper'] = array();
861 $utf8CaseFolding['toLower'] = array();
862 $utf8CaseFolding['toTitle'] = array();
863 // Array of temp. decompositions
864 $decomposition = array();
865 // Array of chars that are marks (eg. composing accents)
866 $mark = array();
867 // Array of chars that are numbers (eg. digits)
868 $number = array();
869 // Array of chars to be omitted (eg. Russian hard sign)
870 $omit = array();
871 while (!feof($fh)) {
872 $line = fgets($fh, 4096);
873 // Has a lot of info
874 list($char, $name, $cat, , , $decomp, , , $num, , , , $upper, $lower, $title, ) = explode(';', rtrim($line));
875 $ord = hexdec($char);
876 if ($ord > 65535) {
877 // Only process the BMP
878 break;
879 }
880 $utf8_char = $this->UnumberToChar($ord);
881 if ($upper) {
882 $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
883 }
884 if ($lower) {
885 $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
886 }
887 // Store "title" only when different from "upper" (only a few)
888 if ($title && $title !== $upper) {
889 $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
890 }
891 switch ($cat[0]) {
892 case 'M':
893 // mark (accent, umlaut, ...)
894 $mark['U+' . $char] = 1;
895 break;
896 case 'N':
897 // numeric value
898 if ($ord > 128 && $num !== '') {
899 $number['U+' . $char] = $num;
900 }
901 }
902 // Accented Latin letters without "official" decomposition
903 $match = array();
904 if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
905 $c = ord($match[2]);
906 if ($match[1] === 'SMALL') {
907 $c += 32;
908 }
909 $decomposition['U+' . $char] = array(dechex($c));
910 continue;
911 }
912 $match = array();
913 if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
914 switch ($match[1]) {
915 case '<circle>':
916 // add parenthesis as circle replacement, eg (1)
917 $match[2] = '0028 ' . $match[2] . ' 0029';
918 break;
919 case '<square>':
920 // add square brackets as square replacement, eg [1]
921 $match[2] = '005B ' . $match[2] . ' 005D';
922 break;
923 case '<compat>':
924 // ignore multi char decompositions that start with a space
925 if (preg_match('/^0020 /', $match[2])) {
926 continue 2;
927 }
928 break;
929 case '<initial>':
930 case '<medial>':
931 case '<final>':
932 case '<isolated>':
933 case '<vertical>':
934 continue 2;
935 }
936 $decomposition['U+' . $char] = explode(' ', $match[2]);
937 }
938 }
939 fclose($fh);
940 // Process additional Unicode data for casing (allow folded characters to expand into a sequence)
941 $specialCasingFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/SpecialCasing.txt';
942 if (GeneralUtility::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
943 $fh = fopen($specialCasingFile, 'rb');
944 if ($fh) {
945 while (!feof($fh)) {
946 $line = fgets($fh, 4096);
947 if ($line[0] !== '#' && trim($line) !== '') {
948 list($char, $lower, $title, $upper, $cond) = GeneralUtility::trimExplode(';', $line);
949 if ($cond === '' || $cond[0] === '#') {
950 $utf8_char = $this->UnumberToChar(hexdec($char));
951 if ($char !== $lower) {
952 $arr = explode(' ', $lower);
953 for ($i = 0; isset($arr[$i]); $i++) {
954 $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
955 }
956 $utf8CaseFolding['toLower'][$utf8_char] = implode('', $arr);
957 }
958 if ($char !== $title && $title !== $upper) {
959 $arr = explode(' ', $title);
960 for ($i = 0; isset($arr[$i]); $i++) {
961 $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
962 }
963 $utf8CaseFolding['toTitle'][$utf8_char] = implode('', $arr);
964 }
965 if ($char !== $upper) {
966 $arr = explode(' ', $upper);
967 for ($i = 0; isset($arr[$i]); $i++) {
968 $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
969 }
970 $utf8CaseFolding['toUpper'][$utf8_char] = implode('', $arr);
971 }
972 }
973 }
974 }
975 fclose($fh);
976 }
977 }
978 // Process custom decompositions
979 $customTranslitFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/Translit.txt';
980 if (GeneralUtility::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
981 $fh = fopen($customTranslitFile, 'rb');
982 if ($fh) {
983 while (!feof($fh)) {
984 $line = fgets($fh, 4096);
985 if ($line[0] !== '#' && trim($line) !== '') {
986 list($char, $translit) = GeneralUtility::trimExplode(';', $line);
987 if (!$translit) {
988 $omit['U+' . $char] = 1;
989 }
990 $decomposition['U+' . $char] = explode(' ', $translit);
991 }
992 }
993 fclose($fh);
994 }
995 }
996 // Decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
997 foreach ($decomposition as $from => $to) {
998 $code_decomp = array();
999 while ($code_value = array_shift($to)) {
1000 // Do recursive decomposition
1001 if (isset($decomposition['U+' . $code_value])) {
1002 foreach (array_reverse($decomposition['U+' . $code_value]) as $cv) {
1003 array_unshift($to, $cv);
1004 }
1005 } elseif (!isset($mark['U+' . $code_value])) {
1006 // remove mark
1007 array_push($code_decomp, $code_value);
1008 }
1009 }
1010 if (!empty($code_decomp) || isset($omit[$from])) {
1011 $decomposition[$from] = $code_decomp;
1012 } else {
1013 unset($decomposition[$from]);
1014 }
1015 }
1016 // Create ascii only mapping
1017 $this->toASCII['utf-8'] = array();
1018 $ascii = &$this->toASCII['utf-8'];
1019 foreach ($decomposition as $from => $to) {
1020 $code_decomp = array();
1021 while ($code_value = array_shift($to)) {
1022 $ord = hexdec($code_value);
1023 if ($ord > 127) {
1024 continue 2;
1025 } else {
1026 // Skip decompositions containing non-ASCII chars
1027 array_push($code_decomp, chr($ord));
1028 }
1029 }
1030 $ascii[$this->UnumberToChar(hexdec($from))] = join('', $code_decomp);
1031 }
1032 // Add numeric decompositions
1033 foreach ($number as $from => $to) {
1034 $utf8_char = $this->UnumberToChar(hexdec($from));
1035 if (!isset($ascii[$utf8_char])) {
1036 $ascii[$utf8_char] = $to;
1037 }
1038 }
1039 if ($cacheFileCase) {
1040 GeneralUtility::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
1041 }
1042 if ($cacheFileASCII) {
1043 GeneralUtility::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
1044 }
1045 return 3;
1046 }
1047
/**
 * Initializes the case folding table for a charset other than UTF-8.
 * This function is automatically called by the case folding functions.
 *
 * The table is looked up in memory first, then in the cache file below
 * typo3temp/var/charset/. If neither exists it is derived from the UTF-8
 * case folding table and the parsed charset table, completed with the plain
 * ASCII letters and written to the cache file.
 *
 * @param string $charset Charset for which to initialize case folding.
 * @return int|bool FALSE on error, a TRUE value on success: 1 table already loaded, 2 cached version, 3 table parsed (and cached).
 * @access private
 */
public function initCaseFolding($charset)
{
    // Only process if the case table is not yet loaded.
    // isset() keeps the very first call for a charset from raising an undefined index notice.
    if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset])) {
        return 1;
    }
    // Use cached version if possible
    $cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/var/charset/cscase_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $this->caseFolding[$charset] = unserialize(GeneralUtility::getUrl($cacheFile));
        return 2;
    }
    // Init UTF-8 conversion for this charset
    if (!$this->initCharset($charset)) {
        return false;
    }
    // UTF-8 case folding is used as the base conversion table
    if (!$this->initUnicodeData('case')) {
        return false;
    }
    $nochar = chr($this->noCharByteVal);
    $directions = array('toUpper', 'toLower', 'toTitle');
    foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
        // Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
        $c = $this->utf8_decode($utf8, $charset);
        foreach ($directions as $direction) {
            // Most characters have no folding in a given direction; skipping them here
            // avoids one undefined index notice per character and direction.
            if (!isset($this->caseFolding['utf-8'][$direction][$utf8])) {
                continue;
            }
            $cc = $this->utf8_decode($this->caseFolding['utf-8'][$direction][$utf8], $charset);
            // Only keep foldings that can be represented in the target charset
            if ($cc !== '' && $cc !== $nochar) {
                $this->caseFolding[$charset][$direction][$c] = $cc;
            }
        }
    }
    // Add the ASCII case table (upper and lower case letters are 32 code points apart)
    for ($i = ord('a'); $i <= ord('z'); $i++) {
        $this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i - 32);
    }
    for ($i = ord('A'); $i <= ord('Z'); $i++) {
        $this->caseFolding[$charset]['toLower'][chr($i)] = chr($i + 32);
    }
    if ($cacheFile) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->caseFolding[$charset]));
    }
    return 3;
}
1109
/**
 * Initializes the to-ASCII conversion table for a charset other than UTF-8.
 * This function is automatically called by the ASCII transliteration functions.
 *
 * The table is looked up in memory first, then in the cache file below
 * typo3temp/var/charset/. If neither exists it is derived from the UTF-8
 * transliteration table and the parsed charset table and written to the cache file.
 *
 * @param string $charset Charset for which to initialize conversion.
 * @return int|bool FALSE on error, a TRUE value on success: 1 table already loaded, 2 cached version, 3 table parsed (and cached).
 * @access private
 */
public function initToASCII($charset)
{
    // Only process if the table is not yet loaded.
    // isset() keeps the very first call for a charset from raising an undefined index notice.
    if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) {
        return 1;
    }
    // Use cached version if possible
    $cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/var/charset/csascii_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $this->toASCII[$charset] = unserialize(GeneralUtility::getUrl($cacheFile));
        return 2;
    }
    // Init UTF-8 conversion for this charset
    if (!$this->initCharset($charset)) {
        return false;
    }
    // UTF-8/ASCII transliteration is used as the base conversion table
    if (!$this->initUnicodeData('ascii')) {
        return false;
    }
    // Start from an empty table so that a charset without any transliterable
    // character still ends up with an array (and not an undefined entry).
    $this->toASCII[$charset] = array();
    foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
        // Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
        $c = $this->utf8_decode($utf8, $charset);
        if (isset($this->toASCII['utf-8'][$utf8])) {
            $this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
        }
    }
    if ($cacheFile) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->toASCII[$charset]));
    }
    return 3;
}
1150
1151 /********************************************
1152 *
1153 * String operation functions
1154 *
1155 ********************************************/
/**
 * Returns a part of a string.
 *
 * Uses mbstring or iconv when available, otherwise falls back to the internal
 * UTF-8 / EUC implementations or to byte arithmetic for fixed-width charsets.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param int $start Start position (character position)
 * @param int $len Length (in characters), NULL means "up to the end of the string"
 * @return string The substring
 * @see substr(), mb_substr()
 */
public function substr($charset, $string, $start, $len = null)
{
    if ($len === 0 || $string === '') {
        return '';
    }
    $strategy = $this->getConversionStrategy();
    if ($strategy === self::STRATEGY_MBSTRING) {
        // mb_substr() treats a NULL length as "up to the end of the string",
        // so there is no need to touch the global internal encoding.
        return mb_substr($string, $start, $len, $charset);
    }
    if ($strategy === self::STRATEGY_ICONV) {
        // Cannot omit $len, when specifying charset
        if ($len === null) {
            // Save internal encoding
            $enc = iconv_get_encoding('internal_encoding');
            iconv_set_encoding('internal_encoding', $charset);
            $str = iconv_substr($string, $start);
            // Restore internal encoding
            iconv_set_encoding('internal_encoding', $enc);
            return $str;
        }
        return iconv_substr($string, $start, $len, $charset);
    }
    if ($charset === 'utf-8') {
        return $this->utf8_substr($string, $start, $len);
    }
    // !empty() instead of a plain lookup: unknown charsets must not raise a notice
    if (!empty($this->eucBasedSets[$charset])) {
        return $this->euc_substr($string, $start, $charset, $len);
    }
    // Fixed-width charsets: translate character positions into byte positions.
    // Everything else is treated as single-byte encoding.
    if (!empty($this->twoByteSets[$charset])) {
        $bytesPerChar = 2;
    } elseif (!empty($this->fourByteSets[$charset])) {
        $bytesPerChar = 4;
    } else {
        $bytesPerChar = 1;
    }
    $byteStart = $start * $bytesPerChar;
    // A NULL length must not be multiplied (it would become 0 and yield an empty result)
    return $len === null ? substr($string, $byteStart) : substr($string, $byteStart, $len * $bytesPerChar);
}
1210
/**
 * Counts the number of characters.
 *
 * Uses mbstring or iconv when available, otherwise falls back to the internal
 * UTF-8 / EUC implementations or to byte arithmetic for fixed-width charsets.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @return int The number of characters
 * @see strlen()
 */
public function strlen($charset, $string)
{
    $strategy = $this->getConversionStrategy();
    if ($strategy === self::STRATEGY_MBSTRING) {
        return mb_strlen($string, $charset);
    }
    if ($strategy === self::STRATEGY_ICONV) {
        return iconv_strlen($string, $charset);
    }
    if ($charset === 'utf-8') {
        return $this->utf8_strlen($string);
    }
    // !empty() instead of a plain lookup: unknown charsets must not raise a notice
    if (!empty($this->eucBasedSets[$charset])) {
        return $this->euc_strlen($string, $charset);
    }
    if (!empty($this->twoByteSets[$charset])) {
        // Cast keeps the documented int return type even for a truncated last character
        return (int)(strlen($string) / 2);
    }
    if (!empty($this->fourByteSets[$charset])) {
        return (int)(strlen($string) / 4);
    }
    // Treat everything else as single-byte encoding
    return strlen($string);
}
1238
/**
 * Crops a string with the help of the mbstring functions.
 *
 * A positive $len keeps the first $len characters and appends $crop, a negative
 * $len keeps the last |$len| characters and prepends $crop. The string is returned
 * untouched when $len is 0 or when it is not longer than |$len| characters.
 *
 * @param string $charset The character set
 * @param string $string String to be cropped
 * @param int $len Crop length (in characters)
 * @param string $crop Crop signifier
 * @return string The shortened string
 * @see mb_strlen(), mb_substr()
 */
protected function cropMbstring($charset, $string, $len, $crop = '')
{
    if ((int)$len === 0) {
        return $string;
    }
    $characterCount = mb_strlen($string, $charset);
    if ($characterCount <= abs($len)) {
        // Nothing to cut off
        return $string;
    }
    if ($len > 0) {
        return mb_substr($string, 0, $len, $charset) . $crop;
    }
    return $crop . mb_substr($string, $len, $characterCount, $charset);
}
1261
/**
 * Truncates a string and pre-/appends a string.
 *
 * A positive $len keeps characters from the start and appends $crop, a negative
 * $len keeps characters from the end and prepends $crop. With mbstring available
 * the work is delegated to cropMbstring().
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param int $len Length (in characters)
 * @param string $crop Crop signifier
 * @return string The shortened string
 * @see substr(), mb_strimwidth()
 */
public function crop($charset, $string, $len, $crop = '')
{
    if ($this->getConversionStrategy() === self::STRATEGY_MBSTRING) {
        return $this->cropMbstring($charset, $string, $len, $crop);
    }
    if ((int)$len === 0) {
        return $string;
    }
    $keepStart = $len > 0;
    // Translate the character length into a byte position
    if ($charset === 'utf-8') {
        $bytePos = $this->utf8_char2byte_pos($string, $len);
    } elseif ($this->eucBasedSets[$charset]) {
        $bytePos = $this->euc_char2byte_pos($string, $len, $charset);
    } elseif ($keepStart) {
        // Single-byte encoding: character position equals byte position
        $bytePos = $len;
    } else {
        $bytePos = strlen($string) + $len;
        if ($bytePos <= 0) {
            $bytePos = false;
        }
    }
    // $len outside actual string length
    if ($bytePos === false) {
        return $string;
    }
    if ($keepStart && isset($string[$bytePos])) {
        return substr($string, 0, $bytePos) . $crop;
    }
    if (!$keepStart && isset($string[$bytePos - 1])) {
        return $crop . substr($string, $bytePos);
    }
    return $string;
}
1311
/**
 * Cuts a string short at a given byte length without splitting a character.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param int $len The byte length
 * @return string The shortened string
 * @see mb_strcut()
 */
public function strtrunc($charset, $string, $len)
{
    if ($len <= 0) {
        return '';
    }
    if ($this->getConversionStrategy() === self::STRATEGY_MBSTRING) {
        return mb_strcut($string, 0, $len, $charset);
    }
    if ($charset === 'utf-8') {
        return $this->utf8_strtrunc($string, $len);
    }
    if ($this->eucBasedSets[$charset]) {
        return $this->euc_strtrunc($string, $len, $charset);
    }
    if ($this->twoByteSets[$charset]) {
        // Realign to a position dividable by two
        $len -= $len % 2;
    } elseif ($this->fourByteSets[$charset]) {
        // Realign to a position dividable by four
        $len -= $len % 4;
    }
    // Treat everything else as single-byte encoding
    return substr($string, 0, $len);
}
1344
/**
 * Converts all characters of a string to lower or upper case.
 * Unlike strtolower() and strtoupper() this method is locale independent.
 * Note that the string length may change,
 * e.g. lower case German "ß" (sharp S) becomes upper case "SS".
 * Real case folding is language dependent, this method ignores this fact.
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert case for
 * @param string $case Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper")
 * @return string The converted string
 * @see strtolower(), strtoupper()
 */
public function conv_case($charset, $string, $case)
{
    if ($this->getConversionStrategy() === self::STRATEGY_MBSTRING) {
        return $case === 'toLower'
            ? mb_strtolower($string, $charset)
            : mb_strtoupper($string, $charset);
    }
    if ($charset === 'utf-8') {
        return $this->utf8_char_mapping($string, 'case', $case);
    }
    if (isset($this->eucBasedSets[$charset])) {
        return $this->euc_char_mapping($string, $charset, 'case', $case);
    }
    // Treat everything else as single-byte encoding
    return $this->sb_char_mapping($string, $charset, 'case', $case);
}
1377
/**
 * Equivalent of lcfirst/ucfirst but charset aware: only the first character
 * of the string is case converted, the remainder is returned unchanged.
 *
 * @param string $charset Character set of string
 * @param string $string Input string
 * @param string $case Case keyword as accepted by conv_case()
 * @return string The string with its first character converted
 * @see \TYPO3\CMS\Core\Charset\CharsetConverter::conv_case()
 */
public function convCaseFirst($charset, $string, $case)
{
    $convertedHead = $this->conv_case($charset, $this->substr($charset, $string, 0, 1), $case);
    return $convertedHead . $this->substr($charset, $string, 1);
}
1394
/**
 * Converts special chars (like æøåÆØÅ, umlauts etc) to ASCII equivalents
 * (usually two letters, like æ => ae).
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert
 * @return string The converted string
 */
public function specCharsToASCII($charset, $string)
{
    if ($charset === 'utf-8') {
        return $this->utf8_char_mapping($string, 'ascii');
    }
    if (isset($this->eucBasedSets[$charset])) {
        return $this->euc_char_mapping($string, $charset, 'ascii');
    }
    // Treat everything else as single-byte encoding
    return $this->sb_char_mapping($string, $charset, 'ascii');
}
1414
/**
 * Converts the language codes that we get from the client (usually HTTP_ACCEPT_LANGUAGE)
 * into a TYPO3-readable language code.
 *
 * The codes are ranked by their quality value (";q=", 1.0 when missing). For each
 * code the full code is tried first, then the code without its country part.
 *
 * @param string $languageCodesList List of language codes. something like 'de,en-us;q=0.9,de-de;q=0.7,es-cl;q=0.6,en;q=0.4,es;q=0.3,zh;q=0.1'
 * @return string A preferred language that TYPO3 supports, or "default" if none found
 */
public function getPreferredClientLanguage($languageCodesList)
{
    $knownCodes = $this->getAllLanguageCodes();
    // Collect the requested codes together with their quality value
    $qualityByCode = array();
    foreach (GeneralUtility::trimExplode(',', $languageCodesList) as $entry) {
        $quality = 1.0;
        if (strpos($entry, ';q=') !== false) {
            $parts = explode(';q=', $entry);
            $entry = $parts[0];
            $quality = $parts[1];
        }
        $qualityByCode[$entry] = $quality;
    }
    // Highest priority first
    arsort($qualityByCode, SORT_NUMERIC);
    $selectedLanguage = 'default';
    foreach (array_keys($qualityByCode) as $code) {
        // Try the full code, then the code with the country part stripped from the end
        $codeParts = explode('-', $code);
        foreach (array($code, $codeParts[0]) as $candidate) {
            if (isset($knownCodes[$candidate])) {
                $selectedLanguage = $knownCodes[$candidate];
                break 2;
            }
        }
    }
    if (!$selectedLanguage || $selectedLanguage === 'en') {
        return 'default';
    }
    return $selectedLanguage;
}
1455
/**
 * Merges all available charsets and locales, currently only used for getPreferredClientLanguage()
 *
 * @return array Map with the ISO style language code as key and the TYPO3 language key as value
 */
protected function getAllLanguageCodes()
{
    // Start with all languages where the TYPO3 code is the same as the ISO code
    $typo3Codes = array_keys($this->charSetArray);
    $isoCodeByTypo3Code = array_combine($typo3Codes, $typo3Codes);
    // Languages where the TYPO3 code differs from the ISO code or needs the
    // country part: the ISO code replaces the default value stored for that key
    /** @var Locales $locales */
    $locales = GeneralUtility::makeInstance(Locales::class);
    foreach ($locales->getIsoMapping() as $typo3Code => $isoCode) {
        // Locale notation (underscore) to language tag notation (dash)
        $isoCodeByTypo3Code[$typo3Code] = str_replace('_', '-', $isoCode);
    }
    // Move the ISO codes to the keys, because the caller checks the keys with isset()
    return array_flip($isoCodeByTypo3Code);
}
1478
1479 /********************************************
1480 *
1481 * Internal string operation functions
1482 *
1483 ********************************************/
/**
 * Maps all characters of a string in a single byte charset.
 * Bytes without an entry in the selected table are copied unchanged.
 *
 * @param string $str The string
 * @param string $charset The charset
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string, or the input string if the mode is unknown or the table could not be initialized
 */
public function sb_char_mapping($str, $charset, $mode, $opt = '')
{
    // Loose comparison on purpose: it selects the mode exactly like a switch statement does
    if ($mode == 'case') {
        if (!$this->initCaseFolding($charset)) {
            return $str;
        }
        $table = &$this->caseFolding[$charset][$opt];
    } elseif ($mode == 'ascii') {
        if (!$this->initToASCII($charset)) {
            return $str;
        }
        $table = &$this->toASCII[$charset];
    } else {
        return $str;
    }
    $result = '';
    $pos = 0;
    while (isset($str[$pos])) {
        $byte = $str[$pos];
        $result .= isset($table[$byte]) ? $table[$byte] : $byte;
        $pos++;
    }
    return $result;
}
1524
1525 /********************************************
1526 *
1527 * Internal UTF-8 string operation functions
1528 *
1529 ********************************************/
/**
 * Returns a part of a UTF-8 string.
 *
 * @param string $str UTF-8 string
 * @param int $start Start position (character position)
 * @param int $len Length (in characters)
 * @return string|bool The substring, FALSE if a positive $start lies outside the string
 * @see substr()
 */
public function utf8_substr($str, $start, $len = null)
{
    if ((string)$len === '0') {
        return '';
    }
    $byteStart = $this->utf8_char2byte_pos($str, $start);
    if ($byteStart === false && $start > 0) {
        // $start outside string length
        return false;
    }
    $tail = substr($str, $byteStart);
    if ($len == null) {
        return $tail;
    }
    $byteEnd = $this->utf8_char2byte_pos($tail, $len);
    if ($byteEnd !== false) {
        return substr($tail, 0, $byteEnd);
    }
    // $len outside actual string length: a negative length exceeding the
    // string yields a blank string, a positive one the complete remainder
    return $len < 0 ? '' : $tail;
}
1566
/**
 * Counts the number of characters of a string in UTF-8.
 *
 * Every byte that is not a continuation byte (10xxxxxx) starts a character,
 * so only single-byte characters and multi-byte start bytes are counted.
 *
 * @param string $str UTF-8 multibyte character string
 * @return int The number of characters
 * @see strlen()
 */
public function utf8_strlen($str)
{
    $characters = 0;
    $offset = 0;
    while (isset($str[$offset])) {
        if ((ord($str[$offset]) & 192) !== 128) {
            $characters++;
        }
        $offset++;
    }
    return $characters;
}
1590
/**
 * Truncates a string in UTF-8 short at a given byte length.
 * A multi-byte character that would be split by the cut is dropped completely.
 *
 * @param string $str UTF-8 multibyte character string
 * @param int $len The byte length
 * @return string The shortened string, an empty string if $len is not positive
 * @see mb_strcut()
 */
public function utf8_strtrunc($str, $len)
{
    if ($len <= 0) {
        return '';
    }
    if ($len > strlen($str)) {
        // Nothing to cut; also avoids reading a byte behind the end of the string
        return $str;
    }
    $i = $len - 1;
    // Last kept byte is part of a multibyte sequence
    if (ord($str[$i]) & 128) {
        // Walk back to the start byte of that sequence. Index 0 has to be
        // inspected as well, otherwise a character at the very beginning is lost.
        while ($i >= 0 && (ord($str[$i]) & 192) === 128) {
            $i--;
        }
        if ($i < 0) {
            // Only continuation bytes found, no complete character fits
            return '';
        }
        // Calculate the number of bytes announced by the start byte (count of leading 1 bits)
        for ($bc = 0, $mbs = ord($str[$i]); $mbs & 128; $mbs = $mbs << 1) {
            $bc++;
        }
        if ($bc + $i > $len) {
            // The sequence would exceed the byte length: cut right before it
            return substr($str, 0, $i);
        }
    }
    return substr($str, 0, $len);
}
1620
/**
 * Find position of first occurrence of a string, both arguments are in UTF-8.
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 string to search for
 * @param int $offset Position to start the search
 * @return int|bool The character position, FALSE if the needle was not found or the offset lies beyond the string
 * @see strpos()
 */
public function utf8_strpos($haystack, $needle, $offset = 0)
{
    $strategy = $this->getConversionStrategy();
    if ($strategy === self::STRATEGY_MBSTRING) {
        return mb_strpos($haystack, $needle, $offset, 'utf-8');
    }
    if ($strategy === self::STRATEGY_ICONV) {
        return iconv_strpos($haystack, $needle, $offset, 'utf-8');
    }
    // Fallback: search on byte level and translate the positions
    $byteOffset = $this->utf8_char2byte_pos($haystack, $offset);
    if ($byteOffset === false) {
        // Offset beyond string length
        return false;
    }
    $bytePosition = strpos($haystack, $needle, $byteOffset);
    // FALSE means the needle was not found
    return $bytePosition === false ? false : $this->utf8_byte2char_pos($haystack, $bytePosition);
}
1649
/**
 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 character to search for (single character)
 * @return int|bool The character position, FALSE if the needle was not found
 * @see strrpos()
 */
public function utf8_strrpos($haystack, $needle)
{
    $strategy = $this->getConversionStrategy();
    if ($strategy === self::STRATEGY_MBSTRING) {
        // The third parameter of mb_strrpos() is the offset, the encoding comes fourth
        return mb_strrpos($haystack, $needle, 0, 'utf-8');
    }
    if ($strategy === self::STRATEGY_ICONV) {
        return iconv_strrpos($haystack, $needle, 'utf-8');
    }
    // Fallback: search on byte level and translate the position
    $byte_pos = strrpos($haystack, $needle);
    if ($byte_pos === false) {
        // Needle not found
        return false;
    }
    return $this->utf8_byte2char_pos($haystack, $byte_pos);
}
1672
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * Explicit bounds checks are used instead of isset() on the string offset,
 * because PHP 7.1+ resolves negative string offsets from the end of the
 * string, which would let a backward scan wrap around.
 *
 * @param string $str UTF-8 string
 * @param int $pos Character position (negative values start from the end)
 * @return int|bool Byte position, FALSE if the position lies outside the string
 */
public function utf8_char2byte_pos($str, $pos)
{
    $length = strlen($str);
    // Number of characters found
    $n = 0;
    // Number of characters wanted
    $p = abs($pos);
    if ($pos >= 0) {
        $i = 0;
        $d = 1;
    } else {
        $i = $length - 1;
        $d = -1;
    }
    for (; $i >= 0 && $i < $length && $n < $p; $i += $d) {
        // Single-byte characters (0xxxxxxx) and multi-byte start bytes (11xxxxxx)
        // begin a character, continuation bytes (10xxxxxx) do not
        if ((ord($str[$i]) & 192) !== 128) {
            $n++;
        }
    }
    if ($i < 0 || $i >= $length) {
        // Offset beyond string length
        return false;
    }
    if ($pos >= 0) {
        // Skip trailing multi-byte data bytes
        while ($i < $length && (ord($str[$i]) & 192) === 128) {
            $i++;
        }
    } else {
        // Correct offset: the scan stopped one byte before the wanted character
        $i++;
    }
    return $i;
}
1719
/**
 * Translates an 'absolute' byte position into a character position.
 *
 * The result is the number of characters that start before the given byte
 * position. The position directly behind the last byte is accepted and
 * yields the number of characters of the string.
 *
 * @param string $str UTF-8 string
 * @param int $pos Byte position
 * @return int|bool Character position, FALSE if the string is empty or the position lies outside the string
 */
public function utf8_byte2char_pos($str, $pos)
{
    $length = strlen($str);
    $pos = (int)$pos;
    if ($length === 0 || $pos < 0 || $pos > $length) {
        // Offset beyond string length
        return false;
    }
    // Number of characters
    $n = 0;
    for ($i = $pos; $i > 0; $i--) {
        // The end of the string counts as a character boundary; everywhere else only
        // single-byte characters and multi-byte start bytes (no 10xxxxxx bytes) count
        if ($i === $length || (ord($str[$i]) & 192) !== 128) {
            $n++;
        }
    }
    return $n;
}
1748
/**
 * Maps all characters of an UTF-8 string.
 * Characters without an entry in the selected table are copied unchanged.
 *
 * @param string $str UTF-8 string
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string, or the input string if the mode is unknown or the Unicode data could not be initialized
 */
public function utf8_char_mapping($str, $mode, $opt = '')
{
    if (!$this->initUnicodeData($mode)) {
        // Do nothing
        return $str;
    }
    $out = '';
    switch ($mode) {
        case 'case':
            $map = &$this->caseFolding['utf-8'][$opt];
            break;
        case 'ascii':
            $map = &$this->toASCII['utf-8'];
            break;
        default:
            return $str;
    }
    for ($i = 0; isset($str[$i]); $i++) {
        $c = ord($str[$i]);
        if (($c & 192) === 192) {
            // Multi-byte starting byte (11xxxxxx): the number of leading
            // 1 bits is the number of bytes of the sequence
            for ($bc = 0; $c & 128; $c = $c << 1) {
                $bc++;
            }
            $mbc = substr($str, $i, $bc);
            $i += $bc - 1;
        } else {
            // Single-byte character (0xxxxxxx). A stray continuation byte (10xxxxxx) in
            // malformed input is handled as is, too, instead of repeating the previous character.
            $mbc = $str[$i];
        }
        if (isset($map[$mbc])) {
            $out .= $map[$mbc];
        } else {
            $out .= $mbc;
        }
    }
    return $out;
}
1796
1797 /********************************************
1798 *
1799 * Internal EUC string operation functions
1800 *
1801 * Extended Unix Code:
1802 * ASCII compatible 7bit single bytes chars
1803 * 8bit two byte chars
1804 *
1805 * Shift-JIS is treated as a special case.
1806 *
1807 ********************************************/
/**
 * Cuts a string in the EUC charset family short at a given byte length.
 * A double-byte character that would be split by the cut is dropped completely.
 *
 * @param string $str EUC multibyte character string
 * @param int $len The byte length
 * @param string $charset The charset ('shift_jis' uses its own lead byte ranges)
 * @return string The shortened string, an empty string if $len is not positive
 * @see mb_strcut()
 */
public function euc_strtrunc($str, $len, $charset)
{
    if ($len <= 0) {
        return '';
    }
    // String not longer than the supplied length. Comparing the byte lengths (instead of
    // probing the offset reached by the scan) keeps a double-byte character that straddles
    // the limit at the very end of the string from being returned in full.
    if (strlen($str) <= $len) {
        return $str;
    }
    $shiftJis = $charset === 'shift_jis';
    $i = 0;
    while ($i < $len) {
        $c = ord($str[$i]);
        if ($shiftJis) {
            $doubleByte = $c >= 128 && $c < 160 || $c >= 224;
        } else {
            $doubleByte = $c >= 128;
        }
        $i += $doubleByte ? 2 : 1;
    }
    if ($i > $len) {
        // The byte at $len - 1 is the first byte of a double-byte character
        return substr($str, 0, $len - 1);
    }
    return substr($str, 0, $len);
}
1843
/**
 * Returns a part of a string in the EUC charset family.
 *
 * @param string $str EUC multibyte character string
 * @param int $start Start position (character position)
 * @param string $charset The charset
 * @param int $len Length (in characters)
 * @return string|bool The substring, FALSE if $start lies outside the string
 */
public function euc_substr($str, $start, $charset, $len = null)
{
    // A length of zero selects nothing; without this check it would be treated
    // like a missing length below and return the rest of the string
    if ((string)$len === '0') {
        return '';
    }
    $byte_start = $this->euc_char2byte_pos($str, $start, $charset);
    if ($byte_start === false) {
        // $start outside string length
        return false;
    }
    $str = substr($str, $byte_start);
    if ($len == null) {
        return $str;
    }
    $byte_end = $this->euc_char2byte_pos($str, $len, $charset);
    // $len outside actual string length: return the complete remainder
    return $byte_end === false ? $str : substr($str, 0, $byte_end);
}
1873
/**
 * Counts the number of characters of a string in the EUC charset family.
 *
 * A byte of 128 or above starts a double-byte character; for 'shift_jis' only
 * the ranges 128-159 and 224-255 do.
 *
 * @param string $str EUC multibyte character string
 * @param string $charset The charset
 * @return int The number of characters
 * @see strlen()
 */
public function euc_strlen($str, $charset)
{
    $isShiftJis = $charset === 'shift_jis';
    $characters = 0;
    $offset = 0;
    while (isset($str[$offset])) {
        $byte = ord($str[$offset]);
        $isDoubleByte = $isShiftJis
            ? ($byte >= 128 && $byte < 160 || $byte >= 224)
            : $byte >= 128;
        $offset += $isDoubleByte ? 2 : 1;
        $characters++;
    }
    return $characters;
}
1901
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * Explicit bounds checks are used instead of isset() on the string offset,
 * because PHP 7.1+ resolves negative string offsets from the end of the
 * string, which would let a backward scan wrap around.
 *
 * @param string $str EUC multibyte character string
 * @param int $pos Character position (negative values start from the end)
 * @param string $charset The charset
 * @return int|bool Byte position, FALSE if the position lies outside the string
 */
public function euc_char2byte_pos($str, $pos, $charset)
{
    $sjis = $charset === 'shift_jis';
    $length = strlen($str);
    // Number of characters seen
    $n = 0;
    // Number of characters wanted
    $p = abs($pos);
    if ($pos >= 0) {
        $i = 0;
        $d = 1;
    } else {
        $i = $length - 1;
        $d = -1;
    }
    for (; $i >= 0 && $i < $length && $n < $p; $i += $d) {
        $c = ord($str[$i]);
        if ($sjis) {
            $doubleByte = $c >= 128 && $c < 160 || $c >= 224;
        } else {
            $doubleByte = $c >= 128;
        }
        if ($doubleByte) {
            // Step over the second byte of the character
            $i += $d;
        }
        $n++;
    }
    if ($i < 0 || $i >= $length) {
        // Offset beyond string length
        return false;
    }
    if ($pos < 0) {
        // Correct offset: the scan stopped one byte before the wanted character
        $i++;
    }
    return $i;
}
1947
/**
 * Maps all characters of a string in the EUC charset family.
 * Characters without an entry in the selected table are copied unchanged.
 *
 * @param string $str EUC multibyte character string
 * @param string $charset The charset
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string, or the input string if the mode is unknown or the table could not be initialized
 */
public function euc_char_mapping($str, $charset, $mode, $opt = '')
{
    // Loose comparison on purpose: it selects the mode exactly like a switch statement does
    if ($mode == 'case') {
        if (!$this->initCaseFolding($charset)) {
            return $str;
        }
        $table = &$this->caseFolding[$charset][$opt];
    } elseif ($mode == 'ascii') {
        if (!$this->initToASCII($charset)) {
            return $str;
        }
        $table = &$this->toASCII[$charset];
    } else {
        return $str;
    }
    $isShiftJis = $charset === 'shift_jis';
    $result = '';
    $offset = 0;
    while (isset($str[$offset])) {
        $character = $str[$offset];
        $byte = ord($character);
        $isDoubleByte = $isShiftJis
            ? ($byte >= 128 && $byte < 160 || $byte >= 224)
            : $byte >= 128;
        if ($isDoubleByte) {
            // A double-byte char: take both bytes and step over the second one
            $character = substr($str, $offset, 2);
            $offset++;
        }
        $result .= isset($table[$character]) ? $table[$character] : $character;
        $offset++;
    }
    return $result;
}
2003
/**
 * Determines, once per instance, which strategy is used for multi-byte string
 * handling, based on the PHP extensions loaded in the system.
 * "mbstring" takes precedence over "iconv".
 * See http://stackoverflow.com/questions/8233517/what-is-the-difference-between-iconv-and-mb-convert-encoding-in-php
 *
 * @return string One of the self::STRATEGY_* constants (mbstring, iconv or fallback)
 */
protected function getConversionStrategy()
{
    if ($this->conversionStrategy !== null) {
        // Already determined by an earlier call
        return $this->conversionStrategy;
    }
    if (extension_loaded('mbstring')) {
        $strategy = self::STRATEGY_MBSTRING;
    } elseif (extension_loaded('iconv')) {
        $strategy = self::STRATEGY_ICONV;
    } else {
        $strategy = self::STRATEGY_FALLBACK;
    }
    $this->conversionStrategy = $strategy;
    return $this->conversionStrategy;
}
2024 }