7265bbff479f66b758b50ee37af1bfe9f2a29394
[Packages/TYPO3.CMS.git] / typo3 / sysext / core / Classes / Charset / CharsetConverter.php
1 <?php
2 namespace TYPO3\CMS\Core\Charset;
3
4 /*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use TYPO3\CMS\Core\Localization\Locales;
18 use TYPO3\CMS\Core\SingletonInterface;
19 use TYPO3\CMS\Core\Utility\ExtensionManagementUtility;
20 use TYPO3\CMS\Core\Utility\GeneralUtility;
21
22 /**
23 * Notes on UTF-8
24 *
25 * Functions working on UTF-8 strings:
26 *
27 * - strchr/strstr
28 * - strrchr
29 * - substr_count
30 * - implode/explode/join
31 *
32 * Functions nearly working on UTF-8 strings:
33 *
34 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
35 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
36 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
37 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
38 * - preg_*: UTF-8 support is compiled into PHP by default nowadays, but could be unavailable; patterns need the "u" modifier
39 *
40 * Functions NOT working on UTF-8 strings:
41 *
42 * - str*cmp
43 * - stristr
44 * - stripos
45 * - substr
46 * - strrev
47 * - split/spliti
48 * - ...
49 */
50
51 /**
52 * Class for conversion between charsets
53 */
54 class CharsetConverter implements SingletonInterface
55 {
56 /**
57 * Possible strategies for handling multi-byte data
58 * Only used for internal purpose
59 * @internal
60 */
61 const STRATEGY_MBSTRING = 'mbstring';
62 const STRATEGY_ICONV = 'iconv';
63 const STRATEGY_FALLBACK = 'fallback';
64
65 /**
66 * Set to one of the strategies above, based on the availability of the environment.
67 *
68 * @var string
69 */
70 protected $conversionStrategy = null;
71
72 /**
73 * ASCII Value for chars with no equivalent.
74 *
75 * @var int
76 */
77 public $noCharByteVal = 63;
78
79 /**
80 * This is the array where parsed conversion tables are stored (cached)
81 *
82 * @var array
83 */
84 public $parsedCharsets = array();
85
86 /**
87 * An array where case folding data will be stored (cached)
88 *
89 * @var array
90 */
91 public $caseFolding = array();
92
93 /**
94 * An array where charset-to-ASCII mappings are stored (cached)
95 *
96 * @var array
97 */
98 public $toASCII = array();
99
100 /**
101 * This tells the converter which charsets have two bytes per char:
102 *
103 * @var array
104 */
105 public $twoByteSets = array(
106 'ucs-2' => 1
107 );
108
109 /**
110 * This tells the converter which charsets have four bytes per char:
111 *
112 * @var array
113 */
114 public $fourByteSets = array(
115 'ucs-4' => 1, // 4-byte Unicode
116 'utf-32' => 1
117 );
118
119 /**
120 * This tells the converter which charsets use a scheme like the Extended Unix Code:
121 *
122 * @var array
123 */
124 public $eucBasedSets = array(
125 'gb2312' => 1, // Chinese, simplified.
126 'big5' => 1, // Chinese, traditional.
127 'euc-kr' => 1, // Korean
128 'shift_jis' => 1
129 );
130
131 /**
132 * @link http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
133 * @link http://czyborra.com/charsets/iso8859.html
134 *
135 * @var array
136 */
137 public $synonyms = array(
138 'us' => 'ascii',
139 'us-ascii' => 'ascii',
140 'cp819' => 'iso-8859-1',
141 'ibm819' => 'iso-8859-1',
142 'iso-ir-100' => 'iso-8859-1',
143 'iso-ir-101' => 'iso-8859-2',
144 'iso-ir-109' => 'iso-8859-3',
145 'iso-ir-110' => 'iso-8859-4',
146 'iso-ir-144' => 'iso-8859-5',
147 'iso-ir-127' => 'iso-8859-6',
148 'iso-ir-126' => 'iso-8859-7',
149 'iso-ir-138' => 'iso-8859-8',
150 'iso-ir-148' => 'iso-8859-9',
151 'iso-ir-157' => 'iso-8859-10',
152 'iso-ir-179' => 'iso-8859-13',
153 'iso-ir-199' => 'iso-8859-14',
154 'iso-ir-203' => 'iso-8859-15',
155 'csisolatin1' => 'iso-8859-1',
156 'csisolatin2' => 'iso-8859-2',
157 'csisolatin3' => 'iso-8859-3',
158 'csisolatin5' => 'iso-8859-9',
159 'csisolatin8' => 'iso-8859-14',
160 'csisolatin9' => 'iso-8859-15',
161 'csisolatingreek' => 'iso-8859-7',
162 'iso-celtic' => 'iso-8859-14',
163 'latin1' => 'iso-8859-1',
164 'latin2' => 'iso-8859-2',
165 'latin3' => 'iso-8859-3',
166 'latin5' => 'iso-8859-9',
167 'latin6' => 'iso-8859-10',
168 'latin8' => 'iso-8859-14',
169 'latin9' => 'iso-8859-15',
170 'l1' => 'iso-8859-1',
171 'l2' => 'iso-8859-2',
172 'l3' => 'iso-8859-3',
173 'l5' => 'iso-8859-9',
174 'l6' => 'iso-8859-10',
175 'l8' => 'iso-8859-14',
176 'l9' => 'iso-8859-15',
177 'cyrillic' => 'iso-8859-5',
178 'arabic' => 'iso-8859-6',
179 'tis-620' => 'iso-8859-11',
180 'win874' => 'windows-874',
181 'win1250' => 'windows-1250',
182 'win1251' => 'windows-1251',
183 'win1252' => 'windows-1252',
184 'win1253' => 'windows-1253',
185 'win1254' => 'windows-1254',
186 'win1255' => 'windows-1255',
187 'win1256' => 'windows-1256',
188 'win1257' => 'windows-1257',
189 'win1258' => 'windows-1258',
190 'cp1250' => 'windows-1250',
191 'cp1251' => 'windows-1251',
192 'cp1252' => 'windows-1252',
193 'ms-ee' => 'windows-1250',
194 'ms-ansi' => 'windows-1252',
195 'ms-greek' => 'windows-1253',
196 'ms-turk' => 'windows-1254',
197 'winbaltrim' => 'windows-1257',
198 'koi-8ru' => 'koi-8r',
199 'koi8r' => 'koi-8r',
200 'cp878' => 'koi-8r',
201 'mac' => 'macroman',
202 'macintosh' => 'macroman',
203 'euc-cn' => 'gb2312',
204 'x-euc-cn' => 'gb2312',
205 'euccn' => 'gb2312',
206 'cp936' => 'gb2312',
207 'big-5' => 'big5',
208 'cp950' => 'big5',
209 'eucjp' => 'euc-jp',
210 'sjis' => 'shift_jis',
211 'shift-jis' => 'shift_jis',
212 'cp932' => 'shift_jis',
213 'cp949' => 'euc-kr',
214 'utf7' => 'utf-7',
215 'utf8' => 'utf-8',
216 'utf16' => 'utf-16',
217 'utf32' => 'utf-32',
218 'ucs2' => 'ucs-2',
219 'ucs4' => 'ucs-4'
220 );
221
222 /**
223 * TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
224 * An empty value means "utf-8"
225 *
226 * @var array
227 */
228 public $charSetArray = array(
229 'af' => '',
230 'ar' => 'iso-8859-6',
231 'ba' => 'iso-8859-2',
232 'bg' => 'windows-1251',
233 'br' => '',
234 'ca' => 'iso-8859-15',
235 'ch' => 'gb2312',
236 'cs' => 'windows-1250',
237 'cz' => 'windows-1250',
238 'da' => '',
239 'de' => '',
240 'dk' => '',
241 'el' => 'iso-8859-7',
242 'eo' => 'utf-8',
243 'es' => '',
244 'et' => 'iso-8859-4',
245 'eu' => '',
246 'fa' => 'utf-8',
247 'fi' => '',
248 'fo' => 'utf-8',
249 'fr' => '',
250 'fr_CA' => '',
251 'ga' => '',
252 'ge' => 'utf-8',
253 'gl' => '',
254 'gr' => 'iso-8859-7',
255 'he' => 'utf-8',
256 'hi' => 'utf-8',
257 'hk' => 'big5',
258 'hr' => 'windows-1250',
259 'hu' => 'iso-8859-2',
260 'is' => 'utf-8',
261 'it' => '',
262 'ja' => 'shift_jis',
263 'jp' => 'shift_jis',
264 'ka' => 'utf-8',
265 'kl' => 'utf-8',
266 'km' => 'utf-8',
267 'ko' => 'euc-kr',
268 'kr' => 'euc-kr',
269 'lt' => 'windows-1257',
270 'lv' => 'utf-8',
271 'ms' => '',
272 'my' => '',
273 'nl' => '',
274 'no' => '',
275 'pl' => 'iso-8859-2',
276 'pt' => '',
277 'pt_BR' => '',
278 'qc' => '',
279 'ro' => 'iso-8859-2',
280 'ru' => 'windows-1251',
281 'se' => '',
282 'si' => 'windows-1250',
283 'sk' => 'windows-1250',
284 'sl' => 'windows-1250',
285 'sq' => 'utf-8',
286 'sr' => 'utf-8',
287 'sv' => '',
288 'th' => 'iso-8859-11',
289 'tr' => 'iso-8859-9',
290 'ua' => 'windows-1251',
291 'uk' => 'windows-1251',
292 'vi' => 'utf-8',
293 'vn' => 'utf-8',
294 'zh' => 'big5'
295 );
296
/**
 * Normalizes a charset name: the name is lowercased, surrounding whitespace
 * is stripped and a known alias (see $this->synonyms) is replaced by the
 * name it maps to.
 *
 * @param string $charset Charset name as supplied by the caller
 * @return string Normalized charset name
 */
public function parse_charset($charset)
{
    $normalized = trim(strtolower($charset));
    return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
}
311
312 /********************************************
313 *
314 * Charset Conversion functions
315 *
316 ********************************************/
/**
 * Converts a string from one charset to another.
 *
 * The native library selected by getConversionStrategy() is tried first.
 * It is skipped when unavailable characters must become numeric entities
 * and the target is not UTF-8, because the PHP libraries cannot fall back
 * to SGML entities. If no native result is produced, the string is routed
 * through UTF-8 using the conversion tables of this class.
 *
 * @param string $inputString Input string
 * @param string $fromCharset Current charset of the string
 * @param string $toCharset Wanted output charset
 * @param bool $useEntityForNoChar If set, characters not available in the destination charset are encoded as numeric entities
 * @return string Converted string
 * @see convArray()
 */
public function conv($inputString, $fromCharset, $toCharset, $useEntityForNoChar = false)
{
    if ($fromCharset === $toCharset) {
        return $inputString;
    }
    // UTF-8 can represent everything, so entity fallback never matters for that target.
    $nativeLibraryUsable = $toCharset === 'utf-8' || !$useEntityForNoChar;
    if ($nativeLibraryUsable) {
        switch ($this->getConversionStrategy()) {
            case self::STRATEGY_MBSTRING:
                // Returns FALSE for unsupported charsets
                $nativeResult = mb_convert_encoding($inputString, $toCharset, $fromCharset);
                break;
            case self::STRATEGY_ICONV:
                $nativeResult = iconv($fromCharset, $toCharset . '//TRANSLIT', $inputString);
                break;
            default:
                $nativeResult = false;
        }
        if ($nativeResult !== false) {
            return $nativeResult;
        }
    }
    // Table-based fallback: source charset -> UTF-8 -> target charset.
    $utf8String = $fromCharset === 'utf-8'
        ? $inputString
        : $this->utf8_encode($inputString, $fromCharset);
    return $toCharset === 'utf-8'
        ? $utf8String
        : $this->utf8_decode($utf8String, $toCharset, $useEntityForNoChar);
}
358
/**
 * Converts every string element of an array from one charset to another.
 * Nested arrays are processed recursively; elements of any other type are
 * left untouched.
 * NOTICE: The array is passed by reference and modified in place!
 *
 * @param array $array Input array, possibly multidimensional
 * @param string $fromCharset Current charset of the strings
 * @param string $toCharset Wanted output charset
 * @param bool $useEntityForNoChar If set, characters not available in the destination charset are encoded as numeric entities
 * @return void
 * @see conv()
 */
public function convArray(&$array, $fromCharset, $toCharset, $useEntityForNoChar = false)
{
    foreach ($array as $key => $element) {
        if (is_string($element)) {
            $array[$key] = $this->conv($element, $fromCharset, $toCharset, $useEntityForNoChar);
            continue;
        }
        if (is_array($element)) {
            // Recurse on the original slot so the nested array is changed in place.
            $this->convArray($array[$key], $fromCharset, $toCharset, $useEntityForNoChar);
        }
    }
}
380
/**
 * Converts $str from $charset to UTF-8 using the parsed conversion table
 * of that charset.
 *
 * Bytes up to 127 are copied unchanged (except for two-byte charsets, where
 * every pair of bytes is looked up). Characters missing from the table are
 * replaced by the byte in $this->noCharByteVal.
 *
 * @param string $str String in local charset to convert to UTF-8
 * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return string Output string, converted to UTF-8. Empty string if the conversion table for $charset could not be initialized.
 */
public function utf8_encode($str, $charset)
{
    if ($charset === 'utf-8') {
        return $str;
    }
    // Parse the conversion table if not done already; without it nothing can be converted.
    if (!$this->initCharset($charset)) {
        return '';
    }
    $strLen = strlen($str);
    $outStr = '';
    // Traverse each byte of the string
    for ($a = 0; $a < $strLen; $a++) {
        $chr = substr($str, $a, 1);
        $ord = ord($chr);
        if (isset($this->twoByteSets[$charset])) {
            // Two bytes per char, big endian assumed. substr() is used for the
            // second byte so that a dangling last byte of an odd-length string
            // is paired with 0 instead of triggering an "uninitialized string
            // offset" warning.
            $ord2 = ord(substr($str, $a + 1, 1));
            $ord = $ord << 8 | $ord2;
            // Use the table entry if the local char number is known, otherwise the substitute byte
            if (isset($this->parsedCharsets[$charset]['local'][$ord])) {
                $outStr .= $this->parsedCharsets[$charset]['local'][$ord];
            } else {
                $outStr .= chr($this->noCharByteVal);
            }
            // Skip the second byte of the pair
            $a++;
        } elseif ($ord > 127) {
            // EUC-like charsets use two bytes above 127; read both, advance the pointer and build a 16 bit number.
            if (isset($this->eucBasedSets[$charset])) {
                // Shift-JIS: chars between 160 and 223 are single byte
                if ($charset !== 'shift_jis' || ($ord < 160 || $ord > 223)) {
                    $a++;
                    $ord2 = ord(substr($str, $a, 1));
                    $ord = $ord * 256 + $ord2;
                }
            }
            // Use the table entry if the local char number is known, otherwise the substitute byte
            if (isset($this->parsedCharsets[$charset]['local'][$ord])) {
                $outStr .= $this->parsedCharsets[$charset]['local'][$ord];
            } else {
                $outStr .= chr($this->noCharByteVal);
            }
        } else {
            // 7-bit characters are passed through unchanged
            $outStr .= $chr;
        }
    }
    return $outStr;
}
439
/**
 * Converts $str from UTF-8 to $charset using the parsed conversion table
 * of that charset.
 *
 * 7-bit bytes are copied unchanged. A multibyte sequence is replaced by its
 * local value from the table (written as two bytes when the value exceeds
 * 255). Unknown sequences become a numeric entity if $useEntityForNoChar is
 * set, otherwise the byte in $this->noCharByteVal. A byte above 127 that
 * cannot start a sequence is replaced by that substitute byte as well.
 *
 * @param string $str String in UTF-8 to convert to local charset
 * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param bool $useEntityForNoChar If set, characters not available in the destination charset are encoded as numeric entities
 * @return string Output string, converted to local charset. Empty string if the conversion table for $charset could not be initialized.
 */
public function utf8_decode($str, $charset, $useEntityForNoChar = false)
{
    if ($charset === 'utf-8') {
        return $str;
    }
    // Parse the conversion table if not done already; without it nothing can be converted.
    if (!$this->initCharset($charset)) {
        return '';
    }
    $strLen = strlen($str);
    $outStr = '';
    // Traverse each byte of the UTF-8 string
    for ($a = 0; $a < $strLen; $a++) {
        $chr = substr($str, $a, 1);
        $ord = ord($chr);
        if ($ord <= 127) {
            $outStr .= $chr;
            continue;
        }
        // A lead byte must have the 7th bit set; otherwise we are in the middle of a byte sequence.
        if (!($ord & 64)) {
            $outStr .= chr($this->noCharByteVal);
            continue;
        }
        // Collect the whole sequence: every further high bit of the lead byte announces one more byte.
        $buf = $chr;
        for ($b = 0; $b < 8; $b++) {
            $ord = $ord << 1;
            if ($ord & 128) {
                $a++;
                $buf .= substr($str, $a, 1);
            } else {
                break;
            }
        }
        if (isset($this->parsedCharsets[$charset]['utf8'][$buf])) {
            // The local number
            $mByte = $this->parsedCharsets[$charset]['utf8'][$buf];
            // Local numbers above 255 are written as two bytes (16 bit word assumed)
            if ($mByte > 255) {
                $outStr .= chr(($mByte >> 8 & 255)) . chr(($mByte & 255));
            } else {
                $outStr .= chr($mByte);
            }
        } elseif ($useEntityForNoChar) {
            // Create numeric (hexadecimal) entity
            $outStr .= '&#' . $this->utf8CharToUnumber($buf, 1) . ';';
        } else {
            $outStr .= chr($this->noCharByteVal);
        }
    }
    return $outStr;
}
507
/**
 * Replaces every multibyte UTF-8 sequence by a hexadecimal numeric entity.
 * 7-bit bytes are copied unchanged; a byte above 127 that cannot start a
 * sequence is replaced by the byte in $this->noCharByteVal.
 *
 * @param string $str Input string
 * @return string Output string
 */
public function utf8_to_entities($str)
{
    $length = strlen($str);
    $result = '';
    $pos = 0;
    while ($pos < $length) {
        $byte = substr($str, $pos, 1);
        $code = ord($byte);
        if ($code <= 127) {
            $result .= $byte;
        } elseif (($code & 64) === 0) {
            // Not a lead byte: we are in the middle of a byte sequence.
            $result .= chr($this->noCharByteVal);
        } else {
            // Each consecutive set bit below the top bit of the lead byte
            // announces one further byte of the sequence.
            $sequence = $byte;
            for ($mask = 64; $mask > 0 && ($code & $mask); $mask >>= 1) {
                $pos++;
                $sequence .= substr($str, $pos, 1);
            }
            $result .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
        }
        $pos++;
    }
    return $result;
}
551
/**
 * Converts numeric entities (UNICODE, decimal like &#1234; or hexadecimal
 * like &#x1b;) to UTF-8 multibyte chars.
 *
 * The hexadecimal prefix is accepted in both cases (&#x..; and &#X..;).
 * Malformed numeric entities (e.g. "&#x;" or "&#abc;") and unknown named
 * entities are left untouched.
 *
 * @param string $str Input string, UTF-8
 * @param bool $alsoStdHtmlEnt If set, named HTML entities (like &amp; or &pound;) are converted as well
 * @return string Output string
 */
public function entities_to_utf8($str, $alsoStdHtmlEnt = false)
{
    // Lookup table "entity => character" for named entities, only needed on request
    $trans_tbl = $alsoStdHtmlEnt
        ? array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8'))
        : array();
    // Replace each entity candidate in a single pass; this avoids splitting
    // the string on a generated delimiter token.
    return preg_replace_callback(
        '/&([#[:alnum:]]*);/',
        function (array $match) use ($trans_tbl) {
            list($entity, $name) = $match;
            $number = array();
            if (preg_match('/^#[xX]([[:xdigit:]]+)$/', $name, $number)) {
                // Hexadecimal numeric entity
                return $this->UnumberToChar(hexdec($number[1]));
            }
            if (preg_match('/^#([[:digit:]]+)$/', $name, $number)) {
                // Decimal numeric entity
                return $this->UnumberToChar((int)$number[1]);
            }
            if (isset($trans_tbl[$entity])) {
                // Named entity
                return $trans_tbl[$entity];
            }
            // No conversion
            return $entity;
        },
        $str
    );
}
592
/**
 * Splits a UTF-8 string into its characters and returns them as an array,
 * either as UNICODE integer numbers or as the UTF-8 characters themselves.
 * A byte above 127 that cannot start a sequence is reported as the
 * substitute value in $this->noCharByteVal.
 *
 * @param string $str Input string, UTF-8
 * @param bool $convEntities If set, HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) are converted to characters first
 * @param bool $retChar If set, the real UTF-8 char is returned instead of its integer number
 * @return array Output array with one entry per character
 */
public function utf8_to_numberarray($str, $convEntities = false, $retChar = false)
{
    if ($convEntities) {
        // Entities must be registered as characters as well
        $str = $this->entities_to_utf8($str, true);
    }
    $length = strlen($str);
    $result = array();
    for ($pos = 0; $pos < $length; $pos++) {
        $byte = substr($str, $pos, 1);
        $code = ord($byte);
        if ($code <= 127) {
            $result[] = $retChar ? chr($code) : $code;
            continue;
        }
        if (!($code & 64)) {
            // Not a lead byte: we are in the middle of a byte sequence.
            $result[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
            continue;
        }
        // Each consecutive set bit below the top bit of the lead byte
        // announces one further byte of the sequence.
        $sequence = $byte;
        for ($mask = 64; $mask > 0 && ($code & $mask); $mask >>= 1) {
            $pos++;
            $sequence .= substr($str, $pos, 1);
        }
        $result[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
    }
    return $result;
}
643
/**
 * Converts a UNICODE number to a UTF-8 multibyte character.
 *
 * The bits of the number are spread across the bytes; the number of high
 * bits set in the lead byte announces the length of the sequence:
 *
 * bytes | bits | representation
 * 1 | 7 | 0vvvvvvv
 * 2 | 11 | 110vvvvv 10vvvvvv
 * 3 | 16 | 1110vvvv 10vvvvvv 10vvvvvv
 * 4 | 21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 * 5 | 26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 * 6 | 31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * Numbers that need more than 31 bits are replaced by the byte in
 * $this->noCharByteVal.
 *
 * @param int $unicodeInteger UNICODE integer
 * @return string UTF-8 multibyte character string
 * @see utf8CharToUnumber()
 */
public function UnumberToChar($unicodeInteger)
{
    // 7-bit values are a single byte
    if ($unicodeInteger < 128) {
        return chr($unicodeInteger);
    }
    // Number of continuation bytes => (exclusive upper bound of the value range, lead byte marker)
    $layouts = array(
        1 => array(2048, 192),
        2 => array(65536, 224),
        3 => array(2097152, 240),
        4 => array(67108864, 248),
        5 => array(2147483648, 252),
    );
    foreach ($layouts as $continuationBytes => $layout) {
        list($upperBound, $leadMarker) = $layout;
        if ($unicodeInteger < $upperBound) {
            // Lead byte carries the topmost bits, each following byte the next six bits.
            $char = chr($leadMarker | ($unicodeInteger >> (6 * $continuationBytes)));
            for ($shift = 6 * ($continuationBytes - 1); $shift >= 0; $shift -= 6) {
                $char .= chr(128 | (($unicodeInteger >> $shift) & 63));
            }
            return $char;
        }
    }
    // Cannot express a 32-bit character in UTF-8
    return chr($this->noCharByteVal);
}
700
/**
 * Converts a UTF-8 multibyte character to its UNICODE number.
 * If the first byte does not start a multibyte sequence, the value of that
 * byte is used as the number.
 *
 * @param string $str UTF-8 multibyte character string
 * @param bool $hex If set, the number is returned as a string: "x" followed by the hexadecimal digits
 * @return int|string UNICODE integer, or the "x..." string if $hex is set
 * @see UnumberToChar()
 */
public function utf8CharToUnumber($str, $hex = false)
{
    $leadByte = ord($str[0]);
    $codePoint = $leadByte;
    // Both top bits set means that this IS the start of a multibyte sequence
    if (($leadByte & 192) === 192) {
        $payloadBits = '';
        $continuationCount = 0;
        // Each consecutive set bit below the top bit of the lead byte announces
        // one further byte, which contributes its lowest six bits.
        for ($mask = 64; $mask > 0 && ($leadByte & $mask); $mask >>= 1) {
            $continuationCount++;
            $payloadBits .= sprintf('%06b', ord(substr($str, $continuationCount, 1)) & 63);
        }
        // The lead byte contributes the bits that are left after its length marker
        $leadBits = substr(sprintf('%016b', $leadByte), -(6 - $continuationCount));
        $codePoint = bindec($leadBits . $payloadBits);
    }
    if ($hex) {
        return 'x' . dechex($codePoint);
    }
    return $codePoint;
}
735
736 /********************************************
737 *
738 * Init functions
739 *
740 ********************************************/
/**
 * Initializes a charset for use if it is defined in the
 * 'typo3/sysext/core/Resources/Private/Charsets/csconvtbl/' folder.
 * This function is called automatically by the conversion functions.
 *
 * The parsed table is stored in $this->parsedCharsets[$charset] with the keys
 * 'local' (local char number => UTF-8 char) and 'utf8' (UTF-8 char => local
 * char number); only local char numbers above 127 are stored. The parsed
 * table is cached as a serialized file below typo3temp/. A cache file that
 * does not unserialize to an array is ignored and rebuilt from the table.
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param string $charset The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return int|bool Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and loaded.
 * @access private
 */
public function initCharset($charset)
{
    // Only process if the charset is not yet loaded. isset() keeps the check
    // free of "undefined index" notices for charsets seen for the first time.
    if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
        return 1;
    }
    // Conversion table filename:
    $charsetConvTableFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/csconvtbl/' . $charset . '.tbl';
    if (!$charset || !GeneralUtility::validPathStr($charsetConvTableFile) || !@is_file($charsetConvTableFile)) {
        return false;
    }
    // Cache file for charsets:
    // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
    $cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/var/charset/charset_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        // NOTE(review): the cache file is unserialized as-is; it is assumed to be written only by this method.
        $cachedTable = unserialize(GeneralUtility::getUrl($cacheFile));
        if (is_array($cachedTable)) {
            $this->parsedCharsets[$charset] = $cachedTable;
            return 2;
        }
        // Unusable cache content: fall through, parse the table and rewrite the cache.
    }
    // Parse conversion table into lines:
    $lines = GeneralUtility::trimExplode(LF, GeneralUtility::getUrl($charsetConvTableFile), true);
    $table = array('local' => array(), 'utf8' => array());
    // The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
    $whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
    $detectedType = '';
    foreach ($lines as $value) {
        // Comment line or blanks are ignored.
        if (!trim($value) || $value[0] === '#') {
            continue;
        }
        // Detect type if not done yet: (Done on first real line)
        if (!$detectedType) {
            $detectedType = preg_match($whitespacedPattern, $value) ? 'whitespaced' : 'ms-token';
        }
        if ($detectedType === 'ms-token') {
            // Pad so that a line without separator does not raise an undefined offset notice
            list($hexbyte, $utf8) = array_pad(preg_split('/[=:]/', $value, 3), 2, '');
        } else {
            $regA = array();
            if (!preg_match($whitespacedPattern, $value, $regA)) {
                // Line does not follow the detected syntax: nothing to register
                continue;
            }
            $hexbyte = $regA[1];
            $utf8 = 'U+' . $regA[2];
        }
        $decval = hexdec(trim($hexbyte));
        if ($decval > 127) {
            // Strip the leading "U+" and convert the code point to its UTF-8 char
            $utf8decval = hexdec(substr(trim($utf8), 2));
            $utf8Char = $this->UnumberToChar($utf8decval);
            $table['local'][$decval] = $utf8Char;
            $table['utf8'][$utf8Char] = $decval;
        }
    }
    $this->parsedCharsets[$charset] = $table;
    if ($cacheFile) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->parsedCharsets[$charset]));
    }
    return 2;
}
807
808 /**
809 * This function initializes all UTF-8 character data tables.
810 *
811 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
812 *
813 * @param string $mode Mode ("case", "ascii", ...)
814 * @return int Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
815 * @access private
816 */
817 public function initUnicodeData($mode = null)
818 {
819 // Cache files
820 $cacheFileCase = GeneralUtility::getFileAbsFileName('typo3temp/var/charset/cscase_utf-8.tbl');
821 $cacheFileASCII = GeneralUtility::getFileAbsFileName('typo3temp/var/charset/csascii_utf-8.tbl');
822 // Only process if the tables are not yet loaded
823 switch ($mode) {
824 case 'case':
825 if (is_array($this->caseFolding['utf-8'])) {
826 return 1;
827 }
828 // Use cached version if possible
829 if ($cacheFileCase && @is_file($cacheFileCase)) {
830 $this->caseFolding['utf-8'] = unserialize(GeneralUtility::getUrl($cacheFileCase));
831 return 2;
832 }
833 break;
834 case 'ascii':
835 if (is_array($this->toASCII['utf-8'])) {
836 return 1;
837 }
838 // Use cached version if possible
839 if ($cacheFileASCII && @is_file($cacheFileASCII)) {
840 $this->toASCII['utf-8'] = unserialize(GeneralUtility::getUrl($cacheFileASCII));
841 return 2;
842 }
843 break;
844 }
845 // Process main Unicode data file
846 $unicodeDataFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/UnicodeData.txt';
847 if (!(GeneralUtility::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
848 return false;
849 }
850 $fh = fopen($unicodeDataFile, 'rb');
851 if (!$fh) {
852 return false;
853 }
854 // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
855 // Note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
856 $this->caseFolding['utf-8'] = array();
857 $utf8CaseFolding = &$this->caseFolding['utf-8'];
858 // a shorthand
859 $utf8CaseFolding['toUpper'] = array();
860 $utf8CaseFolding['toLower'] = array();
861 $utf8CaseFolding['toTitle'] = array();
862 // Array of temp. decompositions
863 $decomposition = array();
864 // Array of chars that are marks (eg. composing accents)
865 $mark = array();
866 // Array of chars that are numbers (eg. digits)
867 $number = array();
868 // Array of chars to be omitted (eg. Russian hard sign)
869 $omit = array();
870 while (!feof($fh)) {
871 $line = fgets($fh, 4096);
872 // Has a lot of info
873 list($char, $name, $cat, , , $decomp, , , $num, , , , $upper, $lower, $title, ) = explode(';', rtrim($line));
874 $ord = hexdec($char);
875 if ($ord > 65535) {
876 // Only process the BMP
877 break;
878 }
879 $utf8_char = $this->UnumberToChar($ord);
880 if ($upper) {
881 $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
882 }
883 if ($lower) {
884 $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
885 }
886 // Store "title" only when different from "upper" (only a few)
887 if ($title && $title !== $upper) {
888 $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
889 }
890 switch ($cat[0]) {
891 case 'M':
892 // mark (accent, umlaut, ...)
893 $mark['U+' . $char] = 1;
894 break;
895 case 'N':
896 // numeric value
897 if ($ord > 128 && $num !== '') {
898 $number['U+' . $char] = $num;
899 }
900 }
901 // Accented Latin letters without "official" decomposition
902 $match = array();
903 if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
904 $c = ord($match[2]);
905 if ($match[1] === 'SMALL') {
906 $c += 32;
907 }
908 $decomposition['U+' . $char] = array(dechex($c));
909 continue;
910 }
911 $match = array();
912 if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
913 switch ($match[1]) {
914 case '<circle>':
915 // add parenthesis as circle replacement, eg (1)
916 $match[2] = '0028 ' . $match[2] . ' 0029';
917 break;
918 case '<square>':
919 // add square brackets as square replacement, eg [1]
920 $match[2] = '005B ' . $match[2] . ' 005D';
921 break;
922 case '<compat>':
923 // ignore multi char decompositions that start with a space
924 if (preg_match('/^0020 /', $match[2])) {
925 continue 2;
926 }
927 break;
928 case '<initial>':
929 case '<medial>':
930 case '<final>':
931 case '<isolated>':
932 case '<vertical>':
933 continue 2;
934 }
935 $decomposition['U+' . $char] = explode(' ', $match[2]);
936 }
937 }
938 fclose($fh);
939 // Process additional Unicode data for casing (allow folded characters to expand into a sequence)
940 $specialCasingFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/SpecialCasing.txt';
941 if (GeneralUtility::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
942 $fh = fopen($specialCasingFile, 'rb');
943 if ($fh) {
944 while (!feof($fh)) {
945 $line = fgets($fh, 4096);
946 if ($line[0] !== '#' && trim($line) !== '') {
947 list($char, $lower, $title, $upper, $cond) = GeneralUtility::trimExplode(';', $line);
948 if ($cond === '' || $cond[0] === '#') {
949 $utf8_char = $this->UnumberToChar(hexdec($char));
950 if ($char !== $lower) {
951 $arr = explode(' ', $lower);
952 for ($i = 0; isset($arr[$i]); $i++) {
953 $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
954 }
955 $utf8CaseFolding['toLower'][$utf8_char] = implode('', $arr);
956 }
957 if ($char !== $title && $title !== $upper) {
958 $arr = explode(' ', $title);
959 for ($i = 0; isset($arr[$i]); $i++) {
960 $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
961 }
962 $utf8CaseFolding['toTitle'][$utf8_char] = implode('', $arr);
963 }
964 if ($char !== $upper) {
965 $arr = explode(' ', $upper);
966 for ($i = 0; isset($arr[$i]); $i++) {
967 $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
968 }
969 $utf8CaseFolding['toUpper'][$utf8_char] = implode('', $arr);
970 }
971 }
972 }
973 }
974 fclose($fh);
975 }
976 }
977 // Process custom decompositions
978 $customTranslitFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/Translit.txt';
979 if (GeneralUtility::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
980 $fh = fopen($customTranslitFile, 'rb');
981 if ($fh) {
982 while (!feof($fh)) {
983 $line = fgets($fh, 4096);
984 if ($line[0] !== '#' && trim($line) !== '') {
985 list($char, $translit) = GeneralUtility::trimExplode(';', $line);
986 if (!$translit) {
987 $omit['U+' . $char] = 1;
988 }
989 $decomposition['U+' . $char] = explode(' ', $translit);
990 }
991 }
992 fclose($fh);
993 }
994 }
995 // Decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
996 foreach ($decomposition as $from => $to) {
997 $code_decomp = array();
998 while ($code_value = array_shift($to)) {
999 // Do recursive decomposition
1000 if (isset($decomposition['U+' . $code_value])) {
1001 foreach (array_reverse($decomposition['U+' . $code_value]) as $cv) {
1002 array_unshift($to, $cv);
1003 }
1004 } elseif (!isset($mark['U+' . $code_value])) {
1005 // remove mark
1006 array_push($code_decomp, $code_value);
1007 }
1008 }
1009 if (!empty($code_decomp) || isset($omit[$from])) {
1010 $decomposition[$from] = $code_decomp;
1011 } else {
1012 unset($decomposition[$from]);
1013 }
1014 }
1015 // Create ascii only mapping
1016 $this->toASCII['utf-8'] = array();
1017 $ascii = &$this->toASCII['utf-8'];
1018 foreach ($decomposition as $from => $to) {
1019 $code_decomp = array();
1020 while ($code_value = array_shift($to)) {
1021 $ord = hexdec($code_value);
1022 if ($ord > 127) {
1023 continue 2;
1024 } else {
1025 // Skip decompositions containing non-ASCII chars
1026 array_push($code_decomp, chr($ord));
1027 }
1028 }
1029 $ascii[$this->UnumberToChar(hexdec($from))] = join('', $code_decomp);
1030 }
1031 // Add numeric decompositions
1032 foreach ($number as $from => $to) {
1033 $utf8_char = $this->UnumberToChar(hexdec($from));
1034 if (!isset($ascii[$utf8_char])) {
1035 $ascii[$utf8_char] = $to;
1036 }
1037 }
1038 if ($cacheFileCase) {
1039 GeneralUtility::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
1040 }
1041 if ($cacheFileASCII) {
1042 GeneralUtility::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
1043 }
1044 return 3;
1045 }
1046
/**
 * Initializes the case folding table for a charset other than UTF-8.
 * This function is automatically called by the case folding functions.
 *
 * The table is derived from the UTF-8 case folding data: every character of the
 * charset is looked up in the UTF-8 table and the result is converted back into
 * the charset. The plain ASCII a-z / A-Z mappings are added afterwards.
 *
 * @param string $charset Charset for which to initialize case folding.
 * @return int|bool FALSE on error, a TRUE value on success: 1 table already loaded, 2 cached version, 3 table parsed (and cached).
 * @access private
 */
public function initCaseFolding($charset)
{
    // Only process if the case table is not yet loaded
    if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset])) {
        return 1;
    }
    // Use cached version if possible. The cache file is written by this method itself;
    // an unreadable or corrupt file is ignored and the table is rebuilt below.
    $cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/var/charset/cscase_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $cachedTable = unserialize(GeneralUtility::getUrl($cacheFile));
        if (is_array($cachedTable)) {
            $this->caseFolding[$charset] = $cachedTable;
            return 2;
        }
    }
    // Init UTF-8 conversion for this charset
    if (!$this->initCharset($charset)) {
        return false;
    }
    // UTF-8 case folding is used as the base conversion table
    if (!$this->initUnicodeData('case')) {
        return false;
    }
    $nochar = chr($this->noCharByteVal);
    $this->caseFolding[$charset] = [];
    foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
        // Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
        $c = $this->utf8_decode($utf8, $charset);
        foreach (['toUpper', 'toLower', 'toTitle'] as $direction) {
            // Characters without a folding entry are skipped instead of reading an undefined index
            if (!isset($this->caseFolding['utf-8'][$direction][$utf8])) {
                continue;
            }
            $cc = $this->utf8_decode($this->caseFolding['utf-8'][$direction][$utf8], $charset);
            // Only keep mappings that are representable in the target charset
            if ($cc !== '' && $cc !== $nochar) {
                $this->caseFolding[$charset][$direction][$c] = $cc;
            }
        }
    }
    // Add the ASCII case table
    for ($i = ord('a'), $end = ord('z'); $i <= $end; $i++) {
        $this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i - 32);
    }
    for ($i = ord('A'), $end = ord('Z'); $i <= $end; $i++) {
        $this->caseFolding[$charset]['toLower'][chr($i)] = chr($i + 32);
    }
    if ($cacheFile) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->caseFolding[$charset]));
    }
    return 3;
}
1108
/**
 * Initializes the to-ASCII conversion table for a charset other than UTF-8.
 * This function is automatically called by the ASCII transliteration functions.
 *
 * The table is derived from the UTF-8 to-ASCII table: each character of the charset
 * that has a UTF-8 transliteration gets the same transliteration, keyed by the
 * character's representation in the charset.
 *
 * @param string $charset Charset for which to initialize conversion.
 * @return int|bool FALSE on error, a TRUE value on success: 1 table already loaded, 2 cached version, 3 table parsed (and cached).
 * @access private
 */
public function initToASCII($charset)
{
    // Only process if the transliteration table is not yet loaded
    if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) {
        return 1;
    }
    // Use cached version if possible. The cache file is written by this method itself;
    // an unreadable or corrupt file is ignored and the table is rebuilt below.
    $cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/var/charset/csascii_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $cachedTable = unserialize(GeneralUtility::getUrl($cacheFile));
        if (is_array($cachedTable)) {
            $this->toASCII[$charset] = $cachedTable;
            return 2;
        }
    }
    // Init UTF-8 conversion for this charset
    if (!$this->initCharset($charset)) {
        return false;
    }
    // UTF-8/ASCII transliteration is used as the base conversion table
    if (!$this->initUnicodeData('ascii')) {
        return false;
    }
    // Always start from an array so that an empty result is still a loaded (and cacheable) table
    $this->toASCII[$charset] = [];
    foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
        if (isset($this->toASCII['utf-8'][$utf8])) {
            // Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
            $c = $this->utf8_decode($utf8, $charset);
            $this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
        }
    }
    if ($cacheFile) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->toASCII[$charset]));
    }
    return 3;
}
1149
1150 /********************************************
1151 *
1152 * String operation functions
1153 *
1154 ********************************************/
/**
 * Returns a part of a string.
 *
 * Uses mbstring or iconv when available; otherwise falls back to the internal
 * UTF-8 / EUC implementations or to byte arithmetic for fixed-width charsets.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param int $start Start position (character position)
 * @param int|null $len Length (in characters), NULL means "up to the end of the string"
 * @return string|bool The substring (the underlying implementations may return FALSE if $start is out of range)
 * @see substr(), mb_substr()
 */
public function substr($charset, $string, $start, $len = null)
{
    if ($len === 0 || $string === '') {
        return '';
    }
    $strategy = $this->getConversionStrategy();
    if ($strategy === self::STRATEGY_MBSTRING) {
        // mb_substr() treats a NULL length as "to the end of the string",
        // so the global internal encoding does not have to be switched temporarily.
        return mb_substr($string, $start, $len, $charset);
    }
    if ($strategy === self::STRATEGY_ICONV) {
        if ($len === null) {
            // iconv_substr() cannot take NULL here; use the full character length instead of
            // changing the (deprecated) iconv internal encoding setting.
            $len = iconv_strlen($string, $charset);
            if ($len === false) {
                // String is not valid in the given charset
                return false;
            }
        }
        return iconv_substr($string, $start, $len, $charset);
    }
    if ($charset === 'utf-8') {
        return $this->utf8_substr($string, $start, $len);
    }
    if (!empty($this->eucBasedSets[$charset])) {
        return $this->euc_substr($string, $start, $charset, $len);
    }
    if (!empty($this->twoByteSets[$charset])) {
        // Fixed width of two bytes per character; NULL length must not be multiplied (it would become 0)
        return $len === null ? substr($string, $start * 2) : substr($string, $start * 2, $len * 2);
    }
    if (!empty($this->fourByteSets[$charset])) {
        // Fixed width of four bytes per character
        return $len === null ? substr($string, $start * 4) : substr($string, $start * 4, $len * 4);
    }
    // Treat everything else as single-byte encoding
    return $len === null ? substr($string, $start) : substr($string, $start, $len);
}
1209
/**
 * Counts the number of characters.
 *
 * Uses mbstring or iconv when available; otherwise falls back to the internal
 * UTF-8 / EUC implementations or to byte arithmetic for fixed-width charsets.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @return int The number of characters
 * @see strlen()
 */
public function strlen($charset, $string)
{
    $strategy = $this->getConversionStrategy();
    if ($strategy === self::STRATEGY_MBSTRING) {
        return mb_strlen($string, $charset);
    }
    if ($strategy === self::STRATEGY_ICONV) {
        return iconv_strlen($string, $charset);
    }
    if ($charset === 'utf-8') {
        return $this->utf8_strlen($string);
    }
    // !empty() avoids "undefined index" notices for charsets that are not listed in the tables
    if (!empty($this->eucBasedSets[$charset])) {
        return $this->euc_strlen($string, $charset);
    }
    if (!empty($this->twoByteSets[$charset])) {
        // Cast keeps the documented int return type even for a truncated (odd-length) string
        return (int)(strlen($string) / 2);
    }
    if (!empty($this->fourByteSets[$charset])) {
        return (int)(strlen($string) / 4);
    }
    // Treat everything else as single-byte encoding
    return strlen($string);
}
1237
/**
 * Crops a string with the help of the mbstring functions.
 *
 * A positive $len keeps the first $len characters and appends $crop, a negative
 * $len keeps the last abs($len) characters and prepends $crop. The string is
 * returned untouched if $len is 0 or the string is not longer than abs($len).
 *
 * @param string $charset The character set
 * @param string $string String to be cropped
 * @param int $len Crop length (in characters)
 * @param string $crop Crop signifier
 * @return string The shortened string
 * @see mb_strlen(), mb_substr()
 */
protected function cropMbstring($charset, $string, $len, $crop = '')
{
    if ((int)$len === 0) {
        return $string;
    }
    $characterCount = mb_strlen($string, $charset);
    if ($characterCount <= abs($len)) {
        // Nothing to cut off
        return $string;
    }
    if ($len > 0) {
        return mb_substr($string, 0, $len, $charset) . $crop;
    }
    // Negative length: keep the tail of the string
    return $crop . mb_substr($string, $len, $characterCount, $charset);
}
1260
/**
 * Truncates a string and pre-/appends a string.
 *
 * A positive $len keeps the first $len characters and appends $crop, a negative
 * $len keeps the last abs($len) characters and prepends $crop. If the string is
 * not longer than abs($len) (or $len is 0) it is returned unchanged.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param int $len Length (in characters)
 * @param string $crop Crop signifier
 * @return string The shortened string
 * @see substr(), mb_strimwidth()
 */
public function crop($charset, $string, $len, $crop = '')
{
    if ($this->getConversionStrategy() === self::STRATEGY_MBSTRING) {
        return $this->cropMbstring($charset, $string, $len, $crop);
    }
    if ((int)$len === 0) {
        return $string;
    }
    // Determine the byte position at which the string has to be cut
    if ($charset === 'utf-8') {
        $i = $this->utf8_char2byte_pos($string, $len);
    } elseif (!empty($this->eucBasedSets[$charset])) {
        $i = $this->euc_char2byte_pos($string, $len, $charset);
    } elseif ($len > 0) {
        // Everything else is treated as single-byte encoding
        $i = $len;
    } else {
        $i = strlen($string) + $len;
        if ($i <= 0) {
            $i = false;
        }
    }
    // $len outside actual string length
    if ($i === false) {
        return $string;
    }
    if ($len > 0) {
        if (isset($string[$i])) {
            return substr($string, 0, $i) . $crop;
        }
    } elseif ($i > 0 && isset($string[$i - 1])) {
        // $i > 0 is checked explicitly: since PHP 7.1 a negative string offset is valid,
        // so isset($string[-1]) alone would wrongly report that something was cut off.
        return $crop . substr($string, $i);
    }
    return $string;
}
1310
/**
 * Cuts a string short at a given byte length without splitting a character.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param int $len The byte length
 * @return string The shortened string
 * @see mb_strcut()
 */
public function strtrunc($charset, $string, $len)
{
    if ($len <= 0) {
        return '';
    }
    if ($this->getConversionStrategy() === self::STRATEGY_MBSTRING) {
        return mb_strcut($string, 0, $len, $charset);
    }
    if ($charset === 'utf-8') {
        return $this->utf8_strtrunc($string, $len);
    }
    // !empty() avoids "undefined index" notices for charsets that are not listed in the tables
    if (!empty($this->eucBasedSets[$charset])) {
        return $this->euc_strtrunc($string, $len, $charset);
    }
    if (!empty($this->twoByteSets[$charset])) {
        // Realign to a position divisible by two
        $len -= $len % 2;
    } elseif (!empty($this->fourByteSets[$charset])) {
        // Realign to a position divisible by four
        $len -= $len % 4;
    }
    // Treat everything else as single-byte encoding
    return substr($string, 0, $len);
}
1343
/**
 * Converts all characters of a string to lower or upper case.
 * Unlike strtolower() and strtoupper() this method is locale independent.
 * Note that the string length may change!
 * eg. lower case German "ß" (sharp S) becomes upper case "SS"
 * Real case folding is language dependent, this method ignores this fact.
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert case for
 * @param string $case Case keyword: "toLower" means lowercase conversion, use "toUpper" for uppercase
 * @return string The converted string
 * @see strtolower(), strtoupper()
 */
public function conv_case($charset, $string, $case)
{
    if ($this->getConversionStrategy() === self::STRATEGY_MBSTRING) {
        // With mbstring anything but "toLower" results in an uppercase conversion
        return $case === 'toLower'
            ? mb_strtolower($string, $charset)
            : mb_strtoupper($string, $charset);
    }
    if ($charset === 'utf-8') {
        return $this->utf8_char_mapping($string, 'case', $case);
    }
    if (isset($this->eucBasedSets[$charset])) {
        return $this->euc_char_mapping($string, $charset, 'case', $case);
    }
    // Treat everything else as single-byte encoding
    return $this->sb_char_mapping($string, $charset, 'case', $case);
}
1376
/**
 * Charset-aware equivalent of lcfirst()/ucfirst(): converts the case of the
 * first character only and leaves the rest of the string untouched.
 *
 * @param string $charset Character set of string
 * @param string $string Input string
 * @param string $case Case keyword as understood by conv_case()
 * @return string The string with its first character converted
 * @see \TYPO3\CMS\Core\Charset\CharsetConverter::conv_case()
 */
public function convCaseFirst($charset, $string, $case)
{
    $convertedHead = $this->conv_case(
        $charset,
        $this->substr($charset, $string, 0, 1),
        $case
    );
    return $convertedHead . $this->substr($charset, $string, 1);
}
1393
/**
 * Converts special chars (like æøåÆØÅ, umlauts etc) to ascii equivalents (usually double-bytes, like æ => ae etc.)
 *
 * Dispatches to the UTF-8, EUC or single-byte mapping implementation depending on the charset.
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert
 * @return string The converted string
 */
public function specCharsToASCII($charset, $string)
{
    if ($charset === 'utf-8') {
        return $this->utf8_char_mapping($string, 'ascii');
    }
    if (isset($this->eucBasedSets[$charset])) {
        return $this->euc_char_mapping($string, $charset, 'ascii');
    }
    // Treat everything else as single-byte encoding
    return $this->sb_char_mapping($string, $charset, 'ascii');
}
1413
/**
 * Converts the language codes that we get from the client (usually HTTP_ACCEPT_LANGUAGE)
 * into a TYPO3-readable language code.
 *
 * The codes are ordered by their quality value (";q="); the first one that is known,
 * either completely or reduced to the part before the first "-", wins.
 *
 * @param string $languageCodesList List of language codes. something like 'de,en-us;q=0.9,de-de;q=0.7,es-cl;q=0.6,en;q=0.4,es;q=0.3,zh;q=0.1'
 * @return string A preferred language that TYPO3 supports, or "default" if none found
 */
public function getPreferredClientLanguage($languageCodesList)
{
    $knownLanguageCodes = $this->getAllLanguageCodes();
    // Collect "language code => quality"; entries without explicit quality count as 1.0
    $qualityByLanguage = array();
    foreach (GeneralUtility::trimExplode(',', $languageCodesList) as $entry) {
        $quality = 1.0;
        if (strpos($entry, ';q=') !== false) {
            $parts = explode(';q=', $entry);
            $entry = $parts[0];
            $quality = $parts[1];
        }
        $qualityByLanguage[$entry] = $quality;
    }
    // Highest quality first
    arsort($qualityByLanguage, SORT_NUMERIC);
    $result = 'default';
    foreach (array_keys($qualityByLanguage) as $languageCode) {
        // Try the full code first, then the code with the country part stripped from the end
        $languageOnly = explode('-', $languageCode);
        foreach (array($languageCode, $languageOnly[0]) as $candidate) {
            if (isset($knownLanguageCodes[$candidate])) {
                $result = $knownLanguageCodes[$candidate];
                break 2;
            }
        }
    }
    // English is the TYPO3 default language
    if (!$result || $result === 'en') {
        return 'default';
    }
    return $result;
}
1454
/**
 * Merges all available charsets and locales, currently only used for getPreferredClientLanguage()
 *
 * @return array Map with ISO language codes (using "-" as separator) as keys and TYPO3 language keys as values
 */
protected function getAllLanguageCodes()
{
    // Start with all languages where the TYPO3 code is the same as the ISO code
    $languageKeys = array_keys($this->charSetArray);
    $isoCodeByLanguageKey = array_combine($languageKeys, $languageKeys);
    // Languages where the TYPO3 code differs from the ISO code or needs the country part:
    // the ISO code replaces the default value stored for the TYPO3 language key
    /** @var Locales $locales */
    $locales = GeneralUtility::makeInstance(Locales::class);
    foreach ($locales->getIsoMapping() as $typo3Lang => $isoLang) {
        $isoCodeByLanguageKey[$typo3Lang] = str_replace('_', '-', $isoLang);
    }
    // Move the ISO codes to the keys (because they are looked up with "isset" later on)
    return array_flip($isoCodeByLanguageKey);
}
1477
1478 /********************************************
1479 *
1480 * Internal string operation functions
1481 *
1482 ********************************************/
/**
 * Maps all characters of a string in a single byte charset.
 *
 * Every byte that has an entry in the selected mapping table is replaced by the
 * mapped value, all other bytes are copied unchanged.
 *
 * @param string $str The string
 * @param string $charset The charset
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string (the unchanged input if the table could not be initialized or the mode is unknown)
 */
public function sb_char_mapping($str, $charset, $mode, $opt = '')
{
    switch ($mode) {
        case 'case':
            if (!$this->initCaseFolding($charset)) {
                // Table not available: do nothing
                return $str;
            }
            $map = &$this->caseFolding[$charset][$opt];
            break;
        case 'ascii':
            if (!$this->initToASCII($charset)) {
                // Table not available: do nothing
                return $str;
            }
            $map = &$this->toASCII[$charset];
            break;
        default:
            return $str;
    }
    $result = '';
    $index = 0;
    while (isset($str[$index])) {
        $byte = $str[$index];
        $result .= isset($map[$byte]) ? $map[$byte] : $byte;
        $index++;
    }
    return $result;
}
1523
1524 /********************************************
1525 *
1526 * Internal UTF-8 string operation functions
1527 *
1528 ********************************************/
/**
 * Returns a part of a UTF-8 string.
 *
 * Character positions are translated to byte positions with utf8_char2byte_pos(),
 * the actual cutting is done with substr().
 *
 * @param string $str UTF-8 string
 * @param int $start Start position (character position)
 * @param int $len Length (in characters)
 * @return string|bool The substring, FALSE if a positive $start lies outside the string
 * @see substr()
 */
public function utf8_substr($str, $start, $len = null)
{
    if ((string)$len === '0') {
        return '';
    }
    $byte_start = $this->utf8_char2byte_pos($str, $start);
    if ($byte_start === false && $start > 0) {
        // $start outside string length
        return false;
    }
    $str = substr($str, $byte_start);
    if ($len == null) {
        // No length given: everything from the start position on
        return $str;
    }
    $byte_end = $this->utf8_char2byte_pos($str, $len);
    if ($byte_end !== false) {
        return substr($str, 0, $byte_end);
    }
    // $len outside actual string length: a negative length that exceeds
    // the string yields an empty string, a positive one the whole remainder
    return $len < 0 ? '' : $str;
}
1565
/**
 * Counts the number of characters of a string in UTF-8.
 *
 * Every byte that is not a continuation byte (10xxxxxx) starts a character,
 * so counting those bytes gives the number of characters.
 *
 * @param string $str UTF-8 multibyte character string
 * @return int The number of characters
 * @see strlen()
 */
public function utf8_strlen($str)
{
    $characters = 0;
    $offset = 0;
    while (isset($str[$offset])) {
        // Single-byte (0xxxxxxx) and multi-byte starting bytes (11xxxxxx) are counted,
        // continuation bytes (10xxxxxx) are not
        if ((ord($str[$offset]) & 192) !== 128) {
            $characters++;
        }
        $offset++;
    }
    return $characters;
}
1589
/**
 * Truncates a string in UTF-8 short at a given byte length.
 *
 * The result is never longer than $len bytes and never ends inside a multi-byte
 * sequence: if the cut would split a character, that character is dropped entirely.
 *
 * @param string $str UTF-8 multibyte character string
 * @param int $len The byte length
 * @return string The shortened string ('' for $len <= 0, like strtrunc())
 * @see mb_strcut()
 */
public function utf8_strtrunc($str, $len)
{
    if ($len <= 0) {
        return '';
    }
    if ($len >= strlen($str)) {
        // Nothing to cut; also avoids reading an offset beyond the end of the string
        return $str;
    }
    $i = $len - 1;
    if (!(ord($str[$i]) & 128)) {
        // Last kept byte is plain ASCII, the cut cannot split a character
        return substr($str, 0, $len);
    }
    // Last kept byte is part of a multi-byte sequence: walk back to its starting byte
    while ($i > 0 && (ord($str[$i]) & 192) === 128) {
        $i--;
    }
    $lead = ord($str[$i]);
    if (($lead & 192) !== 192) {
        // Malformed input (continuation bytes without a starting byte):
        // keep only what precedes the orphaned bytes
        return $lead & 128 ? '' : substr($str, 0, $i + 1);
    }
    // The number of leading 1-bits of the starting byte is the length of the sequence
    for ($bc = 0, $mbs = $lead; $mbs & 128; $mbs = $mbs << 1) {
        $bc++;
    }
    if ($bc + $i > $len) {
        // Sequence does not fit completely, drop it
        return substr($str, 0, $i);
    }
    return substr($str, 0, $len);
}
1619
/**
 * Find position of first occurrence of a string, both arguments are in UTF-8.
 *
 * Delegates to mbstring or iconv when available; the fallback searches on byte
 * level and translates between character and byte positions.
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 string to search for
 * @param int $offset Position to start the search
 * @return int|bool The character position, FALSE if the needle was not found or the offset lies beyond the string
 * @see strpos()
 */
public function utf8_strpos($haystack, $needle, $offset = 0)
{
    $strategy = $this->getConversionStrategy();
    if ($strategy === self::STRATEGY_MBSTRING) {
        return mb_strpos($haystack, $needle, $offset, 'utf-8');
    }
    if ($strategy === self::STRATEGY_ICONV) {
        return iconv_strpos($haystack, $needle, $offset, 'utf-8');
    }
    $startByte = $this->utf8_char2byte_pos($haystack, $offset);
    if ($startByte === false) {
        // Offset beyond string length
        return false;
    }
    $foundByte = strpos($haystack, $needle, $startByte);
    // Needle not found => FALSE, otherwise translate the byte position back
    return $foundByte === false
        ? false
        : $this->utf8_byte2char_pos($haystack, $foundByte);
}
1648
/**
 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
 *
 * Delegates to mbstring or iconv when available; the fallback searches on byte
 * level and translates the byte position into a character position.
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 character to search for (single character)
 * @return int|bool The character position, FALSE if the needle was not found
 * @see strrpos()
 */
public function utf8_strrpos($haystack, $needle)
{
    $strategy = $this->getConversionStrategy();
    if ($strategy === self::STRATEGY_MBSTRING) {
        // The third parameter of mb_strrpos() is the offset, the encoding is the fourth one.
        // Passing the encoding in third position is a deprecated legacy signature.
        return mb_strrpos($haystack, $needle, 0, 'utf-8');
    }
    if ($strategy === self::STRATEGY_ICONV) {
        // iconv_strrpos() has no offset parameter, the charset is the third argument here
        return iconv_strrpos($haystack, $needle, 'utf-8');
    }
    $byte_pos = strrpos($haystack, $needle);
    if ($byte_pos === false) {
        // Needle not found
        return false;
    }
    return $this->utf8_byte2char_pos($haystack, $byte_pos);
}
1671
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * For $pos >= 0 the result is the byte offset at which character number $pos
 * (counted from 0) starts. For $pos < 0 the result is the byte offset at which
 * the abs($pos)-th character counted from the end starts.
 * FALSE is returned whenever the position does not point to a character strictly
 * inside the string, i.e. $pos >= number of characters or abs($pos) >= number of
 * characters for negative positions.
 *
 * Bounds are checked with explicit comparisons instead of isset($str[$i]), because
 * since PHP 7.1 negative string offsets are valid and isset($str[-1]) is TRUE.
 *
 * @param string $str UTF-8 string
 * @param int $pos Character position (negative values start from the end)
 * @return int|bool Byte position, FALSE if the position is outside of the string
 */
public function utf8_char2byte_pos($str, $pos)
{
    $str = (string)$str;
    $byteLength = strlen($str);
    // Number of characters found
    $n = 0;
    // Number of characters wanted
    $p = abs($pos);
    if ($pos >= 0) {
        $i = 0;
        $d = 1;
    } else {
        $i = $byteLength - 1;
        $d = -1;
    }
    for (; $i >= 0 && $i < $byteLength && $n < $p; $i += $d) {
        // Single-byte (0xxxxxxx) and multi-byte starting bytes (11xxxxxx) begin a character,
        // continuation bytes (10xxxxxx) do not
        if ((ord($str[$i]) & 192) !== 128) {
            $n++;
        }
    }
    if ($i < 0 || $i >= $byteLength) {
        // Offset beyond string length
        return false;
    }
    if ($pos >= 0) {
        // Skip trailing multi-byte data bytes of the last counted character
        while ($i < $byteLength && (ord($str[$i]) & 192) === 128) {
            $i++;
        }
        if ($i >= $byteLength) {
            // The last counted character was the final one of the string
            return false;
        }
    } else {
        // Correct offset: the loop stopped one byte before the starting byte
        $i++;
    }
    return $i;
}
1718
/**
 * Translates an 'absolute' byte position into a character position.
 *
 * The character position is the number of character starting bytes found
 * between byte 1 and byte $pos (both inclusive); byte 0 always starts
 * character 0 and is therefore not counted.
 *
 * @param string $str UTF-8 string
 * @param int $pos Byte position
 * @return int|bool Character position, FALSE if $pos is not a valid byte offset of $str
 */
public function utf8_byte2char_pos($str, $pos)
{
    if ($pos < 0 || $pos >= strlen((string)$str)) {
        // Offset beyond string length (this also covers the empty string)
        return false;
    }
    // Number of characters
    $n = 0;
    for ($i = $pos; $i > 0; $i--) {
        // Single-byte (0xxxxxxx) and multi-byte starting bytes (11xxxxxx) begin a character,
        // continuation bytes (10xxxxxx) do not
        if ((ord($str[$i]) & 192) !== 128) {
            $n++;
        }
    }
    return $n;
}
1747
/**
 * Maps all characters of an UTF-8 string.
 *
 * Every character that has an entry in the selected mapping table is replaced by the
 * mapped value, all other characters are copied unchanged. Stray continuation bytes
 * (invalid UTF-8) are copied unchanged as well.
 *
 * @param string $str UTF-8 string
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string (the unchanged input if the table could not be initialized or the mode is unknown)
 */
public function utf8_char_mapping($str, $mode, $opt = '')
{
    if (!$this->initUnicodeData($mode)) {
        // Do nothing
        return $str;
    }
    switch ($mode) {
        case 'case':
            // An unknown $opt results in an empty map, i.e. the string is returned unchanged
            $map = isset($this->caseFolding['utf-8'][$opt]) ? $this->caseFolding['utf-8'][$opt] : [];
            break;
        case 'ascii':
            $map = isset($this->toASCII['utf-8']) ? $this->toASCII['utf-8'] : [];
            break;
        default:
            return $str;
    }
    $out = '';
    for ($i = 0; isset($str[$i]); $i++) {
        $c = ord($str[$i]);
        if (($c & 192) === 192) {
            // Multi-byte starting byte (11xxxxxx): the number of leading 1-bits is the number of bytes
            for ($bc = 0; $c & 128; $c = $c << 1) {
                $bc++;
            }
            $mbc = substr($str, $i, $bc);
            $i += $bc - 1;
        } else {
            // Single-byte (0xxxxxxx), or a stray continuation byte (10xxxxxx) which is passed
            // through as is instead of repeating the previously handled character
            $mbc = $str[$i];
        }
        $out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
    }
    return $out;
}
1795
1796 /********************************************
1797 *
1798 * Internal EUC string operation functions
1799 *
1800 * Extended Unix Code:
1801 * ASCII compatible 7bit single bytes chars
1802 * 8bit two byte chars
1803 *
1804 * Shift-JIS is treated as a special case.
1805 *
1806 ********************************************/
/**
 * Cuts a string in the EUC charset family short at a given byte length.
 *
 * The result is never longer than $len bytes and a double-byte character is never
 * split: if the cut would fall between its two bytes, the character is dropped.
 *
 * @param string $str EUC multibyte character string
 * @param int $len The byte length
 * @param string $charset The charset
 * @return string The shortened string
 * @see mb_strcut()
 */
public function euc_strtrunc($str, $len, $charset)
{
    if (strlen($str) <= $len) {
        // String not longer than supplied length
        return $str;
    }
    $shiftJis = $charset === 'shift_jis';
    for ($i = 0; isset($str[$i]) && $i < $len; $i++) {
        $c = ord($str[$i]);
        // First byte of a double-byte character: skip its second byte
        if ($shiftJis) {
            if ($c >= 128 && $c < 160 || $c >= 224) {
                $i++;
            }
        } elseif ($c >= 128) {
            $i++;
        }
    }
    if ($i > $len) {
        // Byte $len - 1 is the first byte of a double-byte character, cut before it
        return substr($str, 0, $len - 1);
    }
    return substr($str, 0, $len);
}
1842
/**
 * Returns a part of a string in the EUC charset family.
 *
 * Behaves like utf8_substr(): a length of 0 gives an empty string, a start position
 * before the beginning of the string starts at the beginning, and a negative length
 * exceeding the string gives an empty string.
 *
 * @param string $str EUC multibyte character string
 * @param int $start Start position (character position)
 * @param string $charset The charset
 * @param int $len Length (in characters)
 * @return string|bool The substring, FALSE if a positive $start lies outside the string
 */
public function euc_substr($str, $start, $charset, $len = null)
{
    if ((string)$len === '0') {
        // Must be handled explicitly, the loose NULL comparison below treats 0 as "no length"
        return '';
    }
    $byte_start = $this->euc_char2byte_pos($str, $start, $charset);
    if ($byte_start === false) {
        if ($start > 0) {
            // $start outside string length
            return false;
        }
        // Zero or negative start reaching before the first character: start at the beginning
        $byte_start = 0;
    }
    $str = substr($str, $byte_start);
    if ($len == null) {
        return $str;
    }
    $byte_end = $this->euc_char2byte_pos($str, $len, $charset);
    if ($byte_end === false) {
        // $len outside actual string length
        return $len < 0 ? '' : $str;
    }
    return substr($str, 0, $byte_end);
}
1872
/**
 * Counts the number of characters of a string in the EUC charset family.
 *
 * A byte that starts a double-byte character is counted once and the byte
 * following it is skipped; every other byte counts as one character.
 *
 * @param string $str EUC multibyte character string
 * @param string $charset The charset
 * @return int The number of characters
 * @see strlen()
 */
public function euc_strlen($str, $charset)
{
    $isShiftJis = $charset === 'shift_jis';
    $characters = 0;
    $offset = 0;
    while (isset($str[$offset])) {
        $byte = ord($str[$offset]);
        // Shift-JIS is treated as a special case regarding the range of first bytes
        $startsDoubleByte = $isShiftJis
            ? (($byte >= 128 && $byte < 160) || $byte >= 224)
            : $byte >= 128;
        $offset += $startsDoubleByte ? 2 : 1;
        $characters++;
    }
    return $characters;
}
1900
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * For $pos >= 0 the result is the byte offset at which character number $pos
 * (counted from 0) starts. For $pos < 0 the result is the byte offset at which
 * the abs($pos)-th character counted from the end starts.
 * FALSE is returned if the position does not point to a character strictly
 * inside the string.
 *
 * Bounds are checked with explicit comparisons instead of isset($str[$i]), because
 * since PHP 7.1 negative string offsets are valid and isset($str[-1]) is TRUE.
 *
 * @param string $str EUC multibyte character string
 * @param int $pos Character position (negative values start from the end)
 * @param string $charset The charset
 * @return int|bool Byte position, FALSE if the position is outside of the string
 */
public function euc_char2byte_pos($str, $pos, $charset)
{
    $str = (string)$str;
    $byteLength = strlen($str);
    $sjis = $charset === 'shift_jis';
    // Number of characters seen
    $n = 0;
    // Number of characters wanted
    $p = abs($pos);
    if ($pos >= 0) {
        $i = 0;
        $d = 1;
    } else {
        $i = $byteLength - 1;
        $d = -1;
    }
    for (; $i >= 0 && $i < $byteLength && $n < $p; $i += $d) {
        $c = ord($str[$i]);
        // Byte belongs to a double-byte character: step over its other byte as well
        if ($sjis) {
            if ($c >= 128 && $c < 160 || $c >= 224) {
                $i += $d;
            }
        } elseif ($c >= 128) {
            $i += $d;
        }
        $n++;
    }
    if ($i < 0 || $i >= $byteLength) {
        // Offset beyond string length
        return false;
    }
    if ($pos < 0) {
        // Correct offset: the loop stopped one byte before the character
        $i++;
    }
    return $i;
}
1946
/**
 * Maps all characters of a string in the EUC charset family.
 *
 * Every (single- or double-byte) character that has an entry in the selected mapping
 * table is replaced by the mapped value, all other characters are copied unchanged.
 *
 * @param string $str EUC multibyte character string
 * @param string $charset The charset
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string (the unchanged input if the table could not be initialized or the mode is unknown)
 */
public function euc_char_mapping($str, $charset, $mode, $opt = '')
{
    switch ($mode) {
        case 'case':
            if (!$this->initCaseFolding($charset)) {
                // Table not available: do nothing
                return $str;
            }
            $map = &$this->caseFolding[$charset][$opt];
            break;
        case 'ascii':
            if (!$this->initToASCII($charset)) {
                // Table not available: do nothing
                return $str;
            }
            $map = &$this->toASCII[$charset];
            break;
        default:
            return $str;
    }
    $isShiftJis = $charset === 'shift_jis';
    $result = '';
    $offset = 0;
    while (isset($str[$offset])) {
        $character = $str[$offset];
        $byte = ord($character);
        // Shift-JIS is treated as a special case regarding the range of first bytes
        $startsDoubleByte = $isShiftJis
            ? (($byte >= 128 && $byte < 160) || $byte >= 224)
            : $byte >= 128;
        if ($startsDoubleByte) {
            // A double-byte char
            $character = substr($str, $offset, 2);
            $offset++;
        }
        $result .= isset($map[$character]) ? $map[$character] : $character;
        $offset++;
    }
    return $result;
}
2002
/**
 * Determines (once) which multi-byte strategy is used, based on the PHP extensions
 * that are loaded. "mbstring" takes precedence over "iconv".
 * See http://stackoverflow.com/questions/8233517/what-is-the-difference-between-iconv-and-mb-convert-encoding-in-php
 *
 * @return string One of the STRATEGY_* constants of this class
 */
protected function getConversionStrategy()
{
    // The detection result is kept for subsequent calls
    if ($this->conversionStrategy !== null) {
        return $this->conversionStrategy;
    }
    if (extension_loaded('mbstring')) {
        $detectedStrategy = self::STRATEGY_MBSTRING;
    } elseif (extension_loaded('iconv')) {
        $detectedStrategy = self::STRATEGY_ICONV;
    } else {
        $detectedStrategy = self::STRATEGY_FALLBACK;
    }
    $this->conversionStrategy = $detectedStrategy;
    return $this->conversionStrategy;
}
2023 }