2 namespace TYPO3\CMS\Core\Charset
;
5 * This file is part of the TYPO3 CMS project.
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
14 * The TYPO3 project - inspiring people to share!
17 use TYPO3\CMS\Core\Localization\Locales
;
18 use TYPO3\CMS\Core\SingletonInterface
;
19 use TYPO3\CMS\Core\Utility\ExtensionManagementUtility
;
20 use TYPO3\CMS\Core\Utility\GeneralUtility
;
25 * Functions working on UTF-8 strings:
30 * - implode/explode/join
32 * Functions nearly working on UTF-8 strings:
34 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
35 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
36 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
37 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
38 * - preg_*: Support compiled into PHP by default nowadays, but could be unavailable, need to use modifier
40 * Functions NOT working on UTF-8 strings:
52 * Class for conversion between charsets
54 class CharsetConverter
implements SingletonInterface
58 * Possible strategies for handling multi-byte data
59 * Only used for internal purpose
62 const STRATEGY_MBSTRING
= 'mbstring';
63 const STRATEGY_ICONV
= 'iconv';
64 const STRATEGY_FALLBACK
= 'fallback';
67 * ASCII Value for chars with no equivalent.
71 public $noCharByteVal = 63;
74 * This is the array where parsed conversion tables are stored (cached)
78 public $parsedCharsets = array();
81 * An array where case folding data will be stored (cached)
85 public $caseFolding = array();
88 * An array where charset-to-ASCII mappings are stored (cached)
92 public $toASCII = array();
95 * This tells the converter which charsets has two bytes per char:
99 public $twoByteSets = array(
104 * This tells the converter which charsets has four bytes per char:
108 public $fourByteSets = array(
109 'ucs-4' => 1, // 4-byte Unicode
114 * This tells the converter which charsets use a scheme like the Extended Unix Code:
118 public $eucBasedSets = array(
119 'gb2312' => 1, // Chinese, simplified.
120 'big5' => 1, // Chinese, traditional.
121 'euc-kr' => 1, // Korean
126 * @link http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
127 * @link http://czyborra.com/charsets/iso8859.html
131 public $synonyms = array(
133 'us-ascii' => 'ascii',
134 'cp819' => 'iso-8859-1',
135 'ibm819' => 'iso-8859-1',
136 'iso-ir-100' => 'iso-8859-1',
137 'iso-ir-101' => 'iso-8859-2',
138 'iso-ir-109' => 'iso-8859-3',
139 'iso-ir-110' => 'iso-8859-4',
140 'iso-ir-144' => 'iso-8859-5',
141 'iso-ir-127' => 'iso-8859-6',
142 'iso-ir-126' => 'iso-8859-7',
143 'iso-ir-138' => 'iso-8859-8',
144 'iso-ir-148' => 'iso-8859-9',
145 'iso-ir-157' => 'iso-8859-10',
146 'iso-ir-179' => 'iso-8859-13',
147 'iso-ir-199' => 'iso-8859-14',
148 'iso-ir-203' => 'iso-8859-15',
149 'csisolatin1' => 'iso-8859-1',
150 'csisolatin2' => 'iso-8859-2',
151 'csisolatin3' => 'iso-8859-3',
152 'csisolatin5' => 'iso-8859-9',
153 'csisolatin8' => 'iso-8859-14',
154 'csisolatin9' => 'iso-8859-15',
155 'csisolatingreek' => 'iso-8859-7',
156 'iso-celtic' => 'iso-8859-14',
157 'latin1' => 'iso-8859-1',
158 'latin2' => 'iso-8859-2',
159 'latin3' => 'iso-8859-3',
160 'latin5' => 'iso-8859-9',
161 'latin6' => 'iso-8859-10',
162 'latin8' => 'iso-8859-14',
163 'latin9' => 'iso-8859-15',
164 'l1' => 'iso-8859-1',
165 'l2' => 'iso-8859-2',
166 'l3' => 'iso-8859-3',
167 'l5' => 'iso-8859-9',
168 'l6' => 'iso-8859-10',
169 'l8' => 'iso-8859-14',
170 'l9' => 'iso-8859-15',
171 'cyrillic' => 'iso-8859-5',
172 'arabic' => 'iso-8859-6',
173 'tis-620' => 'iso-8859-11',
174 'win874' => 'windows-874',
175 'win1250' => 'windows-1250',
176 'win1251' => 'windows-1251',
177 'win1252' => 'windows-1252',
178 'win1253' => 'windows-1253',
179 'win1254' => 'windows-1254',
180 'win1255' => 'windows-1255',
181 'win1256' => 'windows-1256',
182 'win1257' => 'windows-1257',
183 'win1258' => 'windows-1258',
184 'cp1250' => 'windows-1250',
185 'cp1251' => 'windows-1251',
186 'cp1252' => 'windows-1252',
187 'ms-ee' => 'windows-1250',
188 'ms-ansi' => 'windows-1252',
189 'ms-greek' => 'windows-1253',
190 'ms-turk' => 'windows-1254',
191 'winbaltrim' => 'windows-1257',
192 'koi-8ru' => 'koi-8r',
196 'macintosh' => 'macroman',
197 'euc-cn' => 'gb2312',
198 'x-euc-cn' => 'gb2312',
204 'sjis' => 'shift_jis',
205 'shift-jis' => 'shift_jis',
206 'cp932' => 'shift_jis',
217 * Mapping of iso-639-1 language codes to script names
221 public $lang_to_script = array(
222 // iso-639-1 language codes, see http://www.loc.gov/standards/iso639-2/php/code_list.php
223 'af' => 'west_european', // Afrikaans
225 'bg' => 'cyrillic', // Bulgarian
226 'bs' => 'east_european', // Bosnian
227 'cs' => 'east_european', // Czech
228 'da' => 'west_european', // Danish
229 'de' => 'west_european', // German
230 'es' => 'west_european', // Spanish
232 'eo' => 'unicode', // Esperanto
233 'eu' => 'west_european', // Basque
234 'fa' => 'arabic', // Persian
235 'fi' => 'west_european', // Finish
236 'fo' => 'west_european', // Faroese
237 'fr' => 'west_european', // French
238 'ga' => 'west_european', // Irish
239 'gl' => 'west_european', // Galician
241 'he' => 'hebrew', // Hebrew (since 1998)
242 'hi' => 'unicode', // Hindi
243 'hr' => 'east_european', // Croatian
244 'hu' => 'east_european', // Hungarian
245 'iw' => 'hebrew', // Hebrew (til 1998)
246 'is' => 'west_european', // Icelandic
247 'it' => 'west_european', // Italian
249 'ka' => 'unicode', // Georgian
250 'kl' => 'west_european', // Greenlandic
251 'km' => 'unicode', // Khmer
253 'lt' => 'lithuanian',
254 'lv' => 'west_european', // Latvian/Lettish
255 'nl' => 'west_european', // Dutch
256 'no' => 'west_european', // Norwegian
257 'nb' => 'west_european', // Norwegian Bokmal
258 'nn' => 'west_european', // Norwegian Nynorsk
259 'pl' => 'east_european', // Polish
260 'pt' => 'west_european', // Portuguese
261 'ro' => 'east_european', // Romanian
262 'ru' => 'cyrillic', // Russian
263 'sk' => 'east_european', // Slovak
264 'sl' => 'east_european', // Slovenian
265 'sr' => 'cyrillic', // Serbian
266 'sv' => 'west_european', // Swedish
267 'sq' => 'albanian', // Albanian
269 'uk' => 'cyrillic', // Ukranian
270 'vi' => 'vietnamese',
273 // MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
274 // http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
275 'afk' => 'west_european', // Afrikaans
277 'bgr' => 'cyrillic', // Bulgarian
278 'cat' => 'west_european', // Catalan
279 'chs' => 'simpl_chinese',
280 'cht' => 'trad_chinese',
281 'csy' => 'east_european', // Czech
282 'dan' => 'west_european', // Danish
283 'deu' => 'west_european', // German
284 'dea' => 'west_european', // German (Austrian)
285 'des' => 'west_european', // German (Swiss)
286 'ena' => 'west_european', // English (Australian)
287 'enc' => 'west_european', // English (Canadian)
288 'eng' => 'west_european', // English
289 'enz' => 'west_european', // English (New Zealand)
290 'enu' => 'west_european', // English (United States)
291 'euq' => 'west_european', // Basque
292 'fos' => 'west_european', // Faroese
293 'far' => 'arabic', // Persian
294 'fin' => 'west_european', // Finish
295 'fra' => 'west_european', // French
296 'frb' => 'west_european', // French (Belgian)
297 'frc' => 'west_european', // French (Canadian)
298 'frs' => 'west_european', // French (Swiss)
299 'geo' => 'unicode', // Georgian
300 'glg' => 'west_european', // Galician
303 'hin' => 'unicode', // Hindi
304 'hun' => 'east_european', // Hungarian
305 'isl' => 'west_european', // Icelandic
306 'ita' => 'west_european', // Italian
307 'its' => 'west_european', // Italian (Swiss)
309 'khm' => 'unicode', // Khmer
311 'lth' => 'lithuanian',
312 'lvi' => 'west_european', // Latvian/Lettish
313 'msl' => 'west_european', // Malay
314 'nlb' => 'west_european', // Dutch (Belgian)
315 'nld' => 'west_european', // Dutch
316 'nor' => 'west_european', // Norwegian (bokmal)
317 'non' => 'west_european', // Norwegian (nynorsk)
318 'plk' => 'east_european', // Polish
319 'ptg' => 'west_european', // Portuguese
320 'ptb' => 'west_european', // Portuguese (Brazil)
321 'rom' => 'east_european', // Romanian
322 'rus' => 'cyrillic', // Russian
323 'slv' => 'east_european', // Slovenian
324 'sky' => 'east_european', // Slovak
325 'srl' => 'east_european', // Serbian (Latin)
326 'srb' => 'cyrillic', // Serbian (Cyrillic)
327 'esp' => 'west_european', // Spanish (trad. sort)
328 'esm' => 'west_european', // Spanish (Mexican)
329 'esn' => 'west_european', // Spanish (internat. sort)
330 'sve' => 'west_european', // Swedish
331 'sqi' => 'albanian', // Albanian
334 'ukr' => 'cyrillic', // Ukrainian
336 // English language names
337 'afrikaans' => 'west_european',
338 'albanian' => 'albanian',
339 'arabic' => 'arabic',
340 'basque' => 'west_european',
341 'bosnian' => 'east_european',
342 'bulgarian' => 'east_european',
343 'catalan' => 'west_european',
344 'croatian' => 'east_european',
345 'czech' => 'east_european',
346 'danish' => 'west_european',
347 'dutch' => 'west_european',
348 'english' => 'west_european',
349 'esperanto' => 'unicode',
350 'estonian' => 'estonian',
351 'faroese' => 'west_european',
353 'finnish' => 'west_european',
354 'french' => 'west_european',
355 'galician' => 'west_european',
356 'georgian' => 'unicode',
357 'german' => 'west_european',
359 'greenlandic' => 'west_european',
360 'hebrew' => 'hebrew',
361 'hindi' => 'unicode',
362 'hungarian' => 'east_european',
363 'icelandic' => 'west_european',
364 'italian' => 'west_european',
365 'khmer' => 'unicode',
366 'latvian' => 'west_european',
367 'lettish' => 'west_european',
368 'lithuanian' => 'lithuanian',
369 'malay' => 'west_european',
370 'norwegian' => 'west_european',
371 'persian' => 'arabic',
372 'polish' => 'east_european',
373 'portuguese' => 'west_european',
374 'russian' => 'cyrillic',
375 'romanian' => 'east_european',
376 'serbian' => 'cyrillic',
377 'slovak' => 'east_european',
378 'slovenian' => 'east_european',
379 'spanish' => 'west_european',
380 'svedish' => 'west_european',
382 'turkish' => 'turkish',
383 'ukrainian' => 'cyrillic'
387 * Mapping of language (family) names to charsets on Unix
391 public $script_to_charset_unix = array(
392 'west_european' => 'iso-8859-1',
393 'estonian' => 'iso-8859-1',
394 'east_european' => 'iso-8859-2',
395 'baltic' => 'iso-8859-4',
396 'cyrillic' => 'iso-8859-5',
397 'arabic' => 'iso-8859-6',
398 'greek' => 'iso-8859-7',
399 'hebrew' => 'iso-8859-8',
400 'turkish' => 'iso-8859-9',
401 'thai' => 'iso-8859-11', // = TIS-620
402 'lithuanian' => 'iso-8859-13',
403 'chinese' => 'gb2312', // = euc-cn
404 'japanese' => 'euc-jp',
405 'korean' => 'euc-kr',
406 'simpl_chinese' => 'gb2312',
407 'trad_chinese' => 'big5',
409 'unicode' => 'utf-8',
410 'albanian' => 'utf-8'
414 * Mapping of language (family) names to charsets on Windows
418 public $script_to_charset_windows = array(
419 'east_european' => 'windows-1250',
420 'cyrillic' => 'windows-1251',
421 'west_european' => 'windows-1252',
422 'greek' => 'windows-1253',
423 'turkish' => 'windows-1254',
424 'hebrew' => 'windows-1255',
425 'arabic' => 'windows-1256',
426 'baltic' => 'windows-1257',
427 'estonian' => 'windows-1257',
428 'lithuanian' => 'windows-1257',
429 'vietnamese' => 'windows-1258',
432 'chinese' => 'gb2312',
433 'japanese' => 'shift_jis',
434 'simpl_chinese' => 'gb2312',
435 'trad_chinese' => 'big5',
436 'albanian' => 'windows-1250',
441 * Mapping of locale names to charsets
445 public $locale_to_charset = array(
446 'japanese.euc' => 'euc-jp',
447 'ja_jp.ujis' => 'euc-jp',
448 'korean.euc' => 'euc-kr',
449 'sr@Latn' => 'iso-8859-2',
456 * TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
457 * Empty values means "utf-8"
461 public $charSetArray = array(
463 'ar' => 'iso-8859-6',
464 'ba' => 'iso-8859-2',
465 'bg' => 'windows-1251',
467 'ca' => 'iso-8859-15',
469 'cs' => 'windows-1250',
470 'cz' => 'windows-1250',
474 'el' => 'iso-8859-7',
477 'et' => 'iso-8859-4',
487 'gr' => 'iso-8859-7',
491 'hr' => 'windows-1250',
492 'hu' => 'iso-8859-2',
502 'lt' => 'windows-1257',
508 'pl' => 'iso-8859-2',
512 'ro' => 'iso-8859-2',
513 'ru' => 'windows-1251',
515 'si' => 'windows-1250',
516 'sk' => 'windows-1250',
517 'sl' => 'windows-1250',
521 'th' => 'iso-8859-11',
522 'tr' => 'iso-8859-9',
523 'ua' => 'windows-1251',
524 'uk' => 'windows-1251',
531 * Normalize - changes input character set to lowercase letters.
533 * @param string $charset Input charset
534 * @return string Normalized charset
536 public function parse_charset($charset)
538 $charset = trim(strtolower($charset));
539 if (isset($this->synonyms
[$charset])) {
540 $charset = $this->synonyms
[$charset];
546 * Get the charset of a locale.
549 * ln_CN language / country
550 * ln_CN.cs language / country / charset
551 * ln_CN.cs@mod language / country / charset / modifier
553 * @param string $locale Locale string
554 * @return string Charset resolved for locale string
556 public function get_locale_charset($locale)
558 $locale = strtolower($locale);
559 // Exact locale specific charset?
560 if (isset($this->locale_to_charset
[$locale])) {
561 return $this->locale_to_charset
[$locale];
564 list($locale, $modifier) = explode('@', $locale);
565 // Locale contains charset: use it
566 list($locale, $charset) = explode('.', $locale);
568 return $this->parse_charset($charset);
570 // Modifier is 'euro' (after charset check, because of xx.utf-8@euro)
571 if ($modifier === 'euro') {
572 return 'iso-8859-15';
575 list($language, ) = explode('_', $locale);
576 if (isset($this->lang_to_script
[$language])) {
577 $script = $this->lang_to_script
[$language];
579 if (TYPO3_OS
=== 'WIN') {
580 $cs = $this->script_to_charset_windows
[$script] ?
: 'windows-1252';
582 $cs = $this->script_to_charset_unix
[$script] ?
: 'utf-8';
587 /********************************************
589 * Charset Conversion functions
591 ********************************************/
593 * Convert from one charset to another charset.
595 * @param string $inputString Input string
596 * @param string $fromCharset From charset (the current charset of the string)
597 * @param string $toCharset To charset (the output charset wanted)
598 * @param bool $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
599 * @return string Converted string
602 public function conv($inputString, $fromCharset, $toCharset, $useEntityForNoChar = false)
604 if ($fromCharset === $toCharset) {
607 // PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
608 if ($toCharset === 'utf-8' ||
!$useEntityForNoChar) {
609 switch ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']) {
611 $convertedString = mb_convert_encoding($inputString, $toCharset, $fromCharset);
612 if (false !== $convertedString) {
613 return $convertedString;
615 // Returns FALSE for unsupported charsets
618 $convertedString = iconv($fromCharset, $toCharset . '//TRANSLIT', $inputString);
619 if (false !== $convertedString) {
620 return $convertedString;
625 if ($fromCharset !== 'utf-8') {
626 $inputString = $this->utf8_encode($inputString, $fromCharset);
628 if ($toCharset !== 'utf-8') {
629 $inputString = $this->utf8_decode($inputString, $toCharset, $useEntityForNoChar);
635 * Convert all elements in ARRAY with type string from one charset to another charset.
636 * NOTICE: Array is passed by reference!
638 * @param array $array Input array, possibly multidimensional
639 * @param string $fromCharset From charset (the current charset of the string)
640 * @param string $toCharset To charset (the output charset wanted)
641 * @param bool $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
645 public function convArray(&$array, $fromCharset, $toCharset, $useEntityForNoChar = false)
647 foreach ($array as $key => $value) {
648 if (is_array($array[$key])) {
649 $this->convArray($array[$key], $fromCharset, $toCharset, $useEntityForNoChar);
650 } elseif (is_string($array[$key])) {
651 $array[$key] = $this->conv($array[$key], $fromCharset, $toCharset, $useEntityForNoChar);
657 * Converts $str from $charset to UTF-8
659 * @param string $str String in local charset to convert to UTF-8
660 * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
661 * @return string Output string, converted to UTF-8
663 public function utf8_encode($str, $charset)
665 if ($charset === 'utf-8') {
668 // Charset is case-insensitive
669 // Parse conv. table if not already
670 if ($this->initCharset($charset)) {
671 $strLen = strlen($str);
673 // Traverse each char in string
674 for ($a = 0; $a < $strLen; $a++
) {
675 $chr = substr($str, $a, 1);
677 // If the charset has two bytes per char
678 if (isset($this->twoByteSets
[$charset])) {
679 $ord2 = ord($str[$a +
1]);
681 $ord = $ord << 8 |
$ord2;
682 // If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?)
683 if (isset($this->parsedCharsets
[$charset]['local'][$ord])) {
684 $outStr .= $this->parsedCharsets
[$charset]['local'][$ord];
686 $outStr .= chr($this->noCharByteVal
);
690 } elseif ($ord > 127) {
691 // If char has value over 127 it's a multibyte char in UTF-8
692 // EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
693 if (isset($this->eucBasedSets
[$charset])) {
694 // Shift-JIS: chars between 160 and 223 are single byte
695 if ($charset !== 'shift_jis' ||
($ord < 160 ||
$ord > 223)) {
697 $ord2 = ord(substr($str, $a, 1));
698 $ord = $ord * 256 +
$ord2;
701 if (isset($this->parsedCharsets
[$charset]['local'][$ord])) {
702 // If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?)
703 $outStr .= $this->parsedCharsets
[$charset]['local'][$ord];
705 $outStr .= chr($this->noCharByteVal
);
716 * Converts $str from UTF-8 to $charset
718 * @param string $str String in UTF-8 to convert to local charset
719 * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
720 * @param bool $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
721 * @return string Output string, converted to local charset
723 public function utf8_decode($str, $charset, $useEntityForNoChar = false)
725 if ($charset === 'utf-8') {
728 // Charset is case-insensitive.
729 // Parse conv. table if not already
730 if ($this->initCharset($charset)) {
731 $strLen = strlen($str);
733 // Traverse each char in UTF-8 string
734 for ($a = 0, $i = 0; $a < $strLen; $a++
, $i++
) {
735 $chr = substr($str, $a, 1);
737 // This means multibyte! (first byte!)
739 // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
743 // For each byte in multibyte string
744 for ($b = 0; $b < 8; $b++
) {
747 // ... and with 8th bit - if that is set, then there are still bytes in sequence.
750 // ... and add the next char.
751 $buf .= substr($str, $a, 1);
756 // If the UTF-8 char-sequence is found then...
757 if (isset($this->parsedCharsets
[$charset]['utf8'][$buf])) {
759 $mByte = $this->parsedCharsets
[$charset]['utf8'][$buf];
760 // If the local number is greater than 255 we will need to split the byte (16bit word assumed) in two chars.
762 $outStr .= chr(($mByte >> 8 & 255)) . chr(($mByte & 255));
764 $outStr .= chr($mByte);
766 } elseif ($useEntityForNoChar) {
767 // Create num entity:
768 $outStr .= '&#' . $this->utf8CharToUnumber($buf, 1) . ';';
770 $outStr .= chr($this->noCharByteVal
);
773 $outStr .= chr($this->noCharByteVal
);
784 * Converts all chars > 127 to numeric entities.
786 * @param string $str Input string
787 * @return string Output string
789 public function utf8_to_entities($str)
791 $strLen = strlen($str);
793 // Traverse each char in UTF-8 string.
794 for ($a = 0; $a < $strLen; $a++
) {
795 $chr = substr($str, $a, 1);
797 // This means multibyte! (first byte!)
799 // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
803 // For each byte in multibyte string...
804 for ($b = 0; $b < 8; $b++
) {
805 // Shift it left and ...
807 // ... and with 8th bit - if that is set, then there are still bytes in sequence.
810 // ... and add the next char.
811 $buf .= substr($str, $a, 1);
816 $outStr .= '&#' . $this->utf8CharToUnumber($buf, 1) . ';';
818 $outStr .= chr($this->noCharByteVal
);
828 * Converts numeric entities (UNICODE, eg. decimal (Ӓ) or hexadecimal ()) to UTF-8 multibyte chars
830 * @param string $str Input string, UTF-8
831 * @param bool $alsoStdHtmlEnt If set, then all string-HTML entities (like & or £ will be converted as well)
832 * @return string Output string
834 public function entities_to_utf8($str, $alsoStdHtmlEnt = false)
836 if ($alsoStdHtmlEnt) {
837 $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES
, ENT_COMPAT
, 'UTF-8'));
839 $token = md5(microtime());
840 $parts = explode($token, preg_replace('/(&([#[:alnum:]]*);)/', $token . '${2}' . $token, $str));
841 foreach ($parts as $k => $v) {
842 // Only take every second element
847 // Dec or hex entities
848 if (substr($v, $position, 1) === '#') {
850 if (substr($v, $position, 1) === 'x') {
851 $v = hexdec(substr($v, ++
$position));
853 $v = substr($v, $position);
855 $parts[$k] = $this->UnumberToChar($v);
856 } elseif ($alsoStdHtmlEnt && isset($trans_tbl['&' . $v . ';'])) {
858 $v = $trans_tbl['&' . $v . ';'];
862 $parts[$k] = '&' . $v . ';';
865 return implode('', $parts);
869 * Converts all chars in the input UTF-8 string into integer numbers returned in an array
871 * @param string $str Input string, UTF-8
872 * @param bool $convEntities If set, then all HTML entities (like & or £ or { or 㽝) will be detected as characters.
873 * @param bool $retChar If set, then instead of integer numbers the real UTF-8 char is returned.
874 * @return array Output array with the char numbers
876 public function utf8_to_numberarray($str, $convEntities = false, $retChar = false)
878 // If entities must be registered as well...:
880 $str = $this->entities_to_utf8($str, 1);
883 $strLen = strlen($str);
885 // Traverse each char in UTF-8 string.
886 for ($a = 0; $a < $strLen; $a++
) {
887 $chr = substr($str, $a, 1);
889 // This means multibyte! (first byte!)
891 // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
895 // For each byte in multibyte string...
896 for ($b = 0; $b < 8; $b++
) {
897 // Shift it left and ...
899 // ... and with 8th bit - if that is set, then there are still bytes in sequence.
902 // ... and add the next char.
903 $buf .= substr($str, $a, 1);
908 $outArr[] = $retChar ?
$buf : $this->utf8CharToUnumber($buf);
910 $outArr[] = $retChar ?
chr($this->noCharByteVal
) : $this->noCharByteVal
;
913 $outArr[] = $retChar ?
chr($ord) : $ord;
920 * Converts a UNICODE number to a UTF-8 multibyte character
921 * Algorithm based on script found at From: http://czyborra.com/utf/
922 * Unit-tested by Kasper
924 * The binary representation of the character's integer value is thus simply spread across the bytes
925 * and the number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
927 * bytes | bits | representation
929 * 2 | 11 | 110vvvvv 10vvvvvv
930 * 3 | 16 | 1110vvvv 10vvvvvv 10vvvvvv
931 * 4 | 21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
932 * 5 | 26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
933 * 6 | 31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
935 * @param int $unicodeInteger UNICODE integer
936 * @return string UTF-8 multibyte character string
937 * @see utf8CharToUnumber()
939 public function UnumberToChar($unicodeInteger)
942 if ($unicodeInteger < 128) {
943 $str .= chr($unicodeInteger);
944 } elseif ($unicodeInteger < 2048) {
945 $str .= chr(192 |
$unicodeInteger >> 6);
946 $str .= chr(128 |
$unicodeInteger & 63);
947 } elseif ($unicodeInteger < 65536) {
948 $str .= chr(224 |
$unicodeInteger >> 12);
949 $str .= chr(128 |
$unicodeInteger >> 6 & 63);
950 $str .= chr(128 |
$unicodeInteger & 63);
951 } elseif ($unicodeInteger < 2097152) {
952 $str .= chr(240 |
$unicodeInteger >> 18);
953 $str .= chr(128 |
$unicodeInteger >> 12 & 63);
954 $str .= chr(128 |
$unicodeInteger >> 6 & 63);
955 $str .= chr(128 |
$unicodeInteger & 63);
956 } elseif ($unicodeInteger < 67108864) {
957 $str .= chr(248 |
$unicodeInteger >> 24);
958 $str .= chr(128 |
$unicodeInteger >> 18 & 63);
959 $str .= chr(128 |
$unicodeInteger >> 12 & 63);
960 $str .= chr(128 |
$unicodeInteger >> 6 & 63);
961 $str .= chr(128 |
$unicodeInteger & 63);
962 } elseif ($unicodeInteger < 2147483648) {
963 $str .= chr(252 |
$unicodeInteger >> 30);
964 $str .= chr(128 |
$unicodeInteger >> 24 & 63);
965 $str .= chr(128 |
$unicodeInteger >> 18 & 63);
966 $str .= chr(128 |
$unicodeInteger >> 12 & 63);
967 $str .= chr(128 |
$unicodeInteger >> 6 & 63);
968 $str .= chr(128 |
$unicodeInteger & 63);
970 // Cannot express a 32-bit character in UTF-8
971 $str .= chr($this->noCharByteVal
);
977 * Converts a UTF-8 Multibyte character to a UNICODE number
978 * Unit-tested by Kasper
980 * @param string $str UTF-8 multibyte character string
981 * @param bool $hex If set, then a hex. number is returned.
982 * @return int UNICODE integer
983 * @see UnumberToChar()
985 public function utf8CharToUnumber($str, $hex = false)
989 // This verifies that it IS a multi byte string
990 if (($ord & 192) === 192) {
992 // For each byte in multibyte string...
993 for ($b = 0; $b < 8; $b++
) {
994 // Shift it left and ...
996 // ... and with 8th bit - if that is set, then there are still bytes in sequence.
998 $binBuf .= substr('00000000' . decbin(ord(substr($str, ($b +
1), 1))), -6);
1003 $binBuf = substr(('00000000' . decbin(ord($str[0]))), -(6 - $b)) . $binBuf;
1004 $int = bindec($binBuf);
1008 return $hex ?
'x' . dechex($int) : $int;
1011 /********************************************
1015 ********************************************/
1017 * This will initialize a charset for use if it's defined in the 'typo3/sysext/core/Resources/Private/Charsets/csconvtbl/' folder
1018 * This function is automatically called by the conversion functions
1020 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
1022 * @param string $charset The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
1023 * @return int Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed.
1026 public function initCharset($charset)
1028 // Only process if the charset is not yet loaded:
1029 if (!is_array($this->parsedCharsets
[$charset])) {
1030 // Conversion table filename:
1031 $charsetConvTableFile = ExtensionManagementUtility
::extPath('core') . 'Resources/Private/Charsets/csconvtbl/' . $charset . '.tbl';
1032 // If the conversion table is found:
1033 if ($charset && GeneralUtility
::validPathStr($charsetConvTableFile) && @is_file
($charsetConvTableFile)) {
1034 // Cache file for charsets:
1035 // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
1036 $cacheFile = GeneralUtility
::getFileAbsFileName('typo3temp/cs/charset_' . $charset . '.tbl');
1037 if ($cacheFile && @is_file
($cacheFile)) {
1038 $this->parsedCharsets
[$charset] = unserialize(GeneralUtility
::getUrl($cacheFile));
1040 // Parse conversion table into lines:
1041 $lines = GeneralUtility
::trimExplode(LF
, GeneralUtility
::getUrl($charsetConvTableFile), true);
1042 // Initialize the internal variable holding the conv. table:
1043 $this->parsedCharsets
[$charset] = array('local' => array(), 'utf8' => array());
1044 // traverse the lines:
1046 foreach ($lines as $value) {
1047 // Comment line or blanks are ignored.
1048 if (trim($value) && $value[0] !== '#') {
1049 // Detect type if not done yet: (Done on first real line)
1050 // The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
1051 if (!$detectedType) {
1052 $detectedType = preg_match('/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/', $value) ?
'whitespaced' : 'ms-token';
1054 if ($detectedType === 'ms-token') {
1055 list($hexbyte, $utf8) = preg_split('/[=:]/', $value, 3);
1056 } elseif ($detectedType === 'whitespaced') {
1058 preg_match('/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/', $value, $regA);
1059 $hexbyte = $regA[1];
1060 $utf8 = 'U+' . $regA[2];
1062 $decval = hexdec(trim($hexbyte));
1063 if ($decval > 127) {
1064 $utf8decval = hexdec(substr(trim($utf8), 2));
1065 $this->parsedCharsets
[$charset]['local'][$decval] = $this->UnumberToChar($utf8decval);
1066 $this->parsedCharsets
[$charset]['utf8'][$this->parsedCharsets
[$charset]['local'][$decval]] = $decval;
1071 GeneralUtility
::writeFileToTypo3tempDir($cacheFile, serialize($this->parsedCharsets
[$charset]));
1084 * This function initializes all UTF-8 character data tables.
1086 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
1088 * @param string $mode Mode ("case", "ascii", ...)
1089 * @return int Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
1092 public function initUnicodeData($mode = null)
1095 $cacheFileCase = GeneralUtility
::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
1096 $cacheFileASCII = GeneralUtility
::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');
1097 // Only process if the tables are not yet loaded
1100 if (is_array($this->caseFolding
['utf-8'])) {
1103 // Use cached version if possible
1104 if ($cacheFileCase && @is_file
($cacheFileCase)) {
1105 $this->caseFolding
['utf-8'] = unserialize(GeneralUtility
::getUrl($cacheFileCase));
1110 if (is_array($this->toASCII
['utf-8'])) {
1113 // Use cached version if possible
1114 if ($cacheFileASCII && @is_file
($cacheFileASCII)) {
1115 $this->toASCII
['utf-8'] = unserialize(GeneralUtility
::getUrl($cacheFileASCII));
1120 // Process main Unicode data file
1121 $unicodeDataFile = ExtensionManagementUtility
::extPath('core') . 'Resources/Private/Charsets/unidata/UnicodeData.txt';
1122 if (!(GeneralUtility
::validPathStr($unicodeDataFile) && @is_file
($unicodeDataFile))) {
1125 $fh = fopen($unicodeDataFile, 'rb');
1129 // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
1130 // Note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
1131 $this->caseFolding
['utf-8'] = array();
1132 $utf8CaseFolding = &$this->caseFolding
['utf-8'];
1134 $utf8CaseFolding['toUpper'] = array();
1135 $utf8CaseFolding['toLower'] = array();
1136 $utf8CaseFolding['toTitle'] = array();
1137 // Array of temp. decompositions
1138 $decomposition = array();
1139 // Array of chars that are marks (eg. composing accents)
1141 // Array of chars that are numbers (eg. digits)
1143 // Array of chars to be omitted (eg. Russian hard sign)
1145 while (!feof($fh)) {
1146 $line = fgets($fh, 4096);
1147 // Has a lot of info
1148 list($char, $name, $cat, , , $decomp, , , $num, , , , $upper, $lower, $title, ) = explode(';', rtrim($line));
1149 $ord = hexdec($char);
1151 // Only process the BMP
1154 $utf8_char = $this->UnumberToChar($ord);
1156 $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
1159 $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
1161 // Store "title" only when different from "upper" (only a few)
1162 if ($title && $title !== $upper) {
1163 $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
1167 // mark (accent, umlaut, ...)
1168 $mark['U+' . $char] = 1;
1172 if ($ord > 128 && $num !== '') {
1173 $number['U+' . $char] = $num;
1176 // Accented Latin letters without "official" decomposition
1178 if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
1179 $c = ord($match[2]);
1180 if ($match[1] === 'SMALL') {
1183 $decomposition['U+' . $char] = array(dechex($c));
1187 if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
1188 switch ($match[1]) {
1190 // add parenthesis as circle replacement, eg (1)
1191 $match[2] = '0028 ' . $match[2] . ' 0029';
1194 // add square brackets as square replacement, eg [1]
1195 $match[2] = '005B ' . $match[2] . ' 005D';
1198 // ignore multi char decompositions that start with a space
1199 if (preg_match('/^0020 /', $match[2])) {
1210 $decomposition['U+' . $char] = explode(' ', $match[2]);
1214 // Process additional Unicode data for casing (allow folded characters to expand into a sequence)
1215 $specialCasingFile = ExtensionManagementUtility
::extPath('core') . 'Resources/Private/Charsets/unidata/SpecialCasing.txt';
1216 if (GeneralUtility
::validPathStr($specialCasingFile) && @is_file
($specialCasingFile)) {
1217 $fh = fopen($specialCasingFile, 'rb');
1219 while (!feof($fh)) {
1220 $line = fgets($fh, 4096);
1221 if ($line[0] !== '#' && trim($line) !== '') {
1222 list($char, $lower, $title, $upper, $cond) = GeneralUtility
::trimExplode(';', $line);
1223 if ($cond === '' ||
$cond[0] === '#') {
1224 $utf8_char = $this->UnumberToChar(hexdec($char));
1225 if ($char !== $lower) {
1226 $arr = explode(' ', $lower);
1227 for ($i = 0; isset($arr[$i]); $i++
) {
1228 $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
1230 $utf8CaseFolding['toLower'][$utf8_char] = implode('', $arr);
1232 if ($char !== $title && $title !== $upper) {
1233 $arr = explode(' ', $title);
1234 for ($i = 0; isset($arr[$i]); $i++
) {
1235 $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
1237 $utf8CaseFolding['toTitle'][$utf8_char] = implode('', $arr);
1239 if ($char !== $upper) {
1240 $arr = explode(' ', $upper);
1241 for ($i = 0; isset($arr[$i]); $i++
) {
1242 $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
1244 $utf8CaseFolding['toUpper'][$utf8_char] = implode('', $arr);
1252 // Process custom decompositions
1253 $customTranslitFile = ExtensionManagementUtility
::extPath('core') . 'Resources/Private/Charsets/unidata/Translit.txt';
1254 if (GeneralUtility
::validPathStr($customTranslitFile) && @is_file
($customTranslitFile)) {
1255 $fh = fopen($customTranslitFile, 'rb');
1257 while (!feof($fh)) {
1258 $line = fgets($fh, 4096);
1259 if ($line[0] !== '#' && trim($line) !== '') {
1260 list($char, $translit) = GeneralUtility
::trimExplode(';', $line);
1262 $omit['U+' . $char] = 1;
1264 $decomposition['U+' . $char] = explode(' ', $translit);
1270 // Decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
1271 foreach ($decomposition as $from => $to) {
1272 $code_decomp = array();
1273 while ($code_value = array_shift($to)) {
1274 // Do recursive decomposition
1275 if (isset($decomposition['U+' . $code_value])) {
1276 foreach (array_reverse($decomposition['U+' . $code_value]) as $cv) {
1277 array_unshift($to, $cv);
1279 } elseif (!isset($mark['U+' . $code_value])) {
1281 array_push($code_decomp, $code_value);
1284 if (!empty($code_decomp) ||
isset($omit[$from])) {
1285 $decomposition[$from] = $code_decomp;
1287 unset($decomposition[$from]);
1290 // Create ascii only mapping
1291 $this->toASCII
['utf-8'] = array();
1292 $ascii = &$this->toASCII
['utf-8'];
1293 foreach ($decomposition as $from => $to) {
1294 $code_decomp = array();
1295 while ($code_value = array_shift($to)) {
1296 $ord = hexdec($code_value);
1300 // Skip decompositions containing non-ASCII chars
1301 array_push($code_decomp, chr($ord));
1304 $ascii[$this->UnumberToChar(hexdec($from))] = join('', $code_decomp);
1306 // Add numeric decompositions
1307 foreach ($number as $from => $to) {
1308 $utf8_char = $this->UnumberToChar(hexdec($from));
1309 if (!isset($ascii[$utf8_char])) {
1310 $ascii[$utf8_char] = $to;
1313 if ($cacheFileCase) {
1314 GeneralUtility
::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
1316 if ($cacheFileASCII) {
1317 GeneralUtility
::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
1323 * This function initializes the folding table for a charset other than UTF-8.
1324 * This function is automatically called by the case folding functions.
1326 * @param string $charset Charset for which to initialize case folding.
1327 * @return int Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
1330 public function initCaseFolding($charset)
1332 // Only process if the case table is not yet loaded:
1333 if (is_array($this->caseFolding
[$charset])) {
1336 // Use cached version if possible
1337 $cacheFile = GeneralUtility
::getFileAbsFileName('typo3temp/cs/cscase_' . $charset . '.tbl');
1338 if ($cacheFile && @is_file
($cacheFile)) {
1339 $this->caseFolding
[$charset] = unserialize(GeneralUtility
::getUrl($cacheFile));
1342 // init UTF-8 conversion for this charset
1343 if (!$this->initCharset($charset)) {
1346 // UTF-8 case folding is used as the base conversion table
1347 if (!$this->initUnicodeData('case')) {
1350 $nochar = chr($this->noCharByteVal
);
1351 foreach ($this->parsedCharsets
[$charset]['local'] as $ci => $utf8) {
1352 // Reconvert to charset (don't use chr() of numeric value, might be muli-byte)
1353 $c = $this->utf8_decode($utf8, $charset);
1354 $cc = $this->utf8_decode($this->caseFolding
['utf-8']['toUpper'][$utf8], $charset);
1355 if ($cc !== '' && $cc !== $nochar) {
1356 $this->caseFolding
[$charset]['toUpper'][$c] = $cc;
1358 $cc = $this->utf8_decode($this->caseFolding
['utf-8']['toLower'][$utf8], $charset);
1359 if ($cc !== '' && $cc !== $nochar) {
1360 $this->caseFolding
[$charset]['toLower'][$c] = $cc;
1362 $cc = $this->utf8_decode($this->caseFolding
['utf-8']['toTitle'][$utf8], $charset);
1363 if ($cc !== '' && $cc !== $nochar) {
1364 $this->caseFolding
[$charset]['toTitle'][$c] = $cc;
1367 // Add the ASCII case table
1370 for ($i = $start; $i <= $end; $i++
) {
1371 $this->caseFolding
[$charset]['toUpper'][chr($i)] = chr($i - 32);
1375 for ($i = $start; $i <= $end; $i++
) {
1376 $this->caseFolding
[$charset]['toLower'][chr($i)] = chr($i +
32);
1379 GeneralUtility
::writeFileToTypo3tempDir($cacheFile, serialize($this->caseFolding
[$charset]));
1385 * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
1386 * This function is automatically called by the ASCII transliteration functions.
1388 * @param string $charset Charset for which to initialize conversion.
1389 * @return int Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
1392 public function initToASCII($charset)
1394 // Only process if the case table is not yet loaded:
1395 if (is_array($this->toASCII
[$charset])) {
1398 // Use cached version if possible
1399 $cacheFile = GeneralUtility
::getFileAbsFileName('typo3temp/cs/csascii_' . $charset . '.tbl');
1400 if ($cacheFile && @is_file
($cacheFile)) {
1401 $this->toASCII
[$charset] = unserialize(GeneralUtility
::getUrl($cacheFile));
1404 // Init UTF-8 conversion for this charset
1405 if (!$this->initCharset($charset)) {
1408 // UTF-8/ASCII transliteration is used as the base conversion table
1409 if (!$this->initUnicodeData('ascii')) {
1412 foreach ($this->parsedCharsets
[$charset]['local'] as $ci => $utf8) {
1413 // Reconvert to charset (don't use chr() of numeric value, might be muli-byte)
1414 $c = $this->utf8_decode($utf8, $charset);
1415 if (isset($this->toASCII
['utf-8'][$utf8])) {
1416 $this->toASCII
[$charset][$c] = $this->toASCII
['utf-8'][$utf8];
1420 GeneralUtility
::writeFileToTypo3tempDir($cacheFile, serialize($this->toASCII
[$charset]));
1425 /********************************************
1427 * String operation functions
1429 ********************************************/
1431 * Returns a part of a string.
1432 * Unit-tested by Kasper (single byte charsets only)
1434 * @param string $charset The character set
1435 * @param string $string Character string
1436 * @param int $start Start position (character position)
1437 * @param int $len Length (in characters)
1438 * @return string The substring
1439 * @see substr(), mb_substr()
1441 public function substr($charset, $string, $start, $len = null)
1443 if ($len === 0 ||
$string === '') {
1446 if ($this->getConversionStrategy() === self
::STRATEGY_MBSTRING
) {
1447 // Cannot omit $len, when specifying charset
1448 if ($len === null) {
1449 // Save internal encoding
1450 $enc = mb_internal_encoding();
1451 mb_internal_encoding($charset);
1452 $str = mb_substr($string, $start);
1453 // Restore internal encoding
1454 mb_internal_encoding($enc);
1457 return mb_substr($string, $start, $len, $charset);
1459 } elseif ($this->getConversionStrategy() === self
::STRATEGY_ICONV
) {
1460 // Cannot omit $len, when specifying charset
1461 if ($len === null) {
1462 // Save internal encoding
1463 $enc = iconv_get_encoding('internal_encoding');
1464 iconv_set_encoding('internal_encoding', $charset);
1465 $str = iconv_substr($string, $start);
1466 // Restore internal encoding
1467 iconv_set_encoding('internal_encoding', $enc);
1470 return iconv_substr($string, $start, $len, $charset);
1472 } elseif ($charset === 'utf-8') {
1473 return $this->utf8_substr($string, $start, $len);
1474 } elseif ($this->eucBasedSets
[$charset]) {
1475 return $this->euc_substr($string, $start, $charset, $len);
1476 } elseif ($this->twoByteSets
[$charset]) {
1477 return substr($string, $start * 2, $len * 2);
1478 } elseif ($this->fourByteSets
[$charset]) {
1479 return substr($string, $start * 4, $len * 4);
1481 // Treat everything else as single-byte encoding
1482 return $len === null ?
substr($string, $start) : substr($string, $start, $len);
1486 * Counts the number of characters.
1487 * Unit-tested by Kasper (single byte charsets only)
1489 * @param string $charset The character set
1490 * @param string $string Character string
1491 * @return int The number of characters
1494 public function strlen($charset, $string)
1496 if ($this->getConversionStrategy() === self
::STRATEGY_MBSTRING
) {
1497 return mb_strlen($string, $charset);
1498 } elseif ($this->getConversionStrategy() === self
::STRATEGY_ICONV
) {
1499 return iconv_strlen($string, $charset);
1500 } elseif ($charset === 'utf-8') {
1501 return $this->utf8_strlen($string);
1502 } elseif ($this->eucBasedSets
[$charset]) {
1503 return $this->euc_strlen($string, $charset);
1504 } elseif ($this->twoByteSets
[$charset]) {
1505 return strlen($string) / 2;
1506 } elseif ($this->fourByteSets
[$charset]) {
1507 return strlen($string) / 4;
1509 // Treat everything else as single-byte encoding
1510 return strlen($string);
1514 * Method to crop strings using the mb_substr function.
1516 * @param string $charset The character set
1517 * @param string $string String to be cropped
1518 * @param int $len Crop length (in characters)
1519 * @param string $crop Crop signifier
1520 * @return string The shortened string
1521 * @see mb_strlen(), mb_substr()
1523 protected function cropMbstring($charset, $string, $len, $crop = '')
1525 if ((int)$len === 0 ||
mb_strlen($string, $charset) <= abs($len)) {
1529 $string = mb_substr($string, 0, $len, $charset) . $crop;
1531 $string = $crop . mb_substr($string, $len, mb_strlen($string, $charset), $charset);
1537 * Truncates a string and pre-/appends a string.
1538 * Unit tested by Kasper
1540 * @param string $charset The character set
1541 * @param string $string Character string
1542 * @param int $len Length (in characters)
1543 * @param string $crop Crop signifier
1544 * @return string The shortened string
1545 * @see substr(), mb_strimwidth()
1547 public function crop($charset, $string, $len, $crop = '')
1549 if ($this->getConversionStrategy() === self
::STRATEGY_MBSTRING
) {
1550 return $this->cropMbstring($charset, $string, $len, $crop);
1552 if ((int)$len === 0) {
1555 if ($charset === 'utf-8') {
1556 $i = $this->utf8_char2byte_pos($string, $len);
1557 } elseif ($this->eucBasedSets
[$charset]) {
1558 $i = $this->euc_char2byte_pos($string, $len, $charset);
1563 $i = strlen($string) +
$len;
1569 // $len outside actual string length
1574 if (isset($string[$i])) {
1575 return substr($string, 0, $i) . $crop;
1578 if (isset($string[$i - 1])) {
1579 return $crop . substr($string, $i);
1587 * Cuts a string short at a given byte length.
1589 * @param string $charset The character set
1590 * @param string $string Character string
1591 * @param int $len The byte length
1592 * @return string The shortened string
1595 public function strtrunc($charset, $string, $len)
1600 if ($this->getConversionStrategy() === self
::STRATEGY_MBSTRING
) {
1601 return mb_strcut($string, 0, $len, $charset);
1602 } elseif ($charset === 'utf-8') {
1603 return $this->utf8_strtrunc($string, $len);
1604 } elseif ($this->eucBasedSets
[$charset]) {
1605 return $this->euc_strtrunc($string, $len, $charset);
1606 } elseif ($this->twoByteSets
[$charset]) {
1610 } elseif ($this->fourByteSets
[$charset]) {
1612 // Realign to position dividable by four
1615 // Treat everything else as single-byte encoding
1616 return substr($string, 0, $len);
1620 * Translates all characters of a string into their respective case values.
1621 * Unlike strtolower() and strtoupper() this method is locale independent.
1622 * Note that the string length may change!
1623 * eg. lower case German "ß" (sharp S) becomes upper case "SS"
1624 * Unit-tested by Kasper
1625 * Real case folding is language dependent, this method ignores this fact.
1627 * @param string $charset Character set of string
1628 * @param string $string Input string to convert case for
1629 * @param string $case Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
1630 * @return string The converted string
1631 * @see strtolower(), strtoupper()
1633 public function conv_case($charset, $string, $case)
1635 if ($this->getConversionStrategy() === self
::STRATEGY_MBSTRING
) {
1636 if ($case === 'toLower') {
1637 $string = mb_strtolower($string, $charset);
1639 $string = mb_strtoupper($string, $charset);
1641 } elseif ($charset === 'utf-8') {
1642 $string = $this->utf8_char_mapping($string, 'case', $case);
1643 } elseif (isset($this->eucBasedSets
[$charset])) {
1644 $string = $this->euc_char_mapping($string, $charset, 'case', $case);
1646 // Treat everything else as single-byte encoding
1647 $string = $this->sb_char_mapping($string, $charset, 'case', $case);
1653 * Equivalent of lcfirst/ucfirst but using character set.
1655 * @param string $charset
1656 * @param string $string
1657 * @param string $case
1659 * @see \TYPO3\CMS\Core\Charset\CharsetConverter::conv_case()
1661 public function convCaseFirst($charset, $string, $case)
1663 $firstChar = $this->substr($charset, $string, 0, 1);
1664 $firstChar = $this->conv_case($charset, $firstChar, $case);
1665 $remainder = $this->substr($charset, $string, 1);
1666 return $firstChar . $remainder;
1670 * Converts special chars (like æøåÆØÅ, umlauts etc) to ascii equivalents (usually double-bytes, like æ => ae etc.)
1672 * @param string $charset Character set of string
1673 * @param string $string Input string to convert
1674 * @return string The converted string
1676 public function specCharsToASCII($charset, $string)
1678 if ($charset === 'utf-8') {
1679 $string = $this->utf8_char_mapping($string, 'ascii');
1680 } elseif (isset($this->eucBasedSets
[$charset])) {
1681 $string = $this->euc_char_mapping($string, $charset, 'ascii');
1683 // Treat everything else as single-byte encoding
1684 $string = $this->sb_char_mapping($string, $charset, 'ascii');
1690 * Converts the language codes that we get from the client (usually HTTP_ACCEPT_LANGUAGE)
1691 * into a TYPO3-readable language code
1693 * @param string $languageCodesList List of language codes. something like 'de,en-us;q=0.9,de-de;q=0.7,es-cl;q=0.6,en;q=0.4,es;q=0.3,zh;q=0.1'
1694 * @return string A preferred language that TYPO3 supports, or "default" if none found
1696 public function getPreferredClientLanguage($languageCodesList)
1698 $allLanguageCodes = $this->getAllLanguageCodes();
1699 $selectedLanguage = 'default';
1700 $preferredLanguages = GeneralUtility
::trimExplode(',', $languageCodesList);
1701 // Order the preferred languages after they key
1702 $sortedPreferredLanguages = array();
1703 foreach ($preferredLanguages as $preferredLanguage) {
1705 if (strpos($preferredLanguage, ';q=') !== false) {
1706 list($preferredLanguage, $quality) = explode(';q=', $preferredLanguage);
1708 $sortedPreferredLanguages[$preferredLanguage] = $quality;
1710 // Loop through the languages, with the highest priority first
1711 arsort($sortedPreferredLanguages, SORT_NUMERIC
);
1712 foreach ($sortedPreferredLanguages as $preferredLanguage => $quality) {
1713 if (isset($allLanguageCodes[$preferredLanguage])) {
1714 $selectedLanguage = $allLanguageCodes[$preferredLanguage];
1717 // Strip the country code from the end
1718 list($preferredLanguage, ) = explode('-', $preferredLanguage);
1719 if (isset($allLanguageCodes[$preferredLanguage])) {
1720 $selectedLanguage = $allLanguageCodes[$preferredLanguage];
1724 if (!$selectedLanguage ||
$selectedLanguage === 'en') {
1725 $selectedLanguage = 'default';
1727 return $selectedLanguage;
1731 * Merges all available charsets and locales, currently only used for getPreferredClientLanguage()
1735 protected function getAllLanguageCodes()
1737 // Get all languages where TYPO3 code is the same as the ISO code
1738 $typo3LanguageCodes = array_keys($this->charSetArray
);
1739 $allLanguageCodes = array_combine($typo3LanguageCodes, $typo3LanguageCodes);
1740 // Get all languages where TYPO3 code differs from ISO code
1741 // or needs the country part
1742 // the iso codes will here overwrite the default typo3 language in the key
1743 /** @var Locales $locales */
1744 $locales = GeneralUtility
::makeInstance(Locales
::class);
1745 foreach ($locales->getIsoMapping() as $typo3Lang => $isoLang) {
1746 $isoLang = join('-', explode('_', $isoLang));
1747 $allLanguageCodes[$typo3Lang] = $isoLang;
1749 // Move the iso codes to the (because we're comparing the keys with "isset" later on)
1750 return array_flip($allLanguageCodes);
1753 /********************************************
1755 * Internal string operation functions
1757 ********************************************/
1759 * Maps all characters of a string in a single byte charset.
1761 * @param string $str The string
1762 * @param string $charset The charset
1763 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
1764 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
1765 * @return string The converted string
1767 public function sb_char_mapping($str, $charset, $mode, $opt = '')
1771 if (!$this->initCaseFolding($charset)) {
1775 $map = &$this->caseFolding
[$charset][$opt];
1778 if (!$this->initToASCII($charset)) {
1782 $map = &$this->toASCII
[$charset];
1788 for ($i = 0; isset($str[$i]); $i++
) {
1790 if (isset($map[$c])) {
1799 /********************************************
1801 * Internal UTF-8 string operation functions
1803 ********************************************/
1805 * Returns a part of a UTF-8 string.
1806 * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
1808 * @param string $str UTF-8 string
1809 * @param int $start Start position (character position)
1810 * @param int $len Length (in characters)
1811 * @return string The substring
1814 public function utf8_substr($str, $start, $len = null)
1816 if ((string)$len === '0') {
1819 $byte_start = $this->utf8_char2byte_pos($str, $start);
1820 if ($byte_start === false) {
1822 // $start outside string length
1826 $str = substr($str, $byte_start);
1828 $byte_end = $this->utf8_char2byte_pos($str, $len);
1829 // $len outside actual string length
1830 if ($byte_end === false) {
1831 return $len < 0 ?
'' : $str;
1833 // When length is less than zero and exceeds, then we return blank string.
1834 return substr($str, 0, $byte_end);
1842 * Counts the number of characters of a string in UTF-8.
1843 * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
1845 * @param string $str UTF-8 multibyte character string
1846 * @return int The number of characters
1849 public function utf8_strlen($str)
1852 for ($i = 0; isset($str[$i]); $i++
) {
1854 // Single-byte (0xxxxxx)
1857 } elseif (($c & 192) === 192) {
1858 // Multi-byte starting byte (11xxxxxx)
1866 * Truncates a string in UTF-8 short at a given byte length.
1868 * @param string $str UTF-8 multibyte character string
1869 * @param int $len The byte length
1870 * @return string The shortened string
1873 public function utf8_strtrunc($str, $len)
1876 // Part of a multibyte sequence
1877 if (ord($str[$i]) & 128) {
1878 for (; $i > 0 && !(ord($str[$i]) & 64); $i--) {
1884 for ($bc = 0, $mbs = ord($str[$i]); $mbs & 128; $mbs = $mbs << 1) {
1885 // Calculate number of bytes
1888 if ($bc +
$i > $len) {
1889 return substr($str, 0, $i);
1892 return substr($str, 0, $len);
1896 * Find position of first occurrence of a string, both arguments are in UTF-8.
1898 * @param string $haystack UTF-8 string to search in
1899 * @param string $needle UTF-8 string to search for
1900 * @param int $offset Position to start the search
1901 * @return int The character position
1904 public function utf8_strpos($haystack, $needle, $offset = 0)
1906 if ($this->getConversionStrategy() === self
::STRATEGY_MBSTRING
) {
1907 return mb_strpos($haystack, $needle, $offset, 'utf-8');
1908 } elseif ($this->getConversionStrategy() === self
::STRATEGY_ICONV
) {
1909 return iconv_strpos($haystack, $needle, $offset, 'utf-8');
1911 $byte_offset = $this->utf8_char2byte_pos($haystack, $offset);
1912 if ($byte_offset === false) {
1913 // Offset beyond string length
1916 $byte_pos = strpos($haystack, $needle, $byte_offset);
1917 if ($byte_pos === false) {
1921 return $this->utf8_byte2char_pos($haystack, $byte_pos);
1925 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
1927 * @param string $haystack UTF-8 string to search in
1928 * @param string $needle UTF-8 character to search for (single character)
1929 * @return int The character position
1932 public function utf8_strrpos($haystack, $needle)
1934 if ($this->getConversionStrategy() === self
::STRATEGY_MBSTRING
) {
1935 return mb_strrpos($haystack, $needle, 'utf-8');
1936 } elseif ($this->getConversionStrategy() === self
::STRATEGY_ICONV
) {
1937 return iconv_strrpos($haystack, $needle, 'utf-8');
1939 $byte_pos = strrpos($haystack, $needle);
1940 if ($byte_pos === false) {
1944 return $this->utf8_byte2char_pos($haystack, $byte_pos);
1948 * Translates a character position into an 'absolute' byte position.
1949 * Unit tested by Kasper.
1951 * @param string $str UTF-8 string
1952 * @param int $pos Character position (negative values start from the end)
1953 * @return int Byte position
1955 public function utf8_char2byte_pos($str, $pos)
1957 // Number of characters found
1959 // Number of characters wanted
1965 $i = strlen($str) - 1;
1968 for (; isset($str[$i]) && $n < $p; $i +
= $d) {
1969 $c = (int)ord($str[$i]);
1970 // single-byte (0xxxxxx)
1973 } elseif (($c & 192) === 192) {
1974 // Multi-byte starting byte (11xxxxxx)
1978 if (!isset($str[$i])) {
1979 // Offset beyond string length
1983 // Skip trailing multi-byte data bytes
1984 while (ord($str[$i]) & 128 && !(ord($str[$i]) & 64)) {
1995 * Translates an 'absolute' byte position into a character position.
1996 * Unit tested by Kasper.
1998 * @param string $str UTF-8 string
1999 * @param int $pos Byte position
2000 * @return int Character position
2002 public function utf8_byte2char_pos($str, $pos)
2004 // Number of characters
2006 for ($i = $pos; $i > 0; $i--) {
2007 $c = (int)ord($str[$i]);
2008 // single-byte (0xxxxxx)
2011 } elseif (($c & 192) === 192) {
2012 // Multi-byte starting byte (11xxxxxx)
2016 if (!isset($str[$i])) {
2017 // Offset beyond string length
2024 * Maps all characters of an UTF-8 string.
2026 * @param string $str UTF-8 string
2027 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
2028 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
2029 * @return string The converted string
2031 public function utf8_char_mapping($str, $mode, $opt = '')
2033 if (!$this->initUnicodeData($mode)) {
2040 $map = &$this->caseFolding
['utf-8'][$opt];
2043 $map = &$this->toASCII
['utf-8'];
2048 for ($i = 0; isset($str[$i]); $i++
) {
2050 // single-byte (0xxxxxx)
2053 } elseif (($c & 192) === 192) {
2054 // multi-byte starting byte (11xxxxxx)
2055 for ($bc = 0; $c & 128; $c = $c << 1) {
2058 // calculate number of bytes
2059 $mbc = substr($str, $i, $bc);
2062 if (isset($map[$mbc])) {
2071 /********************************************
2073 * Internal EUC string operation functions
2075 * Extended Unix Code:
2076 * ASCII compatible 7bit single bytes chars
2077 * 8bit two byte chars
2079 * Shift-JIS is treated as a special case.
2081 ********************************************/
2083 * Cuts a string in the EUC charset family short at a given byte length.
2085 * @param string $str EUC multibyte character string
2086 * @param int $len The byte length
2087 * @param string $charset The charset
2088 * @return string The shortened string
2091 public function euc_strtrunc($str, $len, $charset)
2093 $shiftJis = $charset === 'shift_jis';
2094 for ($i = 0; isset($str[$i]) && $i < $len; $i++
) {
2097 if ($c >= 128 && $c < 160 ||
$c >= 224) {
2106 if (!isset($str[$i])) {
2109 // string shorter than supplied length
2111 // We ended on a first byte
2112 return substr($str, 0, $len - 1);
2114 return substr($str, 0, $len);
2119 * Returns a part of a string in the EUC charset family.
2121 * @param string $str EUC multibyte character string
2122 * @param int $start Start position (character position)
2123 * @param string $charset The charset
2124 * @param int $len Length (in characters)
2125 * @return string the substring
2127 public function euc_substr($str, $start, $charset, $len = null)
2129 $byte_start = $this->euc_char2byte_pos($str, $start, $charset);
2130 if ($byte_start === false) {
2131 // $start outside string length
2134 $str = substr($str, $byte_start);
2136 $byte_end = $this->euc_char2byte_pos($str, $len, $charset);
2137 // $len outside actual string length
2138 if ($byte_end === false) {
2141 return substr($str, 0, $byte_end);
2149 * Counts the number of characters of a string in the EUC charset family.
2151 * @param string $str EUC multibyte character string
2152 * @param string $charset The charset
2153 * @return int The number of characters
2156 public function euc_strlen($str, $charset)
2158 $sjis = $charset === 'shift_jis';
2160 for ($i = 0; isset($str[$i]); $i++
) {
2163 if ($c >= 128 && $c < 160 ||
$c >= 224) {
2177 * Translates a character position into an 'absolute' byte position.
2179 * @param string $str EUC multibyte character string
2180 * @param int $pos Character position (negative values start from the end)
2181 * @param string $charset The charset
2182 * @return int Byte position
2184 public function euc_char2byte_pos($str, $pos, $charset)
2186 $sjis = $charset === 'shift_jis';
2187 // Number of characters seen
2189 // Number of characters wanted
2195 $i = strlen($str) - 1;
2198 for (; isset($str[$i]) && $n < $p; $i +
= $d) {
2201 if ($c >= 128 && $c < 160 ||
$c >= 224) {
2211 if (!isset($str[$i])) {
2214 // offset beyond string length
2223 * Maps all characters of a string in the EUC charset family.
2225 * @param string $str EUC multibyte character string
2226 * @param string $charset The charset
2227 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
2228 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
2229 * @return string The converted string
2231 public function euc_char_mapping($str, $charset, $mode, $opt = '')
2235 if (!$this->initCaseFolding($charset)) {
2239 $map = &$this->caseFolding
[$charset][$opt];
2242 if (!$this->initToASCII($charset)) {
2246 $map = &$this->toASCII
[$charset];
2251 $sjis = $charset === 'shift_jis';
2253 for ($i = 0; isset($str[$i]); $i++
) {
2257 // A double-byte char
2258 if ($c >= 128 && $c < 160 ||
$c >= 224) {
2259 $mbc = substr($str, $i, 2);
2263 // A double-byte char
2265 $mbc = substr($str, $i, 2);
2269 if (isset($map[$mbc])) {
2279 * Checks the selected strategy based on which method is configured in
2280 * $TYPO3_CONF_VARS[SYS][t3lib_cs_utils].
2282 * @return string could be "mbstring", "iconv" or "fallback"
2284 protected function getConversionStrategy() {
2285 if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] === self
::STRATEGY_MBSTRING
) {
2286 return self
::STRATEGY_MBSTRING
;
2287 } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] === self
::STRATEGY_ICONV
) {
2288 return self
::STRATEGY_ICONV
;
2290 return self
::STRATEGY_FALLBACK
;