[TASK] Only instantiate Locales in csConv when necessary
[Packages/TYPO3.CMS.git] / typo3 / sysext / core / Classes / Charset / CharsetConverter.php
1 <?php
2 namespace TYPO3\CMS\Core\Charset;
3
4 /*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use TYPO3\CMS\Core\Localization\Locales;
18 use TYPO3\CMS\Core\Utility\ExtensionManagementUtility;
19 use TYPO3\CMS\Core\Utility\GeneralUtility;
20
21 /**
22 * Notes on UTF-8
23 *
24 * Functions working on UTF-8 strings:
25 *
26 * - strchr/strstr
27 * - strrchr
28 * - substr_count
29 * - implode/explode/join
30 *
31 * Functions nearly working on UTF-8 strings:
32 *
33 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
34 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
35 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
36 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
37 * - preg_*: Support compiled into PHP by default nowadays, but could be unavailable, need to use modifier
38 *
39 * Functions NOT working on UTF-8 strings:
40 *
41 * - str*cmp
42 * - stristr
43 * - stripos
44 * - substr
45 * - strrev
46 * - split/spliti
47 * - ...
48 */
49
50 /**
51 * Class for conversion between charsets
52 */
53 class CharsetConverter
54 {
55
56 /**
57 * ASCII Value for chars with no equivalent.
58 *
59 * @var int
60 */
61 public $noCharByteVal = 63;
62
63 /**
64 * This is the array where parsed conversion tables are stored (cached)
65 *
66 * @var array
67 */
68 public $parsedCharsets = array();
69
70 /**
71 * An array where case folding data will be stored (cached)
72 *
73 * @var array
74 */
75 public $caseFolding = array();
76
77 /**
78 * An array where charset-to-ASCII mappings are stored (cached)
79 *
80 * @var array
81 */
82 public $toASCII = array();
83
84 /**
85 * This tells the converter which charsets has two bytes per char:
86 *
87 * @var array
88 */
89 public $twoByteSets = array(
90 'ucs-2' => 1
91 );
92
93 /**
94 * This tells the converter which charsets has four bytes per char:
95 *
96 * @var array
97 */
98 public $fourByteSets = array(
99 'ucs-4' => 1, // 4-byte Unicode
100 'utf-32' => 1
101 );
102
103 /**
104 * This tells the converter which charsets use a scheme like the Extended Unix Code:
105 *
106 * @var array
107 */
108 public $eucBasedSets = array(
109 'gb2312' => 1, // Chinese, simplified.
110 'big5' => 1, // Chinese, traditional.
111 'euc-kr' => 1, // Korean
112 'shift_jis' => 1
113 );
114
115 /**
116 * @link http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
117 * @link http://czyborra.com/charsets/iso8859.html
118 *
119 * @var array
120 */
121 public $synonyms = array(
122 'us' => 'ascii',
123 'us-ascii' => 'ascii',
124 'cp819' => 'iso-8859-1',
125 'ibm819' => 'iso-8859-1',
126 'iso-ir-100' => 'iso-8859-1',
127 'iso-ir-101' => 'iso-8859-2',
128 'iso-ir-109' => 'iso-8859-3',
129 'iso-ir-110' => 'iso-8859-4',
130 'iso-ir-144' => 'iso-8859-5',
131 'iso-ir-127' => 'iso-8859-6',
132 'iso-ir-126' => 'iso-8859-7',
133 'iso-ir-138' => 'iso-8859-8',
134 'iso-ir-148' => 'iso-8859-9',
135 'iso-ir-157' => 'iso-8859-10',
136 'iso-ir-179' => 'iso-8859-13',
137 'iso-ir-199' => 'iso-8859-14',
138 'iso-ir-203' => 'iso-8859-15',
139 'csisolatin1' => 'iso-8859-1',
140 'csisolatin2' => 'iso-8859-2',
141 'csisolatin3' => 'iso-8859-3',
142 'csisolatin5' => 'iso-8859-9',
143 'csisolatin8' => 'iso-8859-14',
144 'csisolatin9' => 'iso-8859-15',
145 'csisolatingreek' => 'iso-8859-7',
146 'iso-celtic' => 'iso-8859-14',
147 'latin1' => 'iso-8859-1',
148 'latin2' => 'iso-8859-2',
149 'latin3' => 'iso-8859-3',
150 'latin5' => 'iso-8859-9',
151 'latin6' => 'iso-8859-10',
152 'latin8' => 'iso-8859-14',
153 'latin9' => 'iso-8859-15',
154 'l1' => 'iso-8859-1',
155 'l2' => 'iso-8859-2',
156 'l3' => 'iso-8859-3',
157 'l5' => 'iso-8859-9',
158 'l6' => 'iso-8859-10',
159 'l8' => 'iso-8859-14',
160 'l9' => 'iso-8859-15',
161 'cyrillic' => 'iso-8859-5',
162 'arabic' => 'iso-8859-6',
163 'tis-620' => 'iso-8859-11',
164 'win874' => 'windows-874',
165 'win1250' => 'windows-1250',
166 'win1251' => 'windows-1251',
167 'win1252' => 'windows-1252',
168 'win1253' => 'windows-1253',
169 'win1254' => 'windows-1254',
170 'win1255' => 'windows-1255',
171 'win1256' => 'windows-1256',
172 'win1257' => 'windows-1257',
173 'win1258' => 'windows-1258',
174 'cp1250' => 'windows-1250',
175 'cp1251' => 'windows-1251',
176 'cp1252' => 'windows-1252',
177 'ms-ee' => 'windows-1250',
178 'ms-ansi' => 'windows-1252',
179 'ms-greek' => 'windows-1253',
180 'ms-turk' => 'windows-1254',
181 'winbaltrim' => 'windows-1257',
182 'koi-8ru' => 'koi-8r',
183 'koi8r' => 'koi-8r',
184 'cp878' => 'koi-8r',
185 'mac' => 'macroman',
186 'macintosh' => 'macroman',
187 'euc-cn' => 'gb2312',
188 'x-euc-cn' => 'gb2312',
189 'euccn' => 'gb2312',
190 'cp936' => 'gb2312',
191 'big-5' => 'big5',
192 'cp950' => 'big5',
193 'eucjp' => 'euc-jp',
194 'sjis' => 'shift_jis',
195 'shift-jis' => 'shift_jis',
196 'cp932' => 'shift_jis',
197 'cp949' => 'euc-kr',
198 'utf7' => 'utf-7',
199 'utf8' => 'utf-8',
200 'utf16' => 'utf-16',
201 'utf32' => 'utf-32',
202 'ucs2' => 'ucs-2',
203 'ucs4' => 'ucs-4'
204 );
205
/**
 * Mapping of language identifiers to script names.
 *
 * Three kinds of keys are supported: iso-639-1 language codes, MS language
 * codes and lower-case English language names. The script names are the keys
 * used by $script_to_charset_unix and $script_to_charset_windows.
 *
 * The misspelled keys 'svedish' and 'that' are kept for backwards
 * compatibility; the correctly spelled 'swedish' and 'thai' were added.
 *
 * @var array
 */
public $lang_to_script = array(
    // iso-639-1 language codes, see http://www.loc.gov/standards/iso639-2/php/code_list.php
    'af' => 'west_european', // Afrikaans
    'ar' => 'arabic',
    'bg' => 'cyrillic', // Bulgarian
    'bs' => 'east_european', // Bosnian
    'cs' => 'east_european', // Czech
    'da' => 'west_european', // Danish
    'de' => 'west_european', // German
    'es' => 'west_european', // Spanish
    'et' => 'estonian',
    'eo' => 'unicode', // Esperanto
    'eu' => 'west_european', // Basque
    'fa' => 'arabic', // Persian
    'fi' => 'west_european', // Finnish
    'fo' => 'west_european', // Faroese
    'fr' => 'west_european', // French
    'ga' => 'west_european', // Irish
    'gl' => 'west_european', // Galician
    'gr' => 'greek',
    'he' => 'hebrew', // Hebrew (since 1998)
    'hi' => 'unicode', // Hindi
    'hr' => 'east_european', // Croatian
    'hu' => 'east_european', // Hungarian
    'iw' => 'hebrew', // Hebrew (til 1998)
    'is' => 'west_european', // Icelandic
    'it' => 'west_european', // Italian
    'ja' => 'japanese',
    'ka' => 'unicode', // Georgian
    'kl' => 'west_european', // Greenlandic
    'km' => 'unicode', // Khmer
    'ko' => 'korean',
    'lt' => 'lithuanian',
    'lv' => 'west_european', // Latvian/Lettish
    'nl' => 'west_european', // Dutch
    'no' => 'west_european', // Norwegian
    'nb' => 'west_european', // Norwegian Bokmal
    'nn' => 'west_european', // Norwegian Nynorsk
    'pl' => 'east_european', // Polish
    'pt' => 'west_european', // Portuguese
    'ro' => 'east_european', // Romanian
    'ru' => 'cyrillic', // Russian
    'sk' => 'east_european', // Slovak
    'sl' => 'east_european', // Slovenian
    'sr' => 'cyrillic', // Serbian
    'sv' => 'west_european', // Swedish
    'sq' => 'albanian', // Albanian
    'th' => 'thai',
    'uk' => 'cyrillic', // Ukrainian
    'vi' => 'vietnamese',
    'zh' => 'chinese',

    // MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
    // http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
    'afk' => 'west_european', // Afrikaans
    'ara' => 'arabic',
    'bgr' => 'cyrillic', // Bulgarian
    'cat' => 'west_european', // Catalan
    'chs' => 'simpl_chinese',
    'cht' => 'trad_chinese',
    'csy' => 'east_european', // Czech
    'dan' => 'west_european', // Danish
    'deu' => 'west_european', // German
    'dea' => 'west_european', // German (Austrian)
    'des' => 'west_european', // German (Swiss)
    'ena' => 'west_european', // English (Australian)
    'enc' => 'west_european', // English (Canadian)
    'eng' => 'west_european', // English
    'enz' => 'west_european', // English (New Zealand)
    'enu' => 'west_european', // English (United States)
    'euq' => 'west_european', // Basque
    'fos' => 'west_european', // Faroese
    'far' => 'arabic', // Persian
    'fin' => 'west_european', // Finnish
    'fra' => 'west_european', // French
    'frb' => 'west_european', // French (Belgian)
    'frc' => 'west_european', // French (Canadian)
    'frs' => 'west_european', // French (Swiss)
    'geo' => 'unicode', // Georgian
    'glg' => 'west_european', // Galician
    'ell' => 'greek',
    'heb' => 'hebrew',
    'hin' => 'unicode', // Hindi
    'hun' => 'east_european', // Hungarian
    'isl' => 'west_european', // Icelandic
    'ita' => 'west_european', // Italian
    'its' => 'west_european', // Italian (Swiss)
    'jpn' => 'japanese',
    'khm' => 'unicode', // Khmer
    'kor' => 'korean',
    'lth' => 'lithuanian',
    'lvi' => 'west_european', // Latvian/Lettish
    'msl' => 'west_european', // Malay
    'nlb' => 'west_european', // Dutch (Belgian)
    'nld' => 'west_european', // Dutch
    'nor' => 'west_european', // Norwegian (bokmal)
    'non' => 'west_european', // Norwegian (nynorsk)
    'plk' => 'east_european', // Polish
    'ptg' => 'west_european', // Portuguese
    'ptb' => 'west_european', // Portuguese (Brazil)
    'rom' => 'east_european', // Romanian
    'rus' => 'cyrillic', // Russian
    'slv' => 'east_european', // Slovenian
    'sky' => 'east_european', // Slovak
    'srl' => 'east_european', // Serbian (Latin)
    'srb' => 'cyrillic', // Serbian (Cyrillic)
    'esp' => 'west_european', // Spanish (trad. sort)
    'esm' => 'west_european', // Spanish (Mexican)
    'esn' => 'west_european', // Spanish (internat. sort)
    'sve' => 'west_european', // Swedish
    'sqi' => 'albanian', // Albanian
    'tha' => 'thai',
    'trk' => 'turkish',
    'ukr' => 'cyrillic', // Ukrainian

    // English language names
    'afrikaans' => 'west_european',
    'albanian' => 'albanian',
    'arabic' => 'arabic',
    'basque' => 'west_european',
    'bosnian' => 'east_european',
    // Same script as the 'bg' and 'bgr' entries above
    'bulgarian' => 'cyrillic',
    'catalan' => 'west_european',
    'croatian' => 'east_european',
    'czech' => 'east_european',
    'danish' => 'west_european',
    'dutch' => 'west_european',
    'english' => 'west_european',
    'esperanto' => 'unicode',
    'estonian' => 'estonian',
    'faroese' => 'west_european',
    'farsi' => 'arabic',
    'finnish' => 'west_european',
    'french' => 'west_european',
    'galician' => 'west_european',
    'georgian' => 'unicode',
    'german' => 'west_european',
    'greek' => 'greek',
    'greenlandic' => 'west_european',
    'hebrew' => 'hebrew',
    'hindi' => 'unicode',
    'hungarian' => 'east_european',
    'icelandic' => 'west_european',
    'italian' => 'west_european',
    'khmer' => 'unicode',
    'latvian' => 'west_european',
    'lettish' => 'west_european',
    'lithuanian' => 'lithuanian',
    'malay' => 'west_european',
    'norwegian' => 'west_european',
    'persian' => 'arabic',
    'polish' => 'east_european',
    'portuguese' => 'west_european',
    'russian' => 'cyrillic',
    'romanian' => 'east_european',
    'serbian' => 'cyrillic',
    'slovak' => 'east_european',
    'slovenian' => 'east_european',
    'spanish' => 'west_european',
    'swedish' => 'west_european',
    // Misspelling of 'swedish', kept for backwards compatibility
    'svedish' => 'west_european',
    'thai' => 'thai',
    // Misspelling of 'thai', kept for backwards compatibility
    'that' => 'thai',
    'turkish' => 'turkish',
    'ukrainian' => 'cyrillic'
);
375
376 /**
377 * Mapping of language (family) names to charsets on Unix
378 *
379 * @var array
380 */
381 public $script_to_charset_unix = array(
382 'west_european' => 'iso-8859-1',
383 'estonian' => 'iso-8859-1',
384 'east_european' => 'iso-8859-2',
385 'baltic' => 'iso-8859-4',
386 'cyrillic' => 'iso-8859-5',
387 'arabic' => 'iso-8859-6',
388 'greek' => 'iso-8859-7',
389 'hebrew' => 'iso-8859-8',
390 'turkish' => 'iso-8859-9',
391 'thai' => 'iso-8859-11', // = TIS-620
392 'lithuanian' => 'iso-8859-13',
393 'chinese' => 'gb2312', // = euc-cn
394 'japanese' => 'euc-jp',
395 'korean' => 'euc-kr',
396 'simpl_chinese' => 'gb2312',
397 'trad_chinese' => 'big5',
398 'vietnamese' => '',
399 'unicode' => 'utf-8',
400 'albanian' => 'utf-8'
401 );
402
403 /**
404 * Mapping of language (family) names to charsets on Windows
405 *
406 * @var array
407 */
408 public $script_to_charset_windows = array(
409 'east_european' => 'windows-1250',
410 'cyrillic' => 'windows-1251',
411 'west_european' => 'windows-1252',
412 'greek' => 'windows-1253',
413 'turkish' => 'windows-1254',
414 'hebrew' => 'windows-1255',
415 'arabic' => 'windows-1256',
416 'baltic' => 'windows-1257',
417 'estonian' => 'windows-1257',
418 'lithuanian' => 'windows-1257',
419 'vietnamese' => 'windows-1258',
420 'thai' => 'cp874',
421 'korean' => 'cp949',
422 'chinese' => 'gb2312',
423 'japanese' => 'shift_jis',
424 'simpl_chinese' => 'gb2312',
425 'trad_chinese' => 'big5',
426 'albanian' => 'windows-1250',
427 'unicode' => 'utf-8'
428 );
429
430 /**
431 * Mapping of locale names to charsets
432 *
433 * @var array
434 */
435 public $locale_to_charset = array(
436 'japanese.euc' => 'euc-jp',
437 'ja_jp.ujis' => 'euc-jp',
438 'korean.euc' => 'euc-kr',
439 'sr@Latn' => 'iso-8859-2',
440 'zh_cn' => 'gb2312',
441 'zh_hk' => 'big5',
442 'zh_tw' => 'big5'
443 );
444
445 /**
446 * TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
447 * Empty values means "utf-8"
448 *
449 * @var array
450 */
451 public $charSetArray = array(
452 'af' => '',
453 'ar' => 'iso-8859-6',
454 'ba' => 'iso-8859-2',
455 'bg' => 'windows-1251',
456 'br' => '',
457 'ca' => 'iso-8859-15',
458 'ch' => 'gb2312',
459 'cs' => 'windows-1250',
460 'cz' => 'windows-1250',
461 'da' => '',
462 'de' => '',
463 'dk' => '',
464 'el' => 'iso-8859-7',
465 'eo' => 'utf-8',
466 'es' => '',
467 'et' => 'iso-8859-4',
468 'eu' => '',
469 'fa' => 'utf-8',
470 'fi' => '',
471 'fo' => 'utf-8',
472 'fr' => '',
473 'fr_CA' => '',
474 'ga' => '',
475 'ge' => 'utf-8',
476 'gl' => '',
477 'gr' => 'iso-8859-7',
478 'he' => 'utf-8',
479 'hi' => 'utf-8',
480 'hk' => 'big5',
481 'hr' => 'windows-1250',
482 'hu' => 'iso-8859-2',
483 'is' => 'utf-8',
484 'it' => '',
485 'ja' => 'shift_jis',
486 'jp' => 'shift_jis',
487 'ka' => 'utf-8',
488 'kl' => 'utf-8',
489 'km' => 'utf-8',
490 'ko' => 'euc-kr',
491 'kr' => 'euc-kr',
492 'lt' => 'windows-1257',
493 'lv' => 'utf-8',
494 'ms' => '',
495 'my' => '',
496 'nl' => '',
497 'no' => '',
498 'pl' => 'iso-8859-2',
499 'pt' => '',
500 'pt_BR' => '',
501 'qc' => '',
502 'ro' => 'iso-8859-2',
503 'ru' => 'windows-1251',
504 'se' => '',
505 'si' => 'windows-1250',
506 'sk' => 'windows-1250',
507 'sl' => 'windows-1250',
508 'sq' => 'utf-8',
509 'sr' => 'utf-8',
510 'sv' => '',
511 'th' => 'iso-8859-11',
512 'tr' => 'iso-8859-9',
513 'ua' => 'windows-1251',
514 'uk' => 'windows-1251',
515 'vi' => 'utf-8',
516 'vn' => 'utf-8',
517 'zh' => 'big5'
518 );
519
/**
 * Normalizes a charset name: lower-cases and trims the input, then maps
 * known aliases (see $synonyms) to their canonical name.
 *
 * @param string $charset Input charset
 * @return string Normalized charset
 */
public function parse_charset($charset)
{
    $normalized = trim(strtolower($charset));
    return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
}
534
/**
 * Get the charset of a locale.
 *
 * ln language
 * ln_CN language / country
 * ln_CN.cs language / country / charset
 * ln_CN.cs@mod language / country / charset / modifier
 *
 * Resolution order:
 * 1. exact (case-insensitive) match in $locale_to_charset
 * 2. charset contained in the locale string (normalized via parse_charset())
 * 3. modifier "euro" => iso-8859-15
 * 4. language => script => charset, using the Windows table if TYPO3_OS is
 *    'WIN' and the Unix table otherwise
 * 5. fallback: 'windows-1252' on Windows, 'utf-8' otherwise
 *
 * @param string $locale Locale string
 * @return string Charset resolved for locale string
 */
public function get_locale_charset($locale)
{
    $locale = strtolower($locale);
    // Exact locale specific charset?
    // The map keys are lower-cased as well, otherwise a mixed-case key such as
    // 'sr@Latn' could never match the lower-cased input.
    $localeToCharset = array_change_key_case($this->locale_to_charset, CASE_LOWER);
    if (isset($localeToCharset[$locale])) {
        return $localeToCharset[$locale];
    }
    // Get modifier (padded, so a locale without "@" does not read an undefined offset)
    list($locale, $modifier) = array_pad(explode('@', $locale), 2, '');
    // Locale contains charset: use it
    list($locale, $charset) = array_pad(explode('.', $locale), 2, '');
    if ($charset) {
        return $this->parse_charset($charset);
    }
    // Modifier is 'euro' (after charset check, because of xx.utf-8@euro)
    if ($modifier === 'euro') {
        return 'iso-8859-15';
    }
    // Get language
    list($language) = explode('_', $locale);
    // An unknown language yields an empty script name, which is in neither table
    $script = '';
    if (isset($this->lang_to_script[$language])) {
        $script = $this->lang_to_script[$language];
    }
    // Missing or empty table entries fall back to the OS specific default
    if (TYPO3_OS === 'WIN') {
        $cs = !empty($this->script_to_charset_windows[$script])
            ? $this->script_to_charset_windows[$script]
            : 'windows-1252';
    } else {
        $cs = !empty($this->script_to_charset_unix[$script])
            ? $this->script_to_charset_unix[$script]
            : 'utf-8';
    }
    return $cs;
}
576
577 /********************************************
578 *
579 * Charset Conversion functions
580 *
581 ********************************************/
/**
 * Converts a string from one charset to another.
 *
 * If both charsets are identical the input is returned untouched. Unless
 * entity fallback is requested for a non-UTF-8 target, the PHP extension
 * selected in $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']
 * (mbstring, iconv or recode) is tried first. If that is not applicable or
 * yields FALSE, the conversion is done via UTF-8 using utf8_encode() and
 * utf8_decode().
 *
 * @param string $inputString Input string
 * @param string $fromCharset From charset (the current charset of the string)
 * @param string $toCharset To charset (the output charset wanted)
 * @param bool $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Converted string
 * @see convArray()
 */
public function conv($inputString, $fromCharset, $toCharset, $useEntityForNoChar = false)
{
    if ($fromCharset === $toCharset) {
        return $inputString;
    }
    // The PHP extensions cannot fall back to SGML entities; UTF-8 as target never needs them
    $extensionMayBeUsed = $toCharset === 'utf-8' || !$useEntityForNoChar;
    if ($extensionMayBeUsed) {
        // Stays FALSE if no extension is configured or the extension fails
        $extensionResult = false;
        switch ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']) {
            case 'mbstring':
                $extensionResult = mb_convert_encoding($inputString, $toCharset, $fromCharset);
                break;
            case 'iconv':
                $extensionResult = iconv($fromCharset, $toCharset . '//TRANSLIT', $inputString);
                break;
            case 'recode':
                $extensionResult = recode_string($fromCharset . '..' . $toCharset, $inputString);
                break;
        }
        if ($extensionResult !== false) {
            return $extensionResult;
        }
    }
    // Table based conversion with UTF-8 as intermediate charset
    $utf8String = $fromCharset === 'utf-8'
        ? $inputString
        : $this->utf8_encode($inputString, $fromCharset);
    return $toCharset === 'utf-8'
        ? $utf8String
        : $this->utf8_decode($utf8String, $toCharset, $useEntityForNoChar);
}
629
/**
 * Converts every string found in an array (including nested arrays) from one
 * charset to another. Values of any other type are left as they are.
 * NOTICE: The array is passed by reference and modified in place!
 *
 * @param array $array Input array, possibly multidimensional
 * @param string $fromCharset From charset (the current charset of the string)
 * @param string $toCharset To charset (the output charset wanted)
 * @param bool $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return void
 * @see conv()
 */
public function convArray(&$array, $fromCharset, $toCharset, $useEntityForNoChar = false)
{
    foreach ($array as $index => $element) {
        if (is_string($element)) {
            $array[$index] = $this->conv($element, $fromCharset, $toCharset, $useEntityForNoChar);
            continue;
        }
        if (is_array($element)) {
            // Recurse on the original element so the nested array is changed in place
            $this->convArray($array[$index], $fromCharset, $toCharset, $useEntityForNoChar);
        }
    }
}
651
/**
 * Converts $str from $charset to UTF-8
 *
 * Bytes up to 127 are copied unchanged (except for two-byte charsets, where
 * every character is read as a big endian 16 bit value). Characters that are
 * not found in the conversion table are replaced by chr($noCharByteVal).
 *
 * @param string $str String in local charset to convert to UTF-8
 * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return string Output string, converted to UTF-8. Empty string if the charset could not be initialized.
 */
public function utf8_encode($str, $charset)
{
    if ($charset === 'utf-8') {
        return $str;
    }
    // Parse conv. table if not already
    if (!$this->initCharset($charset)) {
        // Return an explicit string instead of an implicit NULL
        return '';
    }
    $isTwoByteSet = isset($this->twoByteSets[$charset]);
    $isEucBasedSet = isset($this->eucBasedSets[$charset]);
    $strLen = strlen($str);
    $outStr = '';
    // Traverse each char in string
    for ($a = 0; $a < $strLen; $a++) {
        $chr = substr($str, $a, 1);
        $ord = ord($chr);
        if ($isTwoByteSet) {
            // The charset has two bytes per char, big endian is assumed.
            // substr() is used so a dangling last byte of an odd-length string
            // is combined with 0 instead of reading an uninitialized offset.
            $ord2 = ord(substr($str, $a + 1, 1));
            $ord = $ord << 8 | $ord2;
            $a++;
        } elseif ($ord > 127) {
            // EUC uses two bytes above 127; get both, advance the pointer and make $ord a 16 bit int.
            // Shift-JIS: chars between 160 and 223 are single byte
            if ($isEucBasedSet && ($charset !== 'shift_jis' || ($ord < 160 || $ord > 223))) {
                $a++;
                $ord2 = ord(substr($str, $a, 1));
                $ord = $ord * 256 + $ord2;
            }
        } else {
            // Bytes up to 127 are passed through as they are
            $outStr .= $chr;
            continue;
        }
        // Use the UTF-8 sequence from the parsed conv. table if the local char number is known,
        // otherwise the "no char" replacement byte
        if (isset($this->parsedCharsets[$charset]['local'][$ord])) {
            $outStr .= $this->parsedCharsets[$charset]['local'][$ord];
        } else {
            $outStr .= chr($this->noCharByteVal);
        }
    }
    return $outStr;
}
710
/**
 * Converts $str from UTF-8 to $charset
 *
 * Bytes up to 127 are copied unchanged. Multibyte sequences are looked up in
 * the conversion table; unknown sequences become a numeric (hexadecimal)
 * entity if $useEntityForNoChar is set, otherwise chr($noCharByteVal).
 * A byte above 127 that cannot start a sequence is replaced by
 * chr($noCharByteVal) as well.
 *
 * @param string $str String in UTF-8 to convert to local charset
 * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param bool $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Output string, converted to local charset. Empty string if the charset could not be initialized.
 */
public function utf8_decode($str, $charset, $useEntityForNoChar = false)
{
    if ($charset === 'utf-8') {
        return $str;
    }
    // Parse conv. table if not already
    if (!$this->initCharset($charset)) {
        // Return an explicit string instead of an implicit NULL
        return '';
    }
    $strLen = strlen($str);
    $outStr = '';
    // Traverse each char in UTF-8 string
    for ($a = 0; $a < $strLen; $a++) {
        $chr = substr($str, $a, 1);
        $ord = ord($chr);
        if ($ord <= 127) {
            // Single byte char
            $outStr .= $chr;
            continue;
        }
        // A first byte must have the 7th bit set. Otherwise we are in the middle of a byte sequence.
        if (!($ord & 64)) {
            $outStr .= chr($this->noCharByteVal);
            continue;
        }
        // Add first byte
        $buf = $chr;
        // For each byte in multibyte string
        for ($b = 0; $b < 8; $b++) {
            // Shift it left and check the 8th bit - if that is set, then there are still bytes in sequence.
            $ord = $ord << 1;
            if (!($ord & 128)) {
                break;
            }
            $a++;
            // ... and add the next char.
            $buf .= substr($str, $a, 1);
        }
        if (isset($this->parsedCharsets[$charset]['utf8'][$buf])) {
            // The local number
            $mByte = $this->parsedCharsets[$charset]['utf8'][$buf];
            // If the local number is greater than 255 it is split (16 bit word assumed) in two chars.
            if ($mByte > 255) {
                $outStr .= chr(($mByte >> 8 & 255)) . chr(($mByte & 255));
            } else {
                $outStr .= chr($mByte);
            }
        } elseif ($useEntityForNoChar) {
            // Create num entity:
            $outStr .= '&#' . $this->utf8CharToUnumber($buf, true) . ';';
        } else {
            $outStr .= chr($this->noCharByteVal);
        }
    }
    return $outStr;
}
778
/**
 * Replaces every multibyte UTF-8 sequence by a numeric entity. The number is
 * produced by utf8CharToUnumber() in hexadecimal mode. Bytes up to 127 are
 * kept; a byte above 127 that cannot start a sequence becomes
 * chr($noCharByteVal).
 *
 * @param string $str Input string
 * @return string Output string
 */
public function utf8_to_entities($str)
{
    $result = '';
    $byteCount = strlen($str);
    $pos = 0;
    while ($pos < $byteCount) {
        $byte = substr($str, $pos, 1);
        $code = ord($byte);
        if ($code <= 127) {
            // Plain single byte char
            $result .= $byte;
        } elseif (($code & 64) === 0) {
            // Not a valid first byte, we are in the middle of a sequence
            $result .= chr($this->noCharByteVal);
        } else {
            // First byte of a sequence: collect the following bytes
            $sequence = $byte;
            for ($n = 0; $n < 8; $n++) {
                // Each further high bit of the first byte announces one more byte
                $code = $code << 1;
                if (($code & 128) === 0) {
                    break;
                }
                $pos++;
                $sequence .= substr($str, $pos, 1);
            }
            $result .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
        }
        $pos++;
    }
    return $result;
}
822
/**
 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
 *
 * The hexadecimal marker is accepted in lower and upper case (&#x1b; and
 * &#X1B;). Named entities are only converted if $alsoStdHtmlEnt is set;
 * unknown entities are left untouched.
 *
 * @param string $str Input string, UTF-8
 * @param bool $alsoStdHtmlEnt If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
 * @return string Output string
 */
public function entities_to_utf8($str, $alsoStdHtmlEnt = false)
{
    // Stays empty if named entities must not be converted, so no lookup can succeed
    $trans_tbl = array();
    if ($alsoStdHtmlEnt) {
        $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8'));
    }
    $converter = $this;
    $result = preg_replace_callback(
        '/&([#[:alnum:]]*);/',
        function (array $match) use ($converter, $trans_tbl) {
            $entity = $match[1];
            // Dec or hex entities
            if (substr($entity, 0, 1) === '#') {
                $number = substr($entity, 1);
                if (strtolower(substr($number, 0, 1)) === 'x') {
                    $codePoint = hexdec(substr($number, 1));
                } else {
                    // Explicit cast, so UnumberToChar() always compares numbers
                    $codePoint = (int)$number;
                }
                return $converter->UnumberToChar($codePoint);
            }
            // Other entities:
            if (isset($trans_tbl[$match[0]])) {
                return $trans_tbl[$match[0]];
            }
            // No conversion:
            return $match[0];
        },
        $str
    );
    // preg_replace_callback() returns NULL on a PCRE error
    return $result === null ? '' : $result;
}
863
/**
 * Splits a UTF-8 string into its characters and returns them as an array,
 * either as UNICODE integer numbers or as the UTF-8 chars themselves.
 * A byte above 127 that cannot start a sequence is reported as
 * $noCharByteVal (or the corresponding char).
 *
 * @param string $str Input string, UTF-8
 * @param bool $convEntities If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
 * @param bool $retChar If set, then instead of integer numbers the real UTF-8 char is returned.
 * @return array Output array with the char numbers
 */
public function utf8_to_numberarray($str, $convEntities = false, $retChar = false)
{
    // Resolve entities first, so they are counted as the chars they stand for
    if ($convEntities) {
        $str = $this->entities_to_utf8($str, 1);
    }
    $chars = array();
    $byteCount = strlen($str);
    $pos = 0;
    while ($pos < $byteCount) {
        $byte = substr($str, $pos, 1);
        $code = ord($byte);
        if ($code <= 127) {
            // Plain single byte char
            $chars[] = $retChar ? chr($code) : $code;
        } elseif (($code & 64) === 0) {
            // Not a valid first byte, we are in the middle of a sequence
            $chars[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
        } else {
            // First byte of a sequence: collect the following bytes
            $sequence = $byte;
            for ($n = 0; $n < 8; $n++) {
                // Each further high bit of the first byte announces one more byte
                $code = $code << 1;
                if (($code & 128) === 0) {
                    break;
                }
                $pos++;
                $sequence .= substr($str, $pos, 1);
            }
            $chars[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
        }
        $pos++;
    }
    return $chars;
}
914
/**
 * Builds the UTF-8 multibyte character for a UNICODE number.
 *
 * The bits of the number are spread across the bytes; the number of high
 * bits set in the lead byte announces the length of the sequence:
 *
 * bytes | bits | representation
 * 1 | 7 | 0vvvvvvv
 * 2 | 11 | 110vvvvv 10vvvvvv
 * 3 | 16 | 1110vvvv 10vvvvvv 10vvvvvv
 * 4 | 21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 * 5 | 26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 * 6 | 31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * Numbers that do not fit into 31 bits yield chr($noCharByteVal).
 *
 * @param int $unicodeInteger UNICODE integer
 * @return string UTF-8 multibyte character string
 * @see utf8CharToUnumber()
 */
public function UnumberToChar($unicodeInteger)
{
    if ($unicodeInteger < 128) {
        return chr($unicodeInteger);
    }
    // Each row: exclusive upper bound, marker bits of the lead byte, number of continuation bytes
    $layouts = array(
        array(2048, 192, 1),
        array(65536, 224, 2),
        array(2097152, 240, 3),
        array(67108864, 248, 4),
        array(2147483648, 252, 5)
    );
    foreach ($layouts as $layout) {
        list($upperBound, $leadMarker, $continuationBytes) = $layout;
        if ($unicodeInteger < $upperBound) {
            // Lead byte carries the topmost bits ...
            $char = chr($leadMarker | ($unicodeInteger >> (6 * $continuationBytes)));
            // ... each continuation byte the next 6 bits, most significant first
            for ($shift = 6 * ($continuationBytes - 1); $shift >= 0; $shift -= 6) {
                $char .= chr(128 | (($unicodeInteger >> $shift) & 63));
            }
            return $char;
        }
    }
    // Cannot express a 32-bit character in UTF-8
    return chr($this->noCharByteVal);
}
971
/**
 * Calculates the UNICODE number of a UTF-8 multibyte character.
 *
 * If the first byte is not the lead byte of a multibyte sequence, its byte
 * value is used as the number.
 *
 * @param string $str UTF-8 multibyte character string
 * @param bool $hex If set, then a hex. number is returned (as string prefixed with "x").
 * @return int UNICODE integer
 * @see UnumberToChar()
 */
public function utf8CharToUnumber($str, $hex = false)
{
    $leadByte = ord($str[0]);
    if (($leadByte & 192) !== 192) {
        // Not the start of a multibyte sequence
        $number = $leadByte;
    } else {
        $shifted = $leadByte;
        $payloadBits = '';
        $followingBytes = 0;
        while ($followingBytes < 8) {
            // Each further high bit of the lead byte announces one more byte
            $shifted = $shifted << 1;
            if (!($shifted & 128)) {
                break;
            }
            // Every following byte contributes its lower 6 bits
            $payloadBits .= substr('00000000' . decbin(ord(substr($str, $followingBytes + 1, 1))), -6);
            $followingBytes++;
        }
        // The lead byte contributes the bits that are left after its length marker
        $leadBits = substr('00000000' . decbin($leadByte), -(6 - $followingBytes));
        $number = bindec($leadBits . $payloadBits);
    }
    return $hex ? 'x' . dechex($number) : $number;
}
1006
1007 /********************************************
1008 *
1009 * Init functions
1010 *
1011 ********************************************/
/**
 * This will initialize a charset for use if it's defined in the 'typo3/sysext/core/Resources/Private/Charsets/csconvtbl/' folder
 * This function is automatically called by the conversion functions
 *
 * The parsed table is stored in $this->parsedCharsets[$charset] with the keys "local"
 * (byte value => UTF-8 character) and "utf8" (UTF-8 character => byte value) and is cached
 * as a serialized file below typo3temp/cs/. A cache file that cannot be unserialized into
 * an array is ignored and rebuilt from the conversion table.
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param string $charset The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return int|bool Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and loaded (from cache or by parsing).
 * @access private
 */
public function initCharset($charset)
{
    // Only process if the charset is not yet loaded:
    if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
        return 1;
    }
    // Conversion table filename:
    $charsetConvTableFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/csconvtbl/' . $charset . '.tbl';
    // Bail out if the conversion table is not found:
    if (!$charset || !GeneralUtility::validPathStr($charsetConvTableFile) || !@is_file($charsetConvTableFile)) {
        return false;
    }
    // Cache file for charsets:
    // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
    $cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/cs/charset_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $cachedTable = unserialize(GeneralUtility::getUrl($cacheFile));
        if (is_array($cachedTable)) {
            $this->parsedCharsets[$charset] = $cachedTable;
            return 2;
        }
        // Unusable cache content: fall through and rebuild it from the conversion table
    }
    // Parse conversion table into lines:
    $lines = GeneralUtility::trimExplode(LF, GeneralUtility::getUrl($charsetConvTableFile), true);
    // Initialize the internal variable holding the conv. table:
    $this->parsedCharsets[$charset] = array('local' => array(), 'utf8' => array());
    $whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
    $detectedType = '';
    foreach ($lines as $value) {
        // Comment line or blanks are ignored.
        if (!trim($value) || $value[0] === '#') {
            continue;
        }
        // Detect type if not done yet: (Done on first real line)
        // The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
        if (!$detectedType) {
            $detectedType = preg_match($whitespacedPattern, $value) ? 'whitespaced' : 'ms-token';
        }
        if ($detectedType === 'ms-token') {
            $parts = preg_split('/[=:]/', $value, 3);
            $hexbyte = $parts[0];
            $utf8 = isset($parts[1]) ? $parts[1] : '';
        } else {
            $regA = array();
            if (!preg_match($whitespacedPattern, $value, $regA)) {
                // Line does not follow the detected syntax, there is nothing to map
                continue;
            }
            $hexbyte = $regA[1];
            $utf8 = 'U+' . $regA[2];
        }
        $decval = hexdec(trim($hexbyte));
        // Only values above the 7-bit ASCII range are stored
        if ($decval > 127) {
            // Strip the leading "U+" before converting the code point
            $utf8decval = hexdec(substr(trim($utf8), 2));
            $utf8Char = $this->UnumberToChar($utf8decval);
            $this->parsedCharsets[$charset]['local'][$decval] = $utf8Char;
            $this->parsedCharsets[$charset]['utf8'][$utf8Char] = $decval;
        }
    }
    if ($cacheFile) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->parsedCharsets[$charset]));
    }
    return 2;
}
1078
/**
 * This function initializes all UTF-8 character data tables.
 *
 * If the table requested through $mode is already loaded, or a usable cache file exists for it
 * below typo3temp/cs/, that table is used. Otherwise UnicodeData.txt, SpecialCasing.txt and
 * Translit.txt are parsed, both the case folding and the ASCII transliteration table are rebuilt
 * and both cache files are written. A cache file that cannot be unserialized into an array is
 * ignored and rebuilt.
 *
 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
 *
 * @param string $mode Mode ("case", "ascii", ...)
 * @return int|bool Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
public function initUnicodeData($mode = null)
{
    // Cache files
    $cacheFileCase = GeneralUtility::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
    $cacheFileASCII = GeneralUtility::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');
    // Only process if the tables are not yet loaded
    switch ($mode) {
        case 'case':
            if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8'])) {
                return 1;
            }
            // Use cached version if possible
            if ($cacheFileCase && @is_file($cacheFileCase)) {
                $cachedTable = unserialize(GeneralUtility::getUrl($cacheFileCase));
                if (is_array($cachedTable)) {
                    $this->caseFolding['utf-8'] = $cachedTable;
                    return 2;
                }
            }
            break;
        case 'ascii':
            if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8'])) {
                return 1;
            }
            // Use cached version if possible
            if ($cacheFileASCII && @is_file($cacheFileASCII)) {
                $cachedTable = unserialize(GeneralUtility::getUrl($cacheFileASCII));
                if (is_array($cachedTable)) {
                    $this->toASCII['utf-8'] = $cachedTable;
                    return 2;
                }
            }
            break;
    }
    // Process main Unicode data file
    $unicodeDataFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/UnicodeData.txt';
    if (!(GeneralUtility::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
        return false;
    }
    $fh = fopen($unicodeDataFile, 'rb');
    if (!$fh) {
        return false;
    }
    // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
    // Note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
    $this->caseFolding['utf-8'] = array();
    // a shorthand
    $utf8CaseFolding = &$this->caseFolding['utf-8'];
    $utf8CaseFolding['toUpper'] = array();
    $utf8CaseFolding['toLower'] = array();
    $utf8CaseFolding['toTitle'] = array();
    // Array of temp. decompositions
    $decomposition = array();
    // Array of chars that are marks (eg. composing accents)
    $mark = array();
    // Array of chars that are numbers (eg. digits)
    $number = array();
    // Array of chars to be omitted (eg. Russian hard sign)
    $omit = array();
    while (!feof($fh)) {
        $line = fgets($fh, 4096);
        if ($line === false) {
            // Nothing more to read
            break;
        }
        if (trim($line) === '') {
            // Blank lines carry no character data
            continue;
        }
        // Has a lot of info
        list($char, $name, $cat, , , $decomp, , , $num, , , , $upper, $lower, $title) = explode(';', rtrim($line));
        $ord = hexdec($char);
        if ($ord > 65535) {
            // Only process the BMP
            break;
        }
        $utf8_char = $this->UnumberToChar($ord);
        if ($upper) {
            $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
        }
        if ($lower) {
            $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
        }
        // Store "title" only when different from "upper" (only a few)
        if ($title && $title !== $upper) {
            $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
        }
        switch ($cat[0]) {
            case 'M':
                // mark (accent, umlaut, ...)
                $mark['U+' . $char] = 1;
                break;
            case 'N':
                // numeric value
                if ($ord > 128 && $num !== '') {
                    $number['U+' . $char] = $num;
                }
                break;
        }
        // Accented Latin letters without "official" decomposition
        $match = array();
        if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
            $c = ord($match[2]);
            if ($match[1] === 'SMALL') {
                $c += 32;
            }
            $decomposition['U+' . $char] = array(dechex($c));
            continue;
        }
        $match = array();
        if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
            switch ($match[1]) {
                case '<circle>':
                    // add parenthesis as circle replacement, eg (1)
                    $match[2] = '0028 ' . $match[2] . ' 0029';
                    break;
                case '<square>':
                    // add square brackets as square replacement, eg [1]
                    $match[2] = '005B ' . $match[2] . ' 005D';
                    break;
                case '<compat>':
                    // ignore multi char decompositions that start with a space
                    if (preg_match('/^0020 /', $match[2])) {
                        continue 2;
                    }
                    break;
                case '<initial>':
                case '<medial>':
                case '<final>':
                case '<isolated>':
                case '<vertical>':
                    // these decomposition types are not used
                    continue 2;
            }
            $decomposition['U+' . $char] = explode(' ', $match[2]);
        }
    }
    fclose($fh);
    // Process additional Unicode data for casing (allow folded characters to expand into a sequence)
    $specialCasingFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/SpecialCasing.txt';
    if (GeneralUtility::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
        $fh = fopen($specialCasingFile, 'rb');
        if ($fh) {
            while (!feof($fh)) {
                $line = fgets($fh, 4096);
                if ($line[0] !== '#' && trim($line) !== '') {
                    list($char, $lower, $title, $upper, $cond) = GeneralUtility::trimExplode(';', $line);
                    // Only entries without a condition are used (the fifth field is empty or already the trailing comment)
                    if ($cond === '' || $cond[0] === '#') {
                        $utf8_char = $this->UnumberToChar(hexdec($char));
                        if ($char !== $lower) {
                            $arr = explode(' ', $lower);
                            for ($i = 0; isset($arr[$i]); $i++) {
                                $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
                            }
                            $utf8CaseFolding['toLower'][$utf8_char] = implode('', $arr);
                        }
                        if ($char !== $title && $title !== $upper) {
                            $arr = explode(' ', $title);
                            for ($i = 0; isset($arr[$i]); $i++) {
                                $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
                            }
                            $utf8CaseFolding['toTitle'][$utf8_char] = implode('', $arr);
                        }
                        if ($char !== $upper) {
                            $arr = explode(' ', $upper);
                            for ($i = 0; isset($arr[$i]); $i++) {
                                $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
                            }
                            $utf8CaseFolding['toUpper'][$utf8_char] = implode('', $arr);
                        }
                    }
                }
            }
            fclose($fh);
        }
    }
    // Process custom decompositions
    $customTranslitFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/Translit.txt';
    if (GeneralUtility::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
        $fh = fopen($customTranslitFile, 'rb');
        if ($fh) {
            while (!feof($fh)) {
                $line = fgets($fh, 4096);
                if ($line[0] !== '#' && trim($line) !== '') {
                    list($char, $translit) = GeneralUtility::trimExplode(';', $line);
                    // An empty transliteration marks the character as one to be omitted
                    if (!$translit) {
                        $omit['U+' . $char] = 1;
                    }
                    $decomposition['U+' . $char] = explode(' ', $translit);
                }
            }
            fclose($fh);
        }
    }
    // Decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
    foreach ($decomposition as $from => $to) {
        $code_decomp = array();
        while ($code_value = array_shift($to)) {
            // Do recursive decomposition
            if (isset($decomposition['U+' . $code_value])) {
                foreach (array_reverse($decomposition['U+' . $code_value]) as $cv) {
                    array_unshift($to, $cv);
                }
            } elseif (!isset($mark['U+' . $code_value])) {
                // keep everything that is not a mark
                array_push($code_decomp, $code_value);
            }
        }
        if (!empty($code_decomp) || isset($omit[$from])) {
            $decomposition[$from] = $code_decomp;
        } else {
            unset($decomposition[$from]);
        }
    }
    // Create ascii only mapping
    $this->toASCII['utf-8'] = array();
    $ascii = &$this->toASCII['utf-8'];
    foreach ($decomposition as $from => $to) {
        $code_decomp = array();
        while ($code_value = array_shift($to)) {
            $ord = hexdec($code_value);
            if ($ord > 127) {
                // Skip decompositions containing non-ASCII chars
                continue 2;
            }
            array_push($code_decomp, chr($ord));
        }
        $ascii[$this->UnumberToChar(hexdec($from))] = join('', $code_decomp);
    }
    // Add numeric decompositions
    foreach ($number as $from => $to) {
        $utf8_char = $this->UnumberToChar(hexdec($from));
        if (!isset($ascii[$utf8_char])) {
            $ascii[$utf8_char] = $to;
        }
    }
    if ($cacheFileCase) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
    }
    if ($cacheFileASCII) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
    }
    return 3;
}
1317
/**
 * This function initializes the folding table for a charset other than UTF-8.
 * This function is automatically called by the case folding functions.
 *
 * The table is derived from the UTF-8 case folding table: every character of the charset's
 * conversion table that has a folding entry is converted back into the charset. The plain
 * ASCII letters a-z / A-Z are always added. A cache file that cannot be unserialized into
 * an array is ignored and rebuilt.
 *
 * @param string $charset Charset for which to initialize case folding.
 * @return int|bool Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
public function initCaseFolding($charset)
{
    // Only process if the case table is not yet loaded:
    if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset])) {
        return 1;
    }
    // Use cached version if possible
    $cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/cs/cscase_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $cachedTable = unserialize(GeneralUtility::getUrl($cacheFile));
        if (is_array($cachedTable)) {
            $this->caseFolding[$charset] = $cachedTable;
            return 2;
        }
    }
    // init UTF-8 conversion for this charset
    if (!$this->initCharset($charset)) {
        return false;
    }
    // UTF-8 case folding is used as the base conversion table
    if (!$this->initUnicodeData('case')) {
        return false;
    }
    if (!isset($this->caseFolding[$charset]) || !is_array($this->caseFolding[$charset])) {
        $this->caseFolding[$charset] = array();
    }
    $nochar = chr($this->noCharByteVal);
    foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
        // Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
        $c = $this->utf8_decode($utf8, $charset);
        foreach (array('toUpper', 'toLower', 'toTitle') as $direction) {
            if (!isset($this->caseFolding['utf-8'][$direction][$utf8])) {
                // No folding defined for this character in this direction
                continue;
            }
            $cc = $this->utf8_decode($this->caseFolding['utf-8'][$direction][$utf8], $charset);
            // Only keep foldings that can be expressed in the charset
            if ($cc !== '' && $cc !== $nochar) {
                $this->caseFolding[$charset][$direction][$c] = $cc;
            }
        }
    }
    // Add the ASCII case table
    for ($i = ord('a'); $i <= ord('z'); $i++) {
        $this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i - 32);
    }
    for ($i = ord('A'); $i <= ord('Z'); $i++) {
        $this->caseFolding[$charset]['toLower'][chr($i)] = chr($i + 32);
    }
    if ($cacheFile) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->caseFolding[$charset]));
    }
    return 3;
}
1379
/**
 * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
 * This function is automatically called by the ASCII transliteration functions.
 *
 * The table is derived from the UTF-8 to-ASCII table for every character found in the
 * charset's conversion table. A cache file that cannot be unserialized into an array is
 * ignored and rebuilt.
 *
 * @param string $charset Charset for which to initialize conversion.
 * @return int|bool Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
public function initToASCII($charset)
{
    // Only process if the table is not yet loaded:
    if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) {
        return 1;
    }
    // Use cached version if possible
    $cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/cs/csascii_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $cachedTable = unserialize(GeneralUtility::getUrl($cacheFile));
        if (is_array($cachedTable)) {
            $this->toASCII[$charset] = $cachedTable;
            return 2;
        }
    }
    // Init UTF-8 conversion for this charset
    if (!$this->initCharset($charset)) {
        return false;
    }
    // UTF-8/ASCII transliteration is used as the base conversion table
    if (!$this->initUnicodeData('ascii')) {
        return false;
    }
    // Make sure an (empty) table exists even if no character can be mapped
    if (!isset($this->toASCII[$charset]) || !is_array($this->toASCII[$charset])) {
        $this->toASCII[$charset] = array();
    }
    foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
        // Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
        $c = $this->utf8_decode($utf8, $charset);
        if (isset($this->toASCII['utf-8'][$utf8])) {
            $this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
        }
    }
    if ($cacheFile) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->toASCII[$charset]));
    }
    return 3;
}
1420
1421 /********************************************
1422 *
1423 * String operation functions
1424 *
1425 ********************************************/
/**
 * Returns a part of a string.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * Depending on $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] the work is done by
 * mbstring, iconv or the internal implementations of this class.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param int $start Start position (character position)
 * @param int $len Length (in characters), NULL means "up to the end of the string"
 * @return string|bool The substring (the underlying functions may return FALSE on failure)
 * @see substr(), mb_substr()
 */
public function substr($charset, $string, $start, $len = null)
{
    if ($len === 0 || $string === '') {
        return '';
    }
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] === 'mbstring') {
        // A NULL length makes mb_substr() extract everything up to the end of the string
        return mb_substr($string, $start, $len, $charset);
    } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] === 'iconv') {
        if ($len === null) {
            // iconv_substr() needs an explicit length when a charset is given,
            // the full character count covers everything from $start to the end
            $len = iconv_strlen($string, $charset);
            if ($len === false) {
                // The string could not be decoded in the given charset
                return false;
            }
        }
        return iconv_substr($string, $start, $len, $charset);
    } elseif ($charset === 'utf-8') {
        return $this->utf8_substr($string, $start, $len);
    } elseif (!empty($this->eucBasedSets[$charset])) {
        return $this->euc_substr($string, $start, $charset, $len);
    } elseif (!empty($this->twoByteSets[$charset])) {
        // Fixed width: every character occupies two bytes
        return $len === null ? substr($string, $start * 2) : substr($string, $start * 2, $len * 2);
    } elseif (!empty($this->fourByteSets[$charset])) {
        // Fixed width: every character occupies four bytes
        return $len === null ? substr($string, $start * 4) : substr($string, $start * 4, $len * 4);
    }
    // Treat everything else as single-byte encoding
    return $len === null ? substr($string, $start) : substr($string, $start, $len);
}
1480
/**
 * Counts the number of characters.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * Depending on $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] the work is done by
 * mbstring, iconv or the internal implementations of this class.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @return int The number of characters
 * @see strlen()
 */
public function strlen($charset, $string)
{
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] === 'mbstring') {
        return mb_strlen($string, $charset);
    } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] === 'iconv') {
        return iconv_strlen($string, $charset);
    } elseif ($charset === 'utf-8') {
        return $this->utf8_strlen($string);
    } elseif (!empty($this->eucBasedSets[$charset])) {
        return $this->euc_strlen($string, $charset);
    } elseif (!empty($this->twoByteSets[$charset])) {
        // Fixed width: two bytes per character
        return strlen($string) / 2;
    } elseif (!empty($this->fourByteSets[$charset])) {
        // Fixed width: four bytes per character
        return strlen($string) / 4;
    }
    // Treat everything else as single-byte encoding
    return strlen($string);
}
1508
/**
 * Crops a string with the help of the mbstring functions.
 *
 * A positive $len keeps the first $len characters and appends $crop, a negative $len keeps
 * the last abs($len) characters and prepends $crop. The string is returned unchanged if
 * $len is 0 or if the string is not longer than abs($len) characters.
 *
 * @param string $charset The character set
 * @param string $string String to be cropped
 * @param int $len Crop length (in characters)
 * @param string $crop Crop signifier
 * @return string The shortened string
 * @see mb_strlen(), mb_substr()
 */
protected function cropMbstring($charset, $string, $len, $crop = '')
{
    // Nothing to crop
    if ((int)$len === 0) {
        return $string;
    }
    // The string already fits
    if (mb_strlen($string, $charset) <= abs($len)) {
        return $string;
    }
    return $len > 0
        ? mb_substr($string, 0, $len, $charset) . $crop
        : $crop . mb_substr($string, $len, mb_strlen($string, $charset), $charset);
}
1531
/**
 * Truncates a string and pre-/appends a string.
 * Unit tested by Kasper
 *
 * A positive $len keeps the first $len characters and appends $crop, a negative $len keeps
 * the last abs($len) characters and prepends $crop. The string is returned unchanged if
 * $len is 0 or lies outside of the string.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param int $len Length (in characters)
 * @param string $crop Crop signifier
 * @return string The shortened string
 * @see substr(), mb_strimwidth()
 */
public function crop($charset, $string, $len, $crop = '')
{
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] === 'mbstring') {
        return $this->cropMbstring($charset, $string, $len, $crop);
    }
    if ((int)$len === 0) {
        return $string;
    }
    // Find the byte position $i at which the string has to be cut
    if ($charset === 'utf-8') {
        $i = $this->utf8_char2byte_pos($string, $len);
    } elseif (!empty($this->eucBasedSets[$charset])) {
        $i = $this->euc_char2byte_pos($string, $len, $charset);
    } elseif ($len > 0) {
        // Everything else is treated as single-byte encoding
        $i = $len;
    } else {
        $i = strlen($string) + $len;
        if ($i <= 0) {
            $i = false;
        }
    }
    // $len outside actual string length
    if ($i === false) {
        return $string;
    }
    if ($len > 0) {
        // Only crop if there is something behind the cut position
        if (isset($string[$i])) {
            return substr($string, 0, $i) . $crop;
        }
    } elseif (isset($string[$i - 1])) {
        // Only crop if there is something in front of the cut position
        return $crop . substr($string, $i);
    }
    return $string;
}
1581
/**
 * Cuts a string short at a given byte length.
 *
 * For multi byte charsets the cut position is moved so that no character is split.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param int $len The byte length
 * @return string The shortened string
 * @see mb_strcut()
 */
public function strtrunc($charset, $string, $len)
{
    if ($len <= 0) {
        return '';
    }
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] === 'mbstring') {
        return mb_strcut($string, 0, $len, $charset);
    } elseif ($charset === 'utf-8') {
        return $this->utf8_strtrunc($string, $len);
    } elseif (!empty($this->eucBasedSets[$charset])) {
        return $this->euc_strtrunc($string, $len, $charset);
    } elseif (!empty($this->twoByteSets[$charset])) {
        // Realign to position dividable by two
        if ($len % 2) {
            $len--;
        }
    } elseif (!empty($this->fourByteSets[$charset])) {
        // Realign to position dividable by four
        $len -= $len % 4;
    }
    // Treat everything else as single-byte encoding
    return substr($string, 0, $len);
}
1614
/**
 * Converts the case of all characters of a string.
 * In contrast to strtolower() and strtoupper() the result does not depend on the locale.
 * The string length may change, eg. lower case German "ß" (sharp S) becomes upper case "SS".
 * Unit-tested by Kasper
 * Real case folding is language dependent, this method ignores this fact.
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert case for
 * @param string $case Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
 * @return string The converted string
 * @see strtolower(), strtoupper()
 */
public function conv_case($charset, $string, $case)
{
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] === 'mbstring') {
        return $case === 'toLower'
            ? mb_strtolower($string, $charset)
            : mb_strtoupper($string, $charset);
    }
    if ($charset === 'utf-8') {
        return $this->utf8_char_mapping($string, 'case', $case);
    }
    if (isset($this->eucBasedSets[$charset])) {
        return $this->euc_char_mapping($string, $charset, 'case', $case);
    }
    // Treat everything else as single-byte encoding
    return $this->sb_char_mapping($string, $charset, 'case', $case);
}
1647
/**
 * Charset aware counterpart of lcfirst/ucfirst: converts the case of the
 * first character only and leaves the rest of the string untouched.
 *
 * @param string $charset The character set
 * @param string $string Input string
 * @param string $case Case keyword as accepted by conv_case()
 * @return string The string with its first character converted
 * @see \TYPO3\CMS\Core\Charset\CharsetConverter::conv_case()
 */
public function convCaseFirst($charset, $string, $case)
{
    $convertedHead = $this->conv_case($charset, $this->substr($charset, $string, 0, 1), $case);
    return $convertedHead . $this->substr($charset, $string, 1);
}
1664
/**
 * Transliterates special chars (like æøåÆØÅ, umlauts etc) to their ASCII equivalents
 * (usually double-bytes, like æ => ae etc.)
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert
 * @return string The converted string
 */
public function specCharsToASCII($charset, $string)
{
    if ($charset === 'utf-8') {
        return $this->utf8_char_mapping($string, 'ascii');
    }
    if (isset($this->eucBasedSets[$charset])) {
        return $this->euc_char_mapping($string, $charset, 'ascii');
    }
    // Treat everything else as single-byte encoding
    return $this->sb_char_mapping($string, $charset, 'ascii');
}
1684
/**
 * Picks the TYPO3 language code that matches the language preferences sent
 * by the client (usually HTTP_ACCEPT_LANGUAGE) best.
 *
 * The entries are evaluated by descending quality ("q") value. For every entry the full code
 * is tried first, then the code without its country part.
 *
 * @param string $languageCodesList List of language codes. something like 'de,en-us;q=0.9,de-de;q=0.7,es-cl;q=0.6,en;q=0.4,es;q=0.3,zh;q=0.1'
 * @return string A preferred language that TYPO3 supports, or "default" if none found
 */
public function getPreferredClientLanguage($languageCodesList)
{
    $knownLanguageCodes = $this->getAllLanguageCodes();
    // Collect the quality per language code, entries without "q" count as 1.0
    $qualityByCode = array();
    foreach (GeneralUtility::trimExplode(',', $languageCodesList) as $entry) {
        if (strpos($entry, ';q=') === false) {
            $qualityByCode[$entry] = 1.0;
        } else {
            $entryParts = explode(';q=', $entry);
            $qualityByCode[$entryParts[0]] = $entryParts[1];
        }
    }
    // Highest quality first
    arsort($qualityByCode, SORT_NUMERIC);
    $match = 'default';
    foreach ($qualityByCode as $code => $unusedQuality) {
        $codeParts = explode('-', $code);
        // Try the full code first, then the code stripped from its country part
        foreach (array($code, $codeParts[0]) as $candidate) {
            if (isset($knownLanguageCodes[$candidate])) {
                $match = $knownLanguageCodes[$candidate];
                break 2;
            }
        }
    }
    // English is the TYPO3 default language
    return !$match || $match === 'en' ? 'default' : $match;
}
1725
/**
 * Builds a lookup table "ISO language code => TYPO3 language code" from the charset
 * list and the ISO mapping of the locales, currently only used for getPreferredClientLanguage()
 *
 * @return array TYPO3 language codes, indexed by the ISO code the client may send
 */
protected function getAllLanguageCodes()
{
    // Start with the languages where the TYPO3 code is the same as the ISO code
    $typo3Codes = array_keys($this->charSetArray);
    $isoCodeByTypo3Code = array_combine($typo3Codes, $typo3Codes);
    // Languages where the TYPO3 code differs from the ISO code or needs the country part:
    // the ISO code replaces the default entry of the TYPO3 language
    /** @var Locales $locales */
    $locales = GeneralUtility::makeInstance(Locales::class);
    foreach ($locales->getIsoMapping() as $typo3Code => $isoCode) {
        // Locales use "_" as separator, the client sends "-"
        $isoCodeByTypo3Code[$typo3Code] = str_replace('_', '-', $isoCode);
    }
    // Move the ISO codes to the keys (because the keys are compared with "isset" later on)
    return array_flip($isoCodeByTypo3Code);
}
1747
1748 /********************************************
1749 *
1750 * Internal string operation functions
1751 *
1752 ********************************************/
/**
 * Maps all characters of a string in a single byte charset.
 * Every byte that has an entry in the selected table is replaced by that entry,
 * all other bytes are copied unchanged.
 *
 * @param string $str The string
 * @param string $charset The charset
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string
 */
public function sb_char_mapping($str, $charset, $mode, $opt = '')
{
    // Select the mapping table; without a usable table the string is returned as is
    switch ($mode) {
        case 'case':
            if (!$this->initCaseFolding($charset)) {
                return $str;
            }
            $table = &$this->caseFolding[$charset][$opt];
            break;
        case 'ascii':
            if (!$this->initToASCII($charset)) {
                return $str;
            }
            $table = &$this->toASCII[$charset];
            break;
        default:
            return $str;
    }
    $mapped = '';
    $pos = 0;
    while (isset($str[$pos])) {
        $byte = $str[$pos];
        $mapped .= isset($table[$byte]) ? $table[$byte] : $byte;
        $pos++;
    }
    return $mapped;
}
1793
1794 /********************************************
1795 *
1796 * Internal UTF-8 string operation functions
1797 *
1798 ********************************************/
/**
 * Returns a part of a UTF-8 string.
 * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
 *
 * @param string $str UTF-8 string
 * @param int $start Start position (character position)
 * @param int $len Length (in characters)
 * @return string|bool The substring, FALSE if a positive $start lies outside of the string
 * @see substr()
 */
public function utf8_substr($str, $start, $len = null)
{
    if ((string)$len === '0') {
        return '';
    }
    $byteStart = $this->utf8_char2byte_pos($str, $start);
    // A positive $start outside of the string cannot be resolved
    if ($byteStart === false && $start > 0) {
        return false;
    }
    $remainder = substr($str, $byteStart);
    // No length given: everything from the start position on
    if ($len == null) {
        return $remainder;
    }
    $byteEnd = $this->utf8_char2byte_pos($remainder, $len);
    if ($byteEnd !== false) {
        return substr($remainder, 0, $byteEnd);
    }
    // $len outside actual string length: a negative length that exceeds
    // the string leaves nothing, a positive one keeps the whole remainder
    return $len < 0 ? '' : $remainder;
}
1835
/**
 * Counts the number of characters of a string in UTF-8.
 * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
 *
 * @param string $str UTF-8 multibyte character string
 * @return int The number of characters
 * @see strlen()
 */
public function utf8_strlen($str)
{
    $characters = 0;
    $offset = 0;
    while (isset($str[$offset])) {
        // Single bytes (0xxxxxxx) and multi-byte starting bytes (11xxxxxx) begin
        // a character, only continuation bytes (10xxxxxx) do not
        if ((ord($str[$offset]) & 192) !== 128) {
            $characters++;
        }
        $offset++;
    }
    return $characters;
}
1859
/**
 * Truncates a string in UTF-8 short at a given byte length.
 * A multi byte character that does not fit completely into the byte length is dropped.
 *
 * @param string $str UTF-8 multibyte character string
 * @param int $len The byte length
 * @return string The shortened string
 * @see mb_strcut()
 */
public function utf8_strtrunc($str, $len)
{
    if ($len <= 0) {
        return '';
    }
    // The whole string fits, there is nothing to cut (and no byte at $len - 1 to inspect)
    if ($len >= strlen($str)) {
        return $str;
    }
    $i = $len - 1;
    // The last byte to keep is part of a multibyte sequence
    if (ord($str[$i]) & 128) {
        // Walk back over continuation bytes (10xxxxxx) to the byte that starts the sequence
        while ($i > 0 && (ord($str[$i]) & 192) === 128) {
            $i--;
        }
        $leadByte = ord($str[$i]);
        if (($leadByte & 192) === 192) {
            // The number of leading 1-bits is the number of bytes of the sequence
            for ($bc = 0; $leadByte & 128; $leadByte = $leadByte << 1) {
                $bc++;
            }
            // The sequence reaches beyond the byte length: cut in front of it
            if ($bc + $i > $len) {
                return substr($str, 0, $i);
            }
        }
    }
    return substr($str, 0, $len);
}
1889
/**
 * Finds the character position of the first occurrence of a string, both arguments are in UTF-8.
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 string to search for
 * @param int $offset Position to start the search
 * @return int|bool The character position, FALSE if the needle was not found or the offset lies beyond the string
 * @see strpos()
 */
public function utf8_strpos($haystack, $needle, $offset = 0)
{
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] === 'mbstring') {
        return mb_strpos($haystack, $needle, $offset, 'utf-8');
    }
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] === 'iconv') {
        return iconv_strpos($haystack, $needle, $offset, 'utf-8');
    }
    // Search on byte level and translate the result back into a character position
    $byteOffset = $this->utf8_char2byte_pos($haystack, $offset);
    // FALSE: offset beyond string length
    $bytePosition = $byteOffset === false ? false : strpos($haystack, $needle, $byteOffset);
    // FALSE: offset beyond string length or needle not found
    return $bytePosition === false ? false : $this->utf8_byte2char_pos($haystack, $bytePosition);
}
1918
/**
 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
 *
 * Delegates to mb_strrpos() or iconv_strrpos() when
 * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] is 'mbstring' or
 * 'iconv'; otherwise strrpos() is used and the byte position is translated
 * into a character position.
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 character to search for (single character)
 * @return int|bool The character position, FALSE if the needle was not found
 * @see strrpos()
 */
public function utf8_strrpos($haystack, $needle)
{
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] === 'mbstring') {
        // The third argument of mb_strrpos() is the offset; the encoding is the
        // fourth. Passing the encoding as third argument is deprecated and no
        // longer accepted by current PHP versions.
        return mb_strrpos($haystack, $needle, 0, 'utf-8');
    } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] === 'iconv') {
        return iconv_strrpos($haystack, $needle, 'utf-8');
    }
    $byte_pos = strrpos($haystack, $needle);
    if ($byte_pos === false) {
        // Needle not found
        return false;
    }
    return $this->utf8_byte2char_pos($haystack, $byte_pos);
}
1941
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * A non-negative $pos is counted from the start of the string, a negative
 * $pos from its end (-1 is the last character). Bytes that are not
 * continuation bytes (10xxxxxx) are counted as character starts.
 *
 * @param string $str UTF-8 string
 * @param int $pos Character position (negative values start from the end)
 * @return int|bool Byte position, FALSE if the position lies outside the string
 */
public function utf8_char2byte_pos($str, $pos)
{
    // Number of characters found
    $n = 0;
    // Number of characters wanted
    $p = abs($pos);
    if ($pos >= 0) {
        for ($i = 0; isset($str[$i]) && $n < $p; $i++) {
            if ((ord($str[$i]) & 192) !== 128) {
                $n++;
            }
        }
        if (!isset($str[$i])) {
            // Offset beyond string length
            return false;
        }
        // Skip trailing multi-byte data bytes; the isset() guard prevents reading
        // past the end when the string ends with continuation bytes.
        while (isset($str[$i]) && (ord($str[$i]) & 192) === 128) {
            $i++;
        }
        return $i;
    }
    // Scan backwards. The index is bounded explicitly: negative string offsets
    // are valid in current PHP versions, so isset() alone would wrap around to
    // the end of the string instead of stopping at its start.
    for ($i = strlen($str) - 1; $i >= 0 && $n < $p; $i--) {
        if ((ord($str[$i]) & 192) !== 128) {
            $n++;
        }
    }
    if ($n < $p) {
        // Fewer characters available than requested
        return false;
    }
    // $i was decremented once more after the wanted start byte was counted
    return $i + 1;
}
1988
/**
 * Translates an 'absolute' byte position into a character position.
 *
 * Counts the character start bytes in the range 1..$pos. Byte 0 is taken to
 * start character 0, so for a $pos that points at the start of a character
 * this count equals that character's index. A $pos equal to the byte length
 * yields the total number of characters.
 *
 * @param string $str UTF-8 string
 * @param int $pos Byte position
 * @return int|bool Character position, FALSE for an empty string or a position outside the string
 */
public function utf8_byte2char_pos($str, $pos)
{
    $byteLength = strlen($str);
    // Validate the requested position itself before scanning, so out-of-range
    // positions are rejected instead of reading uninitialised string offsets.
    if ($byteLength === 0 || $pos < 0 || $pos > $byteLength) {
        return false;
    }
    // Number of characters
    $n = 0;
    for ($i = (int)$pos; $i > 0; $i--) {
        if (!isset($str[$i])) {
            // Only reached for $pos === byte length: the end of the string counts
            // as the start of the (virtual) next character.
            $n++;
            continue;
        }
        // Single bytes (0xxxxxxx) and lead bytes (11xxxxxx) start a character
        if ((ord($str[$i]) & 192) !== 128) {
            $n++;
        }
    }
    return $n;
}
2017
/**
 * Maps all characters of an UTF-8 string.
 *
 * The string is returned unchanged when initUnicodeData($mode) reports
 * failure or when $mode is neither 'case' nor 'ascii'. Characters without an
 * entry in the selected mapping table are copied as they are.
 *
 * @param string $str UTF-8 string
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string
 */
public function utf8_char_mapping($str, $mode, $opt = '')
{
    if (!$this->initUnicodeData($mode)) {
        // Do nothing
        return $str;
    }
    switch ($mode) {
        case 'case':
            $map = &$this->caseFolding['utf-8'][$opt];
            break;
        case 'ascii':
            $map = &$this->toASCII['utf-8'];
            break;
        default:
            return $str;
    }
    $out = '';
    for ($i = 0; isset($str[$i]); $i++) {
        $c = ord($str[$i]);
        if (($c & 192) === 192) {
            // Multi-byte starting byte (11xxxxxx): the number of leading 1 bits
            // is the number of bytes of the sequence
            for ($bc = 0; $c & 128; $c = $c << 1) {
                $bc++;
            }
            $mbc = substr($str, $i, $bc);
            $i += $bc - 1;
        } else {
            // Single-byte (0xxxxxxx), or a stray continuation byte (10xxxxxx) of
            // malformed input. The latter is passed through on its own; without
            // this branch the previously read character would be emitted again.
            $mbc = $str[$i];
        }
        $out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
    }
    return $out;
}
2065
2066 /********************************************
2067 *
2068 * Internal EUC string operation functions
2069 *
2070 * Extended Unix Code:
2071 * ASCII compatible 7bit single bytes chars
2072 * 8bit two byte chars
2073 *
2074 * Shift-JIS is treated as a special case.
2075 *
2076 ********************************************/
/**
 * Cuts a string in the EUC charset family short at a given byte length.
 *
 * The string is scanned from the start so that a double-byte character is
 * never split: if the cut would fall between the two bytes of a character,
 * the string is cut in front of that character. Shift-JIS uses its own
 * lead-byte ranges.
 *
 * @param string $str EUC multibyte character string
 * @param int $len The byte length
 * @param string $charset The charset
 * @return string The shortened string
 * @see mb_strcut()
 */
public function euc_strtrunc($str, $len, $charset)
{
    if (!isset($str[0])) {
        // Nothing to truncate
        return $str;
    }
    $shiftJis = $charset === 'shift_jis';
    for ($i = 0; isset($str[$i]) && $i < $len; $i++) {
        $c = ord($str[$i]);
        $isLeadByte = $shiftJis
            ? (($c >= 128 && $c < 160) || $c >= 224)
            : $c >= 128;
        if ($isLeadByte) {
            // Skip the second byte of a double-byte character
            $i++;
        }
    }
    // This must be checked before the end-of-string test: when the last
    // character of the string is double-byte and $len falls between its two
    // bytes, the scan position is past the end of the string AND past $len.
    if ($i > $len) {
        // We ended on a first byte
        return substr($str, 0, $len - 1);
    }
    if (!isset($str[$i])) {
        // String not longer than supplied length
        return $str;
    }
    return substr($str, 0, $len);
}
2112
/**
 * Returns a part of a string in the EUC charset family.
 *
 * Both $start and $len are character counts; they are translated to byte
 * positions with euc_char2byte_pos(). A $len of null (or 0, as the check is
 * a loose comparison) returns everything from $start to the end.
 *
 * @param string $str EUC multibyte character string
 * @param int $start Start position (character position)
 * @param string $charset The charset
 * @param int $len Length (in characters)
 * @return string|bool The substring, FALSE if $start lies outside the string
 */
public function euc_substr($str, $start, $charset, $len = null)
{
    $byte_start = $this->euc_char2byte_pos($str, $start, $charset);
    if ($byte_start === false) {
        // $start outside string length
        return false;
    }
    $str = substr($str, $byte_start);
    if ($len == null) {
        return $str;
    }
    $byte_end = $this->euc_char2byte_pos($str, $len, $charset);
    if ($byte_end === false) {
        // $len outside actual string length: a positive length keeps the whole
        // remainder, a negative length that removes more characters than are
        // left yields an empty string (same rule as utf8_substr()).
        return $len < 0 ? '' : $str;
    }
    return substr($str, 0, $byte_end);
}
2142
/**
 * Returns the number of characters of a string in the EUC charset family.
 *
 * A lead byte together with the byte following it counts as one character,
 * every other byte counts as one character on its own. For 'shift_jis' the
 * lead-byte ranges are 128-159 and 224-255, for all other charsets any byte
 * >= 128 is a lead byte.
 *
 * @param string $str EUC multibyte character string
 * @param string $charset The charset
 * @return int The number of characters
 * @see strlen()
 */
public function euc_strlen($str, $charset)
{
    $isShiftJis = $charset === 'shift_jis';
    $characters = 0;
    $offset = 0;
    while (isset($str[$offset])) {
        $byte = ord($str[$offset]);
        if ($isShiftJis) {
            $isLeadByte = ($byte >= 128 && $byte < 160) || $byte >= 224;
        } else {
            $isLeadByte = $byte >= 128;
        }
        // Double-byte characters consume two bytes, everything else one
        $offset += $isLeadByte ? 2 : 1;
        $characters++;
    }
    return $characters;
}
2170
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * The string is always scanned from its start, for negative positions too:
 * only lead bytes can be recognised reliably, and in Shift-JIS the second
 * byte of a character may look like a single-byte character, so a backward
 * scan cannot find character boundaries.
 *
 * @param string $str EUC multibyte character string
 * @param int $pos Character position (negative values start from the end)
 * @param string $charset The charset
 * @return int|bool Byte position, FALSE if the position lies outside the string
 */
public function euc_char2byte_pos($str, $pos, $charset)
{
    $sjis = $charset === 'shift_jis';
    if ($pos >= 0) {
        // Number of characters seen
        $n = 0;
        for ($i = 0; isset($str[$i]) && $n < $pos; $i++) {
            $c = ord($str[$i]);
            if ($sjis ? (($c >= 128 && $c < 160) || $c >= 224) : $c >= 128) {
                // Skip the second byte of a double-byte character
                $i++;
            }
            $n++;
        }
        // Offset beyond string length yields FALSE
        return isset($str[$i]) ? $i : false;
    }
    // Negative position: record the byte offset of every character start, then
    // pick the requested one counted from the end.
    $starts = array();
    for ($i = 0; isset($str[$i]); $i++) {
        $starts[] = $i;
        $c = ord($str[$i]);
        if ($sjis ? (($c >= 128 && $c < 160) || $c >= 224) : $c >= 128) {
            $i++;
        }
    }
    $index = count($starts) + (int)$pos;
    // More characters requested than the string contains
    return $index >= 0 ? $starts[$index] : false;
}
2216
/**
 * Applies a mapping table to every character of a string in the EUC charset family.
 *
 * For mode 'case' the table is $this->caseFolding[$charset][$opt], for mode
 * 'ascii' it is $this->toASCII[$charset]. The string is returned unchanged
 * if the respective init method reports failure or if $mode is unknown.
 * Characters without a table entry are copied as they are.
 *
 * @param string $str EUC multibyte character string
 * @param string $charset The charset
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string
 */
public function euc_char_mapping($str, $charset, $mode, $opt = '')
{
    switch ($mode) {
        case 'case':
            if (!$this->initCaseFolding($charset)) {
                // No table available, leave the string alone
                return $str;
            }
            $map = &$this->caseFolding[$charset][$opt];
            break;
        case 'ascii':
            if (!$this->initToASCII($charset)) {
                // No table available, leave the string alone
                return $str;
            }
            $map = &$this->toASCII[$charset];
            break;
        default:
            return $str;
    }
    $isShiftJis = $charset === 'shift_jis';
    $result = '';
    $offset = 0;
    while (isset($str[$offset])) {
        $byte = ord($str[$offset]);
        if ($isShiftJis) {
            $isLeadByte = ($byte >= 128 && $byte < 160) || $byte >= 224;
        } else {
            $isLeadByte = $byte >= 128;
        }
        // A lead byte and its successor form one double-byte character
        $width = $isLeadByte ? 2 : 1;
        $character = substr($str, $offset, $width);
        $offset += $width;
        $result .= isset($map[$character]) ? $map[$character] : $character;
    }
    return $result;
}
2272 }