[TASK] Use constants for checking CharsetConverter
[Packages/TYPO3.CMS.git] / typo3 / sysext / core / Classes / Charset / CharsetConverter.php
1 <?php
2 namespace TYPO3\CMS\Core\Charset;
3
4 /*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use TYPO3\CMS\Core\Localization\Locales;
18 use TYPO3\CMS\Core\SingletonInterface;
19 use TYPO3\CMS\Core\Utility\ExtensionManagementUtility;
20 use TYPO3\CMS\Core\Utility\GeneralUtility;
21
22 /**
23 * Notes on UTF-8
24 *
25 * Functions working on UTF-8 strings:
26 *
27 * - strchr/strstr
28 * - strrchr
29 * - substr_count
30 * - implode/explode/join
31 *
32 * Functions nearly working on UTF-8 strings:
33 *
34 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
35 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
36 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
37 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
38 * - preg_*: Support compiled into PHP by default nowadays, but could be unavailable, need to use modifier
39 *
40 * Functions NOT working on UTF-8 strings:
41 *
42 * - str*cmp
43 * - stristr
44 * - stripos
45 * - substr
46 * - strrev
47 * - split/spliti
48 * - ...
49 */
50
51 /**
52 * Class for conversion between charsets
53 */
54 class CharsetConverter implements SingletonInterface
55 {
56
57 /**
58 * Possible strategies for handling multi-byte data
59 * Only used for internal purpose
60 * @internal
61 */
62 const STRATEGY_MBSTRING = 'mbstring';
63 const STRATEGY_ICONV = 'iconv';
64 const STRATEGY_FALLBACK = 'fallback';
65
66 /**
67 * ASCII Value for chars with no equivalent.
68 *
69 * @var int
70 */
71 public $noCharByteVal = 63;
72
73 /**
74 * This is the array where parsed conversion tables are stored (cached)
75 *
76 * @var array
77 */
78 public $parsedCharsets = array();
79
80 /**
81 * An array where case folding data will be stored (cached)
82 *
83 * @var array
84 */
85 public $caseFolding = array();
86
87 /**
88 * An array where charset-to-ASCII mappings are stored (cached)
89 *
90 * @var array
91 */
92 public $toASCII = array();
93
94 /**
95 * This tells the converter which charsets have two bytes per char:
96 *
97 * @var array
98 */
99 public $twoByteSets = array(
100 'ucs-2' => 1
101 );
102
103 /**
104 * This tells the converter which charsets have four bytes per char:
105 *
106 * @var array
107 */
108 public $fourByteSets = array(
109 'ucs-4' => 1, // 4-byte Unicode
110 'utf-32' => 1
111 );
112
113 /**
114 * This tells the converter which charsets use a scheme like the Extended Unix Code:
115 *
116 * @var array
117 */
118 public $eucBasedSets = array(
119 'gb2312' => 1, // Chinese, simplified.
120 'big5' => 1, // Chinese, traditional.
121 'euc-kr' => 1, // Korean
122 'shift_jis' => 1
123 );
124
125 /**
126 * @link http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
127 * @link http://czyborra.com/charsets/iso8859.html
128 *
129 * @var array
130 */
131 public $synonyms = array(
132 'us' => 'ascii',
133 'us-ascii' => 'ascii',
134 'cp819' => 'iso-8859-1',
135 'ibm819' => 'iso-8859-1',
136 'iso-ir-100' => 'iso-8859-1',
137 'iso-ir-101' => 'iso-8859-2',
138 'iso-ir-109' => 'iso-8859-3',
139 'iso-ir-110' => 'iso-8859-4',
140 'iso-ir-144' => 'iso-8859-5',
141 'iso-ir-127' => 'iso-8859-6',
142 'iso-ir-126' => 'iso-8859-7',
143 'iso-ir-138' => 'iso-8859-8',
144 'iso-ir-148' => 'iso-8859-9',
145 'iso-ir-157' => 'iso-8859-10',
146 'iso-ir-179' => 'iso-8859-13',
147 'iso-ir-199' => 'iso-8859-14',
148 'iso-ir-203' => 'iso-8859-15',
149 'csisolatin1' => 'iso-8859-1',
150 'csisolatin2' => 'iso-8859-2',
151 'csisolatin3' => 'iso-8859-3',
152 'csisolatin5' => 'iso-8859-9',
153 'csisolatin8' => 'iso-8859-14',
154 'csisolatin9' => 'iso-8859-15',
155 'csisolatingreek' => 'iso-8859-7',
156 'iso-celtic' => 'iso-8859-14',
157 'latin1' => 'iso-8859-1',
158 'latin2' => 'iso-8859-2',
159 'latin3' => 'iso-8859-3',
160 'latin5' => 'iso-8859-9',
161 'latin6' => 'iso-8859-10',
162 'latin8' => 'iso-8859-14',
163 'latin9' => 'iso-8859-15',
164 'l1' => 'iso-8859-1',
165 'l2' => 'iso-8859-2',
166 'l3' => 'iso-8859-3',
167 'l5' => 'iso-8859-9',
168 'l6' => 'iso-8859-10',
169 'l8' => 'iso-8859-14',
170 'l9' => 'iso-8859-15',
171 'cyrillic' => 'iso-8859-5',
172 'arabic' => 'iso-8859-6',
173 'tis-620' => 'iso-8859-11',
174 'win874' => 'windows-874',
175 'win1250' => 'windows-1250',
176 'win1251' => 'windows-1251',
177 'win1252' => 'windows-1252',
178 'win1253' => 'windows-1253',
179 'win1254' => 'windows-1254',
180 'win1255' => 'windows-1255',
181 'win1256' => 'windows-1256',
182 'win1257' => 'windows-1257',
183 'win1258' => 'windows-1258',
184 'cp1250' => 'windows-1250',
185 'cp1251' => 'windows-1251',
186 'cp1252' => 'windows-1252',
187 'ms-ee' => 'windows-1250',
188 'ms-ansi' => 'windows-1252',
189 'ms-greek' => 'windows-1253',
190 'ms-turk' => 'windows-1254',
191 'winbaltrim' => 'windows-1257',
192 'koi-8ru' => 'koi-8r',
193 'koi8r' => 'koi-8r',
194 'cp878' => 'koi-8r',
195 'mac' => 'macroman',
196 'macintosh' => 'macroman',
197 'euc-cn' => 'gb2312',
198 'x-euc-cn' => 'gb2312',
199 'euccn' => 'gb2312',
200 'cp936' => 'gb2312',
201 'big-5' => 'big5',
202 'cp950' => 'big5',
203 'eucjp' => 'euc-jp',
204 'sjis' => 'shift_jis',
205 'shift-jis' => 'shift_jis',
206 'cp932' => 'shift_jis',
207 'cp949' => 'euc-kr',
208 'utf7' => 'utf-7',
209 'utf8' => 'utf-8',
210 'utf16' => 'utf-16',
211 'utf32' => 'utf-32',
212 'ucs2' => 'ucs-2',
213 'ucs4' => 'ucs-4'
214 );
215
/**
 * Mapping of language identifiers to script names.
 *
 * Three kinds of keys are listed: iso-639-1 language codes, MS language codes
 * and English language names. All keys are lowercase.
 *
 * The misspelled keys 'svedish' and 'that' are kept for backward compatibility;
 * the correctly spelled 'swedish' and 'thai' are listed next to them.
 *
 * @var array
 */
public $lang_to_script = array(
    // iso-639-1 language codes, see http://www.loc.gov/standards/iso639-2/php/code_list.php
    'af' => 'west_european', // Afrikaans
    'ar' => 'arabic',
    'bg' => 'cyrillic', // Bulgarian
    'bs' => 'east_european', // Bosnian
    'cs' => 'east_european', // Czech
    'da' => 'west_european', // Danish
    'de' => 'west_european', // German
    'es' => 'west_european', // Spanish
    'et' => 'estonian',
    'eo' => 'unicode', // Esperanto
    'eu' => 'west_european', // Basque
    'fa' => 'arabic', // Persian
    'fi' => 'west_european', // Finnish
    'fo' => 'west_european', // Faroese
    'fr' => 'west_european', // French
    'ga' => 'west_european', // Irish
    'gl' => 'west_european', // Galician
    'gr' => 'greek',
    'he' => 'hebrew', // Hebrew (since 1998)
    'hi' => 'unicode', // Hindi
    'hr' => 'east_european', // Croatian
    'hu' => 'east_european', // Hungarian
    'iw' => 'hebrew', // Hebrew (til 1998)
    'is' => 'west_european', // Icelandic
    'it' => 'west_european', // Italian
    'ja' => 'japanese',
    'ka' => 'unicode', // Georgian
    'kl' => 'west_european', // Greenlandic
    'km' => 'unicode', // Khmer
    'ko' => 'korean',
    'lt' => 'lithuanian',
    'lv' => 'west_european', // Latvian/Lettish
    'nl' => 'west_european', // Dutch
    'no' => 'west_european', // Norwegian
    'nb' => 'west_european', // Norwegian Bokmal
    'nn' => 'west_european', // Norwegian Nynorsk
    'pl' => 'east_european', // Polish
    'pt' => 'west_european', // Portuguese
    'ro' => 'east_european', // Romanian
    'ru' => 'cyrillic', // Russian
    'sk' => 'east_european', // Slovak
    'sl' => 'east_european', // Slovenian
    'sr' => 'cyrillic', // Serbian
    'sv' => 'west_european', // Swedish
    'sq' => 'albanian', // Albanian
    'th' => 'thai',
    'uk' => 'cyrillic', // Ukrainian
    'vi' => 'vietnamese',
    'zh' => 'chinese',

    // MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
    // http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
    'afk' => 'west_european', // Afrikaans
    'ara' => 'arabic',
    'bgr' => 'cyrillic', // Bulgarian
    'cat' => 'west_european', // Catalan
    'chs' => 'simpl_chinese',
    'cht' => 'trad_chinese',
    'csy' => 'east_european', // Czech
    'dan' => 'west_european', // Danish
    'deu' => 'west_european', // German
    'dea' => 'west_european', // German (Austrian)
    'des' => 'west_european', // German (Swiss)
    'ena' => 'west_european', // English (Australian)
    'enc' => 'west_european', // English (Canadian)
    'eng' => 'west_european', // English
    'enz' => 'west_european', // English (New Zealand)
    'enu' => 'west_european', // English (United States)
    'euq' => 'west_european', // Basque
    'fos' => 'west_european', // Faroese
    'far' => 'arabic', // Persian
    'fin' => 'west_european', // Finnish
    'fra' => 'west_european', // French
    'frb' => 'west_european', // French (Belgian)
    'frc' => 'west_european', // French (Canadian)
    'frs' => 'west_european', // French (Swiss)
    'geo' => 'unicode', // Georgian
    'glg' => 'west_european', // Galician
    'ell' => 'greek',
    'heb' => 'hebrew',
    'hin' => 'unicode', // Hindi
    'hun' => 'east_european', // Hungarian
    'isl' => 'west_european', // Icelandic
    'ita' => 'west_european', // Italian
    'its' => 'west_european', // Italian (Swiss)
    'jpn' => 'japanese',
    'khm' => 'unicode', // Khmer
    'kor' => 'korean',
    'lth' => 'lithuanian',
    'lvi' => 'west_european', // Latvian/Lettish
    'msl' => 'west_european', // Malay
    'nlb' => 'west_european', // Dutch (Belgian)
    'nld' => 'west_european', // Dutch
    'nor' => 'west_european', // Norwegian (bokmal)
    'non' => 'west_european', // Norwegian (nynorsk)
    'plk' => 'east_european', // Polish
    'ptg' => 'west_european', // Portuguese
    'ptb' => 'west_european', // Portuguese (Brazil)
    'rom' => 'east_european', // Romanian
    'rus' => 'cyrillic', // Russian
    'slv' => 'east_european', // Slovenian
    'sky' => 'east_european', // Slovak
    'srl' => 'east_european', // Serbian (Latin)
    'srb' => 'cyrillic', // Serbian (Cyrillic)
    'esp' => 'west_european', // Spanish (trad. sort)
    'esm' => 'west_european', // Spanish (Mexican)
    'esn' => 'west_european', // Spanish (internat. sort)
    'sve' => 'west_european', // Swedish
    'sqi' => 'albanian', // Albanian
    'tha' => 'thai',
    'trk' => 'turkish',
    'ukr' => 'cyrillic', // Ukrainian

    // English language names
    'afrikaans' => 'west_european',
    'albanian' => 'albanian',
    'arabic' => 'arabic',
    'basque' => 'west_european',
    'bosnian' => 'east_european',
    // Same script as the 'bg' and 'bgr' codes above
    'bulgarian' => 'cyrillic',
    'catalan' => 'west_european',
    'croatian' => 'east_european',
    'czech' => 'east_european',
    'danish' => 'west_european',
    'dutch' => 'west_european',
    'english' => 'west_european',
    'esperanto' => 'unicode',
    'estonian' => 'estonian',
    'faroese' => 'west_european',
    'farsi' => 'arabic',
    'finnish' => 'west_european',
    'french' => 'west_european',
    'galician' => 'west_european',
    'georgian' => 'unicode',
    'german' => 'west_european',
    'greek' => 'greek',
    'greenlandic' => 'west_european',
    'hebrew' => 'hebrew',
    'hindi' => 'unicode',
    'hungarian' => 'east_european',
    'icelandic' => 'west_european',
    'italian' => 'west_european',
    'khmer' => 'unicode',
    'latvian' => 'west_european',
    'lettish' => 'west_european',
    'lithuanian' => 'lithuanian',
    'malay' => 'west_european',
    'norwegian' => 'west_european',
    'persian' => 'arabic',
    'polish' => 'east_european',
    'portuguese' => 'west_european',
    'russian' => 'cyrillic',
    'romanian' => 'east_european',
    'serbian' => 'cyrillic',
    'slovak' => 'east_european',
    'slovenian' => 'east_european',
    'spanish' => 'west_european',
    'swedish' => 'west_european',
    // Misspelling of 'swedish', kept for backward compatibility
    'svedish' => 'west_european',
    'thai' => 'thai',
    // Misspelling of 'thai', kept for backward compatibility
    'that' => 'thai',
    'turkish' => 'turkish',
    'ukrainian' => 'cyrillic'
);
385
386 /**
387 * Mapping of language (family) names to charsets on Unix
388 *
389 * @var array
390 */
391 public $script_to_charset_unix = array(
392 'west_european' => 'iso-8859-1',
393 'estonian' => 'iso-8859-1',
394 'east_european' => 'iso-8859-2',
395 'baltic' => 'iso-8859-4',
396 'cyrillic' => 'iso-8859-5',
397 'arabic' => 'iso-8859-6',
398 'greek' => 'iso-8859-7',
399 'hebrew' => 'iso-8859-8',
400 'turkish' => 'iso-8859-9',
401 'thai' => 'iso-8859-11', // = TIS-620
402 'lithuanian' => 'iso-8859-13',
403 'chinese' => 'gb2312', // = euc-cn
404 'japanese' => 'euc-jp',
405 'korean' => 'euc-kr',
406 'simpl_chinese' => 'gb2312',
407 'trad_chinese' => 'big5',
408 'vietnamese' => '',
409 'unicode' => 'utf-8',
410 'albanian' => 'utf-8'
411 );
412
413 /**
414 * Mapping of language (family) names to charsets on Windows
415 *
416 * @var array
417 */
418 public $script_to_charset_windows = array(
419 'east_european' => 'windows-1250',
420 'cyrillic' => 'windows-1251',
421 'west_european' => 'windows-1252',
422 'greek' => 'windows-1253',
423 'turkish' => 'windows-1254',
424 'hebrew' => 'windows-1255',
425 'arabic' => 'windows-1256',
426 'baltic' => 'windows-1257',
427 'estonian' => 'windows-1257',
428 'lithuanian' => 'windows-1257',
429 'vietnamese' => 'windows-1258',
430 'thai' => 'cp874',
431 'korean' => 'cp949',
432 'chinese' => 'gb2312',
433 'japanese' => 'shift_jis',
434 'simpl_chinese' => 'gb2312',
435 'trad_chinese' => 'big5',
436 'albanian' => 'windows-1250',
437 'unicode' => 'utf-8'
438 );
439
440 /**
441 * Mapping of locale names to charsets
442 *
443 * @var array
444 */
445 public $locale_to_charset = array(
446 'japanese.euc' => 'euc-jp',
447 'ja_jp.ujis' => 'euc-jp',
448 'korean.euc' => 'euc-kr',
449 'sr@Latn' => 'iso-8859-2',
450 'zh_cn' => 'gb2312',
451 'zh_hk' => 'big5',
452 'zh_tw' => 'big5'
453 );
454
455 /**
456 * TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
457 * Empty values means "utf-8"
458 *
459 * @var array
460 */
461 public $charSetArray = array(
462 'af' => '',
463 'ar' => 'iso-8859-6',
464 'ba' => 'iso-8859-2',
465 'bg' => 'windows-1251',
466 'br' => '',
467 'ca' => 'iso-8859-15',
468 'ch' => 'gb2312',
469 'cs' => 'windows-1250',
470 'cz' => 'windows-1250',
471 'da' => '',
472 'de' => '',
473 'dk' => '',
474 'el' => 'iso-8859-7',
475 'eo' => 'utf-8',
476 'es' => '',
477 'et' => 'iso-8859-4',
478 'eu' => '',
479 'fa' => 'utf-8',
480 'fi' => '',
481 'fo' => 'utf-8',
482 'fr' => '',
483 'fr_CA' => '',
484 'ga' => '',
485 'ge' => 'utf-8',
486 'gl' => '',
487 'gr' => 'iso-8859-7',
488 'he' => 'utf-8',
489 'hi' => 'utf-8',
490 'hk' => 'big5',
491 'hr' => 'windows-1250',
492 'hu' => 'iso-8859-2',
493 'is' => 'utf-8',
494 'it' => '',
495 'ja' => 'shift_jis',
496 'jp' => 'shift_jis',
497 'ka' => 'utf-8',
498 'kl' => 'utf-8',
499 'km' => 'utf-8',
500 'ko' => 'euc-kr',
501 'kr' => 'euc-kr',
502 'lt' => 'windows-1257',
503 'lv' => 'utf-8',
504 'ms' => '',
505 'my' => '',
506 'nl' => '',
507 'no' => '',
508 'pl' => 'iso-8859-2',
509 'pt' => '',
510 'pt_BR' => '',
511 'qc' => '',
512 'ro' => 'iso-8859-2',
513 'ru' => 'windows-1251',
514 'se' => '',
515 'si' => 'windows-1250',
516 'sk' => 'windows-1250',
517 'sl' => 'windows-1250',
518 'sq' => 'utf-8',
519 'sr' => 'utf-8',
520 'sv' => '',
521 'th' => 'iso-8859-11',
522 'tr' => 'iso-8859-9',
523 'ua' => 'windows-1251',
524 'uk' => 'windows-1251',
525 'vi' => 'utf-8',
526 'vn' => 'utf-8',
527 'zh' => 'big5'
528 );
529
/**
 * Normalizes a charset name.
 *
 * The name is lowercased, surrounding whitespace is removed and, if the result
 * is a key of $this->synonyms, it is replaced by the mapped charset name.
 *
 * @param string $charset Input charset
 * @return string Normalized charset
 */
public function parse_charset($charset)
{
    $normalized = trim(strtolower($charset));
    return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
}
544
/**
 * Get the charset of a locale.
 *
 * Accepted locale formats:
 *
 * ln language
 * ln_CN language / country
 * ln_CN.cs language / country / charset
 * ln_CN.cs@mod language / country / charset / modifier
 *
 * Resolution order:
 * 1. exact match of the lowercased locale in $this->locale_to_charset
 *    (keys are compared case-insensitively),
 * 2. the charset part of the locale, normalized by parse_charset(),
 * 3. the modifier "euro", which yields iso-8859-15,
 * 4. the script of the language, mapped through the Windows or Unix table
 *    depending on the TYPO3_OS constant; unknown languages or empty table
 *    entries yield 'windows-1252' (Windows) or 'utf-8' (otherwise).
 *
 * @param string $locale Locale string
 * @return string Charset resolved for locale string
 */
public function get_locale_charset($locale)
{
    $locale = strtolower($locale);
    // Exact locale specific charset? The locale has just been lowercased, so the
    // table keys are lowercased too - otherwise a key like 'sr@Latn' never matches.
    $exactMatches = array_change_key_case($this->locale_to_charset, CASE_LOWER);
    if (isset($exactMatches[$locale])) {
        return $exactMatches[$locale];
    }
    // Get modifier (array_pad() supplies an empty value when the separator is missing)
    list($locale, $modifier) = array_pad(explode('@', $locale), 2, '');
    // Locale contains charset: use it
    list($locale, $charset) = array_pad(explode('.', $locale), 2, '');
    if ($charset) {
        return $this->parse_charset($charset);
    }
    // Modifier is 'euro' (after charset check, because of xx.utf-8@euro)
    if ($modifier === 'euro') {
        return 'iso-8859-15';
    }
    // Get language
    list($language) = explode('_', $locale);
    // An unknown language gets an empty script name, which matches no table entry below
    $script = isset($this->lang_to_script[$language]) ? $this->lang_to_script[$language] : '';
    if (TYPO3_OS === 'WIN') {
        return !empty($this->script_to_charset_windows[$script])
            ? $this->script_to_charset_windows[$script]
            : 'windows-1252';
    }
    return !empty($this->script_to_charset_unix[$script])
        ? $this->script_to_charset_unix[$script]
        : 'utf-8';
}
586
587 /********************************************
588 *
589 * Charset Conversion functions
590 *
591 ********************************************/
/**
 * Convert from one charset to another charset.
 *
 * If the target charset is 'utf-8' or no entity fallback is requested, the
 * conversion library configured in
 * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'] (mbstring or iconv)
 * is tried first. If no library is configured, or the library call returns
 * FALSE, the string is converted with utf8_encode() / utf8_decode(), going
 * through UTF-8 as intermediate charset.
 *
 * @param string $inputString Input string
 * @param string $fromCharset From charset (the current charset of the string)
 * @param string $toCharset To charset (the output charset wanted)
 * @param bool $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Converted string
 * @see convArray()
 */
public function conv($inputString, $fromCharset, $toCharset, $useEntityForNoChar = false)
{
    if ($fromCharset === $toCharset) {
        return $inputString;
    }
    // PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
    if ($toCharset === 'utf-8' || !$useEntityForNoChar) {
        // An unset configuration value selects none of the library strategies
        $strategy = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'])
            ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']
            : '';
        switch ($strategy) {
            case self::STRATEGY_MBSTRING:
                $convertedString = mb_convert_encoding($inputString, $toCharset, $fromCharset);
                // FALSE means the conversion failed: fall through to the table based conversion
                if (false !== $convertedString) {
                    return $convertedString;
                }
                break;
            case self::STRATEGY_ICONV:
                $convertedString = iconv($fromCharset, $toCharset . '//TRANSLIT', $inputString);
                if (false !== $convertedString) {
                    return $convertedString;
                }
                break;
        }
    }
    if ($fromCharset !== 'utf-8') {
        $inputString = $this->utf8_encode($inputString, $fromCharset);
    }
    if ($toCharset !== 'utf-8') {
        $inputString = $this->utf8_decode($inputString, $toCharset, $useEntityForNoChar);
    }
    return $inputString;
}
633
/**
 * Converts every string found in an array from one charset to another.
 * Nested arrays are processed recursively; values that are neither arrays nor
 * strings are left untouched.
 * NOTICE: The array is passed by reference and modified in place!
 *
 * @param array $array Input array, possibly multidimensional
 * @param string $fromCharset From charset (the current charset of the string)
 * @param string $toCharset To charset (the output charset wanted)
 * @param bool $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return void
 * @see conv()
 */
public function convArray(&$array, $fromCharset, $toCharset, $useEntityForNoChar = false)
{
    foreach ($array as $index => $unused) {
        if (is_array($array[$index])) {
            // Recurse into the nested array (modified in place)
            $this->convArray($array[$index], $fromCharset, $toCharset, $useEntityForNoChar);
            continue;
        }
        if (is_string($array[$index])) {
            $array[$index] = $this->conv($array[$index], $fromCharset, $toCharset, $useEntityForNoChar);
        }
    }
}
655
/**
 * Converts $str from $charset to UTF-8
 *
 * Characters are looked up in the 'local' part of the parsed conversion table
 * of the charset. Characters without an entry are replaced by the byte
 * $this->noCharByteVal.
 *
 * @param string $str String in local charset to convert to UTF-8
 * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return string|null Output string, converted to UTF-8. NULL if initCharset() does not succeed for $charset.
 */
public function utf8_encode($str, $charset)
{
    if ($charset === 'utf-8') {
        return $str;
    }
    // Parse conv. table if not already
    if (!$this->initCharset($charset)) {
        return null;
    }
    $isTwoByteSet = isset($this->twoByteSets[$charset]);
    $isEucBasedSet = isset($this->eucBasedSets[$charset]);
    $strLen = strlen($str);
    $outStr = '';
    // Traverse each char in string
    for ($a = 0; $a < $strLen; $a++) {
        $chr = substr($str, $a, 1);
        $ord = ord($chr);
        if ($isTwoByteSet) {
            // The charset has two bytes per char, big endian is assumed.
            // The second byte is read with substr() so that a truncated trailing
            // char counts as 0 instead of accessing an uninitialized string offset.
            $a++;
            $ord = $ord << 8 | ord(substr($str, $a, 1));
        } elseif ($ord > 127) {
            // EUC uses two bytes above 127; get both, advance the pointer and make $ord a 16bit int.
            // Shift-JIS: chars between 160 and 223 are single byte
            if ($isEucBasedSet && ($charset !== 'shift_jis' || $ord < 160 || $ord > 223)) {
                $a++;
                $ord = $ord * 256 + ord(substr($str, $a, 1));
            }
        } else {
            // 7-bit chars are passed through unchanged
            $outStr .= $chr;
            continue;
        }
        // Use the UTF-8 sequence from the parsed conv. table, otherwise the "no char" byte
        if (isset($this->parsedCharsets[$charset]['local'][$ord])) {
            $outStr .= $this->parsedCharsets[$charset]['local'][$ord];
        } else {
            $outStr .= chr($this->noCharByteVal);
        }
    }
    return $outStr;
}
714
/**
 * Converts $str from UTF-8 to $charset
 *
 * Multibyte sequences are looked up in the 'utf8' part of the parsed conversion
 * table of the charset. Sequences without an entry become a hexadecimal numeric
 * entity if $useEntityForNoChar is set, otherwise the byte $this->noCharByteVal.
 *
 * @param string $str String in UTF-8 to convert to local charset
 * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param bool $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string|null Output string, converted to local charset. NULL if initCharset() does not succeed for $charset.
 */
public function utf8_decode($str, $charset, $useEntityForNoChar = false)
{
    if ($charset === 'utf-8') {
        return $str;
    }
    // Parse conv. table if not already
    if (!$this->initCharset($charset)) {
        return null;
    }
    $strLen = strlen($str);
    $outStr = '';
    // Traverse each char in UTF-8 string
    for ($a = 0; $a < $strLen; $a++) {
        $chr = substr($str, $a, 1);
        $ord = ord($chr);
        if ($ord <= 127) {
            // Single byte char
            $outStr .= $chr;
            continue;
        }
        // The first byte of a sequence must have the 7th bit set. Otherwise we are in the middle of a byte sequence.
        if (!($ord & 64)) {
            $outStr .= chr($this->noCharByteVal);
            continue;
        }
        // Add first byte
        $buf = $chr;
        // Each further high bit set in the first byte announces one more byte in the sequence
        for ($b = 0; $b < 8; $b++) {
            $ord = $ord << 1;
            if (!($ord & 128)) {
                break;
            }
            $a++;
            $buf .= substr($str, $a, 1);
        }
        if (isset($this->parsedCharsets[$charset]['utf8'][$buf])) {
            // The local number
            $mByte = $this->parsedCharsets[$charset]['utf8'][$buf];
            // If the local number is greater than 255 it is split (16bit word assumed) in two chars.
            if ($mByte > 255) {
                $outStr .= chr($mByte >> 8 & 255) . chr($mByte & 255);
            } else {
                $outStr .= chr($mByte);
            }
        } elseif ($useEntityForNoChar) {
            // Create num entity (hexadecimal notation)
            $outStr .= '&#' . $this->utf8CharToUnumber($buf, true) . ';';
        } else {
            $outStr .= chr($this->noCharByteVal);
        }
    }
    return $outStr;
}
782
/**
 * Replaces every multibyte UTF-8 sequence by a hexadecimal numeric entity.
 * Bytes up to 127 are copied unchanged; a byte above 127 that cannot start a
 * sequence is replaced by the byte $this->noCharByteVal.
 *
 * @param string $str Input string
 * @return string Output string
 */
public function utf8_to_entities($str)
{
    $result = '';
    $length = strlen($str);
    $pos = 0;
    while ($pos < $length) {
        $byte = substr($str, $pos, 1);
        $code = ord($byte);
        $pos++;
        if ($code <= 127) {
            $result .= $byte;
        } elseif ($code & 64) {
            // Lead byte: collect one following byte per additional high bit set
            $sequence = $byte;
            for ($n = 0; $n < 8; $n++) {
                $code = $code << 1;
                if (!($code & 128)) {
                    break;
                }
                $sequence .= substr($str, $pos, 1);
                $pos++;
            }
            $result .= '&#' . $this->utf8CharToUnumber($sequence, true) . ';';
        } else {
            // Byte from the middle of a sequence
            $result .= chr($this->noCharByteVal);
        }
    }
    return $result;
}
826
/**
 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
 *
 * Numeric entities are only converted if they are well-formed: decimal digits
 * after "#", or "x"/"X" followed by hexadecimal digits. Anything else that looks
 * like an entity (for example "&#;" or "&#foo;") is returned unchanged.
 *
 * @param string $str Input string, UTF-8
 * @param bool $alsoStdHtmlEnt If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
 * @return string Output string
 */
public function entities_to_utf8($str, $alsoStdHtmlEnt = false)
{
    $namedEntities = $alsoStdHtmlEnt
        ? array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8'))
        : array();
    // Split on "&...;": even indexes hold the text in between, odd indexes the
    // captured entity name (the part between "&" and ";").
    $parts = preg_split('/&([#[:alnum:]]*);/', $str, -1, PREG_SPLIT_DELIM_CAPTURE);
    foreach ($parts as $k => $name) {
        // Only take every second element
        if ($k % 2 === 0) {
            continue;
        }
        $entity = '&' . $name . ';';
        if (preg_match('/^#[0-9]+$/D', $name)) {
            // Decimal entity
            $parts[$k] = $this->UnumberToChar((int)substr($name, 1));
        } elseif (preg_match('/^#x[0-9a-f]+$/iD', $name)) {
            // Hexadecimal entity
            $parts[$k] = $this->UnumberToChar(hexdec(substr($name, 2)));
        } elseif (isset($namedEntities[$entity])) {
            // Named entity ($namedEntities is empty unless $alsoStdHtmlEnt is set)
            $parts[$k] = $namedEntities[$entity];
        } else {
            // No conversion:
            $parts[$k] = $entity;
        }
    }
    return implode('', $parts);
}
867
/**
 * Splits a UTF-8 string into its characters and returns them as an array,
 * either as integer numbers or as the UTF-8 chars themselves.
 * A byte above 127 that cannot start a sequence is represented by
 * $this->noCharByteVal (or the corresponding char).
 *
 * @param string $str Input string, UTF-8
 * @param bool $convEntities If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
 * @param bool $retChar If set, then instead of integer numbers the real UTF-8 char is returned.
 * @return array Output array with the char numbers
 */
public function utf8_to_numberarray($str, $convEntities = false, $retChar = false)
{
    // Resolve entities first, if requested
    if ($convEntities) {
        $str = $this->entities_to_utf8($str, true);
    }
    $numbers = array();
    $length = strlen($str);
    $pos = 0;
    while ($pos < $length) {
        $byte = substr($str, $pos, 1);
        $code = ord($byte);
        $pos++;
        if ($code <= 127) {
            // Single byte char
            $numbers[] = $retChar ? chr($code) : $code;
            continue;
        }
        if (!($code & 64)) {
            // Byte from the middle of a sequence
            $numbers[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
            continue;
        }
        // Lead byte: collect one following byte per additional high bit set
        $sequence = $byte;
        for ($n = 0; $n < 8; $n++) {
            $code = $code << 1;
            if (!($code & 128)) {
                break;
            }
            $sequence .= substr($str, $pos, 1);
            $pos++;
        }
        $numbers[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
    }
    return $numbers;
}
918
/**
 * Converts a UNICODE number to a UTF-8 multibyte character
 * Algorithm based on script found at From: http://czyborra.com/utf/
 *
 * The bits of the number are spread across the bytes; the number of high bits
 * set in the lead byte announces the number of bytes in the sequence:
 *
 * bytes | bits | representation
 * 1 | 7 | 0vvvvvvv
 * 2 | 11 | 110vvvvv 10vvvvvv
 * 3 | 16 | 1110vvvv 10vvvvvv 10vvvvvv
 * 4 | 21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 * 5 | 26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 * 6 | 31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * Numbers that need more than 31 bits yield the byte $this->noCharByteVal.
 *
 * @param int $unicodeInteger UNICODE integer
 * @return string UTF-8 multibyte character string
 * @see utf8CharToUnumber()
 */
public function UnumberToChar($unicodeInteger)
{
    if ($unicodeInteger < 128) {
        return chr($unicodeInteger);
    }
    // Pick the lead byte marker and the number of continuation bytes
    if ($unicodeInteger < 2048) {
        $leadMarker = 192;
        $continuationBytes = 1;
    } elseif ($unicodeInteger < 65536) {
        $leadMarker = 224;
        $continuationBytes = 2;
    } elseif ($unicodeInteger < 2097152) {
        $leadMarker = 240;
        $continuationBytes = 3;
    } elseif ($unicodeInteger < 67108864) {
        $leadMarker = 248;
        $continuationBytes = 4;
    } elseif ($unicodeInteger < 2147483648) {
        $leadMarker = 252;
        $continuationBytes = 5;
    } else {
        // Cannot express a 32-bit character in UTF-8
        return chr($this->noCharByteVal);
    }
    // Lead byte: marker plus the bits above all continuation bytes
    $char = chr($leadMarker | $unicodeInteger >> (6 * $continuationBytes));
    // Continuation bytes: 10vvvvvv, six bits each, most significant first
    for ($shift = 6 * ($continuationBytes - 1); $shift >= 0; $shift -= 6) {
        $char .= chr(128 | ($unicodeInteger >> $shift) & 63);
    }
    return $char;
}
975
/**
 * Converts a UTF-8 Multibyte character to a UNICODE number
 *
 * Only the first character of $str is evaluated. If the first byte does not
 * start a multibyte sequence, its byte value is returned. An empty string
 * yields 0.
 *
 * @param string $str UTF-8 multibyte character string
 * @param bool $hex If set, then a hex. number is returned.
 * @return int|string UNICODE integer, or the string "x" followed by the hexadecimal number if $hex is set
 * @see UnumberToChar()
 */
public function utf8CharToUnumber($str, $hex = false)
{
    // First char; read with substr() so that an empty string gives 0 instead of
    // accessing an uninitialized string offset.
    $firstByte = ord(substr($str, 0, 1));
    // This verifies that it IS a multi byte string
    if (($firstByte & 192) === 192) {
        $ord = $firstByte;
        $binBuf = '';
        // Each further high bit set in the first byte announces one more byte in the sequence
        for ($b = 0; $b < 8; $b++) {
            $ord = $ord << 1;
            if (!($ord & 128)) {
                break;
            }
            // Append the six payload bits of the following byte
            $binBuf .= substr('00000000' . decbin(ord(substr($str, $b + 1, 1))), -6);
        }
        // Prepend the payload bits of the first byte (6 - $b of them)
        $binBuf = substr('00000000' . decbin($firstByte), -(6 - $b)) . $binBuf;
        $int = bindec($binBuf);
    } else {
        $int = $firstByte;
    }
    return $hex ? 'x' . dechex($int) : $int;
}
1010
1011 /********************************************
1012 *
1013 * Init functions
1014 *
1015 ********************************************/
/**
 * Loads the conversion table of a charset into $this->parsedCharsets.
 *
 * The table is read from '[charset].tbl' in the 'typo3/sysext/core/Resources/Private/Charsets/csconvtbl/' folder.
 * A parsed table is cached in 'typo3temp/cs/charset_[charset].tbl'. A cache file which does not
 * contain a serialized array is ignored and the table is parsed (and cached) again.
 * This function is automatically called by the conversion functions
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param string $charset The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return int|bool Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and loaded (from cache or by parsing).
 * @access private
 */
public function initCharset($charset)
{
    // Only process if the charset is not yet loaded:
    if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
        return 1;
    }
    // Conversion table filename:
    $charsetConvTableFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/csconvtbl/' . $charset . '.tbl';
    if (!$charset || !GeneralUtility::validPathStr($charsetConvTableFile) || !@is_file($charsetConvTableFile)) {
        return false;
    }
    // Cache file for charsets:
    // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
    $cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/cs/charset_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        // NOTE(review): unserialize() on a file below typo3temp/ - safe only as long as that directory is not writable by untrusted parties
        $cachedTable = unserialize(GeneralUtility::getUrl($cacheFile));
        if (is_array($cachedTable)) {
            $this->parsedCharsets[$charset] = $cachedTable;
            return 2;
        }
        // Unusable cache content: fall through and rebuild the table
    }
    // Parse conversion table into lines:
    $lines = GeneralUtility::trimExplode(LF, GeneralUtility::getUrl($charsetConvTableFile), true);
    // Initialize the internal variable holding the conv. table:
    $this->parsedCharsets[$charset] = array('local' => array(), 'utf8' => array());
    // The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
    $whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
    $detectedType = '';
    foreach ($lines as $value) {
        // Comment line or blanks are ignored.
        if (!trim($value) || $value[0] === '#') {
            continue;
        }
        // Detect type if not done yet: (Done on first real line)
        if (!$detectedType) {
            $detectedType = preg_match($whitespacedPattern, $value) ? 'whitespaced' : 'ms-token';
        }
        if ($detectedType === 'ms-token') {
            list($hexbyte, $utf8) = array_pad(preg_split('/[=:]/', $value, 3), 2, '');
        } else {
            $regA = array();
            if (!preg_match($whitespacedPattern, $value, $regA)) {
                // Line does not follow the detected syntax, nothing to map
                continue;
            }
            $hexbyte = $regA[1];
            $utf8 = 'U+' . $regA[2];
        }
        $decval = hexdec(trim($hexbyte));
        // Only values above 7-bit ASCII need a mapping
        if ($decval > 127) {
            // Strip the "U+" prefix before converting the code point
            $utf8decval = hexdec(substr(trim($utf8), 2));
            $utf8Character = $this->UnumberToChar($utf8decval);
            $this->parsedCharsets[$charset]['local'][$decval] = $utf8Character;
            $this->parsedCharsets[$charset]['utf8'][$utf8Character] = $decval;
        }
    }
    if ($cacheFile) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->parsedCharsets[$charset]));
    }
    return 2;
}
1082
/**
 * This function initializes all UTF-8 character data tables.
 *
 * If the table requested by $mode is neither loaded nor available as a (valid) cache file in
 * typo3temp/cs/, both the case folding table and the ASCII transliteration table are built from
 * UnicodeData.txt, SpecialCasing.txt and Translit.txt (folder Resources/Private/Charsets/unidata/
 * of the core extension) and written to their cache files.
 *
 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
 *
 * @param string $mode Mode ("case", "ascii", ...)
 * @return int|bool Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
public function initUnicodeData($mode = null)
{
    // Cache files
    $cacheFileCase = GeneralUtility::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
    $cacheFileASCII = GeneralUtility::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');
    // Only process if the tables are not yet loaded
    switch ($mode) {
        case 'case':
            if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8'])) {
                return 1;
            }
            // Use cached version if possible; unusable cache content is ignored and rebuilt below
            if ($cacheFileCase && @is_file($cacheFileCase)) {
                $cachedTable = unserialize(GeneralUtility::getUrl($cacheFileCase));
                if (is_array($cachedTable)) {
                    $this->caseFolding['utf-8'] = $cachedTable;
                    return 2;
                }
            }
            break;
        case 'ascii':
            if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8'])) {
                return 1;
            }
            // Use cached version if possible; unusable cache content is ignored and rebuilt below
            if ($cacheFileASCII && @is_file($cacheFileASCII)) {
                $cachedTable = unserialize(GeneralUtility::getUrl($cacheFileASCII));
                if (is_array($cachedTable)) {
                    $this->toASCII['utf-8'] = $cachedTable;
                    return 2;
                }
            }
            break;
    }
    // Process main Unicode data file
    $unicodeDataFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/UnicodeData.txt';
    if (!(GeneralUtility::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
        return false;
    }
    $fh = fopen($unicodeDataFile, 'rb');
    if (!$fh) {
        return false;
    }
    // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
    // Note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
    $this->caseFolding['utf-8'] = array();
    // a shorthand
    $utf8CaseFolding = &$this->caseFolding['utf-8'];
    $utf8CaseFolding['toUpper'] = array();
    $utf8CaseFolding['toLower'] = array();
    $utf8CaseFolding['toTitle'] = array();
    // Array of temp. decompositions
    $decomposition = array();
    // Array of chars that are marks (eg. composing accents)
    $mark = array();
    // Array of chars that are numbers (eg. digits)
    $number = array();
    // Array of chars to be omitted (eg. Russian hard sign)
    $omit = array();
    // fgets() returns FALSE at the end of the file, which ends the loop without processing a bogus line
    while (($line = fgets($fh, 4096)) !== false) {
        $line = rtrim($line);
        if ($line === '') {
            continue;
        }
        // Has a lot of info; padded so that short lines do not leave variables undefined
        list($char, $name, $cat, , , $decomp, , , $num, , , , $upper, $lower, $title) = array_pad(explode(';', $line), 15, '');
        $ord = hexdec($char);
        if ($ord > 65535) {
            // Only process the BMP
            break;
        }
        $utf8_char = $this->UnumberToChar($ord);
        if ($upper) {
            $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
        }
        if ($lower) {
            $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
        }
        // Store "title" only when different from "upper" (only a few)
        if ($title && $title !== $upper) {
            $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
        }
        // The first letter of the general category tells the major class
        switch (isset($cat[0]) ? $cat[0] : '') {
            case 'M':
                // mark (accent, umlaut, ...)
                $mark['U+' . $char] = 1;
                break;
            case 'N':
                // numeric value
                if ($ord > 128 && $num !== '') {
                    $number['U+' . $char] = $num;
                }
                break;
        }
        // Accented Latin letters without "official" decomposition
        $match = array();
        if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
            $c = ord($match[2]);
            if ($match[1] === 'SMALL') {
                // Lowercase ASCII letters are 32 positions after their uppercase counterparts
                $c += 32;
            }
            $decomposition['U+' . $char] = array(dechex($c));
            continue;
        }
        $match = array();
        if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
            switch ($match[1]) {
                case '<circle>':
                    // add parenthesis as circle replacement, eg (1)
                    $match[2] = '0028 ' . $match[2] . ' 0029';
                    break;
                case '<square>':
                    // add square brackets as square replacement, eg [1]
                    $match[2] = '005B ' . $match[2] . ' 005D';
                    break;
                case '<compat>':
                    // ignore multi char decompositions that start with a space
                    if (preg_match('/^0020 /', $match[2])) {
                        // Proceed with the next line of the file
                        continue 2;
                    }
                    break;
                case '<initial>':
                case '<medial>':
                case '<final>':
                case '<isolated>':
                case '<vertical>':
                    // These decomposition types are not used; proceed with the next line of the file
                    continue 2;
            }
            $decomposition['U+' . $char] = explode(' ', $match[2]);
        }
    }
    fclose($fh);
    // Process additional Unicode data for casing (allow folded characters to expand into a sequence)
    $specialCasingFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/SpecialCasing.txt';
    if (GeneralUtility::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
        $fh = fopen($specialCasingFile, 'rb');
        if ($fh) {
            while (($line = fgets($fh, 4096)) !== false) {
                // Comment lines and blank lines are ignored
                if (trim($line) === '' || $line[0] === '#') {
                    continue;
                }
                list($char, $lower, $title, $upper, $cond) = array_pad(GeneralUtility::trimExplode(';', $line), 5, null);
                // Only mappings whose fifth field is empty or a comment are used, i.e. no condition is given
                if ($cond === null || !($cond === '' || $cond[0] === '#')) {
                    continue;
                }
                $utf8_char = $this->UnumberToChar(hexdec($char));
                if ($char !== $lower) {
                    $utf8CaseFolding['toLower'][$utf8_char] = $this->codePointSequenceToUtf8($lower);
                }
                if ($char !== $title && $title !== $upper) {
                    $utf8CaseFolding['toTitle'][$utf8_char] = $this->codePointSequenceToUtf8($title);
                }
                if ($char !== $upper) {
                    $utf8CaseFolding['toUpper'][$utf8_char] = $this->codePointSequenceToUtf8($upper);
                }
            }
            fclose($fh);
        }
    }
    // Process custom decompositions
    $customTranslitFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/Translit.txt';
    if (GeneralUtility::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
        $fh = fopen($customTranslitFile, 'rb');
        if ($fh) {
            while (($line = fgets($fh, 4096)) !== false) {
                // Comment lines and blank lines are ignored
                if (trim($line) === '' || $line[0] === '#') {
                    continue;
                }
                list($char, $translit) = array_pad(GeneralUtility::trimExplode(';', $line), 2, '');
                if (!$translit) {
                    // No replacement given: the character is dropped on transliteration
                    $omit['U+' . $char] = 1;
                }
                $decomposition['U+' . $char] = explode(' ', $translit);
            }
            fclose($fh);
        }
    }
    // Decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
    foreach ($decomposition as $from => $to) {
        $code_decomp = array();
        // The loop ends at the first empty code value (as produced for omitted characters)
        while ($code_value = array_shift($to)) {
            if (isset($decomposition['U+' . $code_value])) {
                // Do recursive decomposition: put the replacement in front of the remaining values
                foreach (array_reverse($decomposition['U+' . $code_value]) as $cv) {
                    array_unshift($to, $cv);
                }
            } elseif (!isset($mark['U+' . $code_value])) {
                // Keep everything which is not a mark
                $code_decomp[] = $code_value;
            }
        }
        if (!empty($code_decomp) || isset($omit[$from])) {
            $decomposition[$from] = $code_decomp;
        } else {
            unset($decomposition[$from]);
        }
    }
    // Create ascii only mapping
    $this->toASCII['utf-8'] = array();
    $ascii = &$this->toASCII['utf-8'];
    foreach ($decomposition as $from => $to) {
        $code_decomp = array();
        while ($code_value = array_shift($to)) {
            $ord = hexdec($code_value);
            if ($ord > 127) {
                // Skip decompositions containing non-ASCII chars
                continue 2;
            }
            $code_decomp[] = chr($ord);
        }
        // Keys have the form "U+XXXX"; strip the prefix before converting the hex number
        $ascii[$this->UnumberToChar(hexdec(substr($from, 2)))] = implode('', $code_decomp);
    }
    // Add numeric decompositions
    foreach ($number as $from => $to) {
        $utf8_char = $this->UnumberToChar(hexdec(substr($from, 2)));
        if (!isset($ascii[$utf8_char])) {
            $ascii[$utf8_char] = $to;
        }
    }
    if ($cacheFileCase) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
    }
    if ($cacheFileASCII) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
    }
    return 3;
}

/**
 * Converts a space separated list of hexadecimal code points into the
 * concatenated UTF-8 representation of these code points.
 *
 * @param string $sequence Hexadecimal code points separated by single spaces, eg. "0053 0073"
 * @return string The UTF-8 encoded characters
 */
protected function codePointSequenceToUtf8($sequence)
{
    $characters = '';
    foreach (explode(' ', $sequence) as $codePoint) {
        $characters .= $this->UnumberToChar(hexdec($codePoint));
    }
    return $characters;
}
1321
/**
 * This function initializes the folding table for a charset other than UTF-8.
 * This function is automatically called by the case folding functions.
 *
 * The table is derived from the UTF-8 case folding table: every character of the charset's
 * conversion table which has a UTF-8 case mapping gets that mapping converted back to the charset.
 * Characters without a mapping are skipped. The ASCII letters are added afterwards.
 * A cache file which does not contain a serialized array is ignored and rebuilt.
 *
 * @param string $charset Charset for which to initialize case folding.
 * @return int|bool Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
public function initCaseFolding($charset)
{
    // Only process if the case table is not yet loaded:
    if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset])) {
        return 1;
    }
    // Use cached version if possible
    $cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/cs/cscase_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $cachedTable = unserialize(GeneralUtility::getUrl($cacheFile));
        if (is_array($cachedTable)) {
            $this->caseFolding[$charset] = $cachedTable;
            return 2;
        }
    }
    // init UTF-8 conversion for this charset
    if (!$this->initCharset($charset)) {
        return false;
    }
    // UTF-8 case folding is used as the base conversion table
    if (!$this->initUnicodeData('case')) {
        return false;
    }
    $nochar = chr($this->noCharByteVal);
    $foldingKinds = array('toUpper', 'toLower', 'toTitle');
    foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
        // Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
        $c = $this->utf8_decode($utf8, $charset);
        foreach ($foldingKinds as $kind) {
            // Most characters have no mapping for a given kind
            if (!isset($this->caseFolding['utf-8'][$kind][$utf8])) {
                continue;
            }
            $cc = $this->utf8_decode($this->caseFolding['utf-8'][$kind][$utf8], $charset);
            // Ignore mappings which cannot be expressed in the charset
            if ($cc !== '' && $cc !== $nochar) {
                $this->caseFolding[$charset][$kind][$c] = $cc;
            }
        }
    }
    // Add the ASCII case table; upper and lower case letters are 32 positions apart
    for ($i = ord('a'); $i <= ord('z'); $i++) {
        $this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i - 32);
    }
    for ($i = ord('A'); $i <= ord('Z'); $i++) {
        $this->caseFolding[$charset]['toLower'][chr($i)] = chr($i + 32);
    }
    if ($cacheFile) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->caseFolding[$charset]));
    }
    return 3;
}
1383
/**
 * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
 * This function is automatically called by the ASCII transliteration functions.
 *
 * The table maps every character of the charset's conversion table, for which a UTF-8
 * transliteration exists, to that transliteration. The table is always an array (possibly empty),
 * so a charset without any mapping is treated as loaded on the next call as well.
 * A cache file which does not contain a serialized array is ignored and rebuilt.
 *
 * @param string $charset Charset for which to initialize conversion.
 * @return int|bool Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
public function initToASCII($charset)
{
    // Only process if the transliteration table is not yet loaded:
    if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) {
        return 1;
    }
    // Use cached version if possible
    $cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/cs/csascii_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $cachedTable = unserialize(GeneralUtility::getUrl($cacheFile));
        if (is_array($cachedTable)) {
            $this->toASCII[$charset] = $cachedTable;
            return 2;
        }
    }
    // Init UTF-8 conversion for this charset
    if (!$this->initCharset($charset)) {
        return false;
    }
    // UTF-8/ASCII transliteration is used as the base conversion table
    if (!$this->initUnicodeData('ascii')) {
        return false;
    }
    // Make sure the table exists even if no character has a transliteration
    if (!isset($this->toASCII[$charset]) || !is_array($this->toASCII[$charset])) {
        $this->toASCII[$charset] = array();
    }
    foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
        // Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
        $c = $this->utf8_decode($utf8, $charset);
        if (isset($this->toASCII['utf-8'][$utf8])) {
            $this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
        }
    }
    if ($cacheFile) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->toASCII[$charset]));
    }
    return 3;
}
1424
1425 /********************************************
1426 *
1427 * String operation functions
1428 *
1429 ********************************************/
/**
 * Returns a part of a string.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * The work is delegated to mbstring or iconv if getConversionStrategy() reports one of them.
 * Otherwise UTF-8 and EUC based charsets are handled by the internal functions, charsets with a
 * fixed width of two or four bytes by scaling the positions, and everything else is treated as
 * single-byte encoding.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param int $start Start position (character position)
 * @param int $len Length (in characters); NULL returns everything up to the end of the string
 * @return string The substring
 * @see substr(), mb_substr()
 */
public function substr($charset, $string, $start, $len = null)
{
    if ($len === 0 || $string === '') {
        return '';
    }
    $strategy = $this->getConversionStrategy();
    if ($strategy === self::STRATEGY_MBSTRING) {
        // A length of NULL makes mb_substr() extract up to the end of the string
        return mb_substr($string, $start, $len, $charset);
    }
    if ($strategy === self::STRATEGY_ICONV) {
        if ($len !== null) {
            return iconv_substr($string, $start, $len, $charset);
        }
        // Cannot omit $len, when specifying charset: switch the internal encoding temporarily
        $enc = iconv_get_encoding('internal_encoding');
        iconv_set_encoding('internal_encoding', $charset);
        $str = iconv_substr($string, $start);
        // Restore internal encoding
        iconv_set_encoding('internal_encoding', $enc);
        return $str;
    }
    if ($charset === 'utf-8') {
        return $this->utf8_substr($string, $start, $len);
    }
    if (!empty($this->eucBasedSets[$charset])) {
        return $this->euc_substr($string, $start, $charset, $len);
    }
    if (!empty($this->twoByteSets[$charset])) {
        $bytesPerCharacter = 2;
    } elseif (!empty($this->fourByteSets[$charset])) {
        $bytesPerCharacter = 4;
    } else {
        // Treat everything else as single-byte encoding
        return $len === null ? substr($string, $start) : substr($string, $start, $len);
    }
    // Fixed width encoding: scale character positions to byte positions.
    // The length must be left out if it is NULL, as NULL * n would request zero bytes.
    return $len === null
        ? substr($string, $start * $bytesPerCharacter)
        : substr($string, $start * $bytesPerCharacter, $len * $bytesPerCharacter);
}
1484
/**
 * Counts the number of characters.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * The work is delegated to mbstring or iconv if getConversionStrategy() reports one of them.
 * For charsets with a fixed width of two or four bytes an incomplete trailing character is not counted.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @return int The number of characters
 * @see strlen()
 */
public function strlen($charset, $string)
{
    $strategy = $this->getConversionStrategy();
    if ($strategy === self::STRATEGY_MBSTRING) {
        return mb_strlen($string, $charset);
    }
    if ($strategy === self::STRATEGY_ICONV) {
        return iconv_strlen($string, $charset);
    }
    if ($charset === 'utf-8') {
        return $this->utf8_strlen($string);
    }
    if (!empty($this->eucBasedSets[$charset])) {
        return $this->euc_strlen($string, $charset);
    }
    if (!empty($this->twoByteSets[$charset])) {
        // Cast keeps the result an integer for odd byte lengths
        return (int)(strlen($string) / 2);
    }
    if (!empty($this->fourByteSets[$charset])) {
        return (int)(strlen($string) / 4);
    }
    // Treat everything else as single-byte encoding
    return strlen($string);
}
1512
/**
 * Crops a string with the help of the mbstring functions.
 *
 * A positive $len keeps the first $len characters and appends $crop, a negative $len keeps
 * the last abs($len) characters and prepends $crop. The string is returned unchanged if
 * $len is zero or the string is not longer than abs($len) characters.
 *
 * @param string $charset The character set
 * @param string $string String to be cropped
 * @param int $len Crop length (in characters)
 * @param string $crop Crop signifier
 * @return string The shortened string
 * @see mb_strlen(), mb_substr()
 */
protected function cropMbstring($charset, $string, $len, $crop = '')
{
    if ((int)$len === 0) {
        return $string;
    }
    $characterCount = mb_strlen($string, $charset);
    if ($characterCount <= abs($len)) {
        // Nothing to cut off
        return $string;
    }
    return $len > 0
        ? mb_substr($string, 0, $len, $charset) . $crop
        : $crop . mb_substr($string, $len, $characterCount, $charset);
}
1535
/**
 * Truncates a string and pre-/appends a string.
 * Unit tested by Kasper
 *
 * A positive $len keeps the first $len characters and appends $crop, a negative $len keeps
 * the last abs($len) characters and prepends $crop. The string is returned unchanged if
 * $len is zero or reaches beyond the string.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param int $len Length (in characters)
 * @param string $crop Crop signifier
 * @return string The shortened string
 * @see substr(), mb_strimwidth()
 */
public function crop($charset, $string, $len, $crop = '')
{
    if ($this->getConversionStrategy() === self::STRATEGY_MBSTRING) {
        return $this->cropMbstring($charset, $string, $len, $crop);
    }
    if ((int)$len === 0) {
        return $string;
    }
    // Determine the byte position where the string has to be cut
    if ($charset === 'utf-8') {
        $i = $this->utf8_char2byte_pos($string, $len);
    } elseif (!empty($this->eucBasedSets[$charset])) {
        $i = $this->euc_char2byte_pos($string, $len, $charset);
    } elseif ($len > 0) {
        // Treat everything else as single-byte encoding
        $i = $len;
    } else {
        $i = strlen($string) + $len;
        if ($i <= 0) {
            $i = false;
        }
    }
    // $len outside actual string length
    if ($i === false) {
        return $string;
    }
    if ($len > 0) {
        // Only crop if there is something behind the cut position
        if (isset($string[$i])) {
            return substr($string, 0, $i) . $crop;
        }
    } elseif (isset($string[$i - 1])) {
        // Only crop if there is something before the cut position
        return $crop . substr($string, $i);
    }
    return $string;
}
1585
/**
 * Cuts a string short at a given byte length.
 *
 * The cut position is moved towards the start of the string where needed, so that no
 * character is split. A length of zero or less results in an empty string.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param int $len The byte length
 * @return string The shortened string
 * @see mb_strcut()
 */
public function strtrunc($charset, $string, $len)
{
    if ($len <= 0) {
        return '';
    }
    if ($this->getConversionStrategy() === self::STRATEGY_MBSTRING) {
        return mb_strcut($string, 0, $len, $charset);
    }
    if ($charset === 'utf-8') {
        return $this->utf8_strtrunc($string, $len);
    }
    if (!empty($this->eucBasedSets[$charset])) {
        return $this->euc_strtrunc($string, $len, $charset);
    }
    if (!empty($this->twoByteSets[$charset])) {
        // Realign to position dividable by two
        $len -= $len % 2;
    } elseif (!empty($this->fourByteSets[$charset])) {
        // Realign to position dividable by four
        $len -= $len % 4;
    }
    // Treat everything else as single-byte encoding
    return substr($string, 0, $len);
}
1618
/**
 * Translates all characters of a string into their respective case values.
 * Unlike strtolower() and strtoupper() this method is locale independent.
 * Note that the string length may change!
 * eg. lower case German "ß" (sharp S) becomes upper case "SS"
 * Unit-tested by Kasper
 * Real case folding is language dependent, this method ignores this fact.
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert case for
 * @param string $case Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
 * @return string The converted string
 * @see strtolower(), strtoupper()
 */
public function conv_case($charset, $string, $case)
{
    if ($this->getConversionStrategy() === self::STRATEGY_MBSTRING) {
        return $case === 'toLower'
            ? mb_strtolower($string, $charset)
            : mb_strtoupper($string, $charset);
    }
    if ($charset === 'utf-8') {
        return $this->utf8_char_mapping($string, 'case', $case);
    }
    if (isset($this->eucBasedSets[$charset])) {
        return $this->euc_char_mapping($string, $charset, 'case', $case);
    }
    // Treat everything else as single-byte encoding
    return $this->sb_char_mapping($string, $charset, 'case', $case);
}
1651
/**
 * Equivalent of lcfirst/ucfirst but using character set.
 *
 * Only the first character of the string is converted, the remainder is appended unchanged.
 *
 * @param string $charset Character set of string
 * @param string $string Input string whose first character is converted
 * @param string $case Case keyword, passed on to conv_case()
 * @return string The string with its first character converted
 * @see \TYPO3\CMS\Core\Charset\CharsetConverter::conv_case()
 */
public function convCaseFirst($charset, $string, $case)
{
    $convertedFirstChar = $this->conv_case($charset, $this->substr($charset, $string, 0, 1), $case);
    return $convertedFirstChar . $this->substr($charset, $string, 1);
}
1668
/**
 * Converts special chars (like æøåÆØÅ, umlauts etc) to ascii equivalents (usually double-bytes, like æ => ae etc.)
 *
 * UTF-8 and EUC based charsets are handled by their own mapping functions,
 * every other charset is treated as single-byte encoding.
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert
 * @return string The converted string
 */
public function specCharsToASCII($charset, $string)
{
    if ($charset === 'utf-8') {
        return $this->utf8_char_mapping($string, 'ascii');
    }
    if (isset($this->eucBasedSets[$charset])) {
        return $this->euc_char_mapping($string, $charset, 'ascii');
    }
    // Treat everything else as single-byte encoding
    return $this->sb_char_mapping($string, $charset, 'ascii');
}
1688
/**
 * Converts the language codes that we get from the client (usually HTTP_ACCEPT_LANGUAGE)
 * into a TYPO3-readable language code
 *
 * The languages are checked by descending quality ("q" value, 1.0 if not given). Languages of
 * equal quality are checked in the order in which they appear in the list. For each language
 * the full code is tried first, then the code without its country part.
 *
 * @param string $languageCodesList List of language codes. something like 'de,en-us;q=0.9,de-de;q=0.7,es-cl;q=0.6,en;q=0.4,es;q=0.3,zh;q=0.1'
 * @return string A preferred language that TYPO3 supports, or "default" if none found
 */
public function getPreferredClientLanguage($languageCodesList)
{
    $allLanguageCodes = $this->getAllLanguageCodes();
    $selectedLanguage = 'default';
    // Collect the quality of every language; a repeated language keeps its first position
    $qualityByLanguage = array();
    foreach (GeneralUtility::trimExplode(',', $languageCodesList) as $preferredLanguage) {
        $quality = 1.0;
        if (strpos($preferredLanguage, ';q=') !== false) {
            list($preferredLanguage, $quality) = explode(';q=', $preferredLanguage);
        }
        $qualityByLanguage[$preferredLanguage] = (float)$quality;
    }
    // Remember the position in the list, as the sort functions do not
    // guarantee to keep the order of equal elements in all PHP versions
    $rankedLanguages = array();
    $position = 0;
    foreach ($qualityByLanguage as $languageCode => $quality) {
        $rankedLanguages[] = array(
            'code' => (string)$languageCode,
            'quality' => $quality,
            'position' => $position++
        );
    }
    // Highest quality first, list order as tie-breaker
    usort($rankedLanguages, function (array $a, array $b) {
        if ($a['quality'] == $b['quality']) {
            return $a['position'] - $b['position'];
        }
        return $a['quality'] > $b['quality'] ? -1 : 1;
    });
    // Loop through the languages, with the highest priority first
    foreach ($rankedLanguages as $rankedLanguage) {
        $preferredLanguage = $rankedLanguage['code'];
        if (isset($allLanguageCodes[$preferredLanguage])) {
            $selectedLanguage = $allLanguageCodes[$preferredLanguage];
            break;
        }
        // Strip the country code from the end
        list($preferredLanguage) = explode('-', $preferredLanguage);
        if (isset($allLanguageCodes[$preferredLanguage])) {
            $selectedLanguage = $allLanguageCodes[$preferredLanguage];
            break;
        }
    }
    // English is the default language of TYPO3
    if (!$selectedLanguage || $selectedLanguage === 'en') {
        $selectedLanguage = 'default';
    }
    return $selectedLanguage;
}
1729
/**
 * Merges all available charsets and locales, currently only used for getPreferredClientLanguage()
 *
 * @return array Map with the ISO language code (with "-" as separator) as key and the TYPO3 language code as value
 */
protected function getAllLanguageCodes()
{
    // Start with all languages where the TYPO3 code is the same as the ISO code
    $isoCodeByTypo3Code = array();
    foreach (array_keys($this->charSetArray) as $typo3LanguageCode) {
        $isoCodeByTypo3Code[$typo3LanguageCode] = $typo3LanguageCode;
    }
    // Add all languages where the TYPO3 code differs from the ISO code or needs the country part;
    // the ISO code replaces the TYPO3 code which was set as value above
    /** @var Locales $locales */
    $locales = GeneralUtility::makeInstance(Locales::class);
    foreach ($locales->getIsoMapping() as $typo3LanguageCode => $isoLanguageCode) {
        $isoCodeByTypo3Code[$typo3LanguageCode] = str_replace('_', '-', $isoLanguageCode);
    }
    // Move the ISO codes to the keys (because the keys are compared with "isset" later on)
    return array_flip($isoCodeByTypo3Code);
}
1752
1753 /********************************************
1754 *
1755 * Internal string operation functions
1756 *
1757 ********************************************/
/**
 * Maps all characters of a string in a single byte charset.
 *
 * The string is returned unchanged if the mode is unknown or
 * if the mapping table of the charset cannot be initialized.
 *
 * @param string $str The string
 * @param string $charset The charset
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string
 */
public function sb_char_mapping($str, $charset, $mode, $opt = '')
{
    // Select the mapping table; it is referenced and not copied
    switch ($mode) {
        case 'case':
            if (!$this->initCaseFolding($charset)) {
                return $str;
            }
            $map = &$this->caseFolding[$charset][$opt];
            break;
        case 'ascii':
            if (!$this->initToASCII($charset)) {
                return $str;
            }
            $map = &$this->toASCII[$charset];
            break;
        default:
            return $str;
    }
    // Replace byte by byte, bytes without a mapping are kept
    $mapped = '';
    $index = 0;
    while (isset($str[$index])) {
        $byte = $str[$index];
        $mapped .= isset($map[$byte]) ? $map[$byte] : $byte;
        $index++;
    }
    return $mapped;
}
1798
1799 /********************************************
1800 *
1801 * Internal UTF-8 string operation functions
1802 *
1803 ********************************************/
/**
 * Returns a part of a UTF-8 string.
 * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
 *
 * @param string $str UTF-8 string
 * @param int $start Start position (character position)
 * @param int $len Length (in characters)
 * @return string|bool The substring, FALSE if a positive $start lies outside the string
 * @see substr()
 */
public function utf8_substr($str, $start, $len = null)
{
    if ((string)$len === '0') {
        return '';
    }
    $startByte = $this->utf8_char2byte_pos($str, $start);
    if ($startByte === false && $start > 0) {
        // $start outside string length
        return false;
    }
    $remainder = substr($str, $startByte);
    if ($len == null) {
        // No length given: everything from the start position
        return $remainder;
    }
    $endByte = $this->utf8_char2byte_pos($remainder, $len);
    if ($endByte !== false) {
        return substr($remainder, 0, $endByte);
    }
    // $len outside actual string length: a negative length which exceeds
    // the string results in a blank string, a positive one in the whole remainder
    return $len < 0 ? '' : $remainder;
}
1840
/**
 * Counts the number of characters of a string in UTF-8.
 * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
 *
 * @param string $str UTF-8 multibyte character string
 * @return int The number of characters
 * @see strlen()
 */
public function utf8_strlen($str)
{
    $characterCount = 0;
    $offset = 0;
    while (isset($str[$offset])) {
        // Single bytes (0xxxxxxx) and lead bytes (11xxxxxx) each start a character,
        // continuation bytes (10xxxxxx) do not
        if ((ord($str[$offset]) & 192) !== 128) {
            $characterCount++;
        }
        $offset++;
    }
    return $characterCount;
}
1864
/**
 * Truncates a string in UTF-8 short at a given byte length.
 *
 * A multi-byte character which does not fit completely into the first $len bytes is
 * dropped as a whole. A length of zero or less results in an empty string, a length
 * beyond the end of the string returns the string unchanged.
 *
 * @param string $str UTF-8 multibyte character string
 * @param int $len The byte length
 * @return string The shortened string
 * @see mb_strcut()
 */
public function utf8_strtrunc($str, $len)
{
    if ($len <= 0) {
        return '';
    }
    if ($len > strlen($str)) {
        // Nothing to cut off
        return $str;
    }
    $i = $len - 1;
    // Last byte to keep is part of a multibyte sequence
    if (ord($str[$i]) & 128) {
        // Walk back to the lead byte (11xxxxxx) of that sequence
        while ($i > 0 && !(ord($str[$i]) & 64)) {
            $i--;
        }
        if (!(ord($str[$i]) & 64)) {
            // Only continuation bytes up to the start of the string (malformed input)
            return '';
        }
        // The number of leading 1-bits of the lead byte is the byte count of the sequence
        for ($bc = 0, $mbs = ord($str[$i]); $mbs & 128; $mbs = $mbs << 1) {
            $bc++;
        }
        // Drop the sequence if it reaches beyond the requested length
        if ($bc + $i > $len) {
            return substr($str, 0, $i);
        }
    }
    return substr($str, 0, $len);
}
1894
/**
 * Find position of first occurrence of a string, both arguments are in UTF-8.
 *
 * The work is delegated to mbstring or iconv if getConversionStrategy() reports one of them.
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 string to search for
 * @param int $offset Position to start the search
 * @return int|bool The character position, FALSE if the needle was not found or the offset lies beyond the string
 * @see strpos()
 */
public function utf8_strpos($haystack, $needle, $offset = 0)
{
    if ($this->getConversionStrategy() === self::STRATEGY_MBSTRING) {
        return mb_strpos($haystack, $needle, $offset, 'utf-8');
    }
    if ($this->getConversionStrategy() === self::STRATEGY_ICONV) {
        return iconv_strpos($haystack, $needle, $offset, 'utf-8');
    }
    // Search on byte level and translate the positions
    $offsetInBytes = $this->utf8_char2byte_pos($haystack, $offset);
    if ($offsetInBytes === false) {
        // Offset beyond string length
        return false;
    }
    $positionInBytes = strpos($haystack, $needle, $offsetInBytes);
    // FALSE means: needle not found
    return $positionInBytes === false
        ? false
        : $this->utf8_byte2char_pos($haystack, $positionInBytes);
}
1923
/**
 * Finds the character position of the last occurrence of a character in a
 * string, both arguments are in UTF-8.
 *
 * Delegates to mb_strrpos() or iconv_strrpos() when the matching conversion
 * strategy is selected; otherwise the search runs on bytes and the result
 * is translated back into a character position.
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 character to search for (single character)
 * @return int|bool The character position, FALSE if the needle was not found
 * @see strrpos()
 */
public function utf8_strrpos($haystack, $needle)
{
    $strategy = $this->getConversionStrategy();
    if ($strategy === self::STRATEGY_MBSTRING) {
        return mb_strrpos($haystack, $needle, 'utf-8');
    }
    if ($strategy === self::STRATEGY_ICONV) {
        return iconv_strrpos($haystack, $needle, 'utf-8');
    }
    $foundAtByte = strrpos($haystack, $needle);
    // Only a real hit can be translated into a character position
    return $foundAtByte === false
        ? false
        : $this->utf8_byte2char_pos($haystack, $foundAtByte);
}
1946
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * For a position of zero or more the string is scanned from the start, for a
 * negative position it is scanned backwards from the last byte.
 *
 * @param string $str UTF-8 string
 * @param int $pos Character position (negative values start from the end)
 * @return int|bool Byte position, FALSE if the scan runs off either end of the string
 */
public function utf8_char2byte_pos($str, $pos)
{
    // Number of characters found
    $n = 0;
    // Number of characters wanted
    $p = abs($pos);
    if ($pos >= 0) {
        $i = 0;
        $d = 1;
    } else {
        $i = strlen($str) - 1;
        $d = -1;
    }
    // "$i >= 0" is required for the backward scan: since PHP 7.1 a negative string
    // offset addresses bytes from the end, so isset() alone would wrap around.
    for (; $i >= 0 && isset($str[$i]) && $n < $p; $i += $d) {
        $c = ord($str[$i]);
        // Single-byte (0xxxxxxx) and multi-byte starting bytes (11xxxxxx) open a
        // character, continuation bytes (10xxxxxx) do not
        if (($c & 192) !== 128) {
            $n++;
        }
    }
    if ($i < 0 || !isset($str[$i])) {
        // Offset beyond string length
        return false;
    }
    if ($pos >= 0) {
        // Skip trailing multi-byte data bytes, but never read past the end of the string
        while (isset($str[$i]) && (ord($str[$i]) & 192) === 128) {
            $i++;
        }
    } else {
        // Correct offset: the backward scan stopped one byte in front of the character
        $i++;
    }
    return $i;
}
1993
/**
 * Translates an 'absolute' byte position into a character position.
 *
 * The result is the number of character starts found in the byte range
 * 1..$pos. If $pos points at the first byte of a character (and the string
 * starts with a character start) this is the index of that character.
 *
 * @param string $str UTF-8 string
 * @param int $pos Byte position
 * @return int|bool Character position, FALSE for an empty string or a position outside the string
 */
public function utf8_byte2char_pos($str, $pos)
{
    $str = (string)$str;
    $pos = (int)$pos;
    $byteLength = strlen($str);
    // Reject positions outside the string up front. Checking after the loop is
    // useless because the loop index always ends at 0.
    if ($byteLength === 0 || $pos < 0 || $pos > $byteLength) {
        return false;
    }
    // Number of characters
    $n = 0;
    for ($i = $pos; $i > 0; $i--) {
        // The position directly behind the last byte is where a following character
        // would start, so it counts like a starting byte. Otherwise count single-byte
        // (0xxxxxxx) and multi-byte starting bytes (11xxxxxx), skip data bytes (10xxxxxx).
        if ($i === $byteLength || (ord($str[$i]) & 192) !== 128) {
            $n++;
        }
    }
    return $n;
}
2022
/**
 * Maps all characters of an UTF-8 string.
 *
 * Each character is looked up in the mapping table selected by $mode;
 * characters without an entry are copied to the result unchanged.
 *
 * @param string $str UTF-8 string
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string; the input string if the mapping data could not be initialized or $mode is unknown
 */
public function utf8_char_mapping($str, $mode, $opt = '')
{
    if (!$this->initUnicodeData($mode)) {
        // Do nothing
        return $str;
    }
    $out = '';
    switch ($mode) {
        case 'case':
            $map = &$this->caseFolding['utf-8'][$opt];
            break;
        case 'ascii':
            $map = &$this->toASCII['utf-8'];
            break;
        default:
            return $str;
    }
    for ($i = 0; isset($str[$i]); $i++) {
        $c = ord($str[$i]);
        if (!($c & 128)) {
            // single-byte (0xxxxxxx)
            $mbc = $str[$i];
        } elseif (($c & 192) === 192) {
            // multi-byte starting byte (11xxxxxx):
            // calculate number of bytes from the number of leading 1 bits
            for ($bc = 0; $c & 128; $c = $c << 1) {
                $bc++;
            }
            $mbc = substr($str, $i, $bc);
            $i += $bc - 1;
        } else {
            // Stray continuation byte (10xxxxxx) in malformed input: pass it through
            // as is instead of emitting the previous character a second time
            $mbc = $str[$i];
        }
        if (isset($map[$mbc])) {
            $out .= $map[$mbc];
        } else {
            $out .= $mbc;
        }
    }
    return $out;
}
2070
2071 /********************************************
2072 *
2073 * Internal EUC string operation functions
2074 *
2075 * Extended Unix Code:
2076 * ASCII compatible 7bit single bytes chars
2077 * 8bit two byte chars
2078 *
2079 * Shift-JIS is treated as a special case.
2080 *
2081 ********************************************/
/**
 * Cuts a string in the EUC charset family short at a given byte length
 * without splitting a double-byte character.
 *
 * @param string $str EUC multibyte character string
 * @param int $len The byte length
 * @param string $charset The charset; 'shift_jis' selects the Shift-JIS lead byte ranges
 * @return string The shortened string
 * @see mb_strcut()
 */
public function euc_strtrunc($str, $len, $charset)
{
    // Empty string, or string not longer than the supplied length: nothing to cut.
    // This has to be decided on the byte length. The scan below may step past the
    // end of the string when the final double-byte character straddles $len.
    if (!isset($str[0]) || strlen($str) <= $len) {
        return $str;
    }
    $shiftJis = $charset === 'shift_jis';
    for ($i = 0; isset($str[$i]) && $i < $len; $i++) {
        $c = ord($str[$i]);
        if ($shiftJis) {
            // First byte of a double-byte char in Shift-JIS
            if ($c >= 128 && $c < 160 || $c >= 224) {
                $i++;
            }
        } else {
            // First byte of a double-byte char
            if ($c >= 128) {
                $i++;
            }
        }
    }
    if ($i > $len) {
        // We ended on a first byte: drop it as well
        return substr($str, 0, $len - 1);
    }
    return substr($str, 0, $len);
}
2117
/**
 * Returns a part of a string in the EUC charset family.
 *
 * @param string $str EUC multibyte character string
 * @param int $start Start position (character position)
 * @param string $charset The charset
 * @param int $len Length (in characters); NULL returns everything from $start to the end
 * @return string|bool The substring, FALSE if $start lies outside the string
 */
public function euc_substr($str, $start, $charset, $len = null)
{
    $byte_start = $this->euc_char2byte_pos($str, $start, $charset);
    if ($byte_start === false) {
        // $start outside string length
        return false;
    }
    $str = substr($str, $byte_start);
    if ($len === 0 || $len === '0') {
        // An explicit length of zero selects nothing. The loose NULL comparison
        // below would otherwise mistake it for "no length given".
        return '';
    }
    if ($len == null) {
        // No length given: everything from $start to the end
        return $str;
    }
    $byte_end = $this->euc_char2byte_pos($str, $len, $charset);
    if ($byte_end === false) {
        // $len outside actual string length: a positive length keeps the rest,
        // a negative length strips more characters than there are
        return $len < 0 ? '' : $str;
    }
    return substr($str, 0, $byte_end);
}
2147
/**
 * Returns the number of characters of a string in the EUC charset family.
 *
 * A byte that qualifies as the first byte of a double-byte character
 * consumes the following byte as well; every other byte is one character.
 *
 * @param string $str EUC multibyte character string
 * @param string $charset The charset; 'shift_jis' selects the Shift-JIS lead byte ranges
 * @return int The number of characters
 * @see strlen()
 */
public function euc_strlen($str, $charset)
{
    $isShiftJis = ($charset === 'shift_jis');
    $characters = 0;
    $offset = 0;
    while (isset($str[$offset])) {
        $byte = ord($str[$offset]);
        if ($isShiftJis) {
            $opensDoubleByte = ($byte >= 128 && $byte < 160) || $byte >= 224;
        } else {
            $opensDoubleByte = $byte >= 128;
        }
        // Step over the whole character, whatever its width
        $offset += $opensDoubleByte ? 2 : 1;
        $characters++;
    }
    return $characters;
}
2175
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * For a position of zero or more the string is scanned from the start, for a
 * negative position it is scanned backwards from the last byte.
 *
 * @param string $str EUC multibyte character string
 * @param int $pos Character position (negative values start from the end)
 * @param string $charset The charset; 'shift_jis' selects the Shift-JIS lead byte ranges
 * @return int|bool Byte position, FALSE if the scan runs off either end of the string
 */
public function euc_char2byte_pos($str, $pos, $charset)
{
    $sjis = $charset === 'shift_jis';
    // Number of characters seen
    $n = 0;
    // Number of characters wanted
    $p = abs($pos);
    if ($pos >= 0) {
        $i = 0;
        $d = 1;
    } else {
        $i = strlen($str) - 1;
        $d = -1;
    }
    // "$i >= 0" is required for the backward scan: since PHP 7.1 a negative string
    // offset addresses bytes from the end, so isset() alone would wrap around.
    for (; $i >= 0 && isset($str[$i]) && $n < $p; $i += $d) {
        $c = ord($str[$i]);
        if ($sjis) {
            // Double-byte char in Shift-JIS: step over its second byte
            if ($c >= 128 && $c < 160 || $c >= 224) {
                $i += $d;
            }
        } else {
            // Double-byte char: step over its second byte
            if ($c >= 128) {
                $i += $d;
            }
        }
        $n++;
    }
    if ($i < 0 || !isset($str[$i])) {
        // Offset beyond string length
        return false;
    }
    if ($pos < 0) {
        // Correct offset: the backward scan stopped one byte in front of the character
        $i++;
    }
    return $i;
}
2221
/**
 * Maps all characters of a string in the EUC charset family.
 *
 * Each character is looked up in the mapping table selected by $mode;
 * characters without an entry are copied to the result unchanged.
 *
 * @param string $str EUC multibyte character string
 * @param string $charset The charset; 'shift_jis' selects the Shift-JIS lead byte ranges
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string; the input string if the mapping table could not be initialized or $mode is unknown
 */
public function euc_char_mapping($str, $charset, $mode, $opt = '')
{
    switch ($mode) {
        case 'case':
            if (!$this->initCaseFolding($charset)) {
                // No case folding table for this charset: leave the string alone
                return $str;
            }
            $map = &$this->caseFolding[$charset][$opt];
            break;
        case 'ascii':
            if (!$this->initToASCII($charset)) {
                // No transliteration table for this charset: leave the string alone
                return $str;
            }
            $map = &$this->toASCII[$charset];
            break;
        default:
            return $str;
    }
    $isShiftJis = ($charset === 'shift_jis');
    $result = '';
    $offset = 0;
    while (isset($str[$offset])) {
        $byte = ord($str[$offset]);
        if ($isShiftJis) {
            $opensDoubleByte = ($byte >= 128 && $byte < 160) || $byte >= 224;
        } else {
            $opensDoubleByte = $byte >= 128;
        }
        if ($opensDoubleByte) {
            // A double-byte char
            $character = substr($str, $offset, 2);
            $offset += 2;
        } else {
            $character = $str[$offset];
            $offset++;
        }
        $result .= isset($map[$character]) ? $map[$character] : $character;
    }
    return $result;
}
2277
/**
 * Determines the strategy for multi-byte string handling from the setting
 * $TYPO3_CONF_VARS[SYS][t3lib_cs_utils].
 *
 * @return string self::STRATEGY_MBSTRING or self::STRATEGY_ICONV if configured exactly like that, self::STRATEGY_FALLBACK for any other value
 */
protected function getConversionStrategy()
{
    $configured = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];
    // Strict comparison: only the exact strategy names select an extension
    if ($configured === self::STRATEGY_MBSTRING || $configured === self::STRATEGY_ICONV) {
        return $configured;
    }
    return self::STRATEGY_FALLBACK;
}
2293 }