[TASK] Remove unneeded parenthesis on array-access
[Packages/TYPO3.CMS.git] / typo3 / sysext / core / Classes / Charset / CharsetConverter.php
1 <?php
2 namespace TYPO3\CMS\Core\Charset;
3
4 /*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use TYPO3\CMS\Core\Localization\Locales;
18 use TYPO3\CMS\Core\Utility\ExtensionManagementUtility;
19 use TYPO3\CMS\Core\Utility\GeneralUtility;
20
21 /**
22 * Notes on UTF-8
23 *
24 * Functions working on UTF-8 strings:
25 *
26 * - strchr/strstr
27 * - strrchr
28 * - substr_count
29 * - implode/explode/join
30 *
31 * Functions nearly working on UTF-8 strings:
32 *
33 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
34 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
35 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
36 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
37 * - preg_*: Support compiled into PHP by default nowadays, but could be unavailable, need to use modifier
38 *
39 * Functions NOT working on UTF-8 strings:
40 *
41 * - str*cmp
42 * - stristr
43 * - stripos
44 * - substr
45 * - strrev
46 * - split/spliti
47 * - ...
48 */
49
50 /**
51 * Class for conversion between charsets
52 */
53 class CharsetConverter
54 {
55 /**
56 * @var \TYPO3\CMS\Core\Localization\Locales
57 */
58 protected $locales;
59
60 /**
61 * ASCII Value for chars with no equivalent.
62 *
63 * @var int
64 */
65 public $noCharByteVal = 63;
66
67 /**
68 * This is the array where parsed conversion tables are stored (cached)
69 *
70 * @var array
71 */
72 public $parsedCharsets = array();
73
74 /**
75 * An array where case folding data will be stored (cached)
76 *
77 * @var array
78 */
79 public $caseFolding = array();
80
81 /**
82 * An array where charset-to-ASCII mappings are stored (cached)
83 *
84 * @var array
85 */
86 public $toASCII = array();
87
88 /**
89 * This tells the converter which charsets have two bytes per char:
90 *
91 * @var array
92 */
93 public $twoByteSets = array(
94 'ucs-2' => 1
95 );
96
97 /**
98 * This tells the converter which charsets have four bytes per char:
99 *
100 * @var array
101 */
102 public $fourByteSets = array(
103 'ucs-4' => 1, // 4-byte Unicode
104 'utf-32' => 1
105 );
106
107 /**
108 * This tells the converter which charsets use a scheme like the Extended Unix Code:
109 *
110 * @var array
111 */
112 public $eucBasedSets = array(
113 'gb2312' => 1, // Chinese, simplified.
114 'big5' => 1, // Chinese, traditional.
115 'euc-kr' => 1, // Korean
116 'shift_jis' => 1
117 );
118
119 /**
120 * @link http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
121 * @link http://czyborra.com/charsets/iso8859.html
122 *
123 * @var array
124 */
125 public $synonyms = array(
126 'us' => 'ascii',
127 'us-ascii' => 'ascii',
128 'cp819' => 'iso-8859-1',
129 'ibm819' => 'iso-8859-1',
130 'iso-ir-100' => 'iso-8859-1',
131 'iso-ir-101' => 'iso-8859-2',
132 'iso-ir-109' => 'iso-8859-3',
133 'iso-ir-110' => 'iso-8859-4',
134 'iso-ir-144' => 'iso-8859-5',
135 'iso-ir-127' => 'iso-8859-6',
136 'iso-ir-126' => 'iso-8859-7',
137 'iso-ir-138' => 'iso-8859-8',
138 'iso-ir-148' => 'iso-8859-9',
139 'iso-ir-157' => 'iso-8859-10',
140 'iso-ir-179' => 'iso-8859-13',
141 'iso-ir-199' => 'iso-8859-14',
142 'iso-ir-203' => 'iso-8859-15',
143 'csisolatin1' => 'iso-8859-1',
144 'csisolatin2' => 'iso-8859-2',
145 'csisolatin3' => 'iso-8859-3',
146 'csisolatin5' => 'iso-8859-9',
147 'csisolatin8' => 'iso-8859-14',
148 'csisolatin9' => 'iso-8859-15',
149 'csisolatingreek' => 'iso-8859-7',
150 'iso-celtic' => 'iso-8859-14',
151 'latin1' => 'iso-8859-1',
152 'latin2' => 'iso-8859-2',
153 'latin3' => 'iso-8859-3',
154 'latin5' => 'iso-8859-9',
155 'latin6' => 'iso-8859-10',
156 'latin8' => 'iso-8859-14',
157 'latin9' => 'iso-8859-15',
158 'l1' => 'iso-8859-1',
159 'l2' => 'iso-8859-2',
160 'l3' => 'iso-8859-3',
161 'l5' => 'iso-8859-9',
162 'l6' => 'iso-8859-10',
163 'l8' => 'iso-8859-14',
164 'l9' => 'iso-8859-15',
165 'cyrillic' => 'iso-8859-5',
166 'arabic' => 'iso-8859-6',
167 'tis-620' => 'iso-8859-11',
168 'win874' => 'windows-874',
169 'win1250' => 'windows-1250',
170 'win1251' => 'windows-1251',
171 'win1252' => 'windows-1252',
172 'win1253' => 'windows-1253',
173 'win1254' => 'windows-1254',
174 'win1255' => 'windows-1255',
175 'win1256' => 'windows-1256',
176 'win1257' => 'windows-1257',
177 'win1258' => 'windows-1258',
178 'cp1250' => 'windows-1250',
179 'cp1251' => 'windows-1251',
180 'cp1252' => 'windows-1252',
181 'ms-ee' => 'windows-1250',
182 'ms-ansi' => 'windows-1252',
183 'ms-greek' => 'windows-1253',
184 'ms-turk' => 'windows-1254',
185 'winbaltrim' => 'windows-1257',
186 'koi-8ru' => 'koi-8r',
187 'koi8r' => 'koi-8r',
188 'cp878' => 'koi-8r',
189 'mac' => 'macroman',
190 'macintosh' => 'macroman',
191 'euc-cn' => 'gb2312',
192 'x-euc-cn' => 'gb2312',
193 'euccn' => 'gb2312',
194 'cp936' => 'gb2312',
195 'big-5' => 'big5',
196 'cp950' => 'big5',
197 'eucjp' => 'euc-jp',
198 'sjis' => 'shift_jis',
199 'shift-jis' => 'shift_jis',
200 'cp932' => 'shift_jis',
201 'cp949' => 'euc-kr',
202 'utf7' => 'utf-7',
203 'utf8' => 'utf-8',
204 'utf16' => 'utf-16',
205 'utf32' => 'utf-32',
206 'ucs2' => 'ucs-2',
207 'ucs4' => 'ucs-4'
208 );
209
/**
 * Mapping of language identifiers to script names.
 *
 * Three kinds of keys are present: iso-639-1 language codes, MS (Windows)
 * three-letter language codes and English language names. The values are
 * the keys used by the script-to-charset tables of this class.
 *
 * The misspelled legacy keys 'svedish' and 'that' are kept next to the
 * corrected 'swedish' and 'thai' so that code reading this public array
 * with the old spelling keeps working.
 *
 * @var array
 */
public $lang_to_script = array(
    // iso-639-1 language codes, see http://www.loc.gov/standards/iso639-2/php/code_list.php
    'af' => 'west_european', // Afrikaans
    'ar' => 'arabic',
    'bg' => 'cyrillic', // Bulgarian
    'bs' => 'east_european', // Bosnian
    'cs' => 'east_european', // Czech
    'da' => 'west_european', // Danish
    'de' => 'west_european', // German
    'es' => 'west_european', // Spanish
    'et' => 'estonian',
    'eo' => 'unicode', // Esperanto
    'eu' => 'west_european', // Basque
    'fa' => 'arabic', // Persian
    'fi' => 'west_european', // Finnish
    'fo' => 'west_european', // Faroese
    'fr' => 'west_european', // French
    'ga' => 'west_european', // Irish
    'gl' => 'west_european', // Galician
    'gr' => 'greek',
    'he' => 'hebrew', // Hebrew (since 1998)
    'hi' => 'unicode', // Hindi
    'hr' => 'east_european', // Croatian
    'hu' => 'east_european', // Hungarian
    'iw' => 'hebrew', // Hebrew (til 1998)
    'is' => 'west_european', // Icelandic
    'it' => 'west_european', // Italian
    'ja' => 'japanese',
    'ka' => 'unicode', // Georgian
    'kl' => 'west_european', // Greenlandic
    'km' => 'unicode', // Khmer
    'ko' => 'korean',
    'lt' => 'lithuanian',
    'lv' => 'west_european', // Latvian/Lettish
    'nl' => 'west_european', // Dutch
    'no' => 'west_european', // Norwegian
    'nb' => 'west_european', // Norwegian Bokmal
    'nn' => 'west_european', // Norwegian Nynorsk
    'pl' => 'east_european', // Polish
    'pt' => 'west_european', // Portuguese
    'ro' => 'east_european', // Romanian
    'ru' => 'cyrillic', // Russian
    'sk' => 'east_european', // Slovak
    'sl' => 'east_european', // Slovenian
    'sr' => 'cyrillic', // Serbian
    'sv' => 'west_european', // Swedish
    'sq' => 'albanian', // Albanian
    'th' => 'thai',
    'uk' => 'cyrillic', // Ukrainian
    'vi' => 'vietnamese',
    'zh' => 'chinese',

    // MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
    // http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
    'afk' => 'west_european', // Afrikaans
    'ara' => 'arabic',
    'bgr' => 'cyrillic', // Bulgarian
    'cat' => 'west_european', // Catalan
    'chs' => 'simpl_chinese',
    'cht' => 'trad_chinese',
    'csy' => 'east_european', // Czech
    'dan' => 'west_european', // Danish
    'deu' => 'west_european', // German
    'dea' => 'west_european', // German (Austrian)
    'des' => 'west_european', // German (Swiss)
    'ena' => 'west_european', // English (Australian)
    'enc' => 'west_european', // English (Canadian)
    'eng' => 'west_european', // English
    'enz' => 'west_european', // English (New Zealand)
    'enu' => 'west_european', // English (United States)
    'euq' => 'west_european', // Basque
    'fos' => 'west_european', // Faroese
    'far' => 'arabic', // Persian
    'fin' => 'west_european', // Finnish
    'fra' => 'west_european', // French
    'frb' => 'west_european', // French (Belgian)
    'frc' => 'west_european', // French (Canadian)
    'frs' => 'west_european', // French (Swiss)
    'geo' => 'unicode', // Georgian
    'glg' => 'west_european', // Galician
    'ell' => 'greek',
    'heb' => 'hebrew',
    'hin' => 'unicode', // Hindi
    'hun' => 'east_european', // Hungarian
    'isl' => 'west_european', // Icelandic
    'ita' => 'west_european', // Italian
    'its' => 'west_european', // Italian (Swiss)
    'jpn' => 'japanese',
    'khm' => 'unicode', // Khmer
    'kor' => 'korean',
    'lth' => 'lithuanian',
    'lvi' => 'west_european', // Latvian/Lettish
    'msl' => 'west_european', // Malay
    'nlb' => 'west_european', // Dutch (Belgian)
    'nld' => 'west_european', // Dutch
    'nor' => 'west_european', // Norwegian (bokmal)
    'non' => 'west_european', // Norwegian (nynorsk)
    'plk' => 'east_european', // Polish
    'ptg' => 'west_european', // Portuguese
    'ptb' => 'west_european', // Portuguese (Brazil)
    'rom' => 'east_european', // Romanian
    'rus' => 'cyrillic', // Russian
    'slv' => 'east_european', // Slovenian
    'sky' => 'east_european', // Slovak
    'srl' => 'east_european', // Serbian (Latin)
    'srb' => 'cyrillic', // Serbian (Cyrillic)
    'esp' => 'west_european', // Spanish (trad. sort)
    'esm' => 'west_european', // Spanish (Mexican)
    'esn' => 'west_european', // Spanish (internat. sort)
    'sve' => 'west_european', // Swedish
    'sqi' => 'albanian', // Albanian
    'tha' => 'thai',
    'trk' => 'turkish',
    'ukr' => 'cyrillic', // Ukrainian

    // English language names
    'afrikaans' => 'west_european',
    'albanian' => 'albanian',
    'arabic' => 'arabic',
    'basque' => 'west_european',
    'bosnian' => 'east_european',
    // Same script as the 'bg' and 'bgr' entries above
    'bulgarian' => 'cyrillic',
    'catalan' => 'west_european',
    'croatian' => 'east_european',
    'czech' => 'east_european',
    'danish' => 'west_european',
    'dutch' => 'west_european',
    'english' => 'west_european',
    'esperanto' => 'unicode',
    'estonian' => 'estonian',
    'faroese' => 'west_european',
    'farsi' => 'arabic',
    'finnish' => 'west_european',
    'french' => 'west_european',
    'galician' => 'west_european',
    'georgian' => 'unicode',
    'german' => 'west_european',
    'greek' => 'greek',
    'greenlandic' => 'west_european',
    'hebrew' => 'hebrew',
    'hindi' => 'unicode',
    'hungarian' => 'east_european',
    'icelandic' => 'west_european',
    'italian' => 'west_european',
    'khmer' => 'unicode',
    'latvian' => 'west_european',
    'lettish' => 'west_european',
    'lithuanian' => 'lithuanian',
    'malay' => 'west_european',
    'norwegian' => 'west_european',
    'persian' => 'arabic',
    'polish' => 'east_european',
    'portuguese' => 'west_european',
    'russian' => 'cyrillic',
    'romanian' => 'east_european',
    'serbian' => 'cyrillic',
    'slovak' => 'east_european',
    'slovenian' => 'east_european',
    'spanish' => 'west_european',
    'swedish' => 'west_european',
    'svedish' => 'west_european', // legacy misspelling of "swedish", kept for backward compatibility
    'thai' => 'thai',
    'that' => 'thai', // legacy misspelling of "thai", kept for backward compatibility
    'turkish' => 'turkish',
    'ukrainian' => 'cyrillic'
);
379
380 /**
381 * Mapping of language (family) names to charsets on Unix
382 *
383 * @var array
384 */
385 public $script_to_charset_unix = array(
386 'west_european' => 'iso-8859-1',
387 'estonian' => 'iso-8859-1',
388 'east_european' => 'iso-8859-2',
389 'baltic' => 'iso-8859-4',
390 'cyrillic' => 'iso-8859-5',
391 'arabic' => 'iso-8859-6',
392 'greek' => 'iso-8859-7',
393 'hebrew' => 'iso-8859-8',
394 'turkish' => 'iso-8859-9',
395 'thai' => 'iso-8859-11', // = TIS-620
396 'lithuanian' => 'iso-8859-13',
397 'chinese' => 'gb2312', // = euc-cn
398 'japanese' => 'euc-jp',
399 'korean' => 'euc-kr',
400 'simpl_chinese' => 'gb2312',
401 'trad_chinese' => 'big5',
402 'vietnamese' => '',
403 'unicode' => 'utf-8',
404 'albanian' => 'utf-8'
405 );
406
407 /**
408 * Mapping of language (family) names to charsets on Windows
409 *
410 * @var array
411 */
412 public $script_to_charset_windows = array(
413 'east_european' => 'windows-1250',
414 'cyrillic' => 'windows-1251',
415 'west_european' => 'windows-1252',
416 'greek' => 'windows-1253',
417 'turkish' => 'windows-1254',
418 'hebrew' => 'windows-1255',
419 'arabic' => 'windows-1256',
420 'baltic' => 'windows-1257',
421 'estonian' => 'windows-1257',
422 'lithuanian' => 'windows-1257',
423 'vietnamese' => 'windows-1258',
424 'thai' => 'cp874',
425 'korean' => 'cp949',
426 'chinese' => 'gb2312',
427 'japanese' => 'shift_jis',
428 'simpl_chinese' => 'gb2312',
429 'trad_chinese' => 'big5',
430 'albanian' => 'windows-1250',
431 'unicode' => 'utf-8'
432 );
433
/**
 * Mapping of locale names to charsets
 *
 * get_locale_charset() lowercases the locale before looking it up here,
 * so only lowercase keys can ever match.
 *
 * @var array
 */
public $locale_to_charset = array(
    'japanese.euc' => 'euc-jp',
    'ja_jp.ujis' => 'euc-jp',
    'korean.euc' => 'euc-kr',
    'sr@latn' => 'iso-8859-2',
    // Legacy mixed-case key: unreachable through get_locale_charset(), kept
    // only for code that reads this public array directly.
    'sr@Latn' => 'iso-8859-2',
    'zh_cn' => 'gb2312',
    'zh_hk' => 'big5',
    'zh_tw' => 'big5'
);
448
449 /**
450 * TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
451 * Empty values means "utf-8"
452 *
453 * @var array
454 */
455 public $charSetArray = array(
456 'af' => '',
457 'ar' => 'iso-8859-6',
458 'ba' => 'iso-8859-2',
459 'bg' => 'windows-1251',
460 'br' => '',
461 'ca' => 'iso-8859-15',
462 'ch' => 'gb2312',
463 'cs' => 'windows-1250',
464 'cz' => 'windows-1250',
465 'da' => '',
466 'de' => '',
467 'dk' => '',
468 'el' => 'iso-8859-7',
469 'eo' => 'utf-8',
470 'es' => '',
471 'et' => 'iso-8859-4',
472 'eu' => '',
473 'fa' => 'utf-8',
474 'fi' => '',
475 'fo' => 'utf-8',
476 'fr' => '',
477 'fr_CA' => '',
478 'ga' => '',
479 'ge' => 'utf-8',
480 'gl' => '',
481 'gr' => 'iso-8859-7',
482 'he' => 'utf-8',
483 'hi' => 'utf-8',
484 'hk' => 'big5',
485 'hr' => 'windows-1250',
486 'hu' => 'iso-8859-2',
487 'is' => 'utf-8',
488 'it' => '',
489 'ja' => 'shift_jis',
490 'jp' => 'shift_jis',
491 'ka' => 'utf-8',
492 'kl' => 'utf-8',
493 'km' => 'utf-8',
494 'ko' => 'euc-kr',
495 'kr' => 'euc-kr',
496 'lt' => 'windows-1257',
497 'lv' => 'utf-8',
498 'ms' => '',
499 'my' => '',
500 'nl' => '',
501 'no' => '',
502 'pl' => 'iso-8859-2',
503 'pt' => '',
504 'pt_BR' => '',
505 'qc' => '',
506 'ro' => 'iso-8859-2',
507 'ru' => 'windows-1251',
508 'se' => '',
509 'si' => 'windows-1250',
510 'sk' => 'windows-1250',
511 'sl' => 'windows-1250',
512 'sq' => 'utf-8',
513 'sr' => 'utf-8',
514 'sv' => '',
515 'th' => 'iso-8859-11',
516 'tr' => 'iso-8859-9',
517 'ua' => 'windows-1251',
518 'uk' => 'windows-1251',
519 'vi' => 'utf-8',
520 'vn' => 'utf-8',
521 'zh' => 'big5'
522 );
523
/**
 * Sets up the converter by obtaining the Locales instance through
 * GeneralUtility::makeInstance() and storing it in $this->locales.
 */
public function __construct()
{
    $localesInstance = GeneralUtility::makeInstance(Locales::class);
    $this->locales = $localesInstance;
}
531
/**
 * Normalizes a charset name: lowercases it, trims surrounding whitespace
 * and replaces it with its canonical name if it is a known synonym.
 *
 * @param string $charset Input charset
 * @return string Normalized charset
 */
public function parse_charset($charset)
{
    $normalized = trim(strtolower($charset));
    // Known aliases (e.g. "latin1") resolve to their canonical name
    return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
}
546
/**
 * Get the charset of a locale.
 *
 * Accepted locale shapes:
 *
 * ln            language
 * ln_CN         language / country
 * ln_CN.cs      language / country / charset
 * ln_CN.cs@mod  language / country / charset / modifier
 *
 * Resolution order:
 * 1. an exact (lowercased) match in $this->locale_to_charset,
 * 2. the charset part of the locale, normalized by parse_charset(),
 * 3. 'iso-8859-15' if the modifier is "euro",
 * 4. the charset of the language's script, taken from the Windows table
 *    when TYPO3_OS is 'WIN' and from the Unix table otherwise,
 * 5. 'windows-1252' (Windows) or 'utf-8' (otherwise) as the last resort.
 *
 * @param string $locale Locale string
 * @return string Charset resolved for locale string
 */
public function get_locale_charset($locale)
{
    $locale = strtolower($locale);
    // Exact locale specific charset?
    if (isset($this->locale_to_charset[$locale])) {
        return $this->locale_to_charset[$locale];
    }
    // Split off the modifier. array_pad() guarantees that both list() targets
    // exist even if the locale contains no "@" (no undefined-offset error).
    list($locale, $modifier) = array_pad(explode('@', $locale), 2, '');
    // Locale contains charset: use it
    list($locale, $charset) = array_pad(explode('.', $locale), 2, '');
    if ($charset) {
        return $this->parse_charset($charset);
    }
    // Modifier is 'euro' (after charset check, because of xx.utf-8@euro)
    if ($modifier === 'euro') {
        return 'iso-8859-15';
    }
    // Get language; an unknown language yields an empty script name, which
    // matches no table entry and therefore ends in the OS specific fallback.
    list($language) = explode('_', $locale);
    $script = isset($this->lang_to_script[$language]) ? $this->lang_to_script[$language] : '';
    if (TYPO3_OS === 'WIN') {
        $charsetByScript = $this->script_to_charset_windows;
        $fallbackCharset = 'windows-1252';
    } else {
        $charsetByScript = $this->script_to_charset_unix;
        $fallbackCharset = 'utf-8';
    }
    // Empty table values (e.g. 'vietnamese' on Unix) also use the fallback
    return !empty($charsetByScript[$script]) ? $charsetByScript[$script] : $fallbackCharset;
}
588
589 /********************************************
590 *
591 * Charset Conversion functions
592 *
593 ********************************************/
/**
 * Convert a string from one charset to another charset.
 *
 * If both charsets are identical the input is returned untouched. Unless
 * entity fallback is requested for a non-UTF-8 target, the PHP extension
 * selected by $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']
 * ('mbstring', 'iconv' or 'recode') is tried first. If that is not possible
 * or yields FALSE, the string is converted via UTF-8 with utf8_encode()
 * and utf8_decode().
 *
 * @param string $inputString Input string
 * @param string $fromCharset From charset (the current charset of the string)
 * @param string $toCharset To charset (the output charset wanted)
 * @param bool $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Converted string
 * @see convArray()
 */
public function conv($inputString, $fromCharset, $toCharset, $useEntityForNoChar = false)
{
    if ($fromCharset === $toCharset) {
        return $inputString;
    }
    // PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
    $nativeConversionAllowed = $toCharset === 'utf-8' || !$useEntityForNoChar;
    if ($nativeConversionAllowed) {
        $nativeResult = false;
        switch ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']) {
            case 'mbstring':
                // Returns FALSE for unsupported charsets
                $nativeResult = mb_convert_encoding($inputString, $toCharset, $fromCharset);
                break;
            case 'iconv':
                $nativeResult = iconv($fromCharset, $toCharset . '//TRANSLIT', $inputString);
                break;
            case 'recode':
                $nativeResult = recode_string($fromCharset . '..' . $toCharset, $inputString);
                break;
        }
        if ($nativeResult !== false) {
            return $nativeResult;
        }
    }
    // Table based conversion with UTF-8 as the pivot charset
    $utf8String = $fromCharset !== 'utf-8'
        ? $this->utf8_encode($inputString, $fromCharset)
        : $inputString;
    return $toCharset !== 'utf-8'
        ? $this->utf8_decode($utf8String, $toCharset, $useEntityForNoChar)
        : $utf8String;
}
641
/**
 * Converts every string element of an array from one charset to another,
 * descending into nested arrays. Elements of any other type are left alone.
 * NOTICE: Array is passed by reference and modified in place!
 *
 * @param array $array Input array, possibly multidimensional
 * @param string $fromCharset From charset (the current charset of the string)
 * @param string $toCharset To charset (the output charset wanted)
 * @param bool $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return void
 * @see conv()
 */
public function convArray(&$array, $fromCharset, $toCharset, $useEntityForNoChar = false)
{
    foreach ($array as $index => $unusedCopy) {
        if (is_string($array[$index])) {
            $array[$index] = $this->conv($array[$index], $fromCharset, $toCharset, $useEntityForNoChar);
            continue;
        }
        if (is_array($array[$index])) {
            // Recurse on the original element (not the loop copy) so the change sticks
            $this->convArray($array[$index], $fromCharset, $toCharset, $useEntityForNoChar);
        }
    }
}
663
/**
 * Converts $str from $charset to UTF-8
 *
 * The conversion table of the charset is loaded through initCharset().
 * Bytes (or byte pairs) without an entry in the table are replaced by the
 * character with the byte value $this->noCharByteVal.
 *
 * @param string $str String in local charset to convert to UTF-8
 * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return string|null Output string, converted to UTF-8. NULL if the charset could not be initialized.
 */
public function utf8_encode($str, $charset)
{
    if ($charset === 'utf-8') {
        return $str;
    }
    // Parse conv. table if not already
    if (!$this->initCharset($charset)) {
        return null;
    }
    $strLen = strlen($str);
    $outStr = '';
    // Both lookups do not change while traversing the string
    $isTwoByteSet = isset($this->twoByteSets[$charset]);
    $isEucBasedSet = isset($this->eucBasedSets[$charset]);
    // Traverse each char in string
    for ($a = 0; $a < $strLen; $a++) {
        $chr = substr($str, $a, 1);
        $ord = ord($chr);
        if ($isTwoByteSet) {
            // Two bytes per char, big endian assumed. substr() instead of $str[$a]
            // so that a trailing single byte is combined with 0 instead of
            // reading an uninitialized string offset.
            $a++;
            $ord = ($ord << 8) | ord(substr($str, $a, 1));
        } elseif ($ord > 127) {
            // EUC uses two bytes above 127; we get both, advance the pointer and make $ord a 16bit int.
            // Shift-JIS: chars between 160 and 223 are single byte
            if ($isEucBasedSet && ($charset !== 'shift_jis' || $ord < 160 || $ord > 223)) {
                $a++;
                $ord = $ord * 256 + ord(substr($str, $a, 1));
            }
        } else {
            // 7-bit ASCII is copied unchanged
            $outStr .= $chr;
            continue;
        }
        // Use the UTF-8 sequence from the parsed conv. table, otherwise the "no char" byte
        if (isset($this->parsedCharsets[$charset]['local'][$ord])) {
            $outStr .= $this->parsedCharsets[$charset]['local'][$ord];
        } else {
            $outStr .= chr($this->noCharByteVal);
        }
    }
    return $outStr;
}
722
/**
 * Converts $str from UTF-8 to $charset
 *
 * The conversion table of the charset is loaded through initCharset().
 * UTF-8 sequences without an entry in the table become a hexadecimal
 * numeric entity if $useEntityForNoChar is set, otherwise the character
 * with the byte value $this->noCharByteVal. Bytes above 127 that cannot
 * start a sequence are replaced by that character as well.
 *
 * @param string $str String in UTF-8 to convert to local charset
 * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param bool $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string|null Output string, converted to local charset. NULL if the charset could not be initialized.
 */
public function utf8_decode($str, $charset, $useEntityForNoChar = false)
{
    if ($charset === 'utf-8') {
        return $str;
    }
    // Parse conv. table if not already
    if (!$this->initCharset($charset)) {
        return null;
    }
    $strLen = strlen($str);
    $outStr = '';
    // Traverse each char in UTF-8 string
    for ($a = 0; $a < $strLen; $a++) {
        $chr = substr($str, $a, 1);
        $ord = ord($chr);
        if ($ord <= 127) {
            // Plain ASCII
            $outStr .= $chr;
            continue;
        }
        // A first byte must have the 7th bit set, otherwise we are in the middle of a byte sequence.
        if (!($ord & 64)) {
            $outStr .= chr($this->noCharByteVal);
            continue;
        }
        // Collect the sequence: every further leading 1-bit of the first byte announces one more byte
        $buf = $chr;
        for ($b = 0; $b < 8; $b++) {
            $ord = $ord << 1;
            if (!($ord & 128)) {
                break;
            }
            $a++;
            $buf .= substr($str, $a, 1);
        }
        if (isset($this->parsedCharsets[$charset]['utf8'][$buf])) {
            // The local number
            $mByte = $this->parsedCharsets[$charset]['utf8'][$buf];
            // A local number greater than 255 is split into two chars (16bit word assumed)
            if ($mByte > 255) {
                $outStr .= chr(($mByte >> 8) & 255) . chr($mByte & 255);
            } else {
                $outStr .= chr($mByte);
            }
        } elseif ($useEntityForNoChar) {
            // Create num entity:
            $outStr .= '&#' . $this->utf8CharToUnumber($buf, 1) . ';';
        } else {
            $outStr .= chr($this->noCharByteVal);
        }
    }
    return $outStr;
}
790
/**
 * Converts all chars > 127 to numeric entities.
 *
 * Every UTF-8 multibyte sequence becomes a hexadecimal entity built from
 * utf8CharToUnumber(). A byte above 127 that cannot start a sequence is
 * replaced by the character with the byte value $this->noCharByteVal.
 *
 * @param string $str Input string
 * @return string Output string
 */
public function utf8_to_entities($str)
{
    $length = strlen($str);
    $result = '';
    $position = 0;
    while ($position < $length) {
        $byte = substr($str, $position, 1);
        $code = ord($byte);
        if ($code <= 127) {
            // Plain ASCII passes through
            $result .= $byte;
        } elseif (!($code & 64)) {
            // The 7th bit is not set: not a valid first byte of a sequence
            $result .= chr($this->noCharByteVal);
        } else {
            // Every further leading 1-bit of the first byte announces one more byte
            $sequence = $byte;
            for ($n = 0; $n < 8; $n++) {
                $code = $code << 1;
                if (!($code & 128)) {
                    break;
                }
                $position++;
                $sequence .= substr($str, $position, 1);
            }
            $result .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
        }
        $position++;
    }
    return $result;
}
834
/**
 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
 *
 * Only well-formed numeric entities are converted: "&#" followed by decimal
 * digits, or "&#x" / "&#X" followed by hexadecimal digits. Malformed ones
 * (e.g. "&#;" or "&#12ab;") are returned unchanged instead of being passed
 * to UnumberToChar() as non-numeric strings.
 *
 * @param string $str Input string, UTF-8
 * @param bool $alsoStdHtmlEnt If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
 * @return string Output string
 */
public function entities_to_utf8($str, $alsoStdHtmlEnt = false)
{
    // Lookup table "entity => UTF-8 char" for named entities, only built on request
    $namedEntities = $alsoStdHtmlEnt
        ? array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8'))
        : array();
    $converted = preg_replace_callback(
        '/&([#[:alnum:]]*);/',
        function (array $match) use ($namedEntities) {
            list($entity, $name) = $match;
            if (substr($name, 0, 1) === '#') {
                // Dec or hex entities. On a hex match only group 1 is set, on a decimal match group 2.
                if (!preg_match('/^#(?:[xX]([0-9a-fA-F]+)|([0-9]+))$/', $name, $number)) {
                    // Malformed numeric entity: no conversion
                    return $entity;
                }
                $unicodeInteger = isset($number[2]) ? (int)$number[2] : hexdec($number[1]);
                return $this->UnumberToChar($unicodeInteger);
            }
            // Other entities; anything unknown is kept as it is
            return isset($namedEntities[$entity]) ? $namedEntities[$entity] : $entity;
        },
        $str
    );
    // preg_replace_callback() returns NULL on a PCRE error; hand back the input in that case
    return $converted === null ? $str : $converted;
}
875
/**
 * Splits a UTF-8 string into its characters and returns them as an array,
 * either as integer numbers or as the UTF-8 chars themselves.
 *
 * A byte above 127 that cannot start a multibyte sequence is reported as
 * $this->noCharByteVal (or the corresponding char if $retChar is set).
 *
 * @param string $str Input string, UTF-8
 * @param bool $convEntities If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
 * @param bool $retChar If set, then instead of integer numbers the real UTF-8 char is returned.
 * @return array Output array with the char numbers
 */
public function utf8_to_numberarray($str, $convEntities = false, $retChar = false)
{
    // If entities must be registered as well, resolve them first
    if ($convEntities) {
        $str = $this->entities_to_utf8($str, 1);
    }
    $length = strlen($str);
    $characters = array();
    $position = 0;
    while ($position < $length) {
        $byte = substr($str, $position, 1);
        $code = ord($byte);
        if ($code <= 127) {
            // Plain ASCII
            $characters[] = $retChar ? chr($code) : $code;
        } elseif (!($code & 64)) {
            // The 7th bit is not set: not a valid first byte of a sequence
            $characters[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
        } else {
            // Every further leading 1-bit of the first byte announces one more byte
            $sequence = $byte;
            for ($n = 0; $n < 8; $n++) {
                $code = $code << 1;
                if (!($code & 128)) {
                    break;
                }
                $position++;
                $sequence .= substr($str, $position, 1);
            }
            $characters[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
        }
        $position++;
    }
    return $characters;
}
926
/**
 * Converts a UNICODE number to a UTF-8 multibyte character
 * Algorithm based on script found at From: http://czyborra.com/utf/
 *
 * The bits of the number are spread across the bytes of the sequence; the
 * number of high bits set in the lead byte announces the sequence length:
 *
 * bytes | bits | representation
 *     1 |    7 | 0vvvvvvv
 *     2 |   11 | 110vvvvv 10vvvvvv
 *     3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
 *     4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 *     5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *     6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * Numbers of 2147483648 and above are answered with the character that has
 * the byte value $this->noCharByteVal.
 *
 * @param int $unicodeInteger UNICODE integer
 * @return string UTF-8 multibyte character string
 * @see utf8CharToUnumber()
 */
public function UnumberToChar($unicodeInteger)
{
    if ($unicodeInteger < 128) {
        return chr($unicodeInteger);
    }
    // Each entry: exclusive upper bound, lead byte marker, number of continuation bytes
    $layouts = array(
        array(2048, 192, 1),
        array(65536, 224, 2),
        array(2097152, 240, 3),
        array(67108864, 248, 4),
        array(2147483648, 252, 5)
    );
    foreach ($layouts as $layout) {
        list($upperBound, $leadMarker, $continuationBytes) = $layout;
        if ($unicodeInteger < $upperBound) {
            // The lead byte carries the topmost bits ...
            $char = chr($leadMarker | ($unicodeInteger >> (6 * $continuationBytes)));
            // ... every continuation byte the next lower 6 bits
            for ($shift = 6 * ($continuationBytes - 1); $shift >= 0; $shift -= 6) {
                $char .= chr(128 | (($unicodeInteger >> $shift) & 63));
            }
            return $char;
        }
    }
    // Cannot express a 32-bit character in UTF-8
    return chr($this->noCharByteVal);
}
983
/**
 * Converts a UTF-8 Multibyte character to a UNICODE number
 *
 * The payload bits of the lead byte and of every continuation byte are
 * concatenated as a binary string and converted with bindec(). If the
 * first byte does not start a multibyte sequence, its byte value is
 * returned.
 *
 * @param string $str UTF-8 multibyte character string
 * @param bool $hex If set, then a hex. number is returned.
 * @return int|string UNICODE integer, or "x" followed by the hexadecimal number if $hex is set
 * @see UnumberToChar()
 */
public function utf8CharToUnumber($str, $hex = false)
{
    // First char
    $leadByte = ord($str[0]);
    // Both top bits set verifies that it IS a multi byte string
    if (($leadByte & 192) !== 192) {
        $unicodeInteger = $leadByte;
    } else {
        $payloadBits = '';
        $probe = $leadByte;
        $b = 0;
        while ($b < 8) {
            // Shift left; while the 8th bit is set there are still bytes in the sequence
            $probe = $probe << 1;
            if (!($probe & 128)) {
                break;
            }
            // Every continuation byte contributes its lowest 6 bits
            $payloadBits .= substr('00000000' . decbin(ord(substr($str, $b + 1, 1))), -6);
            $b++;
        }
        // $b now is the number of continuation bytes; the lead byte keeps 6 - $b payload bits
        $leadBits = substr('00000000' . decbin($leadByte), -(6 - $b));
        $unicodeInteger = bindec($leadBits . $payloadBits);
    }
    return $hex ? 'x' . dechex($unicodeInteger) : $unicodeInteger;
}
1018
1019 /********************************************
1020 *
1021 * Init functions
1022 *
1023 ********************************************/
/**
 * This will initialize a charset for use if it's defined in the 'typo3/sysext/core/Resources/Private/Charsets/csconvtbl/' folder
 * This function is automatically called by the conversion functions
 *
 * The parsed table is stored in $this->parsedCharsets[$charset] with the keys
 * 'local' (byte value => UTF-8 character) and 'utf8' (UTF-8 character => byte value).
 * Only byte values above 127 are stored.
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param string $charset The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return int|bool Returns 1 if already loaded. Returns FALSE if charset conversion table was not found. Returns 2 if the charset conversion table was found and loaded (from cache file or by parsing).
 * @access private
 */
public function initCharset($charset)
{
    // Only process if the charset is not yet loaded (isset() avoids a notice for unknown charsets)
    if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
        return 1;
    }
    // Conversion table filename:
    $charsetConvTableFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/csconvtbl/' . $charset . '.tbl';
    // Bail out if the conversion table is not found:
    if (!($charset && GeneralUtility::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile))) {
        return false;
    }
    // Cache file for charsets:
    // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
    $cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/cs/charset_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        // NOTE(review): unserialize() is applied to the content of a file below typo3temp/;
        // this is only safe as long as that directory is not writable by untrusted parties.
        $this->parsedCharsets[$charset] = unserialize(GeneralUtility::getUrl($cacheFile));
        return 2;
    }
    // Parse conversion table into lines:
    $lines = GeneralUtility::trimExplode(LF, GeneralUtility::getUrl($charsetConvTableFile), true);
    // Initialize the internal variable holding the conv. table:
    $this->parsedCharsets[$charset] = array('local' => array(), 'utf8' => array());
    $whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
    $detectedType = '';
    foreach ($lines as $value) {
        // Comment line or blanks are ignored.
        if (!trim($value) || $value[0] === '#') {
            continue;
        }
        // Detect type if not done yet: (Done on first real line)
        // The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
        if (!$detectedType) {
            $detectedType = preg_match($whitespacedPattern, $value) ? 'whitespaced' : 'ms-token';
        }
        if ($detectedType === 'ms-token') {
            $parts = preg_split('/[=:]/', $value, 3);
            $hexbyte = $parts[0];
            // A line without separator has no Unicode part; treat it as empty instead of reading an undefined offset
            $utf8 = isset($parts[1]) ? $parts[1] : '';
        } else {
            $regA = array();
            if (!preg_match($whitespacedPattern, $value, $regA)) {
                // Malformed line: it never produced a mapping (byte value evaluated to 0), so skip it explicitly
                continue;
            }
            $hexbyte = $regA[1];
            $utf8 = 'U+' . $regA[2];
        }
        $decval = hexdec(trim($hexbyte));
        // The 7-bit range is identical to ASCII and is not stored
        if ($decval > 127) {
            // Strip the leading "U+" before converting the code point
            $utf8decval = hexdec(substr(trim($utf8), 2));
            $utf8Char = $this->UnumberToChar($utf8decval);
            $this->parsedCharsets[$charset]['local'][$decval] = $utf8Char;
            $this->parsedCharsets[$charset]['utf8'][$utf8Char] = $decval;
        }
    }
    if ($cacheFile) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->parsedCharsets[$charset]));
    }
    return 2;
}
1090
/**
 * This function initializes all UTF-8 character data tables.
 *
 * For the modes "case" and "ascii" an already loaded table or an existing cache
 * file is used. In every other situation UnicodeData.txt, SpecialCasing.txt and
 * Translit.txt are parsed; this rebuilds BOTH tables ($this->caseFolding['utf-8']
 * and $this->toASCII['utf-8']) and writes both cache files, independent of $mode.
 *
 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
 *
 * @param string $mode Mode ("case", "ascii", ...)
 * @return int|bool Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
public function initUnicodeData($mode = null)
{
    // Cache files
    $cacheFileCase = GeneralUtility::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
    $cacheFileASCII = GeneralUtility::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');
    // Only process if the tables are not yet loaded
    switch ($mode) {
        case 'case':
            if (is_array($this->caseFolding['utf-8'])) {
                return 1;
            }
            // Use cached version if possible
            if ($cacheFileCase && @is_file($cacheFileCase)) {
                $this->caseFolding['utf-8'] = unserialize(GeneralUtility::getUrl($cacheFileCase));
                return 2;
            }
            break;
        case 'ascii':
            if (is_array($this->toASCII['utf-8'])) {
                return 1;
            }
            // Use cached version if possible
            if ($cacheFileASCII && @is_file($cacheFileASCII)) {
                $this->toASCII['utf-8'] = unserialize(GeneralUtility::getUrl($cacheFileASCII));
                return 2;
            }
            break;
    }
    // Process main Unicode data file
    $unicodeDataFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/UnicodeData.txt';
    if (!(GeneralUtility::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
        return false;
    }
    $fh = fopen($unicodeDataFile, 'rb');
    if (!$fh) {
        return false;
    }
    // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
    // Note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
    $this->caseFolding['utf-8'] = array();
    // Shorthand: a reference, so everything written below ends up in $this->caseFolding['utf-8']
    $utf8CaseFolding = &$this->caseFolding['utf-8'];
    $utf8CaseFolding['toUpper'] = array();
    $utf8CaseFolding['toLower'] = array();
    $utf8CaseFolding['toTitle'] = array();
    // Array of temp. decompositions, keyed by "U+<hex code>", value = list of hex codes
    $decomposition = array();
    // Array of chars that are marks (eg. composing accents)
    $mark = array();
    // Array of chars that are numbers (eg. digits)
    $number = array();
    // Array of chars to be omitted (eg. Russian hard sign)
    $omit = array();
    while (!feof($fh)) {
        $line = fgets($fh, 4096);
        // Has a lot of info; only the semicolon separated fields 0, 1, 2, 5, 8, 12, 13 and 14 are picked
        // (presumably code, name, category, decomposition, numeric value, upper, lower, title - see the UnicodeData.txt format)
        list($char, $name, $cat, , , $decomp, , , $num, , , , $upper, $lower, $title, ) = explode(';', rtrim($line));
        $ord = hexdec($char);
        if ($ord > 65535) {
            // Only process the BMP
            break;
        }
        $utf8_char = $this->UnumberToChar($ord);
        if ($upper) {
            $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
        }
        if ($lower) {
            $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
        }
        // Store "title" only when different from "upper" (only a few)
        if ($title && $title !== $upper) {
            $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
        }
        // Only the first letter of the category field is evaluated
        switch ($cat[0]) {
            case 'M':
                // mark (accent, umlaut, ...)
                $mark['U+' . $char] = 1;
                break;
            case 'N':
                // numeric value (code points up to 128 are not recorded)
                if ($ord > 128 && $num !== '') {
                    $number['U+' . $char] = $num;
                }
        }
        // Accented Latin letters without "official" decomposition
        $match = array();
        if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
            // The name carries the capital base letter; adding 32 yields the ASCII small letter
            $c = ord($match[2]);
            if ($match[1] === 'SMALL') {
                $c += 32;
            }
            $decomposition['U+' . $char] = array(dechex($c));
            continue;
        }
        $match = array();
        // $match[1] = optional "<tag>" of the decomposition, $match[2] = the code point sequence
        if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
            switch ($match[1]) {
                case '<circle>':
                    // add parenthesis as circle replacement, eg (1)
                    $match[2] = '0028 ' . $match[2] . ' 0029';
                    break;
                case '<square>':
                    // add square brackets as square replacement, eg [1]
                    $match[2] = '005B ' . $match[2] . ' 005D';
                    break;
                case '<compat>':
                    // ignore multi char decompositions that start with a space
                    if (preg_match('/^0020 /', $match[2])) {
                        // "continue 2" leaves the switch AND proceeds with the next line of the file
                        continue 2;
                    }
                    break;
                case '<initial>':
                case '<medial>':
                case '<final>':
                case '<isolated>':
                case '<vertical>':
                    // Decompositions with these tags are not recorded at all
                    continue 2;
            }
            $decomposition['U+' . $char] = explode(' ', $match[2]);
        }
    }
    fclose($fh);
    // Process additional Unicode data for casing (allow folded characters to expand into a sequence)
    $specialCasingFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/SpecialCasing.txt';
    if (GeneralUtility::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
        $fh = fopen($specialCasingFile, 'rb');
        if ($fh) {
            while (!feof($fh)) {
                $line = fgets($fh, 4096);
                // Skip comment lines and blank lines
                if ($line[0] !== '#' && trim($line) !== '') {
                    list($char, $lower, $title, $upper, $cond) = GeneralUtility::trimExplode(';', $line);
                    // Only entries whose fifth field is empty or a comment are used, i.e. entries without a condition
                    if ($cond === '' || $cond[0] === '#') {
                        $utf8_char = $this->UnumberToChar(hexdec($char));
                        if ($char !== $lower) {
                            // The mapping may be a sequence of code points; convert each and concatenate
                            $arr = explode(' ', $lower);
                            for ($i = 0; isset($arr[$i]); $i++) {
                                $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
                            }
                            $utf8CaseFolding['toLower'][$utf8_char] = implode('', $arr);
                        }
                        if ($char !== $title && $title !== $upper) {
                            $arr = explode(' ', $title);
                            for ($i = 0; isset($arr[$i]); $i++) {
                                $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
                            }
                            $utf8CaseFolding['toTitle'][$utf8_char] = implode('', $arr);
                        }
                        if ($char !== $upper) {
                            $arr = explode(' ', $upper);
                            for ($i = 0; isset($arr[$i]); $i++) {
                                $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
                            }
                            $utf8CaseFolding['toUpper'][$utf8_char] = implode('', $arr);
                        }
                    }
                }
            }
            fclose($fh);
        }
    }
    // Process custom decompositions
    $customTranslitFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/Translit.txt';
    if (GeneralUtility::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
        $fh = fopen($customTranslitFile, 'rb');
        if ($fh) {
            while (!feof($fh)) {
                $line = fgets($fh, 4096);
                // Skip comment lines and blank lines
                if ($line[0] !== '#' && trim($line) !== '') {
                    list($char, $translit) = GeneralUtility::trimExplode(';', $line);
                    // An empty transliteration marks the character as "to be omitted"
                    if (!$translit) {
                        $omit['U+' . $char] = 1;
                    }
                    // Custom entries overrule decompositions collected from UnicodeData.txt
                    $decomposition['U+' . $char] = explode(' ', $translit);
                }
            }
            fclose($fh);
        }
    }
    // Decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
    foreach ($decomposition as $from => $to) {
        $code_decomp = array();
        // $to is consumed like a queue; the loop also ends on the first empty (falsy) code value
        while ($code_value = array_shift($to)) {
            // Do recursive decomposition
            if (isset($decomposition['U+' . $code_value])) {
                // Put the nested decomposition in front of the queue, keeping its order
                foreach (array_reverse($decomposition['U+' . $code_value]) as $cv) {
                    array_unshift($to, $cv);
                }
            } elseif (!isset($mark['U+' . $code_value])) {
                // Keep the code; marks are removed by not adding them
                array_push($code_decomp, $code_value);
            }
        }
        // Keep empty results only for characters that are explicitly to be omitted
        if (!empty($code_decomp) || isset($omit[$from])) {
            $decomposition[$from] = $code_decomp;
        } else {
            unset($decomposition[$from]);
        }
    }
    // Create ascii only mapping
    $this->toASCII['utf-8'] = array();
    // Reference, so everything written below ends up in $this->toASCII['utf-8']
    $ascii = &$this->toASCII['utf-8'];
    foreach ($decomposition as $from => $to) {
        $code_decomp = array();
        while ($code_value = array_shift($to)) {
            $ord = hexdec($code_value);
            if ($ord > 127) {
                // Skip decompositions containing non-ASCII chars (proceeds with the next $decomposition entry)
                continue 2;
            } else {
                array_push($code_decomp, chr($ord));
            }
        }
        // hexdec() ignores the non-hex characters of the "U+" prefix of $from
        $ascii[$this->UnumberToChar(hexdec($from))] = join('', $code_decomp);
    }
    // Add numeric decompositions (only where no decomposition exists yet)
    foreach ($number as $from => $to) {
        $utf8_char = $this->UnumberToChar(hexdec($from));
        if (!isset($ascii[$utf8_char])) {
            $ascii[$utf8_char] = $to;
        }
    }
    // Both tables are written to their cache files, independent of the requested $mode
    if ($cacheFileCase) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
    }
    if ($cacheFileASCII) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
    }
    return 3;
}
1329
/**
 * Builds the case folding table of a charset other than UTF-8.
 * The case folding functions call this method automatically.
 *
 * The table is derived from the UTF-8 case folding table by converting every
 * character of the charset's conversion table back into the charset; the ASCII
 * letters a-z / A-Z are added afterwards.
 *
 * @param string $charset Charset for which to initialize case folding.
 * @return int Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
public function initCaseFolding($charset)
{
    // Nothing to do if the case table is loaded already
    if (is_array($this->caseFolding[$charset])) {
        return 1;
    }
    // Prefer the cached version
    $cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/cs/cscase_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $this->caseFolding[$charset] = unserialize(GeneralUtility::getUrl($cacheFile));
        return 2;
    }
    // The charset's UTF-8 conversion table is required ...
    if (!$this->initCharset($charset)) {
        return false;
    }
    // ... as well as the UTF-8 case folding, which serves as base conversion table
    if (!$this->initUnicodeData('case')) {
        return false;
    }
    $nochar = chr($this->noCharByteVal);
    $foldingTypes = array('toUpper', 'toLower', 'toTitle');
    foreach ($this->parsedCharsets[$charset]['local'] as $utf8Char) {
        // Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
        $localChar = $this->utf8_decode($utf8Char, $charset);
        foreach ($foldingTypes as $foldingType) {
            $folded = $this->utf8_decode($this->caseFolding['utf-8'][$foldingType][$utf8Char], $charset);
            // Skip characters without mapping or without representation in the charset
            if ($folded !== '' && $folded !== $nochar) {
                $this->caseFolding[$charset][$foldingType][$localChar] = $folded;
            }
        }
    }
    // Add the ASCII case table: first a-z => A-Z, then A-Z => a-z
    $lowerCodes = range(ord('a'), ord('z'));
    foreach ($lowerCodes as $code) {
        $this->caseFolding[$charset]['toUpper'][chr($code)] = chr($code - 32);
    }
    $upperCodes = range(ord('A'), ord('Z'));
    foreach ($upperCodes as $code) {
        $this->caseFolding[$charset]['toLower'][chr($code)] = chr($code + 32);
    }
    if ($cacheFile) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->caseFolding[$charset]));
    }
    return 3;
}
1391
/**
 * Builds the to-ASCII conversion table of a charset other than UTF-8.
 * The ASCII transliteration functions call this method automatically.
 *
 * The table is derived from the UTF-8 transliteration table by converting every
 * character of the charset's conversion table back into the charset.
 *
 * @param string $charset Charset for which to initialize conversion.
 * @return int Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
public function initToASCII($charset)
{
    // Nothing to do if the table is loaded already
    if (is_array($this->toASCII[$charset])) {
        return 1;
    }
    // Prefer the cached version
    $cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/cs/csascii_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $this->toASCII[$charset] = unserialize(GeneralUtility::getUrl($cacheFile));
        return 2;
    }
    // The charset's UTF-8 conversion table is required ...
    if (!$this->initCharset($charset)) {
        return false;
    }
    // ... as well as the UTF-8/ASCII transliteration, which serves as base conversion table
    if (!$this->initUnicodeData('ascii')) {
        return false;
    }
    foreach ($this->parsedCharsets[$charset]['local'] as $utf8Char) {
        // Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
        $localChar = $this->utf8_decode($utf8Char, $charset);
        if (!isset($this->toASCII['utf-8'][$utf8Char])) {
            continue;
        }
        $this->toASCII[$charset][$localChar] = $this->toASCII['utf-8'][$utf8Char];
    }
    if ($cacheFile) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->toASCII[$charset]));
    }
    return 3;
}
1432
1433 /********************************************
1434 *
1435 * String operation functions
1436 *
1437 ********************************************/
/**
 * Returns a part of a string.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * Depending on $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] the work is
 * delegated to mbstring or iconv; otherwise the internal implementations are used.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param int $start Start position (character position)
 * @param int $len Length (in characters); NULL means "up to the end of the string"
 * @return string The substring
 * @see substr(), mb_substr()
 */
public function substr($charset, $string, $start, $len = null)
{
    if ($len === 0 || $string === '') {
        return '';
    }
    $utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];
    if ($utils === 'mbstring') {
        // Cannot omit $len, when specifying charset
        if ($len === null) {
            // Save internal encoding
            $enc = mb_internal_encoding();
            mb_internal_encoding($charset);
            $str = mb_substr($string, $start);
            // Restore internal encoding
            mb_internal_encoding($enc);
            return $str;
        }
        return mb_substr($string, $start, $len, $charset);
    }
    if ($utils === 'iconv') {
        // Cannot omit $len, when specifying charset
        if ($len === null) {
            // Save internal encoding
            // NOTE(review): iconv_set_encoding() is deprecated in newer PHP versions - confirm the supported PHP range
            $enc = iconv_get_encoding('internal_encoding');
            iconv_set_encoding('internal_encoding', $charset);
            $str = iconv_substr($string, $start);
            // Restore internal encoding
            iconv_set_encoding('internal_encoding', $enc);
            return $str;
        }
        return iconv_substr($string, $start, $len, $charset);
    }
    if ($charset === 'utf-8') {
        return $this->utf8_substr($string, $start, $len);
    }
    // !empty() keeps the truthiness check but raises no notice for charsets missing in the lists
    if (!empty($this->eucBasedSets[$charset])) {
        return $this->euc_substr($string, $start, $charset, $len);
    }
    // Fixed width encodings: translate character positions into byte positions.
    // Everything else is treated as single-byte encoding.
    $bytesPerChar = 1;
    if (!empty($this->twoByteSets[$charset])) {
        $bytesPerChar = 2;
    } elseif (!empty($this->fourByteSets[$charset])) {
        $bytesPerChar = 4;
    }
    if ($bytesPerChar > 1) {
        $start *= $bytesPerChar;
        // An omitted length must stay "up to the end"; multiplying NULL would turn it into 0
        if ($len !== null) {
            $len *= $bytesPerChar;
        }
    }
    return $len === null ? substr($string, $start) : substr($string, $start, $len);
}
1492
/**
 * Counts the number of characters.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * Depending on $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] the work is
 * delegated to mbstring or iconv; otherwise the internal implementations are used.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @return int The number of characters
 * @see strlen()
 */
public function strlen($charset, $string)
{
    $utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];
    if ($utils === 'mbstring') {
        return mb_strlen($string, $charset);
    }
    if ($utils === 'iconv') {
        return iconv_strlen($string, $charset);
    }
    if ($charset === 'utf-8') {
        return $this->utf8_strlen($string);
    }
    // !empty() keeps the truthiness check but raises no notice for charsets missing in the lists
    if (!empty($this->eucBasedSets[$charset])) {
        return $this->euc_strlen($string, $charset);
    }
    if (!empty($this->twoByteSets[$charset])) {
        return strlen($string) / 2;
    }
    if (!empty($this->fourByteSets[$charset])) {
        return strlen($string) / 4;
    }
    // Treat everything else as single-byte encoding
    return strlen($string);
}
1520
/**
 * Crops a string by means of the mbstring functions.
 *
 * A positive $len keeps the first $len characters and appends $crop, a negative
 * $len keeps the last abs($len) characters and prepends $crop. The string is
 * returned unchanged if $len is 0 or the string is not longer than abs($len).
 *
 * @param string $charset The character set
 * @param string $string String to be cropped
 * @param int $len Crop length (in characters)
 * @param string $crop Crop signifier
 * @return string The shortened string
 * @see mb_strlen(), mb_substr()
 */
protected function cropMbstring($charset, $string, $len, $crop = '')
{
    if ((int)$len === 0) {
        return $string;
    }
    $characterCount = mb_strlen($string, $charset);
    if ($characterCount <= abs($len)) {
        return $string;
    }
    if ($len > 0) {
        return mb_substr($string, 0, $len, $charset) . $crop;
    }
    return $crop . mb_substr($string, $len, $characterCount, $charset);
}
1543
/**
 * Truncates a string and pre-/appends a string.
 * Unit tested by Kasper
 *
 * A positive $len keeps the first $len characters and appends $crop, a negative
 * $len keeps the last abs($len) characters and prepends $crop. The string is
 * returned unchanged if $len is 0 or lies outside of the string.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param int $len Length (in characters)
 * @param string $crop Crop signifier
 * @return string The shortened string
 * @see substr(), mb_strimwidth()
 */
public function crop($charset, $string, $len, $crop = '')
{
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] === 'mbstring') {
        return $this->cropMbstring($charset, $string, $len, $crop);
    }
    if ((int)$len === 0) {
        return $string;
    }
    // Determine the byte position $i where the string is cut
    if ($charset === 'utf-8') {
        $i = $this->utf8_char2byte_pos($string, $len);
    } elseif (!empty($this->eucBasedSets[$charset])) {
        // !empty() keeps the truthiness check but raises no notice for charsets missing in the list
        $i = $this->euc_char2byte_pos($string, $len, $charset);
    } elseif ($len > 0) {
        // Everything else is treated as single-byte encoding
        $i = $len;
    } else {
        $i = strlen($string) + $len;
        if ($i <= 0) {
            $i = false;
        }
    }
    // $len outside actual string length
    if ($i === false) {
        return $string;
    }
    if ($len > 0) {
        // Only crop if there is at least one byte behind the cut position
        if (isset($string[$i])) {
            return substr($string, 0, $i) . $crop;
        }
    } elseif (isset($string[$i - 1])) {
        // Only crop if there is at least one byte in front of the cut position
        return $crop . substr($string, $i);
    }
    return $string;
}
1593
/**
 * Cuts a string short at a given byte length.
 *
 * The cut never splits a character: for multi byte charsets the length is
 * reduced until it lies on a character boundary.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param int $len The byte length
 * @return string The shortened string
 * @see mb_strcut()
 */
public function strtrunc($charset, $string, $len)
{
    if ($len <= 0) {
        return '';
    }
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] === 'mbstring') {
        return mb_strcut($string, 0, $len, $charset);
    }
    if ($charset === 'utf-8') {
        return $this->utf8_strtrunc($string, $len);
    }
    // !empty() keeps the truthiness check but raises no notice for charsets missing in the lists
    if (!empty($this->eucBasedSets[$charset])) {
        return $this->euc_strtrunc($string, $len, $charset);
    }
    if (!empty($this->twoByteSets[$charset])) {
        // Realign to position dividable by two
        if ($len % 2) {
            $len--;
        }
    } elseif (!empty($this->fourByteSets[$charset])) {
        // Realign to position dividable by four
        $len -= $len % 4;
    }
    // Treat everything else as single-byte encoding
    return substr($string, 0, $len);
}
1626
/**
 * Converts all characters of a string to lower or upper case.
 * In contrast to strtolower() and strtoupper() this method does not depend on the locale.
 * Note that the string length may change!
 * eg. lower case German "ß" (sharp S) becomes upper case "SS"
 * Unit-tested by Kasper
 * Real case folding is language dependent, this method ignores this fact.
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert case for
 * @param string $case Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
 * @return string The converted string
 * @see strtolower(), strtoupper()
 */
public function conv_case($charset, $string, $case)
{
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] === 'mbstring') {
        return $case === 'toLower'
            ? mb_strtolower($string, $charset)
            : mb_strtoupper($string, $charset);
    }
    if ($charset === 'utf-8') {
        return $this->utf8_char_mapping($string, 'case', $case);
    }
    if (isset($this->eucBasedSets[$charset])) {
        return $this->euc_char_mapping($string, $charset, 'case', $case);
    }
    // Treat everything else as single-byte encoding
    return $this->sb_char_mapping($string, $charset, 'case', $case);
}
1659
/**
 * Charset aware equivalent of lcfirst/ucfirst: converts the case of the
 * first character only and leaves the rest of the string untouched.
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert
 * @param string $case Case keyword, passed on to conv_case()
 * @return string The string with converted first character
 * @see \TYPO3\CMS\Core\Charset\CharsetConverter::conv_case()
 */
public function convCaseFirst($charset, $string, $case)
{
    $convertedHead = $this->conv_case($charset, $this->substr($charset, $string, 0, 1), $case);
    return $convertedHead . $this->substr($charset, $string, 1);
}
1676
/**
 * Replaces special chars (like æøåÆØÅ, umlauts etc) by their ascii equivalents (usually double-bytes, like æ => ae etc.)
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert
 * @return string The converted string
 */
public function specCharsToASCII($charset, $string)
{
    if ($charset === 'utf-8') {
        return $this->utf8_char_mapping($string, 'ascii');
    }
    if (isset($this->eucBasedSets[$charset])) {
        return $this->euc_char_mapping($string, $charset, 'ascii');
    }
    // Treat everything else as single-byte encoding
    return $this->sb_char_mapping($string, $charset, 'ascii');
}
1696
/**
 * Picks the TYPO3 language code that matches the language codes sent by the
 * client (usually HTTP_ACCEPT_LANGUAGE) best.
 *
 * @param string $languageCodesList List of language codes. something like 'de,en-us;q=0.9,de-de;q=0.7,es-cl;q=0.6,en;q=0.4,es;q=0.3,zh;q=0.1'
 * @return string A preferred language that TYPO3 supports, or "default" if none found
 */
public function getPreferredClientLanguage($languageCodesList)
{
    // Start with all languages where the TYPO3 code is the same as the ISO code
    $typo3ToIsoCode = array();
    foreach (array_keys($this->charSetArray) as $typo3Lang) {
        $typo3ToIsoCode[$typo3Lang] = $typo3Lang;
    }
    // Languages where the TYPO3 code differs from the ISO code or needs the
    // country part: here the ISO code replaces the TYPO3 code stored as value
    foreach ($this->locales->getIsoMapping() as $typo3Lang => $isoLang) {
        $typo3ToIsoCode[$typo3Lang] = str_replace('_', '-', $isoLang);
    }
    // Move the ISO codes to the keys (because they are looked up with "isset" later on)
    $isoToTypo3Code = array_flip($typo3ToIsoCode);
    // Collect the quality value of every preferred language, keyed by language code
    $qualityByLanguage = array();
    foreach (GeneralUtility::trimExplode(',', $languageCodesList) as $entry) {
        $quality = 1.0;
        if (strpos($entry, ';q=') !== false) {
            $entryParts = explode(';q=', $entry);
            $entry = $entryParts[0];
            $quality = $entryParts[1];
        }
        $qualityByLanguage[$entry] = $quality;
    }
    // Loop through the languages, with the highest priority first
    arsort($qualityByLanguage, SORT_NUMERIC);
    $selectedLanguage = 'default';
    foreach (array_keys($qualityByLanguage) as $candidate) {
        if (isset($isoToTypo3Code[$candidate])) {
            $selectedLanguage = $isoToTypo3Code[$candidate];
            break;
        }
        // Strip the country code from the end and try again
        $candidateParts = explode('-', $candidate);
        $candidate = $candidateParts[0];
        if (isset($isoToTypo3Code[$candidate])) {
            $selectedLanguage = $isoToTypo3Code[$candidate];
            break;
        }
    }
    if (!$selectedLanguage || $selectedLanguage === 'en') {
        $selectedLanguage = 'default';
    }
    return $selectedLanguage;
}
1750
1751 /********************************************
1752 *
1753 * Internal string operation functions
1754 *
1755 ********************************************/
/**
 * Replaces every byte of a string in a single byte charset by its mapped value.
 * Bytes without an entry in the mapping table are kept as they are; the string
 * is returned unchanged if the mode is unknown or the table can't be initialized.
 *
 * @param string $str The string
 * @param string $charset The charset
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string
 */
public function sb_char_mapping($str, $charset, $mode, $opt = '')
{
    // Select the mapping table (by reference) according to the mode
    switch ($mode) {
        case 'case':
            if (!$this->initCaseFolding($charset)) {
                return $str;
            }
            $map = &$this->caseFolding[$charset][$opt];
            break;
        case 'ascii':
            if (!$this->initToASCII($charset)) {
                return $str;
            }
            $map = &$this->toASCII[$charset];
            break;
        default:
            return $str;
    }
    $out = '';
    $i = 0;
    while (isset($str[$i])) {
        $c = $str[$i];
        $out .= isset($map[$c]) ? $map[$c] : $c;
        $i++;
    }
    return $out;
}
1796
1797 /********************************************
1798 *
1799 * Internal UTF-8 string operation functions
1800 *
1801 ********************************************/
/**
 * Extracts a part of a UTF-8 string.
 * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
 *
 * @param string $str UTF-8 string
 * @param int $start Start position (character position)
 * @param int $len Length (in characters)
 * @return string The substring; FALSE if a positive $start lies outside of the string
 * @see substr()
 */
public function utf8_substr($str, $start, $len = null)
{
    if ((string)$len === '0') {
        return '';
    }
    $byte_start = $this->utf8_char2byte_pos($str, $start);
    if ($byte_start === false && $start > 0) {
        // $start outside string length
        return false;
    }
    $tail = substr($str, $byte_start);
    if ($len == null) {
        // No length given: everything from the start position on
        return $tail;
    }
    $byte_end = $this->utf8_char2byte_pos($tail, $len);
    if ($byte_end === false) {
        // $len outside actual string length: a negative length that exceeds
        // the string gives a blank string, a positive one the whole rest
        return $len < 0 ? '' : $tail;
    }
    return substr($tail, 0, $byte_end);
}
1838
/**
 * Determines the number of characters of a UTF-8 string.
 * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
 *
 * @param string $str UTF-8 multibyte character string
 * @return int The number of characters
 * @see strlen()
 */
public function utf8_strlen($str)
{
    $characterCount = 0;
    $bytePosition = 0;
    while (isset($str[$bytePosition])) {
        // Every byte starts a character, except for continuation bytes (10xxxxxx):
        // that is single bytes (0xxxxxxx) and multi byte starting bytes (11xxxxxx)
        if ((ord($str[$bytePosition]) & 192) !== 128) {
            $characterCount++;
        }
        $bytePosition++;
    }
    return $characterCount;
}
1862
/**
 * Truncates a string in UTF-8 short at a given byte length.
 *
 * The cut never splits a multi byte character: if the byte length ends within
 * a character, this character is dropped completely. A character that fits
 * entirely into the byte length is kept, also at the very beginning of the string.
 *
 * @param string $str UTF-8 multibyte character string
 * @param int $len The byte length; values of 0 or less give an empty string
 * @return string The shortened string
 * @see mb_strcut()
 */
public function utf8_strtrunc($str, $len)
{
    // Same guard as in strtrunc(); it also prevents reading a negative string offset
    if ($len <= 0) {
        return '';
    }
    $i = $len - 1;
    // isset(): if $len exceeds the string there is no byte to inspect and the whole string is returned.
    // Otherwise check if the last byte within the limit is part of a multibyte sequence.
    if (isset($str[$i]) && (ord($str[$i]) & 128)) {
        // Walk back to the starting byte (11xxxxxx) of the sequence. Position 0 has to be
        // examined as well, otherwise a leading multi byte character is never recognized.
        for (; $i >= 0 && !(ord($str[$i]) & 64); $i--) {
        }
        if ($i < 0) {
            // Only continuation bytes found, there is no complete character
            return '';
        }
        // Sanity check
        for ($bc = 0, $mbs = ord($str[$i]); $mbs & 128; $mbs = $mbs << 1) {
            // Calculate number of bytes of the sequence from the leading 1-bits of the starting byte
            $bc++;
        }
        if ($bc + $i > $len) {
            // The character does not fit into the byte length, cut in front of it
            return substr($str, 0, $i);
        }
    }
    return substr($str, 0, $len);
}
1892
/**
 * Determines the character position of the first occurrence of a string, both arguments are in UTF-8.
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 string to search for
 * @param int $offset Position to start the search
 * @return int The character position; FALSE if the needle is not found or the offset lies beyond the string
 * @see strpos()
 */
public function utf8_strpos($haystack, $needle, $offset = 0)
{
    $utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];
    if ($utils === 'mbstring') {
        return mb_strpos($haystack, $needle, $offset, 'utf-8');
    }
    if ($utils === 'iconv') {
        return iconv_strpos($haystack, $needle, $offset, 'utf-8');
    }
    // Search on byte level and translate the position afterwards
    $byte_offset = $this->utf8_char2byte_pos($haystack, $offset);
    if ($byte_offset === false) {
        // Offset beyond string length
        return false;
    }
    $byte_pos = strpos($haystack, $needle, $byte_offset);
    // FALSE means: needle not found
    return $byte_pos === false ? false : $this->utf8_byte2char_pos($haystack, $byte_pos);
}
1921
/**
 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
 *
 * Delegates to mbstring or iconv when configured in
 * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'], otherwise falls back
 * to a byte-based search whose result is translated to a character position.
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 character to search for (single character)
 * @return int|bool The character position, FALSE if the needle was not found
 * @see strrpos()
 */
public function utf8_strrpos($haystack, $needle)
{
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] === 'mbstring') {
        // The third argument of mb_strrpos() is the offset; the encoding comes fourth.
        // Passing the encoding in third position is rejected by current PHP versions.
        return mb_strrpos($haystack, $needle, 0, 'utf-8');
    } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] === 'iconv') {
        // iconv_strrpos() has no offset argument, the encoding is the third one
        return iconv_strrpos($haystack, $needle, 'utf-8');
    }
    $byte_pos = strrpos($haystack, $needle);
    if ($byte_pos === false) {
        // Needle not found
        return false;
    }
    return $this->utf8_byte2char_pos($haystack, $byte_pos);
}
1944
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * Positive positions are counted from the start of the string, negative ones
 * from its end (-1 addresses the last character). String offsets are never
 * allowed to become negative while scanning, so a negative position that
 * reaches beyond the start of the string yields FALSE instead of wrapping
 * around to the end of the string.
 *
 * @param string $str UTF-8 string
 * @param int $pos Character position (negative values start from the end)
 * @return int|bool Byte position, FALSE if the position lies outside the string
 */
public function utf8_char2byte_pos($str, $pos)
{
    // Number of characters found
    $n = 0;
    // Number of characters wanted
    $p = abs($pos);
    if ($pos >= 0) {
        // Count character starts (everything except 10xxxxxx continuation bytes)
        for ($i = 0; isset($str[$i]) && $n < $p; $i++) {
            if ((ord($str[$i]) & 192) !== 128) {
                $n++;
            }
        }
        if (!isset($str[$i])) {
            // Offset beyond string length
            return false;
        }
        // Skip trailing multi-byte data bytes, but never read past the end of the string.
        // If the string ends here, the returned offset equals the byte length.
        while (isset($str[$i]) && (ord($str[$i]) & 192) === 128) {
            $i++;
        }
        return $i;
    }
    // Negative position: walk backwards from the last byte
    for ($i = strlen($str) - 1; $i >= 0 && $n < $p; $i--) {
        if ((ord($str[$i]) & 192) !== 128) {
            $n++;
        }
    }
    if ($n < $p) {
        // Start of the string reached before enough characters were seen
        return false;
    }
    // The loop stepped one byte past the wanted character start, correct the offset
    return $i + 1;
}
1991
/**
 * Translates an 'absolute' byte position into a character position.
 *
 * The byte position is expected to point at the first byte of a character.
 * The range check is done on $pos itself before counting, so positions
 * outside the string are rejected instead of being counted as characters.
 *
 * @param string $str UTF-8 string
 * @param int $pos Byte position
 * @return int|bool Character position, FALSE if the byte position lies outside the string
 */
public function utf8_byte2char_pos($str, $pos)
{
    if ($pos < 0 || !isset($str[$pos])) {
        // Offset beyond string length
        return false;
    }
    // Number of characters
    $n = 0;
    // Count the character starts in the bytes $pos down to 1. Byte 0 is left out:
    // the start found at $pos itself stands in for it, which gives the number of
    // characters located in front of $pos.
    for ($i = $pos; $i > 0; $i--) {
        // Everything except a continuation byte (10xxxxxx) starts a character
        if ((ord($str[$i]) & 192) !== 128) {
            $n++;
        }
    }
    return $n;
}
2020
/**
 * Maps all characters of an UTF-8 string.
 *
 * Each character is looked up in the mapping table selected by $mode;
 * characters without an entry are copied unchanged. Bytes that do not start a
 * valid sequence (stray continuation bytes) are copied unchanged as well.
 *
 * @param string $str UTF-8 string
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string
 */
public function utf8_char_mapping($str, $mode, $opt = '')
{
    if (!$this->initUnicodeData($mode)) {
        // Do nothing
        return $str;
    }
    $out = '';
    switch ($mode) {
        case 'case':
            $map = &$this->caseFolding['utf-8'][$opt];
            break;
        case 'ascii':
            $map = &$this->toASCII['utf-8'];
            break;
        default:
            return $str;
    }
    for ($i = 0; isset($str[$i]); $i++) {
        $c = ord($str[$i]);
        if (($c & 192) === 192) {
            // Multi-byte starting byte (11xxxxxx): the number of leading 1-bits
            // is the length of the sequence in bytes
            for ($bc = 0; $c & 128; $c = $c << 1) {
                $bc++;
            }
            $mbc = substr($str, $i, $bc);
            $i += $bc - 1;
        } else {
            // Single-byte (0xxxxxxx) or a stray continuation byte (10xxxxxx).
            // Always take the current byte so a value left over from the
            // previous iteration is never emitted a second time.
            $mbc = $str[$i];
        }
        $out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
    }
    return $out;
}
2068
2069 /********************************************
2070 *
2071 * Internal EUC string operation functions
2072 *
2073 * Extended Unix Code:
2074 * ASCII compatible 7bit single bytes chars
2075 * 8bit two byte chars
2076 *
2077 * Shift-JIS is treated as a special case.
2078 *
2079 ********************************************/
/**
 * Cuts a string in the EUC charset family short at a given byte length
 * without splitting a double-byte character.
 *
 * A non-positive length yields an empty string.
 *
 * @param string $str EUC multibyte character string
 * @param int $len The byte length
 * @param string $charset The charset
 * @return string The shortened string
 * @see mb_strcut()
 */
public function euc_strtrunc($str, $len, $charset)
{
    if ($len <= 0) {
        return '';
    }
    $shiftJis = $charset === 'shift_jis';
    // Step through the string character by character until the limit is reached
    for ($i = 0; isset($str[$i]) && $i < $len; $i++) {
        $c = ord($str[$i]);
        if ($shiftJis) {
            if ($c >= 128 && $c < 160 || $c >= 224) {
                // Lead byte of a double-byte character
                $i++;
            }
        } else {
            if ($c >= 128) {
                // Lead byte of a double-byte character
                $i++;
            }
        }
    }
    // This has to be checked before the "string is short enough" test below:
    // if the straddling character is the last one of the string, $i points past
    // the end of the string as well.
    if ($i > $len) {
        // The last character started on byte $len - 1 and does not fit, drop it
        return substr($str, 0, $len - 1);
    }
    if (!isset($str[$i])) {
        // String is not longer than the supplied length
        return $str;
    }
    return substr($str, 0, $len);
}
2115
/**
 * Returns a part of a string in the EUC charset family.
 *
 * @param string $str EUC multibyte character string
 * @param int $start Start position (character position)
 * @param string $charset The charset
 * @param int $len Length (in characters), NULL for "up to the end of the string"
 * @return string|bool The substring, FALSE if $start lies outside the string
 */
public function euc_substr($str, $start, $charset, $len = null)
{
    $byte_start = $this->euc_char2byte_pos($str, $start, $charset);
    if ($byte_start === false) {
        // $start outside string length
        return false;
    }
    $str = substr($str, $byte_start);
    // Strict comparison: a length of 0 is a real length and must not be
    // treated like "no length given"
    if ($len === null) {
        return $str;
    }
    $byte_end = $this->euc_char2byte_pos($str, $len, $charset);
    if ($byte_end === false) {
        // $len outside actual string length: a negative length would cut away
        // more than there is, a positive one covers the whole rest
        return $len < 0 ? '' : $str;
    }
    return substr($str, 0, $byte_end);
}
2145
/**
 * Determines how many characters a string in the EUC charset family holds.
 *
 * A byte >= 128 is taken as the lead byte of a double-byte character; for
 * 'shift_jis' only the ranges 128-159 and 224-255 count as lead bytes.
 *
 * @param string $str EUC multibyte character string
 * @param string $charset The charset
 * @return int The number of characters
 * @see strlen()
 */
public function euc_strlen($str, $charset)
{
    $isShiftJis = $charset === 'shift_jis';
    $count = 0;
    $offset = 0;
    while (isset($str[$offset])) {
        $byte = ord($str[$offset]);
        if ($isShiftJis) {
            $isLeadByte = ($byte >= 128 && $byte < 160) || $byte >= 224;
        } else {
            $isLeadByte = $byte >= 128;
        }
        // A lead byte consumes its trail byte as well
        $offset += $isLeadByte ? 2 : 1;
        $count++;
    }
    return $count;
}
2173
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * Positive positions are counted from the start of the string, negative ones
 * from its end. String offsets are never allowed to become negative while
 * scanning, so a negative position that reaches beyond the start of the string
 * yields FALSE instead of wrapping around to the end of the string.
 *
 * @param string $str EUC multibyte character string
 * @param int $pos Character position (negative values start from the end)
 * @param string $charset The charset
 * @return int|bool Byte position, FALSE if the position lies outside the string
 */
public function euc_char2byte_pos($str, $pos, $charset)
{
    $sjis = $charset === 'shift_jis';
    // Number of characters seen
    $n = 0;
    // Number of characters wanted
    $p = abs($pos);
    if ($pos >= 0) {
        $i = 0;
        $d = 1;
    } else {
        $i = strlen($str) - 1;
        $d = -1;
    }
    // "$i >= 0" keeps the backward scan from using negative string offsets
    for (; $i >= 0 && isset($str[$i]) && $n < $p; $i += $d) {
        $c = ord($str[$i]);
        if ($sjis) {
            if ($c >= 128 && $c < 160 || $c >= 224) {
                // Double-byte character, step over its second byte
                $i += $d;
            }
        } else {
            if ($c >= 128) {
                // Double-byte character, step over its second byte
                $i += $d;
            }
        }
        $n++;
    }
    if ($pos >= 0) {
        // Offset beyond string length yields FALSE
        return isset($str[$i]) ? $i : false;
    }
    if ($n < $p || $i < -1) {
        // Start of the string reached before enough characters were seen,
        // or a double-byte step ran past the first byte
        return false;
    }
    // The loop stepped one byte past the wanted character start, correct the offset
    return $i + 1;
}
2219
/**
 * Runs every character of a string in the EUC charset family through a
 * mapping table.
 *
 * The table is selected by $mode; when it cannot be initialised or the mode
 * is unknown, the input is returned as it is. Characters without an entry in
 * the table are copied unchanged.
 *
 * @param string $str EUC multibyte character string
 * @param string $charset The charset
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string
 */
public function euc_char_mapping($str, $charset, $mode, $opt = '')
{
    switch ($mode) {
        case 'case':
            if (!$this->initCaseFolding($charset)) {
                // No case folding data available, leave the string alone
                return $str;
            }
            $table = &$this->caseFolding[$charset][$opt];
            break;
        case 'ascii':
            if (!$this->initToASCII($charset)) {
                // No transliteration data available, leave the string alone
                return $str;
            }
            $table = &$this->toASCII[$charset];
            break;
        default:
            return $str;
    }
    $isShiftJis = $charset === 'shift_jis';
    $result = '';
    $offset = 0;
    while (isset($str[$offset])) {
        $char = $str[$offset];
        $byte = ord($char);
        if ($isShiftJis) {
            $isLeadByte = ($byte >= 128 && $byte < 160) || $byte >= 224;
        } else {
            $isLeadByte = $byte >= 128;
        }
        if ($isLeadByte) {
            // A double-byte char: take the trail byte along
            $char = substr($str, $offset, 2);
            $offset++;
        }
        $result .= isset($table[$char]) ? $table[$char] : $char;
        $offset++;
    }
    return $result;
}
2275 }