[TASK] Cleanup of TYPO3\CMS\Core\Localization and Charset
[Packages/TYPO3.CMS.git] / typo3 / sysext / core / Classes / Charset / CharsetConverter.php
1 <?php
2 namespace TYPO3\CMS\Core\Charset;
3
4 /*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use TYPO3\CMS\Core\Localization\Locales;
18 use TYPO3\CMS\Core\Utility\ExtensionManagementUtility;
19 use TYPO3\CMS\Core\Utility\GeneralUtility;
20
21 /**
22 * Notes on UTF-8
23 *
24 * Functions working on UTF-8 strings:
25 *
26 * - strchr/strstr
27 * - strrchr
28 * - substr_count
29 * - implode/explode/join
30 *
31 * Functions nearly working on UTF-8 strings:
32 *
33 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
34 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
35 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
36 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
 * - preg_*: UTF-8 support is compiled into PCRE by default nowadays but could be unavailable; patterns need the "u" modifier
38 *
39 * Functions NOT working on UTF-8 strings:
40 *
41 * - str*cmp
42 * - stristr
43 * - stripos
44 * - substr
45 * - strrev
46 * - split/spliti
47 * - ...
48 */
49
50 /**
51 * Class for conversion between charsets
52 *
53 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
54 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
55 */
56 class CharsetConverter {
57
/**
 * Locales instance; assigned in the constructor through
 * GeneralUtility::makeInstance().
 *
 * @var \TYPO3\CMS\Core\Localization\Locales
 */
protected $locales;
62
/**
 * Byte value emitted by the conversion methods for a character that has no
 * equivalent in the conversion table or for an invalid byte sequence.
 * The default 63 is the ASCII code of the question mark ("?").
 *
 * @var int
 */
public $noCharByteVal = 63;
69
/**
 * Cache of parsed conversion tables, keyed by charset name.
 *
 * The conversion methods read two sub-arrays per charset:
 * - 'local': local character number => UTF-8 byte sequence (used by utf8_encode())
 * - 'utf8': UTF-8 byte sequence => local character number (used by utf8_decode())
 *
 * @var array
 */
public $parsedCharsets = array();
76
/**
 * Cache for case folding data.
 *
 * @var array
 */
public $caseFolding = array();
83
/**
 * Cache for charset-to-ASCII mappings.
 *
 * @var array
 */
public $toASCII = array();
90
/**
 * Charsets that use two bytes per character.
 * Keys are charset names; utf8_encode() only tests for key existence and
 * reads such input as big-endian 16-bit units.
 *
 * @var array
 */
public $twoByteSets = array(
    'ucs-2' => 1
);
99
/**
 * Charsets that use four bytes per character.
 * Keys are charset names.
 *
 * @var array
 */
public $fourByteSets = array(
    'ucs-4' => 1, // 4-byte Unicode
    'utf-32' => 1
);
109
/**
 * Charsets that use a scheme like the Extended Unix Code.
 * Keys are charset names. For these charsets utf8_encode() treats a byte
 * above 127 as the first byte of a two-byte character (for shift_jis the
 * bytes 160 to 223 stay single-byte).
 *
 * @var array
 */
public $eucBasedSets = array(
    'gb2312' => 1, // Chinese, simplified.
    'big5' => 1, // Chinese, traditional.
    'euc-kr' => 1, // Korean
    'shift_jis' => 1
);
121
/**
 * Alternative charset names mapped to the canonical names used by this class.
 * Keys are lowercase; parse_charset() looks an alias up here after
 * lowercasing and trimming its input.
 *
 * @link http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
 * @link http://czyborra.com/charsets/iso8859.html
 *
 * @var array
 */
public $synonyms = array(
    'us' => 'ascii',
    'us-ascii' => 'ascii',
    'cp819' => 'iso-8859-1',
    'ibm819' => 'iso-8859-1',
    'iso-ir-100' => 'iso-8859-1',
    'iso-ir-101' => 'iso-8859-2',
    'iso-ir-109' => 'iso-8859-3',
    'iso-ir-110' => 'iso-8859-4',
    'iso-ir-144' => 'iso-8859-5',
    'iso-ir-127' => 'iso-8859-6',
    'iso-ir-126' => 'iso-8859-7',
    'iso-ir-138' => 'iso-8859-8',
    'iso-ir-148' => 'iso-8859-9',
    'iso-ir-157' => 'iso-8859-10',
    'iso-ir-179' => 'iso-8859-13',
    'iso-ir-199' => 'iso-8859-14',
    'iso-ir-203' => 'iso-8859-15',
    'csisolatin1' => 'iso-8859-1',
    'csisolatin2' => 'iso-8859-2',
    'csisolatin3' => 'iso-8859-3',
    'csisolatin5' => 'iso-8859-9',
    'csisolatin8' => 'iso-8859-14',
    'csisolatin9' => 'iso-8859-15',
    'csisolatingreek' => 'iso-8859-7',
    'iso-celtic' => 'iso-8859-14',
    'latin1' => 'iso-8859-1',
    'latin2' => 'iso-8859-2',
    'latin3' => 'iso-8859-3',
    'latin5' => 'iso-8859-9',
    'latin6' => 'iso-8859-10',
    'latin8' => 'iso-8859-14',
    'latin9' => 'iso-8859-15',
    'l1' => 'iso-8859-1',
    'l2' => 'iso-8859-2',
    'l3' => 'iso-8859-3',
    'l5' => 'iso-8859-9',
    'l6' => 'iso-8859-10',
    'l8' => 'iso-8859-14',
    'l9' => 'iso-8859-15',
    'cyrillic' => 'iso-8859-5',
    'arabic' => 'iso-8859-6',
    'tis-620' => 'iso-8859-11',
    'win874' => 'windows-874',
    'win1250' => 'windows-1250',
    'win1251' => 'windows-1251',
    'win1252' => 'windows-1252',
    'win1253' => 'windows-1253',
    'win1254' => 'windows-1254',
    'win1255' => 'windows-1255',
    'win1256' => 'windows-1256',
    'win1257' => 'windows-1257',
    'win1258' => 'windows-1258',
    'cp1250' => 'windows-1250',
    'cp1251' => 'windows-1251',
    'cp1252' => 'windows-1252',
    'ms-ee' => 'windows-1250',
    'ms-ansi' => 'windows-1252',
    'ms-greek' => 'windows-1253',
    'ms-turk' => 'windows-1254',
    'winbaltrim' => 'windows-1257',
    'koi-8ru' => 'koi-8r',
    'koi8r' => 'koi-8r',
    'cp878' => 'koi-8r',
    'mac' => 'macroman',
    'macintosh' => 'macroman',
    'euc-cn' => 'gb2312',
    'x-euc-cn' => 'gb2312',
    'euccn' => 'gb2312',
    'cp936' => 'gb2312',
    'big-5' => 'big5',
    'cp950' => 'big5',
    'eucjp' => 'euc-jp',
    'sjis' => 'shift_jis',
    'shift-jis' => 'shift_jis',
    'cp932' => 'shift_jis',
    'cp949' => 'euc-kr',
    'utf7' => 'utf-7',
    'utf8' => 'utf-8',
    'utf16' => 'utf-16',
    'utf32' => 'utf-32',
    'ucs2' => 'ucs-2',
    'ucs4' => 'ucs-4'
);
212
/**
 * Mapping of language identifiers to script names.
 *
 * Three kinds of keys are supported: iso-639-1 language codes, MS language
 * codes and English language names. get_locale_charset() looks up the
 * (lowercased) language part of a locale here and feeds the resulting
 * script name into the script-to-charset tables.
 *
 * @var array
 */
public $lang_to_script = array(
    // iso-639-1 language codes, see http://www.loc.gov/standards/iso639-2/php/code_list.php
    'af' => 'west_european', // Afrikaans
    'ar' => 'arabic',
    'bg' => 'cyrillic', // Bulgarian
    'bs' => 'east_european', // Bosnian
    'cs' => 'east_european', // Czech
    'da' => 'west_european', // Danish
    'de' => 'west_european', // German
    'es' => 'west_european', // Spanish
    'et' => 'estonian',
    'eo' => 'unicode', // Esperanto
    'eu' => 'west_european', // Basque
    'fa' => 'arabic', // Persian
    'fi' => 'west_european', // Finnish
    'fo' => 'west_european', // Faroese
    'fr' => 'west_european', // French
    'ga' => 'west_european', // Irish
    'gl' => 'west_european', // Galician
    'gr' => 'greek',
    'he' => 'hebrew', // Hebrew (since 1998)
    'hi' => 'unicode', // Hindi
    'hr' => 'east_european', // Croatian
    'hu' => 'east_european', // Hungarian
    'iw' => 'hebrew', // Hebrew (til 1998)
    'is' => 'west_european', // Icelandic
    'it' => 'west_european', // Italian
    'ja' => 'japanese',
    'ka' => 'unicode', // Georgian
    'kl' => 'west_european', // Greenlandic
    'km' => 'unicode', // Khmer
    'ko' => 'korean',
    'lt' => 'lithuanian',
    'lv' => 'west_european', // Latvian/Lettish
    'nl' => 'west_european', // Dutch
    'no' => 'west_european', // Norwegian
    'nb' => 'west_european', // Norwegian Bokmal
    'nn' => 'west_european', // Norwegian Nynorsk
    'pl' => 'east_european', // Polish
    'pt' => 'west_european', // Portuguese
    'ro' => 'east_european', // Romanian
    'ru' => 'cyrillic', // Russian
    'sk' => 'east_european', // Slovak
    'sl' => 'east_european', // Slovenian
    'sr' => 'cyrillic', // Serbian
    'sv' => 'west_european', // Swedish
    'sq' => 'albanian', // Albanian
    'th' => 'thai',
    'uk' => 'cyrillic', // Ukrainian
    'vi' => 'vietnamese',
    'zh' => 'chinese',

    // MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
    // http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
    'afk' => 'west_european', // Afrikaans
    'ara' => 'arabic',
    'bgr' => 'cyrillic', // Bulgarian
    'cat' => 'west_european', // Catalan
    'chs' => 'simpl_chinese',
    'cht' => 'trad_chinese',
    'csy' => 'east_european', // Czech
    'dan' => 'west_european', // Danish
    'deu' => 'west_european', // German
    'dea' => 'west_european', // German (Austrian)
    'des' => 'west_european', // German (Swiss)
    'ena' => 'west_european', // English (Australian)
    'enc' => 'west_european', // English (Canadian)
    'eng' => 'west_european', // English
    'enz' => 'west_european', // English (New Zealand)
    'enu' => 'west_european', // English (United States)
    'euq' => 'west_european', // Basque
    'fos' => 'west_european', // Faroese
    'far' => 'arabic', // Persian
    'fin' => 'west_european', // Finnish
    'fra' => 'west_european', // French
    'frb' => 'west_european', // French (Belgian)
    'frc' => 'west_european', // French (Canadian)
    'frs' => 'west_european', // French (Swiss)
    'geo' => 'unicode', // Georgian
    'glg' => 'west_european', // Galician
    'ell' => 'greek',
    'heb' => 'hebrew',
    'hin' => 'unicode', // Hindi
    'hun' => 'east_european', // Hungarian
    'isl' => 'west_european', // Icelandic
    'ita' => 'west_european', // Italian
    'its' => 'west_european', // Italian (Swiss)
    'jpn' => 'japanese',
    'khm' => 'unicode', // Khmer
    'kor' => 'korean',
    'lth' => 'lithuanian',
    'lvi' => 'west_european', // Latvian/Lettish
    'msl' => 'west_european', // Malay
    'nlb' => 'west_european', // Dutch (Belgian)
    'nld' => 'west_european', // Dutch
    'nor' => 'west_european', // Norwegian (bokmal)
    'non' => 'west_european', // Norwegian (nynorsk)
    'plk' => 'east_european', // Polish
    'ptg' => 'west_european', // Portuguese
    'ptb' => 'west_european', // Portuguese (Brazil)
    'rom' => 'east_european', // Romanian
    'rus' => 'cyrillic', // Russian
    'slv' => 'east_european', // Slovenian
    'sky' => 'east_european', // Slovak
    'srl' => 'east_european', // Serbian (Latin)
    'srb' => 'cyrillic', // Serbian (Cyrillic)
    'esp' => 'west_european', // Spanish (trad. sort)
    'esm' => 'west_european', // Spanish (Mexican)
    'esn' => 'west_european', // Spanish (internat. sort)
    'sve' => 'west_european', // Swedish
    'sqi' => 'albanian', // Albanian
    'tha' => 'thai',
    'trk' => 'turkish',
    'ukr' => 'cyrillic', // Ukrainian

    // English language names
    'afrikaans' => 'west_european',
    'albanian' => 'albanian',
    'arabic' => 'arabic',
    'basque' => 'west_european',
    'bosnian' => 'east_european',
    // Bulgarian is written in Cyrillic, consistent with 'bg' and 'bgr' above
    'bulgarian' => 'cyrillic',
    'catalan' => 'west_european',
    'croatian' => 'east_european',
    'czech' => 'east_european',
    'danish' => 'west_european',
    'dutch' => 'west_european',
    'english' => 'west_european',
    'esperanto' => 'unicode',
    'estonian' => 'estonian',
    'faroese' => 'west_european',
    'farsi' => 'arabic',
    'finnish' => 'west_european',
    'french' => 'west_european',
    'galician' => 'west_european',
    'georgian' => 'unicode',
    'german' => 'west_european',
    'greek' => 'greek',
    'greenlandic' => 'west_european',
    'hebrew' => 'hebrew',
    'hindi' => 'unicode',
    'hungarian' => 'east_european',
    'icelandic' => 'west_european',
    'italian' => 'west_european',
    'khmer' => 'unicode',
    'latvian' => 'west_european',
    'lettish' => 'west_european',
    'lithuanian' => 'lithuanian',
    'malay' => 'west_european',
    'norwegian' => 'west_european',
    'persian' => 'arabic',
    'polish' => 'east_european',
    'portuguese' => 'west_european',
    'russian' => 'cyrillic',
    'romanian' => 'east_european',
    'serbian' => 'cyrillic',
    'slovak' => 'east_european',
    'slovenian' => 'east_european',
    'spanish' => 'west_european',
    'swedish' => 'west_european',
    'thai' => 'thai',
    'turkish' => 'turkish',
    'ukrainian' => 'cyrillic',
    // Misspelled keys of earlier versions ('swedish' / 'thai'), kept for backwards compatibility
    'svedish' => 'west_european',
    'that' => 'thai'
);
382
/**
 * Mapping of language (family) names to charsets on Unix.
 *
 * Used by get_locale_charset() when TYPO3_OS is not 'WIN'; a script that is
 * missing here or mapped to an empty string resolves to 'utf-8' there.
 *
 * @var array
 */
public $script_to_charset_unix = array(
    'west_european' => 'iso-8859-1',
    'estonian' => 'iso-8859-1',
    'east_european' => 'iso-8859-2',
    'baltic' => 'iso-8859-4',
    'cyrillic' => 'iso-8859-5',
    'arabic' => 'iso-8859-6',
    'greek' => 'iso-8859-7',
    'hebrew' => 'iso-8859-8',
    'turkish' => 'iso-8859-9',
    'thai' => 'iso-8859-11', // = TIS-620
    'lithuanian' => 'iso-8859-13',
    'chinese' => 'gb2312', // = euc-cn
    'japanese' => 'euc-jp',
    'korean' => 'euc-kr',
    'simpl_chinese' => 'gb2312',
    'trad_chinese' => 'big5',
    'vietnamese' => '',
    'unicode' => 'utf-8',
    'albanian' => 'utf-8'
);
409
/**
 * Mapping of language (family) names to charsets on Windows.
 *
 * Used by get_locale_charset() when TYPO3_OS is 'WIN'; a script that is
 * missing here resolves to 'windows-1252' there.
 *
 * @var array
 */
public $script_to_charset_windows = array(
    'east_european' => 'windows-1250',
    'cyrillic' => 'windows-1251',
    'west_european' => 'windows-1252',
    'greek' => 'windows-1253',
    'turkish' => 'windows-1254',
    'hebrew' => 'windows-1255',
    'arabic' => 'windows-1256',
    'baltic' => 'windows-1257',
    'estonian' => 'windows-1257',
    'lithuanian' => 'windows-1257',
    'vietnamese' => 'windows-1258',
    'thai' => 'cp874',
    'korean' => 'cp949',
    'chinese' => 'gb2312',
    'japanese' => 'shift_jis',
    'simpl_chinese' => 'gb2312',
    'trad_chinese' => 'big5',
    'albanian' => 'windows-1250',
    'unicode' => 'utf-8'
);
436
/**
 * Mapping of complete locale names to charsets.
 *
 * get_locale_charset() lowercases the locale before looking it up here, so
 * only lowercase keys can ever match.
 *
 * @var array
 */
public $locale_to_charset = array(
    'japanese.euc' => 'euc-jp',
    'ja_jp.ujis' => 'euc-jp',
    'korean.euc' => 'euc-kr',
    // Lowercase key that is actually reachable through get_locale_charset()
    'sr@latn' => 'iso-8859-2',
    // Original mixed-case key, kept for code that reads this array directly
    'sr@Latn' => 'iso-8859-2',
    'zh_cn' => 'gb2312',
    'zh_hk' => 'big5',
    'zh_tw' => 'big5'
);
451
/**
 * TYPO3 specific: the system charset used for each system language in TYPO3,
 * keyed by language key.
 * An empty value means "utf-8".
 *
 * @var array
 */
public $charSetArray = array(
    'af' => '',
    'ar' => 'iso-8859-6',
    'ba' => 'iso-8859-2',
    'bg' => 'windows-1251',
    'br' => '',
    'ca' => 'iso-8859-15',
    'ch' => 'gb2312',
    'cs' => 'windows-1250',
    'cz' => 'windows-1250',
    'da' => '',
    'de' => '',
    'dk' => '',
    'el' => 'iso-8859-7',
    'eo' => 'utf-8',
    'es' => '',
    'et' => 'iso-8859-4',
    'eu' => '',
    'fa' => 'utf-8',
    'fi' => '',
    'fo' => 'utf-8',
    'fr' => '',
    'fr_CA' => '',
    'ga' => '',
    'ge' => 'utf-8',
    'gl' => '',
    'gr' => 'iso-8859-7',
    'he' => 'utf-8',
    'hi' => 'utf-8',
    'hk' => 'big5',
    'hr' => 'windows-1250',
    'hu' => 'iso-8859-2',
    'is' => 'utf-8',
    'it' => '',
    'ja' => 'shift_jis',
    'jp' => 'shift_jis',
    'ka' => 'utf-8',
    'kl' => 'utf-8',
    'km' => 'utf-8',
    'ko' => 'euc-kr',
    'kr' => 'euc-kr',
    'lt' => 'windows-1257',
    'lv' => 'utf-8',
    'ms' => '',
    'my' => '',
    'nl' => '',
    'no' => '',
    'pl' => 'iso-8859-2',
    'pt' => '',
    'pt_BR' => '',
    'qc' => '',
    'ro' => 'iso-8859-2',
    'ru' => 'windows-1251',
    'se' => '',
    'si' => 'windows-1250',
    'sk' => 'windows-1250',
    'sl' => 'windows-1250',
    'sq' => 'utf-8',
    'sr' => 'utf-8',
    'sv' => '',
    'th' => 'iso-8859-11',
    'tr' => 'iso-8859-9',
    'ua' => 'windows-1251',
    'uk' => 'windows-1251',
    'vi' => 'utf-8',
    'vn' => 'utf-8',
    'zh' => 'big5'
);
526
/**
 * Fetches the Locales instance through GeneralUtility::makeInstance() and
 * stores it in $this->locales.
 */
public function __construct() {
    $localesInstance = GeneralUtility::makeInstance(Locales::class);
    $this->locales = $localesInstance;
}
533
/**
 * Normalizes a charset name: the input is lowercased and trimmed, and a
 * known alias is replaced by the canonical name from $this->synonyms.
 *
 * @param string $charset Input charset
 * @return string Normalized charset
 */
public function parse_charset($charset) {
    $normalized = trim(strtolower($charset));
    return isset($this->synonyms[$normalized])
        ? $this->synonyms[$normalized]
        : $normalized;
}
547
/**
 * Get the charset of a locale.
 *
 * Supported locale formats:
 *
 * ln            language
 * ln_CN         language / country
 * ln_CN.cs      language / country / charset
 * ln_CN.cs@mod  language / country / charset / modifier
 *
 * Resolution order: exact match in $this->locale_to_charset, explicit charset
 * part, the "euro" modifier, and finally the script of the language mapped to
 * the charset table of the current operating system (TYPO3_OS).
 *
 * @param string $locale Locale string
 * @return string Charset resolved for locale string
 */
public function get_locale_charset($locale) {
    $locale = strtolower($locale);
    // Exact locale specific charset?
    if (isset($this->locale_to_charset[$locale])) {
        return $this->locale_to_charset[$locale];
    }
    // Get modifier; array_pad() guarantees both elements exist when the locale has no "@"
    list($locale, $modifier) = array_pad(explode('@', $locale), 2, '');
    // Locale contains charset: use it (padded for locales without ".")
    list($locale, $charset) = array_pad(explode('.', $locale), 2, '');
    if ($charset) {
        return $this->parse_charset($charset);
    }
    // Modifier is 'euro' (after charset check, because of xx.utf-8@euro)
    if ($modifier === 'euro') {
        return 'iso-8859-15';
    }
    // Get language; an unknown language yields an empty script name, which matches no charset entry
    list($language) = explode('_', $locale);
    $script = isset($this->lang_to_script[$language]) ? $this->lang_to_script[$language] : '';
    if (TYPO3_OS === 'WIN') {
        $charsetsByScript = $this->script_to_charset_windows;
        $fallbackCharset = 'windows-1252';
    } else {
        $charsetsByScript = $this->script_to_charset_unix;
        $fallbackCharset = 'utf-8';
    }
    // Missing and empty entries (e.g. 'vietnamese' on Unix) both resolve to the fallback
    return !empty($charsetsByScript[$script]) ? $charsetsByScript[$script] : $fallbackCharset;
}
588
589 /********************************************
590 *
591 * Charset Conversion functions
592 *
593 ********************************************/
/**
 * Convert a string from one charset to another charset.
 *
 * Identical charsets return the input untouched. Otherwise the PHP library
 * selected in $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']
 * (mbstring, iconv or recode) is tried first, but only if the target is
 * UTF-8 or no entity fallback was requested. If no library is configured or
 * it returns FALSE, the string is converted through the internal tables
 * with UTF-8 as intermediate charset.
 *
 * @param string $inputString Input string
 * @param string $fromCharset From charset (the current charset of the string)
 * @param string $toCharset To charset (the output charset wanted)
 * @param bool $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Converted string
 * @see convArray()
 */
public function conv($inputString, $fromCharset, $toCharset, $useEntityForNoChar = FALSE) {
    if ($fromCharset === $toCharset) {
        return $inputString;
    }
    // PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
    $libraryConversionAllowed = $toCharset === 'utf-8' || !$useEntityForNoChar;
    if ($libraryConversionAllowed) {
        // Stays FALSE if no (known) library is configured
        $libraryResult = FALSE;
        switch ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']) {
            case 'mbstring':
                // Returns FALSE for unsupported charsets
                $libraryResult = mb_convert_encoding($inputString, $toCharset, $fromCharset);
                break;
            case 'iconv':
                $libraryResult = iconv($fromCharset, $toCharset . '//TRANSLIT', $inputString);
                break;
            case 'recode':
                $libraryResult = recode_string($fromCharset . '..' . $toCharset, $inputString);
                break;
        }
        if ($libraryResult !== FALSE) {
            return $libraryResult;
        }
    }
    // Table based fallback: source charset => UTF-8 => target charset
    $utf8String = $fromCharset === 'utf-8'
        ? $inputString
        : $this->utf8_encode($inputString, $fromCharset);
    return $toCharset === 'utf-8'
        ? $utf8String
        : $this->utf8_decode($utf8String, $toCharset, $useEntityForNoChar);
}
640
/**
 * Converts every string found in an array from one charset to another.
 * Nested arrays are processed recursively; values of any other type are
 * left alone.
 * NOTICE: The array is passed by reference and modified in place!
 *
 * @param array $array Input array, possibly multidimensional
 * @param string $fromCharset From charset (the current charset of the string)
 * @param string $toCharset To charset (the output charset wanted)
 * @param bool $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return void
 * @see conv()
 */
public function convArray(&$array, $fromCharset, $toCharset, $useEntityForNoChar = FALSE) {
    foreach ($array as $key => $value) {
        if (is_array($value)) {
            // Recurse on the element itself so the changes end up in $array
            $this->convArray($array[$key], $fromCharset, $toCharset, $useEntityForNoChar);
            continue;
        }
        if (is_string($value)) {
            $array[$key] = $this->conv($value, $fromCharset, $toCharset, $useEntityForNoChar);
        }
    }
}
661
/**
 * Converts $str from $charset to UTF-8 using the parsed conversion table of
 * the charset.
 *
 * Characters without an entry in the conversion table are replaced by the
 * byte $this->noCharByteVal.
 *
 * @param string $str String in local charset to convert to UTF-8
 * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return string|NULL Output string, converted to UTF-8. NULL if the charset could not be initialized.
 */
public function utf8_encode($str, $charset) {
    if ($charset === 'utf-8') {
        return $str;
    }
    // Parse conv. table if not already done; without a table nothing can be converted
    if (!$this->initCharset($charset)) {
        return NULL;
    }
    // Loop invariants
    $isTwoByteSet = isset($this->twoByteSets[$charset]);
    $isEucBasedSet = isset($this->eucBasedSets[$charset]);
    $strLen = strlen($str);
    $outStr = '';
    // Traverse each char in string
    for ($a = 0; $a < $strLen; $a++) {
        $chr = substr($str, $a, 1);
        $ord = ord($chr);
        if ($isTwoByteSet) {
            // The charset has two bytes per char, big endian assumed.
            // substr() is used instead of $str[$a] so that the dangling last byte of an
            // odd-length string is paired with 0 without reading an invalid string offset.
            $a++;
            $ord = $ord << 8 | ord(substr($str, $a, 1));
        } elseif ($ord > 127) {
            // EUC uses two bytes above 127; we get both, advance the pointer and make $ord a 16bit int.
            // Shift-JIS: chars between 160 and 223 are single byte
            if ($isEucBasedSet && ($charset !== 'shift_jis' || ($ord < 160 || $ord > 223))) {
                $a++;
                $ord = $ord * 256 + ord(substr($str, $a, 1));
            }
        } else {
            // 7-bit ASCII is passed through unchanged
            $outStr .= $chr;
            continue;
        }
        // If the local char-number was found in the parsed conv. table then we use that,
        // otherwise the "no char" byte
        if (isset($this->parsedCharsets[$charset]['local'][$ord])) {
            $outStr .= $this->parsedCharsets[$charset]['local'][$ord];
        } else {
            $outStr .= chr($this->noCharByteVal);
        }
    }
    return $outStr;
}
719
/**
 * Converts $str from UTF-8 to $charset using the parsed conversion table of
 * the charset.
 *
 * UTF-8 sequences without an entry in the table become a numeric entity
 * (if $useEntityForNoChar is set) or the byte $this->noCharByteVal. Bytes
 * above 127 that cannot start a sequence also become $this->noCharByteVal.
 * Nothing is returned (NULL) if the charset could not be initialized.
 *
 * @param string $str String in UTF-8 to convert to local charset
 * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param bool $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Output string, converted to local charset
 */
public function utf8_decode($str, $charset, $useEntityForNoChar = FALSE) {
    if ($charset === 'utf-8') {
        return $str;
    }
    // Parse conv. table if not already done
    if (!$this->initCharset($charset)) {
        return NULL;
    }
    $length = strlen($str);
    $result = '';
    // Traverse the UTF-8 string byte by byte
    for ($pos = 0; $pos < $length; $pos++) {
        $byte = substr($str, $pos, 1);
        $code = ord($byte);
        if ($code <= 127) {
            // Plain ASCII
            $result .= $byte;
            continue;
        }
        if (!($code & 64)) {
            // A lead byte must have the bit with value 64 set; otherwise we are in the middle of a sequence
            $result .= chr($this->noCharByteVal);
            continue;
        }
        // Every further leading 1-bit of the first byte announces one more byte of the sequence
        $sequence = $byte;
        for ($shift = 1; $shift <= 8 && (($code << $shift) & 128); $shift++) {
            $pos++;
            $sequence .= substr($str, $pos, 1);
        }
        if (isset($this->parsedCharsets[$charset]['utf8'][$sequence])) {
            $localNumber = $this->parsedCharsets[$charset]['utf8'][$sequence];
            // Numbers above 255 are written as two bytes (16bit word assumed), high byte first
            $result .= $localNumber > 255
                ? chr(($localNumber >> 8) & 255) . chr($localNumber & 255)
                : chr($localNumber);
        } elseif ($useEntityForNoChar) {
            // Create num entity
            $result .= '&#' . $this->utf8CharToUnumber($sequence, TRUE) . ';';
        } else {
            $result .= chr($this->noCharByteVal);
        }
    }
    return $result;
}
786
/**
 * Replaces every multibyte UTF-8 sequence by a hexadecimal numeric entity.
 * ASCII bytes are kept; a byte above 127 that cannot start a sequence is
 * replaced by the byte $this->noCharByteVal.
 *
 * @param string $str Input string
 * @return string Output string
 */
public function utf8_to_entities($str) {
    $length = strlen($str);
    $result = '';
    $pos = 0;
    while ($pos < $length) {
        $byte = substr($str, $pos, 1);
        $code = ord($byte);
        if ($code <= 127) {
            // Plain ASCII
            $result .= $byte;
        } elseif (($code & 64) === 0) {
            // Not a valid lead byte: we are in the middle of a byte sequence
            $result .= chr($this->noCharByteVal);
        } else {
            // Every further leading 1-bit of the first byte announces one more byte of the sequence
            $sequence = $byte;
            for ($shift = 1; $shift <= 8 && (($code << $shift) & 128); $shift++) {
                $pos++;
                $sequence .= substr($str, $pos, 1);
            }
            $result .= '&#' . $this->utf8CharToUnumber($sequence, TRUE) . ';';
        }
        $pos++;
    }
    return $result;
}
829
/**
 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars.
 *
 * The hexadecimal marker is accepted in both cases (&#x1b; and &#X1B;).
 * Anything that looks like an entity but is neither a well-formed numeric
 * entity nor (with $alsoStdHtmlEnt) a known named entity is left untouched.
 *
 * @param string $str Input string, UTF-8
 * @param bool $alsoStdHtmlEnt If set, then all string-HTML entities (like &amp; or &pound;) will be converted as well
 * @return string Output string
 */
public function entities_to_utf8($str, $alsoStdHtmlEnt = FALSE) {
    // Entity => character table, only needed if named entities are to be converted
    $namedEntities = $alsoStdHtmlEnt
        ? array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8'))
        : array();
    $converted = preg_replace_callback(
        '/&([#[:alnum:]]*);/',
        function (array $match) use ($namedEntities) {
            // Dec or hex entities; malformed ones like "&#;" or "&#x;" do not match and stay as they are
            if (preg_match('/^#(?:x([0-9a-f]+)|([0-9]+))$/i', $match[1], $numeric)) {
                // Group 2 is only present for the decimal form; "+ 0" turns the digits into a number
                $codePoint = isset($numeric[2]) ? $numeric[2] + 0 : hexdec($numeric[1]);
                return $this->UnumberToChar($codePoint);
            }
            // Other entities
            if (isset($namedEntities[$match[0]])) {
                return $namedEntities[$match[0]];
            }
            // No conversion
            return $match[0];
        },
        $str
    );
    // preg_replace_callback() yields NULL on a PCRE error; always hand back a string
    return (string)$converted;
}
869
/**
 * Splits a UTF-8 string into its characters and returns them as an array,
 * either as UNICODE integer numbers or as the UTF-8 characters themselves.
 * A byte above 127 that cannot start a sequence is reported as
 * $this->noCharByteVal.
 *
 * @param string $str Input string, UTF-8
 * @param bool $convEntities If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
 * @param bool $retChar If set, then instead of integer numbers the real UTF-8 char is returned.
 * @return array Output array with the char numbers
 */
public function utf8_to_numberarray($str, $convEntities = FALSE, $retChar = FALSE) {
    // If entities must be registered as well, resolve them first
    if ($convEntities) {
        $str = $this->entities_to_utf8($str, TRUE);
    }
    $length = strlen($str);
    $characters = array();
    $pos = 0;
    while ($pos < $length) {
        $byte = substr($str, $pos, 1);
        $code = ord($byte);
        if ($code <= 127) {
            // Plain ASCII
            $characters[] = $retChar ? $byte : $code;
        } elseif (($code & 64) === 0) {
            // Not a valid lead byte: we are in the middle of a byte sequence
            $characters[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
        } else {
            // Every further leading 1-bit of the first byte announces one more byte of the sequence
            $sequence = $byte;
            for ($shift = 1; $shift <= 8 && (($code << $shift) & 128); $shift++) {
                $pos++;
                $sequence .= substr($str, $pos, 1);
            }
            $characters[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
        }
        $pos++;
    }
    return $characters;
}
919
/**
 * Converts a UNICODE number to a UTF-8 multibyte character.
 * Algorithm based on script found at http://czyborra.com/utf/
 *
 * The bits of the number are spread across the bytes; the number of high
 * bits set in the lead byte announces the length of the sequence:
 *
 * bytes | bits | representation
 * 1 | 7 | 0vvvvvvv
 * 2 | 11 | 110vvvvv 10vvvvvv
 * 3 | 16 | 1110vvvv 10vvvvvv 10vvvvvv
 * 4 | 21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 * 5 | 26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 * 6 | 31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * Numbers that need more than 31 bits yield the byte $this->noCharByteVal.
 *
 * @param int $unicodeInteger UNICODE integer
 * @return string UTF-8 multibyte character string
 * @see utf8CharToUnumber()
 */
public function UnumberToChar($unicodeInteger) {
    // Pick the lead byte marker and the number of continuation bytes
    if ($unicodeInteger < 128) {
        return chr($unicodeInteger);
    } elseif ($unicodeInteger < 2048) {
        $leadMarker = 192;
        $continuationBytes = 1;
    } elseif ($unicodeInteger < 65536) {
        $leadMarker = 224;
        $continuationBytes = 2;
    } elseif ($unicodeInteger < 2097152) {
        $leadMarker = 240;
        $continuationBytes = 3;
    } elseif ($unicodeInteger < 67108864) {
        $leadMarker = 248;
        $continuationBytes = 4;
    } elseif ($unicodeInteger < 2147483648) {
        $leadMarker = 252;
        $continuationBytes = 5;
    } else {
        // Cannot express a 32-bit character in UTF-8
        return chr($this->noCharByteVal);
    }
    // Lead byte: marker plus the bits above all continuation bytes
    $char = chr($leadMarker | ($unicodeInteger >> (6 * $continuationBytes)));
    // Continuation bytes: 10vvvvvv, six bits each, most significant group first
    for ($shift = 6 * ($continuationBytes - 1); $shift >= 0; $shift -= 6) {
        $char .= chr(128 | (($unicodeInteger >> $shift) & 63));
    }
    return $char;
}
975
/**
 * Converts a UTF-8 multibyte character to a UNICODE number.
 *
 * If the first byte is not a multibyte lead byte, its byte value is used
 * as the number.
 *
 * @param string $str UTF-8 multibyte character string
 * @param bool $hex If set, then a hex. number prefixed with "x" is returned.
 * @return int|string UNICODE integer, or a string like "x20ac" if $hex is set
 * @see UnumberToChar()
 */
public function utf8CharToUnumber($str, $hex = FALSE) {
    // First char
    $leadByte = ord($str[0]);
    if (($leadByte & 192) !== 192) {
        // Not the start of a multibyte sequence
        $number = $leadByte;
    } else {
        $continuationBits = '';
        $continuationCount = 0;
        // Every further leading 1-bit of the first byte announces one more byte of the sequence
        while ($continuationCount < 8 && (($leadByte << ($continuationCount + 1)) & 128)) {
            $continuationCount++;
            // Each continuation byte contributes its lower six bits
            $continuationBits .= sprintf('%06b', ord(substr($str, $continuationCount, 1)) & 63);
        }
        // The payload of the lead byte are its lowest (6 - number of continuation bytes) bits
        $leadBits = substr('00000000' . decbin($leadByte), -(6 - $continuationCount));
        $number = bindec($leadBits . $continuationBits);
    }
    if ($hex) {
        return 'x' . dechex($number);
    }
    return $number;
}
1009
1010 /********************************************
1011 *
1012 * Init functions
1013 *
1014 ********************************************/
/**
 * Loads the conversion table of a charset from 'typo3/sysext/core/Resources/Private/Charsets/csconvtbl/'.
 * The conversion functions call this method automatically.
 *
 * The parsed table is kept in $this->parsedCharsets[$charset] with two sub arrays:
 * 'local' (byte value => UTF-8 character) and 'utf8' (UTF-8 character => byte value).
 * Only byte values above 127 are stored. A serialized copy of the parsed table is
 * written to typo3temp/cs/ and preferred over parsing on later calls.
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param string $charset The charset to be initialized. Always use lowercase, the name must match a file name in the csconvtbl/ folder exactly ([charset].tbl)
 * @return int|bool 1 if the table was loaded before, 2 if it was read from cache or parsed, FALSE if no conversion table file was found
 * @access private
 */
public function initCharset($charset) {
	// Table is available already
	if (is_array($this->parsedCharsets[$charset])) {
		return 1;
	}
	$charsetConvTableFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/csconvtbl/' . $charset . '.tbl';
	if (!$charset || !GeneralUtility::validPathStr($charsetConvTableFile) || !@is_file($charsetConvTableFile)) {
		return FALSE;
	}
	// Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
	$cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/cs/charset_' . $charset . '.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->parsedCharsets[$charset] = unserialize(GeneralUtility::getUrl($cacheFile));
		return 2;
	}
	// "whitespaced" lines look like "0x0A 0x000A #LINE FEED", "ms-token" lines like "B9 = U+00B9 : SUPERSCRIPT ONE"
	$whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
	$table = array('local' => array(), 'utf8' => array());
	$detectedType = '';
	$lines = GeneralUtility::trimExplode(LF, GeneralUtility::getUrl($charsetConvTableFile), TRUE);
	foreach ($lines as $value) {
		// Blank lines and comments carry no mapping
		if (!trim($value) || $value[0] === '#') {
			continue;
		}
		// The first real line decides about the syntax of the whole file
		if (!$detectedType) {
			$detectedType = preg_match($whitespacedPattern, $value) ? 'whitespaced' : 'ms-token';
		}
		if ($detectedType === 'ms-token') {
			list($hexbyte, $utf8) = preg_split('/[=:]/', $value, 3);
		} elseif ($detectedType === 'whitespaced') {
			$regA = array();
			preg_match($whitespacedPattern, $value, $regA);
			$hexbyte = $regA[1];
			$utf8 = 'U+' . $regA[2];
		}
		$decval = hexdec(trim($hexbyte));
		if ($decval > 127) {
			// Strip the leading "U+" before reading the hex number
			$utf8Char = $this->UnumberToChar(hexdec(substr(trim($utf8), 2)));
			$table['local'][$decval] = $utf8Char;
			$table['utf8'][$utf8Char] = $decval;
		}
	}
	$this->parsedCharsets[$charset] = $table;
	if ($cacheFile) {
		GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->parsedCharsets[$charset]));
	}
	return 2;
}
1080
/**
 * This function initializes all UTF-8 character data tables.
 *
 * For the modes "case" and "ascii" an already loaded or cached table is used if
 * available. Otherwise UnicodeData.txt (and, if present, SpecialCasing.txt and
 * Translit.txt) are parsed and BOTH tables, $this->caseFolding['utf-8'] and
 * $this->toASCII['utf-8'], are rebuilt and written to the cache files.
 * Only characters of the Basic Multilingual Plane are processed.
 *
 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
 *
 * @param string $mode Mode ("case", "ascii", ...)
 * @return int|bool Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
public function initUnicodeData($mode = NULL) {
	// Cache files
	$cacheFileCase = GeneralUtility::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
	$cacheFileASCII = GeneralUtility::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');
	// Only process if the tables are not yet loaded
	switch ($mode) {
		case 'case':
			if (is_array($this->caseFolding['utf-8'])) {
				return 1;
			}
			// Use cached version if possible
			if ($cacheFileCase && @is_file($cacheFileCase)) {
				$this->caseFolding['utf-8'] = unserialize(GeneralUtility::getUrl($cacheFileCase));
				return 2;
			}
			break;
		case 'ascii':
			if (is_array($this->toASCII['utf-8'])) {
				return 1;
			}
			// Use cached version if possible
			if ($cacheFileASCII && @is_file($cacheFileASCII)) {
				$this->toASCII['utf-8'] = unserialize(GeneralUtility::getUrl($cacheFileASCII));
				return 2;
			}
			break;
	}
	$unidataPath = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/';
	// Process main Unicode data file
	$unicodeDataFile = $unidataPath . 'UnicodeData.txt';
	if (!(GeneralUtility::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
		return FALSE;
	}
	$fh = fopen($unicodeDataFile, 'rb');
	if (!$fh) {
		return FALSE;
	}
	// key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
	// Note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
	$this->caseFolding['utf-8'] = array();
	// a shorthand
	$utf8CaseFolding = &$this->caseFolding['utf-8'];
	$utf8CaseFolding['toUpper'] = array();
	$utf8CaseFolding['toLower'] = array();
	$utf8CaseFolding['toTitle'] = array();
	// Array of temp. decompositions
	$decomposition = array();
	// Array of chars that are marks (eg. composing accents)
	$mark = array();
	// Array of chars that are numbers (eg. digits)
	$number = array();
	// Array of chars to be omitted (eg. Russian hard sign)
	$omit = array();
	// Read until fgets() reports the end of the file; testing feof() up front
	// would hand one FALSE "line" to the parser after the last real line.
	while (($line = fgets($fh, 4096)) !== FALSE) {
		$line = rtrim($line);
		if ($line === '') {
			continue;
		}
		// A line has 15 fields, only some of them are of interest here.
		// Short (malformed) lines are padded so that every variable is a string.
		list($char, $name, $cat, , , $decomp, , , $num, , , , $upper, $lower, $title) = array_pad(explode(';', $line), 15, '');
		$ord = hexdec($char);
		if ($ord > 65535) {
			// Only process the BMP
			break;
		}
		$utf8_char = $this->UnumberToChar($ord);
		if ($upper) {
			$utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
		}
		if ($lower) {
			$utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
		}
		// Store "title" only when different from "upper" (only a few)
		if ($title && $title !== $upper) {
			$utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
		}
		// The first letter of the general category names the major class
		$majorCategory = isset($cat[0]) ? $cat[0] : '';
		if ($majorCategory === 'M') {
			// mark (accent, umlaut, ...)
			$mark['U+' . $char] = 1;
		} elseif ($majorCategory === 'N' && $ord > 128 && $num !== '') {
			// numeric value
			$number['U+' . $char] = $num;
		}
		// Accented Latin letters without "official" decomposition
		$match = array();
		if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
			$c = ord($match[2]);
			if ($match[1] === 'SMALL') {
				$c += 32;
			}
			$decomposition['U+' . $char] = array(dechex($c));
			continue;
		}
		$match = array();
		if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
			switch ($match[1]) {
				case '<circle>':
					// add parenthesis as circle replacement, eg (1)
					$match[2] = '0028 ' . $match[2] . ' 0029';
					break;
				case '<square>':
					// add square brackets as square replacement, eg [1]
					$match[2] = '005B ' . $match[2] . ' 005D';
					break;
				case '<compat>':
					// ignore multi char decompositions that start with a space
					if (preg_match('/^0020 /', $match[2])) {
						continue 2;
					}
					break;
				case '<initial>':
				case '<medial>':
				case '<final>':
				case '<isolated>':
				case '<vertical>':
					// positional and vertical presentation forms are not decomposed
					continue 2;
			}
			$decomposition['U+' . $char] = explode(' ', $match[2]);
		}
	}
	fclose($fh);
	// Process additional Unicode data for casing (allow folded characters to expand into a sequence)
	$specialCasingFile = $unidataPath . 'SpecialCasing.txt';
	if (GeneralUtility::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
		$fh = fopen($specialCasingFile, 'rb');
		if ($fh) {
			while (($line = fgets($fh, 4096)) !== FALSE) {
				if ($line[0] === '#' || trim($line) === '') {
					continue;
				}
				list($char, $lower, $title, $upper, $cond) = array_pad(GeneralUtility::trimExplode(';', $line), 5, NULL);
				// Only unconditional mappings are used: the fifth field is empty or already the trailing comment
				if (!($cond === '' || $cond[0] === '#')) {
					continue;
				}
				$utf8_char = $this->UnumberToChar(hexdec($char));
				$mappings = array('toLower' => $lower, 'toTitle' => $title, 'toUpper' => $upper);
				foreach ($mappings as $direction => $codePoints) {
					// Skip identity mappings and title mappings that equal the upper case mapping
					if ($char === $codePoints || ($direction === 'toTitle' && $title === $upper)) {
						continue;
					}
					$sequence = '';
					foreach (explode(' ', $codePoints) as $codePoint) {
						$sequence .= $this->UnumberToChar(hexdec($codePoint));
					}
					$utf8CaseFolding[$direction][$utf8_char] = $sequence;
				}
			}
			fclose($fh);
		}
	}
	// Process custom decompositions
	$customTranslitFile = $unidataPath . 'Translit.txt';
	if (GeneralUtility::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
		$fh = fopen($customTranslitFile, 'rb');
		if ($fh) {
			while (($line = fgets($fh, 4096)) !== FALSE) {
				if ($line[0] === '#' || trim($line) === '') {
					continue;
				}
				list($char, $translit) = array_pad(GeneralUtility::trimExplode(';', $line), 2, '');
				// An empty transliteration means: drop the character
				if (!$translit) {
					$omit['U+' . $char] = 1;
				}
				$decomposition['U+' . $char] = explode(' ', $translit);
			}
			fclose($fh);
		}
	}
	// Decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
	// Entries are rewritten in place, so later lookups see the results of earlier rounds.
	foreach ($decomposition as $from => $to) {
		$code_decomp = array();
		while ($code_value = array_shift($to)) {
			if (isset($decomposition['U+' . $code_value])) {
				// Do recursive decomposition: put the replacement in front of the queue
				foreach (array_reverse($decomposition['U+' . $code_value]) as $cv) {
					array_unshift($to, $cv);
				}
			} elseif (!isset($mark['U+' . $code_value])) {
				// Marks are dropped, everything else is kept
				$code_decomp[] = $code_value;
			}
		}
		if (!empty($code_decomp) || isset($omit[$from])) {
			$decomposition[$from] = $code_decomp;
		} else {
			unset($decomposition[$from]);
		}
	}
	// Create ascii only mapping
	$this->toASCII['utf-8'] = array();
	$ascii = &$this->toASCII['utf-8'];
	foreach ($decomposition as $from => $to) {
		$asciiChars = '';
		foreach ($to as $code_value) {
			$ord = hexdec($code_value);
			if ($ord > 127) {
				// Skip decompositions containing non-ASCII chars
				continue 2;
			}
			$asciiChars .= chr($ord);
		}
		// Keys have the form "U+XXXX": cut off the prefix before reading the hex number
		$ascii[$this->UnumberToChar(hexdec(substr($from, 2)))] = $asciiChars;
	}
	// Add numeric decompositions
	foreach ($number as $from => $to) {
		$utf8_char = $this->UnumberToChar(hexdec(substr($from, 2)));
		if (!isset($ascii[$utf8_char])) {
			$ascii[$utf8_char] = $to;
		}
	}
	if ($cacheFileCase) {
		GeneralUtility::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
	}
	if ($cacheFileASCII) {
		GeneralUtility::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
	}
	return 3;
}
1318
/**
 * Builds the case folding table for a charset other than UTF-8.
 * The case folding functions call this method automatically.
 *
 * The table is derived from the UTF-8 case folding table: every character of the
 * charset's conversion table is folded in UTF-8 and converted back. Mappings for
 * the ASCII letters a-z / A-Z are added afterwards.
 *
 * @param string $charset Charset for which to initialize case folding.
 * @return int|bool Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
public function initCaseFolding($charset) {
	// Table is available already
	if (is_array($this->caseFolding[$charset])) {
		return 1;
	}
	// Prefer the cached table
	$cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/cs/cscase_' . $charset . '.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->caseFolding[$charset] = unserialize(GeneralUtility::getUrl($cacheFile));
		return 2;
	}
	// Both the charset conversion table and the UTF-8 case folding table are required
	if (!$this->initCharset($charset) || !$this->initUnicodeData('case')) {
		return FALSE;
	}
	$nochar = chr($this->noCharByteVal);
	$directions = array('toUpper', 'toLower', 'toTitle');
	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
		// Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);
		foreach ($directions as $direction) {
			$cc = $this->utf8_decode($this->caseFolding['utf-8'][$direction][$utf8], $charset);
			// Keep the mapping only if the folded character exists in the charset
			if ($cc !== '' && $cc !== $nochar) {
				$this->caseFolding[$charset][$direction][$c] = $cc;
			}
		}
	}
	// Add the ASCII case table; upper and lower case letters are 32 positions apart
	for ($lowerCode = ord('a'), $lastCode = ord('z'); $lowerCode <= $lastCode; $lowerCode++) {
		$upperCode = $lowerCode - 32;
		$this->caseFolding[$charset]['toUpper'][chr($lowerCode)] = chr($upperCode);
		$this->caseFolding[$charset]['toLower'][chr($upperCode)] = chr($lowerCode);
	}
	if ($cacheFile) {
		GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->caseFolding[$charset]));
	}
	return 3;
}
1379
/**
 * Builds the to-ASCII conversion table for a charset other than UTF-8.
 * The ASCII transliteration functions call this method automatically.
 *
 * The table is derived from the UTF-8 transliteration table: every character of
 * the charset's conversion table that has a UTF-8 transliteration gets the same
 * ASCII replacement.
 *
 * @param string $charset Charset for which to initialize conversion.
 * @return int|bool Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
public function initToASCII($charset) {
	// Table is available already
	if (is_array($this->toASCII[$charset])) {
		return 1;
	}
	// Prefer the cached table
	$cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/cs/csascii_' . $charset . '.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->toASCII[$charset] = unserialize(GeneralUtility::getUrl($cacheFile));
		return 2;
	}
	// Both the charset conversion table and the UTF-8/ASCII transliteration table are required
	if (!$this->initCharset($charset) || !$this->initUnicodeData('ascii')) {
		return FALSE;
	}
	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
		// Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$localChar = $this->utf8_decode($utf8, $charset);
		if (!isset($this->toASCII['utf-8'][$utf8])) {
			continue;
		}
		$this->toASCII[$charset][$localChar] = $this->toASCII['utf-8'][$utf8];
	}
	if ($cacheFile) {
		GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->toASCII[$charset]));
	}
	return 3;
}
1419
1420 /********************************************
1421 *
1422 * String operation functions
1423 *
1424 ********************************************/
/**
 * Returns a part of a string.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * Depending on $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] the work is
 * done by mbstring, iconv or the internal implementations of this class.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param int $start Start position (character position)
 * @param int $len Length (in characters); if omitted, everything from $start on is returned
 * @return string The substring
 * @see substr(), mb_substr()
 */
public function substr($charset, $string, $start, $len = NULL) {
	if ($len === 0 || $string === '') {
		return '';
	}
	$utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];
	if ($utils === 'mbstring') {
		if ($len !== NULL) {
			return mb_substr($string, $start, $len, $charset);
		}
		// Cannot omit $len, when specifying charset: switch the internal encoding for the call
		$enc = mb_internal_encoding();
		mb_internal_encoding($charset);
		$str = mb_substr($string, $start);
		// Restore internal encoding
		mb_internal_encoding($enc);
		return $str;
	}
	if ($utils === 'iconv') {
		if ($len !== NULL) {
			return iconv_substr($string, $start, $len, $charset);
		}
		// Cannot omit $len, when specifying charset: switch the internal encoding for the call
		$enc = iconv_get_encoding('internal_encoding');
		iconv_set_encoding('internal_encoding', $charset);
		$str = iconv_substr($string, $start);
		// Restore internal encoding
		iconv_set_encoding('internal_encoding', $enc);
		return $str;
	}
	if ($charset === 'utf-8') {
		return $this->utf8_substr($string, $start, $len);
	}
	if ($this->eucBasedSets[$charset]) {
		return $this->euc_substr($string, $start, $charset, $len);
	}
	// Fixed width encodings: character positions are multiples of the character width
	$bytesPerChar = 0;
	if ($this->twoByteSets[$charset]) {
		$bytesPerChar = 2;
	} elseif ($this->fourByteSets[$charset]) {
		$bytesPerChar = 4;
	}
	if ($bytesPerChar > 0) {
		// An omitted length must not be multiplied (NULL * 2 is 0, which would cut everything)
		return $len === NULL
			? substr($string, $start * $bytesPerChar)
			: substr($string, $start * $bytesPerChar, $len * $bytesPerChar);
	}
	// Treat everything else as single-byte encoding
	return $len === NULL ? substr($string, $start) : substr($string, $start, $len);
}
1478
/**
 * Counts the number of characters.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * Depending on $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] the work is
 * done by mbstring, iconv or the internal implementations of this class.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @return int The number of characters
 * @see strlen()
 */
public function strlen($charset, $string) {
	$utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];
	if ($utils === 'mbstring') {
		return mb_strlen($string, $charset);
	}
	if ($utils === 'iconv') {
		return iconv_strlen($string, $charset);
	}
	if ($charset === 'utf-8') {
		return $this->utf8_strlen($string);
	}
	if ($this->eucBasedSets[$charset]) {
		return $this->euc_strlen($string, $charset);
	}
	$byteLength = strlen($string);
	// Fixed width encodings: divide the byte length by the character width
	if ($this->twoByteSets[$charset]) {
		return $byteLength / 2;
	}
	if ($this->fourByteSets[$charset]) {
		return $byteLength / 4;
	}
	// Treat everything else as single-byte encoding
	return $byteLength;
}
1505
/**
 * Crops a string with the help of the mbstring functions.
 *
 * A positive $len keeps the first characters and appends $crop, a negative $len
 * keeps the last characters and prepends $crop. The string is returned untouched
 * if $len is zero or the string is not longer than abs($len) characters.
 *
 * @param string $charset The character set
 * @param string $string String to be cropped
 * @param int $len Crop length (in characters)
 * @param string $crop Crop signifier
 * @return string The shortened string
 * @see mb_strlen(), mb_substr()
 */
protected function cropMbstring($charset, $string, $len, $crop = '') {
	if ((int)$len === 0) {
		return $string;
	}
	$charCount = mb_strlen($string, $charset);
	// Short enough, nothing to crop
	if ($charCount <= abs($len)) {
		return $string;
	}
	if ($len > 0) {
		return mb_substr($string, 0, $len, $charset) . $crop;
	}
	return $crop . mb_substr($string, $len, $charCount, $charset);
}
1527
/**
 * Truncates a string and pre-/appends a string.
 * Unit tested by Kasper
 *
 * A positive $len keeps the beginning of the string and appends $crop, a negative
 * $len keeps the end of the string and prepends $crop. If nothing has to be cut,
 * the string is returned without the crop signifier.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param int $len Length (in characters)
 * @param string $crop Crop signifier
 * @return string The shortened string
 * @see substr(), mb_strimwidth()
 */
public function crop($charset, $string, $len, $crop = '') {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] === 'mbstring') {
		return $this->cropMbstring($charset, $string, $len, $crop);
	}
	if ((int)$len === 0) {
		return $string;
	}
	// Find the byte position at which the string has to be cut
	if ($charset === 'utf-8') {
		$bytePos = $this->utf8_char2byte_pos($string, $len);
	} elseif ($this->eucBasedSets[$charset]) {
		$bytePos = $this->euc_char2byte_pos($string, $len, $charset);
	} elseif ($len > 0) {
		$bytePos = $len;
	} else {
		$bytePos = strlen($string) + $len;
		if ($bytePos <= 0) {
			$bytePos = FALSE;
		}
	}
	// $len outside actual string length
	if ($bytePos === FALSE) {
		return $string;
	}
	if ($len > 0) {
		// Only crop if there is something behind the cut position
		if (isset($string[$bytePos])) {
			return substr($string, 0, $bytePos) . $crop;
		}
	} elseif (isset($string[$bytePos - 1])) {
		// Only crop if there is something in front of the cut position
		return $crop . substr($string, $bytePos);
	}
	return $string;
}
1576
/**
 * Cuts a string short at a given byte length.
 *
 * For multi byte charsets the cut position is moved back so that it does not
 * fall inside a character.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param int $len The byte length
 * @return string The shortened string
 * @see mb_strcut()
 */
public function strtrunc($charset, $string, $len) {
	if ($len <= 0) {
		return '';
	}
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] === 'mbstring') {
		return mb_strcut($string, 0, $len, $charset);
	}
	if ($charset === 'utf-8') {
		return $this->utf8_strtrunc($string, $len);
	}
	if ($this->eucBasedSets[$charset]) {
		return $this->euc_strtrunc($string, $len, $charset);
	}
	if ($this->twoByteSets[$charset]) {
		// Realign to position dividable by two
		$len -= $len % 2;
	} elseif ($this->fourByteSets[$charset]) {
		// Realign to position dividable by four
		$len -= $len % 4;
	}
	// Treat everything else as single-byte encoding
	return substr($string, 0, $len);
}
1608
/**
 * Translates all characters of a string into their respective case values.
 * Unlike strtolower() and strtoupper() this method is locale independent.
 * Note that the string length may change!
 * eg. lower case German "ß" (sharp S) becomes upper case "SS"
 * Unit-tested by Kasper
 * Real case folding is language dependent, this method ignores this fact.
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert case for
 * @param string $case Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
 * @return string The converted string
 * @see strtolower(), strtoupper()
 */
public function conv_case($charset, $string, $case) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] === 'mbstring') {
		return $case === 'toLower'
			? mb_strtolower($string, $charset)
			: mb_strtoupper($string, $charset);
	}
	if ($charset === 'utf-8') {
		return $this->utf8_char_mapping($string, 'case', $case);
	}
	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string, $charset, 'case', $case);
	}
	// Treat everything else as single-byte encoding
	return $this->sb_char_mapping($string, $charset, 'case', $case);
}
1640
/**
 * Equivalent of lcfirst/ucfirst but using character set.
 *
 * Only the first character is converted, the rest of the string is appended untouched.
 *
 * @param string $charset Character set of string
 * @param string $string Input string
 * @param string $case Case keyword, passed on to conv_case()
 * @return string The string with its first character converted
 * @see \TYPO3\CMS\Core\Charset\CharsetConverter::conv_case()
 */
public function convCaseFirst($charset, $string, $case) {
	$convertedFirstChar = $this->conv_case($charset, $this->substr($charset, $string, 0, 1), $case);
	return $convertedFirstChar . $this->substr($charset, $string, 1);
}
1656
/**
 * Converts special chars (like æøåÆØÅ, umlauts etc) to ascii equivalents (usually double-bytes, like æ => ae etc.)
 *
 * The charset decides which mapping method does the work: UTF-8, EUC based
 * or single-byte (the fallback for everything else).
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert
 * @return string The converted string
 */
public function specCharsToASCII($charset, $string) {
	if ($charset === 'utf-8') {
		return $this->utf8_char_mapping($string, 'ascii');
	}
	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string, $charset, 'ascii');
	}
	// Treat everything else as single-byte encoding
	return $this->sb_char_mapping($string, $charset, 'ascii');
}
1675
/**
 * Converts the language codes that we get from the client (usually HTTP_ACCEPT_LANGUAGE)
 * into a TYPO3-readable language code
 *
 * The codes are tried in the order of their quality value (";q=", 1.0 if missing).
 * Each code is tried as given first and then without its part after the first "-".
 *
 * @param string $languageCodesList List of language codes. something like 'de,en-us;q=0.9,de-de;q=0.7,es-cl;q=0.6,en;q=0.4,es;q=0.3,zh;q=0.1'
 * @return string A preferred language that TYPO3 supports, or "default" if none found
 */
public function getPreferredClientLanguage($languageCodesList) {
	// Build a map "TYPO3 language key => code sent by clients"
	$codeByTypo3Language = array();
	// Languages where the TYPO3 code is the same as the ISO code
	foreach (array_keys($this->charSetArray) as $typo3Lang) {
		$codeByTypo3Language[$typo3Lang] = $typo3Lang;
	}
	// Languages where the TYPO3 code differs from the ISO code or needs the country part;
	// these overwrite the entries from above
	foreach ($this->locales->getIsoMapping() as $typo3Lang => $isoLang) {
		$codeByTypo3Language[$typo3Lang] = str_replace('_', '-', $isoLang);
	}
	// Turn the map around, the lookup below is done by client code
	$typo3LanguageByCode = array_flip($codeByTypo3Language);
	// Collect the quality of every language the client asked for
	$qualityByLanguage = array();
	foreach (GeneralUtility::trimExplode(',', $languageCodesList) as $entry) {
		$parts = strpos($entry, ';q=') !== FALSE ? explode(';q=', $entry) : array($entry, 1.0);
		$qualityByLanguage[$parts[0]] = $parts[1];
	}
	// Highest quality first
	arsort($qualityByLanguage, SORT_NUMERIC);
	$selectedLanguage = 'default';
	foreach (array_keys($qualityByLanguage) as $candidate) {
		if (isset($typo3LanguageByCode[$candidate])) {
			$selectedLanguage = $typo3LanguageByCode[$candidate];
			break;
		}
		// Strip the country code from the end
		$segments = explode('-', $candidate);
		$languageOnly = $segments[0];
		if (isset($typo3LanguageByCode[$languageOnly])) {
			$selectedLanguage = $typo3LanguageByCode[$languageOnly];
			break;
		}
	}
	// English is the default language of TYPO3
	if (!$selectedLanguage || $selectedLanguage === 'en') {
		$selectedLanguage = 'default';
	}
	return $selectedLanguage;
}
1728
1729 /********************************************
1730 *
1731 * Internal string operation functions
1732 *
1733 ********************************************/
/**
 * Maps all characters of a string in a single byte charset.
 *
 * The string is returned unchanged if the mode is unknown or the mapping table
 * for the charset cannot be initialized. Bytes without an entry in the mapping
 * table are copied as they are.
 *
 * @param string $str The string
 * @param string $charset The charset
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string
 */
public function sb_char_mapping($str, $charset, $mode, $opt = '') {
	// Loose comparison on purpose, it matches the behaviour of a switch statement
	if ($mode == 'case') {
		if (!$this->initCaseFolding($charset)) {
			return $str;
		}
		$map = &$this->caseFolding[$charset][$opt];
	} elseif ($mode == 'ascii') {
		if (!$this->initToASCII($charset)) {
			return $str;
		}
		$map = &$this->toASCII[$charset];
	} else {
		return $str;
	}
	$out = '';
	$position = 0;
	// Translate byte by byte
	while (isset($str[$position])) {
		$byte = $str[$position];
		$out .= isset($map[$byte]) ? $map[$byte] : $byte;
		$position++;
	}
	return $out;
}
1773
1774 /********************************************
1775 *
1776 * Internal UTF-8 string operation functions
1777 *
1778 ********************************************/
/**
 * Returns a part of a UTF-8 string.
 * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
 *
 * @param string $str UTF-8 string
 * @param int $start Start position (character position)
 * @param int $len Length (in characters)
 * @return string|bool The substring, FALSE if a positive $start lies outside the string
 * @see substr()
 */
public function utf8_substr($str, $start, $len = NULL) {
	if ((string)$len === '0') {
		return '';
	}
	$byte_start = $this->utf8_char2byte_pos($str, $start);
	// $start outside string length
	if ($byte_start === FALSE && $start > 0) {
		return FALSE;
	}
	$str = substr($str, $byte_start);
	// No length given: everything from the start position on
	if ($len == NULL) {
		return $str;
	}
	$byte_end = $this->utf8_char2byte_pos($str, $len);
	if ($byte_end !== FALSE) {
		return substr($str, 0, $byte_end);
	}
	// $len outside actual string length:
	// a negative length that exceeds the string leaves nothing, a positive one leaves everything
	return $len < 0 ? '' : $str;
}
1814
/**
 * Counts the number of characters of a string in UTF-8.
 * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
 *
 * Every byte that is not a continuation byte (10xxxxxx) starts a character,
 * so it is sufficient to count those bytes.
 *
 * @param string $str UTF-8 multibyte character string
 * @return int The number of characters
 * @see strlen()
 */
public function utf8_strlen($str) {
	$characters = 0;
	$position = 0;
	while (isset($str[$position])) {
		// Single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		if ((ord($str[$position]) & 192) !== 128) {
			$characters++;
		}
		$position++;
	}
	return $characters;
}
1837
/**
 * Truncates a string in UTF-8 short at a given byte length.
 *
 * If the cut position falls inside a multi byte character, the whole character
 * is dropped. A character that fits completely into $len bytes is kept, also
 * if it is the very first character of the string.
 *
 * @param string $str UTF-8 multibyte character string
 * @param int $len The byte length
 * @return string The shortened string
 * @see mb_strcut()
 */
public function utf8_strtrunc($str, $len) {
	// Same guard as in strtrunc(): nothing fits into zero or less bytes
	if ($len <= 0) {
		return '';
	}
	$i = $len - 1;
	// Cut position behind the end of the string, no byte to inspect
	if (!isset($str[$i])) {
		return substr($str, 0, $len);
	}
	// Part of a multibyte sequence
	if (ord($str[$i]) & 128) {
		// Walk back to the byte that starts the sequence
		while ($i > 0 && !(ord($str[$i]) & 64)) {
			$i--;
		}
		$startByte = ord($str[$i]);
		// Reached the beginning of the string without finding a starting byte (11xxxxxx)
		if ($i === 0 && ($startByte & 192) !== 192) {
			return '';
		}
		// Sanity check: calculate number of bytes of the sequence from the leading "1" bits
		for ($bc = 0, $mbs = $startByte; $mbs & 128; $mbs = $mbs << 1) {
			$bc++;
		}
		// The sequence does not fit completely, cut in front of it
		if ($bc + $i > $len) {
			return substr($str, 0, $i);
		}
	}
	return substr($str, 0, $len);
}
1867
/**
 * Find position of first occurrence of a string, both arguments are in UTF-8.
 *
 * Depending on $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] the work is
 * done by mbstring, iconv or by a byte search whose result is translated into
 * a character position.
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 string to search for
 * @param int $offset Position to start the search
 * @return int|bool The character position, FALSE if the needle was not found or the offset lies beyond the string
 * @see strpos()
 */
public function utf8_strpos($haystack, $needle, $offset = 0) {
	$utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];
	if ($utils === 'mbstring') {
		return mb_strpos($haystack, $needle, $offset, 'utf-8');
	}
	if ($utils === 'iconv') {
		return iconv_strpos($haystack, $needle, $offset, 'utf-8');
	}
	$byte_offset = $this->utf8_char2byte_pos($haystack, $offset);
	// Offset beyond string length
	if ($byte_offset === FALSE) {
		return FALSE;
	}
	$byte_pos = strpos($haystack, $needle, $byte_offset);
	// Needle not found: FALSE, otherwise translate the byte position
	return $byte_pos === FALSE ? FALSE : $this->utf8_byte2char_pos($haystack, $byte_pos);
}
1895
/**
 * Finds the character position of the last occurrence of a character in a
 * string, both arguments are in UTF-8.
 *
 * Delegates to mb_strrpos() or iconv_strrpos() when
 * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] selects one of them.
 * Otherwise a byte-wise strrpos() is run and its result is translated into
 * a character position.
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 character to search for (single character)
 * @return int|bool The character position, FALSE if the needle was not found
 * @see strrpos()
 */
public function utf8_strrpos($haystack, $needle) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] === 'mbstring') {
		// The third argument of mb_strrpos() is the offset; the encoding must be
		// passed as fourth argument (the legacy three-argument form is rejected
		// by current PHP versions).
		return mb_strrpos($haystack, $needle, 0, 'utf-8');
	} elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] === 'iconv') {
		// iconv_strrpos() has no offset argument, the encoding is third here
		return iconv_strrpos($haystack, $needle, 'utf-8');
	}
	$byte_pos = strrpos($haystack, $needle);
	if ($byte_pos === FALSE) {
		// Needle not found
		return FALSE;
	}
	return $this->utf8_byte2char_pos($haystack, $byte_pos);
}
1917
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * A non-negative $pos is counted from the start of the string, a negative
 * $pos from its end (-1 addresses the last character). The scan uses
 * explicit bounds instead of isset() on string offsets, because negative
 * string offsets are valid on newer PHP versions and would let a backward
 * scan wrap around to the end of the string.
 *
 * @param string $str UTF-8 string
 * @param int $pos Character position (negative values start from the end)
 * @return int|bool Byte position, FALSE if the position lies outside the string
 */
public function utf8_char2byte_pos($str, $pos) {
	$str = (string)$str;
	$pos = (int)$pos;
	$length = strlen($str);
	// Number of characters found
	$n = 0;
	// Number of characters wanted
	$p = abs($pos);
	if ($pos >= 0) {
		$i = 0;
		$d = 1;
	} else {
		$i = $length - 1;
		$d = -1;
	}
	for (; $i >= 0 && $i < $length && $n < $p; $i += $d) {
		// Single bytes (0xxxxxxx) and lead bytes (11xxxxxx) start a character,
		// continuation bytes (10xxxxxx) do not
		if ((ord($str[$i]) & 192) !== 128) {
			$n++;
		}
	}
	if ($pos >= 0) {
		if ($i >= $length) {
			// Offset beyond string length
			return FALSE;
		}
		// Skip trailing continuation bytes of the previous character,
		// without reading past the end of the string
		while ($i < $length && (ord($str[$i]) & 192) === 128) {
			$i++;
		}
		return $i;
	}
	if ($n < $p) {
		// Ran past the start of the string before enough characters were found
		return FALSE;
	}
	// The loop stepped one byte in front of the start of the wanted character
	return $i + 1;
}
1963
/**
 * Translates an 'absolute' byte position into a character position.
 *
 * Counts the character start bytes between byte 1 and $pos (inclusive).
 * When $pos addresses the first byte of a character, that count equals the
 * number of characters in front of it.
 *
 * @param string $str UTF-8 string
 * @param int $pos Byte position
 * @return int|bool Character position, FALSE if $pos does not address a byte of the string
 */
public function utf8_byte2char_pos($str, $pos) {
	$str = (string)$str;
	$pos = (int)$pos;
	// Validate the offset up front: counting bytes that do not exist would
	// produce a bogus position instead of reporting the error
	if ($pos < 0 || !isset($str[$pos])) {
		// Offset beyond string length
		return FALSE;
	}
	// Number of characters
	$n = 0;
	for ($i = $pos; $i > 0; $i--) {
		// Single bytes (0xxxxxxx) and lead bytes (11xxxxxx) start a character
		if ((ord($str[$i]) & 192) !== 128) {
			$n++;
		}
	}
	return $n;
}
1991
/**
 * Maps all characters of an UTF-8 string.
 *
 * The string is split into characters; every character that has an entry
 * in the selected mapping table is replaced by the mapped value, all other
 * characters are copied unchanged. Bytes that cannot start a character
 * (stray continuation bytes in malformed input) are copied unchanged too.
 * The string is returned untouched if initUnicodeData() reports failure or
 * if $mode is neither 'case' nor 'ascii'.
 *
 * @param string $str UTF-8 string
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string
 */
public function utf8_char_mapping($str, $mode, $opt = '') {
	if (!$this->initUnicodeData($mode)) {
		// Do nothing
		return $str;
	}
	switch ($mode) {
		case 'case':
			$map = &$this->caseFolding['utf-8'][$opt];
			break;
		case 'ascii':
			$map = &$this->toASCII['utf-8'];
			break;
		default:
			return $str;
	}
	$out = '';
	for ($i = 0; isset($str[$i]); $i++) {
		$c = ord($str[$i]);
		if (($c & 192) === 192) {
			// Multi-byte starting byte (11xxxxxx): the number of leading
			// 1-bits is the number of bytes of the character
			for ($bc = 0; $c & 128; $c = $c << 1) {
				$bc++;
			}
			$mbc = substr($str, $i, $bc);
			$i += $bc - 1;
		} else {
			// Single-byte character (0xxxxxxx), or a stray continuation byte
			// (10xxxxxx). Assigning it here keeps the previous character from
			// being emitted a second time.
			$mbc = $str[$i];
		}
		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}
	return $out;
}
2038
2039 /********************************************
2040 *
2041 * Internal EUC string operation functions
2042 *
2043 * Extended Unix Code:
 * ASCII compatible 7-bit single-byte characters
 * 8-bit double-byte characters
2046 *
2047 * Shift-JIS is treated as a special case.
2048 *
2049 ********************************************/
/**
 * Cuts a string in the EUC charset family short at a given byte length
 * without splitting a double-byte character.
 *
 * If a double-byte character straddles the cut, it is dropped completely,
 * so the result may be one byte shorter than $len. A non-positive $len
 * yields an empty string; a string of at most $len bytes is returned
 * unchanged.
 *
 * @param string $str EUC multibyte character string
 * @param int $len The byte length
 * @param string $charset The charset
 * @return string The shortened string
 * @see mb_strcut()
 */
public function euc_strtrunc($str, $len, $charset) {
	$str = (string)$str;
	$len = (int)$len;
	if ($len <= 0) {
		return '';
	}
	if (strlen($str) <= $len) {
		// String is not longer than the supplied length. Checking this first
		// keeps a straddling last character from slipping through uncut.
		return $str;
	}
	$shiftJis = $charset === 'shift_jis';
	// The string is longer than $len, so every $str[$i] read below exists
	for ($i = 0; $i < $len; $i++) {
		$c = ord($str[$i]);
		if ($shiftJis) {
			$isLeadByte = $c >= 128 && $c < 160 || $c >= 224;
		} else {
			$isLeadByte = $c >= 128;
		}
		if ($isLeadByte) {
			// Step over the second byte of a double-byte character
			$i++;
		}
	}
	if ($i > $len) {
		// We ended on a first byte: the character at $len - 1 does not fit
		return substr($str, 0, $len - 1);
	}
	return substr($str, 0, $len);
}
2084
/**
 * Returns a part of a string in the EUC charset family.
 *
 * Character positions are translated into byte positions with
 * euc_char2byte_pos() and the cut itself is done with substr(). A length
 * of 0 yields an empty string. A negative length that reaches back beyond
 * the start of the remaining string yields an empty string as well, like
 * PHP's substr() does.
 *
 * @param string $str EUC multibyte character string
 * @param int $start Start position (character position)
 * @param string $charset The charset
 * @param int $len Length (in characters)
 * @return string|bool The substring, FALSE if $start lies outside the string
 */
public function euc_substr($str, $start, $charset, $len = NULL) {
	$byte_start = $this->euc_char2byte_pos($str, $start, $charset);
	if ($byte_start === FALSE) {
		// $start outside string length
		return FALSE;
	}
	$str = substr($str, $byte_start);
	if ($len === NULL || $len === FALSE || $len === '') {
		// No length given: everything from $start on.
		// A numeric 0 is deliberately not treated as "no length".
		return $str;
	}
	$len = (int)$len;
	if ($len === 0) {
		return '';
	}
	$byte_end = $this->euc_char2byte_pos($str, $len, $charset);
	if ($byte_end === FALSE) {
		// $len outside actual string length
		return $len < 0 ? '' : $str;
	}
	return substr($str, 0, $byte_end);
}
2113
/**
 * Returns the number of characters of a string in the EUC charset family.
 *
 * A lead byte consumes the byte that follows it, every other byte is a
 * character of its own. For 'shift_jis' the lead bytes are 128-159 and
 * 224-255, for all other charsets every byte >= 128 is a lead byte.
 *
 * @param string $str EUC multibyte character string
 * @param string $charset The charset
 * @return int The number of characters
 * @see strlen()
 */
public function euc_strlen($str, $charset) {
	$isShiftJis = $charset === 'shift_jis';
	$count = 0;
	$offset = 0;
	while (isset($str[$offset])) {
		$byte = ord($str[$offset]);
		if ($isShiftJis) {
			$isLeadByte = $byte >= 128 && $byte < 160 || $byte >= 224;
		} else {
			$isLeadByte = $byte >= 128;
		}
		// Double-byte characters advance by two bytes, all others by one
		$offset += $isLeadByte ? 2 : 1;
		$count++;
	}
	return $count;
}
2140
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * The string is always scanned from its start, also for negative
 * positions. The second byte of a Shift_JIS character may lie in the ASCII
 * range, so character boundaries cannot be told apart reliably when
 * walking backwards.
 *
 * @param string $str EUC multibyte character string
 * @param int $pos Character position (negative values start from the end)
 * @param string $charset The charset
 * @return int|bool Byte position, FALSE if the position lies outside the string
 */
public function euc_char2byte_pos($str, $pos, $charset) {
	$str = (string)$str;
	$pos = (int)$pos;
	$length = strlen($str);
	$sjis = $charset === 'shift_jis';
	if ($pos < 0) {
		// Collect the byte offset of every character, then count from the end
		$starts = array();
		for ($i = 0; $i < $length; $i++) {
			$starts[] = $i;
			$c = ord($str[$i]);
			if ($sjis ? ($c >= 128 && $c < 160 || $c >= 224) : $c >= 128) {
				// Step over the second byte of a double-byte character
				$i++;
			}
		}
		$index = count($starts) + $pos;
		// A negative index means the position reaches beyond the start
		return $index >= 0 ? $starts[$index] : FALSE;
	}
	// Number of characters seen
	$n = 0;
	for ($i = 0; $i < $length && $n < $pos; $i++) {
		$c = ord($str[$i]);
		if ($sjis ? ($c >= 128 && $c < 160 || $c >= 224) : $c >= 128) {
			$i++;
		}
		$n++;
	}
	if ($i >= $length) {
		// Offset beyond string length
		return FALSE;
	}
	return $i;
}
2185
/**
 * Maps all characters of a string in the EUC charset family.
 *
 * Depending on $mode the case folding table or the ASCII transliteration
 * table of $charset is used. Characters with an entry in that table are
 * replaced, all others are copied unchanged. The string is returned
 * untouched if the table cannot be initialized or $mode is unknown.
 *
 * @param string $str EUC multibyte character string
 * @param string $charset The charset
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string
 */
public function euc_char_mapping($str, $charset, $mode, $opt = '') {
	switch ($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) {
				// No case folding data available: do nothing
				return $str;
			}
			$map = &$this->caseFolding[$charset][$opt];
			break;
		case 'ascii':
			if (!$this->initToASCII($charset)) {
				// No transliteration data available: do nothing
				return $str;
			}
			$map = &$this->toASCII[$charset];
			break;
		default:
			return $str;
	}
	$isShiftJis = $charset === 'shift_jis';
	$result = '';
	$offset = 0;
	while (isset($str[$offset])) {
		$byte = ord($str[$offset]);
		$isLeadByte = $isShiftJis
			? ($byte >= 128 && $byte < 160 || $byte >= 224)
			: $byte >= 128;
		// A lead byte forms a double-byte char together with the next byte
		$char = $isLeadByte ? substr($str, $offset, 2) : $str[$offset];
		$result .= isset($map[$char]) ? $map[$char] : $char;
		$offset += $isLeadByte ? 2 : 1;
	}
	return $result;
}
2240
2241 }