86b14bf250bedbb2a83c8ec5f770b9f5594de57c
[Packages/TYPO3.CMS.git] / typo3 / sysext / core / Classes / Charset / CharsetConverter.php
1 <?php
2 namespace TYPO3\CMS\Core\Charset;
3
4 /***************************************************************
5 * Copyright notice
6 *
7 * (c) 2003-2013 Kasper Skårhøj (kasperYYYY@typo3.com)
8 * All rights reserved
9 *
10 * This script is part of the Typo3 project. The Typo3 project is
11 * free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * The GNU General Public License can be found at
17 * http://www.gnu.org/copyleft/gpl.html.
18 *
19 * This script is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 * GNU General Public License for more details.
23 *
24 * This copyright notice MUST APPEAR in all copies of the script!
25 ***************************************************************/
26
27 use TYPO3\CMS\Core\Utility\GeneralUtility;
28
29 /**
30 * Notes on UTF-8
31 *
32 * Functions working on UTF-8 strings:
33 *
34 * - strchr/strstr
35 * - strrchr
36 * - substr_count
37 * - implode/explode/join
38 *
39 * Functions nearly working on UTF-8 strings:
40 *
41 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
42 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
43 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
44 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
45 * - preg_*: UTF-8 support is compiled into PHP by default nowadays, but could be unavailable; patterns need the "u" modifier
46 *
47 * Functions NOT working on UTF-8 strings:
48 *
49 * - str*cmp
50 * - stristr
51 * - stripos
52 * - substr
53 * - strrev
54 * - split/spliti
55 * - ...
56 */
57
58 /**
59 * Class for conversion between charsets
60 *
61 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
62 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
63 */
64 class CharsetConverter {
65
/**
 * Locales service instance; assigned in the constructor.
 *
 * @var \TYPO3\CMS\Core\Localization\Locales
 */
protected $locales;
70
/**
 * ASCII value (63 = "?") emitted for chars with no equivalent in the target charset.
 *
 * @var integer
 * @todo Define visibility
 */
public $noCharByteVal = 63;
76
/**
 * Cache of parsed conversion tables, keyed by charset name.
 * The conversion methods read two sub-arrays per charset:
 * 'local' (local char number => UTF-8 byte sequence) and
 * 'utf8' (UTF-8 byte sequence => local char number).
 *
 * @var array
 * @todo Define visibility
 */
public $parsedCharsets = array();
82
/**
 * An array where case folding data will be stored (cached).
 *
 * @var array
 * @todo Define visibility
 */
public $caseFolding = array();
88
/**
 * An array where charset-to-ASCII mappings are stored (cached).
 *
 * @var array
 * @todo Define visibility
 */
public $toASCII = array();
94
/**
 * Tells the converter which charsets have two bytes per char.
 * Only the keys are checked (via isset()); the value 1 is a placeholder.
 *
 * @var array
 * @todo Define visibility
 */
public $twoByteSets = array(
    'ucs-2' => 1
);
102
/**
 * Tells the converter which charsets have four bytes per char.
 *
 * @var array
 * @todo Define visibility
 */
public $fourByteSets = array(
    'ucs-4' => 1,
    // 4-byte Unicode
    'utf-32' => 1
);
112
/**
 * Tells the converter which charsets use a scheme like the Extended Unix Code,
 * i.e. bytes above 127 introduce a two-byte character.
 * Only the keys are checked (via isset()); the value 1 is a placeholder.
 *
 * @var array
 * @todo Define visibility
 */
public $eucBasedSets = array(
    'gb2312' => 1,
    // Chinese, simplified.
    'big5' => 1,
    // Chinese, traditional.
    'euc-kr' => 1,
    // Korean
    'shift_jis' => 1
);
126
/**
 * Charset aliases (lowercase) mapped to the canonical charset name used by this class.
 * Consulted by parse_charset().
 *
 * See http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
 * and http://czyborra.com/charsets/iso8859.html
 *
 * @var array
 * @todo Define visibility
 */
public $synonyms = array(
    'us' => 'ascii',
    'us-ascii' => 'ascii',
    'cp819' => 'iso-8859-1',
    'ibm819' => 'iso-8859-1',
    'iso-ir-100' => 'iso-8859-1',
    'iso-ir-101' => 'iso-8859-2',
    'iso-ir-109' => 'iso-8859-3',
    'iso-ir-110' => 'iso-8859-4',
    'iso-ir-144' => 'iso-8859-5',
    'iso-ir-127' => 'iso-8859-6',
    'iso-ir-126' => 'iso-8859-7',
    'iso-ir-138' => 'iso-8859-8',
    'iso-ir-148' => 'iso-8859-9',
    'iso-ir-157' => 'iso-8859-10',
    'iso-ir-179' => 'iso-8859-13',
    'iso-ir-199' => 'iso-8859-14',
    'iso-ir-203' => 'iso-8859-15',
    'csisolatin1' => 'iso-8859-1',
    'csisolatin2' => 'iso-8859-2',
    'csisolatin3' => 'iso-8859-3',
    'csisolatin5' => 'iso-8859-9',
    'csisolatin8' => 'iso-8859-14',
    'csisolatin9' => 'iso-8859-15',
    'csisolatingreek' => 'iso-8859-7',
    'iso-celtic' => 'iso-8859-14',
    'latin1' => 'iso-8859-1',
    'latin2' => 'iso-8859-2',
    'latin3' => 'iso-8859-3',
    'latin5' => 'iso-8859-9',
    'latin6' => 'iso-8859-10',
    'latin8' => 'iso-8859-14',
    'latin9' => 'iso-8859-15',
    'l1' => 'iso-8859-1',
    'l2' => 'iso-8859-2',
    'l3' => 'iso-8859-3',
    'l5' => 'iso-8859-9',
    'l6' => 'iso-8859-10',
    'l8' => 'iso-8859-14',
    'l9' => 'iso-8859-15',
    'cyrillic' => 'iso-8859-5',
    'arabic' => 'iso-8859-6',
    'tis-620' => 'iso-8859-11',
    'win874' => 'windows-874',
    'win1250' => 'windows-1250',
    'win1251' => 'windows-1251',
    'win1252' => 'windows-1252',
    'win1253' => 'windows-1253',
    'win1254' => 'windows-1254',
    'win1255' => 'windows-1255',
    'win1256' => 'windows-1256',
    'win1257' => 'windows-1257',
    'win1258' => 'windows-1258',
    'cp1250' => 'windows-1250',
    'cp1251' => 'windows-1251',
    'cp1252' => 'windows-1252',
    'ms-ee' => 'windows-1250',
    'ms-ansi' => 'windows-1252',
    'ms-greek' => 'windows-1253',
    'ms-turk' => 'windows-1254',
    'winbaltrim' => 'windows-1257',
    'koi-8ru' => 'koi-8r',
    'koi8r' => 'koi-8r',
    'cp878' => 'koi-8r',
    'mac' => 'macroman',
    'macintosh' => 'macroman',
    'euc-cn' => 'gb2312',
    'x-euc-cn' => 'gb2312',
    'euccn' => 'gb2312',
    'cp936' => 'gb2312',
    'big-5' => 'big5',
    'cp950' => 'big5',
    'eucjp' => 'euc-jp',
    'sjis' => 'shift_jis',
    'shift-jis' => 'shift_jis',
    'cp932' => 'shift_jis',
    'cp949' => 'euc-kr',
    'utf7' => 'utf-7',
    'utf8' => 'utf-8',
    'utf16' => 'utf-16',
    'utf32' => 'utf-32',
    'ucs2' => 'ucs-2',
    'ucs4' => 'ucs-4'
);
217
/**
 * Mapping of language identifiers (lowercase) to script names.
 * The script names are the keys of $script_to_charset_unix / $script_to_charset_windows.
 *
 * Three groups of identifiers are supported: ISO 639-1 codes, MS language codes
 * and English language names.
 *
 * @var array
 * @todo Define visibility
 */
public $lang_to_script = array(
    // iso-639-1 language codes, see http://www.loc.gov/standards/iso639-2/php/code_list.php
    'af' => 'west_european', // Afrikaans
    'ar' => 'arabic',
    'bg' => 'cyrillic', // Bulgarian
    'bs' => 'east_european', // Bosnian
    'cs' => 'east_european', // Czech
    'da' => 'west_european', // Danish
    'de' => 'west_european', // German
    'el' => 'greek', // Greek (the actual ISO 639-1 code)
    'es' => 'west_european', // Spanish
    'et' => 'estonian',
    'eo' => 'unicode', // Esperanto
    'eu' => 'west_european', // Basque
    'fa' => 'arabic', // Persian
    'fi' => 'west_european', // Finnish
    'fo' => 'west_european', // Faroese
    'fr' => 'west_european', // French
    'ga' => 'west_european', // Irish
    'gl' => 'west_european', // Galician
    'gr' => 'greek', // Not an ISO 639-1 code; kept for backwards compatibility
    'he' => 'hebrew', // Hebrew (since 1998)
    'hi' => 'unicode', // Hindi
    'hr' => 'east_european', // Croatian
    'hu' => 'east_european', // Hungarian
    'iw' => 'hebrew', // Hebrew (til 1998)
    'is' => 'west_european', // Icelandic
    'it' => 'west_european', // Italian
    'ja' => 'japanese',
    'ka' => 'unicode', // Georgian
    'kl' => 'west_european', // Greenlandic
    'km' => 'unicode', // Khmer
    'ko' => 'korean',
    'lt' => 'lithuanian',
    'lv' => 'west_european', // Latvian/Lettish
    'nl' => 'west_european', // Dutch
    'no' => 'west_european', // Norwegian
    'nb' => 'west_european', // Norwegian Bokmal
    'nn' => 'west_european', // Norwegian Nynorsk
    'pl' => 'east_european', // Polish
    'pt' => 'west_european', // Portuguese
    'ro' => 'east_european', // Romanian
    'ru' => 'cyrillic', // Russian
    'sk' => 'east_european', // Slovak
    'sl' => 'east_european', // Slovenian
    'sr' => 'cyrillic', // Serbian
    'sv' => 'west_european', // Swedish
    'sq' => 'albanian', // Albanian
    'th' => 'thai',
    'tr' => 'turkish', // Turkish
    'uk' => 'cyrillic', // Ukrainian
    'vi' => 'vietnamese',
    'zh' => 'chinese',
    // MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
    // http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
    'afk' => 'west_european', // Afrikaans
    'ara' => 'arabic',
    'bgr' => 'cyrillic', // Bulgarian
    'cat' => 'west_european', // Catalan
    'chs' => 'simpl_chinese',
    'cht' => 'trad_chinese',
    'csy' => 'east_european', // Czech
    'dan' => 'west_european', // Danish
    'deu' => 'west_european', // German
    'dea' => 'west_european', // German (Austrian)
    'des' => 'west_european', // German (Swiss)
    'ena' => 'west_european', // English (Australian)
    'enc' => 'west_european', // English (Canadian)
    'eng' => 'west_european', // English
    'enz' => 'west_european', // English (New Zealand)
    'enu' => 'west_european', // English (United States)
    'euq' => 'west_european', // Basque
    'fos' => 'west_european', // Faroese
    'far' => 'arabic', // Persian
    'fin' => 'west_european', // Finnish
    'fra' => 'west_european', // French
    'frb' => 'west_european', // French (Belgian)
    'frc' => 'west_european', // French (Canadian)
    'frs' => 'west_european', // French (Swiss)
    'geo' => 'unicode', // Georgian
    'glg' => 'west_european', // Galician
    'ell' => 'greek',
    'heb' => 'hebrew',
    'hin' => 'unicode', // Hindi
    'hun' => 'east_european', // Hungarian
    'isl' => 'west_european', // Icelandic
    'ita' => 'west_european', // Italian
    'its' => 'west_european', // Italian (Swiss)
    'jpn' => 'japanese',
    'khm' => 'unicode', // Khmer
    'kor' => 'korean',
    'lth' => 'lithuanian',
    'lvi' => 'west_european', // Latvian/Lettish
    'msl' => 'west_european', // Malay
    'nlb' => 'west_european', // Dutch (Belgian)
    'nld' => 'west_european', // Dutch
    'nor' => 'west_european', // Norwegian (bokmal)
    'non' => 'west_european', // Norwegian (nynorsk)
    'plk' => 'east_european', // Polish
    'ptg' => 'west_european', // Portuguese
    'ptb' => 'west_european', // Portuguese (Brazil)
    'rom' => 'east_european', // Romanian
    'rus' => 'cyrillic', // Russian
    'slv' => 'east_european', // Slovenian
    'sky' => 'east_european', // Slovak
    'srl' => 'east_european', // Serbian (Latin)
    'srb' => 'cyrillic', // Serbian (Cyrillic)
    'esp' => 'west_european', // Spanish (trad. sort)
    'esm' => 'west_european', // Spanish (Mexican)
    'esn' => 'west_european', // Spanish (internat. sort)
    'sve' => 'west_european', // Swedish
    'sqi' => 'albanian', // Albanian
    'tha' => 'thai',
    'trk' => 'turkish',
    'ukr' => 'cyrillic', // Ukrainian
    // English language names
    'afrikaans' => 'west_european',
    'albanian' => 'albanian',
    'arabic' => 'arabic',
    'basque' => 'west_european',
    'bosnian' => 'east_european',
    // Bulgarian is written in Cyrillic, consistent with 'bg' and 'bgr' above
    'bulgarian' => 'cyrillic',
    'catalan' => 'west_european',
    'croatian' => 'east_european',
    'czech' => 'east_european',
    'danish' => 'west_european',
    'dutch' => 'west_european',
    'english' => 'west_european',
    'esperanto' => 'unicode',
    'estonian' => 'estonian',
    'faroese' => 'west_european',
    'farsi' => 'arabic',
    'finnish' => 'west_european',
    'french' => 'west_european',
    'galician' => 'west_european',
    'georgian' => 'unicode',
    'german' => 'west_european',
    'greek' => 'greek',
    'greenlandic' => 'west_european',
    'hebrew' => 'hebrew',
    'hindi' => 'unicode',
    'hungarian' => 'east_european',
    'icelandic' => 'west_european',
    'italian' => 'west_european',
    'khmer' => 'unicode',
    'latvian' => 'west_european',
    'lettish' => 'west_european',
    'lithuanian' => 'lithuanian',
    'malay' => 'west_european',
    'norwegian' => 'west_european',
    'persian' => 'arabic',
    'polish' => 'east_european',
    'portuguese' => 'west_european',
    'russian' => 'cyrillic',
    'romanian' => 'east_european',
    'serbian' => 'cyrillic',
    'slovak' => 'east_european',
    'slovenian' => 'east_european',
    'spanish' => 'west_european',
    'swedish' => 'west_european',
    'thai' => 'thai',
    'turkish' => 'turkish',
    'ukrainian' => 'cyrillic',
    // Historic misspellings of "swedish" and "thai"; kept for backwards compatibility
    'svedish' => 'west_european',
    'that' => 'thai'
);
474
/**
 * Mapping of language (family) names to charsets on Unix.
 * get_locale_charset() uses this table when TYPO3_OS is not 'WIN' and falls back
 * to 'utf-8' for missing or empty entries.
 *
 * @var array
 * @todo Define visibility
 */
public $script_to_charset_unix = array(
    'west_european' => 'iso-8859-1',
    'estonian' => 'iso-8859-1',
    'east_european' => 'iso-8859-2',
    'baltic' => 'iso-8859-4',
    'cyrillic' => 'iso-8859-5',
    'arabic' => 'iso-8859-6',
    'greek' => 'iso-8859-7',
    'hebrew' => 'iso-8859-8',
    'turkish' => 'iso-8859-9',
    'thai' => 'iso-8859-11',
    // = TIS-620
    'lithuanian' => 'iso-8859-13',
    'chinese' => 'gb2312',
    // = euc-cn
    'japanese' => 'euc-jp',
    'korean' => 'euc-kr',
    'simpl_chinese' => 'gb2312',
    'trad_chinese' => 'big5',
    'vietnamese' => '',
    'unicode' => 'utf-8',
    'albanian' => 'utf-8'
);
502
/**
 * Mapping of language (family) names to charsets on Windows.
 * get_locale_charset() uses this table when TYPO3_OS is 'WIN' and falls back
 * to 'windows-1252' for missing or empty entries.
 *
 * @var array
 * @todo Define visibility
 */
public $script_to_charset_windows = array(
    'east_european' => 'windows-1250',
    'cyrillic' => 'windows-1251',
    'west_european' => 'windows-1252',
    'greek' => 'windows-1253',
    'turkish' => 'windows-1254',
    'hebrew' => 'windows-1255',
    'arabic' => 'windows-1256',
    'baltic' => 'windows-1257',
    'estonian' => 'windows-1257',
    'lithuanian' => 'windows-1257',
    'vietnamese' => 'windows-1258',
    'thai' => 'cp874',
    'korean' => 'cp949',
    'chinese' => 'gb2312',
    'japanese' => 'shift_jis',
    'simpl_chinese' => 'gb2312',
    'trad_chinese' => 'big5',
    'albanian' => 'windows-1250',
    'unicode' => 'utf-8'
);
528
/**
 * Mapping of complete locale names to charsets.
 * get_locale_charset() lowercases the locale before the lookup, so keys must be
 * lowercase to be found.
 *
 * @var array
 * @todo Define visibility
 */
public $locale_to_charset = array(
    'japanese.euc' => 'euc-jp',
    'ja_jp.ujis' => 'euc-jp',
    'korean.euc' => 'euc-kr',
    'sr@latn' => 'iso-8859-2',
    // Mixed-case spelling can never match a lowercased locale; kept for backwards compatibility
    'sr@Latn' => 'iso-8859-2',
    'zh_cn' => 'gb2312',
    'zh_hk' => 'big5',
    'zh_tw' => 'big5'
);
542
/**
 * TYPO3 specific: Array with the system charsets used for each system language in TYPO3.
 * Empty values mean "iso-8859-1".
 *
 * @var array
 * @todo Define visibility
 */
public $charSetArray = array(
    'af' => '',
    'ar' => 'iso-8859-6',
    'ba' => 'iso-8859-2',
    'bg' => 'windows-1251',
    'br' => '',
    'ca' => 'iso-8859-15',
    'ch' => 'gb2312',
    'cs' => 'windows-1250',
    'cz' => 'windows-1250',
    'da' => '',
    'de' => '',
    'dk' => '',
    'el' => 'iso-8859-7',
    'eo' => 'utf-8',
    'es' => '',
    'et' => 'iso-8859-4',
    'eu' => '',
    'fa' => 'utf-8',
    'fi' => '',
    'fo' => 'utf-8',
    'fr' => '',
    'fr_CA' => '',
    'ga' => '',
    'ge' => 'utf-8',
    'gl' => '',
    'gr' => 'iso-8859-7',
    'he' => 'utf-8',
    'hi' => 'utf-8',
    'hk' => 'big5',
    'hr' => 'windows-1250',
    'hu' => 'iso-8859-2',
    'is' => 'utf-8',
    'it' => '',
    'ja' => 'shift_jis',
    'jp' => 'shift_jis',
    'ka' => 'utf-8',
    'kl' => 'utf-8',
    'km' => 'utf-8',
    'ko' => 'euc-kr',
    'kr' => 'euc-kr',
    'lt' => 'windows-1257',
    'lv' => 'utf-8',
    'ms' => '',
    'my' => '',
    'nl' => '',
    'no' => '',
    'pl' => 'iso-8859-2',
    'pt' => '',
    'pt_BR' => '',
    'qc' => '',
    'ro' => 'iso-8859-2',
    'ru' => 'windows-1251',
    'se' => '',
    'si' => 'windows-1250',
    'sk' => 'windows-1250',
    'sl' => 'windows-1250',
    'sq' => 'utf-8',
    'sr' => 'utf-8',
    'sv' => '',
    'th' => 'iso-8859-11',
    'tr' => 'iso-8859-9',
    'ua' => 'windows-1251',
    'uk' => 'windows-1251',
    'vi' => 'utf-8',
    'vn' => 'utf-8',
    'zh' => 'big5'
);
616
/**
 * Default constructor.
 * Obtains the Locales service through GeneralUtility::makeInstance() and stores it in $this->locales.
 */
public function __construct() {
    $localesClassName = 'TYPO3\\CMS\\Core\\Localization\\Locales';
    $this->locales = GeneralUtility::makeInstance($localesClassName);
}
623
/**
 * Normalizes a charset name: lowercases it, strips surrounding whitespace and
 * replaces a known alias (see $this->synonyms) by its canonical name.
 *
 * @param string $charset Input charset
 * @return string Normalized charset
 * @todo Define visibility
 */
public function parse_charset($charset) {
    $normalized = trim(strtolower($charset));
    return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
}
638
/**
 * Get the charset of a locale.
 *
 * ln language
 * ln_CN language / country
 * ln_CN.cs language / country / charset
 * ln_CN.cs@mod language / country / charset / modifier
 *
 * Resolution order: exact match in $this->locale_to_charset, explicit charset
 * part of the locale, the "euro" modifier, the script of the language, and
 * finally the platform default ('windows-1252' on Windows, 'utf-8' otherwise).
 *
 * @param string $locale Locale string
 * @return string Charset resolved for locale string
 * @todo Define visibility
 */
public function get_locale_charset($locale) {
    $locale = strtolower($locale);
    // Exact locale specific charset? The input was just lowercased, so table keys are
    // compared case-insensitively; otherwise mixed-case keys like "sr@Latn" never match.
    foreach ($this->locale_to_charset as $knownLocale => $knownCharset) {
        if (strtolower($knownLocale) === $locale) {
            return $knownCharset;
        }
    }
    // Get modifier (array_pad() avoids undefined offsets when a part is missing)
    list($locale, $modifier) = array_pad(explode('@', $locale), 2, '');
    // Locale contains charset: use it
    list($locale, $charset) = array_pad(explode('.', $locale), 2, '');
    if ($charset) {
        return $this->parse_charset($charset);
    }
    // Modifier is 'euro' (after charset check, because of xx.utf-8@euro)
    if ($modifier === 'euro') {
        return 'iso-8859-15';
    }
    // Get language (the country part is not needed)
    list($language) = explode('_', $locale);
    $script = isset($this->lang_to_script[$language]) ? $this->lang_to_script[$language] : '';
    if (TYPO3_OS == 'WIN') {
        $scriptToCharset = $this->script_to_charset_windows;
        $fallbackCharset = 'windows-1252';
    } else {
        $scriptToCharset = $this->script_to_charset_unix;
        $fallbackCharset = 'utf-8';
    }
    // Unknown scripts and empty table entries resolve to the platform default
    return !empty($scriptToCharset[$script]) ? $scriptToCharset[$script] : $fallbackCharset;
}
680
681 /********************************************
682 *
683 * Charset Conversion functions
684 *
685 ********************************************/
/**
 * Convert from one charset to another charset.
 *
 * If $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'] names a PHP extension
 * ('mbstring', 'iconv' or 'recode') and that extension is loaded, it is tried first.
 * When no extension is configured or available, or the extension reports failure,
 * the conversion falls back to the built-in tables with UTF-8 as intermediate charset.
 *
 * @param string $str Input string
 * @param string $fromCS From charset (the current charset of the string)
 * @param string $toCS To charset (the output charset wanted)
 * @param boolean $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Converted string
 * @see convArray()
 * @todo Define visibility
 */
public function conv($str, $fromCS, $toCS, $useEntityForNoChar = 0) {
    if ($fromCS == $toCS) {
        return $str;
    }
    // PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
    if ($toCS == 'utf-8' || !$useEntityForNoChar) {
        $convMethod = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'])
            ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']
            : '';
        // Stays FALSE when the configured extension is missing or could not convert
        $conv_str = FALSE;
        switch ($convMethod) {
            case 'mbstring':
                if (function_exists('mb_convert_encoding')) {
                    $conv_str = mb_convert_encoding($str, $toCS, $fromCS);
                }
                break;
            case 'iconv':
                if (function_exists('iconv')) {
                    $conv_str = iconv($fromCS, $toCS . '//TRANSLIT', $str);
                }
                break;
            case 'recode':
                if (function_exists('recode_string')) {
                    $conv_str = recode_string($fromCS . '..' . $toCS, $str);
                }
                break;
        }
        if (FALSE !== $conv_str) {
            return $conv_str;
        }
    }
    // Built-in conversion, using UTF-8 as the pivot charset
    if ($fromCS != 'utf-8') {
        $str = $this->utf8_encode($str, $fromCS);
    }
    if ($toCS != 'utf-8') {
        $str = $this->utf8_decode($str, $toCS, $useEntityForNoChar);
    }
    return $str;
}
733
/**
 * Converts every string element of an array from one charset to another.
 * Nested arrays are processed recursively; elements of any other type are left untouched.
 * NOTICE: The array is passed by reference and modified in place!
 *
 * @param array $array Input array, possibly multidimensional
 * @param string $fromCS From charset (the current charset of the strings)
 * @param string $toCS To charset (the output charset wanted)
 * @param boolean $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return void
 * @see conv()
 * @todo Define visibility
 */
public function convArray(&$array, $fromCS, $toCS, $useEntityForNoChar = 0) {
    foreach ($array as $index => $item) {
        if (is_string($item)) {
            $array[$index] = $this->conv($item, $fromCS, $toCS, $useEntityForNoChar);
        } elseif (is_array($item)) {
            // Recurse on the original slot so that the nested array is changed in place
            $this->convArray($array[$index], $fromCS, $toCS, $useEntityForNoChar);
        }
    }
}
755
/**
 * Converts $str from $charset to UTF-8
 *
 * Bytes up to 127 are copied unchanged (except in two-byte charsets); all other
 * characters are looked up in the parsed conversion table of the charset.
 * Characters without a table entry are replaced by chr($this->noCharByteVal).
 *
 * @param string $str String in local charset to convert to UTF-8
 * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return string Output string, converted to UTF-8. Empty string if the conversion table for $charset could not be initialized.
 * @todo Define visibility
 */
public function utf8_encode($str, $charset) {
    if ($charset === 'utf-8') {
        return $str;
    }
    // Parse conv. table if not already done. Without a table nothing can be
    // converted, so return an empty string instead of an implicit NULL.
    if (!$this->initCharset($charset)) {
        return '';
    }
    $str = (string)$str;
    $strLen = strlen($str);
    $outStr = '';
    $isTwoByteSet = isset($this->twoByteSets[$charset]);
    $isEucBasedSet = isset($this->eucBasedSets[$charset]);
    // Traverse each char in string
    for ($a = 0; $a < $strLen; $a++) {
        $chr = $str[$a];
        $ord = ord($chr);
        if ($isTwoByteSet) {
            // The charset has two bytes per char; assume big endian.
            // A dangling last byte is paired with 0 instead of reading past the end of the string.
            $ord2 = $a + 1 < $strLen ? ord($str[$a + 1]) : 0;
            $ord = $ord << 8 | $ord2;
            $a++;
        } elseif ($ord > 127) {
            // EUC uses two bytes above 127; we get both, advance the pointer and make $ord a 16bit int.
            // Shift-JIS: chars between 160 and 223 are single byte
            if ($isEucBasedSet && ($charset != 'shift_jis' || ($ord < 160 || $ord > 223))) {
                $a++;
                $ord2 = $a < $strLen ? ord($str[$a]) : 0;
                $ord = $ord * 256 + $ord2;
            }
        } else {
            // 7-bit ASCII is identical in UTF-8
            $outStr .= $chr;
            continue;
        }
        // If the local char-number was found in the parsed conv. table then we use that,
        // otherwise the "no char" replacement byte
        if (isset($this->parsedCharsets[$charset]['local'][$ord])) {
            $outStr .= $this->parsedCharsets[$charset]['local'][$ord];
        } else {
            $outStr .= chr($this->noCharByteVal);
        }
    }
    return $outStr;
}
814
/**
 * Converts $str from UTF-8 to $charset
 *
 * Bytes up to 127 are copied unchanged. Multibyte sequences are looked up in the
 * parsed conversion table of the charset. Sequences without a table entry become
 * either a numeric entity or chr($this->noCharByteVal), depending on $useEntityForNoChar.
 *
 * @param string $str String in UTF-8 to convert to local charset
 * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param boolean $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Output string, converted to local charset. Empty string if the conversion table for $charset could not be initialized.
 * @todo Define visibility
 */
public function utf8_decode($str, $charset, $useEntityForNoChar = 0) {
    if ($charset === 'utf-8') {
        return $str;
    }
    // Parse conv. table if not already done. Without a table nothing can be
    // converted, so return an empty string instead of an implicit NULL.
    if (!$this->initCharset($charset)) {
        return '';
    }
    $strLen = strlen($str);
    $outStr = '';
    // Traverse each char in UTF-8 string
    for ($a = 0; $a < $strLen; $a++) {
        $chr = substr($str, $a, 1);
        $ord = ord($chr);
        if ($ord <= 127) {
            // Plain ASCII
            $outStr .= $chr;
            continue;
        }
        // A first byte of a multibyte sequence must have the 7th bit set.
        // Otherwise we are in the middle of a byte sequence.
        if (!($ord & 64)) {
            $outStr .= chr($this->noCharByteVal);
            continue;
        }
        // Add first byte
        $buf = $chr;
        // For each byte in multibyte string
        for ($b = 0; $b < 8; $b++) {
            // Shift it left and test the 8th bit - if that is set, then there are still bytes in sequence.
            $ord = $ord << 1;
            if (!($ord & 128)) {
                break;
            }
            $a++;
            // ... and add the next char.
            $buf .= substr($str, $a, 1);
        }
        if (isset($this->parsedCharsets[$charset]['utf8'][$buf])) {
            // The local number
            $mByte = $this->parsedCharsets[$charset]['utf8'][$buf];
            // If the local number is greater than 255 we need to split it (16bit word assumed) in two chars.
            if ($mByte > 255) {
                $outStr .= chr(($mByte >> 8 & 255)) . chr(($mByte & 255));
            } else {
                $outStr .= chr($mByte);
            }
        } elseif ($useEntityForNoChar) {
            // Create num entity:
            $outStr .= '&#' . $this->utf8CharToUnumber($buf, 1) . ';';
        } else {
            $outStr .= chr($this->noCharByteVal);
        }
    }
    return $outStr;
}
883
/**
 * Replaces every multibyte UTF-8 character (bytes > 127) by a hexadecimal
 * numeric entity. A continuation byte that appears without a lead byte is
 * replaced by chr($this->noCharByteVal).
 *
 * @param string $str Input string
 * @return string Output string
 * @todo Define visibility
 */
public function utf8_to_entities($str) {
    $length = strlen($str);
    $result = '';
    $pos = 0;
    while ($pos < $length) {
        $byte = substr($str, $pos, 1);
        $code = ord($byte);
        if ($code <= 127) {
            // Single byte character, copied as is
            $result .= $byte;
        } elseif (!($code & 64)) {
            // The 7th bit is not set: this is not a lead byte, we are in the middle of a sequence
            $result .= chr($this->noCharByteVal);
        } else {
            // Lead byte: every further high bit that is set announces one more byte of the sequence
            $sequence = $byte;
            for ($shift = 0; $shift < 8; $shift++) {
                $code = $code << 1;
                if (!($code & 128)) {
                    break;
                }
                $pos++;
                $sequence .= substr($str, $pos, 1);
            }
            $result .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
        }
        $pos++;
    }
    return $result;
}
928
/**
 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
 *
 * Hexadecimal entities are accepted with a lowercase or uppercase "x". Malformed
 * numeric entities (e.g. "&#;" or "&#abc;") and unknown named entities are left
 * untouched.
 *
 * @param string $str Input string, UTF-8
 * @param boolean $alsoStdHtmlEnt If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
 * @return string Output string
 * @todo Define visibility
 */
public function entities_to_utf8($str, $alsoStdHtmlEnt = FALSE) {
    // Workaround for #39287: 3rd parameter for get_html_translation_table() was only added in PHP 5.3.4 and later
    // see http://php.net/manual/en/function.get-html-translation-table.php
    $applyPhpCompatibilityFix = version_compare(phpversion(), '5.3.4', '<');

    $trans_tbl = array();
    if ($alsoStdHtmlEnt) {
        if ($applyPhpCompatibilityFix === TRUE) {
            $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT));
        } else {
            $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8'));
        }
    }
    // Split at entities: even indexes hold the text in between, odd indexes hold the
    // entity body (the part between "&" and ";").
    $parts = preg_split('/&([#[:alnum:]]*);/', $str, -1, PREG_SPLIT_DELIM_CAPTURE);
    if ($parts === FALSE) {
        // PCRE failure: return the input unchanged rather than losing content
        return $str;
    }
    foreach ($parts as $k => $v) {
        // Only take every second element
        if ($k % 2 === 0) {
            continue;
        }
        $entity = '&' . $v . ';';
        if (substr($v, 0, 1) === '#') {
            // Dec or hex entities; $codePoint stays NULL for malformed numbers
            $number = (string)substr($v, 1);
            $codePoint = NULL;
            if (preg_match('/^[xX]([0-9a-fA-F]+)$/D', $number, $hexMatch)) {
                $codePoint = hexdec($hexMatch[1]);
            } elseif (preg_match('/^[0-9]+$/D', $number)) {
                $codePoint = (int)$number;
            }
            $parts[$k] = $codePoint === NULL ? $entity : $this->UnumberToChar($codePoint);
        } elseif ($alsoStdHtmlEnt && isset($trans_tbl[$entity])) {
            // Other entities:
            $v = $trans_tbl[$entity];
            if ($applyPhpCompatibilityFix === TRUE) {
                // Without the charset parameter the table is not UTF-8, so convert from iso-8859-1
                $v = $this->utf8_encode($v, 'iso-8859-1');
            }
            $parts[$k] = $v;
        } else {
            // No conversion:
            $parts[$k] = $entity;
        }
    }
    return implode('', $parts);
}
980
/**
 * Splits a UTF-8 string into its characters and returns them as an array,
 * either as UNICODE integer numbers or as the UTF-8 characters themselves.
 * A continuation byte that appears without a lead byte yields $this->noCharByteVal.
 *
 * @param string $str Input string, UTF-8
 * @param boolean $convEntities If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
 * @param boolean $retChar If set, then instead of integer numbers the real UTF-8 char is returned.
 * @return array Output array with the char numbers
 * @todo Define visibility
 */
public function utf8_to_numberarray($str, $convEntities = 0, $retChar = 0) {
    // Resolve entities first, so that they count as single characters
    if ($convEntities) {
        $str = $this->entities_to_utf8($str, 1);
    }
    $length = strlen($str);
    $result = array();
    $pos = 0;
    while ($pos < $length) {
        $byte = substr($str, $pos, 1);
        $code = ord($byte);
        if ($code <= 127) {
            // Single byte character
            $result[] = $retChar ? chr($code) : $code;
        } elseif (!($code & 64)) {
            // The 7th bit is not set: this is not a lead byte, we are in the middle of a sequence
            $result[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
        } else {
            // Lead byte: every further high bit that is set announces one more byte of the sequence
            $sequence = $byte;
            for ($shift = 0; $shift < 8; $shift++) {
                $code = $code << 1;
                if (!($code & 128)) {
                    break;
                }
                $pos++;
                $sequence .= substr($str, $pos, 1);
            }
            $result[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
        }
        $pos++;
    }
    return $result;
}
1032
/**
 * Converts a UNICODE number to a UTF-8 multibyte character
 * Algorithm based on script found at From: http://czyborra.com/utf/
 *
 * The binary representation of the character's integer value is spread across the bytes
 * and the number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
 *
 * bytes | bits | representation
 * 1 | 7 | 0vvvvvvv
 * 2 | 11 | 110vvvvv 10vvvvvv
 * 3 | 16 | 1110vvvv 10vvvvvv 10vvvvvv
 * 4 | 21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 * 5 | 26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 * 6 | 31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * Numbers that do not fit into 31 bits yield chr($this->noCharByteVal).
 *
 * @param integer $cbyte UNICODE integer
 * @return string UTF-8 multibyte character string
 * @see utf8CharToUnumber()
 * @todo Define visibility
 */
public function UnumberToChar($cbyte) {
    if ($cbyte < 128) {
        return chr($cbyte);
    }
    // Each row: exclusive upper limit, marker bits of the lead byte, number of continuation bytes
    $layouts = array(
        array(2048, 192, 1),
        array(65536, 224, 2),
        array(2097152, 240, 3),
        array(67108864, 248, 4),
        array(2147483648, 252, 5)
    );
    foreach ($layouts as $layout) {
        list($upperLimit, $leadMarker, $continuationBytes) = $layout;
        if ($cbyte < $upperLimit) {
            // Lead byte carries the topmost bits, every continuation byte the next 6 bits
            $char = chr($leadMarker | $cbyte >> 6 * $continuationBytes);
            for ($i = $continuationBytes - 1; $i >= 0; $i--) {
                $char .= chr(128 | $cbyte >> 6 * $i & 63);
            }
            return $char;
        }
    }
    // Cannot express a 32-bit character in UTF-8
    return chr($this->noCharByteVal);
}
1099
/**
 * Decodes a single UTF-8 multibyte character into its UNICODE number.
 * Unit-tested by Kasper
 *
 * Only the first character of $str is looked at. The value is assembled as a string
 * of binary digits: the payload bits of the lead byte followed by the lower six bits
 * of every continuation byte announced by the lead byte.
 *
 * @param string $str UTF-8 multibyte character string
 * @param boolean $hex If set, the number is returned as a hex string prefixed with "x"
 * @return integer|string UNICODE integer, or "x" followed by the hex digits if $hex is set
 * @see UnumberToChar()
 * @todo Define visibility
 */
public function utf8CharToUnumber($str, $hex = 0) {
	$leadByte = ord(substr($str, 0, 1));
	if (($leadByte & 192) != 192) {
		// Not the start of a multibyte sequence: the byte value is the number
		$codePoint = $leadByte;
	} else {
		$shifted = $leadByte;
		$payloadBits = '';
		$followers = 0;
		while ($followers < 8) {
			// Move the next length bit of the lead byte into the 8th bit position
			$shifted = $shifted << 1;
			if (!($shifted & 128)) {
				// Length bits exhausted, no more bytes in this sequence
				break;
			}
			// Take the lower six bits of the next continuation byte
			$payloadBits .= sprintf('%06b', ord(substr($str, $followers + 1, 1)) & 63);
			$followers++;
		}
		// The more continuation bytes there are, the fewer payload bits the lead byte carries
		$leadBits = substr('00000000' . decbin($leadByte), -(6 - $followers));
		$codePoint = bindec($leadBits . $payloadBits);
	}
	return $hex ? 'x' . dechex($codePoint) : $codePoint;
}
1134
1135 /********************************************
1136 *
1137 * Init functions
1138 *
1139 ********************************************/
/**
 * Initializes a charset for use if a conversion table for it exists in the
 * 'typo3/sysext/core/Resources/Private/Charsets/csconvtbl/' folder.
 * This function is automatically called by the conversion functions.
 *
 * The parsed table is stored in $this->parsedCharsets[$charset] with two maps:
 * 'local' (byte value => UTF-8 character) and 'utf8' (UTF-8 character => byte value).
 * Only byte values above 127 are stored. The parsed table is cached in typo3temp/cs/.
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param string $charset The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return integer Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and loaded (from cache or by parsing).
 * @access private
 * @todo Define visibility
 */
public function initCharset($charset) {
	// Only process if the charset is not yet loaded (isset() avoids a notice on first use)
	if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
		return 1;
	}
	// Conversion table filename:
	$charsetConvTableFile = \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/csconvtbl/' . $charset . '.tbl';
	if (!($charset && GeneralUtility::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile))) {
		return FALSE;
	}
	// Cache file for charsets:
	// Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
	$cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/cs/charset_' . $charset . '.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$cachedTable = unserialize(GeneralUtility::getUrl($cacheFile));
		if (is_array($cachedTable)) {
			$this->parsedCharsets[$charset] = $cachedTable;
			return 2;
		}
		// The cache file is unreadable or corrupt: fall through and rebuild it from the source table
	}
	// The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
	$whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
	// Parse conversion table into lines:
	$lines = GeneralUtility::trimExplode(LF, GeneralUtility::getUrl($charsetConvTableFile), TRUE);
	// Initialize the internal variable holding the conv. table:
	$this->parsedCharsets[$charset] = array('local' => array(), 'utf8' => array());
	$detectedType = '';
	foreach ($lines as $value) {
		// Comment lines and blanks are ignored.
		if (!trim($value) || substr($value, 0, 1) == '#') {
			continue;
		}
		// Detect the table syntax on the first real line
		if (!$detectedType) {
			$detectedType = preg_match($whitespacedPattern, $value) ? 'whitespaced' : 'ms-token';
		}
		if ($detectedType == 'ms-token') {
			// Pad so that a line without "=" / ":" does not raise an undefined offset notice
			list($hexbyte, $utf8) = array_pad(preg_split('/[=:]/', $value, 3), 2, '');
		} else {
			$regA = array();
			if (!preg_match($whitespacedPattern, $value, $regA)) {
				// A line that does not follow the detected syntax carries no mapping
				continue;
			}
			$hexbyte = $regA[1];
			$utf8 = 'U+' . $regA[2];
		}
		$decval = hexdec(trim($hexbyte));
		if ($decval > 127) {
			// Strip the leading "U+" before converting the code point
			$utf8decval = hexdec(substr(trim($utf8), 2));
			$utf8Char = $this->UnumberToChar($utf8decval);
			$this->parsedCharsets[$charset]['local'][$decval] = $utf8Char;
			$this->parsedCharsets[$charset]['utf8'][$utf8Char] = $decval;
		}
	}
	if ($cacheFile) {
		GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->parsedCharsets[$charset]));
	}
	return 2;
}
1206
/**
 * This function initializes all UTF-8 character data tables.
 *
 * Unless a requested table is already loaded or can be taken from its cache file,
 * both tables are rebuilt from the Unicode data files shipped with the core extension
 * (UnicodeData.txt, SpecialCasing.txt and the custom Translit.txt):
 * - $this->caseFolding['utf-8'] with the maps 'toUpper', 'toLower' and 'toTitle'
 * - $this->toASCII['utf-8'] mapping UTF-8 characters to ASCII replacement strings
 * Both tables are then written to their cache files in typo3temp/cs/.
 *
 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
 *
 * @param string $mode Mode ("case", "ascii", ...)
 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 * @todo Define visibility
 */
public function initUnicodeData($mode = NULL) {
	// Cache files
	$cacheFileCase = GeneralUtility::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
	$cacheFileASCII = GeneralUtility::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');
	// Only process if the tables are not yet loaded
	switch ($mode) {
		case 'case':
			if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8'])) {
				return 1;
			}
			// Use cached version if possible; a corrupt cache file triggers a rebuild
			if ($cacheFileCase && @is_file($cacheFileCase)) {
				$cachedTable = unserialize(GeneralUtility::getUrl($cacheFileCase));
				if (is_array($cachedTable)) {
					$this->caseFolding['utf-8'] = $cachedTable;
					return 2;
				}
			}
			break;
		case 'ascii':
			if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8'])) {
				return 1;
			}
			// Use cached version if possible; a corrupt cache file triggers a rebuild
			if ($cacheFileASCII && @is_file($cacheFileASCII)) {
				$cachedTable = unserialize(GeneralUtility::getUrl($cacheFileASCII));
				if (is_array($cachedTable)) {
					$this->toASCII['utf-8'] = $cachedTable;
					return 2;
				}
			}
			break;
	}
	// Process main Unicode data file
	$unicodeDataFile = \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/UnicodeData.txt';
	if (!(GeneralUtility::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
		return FALSE;
	}
	$fh = fopen($unicodeDataFile, 'rb');
	if (!$fh) {
		return FALSE;
	}
	// key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
	// Note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
	$this->caseFolding['utf-8'] = array();
	// a shorthand
	$utf8CaseFolding = &$this->caseFolding['utf-8'];
	$utf8CaseFolding['toUpper'] = array();
	$utf8CaseFolding['toLower'] = array();
	$utf8CaseFolding['toTitle'] = array();
	// Array of temp. decompositions
	$decomposition = array();
	// Array of chars that are marks (eg. composing accents)
	$mark = array();
	// Array of chars that are numbers (eg. digits)
	$number = array();
	// Array of chars to be omitted (eg. Russian hard sign)
	$omit = array();
	while (!feof($fh)) {
		$line = fgets($fh, 4096);
		if ($line === FALSE) {
			// End of file or read error: nothing more to parse
			break;
		}
		if (trim($line) === '') {
			continue;
		}
		// A line has a lot of info; pad it so that short lines do not raise notices
		list($char, $name, $cat, , , $decomp, , , $num, , , , $upper, $lower, $title) = array_pad(explode(';', rtrim($line)), 15, '');
		$ord = hexdec($char);
		if ($ord > 65535) {
			// Only process the BMP
			break;
		}
		$utf8_char = $this->UnumberToChar($ord);
		if ($upper) {
			$utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
		}
		if ($lower) {
			$utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
		}
		// Store "title" only when different from "upper" (only a few)
		if ($title && $title !== $upper) {
			$utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
		}
		// The first letter of the general category tells the major class
		$majorCategory = substr($cat, 0, 1);
		if ($majorCategory === 'M') {
			// mark (accent, umlaut, ...)
			$mark['U+' . $char] = 1;
		} elseif ($majorCategory === 'N' && $ord > 128 && $num !== '') {
			// numeric value
			$number['U+' . $char] = $num;
		}
		// Accented Latin letters without "official" decomposition
		$match = array();
		if (!$decomp && preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match)) {
			$c = ord($match[2]);
			if ($match[1] == 'SMALL') {
				$c += 32;
			}
			$decomposition['U+' . $char] = array(dechex($c));
			continue;
		}
		$match = array();
		if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
			$tag = $match[1];
			$sequence = $match[2];
			if ($tag === '<circle>') {
				// add parenthesis as circle replacement, eg (1)
				$sequence = '0028 ' . $sequence . ' 0029';
			} elseif ($tag === '<square>') {
				// add square brackets as square replacement, eg [1]
				$sequence = '005B ' . $sequence . ' 005D';
			} elseif ($tag === '<compat>') {
				// ignore multi char decompositions that start with a space
				if (strpos($sequence, '0020 ') === 0) {
					continue;
				}
			} elseif (in_array($tag, array('<initial>', '<medial>', '<final>', '<isolated>', '<vertical>'), TRUE)) {
				// These decomposition types are not used for transliteration
				continue;
			}
			$decomposition['U+' . $char] = explode(' ', $sequence);
		}
	}
	fclose($fh);
	// Process additional Unicode data for casing (allow folded characters to expand into a sequence)
	$specialCasingFile = \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/SpecialCasing.txt';
	if (GeneralUtility::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
		$fh = fopen($specialCasingFile, 'rb');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh, 4096);
				if ($line === FALSE) {
					break;
				}
				// Skip blank lines and comment lines
				if (trim($line) === '' || $line[0] === '#') {
					continue;
				}
				list($char, $lower, $title, $upper, $cond) = array_pad(GeneralUtility::trimExplode(';', $line), 5, '');
				// Only unconditional mappings are used (condition empty or just the trailing comment)
				if ($cond !== '' && $cond[0] !== '#') {
					continue;
				}
				$utf8_char = $this->UnumberToChar(hexdec($char));
				// Collect the mappings that actually change the character
				$sequences = array();
				if ($char !== $lower) {
					$sequences['toLower'] = $lower;
				}
				if ($char !== $title && $title !== $upper) {
					$sequences['toTitle'] = $title;
				}
				if ($char !== $upper) {
					$sequences['toUpper'] = $upper;
				}
				foreach ($sequences as $table => $sequence) {
					// Convert the space separated code points into one UTF-8 string
					$folded = '';
					foreach (explode(' ', $sequence) as $codePoint) {
						$folded .= $this->UnumberToChar(hexdec($codePoint));
					}
					$utf8CaseFolding[$table][$utf8_char] = $folded;
				}
			}
			fclose($fh);
		}
	}
	// Process custom decompositions
	$customTranslitFile = \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/Translit.txt';
	if (GeneralUtility::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
		$fh = fopen($customTranslitFile, 'rb');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh, 4096);
				if ($line === FALSE) {
					break;
				}
				if (trim($line) === '' || $line[0] === '#') {
					continue;
				}
				list($char, $translit) = array_pad(GeneralUtility::trimExplode(';', $line), 2, '');
				if (!$translit) {
					// An empty transliteration means the character is dropped
					$omit['U+' . $char] = 1;
				}
				$decomposition['U+' . $char] = explode(' ', $translit);
			}
			fclose($fh);
		}
	}
	// Decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
	foreach ($decomposition as $from => $to) {
		$code_decomp = array();
		while ($code_value = array_shift($to)) {
			if (isset($decomposition['U+' . $code_value])) {
				// Do recursive decomposition: put the replacement in front of the remaining queue
				$to = array_merge($decomposition['U+' . $code_value], $to);
			} elseif (!isset($mark['U+' . $code_value])) {
				// Keep everything that is not a mark
				$code_decomp[] = $code_value;
			}
		}
		if (count($code_decomp) || isset($omit[$from])) {
			$decomposition[$from] = $code_decomp;
		} else {
			unset($decomposition[$from]);
		}
	}
	// Create ascii only mapping
	$this->toASCII['utf-8'] = array();
	$ascii = &$this->toASCII['utf-8'];
	foreach ($decomposition as $from => $to) {
		$asciiChars = '';
		while ($code_value = array_shift($to)) {
			$ord = hexdec($code_value);
			if ($ord > 127) {
				// Skip decompositions containing non-ASCII chars
				continue 2;
			}
			$asciiChars .= chr($ord);
		}
		// Keys have the form "U+XXXX"; strip the prefix so hexdec() only gets hex digits
		$ascii[$this->UnumberToChar(hexdec(substr($from, 2)))] = $asciiChars;
	}
	// Add numeric decompositions
	foreach ($number as $from => $to) {
		$utf8_char = $this->UnumberToChar(hexdec(substr($from, 2)));
		if (!isset($ascii[$utf8_char])) {
			$ascii[$utf8_char] = $to;
		}
	}
	if ($cacheFileCase) {
		GeneralUtility::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
	}
	if ($cacheFileASCII) {
		GeneralUtility::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
	}
	return 3;
}
1449
/**
 * This function initializes the folding table for a charset other than UTF-8.
 * This function is automatically called by the case folding functions.
 *
 * The table is derived from the UTF-8 case folding table: every character of the
 * charset's conversion table is looked up there and the result is converted back to
 * the charset. The plain ASCII letters a-z / A-Z are added afterwards.
 *
 * @param string $charset Charset for which to initialize case folding.
 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 * @todo Define visibility
 */
public function initCaseFolding($charset) {
	// Only process if the case table is not yet loaded:
	if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset])) {
		return 1;
	}
	// Use cached version if possible; a corrupt cache file triggers a rebuild
	$cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/cs/cscase_' . $charset . '.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$cachedTable = unserialize(GeneralUtility::getUrl($cacheFile));
		if (is_array($cachedTable)) {
			$this->caseFolding[$charset] = $cachedTable;
			return 2;
		}
	}
	// init UTF-8 conversion for this charset
	if (!$this->initCharset($charset)) {
		return FALSE;
	}
	// UTF-8 case folding is used as the base conversion table
	if (!$this->initUnicodeData('case')) {
		return FALSE;
	}
	$nochar = chr($this->noCharByteVal);
	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
		// Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);
		foreach (array('toUpper', 'toLower', 'toTitle') as $table) {
			// Most characters have no entry in a given table; skip them without a notice
			if (!isset($this->caseFolding['utf-8'][$table][$utf8])) {
				continue;
			}
			$cc = $this->utf8_decode($this->caseFolding['utf-8'][$table][$utf8], $charset);
			// Only keep mappings that can be expressed in the target charset
			if ($cc != '' && $cc != $nochar) {
				$this->caseFolding[$charset][$table][$c] = $cc;
			}
		}
	}
	// Add the ASCII case table
	for ($i = ord('a'); $i <= ord('z'); $i++) {
		$this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i - 32);
	}
	for ($i = ord('A'); $i <= ord('Z'); $i++) {
		$this->caseFolding[$charset]['toLower'][chr($i)] = chr($i + 32);
	}
	if ($cacheFile) {
		GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->caseFolding[$charset]));
	}
	return 3;
}
1511
/**
 * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
 * This function is automatically called by the ASCII transliteration functions.
 *
 * The table is derived from the UTF-8 to-ASCII table: every character of the charset's
 * conversion table that has an ASCII replacement there is stored under its
 * representation in the charset.
 *
 * @param string $charset Charset for which to initialize conversion.
 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 * @todo Define visibility
 */
public function initToASCII($charset) {
	// Only process if the table is not yet loaded:
	if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) {
		return 1;
	}
	// Use cached version if possible; a corrupt cache file triggers a rebuild
	$cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/cs/csascii_' . $charset . '.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$cachedTable = unserialize(GeneralUtility::getUrl($cacheFile));
		if (is_array($cachedTable)) {
			$this->toASCII[$charset] = $cachedTable;
			return 2;
		}
	}
	// Init UTF-8 conversion for this charset
	if (!$this->initCharset($charset)) {
		return FALSE;
	}
	// UTF-8/ASCII transliteration is used as the base conversion table
	if (!$this->initUnicodeData('ascii')) {
		return FALSE;
	}
	// Start with an empty table so that a charset without any mapping is still
	// registered (and cached) as an array
	$this->toASCII[$charset] = array();
	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
		if (isset($this->toASCII['utf-8'][$utf8])) {
			// Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
			$c = $this->utf8_decode($utf8, $charset);
			$this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
		}
	}
	if ($cacheFile) {
		GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->toASCII[$charset]));
	}
	return 3;
}
1553
1554 /********************************************
1555 *
1556 * String operation functions
1557 *
1558 ********************************************/
/**
 * Returns a part of a string.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * Depending on $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] the work is done by
 * mbstring, by iconv or by the internal implementations for UTF-8, EUC based, two-byte
 * and four-byte charsets. Everything else is treated as a single-byte charset.
 * If $len is omitted, the rest of the string starting at $start is returned.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param integer $start Start position (character position)
 * @param integer $len Length (in characters)
 * @return string The substring
 * @see substr(), mb_substr()
 * @todo Define visibility
 */
public function substr($charset, $string, $start, $len = NULL) {
	if ($len === 0 || $string === '') {
		return '';
	}
	$utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];
	if ($utils == 'mbstring') {
		// The loose comparison is kept on purpose: callers passing '' or FALSE get the rest of the string
		if ($len == NULL) {
			// The total character count as length returns everything from $start on,
			// without having to switch the global internal encoding
			$len = mb_strlen($string, $charset);
		}
		return mb_substr($string, $start, $len, $charset);
	} elseif ($utils == 'iconv') {
		if ($len == NULL) {
			// Same approach as above; avoids the deprecated iconv_set_encoding()
			$len = iconv_strlen($string, $charset);
			if ($len === FALSE) {
				// The string is not valid in $charset; iconv_substr() would fail as well
				return FALSE;
			}
		}
		return iconv_substr($string, $start, $len, $charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_substr($string, $start, $len);
	} elseif (!empty($this->eucBasedSets[$charset])) {
		return $this->euc_substr($string, $start, $charset, $len);
	} elseif (!empty($this->twoByteSets[$charset])) {
		// An omitted length must not be turned into a byte length of 0
		return $len === NULL ? substr($string, $start * 2) : substr($string, $start * 2, $len * 2);
	} elseif (!empty($this->fourByteSets[$charset])) {
		return $len === NULL ? substr($string, $start * 4) : substr($string, $start * 4, $len * 4);
	}
	// Treat everything else as single-byte encoding
	return $len === NULL ? substr($string, $start) : substr($string, $start, $len);
}
1613
/**
 * Counts the number of characters.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * Depending on $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] the work is done by
 * mbstring, by iconv or by the internal implementations. Charsets that are not known
 * as UTF-8, EUC based, two-byte or four-byte are treated as single-byte.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @return integer The number of characters
 * @see strlen()
 * @todo Define visibility
 */
public function strlen($charset, $string) {
	$utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];
	if ($utils == 'mbstring') {
		return mb_strlen($string, $charset);
	} elseif ($utils == 'iconv') {
		return iconv_strlen($string, $charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_strlen($string);
	} elseif (!empty($this->eucBasedSets[$charset])) {
		return $this->euc_strlen($string, $charset);
	} elseif (!empty($this->twoByteSets[$charset])) {
		// Count complete characters only, so the result stays an integer
		return intval(strlen($string) / 2);
	} elseif (!empty($this->fourByteSets[$charset])) {
		return intval(strlen($string) / 4);
	}
	// Treat everything else as single-byte encoding
	return strlen($string);
}
1641
/**
 * Crops a string with the help of the mbstring functions.
 *
 * A positive $len keeps the first $len characters and appends $crop, a negative $len
 * keeps the last abs($len) characters and prepends $crop. The string is returned
 * unchanged if $len is 0 or if the string is not longer than abs($len) characters.
 *
 * @param string $charset The character set
 * @param string $string String to be cropped
 * @param integer $len Crop length (in characters)
 * @param string $crop Crop signifier
 * @return string The shortened string
 * @see mb_strlen(), mb_substr()
 */
protected function cropMbstring($charset, $string, $len, $crop = '') {
	if (intval($len) === 0) {
		return $string;
	}
	$characterCount = mb_strlen($string, $charset);
	if ($characterCount <= abs($len)) {
		// Nothing to cut off
		return $string;
	}
	if ($len > 0) {
		return mb_substr($string, 0, $len, $charset) . $crop;
	}
	// Negative length: keep the end of the string
	return $crop . mb_substr($string, $len, $characterCount, $charset);
}
1663
/**
 * Truncates a string and pre-/appends a string.
 * Unit tested by Kasper
 *
 * A positive $len keeps the first $len characters and appends $crop, a negative $len
 * keeps the last abs($len) characters and prepends $crop. The string is returned
 * unchanged if $len is 0 or if there is nothing to cut off.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param integer $len Length (in characters)
 * @param string $crop Crop signifier
 * @return string The shortened string
 * @see substr(), mb_strimwidth()
 * @todo Define visibility
 */
public function crop($charset, $string, $len, $crop = '') {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return $this->cropMbstring($charset, $string, $len, $crop);
	}
	if (intval($len) == 0) {
		return $string;
	}
	// Find the byte position at which the string has to be cut
	if ($charset == 'utf-8') {
		$i = $this->utf8_char2byte_pos($string, $len);
	} elseif (!empty($this->eucBasedSets[$charset])) {
		$i = $this->euc_char2byte_pos($string, $len, $charset);
	} elseif ($len > 0) {
		$i = $len;
	} else {
		$i = strlen($string) + $len;
		if ($i <= 0) {
			$i = FALSE;
		}
	}
	// $len outside actual string length
	if ($i === FALSE) {
		return $string;
	}
	$byteLength = strlen($string);
	if ($len > 0) {
		// Only crop if there is something behind the cut position.
		// Compared against the length instead of reading a possibly non-existing offset.
		if ($i >= 0 && $i < $byteLength) {
			return substr($string, 0, $i) . $crop;
		}
	} elseif ($i >= 1 && $i <= $byteLength) {
		// Only crop if there is something in front of the cut position
		return $crop . substr($string, $i);
	}
	return $string;
}
1713
/**
 * Cuts a string short at a given byte length.
 *
 * The cut is moved backwards where needed so that no multibyte character is split.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param integer $len The byte length
 * @return string The shortened string
 * @see mb_strcut()
 * @todo Define visibility
 */
public function strtrunc($charset, $string, $len) {
	if ($len <= 0) {
		return '';
	}
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strcut($string, 0, $len, $charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_strtrunc($string, $len);
	} elseif (!empty($this->eucBasedSets[$charset])) {
		return $this->euc_strtrunc($string, $len, $charset);
	} elseif (!empty($this->twoByteSets[$charset])) {
		// Realign to a position divisible by two
		$len -= $len % 2;
	} elseif (!empty($this->fourByteSets[$charset])) {
		// Realign to a position divisible by four
		$len -= $len % 4;
	}
	// Treat everything else as single-byte encoding
	return substr($string, 0, $len);
}
1746
/**
 * Translates all characters of a string into their respective case values.
 * Unlike strtolower() and strtoupper() this method is locale independent.
 * Note that the string length may change!
 * eg. lower case German "ß" (sharp S) becomes upper case "SS"
 * Unit-tested by Kasper
 * Real case folding is language dependent, this method ignores this fact.
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert case for
 * @param string $case Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
 * @return string The converted string
 * @see strtolower(), strtoupper()
 * @todo Define visibility
 */
public function conv_case($charset, $string, $case) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return $case == 'toLower' ? mb_strtolower($string, $charset) : mb_strtoupper($string, $charset);
	}
	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string, 'case', $case);
	}
	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string, $charset, 'case', $case);
	}
	// Treat everything else as single-byte encoding
	return $this->sb_char_mapping($string, $charset, 'case', $case);
}
1779
/**
 * Equivalent of lcfirst/ucfirst but using character set.
 *
 * Only the first character of the string is case converted, the rest is left untouched.
 *
 * @param string $charset Character set of string
 * @param string $string Input string whose first character is converted
 * @param string $case Case keyword, passed on to conv_case()
 * @return string The string with its first character converted
 * @see \TYPO3\CMS\Core\Charset\CharsetConverter::conv_case()
 */
public function convCaseFirst($charset, $string, $case) {
	$convertedInitial = $this->conv_case($charset, $this->substr($charset, $string, 0, 1), $case);
	return $convertedInitial . $this->substr($charset, $string, 1);
}
1795
/**
 * Converts special chars (like æøåÆØÅ, umlauts etc) to ascii equivalents (usually double-bytes, like æ => ae etc.)
 *
 * The mapping method is chosen by charset: UTF-8, EUC based sets, and single-byte
 * handling for everything else.
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert
 * @return string The converted string
 * @todo Define visibility
 */
public function specCharsToASCII($charset, $string) {
	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string, 'ascii');
	}
	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string, $charset, 'ascii');
	}
	// Treat everything else as single-byte encoding
	return $this->sb_char_mapping($string, $charset, 'ascii');
}
1815
/**
 * Converts the language codes that we get from the client (usually HTTP_ACCEPT_LANGUAGE)
 * into a TYPO3-readable language code
 *
 * The client languages are tried in the order of their quality value ("q"), languages
 * with the same quality in the order the client listed them. For every language the
 * full code is tried first, then the code without the country part.
 *
 * @param string $languageCodesList List of language codes. something like 'de,en-us;q=0.9,de-de;q=0.7,es-cl;q=0.6,en;q=0.4,es;q=0.3,zh;q=0.1'
 * @return string A preferred language that TYPO3 supports, or "default" if none found
 */
public function getPreferredClientLanguage($languageCodesList) {
	$allLanguageCodes = array();
	// Get all languages where TYPO3 code is the same as the ISO code
	foreach (array_keys($this->charSetArray) as $typo3Lang) {
		$allLanguageCodes[$typo3Lang] = $typo3Lang;
	}
	// Get all languages where TYPO3 code differs from ISO code
	// or needs the country part
	// the iso codes will here overwrite the default typo3 language in the key
	foreach ($this->locales->getIsoMapping() as $typo3Lang => $isoLang) {
		$allLanguageCodes[$typo3Lang] = str_replace('_', '-', $isoLang);
	}
	// Move the iso codes to the keys (because we're comparing the keys with "isset" later on)
	$allLanguageCodes = array_flip($allLanguageCodes);
	// Collect the quality of every language sent by the client
	$qualityByLanguage = array();
	foreach (GeneralUtility::trimExplode(',', $languageCodesList) as $preferredLanguage) {
		$quality = 1.0;
		if (strpos($preferredLanguage, ';q=') !== FALSE) {
			list($preferredLanguage, $quality) = explode(';q=', $preferredLanguage);
		}
		$qualityByLanguage[$preferredLanguage] = (float) $quality;
	}
	// Order by quality, highest first. The position is used as tie breaker because
	// the sort functions do not keep the order of equal elements before PHP 8.0.
	$rankedLanguages = array();
	foreach ($qualityByLanguage as $languageCode => $quality) {
		$rankedLanguages[] = array(
			'code' => (string) $languageCode,
			'quality' => $quality,
			'position' => count($rankedLanguages)
		);
	}
	usort($rankedLanguages, function ($a, $b) {
		if ($a['quality'] == $b['quality']) {
			return $a['position'] - $b['position'];
		}
		return $a['quality'] > $b['quality'] ? -1 : 1;
	});
	// Loop through the languages, with the highest priority first
	$selectedLanguage = 'default';
	foreach ($rankedLanguages as $rankedLanguage) {
		$languageCode = $rankedLanguage['code'];
		if (isset($allLanguageCodes[$languageCode])) {
			$selectedLanguage = $allLanguageCodes[$languageCode];
			break;
		}
		// Strip the country code from the end
		$codeParts = explode('-', $languageCode);
		if (isset($allLanguageCodes[$codeParts[0]])) {
			$selectedLanguage = $allLanguageCodes[$codeParts[0]];
			break;
		}
	}
	if (!$selectedLanguage || $selectedLanguage == 'en') {
		$selectedLanguage = 'default';
	}
	return $selectedLanguage;
}
1868
1869 /********************************************
1870 *
1871 * Internal string operation functions
1872 *
1873 ********************************************/
/**
 * Maps all characters of a string in a single byte charset.
 *
 * Every byte of the string is looked up in the case folding table or in the to-ASCII
 * table of the charset; bytes without an entry are copied unchanged. The string is
 * returned unchanged if the table cannot be initialized or the mode is unknown.
 *
 * @param string $str The string
 * @param string $charset The charset
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string
 * @todo Define visibility
 */
public function sb_char_mapping($str, $charset, $mode, $opt = '') {
	switch ($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) {
				return $str;
			}
			// Read the table without a reference, so an unknown $opt does not create an entry
			$map = isset($this->caseFolding[$charset][$opt]) ? $this->caseFolding[$charset][$opt] : array();
			break;
		case 'ascii':
			if (!$this->initToASCII($charset)) {
				return $str;
			}
			$map = isset($this->toASCII[$charset]) ? $this->toASCII[$charset] : array();
			break;
		default:
			return $str;
	}
	$str = (string) $str;
	$byteLength = strlen($str);
	$out = '';
	// Iterate up to the known length instead of reading past the end of the string
	for ($i = 0; $i < $byteLength; $i++) {
		$c = $str[$i];
		$out .= isset($map[$c]) ? $map[$c] : $c;
	}
	return $out;
}
1914
1915 /********************************************
1916 *
1917 * Internal UTF-8 string operation functions
1918 *
1919 ********************************************/
/**
 * Returns a part of a UTF-8 string.
 * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
 *
 * Character positions are translated into byte positions with utf8_char2byte_pos().
 *
 * @param string $str UTF-8 string
 * @param integer $start Start position (character position)
 * @param integer $len Length (in characters)
 * @return string The substring, or FALSE if a positive $start is outside the string
 * @see substr()
 * @todo Define visibility
 */
public function utf8_substr($str, $start, $len = NULL) {
	// A length of 0 (as integer or string) gives an empty result. NULL means "no length"
	// and is not passed to strcmp().
	if ($len !== NULL && !strcmp($len, '0')) {
		return '';
	}
	$byte_start = $this->utf8_char2byte_pos($str, $start);
	if ($byte_start === FALSE) {
		if ($start > 0) {
			// $start outside string length
			return FALSE;
		}
		// A negative $start that reaches before the beginning means: start at the beginning
		$byte_start = 0;
	}
	$str = substr($str, $byte_start);
	if ($len == NULL) {
		// No length given: everything from $start on
		return $str;
	}
	$byte_end = $this->utf8_char2byte_pos($str, $len);
	if ($byte_end === FALSE) {
		// $len outside actual string length: a negative length that exceeds the
		// string gives a blank string, a positive one the whole rest
		return $len < 0 ? '' : $str;
	}
	return substr($str, 0, $byte_end);
}
1958
1959 /**
1960 * Counts the number of characters of a string in UTF-8.
1961 * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
1962 *
1963 * @param string $str UTF-8 multibyte character string
1964 * @return integer The number of characters
1965 * @see strlen()
1966 * @todo Define visibility
1967 */
public function utf8_strlen($str) {
	$str = (string) $str;
	$n = 0;
	// Iterate by byte length; reading $str[$i] past the end of the string
	// to detect termination raises an "Uninitialized string offset" notice.
	for ($i = 0, $length = strlen($str); $i < $length; $i++) {
		// Count every byte that is not a continuation byte (10xxxxxx), i.e.
		// single-byte characters (0xxxxxxx) and multi-byte starting bytes
		// (11xxxxxx). Each character has exactly one such byte.
		if ((ord($str[$i]) & 192) !== 128) {
			$n++;
		}
	}
	return $n;
}
1982
1983 /**
1984 * Truncates a string in UTF-8 short at a given byte length.
1985 *
1986 * @param string $str UTF-8 multibyte character string
1987 * @param integer $len The byte length
1988 * @return string The shortened string
1989 * @see mb_strcut()
1990 * @todo Define visibility
1991 */
public function utf8_strtrunc($str, $len) {
	$str = (string) $str;
	// There is no byte at the cut position to inspect when the requested
	// length is not positive or lies beyond the string. Indexing $str[$len - 1]
	// in that case would read out of range (or, with a negative offset on
	// newer PHP versions, from the end of the string).
	if ($len <= 0 || $len > strlen($str)) {
		return substr($str, 0, $len);
	}
	$i = $len - 1;
	if (!(ord($str[$i]) & 128)) {
		// The last byte kept is a single-byte character: plain cut
		return substr($str, 0, $len);
	}
	// The last byte kept is part of a multibyte sequence: walk back over
	// continuation bytes (10xxxxxx) to the byte that starts the sequence.
	while ($i > 0 && (ord($str[$i]) & 192) === 128) {
		$i--;
	}
	$mbs = ord($str[$i]);
	if (($mbs & 192) === 128) {
		// Sanity check: only continuation bytes up to the very beginning
		return '';
	}
	// Calculate the number of bytes of the sequence from the count of leading
	// 1-bits of its first byte (zero for a single-byte character, which means
	// the continuation bytes after it were stray and nothing needs protecting).
	for ($bc = 0; $mbs & 128; $mbs = $mbs << 1) {
		$bc++;
	}
	if ($i + $bc > $len) {
		// The sequence does not fit completely: cut in front of it
		return substr($str, 0, $i);
	}
	// The sequence ends at or before the cut. This includes a character
	// starting at byte 0, which must not be discarded.
	return substr($str, 0, $len);
}
2013
2014 /**
2015 * Find position of first occurrence of a string, both arguments are in UTF-8.
2016 *
2017 * @param string $haystack UTF-8 string to search in
2018 * @param string $needle UTF-8 string to search for
2019 * @param integer $offset Position to start the search
2020 * @return integer The character position
2021 * @see strpos()
2022 * @todo Define visibility
2023 */
public function utf8_strpos($haystack, $needle, $offset = 0) {
	// Delegate to a native multibyte implementation when one is configured
	$utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];
	if ($utils == 'mbstring') {
		return mb_strpos($haystack, $needle, $offset, 'utf-8');
	}
	if ($utils == 'iconv') {
		return iconv_strpos($haystack, $needle, $offset, 'utf-8');
	}
	// Fallback: translate the character offset into a byte offset, search
	// byte-wise, then translate the byte position of the hit back.
	$startByte = $this->utf8_char2byte_pos($haystack, $offset);
	if ($startByte === FALSE) {
		// Offset beyond string length
		return FALSE;
	}
	$foundByte = strpos($haystack, $needle, $startByte);
	// FALSE here means the needle was not found
	return $foundByte === FALSE ? FALSE : $this->utf8_byte2char_pos($haystack, $foundByte);
}
2042
2043 /**
2044 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
2045 *
2046 * @param string $haystack UTF-8 string to search in
2047 * @param string $needle UTF-8 character to search for (single character)
2048 * @return integer The character position
2049 * @see strrpos()
2050 * @todo Define visibility
2051 */
public function utf8_strrpos($haystack, $needle) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		// The third parameter of mb_strrpos() is the search offset; the
		// encoding is the fourth. Passing the charset name in third place
		// is rejected by current PHP versions.
		return mb_strrpos($haystack, $needle, 0, 'utf-8');
	} elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
		// iconv_strrpos() has no offset parameter, the charset comes third
		return iconv_strrpos($haystack, $needle, 'utf-8');
	}
	// Fallback: byte-wise search, then translate the byte position of the
	// hit into a character position.
	$byte_pos = strrpos($haystack, $needle);
	if ($byte_pos === FALSE) {
		// Needle not found
		return FALSE;
	}
	return $this->utf8_byte2char_pos($haystack, $byte_pos);
}
2065
2066 /**
2067 * Translates a character position into an 'absolute' byte position.
2068 * Unit tested by Kasper.
2069 *
2070 * @param string $str UTF-8 string
2071 * @param integer $pos Character position (negative values start from the end)
2072 * @return integer Byte position
2073 * @todo Define visibility
2074 */
public function utf8_char2byte_pos($str, $pos) {
	$str = (string) $str;
	$length = strlen($str);
	// Number of characters found
	$n = 0;
	// Number of characters wanted
	$p = abs($pos);
	// Scan forwards from the first byte, or backwards from the last one
	if ($pos >= 0) {
		$i = 0;
		$d = 1;
	} else {
		$i = $length - 1;
		$d = -1;
	}
	// The index is checked against the string bounds explicitly. Testing
	// $str[$i] for emptiness raises a notice past the end, and negative
	// offsets read from the END of the string on newer PHP versions, which
	// let the backward scan wrap around and return a bogus negative position.
	for (; $i >= 0 && $i < $length && $n < $p; $i += $d) {
		// Single-byte characters (0xxxxxxx) and multi-byte starting bytes
		// (11xxxxxx) each mark one character; continuation bytes do not.
		if ((ord($str[$i]) & 192) !== 128) {
			$n++;
		}
	}
	if ($i < 0 || $i >= $length) {
		// Offset beyond string length
		return FALSE;
	}
	if ($pos >= 0) {
		// Skip trailing multi-byte data bytes
		while ($i < $length && (ord($str[$i]) & 192) === 128) {
			$i++;
		}
	} else {
		// Correct offset: the backward scan stopped one byte in front of
		// the first byte of the wanted character
		$i++;
	}
	return $i;
}
2112
2113 /**
2114 * Translates an 'absolute' byte position into a character position.
2115 * Unit tested by Kasper.
2116 *
2117 * @param string $str UTF-8 string
2118 * @param integer $pos Byte position
2119 * @return integer Character position
2120 * @todo Define visibility
2121 */
public function utf8_byte2char_pos($str, $pos) {
	$str = (string) $str;
	$pos = (int) $pos;
	$length = strlen($str);
	if ($length === 0 || $pos < 0 || $pos > $length) {
		// Offset beyond string length (or nothing to measure at all).
		// Without this check, bytes past the end read as empty and were
		// each counted as a character.
		return FALSE;
	}
	// Number of characters
	$n = 0;
	// Count the character starts located at byte offsets 1..$pos
	for ($i = $pos; $i > 0; $i--) {
		if ($i === $length) {
			// The position directly behind the last byte counts as a
			// boundary, so $pos == byte length gives the character count
			$n++;
		} elseif ((ord($str[$i]) & 192) !== 128) {
			// Single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
			$n++;
		}
	}
	return $n;
}
2141
2142 /**
2143 * Maps all characters of an UTF-8 string.
2144 *
2145 * @param string $str UTF-8 string
2146 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
2147 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
2148 * @return string The converted string
2149 * @todo Define visibility
2150 */
public function utf8_char_mapping($str, $mode, $opt = '') {
	if (!$this->initUnicodeData($mode)) {
		// Do nothing
		return $str;
	}
	// Pick the lookup table for the requested mode
	switch ($mode) {
		case 'case':
			$map = &$this->caseFolding['utf-8'][$opt];
			break;
		case 'ascii':
			$map = &$this->toASCII['utf-8'];
			break;
		default:
			return $str;
	}
	$str = (string) $str;
	$out = '';
	// Iterate by byte length; reading $str[$i] past the end of the string
	// to detect termination raises an "Uninitialized string offset" notice.
	for ($i = 0, $length = strlen($str); $i < $length; $i++) {
		$c = ord($str[$i]);
		if (($c & 192) == 192) {
			// Multi-byte starting byte (11xxxxxx): the number of leading
			// 1-bits is the number of bytes of the character
			for ($bc = 0; $c & 128; $c = $c << 1) {
				$bc++;
			}
			$mbc = substr($str, $i, $bc);
			$i += $bc - 1;
		} else {
			// Single-byte (0xxxxxxx). A stray continuation byte (10xxxxxx)
			// is passed through the same way; otherwise $mbc would still
			// hold the previous character and that one would be emitted
			// a second time.
			$mbc = $str[$i];
		}
		// Characters without a table entry are copied through as they are
		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}
	return $out;
}
2189
2190 /********************************************
2191 *
2192 * Internal EUC string operation functions
2193 *
2194 * Extended Unix Code:
2195 * ASCII compatible 7bit single bytes chars
2196 * 8bit two byte chars
2197 *
2198 * Shift-JIS is treated as a special case.
2199 *
2200 ********************************************/
2201 /**
2202 * Cuts a string in the EUC charset family short at a given byte length.
2203 *
2204 * @param string $str EUC multibyte character string
2205 * @param integer $len The byte length
2206 * @param string $charset The charset
2207 * @return string The shortened string
2208 * @see mb_strcut()
2209 * @todo Define visibility
2210 */
public function euc_strtrunc($str, $len, $charset) {
	$str = (string) $str;
	// String not longer than the supplied length: nothing to cut. Deciding
	// this from the byte length, and not from where the scan below ends,
	// matters when a double-byte character straddles the cut and is the
	// last character of the string: the scan then ends beyond the string,
	// although the string is too long.
	if (strlen($str) <= $len) {
		return $str;
	}
	$sjis = $charset == 'shift_jis';
	// Walk over whole characters until the cut position is reached or passed
	for ($i = 0; $i < $len; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			// First byte of a double-byte character in Shift-JIS
			if ($c >= 128 && $c < 160 || $c >= 224) {
				$i++;
			}
		} else {
			// First byte of a double-byte character
			if ($c >= 128) {
				$i++;
			}
		}
	}
	if ($i > $len) {
		// We ended on a first byte: drop it, its second byte does not fit
		return substr($str, 0, $len - 1);
	}
	return substr($str, 0, $len);
}
2236
2237 /**
2238 * Returns a part of a string in the EUC charset family.
2239 *
2240 * @param string $str EUC multibyte character string
2241 * @param integer $start Start position (character position)
2242 * @param string $charset The charset
2243 * @param integer $len Length (in characters)
2244 * @return string the substring
2245 * @todo Define visibility
2246 */
public function euc_substr($str, $start, $charset, $len = NULL) {
	// A length of zero (integer 0 or string '0') always yields an empty
	// string, as in utf8_substr(). Without this check the loose NULL
	// comparison below would treat 0 as "no length given" and return the
	// rest of the string.
	if ((string) $len === '0') {
		return '';
	}
	$byte_start = $this->euc_char2byte_pos($str, $start, $charset);
	if ($byte_start === FALSE) {
		// $start outside string length
		return FALSE;
	}
	$str = substr($str, $byte_start);
	// Loose comparison on purpose: NULL, '' and FALSE all mean "no length given"
	if ($len == NULL) {
		return $str;
	}
	$byte_end = $this->euc_char2byte_pos($str, $len, $charset);
	if ($byte_end === FALSE) {
		// $len outside actual string length: a negative length that exceeds
		// the string gives a blank string (as in utf8_substr()), a positive
		// one gives everything that is left.
		return $len < 0 ? '' : $str;
	}
	return substr($str, 0, $byte_end);
}
2266
2267 /**
2268 * Counts the number of characters of a string in the EUC charset family.
2269 *
2270 * @param string $str EUC multibyte character string
2271 * @param string $charset The charset
2272 * @return integer The number of characters
2273 * @see strlen()
2274 * @todo Define visibility
2275 */
public function euc_strlen($str, $charset) {
	$str = (string) $str;
	$sjis = $charset == 'shift_jis';
	$n = 0;
	// Iterate by byte length; reading $str[$i] past the end of the string
	// to detect termination raises an "Uninitialized string offset" notice.
	for ($i = 0, $length = strlen($str); $i < $length; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			// First byte of a double-byte character in Shift-JIS:
			// skip its second byte
			if ($c >= 128 && $c < 160 || $c >= 224) {
				$i++;
			}
		} else {
			// First byte of a double-byte character: skip its second byte
			if ($c >= 128) {
				$i++;
			}
		}
		$n++;
	}
	return $n;
}
2294
2295 /**
2296 * Translates a character position into an 'absolute' byte position.
2297 *
2298 * @param string $str EUC multibyte character string
2299 * @param integer $pos Character position (negative values start from the end)
2300 * @param string $charset The charset
2301 * @return integer Byte position
2302 * @todo Define visibility
2303 */
public function euc_char2byte_pos($str, $pos, $charset) {
	$str = (string) $str;
	$length = strlen($str);
	$sjis = $charset == 'shift_jis';
	// Number of characters seen
	$n = 0;
	// Number of characters wanted
	$p = abs($pos);
	// Scan forwards from the first byte, or backwards from the last one
	if ($pos >= 0) {
		$i = 0;
		$d = 1;
	} else {
		$i = $length - 1;
		$d = -1;
	}
	// The index is checked against the string bounds explicitly. Testing
	// $str[$i] for emptiness raises a notice past the end, and negative
	// offsets read from the END of the string on newer PHP versions, which
	// let the backward scan wrap around and return a bogus negative position.
	for (; $i >= 0 && $i < $length && $n < $p; $i += $d) {
		$c = ord($str[$i]);
		if ($sjis) {
			// Byte of a double-byte character in Shift-JIS: step over
			// the other byte as well
			if ($c >= 128 && $c < 160 || $c >= 224) {
				$i += $d;
			}
		} else {
			// Byte of a double-byte character: step over the other byte as well
			if ($c >= 128) {
				$i += $d;
			}
		}
		$n++;
	}
	if ($i < 0 || $i >= $length) {
		// Offset beyond string length
		return FALSE;
	}
	if ($pos < 0) {
		// Correct offset: the backward scan stopped one byte in front of
		// the first byte of the wanted character
		$i++;
	}
	return $i;
}
2340
2341 /**
2342 * Maps all characters of a string in the EUC charset family.
2343 *
2344 * @param string $str EUC multibyte character string
2345 * @param string $charset The charset
2346 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
2347 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
2348 * @return string The converted string
2349 * @todo Define visibility
2350 */
public function euc_char_mapping($str, $charset, $mode, $opt = '') {
	// Pick the lookup table for the requested mode. When the matching init
	// method returns a falsy value, or the mode is not recognised, the input
	// is handed back unchanged.
	switch ($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) {
				return $str;
			}
			$map = &$this->caseFolding[$charset][$opt];
			break;
		case 'ascii':
			if (!$this->initToASCII($charset)) {
				return $str;
			}
			$map = &$this->toASCII[$charset];
			break;
		default:
			return $str;
	}
	$str = (string) $str;
	$sjis = $charset == 'shift_jis';
	$out = '';
	// Iterate by byte length; reading $str[$i] past the end of the string
	// to detect termination raises an "Uninitialized string offset" notice.
	for ($i = 0, $length = strlen($str); $i < $length; $i++) {
		$mbc = $str[$i];
		$c = ord($mbc);
		if ($sjis) {
			// A double-byte char in Shift-JIS: take both bytes as one unit
			if ($c >= 128 && $c < 160 || $c >= 224) {
				$mbc = substr($str, $i, 2);
				$i++;
			}
		} else {
			// A double-byte char: take both bytes as one unit
			if ($c >= 128) {
				$mbc = substr($str, $i, 2);
				$i++;
			}
		}
		// Characters without a table entry are copied through as they are
		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}
	return $out;
}
2396
2397 }
2398
2399
2400 ?>