[TASK] Raise PHP version requirement to 5.3.7
[Packages/TYPO3.CMS.git] / typo3 / sysext / core / Classes / Charset / CharsetConverter.php
1 <?php
2 namespace TYPO3\CMS\Core\Charset;
3
4 /***************************************************************
5 * Copyright notice
6 *
7 * (c) 2003-2013 Kasper Skårhøj (kasperYYYY@typo3.com)
8 * All rights reserved
9 *
10 * This script is part of the Typo3 project. The Typo3 project is
11 * free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * The GNU General Public License can be found at
17 * http://www.gnu.org/copyleft/gpl.html.
18 *
19 * This script is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 * GNU General Public License for more details.
23 *
24 * This copyright notice MUST APPEAR in all copies of the script!
25 ***************************************************************/
26
27 use TYPO3\CMS\Core\Utility\GeneralUtility;
28
29 /**
30 * Notes on UTF-8
31 *
32 * Functions working on UTF-8 strings:
33 *
34 * - strchr/strstr
35 * - strrchr
36 * - substr_count
37 * - implode/explode/join
38 *
39 * Functions nearly working on UTF-8 strings:
40 *
41 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
42 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
43 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
44 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
 * - preg_*: UTF-8 support is compiled into PCRE by default nowadays, but could be unavailable; patterns need the "u" modifier
46 *
47 * Functions NOT working on UTF-8 strings:
48 *
49 * - str*cmp
50 * - stristr
51 * - stripos
52 * - substr
53 * - strrev
54 * - split/spliti
55 * - ...
56 */
57
58 /**
59 * Class for conversion between charsets
60 *
61 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
62 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
63 */
64 class CharsetConverter {
65
/**
 * Locales service; assigned in the constructor.
 *
 * @var \TYPO3\CMS\Core\Localization\Locales
 */
protected $locales;

/**
 * ASCII value written for characters that have no equivalent (63 is "?").
 *
 * @var integer
 * @todo Define visibility
 */
public $noCharByteVal = 63;

/**
 * Cache of parsed conversion tables, keyed by charset name.
 * The conversion methods read the sub-tables 'local' (local char number => UTF-8 sequence)
 * and 'utf8' (UTF-8 sequence => local char number).
 *
 * @var array
 * @todo Define visibility
 */
public $parsedCharsets = array();

/**
 * Cache for case folding data.
 *
 * @var array
 * @todo Define visibility
 */
public $caseFolding = array();

/**
 * Cache for charset-to-ASCII mappings.
 *
 * @var array
 * @todo Define visibility
 */
public $toASCII = array();
94
/**
 * Charsets that use two bytes per character (charset name => 1).
 *
 * @var array
 * @todo Define visibility
 */
public $twoByteSets = array(
	'ucs-2' => 1
);

/**
 * Charsets that use four bytes per character (charset name => 1).
 *
 * @var array
 * @todo Define visibility
 */
public $fourByteSets = array(
	'ucs-4' => 1,
	// 4-byte Unicode
	'utf-32' => 1
);

/**
 * Charsets that use a scheme like the Extended Unix Code (charset name => 1).
 *
 * @var array
 * @todo Define visibility
 */
public $eucBasedSets = array(
	// Chinese, simplified.
	'gb2312' => 1,
	// Chinese, traditional.
	'big5' => 1,
	// Korean
	'euc-kr' => 1,
	'shift_jis' => 1
);
126
/**
 * Charset aliases (lowercase alias => canonical charset name) applied by parse_charset().
 *
 * See http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
 * and http://czyborra.com/charsets/iso8859.html
 *
 * @var array
 * @todo Define visibility
 */
public $synonyms = array(
	'us' => 'ascii',
	'us-ascii' => 'ascii',
	'cp819' => 'iso-8859-1',
	'ibm819' => 'iso-8859-1',
	'iso-ir-100' => 'iso-8859-1',
	'iso-ir-101' => 'iso-8859-2',
	'iso-ir-109' => 'iso-8859-3',
	'iso-ir-110' => 'iso-8859-4',
	'iso-ir-144' => 'iso-8859-5',
	'iso-ir-127' => 'iso-8859-6',
	'iso-ir-126' => 'iso-8859-7',
	'iso-ir-138' => 'iso-8859-8',
	'iso-ir-148' => 'iso-8859-9',
	'iso-ir-157' => 'iso-8859-10',
	'iso-ir-179' => 'iso-8859-13',
	'iso-ir-199' => 'iso-8859-14',
	'iso-ir-203' => 'iso-8859-15',
	'csisolatin1' => 'iso-8859-1',
	'csisolatin2' => 'iso-8859-2',
	'csisolatin3' => 'iso-8859-3',
	'csisolatin5' => 'iso-8859-9',
	'csisolatin8' => 'iso-8859-14',
	'csisolatin9' => 'iso-8859-15',
	'csisolatingreek' => 'iso-8859-7',
	'iso-celtic' => 'iso-8859-14',
	'latin1' => 'iso-8859-1',
	'latin2' => 'iso-8859-2',
	'latin3' => 'iso-8859-3',
	'latin5' => 'iso-8859-9',
	'latin6' => 'iso-8859-10',
	'latin8' => 'iso-8859-14',
	'latin9' => 'iso-8859-15',
	'l1' => 'iso-8859-1',
	'l2' => 'iso-8859-2',
	'l3' => 'iso-8859-3',
	'l5' => 'iso-8859-9',
	'l6' => 'iso-8859-10',
	'l8' => 'iso-8859-14',
	'l9' => 'iso-8859-15',
	'cyrillic' => 'iso-8859-5',
	'arabic' => 'iso-8859-6',
	'tis-620' => 'iso-8859-11',
	'win874' => 'windows-874',
	'win1250' => 'windows-1250',
	'win1251' => 'windows-1251',
	'win1252' => 'windows-1252',
	'win1253' => 'windows-1253',
	'win1254' => 'windows-1254',
	'win1255' => 'windows-1255',
	'win1256' => 'windows-1256',
	'win1257' => 'windows-1257',
	'win1258' => 'windows-1258',
	'cp1250' => 'windows-1250',
	'cp1251' => 'windows-1251',
	'cp1252' => 'windows-1252',
	'ms-ee' => 'windows-1250',
	'ms-ansi' => 'windows-1252',
	'ms-greek' => 'windows-1253',
	'ms-turk' => 'windows-1254',
	'winbaltrim' => 'windows-1257',
	'koi-8ru' => 'koi-8r',
	'koi8r' => 'koi-8r',
	'cp878' => 'koi-8r',
	'mac' => 'macroman',
	'macintosh' => 'macroman',
	'euc-cn' => 'gb2312',
	'x-euc-cn' => 'gb2312',
	'euccn' => 'gb2312',
	'cp936' => 'gb2312',
	'big-5' => 'big5',
	'cp950' => 'big5',
	'eucjp' => 'euc-jp',
	'sjis' => 'shift_jis',
	'shift-jis' => 'shift_jis',
	'cp932' => 'shift_jis',
	'cp949' => 'euc-kr',
	'utf7' => 'utf-7',
	// 'utf8' used to be listed twice with the same value; one entry is enough.
	'utf8' => 'utf-8',
	'utf16' => 'utf-16',
	'utf32' => 'utf-32',
	'ucs2' => 'ucs-2',
	'ucs4' => 'ucs-4'
);
217
/**
 * Mapping of language identifiers (lowercase) to script names.
 * The script names are the keys of $script_to_charset_unix / $script_to_charset_windows;
 * get_locale_charset() uses this table to resolve the language part of a locale.
 *
 * @var array
 * @todo Define visibility
 */
public $lang_to_script = array(
	// iso-639-1 language codes, see http://www.loc.gov/standards/iso639-2/php/code_list.php
	'af' => 'west_european', // Afrikaans
	'ar' => 'arabic',
	'bg' => 'cyrillic', // Bulgarian
	'bs' => 'east_european', // Bosnian
	'cs' => 'east_european', // Czech
	'da' => 'west_european', // Danish
	'de' => 'west_european', // German
	'el' => 'greek', // Greek (the actual iso-639-1 code; 'gr' below is kept for compatibility)
	'es' => 'west_european', // Spanish
	'et' => 'estonian',
	'eo' => 'unicode', // Esperanto
	'eu' => 'west_european', // Basque
	'fa' => 'arabic', // Persian
	'fi' => 'west_european', // Finnish
	'fo' => 'west_european', // Faroese
	'fr' => 'west_european', // French
	'ga' => 'west_european', // Irish
	'gl' => 'west_european', // Galician
	'gr' => 'greek',
	'he' => 'hebrew', // Hebrew (since 1998)
	'hi' => 'unicode', // Hindi
	'hr' => 'east_european', // Croatian
	'hu' => 'east_european', // Hungarian
	'iw' => 'hebrew', // Hebrew (til 1998)
	'is' => 'west_european', // Icelandic
	'it' => 'west_european', // Italian
	'ja' => 'japanese',
	'ka' => 'unicode', // Georgian
	'kl' => 'west_european', // Greenlandic
	'km' => 'unicode', // Khmer
	'ko' => 'korean',
	'lt' => 'lithuanian',
	'lv' => 'west_european', // Latvian/Lettish
	'nl' => 'west_european', // Dutch
	'no' => 'west_european', // Norwegian
	'nb' => 'west_european', // Norwegian Bokmal
	'nn' => 'west_european', // Norwegian Nynorsk
	'pl' => 'east_european', // Polish
	'pt' => 'west_european', // Portuguese
	'ro' => 'east_european', // Romanian
	'ru' => 'cyrillic', // Russian
	'sk' => 'east_european', // Slovak
	'sl' => 'east_european', // Slovenian
	'sr' => 'cyrillic', // Serbian
	'sv' => 'west_european', // Swedish
	'sq' => 'albanian', // Albanian
	'th' => 'thai',
	'tr' => 'turkish', // Turkish (was missing although the MS code 'trk' is listed)
	'uk' => 'cyrillic', // Ukrainian
	'vi' => 'vietnamese',
	'zh' => 'chinese',
	// MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
	// http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
	'afk' => 'west_european', // Afrikaans
	'ara' => 'arabic',
	'bgr' => 'cyrillic', // Bulgarian
	'cat' => 'west_european', // Catalan
	'chs' => 'simpl_chinese',
	'cht' => 'trad_chinese',
	'csy' => 'east_european', // Czech
	'dan' => 'west_european', // Danish
	'deu' => 'west_european', // German
	'dea' => 'west_european', // German (Austrian)
	'des' => 'west_european', // German (Swiss)
	'ena' => 'west_european', // English (Australian)
	'enc' => 'west_european', // English (Canadian)
	'eng' => 'west_european', // English
	'enz' => 'west_european', // English (New Zealand)
	'enu' => 'west_european', // English (United States)
	'euq' => 'west_european', // Basque
	'fos' => 'west_european', // Faroese
	'far' => 'arabic', // Persian
	'fin' => 'west_european', // Finnish
	'fra' => 'west_european', // French
	'frb' => 'west_european', // French (Belgian)
	'frc' => 'west_european', // French (Canadian)
	'frs' => 'west_european', // French (Swiss)
	'geo' => 'unicode', // Georgian
	'glg' => 'west_european', // Galician
	'ell' => 'greek',
	'heb' => 'hebrew',
	'hin' => 'unicode', // Hindi
	'hun' => 'east_european', // Hungarian
	'isl' => 'west_european', // Icelandic
	'ita' => 'west_european', // Italian
	'its' => 'west_european', // Italian (Swiss)
	'jpn' => 'japanese',
	'khm' => 'unicode', // Khmer
	'kor' => 'korean',
	'lth' => 'lithuanian',
	'lvi' => 'west_european', // Latvian/Lettish
	'msl' => 'west_european', // Malay
	'nlb' => 'west_european', // Dutch (Belgian)
	'nld' => 'west_european', // Dutch
	'nor' => 'west_european', // Norwegian (bokmal)
	'non' => 'west_european', // Norwegian (nynorsk)
	'plk' => 'east_european', // Polish
	'ptg' => 'west_european', // Portuguese
	'ptb' => 'west_european', // Portuguese (Brazil)
	'rom' => 'east_european', // Romanian
	'rus' => 'cyrillic', // Russian
	'slv' => 'east_european', // Slovenian
	'sky' => 'east_european', // Slovak
	'srl' => 'east_european', // Serbian (Latin)
	'srb' => 'cyrillic', // Serbian (Cyrillic)
	'esp' => 'west_european', // Spanish (trad. sort)
	'esm' => 'west_european', // Spanish (Mexican)
	'esn' => 'west_european', // Spanish (internat. sort)
	'sve' => 'west_european', // Swedish
	'sqi' => 'albanian', // Albanian
	'tha' => 'thai',
	'trk' => 'turkish',
	'ukr' => 'cyrillic', // Ukrainian
	// English language names
	'afrikaans' => 'west_european',
	'albanian' => 'albanian',
	'arabic' => 'arabic',
	'basque' => 'west_european',
	'bosnian' => 'east_european',
	// Bulgarian is written in Cyrillic; this now agrees with 'bg' and 'bgr' above
	'bulgarian' => 'cyrillic',
	'catalan' => 'west_european',
	'croatian' => 'east_european',
	'czech' => 'east_european',
	'danish' => 'west_european',
	'dutch' => 'west_european',
	'english' => 'west_european',
	'esperanto' => 'unicode',
	'estonian' => 'estonian',
	'faroese' => 'west_european',
	'farsi' => 'arabic',
	'finnish' => 'west_european',
	'french' => 'west_european',
	'galician' => 'west_european',
	'georgian' => 'unicode',
	'german' => 'west_european',
	'greek' => 'greek',
	'greenlandic' => 'west_european',
	'hebrew' => 'hebrew',
	'hindi' => 'unicode',
	'hungarian' => 'east_european',
	'icelandic' => 'west_european',
	'italian' => 'west_european',
	'khmer' => 'unicode',
	'latvian' => 'west_european',
	'lettish' => 'west_european',
	'lithuanian' => 'lithuanian',
	'malay' => 'west_european',
	'norwegian' => 'west_european',
	'persian' => 'arabic',
	'polish' => 'east_european',
	'portuguese' => 'west_european',
	'russian' => 'cyrillic',
	'romanian' => 'east_european',
	'serbian' => 'cyrillic',
	'slovak' => 'east_european',
	'slovenian' => 'east_european',
	'spanish' => 'west_european',
	'swedish' => 'west_european',
	// Misspelled key, kept only for backwards compatibility
	'svedish' => 'west_european',
	'thai' => 'thai',
	// Misspelled key, kept only for backwards compatibility
	'that' => 'thai',
	'turkish' => 'turkish',
	'ukrainian' => 'cyrillic'
);
474
/**
 * Mapping of language (family) names to charsets on Unix.
 * An empty value makes get_locale_charset() fall back to its Unix default.
 *
 * @var array
 * @todo Define visibility
 */
public $script_to_charset_unix = array(
	'west_european' => 'iso-8859-1',
	'estonian' => 'iso-8859-1',
	'east_european' => 'iso-8859-2',
	'baltic' => 'iso-8859-4',
	'cyrillic' => 'iso-8859-5',
	'arabic' => 'iso-8859-6',
	'greek' => 'iso-8859-7',
	'hebrew' => 'iso-8859-8',
	'turkish' => 'iso-8859-9',
	// = TIS-620
	'thai' => 'iso-8859-11',
	'lithuanian' => 'iso-8859-13',
	// = euc-cn
	'chinese' => 'gb2312',
	'japanese' => 'euc-jp',
	'korean' => 'euc-kr',
	'simpl_chinese' => 'gb2312',
	'trad_chinese' => 'big5',
	'vietnamese' => '',
	'unicode' => 'utf-8',
	'albanian' => 'utf-8'
);
502
/**
 * Mapping of language (family) names to charsets on Windows.
 *
 * @var array
 * @todo Define visibility
 */
public $script_to_charset_windows = array(
	'east_european' => 'windows-1250',
	'cyrillic' => 'windows-1251',
	'west_european' => 'windows-1252',
	'greek' => 'windows-1253',
	'turkish' => 'windows-1254',
	'hebrew' => 'windows-1255',
	'arabic' => 'windows-1256',
	'baltic' => 'windows-1257',
	'estonian' => 'windows-1257',
	'lithuanian' => 'windows-1257',
	'vietnamese' => 'windows-1258',
	'thai' => 'cp874',
	'korean' => 'cp949',
	'chinese' => 'gb2312',
	'japanese' => 'shift_jis',
	'simpl_chinese' => 'gb2312',
	'trad_chinese' => 'big5',
	'albanian' => 'windows-1250',
	'unicode' => 'utf-8'
);
528
/**
 * Mapping of complete locale names to charsets.
 * get_locale_charset() lowercases the locale before looking it up here, so keys must be lowercase to match.
 *
 * @var array
 * @todo Define visibility
 */
public $locale_to_charset = array(
	'japanese.euc' => 'euc-jp',
	'ja_jp.ujis' => 'euc-jp',
	'korean.euc' => 'euc-kr',
	'sr@latn' => 'iso-8859-2',
	// Mixed-case spelling never matches a lowercased locale; kept only for code reading this key directly
	'sr@Latn' => 'iso-8859-2',
	'zh_cn' => 'gb2312',
	'zh_hk' => 'big5',
	'zh_tw' => 'big5'
);
542
/**
 * TYPO3 specific: system charset used for each system language in TYPO3.
 * Empty values mean "iso-8859-1".
 *
 * @var array
 * @todo Define visibility
 */
public $charSetArray = array(
	'af' => '',
	'ar' => 'iso-8859-6',
	'ba' => 'iso-8859-2',
	'bg' => 'windows-1251',
	'br' => '',
	'ca' => 'iso-8859-15',
	'ch' => 'gb2312',
	'cs' => 'windows-1250',
	'cz' => 'windows-1250',
	'da' => '',
	'de' => '',
	'dk' => '',
	'el' => 'iso-8859-7',
	'eo' => 'utf-8',
	'es' => '',
	'et' => 'iso-8859-4',
	'eu' => '',
	'fa' => 'utf-8',
	'fi' => '',
	'fo' => 'utf-8',
	'fr' => '',
	'fr_CA' => '',
	'ga' => '',
	'ge' => 'utf-8',
	'gl' => '',
	'gr' => 'iso-8859-7',
	'he' => 'utf-8',
	'hi' => 'utf-8',
	'hk' => 'big5',
	'hr' => 'windows-1250',
	'hu' => 'iso-8859-2',
	'is' => 'utf-8',
	'it' => '',
	'ja' => 'shift_jis',
	'jp' => 'shift_jis',
	'ka' => 'utf-8',
	'kl' => 'utf-8',
	'km' => 'utf-8',
	'ko' => 'euc-kr',
	'kr' => 'euc-kr',
	'lt' => 'windows-1257',
	'lv' => 'utf-8',
	'ms' => '',
	'my' => '',
	'nl' => '',
	'no' => '',
	'pl' => 'iso-8859-2',
	'pt' => '',
	'pt_BR' => '',
	'qc' => '',
	'ro' => 'iso-8859-2',
	'ru' => 'windows-1251',
	'se' => '',
	'si' => 'windows-1250',
	'sk' => 'windows-1250',
	'sl' => 'windows-1250',
	'sq' => 'utf-8',
	'sr' => 'utf-8',
	'sv' => '',
	'th' => 'iso-8859-11',
	'tr' => 'iso-8859-9',
	'ua' => 'windows-1251',
	'uk' => 'windows-1251',
	'vi' => 'utf-8',
	'vn' => 'utf-8',
	'zh' => 'big5'
);
616
/**
 * Creates the converter and stores a Locales instance, obtained through
 * GeneralUtility::makeInstance(), in $this->locales.
 */
public function __construct() {
	$localesClassName = 'TYPO3\\CMS\\Core\\Localization\\Locales';
	$this->locales = GeneralUtility::makeInstance($localesClassName);
}
623
/**
 * Normalizes a charset name: lowercases it, trims surrounding whitespace
 * and replaces known aliases (see $this->synonyms) by their canonical name.
 *
 * @param string $charset Input charset
 * @return string Normalized charset
 * @todo Define visibility
 */
public function parse_charset($charset) {
	$normalized = trim(strtolower($charset));
	return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
}
638
/**
 * Get the charset of a locale.
 *
 * ln language
 * ln_CN language / country
 * ln_CN.cs language / country / charset
 * ln_CN.cs@mod language / country / charset / modifier
 *
 * Resolution order: exact match in $this->locale_to_charset, explicit charset part,
 * the "euro" modifier, and finally the script of the language mapped through the
 * platform specific script-to-charset table. Unknown languages or scripts yield
 * 'windows-1252' on Windows and 'utf-8' elsewhere.
 *
 * @param string $locale Locale string
 * @return string Charset resolved for locale string
 * @todo Define visibility
 */
public function get_locale_charset($locale) {
	$locale = strtolower($locale);
	// Exact locale specific charset? The locale has been lowercased, so the table keys
	// are lowercased as well; otherwise mixed-case keys such as 'sr@Latn' could never match.
	$exactMatches = array_change_key_case($this->locale_to_charset, CASE_LOWER);
	if (isset($exactMatches[$locale])) {
		return $exactMatches[$locale];
	}
	// Get modifier. array_pad() guarantees both list() slots exist when the separator is absent.
	list($locale, $modifier) = array_pad(explode('@', $locale), 2, '');
	// Locale contains charset: use it
	list($locale, $charset) = array_pad(explode('.', $locale), 2, '');
	if ($charset) {
		return $this->parse_charset($charset);
	}
	// Modifier is 'euro' (after charset check, because of xx.utf-8@euro)
	if ($modifier === 'euro') {
		return 'iso-8859-15';
	}
	// Get language; the country part is not needed
	list($language) = explode('_', $locale);
	// An unknown language gets an empty script name, which selects the platform default below
	$script = isset($this->lang_to_script[$language]) ? $this->lang_to_script[$language] : '';
	if (TYPO3_OS == 'WIN') {
		$cs = !empty($this->script_to_charset_windows[$script]) ? $this->script_to_charset_windows[$script] : 'windows-1252';
	} else {
		$cs = !empty($this->script_to_charset_unix[$script]) ? $this->script_to_charset_unix[$script] : 'utf-8';
	}
	return $cs;
}
680
681 /********************************************
682 *
683 * Charset Conversion functions
684 *
685 ********************************************/
/**
 * Convert a string from one charset to another charset.
 *
 * The conversion library configured in $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']
 * (mbstring, iconv or recode) is tried first, unless entity fallback is requested for a
 * non-UTF-8 target. If no library is configured or it returns FALSE, the conversion goes
 * through UTF-8 via utf8_encode() / utf8_decode().
 *
 * @param string $str Input string
 * @param string $fromCS From charset (the current charset of the string)
 * @param string $toCS To charset (the output charset wanted)
 * @param boolean $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Converted string
 * @see convArray()
 * @todo Define visibility
 */
public function conv($str, $fromCS, $toCS, $useEntityForNoChar = 0) {
	if ($fromCS == $toCS) {
		return $str;
	}
	// PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
	if ($toCS == 'utf-8' || !$useEntityForNoChar) {
		// Stays FALSE when no library is configured or the library rejects the charsets
		$nativeResult = FALSE;
		switch ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']) {
			case 'mbstring':
				$nativeResult = mb_convert_encoding($str, $toCS, $fromCS);
				break;
			case 'iconv':
				$nativeResult = iconv($fromCS, $toCS . '//TRANSLIT', $str);
				break;
			case 'recode':
				$nativeResult = recode_string($fromCS . '..' . $toCS, $str);
				break;
		}
		if ($nativeResult !== FALSE) {
			return $nativeResult;
		}
	}
	// Table based conversion, using UTF-8 as the intermediate charset
	if ($fromCS != 'utf-8') {
		$str = $this->utf8_encode($str, $fromCS);
	}
	if ($toCS != 'utf-8') {
		$str = $this->utf8_decode($str, $toCS, $useEntityForNoChar);
	}
	return $str;
}
733
/**
 * Converts every string element of an array from one charset to another, descending into nested arrays.
 * Elements that are neither strings nor arrays are left untouched.
 * NOTICE: Array is passed by reference and modified in place!
 *
 * @param array $array Input array, possibly multidimensional
 * @param string $fromCS From charset (the current charset of the string)
 * @param string $toCS To charset (the output charset wanted)
 * @param boolean $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return void
 * @see conv()
 * @todo Define visibility
 */
public function convArray(&$array, $fromCS, $toCS, $useEntityForNoChar = 0) {
	foreach ($array as $key => $value) {
		if (is_array($value)) {
			// Recurse on the original element so the nested array is changed in place
			$this->convArray($array[$key], $fromCS, $toCS, $useEntityForNoChar);
			continue;
		}
		if (is_string($value)) {
			$array[$key] = $this->conv($value, $fromCS, $toCS, $useEntityForNoChar);
		}
	}
}
755
/**
 * Converts $str from $charset to UTF-8
 *
 * Bytes up to 127 are copied unchanged (except for two-byte charsets, where every
 * character is looked up). Characters missing from the conversion table are replaced
 * by chr($this->noCharByteVal).
 *
 * @param string $str String in local charset to convert to UTF-8
 * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return string|NULL Output string, converted to UTF-8. NULL if initCharset() reports failure for the charset.
 * @todo Define visibility
 */
public function utf8_encode($str, $charset) {
	if ($charset === 'utf-8') {
		return $str;
	}
	// Charset is case-insensitive
	// Parse conv. table if not already
	if (!$this->initCharset($charset)) {
		return NULL;
	}
	// These lookups do not change while iterating, so resolve them once
	$isTwoByteSet = isset($this->twoByteSets[$charset]);
	$isEucBasedSet = isset($this->eucBasedSets[$charset]);
	$isShiftJis = $charset == 'shift_jis';
	$localTable = isset($this->parsedCharsets[$charset]['local']) ? $this->parsedCharsets[$charset]['local'] : array();
	$noChar = chr($this->noCharByteVal);
	$strLen = strlen($str);
	$outStr = '';
	// Traverse each char in string
	for ($a = 0; $a < $strLen; $a++) {
		$chr = substr($str, $a, 1);
		$ord = ord($chr);
		if ($isTwoByteSet) {
			// Two bytes per char, big endian assumed. substr() is used instead of $str[$a + 1]
			// so that a dangling last byte reads as 0 without an "uninitialized string offset" notice.
			$a++;
			$ord = $ord << 8 | ord(substr($str, $a, 1));
		} elseif ($ord > 127) {
			// EUC uses two bytes above 127; we get both, advance the pointer and make $ord a 16bit int.
			// Shift-JIS: chars between 160 and 223 are single byte
			if ($isEucBasedSet && (!$isShiftJis || $ord < 160 || $ord > 223)) {
				$a++;
				$ord = $ord * 256 + ord(substr($str, $a, 1));
			}
		} else {
			// 7-bit ASCII is passed through as it is
			$outStr .= $chr;
			continue;
		}
		// Use the UTF-8 sequence from the parsed conv. table, otherwise the "no char" byte
		$outStr .= isset($localTable[$ord]) ? $localTable[$ord] : $noChar;
	}
	return $outStr;
}
814
/**
 * Converts $str from UTF-8 to $charset
 *
 * UTF-8 sequences missing from the conversion table are written as a numeric entity
 * if $useEntityForNoChar is set, otherwise as chr($this->noCharByteVal).
 *
 * @param string $str String in UTF-8 to convert to local charset
 * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param boolean $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string|NULL Output string, converted to local charset. NULL if initCharset() reports failure for the charset.
 * @todo Define visibility
 */
public function utf8_decode($str, $charset, $useEntityForNoChar = 0) {
	if ($charset === 'utf-8') {
		return $str;
	}
	// Charset is case-insensitive.
	// Parse conv. table if not already
	if (!$this->initCharset($charset)) {
		return NULL;
	}
	$length = strlen($str);
	$result = '';
	$replacement = chr($this->noCharByteVal);
	// Traverse each byte in UTF-8 string
	for ($pos = 0; $pos < $length; $pos++) {
		$byte = substr($str, $pos, 1);
		$code = ord($byte);
		// Plain 7-bit char: copy
		if ($code <= 127) {
			$result .= $byte;
			continue;
		}
		// A first byte must have the 7th bit set. Otherwise we are in the middle of a byte sequence.
		if (!($code & 64)) {
			$result .= $replacement;
			continue;
		}
		// Collect the sequence: every further leading 1-bit of the first byte announces one more byte
		$sequence = $byte;
		for ($n = 0; $n < 8; $n++) {
			$code = $code << 1;
			if (!($code & 128)) {
				break;
			}
			$pos++;
			$sequence .= substr($str, $pos, 1);
		}
		if (isset($this->parsedCharsets[$charset]['utf8'][$sequence])) {
			// The local number; values above 255 are split into two bytes (16bit word assumed)
			$localNumber = $this->parsedCharsets[$charset]['utf8'][$sequence];
			if ($localNumber > 255) {
				$result .= chr(($localNumber >> 8) & 255) . chr($localNumber & 255);
			} else {
				$result .= chr($localNumber);
			}
		} elseif ($useEntityForNoChar) {
			// Create num entity:
			$result .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
		} else {
			$result .= $replacement;
		}
	}
	return $result;
}
883
/**
 * Converts all chars > 127 to numeric (hexadecimal) entities.
 * A byte above 127 that cannot start a UTF-8 sequence is replaced by chr($this->noCharByteVal).
 *
 * @param string $str Input string
 * @return string Output string
 * @todo Define visibility
 */
public function utf8_to_entities($str) {
	$length = strlen($str);
	$result = '';
	// Traverse each byte in UTF-8 string.
	for ($pos = 0; $pos < $length; $pos++) {
		$byte = substr($str, $pos, 1);
		$code = ord($byte);
		// Plain 7-bit char: copy
		if ($code <= 127) {
			$result .= $byte;
			continue;
		}
		// A first byte must have the 7th bit set. Otherwise we are in the middle of a byte sequence.
		if (!($code & 64)) {
			$result .= chr($this->noCharByteVal);
			continue;
		}
		// Collect the sequence: every further leading 1-bit of the first byte announces one more byte
		$sequence = $byte;
		for ($n = 0; $n < 8; $n++) {
			$code = $code << 1;
			if (!($code & 128)) {
				break;
			}
			$pos++;
			$sequence .= substr($str, $pos, 1);
		}
		$result .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
	}
	return $result;
}
928
/**
 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
 *
 * Only well-formed numeric entities are converted: "&#" followed by decimal digits, or
 * "&#x" / "&#X" followed by hexadecimal digits. Anything else that merely looks like an
 * entity (e.g. "&#abc;" or "&#;") is left untouched instead of being turned into a NUL byte.
 *
 * @param string $str Input string, UTF-8
 * @param boolean $alsoStdHtmlEnt If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
 * @return string Output string
 * @todo Define visibility
 */
public function entities_to_utf8($str, $alsoStdHtmlEnt = FALSE) {
	$trans_tbl = array();
	if ($alsoStdHtmlEnt) {
		$trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8'));
	}
	// Split into alternating plain text (even keys) and entity names without "&" and ";" (odd keys).
	// The captured delimiters replace the former random-token marker, which could collide with the input.
	$parts = preg_split('/&([#[:alnum:]]*);/', $str, -1, PREG_SPLIT_DELIM_CAPTURE);
	if ($parts === FALSE) {
		// PCRE failure: hand back the input unconverted
		return $str;
	}
	foreach ($parts as $k => $v) {
		// Only take every second element
		if ($k % 2 === 0) {
			continue;
		}
		$matches = array();
		if (preg_match('/^#(?:[xX]([0-9a-fA-F]+)|([0-9]+))$/D', $v, $matches)) {
			// Dec or hex entities; group 1 holds hex digits, group 2 decimal digits
			$number = $matches[1] !== '' ? hexdec($matches[1]) : $matches[2] + 0;
			$parts[$k] = $this->UnumberToChar($number);
		} elseif ($alsoStdHtmlEnt && isset($trans_tbl['&' . $v . ';'])) {
			// Other entities:
			$parts[$k] = $trans_tbl['&' . $v . ';'];
		} else {
			// No conversion:
			$parts[$k] = '&' . $v . ';';
		}
	}
	return implode('', $parts);
}
969
/**
 * Converts all chars in the input UTF-8 string into integer numbers returned in an array
 * A byte above 127 that cannot start a UTF-8 sequence is reported as $this->noCharByteVal.
 *
 * @param string $str Input string, UTF-8
 * @param boolean $convEntities If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
 * @param boolean $retChar If set, then instead of integer numbers the real UTF-8 char is returned.
 * @return array Output array with the char numbers
 * @todo Define visibility
 */
public function utf8_to_numberarray($str, $convEntities = 0, $retChar = 0) {
	// If entities must be registered as well...:
	if ($convEntities) {
		$str = $this->entities_to_utf8($str, 1);
	}
	// Do conversion:
	$length = strlen($str);
	$numbers = array();
	// Traverse each byte in UTF-8 string.
	for ($pos = 0; $pos < $length; $pos++) {
		$byte = substr($str, $pos, 1);
		$code = ord($byte);
		// Plain 7-bit char
		if ($code <= 127) {
			$numbers[] = $retChar ? $byte : $code;
			continue;
		}
		// A first byte must have the 7th bit set. Otherwise we are in the middle of a byte sequence.
		if (!($code & 64)) {
			$numbers[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
			continue;
		}
		// Collect the sequence: every further leading 1-bit of the first byte announces one more byte
		$sequence = $byte;
		for ($n = 0; $n < 8; $n++) {
			$code = $code << 1;
			if (!($code & 128)) {
				break;
			}
			$pos++;
			$sequence .= substr($str, $pos, 1);
		}
		$numbers[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
	}
	return $numbers;
}
1021
/**
 * Converts a UNICODE number to a UTF-8 multibyte character
 * Algorithm based on script found at From: http://czyborra.com/utf/
 *
 * The binary representation of the character's integer value is spread across the bytes
 * and the number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
 *
 * bytes | bits | representation
 * 1 | 7 | 0vvvvvvv
 * 2 | 11 | 110vvvvv 10vvvvvv
 * 3 | 16 | 1110vvvv 10vvvvvv 10vvvvvv
 * 4 | 21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 * 5 | 26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 * 6 | 31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * Numbers of 2147483648 and above yield chr($this->noCharByteVal).
 *
 * @param integer $cbyte UNICODE integer
 * @return string UTF-8 multibyte character string
 * @see utf8CharToUnumber()
 * @todo Define visibility
 */
public function UnumberToChar($cbyte) {
	// 1 byte
	if ($cbyte < 128) {
		return chr($cbyte);
	}
	// 2 bytes
	if ($cbyte < 2048) {
		return chr(192 | ($cbyte >> 6))
			. chr(128 | ($cbyte & 63));
	}
	// 3 bytes
	if ($cbyte < 65536) {
		return chr(224 | ($cbyte >> 12))
			. chr(128 | (($cbyte >> 6) & 63))
			. chr(128 | ($cbyte & 63));
	}
	// 4 bytes
	if ($cbyte < 2097152) {
		return chr(240 | ($cbyte >> 18))
			. chr(128 | (($cbyte >> 12) & 63))
			. chr(128 | (($cbyte >> 6) & 63))
			. chr(128 | ($cbyte & 63));
	}
	// 5 bytes
	if ($cbyte < 67108864) {
		return chr(248 | ($cbyte >> 24))
			. chr(128 | (($cbyte >> 18) & 63))
			. chr(128 | (($cbyte >> 12) & 63))
			. chr(128 | (($cbyte >> 6) & 63))
			. chr(128 | ($cbyte & 63));
	}
	// 6 bytes
	if ($cbyte < 2147483648) {
		return chr(252 | ($cbyte >> 30))
			. chr(128 | (($cbyte >> 24) & 63))
			. chr(128 | (($cbyte >> 18) & 63))
			. chr(128 | (($cbyte >> 12) & 63))
			. chr(128 | (($cbyte >> 6) & 63))
			. chr(128 | ($cbyte & 63));
	}
	// Cannot express a 32-bit character in UTF-8
	return chr($this->noCharByteVal);
}
1088
/**
 * Decodes the first UTF-8 character of a string into its UNICODE number.
 *
 * Only the lead byte decides how many continuation bytes are read; the
 * continuation bytes themselves are not validated.
 *
 * @param string $str UTF-8 multibyte character string
 * @param boolean $hex If set, the number is returned as a hex string prefixed with "x"
 * @return integer|string UNICODE integer, or "x" followed by hex digits if $hex is set
 * @see UnumberToChar()
 * @todo Define visibility
 */
public function utf8CharToUnumber($str, $hex = 0) {
    $leadByte = ord(substr($str, 0, 1));
    if (($leadByte & 192) != 192) {
        // Not the start of a multibyte sequence: the byte value is the result
        $codePoint = $leadByte;
    } else {
        $shifted = $leadByte;
        $trailBits = '';
        $trailCount = 0;
        while ($trailCount < 8) {
            // Every further high bit set in the lead byte announces one more continuation byte
            $shifted = $shifted << 1;
            if (!($shifted & 128)) {
                break;
            }
            // Each continuation byte contributes its six low bits
            $trailBits .= sprintf('%06b', ord(substr($str, $trailCount + 1, 1)) & 63);
            $trailCount++;
        }
        // The payload of the lead byte shrinks by one bit per continuation byte
        $leadBits = substr('00000000' . decbin($leadByte), -(6 - $trailCount));
        $codePoint = bindec($leadBits . $trailBits);
    }
    return $hex ? 'x' . dechex($codePoint) : $codePoint;
}
1123
1124 /********************************************
1125 *
1126 * Init functions
1127 *
1128 ********************************************/
/**
 * Loads the conversion table of a charset into $this->parsedCharsets[$charset].
 * The table is read from 'typo3/sysext/core/Resources/Private/Charsets/csconvtbl/[charset].tbl'
 * and a serialized copy is kept in typo3temp/cs/ to avoid parsing it again.
 * This function is automatically called by the conversion functions
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param string $charset The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return integer Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and loaded (parsed or read from the cache file).
 * @access private
 * @todo Define visibility
 */
public function initCharset($charset) {
    // Only process if the charset is not yet loaded (isset() avoids an undefined index notice on first use)
    if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
        return 1;
    }
    // Conversion table filename:
    $charsetConvTableFile = \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/csconvtbl/' . $charset . '.tbl';
    // Bail out if there is no conversion table for this charset:
    if (!$charset || !GeneralUtility::validPathStr($charsetConvTableFile) || !@is_file($charsetConvTableFile)) {
        return FALSE;
    }
    // Cache file for charsets:
    // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
    $cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/cs/charset_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $this->parsedCharsets[$charset] = unserialize(GeneralUtility::getUrl($cacheFile));
        return 2;
    }
    // Parse conversion table into lines:
    $lines = GeneralUtility::trimExplode(LF, GeneralUtility::getUrl($charsetConvTableFile), TRUE);
    // 'local' maps byte value => UTF-8 character, 'utf8' is the reverse mapping
    $table = array('local' => array(), 'utf8' => array());
    // The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
    $whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
    $detectedType = '';
    foreach ($lines as $value) {
        // Comment line or blanks are ignored.
        if (!trim($value) || substr($value, 0, 1) == '#') {
            continue;
        }
        // Detect type if not done yet: (Done on first real line)
        if (!$detectedType) {
            $detectedType = preg_match($whitespacedPattern, $value) ? 'whitespaced' : 'ms-token';
        }
        if ($detectedType == 'ms-token') {
            $parts = preg_split('/[=:]/', $value, 3);
            if (count($parts) < 2) {
                // No "byte = U+xxxx" pair on this line, nothing to map
                continue;
            }
            $hexbyte = $parts[0];
            $utf8 = $parts[1];
        } else {
            $regA = array();
            if (!preg_match($whitespacedPattern, $value, $regA)) {
                // Line does not follow the "0xNN 0xNNNN ..." syntax, nothing to map
                continue;
            }
            $hexbyte = $regA[1];
            $utf8 = 'U+' . $regA[2];
        }
        $decval = hexdec(trim($hexbyte));
        // Values up to 127 are not stored in the table
        if ($decval > 127) {
            // Strip the leading "U+" before reading the hex number
            $utf8Char = $this->UnumberToChar(hexdec(substr(trim($utf8), 2)));
            $table['local'][$decval] = $utf8Char;
            $table['utf8'][$utf8Char] = $decval;
        }
    }
    $this->parsedCharsets[$charset] = $table;
    if ($cacheFile) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($table));
    }
    return 2;
}
1195
/**
 * This function initializes all UTF-8 character data tables:
 * the case folding table $this->caseFolding['utf-8'] and the
 * transliteration table $this->toASCII['utf-8'].
 *
 * For the modes "case" and "ascii" an already loaded table or its serialized copy
 * in typo3temp/cs/ is used if present. Otherwise both tables are generated from
 * UnicodeData.txt, SpecialCasing.txt and Translit.txt (Resources/Private/Charsets/unidata/)
 * and both cache files are written.
 *
 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
 *
 * @param string $mode Mode ("case", "ascii", ...)
 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 * @todo Define visibility
 */
public function initUnicodeData($mode = NULL) {
    // Cache files
    $cacheFileCase = GeneralUtility::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
    $cacheFileASCII = GeneralUtility::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');
    // Only process if the tables are not yet loaded
    switch ($mode) {
        case 'case':
            if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8'])) {
                return 1;
            }
            // Use cached version if possible
            if ($cacheFileCase && @is_file($cacheFileCase)) {
                $this->caseFolding['utf-8'] = unserialize(GeneralUtility::getUrl($cacheFileCase));
                return 2;
            }
            break;
        case 'ascii':
            if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8'])) {
                return 1;
            }
            // Use cached version if possible
            if ($cacheFileASCII && @is_file($cacheFileASCII)) {
                $this->toASCII['utf-8'] = unserialize(GeneralUtility::getUrl($cacheFileASCII));
                return 2;
            }
            break;
    }
    // Process main Unicode data file
    $unicodeDataFile = \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/UnicodeData.txt';
    if (!(GeneralUtility::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
        return FALSE;
    }
    $fh = fopen($unicodeDataFile, 'rb');
    if (!$fh) {
        return FALSE;
    }
    // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
    // Note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
    $this->caseFolding['utf-8'] = array();
    // a shorthand
    $utf8CaseFolding = &$this->caseFolding['utf-8'];
    $utf8CaseFolding['toUpper'] = array();
    $utf8CaseFolding['toLower'] = array();
    $utf8CaseFolding['toTitle'] = array();
    // Array of temp. decompositions
    $decomposition = array();
    // Array of chars that are marks (eg. composing accents)
    $mark = array();
    // Array of chars that are numbers (eg. digits)
    $number = array();
    // Array of chars to be omitted (eg. Russian hard sign)
    $omit = array();
    while (!feof($fh)) {
        $line = fgets($fh, 4096);
        if ($line === FALSE || trim($line) === '') {
            // Nothing to parse at the end of the file or on a blank line
            continue;
        }
        // Has a lot of info; pad so that short lines do not trigger undefined offsets
        $fields = array_pad(explode(';', rtrim($line)), 15, '');
        $char = $fields[0];
        $name = $fields[1];
        $cat = $fields[2];
        $decomp = $fields[5];
        $num = $fields[8];
        $upper = $fields[12];
        $lower = $fields[13];
        $title = $fields[14];
        $ord = hexdec($char);
        if ($ord > 65535) {
            // Only process the BMP
            break;
        }
        $utf8_char = $this->UnumberToChar($ord);
        if ($upper) {
            $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
        }
        if ($lower) {
            $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
        }
        // Store "title" only when different from "upper" (only a few)
        if ($title && $title != $upper) {
            $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
        }
        // The first letter of the general category is the major class
        switch (substr($cat, 0, 1)) {
            case 'M':
                // mark (accent, umlaut, ...)
                $mark['U+' . $char] = 1;
                break;
            case 'N':
                // numeric value
                if ($ord > 128 && $num != '') {
                    $number['U+' . $char] = $num;
                }
                break;
        }
        // Accented Latin letters without "official" decomposition
        $match = array();
        if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
            $c = ord($match[2]);
            if ($match[1] == 'SMALL') {
                $c += 32;
            }
            $decomposition['U+' . $char] = array(dechex($c));
            continue;
        }
        $match = array();
        if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
            switch ($match[1]) {
                case '<circle>':
                    // add parenthesis as circle replacement, eg (1)
                    $match[2] = '0028 ' . $match[2] . ' 0029';
                    break;
                case '<square>':
                    // add square brackets as square replacement, eg [1]
                    $match[2] = '005B ' . $match[2] . ' 005D';
                    break;
                case '<compat>':
                    // ignore multi char decompositions that start with a space
                    if (preg_match('/^0020 /', $match[2])) {
                        continue 2;
                    }
                    break;
                case '<initial>':
                case '<medial>':
                case '<final>':
                case '<isolated>':
                case '<vertical>':
                    // these decomposition types are not used, go on with the next line
                    continue 2;
            }
            $decomposition['U+' . $char] = explode(' ', $match[2]);
        }
    }
    fclose($fh);
    // Process additional Unicode data for casing (allow folded characters to expand into a sequence)
    $specialCasingFile = \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/SpecialCasing.txt';
    if (GeneralUtility::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
        $fh = fopen($specialCasingFile, 'rb');
        if ($fh) {
            while (!feof($fh)) {
                $line = fgets($fh, 4096);
                // Skip end of file, blank lines and comment lines
                if ($line === FALSE || trim($line) === '' || $line[0] == '#') {
                    continue;
                }
                list($char, $lower, $title, $upper, $cond) = array_pad(GeneralUtility::trimExplode(';', $line), 5, '');
                // Only mappings without a condition are used (the fifth field is then empty or the trailing comment)
                if ($cond != '' && $cond[0] != '#') {
                    continue;
                }
                $utf8_char = $this->UnumberToChar(hexdec($char));
                // Collect the mappings which differ from the character itself
                $mappings = array();
                if ($char != $lower) {
                    $mappings['toLower'] = $lower;
                }
                if ($char != $title && $title != $upper) {
                    $mappings['toTitle'] = $title;
                }
                if ($char != $upper) {
                    $mappings['toUpper'] = $upper;
                }
                foreach ($mappings as $type => $hexList) {
                    // Convert the space separated list of hex code points into a UTF-8 string
                    $codePoints = array_map('hexdec', explode(' ', $hexList));
                    $utf8CaseFolding[$type][$utf8_char] = implode('', array_map(array($this, 'UnumberToChar'), $codePoints));
                }
            }
            fclose($fh);
        }
    }
    // Process custom decompositions
    $customTranslitFile = \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/Translit.txt';
    if (GeneralUtility::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
        $fh = fopen($customTranslitFile, 'rb');
        if ($fh) {
            while (!feof($fh)) {
                $line = fgets($fh, 4096);
                // Skip end of file, blank lines and comment lines
                if ($line === FALSE || trim($line) === '' || $line[0] == '#') {
                    continue;
                }
                list($char, $translit) = array_pad(GeneralUtility::trimExplode(';', $line), 2, '');
                if (!$translit) {
                    // An empty transliteration means the character is to be omitted
                    $omit['U+' . $char] = 1;
                }
                $decomposition['U+' . $char] = explode(' ', $translit);
            }
            fclose($fh);
        }
    }
    // Decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
    foreach ($decomposition as $from => $to) {
        $code_decomp = array();
        while ($code_value = array_shift($to)) {
            if (isset($decomposition['U+' . $code_value])) {
                // Do recursive decomposition: put the replacement in front of the remaining values
                foreach (array_reverse($decomposition['U+' . $code_value]) as $cv) {
                    array_unshift($to, $cv);
                }
            } elseif (!isset($mark['U+' . $code_value])) {
                // Keep everything which is not a mark
                array_push($code_decomp, $code_value);
            }
        }
        if (count($code_decomp) || isset($omit[$from])) {
            $decomposition[$from] = $code_decomp;
        } else {
            unset($decomposition[$from]);
        }
    }
    // Create ascii only mapping
    $this->toASCII['utf-8'] = array();
    $ascii = &$this->toASCII['utf-8'];
    foreach ($decomposition as $from => $to) {
        $code_decomp = array();
        while ($code_value = array_shift($to)) {
            $ord = hexdec($code_value);
            if ($ord > 127) {
                // Skip decompositions containing non-ASCII chars
                continue 2;
            }
            array_push($code_decomp, chr($ord));
        }
        // Keys are of the form "U+XXXX": strip the prefix before reading the hex number
        $ascii[$this->UnumberToChar(hexdec(substr($from, 2)))] = implode('', $code_decomp);
    }
    // Add numeric decompositions
    foreach ($number as $from => $to) {
        $utf8_char = $this->UnumberToChar(hexdec(substr($from, 2)));
        if (!isset($ascii[$utf8_char])) {
            $ascii[$utf8_char] = $to;
        }
    }
    if ($cacheFileCase) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
    }
    if ($cacheFileASCII) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
    }
    return 3;
}
1438
/**
 * This function initializes the folding table for a charset other than UTF-8.
 * The table is derived from the UTF-8 case folding table by converting every
 * character of the charset's conversion table, and is cached in typo3temp/cs/.
 * This function is automatically called by the case folding functions.
 *
 * @param string $charset Charset for which to initialize case folding.
 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 * @todo Define visibility
 */
public function initCaseFolding($charset) {
    // Only process if the case table is not yet loaded:
    if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset])) {
        return 1;
    }
    // Use cached version if possible
    $cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/cs/cscase_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $this->caseFolding[$charset] = unserialize(GeneralUtility::getUrl($cacheFile));
        return 2;
    }
    // init UTF-8 conversion for this charset
    if (!$this->initCharset($charset)) {
        return FALSE;
    }
    // UTF-8 case folding is used as the base conversion table
    if (!$this->initUnicodeData('case')) {
        return FALSE;
    }
    $this->caseFolding[$charset] = array();
    $nochar = chr($this->noCharByteVal);
    foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
        // Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
        $c = $this->utf8_decode($utf8, $charset);
        foreach (array('toUpper', 'toLower', 'toTitle') as $type) {
            if (!isset($this->caseFolding['utf-8'][$type][$utf8])) {
                // No mapping of this type for the character
                continue;
            }
            $cc = $this->utf8_decode($this->caseFolding['utf-8'][$type][$utf8], $charset);
            // Only keep mappings which can be expressed in the charset
            if ($cc != '' && $cc != $nochar) {
                $this->caseFolding[$charset][$type][$c] = $cc;
            }
        }
    }
    // Add the ASCII case table (upper and lower case letters are 32 apart)
    foreach (range('a', 'z') as $lowerLetter) {
        $upperLetter = chr(ord($lowerLetter) - 32);
        $this->caseFolding[$charset]['toUpper'][$lowerLetter] = $upperLetter;
        $this->caseFolding[$charset]['toLower'][$upperLetter] = $lowerLetter;
    }
    if ($cacheFile) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->caseFolding[$charset]));
    }
    return 3;
}
1500
/**
 * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
 * The table is derived from the UTF-8 transliteration table and cached in typo3temp/cs/.
 * This function is automatically called by the ASCII transliteration functions.
 *
 * @param string $charset Charset for which to initialize conversion.
 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 * @todo Define visibility
 */
public function initToASCII($charset) {
    // Only process if the transliteration table is not yet loaded:
    if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) {
        return 1;
    }
    // Use cached version if possible
    $cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/cs/csascii_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $this->toASCII[$charset] = unserialize(GeneralUtility::getUrl($cacheFile));
        return 2;
    }
    // Init UTF-8 conversion for this charset
    if (!$this->initCharset($charset)) {
        return FALSE;
    }
    // UTF-8/ASCII transliteration is used as the base conversion table
    if (!$this->initUnicodeData('ascii')) {
        return FALSE;
    }
    // Start with an empty table so that a charset without any transliterable
    // character still counts as loaded and an array ends up in the cache file
    $this->toASCII[$charset] = array();
    foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
        if (isset($this->toASCII['utf-8'][$utf8])) {
            // Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
            $c = $this->utf8_decode($utf8, $charset);
            $this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
        }
    }
    if ($cacheFile) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->toASCII[$charset]));
    }
    return 3;
}
1542
1543 /********************************************
1544 *
1545 * String operation functions
1546 *
1547 ********************************************/
/**
 * Returns a part of a string.
 * Depending on $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] the work is done
 * by mbstring, iconv or the internal implementations of this class.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param integer $start Start position (character position)
 * @param integer $len Length (in characters). If omitted, the rest of the string is returned.
 * @return string The substring
 * @see substr(), mb_substr()
 * @todo Define visibility
 */
public function substr($charset, $string, $start, $len = NULL) {
    if ($len === 0 || $string === '') {
        return '';
    }
    $utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];
    if ($utils == 'mbstring') {
        // Cannot omit $len, when specifying charset
        if ($len == NULL) {
            // Save internal encoding
            $enc = mb_internal_encoding();
            mb_internal_encoding($charset);
            $str = mb_substr($string, $start);
            // Restore internal encoding
            mb_internal_encoding($enc);
            return $str;
        }
        return mb_substr($string, $start, $len, $charset);
    }
    if ($utils == 'iconv') {
        // Cannot omit $len, when specifying charset
        if ($len == NULL) {
            // Save internal encoding
            $enc = iconv_get_encoding('internal_encoding');
            iconv_set_encoding('internal_encoding', $charset);
            $str = iconv_substr($string, $start);
            // Restore internal encoding
            iconv_set_encoding('internal_encoding', $enc);
            return $str;
        }
        return iconv_substr($string, $start, $len, $charset);
    }
    if ($charset == 'utf-8') {
        return $this->utf8_substr($string, $start, $len);
    }
    if (!empty($this->eucBasedSets[$charset])) {
        return $this->euc_substr($string, $start, $charset, $len);
    }
    // Fixed width charsets: character positions are multiples of the character width
    $bytesPerChar = 1;
    if (!empty($this->twoByteSets[$charset])) {
        $bytesPerChar = 2;
    } elseif (!empty($this->fourByteSets[$charset])) {
        $bytesPerChar = 4;
    }
    // An omitted length must not be multiplied: NULL * n is 0, which would give an empty result
    if ($len === NULL) {
        return substr($string, $start * $bytesPerChar);
    }
    return substr($string, $start * $bytesPerChar, $len * $bytesPerChar);
}
1602
/**
 * Counts the number of characters.
 * Depending on $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] the work is done
 * by mbstring, iconv or the internal implementations of this class.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @return integer The number of characters
 * @see strlen()
 * @todo Define visibility
 */
public function strlen($charset, $string) {
    $utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];
    if ($utils == 'mbstring') {
        return mb_strlen($string, $charset);
    }
    if ($utils == 'iconv') {
        return iconv_strlen($string, $charset);
    }
    if ($charset == 'utf-8') {
        return $this->utf8_strlen($string);
    }
    if (!empty($this->eucBasedSets[$charset])) {
        return $this->euc_strlen($string, $charset);
    }
    // Fixed width charsets: a trailing incomplete character is not counted
    if (!empty($this->twoByteSets[$charset])) {
        return (int) (strlen($string) / 2);
    }
    if (!empty($this->fourByteSets[$charset])) {
        return (int) (strlen($string) / 4);
    }
    // Treat everything else as single-byte encoding
    return strlen($string);
}
1630
/**
 * Crops a string with the help of the mbstring functions.
 * A positive $len keeps the first characters and appends $crop, a negative $len
 * keeps the last characters and prepends $crop. The string is returned unchanged
 * if $len is zero or the string is not longer than abs($len) characters.
 *
 * @param string $charset The character set
 * @param string $string String to be cropped
 * @param integer $len Crop length (in characters)
 * @param string $crop Crop signifier
 * @return string The shortened string
 * @see mb_strlen(), mb_substr()
 */
protected function cropMbstring($charset, $string, $len, $crop = '') {
    if (intval($len) === 0) {
        return $string;
    }
    $charCount = mb_strlen($string, $charset);
    if ($charCount <= abs($len)) {
        // Short enough already
        return $string;
    }
    if ($len > 0) {
        return mb_substr($string, 0, $len, $charset) . $crop;
    }
    return $crop . mb_substr($string, $len, $charCount, $charset);
}
1652
/**
 * Truncates a string and pre-/appends a string.
 * A positive $len keeps the first characters and appends $crop, a negative $len
 * keeps the last characters and prepends $crop. The string is returned unchanged
 * if $len is zero or reaches beyond the string.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param integer $len Length (in characters)
 * @param string $crop Crop signifier
 * @return string The shortened string
 * @see substr(), mb_strimwidth()
 * @todo Define visibility
 */
public function crop($charset, $string, $len, $crop = '') {
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
        return $this->cropMbstring($charset, $string, $len, $crop);
    }
    if (intval($len) == 0) {
        return $string;
    }
    // Translate the character position into a byte position
    if ($charset == 'utf-8') {
        $i = $this->utf8_char2byte_pos($string, $len);
    } elseif (!empty($this->eucBasedSets[$charset])) {
        $i = $this->euc_char2byte_pos($string, $len, $charset);
    } elseif ($len > 0) {
        $i = $len;
    } else {
        $i = strlen($string) + $len;
        if ($i <= 0) {
            $i = FALSE;
        }
    }
    // $len outside actual string length
    if ($i === FALSE) {
        return $string;
    }
    // Compare against the byte length instead of reading $string[$i]: reading outside the
    // string raises a notice and negative offsets count from the end as of PHP 7.1
    $byteLength = strlen($string);
    if ($len > 0) {
        if ($i >= 0 && $i < $byteLength) {
            return substr($string, 0, $i) . $crop;
        }
    } elseif ($i > 0 && $i <= $byteLength) {
        return $crop . substr($string, $i);
    }
    return $string;
}
1702
/**
 * Cuts a string short at a given byte length.
 * For multibyte charsets the cut is moved backwards so that no character is split.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param integer $len The byte length
 * @return string The shortened string
 * @see mb_strcut()
 * @todo Define visibility
 */
public function strtrunc($charset, $string, $len) {
    if ($len <= 0) {
        return '';
    }
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
        return mb_strcut($string, 0, $len, $charset);
    }
    if ($charset == 'utf-8') {
        return $this->utf8_strtrunc($string, $len);
    }
    if (!empty($this->eucBasedSets[$charset])) {
        return $this->euc_strtrunc($string, $len, $charset);
    }
    if (!empty($this->twoByteSets[$charset])) {
        // Realign to position dividable by two
        $len -= $len % 2;
    } elseif (!empty($this->fourByteSets[$charset])) {
        // Realign to position dividable by four
        $len -= $len % 4;
    }
    // Treat everything else as single-byte encoding
    return substr($string, 0, $len);
}
1735
/**
 * Translates all characters of a string into their respective case values.
 * Unlike strtolower() and strtoupper() this method is locale independent.
 * Note that the string length may change!
 * eg. lower case German "ß" (sharp S) becomes upper case "SS"
 * Real case folding is language dependent, this method ignores this fact.
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert case for
 * @param string $case Case keyword: "toLower" means lowercase conversion, "toUpper" means uppercase conversion. With mbstring anything but "toLower" results in uppercase, otherwise the keyword is handed to the mapping functions.
 * @return string The converted string
 * @see strtolower(), strtoupper()
 * @todo Define visibility
 */
public function conv_case($charset, $string, $case) {
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
        return $case == 'toLower' ? mb_strtolower($string, $charset) : mb_strtoupper($string, $charset);
    }
    if ($charset == 'utf-8') {
        return $this->utf8_char_mapping($string, 'case', $case);
    }
    if (isset($this->eucBasedSets[$charset])) {
        return $this->euc_char_mapping($string, $charset, 'case', $case);
    }
    // Treat everything else as single-byte encoding
    return $this->sb_char_mapping($string, $charset, 'case', $case);
}
1768
/**
 * Equivalent of lcfirst/ucfirst but using character set:
 * converts the case of the first character and leaves the rest untouched.
 *
 * @param string $charset Character set of string
 * @param string $string Input string
 * @param string $case Case keyword, handed to conv_case()
 * @return string The string with its first character converted
 * @see \TYPO3\CMS\Core\Charset\CharsetConverter::conv_case()
 */
public function convCaseFirst($charset, $string, $case) {
    $convertedHead = $this->conv_case($charset, $this->substr($charset, $string, 0, 1), $case);
    return $convertedHead . $this->substr($charset, $string, 1);
}
1784
/**
 * Converts special chars (like æøåÆØÅ, umlauts etc) to ascii equivalents (usually double-bytes, like æ => ae etc.)
 * The mapping function is picked by charset: UTF-8, EUC based or single-byte.
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert
 * @return string The converted string
 * @todo Define visibility
 */
public function specCharsToASCII($charset, $string) {
    if ($charset == 'utf-8') {
        return $this->utf8_char_mapping($string, 'ascii');
    }
    if (isset($this->eucBasedSets[$charset])) {
        return $this->euc_char_mapping($string, $charset, 'ascii');
    }
    // Treat everything else as single-byte encoding
    return $this->sb_char_mapping($string, $charset, 'ascii');
}
1804
/**
 * Converts the language codes that we get from the client (usually HTTP_ACCEPT_LANGUAGE)
 * into a TYPO3-readable language code
 *
 * The codes are tried in descending order of their quality value (";q="); for each
 * code the full code is checked first, then the code without its country part.
 *
 * @param string $languageCodesList List of language codes. something like 'de,en-us;q=0.9,de-de;q=0.7,es-cl;q=0.6,en;q=0.4,es;q=0.3,zh;q=0.1'
 * @return string A preferred language that TYPO3 supports, or "default" if none found
 */
public function getPreferredClientLanguage($languageCodesList) {
    $allLanguageCodes = array();
    $selectedLanguage = 'default';
    // Get all languages where TYPO3 code is the same as the ISO code
    foreach (array_keys($this->charSetArray) as $typo3Lang) {
        $allLanguageCodes[$typo3Lang] = $typo3Lang;
    }
    // Get all languages where TYPO3 code differs from ISO code
    // or needs the country part
    // the iso codes will here overwrite the default typo3 language in the key
    foreach ($this->locales->getIsoMapping() as $typo3Lang => $isoLang) {
        $allLanguageCodes[$typo3Lang] = str_replace('_', '-', $isoLang);
    }
    // Move the iso codes to the keys (because we're comparing the keys with "isset" later on)
    $allLanguageCodes = array_flip($allLanguageCodes);
    $preferredLanguages = GeneralUtility::trimExplode(',', $languageCodesList);
    // Collect the quality value of every preferred language (1.0 if none is given)
    $sortedPreferredLanguages = array();
    foreach ($preferredLanguages as $preferredLanguage) {
        $quality = 1.0;
        if (strpos($preferredLanguage, ';q=') !== FALSE) {
            list($preferredLanguage, $quality) = explode(';q=', $preferredLanguage);
        }
        $sortedPreferredLanguages[$preferredLanguage] = $quality;
    }
    // Loop through the languages, with the highest priority first
    arsort($sortedPreferredLanguages, SORT_NUMERIC);
    foreach (array_keys($sortedPreferredLanguages) as $preferredLanguage) {
        if (isset($allLanguageCodes[$preferredLanguage])) {
            $selectedLanguage = $allLanguageCodes[$preferredLanguage];
            break;
        }
        // Strip the country code from the end; codes without a country part stay as they are
        $codeParts = explode('-', $preferredLanguage, 2);
        $languageOnly = $codeParts[0];
        if (isset($allLanguageCodes[$languageOnly])) {
            $selectedLanguage = $allLanguageCodes[$languageOnly];
            break;
        }
    }
    if (!$selectedLanguage || $selectedLanguage == 'en') {
        $selectedLanguage = 'default';
    }
    return $selectedLanguage;
}
1857
1858 /********************************************
1859 *
1860 * Internal string operation functions
1861 *
1862 ********************************************/
/**
 * Maps all characters of a string in a single byte charset.
 * Every byte found in the mapping table is replaced, all other bytes are kept.
 * The string is returned unchanged if the table cannot be initialized or $mode is unknown.
 *
 * @param string $str The string
 * @param string $charset The charset
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string
 * @todo Define visibility
 */
public function sb_char_mapping($str, $charset, $mode, $opt = '') {
    switch ($mode) {
        case 'case':
            if (!$this->initCaseFolding($charset)) {
                return $str;
            }
            // Read the table without a reference so that an unknown $opt does not create an entry in it
            $map = isset($this->caseFolding[$charset][$opt]) ? $this->caseFolding[$charset][$opt] : array();
            break;
        case 'ascii':
            if (!$this->initToASCII($charset)) {
                return $str;
            }
            $map = isset($this->toASCII[$charset]) ? $this->toASCII[$charset] : array();
            break;
        default:
            return $str;
    }
    $str = (string) $str;
    $out = '';
    // Iterate up to the byte length instead of reading past the end of the string
    $byteCount = strlen($str);
    for ($i = 0; $i < $byteCount; $i++) {
        $c = $str[$i];
        $out .= isset($map[$c]) ? $map[$c] : $c;
    }
    return $out;
}
1903
1904 /********************************************
1905 *
1906 * Internal UTF-8 string operation functions
1907 *
1908 ********************************************/
/**
 * Returns a part of a UTF-8 string.
 * Character positions are translated into byte positions with utf8_char2byte_pos().
 *
 * @param string $str UTF-8 string
 * @param integer $start Start position (character position)
 * @param integer $len Length (in characters). If omitted, the rest of the string is returned.
 * @return string|boolean The substring, FALSE if a positive $start lies outside the string
 * @see substr()
 * @todo Define visibility
 */
public function utf8_substr($str, $start, $len = NULL) {
    // A length of zero (integer or string) always gives an empty string
    if ((string) $len === '0') {
        return '';
    }
    $byte_start = $this->utf8_char2byte_pos($str, $start);
    if ($byte_start === FALSE) {
        if ($start > 0) {
            // $start outside string length
            return FALSE;
        }
        // $start is zero or negative here: begin at the first byte
        $byte_start = 0;
    }
    $str = substr($str, $byte_start);
    if ($len == NULL) {
        // No length given: everything from the start position on
        return $str;
    }
    $byte_end = $this->utf8_char2byte_pos($str, $len);
    if ($byte_end === FALSE) {
        // $len outside actual string length: a negative length which exceeds
        // the string gives a blank string, a positive one the whole remainder
        return $len < 0 ? '' : $str;
    }
    return substr($str, 0, $byte_end);
}
1947
1948 /**
1949 * Counts the number of characters of a string in UTF-8.
1950 * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
1951 *
1952 * @param string $str UTF-8 multibyte character string
1953 * @return integer The number of characters
1954 * @see strlen()
1955 * @todo Define visibility
1956 */
public function utf8_strlen($str) {
	$n = 0;
	// isset() ends the loop at the end of the string without reading an
	// undefined string offset.
	for ($i = 0; isset($str[$i]); $i++) {
		// Single-byte characters (0xxxxxxx) and multi-byte start bytes
		// (11xxxxxx) each begin a character; continuation bytes (10xxxxxx)
		// are the only ones not counted.
		if ((ord($str[$i]) & 192) !== 128) {
			$n++;
		}
	}
	return $n;
}
1971
1972 /**
1973 * Truncates a string in UTF-8 short at a given byte length.
1974 *
1975 * @param string $str UTF-8 multibyte character string
1976 * @param integer $len The byte length
1977 * @return string The shortened string
1978 * @see mb_strcut()
1979 * @todo Define visibility
1980 */
public function utf8_strtrunc($str, $len) {
	$i = $len - 1;
	// The byte in front of the cut lies outside the string (length <= 0 or
	// longer than the string): nothing to inspect, do a plain byte cut.
	// Checking this first avoids reading undefined or negative string offsets.
	if ($i < 0 || !isset($str[$i])) {
		return substr($str, 0, $len);
	}
	// Part of a multibyte sequence
	if (ord($str[$i]) & 128) {
		// Walk back to the byte that starts the sequence (bit 64 set)
		for (; $i > 0 && !(ord($str[$i]) & 64); $i--) {

		}
		// Reached the beginning of the string without finding a start byte.
		// A start byte located at offset 0 is valid and must NOT end up here,
		// otherwise a leading character that fits completely would be dropped.
		if (!(ord($str[$i]) & 64)) {
			return '';
		}
		// Calculate number of bytes of the sequence from the start byte's leading 1-bits
		for ($bc = 0, $mbs = ord($str[$i]); $mbs & 128; $mbs = $mbs << 1) {
			$bc++;
		}
		// The sequence would extend past the cut: drop it entirely
		if ($bc + $i > $len) {
			return substr($str, 0, $i);
		}
	}
	return substr($str, 0, $len);
}
2002
2003 /**
2004 * Find position of first occurrence of a string, both arguments are in UTF-8.
2005 *
2006 * @param string $haystack UTF-8 string to search in
2007 * @param string $needle UTF-8 string to search for
 * @param integer $offset Position to start the search
2009 * @return integer The character position
2010 * @see strpos()
2011 * @todo Define visibility
2012 */
public function utf8_strpos($haystack, $needle, $offset = 0) {
	// Hand the work to a native multibyte extension when one is configured
	switch ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) {
		case 'mbstring':
			return mb_strpos($haystack, $needle, $offset, 'utf-8');
		case 'iconv':
			return iconv_strpos($haystack, $needle, $offset, 'utf-8');
	}
	// Fallback: search on byte level, then translate the hit back into a
	// character position.
	$startByte = $this->utf8_char2byte_pos($haystack, $offset);
	if ($startByte !== FALSE) {
		$matchByte = strpos($haystack, $needle, $startByte);
		if ($matchByte !== FALSE) {
			return $this->utf8_byte2char_pos($haystack, $matchByte);
		}
	}
	// Either the offset lies beyond the string or the needle was not found
	return FALSE;
}
2031
2032 /**
2033 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
2034 *
2035 * @param string $haystack UTF-8 string to search in
2036 * @param string $needle UTF-8 character to search for (single character)
2037 * @return integer The character position
2038 * @see strrpos()
2039 * @todo Define visibility
2040 */
public function utf8_strrpos($haystack, $needle) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		// The third parameter of mb_strrpos() is the search offset; the
		// encoding has to be passed as the fourth parameter.
		return mb_strrpos($haystack, $needle, 0, 'utf-8');
	} elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
		// iconv_strrpos() has no offset parameter, the encoding comes third
		return iconv_strrpos($haystack, $needle, 'utf-8');
	}
	// Fallback: search on byte level, then translate the hit back into a
	// character position.
	$byte_pos = strrpos($haystack, $needle);
	if ($byte_pos === FALSE) {
		// Needle not found
		return FALSE;
	}
	return $this->utf8_byte2char_pos($haystack, $byte_pos);
}
2054
2055 /**
2056 * Translates a character position into an 'absolute' byte position.
2057 * Unit tested by Kasper.
2058 *
2059 * @param string $str UTF-8 string
2060 * @param integer $pos Character position (negative values start from the end)
 * @return integer|boolean Byte position, or FALSE if the character position lies outside the string
2062 * @todo Define visibility
2063 */
public function utf8_char2byte_pos($str, $pos) {
	// Number of characters found
	$n = 0;
	// Number of characters wanted
	$p = abs($pos);
	// Scan forward from the first byte for positive positions, backward from
	// the last byte for negative ones.
	if ($pos >= 0) {
		$i = 0;
		$d = 1;
	} else {
		$i = strlen($str) - 1;
		$d = -1;
	}
	// The explicit "$i >= 0" bound is needed for the backward scan: negative
	// string offsets are valid since PHP 7.1 and would wrap around to the end.
	for (; $i >= 0 && isset($str[$i]) && $n < $p; $i += $d) {
		// Single-byte characters (0xxxxxxx) and multi-byte start bytes
		// (11xxxxxx) count as one character, continuation bytes do not.
		if ((ord($str[$i]) & 192) !== 128) {
			$n++;
		}
	}
	if ($i < 0 || !isset($str[$i])) {
		// Offset beyond string length
		return FALSE;
	}
	if ($pos >= 0) {
		// Skip trailing multi-byte data bytes
		while (isset($str[$i]) && (ord($str[$i]) & 192) === 128) {
			$i++;
		}
		if (!isset($str[$i])) {
			// The wanted position is the end of the string. Report it as out
			// of range, exactly as for a string ending in a single-byte character.
			return FALSE;
		}
	} else {
		// Correct offset: the loop stepped one byte past the start byte
		$i++;
	}
	return $i;
}
2101
2102 /**
2103 * Translates an 'absolute' byte position into a character position.
2104 * Unit tested by Kasper.
2105 *
2106 * @param string $str UTF-8 string
2107 * @param integer $pos Byte position
2108 * @return integer Character position
2109 * @todo Define visibility
2110 */
public function utf8_byte2char_pos($str, $pos) {
	$length = strlen($str);
	$pos = (int) $pos;
	// An empty string has no positions at all, and a byte position past the
	// end of the string cannot be translated.
	if ($length === 0 || $pos > $length) {
		// Offset beyond string length
		return FALSE;
	}
	// Number of characters
	$n = 0;
	for ($i = $pos; $i > 0; $i--) {
		// The position directly behind the last byte is treated as a character
		// boundary, so that $pos == strlen($str) yields the character count.
		// Otherwise single-byte characters (0xxxxxxx) and multi-byte start
		// bytes (11xxxxxx) are counted, continuation bytes are not.
		if ($i === $length || (ord($str[$i]) & 192) !== 128) {
			$n++;
		}
	}
	return $n;
}
2130
2131 /**
2132 * Maps all characters of an UTF-8 string.
2133 *
2134 * @param string $str UTF-8 string
2135 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
2136 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
2137 * @return string The converted string
2138 * @todo Define visibility
2139 */
public function utf8_char_mapping($str, $mode, $opt = '') {
	if (!$this->initUnicodeData($mode)) {
		// Do nothing
		return $str;
	}
	// Pick the conversion table. The tables are looked up read-only so that
	// an unknown $opt does not create an empty entry in the shared table.
	switch ($mode) {
		case 'case':
			if (!isset($this->caseFolding['utf-8'][$opt]) || !is_array($this->caseFolding['utf-8'][$opt])) {
				return $str;
			}
			$map = $this->caseFolding['utf-8'][$opt];
			break;
		case 'ascii':
			if (!isset($this->toASCII['utf-8']) || !is_array($this->toASCII['utf-8'])) {
				return $str;
			}
			$map = $this->toASCII['utf-8'];
			break;
		default:
			return $str;
	}
	$out = '';
	// isset() ends the loop at the end of the string without reading an
	// undefined string offset.
	for ($i = 0; isset($str[$i]); $i++) {
		$c = ord($str[$i]);
		// Default: the current byte on its own. This covers single-byte
		// characters (0xxxxxxx) and also stray continuation bytes (10xxxxxx),
		// which are passed through instead of repeating the previous character.
		$mbc = $str[$i];
		if (($c & 192) == 192) {
			// multi-byte starting byte (11xxxxxx):
			// calculate number of bytes from its leading 1-bits
			for ($bc = 0; $c & 128; $c = $c << 1) {
				$bc++;
			}
			$mbc = substr($str, $i, $bc);
			$i += $bc - 1;
		}
		// Characters without a table entry are copied unchanged
		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}
	return $out;
}
2178
2179 /********************************************
2180 *
2181 * Internal EUC string operation functions
2182 *
2183 * Extended Unix Code:
2184 * ASCII compatible 7bit single bytes chars
2185 * 8bit two byte chars
2186 *
2187 * Shift-JIS is treated as a special case.
2188 *
2189 ********************************************/
2190 /**
2191 * Cuts a string in the EUC charset family short at a given byte length.
2192 *
2193 * @param string $str EUC multibyte character string
2194 * @param integer $len The byte length
2195 * @param string $charset The charset
2196 * @return string The shortened string
2197 * @see mb_strcut()
2198 * @todo Define visibility
2199 */
public function euc_strtrunc($str, $len, $charset) {
	$length = strlen($str);
	// string shorter than (or exactly as long as) supplied length
	if ($length === 0 || $len >= $length) {
		return $str;
	}
	$sjis = $charset == 'shift_jis';
	// Step through the characters in front of the cut. $len is smaller than
	// the string length here, so every inspected offset exists.
	for ($i = 0; $i < $len; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			// Shift-JIS lead byte of a double-byte character
			if ($c >= 128 && $c < 160 || $c >= 224) {
				$i++;
			}
		} else {
			// EUC: every 8-bit byte starts a double-byte character
			if ($c >= 128) {
				$i++;
			}
		}
	}
	if ($i > $len) {
		// We ended on a first byte: the double-byte character does not fit
		// any more, so cut in front of it. This also applies when it is the
		// last character of the string.
		return substr($str, 0, $len - 1);
	}
	return substr($str, 0, $len);
}
2225
2226 /**
2227 * Returns a part of a string in the EUC charset family.
2228 *
2229 * @param string $str EUC multibyte character string
2230 * @param integer $start Start position (character position)
2231 * @param string $charset The charset
2232 * @param integer $len Length (in characters)
2233 * @return string the substring
2234 * @todo Define visibility
2235 */
public function euc_substr($str, $start, $charset, $len = NULL) {
	// A length of 0 (integer or numeric string) always yields an empty
	// string, same as in utf8_substr(). NULL means "no length given".
	if ($len !== NULL && (string) $len === '0') {
		return '';
	}
	$byte_start = $this->euc_char2byte_pos($str, $start, $charset);
	if ($byte_start === FALSE) {
		if ($start > 0) {
			// $start outside string length
			return FALSE;
		}
		// No byte position could be resolved for a zero or negative $start:
		// begin at the first byte, like substr() and utf8_substr() do.
		$byte_start = 0;
	}
	$str = substr($str, $byte_start);
	if ($len != NULL) {
		$byte_end = $this->euc_char2byte_pos($str, $len, $charset);
		if ($byte_end === FALSE) {
			// $len outside actual string length: a negative length that
			// exceeds the string gives a blank string, a positive one the rest.
			return $len < 0 ? '' : $str;
		}
		return substr($str, 0, $byte_end);
	}
	return $str;
}
2255
2256 /**
2257 * Counts the number of characters of a string in the EUC charset family.
2258 *
2259 * @param string $str EUC multibyte character string
2260 * @param string $charset The charset
2261 * @return integer The number of characters
2262 * @see strlen()
2263 * @todo Define visibility
2264 */
public function euc_strlen($str, $charset) {
	$sjis = $charset == 'shift_jis';
	$n = 0;
	// isset() ends the loop at the end of the string without reading an
	// undefined string offset.
	for ($i = 0; isset($str[$i]); $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			// Shift-JIS lead byte: skip the second byte of the character
			if ($c >= 128 && $c < 160 || $c >= 224) {
				$i++;
			}
		} else {
			// EUC: every 8-bit byte starts a double-byte character
			if ($c >= 128) {
				$i++;
			}
		}
		$n++;
	}
	return $n;
}
2283
2284 /**
2285 * Translates a character position into an 'absolute' byte position.
2286 *
2287 * @param string $str EUC multibyte character string
2288 * @param integer $pos Character position (negative values start from the end)
2289 * @param string $charset The charset
 * @return integer|boolean Byte position, or FALSE if the character position lies outside the string
2291 * @todo Define visibility
2292 */
public function euc_char2byte_pos($str, $pos, $charset) {
	$sjis = $charset == 'shift_jis';
	// Number of characters seen
	$n = 0;
	// Number of characters wanted
	$p = abs($pos);
	// Scan forward from the first byte for positive positions, backward from
	// the last byte for negative ones.
	if ($pos >= 0) {
		$i = 0;
		$d = 1;
	} else {
		$i = strlen($str) - 1;
		$d = -1;
	}
	// The explicit "$i >= 0" bound is needed for the backward scan: negative
	// string offsets are valid since PHP 7.1 and would wrap around to the end.
	for (; $i >= 0 && isset($str[$i]) && $n < $p; $i += $d) {
		$c = ord($str[$i]);
		if ($sjis) {
			// Shift-JIS lead byte range: step over the other byte of the character
			if ($c >= 128 && $c < 160 || $c >= 224) {
				$i += $d;
			}
		} else {
			// EUC: an 8-bit byte belongs to a double-byte character
			if ($c >= 128) {
				$i += $d;
			}
		}
		$n++;
	}
	if ($i < 0 || !isset($str[$i])) {
		// offset beyond string length
		return FALSE;
	}
	if ($pos < 0) {
		// correct offset: the loop stepped one byte past the character start
		$i++;
	}
	return $i;
}
2329
2330 /**
2331 * Maps all characters of a string in the EUC charset family.
2332 *
2333 * @param string $str EUC multibyte character string
2334 * @param string $charset The charset
2335 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
2336 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
2337 * @return string The converted string
2338 * @todo Define visibility
2339 */
public function euc_char_mapping($str, $charset, $mode, $opt = '') {
	// Pick the conversion table for the requested mode. Whenever no usable
	// table is available the input is handed back untouched.
	switch ($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) {
				return $str;
			}
			// $opt selects the sub-table. It is looked up read-only so that an
			// unknown $opt does not create an empty entry in the shared table.
			if (!isset($this->caseFolding[$charset][$opt]) || !is_array($this->caseFolding[$charset][$opt])) {
				return $str;
			}
			$map = $this->caseFolding[$charset][$opt];
			break;
		case 'ascii':
			if (!$this->initToASCII($charset)) {
				return $str;
			}
			if (!isset($this->toASCII[$charset]) || !is_array($this->toASCII[$charset])) {
				return $str;
			}
			$map = $this->toASCII[$charset];
			break;
		default:
			return $str;
	}
	$sjis = $charset == 'shift_jis';
	$out = '';
	// isset() ends the loop at the end of the string without reading an
	// undefined string offset.
	for ($i = 0; isset($str[$i]); $i++) {
		$mbc = $str[$i];
		$c = ord($mbc);
		// Shift-JIS has a dedicated lead byte range, in EUC every 8-bit byte
		// starts a double-byte char.
		$isDoubleByte = $sjis ? ($c >= 128 && $c < 160 || $c >= 224) : ($c >= 128);
		if ($isDoubleByte) {
			// A double-byte char
			$mbc = substr($str, $i, 2);
			$i++;
		}
		// Characters without a table entry are copied unchanged
		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}
	return $out;
}
2385
2386 }
2387
2388
2389 ?>