[TASK] Re-work/simplify copyright header in PHP files - Part 3
[Packages/TYPO3.CMS.git] / typo3 / sysext / core / Classes / Charset / CharsetConverter.php
1 <?php
2 namespace TYPO3\CMS\Core\Charset;
3
4 /**
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use TYPO3\CMS\Core\Utility\GeneralUtility;
18
19 /**
20 * Notes on UTF-8
21 *
22 * Functions working on UTF-8 strings:
23 *
24 * - strchr/strstr
25 * - strrchr
26 * - substr_count
27 * - implode/explode/join
28 *
29 * Functions nearly working on UTF-8 strings:
30 *
31 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
32 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
33 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
34 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
35 * - preg_*: UTF-8 support is compiled into PCRE by default nowadays, but could be unavailable; patterns need the "u" modifier to be treated as UTF-8
36 *
37 * Functions NOT working on UTF-8 strings:
38 *
39 * - str*cmp
40 * - stristr
41 * - stripos
42 * - substr
43 * - strrev
44 * - split/spliti
45 * - ...
46 */
47
48 /**
49 * Class for conversion between charsets
50 *
51 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
52 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
53 */
54 class CharsetConverter {
55
/**
 * Locales instance; assigned in the constructor via GeneralUtility::makeInstance().
 *
 * @var \TYPO3\CMS\Core\Localization\Locales
 */
protected $locales;
60
/**
 * ASCII value emitted for chars with no equivalent (63 is "?").
 *
 * @var integer
 * @todo Define visibility
 */
public $noCharByteVal = 63;
66
/**
 * This is the array where parsed conversion tables are stored (cached), keyed by charset name.
 * The conversion methods read two sub-arrays per charset:
 * 'local' (local char number => UTF-8 byte sequence) and
 * 'utf8' (UTF-8 byte sequence => local char number).
 *
 * @var array
 * @todo Define visibility
 */
public $parsedCharsets = array();
72
/**
 * An array where case folding data will be stored (cached).
 *
 * @var array
 * @todo Define visibility
 */
public $caseFolding = array();
78
/**
 * An array where charset-to-ASCII mappings are stored (cached).
 *
 * @var array
 * @todo Define visibility
 */
public $toASCII = array();
84
/**
 * This tells the converter which charsets have two bytes per char.
 * Only the keys matter: membership is tested with isset() on the charset name.
 *
 * @var array
 * @todo Define visibility
 */
public $twoByteSets = array(
	'ucs-2' => 1
);
92
/**
 * This tells the converter which charsets have four bytes per char.
 * Keys are charset names; the value is only a flag.
 *
 * @var array
 * @todo Define visibility
 */
public $fourByteSets = array(
	'ucs-4' => 1, // 4-byte Unicode
	'utf-32' => 1
);
102
/**
 * This tells the converter which charsets use a scheme like the Extended Unix Code,
 * i.e. bytes above 127 start a two-byte character (utf8_encode() additionally
 * special-cases a single-byte range for 'shift_jis').
 * Keys are charset names; the value is only a flag.
 *
 * @var array
 * @todo Define visibility
 */
public $eucBasedSets = array(
	'gb2312' => 1, // Chinese, simplified.
	'big5' => 1, // Chinese, traditional.
	'euc-kr' => 1, // Korean
	'shift_jis' => 1
);
116
/**
 * Charset aliases mapped to the canonical charset names used by this class.
 * Keys must be lowercase, because parse_charset() lowercases its input before the lookup.
 *
 * @see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
 * @see http://czyborra.com/charsets/iso8859.html
 * @var array
 * @todo Define visibility
 */
public $synonyms = array(
	'us' => 'ascii',
	'us-ascii' => 'ascii',
	'cp819' => 'iso-8859-1',
	'ibm819' => 'iso-8859-1',
	'iso-ir-100' => 'iso-8859-1',
	'iso-ir-101' => 'iso-8859-2',
	'iso-ir-109' => 'iso-8859-3',
	'iso-ir-110' => 'iso-8859-4',
	'iso-ir-144' => 'iso-8859-5',
	'iso-ir-127' => 'iso-8859-6',
	'iso-ir-126' => 'iso-8859-7',
	'iso-ir-138' => 'iso-8859-8',
	'iso-ir-148' => 'iso-8859-9',
	'iso-ir-157' => 'iso-8859-10',
	'iso-ir-179' => 'iso-8859-13',
	'iso-ir-199' => 'iso-8859-14',
	'iso-ir-203' => 'iso-8859-15',
	'csisolatin1' => 'iso-8859-1',
	'csisolatin2' => 'iso-8859-2',
	'csisolatin3' => 'iso-8859-3',
	'csisolatin5' => 'iso-8859-9',
	'csisolatin8' => 'iso-8859-14',
	'csisolatin9' => 'iso-8859-15',
	'csisolatingreek' => 'iso-8859-7',
	'iso-celtic' => 'iso-8859-14',
	'latin1' => 'iso-8859-1',
	'latin2' => 'iso-8859-2',
	'latin3' => 'iso-8859-3',
	'latin5' => 'iso-8859-9',
	'latin6' => 'iso-8859-10',
	'latin8' => 'iso-8859-14',
	'latin9' => 'iso-8859-15',
	'l1' => 'iso-8859-1',
	'l2' => 'iso-8859-2',
	'l3' => 'iso-8859-3',
	'l5' => 'iso-8859-9',
	'l6' => 'iso-8859-10',
	'l8' => 'iso-8859-14',
	'l9' => 'iso-8859-15',
	'cyrillic' => 'iso-8859-5',
	'arabic' => 'iso-8859-6',
	'tis-620' => 'iso-8859-11',
	'win874' => 'windows-874',
	'win1250' => 'windows-1250',
	'win1251' => 'windows-1251',
	'win1252' => 'windows-1252',
	'win1253' => 'windows-1253',
	'win1254' => 'windows-1254',
	'win1255' => 'windows-1255',
	'win1256' => 'windows-1256',
	'win1257' => 'windows-1257',
	'win1258' => 'windows-1258',
	'cp1250' => 'windows-1250',
	'cp1251' => 'windows-1251',
	'cp1252' => 'windows-1252',
	'ms-ee' => 'windows-1250',
	'ms-ansi' => 'windows-1252',
	'ms-greek' => 'windows-1253',
	'ms-turk' => 'windows-1254',
	'winbaltrim' => 'windows-1257',
	'koi-8ru' => 'koi-8r',
	'koi8r' => 'koi-8r',
	'cp878' => 'koi-8r',
	'mac' => 'macroman',
	'macintosh' => 'macroman',
	'euc-cn' => 'gb2312',
	'x-euc-cn' => 'gb2312',
	'euccn' => 'gb2312',
	'cp936' => 'gb2312',
	'big-5' => 'big5',
	'cp950' => 'big5',
	'eucjp' => 'euc-jp',
	'sjis' => 'shift_jis',
	'shift-jis' => 'shift_jis',
	'cp932' => 'shift_jis',
	'cp949' => 'euc-kr',
	'utf7' => 'utf-7',
	'utf8' => 'utf-8',
	'utf16' => 'utf-16',
	'utf32' => 'utf-32',
	'ucs2' => 'ucs-2',
	'ucs4' => 'ucs-4'
);
207
/**
 * Mapping of language identifiers to script (language family) names.
 * Three kinds of keys are supported: iso-639-1 codes, MS language codes and
 * English language names. All keys are lowercase, since get_locale_charset()
 * lowercases the locale before looking up the language part.
 *
 * @var array
 * @todo Define visibility
 */
public $lang_to_script = array(
	// iso-639-1 language codes, see http://www.loc.gov/standards/iso639-2/php/code_list.php
	'af' => 'west_european', // Afrikaans
	'ar' => 'arabic',
	'bg' => 'cyrillic', // Bulgarian
	'bs' => 'east_european', // Bosnian
	'cs' => 'east_european', // Czech
	'da' => 'west_european', // Danish
	'de' => 'west_european', // German
	'el' => 'greek', // Greek (the actual iso-639-1 code; 'gr' below is kept for compatibility)
	'es' => 'west_european', // Spanish
	'et' => 'estonian',
	'eo' => 'unicode', // Esperanto
	'eu' => 'west_european', // Basque
	'fa' => 'arabic', // Persian
	'fi' => 'west_european', // Finnish
	'fo' => 'west_european', // Faroese
	'fr' => 'west_european', // French
	'ga' => 'west_european', // Irish
	'gl' => 'west_european', // Galician
	'gr' => 'greek',
	'he' => 'hebrew', // Hebrew (since 1998)
	'hi' => 'unicode', // Hindi
	'hr' => 'east_european', // Croatian
	'hu' => 'east_european', // Hungarian
	'iw' => 'hebrew', // Hebrew (til 1998)
	'is' => 'west_european', // Icelandic
	'it' => 'west_european', // Italian
	'ja' => 'japanese',
	'ka' => 'unicode', // Georgian
	'kl' => 'west_european', // Greenlandic
	'km' => 'unicode', // Khmer
	'ko' => 'korean',
	'lt' => 'lithuanian',
	'lv' => 'west_european', // Latvian/Lettish
	'nl' => 'west_european', // Dutch
	'no' => 'west_european', // Norwegian
	'nb' => 'west_european', // Norwegian Bokmal
	'nn' => 'west_european', // Norwegian Nynorsk
	'pl' => 'east_european', // Polish
	'pt' => 'west_european', // Portuguese
	'ro' => 'east_european', // Romanian
	'ru' => 'cyrillic', // Russian
	'sk' => 'east_european', // Slovak
	'sl' => 'east_european', // Slovenian
	'sr' => 'cyrillic', // Serbian
	'sv' => 'west_european', // Swedish
	'sq' => 'albanian', // Albanian
	'th' => 'thai',
	'tr' => 'turkish', // Turkish (previously only reachable as 'trk' / 'turkish')
	'uk' => 'cyrillic', // Ukrainian
	'vi' => 'vietnamese',
	'zh' => 'chinese',
	// MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
	// http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
	'afk' => 'west_european', // Afrikaans
	'ara' => 'arabic',
	'bgr' => 'cyrillic', // Bulgarian
	'cat' => 'west_european', // Catalan
	'chs' => 'simpl_chinese',
	'cht' => 'trad_chinese',
	'csy' => 'east_european', // Czech
	'dan' => 'west_european', // Danish
	'deu' => 'west_european', // German
	'dea' => 'west_european', // German (Austrian)
	'des' => 'west_european', // German (Swiss)
	'ena' => 'west_european', // English (Australian)
	'enc' => 'west_european', // English (Canadian)
	'eng' => 'west_european', // English
	'enz' => 'west_european', // English (New Zealand)
	'enu' => 'west_european', // English (United States)
	'euq' => 'west_european', // Basque
	'fos' => 'west_european', // Faroese
	'far' => 'arabic', // Persian
	'fin' => 'west_european', // Finnish
	'fra' => 'west_european', // French
	'frb' => 'west_european', // French (Belgian)
	'frc' => 'west_european', // French (Canadian)
	'frs' => 'west_european', // French (Swiss)
	'geo' => 'unicode', // Georgian
	'glg' => 'west_european', // Galician
	'ell' => 'greek',
	'heb' => 'hebrew',
	'hin' => 'unicode', // Hindi
	'hun' => 'east_european', // Hungarian
	'isl' => 'west_european', // Icelandic
	'ita' => 'west_european', // Italian
	'its' => 'west_european', // Italian (Swiss)
	'jpn' => 'japanese',
	'khm' => 'unicode', // Khmer
	'kor' => 'korean',
	'lth' => 'lithuanian',
	'lvi' => 'west_european', // Latvian/Lettish
	'msl' => 'west_european', // Malay
	'nlb' => 'west_european', // Dutch (Belgian)
	'nld' => 'west_european', // Dutch
	'nor' => 'west_european', // Norwegian (bokmal)
	'non' => 'west_european', // Norwegian (nynorsk)
	'plk' => 'east_european', // Polish
	'ptg' => 'west_european', // Portuguese
	'ptb' => 'west_european', // Portuguese (Brazil)
	'rom' => 'east_european', // Romanian
	'rus' => 'cyrillic', // Russian
	'slv' => 'east_european', // Slovenian
	'sky' => 'east_european', // Slovak
	'srl' => 'east_european', // Serbian (Latin)
	'srb' => 'cyrillic', // Serbian (Cyrillic)
	'esp' => 'west_european', // Spanish (trad. sort)
	'esm' => 'west_european', // Spanish (Mexican)
	'esn' => 'west_european', // Spanish (internat. sort)
	'sve' => 'west_european', // Swedish
	'sqi' => 'albanian', // Albanian
	'tha' => 'thai',
	'trk' => 'turkish',
	'ukr' => 'cyrillic', // Ukrainian
	// English language names
	'afrikaans' => 'west_european',
	'albanian' => 'albanian',
	'arabic' => 'arabic',
	'basque' => 'west_european',
	'bosnian' => 'east_european',
	// Bulgarian is written in Cyrillic; this now agrees with 'bg' and 'bgr' above
	'bulgarian' => 'cyrillic',
	'catalan' => 'west_european',
	'croatian' => 'east_european',
	'czech' => 'east_european',
	'danish' => 'west_european',
	'dutch' => 'west_european',
	'english' => 'west_european',
	'esperanto' => 'unicode',
	'estonian' => 'estonian',
	'faroese' => 'west_european',
	'farsi' => 'arabic',
	'finnish' => 'west_european',
	'french' => 'west_european',
	'galician' => 'west_european',
	'georgian' => 'unicode',
	'german' => 'west_european',
	'greek' => 'greek',
	'greenlandic' => 'west_european',
	'hebrew' => 'hebrew',
	'hindi' => 'unicode',
	'hungarian' => 'east_european',
	'icelandic' => 'west_european',
	'italian' => 'west_european',
	'khmer' => 'unicode',
	'latvian' => 'west_european',
	'lettish' => 'west_european',
	'lithuanian' => 'lithuanian',
	'malay' => 'west_european',
	'norwegian' => 'west_european',
	'persian' => 'arabic',
	'polish' => 'east_european',
	'portuguese' => 'west_european',
	'russian' => 'cyrillic',
	'romanian' => 'east_european',
	'serbian' => 'cyrillic',
	'slovak' => 'east_european',
	'slovenian' => 'east_european',
	'spanish' => 'west_european',
	'swedish' => 'west_european',
	'thai' => 'thai',
	'turkish' => 'turkish',
	'ukrainian' => 'cyrillic',
	// Historic misspellings of 'swedish' and 'thai', kept so existing lookups keep working
	'svedish' => 'west_european',
	'that' => 'thai'
);
464
/**
 * Mapping of language (family) names to charsets on Unix.
 * get_locale_charset() uses this table when TYPO3_OS is not 'WIN'; an empty
 * or missing entry makes it fall back to 'utf-8'.
 *
 * @var array
 * @todo Define visibility
 */
public $script_to_charset_unix = array(
	'west_european' => 'iso-8859-1',
	'estonian' => 'iso-8859-1',
	'east_european' => 'iso-8859-2',
	'baltic' => 'iso-8859-4',
	'cyrillic' => 'iso-8859-5',
	'arabic' => 'iso-8859-6',
	'greek' => 'iso-8859-7',
	'hebrew' => 'iso-8859-8',
	'turkish' => 'iso-8859-9',
	'thai' => 'iso-8859-11', // = TIS-620
	'lithuanian' => 'iso-8859-13',
	'chinese' => 'gb2312', // = euc-cn
	'japanese' => 'euc-jp',
	'korean' => 'euc-kr',
	'simpl_chinese' => 'gb2312',
	'trad_chinese' => 'big5',
	'vietnamese' => '',
	'unicode' => 'utf-8',
	'albanian' => 'utf-8'
);
492
/**
 * Mapping of language (family) names to charsets on Windows.
 * get_locale_charset() uses this table when TYPO3_OS is 'WIN'; an empty
 * or missing entry makes it fall back to 'windows-1252'.
 *
 * @var array
 * @todo Define visibility
 */
public $script_to_charset_windows = array(
	'east_european' => 'windows-1250',
	'cyrillic' => 'windows-1251',
	'west_european' => 'windows-1252',
	'greek' => 'windows-1253',
	'turkish' => 'windows-1254',
	'hebrew' => 'windows-1255',
	'arabic' => 'windows-1256',
	'baltic' => 'windows-1257',
	'estonian' => 'windows-1257',
	'lithuanian' => 'windows-1257',
	'vietnamese' => 'windows-1258',
	'thai' => 'cp874',
	'korean' => 'cp949',
	'chinese' => 'gb2312',
	'japanese' => 'shift_jis',
	'simpl_chinese' => 'gb2312',
	'trad_chinese' => 'big5',
	'albanian' => 'windows-1250',
	'unicode' => 'utf-8'
);
518
/**
 * Mapping of complete locale names to charsets.
 * get_locale_charset() lowercases the locale before looking it up here, so only
 * lowercase keys can ever match.
 *
 * @var array
 * @todo Define visibility
 */
public $locale_to_charset = array(
	'japanese.euc' => 'euc-jp',
	'ja_jp.ujis' => 'euc-jp',
	'korean.euc' => 'euc-kr',
	// Lowercase form is the one that is actually matched
	'sr@latn' => 'iso-8859-2',
	// Original mixed-case key, unreachable through get_locale_charset(); kept for code reading the array directly
	'sr@Latn' => 'iso-8859-2',
	'zh_cn' => 'gb2312',
	'zh_hk' => 'big5',
	'zh_tw' => 'big5'
);
532
/**
 * TYPO3 specific: Array with the system charsets used for each system language in TYPO3.
 * Keys are TYPO3 language keys, values are charset names.
 * Empty values mean "iso-8859-1".
 *
 * @var array
 * @todo Define visibility
 */
public $charSetArray = array(
	'af' => '',
	'ar' => 'iso-8859-6',
	'ba' => 'iso-8859-2',
	'bg' => 'windows-1251',
	'br' => '',
	'ca' => 'iso-8859-15',
	'ch' => 'gb2312',
	'cs' => 'windows-1250',
	'cz' => 'windows-1250',
	'da' => '',
	'de' => '',
	'dk' => '',
	'el' => 'iso-8859-7',
	'eo' => 'utf-8',
	'es' => '',
	'et' => 'iso-8859-4',
	'eu' => '',
	'fa' => 'utf-8',
	'fi' => '',
	'fo' => 'utf-8',
	'fr' => '',
	'fr_CA' => '',
	'ga' => '',
	'ge' => 'utf-8',
	'gl' => '',
	'gr' => 'iso-8859-7',
	'he' => 'utf-8',
	'hi' => 'utf-8',
	'hk' => 'big5',
	'hr' => 'windows-1250',
	'hu' => 'iso-8859-2',
	'is' => 'utf-8',
	'it' => '',
	'ja' => 'shift_jis',
	'jp' => 'shift_jis',
	'ka' => 'utf-8',
	'kl' => 'utf-8',
	'km' => 'utf-8',
	'ko' => 'euc-kr',
	'kr' => 'euc-kr',
	'lt' => 'windows-1257',
	'lv' => 'utf-8',
	'ms' => '',
	'my' => '',
	'nl' => '',
	'no' => '',
	'pl' => 'iso-8859-2',
	'pt' => '',
	'pt_BR' => '',
	'qc' => '',
	'ro' => 'iso-8859-2',
	'ru' => 'windows-1251',
	'se' => '',
	'si' => 'windows-1250',
	'sk' => 'windows-1250',
	'sl' => 'windows-1250',
	'sq' => 'utf-8',
	'sr' => 'utf-8',
	'sv' => '',
	'th' => 'iso-8859-11',
	'tr' => 'iso-8859-9',
	'ua' => 'windows-1251',
	'uk' => 'windows-1251',
	'vi' => 'utf-8',
	'vn' => 'utf-8',
	'zh' => 'big5'
);
606
/**
 * Default constructor: obtains the Locales instance through
 * GeneralUtility::makeInstance() and keeps it in $this->locales.
 */
public function __construct() {
	$localesClassName = 'TYPO3\\CMS\\Core\\Localization\\Locales';
	$this->locales = GeneralUtility::makeInstance($localesClassName);
}
613
/**
 * Normalizes a charset name: the input is lowercased, surrounding whitespace
 * is removed and a known alias is replaced by its entry in $this->synonyms.
 *
 * @param string $charset Input charset
 * @return string Normalized charset
 * @todo Define visibility
 */
public function parse_charset($charset) {
	$normalized = trim(strtolower($charset));
	return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
}
628
/**
 * Get the charset of a locale.
 *
 * ln language
 * ln_CN language / country
 * ln_CN.cs language / country / charset
 * ln_CN.cs@mod language / country / charset / modifier
 *
 * Resolution order: exact match in $this->locale_to_charset, explicit charset
 * part, the "euro" modifier, and finally the script of the language part looked
 * up in the OS specific script table. Unknown languages or scripts resolve to
 * 'windows-1252' on Windows and 'utf-8' elsewhere.
 *
 * @param string $locale Locale string
 * @return string Charset resolved for locale string
 * @todo Define visibility
 */
public function get_locale_charset($locale) {
	// The whole lookup is case-insensitive; table keys are therefore expected in lowercase
	$locale = strtolower($locale);
	// Exact locale specific charset?
	if (isset($this->locale_to_charset[$locale])) {
		return $this->locale_to_charset[$locale];
	}
	// Get modifier. The result is padded so that a locale without "@" yields an
	// empty modifier instead of an undefined offset.
	list($locale, $modifier) = array_pad(explode('@', $locale), 2, '');
	// Locale contains charset: use it
	list($locale, $charset) = array_pad(explode('.', $locale), 2, '');
	if ($charset) {
		return $this->parse_charset($charset);
	}
	// Modifier is 'euro' (after charset check, because of xx.utf-8@euro)
	if ($modifier === 'euro') {
		return 'iso-8859-15';
	}
	// Get language (the part in front of the country)
	$localeParts = explode('_', $locale);
	$language = $localeParts[0];
	// Unknown languages get an empty script, which matches no table entry below
	$script = isset($this->lang_to_script[$language]) ? $this->lang_to_script[$language] : '';
	if (TYPO3_OS == 'WIN') {
		$scriptTable = $this->script_to_charset_windows;
		$fallbackCharset = 'windows-1252';
	} else {
		$scriptTable = $this->script_to_charset_unix;
		$fallbackCharset = 'utf-8';
	}
	// Empty table values (e.g. 'vietnamese' on Unix) deliberately use the fallback as well
	return !empty($scriptTable[$script]) ? $scriptTable[$script] : $fallbackCharset;
}
670
671 /********************************************
672 *
673 * Charset Conversion functions
674 *
675 ********************************************/
/**
 * Convert from one charset to another charset.
 *
 * When the target is UTF-8, or when no entity fallback is requested, the
 * conversion library configured in
 * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'] ('mbstring',
 * 'iconv' or 'recode') is tried first. If it is not configured or reports
 * failure, the string is converted with the class' own tables, using UTF-8
 * as the intermediate format.
 *
 * @param string $str Input string
 * @param string $fromCS From charset (the current charset of the string)
 * @param string $toCS To charset (the output charset wanted)
 * @param boolean $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Converted string
 * @see convArray()
 * @todo Define visibility
 */
public function conv($str, $fromCS, $toCS, $useEntityForNoChar = 0) {
	if ($fromCS == $toCS) {
		return $str;
	}
	// PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
	$nativeConversionAllowed = $toCS == 'utf-8' || !$useEntityForNoChar;
	if ($nativeConversionAllowed) {
		$method = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'];
		// Loose comparison on purpose: it mirrors the matching rules of a switch statement
		if ($method == 'mbstring') {
			// Returns FALSE for unsupported charsets
			$nativeResult = mb_convert_encoding($str, $toCS, $fromCS);
		} elseif ($method == 'iconv') {
			$nativeResult = iconv($fromCS, $toCS . '//TRANSLIT', $str);
		} elseif ($method == 'recode') {
			$nativeResult = recode_string($fromCS . '..' . $toCS, $str);
		} else {
			$nativeResult = FALSE;
		}
		if ($nativeResult !== FALSE) {
			return $nativeResult;
		}
	}
	// Table based conversion: source charset -> UTF-8 -> target charset
	$utf8String = $fromCS != 'utf-8' ? $this->utf8_encode($str, $fromCS) : $str;
	return $toCS != 'utf-8' ? $this->utf8_decode($utf8String, $toCS, $useEntityForNoChar) : $utf8String;
}
723
/**
 * Convert all elements in ARRAY with type string from one charset to another charset.
 * Nested arrays are processed recursively; values that are neither strings
 * nor arrays are left untouched.
 * NOTICE: Array is passed by reference!
 *
 * @param array $array Input array, possibly multidimensional
 * @param string $fromCS From charset (the current charset of the string)
 * @param string $toCS To charset (the output charset wanted)
 * @param boolean $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return void
 * @see conv()
 * @todo Define visibility
 */
public function convArray(&$array, $fromCS, $toCS, $useEntityForNoChar = 0) {
	foreach ($array as $key => $element) {
		if (is_string($element)) {
			$array[$key] = $this->conv($element, $fromCS, $toCS, $useEntityForNoChar);
			continue;
		}
		if (is_array($element)) {
			// Recurse on the original entry (not the loop copy) so the changes are kept
			$this->convArray($array[$key], $fromCS, $toCS, $useEntityForNoChar);
		}
	}
}
745
/**
 * Converts $str from $charset to UTF-8
 *
 * Bytes up to 127 are copied unchanged (except for two-byte charsets, where
 * every character is looked up). All other characters are translated through
 * the parsed conversion table of the charset; characters without a table
 * entry become chr($this->noCharByteVal).
 *
 * @param string $str String in local charset to convert to UTF-8
 * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return string Output string, converted to UTF-8. Empty string if the charset could not be initialized.
 * @todo Define visibility
 */
public function utf8_encode($str, $charset) {
	if ($charset === 'utf-8') {
		return $str;
	}
	// Parse conv. table if not already done. Without a table nothing can be
	// converted; return an explicit empty string rather than an implicit NULL.
	if (!$this->initCharset($charset)) {
		return '';
	}
	$str = (string)$str;
	// Loop invariants
	$isTwoByteCharset = isset($this->twoByteSets[$charset]);
	$isEucCharset = isset($this->eucBasedSets[$charset]);
	$localToUtf8 = isset($this->parsedCharsets[$charset]['local']) ? $this->parsedCharsets[$charset]['local'] : array();
	$noChar = chr($this->noCharByteVal);
	$strLen = strlen($str);
	$outStr = '';
	// Traverse each char in string
	for ($a = 0; $a < $strLen; $a++) {
		$chr = $str[$a];
		$ord = ord($chr);
		if ($isTwoByteCharset) {
			// Two bytes per char, big endian assumed. A dangling last byte is
			// combined with 0 instead of reading past the end of the string.
			$a++;
			$ord = $ord << 8 | ($a < $strLen ? ord($str[$a]) : 0);
		} elseif ($ord > 127) {
			// EUC uses two bytes above 127; we get both, advance the pointer and make $ord a 16bit int.
			// Shift-JIS: chars between 160 and 223 are single byte
			if ($isEucCharset && ($charset != 'shift_jis' || $ord < 160 || $ord > 223)) {
				$a++;
				$ord = $ord * 256 + ($a < $strLen ? ord($str[$a]) : 0);
			}
		} else {
			// 7-bit ASCII is identical in UTF-8
			$outStr .= $chr;
			continue;
		}
		// Use the UTF-8 sequence from the parsed conv. table, otherwise the "no char" replacement
		$outStr .= isset($localToUtf8[$ord]) ? $localToUtf8[$ord] : $noChar;
	}
	return $outStr;
}
804
/**
 * Converts $str from UTF-8 to $charset
 *
 * Bytes up to 127 are copied unchanged. Every multibyte sequence is looked up
 * in the parsed conversion table of the charset. Sequences without a table
 * entry become a numeric entity (if $useEntityForNoChar is set) or
 * chr($this->noCharByteVal); stray continuation bytes always become
 * chr($this->noCharByteVal).
 *
 * @param string $str String in UTF-8 to convert to local charset
 * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param boolean $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Output string, converted to local charset. Empty string if the charset could not be initialized.
 * @todo Define visibility
 */
public function utf8_decode($str, $charset, $useEntityForNoChar = 0) {
	if ($charset === 'utf-8') {
		return $str;
	}
	// Parse conv. table if not already done. Without a table nothing can be
	// converted; return an explicit empty string rather than an implicit NULL.
	if (!$this->initCharset($charset)) {
		return '';
	}
	$noChar = chr($this->noCharByteVal);
	$strLen = strlen($str);
	$outStr = '';
	// Traverse each char in UTF-8 string
	for ($a = 0; $a < $strLen; $a++) {
		$chr = substr($str, $a, 1);
		$ord = ord($chr);
		if ($ord <= 127) {
			// Plain ASCII
			$outStr .= $chr;
			continue;
		}
		// A first byte must have the 7th bit set. Otherwise we are in the middle of a byte sequence.
		if (!($ord & 64)) {
			$outStr .= $noChar;
			continue;
		}
		// Collect the whole sequence, starting with the first byte
		$buf = $chr;
		for ($b = 0; $b < 8; $b++) {
			// Shift left; as long as the 8th bit is then set there are still bytes in the sequence
			$ord = $ord << 1;
			if (!($ord & 128)) {
				break;
			}
			$a++;
			$buf .= substr($str, $a, 1);
		}
		if (isset($this->parsedCharsets[$charset]['utf8'][$buf])) {
			// The local number
			$mByte = $this->parsedCharsets[$charset]['utf8'][$buf];
			// If the local number is greater than 255 we need to split it (16bit word assumed) in two chars.
			if ($mByte > 255) {
				$outStr .= chr(($mByte >> 8 & 255)) . chr(($mByte & 255));
			} else {
				$outStr .= chr($mByte);
			}
		} elseif ($useEntityForNoChar) {
			// Create num entity:
			$outStr .= '&#' . $this->utf8CharToUnumber($buf, 1) . ';';
		} else {
			$outStr .= $noChar;
		}
	}
	return $outStr;
}
873
/**
 * Converts all chars > 127 to numeric entities.
 *
 * Each multibyte UTF-8 sequence is replaced by a hexadecimal entity built
 * with utf8CharToUnumber(). A byte above 127 that cannot start a sequence is
 * replaced by chr($this->noCharByteVal).
 *
 * @param string $str Input string
 * @return string Output string
 * @todo Define visibility
 */
public function utf8_to_entities($str) {
	$length = strlen($str);
	$result = '';
	$position = 0;
	// Walk through the string byte by byte
	while ($position < $length) {
		$byte = substr($str, $position, 1);
		$code = ord($byte);
		if ($code <= 127) {
			// Plain ASCII is copied as is
			$result .= $byte;
		} elseif (!($code & 64)) {
			// The 7th bit is not set: not a valid first byte, we are in the middle of a sequence
			$result .= chr($this->noCharByteVal);
		} else {
			// First byte of a multibyte sequence: gather the bytes that follow
			$sequence = $byte;
			for ($bit = 0; $bit < 8; $bit++) {
				// Shift left; while the 8th bit stays set, another byte belongs to the sequence
				$code = $code << 1;
				if (!($code & 128)) {
					break;
				}
				$position++;
				$sequence .= substr($str, $position, 1);
			}
			$result .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
		}
		$position++;
	}
	return $result;
}
918
/**
 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
 *
 * Only well-formed numeric entities are converted: "&#" followed by decimal
 * digits, or "&#x" / "&#X" followed by hexadecimal digits. Anything else that
 * merely looks like an entity (eg. "&#abc;" or "&#;") is left untouched.
 *
 * @param string $str Input string, UTF-8
 * @param boolean $alsoStdHtmlEnt If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
 * @return string Output string
 * @todo Define visibility
 */
public function entities_to_utf8($str, $alsoStdHtmlEnt = FALSE) {
	$trans_tbl = array();
	if ($alsoStdHtmlEnt) {
		$trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8'));
	}
	// Split at every "&...;" candidate. Thanks to the captured group the result alternates
	// between literal text (even indexes) and entity bodies without "&" and ";" (odd indexes).
	$parts = preg_split('/&([#[:alnum:]]*);/', $str, -1, PREG_SPLIT_DELIM_CAPTURE);
	if ($parts === FALSE) {
		// PCRE error: hand the input back unchanged
		return $str;
	}
	foreach ($parts as $k => $v) {
		// Only take every second element
		if ($k % 2 === 0) {
			continue;
		}
		$entity = '&' . $v . ';';
		if (preg_match('/^#(?:[xX]([0-9a-fA-F]+)|([0-9]+))$/', $v, $numeric)) {
			// Dec or hex entities; group 1 is only filled for the hexadecimal form
			$codePoint = $numeric[1] !== '' ? hexdec($numeric[1]) : (int)$numeric[2];
			$parts[$k] = $this->UnumberToChar($codePoint);
		} elseif (isset($trans_tbl[$entity])) {
			// Other entities:
			$parts[$k] = $trans_tbl[$entity];
		} else {
			// No conversion:
			$parts[$k] = $entity;
		}
	}
	return implode('', $parts);
}
959
/**
 * Converts all chars in the input UTF-8 string into integer numbers returned in an array
 *
 * One array element is produced per character. A byte above 127 that cannot
 * start a multibyte sequence is reported as $this->noCharByteVal.
 *
 * @param string $str Input string, UTF-8
 * @param boolean $convEntities If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
 * @param boolean $retChar If set, then instead of integer numbers the real UTF-8 char is returned.
 * @return array Output array with the char numbers
 * @todo Define visibility
 */
public function utf8_to_numberarray($str, $convEntities = 0, $retChar = 0) {
	// If entities must be registered as well, resolve them to UTF-8 first
	if ($convEntities) {
		$str = $this->entities_to_utf8($str, 1);
	}
	$length = strlen($str);
	$result = array();
	// Walk through the string byte by byte
	for ($position = 0; $position < $length; $position++) {
		$byte = substr($str, $position, 1);
		$code = ord($byte);
		if ($code <= 127) {
			// Plain ASCII
			$result[] = $retChar ? chr($code) : $code;
			continue;
		}
		if (!($code & 64)) {
			// The 7th bit is not set: not a valid first byte, we are in the middle of a sequence
			$result[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
			continue;
		}
		// First byte of a multibyte sequence: gather the bytes that follow
		$sequence = $byte;
		for ($bit = 0; $bit < 8; $bit++) {
			// Shift left; while the 8th bit stays set, another byte belongs to the sequence
			$code = $code << 1;
			if (!($code & 128)) {
				break;
			}
			$position++;
			$sequence .= substr($str, $position, 1);
		}
		$result[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
	}
	return $result;
}
1011
/**
 * Converts a UNICODE number to a UTF-8 multibyte character
 * Algorithm based on script found at From: http://czyborra.com/utf/
 *
 * The binary representation of the character's integer value is spread across the bytes
 * and the number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
 *
 * bytes | bits | representation
 * 1 | 7 | 0vvvvvvv
 * 2 | 11 | 110vvvvv 10vvvvvv
 * 3 | 16 | 1110vvvv 10vvvvvv 10vvvvvv
 * 4 | 21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 * 5 | 26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 * 6 | 31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * Numbers that do not fit into 31 bits result in chr($this->noCharByteVal).
 *
 * @param integer $cbyte UNICODE integer
 * @return string UTF-8 multibyte character string
 * @see utf8CharToUnumber()
 * @todo Define visibility
 */
public function UnumberToChar($cbyte) {
	// One byte: plain ASCII
	if ($cbyte < 128) {
		return chr($cbyte);
	}
	// Sequence length => array(exclusive upper limit of the number, marker bits of the lead byte)
	$forms = array(
		2 => array(2048, 192),
		3 => array(65536, 224),
		4 => array(2097152, 240),
		5 => array(67108864, 248),
		6 => array(2147483648, 252)
	);
	foreach ($forms as $byteCount => $form) {
		if ($cbyte < $form[0]) {
			// The lead byte carries the topmost bits, each following byte the next 6 bits
			$shift = 6 * ($byteCount - 1);
			$str = chr($form[1] | $cbyte >> $shift);
			while ($shift > 0) {
				$shift -= 6;
				$str .= chr(128 | $cbyte >> $shift & 63);
			}
			return $str;
		}
	}
	// Cannot express a 32-bit character in UTF-8
	return chr($this->noCharByteVal);
}
1078
/**
 * Converts a UTF-8 Multibyte character to a UNICODE number
 *
 * Only the first character of $str is evaluated. Missing continuation bytes
 * of a truncated sequence count as zero bits. An empty string yields 0. A
 * first byte that is not a multibyte lead byte is returned as its byte value.
 *
 * @param string $str UTF-8 multibyte character string
 * @param boolean $hex If set, then a hex. number is returned.
 * @return integer|string UNICODE integer, or with $hex set the string "x" followed by the lowercase hexadecimal number
 * @see UnumberToChar()
 * @todo Define visibility
 */
public function utf8CharToUnumber($str, $hex = 0) {
	$str = (string)$str;
	// First char; guard the offset so an empty string does not raise a notice
	$lead = $str === '' ? 0 : ord($str[0]);
	// This verifies that it IS a multi byte string (both top bits set)
	if (($lead & 192) === 192) {
		// Every further set bit below the two marker bits announces one more continuation byte.
		// Capped at five (the 6-byte form) so the invalid lead bytes 0xFE/0xFF cannot run off.
		$continuationBytes = 1;
		while ($continuationBytes < 5 && ($lead & (64 >> $continuationBytes))) {
			$continuationBytes++;
		}
		// Value bits of the lead byte: 5 bits for a 2-byte sequence down to 1 bit for a 6-byte sequence
		$int = $lead & (63 >> $continuationBytes);
		for ($i = 1; $i <= $continuationBytes; $i++) {
			// Each continuation byte contributes its lower 6 bits
			$int = $int << 6 | (isset($str[$i]) ? ord($str[$i]) & 63 : 0);
		}
	} else {
		$int = $lead;
	}
	return $hex ? 'x' . dechex($int) : $int;
}
1113
1114 /********************************************
1115 *
1116 * Init functions
1117 *
1118 ********************************************/
/**
 * This will initialize a charset for use if it's defined in the 'typo3/sysext/core/Resources/Private/Charsets/csconvtbl/' folder
 * This function is automatically called by the conversion functions
 *
 * The parsed table is stored in $this->parsedCharsets[$charset] with two maps:
 * 'local' (byte value => UTF-8 character) and 'utf8' (UTF-8 character => byte value).
 * Only byte values above 127 are stored. A serialized copy is kept in
 * typo3temp/cs/; a cache file that does not unserialize to an array is
 * ignored and rebuilt from the conversion table.
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param string $charset The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return integer Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and loaded (from cache or by parsing).
 * @access private
 * @todo Define visibility
 */
public function initCharset($charset) {
	// Only process if the charset is not yet loaded:
	if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
		return 1;
	}
	// Conversion table filename:
	$charsetConvTableFile = \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/csconvtbl/' . $charset . '.tbl';
	// Bail out if the conversion table is not found:
	if (!$charset || !GeneralUtility::validPathStr($charsetConvTableFile) || !@is_file($charsetConvTableFile)) {
		return FALSE;
	}
	// Cache file for charsets:
	// Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
	$cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/cs/charset_' . $charset . '.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$cachedTable = unserialize(GeneralUtility::getUrl($cacheFile));
		if (is_array($cachedTable)) {
			$this->parsedCharsets[$charset] = $cachedTable;
			return 2;
		}
		// The cache file is unreadable or corrupt: fall through, parse the
		// conversion table again and overwrite the cache below.
	}
	// Parse conversion table into lines:
	$lines = GeneralUtility::trimExplode(LF, GeneralUtility::getUrl($charsetConvTableFile), TRUE);
	// Initialize the internal variable holding the conv. table:
	$this->parsedCharsets[$charset] = array('local' => array(), 'utf8' => array());
	$whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
	$detectedType = '';
	foreach ($lines as $value) {
		// Comment line or blanks are ignored.
		if (!trim($value) || $value[0] === '#') {
			continue;
		}
		// Detect type if not done yet: (Done on first real line)
		// The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
		if (!$detectedType) {
			$detectedType = preg_match($whitespacedPattern, $value) ? 'whitespaced' : 'ms-token';
		}
		if ($detectedType == 'ms-token') {
			list($hexbyte, $utf8) = preg_split('/[=:]/', $value, 3);
		} else {
			$regA = array();
			if (!preg_match($whitespacedPattern, $value, $regA)) {
				// Line does not follow the detected syntax; it cannot yield a mapping
				continue;
			}
			$hexbyte = $regA[1];
			$utf8 = 'U+' . $regA[2];
		}
		$decval = hexdec(trim($hexbyte));
		// Values up to 127 are not stored in the table
		if ($decval > 127) {
			// Strip the leading "U+" before converting the code point
			$utf8decval = hexdec(substr(trim($utf8), 2));
			$utf8Char = $this->UnumberToChar($utf8decval);
			$this->parsedCharsets[$charset]['local'][$decval] = $utf8Char;
			$this->parsedCharsets[$charset]['utf8'][$utf8Char] = $decval;
		}
	}
	if ($cacheFile) {
		GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->parsedCharsets[$charset]));
	}
	return 2;
}
1185
/**
 * This function initializes all UTF-8 character data tables.
 *
 * Depending on $mode, an already loaded or cached table is reused. Otherwise
 * UnicodeData.txt, SpecialCasing.txt and Translit.txt are parsed and BOTH
 * tables are (re)built and written to the cache:
 * - $this->caseFolding['utf-8'] with the maps 'toUpper', 'toLower' and 'toTitle'
 * - $this->toASCII['utf-8'] (UTF-8 character => ASCII replacement string)
 *
 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
 *
 * @param string $mode Mode ("case", "ascii", ...)
 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 * @todo Define visibility
 */
public function initUnicodeData($mode = NULL) {
	// Cache files
	$cacheFileCase = GeneralUtility::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
	$cacheFileASCII = GeneralUtility::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');
	// Only process if the tables are not yet loaded
	switch ($mode) {
		case 'case':
			if (is_array($this->caseFolding['utf-8'])) {
				return 1;
			}
			// Use cached version if possible
			if ($cacheFileCase && @is_file($cacheFileCase)) {
				$this->caseFolding['utf-8'] = unserialize(GeneralUtility::getUrl($cacheFileCase));
				return 2;
			}
			break;
		case 'ascii':
			if (is_array($this->toASCII['utf-8'])) {
				return 1;
			}
			// Use cached version if possible
			if ($cacheFileASCII && @is_file($cacheFileASCII)) {
				$this->toASCII['utf-8'] = unserialize(GeneralUtility::getUrl($cacheFileASCII));
				return 2;
			}
			break;
	}
	// Process main Unicode data file
	$unicodeDataFile = \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/UnicodeData.txt';
	if (!(GeneralUtility::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
		return FALSE;
	}
	$fh = fopen($unicodeDataFile, 'rb');
	if (!$fh) {
		return FALSE;
	}
	// key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
	// Note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
	$this->caseFolding['utf-8'] = array();
	// a shorthand
	$utf8CaseFolding = &$this->caseFolding['utf-8'];
	$utf8CaseFolding['toUpper'] = array();
	$utf8CaseFolding['toLower'] = array();
	$utf8CaseFolding['toTitle'] = array();
	// Array of temp. decompositions
	$decomposition = array();
	// Array of chars that are marks (eg. composing accents)
	$mark = array();
	// Array of chars that are numbers (eg. digits)
	$number = array();
	// Array of chars to be omitted (eg. Russian hard sign)
	$omit = array();
	while (!feof($fh)) {
		$line = fgets($fh, 4096);
		if ($line === FALSE) {
			// fgets() returns FALSE when nothing more can be read; there is no line to process
			break;
		}
		// Has a lot of info
		list($char, $name, $cat, , , $decomp, , , $num, , , , $upper, $lower, $title, ) = explode(';', rtrim($line));
		$ord = hexdec($char);
		if ($ord > 65535) {
			// Only process the BMP
			break;
		}
		$utf8_char = $this->UnumberToChar($ord);
		if ($upper) {
			$utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
		}
		if ($lower) {
			$utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
		}
		// Store "title" only when different from "upper" (only a few)
		if ($title && $title != $upper) {
			$utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
		}
		switch ($cat[0]) {
			case 'M':
				// mark (accent, umlaut, ...)
				$mark['U+' . $char] = 1;
				break;
			case 'N':
				// numeric value
				if ($ord > 128 && $num != '') {
					$number['U+' . $char] = $num;
				}
		}
		// Accented Latin letters without "official" decomposition
		$match = array();
		if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
			$c = ord($match[2]);
			if ($match[1] == 'SMALL') {
				// Character names are upper case; +32 yields the lower case ASCII letter
				$c += 32;
			}
			$decomposition['U+' . $char] = array(dechex($c));
			continue;
		}
		$match = array();
		if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
			switch ($match[1]) {
				case '<circle>':
					// add parenthesis as circle replacement, eg (1)
					$match[2] = '0028 ' . $match[2] . ' 0029';
					break;
				case '<square>':
					// add square brackets as square replacement, eg [1]
					$match[2] = '005B ' . $match[2] . ' 005D';
					break;
				case '<compat>':
					// ignore multi char decompositions that start with a space
					if (preg_match('/^0020 /', $match[2])) {
						continue 2;
					}
					break;
				case '<initial>':

				case '<medial>':

				case '<final>':

				case '<isolated>':

				case '<vertical>':
					// These decomposition types are skipped entirely
					continue 2;
			}
			$decomposition['U+' . $char] = explode(' ', $match[2]);
		}
	}
	fclose($fh);
	// Process additional Unicode data for casing (allow folded characters to expand into a sequence)
	$specialCasingFile = \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/SpecialCasing.txt';
	if (GeneralUtility::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
		$fh = fopen($specialCasingFile, 'rb');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh, 4096);
				if ($line === FALSE) {
					// Nothing more to read
					break;
				}
				if ($line[0] != '#' && trim($line) != '') {
					list($char, $lower, $title, $upper, $cond) = GeneralUtility::trimExplode(';', $line);
					// Only entries whose fifth field is empty or a comment are used
					if ($cond == '' || $cond[0] == '#') {
						$utf8_char = $this->UnumberToChar(hexdec($char));
						if ($char != $lower) {
							$arr = explode(' ', $lower);
							for ($i = 0; isset($arr[$i]); $i++) {
								$arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
							}
							$utf8CaseFolding['toLower'][$utf8_char] = implode('', $arr);
						}
						if ($char != $title && $title != $upper) {
							$arr = explode(' ', $title);
							for ($i = 0; isset($arr[$i]); $i++) {
								$arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
							}
							$utf8CaseFolding['toTitle'][$utf8_char] = implode('', $arr);
						}
						if ($char != $upper) {
							$arr = explode(' ', $upper);
							for ($i = 0; isset($arr[$i]); $i++) {
								$arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
							}
							$utf8CaseFolding['toUpper'][$utf8_char] = implode('', $arr);
						}
					}
				}
			}
			fclose($fh);
		}
	}
	// Process custom decompositions
	$customTranslitFile = \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/Translit.txt';
	if (GeneralUtility::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
		$fh = fopen($customTranslitFile, 'rb');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh, 4096);
				if ($line === FALSE) {
					// Nothing more to read
					break;
				}
				if ($line[0] != '#' && trim($line) != '') {
					list($char, $translit) = GeneralUtility::trimExplode(';', $line);
					if (!$translit) {
						// An empty transliteration marks the character as "to be omitted"
						$omit['U+' . $char] = 1;
					}
					$decomposition['U+' . $char] = explode(' ', $translit);
				}
			}
			fclose($fh);
		}
	}
	// Decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
	foreach ($decomposition as $from => $to) {
		$code_decomp = array();
		while ($code_value = array_shift($to)) {
			// Do recursive decomposition
			if (isset($decomposition['U+' . $code_value])) {
				// Put the nested decomposition back at the front of the queue, keeping its order
				foreach (array_reverse($decomposition['U+' . $code_value]) as $cv) {
					array_unshift($to, $cv);
				}
			} elseif (!isset($mark[('U+' . $code_value)])) {
				// Keep everything that is not a mark (marks are removed)
				array_push($code_decomp, $code_value);
			}
		}
		if (count($code_decomp) || isset($omit[$from])) {
			$decomposition[$from] = $code_decomp;
		} else {
			unset($decomposition[$from]);
		}
	}
	// Create ascii only mapping
	$this->toASCII['utf-8'] = array();
	$ascii = &$this->toASCII['utf-8'];
	foreach ($decomposition as $from => $to) {
		$code_decomp = array();
		while ($code_value = array_shift($to)) {
			$ord = hexdec($code_value);
			if ($ord > 127) {
				// Skip decompositions containing non-ASCII chars
				continue 2;
			}
			array_push($code_decomp, chr($ord));
		}
		// Keys have the form "U+XXXX": strip the prefix so that only hex digits reach hexdec()
		$ascii[$this->UnumberToChar(hexdec(substr($from, 2)))] = join('', $code_decomp);
	}
	// Add numeric decompositions
	foreach ($number as $from => $to) {
		$utf8_char = $this->UnumberToChar(hexdec(substr($from, 2)));
		if (!isset($ascii[$utf8_char])) {
			$ascii[$utf8_char] = $to;
		}
	}
	if ($cacheFileCase) {
		GeneralUtility::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
	}
	if ($cacheFileASCII) {
		GeneralUtility::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
	}
	return 3;
}
1428
/**
 * Builds the case folding table of a charset other than UTF-8.
 * The case folding functions call this automatically.
 *
 * Every character of the charset's conversion table is looked up in the
 * UTF-8 case folding table and the result is converted back to the charset.
 * The plain ASCII letters a-z / A-Z are added at the end.
 *
 * @param string $charset Charset for which to initialize case folding.
 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 * @todo Define visibility
 */
public function initCaseFolding($charset) {
	// Table is in memory already
	if (is_array($this->caseFolding[$charset])) {
		return 1;
	}
	// Try the cached table next
	$cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/cs/cscase_' . $charset . '.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->caseFolding[$charset] = unserialize(GeneralUtility::getUrl($cacheFile));
		return 2;
	}
	// Both the charset's UTF-8 conversion table and the UTF-8 case folding
	// table (the base conversion table) are required
	if (!$this->initCharset($charset) || !$this->initUnicodeData('case')) {
		return FALSE;
	}
	$unmappable = chr($this->noCharByteVal);
	$foldingTypes = array('toUpper', 'toLower', 'toTitle');
	foreach ($this->parsedCharsets[$charset]['local'] as $utf8Char) {
		// Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$localChar = $this->utf8_decode($utf8Char, $charset);
		foreach ($foldingTypes as $foldingType) {
			$folded = $this->utf8_decode($this->caseFolding['utf-8'][$foldingType][$utf8Char], $charset);
			// Keep only foldings that exist and can be expressed in the charset
			if ($folded != '' && $folded != $unmappable) {
				$this->caseFolding[$charset][$foldingType][$localChar] = $folded;
			}
		}
	}
	// Add the ASCII case table
	foreach (range(ord('a'), ord('z')) as $code) {
		$this->caseFolding[$charset]['toUpper'][chr($code)] = chr($code - 32);
	}
	foreach (range(ord('A'), ord('Z')) as $code) {
		$this->caseFolding[$charset]['toLower'][chr($code)] = chr($code + 32);
	}
	if ($cacheFile) {
		GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->caseFolding[$charset]));
	}
	return 3;
}
1490
/**
 * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
 * This function is automatically called by the ASCII transliteration functions.
 *
 * The table maps characters of the charset to the replacement defined for
 * the same character in the UTF-8 transliteration table. It is always an
 * array after a successful call, even if no character has a replacement.
 *
 * @param string $charset Charset for which to initialize conversion.
 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 * @todo Define visibility
 */
public function initToASCII($charset) {
	// Only process if the transliteration table is not yet loaded:
	if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) {
		return 1;
	}
	// Use cached version if possible
	$cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/cs/csascii_' . $charset . '.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$cachedTable = unserialize(GeneralUtility::getUrl($cacheFile));
		if (is_array($cachedTable)) {
			$this->toASCII[$charset] = $cachedTable;
			return 2;
		}
		// Cache content is not a table (eg. corrupt file): rebuild it below
	}
	// Init UTF-8 conversion for this charset
	if (!$this->initCharset($charset)) {
		return FALSE;
	}
	// UTF-8/ASCII transliteration is used as the base conversion table
	if (!$this->initUnicodeData('ascii')) {
		return FALSE;
	}
	// Start with an empty table so that a charset without any replacement
	// is stored (and cached) as an array instead of being left undefined
	$this->toASCII[$charset] = array();
	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
		// Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);
		if (isset($this->toASCII['utf-8'][$utf8])) {
			$this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
		}
	}
	if ($cacheFile) {
		GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->toASCII[$charset]));
	}
	return 3;
}
1532
1533 /********************************************
1534 *
1535 * String operation functions
1536 *
1537 ********************************************/
/**
 * Returns a part of a string.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * Uses mbstring or iconv if configured in
 * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'], otherwise the
 * implementation matching the charset (UTF-8, EUC based, fixed two/four
 * byte, or single byte for everything else).
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param integer $start Start position (character position)
 * @param integer $len Length (in characters). If omitted (NULL), everything from $start on is returned.
 * @return string The substring
 * @see substr(), mb_substr()
 * @todo Define visibility
 */
public function substr($charset, $string, $start, $len = NULL) {
	if ($len === 0 || $string === '') {
		return '';
	}
	$utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];
	if ($utils == 'mbstring') {
		// Cannot omit $len, when specifying charset
		if ($len == NULL) {
			// Save internal encoding
			$enc = mb_internal_encoding();
			mb_internal_encoding($charset);
			$str = mb_substr($string, $start);
			// Restore internal encoding
			mb_internal_encoding($enc);
			return $str;
		}
		return mb_substr($string, $start, $len, $charset);
	}
	if ($utils == 'iconv') {
		// Cannot omit $len, when specifying charset
		if ($len == NULL) {
			// Save internal encoding
			$enc = iconv_get_encoding('internal_encoding');
			iconv_set_encoding('internal_encoding', $charset);
			$str = iconv_substr($string, $start);
			// Restore internal encoding
			iconv_set_encoding('internal_encoding', $enc);
			return $str;
		}
		return iconv_substr($string, $start, $len, $charset);
	}
	if ($charset == 'utf-8') {
		return $this->utf8_substr($string, $start, $len);
	}
	if (!empty($this->eucBasedSets[$charset])) {
		return $this->euc_substr($string, $start, $charset, $len);
	}
	// Fixed width encodings: scale character positions to byte positions.
	// An omitted length must stay omitted; NULL * width would be 0 and cut everything.
	if (!empty($this->twoByteSets[$charset])) {
		return $len === NULL ? substr($string, $start * 2) : substr($string, $start * 2, $len * 2);
	}
	if (!empty($this->fourByteSets[$charset])) {
		return $len === NULL ? substr($string, $start * 4) : substr($string, $start * 4, $len * 4);
	}
	// Treat everything else as single-byte encoding
	return $len === NULL ? substr($string, $start) : substr($string, $start, $len);
}
1592
/**
 * Returns the length of a string in characters.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * mbstring or iconv are used if configured; otherwise the length is
 * determined according to the kind of charset.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @return integer The number of characters
 * @see strlen()
 * @todo Define visibility
 */
public function strlen($charset, $string) {
	// Delegate to the configured PHP extension, if any
	switch ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) {
		case 'mbstring':
			return mb_strlen($string, $charset);
		case 'iconv':
			return iconv_strlen($string, $charset);
	}
	if ($charset == 'utf-8') {
		return $this->utf8_strlen($string);
	}
	if ($this->eucBasedSets[$charset]) {
		return $this->euc_strlen($string, $charset);
	}
	$byteLength = strlen($string);
	// Fixed width encodings: bytes divided by the character width
	if ($this->twoByteSets[$charset]) {
		return $byteLength / 2;
	}
	if ($this->fourByteSets[$charset]) {
		return $byteLength / 4;
	}
	// Treat everything else as single-byte encoding
	return $byteLength;
}
1620
/**
 * Crops a string with the help of the mbstring functions.
 *
 * A positive $len keeps the first $len characters and appends $crop, a
 * negative $len keeps the last abs($len) characters and prepends $crop.
 * The string is returned unchanged if $len is 0 or the string is not
 * longer than abs($len) characters.
 *
 * @param string $charset The character set
 * @param string $string String to be cropped
 * @param integer $len Crop length (in characters)
 * @param string $crop Crop signifier
 * @return string The shortened string
 * @see mb_strlen(), mb_substr()
 */
protected function cropMbstring($charset, $string, $len, $crop = '') {
	if ((int)$len === 0) {
		return $string;
	}
	$characterCount = mb_strlen($string, $charset);
	if ($characterCount <= abs($len)) {
		// Nothing to cut off
		return $string;
	}
	if ($len > 0) {
		return mb_substr($string, 0, $len, $charset) . $crop;
	}
	return $crop . mb_substr($string, $len, $characterCount, $charset);
}
1642
/**
 * Truncates a string and pre-/appends a string.
 * Unit tested by Kasper
 *
 * A positive $len keeps the first $len characters and appends $crop, a
 * negative $len keeps the last abs($len) characters and prepends $crop.
 * The string is returned unchanged if $len is 0 or if there is nothing to
 * cut off.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param integer $len Length (in characters)
 * @param string $crop Crop signifier
 * @return string The shortened string
 * @see substr(), mb_strimwidth()
 * @todo Define visibility
 */
public function crop($charset, $string, $len, $crop = '') {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return $this->cropMbstring($charset, $string, $len, $crop);
	}
	if ((int)$len === 0) {
		return $string;
	}
	// Determine the byte position of the cut
	if ($charset == 'utf-8') {
		$i = $this->utf8_char2byte_pos($string, $len);
	} elseif (!empty($this->eucBasedSets[$charset])) {
		$i = $this->euc_char2byte_pos($string, $len, $charset);
	} elseif ($len > 0) {
		// Single-byte encoding: character position equals byte position
		$i = $len;
	} else {
		$i = strlen($string) + $len;
		if ($i <= 0) {
			$i = FALSE;
		}
	}
	// $len outside actual string length
	if ($i === FALSE) {
		return $string;
	}
	$i = (int)$i;
	// isset() tests whether a byte exists at the position without reading
	// (and raising a notice for) an offset beyond the end of the string
	if ($len > 0) {
		if (isset($string[$i])) {
			return substr($string, 0, $i) . $crop;
		}
	} elseif ($i > 0 && isset($string[$i - 1])) {
		return $crop . substr($string, $i);
	}
	return $string;
}
1692
/**
 * Shortens a string to at most the given number of bytes.
 *
 * For fixed width (two/four byte) charsets the byte length is rounded down
 * to a multiple of the character width before cutting.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param integer $len The byte length
 * @return string The shortened string
 * @see mb_strcut()
 * @todo Define visibility
 */
public function strtrunc($charset, $string, $len) {
	if ($len <= 0) {
		return '';
	}
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strcut($string, 0, $len, $charset);
	}
	if ($charset == 'utf-8') {
		return $this->utf8_strtrunc($string, $len);
	}
	if ($this->eucBasedSets[$charset]) {
		return $this->euc_strtrunc($string, $len, $charset);
	}
	if ($this->twoByteSets[$charset]) {
		// Realign to an even position
		$len -= $len % 2;
	} elseif ($this->fourByteSets[$charset]) {
		// Realign to position dividable by four
		$len -= $len % 4;
	}
	// Treat everything else as single-byte encoding
	return substr($string, 0, $len);
}
1725
/**
 * Converts all characters of a string to lower or upper case.
 * Unlike strtolower() and strtoupper() this method is locale independent.
 * Note that the string length may change!
 * eg. lower case German "ß" (sharp S) becomes upper case "SS"
 * Unit-tested by Kasper
 * Real case folding is language dependent, this method ignores this fact.
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert case for
 * @param string $case Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
 * @return string The converted string
 * @see strtolower(), strtoupper()
 * @todo Define visibility
 */
public function conv_case($charset, $string, $case) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return $case == 'toLower' ? mb_strtolower($string, $charset) : mb_strtoupper($string, $charset);
	}
	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string, 'case', $case);
	}
	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string, $charset, 'case', $case);
	}
	// Treat everything else as single-byte encoding
	return $this->sb_char_mapping($string, $charset, 'case', $case);
}
1758
/**
 * Charset aware equivalent of lcfirst()/ucfirst(): converts the case of the
 * first character only and leaves the rest of the string untouched.
 *
 * @param string $charset Character set of string
 * @param string $string Input string
 * @param string $case Case keyword, see conv_case()
 * @return string The string with its first character converted
 * @see \TYPO3\CMS\Core\Charset\CharsetConverter::conv_case()
 */
public function convCaseFirst($charset, $string, $case) {
	$convertedFirstChar = $this->conv_case($charset, $this->substr($charset, $string, 0, 1), $case);
	return $convertedFirstChar . $this->substr($charset, $string, 1);
}
1774
/**
 * Replaces special chars (like æøåÆØÅ, umlauts etc) with ascii equivalents (usually double-bytes, like æ => ae etc.)
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert
 * @return string The converted string
 * @todo Define visibility
 */
public function specCharsToASCII($charset, $string) {
	if ($charset === 'utf-8') {
		return $this->utf8_char_mapping($string, 'ascii');
	}
	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string, $charset, 'ascii');
	}
	// Treat everything else as single-byte encoding
	return $this->sb_char_mapping($string, $charset, 'ascii');
}
1794
/**
 * Converts the language codes that we get from the client (usually HTTP_ACCEPT_LANGUAGE)
 * into a TYPO3-readable language code
 *
 * The codes are tried in descending order of their quality value (";q=",
 * 1.0 if not given). For each code the full code is tried first, then the
 * part in front of the first "-".
 *
 * @param string $languageCodesList List of language codes. something like 'de,en-us;q=0.9,de-de;q=0.7,es-cl;q=0.6,en;q=0.4,es;q=0.3,zh;q=0.1'
 * @return string A preferred language that TYPO3 supports, or "default" if none found
 */
public function getPreferredClientLanguage($languageCodesList) {
	$allLanguageCodes = array();
	$selectedLanguage = 'default';
	// Get all languages where TYPO3 code is the same as the ISO code
	foreach (array_keys($this->charSetArray) as $typo3Lang) {
		$allLanguageCodes[$typo3Lang] = $typo3Lang;
	}
	// Get all languages where TYPO3 code differs from ISO code
	// or needs the country part
	// the iso codes will here overwrite the default typo3 language in the key
	foreach ($this->locales->getIsoMapping() as $typo3Lang => $isoLang) {
		$allLanguageCodes[$typo3Lang] = str_replace('_', '-', $isoLang);
	}
	// Move the iso codes to the keys (because we're comparing the keys with "isset" later on)
	$allLanguageCodes = array_flip($allLanguageCodes);
	$preferredLanguages = GeneralUtility::trimExplode(',', $languageCodesList);
	// Collect the preferred languages together with their quality value
	$sortedPreferredLanguages = array();
	foreach ($preferredLanguages as $preferredLanguage) {
		$quality = 1.0;
		if (strpos($preferredLanguage, ';q=') !== FALSE) {
			list($preferredLanguage, $quality) = explode(';q=', $preferredLanguage);
		}
		$sortedPreferredLanguages[$preferredLanguage] = $quality;
	}
	// Loop through the languages, with the highest priority first
	arsort($sortedPreferredLanguages, SORT_NUMERIC);
	foreach ($sortedPreferredLanguages as $preferredLanguage => $quality) {
		if (isset($allLanguageCodes[$preferredLanguage])) {
			$selectedLanguage = $allLanguageCodes[$preferredLanguage];
			break;
		}
		// Strip the country code from the end. Only the first part is used,
		// so codes without a country part ("de") need no special handling.
		$languageParts = explode('-', $preferredLanguage);
		$languageOnly = $languageParts[0];
		if (isset($allLanguageCodes[$languageOnly])) {
			$selectedLanguage = $allLanguageCodes[$languageOnly];
			break;
		}
	}
	// "en" is represented by the "default" language
	if (!$selectedLanguage || $selectedLanguage == 'en') {
		$selectedLanguage = 'default';
	}
	return $selectedLanguage;
}
1847
1848 /********************************************
1849 *
1850 * Internal string operation functions
1851 *
1852 ********************************************/
/**
 * Maps all characters of a string in a single byte charset.
 *
 * Each byte of the string is replaced by its entry in the selected mapping
 * table; bytes without an entry are copied unchanged. The string is
 * returned as is if the table cannot be initialized or $mode is unknown.
 *
 * @param string $str The string
 * @param string $charset The charset
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string
 * @todo Define visibility
 */
public function sb_char_mapping($str, $charset, $mode, $opt = '') {
	switch ($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) {
				// No table available: do nothing
				return $str;
			}
			$map = &$this->caseFolding[$charset][$opt];
			break;
		case 'ascii':
			if (!$this->initToASCII($charset)) {
				// No table available: do nothing
				return $str;
			}
			$map = &$this->toASCII[$charset];
			break;
		default:
			return $str;
	}
	$out = '';
	// Iterate up to the byte length instead of probing $str[$i] until it is
	// empty, which reads one offset beyond the end of the string
	$byteCount = strlen($str);
	for ($i = 0; $i < $byteCount; $i++) {
		$c = $str[$i];
		$out .= isset($map[$c]) ? $map[$c] : $c;
	}
	return $out;
}
1893
1894 /********************************************
1895 *
1896 * Internal UTF-8 string operation functions
1897 *
1898 ********************************************/
/**
 * Returns a part of a UTF-8 string.
 * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
 *
 * @param string $str UTF-8 string
 * @param integer $start Start position (character position)
 * @param integer $len Length (in characters)
 * @return string|boolean The substring, or FALSE if a positive $start lies outside the string
 * @see substr()
 * @todo Define visibility
 */
public function utf8_substr($str, $start, $len = NULL) {
	if ((string)$len === '0') {
		return '';
	}
	$byte_start = $this->utf8_char2byte_pos($str, $start);
	if ($byte_start === FALSE) {
		if ($start > 0) {
			// $start outside string length
			return FALSE;
		}
		// No byte position for a $start that is not positive: begin at the first byte
		$byte_start = 0;
	}
	$str = substr($str, $byte_start);
	if ($len == NULL) {
		// No length given: everything from the start position on
		return $str;
	}
	$byte_end = $this->utf8_char2byte_pos($str, $len);
	if ($byte_end === FALSE) {
		// $len outside actual string length:
		// When length is less than zero and exceeds, then we return blank string.
		return $len < 0 ? '' : $str;
	}
	return substr($str, 0, $byte_end);
}
1937
/**
 * Counts the number of characters of a string in UTF-8.
 * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
 *
 * Every byte that is not a continuation byte (10xxxxxx) counts as one
 * character, ie. single bytes (0xxxxxxx) and multi-byte starting bytes
 * (11xxxxxx).
 *
 * @param string $str UTF-8 multibyte character string
 * @return integer The number of characters
 * @see strlen()
 * @todo Define visibility
 */
public function utf8_strlen($str) {
	$n = 0;
	// Iterate up to the byte length instead of probing $str[$i] until it is
	// empty, which reads one offset beyond the end of the string
	$byteCount = strlen($str);
	for ($i = 0; $i < $byteCount; $i++) {
		// Skip continuation bytes (10xxxxxx)
		if ((ord($str[$i]) & 192) != 128) {
			$n++;
		}
	}
	return $n;
}
1961
/**
 * Truncates a string in UTF-8 short at a given byte length.
 *
 * A multi-byte character is never split: if the character containing the
 * last allowed byte does not fit completely, the string is cut in front of
 * it. A character that starts at the very first byte is kept if it fits.
 *
 * @param string $str UTF-8 multibyte character string
 * @param integer $len The byte length
 * @return string The shortened string ('' if $len is not positive)
 * @see mb_strcut()
 * @todo Define visibility
 */
public function utf8_strtrunc($str, $len) {
	if ($len <= 0) {
		return '';
	}
	if ($len > strlen($str)) {
		// Nothing to cut off (and no byte to inspect at the cut position)
		return $str;
	}
	$i = $len - 1;
	// Part of a multibyte sequence
	if (ord($str[$i]) & 128) {
		// Walk back to the starting byte (11xxxxxx) of the sequence
		while ($i > 0 && !(ord($str[$i]) & 64)) {
			$i--;
		}
		if (!(ord($str[$i]) & 64)) {
			// Only continuation bytes up to the beginning of the string
			return '';
		}
		// Sanity check: the number of leading 1-bits of the starting byte
		// is the number of bytes of the sequence
		for ($bc = 0, $mbs = ord($str[$i]); $mbs & 128; $mbs = $mbs << 1) {
			$bc++;
		}
		if ($bc + $i > $len) {
			// The sequence does not fit completely: cut in front of it
			return substr($str, 0, $i);
		}
	}
	return substr($str, 0, $len);
}
1992
/**
 * Find position of first occurrence of a string, both arguments are in UTF-8.
 *
 * Delegates to mb_strpos() or iconv_strpos() when
 * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] selects one of them,
 * otherwise the position is computed with the byte/character translation
 * methods of this class.
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 string to search for
 * @param integer $offset Position to start the search (character position)
 * @return integer|boolean The character position, or FALSE if the needle was not found or the offset lies beyond the string
 * @see strpos()
 * @todo Define visibility
 */
public function utf8_strpos($haystack, $needle, $offset = 0) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strpos($haystack, $needle, $offset, 'utf-8');
	}
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
		return iconv_strpos($haystack, $needle, $offset, 'utf-8');
	}
	// Pure PHP fallback: search byte-wise, then translate the hit back into a character position
	$searchFromByte = $this->utf8_char2byte_pos($haystack, $offset);
	if ($searchFromByte !== FALSE) {
		$foundAtByte = strpos($haystack, $needle, $searchFromByte);
		if ($foundAtByte !== FALSE) {
			return $this->utf8_byte2char_pos($haystack, $foundAtByte);
		}
	}
	// Either the offset lies beyond the string length or the needle was not found
	return FALSE;
}
2021
/**
 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
 *
 * Delegates to mb_strrpos() or iconv_strrpos() when
 * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] selects one of them,
 * otherwise the byte position found by strrpos() is translated into a
 * character position.
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 character to search for (single character)
 * @return integer|boolean The character position, or FALSE if the needle was not found
 * @see strrpos()
 * @todo Define visibility
 */
public function utf8_strrpos($haystack, $needle) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		// The third argument of mb_strrpos() is the offset, the encoding comes fourth.
		// Passing the encoding in third position is a removed legacy form.
		return mb_strrpos($haystack, $needle, 0, 'utf-8');
	} elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
		return iconv_strrpos($haystack, $needle, 'utf-8');
	}
	$byte_pos = strrpos($haystack, $needle);
	if ($byte_pos === FALSE) {
		// Needle not found
		return FALSE;
	}
	return $this->utf8_byte2char_pos($haystack, $byte_pos);
}
2044
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * For positive positions the string is scanned from the start, for
 * negative positions from the end. Bytes that are not continuation bytes
 * (10xxxxxx) are counted as character starts.
 *
 * Note: for a positive position equal to the number of characters the
 * result is FALSE, unless the last character is a multi-byte one, in which
 * case the byte length of the string is returned.
 *
 * @param string $str UTF-8 string
 * @param integer $pos Character position (negative values start from the end)
 * @return integer|boolean Byte position, or FALSE if the position lies outside the string
 * @todo Define visibility
 */
public function utf8_char2byte_pos($str, $pos) {
	$str = (string)$str;
	// All bounds checks use the byte length; probing $str[$i] outside the
	// string raises notices, and negative offsets wrap around on PHP >= 7.1
	$byteLength = strlen($str);
	// Number of characters found
	$n = 0;
	// Number of characters wanted
	$p = abs($pos);
	if ($pos >= 0) {
		$i = 0;
		$d = 1;
	} else {
		$i = $byteLength - 1;
		$d = -1;
	}
	for (; $i >= 0 && $i < $byteLength && $n < $p; $i += $d) {
		// Single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		if ((ord($str[$i]) & 192) !== 128) {
			$n++;
		}
	}
	if ($pos >= 0) {
		if ($i >= $byteLength) {
			// Offset beyond string length
			return FALSE;
		}
		// Skip trailing multi-byte data bytes of the last counted character
		while ($i < $byteLength && (ord($str[$i]) & 192) === 128) {
			$i++;
		}
		return $i;
	}
	if ($n < $p) {
		// The string holds fewer characters than requested
		return FALSE;
	}
	// The scan stopped one byte before the start of the wanted character, correct the offset
	return $i + 1;
}
2091
/**
 * Translates an 'absolute' byte position into a character position.
 *
 * Counts the character starts (bytes that are not continuation bytes) found
 * between byte 1 and $pos. A position equal to the byte length (the end of
 * the string) yields the total number of characters.
 *
 * @param string $str UTF-8 string
 * @param integer $pos Byte position
 * @return integer|boolean Character position, or FALSE for an empty string or a position outside the string
 * @todo Define visibility
 */
public function utf8_byte2char_pos($str, $pos) {
	$str = (string)$str;
	$byteLength = strlen($str);
	// Validate $pos itself: checking only the first byte after the counting
	// loop lets too-large positions through with a meaningless count
	if ($byteLength === 0 || $pos < 0 || $pos > $byteLength) {
		// Offset beyond string length
		return FALSE;
	}
	// Number of characters
	$n = 0;
	if ($pos == $byteLength) {
		// The end of the string is a character boundary as well
		$n++;
		$i = $byteLength - 1;
	} else {
		$i = (int)$pos;
	}
	for (; $i > 0; $i--) {
		// Single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		if ((ord($str[$i]) & 192) !== 128) {
			$n++;
		}
	}
	return $n;
}
2120
/**
 * Maps all characters of an UTF-8 string.
 *
 * The string is split into its characters, and every character that has an
 * entry in the selected mapping table is replaced by the mapped value.
 * Characters without an entry are copied unchanged.
 *
 * @param string $str UTF-8 string
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string
 * @todo Define visibility
 */
public function utf8_char_mapping($str, $mode, $opt = '') {
	if (!$this->initUnicodeData($mode)) {
		// Do nothing
		return $str;
	}
	switch ($mode) {
		case 'case':
			$map = &$this->caseFolding['utf-8'][$opt];
			break;
		case 'ascii':
			$map = &$this->toASCII['utf-8'];
			break;
		default:
			return $str;
	}
	$str = (string)$str;
	$out = '';
	// Loop over the known byte length instead of probing $str[$i] past the end
	$byteLength = strlen($str);
	for ($i = 0; $i < $byteLength; $i++) {
		$c = ord($str[$i]);
		if (($c & 192) == 192) {
			// Multi-byte starting byte (11xxxxxx): the number of leading
			// 1-bits is the number of bytes of the character
			for ($bc = 0; $c & 128; $c = $c << 1) {
				$bc++;
			}
			$mbc = substr($str, $i, $bc);
			$i += $bc - 1;
		} else {
			// Single-byte (0xxxxxxx), or a stray continuation byte of a broken
			// sequence: take the byte as it is, so that the character of the
			// previous iteration is never emitted a second time
			$mbc = $str[$i];
		}
		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}
	return $out;
}
2168
2169 /********************************************
2170 *
2171 * Internal EUC string operation functions
2172 *
2173 * Extended Unix Code:
2174 * ASCII compatible 7bit single bytes chars
2175 * 8bit two byte chars
2176 *
2177 * Shift-JIS is treated as a special case.
2178 *
2179 ********************************************/
/**
 * Cuts a string in the EUC charset family short at a given byte length.
 *
 * The cut never splits a double-byte character: if the byte limit falls
 * between the two bytes of a character, that character is dropped.
 * A non-positive length yields an empty string.
 *
 * @param string $str EUC multibyte character string
 * @param integer $len The byte length
 * @param string $charset The charset ('shift_jis' is treated as a special case)
 * @return string The shortened string
 * @see mb_strcut()
 * @todo Define visibility
 */
public function euc_strtrunc($str, $len, $charset) {
	$str = (string)$str;
	if ($len <= 0) {
		return '';
	}
	$sjis = $charset == 'shift_jis';
	$byteLength = strlen($str);
	for ($i = 0; $i < $byteLength && $i < $len; $i++) {
		$c = ord($str[$i]);
		// First byte of a double-byte character: skip its second byte
		if ($sjis ? ($c >= 128 && $c < 160 || $c >= 224) : $c >= 128) {
			$i++;
		}
	}
	// This test has to come before the end-of-string test: a double-byte
	// character straddling the limit may also be the last one of the string
	if ($i > $len) {
		// We ended on a first byte
		return substr($str, 0, $len - 1);
	}
	if ($i >= $byteLength) {
		// String shorter than (or as long as) the supplied length
		return $str;
	}
	return substr($str, 0, $len);
}
2215
/**
 * Returns a part of a string in the EUC charset family.
 *
 * Character positions are translated into byte offsets with
 * euc_char2byte_pos(); the actual cutting is done with substr().
 * Edge cases are handled the same way as in utf8_substr().
 *
 * @param string $str EUC multibyte character string
 * @param integer $start Start position (character position, negative values count from the end)
 * @param string $charset The charset
 * @param integer $len Length (in characters, negative values count from the end); NULL returns everything up to the end
 * @return string|boolean The substring, or FALSE if a positive $start lies outside the string
 * @todo Define visibility
 */
public function euc_substr($str, $start, $charset, $len = NULL) {
	// An explicit length of zero yields an empty string. Without this check
	// zero would be treated like NULL below and return the whole remainder.
	if ((string)$len === '0') {
		return '';
	}
	$byte_start = $this->euc_char2byte_pos($str, $start, $charset);
	if ($byte_start === FALSE) {
		if ($start > 0) {
			// $start outside string length
			return FALSE;
		}
		// A zero or negative $start that cannot be resolved means "from the very beginning"
		$byte_start = 0;
	}
	$str = substr($str, $byte_start);
	if ($len == NULL) {
		return $str;
	}
	$byte_end = $this->euc_char2byte_pos($str, $len, $charset);
	if ($byte_end === FALSE) {
		// $len outside actual string length: a negative length that exceeds the
		// string leaves nothing over, a positive one returns all that is left
		return $len < 0 ? '' : $str;
	}
	return substr($str, 0, $byte_end);
}
2245
/**
 * Counts the number of characters of a string in the EUC charset family.
 *
 * A byte >= 128 is taken as the first byte of a double-byte character.
 * For 'shift_jis' only bytes in the ranges 128-159 and 224-255 are.
 *
 * @param string $str EUC multibyte character string
 * @param string $charset The charset
 * @return integer The number of characters
 * @see strlen()
 * @todo Define visibility
 */
public function euc_strlen($str, $charset) {
	$str = (string)$str;
	$sjis = $charset == 'shift_jis';
	// Iterate up to the known byte length instead of probing $str[$i] past the
	// end of the string, which raises an "Uninitialized string offset" notice
	$byteLength = strlen($str);
	$n = 0;
	for ($i = 0; $i < $byteLength; $i++) {
		$c = ord($str[$i]);
		// First byte of a double-byte character: skip its second byte
		if ($sjis ? ($c >= 128 && $c < 160 || $c >= 224) : $c >= 128) {
			$i++;
		}
		$n++;
	}
	return $n;
}
2273
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * The string is always scanned from its start, because only the first byte
 * of a double-byte character identifies it as such; the value of a second
 * byte alone does not tell whether it is a second byte.
 *
 * @param string $str EUC multibyte character string
 * @param integer $pos Character position (negative values start from the end)
 * @param string $charset The charset ('shift_jis' is treated as a special case)
 * @return integer|boolean Byte position, or FALSE if the position lies outside the string
 * @todo Define visibility
 */
public function euc_char2byte_pos($str, $pos, $charset) {
	$str = (string)$str;
	$sjis = $charset == 'shift_jis';
	$byteLength = strlen($str);
	// Number of characters wanted
	$p = abs($pos);
	if ($pos >= 0) {
		// Number of characters seen
		$n = 0;
		for ($i = 0; $i < $byteLength && $n < $p; $i++) {
			$c = ord($str[$i]);
			// First byte of a double-byte character: skip its second byte
			if ($sjis ? ($c >= 128 && $c < 160 || $c >= 224) : $c >= 128) {
				$i++;
			}
			$n++;
		}
		// Offset beyond string length
		return $i < $byteLength ? $i : FALSE;
	}
	// Negative position: collect the byte offset of every character start,
	// then pick the wanted one counted from the end
	$starts = array();
	for ($i = 0; $i < $byteLength; $i++) {
		$starts[] = $i;
		$c = ord($str[$i]);
		if ($sjis ? ($c >= 128 && $c < 160 || $c >= 224) : $c >= 128) {
			$i++;
		}
	}
	$index = count($starts) - $p;
	// The string holds fewer characters than requested
	return $index >= 0 ? $starts[$index] : FALSE;
}
2319
/**
 * Maps all characters of a string in the EUC charset family.
 *
 * The string is split into its single- and double-byte characters, and
 * every character that has an entry in the selected mapping table is
 * replaced by the mapped value. Characters without an entry are copied
 * unchanged.
 *
 * @param string $str EUC multibyte character string
 * @param string $charset The charset ('shift_jis' is treated as a special case)
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string
 * @todo Define visibility
 */
public function euc_char_mapping($str, $charset, $mode, $opt = '') {
	switch ($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) {
				// No case folding data for this charset: do nothing
				return $str;
			}
			$map = &$this->caseFolding[$charset][$opt];
			break;
		case 'ascii':
			if (!$this->initToASCII($charset)) {
				// No transliteration data for this charset: do nothing
				return $str;
			}
			$map = &$this->toASCII[$charset];
			break;
		default:
			return $str;
	}
	$str = (string)$str;
	$sjis = $charset == 'shift_jis';
	$out = '';
	// Loop over the known byte length instead of probing $str[$i] past the end
	$byteLength = strlen($str);
	for ($i = 0; $i < $byteLength; $i++) {
		$mbc = $str[$i];
		$c = ord($mbc);
		// A double-byte char
		if ($sjis ? ($c >= 128 && $c < 160 || $c >= 224) : $c >= 128) {
			$mbc = substr($str, $i, 2);
			$i++;
		}
		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}
	return $out;
}
2375
2376 }