c5b9a3c982c4753f3aff5d9f8aa4cabf657e4f2a
[Packages/TYPO3.CMS.git] / typo3 / sysext / core / Classes / Charset / CharsetConverter.php
1 <?php
2 namespace TYPO3\CMS\Core\Charset;
3
4 /*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use TYPO3\CMS\Core\Localization\Locales;
18 use TYPO3\CMS\Core\SingletonInterface;
19 use TYPO3\CMS\Core\Utility\ExtensionManagementUtility;
20 use TYPO3\CMS\Core\Utility\GeneralUtility;
21
22 /**
23 * Notes on UTF-8
24 *
25 * Functions working on UTF-8 strings:
26 *
27 * - strchr/strstr
28 * - strrchr
29 * - substr_count
30 * - implode/explode/join
31 *
32 * Functions nearly working on UTF-8 strings:
33 *
34 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
35 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
36 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
37 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
38 * - preg_*: Support compiled into PHP by default nowadays, but could be unavailable, need to use modifier
39 *
40 * Functions NOT working on UTF-8 strings:
41 *
42 * - str*cmp
43 * - stristr
44 * - stripos
45 * - substr
46 * - strrev
47 * - split/spliti
48 * - ...
49 */
50
51 /**
52 * Class for conversion between charsets
53 */
54 class CharsetConverter implements SingletonInterface
55 {
56
57 /**
58 * Possible strategies for handling multi-byte data
59 * Only used for internal purpose
60 * @internal
61 */
62 const STRATEGY_MBSTRING = 'mbstring';
63 const STRATEGY_ICONV = 'iconv';
64 const STRATEGY_FALLBACK = 'fallback';
65
66 /**
67 * Set to one of the strategies above, based on the availability of the environment.
68 *
69 * @var string
70 */
71 protected $conversionStrategy = null;
72
73 /**
74 * ASCII Value for chars with no equivalent.
75 *
76 * @var int
77 */
78 public $noCharByteVal = 63;
79
80 /**
81 * This is the array where parsed conversion tables are stored (cached)
82 *
83 * @var array
84 */
85 public $parsedCharsets = array();
86
87 /**
88 * An array where case folding data will be stored (cached)
89 *
90 * @var array
91 */
92 public $caseFolding = array();
93
94 /**
95 * An array where charset-to-ASCII mappings are stored (cached)
96 *
97 * @var array
98 */
99 public $toASCII = array();
100
101 /**
102 * This tells the converter which charsets has two bytes per char:
103 *
104 * @var array
105 */
106 public $twoByteSets = array(
107 'ucs-2' => 1
108 );
109
110 /**
111 * This tells the converter which charsets has four bytes per char:
112 *
113 * @var array
114 */
115 public $fourByteSets = array(
116 'ucs-4' => 1, // 4-byte Unicode
117 'utf-32' => 1
118 );
119
120 /**
121 * This tells the converter which charsets use a scheme like the Extended Unix Code:
122 *
123 * @var array
124 */
125 public $eucBasedSets = array(
126 'gb2312' => 1, // Chinese, simplified.
127 'big5' => 1, // Chinese, traditional.
128 'euc-kr' => 1, // Korean
129 'shift_jis' => 1
130 );
131
132 /**
133 * @link http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
134 * @link http://czyborra.com/charsets/iso8859.html
135 *
136 * @var array
137 */
138 public $synonyms = array(
139 'us' => 'ascii',
140 'us-ascii' => 'ascii',
141 'cp819' => 'iso-8859-1',
142 'ibm819' => 'iso-8859-1',
143 'iso-ir-100' => 'iso-8859-1',
144 'iso-ir-101' => 'iso-8859-2',
145 'iso-ir-109' => 'iso-8859-3',
146 'iso-ir-110' => 'iso-8859-4',
147 'iso-ir-144' => 'iso-8859-5',
148 'iso-ir-127' => 'iso-8859-6',
149 'iso-ir-126' => 'iso-8859-7',
150 'iso-ir-138' => 'iso-8859-8',
151 'iso-ir-148' => 'iso-8859-9',
152 'iso-ir-157' => 'iso-8859-10',
153 'iso-ir-179' => 'iso-8859-13',
154 'iso-ir-199' => 'iso-8859-14',
155 'iso-ir-203' => 'iso-8859-15',
156 'csisolatin1' => 'iso-8859-1',
157 'csisolatin2' => 'iso-8859-2',
158 'csisolatin3' => 'iso-8859-3',
159 'csisolatin5' => 'iso-8859-9',
160 'csisolatin8' => 'iso-8859-14',
161 'csisolatin9' => 'iso-8859-15',
162 'csisolatingreek' => 'iso-8859-7',
163 'iso-celtic' => 'iso-8859-14',
164 'latin1' => 'iso-8859-1',
165 'latin2' => 'iso-8859-2',
166 'latin3' => 'iso-8859-3',
167 'latin5' => 'iso-8859-9',
168 'latin6' => 'iso-8859-10',
169 'latin8' => 'iso-8859-14',
170 'latin9' => 'iso-8859-15',
171 'l1' => 'iso-8859-1',
172 'l2' => 'iso-8859-2',
173 'l3' => 'iso-8859-3',
174 'l5' => 'iso-8859-9',
175 'l6' => 'iso-8859-10',
176 'l8' => 'iso-8859-14',
177 'l9' => 'iso-8859-15',
178 'cyrillic' => 'iso-8859-5',
179 'arabic' => 'iso-8859-6',
180 'tis-620' => 'iso-8859-11',
181 'win874' => 'windows-874',
182 'win1250' => 'windows-1250',
183 'win1251' => 'windows-1251',
184 'win1252' => 'windows-1252',
185 'win1253' => 'windows-1253',
186 'win1254' => 'windows-1254',
187 'win1255' => 'windows-1255',
188 'win1256' => 'windows-1256',
189 'win1257' => 'windows-1257',
190 'win1258' => 'windows-1258',
191 'cp1250' => 'windows-1250',
192 'cp1251' => 'windows-1251',
193 'cp1252' => 'windows-1252',
194 'ms-ee' => 'windows-1250',
195 'ms-ansi' => 'windows-1252',
196 'ms-greek' => 'windows-1253',
197 'ms-turk' => 'windows-1254',
198 'winbaltrim' => 'windows-1257',
199 'koi-8ru' => 'koi-8r',
200 'koi8r' => 'koi-8r',
201 'cp878' => 'koi-8r',
202 'mac' => 'macroman',
203 'macintosh' => 'macroman',
204 'euc-cn' => 'gb2312',
205 'x-euc-cn' => 'gb2312',
206 'euccn' => 'gb2312',
207 'cp936' => 'gb2312',
208 'big-5' => 'big5',
209 'cp950' => 'big5',
210 'eucjp' => 'euc-jp',
211 'sjis' => 'shift_jis',
212 'shift-jis' => 'shift_jis',
213 'cp932' => 'shift_jis',
214 'cp949' => 'euc-kr',
215 'utf7' => 'utf-7',
216 'utf8' => 'utf-8',
217 'utf16' => 'utf-16',
218 'utf32' => 'utf-32',
219 'ucs2' => 'ucs-2',
220 'ucs4' => 'ucs-4'
221 );
222
/**
 * Mapping of language identifiers to script names.
 *
 * Three kinds of keys are supported: iso-639-1 language codes, MS (Windows)
 * three-letter language codes and lowercase English language names.
 * The script name is the key used in the script-to-charset tables.
 *
 * @var array
 */
public $lang_to_script = array(
    // iso-639-1 language codes, see http://www.loc.gov/standards/iso639-2/php/code_list.php
    'af' => 'west_european', // Afrikaans
    'ar' => 'arabic',
    'bg' => 'cyrillic', // Bulgarian
    'bs' => 'east_european', // Bosnian
    'cs' => 'east_european', // Czech
    'da' => 'west_european', // Danish
    'de' => 'west_european', // German
    'es' => 'west_european', // Spanish
    'et' => 'estonian',
    'eo' => 'unicode', // Esperanto
    'eu' => 'west_european', // Basque
    'fa' => 'arabic', // Persian
    'fi' => 'west_european', // Finnish
    'fo' => 'west_european', // Faroese
    'fr' => 'west_european', // French
    'ga' => 'west_european', // Irish
    'gl' => 'west_european', // Galician
    'gr' => 'greek',
    'he' => 'hebrew', // Hebrew (since 1998)
    'hi' => 'unicode', // Hindi
    'hr' => 'east_european', // Croatian
    'hu' => 'east_european', // Hungarian
    'iw' => 'hebrew', // Hebrew (til 1998)
    'is' => 'west_european', // Icelandic
    'it' => 'west_european', // Italian
    'ja' => 'japanese',
    'ka' => 'unicode', // Georgian
    'kl' => 'west_european', // Greenlandic
    'km' => 'unicode', // Khmer
    'ko' => 'korean',
    'lt' => 'lithuanian',
    'lv' => 'west_european', // Latvian/Lettish
    'nl' => 'west_european', // Dutch
    'no' => 'west_european', // Norwegian
    'nb' => 'west_european', // Norwegian Bokmal
    'nn' => 'west_european', // Norwegian Nynorsk
    'pl' => 'east_european', // Polish
    'pt' => 'west_european', // Portuguese
    'ro' => 'east_european', // Romanian
    'ru' => 'cyrillic', // Russian
    'sk' => 'east_european', // Slovak
    'sl' => 'east_european', // Slovenian
    'sr' => 'cyrillic', // Serbian
    'sv' => 'west_european', // Swedish
    'sq' => 'albanian', // Albanian
    'th' => 'thai',
    'uk' => 'cyrillic', // Ukrainian
    'vi' => 'vietnamese',
    'zh' => 'chinese',

    // MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
    // http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
    'afk' => 'west_european', // Afrikaans
    'ara' => 'arabic',
    'bgr' => 'cyrillic', // Bulgarian
    'cat' => 'west_european', // Catalan
    'chs' => 'simpl_chinese',
    'cht' => 'trad_chinese',
    'csy' => 'east_european', // Czech
    'dan' => 'west_european', // Danish
    'deu' => 'west_european', // German
    'dea' => 'west_european', // German (Austrian)
    'des' => 'west_european', // German (Swiss)
    'ena' => 'west_european', // English (Australian)
    'enc' => 'west_european', // English (Canadian)
    'eng' => 'west_european', // English
    'enz' => 'west_european', // English (New Zealand)
    'enu' => 'west_european', // English (United States)
    'euq' => 'west_european', // Basque
    'fos' => 'west_european', // Faroese
    'far' => 'arabic', // Persian
    'fin' => 'west_european', // Finnish
    'fra' => 'west_european', // French
    'frb' => 'west_european', // French (Belgian)
    'frc' => 'west_european', // French (Canadian)
    'frs' => 'west_european', // French (Swiss)
    'geo' => 'unicode', // Georgian
    'glg' => 'west_european', // Galician
    'ell' => 'greek',
    'heb' => 'hebrew',
    'hin' => 'unicode', // Hindi
    'hun' => 'east_european', // Hungarian
    'isl' => 'west_european', // Icelandic
    'ita' => 'west_european', // Italian
    'its' => 'west_european', // Italian (Swiss)
    'jpn' => 'japanese',
    'khm' => 'unicode', // Khmer
    'kor' => 'korean',
    'lth' => 'lithuanian',
    'lvi' => 'west_european', // Latvian/Lettish
    'msl' => 'west_european', // Malay
    'nlb' => 'west_european', // Dutch (Belgian)
    'nld' => 'west_european', // Dutch
    'nor' => 'west_european', // Norwegian (bokmal)
    'non' => 'west_european', // Norwegian (nynorsk)
    'plk' => 'east_european', // Polish
    'ptg' => 'west_european', // Portuguese
    'ptb' => 'west_european', // Portuguese (Brazil)
    'rom' => 'east_european', // Romanian
    'rus' => 'cyrillic', // Russian
    'slv' => 'east_european', // Slovenian
    'sky' => 'east_european', // Slovak
    'srl' => 'east_european', // Serbian (Latin)
    'srb' => 'cyrillic', // Serbian (Cyrillic)
    'esp' => 'west_european', // Spanish (trad. sort)
    'esm' => 'west_european', // Spanish (Mexican)
    'esn' => 'west_european', // Spanish (internat. sort)
    'sve' => 'west_european', // Swedish
    'sqi' => 'albanian', // Albanian
    'tha' => 'thai',
    'trk' => 'turkish',
    'ukr' => 'cyrillic', // Ukrainian

    // English language names
    'afrikaans' => 'west_european',
    'albanian' => 'albanian',
    'arabic' => 'arabic',
    'basque' => 'west_european',
    'bosnian' => 'east_european',
    // Bulgarian is written in Cyrillic, consistent with the 'bg' and 'bgr' entries above
    'bulgarian' => 'cyrillic',
    'catalan' => 'west_european',
    'croatian' => 'east_european',
    'czech' => 'east_european',
    'danish' => 'west_european',
    'dutch' => 'west_european',
    'english' => 'west_european',
    'esperanto' => 'unicode',
    'estonian' => 'estonian',
    'faroese' => 'west_european',
    'farsi' => 'arabic',
    'finnish' => 'west_european',
    'french' => 'west_european',
    'galician' => 'west_european',
    'georgian' => 'unicode',
    'german' => 'west_european',
    'greek' => 'greek',
    'greenlandic' => 'west_european',
    'hebrew' => 'hebrew',
    'hindi' => 'unicode',
    'hungarian' => 'east_european',
    'icelandic' => 'west_european',
    'italian' => 'west_european',
    'khmer' => 'unicode',
    'latvian' => 'west_european',
    'lettish' => 'west_european',
    'lithuanian' => 'lithuanian',
    'malay' => 'west_european',
    'norwegian' => 'west_european',
    'persian' => 'arabic',
    'polish' => 'east_european',
    'portuguese' => 'west_european',
    'russian' => 'cyrillic',
    'romanian' => 'east_european',
    'serbian' => 'cyrillic',
    'slovak' => 'east_european',
    'slovenian' => 'east_european',
    'spanish' => 'west_european',
    'swedish' => 'west_european',
    'thai' => 'thai',
    'turkish' => 'turkish',
    'ukrainian' => 'cyrillic',
    // Historic misspellings of "swedish" and "thai"; kept so existing lookups keep working
    'svedish' => 'west_european',
    'that' => 'thai'
);
392
393 /**
394 * Mapping of language (family) names to charsets on Unix
395 *
396 * @var array
397 */
398 public $script_to_charset_unix = array(
399 'west_european' => 'iso-8859-1',
400 'estonian' => 'iso-8859-1',
401 'east_european' => 'iso-8859-2',
402 'baltic' => 'iso-8859-4',
403 'cyrillic' => 'iso-8859-5',
404 'arabic' => 'iso-8859-6',
405 'greek' => 'iso-8859-7',
406 'hebrew' => 'iso-8859-8',
407 'turkish' => 'iso-8859-9',
408 'thai' => 'iso-8859-11', // = TIS-620
409 'lithuanian' => 'iso-8859-13',
410 'chinese' => 'gb2312', // = euc-cn
411 'japanese' => 'euc-jp',
412 'korean' => 'euc-kr',
413 'simpl_chinese' => 'gb2312',
414 'trad_chinese' => 'big5',
415 'vietnamese' => '',
416 'unicode' => 'utf-8',
417 'albanian' => 'utf-8'
418 );
419
420 /**
421 * Mapping of language (family) names to charsets on Windows
422 *
423 * @var array
424 */
425 public $script_to_charset_windows = array(
426 'east_european' => 'windows-1250',
427 'cyrillic' => 'windows-1251',
428 'west_european' => 'windows-1252',
429 'greek' => 'windows-1253',
430 'turkish' => 'windows-1254',
431 'hebrew' => 'windows-1255',
432 'arabic' => 'windows-1256',
433 'baltic' => 'windows-1257',
434 'estonian' => 'windows-1257',
435 'lithuanian' => 'windows-1257',
436 'vietnamese' => 'windows-1258',
437 'thai' => 'cp874',
438 'korean' => 'cp949',
439 'chinese' => 'gb2312',
440 'japanese' => 'shift_jis',
441 'simpl_chinese' => 'gb2312',
442 'trad_chinese' => 'big5',
443 'albanian' => 'windows-1250',
444 'unicode' => 'utf-8'
445 );
446
/**
 * Mapping of locale names to charsets.
 *
 * Keys have to be lowercase: get_locale_charset() lowercases the locale
 * string before it looks it up in this table.
 *
 * @var array
 */
public $locale_to_charset = array(
    'japanese.euc' => 'euc-jp',
    'ja_jp.ujis' => 'euc-jp',
    'korean.euc' => 'euc-kr',
    'sr@latn' => 'iso-8859-2',
    'zh_cn' => 'gb2312',
    'zh_hk' => 'big5',
    'zh_tw' => 'big5'
);
461
462 /**
463 * TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
464 * Empty values means "utf-8"
465 *
466 * @var array
467 */
468 public $charSetArray = array(
469 'af' => '',
470 'ar' => 'iso-8859-6',
471 'ba' => 'iso-8859-2',
472 'bg' => 'windows-1251',
473 'br' => '',
474 'ca' => 'iso-8859-15',
475 'ch' => 'gb2312',
476 'cs' => 'windows-1250',
477 'cz' => 'windows-1250',
478 'da' => '',
479 'de' => '',
480 'dk' => '',
481 'el' => 'iso-8859-7',
482 'eo' => 'utf-8',
483 'es' => '',
484 'et' => 'iso-8859-4',
485 'eu' => '',
486 'fa' => 'utf-8',
487 'fi' => '',
488 'fo' => 'utf-8',
489 'fr' => '',
490 'fr_CA' => '',
491 'ga' => '',
492 'ge' => 'utf-8',
493 'gl' => '',
494 'gr' => 'iso-8859-7',
495 'he' => 'utf-8',
496 'hi' => 'utf-8',
497 'hk' => 'big5',
498 'hr' => 'windows-1250',
499 'hu' => 'iso-8859-2',
500 'is' => 'utf-8',
501 'it' => '',
502 'ja' => 'shift_jis',
503 'jp' => 'shift_jis',
504 'ka' => 'utf-8',
505 'kl' => 'utf-8',
506 'km' => 'utf-8',
507 'ko' => 'euc-kr',
508 'kr' => 'euc-kr',
509 'lt' => 'windows-1257',
510 'lv' => 'utf-8',
511 'ms' => '',
512 'my' => '',
513 'nl' => '',
514 'no' => '',
515 'pl' => 'iso-8859-2',
516 'pt' => '',
517 'pt_BR' => '',
518 'qc' => '',
519 'ro' => 'iso-8859-2',
520 'ru' => 'windows-1251',
521 'se' => '',
522 'si' => 'windows-1250',
523 'sk' => 'windows-1250',
524 'sl' => 'windows-1250',
525 'sq' => 'utf-8',
526 'sr' => 'utf-8',
527 'sv' => '',
528 'th' => 'iso-8859-11',
529 'tr' => 'iso-8859-9',
530 'ua' => 'windows-1251',
531 'uk' => 'windows-1251',
532 'vi' => 'utf-8',
533 'vn' => 'utf-8',
534 'zh' => 'big5'
535 );
536
/**
 * Normalizes a charset name.
 *
 * The name is lowercased, surrounding whitespace is stripped and a known
 * alias (for example "latin1") is resolved to its canonical name.
 *
 * @param string $charset Input charset
 * @return string Normalized charset
 */
public function parse_charset($charset)
{
    $normalized = trim(strtolower($charset));
    return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
}
551
/**
 * Get the charset of a locale.
 *
 * Supported locale formats:
 *
 * ln language
 * ln_CN language / country
 * ln_CN.cs language / country / charset
 * ln_CN.cs@mod language / country / charset / modifier
 *
 * Resolution order: exact match in the locale table, explicit charset part,
 * the "euro" modifier, and finally the script of the language mapped to the
 * default charset of the current operating system.
 *
 * @param string $locale Locale string
 * @return string Charset resolved for locale string
 */
public function get_locale_charset($locale)
{
    $locale = strtolower($locale);
    // Exact locale specific charset? The locale was lowercased above, so the
    // table keys are compared case-insensitively as well.
    $localeCharsets = array_change_key_case($this->locale_to_charset, CASE_LOWER);
    if (isset($localeCharsets[$locale])) {
        return $localeCharsets[$locale];
    }
    // Get modifier; pad so a locale without "@" does not raise an undefined offset notice
    list($locale, $modifier) = array_pad(explode('@', $locale), 2, '');
    // Locale contains charset: use it
    list($locale, $charset) = array_pad(explode('.', $locale), 2, '');
    if ($charset) {
        return $this->parse_charset($charset);
    }
    // Modifier is 'euro' (after charset check, because of xx.utf-8@euro)
    if ($modifier === 'euro') {
        return 'iso-8859-15';
    }
    // Get language; an unknown language has no script and ends up with the OS default
    list($language) = explode('_', $locale);
    $script = isset($this->lang_to_script[$language]) ? $this->lang_to_script[$language] : '';
    if (TYPO3_OS === 'WIN') {
        $scriptCharsets = $this->script_to_charset_windows;
        $defaultCharset = 'windows-1252';
    } else {
        $scriptCharsets = $this->script_to_charset_unix;
        $defaultCharset = 'utf-8';
    }
    // Scripts without an entry, or with an empty one, use the default as well
    return !empty($scriptCharsets[$script]) ? $scriptCharsets[$script] : $defaultCharset;
}
593
594 /********************************************
595 *
596 * Charset Conversion functions
597 *
598 ********************************************/
/**
 * Converts a string from one charset into another.
 *
 * The native conversion of the active strategy (mbstring or iconv) is tried
 * first whenever it can be used; otherwise, or if it fails, the string is
 * routed through UTF-8 using the internal conversion tables.
 *
 * @param string $inputString Input string
 * @param string $fromCharset From charset (the current charset of the string)
 * @param string $toCharset To charset (the output charset wanted)
 * @param bool $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Converted string
 * @see convArray()
 */
public function conv($inputString, $fromCharset, $toCharset, $useEntityForNoChar = false)
{
    if ($fromCharset === $toCharset) {
        return $inputString;
    }
    // The PHP libraries cannot fall back to SGML entities, so they are only
    // usable when no entities are wanted or the target is UTF-8 (which can
    // represent every character anyway).
    if ($toCharset === 'utf-8' || !$useEntityForNoChar) {
        switch ($this->getConversionStrategy()) {
            case self::STRATEGY_MBSTRING:
                // FALSE is returned for unsupported charsets
                $nativeResult = mb_convert_encoding($inputString, $toCharset, $fromCharset);
                break;
            case self::STRATEGY_ICONV:
                $nativeResult = iconv($fromCharset, $toCharset . '//TRANSLIT', $inputString);
                break;
            default:
                $nativeResult = false;
        }
        if ($nativeResult !== false) {
            return $nativeResult;
        }
    }
    // Table based conversion with UTF-8 as the intermediate charset
    $utf8String = $fromCharset === 'utf-8'
        ? $inputString
        : $this->utf8_encode($inputString, $fromCharset);
    return $toCharset === 'utf-8'
        ? $utf8String
        : $this->utf8_decode($utf8String, $toCharset, $useEntityForNoChar);
}
640
/**
 * Converts every string found in an array from one charset into another.
 * Nested arrays are processed recursively, values of other types are left alone.
 * NOTICE: The array is passed by reference and modified in place!
 *
 * @param array $array Input array, possibly multidimensional
 * @param string $fromCharset From charset (the current charset of the string)
 * @param string $toCharset To charset (the output charset wanted)
 * @param bool $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return void
 * @see conv()
 */
public function convArray(&$array, $fromCharset, $toCharset, $useEntityForNoChar = false)
{
    foreach ($array as $index => $unused) {
        if (is_string($array[$index])) {
            $array[$index] = $this->conv($array[$index], $fromCharset, $toCharset, $useEntityForNoChar);
            continue;
        }
        if (is_array($array[$index])) {
            $this->convArray($array[$index], $fromCharset, $toCharset, $useEntityForNoChar);
        }
    }
}
662
/**
 * Converts $str from $charset to UTF-8
 *
 * Bytes up to 127 are copied unchanged (except for two-byte charsets, where
 * every character consists of two bytes). All other characters are looked up
 * in the parsed conversion table of the charset; characters without an entry
 * are replaced by the character with the byte value $this->noCharByteVal.
 *
 * @param string $str String in local charset to convert to UTF-8
 * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return string Output string, converted to UTF-8. Empty string if the charset could not be initialized.
 */
public function utf8_encode($str, $charset)
{
    if ($charset === 'utf-8') {
        return $str;
    }
    // Parse conv. table if not already
    if (!$this->initCharset($charset)) {
        // Without a conversion table nothing can be converted
        return '';
    }
    $localToUtf8 = isset($this->parsedCharsets[$charset]['local'])
        ? $this->parsedCharsets[$charset]['local']
        : array();
    // These do not change while the string is traversed
    $isTwoByteSet = isset($this->twoByteSets[$charset]);
    $isEucBasedSet = isset($this->eucBasedSets[$charset]);
    $noChar = chr($this->noCharByteVal);
    $strLen = strlen($str);
    $outStr = '';
    // Traverse each char in string
    for ($a = 0; $a < $strLen; $a++) {
        $chr = substr($str, $a, 1);
        $ord = ord($chr);
        if ($isTwoByteSet) {
            // The charset has two bytes per char, big endian is assumed.
            // A missing low byte at the end of a truncated string counts as 0.
            $a++;
            $ord2 = $a < $strLen ? ord(substr($str, $a, 1)) : 0;
            $ord = $ord << 8 | $ord2;
        } elseif ($ord > 127) {
            // EUC based charsets use two bytes above 127; both are combined into a 16 bit integer.
            // Shift-JIS: chars between 160 and 223 are single byte
            if ($isEucBasedSet && ($charset !== 'shift_jis' || $ord < 160 || $ord > 223)) {
                $a++;
                $ord2 = $a < $strLen ? ord(substr($str, $a, 1)) : 0;
                $ord = $ord * 256 + $ord2;
            }
        } else {
            // Plain 7-bit character
            $outStr .= $chr;
            continue;
        }
        // Use the UTF-8 sequence from the conversion table, otherwise the "no char" replacement
        $outStr .= isset($localToUtf8[$ord]) ? $localToUtf8[$ord] : $noChar;
    }
    return $outStr;
}
721
/**
 * Converts $str from UTF-8 to $charset
 *
 * Bytes up to 127 are copied unchanged. Multibyte sequences are looked up in
 * the parsed conversion table of the charset. Sequences without an entry
 * become a numeric entity (if $useEntityForNoChar is set) or the character
 * with the byte value $this->noCharByteVal.
 *
 * @param string $str String in UTF-8 to convert to local charset
 * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param bool $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Output string, converted to local charset. Empty string if the charset could not be initialized.
 */
public function utf8_decode($str, $charset, $useEntityForNoChar = false)
{
    if ($charset === 'utf-8') {
        return $str;
    }
    // Parse conv. table if not already
    if (!$this->initCharset($charset)) {
        // Without a conversion table nothing can be converted
        return '';
    }
    $utf8ToLocal = isset($this->parsedCharsets[$charset]['utf8'])
        ? $this->parsedCharsets[$charset]['utf8']
        : array();
    $noChar = chr($this->noCharByteVal);
    $strLen = strlen($str);
    $outStr = '';
    // Traverse each char in UTF-8 string
    for ($a = 0; $a < $strLen; $a++) {
        $chr = substr($str, $a, 1);
        $ord = ord($chr);
        if ($ord <= 127) {
            // Plain 7-bit character
            $outStr .= $chr;
            continue;
        }
        // A lead byte must have the 7th bit set. Otherwise we are in the middle of a byte sequence.
        if (!($ord & 64)) {
            $outStr .= $noChar;
            continue;
        }
        // Collect the complete sequence, starting with the lead byte
        $buf = $chr;
        for ($b = 0; $b < 8; $b++) {
            // Every further high bit of the lead byte announces one more byte in the sequence
            $ord = $ord << 1;
            if (!($ord & 128)) {
                break;
            }
            $a++;
            $buf .= substr($str, $a, 1);
        }
        if (isset($utf8ToLocal[$buf])) {
            // The local number
            $mByte = $utf8ToLocal[$buf];
            // A local number greater than 255 is split into two chars (16 bit word assumed)
            if ($mByte > 255) {
                $outStr .= chr($mByte >> 8 & 255) . chr($mByte & 255);
            } else {
                $outStr .= chr($mByte);
            }
        } elseif ($useEntityForNoChar) {
            // Create num entity:
            $outStr .= '&#' . $this->utf8CharToUnumber($buf, true) . ';';
        } else {
            $outStr .= $noChar;
        }
    }
    return $outStr;
}
789
/**
 * Replaces every multibyte UTF-8 character (everything above 127) by a
 * hexadecimal numeric entity. Bytes that cannot start a sequence are
 * replaced by the character with the byte value $this->noCharByteVal.
 *
 * @param string $str Input string
 * @return string Output string
 */
public function utf8_to_entities($str)
{
    $byteCount = strlen($str);
    $result = '';
    $pos = 0;
    while ($pos < $byteCount) {
        $byte = substr($str, $pos, 1);
        $code = ord($byte);
        if ($code <= 127) {
            // Plain 7-bit character
            $result .= $byte;
        } elseif (($code & 64) === 0) {
            // Not a lead byte: we are in the middle of a byte sequence
            $result .= chr($this->noCharByteVal);
        } else {
            // Each high bit following the topmost one announces one more byte of the sequence
            $trailing = 0;
            for ($mask = 64; $mask > 0 && ($code & $mask); $mask >>= 1) {
                $trailing++;
            }
            $sequence = substr($str, $pos, $trailing + 1);
            $pos += $trailing;
            $result .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
        }
        $pos++;
    }
    return $result;
}
833
/**
 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
 *
 * Only well-formed numeric entities are converted: "&#" followed by decimal
 * digits, or "&#x" / "&#X" followed by hexadecimal digits. Anything else that
 * looks like an entity is left untouched, unless it is a named HTML entity
 * and $alsoStdHtmlEnt is set.
 *
 * @param string $str Input string, UTF-8
 * @param bool $alsoStdHtmlEnt If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
 * @return string Output string
 */
public function entities_to_utf8($str, $alsoStdHtmlEnt = false)
{
    $trans_tbl = array();
    if ($alsoStdHtmlEnt) {
        $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8'));
    }
    // Split into alternating plain text (even keys) and entity names (odd keys)
    $parts = preg_split('/&([#[:alnum:]]*);/', $str, -1, PREG_SPLIT_DELIM_CAPTURE);
    if ($parts === false) {
        // Splitting failed: hand back the input unchanged
        return $str;
    }
    foreach ($parts as $k => $v) {
        // Only take every second element
        if ($k % 2 === 0) {
            continue;
        }
        $entity = '&' . $v . ';';
        if (preg_match('/^#(?:[xX]([0-9a-fA-F]+)|([0-9]+))$/D', $v, $number)) {
            // Dec or hex entities
            $codePoint = $number[1] !== '' ? hexdec($number[1]) : (int)$number[2];
            $parts[$k] = $this->UnumberToChar($codePoint);
        } elseif ($alsoStdHtmlEnt && isset($trans_tbl[$entity])) {
            // Other entities:
            $parts[$k] = $trans_tbl[$entity];
        } else {
            // No conversion (unknown or malformed entity):
            $parts[$k] = $entity;
        }
    }
    return implode('', $parts);
}
874
/**
 * Splits a UTF-8 string into its characters and returns them as an array,
 * either as UNICODE integers or as the UTF-8 characters themselves.
 * Bytes that cannot start a sequence are reported as $this->noCharByteVal.
 *
 * @param string $str Input string, UTF-8
 * @param bool $convEntities If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
 * @param bool $retChar If set, then instead of integer numbers the real UTF-8 char is returned.
 * @return array Output array with the char numbers
 */
public function utf8_to_numberarray($str, $convEntities = false, $retChar = false)
{
    // Entities have to be turned into real characters first
    if ($convEntities) {
        $str = $this->entities_to_utf8($str, 1);
    }
    $length = strlen($str);
    $chars = array();
    for ($offset = 0; $offset < $length; $offset++) {
        $first = substr($str, $offset, 1);
        $code = ord($first);
        if ($code <= 127) {
            // Plain 7-bit character
            $chars[] = $retChar ? chr($code) : $code;
            continue;
        }
        if (!($code & 64)) {
            // Not a lead byte: we are in the middle of a byte sequence
            $chars[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
            continue;
        }
        // Each high bit following the topmost one announces one more byte of the sequence
        $followers = 0;
        for ($bit = 64; $bit > 0 && ($code & $bit); $bit >>= 1) {
            $followers++;
        }
        $sequence = substr($str, $offset, $followers + 1);
        $offset += $followers;
        $chars[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
    }
    return $chars;
}
925
/**
 * Converts a UNICODE number to a UTF-8 multibyte character
 * Algorithm based on script found at From: http://czyborra.com/utf/
 *
 * The bits of the number are spread across the bytes of the sequence; the
 * number of high bits set in the lead byte tells how many bytes it has:
 *
 * bytes | bits | representation
 * 1 | 7 | 0vvvvvvv
 * 2 | 11 | 110vvvvv 10vvvvvv
 * 3 | 16 | 1110vvvv 10vvvvvv 10vvvvvv
 * 4 | 21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 * 5 | 26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 * 6 | 31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * @param int $unicodeInteger UNICODE integer
 * @return string UTF-8 multibyte character string
 * @see utf8CharToUnumber()
 */
public function UnumberToChar($unicodeInteger)
{
    if ($unicodeInteger < 128) {
        // Single byte, no marker bits needed
        return chr($unicodeInteger);
    }
    // Pick the lead byte marker and the number of continuation bytes
    if ($unicodeInteger < 2048) {
        $leadMarker = 192;
        $trailing = 1;
    } elseif ($unicodeInteger < 65536) {
        $leadMarker = 224;
        $trailing = 2;
    } elseif ($unicodeInteger < 2097152) {
        $leadMarker = 240;
        $trailing = 3;
    } elseif ($unicodeInteger < 67108864) {
        $leadMarker = 248;
        $trailing = 4;
    } elseif ($unicodeInteger < 2147483648) {
        $leadMarker = 252;
        $trailing = 5;
    } else {
        // Cannot express a 32-bit character in UTF-8
        return chr($this->noCharByteVal);
    }
    // Lead byte carries the topmost bits ...
    $char = chr($leadMarker | ($unicodeInteger >> (6 * $trailing)));
    // ... every continuation byte the next lower six bits
    for ($shift = 6 * ($trailing - 1); $shift >= 0; $shift -= 6) {
        $char .= chr(128 | (($unicodeInteger >> $shift) & 63));
    }
    return $char;
}
982
/**
 * Converts a UTF-8 Multibyte character to a UNICODE number
 *
 * Only the first character of $str is evaluated. The number of bytes is
 * taken from the lead byte; the payload bits of the lead byte and the low
 * six bits of every following byte are combined into the number. Bytes
 * missing at the end of a truncated sequence count as 0, and so does an
 * empty string.
 *
 * @param string $str UTF-8 multibyte character string
 * @param bool $hex If set, then a hex. number is returned.
 * @return int|string UNICODE integer, or "x" followed by the hexadecimal number if $hex is set
 * @see UnumberToChar()
 */
public function utf8CharToUnumber($str, $hex = false)
{
    $str = (string)$str;
    $length = strlen($str);
    // First char
    $ord = $length > 0 ? ord($str[0]) : 0;
    // This verifies that it IS a multi byte string
    if (($ord & 192) === 192) {
        // Each high bit following the topmost one announces one more byte of the sequence
        $trailing = 0;
        for ($mask = 64; $mask > 0 && ($ord & $mask); $mask >>= 1) {
            $trailing++;
        }
        // Payload of the lead byte: the bits below the length marker.
        // The invalid lead bytes 0xFE and 0xFF have no payload bits at all.
        $int = $ord & (63 >> $trailing);
        for ($i = 1; $i <= $trailing; $i++) {
            $next = $i < $length ? ord($str[$i]) : 0;
            // Every following byte contributes its low six bits
            $int = ($int << 6) | ($next & 63);
        }
    } else {
        $int = $ord;
    }
    return $hex ? 'x' . dechex($int) : $int;
}
1017
1018 /********************************************
1019 *
1020 * Init functions
1021 *
1022 ********************************************/
/**
 * This will initialize a charset for use if it's defined in the 'typo3/sysext/core/Resources/Private/Charsets/csconvtbl/' folder
 * This function is automatically called by the conversion functions
 *
 * The parsed table is stored in $this->parsedCharsets[$charset] with two maps:
 * 'local' (byte value => UTF-8 character) and 'utf8' (UTF-8 character => byte value).
 * Only byte values above 127 are stored.
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param string $charset The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return int|bool Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and loaded (from cache or by parsing).
 * @access private
 */
public function initCharset($charset)
{
    // Only process if the charset is not yet loaded (isset() avoids an undefined index notice)
    if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
        return 1;
    }
    // Conversion table filename:
    $charsetConvTableFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/csconvtbl/' . $charset . '.tbl';
    // Give up if the conversion table is not found:
    if (!($charset && GeneralUtility::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile))) {
        return false;
    }
    // Cache file for charsets:
    // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
    $cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/cs/charset_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $cachedTable = unserialize(GeneralUtility::getUrl($cacheFile));
        // A truncated or corrupt cache file unserializes to FALSE; in that case fall through and re-parse
        if (is_array($cachedTable) && isset($cachedTable['local'], $cachedTable['utf8'])) {
            $this->parsedCharsets[$charset] = $cachedTable;
            return 2;
        }
    }
    // Parse conversion table into lines:
    $lines = GeneralUtility::trimExplode(LF, GeneralUtility::getUrl($charsetConvTableFile), true);
    // Local variable holding the conversion table while it is built:
    $table = array('local' => array(), 'utf8' => array());
    // The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
    $whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
    $detectedType = '';
    foreach ($lines as $value) {
        // Comment line or blanks are ignored.
        if (!trim($value) || $value[0] === '#') {
            continue;
        }
        // Detect type if not done yet: (Done on first real line)
        if (!$detectedType) {
            $detectedType = preg_match($whitespacedPattern, $value) ? 'whitespaced' : 'ms-token';
        }
        if ($detectedType === 'ms-token') {
            $parts = preg_split('/[=:]/', $value, 3);
            if (count($parts) < 2) {
                // Line carries no mapping
                continue;
            }
            list($hexbyte, $utf8) = $parts;
        } else {
            $regA = array();
            if (!preg_match($whitespacedPattern, $value, $regA)) {
                // Line does not follow the detected syntax
                continue;
            }
            $hexbyte = $regA[1];
            $utf8 = 'U+' . $regA[2];
        }
        $decval = hexdec(trim($hexbyte));
        // The 7-bit range is not stored in the table
        if ($decval > 127) {
            // Strip the leading "U+" and convert the code point to its UTF-8 character
            $utf8Char = $this->UnumberToChar(hexdec(substr(trim($utf8), 2)));
            $table['local'][$decval] = $utf8Char;
            $table['utf8'][$utf8Char] = $decval;
        }
    }
    $this->parsedCharsets[$charset] = $table;
    if ($cacheFile) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($table));
    }
    return 2;
}
1089
/**
 * This function initializes all UTF-8 character data tables.
 *
 * It fills $this->caseFolding['utf-8'] (sub tables 'toUpper', 'toLower', 'toTitle')
 * and $this->toASCII['utf-8'] (UTF-8 character => ASCII replacement string).
 * The tables are built from UnicodeData.txt plus the optional files SpecialCasing.txt
 * and Translit.txt and are written to cache files in typo3temp/cs/ afterwards.
 * Regardless of $mode both tables are built when parsing is necessary.
 *
 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
 *
 * @param string $mode Mode ("case", "ascii", ...)
 * @return int|bool Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
public function initUnicodeData($mode = null)
{
    // Cache files
    $cacheFileCase = GeneralUtility::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
    $cacheFileASCII = GeneralUtility::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');
    // Only process if the tables are not yet loaded
    switch ($mode) {
        case 'case':
            if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8'])) {
                return 1;
            }
            // Use cached version if possible (a corrupt cache file is ignored and rebuilt below)
            if ($cacheFileCase && @is_file($cacheFileCase)) {
                $cachedTable = unserialize(GeneralUtility::getUrl($cacheFileCase));
                if (is_array($cachedTable)) {
                    $this->caseFolding['utf-8'] = $cachedTable;
                    return 2;
                }
            }
            break;
        case 'ascii':
            if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8'])) {
                return 1;
            }
            // Use cached version if possible (a corrupt cache file is ignored and rebuilt below)
            if ($cacheFileASCII && @is_file($cacheFileASCII)) {
                $cachedTable = unserialize(GeneralUtility::getUrl($cacheFileASCII));
                if (is_array($cachedTable)) {
                    $this->toASCII['utf-8'] = $cachedTable;
                    return 2;
                }
            }
            break;
    }
    // Process main Unicode data file
    $unicodeDataFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/UnicodeData.txt';
    if (!(GeneralUtility::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
        return false;
    }
    $fh = fopen($unicodeDataFile, 'rb');
    if (!$fh) {
        return false;
    }
    // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
    // Note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
    $this->caseFolding['utf-8'] = array();
    $utf8CaseFolding = &$this->caseFolding['utf-8'];
    // a shorthand
    $utf8CaseFolding['toUpper'] = array();
    $utf8CaseFolding['toLower'] = array();
    $utf8CaseFolding['toTitle'] = array();
    // Array of temp. decompositions
    $decomposition = array();
    // Array of chars that are marks (eg. composing accents)
    $mark = array();
    // Array of chars that are numbers (eg. digits)
    $number = array();
    // Array of chars to be omitted (eg. Russian hard sign)
    $omit = array();
    // fgets() returns FALSE at the end of the file; testing feof() before reading would
    // run the loop body once more with an invalid line
    while (($line = fgets($fh, 4096)) !== false) {
        $fields = explode(';', rtrim($line));
        if (count($fields) < 15) {
            // Blank or malformed line: a record of UnicodeData.txt consists of 15 fields
            continue;
        }
        // Has a lot of info
        list($char, $name, $cat, , , $decomp, , , $num, , , , $upper, $lower, $title) = $fields;
        $ord = hexdec($char);
        if ($ord > 65535) {
            // Only process the BMP
            break;
        }
        $utf8_char = $this->UnumberToChar($ord);
        if ($upper) {
            $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
        }
        if ($lower) {
            $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
        }
        // Store "title" only when different from "upper" (only a few)
        if ($title && $title !== $upper) {
            $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
        }
        switch ($cat[0]) {
            case 'M':
                // mark (accent, umlaut, ...)
                $mark['U+' . $char] = 1;
                break;
            case 'N':
                // numeric value
                if ($ord > 128 && $num !== '') {
                    $number['U+' . $char] = $num;
                }
        }
        // Accented Latin letters without "official" decomposition
        $match = array();
        if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
            $c = ord($match[2]);
            if ($match[1] === 'SMALL') {
                // Lowercase ASCII letters are 32 positions above their uppercase counterparts
                $c += 32;
            }
            $decomposition['U+' . $char] = array(dechex($c));
            continue;
        }
        $match = array();
        if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
            // Note: "continue 2" below leaves the switch AND proceeds with the next line of the file
            switch ($match[1]) {
                case '<circle>':
                    // add parenthesis as circle replacement, eg (1)
                    $match[2] = '0028 ' . $match[2] . ' 0029';
                    break;
                case '<square>':
                    // add square brackets as square replacement, eg [1]
                    $match[2] = '005B ' . $match[2] . ' 005D';
                    break;
                case '<compat>':
                    // ignore multi char decompositions that start with a space
                    if (preg_match('/^0020 /', $match[2])) {
                        continue 2;
                    }
                    break;
                case '<initial>':
                case '<medial>':
                case '<final>':
                case '<isolated>':
                case '<vertical>':
                    continue 2;
            }
            $decomposition['U+' . $char] = explode(' ', $match[2]);
        }
    }
    fclose($fh);
    // Process additional Unicode data for casing (allow folded characters to expand into a sequence)
    $specialCasingFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/SpecialCasing.txt';
    if (GeneralUtility::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
        $fh = fopen($specialCasingFile, 'rb');
        if ($fh) {
            while (($line = fgets($fh, 4096)) !== false) {
                if ($line[0] !== '#' && trim($line) !== '') {
                    list($char, $lower, $title, $upper, $cond) = GeneralUtility::trimExplode(';', $line);
                    // Only unconditional mappings are used (fifth field empty or already the trailing comment)
                    if ($cond === '' || $cond[0] === '#') {
                        $utf8_char = $this->UnumberToChar(hexdec($char));
                        if ($char !== $lower) {
                            $arr = explode(' ', $lower);
                            for ($i = 0; isset($arr[$i]); $i++) {
                                $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
                            }
                            $utf8CaseFolding['toLower'][$utf8_char] = implode('', $arr);
                        }
                        if ($char !== $title && $title !== $upper) {
                            $arr = explode(' ', $title);
                            for ($i = 0; isset($arr[$i]); $i++) {
                                $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
                            }
                            $utf8CaseFolding['toTitle'][$utf8_char] = implode('', $arr);
                        }
                        if ($char !== $upper) {
                            $arr = explode(' ', $upper);
                            for ($i = 0; isset($arr[$i]); $i++) {
                                $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
                            }
                            $utf8CaseFolding['toUpper'][$utf8_char] = implode('', $arr);
                        }
                    }
                }
            }
            fclose($fh);
        }
    }
    // Process custom decompositions
    $customTranslitFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/Translit.txt';
    if (GeneralUtility::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
        $fh = fopen($customTranslitFile, 'rb');
        if ($fh) {
            while (($line = fgets($fh, 4096)) !== false) {
                if ($line[0] !== '#' && trim($line) !== '') {
                    // Pad to two fields so that a line without ";" yields an empty transliteration
                    list($char, $translit) = array_pad(GeneralUtility::trimExplode(';', $line), 2, '');
                    if (!$translit) {
                        $omit['U+' . $char] = 1;
                    }
                    $decomposition['U+' . $char] = explode(' ', $translit);
                }
            }
            fclose($fh);
        }
    }
    // Decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
    foreach ($decomposition as $from => $to) {
        $code_decomp = array();
        while ($code_value = array_shift($to)) {
            // Do recursive decomposition
            if (isset($decomposition['U+' . $code_value])) {
                // Push the nested decomposition back to the front of the queue, keeping its order
                foreach (array_reverse($decomposition['U+' . $code_value]) as $cv) {
                    array_unshift($to, $cv);
                }
            } elseif (!isset($mark['U+' . $code_value])) {
                // Keep everything that is not a mark (marks are removed)
                array_push($code_decomp, $code_value);
            }
        }
        if (!empty($code_decomp) || isset($omit[$from])) {
            $decomposition[$from] = $code_decomp;
        } else {
            unset($decomposition[$from]);
        }
    }
    // Create ascii only mapping
    $this->toASCII['utf-8'] = array();
    $ascii = &$this->toASCII['utf-8'];
    foreach ($decomposition as $from => $to) {
        $code_decomp = array();
        while ($code_value = array_shift($to)) {
            $ord = hexdec($code_value);
            if ($ord > 127) {
                // Skip decompositions containing non-ASCII chars
                continue 2;
            } else {
                array_push($code_decomp, chr($ord));
            }
        }
        $ascii[$this->UnumberToChar(hexdec($from))] = join('', $code_decomp);
    }
    // Add numeric decompositions
    foreach ($number as $from => $to) {
        $utf8_char = $this->UnumberToChar(hexdec($from));
        if (!isset($ascii[$utf8_char])) {
            $ascii[$utf8_char] = $to;
        }
    }
    if ($cacheFileCase) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
    }
    if ($cacheFileASCII) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
    }
    return 3;
}
1328
/**
 * This function initializes the folding table for a charset other than UTF-8.
 * This function is automatically called by the case folding functions.
 *
 * The table is derived from the UTF-8 case folding table: every character of the
 * charset's conversion table is folded in UTF-8 and converted back to the charset.
 * The ASCII letters a-z / A-Z are always added.
 *
 * @param string $charset Charset for which to initialize case folding.
 * @return int|bool Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
public function initCaseFolding($charset)
{
    // Only process if the case table is not yet loaded (isset() avoids an undefined index notice):
    if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset])) {
        return 1;
    }
    // Use cached version if possible; a corrupt cache file is ignored and rebuilt
    $cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/cs/cscase_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $cachedTable = unserialize(GeneralUtility::getUrl($cacheFile));
        if (is_array($cachedTable)) {
            $this->caseFolding[$charset] = $cachedTable;
            return 2;
        }
    }
    // init UTF-8 conversion for this charset
    if (!$this->initCharset($charset)) {
        return false;
    }
    // UTF-8 case folding is used as the base conversion table
    if (!$this->initUnicodeData('case')) {
        return false;
    }
    $nochar = chr($this->noCharByteVal);
    $utf8Folding = $this->caseFolding['utf-8'];
    $folding = array();
    foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
        // Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
        $c = $this->utf8_decode($utf8, $charset);
        foreach (array('toUpper', 'toLower', 'toTitle') as $direction) {
            if (!isset($utf8Folding[$direction][$utf8])) {
                // Character has no folding in this direction
                continue;
            }
            $cc = $this->utf8_decode($utf8Folding[$direction][$utf8], $charset);
            // Skip foldings that cannot be expressed in the target charset
            if ($cc !== '' && $cc !== $nochar) {
                $folding[$direction][$c] = $cc;
            }
        }
    }
    // Add the ASCII case table (upper and lower case letters are 32 positions apart)
    for ($i = ord('a'); $i <= ord('z'); $i++) {
        $folding['toUpper'][chr($i)] = chr($i - 32);
    }
    for ($i = ord('A'); $i <= ord('Z'); $i++) {
        $folding['toLower'][chr($i)] = chr($i + 32);
    }
    $this->caseFolding[$charset] = $folding;
    if ($cacheFile) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($folding));
    }
    return 3;
}
1390
/**
 * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
 * This function is automatically called by the ASCII transliteration functions.
 *
 * The table maps a character of the charset to its ASCII replacement and is derived
 * from the UTF-8 transliteration table ($this->toASCII['utf-8']).
 *
 * @param string $charset Charset for which to initialize conversion.
 * @return int|bool Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
public function initToASCII($charset)
{
    // Only process if the table is not yet loaded (isset() avoids an undefined index notice):
    if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) {
        return 1;
    }
    // Use cached version if possible; a corrupt cache file is ignored and rebuilt
    $cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/cs/csascii_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $cachedTable = unserialize(GeneralUtility::getUrl($cacheFile));
        if (is_array($cachedTable)) {
            $this->toASCII[$charset] = $cachedTable;
            return 2;
        }
    }
    // Init UTF-8 conversion for this charset
    if (!$this->initCharset($charset)) {
        return false;
    }
    // UTF-8/ASCII transliteration is used as the base conversion table
    if (!$this->initUnicodeData('ascii')) {
        return false;
    }
    // Always end up with an array, even if no character of the charset has a transliteration.
    // Otherwise the table would count as "not loaded" and be rebuilt on every call.
    $table = array();
    foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
        if (isset($this->toASCII['utf-8'][$utf8])) {
            // Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
            $c = $this->utf8_decode($utf8, $charset);
            $table[$c] = $this->toASCII['utf-8'][$utf8];
        }
    }
    $this->toASCII[$charset] = $table;
    if ($cacheFile) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($table));
    }
    return 3;
}
1431
1432 /********************************************
1433 *
1434 * String operation functions
1435 *
1436 ********************************************/
/**
 * Returns a part of a string.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * Depending on the conversion strategy the work is delegated to mbstring, iconv
 * or the internal implementations for UTF-8, EUC based, two-byte and four-byte charsets.
 * Every other charset is treated as single-byte encoding.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param int $start Start position (character position)
 * @param int $len Length (in characters), NULL means "up to the end of the string"
 * @return string The substring
 * @see substr(), mb_substr()
 */
public function substr($charset, $string, $start, $len = null)
{
    if ($len === 0 || $string === '') {
        return '';
    }
    $strategy = $this->getConversionStrategy();
    if ($strategy === self::STRATEGY_MBSTRING) {
        // mb_substr() treats a NULL length as "up to the end" (PHP >= 5.4.8),
        // so there is no need to temporarily switch the global internal encoding
        return mb_substr($string, $start, $len, $charset);
    }
    if ($strategy === self::STRATEGY_ICONV) {
        if ($len === null) {
            // The full character count always covers the rest of the string; this avoids
            // the deprecated iconv_set_encoding('internal_encoding', ...)
            $len = iconv_strlen($string, $charset);
        }
        return iconv_substr($string, $start, $len, $charset);
    }
    if ($charset === 'utf-8') {
        return $this->utf8_substr($string, $start, $len);
    }
    if (!empty($this->eucBasedSets[$charset])) {
        return $this->euc_substr($string, $start, $charset, $len);
    }
    // Fixed width encodings: translate character positions into byte positions
    $bytesPerCharacter = 1;
    if (!empty($this->twoByteSets[$charset])) {
        $bytesPerCharacter = 2;
    } elseif (!empty($this->fourByteSets[$charset])) {
        $bytesPerCharacter = 4;
    }
    // Everything else is treated as single-byte encoding ($bytesPerCharacter = 1).
    // A NULL length must not be multiplied: it would become 0 and yield an empty string.
    if ($len === null) {
        return substr($string, $start * $bytesPerCharacter);
    }
    return substr($string, $start * $bytesPerCharacter, $len * $bytesPerCharacter);
}
1491
/**
 * Counts the number of characters.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * For two-byte and four-byte charsets an incomplete trailing character is not counted.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @return int The number of characters
 * @see strlen()
 */
public function strlen($charset, $string)
{
    $strategy = $this->getConversionStrategy();
    if ($strategy === self::STRATEGY_MBSTRING) {
        return mb_strlen($string, $charset);
    }
    if ($strategy === self::STRATEGY_ICONV) {
        return iconv_strlen($string, $charset);
    }
    if ($charset === 'utf-8') {
        return $this->utf8_strlen($string);
    }
    // !empty() instead of a plain array access: unknown charsets must not raise an undefined index notice
    if (!empty($this->eucBasedSets[$charset])) {
        return $this->euc_strlen($string, $charset);
    }
    if (!empty($this->twoByteSets[$charset])) {
        // Cast to int: a plain division returns a float for an odd number of bytes
        return (int)(strlen($string) / 2);
    }
    if (!empty($this->fourByteSets[$charset])) {
        return (int)(strlen($string) / 4);
    }
    // Treat everything else as single-byte encoding
    return strlen($string);
}
1519
/**
 * Crops a string with the help of the mbstring extension.
 *
 * A positive $len keeps the first $len characters and appends $crop, a negative $len
 * keeps the last abs($len) characters and prepends $crop. The string is returned
 * unchanged if $len is 0 or the string is not longer than abs($len) characters.
 *
 * @param string $charset The character set
 * @param string $string String to be cropped
 * @param int $len Crop length (in characters)
 * @param string $crop Crop signifier
 * @return string The shortened string
 * @see mb_strlen(), mb_substr()
 */
protected function cropMbstring($charset, $string, $len, $crop = '')
{
    // Nothing to crop at all
    if ((int)$len === 0) {
        return $string;
    }
    $characterCount = mb_strlen($string, $charset);
    // String is short enough already
    if ($characterCount <= abs($len)) {
        return $string;
    }
    return $len > 0
        ? mb_substr($string, 0, $len, $charset) . $crop
        : $crop . mb_substr($string, $len, $characterCount, $charset);
}
1542
/**
 * Truncates a string and pre-/appends a string.
 * Unit tested by Kasper
 *
 * A positive $len keeps the first $len characters and appends $crop, a negative $len
 * keeps the last abs($len) characters and prepends $crop. If the string does not
 * exceed the requested length, it is returned unchanged (without $crop).
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param int $len Length (in characters)
 * @param string $crop Crop signifier
 * @return string The shortened string
 * @see substr(), mb_strimwidth()
 */
public function crop($charset, $string, $len, $crop = '')
{
    if ($this->getConversionStrategy() === self::STRATEGY_MBSTRING) {
        return $this->cropMbstring($charset, $string, $len, $crop);
    }
    if ((int)$len === 0) {
        return $string;
    }
    // Determine the byte position $i at which the string has to be cut
    if ($charset === 'utf-8') {
        $i = $this->utf8_char2byte_pos($string, $len);
    } elseif (!empty($this->eucBasedSets[$charset])) {
        // !empty() instead of a plain array access: unknown charsets must not raise an undefined index notice
        $i = $this->euc_char2byte_pos($string, $len, $charset);
    } elseif ($len > 0) {
        // Single-byte encoding: character position equals byte position
        $i = $len;
    } else {
        $i = strlen($string) + $len;
        if ($i <= 0) {
            $i = false;
        }
    }
    // $len outside actual string length
    if ($i === false) {
        return $string;
    }
    if ($len > 0) {
        // Only crop if there is something left behind the cut position
        if (isset($string[$i])) {
            return substr($string, 0, $i) . $crop;
        }
    } elseif (isset($string[$i - 1])) {
        // Only crop if there is something in front of the cut position
        return $crop . substr($string, $i);
    }
    return $string;
}
1592
/**
 * Cuts a string short at a given byte length.
 * Multi-byte characters are not split: the result may be shorter than $len bytes.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param int $len The byte length
 * @return string The shortened string
 * @see mb_strcut()
 */
public function strtrunc($charset, $string, $len)
{
    if ($len <= 0) {
        return '';
    }
    if ($this->getConversionStrategy() === self::STRATEGY_MBSTRING) {
        return mb_strcut($string, 0, $len, $charset);
    }
    if ($charset === 'utf-8') {
        return $this->utf8_strtrunc($string, $len);
    }
    // !empty() instead of a plain array access: unknown charsets must not raise an undefined index notice
    if (!empty($this->eucBasedSets[$charset])) {
        return $this->euc_strtrunc($string, $len, $charset);
    }
    if (!empty($this->twoByteSets[$charset])) {
        // Realign to a position divisible by two
        $len -= $len % 2;
    } elseif (!empty($this->fourByteSets[$charset])) {
        // Realign to a position divisible by four
        $len -= $len % 4;
    }
    // Treat everything else as single-byte encoding
    return substr($string, 0, $len);
}
1625
/**
 * Converts the case of all characters of a string, independent of the current locale
 * (unlike strtolower() and strtoupper()).
 * The length of the string may change, eg. lower case German "ß" (sharp S)
 * becomes upper case "SS".
 * Real case folding is language dependent, this method ignores this fact.
 * Unit-tested by Kasper
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert case for
 * @param string $case Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
 * @return string The converted string
 * @see strtolower(), strtoupper()
 */
public function conv_case($charset, $string, $case)
{
    if ($this->getConversionStrategy() === self::STRATEGY_MBSTRING) {
        return $case === 'toLower'
            ? mb_strtolower($string, $charset)
            : mb_strtoupper($string, $charset);
    }
    if ($charset === 'utf-8') {
        return $this->utf8_char_mapping($string, 'case', $case);
    }
    if (isset($this->eucBasedSets[$charset])) {
        return $this->euc_char_mapping($string, $charset, 'case', $case);
    }
    // Any other charset is handled as single-byte encoding
    return $this->sb_char_mapping($string, $charset, 'case', $case);
}
1658
/**
 * Charset aware counterpart of lcfirst()/ucfirst(): converts the case of the
 * first character only and leaves the rest of the string untouched.
 *
 * @param string $charset Character set of string
 * @param string $string Input string
 * @param string $case Case keyword, passed on to conv_case()
 * @return string The string with its first character converted
 * @see \TYPO3\CMS\Core\Charset\CharsetConverter::conv_case()
 */
public function convCaseFirst($charset, $string, $case)
{
    $convertedHead = $this->conv_case($charset, $this->substr($charset, $string, 0, 1), $case);
    return $convertedHead . $this->substr($charset, $string, 1);
}
1675
/**
 * Transliterates special characters (like æøåÆØÅ, umlauts etc) to their ASCII
 * equivalents, which usually consist of two characters (like æ => ae).
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert
 * @return string The converted string
 */
public function specCharsToASCII($charset, $string)
{
    if ($charset === 'utf-8') {
        return $this->utf8_char_mapping($string, 'ascii');
    }
    if (isset($this->eucBasedSets[$charset])) {
        return $this->euc_char_mapping($string, $charset, 'ascii');
    }
    // Any other charset is handled as single-byte encoding
    return $this->sb_char_mapping($string, $charset, 'ascii');
}
1695
/**
 * Converts the language codes that we get from the client (usually HTTP_ACCEPT_LANGUAGE)
 * into a TYPO3-readable language code
 *
 * The codes are evaluated by descending quality ("q" parameter, 1.0 if missing).
 * Codes of equal quality are evaluated in the order sent by the client.
 * For each code the full code is tried first, then the code without its country part.
 *
 * @param string $languageCodesList List of language codes. something like 'de,en-us;q=0.9,de-de;q=0.7,es-cl;q=0.6,en;q=0.4,es;q=0.3,zh;q=0.1'
 * @return string A preferred language that TYPO3 supports, or "default" if none found
 */
public function getPreferredClientLanguage($languageCodesList)
{
    $allLanguageCodes = $this->getAllLanguageCodes();
    // Collect quality and position for each language code
    $candidates = array();
    $position = 0;
    foreach (GeneralUtility::trimExplode(',', $languageCodesList) as $languageRange) {
        $parameters = explode(';', $languageRange);
        $languageCode = trim(array_shift($parameters));
        $quality = 1.0;
        foreach ($parameters as $parameter) {
            $match = array();
            // Whitespace around ";" and "=" is tolerated, parameters other than "q" are ignored
            if (preg_match('/^\s*q\s*=\s*(.*?)\s*$/i', $parameter, $match)) {
                $quality = (float)$match[1];
            }
        }
        if (isset($candidates[$languageCode])) {
            // A repeated code keeps its first position, but the last quality wins
            $candidates[$languageCode]['quality'] = $quality;
        } else {
            $candidates[$languageCode] = array('quality' => $quality, 'position' => $position++);
        }
    }
    // Highest quality first; the position is the tie-breaker, because the sort
    // functions of PHP < 8.0 do not preserve the order of equal elements
    uasort($candidates, function (array $a, array $b) {
        if ($a['quality'] !== $b['quality']) {
            return $a['quality'] > $b['quality'] ? -1 : 1;
        }
        return $a['position'] - $b['position'];
    });
    $selectedLanguage = 'default';
    foreach (array_keys($candidates) as $languageCode) {
        // Numeric looking array keys are returned as integers
        $languageCode = (string)$languageCode;
        if (isset($allLanguageCodes[$languageCode])) {
            $selectedLanguage = $allLanguageCodes[$languageCode];
            break;
        }
        // Strip the country code from the end
        list($primaryLanguageCode) = explode('-', $languageCode);
        if (isset($allLanguageCodes[$primaryLanguageCode])) {
            $selectedLanguage = $allLanguageCodes[$primaryLanguageCode];
            break;
        }
    }
    // English is the "default" language of TYPO3
    if (!$selectedLanguage || $selectedLanguage === 'en') {
        $selectedLanguage = 'default';
    }
    return $selectedLanguage;
}
1736
/**
 * Builds a lookup table of all known language codes, currently only used
 * for getPreferredClientLanguage()
 *
 * @return array Keys are language codes as sent by clients (ISO codes with "-" as separator), values are the TYPO3 language keys
 */
protected function getAllLanguageCodes()
{
    // Start with the languages whose TYPO3 code equals the ISO code
    $isoCodeByTypo3Code = array();
    foreach (array_keys($this->charSetArray) as $typo3Code) {
        $isoCodeByTypo3Code[$typo3Code] = $typo3Code;
    }
    // Languages where the TYPO3 code differs from the ISO code or which need
    // the country part replace the default entry of the same TYPO3 code
    /** @var Locales $localeService */
    $localeService = GeneralUtility::makeInstance(Locales::class);
    foreach ($localeService->getIsoMapping() as $typo3Code => $isoCode) {
        // Use "-" instead of "_" as separator
        $isoCodeByTypo3Code[$typo3Code] = str_replace('_', '-', $isoCode);
    }
    // Turn the ISO codes into the keys, because they are looked up with isset() later on
    return array_flip($isoCodeByTypo3Code);
}
1759
1760 /********************************************
1761 *
1762 * Internal string operation functions
1763 *
1764 ********************************************/
/**
 * Applies a character map to every byte of a string in a single byte charset.
 * Bytes without an entry in the map are copied unchanged. The string is returned
 * as it is if the mode is unknown or the required table could not be initialized.
 *
 * @param string $str The string
 * @param string $charset The charset
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string
 */
public function sb_char_mapping($str, $charset, $mode, $opt = '')
{
    // The loose comparisons are intended: they match the semantics of a switch statement
    if ($mode == 'case') {
        if (!$this->initCaseFolding($charset)) {
            return $str;
        }
        $characterMap = &$this->caseFolding[$charset][$opt];
    } elseif ($mode == 'ascii') {
        if (!$this->initToASCII($charset)) {
            return $str;
        }
        $characterMap = &$this->toASCII[$charset];
    } else {
        return $str;
    }
    $result = '';
    $position = 0;
    while (isset($str[$position])) {
        $byte = $str[$position];
        $result .= isset($characterMap[$byte]) ? $characterMap[$byte] : $byte;
        $position++;
    }
    return $result;
}
1805
1806 /********************************************
1807 *
1808 * Internal UTF-8 string operation functions
1809 *
1810 ********************************************/
/**
 * Extracts a part of a UTF-8 string, addressed by character positions.
 * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
 *
 * @param string $str UTF-8 string
 * @param int $start Start position (character position)
 * @param int $len Length (in characters)
 * @return string|bool The substring, FALSE if a positive $start is outside of the string
 * @see substr()
 */
public function utf8_substr($str, $start, $len = null)
{
    if ((string)$len === '0') {
        return '';
    }
    $startByte = $this->utf8_char2byte_pos($str, $start);
    // A positive $start outside of the string length
    if ($startByte === false && $start > 0) {
        return false;
    }
    $str = substr($str, $startByte);
    // No length given: everything from the start position on
    if ($len == null) {
        return $str;
    }
    $endByte = $this->utf8_char2byte_pos($str, $len);
    if ($endByte !== false) {
        return substr($str, 0, $endByte);
    }
    // $len is outside of the remaining string: a negative length that exceeds
    // the string results in a blank string, a positive one in the whole rest
    return $len < 0 ? '' : $str;
}
1847
/**
 * Returns the number of characters of a UTF-8 string.
 * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
 *
 * @param string $str UTF-8 multibyte character string
 * @return int The number of characters
 * @see strlen()
 */
public function utf8_strlen($str)
{
    $characters = 0;
    $offset = 0;
    while (isset($str[$offset])) {
        // Count single bytes (0xxxxxxx) and starting bytes of multi-byte
        // sequences (11xxxxxx), i.e. everything but continuation bytes (10xxxxxx)
        if ((ord($str[$offset]) & 192) !== 128) {
            $characters++;
        }
        $offset++;
    }
    return $characters;
}
1871
/**
 * Truncates a string in UTF-8 short at a given byte length.
 * A multi-byte character that does not fit completely into $len bytes is dropped,
 * so the result may be shorter than $len bytes.
 *
 * @param string $str UTF-8 multibyte character string
 * @param int $len The byte length
 * @return string The shortened string, a blank string if $len is zero or negative
 * @see mb_strcut()
 */
public function utf8_strtrunc($str, $len)
{
    $str = (string)$str;
    $len = (int)$len;
    if ($len <= 0) {
        return '';
    }
    // The whole string fits (also avoids reading an offset outside of the string)
    if (!isset($str[$len - 1])) {
        return $str;
    }
    $i = $len - 1;
    // Part of a multibyte sequence
    if (ord($str[$i]) & 128) {
        // Walk back to the byte that starts the sequence (bit 6 set)
        for (; $i > 0 && !(ord($str[$i]) & 64); $i--) {
        }
        // Reached the beginning of the string without finding a starting byte (11xxxxxx)
        if ($i === 0 && (ord($str[0]) & 192) !== 192) {
            return '';
        }
        // Sanity check
        for ($bc = 0, $mbs = ord($str[$i]); $mbs & 128; $mbs = $mbs << 1) {
            // Calculate number of bytes (number of leading 1-bits of the starting byte)
            $bc++;
        }
        // The sequence does not fit completely: cut right in front of it
        if ($bc + $i > $len) {
            return substr($str, 0, $i);
        }
    }
    return substr($str, 0, $len);
}
1901
/**
 * Find position of first occurrence of a string, both arguments are in UTF-8.
 *
 * Delegates to mbstring or iconv when one of them is the active conversion
 * strategy; otherwise the character offset is translated to a byte offset,
 * a byte-wise strpos() is performed and the hit is translated back.
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 string to search for
 * @param int $offset Position to start the search
 * @return int|bool The character position, FALSE if the needle was not found or the offset lies beyond the string
 * @see strpos()
 */
public function utf8_strpos($haystack, $needle, $offset = 0)
{
    $strategy = $this->getConversionStrategy();
    if ($strategy === self::STRATEGY_MBSTRING) {
        return mb_strpos($haystack, $needle, $offset, 'utf-8');
    }
    if ($strategy === self::STRATEGY_ICONV) {
        return iconv_strpos($haystack, $needle, $offset, 'utf-8');
    }
    // Fallback: search on byte level
    $startByte = $this->utf8_char2byte_pos($haystack, $offset);
    // FALSE here means the offset lies beyond the string length
    $matchByte = $startByte === false ? false : strpos($haystack, $needle, $startByte);
    // FALSE here means offset out of range or needle not found
    return $matchByte === false ? false : $this->utf8_byte2char_pos($haystack, $matchByte);
}
1930
/**
 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
 *
 * Delegates to mbstring or iconv when one of them is the active conversion
 * strategy; otherwise a byte-wise strrpos() is performed and the byte
 * position is translated into a character position.
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 character to search for (single character)
 * @return int|bool The character position, FALSE if the needle was not found
 * @see strrpos()
 */
public function utf8_strrpos($haystack, $needle)
{
    $strategy = $this->getConversionStrategy();
    if ($strategy === self::STRATEGY_MBSTRING) {
        // The third argument of mb_strrpos() is the offset; the encoding is the fourth.
        return mb_strrpos($haystack, $needle, 0, 'utf-8');
    }
    if ($strategy === self::STRATEGY_ICONV) {
        // iconv_strrpos() has no offset argument, the encoding is the third
        return iconv_strrpos($haystack, $needle, 'utf-8');
    }
    $byte_pos = strrpos($haystack, $needle);
    if ($byte_pos === false) {
        // Needle not found
        return false;
    }
    return $this->utf8_byte2char_pos($haystack, $byte_pos);
}
1953
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * For a non-negative position the string is scanned from the start, for a
 * negative position from the end. Bounds are checked explicitly instead of
 * via isset(): negative string offsets are valid since PHP 7.1, so isset()
 * would wrap around to the end of the string during the backward scan.
 *
 * @param string $str UTF-8 string
 * @param int $pos Character position (negative values start from the end)
 * @return int|bool Byte position, FALSE if the position lies outside the string
 */
public function utf8_char2byte_pos($str, $pos)
{
    $str = (string)$str;
    $length = strlen($str);
    // Number of characters found
    $n = 0;
    // Number of characters wanted
    $p = abs($pos);
    if ($pos >= 0) {
        $i = 0;
        $d = 1;
    } else {
        $i = $length - 1;
        $d = -1;
    }
    for (; $i >= 0 && $i < $length && $n < $p; $i += $d) {
        // Single bytes (0xxxxxxx) and lead bytes (11xxxxxx) start a character,
        // continuation bytes (10xxxxxx) do not
        if ((ord($str[$i]) & 192) !== 128) {
            $n++;
        }
    }
    if ($pos < 0) {
        if ($n < $p) {
            // The string holds fewer characters than requested
            return false;
        }
        // $i is one byte before the start of the wanted character
        return $i + 1;
    }
    if ($i >= $length) {
        // Offset beyond string length
        return false;
    }
    // Skip trailing multi-byte data bytes, without reading past the end
    while ($i < $length && (ord($str[$i]) & 192) === 128) {
        $i++;
    }
    return $i;
}
2000
/**
 * Translates an 'absolute' byte position into a character position.
 *
 * The byte position is expected to point at the first byte of a character.
 * The character position is derived by counting the character start bytes
 * between index 1 and $pos: for a position on a character boundary this
 * equals the number of characters located before it.
 *
 * @param string $str UTF-8 string
 * @param int $pos Byte position
 * @return int|bool Character position, FALSE if the byte position lies outside the string
 */
public function utf8_byte2char_pos($str, $pos)
{
    $str = (string)$str;
    if ($pos < 0 || $pos >= strlen($str)) {
        // Offset beyond string length. Checked up front: reading undefined
        // offsets would count each of them as a single-byte character.
        return false;
    }
    // Number of characters
    $n = 0;
    for ($i = $pos; $i > 0; $i--) {
        // Single bytes (0xxxxxxx) and lead bytes (11xxxxxx) start a character
        if ((ord($str[$i]) & 192) !== 128) {
            $n++;
        }
    }
    return $n;
}
2029
/**
 * Maps all characters of an UTF-8 string.
 *
 * The string is split into characters; every character that has an entry
 * in the selected mapping table is replaced, all others are copied as is.
 * Stray continuation bytes (invalid UTF-8) are copied through unchanged.
 *
 * @param string $str UTF-8 string
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string
 */
public function utf8_char_mapping($str, $mode, $opt = '')
{
    if (!$this->initUnicodeData($mode)) {
        // Do nothing
        return $str;
    }
    switch ($mode) {
        case 'case':
            $map = &$this->caseFolding['utf-8'][$opt];
            break;
        case 'ascii':
            $map = &$this->toASCII['utf-8'];
            break;
        default:
            return $str;
    }
    $out = '';
    for ($i = 0; isset($str[$i]); $i++) {
        $c = ord($str[$i]);
        if (($c & 192) === 192) {
            // Multi-byte starting byte (11xxxxxx): the number of leading
            // 1-bits is the number of bytes of the character
            for ($bc = 0; $c & 128; $c = $c << 1) {
                $bc++;
            }
            $mbc = substr($str, $i, $bc);
            $i += $bc - 1;
        } else {
            // Single byte (0xxxxxxx), or a stray continuation byte (10xxxxxx).
            // The latter must be assigned too, otherwise the character of
            // the previous iteration would be emitted a second time.
            $mbc = $str[$i];
        }
        $out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
    }
    return $out;
}
2077
2078 /********************************************
2079 *
2080 * Internal EUC string operation functions
2081 *
2082 * Extended Unix Code:
2083 * ASCII compatible 7bit single bytes chars
2084 * 8bit two byte chars
2085 *
2086 * Shift-JIS is treated as a special case.
2087 *
2088 ********************************************/
/**
 * Cuts a string in the EUC charset family short at a given byte length.
 *
 * The result never ends in the middle of a double-byte character: if the
 * cut position falls between the two bytes of a character, that character
 * is dropped entirely.
 *
 * @param string $str EUC multibyte character string
 * @param int $len The byte length
 * @param string $charset The charset
 * @return string The shortened string
 * @see mb_strcut()
 */
public function euc_strtrunc($str, $len, $charset)
{
    if (strlen($str) <= $len) {
        // String shorter than supplied length
        return $str;
    }
    $shiftJis = $charset === 'shift_jis';
    for ($i = 0; isset($str[$i]) && $i < $len; $i++) {
        $c = ord($str[$i]);
        if ($shiftJis) {
            // In Shift-JIS the bytes 160-223 are single-byte characters
            if ($c >= 128 && $c < 160 || $c >= 224) {
                $i++;
            }
        } else {
            if ($c >= 128) {
                $i++;
            }
        }
    }
    // This test has to come before any end-of-string test: the last
    // character may straddle the limit and end exactly at the string end.
    if ($i > $len) {
        // We ended on a first byte
        return substr($str, 0, $len - 1);
    }
    return substr($str, 0, $len);
}
2124
/**
 * Returns a part of a string in the EUC charset family.
 *
 * @param string $str EUC multibyte character string
 * @param int $start Start position (character position)
 * @param string $charset The charset
 * @param int $len Length (in characters); NULL returns everything from $start on
 * @return string|bool The substring, FALSE if $start lies outside the string
 */
public function euc_substr($str, $start, $charset, $len = null)
{
    // An explicit length of zero must not be mistaken for "no length given"
    // by the loose NULL comparison below
    if ((string)$len === '0') {
        return '';
    }
    $byte_start = $this->euc_char2byte_pos($str, $start, $charset);
    if ($byte_start === false) {
        // $start outside string length
        return false;
    }
    $str = substr($str, $byte_start);
    if ($len != null) {
        $byte_end = $this->euc_char2byte_pos($str, $len, $charset);
        if ($byte_end === false) {
            // $len outside actual string length: a negative length that
            // exceeds the string leaves nothing, a positive one everything
            return $len < 0 ? '' : $str;
        }
        return substr($str, 0, $byte_end);
    }
    return $str;
}
2154
/**
 * Returns the length of a string in the EUC charset family, measured in characters.
 *
 * A byte of 128 or above introduces a double-byte character. Shift-JIS is
 * the exception: there the bytes 160-223 are single-byte characters.
 *
 * @param string $str EUC multibyte character string
 * @param string $charset The charset
 * @return int The number of characters
 * @see strlen()
 */
public function euc_strlen($str, $charset)
{
    $isShiftJis = $charset === 'shift_jis';
    $characters = 0;
    $offset = 0;
    while (isset($str[$offset])) {
        $byte = ord($str[$offset]);
        $isDoubleByte = $isShiftJis
            ? ($byte >= 128 && $byte < 160 || $byte >= 224)
            : $byte >= 128;
        // Step over the second byte of a double-byte character as well
        $offset += $isDoubleByte ? 2 : 1;
        $characters++;
    }
    return $characters;
}
2182
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * For a non-negative position the string is scanned from the start, for a
 * negative position from the end. Bounds are checked explicitly instead of
 * via isset(): negative string offsets are valid since PHP 7.1, so isset()
 * would wrap around to the end of the string during the backward scan.
 *
 * @param string $str EUC multibyte character string
 * @param int $pos Character position (negative values start from the end)
 * @param string $charset The charset
 * @return int|bool Byte position, FALSE if the position lies outside the string
 */
public function euc_char2byte_pos($str, $pos, $charset)
{
    $str = (string)$str;
    $length = strlen($str);
    $sjis = $charset === 'shift_jis';
    // Number of characters seen
    $n = 0;
    // Number of characters wanted
    $p = abs($pos);
    if ($pos >= 0) {
        $i = 0;
        $d = 1;
    } else {
        $i = $length - 1;
        $d = -1;
    }
    for (; $i >= 0 && $i < $length && $n < $p; $i += $d) {
        $c = ord($str[$i]);
        if ($sjis) {
            // In Shift-JIS the bytes 160-223 are single-byte characters
            if ($c >= 128 && $c < 160 || $c >= 224) {
                $i += $d;
            }
        } else {
            if ($c >= 128) {
                $i += $d;
            }
        }
        $n++;
    }
    if ($pos >= 0) {
        // Offset beyond string length yields FALSE
        return $i < $length ? $i : false;
    }
    if ($n < $p || $i < -1) {
        // Fewer characters than requested, or the scan stepped over the
        // start of the string in the middle of a double-byte character
        return false;
    }
    // Correct offset: $i is one byte before the start of the wanted character
    return $i + 1;
}
2228
/**
 * Maps all characters of a string in the EUC charset family.
 *
 * The string is split into single- and double-byte characters; every
 * character that has an entry in the selected mapping table is replaced,
 * all others are copied as is. If the mapping table for the charset cannot
 * be initialised, or the mode is unknown, the string is returned untouched.
 *
 * @param string $str EUC multibyte character string
 * @param string $charset The charset
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string
 */
public function euc_char_mapping($str, $charset, $mode, $opt = '')
{
    switch ($mode) {
        case 'case':
            if (!$this->initCaseFolding($charset)) {
                // No case folding data available: do nothing
                return $str;
            }
            $map = &$this->caseFolding[$charset][$opt];
            break;
        case 'ascii':
            if (!$this->initToASCII($charset)) {
                // No transliteration data available: do nothing
                return $str;
            }
            $map = &$this->toASCII[$charset];
            break;
        default:
            return $str;
    }
    $isShiftJis = $charset === 'shift_jis';
    $result = '';
    $offset = 0;
    while (isset($str[$offset])) {
        $character = $str[$offset];
        $byte = ord($character);
        // In Shift-JIS the bytes 160-223 are single-byte characters
        $isDoubleByte = $isShiftJis
            ? ($byte >= 128 && $byte < 160 || $byte >= 224)
            : $byte >= 128;
        if ($isDoubleByte) {
            $character = substr($str, $offset, 2);
            $offset++;
        }
        $result .= isset($map[$character]) ? $map[$character] : $character;
        $offset++;
    }
    return $result;
}
2284
/**
 * Returns the strategy used for multi-byte string operations.
 *
 * The strategy is determined on first use from the PHP extensions that are
 * loaded and then remembered: "mbstring" takes precedence over "iconv", and
 * if neither is available the fallback strategy is selected.
 * See http://stackoverflow.com/questions/8233517/what-is-the-difference-between-iconv-and-mb-convert-encoding-in-php
 *
 * @return string could be "mbstring", "iconv" or "fallback"
 */
protected function getConversionStrategy()
{
    if ($this->conversionStrategy !== null) {
        // Already determined by an earlier call
        return $this->conversionStrategy;
    }
    if (extension_loaded('mbstring')) {
        $this->conversionStrategy = self::STRATEGY_MBSTRING;
    } elseif (extension_loaded('iconv')) {
        $this->conversionStrategy = self::STRATEGY_ICONV;
    } else {
        $this->conversionStrategy = self::STRATEGY_FALLBACK;
    }
    return $this->conversionStrategy;
}
2305 }