[TASK] Move XLIFF handling to t3lib
[Packages/TYPO3.CMS.git] / t3lib / class.t3lib_cs.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2003-2011 Kasper Skårhøj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the Typo3 project. The Typo3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 *
17 * This script is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * This copyright notice MUST APPEAR in all copies of the script!
23 ***************************************************************/
24 /**
25 * Class for conversion between charsets.
26 *
27 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
28 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
29 */
30
31
32 /**
33 * Notes on UTF-8
34 *
35 * Functions working on UTF-8 strings:
36 *
37 * - strchr/strstr
38 * - strrchr
39 * - substr_count
40 * - implode/explode/join
41 *
42 * Functions nearly working on UTF-8 strings:
43 *
44 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
45 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
46 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
47 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
48 * - preg_*: UTF-8 support is compiled into PCRE by default nowadays, but could be unavailable; patterns need the "u" modifier
49 *
50 * Functions NOT working on UTF-8 strings:
51 *
52 * - str*cmp
53 * - stristr
54 * - stripos
55 * - substr
56 * - strrev
57 * - split/spliti
58 * - ...
59 *
60 */
61 /**
62 * Class for conversion between charsets
63 *
64 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
65 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
66 * @package TYPO3
67 * @subpackage t3lib
68 */
69 class t3lib_cs {
70 var $noCharByteVal = 63; // ASCII Value for chars with no equivalent.
71
72 // This is the array where parsed conversion tables are stored (cached)
73 var $parsedCharsets = array();
74
75 // An array where case folding data will be stored (cached)
76 var $caseFolding = array();
77
78 // An array where charset-to-ASCII mappings are stored (cached)
79 var $toASCII = array();
80
81 // This tells the converter which charsets has two bytes per char:
82 var $twoByteSets = array(
83 'ucs-2' => 1, // 2-byte Unicode
84 );
85
86 // This tells the converter which charsets has four bytes per char:
87 var $fourByteSets = array(
88 'ucs-4' => 1, // 4-byte Unicode
89 'utf-32' => 1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
90 );
91
92 // This tells the converter which charsets use a scheme like the Extended Unix Code:
93 var $eucBasedSets = array(
94 'gb2312' => 1, // Chinese, simplified.
95 'big5' => 1, // Chinese, traditional.
96 'euc-kr' => 1, // Korean
97 'shift_jis' => 1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
98 );
99
100 // see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
101 // http://czyborra.com/charsets/iso8859.html
var $synonyms = array(
    // Alias (lowercase) => canonical charset name as used by parse_charset().
    'us' => 'ascii',
    'us-ascii' => 'ascii',
    'cp819' => 'iso-8859-1',
    'ibm819' => 'iso-8859-1',
    'iso-ir-100' => 'iso-8859-1',
    'iso-ir-101' => 'iso-8859-2',
    'iso-ir-109' => 'iso-8859-3',
    'iso-ir-110' => 'iso-8859-4',
    'iso-ir-144' => 'iso-8859-5',
    'iso-ir-127' => 'iso-8859-6',
    'iso-ir-126' => 'iso-8859-7',
    'iso-ir-138' => 'iso-8859-8',
    'iso-ir-148' => 'iso-8859-9',
    'iso-ir-157' => 'iso-8859-10',
    'iso-ir-179' => 'iso-8859-13',
    'iso-ir-199' => 'iso-8859-14',
    'iso-ir-203' => 'iso-8859-15',
    'csisolatin1' => 'iso-8859-1',
    'csisolatin2' => 'iso-8859-2',
    'csisolatin3' => 'iso-8859-3',
    'csisolatin5' => 'iso-8859-9',
    'csisolatin8' => 'iso-8859-14',
    'csisolatin9' => 'iso-8859-15',
    'csisolatingreek' => 'iso-8859-7',
    'iso-celtic' => 'iso-8859-14',
    'latin1' => 'iso-8859-1',
    'latin2' => 'iso-8859-2',
    'latin3' => 'iso-8859-3',
    'latin5' => 'iso-8859-9',
    'latin6' => 'iso-8859-10',
    'latin8' => 'iso-8859-14',
    'latin9' => 'iso-8859-15',
    'l1' => 'iso-8859-1',
    'l2' => 'iso-8859-2',
    'l3' => 'iso-8859-3',
    'l5' => 'iso-8859-9',
    'l6' => 'iso-8859-10',
    'l8' => 'iso-8859-14',
    'l9' => 'iso-8859-15',
    'cyrillic' => 'iso-8859-5',
    'arabic' => 'iso-8859-6',
    'tis-620' => 'iso-8859-11',
    'win874' => 'windows-874',
    'win1250' => 'windows-1250',
    'win1251' => 'windows-1251',
    'win1252' => 'windows-1252',
    'win1253' => 'windows-1253',
    'win1254' => 'windows-1254',
    'win1255' => 'windows-1255',
    'win1256' => 'windows-1256',
    'win1257' => 'windows-1257',
    'win1258' => 'windows-1258',
    // 'cp874' is what $script_to_charset_windows yields for Thai, so it needs an alias too.
    'cp874' => 'windows-874',
    'cp1250' => 'windows-1250',
    'cp1251' => 'windows-1251',
    'cp1252' => 'windows-1252',
    'cp1253' => 'windows-1253',
    'cp1254' => 'windows-1254',
    'cp1255' => 'windows-1255',
    'cp1256' => 'windows-1256',
    'cp1257' => 'windows-1257',
    'cp1258' => 'windows-1258',
    'ms-ee' => 'windows-1250',
    'ms-ansi' => 'windows-1252',
    'ms-greek' => 'windows-1253',
    'ms-turk' => 'windows-1254',
    'winbaltrim' => 'windows-1257',
    'koi-8ru' => 'koi-8r',
    'koi8r' => 'koi-8r',
    'cp878' => 'koi-8r',
    'mac' => 'macroman',
    'macintosh' => 'macroman',
    'euc-cn' => 'gb2312',
    'x-euc-cn' => 'gb2312',
    'euccn' => 'gb2312',
    'cp936' => 'gb2312',
    'big-5' => 'big5',
    'cp950' => 'big5',
    'eucjp' => 'euc-jp',
    'sjis' => 'shift_jis',
    'shift-jis' => 'shift_jis',
    'cp932' => 'shift_jis',
    'cp949' => 'euc-kr',
    'utf7' => 'utf-7',
    'utf8' => 'utf-8',
    'utf16' => 'utf-16',
    'utf32' => 'utf-32',
    'ucs2' => 'ucs-2',
    'ucs4' => 'ucs-4',
);
187
// Mapping of language identifiers (lowercase) to script names.
// The script names are the keys of $script_to_charset_unix / $script_to_charset_windows.
var $lang_to_script = array(
    // iso-639-1 language codes, see http://www.loc.gov/standards/iso639-2/php/code_list.php
    'ar' => 'arabic',
    'bg' => 'cyrillic', // Bulgarian
    'bs' => 'east_european', // Bosnian
    'cs' => 'east_european', // Czech
    'da' => 'west_european', // Danish
    'de' => 'west_european', // German
    'el' => 'greek', // Greek (the official ISO 639-1 code)
    'es' => 'west_european', // Spanish
    'et' => 'estonian',
    'eo' => 'unicode', // Esperanto
    'eu' => 'west_european', // Basque
    'fa' => 'arabic', // Persian
    'fi' => 'west_european', // Finnish
    'fo' => 'west_european', // Faroese
    'fr' => 'west_european', // French
    'ga' => 'west_european', // Irish
    'gl' => 'west_european', // Galician
    'gr' => 'greek', // Greek (non-standard code, kept for backward compatibility)
    'he' => 'hebrew', // Hebrew (since 1998)
    'hi' => 'unicode', // Hindi
    'hr' => 'east_european', // Croatian
    'hu' => 'east_european', // Hungarian
    'iw' => 'hebrew', // Hebrew (til 1998)
    'is' => 'west_european', // Icelandic
    'it' => 'west_european', // Italian
    'ja' => 'japanese',
    'ka' => 'unicode', // Georgian
    'kl' => 'west_european', // Greenlandic
    'km' => 'unicode', // Khmer
    'ko' => 'korean',
    'lt' => 'lithuanian',
    'lv' => 'west_european', // Latvian/Lettish
    'nl' => 'west_european', // Dutch
    'no' => 'west_european', // Norwegian
    'nb' => 'west_european', // Norwegian Bokmal
    'nn' => 'west_european', // Norwegian Nynorsk
    'pl' => 'east_european', // Polish
    'pt' => 'west_european', // Portuguese
    'ro' => 'east_european', // Romanian
    'ru' => 'cyrillic', // Russian
    'sk' => 'east_european', // Slovak
    'sl' => 'east_european', // Slovenian
    'sr' => 'cyrillic', // Serbian
    'sv' => 'west_european', // Swedish
    'sq' => 'albanian', // Albanian
    'th' => 'thai',
    'tr' => 'turkish', // Turkish (was missing, so tr_TR fell back to the default charset)
    'uk' => 'cyrillic', // Ukrainian
    'vi' => 'vietnamese',
    'zh' => 'chinese',
    // MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
    // http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
    'ara' => 'arabic',
    'bgr' => 'cyrillic', // Bulgarian
    'cat' => 'west_european', // Catalan
    'chs' => 'simpl_chinese',
    'cht' => 'trad_chinese',
    'csy' => 'east_european', // Czech
    'dan' => 'west_european', // Danish
    'deu' => 'west_european', // German
    'dea' => 'west_european', // German (Austrian)
    'des' => 'west_european', // German (Swiss)
    'ena' => 'west_european', // English (Australian)
    'enc' => 'west_european', // English (Canadian)
    'eng' => 'west_european', // English
    'enz' => 'west_european', // English (New Zealand)
    'enu' => 'west_european', // English (United States)
    'euq' => 'west_european', // Basque
    'fos' => 'west_european', // Faroese
    'far' => 'arabic', // Persian
    'fin' => 'west_european', // Finnish
    'fra' => 'west_european', // French
    'frb' => 'west_european', // French (Belgian)
    'frc' => 'west_european', // French (Canadian)
    'frs' => 'west_european', // French (Swiss)
    'geo' => 'unicode', // Georgian
    'glg' => 'west_european', // Galician
    'ell' => 'greek',
    'heb' => 'hebrew',
    'hin' => 'unicode', // Hindi
    'hun' => 'east_european', // Hungarian
    'isl' => 'west_european', // Icelandic (value was misspelled 'west_euorpean')
    'ita' => 'west_european', // Italian
    'its' => 'west_european', // Italian (Swiss)
    'jpn' => 'japanese',
    'khm' => 'unicode', // Khmer
    'kor' => 'korean',
    'lth' => 'lithuanian',
    'lvi' => 'west_european', // Latvian/Lettish
    'msl' => 'west_european', // Malay
    'nlb' => 'west_european', // Dutch (Belgian)
    'nld' => 'west_european', // Dutch
    'nor' => 'west_european', // Norwegian (bokmal)
    'non' => 'west_european', // Norwegian (nynorsk)
    'plk' => 'east_european', // Polish
    'ptg' => 'west_european', // Portuguese
    'ptb' => 'west_european', // Portuguese (Brazil)
    'rom' => 'east_european', // Romanian
    'rus' => 'cyrillic', // Russian
    'slv' => 'east_european', // Slovenian
    'sky' => 'east_european', // Slovak
    'srl' => 'east_european', // Serbian (Latin)
    'srb' => 'cyrillic', // Serbian (Cyrillic)
    'esp' => 'west_european', // Spanish (trad. sort)
    'esm' => 'west_european', // Spanish (Mexican)
    'esn' => 'west_european', // Spanish (internat. sort)
    'sve' => 'west_european', // Swedish
    'sqi' => 'albanian', // Albanian
    'tha' => 'thai',
    'trk' => 'turkish',
    'ukr' => 'cyrillic', // Ukrainian
    // English language names
    'albanian' => 'albanian',
    'arabic' => 'arabic',
    'basque' => 'west_european',
    'bosnian' => 'east_european',
    'bulgarian' => 'cyrillic', // consistent with 'bg' / 'bgr' above
    'catalan' => 'west_european',
    'croatian' => 'east_european',
    'czech' => 'east_european',
    'danish' => 'west_european',
    'dutch' => 'west_european',
    'english' => 'west_european',
    'esperanto' => 'unicode',
    'estonian' => 'estonian',
    'faroese' => 'west_european',
    'farsi' => 'arabic',
    'finnish' => 'west_european',
    'french' => 'west_european',
    'galician' => 'west_european',
    'georgian' => 'unicode',
    'german' => 'west_european',
    'greek' => 'greek',
    'greenlandic' => 'west_european',
    'hebrew' => 'hebrew',
    'hindi' => 'unicode',
    'hungarian' => 'east_european',
    'icelandic' => 'west_european',
    'italian' => 'west_european',
    'khmer' => 'unicode',
    'latvian' => 'west_european',
    'lettish' => 'west_european',
    'lithuanian' => 'lithuanian',
    'malay' => 'west_european',
    'norwegian' => 'west_european',
    'persian' => 'arabic',
    'polish' => 'east_european',
    'portuguese' => 'west_european',
    'russian' => 'cyrillic',
    'romanian' => 'east_european',
    'serbian' => 'cyrillic',
    'slovak' => 'east_european',
    'slovenian' => 'east_european',
    'spanish' => 'west_european',
    'swedish' => 'west_european',
    'svedish' => 'west_european', // historic misspelling of "swedish", kept for backward compatibility
    'thai' => 'thai',
    'that' => 'thai', // historic misspelling of "thai", kept for backward compatibility
    'turkish' => 'turkish',
    'ukrainian' => 'cyrillic',
);
348
349 // mapping of language (family) names to charsets on Unix
350 var $script_to_charset_unix = array(
351 'west_european' => 'iso-8859-1',
352 'estonian' => 'iso-8859-1',
353 'east_european' => 'iso-8859-2',
354 'baltic' => 'iso-8859-4',
355 'cyrillic' => 'iso-8859-5',
356 'arabic' => 'iso-8859-6',
357 'greek' => 'iso-8859-7',
358 'hebrew' => 'iso-8859-8',
359 'turkish' => 'iso-8859-9',
360 'thai' => 'iso-8859-11', // = TIS-620
361 'lithuanian' => 'iso-8859-13',
362 'chinese' => 'gb2312', // = euc-cn
363 'japanese' => 'euc-jp',
364 'korean' => 'euc-kr',
365 'simpl_chinese' => 'gb2312',
366 'trad_chinese' => 'big5',
367 'vietnamese' => '',
368 'unicode' => 'utf-8',
369 'albanian' => 'utf-8'
370 );
371
372 // mapping of language (family) names to charsets on Windows
373 var $script_to_charset_windows = array(
374 'east_european' => 'windows-1250',
375 'cyrillic' => 'windows-1251',
376 'west_european' => 'windows-1252',
377 'greek' => 'windows-1253',
378 'turkish' => 'windows-1254',
379 'hebrew' => 'windows-1255',
380 'arabic' => 'windows-1256',
381 'baltic' => 'windows-1257',
382 'estonian' => 'windows-1257',
383 'lithuanian' => 'windows-1257',
384 'vietnamese' => 'windows-1258',
385 'thai' => 'cp874',
386 'korean' => 'cp949',
387 'chinese' => 'gb2312',
388 'japanese' => 'shift_jis',
389 'simpl_chinese' => 'gb2312',
390 'trad_chinese' => 'big5',
391 'albanian' => 'windows-1250',
392 'unicode' => 'utf-8'
393 );
394
395 // mapping of locale names to charsets
396 var $locale_to_charset = array(
397 'japanese.euc' => 'euc-jp',
398 'ja_jp.ujis' => 'euc-jp',
399 'korean.euc' => 'euc-kr',
400 'sr@Latn' => 'iso-8859-2',
401 'zh_cn' => 'gb2312',
402 'zh_hk' => 'big5',
403 'zh_tw' => 'big5',
404 );
405
406 // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
407 // Empty values means "iso-8859-1"
408 var $charSetArray = array(
409 'dk' => '',
410 'de' => '',
411 'no' => '',
412 'it' => '',
413 'fr' => '',
414 'es' => '',
415 'nl' => '',
416 'cz' => 'windows-1250',
417 'pl' => 'iso-8859-2',
418 'si' => 'windows-1250',
419 'fi' => '',
420 'tr' => 'iso-8859-9',
421 'se' => '',
422 'pt' => '',
423 'ru' => 'windows-1251',
424 'ro' => 'iso-8859-2',
425 'ch' => 'gb2312',
426 'sk' => 'windows-1250',
427 'lt' => 'windows-1257',
428 'is' => 'utf-8',
429 'hr' => 'windows-1250',
430 'hu' => 'iso-8859-2',
431 'gl' => '',
432 'th' => 'iso-8859-11',
433 'gr' => 'iso-8859-7',
434 'hk' => 'big5',
435 'eu' => '',
436 'bg' => 'windows-1251',
437 'br' => '',
438 'et' => 'iso-8859-4',
439 'ar' => 'iso-8859-6',
440 'he' => 'utf-8',
441 'ua' => 'windows-1251',
442 'jp' => 'shift_jis',
443 'lv' => 'utf-8',
444 'vn' => 'utf-8',
445 'ca' => 'iso-8859-15',
446 'ba' => 'iso-8859-2',
447 'kr' => 'euc-kr',
448 'eo' => 'utf-8',
449 'my' => '',
450 'hi' => 'utf-8',
451 'fo' => 'utf-8',
452 'fa' => 'utf-8',
453 'sr' => 'utf-8',
454 'sq' => 'utf-8',
455 'ge' => 'utf-8',
456 'ga' => '',
457 'km' => 'utf-8',
458 'qc' => '',
459 );
460
461 // TYPO3 specific: Array with the iso names used for each system language in TYPO3:
462 // Missing keys means: same as Typo3
463 var $isoArray = array(
464 'ba' => 'bs',
465 'br' => 'pt_BR',
466 'ch' => 'zh_CN',
467 'cz' => 'cs',
468 'dk' => 'da',
469 'si' => 'sl',
470 'se' => 'sv',
471 'gl' => 'kl',
472 'gr' => 'el',
473 'hk' => 'zh_HK',
474 'kr' => 'ko',
475 'ua' => 'uk',
476 'jp' => 'ja',
477 'qc' => 'fr_CA',
478 'vn' => 'vi',
479 'ge' => 'ka',
480 'ga' => 'gl',
481 );
482
/**
 * Normalizes a charset name: the name is lowercased, surrounding whitespace
 * is stripped and known aliases (see $this->synonyms) are replaced by their
 * canonical name.
 *
 * @param string Charset name as supplied by the caller
 * @return string Normalized charset name; names without a registered alias are returned lowercased and trimmed
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function parse_charset($charset) {
    $normalized = trim(strtolower($charset));

    return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
}
498
/**
 * Get the charset of a locale.
 *
 * ln            language
 * ln_CN         language / country
 * ln_CN.cs      language / country / charset
 * ln_CN.cs@mod  language / country / charset / modifier
 *
 * Resolution order:
 *  1. an exact (case-insensitive) match in $this->locale_to_charset,
 *  2. the charset embedded in the locale string itself,
 *  3. the "euro" modifier (iso-8859-15),
 *  4. the script of the language, mapped to an OS specific charset,
 *  5. the OS default (windows-1252 on Windows, iso-8859-1 otherwise).
 *
 * @param string Locale string
 * @return string Charset resolved for locale string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function get_locale_charset($locale) {
    $locale = strtolower($locale);

    // Exact locale specific charset? The lookup table is lowercased as well,
    // because mixed-case keys such as 'sr@Latn' could never match the lowercased locale.
    $localeMap = array_change_key_case($this->locale_to_charset, CASE_LOWER);
    if (isset($localeMap[$locale])) {
        return $localeMap[$locale];
    }

    // Split off the modifier; a locale without "@" simply has none.
    $parts = explode('@', $locale);
    $locale = $parts[0];
    $modifier = isset($parts[1]) ? $parts[1] : '';

    // Locale contains a charset: use it.
    $parts = explode('.', $locale);
    $locale = $parts[0];
    $charset = isset($parts[1]) ? $parts[1] : '';
    if ($charset) {
        return $this->parse_charset($charset);
    }

    // Modifier is 'euro' (checked after the charset, because of xx.utf-8@euro).
    if ($modifier === 'euro') {
        return 'iso-8859-15';
    }

    // Resolve the script from the language part; the country part is not needed.
    $parts = explode('_', $locale);
    $language = $parts[0];
    $script = isset($this->lang_to_script[$language]) ? $this->lang_to_script[$language] : '';

    if (TYPO3_OS == 'WIN') {
        $scriptMap = $this->script_to_charset_windows;
        $fallback = 'windows-1252';
    } else {
        $scriptMap = $this->script_to_charset_unix;
        $fallback = 'iso-8859-1';
    }

    // Unknown scripts and scripts mapped to an empty charset use the OS default.
    return !empty($scriptMap[$script]) ? $scriptMap[$script] : $fallback;
}
547
548
549 /********************************************
550 *
551 * Charset Conversion functions
552 *
553 ********************************************/
554
/**
 * Converts a string from one charset to another.
 *
 * If both charsets are equal the input is returned untouched. Otherwise the
 * PHP extension configured in $TYPO3_CONF_VARS['SYS']['t3lib_cs_convMethod']
 * (mbstring, iconv or recode) is tried first - but only when no entity
 * fallback is requested or the target is UTF-8. If the extension is not
 * used or reports failure (FALSE), the conversion is done by the built-in
 * tables, going through UTF-8 as intermediate charset.
 *
 * @param string Input string
 * @param string From charset (the current charset of the string)
 * @param string To charset (the output charset wanted)
 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Converted string
 * @see convArray()
 */
function conv($str, $fromCS, $toCS, $useEntityForNoChar = 0) {
    if ($fromCS == $toCS) {
        return $str;
    }

    // PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
    if ($toCS == 'utf-8' || !$useEntityForNoChar) {
        $method = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'];
        $converted = FALSE;

        if ($method == 'mbstring') {
            $converted = mb_convert_encoding($str, $toCS, $fromCS);
        } elseif ($method == 'iconv') {
            $converted = iconv($fromCS, $toCS . '//TRANSLIT', $str);
        } elseif ($method == 'recode') {
            $converted = recode_string($fromCS . '..' . $toCS, $str);
        }

        // FALSE means "not attempted" or "failed": fall back to the TYPO3 conversion below
        if ($converted !== FALSE) {
            return $converted;
        }
    }

    $utf8 = ($fromCS != 'utf-8') ? $this->utf8_encode($str, $fromCS) : $str;

    return ($toCS != 'utf-8') ? $this->utf8_decode($utf8, $toCS, $useEntityForNoChar) : $utf8;
}
605
/**
 * Converts every string found in an array from one charset to another.
 * Nested arrays are processed recursively; values that are neither strings
 * nor arrays are left as they are.
 * NOTICE: The array is passed by reference and modified in place!
 *
 * @param array Input array, possibly multidimensional
 * @param string From charset (the current charset of the strings)
 * @param string To charset (the output charset wanted)
 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return void
 * @see conv()
 */
function convArray(&$array, $fromCS, $toCS, $useEntityForNoChar = 0) {
    foreach ($array as &$element) {
        if (is_array($element)) {
            $this->convArray($element, $fromCS, $toCS, $useEntityForNoChar);
        } elseif (is_string($element)) {
            $element = $this->conv($element, $fromCS, $toCS, $useEntityForNoChar);
        }
    }
    // Drop the reference so later writes to $element cannot touch the last entry
    unset($element);
}
626
/**
 * Converts $str from $charset to UTF-8
 *
 * The string is walked byte by byte. For charsets listed in
 * $this->twoByteSets every character is read as a big-endian 16 bit value.
 * For all other charsets bytes 0-127 are copied unchanged; for charsets
 * listed in $this->eucBasedSets a byte above 127 is combined with the
 * following byte into a 16 bit value (except Shift-JIS bytes 0xA0-0xDF,
 * which are single byte). The resulting number is looked up in the parsed
 * conversion table; numbers without an entry are replaced by the character
 * with the byte value $this->noCharByteVal.
 *
 * @param string String in local charset to convert to UTF-8
 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return string Output string, converted to UTF-8. NULL if the conversion table for $charset could not be initialized.
 */
function utf8_encode($str, $charset) {
    if ($charset === 'utf-8') {
        return $str;
    }

    // Parse the conversion table if that has not happened yet
    if (!$this->initCharset($charset)) {
        return NULL;
    }

    $isTwoByteSet = isset($this->twoByteSets[$charset]);
    $isEucBasedSet = isset($this->eucBasedSets[$charset]);

    $strLen = strlen($str);
    $outStr = '';

    for ($a = 0; $a < $strLen; $a++) {
        $chr = substr($str, $a, 1);
        $ord = ord($chr);

        if ($isTwoByteSet) {
            // Two bytes per char, big endian assumed. substr() instead of the
            // removed $str{...} offset syntax; a missing second byte counts as 0.
            $ord = ($ord << 8) | ord(substr($str, $a + 1, 1));
            $a++;
        } elseif ($ord > 127) {
            // EUC uses two bytes above 127: fetch both and make $ord a 16 bit int.
            // Shift-JIS: bytes between 0xA0 and 0xDF are single byte characters.
            if ($isEucBasedSet && ($charset != 'shift_jis' || $ord < 0xA0 || $ord > 0xDF)) {
                $a++;
                $ord = $ord * 256 + ord(substr($str, $a, 1));
            }
        } else {
            // Plain ASCII 0-127 is one byte and passes through transparently
            $outStr .= $chr;
            continue;
        }

        if (isset($this->parsedCharsets[$charset]['local'][$ord])) {
            $outStr .= $this->parsedCharsets[$charset]['local'][$ord];
        } else {
            // No equivalent char in the conversion table
            $outStr .= chr($this->noCharByteVal);
        }
    }

    return $outStr;
}
679
/**
 * Converts $str from UTF-8 to $charset
 *
 * ASCII bytes (0-127) are copied unchanged. For every multibyte sequence the
 * local character number is looked up in the parsed conversion table and
 * written as one byte, or as two bytes (high byte first) if it is larger
 * than 255. Sequences without an entry become a numeric entity or the
 * character with the byte value $this->noCharByteVal, depending on
 * $useEntityForNoChar. A byte above 127 that cannot start a sequence is
 * replaced by that same "no char" character.
 *
 * @param string String in UTF-8 to convert to local charset
 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Output string, converted to local charset. NULL if the conversion table for $charset could not be initialized.
 */
function utf8_decode($str, $charset, $useEntityForNoChar = 0) {
    if ($charset === 'utf-8') {
        return $str;
    }

    // Parse the conversion table if that has not happened yet
    if (!$this->initCharset($charset)) {
        return NULL;
    }

    $length = strlen($str);
    $result = '';
    $pos = 0;

    while ($pos < $length) {
        $byte = substr($str, $pos, 1);
        $code = ord($byte);

        if ($code <= 127) {
            // Plain ASCII, one byte, transparent
            $result .= $byte;
        } elseif (!($code & 64)) {
            // A lead byte must have the 7th bit set; this is the MIDDLE of a sequence
            $result .= chr($this->noCharByteVal);
        } else {
            // Collect the sequence: every further leading 1-bit announces one more byte
            $sequence = $byte;
            for ($n = 0; $n < 8; $n++) {
                $code <<= 1;
                if (!($code & 128)) {
                    break;
                }
                $pos++;
                $sequence .= substr($str, $pos, 1);
            }

            if (isset($this->parsedCharsets[$charset]['utf8'][$sequence])) {
                $local = $this->parsedCharsets[$charset]['utf8'][$sequence];
                // Numbers above 255 are written as two bytes (16 bit word assumed)
                $result .= ($local > 255)
                    ? chr(($local >> 8) & 255) . chr($local & 255)
                    : chr($local);
            } elseif ($useEntityForNoChar) {
                $result .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
            } else {
                $result .= chr($this->noCharByteVal);
            }
        }

        $pos++;
    }

    return $result;
}
738
/**
 * Converts all chars > 127 to numeric entities.
 *
 * Every UTF-8 multibyte sequence is replaced by a hexadecimal numeric entity
 * as produced by utf8CharToUnumber() in hex mode; ASCII bytes are copied
 * unchanged. A byte above 127 that cannot start a sequence is replaced by
 * the character with the byte value $this->noCharByteVal.
 *
 * @param string Input string
 * @return string Output string
 */
function utf8_to_entities($str) {
    $length = strlen($str);
    $result = '';
    $pos = 0;

    while ($pos < $length) {
        $byte = substr($str, $pos, 1);
        $code = ord($byte);

        if ($code <= 127) {
            // Plain ASCII, one byte, transparent
            $result .= $byte;
        } elseif ($code & 64) {
            // Lead byte: every further leading 1-bit announces one more byte
            $sequence = $byte;
            for ($n = 0; $n < 8; $n++) {
                $code <<= 1;
                if (!($code & 128)) {
                    break;
                }
                $pos++;
                $sequence .= substr($str, $pos, 1);
            }
            $result .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
        } else {
            // MIDDLE of a multibyte sequence: no char exists
            $result .= chr($this->noCharByteVal);
        }

        $pos++;
    }

    return $result;
}
776
/**
 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
 *
 * Only well-formed numeric entities are converted (the hexadecimal marker may
 * be "x" or "X"). Everything else that looks like an entity but is neither
 * numeric nor - if $alsoStdHtmlEnt is set - a known named HTML entity is left
 * in the string unchanged.
 *
 * @param string Input string, UTF-8
 * @param boolean If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
 * @return string Output string
 */
function entities_to_utf8($str, $alsoStdHtmlEnt = 0) {
    $entityToChar = array();
    if ($alsoStdHtmlEnt) {
        // Request the table explicitly in UTF-8: the default charset of this function
        // depends on the PHP version, so relying on it yields wrongly encoded characters.
        $entityToChar = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8'));
    }

    // With PREG_SPLIT_DELIM_CAPTURE the result alternates between plain text (even
    // indexes) and the entity name without "&" and ";" (odd indexes).
    $parts = preg_split('/&([#[:alnum:]]*);/', $str, -1, PREG_SPLIT_DELIM_CAPTURE);
    foreach ($parts as $k => $v) {
        if (!($k % 2)) {
            continue;
        }

        $entity = '&' . $v . ';';
        $match = array();
        if (preg_match('/^#[xX]([0-9a-fA-F]+)$/', $v, $match)) {
            // Hexadecimal entity
            $parts[$k] = $this->UnumberToChar(hexdec($match[1]));
        } elseif (preg_match('/^#([0-9]+)$/', $v, $match)) {
            // Decimal entity
            $parts[$k] = $this->UnumberToChar((int) $match[1]);
        } elseif (isset($entityToChar[$entity])) {
            // Named entity, already UTF-8
            $parts[$k] = $entityToChar[$entity];
        } else {
            // No conversion: restore the original text
            $parts[$k] = $entity;
        }
    }

    return implode('', $parts);
}
809
/**
 * Splits a UTF-8 string into its characters and returns them as an array,
 * either as integer numbers or as the UTF-8 characters themselves.
 * A byte above 127 that cannot start a multibyte sequence is reported as
 * $this->noCharByteVal (or the corresponding character).
 *
 * @param string Input string, UTF-8
 * @param boolean If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
 * @param boolean If set, then instead of integer numbers the real UTF-8 char is returned.
 * @return array Output array with the char numbers
 */
function utf8_to_numberarray($str, $convEntities = 0, $retChar = 0) {
    // Entities are turned into real characters first, if requested
    if ($convEntities) {
        $str = $this->entities_to_utf8($str, 1);
    }

    $length = strlen($str);
    $chars = array();
    $pos = 0;

    while ($pos < $length) {
        $byte = substr($str, $pos, 1);
        $code = ord($byte);

        if ($code <= 127) {
            // Plain ASCII, one byte
            $chars[] = $retChar ? chr($code) : $code;
        } elseif ($code & 64) {
            // Lead byte: every further leading 1-bit announces one more byte
            $sequence = $byte;
            for ($n = 0; $n < 8; $n++) {
                $code <<= 1;
                if (!($code & 128)) {
                    break;
                }
                $pos++;
                $sequence .= substr($str, $pos, 1);
            }
            $chars[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
        } else {
            // MIDDLE of a multibyte sequence: no char exists
            $chars[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
        }

        $pos++;
    }

    return $chars;
}
854
/**
 * Converts a UNICODE number to a UTF-8 multibyte character
 * Algorithm based on script found at From: http://czyborra.com/utf/
 *
 * The bits of the number are spread across the bytes; the number of high
 * bits set in the lead byte announces the length of the sequence:
 *
 *  bytes | bits | representation
 *      1 |    7 | 0vvvvvvv
 *      2 |   11 | 110vvvvv 10vvvvvv
 *      3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
 *      4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * @param integer UNICODE integer
 * @return string UTF-8 multibyte character string; the "no char" character ($this->noCharByteVal) for numbers of 0x80000000 and above
 * @see utf8CharToUnumber()
 */
function UnumberToChar($cbyte) {
    // One byte: plain 7 bit value
    if ($cbyte < 0x80) {
        return chr($cbyte);
    }

    // Multibyte layouts: exclusive upper limit, marker bits of the lead byte, number of trailing bytes
    $layouts = array(
        array(0x800, 0xC0, 1),
        array(0x10000, 0xE0, 2),
        array(0x200000, 0xF0, 3),
        array(0x4000000, 0xF8, 4),
        array(0x80000000, 0xFC, 5),
    );

    foreach ($layouts as $layout) {
        list($limit, $leadMarker, $trailCount) = $layout;
        if ($cbyte < $limit) {
            // The lead byte carries the topmost bits ...
            $char = chr($leadMarker | ($cbyte >> (6 * $trailCount)));
            // ... and every trailing byte the next 6 bits, most significant first
            for ($shift = 6 * ($trailCount - 1); $shift >= 0; $shift -= 6) {
                $char .= chr(0x80 | (($cbyte >> $shift) & 0x3F));
            }
            return $char;
        }
    }

    // Cannot express a 32-bit character in UTF-8
    return chr($this->noCharByteVal);
}
919
/**
 * Converts a UTF-8 Multibyte character to a UNICODE number
 *
 * If the first byte is not a multibyte lead byte (its two top bits are not
 * both set), the value of that byte is returned as is. Otherwise the payload
 * bits of the lead byte and the lower 6 bits of every trailing byte are
 * joined into one number; trailing bytes missing from $str count as 0.
 *
 * @param string UTF-8 multibyte character string
 * @param boolean If set, then a hex. number is returned.
 * @return integer UNICODE integer; with $hex set a string consisting of "x" and the lowercase hexadecimal number
 * @see UnumberToChar()
 */
function utf8CharToUnumber($str, $hex = 0) {
    $lead = ord(substr($str, 0, 1));

    if (($lead & 192) != 192) {
        // Not the start of a multibyte sequence
        $int = $lead;
    } else {
        $trailBits = '';
        $trailCount = 0;
        $probe = $lead;

        // Every further leading 1-bit of the first byte announces one trailing byte
        while ($trailCount < 8) {
            $probe <<= 1;
            if (!($probe & 128)) {
                break;
            }
            // Each trailing byte contributes its lower 6 bits
            $trailBits .= sprintf('%06b', ord(substr($str, $trailCount + 1, 1)) & 0x3F);
            $trailCount++;
        }

        // The lead byte contributes the bits that remain below its length marker
        $leadBits = substr('00000000' . decbin($lead), $trailCount - 6);

        $int = bindec($leadBits . $trailBits);
    }

    return $hex ? 'x' . dechex($int) : $int;
}
951
952
953 /********************************************
954 *
955 * Init functions
956 *
957 ********************************************/
958
/**
 * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
 * This function is automatically called by the conversion functions
 *
 * The parsed table is kept in $this->parsedCharsets[$charset] with two sub arrays:
 * 'local' (byte value => UTF-8 character) and 'utf8' (UTF-8 character => byte value).
 * Only byte values above 127 are stored. The parsed table is cached as a serialized file in typo3temp/cs/.
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param	string		The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return	integer		Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and loaded (from cache or by parsing).
 * @access private
 */
function initCharset($charset) {
		// Only process if the charset is not yet loaded:
	if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
		return 1;
	}

		// Conversion table filename:
	$charsetConvTableFile = PATH_t3lib . 'csconvtbl/' . $charset . '.tbl';
	if (!$charset || !t3lib_div::validPathStr($charsetConvTableFile) || !@is_file($charsetConvTableFile)) {
		return FALSE;
	}

		// Cache file for charsets:
		// Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_' . $charset . '.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$cachedTable = unserialize(t3lib_div::getUrl($cacheFile));
		if (is_array($cachedTable)) {
			$this->parsedCharsets[$charset] = $cachedTable;
			return 2;
		}
			// Unreadable or corrupt cache file: fall through, parse the table again and rewrite the cache
	}

		// Parse conversion table into lines:
	$lines = t3lib_div::trimExplode(LF, t3lib_div::getUrl($charsetConvTableFile), 1);
		// Initialize the internal variable holding the conv. table:
	$this->parsedCharsets[$charset] = array('local' => array(), 'utf8' => array());

	$whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
	$detectedType = '';
	foreach ($lines as $value) {
			// Comment lines or blanks are ignored.
		if (!trim($value) || substr($value, 0, 1) == '#') {
			continue;
		}

			// Detect type if not done yet: (Done on first real line)
			// The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
		if (!$detectedType) {
			$detectedType = preg_match($whitespacedPattern, $value) ? 'whitespaced' : 'ms-token';
		}

		if ($detectedType == 'ms-token') {
				// Pad the result so that a line without separator does not trigger an undefined offset
			list($hexbyte, $utf8) = array_pad(preg_split('/[=:]/', $value, 3), 2, '');
		} else {
			$regA = array();
			if (!preg_match($whitespacedPattern, $value, $regA)) {
					// Line does not follow the detected syntax, nothing to extract
				continue;
			}
			$hexbyte = $regA[1];
			$utf8 = 'U+' . $regA[2];
		}

		$decval = hexdec(trim($hexbyte));
			// The 7-bit ASCII range is identical in all supported charsets and therefore not stored
		if ($decval > 127) {
			$utf8decval = hexdec(substr(trim($utf8), 2));
			$utf8Char = $this->UnumberToChar($utf8decval);
			$this->parsedCharsets[$charset]['local'][$decval] = $utf8Char;
			$this->parsedCharsets[$charset]['utf8'][$utf8Char] = $decval;
		}
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->parsedCharsets[$charset]));
	}

	return 2;
}
1027
/**
 * This function initializes all UTF-8 character data tables.
 *
 * The tables are built from the files in PATH_t3lib.'unidata/': UnicodeData.txt (required),
 * SpecialCasing.txt and Translit.txt (both optional). Only the Basic Multilingual Plane is processed.
 * Both tables ($this->caseFolding['utf-8'] and $this->toASCII['utf-8']) are always built together
 * and written to their cache files in typo3temp/cs/.
 *
 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
 *
 * @param	string		Mode ("case", "ascii", ...)
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initUnicodeData($mode = NULL) {
		// cache files
	$cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
	$cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');

		// Only process if the tables are not yet loaded
	switch ($mode) {
		case 'case':
			if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8'])) {
				return 1;
			}

				// Use cached version if possible (a corrupt cache file is ignored and rebuilt below)
			if ($cacheFileCase && @is_file($cacheFileCase)) {
				$cachedTable = unserialize(t3lib_div::getUrl($cacheFileCase));
				if (is_array($cachedTable)) {
					$this->caseFolding['utf-8'] = $cachedTable;
					return 2;
				}
			}
			break;

		case 'ascii':
			if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8'])) {
				return 1;
			}

				// Use cached version if possible (a corrupt cache file is ignored and rebuilt below)
			if ($cacheFileASCII && @is_file($cacheFileASCII)) {
				$cachedTable = unserialize(t3lib_div::getUrl($cacheFileASCII));
				if (is_array($cachedTable)) {
					$this->toASCII['utf-8'] = $cachedTable;
					return 2;
				}
			}
			break;
	}

		// process main Unicode data file
	$unicodeDataFile = PATH_t3lib . 'unidata/UnicodeData.txt';
	if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
		return FALSE;
	}

	$fh = fopen($unicodeDataFile, 'rb');
	if (!$fh) {
		return FALSE;
	}

		// key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
		// note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
	$this->caseFolding['utf-8'] = array();
	$utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand
	$utf8CaseFolding['toUpper'] = array();
	$utf8CaseFolding['toLower'] = array();
	$utf8CaseFolding['toTitle'] = array();

	$decomposition = array(); // array of temp. decompositions
	$mark = array(); // array of chars that are marks (eg. composing accents)
	$number = array(); // array of chars that are numbers (eg. digits)
	$omit = array(); // array of chars to be omitted (eg. Russian hard sign)

	while (!feof($fh)) {
		$line = fgets($fh, 4096);
		if ($line === FALSE) {
				// end of file or read error; stop instead of looping on a failing handle
			break;
		}
		$line = rtrim($line);
		if ($line === '') {
			continue;
		}

			// has a lot of info; only these of the 15 semicolon separated fields are used:
			// 0 code point, 1 name, 2 general category, 5 decomposition, 8 numeric value, 12 upper, 13 lower, 14 title
		$fields = array_pad(explode(';', $line), 15, '');
		$char = $fields[0];
		$name = $fields[1];
		$cat = $fields[2];
		$decomp = $fields[5];
		$num = $fields[8];
		$upper = $fields[12];
		$lower = $fields[13];
		$title = $fields[14];

		$ord = hexdec($char);
		if ($ord > 0xFFFF) {
			break;
		} // only process the BMP

		$utf8_char = $this->UnumberToChar($ord);

		if ($upper) {
			$utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
		}
		if ($lower) {
			$utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
		}
			// store "title" only when different from "upper" (only a few)
		if ($title && $title != $upper) {
			$utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
		}

			// the first letter of the general category tells the major class
		$majorCategory = isset($cat[0]) ? $cat[0] : '';
		if ($majorCategory === 'M') {
				// mark (accent, umlaut, ...)
			$mark["U+$char"] = 1;
		} elseif ($majorCategory === 'N' && $ord > 0x80 && $num != '') {
				// numeric value
			$number["U+$char"] = $num;
		}

			// accented Latin letters without "official" decomposition
		$match = array();
		if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
			$c = ord($match[2]);
			if ($match[1] == 'SMALL') {
				$c += 32;
			}

			$decomposition["U+$char"] = array(dechex($c));
			continue;
		}

		$match = array();
		if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
			switch ($match[1]) {
				case '<circle>': // add parenthesis as circle replacement, eg (1)
					$match[2] = '0028 ' . $match[2] . ' 0029';
					break;

				case '<square>': // add square brackets as square replacement, eg [1]
					$match[2] = '005B ' . $match[2] . ' 005D';
					break;

				case '<compat>': // ignore multi char decompositions that start with a space
					if (preg_match('/^0020 /', $match[2])) {
						continue 2;
					}
					break;

					// ignore Arabic and vertical layout presentation decomposition
				case '<initial>':
				case '<medial>':
				case '<final>':
				case '<isolated>':
				case '<vertical>':
					continue 2;
			}
			$decomposition["U+$char"] = explode(' ', $match[2]);
		}
	}
	fclose($fh);

		// process additional Unicode data for casing (allow folded characters to expand into a sequence)
	$specialCasingFile = PATH_t3lib . 'unidata/SpecialCasing.txt';
	if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
		$fh = fopen($specialCasingFile, 'rb');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh, 4096);
				if ($line === FALSE) {
					break;
				}
					// skip blank lines and comment lines
				if (trim($line) === '' || $line[0] === '#') {
					continue;
				}

				list($char, $lower, $title, $upper, $cond) = array_pad(t3lib_div::trimExplode(';', $line), 5, '');
					// only unconditional mappings are used: the fifth field is either empty or already the trailing comment
				if ($cond == '' || $cond[0] == '#') {
					$utf8_char = $this->UnumberToChar(hexdec($char));

					$mappings = array();
					if ($char != $lower) {
						$mappings['toLower'] = $lower;
					}
					if ($char != $title && $title != $upper) {
						$mappings['toTitle'] = $title;
					}
					if ($char != $upper) {
						$mappings['toUpper'] = $upper;
					}

						// each mapping is a space separated list of code points which becomes one UTF-8 string
					foreach ($mappings as $direction => $codePoints) {
						$sequence = '';
						foreach (explode(' ', $codePoints) as $codePoint) {
							$sequence .= $this->UnumberToChar(hexdec($codePoint));
						}
						$utf8CaseFolding[$direction][$utf8_char] = $sequence;
					}
				}
			}
			fclose($fh);
		}
	}

		// process custom decompositions
	$customTranslitFile = PATH_t3lib . 'unidata/Translit.txt';
	if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
		$fh = fopen($customTranslitFile, 'rb');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh, 4096);
				if ($line === FALSE) {
					break;
				}
					// skip blank lines and comment lines
				if (trim($line) === '' || $line[0] === '#') {
					continue;
				}

				list($char, $translit) = array_pad(t3lib_div::trimExplode(';', $line), 2, '');
					// an empty transliteration means that the character is dropped from the output
				if (!$translit) {
					$omit["U+$char"] = 1;
				}
				$decomposition["U+$char"] = explode(' ', $translit);
			}
			fclose($fh);
		}
	}

		// decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
	foreach ($decomposition as $from => $to) {
		$code_decomp = array();

		while ($code_value = array_shift($to)) {
			if (isset($decomposition["U+$code_value"])) { // do recursive decomposition
				foreach (array_reverse($decomposition["U+$code_value"]) as $cv) {
					array_unshift($to, $cv);
				}
			} elseif (!isset($mark["U+$code_value"])) { // remove mark
				array_push($code_decomp, $code_value);
			}
		}
		if (count($code_decomp) || isset($omit[$from])) {
			$decomposition[$from] = $code_decomp;
		} else {
			unset($decomposition[$from]);
		}
	}

		// create ascii only mapping
	$this->toASCII['utf-8'] = array();
	$ascii =& $this->toASCII['utf-8'];

	foreach ($decomposition as $from => $to) {
		$code_decomp = array();
		while ($code_value = array_shift($to)) {
			$ord = hexdec($code_value);
			if ($ord > 127) {
				continue 2;
			} // skip decompositions containing non-ASCII chars
			array_push($code_decomp, chr($ord));
		}
			// $from is "U+" followed by the hex code point; strip the prefix before converting
		$ascii[$this->UnumberToChar(hexdec(substr($from, 2)))] = join('', $code_decomp);
	}

		// add numeric decompositions
	foreach ($number as $from => $to) {
		$utf8_char = $this->UnumberToChar(hexdec(substr($from, 2)));
		if (!isset($ascii[$utf8_char])) {
			$ascii[$utf8_char] = $to;
		}
	}

	if ($cacheFileCase) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
	}

	if ($cacheFileASCII) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
	}

	return 3;
}
1286
/**
 * This function initializes the folding table for a charset other than UTF-8.
 * This function is automatically called by the case folding functions.
 *
 * The table is derived from the UTF-8 case folding table: every character of the charset's
 * conversion table is looked up there and the result is converted back to the charset.
 * The plain ASCII letters a-z / A-Z are added afterwards.
 *
 * @param	string		Charset for which to initialize case folding.
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initCaseFolding($charset) {
		// Only process if the case table is not yet loaded:
	if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset])) {
		return 1;
	}

		// Use cached version if possible (a corrupt cache file is ignored and rebuilt below)
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_' . $charset . '.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$cachedTable = unserialize(t3lib_div::getUrl($cacheFile));
		if (is_array($cachedTable)) {
			$this->caseFolding[$charset] = $cachedTable;
			return 2;
		}
	}

		// init UTF-8 conversion for this charset
	if (!$this->initCharset($charset)) {
		return FALSE;
	}

		// UTF-8 case folding is used as the base conversion table
	if (!$this->initUnicodeData('case')) {
		return FALSE;
	}

	$nochar = chr($this->noCharByteVal);
	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
			// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);

		foreach (array('toUpper', 'toLower', 'toTitle') as $direction) {
				// most characters have no mapping for a given direction
			if (!isset($this->caseFolding['utf-8'][$direction][$utf8])) {
				continue;
			}

			$cc = $this->utf8_decode($this->caseFolding['utf-8'][$direction][$utf8], $charset);
				// only keep mappings whose target can be expressed in this charset
			if ($cc != '' && $cc != $nochar) {
				$this->caseFolding[$charset][$direction][$c] = $cc;
			}
		}
	}

		// add the ASCII case table
	for ($i = ord('a'); $i <= ord('z'); $i++) {
		$this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i - 32);
	}
	for ($i = ord('A'); $i <= ord('Z'); $i++) {
		$this->caseFolding[$charset]['toLower'][chr($i)] = chr($i + 32);
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->caseFolding[$charset]));
	}

	return 3;
}
1356
/**
 * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
 * This function is automatically called by the ASCII transliteration functions.
 *
 * The table is derived from the UTF-8 to-ASCII table: every character of the charset's
 * conversion table that has a transliteration there is taken over, keyed by the character
 * in the given charset.
 *
 * @param	string		Charset for which to initialize conversion.
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initToASCII($charset) {
		// Only process if the conversion table is not yet loaded:
	if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) {
		return 1;
	}

		// Use cached version if possible (a corrupt cache file is ignored and rebuilt below)
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_' . $charset . '.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$cachedTable = unserialize(t3lib_div::getUrl($cacheFile));
		if (is_array($cachedTable)) {
			$this->toASCII[$charset] = $cachedTable;
			return 2;
		}
	}

		// init UTF-8 conversion for this charset
	if (!$this->initCharset($charset)) {
		return FALSE;
	}

		// UTF-8/ASCII transliteration is used as the base conversion table
	if (!$this->initUnicodeData('ascii')) {
		return FALSE;
	}

		// Start with an empty table so that an array is stored (and cached) even if nothing can be mapped
	$this->toASCII[$charset] = array();
	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
		if (isset($this->toASCII['utf-8'][$utf8])) {
				// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
			$c = $this->utf8_decode($utf8, $charset);
			$this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
		}
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->toASCII[$charset]));
	}

	return 3;
}
1404
1405
1406 /********************************************
1407 *
1408 * String operation functions
1409 *
1410 ********************************************/
1411
/**
 * Returns a part of a string.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * Depending on $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] the work is done by mbstring
 * or iconv; otherwise by the internal UTF-8 / EUC functions or by byte arithmetic for fixed
 * width and single-byte charsets.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Start position (character position)
 * @param	integer		Length (in characters); if omitted, the rest of the string is returned
 * @return	string		The substring
 * @see substr(), mb_substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function substr($charset, $string, $start, $len = NULL) {
	if ($len === 0 || $string === '') {
		return '';
	}

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
			// cannot omit $len, when specifying charset
		if ($len == NULL) {
			$enc = mb_internal_encoding(); // save internal encoding
			mb_internal_encoding($charset);
			$str = mb_substr($string, $start);
			mb_internal_encoding($enc); // restore internal encoding

			return $str;
		}
		return mb_substr($string, $start, $len, $charset);
	} elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
			// cannot omit $len, when specifying charset
		if ($len == NULL) {
			$enc = iconv_get_encoding('internal_encoding'); // save internal encoding
			iconv_set_encoding('internal_encoding', $charset);
			$str = iconv_substr($string, $start);
			iconv_set_encoding('internal_encoding', $enc); // restore internal encoding

			return $str;
		}
		return iconv_substr($string, $start, $len, $charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_substr($string, $start, $len);
	} elseif (!empty($this->eucBasedSets[$charset])) {
		return $this->euc_substr($string, $start, $charset, $len);
	} elseif (!empty($this->twoByteSets[$charset])) {
			// an omitted length must not be multiplied (NULL * 2 would be a length of 0)
		return $len === NULL ? substr($string, $start * 2) : substr($string, $start * 2, $len * 2);
	} elseif (!empty($this->fourByteSets[$charset])) {
		return $len === NULL ? substr($string, $start * 4) : substr($string, $start * 4, $len * 4);
	}

		// treat everything else as single-byte encoding
	return $len === NULL ? substr($string, $start) : substr($string, $start, $len);
}
1468
/**
 * Counts the number of characters.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * Depending on $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] the work is done by mbstring
 * or iconv; otherwise by the internal UTF-8 / EUC functions or by dividing the byte length for
 * fixed width charsets. Everything else is counted as single-byte encoding.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @return	integer		The number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strlen($charset, $string) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strlen($string, $charset);
	} elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
		return iconv_strlen($string, $charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_strlen($string);
	} elseif (!empty($this->eucBasedSets[$charset])) {
			// !empty() instead of a plain array read: charsets not listed in the set must not raise an undefined index
		return $this->euc_strlen($string, $charset);
	} elseif (!empty($this->twoByteSets[$charset])) {
		return strlen($string) / 2;
	} elseif (!empty($this->fourByteSets[$charset])) {
		return strlen($string) / 4;
	}
		// treat everything else as single-byte encoding
	return strlen($string);
}
1496
/**
 * Crops a string with the help of the mbstring functions.
 * A positive length keeps the beginning of the string and appends the crop signifier,
 * a negative length keeps the end of the string and prepends it.
 * The string is returned untouched if the length is 0 or the string is not longer than abs(length).
 *
 * @param	string		The character set
 * @param	string		String to be cropped
 * @param	integer		Crop length (in characters)
 * @param	string		Crop signifier
 * @return	string		The shortened string
 * @see mb_strlen(), mb_substr()
 */
protected function cropMbstring($charset, $string, $len, $crop = '') {
	if (intval($len) === 0) {
		return $string;
	}

	$characterCount = mb_strlen($string, $charset);
	if ($characterCount <= abs($len)) {
			// short enough already
		return $string;
	}

	return $len > 0
		? mb_substr($string, 0, $len, $charset) . $crop
		: $crop . mb_substr($string, $len, $characterCount, $charset);
}
1520
/**
 * Truncates a string and pre-/appends a string.
 * Unit tested by Kasper
 *
 * A positive $len keeps the first $len characters and appends $crop, a negative $len keeps the
 * last abs($len) characters and prepends $crop. The string is returned unchanged if $len is 0
 * or if the string is not long enough to be cut.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Length (in characters)
 * @param	string		Crop signifier
 * @return	string		The shortened string
 * @see substr(), mb_strimwidth()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function crop($charset, $string, $len, $crop = '') {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return $this->cropMbstring($charset, $string, $len, $crop);
	}

	if (intval($len) == 0) {
		return $string;
	}

		// Find the byte position of the cut; FALSE means that $len is outside the actual string length
	if ($charset == 'utf-8') {
		$i = $this->utf8_char2byte_pos($string, $len);
	} elseif (!empty($this->eucBasedSets[$charset])) {
		$i = $this->euc_char2byte_pos($string, $len, $charset);
	} else {
		if ($len > 0) {
			$i = $len;
		} else {
			$i = strlen($string) + $len;
			if ($i <= 0) {
				$i = FALSE;
			}
		}
	}

	if ($i === FALSE) {
		return $string;
	}

	if ($len > 0) {
			// Only crop if there is at least one byte behind the cut position. A byte test is used
			// instead of comparing character counts to avoid measuring the string once more.
		if (isset($string[$i])) {
			return substr($string, 0, $i) . $crop;
		}
	} else {
			// Only crop if there is at least one byte in front of the cut position
		if ($i > 0 && isset($string[$i - 1])) {
			return $crop . substr($string, $i);
		}
	}

	return $string;
}
1583
/**
 * Cuts a string short at a given byte length.
 * The cut is moved back so that it does not fall inside a multi-byte character: UTF-8 and EUC
 * charsets are handled by their own functions, fixed width charsets by aligning the length to
 * a multiple of the character width. A length of 0 or less yields an empty string.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		The byte length
 * @return	string		The shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strtrunc($charset, $string, $len) {
	if ($len <= 0) {
		return '';
	}

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strcut($string, 0, $len, $charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_strtrunc($string, $len);
	} elseif (!empty($this->eucBasedSets[$charset])) {
			// !empty() instead of a plain array read: charsets not listed in the set must not raise an undefined index
		return $this->euc_strtrunc($string, $len, $charset);
	} elseif (!empty($this->twoByteSets[$charset])) {
			// don't cut at odd positions
		if ($len % 2) {
			$len--;
		}
	} elseif (!empty($this->fourByteSets[$charset])) {
			// realign to position dividable by four
		$len -= $len % 4;
	}
		// treat everything else as single-byte encoding
	return substr($string, 0, $len);
}
1616
/**
 * Converts the case of all characters of a string, independent of the current locale
 * (unlike strtolower() and strtoupper()).
 * Note that the string length may change!
 * eg. lower case German "ß" (sharp S) becomes upper case "SS"
 * Unit-tested by Kasper
 * Real case folding is language dependent, this method ignores this fact.
 *
 * @param	string		Character set of string
 * @param	string		Input string to convert case for
 * @param	string		Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
 * @return	string		The converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper()
 */
function conv_case($charset, $string, $case) {
		// mbstring handles all charsets on its own
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return $case == 'toLower'
			? mb_strtolower($string, $charset)
			: mb_strtoupper($string, $charset);
	}

	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string, 'case', $case);
	}

	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string, $charset, 'case', $case);
	}

		// anything left is handled as single-byte encoding
	return $this->sb_char_mapping($string, $charset, 'case', $case);
}
1650
/**
 * Transliterates special chars (like æøåÆØÅ, umlauts etc) to their ascii equivalents
 * (usually double-bytes, like æ => ae etc.)
 * The mapping function is chosen by charset: UTF-8, EUC based, or single-byte for everything else.
 *
 * @param	string		Character set of string
 * @param	string		Input string to convert
 * @return	string		The converted string
 */
function specCharsToASCII($charset, $string) {
	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string, 'ascii');
	}

	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string, $charset, 'ascii');
	}

		// anything left is handled as single-byte encoding
	return $this->sb_char_mapping($string, $charset, 'ascii');
}
1670
1671
/**
 * Converts the language codes that we get from the client (usually HTTP_ACCEPT_LANGUAGE)
 * into a TYPO3-readable language code.
 *
 * The codes are tried by descending quality ("q" value, 1.0 if not given); codes with the same
 * quality are tried in the order in which the client listed them. Each code is first looked up
 * as given and then without its country part.
 *
 * @param	string		$languageCodesList list of language codes. something like 'de,en-us;q=0.9,de-de;q=0.7,es-cl;q=0.6,en;q=0.4,es;q=0.3,zh;q=0.1'
 *			see http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4
 * @return	string		a preferred language that TYPO3 supports, or "default" if none found
 * @author	Benjamin Mack (benni.typo3.org)
 */
public function getPreferredClientLanguage($languageCodesList) {
	$allLanguageCodes = array();
	$selectedLanguage = 'default';

		// get all languages where TYPO3 code is the same as the ISO code
	foreach ($this->charSetArray as $typo3Lang => $charSet) {
		$allLanguageCodes[$typo3Lang] = $typo3Lang;
	}

		// get all languages where TYPO3 code differs from ISO code
		// or needs the country part
		// the iso codes will here overwrite the default typo3 language in the key
	foreach ($this->isoArray as $typo3Lang => $isoLang) {
		$isoLang = join('-', explode('_', $isoLang));
		$allLanguageCodes[$typo3Lang] = $isoLang;
	}

		// move the iso codes to the keys (because we're comparing the keys with "isset" later on)
	$allLanguageCodes = array_flip($allLanguageCodes);

		// collect the quality of each requested language
	$preferredLanguages = t3lib_div::trimExplode(',', $languageCodesList);
	$qualityByLanguage = array();
	foreach ($preferredLanguages as $preferredLanguage) {
		$quality = 1.0;
		if (strpos($preferredLanguage, ';q=') !== FALSE) {
			list($preferredLanguage, $quality) = explode(';q=', $preferredLanguage);
		}
		$qualityByLanguage[$preferredLanguage] = $quality;
	}

		// order by quality (highest first); the position in the list is used as second sort key,
		// because a plain arsort() does not keep the order of equal values before PHP 8
	$languages = array_keys($qualityByLanguage);
	$qualities = array_values($qualityByLanguage);
	$positions = array_keys($languages);
	array_multisort($qualities, SORT_DESC, SORT_NUMERIC, $positions, SORT_ASC, SORT_NUMERIC, $languages);

		// loop through the languages, with the highest priority first
	foreach ($languages as $preferredLanguage) {
		if (isset($allLanguageCodes[$preferredLanguage])) {
			$selectedLanguage = $allLanguageCodes[$preferredLanguage];
			break;
		}

			// strip the country code from the end
		$codeParts = explode('-', $preferredLanguage);
		$languageOnly = $codeParts[0];
		if (isset($allLanguageCodes[$languageOnly])) {
			$selectedLanguage = $allLanguageCodes[$languageOnly];
			break;
		}
	}

		// English is the "default" language
	if (!$selectedLanguage || $selectedLanguage == 'en') {
		$selectedLanguage = 'default';
	}
	return $selectedLanguage;
}
1732
1733
1734 /********************************************
1735 *
1736 * Internal string operation functions
1737 *
1738 ********************************************/
1739
/**
 * Maps all characters of a string in a single byte charset.
 * Bytes without an entry in the mapping table are copied unchanged. The string is returned
 * untouched if the mode is unknown or the table for the charset cannot be initialized.
 *
 * @param	string		the string
 * @param	string		the charset
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function sb_char_mapping($str, $charset, $mode, $opt = '') {
	switch ($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) {
				return $str;
			} // do nothing
			$map =& $this->caseFolding[$charset][$opt];
			break;

		case 'ascii':
			if (!$this->initToASCII($charset)) {
				return $str;
			} // do nothing
			$map =& $this->toASCII[$charset];
			break;

		default:
			return $str;
	}

		// Walk the string byte by byte; the length is taken once instead of probing offsets beyond the end
	$str = (string) $str;
	$byteCount = strlen($str);
	$out = '';
	for ($i = 0; $i < $byteCount; $i++) {
		$c = $str[$i];
		$out .= isset($map[$c]) ? $map[$c] : $c;
	}

	return $out;
}
1782
1783
1784 /********************************************
1785 *
1786 * Internal UTF-8 string operation functions
1787 *
1788 ********************************************/
1789
/**
 * Returns a part of a UTF-8 string.
 * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
 *
 * @param	string		UTF-8 string
 * @param	integer		Start position (character position)
 * @param	integer		Length (in characters)
 * @return	string		The substring; FALSE if a positive $start is outside the string
 * @see substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_substr($str, $start, $len = NULL) {
		// A length of 0 (integer or string) always gives an empty string
	if ((string) $len === '0') {
		return '';
	}

	$byte_start = $this->utf8_char2byte_pos($str, $start);
	if ($byte_start === FALSE) {
		if ($start > 0) {
			return FALSE; // $start outside string length
		}
			// a start position which is not positive and cannot be resolved means "from the beginning"
		$byte_start = 0;
	}

	$str = substr($str, $byte_start);

	if ($len != NULL) {
		$byte_end = $this->utf8_char2byte_pos($str, $len);
		if ($byte_end === FALSE) {
				// $len outside actual string length:
				// When length is less than zero and exceeds, then we return blank string.
			return $len < 0 ? '' : $str;
		}
		return substr($str, 0, $byte_end);
	}

	return $str;
}
1832
/**
 * Counts the number of characters of a string in UTF-8.
 * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
 *
 * Every byte that is not a continuation byte (10xxxxxx) starts a character and is counted:
 * single-byte characters (0xxxxxxx) and the starting bytes of multi-byte sequences (11xxxxxx).
 *
 * @param	string		UTF-8 multibyte character string
 * @return	integer		The number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strlen($str) {
	$str = (string) $str;
	$byteCount = strlen($str);

	$n = 0;
	for ($i = 0; $i < $byteCount; $i++) {
		if ((ord($str[$i]) & 0xC0) != 0x80) {
			$n++;
		}
	}
	return $n;
}
1857
/**
 * Truncates a string in UTF-8 short at a given byte length.
 * If the cut would fall inside a multi-byte character, that character is dropped completely.
 * A character that ends exactly at the byte length is kept, also if it is the first character
 * of the string.
 *
 * If $len is 0 or less, or larger than the string, the plain result of substr($str, 0, $len)
 * is returned (as before).
 *
 * @param	string		UTF-8 multibyte character string
 * @param	integer		the byte length
 * @return	string		the shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strtrunc($str, $len) {
		// No byte at position $len - 1 to inspect: nothing can be split
	if ($len <= 0 || $len > strlen($str)) {
		return substr($str, 0, $len);
	}

	$i = $len - 1;
	if (ord($str[$i]) & 0x80) { // part of a multibyte sequence
			// find the first byte
		while ($i > 0 && !(ord($str[$i]) & 0x40)) {
			$i--;
		}
			// sanity check: arrived at the start of the string without finding a starting byte
		if ($i <= 0 && (ord($str[0]) & 0xC0) != 0xC0) {
			return '';
		}
			// calculate number of bytes from the number of leading set bits of the starting byte
		for ($bc = 0, $mbs = ord($str[$i]); $mbs & 0x80; $mbs = $mbs << 1) {
			$bc++;
		}
		if ($bc + $i > $len) {
				// the character does not fit, cut in front of it
			return substr($str, 0, $i);
		}
			// fallthru: multibyte char fits into length
	}
	return substr($str, 0, $len);
}
1888
/**
 * Find position of first occurrence of a string, both arguments are in UTF-8.
 *
 * Delegates to mb_strpos() or iconv_strpos() when
 * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] selects 'mbstring' or
 * 'iconv'. Otherwise the search runs on bytes and the positions are converted
 * with utf8_char2byte_pos() and utf8_byte2char_pos().
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 string to search for
 * @param integer $offset Position (in characters) to start the search
 * @return integer The character position, FALSE if the needle was not found or the offset could not be resolved
 * @see strpos()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strpos($haystack, $needle, $offset = 0) {
	switch ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) {
		case 'mbstring':
			return mb_strpos($haystack, $needle, $offset, 'utf-8');
		case 'iconv':
			return iconv_strpos($haystack, $needle, $offset, 'utf-8');
	}

		// Fallback: translate the character offset to bytes, search, translate back.
	$startByte = $this->utf8_char2byte_pos($haystack, $offset);
	if ($startByte !== FALSE) {
		$matchByte = strpos($haystack, $needle, $startByte);
		if ($matchByte !== FALSE) {
			return $this->utf8_byte2char_pos($haystack, $matchByte);
		}
	}

		// offset beyond string length or needle not found
	return FALSE;
}
1918
/**
 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
 *
 * Delegates to mb_strrpos() or iconv_strrpos() when
 * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] selects 'mbstring' or
 * 'iconv'. Otherwise strrpos() is run on bytes and the byte position is
 * converted with utf8_byte2char_pos().
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 character to search for (single character)
 * @return integer The character position, FALSE if the needle was not found
 * @see strrpos()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strrpos($haystack, $needle) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
			// The third argument of mb_strrpos() is the offset; the encoding goes into the fourth one.
		return mb_strrpos($haystack, $needle, 0, 'utf-8');
	} elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
		return iconv_strrpos($haystack, $needle, 'utf-8');
	}

	$byte_pos = strrpos($haystack, $needle);
	if ($byte_pos === FALSE) {
			// needle not found
		return FALSE;
	}

	return $this->utf8_byte2char_pos($haystack, $byte_pos);
}
1942
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * For $pos >= 0 the string is walked from the front and the byte offset of
 * the character with index $pos is returned. For $pos < 0 the string is walked
 * from the end and the byte offset of the abs($pos)-th character counted from
 * the end is returned.
 *
 * FALSE is returned when the walk leaves the string. As in the previous
 * implementation this includes a negative position that resolves to the very
 * first byte (e.g. -3 on 'abc'); utf8_substr() treats FALSE for a
 * non-positive start as "begin at byte 0".
 *
 * @param string $str UTF-8 string
 * @param integer $pos Character position (negative values start from the end)
 * @return integer Byte position, FALSE if the position is outside the string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char2byte_pos($str, $pos) {
	$length = strlen($str);
	$n = 0; // number of characters found
	$p = abs($pos); // number of characters wanted

	if ($pos >= 0) {
			// step over $p character start bytes
		for ($i = 0; $i < $length && $n < $p; $i++) {
			$c = ord($str[$i]);
				// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
			if (!($c & 0x80) || ($c & 0xC0) == 0xC0) {
				$n++;
			}
		}
		if ($i >= $length) {
				// offset beyond string length
			return FALSE;
		}
			// skip trailing multi-byte data bytes (10xxxxxx) of the last character passed
		while ($i < $length && (ord($str[$i]) & 0xC0) == 0x80) {
			$i++;
		}
		return $i;
	}

		// Negative position: walk backwards. The index is checked explicitly because
		// negative string offsets address bytes from the end in newer PHP versions.
	for ($i = $length - 1; $i >= 0 && $n < $p; $i--) {
		$c = ord($str[$i]);
		if (!($c & 0x80) || ($c & 0xC0) == 0xC0) {
			$n++;
		}
	}
	if ($i < 0) {
			// offset beyond string length
		return FALSE;
	}

		// correct offset: the loop stepped one byte in front of the start byte found
	return $i + 1;
}
1991
/**
 * Translates an 'absolute' byte position into a character position.
 *
 * Counts the character start bytes located at the byte offsets 1 to $pos.
 * If $pos points to the start byte of a character, that count is the
 * zero-based index of the character. A position directly behind the last
 * byte is accepted and counted as one further character position.
 *
 * @param string $str UTF-8 string
 * @param integer $pos Byte position
 * @return integer Character position, FALSE for an empty string or a position outside the string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_byte2char_pos($str, $pos) {
	$length = strlen($str);
		// The range is validated up front on $pos itself; bytes outside the string are never read.
	if ($length == 0 || $pos < 0 || $pos > $length) {
		return FALSE;
	}

	$n = 0; // number of characters
	$i = $pos;
	if ($i >= $length) {
			// position behind the last byte: there is no byte to inspect, count it as one step
		$n++;
		$i--;
	}
	for (; $i > 0; $i--) {
		$c = ord($str[$i]);
			// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		if (!($c & 0x80) || ($c & 0xC0) == 0xC0) {
			$n++;
		}
	}

	return $n;
}
2020
/**
 * Maps all characters of an UTF-8 string.
 *
 * The string is split into characters; every character that has an entry in
 * the selected mapping table is replaced by that entry, all others are copied
 * unchanged. The string is returned untouched if initUnicodeData() reports
 * failure for $mode or if $mode is unknown.
 *
 * @param string $str UTF-8 string
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt For mode 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char_mapping($str, $mode, $opt = '') {
	if (!$this->initUnicodeData($mode)) {
			// do nothing
		return $str;
	}

	switch ($mode) {
		case 'case':
			$map =& $this->caseFolding['utf-8'][$opt];
			break;

		case 'ascii':
			$map =& $this->toASCII['utf-8'];
			break;

		default:
			return $str;
	}

	$out = '';
	$length = strlen($str);
	for ($i = 0; $i < $length; $i++) {
		$c = ord($str[$i]);
		if (($c & 0xC0) == 0xC0) { // multi-byte starting byte (11xxxxxx)
				// the number of leading 1-bits is the byte count of the sequence
			for ($bc = 0; $c & 0x80; $c = $c << 1) {
				$bc++;
			}
			$mbc = substr($str, $i, $bc);
			$i += $bc - 1;
		} else {
				// Single-byte char (0xxxxxxx). A stray continuation byte (10xxxxxx) is also
				// taken as it is, so it can never re-emit the character of the previous round.
			$mbc = $str[$i];
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
2072
2073
2074 /********************************************
2075 *
2076 * Internal EUC string operation functions
2077 *
2078 * Extended Unix Code:
2079 * ASCII compatible 7bit single bytes chars
2080 * 8bit two byte chars
2081 *
2082 * Shift-JIS is treated as a special case.
2083 *
2084 ********************************************/
2085
/**
 * Cuts a string in the EUC charset family short at a given byte length.
 *
 * The string is scanned from the front, treating every byte >= 0x80 as the
 * first byte of a double-byte character (for 'shift_jis': every byte in
 * 0x80-0x9F or >= 0xE0). If a double-byte character would be split by the
 * limit, the string is cut in front of that character.
 *
 * @param string $str EUC multibyte character string
 * @param integer $len The byte length; values of 0 or less yield an empty string
 * @param string $charset The charset
 * @return string The shortened string
 * @see mb_strcut()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strtrunc($str, $len, $charset) {
	if ($len <= 0) {
		return '';
	}

	$sjis = ($charset == 'shift_jis');
	$length = strlen($str);
	for ($i = 0; $i < $length && $i < $len; $i++) {
		$c = ord($str[$i]);
		$isDoubleByte = $sjis ? (($c >= 0x80 && $c < 0xA0) || $c >= 0xE0) : ($c >= 0x80);
		if ($isDoubleByte) {
				// advance a double-byte char
			$i++;
		}
	}

		// This test has to come first: a double-byte char that starts on the last allowed
		// byte must be dropped even if it is the last character of the string.
	if ($i > $len) {
			// we ended on a first byte
		return substr($str, 0, $len - 1);
	}
	if ($i >= $length) {
			// string not longer than supplied length
		return $str;
	}
	return substr($str, 0, $len);
}
2121
/**
 * Returns a part of a string in the EUC charset family.
 *
 * Character positions are translated to byte positions with
 * euc_char2byte_pos(). Positions that cannot be resolved are handled the same
 * way as in utf8_substr(): a positive start outside the string gives FALSE, a
 * non-positive one begins at the first byte; a length that exceeds the string
 * gives the rest of the string, or an empty string if the length is negative.
 *
 * @param string $str EUC multibyte character string
 * @param integer $start Start position (character position)
 * @param string $charset The charset
 * @param integer $len Length (in characters); NULL returns everything from $start on
 * @return string The substring, FALSE if $start lies behind the end of the string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_substr($str, $start, $charset, $len = NULL) {
	$byte_start = $this->euc_char2byte_pos($str, $start, $charset);
	if ($byte_start === FALSE) {
		if ($start > 0) {
				// $start outside string length
			return FALSE;
		}
			// a negative start reaching the beginning of the string (or before it) begins at byte 0
		$byte_start = 0;
	}

	$str = substr($str, $byte_start);

		// Only NULL means "no length given"; an explicit length of 0 results in an empty string.
	if ($len === NULL) {
		return $str;
	}

	$byte_end = $this->euc_char2byte_pos($str, $len, $charset);
	if ($byte_end === FALSE) {
			// $len outside actual string length: a negative length that exceeds the string leaves nothing
		return $len < 0 ? '' : $str;
	}

	return substr($str, 0, $byte_end);
}
2155
/**
 * Counts the number of characters of a string in the EUC charset family.
 *
 * Every byte >= 0x80 (for 'shift_jis': every byte in 0x80-0x9F or >= 0xE0)
 * is taken as the first byte of a double-byte character; all other bytes
 * count as one character each.
 *
 * @param string $str EUC multibyte character string
 * @param string $charset The charset
 * @return integer The number of characters
 * @see strlen()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strlen($str, $charset) {
	$sjis = ($charset == 'shift_jis');
	$n = 0;
	$length = strlen($str);
		// Iterate with an explicit upper bound instead of probing offsets past the end of the string.
	for ($i = 0; $i < $length; $i++) {
		$c = ord($str[$i]);
		$isDoubleByte = $sjis ? (($c >= 0x80 && $c < 0xA0) || $c >= 0xE0) : ($c >= 0x80);
		if ($isDoubleByte) {
				// advance a double-byte char
			$i++;
		}

		$n++;
	}

	return $n;
}
2187 /**
2188 * Translates a character position into an 'absolute' byte position.
2189 *
2190 * @param string EUC multibyte character string
2191 * @param integer character position (negative values start from the end)
2192 * @param string the charset
2193 * @return integer byte position
2194 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
2195 */
2196 function euc_char2byte_pos($str, $pos, $charset) {
2197 $sjis = ($charset == 'shift_jis');
2198 $n = 0; // number of characters seen
2199 $p = abs($pos); // number of characters wanted
2200
2201 if ($pos >= 0) {
2202 $i = 0;
2203 $d = 1;
2204 } else {
2205 $i = strlen($str) - 1;
2206 $d = -1;
2207 }
2208
2209 for (; strlen($str{$i}) && $n < $p; $i += $d) {
2210 $c = ord($str{$i});
2211 if ($sjis) {
2212 if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) {
2213 $i += $d;
2214 } // advance a double-byte char
2215 }
2216 else {
2217 if ($c >= 0x80) {
2218 $i += $d;
2219 } // advance a double-byte char
2220 }
2221
2222 $n++;
2223 }
2224 if (!strlen($str{$i})) {
2225 return FALSE;
2226 } // offset beyond string length
2227
2228 if ($pos < 0) {
2229 $i++;
2230 } // correct offset
2231
2232 return $i;
2233 }
2234
2235 /**
2236 * Maps all characters of a string in the EUC charset family.
2237 *
2238 * @param string EUC multibyte character string
2239 * @param string the charset
2240 * @param string mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
2241 * @param string 'case': conversion 'toLower' or 'toUpper'
2242 * @return string the converted string
2243 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
2244 */
2245 function euc_char_mapping($str, $charset, $mode, $opt = '') {
2246 switch ($mode) {
2247 case 'case':
2248 if (!$this->initCaseFolding($charset)) {
2249 return $str;
2250 } // do nothing
2251 $map =& $this->caseFolding[$charset][$opt];
2252 break;
2253
2254 case 'ascii':
2255 if (!$this->initToASCII($charset)) {
2256 return $str;
2257 } // do nothing
2258 $map =& $this->toASCII[$charset];
2259 break;
2260
2261 default:
2262 return $str;
2263 }
2264
2265 $sjis = ($charset == 'shift_jis');
2266 $out = '';
2267 for ($i = 0; strlen($str{$i}); $i++) {
2268 $mbc = $str{$i};
2269 $c = ord($mbc);
2270
2271 if ($sjis) {
2272 if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) { // a double-byte char
2273 $mbc = substr($str, $i, 2);
2274 $i++;
2275 }
2276 }
2277 else {
2278 if ($c >= 0x80) { // a double-byte char
2279 $mbc = substr($str, $i, 2);
2280 $i++;
2281 }
2282 }
2283
2284 if (isset($map[$mbc])) {
2285 $out .= $map[$mbc];
2286 } else {
2287 $out .= $mbc;
2288 }
2289 }
2290
2291 return $out;
2292 }
2293
2294 }
2295
2296 if (defined('TYPO3_MODE') && isset($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php'])) {
2297 include_once($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
2298 }
2299
2300 ?>