[TASK] Replace Space Indent into Tab indent (CGL Cleanup)
[Packages/TYPO3.CMS.git] / t3lib / class.t3lib_cs.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2003-2011 Kasper Skårhøj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the Typo3 project. The Typo3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 *
17 * This script is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * This copyright notice MUST APPEAR in all copies of the script!
23 ***************************************************************/
24 /**
25 * Class for conversion between charsets.
26 *
27 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
28 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
29 */
30
31
32 /**
33 * Notes on UTF-8
34 *
35 * Functions working on UTF-8 strings:
36 *
37 * - strchr/strstr
38 * - strrchr
39 * - substr_count
40 * - implode/explode/join
41 *
42 * Functions nearly working on UTF-8 strings:
43 *
44 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
45 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
46 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
47 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
 * - preg_*: UTF-8 support is compiled into PHP by default nowadays, but could be unavailable; the "u" pattern modifier must be used
49 *
50 * Functions NOT working on UTF-8 strings:
51 *
52 * - str*cmp
53 * - stristr
54 * - stripos
55 * - substr
56 * - strrev
57 * - split/spliti
58 * - ...
59 *
60 */
61 /**
62 * Class for conversion between charsets
63 *
64 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
65 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
66 * @package TYPO3
67 * @subpackage t3lib
68 */
69 class t3lib_cs {
70
/**
 * Locales helper; assigned in the constructor.
 *
 * @var t3lib_l10n_Locales
 */
protected $locales;

/**
 * Byte value written for characters that have no equivalent in the
 * target charset (63 is the ASCII code of "?").
 */
var $noCharByteVal = 63; // ASCII Value for chars with no equivalent.

// Cache of parsed conversion tables, keyed by charset name. The conversion
// functions read a 'local' map (local char number => UTF-8 sequence) and a
// 'utf8' map (UTF-8 sequence => local char number) from each entry.
var $parsedCharsets = array();

// An array where case folding data will be stored (cached)
var $caseFolding = array();

// An array where charset-to-ASCII mappings are stored (cached)
var $toASCII = array();

// Charsets that use two bytes per character; utf8_encode() reads such
// input in big-endian 16-bit units.
var $twoByteSets = array(
	'ucs-2' => 1, // 2-byte Unicode
);

// This tells the converter which charsets have four bytes per char:
var $fourByteSets = array(
	'ucs-4' => 1, // 4-byte Unicode
	'utf-32' => 1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
);

// Charsets that use a scheme like the Extended Unix Code: bytes above 127
// start a two-byte character (utf8_encode() special-cases shift_jis).
var $eucBasedSets = array(
	'gb2312' => 1, // Chinese, simplified.
	'big5' => 1, // Chinese, traditional.
	'euc-kr' => 1, // Korean
	'shift_jis' => 1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
);
105
// Charset aliases mapped to the canonical (lowercase) name used by this class;
// parse_charset() resolves its input through this table.
// see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
// http://czyborra.com/charsets/iso8859.html
var $synonyms = array(
	'us' => 'ascii',
	'us-ascii' => 'ascii',
	'cp819' => 'iso-8859-1',
	'ibm819' => 'iso-8859-1',
	'iso-ir-100' => 'iso-8859-1',
	'iso-ir-101' => 'iso-8859-2',
	'iso-ir-109' => 'iso-8859-3',
	'iso-ir-110' => 'iso-8859-4',
	'iso-ir-144' => 'iso-8859-5',
	'iso-ir-127' => 'iso-8859-6',
	'iso-ir-126' => 'iso-8859-7',
	'iso-ir-138' => 'iso-8859-8',
	'iso-ir-148' => 'iso-8859-9',
	'iso-ir-157' => 'iso-8859-10',
	'iso-ir-179' => 'iso-8859-13',
	'iso-ir-199' => 'iso-8859-14',
	'iso-ir-203' => 'iso-8859-15',
	'csisolatin1' => 'iso-8859-1',
	'csisolatin2' => 'iso-8859-2',
	'csisolatin3' => 'iso-8859-3',
	'csisolatin5' => 'iso-8859-9',
	'csisolatin8' => 'iso-8859-14',
	'csisolatin9' => 'iso-8859-15',
	'csisolatingreek' => 'iso-8859-7',
	'iso-celtic' => 'iso-8859-14',
	'latin1' => 'iso-8859-1',
	'latin2' => 'iso-8859-2',
	'latin3' => 'iso-8859-3',
	'latin5' => 'iso-8859-9',
	'latin6' => 'iso-8859-10',
	'latin8' => 'iso-8859-14',
	'latin9' => 'iso-8859-15',
	'l1' => 'iso-8859-1',
	'l2' => 'iso-8859-2',
	'l3' => 'iso-8859-3',
	'l5' => 'iso-8859-9',
	'l6' => 'iso-8859-10',
	'l8' => 'iso-8859-14',
	'l9' => 'iso-8859-15',
	'cyrillic' => 'iso-8859-5',
	'arabic' => 'iso-8859-6',
	'tis-620' => 'iso-8859-11',
	'win874' => 'windows-874',
	'win1250' => 'windows-1250',
	'win1251' => 'windows-1251',
	'win1252' => 'windows-1252',
	'win1253' => 'windows-1253',
	'win1254' => 'windows-1254',
	'win1255' => 'windows-1255',
	'win1256' => 'windows-1256',
	'win1257' => 'windows-1257',
	'win1258' => 'windows-1258',
	'cp1250' => 'windows-1250',
	'cp1251' => 'windows-1251',
	'cp1252' => 'windows-1252',
	'ms-ee' => 'windows-1250',
	'ms-ansi' => 'windows-1252',
	'ms-greek' => 'windows-1253',
	'ms-turk' => 'windows-1254',
	'winbaltrim' => 'windows-1257',
	'koi-8ru' => 'koi-8r',
	'koi8r' => 'koi-8r',
	'cp878' => 'koi-8r',
	'mac' => 'macroman',
	'macintosh' => 'macroman',
	'euc-cn' => 'gb2312',
	'x-euc-cn' => 'gb2312',
	'euccn' => 'gb2312',
	'cp936' => 'gb2312',
	'big-5' => 'big5',
	'cp950' => 'big5',
	'eucjp' => 'euc-jp',
	'sjis' => 'shift_jis',
	'shift-jis' => 'shift_jis',
	'cp932' => 'shift_jis',
	'cp949' => 'euc-kr',
	'utf7' => 'utf-7',
	'utf8' => 'utf-8',
	'utf16' => 'utf-16',
	'utf32' => 'utf-32',
	'ucs2' => 'ucs-2',
	'ucs4' => 'ucs-4',
);
193
// Mapping of language identifiers (iso-639-1 codes, MS language codes and
// English language names) to script names. get_locale_charset() looks up the
// language part of a locale here and resolves the resulting script name
// through $script_to_charset_unix / $script_to_charset_windows.
var $lang_to_script = array(
		// iso-639-1 language codes, see http://www.loc.gov/standards/iso639-2/php/code_list.php
	'af' => 'west_european', // Afrikaans
	'ar' => 'arabic',
	'bg' => 'cyrillic', // Bulgarian
	'bs' => 'east_european', // Bosnian
	'cs' => 'east_european', // Czech
	'da' => 'west_european', // Danish
	'de' => 'west_european', // German
	'es' => 'west_european', // Spanish
	'et' => 'estonian',
	'eo' => 'unicode', // Esperanto
	'eu' => 'west_european', // Basque
	'fa' => 'arabic', // Persian
	'fi' => 'west_european', // Finnish
	'fo' => 'west_european', // Faroese
	'fr' => 'west_european', // French
	'ga' => 'west_european', // Irish
	'gl' => 'west_european', // Galician
	'gr' => 'greek',
	'he' => 'hebrew', // Hebrew (since 1998)
	'hi' => 'unicode', // Hindi
	'hr' => 'east_european', // Croatian
	'hu' => 'east_european', // Hungarian
	'iw' => 'hebrew', // Hebrew (til 1998)
	'is' => 'west_european', // Icelandic
	'it' => 'west_european', // Italian
	'ja' => 'japanese',
	'ka' => 'unicode', // Georgian
	'kl' => 'west_european', // Greenlandic
	'km' => 'unicode', // Khmer
	'ko' => 'korean',
	'lt' => 'lithuanian',
	'lv' => 'west_european', // Latvian/Lettish
	'nl' => 'west_european', // Dutch
	'no' => 'west_european', // Norwegian
	'nb' => 'west_european', // Norwegian Bokmal
	'nn' => 'west_european', // Norwegian Nynorsk
	'pl' => 'east_european', // Polish
	'pt' => 'west_european', // Portuguese
	'ro' => 'east_european', // Romanian
	'ru' => 'cyrillic', // Russian
	'sk' => 'east_european', // Slovak
	'sl' => 'east_european', // Slovenian
	'sr' => 'cyrillic', // Serbian
	'sv' => 'west_european', // Swedish
	'sq' => 'albanian', // Albanian
	'th' => 'thai',
	'uk' => 'cyrillic', // Ukrainian
	'vi' => 'vietnamese',
	'zh' => 'chinese',
		// MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
		// http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
	'afk' => 'west_european', // Afrikaans
	'ara' => 'arabic',
	'bgr' => 'cyrillic', // Bulgarian
	'cat' => 'west_european', // Catalan
	'chs' => 'simpl_chinese',
	'cht' => 'trad_chinese',
	'csy' => 'east_european', // Czech
	'dan' => 'west_european', // Danish
	'deu' => 'west_european', // German
	'dea' => 'west_european', // German (Austrian)
	'des' => 'west_european', // German (Swiss)
	'ena' => 'west_european', // English (Australian)
	'enc' => 'west_european', // English (Canadian)
	'eng' => 'west_european', // English
	'enz' => 'west_european', // English (New Zealand)
	'enu' => 'west_european', // English (United States)
	'euq' => 'west_european', // Basque
	'fos' => 'west_european', // Faroese
	'far' => 'arabic', // Persian
	'fin' => 'west_european', // Finnish
	'fra' => 'west_european', // French
	'frb' => 'west_european', // French (Belgian)
	'frc' => 'west_european', // French (Canadian)
	'frs' => 'west_european', // French (Swiss)
	'geo' => 'unicode', // Georgian
	'glg' => 'west_european', // Galician
	'ell' => 'greek',
	'heb' => 'hebrew',
	'hin' => 'unicode', // Hindi
	'hun' => 'east_european', // Hungarian
	'isl' => 'west_european', // Icelandic (value was misspelled and never resolved to a charset)
	'ita' => 'west_european', // Italian
	'its' => 'west_european', // Italian (Swiss)
	'jpn' => 'japanese',
	'khm' => 'unicode', // Khmer
	'kor' => 'korean',
	'lth' => 'lithuanian',
	'lvi' => 'west_european', // Latvian/Lettish
	'msl' => 'west_european', // Malay
	'nlb' => 'west_european', // Dutch (Belgian)
	'nld' => 'west_european', // Dutch
	'nor' => 'west_european', // Norwegian (bokmal)
	'non' => 'west_european', // Norwegian (nynorsk)
	'plk' => 'east_european', // Polish
	'ptg' => 'west_european', // Portuguese
	'ptb' => 'west_european', // Portuguese (Brazil)
	'rom' => 'east_european', // Romanian
	'rus' => 'cyrillic', // Russian
	'slv' => 'east_european', // Slovenian
	'sky' => 'east_european', // Slovak
	'srl' => 'east_european', // Serbian (Latin)
	'srb' => 'cyrillic', // Serbian (Cyrillic)
	'esp' => 'west_european', // Spanish (trad. sort)
	'esm' => 'west_european', // Spanish (Mexican)
	'esn' => 'west_european', // Spanish (internat. sort)
	'sve' => 'west_european', // Swedish
	'sqi' => 'albanian', // Albanian
	'tha' => 'thai',
	'trk' => 'turkish',
	'ukr' => 'cyrillic', // Ukrainian
		// English language names
	'afrikaans' => 'west_european',
	'albanian' => 'albanian',
	'arabic' => 'arabic',
	'basque' => 'west_european',
	'bosnian' => 'east_european',
	'bulgarian' => 'cyrillic', // consistent with 'bg' and 'bgr' above
	'catalan' => 'west_european',
	'croatian' => 'east_european',
	'czech' => 'east_european',
	'danish' => 'west_european',
	'dutch' => 'west_european',
	'english' => 'west_european',
	'esperanto' => 'unicode',
	'estonian' => 'estonian',
	'faroese' => 'west_european',
	'farsi' => 'arabic',
	'finnish' => 'west_european',
	'french' => 'west_european',
	'galician' => 'west_european',
	'georgian' => 'unicode',
	'german' => 'west_european',
	'greek' => 'greek',
	'greenlandic' => 'west_european',
	'hebrew' => 'hebrew',
	'hindi' => 'unicode',
	'hungarian' => 'east_european',
	'icelandic' => 'west_european',
	'italian' => 'west_european',
	'khmer' => 'unicode',
	'latvian' => 'west_european',
	'lettish' => 'west_european',
	'lithuanian' => 'lithuanian',
	'malay' => 'west_european',
	'norwegian' => 'west_european',
	'persian' => 'arabic',
	'polish' => 'east_european',
	'portuguese' => 'west_european',
	'russian' => 'cyrillic',
	'romanian' => 'east_european',
	'serbian' => 'cyrillic',
	'slovak' => 'east_european',
	'slovenian' => 'east_european',
	'spanish' => 'west_european',
	'swedish' => 'west_european',
	'svedish' => 'west_european', // legacy misspelling of 'swedish', kept for backward compatibility
	'thai' => 'thai',
	'that' => 'thai', // legacy misspelling of 'thai', kept for backward compatibility
	'turkish' => 'turkish',
	'ukrainian' => 'cyrillic',
);
357
// Mapping of language (family) / script names to charsets on Unix.
// get_locale_charset() uses this map when TYPO3_OS is not 'WIN'; a missing
// or empty entry (e.g. 'vietnamese') makes it fall back to 'utf-8'.
var $script_to_charset_unix = array(
	'west_european' => 'iso-8859-1',
	'estonian' => 'iso-8859-1',
	'east_european' => 'iso-8859-2',
	'baltic' => 'iso-8859-4',
	'cyrillic' => 'iso-8859-5',
	'arabic' => 'iso-8859-6',
	'greek' => 'iso-8859-7',
	'hebrew' => 'iso-8859-8',
	'turkish' => 'iso-8859-9',
	'thai' => 'iso-8859-11', // = TIS-620
	'lithuanian' => 'iso-8859-13',
	'chinese' => 'gb2312', // = euc-cn
	'japanese' => 'euc-jp',
	'korean' => 'euc-kr',
	'simpl_chinese' => 'gb2312',
	'trad_chinese' => 'big5',
	'vietnamese' => '',
	'unicode' => 'utf-8',
	'albanian' => 'utf-8'
);

// Mapping of language (family) / script names to charsets on Windows.
// get_locale_charset() uses this map when TYPO3_OS is 'WIN'; a missing or
// empty entry makes it fall back to 'windows-1252'.
var $script_to_charset_windows = array(
	'east_european' => 'windows-1250',
	'cyrillic' => 'windows-1251',
	'west_european' => 'windows-1252',
	'greek' => 'windows-1253',
	'turkish' => 'windows-1254',
	'hebrew' => 'windows-1255',
	'arabic' => 'windows-1256',
	'baltic' => 'windows-1257',
	'estonian' => 'windows-1257',
	'lithuanian' => 'windows-1257',
	'vietnamese' => 'windows-1258',
	'thai' => 'cp874',
	'korean' => 'cp949',
	'chinese' => 'gb2312',
	'japanese' => 'shift_jis',
	'simpl_chinese' => 'gb2312',
	'trad_chinese' => 'big5',
	'albanian' => 'windows-1250',
	'unicode' => 'utf-8'
);
403
// Mapping of complete locale names to charsets.
// Keys MUST be lowercase: get_locale_charset() lowercases the locale string
// before looking it up here, so a key containing uppercase letters can never match.
var $locale_to_charset = array(
	'japanese.euc' => 'euc-jp',
	'ja_jp.ujis' => 'euc-jp',
	'korean.euc' => 'euc-kr',
	'sr@latn' => 'iso-8859-2',
	'zh_cn' => 'gb2312',
	'zh_hk' => 'big5',
	'zh_tw' => 'big5',
);
414
// TYPO3 specific: Array with the system charsets used for each system language in TYPO3
// (keys are TYPO3 language keys, values are charset names).
// An empty value means "iso-8859-1".
var $charSetArray = array(
	'af' => '',
	'ar' => 'iso-8859-6',
	'ba' => 'iso-8859-2',
	'bg' => 'windows-1251',
	'br' => '',
	'ca' => 'iso-8859-15',
	'ch' => 'gb2312',
	'cs' => 'windows-1250',
	'cz' => 'windows-1250',
	'da' => '',
	'de' => '',
	'dk' => '',
	'el' => 'iso-8859-7',
	'eo' => 'utf-8',
	'es' => '',
	'et' => 'iso-8859-4',
	'eu' => '',
	'fa' => 'utf-8',
	'fi' => '',
	'fo' => 'utf-8',
	'fr' => '',
	'fr_CA' => '',
	'ga' => '',
	'ge' => 'utf-8',
	'gl' => '',
	'gr' => 'iso-8859-7',
	'he' => 'utf-8',
	'hi' => 'utf-8',
	'hk' => 'big5',
	'hr' => 'windows-1250',
	'hu' => 'iso-8859-2',
	'is' => 'utf-8',
	'it' => '',
	'ja' => 'shift_jis',
	'jp' => 'shift_jis',
	'ka' => 'utf-8',
	'kl' => 'utf-8',
	'km' => 'utf-8',
	'ko' => 'euc-kr',
	'kr' => 'euc-kr',
	'lt' => 'windows-1257',
	'lv' => 'utf-8',
	'ms' => '',
	'my' => '',
	'nl' => '',
	'no' => '',
	'pl' => 'iso-8859-2',
	'pt' => '',
	'pt_BR' => '',
	'qc' => '',
	'ro' => 'iso-8859-2',
	'ru' => 'windows-1251',
	'se' => '',
	'si' => 'windows-1250',
	'sk' => 'windows-1250',
	'sl' => 'windows-1250',
	'sq' => 'utf-8',
	'sr' => 'utf-8',
	'sv' => '',
	'th' => 'iso-8859-11',
	'tr' => 'iso-8859-9',
	'ua' => 'windows-1251',
	'uk' => 'windows-1251',
	'vi' => 'utf-8',
	'vn' => 'utf-8',
	'zh' => 'big5',
);
485
/**
 * Constructor: fetches the locales helper through the TYPO3 object
 * factory and keeps it in $this->locales.
 */
public function __construct() {
	$localesHelper = t3lib_div::makeInstance('t3lib_l10n_Locales');
	$this->locales = $localesHelper;
}
492
/**
 * Normalizes a charset name: lowercases it, strips surrounding whitespace
 * and replaces a known alias (see $this->synonyms) by its canonical name.
 *
 * @param string $charset Input charset
 * @return string Normalized charset
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function parse_charset($charset) {
	$normalized = trim(strtolower($charset));

	return isset($this->synonyms[$normalized])
		? $this->synonyms[$normalized]
		: $normalized;
}
508
/**
 * Get the charset of a locale.
 *
 * ln				language
 * ln_CN			language / country
 * ln_CN.cs			language / country / charset
 * ln_CN.cs@mod		language / country / charset / modifier
 *
 * Resolution order: exact match in $this->locale_to_charset, explicit
 * charset part, the "euro" modifier, and finally the script of the language
 * mapped through the OS specific script-to-charset table. Unknown languages
 * and scripts without a charset fall back to "windows-1252" (Windows) or
 * "utf-8" (everything else).
 *
 * @param string $locale Locale string
 * @return string Charset resolved for locale string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function get_locale_charset($locale) {
	$locale = strtolower($locale);

		// exact locale specific charset?
	if (isset($this->locale_to_charset[$locale])) {
		return $this->locale_to_charset[$locale];
	}

		// get modifier; array_pad() supplies an empty value when the part is
		// absent, so list() never reads an undefined offset
	list($locale, $modifier) = array_pad(explode('@', $locale), 2, '');

		// locale contains charset: use it
	list($locale, $charset) = array_pad(explode('.', $locale), 2, '');
	if ($charset) {
		return $this->parse_charset($charset);
	}

		// modifier is 'euro' (after charset check, because of xx.utf-8@euro)
	if ($modifier === 'euro') {
		return 'iso-8859-15';
	}

		// get language (the country part is not needed for the lookup)
	$localeParts = explode('_', $locale);
	$language = $localeParts[0];
	$script = isset($this->lang_to_script[$language]) ? $this->lang_to_script[$language] : '';

	if (TYPO3_OS == 'WIN') {
		$scriptToCharset = $this->script_to_charset_windows;
		$fallbackCharset = 'windows-1252';
	} else {
		$scriptToCharset = $this->script_to_charset_unix;
		$fallbackCharset = 'utf-8';
	}

		// !empty() covers both an unknown script and a script mapped to ''
	return !empty($scriptToCharset[$script]) ? $scriptToCharset[$script] : $fallbackCharset;
}
557
558
559 /********************************************
560 *
561 * Charset Conversion functions
562 *
563 ********************************************/
564
/**
 * Converts a string from one charset to another.
 *
 * When the target is UTF-8, or no entity fallback is requested, the PHP
 * extension selected by $TYPO3_CONF_VARS['SYS']['t3lib_cs_convMethod']
 * (mbstring, iconv or recode) is tried first. If none is configured or the
 * extension reports failure, the conversion is done with the TYPO3
 * conversion tables, using UTF-8 as the intermediate charset.
 *
 * @param string $str Input string
 * @param string $fromCS From charset (the current charset of the string)
 * @param string $toCS To charset (the output charset wanted)
 * @param boolean $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Converted string
 * @see convArray()
 */
function conv($str, $fromCS, $toCS, $useEntityForNoChar = 0) {
	if ($fromCS == $toCS) {
		return $str;
	}

		// PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
	if ($toCS == 'utf-8' || !$useEntityForNoChar) {
		$method = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'];
		$converted = FALSE;

		if ($method == 'mbstring') {
				// returns FALSE for unsupported charsets
			$converted = mb_convert_encoding($str, $toCS, $fromCS);
		} elseif ($method == 'iconv') {
			$converted = iconv($fromCS, $toCS . '//TRANSLIT', $str);
		} elseif ($method == 'recode') {
			$converted = recode_string($fromCS . '..' . $toCS, $str);
		}

		if ($converted !== FALSE) {
			return $converted;
		}
			// otherwise: fallback to TYPO3 conversion
	}

	$utf8String = ($fromCS != 'utf-8') ? $this->utf8_encode($str, $fromCS) : $str;

	return ($toCS != 'utf-8')
		? $this->utf8_decode($utf8String, $toCS, $useEntityForNoChar)
		: $utf8String;
}
615
/**
 * Converts every string element of an array from one charset to another,
 * descending into nested arrays. Non-string values are left untouched.
 * NOTICE: Array is passed by reference and modified in place!
 *
 * @param array $array Input array, possibly multidimensional
 * @param string $fromCS From charset (the current charset of the string)
 * @param string $toCS To charset (the output charset wanted)
 * @param boolean $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return void
 * @see conv()
 */
function convArray(&$array, $fromCS, $toCS, $useEntityForNoChar = 0) {
	foreach ($array as $index => $unused) {
		if (is_array($array[$index])) {
			$this->convArray($array[$index], $fromCS, $toCS, $useEntityForNoChar);
			continue;
		}

		if (is_string($array[$index])) {
			$array[$index] = $this->conv($array[$index], $fromCS, $toCS, $useEntityForNoChar);
		}
	}
}
636
/**
 * Converts $str from $charset to UTF-8
 *
 * Bytes 0-127 are passed through unchanged. Every other character is looked
 * up in the 'local' part of the parsed conversion table; characters without
 * a table entry are replaced by the byte $this->noCharByteVal.
 *
 * @param string $str String in local charset to convert to UTF-8
 * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return string Output string, converted to UTF-8. NULL if the conversion table for $charset could not be initialized.
 */
function utf8_encode($str, $charset) {

	if ($charset === 'utf-8') {
		return $str;
	}

		// Parse conv. table if not already...
	if ($this->initCharset($charset)) {
		$strLen = strlen($str);
		$outStr = '';

			// Traverse each char in string.
		for ($a = 0; $a < $strLen; $a++) {
			$chr = substr($str, $a, 1);
			$ord = ord($chr);
			if (isset($this->twoByteSets[$charset])) { // If the charset has two bytes per char
					// substr() replaces the former $str{...} offset syntax, which is a parse
					// error as of PHP 8. A dangling last byte is combined with a zero low byte.
				$ord2 = ord(substr($str, $a + 1, 1));
				$ord = $ord << 8 | $ord2; // assume big endian

					// If the local char-number was found in parsed conv. table then we use that, otherwise the "no char" byte
				if (isset($this->parsedCharsets[$charset]['local'][$ord])) {
					$outStr .= $this->parsedCharsets[$charset]['local'][$ord];
				} else {
					$outStr .= chr($this->noCharByteVal);
				}
				$a++;
			} elseif ($ord > 127) { // Byte above 127: not ASCII, needs a table lookup
				if (isset($this->eucBasedSets[$charset])) { // EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
					if ($charset != 'shift_jis' || ($ord < 0xA0 || $ord > 0xDF)) { // Shift-JIS: chars between 160 and 223 are single byte
						$a++;
						$ord2 = ord(substr($str, $a, 1));
						$ord = $ord * 256 + $ord2;
					}
				}

					// If the local char-number was found in parsed conv. table then we use that, otherwise the "no char" byte
				if (isset($this->parsedCharsets[$charset]['local'][$ord])) {
					$outStr .= $this->parsedCharsets[$charset]['local'][$ord];
				} else {
					$outStr .= chr($this->noCharByteVal);
				}
			} else {
					// ... otherwise it's just ASCII 0-127 and one byte. Transparent
				$outStr .= $chr;
			}
		}
		return $outStr;
	}

		// Conversion table not available (this was an implicit NULL return before)
	return NULL;
}
689
/**
 * Converts $str from UTF-8 to $charset
 *
 * ASCII bytes are copied unchanged. Each multibyte UTF-8 sequence is looked
 * up in the 'utf8' part of the parsed conversion table; sequences without a
 * table entry become either a numeric entity or the "no char" byte.
 *
 * @param string $str String in UTF-8 to convert to local charset
 * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param boolean $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Output string, converted to local charset. NULL if the conversion table for $charset could not be initialized.
 */
function utf8_decode($str, $charset, $useEntityForNoChar = 0) {
	if ($charset === 'utf-8') {
		return $str;
	}

		// Parse conv. table if not already; nothing can be converted without it
	if (!$this->initCharset($charset)) {
		return NULL;
	}

	$length = strlen($str);
	$result = '';
	for ($pos = 0; $pos < $length; $pos++) {
		$byte = substr($str, $pos, 1);
		$code = ord($byte);

			// ASCII 0-127 is one byte and passes through transparently
		if ($code <= 127) {
			$result .= $byte;
			continue;
		}

			// A lead byte has bit 6 set; otherwise we are in the MIDDLE of a sequence
		if (!($code & 64)) {
			$result .= chr($this->noCharByteVal);
			continue;
		}

			// Collect one continuation byte for every further high bit set in the lead byte
		$sequence = $byte;
		for ($n = 0; $n < 8 && (($code <<= 1) & 128); $n++) {
			$pos++;
			$sequence .= substr($str, $pos, 1);
		}

		if (isset($this->parsedCharsets[$charset]['utf8'][$sequence])) {
			$localNumber = $this->parsedCharsets[$charset]['utf8'][$sequence];
				// Numbers above 255 are written as two bytes (16 bit word assumed)
			$result .= ($localNumber > 255)
				? chr(($localNumber >> 8) & 255) . chr($localNumber & 255)
				: chr($localNumber);
		} elseif ($useEntityForNoChar) {
			$result .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
		} else {
			$result .= chr($this->noCharByteVal);
		}
	}

	return $result;
}
748
/**
 * Converts all chars > 127 to numeric (hexadecimal) entities.
 * A stray continuation byte is replaced by the "no char" byte.
 *
 * @param string $str Input string
 * @return string Output string
 */
function utf8_to_entities($str) {
	$length = strlen($str);
	$result = '';

	for ($pos = 0; $pos < $length; $pos++) {
		$byte = substr($str, $pos, 1);
		$code = ord($byte);

			// ASCII 0-127 is one byte and passes through transparently
		if ($code <= 127) {
			$result .= $byte;
			continue;
		}

			// A lead byte has bit 6 set; otherwise we are in the MIDDLE of a sequence
		if (!($code & 64)) {
			$result .= chr($this->noCharByteVal);
			continue;
		}

			// Collect one continuation byte for every further high bit set in the lead byte
		$sequence = $byte;
		for ($n = 0; $n < 8 && (($code <<= 1) & 128); $n++) {
			$pos++;
			$sequence .= substr($str, $pos, 1);
		}

		$result .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
	}

	return $result;
}
786
/**
 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
 *
 * Only well-formed numeric entities are converted ("&#" followed by decimal
 * digits, or "&#x" / "&#X" followed by hex digits). Anything else that merely
 * looks like an entity (e.g. "&#;" or "&#zz;") is left in the string untouched.
 *
 * @param string $str Input string, UTF-8
 * @param boolean $alsoStdHtmlEnt If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
 * @return string Output string
 */
function entities_to_utf8($str, $alsoStdHtmlEnt = FALSE) {
	$trans_tbl = array();
	if ($alsoStdHtmlEnt) {
		$trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8'));
	}

		// Split on entity-like tokens. With PREG_SPLIT_DELIM_CAPTURE the captured
		// entity name (without "&" and ";") ends up at every odd index.
	$parts = preg_split('/&([#[:alnum:]]*);/', $str, -1, PREG_SPLIT_DELIM_CAPTURE);
	if ($parts === FALSE) {
			// PCRE failure: return the input unchanged rather than losing it
		return $str;
	}

	foreach ($parts as $k => $v) {
			// only take every second element
		if ($k % 2 === 0) {
			continue;
		}

		$match = array();
		if (preg_match('/^#[xX]([0-9a-fA-F]+)$/', $v, $match)) { // Hex entity
			$parts[$k] = $this->UnumberToChar(hexdec($match[1]));
		} elseif (preg_match('/^#([0-9]+)$/', $v, $match)) { // Dec entity
			$parts[$k] = $this->UnumberToChar((int) $match[1]);
		} elseif ($alsoStdHtmlEnt && isset($trans_tbl['&' . $v . ';'])) { // Other entities:
			$parts[$k] = $trans_tbl['&' . $v . ';'];
		} else { // No conversion: restore the original text
			$parts[$k] = '&' . $v . ';';
		}
	}

	return implode('', $parts);
}
825
/**
 * Splits a UTF-8 string into its characters and returns them as an array,
 * either as UNICODE integer numbers or as the UTF-8 characters themselves.
 * A stray continuation byte yields the "no char" value.
 *
 * @param string $str Input string, UTF-8
 * @param boolean $convEntities If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
 * @param boolean $retChar If set, then instead of integer numbers the real UTF-8 char is returned.
 * @return array Output array with the char numbers
 */
function utf8_to_numberarray($str, $convEntities = 0, $retChar = 0) {
		// If entities must be registered as well, resolve them to UTF-8 first
	if ($convEntities) {
		$str = $this->entities_to_utf8($str, 1);
	}

	$length = strlen($str);
	$result = array();

	for ($pos = 0; $pos < $length; $pos++) {
		$byte = substr($str, $pos, 1);
		$code = ord($byte);

			// ASCII 0-127 is one byte
		if ($code <= 127) {
			$result[] = $retChar ? chr($code) : $code;
			continue;
		}

			// A lead byte has bit 6 set; otherwise we are in the MIDDLE of a sequence
		if (!($code & 64)) {
			$result[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
			continue;
		}

			// Collect one continuation byte for every further high bit set in the lead byte
		$sequence = $byte;
		for ($n = 0; $n < 8 && (($code <<= 1) & 128); $n++) {
			$pos++;
			$sequence .= substr($str, $pos, 1);
		}

		$result[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
	}

	return $result;
}
870
/**
 * Converts a UNICODE number to a UTF-8 multibyte character
 * Algorithm based on script found at From: http://czyborra.com/utf/
 * Unit-tested by Kasper
 *
 * The bits of the number are spread across the bytes; the number of high
 * bits set in the lead byte announces the length of the sequence:
 *
 *  bytes | bits | representation
 *	  1 |	7 | 0vvvvvvv
 *	  2 |   11 | 110vvvvv 10vvvvvv
 *	  3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
 *	  4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 *	  5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *	  6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * Numbers of 0x80000000 and above cannot be expressed and yield the
 * "no char" byte.
 *
 * @param integer $cbyte UNICODE integer
 * @return string UTF-8 multibyte character string
 * @see utf8CharToUnumber()
 */
function UnumberToChar($cbyte) {
	if ($cbyte < 0x80) {
		return chr($cbyte);
	}

	if ($cbyte < 0x800) {
		return chr(0xC0 | ($cbyte >> 6))
			. chr(0x80 | ($cbyte & 0x3F));
	}

	if ($cbyte < 0x10000) {
		return chr(0xE0 | ($cbyte >> 12))
			. chr(0x80 | (($cbyte >> 6) & 0x3F))
			. chr(0x80 | ($cbyte & 0x3F));
	}

	if ($cbyte < 0x200000) {
		return chr(0xF0 | ($cbyte >> 18))
			. chr(0x80 | (($cbyte >> 12) & 0x3F))
			. chr(0x80 | (($cbyte >> 6) & 0x3F))
			. chr(0x80 | ($cbyte & 0x3F));
	}

	if ($cbyte < 0x4000000) {
		return chr(0xF8 | ($cbyte >> 24))
			. chr(0x80 | (($cbyte >> 18) & 0x3F))
			. chr(0x80 | (($cbyte >> 12) & 0x3F))
			. chr(0x80 | (($cbyte >> 6) & 0x3F))
			. chr(0x80 | ($cbyte & 0x3F));
	}

	if ($cbyte < 0x80000000) {
		return chr(0xFC | ($cbyte >> 30))
			. chr(0x80 | (($cbyte >> 24) & 0x3F))
			. chr(0x80 | (($cbyte >> 18) & 0x3F))
			. chr(0x80 | (($cbyte >> 12) & 0x3F))
			. chr(0x80 | (($cbyte >> 6) & 0x3F))
			. chr(0x80 | ($cbyte & 0x3F));
	}

		// Cannot express a 32-bit character in UTF-8
	return chr($this->noCharByteVal);
}
935
/**
 * Converts a UTF-8 Multibyte character to a UNICODE number
 * Unit-tested by Kasper
 *
 * If the first byte is not a multibyte lead byte, its byte value is returned.
 *
 * @param string $str UTF-8 multibyte character string
 * @param boolean $hex If set, then a hex. number is returned (as string, prefixed with "x").
 * @return integer UNICODE integer (or hex string if $hex is set)
 * @see UnumberToChar()
 */
function utf8CharToUnumber($str, $hex = 0) {
	$leadByte = ord(substr($str, 0, 1));

	if (($leadByte & 192) != 192) {
			// Not a multibyte lead byte: the number is the byte value itself
		$number = $leadByte;
	} else {
		$shifted = $leadByte;
		$trailingBits = '';
		$trailCount = 0;

			// Every further high bit in the lead byte announces one continuation
			// byte, which contributes its lower six bits
		while ($trailCount < 8) {
			$shifted = $shifted << 1;
			if (!($shifted & 128)) {
				break;
			}
			$trailingBits .= substr('00000000' . decbin(ord(substr($str, $trailCount + 1, 1))), -6);
			$trailCount++;
		}

			// The lead byte contributes its lowest (6 - number of continuation bytes) bits
		$leadBits = substr('00000000' . decbin($leadByte), -(6 - $trailCount));
		$number = bindec($leadBits . $trailingBits);
	}

	return $hex ? 'x' . dechex($number) : $number;
}
967
968
969 /********************************************
970 *
971 * Init functions
972 *
973 ********************************************/
974
975 /**
976 * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
977 * This function is automatically called by the conversion functions
978 *
979 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
980 *
981 * @param string The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
982 * @return integer Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed.
983 * @access private
984 */
function initCharset($charset) {
	// Already parsed during this request: nothing to do.
	if (is_array($this->parsedCharsets[$charset])) {
		return 1;
	}

	// The conversion table must exist as [charset].tbl below csconvtbl/.
	$tableFile = PATH_t3lib . 'csconvtbl/' . $charset . '.tbl';
	if (!($charset && t3lib_div::validPathStr($tableFile) && @is_file($tableFile))) {
		return FALSE;
	}

	// Cache file for charsets:
	// Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_' . $charset . '.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->parsedCharsets[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

	// No cache yet: parse the conversion table line by line.
	$tableLines = t3lib_div::trimExplode(LF, t3lib_div::getUrl($tableFile), 1);
	$this->parsedCharsets[$charset] = array('local' => array(), 'utf8' => array());

	// The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
	$whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
	$tableFormat = '';

	foreach ($tableLines as $tableLine) {
		// Comment lines and blanks are ignored.
		if (!trim($tableLine) || substr($tableLine, 0, 1) == '#') {
			continue;
		}

		// The format is detected once, on the first real line.
		if (!$tableFormat) {
			$tableFormat = preg_match($whitespacedPattern, $tableLine) ? 'whitespaced' : 'ms-token';
		}

		if ($tableFormat == 'ms-token') {
			list($hexbyte, $utf8) = preg_split('/[=:]/', $tableLine, 3);
		} elseif ($tableFormat == 'whitespaced') {
			$regA = array();
			preg_match($whitespacedPattern, $tableLine, $regA);
			$hexbyte = $regA[1];
			$utf8 = 'U+' . $regA[2];
		}

		// Only values above 127 are stored, in both lookup directions.
		$localValue = hexdec(trim($hexbyte));
		if ($localValue > 127) {
			$utf8Char = $this->UnumberToChar(hexdec(substr(trim($utf8), 2)));
			$this->parsedCharsets[$charset]['local'][$localValue] = $utf8Char;
			$this->parsedCharsets[$charset]['utf8'][$utf8Char] = $localValue;
		}
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->parsedCharsets[$charset]));
	}

	return 2;
}
1043
1044 /**
1045 * This function initializes all UTF-8 character data tables.
1046 *
1047 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
1048 *
1049 * @param string Mode ("case", "ascii", ...)
1050 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
1051 * @access private
1052 */
function initUnicodeData($mode = NULL) {
	// Builds $this->caseFolding['utf-8'] and $this->toASCII['utf-8'] from the
	// files below PATH_t3lib . 'unidata/' and writes both tables to their cache
	// files. For $mode 'case' / 'ascii' an already loaded or cached table is
	// used instead (return values 1 and 2); a full parse returns 3.

	// cache files
	$cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
	$cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');

	// Only process if the tables are not yet loaded
	switch ($mode) {
		case 'case':
			if (is_array($this->caseFolding['utf-8'])) {
				return 1;
			}

			// Use cached version if possible
			if ($cacheFileCase && @is_file($cacheFileCase)) {
				$this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase));
				return 2;
			}
			break;

		case 'ascii':
			if (is_array($this->toASCII['utf-8'])) {
				return 1;
			}

			// Use cached version if possible
			if ($cacheFileASCII && @is_file($cacheFileASCII)) {
				$this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII));
				return 2;
			}
			break;
	}

	// process main Unicode data file
	$unicodeDataFile = PATH_t3lib . 'unidata/UnicodeData.txt';
	if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
		return FALSE;
	}

	$fh = fopen($unicodeDataFile, 'rb');
	if (!$fh) {
		return FALSE;
	}

	// key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
	// note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
	$this->caseFolding['utf-8'] = array();
	$utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand
	$utf8CaseFolding['toUpper'] = array();
	$utf8CaseFolding['toLower'] = array();
	$utf8CaseFolding['toTitle'] = array();

	$decomposition = array(); // array of temp. decompositions
	$mark = array(); // array of chars that are marks (eg. composing accents)
	$number = array(); // array of chars that are numbers (eg. digits)
	$omit = array(); // array of chars to be omitted (eg. Russian hard sign)

	while (!feof($fh)) {
		$line = fgets($fh, 4096);
		if ($line === FALSE) {
			// nothing more to read (or read error): stop instead of parsing FALSE
			break;
		}

		// has a lot of info; pad so that short or blank lines do not read undefined offsets
		list($char, $name, $cat, , , $decomp, , , $num, , , , $upper, $lower, $title) = array_pad(explode(';', rtrim($line)), 15, '');

		$ord = hexdec($char);
		if ($ord > 0xFFFF) {
			// only process the BMP
			break;
		}

		$utf8_char = $this->UnumberToChar($ord);

		if ($upper) {
			$utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
		}
		if ($lower) {
			$utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
		}
		// store "title" only when different from "upper" (only a few)
		if ($title && $title != $upper) {
			$utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
		}

		// substr() instead of the removed curly-brace offset syntax ($cat{0})
		switch (substr($cat, 0, 1)) {
			case 'M': // mark (accent, umlaut, ...)
				$mark["U+$char"] = 1;
				break;

			case 'N': // numeric value
				if ($ord > 0x80 && $num != '') {
					$number["U+$char"] = $num;
				}
				break;
		}

		// accented Latin letters without "official" decomposition
		$match = array();
		if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
			$c = ord($match[2]);
			if ($match[1] == 'SMALL') {
				// the name always carries the capital letter; shift to lower case
				$c += 32;
			}

			$decomposition["U+$char"] = array(dechex($c));
			continue;
		}

		$match = array();
		if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
			switch ($match[1]) {
				case '<circle>': // add parenthesis as circle replacement, eg (1)
					$match[2] = '0028 ' . $match[2] . ' 0029';
					break;

				case '<square>': // add square brackets as square replacement, eg [1]
					$match[2] = '005B ' . $match[2] . ' 005D';
					break;

				case '<compat>': // ignore multi char decompositions that start with a space
					if (preg_match('/^0020 /', $match[2])) {
						continue 2;
					}
					break;

				// ignore Arabic and vertical layout presentation decomposition
				case '<initial>':
				case '<medial>':
				case '<final>':
				case '<isolated>':
				case '<vertical>':
					continue 2;
			}
			$decomposition["U+$char"] = explode(' ', $match[2]);
		}
	}
	fclose($fh);

	// process additional Unicode data for casing (allow folded characters to expand into a sequence)
	$specialCasingFile = PATH_t3lib . 'unidata/SpecialCasing.txt';
	if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
		$fh = fopen($specialCasingFile, 'rb');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh, 4096);
				if ($line === FALSE) {
					break;
				}
				if (substr($line, 0, 1) != '#' && trim($line) != '') {

					list($char, $lower, $title, $upper, $cond) = array_pad(t3lib_div::trimExplode(';', $line), 5, '');
					// only unconditional mappings: the 5th field is empty or already the trailing comment
					if ($cond == '' || substr($cond, 0, 1) == '#') {
						$utf8_char = $this->UnumberToChar(hexdec($char));
						if ($char != $lower) {
							$arr = explode(' ', $lower);
							for ($i = 0; isset($arr[$i]); $i++) {
								$arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
							}
							$utf8CaseFolding['toLower'][$utf8_char] = implode('', $arr);
						}
						if ($char != $title && $title != $upper) {
							$arr = explode(' ', $title);
							for ($i = 0; isset($arr[$i]); $i++) {
								$arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
							}
							$utf8CaseFolding['toTitle'][$utf8_char] = implode('', $arr);
						}
						if ($char != $upper) {
							$arr = explode(' ', $upper);
							for ($i = 0; isset($arr[$i]); $i++) {
								$arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
							}
							$utf8CaseFolding['toUpper'][$utf8_char] = implode('', $arr);
						}
					}
				}
			}
			fclose($fh);
		}
	}

	// process custom decompositions
	$customTranslitFile = PATH_t3lib . 'unidata/Translit.txt';
	if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
		$fh = fopen($customTranslitFile, 'rb');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh, 4096);
				if ($line === FALSE) {
					break;
				}
				if (substr($line, 0, 1) != '#' && trim($line) != '') {
					list($char, $translit) = array_pad(t3lib_div::trimExplode(';', $line), 2, '');
					if (!$translit) {
						// empty transliteration: the character is to be dropped
						$omit["U+$char"] = 1;
					}
					$decomposition["U+$char"] = explode(' ', $translit);
				}
			}
			fclose($fh);
		}
	}

	// decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
	foreach ($decomposition as $from => $to) {
		$code_decomp = array();

		while ($code_value = array_shift($to)) {
			if (isset($decomposition["U+$code_value"])) { // do recursive decomposition
				foreach (array_reverse($decomposition["U+$code_value"]) as $cv) {
					array_unshift($to, $cv);
				}
			} elseif (!isset($mark["U+$code_value"])) { // remove mark
				array_push($code_decomp, $code_value);
			}
		}
		if (count($code_decomp) || isset($omit[$from])) {
			$decomposition[$from] = $code_decomp;
		} else {
			unset($decomposition[$from]);
		}
	}

	// create ascii only mapping
	$this->toASCII['utf-8'] = array();
	$ascii =& $this->toASCII['utf-8'];

	foreach ($decomposition as $from => $to) {
		$code_decomp = array();
		while ($code_value = array_shift($to)) {
			$ord = hexdec($code_value);
			if ($ord > 127) {
				// skip decompositions containing non-ASCII chars
				continue 2;
			}
			array_push($code_decomp, chr($ord));
		}
		// keys look like "U+00C0": strip the prefix so hexdec() only sees hex digits
		$ascii[$this->UnumberToChar(hexdec(substr($from, 2)))] = join('', $code_decomp);
	}

	// add numeric decompositions
	foreach ($number as $from => $to) {
		$utf8_char = $this->UnumberToChar(hexdec(substr($from, 2)));
		if (!isset($ascii[$utf8_char])) {
			$ascii[$utf8_char] = $to;
		}
	}

	if ($cacheFileCase) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
	}

	if ($cacheFileASCII) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
	}

	return 3;
}
1302
1303 /**
1304 * This function initializes the folding table for a charset other than UTF-8.
1305 * This function is automatically called by the case folding functions.
1306 *
1307 * @param string Charset for which to initialize case folding.
1308 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
1309 * @access private
1310 */
function initCaseFolding($charset) {
	// Table already in memory for this charset.
	if (is_array($this->caseFolding[$charset])) {
		return 1;
	}

	// A cached, serialized table takes precedence over rebuilding.
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_' . $charset . '.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

	// Both the charset <-> UTF-8 table and the UTF-8 case folding table are
	// needed; the latter serves as the base conversion table.
	if (!$this->initCharset($charset) || !$this->initUnicodeData('case')) {
		return FALSE;
	}

	$nochar = chr($this->noCharByteVal);
	$foldingDirections = array('toUpper', 'toLower', 'toTitle');

	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
		// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$localChar = $this->utf8_decode($utf8, $charset);

		foreach ($foldingDirections as $direction) {
			$folded = $this->utf8_decode($this->caseFolding['utf-8'][$direction][$utf8], $charset);
			// keep the mapping only if it produced a real character in this charset
			if ($folded != '' && $folded != $nochar) {
				$this->caseFolding[$charset][$direction][$localChar] = $folded;
			}
		}
	}

	// add the ASCII case table
	foreach (range('a', 'z') as $letter) {
		$this->caseFolding[$charset]['toUpper'][$letter] = chr(ord($letter) - 32);
	}
	foreach (range('A', 'Z') as $letter) {
		$this->caseFolding[$charset]['toLower'][$letter] = chr(ord($letter) + 32);
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->caseFolding[$charset]));
	}

	return 3;
}
1372
1373 /**
1374 * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
1375 * This function is automatically called by the ASCII transliteration functions.
1376 *
1377 * @param string Charset for which to initialize conversion.
1378 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
1379 * @access private
1380 */
function initToASCII($charset) {
	// Table already in memory for this charset.
	if (is_array($this->toASCII[$charset])) {
		return 1;
	}

	// A cached, serialized table takes precedence over rebuilding.
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_' . $charset . '.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->toASCII[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

	// The charset <-> UTF-8 table and the UTF-8/ASCII transliteration table
	// (used as the base conversion table) must both be available.
	if (!$this->initCharset($charset) || !$this->initUnicodeData('ascii')) {
		return FALSE;
	}

	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
		// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$localChar = $this->utf8_decode($utf8, $charset);

		// copy the transliteration over when the UTF-8 table knows this character
		if (isset($this->toASCII['utf-8'][$utf8])) {
			$this->toASCII[$charset][$localChar] = $this->toASCII['utf-8'][$utf8];
		}
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->toASCII[$charset]));
	}

	return 3;
}
1420
1421
1422 /********************************************
1423 *
1424 * String operation functions
1425 *
1426 ********************************************/
1427
1428 /**
1429 * Returns a part of a string.
1430 * Unit-tested by Kasper (single byte charsets only)
1431 *
1432 * @param string The character set
1433 * @param string Character string
1434 * @param integer Start position (character position)
1435 * @param integer Length (in characters)
1436 * @return string The substring
1437 * @see substr(), mb_substr()
1438 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1439 */
function substr($charset, $string, $start, $len = NULL) {
	// Dispatches to mbstring, iconv or the internal per-charset implementations,
	// depending on $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'] and on $charset.
	// An omitted $len (NULL) means "up to the end of the string" in every branch.
	if ($len === 0 || $string === '') {
		return '';
	}

	$utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];

	if ($utils == 'mbstring') {
		// cannot omit $len, when specifying charset
		if ($len == NULL) {
			$enc = mb_internal_encoding(); // save internal encoding
			mb_internal_encoding($charset);
			$str = mb_substr($string, $start);
			mb_internal_encoding($enc); // restore internal encoding

			return $str;
		}
		return mb_substr($string, $start, $len, $charset);
	}

	if ($utils == 'iconv') {
		// cannot omit $len, when specifying charset
		if ($len == NULL) {
			$enc = iconv_get_encoding('internal_encoding'); // save internal encoding
			iconv_set_encoding('internal_encoding', $charset);
			$str = iconv_substr($string, $start);
			iconv_set_encoding('internal_encoding', $enc); // restore internal encoding

			return $str;
		}
		return iconv_substr($string, $start, $len, $charset);
	}

	if ($charset == 'utf-8') {
		return $this->utf8_substr($string, $start, $len);
	}
	if ($this->eucBasedSets[$charset]) {
		return $this->euc_substr($string, $start, $charset, $len);
	}

	// Fixed-width multibyte charsets: scale character positions to byte positions.
	// $len must not be scaled when omitted: NULL * 2 is 0 and substr(..., 0)
	// would return an empty string instead of the rest of the string.
	if ($this->twoByteSets[$charset]) {
		return $len === NULL ? substr($string, $start * 2) : substr($string, $start * 2, $len * 2);
	}
	if ($this->fourByteSets[$charset]) {
		return $len === NULL ? substr($string, $start * 4) : substr($string, $start * 4, $len * 4);
	}

	// treat everything else as single-byte encoding
	return $len === NULL ? substr($string, $start) : substr($string, $start, $len);
}
1484
1485 /**
1486 * Counts the number of characters.
1487 * Unit-tested by Kasper (single byte charsets only)
1488 *
1489 * @param string The character set
1490 * @param string Character string
1491 * @return integer The number of characters
1492 * @see strlen()
1493 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1494 */
function strlen($charset, $string) {
	// A configured extension handles every charset itself.
	switch ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) {
		case 'mbstring':
			return mb_strlen($string, $charset);
		case 'iconv':
			return iconv_strlen($string, $charset);
	}

	// Variable-width charsets need their own counting routines.
	if ($charset == 'utf-8') {
		return $this->utf8_strlen($string);
	}
	if ($this->eucBasedSets[$charset]) {
		return $this->euc_strlen($string, $charset);
	}

	// Fixed-width charsets: derive the character count from the byte count.
	$byteLength = strlen($string);
	if ($this->twoByteSets[$charset]) {
		return $byteLength / 2;
	}
	if ($this->fourByteSets[$charset]) {
		return $byteLength / 4;
	}

	// treat everything else as single-byte encoding
	return $byteLength;
}
1512
1513 /**
1514 * Method to crop strings using the mb_substr function.
1515 *
1516 * @param string The character set
1517 * @param string String to be cropped
1518 * @param integer Crop length (in characters)
1519 * @param string Crop signifier
1520 * @return string The shortened string
1521 * @see mb_strlen(), mb_substr()
1522 */
protected function cropMbstring($charset, $string, $len, $crop = '') {
	// No cropping for a zero length ...
	if (intval($len) === 0) {
		return $string;
	}

	// ... nor when the string already fits into abs($len) characters.
	$charCount = mb_strlen($string, $charset);
	if ($charCount <= abs($len)) {
		return $string;
	}

	// Positive length keeps the beginning and appends the signifier,
	// negative length keeps the end and prepends it.
	return $len > 0
		? mb_substr($string, 0, $len, $charset) . $crop
		: $crop . mb_substr($string, $len, $charCount, $charset);
}
1536
1537 /**
1538 * Truncates a string and pre-/appends a string.
1539 * Unit tested by Kasper
1540 *
1541 * @param string The character set
1542 * @param string Character string
1543 * @param integer Length (in characters)
1544 * @param string Crop signifier
1545 * @return string The shortened string
1546 * @see substr(), mb_strimwidth()
1547 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1548 */
function crop($charset, $string, $len, $crop = '') {
	// Positive $len keeps the first $len characters and appends $crop,
	// negative $len keeps the last abs($len) characters and prepends $crop.
	// The string is returned unchanged when nothing needs to be cut off.
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return $this->cropMbstring($charset, $string, $len, $crop);
	}

	if (intval($len) == 0) {
		return $string;
	}

	// Translate the character position into a byte position ($i).
	if ($charset == 'utf-8') {
		$i = $this->utf8_char2byte_pos($string, $len);
	} elseif ($this->eucBasedSets[$charset]) {
		$i = $this->euc_char2byte_pos($string, $len, $charset);
	} elseif ($len > 0) {
		$i = $len;
	} else {
		$i = strlen($string) + $len;
		if ($i <= 0) {
			$i = FALSE;
		}
	}

	if ($i === FALSE) { // $len outside actual string length
		return $string;
	}

	// Explicit bounds checks replace the former probing of single bytes via
	// the curly-brace offset syntax, which no longer parses on current PHP.
	$byteLength = strlen($string);
	if ($len > 0) {
		// crop only if there is at least one byte at/after the cut position
		if ($i < $byteLength) {
			return substr($string, 0, $i) . $crop;
		}
	} elseif ($i > 0 && $i <= $byteLength) {
		// crop only if there is at least one byte before the cut position
		return $crop . substr($string, $i);
	}

	return $string;
}
1599
1600 /**
1601 * Cuts a string short at a given byte length.
1602 *
1603 * @param string The character set
1604 * @param string Character string
1605 * @param integer The byte length
1606 * @return string The shortened string
1607 * @see mb_strcut()
1608 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1609 */
function strtrunc($charset, $string, $len) {
	if ($len <= 0) {
		return '';
	}

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strcut($string, 0, $len, $charset);
	}
	if ($charset == 'utf-8') {
		return $this->utf8_strtrunc($string, $len);
	}
	if ($this->eucBasedSets[$charset]) {
		return $this->euc_strtrunc($string, $len, $charset);
	}

	// Fixed-width charsets: move the cut back to a character boundary.
	if ($this->twoByteSets[$charset]) {
		// don't cut at odd positions
		$len -= $len % 2;
	} elseif ($this->fourByteSets[$charset]) {
		// realign to position dividable by four
		$len -= $len % 4;
	}

	// treat everything else as single-byte encoding
	return substr($string, 0, $len);
}
1632
1633 /**
1634 * Translates all characters of a string into their respective case values.
1635 * Unlike strtolower() and strtoupper() this method is locale independent.
1636 * Note that the string length may change!
1637 * eg. lower case German "ß" (sharp S) becomes upper case "SS"
1638 * Unit-tested by Kasper
1639 * Real case folding is language dependent, this method ignores this fact.
1640 *
1641 * @param string Character set of string
1642 * @param string Input string to convert case for
1643 * @param string Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
1644 * @return string The converted string
1645 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1646 * @see strtolower(), strtoupper()
1647 */
function conv_case($charset, $string, $case) {
	// mbstring, when configured, handles all charsets.
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return $case == 'toLower'
			? mb_strtolower($string, $charset)
			: mb_strtoupper($string, $charset);
	}

	// Otherwise pick the internal mapping routine matching the charset family.
	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string, 'case', $case);
	}
	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string, $charset, 'case', $case);
	}

	// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string, $charset, 'case', $case);
}
1666
1667 /**
1668 * Equivalent of lcfirst/ucfirst but using character set.
1669 *
1670 * @param string $charset
1671 * @param string $string
1672 * @param string $case
1673 * @return string
1674 * @see t3lib_cs::conv_case()
1675 */
public function convCaseFirst($charset, $string, $case) {
	// Convert only the first character, then glue the untouched rest back on.
	$convertedHead = $this->conv_case(
		$charset,
		$this->substr($charset, $string, 0, 1),
		$case
	);
	$untouchedTail = $this->substr($charset, $string, 1);

	return $convertedHead . $untouchedTail;
}
1682
1683 /**
1684 * Converts special chars (like æøåÆØÅ, umlauts etc) to ascii equivalents (usually double-bytes, like æ => ae etc.)
1685 *
1686 * @param string $charset Character set of string
1687 * @param string $string Input string to convert
1688 * @return string The converted string
1689 */
function specCharsToASCII($charset, $string) {
	// Pick the 'ascii' mapping routine matching the charset family.
	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string, 'ascii');
	}
	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string, $charset, 'ascii');
	}

	// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string, $charset, 'ascii');
}
1702
1703
1704 /**
1705 * converts the language codes that we get from the client (usually HTTP_ACCEPT_LANGUAGE)
1706 * into a TYPO3-readable language code
1707 * @param $languageCodesList list of language codes. something like 'de,en-us;q=0.9,de-de;q=0.7,es-cl;q=0.6,en;q=0.4,es;q=0.3,zh;q=0.1'
1708 * see http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4
1709 * @return string a preferred language that TYPO3 supports, or "default" if none found
1710 * @author Benjamin Mack (benni.typo3.org)
1711 */
public function getPreferredClientLanguage($languageCodesList) {
	// Builds a lookup "client language code => TYPO3 language key" and returns
	// the TYPO3 key of the highest rated client language found in it,
	// or "default" if there is no match (or the match is "en").
	$allLanguageCodes = array();
	$selectedLanguage = 'default';

	// get all languages where TYPO3 code is the same as the ISO code
	foreach (array_keys($this->charSetArray) as $typo3Lang) {
		$allLanguageCodes[$typo3Lang] = $typo3Lang;
	}

	// get all languages where TYPO3 code differs from ISO code
	// or needs the country part
	// the iso codes will here overwrite the default typo3 language in the key
	foreach ($this->locales->getIsoMapping() as $typo3Lang => $isoLang) {
		// "xx_YY" becomes "xx-YY"
		$allLanguageCodes[$typo3Lang] = str_replace('_', '-', $isoLang);
	}

	// move the iso codes to the keys (because we're comparing the keys with "isset" later on)
	$allLanguageCodes = array_flip($allLanguageCodes);

	$preferredLanguages = t3lib_div::trimExplode(',', $languageCodesList);
	// collect "language code => quality"; entries without ";q=" count as 1.0
	$sortedPreferredLanguages = array();
	foreach ($preferredLanguages as $preferredLanguage) {
		$quality = 1.0;
		if (strpos($preferredLanguage, ';q=') !== FALSE) {
			list($preferredLanguage, $quality) = explode(';q=', $preferredLanguage);
		}
		$sortedPreferredLanguages[$preferredLanguage] = $quality;
	}

	// loop through the languages, with the highest priority first
	arsort($sortedPreferredLanguages, SORT_NUMERIC);
	foreach (array_keys($sortedPreferredLanguages) as $preferredLanguage) {
		if (isset($allLanguageCodes[$preferredLanguage])) {
			$selectedLanguage = $allLanguageCodes[$preferredLanguage];
			break;
		}

		// strip the country code from the end; codes without a country part
		// ("de") yield a single element, so only index 0 is read
		$languageParts = explode('-', $preferredLanguage, 2);
		$baseLanguage = $languageParts[0];
		if (isset($allLanguageCodes[$baseLanguage])) {
			$selectedLanguage = $allLanguageCodes[$baseLanguage];
			break;
		}
	}

	if (!$selectedLanguage || $selectedLanguage == 'en') {
		$selectedLanguage = 'default';
	}
	return $selectedLanguage;
}
1764
1765
1766 /********************************************
1767 *
1768 * Internal string operation functions
1769 *
1770 ********************************************/
1771
1772 /**
1773 * Maps all characters of a string in a single byte charset.
1774 *
1775 * @param string the string
1776 * @param string the charset
1777 * @param string mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
1778 * @param string 'case': conversion 'toLower' or 'toUpper'
1779 * @return string the converted string
1780 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1781 */
function sb_char_mapping($str, $charset, $mode, $opt = '') {
	// Select the lookup table for the requested mode; the string is returned
	// untouched when the mode is unknown or the table cannot be initialized.
	switch ($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) {
				return $str; // do nothing
			}
			$map =& $this->caseFolding[$charset][$opt];
			break;

		case 'ascii':
			if (!$this->initToASCII($charset)) {
				return $str; // do nothing
			}
			$map =& $this->toASCII[$charset];
			break;

		default:
			return $str;
	}

	// Map byte by byte. The loop is bounded by the byte length instead of
	// probing offsets past the end with the curly-brace syntax, which no
	// longer parses on current PHP.
	$out = '';
	$byteLength = strlen($str);
	for ($i = 0; $i < $byteLength; $i++) {
		$c = $str[$i];
		$out .= isset($map[$c]) ? $map[$c] : $c;
	}

	return $out;
}
1814
1815
1816 /********************************************
1817 *
1818 * Internal UTF-8 string operation functions
1819 *
1820 ********************************************/
1821
1822 /**
1823 * Returns a part of a UTF-8 string.
1824 * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
1825 *
1826 * @param string UTF-8 string
1827 * @param integer Start position (character position)
1828 * @param integer Length (in characters)
1829 * @return string The substring
1830 * @see substr()
1831 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1832 */
function utf8_substr($str, $start, $len = NULL) {
	// An explicit length of zero always yields an empty string.
	if (strcmp($len, '0') == 0) {
		return '';
	}

	$byte_start = $this->utf8_char2byte_pos($str, $start);
	if ($byte_start === FALSE && $start > 0) {
		// $start outside string length
		return FALSE;
	}

	// For a non-positive $start without a byte position, FALSE is passed on
	// to substr() and thereby acts as offset 0.
	$str = substr($str, $byte_start);

	if ($len == NULL) {
		return $str;
	}

	$byte_end = $this->utf8_char2byte_pos($str, $len);
	if ($byte_end !== FALSE) {
		return substr($str, 0, $byte_end);
	}

	// $len outside actual string length:
	// When length is less than zero and exceeds, then we return blank string.
	return $len < 0 ? '' : $str;
}
1864
1865 /**
1866 * Counts the number of characters of a string in UTF-8.
1867 * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
1868 *
1869 * @param string UTF-8 multibyte character string
1870 * @return integer The number of characters
1871 * @see strlen()
1872 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1873 */
function utf8_strlen($str) {
	// Counts every byte that starts a character: single bytes (0xxxxxxx) and
	// multibyte lead bytes (11xxxxxx). Continuation bytes (10xxxxxx) are skipped.
	// The loop is bounded by the byte length instead of probing offsets past
	// the end with the curly-brace syntax, which no longer parses on current PHP.
	$str = (string) $str;
	$byteLength = strlen($str);

	$n = 0;
	for ($i = 0; $i < $byteLength; $i++) {
		if ((ord($str[$i]) & 0xC0) != 0x80) {
			$n++;
		}
	}
	return $n;
}
1889
1890 /**
1891 * Truncates a string in UTF-8 short at a given byte length.
1892 *
1893 * @param string UTF-8 multibyte character string
1894 * @param integer the byte length
1895 * @return string the shortened string
1896 * @see mb_strcut()
1897 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1898 */
1899 function utf8_strtrunc($str, $len) {
1900 $i = $len - 1;
1901 if (ord($str{$i}) & 0x80) { // part of a multibyte sequence
1902 for (; $i > 0 && !(ord($str{$i}) & 0x40); $i--) {
1903 // find the first byte
1904 ;
1905 }
1906 if ($i <= 0) {
1907 return '';
1908 } // sanity check
1909 for ($bc = 0, $mbs = ord($str{$i}); $mbs & 0x80; $mbs = $mbs << 1) {
1910 // calculate number of bytes
1911 $bc++;
1912 }
1913 if ($bc + $i > $len) {
1914 return substr($str, 0, $i);
1915 }
1916 // fallthru: multibyte char fits into length
1917 }
1918 return substr($str, 0, $len);
1919 }
1920
1921 /**
1922 * Find position of first occurrence of a string, both arguments are in UTF-8.
1923 *
1924 * @param string UTF-8 string to search in
1925 * @param string UTF-8 string to search for
1926 * @param integer Positition to start the search
1927 * @return integer The character position
1928 * @see strpos()
1929 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1930 */
1931 function utf8_strpos($haystack, $needle, $offset = 0) {
1932 if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
1933 return mb_strpos($haystack, $needle, $offset, 'utf-8');
1934 } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
1935 return iconv_strpos($haystack, $needle, $offset, 'utf-8');
1936 }
1937
1938 $byte_offset = $this->utf8_char2byte_pos($haystack, $offset);
1939 if ($byte_offset === FALSE) {
1940 return FALSE;
1941 } // offset beyond string length
1942
1943 $byte_pos = strpos($haystack, $needle, $byte_offset);
1944 if ($byte_pos === FALSE) {
1945 return FALSE;
1946 } // needle not found
1947
1948 return $this->utf8_byte2char_pos($haystack, $byte_pos);
1949 }
1950
1951 /**
1952 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
1953 *
1954 * @param string UTF-8 string to search in
1955 * @param string UTF-8 character to search for (single character)
1956 * @return integer The character position
1957 * @see strrpos()
1958 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1959 */
1960 function utf8_strrpos($haystack, $needle) {
1961 if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
1962 return mb_strrpos($haystack, $needle, 'utf-8');
1963 } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
1964 return iconv_strrpos($haystack, $needle, 'utf-8');
1965 }
1966
1967 $byte_pos = strrpos($haystack, $needle);
1968 if ($byte_pos === FALSE) {
1969 return FALSE;
1970 } // needle not found
1971
1972 return $this->utf8_byte2char_pos($haystack, $byte_pos);
1973 }
1974
1975 /**
1976 * Translates a character position into an 'absolute' byte position.
1977 * Unit tested by Kasper.
1978 *
1979 * @param string UTF-8 string
1980 * @param integer Character position (negative values start from the end)
1981 * @return integer Byte position
1982 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1983 */
1984 function utf8_char2byte_pos($str, $pos) {
1985 $n = 0; // number of characters found
1986 $p = abs($pos); // number of characters wanted
1987
1988 if ($pos >= 0) {
1989 $i = 0;
1990 $d = 1;
1991 } else {
1992 $i = strlen($str) - 1;
1993 $d = -1;
1994 }
1995
1996 for (; strlen($str{$i}) && $n < $p; $i += $d) {
1997 $c = (int) ord($str{$i});
1998 if (!($c & 0x80)) // single-byte (0xxxxxx)
1999 {
2000 $n++;
2001 }
2002 elseif (($c & 0xC0) == 0xC0) // multi-byte starting byte (11xxxxxx)
2003 {
2004 $n++;
2005 }
2006 }
2007 if (!strlen($str{$i})) {
2008 return FALSE;
2009 } // offset beyond string length
2010
2011 if ($pos >= 0) {
2012 // skip trailing multi-byte data bytes
2013 while ((ord($str{$i}) & 0x80) && !(ord($str{$i}) & 0x40)) {
2014 $i++;
2015 }
2016 } else {
2017 // correct offset
2018 $i++;
2019 }
2020
2021 return $i;
2022 }
2023
2024 /**
2025 * Translates an 'absolute' byte position into a character position.
2026 * Unit tested by Kasper.
2027 *
2028 * @param string UTF-8 string
2029 * @param integer byte position
2030 * @return integer character position
2031 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
2032 */
2033 function utf8_byte2char_pos($str, $pos) {
2034 $n = 0; // number of characters
2035 for ($i = $pos; $i > 0; $i--) {
2036 $c = (int) ord($str{$i});
2037 if (!($c & 0x80)) // single-byte (0xxxxxx)
2038 {
2039 $n++;
2040 }
2041 elseif (($c & 0xC0) == 0xC0) // multi-byte starting byte (11xxxxxx)
2042 {
2043 $n++;
2044 }
2045 }
2046 if (!strlen($str{$i})) {
2047 return FALSE;
2048 } // offset beyond string length
2049
2050 return $n;
2051 }
2052
2053 /**
2054 * Maps all characters of an UTF-8 string.
2055 *
2056 * @param string UTF-8 string
2057 * @param string mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
2058 * @param string 'case': conversion 'toLower' or 'toUpper'
2059 * @return string the converted string
2060 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
2061 */
2062 function utf8_char_mapping($str, $mode, $opt = '') {
2063 if (!$this->initUnicodeData($mode)) {
2064 return $str;
2065 } // do nothing
2066
2067 $out = '';
2068 switch ($mode) {
2069 case 'case':
2070 $map =& $this->caseFolding['utf-8'][$opt];
2071 break;
2072
2073 case 'ascii':
2074 $map =& $this->toASCII['utf-8'];
2075 break;
2076
2077 default:
2078 return $str;
2079 }
2080
2081 for ($i = 0; strlen($str{$i}); $i++) {
2082 $c = ord($str{$i});
2083 if (!($c & 0x80)) // single-byte (0xxxxxx)
2084 {
2085 $mbc = $str{$i};
2086 }
2087 elseif (($c & 0xC0) == 0xC0) { // multi-byte starting byte (11xxxxxx)
2088 for ($bc = 0; $c & 0x80; $c = $c << 1) {
2089 $bc++;
2090 } // calculate number of bytes
2091 $mbc = substr($str, $i, $bc);
2092 $i += $bc - 1;
2093 }
2094
2095 if (isset($map[$mbc])) {
2096 $out .= $map[$mbc];
2097 } else {
2098 $out .= $mbc;
2099 }
2100 }
2101
2102 return $out;
2103 }
2104
2105
2106 /********************************************
2107 *
2108 * Internal EUC string operation functions
2109 *
2110 * Extended Unix Code:
2111 * ASCII compatible 7bit single bytes chars
2112 * 8bit two byte chars
2113 *
2114 * Shift-JIS is treated as a special case.
2115 *
2116 ********************************************/
2117
2118 /**
2119 * Cuts a string in the EUC charset family short at a given byte length.
2120 *
2121 * @param string EUC multibyte character string
2122 * @param integer the byte length
2123 * @param string the charset
2124 * @return string the shortened string
2125 * @see mb_strcut()
2126 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
2127 */
2128 function euc_strtrunc($str, $len, $charset) {
2129 $sjis = ($charset == 'shift_jis');
2130 for ($i = 0; strlen($str{$i}) && $i < $len; $i++) {
2131 $c = ord($str{$i});
2132 if ($sjis) {
2133 if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) {
2134 $i++;
2135 } // advance a double-byte char
2136 }
2137 else {
2138 if ($c >= 0x80) {
2139 $i++;
2140 } // advance a double-byte char
2141 }
2142 }
2143 if (!strlen($str{$i})) {
2144 return $str;
2145 } // string shorter than supplied length
2146
2147 if ($i > $len) {
2148 return substr($str, 0, $len - 1); // we ended on a first byte
2149 } else {
2150 return substr($str, 0, $len);
2151 }
2152 }
2153
2154 /**
2155 * Returns a part of a string in the EUC charset family.
2156 *
2157 * @param string EUC multibyte character string
2158 * @param integer start position (character position)
2159 * @param string the charset
2160 * @param integer length (in characters)
2161 * @return string the substring
2162 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
2163 */
2164 function euc_substr($str, $start, $charset, $len = NULL) {
2165 $byte_start = $this->euc_char2byte_pos($str, $start, $charset);
2166 if ($byte_start === FALSE) {
2167 return FALSE;
2168 } // $start outside string length
2169
2170 $str = substr($str, $byte_start);
2171
2172 if ($len != NULL) {
2173 $byte_end = $this->euc_char2byte_pos($str, $len, $charset);
2174 if ($byte_end === FALSE) // $len outside actual string length
2175 {
2176 return $str;
2177 }
2178 else
2179 {
2180 return substr($str, 0, $byte_end);
2181 }
2182 }
2183 else {
2184 return $str;
2185 }
2186 }
2187
2188 /**
2189 * Counts the number of characters of a string in the EUC charset family.
2190 *
2191 * @param string EUC multibyte character string
2192 * @param string the charset
2193 * @return integer the number of characters
2194 * @see strlen()
2195 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
2196 */
2197 function euc_strlen($str, $charset) {
2198 $sjis = ($charset == 'shift_jis');
2199 $n = 0;
2200 for ($i = 0; strlen($str{$i}); $i++) {
2201 $c = ord($str{$i});
2202 if ($sjis) {
2203 if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) {
2204 $i++;
2205 } // advance a double-byte char
2206 }
2207 else {
2208 if ($c >= 0x80) {
2209 $i++;
2210 } // advance a double-byte char
2211 }
2212
2213 $n++;
2214 }
2215
2216 return $n;
2217 }
2218
2219 /**
2220 * Translates a character position into an 'absolute' byte position.
2221 *
2222 * @param string EUC multibyte character string
2223 * @param integer character position (negative values start from the end)
2224 * @param string the charset
2225 * @return integer byte position
2226 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
2227 */
2228 function euc_char2byte_pos($str, $pos, $charset) {
2229 $sjis = ($charset == 'shift_jis');
2230 $n = 0; // number of characters seen
2231 $p = abs($pos); // number of characters wanted
2232
2233 if ($pos >= 0) {
2234 $i = 0;
2235 $d = 1;
2236 } else {
2237 $i = strlen($str) - 1;
2238 $d = -1;
2239 }
2240
2241 for (; strlen($str{$i}) && $n < $p; $i += $d) {
2242 $c = ord($str{$i});
2243 if ($sjis) {
2244 if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) {
2245 $i += $d;
2246 } // advance a double-byte char
2247 }
2248 else {
2249 if ($c >= 0x80) {
2250 $i += $d;
2251 } // advance a double-byte char
2252 }
2253
2254 $n++;
2255 }
2256 if (!strlen($str{$i})) {
2257 return FALSE;
2258 } // offset beyond string length
2259
2260 if ($pos < 0) {
2261 $i++;
2262 } // correct offset
2263
2264 return $i;
2265 }
2266
2267 /**
2268 * Maps all characters of a string in the EUC charset family.
2269 *
2270 * @param string EUC multibyte character string
2271 * @param string the charset
2272 * @param string mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
2273 * @param string 'case': conversion 'toLower' or 'toUpper'
2274 * @return string the converted string
2275 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
2276 */
2277 function euc_char_mapping($str, $charset, $mode, $opt = '') {
2278 switch ($mode) {
2279 case 'case':
2280 if (!$this->initCaseFolding($charset)) {
2281 return $str;
2282 } // do nothing
2283 $map =& $this->caseFolding[$charset][$opt];
2284 break;
2285
2286 case 'ascii':
2287 if (!$this->initToASCII($charset)) {
2288 return $str;
2289 } // do nothing
2290 $map =& $this->toASCII[$charset];
2291 break;
2292
2293 default:
2294 return $str;
2295 }
2296
2297 $sjis = ($charset == 'shift_jis');
2298 $out = '';
2299 for ($i = 0; strlen($str{$i}); $i++) {
2300 $mbc = $str{$i};
2301 $c = ord($mbc);
2302
2303 if ($sjis) {
2304 if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) { // a double-byte char
2305 $mbc = substr($str, $i, 2);
2306 $i++;
2307 }
2308 }
2309 else {
2310 if ($c >= 0x80) { // a double-byte char
2311 $mbc = substr($str, $i, 2);
2312 $i++;
2313 }
2314 }
2315
2316 if (isset($map[$mbc])) {
2317 $out .= $map[$mbc];
2318 } else {
2319 $out .= $mbc;
2320 }
2321 }
2322
2323 return $out;
2324 }
2325
2326 }
2327
2328 ?>