Fixed bug #12295: Cleaning config_default, exclude values from being listed in instal...
[Packages/TYPO3.CMS.git] / t3lib / class.t3lib_cs.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2003-2009 Kasper Skaarhoj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the Typo3 project. The Typo3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 *
17 * This script is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * This copyright notice MUST APPEAR in all copies of the script!
23 ***************************************************************/
24 /**
25 * Class for conversion between charsets.
26 *
27 * $Id$
28 *
29 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
30 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
31 */
32 /**
33 * [CLASS/FUNCTION INDEX of SCRIPT]
34 *
35 *
36 *
37 * 136: class t3lib_cs
38 * 488: function parse_charset($charset)
39 * 507: function get_locale_charset($locale)
40 *
41 * SECTION: Charset Conversion functions
42 * 560: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
43 * 600: function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0)
44 * 617: function utf8_encode($str,$charset)
45 * 663: function utf8_decode($str,$charset,$useEntityForNoChar=0)
46 * 706: function utf8_to_entities($str)
47 * 739: function entities_to_utf8($str,$alsoStdHtmlEnt=0)
48 * 773: function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
49 * 823: function UnumberToChar($cbyte)
50 * 868: function utf8CharToUnumber($str,$hex=0)
51 *
52 * SECTION: Init functions
53 * 911: function initCharset($charset)
54 * 973: function initUnicodeData($mode=null)
55 * 1198: function initCaseFolding($charset)
56 * 1260: function initToASCII($charset)
57 *
58 * SECTION: String operation functions
59 * 1331: function substr($charset,$string,$start,$len=null)
60 * 1384: function strlen($charset,$string)
61 * 1414: function crop($charset,$string,$len,$crop='')
62 * 1467: function strtrunc($charset,$string,$len)
63 * 1501: function conv_case($charset,$string,$case)
64 * 1527: function specCharsToASCII($charset,$string)
65 *
66 * SECTION: Internal string operation functions
67 * 1567: function sb_char_mapping($str,$charset,$mode,$opt='')
68 *
69 * SECTION: Internal UTF-8 string operation functions
70 * 1622: function utf8_substr($str,$start,$len=null)
71 * 1655: function utf8_strlen($str)
72 * 1676: function utf8_strtrunc($str,$len)
73 * 1698: function utf8_strpos($haystack,$needle,$offset=0)
74 * 1723: function utf8_strrpos($haystack,$needle)
75 * 1745: function utf8_char2byte_pos($str,$pos)
76 * 1786: function utf8_byte2char_pos($str,$pos)
77 * 1809: function utf8_char_mapping($str,$mode,$opt='')
78 *
79 * SECTION: Internal EUC string operation functions
80 * 1885: function euc_strtrunc($str,$len,$charset)
81 * 1914: function euc_substr($str,$start,$charset,$len=null)
82 * 1939: function euc_strlen($str,$charset)
83 * 1966: function euc_char2byte_pos($str,$pos,$charset)
84 * 2007: function euc_char_mapping($str,$charset,$mode,$opt='')
85 *
86 * TOTAL FUNCTIONS: 35
87 * (This index is automatically created/updated by the extension "extdeveval")
88 *
89 */
90
91
92
93
94
95
96
97
98 /**
99 * Notes on UTF-8
100 *
101 * Functions working on UTF-8 strings:
102 *
103 * - strchr/strstr
104 * - strrchr
105 * - substr_count
106 * - implode/explode/join
107 *
108 * Functions nearly working on UTF-8 strings:
109 *
110 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
111 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
112 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
113 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
114 * - preg_*: Support compiled into PHP by default nowadays, but could be unavailable, need to use modifier
115 *
116 * Functions NOT working on UTF-8 strings:
117 *
118 * - str*cmp
119 * - stristr
120 * - stripos
121 * - substr
122 * - strrev
123 * - split/spliti
124 * - ...
125 *
126 */
127 /**
128 * Class for conversion between charsets
129 *
130 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
131 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
132 * @package TYPO3
133 * @subpackage t3lib
134 */
135 class t3lib_cs {
136 var $noCharByteVal=63; // ASCII Value for chars with no equivalent.
137
138 // This is the array where parsed conversion tables are stored (cached)
139 var $parsedCharsets=array();
140
141 // An array where case folding data will be stored (cached)
142 var $caseFolding=array();
143
144 // An array where charset-to-ASCII mappings are stored (cached)
145 var $toASCII=array();
146
147 // This tells the converter which charsets has two bytes per char:
148 var $twoByteSets=array(
149 'ucs-2'=>1, // 2-byte Unicode
150 );
151
152 // This tells the converter which charsets has four bytes per char:
153 var $fourByteSets=array(
154 'ucs-4'=>1, // 4-byte Unicode
155 'utf-32'=>1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
156 );
157
158 // This tells the converter which charsets use a scheme like the Extended Unix Code:
159 var $eucBasedSets=array(
160 'gb2312'=>1, // Chinese, simplified.
161 'big5'=>1, // Chinese, traditional.
162 'euc-kr'=>1, // Korean
163 'shift_jis'=>1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
164 );
165
166 // see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
167 // http://czyborra.com/charsets/iso8859.html
// Charset aliases (lowercase) mapped to the canonical charset name; parse_charset() looks names up here.
// Each alias is listed exactly once (a second 'utf8' entry used to be repeated further down the list).
var $synonyms=array(
	'us' => 'ascii',
	'us-ascii'=> 'ascii',
	'cp819' => 'iso-8859-1',
	'ibm819' => 'iso-8859-1',
	'iso-ir-100' => 'iso-8859-1',
	'iso-ir-101' => 'iso-8859-2',
	'iso-ir-109' => 'iso-8859-3',
	'iso-ir-110' => 'iso-8859-4',
	'iso-ir-144' => 'iso-8859-5',
	'iso-ir-127' => 'iso-8859-6',
	'iso-ir-126' => 'iso-8859-7',
	'iso-ir-138' => 'iso-8859-8',
	'iso-ir-148' => 'iso-8859-9',
	'iso-ir-157' => 'iso-8859-10',
	'iso-ir-179' => 'iso-8859-13',
	'iso-ir-199' => 'iso-8859-14',
	'iso-ir-203' => 'iso-8859-15',
	'csisolatin1' => 'iso-8859-1',
	'csisolatin2' => 'iso-8859-2',
	'csisolatin3' => 'iso-8859-3',
	'csisolatin5' => 'iso-8859-9',
	'csisolatin8' => 'iso-8859-14',
	'csisolatin9' => 'iso-8859-15',
	'csisolatingreek' => 'iso-8859-7',
	'iso-celtic' => 'iso-8859-14',
	'latin1' => 'iso-8859-1',
	'latin2' => 'iso-8859-2',
	'latin3' => 'iso-8859-3',
	'latin5' => 'iso-8859-9',
	'latin6' => 'iso-8859-10',
	'latin8' => 'iso-8859-14',
	'latin9' => 'iso-8859-15',
	'l1' => 'iso-8859-1',
	'l2' => 'iso-8859-2',
	'l3' => 'iso-8859-3',
	'l5' => 'iso-8859-9',
	'l6' => 'iso-8859-10',
	'l8' => 'iso-8859-14',
	'l9' => 'iso-8859-15',
	'cyrillic' => 'iso-8859-5',
	'arabic' => 'iso-8859-6',
	'tis-620' => 'iso-8859-11',
	'win874' => 'windows-874',
	'win1250' => 'windows-1250',
	'win1251' => 'windows-1251',
	'win1252' => 'windows-1252',
	'win1253' => 'windows-1253',
	'win1254' => 'windows-1254',
	'win1255' => 'windows-1255',
	'win1256' => 'windows-1256',
	'win1257' => 'windows-1257',
	'win1258' => 'windows-1258',
	'cp1250' => 'windows-1250',
	'cp1251' => 'windows-1251',
	'cp1252' => 'windows-1252',
	'ms-ee' => 'windows-1250',
	'ms-ansi' => 'windows-1252',
	'ms-greek' => 'windows-1253',
	'ms-turk' => 'windows-1254',
	'winbaltrim' => 'windows-1257',
	'koi-8ru' => 'koi-8r',
	'koi8r' => 'koi-8r',
	'cp878' => 'koi-8r',
	'mac' => 'macroman',
	'macintosh' => 'macroman',
	'euc-cn' => 'gb2312',
	'x-euc-cn' => 'gb2312',
	'euccn' => 'gb2312',
	'cp936' => 'gb2312',
	'big-5' => 'big5',
	'cp950' => 'big5',
	'eucjp' => 'euc-jp',
	'sjis' => 'shift_jis',
	'shift-jis' => 'shift_jis',
	'cp932' => 'shift_jis',
	'cp949' => 'euc-kr',
	'utf7' => 'utf-7',
	'utf8' => 'utf-8',
	'utf16' => 'utf-16',
	'utf32' => 'utf-32',
	'ucs2' => 'ucs-2',
	'ucs4' => 'ucs-4',
);
253
254 // mapping of iso-639-1 language codes to script names
// Language identifier (lowercase) => script/language-family name.
// The script names are the keys of $script_to_charset_unix / $script_to_charset_windows;
// get_locale_charset() resolves the language part of a locale through this table.
var $lang_to_script=array(
	// iso-639-1 language codes, see http://www.loc.gov/standards/iso639-2/php/code_list.php
	'ar' => 'arabic',
	'bg' => 'cyrillic', // Bulgarian
	'bs' => 'east_european', // Bosnian
	'cs' => 'east_european', // Czech
	'da' => 'west_european', // Danish
	'de' => 'west_european', // German
	'el' => 'greek', // Greek (ISO 639-1 code; 'gr' below is the TYPO3 key)
	'es' => 'west_european', // Spanish
	'et' => 'estonian',
	'eo' => 'unicode', // Esperanto
	'eu' => 'west_european', // Basque
	'fa' => 'arabic', // Persian
	'fi' => 'west_european', // Finnish
	'fo' => 'west_european', // Faroese
	'fr' => 'west_european', // French
	'ga' => 'west_european', // Galician
	'ge' => 'unicode', // Georgian
	'gr' => 'greek',
	'he' => 'hebrew', // Hebrew (since 1998)
	'hi' => 'unicode', // Hindi
	'hr' => 'east_european', // Croatian
	'hu' => 'east_european', // Hungarian
	'iw' => 'hebrew', // Hebrew (til 1998)
	'is' => 'west_european', // Icelandic
	'it' => 'west_european', // Italian
	'ja' => 'japanese',
	'kl' => 'west_european', // Greenlandic
	'ko' => 'korean',
	'lt' => 'lithuanian',
	'lv' => 'west_european', // Latvian/Lettish
	'nl' => 'west_european', // Dutch
	'no' => 'west_european', // Norwegian
	'nb' => 'west_european', // Norwegian Bokmal
	'nn' => 'west_european', // Norwegian Nynorsk
	'pl' => 'east_european', // Polish
	'pt' => 'west_european', // Portuguese
	'ro' => 'east_european', // Romanian
	'ru' => 'cyrillic', // Russian
	'sk' => 'east_european', // Slovak
	'sl' => 'east_european', // Slovenian
	'sr' => 'cyrillic', // Serbian
	'sv' => 'west_european', // Swedish
	'sq' => 'albanian', // Albanian
	'th' => 'thai',
	'tr' => 'turkish', // Turkish
	'uk' => 'cyrillic', // Ukrainian
	'vi' => 'vietnamese',
	'zh' => 'chinese',
	// MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
	// http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
	'ara' => 'arabic',
	'bgr' => 'cyrillic', // Bulgarian
	'cat' => 'west_european', // Catalan
	'chs' => 'simpl_chinese',
	'cht' => 'trad_chinese',
	'csy' => 'east_european', // Czech
	'dan' => 'west_european', // Danish
	'deu' => 'west_european', // German
	'dea' => 'west_european', // German (Austrian)
	'des' => 'west_european', // German (Swiss)
	'ena' => 'west_european', // English (Australian)
	'enc' => 'west_european', // English (Canadian)
	'eng' => 'west_european', // English
	'enz' => 'west_european', // English (New Zealand)
	'enu' => 'west_european', // English (United States)
	'euq' => 'west_european', // Basque
	'fos' => 'west_european', // Faroese
	'far' => 'arabic', // Persian
	'fin' => 'west_european', // Finnish
	'fra' => 'west_european', // French
	'frb' => 'west_european', // French (Belgian)
	'frc' => 'west_european', // French (Canadian)
	'frs' => 'west_european', // French (Swiss)
	'geo' => 'unicode', // Georgian
	'glg' => 'west_european', // Galician
	'ell' => 'greek',
	'heb' => 'hebrew',
	'hin' => 'unicode', // Hindi
	'hun' => 'east_european', // Hungarian
	'isl' => 'west_european', // Icelandic (value was misspelled 'west_euorpean', which matches no script)
	'ita' => 'west_european', // Italian
	'its' => 'west_european', // Italian (Swiss)
	'jpn' => 'japanese',
	'kor' => 'korean',
	'lth' => 'lithuanian',
	'lvi' => 'west_european', // Latvian/Lettish
	'msl' => 'west_european', // Malay
	'nlb' => 'west_european', // Dutch (Belgian)
	'nld' => 'west_european', // Dutch
	'nor' => 'west_european', // Norwegian (bokmal)
	'non' => 'west_european', // Norwegian (nynorsk)
	'plk' => 'east_european', // Polish
	'ptg' => 'west_european', // Portuguese
	'ptb' => 'west_european', // Portuguese (Brazil)
	'rom' => 'east_european', // Romanian
	'rus' => 'cyrillic', // Russian
	'slv' => 'east_european', // Slovenian
	'sky' => 'east_european', // Slovak
	'srl' => 'east_european', // Serbian (Latin)
	'srb' => 'cyrillic', // Serbian (Cyrillic)
	'esp' => 'west_european', // Spanish (trad. sort)
	'esm' => 'west_european', // Spanish (Mexican)
	'esn' => 'west_european', // Spanish (internat. sort)
	'sve' => 'west_european', // Swedish
	'sqi' => 'albanian', // Albanian
	'tha' => 'thai',
	'trk' => 'turkish',
	'ukr' => 'cyrillic', // Ukrainian
	// English language names
	'albanian' => 'albanian',
	'arabic' => 'arabic',
	'basque' => 'west_european',
	'bosnian' => 'east_european',
	'bulgarian' => 'cyrillic', // consistent with 'bg' and 'bgr' above
	'catalan' => 'west_european',
	'croatian' => 'east_european',
	'czech' => 'east_european',
	'danish' => 'west_european',
	'dutch' => 'west_european',
	'english' => 'west_european',
	'esperanto' => 'unicode',
	'estonian' => 'estonian',
	'faroese' => 'west_european',
	'farsi' => 'arabic',
	'finnish' => 'west_european',
	'french' => 'west_european',
	'galician' => 'west_european',
	'georgian' => 'unicode',
	'german' => 'west_european',
	'greek' => 'greek',
	'greenlandic' => 'west_european',
	'hebrew' => 'hebrew',
	'hindi' => 'unicode',
	'hungarian' => 'east_european',
	'icelandic' => 'west_european',
	'italian' => 'west_european',
	'latvian' => 'west_european',
	'lettish' => 'west_european',
	'lithuanian' => 'lithuanian',
	'malay' => 'west_european',
	'norwegian' => 'west_european',
	'persian' => 'arabic',
	'polish' => 'east_european',
	'portuguese' => 'west_european',
	'russian' => 'cyrillic',
	'romanian' => 'east_european',
	'serbian' => 'cyrillic',
	'slovak' => 'east_european',
	'slovenian' => 'east_european',
	'spanish' => 'west_european',
	'swedish' => 'west_european',
	'svedish' => 'west_european', // misspelled legacy key, kept for backward compatibility
	'thai' => 'thai',
	'that' => 'thai', // misspelled legacy key, kept for backward compatibility
	'turkish' => 'turkish',
	'ukrainian' => 'cyrillic',
);
410
411 // mapping of language (family) names to charsets on Unix
412 var $script_to_charset_unix=array(
413 'west_european' => 'iso-8859-1',
414 'estonian' => 'iso-8859-1',
415 'east_european' => 'iso-8859-2',
416 'baltic' => 'iso-8859-4',
417 'cyrillic' => 'iso-8859-5',
418 'arabic' => 'iso-8859-6',
419 'greek' => 'iso-8859-7',
420 'hebrew' => 'iso-8859-8',
421 'turkish' => 'iso-8859-9',
422 'thai' => 'iso-8859-11', // = TIS-620
423 'lithuanian' => 'iso-8859-13',
424 'chinese' => 'gb2312', // = euc-cn
425 'japanese' => 'euc-jp',
426 'korean' => 'euc-kr',
427 'simpl_chinese' => 'gb2312',
428 'trad_chinese' => 'big5',
429 'vietnamese' => '',
430 'unicode' => 'utf-8',
431 'albanian' => 'utf-8'
432 );
433
434 // mapping of language (family) names to charsets on Windows
435 var $script_to_charset_windows=array(
436 'east_european' => 'windows-1250',
437 'cyrillic' => 'windows-1251',
438 'west_european' => 'windows-1252',
439 'greek' => 'windows-1253',
440 'turkish' => 'windows-1254',
441 'hebrew' => 'windows-1255',
442 'arabic' => 'windows-1256',
443 'baltic' => 'windows-1257',
444 'estonian' => 'windows-1257',
445 'lithuanian' => 'windows-1257',
446 'vietnamese' => 'windows-1258',
447 'thai' => 'cp874',
448 'korean' => 'cp949',
449 'chinese' => 'gb2312',
450 'japanese' => 'shift_jis',
451 'simpl_chinese' => 'gb2312',
452 'trad_chinese' => 'big5',
453 'albanian' => 'windows-1250',
454 'unicode' => 'utf-8'
455 );
456
457 // mapping of locale names to charsets
// Complete locale strings that map directly to a charset.
// get_locale_charset() lowercases the locale before looking it up here, so keys have to be lowercase.
var $locale_to_charset=array(
	'japanese.euc' => 'euc-jp',
	'ja_jp.ujis' => 'euc-jp',
	'korean.euc' => 'euc-kr',
	'sr@latn' => 'iso-8859-2',
	'sr@Latn' => 'iso-8859-2', // original mixed-case key, kept for backward compatibility
	'zh_cn' => 'gb2312',
	'zh_hk' => 'big5',
	'zh_tw' => 'big5',
);
467
468 // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
469 // Empty values means "iso-8859-1"
470 var $charSetArray = array(
471 'dk' => '',
472 'de' => '',
473 'no' => '',
474 'it' => '',
475 'fr' => '',
476 'es' => '',
477 'nl' => '',
478 'cz' => 'windows-1250',
479 'pl' => 'iso-8859-2',
480 'si' => 'windows-1250',
481 'fi' => '',
482 'tr' => 'iso-8859-9',
483 'se' => '',
484 'pt' => '',
485 'ru' => 'windows-1251',
486 'ro' => 'iso-8859-2',
487 'ch' => 'gb2312',
488 'sk' => 'windows-1250',
489 'lt' => 'windows-1257',
490 'is' => 'utf-8',
491 'hr' => 'windows-1250',
492 'hu' => 'iso-8859-2',
493 'gl' => '',
494 'th' => 'iso-8859-11',
495 'gr' => 'iso-8859-7',
496 'hk' => 'big5',
497 'eu' => '',
498 'bg' => 'windows-1251',
499 'br' => '',
500 'et' => 'iso-8859-4',
501 'ar' => 'iso-8859-6',
502 'he' => 'utf-8',
503 'ua' => 'windows-1251',
504 'jp' => 'shift_jis',
505 'lv' => 'utf-8',
506 'vn' => 'utf-8',
507 'ca' => 'iso-8859-15',
508 'ba' => 'iso-8859-2',
509 'kr' => 'euc-kr',
510 'eo' => 'utf-8',
511 'my' => '',
512 'hi' => 'utf-8',
513 'fo' => 'utf-8',
514 'fa' => 'utf-8',
515 'sr' => 'utf-8',
516 'sq' => 'utf-8',
517 'ge' => 'utf-8',
518 'ga' => '',
519 );
520
521 // TYPO3 specific: Array with the iso names used for each system language in TYPO3:
522 // Missing keys means: same as Typo3
523 var $isoArray = array(
524 'ba' => 'bs',
525 'br' => 'pt_BR',
526 'ch' => 'zh_CN',
527 'cz' => 'cs',
528 'dk' => 'da',
529 'si' => 'sl',
530 'se' => 'sv',
531 'gl' => 'kl',
532 'gr' => 'el',
533 'hk' => 'zh_HK',
534 'kr' => 'ko',
535 'ua' => 'uk',
536 'jp' => 'ja',
537 'vn' => 'vi',
538 );
539
540 /**
541 * Normalize - changes input character set to lowercase letters.
542 *
543 * @param string Input charset
544 * @return string Normalized charset
545 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
546 */
function parse_charset($charset) {
	// Lowercase the name, then strip surrounding whitespace.
	$normalized = trim(strtolower($charset));

	// A known alias is replaced by its canonical name; any other name passes through unchanged.
	return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
}
553
554 /**
555 * Get the charset of a locale.
556 *
557 * ln language
558 * ln_CN language / country
559 * ln_CN.cs language / country / charset
560 * ln_CN.cs@mod language / country / charset / modifier
561 *
562 * @param string Locale string
563 * @return string Charset resolved for locale string
564 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
565 */
function get_locale_charset($locale) {
	// All lookups below are done on the lowercased locale string.
	$locale = strtolower($locale);

	// Exact locale specific charset? The table keys are lowercased too, otherwise a
	// mixed-case key (such as 'sr@Latn') could never match the lowercased locale.
	$localeMap = array_change_key_case($this->locale_to_charset, CASE_LOWER);
	if (isset($localeMap[$locale])) {
		return $localeMap[$locale];
	}

	// Split off the modifier ("@mod"). array_pad() guarantees that list() always
	// receives two elements, also when the locale has no modifier.
	list($locale, $modifier) = array_pad(explode('@', $locale), 2, '');

	// Locale contains a charset (".cs"): normalize it and use it.
	list($locale, $charset) = array_pad(explode('.', $locale), 2, '');
	if ($charset) {
		return $this->parse_charset($charset);
	}

	// Modifier is 'euro' (checked after the charset, because of xx.utf-8@euro).
	if ($modifier === 'euro') {
		return 'iso-8859-15';
	}

	// The language is the part before the first underscore; the country part is not used.
	list($language) = explode('_', $locale);
	$script = isset($this->lang_to_script[$language]) ? $this->lang_to_script[$language] : '';

	// Pick the platform specific charset for the script. Unknown scripts and empty
	// table values fall back to the platform's western European default.
	if (TYPO3_OS == 'WIN') {
		$cs = !empty($this->script_to_charset_windows[$script]) ? $this->script_to_charset_windows[$script] : 'windows-1252';
	} else {
		$cs = !empty($this->script_to_charset_unix[$script]) ? $this->script_to_charset_unix[$script] : 'iso-8859-1';
	}

	return $cs;
}
594
595
596
597
598
599
600
601
602
603 /********************************************
604 *
605 * Charset Conversion functions
606 *
607 ********************************************/
608
609 /**
610 * Convert from one charset to another charset.
611 *
612 * @param string Input string
613 * @param string From charset (the current charset of the string)
614 * @param string To charset (the output charset wanted)
615 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
616 * @return string Converted string
617 * @see convArray()
618 */
function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) {
	// Nothing to do when source and target charset are the same.
	if ($fromCS==$toCS) return $str;

	// PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
	if ($toCS=='utf-8' || !$useEntityForNoChar) {
		$method = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'])
			? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']
			: '';

		// Each backend is only used when its function exists; a configured but missing
		// extension falls through to the built-in conversion instead of a fatal error.
		switch ($method) {
			case 'mbstring':
				if (function_exists('mb_convert_encoding')) {
					try {
						$conv_str = mb_convert_encoding($str,$toCS,$fromCS);
					} catch (ValueError $e) {
						// PHP 8 throws for unknown encodings where older versions returned false
						$conv_str = false;
					}
					if (false !== $conv_str) return $conv_str; // false for unsupported charsets
				}
				break;

			case 'iconv':
				if (function_exists('iconv')) {
					$conv_str = iconv($fromCS,$toCS.'//TRANSLIT',$str);
					if (false !== $conv_str) return $conv_str;
				}
				break;

			case 'recode':
				if (function_exists('recode_string')) {
					$conv_str = recode_string($fromCS.'..'.$toCS,$str);
					if (false !== $conv_str) return $conv_str;
				}
				break;
		}
		// fallback to TYPO3 conversion
	}

	// Built-in conversion: go through UTF-8 as the intermediate representation.
	if ($fromCS!='utf-8') $str=$this->utf8_encode($str,$fromCS);
	if ($toCS!='utf-8') $str=$this->utf8_decode($str,$toCS,$useEntityForNoChar);
	return $str;
}
647
648 /**
649 * Convert all elements in ARRAY from one charset to another charset.
650 * NOTICE: Array is passed by reference!
651 *
652 * @param string Input array, possibly multidimensional
653 * @param string From charset (the current charset of the string)
654 * @param string To charset (the output charset wanted)
655 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
656 * @return void
657 * @see conv()
658 */
function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0) {
	// Walk over the keys and write back through $array[$key], so the caller's array is changed in place.
	foreach ($array as $key => $unused) {
		if (is_array($array[$key])) {
			// Nested array: convert its elements recursively.
			$this->convArray($array[$key],$fromCS,$toCS,$useEntityForNoChar);
			continue;
		}

		$array[$key] = $this->conv($array[$key],$fromCS,$toCS,$useEntityForNoChar);
	}
}
668
669 /**
670 * Converts $str from $charset to UTF-8
671 *
672 * @param string String in local charset to convert to UTF-8
673 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
674 * @return string Output string, converted to UTF-8
675 */
function utf8_encode($str,$charset) {
	// UTF-8 input needs no conversion.
	if ($charset === 'utf-8') return $str;

	// Parse the conversion table if not done already. When the charset cannot be
	// initialized the function ends without a return value.
	if ($this->initCharset($charset)) {
		$strLen = strlen($str);
		$outStr='';

		for ($a=0;$a<$strLen;$a++) { // Traverse each byte in string.
			$chr=substr($str,$a,1);
			$ord=ord($chr);
			if (isset($this->twoByteSets[$charset])) { // If the charset has two bytes per char
				// substr() replaces the former "$str{...}" offset syntax, which is a parse
				// error as of PHP 8. Past the end of the string it yields ordinal 0.
				$ord2 = ord(substr($str,$a+1,1));
				$ord = $ord<<8 | $ord2; // assume big endian

				if (isset($this->parsedCharsets[$charset]['local'][$ord])) { // Local char number found in the parsed conversion table: use its UTF-8 sequence
					$outStr.=$this->parsedCharsets[$charset]['local'][$ord];
				} else $outStr.=chr($this->noCharByteVal); // No char exists: emit the replacement byte
				$a++; // Skip the second byte of the pair
			} elseif ($ord>127) { // Byte values above 127 need a table lookup
				if (isset($this->eucBasedSets[$charset])) { // EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
					if ($charset != 'shift_jis' || ($ord < 0xA0 || $ord > 0xDF)) { // Shift-JIS: chars between 160 and 223 are single byte
						$a++;
						$ord2=ord(substr($str,$a,1));
						$ord = $ord*256+$ord2;
					}
				}

				if (isset($this->parsedCharsets[$charset]['local'][$ord])) { // Local char number found in the parsed conversion table: use its UTF-8 sequence
					$outStr.= $this->parsedCharsets[$charset]['local'][$ord];
				} else $outStr.= chr($this->noCharByteVal); // No char exists: emit the replacement byte
			} else $outStr.= $chr; // ... otherwise it's just ASCII 0-127 and one byte. Transparent
		}
		return $outStr;
	}
}
713
714 /**
715 * Converts $str from UTF-8 to $charset
716 *
717 * @param string String in UTF-8 to convert to local charset
718 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
719 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
720 * @return string Output string, converted to local charset
721 */
function utf8_decode($str,$charset,$useEntityForNoChar=0) {
	// UTF-8 target: nothing to convert.
	if ($charset === 'utf-8') return $str;

	// Without a usable conversion table no value is returned.
	if (!$this->initCharset($charset)) return;

	$result = '';
	$total = strlen($str);

	for ($pos = 0; $pos < $total; $pos++) { // Traverse each byte of the UTF-8 string.
		$byte = substr($str, $pos, 1);
		$code = ord($byte);

		// ASCII 0-127 is a single byte and is copied unchanged.
		if ($code <= 127) {
			$result .= $byte;
			continue;
		}

		// A lead byte must have its 7th bit set; otherwise we are in the middle of a sequence.
		if (!($code & 64)) {
			$result .= chr($this->noCharByteVal);
			continue;
		}

		// Collect the sequence: every further leading 1-bit of the first byte announces one more byte.
		$seq = $byte;
		for ($n = 0; $n < 8; $n++) {
			$code = $code << 1;
			if (!($code & 128)) break;
			$pos++;
			$seq .= substr($str, $pos, 1);
		}

		if (isset($this->parsedCharsets[$charset]['utf8'][$seq])) {
			// Known sequence: emit the local char number, split in two bytes when above 255 (16bit word assumed).
			$local = $this->parsedCharsets[$charset]['utf8'][$seq];
			$result .= ($local > 255)
				? chr(($local >> 8) & 255) . chr($local & 255)
				: chr($local);
		} elseif ($useEntityForNoChar) {
			// Unknown sequence, entities requested: create a hexadecimal numeric entity.
			$result .= '&#' . $this->utf8CharToUnumber($seq, 1) . ';';
		} else {
			// Unknown sequence: emit the replacement byte.
			$result .= chr($this->noCharByteVal);
		}
	}

	return $result;
}
762
763 /**
764 * Converts all chars > 127 to numeric entities.
765 *
766 * @param string Input string
767 * @return string Output string
768 */
function utf8_to_entities($str) {
	$total = strlen($str);
	$result = '';

	for ($pos = 0; $pos < $total; $pos++) { // Traverse each byte of the UTF-8 string.
		$byte = substr($str, $pos, 1);
		$code = ord($byte);

		// ASCII 0-127 is a single byte and is copied unchanged.
		if ($code <= 127) {
			$result .= $byte;
			continue;
		}

		// A lead byte must have its 7th bit set; otherwise we are in the middle of a sequence.
		if (!($code & 64)) {
			$result .= chr($this->noCharByteVal);
			continue;
		}

		// Collect the sequence: every further leading 1-bit of the first byte announces one more byte.
		$seq = $byte;
		for ($n = 0; $n < 8; $n++) {
			$code = $code << 1;
			if (!($code & 128)) break;
			$pos++;
			$seq .= substr($str, $pos, 1);
		}

		// Emit the collected character as a hexadecimal numeric entity.
		$result .= '&#' . $this->utf8CharToUnumber($seq, 1) . ';';
	}

	return $result;
}
794
795 /**
796 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
797 *
798 * @param string Input string, UTF-8
799 * @param boolean If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
800 * @return string Output string
801 */
function entities_to_utf8($str,$alsoStdHtmlEnt=0) {
	// Split at every "&...;" token. With PREG_SPLIT_DELIM_CAPTURE the text between "&" and ";"
	// lands on the odd indexes and the surrounding plain text on the even ones.
	$parts = preg_split('/&([#[:alnum:]]*);/', $str, -1, PREG_SPLIT_DELIM_CAPTURE);

	foreach ($parts as $k => $v) {
		if (!($k % 2)) continue; // plain text, keep as is

		$entity = '&'.$v.';';
		$match = array();

		if (preg_match('/^#x([0-9a-f]+)$/i', $v, $match)) { // Hexadecimal entity (&#x1b; or &#X1B;)
			$parts[$k] = $this->UnumberToChar(hexdec($match[1]));
		} elseif (preg_match('/^#([0-9]+)$/', $v, $match)) { // Decimal entity (&#1234;)
			$parts[$k] = $this->UnumberToChar((int)$match[1]);
		} elseif ($alsoStdHtmlEnt && substr($v,0,1) != '#') { // Named entity
			// Decoding straight to UTF-8 avoids depending on the default charset of
			// get_html_translation_table(). Unknown names come back unchanged.
			$parts[$k] = html_entity_decode($entity, ENT_COMPAT, 'UTF-8');
		} else { // No conversion (also for malformed numeric entities such as &#abc;)
			$parts[$k] = $entity;
		}
	}

	return implode('',$parts);
}
827
828 /**
829 * Converts all chars in the input UTF-8 string into integer numbers returned in an array
830 *
831 * @param string Input string, UTF-8
832 * @param boolean If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
833 * @param boolean If set, then instead of integer numbers the real UTF-8 char is returned.
834 * @return array Output array with the char numbers
835 */
function utf8_to_numberarray($str,$convEntities=0,$retChar=0) {
	// Optionally turn entities (numeric and named) into real UTF-8 characters first.
	if ($convEntities) {
		$str = $this->entities_to_utf8($str,1);
	}

	$result = array();
	$total = strlen($str);

	for ($pos = 0; $pos < $total; $pos++) { // Traverse each byte of the UTF-8 string.
		$byte = substr($str, $pos, 1);
		$code = ord($byte);

		// ASCII 0-127 is a single byte: store the char itself or its number.
		if ($code <= 127) {
			$result[] = $retChar ? chr($code) : $code;
			continue;
		}

		// A lead byte must have its 7th bit set; otherwise we are in the middle of a sequence.
		if (!($code & 64)) {
			$result[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
			continue;
		}

		// Collect the sequence: every further leading 1-bit of the first byte announces one more byte.
		$seq = $byte;
		for ($n = 0; $n < 8; $n++) {
			$code = $code << 1;
			if (!($code & 128)) break;
			$pos++;
			$seq .= substr($str, $pos, 1);
		}

		// Store the raw multibyte char or its Unicode number.
		$result[] = $retChar ? $seq : $this->utf8CharToUnumber($seq);
	}

	return $result;
}
866
867 /**
868 * Converts a UNICODE number to a UTF-8 multibyte character
869 * Algorithm based on script found at From: http://czyborra.com/utf/
870 * Unit-tested by Kasper
871 *
872 * The binary representation of the character's integer value is thus simply spread across the bytes and the number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
873 *
874 * bytes | bits | representation
875 * 1 | 7 | 0vvvvvvv
876 * 2 | 11 | 110vvvvv 10vvvvvv
877 * 3 | 16 | 1110vvvv 10vvvvvv 10vvvvvv
878 * 4 | 21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
879 * 5 | 26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
880 * 6 | 31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
881 *
882 * @param integer UNICODE integer
883 * @return string UTF-8 multibyte character string
884 * @see utf8CharToUnumber()
885 */
function UnumberToChar($cbyte) {
	// 1 byte: 0vvvvvvv
	if ($cbyte < 0x80) {
		return chr($cbyte);
	}

	// 2 bytes: 110vvvvv 10vvvvvv
	if ($cbyte < 0x800) {
		return chr(0xC0 | ($cbyte >> 6))
			. chr(0x80 | ($cbyte & 0x3F));
	}

	// 3 bytes: 1110vvvv + two continuation bytes
	if ($cbyte < 0x10000) {
		return chr(0xE0 | ($cbyte >> 12))
			. chr(0x80 | (($cbyte >> 6) & 0x3F))
			. chr(0x80 | ($cbyte & 0x3F));
	}

	// 4 bytes: 11110vvv + three continuation bytes
	if ($cbyte < 0x200000) {
		return chr(0xF0 | ($cbyte >> 18))
			. chr(0x80 | (($cbyte >> 12) & 0x3F))
			. chr(0x80 | (($cbyte >> 6) & 0x3F))
			. chr(0x80 | ($cbyte & 0x3F));
	}

	// 5 bytes: 111110vv + four continuation bytes
	if ($cbyte < 0x4000000) {
		return chr(0xF8 | ($cbyte >> 24))
			. chr(0x80 | (($cbyte >> 18) & 0x3F))
			. chr(0x80 | (($cbyte >> 12) & 0x3F))
			. chr(0x80 | (($cbyte >> 6) & 0x3F))
			. chr(0x80 | ($cbyte & 0x3F));
	}

	// 6 bytes: 1111110v + five continuation bytes
	if ($cbyte < 0x80000000) {
		return chr(0xFC | ($cbyte >> 30))
			. chr(0x80 | (($cbyte >> 24) & 0x3F))
			. chr(0x80 | (($cbyte >> 18) & 0x3F))
			. chr(0x80 | (($cbyte >> 12) & 0x3F))
			. chr(0x80 | (($cbyte >> 6) & 0x3F))
			. chr(0x80 | ($cbyte & 0x3F));
	}

	// Cannot express a 32-bit character in UTF-8: return the replacement byte.
	return chr($this->noCharByteVal);
}
921
/**
 * Decodes a single UTF-8 multibyte character into its UNICODE number.
 * Unit-tested by Kasper
 *
 * @param string $str UTF-8 multibyte character string (only the first character is evaluated)
 * @param boolean $hex If set, the number is returned as a hex string prefixed with "x"
 * @return integer UNICODE integer (or string "x<hex>" if $hex is set)
 * @see UnumberToChar()
 */
function utf8CharToUnumber($str,$hex=0) {
	$leadByte = ord(substr($str, 0, 1));

	if (($leadByte & 0xC0) != 0xC0) {
		// Not the start of a multibyte sequence: the byte value is the result
		$number = $leadByte;
	} else {
		// Collect the payload bits as a string of binary digits
		$payloadBits = '';
		$trailBytes = 0;
		$probe = $leadByte;
		while ($trailBytes < 8) {
			// Every additional high bit set in the lead byte announces one more continuation byte
			$probe = $probe << 1;
			if (!($probe & 0x80)) {
				break;
			}
			// The lower six bits of the continuation byte are payload
			$payloadBits .= sprintf('%06b', ord(substr($str, $trailBytes + 1, 1)) & 0x3F);
			$trailBytes++;
		}

		// The lead byte contributes its lowest (6 - number of continuation bytes) bits
		$payloadBits = substr('00000000' . decbin($leadByte), -(6 - $trailBytes)) . $payloadBits;
		$number = bindec($payloadBits);
	}

	if ($hex) {
		return 'x' . dechex($number);
	}
	return $number;
}
949
950
951
952
953
954
955
956
957
958 /********************************************
959 *
960 * Init functions
961 *
962 ********************************************/
963
/**
 * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
 * This function is automatically called by the conversion functions
 *
 * The parsed table is stored in $this->parsedCharsets[$charset] with the keys
 * 'local' (byte value => UTF-8 character) and 'utf8' (UTF-8 character => byte value);
 * only byte values above 127 are stored. The result is cached below typo3temp/cs/.
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param string $charset The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return integer Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed.
 * @access private
 */
function initCharset($charset) {
	// Only process if the charset is not yet loaded:
	if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
		return 1;
	}

	// Conversion table filename:
	$charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';

	// Without a conversion table there is nothing to initialize:
	if (!$charset || !t3lib_div::validPathStr($charsetConvTableFile) || !@is_file($charsetConvTableFile)) {
		return false;
	}

	// Cache file for charsets:
	// Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$cachedTable = @unserialize(t3lib_div::getUrl($cacheFile));
		if (is_array($cachedTable) && isset($cachedTable['local']) && isset($cachedTable['utf8'])) {
			$this->parsedCharsets[$charset] = $cachedTable;
			return 2;
		}
		// The cache file is unreadable or corrupt: fall through and rebuild it from the source table
	}

	// Parse conversion table into lines:
	$lines = t3lib_div::trimExplode(chr(10), t3lib_div::getUrl($charsetConvTableFile), 1);

	// The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
	$whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';

	$table = array('local' => array(), 'utf8' => array());
	$detectedType = '';
	foreach ($lines as $value) {
		// Comment line or blanks are ignored.
		if (!trim($value) || substr($value, 0, 1) == '#') {
			continue;
		}

		// Detect type if not done yet: (Done on first real line)
		if (!$detectedType) {
			$detectedType = preg_match($whitespacedPattern, $value) ? 'whitespaced' : 'ms-token';
		}

		if ($detectedType == 'ms-token') {
			$parts = preg_split('/[=:]/', $value, 3);
			if (count($parts) < 2) {
				continue; // line does not follow the "byte = U+xxxx" syntax
			}
			$hexbyte = $parts[0];
			$utf8 = $parts[1];
		} else {
			$regA = array();
			if (!preg_match($whitespacedPattern, $value, $regA)) {
				continue; // line does not follow the "0xNN 0xNNNN" syntax
			}
			$hexbyte = $regA[1];
			$utf8 = 'U+'.$regA[2];
		}

		$decval = hexdec(trim($hexbyte));
		if ($decval > 127) {
			// strip the leading "U+" before converting the code point
			$utf8decval = hexdec(substr(trim($utf8), 2));
			$utf8Char = $this->UnumberToChar($utf8decval);
			$table['local'][$decval] = $utf8Char;
			$table['utf8'][$utf8Char] = $decval;
		}
	}
	$this->parsedCharsets[$charset] = $table;

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($table));
	}

	return 2;
}
1026
/**
 * This function initializes all UTF-8 character data tables.
 *
 * The case folding table ($this->caseFolding['utf-8']) and the ASCII transliteration
 * table ($this->toASCII['utf-8']) are always built together from
 * PATH_t3lib.'unidata/UnicodeData.txt' (required) plus 'unidata/SpecialCasing.txt' and
 * 'unidata/Translit.txt' (both optional). Only code points up to U+FFFF are processed.
 *
 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
 *
 * @param string $mode Mode ("case", "ascii", ...); selects which loaded/cached table may short-cut the parsing
 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initUnicodeData($mode=null) {
	// cache files
	$cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
	$cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');

	// Only process if the tables are not yet loaded
	switch($mode) {
		case 'case':
			if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8'])) return 1;

			// Use cached version if possible (a corrupt cache file falls through to re-parsing)
			if ($cacheFileCase && @is_file($cacheFileCase)) {
				$cachedTable = @unserialize(t3lib_div::getUrl($cacheFileCase));
				if (is_array($cachedTable)) {
					$this->caseFolding['utf-8'] = $cachedTable;
					return 2;
				}
			}
			break;

		case 'ascii':
			if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8'])) return 1;

			// Use cached version if possible (a corrupt cache file falls through to re-parsing)
			if ($cacheFileASCII && @is_file($cacheFileASCII)) {
				$cachedTable = @unserialize(t3lib_div::getUrl($cacheFileASCII));
				if (is_array($cachedTable)) {
					$this->toASCII['utf-8'] = $cachedTable;
					return 2;
				}
			}
			break;
	}

	// process main Unicode data file
	$unicodeDataFile = PATH_t3lib.'unidata/UnicodeData.txt';
	if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) return false;

	$fh = fopen($unicodeDataFile,'rb');
	if (!$fh) return false;

	// key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
	// note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
	$this->caseFolding['utf-8'] = array();
	$utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand
	$utf8CaseFolding['toUpper'] = array();
	$utf8CaseFolding['toLower'] = array();
	$utf8CaseFolding['toTitle'] = array();

	$decomposition = array(); // array of temp. decompositions
	$mark = array(); // array of chars that are marks (eg. composing accents)
	$number = array(); // array of chars that are numbers (eg. digits)
	$omit = array(); // array of chars to be omitted (eg. Russian hard sign)

	// fgets() returns FALSE at the end of the file (or on a read error), which ends the loop
	while (($line = fgets($fh,4096)) !== false) {
		$line = rtrim($line);
		if ($line === '') continue; // a blank line carries no record

		// has a lot of info; pad so that short records do not produce undefined offsets
		$fields = array_pad(explode(';', $line), 15, '');
		$char = $fields[0];
		$name = $fields[1];
		$cat = $fields[2];
		$decomp = $fields[5];
		$num = $fields[8];
		$upper = $fields[12];
		$lower = $fields[13];
		$title = $fields[14];

		$ord = hexdec($char);
		if ($ord > 0xFFFF) break; // only process the BMP

		$utf8_char = $this->UnumberToChar($ord);

		if ($upper) $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
		if ($lower) $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
		// store "title" only when different from "upper" (only a few)
		if ($title && $title != $upper) $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));

		// the first letter of the general category tells the character class
		switch (substr($cat, 0, 1)) {
			case 'M': // mark (accent, umlaut, ...)
				$mark["U+$char"] = 1;
				break;

			case 'N': // numeric value
				if ($ord > 0x80 && $num != '') $number["U+$char"] = $num;
				break;
		}

		// accented Latin letters without "official" decomposition
		$match = array();
		if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/',$name,$match) && !$decomp) {
			$c = ord($match[2]);
			if ($match[1] == 'SMALL') $c += 32;

			$decomposition["U+$char"] = array(dechex($c));
			continue;
		}

		$match = array();
		if (preg_match('/(<.*>)? *(.+)/',$decomp,$match)) {
			switch($match[1]) {
				case '<circle>': // add parenthesis as circle replacement, eg (1)
					$match[2] = '0028 '.$match[2].' 0029';
					break;

				case '<square>': // add square brackets as square replacement, eg [1]
					$match[2] = '005B '.$match[2].' 005D';
					break;

				case '<compat>': // ignore multi char decompositions that start with a space
					if (preg_match('/^0020 /',$match[2])) continue 2;
					break;

				// ignore Arabic and vertical layout presentation decomposition
				case '<initial>':
				case '<medial>':
				case '<final>':
				case '<isolated>':
				case '<vertical>':
					continue 2;
			}
			$decomposition["U+$char"] = explode(' ', $match[2]);
		}
	}
	fclose($fh);

	// process additional Unicode data for casing (allow folded characters to expand into a sequence)
	$specialCasingFile = PATH_t3lib.'unidata/SpecialCasing.txt';
	if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
		$fh = fopen($specialCasingFile,'rb');
		if ($fh) {
			while (($line = fgets($fh,4096)) !== false) {
				if (substr($line, 0, 1) != '#' && trim($line) != '') {

					list($char,$lower,$title,$upper,$cond) = array_pad(t3lib_div::trimExplode(';', $line), 5, '');
					// only unconditional mappings are used (condition field empty or already the trailing comment)
					if ($cond == '' || substr($cond, 0, 1) == '#') {
						$utf8_char = $this->UnumberToChar(hexdec($char));

						// collect the mappings that differ from the character itself
						$mappings = array();
						if ($char != $lower) $mappings['toLower'] = $lower;
						if ($char != $title && $title != $upper) $mappings['toTitle'] = $title;
						if ($char != $upper) $mappings['toUpper'] = $upper;

						// each mapping is a space separated list of hex code points
						foreach ($mappings as $direction => $hexSequence) {
							$utf8Sequence = '';
							foreach (explode(' ', $hexSequence) as $hexCode) {
								$utf8Sequence .= $this->UnumberToChar(hexdec($hexCode));
							}
							$utf8CaseFolding[$direction][$utf8_char] = $utf8Sequence;
						}
					}
				}
			}
			fclose($fh);
		}
	}

	// process custom decompositions
	$customTranslitFile = PATH_t3lib.'unidata/Translit.txt';
	if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
		$fh = fopen($customTranslitFile,'rb');
		if ($fh) {
			while (($line = fgets($fh,4096)) !== false) {
				if (substr($line, 0, 1) != '#' && trim($line) != '') {
					list($char,$translit) = array_pad(t3lib_div::trimExplode(';', $line), 2, '');
					if (!$translit) $omit["U+$char"] = 1;
					$decomposition["U+$char"] = explode(' ', $translit);
				}
			}
			fclose($fh);
		}
	}

	// decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
	foreach($decomposition as $from => $to) {
		$code_decomp = array();

		// note: the loop deliberately stops at an empty entry (e.g. an empty transliteration)
		while ($code_value = array_shift($to)) {
			if (isset($decomposition["U+$code_value"])) { // do recursive decomposition
				foreach(array_reverse($decomposition["U+$code_value"]) as $cv) {
					array_unshift($to, $cv);
				}
			} elseif (!isset($mark["U+$code_value"])) { // remove mark
				array_push($code_decomp, $code_value);
			}
		}
		if (count($code_decomp) || isset($omit[$from])) {
			$decomposition[$from] = $code_decomp;
		} else {
			unset($decomposition[$from]);
		}
	}

	// create ascii only mapping
	$this->toASCII['utf-8'] = array();
	$ascii =& $this->toASCII['utf-8'];

	foreach($decomposition as $from => $to) {
		$code_decomp = array();
		while ($code_value = array_shift($to)) {
			$ord = hexdec($code_value);
			if ($ord > 127)
				continue 2; // skip decompositions containing non-ASCII chars
			else
				array_push($code_decomp,chr($ord));
		}
		// keys look like "U+00E4": strip the prefix so that hexdec() only sees hex digits
		$ascii[$this->UnumberToChar(hexdec(substr($from, 2)))] = join('',$code_decomp);
	}

	// add numeric decompositions
	foreach($number as $from => $to) {
		$utf8_char = $this->UnumberToChar(hexdec(substr($from, 2)));
		if (!isset($ascii[$utf8_char])) {
			$ascii[$utf8_char] = $to;
		}
	}

	if ($cacheFileCase) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileCase,serialize($utf8CaseFolding));
	}

	if ($cacheFileASCII) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileASCII,serialize($ascii));
	}

	return 3;
}
1252
/**
 * This function initializes the folding table for a charset other than UTF-8.
 * This function is automatically called by the case folding functions.
 *
 * The table is derived from the UTF-8 case folding table: every character of the
 * charset's conversion table is looked up there and the result is converted back to
 * the charset. The plain ASCII letters a-z/A-Z are added afterwards.
 *
 * @param string $charset Charset for which to initialize case folding.
 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initCaseFolding($charset) {
	// Only process if the case table is not yet loaded:
	if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset])) return 1;

	// Use cached version if possible (a corrupt cache file is ignored and rebuilt below)
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$cachedTable = @unserialize(t3lib_div::getUrl($cacheFile));
		if (is_array($cachedTable)) {
			$this->caseFolding[$charset] = $cachedTable;
			return 2;
		}
	}

	// init UTF-8 conversion for this charset
	if (!$this->initCharset($charset)) {
		return false;
	}

	// UTF-8 case folding is used as the base conversion table
	if (!$this->initUnicodeData('case')) {
		return false;
	}

	$nochar = chr($this->noCharByteVal);
	$directions = array('toUpper', 'toLower', 'toTitle');
	$folding = array('toUpper' => array(), 'toLower' => array(), 'toTitle' => array());

	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
		// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);

		foreach ($directions as $direction) {
			// most characters have no mapping for a given direction
			if (!isset($this->caseFolding['utf-8'][$direction][$utf8])) {
				continue;
			}
			$cc = $this->utf8_decode($this->caseFolding['utf-8'][$direction][$utf8], $charset);
			// keep the mapping only if the result can be expressed in the charset
			if ($cc != '' && $cc != $nochar) {
				$folding[$direction][$c] = $cc;
			}
		}
	}

	// add the ASCII case table
	for ($i=ord('a'); $i<=ord('z'); $i++) {
		$folding['toUpper'][chr($i)] = chr($i-32);
	}
	for ($i=ord('A'); $i<=ord('Z'); $i++) {
		$folding['toLower'][chr($i)] = chr($i+32);
	}

	$this->caseFolding[$charset] = $folding;

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($folding));
	}

	return 3;
}
1314
/**
 * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
 * This function is automatically called by the ASCII transliteration functions.
 *
 * The table is derived from the UTF-8 to-ASCII table: every character of the charset's
 * conversion table that has an entry there is stored under its representation in the charset.
 *
 * @param string $charset Charset for which to initialize conversion.
 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initToASCII($charset) {
	// Only process if the table is not yet loaded:
	if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) return 1;

	// Use cached version if possible (a corrupt cache file is ignored and rebuilt below)
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$cachedTable = @unserialize(t3lib_div::getUrl($cacheFile));
		if (is_array($cachedTable)) {
			$this->toASCII[$charset] = $cachedTable;
			return 2;
		}
	}

	// init UTF-8 conversion for this charset
	if (!$this->initCharset($charset)) {
		return false;
	}

	// UTF-8/ASCII transliteration is used as the base conversion table
	if (!$this->initUnicodeData('ascii')) {
		return false;
	}

	// Start with an empty table so that a charset without any mapping still counts as loaded
	$table = array();
	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
		if (isset($this->toASCII['utf-8'][$utf8])) {
			// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
			$c = $this->utf8_decode($utf8, $charset);
			$table[$c] = $this->toASCII['utf-8'][$utf8];
		}
	}
	$this->toASCII[$charset] = $table;

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($table));
	}

	return 3;
}
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376 /********************************************
1377 *
1378 * String operation functions
1379 *
1380 ********************************************/
1381
/**
 * Returns a part of a string.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * Depending on $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'] the work is done by mbstring,
 * iconv or the internal implementations of this class.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param integer $start Start position (character position)
 * @param integer $len Length (in characters); if omitted the rest of the string is returned
 * @return string The substring
 * @see substr(), mb_substr()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function substr($charset,$string,$start,$len=null) {
	if ($len===0) return '';

	$utils = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] : '';

	if ($utils == 'mbstring') {
		if ($len==null) {
			// The length cannot be omitted when a charset is passed: the total character count
			// selects "up to the end" without touching the global internal encoding
			return mb_substr($string,$start,mb_strlen($string,$charset),$charset);
		}
		return mb_substr($string,$start,$len,$charset);
	}

	if ($utils == 'iconv') {
		if ($len==null) {
			// same approach as for mbstring; avoids iconv_set_encoding()
			return iconv_substr($string,$start,iconv_strlen($string,$charset),$charset);
		}
		return iconv_substr($string,$start,$len,$charset);
	}

	if ($charset == 'utf-8') {
		return $this->utf8_substr($string,$start,$len);
	}
	if (!empty($this->eucBasedSets[$charset])) {
		return $this->euc_substr($string,$start,$charset,$len);
	}
	if (!empty($this->twoByteSets[$charset])) {
		// an omitted length must not be multiplied (null*2 is 0 and would yield an empty string)
		return $len === null ? substr($string,$start*2) : substr($string,$start*2,$len*2);
	}
	if (!empty($this->fourByteSets[$charset])) {
		return $len === null ? substr($string,$start*4) : substr($string,$start*4,$len*4);
	}

	// treat everything else as single-byte encoding
	return $len === null ? substr($string,$start) : substr($string,$start,$len);
}
1436
/**
 * Counts the number of characters.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @return integer The number of characters
 * @see strlen()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strlen($charset,$string) {
	$utils = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] : '';

	if ($utils == 'mbstring') {
		return mb_strlen($string,$charset);
	}
	if ($utils == 'iconv') {
		return iconv_strlen($string,$charset);
	}
	if ($charset == 'utf-8') {
		return $this->utf8_strlen($string);
	}
	if (!empty($this->eucBasedSets[$charset])) {
		return $this->euc_strlen($string,$charset);
	}
	if (!empty($this->twoByteSets[$charset])) {
		// only complete characters are counted, so the result is always an integer
		return (int)(strlen($string)/2);
	}
	if (!empty($this->fourByteSets[$charset])) {
		return (int)(strlen($string)/4);
	}

	// treat everything else as single-byte encoding
	return strlen($string);
}
1464
/**
 * Method to crop strings using the mb_substr function.
 *
 * A positive $len keeps the first $len characters and appends $crop, a negative $len
 * keeps the last abs($len) characters and prepends $crop. Strings that are not longer
 * than abs($len) characters are returned unchanged (no crop signifier is added).
 *
 * @param string $charset The character set
 * @param string $string String to be cropped
 * @param integer $len Crop length (in characters)
 * @param string $crop Crop signifier
 * @return string The shortened string
 * @see mb_strlen(), mb_substr()
 */
protected function cropMbstring($charset, $string, $len, $crop = '') {
	// The length has to be measured in the given charset and compared against the
	// absolute crop length, otherwise uncropped strings would get the signifier
	if (intval($len) == 0 || mb_strlen($string, $charset) <= abs($len)) {
		return $string;
	}

	if ($len > 0) {
		$string = mb_substr($string, 0, $len, $charset) . $crop;
	} else {
		$string = $crop . mb_substr($string, $len, mb_strlen($string, $charset), $charset);
	}

	return $string;
}
1488
/**
 * Truncates a string and pre-/appends a string.
 * Unit tested by Kasper
 *
 * A positive $len keeps the beginning of the string and appends $crop, a negative
 * $len keeps the end of the string and prepends $crop. If nothing is cut off the
 * string is returned unchanged.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param integer $len Length (in characters)
 * @param string $crop Crop signifier
 * @return string The shortened string
 * @see substr(), mb_strimwidth()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function crop($charset,$string,$len,$crop='') {
	if (isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) && $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return $this->cropMbstring($charset, $string, $len, $crop);
	}

	if (intval($len) == 0) return $string;

	// Find the byte position at which the string has to be cut
	if ($charset == 'utf-8') {
		$i = $this->utf8_char2byte_pos($string,$len);
	} elseif (!empty($this->eucBasedSets[$charset])) {
		$i = $this->euc_char2byte_pos($string,$len,$charset);
	} elseif ($len > 0) {
		$i = $len;
	} else {
		$i = strlen($string)+$len;
		if ($i<=0) $i = false;
	}

	if ($i === false) { // $len outside actual string length
		return $string;
	}

	$byteLength = strlen($string);
	if ($len > 0) {
		// only crop if there is at least one byte behind the cut position
		if ($i < $byteLength) {
			return substr($string,0,$i).$crop;
		}
	} elseif ($i > 0 && $i <= $byteLength) {
		// only crop if there is at least one byte in front of the cut position
		return $crop.substr($string,$i);
	}

	return $string;
}
1547
/**
 * Cuts a string short at a given byte length.
 *
 * Multibyte characters are not split: for the charsets handled here the cut
 * position is moved so that only complete characters remain.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param integer $len The byte length
 * @return string The shortened string
 * @see mb_strcut()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strtrunc($charset,$string,$len) {
	if ($len <= 0) return '';

	if (isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) && $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strcut($string,0,$len,$charset);
	}
	if ($charset == 'utf-8') {
		return $this->utf8_strtrunc($string,$len);
	}
	if (!empty($this->eucBasedSets[$charset])) {
		// The byte length has to be handed on, otherwise nothing can be truncated.
		// NOTE(review): assumes the signature euc_strtrunc($str, $len, $charset) - confirm against its definition
		return $this->euc_strtrunc($string,$len,$charset);
	}

	if (!empty($this->twoByteSets[$charset])) {
		if ($len % 2) $len--; // don't cut at odd positions
	} elseif (!empty($this->fourByteSets[$charset])) {
		$len -= $len % 4; // realign to position dividable by four
	}

	// treat everything else as single-byte encoding
	return substr($string,0,$len);
}
1576
/**
 * Translates all characters of a string into their respective case values.
 * Unlike strtolower() and strtoupper() this method is locale independent.
 * Note that the string length may change!
 * eg. lower case German ß (sharp S) becomes upper case "SS"
 * Unit-tested by Kasper
 * Real case folding is language dependent, this method ignores this fact.
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert case for
 * @param string $case Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
 * @return string The converted string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper()
 */
function conv_case($charset,$string,$case) {
	// mbstring handles every charset on its own
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return ($case == 'toLower')
			? mb_strtolower($string,$charset)
			: mb_strtoupper($string,$charset);
	}

	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string,'case',$case);
	}

	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string,$charset,'case',$case);
	}

	// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string,$charset,'case',$case);
}
1610
/**
 * Converts special chars (like æøå, umlauts etc.) to ASCII equivalents (usually two characters, like æ => ae etc.)
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert
 * @return string The converted string
 */
function specCharsToASCII($charset,$string) {
	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string,'ascii');
	}

	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string,$charset,'ascii');
	}

	// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string,$charset,'ascii');
}
1630
1631
/**
 * Converts the language codes that we get from the client (usually HTTP_ACCEPT_LANGUAGE)
 * into a TYPO3-readable language code.
 *
 * The codes are tried in the order of their quality value; entries with the same
 * quality keep the order in which the client sent them. A code is first compared as
 * given and then once more without its country part.
 *
 * @param string $languageCodesList List of language codes, something like 'de,en-us;q=0.9,de-de;q=0.7,es-cl;q=0.6,en;q=0.4,es;q=0.3,zh;q=0.1'
 *                                  see http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4
 * @return string A preferred language that TYPO3 supports, or "default" if none found
 * @author Benjamin Mack (benni.typo3.org)
 */
public function getPreferredClientLanguage($languageCodesList) {
	$allLanguageCodes = array();
	$selectedLanguage = 'default';

	// get all languages where TYPO3 code is the same as the ISO code
	foreach (array_keys($this->charSetArray) as $typo3Lang) {
		$allLanguageCodes[$typo3Lang] = $typo3Lang;
	}

	// get all languages where TYPO3 code differs from ISO code
	// or needs the country part
	// the iso codes will here overwrite the default typo3 language in the key
	foreach ($this->isoArray as $typo3Lang => $isoLang) {
		$allLanguageCodes[$typo3Lang] = str_replace('_', '-', $isoLang);
	}

	// move the iso codes to the keys (because we're comparing the keys with "isset" later on)
	$allLanguageCodes = array_flip($allLanguageCodes);

	// collect the quality value of every language code sent by the client
	$qualityByLanguage = array();
	foreach (t3lib_div::trimExplode(',', $languageCodesList) as $preferredLanguage) {
		$parameters = explode(';', $preferredLanguage);
		$languageCode = trim(array_shift($parameters));
		if ($languageCode === '') {
			continue;
		}

		$quality = 1.0;
		foreach ($parameters as $parameter) {
			// tolerate whitespace as in "en; q=0.8"
			$parameter = strtolower(preg_replace('/\s+/', '', $parameter));
			if (substr($parameter, 0, 2) == 'q=') {
				$quality = (float)substr($parameter, 2);
			}
		}
		$qualityByLanguage[$languageCode] = $quality;
	}

	// order by quality (highest first); the original position breaks ties so that the
	// result does not depend on the sort algorithm of the PHP version
	$languages = array_keys($qualityByLanguage);
	if (count($languages) > 1) {
		$qualities = array_values($qualityByLanguage);
		$positions = array_keys($languages);
		array_multisort($qualities, SORT_DESC, SORT_NUMERIC, $positions, SORT_ASC, SORT_NUMERIC, $languages);
	}

	// loop through the languages, with the highest priority first
	foreach ($languages as $preferredLanguage) {
		$preferredLanguage = (string)$preferredLanguage;
		if (isset($allLanguageCodes[$preferredLanguage])) {
			$selectedLanguage = $allLanguageCodes[$preferredLanguage];
			break;
		}

		// strip the country code from the end
		$codeParts = explode('-', $preferredLanguage);
		if (isset($allLanguageCodes[$codeParts[0]])) {
			$selectedLanguage = $allLanguageCodes[$codeParts[0]];
			break;
		}
	}

	if (!$selectedLanguage || $selectedLanguage == 'en') {
		$selectedLanguage = 'default';
	}
	return $selectedLanguage;
}
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702 /********************************************
1703 *
1704 * Internal string operation functions
1705 *
1706 ********************************************/
1707
/**
 * Maps all characters of a string in a single byte charset.
 *
 * Every byte of the string is replaced by its entry in the selected mapping table;
 * bytes without an entry are copied unchanged. If the table cannot be initialized or
 * the mode is unknown the string is returned as is.
 *
 * @param string $str the string
 * @param string $charset the charset
 * @param string $mode mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
 * @return string the converted string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function sb_char_mapping($str,$charset,$mode,$opt='') {
	switch($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) return $str; // do nothing
			$map = isset($this->caseFolding[$charset][$opt]) ? $this->caseFolding[$charset][$opt] : array();
			break;

		case 'ascii':
			if (!$this->initToASCII($charset)) return $str; // do nothing
			$map = isset($this->toASCII[$charset]) ? $this->toASCII[$charset] : array();
			break;

		default:
			return $str;
	}

	// a missing or broken table maps nothing
	if (!is_array($map)) {
		$map = array();
	}

	$str = (string)$str;
	$out = '';
	$byteLength = strlen($str);
	for ($i=0; $i<$byteLength; $i++) {
		$c = $str[$i];
		$out .= isset($map[$c]) ? $map[$c] : $c;
	}

	return $out;
}
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756 /********************************************
1757 *
1758 * Internal UTF-8 string operation functions
1759 *
1760 ********************************************/
1761
/**
 * Returns a part of a UTF-8 string.
 * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
 *
 * @param string $str UTF-8 string
 * @param integer $start Start position (character position)
 * @param integer $len Length (in characters); if omitted the rest of the string is returned
 * @return string The substring; FALSE if a positive $start lies outside of the string
 * @see substr()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_substr($str,$start,$len=null) {
	// a length of zero (integer or string) always yields an empty string
	if ((string)$len === '0') return '';

	$byte_start = $this->utf8_char2byte_pos($str,$start);
	if ($byte_start === false) {
		if ($start > 0) {
			return false; // $start outside string length
		}
		// a negative $start reaching before the beginning selects the whole string
		$byte_start = 0;
	}

	$str = substr($str,$byte_start);

	if ($len != null) {
		$byte_end = $this->utf8_char2byte_pos($str,$len);
		if ($byte_end === false) { // $len outside actual string length
			return $len<0 ? '' : $str; // When length is less than zero and exceeds, then we return blank string.
		}
		return substr($str,0,$byte_end);
	}

	return $str;
}
1796
/**
 * Counts the number of characters of a string in UTF-8.
 * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
 *
 * Every byte that is not a continuation byte (10xxxxxx) starts a character and is
 * counted; the string is not validated.
 *
 * @param string $str UTF-8 multibyte character string
 * @return integer The number of characters
 * @see strlen()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strlen($str) {
	$str = (string)$str;
	$byteLength = strlen($str);

	$n = 0;
	for ($i=0; $i<$byteLength; $i++) {
		// single-byte (0xxxxxxx) and multi-byte starting bytes (11xxxxxx) count, continuation bytes don't
		if ((ord($str[$i]) & 0xC0) != 0x80) {
			$n++;
		}
	}
	return $n;
}
1817
1818 /**
1819 * Truncates a string in UTF-8 short at a given byte length.
1820 *
1821 * @param string UTF-8 multibyte character string
1822 * @param integer the byte length
1823 * @return string the shortened string
1824 * @see mb_strcut()
1825 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1826 */
function utf8_strtrunc($str,$len) {
		// Cuts $str at $len bytes without splitting a multi-byte character: if the
		// character covering byte $len-1 does not fit completely, it is dropped.
	$str = (string)$str;
	$len = (int)$len;
	$i = $len-1;

		// The last wanted byte lies outside the string: nothing to inspect
	if ($i < 0 || $i >= strlen($str)) {
		return substr($str,0,$len);
	}

	if (ord($str[$i]) & 0x80) {	// part of a multibyte sequence
			// walk back to the byte that starts the sequence
		for (; $i>0 && !(ord($str[$i]) & 0x40); $i--) ;

			// sanity check: we ran into the beginning of the string without finding a lead byte
		if ($i == 0 && (ord($str[0]) & 0xC0) != 0xC0) return '';

			// The number of leading 1-bits of the lead byte is the length of the sequence.
			// A lead byte at offset 0 is evaluated like any other one, so a first
			// character that fits into $len is kept.
		for ($bc=0, $mbs=ord($str[$i]); $mbs & 0x80; $mbs = $mbs << 1) $bc++;

		if ($bc+$i > $len) return substr($str,0,$i);	// character would be cut: drop it
			// fallthru: multibyte char fits into length
	}
	return substr($str,0,$len);
}
1838
1839 /**
1840 * Find position of first occurrence of a string, both arguments are in UTF-8.
1841 *
1842 * @param string UTF-8 string to search in
1843 * @param string UTF-8 string to search for
	 * @param	integer		Position to start the search
1845 * @return integer The character position
1846 * @see strpos()
1847 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1848 */
function utf8_strpos($haystack,$needle,$offset=0) {
		// Hand the search over to a multibyte-aware extension if one is configured
	switch ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) {
		case 'mbstring':
			return mb_strpos($haystack,$needle,$offset,'utf-8');
		case 'iconv':
			return iconv_strpos($haystack,$needle,$offset,'utf-8');
	}

		// Fallback: search on byte level and translate the positions
	$startByte = $this->utf8_char2byte_pos($haystack,$offset);
	if ($startByte !== false) {	// false: offset beyond string length
		$foundByte = strpos($haystack,$needle,$startByte);
		if ($foundByte !== false) {	// false: needle not found
			return $this->utf8_byte2char_pos($haystack,$foundByte);
		}
	}

	return false;
}
1864
1865 /**
1866 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
1867 *
1868 * @param string UTF-8 string to search in
1869 * @param string UTF-8 character to search for (single character)
1870 * @return integer The character position
1871 * @see strrpos()
1872 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1873 */
function utf8_strrpos($haystack,$needle) {
		// Use a multibyte-aware extension if one is configured
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
			// The third argument of mb_strrpos() is the offset; the encoding comes fourth.
			// Passing the encoding third raises a TypeError on PHP 8.
		return mb_strrpos($haystack,$needle,0,'utf-8');
	} elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
			// iconv_strrpos() has no offset argument: the encoding really is third here
		return iconv_strrpos($haystack,$needle,'utf-8');
	}

		// Fallback: search on byte level and translate the position
	$byte_pos = strrpos($haystack,$needle);
	if ($byte_pos === false) return false;	// needle not found

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
1886
1887 /**
1888 * Translates a character position into an 'absolute' byte position.
1889 * Unit tested by Kasper.
1890 *
1891 * @param string UTF-8 string
1892 * @param integer Character position (negative values start from the end)
1893 * @return integer Byte position
1894 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1895 */
function utf8_char2byte_pos($str,$pos) {
		// Walks over the string byte by byte (forwards for $pos >= 0, backwards for
		// $pos < 0) and counts character start bytes until abs($pos) characters have
		// been seen. Returns false if the walk leaves the string first.
		//
		// The walk is bounded explicitly: it must not rely on out-of-range offsets
		// being empty, because negative string offsets address bytes from the end
		// since PHP 7.1, and {} offsets were removed in PHP 8.
	$str = (string)$str;
	$length = strlen($str);
	$n = 0;	// number of characters found
	$p = abs($pos);	// number of characters wanted

	if ($pos >= 0) {
		$i = 0;
		$d = 1;
	} else {
		$i = $length-1;
		$d = -1;
	}

	for ( ; $i >= 0 && $i < $length && $n < $p; $i += $d) {
			// single-byte (0xxxxxxx) and multi-byte starting byte (11xxxxxx) both start a character
		if ((ord($str[$i]) & 0xC0) != 0x80) {
			$n++;
		}
	}
	if ($i < 0 || $i >= $length) return false;	// offset beyond string length

	if ($pos >= 0) {
			// skip trailing multi-byte data bytes (10xxxxxx) of the last character counted;
			// this may legitimately end at the byte length of the string
		while ($i < $length && (ord($str[$i]) & 0xC0) == 0x80) { $i++; }
	} else {
			// correct offset: the backward walk stopped one byte before the character start
		$i++;
	}

	return $i;
}
1927
1928 /**
1929 * Translates an 'absolute' byte position into a character position.
1930 * Unit tested by Kasper.
1931 *
1932 * @param string UTF-8 string
1933 * @param integer byte position
1934 * @return integer character position
1935 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1936 */
function utf8_byte2char_pos($str,$pos) {
		// Returns the index of the character that contains byte $pos, by counting the
		// character start bytes in the range 1..$pos (byte 0 always belongs to character 0).
	$str = (string)$str;
	$length = strlen($str);

		// Validate the offset up front. Checking after the loop is too late, since
		// the loop always ends at index 0 and would never see an oversized $pos.
		// $pos == $length (one past the last byte) is still accepted and yields
		// the number of characters.
	if ($length == 0 || $pos < 0 || $pos > $length) return false;	// offset beyond string length

	$n = 0;	// number of characters
	for ($i = $pos; $i > 0; $i--) {
		if ($i >= $length) {
			$n++;	// the end of the string is a character boundary
		} elseif ((ord($str[$i]) & 0xC0) != 0x80) {
				// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
			$n++;
		}
	}

	return $n;
}
1950
1951 /**
1952 * Maps all characters of an UTF-8 string.
1953 *
1954 * @param string UTF-8 string
1955 * @param string mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
1956 * @param string 'case': conversion 'toLower' or 'toUpper'
1957 * @return string the converted string
1958 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1959 */
function utf8_char_mapping($str,$mode,$opt='') {
	if (!$this->initUnicodeData($mode)) return $str;	// do nothing

		// Select the mapping table; the keys are looked up with complete UTF-8 characters
	switch($mode) {
		case 'case':
			$map =& $this->caseFolding['utf-8'][$opt];
		break;

		case 'ascii':
			$map =& $this->toASCII['utf-8'];
		break;

		default:
			return $str;
	}

	$str = (string)$str;
	$length = strlen($str);
	$out = '';
	for ($i = 0; $i < $length; $i++) {
		$c = ord($str[$i]);
		if (($c & 0xC0) == 0xC0) {	// multi-byte starting byte (11xxxxxx)
			for ($bc=0; $c & 0x80; $c = $c << 1) { $bc++; }	// calculate number of bytes
			$mbc = substr($str,$i,$bc);
			$i += $bc-1;
		} else {
				// single-byte (0xxxxxxx). A stray continuation byte (10xxxxxx) of a
				// malformed string also ends up here and is passed through as is,
				// so the previous character is not emitted a second time.
			$mbc = $str[$i];
		}

		if (isset($map[$mbc])) {
			$out .= $map[$mbc];
		} else {
			$out .= $mbc;
		}
	}

	return $out;
}
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014 /********************************************
2015 *
2016 * Internal EUC string operation functions
2017 *
2018 * Extended Unix Code:
2019 * ASCII compatible 7bit single bytes chars
2020 * 8bit two byte chars
2021 *
2022 * Shift-JIS is treated as a special case.
2023 *
2024 ********************************************/
2025
2026 /**
2027 * Cuts a string in the EUC charset family short at a given byte length.
2028 *
2029 * @param string EUC multibyte character string
2030 * @param integer the byte length
2031 * @param string the charset
2032 * @return string the shortened string
2033 * @see mb_strcut()
2034 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
2035 */
function euc_strtrunc($str,$len,$charset) {
		// Cuts $str at $len bytes without splitting a double-byte character.
	$str = (string)$str;
	$length = strlen($str);

		// String not longer than the supplied length: nothing to cut. This is decided
		// on the byte length up front, because the scan below may step past the end
		// of the string when its last character straddles $len.
	if ($len >= $length) return $str;

	$sjis = ($charset == 'shift_jis');
		// Step from character to character until byte $len is reached or passed
	for ($i = 0; $i < $len; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++;	// advance a double-byte char
		}
		else {
			if ($c >= 0x80) $i++;	// advance a double-byte char
		}
	}

	if ($i > $len) {
		return substr($str,0,$len-1);	// byte $len-1 is a first byte: drop it
	}
	return substr($str,0,$len);
}
2055
2056 /**
2057 * Returns a part of a string in the EUC charset family.
2058 *
2059 * @param string EUC multibyte character string
2060 * @param integer start position (character position)
2061 * @param string the charset
2062 * @param integer length (in characters)
2063 * @return string the substring
2064 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
2065 */
function euc_substr($str,$start,$charset,$len=null) {
		// Edge cases are handled the same way as in utf8_substr() and substr():
		// a length of zero (integer 0 or string '0') yields an empty string
	if ($len !== null && !strcmp((string)$len,'0')) return '';

		// Translate the character offset into a byte offset
	$byte_start = $this->euc_char2byte_pos($str,$start,$charset);
	if ($byte_start === false) {
		if ($start > 0) {
			return false;	// $start outside string length
		}
			// Zero or negative $start that reaches the beginning of the string (or beyond):
			// begin at the first byte
		$byte_start = 0;
	}

	$str = substr($str,$byte_start);

		// Loose comparison on purpose: null and '' both mean "no length given"
	if ($len != null) {
			// $str now begins at the wanted character, so $len is again a character offset
		$byte_end = $this->euc_char2byte_pos($str,$len,$charset);
		if ($byte_end === false) {
				// $len outside actual string length: a negative length that exceeds
				// the string yields a blank string, a positive one the whole rest
			return $len<0 ? '' : $str;
		}
		return substr($str,0,$byte_end);
	}

	return $str;
}
2081
2082 /**
2083 * Counts the number of characters of a string in the EUC charset family.
2084 *
2085 * @param string EUC multibyte character string
2086 * @param string the charset
2087 * @return integer the number of characters
2088 * @see strlen()
2089 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
2090 */
function euc_strlen($str,$charset) {
		// Steps over the string one character at a time; a byte recognised as the
		// first byte of a double-byte character consumes the following byte as well.
		// The loop is bounded by the byte length instead of probing past the end of
		// the string, and uses [] offsets ({} offsets were removed in PHP 8).
	$str = (string)$str;
	$length = strlen($str);
	$sjis = ($charset == 'shift_jis');
	$n = 0;
	for ($i = 0; $i < $length; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
				// Shift-JIS first bytes are 0x80-0x9F and 0xE0 upwards; 0xA0-0xDF are single bytes
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++;	// advance a double-byte char
		}
		else {
			if ($c >= 0x80) $i++;	// advance a double-byte char
		}

		$n++;
	}

	return $n;
}
2108
2109 /**
2110 * Translates a character position into an 'absolute' byte position.
2111 *
2112 * @param string EUC multibyte character string
2113 * @param integer character position (negative values start from the end)
2114 * @param string the charset
2115 * @return integer byte position
2116 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
2117 */
function euc_char2byte_pos($str,$pos,$charset) {
		// Returns the byte offset of character $pos, or false if that offset lies
		// outside the string. As before, an offset of exactly minus the number of
		// characters also yields false.
	$str = (string)$str;
	$length = strlen($str);
	$sjis = ($charset == 'shift_jis');

	if ($pos < 0) {
			// A negative position is converted into the equivalent position from the
			// start. The string is always scanned forwards, because the second byte of
			// a Shift-JIS character can be below 0x80 and so cannot be told from a
			// single-byte character when walking backwards.
		$total = 0;	// number of characters in the string
		for ($i = 0; $i < $length; $i++) {
			$c = ord($str[$i]);
			if ($sjis ? (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) : ($c >= 0x80)) $i++;	// advance a double-byte char
			$total++;
		}
		$p = $total + $pos;	// number of characters wanted, counted from the start
		if ($p <= 0) return false;	// offset beyond string length
	} else {
		$p = $pos;	// number of characters wanted
	}

	$n = 0;	// number of characters seen
	for ($i = 0; $i < $length && $n < $p; $i++) {
		$c = ord($str[$i]);
		if ($sjis ? (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) : ($c >= 0x80)) $i++;	// advance a double-byte char
		$n++;
	}
	if ($i >= $length) return false;	// offset beyond string length

	return $i;
}
2148
2149 /**
2150 * Maps all characters of a string in the EUC charset family.
2151 *
2152 * @param string EUC multibyte character string
2153 * @param string the charset
2154 * @param string mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
2155 * @param string 'case': conversion 'toLower' or 'toUpper'
2156 * @return string the converted string
2157 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
2158 */
function euc_char_mapping($str,$charset,$mode,$opt='') {
		// Load and select the mapping table; the keys are looked up with complete characters
	switch($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) return $str;	// do nothing
			$map =& $this->caseFolding[$charset][$opt];
		break;

		case 'ascii':
			if (!$this->initToASCII($charset)) return $str;	// do nothing
			$map =& $this->toASCII[$charset];
		break;

		default:
			return $str;
	}

		// The loop is bounded by the byte length instead of probing past the end of
		// the string, and uses [] offsets ({} offsets were removed in PHP 8).
	$str = (string)$str;
	$length = strlen($str);
	$sjis = ($charset == 'shift_jis');
	$out = '';
	for ($i = 0; $i < $length; $i++) {
		$c = ord($str[$i]);

			// Shift-JIS first bytes are 0x80-0x9F and 0xE0 upwards, in the other
			// charsets handled here every byte from 0x80 upwards starts a double-byte char
		if ($sjis) {
			$isDoubleByte = (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0));
		} else {
			$isDoubleByte = ($c >= 0x80);
		}

		if ($isDoubleByte) {	// a double-byte char
			$mbc = substr($str,$i,2);
			$i++;
		} else {
			$mbc = $str[$i];
		}

		if (isset($map[$mbc])) {
			$out .= $map[$mbc];
		} else {
			$out .= $mbc;
		}
	}

	return $out;
}
2203
2204 }
2205
2206 if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']) {
2207 include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
2208 }
2209
2210 ?>