Fixed bug / Feature #11293: Login page is not translatable
[Packages/TYPO3.CMS.git] / t3lib / class.t3lib_cs.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2003-2009 Kasper Skaarhoj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the Typo3 project. The Typo3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 *
17 * This script is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * This copyright notice MUST APPEAR in all copies of the script!
23 ***************************************************************/
24 /**
25 * Class for conversion between charsets.
26 *
27 * $Id$
28 *
29 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
30 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
31 */
32 /**
33 * [CLASS/FUNCTION INDEX of SCRIPT]
34 *
35 *
36 *
37 * 136: class t3lib_cs
38 * 488: function parse_charset($charset)
39 * 507: function get_locale_charset($locale)
40 *
41 * SECTION: Charset Conversion functions
42 * 560: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
43 * 600: function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0)
44 * 617: function utf8_encode($str,$charset)
45 * 663: function utf8_decode($str,$charset,$useEntityForNoChar=0)
46 * 706: function utf8_to_entities($str)
47 * 739: function entities_to_utf8($str,$alsoStdHtmlEnt=0)
48 * 773: function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
49 * 823: function UnumberToChar($cbyte)
50 * 868: function utf8CharToUnumber($str,$hex=0)
51 *
52 * SECTION: Init functions
53 * 911: function initCharset($charset)
54 * 973: function initUnicodeData($mode=null)
55 * 1198: function initCaseFolding($charset)
56 * 1260: function initToASCII($charset)
57 *
58 * SECTION: String operation functions
59 * 1331: function substr($charset,$string,$start,$len=null)
60 * 1384: function strlen($charset,$string)
61 * 1414: function crop($charset,$string,$len,$crop='')
62 * 1467: function strtrunc($charset,$string,$len)
63 * 1501: function conv_case($charset,$string,$case)
64 * 1527: function specCharsToASCII($charset,$string)
65 *
66 * SECTION: Internal string operation functions
67 * 1567: function sb_char_mapping($str,$charset,$mode,$opt='')
68 *
69 * SECTION: Internal UTF-8 string operation functions
70 * 1622: function utf8_substr($str,$start,$len=null)
71 * 1655: function utf8_strlen($str)
72 * 1676: function utf8_strtrunc($str,$len)
73 * 1698: function utf8_strpos($haystack,$needle,$offset=0)
74 * 1723: function utf8_strrpos($haystack,$needle)
75 * 1745: function utf8_char2byte_pos($str,$pos)
76 * 1786: function utf8_byte2char_pos($str,$pos)
77 * 1809: function utf8_char_mapping($str,$mode,$opt='')
78 *
79 * SECTION: Internal EUC string operation functions
80 * 1885: function euc_strtrunc($str,$len,$charset)
81 * 1914: function euc_substr($str,$start,$charset,$len=null)
82 * 1939: function euc_strlen($str,$charset)
83 * 1966: function euc_char2byte_pos($str,$pos,$charset)
84 * 2007: function euc_char_mapping($str,$charset,$mode,$opt='')
85 *
86 * TOTAL FUNCTIONS: 35
87 * (This index is automatically created/updated by the extension "extdeveval")
88 *
89 */
90
91
92
93
94
95
96
97
98 /**
99 * Notes on UTF-8
100 *
101 * Functions working on UTF-8 strings:
102 *
103 * - strchr/strstr
104 * - strrchr
105 * - substr_count
106 * - implode/explode/join
107 *
108 * Functions nearly working on UTF-8 strings:
109 *
110 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
111 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
112 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
113 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
114 * - preg_*: UTF-8 support is compiled into PCRE by default nowadays, but could be unavailable; patterns need the "u" modifier to be treated as UTF-8
115 *
116 * Functions NOT working on UTF-8 strings:
117 *
118 * - str*cmp
119 * - stristr
120 * - stripos
121 * - substr
122 * - strrev
123 * - split/spliti
124 * - ...
125 *
126 */
127 /**
128 * Class for conversion between charsets
129 *
130 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
131 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
132 * @package TYPO3
133 * @subpackage t3lib
134 */
135 class t3lib_cs {
136 var $noCharByteVal=63; // ASCII Value for chars with no equivalent.
137
138 // This is the array where parsed conversion tables are stored (cached)
139 var $parsedCharsets=array();
140
141 // An array where case folding data will be stored (cached)
142 var $caseFolding=array();
143
144 // An array where charset-to-ASCII mappings are stored (cached)
145 var $toASCII=array();
146
147 // This tells the converter which charsets have two bytes per char:
148 var $twoByteSets=array(
149 'ucs-2'=>1, // 2-byte Unicode
150 );
151
152 // This tells the converter which charsets have four bytes per char:
153 var $fourByteSets=array(
154 'ucs-4'=>1, // 4-byte Unicode
155 'utf-32'=>1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
156 );
157
158 // This tells the converter which charsets use a scheme like the Extended Unix Code (two bytes for chars above 127):
159 var $eucBasedSets=array(
160 'gb2312'=>1, // Chinese, simplified.
161 'big5'=>1, // Chinese, traditional.
162 'euc-kr'=>1, // Korean
163 'shift_jis'=>1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
164 );
165
166 // see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
167 // http://czyborra.com/charsets/iso8859.html
168 var $synonyms=array(
169 'us' => 'ascii',
170 'us-ascii'=> 'ascii',
171 'cp819' => 'iso-8859-1',
172 'ibm819' => 'iso-8859-1',
173 'iso-ir-100' => 'iso-8859-1',
174 'iso-ir-101' => 'iso-8859-2',
175 'iso-ir-109' => 'iso-8859-3',
176 'iso-ir-110' => 'iso-8859-4',
177 'iso-ir-144' => 'iso-8859-5',
178 'iso-ir-127' => 'iso-8859-6',
179 'iso-ir-126' => 'iso-8859-7',
180 'iso-ir-138' => 'iso-8859-8',
181 'iso-ir-148' => 'iso-8859-9',
182 'iso-ir-157' => 'iso-8859-10',
183 'iso-ir-179' => 'iso-8859-13',
184 'iso-ir-199' => 'iso-8859-14',
185 'iso-ir-203' => 'iso-8859-15',
186 'csisolatin1' => 'iso-8859-1',
187 'csisolatin2' => 'iso-8859-2',
188 'csisolatin3' => 'iso-8859-3',
189 'csisolatin5' => 'iso-8859-9',
190 'csisolatin8' => 'iso-8859-14',
191 'csisolatin9' => 'iso-8859-15',
192 'csisolatingreek' => 'iso-8859-7',
193 'iso-celtic' => 'iso-8859-14',
194 'latin1' => 'iso-8859-1',
195 'latin2' => 'iso-8859-2',
196 'latin3' => 'iso-8859-3',
197 'latin5' => 'iso-8859-9',
198 'latin6' => 'iso-8859-10',
199 'latin8' => 'iso-8859-14',
200 'latin9' => 'iso-8859-15',
201 'l1' => 'iso-8859-1',
202 'l2' => 'iso-8859-2',
203 'l3' => 'iso-8859-3',
204 'l5' => 'iso-8859-9',
205 'l6' => 'iso-8859-10',
206 'l8' => 'iso-8859-14',
207 'l9' => 'iso-8859-15',
208 'cyrillic' => 'iso-8859-5',
209 'arabic' => 'iso-8859-6',
210 'tis-620' => 'iso-8859-11',
211 'win874' => 'windows-874',
212 'win1250' => 'windows-1250',
213 'win1251' => 'windows-1251',
214 'win1252' => 'windows-1252',
215 'win1253' => 'windows-1253',
216 'win1254' => 'windows-1254',
217 'win1255' => 'windows-1255',
218 'win1256' => 'windows-1256',
219 'win1257' => 'windows-1257',
220 'win1258' => 'windows-1258',
221 'cp1250' => 'windows-1250',
222 'cp1251' => 'windows-1251',
223 'cp1252' => 'windows-1252',
224 'ms-ee' => 'windows-1250',
225 'ms-ansi' => 'windows-1252',
226 'ms-greek' => 'windows-1253',
227 'ms-turk' => 'windows-1254',
228 'winbaltrim' => 'windows-1257',
229 'koi-8ru' => 'koi-8r',
230 'koi8r' => 'koi-8r',
231 'cp878' => 'koi-8r',
232 'mac' => 'macroman',
233 'macintosh' => 'macroman',
234 'euc-cn' => 'gb2312',
235 'x-euc-cn' => 'gb2312',
236 'euccn' => 'gb2312',
237 'cp936' => 'gb2312',
238 'big-5' => 'big5',
239 'cp950' => 'big5',
240 'eucjp' => 'euc-jp',
241 'sjis' => 'shift_jis',
242 'shift-jis' => 'shift_jis',
243 'cp932' => 'shift_jis',
244 'cp949' => 'euc-kr',
245 'utf7' => 'utf-7',
246 'utf8' => 'utf-8',
247 'utf16' => 'utf-16',
248 'utf32' => 'utf-32',
249 'utf8' => 'utf-8',
250 'ucs2' => 'ucs-2',
251 'ucs4' => 'ucs-4',
252 );
253
// Mapping of language identifiers to script (language family) names.
// The script names are the keys of $script_to_charset_unix / $script_to_charset_windows.
// All keys must be lowercase because get_locale_charset() lowercases the locale before the lookup.
// Misspelled keys from earlier revisions ('svedish', 'that') are kept for backwards compatibility.
var $lang_to_script=array(
		// iso-639-1 language codes, see http://www.loc.gov/standards/iso639-2/php/code_list.php
	'ar' => 'arabic',
	'bg' => 'cyrillic',		// Bulgarian
	'bs' => 'east_european',	// Bosnian
	'cs' => 'east_european',	// Czech
	'da' => 'west_european',	// Danish
	'de' => 'west_european',	// German
	'es' => 'west_european',	// Spanish
	'et' => 'estonian',
	'eo' => 'unicode',		// Esperanto
	'eu' => 'west_european',	// Basque
	'fa' => 'arabic',		// Persian
	'fi' => 'west_european',	// Finnish
	'fo' => 'west_european',	// Faroese
	'fr' => 'west_european',	// French
	'ga' => 'west_european',	// Galician (NOTE(review): ISO 639-1 assigns "ga" to Irish, Galician is "gl" - confirm intent)
	'ge' => 'unicode',		// Georgian (non-ISO code, kept for compatibility)
	'ka' => 'unicode',		// Georgian (ISO 639-1)
	'gr' => 'greek',		// Greek (non-ISO code, kept for compatibility)
	'el' => 'greek',		// Greek (ISO 639-1)
	'he' => 'hebrew',		// Hebrew (since 1998)
	'hi' => 'unicode',		// Hindi
	'hr' => 'east_european',	// Croatian
	'hu' => 'east_european',	// Hungarian
	'iw' => 'hebrew',		// Hebrew (til 1998)
	'is' => 'west_european',	// Icelandic
	'it' => 'west_european',	// Italian
	'ja' => 'japanese',
	'kl' => 'west_european',	// Greenlandic
	'ko' => 'korean',
	'lt' => 'lithuanian',
	'lv' => 'west_european',	// Latvian/Lettish
	'nl' => 'west_european',	// Dutch
	'no' => 'west_european',	// Norwegian
	'nb' => 'west_european',	// Norwegian Bokmal
	'nn' => 'west_european',	// Norwegian Nynorsk
	'pl' => 'east_european',	// Polish
	'pt' => 'west_european',	// Portuguese
	'ro' => 'east_european',	// Romanian
	'ru' => 'cyrillic',		// Russian
	'sk' => 'east_european',	// Slovak
	'sl' => 'east_european',	// Slovenian
	'sr' => 'cyrillic',		// Serbian
	'sv' => 'west_european',	// Swedish
	'sq' => 'albanian',		// Albanian
	'th' => 'thai',
	'tr' => 'turkish',		// Turkish
	'uk' => 'cyrillic',		// Ukrainian
	'vi' => 'vietnamese',
	'zh' => 'chinese',
		// MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
		// http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
	'ara' => 'arabic',
	'bgr' => 'cyrillic',		// Bulgarian
	'cat' => 'west_european',	// Catalan
	'chs' => 'simpl_chinese',
	'cht' => 'trad_chinese',
	'csy' => 'east_european',	// Czech
	'dan' => 'west_european',	// Danish
	'deu' => 'west_european',	// German
	'dea' => 'west_european',	// German (Austrian)
	'des' => 'west_european',	// German (Swiss)
	'ena' => 'west_european',	// English (Australian)
	'enc' => 'west_european',	// English (Canadian)
	'eng' => 'west_european',	// English
	'enz' => 'west_european',	// English (New Zealand)
	'enu' => 'west_european',	// English (United States)
	'euq' => 'west_european',	// Basque
	'fos' => 'west_european',	// Faroese
	'far' => 'arabic',		// Persian
	'fin' => 'west_european',	// Finnish
	'fra' => 'west_european',	// French
	'frb' => 'west_european',	// French (Belgian)
	'frc' => 'west_european',	// French (Canadian)
	'frs' => 'west_european',	// French (Swiss)
	'geo' => 'unicode',		// Georgian
	'glg' => 'west_european',	// Galician
	'ell' => 'greek',
	'heb' => 'hebrew',
	'hin' => 'unicode',		// Hindi
	'hun' => 'east_european',	// Hungarian
	'isl' => 'west_european',	// Icelandic
	'ita' => 'west_european',	// Italian
	'its' => 'west_european',	// Italian (Swiss)
	'jpn' => 'japanese',
	'kor' => 'korean',
	'lth' => 'lithuanian',
	'lvi' => 'west_european',	// Latvian/Lettish
	'msl' => 'west_european',	// Malay
	'nlb' => 'west_european',	// Dutch (Belgian)
	'nld' => 'west_european',	// Dutch
	'nor' => 'west_european',	// Norwegian (bokmal)
	'non' => 'west_european',	// Norwegian (nynorsk)
	'plk' => 'east_european',	// Polish
	'ptg' => 'west_european',	// Portuguese
	'ptb' => 'west_european',	// Portuguese (Brazil)
	'rom' => 'east_european',	// Romanian
	'rus' => 'cyrillic',		// Russian
	'slv' => 'east_european',	// Slovenian
	'sky' => 'east_european',	// Slovak
	'srl' => 'east_european',	// Serbian (Latin)
	'srb' => 'cyrillic',		// Serbian (Cyrillic)
	'esp' => 'west_european',	// Spanish (trad. sort)
	'esm' => 'west_european',	// Spanish (Mexican)
	'esn' => 'west_european',	// Spanish (internat. sort)
	'sve' => 'west_european',	// Swedish
	'sqi' => 'albanian',		// Albanian
	'tha' => 'thai',
	'trk' => 'turkish',
	'ukr' => 'cyrillic',		// Ukrainian
		// English language names
	'albanian' => 'albanian',
	'arabic' => 'arabic',
	'basque' => 'west_european',
	'bosnian' => 'east_european',
	'bulgarian' => 'cyrillic',	// same script as the 'bg' and 'bgr' entries
	'catalan' => 'west_european',
	'croatian' => 'east_european',
	'czech' => 'east_european',
	'danish' => 'west_european',
	'dutch' => 'west_european',
	'english' => 'west_european',
	'esperanto' => 'unicode',
	'estonian' => 'estonian',
	'faroese' => 'west_european',
	'farsi' => 'arabic',
	'finnish' => 'west_european',
	'french' => 'west_european',
	'galician' => 'west_european',
	'georgian' => 'unicode',
	'german' => 'west_european',
	'greek' => 'greek',
	'greenlandic' => 'west_european',
	'hebrew' => 'hebrew',
	'hindi' => 'unicode',
	'hungarian' => 'east_european',
	'icelandic' => 'west_european',
	'italian' => 'west_european',
	'latvian' => 'west_european',
	'lettish' => 'west_european',
	'lithuanian' => 'lithuanian',
	'malay' => 'west_european',
	'norwegian' => 'west_european',
	'persian' => 'arabic',
	'polish' => 'east_european',
	'portuguese' => 'west_european',
	'russian' => 'cyrillic',
	'romanian' => 'east_european',
	'serbian' => 'cyrillic',
	'slovak' => 'east_european',
	'slovenian' => 'east_european',
	'spanish' => 'west_european',
	'svedish' => 'west_european',	// misspelling, kept for backwards compatibility
	'swedish' => 'west_european',
	'that' => 'thai',		// misspelling, kept for backwards compatibility
	'thai' => 'thai',
	'turkish' => 'turkish',
	'ukrainian' => 'cyrillic',
);
410
411 // mapping of language (family) names to charsets on Unix
412 var $script_to_charset_unix=array(
413 'west_european' => 'iso-8859-1',
414 'estonian' => 'iso-8859-1',
415 'east_european' => 'iso-8859-2',
416 'baltic' => 'iso-8859-4',
417 'cyrillic' => 'iso-8859-5',
418 'arabic' => 'iso-8859-6',
419 'greek' => 'iso-8859-7',
420 'hebrew' => 'iso-8859-8',
421 'turkish' => 'iso-8859-9',
422 'thai' => 'iso-8859-11', // = TIS-620
423 'lithuanian' => 'iso-8859-13',
424 'chinese' => 'gb2312', // = euc-cn
425 'japanese' => 'euc-jp',
426 'korean' => 'euc-kr',
427 'simpl_chinese' => 'gb2312',
428 'trad_chinese' => 'big5',
429 'vietnamese' => '',
430 'unicode' => 'utf-8',
431 'albanian' => 'utf-8'
432 );
433
434 // mapping of language (family) names to charsets on Windows
435 var $script_to_charset_windows=array(
436 'east_european' => 'windows-1250',
437 'cyrillic' => 'windows-1251',
438 'west_european' => 'windows-1252',
439 'greek' => 'windows-1253',
440 'turkish' => 'windows-1254',
441 'hebrew' => 'windows-1255',
442 'arabic' => 'windows-1256',
443 'baltic' => 'windows-1257',
444 'estonian' => 'windows-1257',
445 'lithuanian' => 'windows-1257',
446 'vietnamese' => 'windows-1258',
447 'thai' => 'cp874',
448 'korean' => 'cp949',
449 'chinese' => 'gb2312',
450 'japanese' => 'shift_jis',
451 'simpl_chinese' => 'gb2312',
452 'trad_chinese' => 'big5',
453 'albanian' => 'windows-1250',
454 'unicode' => 'utf-8'
455 );
456
// Mapping of complete locale names to charsets.
// get_locale_charset() lowercases the locale string before looking it up here,
// so every key MUST be written in lowercase or it can never match.
var $locale_to_charset=array(
	'japanese.euc' => 'euc-jp',
	'ja_jp.ujis' => 'euc-jp',
	'korean.euc' => 'euc-kr',
	'sr@latn' => 'iso-8859-2',
	'zh_cn' => 'gb2312',
	'zh_hk' => 'big5',
	'zh_tw' => 'big5',
);
467
468 // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
469 // Empty values means "iso-8859-1"
470 var $charSetArray = array(
471 'dk' => '',
472 'de' => '',
473 'no' => '',
474 'it' => '',
475 'fr' => '',
476 'es' => '',
477 'nl' => '',
478 'cz' => 'windows-1250',
479 'pl' => 'iso-8859-2',
480 'si' => 'windows-1250',
481 'fi' => '',
482 'tr' => 'iso-8859-9',
483 'se' => '',
484 'pt' => '',
485 'ru' => 'windows-1251',
486 'ro' => 'iso-8859-2',
487 'ch' => 'gb2312',
488 'sk' => 'windows-1250',
489 'lt' => 'windows-1257',
490 'is' => 'utf-8',
491 'hr' => 'windows-1250',
492 'hu' => 'iso-8859-2',
493 'gl' => '',
494 'th' => 'iso-8859-11',
495 'gr' => 'iso-8859-7',
496 'hk' => 'big5',
497 'eu' => '',
498 'bg' => 'windows-1251',
499 'br' => '',
500 'et' => 'iso-8859-4',
501 'ar' => 'iso-8859-6',
502 'he' => 'utf-8',
503 'ua' => 'windows-1251',
504 'jp' => 'shift_jis',
505 'lv' => 'utf-8',
506 'vn' => 'utf-8',
507 'ca' => 'iso-8859-15',
508 'ba' => 'iso-8859-2',
509 'kr' => 'euc-kr',
510 'eo' => 'utf-8',
511 'my' => '',
512 'hi' => 'utf-8',
513 'fo' => 'utf-8',
514 'fa' => 'utf-8',
515 'sr' => 'utf-8',
516 'sq' => 'utf-8',
517 'ge' => 'utf-8',
518 'ga' => '',
519 );
520
521 // TYPO3 specific: Array with the iso names used for each system language in TYPO3:
522 // Missing keys means: same as Typo3
523 var $isoArray = array(
524 'ba' => 'bs',
525 'br' => 'pt_BR',
526 'ch' => 'zh_CN',
527 'cz' => 'cs',
528 'dk' => 'da',
529 'si' => 'sl',
530 'se' => 'sv',
531 'gl' => 'kl',
532 'gr' => 'el',
533 'hk' => 'zh_HK',
534 'kr' => 'ko',
535 'ua' => 'uk',
536 'jp' => 'ja',
537 'vn' => 'vi',
538 );
539
/**
 * Brings a charset name into canonical form: the name is lowercased, surrounding
 * whitespace is stripped and known aliases (keys of $this->synonyms) are replaced
 * by the charset name they stand for.
 *
 * @param	string		Charset name as supplied by the caller
 * @return	string		Canonical charset name; names without a synonym entry are returned lowercased and trimmed
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function parse_charset($charset) {
	$normalized = trim(strtolower($charset));

	return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
}
553
/**
 * Get the charset of a locale.
 *
 * Accepted locale formats:
 *
 * ln			language
 * ln_CN		language / country
 * ln_CN.cs		language / country / charset
 * ln_CN.cs@mod		language / country / charset / modifier
 *
 * Resolution order:
 * 1) exact (lowercased) match in $this->locale_to_charset
 * 2) charset part of the locale, normalized through parse_charset()
 * 3) modifier "euro" => iso-8859-15
 * 4) language => script (via $this->lang_to_script) => charset, using the Windows
 *    or the Unix table depending on the TYPO3_OS constant
 * 5) the platform default (windows-1252 on Windows, iso-8859-1 otherwise)
 *
 * @param	string		Locale string
 * @return	string		Charset resolved for locale string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function get_locale_charset($locale) {
	$locale = strtolower($locale);

		// exact locale specific charset?
	if (isset($this->locale_to_charset[$locale])) {
		return $this->locale_to_charset[$locale];
	}

		// get modifier; array_pad() guarantees both list() targets exist when there is no "@"
	list($locale,$modifier) = array_pad(explode('@',$locale), 2, '');

		// locale contains charset: use it
	list($locale,$charset) = array_pad(explode('.',$locale), 2, '');
	if ($charset) {
		return $this->parse_charset($charset);
	}

		// modifier is 'euro' (after charset check, because of xx.utf-8@euro)
	if ($modifier == 'euro') {
		return 'iso-8859-15';
	}

		// get language (the country part is not needed for the lookup)
	list($language) = explode('_',$locale);
		// unknown languages get an empty script name, which is in neither table and thus selects the default
	$script = isset($this->lang_to_script[$language]) ? $this->lang_to_script[$language] : '';

	if (TYPO3_OS == 'WIN') {
		$scriptToCharset = $this->script_to_charset_windows;
		$defaultCharset = 'windows-1252';
	} else {
		$scriptToCharset = $this->script_to_charset_unix;
		$defaultCharset = 'iso-8859-1';
	}

		// empty table values (e.g. an empty string) also fall back to the default
	return !empty($scriptToCharset[$script]) ? $scriptToCharset[$script] : $defaultCharset;
}
594
595
596
597
598
599
600
601
602
603 /********************************************
604 *
605 * Charset Conversion functions
606 *
607 ********************************************/
608
/**
 * Converts a string from one charset to another.
 *
 * If the target is UTF-8 or no entity fallback is requested, the conversion is first
 * attempted with the PHP library selected in
 * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'] ("mbstring", "iconv" or "recode").
 * If no library is configured, or the library reports failure, the string is converted
 * by this class itself with UTF-8 as intermediate charset.
 *
 * @param	string		Input string
 * @param	string		From charset (the current charset of the string)
 * @param	string		To charset (the output charset wanted)
 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return	string		Converted string
 * @see convArray()
 */
function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) {
	if ($fromCS==$toCS) {
		return $str;
	}

		// PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
	$nativeConversionAllowed = ($toCS=='utf-8' || !$useEntityForNoChar);
	if ($nativeConversionAllowed) {
		$method = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'];
		$converted = false;

		if ($method == 'mbstring') {
			$converted = mb_convert_encoding($str,$toCS,$fromCS);	// returns false for unsupported charsets
		} elseif ($method == 'iconv') {
			$converted = iconv($fromCS,$toCS.'//TRANSLIT',$str);
		} elseif ($method == 'recode') {
			$converted = recode_string($fromCS.'..'.$toCS,$str);
		}

		if (false !== $converted) {
			return $converted;
		}
			// otherwise: fallback to TYPO3 conversion
	}

	if ($fromCS!='utf-8') {
		$str = $this->utf8_encode($str,$fromCS);
	}
	if ($toCS!='utf-8') {
		$str = $this->utf8_decode($str,$toCS,$useEntityForNoChar);
	}

	return $str;
}
647
/**
 * Converts every value of an array from one charset to another, descending into nested arrays.
 * NOTICE: The array is passed by reference and modified in place!
 *
 * @param	array		Input array, possibly multidimensional
 * @param	string		From charset (the current charset of the values)
 * @param	string		To charset (the output charset wanted)
 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return	void
 * @see conv()
 */
function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0) {
	foreach ($array as $key => $element) {
		if (!is_array($element)) {
			$array[$key] = $this->conv($element,$fromCS,$toCS,$useEntityForNoChar);
			continue;
		}

			// nested array: convert it in place through the reference parameter
		$this->convArray($array[$key],$fromCS,$toCS,$useEntityForNoChar);
	}
}
668
/**
 * Converts $str from $charset to UTF-8
 *
 * Bytes 0-127 are copied unchanged. Everything else is looked up in the parsed
 * conversion table of the charset; characters without a table entry are replaced
 * by the character with byte value $this->noCharByteVal.
 *
 * @param	string		String in local charset to convert to UTF-8
 * @param	string		Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return	string		Output string, converted to UTF-8. NULL (no explicit return) if initCharset() reports failure for the charset.
 */
function utf8_encode($str,$charset) {

	if ($charset === 'utf-8') {
		return $str;
	}

		// Charset is case-insensitive.
	if ($this->initCharset($charset)) {	// Parse conv. table if not already...
		$strLen = strlen($str);
		$outStr = '';
			// The charset does not change while looping, so classify it once:
		$isTwoByteSet = isset($this->twoByteSets[$charset]);
		$isEucBasedSet = isset($this->eucBasedSets[$charset]);

		for ($a=0; $a<$strLen; $a++) {	// Traverse each char in string.
			$chr = substr($str,$a,1);
			$ord = ord($chr);

			if ($isTwoByteSet) {	// If the charset has two bytes per char
					// substr() yields an empty value at the end of an odd-length string, which ord() turns into 0
				$a++;
				$ord = ($ord << 8) | ord(substr($str,$a,1));	// assume big endian
			} elseif ($ord > 127) {	// Not ASCII: needs the conversion table
					// EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
					// Shift-JIS: chars between 160 and 223 are single byte
				if ($isEucBasedSet && ($charset != 'shift_jis' || $ord < 0xA0 || $ord > 0xDF)) {
					$a++;
					$ord = $ord*256 + ord(substr($str,$a,1));
				}
			} else {	// ... otherwise it's just ASCII 0-127 and one byte. Transparent
				$outStr .= $chr;
				continue;
			}

				// If the local char-number was found in parsed conv. table then we use that, otherwise the "no char" replacement
			if (isset($this->parsedCharsets[$charset]['local'][$ord])) {
				$outStr .= $this->parsedCharsets[$charset]['local'][$ord];
			} else {
				$outStr .= chr($this->noCharByteVal);	// No char exists
			}
		}
		return $outStr;
	}
}
713
/**
 * Converts $str from UTF-8 to $charset
 *
 * ASCII bytes are copied unchanged. Each multibyte sequence is looked up in the parsed
 * conversion table of the charset. A sequence without table entry becomes a numeric
 * entity (if requested) or the "no char" replacement byte; a stray continuation byte
 * always becomes the replacement byte.
 *
 * @param	string		String in UTF-8 to convert to local charset
 * @param	string		Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return	string		Output string, converted to local charset. NULL (no explicit return) if initCharset() reports failure for the charset.
 */
function utf8_decode($str,$charset,$useEntityForNoChar=0) {

	if ($charset === 'utf-8') {
		return $str;
	}

		// Charset is case-insensitive.
	if ($this->initCharset($charset)) {	// Parse conv. table if not already...
		$noChar = chr($this->noCharByteVal);
		$byteCount = strlen($str);
		$result = '';

		for ($pos=0; $pos<$byteCount; $pos++) {	// Traverse each byte in UTF-8 string.
			$byte = substr($str,$pos,1);
			$code = ord($byte);

			if ($code <= 127) {	// Plain ASCII: transparent
				$result .= $byte;
				continue;
			}
			if (!($code & 64)) {	// A lead byte must have the 7th bit set; this is the MIDDLE of a sequence
				$result .= $noChar;
				continue;
			}

				// Collect the sequence: every further leading 1-bit of the lead byte announces one more byte.
			$sequence = $byte;
			while (($code <<= 1) & 128) {
				$pos++;
				$sequence .= substr($str,$pos,1);
			}

			if (isset($this->parsedCharsets[$charset]['utf8'][$sequence])) {	// If the UTF-8 char-sequence is found then...
				$localNumber = $this->parsedCharsets[$charset]['utf8'][$sequence];
					// Numbers above 255 are written as two bytes (16bit word assumed)
				$result .= ($localNumber > 255)
					? chr(($localNumber >> 8) & 255).chr($localNumber & 255)
					: chr($localNumber);
			} elseif ($useEntityForNoChar) {	// Create num entity:
				$result .= '&#'.$this->utf8CharToUnumber($sequence,1).';';
			} else {	// No char exists
				$result .= $noChar;
			}
		}
		return $result;
	}
}
762
/**
 * Converts all chars > 127 of a UTF-8 string to numeric (hexadecimal) entities.
 * ASCII characters are copied unchanged; a stray continuation byte is replaced
 * by the character with byte value $this->noCharByteVal.
 *
 * @param	string		Input string
 * @return	string		Output string
 */
function utf8_to_entities($str) {
	$byteCount = strlen($str);
	$result = '';

	for ($pos=0; $pos<$byteCount; $pos++) {	// Traverse each byte in UTF-8 string.
		$byte = substr($str,$pos,1);
		$code = ord($byte);

		if ($code <= 127) {	// Plain ASCII: transparent
			$result .= $byte;
			continue;
		}
		if (!($code & 64)) {	// A lead byte must have the 7th bit set; this is the MIDDLE of a sequence
			$result .= chr($this->noCharByteVal);
			continue;
		}

			// Collect the sequence: every further leading 1-bit of the lead byte announces one more byte.
		$sequence = $byte;
		while (($code <<= 1) & 128) {
			$pos++;
			$sequence .= substr($str,$pos,1);
		}

		$result .= '&#'.$this->utf8CharToUnumber($sequence,1).';';
	}

	return $result;
}
794
/**
 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
 *
 * Only well-formed numeric entities are converted ("&#" + digits, or "&#x" / "&#X" + hex digits);
 * anything else that merely looks like an entity is left untouched.
 * Named entities are resolved with PHP's HTML entity table requested in UTF-8, so the
 * table values can be inserted as they are.
 *
 * @param	string		Input string, UTF-8
 * @param	boolean		If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
 * @return	string		Output string
 */
function entities_to_utf8($str,$alsoStdHtmlEnt=0) {
	$namedEntities = array();
	if ($alsoStdHtmlEnt) {
			// Ask for UTF-8 explicitly: the default charset of the table depends on the PHP version.
		$namedEntities = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8'));
	}

		// With PREG_SPLIT_DELIM_CAPTURE the result alternates between plain text (even keys)
		// and the text between "&" and ";" (odd keys).
	$parts = preg_split('/&([#[:alnum:]]*);/', $str, -1, PREG_SPLIT_DELIM_CAPTURE);
	foreach ($parts as $k => $v) {
		if ($k%2 == 0) {
			continue;	// plain text
		}

		$entity = '&'.$v.';';
		$match = array();
		if (preg_match('/^#[xX]([0-9a-fA-F]+)$/', $v, $match)) {	// Hex entity
			$parts[$k] = $this->UnumberToChar(hexdec($match[1]));
		} elseif (preg_match('/^#([0-9]+)$/', $v, $match)) {	// Dec entity
			$parts[$k] = $this->UnumberToChar((int)$match[1]);
		} elseif (isset($namedEntities[$entity])) {	// Other entities (table is empty unless $alsoStdHtmlEnt is set)
			$parts[$k] = $namedEntities[$entity];
		} else {	// No conversion:
			$parts[$k] = $entity;
		}
	}

	return implode('',$parts);
}
827
/**
 * Splits a UTF-8 string into its characters and returns them in an array,
 * either as UNICODE integer numbers or as the UTF-8 characters themselves.
 * A stray continuation byte is reported as $this->noCharByteVal.
 *
 * @param	string		Input string, UTF-8
 * @param	boolean		If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
 * @param	boolean		If set, then instead of integer numbers the real UTF-8 char is returned.
 * @return	array		Output array with the char numbers
 */
function utf8_to_numberarray($str,$convEntities=0,$retChar=0) {
		// If entities must be registered as well...:
	if ($convEntities) {
		$str = $this->entities_to_utf8($str,1);
	}

		// Do conversion:
	$byteCount = strlen($str);
	$chars = array();

	for ($pos=0; $pos<$byteCount; $pos++) {	// Traverse each byte in UTF-8 string.
		$byte = substr($str,$pos,1);
		$code = ord($byte);

		if ($code <= 127) {	// Plain ASCII: one byte, number equals byte value
			$chars[] = $retChar ? chr($code) : $code;
			continue;
		}
		if (!($code & 64)) {	// A lead byte must have the 7th bit set; this is the MIDDLE of a sequence
			$chars[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
			continue;
		}

			// Collect the sequence: every further leading 1-bit of the lead byte announces one more byte.
		$sequence = $byte;
		while (($code <<= 1) & 128) {
			$pos++;
			$sequence .= substr($str,$pos,1);
		}

		$chars[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
	}

	return $chars;
}
866
/**
 * Converts a UNICODE number to a UTF-8 multibyte character
 * Algorithm based on script found at From: http://czyborra.com/utf/
 *
 * The bits of the number are spread across the bytes of the sequence; the number of
 * high bits set in the lead byte announces the number of bytes in the sequence:
 *
 *  bytes | bits | representation
 *      1 |    7 | 0vvvvvvv
 *      2 |   11 | 110vvvvv 10vvvvvv
 *      3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
 *      4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * @param	integer		UNICODE integer
 * @return	string		UTF-8 multibyte character string; the "no char" replacement if the number needs more than 31 bits
 * @see utf8CharToUnumber()
 */
function UnumberToChar($cbyte) {
	if ($cbyte < 0x80) {	// One byte, identical to ASCII
		return chr($cbyte);
	}

		// Each row: exclusive upper limit of the number, marker bits of the lead byte, number of continuation bytes
	$layouts = array(
		array(0x800, 0xC0, 1),
		array(0x10000, 0xE0, 2),
		array(0x200000, 0xF0, 3),
		array(0x4000000, 0xF8, 4),
		array(0x80000000, 0xFC, 5),
	);

	foreach ($layouts as $layout) {
		list($limit, $leadMarker, $tailBytes) = $layout;
		if ($cbyte < $limit) {
				// Lead byte carries the topmost bits ...
			$char = chr($leadMarker | ($cbyte >> (6*$tailBytes)));
				// ... each continuation byte the next 6 bits, most significant group first
			for ($shift = 6*($tailBytes-1); $shift >= 0; $shift -= 6) {
				$char .= chr(0x80 | (($cbyte >> $shift) & 0x3F));
			}
			return $char;
		}
	}

		// Cannot express a 32-bit character in UTF-8
	return chr($this->noCharByteVal);
}
921
922 /**
923 * Converts a UTF-8 Multibyte character to a UNICODE number
924 * Unit-tested by Kasper
925 *
926 * @param string UTF-8 multibyte character string
927 * @param boolean If set, then a hex. number is returned.
928 * @return integer UNICODE integer
929 * @see UnumberToChar()
930 */
function utf8CharToUnumber($str,$hex=0) {
	$lead = ord(substr($str,0,1));

	// Anything that does not start with the bits "11" is returned as its plain byte value.
	if (($lead & 192) != 192) {
		return $hex ? 'x'.dechex($lead) : $lead;
	}

	// Every further high bit set in the lead byte announces one more byte in the sequence;
	// the lower six bits of each of those bytes are collected as a binary digit string.
	$payloadBits = '';
	$trailing = 0;
	$probe = $lead << 1;
	while ($trailing < 8 && ($probe & 128)) {
		$trailing++;
		$payloadBits .= sprintf('%06b', ord(substr($str,$trailing,1)) & 0x3F);
		$probe = $probe << 1;
	}

	// The lead byte contributes its lowest (6 - number of following bytes) bits.
	$leadBits = substr('00000000'.decbin($lead), $trailing - 6);
	$int = bindec($leadBits.$payloadBits);

	return $hex ? 'x'.dechex($int) : $int;
}
949
950
951
952
953
954
955
956
957
958 /********************************************
959 *
960 * Init functions
961 *
962 ********************************************/
963
964 /**
965 * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
966 * This function is automatically called by the conversion functions
967 *
968 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
969 *
970 * @param string The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
971 * @return integer Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed.
972 * @access private
973 */
function initCharset($charset) {
	// Nothing to do if the table for this charset is already in memory.
	if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
		return 1;
	}

	// Conversion table filename:
	$charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';
	if (!$charset || !t3lib_div::validPathStr($charsetConvTableFile) || !@is_file($charsetConvTableFile)) {
		return false;
	}

	// Cache file for charsets:
	// Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$cachedTable = unserialize(t3lib_div::getUrl($cacheFile));
		// Only accept the cache if it really holds a table; a truncated or corrupt file
		// is ignored and the table is rebuilt (and the cache rewritten) below.
		if (is_array($cachedTable)) {
			$this->parsedCharsets[$charset] = $cachedTable;
			return 2;
		}
	}

	// Parse conversion table into lines:
	$lines = t3lib_div::trimExplode(chr(10), t3lib_div::getUrl($charsetConvTableFile), 1);
	// Initialize the internal variable holding the conv. table:
	$this->parsedCharsets[$charset] = array('local'=>array(),'utf8'=>array());

	// The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
	$whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
	$detectedType = '';

	foreach ($lines as $value) {
		// Comment lines or blanks are ignored.
		if (!trim($value) || substr($value,0,1) == '#') {
			continue;
		}

		// Detect type if not done yet (done on the first real line):
		if (!$detectedType) {
			$detectedType = preg_match($whitespacedPattern, $value) ? 'whitespaced' : 'ms-token';
		}

		$hexbyte = '';
		$utf8 = '';
		if ($detectedType == 'ms-token') {
			$parts = preg_split('/[=:]/', $value, 3);
			$hexbyte = $parts[0];
			$utf8 = isset($parts[1]) ? $parts[1] : '';
		} elseif ($detectedType == 'whitespaced') {
			$regA = array();
			if (!preg_match($whitespacedPattern, $value, $regA)) {
				// Line does not follow the table syntax; it carries no mapping.
				continue;
			}
			$hexbyte = $regA[1];
			$utf8 = 'U+'.$regA[2];
		}

		// Only bytes above the ASCII range are stored.
		$decval = hexdec(trim($hexbyte));
		if ($decval > 127) {
			$utf8decval = hexdec(substr(trim($utf8),2));	// strip the leading "U+"
			$utf8Char = $this->UnumberToChar($utf8decval);
			$this->parsedCharsets[$charset]['local'][$decval] = $utf8Char;
			$this->parsedCharsets[$charset]['utf8'][$utf8Char] = $decval;
		}
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->parsedCharsets[$charset]));
	}

	return 2;
}
1026
1027 /**
1028 * This function initializes all UTF-8 character data tables.
1029 *
1030 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
1031 *
1032 * @param string Mode ("case", "ascii", ...)
1033 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
1034 * @access private
1035 */
function initUnicodeData($mode=null) {
	// cache files
	$cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
	$cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');

	// Only process if the requested table is not yet loaded
	switch($mode) {
		case 'case':
			if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8'])) return 1;

			// Use cached version if possible; an unreadable cache is ignored and rebuilt below
			if ($cacheFileCase && @is_file($cacheFileCase)) {
				$cachedTable = unserialize(t3lib_div::getUrl($cacheFileCase));
				if (is_array($cachedTable)) {
					$this->caseFolding['utf-8'] = $cachedTable;
					return 2;
				}
			}
			break;

		case 'ascii':
			if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8'])) return 1;

			// Use cached version if possible; an unreadable cache is ignored and rebuilt below
			if ($cacheFileASCII && @is_file($cacheFileASCII)) {
				$cachedTable = unserialize(t3lib_div::getUrl($cacheFileASCII));
				if (is_array($cachedTable)) {
					$this->toASCII['utf-8'] = $cachedTable;
					return 2;
				}
			}
			break;
	}

	// process main Unicode data file
	$unicodeDataFile = PATH_t3lib.'unidata/UnicodeData.txt';
	if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) return false;

	$fh = fopen($unicodeDataFile,'rb');
	if (!$fh) return false;

	// key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
	// note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
	$this->caseFolding['utf-8'] = array();
	$utf8CaseFolding =& $this->caseFolding['utf-8'];	// a shorthand
	$utf8CaseFolding['toUpper'] = array();
	$utf8CaseFolding['toLower'] = array();
	$utf8CaseFolding['toTitle'] = array();

	$decomposition = array();	// array of temp. decompositions
	$mark = array();		// array of chars that are marks (eg. composing accents)
	$number = array();		// array of chars that are numbers (eg. digits)
	$omit = array();		// array of chars to be omitted (eg. Russian hard sign)

	while (!feof($fh)) {
		$line = fgets($fh,4096);
		if ($line === false) break;		// nothing more to read
		if (trim($line) == '') continue;	// blank lines carry no data

		// Fields used (0-based): 0 code point, 1 name, 2 category, 5 decomposition,
		// 8 numeric value, 12 upper, 13 lower, 14 title. Short lines are padded with ''.
		$fields = array_pad(explode(';', rtrim($line)), 15, '');
		$char = $fields[0];
		$name = $fields[1];
		$cat = $fields[2];
		$decomp = $fields[5];
		$num = $fields[8];
		$upper = $fields[12];
		$lower = $fields[13];
		$title = $fields[14];

		$ord = hexdec($char);
		if ($ord > 0xFFFF) break;	// only process the BMP

		$utf8_char = $this->UnumberToChar($ord);

		if ($upper) $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
		if ($lower) $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
			// store "title" only when different from "upper" (only a few)
		if ($title && $title != $upper) $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));

		// First letter of the general category
		switch (substr($cat,0,1)) {
			case 'M':	// mark (accent, umlaut, ...)
				$mark["U+$char"] = 1;
				break;

			case 'N':	// numeric value
				if ($ord > 0x80 && $num != '') $number["U+$char"] = $num;
				break;
		}

		// accented Latin letters without "official" decomposition
		$match = array();
		if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/',$name,$match) && !$decomp) {
			$c = ord($match[2]);
			if ($match[1] == 'SMALL') $c += 32;

			$decomposition["U+$char"] = array(dechex($c));
			continue;
		}

		$match = array();
		if (preg_match('/(<.*>)? *(.+)/',$decomp,$match)) {
			switch($match[1]) {
				case '<circle>':	// add parenthesis as circle replacement, eg (1)
					$match[2] = '0028 '.$match[2].' 0029';
					break;

				case '<square>':	// add square brackets as square replacement, eg [1]
					$match[2] = '005B '.$match[2].' 005D';
					break;

				case '<compat>':	// ignore multi char decompositions that start with a space
					if (preg_match('/^0020 /',$match[2])) continue 2;
					break;

				// ignore Arabic and vertical layout presentation decomposition
				case '<initial>':
				case '<medial>':
				case '<final>':
				case '<isolated>':
				case '<vertical>':
					continue 2;
			}
			$decomposition["U+$char"] = explode(' ', $match[2]);
		}
	}
	fclose($fh);

	// process additional Unicode data for casing (allow folded characters to expand into a sequence)
	$specialCasingFile = PATH_t3lib.'unidata/SpecialCasing.txt';
	if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
		$fh = fopen($specialCasingFile,'rb');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh,4096);
				if ($line === false) break;
				if (substr($line,0,1) == '#' || trim($line) == '') continue;	// comment or blank line

				list($char,$lower,$title,$upper,$cond) = array_pad(t3lib_div::trimExplode(';', $line), 5, '');
				// Only lines whose fifth field is empty or a comment are used
				if ($cond != '' && substr($cond,0,1) != '#') continue;

				$utf8_char = $this->UnumberToChar(hexdec($char));
				if ($char != $lower) {
					$arr = explode(' ', $lower);
					for ($i=0; isset($arr[$i]); $i++) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
					$utf8CaseFolding['toLower'][$utf8_char] = implode('',$arr);
				}
				if ($char != $title && $title != $upper) {
					$arr = explode(' ', $title);
					for ($i=0; isset($arr[$i]); $i++) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
					$utf8CaseFolding['toTitle'][$utf8_char] = implode('',$arr);
				}
				if ($char != $upper) {
					$arr = explode(' ', $upper);
					for ($i=0; isset($arr[$i]); $i++) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
					$utf8CaseFolding['toUpper'][$utf8_char] = implode('',$arr);
				}
			}
			fclose($fh);
		}
	}

	// process custom decompositions
	$customTranslitFile = PATH_t3lib.'unidata/Translit.txt';
	if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
		$fh = fopen($customTranslitFile,'rb');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh,4096);
				if ($line === false) break;
				if (substr($line,0,1) == '#' || trim($line) == '') continue;	// comment or blank line

				list($char,$translit) = array_pad(t3lib_div::trimExplode(';', $line), 2, '');
				// An empty transliteration means: drop this character from the output
				if (!$translit) $omit["U+$char"] = 1;
				$decomposition["U+$char"] = explode(' ', $translit);
			}
			fclose($fh);
		}
	}

	// decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
	foreach($decomposition as $from => $to) {
		$code_decomp = array();

		while ($code_value = array_shift($to)) {
			if (isset($decomposition["U+$code_value"])) {	// do recursive decomposition
				foreach(array_reverse($decomposition["U+$code_value"]) as $cv) {
					array_unshift($to, $cv);
				}
			} elseif (!isset($mark["U+$code_value"])) {	// remove mark
				array_push($code_decomp, $code_value);
			}
		}
		if (count($code_decomp) || isset($omit[$from])) {
			$decomposition[$from] = $code_decomp;
		} else {
			unset($decomposition[$from]);
		}
	}

	// create ascii only mapping
	$this->toASCII['utf-8'] = array();
	$ascii =& $this->toASCII['utf-8'];

	foreach($decomposition as $from => $to) {
		$code_decomp = array();
		while ($code_value = array_shift($to)) {
			$ord = hexdec($code_value);
			if ($ord > 127) {
				continue 2;	// skip decompositions containing non-ASCII chars
			}
			array_push($code_decomp,chr($ord));
		}
		// Keys have the form "U+XXXX": strip the prefix so hexdec() only sees hex digits
		$ascii[$this->UnumberToChar(hexdec(substr($from,2)))] = implode('',$code_decomp);
	}

	// add numeric decompositions
	foreach($number as $from => $to) {
		$utf8_char = $this->UnumberToChar(hexdec(substr($from,2)));
		if (!isset($ascii[$utf8_char])) {
			$ascii[$utf8_char] = $to;
		}
	}

	if ($cacheFileCase) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileCase,serialize($utf8CaseFolding));
	}

	if ($cacheFileASCII) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileASCII,serialize($ascii));
	}

	return 3;
}
1252
1253 /**
1254 * This function initializes the folding table for a charset other than UTF-8.
1255 * This function is automatically called by the case folding functions.
1256 *
1257 * @param string Charset for which to initialize case folding.
1258 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
1259 * @access private
1260 */
function initCaseFolding($charset) {
	// Only process if the case table is not yet loaded:
	if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset])) return 1;

	// Use cached version if possible; a cache that does not hold a table is ignored and rebuilt
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$cachedTable = unserialize(t3lib_div::getUrl($cacheFile));
		if (is_array($cachedTable)) {
			$this->caseFolding[$charset] = $cachedTable;
			return 2;
		}
	}

	// init UTF-8 conversion for this charset
	if (!$this->initCharset($charset)) {
		return false;
	}

	// UTF-8 case folding is used as the base conversion table
	if (!$this->initUnicodeData('case')) {
		return false;
	}

	$nochar = chr($this->noCharByteVal);
	$directions = array('toUpper', 'toLower', 'toTitle');

	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
		// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);

		foreach ($directions as $direction) {
			// Characters without a mapping in this direction are skipped
			if (!isset($this->caseFolding['utf-8'][$direction][$utf8])) {
				continue;
			}
			$cc = $this->utf8_decode($this->caseFolding['utf-8'][$direction][$utf8], $charset);
			// Only keep mappings whose target can be expressed in this charset
			if ($cc != '' && $cc != $nochar) {
				$this->caseFolding[$charset][$direction][$c] = $cc;
			}
		}
	}

	// add the ASCII case table
	for ($i=ord('a'); $i<=ord('z'); $i++) {
		$this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i-32);
	}
	for ($i=ord('A'); $i<=ord('Z'); $i++) {
		$this->caseFolding[$charset]['toLower'][chr($i)] = chr($i+32);
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->caseFolding[$charset]));
	}

	return 3;
}
1314
1315 /**
1316 * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
1317 * This function is automatically called by the ASCII transliteration functions.
1318 *
1319 * @param string Charset for which to initialize conversion.
1320 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
1321 * @access private
1322 */
function initToASCII($charset) {
	// Only process if the transliteration table is not yet loaded:
	if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) return 1;

	// Use cached version if possible; a cache that does not hold a table is ignored and rebuilt
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$cachedTable = unserialize(t3lib_div::getUrl($cacheFile));
		if (is_array($cachedTable)) {
			$this->toASCII[$charset] = $cachedTable;
			return 2;
		}
	}

	// init UTF-8 conversion for this charset
	if (!$this->initCharset($charset)) {
		return false;
	}

	// UTF-8/ASCII transliteration is used as the base conversion table
	if (!$this->initUnicodeData('ascii')) {
		return false;
	}

	// Start from an empty table so that a charset without any transliterable
	// character is still stored (and cached) as an array.
	$this->toASCII[$charset] = array();

	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
		if (isset($this->toASCII['utf-8'][$utf8])) {
			// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
			$c = $this->utf8_decode($utf8, $charset);
			$this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
		}
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->toASCII[$charset]));
	}

	return 3;
}
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376 /********************************************
1377 *
1378 * String operation functions
1379 *
1380 ********************************************/
1381
1382 /**
1383 * Returns a part of a string.
1384 * Unit-tested by Kasper (single byte charsets only)
1385 *
1386 * @param string The character set
1387 * @param string Character string
1388 * @param integer Start position (character position)
1389 * @param integer Length (in characters)
1390 * @return string The substring
1391 * @see substr(), mb_substr()
1392 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1393 */
function substr($charset,$string,$start,$len=null) {
	if ($len===0) return '';

	$utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];

	if ($utils == 'mbstring') {
			// cannot omit $len, when specifying charset
		if ($len==null) {
			$enc = mb_internal_encoding();	// save internal encoding
			mb_internal_encoding($charset);
			$str = mb_substr($string,$start);
			mb_internal_encoding($enc);	// restore internal encoding

			return $str;
		}
		return mb_substr($string,$start,$len,$charset);
	}

	if ($utils == 'iconv') {
			// cannot omit $len, when specifying charset
		if ($len==null) {
			$enc = iconv_get_encoding('internal_encoding');	// save internal encoding
			iconv_set_encoding('internal_encoding',$charset);
			$str = iconv_substr($string,$start);
			iconv_set_encoding('internal_encoding',$enc);	// restore internal encoding

			return $str;
		}
		return iconv_substr($string,$start,$len,$charset);
	}

	if ($charset == 'utf-8') {
		return $this->utf8_substr($string,$start,$len);
	}
	if (!empty($this->eucBasedSets[$charset])) {
		return $this->euc_substr($string,$start,$charset,$len);
	}

	// Fixed-width charsets: character positions translate directly into byte positions.
	// Without a length the rest of the string is returned; multiplying NULL by the
	// character width would ask for a zero-length substring.
	if (!empty($this->twoByteSets[$charset])) {
		return $len === NULL ? substr($string,$start*2) : substr($string,$start*2,$len*2);
	}
	if (!empty($this->fourByteSets[$charset])) {
		return $len === NULL ? substr($string,$start*4) : substr($string,$start*4,$len*4);
	}

	// treat everything else as single-byte encoding
	return $len === NULL ? substr($string,$start) : substr($string,$start,$len);
}
1436
1437 /**
1438 * Counts the number of characters.
1439 * Unit-tested by Kasper (single byte charsets only)
1440 *
1441 * @param string The character set
1442 * @param string Character string
1443 * @return integer The number of characters
1444 * @see strlen()
1445 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1446 */
function strlen($charset,$string) {
	$utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];

	if ($utils == 'mbstring') {
		return mb_strlen($string,$charset);
	}
	if ($utils == 'iconv') {
		return iconv_strlen($string,$charset);
	}
	if ($charset == 'utf-8') {
		return $this->utf8_strlen($string);
	}

	// The lookups use !empty() so that a charset missing from a table
	// does not trigger an undefined-index notice.
	if (!empty($this->eucBasedSets[$charset])) {
		return $this->euc_strlen($string,$charset);
	}
	if (!empty($this->twoByteSets[$charset])) {
		return strlen($string)/2;
	}
	if (!empty($this->fourByteSets[$charset])) {
		return strlen($string)/4;
	}

	// treat everything else as single-byte encoding
	return strlen($string);
}
1464
1465 /**
1466 * Truncates a string and pre-/appends a string.
1467 * Unit tested by Kasper
1468 *
1469 * @param string The character set
1470 * @param string Character string
1471 * @param integer Length (in characters)
1472 * @param string Crop signifier
1473 * @return string The shortened string
1474 * @see substr(), mb_strimwidth()
1475 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1476 */
function crop($charset,$string,$len,$crop='') {
	if (intval($len) == 0) return $string;

	// Translate the character count into a byte position ($i). FALSE means
	// that $len reaches beyond the string, i.e. nothing has to be cropped.
	if ($charset == 'utf-8') {
		$i = $this->utf8_char2byte_pos($string,$len);
	} elseif (!empty($this->eucBasedSets[$charset])) {
		$i = $this->euc_char2byte_pos($string,$len,$charset);
	} elseif ($len > 0) {
		$i = $len;
	} else {
		$i = strlen($string)+$len;
		if ($i<=0) $i = false;
	}

	if ($i === false) {	// $len outside actual string length
		return $string;
	}

	$byteLength = strlen($string);
	if ($len > 0) {
		// Crop at the end, but only if there really is something behind byte position $i.
		if ($i >= 0 && $i < $byteLength) {
			return substr($string,0,$i).$crop;
		}
	} elseif ($i > 0 && $i <= $byteLength) {
		// Crop at the start, but only if at least one byte precedes position $i.
		// The explicit range check keeps a position of 0 from being read as a
		// negative string offset.
		return $crop.substr($string,$i);
	}

	return $string;
}
1519
1520 /**
1521 * Cuts a string short at a given byte length.
1522 *
1523 * @param string The character set
1524 * @param string Character string
1525 * @param integer The byte length
1526 * @return string The shortened string
1527 * @see mb_strcut()
1528 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1529 */
function strtrunc($charset,$string,$len) {
	if ($len <= 0) return '';

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strcut($string,0,$len,$charset);
	}
	if ($charset == 'utf-8') {
		return $this->utf8_strtrunc($string,$len);
	}
	if (!empty($this->eucBasedSets[$charset])) {
		// The byte length has to be handed on, just as for utf8_strtrunc() above.
		// NOTE(review): argument order ($str,$len,$charset) is assumed by analogy with euc_char2byte_pos() - confirm against euc_strtrunc()'s signature.
		return $this->euc_strtrunc($string,$len,$charset);
	}

	if (!empty($this->twoByteSets[$charset])) {
		if ($len % 2) $len--;		// don't cut at odd positions
	} elseif (!empty($this->fourByteSets[$charset])) {
		$len -= $len % 4;		// realign to position dividable by four
	}

	// treat everything else as single-byte encoding
	return substr($string,0,$len);
}
1548
1549 /**
1550 * Translates all characters of a string into their respective case values.
1551 * Unlike strtolower() and strtoupper() this method is locale independent.
1552 * Note that the string length may change!
1553 * eg. lower case German ß (sharp S) becomes upper case "SS"
1554 * Unit-tested by Kasper
1555 * Real case folding is language dependent, this method ignores this fact.
1556 *
1557 * @param string Character set of string
1558 * @param string Input string to convert case for
1559 * @param string Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
1560 * @return string The converted string
1561 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1562 * @see strtolower(), strtoupper()
1563 */
function conv_case($charset,$string,$case) {
	// mbstring handles every charset itself: "toLower" lowers, any other keyword uppers.
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return $case == 'toLower'
			? mb_strtolower($string,$charset)
			: mb_strtoupper($string,$charset);
	}

	// Otherwise pick the internal mapping routine that fits the charset family.
	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string,'case',$case);
	}
	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string,$charset,'case',$case);
	}

	// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string,$charset,'case',$case);
}
1582
1583 /**
1584 * Converts special chars (like æøå, umlauts etc) to ascii equivalents (usually double-bytes, like æ => ae etc.)
1585 *
1586 * @param string Character set of string
1587 * @param string Input string to convert
1588 * @return string The converted string
1589 */
function specCharsToASCII($charset,$string) {
	// Dispatch to the transliteration routine that fits the charset family.
	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string,'ascii');
	}
	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string,$charset,'ascii');
	}

	// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string,$charset,'ascii');
}
1602
1603
1604 /**
1605 * converts the language codes that we get from the client (usually HTTP_ACCEPT_LANGUAGE)
1606 * into a TYPO3-readable language code
1607 * @param $languageCodesList list of language codes. something like 'de,en-us;q=0.9,de-de;q=0.7,es-cl;q=0.6,en;q=0.4,es;q=0.3,zh;q=0.1'
1608 * see http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4
1609 * @return string a preferred language that TYPO3 supports, or "default" if none found
1610 * @author Benjamin Mack (benni.typo3.org)
1611 */
public function getPreferredClientLanguage($languageCodesList) {
	$allLanguageCodes = array();
	$selectedLanguage = 'default';

	// get all languages where TYPO3 code is the same as the ISO code
	foreach ($this->charSetArray as $typo3Lang => $charSet) {
		$allLanguageCodes[$typo3Lang] = strtolower($typo3Lang);
	}

	// get all languages where TYPO3 code differs from ISO code
	// or needs the country part
	// the iso codes will here overwrite the default typo3 language in the key
	foreach ($this->isoArray as $typo3Lang => $isoLang) {
		$allLanguageCodes[$typo3Lang] = strtolower(str_replace('_', '-', $isoLang));
	}

	// move the (lowercased) iso codes to the keys, because they are looked up with isset() below;
	// language tags are compared case-insensitively
	$allLanguageCodes = array_flip($allLanguageCodes);

	// Split the header into language tags with their quality value and their position in the list
	$codes = array();
	$qualities = array();
	$positions = array();
	foreach (explode(',', (string)$languageCodesList) as $position => $entry) {
		$parameters = explode(';', $entry);
		$code = strtolower(trim(array_shift($parameters)));
		if ($code === '') {
			continue;
		}

		// a missing quality value means 1.0
		$quality = 1.0;
		foreach ($parameters as $parameter) {
			$match = array();
			if (preg_match('/^\s*q\s*=\s*([0-9.]+)\s*$/i', $parameter, $match)) {
				$quality = (float)$match[1];
			}
		}

		// q=0 marks a language as not acceptable
		if ($quality <= 0) {
			continue;
		}

		$codes[] = $code;
		$qualities[] = $quality;
		$positions[] = $position;
	}

	// highest quality first; languages of equal quality keep the order of the header
	if (count($codes) > 1) {
		array_multisort($qualities, SORT_DESC, SORT_NUMERIC, $positions, SORT_ASC, SORT_NUMERIC, $codes);
	}

	foreach ($codes as $code) {
		if (isset($allLanguageCodes[$code])) {
			$selectedLanguage = $allLanguageCodes[$code];
			break;
		}

		// strip the country code from the end and try the bare language
		$dashPosition = strpos($code, '-');
		if ($dashPosition !== false) {
			$languageOnly = substr($code, 0, $dashPosition);
			if (isset($allLanguageCodes[$languageOnly])) {
				$selectedLanguage = $allLanguageCodes[$languageOnly];
				break;
			}
		}
	}

	// English is reported as "default"
	if (!$selectedLanguage || $selectedLanguage == 'en') {
		$selectedLanguage = 'default';
	}
	return $selectedLanguage;
}
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674 /********************************************
1675 *
1676 * Internal string operation functions
1677 *
1678 ********************************************/
1679
1680 /**
1681 * Maps all characters of a string in a single byte charset.
1682 *
1683 * @param string the string
1684 * @param string the charset
1685 * @param string mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
1686 * @param string 'case': conversion 'toLower' or 'toUpper'
1687 * @return string the converted string
1688 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1689 */
function sb_char_mapping($str,$charset,$mode,$opt='') {
	// Select the mapping table; an unknown mode or a table that cannot be
	// initialised leaves the string untouched.
	switch($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) return $str;	// do nothing
			$map =& $this->caseFolding[$charset][$opt];
			break;

		case 'ascii':
			if (!$this->initToASCII($charset)) return $str;	// do nothing
			$map =& $this->toASCII[$charset];
			break;

		default:
			return $str;
	}

	// Translate byte by byte. The loop is bounded by the string length, so it
	// never reads an offset beyond the end of the string.
	$str = (string)$str;
	$length = strlen($str);
	$out = '';
	for ($i=0; $i<$length; $i++) {
		$c = $str[$i];
		$out .= isset($map[$c]) ? $map[$c] : $c;
	}

	return $out;
}
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728 /********************************************
1729 *
1730 * Internal UTF-8 string operation functions
1731 *
1732 ********************************************/
1733
1734 /**
1735 * Returns a part of a UTF-8 string.
1736 * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
1737 *
1738 * @param string UTF-8 string
1739 * @param integer Start position (character position)
1740 * @param integer Length (in characters)
1741 * @return string The substring
1742 * @see substr()
1743 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1744 */
function utf8_substr($str,$start,$len=null) {
	// An explicit length of zero (0 or "0") always yields an empty string.
	if ($len !== null && !strcmp($len,'0')) return '';

	$byte_start = $this->utf8_char2byte_pos($str,$start);
	if ($byte_start === false) {
		if ($start > 0) {
			return false;	// $start outside string length
		}
		// A negative $start reaching beyond the beginning starts at the first byte.
		$byte_start = 0;
	}

	$str = substr($str,$byte_start);

	// No length given: everything from the start position on.
	if ($len == null) {
		return $str;
	}

	$byte_end = $this->utf8_char2byte_pos($str,$len);
	if ($byte_end === false) {	// $len outside actual string length
		return $len<0 ? '' : $str;	// When length is less than zero and exceeds, then we return blank string.
	}
	return substr($str,0,$byte_end);
}
1768
1769 /**
1770 * Counts the number of characters of a string in UTF-8.
1771 * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
1772 *
1773 * @param string UTF-8 multibyte character string
1774 * @return integer The number of characters
1775 * @see strlen()
1776 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1777 */
function utf8_strlen($str) {
	$str = (string)$str;
	$byteLength = strlen($str);

	// Every character contributes exactly one byte that is not a continuation
	// byte (10xxxxxx): either a single byte (0xxxxxxx) or a starting byte (11xxxxxx).
	$n = 0;
	for ($i=0; $i<$byteLength; $i++) {
		if ((ord($str[$i]) & 0xC0) != 0x80) {
			$n++;
		}
	}
	return $n;
}
1789
1790 /**
1791 * Truncates a string in UTF-8 short at a given byte length.
1792 *
1793 * @param string UTF-8 multibyte character string
1794 * @param integer the byte length
1795 * @return string the shortened string
1796 * @see mb_strcut()
1797 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1798 */
function utf8_strtrunc($str,$len) {
	$str = (string)$str;
	$len = (int)$len;
	$byteLength = strlen($str);

	if ($len <= 0) return '';		// nothing may be kept
	if ($len > $byteLength) return $str;	// the whole string fits

	$i = $len-1;
	if (ord($str[$i]) & 0x80) {	// the last byte kept is part of a multibyte sequence
		// walk back over continuation bytes (10xxxxxx) to the byte starting the sequence
		while ($i > 0 && (ord($str[$i]) & 0xC0) == 0x80) {
			$i--;
		}
		$lead = ord($str[$i]);

		// malformed input: no starting byte in front of the continuation bytes
		if (($lead & 0xC0) == 0x80) return '';
		if (!($lead & 0x80)) return substr($str,0,$i+1);

		// the number of leading 1-bits in the starting byte is the length of the sequence
		for ($bc=0, $mbs=$lead; $mbs & 0x80; $mbs = ($mbs << 1) & 0xFF) $bc++;

		// drop the character if it does not fit completely into $len bytes
		if ($bc+$i > $len) return substr($str,0,$i);
		// fallthru: multibyte char fits into length
	}
	return substr($str,0,$len);
}
1810
/**
 * Find position of first occurrence of a string, both arguments are in UTF-8.
 *
 * Depending on $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'] the search is
 * delegated to mb_strpos() or iconv_strpos(). Otherwise the character offset
 * is translated to a byte offset, the search is done byte-wise with strpos()
 * and the byte position found is translated back to a character position.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 string to search for
 * @param	integer		Position to start the search (in characters)
 * @return	integer		The character position; false if the offset could not be resolved or the needle was not found
 * @see strpos()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strpos($haystack,$needle,$offset=0) {
	$backend = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];

	if ($backend == 'mbstring') {
		return mb_strpos($haystack,$needle,$offset,'utf-8');
	}
	if ($backend == 'iconv') {
		return iconv_strpos($haystack,$needle,$offset,'utf-8');
	}

		// native fallback: character offset -> byte offset -> strpos() -> character position
	$startByte = $this->utf8_char2byte_pos($haystack,$offset);
	if ($startByte !== false) {
		$foundByte = strpos($haystack,$needle,$startByte);
		if ($foundByte !== false) {
			return $this->utf8_byte2char_pos($haystack,$foundByte);
		}
	}

	return false;	// offset beyond string length, or needle not found
}
1836
/**
 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
 *
 * Depending on $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'] the search is
 * delegated to mb_strrpos() or iconv_strrpos(). Otherwise strrpos() is used
 * byte-wise and the byte position found is translated to a character position.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 character to search for (single character)
 * @return	integer		The character position; false if the needle was not found
 * @see strrpos()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strrpos($haystack,$needle) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
			// mb_strrpos() expects the offset as third and the encoding as fourth
			// argument; passing 'utf-8' in the offset slot fails on PHP 8.
		return mb_strrpos($haystack,$needle,0,'utf-8');
	} elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
			// iconv_strrpos() has no offset parameter, the encoding is the third argument
		return iconv_strrpos($haystack,$needle,'utf-8');
	}

	$byte_pos = strrpos($haystack,$needle);
	if ($byte_pos === false) {
		return false;	// needle not found
	}

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
1858
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * Positive positions are counted from the start of the string, negative
 * positions from its end (-1 is the last character).
 *
 * All reads are bounds-checked. The previous implementation detected the end
 * of the string by reading a non-existing offset, which wraps around for
 * negative offsets on PHP >= 7.1 and produced negative "byte positions".
 * A negative position that addresses exactly the first character now
 * yields 0 on every PHP version.
 *
 * @param	string		UTF-8 string
 * @param	integer		Character position (negative values start from the end)
 * @return	integer		Byte position; false if the position lies outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char2byte_pos($str,$pos) {
	$str = (string)$str;
	$pos = (int)$pos;
	$length = strlen($str);
	$n = 0;	// number of characters found
	$p = abs($pos);	// number of characters wanted

	if ($pos >= 0) {
			// walk forward until $p character starts (non-continuation bytes) were seen
		for ($i=0; $i<$length && $n<$p; $i++) {
			if ((ord($str[$i]) & 0xC0) != 0x80) {
				$n++;
			}
		}
		if ($i >= $length) {
			return false;	// offset beyond string length
		}

			// skip trailing multi-byte data bytes of the last character counted
		while ($i<$length && (ord($str[$i]) & 0xC0) == 0x80) {
			$i++;
		}

		return $i;
	}

		// walk backwards until $p character starts were seen
	for ($i=$length-1; $i>=0 && $n<$p; $i--) {
		if ((ord($str[$i]) & 0xC0) != 0x80) {
			$n++;
		}
	}
	if ($n < $p) {
		return false;	// string holds fewer than $p characters
	}

		// the loop stepped one byte in front of the character start: correct offset
	return $i+1;
}
1899
/**
 * Translates an 'absolute' byte position into a character position.
 *
 * Counts the character starts (bytes that are not 10xxxxxx continuation
 * bytes) found in the bytes 1..$pos of the string.
 *
 * The range check is done up front. The former check ran after the counting
 * loop, when the index had already returned to 0, so a byte position beyond
 * the string was not detected and the missing bytes were counted as
 * characters. A position equal to the byte length is now rejected as well.
 *
 * @param	string		UTF-8 string
 * @param	integer		byte position
 * @return	integer		character position; false if the byte position does not address a byte of the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_byte2char_pos($str,$pos) {
	$str = (string)$str;
	$pos = (int)$pos;

	if ($pos < 0 || $pos >= strlen($str)) {
		return false;	// offset beyond string length
	}

	$n = 0;	// number of characters
	for ($i=$pos; $i>0; $i--) {
		if ((ord($str[$i]) & 0xC0) != 0x80) {	// single-byte or multi-byte starting byte
			$n++;
		}
	}

	return $n;
}
1922
/**
 * Maps all characters of an UTF-8 string.
 *
 * The string is split into single characters and every character that has an
 * entry in the selected mapping table is replaced by that entry; all other
 * characters are copied unchanged. If initUnicodeData($mode) reports failure
 * or $mode is unknown, the string is returned as is.
 *
 * @param	string		UTF-8 string
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char_mapping($str,$mode,$opt='') {
	if (!$this->initUnicodeData($mode)) {
		return $str;	// do nothing
	}

	switch($mode) {
		case 'case':
			$map =& $this->caseFolding['utf-8'][$opt];
		break;

		case 'ascii':
			$map =& $this->toASCII['utf-8'];
		break;

		default:
			return $str;
	}

	$str = (string)$str;
	$length = strlen($str);
	$out = '';

	for ($i=0; $i<$length; $i++) {
		$c = ord($str[$i]);
		if (($c & 0xC0) == 0xC0) {	// multi-byte starting byte (11xxxxxx)
			for ($bc=0; $c & 0x80; $c = $c << 1) { $bc++; }	// calculate number of bytes
			$mbc = substr($str,$i,$bc);
			$i += $bc-1;
		} else {
				// Single-byte character (0xxxxxxx), or a stray continuation byte in
				// malformed input. The stray byte is copied through; previously it
				// matched no branch and the preceding character was emitted again.
			$mbc = $str[$i];
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986 /********************************************
1987 *
1988 * Internal EUC string operation functions
1989 *
1990 * Extended Unix Code:
1991 * ASCII compatible 7bit single bytes chars
1992 * 8bit two byte chars
1993 *
1994 * Shift-JIS is treated as a special case.
1995 *
1996 ********************************************/
1997
/**
 * Cuts a string in the EUC charset family short at a given byte length.
 *
 * The result never ends on the first byte of a double-byte character: if
 * such a character straddles the cut position, it is dropped completely.
 * For 'shift_jis' the lead bytes are 0x80-0x9F and 0xE0-0xFF, for all other
 * charsets every byte >= 0x80 starts a double-byte character.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		the byte length
 * @param	string		the charset
 * @return	string		the shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strtrunc($str,$len,$charset) {
	$str = (string)$str;
	$length = strlen($str);

		// Compare real byte lengths. The old test ("scan index ran off the
		// string") also fired when the last double-byte character straddled
		// $len, and then returned the whole string although it was longer than $len.
	if ($length == 0 || $length <= $len) {
		return $str;	// string not longer than supplied length
	}

	$sjis = ($charset == 'shift_jis');
	for ($i=0; $i<$len; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++;	// advance a double-byte char
		} else {
			if ($c >= 0x80) $i++;	// advance a double-byte char
		}
	}

	if ($i > $len) {
		return substr($str,0,$len-1);	// we ended on a first byte
	}

	return substr($str,0,$len);
}
2027
/**
 * Returns a part of a string in the EUC charset family.
 *
 * Start and length are character positions and are translated to byte
 * positions with euc_char2byte_pos(). Out-of-range values are treated like
 * utf8_substr() treats them: a negative start reaching in front of the
 * string starts at its first character, a negative length reaching in front
 * of the start yields an empty string.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		start position (character position)
 * @param	string		the charset
 * @param	integer		length (in characters); null means "up to the end of the string", 0 yields an empty string
 * @return	string		the substring; false if a positive start lies outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_substr($str,$start,$charset,$len=null) {
	$byte_start = $this->euc_char2byte_pos($str,$start,$charset);
	if ($byte_start === false) {
		if ($start > 0) {
			return false;	// $start outside string length
		}
		$byte_start = 0;	// negative start in front of the string: begin at the first character
	}

	$str = substr($str,$byte_start);

		// Only a missing length means "rest of the string". The former loose
		// test "$len != null" also swallowed the integer 0 and returned the
		// whole rest instead of an empty string.
	if ($len === null || $len === '' || $len === false) {
		return $str;
	}

	$byte_end = $this->euc_char2byte_pos($str,$len,$charset);
	if ($byte_end === false) {	// $len outside actual string length
		return $len<0 ? '' : $str;	// a negative length exceeding the string leaves nothing
	}

	return substr($str,0,$byte_end);
}
2053
/**
 * Counts the number of characters of a string in the EUC charset family.
 *
 * For 'shift_jis' a byte in 0x80-0x9F or 0xE0-0xFF is taken as the first
 * byte of a double-byte character; for all other charsets every byte
 * >= 0x80 is. All remaining bytes count as single-byte characters.
 *
 * @param	string		EUC multibyte character string
 * @param	string		the charset
 * @return	integer		the number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strlen($str,$charset) {
	$str = (string)$str;
	$length = strlen($str);
	$sjis = ($charset == 'shift_jis');
	$n = 0;

		// Iterate by byte length instead of probing $str{$i} until it is empty:
		// the brace syntax is gone in PHP 8 and the probe read one byte past the end.
	for ($i=0; $i<$length; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++;	// advance a double-byte char
		} else {
			if ($c >= 0x80) $i++;	// advance a double-byte char
		}

		$n++;
	}

	return $n;
}
2080
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * The string is always scanned from its start. A negative position is first
 * converted into the equivalent position from the start by counting the
 * characters. Scanning backwards is not reliable for 'shift_jis', where the
 * second byte of a double-byte character may lie in the ASCII range, and the
 * former end-of-string probe wrapped around on PHP >= 7.1 for negative
 * offsets. A negative position that addresses exactly the first character
 * yields 0.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		character position (negative values start from the end)
 * @param	string		the charset
 * @return	integer		byte position; false if the position lies outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char2byte_pos($str,$pos,$charset) {
	$str = (string)$str;
	$pos = (int)$pos;
	$length = strlen($str);
	$sjis = ($charset == 'shift_jis');

	if ($pos < 0) {
			// count all characters to turn "from the end" into "from the start"
		$count = 0;
		for ($i=0; $i<$length; $i++) {
			$c = ord($str[$i]);
			if ($sjis) {
				if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++;	// advance a double-byte char
			} else {
				if ($c >= 0x80) $i++;	// advance a double-byte char
			}
			$count++;
		}

		$pos += $count;
		if ($pos < 0) {
			return false;	// offset in front of the string
		}
	}

	$n = 0;	// number of characters seen
	for ($i=0; $i<$length && $n<$pos; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++;	// advance a double-byte char
		} else {
			if ($c >= 0x80) $i++;	// advance a double-byte char
		}

		$n++;
	}

	if ($i >= $length) {
		return false;	// offset beyond string length
	}

	return $i;
}
2120
/**
 * Maps all characters of a string in the EUC charset family.
 *
 * The string is split into single- and double-byte characters and every
 * character that has an entry in the selected mapping table is replaced by
 * that entry; all other characters are copied unchanged. If the table for
 * the charset cannot be initialized or $mode is unknown, the string is
 * returned as is.
 *
 * @param	string		EUC multibyte character string
 * @param	string		the charset
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char_mapping($str,$charset,$mode,$opt='') {
	switch($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) return $str;	// do nothing
			$map =& $this->caseFolding[$charset][$opt];
		break;

		case 'ascii':
			if (!$this->initToASCII($charset)) return $str;	// do nothing
			$map =& $this->toASCII[$charset];
		break;

		default:
			return $str;
	}

	$str = (string)$str;
	$length = strlen($str);
	$sjis = ($charset == 'shift_jis');
	$out = '';

		// Iterate by byte length instead of probing $str{$i} until it is empty:
		// the brace syntax is gone in PHP 8 and the probe read one byte past the end.
	for ($i=0; $i<$length; $i++) {
		$mbc = $str[$i];
		$c = ord($mbc);

		if ($sjis) {
			$isDoubleByte = (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0));
		} else {
			$isDoubleByte = ($c >= 0x80);
		}

		if ($isDoubleByte) {	// a double-byte char
			$mbc = substr($str,$i,2);
			$i++;
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
2175
2176 }
2177
2178 if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']) {
2179 include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
2180 }
2181
2182 ?>