Revert "[RELEASE] Release of TYPO3 4.6.0alpha1"
[Packages/TYPO3.CMS.git] / t3lib / class.t3lib_cs.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2003-2011 Kasper Skårhøj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the Typo3 project. The Typo3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 *
17 * This script is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * This copyright notice MUST APPEAR in all copies of the script!
23 ***************************************************************/
24 /**
25 * Class for conversion between charsets.
26 *
27 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
28 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
29 */
30 /**
31 * [CLASS/FUNCTION INDEX of SCRIPT]
32 *
33 *
34 *
35 * 136: class t3lib_cs
36 * 488: function parse_charset($charset)
37 * 507: function get_locale_charset($locale)
38 *
39 * SECTION: Charset Conversion functions
40 * 560: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
41 * 600: function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0)
42 * 617: function utf8_encode($str,$charset)
43 * 663: function utf8_decode($str,$charset,$useEntityForNoChar=0)
44 * 706: function utf8_to_entities($str)
45 * 739: function entities_to_utf8($str,$alsoStdHtmlEnt=0)
46 * 773: function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
47 * 823: function UnumberToChar($cbyte)
48 * 868: function utf8CharToUnumber($str,$hex=0)
49 *
50 * SECTION: Init functions
51 * 911: function initCharset($charset)
52 * 973: function initUnicodeData($mode=NULL)
53 * 1198: function initCaseFolding($charset)
54 * 1260: function initToASCII($charset)
55 *
56 * SECTION: String operation functions
57 * 1331: function substr($charset,$string,$start,$len=NULL)
58 * 1384: function strlen($charset,$string)
59 * 1414: function crop($charset,$string,$len,$crop='')
60 * 1467: function strtrunc($charset,$string,$len)
61 * 1501: function conv_case($charset,$string,$case)
62 * 1527: function specCharsToASCII($charset,$string)
63 *
64 * SECTION: Internal string operation functions
65 * 1567: function sb_char_mapping($str,$charset,$mode,$opt='')
66 *
67 * SECTION: Internal UTF-8 string operation functions
68 * 1622: function utf8_substr($str,$start,$len=NULL)
69 * 1655: function utf8_strlen($str)
70 * 1676: function utf8_strtrunc($str,$len)
71 * 1698: function utf8_strpos($haystack,$needle,$offset=0)
72 * 1723: function utf8_strrpos($haystack,$needle)
73 * 1745: function utf8_char2byte_pos($str,$pos)
74 * 1786: function utf8_byte2char_pos($str,$pos)
75 * 1809: function utf8_char_mapping($str,$mode,$opt='')
76 *
77 * SECTION: Internal EUC string operation functions
78 * 1885: function euc_strtrunc($str,$len,$charset)
79 * 1914: function euc_substr($str,$start,$charset,$len=NULL)
80 * 1939: function euc_strlen($str,$charset)
81 * 1966: function euc_char2byte_pos($str,$pos,$charset)
82 * 2007: function euc_char_mapping($str,$charset,$mode,$opt='')
83 *
84 * TOTAL FUNCTIONS: 35
85 * (This index is automatically created/updated by the extension "extdeveval")
86 *
87 */
88
89
90 /**
91 * Notes on UTF-8
92 *
93 * Functions working on UTF-8 strings:
94 *
95 * - strchr/strstr
96 * - strrchr
97 * - substr_count
98 * - implode/explode/join
99 *
100 * Functions nearly working on UTF-8 strings:
101 *
102 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
103 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
104 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
105 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
106 * - preg_*: PCRE support is compiled into PHP by default nowadays, but could be unavailable; patterns need the "u" (UTF-8) modifier
107 *
108 * Functions NOT working on UTF-8 strings:
109 *
110 * - str*cmp
111 * - stristr
112 * - stripos
113 * - substr
114 * - strrev
115 * - split/spliti
116 * - ...
117 *
118 */
119 /**
120 * Class for conversion between charsets
121 *
122 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
123 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
124 * @package TYPO3
125 * @subpackage t3lib
126 */
127 class t3lib_cs {
/**
 * Byte value written in place of a character that has no equivalent
 * in the target charset (63 is the ASCII question mark).
 */
var $noCharByteVal = 63;

/**
 * Cache of parsed conversion tables, indexed by charset name.
 */
var $parsedCharsets = array();

/**
 * Cache of case folding data.
 */
var $caseFolding = array();

/**
 * Cache of charset-to-ASCII mappings.
 */
var $toASCII = array();

/**
 * Charsets that use two bytes per character (charset name => 1).
 */
var $twoByteSets = array(
	'ucs-2' => 1, // 2-byte Unicode
);

/**
 * Charsets that use four bytes per character (charset name => 1).
 */
var $fourByteSets = array(
	'ucs-4' => 1, // 4-byte Unicode
	'utf-32' => 1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
);

/**
 * Charsets that use a scheme like the Extended Unix Code (charset name => 1).
 */
var $eucBasedSets = array(
	'gb2312' => 1, // Chinese, simplified.
	'big5' => 1, // Chinese, traditional.
	'euc-kr' => 1, // Korean
	'shift_jis' => 1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
);
157
/**
 * Charset aliases (lowercase) mapped to the canonical charset name used by this class.
 * Consulted by parse_charset() after the input has been trimmed and lowercased.
 *
 * see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
 * http://czyborra.com/charsets/iso8859.html
 */
var $synonyms = array(
	'us' => 'ascii',
	'us-ascii' => 'ascii',
	'cp819' => 'iso-8859-1',
	'ibm819' => 'iso-8859-1',
	'iso-ir-100' => 'iso-8859-1',
	'iso-ir-101' => 'iso-8859-2',
	'iso-ir-109' => 'iso-8859-3',
	'iso-ir-110' => 'iso-8859-4',
	'iso-ir-144' => 'iso-8859-5',
	'iso-ir-127' => 'iso-8859-6',
	'iso-ir-126' => 'iso-8859-7',
	'iso-ir-138' => 'iso-8859-8',
	'iso-ir-148' => 'iso-8859-9',
	'iso-ir-157' => 'iso-8859-10',
	'iso-ir-179' => 'iso-8859-13',
	'iso-ir-199' => 'iso-8859-14',
	'iso-ir-203' => 'iso-8859-15',
	'csisolatin1' => 'iso-8859-1',
	'csisolatin2' => 'iso-8859-2',
	'csisolatin3' => 'iso-8859-3',
	'csisolatin5' => 'iso-8859-9',
	'csisolatin8' => 'iso-8859-14',
	'csisolatin9' => 'iso-8859-15',
	'csisolatingreek' => 'iso-8859-7',
	'iso-celtic' => 'iso-8859-14',
	'latin1' => 'iso-8859-1',
	'latin2' => 'iso-8859-2',
	'latin3' => 'iso-8859-3',
	'latin5' => 'iso-8859-9',
	'latin6' => 'iso-8859-10',
	'latin8' => 'iso-8859-14',
	'latin9' => 'iso-8859-15',
	'l1' => 'iso-8859-1',
	'l2' => 'iso-8859-2',
	'l3' => 'iso-8859-3',
	'l5' => 'iso-8859-9',
	'l6' => 'iso-8859-10',
	'l8' => 'iso-8859-14',
	'l9' => 'iso-8859-15',
	'cyrillic' => 'iso-8859-5',
	'arabic' => 'iso-8859-6',
	'tis-620' => 'iso-8859-11',
	'win874' => 'windows-874',
	'win1250' => 'windows-1250',
	'win1251' => 'windows-1251',
	'win1252' => 'windows-1252',
	'win1253' => 'windows-1253',
	'win1254' => 'windows-1254',
	'win1255' => 'windows-1255',
	'win1256' => 'windows-1256',
	'win1257' => 'windows-1257',
	'win1258' => 'windows-1258',
	// "cp" aliases: the full Windows code page range is listed so that every
	// windows-* charset above can also be addressed by its code page number.
	'cp874' => 'windows-874',
	'cp1250' => 'windows-1250',
	'cp1251' => 'windows-1251',
	'cp1252' => 'windows-1252',
	'cp1253' => 'windows-1253',
	'cp1254' => 'windows-1254',
	'cp1255' => 'windows-1255',
	'cp1256' => 'windows-1256',
	'cp1257' => 'windows-1257',
	'cp1258' => 'windows-1258',
	'ms-ee' => 'windows-1250',
	'ms-ansi' => 'windows-1252',
	'ms-greek' => 'windows-1253',
	'ms-turk' => 'windows-1254',
	'winbaltrim' => 'windows-1257',
	'koi-8ru' => 'koi-8r',
	'koi8r' => 'koi-8r',
	'cp878' => 'koi-8r',
	'mac' => 'macroman',
	'macintosh' => 'macroman',
	'euc-cn' => 'gb2312',
	'x-euc-cn' => 'gb2312',
	'euccn' => 'gb2312',
	'cp936' => 'gb2312',
	'big-5' => 'big5',
	'cp950' => 'big5',
	'eucjp' => 'euc-jp',
	'sjis' => 'shift_jis',
	'shift-jis' => 'shift_jis',
	'cp932' => 'shift_jis',
	'cp949' => 'euc-kr',
	// Unicode encodings written without the dash (the 'utf8' key used to be declared twice)
	'utf7' => 'utf-7',
	'utf8' => 'utf-8',
	'utf16' => 'utf-16',
	'utf32' => 'utf-32',
	'ucs2' => 'ucs-2',
	'ucs4' => 'ucs-4',
);
245
/**
 * Mapping of language identifiers (lowercase) to script/language-family names.
 * The resulting names are the keys of $script_to_charset_unix and
 * $script_to_charset_windows. get_locale_charset() looks up the language part
 * of a locale string here.
 */
var $lang_to_script = array(
	// iso-639-1 language codes, see http://www.loc.gov/standards/iso639-2/php/code_list.php
	'ar' => 'arabic',
	'bg' => 'cyrillic', // Bulgarian
	'bs' => 'east_european', // Bosnian
	'cs' => 'east_european', // Czech
	'da' => 'west_european', // Danish
	'de' => 'west_european', // German
	'es' => 'west_european', // Spanish
	'et' => 'estonian',
	'eo' => 'unicode', // Esperanto
	'eu' => 'west_european', // Basque
	'fa' => 'arabic', // Persian
	'fi' => 'west_european', // Finnish
	'fo' => 'west_european', // Faroese
	'fr' => 'west_european', // French
	'ga' => 'west_european', // Irish
	'gl' => 'west_european', // Galician
	'gr' => 'greek', // not an ISO 639-1 code, kept for backwards compatibility
	'el' => 'greek', // Greek (the actual ISO 639-1 code)
	'he' => 'hebrew', // Hebrew (since 1998)
	'hi' => 'unicode', // Hindi
	'hr' => 'east_european', // Croatian
	'hu' => 'east_european', // Hungarian
	'iw' => 'hebrew', // Hebrew (til 1998)
	'is' => 'west_european', // Icelandic
	'it' => 'west_european', // Italian
	'ja' => 'japanese',
	'ka' => 'unicode', // Georgian
	'kl' => 'west_european', // Greenlandic
	'km' => 'unicode', // Khmer
	'ko' => 'korean',
	'lt' => 'lithuanian',
	'lv' => 'west_european', // Latvian/Lettish
	'nl' => 'west_european', // Dutch
	'no' => 'west_european', // Norwegian
	'nb' => 'west_european', // Norwegian Bokmal
	'nn' => 'west_european', // Norwegian Nynorsk
	'pl' => 'east_european', // Polish
	'pt' => 'west_european', // Portuguese
	'ro' => 'east_european', // Romanian
	'ru' => 'cyrillic', // Russian
	'sk' => 'east_european', // Slovak
	'sl' => 'east_european', // Slovenian
	'sr' => 'cyrillic', // Serbian
	'sv' => 'west_european', // Swedish
	'sq' => 'albanian', // Albanian
	'th' => 'thai',
	'tr' => 'turkish', // Turkish (was missing although 'trk' and 'turkish' are mapped below)
	'uk' => 'cyrillic', // Ukrainian
	'vi' => 'vietnamese',
	'zh' => 'chinese',
	// MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
	// http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
	'ara' => 'arabic',
	'bgr' => 'cyrillic', // Bulgarian
	'cat' => 'west_european', // Catalan
	'chs' => 'simpl_chinese',
	'cht' => 'trad_chinese',
	'csy' => 'east_european', // Czech
	'dan' => 'west_european', // Danish
	'deu' => 'west_european', // German
	'dea' => 'west_european', // German (Austrian)
	'des' => 'west_european', // German (Swiss)
	'ena' => 'west_european', // English (Australian)
	'enc' => 'west_european', // English (Canadian)
	'eng' => 'west_european', // English
	'enz' => 'west_european', // English (New Zealand)
	'enu' => 'west_european', // English (United States)
	'euq' => 'west_european', // Basque
	'fos' => 'west_european', // Faroese
	'far' => 'arabic', // Persian
	'fin' => 'west_european', // Finnish
	'fra' => 'west_european', // French
	'frb' => 'west_european', // French (Belgian)
	'frc' => 'west_european', // French (Canadian)
	'frs' => 'west_european', // French (Swiss)
	'geo' => 'unicode', // Georgian
	'glg' => 'west_european', // Galician
	'ell' => 'greek',
	'heb' => 'hebrew',
	'hin' => 'unicode', // Hindi
	'hun' => 'east_european', // Hungarian
	'isl' => 'west_european', // Icelandic (value used to be misspelled "west_euorpean")
	'ita' => 'west_european', // Italian
	'its' => 'west_european', // Italian (Swiss)
	'jpn' => 'japanese',
	'khm' => 'unicode', // Khmer
	'kor' => 'korean',
	'lth' => 'lithuanian',
	'lvi' => 'west_european', // Latvian/Lettish
	'msl' => 'west_european', // Malay
	'nlb' => 'west_european', // Dutch (Belgian)
	'nld' => 'west_european', // Dutch
	'nor' => 'west_european', // Norwegian (bokmal)
	'non' => 'west_european', // Norwegian (nynorsk)
	'plk' => 'east_european', // Polish
	'ptg' => 'west_european', // Portuguese
	'ptb' => 'west_european', // Portuguese (Brazil)
	'rom' => 'east_european', // Romanian
	'rus' => 'cyrillic', // Russian
	'slv' => 'east_european', // Slovenian
	'sky' => 'east_european', // Slovak
	'srl' => 'east_european', // Serbian (Latin)
	'srb' => 'cyrillic', // Serbian (Cyrillic)
	'esp' => 'west_european', // Spanish (trad. sort)
	'esm' => 'west_european', // Spanish (Mexican)
	'esn' => 'west_european', // Spanish (internat. sort)
	'sve' => 'west_european', // Swedish
	'sqi' => 'albanian', // Albanian
	'tha' => 'thai',
	'trk' => 'turkish',
	'ukr' => 'cyrillic', // Ukrainian
	// English language names
	'albanian' => 'albanian',
	'arabic' => 'arabic',
	'basque' => 'west_european',
	'bosnian' => 'east_european',
	'bulgarian' => 'cyrillic', // consistent with 'bg' and 'bgr' above
	'catalan' => 'west_european',
	'croatian' => 'east_european',
	'czech' => 'east_european',
	'danish' => 'west_european',
	'dutch' => 'west_european',
	'english' => 'west_european',
	'esperanto' => 'unicode',
	'estonian' => 'estonian',
	'faroese' => 'west_european',
	'farsi' => 'arabic',
	'finnish' => 'west_european',
	'french' => 'west_european',
	'galician' => 'west_european',
	'georgian' => 'unicode',
	'german' => 'west_european',
	'greek' => 'greek',
	'greenlandic' => 'west_european',
	'hebrew' => 'hebrew',
	'hindi' => 'unicode',
	'hungarian' => 'east_european',
	'icelandic' => 'west_european',
	'italian' => 'west_european',
	'khmer' => 'unicode',
	'latvian' => 'west_european',
	'lettish' => 'west_european',
	'lithuanian' => 'lithuanian',
	'malay' => 'west_european',
	'norwegian' => 'west_european',
	'persian' => 'arabic',
	'polish' => 'east_european',
	'portuguese' => 'west_european',
	'russian' => 'cyrillic',
	'romanian' => 'east_european',
	'serbian' => 'cyrillic',
	'slovak' => 'east_european',
	'slovenian' => 'east_european',
	'spanish' => 'west_european',
	'swedish' => 'west_european',
	'svedish' => 'west_european', // historic misspelling of "swedish", kept for backwards compatibility
	'thai' => 'thai',
	'that' => 'thai', // historic misspelling of "thai", kept for backwards compatibility
	'turkish' => 'turkish',
	'ukrainian' => 'cyrillic',
);
406
/**
 * Charset used for each script / language family on Unix systems.
 * Keys are the script names produced by $lang_to_script; get_locale_charset()
 * treats an empty value like a missing entry and uses its own default instead.
 */
var $script_to_charset_unix = array(
	'west_european' => 'iso-8859-1',
	'estonian' => 'iso-8859-1',
	'east_european' => 'iso-8859-2',
	'baltic' => 'iso-8859-4',
	'cyrillic' => 'iso-8859-5',
	'arabic' => 'iso-8859-6',
	'greek' => 'iso-8859-7',
	'hebrew' => 'iso-8859-8',
	'turkish' => 'iso-8859-9',
	'thai' => 'iso-8859-11', // = TIS-620
	'lithuanian' => 'iso-8859-13',
	'chinese' => 'gb2312', // = euc-cn
	'japanese' => 'euc-jp',
	'korean' => 'euc-kr',
	'simpl_chinese' => 'gb2312',
	'trad_chinese' => 'big5',
	'vietnamese' => '',
	'unicode' => 'utf-8',
	'albanian' => 'utf-8'
);
429
/**
 * Charset used for each script / language family on Windows systems.
 * Keys are the script names produced by $lang_to_script.
 * NOTE(review): 'cp874' and 'cp949' are the only values that are not written in
 * the canonical form used elsewhere in this class - confirm whether that is intended.
 */
var $script_to_charset_windows = array(
	'east_european' => 'windows-1250',
	'cyrillic' => 'windows-1251',
	'west_european' => 'windows-1252',
	'greek' => 'windows-1253',
	'turkish' => 'windows-1254',
	'hebrew' => 'windows-1255',
	'arabic' => 'windows-1256',
	'baltic' => 'windows-1257',
	'estonian' => 'windows-1257',
	'lithuanian' => 'windows-1257',
	'vietnamese' => 'windows-1258',
	'thai' => 'cp874',
	'korean' => 'cp949',
	'chinese' => 'gb2312',
	'japanese' => 'shift_jis',
	'simpl_chinese' => 'gb2312',
	'trad_chinese' => 'big5',
	'albanian' => 'windows-1250',
	'unicode' => 'utf-8'
);
452
/**
 * Mapping of complete locale names to charsets.
 * get_locale_charset() lowercases the locale before looking it up here,
 * so only lowercase keys can ever match.
 */
var $locale_to_charset = array(
	'japanese.euc' => 'euc-jp',
	'ja_jp.ujis' => 'euc-jp',
	'korean.euc' => 'euc-kr',
	'sr@latn' => 'iso-8859-2',
	'sr@Latn' => 'iso-8859-2', // original mixed-case key, unreachable through get_locale_charset(); kept for code reading the array directly
	'zh_cn' => 'gb2312',
	'zh_hk' => 'big5',
	'zh_tw' => 'big5',
);
463
/**
 * TYPO3 specific: system charset used for each TYPO3 system language key.
 * An empty value means "iso-8859-1".
 */
var $charSetArray = array(
	'dk' => '',
	'de' => '',
	'no' => '',
	'it' => '',
	'fr' => '',
	'es' => '',
	'nl' => '',
	'cz' => 'windows-1250',
	'pl' => 'iso-8859-2',
	'si' => 'windows-1250',
	'fi' => '',
	'tr' => 'iso-8859-9',
	'se' => '',
	'pt' => '',
	'ru' => 'windows-1251',
	'ro' => 'iso-8859-2',
	'ch' => 'gb2312',
	'sk' => 'windows-1250',
	'lt' => 'windows-1257',
	'is' => 'utf-8',
	'hr' => 'windows-1250',
	'hu' => 'iso-8859-2',
	'gl' => '',
	'th' => 'iso-8859-11',
	'gr' => 'iso-8859-7',
	'hk' => 'big5',
	'eu' => '',
	'bg' => 'windows-1251',
	'br' => '',
	'et' => 'iso-8859-4',
	'ar' => 'iso-8859-6',
	'he' => 'utf-8',
	'ua' => 'windows-1251',
	'jp' => 'shift_jis',
	'lv' => 'utf-8',
	'vn' => 'utf-8',
	'ca' => 'iso-8859-15',
	'ba' => 'iso-8859-2',
	'kr' => 'euc-kr',
	'eo' => 'utf-8',
	'my' => '',
	'hi' => 'utf-8',
	'fo' => 'utf-8',
	'fa' => 'utf-8',
	'sr' => 'utf-8',
	'sq' => 'utf-8',
	'ge' => 'utf-8',
	'ga' => '',
	'km' => 'utf-8',
	'qc' => '',
);
518
/**
 * TYPO3 specific: ISO name for each TYPO3 system language key.
 * A key that is missing here means the ISO name is identical to the TYPO3 key.
 */
var $isoArray = array(
	'ba' => 'bs',
	'br' => 'pt_BR',
	'ch' => 'zh_CN',
	'cz' => 'cs',
	'dk' => 'da',
	'si' => 'sl',
	'se' => 'sv',
	'gl' => 'kl',
	'gr' => 'el',
	'hk' => 'zh_HK',
	'kr' => 'ko',
	'ua' => 'uk',
	'jp' => 'ja',
	'qc' => 'fr_CA',
	'vn' => 'vi',
	'ge' => 'ka',
	'ga' => 'gl',
);
540
/**
 * Normalizes a charset name: the name is lowercased, surrounding whitespace is
 * removed and, if the result is a key of $this->synonyms, the mapped name is returned.
 *
 * @param string Input charset
 * @return string Normalized charset
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function parse_charset($charset) {
	$normalized = trim(strtolower($charset));

	return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
}
556
/**
 * Get the charset of a locale.
 *
 * ln language
 * ln_CN language / country
 * ln_CN.cs language / country / charset
 * ln_CN.cs@mod language / country / charset / modifier
 *
 * Resolution order: exact match in $this->locale_to_charset, charset given in
 * the locale itself, the "euro" modifier, and finally the script of the language
 * looked up in the platform specific script-to-charset table. Unknown languages
 * and scripts resolve to the platform default (windows-1252 or iso-8859-1).
 *
 * @param string Locale string
 * @return string Charset resolved for locale string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function get_locale_charset($locale) {
	$locale = strtolower($locale);

	// exact locale specific charset?
	if (isset($this->locale_to_charset[$locale])) {
		return $this->locale_to_charset[$locale];
	}

	// get modifier; array_pad() guarantees that list() always finds two elements
	list($locale, $modifier) = array_pad(explode('@', $locale), 2, '');

	// locale contains charset: use it
	list($locale, $charset) = array_pad(explode('.', $locale), 2, '');
	if ($charset) {
		return $this->parse_charset($charset);
	}

	// modifier is 'euro' (after charset check, because of xx.utf-8@euro)
	if ($modifier === 'euro') {
		return 'iso-8859-15';
	}

	// get language (the part in front of the country); unknown languages have no script
	list($language) = explode('_', $locale);
	$script = isset($this->lang_to_script[$language]) ? $this->lang_to_script[$language] : '';

	if (TYPO3_OS == 'WIN') {
		$scriptToCharset = $this->script_to_charset_windows;
		$defaultCharset = 'windows-1252';
	} else {
		$scriptToCharset = $this->script_to_charset_unix;
		$defaultCharset = 'iso-8859-1';
	}

	// Missing and empty table entries both fall back to the platform default
	return !empty($scriptToCharset[$script]) ? $scriptToCharset[$script] : $defaultCharset;
}
605
606
607 /********************************************
608 *
609 * Charset Conversion functions
610 *
611 ********************************************/
612
/**
 * Convert from one charset to another charset.
 *
 * If the configured method in $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']
 * is "mbstring", "iconv" or "recode", that PHP library is tried first; when it
 * reports failure (FALSE) or is not applicable, the conversion is done through
 * utf8_encode() / utf8_decode() with UTF-8 as intermediate format.
 *
 * @param string Input string
 * @param string From charset (the current charset of the string)
 * @param string To charset (the output charset wanted)
 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Converted string
 * @see convArray()
 */
function conv($str, $fromCS, $toCS, $useEntityForNoChar = 0) {
	if ($fromCS == $toCS) {
		return $str;
	}

	// PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
	if ($toCS == 'utf-8' || !$useEntityForNoChar) {
		$method = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'];
		$converted = FALSE;

		if ($method == 'mbstring') {
			// returns FALSE for unsupported charsets
			$converted = mb_convert_encoding($str, $toCS, $fromCS);
		} elseif ($method == 'iconv') {
			$converted = iconv($fromCS, $toCS . '//TRANSLIT', $str);
		} elseif ($method == 'recode') {
			$converted = recode_string($fromCS . '..' . $toCS, $str);
		}

		if ($converted !== FALSE) {
			return $converted;
		}
		// otherwise: fallback to TYPO3 conversion
	}

	$utf8 = ($fromCS != 'utf-8') ? $this->utf8_encode($str, $fromCS) : $str;

	return ($toCS != 'utf-8') ? $this->utf8_decode($utf8, $toCS, $useEntityForNoChar) : $utf8;
}
663
/**
 * Converts every string found in an array from one charset to another.
 * Nested arrays are processed recursively; values of other types are left untouched.
 * NOTICE: The array is passed by reference and modified in place!
 *
 * @param array Input array, possibly multidimensional
 * @param string From charset (the current charset of the string)
 * @param string To charset (the output charset wanted)
 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return void
 * @see conv()
 */
function convArray(&$array, $fromCS, $toCS, $useEntityForNoChar = 0) {
	foreach ($array as &$element) {
		if (is_array($element)) {
			$this->convArray($element, $fromCS, $toCS, $useEntityForNoChar);
		} elseif (is_string($element)) {
			$element = $this->conv($element, $fromCS, $toCS, $useEntityForNoChar);
		}
	}
	// Drop the reference so later writes to $element cannot touch the last array entry
	unset($element);
}
684
/**
 * Converts $str from $charset to UTF-8
 *
 * Bytes 0-127 are copied unchanged. Everything else is looked up in the "local"
 * part of the parsed conversion table; characters without a table entry are
 * replaced by the byte in $this->noCharByteVal.
 *
 * @param string String in local charset to convert to UTF-8
 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return string Output string, converted to UTF-8. NULL if initCharset() reports that the charset could not be initialized.
 */
function utf8_encode($str, $charset) {
	if ($charset === 'utf-8') {
		return $str;
	}

	// Parse conv. table if not already done; without a table nothing can be converted
	if (!$this->initCharset($charset)) {
		return NULL;
	}

	// Loop invariant properties of the charset
	$isTwoByteSet = isset($this->twoByteSets[$charset]);
	$isEucBasedSet = isset($this->eucBasedSets[$charset]);

	$strLen = strlen($str);
	$outStr = '';

	for ($a = 0; $a < $strLen; $a++) { // Traverse each char in string.
		$chr = substr($str, $a, 1);
		$ord = ord($chr);

		if ($isTwoByteSet) {
			// Two bytes per char, big endian assumed. substr() (instead of a string
			// offset) yields byte value 0 when the input ends in the middle of a char.
			$a++;
			$ord = ($ord << 8) | ord(substr($str, $a, 1));
		} elseif ($ord > 127) {
			// EUC uses two bytes above 127; we get both, advance the pointer and make $ord a 16bit int.
			// Shift-JIS: chars between 160 and 223 are single byte
			if ($isEucBasedSet && ($charset != 'shift_jis' || $ord < 0xA0 || $ord > 0xDF)) {
				$a++;
				$ord = $ord * 256 + ord(substr($str, $a, 1));
			}
		} else {
			// ASCII 0-127 is just one byte and passed through transparently
			$outStr .= $chr;
			continue;
		}

		// Use the UTF-8 sequence from the parsed conv. table, otherwise the "no char" byte
		if (isset($this->parsedCharsets[$charset]['local'][$ord])) {
			$outStr .= $this->parsedCharsets[$charset]['local'][$ord];
		} else {
			$outStr .= chr($this->noCharByteVal);
		}
	}

	return $outStr;
}
737
/**
 * Converts $str from UTF-8 to $charset
 *
 * Bytes 0-127 are copied unchanged. Each multibyte sequence is looked up in the
 * "utf8" part of the parsed conversion table. Sequences without a table entry
 * become a numeric entity (if requested) or the byte in $this->noCharByteVal;
 * a continuation byte found without a lead byte also becomes that byte.
 *
 * @param string String in UTF-8 to convert to local charset
 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Output string, converted to local charset. NULL if initCharset() reports that the charset could not be initialized.
 */
function utf8_decode($str, $charset, $useEntityForNoChar = 0) {
	if ($charset === 'utf-8') {
		return $str;
	}

	// Parse conv. table if not already done
	if (!$this->initCharset($charset)) {
		return NULL;
	}

	$length = strlen($str);
	$result = '';
	$pos = 0;

	while ($pos < $length) {
		$byte = substr($str, $pos, 1);
		$code = ord($byte);

		if ($code <= 127) {
			// Plain ASCII, one byte, transparent
			$result .= $byte;
		} elseif (!($code & 64)) {
			// A lead byte must have the 7th bit set - this is the MIDDLE of a sequence
			$result .= chr($this->noCharByteVal);
		} else {
			// Every set bit following the 8th bit of the lead byte announces one more byte
			$trailing = 0;
			for ($mask = 64; $mask > 0 && ($code & $mask); $mask >>= 1) {
				$trailing++;
			}
			$sequence = substr($str, $pos, $trailing + 1);
			$pos += $trailing;

			if (isset($this->parsedCharsets[$charset]['utf8'][$sequence])) {
				$local = $this->parsedCharsets[$charset]['utf8'][$sequence];
				// Local numbers above 255 are written as two bytes (16bit word assumed)
				$result .= ($local > 255)
					? chr(($local >> 8) & 255) . chr($local & 255)
					: chr($local);
			} elseif ($useEntityForNoChar) {
				$result .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
			} else {
				$result .= chr($this->noCharByteVal);
			}
		}

		$pos++;
	}

	return $result;
}
796
/**
 * Converts all chars > 127 to numeric entities.
 *
 * Every multibyte sequence is replaced by "&#" . utf8CharToUnumber(sequence, 1) . ";".
 * A continuation byte found without a lead byte is replaced by the byte in
 * $this->noCharByteVal; bytes 0-127 are copied unchanged.
 *
 * @param string Input string
 * @return string Output string
 */
function utf8_to_entities($str) {
	$length = strlen($str);
	$result = '';
	$pos = 0;

	while ($pos < $length) {
		$byte = substr($str, $pos, 1);
		$code = ord($byte);

		if ($code <= 127) {
			// Plain ASCII, one byte, transparent
			$result .= $byte;
		} elseif (!($code & 64)) {
			// A lead byte must have the 7th bit set - this is the MIDDLE of a sequence
			$result .= chr($this->noCharByteVal);
		} else {
			// Every set bit following the 8th bit of the lead byte announces one more byte
			$trailing = 0;
			for ($mask = 64; $mask > 0 && ($code & $mask); $mask >>= 1) {
				$trailing++;
			}
			$sequence = substr($str, $pos, $trailing + 1);
			$pos += $trailing;

			$result .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
		}

		$pos++;
	}

	return $result;
}
834
/**
 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
 *
 * Malformed numeric references (e.g. "&#;" or "&#foo;") and unknown named
 * entities are left in the string unchanged.
 *
 * @param string Input string, UTF-8
 * @param boolean If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
 * @return string Output string
 */
function entities_to_utf8($str, $alsoStdHtmlEnt = 0) {
	$namedEntities = array();
	$tableIsUtf8 = FALSE;
	if ($alsoStdHtmlEnt) {
		// The charset argument exists since PHP 5.3.4. Requesting UTF-8 explicitly makes the
		// result independent of the PHP default charset (ISO-8859-1 before PHP 5.4, UTF-8 since).
		$tableIsUtf8 = version_compare(PHP_VERSION, '5.3.4', '>=');
		$namedEntities = array_flip(
			$tableIsUtf8
				? get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8')
				: get_html_translation_table(HTML_ENTITIES, ENT_COMPAT)
		);
	}

	// Split on entities: even indexes hold the text in between, odd indexes hold
	// the entity body (the part between "&" and ";").
	$parts = preg_split('/&([#[:alnum:]]*);/', $str, -1, PREG_SPLIT_DELIM_CAPTURE);
	foreach ($parts as $k => $v) {
		if (!($k % 2)) {
			continue;
		}

		$entity = '&' . $v . ';';
		$match = array();
		if (substr($v, 0, 1) === '#') { // Dec or hex entities:
			if (preg_match('/^#[xX]([0-9a-fA-F]+)$/', $v, $match)) {
				$parts[$k] = $this->UnumberToChar(hexdec($match[1]));
			} elseif (preg_match('/^#([0-9]+)$/', $v, $match)) {
				$parts[$k] = $this->UnumberToChar((int) $match[1]);
			} else { // Malformed numeric reference, no conversion:
				$parts[$k] = $entity;
			}
		} elseif (isset($namedEntities[$entity])) { // Other entities:
			$parts[$k] = $tableIsUtf8
				? $namedEntities[$entity]
				: $this->utf8_encode($namedEntities[$entity], 'iso-8859-1');
		} else { // No conversion:
			$parts[$k] = $entity;
		}
	}

	return implode('', $parts);
}
867
/**
 * Converts all chars in the input UTF-8 string into integer numbers returned in an array
 *
 * A continuation byte found without a lead byte is reported as $this->noCharByteVal
 * (or the corresponding character if chars are requested).
 *
 * @param string Input string, UTF-8
 * @param boolean If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
 * @param boolean If set, then instead of integer numbers the real UTF-8 char is returned.
 * @return array Output array with the char numbers
 */
function utf8_to_numberarray($str, $convEntities = 0, $retChar = 0) {
	// If entities must be registered as well...:
	if ($convEntities) {
		$str = $this->entities_to_utf8($str, 1);
	}

	// Do conversion:
	$length = strlen($str);
	$numbers = array();
	$pos = 0;

	while ($pos < $length) {
		$byte = substr($str, $pos, 1);
		$code = ord($byte);

		if ($code <= 127) {
			// Plain ASCII, one byte, transparent
			$numbers[] = $retChar ? $byte : $code;
		} elseif (!($code & 64)) {
			// A lead byte must have the 7th bit set - this is the MIDDLE of a sequence
			$numbers[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
		} else {
			// Every set bit following the 8th bit of the lead byte announces one more byte
			$trailing = 0;
			for ($mask = 64; $mask > 0 && ($code & $mask); $mask >>= 1) {
				$trailing++;
			}
			$sequence = substr($str, $pos, $trailing + 1);
			$pos += $trailing;

			$numbers[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
		}

		$pos++;
	}

	return $numbers;
}
912
/**
 * Converts a UNICODE number to a UTF-8 multibyte character
 * Algorithm based on script found at From: http://czyborra.com/utf/
 *
 * The bits of the number are spread across the bytes; the number of high bits
 * set in the lead byte announces the number of bytes in the sequence:
 *
 * bytes | bits | representation
 * 1 | 7 | 0vvvvvvv
 * 2 | 11 | 110vvvvv 10vvvvvv
 * 3 | 16 | 1110vvvv 10vvvvvv 10vvvvvv
 * 4 | 21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 * 5 | 26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 * 6 | 31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * Numbers of 0x80000000 and above cannot be expressed and yield the byte in $this->noCharByteVal.
 *
 * @param integer UNICODE integer
 * @return string UTF-8 multibyte character string
 * @see utf8CharToUnumber()
 */
function UnumberToChar($cbyte) {
	if ($cbyte < 0x80) {
		return chr($cbyte);
	}

	// Pick the lead byte marker and the number of continuation bytes for the value range
	if ($cbyte < 0x800) {
		$leadMarker = 0xC0;
		$trailCount = 1;
	} elseif ($cbyte < 0x10000) {
		$leadMarker = 0xE0;
		$trailCount = 2;
	} elseif ($cbyte < 0x200000) {
		$leadMarker = 0xF0;
		$trailCount = 3;
	} elseif ($cbyte < 0x4000000) {
		$leadMarker = 0xF8;
		$trailCount = 4;
	} elseif ($cbyte < 0x80000000) {
		$leadMarker = 0xFC;
		$trailCount = 5;
	} else { // Cannot express a 32-bit character in UTF-8
		return chr($this->noCharByteVal);
	}

	// The lead byte carries the topmost bits, each continuation byte the next six bits
	$char = chr($leadMarker | ($cbyte >> (6 * $trailCount)));
	for ($shift = 6 * ($trailCount - 1); $shift >= 0; $shift -= 6) {
		$char .= chr(0x80 | (($cbyte >> $shift) & 0x3F));
	}

	return $char;
}
977
/**
 * Decodes a single UTF-8 multibyte character into its UNICODE number.
 *
 * @param string $str UTF-8 multibyte character string (only the first character is evaluated)
 * @param boolean $hex If set, the number is returned as hex string prefixed with "x"
 * @return mixed UNICODE integer, or a string like "xe6" when $hex is set
 * @see UnumberToChar()
 */
function utf8CharToUnumber($str, $hex = 0) {
	$lead = ord(substr($str, 0, 1));

	if (($lead & 192) == 192) {
		// Lead byte of a multibyte sequence: collect six payload bits per announced continuation byte.
		$tailBits = '';
		$shifted = $lead;
		$trailing = 0;
		while ($trailing < 8) {
			$shifted = $shifted << 1;
			if (!($shifted & 128)) {
				break;
			}
			$tailBits .= sprintf('%06b', ord(substr($str, $trailing + 1, 1)) & 0x3F);
			$trailing++;
		}

		// The lead byte contributes its lowest (6 - number of continuation bytes) bits.
		$leadBits = substr('00000000' . decbin($lead), -(6 - $trailing));
		$number = bindec($leadBits . $tailBits);
	} else {
		// Single byte: the byte value is the number.
		$number = $lead;
	}

	return $hex ? 'x' . dechex($number) : $number;
}
1009
1010
1011 /********************************************
1012 *
1013 * Init functions
1014 *
1015 ********************************************/
1016
/**
 * Loads the conversion table of a charset from PATH_t3lib . 'csconvtbl/' into $this->parsedCharsets.
 * This function is automatically called by the conversion functions.
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param string $charset The charset to be initialized. Always use lowercase; it must match a filename in the csconvtbl/ folder exactly ([charset].tbl)
 * @return integer 1 if the table was already loaded, 2 if it was loaded now (from the cache file or by parsing), FALSE if no conversion table was found.
 * @access private
 */
function initCharset($charset) {
	// Already in memory - nothing to do.
	if (is_array($this->parsedCharsets[$charset])) {
		return 1;
	}

	// A conversion table file must exist for the charset.
	$charsetConvTableFile = PATH_t3lib . 'csconvtbl/' . $charset . '.tbl';
	$tableAvailable = $charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile);
	if (!$tableAvailable) {
		return FALSE;
	}

	// Prefer the serialized cache file.
	// Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_' . $charset . '.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->parsedCharsets[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

	// No cache: parse the table line by line.
	$rows = t3lib_div::trimExplode(LF, t3lib_div::getUrl($charsetConvTableFile), 1);
	$this->parsedCharsets[$charset] = array('local' => array(), 'utf8' => array());

	// "whitespaced" lines look like "0x0A 0x000A #LINE FEED", "ms-token" lines like "B9 = U+00B9 : SUPERSCRIPT ONE".
	$whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
	$format = '';

	foreach ($rows as $row) {
		// Blank lines and comment lines are ignored.
		if (!trim($row) || substr($row, 0, 1) == '#') {
			continue;
		}

		// The file format is detected once, on the first real line.
		if (!$format) {
			$format = preg_match($whitespacedPattern, $row) ? 'whitespaced' : 'ms-token';
		}

		if ($format == 'ms-token') {
			list($hexbyte, $utf8) = preg_split('/[=:]/', $row, 3);
		} elseif ($format == 'whitespaced') {
			$parts = array();
			preg_match($whitespacedPattern, $row, $parts);
			$hexbyte = $parts[1];
			$utf8 = 'U+' . $parts[2];
		}

		// Only values above 127 are stored, in both directions.
		$localCode = hexdec(trim($hexbyte));
		if ($localCode > 127) {
			$utf8Char = $this->UnumberToChar(hexdec(substr(trim($utf8), 2)));
			$this->parsedCharsets[$charset]['local'][$localCode] = $utf8Char;
			$this->parsedCharsets[$charset]['utf8'][$utf8Char] = $localCode;
		}
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->parsedCharsets[$charset]));
	}

	return 2;
}
1085
/**
 * Initializes the UTF-8 character data tables (case folding and ASCII transliteration).
 *
 * Reads PATH_t3lib . 'unidata/UnicodeData.txt' (required) and, if present, SpecialCasing.txt and
 * Translit.txt from the same folder. A full parse always fills both $this->caseFolding['utf-8']
 * and $this->toASCII['utf-8'] and writes both tables to their cache files in typo3temp/cs/.
 *
 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
 *
 * @param string $mode Requested table ("case" or "ascii"); selects which loaded/cached table may short-circuit the call
 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2 cached version, 3 table parsed (and cached).
 * @access private
 */
function initUnicodeData($mode = NULL) {
	// cache files
	$cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
	$cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');

	// Only process if the tables are not yet loaded
	switch ($mode) {
		case 'case':
			if (is_array($this->caseFolding['utf-8'])) {
				return 1;
			}

			// Use cached version if possible
			if ($cacheFileCase && @is_file($cacheFileCase)) {
				$this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase));
				return 2;
			}
			break;

		case 'ascii':
			if (is_array($this->toASCII['utf-8'])) {
				return 1;
			}

			// Use cached version if possible
			if ($cacheFileASCII && @is_file($cacheFileASCII)) {
				$this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII));
				return 2;
			}
			break;
	}

	// process main Unicode data file
	$unicodeDataFile = PATH_t3lib . 'unidata/UnicodeData.txt';
	if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
		return FALSE;
	}

	$fh = fopen($unicodeDataFile, 'rb');
	if (!$fh) {
		return FALSE;
	}

	// key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
	// note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
	$this->caseFolding['utf-8'] = array();
	$utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand
	$utf8CaseFolding['toUpper'] = array();
	$utf8CaseFolding['toLower'] = array();
	$utf8CaseFolding['toTitle'] = array();

	$decomposition = array(); // array of temp. decompositions
	$mark = array(); // array of chars that are marks (eg. composing accents)
	$number = array(); // array of chars that are numbers (eg. digits)
	$omit = array(); // array of chars to be omitted (eg. Russian hard sign)

	while (!feof($fh)) {
		$line = fgets($fh, 4096);
		if ($line === FALSE) {
			// fgets() reports EOF/read error; there is no line to process
			break;
		}

		// has a lot of info; only the fields needed here are picked from the 15 columns
		list($char, $name, $cat, , , $decomp, , , $num, , , , $upper, $lower, $title) = explode(';', rtrim($line));

		$ord = hexdec($char);
		if ($ord > 0xFFFF) {
			break;
		} // only process the BMP

		$utf8_char = $this->UnumberToChar($ord);

		if ($upper) {
			$utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
		}
		if ($lower) {
			$utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
		}
		// store "title" only when different from "upper" (only a few)
		if ($title && $title != $upper) {
			$utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
		}

		// The first letter of the general category tells marks and numbers apart.
		// substr() is used instead of the $cat{0} offset syntax, which PHP 8 no longer parses.
		switch (substr($cat, 0, 1)) {
			case 'M': // mark (accent, umlaut, ...)
				$mark["U+$char"] = 1;
				break;

			case 'N': // numeric value
				if ($ord > 0x80 && $num != '') {
					$number["U+$char"] = $num;
				}
		}

		// accented Latin letters without "official" decomposition
		$match = array();
		if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
			$c = ord($match[2]);
			if ($match[1] == 'SMALL') {
				$c += 32;
			}

			$decomposition["U+$char"] = array(dechex($c));
			continue;
		}

		$match = array();
		if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
			switch ($match[1]) {
				case '<circle>': // add parenthesis as circle replacement, eg (1)
					$match[2] = '0028 ' . $match[2] . ' 0029';
					break;

				case '<square>': // add square brackets as square replacement, eg [1]
					$match[2] = '005B ' . $match[2] . ' 005D';
					break;

				case '<compat>': // ignore multi char decompositions that start with a space
					if (preg_match('/^0020 /', $match[2])) {
						continue 2;
					}
					break;

				// ignore Arabic and vertical layout presentation decomposition
				case '<initial>':
				case '<medial>':
				case '<final>':
				case '<isolated>':
				case '<vertical>':
					continue 2;
			}
			$decomposition["U+$char"] = explode(' ', $match[2]);
		}
	}
	fclose($fh);

	// process additional Unicode data for casing (allow folded characters to expand into a sequence)
	$specialCasingFile = PATH_t3lib . 'unidata/SpecialCasing.txt';
	if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
		$fh = fopen($specialCasingFile, 'rb');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh, 4096);
				if ($line === FALSE) {
					break;
				}
				if (substr($line, 0, 1) != '#' && trim($line) != '') {

					list($char, $lower, $title, $upper, $cond) = t3lib_div::trimExplode(';', $line);
					// only unconditional mappings are used (condition field empty or just the trailing comment)
					if ($cond == '' || substr($cond, 0, 1) == '#') {
						$utf8_char = $this->UnumberToChar(hexdec($char));
						if ($char != $lower) {
							$arr = explode(' ', $lower);
							for ($i = 0; isset($arr[$i]); $i++) {
								$arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
							}
							$utf8CaseFolding['toLower'][$utf8_char] = implode('', $arr);
						}
						if ($char != $title && $title != $upper) {
							$arr = explode(' ', $title);
							for ($i = 0; isset($arr[$i]); $i++) {
								$arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
							}
							$utf8CaseFolding['toTitle'][$utf8_char] = implode('', $arr);
						}
						if ($char != $upper) {
							$arr = explode(' ', $upper);
							for ($i = 0; isset($arr[$i]); $i++) {
								$arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
							}
							$utf8CaseFolding['toUpper'][$utf8_char] = implode('', $arr);
						}
					}
				}
			}
			fclose($fh);
		}
	}

	// process custom decompositions
	$customTranslitFile = PATH_t3lib . 'unidata/Translit.txt';
	if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
		$fh = fopen($customTranslitFile, 'rb');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh, 4096);
				if ($line === FALSE) {
					break;
				}
				if (substr($line, 0, 1) != '#' && trim($line) != '') {
					list($char, $translit) = t3lib_div::trimExplode(';', $line);
					// an empty transliteration means the character is dropped
					if (!$translit) {
						$omit["U+$char"] = 1;
					}
					$decomposition["U+$char"] = explode(' ', $translit);

				}
			}
			fclose($fh);
		}
	}

	// decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
	foreach ($decomposition as $from => $to) {
		$code_decomp = array();

		while ($code_value = array_shift($to)) {
			if (isset($decomposition["U+$code_value"])) { // do recursive decomposition
				foreach (array_reverse($decomposition["U+$code_value"]) as $cv) {
					array_unshift($to, $cv);
				}
			} elseif (!isset($mark["U+$code_value"])) { // remove mark
				array_push($code_decomp, $code_value);
			}
		}
		if (count($code_decomp) || isset($omit[$from])) {
			$decomposition[$from] = $code_decomp;
		} else {
			unset($decomposition[$from]);
		}
	}

	// create ascii only mapping
	$this->toASCII['utf-8'] = array();
	$ascii =& $this->toASCII['utf-8'];

	foreach ($decomposition as $from => $to) {
		$code_decomp = array();
		while ($code_value = array_shift($to)) {
			$ord = hexdec($code_value);
			if ($ord > 127) {
				continue 2;
			} // skip decompositions containing non-ASCII chars
			else
			{
				array_push($code_decomp, chr($ord));
			}
		}
		// keys look like "U+00C0": strip the "U+" prefix so hexdec() only sees hex digits
		$ascii[$this->UnumberToChar(hexdec(substr($from, 2)))] = implode('', $code_decomp);
	}

	// add numeric decompositions
	foreach ($number as $from => $to) {
		$utf8_char = $this->UnumberToChar(hexdec(substr($from, 2)));
		if (!isset($ascii[$utf8_char])) {
			$ascii[$utf8_char] = $to;
		}
	}

	if ($cacheFileCase) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
	}

	if ($cacheFileASCII) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
	}

	return 3;
}
1344
/**
 * Builds the case folding table for a charset other than UTF-8.
 * This function is automatically called by the case folding functions.
 *
 * The table is derived from the UTF-8 case folding table via the charset's conversion table
 * and stored in $this->caseFolding[$charset] (and in a cache file in typo3temp/cs/).
 *
 * @param string $charset Charset for which to initialize case folding.
 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2 cached version, 3 table parsed (and cached).
 * @access private
 */
function initCaseFolding($charset) {
	// Table already in memory?
	if (is_array($this->caseFolding[$charset])) {
		return 1;
	}

	// Cached table on disk?
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_' . $charset . '.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

	// Both the charset's UTF-8 conversion table and the UTF-8 case folding table are prerequisites.
	if (!$this->initCharset($charset)) {
		return FALSE;
	}
	if (!$this->initUnicodeData('case')) {
		return FALSE;
	}

	$nochar = chr($this->noCharByteVal);
	$directions = array('toUpper', 'toLower', 'toTitle');

	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
		// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);

		foreach ($directions as $direction) {
			$cc = $this->utf8_decode($this->caseFolding['utf-8'][$direction][$utf8], $charset);
			// keep only mappings that exist and are representable in the target charset
			if ($cc == '' || $cc == $nochar) {
				continue;
			}
			$this->caseFolding[$charset][$direction][$c] = $cc;
		}
	}

	// add the ASCII case table (a-z <-> A-Z are 32 code points apart)
	for ($offset = 0; $offset < 26; $offset++) {
		$lower = chr(ord('a') + $offset);
		$this->caseFolding[$charset]['toUpper'][$lower] = chr(ord($lower) - 32);
	}
	for ($offset = 0; $offset < 26; $offset++) {
		$upper = chr(ord('A') + $offset);
		$this->caseFolding[$charset]['toLower'][$upper] = chr(ord($upper) + 32);
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->caseFolding[$charset]));
	}

	return 3;
}
1414
/**
 * Builds the to-ASCII conversion table for a charset other than UTF-8.
 * This function is automatically called by the ASCII transliteration functions.
 *
 * The table is derived from the UTF-8 transliteration table via the charset's conversion table
 * and stored in $this->toASCII[$charset] (and in a cache file in typo3temp/cs/).
 *
 * @param string $charset Charset for which to initialize conversion.
 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2 cached version, 3 table parsed (and cached).
 * @access private
 */
function initToASCII($charset) {
	// Only process if the table is not yet loaded:
	if (is_array($this->toASCII[$charset])) {
		return 1;
	}

	// Use cached version if possible
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_' . $charset . '.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->toASCII[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

	// init UTF-8 conversion for this charset
	if (!$this->initCharset($charset)) {
		return FALSE;
	}

	// UTF-8/ASCII transliteration is used as the base conversion table
	if (!$this->initUnicodeData('ascii')) {
		return FALSE;
	}

	// Start from an empty table so that a charset without any transliterable character still
	// counts as "loaded" and an array (not an undefined value) is serialized into the cache.
	$this->toASCII[$charset] = array();

	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
		// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);

		if (isset($this->toASCII['utf-8'][$utf8])) {
			$this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
		}
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->toASCII[$charset]));
	}

	return 3;
}
1462
1463
1464 /********************************************
1465 *
1466 * String operation functions
1467 *
1468 ********************************************/
1469
/**
 * Returns a part of a string.
 *
 * Depending on $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] the work is delegated to
 * mbstring or iconv; otherwise the charset decides between the UTF-8, EUC, fixed-width
 * (two/four byte) and single-byte implementations.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param integer $start Start position (character position)
 * @param integer $len Length (in characters); NULL means "to the end of the string"
 * @return string The substring
 * @see substr(), mb_substr()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function substr($charset, $string, $start, $len = NULL) {
	if ($len === 0 || $string === '') {
		return '';
	}

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		// cannot omit $len, when specifying charset
		if ($len == NULL) {
			$enc = mb_internal_encoding(); // save internal encoding
			mb_internal_encoding($charset);
			$str = mb_substr($string, $start);
			mb_internal_encoding($enc); // restore internal encoding

			return $str;
		}
		else {
			return mb_substr($string, $start, $len, $charset);
		}
	} elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
		// cannot omit $len, when specifying charset
		if ($len == NULL) {
			$enc = iconv_get_encoding('internal_encoding'); // save internal encoding
			iconv_set_encoding('internal_encoding', $charset);
			$str = iconv_substr($string, $start);
			iconv_set_encoding('internal_encoding', $enc); // restore internal encoding

			return $str;
		}
		else {
			return iconv_substr($string, $start, $len, $charset);
		}
	} elseif ($charset == 'utf-8') {
		return $this->utf8_substr($string, $start, $len);
	} elseif ($this->eucBasedSets[$charset]) {
		return $this->euc_substr($string, $start, $charset, $len);
	} elseif ($this->twoByteSets[$charset]) {
		// NULL * 2 would evaluate to 0 and yield an empty string, so an omitted length is handled separately
		return $len === NULL ? substr($string, $start * 2) : substr($string, $start * 2, $len * 2);
	} elseif ($this->fourByteSets[$charset]) {
		return $len === NULL ? substr($string, $start * 4) : substr($string, $start * 4, $len * 4);
	}

	// treat everything else as single-byte encoding
	return $len === NULL ? substr($string, $start) : substr($string, $start, $len);
}
1526
/**
 * Counts the number of characters of a string in the given charset.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @return integer The number of characters
 * @see strlen()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strlen($charset, $string) {
	// A configured extension takes precedence over the built-in implementations.
	$backend = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];
	if ($backend == 'mbstring') {
		return mb_strlen($string, $charset);
	}
	if ($backend == 'iconv') {
		return iconv_strlen($string, $charset);
	}

	// Variable-width charsets need a real character scan.
	if ($charset == 'utf-8') {
		return $this->utf8_strlen($string);
	}
	if ($this->eucBasedSets[$charset]) {
		return $this->euc_strlen($string, $charset);
	}

	// Fixed-width charsets: derive the count from the byte length.
	$byteLength = strlen($string);
	if ($this->twoByteSets[$charset]) {
		return $byteLength / 2;
	}
	if ($this->fourByteSets[$charset]) {
		return $byteLength / 4;
	}

	// treat everything else as single-byte encoding
	return $byteLength;
}
1554
/**
 * Crops a string with the mbstring functions.
 *
 * @param string $charset The character set
 * @param string $string String to be cropped
 * @param integer $len Crop length (in characters); a negative value crops from the end of the string
 * @param string $crop Crop signifier, appended (positive $len) or prepended (negative $len) when the string was shortened
 * @return string The shortened string
 * @see mb_strlen(), mb_substr()
 */
protected function cropMbstring($charset, $string, $len, $crop = '') {
	// A zero length means "do not crop".
	if (intval($len) === 0) {
		return $string;
	}

	// Strings that already fit are returned untouched.
	$charCount = mb_strlen($string, $charset);
	if ($charCount <= abs($len)) {
		return $string;
	}

	return $len > 0
		? mb_substr($string, 0, $len, $charset) . $crop
		: $crop . mb_substr($string, $len, $charCount, $charset);
}
1578
/**
 * Truncates a string and pre-/appends a string.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param integer $len Length (in characters); a negative value keeps that many characters from the end
 * @param string $crop Crop signifier, appended (positive $len) or prepended (negative $len) when the string was shortened
 * @return string The shortened string
 * @see substr(), mb_strimwidth()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function crop($charset, $string, $len, $crop = '') {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return $this->cropMbstring($charset, $string, $len, $crop);
	}

	if (intval($len) == 0) {
		return $string;
	}

	// Translate the character length into the byte position of the cut.
	if ($charset == 'utf-8') {
		$i = $this->utf8_char2byte_pos($string, $len);
	} elseif ($this->eucBasedSets[$charset]) {
		$i = $this->euc_char2byte_pos($string, $len, $charset);
	} else {
		// everything else is treated as single-byte encoding
		if ($len > 0) {
			$i = $len;
		} else {
			$i = strlen($string) + $len;
			if ($i <= 0) {
				$i = FALSE;
			}
		}
	}

	if ($i === FALSE) { // $len outside actual string length
		return $string;
	}

	// Only crop if the cut position lies inside the string; the bounds are checked explicitly
	// instead of probing a (possibly non-existing) string offset.
	$byteLength = strlen($string);
	if ($len > 0) {
		// there must be at least one byte at the cut position, otherwise nothing is cut off
		if ($i >= 0 && $i < $byteLength) {
			return substr($string, 0, $i) . $crop;
		}
	} else {
		// there must be at least one byte in front of the cut position
		if ($i >= 1 && $i <= $byteLength) {
			return $crop . substr($string, $i);
		}
	}

	return $string;
}
1641
/**
 * Cuts a string short at a given byte length without splitting a character.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param integer $len The byte length
 * @return string The shortened string
 * @see mb_strcut()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strtrunc($charset, $string, $len) {
	if ($len <= 0) {
		return '';
	}

	// Delegate to the specialised implementations first.
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strcut($string, 0, $len, $charset);
	}
	if ($charset == 'utf-8') {
		return $this->utf8_strtrunc($string, $len);
	}
	if ($this->eucBasedSets[$charset]) {
		return $this->euc_strtrunc($string, $len, $charset);
	}

	// Fixed-width charsets: realign the length to a character boundary.
	if ($this->twoByteSets[$charset]) {
		$len -= $len % 2; // don't cut at odd positions
	} elseif ($this->fourByteSets[$charset]) {
		$len -= $len % 4; // realign to position divisible by four
	}

	// treat everything else as single-byte encoding
	return substr($string, 0, $len);
}
1674
/**
 * Translates all characters of a string into their respective case values.
 * Unlike strtolower() and strtoupper() this method is locale independent.
 * Note that the string length may change!
 * eg. lower case German "ß" (sharp S) becomes upper case "SS"
 * Real case folding is language dependent, this method ignores this fact.
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert case for
 * @param string $case Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper")
 * @return string The converted string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper()
 */
function conv_case($charset, $string, $case) {
	// mbstring handles every charset itself.
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return $case == 'toLower'
			? mb_strtolower($string, $charset)
			: mb_strtoupper($string, $charset);
	}

	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string, 'case', $case);
	}

	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string, $charset, 'case', $case);
	}

	// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string, $charset, 'case', $case);
}
1708
/**
 * Converts special chars (like æøåÆØÅ, umlauts etc) to ascii equivalents (usually double-bytes, like æ => ae etc.)
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert
 * @return string The converted string
 */
function specCharsToASCII($charset, $string) {
	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string, 'ascii');
	}

	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string, $charset, 'ascii');
	}

	// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string, $charset, 'ascii');
}
1728
1729
/**
 * Converts the language codes that we get from the client (usually HTTP_ACCEPT_LANGUAGE)
 * into a TYPO3-readable language code.
 *
 * @param string $languageCodesList List of language codes, something like 'de,en-us;q=0.9,de-de;q=0.7,es-cl;q=0.6,en;q=0.4,es;q=0.3,zh;q=0.1'
 *		 see http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4
 * @return string A preferred language that TYPO3 supports, or "default" if none found
 * @author Benjamin Mack (benni.typo3.org)
 */
public function getPreferredClientLanguage($languageCodesList) {
	$allLanguageCodes = array();
	$selectedLanguage = 'default';

	// get all languages where TYPO3 code is the same as the ISO code
	foreach ($this->charSetArray as $typo3Lang => $charSet) {
		$allLanguageCodes[$typo3Lang] = $typo3Lang;
	}

	// get all languages where TYPO3 code differs from ISO code
	// or needs the country part
	// the iso codes will here overwrite the default typo3 language in the key
	foreach ($this->isoArray as $typo3Lang => $isoLang) {
		$allLanguageCodes[$typo3Lang] = str_replace('_', '-', $isoLang);
	}

	// move the iso codes to the keys (because we're comparing the keys with "isset" later on)
	$allLanguageCodes = array_flip($allLanguageCodes);

	// collect the preferred languages as "language code => quality"
	$preferredLanguages = t3lib_div::trimExplode(',', $languageCodesList);
	$sortedPreferredLanguages = array();
	foreach ($preferredLanguages as $preferredLanguage) {
		$quality = 1.0;
		if (strpos($preferredLanguage, ';q=') !== FALSE) {
			list($preferredLanguage, $quality) = explode(';q=', $preferredLanguage);
		}
		$sortedPreferredLanguages[$preferredLanguage] = $quality;
	}

	// loop through the languages, with the highest priority first
	arsort($sortedPreferredLanguages, SORT_NUMERIC);
	foreach ($sortedPreferredLanguages as $preferredLanguage => $quality) {
		if (isset($allLanguageCodes[$preferredLanguage])) {
			$selectedLanguage = $allLanguageCodes[$preferredLanguage];
			break;
		}

		// strip the country code from the end; tags without a country part simply stay as they are
		$tagParts = explode('-', $preferredLanguage);
		$languageOnly = $tagParts[0];
		if (isset($allLanguageCodes[$languageOnly])) {
			$selectedLanguage = $allLanguageCodes[$languageOnly];
			break;
		}
	}

	// English is TYPO3's "default" language
	if (!$selectedLanguage || $selectedLanguage == 'en') {
		$selectedLanguage = 'default';
	}
	return $selectedLanguage;
}
1790
1791
1792 /********************************************
1793 *
1794 * Internal string operation functions
1795 *
1796 ********************************************/
1797
1798 /**
1799 * Maps all characters of a string in a single byte charset.
1800 *
1801 * @param string the string
1802 * @param string the charset
1803 * @param string mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
1804 * @param string 'case': conversion 'toLower' or 'toUpper'
1805 * @return string the converted string
1806 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1807 */
1808 function sb_char_mapping($str, $charset, $mode, $opt = '') {
1809 switch ($mode) {
1810 case 'case':
1811 if (!$this->initCaseFolding($charset)) {
1812 return $str;
1813 } // do nothing
1814 $map =& $this->caseFolding[$charset][$opt];
1815 break;
1816
1817 case 'ascii':
1818 if (!$this->initToASCII($charset)) {
1819 return $str;
1820 } // do nothing
1821 $map =& $this->toASCII[$charset];
1822 break;
1823
1824 default:
1825 return $str;
1826 }
1827
1828 $out = '';
1829 for ($i = 0; strlen($str{$i}); $i++) {
1830 $c = $str{$i};
1831 if (isset($map[$c])) {
1832 $out .= $map[$c];
1833 } else {
1834 $out .= $c;
1835 }
1836 }
1837
1838 return $out;
1839 }
1840
1841
1842 /********************************************
1843 *
1844 * Internal UTF-8 string operation functions
1845 *
1846 ********************************************/
1847
1848 /**
1849 * Returns a part of a UTF-8 string.
1850 * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
1851 *
1852 * @param string UTF-8 string
1853 * @param integer Start position (character position)
1854 * @param integer Length (in characters)
1855 * @return string The substring
1856 * @see substr()
1857 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1858 */
1859 function utf8_substr($str, $start, $len = NULL) {
1860 if (!strcmp($len, '0')) {
1861 return '';
1862 }
1863
1864 $byte_start = $this->utf8_char2byte_pos($str, $start);
1865 if ($byte_start === FALSE) {
1866 if ($start > 0) {
1867 return FALSE; // $start outside string length
1868 } else {
1869 $start = 0;
1870 }
1871 }
1872
1873 $str = substr($str, $byte_start);
1874
1875 if ($len != NULL) {
1876 $byte_end = $this->utf8_char2byte_pos($str, $len);
1877 if ($byte_end === FALSE) // $len outside actual string length
1878 {
1879 return $len < 0 ? '' : $str;
1880 } // When length is less than zero and exceeds, then we return blank string.
1881 else
1882 {
1883 return substr($str, 0, $byte_end);
1884 }
1885 }
1886 else {
1887 return $str;
1888 }
1889 }
1890
/**
 * Counts the number of characters of a string in UTF-8.
 *
 * Every byte that is not a continuation byte (10xxxxxx) is counted, i.e.
 * single-byte characters (0xxxxxxx) and multi-byte lead bytes (11xxxxxx).
 *
 * @param string $str UTF-8 multibyte character string
 * @return integer The number of characters
 * @see strlen()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strlen($str) {
	$str = (string) $str;
	$length = strlen($str);
	$n = 0;
		// Explicit length bound instead of strlen($str{$i}): the curly-brace offset
		// syntax was removed in PHP 8 and the old test read past the end of the string.
	for ($i = 0; $i < $length; $i++) {
		if ((ord($str[$i]) & 0xC0) != 0x80) {
			$n++;
		}
	}
	return $n;
}
1915
/**
 * Truncates a string in UTF-8 short at a given byte length.
 * A multi-byte character that would be cut by the limit is dropped entirely.
 *
 * @param string $str UTF-8 multibyte character string
 * @param integer $len the byte length
 * @return string the shortened string
 * @see mb_strcut()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strtrunc($str, $len) {
	$str = (string) $str;
	$len = (int) $len;

		// Nothing to inspect if the limit is not a valid byte index. Handing these
		// cases to substr() directly also avoids negative string offsets, which
		// address bytes from the end of the string since PHP 7.1.
	if ($len <= 0 || $len > strlen($str)) {
		return substr($str, 0, $len);
	}

	$i = $len - 1;
	if (ord($str[$i]) & 0x80) { // part of a multibyte sequence
			// walk back over continuation bytes (10xxxxxx) to the first byte
		while ($i > 0 && (ord($str[$i]) & 0xC0) == 0x80) {
			$i--;
		}
		$first = ord($str[$i]);
		if (($first & 0xC0) == 0x80) {
			return '';
		} // sanity check: only continuation bytes found, no character start

			// The number of leading 1-bits of the first byte is the sequence length.
			// A first byte at offset 0 is valid: the original returned '' here even
			// when the whole character fitted into $len.
		for ($bc = 0; $first & 0x80; $first = $first << 1) {
			$bc++;
		}
		if ($bc + $i > $len) {
			return substr($str, 0, $i);
		}
			// fallthru: multibyte char fits into length
	}
	return substr($str, 0, $len);
}
1946
/**
 * Find position of first occurrence of a string, both arguments are in UTF-8.
 *
 * Delegates to mb_strpos() or iconv_strpos() when
 * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] selects one of them;
 * otherwise the search is done on bytes and the result is converted back
 * to a character position.
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 string to search for
 * @param integer $offset Position to start the search
 * @return integer The character position, or FALSE if the needle was not found
 * @see strpos()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strpos($haystack, $needle, $offset = 0) {
	$backend = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];
	if ($backend == 'mbstring') {
		return mb_strpos($haystack, $needle, $offset, 'utf-8');
	}
	if ($backend == 'iconv') {
		return iconv_strpos($haystack, $needle, $offset, 'utf-8');
	}

		// native fallback: translate the character offset into a byte offset first
	$startByte = $this->utf8_char2byte_pos($haystack, $offset);
	if ($startByte === FALSE) {
			// offset beyond string length
		return FALSE;
	}

	$matchByte = strpos($haystack, $needle, $startByte);

		// FALSE means the needle was not found; otherwise convert bytes back to characters
	return $matchByte === FALSE ? FALSE : $this->utf8_byte2char_pos($haystack, $matchByte);
}
1976
/**
 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
 *
 * Delegates to mb_strrpos() or iconv_strrpos() when
 * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] selects one of them;
 * otherwise the search is done on bytes and the result is converted back
 * to a character position.
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 character to search for (single character)
 * @return integer The character position, or FALSE if the needle was not found
 * @see strrpos()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strrpos($haystack, $needle) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
			// The third parameter of mb_strrpos() is the offset, the encoding is the
			// fourth. Passing the encoding in third position relied on a legacy
			// fallback that is a TypeError in PHP 8.
		return mb_strrpos($haystack, $needle, 0, 'utf-8');
	} elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
			// iconv_strrpos() has no offset parameter
		return iconv_strrpos($haystack, $needle, 'utf-8');
	}

	$byte_pos = strrpos($haystack, $needle);
	if ($byte_pos === FALSE) {
		return FALSE;
	} // needle not found

	return $this->utf8_byte2char_pos($haystack, $byte_pos);
}
2000
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * For $pos >= 0 the string is scanned forwards; FALSE is returned when the scan
 * runs off the end of the string. For $pos < 0 the string is scanned backwards
 * from the last byte; FALSE is returned when the requested character is the
 * first one of the string or lies before it.
 *
 * @param string $str UTF-8 string
 * @param integer $pos Character position (negative values start from the end)
 * @return integer Byte position, or FALSE (see above)
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char2byte_pos($str, $pos) {
	$str = (string) $str;
	$length = strlen($str);
	$n = 0; // number of characters found
	$p = abs($pos); // number of characters wanted

	if ($pos >= 0) {
		$i = 0;
		$d = 1;
	} else {
		$i = $length - 1;
		$d = -1;
	}

		// Explicit bounds on both sides: since PHP 7.1 a negative string offset
		// addresses bytes from the end, so the former strlen($str{$i}) test no longer
		// stopped the backward scan at the beginning of the string.
	for (; $i >= 0 && $i < $length && $n < $p; $i += $d) {
			// count single-byte chars (0xxxxxxx) and multi-byte lead bytes (11xxxxxx)
		if ((ord($str[$i]) & 0xC0) != 0x80) {
			$n++;
		}
	}
	if ($i < 0 || $i >= $length) {
		return FALSE;
	} // offset beyond string length

	if ($pos >= 0) {
			// skip trailing multi-byte data bytes
		while ($i < $length && (ord($str[$i]) & 0xC0) == 0x80) {
			$i++;
		}
	} else {
			// correct offset: the loop stepped one byte past the character start
		$i++;
	}

	return $i;
}
2049
/**
 * Translates an 'absolute' byte position into a character position.
 *
 * The byte position is expected to point at the first byte of a character,
 * as returned by strpos() / strrpos() for a valid UTF-8 needle.
 *
 * @param string $str UTF-8 string
 * @param integer $pos byte position
 * @return integer character position, or FALSE if $str is empty or $pos lies outside the string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_byte2char_pos($str, $pos) {
	$str = (string) $str;
	$pos = (int) $pos;
	$length = strlen($str);

		// Reject positions outside the string up front. The original counted every
		// non-existing byte beyond the end as one character and, since PHP 7.1,
		// accepted negative positions as offsets from the end of the string.
	if ($length === 0 || $pos < 0 || $pos > $length) {
		return FALSE;
	} // offset beyond string length

	$n = 0; // number of characters
	for ($i = $pos; $i > 0; $i--) {
			// The position directly behind the last byte counts as a character
			// boundary; otherwise count single-byte chars and multi-byte lead bytes.
		if ($i >= $length || (ord($str[$i]) & 0xC0) != 0x80) {
			$n++;
		}
	}

	return $n;
}
2078
/**
 * Maps all characters of an UTF-8 string.
 *
 * The input is returned unchanged when initUnicodeData() reports failure
 * or when $mode is neither 'case' nor 'ascii'.
 *
 * @param string $str UTF-8 string
 * @param string $mode mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
 * @return string the converted string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char_mapping($str, $mode, $opt = '') {
	if (!$this->initUnicodeData($mode)) {
		return $str;
	} // do nothing

	switch ($mode) {
		case 'case':
			$map =& $this->caseFolding['utf-8'][$opt];
			break;

		case 'ascii':
			$map =& $this->toASCII['utf-8'];
			break;

		default:
			return $str;
	}

	$str = (string) $str;
	$length = strlen($str);
	$out = '';
	for ($i = 0; $i < $length; $i++) {
		$c = ord($str[$i]);

		if (($c & 0xC0) == 0x80) {
				// Stray continuation byte (malformed input): copy it through as is.
				// The original fell through with the previous character still in
				// $mbc and emitted that character a second time.
			$out .= $str[$i];
			continue;
		}

		if (!($c & 0x80)) { // single-byte (0xxxxxx)
			$mbc = $str[$i];
		} else { // multi-byte starting byte (11xxxxxx)
			for ($bc = 0; $c & 0x80; $c = $c << 1) {
				$bc++;
			} // calculate number of bytes
			$mbc = substr($str, $i, $bc);
			$i += $bc - 1;
		}

			// characters without a mapping entry are copied verbatim
		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
2130
2131
2132 /********************************************
2133 *
2134 * Internal EUC string operation functions
2135 *
2136 * Extended Unix Code:
2137 * ASCII compatible 7bit single bytes chars
2138 * 8bit two byte chars
2139 *
2140 * Shift-JIS is treated as a special case.
2141 *
2142 ********************************************/
2143
/**
 * Cuts a string in the EUC charset family short at a given byte length.
 * A double-byte character that would be cut by the limit is dropped entirely.
 *
 * @param string $str EUC multibyte character string
 * @param integer $len the byte length
 * @param string $charset the charset ('shift_jis' gets its own lead byte ranges)
 * @return string the shortened string
 * @see mb_strcut()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strtrunc($str, $len, $charset) {
	$str = (string) $str;

		// String shorter than (or exactly as long as) the supplied length. Checked
		// up front: the original tested "scan reached the end of the string" after
		// the loop and therefore returned the complete string when its last
		// double-byte character straddled the limit.
	if (strlen($str) <= $len) {
		return $str;
	}

	$sjis = ($charset == 'shift_jis');
		// $i stays below $len, which is below strlen($str), so every access is in range
	for ($i = 0; $i < $len; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) {
				$i++;
			} // advance a double-byte char
		} else {
			if ($c >= 0x80) {
				$i++;
			} // advance a double-byte char
		}
	}

	if ($i > $len) {
		return substr($str, 0, $len - 1); // we ended on a first byte
	}
	return substr($str, 0, $len);
}
2179
/**
 * Returns a part of a string in the EUC charset family.
 * Boundary handling mirrors utf8_substr().
 *
 * @param string $str EUC multibyte character string
 * @param integer $start start position (character position, negative values count from the end)
 * @param string $charset the charset
 * @param integer $len length (in characters); NULL means "up to the end of the string"
 * @return string the substring, or FALSE if a positive $start lies outside the string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_substr($str, $start, $charset, $len = NULL) {
		// An explicit zero length yields an empty string. Without this check the
		// loose NULL comparison below treated 0 as "no length given" and returned
		// the whole remainder.
	if ($len !== NULL && !strcmp((string) $len, '0')) {
		return '';
	}

	$byte_start = $this->euc_char2byte_pos($str, $start, $charset);
	if ($byte_start === FALSE) {
		if ($start > 0) {
			return FALSE; // $start outside string length
		}
			// zero or negative start reaching the beginning of the string
		$byte_start = 0;
	}

	$str = substr($str, $byte_start);

		// loose comparison kept on purpose: NULL, '' and FALSE all mean "no length given"
	if ($len == NULL) {
		return $str;
	}

	$byte_end = $this->euc_char2byte_pos($str, $len, $charset);
	if ($byte_end === FALSE) {
			// $len outside actual string length: a negative length that reaches
			// the start of the remainder leaves nothing, a positive one leaves everything
		return $len < 0 ? '' : $str;
	}

	return substr($str, 0, $byte_end);
}
2213
/**
 * Counts the number of characters of a string in the EUC charset family.
 *
 * @param string $str EUC multibyte character string
 * @param string $charset the charset ('shift_jis' gets its own lead byte ranges)
 * @return integer the number of characters
 * @see strlen()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strlen($str, $charset) {
	$str = (string) $str;
	$length = strlen($str);
	$sjis = ($charset == 'shift_jis');
	$n = 0;
		// Explicit length bound instead of strlen($str{$i}): the curly-brace offset
		// syntax was removed in PHP 8 and the old test read past the end of the string.
	for ($i = 0; $i < $length; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) {
				$i++;
			} // advance a double-byte char
		} else {
			if ($c >= 0x80) {
				$i++;
			} // advance a double-byte char
		}

		$n++;
	}

	return $n;
}
2244
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * For $pos >= 0 the string is scanned forwards; FALSE is returned when the scan
 * runs off the end of the string. For $pos < 0 the string is scanned backwards
 * from the last byte; FALSE is returned when the requested character is the
 * first one of the string or lies before it.
 *
 * @param string $str EUC multibyte character string
 * @param integer $pos character position (negative values start from the end)
 * @param string $charset the charset ('shift_jis' gets its own lead byte ranges)
 * @return integer byte position, or FALSE (see above)
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char2byte_pos($str, $pos, $charset) {
	$str = (string) $str;
	$length = strlen($str);
	$sjis = ($charset == 'shift_jis');
	$n = 0; // number of characters seen
	$p = abs($pos); // number of characters wanted

	if ($pos >= 0) {
		$i = 0;
		$d = 1;
	} else {
		$i = $length - 1;
		$d = -1;
	}

		// Explicit bounds on both sides: since PHP 7.1 a negative string offset
		// addresses bytes from the end, so the former strlen($str{$i}) test no longer
		// stopped the backward scan at the beginning of the string.
	for (; $i >= 0 && $i < $length && $n < $p; $i += $d) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) {
				$i += $d;
			} // advance a double-byte char
		} else {
			if ($c >= 0x80) {
				$i += $d;
			} // advance a double-byte char
		}

		$n++;
	}
	if ($i < 0 || $i >= $length) {
		return FALSE;
	} // offset beyond string length

	if ($pos < 0) {
		$i++;
	} // correct offset

	return $i;
}
2292
/**
 * Maps all characters of a string in the EUC charset family.
 *
 * The input is returned unchanged when $mode is neither 'case' nor 'ascii',
 * or when the corresponding init method reports failure for $charset.
 *
 * @param string $str EUC multibyte character string
 * @param string $charset the charset ('shift_jis' gets its own lead byte ranges)
 * @param string $mode mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
 * @return string the converted string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char_mapping($str, $charset, $mode, $opt = '') {
	switch ($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) {
				return $str;
			} // do nothing
			$map =& $this->caseFolding[$charset][$opt];
			break;

		case 'ascii':
			if (!$this->initToASCII($charset)) {
				return $str;
			} // do nothing
			$map =& $this->toASCII[$charset];
			break;

		default:
			return $str;
	}

	$str = (string) $str;
	$length = strlen($str);
	$sjis = ($charset == 'shift_jis');
	$out = '';
		// Explicit length bound instead of strlen($str{$i}): the curly-brace offset
		// syntax was removed in PHP 8 and the old test read past the end of the string.
	for ($i = 0; $i < $length; $i++) {
		$mbc = $str[$i];
		$c = ord($mbc);

		if ($sjis) {
			$isDoubleByte = ($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0);
		} else {
			$isDoubleByte = ($c >= 0x80);
		}
		if ($isDoubleByte) { // a double-byte char
			$mbc = substr($str, $i, 2);
			$i++;
		}

			// characters without a mapping entry are copied verbatim
		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
2351
2352 }
2353
2354 if (defined('TYPO3_MODE') && isset($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php'])) {
2355 include_once($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
2356 }
2357
2358 ?>