[BUGFIX] Using datetime field with datepicker the time information gets lost
[Packages/TYPO3.CMS.git] / t3lib / class.t3lib_cs.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2003-2011 Kasper Skårhøj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the Typo3 project. The Typo3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 *
17 * This script is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * This copyright notice MUST APPEAR in all copies of the script!
23 ***************************************************************/
24 /**
25 * Class for conversion between charsets.
26 *
27 * $Id$
28 *
29 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
30 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
31 */
32 /**
33 * [CLASS/FUNCTION INDEX of SCRIPT]
34 *
35 *
36 *
37 * 136: class t3lib_cs
38 * 488: function parse_charset($charset)
39 * 507: function get_locale_charset($locale)
40 *
41 * SECTION: Charset Conversion functions
42 * 560: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
43 * 600: function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0)
44 * 617: function utf8_encode($str,$charset)
45 * 663: function utf8_decode($str,$charset,$useEntityForNoChar=0)
46 * 706: function utf8_to_entities($str)
47 * 739: function entities_to_utf8($str,$alsoStdHtmlEnt=0)
48 * 773: function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
49 * 823: function UnumberToChar($cbyte)
50 * 868: function utf8CharToUnumber($str,$hex=0)
51 *
52 * SECTION: Init functions
53 * 911: function initCharset($charset)
54 * 973: function initUnicodeData($mode=null)
55 * 1198: function initCaseFolding($charset)
56 * 1260: function initToASCII($charset)
57 *
58 * SECTION: String operation functions
59 * 1331: function substr($charset,$string,$start,$len=null)
60 * 1384: function strlen($charset,$string)
61 * 1414: function crop($charset,$string,$len,$crop='')
62 * 1467: function strtrunc($charset,$string,$len)
63 * 1501: function conv_case($charset,$string,$case)
64 * 1527: function specCharsToASCII($charset,$string)
65 *
66 * SECTION: Internal string operation functions
67 * 1567: function sb_char_mapping($str,$charset,$mode,$opt='')
68 *
69 * SECTION: Internal UTF-8 string operation functions
70 * 1622: function utf8_substr($str,$start,$len=null)
71 * 1655: function utf8_strlen($str)
72 * 1676: function utf8_strtrunc($str,$len)
73 * 1698: function utf8_strpos($haystack,$needle,$offset=0)
74 * 1723: function utf8_strrpos($haystack,$needle)
75 * 1745: function utf8_char2byte_pos($str,$pos)
76 * 1786: function utf8_byte2char_pos($str,$pos)
77 * 1809: function utf8_char_mapping($str,$mode,$opt='')
78 *
79 * SECTION: Internal EUC string operation functions
80 * 1885: function euc_strtrunc($str,$len,$charset)
81 * 1914: function euc_substr($str,$start,$charset,$len=null)
82 * 1939: function euc_strlen($str,$charset)
83 * 1966: function euc_char2byte_pos($str,$pos,$charset)
84 * 2007: function euc_char_mapping($str,$charset,$mode,$opt='')
85 *
86 * TOTAL FUNCTIONS: 35
87 * (This index is automatically created/updated by the extension "extdeveval")
88 *
89 */
90
91
92 /**
93 * Notes on UTF-8
94 *
95 * Functions working on UTF-8 strings:
96 *
97 * - strchr/strstr
98 * - strrchr
99 * - substr_count
100 * - implode/explode/join
101 *
102 * Functions nearly working on UTF-8 strings:
103 *
104 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
105 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
106 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
107 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
108 * - preg_*: Support compiled into PHP by default nowadays, but could be unavailable, need to use modifier
109 *
110 * Functions NOT working on UTF-8 strings:
111 *
112 * - str*cmp
113 * - stristr
114 * - stripos
115 * - substr
116 * - strrev
117 * - split/spliti
118 * - ...
119 *
120 */
121 /**
122 * Class for conversion between charsets
123 *
124 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
125 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
126 * @package TYPO3
127 * @subpackage t3lib
128 */
129 class t3lib_cs {
130 var $noCharByteVal = 63; // ASCII Value for chars with no equivalent.
131
132 // This is the array where parsed conversion tables are stored (cached)
133 var $parsedCharsets = array();
134
135 // An array where case folding data will be stored (cached)
136 var $caseFolding = array();
137
138 // An array where charset-to-ASCII mappings are stored (cached)
139 var $toASCII = array();
140
141 // This tells the converter which charsets has two bytes per char:
142 var $twoByteSets = array(
143 'ucs-2' => 1, // 2-byte Unicode
144 );
145
146 // This tells the converter which charsets has four bytes per char:
147 var $fourByteSets = array(
148 'ucs-4' => 1, // 4-byte Unicode
149 'utf-32' => 1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
150 );
151
152 // This tells the converter which charsets use a scheme like the Extended Unix Code:
153 var $eucBasedSets = array(
154 'gb2312' => 1, // Chinese, simplified.
155 'big5' => 1, // Chinese, traditional.
156 'euc-kr' => 1, // Korean
157 'shift_jis' => 1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
158 );
159
160 // see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
161 // http://czyborra.com/charsets/iso8859.html
162 var $synonyms = array(
163 'us' => 'ascii',
164 'us-ascii' => 'ascii',
165 'cp819' => 'iso-8859-1',
166 'ibm819' => 'iso-8859-1',
167 'iso-ir-100' => 'iso-8859-1',
168 'iso-ir-101' => 'iso-8859-2',
169 'iso-ir-109' => 'iso-8859-3',
170 'iso-ir-110' => 'iso-8859-4',
171 'iso-ir-144' => 'iso-8859-5',
172 'iso-ir-127' => 'iso-8859-6',
173 'iso-ir-126' => 'iso-8859-7',
174 'iso-ir-138' => 'iso-8859-8',
175 'iso-ir-148' => 'iso-8859-9',
176 'iso-ir-157' => 'iso-8859-10',
177 'iso-ir-179' => 'iso-8859-13',
178 'iso-ir-199' => 'iso-8859-14',
179 'iso-ir-203' => 'iso-8859-15',
180 'csisolatin1' => 'iso-8859-1',
181 'csisolatin2' => 'iso-8859-2',
182 'csisolatin3' => 'iso-8859-3',
183 'csisolatin5' => 'iso-8859-9',
184 'csisolatin8' => 'iso-8859-14',
185 'csisolatin9' => 'iso-8859-15',
186 'csisolatingreek' => 'iso-8859-7',
187 'iso-celtic' => 'iso-8859-14',
188 'latin1' => 'iso-8859-1',
189 'latin2' => 'iso-8859-2',
190 'latin3' => 'iso-8859-3',
191 'latin5' => 'iso-8859-9',
192 'latin6' => 'iso-8859-10',
193 'latin8' => 'iso-8859-14',
194 'latin9' => 'iso-8859-15',
195 'l1' => 'iso-8859-1',
196 'l2' => 'iso-8859-2',
197 'l3' => 'iso-8859-3',
198 'l5' => 'iso-8859-9',
199 'l6' => 'iso-8859-10',
200 'l8' => 'iso-8859-14',
201 'l9' => 'iso-8859-15',
202 'cyrillic' => 'iso-8859-5',
203 'arabic' => 'iso-8859-6',
204 'tis-620' => 'iso-8859-11',
205 'win874' => 'windows-874',
206 'win1250' => 'windows-1250',
207 'win1251' => 'windows-1251',
208 'win1252' => 'windows-1252',
209 'win1253' => 'windows-1253',
210 'win1254' => 'windows-1254',
211 'win1255' => 'windows-1255',
212 'win1256' => 'windows-1256',
213 'win1257' => 'windows-1257',
214 'win1258' => 'windows-1258',
215 'cp1250' => 'windows-1250',
216 'cp1251' => 'windows-1251',
217 'cp1252' => 'windows-1252',
218 'ms-ee' => 'windows-1250',
219 'ms-ansi' => 'windows-1252',
220 'ms-greek' => 'windows-1253',
221 'ms-turk' => 'windows-1254',
222 'winbaltrim' => 'windows-1257',
223 'koi-8ru' => 'koi-8r',
224 'koi8r' => 'koi-8r',
225 'cp878' => 'koi-8r',
226 'mac' => 'macroman',
227 'macintosh' => 'macroman',
228 'euc-cn' => 'gb2312',
229 'x-euc-cn' => 'gb2312',
230 'euccn' => 'gb2312',
231 'cp936' => 'gb2312',
232 'big-5' => 'big5',
233 'cp950' => 'big5',
234 'eucjp' => 'euc-jp',
235 'sjis' => 'shift_jis',
236 'shift-jis' => 'shift_jis',
237 'cp932' => 'shift_jis',
238 'cp949' => 'euc-kr',
239 'utf7' => 'utf-7',
240 'utf8' => 'utf-8',
241 'utf16' => 'utf-16',
242 'utf32' => 'utf-32',
243 'utf8' => 'utf-8',
244 'ucs2' => 'ucs-2',
245 'ucs4' => 'ucs-4',
246 );
247
// Mapping of language identifiers to script (language family) names.
// Three kinds of keys are supported: ISO 639-1 codes, Microsoft three-letter
// language codes and lowercase English language names. The script names are
// the keys used by $script_to_charset_unix / $script_to_charset_windows.
var $lang_to_script = array(
		// iso-639-1 language codes, see http://www.loc.gov/standards/iso639-2/php/code_list.php
	'ar' => 'arabic',
	'bg' => 'cyrillic', // Bulgarian
	'bs' => 'east_european', // Bosnian
	'cs' => 'east_european', // Czech
	'da' => 'west_european', // Danish
	'de' => 'west_european', // German
	'es' => 'west_european', // Spanish
	'et' => 'estonian',
	'eo' => 'unicode', // Esperanto
	'eu' => 'west_european', // Basque
	'fa' => 'arabic', // Persian
	'fi' => 'west_european', // Finnish
	'fo' => 'west_european', // Faroese
	'fr' => 'west_european', // French
	'ga' => 'west_european', // Irish
	'gl' => 'west_european', // Galician
	'el' => 'greek', // Greek (the actual ISO 639-1 code)
	'gr' => 'greek', // Greek (not an ISO 639-1 code; kept for backward compatibility)
	'he' => 'hebrew', // Hebrew (since 1998)
	'hi' => 'unicode', // Hindi
	'hr' => 'east_european', // Croatian
	'hu' => 'east_european', // Hungarian
	'iw' => 'hebrew', // Hebrew (til 1998)
	'is' => 'west_european', // Icelandic
	'it' => 'west_european', // Italian
	'ja' => 'japanese',
	'ka' => 'unicode', // Georgian
	'kl' => 'west_european', // Greenlandic
	'km' => 'unicode', // Khmer
	'ko' => 'korean',
	'lt' => 'lithuanian',
	'lv' => 'west_european', // Latvian/Lettish
	'nl' => 'west_european', // Dutch
	'no' => 'west_european', // Norwegian
	'nb' => 'west_european', // Norwegian Bokmal
	'nn' => 'west_european', // Norwegian Nynorsk
	'pl' => 'east_european', // Polish
	'pt' => 'west_european', // Portuguese
	'ro' => 'east_european', // Romanian
	'ru' => 'cyrillic', // Russian
	'sk' => 'east_european', // Slovak
	'sl' => 'east_european', // Slovenian
	'sr' => 'cyrillic', // Serbian
	'sv' => 'west_european', // Swedish
	'sq' => 'albanian', // Albanian
	'th' => 'thai',
	'tr' => 'turkish', // Turkish (was missing although 'trk' and 'turkish' were mapped)
	'uk' => 'cyrillic', // Ukrainian
	'vi' => 'vietnamese',
	'zh' => 'chinese',
		// MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
		// http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
	'ara' => 'arabic',
	'bgr' => 'cyrillic', // Bulgarian
	'cat' => 'west_european', // Catalan
	'chs' => 'simpl_chinese',
	'cht' => 'trad_chinese',
	'csy' => 'east_european', // Czech
	'dan' => 'west_european', // Danish
	'deu' => 'west_european', // German
	'dea' => 'west_european', // German (Austrian)
	'des' => 'west_european', // German (Swiss)
	'ena' => 'west_european', // English (Australian)
	'enc' => 'west_european', // English (Canadian)
	'eng' => 'west_european', // English
	'enz' => 'west_european', // English (New Zealand)
	'enu' => 'west_european', // English (United States)
	'euq' => 'west_european', // Basque
	'fos' => 'west_european', // Faroese
	'far' => 'arabic', // Persian
	'fin' => 'west_european', // Finnish
	'fra' => 'west_european', // French
	'frb' => 'west_european', // French (Belgian)
	'frc' => 'west_european', // French (Canadian)
	'frs' => 'west_european', // French (Swiss)
	'geo' => 'unicode', // Georgian
	'glg' => 'west_european', // Galician
	'ell' => 'greek',
	'heb' => 'hebrew',
	'hin' => 'unicode', // Hindi
	'hun' => 'east_european', // Hungarian
	'isl' => 'west_european', // Icelandic (value was misspelled 'west_euorpean', which matches no script)
	'ita' => 'west_european', // Italian
	'its' => 'west_european', // Italian (Swiss)
	'jpn' => 'japanese',
	'khm' => 'unicode', // Khmer
	'kor' => 'korean',
	'lth' => 'lithuanian',
	'lvi' => 'west_european', // Latvian/Lettish
	'msl' => 'west_european', // Malay
	'nlb' => 'west_european', // Dutch (Belgian)
	'nld' => 'west_european', // Dutch
	'nor' => 'west_european', // Norwegian (bokmal)
	'non' => 'west_european', // Norwegian (nynorsk)
	'plk' => 'east_european', // Polish
	'ptg' => 'west_european', // Portuguese
	'ptb' => 'west_european', // Portuguese (Brazil)
	'rom' => 'east_european', // Romanian
	'rus' => 'cyrillic', // Russian
	'slv' => 'east_european', // Slovenian
	'sky' => 'east_european', // Slovak
	'srl' => 'east_european', // Serbian (Latin)
	'srb' => 'cyrillic', // Serbian (Cyrillic)
	'esp' => 'west_european', // Spanish (trad. sort)
	'esm' => 'west_european', // Spanish (Mexican)
	'esn' => 'west_european', // Spanish (internat. sort)
	'sve' => 'west_european', // Swedish
	'sqi' => 'albanian', // Albanian
	'tha' => 'thai',
	'trk' => 'turkish',
	'ukr' => 'cyrillic', // Ukrainian
		// English language names
	'albanian' => 'albanian',
	'arabic' => 'arabic',
	'basque' => 'west_european',
	'bosnian' => 'east_european',
	'bulgarian' => 'cyrillic', // was 'east_european', contradicting 'bg' and 'bgr' above
	'catalan' => 'west_european',
	'croatian' => 'east_european',
	'czech' => 'east_european',
	'danish' => 'west_european',
	'dutch' => 'west_european',
	'english' => 'west_european',
	'esperanto' => 'unicode',
	'estonian' => 'estonian',
	'faroese' => 'west_european',
	'farsi' => 'arabic',
	'finnish' => 'west_european',
	'french' => 'west_european',
	'galician' => 'west_european',
	'georgian' => 'unicode',
	'german' => 'west_european',
	'greek' => 'greek',
	'greenlandic' => 'west_european',
	'hebrew' => 'hebrew',
	'hindi' => 'unicode',
	'hungarian' => 'east_european',
	'icelandic' => 'west_european',
	'italian' => 'west_european',
	'khmer' => 'unicode',
	'latvian' => 'west_european',
	'lettish' => 'west_european',
	'lithuanian' => 'lithuanian',
	'malay' => 'west_european',
	'norwegian' => 'west_european',
	'persian' => 'arabic',
	'polish' => 'east_european',
	'portuguese' => 'west_european',
	'russian' => 'cyrillic',
	'romanian' => 'east_european',
	'serbian' => 'cyrillic',
	'slovak' => 'east_european',
	'slovenian' => 'east_european',
	'spanish' => 'west_european',
	'swedish' => 'west_european',
	'svedish' => 'west_european', // legacy misspelling of 'swedish', kept for backward compatibility
	'thai' => 'thai',
	'that' => 'thai', // legacy misspelling of 'thai', kept for backward compatibility
	'turkish' => 'turkish',
	'ukrainian' => 'cyrillic',
);
408
409 // mapping of language (family) names to charsets on Unix
410 var $script_to_charset_unix = array(
411 'west_european' => 'iso-8859-1',
412 'estonian' => 'iso-8859-1',
413 'east_european' => 'iso-8859-2',
414 'baltic' => 'iso-8859-4',
415 'cyrillic' => 'iso-8859-5',
416 'arabic' => 'iso-8859-6',
417 'greek' => 'iso-8859-7',
418 'hebrew' => 'iso-8859-8',
419 'turkish' => 'iso-8859-9',
420 'thai' => 'iso-8859-11', // = TIS-620
421 'lithuanian' => 'iso-8859-13',
422 'chinese' => 'gb2312', // = euc-cn
423 'japanese' => 'euc-jp',
424 'korean' => 'euc-kr',
425 'simpl_chinese' => 'gb2312',
426 'trad_chinese' => 'big5',
427 'vietnamese' => '',
428 'unicode' => 'utf-8',
429 'albanian' => 'utf-8'
430 );
431
432 // mapping of language (family) names to charsets on Windows
433 var $script_to_charset_windows = array(
434 'east_european' => 'windows-1250',
435 'cyrillic' => 'windows-1251',
436 'west_european' => 'windows-1252',
437 'greek' => 'windows-1253',
438 'turkish' => 'windows-1254',
439 'hebrew' => 'windows-1255',
440 'arabic' => 'windows-1256',
441 'baltic' => 'windows-1257',
442 'estonian' => 'windows-1257',
443 'lithuanian' => 'windows-1257',
444 'vietnamese' => 'windows-1258',
445 'thai' => 'cp874',
446 'korean' => 'cp949',
447 'chinese' => 'gb2312',
448 'japanese' => 'shift_jis',
449 'simpl_chinese' => 'gb2312',
450 'trad_chinese' => 'big5',
451 'albanian' => 'windows-1250',
452 'unicode' => 'utf-8'
453 );
454
// Mapping of complete locale names to charsets.
// Keys must be lowercase: get_locale_charset() lowercases the locale string
// before looking it up here, so a key with uppercase letters can never match.
var $locale_to_charset = array(
	'japanese.euc' => 'euc-jp',
	'ja_jp.ujis' => 'euc-jp',
	'korean.euc' => 'euc-kr',
	'sr@latn' => 'iso-8859-2', // was 'sr@Latn', which was unreachable after lowercasing
	'zh_cn' => 'gb2312',
	'zh_hk' => 'big5',
	'zh_tw' => 'big5',
);
465
466 // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
467 // Empty values means "iso-8859-1"
468 var $charSetArray = array(
469 'dk' => '',
470 'de' => '',
471 'no' => '',
472 'it' => '',
473 'fr' => '',
474 'es' => '',
475 'nl' => '',
476 'cz' => 'windows-1250',
477 'pl' => 'iso-8859-2',
478 'si' => 'windows-1250',
479 'fi' => '',
480 'tr' => 'iso-8859-9',
481 'se' => '',
482 'pt' => '',
483 'ru' => 'windows-1251',
484 'ro' => 'iso-8859-2',
485 'ch' => 'gb2312',
486 'sk' => 'windows-1250',
487 'lt' => 'windows-1257',
488 'is' => 'utf-8',
489 'hr' => 'windows-1250',
490 'hu' => 'iso-8859-2',
491 'gl' => '',
492 'th' => 'iso-8859-11',
493 'gr' => 'iso-8859-7',
494 'hk' => 'big5',
495 'eu' => '',
496 'bg' => 'windows-1251',
497 'br' => '',
498 'et' => 'iso-8859-4',
499 'ar' => 'iso-8859-6',
500 'he' => 'utf-8',
501 'ua' => 'windows-1251',
502 'jp' => 'shift_jis',
503 'lv' => 'utf-8',
504 'vn' => 'utf-8',
505 'ca' => 'iso-8859-15',
506 'ba' => 'iso-8859-2',
507 'kr' => 'euc-kr',
508 'eo' => 'utf-8',
509 'my' => '',
510 'hi' => 'utf-8',
511 'fo' => 'utf-8',
512 'fa' => 'utf-8',
513 'sr' => 'utf-8',
514 'sq' => 'utf-8',
515 'ge' => 'utf-8',
516 'ga' => '',
517 'km' => 'utf-8',
518 'qc' => '',
519 );
520
521 // TYPO3 specific: Array with the iso names used for each system language in TYPO3:
522 // Missing keys means: same as Typo3
523 var $isoArray = array(
524 'ba' => 'bs',
525 'br' => 'pt_BR',
526 'ch' => 'zh_CN',
527 'cz' => 'cs',
528 'dk' => 'da',
529 'si' => 'sl',
530 'se' => 'sv',
531 'gl' => 'kl',
532 'gr' => 'el',
533 'hk' => 'zh_HK',
534 'kr' => 'ko',
535 'ua' => 'uk',
536 'jp' => 'ja',
537 'qc' => 'fr_CA',
538 'vn' => 'vi',
539 'ge' => 'ka',
540 'ga' => 'gl',
541 );
542
/**
 * Normalizes a charset name: lowercases it, strips surrounding whitespace
 * and replaces a known alias by its canonical name from $this->synonyms.
 *
 * @param string $charset Input charset
 * @return string Normalized charset
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function parse_charset($charset) {
	$normalized = trim(strtolower($charset));

	return isset($this->synonyms[$normalized])
		? $this->synonyms[$normalized]
		: $normalized;
}
558
/**
 * Get the charset of a locale.
 *
 * Accepted locale forms:
 *   ln             language
 *   ln_CN          language / country
 *   ln_CN.cs       language / country / charset
 *   ln_CN.cs@mod   language / country / charset / modifier
 *
 * Resolution order:
 *   1. exact match of the lowercased locale in $this->locale_to_charset
 *   2. explicit charset part of the locale (normalized by parse_charset())
 *   3. modifier "euro" => iso-8859-15
 *   4. language => script => OS specific charset table; unknown languages
 *      or scripts fall back to windows-1252 (TYPO3_OS == 'WIN') or iso-8859-1.
 *
 * @param string $locale Locale string
 * @return string Charset resolved for locale string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function get_locale_charset($locale) {
	$locale = strtolower($locale);

		// exact locale specific charset?
	if (isset($this->locale_to_charset[$locale])) {
		return $this->locale_to_charset[$locale];
	}

		// get modifier; array_pad() guarantees both list() slots exist when there is no "@"
	list($locale, $modifier) = array_pad(explode('@', $locale), 2, '');

		// locale contains charset: use it
	list($locale, $charset) = array_pad(explode('.', $locale), 2, '');
	if ($charset !== '') {
		return $this->parse_charset($charset);
	}

		// modifier is 'euro' (after charset check, because of xx.utf-8@euro)
	if ($modifier == 'euro') {
		return 'iso-8859-15';
	}

		// get language (the country part is not needed for the lookup)
	$localeParts = explode('_', $locale);
	$language = $localeParts[0];

		// unknown languages resolve to an empty script name, which selects the default charset below
	$script = isset($this->lang_to_script[$language]) ? $this->lang_to_script[$language] : '';

	if (TYPO3_OS == 'WIN') {
		$scriptToCharset = $this->script_to_charset_windows;
		$defaultCharset = 'windows-1252';
	} else {
		$scriptToCharset = $this->script_to_charset_unix;
		$defaultCharset = 'iso-8859-1';
	}

		// empty table entries (e.g. 'vietnamese' on Unix) also select the default
	return !empty($scriptToCharset[$script]) ? $scriptToCharset[$script] : $defaultCharset;
}
607
608
609 /********************************************
610 *
611 * Charset Conversion functions
612 *
613 ********************************************/
614
/**
 * Converts a string from one charset to another.
 *
 * If the target is UTF-8, or no entity fallback is requested, the PHP
 * extension named in $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']
 * ('mbstring', 'iconv' or 'recode') is tried first. When no such method is
 * configured, or the extension call returns FALSE, the string is converted
 * by this class via UTF-8 (utf8_encode() followed by utf8_decode()).
 *
 * @param string $str Input string
 * @param string $fromCS From charset (the current charset of the string)
 * @param string $toCS To charset (the output charset wanted)
 * @param boolean $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Converted string
 * @see convArray()
 */
function conv($str, $fromCS, $toCS, $useEntityForNoChar = 0) {
		// nothing to do when source and target charset are the same
	if ($fromCS == $toCS) {
		return $str;
	}

		// PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
	if ($toCS == 'utf-8' || !$useEntityForNoChar) {
		$method = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'];
		$converted = FALSE;

		if ($method == 'mbstring') {
				// returns FALSE for unsupported charsets
			$converted = mb_convert_encoding($str, $toCS, $fromCS);
		} elseif ($method == 'iconv') {
			$converted = iconv($fromCS, $toCS . '//TRANSLIT', $str);
		} elseif ($method == 'recode') {
			$converted = recode_string($fromCS . '..' . $toCS, $str);
		}

		if (FALSE !== $converted) {
			return $converted;
		}
			// otherwise: fallback to TYPO3 conversion
	}

		// TYPO3 conversion, using UTF-8 as the intermediate representation
	$utf8 = ($fromCS != 'utf-8') ? $this->utf8_encode($str, $fromCS) : $str;

	return ($toCS != 'utf-8') ? $this->utf8_decode($utf8, $toCS, $useEntityForNoChar) : $utf8;
}
665
/**
 * Converts every string element of an array from one charset to another.
 * Nested arrays are processed recursively; elements that are neither
 * strings nor arrays are left untouched.
 * NOTICE: Array is passed by reference and modified in place!
 *
 * @param array $array Input array, possibly multidimensional
 * @param string $fromCS From charset (the current charset of the string)
 * @param string $toCS To charset (the output charset wanted)
 * @param boolean $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return void
 * @see conv()
 */
function convArray(&$array, $fromCS, $toCS, $useEntityForNoChar = 0) {
	foreach ($array as $index => $element) {
		if (is_string($element)) {
			$array[$index] = $this->conv($element, $fromCS, $toCS, $useEntityForNoChar);
		} elseif (is_array($element)) {
				// recurse on the original slot so the nested array is changed in place
			$this->convArray($array[$index], $fromCS, $toCS, $useEntityForNoChar);
		}
	}
}
686
/**
 * Converts $str from $charset to UTF-8
 *
 * Bytes 0-127 are copied unchanged (except for two-byte charsets, where
 * every character is read as a big endian 16 bit number). All other
 * characters are looked up in $this->parsedCharsets[$charset]['local'];
 * characters without an entry are replaced by chr($this->noCharByteVal).
 *
 * NOTE: if initCharset() reports failure, no value is returned (NULL).
 *
 * @param string $str String in local charset to convert to UTF-8
 * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return string Output string, converted to UTF-8
 */
function utf8_encode($str, $charset) {

	if ($charset === 'utf-8') {
		return $str;
	}

		// Charset is case-insensitive.
	if ($this->initCharset($charset)) { // Parse conv. table if not already...
		$strLen = strlen($str);
		$outStr = '';

			// These properties do not change while looping, so look them up once
		$isTwoByteSet = isset($this->twoByteSets[$charset]);
		$isEucBasedSet = isset($this->eucBasedSets[$charset]);
		$noChar = chr($this->noCharByteVal);

		for ($a = 0; $a < $strLen; $a++) { // Traverse each char in string.
			$chr = substr($str, $a, 1);
			$ord = ord($chr);
			if ($isTwoByteSet) { // If the charset has two bytes per char
					// substr() instead of a string offset: yields an empty string (ord = 0) when
					// the input ends in the middle of a character, and avoids the $str{...}
					// syntax that newer PHP versions no longer accept.
				$a++;
				$ord2 = ord(substr($str, $a, 1));
				$ord = ($ord << 8) | $ord2; // assume big endian

					// Use the parsed conversion table entry, otherwise the "no char" placeholder
				$outStr .= isset($this->parsedCharsets[$charset]['local'][$ord])
					? $this->parsedCharsets[$charset]['local'][$ord]
					: $noChar;
			} elseif ($ord > 127) { // Values over 127 need a lookup in the conversion table
					// EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
					// Shift-JIS: chars between 160 and 223 are single byte
				if ($isEucBasedSet && ($charset != 'shift_jis' || ($ord < 0xA0 || $ord > 0xDF))) {
					$a++;
					$ord2 = ord(substr($str, $a, 1));
					$ord = $ord * 256 + $ord2;
				}

					// Use the parsed conversion table entry, otherwise the "no char" placeholder
				$outStr .= isset($this->parsedCharsets[$charset]['local'][$ord])
					? $this->parsedCharsets[$charset]['local'][$ord]
					: $noChar;
			} else {
				$outStr .= $chr; // ... otherwise it's just ASCII 0-127 and one byte. Transparent
			}
		}
		return $outStr;
	}
}
739
/**
 * Converts $str from UTF-8 to $charset
 *
 * ASCII bytes are copied unchanged. Each multibyte sequence is looked up in
 * $this->parsedCharsets[$charset]['utf8']; a sequence without an entry is
 * replaced by a numeric entity (if $useEntityForNoChar is set) or by
 * chr($this->noCharByteVal). A continuation byte found where a lead byte is
 * expected is also replaced by chr($this->noCharByteVal).
 *
 * NOTE: if initCharset() reports failure, no value is returned (NULL).
 *
 * @param string $str String in UTF-8 to convert to local charset
 * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param boolean $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Output string, converted to local charset
 */
function utf8_decode($str, $charset, $useEntityForNoChar = 0) {

	if ($charset === 'utf-8') {
		return $str;
	}

		// Charset is case-insensitive. Parse conv. table if not already...
	if (!$this->initCharset($charset)) {
		return;
	}

	$length = strlen($str);
	$result = '';

	for ($pos = 0; $pos < $length; $pos++) { // Traverse each char in UTF-8 string.
		$char = substr($str, $pos, 1);
		$lead = ord($char);

			// ASCII 0-127 is one byte and passes through unchanged
		if ($lead <= 127) {
			$result .= $char;
			continue;
		}

			// A lead byte must have the 7th bit set; otherwise we are in the MIDDLE of a sequence
		if (!($lead & 64)) {
			$result .= chr($this->noCharByteVal);
			continue;
		}

			// Every further leading 1-bit of the lead byte announces one continuation byte
		$sequence = $char;
		for ($shift = 1; $shift <= 8 && (($lead << $shift) & 128); $shift++) {
			$pos++;
			$sequence .= substr($str, $pos, 1);
		}

		if (isset($this->parsedCharsets[$charset]['utf8'][$sequence])) {
			$local = $this->parsedCharsets[$charset]['utf8'][$sequence]; // The local number
			if ($local > 255) {
					// Numbers above 255 are written as two bytes (16bit word assumed), high byte first
				$result .= chr(($local >> 8) & 255) . chr($local & 255);
			} else {
				$result .= chr($local);
			}
		} elseif ($useEntityForNoChar) {
				// Create num entity
			$result .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
		} else {
				// No char exists
			$result .= chr($this->noCharByteVal);
		}
	}

	return $result;
}
798
/**
 * Converts all chars > 127 of a UTF-8 string to numeric entities.
 *
 * ASCII bytes are copied unchanged. A continuation byte found where a lead
 * byte is expected is replaced by chr($this->noCharByteVal).
 *
 * @param string $str Input string
 * @return string Output string
 */
function utf8_to_entities($str) {
	$length = strlen($str);
	$result = '';

	for ($pos = 0; $pos < $length; $pos++) { // Traverse each char in UTF-8 string.
		$char = substr($str, $pos, 1);
		$lead = ord($char);

			// ASCII 0-127 is one byte and passes through unchanged
		if ($lead <= 127) {
			$result .= $char;
			continue;
		}

			// A lead byte must have the 7th bit set; otherwise we are in the MIDDLE of a sequence
		if (!($lead & 64)) {
			$result .= chr($this->noCharByteVal);
			continue;
		}

			// Every further leading 1-bit of the lead byte announces one continuation byte
		$sequence = $char;
		for ($shift = 1; $shift <= 8 && (($lead << $shift) & 128); $shift++) {
			$pos++;
			$sequence .= substr($str, $pos, 1);
		}

		$result .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
	}

	return $result;
}
836
/**
 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
 *
 * Well-formed numeric entities are converted with UnumberToChar(); the
 * hexadecimal prefix is accepted in both cases (&#x..; and &#X..;).
 * Malformed numeric entities (e.g. &#zz;) are left untouched.
 * Named entities are only converted if $alsoStdHtmlEnt is set; they are
 * decoded directly to UTF-8, and unknown names are left untouched.
 *
 * @param string $str Input string, UTF-8
 * @param boolean $alsoStdHtmlEnt If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
 * @return string Output string
 */
function entities_to_utf8($str, $alsoStdHtmlEnt = 0) {
		// With delimiter capture the odd indexes hold the entity bodies (without "&" and ";"),
		// the even indexes hold the literal text in between.
	$parts = preg_split('/&([#[:alnum:]]*);/', $str, -1, PREG_SPLIT_DELIM_CAPTURE);
	if ($parts === FALSE) {
			// Regular expression engine failed: return the input unchanged
		return $str;
	}

	foreach ($parts as $k => $v) {
		if (!($k % 2)) {
			continue;
		}

		$entity = '&' . $v . ';';
		$match = array();

		if (substr($v, 0, 1) == '#') { // Dec or hex entities:
			if (preg_match('/^#[xX]([0-9a-fA-F]+)$/', $v, $match)) {
				$parts[$k] = $this->UnumberToChar(hexdec($match[1]));
			} elseif (preg_match('/^#([0-9]+)$/', $v, $match)) {
				$parts[$k] = $this->UnumberToChar($match[1]);
			} else { // Not a valid number: no conversion
				$parts[$k] = $entity;
			}
		} elseif ($alsoStdHtmlEnt) { // Other entities:
				// Decoding straight to UTF-8 avoids depending on the default charset of the
				// HTML translation table; unknown entity names are returned unchanged.
			$parts[$k] = html_entity_decode($entity, ENT_COMPAT, 'UTF-8');
		} else { // No conversion:
			$parts[$k] = $entity;
		}
	}

	return implode('', $parts);
}
869
/**
 * Splits a UTF-8 string into its characters and returns them in an array,
 * either as integer numbers or as the UTF-8 characters themselves.
 *
 * A continuation byte found where a lead byte is expected is reported as
 * $this->noCharByteVal (or the corresponding character if $retChar is set).
 *
 * @param string $str Input string, UTF-8
 * @param boolean $convEntities If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
 * @param boolean $retChar If set, then instead of integer numbers the real UTF-8 char is returned.
 * @return array Output array with the char numbers
 */
function utf8_to_numberarray($str, $convEntities = 0, $retChar = 0) {
		// If entities must be registered as well...:
	if ($convEntities) {
		$str = $this->entities_to_utf8($str, 1);
	}

		// Do conversion:
	$length = strlen($str);
	$numbers = array();

	for ($pos = 0; $pos < $length; $pos++) { // Traverse each char in UTF-8 string.
		$char = substr($str, $pos, 1);
		$lead = ord($char);

			// ASCII 0-127 is one byte
		if ($lead <= 127) {
			$numbers[] = $retChar ? $char : $lead;
			continue;
		}

			// A lead byte must have the 7th bit set; otherwise we are in the MIDDLE of a sequence
		if (!($lead & 64)) {
			$numbers[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
			continue;
		}

			// Every further leading 1-bit of the lead byte announces one continuation byte
		$sequence = $char;
		for ($shift = 1; $shift <= 8 && (($lead << $shift) & 128); $shift++) {
			$pos++;
			$sequence .= substr($str, $pos, 1);
		}

		$numbers[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
	}

	return $numbers;
}
914
/**
 * Converts a UNICODE number to a UTF-8 multibyte character
 * Algorithm based on script found at From: http://czyborra.com/utf/
 * Unit-tested by Kasper
 *
 * The bits of the number are spread across the bytes; the number of high
 * bits set in the lead byte announces the length of the sequence:
 *
 *  bytes | bits | representation
 *      1 |    7 | 0vvvvvvv
 *      2 |   11 | 110vvvvv 10vvvvvv
 *      3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
 *      4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * Numbers of 0x80000000 and above cannot be expressed and yield
 * chr($this->noCharByteVal).
 *
 * @param integer $cbyte UNICODE integer
 * @return string UTF-8 multibyte character string
 * @see utf8CharToUnumber()
 */
function UnumberToChar($cbyte) {
		// 1 byte: plain ASCII
	if ($cbyte < 0x80) {
		return chr($cbyte);
	}

		// 2 bytes
	if ($cbyte < 0x800) {
		return chr(0xC0 | ($cbyte >> 6))
			. chr(0x80 | ($cbyte & 0x3F));
	}

		// 3 bytes
	if ($cbyte < 0x10000) {
		return chr(0xE0 | ($cbyte >> 12))
			. chr(0x80 | (($cbyte >> 6) & 0x3F))
			. chr(0x80 | ($cbyte & 0x3F));
	}

		// 4 bytes
	if ($cbyte < 0x200000) {
		return chr(0xF0 | ($cbyte >> 18))
			. chr(0x80 | (($cbyte >> 12) & 0x3F))
			. chr(0x80 | (($cbyte >> 6) & 0x3F))
			. chr(0x80 | ($cbyte & 0x3F));
	}

		// 5 bytes
	if ($cbyte < 0x4000000) {
		return chr(0xF8 | ($cbyte >> 24))
			. chr(0x80 | (($cbyte >> 18) & 0x3F))
			. chr(0x80 | (($cbyte >> 12) & 0x3F))
			. chr(0x80 | (($cbyte >> 6) & 0x3F))
			. chr(0x80 | ($cbyte & 0x3F));
	}

		// 6 bytes
	if ($cbyte < 0x80000000) {
		return chr(0xFC | ($cbyte >> 30))
			. chr(0x80 | (($cbyte >> 24) & 0x3F))
			. chr(0x80 | (($cbyte >> 18) & 0x3F))
			. chr(0x80 | (($cbyte >> 12) & 0x3F))
			. chr(0x80 | (($cbyte >> 6) & 0x3F))
			. chr(0x80 | ($cbyte & 0x3F));
	}

		// Cannot express a 32-bit character in UTF-8
	return chr($this->noCharByteVal);
}
979
/**
 * Converts a UTF-8 multibyte character to a UNICODE number
 *
 * Only the first character of $str is evaluated. If the first byte is not a multibyte
 * lead byte (both top bits set), its plain byte value is returned.
 *
 * @param string UTF-8 multibyte character string
 * @param boolean If set, then a hex. number is returned (as a string prefixed with "x").
 * @return integer UNICODE integer
 * @see UnumberToChar()
 */
function utf8CharToUnumber($str, $hex = 0) {
    $leadByte = ord(substr($str, 0, 1));
    $number = $leadByte;

    // Both top bits set: this IS the lead byte of a multibyte sequence.
    if (($leadByte & 192) == 192) {
        $probe = $leadByte;
        $payloadBits = '';
        $trailCount = 0;

        // Every further leading 1-bit of the lead byte stands for one continuation byte.
        while ($trailCount < 8) {
            $probe = $probe << 1;
            if (!($probe & 128)) {
                break;
            }
            // A continuation byte contributes its lower six bits.
            $trailByte = ord(substr($str, $trailCount + 1, 1));
            $payloadBits .= sprintf('%06b', $trailByte & 0x3F);
            $trailCount++;
        }

        // The lead byte contributes its lowest (6 - number of continuation bytes) bits.
        $leadBits = substr('00000000' . decbin($leadByte), -(6 - $trailCount));
        $number = bindec($leadBits . $payloadBits);
    }

    return $hex ? 'x' . dechex($number) : $number;
}
1011
1012
1013 /********************************************
1014 *
1015 * Init functions
1016 *
1017 ********************************************/
1018
/**
 * Initializes a charset for use if a conversion table exists in the PATH_t3lib.'csconvtbl/' folder.
 * This function is automatically called by the conversion functions.
 *
 * A parsed table is stored serialized below typo3temp/cs/ and read from there on later calls.
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param string The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return integer Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed (or read from the cache file).
 * @access private
 */
function initCharset($charset) {
    // Nothing to do if the charset was loaded before:
    if (is_array($this->parsedCharsets[$charset])) {
        return 1;
    }

    // The conversion table must exist, otherwise the charset is unknown:
    $tableFile = PATH_t3lib . 'csconvtbl/' . $charset . '.tbl';
    if (!($charset && t3lib_div::validPathStr($tableFile) && @is_file($tableFile))) {
        return FALSE;
    }

    // Prefer the cached, already parsed table:
    $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $this->parsedCharsets[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
        return 2;
    }

    // Syntax of the "whitespaced" table type, e.g. "0x0A 0x000A #LINE FEED";
    // the alternative "ms-token" type looks like "B9 = U+00B9 : SUPERSCRIPT ONE".
    $whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';

    $rows = t3lib_div::trimExplode(LF, t3lib_div::getUrl($tableFile), 1);
    $this->parsedCharsets[$charset] = array('local' => array(), 'utf8' => array());

    $format = '';
    foreach ($rows as $row) {
        // Comment lines and blanks are ignored.
        if (!trim($row) || substr($row, 0, 1) == '#') {
            continue;
        }

        // The table type is detected once, on the first real line.
        if (!$format) {
            $format = preg_match($whitespacedPattern, $row) ? 'whitespaced' : 'ms-token';
        }

        if ($format == 'ms-token') {
            list($hexbyte, $utf8) = preg_split('/[=:]/', $row, 3);
        } elseif ($format == 'whitespaced') {
            $found = array();
            preg_match($whitespacedPattern, $row, $found);
            $hexbyte = $found[1];
            $utf8 = 'U+' . $found[2];
        }

        // Only values above 127 are registered, in both directions.
        $localCode = hexdec(trim($hexbyte));
        if ($localCode > 127) {
            $utf8Char = $this->UnumberToChar(hexdec(substr(trim($utf8), 2)));
            $this->parsedCharsets[$charset]['local'][$localCode] = $utf8Char;
            $this->parsedCharsets[$charset]['utf8'][$utf8Char] = $localCode;
        }
    }

    if ($cacheFile) {
        t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->parsedCharsets[$charset]));
    }

    return 2;
}
1087
/**
 * This function initializes all UTF-8 character data tables.
 *
 * For the modes "case" and "ascii" an already loaded table or a cache file below typo3temp/cs/
 * is used if available. Otherwise UnicodeData.txt (required), SpecialCasing.txt and Translit.txt
 * (both optional) from PATH_t3lib . 'unidata/' are parsed; BOTH tables (case folding and
 * to-ASCII) are then rebuilt and written to their cache files.
 *
 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
 *
 * @param string Mode ("case", "ascii", ...)
 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initUnicodeData($mode = NULL) {
    // cache files
    $cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
    $cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');

    // Only process if the tables are not yet loaded
    switch ($mode) {
        case 'case':
            if (is_array($this->caseFolding['utf-8'])) {
                return 1;
            }

            // Use cached version if possible
            if ($cacheFileCase && @is_file($cacheFileCase)) {
                $this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase));
                return 2;
            }
            break;

        case 'ascii':
            if (is_array($this->toASCII['utf-8'])) {
                return 1;
            }

            // Use cached version if possible
            if ($cacheFileASCII && @is_file($cacheFileASCII)) {
                $this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII));
                return 2;
            }
            break;
    }

    // process main Unicode data file
    $unicodeDataFile = PATH_t3lib . 'unidata/UnicodeData.txt';
    if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
        return FALSE;
    }

    $fh = fopen($unicodeDataFile, 'rb');
    if (!$fh) {
        return FALSE;
    }

    // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
    // note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
    $this->caseFolding['utf-8'] = array();
    $utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand
    $utf8CaseFolding['toUpper'] = array();
    $utf8CaseFolding['toLower'] = array();
    $utf8CaseFolding['toTitle'] = array();

    $decomposition = array(); // array of temp. decompositions
    $mark = array(); // array of chars that are marks (eg. composing accents)
    $number = array(); // array of chars that are numbers (eg. digits)
    $omit = array(); // array of chars to be omitted (eg. Russian hard sign)

    // Looping on the fgets() result (instead of feof()) ends cleanly at EOF and on read errors.
    while (($line = fgets($fh, 4096)) !== FALSE) {
        // A blank line carries no record; parsing it would only raise undefined-offset notices.
        if (trim($line) === '') {
            continue;
        }

        // has a lot of info
        list($char, $name, $cat, , , $decomp, , , $num, , , , $upper, $lower, $title,) = explode(';', rtrim($line));

        $ord = hexdec($char);
        if ($ord > 0xFFFF) {
            break;
        } // only process the BMP

        $utf8_char = $this->UnumberToChar($ord);

        if ($upper) {
            $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
        }
        if ($lower) {
            $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
        }
        // store "title" only when different from "upper" (only a few)
        if ($title && $title != $upper) {
            $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
        }

        // First letter of the general category; square brackets because the
        // curly-brace string offset syntax is no longer available in PHP 8.
        $majorCategory = isset($cat[0]) ? $cat[0] : '';
        switch ($majorCategory) {
            case 'M': // mark (accent, umlaut, ...)
                $mark["U+$char"] = 1;
                break;

            case 'N': // numeric value
                if ($ord > 0x80 && $num != '') {
                    $number["U+$char"] = $num;
                }
        }

        // accented Latin letters without "official" decomposition
        $match = array();
        if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
            $c = ord($match[2]);
            if ($match[1] == 'SMALL') {
                $c += 32;
            }

            $decomposition["U+$char"] = array(dechex($c));
            continue;
        }

        $match = array();
        if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
            switch ($match[1]) {
                case '<circle>': // add parenthesis as circle replacement, eg (1)
                    $match[2] = '0028 ' . $match[2] . ' 0029';
                    break;

                case '<square>': // add square brackets as square replacement, eg [1]
                    $match[2] = '005B ' . $match[2] . ' 005D';
                    break;

                case '<compat>': // ignore multi char decompositions that start with a space
                    if (preg_match('/^0020 /', $match[2])) {
                        continue 2;
                    }
                    break;

                // ignore Arabic and vertical layout presentation decomposition
                case '<initial>':
                case '<medial>':
                case '<final>':
                case '<isolated>':
                case '<vertical>':
                    continue 2;
            }
            $decomposition["U+$char"] = explode(' ', $match[2]);
        }
    }
    fclose($fh);

    // process additional Unicode data for casing (allow folded characters to expand into a sequence)
    $specialCasingFile = PATH_t3lib . 'unidata/SpecialCasing.txt';
    if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
        $fh = fopen($specialCasingFile, 'rb');
        if ($fh) {
            while (($line = fgets($fh, 4096)) !== FALSE) {
                // skip comment lines and blanks
                if ($line[0] == '#' || trim($line) == '') {
                    continue;
                }

                list($char, $lower, $title, $upper, $cond) = t3lib_div::trimExplode(';', $line);
                // only unconditional mappings are used (no condition, or just a trailing comment)
                if ($cond == '' || $cond[0] == '#') {
                    $utf8_char = $this->UnumberToChar(hexdec($char));
                    if ($char != $lower) {
                        $arr = explode(' ', $lower);
                        for ($i = 0; isset($arr[$i]); $i++) {
                            $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
                        }
                        $utf8CaseFolding['toLower'][$utf8_char] = implode('', $arr);
                    }
                    if ($char != $title && $title != $upper) {
                        $arr = explode(' ', $title);
                        for ($i = 0; isset($arr[$i]); $i++) {
                            $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
                        }
                        $utf8CaseFolding['toTitle'][$utf8_char] = implode('', $arr);
                    }
                    if ($char != $upper) {
                        $arr = explode(' ', $upper);
                        for ($i = 0; isset($arr[$i]); $i++) {
                            $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
                        }
                        $utf8CaseFolding['toUpper'][$utf8_char] = implode('', $arr);
                    }
                }
            }
            fclose($fh);
        }
    }

    // process custom decompositions
    $customTranslitFile = PATH_t3lib . 'unidata/Translit.txt';
    if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
        $fh = fopen($customTranslitFile, 'rb');
        if ($fh) {
            while (($line = fgets($fh, 4096)) !== FALSE) {
                // skip comment lines and blanks
                if ($line[0] == '#' || trim($line) == '') {
                    continue;
                }

                list($char, $translit) = t3lib_div::trimExplode(';', $line);
                // an empty transliteration means: drop this character entirely
                if (!$translit) {
                    $omit["U+$char"] = 1;
                }
                $decomposition["U+$char"] = explode(' ', $translit);
            }
            fclose($fh);
        }
    }

    // decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
    foreach ($decomposition as $from => $to) {
        $code_decomp = array();

        while ($code_value = array_shift($to)) {
            if (isset($decomposition["U+$code_value"])) { // do recursive decomposition
                foreach (array_reverse($decomposition["U+$code_value"]) as $cv) {
                    array_unshift($to, $cv);
                }
            } elseif (!isset($mark["U+$code_value"])) { // remove mark
                array_push($code_decomp, $code_value);
            }
        }
        if (count($code_decomp) || isset($omit[$from])) {
            $decomposition[$from] = $code_decomp;
        } else {
            unset($decomposition[$from]);
        }
    }

    // create ascii only mapping
    $this->toASCII['utf-8'] = array();
    $ascii =& $this->toASCII['utf-8'];

    foreach ($decomposition as $from => $to) {
        $code_decomp = array();
        while ($code_value = array_shift($to)) {
            $ord = hexdec($code_value);
            if ($ord > 127) {
                continue 2; // skip decompositions containing non-ASCII chars
            }
            array_push($code_decomp, chr($ord));
        }
        $ascii[$this->UnumberToChar(hexdec($from))] = join('', $code_decomp);
    }

    // add numeric decompositions
    foreach ($number as $from => $to) {
        $utf8_char = $this->UnumberToChar(hexdec($from));
        if (!isset($ascii[$utf8_char])) {
            $ascii[$utf8_char] = $to;
        }
    }

    if ($cacheFileCase) {
        t3lib_div::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
    }

    if ($cacheFileASCII) {
        t3lib_div::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
    }

    return 3;
}
1340
/**
 * This function initializes the folding table for a charset other than UTF-8.
 * This function is automatically called by the case folding functions.
 *
 * The table is derived from the UTF-8 case folding table: every character of the charset's
 * conversion table is mapped through it and converted back to the charset.
 *
 * @param string Charset for which to initialize case folding.
 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initCaseFolding($charset) {
    // Already loaded in this request?
    if (is_array($this->caseFolding[$charset])) {
        return 1;
    }

    // Cached from an earlier request?
    $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
        return 2;
    }

    // Both the charset's own conversion table and the UTF-8 case folding base table are needed.
    if (!$this->initCharset($charset) || !$this->initUnicodeData('case')) {
        return FALSE;
    }

    $nochar = chr($this->noCharByteVal);
    $directions = array('toUpper', 'toLower', 'toTitle');
    foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
        // reconvert to charset (don't use chr() of numeric value, might be multi-byte)
        $c = $this->utf8_decode($utf8, $charset);

        foreach ($directions as $direction) {
            $cc = $this->utf8_decode($this->caseFolding['utf-8'][$direction][$utf8], $charset);
            // keep the mapping only if it exists and is expressible in the charset
            if ($cc != '' && $cc != $nochar) {
                $this->caseFolding[$charset][$direction][$c] = $cc;
            }
        }
    }

    // add the ASCII case table (a-z <-> A-Z)
    for ($offset = 0; $offset < 26; $offset++) {
        $lowerChar = chr(ord('a') + $offset);
        $upperChar = chr(ord('A') + $offset);
        $this->caseFolding[$charset]['toUpper'][$lowerChar] = $upperChar;
        $this->caseFolding[$charset]['toLower'][$upperChar] = $lowerChar;
    }

    if ($cacheFile) {
        t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->caseFolding[$charset]));
    }

    return 3;
}
1410
/**
 * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
 * This function is automatically called by the ASCII transliteration functions.
 *
 * The table is derived from the UTF-8 to-ASCII table: every character of the charset's
 * conversion table that has a UTF-8 transliteration gets the same transliteration.
 *
 * @param string Charset for which to initialize conversion.
 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initToASCII($charset) {
    // Already loaded in this request?
    if (is_array($this->toASCII[$charset])) {
        return 1;
    }

    // Cached from an earlier request?
    $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $this->toASCII[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
        return 2;
    }

    // Both the charset's own conversion table and the UTF-8/ASCII base table are needed.
    if (!$this->initCharset($charset) || !$this->initUnicodeData('ascii')) {
        return FALSE;
    }

    foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
        // reconvert to charset (don't use chr() of numeric value, might be multi-byte)
        $localChar = $this->utf8_decode($utf8, $charset);

        if (isset($this->toASCII['utf-8'][$utf8])) {
            $this->toASCII[$charset][$localChar] = $this->toASCII['utf-8'][$utf8];
        }
    }

    if ($cacheFile) {
        t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->toASCII[$charset]));
    }

    return 3;
}
1458
1459
1460 /********************************************
1461 *
1462 * String operation functions
1463 *
1464 ********************************************/
1465
/**
 * Returns a part of a string.
 *
 * Dispatches to mbstring or iconv if configured in $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'],
 * otherwise to the internal UTF-8 / EUC implementations or to byte arithmetic for
 * fixed-width (two/four byte) and single-byte charsets.
 *
 * @param string The character set
 * @param string Character string
 * @param integer Start position (character position)
 * @param integer Length (in characters). If omitted (NULL), the rest of the string is returned.
 * @return string The substring
 * @see substr(), mb_substr()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function substr($charset, $string, $start, $len = NULL) {
    if ($len === 0 || $string === '') {
        return '';
    }

    $utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];

    if ($utils == 'mbstring') {
        // cannot omit $len, when specifying charset
        if ($len == NULL) {
            $enc = mb_internal_encoding(); // save internal encoding
            mb_internal_encoding($charset);
            $str = mb_substr($string, $start);
            mb_internal_encoding($enc); // restore internal encoding

            return $str;
        }
        return mb_substr($string, $start, $len, $charset);
    }

    if ($utils == 'iconv') {
        // cannot omit $len, when specifying charset
        if ($len == NULL) {
            $enc = iconv_get_encoding('internal_encoding'); // save internal encoding
            iconv_set_encoding('internal_encoding', $charset);
            $str = iconv_substr($string, $start);
            iconv_set_encoding('internal_encoding', $enc); // restore internal encoding

            return $str;
        }
        return iconv_substr($string, $start, $len, $charset);
    }

    if ($charset == 'utf-8') {
        return $this->utf8_substr($string, $start, $len);
    }
    if ($this->eucBasedSets[$charset]) {
        return $this->euc_substr($string, $start, $charset, $len);
    }

    // Fixed-width and single-byte charsets: scale character positions to byte positions.
    $bytesPerChar = 1;
    if ($this->twoByteSets[$charset]) {
        $bytesPerChar = 2;
    } elseif ($this->fourByteSets[$charset]) {
        $bytesPerChar = 4;
    }

    // An omitted length must NOT be scaled: NULL * 2 is 0, which would cut the result
    // down to an empty string instead of returning the rest of the string.
    if ($len === NULL) {
        return substr($string, $start * $bytesPerChar);
    }
    return substr($string, $start * $bytesPerChar, $len * $bytesPerChar);
}
1522
/**
 * Counts the number of characters.
 *
 * Uses mbstring or iconv if configured in $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'],
 * otherwise the internal UTF-8 / EUC counters or byte arithmetic.
 *
 * @param string The character set
 * @param string Character string
 * @return integer The number of characters
 * @see strlen()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strlen($charset, $string) {
    $utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];

    if ($utils == 'mbstring') {
        return mb_strlen($string, $charset);
    }
    if ($utils == 'iconv') {
        return iconv_strlen($string, $charset);
    }
    if ($charset == 'utf-8') {
        return $this->utf8_strlen($string);
    }
    if ($this->eucBasedSets[$charset]) {
        return $this->euc_strlen($string, $charset);
    }
    if ($this->twoByteSets[$charset]) {
        return strlen($string) / 2;
    }
    if ($this->fourByteSets[$charset]) {
        return strlen($string) / 4;
    }

    // treat everything else as single-byte encoding
    return strlen($string);
}
1550
/**
 * Method to crop strings using the mb_substr function.
 *
 * A positive length keeps the beginning of the string and appends the crop signifier,
 * a negative length keeps the end and prepends it. Strings that are not longer than
 * the absolute length (or a length of 0) are returned unchanged.
 *
 * @param string The character set
 * @param string String to be cropped
 * @param integer Crop length (in characters)
 * @param string Crop signifier
 * @return string The shortened string
 * @see mb_strlen(), mb_substr()
 */
protected function cropMbstring($charset, $string, $len, $crop = '') {
    if (intval($len) === 0) {
        return $string;
    }

    $charCount = mb_strlen($string, $charset);
    if ($charCount <= abs($len)) {
        return $string;
    }

    return $len > 0
        ? mb_substr($string, 0, $len, $charset) . $crop
        : $crop . mb_substr($string, $len, $charCount, $charset);
}
1574
/**
 * Truncates a string and pre-/appends a string.
 *
 * A positive length keeps the beginning of the string and appends the crop signifier,
 * a negative length keeps the end and prepends it. If the string is not actually
 * shortened (or the length is 0), it is returned unchanged and without signifier.
 *
 * @param string The character set
 * @param string Character string
 * @param integer Length (in characters)
 * @param string Crop signifier
 * @return string The shortened string
 * @see substr(), mb_strimwidth()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function crop($charset, $string, $len, $crop = '') {
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
        return $this->cropMbstring($charset, $string, $len, $crop);
    }

    if (intval($len) == 0) {
        return $string;
    }

    // Translate the character length into the byte position where the cut happens.
    if ($charset == 'utf-8') {
        $i = $this->utf8_char2byte_pos($string, $len);
    } elseif ($this->eucBasedSets[$charset]) {
        $i = $this->euc_char2byte_pos($string, $len, $charset);
    } else {
        // single-byte encoding: character position equals byte position
        if ($len > 0) {
            $i = $len;
        } else {
            $i = strlen($string) + $len;
            if ($i <= 0) {
                $i = FALSE;
            }
        }
    }

    if ($i === FALSE) { // $len outside actual string length
        return $string;
    }

    $i = (int) $i;
    if ($len > 0) {
        // Crop only if there really is something behind the cut position.
        // isset() probes the offset without raising a notice for out-of-range reads.
        if (isset($string[$i])) {
            return substr($string, 0, $i) . $crop;
        }
    } else {
        // Crop only if there really is something in front of the cut position.
        // The $i > 0 guard matters since PHP 7.1, where offset -1 addresses the LAST byte
        // and would wrongly report that a preceding byte exists.
        if ($i > 0 && isset($string[$i - 1])) {
            return $crop . substr($string, $i);
        }
    }

    return $string;
}
1637
/**
 * Cuts a string short at a given byte length.
 *
 * For multibyte charsets the cut position is moved back so that no character is split.
 *
 * @param string The character set
 * @param string Character string
 * @param integer The byte length
 * @return string The shortened string
 * @see mb_strcut()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strtrunc($charset, $string, $len) {
    if ($len <= 0) {
        return '';
    }

    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
        return mb_strcut($string, 0, $len, $charset);
    }
    if ($charset == 'utf-8') {
        return $this->utf8_strtrunc($string, $len);
    }
    if ($this->eucBasedSets[$charset]) {
        return $this->euc_strtrunc($string, $len, $charset);
    }

    if ($this->twoByteSets[$charset]) {
        // don't cut at odd positions
        $len -= $len % 2;
    } elseif ($this->fourByteSets[$charset]) {
        // realign to position dividable by four
        $len -= $len % 4;
    }

    // treat everything else as single-byte encoding
    return substr($string, 0, $len);
}
1670
/**
 * Translates all characters of a string into their respective case values.
 * Unlike strtolower() and strtoupper() this method is locale independent.
 * Note that the string length may change!
 * eg. lower case German "ß" (sharp S) becomes upper case "SS"
 * Real case folding is language dependent, this method ignores this fact.
 *
 * @param string Character set of string
 * @param string Input string to convert case for
 * @param string Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
 * @return string The converted string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper()
 */
function conv_case($charset, $string, $case) {
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
        return $case == 'toLower'
            ? mb_strtolower($string, $charset)
            : mb_strtoupper($string, $charset);
    }

    if ($charset == 'utf-8') {
        return $this->utf8_char_mapping($string, 'case', $case);
    }

    if (isset($this->eucBasedSets[$charset])) {
        return $this->euc_char_mapping($string, $charset, 'case', $case);
    }

    // treat everything else as single-byte encoding
    return $this->sb_char_mapping($string, $charset, 'case', $case);
}
1704
/**
 * Converts special chars (like æøåÆØÅ, umlauts etc) to ascii equivalents (usually double-bytes, like æ => ae etc.)
 *
 * @param string Character set of string
 * @param string Input string to convert
 * @return string The converted string
 */
function specCharsToASCII($charset, $string) {
    if ($charset == 'utf-8') {
        return $this->utf8_char_mapping($string, 'ascii');
    }

    if (isset($this->eucBasedSets[$charset])) {
        return $this->euc_char_mapping($string, $charset, 'ascii');
    }

    // treat everything else as single-byte encoding
    return $this->sb_char_mapping($string, $charset, 'ascii');
}
1724
1725
/**
 * Converts the language codes that we get from the client (usually HTTP_ACCEPT_LANGUAGE)
 * into a TYPO3-readable language code.
 *
 * Languages are tried by descending quality value; languages with equal quality keep
 * the order in which they appear in the list. For every language the full code is tried
 * first, then the code without its country part.
 *
 * @param string $languageCodesList list of language codes. something like 'de,en-us;q=0.9,de-de;q=0.7,es-cl;q=0.6,en;q=0.4,es;q=0.3,zh;q=0.1'
 *			see http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4
 * @return string a preferred language that TYPO3 supports, or "default" if none found
 * @author Benjamin Mack (benni.typo3.org)
 */
public function getPreferredClientLanguage($languageCodesList) {
    $allLanguageCodes = array();
    $selectedLanguage = 'default';

    // get all languages where TYPO3 code is the same as the ISO code
    foreach ($this->charSetArray as $typo3Lang => $charSet) {
        $allLanguageCodes[$typo3Lang] = $typo3Lang;
    }

    // get all languages where TYPO3 code differs from ISO code
    // or needs the country part
    // the iso codes will here overwrite the default typo3 language in the key
    foreach ($this->isoArray as $typo3Lang => $isoLang) {
        $allLanguageCodes[$typo3Lang] = join('-', explode('_', $isoLang));
    }

    // move the iso codes to the keys (because we're looking them up with "isset" later on)
    $allLanguageCodes = array_flip($allLanguageCodes);

    // collect the quality value of every requested language (1.0 if none is given)
    $qualityByLanguage = array();
    foreach (t3lib_div::trimExplode(',', $languageCodesList) as $preferredLanguage) {
        $quality = 1.0;
        if (strpos($preferredLanguage, ';q=') !== FALSE) {
            list($preferredLanguage, $quality) = explode(';q=', $preferredLanguage);
        }
        $qualityByLanguage[$preferredLanguage] = (float) $quality;
    }

    // Order by quality, highest first. arsort() does not guarantee the relative order of
    // equal values before PHP 8.0, so the position in the list is used as an explicit
    // second sort key to keep e.g. "de,en" in exactly this order.
    $languages = array_keys($qualityByLanguage);
    if (count($languages) > 1) {
        $qualities = array_values($qualityByLanguage);
        $positions = array_keys($languages);
        array_multisort($qualities, SORT_DESC, SORT_NUMERIC, $positions, SORT_ASC, SORT_NUMERIC, $languages);
    }

    // loop through the languages, with the highest priority first
    foreach ($languages as $preferredLanguage) {
        if (isset($allLanguageCodes[$preferredLanguage])) {
            $selectedLanguage = $allLanguageCodes[$preferredLanguage];
            break;
        }

        // strip the country code from the end; codes without a country part
        // simply yield themselves (no undefined offset is touched)
        $codeParts = explode('-', $preferredLanguage, 2);
        $languageOnly = $codeParts[0];
        if (isset($allLanguageCodes[$languageOnly])) {
            $selectedLanguage = $allLanguageCodes[$languageOnly];
            break;
        }
    }

    if (!$selectedLanguage || $selectedLanguage == 'en') {
        $selectedLanguage = 'default';
    }
    return $selectedLanguage;
}
1786
1787
1788 /********************************************
1789 *
1790 * Internal string operation functions
1791 *
1792 ********************************************/
1793
/**
 * Maps all characters of a string in a single byte charset.
 *
 * If the mapping table for the charset cannot be initialized, or the mode is unknown,
 * the string is returned unchanged.
 *
 * @param string the string
 * @param string the charset
 * @param string mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string 'case': conversion 'toLower' or 'toUpper'
 * @return string the converted string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function sb_char_mapping($str, $charset, $mode, $opt = '') {
    switch ($mode) {
        case 'case':
            if (!$this->initCaseFolding($charset)) {
                return $str;
            } // do nothing
            $map =& $this->caseFolding[$charset][$opt];
            break;

        case 'ascii':
            if (!$this->initToASCII($charset)) {
                return $str;
            } // do nothing
            $map =& $this->toASCII[$charset];
            break;

        default:
            return $str;
    }

    // Work on a real string, so that e.g. numbers are mapped byte by byte as well
    // instead of being dropped.
    $str = (string) $str;
    $byteCount = strlen($str);

    $out = '';
    for ($i = 0; $i < $byteCount; $i++) {
        $c = $str[$i];
        // bytes without an entry in the table are passed through unchanged
        $out .= isset($map[$c]) ? $map[$c] : $c;
    }

    return $out;
}
1836
1837
1838 /********************************************
1839 *
1840 * Internal UTF-8 string operation functions
1841 *
1842 ********************************************/
1843
/**
 * Returns a part of a UTF-8 string.
 *
 * Character positions are translated to byte positions with utf8_char2byte_pos(),
 * first for the start and then, relative to the start, for the length.
 *
 * @param string UTF-8 string
 * @param integer Start position (character position)
 * @param integer Length (in characters)
 * @return string The substring (FALSE if a positive $start lies outside the string)
 * @see substr()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_substr($str, $start, $len = NULL) {
    // an explicit length of zero always yields an empty string
    if (strcmp($len, '0') == 0) {
        return '';
    }

    $byteOffset = $this->utf8_char2byte_pos($str, $start);
    if ($byteOffset === FALSE) {
        if ($start > 0) {
            return FALSE; // $start outside string length
        }
        // a negative start reaching beyond the beginning means: from the beginning
        $byteOffset = 0;
    }

    $tail = substr($str, $byteOffset);

    // no length given: everything from the start position
    if ($len == NULL) {
        return $tail;
    }

    $byteLength = $this->utf8_char2byte_pos($tail, $len);
    if ($byteLength === FALSE) {
        // $len outside actual string length: a negative length that exceeds
        // the string yields a blank string, a positive one the whole rest
        return $len < 0 ? '' : $tail;
    }

    return substr($tail, 0, $byteLength);
}
1886
/**
 * Counts the number of characters of a string in UTF-8.
 *
 * Every byte that is not a continuation byte (10xxxxxx) starts a character,
 * so the result is the number of single-byte characters plus the number of
 * multi-byte starting bytes.
 *
 * @param string $str UTF-8 multibyte character string
 * @return integer The number of characters
 * @see strlen()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strlen($str) {
	// Iterate by byte length; probing $str{$i} past the end used the brace
	// offset syntax (a parse error as of PHP 8) and read an uninitialized offset.
	$str = (string) $str;
	$length = strlen($str);
	$n = 0;
	for ($i = 0; $i < $length; $i++) {
		// 0xxxxxxx (single byte) and 11xxxxxx (starting byte) count, 10xxxxxx does not
		if ((ord($str[$i]) & 0xC0) !== 0x80) {
			$n++;
		}
	}
	return $n;
}
1911
/**
 * Truncates a string in UTF-8 short at a given byte length.
 *
 * The result never exceeds $len bytes and never ends in the middle of a
 * multi-byte character: a character that does not fit completely is dropped.
 *
 * @param string $str UTF-8 multibyte character string
 * @param integer $len The byte length; values of zero or less yield an empty string
 * @return string The shortened string; an empty string if no starting byte can be found for the sequence at the cut position
 * @see mb_strcut()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strtrunc($str, $len) {
	$str = (string) $str;
	$len = (int) $len;
	if ($len <= 0) {
		return '';
	}
	if ($len > strlen($str)) {
		// nothing to cut off; also keeps the offset below inside the string
		return $str;
	}

	$i = $len - 1;
	if (ord($str[$i]) & 0x80) {
		// The last byte to keep belongs to a multi-byte sequence:
		// walk back over continuation bytes (10xxxxxx) to its starting byte.
		while ($i > 0 && (ord($str[$i]) & 0xC0) === 0x80) {
			$i--;
		}
		$lead = ord($str[$i]);
		if (($lead & 0xC0) !== 0xC0) {
			// Sanity check: no starting byte (11xxxxxx) in front of the
			// continuation bytes. The original tested the position ($i <= 0)
			// instead, which wrongly emptied strings whose first character is
			// a multi-byte character that fits into $len.
			return '';
		}
		// the number of leading 1-bits of the starting byte is the sequence length
		for ($bc = 0; $lead & 0x80; $lead = $lead << 1) {
			$bc++;
		}
		if ($i + $bc > $len) {
			// the character would be cut in half: drop it completely
			return substr($str, 0, $i);
		}
		// otherwise the multi-byte character fits into the length
	}
	return substr($str, 0, $len);
}
1936
/**
 * Find position of first occurrence of a string, both arguments are in UTF-8.
 *
 * Delegates to mb_strpos() or iconv_strpos() when
 * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] is set to 'mbstring'
 * or 'iconv'. Otherwise the search is done on bytes and the positions are
 * translated with utf8_char2byte_pos() / utf8_byte2char_pos().
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 string to search for
 * @param integer $offset Position to start the search (character position)
 * @return integer The character position, or FALSE if the offset cannot be resolved or the needle is not found
 * @see strpos()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strpos($haystack, $needle, $offset = 0) {
	// prefer a multibyte-aware extension when one is configured
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strpos($haystack, $needle, $offset, 'utf-8');
	}
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
		return iconv_strpos($haystack, $needle, $offset, 'utf-8');
	}

	// fallback: search on bytes, then convert the hit back to a character position
	$startByte = $this->utf8_char2byte_pos($haystack, $offset);
	if ($startByte !== FALSE) {
		$foundByte = strpos($haystack, $needle, $startByte);
		if ($foundByte !== FALSE) {
			return $this->utf8_byte2char_pos($haystack, $foundByte);
		}
	}

	// offset beyond string length, or needle not found
	return FALSE;
}
1966
/**
 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
 *
 * Delegates to mb_strrpos() or iconv_strrpos() when
 * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] is set to 'mbstring'
 * or 'iconv'. Otherwise the search is done on bytes and the hit is translated
 * with utf8_byte2char_pos().
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 character to search for (single character)
 * @return integer The character position, or FALSE if the needle is not found
 * @see strrpos()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strrpos($haystack, $needle) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		// The third parameter of mb_strrpos() is the offset, the encoding is the
		// fourth. Passing the encoding in third place only worked through a legacy
		// fallback and is a TypeError as of PHP 8.
		return mb_strrpos($haystack, $needle, 0, 'utf-8');
	} elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
		return iconv_strrpos($haystack, $needle, 'utf-8');
	}

	// fallback: search on bytes, then convert the hit back to a character position
	$byte_pos = strrpos($haystack, $needle);
	if ($byte_pos === FALSE) {
		return FALSE; // needle not found
	}

	return $this->utf8_byte2char_pos($haystack, $byte_pos);
}
1990
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * For a position of zero or more the string is scanned from the start, for a
 * negative position from the end. FALSE is returned whenever the scan runs
 * off either end of the string. That includes a negative position which lands
 * exactly on the first character; utf8_substr() maps that case to byte 0.
 *
 * @param string $str UTF-8 string
 * @param integer $pos Character position (negative values start from the end)
 * @return integer Byte position, or FALSE if the position lies outside the string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char2byte_pos($str, $pos) {
	$str = (string) $str;
	$pos = (int) $pos;
	$length = strlen($str);

	$n = 0; // number of characters found
	$p = abs($pos); // number of characters wanted

	if ($pos >= 0) {
		$i = 0;
		$d = 1;
	} else {
		$i = $length - 1;
		$d = -1;
	}

	// Explicit bounds instead of probing $str{$i}: the brace syntax is a parse
	// error as of PHP 8, and since PHP 7.1 a negative offset reads from the end
	// of the string, so the probe no longer stopped a backward scan at the start.
	for (; $i >= 0 && $i < $length && $n < $p; $i += $d) {
		// single-byte (0xxxxxxx) and multi-byte starting bytes (11xxxxxx) begin a character
		if ((ord($str[$i]) & 0xC0) !== 0x80) {
			$n++;
		}
	}
	if ($i < 0 || $i >= $length) {
		return FALSE; // offset beyond string length
	}

	if ($pos >= 0) {
		// $i is one byte behind the start of the last counted character:
		// skip its trailing multi-byte data bytes
		while ($i < $length && (ord($str[$i]) & 0xC0) === 0x80) {
			$i++;
		}
	} else {
		// the backward scan stopped one byte in front of the wanted character
		$i++;
	}

	return $i;
}
2039
/**
 * Translates an 'absolute' byte position into a character position.
 *
 * Counts the character starts in the byte range 1..$pos. For a $pos that
 * points at the first byte of a character this is the number of characters
 * in front of it.
 *
 * @param string $str UTF-8 string
 * @param integer $pos Byte position
 * @return integer Character position, or FALSE for an empty string, a negative position or a position beyond the string length
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_byte2char_pos($str, $pos) {
	$str = (string) $str;
	$pos = (int) $pos;
	$length = strlen($str);

	// The original range check ran after the loop and looked at index 0 only,
	// so a position beyond the string was never rejected and every non-existent
	// byte was counted as a character. Check the position itself, up front.
	if ($length === 0 || $pos < 0 || $pos > $length) {
		return FALSE; // offset beyond string length
	}

	$n = 0; // number of characters
	for ($i = $pos; $i > 0; $i--) {
		// The position directly behind the last byte counts as a character
		// boundary (as it did before); otherwise count single-byte characters
		// and multi-byte starting bytes, i.e. everything but 10xxxxxx.
		if ($i >= $length || (ord($str[$i]) & 0xC0) !== 0x80) {
			$n++;
		}
	}

	return $n;
}
2068
/**
 * Maps all characters of an UTF-8 string.
 *
 * The string is split into characters (single bytes and multi-byte sequences)
 * and each one is looked up in the conversion table selected by $mode.
 * Characters without a table entry are copied to the output unchanged.
 *
 * @param string $str UTF-8 string
 * @param string $mode 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt Only for mode 'case': the conversion, 'toLower' or 'toUpper'
 * @return string The converted string; $str is returned untouched for an unknown mode or when initUnicodeData() returns a false value
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char_mapping($str, $mode, $opt = '') {
	if (!$this->initUnicodeData($mode)) {
		// no conversion table: do nothing
		return $str;
	}

	switch ($mode) {
		case 'case':
			$map =& $this->caseFolding['utf-8'][$opt];
			break;

		case 'ascii':
			$map =& $this->toASCII['utf-8'];
			break;

		default:
			return $str;
	}

	// Iterate by byte length; probing $str{$i} past the end used the brace
	// offset syntax (a parse error as of PHP 8) and read an uninitialized offset.
	$str = (string) $str;
	$length = strlen($str);
	$out = '';
	for ($i = 0; $i < $length; $i++) {
		$c = ord($str[$i]);
		if (($c & 0xC0) === 0xC0) {
			// multi-byte starting byte (11xxxxxx): its leading 1-bits give the number of bytes
			for ($bc = 0; $c & 0x80; $c = $c << 1) {
				$bc++;
			}
			$mbc = substr($str, $i, $bc);
			$i += $bc - 1;
		} else {
			// Single-byte character (0xxxxxxx), or a stray continuation byte
			// (10xxxxxx) of a malformed string. The latter used to leave $mbc
			// untouched, which emitted the previous character a second time;
			// now the byte is passed through as it is.
			$mbc = $str[$i];
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
2120
2121
2122 /********************************************
2123 *
2124 * Internal EUC string operation functions
2125 *
2126 * Extended Unix Code:
2127 * ASCII compatible 7bit single bytes chars
2128 * 8bit two byte chars
2129 *
2130 * Shift-JIS is treated as a special case.
2131 *
2132 ********************************************/
2133
/**
 * Cuts a string in the EUC charset family short at a given byte length.
 *
 * Bytes of 0x80 and above are treated as the first byte of a double-byte
 * character; for 'shift_jis' only 0x80-0x9F and 0xE0 and above are. The
 * result never exceeds $len bytes and never ends on such a first byte.
 *
 * @param string $str EUC multibyte character string
 * @param integer $len The byte length; values of zero or less yield an empty string
 * @param string $charset The charset
 * @return string The shortened string
 * @see mb_strcut()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strtrunc($str, $len, $charset) {
	$str = (string) $str;
	$len = (int) $len;
	if ($len <= 0) {
		return '';
	}
	if ($len >= strlen($str)) {
		return $str; // string not longer than supplied length
	}

	$sjis = ($charset == 'shift_jis');

	// Walk the string character by character up to the cut position.
	// $len is smaller than the string length here, so $str[$i] always exists.
	for ($i = 0; $i < $len; $i++) {
		$c = ord($str[$i]);
		if ($c >= 0x80 && (!$sjis || $c < 0xA0 || $c >= 0xE0)) {
			$i++; // advance a double-byte char
		}
	}

	// $i is now the first character boundary at or behind $len. If it lies
	// behind $len, the byte at $len - 1 is the first byte of a double-byte
	// character and has to go as well. The original returned the complete
	// string in this situation when the cut character was the last one.
	return substr($str, 0, $i > $len ? $len - 1 : $len);
}
2169
/**
 * Returns a part of a string in the EUC charset family.
 *
 * @param string $str EUC multibyte character string
 * @param integer $start Start position (character position)
 * @param string $charset The charset
 * @param integer $len Length (in characters); NULL returns everything from $start on
 * @return string The substring, or FALSE if $start cannot be resolved to a byte position
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_substr($str, $start, $charset, $len = NULL) {
	// A length of zero always yields an empty string, as in utf8_substr().
	// Without this check a zero length compares equal to NULL below and the
	// rest of the string was returned instead.
	if ($len !== NULL && !strcmp($len, '0')) {
		return '';
	}

	$byte_start = $this->euc_char2byte_pos($str, $start, $charset);
	if ($byte_start === FALSE) {
		return FALSE; // $start outside string length
	}

	$str = substr($str, $byte_start);

	// loose comparison on purpose: NULL and '' both mean "no length given"
	if ($len == NULL) {
		return $str;
	}

	$byte_end = $this->euc_char2byte_pos($str, $len, $charset);
	if ($byte_end === FALSE) {
		return $str; // $len outside actual string length
	}

	return substr($str, 0, $byte_end);
}
2203
/**
 * Counts the number of characters of a string in the EUC charset family.
 *
 * Bytes of 0x80 and above are treated as the first byte of a double-byte
 * character; for 'shift_jis' only 0x80-0x9F and 0xE0 and above are.
 *
 * @param string $str EUC multibyte character string
 * @param string $charset The charset
 * @return integer The number of characters
 * @see strlen()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strlen($str, $charset) {
	$sjis = ($charset == 'shift_jis');

	// Iterate by byte length; probing $str{$i} past the end used the brace
	// offset syntax (a parse error as of PHP 8) and read an uninitialized offset.
	$str = (string) $str;
	$length = strlen($str);
	$n = 0;
	for ($i = 0; $i < $length; $i++) {
		$c = ord($str[$i]);
		if ($c >= 0x80 && (!$sjis || $c < 0xA0 || $c >= 0xE0)) {
			$i++; // advance a double-byte char
		}
		$n++;
	}

	return $n;
}
2234
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * Bytes of 0x80 and above are treated as the first byte of a double-byte
 * character; for 'shift_jis' only 0x80-0x9F and 0xE0 and above are.
 * FALSE is returned when the position lies at or behind the end of the string,
 * and for a negative position that reaches the first character or beyond.
 *
 * @param string $str EUC multibyte character string
 * @param integer $pos Character position (negative values start from the end)
 * @param string $charset The charset
 * @return integer Byte position, or FALSE if the position lies outside the string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char2byte_pos($str, $pos, $charset) {
	$str = (string) $str;
	$pos = (int) $pos;
	$length = strlen($str);
	$sjis = ($charset == 'shift_jis');

	if ($pos >= 0) {
		// Count $pos characters from the start. Explicit bounds replace the
		// $str{$i} probe, whose brace syntax is a parse error as of PHP 8.
		$n = 0; // number of characters seen
		for ($i = 0; $i < $length && $n < $pos; $i++) {
			$c = ord($str[$i]);
			if ($c >= 0x80 && (!$sjis || $c < 0xA0 || $c >= 0xE0)) {
				$i++; // advance a double-byte char
			}
			$n++;
		}
		return $i < $length ? $i : FALSE; // FALSE: offset beyond string length
	}

	// Negative position: collect the start offset of every character in a
	// forward scan and index that list from the end. The original walked the
	// bytes backwards and applied the first-byte test to second bytes, which
	// miscounts Shift-JIS text (its second bytes overlap the ASCII range).
	// It also depended on negative string offsets being empty, which has not
	// been the case since PHP 7.1.
	$starts = array();
	for ($i = 0; $i < $length; $i++) {
		$starts[] = $i;
		$c = ord($str[$i]);
		if ($c >= 0x80 && (!$sjis || $c < 0xA0 || $c >= 0xE0)) {
			$i++; // advance a double-byte char
		}
	}

	$index = count($starts) + $pos;
	if ($index <= 0) {
		// beyond the start of the string; landing exactly on the first
		// character is reported as FALSE as well, as it always was
		return FALSE;
	}

	return $starts[$index];
}
2282
/**
 * Maps all characters of a string in the EUC charset family.
 *
 * The string is split into single-byte and double-byte characters and each one
 * is looked up in the conversion table selected by $mode. Characters without a
 * table entry are copied to the output unchanged. Bytes of 0x80 and above are
 * treated as the first byte of a double-byte character; for 'shift_jis' only
 * 0x80-0x9F and 0xE0 and above are.
 *
 * @param string $str EUC multibyte character string
 * @param string $charset The charset, used to initialise and select the conversion table
 * @param string $mode 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt Only for mode 'case': the conversion, 'toLower' or 'toUpper'
 * @return string The converted string; $str is returned untouched for an unknown mode or when the table initialisation returns a false value
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char_mapping($str, $charset, $mode, $opt = '') {
	switch ($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) {
				// no conversion table: do nothing
				return $str;
			}
			$map =& $this->caseFolding[$charset][$opt];
			break;

		case 'ascii':
			if (!$this->initToASCII($charset)) {
				// no conversion table: do nothing
				return $str;
			}
			$map =& $this->toASCII[$charset];
			break;

		default:
			return $str;
	}

	$sjis = ($charset == 'shift_jis');

	// Iterate by byte length; probing $str{$i} past the end used the brace
	// offset syntax (a parse error as of PHP 8) and read an uninitialized offset.
	$str = (string) $str;
	$length = strlen($str);
	$out = '';
	for ($i = 0; $i < $length; $i++) {
		$mbc = $str[$i];
		$c = ord($mbc);

		if ($c >= 0x80 && (!$sjis || $c < 0xA0 || $c >= 0xE0)) {
			// a double-byte char: take the following byte along
			$mbc = substr($str, $i, 2);
			$i++;
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
2341
2342 }
2343
// Pull in the XCLASS file registered for this script under the current TYPO3_MODE, if there is one.
if (defined('TYPO3_MODE')) {
	if (isset($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php'])) {
		include_once $GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php'];
	}
}
2347
2348 ?>