Fixed bug #16445: Performance Tuning: Replace exec_SELECTgetRows with exec_SELECTgetS...
[Packages/TYPO3.CMS.git] / t3lib / class.t3lib_cs.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2003-2010 Kasper Skårhøj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the Typo3 project. The Typo3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 *
17 * This script is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * This copyright notice MUST APPEAR in all copies of the script!
23 ***************************************************************/
24 /**
25 * Class for conversion between charsets.
26 *
27 * $Id$
28 *
29 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
30 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
31 */
32 /**
33 * [CLASS/FUNCTION INDEX of SCRIPT]
34 *
35 *
36 *
37 * 136: class t3lib_cs
38 * 488: function parse_charset($charset)
39 * 507: function get_locale_charset($locale)
40 *
41 * SECTION: Charset Conversion functions
42 * 560: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
43 * 600: function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0)
44 * 617: function utf8_encode($str,$charset)
45 * 663: function utf8_decode($str,$charset,$useEntityForNoChar=0)
46 * 706: function utf8_to_entities($str)
47 * 739: function entities_to_utf8($str,$alsoStdHtmlEnt=0)
48 * 773: function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
49 * 823: function UnumberToChar($cbyte)
50 * 868: function utf8CharToUnumber($str,$hex=0)
51 *
52 * SECTION: Init functions
53 * 911: function initCharset($charset)
54 * 973: function initUnicodeData($mode=null)
55 * 1198: function initCaseFolding($charset)
56 * 1260: function initToASCII($charset)
57 *
58 * SECTION: String operation functions
59 * 1331: function substr($charset,$string,$start,$len=null)
60 * 1384: function strlen($charset,$string)
61 * 1414: function crop($charset,$string,$len,$crop='')
62 * 1467: function strtrunc($charset,$string,$len)
63 * 1501: function conv_case($charset,$string,$case)
64 * 1527: function specCharsToASCII($charset,$string)
65 *
66 * SECTION: Internal string operation functions
67 * 1567: function sb_char_mapping($str,$charset,$mode,$opt='')
68 *
69 * SECTION: Internal UTF-8 string operation functions
70 * 1622: function utf8_substr($str,$start,$len=null)
71 * 1655: function utf8_strlen($str)
72 * 1676: function utf8_strtrunc($str,$len)
73 * 1698: function utf8_strpos($haystack,$needle,$offset=0)
74 * 1723: function utf8_strrpos($haystack,$needle)
75 * 1745: function utf8_char2byte_pos($str,$pos)
76 * 1786: function utf8_byte2char_pos($str,$pos)
77 * 1809: function utf8_char_mapping($str,$mode,$opt='')
78 *
79 * SECTION: Internal EUC string operation functions
80 * 1885: function euc_strtrunc($str,$len,$charset)
81 * 1914: function euc_substr($str,$start,$charset,$len=null)
82 * 1939: function euc_strlen($str,$charset)
83 * 1966: function euc_char2byte_pos($str,$pos,$charset)
84 * 2007: function euc_char_mapping($str,$charset,$mode,$opt='')
85 *
86 * TOTAL FUNCTIONS: 35
87 * (This index is automatically created/updated by the extension "extdeveval")
88 *
89 */
90
91
92 /**
93 * Notes on UTF-8
94 *
95 * Functions working on UTF-8 strings:
96 *
97 * - strchr/strstr
98 * - strrchr
99 * - substr_count
100 * - implode/explode/join
101 *
102 * Functions nearly working on UTF-8 strings:
103 *
104 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
105 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
106 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
107 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
108 * - preg_*: PCRE support is compiled into PHP by default nowadays but may still be unavailable; patterns need the "u" modifier to treat pattern and subject as UTF-8
109 *
110 * Functions NOT working on UTF-8 strings:
111 *
112 * - str*cmp
113 * - stristr
114 * - stripos
115 * - substr
116 * - strrev
117 * - split/spliti
118 * - ...
119 *
120 */
121 /**
122 * Class for conversion between charsets
123 *
124 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
125 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
126 * @package TYPO3
127 * @subpackage t3lib
128 */
129 class t3lib_cs {
130 var $noCharByteVal = 63; // ASCII Value for chars with no equivalent.
131
132 // This is the array where parsed conversion tables are stored (cached)
133 var $parsedCharsets = array();
134
135 // An array where case folding data will be stored (cached)
136 var $caseFolding = array();
137
138 // An array where charset-to-ASCII mappings are stored (cached)
139 var $toASCII = array();
140
141 // This tells the converter which charsets have two bytes per char:
142 var $twoByteSets = array(
143 'ucs-2' => 1, // 2-byte Unicode
144 );
145
146 // This tells the converter which charsets have four bytes per char:
147 var $fourByteSets = array(
148 'ucs-4' => 1, // 4-byte Unicode
149 'utf-32' => 1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
150 );
151
152 // This tells the converter which charsets use a scheme like the Extended Unix Code:
153 var $eucBasedSets = array(
154 'gb2312' => 1, // Chinese, simplified.
155 'big5' => 1, // Chinese, traditional.
156 'euc-kr' => 1, // Korean
157 'shift_jis' => 1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
158 );
159
160 // see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
161 // http://czyborra.com/charsets/iso8859.html
// Alias => canonical charset name. parse_charset() lowercases and trims its
// input before looking it up here, so every key must be lowercase.
var $synonyms = array(
    'us' => 'ascii',
    'us-ascii' => 'ascii',
    'cp819' => 'iso-8859-1',
    'ibm819' => 'iso-8859-1',
    'iso-ir-100' => 'iso-8859-1',
    'iso-ir-101' => 'iso-8859-2',
    'iso-ir-109' => 'iso-8859-3',
    'iso-ir-110' => 'iso-8859-4',
    'iso-ir-144' => 'iso-8859-5',
    'iso-ir-127' => 'iso-8859-6',
    'iso-ir-126' => 'iso-8859-7',
    'iso-ir-138' => 'iso-8859-8',
    'iso-ir-148' => 'iso-8859-9',
    'iso-ir-157' => 'iso-8859-10',
    'iso-ir-179' => 'iso-8859-13',
    'iso-ir-199' => 'iso-8859-14',
    'iso-ir-203' => 'iso-8859-15',
    'csisolatin1' => 'iso-8859-1',
    'csisolatin2' => 'iso-8859-2',
    'csisolatin3' => 'iso-8859-3',
    'csisolatin5' => 'iso-8859-9',
    'csisolatin8' => 'iso-8859-14',
    'csisolatin9' => 'iso-8859-15',
    'csisolatingreek' => 'iso-8859-7',
    'iso-celtic' => 'iso-8859-14',
    'latin1' => 'iso-8859-1',
    'latin2' => 'iso-8859-2',
    'latin3' => 'iso-8859-3',
    'latin5' => 'iso-8859-9',
    'latin6' => 'iso-8859-10',
    'latin8' => 'iso-8859-14',
    'latin9' => 'iso-8859-15',
    'l1' => 'iso-8859-1',
    'l2' => 'iso-8859-2',
    'l3' => 'iso-8859-3',
    'l5' => 'iso-8859-9',
    'l6' => 'iso-8859-10',
    'l8' => 'iso-8859-14',
    'l9' => 'iso-8859-15',
    'cyrillic' => 'iso-8859-5',
    'arabic' => 'iso-8859-6',
    'tis-620' => 'iso-8859-11',
    'win874' => 'windows-874',
    // "cp874" is the name used by $script_to_charset_windows for Thai
    'cp874' => 'windows-874',
    'win1250' => 'windows-1250',
    'win1251' => 'windows-1251',
    'win1252' => 'windows-1252',
    'win1253' => 'windows-1253',
    'win1254' => 'windows-1254',
    'win1255' => 'windows-1255',
    'win1256' => 'windows-1256',
    'win1257' => 'windows-1257',
    'win1258' => 'windows-1258',
    'cp1250' => 'windows-1250',
    'cp1251' => 'windows-1251',
    'cp1252' => 'windows-1252',
    'ms-ee' => 'windows-1250',
    'ms-ansi' => 'windows-1252',
    'ms-greek' => 'windows-1253',
    'ms-turk' => 'windows-1254',
    'winbaltrim' => 'windows-1257',
    'koi-8ru' => 'koi-8r',
    'koi8r' => 'koi-8r',
    'cp878' => 'koi-8r',
    'mac' => 'macroman',
    'macintosh' => 'macroman',
    'euc-cn' => 'gb2312',
    'x-euc-cn' => 'gb2312',
    'euccn' => 'gb2312',
    'cp936' => 'gb2312',
    'big-5' => 'big5',
    'cp950' => 'big5',
    'eucjp' => 'euc-jp',
    'sjis' => 'shift_jis',
    'shift-jis' => 'shift_jis',
    'cp932' => 'shift_jis',
    'cp949' => 'euc-kr',
    'utf7' => 'utf-7',
    'utf8' => 'utf-8',
    'utf16' => 'utf-16',
    'utf32' => 'utf-32',
    'ucs2' => 'ucs-2',
    'ucs4' => 'ucs-4',
);
247
248 // mapping of iso-639-1 language codes to script names
// Language identifier (lowercase) => script/language family name.
// get_locale_charset() uses the result as key into $script_to_charset_unix
// or $script_to_charset_windows.
var $lang_to_script = array(
    // iso-639-1 language codes, see http://www.loc.gov/standards/iso639-2/php/code_list.php
    'ar' => 'arabic',
    'bg' => 'cyrillic', // Bulgarian
    'bs' => 'east_european', // Bosnian
    'cs' => 'east_european', // Czech
    'da' => 'west_european', // Danish
    'de' => 'west_european', // German
    'el' => 'greek', // Greek (ISO 639-1 code; 'gr' below is the TYPO3 key)
    'es' => 'west_european', // Spanish
    'et' => 'estonian',
    'eo' => 'unicode', // Esperanto
    'eu' => 'west_european', // Basque
    'fa' => 'arabic', // Persian
    'fi' => 'west_european', // Finnish
    'fo' => 'west_european', // Faroese
    'fr' => 'west_european', // French
    'ga' => 'west_european', // Galician
    'ge' => 'unicode', // Georgian
    'gr' => 'greek',
    'he' => 'hebrew', // Hebrew (since 1998)
    'hi' => 'unicode', // Hindi
    'hr' => 'east_european', // Croatian
    'hu' => 'east_european', // Hungarian
    'iw' => 'hebrew', // Hebrew (til 1998)
    'is' => 'west_european', // Icelandic
    'it' => 'west_european', // Italian
    'ja' => 'japanese',
    'kl' => 'west_european', // Greenlandic
    'km' => 'unicode', // Khmer
    'ko' => 'korean',
    'lt' => 'lithuanian',
    'lv' => 'west_european', // Latvian/Lettish
    'nl' => 'west_european', // Dutch
    'no' => 'west_european', // Norwegian
    'nb' => 'west_european', // Norwegian Bokmal
    'nn' => 'west_european', // Norwegian Nynorsk
    'pl' => 'east_european', // Polish
    'pt' => 'west_european', // Portuguese
    'ro' => 'east_european', // Romanian
    'ru' => 'cyrillic', // Russian
    'sk' => 'east_european', // Slovak
    'sl' => 'east_european', // Slovenian
    'sr' => 'cyrillic', // Serbian
    'sv' => 'west_european', // Swedish
    'sq' => 'albanian', // Albanian
    'th' => 'thai',
    'tr' => 'turkish', // Turkish
    'uk' => 'cyrillic', // Ukrainian
    'vi' => 'vietnamese',
    'zh' => 'chinese',
    // MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
    // http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
    'ara' => 'arabic',
    'bgr' => 'cyrillic', // Bulgarian
    'cat' => 'west_european', // Catalan
    'chs' => 'simpl_chinese',
    'cht' => 'trad_chinese',
    'csy' => 'east_european', // Czech
    'dan' => 'west_european', // Danish
    'deu' => 'west_european', // German
    'dea' => 'west_european', // German (Austrian)
    'des' => 'west_european', // German (Swiss)
    'ena' => 'west_european', // English (Australian)
    'enc' => 'west_european', // English (Canadian)
    'eng' => 'west_european', // English
    'enz' => 'west_european', // English (New Zealand)
    'enu' => 'west_european', // English (United States)
    'euq' => 'west_european', // Basque
    'fos' => 'west_european', // Faroese
    'far' => 'arabic', // Persian
    'fin' => 'west_european', // Finnish
    'fra' => 'west_european', // French
    'frb' => 'west_european', // French (Belgian)
    'frc' => 'west_european', // French (Canadian)
    'frs' => 'west_european', // French (Swiss)
    'geo' => 'unicode', // Georgian
    'glg' => 'west_european', // Galician
    'ell' => 'greek',
    'heb' => 'hebrew',
    'hin' => 'unicode', // Hindi
    'hun' => 'east_european', // Hungarian
    'isl' => 'west_european', // Icelandic
    'ita' => 'west_european', // Italian
    'its' => 'west_european', // Italian (Swiss)
    'jpn' => 'japanese',
    'khm' => 'unicode', // Khmer
    'kor' => 'korean',
    'lth' => 'lithuanian',
    'lvi' => 'west_european', // Latvian/Lettish
    'msl' => 'west_european', // Malay
    'nlb' => 'west_european', // Dutch (Belgian)
    'nld' => 'west_european', // Dutch
    'nor' => 'west_european', // Norwegian (bokmal)
    'non' => 'west_european', // Norwegian (nynorsk)
    'plk' => 'east_european', // Polish
    'ptg' => 'west_european', // Portuguese
    'ptb' => 'west_european', // Portuguese (Brazil)
    'rom' => 'east_european', // Romanian
    'rus' => 'cyrillic', // Russian
    'slv' => 'east_european', // Slovenian
    'sky' => 'east_european', // Slovak
    'srl' => 'east_european', // Serbian (Latin)
    'srb' => 'cyrillic', // Serbian (Cyrillic)
    'esp' => 'west_european', // Spanish (trad. sort)
    'esm' => 'west_european', // Spanish (Mexican)
    'esn' => 'west_european', // Spanish (internat. sort)
    'sve' => 'west_european', // Swedish
    'sqi' => 'albanian', // Albanian
    'tha' => 'thai',
    'trk' => 'turkish',
    'ukr' => 'cyrillic', // Ukrainian
    // English language names
    'albanian' => 'albanian',
    'arabic' => 'arabic',
    'basque' => 'west_european',
    'bosnian' => 'east_european',
    'bulgarian' => 'cyrillic', // same script as 'bg' / 'bgr' above
    'catalan' => 'west_european',
    'croatian' => 'east_european',
    'czech' => 'east_european',
    'danish' => 'west_european',
    'dutch' => 'west_european',
    'english' => 'west_european',
    'esperanto' => 'unicode',
    'estonian' => 'estonian',
    'faroese' => 'west_european',
    'farsi' => 'arabic',
    'finnish' => 'west_european',
    'french' => 'west_european',
    'galician' => 'west_european',
    'georgian' => 'unicode',
    'german' => 'west_european',
    'greek' => 'greek',
    'greenlandic' => 'west_european',
    'hebrew' => 'hebrew',
    'hindi' => 'unicode',
    'hungarian' => 'east_european',
    'icelandic' => 'west_european',
    'italian' => 'west_european',
    'khmer' => 'unicode',
    'latvian' => 'west_european',
    'lettish' => 'west_european',
    'lithuanian' => 'lithuanian',
    'malay' => 'west_european',
    'norwegian' => 'west_european',
    'persian' => 'arabic',
    'polish' => 'east_european',
    'portuguese' => 'west_european',
    'russian' => 'cyrillic',
    'romanian' => 'east_european',
    'serbian' => 'cyrillic',
    'slovak' => 'east_european',
    'slovenian' => 'east_european',
    'spanish' => 'west_european',
    'swedish' => 'west_european',
    'svedish' => 'west_european', // misspelt key, kept for backward compatibility
    'thai' => 'thai',
    'that' => 'thai', // misspelt key, kept for backward compatibility
    'turkish' => 'turkish',
    'ukrainian' => 'cyrillic',
);
407
408 // mapping of language (family) names to charsets on Unix
409 var $script_to_charset_unix = array(
410 'west_european' => 'iso-8859-1',
411 'estonian' => 'iso-8859-1',
412 'east_european' => 'iso-8859-2',
413 'baltic' => 'iso-8859-4',
414 'cyrillic' => 'iso-8859-5',
415 'arabic' => 'iso-8859-6',
416 'greek' => 'iso-8859-7',
417 'hebrew' => 'iso-8859-8',
418 'turkish' => 'iso-8859-9',
419 'thai' => 'iso-8859-11', // = TIS-620
420 'lithuanian' => 'iso-8859-13',
421 'chinese' => 'gb2312', // = euc-cn
422 'japanese' => 'euc-jp',
423 'korean' => 'euc-kr',
424 'simpl_chinese' => 'gb2312',
425 'trad_chinese' => 'big5',
426 'vietnamese' => '',
427 'unicode' => 'utf-8',
428 'albanian' => 'utf-8'
429 );
430
431 // mapping of language (family) names to charsets on Windows
432 var $script_to_charset_windows = array(
433 'east_european' => 'windows-1250',
434 'cyrillic' => 'windows-1251',
435 'west_european' => 'windows-1252',
436 'greek' => 'windows-1253',
437 'turkish' => 'windows-1254',
438 'hebrew' => 'windows-1255',
439 'arabic' => 'windows-1256',
440 'baltic' => 'windows-1257',
441 'estonian' => 'windows-1257',
442 'lithuanian' => 'windows-1257',
443 'vietnamese' => 'windows-1258',
444 'thai' => 'cp874',
445 'korean' => 'cp949',
446 'chinese' => 'gb2312',
447 'japanese' => 'shift_jis',
448 'simpl_chinese' => 'gb2312',
449 'trad_chinese' => 'big5',
450 'albanian' => 'windows-1250',
451 'unicode' => 'utf-8'
452 );
453
454 // mapping of locale names to charsets
// Locale name => charset, for locales whose charset cannot be derived from
// their parts. get_locale_charset() lowercases the locale before looking it
// up here, so only lowercase keys can ever match.
var $locale_to_charset = array(
    'japanese.euc' => 'euc-jp',
    'ja_jp.ujis' => 'euc-jp',
    'korean.euc' => 'euc-kr',
    'sr@latn' => 'iso-8859-2',
    'sr@Latn' => 'iso-8859-2', // unreachable through get_locale_charset(); kept for backward compatibility
    'zh_cn' => 'gb2312',
    'zh_hk' => 'big5',
    'zh_tw' => 'big5',
);
464
465 // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
466 // An empty value means "iso-8859-1"
467 var $charSetArray = array(
468 'dk' => '',
469 'de' => '',
470 'no' => '',
471 'it' => '',
472 'fr' => '',
473 'es' => '',
474 'nl' => '',
475 'cz' => 'windows-1250',
476 'pl' => 'iso-8859-2',
477 'si' => 'windows-1250',
478 'fi' => '',
479 'tr' => 'iso-8859-9',
480 'se' => '',
481 'pt' => '',
482 'ru' => 'windows-1251',
483 'ro' => 'iso-8859-2',
484 'ch' => 'gb2312',
485 'sk' => 'windows-1250',
486 'lt' => 'windows-1257',
487 'is' => 'utf-8',
488 'hr' => 'windows-1250',
489 'hu' => 'iso-8859-2',
490 'gl' => '',
491 'th' => 'iso-8859-11',
492 'gr' => 'iso-8859-7',
493 'hk' => 'big5',
494 'eu' => '',
495 'bg' => 'windows-1251',
496 'br' => '',
497 'et' => 'iso-8859-4',
498 'ar' => 'iso-8859-6',
499 'he' => 'utf-8',
500 'ua' => 'windows-1251',
501 'jp' => 'shift_jis',
502 'lv' => 'utf-8',
503 'vn' => 'utf-8',
504 'ca' => 'iso-8859-15',
505 'ba' => 'iso-8859-2',
506 'kr' => 'euc-kr',
507 'eo' => 'utf-8',
508 'my' => '',
509 'hi' => 'utf-8',
510 'fo' => 'utf-8',
511 'fa' => 'utf-8',
512 'sr' => 'utf-8',
513 'sq' => 'utf-8',
514 'ge' => 'utf-8',
515 'ga' => '',
516 'km' => 'utf-8',
517 );
518
519 // TYPO3 specific: Array with the iso names used for each system language in TYPO3:
520 // A missing key means the ISO name is the same as the TYPO3 language key
521 var $isoArray = array(
522 'ba' => 'bs',
523 'br' => 'pt_BR',
524 'ch' => 'zh_CN',
525 'cz' => 'cs',
526 'dk' => 'da',
527 'si' => 'sl',
528 'se' => 'sv',
529 'gl' => 'kl',
530 'gr' => 'el',
531 'hk' => 'zh_HK',
532 'kr' => 'ko',
533 'ua' => 'uk',
534 'jp' => 'ja',
535 'vn' => 'vi',
536 );
537
/**
 * Normalizes a charset name: the name is lowercased, surrounding whitespace
 * is stripped and a known alias is replaced by its canonical name from
 * $this->synonyms.
 *
 * @param string Charset name in any case/spelling
 * @return string Canonical charset name; the cleaned-up input itself if no alias is registered for it
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function parse_charset($charset) {
    $normalized = trim(strtolower($charset));

    return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
}
553
/**
 * Get the charset of a locale.
 *
 * Accepted locale formats:
 *   ln            language
 *   ln_CN         language / country
 *   ln_CN.cs      language / country / charset
 *   ln_CN.cs@mod  language / country / charset / modifier
 *
 * Resolution order:
 *   1. exact match of the lowercased locale in $this->locale_to_charset
 *   2. an explicit charset part, normalized through parse_charset()
 *   3. the "euro" modifier, which yields iso-8859-15
 *   4. the language part, mapped via $this->lang_to_script and the
 *      OS specific script-to-charset table
 *   5. the OS default (windows-1252 on Windows, iso-8859-1 otherwise)
 *
 * @param string Locale string
 * @return string Charset resolved for locale string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function get_locale_charset($locale) {
    $locale = strtolower($locale);

    // exact locale specific charset?
    if (isset($this->locale_to_charset[$locale])) {
        return $this->locale_to_charset[$locale];
    }

    // Split off the modifier. array_pad() guarantees that both list()
    // targets exist even when the locale contains no "@".
    list($locale, $modifier) = array_pad(explode('@', $locale), 2, '');

    // locale contains charset: use it
    list($locale, $charset) = array_pad(explode('.', $locale), 2, '');
    if ($charset) {
        return $this->parse_charset($charset);
    }

    // modifier is 'euro' (after charset check, because of xx.utf-8@euro)
    if ($modifier == 'euro') {
        return 'iso-8859-15';
    }

    // The language is everything before the first "_"; the country part is not needed.
    $localeParts = explode('_', $locale);
    $language = $localeParts[0];
    $script = isset($this->lang_to_script[$language]) ? $this->lang_to_script[$language] : '';

    if (TYPO3_OS == 'WIN') {
        $scriptCharsets = $this->script_to_charset_windows;
        $defaultCharset = 'windows-1252';
    } else {
        $scriptCharsets = $this->script_to_charset_unix;
        $defaultCharset = 'iso-8859-1';
    }

    // Unknown scripts and scripts with an empty charset entry use the OS default.
    return !empty($scriptCharsets[$script]) ? $scriptCharsets[$script] : $defaultCharset;
}
602
603
604 /********************************************
605 *
606 * Charset Conversion functions
607 *
608 ********************************************/
609
/**
 * Convert from one charset to another charset.
 *
 * If $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'] names one of
 * "mbstring", "iconv" or "recode", that PHP extension is tried first. It is
 * only tried when the target is utf-8 or no entity fallback is requested,
 * because the extensions cannot fall back to SGML entities. Whenever the
 * extension is not configured, not loaded or fails, the conversion is done
 * with utf8_encode()/utf8_decode() of this class, using utf-8 as pivot.
 *
 * @param string Input string
 * @param string From charset (the current charset of the string)
 * @param string To charset (the output charset wanted)
 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Converted string
 * @see convArray()
 */
function conv($str, $fromCS, $toCS, $useEntityForNoChar = 0) {
    if ($fromCS == $toCS) {
        return $str;
    }

    // PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
    if ($toCS == 'utf-8' || !$useEntityForNoChar) {
        $convMethod = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'])
            ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']
            : '';
        $conv_str = FALSE;

        // Each extension is only called when it is actually loaded, so a
        // stale configuration leads to the fallback instead of a fatal error.
        switch ($convMethod) {
            case 'mbstring':
                if (function_exists('mb_convert_encoding')) {
                    try {
                        // returns false for unsupported charsets (PHP < 8)
                        $conv_str = mb_convert_encoding($str, $toCS, $fromCS);
                    } catch (ValueError $e) {
                        // PHP >= 8 throws instead of returning false
                        $conv_str = FALSE;
                    }
                }
                break;

            case 'iconv':
                if (function_exists('iconv')) {
                    $conv_str = iconv($fromCS, $toCS . '//TRANSLIT', $str);
                }
                break;

            case 'recode':
                if (function_exists('recode_string')) {
                    $conv_str = recode_string($fromCS . '..' . $toCS, $str);
                }
                break;
        }

        if (FALSE !== $conv_str) {
            return $conv_str;
        }
        // fallback to TYPO3 conversion
    }

    if ($fromCS != 'utf-8') {
        $str = $this->utf8_encode($str, $fromCS);
    }
    if ($toCS != 'utf-8') {
        $str = $this->utf8_decode($str, $toCS, $useEntityForNoChar);
    }
    return $str;
}
660
/**
 * Converts every string found in an array from one charset to another.
 * Nested arrays are processed recursively; values of any other type are
 * left untouched.
 * NOTICE: The array is passed by reference and modified in place!
 *
 * @param array Input array, possibly multidimensional
 * @param string From charset (the current charset of the strings)
 * @param string To charset (the output charset wanted)
 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return void
 * @see conv()
 */
function convArray(&$array, $fromCS, $toCS, $useEntityForNoChar = 0) {
    foreach ($array as $index => $unused) {
        // Always work on $array[$index]; the loop value is only a copy.
        if (is_array($array[$index])) {
            $this->convArray($array[$index], $fromCS, $toCS, $useEntityForNoChar);
            continue;
        }
        if (is_string($array[$index])) {
            $array[$index] = $this->conv($array[$index], $fromCS, $toCS, $useEntityForNoChar);
        }
    }
}
681
/**
 * Converts $str from $charset to UTF-8
 *
 * Bytes 0-127 are copied unchanged, except for charsets listed in
 * $this->twoByteSets where every character is read as a big endian byte
 * pair. Every other character is looked up in the parsed conversion table
 * of the charset; characters without a table entry are replaced by
 * chr($this->noCharByteVal).
 *
 * @param string String in local charset to convert to UTF-8
 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return string Output string, converted to UTF-8. Nothing (NULL) is returned if initCharset() reports failure for the charset.
 */
function utf8_encode($str, $charset) {

    if ($charset === 'utf-8') {
        return $str;
    }

    // Charset is case-insensitive.
    if ($this->initCharset($charset)) { // Parse conv. table if not already...
        $strLen = strlen($str);
        $outStr = '';
        // Neither lookup changes while the string is traversed.
        $isTwoByteSet = isset($this->twoByteSets[$charset]);
        $isEucBasedSet = isset($this->eucBasedSets[$charset]);

        for ($a = 0; $a < $strLen; $a++) { // Traverse each char in string.
            $chr = substr($str, $a, 1);
            $ord = ord($chr);
            if ($isTwoByteSet) { // The charset has two bytes per char
                // A truncated trailing character is completed with a zero byte.
                $ord2 = ($a + 1 < $strLen) ? ord($str[$a + 1]) : 0;
                $ord = $ord << 8 | $ord2; // assume big endian

                if (isset($this->parsedCharsets[$charset]['local'][$ord])) { // Local char-number found in parsed conv. table
                    $outStr .= $this->parsedCharsets[$charset]['local'][$ord];
                } else { // No char exists
                    $outStr .= chr($this->noCharByteVal);
                }
                $a++;
            } elseif ($ord > 127) { // Not ASCII: must be translated through the conversion table
                // EUC uses two bytes above 127; we get both, advance the pointer and make $ord a 16bit int.
                // Shift-JIS: chars between 160 and 223 are single byte
                if ($isEucBasedSet && ($charset != 'shift_jis' || ($ord < 0xA0 || $ord > 0xDF))) {
                    $a++;
                    $ord2 = ($a < $strLen) ? ord($str[$a]) : 0;
                    $ord = $ord * 256 + $ord2;
                }

                if (isset($this->parsedCharsets[$charset]['local'][$ord])) { // Local char-number found in parsed conv. table
                    $outStr .= $this->parsedCharsets[$charset]['local'][$ord];
                } else { // No char exists
                    $outStr .= chr($this->noCharByteVal);
                }
            } else { // ... otherwise it's just ASCII 0-127 and one byte. Transparent
                $outStr .= $chr;
            }
        }
        return $outStr;
    }
}
734
/**
 * Converts $str from UTF-8 to $charset
 *
 * ASCII bytes are copied unchanged. For every multibyte sequence the local
 * character number is looked up in the parsed conversion table; numbers
 * above 255 are written as two bytes (high byte first). Sequences without
 * a table entry become a numeric entity (if $useEntityForNoChar is set) or
 * chr($this->noCharByteVal). A continuation byte that appears without a
 * lead byte is replaced by chr($this->noCharByteVal) as well.
 *
 * NOTE(review): ASCII bytes and local numbers up to 255 are always written
 * as a single byte, also for charsets in $this->twoByteSets, whereas
 * utf8_encode() reads two bytes per character for those - confirm whether
 * two-byte target charsets are expected to work here.
 *
 * @param string String in UTF-8 to convert to local charset
 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Output string, converted to local charset. Nothing (NULL) is returned if initCharset() reports failure for the charset.
 */
function utf8_decode($str, $charset, $useEntityForNoChar = 0) {
    if ($charset === 'utf-8') {
        return $str;
    }

    // Charset is case-insensitive. Parse conv. table if not already done.
    if (!$this->initCharset($charset)) {
        return NULL;
    }

    $converted = '';
    $byteCount = strlen($str);

    for ($pos = 0; $pos < $byteCount; $pos++) {
        $byte = substr($str, $pos, 1);
        $code = ord($byte);

        // Plain ASCII passes through untouched.
        if ($code <= 127) {
            $converted .= $byte;
            continue;
        }

        // A lead byte has bit 6 set; anything else is a stray continuation byte.
        if (!($code & 64)) {
            $converted .= chr($this->noCharByteVal);
            continue;
        }

        // Every further leading 1-bit of the lead byte announces one more byte of the sequence.
        $sequence = $byte;
        for ($n = 0; $n < 8; $n++) {
            $code = $code << 1;
            if (!($code & 128)) {
                break;
            }
            $pos++;
            $sequence .= substr($str, $pos, 1);
        }

        if (isset($this->parsedCharsets[$charset]['utf8'][$sequence])) {
            $localNumber = $this->parsedCharsets[$charset]['utf8'][$sequence];
            // Numbers above 255 are split into two chars (16bit word assumed).
            $converted .= ($localNumber > 255)
                ? chr(($localNumber >> 8) & 255) . chr($localNumber & 255)
                : chr($localNumber);
        } elseif ($useEntityForNoChar) {
            $converted .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
        } else {
            $converted .= chr($this->noCharByteVal);
        }
    }

    return $converted;
}
793
/**
 * Converts all chars > 127 to numeric entities.
 *
 * Each multibyte sequence is replaced by "&#" . utf8CharToUnumber($sequence, 1) . ";".
 * A continuation byte that appears without a lead byte is replaced by
 * chr($this->noCharByteVal); ASCII bytes are copied unchanged.
 *
 * @param string Input string, UTF-8
 * @return string Output string
 */
function utf8_to_entities($str) {
    $result = '';
    $byteCount = strlen($str);

    for ($pos = 0; $pos < $byteCount; $pos++) {
        $byte = substr($str, $pos, 1);
        $code = ord($byte);

        // Plain ASCII passes through untouched.
        if ($code <= 127) {
            $result .= $byte;
            continue;
        }

        // A lead byte has bit 6 set; anything else is a stray continuation byte.
        if (!($code & 64)) {
            $result .= chr($this->noCharByteVal);
            continue;
        }

        // Every further leading 1-bit of the lead byte announces one more byte of the sequence.
        $sequence = $byte;
        for ($n = 0; $n < 8; $n++) {
            $code = $code << 1;
            if (!($code & 128)) {
                break;
            }
            $pos++;
            $sequence .= substr($str, $pos, 1);
        }

        $result .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
    }

    return $result;
}
831
/**
 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
 *
 * Only well-formed numeric entities are converted: "&#" followed by decimal
 * digits, or "&#x" / "&#X" followed by hexadecimal digits. Malformed ones
 * (like "&#;" or "&#xZZ;") and unknown named entities are left as they are.
 *
 * @param string Input string, UTF-8
 * @param boolean If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
 * @return string Output string
 */
function entities_to_utf8($str, $alsoStdHtmlEnt = 0) {
    $trans_tbl = array();
    if ($alsoStdHtmlEnt) {
        // The table must contain iso-8859-1 characters because they are
        // passed through utf8_encode(..., 'iso-8859-1') below. Since PHP 5.4
        // the default table encoding is UTF-8, so it is requested explicitly
        // wherever the encoding parameter exists (PHP 5.3.4 and later).
        if (version_compare(PHP_VERSION, '5.3.4', '>=')) {
            $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'ISO-8859-1'));
        } else {
            $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES));
        }
    }

    // Splitting with the captured entity name yields: text, name, text, name, ...
    // so every odd element is the inner part of an entity.
    $parts = preg_split('/&([#[:alnum:]]*);/', $str, -1, PREG_SPLIT_DELIM_CAPTURE);
    if ($parts === FALSE) {
        return $str;
    }

    foreach ($parts as $k => $v) {
        if (!($k % 2)) {
            continue;
        }

        $entity = '&' . $v . ';';
        if (preg_match('/^#[xX]([0-9a-fA-F]+)$/D', $v, $match)) { // Hex entity
            $parts[$k] = $this->UnumberToChar(hexdec($match[1]));
        } elseif (preg_match('/^#([0-9]+)$/D', $v, $match)) { // Dec entity
            $parts[$k] = $this->UnumberToChar($match[1] + 0);
        } elseif ($alsoStdHtmlEnt && isset($trans_tbl[$entity])) { // Other entities
            $parts[$k] = $this->utf8_encode($trans_tbl[$entity], 'iso-8859-1');
        } else { // No conversion
            $parts[$k] = $entity;
        }
    }

    return implode('', $parts);
}
864
/**
 * Converts all chars in the input UTF-8 string into integer numbers returned in an array
 *
 * One array element is produced per character. A continuation byte that
 * appears without a lead byte is reported as $this->noCharByteVal (or the
 * corresponding character when $retChar is set).
 *
 * @param string Input string, UTF-8
 * @param boolean If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
 * @param boolean If set, then instead of integer numbers the real UTF-8 char is returned.
 * @return array Output array with the char numbers
 */
function utf8_to_numberarray($str, $convEntities = 0, $retChar = 0) {
    // Entities are resolved first so that they count as single characters.
    if ($convEntities) {
        $str = $this->entities_to_utf8($str, 1);
    }

    $numbers = array();
    $byteCount = strlen($str);

    for ($pos = 0; $pos < $byteCount; $pos++) {
        $byte = substr($str, $pos, 1);
        $code = ord($byte);

        // Plain ASCII: one byte, one character.
        if ($code <= 127) {
            $numbers[] = $retChar ? chr($code) : $code;
            continue;
        }

        // A lead byte has bit 6 set; anything else is a stray continuation byte.
        if (!($code & 64)) {
            $numbers[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
            continue;
        }

        // Every further leading 1-bit of the lead byte announces one more byte of the sequence.
        $sequence = $byte;
        for ($n = 0; $n < 8; $n++) {
            $code = $code << 1;
            if (!($code & 128)) {
                break;
            }
            $pos++;
            $sequence .= substr($str, $pos, 1);
        }

        $numbers[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
    }

    return $numbers;
}
909
/**
 * Converts a UNICODE number to a UTF-8 multibyte character
 * Algorithm based on script found at From: http://czyborra.com/utf/
 *
 * The bits of the number are spread across the bytes of the sequence; the
 * number of high bits set in the lead byte announces the length:
 *
 *  bytes | bits | representation
 *      1 |    7 | 0vvvvvvv
 *      2 |   11 | 110vvvvv 10vvvvvv
 *      3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
 *      4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * Numbers of 0x80000000 and above cannot be expressed and yield
 * chr($this->noCharByteVal).
 *
 * @param integer UNICODE integer
 * @return string UTF-8 multibyte character string
 * @see utf8CharToUnumber()
 */
function UnumberToChar($cbyte) {
    // 7 bit: a single byte
    if ($cbyte < 0x80) {
        return chr($cbyte);
    }

    // Pick the lead byte marker and the number of continuation bytes.
    if ($cbyte < 0x800) {
        $leadMarker = 0xC0;
        $continuationBytes = 1;
    } elseif ($cbyte < 0x10000) {
        $leadMarker = 0xE0;
        $continuationBytes = 2;
    } elseif ($cbyte < 0x200000) {
        $leadMarker = 0xF0;
        $continuationBytes = 3;
    } elseif ($cbyte < 0x4000000) {
        $leadMarker = 0xF8;
        $continuationBytes = 4;
    } elseif ($cbyte < 0x80000000) {
        $leadMarker = 0xFC;
        $continuationBytes = 5;
    } else { // Cannot express a 32-bit character in UTF-8
        return chr($this->noCharByteVal);
    }

    // The lead byte carries the topmost bits, each continuation byte the next six.
    $char = chr($leadMarker | ($cbyte >> (6 * $continuationBytes)));
    for ($shift = 6 * ($continuationBytes - 1); $shift >= 0; $shift -= 6) {
        $char .= chr(0x80 | (($cbyte >> $shift) & 0x3F));
    }

    return $char;
}
974
/**
 * Decodes a single UTF-8 multibyte character into its UNICODE number.
 * Unit-tested by Kasper
 *
 * @param	string		UTF-8 multibyte character string (only the first character is looked at)
 * @param	boolean		If set, the number is returned as a hex string prefixed with "x".
 * @return	integer		UNICODE integer (or "x" + hex string if $hex is set)
 * @see UnumberToChar()
 */
function utf8CharToUnumber($str, $hex = 0) {
	$leadByte = ord(substr($str, 0, 1));
	$int = $leadByte; // single byte: the byte value is the number

	// Both top bits set means this is the lead byte of a multibyte sequence.
	if (($leadByte & 192) == 192) {
		$tailBits = '';
		$shifted = $leadByte;
		$b = 0;
		while ($b < 8) {
			$shifted = $shifted << 1;
			if (!($shifted & 128)) {
				// No further high bit set in the lead byte: all continuation bytes are read.
				break;
			}
			// Each continuation byte contributes its lower six bits.
			$tailBits .= sprintf('%06b', ord(substr($str, $b + 1, 1)) & 0x3F);
			$b++;
		}

		// $b is now the number of continuation bytes; the lead byte holds the remaining (6 - $b) value bits.
		$leadBits = substr('00000000' . decbin($leadByte), -(6 - $b));
		$int = bindec($leadBits . $tailBits);
	}

	return $hex ? 'x' . dechex($int) : $int;
}
1006
1007
1008 /********************************************
1009 *
1010 * Init functions
1011 *
1012 ********************************************/
1013
/**
 * Makes the conversion table of a charset available in $this->parsedCharsets, provided a
 * table file for it exists in the PATH_t3lib.'csconvtbl/' folder.
 * This function is automatically called by the conversion functions
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param	string		The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return	integer		Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and loaded (from the cache file or by parsing).
 * @access private
 */
function initCharset($charset) {
	// Nothing to do if the charset is loaded already:
	if (is_array($this->parsedCharsets[$charset])) {
		return 1;
	}

	// Without a conversion table file there is nothing to load:
	$tableFile = PATH_t3lib . 'csconvtbl/' . $charset . '.tbl';
	if (!($charset && t3lib_div::validPathStr($tableFile) && @is_file($tableFile))) {
		return FALSE;
	}

	// Prefer the cache file:
	// Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_' . $charset . '.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->parsedCharsets[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

	// No cache yet: parse the conversion table line by line.
	$rows = t3lib_div::trimExplode(LF, t3lib_div::getUrl($tableFile), 1);
	$this->parsedCharsets[$charset] = array('local' => array(), 'utf8' => array());

	// The "whitespaced" format is on the syntax "0x0A 0x000A #LINE FEED"
	// while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
	$whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
	$format = '';

	foreach ($rows as $row) {
		// Blank lines and comment lines are ignored.
		if (!trim($row) || substr($row, 0, 1) == '#') {
			continue;
		}

		// The format is detected once, on the first real line.
		if (!$format) {
			$format = preg_match($whitespacedPattern, $row) ? 'whitespaced' : 'ms-token';
		}

		if ($format == 'ms-token') {
			list($hexbyte, $utf8) = preg_split('/[=:]/', $row, 3);
		} elseif ($format == 'whitespaced') {
			$found = array();
			preg_match($whitespacedPattern, $row, $found);
			$hexbyte = $found[1];
			$utf8 = 'U+' . $found[2];
		}

		// Only values above the ASCII range need a mapping.
		$localValue = hexdec(trim($hexbyte));
		if ($localValue > 127) {
			$utf8Char = $this->UnumberToChar(hexdec(substr(trim($utf8), 2)));
			$this->parsedCharsets[$charset]['local'][$localValue] = $utf8Char;
			$this->parsedCharsets[$charset]['utf8'][$utf8Char] = $localValue;
		}
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->parsedCharsets[$charset]));
	}

	return 2;
}
1082
/**
 * This function initializes all UTF-8 character data tables.
 *
 * For the modes "case" and "ascii" an already loaded table or a cached copy from typo3temp/cs/
 * is used if available. Otherwise BOTH tables (case folding and ASCII transliteration) are rebuilt
 * from unidata/UnicodeData.txt - plus the optional SpecialCasing.txt and Translit.txt - and both
 * cache files are written.
 *
 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
 *
 * @param	string		Mode ("case", "ascii", ...)
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initUnicodeData($mode = NULL) {
	// cache files
	$cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
	$cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');

	// Only process if the tables are not yet loaded
	switch ($mode) {
		case 'case':
			if (is_array($this->caseFolding['utf-8'])) {
				return 1;
			}

			// Use cached version if possible
			if ($cacheFileCase && @is_file($cacheFileCase)) {
				$this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase));
				return 2;
			}
			break;

		case 'ascii':
			if (is_array($this->toASCII['utf-8'])) {
				return 1;
			}

			// Use cached version if possible
			if ($cacheFileASCII && @is_file($cacheFileASCII)) {
				$this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII));
				return 2;
			}
			break;
	}

	// process main Unicode data file
	$unicodeDataFile = PATH_t3lib . 'unidata/UnicodeData.txt';
	if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
		return FALSE;
	}

	$fh = fopen($unicodeDataFile, 'rb');
	if (!$fh) {
		return FALSE;
	}

	// key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
	// note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
	$this->caseFolding['utf-8'] = array();
	$utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand
	$utf8CaseFolding['toUpper'] = array();
	$utf8CaseFolding['toLower'] = array();
	$utf8CaseFolding['toTitle'] = array();

	$decomposition = array(); // array of temp. decompositions
	$mark = array(); // array of chars that are marks (eg. composing accents)
	$number = array(); // array of chars that are numbers (eg. digits)
	$omit = array(); // array of chars to be omitted (eg. Russian hard sign)

	while (!feof($fh)) {
		$line = fgets($fh, 4096);
		// A blank line (typically the last read before EOF) carries no fields; skip it
		// instead of pushing empty values through list() and the string offset below.
		if (trim($line) == '') {
			continue;
		}
		// has a lot of info
		list($char, $name, $cat, , , $decomp, , , $num, , , , $upper, $lower, $title,) = explode(';', rtrim($line));

		$ord = hexdec($char);
		if ($ord > 0xFFFF) {
			break;
		} // only process the BMP

		$utf8_char = $this->UnumberToChar($ord);

		if ($upper) {
			$utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
		}
		if ($lower) {
			$utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
		}
		// store "title" only when different from "upper" (only a few)
		if ($title && $title != $upper) {
			$utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
		}

		// First letter of the general category (substr() instead of the removed $cat{0} syntax)
		switch (substr((string) $cat, 0, 1)) {
			case 'M': // mark (accent, umlaut, ...)
				$mark["U+$char"] = 1;
				break;

			case 'N': // numeric value
				if ($ord > 0x80 && $num != '') {
					$number["U+$char"] = $num;
				}
		}

		// accented Latin letters without "official" decomposition
		$match = array();
		if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
			$c = ord($match[2]);
			if ($match[1] == 'SMALL') {
				$c += 32;
			}

			$decomposition["U+$char"] = array(dechex($c));
			continue;
		}

		$match = array();
		if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
			switch ($match[1]) {
				case '<circle>': // add parenthesis as circle replacement, eg (1)
					$match[2] = '0028 ' . $match[2] . ' 0029';
					break;

				case '<square>': // add square brackets as square replacement, eg [1]
					$match[2] = '005B ' . $match[2] . ' 005D';
					break;

				case '<compat>': // ignore multi char decompositions that start with a space
					if (preg_match('/^0020 /', $match[2])) {
						continue 2;
					}
					break;

				// ignore Arabic and vertical layout presentation decomposition
				case '<initial>':
				case '<medial>':
				case '<final>':
				case '<isolated>':
				case '<vertical>':
					continue 2;
			}
			$decomposition["U+$char"] = explode(' ', $match[2]);
		}
	}
	fclose($fh);

	// process additional Unicode data for casing (allow folded characters to expand into a sequence)
	$specialCasingFile = PATH_t3lib . 'unidata/SpecialCasing.txt';
	if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
		$fh = fopen($specialCasingFile, 'rb');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh, 4096);
				// Blank check first, so the offset access never hits an empty string
				if (trim($line) != '' && $line[0] != '#') {

					list($char, $lower, $title, $upper, $cond) = t3lib_div::trimExplode(';', $line);
					// Only unconditional mappings are used (no condition, or just a trailing comment)
					if ($cond == '' || $cond[0] == '#') {
						$utf8_char = $this->UnumberToChar(hexdec($char));
						if ($char != $lower) {
							$arr = explode(' ', $lower);
							for ($i = 0; isset($arr[$i]); $i++) {
								$arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
							}
							$utf8CaseFolding['toLower'][$utf8_char] = implode('', $arr);
						}
						if ($char != $title && $title != $upper) {
							$arr = explode(' ', $title);
							for ($i = 0; isset($arr[$i]); $i++) {
								$arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
							}
							$utf8CaseFolding['toTitle'][$utf8_char] = implode('', $arr);
						}
						if ($char != $upper) {
							$arr = explode(' ', $upper);
							for ($i = 0; isset($arr[$i]); $i++) {
								$arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
							}
							$utf8CaseFolding['toUpper'][$utf8_char] = implode('', $arr);
						}
					}
				}
			}
			fclose($fh);
		}
	}

	// process custom decompositions
	$customTranslitFile = PATH_t3lib . 'unidata/Translit.txt';
	if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
		$fh = fopen($customTranslitFile, 'rb');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh, 4096);
				// Blank check first, so the offset access never hits an empty string
				if (trim($line) != '' && $line[0] != '#') {
					list($char, $translit) = t3lib_div::trimExplode(';', $line);
					// An empty transliteration means: drop this character altogether
					if (!$translit) {
						$omit["U+$char"] = 1;
					}
					$decomposition["U+$char"] = explode(' ', $translit);

				}
			}
			fclose($fh);
		}
	}

	// decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
	foreach ($decomposition as $from => $to) {
		$code_decomp = array();

		while ($code_value = array_shift($to)) {
			if (isset($decomposition["U+$code_value"])) { // do recursive decomposition
				foreach (array_reverse($decomposition["U+$code_value"]) as $cv) {
					array_unshift($to, $cv);
				}
			} elseif (!isset($mark["U+$code_value"])) { // remove mark
				array_push($code_decomp, $code_value);
			}
		}
		if (count($code_decomp) || isset($omit[$from])) {
			$decomposition[$from] = $code_decomp;
		} else {
			unset($decomposition[$from]);
		}
	}

	// create ascii only mapping
	$this->toASCII['utf-8'] = array();
	$ascii =& $this->toASCII['utf-8'];

	foreach ($decomposition as $from => $to) {
		$code_decomp = array();
		while ($code_value = array_shift($to)) {
			$ord = hexdec($code_value);
			if ($ord > 127) {
				continue 2;
			} // skip decompositions containing non-ASCII chars
			else {
				array_push($code_decomp, chr($ord));
			}
		}
		$ascii[$this->UnumberToChar(hexdec($from))] = join('', $code_decomp);
	}

	// add numeric decompositions
	foreach ($number as $from => $to) {
		$utf8_char = $this->UnumberToChar(hexdec($from));
		if (!isset($ascii[$utf8_char])) {
			$ascii[$utf8_char] = $to;
		}
	}

	if ($cacheFileCase) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
	}

	if ($cacheFileASCII) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
	}

	return 3;
}
1335
/**
 * Builds the case folding table for a charset other than UTF-8 by mapping the UTF-8
 * folding table through the conversion table of that charset.
 * This function is automatically called by the case folding functions.
 *
 * @param	string		Charset for which to initialize case folding.
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initCaseFolding($charset) {
	// Table is in memory already:
	if (is_array($this->caseFolding[$charset])) {
		return 1;
	}

	// Table is available from the cache file:
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_' . $charset . '.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

	// We need the UTF-8 conversion table of this charset, and the UTF-8 case folding
	// table that serves as the base conversion table.
	if (!$this->initCharset($charset) || !$this->initUnicodeData('case')) {
		return FALSE;
	}

	$nochar = chr($this->noCharByteVal);
	$foldings = array('toUpper', 'toLower', 'toTitle');

	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
		// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);

		foreach ($foldings as $folding) {
			// Fold in UTF-8, then convert the folded character back to the charset;
			// keep it only if the charset can actually express it.
			$cc = $this->utf8_decode($this->caseFolding['utf-8'][$folding][$utf8], $charset);
			if ($cc != '' && $cc != $nochar) {
				$this->caseFolding[$charset][$folding][$c] = $cc;
			}
		}
	}

	// add the ASCII case table (a-z => A-Z)
	for ($code = ord('a'); $code <= ord('z'); $code++) {
		$this->caseFolding[$charset]['toUpper'][chr($code)] = chr($code - 32);
	}
	// ... and the reverse direction (A-Z => a-z)
	for ($code = ord('A'); $code <= ord('Z'); $code++) {
		$this->caseFolding[$charset]['toLower'][chr($code)] = chr($code + 32);
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->caseFolding[$charset]));
	}

	return 3;
}
1405
/**
 * Builds the to-ASCII conversion table for a charset other than UTF-8 by mapping the
 * UTF-8 transliteration table through the conversion table of that charset.
 * This function is automatically called by the ASCII transliteration functions.
 *
 * @param	string		Charset for which to initialize conversion.
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initToASCII($charset) {
	// Table is in memory already:
	if (is_array($this->toASCII[$charset])) {
		return 1;
	}

	// Table is available from the cache file:
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_' . $charset . '.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->toASCII[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

	// We need the UTF-8 conversion table of this charset, and the UTF-8/ASCII
	// transliteration table that serves as the base conversion table.
	if (!$this->initCharset($charset) || !$this->initUnicodeData('ascii')) {
		return FALSE;
	}

	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
		// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$localChar = $this->utf8_decode($utf8, $charset);

		// Take over the transliteration where UTF-8 has one for this character.
		if (isset($this->toASCII['utf-8'][$utf8])) {
			$this->toASCII[$charset][$localChar] = $this->toASCII['utf-8'][$utf8];
		}
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->toASCII[$charset]));
	}

	return 3;
}
1453
1454
1455 /********************************************
1456 *
1457 * String operation functions
1458 *
1459 ********************************************/
1460
/**
 * Returns a part of a string.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * Delegates to mbstring or iconv if configured in
 * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'], otherwise to the internal
 * UTF-8 / EUC implementations or to byte arithmetic for fixed-width charsets.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Start position (character position)
 * @param	integer		Length (in characters); NULL means "until the end of the string"
 * @return	string		The substring
 * @see substr(), mb_substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function substr($charset, $string, $start, $len = NULL) {
	if ($len === 0 || $string === '') {
		return '';
	}

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		// cannot omit $len, when specifying charset
		if ($len == NULL) {
			$enc = mb_internal_encoding(); // save internal encoding
			mb_internal_encoding($charset);
			$str = mb_substr($string, $start);
			mb_internal_encoding($enc); // restore internal encoding

			return $str;
		} else {
			return mb_substr($string, $start, $len, $charset);
		}
	} elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
		// cannot omit $len, when specifying charset
		if ($len == NULL) {
			$enc = iconv_get_encoding('internal_encoding'); // save internal encoding
			iconv_set_encoding('internal_encoding', $charset);
			$str = iconv_substr($string, $start);
			iconv_set_encoding('internal_encoding', $enc); // restore internal encoding

			return $str;
		} else {
			return iconv_substr($string, $start, $len, $charset);
		}
	} elseif ($charset == 'utf-8') {
		return $this->utf8_substr($string, $start, $len);
	} elseif ($this->eucBasedSets[$charset]) {
		return $this->euc_substr($string, $start, $charset, $len);
	} elseif ($this->twoByteSets[$charset]) {
		// An omitted length must not be multiplied: NULL * 2 is 0, which would cut everything off.
		return $len === NULL ? substr($string, $start * 2) : substr($string, $start * 2, $len * 2);
	} elseif ($this->fourByteSets[$charset]) {
		// Same as above for four bytes per character.
		return $len === NULL ? substr($string, $start * 4) : substr($string, $start * 4, $len * 4);
	}

	// treat everything else as single-byte encoding
	return $len === NULL ? substr($string, $start) : substr($string, $start, $len);
}
1517
/**
 * Returns the length of a string in characters (not bytes).
 * Unit-tested by Kasper (single byte charsets only)
 *
 * @param	string		The character set
 * @param	string		Character string
 * @return	integer		The number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strlen($charset, $string) {
	$utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];

	// Let a configured PHP extension do the work ...
	if ($utils == 'mbstring') {
		return mb_strlen($string, $charset);
	}
	if ($utils == 'iconv') {
		return iconv_strlen($string, $charset);
	}

	// ... otherwise use the internal variable-width implementations ...
	if ($charset == 'utf-8') {
		return $this->utf8_strlen($string);
	}
	if ($this->eucBasedSets[$charset]) {
		return $this->euc_strlen($string, $charset);
	}

	// ... or derive the count from the byte length for fixed-width charsets.
	if ($this->twoByteSets[$charset]) {
		return strlen($string) / 2;
	}
	if ($this->fourByteSets[$charset]) {
		return strlen($string) / 4;
	}

	// treat everything else as single-byte encoding
	return strlen($string);
}
1545
/**
 * Crops a string with the help of the mbstring functions.
 *
 * A positive $len keeps the first $len characters and appends $crop, a negative $len keeps
 * the last abs($len) characters and prepends $crop. The string is returned untouched if
 * $len is zero or the string is not longer than abs($len) characters.
 *
 * @param	string		The character set
 * @param	string		String to be cropped
 * @param	integer		Crop length (in characters)
 * @param	string		Crop signifier
 * @return	string		The shortened string
 * @see mb_strlen(), mb_substr()
 */
protected function cropMbstring($charset, $string, $len, $crop = '') {
	if (intval($len) === 0) {
		return $string;
	}

	$charCount = mb_strlen($string, $charset);
	if ($charCount <= abs($len)) {
		// Nothing to cut off.
		return $string;
	}

	return $len > 0
		? mb_substr($string, 0, $len, $charset) . $crop
		: $crop . mb_substr($string, $len, $charCount, $charset);
}
1569
/**
 * Truncates a string and pre-/appends a string.
 * Unit tested by Kasper
 *
 * A positive $len keeps the first $len characters and appends $crop, a negative $len keeps
 * the last abs($len) characters and prepends $crop. If nothing has to be cut off (or $len is
 * zero) the string is returned unchanged and without the crop signifier.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Length (in characters)
 * @param	string		Crop signifier
 * @return	string		The shortened string
 * @see substr(), mb_strimwidth()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function crop($charset, $string, $len, $crop = '') {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return $this->cropMbstring($charset, $string, $len, $crop);
	}

	if (intval($len) == 0) {
		return $string;
	}

	// Translate the character position into a byte position ($i):
	if ($charset == 'utf-8') {
		$i = $this->utf8_char2byte_pos($string, $len);
	} elseif ($this->eucBasedSets[$charset]) {
		$i = $this->euc_char2byte_pos($string, $len, $charset);
	} else {
		if ($len > 0) {
			$i = $len;
		} else {
			$i = strlen($string) + $len;
			if ($i <= 0) {
				$i = FALSE;
			}
		}
	}

	if ($i === FALSE) { // $len outside actual string length
		return $string;
	}

	// Explicit bounds checks replace the former $string{$i} probes: curly brace offsets are
	// gone in PHP 8, and negative offsets changed their meaning in PHP 7.1.
	$byteLength = strlen($string);
	if ($len > 0) {
		// Only crop if there is at least one byte at the cut position.
		if ($i >= 0 && $i < $byteLength) {
			return substr($string, 0, $i) . $crop;
		}
	} else {
		// Only crop if there is at least one byte in front of the cut position.
		if ($i >= 1 && $i <= $byteLength) {
			return $crop . substr($string, $i);
		}
	}

	return $string;
}
1632
/**
 * Shortens a string to at most the given number of bytes without cutting a
 * character of a multi-byte charset in half.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		The byte length
 * @return	string		The shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strtrunc($charset, $string, $len) {
	if ($len <= 0) {
		return '';
	}

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strcut($string, 0, $len, $charset);
	}
	if ($charset == 'utf-8') {
		return $this->utf8_strtrunc($string, $len);
	}
	if ($this->eucBasedSets[$charset]) {
		return $this->euc_strtrunc($string, $len, $charset);
	}

	// Fixed-width charsets: realign the length to a character boundary.
	if ($this->twoByteSets[$charset]) {
		$len -= $len % 2; // don't cut at odd positions
	} elseif ($this->fourByteSets[$charset]) {
		$len -= $len % 4; // realign to position dividable by four
	}

	// treat everything else as single-byte encoding
	return substr($string, 0, $len);
}
1665
/**
 * Converts all characters of a string to lower or upper case.
 * Unlike strtolower() and strtoupper() this method is locale independent.
 * Note that the string length may change!
 * eg. lower case German "ß" (sharp S) becomes upper case "SS"
 * Unit-tested by Kasper
 * Real case folding is language dependent, this method ignores this fact.
 *
 * @param	string		Character set of string
 * @param	string		Input string to convert case for
 * @param	string		Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
 * @return	string		The converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper()
 */
function conv_case($charset, $string, $case) {
	// mbstring handles every charset itself.
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return $case == 'toLower'
			? mb_strtolower($string, $charset)
			: mb_strtoupper($string, $charset);
	}

	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string, 'case', $case);
	}

	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string, $charset, 'case', $case);
	}

	// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string, $charset, 'case', $case);
}
1699
/**
 * Replaces special chars (like æøåÆØÅ, umlauts etc) with their ascii equivalents (usually double-bytes, like æ => ae etc.)
 *
 * @param	string		Character set of string
 * @param	string		Input string to convert
 * @return	string		The converted string
 */
function specCharsToASCII($charset, $string) {
	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string, 'ascii');
	}

	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string, $charset, 'ascii');
	}

	// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string, $charset, 'ascii');
}
1719
1720
/**
 * Converts the language codes that we get from the client (usually HTTP_ACCEPT_LANGUAGE)
 * into a TYPO3-readable language code.
 *
 * The codes are tried in descending order of their quality value; each code is looked up
 * as given first and then without its country part.
 *
 * @param	string		$languageCodesList list of language codes. something like 'de,en-us;q=0.9,de-de;q=0.7,es-cl;q=0.6,en;q=0.4,es;q=0.3,zh;q=0.1'
 *						see http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4
 * @return	string		a preferred language that TYPO3 supports, or "default" if none found
 * @author	Benjamin Mack (benni.typo3.org)
 */
public function getPreferredClientLanguage($languageCodesList) {
	$allLanguageCodes = array();
	$selectedLanguage = 'default';

	// get all languages where TYPO3 code is the same as the ISO code
	foreach (array_keys($this->charSetArray) as $typo3Lang) {
		$allLanguageCodes[$typo3Lang] = $typo3Lang;
	}

	// get all languages where TYPO3 code differs from ISO code
	// or needs the country part
	// the iso codes will here overwrite the default typo3 language in the key
	foreach ($this->isoArray as $typo3Lang => $isoLang) {
		$isoLang = join('-', explode('_', $isoLang));
		$allLanguageCodes[$typo3Lang] = $isoLang;
	}

	// move the iso codes to the keys (because we're comparing the keys with "isset" later on)
	$allLanguageCodes = array_flip($allLanguageCodes);

	$preferredLanguages = t3lib_div::trimExplode(',', $languageCodesList);
	// collect the preferred languages with their quality value
	$sortedPreferredLanguages = array();
	foreach ($preferredLanguages as $preferredLanguage) {
		$quality = 1.0;
		// Split off the quality parameter; whitespace around ";", "q" and "=" is tolerated
		// ("en; q=0.5"), so such an entry is no longer taken as a language code as a whole.
		$parts = preg_split('/\s*;\s*q\s*=\s*/', $preferredLanguage);
		if (isset($parts[1])) {
			$preferredLanguage = $parts[0];
			$quality = $parts[1];
		}
		$sortedPreferredLanguages[$preferredLanguage] = $quality;
	}

	// loop through the languages, with the highest priority first
	arsort($sortedPreferredLanguages, SORT_NUMERIC);
	foreach ($sortedPreferredLanguages as $preferredLanguage => $quality) {
		if (isset($allLanguageCodes[$preferredLanguage])) {
			$selectedLanguage = $allLanguageCodes[$preferredLanguage];
			break;
		}

		// strip the country code from the end (index 0 always exists, even without a "-",
		// which avoids the undefined offset of list() on codes like "de")
		$languageParts = explode('-', $preferredLanguage);
		$primaryLanguage = $languageParts[0];
		if (isset($allLanguageCodes[$primaryLanguage])) {
			$selectedLanguage = $allLanguageCodes[$primaryLanguage];
			break;
		}
	}

	// English is the TYPO3 default language
	if (!$selectedLanguage || $selectedLanguage == 'en') {
		$selectedLanguage = 'default';
	}
	return $selectedLanguage;
}
1781
1782
1783 /********************************************
1784 *
1785 * Internal string operation functions
1786 *
1787 ********************************************/
1788
/**
 * Maps all characters of a string in a single byte charset.
 *
 * The mapping table is initialized on demand; if that fails (or the mode is unknown)
 * the string is returned unchanged.
 *
 * @param	string		the string
 * @param	string		the charset
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function sb_char_mapping($str, $charset, $mode, $opt = '') {
	switch ($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) {
				return $str;
			} // do nothing
			$map =& $this->caseFolding[$charset][$opt];
			break;

		case 'ascii':
			if (!$this->initToASCII($charset)) {
				return $str;
			} // do nothing
			$map =& $this->toASCII[$charset];
			break;

		default:
			return $str;
	}

	// Iterate by byte length and use bracket offsets; the former $str{$i} syntax is
	// deprecated since PHP 7.4 and a parse error in PHP 8.
	$str = (string) $str;
	$byteLength = strlen($str);
	$out = '';
	for ($i = 0; $i < $byteLength; $i++) {
		$c = $str[$i];
		// Characters without an entry in the table are passed through.
		$out .= isset($map[$c]) ? $map[$c] : $c;
	}

	return $out;
}
1831
1832
1833 /********************************************
1834 *
1835 * Internal UTF-8 string operation functions
1836 *
1837 ********************************************/
1838
/**
 * Returns a part of a UTF-8 string.
 * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
 *
 * @param	string		UTF-8 string
 * @param	integer		Start position (character position)
 * @param	integer		Length (in characters)
 * @return	string		The substring; FALSE if a positive $start lies outside the string
 * @see substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_substr($str, $start, $len = NULL) {
	// A length of zero (integer or string) yields an empty string. The cast replaces
	// strcmp(), which must not be fed NULL on current PHP versions.
	if ((string) $len === '0') {
		return '';
	}

	$byte_start = $this->utf8_char2byte_pos($str, $start);
	if ($byte_start === FALSE) {
		if ($start > 0) {
			return FALSE; // $start outside string length
		}
		// A negative $start reaching beyond the beginning means "from the beginning".
		// (Formerly $start was reset here and FALSE was passed on to substr() as offset.)
		$byte_start = 0;
	}

	$str = substr($str, $byte_start);

	if ($len != NULL) {
		$byte_end = $this->utf8_char2byte_pos($str, $len);
		if ($byte_end === FALSE) { // $len outside actual string length
			// When length is less than zero and exceeds, then we return blank string.
			return $len < 0 ? '' : $str;
		}
		return substr($str, 0, $byte_end);
	}

	return $str;
}
1881
/**
 * Counts the number of characters of a string in UTF-8.
 *
 * Every byte that is not a continuation byte (10xxxxxx) starts a character,
 * so the result is the number of single-byte characters plus the number of
 * multi-byte starting bytes.
 *
 * @param string $str UTF-8 multibyte character string
 * @return integer The number of characters
 * @see strlen()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strlen($str) {
	$str = (string) $str;
	$n = 0;
	// Bounded loop with [] offsets: no reads past the end of the string and
	// no {} offsets (not accepted by PHP 8).
	$length = strlen($str);
	for ($i = 0; $i < $length; $i++) {
		// count single-byte (0xxxxxxx) and multi-byte starting bytes (11xxxxxx)
		if ((ord($str[$i]) & 0xC0) !== 0x80) {
			$n++;
		}
	}
	return $n;
}
1906
/**
 * Truncates a string in UTF-8 short at a given byte length.
 *
 * The result never ends in the middle of a multi-byte character: if the
 * character that covers byte $len - 1 does not fit completely into the first
 * $len bytes, the string is cut in front of that character.
 *
 * @param string $str UTF-8 multibyte character string
 * @param integer $len the byte length
 * @return string the shortened string
 * @see mb_strcut()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strtrunc($str, $len) {
	$str = (string) $str;
	$len = (int) $len;
	$i = $len - 1;

	// Nothing to repair when the cut position lies outside the string or
	// the last byte kept is a plain single-byte character.
	if ($i < 0 || $i >= strlen($str) || !(ord($str[$i]) & 0x80)) {
		return substr($str, 0, $len);
	}

	// part of a multibyte sequence: find the first byte
	while ($i > 0 && !(ord($str[$i]) & 0x40)) {
		$i--;
	}

	$first = ord($str[$i]);
	if (!($first & 0x40)) {
		// sanity check: reached the start of the string without finding a
		// starting byte. A character that legitimately starts at byte 0 is
		// NOT rejected here; it is handled by the fit test below.
		return '';
	}

	// calculate number of bytes of the character from its starting byte
	for ($bc = 0; $first & 0x80; $first = $first << 1) {
		$bc++;
	}

	if ($bc + $i > $len) {
		return substr($str, 0, $i); // character does not fit: drop it
	}

	// multibyte char fits into length
	return substr($str, 0, $len);
}
1931
/**
 * Find position of first occurrence of a string, both arguments are in UTF-8.
 *
 * Delegates to mb_strpos() or iconv_strpos() when
 * $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'] selects one of them; otherwise
 * the search is done byte-wise with strpos() and the positions are converted
 * between character and byte offsets.
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 string to search for
 * @param integer $offset Position to start the search
 * @return integer The character position, or FALSE if the needle was not found or the offset lies beyond the string
 * @see strpos()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strpos($haystack, $needle, $offset = 0) {
	$backend = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];
	switch ($backend) {
		case 'mbstring':
			return mb_strpos($haystack, $needle, $offset, 'utf-8');
		case 'iconv':
			return iconv_strpos($haystack, $needle, $offset, 'utf-8');
	}

	// Native fallback: translate the character offset into a byte offset ...
	$startByte = $this->utf8_char2byte_pos($haystack, $offset);
	if ($startByte === FALSE) {
		return FALSE; // offset beyond string length
	}

	// ... search on byte level ...
	$foundByte = strpos($haystack, $needle, $startByte);

	// ... and translate the byte position of the hit back into characters.
	return ($foundByte === FALSE)
		? FALSE // needle not found
		: $this->utf8_byte2char_pos($haystack, $foundByte);
}
1961
/**
 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
 *
 * Delegates to mb_strrpos() or iconv_strrpos() when
 * $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'] selects one of them; otherwise
 * the search is done byte-wise with strrpos() and the byte position of the
 * hit is converted into a character position.
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 character to search for (single character)
 * @return integer The character position, or FALSE if the needle was not found
 * @see strrpos()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strrpos($haystack, $needle) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		// The third argument of mb_strrpos() is the offset; the encoding is
		// the fourth. Passing the encoding in third position is rejected by
		// PHP 8, so the offset is given explicitly.
		return mb_strrpos($haystack, $needle, 0, 'utf-8');
	} elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
		return iconv_strrpos($haystack, $needle, 'utf-8');
	}

	$byte_pos = strrpos($haystack, $needle);
	if ($byte_pos === FALSE) {
		return FALSE; // needle not found
	}

	return $this->utf8_byte2char_pos($haystack, $byte_pos);
}
1985
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * For a position >= 0 the string is scanned forward from its first byte, for
 * a negative position backward from its last byte, counting every byte that
 * starts a character until abs($pos) characters have been seen.
 *
 * @param string $str UTF-8 string
 * @param integer $pos Character position (negative values start from the end)
 * @return integer Byte position, or FALSE if the scan ran off either end of the string (this includes a position equal to the number of characters and an empty string)
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char2byte_pos($str, $pos) {
	$str = (string) $str;
	$length = strlen($str);
	$n = 0; // number of characters found
	$p = abs($pos); // number of characters wanted

	if ($pos >= 0) {
		$i = 0;
		$d = 1;
	} else {
		$i = $length - 1;
		$d = -1;
	}

	// Explicit bounds on both sides: a negative string offset addresses
	// bytes from the end of the string on PHP >= 7.1, so it cannot be used
	// to detect that a backward scan has passed the first byte.
	for (; $i >= 0 && $i < $length && $n < $p; $i += $d) {
		// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		if ((ord($str[$i]) & 0xC0) !== 0x80) {
			$n++;
		}
	}
	if ($i < 0 || $i >= $length) {
		return FALSE; // offset beyond string length
	}

	if ($pos >= 0) {
		// skip trailing multi-byte data bytes
		while ($i < $length && (ord($str[$i]) & 0xC0) === 0x80) {
			$i++;
		}
	} else {
		// correct offset: the loop stepped one byte past the starting byte
		$i++;
	}

	return $i;
}
2034
/**
 * Translates an 'absolute' byte position into a character position.
 *
 * Counts the bytes at offsets 1..$pos that start a character. This equals
 * the character position of $pos when $pos is itself the first byte of a
 * character (as is the case for positions returned by strpos() on valid
 * UTF-8 input).
 *
 * @param string $str UTF-8 string
 * @param integer $pos byte position
 * @return integer character position, or FALSE if the string is empty or $pos is negative or lies beyond the string length
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_byte2char_pos($str, $pos) {
	$str = (string) $str;
	$pos = (int) $pos;
	$length = strlen($str);

	// Validate the position up front. Checking the offset after the counting
	// loop would only ever look at byte 0 and could not detect a position
	// beyond the string length.
	if ($length === 0 || $pos < 0 || $pos > $length) {
		return FALSE;
	}

	$n = 0; // number of characters
	for ($i = $pos; $i > 0; $i--) {
		// The position directly behind the last byte counts as a boundary;
		// otherwise count single-byte (0xxxxxxx) and multi-byte starting
		// bytes (11xxxxxx).
		if ($i === $length || (ord($str[$i]) & 0xC0) !== 0x80) {
			$n++;
		}
	}

	return $n;
}
2063
/**
 * Maps all characters of an UTF-8 string.
 *
 * The string is split into characters (a single byte, or a starting byte
 * plus the number of bytes announced by its leading 1-bits). Every character
 * is looked up in the conversion table selected by $mode (and $opt);
 * characters without an entry are copied unchanged. If the Unicode data
 * cannot be initialised or $mode is unknown, the input is returned untouched.
 *
 * @param string $str UTF-8 string
 * @param string $mode mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
 * @return string the converted string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char_mapping($str, $mode, $opt = '') {
	if (!$this->initUnicodeData($mode)) {
		return $str; // do nothing
	}

	switch ($mode) {
		case 'case':
			$map =& $this->caseFolding['utf-8'][$opt];
			break;

		case 'ascii':
			$map =& $this->toASCII['utf-8'];
			break;

		default:
			return $str;
	}

	$str = (string) $str;
	$out = '';
	$length = strlen($str);
	for ($i = 0; $i < $length; $i++) {
		$c = ord($str[$i]);
		if (($c & 0xC0) == 0xC0) { // multi-byte starting byte (11xxxxxx)
			for ($bc = 0; $c & 0x80; $c = $c << 1) {
				$bc++;
			} // calculate number of bytes
			$mbc = substr($str, $i, $bc);
			$i += $bc - 1;
		} else {
			// single-byte (0xxxxxxx), or a stray continuation byte in
			// malformed input: take the byte as it is, so that the
			// previously handled character is never emitted a second time
			$mbc = $str[$i];
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
2115
2116
2117 /********************************************
2118 *
2119 * Internal EUC string operation functions
2120 *
2121 * Extended Unix Code:
2122 * ASCII compatible 7bit single bytes chars
2123 * 8bit two byte chars
2124 *
2125 * Shift-JIS is treated as a special case.
2126 *
2127 ********************************************/
2128
/**
 * Cuts a string in the EUC charset family short at a given byte length.
 *
 * The string is walked character by character. If the last character
 * touched starts before byte $len but ends behind it, the cut is moved one
 * byte back so that no double-byte character is split.
 *
 * @param string $str EUC multibyte character string
 * @param integer $len the byte length
 * @param string $charset the charset ('shift_jis' is treated as a special case)
 * @return string the shortened string
 * @see mb_strcut()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strtrunc($str, $len, $charset) {
	$str = (string) $str;
	$length = strlen($str);

	// String not longer than the supplied length. This is decided from the
	// real byte length: the scan below may step over the end of the string
	// when its last double-byte character straddles $len, which must not be
	// mistaken for "string is short enough".
	if ($length === 0 || $length <= $len) {
		return $str;
	}

	$sjis = ($charset == 'shift_jis');
	for ($i = 0; $i < $len; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) {
				$i++; // advance a double-byte char
			}
		} elseif ($c >= 0x80) {
			$i++; // advance a double-byte char
		}
	}

	if ($i > $len) {
		return substr($str, 0, $len - 1); // we ended on a first byte
	}

	return substr($str, 0, $len);
}
2164
/**
 * Returns a part of a string in the EUC charset family.
 *
 * Character positions are translated into byte positions with
 * euc_char2byte_pos() and the actual cut is done with substr().
 *
 * @param string $str EUC multibyte character string
 * @param integer $start start position (character position)
 * @param string $charset the charset
 * @param integer $len length (in characters); NULL means "up to the end of the string"
 * @return string the substring, or FALSE if $start lies outside the string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_substr($str, $start, $charset, $len = NULL) {
	// An explicit zero length (0 or '0') yields an empty string, as in
	// substr() and utf8_substr(). Without this test a zero length would be
	// indistinguishable from "no length given" in the loose check below.
	if ($len !== NULL && !strcmp((string) $len, '0')) {
		return '';
	}

	$byte_start = $this->euc_char2byte_pos($str, $start, $charset);
	if ($byte_start === FALSE) {
		return FALSE; // $start outside string length
	}

	$str = substr($str, $byte_start);

	// Loose comparison on purpose: NULL, '' and FALSE all mean "no length given".
	if ($len == NULL) {
		return $str;
	}

	$byte_end = $this->euc_char2byte_pos($str, $len, $charset);
	if ($byte_end === FALSE) {
		return $str; // $len outside actual string length
	}

	return substr($str, 0, $byte_end);
}
2198
/**
 * Counts the number of characters of a string in the EUC charset family.
 *
 * A byte >= 0x80 is taken as the first byte of a double-byte character; for
 * 'shift_jis' only the ranges 0x80-0x9F and 0xE0-0xFF are treated as first
 * bytes.
 *
 * @param string $str EUC multibyte character string
 * @param string $charset the charset
 * @return integer the number of characters
 * @see strlen()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strlen($str, $charset) {
	$str = (string) $str;
	$sjis = ($charset == 'shift_jis');
	$n = 0;
	// Bounded loop with [] offsets: no reads past the end of the string and
	// no {} offsets (not accepted by PHP 8).
	$length = strlen($str);
	for ($i = 0; $i < $length; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) {
				$i++; // advance a double-byte char
			}
		} elseif ($c >= 0x80) {
			$i++; // advance a double-byte char
		}

		$n++;
	}

	return $n;
}
2230 /**
2231 * Translates a character position into an 'absolute' byte position.
2232 *
2233 * @param string EUC multibyte character string
2234 * @param integer character position (negative values start from the end)
2235 * @param string the charset
2236 * @return integer byte position
2237 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
2238 */
2239 function euc_char2byte_pos($str, $pos, $charset) {
2240 $sjis = ($charset == 'shift_jis');
2241 $n = 0; // number of characters seen
2242 $p = abs($pos); // number of characters wanted
2243
2244 if ($pos >= 0) {
2245 $i = 0;
2246 $d = 1;
2247 } else {
2248 $i = strlen($str) - 1;
2249 $d = -1;
2250 }
2251
2252 for (; strlen($str{$i}) && $n < $p; $i += $d) {
2253 $c = ord($str{$i});
2254 if ($sjis) {
2255 if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) {
2256 $i += $d;
2257 } // advance a double-byte char
2258 }
2259 else {
2260 if ($c >= 0x80) {
2261 $i += $d;
2262 } // advance a double-byte char
2263 }
2264
2265 $n++;
2266 }
2267 if (!strlen($str{$i})) {
2268 return FALSE;
2269 } // offset beyond string length
2270
2271 if ($pos < 0) {
2272 $i++;
2273 } // correct offset
2274
2275 return $i;
2276 }
2277
2278 /**
2279 * Maps all characters of a string in the EUC charset family.
2280 *
2281 * @param string EUC multibyte character string
2282 * @param string the charset
2283 * @param string mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
2284 * @param string 'case': conversion 'toLower' or 'toUpper'
2285 * @return string the converted string
2286 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
2287 */
2288 function euc_char_mapping($str, $charset, $mode, $opt = '') {
2289 switch ($mode) {
2290 case 'case':
2291 if (!$this->initCaseFolding($charset)) {
2292 return $str;
2293 } // do nothing
2294 $map =& $this->caseFolding[$charset][$opt];
2295 break;
2296
2297 case 'ascii':
2298 if (!$this->initToASCII($charset)) {
2299 return $str;
2300 } // do nothing
2301 $map =& $this->toASCII[$charset];
2302 break;
2303
2304 default:
2305 return $str;
2306 }
2307
2308 $sjis = ($charset == 'shift_jis');
2309 $out = '';
2310 for ($i = 0; strlen($str{$i}); $i++) {
2311 $mbc = $str{$i};
2312 $c = ord($mbc);
2313
2314 if ($sjis) {
2315 if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) { // a double-byte char
2316 $mbc = substr($str, $i, 2);
2317 $i++;
2318 }
2319 }
2320 else {
2321 if ($c >= 0x80) { // a double-byte char
2322 $mbc = substr($str, $i, 2);
2323 $i++;
2324 }
2325 }
2326
2327 if (isset($map[$mbc])) {
2328 $out .= $map[$mbc];
2329 } else {
2330 $out .= $mbc;
2331 }
2332 }
2333
2334 return $out;
2335 }
2336
2337 }
2338
// Extension hook: when running inside TYPO3 and an XCLASS file is registered
// for this script in $TYPO3_CONF_VARS, include that file (once).
if (defined('TYPO3_MODE')) {
	if ($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']) {
		include_once $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php'];
	}
}
2342
2343 ?>