Fixed bug #15092: Ajax loaded items of inline records are encoded twice (Thanks to...
[Packages/TYPO3.CMS.git] / t3lib / class.t3lib_cs.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2003-2010 Kasper Skaarhoj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the Typo3 project. The Typo3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 *
17 * This script is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * This copyright notice MUST APPEAR in all copies of the script!
23 ***************************************************************/
24 /**
25 * Class for conversion between charsets.
26 *
27 * $Id$
28 *
29 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
30 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
31 */
32 /**
33 * [CLASS/FUNCTION INDEX of SCRIPT]
34 *
35 *
36 *
37 * 136: class t3lib_cs
38 * 488: function parse_charset($charset)
39 * 507: function get_locale_charset($locale)
40 *
41 * SECTION: Charset Conversion functions
42 * 560: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
43 * 600: function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0)
44 * 617: function utf8_encode($str,$charset)
45 * 663: function utf8_decode($str,$charset,$useEntityForNoChar=0)
46 * 706: function utf8_to_entities($str)
47 * 739: function entities_to_utf8($str,$alsoStdHtmlEnt=0)
48 * 773: function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
49 * 823: function UnumberToChar($cbyte)
50 * 868: function utf8CharToUnumber($str,$hex=0)
51 *
52 * SECTION: Init functions
53 * 911: function initCharset($charset)
54 * 973: function initUnicodeData($mode=null)
55 * 1198: function initCaseFolding($charset)
56 * 1260: function initToASCII($charset)
57 *
58 * SECTION: String operation functions
59 * 1331: function substr($charset,$string,$start,$len=null)
60 * 1384: function strlen($charset,$string)
61 * 1414: function crop($charset,$string,$len,$crop='')
62 * 1467: function strtrunc($charset,$string,$len)
63 * 1501: function conv_case($charset,$string,$case)
64 * 1527: function specCharsToASCII($charset,$string)
65 *
66 * SECTION: Internal string operation functions
67 * 1567: function sb_char_mapping($str,$charset,$mode,$opt='')
68 *
69 * SECTION: Internal UTF-8 string operation functions
70 * 1622: function utf8_substr($str,$start,$len=null)
71 * 1655: function utf8_strlen($str)
72 * 1676: function utf8_strtrunc($str,$len)
73 * 1698: function utf8_strpos($haystack,$needle,$offset=0)
74 * 1723: function utf8_strrpos($haystack,$needle)
75 * 1745: function utf8_char2byte_pos($str,$pos)
76 * 1786: function utf8_byte2char_pos($str,$pos)
77 * 1809: function utf8_char_mapping($str,$mode,$opt='')
78 *
79 * SECTION: Internal EUC string operation functions
80 * 1885: function euc_strtrunc($str,$len,$charset)
81 * 1914: function euc_substr($str,$start,$charset,$len=null)
82 * 1939: function euc_strlen($str,$charset)
83 * 1966: function euc_char2byte_pos($str,$pos,$charset)
84 * 2007: function euc_char_mapping($str,$charset,$mode,$opt='')
85 *
86 * TOTAL FUNCTIONS: 35
87 * (This index is automatically created/updated by the extension "extdeveval")
88 *
89 */
90
91
92
93
94
95
96
97
98 /**
99 * Notes on UTF-8
100 *
101 * Functions working on UTF-8 strings:
102 *
103 * - strchr/strstr
104 * - strrchr
105 * - substr_count
106 * - implode/explode/join
107 *
108 * Functions nearly working on UTF-8 strings:
109 *
110 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
111 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
112 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
113 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
114 * - preg_*: PCRE support is compiled into PHP by default nowadays, but it could be unavailable; patterns need the "u" modifier to treat subject strings as UTF-8
115 *
116 * Functions NOT working on UTF-8 strings:
117 *
118 * - str*cmp
119 * - stristr
120 * - stripos
121 * - substr
122 * - strrev
123 * - split/spliti
124 * - ...
125 *
126 */
127 /**
128 * Class for conversion between charsets
129 *
130 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
131 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
132 * @package TYPO3
133 * @subpackage t3lib
134 */
135 class t3lib_cs {
136 var $noCharByteVal=63; // ASCII Value for chars with no equivalent.
137
138 // This is the array where parsed conversion tables are stored (cached)
139 var $parsedCharsets=array();
140
141 // An array where case folding data will be stored (cached)
142 var $caseFolding=array();
143
144 // An array where charset-to-ASCII mappings are stored (cached)
145 var $toASCII=array();
146
147 // This tells the converter which charsets has two bytes per char:
148 var $twoByteSets=array(
149 'ucs-2'=>1, // 2-byte Unicode
150 );
151
152 // This tells the converter which charsets has four bytes per char:
153 var $fourByteSets=array(
154 'ucs-4'=>1, // 4-byte Unicode
155 'utf-32'=>1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
156 );
157
158 // This tells the converter which charsets use a scheme like the Extended Unix Code:
159 var $eucBasedSets=array(
160 'gb2312'=>1, // Chinese, simplified.
161 'big5'=>1, // Chinese, traditional.
162 'euc-kr'=>1, // Korean
163 'shift_jis'=>1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
164 );
165
// Alias => canonical charset name. Used by parse_charset() to normalize a
// (lowercased) charset label to the name used by the rest of this class.
// see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
// http://czyborra.com/charsets/iso8859.html
var $synonyms=array(
	'us' => 'ascii',
	'us-ascii'=> 'ascii',
	'cp819' => 'iso-8859-1',
	'ibm819' => 'iso-8859-1',
	'iso-ir-100' => 'iso-8859-1',
	'iso-ir-101' => 'iso-8859-2',
	'iso-ir-109' => 'iso-8859-3',
	'iso-ir-110' => 'iso-8859-4',
	'iso-ir-144' => 'iso-8859-5',
	'iso-ir-127' => 'iso-8859-6',
	'iso-ir-126' => 'iso-8859-7',
	'iso-ir-138' => 'iso-8859-8',
	'iso-ir-148' => 'iso-8859-9',
	'iso-ir-157' => 'iso-8859-10',
	'iso-ir-179' => 'iso-8859-13',
	'iso-ir-199' => 'iso-8859-14',
	'iso-ir-203' => 'iso-8859-15',
	'csisolatin1' => 'iso-8859-1',
	'csisolatin2' => 'iso-8859-2',
	'csisolatin3' => 'iso-8859-3',
	'csisolatin5' => 'iso-8859-9',
	'csisolatin8' => 'iso-8859-14',
	'csisolatin9' => 'iso-8859-15',
	'csisolatingreek' => 'iso-8859-7',
	'iso-celtic' => 'iso-8859-14',
	'latin1' => 'iso-8859-1',
	'latin2' => 'iso-8859-2',
	'latin3' => 'iso-8859-3',
	'latin5' => 'iso-8859-9',
	'latin6' => 'iso-8859-10',
	'latin8' => 'iso-8859-14',
	'latin9' => 'iso-8859-15',
	'l1' => 'iso-8859-1',
	'l2' => 'iso-8859-2',
	'l3' => 'iso-8859-3',
	'l5' => 'iso-8859-9',
	'l6' => 'iso-8859-10',
	'l8' => 'iso-8859-14',
	'l9' => 'iso-8859-15',
	'cyrillic' => 'iso-8859-5',
	'arabic' => 'iso-8859-6',
	'tis-620' => 'iso-8859-11',
	'win874' => 'windows-874',
	'win1250' => 'windows-1250',
	'win1251' => 'windows-1251',
	'win1252' => 'windows-1252',
	'win1253' => 'windows-1253',
	'win1254' => 'windows-1254',
	'win1255' => 'windows-1255',
	'win1256' => 'windows-1256',
	'win1257' => 'windows-1257',
	'win1258' => 'windows-1258',
	'cp1250' => 'windows-1250',
	'cp1251' => 'windows-1251',
	'cp1252' => 'windows-1252',
	'ms-ee' => 'windows-1250',
	'ms-ansi' => 'windows-1252',
	'ms-greek' => 'windows-1253',
	'ms-turk' => 'windows-1254',
	'winbaltrim' => 'windows-1257',
	'koi-8ru' => 'koi-8r',
	'koi8r' => 'koi-8r',
	'cp878' => 'koi-8r',
	'mac' => 'macroman',
	'macintosh' => 'macroman',
	'euc-cn' => 'gb2312',
	'x-euc-cn' => 'gb2312',
	'euccn' => 'gb2312',
	'cp936' => 'gb2312',
	'big-5' => 'big5',
	'cp950' => 'big5',
	'eucjp' => 'euc-jp',
	'sjis' => 'shift_jis',
	'shift-jis' => 'shift_jis',
	'cp932' => 'shift_jis',
	'cp949' => 'euc-kr',
	'utf7' => 'utf-7',
	'utf8' => 'utf-8',	// (was listed twice; the duplicate entry has been dropped)
	'utf16' => 'utf-16',
	'utf32' => 'utf-32',
	'ucs2' => 'ucs-2',
	'ucs4' => 'ucs-4',
);
253
// Mapping of language identifiers (lowercase) to script/language-family names.
// The resulting names are the keys of $script_to_charset_unix / $script_to_charset_windows.
// Three groups of keys are supported: two-letter language codes, three-letter
// MS language codes and English language names.
var $lang_to_script=array(
		// iso-639-1 language codes, see http://www.loc.gov/standards/iso639-2/php/code_list.php
	'ar' => 'arabic',
	'bg' => 'cyrillic', // Bulgarian
	'bs' => 'east_european', // Bosnian
	'cs' => 'east_european', // Czech
	'da' => 'west_european', // Danish
	'de' => 'west_european', // German
	'el' => 'greek', // Greek (ISO 639-1 code; 'gr' below is kept for compatibility)
	'es' => 'west_european', // Spanish
	'et' => 'estonian',
	'eo' => 'unicode', // Esperanto
	'eu' => 'west_european', // Basque
	'fa' => 'arabic', // Persian
	'fi' => 'west_european', // Finnish
	'fo' => 'west_european', // Faroese
	'fr' => 'west_european', // French
	'ga' => 'west_european', // Galician
	'ge' => 'unicode', // Georgian
	'gr' => 'greek',
	'he' => 'hebrew', // Hebrew (since 1998)
	'hi' => 'unicode', // Hindi
	'hr' => 'east_european', // Croatian
	'hu' => 'east_european', // Hungarian
	'iw' => 'hebrew', // Hebrew (til 1998)
	'is' => 'west_european', // Icelandic
	'it' => 'west_european', // Italian
	'ja' => 'japanese',
	'kl' => 'west_european', // Greenlandic
	'ko' => 'korean',
	'lt' => 'lithuanian',
	'lv' => 'west_european', // Latvian/Lettish
	'nl' => 'west_european', // Dutch
	'no' => 'west_european', // Norwegian
	'nb' => 'west_european', // Norwegian Bokmal
	'nn' => 'west_european', // Norwegian Nynorsk
	'pl' => 'east_european', // Polish
	'pt' => 'west_european', // Portuguese
	'ro' => 'east_european', // Romanian
	'ru' => 'cyrillic', // Russian
	'sk' => 'east_european', // Slovak
	'sl' => 'east_european', // Slovenian
	'sr' => 'cyrillic', // Serbian
	'sv' => 'west_european', // Swedish
	'sq' => 'albanian', // Albanian
	'th' => 'thai',
	'tr' => 'turkish', // Turkish
	'uk' => 'cyrillic', // Ukrainian
	'vi' => 'vietnamese',
	'zh' => 'chinese',
		// MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
		// http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
	'ara' => 'arabic',
	'bgr' => 'cyrillic', // Bulgarian
	'cat' => 'west_european', // Catalan
	'chs' => 'simpl_chinese',
	'cht' => 'trad_chinese',
	'csy' => 'east_european', // Czech
	'dan' => 'west_european', // Danish
	'deu' => 'west_european', // German
	'dea' => 'west_european', // German (Austrian)
	'des' => 'west_european', // German (Swiss)
	'ena' => 'west_european', // English (Australian)
	'enc' => 'west_european', // English (Canadian)
	'eng' => 'west_european', // English
	'enz' => 'west_european', // English (New Zealand)
	'enu' => 'west_european', // English (United States)
	'euq' => 'west_european', // Basque
	'fos' => 'west_european', // Faroese
	'far' => 'arabic', // Persian
	'fin' => 'west_european', // Finnish
	'fra' => 'west_european', // French
	'frb' => 'west_european', // French (Belgian)
	'frc' => 'west_european', // French (Canadian)
	'frs' => 'west_european', // French (Swiss)
	'geo' => 'unicode', // Georgian
	'glg' => 'west_european', // Galician
	'ell' => 'greek',
	'heb' => 'hebrew',
	'hin' => 'unicode', // Hindi
	'hun' => 'east_european', // Hungarian
	'isl' => 'west_european', // Icelandic (value was misspelt 'west_euorpean', a script that exists in no charset table)
	'ita' => 'west_european', // Italian
	'its' => 'west_european', // Italian (Swiss)
	'jpn' => 'japanese',
	'kor' => 'korean',
	'lth' => 'lithuanian',
	'lvi' => 'west_european', // Latvian/Lettish
	'msl' => 'west_european', // Malay
	'nlb' => 'west_european', // Dutch (Belgian)
	'nld' => 'west_european', // Dutch
	'nor' => 'west_european', // Norwegian (bokmal)
	'non' => 'west_european', // Norwegian (nynorsk)
	'plk' => 'east_european', // Polish
	'ptg' => 'west_european', // Portuguese
	'ptb' => 'west_european', // Portuguese (Brazil)
	'rom' => 'east_european', // Romanian
	'rus' => 'cyrillic', // Russian
	'slv' => 'east_european', // Slovenian
	'sky' => 'east_european', // Slovak
	'srl' => 'east_european', // Serbian (Latin)
	'srb' => 'cyrillic', // Serbian (Cyrillic)
	'esp' => 'west_european', // Spanish (trad. sort)
	'esm' => 'west_european', // Spanish (Mexican)
	'esn' => 'west_european', // Spanish (internat. sort)
	'sve' => 'west_european', // Swedish
	'sqi' => 'albanian', // Albanian
	'tha' => 'thai',
	'trk' => 'turkish',
	'ukr' => 'cyrillic', // Ukrainian
		// English language names
	'albanian' => 'albanian',
	'arabic' => 'arabic',
	'basque' => 'west_european',
	'bosnian' => 'east_european',
	'bulgarian' => 'cyrillic', // consistent with 'bg' and 'bgr' above
	'catalan' => 'west_european',
	'croatian' => 'east_european',
	'czech' => 'east_european',
	'danish' => 'west_european',
	'dutch' => 'west_european',
	'english' => 'west_european',
	'esperanto' => 'unicode',
	'estonian' => 'estonian',
	'faroese' => 'west_european',
	'farsi' => 'arabic',
	'finnish' => 'west_european',
	'french' => 'west_european',
	'galician' => 'west_european',
	'georgian' => 'unicode',
	'german' => 'west_european',
	'greek' => 'greek',
	'greenlandic' => 'west_european',
	'hebrew' => 'hebrew',
	'hindi' => 'unicode',
	'hungarian' => 'east_european',
	'icelandic' => 'west_european',
	'italian' => 'west_european',
	'latvian' => 'west_european',
	'lettish' => 'west_european',
	'lithuanian' => 'lithuanian',
	'malay' => 'west_european',
	'norwegian' => 'west_european',
	'persian' => 'arabic',
	'polish' => 'east_european',
	'portuguese' => 'west_european',
	'russian' => 'cyrillic',
	'romanian' => 'east_european',
	'serbian' => 'cyrillic',
	'slovak' => 'east_european',
	'slovenian' => 'east_european',
	'spanish' => 'west_european',
	'swedish' => 'west_european',
	'svedish' => 'west_european', // legacy misspelling of 'swedish', kept for backwards compatibility
	'thai' => 'thai',
	'that' => 'thai', // legacy misspelling of 'thai', kept for backwards compatibility
	'turkish' => 'turkish',
	'ukrainian' => 'cyrillic',
);
410
411 // mapping of language (family) names to charsets on Unix
412 var $script_to_charset_unix=array(
413 'west_european' => 'iso-8859-1',
414 'estonian' => 'iso-8859-1',
415 'east_european' => 'iso-8859-2',
416 'baltic' => 'iso-8859-4',
417 'cyrillic' => 'iso-8859-5',
418 'arabic' => 'iso-8859-6',
419 'greek' => 'iso-8859-7',
420 'hebrew' => 'iso-8859-8',
421 'turkish' => 'iso-8859-9',
422 'thai' => 'iso-8859-11', // = TIS-620
423 'lithuanian' => 'iso-8859-13',
424 'chinese' => 'gb2312', // = euc-cn
425 'japanese' => 'euc-jp',
426 'korean' => 'euc-kr',
427 'simpl_chinese' => 'gb2312',
428 'trad_chinese' => 'big5',
429 'vietnamese' => '',
430 'unicode' => 'utf-8',
431 'albanian' => 'utf-8'
432 );
433
434 // mapping of language (family) names to charsets on Windows
435 var $script_to_charset_windows=array(
436 'east_european' => 'windows-1250',
437 'cyrillic' => 'windows-1251',
438 'west_european' => 'windows-1252',
439 'greek' => 'windows-1253',
440 'turkish' => 'windows-1254',
441 'hebrew' => 'windows-1255',
442 'arabic' => 'windows-1256',
443 'baltic' => 'windows-1257',
444 'estonian' => 'windows-1257',
445 'lithuanian' => 'windows-1257',
446 'vietnamese' => 'windows-1258',
447 'thai' => 'cp874',
448 'korean' => 'cp949',
449 'chinese' => 'gb2312',
450 'japanese' => 'shift_jis',
451 'simpl_chinese' => 'gb2312',
452 'trad_chinese' => 'big5',
453 'albanian' => 'windows-1250',
454 'unicode' => 'utf-8'
455 );
456
// Mapping of complete locale names to charsets.
// Keys MUST be lowercase: get_locale_charset() lowercases the locale string
// before looking it up here, so a key containing uppercase letters can never match.
var $locale_to_charset=array(
	'japanese.euc' => 'euc-jp',
	'ja_jp.ujis' => 'euc-jp',
	'korean.euc' => 'euc-kr',
	'sr@latn' => 'iso-8859-2',	// was 'sr@Latn', which was unreachable after lowercasing
	'zh_cn' => 'gb2312',
	'zh_hk' => 'big5',
	'zh_tw' => 'big5',
);
467
468 // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
469 // Empty values means "iso-8859-1"
470 var $charSetArray = array(
471 'dk' => '',
472 'de' => '',
473 'no' => '',
474 'it' => '',
475 'fr' => '',
476 'es' => '',
477 'nl' => '',
478 'cz' => 'windows-1250',
479 'pl' => 'iso-8859-2',
480 'si' => 'windows-1250',
481 'fi' => '',
482 'tr' => 'iso-8859-9',
483 'se' => '',
484 'pt' => '',
485 'ru' => 'windows-1251',
486 'ro' => 'iso-8859-2',
487 'ch' => 'gb2312',
488 'sk' => 'windows-1250',
489 'lt' => 'windows-1257',
490 'is' => 'utf-8',
491 'hr' => 'windows-1250',
492 'hu' => 'iso-8859-2',
493 'gl' => '',
494 'th' => 'iso-8859-11',
495 'gr' => 'iso-8859-7',
496 'hk' => 'big5',
497 'eu' => '',
498 'bg' => 'windows-1251',
499 'br' => '',
500 'et' => 'iso-8859-4',
501 'ar' => 'iso-8859-6',
502 'he' => 'utf-8',
503 'ua' => 'windows-1251',
504 'jp' => 'shift_jis',
505 'lv' => 'utf-8',
506 'vn' => 'utf-8',
507 'ca' => 'iso-8859-15',
508 'ba' => 'iso-8859-2',
509 'kr' => 'euc-kr',
510 'eo' => 'utf-8',
511 'my' => '',
512 'hi' => 'utf-8',
513 'fo' => 'utf-8',
514 'fa' => 'utf-8',
515 'sr' => 'utf-8',
516 'sq' => 'utf-8',
517 'ge' => 'utf-8',
518 'ga' => '',
519 );
520
521 // TYPO3 specific: Array with the iso names used for each system language in TYPO3:
522 // Missing keys means: same as Typo3
523 var $isoArray = array(
524 'ba' => 'bs',
525 'br' => 'pt_BR',
526 'ch' => 'zh_CN',
527 'cz' => 'cs',
528 'dk' => 'da',
529 'si' => 'sl',
530 'se' => 'sv',
531 'gl' => 'kl',
532 'gr' => 'el',
533 'hk' => 'zh_HK',
534 'kr' => 'ko',
535 'ua' => 'uk',
536 'jp' => 'ja',
537 'vn' => 'vi',
538 );
539
/**
 * Normalizes a charset name: the name is lowercased, surrounding whitespace
 * is removed and a known alias (see $this->synonyms) is replaced by its
 * canonical name.
 *
 * @param	string		Input charset
 * @return	string		Normalized charset
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function parse_charset($charset) {
	$normalized = trim(strtolower($charset));

	return isset($this->synonyms[$normalized])
		? $this->synonyms[$normalized]
		: $normalized;
}
553
/**
 * Get the charset of a locale.
 *
 * ln			language
 * ln_CN			language / country
 * ln_CN.cs		language / country / charset
 * ln_CN.cs@mod	language / country / charset / modifier
 *
 * Resolution order:
 *  1. exact match of the (lowercased) locale in $this->locale_to_charset
 *  2. charset given explicitly in the locale string
 *  3. the "euro" modifier (=> iso-8859-15)
 *  4. language => script => charset tables; which script table is used depends on TYPO3_OS
 *  5. default: 'windows-1252' if TYPO3_OS is 'WIN', otherwise 'iso-8859-1'
 *
 * @param	string		Locale string
 * @return	string		Charset resolved for locale string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function get_locale_charset($locale) {
	$locale = strtolower($locale);

		// exact locale specific charset?
		// The lookup is done on lowercased keys, because $locale has just been lowercased
		// and a table key with uppercase letters would otherwise never match.
	$exactMatches = array_change_key_case($this->locale_to_charset, CASE_LOWER);
	if (isset($exactMatches[$locale])) {
		return $exactMatches[$locale];
	}

		// get modifier (the part is optional, so do not rely on list() with a missing offset)
	$parts = explode('@', $locale);
	$locale = $parts[0];
	$modifier = isset($parts[1]) ? $parts[1] : '';

		// locale contains charset: use it
	$parts = explode('.', $locale);
	$locale = $parts[0];
	$charset = isset($parts[1]) ? $parts[1] : '';
	if ($charset) {
		return $this->parse_charset($charset);
	}

		// modifier is 'euro' (after charset check, because of xx.utf-8@euro)
	if ($modifier == 'euro') {
		return 'iso-8859-15';
	}

		// get language (the country part is not needed)
	$parts = explode('_', $locale);
	$language = $parts[0];

		// unknown languages get an empty script name, which selects the default charset below
	$script = isset($this->lang_to_script[$language]) ? $this->lang_to_script[$language] : '';

	if (TYPO3_OS == 'WIN') {
		$cs = !empty($this->script_to_charset_windows[$script]) ? $this->script_to_charset_windows[$script] : 'windows-1252';
	} else {
		$cs = !empty($this->script_to_charset_unix[$script]) ? $this->script_to_charset_unix[$script] : 'iso-8859-1';
	}

	return $cs;
}
594
595
596
597
598
599
600
601
602
603 /********************************************
604 *
605 * Charset Conversion functions
606 *
607 ********************************************/
608
/**
 * Converts a string from one charset to another.
 *
 * If the configured conversion method ($TYPO3_CONF_VARS['SYS']['t3lib_cs_convMethod'])
 * is 'mbstring', 'iconv' or 'recode' and it can be used for this conversion, the
 * matching PHP function does the work. Otherwise (or if that function returns FALSE)
 * the string is converted by this class, using UTF-8 as the intermediate charset.
 *
 * @param	string		Input string
 * @param	string		From charset (the current charset of the string)
 * @param	string		To charset (the output charset wanted)
 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return	string		Converted string
 * @see convArray()
 */
function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) {
		// Nothing to do if source and target charset are the same
	if ($fromCS==$toCS) {
		return $str;
	}

		// The PHP libraries cannot fall back to SGML entities, so they are only used
		// if no entity fallback is requested or if the target is UTF-8 (which handles everything)
	if ($toCS=='utf-8' || !$useEntityForNoChar) {
		$method = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'];
		$converted = false;

		if ($method == 'mbstring') {
			$converted = mb_convert_encoding($str,$toCS,$fromCS);
		} elseif ($method == 'iconv') {
			$converted = iconv($fromCS,$toCS.'//TRANSLIT',$str);
		} elseif ($method == 'recode') {
			$converted = recode_string($fromCS.'..'.$toCS,$str);
		}

			// FALSE means: no library used or the library failed => TYPO3 conversion below
		if (false !== $converted) {
			return $converted;
		}
	}

		// TYPO3 conversion: source charset => UTF-8 => target charset
	if ($fromCS!='utf-8') {
		$str = $this->utf8_encode($str,$fromCS);
	}
	if ($toCS!='utf-8') {
		$str = $this->utf8_decode($str,$toCS,$useEntityForNoChar);
	}

	return $str;
}
647
/**
 * Converts every string found in an array from one charset to another.
 * Nested arrays are processed recursively; values of any other type are left untouched.
 * NOTICE: The array is passed by reference and modified in place!
 *
 * @param	string		Input array, possibly multidimensional
 * @param	string		From charset (the current charset of the string)
 * @param	string		To charset (the output charset wanted)
 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return	void
 * @see conv()
 */
function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0) {
	foreach ($array as $key => $item) {
		if (is_array($item)) {
				// descend by reference, so the nested array itself is modified
			$this->convArray($array[$key],$fromCS,$toCS,$useEntityForNoChar);
			continue;
		}
		if (is_string($item)) {
			$array[$key] = $this->conv($item,$fromCS,$toCS,$useEntityForNoChar);
		}
	}
}
668
/**
 * Converts $str from $charset to UTF-8
 *
 * Characters without an entry in the conversion table are replaced by the
 * character with the byte value $this->noCharByteVal.
 * If the conversion table for $charset cannot be initialized, nothing is
 * returned (NULL).
 *
 * @param	string		String in local charset to convert to UTF-8
 * @param	string		Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return	string		Output string, converted to UTF-8
 */
function utf8_encode($str,$charset) {

	if ($charset === 'utf-8') return $str;

		// Charset is case-insensitive.
	if ($this->initCharset($charset)) {	// Parse conv. table if not already...
		$strLen = strlen($str);
		$outStr='';

		for ($a=0;$a<$strLen;$a++) {	// Traverse each char in string.
			$chr=substr($str,$a,1);
			$ord=ord($chr);
			if (isset($this->twoByteSets[$charset])) {	// If the charset has two bytes per char
					// substr() instead of the curly brace offset $str{...}: that syntax is a parse error in PHP 8
					// and it raised a notice for a missing second byte. A missing byte counts as 0, as before.
				$ord2 = ord(substr($str,$a+1,1));
				$ord = $ord<<8 | $ord2;	// assume big endian

				if (isset($this->parsedCharsets[$charset]['local'][$ord])) {	// If the local char-number was found in parsed conv. table then we use that, otherwise the "no char" replacement
					$outStr.=$this->parsedCharsets[$charset]['local'][$ord];
				} else $outStr.=chr($this->noCharByteVal);	// No char exists
				$a++;
			} elseif ($ord>127) {	// If char has value over 127 it is not plain ASCII and must be looked up in the conversion table
				if (isset($this->eucBasedSets[$charset])) {	// EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
					if ($charset != 'shift_jis' || ($ord < 0xA0 || $ord > 0xDF)) {	// Shift-JIS: chars between 160 and 223 are single byte
						$a++;
						$ord2=ord(substr($str,$a,1));
						$ord = $ord*256+$ord2;
					}
				}

				if (isset($this->parsedCharsets[$charset]['local'][$ord])) {	// If the local char-number was found in parsed conv. table then we use that, otherwise the "no char" replacement
					$outStr.= $this->parsedCharsets[$charset]['local'][$ord];
				} else $outStr.= chr($this->noCharByteVal);	// No char exists
			} else $outStr.= $chr;	// ... otherwise it's just ASCII 0-127 and one byte. Transparent
		}
		return $outStr;
	}
}
713
/**
 * Converts $str from UTF-8 to $charset
 *
 * UTF-8 sequences without an entry in the conversion table are either written
 * as numeric entity (if $useEntityForNoChar is set) or replaced by the character
 * with the byte value $this->noCharByteVal.
 * If the conversion table for $charset cannot be initialized, nothing is
 * returned (NULL).
 *
 * @param	string		String in UTF-8 to convert to local charset
 * @param	string		Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return	string		Output string, converted to local charset
 */
function utf8_decode($str,$charset,$useEntityForNoChar=0) {
	if ($charset === 'utf-8') {
		return $str;
	}

		// Parse conv. table if not already done; give up if that is not possible
	if (!$this->initCharset($charset)) {
		return;
	}

	$replacement = chr($this->noCharByteVal);
	$length = strlen($str);
	$result = '';

	for ($pos=0; $pos<$length; $pos++) {	// Traverse the UTF-8 string byte by byte
		$byte = substr($str,$pos,1);
		$code = ord($byte);

			// ASCII 0-127 is one byte and passed through
		if ($code <= 127) {
			$result .= $byte;
			continue;
		}

			// A first byte must have the 7th bit set; otherwise we are in the MIDDLE of a sequence
		if (!($code & 64)) {
			$result .= $replacement;
			continue;
		}

			// Collect the whole sequence: every further leading 1-bit of the first byte announces one more byte
		$sequence = $byte;
		for ($n=0; $n<8; $n++) {
			$code = $code << 1;
			if (!($code & 128)) {
				break;
			}
			$pos++;
			$sequence .= substr($str,$pos,1);
		}

		if (isset($this->parsedCharsets[$charset]['utf8'][$sequence])) {
			$local = $this->parsedCharsets[$charset]['utf8'][$sequence];	// The local number
				// Numbers above 255 are written as two bytes (16bit word assumed)
			$result .= ($local > 255)
				? chr(($local >> 8) & 255).chr($local & 255)
				: chr($local);
		} elseif ($useEntityForNoChar) {
			$result .= '&#'.$this->utf8CharToUnumber($sequence,1).';';	// Create num entity
		} else {
			$result .= $replacement;	// No char exists
		}
	}

	return $result;
}
762
/**
 * Replaces every multibyte character of a UTF-8 string by a numeric entity
 * (in the form created by utf8CharToUnumber() with the hex flag set).
 * A byte > 127 which is no valid first byte is replaced by the character with
 * the byte value $this->noCharByteVal.
 *
 * @param	string		Input string
 * @return	string		Output string
 */
function utf8_to_entities($str) {
	$result = '';
	$length = strlen($str);
	$pos = 0;

	while ($pos < $length) {	// Traverse the UTF-8 string byte by byte
		$byte = substr($str,$pos,1);
		$code = ord($byte);

		if ($code <= 127) {
				// ASCII 0-127 is one byte and passed through
			$result .= $byte;
		} elseif (!($code & 64)) {
				// A first byte must have the 7th bit set; otherwise we are in the MIDDLE of a sequence
			$result .= chr($this->noCharByteVal);
		} else {
				// Collect the whole sequence: every further leading 1-bit of the first byte announces one more byte
			$sequence = $byte;
			for ($n=0; $n<8; $n++) {
				$code = $code << 1;
				if (!($code & 128)) {
					break;
				}
				$pos++;
				$sequence .= substr($str,$pos,1);
			}
			$result .= '&#'.$this->utf8CharToUnumber($sequence,1).';';
		}

		$pos++;
	}

	return $result;
}
794
/**
 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
 *
 * Entities which cannot be converted (unknown names, malformed numbers) are left
 * untouched in the output.
 *
 * @param	string		Input string, UTF-8
 * @param	boolean		If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
 * @return	string		Output string
 */
function entities_to_utf8($str,$alsoStdHtmlEnt=0) {
		// Every "&...;" candidate is wrapped in a unique token, so that explode() returns plain text
		// at the even indexes and the entity bodies (without "&" and ";") at the odd indexes.
	$token = md5(microtime());
	$parts = explode($token, preg_replace('/(&([#[:alnum:]]*);)/', $token . '${2}' . $token, $str));

	foreach ($parts as $k => $v) {
		if (!($k%2)) {
			continue;	// plain text between entities
		}

		$entity = '&'.$v.';';
		$parts[$k] = $entity;	// Default: no conversion

		if (substr($v,0,1)=='#') {	// Dec or hex entities:
				// Only well-formed numbers are converted; the "x" of hex entities may be upper or lower case.
			if (preg_match('/^#x([0-9a-f]+)$/i', $v, $match)) {
				$parts[$k] = $this->UnumberToChar(hexdec($match[1]));
			} elseif (preg_match('/^#([0-9]+)$/', $v, $match)) {
				$parts[$k] = $this->UnumberToChar($match[1]);
			}
		} elseif ($alsoStdHtmlEnt) {	// Other entities:
				// Decode straight to UTF-8. The former approach (get_html_translation_table() without charset,
				// then converting from iso-8859-1) double-encodes when PHP's default table charset is not iso-8859-1.
				// Unknown entities are returned unchanged by html_entity_decode().
			$parts[$k] = html_entity_decode($entity, ENT_COMPAT, 'UTF-8');
		}
	}

	return implode('',$parts);
}
827
/**
 * Splits a UTF-8 string into its characters and returns them in an array,
 * either as integer numbers or as the UTF-8 characters themselves.
 * A byte > 127 which is no valid first byte is returned as $this->noCharByteVal.
 *
 * @param	string		Input string, UTF-8
 * @param	boolean		If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
 * @param	boolean		If set, then instead of integer numbers the real UTF-8 char is returned.
 * @return	array		Output array with the char numbers
 */
function utf8_to_numberarray($str,$convEntities=0,$retChar=0) {
		// If entities must be registered as well, they are converted to UTF-8 first
	if ($convEntities) {
		$str = $this->entities_to_utf8($str,1);
	}

	$result = array();
	$length = strlen($str);
	$pos = 0;

	while ($pos < $length) {	// Traverse the UTF-8 string byte by byte
		$byte = substr($str,$pos,1);
		$code = ord($byte);

		if ($code <= 127) {
				// ASCII 0-127 is one byte
			$result[] = $retChar ? chr($code) : $code;
		} elseif (!($code & 64)) {
				// A first byte must have the 7th bit set; otherwise we are in the MIDDLE of a sequence
			$result[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
		} else {
				// Collect the whole sequence: every further leading 1-bit of the first byte announces one more byte
			$sequence = $byte;
			for ($n=0; $n<8; $n++) {
				$code = $code << 1;
				if (!($code & 128)) {
					break;
				}
				$pos++;
				$sequence .= substr($str,$pos,1);
			}
			$result[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
		}

		$pos++;
	}

	return $result;
}
866
/**
 * Converts a UNICODE number to a UTF-8 multibyte character
 * Algorithm based on script found at From: http://czyborra.com/utf/
 *
 * The bits of the number are spread across the bytes; the number of high bits
 * set in the lead byte announces the number of bytes in the sequence:
 *
 *  bytes | bits | representation
 *      1 |    7 | 0vvvvvvv
 *      2 |   11 | 110vvvvv 10vvvvvv
 *      3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
 *      4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * Numbers of 0x80000000 and above are returned as the character with the
 * byte value $this->noCharByteVal.
 *
 * @param	integer		UNICODE integer
 * @return	string		UTF-8 multibyte character string
 * @see utf8CharToUnumber()
 */
function UnumberToChar($cbyte) {
		// One byte, no lead/continuation scheme needed
	if ($cbyte < 0x80) {
		return chr($cbyte);
	}

		// Find the lead byte marker and the number of continuation bytes
	if ($cbyte < 0x800) {
		$leadMarker = 0xC0;
		$continuationBytes = 1;
	} elseif ($cbyte < 0x10000) {
		$leadMarker = 0xE0;
		$continuationBytes = 2;
	} elseif ($cbyte < 0x200000) {
		$leadMarker = 0xF0;
		$continuationBytes = 3;
	} elseif ($cbyte < 0x4000000) {
		$leadMarker = 0xF8;
		$continuationBytes = 4;
	} elseif ($cbyte < 0x80000000) {
		$leadMarker = 0xFC;
		$continuationBytes = 5;
	} else {	// Cannot express a 32-bit character in UTF-8
		return chr($this->noCharByteVal);
	}

		// Lead byte: marker plus the bits above all continuation bytes
	$char = chr($leadMarker | ($cbyte >> (6 * $continuationBytes)));

		// Continuation bytes: 6 bits each, highest bits first
	for ($shift = 6 * ($continuationBytes - 1); $shift >= 0; $shift -= 6) {
		$char .= chr(0x80 | (($cbyte >> $shift) & 0x3F));
	}

	return $char;
}
921
/**
 * Decodes a single UTF-8 multibyte character into its UNICODE number.
 *
 * Only the first character of the input is looked at. A first byte that does not
 * start a multibyte sequence is returned as its plain byte value.
 *
 * @param	string		UTF-8 multibyte character string
 * @param	boolean		If set, the number is returned as a hex string prefixed with "x"
 * @return	integer		UNICODE integer (or the "x..." hex string if $hex is set)
 * @see UnumberToChar()
 */
function utf8CharToUnumber($str,$hex=0) {
	$leadByte = ord(substr($str,0,1));
	$number = $leadByte;

	// Both top bits set: this is the lead byte of a multibyte sequence
	if (($leadByte & 192) == 192) {
		$shifted = $leadByte;
		$tailBits = '';
		$followers = 0;

		// Every further high bit of the lead byte announces one more byte in the sequence
		while ($followers < 8) {
			$shifted = $shifted << 1;
			if (!($shifted & 128)) {
				break;
			}
			// Keep the six value bits of the following byte
			$tailBits .= substr('00000000'.decbin(ord(substr($str,$followers+1,1))),-6);
			$followers++;
		}

		// The value bits of the lead byte are whatever remains after its length marker
		$leadBits = substr('00000000'.decbin($leadByte),-(6-$followers));
		$number = bindec($leadBits.$tailBits);
	}

	if ($hex) {
		return 'x'.dechex($number);
	}
	return $number;
}
949
950
951
952
953
954
955
956
957
958 /********************************************
959 *
960 * Init functions
961 *
962 ********************************************/
963
/**
 * Loads the conversion table of a charset into $this->parsedCharsets[$charset].
 * The table is read from PATH_t3lib.'csconvtbl/[charset].tbl' and holds two maps:
 * 'local' (byte value => UTF-8 character) and 'utf8' (UTF-8 character => byte value).
 * Only byte values above 127 are stored.
 * This function is automatically called by the conversion functions.
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param	string		The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return	integer		Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and loaded (from cache or by parsing).
 * @access private
 */
function initCharset($charset) {
	// Only process if the charset is not yet loaded:
	if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
		return 1;
	}

	// Conversion table filename:
	$charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';
	if (!($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile))) {
		return false;
	}

	// Cache file for charsets:
	// Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$cachedTable = unserialize(t3lib_div::getUrl($cacheFile));
		if (is_array($cachedTable)) {
			$this->parsedCharsets[$charset] = $cachedTable;
			return 2;
		}
		// An unreadable or damaged cache file is ignored; the table is parsed again (and the cache rewritten) below.
	}

	// Parse conversion table into lines:
	$lines = t3lib_div::trimExplode(LF,t3lib_div::getUrl($charsetConvTableFile),1);
	$table = array('local'=>array(),'utf8'=>array());

	// The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
	$whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
	$detectedType = '';

	foreach ($lines as $value) {
		// Comment lines or blanks are ignored.
		if (!trim($value) || substr($value,0,1)=='#') {
			continue;
		}

		// Detect type if not done yet: (Done on first real line)
		if (!$detectedType) {
			$detectedType = preg_match($whitespacedPattern,$value) ? 'whitespaced' : 'ms-token';
		}

		if ($detectedType=='ms-token') {
			$parts = preg_split('/[=:]/', $value, 3);
			if (count($parts) < 2) {
				continue;	// no "byte = U+xxxx" pair on this line
			}
			$hexbyte = $parts[0];
			$utf8 = $parts[1];
		} else {
			$regA = array();
			if (!preg_match($whitespacedPattern,$value,$regA)) {
				continue;	// line does not carry a "0x.. 0x...." pair
			}
			$hexbyte = $regA[1];
			$utf8 = 'U+'.$regA[2];
		}

		$decval = hexdec(trim($hexbyte));
		if ($decval>127) {
			// Strip the leading "U+" before reading the hex number
			$utf8decval = hexdec(substr(trim($utf8),2));
			$utf8Char = $this->UnumberToChar($utf8decval);
			$table['local'][$decval] = $utf8Char;
			$table['utf8'][$utf8Char] = $decval;
		}
	}

	$this->parsedCharsets[$charset] = $table;

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($table));
	}
	return 2;
}
1026
/**
 * This function initializes all UTF-8 character data tables:
 * $this->caseFolding['utf-8'] ('toUpper', 'toLower', 'toTitle' maps) and
 * $this->toASCII['utf-8'] (transliteration map). Both are built from
 * PATH_t3lib.'unidata/UnicodeData.txt' and, if present, SpecialCasing.txt and
 * Translit.txt in the same folder; only the BMP (up to U+FFFF) is processed.
 *
 * $mode only selects which table is checked for "already loaded" / "cached";
 * when neither shortcut applies, both tables are rebuilt and both cache files written.
 *
 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
 *
 * @param	string		Mode ("case", "ascii", ...)
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initUnicodeData($mode=null) {
	// cache files
	$cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
	$cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');

	// Only process if the tables are not yet loaded
	switch($mode) {
		case 'case':
			if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8'])) return 1;

			// Use cached version if possible (a damaged cache file is ignored and rebuilt below)
			if ($cacheFileCase && @is_file($cacheFileCase)) {
				$cached = unserialize(t3lib_div::getUrl($cacheFileCase));
				if (is_array($cached)) {
					$this->caseFolding['utf-8'] = $cached;
					return 2;
				}
			}
			break;

		case 'ascii':
			if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8'])) return 1;

			// Use cached version if possible (a damaged cache file is ignored and rebuilt below)
			if ($cacheFileASCII && @is_file($cacheFileASCII)) {
				$cached = unserialize(t3lib_div::getUrl($cacheFileASCII));
				if (is_array($cached)) {
					$this->toASCII['utf-8'] = $cached;
					return 2;
				}
			}
			break;
	}

	// process main Unicode data file
	$unicodeDataFile = PATH_t3lib.'unidata/UnicodeData.txt';
	if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) return false;

	$fh = fopen($unicodeDataFile,'rb');
	if (!$fh) return false;

	// key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
	// note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
	$this->caseFolding['utf-8'] = array();
	$utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand
	$utf8CaseFolding['toUpper'] = array();
	$utf8CaseFolding['toLower'] = array();
	$utf8CaseFolding['toTitle'] = array();

	$decomposition = array();	// array of temp. decompositions
	$mark = array();		// array of chars that are marks (eg. composing accents)
	$number = array();		// array of chars that are numbers (eg. digits)
	$omit = array();		// array of chars to be omitted (eg. Russian hard sign)

	// Read until fgets() reports the end of the file; blank lines carry no data
	while (($line = fgets($fh,4096)) !== false) {
		$line = rtrim($line);
		if ($line === '') continue;

		// has a lot of info; pad short rows so every column that is read exists
		$fields = array_pad(explode(';', $line), 15, '');
		$char = $fields[0];
		$name = $fields[1];
		$cat = $fields[2];
		$decomp = $fields[5];
		$num = $fields[8];
		$upper = $fields[12];
		$lower = $fields[13];
		$title = $fields[14];

		$ord = hexdec($char);
		if ($ord > 0xFFFF) break;	// only process the BMP

		$utf8_char = $this->UnumberToChar($ord);

		if ($upper) $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
		if ($lower) $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
			// store "title" only when different from "upper" (only a few)
		if ($title && $title != $upper) $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));

		// The first letter of the category column tells the major class
		switch (substr($cat,0,1)) {
			case 'M':	// mark (accent, umlaut, ...)
				$mark["U+$char"] = 1;
				break;

			case 'N':	// numeric value
				if ($ord > 0x80 && $num != '') $number["U+$char"] = $num;
				break;
		}

		// accented Latin letters without "official" decomposition
		$match = array();
		if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/',$name,$match) && !$decomp) {
			$c = ord($match[2]);
			if ($match[1] == 'SMALL') $c += 32;

			$decomposition["U+$char"] = array(dechex($c));
			continue;
		}

		$match = array();
		if (preg_match('/(<.*>)? *(.+)/',$decomp,$match)) {
			switch($match[1]) {
				case '<circle>':	// add parenthesis as circle replacement, eg (1)
					$match[2] = '0028 '.$match[2].' 0029';
					break;

				case '<square>':	// add square brackets as square replacement, eg [1]
					$match[2] = '005B '.$match[2].' 005D';
					break;

				case '<compat>':	// ignore multi char decompositions that start with a space
					if (preg_match('/^0020 /',$match[2])) continue 2;
					break;

					// ignore Arabic and vertical layout presentation decomposition
				case '<initial>':
				case '<medial>':
				case '<final>':
				case '<isolated>':
				case '<vertical>':
					continue 2;
			}
			$decomposition["U+$char"] = explode(' ', $match[2]);
		}
	}
	fclose($fh);

	// process additional Unicode data for casing (allow folded characters to expand into a sequence)
	$specialCasingFile = PATH_t3lib.'unidata/SpecialCasing.txt';
	if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
		$fh = fopen($specialCasingFile,'rb');
		if ($fh) {
			while (($line = fgets($fh,4096)) !== false) {
				if (substr($line,0,1) != '#' && trim($line) != '') {

					list($char,$lower,$title,$upper,$cond) = array_pad(t3lib_div::trimExplode(';', $line), 5, '');
					// Only unconditional mappings are used (the fifth column is empty or already the trailing comment)
					if ($cond == '' || $cond[0] == '#') {
						$utf8_char = $this->UnumberToChar(hexdec($char));
						if ($char != $lower) {
							$arr = explode(' ', $lower);
							for ($i=0; isset($arr[$i]); $i++) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
							$utf8CaseFolding['toLower'][$utf8_char] = implode('',$arr);
						}
						if ($char != $title && $title != $upper) {
							$arr = explode(' ', $title);
							for ($i=0; isset($arr[$i]); $i++) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
							$utf8CaseFolding['toTitle'][$utf8_char] = implode('',$arr);
						}
						if ($char != $upper) {
							$arr = explode(' ', $upper);
							for ($i=0; isset($arr[$i]); $i++) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
							$utf8CaseFolding['toUpper'][$utf8_char] = implode('',$arr);
						}
					}
				}
			}
			fclose($fh);
		}
	}

	// process custom decompositions
	$customTranslitFile = PATH_t3lib.'unidata/Translit.txt';
	if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
		$fh = fopen($customTranslitFile,'rb');
		if ($fh) {
			while (($line = fgets($fh,4096)) !== false) {
				if (substr($line,0,1) != '#' && trim($line) != '') {
					list($char,$translit) = array_pad(t3lib_div::trimExplode(';', $line), 2, '');
					// An empty transliteration means the character is dropped from the output
					if (!$translit) $omit["U+$char"] = 1;
					$decomposition["U+$char"] = explode(' ', $translit);
				}
			}
			fclose($fh);
		}
	}

	// decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
	foreach($decomposition as $from => $to) {
		$code_decomp = array();

		while ($code_value = array_shift($to)) {
			if (isset($decomposition["U+$code_value"])) {	// do recursive decomposition
				foreach(array_reverse($decomposition["U+$code_value"]) as $cv) {
					array_unshift($to, $cv);
				}
			} elseif (!isset($mark["U+$code_value"])) {	// remove mark
				array_push($code_decomp, $code_value);
			}
		}
		if (count($code_decomp) || isset($omit[$from])) {
			$decomposition[$from] = $code_decomp;
		} else {
			unset($decomposition[$from]);
		}
	}

	// create ascii only mapping
	$this->toASCII['utf-8'] = array();
	$ascii =& $this->toASCII['utf-8'];

	foreach($decomposition as $from => $to) {
		$code_decomp = array();
		while ($code_value = array_shift($to)) {
			$ord = hexdec($code_value);
			if ($ord > 127)
				continue 2;	// skip decompositions containing non-ASCII chars
			else
				array_push($code_decomp,chr($ord));
		}
		// Keys look like "U+00C0": strip the prefix before reading the hex number
		$ascii[$this->UnumberToChar(hexdec(substr($from,2)))] = join('',$code_decomp);
	}

	// add numeric decompositions
	foreach($number as $from => $to) {
		$utf8_char = $this->UnumberToChar(hexdec(substr($from,2)));
		if (!isset($ascii[$utf8_char])) {
			$ascii[$utf8_char] = $to;
		}
	}

	if ($cacheFileCase) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileCase,serialize($utf8CaseFolding));
	}

	if ($cacheFileASCII) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileASCII,serialize($ascii));
	}

	return 3;
}
1252
/**
 * This function initializes the folding table for a charset other than UTF-8.
 * The table is derived from the UTF-8 case folding table: every character of the
 * charset's conversion table is looked up there and the result converted back to
 * the charset. The ASCII letters a-z / A-Z are always added.
 * This function is automatically called by the case folding functions.
 *
 * @param	string		Charset for which to initialize case folding.
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initCaseFolding($charset) {
	// Only process if the case table is not yet loaded:
	if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset])) return 1;

	// Use cached version if possible (a damaged cache file is ignored and rebuilt below)
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$cached = unserialize(t3lib_div::getUrl($cacheFile));
		if (is_array($cached)) {
			$this->caseFolding[$charset] = $cached;
			return 2;
		}
	}

	// init UTF-8 conversion for this charset
	if (!$this->initCharset($charset)) {
		return false;
	}

	// UTF-8 case folding is used as the base conversion table
	if (!$this->initUnicodeData('case')) {
		return false;
	}

	$nochar = chr($this->noCharByteVal);
	$directions = array('toUpper', 'toLower', 'toTitle');
	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
		// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);

		foreach ($directions as $direction) {
			// Most characters have no mapping in a given direction; nothing to store for them
			if (!isset($this->caseFolding['utf-8'][$direction][$utf8])) {
				continue;
			}
			$cc = $this->utf8_decode($this->caseFolding['utf-8'][$direction][$utf8], $charset);
			// Keep the mapping only if the folded character exists in this charset
			if ($cc != '' && $cc != $nochar) $this->caseFolding[$charset][$direction][$c] = $cc;
		}
	}

	// add the ASCII case table
	for ($i=ord('a'); $i<=ord('z'); $i++) {
		$this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i-32);
	}
	for ($i=ord('A'); $i<=ord('Z'); $i++) {
		$this->caseFolding[$charset]['toLower'][chr($i)] = chr($i+32);
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->caseFolding[$charset]));
	}

	return 3;
}
1314
/**
 * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
 * The table is derived from the UTF-8 transliteration table: every character of the
 * charset's conversion table that has an entry there gets the same ASCII replacement.
 * This function is automatically called by the ASCII transliteration functions.
 *
 * @param	string		Charset for which to initialize conversion.
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initToASCII($charset) {
	// Only process if the table is not yet loaded:
	if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) return 1;

	// Use cached version if possible (a damaged cache file is ignored and rebuilt below)
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$cached = unserialize(t3lib_div::getUrl($cacheFile));
		if (is_array($cached)) {
			$this->toASCII[$charset] = $cached;
			return 2;
		}
	}

	// init UTF-8 conversion for this charset
	if (!$this->initCharset($charset)) {
		return false;
	}

	// UTF-8/ASCII transliteration is used as the base conversion table
	if (!$this->initUnicodeData('ascii')) {
		return false;
	}

	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
		// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);

		if (isset($this->toASCII['utf-8'][$utf8])) {
			$this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
		}
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->toASCII[$charset]));
	}

	return 3;
}
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376 /********************************************
1377 *
1378 * String operation functions
1379 *
1380 ********************************************/
1381
/**
 * Returns a part of a string.
 * Depending on $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] the work is done by
 * mbstring, iconv or the built-in implementations of this class.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Start position (character position)
 * @param	integer		Length (in characters); if omitted the rest of the string is returned
 * @return	string		The substring
 * @see substr(), mb_substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function substr($charset,$string,$start,$len=null) {
	if ($len === 0 || $string === '') {
		return '';
	}

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
			// cannot omit $len, when specifying charset
		if ($len==null) {
			$enc = mb_internal_encoding();	// save internal encoding
			mb_internal_encoding($charset);
			$str = mb_substr($string,$start);
			mb_internal_encoding($enc);	// restore internal encoding

			return $str;
		}
		else {
			return mb_substr($string,$start,$len,$charset);
		}
	} elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
			// cannot omit $len, when specifying charset
		if ($len==null) {
			$enc = iconv_get_encoding('internal_encoding');	// save internal encoding
			iconv_set_encoding('internal_encoding',$charset);
			$str = iconv_substr($string,$start);
			iconv_set_encoding('internal_encoding',$enc);	// restore internal encoding

			return $str;
		}
		else {
			return iconv_substr($string,$start,$len,$charset);
		}
	} elseif ($charset == 'utf-8') {
		return $this->utf8_substr($string,$start,$len);
	} elseif ($this->eucBasedSets[$charset]) {
		return $this->euc_substr($string,$start,$charset,$len);
	} elseif ($this->twoByteSets[$charset]) {
			// An omitted length must not be multiplied: null*2 is 0 and would cut everything away
		return $len === NULL ? substr($string,$start*2) : substr($string,$start*2,$len*2);
	} elseif ($this->fourByteSets[$charset]) {
		return $len === NULL ? substr($string,$start*4) : substr($string,$start*4,$len*4);
	}

	// treat everything else as single-byte encoding
	return $len === NULL ? substr($string,$start) : substr($string,$start,$len);
}
1438
/**
 * Returns the length of a string in characters (not bytes).
 * mbstring or iconv do the counting when configured in
 * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']; otherwise the charset decides
 * which built-in counting method is used.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @return	integer		The number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strlen($charset,$string) {
	// Delegate to an extension if one is configured
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strlen($string,$charset);
	}
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
		return iconv_strlen($string,$charset);
	}

	// Variable width charsets need a real scan
	if ($charset == 'utf-8') {
		return $this->utf8_strlen($string);
	}
	if ($this->eucBasedSets[$charset]) {
		return $this->euc_strlen($string,$charset);
	}

	// Fixed width charsets: divide the byte length by the character width
	if ($this->twoByteSets[$charset]) {
		return strlen($string)/2;
	}
	if ($this->fourByteSets[$charset]) {
		return strlen($string)/4;
	}

	// treat everything else as single-byte encoding
	return strlen($string);
}
1466
/**
 * Crops a string with the help of the mbstring functions.
 * A positive length keeps the beginning of the string and appends the crop signifier,
 * a negative length keeps the end and prepends it. Strings that are not longer than
 * the requested length are returned untouched.
 *
 * @param	string		The character set
 * @param	string		String to be cropped
 * @param	integer		Crop length (in characters)
 * @param	string		Crop signifier
 * @return	string		The shortened string
 * @see mb_strlen(), mb_substr()
 */
protected function cropMbstring($charset, $string, $len, $crop = '') {
	// A length of zero means "do not crop"
	if (intval($len) === 0) {
		return $string;
	}
	// Nothing to cut off
	if (mb_strlen($string, $charset) <= abs($len)) {
		return $string;
	}

	if ($len > 0) {
		return mb_substr($string, 0, $len, $charset) . $crop;
	}

	// Negative length: keep the last abs($len) characters
	$tail = mb_substr($string, $len, mb_strlen($string, $charset), $charset);
	return $crop . $tail;
}
1490
/**
 * Truncates a string and pre-/appends a string.
 * A positive length keeps the beginning of the string and appends the crop signifier,
 * a negative length keeps the end and prepends it. The string is returned unchanged
 * when the length is zero or nothing would be cut off.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Length (in characters)
 * @param	string		Crop signifier
 * @return	string		The shortened string
 * @see substr(), mb_strimwidth()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function crop($charset,$string,$len,$crop='') {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return $this->cropMbstring($charset, $string, $len, $crop);
	}

	if (intval($len) == 0) return $string;

	// Translate the character position into the byte position $i of the cut
	if ($charset == 'utf-8') {
		$i = $this->utf8_char2byte_pos($string,$len);
	} elseif ($this->eucBasedSets[$charset]) {
		$i = $this->euc_char2byte_pos($string,$len,$charset);
	} else {
		if ($len > 0) {
			$i = $len;
		} else {
			$i = strlen($string)+$len;
			if ($i<=0) $i = false;
		}
	}

	if ($i === false) {	// $len outside actual string length
		return $string;
	}

	$byteLength = strlen($string);
	if ($len > 0) {
		// Only crop if there is at least one byte behind the cut
		if ($i < $byteLength) {
			return substr($string,0,$i).$crop;
		}
	} elseif ($i > 0 && $i <= $byteLength) {
		// Only crop if there is at least one byte in front of the cut
		return $crop.substr($string,$i);
	}

	return $string;
}
1549
/**
 * Cuts a string short at a given byte length without splitting a character.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		The byte length
 * @return	string		The shortened string ('' if the byte length is zero or negative)
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strtrunc($charset,$string,$len) {
	if ($len <= 0) return '';

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strcut($string,0,$len,$charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_strtrunc($string,$len);
	} elseif ($this->eucBasedSets[$charset]) {
			// The byte length has to be handed on, otherwise there is nothing to truncate at.
			// NOTE(review): assumes the signature euc_strtrunc($str,$len,$charset) - confirm against its definition.
		return $this->euc_strtrunc($string,$len,$charset);
	} elseif ($this->twoByteSets[$charset]) {
		if ($len % 2) $len--;		// don't cut at odd positions
	} elseif ($this->fourByteSets[$charset]) {
		$len -= $len % 4;		// realign to position dividable by four
	}

	// fixed width charsets (with the realigned length) and everything else, which is treated as single-byte encoding
	return substr($string,0,$len);
}
1578
/**
 * Converts all characters of a string to lower or upper case.
 * Unlike strtolower() and strtoupper() this method is locale independent.
 * Note that the string length may change!
 * eg. the lower case German sharp S (U+00DF) becomes upper case "SS"
 * Real case folding is language dependent, this method ignores this fact.
 *
 * @param	string		Character set of string
 * @param	string		Input string to convert case for
 * @param	string		Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
 * @return	string		The converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper()
 */
function conv_case($charset,$string,$case) {
	// mbstring handles every charset itself
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return ($case == 'toLower')
			? mb_strtolower($string,$charset)
			: mb_strtoupper($string,$charset);
	}

	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string,'case',$case);
	}

	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string,$charset,'case',$case);
	}

	// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string,$charset,'case',$case);
}
1612
/**
 * Replaces special characters (accented letters, umlauts etc.) by ASCII equivalents,
 * which may consist of more than one character (eg. "ae" for a-umlaut).
 *
 * @param	string		Character set of string
 * @param	string		Input string to convert
 * @return	string		The converted string
 */
function specCharsToASCII($charset,$string) {
	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string,'ascii');
	}

	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string,$charset,'ascii');
	}

	// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string,$charset,'ascii');
}
1632
1633
/**
 * Converts the language codes that we get from the client (usually HTTP_ACCEPT_LANGUAGE)
 * into a TYPO3-readable language code.
 * The codes are tried in the order of their quality value (highest first); codes with
 * the same quality are tried in the order in which they appear in the list. For every
 * code the full code is tried first, then the code without its country part.
 *
 * @param	string		$languageCodesList: list of language codes. something like 'de,en-us;q=0.9,de-de;q=0.7,es-cl;q=0.6,en;q=0.4,es;q=0.3,zh;q=0.1'
 *				see http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4
 * @return	string		a preferred language that TYPO3 supports, or "default" if none found
 * @author	Benjamin Mack (benni.typo3.org)
 */
public function getPreferredClientLanguage($languageCodesList) {
	$allLanguageCodes = array();
	$selectedLanguage = 'default';

	// get all languages where TYPO3 code is the same as the ISO code
	foreach ($this->charSetArray as $typo3Lang => $charSet) {
		$allLanguageCodes[$typo3Lang] = $typo3Lang;
	}

	// get all languages where TYPO3 code differs from ISO code
	// or needs the country part
	// the iso codes will here overwrite the default typo3 language in the key
	foreach ($this->isoArray as $typo3Lang => $isoLang) {
		$allLanguageCodes[$typo3Lang] = str_replace('_', '-', $isoLang);
	}

	// move the iso codes to the keys (because we're comparing the keys with "isset" later on)
	$allLanguageCodes = array_flip($allLanguageCodes);

	// collect the quality value of every requested language (1.0 if none is given)
	$qualityByLanguage = array();
	foreach (t3lib_div::trimExplode(',', $languageCodesList) as $preferredLanguage) {
		$quality = 1.0;
		if (strpos($preferredLanguage, ';q=') !== false) {
			list($preferredLanguage, $quality) = explode(';q=', $preferredLanguage);
		}
		$qualityByLanguage[$preferredLanguage] = $quality;
	}

	// order by quality (highest first); the position in the list breaks ties so that
	// the result does not depend on how the sort function treats equal values
	$codes = array_keys($qualityByLanguage);
	$qualities = array_map('floatval', array_values($qualityByLanguage));
	$positions = array_keys($codes);
	array_multisort($qualities, SORT_DESC, SORT_NUMERIC, $positions, SORT_ASC, SORT_NUMERIC, $codes);

	// loop through the languages, with the highest priority first
	foreach ($codes as $preferredLanguage) {
		$preferredLanguage = (string)$preferredLanguage;
		if (isset($allLanguageCodes[$preferredLanguage])) {
			$selectedLanguage = $allLanguageCodes[$preferredLanguage];
			break;
		}

		// strip the country code from the end
		$codeParts = explode('-', $preferredLanguage);
		if (isset($allLanguageCodes[$codeParts[0]])) {
			$selectedLanguage = $allLanguageCodes[$codeParts[0]];
			break;
		}
	}

	if (!$selectedLanguage || $selectedLanguage == 'en') {
		$selectedLanguage = 'default';
	}
	return $selectedLanguage;
}
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704 /********************************************
1705 *
1706 * Internal string operation functions
1707 *
1708 ********************************************/
1709
/**
 * Maps all characters of a string in a single byte charset.
 * Every byte that has an entry in the selected table is replaced by that entry, all
 * other bytes are copied unchanged. The string is returned as it is when the mode is
 * unknown or the table cannot be initialized.
 *
 * @param	string		the string
 * @param	string		the charset
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function sb_char_mapping($str,$charset,$mode,$opt='') {
	switch($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) return $str;	// do nothing
			$map =& $this->caseFolding[$charset][$opt];
			break;

		case 'ascii':
			if (!$this->initToASCII($charset)) return $str;	// do nothing
			$map =& $this->toASCII[$charset];
			break;

		default:
			return $str;
	}

	$str = (string)$str;
	$byteLength = strlen($str);
	$out = '';
	// Walk the string byte by byte; the loop is bounded by the length so that no offset beyond the end is read
	for ($i=0; $i<$byteLength; $i++) {
		$c = $str[$i];
		$out .= isset($map[$c]) ? $map[$c] : $c;
	}

	return $out;
}
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758 /********************************************
1759 *
1760 * Internal UTF-8 string operation functions
1761 *
1762 ********************************************/
1763
/**
 * Returns a part of a UTF-8 string.
 * Character positions are translated into byte positions with utf8_char2byte_pos().
 *
 * @param	string		UTF-8 string
 * @param	integer		Start position (character position)
 * @param	integer		Length (in characters); if omitted the rest of the string is returned
 * @return	string		The substring; FALSE if a positive $start lies outside the string
 * @see substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_substr($str,$start,$len=null) {
	// An explicit length of zero (0 or '0') always yields an empty string
	if ($len !== null && !strcmp($len,'0')) return '';

	$byte_start = $this->utf8_char2byte_pos($str,$start);
	if ($byte_start === false) {
		if ($start > 0) {
			return false;	// $start outside string length
		}
		// A start position outside the string that is not positive: begin at the first byte
		$byte_start = 0;
	}

	$str = substr($str,$byte_start);

	if ($len!=null) {
		$byte_end = $this->utf8_char2byte_pos($str,$len);
		if ($byte_end === false) {	// $len outside actual string length
			return $len<0 ? '' : $str;	// When length is less than zero and exceeds, then we return blank string.
		}
		return substr($str,0,$byte_end);
	}

	return $str;
}
1798
/**
 * Counts the number of characters of a string in UTF-8.
 * Every byte that is not a continuation byte (10xxxxxx) starts a character, so single
 * bytes (0xxxxxxx) and lead bytes of multibyte sequences (11xxxxxx) are counted.
 *
 * @param	string		UTF-8 multibyte character string
 * @return	integer		The number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strlen($str) {
	$str = (string)$str;
	$byteLength = strlen($str);
	$n = 0;
	// Bounded by the byte length so that no offset beyond the end is read
	for ($i=0; $i<$byteLength; $i++) {
		if ((ord($str[$i]) & 0xC0) != 0x80) {	// anything but a continuation byte
			$n++;
		}
	}
	return $n;
}
1819
1820 /**
1821 * Truncates a string in UTF-8 short at a given byte length.
1822 *
1823 * @param string UTF-8 multibyte character string
1824 * @param integer the byte length
1825 * @return string the shortened string
1826 * @see mb_strcut()
1827 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1828 */
function utf8_strtrunc($str,$len) {
		// Nothing to inspect when the cut lies outside the string; substr() gives
		// the same result the byte inspection below would have led to.
	if ($len <= 0 || $len > strlen($str)) {
		return substr($str,0,$len);
	}

	$i = $len-1;	// last byte that would be kept
	if (ord($str[$i]) & 0x80) {	// part of a multibyte sequence
			// walk back over continuation bytes (10xxxxxx) to the first byte of the character
		while ($i > 0 && (ord($str[$i]) & 0xC0) == 0x80) {
			$i--;
		}
		$lead = ord($str[$i]);
		if (($lead & 0xC0) == 0x80) {
			return '';	// sanity check: only continuation bytes up to the string start
		}
			// the number of leading 1-bits of the first byte is the length of the sequence
		for ($bc = 0; $lead & 0x80; $lead = $lead << 1) {
			$bc++;
		}
		if ($bc+$i > $len) {
			return substr($str,0,$i);	// character does not fit: cut before it
		}
			// fallthru: multibyte char fits into length (also when it starts at byte 0)
	}
	return substr($str,0,$len);
}
1840
1841 /**
1842 * Find position of first occurrence of a string, both arguments are in UTF-8.
1843 *
1844 * @param string UTF-8 string to search in
1845 * @param string UTF-8 string to search for
1846 * @param integer Position to start the search
1847 * @return integer The character position
1848 * @see strpos()
1849 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1850 */
function utf8_strpos($haystack,$needle,$offset=0) {
		// Delegate to the configured multibyte extension, if any.
	switch ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) {
		case 'mbstring':
			return mb_strpos($haystack,$needle,$offset,'utf-8');
		case 'iconv':
			return iconv_strpos($haystack,$needle,$offset,'utf-8');
	}

		// Fallback: translate the character offset to bytes, search byte-wise,
		// then translate the byte position of the hit back to characters.
	$startByte = $this->utf8_char2byte_pos($haystack,$offset);
	if ($startByte === false) {
		return false;	// offset beyond string length
	}

	$foundByte = strpos($haystack,$needle,$startByte);
	if ($foundByte === false) {
		return false;	// needle not found
	}

	return $this->utf8_byte2char_pos($haystack,$foundByte);
}
1866
1867 /**
1868 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
1869 *
1870 * @param string UTF-8 string to search in
1871 * @param string UTF-8 character to search for (single character)
1872 * @return integer The character position
1873 * @see strrpos()
1874 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1875 */
function utf8_strrpos($haystack,$needle) {
		// Delegate to the configured multibyte extension, if any.
	switch ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) {
		case 'mbstring':
			return mb_strrpos($haystack,$needle,'utf-8');
		case 'iconv':
			return iconv_strrpos($haystack,$needle,'utf-8');
	}

		// Fallback: byte-wise search, then translate the byte position to a character position.
	$foundByte = strrpos($haystack,$needle);
	if ($foundByte === false) {
		return false;	// needle not found
	}

	return $this->utf8_byte2char_pos($haystack,$foundByte);
}
1888
1889 /**
1890 * Translates a character position into an 'absolute' byte position.
1891 * Unit tested by Kasper.
1892 *
1893 * @param string UTF-8 string
1894 * @param integer Character position (negative values start from the end)
1895 * @return integer Byte position, or FALSE if the position lies outside the string
1896 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1897 */
function utf8_char2byte_pos($str,$pos) {
	$length = strlen($str);	// byte length, used for explicit bounds checks
	$n = 0;	// number of characters found
	$p = abs($pos);	// number of characters wanted

	if ($pos >= 0) {
		$i = 0;
		$d = 1;
	} else {
			// negative positions are counted from the end, so scan backwards
		$i = $length-1;
		$d = -1;
	}

		// The index is bounds-checked explicitly instead of probing $str[$i]:
		// since PHP 7.1 a negative string offset wraps around to the end of the
		// string, which would make the backward scan run past the string start.
	for ( ; $i >= 0 && $i < $length && $n < $p; $i += $d) {
			// every byte that is not a continuation byte (10xxxxxx) starts a character
		if ((ord($str[$i]) & 0xC0) != 0x80) {
			$n++;
		}
	}
	if ($i < 0 || $i >= $length) return false;	// offset beyond string length

	if ($pos >= 0) {
			// skip trailing multi-byte data bytes
		while ($i < $length && (ord($str[$i]) & 0xC0) == 0x80) {
			$i++;
		}
	} else {
			// correct offset: the backward scan stopped one byte before the wanted character
		$i++;
	}

	return $i;
}
1929
1930 /**
1931 * Translates an 'absolute' byte position into a character position.
1932 * Unit tested by Kasper.
1933 *
1934 * @param string UTF-8 string
1935 * @param integer byte position
1936 * @return integer character position
1937 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1938 */
function utf8_byte2char_pos($str,$pos) {
	$length = strlen($str);
		// Reject an empty string and positions outside of it up front. The position
		// directly behind the last byte is still accepted and counts as a character start.
	if ($length == 0 || $pos < 0 || $pos > $length) {
		return false;	// offset beyond string length
	}

	$n = 0;	// number of characters
		// Count the character starts in bytes 1..$pos. Byte 0 is left out on purpose:
		// the character starting there has position 0.
	for ($i = $pos; $i > 0; $i--) {
		if ($i == $length || (ord($str[$i]) & 0xC0) != 0x80) {
			$n++;	// single-byte char (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		}
	}

	return $n;
}
1952
1953 /**
1954 * Maps all characters of an UTF-8 string.
1955 *
1956 * @param string UTF-8 string
1957 * @param string mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
1958 * @param string 'case': conversion 'toLower' or 'toUpper'
1959 * @return string the converted string
1960 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1961 */
function utf8_char_mapping($str,$mode,$opt='') {
	if (!$this->initUnicodeData($mode)) return $str;	// do nothing

		// select the mapping table for the requested mode
	switch ($mode) {
		case 'case':
			$map =& $this->caseFolding['utf-8'][$opt];
		break;

		case 'ascii':
			$map =& $this->toASCII['utf-8'];
		break;

		default:
			return $str;
	}

	$out = '';
	$length = strlen($str);
	for ($i = 0; $i < $length; $i++) {
		$c = ord($str[$i]);
		if (($c & 0xC0) == 0xC0) {	// multi-byte starting byte (11xxxxxx)
				// the number of leading 1-bits is the length of the sequence
			for ($bc = 0; $c & 0x80; $c = $c << 1) {
				$bc++;
			}
			$mbc = substr($str,$i,$bc);
			$i += $bc-1;
		} else {
				// Single-byte char (0xxxxxxx). A stray continuation byte (10xxxxxx) in
				// malformed input also ends up here and is passed through unchanged
				// instead of repeating the previously mapped character.
			$mbc = $str[$i];
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016 /********************************************
2017 *
2018 * Internal EUC string operation functions
2019 *
2020 * Extended Unix Code:
2021 * ASCII compatible 7bit single bytes chars
2022 * 8bit two byte chars
2023 *
2024 * Shift-JIS is treated as a special case.
2025 *
2026 ********************************************/
2027
2028 /**
2029 * Cuts a string in the EUC charset family short at a given byte length.
2030 *
2031 * @param string EUC multibyte character string
2032 * @param integer the byte length
2033 * @param string the charset
2034 * @return string the shortened string
2035 * @see mb_strcut()
2036 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
2037 */
function euc_strtrunc($str,$len,$charset) {
	$length = strlen($str);
	if ($length == 0 || $len >= $length) {
		return $str;	// string shorter than supplied length
	}

	$sjis = ($charset == 'shift_jis');
		// Walk character by character until the byte limit is reached or passed.
	for ($i = 0; $i < $len; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++;	// advance a double-byte char
		} else {
			if ($c >= 0x80) $i++;	// advance a double-byte char
		}
	}

		// This is checked before anything else: a double-byte char straddling the
		// limit must be dropped even when it is the last character of the string.
	if ($i > $len) {
		return substr($str,0,$len-1);	// we ended on a first byte
	}
	return substr($str,0,$len);
}
2057
2058 /**
2059 * Returns a part of a string in the EUC charset family.
2060 *
2061 * @param string EUC multibyte character string
2062 * @param integer start position (character position)
2063 * @param string the charset
2064 * @param integer length (in characters)
2065 * @return string the substring
2066 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
2067 */
function euc_substr($str,$start,$charset,$len=null) {
		// A length of zero (0 or '0') always yields an empty string, like substr().
		// Without this check a zero length would be mistaken for "no length given" below.
	if ($len !== null && !strcmp((string)$len,'0')) return '';

	$byte_start = $this->euc_char2byte_pos($str,$start,$charset);
	if ($byte_start === false) {
		if ($start > 0) {
			return false;	// $start outside string length
		}
			// A negative (or zero) start reaching the beginning of the string is clamped
			// to the first byte, like substr() and utf8_substr() do.
		$byte_start = 0;
	}

	$str = substr($str,$byte_start);

		// Loose comparison on purpose: NULL and '' both mean "no length given".
	if ($len != null) {
		$byte_end = $this->euc_char2byte_pos($str,$len,$charset);
		if ($byte_end === false) {
				// $len outside actual string length: a negative length that exceeds
				// the string leaves nothing, a positive one keeps the whole remainder.
			return $len < 0 ? '' : $str;
		}
		return substr($str,0,$byte_end);
	}

	return $str;
}
2083
2084 /**
2085 * Counts the number of characters of a string in the EUC charset family.
2086 *
2087 * @param string EUC multibyte character string
2088 * @param string the charset
2089 * @return integer the number of characters
2090 * @see strlen()
2091 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
2092 */
function euc_strlen($str,$charset) {
	$sjis = ($charset == 'shift_jis');
	$length = strlen($str);	// byte length, evaluated once
	$n = 0;
	for ($i = 0; $i < $length; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
				// Shift-JIS: only bytes in these ranges start a double-byte char
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++;	// advance a double-byte char
		} else {
			if ($c >= 0x80) $i++;	// advance a double-byte char
		}

		$n++;
	}

	return $n;
}
2110
2111 /**
2112 * Translates a character position into an 'absolute' byte position.
2113 *
2114 * @param string EUC multibyte character string
2115 * @param integer character position (negative values start from the end)
2116 * @param string the charset
2117 * @return integer byte position, or FALSE if the position lies outside the string
2118 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
2119 */
function euc_char2byte_pos($str,$pos,$charset) {
	$sjis = ($charset == 'shift_jis');
	$length = strlen($str);	// byte length, used for explicit bounds checks
	$n = 0;	// number of characters seen
	$p = abs($pos);	// number of characters wanted

	if ($pos >= 0) {
		$i = 0;
		$d = 1;
	} else {
			// negative positions are counted from the end, so scan backwards
		$i = $length-1;
		$d = -1;
	}

		// The index is bounds-checked explicitly instead of probing $str[$i]:
		// since PHP 7.1 a negative string offset wraps around to the end of the
		// string, which would make the backward scan run past the string start.
	for ( ; $i >= 0 && $i < $length && $n < $p; $i += $d) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i += $d;	// advance a double-byte char
		} else {
			if ($c >= 0x80) $i += $d;	// advance a double-byte char
		}

		$n++;
	}
	if ($i < 0 || $i >= $length) return false;	// offset beyond string length

	if ($pos < 0) $i++;	// correct offset

	return $i;
}
2150
2151 /**
2152 * Maps all characters of a string in the EUC charset family.
2153 *
2154 * @param string EUC multibyte character string
2155 * @param string the charset
2156 * @param string mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
2157 * @param string 'case': conversion 'toLower' or 'toUpper'
2158 * @return string the converted string
2159 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
2160 */
function euc_char_mapping($str,$charset,$mode,$opt='') {
		// load and select the mapping table for the requested mode
	switch ($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) return $str;	// do nothing
			$map =& $this->caseFolding[$charset][$opt];
		break;

		case 'ascii':
			if (!$this->initToASCII($charset)) return $str;	// do nothing
			$map =& $this->toASCII[$charset];
		break;

		default:
			return $str;
	}

	$sjis = ($charset == 'shift_jis');
	$out = '';
	$length = strlen($str);
	for ($i = 0; $i < $length; $i++) {
		$mbc = $str[$i];
		$c = ord($mbc);

			// decide whether this byte starts a double-byte char
		if ($sjis) {
			$isDoubleByte = ($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0);
		} else {
			$isDoubleByte = ($c >= 0x80);
		}

		if ($isDoubleByte) {
			$mbc = substr($str,$i,2);
			$i++;	// skip the second byte
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
2205
2206 }
2207
2208 if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']) {
2209 include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
2210 }
2211
2212 ?>