Fixed bug #10735: Wrong returnUrl with clipboard actions
[Packages/TYPO3.CMS.git] / t3lib / class.t3lib_cs.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2003-2009 Kasper Skaarhoj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the Typo3 project. The Typo3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 *
17 * This script is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * This copyright notice MUST APPEAR in all copies of the script!
23 ***************************************************************/
24 /**
25 * Class for conversion between charsets.
26 *
27 * $Id$
28 *
29 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
30 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
31 */
32 /**
33 * [CLASS/FUNCTION INDEX of SCRIPT]
34 *
35 *
36 *
37 * 136: class t3lib_cs
38 * 488: function parse_charset($charset)
39 * 507: function get_locale_charset($locale)
40 *
41 * SECTION: Charset Conversion functions
42 * 560: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
43 * 600: function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0)
44 * 617: function utf8_encode($str,$charset)
45 * 663: function utf8_decode($str,$charset,$useEntityForNoChar=0)
46 * 706: function utf8_to_entities($str)
47 * 739: function entities_to_utf8($str,$alsoStdHtmlEnt=0)
48 * 773: function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
49 * 823: function UnumberToChar($cbyte)
50 * 868: function utf8CharToUnumber($str,$hex=0)
51 *
52 * SECTION: Init functions
53 * 911: function initCharset($charset)
54 * 973: function initUnicodeData($mode=null)
55 * 1198: function initCaseFolding($charset)
56 * 1260: function initToASCII($charset)
57 *
58 * SECTION: String operation functions
59 * 1331: function substr($charset,$string,$start,$len=null)
60 * 1384: function strlen($charset,$string)
61 * 1414: function crop($charset,$string,$len,$crop='')
62 * 1467: function strtrunc($charset,$string,$len)
63 * 1501: function conv_case($charset,$string,$case)
64 * 1527: function specCharsToASCII($charset,$string)
65 *
66 * SECTION: Internal string operation functions
67 * 1567: function sb_char_mapping($str,$charset,$mode,$opt='')
68 *
69 * SECTION: Internal UTF-8 string operation functions
70 * 1622: function utf8_substr($str,$start,$len=null)
71 * 1655: function utf8_strlen($str)
72 * 1676: function utf8_strtrunc($str,$len)
73 * 1698: function utf8_strpos($haystack,$needle,$offset=0)
74 * 1723: function utf8_strrpos($haystack,$needle)
75 * 1745: function utf8_char2byte_pos($str,$pos)
76 * 1786: function utf8_byte2char_pos($str,$pos)
77 * 1809: function utf8_char_mapping($str,$mode,$opt='')
78 *
79 * SECTION: Internal EUC string operation functions
80 * 1885: function euc_strtrunc($str,$len,$charset)
81 * 1914: function euc_substr($str,$start,$charset,$len=null)
82 * 1939: function euc_strlen($str,$charset)
83 * 1966: function euc_char2byte_pos($str,$pos,$charset)
84 * 2007: function euc_char_mapping($str,$charset,$mode,$opt='')
85 *
86 * TOTAL FUNCTIONS: 35
87 * (This index is automatically created/updated by the extension "extdeveval")
88 *
89 */
90
91
92
93
94
95
96
97
98 /**
99 * Notes on UTF-8
100 *
101 * Functions working on UTF-8 strings:
102 *
103 * - strchr/strstr
104 * - strrchr
105 * - substr_count
106 * - implode/explode/join
107 *
108 * Functions nearly working on UTF-8 strings:
109 *
110 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
111 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
112 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
113 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
114 *
115 * Functions NOT working on UTF-8 strings:
116 *
117 * - str*cmp
118 * - stristr
119 * - stripos
120 * - substr
121 * - strrev
122 * - ereg/eregi
123 * - split/spliti
124 * - preg_*
125 * - ...
126 *
127 */
128 /**
129 * Class for conversion between charsets
130 *
131 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
132 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
133 * @package TYPO3
134 * @subpackage t3lib
135 */
136 class t3lib_cs {
/**
 * Byte value emitted for characters that have no equivalent in the
 * target charset (63 is the ASCII question mark).
 */
var $noCharByteVal = 63;

/**
 * Cache of parsed conversion tables, keyed by charset name. The conversion
 * methods read the sub-arrays 'local' (local char number => UTF-8 bytes)
 * and 'utf8' (UTF-8 bytes => local char number).
 */
var $parsedCharsets = array();

/**
 * Cache for case folding data.
 */
var $caseFolding = array();

/**
 * Cache for charset-to-ASCII mappings.
 */
var $toASCII = array();

/**
 * Charsets that use two bytes per character.
 */
var $twoByteSets = array(
	'ucs-2' => 1,		// 2-byte Unicode
);

/**
 * Charsets that use four bytes per character.
 */
var $fourByteSets = array(
	'ucs-4' => 1,		// 4-byte Unicode
	'utf-32' => 1,		// 4-byte Unicode (limited to the 21-bits of UTF-16)
);

/**
 * Charsets that use a scheme like the Extended Unix Code.
 */
var $eucBasedSets = array(
	'gb2312' => 1,		// Chinese, simplified.
	'big5' => 1,		// Chinese, traditional.
	'euc-kr' => 1,		// Korean
	'shift_jis' => 1,	// Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
);
166
/**
 * Alias => canonical charset name. Keys must be lowercase, because
 * parse_charset() lowercases its input before looking it up here.
 *
 * see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
 * http://czyborra.com/charsets/iso8859.html
 */
var $synonyms=array(
	'us' => 'ascii',
	'us-ascii'=> 'ascii',
	'cp819' => 'iso-8859-1',
	'ibm819' => 'iso-8859-1',
	'iso-ir-100' => 'iso-8859-1',
	'iso-ir-101' => 'iso-8859-2',
	'iso-ir-109' => 'iso-8859-3',
	'iso-ir-110' => 'iso-8859-4',
	'iso-ir-144' => 'iso-8859-5',
	'iso-ir-127' => 'iso-8859-6',
	'iso-ir-126' => 'iso-8859-7',
	'iso-ir-138' => 'iso-8859-8',
	'iso-ir-148' => 'iso-8859-9',
	'iso-ir-157' => 'iso-8859-10',
	'iso-ir-179' => 'iso-8859-13',
	'iso-ir-199' => 'iso-8859-14',
	'iso-ir-203' => 'iso-8859-15',
	'csisolatin1' => 'iso-8859-1',
	'csisolatin2' => 'iso-8859-2',
	'csisolatin3' => 'iso-8859-3',
	'csisolatin5' => 'iso-8859-9',
	'csisolatin8' => 'iso-8859-14',
	'csisolatin9' => 'iso-8859-15',
	'csisolatingreek' => 'iso-8859-7',
	'iso-celtic' => 'iso-8859-14',
	'latin1' => 'iso-8859-1',
	'latin2' => 'iso-8859-2',
	'latin3' => 'iso-8859-3',
	'latin5' => 'iso-8859-9',
	'latin6' => 'iso-8859-10',
	'latin8' => 'iso-8859-14',
	'latin9' => 'iso-8859-15',
	'l1' => 'iso-8859-1',
	'l2' => 'iso-8859-2',
	'l3' => 'iso-8859-3',
	'l5' => 'iso-8859-9',
	'l6' => 'iso-8859-10',
	'l8' => 'iso-8859-14',
	'l9' => 'iso-8859-15',
	'cyrillic' => 'iso-8859-5',
	'arabic' => 'iso-8859-6',
	'tis-620' => 'iso-8859-11',
	'win874' => 'windows-874',
	'win1250' => 'windows-1250',
	'win1251' => 'windows-1251',
	'win1252' => 'windows-1252',
	'win1253' => 'windows-1253',
	'win1254' => 'windows-1254',
	'win1255' => 'windows-1255',
	'win1256' => 'windows-1256',
	'win1257' => 'windows-1257',
	'win1258' => 'windows-1258',
	'cp1250' => 'windows-1250',
	'cp1251' => 'windows-1251',
	'cp1252' => 'windows-1252',
	'ms-ee' => 'windows-1250',
	'ms-ansi' => 'windows-1252',
	'ms-greek' => 'windows-1253',
	'ms-turk' => 'windows-1254',
	'winbaltrim' => 'windows-1257',
	'koi-8ru' => 'koi-8r',
	'koi8r' => 'koi-8r',
	'cp878' => 'koi-8r',
	'mac' => 'macroman',
	'macintosh' => 'macroman',
	'euc-cn' => 'gb2312',
	'x-euc-cn' => 'gb2312',
	'euccn' => 'gb2312',
	'cp936' => 'gb2312',
	'big-5' => 'big5',
	'cp950' => 'big5',
	'eucjp' => 'euc-jp',
	'sjis' => 'shift_jis',
	'shift-jis' => 'shift_jis',
	'cp932' => 'shift_jis',
	'cp949' => 'euc-kr',
	'utf7' => 'utf-7',
	'utf8' => 'utf-8',
	'utf16' => 'utf-16',
	'utf32' => 'utf-32',
	'ucs2' => 'ucs-2',
	'ucs4' => 'ucs-4',
);
254
/**
 * Mapping of language identifiers to script (language family) names.
 * get_locale_charset() looks up the lowercased language part of a locale
 * here and then resolves the script through the script_to_charset_* tables.
 *
 * Three groups of keys are accepted: two-letter codes, three-letter
 * MS language codes and English language names.
 */
var $lang_to_script=array(
		// iso-639-1 language codes, see http://www.loc.gov/standards/iso639-2/php/code_list.php
		// (a few entries use the TYPO3 language key instead of the ISO code)
	'ar' => 'arabic',
	'bg' => 'cyrillic',		// Bulgarian
	'bs' => 'east_european',	// Bosnian
	'cs' => 'east_european',	// Czech
	'da' => 'west_european',	// Danish
	'de' => 'west_european',	// German
	'el' => 'greek',		// Greek (ISO code)
	'es' => 'west_european',	// Spanish
	'et' => 'estonian',
	'eo' => 'unicode',		// Esperanto
	'eu' => 'west_european',	// Basque
	'fa' => 'arabic',		// Persian
	'fi' => 'west_european',	// Finnish
	'fo' => 'west_european',	// Faroese
	'fr' => 'west_european',	// French
	'ga' => 'west_european',	// Galician
	'ge' => 'unicode',		// Georgian
	'gr' => 'greek',
	'he' => 'hebrew',		// Hebrew
	'hi' => 'unicode',		// Hindi
	'hr' => 'east_european',	// Croatian
	'hu' => 'east_european',	// Hungarian
	'iw' => 'hebrew',		// Hebrew (former code)
	'is' => 'west_european',	// Icelandic
	'it' => 'west_european',	// Italian
	'ja' => 'japanese',
	'ka' => 'unicode',		// Georgian (ISO code)
	'kl' => 'west_european',	// Greenlandic
	'ko' => 'korean',
	'lt' => 'lithuanian',
	'lv' => 'west_european',	// Latvian/Lettish
	'nl' => 'west_european',	// Dutch
	'no' => 'west_european',	// Norwegian
	'nb' => 'west_european',	// Norwegian Bokmal
	'nn' => 'west_european',	// Norwegian Nynorsk
	'pl' => 'east_european',	// Polish
	'pt' => 'west_european',	// Portuguese
	'ro' => 'east_european',	// Romanian
	'ru' => 'cyrillic',		// Russian
	'sk' => 'east_european',	// Slovak
	'sl' => 'east_european',	// Slovenian
	'sr' => 'cyrillic',		// Serbian
	'sv' => 'west_european',	// Swedish
	'sq' => 'albanian',		// Albanian
	'th' => 'thai',
	'tr' => 'turkish',		// Turkish
	'uk' => 'cyrillic',		// Ukrainian
	'vi' => 'vietnamese',
	'zh' => 'chinese',
		// MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
		// http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
	'ara' => 'arabic',
	'bgr' => 'cyrillic',		// Bulgarian
	'cat' => 'west_european',	// Catalan
	'chs' => 'simpl_chinese',
	'cht' => 'trad_chinese',
	'csy' => 'east_european',	// Czech
	'dan' => 'west_european',	// Danish
	'deu' => 'west_european',	// German
	'dea' => 'west_european',	// German (Austrian)
	'des' => 'west_european',	// German (Swiss)
	'ena' => 'west_european',	// English (Australian)
	'enc' => 'west_european',	// English (Canadian)
	'eng' => 'west_european',	// English
	'enz' => 'west_european',	// English (New Zealand)
	'enu' => 'west_european',	// English (United States)
	'euq' => 'west_european',	// Basque
	'fos' => 'west_european',	// Faroese
	'far' => 'arabic',		// Persian
	'fin' => 'west_european',	// Finnish
	'fra' => 'west_european',	// French
	'frb' => 'west_european',	// French (Belgian)
	'frc' => 'west_european',	// French (Canadian)
	'frs' => 'west_european',	// French (Swiss)
	'geo' => 'unicode',		// Georgian
	'glg' => 'west_european',	// Galician
	'ell' => 'greek',
	'heb' => 'hebrew',
	'hin' => 'unicode',		// Hindi
	'hun' => 'east_european',	// Hungarian
	'isl' => 'west_european',	// Icelandic
	'ita' => 'west_european',	// Italian
	'its' => 'west_european',	// Italian (Swiss)
	'jpn' => 'japanese',
	'kor' => 'korean',
	'lth' => 'lithuanian',
	'lvi' => 'west_european',	// Latvian/Lettish
	'msl' => 'west_european',	// Malay
	'nlb' => 'west_european',	// Dutch (Belgian)
	'nld' => 'west_european',	// Dutch
	'nor' => 'west_european',	// Norwegian (bokmal)
	'non' => 'west_european',	// Norwegian (nynorsk)
	'plk' => 'east_european',	// Polish
	'ptg' => 'west_european',	// Portuguese
	'ptb' => 'west_european',	// Portuguese (Brazil)
	'rom' => 'east_european',	// Romanian
	'rus' => 'cyrillic',		// Russian
	'slv' => 'east_european',	// Slovenian
	'sky' => 'east_european',	// Slovak
	'srl' => 'east_european',	// Serbian (Latin)
	'srb' => 'cyrillic',		// Serbian (Cyrillic)
	'esp' => 'west_european',	// Spanish (trad. sort)
	'esm' => 'west_european',	// Spanish (Mexican)
	'esn' => 'west_european',	// Spanish (internat. sort)
	'sve' => 'west_european',	// Swedish
	'sqi' => 'albanian',		// Albanian
	'tha' => 'thai',
	'trk' => 'turkish',
	'ukr' => 'cyrillic',		// Ukrainian
		// English language names
	'albanian' => 'albanian',
	'arabic' => 'arabic',
	'basque' => 'west_european',
	'bosnian' => 'east_european',
	'bulgarian' => 'cyrillic',	// same script as 'bg' / 'bgr'
	'catalan' => 'west_european',
	'croatian' => 'east_european',
	'czech' => 'east_european',
	'danish' => 'west_european',
	'dutch' => 'west_european',
	'english' => 'west_european',
	'esperanto' => 'unicode',
	'estonian' => 'estonian',
	'faroese' => 'west_european',
	'farsi' => 'arabic',
	'finnish' => 'west_european',
	'french' => 'west_european',
	'galician' => 'west_european',
	'georgian' => 'unicode',
	'german' => 'west_european',
	'greek' => 'greek',
	'greenlandic' => 'west_european',
	'hebrew' => 'hebrew',
	'hindi' => 'unicode',
	'hungarian' => 'east_european',
	'icelandic' => 'west_european',
	'italian' => 'west_european',
	'latvian' => 'west_european',
	'lettish' => 'west_european',
	'lithuanian' => 'lithuanian',
	'malay' => 'west_european',
	'norwegian' => 'west_european',
	'persian' => 'arabic',
	'polish' => 'east_european',
	'portuguese' => 'west_european',
	'russian' => 'cyrillic',
	'romanian' => 'east_european',
	'serbian' => 'cyrillic',
	'slovak' => 'east_european',
	'slovenian' => 'east_european',
	'spanish' => 'west_european',
	'swedish' => 'west_european',
	'svedish' => 'west_european',	// misspelled legacy key, kept for backwards compatibility
	'thai' => 'thai',
	'that' => 'thai',		// misspelled legacy key, kept for backwards compatibility
	'turkish' => 'turkish',
	'ukrainian' => 'cyrillic',
);
411
/**
 * Script (language family) name => charset used on Unix systems.
 * get_locale_charset() reads this table when TYPO3_OS is not 'WIN'; an
 * empty or missing entry makes it fall back to 'iso-8859-1'.
 */
var $script_to_charset_unix = array(
	'west_european'	=> 'iso-8859-1',
	'estonian'	=> 'iso-8859-1',
	'east_european'	=> 'iso-8859-2',
	'baltic'	=> 'iso-8859-4',
	'cyrillic'	=> 'iso-8859-5',
	'arabic'	=> 'iso-8859-6',
	'greek'		=> 'iso-8859-7',
	'hebrew'	=> 'iso-8859-8',
	'turkish'	=> 'iso-8859-9',
	'thai'		=> 'iso-8859-11',	// = TIS-620
	'lithuanian'	=> 'iso-8859-13',
	'chinese'	=> 'gb2312',		// = euc-cn
	'japanese'	=> 'euc-jp',
	'korean'	=> 'euc-kr',
	'simpl_chinese'	=> 'gb2312',
	'trad_chinese'	=> 'big5',
	'vietnamese'	=> '',			// empty: the iso-8859-1 fallback applies
	'unicode'	=> 'utf-8',
	'albanian'	=> 'utf-8'
);
434
/**
 * Script (language family) name => charset used on Windows systems.
 * get_locale_charset() reads this table when TYPO3_OS is 'WIN'; an empty
 * or missing entry makes it fall back to 'windows-1252'.
 *
 * NOTE(review): 'cp874' and 'cp949' are returned by get_locale_charset()
 * exactly as written here, they are not passed through parse_charset().
 */
var $script_to_charset_windows = array(
	'east_european'	=> 'windows-1250',
	'cyrillic'	=> 'windows-1251',
	'west_european'	=> 'windows-1252',
	'greek'		=> 'windows-1253',
	'turkish'	=> 'windows-1254',
	'hebrew'	=> 'windows-1255',
	'arabic'	=> 'windows-1256',
	'baltic'	=> 'windows-1257',
	'estonian'	=> 'windows-1257',
	'lithuanian'	=> 'windows-1257',
	'vietnamese'	=> 'windows-1258',
	'thai'		=> 'cp874',
	'korean'	=> 'cp949',
	'chinese'	=> 'gb2312',
	'japanese'	=> 'shift_jis',
	'simpl_chinese'	=> 'gb2312',
	'trad_chinese'	=> 'big5',
	'albanian'	=> 'windows-1250',
	'unicode'	=> 'utf-8'
);
457
/**
 * Mapping of complete locale names to charsets.
 *
 * get_locale_charset() lowercases the locale before it looks it up in this
 * table, so only lowercase keys can ever match.
 */
var $locale_to_charset=array(
	'japanese.euc' => 'euc-jp',
	'ja_jp.ujis' => 'euc-jp',
	'korean.euc' => 'euc-kr',
	'sr@latn' => 'iso-8859-2',
	'sr@Latn' => 'iso-8859-2',	// mixed-case legacy key, kept for code reading the table directly
	'zh_cn' => 'gb2312',
	'zh_hk' => 'big5',
	'zh_tw' => 'big5',
);
468
/**
 * TYPO3 specific: system charset used for each TYPO3 system language key.
 * An empty value means "iso-8859-1".
 */
var $charSetArray = array(
	'dk'	=> '',
	'de'	=> '',
	'no'	=> '',
	'it'	=> '',
	'fr'	=> '',
	'es'	=> '',
	'nl'	=> '',
	'cz'	=> 'windows-1250',
	'pl'	=> 'iso-8859-2',
	'si'	=> 'windows-1250',
	'fi'	=> '',
	'tr'	=> 'iso-8859-9',
	'se'	=> '',
	'pt'	=> '',
	'ru'	=> 'windows-1251',
	'ro'	=> 'iso-8859-2',
	'ch'	=> 'gb2312',
	'sk'	=> 'windows-1250',
	'lt'	=> 'windows-1257',
	'is'	=> 'utf-8',
	'hr'	=> 'windows-1250',
	'hu'	=> 'iso-8859-2',
	'gl'	=> '',
	'th'	=> 'iso-8859-11',
	'gr'	=> 'iso-8859-7',
	'hk'	=> 'big5',
	'eu'	=> '',
	'bg'	=> 'windows-1251',
	'br'	=> '',
	'et'	=> 'iso-8859-4',
	'ar'	=> 'iso-8859-6',
	'he'	=> 'utf-8',
	'ua'	=> 'windows-1251',
	'jp'	=> 'shift_jis',
	'lv'	=> 'utf-8',
	'vn'	=> 'utf-8',
	'ca'	=> 'iso-8859-15',
	'ba'	=> 'iso-8859-2',
	'kr'	=> 'euc-kr',
	'eo'	=> 'utf-8',
	'my'	=> '',
	'hi'	=> 'utf-8',
	'fo'	=> 'utf-8',
	'fa'	=> 'utf-8',
	'sr'	=> 'utf-8',
	'sq'	=> 'utf-8',
	'ge'	=> 'utf-8',
	'ga'	=> '',
);
521
/**
 * TYPO3 specific: ISO name for each TYPO3 system language key.
 * A key that is missing here has the same name in TYPO3 and ISO.
 */
var $isoArray = array(
	'ba'	=> 'bs',
	'br'	=> 'pt_BR',
	'ch'	=> 'zh_CN',
	'cz'	=> 'cs',
	'dk'	=> 'da',
	'si'	=> 'sl',
	'se'	=> 'sv',
	'gl'	=> 'kl',
	'gr'	=> 'el',
	'hk'	=> 'zh_HK',
	'kr'	=> 'ko',
	'ua'	=> 'uk',
	'jp'	=> 'ja',
	'vn'	=> 'vi',
);
540
/**
 * Normalizes a charset name: the name is lowercased, surrounding whitespace
 * is removed and a known alias from $this->synonyms is replaced by its
 * canonical name.
 *
 * @param	string		Input charset
 * @return	string		Normalized charset
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function parse_charset($charset) {
	$normalized = strtolower($charset);
	$normalized = trim($normalized);

		// Known alias? Then hand back the canonical name instead.
	return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
}
554
/**
 * Get the charset of a locale.
 *
 * Accepted locale formats:
 *
 * ln			language
 * ln_CN		language / country
 * ln_CN.cs		language / country / charset
 * ln_CN.cs@mod		language / country / charset / modifier
 *
 * Resolution order: complete locale found in $this->locale_to_charset,
 * charset part of the locale, modifier "euro", and finally the language
 * mapped to a script and from there to the charset of the current OS.
 * Unknown languages end up with 'windows-1252' (TYPO3_OS is 'WIN') or
 * 'iso-8859-1' (any other OS).
 *
 * @param	string		Locale string
 * @return	string		Charset resolved for locale string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function get_locale_charset($locale) {
	$locale = strtolower($locale);

		// exact locale specific charset?
	if (isset($this->locale_to_charset[$locale])) return $this->locale_to_charset[$locale];

		// The locale was lowercased above, so a table key written in mixed case
		// can only be found by comparing case-insensitively.
	foreach ($this->locale_to_charset as $key => $cs) {
		if (strtolower($key) === $locale) return $cs;
	}

		// get modifier (missing parts default to an empty string instead of raising a notice)
	$parts = explode('@',$locale);
	$locale = $parts[0];
	$modifier = isset($parts[1]) ? $parts[1] : '';

		// locale contains charset: use it
	$parts = explode('.',$locale);
	$locale = $parts[0];
	$charset = isset($parts[1]) ? $parts[1] : '';
	if ($charset) return $this->parse_charset($charset);

		// modifier is 'euro' (after charset check, because of xx.utf-8@euro)
	if ($modifier == 'euro') return 'iso-8859-15';

		// get language (the country part is not needed)
	$parts = explode('_',$locale);
	$language = $parts[0];
	$script = isset($this->lang_to_script[$language]) ? $this->lang_to_script[$language] : '';

		// An unknown script or an empty table entry selects the OS default.
	if (TYPO3_OS == 'WIN') {
		$cs = !empty($this->script_to_charset_windows[$script]) ? $this->script_to_charset_windows[$script] : 'windows-1252';
	} else {
		$cs = !empty($this->script_to_charset_unix[$script]) ? $this->script_to_charset_unix[$script] : 'iso-8859-1';
	}

	return $cs;
}
595
596
597
598
599
600
601
602
603
604 /********************************************
605 *
606 * Charset Conversion functions
607 *
608 ********************************************/
609
/**
 * Converts a string from one charset to another.
 *
 * If the conversion method configured in
 * $TYPO3_CONF_VARS['SYS']['t3lib_cs_convMethod'] is 'mbstring', 'iconv' or
 * 'recode', that library is tried first. The built-in conversion (via UTF-8
 * as intermediate charset) is used if no library is configured, if the
 * library call returns FALSE, or if entities are requested for a target
 * charset other than utf-8.
 *
 * @param	string		Input string
 * @param	string		From charset (the current charset of the string)
 * @param	string		To charset (the output charset wanted)
 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return	string		Converted string
 * @see convArray()
 */
function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) {
	if ($fromCS == $toCS) {
		return $str;
	}

		// PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
	$libraryAllowed = ($toCS == 'utf-8' || !$useEntityForNoChar);
	if ($libraryAllowed) {
		$method = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'];
		$converted = false;

		if ($method == 'mbstring') {
			$converted = mb_convert_encoding($str,$toCS,$fromCS);	// returns false for unsupported charsets
		} elseif ($method == 'iconv') {
			$converted = iconv($fromCS,$toCS.'//TRANSLIT',$str);
		} elseif ($method == 'recode') {
			$converted = recode_string($fromCS.'..'.$toCS,$str);
		}

		if (false !== $converted) {
			return $converted;
		}
			// otherwise: fallback to TYPO3 conversion
	}

	if ($fromCS != 'utf-8') {
		$str = $this->utf8_encode($str,$fromCS);
	}
	if ($toCS != 'utf-8') {
		$str = $this->utf8_decode($str,$toCS,$useEntityForNoChar);
	}

	return $str;
}
648
/**
 * Converts every string found in an array from one charset to another.
 * Nested arrays are processed recursively.
 * NOTICE: The array is passed by reference and modified in place!
 *
 * @param	array		Input array, possibly multidimensional
 * @param	string		From charset (the current charset of the string)
 * @param	string		To charset (the output charset wanted)
 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return	void
 * @see conv()
 */
function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0) {
	foreach ($array as $key => $unused) {
			// Leaf value: convert it directly in the referenced array.
		if (!is_array($array[$key])) {
			$array[$key] = $this->conv($array[$key],$fromCS,$toCS,$useEntityForNoChar);
			continue;
		}

			// Sub-array: descend.
		$this->convArray($array[$key],$fromCS,$toCS,$useEntityForNoChar);
	}
}
669
/**
 * Converts $str from $charset to UTF-8
 *
 * Bytes are mapped through the 'local' table of the parsed charset. A local
 * character that is not found in the table is replaced by
 * chr($this->noCharByteVal).
 *
 * @param	string		String in local charset to convert to UTF-8
 * @param	string		Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return	string		Output string, converted to UTF-8. NULL if initCharset() returns a false value for $charset.
 */
function utf8_encode($str,$charset) {

	if ($charset === 'utf-8') return $str;

		// Charset is case-insensitive.
	if ($this->initCharset($charset)) {	// Parse conv. table if not already...
		$strLen = strlen($str);
		$outStr='';

			// These lookups do not change while the string is traversed.
		$isTwoByteSet = isset($this->twoByteSets[$charset]);
		$isEucBasedSet = isset($this->eucBasedSets[$charset]);

		for ($a=0;$a<$strLen;$a++) {	// Traverse each char in string.
			$chr=substr($str,$a,1);
			$ord=ord($chr);
			if ($isTwoByteSet) {	// If the charset has two bytes per char
					// substr() instead of a string offset: a missing second byte (odd length) simply counts as 0
				$ord2 = ord(substr($str,$a+1,1));
				$ord = $ord<<8 | $ord2; // assume big endian

				if (isset($this->parsedCharsets[$charset]['local'][$ord])) {	// If the local char-number was found in parsed conv. table then we use that, otherwise the "no char" substitute
					$outStr.=$this->parsedCharsets[$charset]['local'][$ord];
				} else $outStr.=chr($this->noCharByteVal);	// No char exists
				$a++;
			} elseif ($ord>127) {	// Byte values over 127 are not plain ASCII and have to be looked up in the conversion table
				if ($isEucBasedSet) {	// EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
					if ($charset != 'shift_jis' || ($ord < 0xA0 || $ord > 0xDF)) {	// Shift-JIS: chars between 160 and 223 are single byte
						$a++;
						$ord2=ord(substr($str,$a,1));
						$ord = $ord*256+$ord2;
					}
				}

				if (isset($this->parsedCharsets[$charset]['local'][$ord])) {	// If the local char-number was found in parsed conv. table then we use that, otherwise the "no char" substitute
					$outStr.= $this->parsedCharsets[$charset]['local'][$ord];
				} else $outStr.= chr($this->noCharByteVal);	// No char exists
			} else $outStr.= $chr;	// ... otherwise it's just ASCII 0-127 and one byte. Transparent
		}
		return $outStr;
	}

		// Conversion table could not be initialized.
	return NULL;
}
714
/**
 * Converts $str from UTF-8 to $charset
 *
 * Every multibyte sequence is looked up in the 'utf8' table of the parsed
 * charset. A sequence that is not in the table becomes a numeric entity
 * (if $useEntityForNoChar is set) or chr($this->noCharByteVal).
 *
 * @param	string		String in UTF-8 to convert to local charset
 * @param	string		Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return	string		Output string, converted to local charset. NULL if initCharset() returns a false value for $charset.
 */
function utf8_decode($str,$charset,$useEntityForNoChar=0) {

	if ($charset === 'utf-8') {
		return $str;
	}

		// Charset is case-insensitive. Parse conv. table if not already...
	if (!$this->initCharset($charset)) {
		return;
	}

	$length = strlen($str);
	$result = '';
	$pos = 0;

	while ($pos < $length) {	// Traverse each char in UTF-8 string.
		$byte = substr($str,$pos,1);
		$code = ord($byte);
		$pos++;

			// ASCII 0-127 is one byte and passes through unchanged.
		if ($code <= 127) {
			$result .= $byte;
			continue;
		}

			// A first byte must have the 7th bit set. If not, we are in the MIDDLE of a sequence.
		if (!($code & 64)) {
			$result .= chr($this->noCharByteVal);
			continue;
		}

			// Collect the whole sequence: every further set high bit of the first byte announces one more byte.
		$sequence = $byte;
		for ($n=0; $n<8; $n++) {
			$code = $code << 1;
			if (!($code & 128)) {
				break;
			}
			$sequence .= substr($str,$pos,1);
			$pos++;
		}

		if (isset($this->parsedCharsets[$charset]['utf8'][$sequence])) {	// The UTF-8 sequence has a local number
			$localNumber = $this->parsedCharsets[$charset]['utf8'][$sequence];
			if ($localNumber > 255) {	// Greater than 255: split the (assumed 16 bit) word into two chars.
				$result .= chr(($localNumber >> 8) & 255).chr($localNumber & 255);
			} else {
				$result .= chr($localNumber);
			}
		} elseif ($useEntityForNoChar) {	// Create num entity:
			$result .= '&#'.$this->utf8CharToUnumber($sequence,1).';';
		} else {	// No char exists
			$result .= chr($this->noCharByteVal);
		}
	}

	return $result;
}
763
/**
 * Converts all chars > 127 to numeric entities.
 *
 * Each UTF-8 multibyte sequence becomes a hexadecimal entity (&#x...;).
 * A byte > 127 that cannot start a sequence is replaced by
 * chr($this->noCharByteVal).
 *
 * @param	string		Input string
 * @return	string		Output string
 */
function utf8_to_entities($str) {
	$length = strlen($str);
	$result = '';
	$pos = 0;

	while ($pos < $length) {	// Traverse each char in UTF-8 string.
		$byte = substr($str,$pos,1);
		$code = ord($byte);
		$pos++;

			// ASCII 0-127 is one byte and passes through unchanged.
		if ($code <= 127) {
			$result .= $byte;
			continue;
		}

			// A first byte must have the 7th bit set. If not, we are in the MIDDLE of a sequence.
		if (!($code & 64)) {
			$result .= chr($this->noCharByteVal);
			continue;
		}

			// Collect the whole sequence: every further set high bit of the first byte announces one more byte.
		$sequence = $byte;
		for ($n=0; $n<8; $n++) {
			$code = $code << 1;
			if (!($code & 128)) {
				break;
			}
			$sequence .= substr($str,$pos,1);
			$pos++;
		}

		$result .= '&#'.$this->utf8CharToUnumber($sequence,1).';';
	}

	return $result;
}
795
/**
 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
 *
 * Numeric entities are only converted if they are well-formed (decimal
 * digits, or "x"/"X" followed by hex digits). Anything else that looks like
 * an entity but cannot be converted is left in the string unchanged.
 *
 * @param	string		Input string, UTF-8
 * @param	boolean		If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
 * @return	string		Output string
 */
function entities_to_utf8($str,$alsoStdHtmlEnt=0) {
		// Split on entities and keep their names: even keys hold plain text,
		// odd keys hold the part of an entity between "&" and ";".
	$parts = preg_split('/&([#[:alnum:]]*);/', $str, -1, PREG_SPLIT_DELIM_CAPTURE);
	if ($parts === false) {	// regex engine failure: hand back the input untouched
		return $str;
	}

	foreach($parts as $k => $v) {
		if ($k%2) {
			$entity = '&'.$v.';';
			$parts[$k] = $entity;	// No conversion, unless one of the cases below applies

			if (substr($v,0,1)=='#') {	// Dec or hex entities:
				$number = (string)substr($v,1);
				$match = array();
				if (preg_match('/^[xX]([0-9a-fA-F]+)$/', $number, $match)) {
					$parts[$k] = $this->UnumberToChar(hexdec($match[1]));
				} elseif (preg_match('/^[0-9]+$/', $number)) {
					$parts[$k] = $this->UnumberToChar(intval($number));
				}
			} elseif ($alsoStdHtmlEnt) {	// Other entities:
					// Decoding straight to UTF-8 does not depend on the default charset of the PHP version;
					// an unknown entity name is returned unchanged.
				$parts[$k] = html_entity_decode($entity, ENT_COMPAT, 'UTF-8');
			}
		}
	}

	return implode('',$parts);
}
828
/**
 * Converts all chars in the input UTF-8 string into integer numbers returned in an array
 *
 * A byte > 127 that cannot start a multibyte sequence is reported as
 * $this->noCharByteVal (or the matching char if $retChar is set).
 *
 * @param	string		Input string, UTF-8
 * @param	boolean		If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
 * @param	boolean		If set, then instead of integer numbers the real UTF-8 char is returned.
 * @return	array		Output array with the char numbers
 */
function utf8_to_numberarray($str,$convEntities=0,$retChar=0) {
		// If entities must be registered as well...:
	if ($convEntities) {
		$str = $this->entities_to_utf8($str,1);
	}

		// Do conversion:
	$length = strlen($str);
	$numbers = array();
	$pos = 0;

	while ($pos < $length) {	// Traverse each char in UTF-8 string.
		$byte = substr($str,$pos,1);
		$code = ord($byte);
		$pos++;

			// ASCII 0-127 is one byte.
		if ($code <= 127) {
			$numbers[] = $retChar ? chr($code) : $code;
			continue;
		}

			// A first byte must have the 7th bit set. If not, we are in the MIDDLE of a sequence.
		if (!($code & 64)) {
			$numbers[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
			continue;
		}

			// Collect the whole sequence: every further set high bit of the first byte announces one more byte.
		$sequence = $byte;
		for ($n=0; $n<8; $n++) {
			$code = $code << 1;
			if (!($code & 128)) {
				break;
			}
			$sequence .= substr($str,$pos,1);
			$pos++;
		}

		$numbers[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
	}

	return $numbers;
}
867
/**
 * Converts a UNICODE number to a UTF-8 multibyte character
 * Algorithm based on script found at From: http://czyborra.com/utf/
 * Unit-tested by Kasper
 *
 * The bits of the number are spread across the bytes of the sequence; the number of high bits set in the lead byte announces the number of bytes in the sequence:
 *
 *  bytes | bits | representation
 *      1 |    7 | 0vvvvvvv
 *      2 |   11 | 110vvvvv 10vvvvvv
 *      3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
 *      4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * Numbers of 0x80000000 and above yield chr($this->noCharByteVal).
 *
 * @param	integer		UNICODE integer
 * @return	string		UTF-8 multibyte character string
 * @see utf8CharToUnumber()
 */
function UnumberToChar($cbyte) {
		// One byte: plain ASCII
	if ($cbyte < 0x80) {
		return chr($cbyte);
	}

		// Pick the number of continuation bytes and the marker bits of the lead byte.
	if ($cbyte < 0x800) {
		$trailing = 1;
		$marker = 0xC0;
	} elseif ($cbyte < 0x10000) {
		$trailing = 2;
		$marker = 0xE0;
	} elseif ($cbyte < 0x200000) {
		$trailing = 3;
		$marker = 0xF0;
	} elseif ($cbyte < 0x4000000) {
		$trailing = 4;
		$marker = 0xF8;
	} elseif ($cbyte < 0x80000000) {
		$trailing = 5;
		$marker = 0xFC;
	} else {	// Cannot express a 32-bit character in UTF-8
		return chr($this->noCharByteVal);
	}

		// Lead byte: marker plus the topmost value bits ...
	$str = chr($marker | ($cbyte >> (6 * $trailing)));

		// ... followed by the remaining bits in groups of six, highest group first.
	for ($shift = 6 * ($trailing - 1); $shift >= 0; $shift -= 6) {
		$str .= chr(0x80 | (($cbyte >> $shift) & 0x3F));
	}

	return $str;
}
922
/**
 * Decodes one UTF-8 character into its UNICODE number.
 *
 * The number of leading 1-bits of the first byte tells how many bytes follow. Six value bits are taken from each following byte and the remaining low bits from the first byte; the combined bit string is then converted to a number.
 *
 * @param	string		UTF-8 multibyte character string
 * @param	boolean		If set, then a hex. number is returned (as a string prefixed with "x").
 * @return	integer		UNICODE integer
 * @see UnumberToChar()
 */
function utf8CharToUnumber($str,$hex=0) {
	$leadByte = ord(substr($str,0,1));	// First char
	$int = $leadByte;	// Single byte characters are their own number

	if (($leadByte & 192) == 192) {	// Both top bits set: this is the first byte of a multibyte sequence
		$trailingBits = '';
		$shifted = $leadByte;
		$followers = 0;	// Number of bytes following the lead byte
		while ($followers < 8) {
			$shifted = $shifted << 1;
				// As long as another 1-bit reaches the 8th position, one more byte belongs to the sequence
			if (!($shifted & 128)) {
				break;
			}
			$trailingBits .= substr('00000000'.decbin(ord(substr($str,$followers+1,1))),-6);
			$followers++;
		}
			// The lead byte contributes what is left of its low bits
		$leadBits = substr('00000000'.decbin($leadByte),-(6-$followers));

		$int = bindec($leadBits.$trailingBits);
	}

	if ($hex) {
		return 'x'.dechex($int);
	}
	return $int;
}
950
951
952
953
954
955
956
957
958
959 /********************************************
960 *
961 * Init functions
962 *
963 ********************************************/
964
/**
 * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
 * This function is automatically called by the conversion functions
 *
 * The parsed table is stored in $this->parsedCharsets[$charset] with two sub-arrays:
 * 'local' maps the numeric byte value of the charset to the UTF-8 character and 'utf8' maps the UTF-8 character back to the byte value.
 * Only byte values above 127 are stored. A parsed table is cached in typo3temp/cs/ and read from there on later requests.
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param	string		The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return	integer		Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and loaded (from cache or by parsing).
 * @access private
 */
function initCharset($charset) {
		// Only process if the charset is not yet loaded:
	if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
		return 1;
	}

		// Conversion table filename:
	$charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';

		// Give up if the conversion table does not exist:
	if (!($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile))) {
		return false;
	}

		// Cache file for charsets:
		// Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->parsedCharsets[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

		// Parse conversion table into lines:
	$lines = t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1);
		// Initialize the internal variable holding the conv. table:
	$this->parsedCharsets[$charset] = array('local'=>array(),'utf8'=>array());

		// The "whitespaced" type is on the syntax "0x0A	0x000A	#LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
	$whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
	$detectedType = '';

	foreach ($lines as $value) {
			// Comment lines or blanks are ignored.
		if (!trim($value) || substr($value,0,1)=='#') {
			continue;
		}

			// The pattern is needed for type detection (first real line) and for every line of a "whitespaced" table
		$regA = array();
		$matched = false;
		if (!$detectedType || $detectedType=='whitespaced') {
			$matched = preg_match($whitespacedPattern,$value,$regA) ? true : false;
		}
		if (!$detectedType) {
			$detectedType = $matched ? 'whitespaced' : 'ms-token';
		}

		if ($detectedType=='ms-token') {
			$parts = preg_split('/[=:]/',$value,3);
			$hexbyte = $parts[0];
			$utf8 = isset($parts[1]) ? $parts[1] : '';
		} else {
				// A line that does not follow the "whitespaced" syntax carries no mapping
			if (!$matched) {
				continue;
			}
			$hexbyte = $regA[1];
			$utf8 = 'U+'.$regA[2];
		}

		$decval = hexdec(trim($hexbyte));
			// Values up to 127 are not stored in the table
		if ($decval>127) {
			$utf8decval = hexdec(substr(trim($utf8),2));	// Strip the "U+" prefix
			$utf8Char = $this->UnumberToChar($utf8decval);
			$this->parsedCharsets[$charset]['local'][$decval] = $utf8Char;
			$this->parsedCharsets[$charset]['utf8'][$utf8Char] = $decval;
		}
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->parsedCharsets[$charset]));
	}

	return 2;
}
1027
/**
 * This function initializes all UTF-8 character data tables.
 *
 * The tables are built from the files UnicodeData.txt, SpecialCasing.txt and Translit.txt in PATH_t3lib.'unidata/'.
 * Both the case folding table ($this->caseFolding['utf-8']) and the to-ASCII table ($this->toASCII['utf-8']) are always built together and written to cache files in typo3temp/cs/.
 * The $mode parameter only selects which table is checked for being loaded already or being available from cache.
 *
 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
 *
 * @param	string		Mode ("case", "ascii", ...)
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initUnicodeData($mode=null) {
		// cache files
	$cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
	$cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');

		// Only process if the tables are not yet loaded
	switch($mode) {
		case 'case':
			if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8'])) return 1;

				// Use cached version if possible
			if ($cacheFileCase && @is_file($cacheFileCase)) {
				$this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase));
				return 2;
			}
			break;

		case 'ascii':
			if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8'])) return 1;

				// Use cached version if possible
			if ($cacheFileASCII && @is_file($cacheFileASCII)) {
				$this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII));
				return 2;
			}
			break;
	}

		// process main Unicode data file
	$unicodeDataFile = PATH_t3lib.'unidata/UnicodeData.txt';
	if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) return false;

	$fh = fopen($unicodeDataFile,'rb');
	if (!$fh) return false;

		// key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
		// note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
	$this->caseFolding['utf-8'] = array();
	$utf8CaseFolding =& $this->caseFolding['utf-8'];	// a shorthand
	$utf8CaseFolding['toUpper'] = array();
	$utf8CaseFolding['toLower'] = array();
	$utf8CaseFolding['toTitle'] = array();

	$decomposition = array();	// array of temp. decompositions
	$mark = array();		// array of chars that are marks (eg. composing accents)
	$number = array();		// array of chars that are numbers (eg. digits)
	$omit = array();		// array of chars to be omitted (eg. Russian hard sign)

	while (!feof($fh)) {
		$line = fgets($fh,4096);
			// fgets() fails on the final read at end of file; blank lines carry no record either
		if ($line === false || trim($line) == '') continue;

			// has a lot of info; pad so that short lines do not produce undefined offsets
		$fields = array_pad(explode(';', rtrim($line)), 15, '');
		$char = $fields[0];		// code point (hex)
		$name = $fields[1];		// character name
		$cat = $fields[2];		// general category
		$decomp = $fields[5];	// decomposition mapping
		$num = $fields[8];		// numeric value
		$upper = $fields[12];	// uppercase mapping (hex)
		$lower = $fields[13];	// lowercase mapping (hex)
		$title = $fields[14];	// titlecase mapping (hex)

		$ord = hexdec($char);
		if ($ord > 0xFFFF) break;	// only process the BMP

		$utf8_char = $this->UnumberToChar($ord);

		if ($upper) $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
		if ($lower) $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
			// store "title" only when different from "upper" (only a few)
			// (strict comparison: hex strings like "00E1" would otherwise be compared as numbers)
		if ($title && $title !== $upper) $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));

		switch (substr($cat,0,1)) {
			case 'M':	// mark (accent, umlaut, ...)
				$mark["U+$char"] = 1;
				break;

			case 'N':	// numeric value
				if ($ord > 0x80 && $num != '') $number["U+$char"] = $num;
				break;
		}

			// accented Latin letters without "official" decomposition
		$match = array();
		if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/',$name,$match) && !$decomp) {
			$c = ord($match[2]);
			if ($match[1] == 'SMALL') $c += 32;

			$decomposition["U+$char"] = array(dechex($c));
			continue;
		}

		$match = array();
		if (preg_match('/(<.*>)? *(.+)/',$decomp,$match)) {
			switch($match[1]) {
				case '<circle>':	// add parenthesis as circle replacement, eg (1)
					$match[2] = '0028 '.$match[2].' 0029';
					break;

				case '<square>':	// add square brackets as square replacement, eg [1]
					$match[2] = '005B '.$match[2].' 005D';
					break;

				case '<compat>':	// ignore multi char decompositions that start with a space
					if (strncmp($match[2],'0020 ',5) == 0) continue 2;
					break;

					// ignore Arabic and vertical layout presentation decomposition
				case '<initial>':
				case '<medial>':
				case '<final>':
				case '<isolated>':
				case '<vertical>':
					continue 2;
			}
			$decomposition["U+$char"] = explode(' ', $match[2]);
		}
	}
	fclose($fh);

		// process additional Unicode data for casing (allow folded characters to expand into a sequence)
	$specialCasingFile = PATH_t3lib.'unidata/SpecialCasing.txt';
	if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
		$fh = fopen($specialCasingFile,'rb');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh,4096);
					// skip failed reads, comment lines and blank lines
				if ($line === false || substr($line,0,1) == '#' || trim($line) == '') continue;

				$fields = array_pad(t3lib_div::trimExplode(';', $line), 5, '');
				$char = $fields[0];
				$cond = $fields[4];
					// only unconditional mappings are used (5th field empty or already the trailing comment)
				if ($cond != '' && substr($cond,0,1) != '#') continue;

				$utf8_char = $this->UnumberToChar(hexdec($char));
				$mappings = array(
					'toLower' => $fields[1],
					'toTitle' => $fields[2],
					'toUpper' => $fields[3],
				);
				foreach ($mappings as $type => $codes) {
						// a mapping to the character itself is not stored
					if ($char === $codes) continue;
						// "title" is stored only when different from "upper"
					if ($type == 'toTitle' && $codes === $mappings['toUpper']) continue;

					$sequence = '';
					foreach (explode(' ', $codes) as $code) {
						$sequence .= $this->UnumberToChar(hexdec($code));
					}
					$utf8CaseFolding[$type][$utf8_char] = $sequence;
				}
			}
			fclose($fh);
		}
	}

		// process custom decompositions
	$customTranslitFile = PATH_t3lib.'unidata/Translit.txt';
	if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
		$fh = fopen($customTranslitFile,'rb');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh,4096);
				if ($line === false || substr($line,0,1) == '#' || trim($line) == '') continue;

				$fields = array_pad(t3lib_div::trimExplode(';', $line), 2, '');
				$char = $fields[0];
				$translit = $fields[1];
					// an empty transliteration means: drop the character
				if (!$translit) $omit["U+$char"] = 1;
				$decomposition["U+$char"] = explode(' ', $translit);
			}
			fclose($fh);
		}
	}

		// decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
	foreach($decomposition as $from => $to) {
		$code_decomp = array();

		while ($code_value = array_shift($to)) {
			if (isset($decomposition["U+$code_value"])) {	// do recursive decomposition
				foreach(array_reverse($decomposition["U+$code_value"]) as $cv) {
					array_unshift($to, $cv);
				}
			} elseif (!isset($mark["U+$code_value"])) {	// remove mark
				array_push($code_decomp, $code_value);
			}
		}
		if (count($code_decomp) || isset($omit[$from])) {
			$decomposition[$from] = $code_decomp;
		} else {
			unset($decomposition[$from]);
		}
	}

		// create ascii only mapping
	$this->toASCII['utf-8'] = array();
	$ascii =& $this->toASCII['utf-8'];

	foreach($decomposition as $from => $to) {
		$code_decomp = array();
		while ($code_value = array_shift($to)) {
			$ord = hexdec($code_value);
			if ($ord > 127) {
				continue 2;	// skip decompositions containing non-ASCII chars
			}
			array_push($code_decomp,chr($ord));
		}
			// keys look like "U+00E0"; only the hex part is a number
		$ascii[$this->UnumberToChar(hexdec(substr($from,2)))] = join('',$code_decomp);
	}

		// add numeric decompositions
	foreach($number as $from => $to) {
		$utf8_char = $this->UnumberToChar(hexdec(substr($from,2)));
		if (!isset($ascii[$utf8_char])) {
			$ascii[$utf8_char] = $to;
		}
	}

	if ($cacheFileCase) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileCase,serialize($utf8CaseFolding));
	}

	if ($cacheFileASCII) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileASCII,serialize($ascii));
	}

	return 3;
}
1253
/**
 * Builds the case folding table of a charset other than UTF-8.
 * This function is automatically called by the case folding functions.
 *
 * Every character of the charset's conversion table is looked up in the UTF-8 case folding table and the result is converted back to the charset. The ASCII letters are added at the end. The finished table is written to a cache file in typo3temp/cs/.
 *
 * @param	string		Charset for which to initialize case folding.
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initCaseFolding($charset) {
		// Nothing to do when the table is in memory already
	if (is_array($this->caseFolding[$charset])) {
		return 1;
	}

		// Try the cache file next
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

		// Both the conversion table of the charset and the UTF-8 case folding table (the base table) are required
	if (!$this->initCharset($charset) || !$this->initUnicodeData('case')) {
		return false;
	}

	$noChar = chr($this->noCharByteVal);
	$foldings = array('toUpper', 'toLower', 'toTitle');

	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
			// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$localChar = $this->utf8_decode($utf8, $charset);

		foreach ($foldings as $folding) {
			$folded = $this->utf8_decode($this->caseFolding['utf-8'][$folding][$utf8], $charset);
				// Skip characters without a mapping and mappings which cannot be expressed in the charset
			if ($folded == '' || $folded == $noChar) {
				continue;
			}
			$this->caseFolding[$charset][$folding][$localChar] = $folded;
		}
	}

		// add the ASCII case table
	foreach (range('a', 'z') as $letter) {
		$this->caseFolding[$charset]['toUpper'][$letter] = chr(ord($letter)-32);
	}
	foreach (range('A', 'Z') as $letter) {
		$this->caseFolding[$charset]['toLower'][$letter] = chr(ord($letter)+32);
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->caseFolding[$charset]));
	}

	return 3;
}
1315
/**
 * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
 * This function is automatically called by the ASCII transliteration functions.
 *
 * The table is derived from the UTF-8 to-ASCII table: every character of the charset's conversion table which has an ASCII transliteration in UTF-8 gets the same transliteration. The result is written to a cache file in typo3temp/cs/.
 *
 * @param	string		Charset for which to initialize conversion.
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initToASCII($charset) {
		// Only process if the table is not yet loaded:
	if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) return 1;

		// Use cached version if possible
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->toASCII[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

		// init UTF-8 conversion for this charset
	if (!$this->initCharset($charset)) {
		return false;
	}

		// UTF-8/ASCII transliteration is used as the base conversion table
	if (!$this->initUnicodeData('ascii')) {
		return false;
	}

		// Start with an empty table so that a charset without any transliteration is still
		// recognised as loaded and is cached as an array
	$this->toASCII[$charset] = array();

	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
			// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);

		if (isset($this->toASCII['utf-8'][$utf8])) {
			$this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
		}
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->toASCII[$charset]));
	}

	return 3;
}
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377 /********************************************
1378 *
1379 * String operation functions
1380 *
1381 ********************************************/
1382
/**
 * Returns a part of a string.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * Depending on $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] the work is done by mbstring or iconv. Without such an extension, UTF-8 and EUC based charsets are handled by the internal functions, charsets with a fixed width of two or four bytes are cut at the corresponding byte positions and everything else is treated as a single-byte charset.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Start position (character position)
 * @param	integer		Length (in characters). If omitted (NULL), everything from the start position to the end of the string is returned.
 * @return	string		The substring
 * @see substr(), mb_substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function substr($charset,$string,$start,$len=null) {
	if ($len===0) return '';

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
			// cannot omit $len, when specifying charset
		if ($len==null) {
			$enc = mb_internal_encoding();	// save internal encoding
			mb_internal_encoding($charset);
			$str = mb_substr($string,$start);
			mb_internal_encoding($enc);	// restore internal encoding

			return $str;
		}
		return mb_substr($string,$start,$len,$charset);
	} elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
			// cannot omit $len, when specifying charset
		if ($len==null) {
			$enc = iconv_get_encoding('internal_encoding');	// save internal encoding
			iconv_set_encoding('internal_encoding',$charset);
			$str = iconv_substr($string,$start);
			iconv_set_encoding('internal_encoding',$enc);	// restore internal encoding

			return $str;
		}
		return iconv_substr($string,$start,$len,$charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_substr($string,$start,$len);
	} elseif (!empty($this->eucBasedSets[$charset])) {
		return $this->euc_substr($string,$start,$charset,$len);
	}

		// Fixed width charsets: character positions are multiples of the character width.
		// Everything else is treated as single-byte encoding.
	$width = 1;
	if (!empty($this->twoByteSets[$charset])) {
		$width = 2;
	} elseif (!empty($this->fourByteSets[$charset])) {
		$width = 4;
	}

		// An omitted length must not be multiplied: NULL*$width is 0 and would give an empty string
	if ($len === null) {
		return substr($string,$start*$width);
	}
	return substr($string,$start*$width,$len*$width);
}
1437
/**
 * Returns the length of a string measured in characters.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * The configured extension (mbstring or iconv) is used if there is one. Otherwise UTF-8 and EUC based charsets are counted by the internal functions, for fixed width charsets the byte length is divided by the character width, and any other charset counts one byte per character.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @return	integer		The number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strlen($charset,$string) {
	$utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];

	if ($utils == 'mbstring') {
		return mb_strlen($string,$charset);
	}
	if ($utils == 'iconv') {
		return iconv_strlen($string,$charset);
	}
	if ($charset == 'utf-8') {
		return $this->utf8_strlen($string);
	}
	if ($this->eucBasedSets[$charset]) {
		return $this->euc_strlen($string,$charset);
	}

	$byteLength = strlen($string);
	if ($this->twoByteSets[$charset]) {
		return $byteLength/2;
	}
	if ($this->fourByteSets[$charset]) {
		return $byteLength/4;
	}

		// treat everything else as single-byte encoding
	return $byteLength;
}
1465
/**
 * Truncates a string and pre-/appends a string.
 * Unit tested by Kasper
 *
 * A positive length keeps that many characters from the beginning and appends the crop signifier.
 * A negative length keeps that many characters from the end and prepends the crop signifier.
 * The string is returned unchanged if the length is 0 or if the string is not longer than the requested length.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Length (in characters)
 * @param	string		Crop signifier
 * @return	string		The shortened string
 * @see substr(), mb_strimwidth()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function crop($charset,$string,$len,$crop='') {
	if (intval($len) == 0) return $string;

		// Translate the character length into the byte position where the string is cut
	if ($charset == 'utf-8') {
		$i = $this->utf8_char2byte_pos($string,$len);
	} elseif (!empty($this->eucBasedSets[$charset])) {
		$i = $this->euc_char2byte_pos($string,$len,$charset);
	} else {
		if ($len > 0) {
			$i = $len;
		} else {
			$i = strlen($string)+$len;
			if ($i<=0) $i = false;
		}
	}

	if ($i === false) {	// $len outside actual string length
		return $string;
	}

	$byteLength = strlen($string);
	if ($len > 0) {
			// Crop only if there is at least one byte at the cut position, i.e. something is cut off
		if ($i >= 0 && $i < $byteLength) {
			return substr($string,0,$i).$crop;
		}
	} else {
			// Crop only if there is at least one byte in front of the cut position
		if ($i >= 1 && $i <= $byteLength) {
			return $crop.substr($string,$i);
		}
	}

	return $string;
}
1520
/**
 * Cuts a string short at a given byte length.
 *
 * The string is never cut in the middle of a multibyte character: for charsets with a fixed width of two or four bytes the length is rounded down to a multiple of the character width, UTF-8 and EUC based charsets are handled by their specific functions.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		The byte length
 * @return	string		The shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strtrunc($charset,$string,$len) {
	if ($len <= 0) return '';

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strcut($string,0,$len,$charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_strtrunc($string,$len);
	} elseif (!empty($this->eucBasedSets[$charset])) {
			// The byte length has to be handed over as well
			// NOTE(review): argument order ($str,$len,$charset) assumed in line with euc_char2byte_pos() - confirm against euc_strtrunc()
		return $this->euc_strtrunc($string,$len,$charset);
	} elseif (!empty($this->twoByteSets[$charset])) {
		if ($len % 2) $len--;		// don't cut at odd positions
	} elseif (!empty($this->fourByteSets[$charset])) {
		$len -= $len % 4;	// realign to position dividable by four
	}

		// treat everything else as single-byte encoding
	return substr($string,0,$len);
}
1549
/**
 * Converts all characters of a string to lower or upper case.
 * Unlike strtolower() and strtoupper() this method is locale independent.
 * Note that the string length may change!
 * eg. lower case German "ß" (sharp S) becomes upper case "SS"
 * Real case folding is language dependent, this method ignores this fact.
 *
 * mbstring is used if configured. Otherwise the string is mapped with the case folding tables, using the UTF-8, the EUC or the single-byte mapping function depending on the charset.
 *
 * @param	string		Character set of string
 * @param	string		Input string to convert case for
 * @param	string		Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
 * @return	string		The converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper()
 */
function conv_case($charset,$string,$case) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return $case == 'toLower' ? mb_strtolower($string,$charset) : mb_strtoupper($string,$charset);
	}

	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string,'case',$case);
	}

	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string,$charset,'case',$case);
	}

		// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string,$charset,'case',$case);
}
1583
/**
 * Converts special chars (accented letters, umlauts, ligatures etc) to ASCII equivalents (often two letters, like "ae" for the ligature of a and e).
 *
 * The string is mapped with the to-ASCII tables, using the UTF-8, the EUC or the single-byte mapping function depending on the charset.
 *
 * @param	string		Character set of string
 * @param	string		Input string to convert
 * @return	string		The converted string
 */
function specCharsToASCII($charset,$string) {
	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string,'ascii');
	}

	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string,$charset,'ascii');
	}

		// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string,$charset,'ascii');
}
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615 /********************************************
1616 *
1617 * Internal string operation functions
1618 *
1619 ********************************************/
1620
/**
 * Maps all characters of a string in a single byte charset.
 *
 * Every byte of the string is looked up in the mapping table selected by $mode; bytes without an entry are copied unchanged. The string is returned as it is if the table cannot be initialized or the mode is unknown.
 *
 * @param	string		the string
 * @param	string		the charset
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function sb_char_mapping($str,$charset,$mode,$opt='') {
	switch($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) return $str;	// do nothing
			$map =& $this->caseFolding[$charset][$opt];
			break;

		case 'ascii':
			if (!$this->initToASCII($charset)) return $str;	// do nothing
			$map =& $this->toASCII[$charset];
			break;

		default:
			return $str;
	}

	$str = (string)$str;
	$byteCount = strlen($str);	// loop over the known length instead of probing for a missing offset
	$out = '';
	for ($i=0; $i<$byteCount; $i++) {
		$c = $str[$i];
		$out .= isset($map[$c]) ? $map[$c] : $c;
	}

	return $out;
}
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669 /********************************************
1670 *
1671 * Internal UTF-8 string operation functions
1672 *
1673 ********************************************/
1674
/**
 * Returns a part of a UTF-8 string.
 * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
 *
 * Start and length are translated into byte positions with utf8_char2byte_pos() and the string is cut there.
 *
 * @param	string		UTF-8 string
 * @param	integer		Start position (character position)
 * @param	integer		Length (in characters)
 * @return	string		The substring. FALSE if a positive start position is outside the string.
 * @see substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_substr($str,$start,$len=null) {
		// A length of zero (number or string) always gives an empty string
	if ((string)$len === '0') return '';

	$byte_start = $this->utf8_char2byte_pos($str,$start);
	if ($byte_start === false) {
		if ($start > 0) {
			return false;	// $start outside string length
		}
			// A negative start reaching beyond the beginning means: start at the beginning
		$byte_start = 0;
	}

	$str = substr($str,$byte_start);

		// No length given: return everything from the start position
	if ($len == null) {
		return $str;
	}

	$byte_end = $this->utf8_char2byte_pos($str,$len);
	if ($byte_end === false) {	// $len outside actual string length
		return $len<0 ? '' : $str;	// When length is less than zero and exceeds, then we return blank string.
	}

	return substr($str,0,$byte_end);
}
1709
/**
 * Counts the number of characters of a string in UTF-8.
 * Works like strlen() / mb_strlen()
 *
 * Every byte which is not a continuation byte (10xxxxxx) starts a character, so these bytes are counted.
 *
 * @param	string		UTF-8 multibyte character string
 * @return	integer		The number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strlen($str) {
	$str = (string)$str;
	$byteCount = strlen($str);
	$n = 0;
	for ($i=0; $i<$byteCount; $i++) {
			// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		if ((ord($str[$i]) & 0xC0) != 0x80) {
			$n++;
		}
	}
	return $n;
}
1730
/**
 * Truncates a string in UTF-8 short at a given byte length.
 *
 * If the cut would split a multibyte character, the whole character is dropped. A character that ends exactly at the byte length is kept, also if it is the first character of the string.
 *
 * @param	string		UTF-8 multibyte character string
 * @param	integer		the byte length
 * @return	string		the shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strtrunc($str,$len) {
	$str = (string)$str;
	if ($len <= 0) return '';
	if ($len > strlen($str)) return $str;	// nothing to cut

	$i = $len-1;
	if (ord($str[$i]) & 0x80) {	// part of a multibyte sequence
			// step back over the continuation bytes (10xxxxxx) to find the first byte
		while ($i>0 && (ord($str[$i]) & 0xC0) == 0x80) {
			$i--;
		}
		$first = ord($str[$i]);
			// sanity check: only continuation bytes up to the start of the string
		if (($first & 0xC0) == 0x80) return '';

			// the number of leading 1-bits is the number of bytes of the character
		for ($bc=0, $mbs=$first; $mbs & 0x80; $mbs = $mbs << 1) {
			$bc++;
		}
			// the character reaches beyond the length: cut in front of it
		if ($bc+$i > $len) return substr($str,0,$i);
			// fallthru: multibyte char fits into length
	}
	return substr($str,0,$len);
}
1751
/**
 * Finds the character position of the first occurrence of a string in another string; both are UTF-8.
 *
 * mbstring or iconv is used if configured. Otherwise the offset is translated to a byte position, the search is done on bytes and the byte position found is translated back to a character position.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 string to search for
 * @param	integer		Position to start the search
 * @return	integer		The character position
 * @see strpos()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strpos($haystack,$needle,$offset=0) {
	$utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];
	if ($utils == 'mbstring') {
		return mb_strpos($haystack,$needle,$offset,'utf-8');
	}
	if ($utils == 'iconv') {
		return iconv_strpos($haystack,$needle,$offset,'utf-8');
	}

	$startByte = $this->utf8_char2byte_pos($haystack,$offset);
	if ($startByte === false) {
			// offset beyond string length
		return false;
	}

	$foundByte = strpos($haystack,$needle,$startByte);
	if ($foundByte === false) {
			// needle not found
		return false;
	}

	return $this->utf8_byte2char_pos($haystack,$foundByte);
}
1777
/**
 * Finds the character position of the last occurrence of a char in a string; both are UTF-8.
 *
 * mbstring or iconv is used if configured. Otherwise the search is done on bytes and the byte position found is translated to a character position.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 character to search for (single character)
 * @return	integer		The character position
 * @see strrpos()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strrpos($haystack,$needle) {
	$utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];
	if ($utils == 'mbstring') {
		return mb_strrpos($haystack,$needle,'utf-8');
	}
	if ($utils == 'iconv') {
		return iconv_strrpos($haystack,$needle,'utf-8');
	}

	$foundByte = strrpos($haystack,$needle);
	if ($foundByte === false) {
			// needle not found
		return false;
	}

	return $this->utf8_byte2char_pos($haystack,$foundByte);
}
1799
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * For a position >= 0 the characters are counted from the beginning and the byte position behind the counted characters is returned.
 * For a negative position the characters are counted from the end and the byte position of the first counted character is returned.
 * FALSE is returned if the scan leaves the string, ie. if a forward scan ends behind the last byte or a backward scan ends in front of the first byte.
 *
 * @param	string		UTF-8 string
 * @param	integer		Character position (negative values start from the end)
 * @return	integer		Byte position
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char2byte_pos($str,$pos) {
	$str = (string)$str;
	$byteCount = strlen($str);
	$n = 0;				// number of characters found
	$p = abs($pos);		// number of characters wanted

	if ($pos >= 0) {
		$i = 0;
		$d = 1;
	} else {
		$i = $byteCount-1;
		$d = -1;
	}

		// The bounds are checked explicitly: a negative string offset must not be read
		// (it would address bytes counted from the end of the string)
	for( ; $i>=0 && $i<$byteCount && $n<$p; $i+=$d) {
			// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		if ((ord($str[$i]) & 0xC0) != 0x80) {
			$n++;
		}
	}
	if ($i<0 || $i>=$byteCount) return false; // offset beyond string length

	if ($pos >= 0) {
			// skip trailing multi-byte data bytes
		while ($i<$byteCount && (ord($str[$i]) & 0xC0) == 0x80) {
			$i++;
		}
	} else {
			// correct offset
		$i++;
	}

	return $i;
}
1840
/**
 * Translates an 'absolute' byte position into a character position.
 *
 * The character position is the number of character starts (single-byte characters or
 * multi-byte starting bytes) found between byte 1 and the given byte position.
 * The position directly behind the last byte is accepted and yields the number of
 * characters in the string.
 *
 * @param	string		UTF-8 string
 * @param	integer		byte position
 * @return	integer		character position, or false for an empty string or a position outside of the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_byte2char_pos($str,$pos) {
	$len = strlen($str);

		// Validate the requested offset itself. Without this check every byte position beyond
		// the string would be counted as an additional single-byte character.
	if (!$len || $pos < 0 || $pos > $len)	return false;	// offset beyond string length

	$n = 0;	// number of characters
	if ($pos == $len) {
			// the end of the string is a character boundary as well
		$n = 1;
		$pos = $len-1;
	}

	for ($i=$pos; $i>0; $i--) {
		$c = ord($str[$i]);
		if (!($c & 0x80) || ($c & 0xC0) == 0xC0) {	// single-byte char or multi-byte starting byte
			$n++;
		}
	}

	return $n;
}
1863
/**
 * Maps all characters of an UTF-8 string.
 *
 * The string is split into characters and every character that has an entry in the
 * selected mapping table is replaced by that entry; all other characters are copied unchanged.
 * The string is returned untouched if initUnicodeData() fails or the mode is unknown.
 *
 * @param	string		UTF-8 string
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char_mapping($str,$mode,$opt='') {
	if (!$this->initUnicodeData($mode))	return $str;	// do nothing

	switch($mode) {
		case 'case':
			$map =& $this->caseFolding['utf-8'][$opt];
		break;

		case 'ascii':
			$map =& $this->toASCII['utf-8'];
		break;

		default:
			return $str;
	}

	$out = '';
	$len = strlen($str);
	for ($i=0; $i<$len; $i++) {
		$c = ord($str[$i]);
		if (($c & 0xC0) == 0xC0) {	// multi-byte starting byte (11xxxxxx)
				// the number of leading 1-bits is the number of bytes of the character
			for ($bc=0; $c & 0x80; $c = $c << 1) { $bc++; }
			$mbc = substr($str,$i,$bc);
			$i += $bc-1;
		} else {
				// Single-byte character (0xxxxxxx). A continuation byte without a starting byte
				// (malformed input) is handled here as well and passed through, so that the
				// previous character is not emitted a second time.
			$mbc = $str[$i];
		}

		if (isset($map[$mbc])) {
			$out .= $map[$mbc];
		} else {
			$out .= $mbc;
		}
	}

	return $out;
}
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927 /********************************************
1928 *
1929 * Internal EUC string operation functions
1930 *
1931 * Extended Unix Code:
1932 * ASCII compatible 7bit single bytes chars
1933 * 8bit two byte chars
1934 *
1935 * Shift-JIS is treated as a special case.
1936 *
1937 ********************************************/
1938
/**
 * Cuts a string in the EUC charset family short at a given byte length.
 *
 * The result never ends in the middle of a double-byte character: if the cut would
 * split one, the string is cut one byte earlier.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		the byte length
 * @param	string		the charset
 * @return	string		the shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strtrunc($str,$len,$charset) {
	$byteLen = strlen($str);

		// Compare the real byte length. Deciding this from the loop index afterwards fails
		// if a double-byte character straddles the limit at the very end of the string.
	if (!$byteLen || $byteLen <= $len)	return $str;	// string not longer than supplied length

	$sjis = ($charset == 'shift_jis');
	for ($i=0; $i<$len; $i++) {	// $len is smaller than the byte length here, so $i stays inside the string
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))	$i++;	// advance a double-byte char
		} else {
			if ($c >= 0x80)	$i++;	// advance a double-byte char
		}
	}

	if ($i > $len) {
		return substr($str,0,$len-1);	// we ended on a first byte
	}

	return substr($str,0,$len);
}
1967
/**
 * Returns a part of a string in the EUC charset family.
 *
 * Start and length are character positions; they are translated into byte positions
 * with euc_char2byte_pos() before the string is cut.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		start position (character position)
 * @param	string		the charset
 * @param	integer		length (in characters); any non-numeric value (like the default null) means "up to the end of the string"
 * @return	string		the substring, or false if the start position is outside of the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_substr($str,$start,$charset,$len=null) {
	$byte_start = $this->euc_char2byte_pos($str,$start,$charset);
	if ($byte_start === false)	return false;	// $start outside string length

	$str = substr($str,$byte_start);

		// A loose comparison with null must not be used here: it would treat a length of 0
		// as "no length given" and return the rest of the string instead of an empty string.
	if (!is_numeric($len))	return $str;

	$byte_end = $this->euc_char2byte_pos($str,$len,$charset);
	if ($byte_end === false) {	// $len outside actual string length
			// a positive length takes everything that is left, a negative length
			// cuts off more characters than there are, so nothing remains
		return $len < 0 ? '' : $str;
	}

	return substr($str,0,$byte_end);
}
1993
/**
 * Counts the number of characters of a string in the EUC charset family.
 *
 * Bytes are read from the start of the string; a byte that starts a double-byte
 * character makes the scan skip the following byte, so both bytes count as one character.
 *
 * @param	string		EUC multibyte character string
 * @param	string		the charset
 * @return	integer		the number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strlen($str,$charset) {
	$sjis = ($charset == 'shift_jis');
	$byteLen = strlen($str);	// the loop is bounded by the byte length instead of probing offsets behind the string
	$n = 0;

	for ($i=0; $i<$byteLen; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))	$i++;	// advance a double-byte char
		} else {
			if ($c >= 0x80)	$i++;	// advance a double-byte char
		}

		$n++;
	}

	return $n;
}
2020
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * The string is always scanned from its start. A negative position is first converted
 * into the equivalent position counted from the start, because the trail byte of a
 * Shift-JIS double-byte character can not be told apart from a single-byte character
 * when reading backwards.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		character position (negative values start from the end)
 * @param	string		the charset
 * @return	integer		byte position, or false if the position is outside of the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char2byte_pos($str,$pos,$charset) {
	$sjis = ($charset == 'shift_jis');
	$byteLen = strlen($str);
	$p = abs($pos);	// number of characters wanted

	if ($pos < 0) {
			// count all characters of the string
		$total = 0;
		for ($i=0; $i<$byteLen; $i++) {
			$c = ord($str[$i]);
			if ($sjis) {
				if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))	$i++;	// advance a double-byte char
			} else {
				if ($c >= 0x80)	$i++;	// advance a double-byte char
			}
			$total++;
		}
		if ($p > $total)	return false;	// offset beyond string length

		$p = $total-$p;	// same character, counted from the start
	}

	$n = 0;	// number of characters seen
	for ($i=0; $i<$byteLen && $n<$p; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))	$i++;	// advance a double-byte char
		} else {
			if ($c >= 0x80)	$i++;	// advance a double-byte char
		}

		$n++;
	}
	if ($i >= $byteLen)	return false;	// offset beyond string length

	return $i;
}
2060
/**
 * Maps all characters of a string in the EUC charset family.
 *
 * The string is split into single-byte and double-byte characters and every character
 * that has an entry in the selected mapping table is replaced by that entry.
 * The string is returned untouched if the mapping table can not be initialized or the
 * mode is unknown.
 *
 * @param	string		EUC multibyte character string
 * @param	string		the charset
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char_mapping($str,$charset,$mode,$opt='') {
	switch($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset))	return $str;	// do nothing
			$map =& $this->caseFolding[$charset][$opt];
		break;

		case 'ascii':
			if (!$this->initToASCII($charset))	return $str;	// do nothing
			$map =& $this->toASCII[$charset];
		break;

		default:
			return $str;
	}

	$sjis = ($charset == 'shift_jis');
	$byteLen = strlen($str);	// the loop is bounded by the byte length instead of probing offsets behind the string
	$out = '';
	for ($i=0; $i<$byteLen; $i++) {
		$mbc = $str[$i];
		$c = ord($mbc);

		if ($sjis) {
			$isDoubleByte = (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0));
		} else {
			$isDoubleByte = ($c >= 0x80);
		}

		if ($isDoubleByte) {	// a double-byte char
			$mbc = substr($str,$i,2);
			$i++;
		}

		if (isset($map[$mbc])) {
			$out .= $map[$mbc];
		} else {
			$out .= $mbc;
		}
	}

	return $out;
}
2115
2116 }
2117
2118 if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']) {
2119 include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
2120 }
2121
2122 ?>