#9076: iso-ir-109 detected as iso-8859-2 instead of iso-8859-3
[Packages/TYPO3.CMS.git] / t3lib / class.t3lib_cs.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2003-2009 Kasper Skaarhoj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the Typo3 project. The Typo3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 *
17 * This script is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * This copyright notice MUST APPEAR in all copies of the script!
23 ***************************************************************/
24 /**
25 * Class for conversion between charsets.
26 *
27 * $Id$
28 *
29 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
30 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
31 */
32 /**
33 * [CLASS/FUNCTION INDEX of SCRIPT]
34 *
35 *
36 *
37 * 136: class t3lib_cs
38 * 488: function parse_charset($charset)
39 * 507: function get_locale_charset($locale)
40 *
41 * SECTION: Charset Conversion functions
42 * 560: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
43 * 600: function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0)
44 * 617: function utf8_encode($str,$charset)
45 * 663: function utf8_decode($str,$charset,$useEntityForNoChar=0)
46 * 706: function utf8_to_entities($str)
47 * 739: function entities_to_utf8($str,$alsoStdHtmlEnt=0)
48 * 773: function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
49 * 823: function UnumberToChar($cbyte)
50 * 868: function utf8CharToUnumber($str,$hex=0)
51 *
52 * SECTION: Init functions
53 * 911: function initCharset($charset)
54 * 973: function initUnicodeData($mode=null)
55 * 1198: function initCaseFolding($charset)
56 * 1260: function initToASCII($charset)
57 *
58 * SECTION: String operation functions
59 * 1331: function substr($charset,$string,$start,$len=null)
60 * 1384: function strlen($charset,$string)
61 * 1414: function crop($charset,$string,$len,$crop='')
62 * 1467: function strtrunc($charset,$string,$len)
63 * 1501: function conv_case($charset,$string,$case)
64 * 1527: function specCharsToASCII($charset,$string)
65 *
66 * SECTION: Internal string operation functions
67 * 1567: function sb_char_mapping($str,$charset,$mode,$opt='')
68 *
69 * SECTION: Internal UTF-8 string operation functions
70 * 1622: function utf8_substr($str,$start,$len=null)
71 * 1655: function utf8_strlen($str)
72 * 1676: function utf8_strtrunc($str,$len)
73 * 1698: function utf8_strpos($haystack,$needle,$offset=0)
74 * 1723: function utf8_strrpos($haystack,$needle)
75 * 1745: function utf8_char2byte_pos($str,$pos)
76 * 1786: function utf8_byte2char_pos($str,$pos)
77 * 1809: function utf8_char_mapping($str,$mode,$opt='')
78 *
79 * SECTION: Internal EUC string operation functions
80 * 1885: function euc_strtrunc($str,$len,$charset)
81 * 1914: function euc_substr($str,$start,$charset,$len=null)
82 * 1939: function euc_strlen($str,$charset)
83 * 1966: function euc_char2byte_pos($str,$pos,$charset)
84 * 2007: function euc_char_mapping($str,$charset,$mode,$opt='')
85 *
86 * TOTAL FUNCTIONS: 35
87 * (This index is automatically created/updated by the extension "extdeveval")
88 *
89 */
90
91
92
93
94
95
96
97
98 /**
99 * Notes on UTF-8
100 *
101 * Functions working on UTF-8 strings:
102 *
103 * - strchr/strstr
104 * - strrchr
105 * - substr_count
106 * - implode/explode/join
107 *
108 * Functions nearly working on UTF-8 strings:
109 *
110 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
111 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
112 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
113 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
114 *
115 * Functions NOT working on UTF-8 strings:
116 *
117 * - str*cmp
118 * - stristr
119 * - stripos
120 * - substr
121 * - strrev
122 * - ereg/eregi
123 * - split/spliti
124 * - preg_*
125 * - ...
126 *
127 */
128 /**
129 * Class for conversion between charsets
130 *
131 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
132 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
133 * @package TYPO3
134 * @subpackage t3lib
135 */
136 class t3lib_cs {
137 var $noCharByteVal=63; // ASCII value (63 = '?') written in place of characters that have no equivalent in the target charset.
138
139 // Cache of parsed conversion tables, keyed by charset name.
// The conversion functions read two sub-arrays per charset:
//   'local' : local character number => UTF-8 byte sequence
//   'utf8'  : UTF-8 byte sequence => local character number
140 var $parsedCharsets=array();
141
142 // Cache for case folding data (presumably filled by initCaseFolding() - not visible in this part of the file)
143 var $caseFolding=array();
144
145 // Cache for charset-to-ASCII mappings (presumably filled by initToASCII() - not visible in this part of the file)
146 var $toASCII=array();
147
148 // Charsets that use two bytes per character. utf8_encode() reads such strings as big-endian byte pairs.
149 var $twoByteSets=array(
150 'ucs-2'=>1, // 2-byte Unicode
151 );
152
153 // Charsets that use four bytes per character.
// NOTE(review): no function in the visible part of the file reads this table - confirm where it is used.
154 var $fourByteSets=array(
155 'ucs-4'=>1, // 4-byte Unicode
156 'utf-32'=>1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
157 );
158
159 // Charsets that use a scheme like the Extended Unix Code: utf8_encode() combines a byte above 127 with the
// following byte into one 16-bit character number (with a single-byte exception range for Shift-JIS).
160 var $eucBasedSets=array(
161 'gb2312'=>1, // Chinese, simplified.
162 'big5'=>1, // Chinese, traditional.
163 'euc-kr'=>1, // Korean
164 'shift_jis'=>1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
165 );
166
// Alternative charset names (lower-case) mapped to the canonical names used by this class.
// parse_charset() looks its lower-cased, trimmed input up in this table.
// see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
// http://czyborra.com/charsets/iso8859.html
var $synonyms=array(
	'us' => 'ascii',
	'us-ascii'=> 'ascii',
	'cp819' => 'iso-8859-1',
	'ibm819' => 'iso-8859-1',
	'iso-ir-100' => 'iso-8859-1',
	'iso-ir-101' => 'iso-8859-2',
	'iso-ir-109' => 'iso-8859-3',
	'iso-ir-110' => 'iso-8859-4',
	'iso-ir-144' => 'iso-8859-5',
	'iso-ir-127' => 'iso-8859-6',
	'iso-ir-126' => 'iso-8859-7',
	'iso-ir-138' => 'iso-8859-8',
	'iso-ir-148' => 'iso-8859-9',
	'iso-ir-157' => 'iso-8859-10',
	'iso-ir-179' => 'iso-8859-13',
	'iso-ir-199' => 'iso-8859-14',
	'iso-ir-203' => 'iso-8859-15',
	'csisolatin1' => 'iso-8859-1',
	'csisolatin2' => 'iso-8859-2',
	'csisolatin3' => 'iso-8859-3',
	'csisolatin5' => 'iso-8859-9',
	'csisolatin8' => 'iso-8859-14',
	'csisolatin9' => 'iso-8859-15',
	'csisolatingreek' => 'iso-8859-7',
	'iso-celtic' => 'iso-8859-14',
	'latin1' => 'iso-8859-1',
	'latin2' => 'iso-8859-2',
	'latin3' => 'iso-8859-3',
	'latin5' => 'iso-8859-9',
	'latin6' => 'iso-8859-10',
	'latin8' => 'iso-8859-14',
	'latin9' => 'iso-8859-15',
	'l1' => 'iso-8859-1',
	'l2' => 'iso-8859-2',
	'l3' => 'iso-8859-3',
	'l5' => 'iso-8859-9',
	'l6' => 'iso-8859-10',
	'l8' => 'iso-8859-14',
	'l9' => 'iso-8859-15',
	'cyrillic' => 'iso-8859-5',
	'arabic' => 'iso-8859-6',
	'tis-620' => 'iso-8859-11',
	'win874' => 'windows-874',
	'win1250' => 'windows-1250',
	'win1251' => 'windows-1251',
	'win1252' => 'windows-1252',
	'win1253' => 'windows-1253',
	'win1254' => 'windows-1254',
	'win1255' => 'windows-1255',
	'win1256' => 'windows-1256',
	'win1257' => 'windows-1257',
	'win1258' => 'windows-1258',
		// "cpNNNN" aliases, kept parallel to the "winNNNN" aliases above
	'cp874' => 'windows-874',
	'cp1250' => 'windows-1250',
	'cp1251' => 'windows-1251',
	'cp1252' => 'windows-1252',
	'cp1253' => 'windows-1253',
	'cp1254' => 'windows-1254',
	'cp1255' => 'windows-1255',
	'cp1256' => 'windows-1256',
	'cp1257' => 'windows-1257',
	'cp1258' => 'windows-1258',
	'ms-ee' => 'windows-1250',
	'ms-ansi' => 'windows-1252',
	'ms-greek' => 'windows-1253',
	'ms-turk' => 'windows-1254',
	'winbaltrim' => 'windows-1257',
	'koi-8ru' => 'koi-8r',
	'koi8r' => 'koi-8r',
	'cp878' => 'koi-8r',
	'mac' => 'macroman',
	'macintosh' => 'macroman',
	'euc-cn' => 'gb2312',
	'x-euc-cn' => 'gb2312',
	'euccn' => 'gb2312',
	'cp936' => 'gb2312',
	'big-5' => 'big5',
	'cp950' => 'big5',
	'eucjp' => 'euc-jp',
	'sjis' => 'shift_jis',
	'shift-jis' => 'shift_jis',
	'cp932' => 'shift_jis',
	'cp949' => 'euc-kr',
	'utf7' => 'utf-7',
	'utf8' => 'utf-8',	// was listed twice; the duplicate key has been removed
	'utf16' => 'utf-16',
	'utf32' => 'utf-32',
	'ucs2' => 'ucs-2',
	'ucs4' => 'ucs-4',
);
254
// Mapping of language identifiers (lower-case) to script names.
// get_locale_charset() uses the language part of a locale as key here and then
// resolves the script name through $script_to_charset_unix / $script_to_charset_windows.
var $lang_to_script=array(
		// iso-639:2 language codes, see:
		// http://www.w3.org/WAI/ER/IG/ert/iso639.htm
		// http://www.loc.gov/standards/iso639-2/langcodes.html
		// http://www.unicode.org/onlinedat/languages.html
	'ar' => 'arabic',
	'bg' => 'cyrillic', // Bulgarian
	'bs' => 'east_european', // Bosnian
	'cs' => 'east_european', // Czech
	'da' => 'west_european', // Danish
	'de' => 'west_european', // German
	'el' => 'greek', // Greek (ISO code; 'gr' below is kept for backwards compatibility)
	'es' => 'west_european', // Spanish
	'et' => 'estonian',
	'eo' => 'unicode', // Esperanto
	'eu' => 'west_european', // Basque
	'fa' => 'arabic', // Persian
	'fi' => 'west_european', // Finnish
	'fo' => 'west_european', // Faroese
	'fr' => 'west_european', // French
	'ga' => 'west_european', // Irish (historically labelled "Galician" here; the ISO code for Galician is 'gl')
	'ge' => 'unicode', // Georgian (non-ISO key, kept for backwards compatibility)
	'gl' => 'west_european', // Galician
	'gr' => 'greek',
	'he' => 'hebrew', // Hebrew (since 1998)
	'hi' => 'unicode', // Hindi
	'hr' => 'east_european', // Croatian
	'hu' => 'east_european', // Hungarian
	'iw' => 'hebrew', // Hebrew (til 1998)
	'is' => 'west_european', // Icelandic
	'it' => 'west_european', // Italian
	'ja' => 'japanese',
	'ka' => 'unicode', // Georgian (ISO code)
	'kl' => 'west_european', // Greenlandic
	'ko' => 'korean',
	'lt' => 'lithuanian',
	'lv' => 'west_european', // Latvian/Lettish
	'nl' => 'west_european', // Dutch
	'no' => 'west_european', // Norwegian
	'pl' => 'east_european', // Polish
	'pt' => 'west_european', // Portuguese
	'ro' => 'east_european', // Romanian
	'ru' => 'cyrillic', // Russian
	'sk' => 'east_european', // Slovak
	'sl' => 'east_european', // Slovenian
	'sr' => 'cyrillic', // Serbian
	'sv' => 'west_european', // Swedish
	'sq' => 'albanian', // Albanian
	'th' => 'thai',
	'tr' => 'turkish', // Turkish (was missing although 'trk' and 'turkish' were present)
	'uk' => 'cyrillic', // Ukrainian
	'vi' => 'vietnamese',
	'zh' => 'chinese',
		// MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
		// http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
	'ara' => 'arabic',
	'bgr' => 'cyrillic', // Bulgarian
	'cat' => 'west_european', // Catalan
	'chs' => 'simpl_chinese',
	'cht' => 'trad_chinese',
	'csy' => 'east_european', // Czech
	'dan' => 'west_european', // Danish
	'deu' => 'west_european', // German
	'dea' => 'west_european', // German (Austrian)
	'des' => 'west_european', // German (Swiss)
	'ena' => 'west_european', // English (Australian)
	'enc' => 'west_european', // English (Canadian)
	'eng' => 'west_european', // English
	'enz' => 'west_european', // English (New Zealand)
	'enu' => 'west_european', // English (United States)
	'euq' => 'west_european', // Basque
	'fos' => 'west_european', // Faroese
	'far' => 'arabic', // Persian
	'fin' => 'west_european', // Finnish
	'fra' => 'west_european', // French
	'frb' => 'west_european', // French (Belgian)
	'frc' => 'west_european', // French (Canadian)
	'frs' => 'west_european', // French (Swiss)
	'geo' => 'unicode', // Georgian
	'glg' => 'west_european', // Galician
	'ell' => 'greek',
	'heb' => 'hebrew',
	'hin' => 'unicode', // Hindi
	'hun' => 'east_european', // Hungarian
	'isl' => 'west_european', // Icelandic (value was misspelled 'west_euorpean')
	'ita' => 'west_european', // Italian
	'its' => 'west_european', // Italian (Swiss)
	'jpn' => 'japanese',
	'kor' => 'korean',
	'lth' => 'lithuanian',
	'lvi' => 'west_european', // Latvian/Lettish
	'msl' => 'west_european', // Malay
	'nlb' => 'west_european', // Dutch (Belgian)
	'nld' => 'west_european', // Dutch
	'nor' => 'west_european', // Norwegian (bokmal)
	'non' => 'west_european', // Norwegian (nynorsk)
	'plk' => 'east_european', // Polish
	'ptg' => 'west_european', // Portuguese
	'ptb' => 'west_european', // Portuguese (Brazil)
	'rom' => 'east_european', // Romanian
	'rus' => 'cyrillic', // Russian
	'slv' => 'east_european', // Slovenian
	'sky' => 'east_european', // Slovak
	'srl' => 'east_european', // Serbian (Latin)
	'srb' => 'cyrillic', // Serbian (Cyrillic)
	'esp' => 'west_european', // Spanish (trad. sort)
	'esm' => 'west_european', // Spanish (Mexican)
	'esn' => 'west_european', // Spanish (internat. sort)
	'sve' => 'west_european', // Swedish
	'sqi' => 'albanian', // Albanian
	'tha' => 'thai',
	'trk' => 'turkish',
	'ukr' => 'cyrillic', // Ukrainian
		// English language names
	'albanian' => 'albanian',
	'arabic' => 'arabic',
	'basque' => 'west_european',
	'bosnian' => 'east_european',
	'bulgarian' => 'cyrillic', // was 'east_european', contradicting 'bg' and 'bgr' above
	'catalan' => 'west_european',
	'croatian' => 'east_european',
	'czech' => 'east_european',
	'danish' => 'west_european',
	'dutch' => 'west_european',
	'english' => 'west_european',
	'esperanto' => 'unicode',
	'estonian' => 'estonian',
	'faroese' => 'west_european',
	'farsi' => 'arabic',
	'finnish' => 'west_european',
	'french' => 'west_european',
	'galician' => 'west_european',
	'georgian' => 'unicode',
	'german' => 'west_european',
	'greek' => 'greek',
	'greenlandic' => 'west_european',
	'hebrew' => 'hebrew',
	'hindi' => 'unicode',
	'hungarian' => 'east_european',
	'icelandic' => 'west_european',
	'italian' => 'west_european',
	'latvian' => 'west_european',
	'lettish' => 'west_european',
	'lithuanian' => 'lithuanian',
	'malay' => 'west_european',
	'norwegian' => 'west_european',
	'persian' => 'arabic',
	'polish' => 'east_european',
	'portuguese' => 'west_european',
	'russian' => 'cyrillic',
	'romanian' => 'east_european',
	'serbian' => 'cyrillic',
	'slovak' => 'east_european',
	'slovenian' => 'east_european',
	'spanish' => 'west_european',
	'swedish' => 'west_european', // key was misspelled 'svedish'
	'thai' => 'thai', // key was misspelled 'that'
	'turkish' => 'turkish',
	'ukrainian' => 'cyrillic',
);
412
413 // mapping of language (family) names to charsets on Unix
// get_locale_charset() reads this table when TYPO3_OS is not 'WIN'; a missing or empty value
// (such as 'vietnamese') makes it fall back to 'iso-8859-1'.
// NOTE(review): no entry of $lang_to_script yields 'baltic', so that row looks unreachable from there - confirm.
414 var $script_to_charset_unix=array(
415 'west_european' => 'iso-8859-1',
416 'estonian' => 'iso-8859-1',
417 'east_european' => 'iso-8859-2',
418 'baltic' => 'iso-8859-4',
419 'cyrillic' => 'iso-8859-5',
420 'arabic' => 'iso-8859-6',
421 'greek' => 'iso-8859-7',
422 'hebrew' => 'iso-8859-8',
423 'turkish' => 'iso-8859-9',
424 'thai' => 'iso-8859-11', // = TIS-620
425 'lithuanian' => 'iso-8859-13',
426 'chinese' => 'gb2312', // = euc-cn
427 'japanese' => 'euc-jp',
428 'korean' => 'euc-kr',
429 'simpl_chinese' => 'gb2312',
430 'trad_chinese' => 'big5',
431 'vietnamese' => '',
432 'unicode' => 'utf-8',
433 'albanian' => 'utf-8'
434 );
435
436 // mapping of language (family) names to charsets on Windows
// get_locale_charset() reads this table when TYPO3_OS is 'WIN'; a missing or empty value makes it
// fall back to 'windows-1252'. The values are returned as they stand here, i.e. 'cp874' and 'cp949'
// are not passed through parse_charset().
437 var $script_to_charset_windows=array(
438 'east_european' => 'windows-1250',
439 'cyrillic' => 'windows-1251',
440 'west_european' => 'windows-1252',
441 'greek' => 'windows-1253',
442 'turkish' => 'windows-1254',
443 'hebrew' => 'windows-1255',
444 'arabic' => 'windows-1256',
445 'baltic' => 'windows-1257',
446 'estonian' => 'windows-1257',
447 'lithuanian' => 'windows-1257',
448 'vietnamese' => 'windows-1258',
449 'thai' => 'cp874',
450 'korean' => 'cp949',
451 'chinese' => 'gb2312',
452 'japanese' => 'shift_jis',
453 'simpl_chinese' => 'gb2312',
454 'trad_chinese' => 'big5',
455 'albanian' => 'windows-1250',
456 'unicode' => 'utf-8'
457 );
458
// Mapping of complete locale names to charsets.
// get_locale_charset() lower-cases the locale before looking it up here,
// so every key in this table MUST be written in lower case.
var $locale_to_charset=array(
	'japanese.euc' => 'euc-jp',
	'ja_jp.ujis' => 'euc-jp',
	'korean.euc' => 'euc-kr',
	'sr@latn' => 'iso-8859-2',	// was 'sr@Latn', which could never match the lower-cased lookup key
	'zh_cn' => 'gb2312',
	'zh_hk' => 'big5',
	'zh_tw' => 'big5',
);
469
470 // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
471 // Empty values means "iso-8859-1"
// Keys are TYPO3 language keys (see $isoArray for those that differ from the ISO codes).
// NOTE(review): this table is not read by any function in the visible part of the file - the
// "empty means iso-8859-1" rule is presumably applied by the consumers; confirm there.
472 var $charSetArray = array(
473 'dk' => '',
474 'de' => '',
475 'no' => '',
476 'it' => '',
477 'fr' => '',
478 'es' => '',
479 'nl' => '',
480 'cz' => 'windows-1250',
481 'pl' => 'iso-8859-2',
482 'si' => 'windows-1250',
483 'fi' => '',
484 'tr' => 'iso-8859-9',
485 'se' => '',
486 'pt' => '',
487 'ru' => 'windows-1251',
488 'ro' => 'iso-8859-2',
489 'ch' => 'gb2312',
490 'sk' => 'windows-1250',
491 'lt' => 'windows-1257',
492 'is' => 'utf-8',
493 'hr' => 'windows-1250',
494 'hu' => 'iso-8859-2',
495 'gl' => '',
496 'th' => 'iso-8859-11',
497 'gr' => 'iso-8859-7',
498 'hk' => 'big5',
499 'eu' => '',
500 'bg' => 'windows-1251',
501 'br' => '',
502 'et' => 'iso-8859-4',
503 'ar' => 'iso-8859-6',
504 'he' => 'utf-8',
505 'ua' => 'windows-1251',
506 'jp' => 'shift_jis',
507 'lv' => 'utf-8',
508 'vn' => 'utf-8',
509 'ca' => 'iso-8859-15',
510 'ba' => 'iso-8859-2',
511 'kr' => 'euc-kr',
512 'eo' => 'utf-8',
513 'my' => '',
514 'hi' => 'utf-8',
515 'fo' => 'utf-8',
516 'fa' => 'utf-8',
517 'sr' => 'utf-8',
518 'sq' => 'utf-8',
519 'ge' => 'utf-8',
520 'ga' => '',
521 );
522
523 // TYPO3 specific: Array with the iso names used for each system language in TYPO3:
524 // Missing keys means: same as Typo3
// Key: TYPO3 language key; value: the corresponding ISO language code, optionally with a country suffix (e.g. 'pt_BR').
525 var $isoArray = array(
526 'ba' => 'bs',
527 'br' => 'pt_BR',
528 'ch' => 'zh_CN',
529 'cz' => 'cs',
530 'dk' => 'da',
531 'si' => 'sl',
532 'se' => 'sv',
533 'gl' => 'kl',
534 'gr' => 'el',
535 'hk' => 'zh_HK',
536 'kr' => 'ko',
537 'ua' => 'uk',
538 'jp' => 'ja',
539 'vn' => 'vi',
540 );
541
/**
 * Normalizes a charset name: the name is lower-cased and trimmed, and a known
 * synonym (see $this->synonyms) is replaced by its canonical name.
 *
 * @param	string		Input charset
 * @return	string		Normalized charset
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function parse_charset($charset) {
	$normalized = trim(strtolower($charset));

		// Names without a registered synonym are returned in their normalized form
	return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
}
555
/**
 * Get the charset of a locale.
 *
 * Accepted locale forms:
 * ln			language
 * ln_CN			language / country
 * ln_CN.cs		language / country / charset
 * ln_CN.cs@mod	language / country / charset / modifier
 *
 * Resolution order:
 * 1. the complete (lower-cased) locale is looked up in $this->locale_to_charset
 * 2. an explicit charset part is normalized with parse_charset() and returned
 * 3. the modifier "euro" yields "iso-8859-15"
 * 4. the language is mapped to a script ($this->lang_to_script) and the script to a charset
 *    (Windows or Unix table, depending on TYPO3_OS); unknown languages/scripts and empty
 *    table values yield "windows-1252" (Windows) or "iso-8859-1" (otherwise)
 *
 * @param	string		Locale string
 * @return	string		Charset resolved for locale string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function get_locale_charset($locale) {
	$locale = strtolower($locale);

		// exact locale specific charset?
	if (isset($this->locale_to_charset[$locale])) {
		return $this->locale_to_charset[$locale];
	}

		// get modifier; array_pad() guarantees both list() targets exist when there is no "@"
	list($locale, $modifier) = array_pad(explode('@', $locale), 2, '');

		// locale contains charset: use it
	list($locale, $charset) = array_pad(explode('.', $locale), 2, '');
	if ($charset) {
		return $this->parse_charset($charset);
	}

		// modifier is 'euro' (after charset check, because of xx.utf-8@euro)
	if ($modifier == 'euro') {
		return 'iso-8859-15';
	}

		// get language (the country part is not needed for the lookup)
	$localeParts = explode('_', $locale);
	$language = $localeParts[0];

		// unknown languages resolve to the empty script name, which is not part of the tables
	$script = isset($this->lang_to_script[$language]) ? $this->lang_to_script[$language] : '';

	if (TYPO3_OS == 'WIN') {
		$scriptTable = $this->script_to_charset_windows;
		$fallback = 'windows-1252';
	} else {
		$scriptTable = $this->script_to_charset_unix;
		$fallback = 'iso-8859-1';
	}

		// empty() also covers scripts that are listed with an empty charset (e.g. 'vietnamese' on Unix)
	return !empty($scriptTable[$script]) ? $scriptTable[$script] : $fallback;
}
596
597
598
599
600
601
602
603
604
605 /********************************************
606 *
607 * Charset Conversion functions
608 *
609 ********************************************/
610
/**
 * Convert from one charset to another charset.
 *
 * If the method configured in $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']
 * ("mbstring", "iconv" or "recode") can be used and does not return false, its result is returned.
 * Otherwise the string is converted with the class' own tables, going through UTF-8.
 *
 * @param	string		Input string
 * @param	string		From charset (the current charset of the string)
 * @param	string		To charset (the output charset wanted)
 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return	string		Converted string
 * @see convArray()
 */
function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) {
	if ($fromCS==$toCS) {
		return $str;
	}

		// PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
	$nativeLibUsable = ($toCS=='utf-8' || !$useEntityForNoChar);
	if ($nativeLibUsable) {
		$method = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'];
		$converted = false;

		if ($method == 'mbstring') {
				// returns false for unsupported charsets
			$converted = mb_convert_encoding($str,$toCS,$fromCS);
		} elseif ($method == 'iconv') {
			$converted = iconv($fromCS,$toCS.'//TRANSLIT',$str);
		} elseif ($method == 'recode') {
			$converted = recode_string($fromCS.'..'.$toCS,$str);
		}

		if (false !== $converted) {
			return $converted;
		}
			// otherwise: fallback to TYPO3 conversion
	}

		// TYPO3 conversion, with UTF-8 as intermediate charset
	if ($fromCS!='utf-8') {
		$str = $this->utf8_encode($str,$fromCS);
	}
	if ($toCS!='utf-8') {
		$str = $this->utf8_decode($str,$toCS,$useEntityForNoChar);
	}
	return $str;
}
649
/**
 * Convert all elements in ARRAY from one charset to another charset.
 * Nested arrays are processed recursively, all other values are passed to conv().
 * NOTICE: Array is passed by reference!
 *
 * @param	string		Input array, possibly multidimensional
 * @param	string		From charset (the current charset of the string)
 * @param	string		To charset (the output charset wanted)
 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return	void
 * @see conv()
 */
function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0) {
	foreach (array_keys($array) as $index) {
		if (!is_array($array[$index])) {
				// Leaf value: convert in place
			$array[$index] = $this->conv($array[$index],$fromCS,$toCS,$useEntityForNoChar);
			continue;
		}
			// Sub-array: descend
		$this->convArray($array[$index],$fromCS,$toCS,$useEntityForNoChar);
	}
}
670
/**
 * Converts $str from $charset to UTF-8
 *
 * Bytes 0-127 are copied unchanged (except in two-byte charsets, where every byte pair
 * is looked up). Characters not found in the conversion table are replaced by
 * chr($this->noCharByteVal).
 *
 * @param	string		String in local charset to convert to UTF-8
 * @param	string		Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return	string		Output string, converted to UTF-8. Nothing is returned if initCharset() fails for $charset.
 */
function utf8_encode($str,$charset) {

	if ($charset === 'utf-8') return $str;

		// Charset is case-insensitive.
	if ($this->initCharset($charset)) {	// Parse conv. table if not already...
		$strLen = strlen($str);
		$outStr = '';

			// These flags do not change while traversing the string
		$isTwoByteSet = isset($this->twoByteSets[$charset]);
		$isEucBasedSet = isset($this->eucBasedSets[$charset]);

		for ($a=0; $a<$strLen; $a++) {	// Traverse each char in string.
			$chr = substr($str,$a,1);
			$ord = ord($chr);

			if ($isTwoByteSet) {	// If the charset has two bytes per char
				$a++;
					// A missing second byte (odd string length) counts as 0 instead of reading past the end of the string
				$ord2 = ($a < $strLen) ? ord(substr($str,$a,1)) : 0;
				$ord = $ord<<8 | $ord2;	// assume big endian

				if (isset($this->parsedCharsets[$charset]['local'][$ord])) {	// If the local char-number was found in parsed conv. table then we use that...
					$outStr .= $this->parsedCharsets[$charset]['local'][$ord];
				} else {
					$outStr .= chr($this->noCharByteVal);	// No char exists
				}
			} elseif ($ord > 127) {	// A value over 127 becomes a multibyte char in UTF-8
				if ($isEucBasedSet) {	// EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
					if ($charset != 'shift_jis' || ($ord < 0xA0 || $ord > 0xDF)) {	// Shift-JIS: chars between 160 and 223 are single byte
						$a++;
						$ord2 = ($a < $strLen) ? ord(substr($str,$a,1)) : 0;
						$ord = $ord*256 + $ord2;
					}
				}

				if (isset($this->parsedCharsets[$charset]['local'][$ord])) {	// If the local char-number was found in parsed conv. table then we use that...
					$outStr .= $this->parsedCharsets[$charset]['local'][$ord];
				} else {
					$outStr .= chr($this->noCharByteVal);	// No char exists
				}
			} else {
				$outStr .= $chr;	// ... otherwise it's just ASCII 0-127 and one byte. Transparent
			}
		}
		return $outStr;
	}
}
715
/**
 * Converts $str from UTF-8 to $charset
 *
 * ASCII bytes are copied unchanged. Every multibyte sequence is looked up in the conversion
 * table of $charset; unknown sequences become a numeric entity or chr($this->noCharByteVal),
 * depending on $useEntityForNoChar. A continuation byte found where a lead byte is expected
 * is replaced by chr($this->noCharByteVal).
 *
 * @param	string		String in UTF-8 to convert to local charset
 * @param	string		Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return	string		Output string, converted to local charset. Nothing is returned if initCharset() fails for $charset.
 */
function utf8_decode($str,$charset,$useEntityForNoChar=0) {

	if ($charset === 'utf-8') {
		return $str;
	}

		// Charset is case-insensitive. Parse conv. table if not already...
	if (!$this->initCharset($charset)) {
		return;
	}

	$result = '';
	$total = strlen($str);
	$pos = 0;

	while ($pos < $total) {	// Traverse each char in UTF-8 string.
		$byte = substr($str,$pos,1);
		$code = ord($byte);

		if ($code <= 127) {
				// Plain ASCII 0-127 and one byte. Transparent
			$result .= $byte;
		} elseif (!($code & 64)) {
				// A lead byte must have the 7th bit set; this one is in the MIDDLE of a MB sequence
			$result .= chr($this->noCharByteVal);
		} else {
				// Collect the sequence: each further high bit of the lead byte announces one more byte
			$sequence = $byte;
			$shifted = $code << 1;
			for ($n=0; $n<8 && ($shifted & 128); $n++) {
				$pos++;
				$sequence .= substr($str,$pos,1);
				$shifted = $shifted << 1;
			}

			if (isset($this->parsedCharsets[$charset]['utf8'][$sequence])) {	// The UTF-8 char-sequence is known
				$localNumber = $this->parsedCharsets[$charset]['utf8'][$sequence];
				if ($localNumber > 255) {
						// Local numbers above 255 are written as two bytes (16bit word assumed)
					$result .= chr(($localNumber >> 8) & 255).chr($localNumber & 255);
				} else {
					$result .= chr($localNumber);
				}
			} elseif ($useEntityForNoChar) {
					// Create num entity
				$result .= '&#'.$this->utf8CharToUnumber($sequence,1).';';
			} else {
					// No char exists
				$result .= chr($this->noCharByteVal);
			}
		}
		$pos++;
	}

	return $result;
}
764
/**
 * Converts all chars > 127 to numeric entities.
 * The entities are written in hexadecimal notation (&#x...;). A continuation byte found
 * where a lead byte is expected is replaced by chr($this->noCharByteVal).
 *
 * @param	string		Input string
 * @return	string		Output string
 */
function utf8_to_entities($str) {
	$result = '';
	$total = strlen($str);
	$pos = 0;

	while ($pos < $total) {	// Traverse each char in UTF-8 string.
		$byte = substr($str,$pos,1);
		$code = ord($byte);

		if ($code <= 127) {
				// Plain ASCII 0-127 and one byte. Transparent
			$result .= $byte;
		} elseif (!($code & 64)) {
				// A lead byte must have the 7th bit set; this one is in the MIDDLE of a MB sequence
			$result .= chr($this->noCharByteVal);
		} else {
				// Collect the sequence: each further high bit of the lead byte announces one more byte
			$sequence = $byte;
			$shifted = $code << 1;
			for ($n=0; $n<8 && ($shifted & 128); $n++) {
				$pos++;
				$sequence .= substr($str,$pos,1);
				$shifted = $shifted << 1;
			}
			$result .= '&#'.$this->utf8CharToUnumber($sequence,1).';';
		}
		$pos++;
	}

	return $result;
}
796
/**
 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
 *
 * Entities that are neither well-formed numeric entities nor (with $alsoStdHtmlEnt set)
 * known HTML entities are left untouched.
 *
 * @param	string		Input string, UTF-8
 * @param	boolean		If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
 * @return	string		Output string
 */
function entities_to_utf8($str,$alsoStdHtmlEnt=0) {
	$trans_tbl = array();
	if ($alsoStdHtmlEnt) {
			// The table is requested explicitly in iso-8859-1 because the characters are converted from that
			// charset below; without the explicit encoding newer PHP versions deliver UTF-8 characters.
		$trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'ISO-8859-1'));
	}

		// Splits into: text, entity name, text, entity name, ... - the entity names sit on the odd indexes.
		// (PCRE replaces the ereg functions, which do not exist anymore in PHP 7 and later.)
	$parts = preg_split('/&([#[:alnum:]]*);/', $str, -1, PREG_SPLIT_DELIM_CAPTURE);

	foreach ($parts as $k => $v) {
		if ($k%2) {
			$matches = array();
			if (preg_match('/^#[xX]([0-9a-fA-F]+)$/', $v, $matches)) {	// Hex entities:
				$parts[$k] = $this->UnumberToChar(hexdec($matches[1]));
			} elseif (preg_match('/^#([0-9]+)$/', $v, $matches)) {	// Dec entities:
				$parts[$k] = $this->UnumberToChar((int)$matches[1]);
			} elseif ($alsoStdHtmlEnt && isset($trans_tbl['&'.$v.';'])) {	// Other entities:
				$parts[$k] = $this->utf8_encode($trans_tbl['&'.$v.';'],'iso-8859-1');
			} else {	// No conversion (this includes malformed numeric entities like "&#;" or "&#12ab;"):
				$parts[$k] = '&'.$v.';';
			}
		}
	}

	return implode('',$parts);
}
829
/**
 * Converts all chars in the input UTF-8 string into integer numbers returned in an array
 * A continuation byte found where a lead byte is expected is reported as $this->noCharByteVal.
 *
 * @param	string		Input string, UTF-8
 * @param	boolean		If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
 * @param	boolean		If set, then instead of integer numbers the real UTF-8 char is returned.
 * @return	array		Output array with the char numbers
 */
function utf8_to_numberarray($str,$convEntities=0,$retChar=0) {
		// If entities must be registered as well...:
	if ($convEntities) {
		$str = $this->entities_to_utf8($str,1);
	}

		// Do conversion:
	$result = array();
	$total = strlen($str);
	$pos = 0;

	while ($pos < $total) {	// Traverse each char in UTF-8 string.
		$byte = substr($str,$pos,1);
		$code = ord($byte);

		if ($code <= 127) {
				// Plain ASCII 0-127 and one byte. Transparent
			$result[] = $retChar ? chr($code) : $code;
		} elseif (!($code & 64)) {
				// A lead byte must have the 7th bit set; this one is in the MIDDLE of a MB sequence
			$result[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
		} else {
				// Collect the sequence: each further high bit of the lead byte announces one more byte
			$sequence = $byte;
			$shifted = $code << 1;
			for ($n=0; $n<8 && ($shifted & 128); $n++) {
				$pos++;
				$sequence .= substr($str,$pos,1);
				$shifted = $shifted << 1;
			}
			$result[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
		}
		$pos++;
	}

	return $result;
}
868
/**
 * Converts a UNICODE number to a UTF-8 multibyte character
 * Algorithm based on script found at From: http://czyborra.com/utf/
 *
 * The bits of the number are spread across the bytes; the number of high bits set in the
 * lead byte announces the number of bytes in the multibyte sequence:
 *
 *  bytes | bits | representation
 *      1 |    7 | 0vvvvvvv
 *      2 |   11 | 110vvvvv 10vvvvvv
 *      3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
 *      4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * Numbers of 0x80000000 and above yield chr($this->noCharByteVal).
 *
 * @param	integer		UNICODE integer
 * @return	string		UTF-8 multibyte character string
 * @see utf8CharToUnumber()
 */
function UnumberToChar($cbyte) {
		// One byte: plain ASCII
	if ($cbyte < 0x80) {
		return chr($cbyte);
	}

		// Find the number of continuation bytes and the marker bits of the lead byte
	if ($cbyte < 0x800) {
		$trailing = 1;
		$leadMarker = 0xC0;
	} elseif ($cbyte < 0x10000) {
		$trailing = 2;
		$leadMarker = 0xE0;
	} elseif ($cbyte < 0x200000) {
		$trailing = 3;
		$leadMarker = 0xF0;
	} elseif ($cbyte < 0x4000000) {
		$trailing = 4;
		$leadMarker = 0xF8;
	} elseif ($cbyte < 0x80000000) {
		$trailing = 5;
		$leadMarker = 0xFC;
	} else {
			// Cannot express a 32-bit character in UTF-8
		return chr($this->noCharByteVal);
	}

		// Lead byte carries the topmost bits, every continuation byte the next six bits
	$char = chr($leadMarker | ($cbyte >> (6 * $trailing)));
	for ($group = $trailing - 1; $group >= 0; $group--) {
		$char .= chr(0x80 | (($cbyte >> (6 * $group)) & 0x3F));
	}

	return $char;
}
923
924 /**
925 * Converts a UTF-8 Multibyte character to a UNICODE number
926 * Unit-tested by Kasper
927 *
928 * @param string UTF-8 multibyte character string
929 * @param boolean If set, then a hex. number is returned.
930 * @return integer UNICODE integer
931 * @see UnumberToChar()
932 */
function utf8CharToUnumber($str,$hex=0) {
	// The first byte decides whether this is a multibyte sequence
	$lead = ord(substr($str,0,1));

	if (($lead & 192) != 192) {
		// Not a multibyte lead byte (11xxxxxx): the byte value itself is the result
		$number = $lead;
	} else {
		$trailBits = '';	// collected payload bits of the trailing bytes, as a string of 0/1
		$probe = $lead;
		$count = 0;		// number of trailing bytes announced by the lead byte
		while ($count < 8) {
			// Shift the lead byte left; while the 8th bit is still set another byte follows
			$probe = $probe << 1;
			if (!($probe & 128)) {
				break;
			}
			// Take the lower six bits of the next byte in the sequence
			$trailBits .= substr('00000000'.decbin(ord(substr($str,$count+1,1))),-6);
			$count++;
		}
		// Payload bits of the lead byte go in front of the bits from the trailing bytes
		$number = bindec(substr('00000000'.decbin($lead),-(6-$count)).$trailBits);
	}

	if ($hex) {
		return 'x'.dechex($number);
	}
	return $number;
}
951
952
953
954
955
956
957
958
959
960 /********************************************
961 *
962 * Init functions
963 *
964 ********************************************/
965
966 /**
967 * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
968 * This function is automatically called by the conversion functions
969 *
970 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
971 *
972 * @param string The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
973 * @return integer Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed.
974 * @access private
975 */
function initCharset($charset) {
	// Loads the conversion table [charset].tbl from PATH_t3lib.'csconvtbl/' into
	// $this->parsedCharsets[$charset] as array('local' => byte value => UTF-8 char,
	// 'utf8' => UTF-8 char => byte value). Only byte values above 127 are stored.
	// Returns 1 if already loaded, 2 if loaded (from cache or by parsing), FALSE if no table file exists.

	// Only process if the charset is not yet loaded:
	if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
		return 1;
	}

	// Conversion table filename:
	$charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';
	if (!($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile))) {
		return false;
	}

	// Cache file for charsets:
	// Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$cached = unserialize(t3lib_div::getUrl($cacheFile));
		// Only trust the cache if it really contains a table; otherwise fall through and re-parse
		if (is_array($cached)) {
			$this->parsedCharsets[$charset] = $cached;
			return 2;
		}
	}

	// The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
	$whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';

	// Parse conversion table into lines:
	$lines = t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1);
	// Initialize the internal variable holding the conv. table:
	$this->parsedCharsets[$charset] = array('local'=>array(),'utf8'=>array());

	$detectedType = '';
	foreach ($lines as $value) {
		// Comment lines and blanks are ignored
		if (!trim($value) || substr($value,0,1)=='#') {
			continue;
		}

		// Detect type if not done yet (done on the first real line)
		if (!$detectedType) {
			$detectedType = preg_match($whitespacedPattern,$value) ? 'whitespaced' : 'ms-token';
		}

		if ($detectedType=='ms-token') {
			// Pad so that a line without separator does not raise an undefined offset notice
			list($hexbyte, $utf8) = array_pad(preg_split('/[=:]/', $value, 3), 2, '');
		} else {
			$regA = array();
			if (!preg_match($whitespacedPattern,$value,$regA)) {
				// Line does not carry a mapping (it would have produced byte value 0, which is never stored)
				continue;
			}
			$hexbyte = $regA[1];
			$utf8 = 'U+'.$regA[2];
		}

		$decval = hexdec(trim($hexbyte));
		if ($decval>127) {
			// Strip the leading "U+" before converting the code point
			$utf8decval = hexdec(substr(trim($utf8),2));
			$utf8Char = $this->UnumberToChar($utf8decval);
			$this->parsedCharsets[$charset]['local'][$decval] = $utf8Char;
			$this->parsedCharsets[$charset]['utf8'][$utf8Char] = $decval;
		}
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->parsedCharsets[$charset]));
	}

	return 2;
}
1028
1029 /**
1030 * This function initializes all UTF-8 character data tables.
1031 *
1032 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
1033 *
1034 * @param string Mode ("case", "ascii", ...)
1035 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
1036 * @access private
1037 */
function initUnicodeData($mode=null) {
	// Builds $this->caseFolding['utf-8'] (toUpper/toLower/toTitle) and $this->toASCII['utf-8']
	// from the files in PATH_t3lib.'unidata/' and writes both tables to their cache files.
	// Returns 1 if the table requested by $mode is already loaded, 2 if it was read from cache,
	// 3 if the data files were parsed, FALSE if UnicodeData.txt is missing or cannot be opened.

	// cache files
	$cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
	$cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');

	// Only process if the tables are not yet loaded
	switch($mode) {
		case 'case':
			if (is_array($this->caseFolding['utf-8'])) return 1;

			// Use cached version if possible
			if ($cacheFileCase && @is_file($cacheFileCase)) {
				$this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase));
				return 2;
			}
		break;

		case 'ascii':
			if (is_array($this->toASCII['utf-8'])) return 1;

			// Use cached version if possible
			if ($cacheFileASCII && @is_file($cacheFileASCII)) {
				$this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII));
				return 2;
			}
		break;
	}

	// process main Unicode data file
	$unicodeDataFile = PATH_t3lib.'unidata/UnicodeData.txt';
	if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) return false;

	$fh = fopen($unicodeDataFile,'rb');
	if (!$fh) return false;

	// key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
	// note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
	$this->caseFolding['utf-8'] = array();
	$utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand
	$utf8CaseFolding['toUpper'] = array();
	$utf8CaseFolding['toLower'] = array();
	$utf8CaseFolding['toTitle'] = array();

	$decomposition = array();	// array of temp. decompositions
	$mark = array();		// array of chars that are marks (eg. composing accents)
	$number = array();		// array of chars that are numbers (eg. digits)
	$omit = array();		// array of chars to be omitted (eg. Russian hard sign)

	while (!feof($fh)) {
		$line = fgets($fh,4096);
		// Blank lines (and the failed read at end of file) carry no data
		if ($line === false || trim($line) == '') continue;

		// A line has a lot of info; pick the fields used here by their position
		$fields = array_pad(explode(';', rtrim($line)), 15, '');
		$char = $fields[0];
		$name = $fields[1];
		$cat = $fields[2];
		$decomp = $fields[5];
		$num = $fields[8];
		$upper = $fields[12];
		$lower = $fields[13];
		$title = $fields[14];

		$ord = hexdec($char);
		if ($ord > 0xFFFF) break;	// only process the BMP

		$utf8_char = $this->UnumberToChar($ord);

		if ($upper) $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
		if ($lower) $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
		// store "title" only when different from "upper" (only a few)
		if ($title && $title != $upper) $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));

		// First letter of the general category
		switch (substr($cat,0,1)) {
			case 'M':	// mark (accent, umlaut, ...)
				$mark["U+$char"] = 1;
			break;

			case 'N':	// numeric value
				if ($ord > 0x80 && $num != '') $number["U+$char"] = $num;
			break;
		}

		// accented Latin letters without "official" decomposition
		$match = array();
		if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/',$name,$match) && !$decomp) {
			$c = ord($match[2]);
			if ($match[1] == 'SMALL') $c += 32;

			$decomposition["U+$char"] = array(dechex($c));
			continue;
		}

		$match = array();
		if (preg_match('/(<.*>)? *(.+)/',$decomp,$match)) {
			switch($match[1]) {
				case '<circle>':	// add parenthesis as circle replacement, eg (1)
					$match[2] = '0028 '.$match[2].' 0029';
				break;

				case '<square>':	// add square brackets as square replacement, eg [1]
					$match[2] = '005B '.$match[2].' 005D';
				break;

				case '<compat>':	// ignore multi char decompositions that start with a space
					if (preg_match('/^0020 /',$match[2])) continue 2;
				break;

					// ignore Arabic and vertical layout presentation decomposition
				case '<initial>':
				case '<medial>':
				case '<final>':
				case '<isolated>':
				case '<vertical>':
					continue 2;
			}
			$decomposition["U+$char"] = explode(' ', $match[2]);
		}
	}
	fclose($fh);

	// process additional Unicode data for casing (allow folded characters to expand into a sequence)
	$specialCasingFile = PATH_t3lib.'unidata/SpecialCasing.txt';
	if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
		$fh = fopen($specialCasingFile,'rb');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh,4096);
				if (substr($line,0,1) != '#' && trim($line) != '') {

					list($char,$lower,$title,$upper,$cond) = array_pad(t3lib_div::trimExplode(';', $line), 5, '');
					// Only unconditional mappings: the condition field is empty or already the trailing comment
					if ($cond == '' || substr($cond,0,1) == '#') {
						$utf8_char = $this->UnumberToChar(hexdec($char));
						if ($char != $lower) {
							$arr = explode(' ', $lower);
							for ($i=0; isset($arr[$i]); $i++) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
							$utf8CaseFolding['toLower'][$utf8_char] = implode('',$arr);
						}
						if ($char != $title && $title != $upper) {
							$arr = explode(' ', $title);
							for ($i=0; isset($arr[$i]); $i++) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
							$utf8CaseFolding['toTitle'][$utf8_char] = implode('',$arr);
						}
						if ($char != $upper) {
							$arr = explode(' ', $upper);
							for ($i=0; isset($arr[$i]); $i++) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
							$utf8CaseFolding['toUpper'][$utf8_char] = implode('',$arr);
						}
					}
				}
			}
			fclose($fh);
		}
	}

	// process custom decompositions
	$customTranslitFile = PATH_t3lib.'unidata/Translit.txt';
	if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
		$fh = fopen($customTranslitFile,'rb');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh,4096);
				if (substr($line,0,1) != '#' && trim($line) != '') {
					list($char,$translit) = array_pad(t3lib_div::trimExplode(';', $line), 2, '');
					// An empty transliteration means the character is to be dropped
					if (!$translit) $omit["U+$char"] = 1;
					$decomposition["U+$char"] = explode(' ', $translit);
				}
			}
			fclose($fh);
		}
	}

	// decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
	foreach($decomposition as $from => $to) {
		$code_decomp = array();

		while ($code_value = array_shift($to)) {
			if (isset($decomposition["U+$code_value"])) {	// do recursive decomposition
				foreach(array_reverse($decomposition["U+$code_value"]) as $cv) {
					array_unshift($to, $cv);
				}
			} elseif (!isset($mark["U+$code_value"])) {	// remove mark
				array_push($code_decomp, $code_value);
			}
		}
		if (count($code_decomp) || isset($omit[$from])) {
			$decomposition[$from] = $code_decomp;
		} else {
			unset($decomposition[$from]);
		}
	}

	// create ascii only mapping
	$this->toASCII['utf-8'] = array();
	$ascii =& $this->toASCII['utf-8'];

	foreach($decomposition as $from => $to) {
		$code_decomp = array();
		while ($code_value = array_shift($to)) {
			$ord = hexdec($code_value);
			if ($ord > 127)
				continue 2;	// skip decompositions containing non-ASCII chars
			else
				array_push($code_decomp,chr($ord));
		}
		// Keys have the form "U+XXXX": strip the prefix before converting the hex number
		$ascii[$this->UnumberToChar(hexdec(substr($from,2)))] = join('',$code_decomp);
	}

	// add numeric decompositions
	foreach($number as $from => $to) {
		$utf8_char = $this->UnumberToChar(hexdec(substr($from,2)));
		if (!isset($ascii[$utf8_char])) {
			$ascii[$utf8_char] = $to;
		}
	}

	if ($cacheFileCase) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileCase,serialize($utf8CaseFolding));
	}

	if ($cacheFileASCII) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileASCII,serialize($ascii));
	}

	return 3;
}
1254
1255 /**
1256 * This function initializes the folding table for a charset other than UTF-8.
1257 * This function is automatically called by the case folding functions.
1258 *
1259 * @param string Charset for which to initialize case folding.
1260 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
1261 * @access private
1262 */
function initCaseFolding($charset) {
	// Table already in memory?
	if (is_array($this->caseFolding[$charset])) {
		return 1;
	}

	// Try the cached table first
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

	// Both the charset <-> UTF-8 table and the UTF-8 case folding table are required as base data
	if (!$this->initCharset($charset)) {
		return false;
	}
	if (!$this->initUnicodeData('case')) {
		return false;
	}

	$nochar = chr($this->noCharByteVal);
	$directions = array('toUpper', 'toLower', 'toTitle');

	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
		// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);

		// Translate the UTF-8 mapping of this character for every folding direction;
		// keep it only if the result can be expressed in the charset
		foreach ($directions as $direction) {
			$cc = $this->utf8_decode($this->caseFolding['utf-8'][$direction][$utf8], $charset);
			if ($cc != '' && $cc != $nochar) {
				$this->caseFolding[$charset][$direction][$c] = $cc;
			}
		}
	}

	// add the ASCII case table (a-z <-> A-Z, 32 apart in the byte values)
	for ($i=ord('a'); $i<=ord('z'); $i++) {
		$this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i-32);
	}
	for ($i=ord('A'); $i<=ord('Z'); $i++) {
		$this->caseFolding[$charset]['toLower'][chr($i)] = chr($i+32);
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->caseFolding[$charset]));
	}

	return 3;
}
1316
1317 /**
1318 * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
1319 * This function is automatically called by the ASCII transliteration functions.
1320 *
1321 * @param string Charset for which to initialize conversion.
1322 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
1323 * @access private
1324 */
function initToASCII($charset) {
	// Derives $this->toASCII[$charset] from the UTF-8 transliteration table by translating
	// every character of the charset table back into the charset.
	// Returns 1 if already loaded, 2 if read from cache, 3 if built, FALSE if the base tables are unavailable.

	// Only process if the table is not yet loaded:
	if (is_array($this->toASCII[$charset])) return 1;

	// Use cached version if possible
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->toASCII[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

	// init UTF-8 conversion for this charset
	if (!$this->initCharset($charset)) {
		return false;
	}

	// UTF-8/ASCII transliteration is used as the base conversion table
	if (!$this->initUnicodeData('ascii')) {
		return false;
	}

	// Make sure the table is an array even if no character of the charset has a transliteration;
	// otherwise NULL would be cached and the "already loaded" check above could never succeed
	if (!is_array($this->toASCII[$charset])) {
		$this->toASCII[$charset] = array();
	}

	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
		// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);

		if (isset($this->toASCII['utf-8'][$utf8])) {
			$this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
		}
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->toASCII[$charset]));
	}

	return 3;
}
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378 /********************************************
1379 *
1380 * String operation functions
1381 *
1382 ********************************************/
1383
1384 /**
1385 * Returns a part of a string.
1386 * Unit-tested by Kasper (single byte charsets only)
1387 *
1388 * @param string The character set
1389 * @param string Character string
1390 * @param integer Start position (character position)
1391 * @param integer Length (in characters)
1392 * @return string The substring
1393 * @see substr(), mb_substr()
1394 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1395 */
function substr($charset,$string,$start,$len=null) {
	// Charset aware substr(): $start and $len are counted in characters.
	// The backend is selected by $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] ('mbstring' or 'iconv');
	// otherwise the class' own implementations are used, depending on the kind of charset.
	if ($len===0) return '';

	$utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];

	if ($utils == 'mbstring') {
			// cannot omit $len, when specifying charset
		if ($len==null) {
			$enc = mb_internal_encoding();	// save internal encoding
			mb_internal_encoding($charset);
			$str = mb_substr($string,$start);
			mb_internal_encoding($enc);	// restore internal encoding

			return $str;
		}
		return mb_substr($string,$start,$len,$charset);
	} elseif ($utils == 'iconv') {
			// cannot omit $len, when specifying charset
		if ($len==null) {
			$enc = iconv_get_encoding('internal_encoding');	// save internal encoding
			iconv_set_encoding('internal_encoding',$charset);
			$str = iconv_substr($string,$start);
			iconv_set_encoding('internal_encoding',$enc);	// restore internal encoding

			return $str;
		}
		return iconv_substr($string,$start,$len,$charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_substr($string,$start,$len);
	} elseif ($this->eucBasedSets[$charset]) {
		return $this->euc_substr($string,$start,$charset,$len);
	} elseif ($this->twoByteSets[$charset]) {
			// fixed width, two bytes per character; without $len the rest of the string is wanted
			// (multiplying NULL would give a length of 0 and thereby an empty result)
		return $len === NULL ? substr($string,$start*2) : substr($string,$start*2,$len*2);
	} elseif ($this->fourByteSets[$charset]) {
			// fixed width, four bytes per character
		return $len === NULL ? substr($string,$start*4) : substr($string,$start*4,$len*4);
	}

	// treat everything else as single-byte encoding
	return $len === NULL ? substr($string,$start) : substr($string,$start,$len);
}
1438
1439 /**
1440 * Counts the number of characters.
1441 * Unit-tested by Kasper (single byte charsets only)
1442 *
1443 * @param string The character set
1444 * @param string Character string
1445 * @return integer The number of characters
1446 * @see strlen()
1447 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1448 */
function strlen($charset,$string) {
	// Configured backend, if any
	$utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];

	if ($utils == 'mbstring') {
		return mb_strlen($string,$charset);
	}
	if ($utils == 'iconv') {
		return iconv_strlen($string,$charset);
	}

	// No extension configured: use the class' own counting, depending on the kind of charset
	if ($charset == 'utf-8') {
		return $this->utf8_strlen($string);
	}
	if ($this->eucBasedSets[$charset]) {
		return $this->euc_strlen($string,$charset);
	}
	if ($this->twoByteSets[$charset]) {
		// fixed width: two bytes per character
		return strlen($string)/2;
	}
	if ($this->fourByteSets[$charset]) {
		// fixed width: four bytes per character
		return strlen($string)/4;
	}

	// treat everything else as single-byte encoding
	return strlen($string);
}
1466
1467 /**
1468 * Truncates a string and pre-/appends a string.
1469 * Unit tested by Kasper
1470 *
1471 * @param string The character set
1472 * @param string Character string
1473 * @param integer Length (in characters)
1474 * @param string Crop signifier
1475 * @return string The shortened string
1476 * @see substr(), mb_strimwidth()
1477 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1478 */
function crop($charset,$string,$len,$crop='') {
	// Shortens $string to $len characters and adds $crop at the cut:
	// a positive $len keeps the beginning ($crop appended), a negative $len keeps the end ($crop prepended).
	// The string is returned unchanged if $len is 0 or reaches beyond the string.
	if (intval($len) == 0) return $string;

	// Translate the character position into the byte position of the cut
	if ($charset == 'utf-8') {
		$i = $this->utf8_char2byte_pos($string,$len);
	} elseif ($this->eucBasedSets[$charset]) {
		$i = $this->euc_char2byte_pos($string,$len,$charset);
	} else {
		if ($len > 0) {
			$i = $len;
		} else {
			$i = strlen($string)+$len;
			if ($i<=0) $i = false;
		}
	}

	if ($i === false) {	// $len outside actual string length
		return $string;
	}

	// Crop only if there is a byte at (resp. right before) the cut position, ie. if something gets removed.
	// Explicit bounds are used instead of reading a possibly non-existing string offset.
	$byteLength = strlen($string);
	if ($len > 0) {
		if ($i >= 0 && $i < $byteLength) {
			return substr($string,0,$i).$crop;
		}
	} else {
		if ($i-1 >= 0 && $i-1 < $byteLength) {
			return $crop.substr($string,$i);
		}
	}

	return $string;
}
1521
1522 /**
1523 * Cuts a string short at a given byte length.
1524 *
1525 * @param string The character set
1526 * @param string Character string
1527 * @param integer The byte length
1528 * @return string The shortened string
1529 * @see mb_strcut()
1530 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1531 */
function strtrunc($charset,$string,$len) {
	// Cuts $string at $len bytes without splitting a multibyte character.
	// A $len of zero or less gives an empty string.
	if ($len <= 0) return '';

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strcut($string,0,$len,$charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_strtrunc($string,$len);
	} elseif ($this->eucBasedSets[$charset]) {
			// The byte length has to be handed on as well.
			// NOTE(review): euc_strtrunc() is defined outside this part of the file; the argument order
			// ($str,$len,$charset) is assumed to match euc_char2byte_pos() - confirm against its definition.
		return $this->euc_strtrunc($string,$len,$charset);
	} elseif ($this->twoByteSets[$charset]) {
		if ($len % 2) $len--;		// don't cut at odd positions
	} elseif ($this->fourByteSets[$charset]) {
		$len -= $len % 4;		// realign to position dividable by four
	}

	// fixed width charsets (with the realigned length) and everything else, treated as single-byte encoding
	return substr($string,0,$len);
}
1550
1551 /**
1552 * Translates all characters of a string into their respective case values.
1553 * Unlike strtolower() and strtoupper() this method is locale independent.
1554 * Note that the string length may change!
1555 * eg. lower case German ß (sharp S) becomes upper case "SS"
1556 * Unit-tested by Kasper
1557 * Real case folding is language dependent, this method ignores this fact.
1558 *
1559 * @param string Character set of string
1560 * @param string Input string to convert case for
1561 * @param string Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
1562 * @return string The converted string
1563 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1564 * @see strtolower(), strtoupper()
1565 */
function conv_case($charset,$string,$case) {
	// With mbstring configured the extension does the work: "toLower" lowers, anything else uppers
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return $case == 'toLower'
			? mb_strtolower($string,$charset)
			: mb_strtoupper($string,$charset);
	}

	// Otherwise map character by character with the class' own tables
	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string,'case',$case);
	}
	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string,$charset,'case',$case);
	}

	// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string,$charset,'case',$case);
}
1584
1585 /**
1586 * Converts special chars (like æøå, umlauts etc) to ascii equivalents (usually double-bytes, like æ => ae etc.)
1587 *
1588 * @param string Character set of string
1589 * @param string Input string to convert
1590 * @return string The converted string
1591 */
function specCharsToASCII($charset,$string) {
	// Pick the mapping routine that fits the kind of charset
	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string,'ascii');
	}
	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string,$charset,'ascii');
	}

	// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string,$charset,'ascii');
}
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616 /********************************************
1617 *
1618 * Internal string operation functions
1619 *
1620 ********************************************/
1621
1622 /**
1623 * Maps all characters of a string in a single byte charset.
1624 *
1625 * @param string the string
1626 * @param string the charset
1627 * @param string mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
1628 * @param string 'case': conversion 'toLower' or 'toUpper'
1629 * @return string the converted string
1630 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1631 */
function sb_char_mapping($str,$charset,$mode,$opt='') {
	// Replaces every byte of $str that has an entry in the selected table
	// (case folding table for $opt, or the to-ASCII table) by the table value.
	// The string is returned unchanged for an unknown $mode or if the table cannot be initialized.
	switch($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) return $str;	// do nothing
			$map =& $this->caseFolding[$charset][$opt];
		break;

		case 'ascii':
			if (!$this->initToASCII($charset)) return $str;	// do nothing
			$map =& $this->toASCII[$charset];
		break;

		default:
			return $str;
	}

	$out = '';
	// Iterate by length instead of probing a string offset beyond the end of the string
	$byteLength = strlen($str);
	for ($i=0; $i<$byteLength; $i++) {
		$c = $str[$i];
		$out .= isset($map[$c]) ? $map[$c] : $c;
	}

	return $out;
}
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670 /********************************************
1671 *
1672 * Internal UTF-8 string operation functions
1673 *
1674 ********************************************/
1675
1676 /**
1677 * Returns a part of a UTF-8 string.
1678 * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
1679 *
1680 * @param string UTF-8 string
1681 * @param integer Start position (character position)
1682 * @param integer Length (in characters)
1683 * @return string The substring
1684 * @see substr()
1685 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1686 */
function utf8_substr($str,$start,$len=null) {
	// Returns $len characters of the UTF-8 string $str beginning at character position $start.
	// Returns FALSE if a positive $start lies beyond the string.

	// An explicit length of zero always gives an empty string
	if ($len !== null && !strcmp($len,'0')) return '';

	$byte_start = $this->utf8_char2byte_pos($str,$start);
	if ($byte_start === false) {
		if ($start > 0) {
			return false;	// $start outside string length
		}
		// a negative $start reaching before the beginning means: from the very first byte
		$byte_start = 0;
	}

	$str = substr($str,$byte_start);

	if ($len!=null) {
		$byte_end = $this->utf8_char2byte_pos($str,$len);
		if ($byte_end === false) {	// $len outside actual string length
			return $len<0 ? '' : $str;	// When length is less than zero and exceeds, then we return blank string.
		}
		return substr($str,0,$byte_end);
	}

	return $str;
}
1710
1711 /**
1712 * Counts the number of characters of a string in UTF-8.
1713 * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
1714 *
1715 * @param string UTF-8 multibyte character string
1716 * @return integer The number of characters
1717 * @see strlen()
1718 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1719 */
function utf8_strlen($str) {
	// Counts the characters of a UTF-8 string by counting the bytes that start a character:
	// single bytes (0xxxxxxx) and multibyte lead bytes (11xxxxxx); continuation bytes (10xxxxxx) are skipped.
	$n = 0;
	$byteLength = strlen($str);
	for ($i=0; $i<$byteLength; $i++) {
		if ((ord($str[$i]) & 0xC0) != 0x80) {
			$n++;
		}
	}
	return $n;
}
1731
1732 /**
1733 * Truncates a string in UTF-8 short at a given byte length.
1734 *
1735 * @param string UTF-8 multibyte character string
1736 * @param integer the byte length
1737 * @return string the shortened string
1738 * @see mb_strcut()
1739 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1740 */
function utf8_strtrunc($str,$len) {
	// Cuts the UTF-8 string $str at $len bytes; a multibyte character that does not
	// completely fit into $len bytes is dropped as a whole.
	$i = $len-1;	// last byte that would be kept

	// Only look at the byte if it exists; otherwise substr() below does the right thing anyway
	if ($i >= 0 && $i < strlen($str) && (ord($str[$i]) & 0x80)) {	// part of a multibyte sequence
		// find the first byte of the sequence (it may well be the first byte of the string)
		while ($i >= 0 && !(ord($str[$i]) & 0x40)) {
			$i--;
		}
		if ($i < 0) return '';	// sanity check: no starting byte found at all

		// calculate number of bytes of the sequence from the high bits of its first byte
		for ($bc=0, $mbs=ord($str[$i]); $mbs & 0x80; $mbs = $mbs << 1) $bc++;

		// the sequence reaches beyond $len: cut right before it
		if ($bc+$i > $len) return substr($str,0,$i);
		// fallthru: multibyte char fits into length
	}
	return substr($str,0,$len);
}
1752
1753 /**
1754 * Find position of first occurrence of a string, both arguments are in UTF-8.
1755 *
1756 * @param string UTF-8 string to search in
1757 * @param string UTF-8 string to search for
1758 * @param integer Position to start the search
1759 * @return integer The character position
1760 * @see strpos()
1761 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1762 */
function utf8_strpos($haystack,$needle,$offset=0) {
	// Let a configured extension do the search
	$utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];
	if ($utils == 'mbstring') {
		return mb_strpos($haystack,$needle,$offset,'utf-8');
	}
	if ($utils == 'iconv') {
		return iconv_strpos($haystack,$needle,$offset,'utf-8');
	}

	// Own implementation: search on byte level, then translate the hit back into a character position
	$byteStart = $this->utf8_char2byte_pos($haystack,$offset);
	if ($byteStart !== false) {	// FALSE: offset beyond string length
		$byteHit = strpos($haystack,$needle,$byteStart);
		if ($byteHit !== false) {	// FALSE: needle not found
			return $this->utf8_byte2char_pos($haystack,$byteHit);
		}
	}

	return false;
}
1778
1779 /**
1780 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
1781 *
1782 * @param string UTF-8 string to search in
1783 * @param string UTF-8 character to search for (single character)
1784 * @return integer The character position
1785 * @see strrpos()
1786 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1787 */
function utf8_strrpos($haystack,$needle) {
	// Returns the character position of the last occurrence of $needle in the UTF-8 string $haystack,
	// or FALSE if it does not occur.
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
			// third argument of mb_strrpos() is the offset, the encoding comes fourth
		return mb_strrpos($haystack,$needle,0,'utf-8');
	} elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
		return iconv_strrpos($haystack,$needle,'utf-8');
	}

	// Own implementation: search on byte level, then translate the hit into a character position
	$byte_pos = strrpos($haystack,$needle);
	if ($byte_pos === false) return false;	// needle not found

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
1800
1801 /**
1802 * Translates a character position into an 'absolute' byte position.
1803 * Unit tested by Kasper.
1804 *
1805 * @param string UTF-8 string
1806 * @param integer Character position (negative values start from the end)
1807 * @return integer Byte position
1808 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1809 */
function utf8_char2byte_pos($str,$pos) {
	// Translates the character position $pos of the UTF-8 string $str into a byte position.
	// A negative $pos counts characters from the end of the string.
	// Returns FALSE if the scan runs off either end of the string.
	$n = 0;				// number of characters found
	$p = abs($pos);			// number of characters wanted
	$byteLength = strlen($str);

	if ($pos >= 0) {
		$i = 0;			// scan forward from the first byte
		$d = 1;
	} else {
		$i = $byteLength-1;	// scan backward from the last byte
		$d = -1;
	}

	// Walk over the bytes (staying inside the string) until enough character starts have been seen
	for ( ; $i >= 0 && $i < $byteLength && $n<$p; $i+=$d) {
		// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx); continuation bytes do not count
		if ((ord($str[$i]) & 0xC0) != 0x80) {
			$n++;
		}
	}
	if ($i < 0 || $i >= $byteLength) return false;	// offset beyond string length

	if ($pos >= 0) {
			// skip trailing multi-byte data bytes
		while ($i < $byteLength && (ord($str[$i]) & 0x80) && !(ord($str[$i]) & 0x40)) {
			$i++;
		}
	} else {
			// correct offset: the backward scan stopped one byte before the character start
		$i++;
	}

	return $i;
}
1841
1842 /**
1843 * Translates an 'absolute' byte position into a character position.
1844 * Unit tested by Kasper.
1845 *
1846 * @param string UTF-8 string
1847 * @param integer byte position
1848 * @return integer character position
1849 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1850 */
function utf8_byte2char_pos($str,$pos) {
		// Converts the byte offset $pos into a character offset by counting the
		// character start bytes in the range 1..$pos (byte 0 is deliberately not counted).
		// Returns false when $pos does not address a byte of $str. The check is done up
		// front: after the loop the index is always 0, so it cannot detect a bad $pos.
	if ($pos < 0 || $pos >= strlen($str)) return false;	// offset beyond string length

	$n = 0;	// number of characters
	for ($i = $pos; $i > 0; $i--) {
		$c = ord($str[$i]);
			// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		if (!($c & 0x80) || ($c & 0xC0) == 0xC0) {
			$n++;
		}
	}

	return $n;
}
1864
1865 /**
1866 * Maps all characters of an UTF-8 string.
1867 *
1868 * @param string UTF-8 string
1869 * @param string mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
1870 * @param string 'case': conversion 'toLower' or 'toUpper'
1871 * @return string the converted string
1872 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1873 */
function utf8_char_mapping($str,$mode,$opt='') {
		// Replaces every character of $str that has an entry in the selected mapping
		// table; characters without an entry are copied unchanged.
	if (!$this->initUnicodeData($mode)) return $str;	// do nothing

	switch ($mode) {
		case 'case':
			$map =& $this->caseFolding['utf-8'][$opt];
			break;

		case 'ascii':
			$map =& $this->toASCII['utf-8'];
			break;

		default:
			return $str;
	}

	$out = '';
	$len = strlen($str);
	for ($i = 0; $i < $len; $i++) {
		$c = ord($str[$i]);
		if (($c & 0xC0) == 0xC0) {	// multi-byte starting byte (11xxxxxx)
				// the number of leading 1 bits is the byte length of the sequence
			for ($bc = 0; $c & 0x80; $c = $c << 1) { $bc++; }
			$mbc = substr($str,$i,$bc);
			$i += $bc-1;
		} else {
				// single-byte (0xxxxxxx), or a stray continuation byte (10xxxxxx) which is
				// passed through as-is instead of repeating the previously seen character
			$mbc = $str[$i];
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928 /********************************************
1929 *
1930 * Internal EUC string operation functions
1931 *
1932 * Extended Unix Code:
1933 * ASCII compatible 7bit single bytes chars
1934 * 8bit two byte chars
1935 *
1936 * Shift-JIS is treated as a special case.
1937 *
1938 ********************************************/
1939
1940 /**
1941 * Cuts a string in the EUC charset family short at a given byte length.
1942 *
1943 * @param string EUC multibyte character string
1944 * @param integer the byte length
1945 * @param string the charset
1946 * @return string the shortened string
1947 * @see mb_strcut()
1948 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1949 */
function euc_strtrunc($str,$len,$charset) {
		// Shortens $str to at most $len bytes without cutting a double-byte character in half.
	$strLen = strlen($str);
		// Decide "nothing to cut" on the real byte length. The scan index below can jump
		// past the end of the string when the cut falls inside the final double-byte char.
	if ($strLen == 0 || $strLen <= $len) return $str;

	$sjis = ($charset == 'shift_jis');
	for ($i = 0; $i < $strLen && $i < $len; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++;	// advance a double-byte char
		} else {
			if ($c >= 0x80) $i++;	// advance a double-byte char
		}
	}

	if ($i > $len) {
		return substr($str,0,$len-1);	// we ended on a first byte
	}
	return substr($str,0,$len);
}
1968
1969 /**
1970 * Returns a part of a string in the EUC charset family.
1971 *
1972 * @param string EUC multibyte character string
1973 * @param integer start position (character position)
1974 * @param string the charset
1975 * @param integer length (in characters)
1976 * @return string the substring
1977 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1978 */
function euc_substr($str,$start,$charset,$len=null) {
		// Returns the part of $str beginning at character $start, limited to $len
		// characters when a length is given. Returns false if $start cannot be resolved.
	$byte_start = $this->euc_char2byte_pos($str,$start,$charset);
	if ($byte_start === false) return false;	// $start outside string length

	$str = substr($str,$byte_start);

		// No length given: return everything from $start on. A numeric 0 is a real length
		// and must not be mistaken for "not given", as the loose "!= null" check did.
		// '' and false keep their previous meaning of "no length".
	if ($len === null || $len === '' || $len === false) return $str;

	$byte_end = $this->euc_char2byte_pos($str,$len,$charset);
	if ($byte_end === false) {	// $len outside actual string length
		return $str;
	}
	return substr($str,0,$byte_end);
}
1994
1995 /**
1996 * Counts the number of characters of a string in the EUC charset family.
1997 *
1998 * @param string EUC multibyte character string
1999 * @param string the charset
2000 * @return integer the number of characters
2001 * @see strlen()
2002 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
2003 */
function euc_strlen($str,$charset) {
		// Counts characters by walking the bytes and skipping the second byte of every
		// double-byte character. The loop is bounded by strlen() rather than by probing
		// for a missing string offset.
	$sjis = ($charset == 'shift_jis');
	$byteLen = strlen($str);
	$n = 0;
	for ($i = 0; $i < $byteLen; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			$isLead = (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0));
		} else {
			$isLead = ($c >= 0x80);
		}
		if ($isLead) $i++;	// advance a double-byte char

		$n++;
	}

	return $n;
}
2021
2022 /**
2023 * Translates a character position into an 'absolute' byte position.
2024 *
2025 * @param string EUC multibyte character string
2026 * @param integer character position (negative values start from the end)
2027 * @param string the charset
2028 * @return integer byte position
2029 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
2030 */
function euc_char2byte_pos($str,$pos,$charset) {
		// Steps through the string one character at a time (forwards for $pos >= 0,
		// backwards otherwise) until abs($pos) characters have been passed, and returns
		// the byte offset reached, or false when the walk leaves the string.
		// Bounds are checked explicitly: negative string offsets wrap around on PHP >= 7.1.
		// NOTE(review): when walking backwards the lead-byte test is applied to the last
		// byte of each character; this looks unreliable for Shift-JIS - confirm.
	$sjis = ($charset == 'shift_jis');
	$len = strlen($str);
	$n = 0;			// number of characters seen
	$p = abs($pos);	// number of characters wanted

	if ($pos >= 0) {
		$i = 0;
		$d = 1;
	} else {
		$i = $len-1;
		$d = -1;
	}

	for ( ; $i >= 0 && $i < $len && $n < $p; $i += $d) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i += $d;	// advance a double-byte char
		} else {
			if ($c >= 0x80) $i += $d;	// advance a double-byte char
		}

		$n++;
	}
	if ($i < 0 || $i >= $len) return false;	// offset beyond string length

	if ($pos < 0) $i++;	// correct offset

	return $i;
}
2061
2062 /**
2063 * Maps all characters of a string in the EUC charset family.
2064 *
2065 * @param string EUC multibyte character string
2066 * @param string the charset
2067 * @param string mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
2068 * @param string 'case': conversion 'toLower' or 'toUpper'
2069 * @return string the converted string
2070 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
2071 */
function euc_char_mapping($str,$charset,$mode,$opt='') {
		// Replaces every character of $str that has an entry in the mapping table
		// selected by $mode (and $opt); characters without an entry are copied unchanged.
		// Unknown modes, or tables that fail to initialise, return $str untouched.
	switch ($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) return $str;	// do nothing
			$map =& $this->caseFolding[$charset][$opt];
			break;

		case 'ascii':
			if (!$this->initToASCII($charset)) return $str;	// do nothing
			$map =& $this->toASCII[$charset];
			break;

		default:
			return $str;
	}

	$sjis = ($charset == 'shift_jis');
	$out = '';
	$len = strlen($str);
	for ($i = 0; $i < $len; $i++) {
		$mbc = $str[$i];
		$c = ord($mbc);

		if ($sjis) {
			$isLead = (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0));
		} else {
			$isLead = ($c >= 0x80);
		}
		if ($isLead) {	// a double-byte char: take both bytes as the lookup key
			$mbc = substr($str,$i,2);
			$i++;
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
2116
2117 }
2118
// XCLASS hook: if an extension class file is registered for this script, include it.
// !empty() keeps the original truthiness test but does not raise a notice/warning
// when no entry is configured.
if (defined('TYPO3_MODE') && !empty($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php'])) {
	include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
}
2122
2123 ?>