39a4039857402a9efef6bf26bb014742836916cf
[Packages/TYPO3.CMS.git] / t3lib / class.t3lib_cs.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2003-2009 Kasper Skaarhoj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the Typo3 project. The Typo3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 *
17 * This script is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * This copyright notice MUST APPEAR in all copies of the script!
23 ***************************************************************/
24 /**
25 * Class for conversion between charsets.
26 *
27 * $Id$
28 *
29 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
30 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
31 */
32 /**
33 * [CLASS/FUNCTION INDEX of SCRIPT]
34 *
35 *
36 *
37 * 136: class t3lib_cs
38 * 488: function parse_charset($charset)
39 * 507: function get_locale_charset($locale)
40 *
41 * SECTION: Charset Conversion functions
42 * 560: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
43 * 600: function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0)
44 * 617: function utf8_encode($str,$charset)
45 * 663: function utf8_decode($str,$charset,$useEntityForNoChar=0)
46 * 706: function utf8_to_entities($str)
47 * 739: function entities_to_utf8($str,$alsoStdHtmlEnt=0)
48 * 773: function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
49 * 823: function UnumberToChar($cbyte)
50 * 868: function utf8CharToUnumber($str,$hex=0)
51 *
52 * SECTION: Init functions
53 * 911: function initCharset($charset)
54 * 973: function initUnicodeData($mode=null)
55 * 1198: function initCaseFolding($charset)
56 * 1260: function initToASCII($charset)
57 *
58 * SECTION: String operation functions
59 * 1331: function substr($charset,$string,$start,$len=null)
60 * 1384: function strlen($charset,$string)
61 * 1414: function crop($charset,$string,$len,$crop='')
62 * 1467: function strtrunc($charset,$string,$len)
63 * 1501: function conv_case($charset,$string,$case)
64 * 1527: function specCharsToASCII($charset,$string)
65 *
66 * SECTION: Internal string operation functions
67 * 1567: function sb_char_mapping($str,$charset,$mode,$opt='')
68 *
69 * SECTION: Internal UTF-8 string operation functions
70 * 1622: function utf8_substr($str,$start,$len=null)
71 * 1655: function utf8_strlen($str)
72 * 1676: function utf8_strtrunc($str,$len)
73 * 1698: function utf8_strpos($haystack,$needle,$offset=0)
74 * 1723: function utf8_strrpos($haystack,$needle)
75 * 1745: function utf8_char2byte_pos($str,$pos)
76 * 1786: function utf8_byte2char_pos($str,$pos)
77 * 1809: function utf8_char_mapping($str,$mode,$opt='')
78 *
79 * SECTION: Internal EUC string operation functions
80 * 1885: function euc_strtrunc($str,$len,$charset)
81 * 1914: function euc_substr($str,$start,$charset,$len=null)
82 * 1939: function euc_strlen($str,$charset)
83 * 1966: function euc_char2byte_pos($str,$pos,$charset)
84 * 2007: function euc_char_mapping($str,$charset,$mode,$opt='')
85 *
86 * TOTAL FUNCTIONS: 35
87 * (This index is automatically created/updated by the extension "extdeveval")
88 *
89 */
90
91
92
93
94
95
96
97
98 /**
99 * Notes on UTF-8
100 *
101 * Functions working on UTF-8 strings:
102 *
103 * - strchr/strstr
104 * - strrchr
105 * - substr_count
106 * - implode/explode/join
107 *
108 * Functions nearly working on UTF-8 strings:
109 *
110 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
111 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
112 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
113 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
114 * - preg_*: Support compiled into PHP by default nowadays, but could be unavailable, need to use modifier
115 *
116 * Functions NOT working on UTF-8 strings:
117 *
118 * - str*cmp
119 * - stristr
120 * - stripos
121 * - substr
122 * - strrev
123 * - split/spliti
124 * - ...
125 *
126 */
127 /**
128 * Class for conversion between charsets
129 *
130 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
131 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
132 * @package TYPO3
133 * @subpackage t3lib
134 */
135 class t3lib_cs {
136 var $noCharByteVal=63; // ASCII Value for chars with no equivalent.
137
138 // This is the array where parsed conversion tables are stored (cached)
139 var $parsedCharsets=array();
140
141 // An array where case folding data will be stored (cached)
142 var $caseFolding=array();
143
144 // An array where charset-to-ASCII mappings are stored (cached)
145 var $toASCII=array();
146
147 // This tells the converter which charsets has two bytes per char:
148 var $twoByteSets=array(
149 'ucs-2'=>1, // 2-byte Unicode
150 );
151
152 // This tells the converter which charsets has four bytes per char:
153 var $fourByteSets=array(
154 'ucs-4'=>1, // 4-byte Unicode
155 'utf-32'=>1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
156 );
157
158 // This tells the converter which charsets use a scheme like the Extended Unix Code:
159 var $eucBasedSets=array(
160 'gb2312'=>1, // Chinese, simplified.
161 'big5'=>1, // Chinese, traditional.
162 'euc-kr'=>1, // Korean
163 'shift_jis'=>1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
164 );
165
166 // see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
167 // http://czyborra.com/charsets/iso8859.html
// Maps lowercase charset aliases to the canonical charset names used by this class.
// parse_charset() looks its (lowercased, trimmed) input up in this table, so keys must be lowercase.
// Each alias may appear only once: a repeated key silently overrides the earlier entry.
var $synonyms=array(
	'us' => 'ascii',
	'us-ascii'=> 'ascii',
	'cp819' => 'iso-8859-1',
	'ibm819' => 'iso-8859-1',
	'iso-ir-100' => 'iso-8859-1',
	'iso-ir-101' => 'iso-8859-2',
	'iso-ir-109' => 'iso-8859-3',
	'iso-ir-110' => 'iso-8859-4',
	'iso-ir-144' => 'iso-8859-5',
	'iso-ir-127' => 'iso-8859-6',
	'iso-ir-126' => 'iso-8859-7',
	'iso-ir-138' => 'iso-8859-8',
	'iso-ir-148' => 'iso-8859-9',
	'iso-ir-157' => 'iso-8859-10',
	'iso-ir-179' => 'iso-8859-13',
	'iso-ir-199' => 'iso-8859-14',
	'iso-ir-203' => 'iso-8859-15',
	'csisolatin1' => 'iso-8859-1',
	'csisolatin2' => 'iso-8859-2',
	'csisolatin3' => 'iso-8859-3',
	'csisolatin5' => 'iso-8859-9',
	'csisolatin8' => 'iso-8859-14',
	'csisolatin9' => 'iso-8859-15',
	'csisolatingreek' => 'iso-8859-7',
	'iso-celtic' => 'iso-8859-14',
	'latin1' => 'iso-8859-1',
	'latin2' => 'iso-8859-2',
	'latin3' => 'iso-8859-3',
	'latin5' => 'iso-8859-9',
	'latin6' => 'iso-8859-10',
	'latin8' => 'iso-8859-14',
	'latin9' => 'iso-8859-15',
	'l1' => 'iso-8859-1',
	'l2' => 'iso-8859-2',
	'l3' => 'iso-8859-3',
	'l5' => 'iso-8859-9',
	'l6' => 'iso-8859-10',
	'l8' => 'iso-8859-14',
	'l9' => 'iso-8859-15',
	'cyrillic' => 'iso-8859-5',
	'arabic' => 'iso-8859-6',
	'tis-620' => 'iso-8859-11',
	'win874' => 'windows-874',
	'win1250' => 'windows-1250',
	'win1251' => 'windows-1251',
	'win1252' => 'windows-1252',
	'win1253' => 'windows-1253',
	'win1254' => 'windows-1254',
	'win1255' => 'windows-1255',
	'win1256' => 'windows-1256',
	'win1257' => 'windows-1257',
	'win1258' => 'windows-1258',
	'cp1250' => 'windows-1250',
	'cp1251' => 'windows-1251',
	'cp1252' => 'windows-1252',
	'ms-ee' => 'windows-1250',
	'ms-ansi' => 'windows-1252',
	'ms-greek' => 'windows-1253',
	'ms-turk' => 'windows-1254',
	'winbaltrim' => 'windows-1257',
	'koi-8ru' => 'koi-8r',
	'koi8r' => 'koi-8r',
	'cp878' => 'koi-8r',
	'mac' => 'macroman',
	'macintosh' => 'macroman',
	'euc-cn' => 'gb2312',
	'x-euc-cn' => 'gb2312',
	'euccn' => 'gb2312',
	'cp936' => 'gb2312',
	'big-5' => 'big5',
	'cp950' => 'big5',
	'eucjp' => 'euc-jp',
	'sjis' => 'shift_jis',
	'shift-jis' => 'shift_jis',
	'cp932' => 'shift_jis',
	'cp949' => 'euc-kr',
	'utf7' => 'utf-7',
	'utf8' => 'utf-8',
	'utf16' => 'utf-16',
	'utf32' => 'utf-32',
	'ucs2' => 'ucs-2',
	'ucs4' => 'ucs-4',
);
253
254 // mapping of iso-639-1 language codes to script names
// Maps a lowercase language identifier (the part of a locale before "_") to a script/family name.
// get_locale_charset() uses the result as key into the script_to_charset_* tables, so every value
// here should be a key of those tables; unknown values fall back to the platform default charset.
var $lang_to_script=array(
	// iso-639-1 language codes, see http://www.loc.gov/standards/iso639-2/php/code_list.php
	'ar' => 'arabic',
	'bg' => 'cyrillic', // Bulgarian
	'bs' => 'east_european', // Bosnian
	'cs' => 'east_european', // Czech
	'da' => 'west_european', // Danish
	'de' => 'west_european', // German
	'es' => 'west_european', // Spanish
	'et' => 'estonian',
	'eo' => 'unicode', // Esperanto
	'eu' => 'west_european', // Basque
	'fa' => 'arabic', // Persian
	'fi' => 'west_european', // Finnish
	'fo' => 'west_european', // Faroese
	'fr' => 'west_european', // French
	'ga' => 'west_european', // Galician
	'ge' => 'unicode', // Georgian
	'gr' => 'greek',
	'el' => 'greek', // Greek (the actual iso-639-1 code; 'gr' is kept for compatibility)
	'he' => 'hebrew', // Hebrew (since 1998)
	'hi' => 'unicode', // Hindi
	'hr' => 'east_european', // Croatian
	'hu' => 'east_european', // Hungarian
	'iw' => 'hebrew', // Hebrew (til 1998)
	'is' => 'west_european', // Icelandic
	'it' => 'west_european', // Italian
	'ja' => 'japanese',
	'kl' => 'west_european', // Greenlandic
	'ko' => 'korean',
	'lt' => 'lithuanian',
	'lv' => 'west_european', // Latvian/Lettish
	'nl' => 'west_european', // Dutch
	'no' => 'west_european', // Norwegian
	'nb' => 'west_european', // Norwegian Bokmal
	'nn' => 'west_european', // Norwegian Nynorsk
	'pl' => 'east_european', // Polish
	'pt' => 'west_european', // Portuguese
	'ro' => 'east_european', // Romanian
	'ru' => 'cyrillic', // Russian
	'sk' => 'east_european', // Slovak
	'sl' => 'east_european', // Slovenian
	'sr' => 'cyrillic', // Serbian
	'sv' => 'west_european', // Swedish
	'sq' => 'albanian', // Albanian
	'th' => 'thai',
	'tr' => 'turkish', // Turkish (previously only reachable via 'trk' / 'turkish')
	'uk' => 'cyrillic', // Ukrainian
	'vi' => 'vietnamese',
	'zh' => 'chinese',
	// MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
	// http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
	'ara' => 'arabic',
	'bgr' => 'cyrillic', // Bulgarian
	'cat' => 'west_european', // Catalan
	'chs' => 'simpl_chinese',
	'cht' => 'trad_chinese',
	'csy' => 'east_european', // Czech
	'dan' => 'west_european', // Danish
	'deu' => 'west_european', // German
	'dea' => 'west_european', // German (Austrian)
	'des' => 'west_european', // German (Swiss)
	'ena' => 'west_european', // English (Australian)
	'enc' => 'west_european', // English (Canadian)
	'eng' => 'west_european', // English
	'enz' => 'west_european', // English (New Zealand)
	'enu' => 'west_european', // English (United States)
	'euq' => 'west_european', // Basque
	'fos' => 'west_european', // Faroese
	'far' => 'arabic', // Persian
	'fin' => 'west_european', // Finnish
	'fra' => 'west_european', // French
	'frb' => 'west_european', // French (Belgian)
	'frc' => 'west_european', // French (Canadian)
	'frs' => 'west_european', // French (Swiss)
	'geo' => 'unicode', // Georgian
	'glg' => 'west_european', // Galician
	'ell' => 'greek',
	'heb' => 'hebrew',
	'hin' => 'unicode', // Hindi
	'hun' => 'east_european', // Hungarian
	'isl' => 'west_european', // Icelandic
	'ita' => 'west_european', // Italian
	'its' => 'west_european', // Italian (Swiss)
	'jpn' => 'japanese',
	'kor' => 'korean',
	'lth' => 'lithuanian',
	'lvi' => 'west_european', // Latvian/Lettish
	'msl' => 'west_european', // Malay
	'nlb' => 'west_european', // Dutch (Belgian)
	'nld' => 'west_european', // Dutch
	'nor' => 'west_european', // Norwegian (bokmal)
	'non' => 'west_european', // Norwegian (nynorsk)
	'plk' => 'east_european', // Polish
	'ptg' => 'west_european', // Portuguese
	'ptb' => 'west_european', // Portuguese (Brazil)
	'rom' => 'east_european', // Romanian
	'rus' => 'cyrillic', // Russian
	'slv' => 'east_european', // Slovenian
	'sky' => 'east_european', // Slovak
	'srl' => 'east_european', // Serbian (Latin)
	'srb' => 'cyrillic', // Serbian (Cyrillic)
	'esp' => 'west_european', // Spanish (trad. sort)
	'esm' => 'west_european', // Spanish (Mexican)
	'esn' => 'west_european', // Spanish (internat. sort)
	'sve' => 'west_european', // Swedish
	'sqi' => 'albanian', // Albanian
	'tha' => 'thai',
	'trk' => 'turkish',
	'ukr' => 'cyrillic', // Ukrainian
	// English language names
	'albanian' => 'albanian',
	'arabic' => 'arabic',
	'basque' => 'west_european',
	'bosnian' => 'east_european',
	'bulgarian' => 'cyrillic', // same script as the 'bg' / 'bgr' entries
	'catalan' => 'west_european',
	'croatian' => 'east_european',
	'czech' => 'east_european',
	'danish' => 'west_european',
	'dutch' => 'west_european',
	'english' => 'west_european',
	'esperanto' => 'unicode',
	'estonian' => 'estonian',
	'faroese' => 'west_european',
	'farsi' => 'arabic',
	'finnish' => 'west_european',
	'french' => 'west_european',
	'galician' => 'west_european',
	'georgian' => 'unicode',
	'german' => 'west_european',
	'greek' => 'greek',
	'greenlandic' => 'west_european',
	'hebrew' => 'hebrew',
	'hindi' => 'unicode',
	'hungarian' => 'east_european',
	'icelandic' => 'west_european',
	'italian' => 'west_european',
	'latvian' => 'west_european',
	'lettish' => 'west_european',
	'lithuanian' => 'lithuanian',
	'malay' => 'west_european',
	'norwegian' => 'west_european',
	'persian' => 'arabic',
	'polish' => 'east_european',
	'portuguese' => 'west_european',
	'russian' => 'cyrillic',
	'romanian' => 'east_european',
	'serbian' => 'cyrillic',
	'slovak' => 'east_european',
	'slovenian' => 'east_european',
	'spanish' => 'west_european',
	'swedish' => 'west_european',
	'thai' => 'thai',
	'turkish' => 'turkish',
	'ukrainian' => 'cyrillic',
);
410
411 // mapping of language (family) names to charsets on Unix
412 var $script_to_charset_unix=array(
413 'west_european' => 'iso-8859-1',
414 'estonian' => 'iso-8859-1',
415 'east_european' => 'iso-8859-2',
416 'baltic' => 'iso-8859-4',
417 'cyrillic' => 'iso-8859-5',
418 'arabic' => 'iso-8859-6',
419 'greek' => 'iso-8859-7',
420 'hebrew' => 'iso-8859-8',
421 'turkish' => 'iso-8859-9',
422 'thai' => 'iso-8859-11', // = TIS-620
423 'lithuanian' => 'iso-8859-13',
424 'chinese' => 'gb2312', // = euc-cn
425 'japanese' => 'euc-jp',
426 'korean' => 'euc-kr',
427 'simpl_chinese' => 'gb2312',
428 'trad_chinese' => 'big5',
429 'vietnamese' => '',
430 'unicode' => 'utf-8',
431 'albanian' => 'utf-8'
432 );
433
434 // mapping of language (family) names to charsets on Windows
435 var $script_to_charset_windows=array(
436 'east_european' => 'windows-1250',
437 'cyrillic' => 'windows-1251',
438 'west_european' => 'windows-1252',
439 'greek' => 'windows-1253',
440 'turkish' => 'windows-1254',
441 'hebrew' => 'windows-1255',
442 'arabic' => 'windows-1256',
443 'baltic' => 'windows-1257',
444 'estonian' => 'windows-1257',
445 'lithuanian' => 'windows-1257',
446 'vietnamese' => 'windows-1258',
447 'thai' => 'cp874',
448 'korean' => 'cp949',
449 'chinese' => 'gb2312',
450 'japanese' => 'shift_jis',
451 'simpl_chinese' => 'gb2312',
452 'trad_chinese' => 'big5',
453 'albanian' => 'windows-1250',
454 'unicode' => 'utf-8'
455 );
456
457 // mapping of locale names to charsets
// Maps complete locale strings to charsets.
// get_locale_charset() lowercases the locale before looking it up here, so every key MUST be
// lowercase; a mixed-case key (formerly 'sr@Latn') can never be matched.
var $locale_to_charset=array(
	'japanese.euc' => 'euc-jp',
	'ja_jp.ujis' => 'euc-jp',
	'korean.euc' => 'euc-kr',
	'sr@latn' => 'iso-8859-2',
	'zh_cn' => 'gb2312',
	'zh_hk' => 'big5',
	'zh_tw' => 'big5',
);
467
468 // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
469 // Empty values means "iso-8859-1"
470 var $charSetArray = array(
471 'dk' => '',
472 'de' => '',
473 'no' => '',
474 'it' => '',
475 'fr' => '',
476 'es' => '',
477 'nl' => '',
478 'cz' => 'windows-1250',
479 'pl' => 'iso-8859-2',
480 'si' => 'windows-1250',
481 'fi' => '',
482 'tr' => 'iso-8859-9',
483 'se' => '',
484 'pt' => '',
485 'ru' => 'windows-1251',
486 'ro' => 'iso-8859-2',
487 'ch' => 'gb2312',
488 'sk' => 'windows-1250',
489 'lt' => 'windows-1257',
490 'is' => 'utf-8',
491 'hr' => 'windows-1250',
492 'hu' => 'iso-8859-2',
493 'gl' => '',
494 'th' => 'iso-8859-11',
495 'gr' => 'iso-8859-7',
496 'hk' => 'big5',
497 'eu' => '',
498 'bg' => 'windows-1251',
499 'br' => '',
500 'et' => 'iso-8859-4',
501 'ar' => 'iso-8859-6',
502 'he' => 'utf-8',
503 'ua' => 'windows-1251',
504 'jp' => 'shift_jis',
505 'lv' => 'utf-8',
506 'vn' => 'utf-8',
507 'ca' => 'iso-8859-15',
508 'ba' => 'iso-8859-2',
509 'kr' => 'euc-kr',
510 'eo' => 'utf-8',
511 'my' => '',
512 'hi' => 'utf-8',
513 'fo' => 'utf-8',
514 'fa' => 'utf-8',
515 'sr' => 'utf-8',
516 'sq' => 'utf-8',
517 'ge' => 'utf-8',
518 'ga' => '',
519 );
520
521 // TYPO3 specific: Array with the iso names used for each system language in TYPO3:
522 // Missing keys means: same as Typo3
523 var $isoArray = array(
524 'ba' => 'bs',
525 'br' => 'pt_BR',
526 'ch' => 'zh_CN',
527 'cz' => 'cs',
528 'dk' => 'da',
529 'si' => 'sl',
530 'se' => 'sv',
531 'gl' => 'kl',
532 'gr' => 'el',
533 'hk' => 'zh_HK',
534 'kr' => 'ko',
535 'ua' => 'uk',
536 'jp' => 'ja',
537 'vn' => 'vi',
538 );
539
540 /**
541 * Normalize - changes input character set to lowercase letters.
542 *
543 * @param string Input charset
544 * @return string Normalized charset
545 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
546 */
function parse_charset($charset) {
	// Lowercase first, then strip surrounding whitespace; the synonym table is keyed by that form.
	$normalized = trim(strtolower($charset));

	// Known alias: hand back the canonical name. Anything else is returned in its normalized form.
	return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
}
553
554 /**
555 * Get the charset of a locale.
556 *
557 * ln language
558 * ln_CN language / country
559 * ln_CN.cs language / country / charset
560 * ln_CN.cs@mod language / country / charset / modifier
561 *
562 * @param string Locale string
563 * @return string Charset resolved for locale string
564 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
565 */
function get_locale_charset($locale) {
	$locale = strtolower($locale);

	// Exact locale specific charset?
	// The table keys are lowercased for the comparison because the input was lowercased above;
	// a mixed-case key would otherwise never be found.
	foreach ($this->locale_to_charset as $knownLocale => $knownCharset) {
		if (strtolower((string)$knownLocale) === $locale) return $knownCharset;
	}

	// Get modifier ("@mod"). array_pad() guarantees that both list() targets exist even
	// when the separator is absent, so no undefined-offset notice/warning is raised.
	list($locale,$modifier) = array_pad(explode('@',$locale),2,'');

	// Locale contains charset: use it
	list($locale,$charset) = array_pad(explode('.',$locale),2,'');
	if ($charset) return $this->parse_charset($charset);

	// Modifier is 'euro' (after charset check, because of xx.utf-8@euro)
	if ($modifier == 'euro') return 'iso-8859-15';

	// Get language: the part before the first "_" (explode() always yields at least one element)
	list($language) = explode('_',$locale);

	// Unknown languages get an empty script name, which is not a key of the script tables
	// and therefore selects the platform default below.
	$script = isset($this->lang_to_script[$language]) ? $this->lang_to_script[$language] : '';

	// Pick the platform specific table; a missing or empty entry falls back to the western default.
	if (defined('TYPO3_OS') && TYPO3_OS == 'WIN') {
		$cs = !empty($this->script_to_charset_windows[$script]) ? $this->script_to_charset_windows[$script] : 'windows-1252';
	} else {
		$cs = !empty($this->script_to_charset_unix[$script]) ? $this->script_to_charset_unix[$script] : 'iso-8859-1';
	}

	return $cs;
}
594
595
596
597
598
599
600
601
602
603 /********************************************
604 *
605 * Charset Conversion functions
606 *
607 ********************************************/
608
609 /**
610 * Convert from one charset to another charset.
611 *
612 * @param string Input string
613 * @param string From charset (the current charset of the string)
614 * @param string To charset (the output charset wanted)
615 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
616 * @return string Converted string
617 * @see convArray()
618 */
function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) {
	// Nothing to do when source and target charset are the same.
	if ($fromCS==$toCS) return $str;

	// PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
	if ($toCS=='utf-8' || !$useEntityForNoChar) {
		// Read the configured conversion method defensively: the setting may be absent.
		$convMethod = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'])
			? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']
			: '';

		// Each library is only used when its function really exists; a configured but
		// unavailable extension must lead to the built-in fallback, not to a fatal error.
		switch($convMethod) {
			case 'mbstring':
				if (function_exists('mb_convert_encoding')) {
					$conv_str = mb_convert_encoding($str,$toCS,$fromCS);
					if (false !== $conv_str) return $conv_str; // returns false for unsupported charsets
				}
			break;

			case 'iconv':
				if (function_exists('iconv')) {
					$conv_str = iconv($fromCS,$toCS.'//TRANSLIT',$str);
					if (false !== $conv_str) return $conv_str;
				}
			break;

			case 'recode':
				if (function_exists('recode_string')) {
					$conv_str = recode_string($fromCS.'..'.$toCS,$str);
					if (false !== $conv_str) return $conv_str;
				}
			break;
		}
		// fallback to TYPO3 conversion
	}

	// Built-in conversion: go through UTF-8 as the intermediate representation.
	if ($fromCS!='utf-8') $str=$this->utf8_encode($str,$fromCS);
	if ($toCS!='utf-8') $str=$this->utf8_decode($str,$toCS,$useEntityForNoChar);
	return $str;
}
647
648 /**
649 * Convert all elements in ARRAY from one charset to another charset.
650 * NOTICE: Array is passed by reference!
651 *
652 * @param string Input array, possibly multidimensional
653 * @param string From charset (the current charset of the string)
654 * @param string To charset (the output charset wanted)
655 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
656 * @return void
657 * @see conv()
658 */
function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0) {
	foreach ($array as $index => $element) {
		if (!is_array($element)) {
			// Leaf value: convert it and write it back into the referenced array.
			$array[$index] = $this->conv($element,$fromCS,$toCS,$useEntityForNoChar);
			continue;
		}

		// Nested array: descend, handing over the element itself so it is changed in place.
		$this->convArray($array[$index],$fromCS,$toCS,$useEntityForNoChar);
	}
}
668
669 /**
670 * Converts $str from $charset to UTF-8
671 *
672 * @param string String in local charset to convert to UTF-8
673 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
674 * @return string Output string, converted to UTF-8
675 */
function utf8_encode($str,$charset) {

	if ($charset === 'utf-8') return $str;

	// Charset is case-insensitive.
	// NOTE: when initCharset() reports failure nothing is returned (NULL), as before.
	if ($this->initCharset($charset)) { // Parse conv. table if not already...
		$strLen = strlen($str);
		$outStr = '';

		// Loop invariants, looked up once instead of per byte.
		$isTwoByteSet = isset($this->twoByteSets[$charset]);
		$isEucSet = isset($this->eucBasedSets[$charset]);
		$noChar = chr($this->noCharByteVal);

		for ($a=0;$a<$strLen;$a++) { // Traverse each char in string.
			$chr = $str[$a];
			$ord = ord($chr);

			if ($isTwoByteSet) { // If the charset has two bytes per char
				// The former $str{$a+1} offset syntax no longer parses on PHP 8. A missing
				// second byte (odd-length input) is read as 0 instead of reading past the end.
				$ord2 = ($a+1 < $strLen) ? ord($str[$a+1]) : 0;
				$ord = $ord<<8 | $ord2; // assume big endian

				// Use the mapped UTF-8 sequence if the local number is in the parsed table, else the "no char" byte.
				$outStr .= isset($this->parsedCharsets[$charset]['local'][$ord]) ? $this->parsedCharsets[$charset]['local'][$ord] : $noChar;
				$a++;
			} elseif ($ord>127) { // Byte above 127: not plain ASCII, needs a table lookup
				if ($isEucSet) { // EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
					if ($charset != 'shift_jis' || ($ord < 0xA0 || $ord > 0xDF)) { // Shift-JIS: chars between 160 and 223 are single byte
						$a++;
						// A truncated trailing character contributes 0 as its second byte.
						$ord2 = ($a < $strLen) ? ord($str[$a]) : 0;
						$ord = $ord*256+$ord2;
					}
				}

				$outStr .= isset($this->parsedCharsets[$charset]['local'][$ord]) ? $this->parsedCharsets[$charset]['local'][$ord] : $noChar;
			} else $outStr .= $chr; // ... otherwise it's just ASCII 0-127 and one byte. Transparent
		}
		return $outStr;
	}
}
713
714 /**
715 * Converts $str from UTF-8 to $charset
716 *
717 * @param string String in UTF-8 to convert to local charset
718 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
719 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
720 * @return string Output string, converted to local charset
721 */
function utf8_decode($str,$charset,$useEntityForNoChar=0) {

	// UTF-8 to UTF-8 is a no-op.
	if ($charset === 'utf-8') return $str;

	// Without a usable conversion table nothing is returned (NULL).
	if (!$this->initCharset($charset)) return;

	$result = '';
	$length = strlen($str);

	for ($pos = 0; $pos < $length; $pos++) { // Walk the UTF-8 string byte by byte.
		$byte = substr($str,$pos,1);
		$code = ord($byte);

		// ASCII 0-127 is a single byte and passes through unchanged.
		if ($code <= 127) {
			$result .= $byte;
			continue;
		}

		// A byte above 127 without bit 64 set cannot start a sequence: we are in the MIDDLE of one.
		if (!($code & 64)) {
			$result .= chr($this->noCharByteVal);
			continue;
		}

		// Lead byte: every further high bit (seen by shifting left and testing bit 128)
		// announces one more byte that belongs to this sequence.
		$sequence = $byte;
		for ($n = 0; $n < 8; $n++) {
			$code = $code << 1;
			if (!($code & 128)) break;
			$pos++;
			$sequence .= substr($str,$pos,1);
		}

		if (isset($this->parsedCharsets[$charset]['utf8'][$sequence])) {
			// Known sequence: emit the local number, split into two bytes when it exceeds 255 (16 bit word assumed).
			$local = $this->parsedCharsets[$charset]['utf8'][$sequence];
			$result .= ($local > 255)
				? chr(($local >> 8) & 255).chr($local & 255)
				: chr($local);
		} elseif ($useEntityForNoChar) {
			// No local equivalent: write a numeric entity instead.
			$result .= '&#'.$this->utf8CharToUnumber($sequence,1).';';
		} else {
			// No local equivalent and no entities wanted.
			$result .= chr($this->noCharByteVal);
		}
	}

	return $result;
}
762
763 /**
764 * Converts all chars > 127 to numeric entities.
765 *
766 * @param string Input string
767 * @return string Output string
768 */
function utf8_to_entities($str) {
	$result = '';
	$length = strlen($str);

	for ($pos = 0; $pos < $length; $pos++) { // Walk the UTF-8 string byte by byte.
		$byte = substr($str,$pos,1);
		$code = ord($byte);

		// ASCII 0-127 is a single byte and passes through unchanged.
		if ($code <= 127) {
			$result .= $byte;
			continue;
		}

		// A byte above 127 without bit 64 set cannot start a sequence: we are in the MIDDLE of one.
		if (!($code & 64)) {
			$result .= chr($this->noCharByteVal);
			continue;
		}

		// Lead byte: collect one following byte per additional high bit of the lead byte.
		$sequence = $byte;
		for ($n = 0; $n < 8; $n++) {
			$code = $code << 1;
			if (!($code & 128)) break;
			$pos++;
			$sequence .= substr($str,$pos,1);
		}

		// The collected sequence becomes a numeric entity built from its hex number.
		$result .= '&#'.$this->utf8CharToUnumber($sequence,1).';';
	}

	return $result;
}
794
795 /**
796 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
797 *
798 * @param string Input string, UTF-8
799 * @param boolean If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
800 * @return string Output string
801 */
function entities_to_utf8($str,$alsoStdHtmlEnt=0) {
	// Split on "&...;" while keeping the captured entity name: even indexes hold literal
	// text, odd indexes hold what stood between "&" and ";". This replaces the former
	// random-token replace/explode round trip.
	$parts = preg_split('/&([#[:alnum:]]*);/', $str, -1, PREG_SPLIT_DELIM_CAPTURE);
	if (!is_array($parts)) return $str; // regex engine failure: leave the input untouched

	foreach($parts as $k => $v) {
		if (!($k%2)) continue; // literal text

		$entity = '&'.$v.';';

		if (substr($v,0,1)=='#') { // Dec or hex entities:
			// Only well-formed numbers are converted; the hex marker may be "x" or "X".
			// Anything else (e.g. "&#;" or "&#zz;") is left as it was.
			if (preg_match('/^#[xX]([0-9a-fA-F]+)$/', $v, $match)) {
				$parts[$k] = $this->UnumberToChar(hexdec($match[1]));
			} elseif (preg_match('/^#([0-9]+)$/', $v, $match)) {
				$parts[$k] = $this->UnumberToChar((int)$match[1]);
			} else {
				$parts[$k] = $entity;
			}
		} elseif ($alsoStdHtmlEnt) { // Other entities:
			// Decode straight to UTF-8. The translation table's default encoding differs between
			// PHP versions, so re-encoding its values from iso-8859-1 could double-encode them.
			// Unknown entity names come back unchanged from html_entity_decode().
			$parts[$k] = html_entity_decode($entity, ENT_COMPAT, 'UTF-8');
		} else { // No conversion:
			$parts[$k] = $entity;
		}
	}

	return implode('',$parts);
}
827
828 /**
829 * Converts all chars in the input UTF-8 string into integer numbers returned in an array
830 *
831 * @param string Input string, UTF-8
832 * @param boolean If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
833 * @param boolean If set, then instead of integer numbers the real UTF-8 char is returned.
834 * @return array Output array with the char numbers
835 */
function utf8_to_numberarray($str,$convEntities=0,$retChar=0) {
	// Optionally turn entities (numeric and named) into real UTF-8 characters first.
	if ($convEntities) $str = $this->entities_to_utf8($str,1);

	$numbers = array();
	$length = strlen($str);

	for ($pos = 0; $pos < $length; $pos++) { // Walk the UTF-8 string byte by byte.
		$byte = substr($str,$pos,1);
		$code = ord($byte);

		// ASCII 0-127: one byte, reported as itself.
		if ($code <= 127) {
			$numbers[] = $retChar ? chr($code) : $code;
			continue;
		}

		// A byte above 127 without bit 64 set cannot start a sequence: we are in the MIDDLE of one.
		if (!($code & 64)) {
			$numbers[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
			continue;
		}

		// Lead byte: collect one following byte per additional high bit of the lead byte.
		$sequence = $byte;
		for ($n = 0; $n < 8; $n++) {
			$code = $code << 1;
			if (!($code & 128)) break;
			$pos++;
			$sequence .= substr($str,$pos,1);
		}

		// Either the raw multibyte character or its number, depending on $retChar.
		$numbers[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
	}

	return $numbers;
}
866
867 /**
868 * Converts a UNICODE number to a UTF-8 multibyte character
869 * Algorithm based on script found at From: http://czyborra.com/utf/
870 * Unit-tested by Kasper
871 *
872 * The binary representation of the character's integer value is thus simply spread across the bytes and the number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
873 *
874 * bytes | bits | representation
875 * 1 | 7 | 0vvvvvvv
876 * 2 | 11 | 110vvvvv 10vvvvvv
877 * 3 | 16 | 1110vvvv 10vvvvvv 10vvvvvv
878 * 4 | 21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
879 * 5 | 26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
880 * 6 | 31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
881 *
882 * @param integer UNICODE integer
883 * @return string UTF-8 multibyte character string
884 * @see utf8CharToUnumber()
885 */
function UnumberToChar($cbyte) {
	// 7 bit values are their own single byte.
	if ($cbyte < 0x80) return chr($cbyte);

	// One row per multibyte form: exclusive upper limit of the number,
	// marker bits of the lead byte, count of continuation bytes that follow it.
	$forms = array(
		array(0x800, 0xC0, 1),
		array(0x10000, 0xE0, 2),
		array(0x200000, 0xF0, 3),
		array(0x4000000, 0xF8, 4),
		array(0x80000000, 0xFC, 5),
	);

	foreach ($forms as $form) {
		list($limit, $leadMarker, $trailCount) = $form;
		if ($cbyte < $limit) {
			// The lead byte carries the topmost bits ...
			$char = chr($leadMarker | ($cbyte >> (6 * $trailCount)));
			// ... and each continuation byte (10vvvvvv) the next six, highest group first.
			for ($shift = 6 * ($trailCount - 1); $shift >= 0; $shift -= 6) {
				$char .= chr(0x80 | (($cbyte >> $shift) & 0x3F));
			}
			return $char;
		}
	}

	// Cannot express a 32-bit character in UTF-8
	return chr($this->noCharByteVal);
}
921
/**
 * Converts a UTF-8 multibyte character to a UNICODE number.
 *
 * Only the first character of the input is looked at. A first byte that does not
 * have its two top bits set is returned as its plain byte value.
 *
 * @param	string		UTF-8 multibyte character string
 * @param	boolean		If set, then a hex. number (prefixed with "x") is returned.
 * @return	integer		UNICODE integer (or a string like "x20ac" in hex mode)
 * @see UnumberToChar()
 */
function utf8CharToUnumber($str,$hex=0) {
	$first = ord(substr($str,0,1));
	$code = $first;

	if (($first & 192) == 192) {		// Both top bits set: lead byte of a multibyte sequence
		$tailBits = '';
		$count = 0;		// number of continuation bytes consumed
		$probe = $first;
		while ($count < 8) {
			// Every further leading 1-bit of the lead byte announces one more continuation byte
			$probe = $probe << 1;
			if (!($probe & 128)) {
				break;
			}
			// Each continuation byte contributes its lower six bits
			$tailBits .= substr('00000000'.decbin(ord(substr($str,$count+1,1))),-6);
			$count++;
		}
		// The lead byte contributes the bits that are left after its length marker
		$leadBits = substr('00000000'.decbin($first),-(6-$count));

		$code = bindec($leadBits.$tailBits);
	}

	if ($hex) {
		return 'x'.dechex($code);
	}
	return $code;
}
949
950
951
952
953
954
955
956
957
958 /********************************************
959 *
960 * Init functions
961 *
962 ********************************************/
963
/**
 * Initializes a charset for use if a conversion table for it exists in the PATH_t3lib.'csconvtbl/' folder.
 * This function is automatically called by the conversion functions.
 *
 * The parsed table is kept in $this->parsedCharsets[$charset] with two maps: 'local' (byte value => UTF-8
 * character) and 'utf8' (UTF-8 character => byte value). Only byte values above 127 are stored.
 * A serialized copy is read from / written to typo3temp/cs/charset_[charset].tbl.
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param	string		The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return	integer		Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and loaded (from cache or by parsing).
 * @access private
 */
function initCharset($charset) {
		// Already in memory: nothing to do
	if (is_array($this->parsedCharsets[$charset])) {
		return 1;
	}

		// The conversion table must exist, otherwise the charset is unknown
	$charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';
	$tableAvailable = $charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile);
	if (!$tableAvailable) {
		return false;
	}

		// Prefer the serialized cache file.
		// Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->parsedCharsets[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

		// No cache yet: parse the conversion table line by line
	$rows = t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1);
	$this->parsedCharsets[$charset] = array('local'=>array(),'utf8'=>array());

		// The "whitespaced" format looks like "0x0A 0x000A #LINE FEED" while "ms-token" looks like "B9 = U+00B9 : SUPERSCRIPT ONE"
	$whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
	$format = '';		// decided on the first real line, then kept for the whole file

	foreach ($rows as $row) {
			// Comment lines and blanks are ignored
		if (!trim($row) || substr($row,0,1)=='#') {
			continue;
		}

		if (!$format) {
			$format = preg_match($whitespacedPattern,$row) ? 'whitespaced' : 'ms-token';
		}

		switch ($format) {
			case 'ms-token':
				list($hexbyte, $utf8) = preg_split('/[=:]/', $row, 3);
			break;
			case 'whitespaced':
				$regA = array();
				preg_match($whitespacedPattern,$row,$regA);
				$hexbyte = $regA[1];
				$utf8 = 'U+'.$regA[2];
			break;
		}

			// Byte values up to 127 are not stored
		$decval = hexdec(trim($hexbyte));
		if ($decval>127) {
			$utf8decval = hexdec(substr(trim($utf8),2));
			$utf8Char = $this->UnumberToChar($utf8decval);
			$this->parsedCharsets[$charset]['local'][$decval] = $utf8Char;
			$this->parsedCharsets[$charset]['utf8'][$utf8Char] = $decval;
		}
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->parsedCharsets[$charset]));
	}

	return 2;
}
1026
/**
 * This function initializes all UTF-8 character data tables.
 *
 * In 'case' and 'ascii' mode the requested table is taken from memory or from its cache file in
 * typo3temp/cs/ if possible. Otherwise unidata/UnicodeData.txt (plus, if present, SpecialCasing.txt
 * and Translit.txt) is parsed, BOTH the case folding table and the to-ASCII table are rebuilt and
 * both cache files are written.
 *
 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
 *
 * @param	string		Mode ("case", "ascii", ...)
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initUnicodeData($mode=null) {
		// cache files
	$cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
	$cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');

		// Only process if the tables are not yet loaded
	switch($mode) {
		case 'case':
			if (is_array($this->caseFolding['utf-8'])) return 1;

				// Use cached version if possible
			if ($cacheFileCase && @is_file($cacheFileCase)) {
				$this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase));
				return 2;
			}
		break;

		case 'ascii':
			if (is_array($this->toASCII['utf-8'])) return 1;

				// Use cached version if possible
			if ($cacheFileASCII && @is_file($cacheFileASCII)) {
				$this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII));
				return 2;
			}
		break;
	}

		// process main Unicode data file
	$unicodeDataFile = PATH_t3lib.'unidata/UnicodeData.txt';
	if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) return false;

	$fh = fopen($unicodeDataFile,'rb');
	if (!$fh) return false;

		// key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
		// note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
	$this->caseFolding['utf-8'] = array();
	$utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand
	$utf8CaseFolding['toUpper'] = array();
	$utf8CaseFolding['toLower'] = array();
	$utf8CaseFolding['toTitle'] = array();

	$decomposition = array();	// array of temp. decompositions
	$mark = array();		// array of chars that are marks (eg. composing accents)
	$number = array();		// array of chars that are numbers (eg. digits)
	$omit = array();		// array of chars to be omitted (eg. Russian hard sign)

		// Read until fgets() reports EOF/error instead of processing its FALSE result as a record
	while (($line = fgets($fh,4096)) !== false) {
		$line = rtrim($line);
		if ($line === '') continue;	// blank lines carry no data

			// A record has 15 semicolon separated fields; pad short lines so every field is defined
		$fields = array_pad(explode(';', $line), 15, '');
		$char = $fields[0];		// code point (hex)
		$name = $fields[1];		// character name
		$cat = $fields[2];		// general category
		$decomp = $fields[5];	// decomposition mapping
		$num = $fields[8];		// numeric value
		$upper = $fields[12];	// uppercase mapping
		$lower = $fields[13];	// lowercase mapping
		$title = $fields[14];	// titlecase mapping

		$ord = hexdec($char);
		if ($ord > 0xFFFF) break;	// only process the BMP

		$utf8_char = $this->UnumberToChar($ord);

		if ($upper) $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
		if ($lower) $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
			// store "title" only when different from "upper" (only a few)
		if ($title && $title != $upper) $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));

			// The first letter of the category is the major class
		switch (substr($cat,0,1)) {
			case 'M':		// mark (accent, umlaut, ...)
				$mark["U+$char"] = 1;
			break;

			case 'N':		// numeric value
				if ($ord > 0x80 && $num != '') $number["U+$char"] = $num;
			break;
		}

			// accented Latin letters without "official" decomposition
		$match = array();
		if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/',$name,$match) && !$decomp) {
			$c = ord($match[2]);
			if ($match[1] == 'SMALL') $c += 32;

			$decomposition["U+$char"] = array(dechex($c));
			continue;
		}

		$match = array();
		if (preg_match('/(<.*>)? *(.+)/',$decomp,$match)) {
			switch($match[1]) {
				case '<circle>':	// add parenthesis as circle replacement, eg (1)
					$match[2] = '0028 '.$match[2].' 0029';
				break;

				case '<square>':	// add square brackets as square replacement, eg [1]
					$match[2] = '005B '.$match[2].' 005D';
				break;

				case '<compat>':	// ignore multi char decompositions that start with a space
					if (preg_match('/^0020 /',$match[2])) continue 2;
				break;

					// ignore Arabic and vertical layout presentation decomposition
				case '<initial>':
				case '<medial>':
				case '<final>':
				case '<isolated>':
				case '<vertical>':
					continue 2;
			}
			$decomposition["U+$char"] = explode(' ', $match[2]);
		}
	}
	fclose($fh);

		// process additional Unicode data for casing (allow folded characters to expand into a sequence)
	$specialCasingFile = PATH_t3lib.'unidata/SpecialCasing.txt';
	if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
		$fh = fopen($specialCasingFile,'rb');
		if ($fh) {
			while (($line = fgets($fh,4096)) !== false) {
					// skip blank lines and comment lines
				if (trim($line) === '' || substr($line,0,1) == '#') continue;

				list($char,$lower,$title,$upper,$cond) = array_pad(t3lib_div::trimExplode(';', $line), 5, '');
					// only unconditional mappings are used: the 5th field is empty or already the trailing comment
				if ($cond == '' || substr($cond,0,1) == '#') {
					$utf8_char = $this->UnumberToChar(hexdec($char));
					if ($char != $lower) {
						$arr = explode(' ', $lower);
						for ($i=0; isset($arr[$i]); $i++) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
						$utf8CaseFolding['toLower'][$utf8_char] = implode('',$arr);
					}
					if ($char != $title && $title != $upper) {
						$arr = explode(' ', $title);
						for ($i=0; isset($arr[$i]); $i++) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
						$utf8CaseFolding['toTitle'][$utf8_char] = implode('',$arr);
					}
					if ($char != $upper) {
						$arr = explode(' ', $upper);
						for ($i=0; isset($arr[$i]); $i++) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
						$utf8CaseFolding['toUpper'][$utf8_char] = implode('',$arr);
					}
				}
			}
			fclose($fh);
		}
	}

		// process custom decompositions
	$customTranslitFile = PATH_t3lib.'unidata/Translit.txt';
	if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
		$fh = fopen($customTranslitFile,'rb');
		if ($fh) {
			while (($line = fgets($fh,4096)) !== false) {
					// skip blank lines and comment lines
				if (trim($line) === '' || substr($line,0,1) == '#') continue;

				list($char,$translit) = array_pad(t3lib_div::trimExplode(';', $line), 2, '');
					// an empty transliteration means: drop this character
				if (!$translit) $omit["U+$char"] = 1;
				$decomposition["U+$char"] = explode(' ', $translit);
			}
			fclose($fh);
		}
	}

		// decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
	foreach($decomposition as $from => $to) {
		$code_decomp = array();

		while ($code_value = array_shift($to)) {
			if (isset($decomposition["U+$code_value"])) {	// do recursive decomposition
				foreach(array_reverse($decomposition["U+$code_value"]) as $cv) {
					array_unshift($to, $cv);
				}
			} elseif (!isset($mark["U+$code_value"])) {	// remove mark
				array_push($code_decomp, $code_value);
			}
		}
		if (count($code_decomp) || isset($omit[$from])) {
			$decomposition[$from] = $code_decomp;
		} else {
			unset($decomposition[$from]);
		}
	}

		// create ascii only mapping
	$this->toASCII['utf-8'] = array();
	$ascii =& $this->toASCII['utf-8'];

	foreach($decomposition as $from => $to) {
		$code_decomp = array();
		while ($code_value = array_shift($to)) {
			$ord = hexdec($code_value);
			if ($ord > 127) {
				continue 2;	// skip decompositions containing non-ASCII chars
			}
			array_push($code_decomp,chr($ord));
		}
			// keys look like "U+00C0": strip the prefix so hexdec() only sees hex digits
		$ascii[$this->UnumberToChar(hexdec(substr($from,2)))] = join('',$code_decomp);
	}

		// add numeric decompositions
	foreach($number as $from => $to) {
		$utf8_char = $this->UnumberToChar(hexdec(substr($from,2)));
		if (!isset($ascii[$utf8_char])) {
			$ascii[$utf8_char] = $to;
		}
	}

	if ($cacheFileCase) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileCase,serialize($utf8CaseFolding));
	}

	if ($cacheFileASCII) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileASCII,serialize($ascii));
	}

	return 3;
}
1252
/**
 * This function initializes the folding table for a charset other than UTF-8.
 * This function is automatically called by the case folding functions.
 *
 * The table is derived from the UTF-8 case folding table: every character of the charset's
 * conversion table is looked up there and the result is converted back to the charset.
 * The plain a-z/A-Z mappings are added on top. The result is cached in typo3temp/cs/.
 *
 * @param	string		Charset for which to initialize case folding.
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initCaseFolding($charset) {
		// Only process if the case table is not yet loaded:
	if (is_array($this->caseFolding[$charset])) {
		return 1;
	}

		// Use cached version if possible
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

		// Needs the UTF-8 conversion table of this charset and the UTF-8 case folding table as base
	if (!$this->initCharset($charset) || !$this->initUnicodeData('case')) {
		return false;
	}

	$nochar = chr($this->noCharByteVal);
	$directions = array('toUpper', 'toLower', 'toTitle');
	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
			// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);

		foreach ($directions as $direction) {
				// fold in UTF-8, then convert the result back; keep it only if the charset can express it
			$cc = $this->utf8_decode($this->caseFolding['utf-8'][$direction][$utf8], $charset);
			if ($cc != '' && $cc != $nochar) {
				$this->caseFolding[$charset][$direction][$c] = $cc;
			}
		}
	}

		// add the ASCII case table (upper and lower case letters are 32 code positions apart)
	for ($code = ord('a'); $code <= ord('z'); $code++) {
		$this->caseFolding[$charset]['toUpper'][chr($code)] = chr($code-32);
		$this->caseFolding[$charset]['toLower'][chr($code-32)] = chr($code);
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->caseFolding[$charset]));
	}

	return 3;
}
1314
/**
 * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
 * This function is automatically called by the ASCII transliteration functions.
 *
 * Every character of the charset's conversion table that has an entry in the UTF-8 to-ASCII table
 * gets the same transliteration, keyed by the character in the local charset.
 * The result is cached in typo3temp/cs/.
 *
 * @param	string		Charset for which to initialize conversion.
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initToASCII($charset) {
		// Only process if the table is not yet loaded:
	if (is_array($this->toASCII[$charset])) return 1;

		// Use cached version if possible
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->toASCII[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

		// init UTF-8 conversion for this charset
	if (!$this->initCharset($charset)) {
		return false;
	}

		// UTF-8/ASCII transliteration is used as the base conversion table
	if (!$this->initUnicodeData('ascii')) {
		return false;
	}

		// Start with an empty table: even if no character can be transliterated the result is an
		// array, so it is cached as such and the "already loaded" check above succeeds next time
	$this->toASCII[$charset] = array();

	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
			// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);

		if (isset($this->toASCII['utf-8'][$utf8])) {
			$this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
		}
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->toASCII[$charset]));
	}

	return 3;
}
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376 /********************************************
1377 *
1378 * String operation functions
1379 *
1380 ********************************************/
1381
/**
 * Returns a part of a string.
 *
 * Depending on $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'] the work is delegated to mbstring or iconv;
 * otherwise the class' own UTF-8/EUC routines or plain byte arithmetic (fixed-width and single-byte
 * charsets) are used.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Start position (character position)
 * @param	integer		Length (in characters). If omitted, the rest of the string is returned.
 * @return	string		The substring
 * @see substr(), mb_substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function substr($charset,$string,$start,$len=null) {
	if ($len===0) return '';

	$utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];

	if ($utils == 'mbstring') {
			// cannot omit $len, when specifying charset
		if ($len==null) {
			$enc = mb_internal_encoding();	// save internal encoding
			mb_internal_encoding($charset);
			$str = mb_substr($string,$start);
			mb_internal_encoding($enc);	// restore internal encoding

			return $str;
		}
		return mb_substr($string,$start,$len,$charset);
	} elseif ($utils == 'iconv') {
			// cannot omit $len, when specifying charset
		if ($len==null) {
			$enc = iconv_get_encoding('internal_encoding');	// save internal encoding
			iconv_set_encoding('internal_encoding',$charset);
			$str = iconv_substr($string,$start);
			iconv_set_encoding('internal_encoding',$enc);	// restore internal encoding

			return $str;
		}
		return iconv_substr($string,$start,$len,$charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_substr($string,$start,$len);
	} elseif ($this->eucBasedSets[$charset]) {
		return $this->euc_substr($string,$start,$charset,$len);
	} elseif ($this->twoByteSets[$charset]) {
			// An omitted length must stay omitted: NULL*2 would be 0 and yield an empty string
		return $len === null ? substr($string,$start*2) : substr($string,$start*2,$len*2);
	} elseif ($this->fourByteSets[$charset]) {
		return $len === null ? substr($string,$start*4) : substr($string,$start*4,$len*4);
	}

		// treat everything else as single-byte encoding
	return $len === null ? substr($string,$start) : substr($string,$start,$len);
}
1436
/**
 * Counts the number of characters.
 *
 * mbstring or iconv are used if configured in $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'];
 * otherwise the count is derived by the class' own UTF-8/EUC routines or from the byte length.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @return	integer		The number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strlen($charset,$string) {
	$utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];

		// Delegate to an extension if one is configured
	if ($utils == 'mbstring') {
		return mb_strlen($string,$charset);
	}
	if ($utils == 'iconv') {
		return iconv_strlen($string,$charset);
	}

		// Variable width encodings need a real scan
	if ($charset == 'utf-8') {
		return $this->utf8_strlen($string);
	}
	if ($this->eucBasedSets[$charset]) {
		return $this->euc_strlen($string,$charset);
	}

		// Fixed width encodings: divide the byte length by the character width
	$bytes = strlen($string);
	if ($this->twoByteSets[$charset]) {
		return $bytes/2;
	}
	if ($this->fourByteSets[$charset]) {
		return $bytes/4;
	}

		// treat everything else as single-byte encoding
	return $bytes;
}
1464
/**
 * Truncates a string and pre-/appends a string.
 *
 * A positive length keeps the first $len characters and appends the crop signifier; a negative
 * length keeps the last abs($len) characters and prepends it. The string is returned untouched
 * if $len is 0 or if the string is not longer than the requested length.
 * Charsets other than UTF-8 and the EUC based sets are treated as single-byte encodings.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Length (in characters)
 * @param	string		Crop signifier
 * @return	string		The shortened string
 * @see substr(), mb_strimwidth()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function crop($charset,$string,$len,$crop='') {
	if (intval($len) == 0) return $string;

		// Translate the character position into a byte position
	if ($charset == 'utf-8') {
		$i = $this->utf8_char2byte_pos($string,$len);
	} elseif ($this->eucBasedSets[$charset]) {
		$i = $this->euc_char2byte_pos($string,$len,$charset);
	} else {
		if ($len > 0) {
			$i = $len;
		} else {
			$i = strlen($string)+$len;
			if ($i<=0) $i = false;
		}
	}

	if ($i === false) {	// $len outside actual string length
		return $string;
	}

		// Only crop if the cut position really lies inside the string. The bounds are compared
		// explicitly instead of reading a possibly non-existing (or negative) string offset.
	$byteLen = strlen($string);
	if ($len > 0) {
		if ($i < $byteLen) {
			return substr($string,0,$i).$crop;
		}
	} elseif ($i >= 1 && $i <= $byteLen) {
		return $crop.substr($string,$i);
	}

	return $string;
}
1519
/**
 * Cuts a string short at a given byte length.
 *
 * For multi-byte charsets the cut position is moved back where needed so that no character is
 * split: two-byte sets are cut at even positions, four-byte sets at multiples of four.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		The byte length
 * @return	string		The shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strtrunc($charset,$string,$len) {
	if ($len <= 0) return '';

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strcut($string,0,$len,$charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_strtrunc($string,$len);
	} elseif ($this->eucBasedSets[$charset]) {
			// The byte length has to be handed over, too (it was missing here).
			// NOTE(review): euc_strtrunc() is defined outside this excerpt; the argument order
			// (string, length, charset) mirrors euc_char2byte_pos() - confirm against its signature.
		return $this->euc_strtrunc($string,$len,$charset);
	} elseif ($this->twoByteSets[$charset]) {
		if ($len % 2) $len--;		// don't cut at odd positions
	} elseif ($this->fourByteSets[$charset]) {
		$x = $len % 4;
		$len -= $x;	// realign to position dividable by four
	}
		// treat everything else as single-byte encoding
	return substr($string,0,$len);
}
1548
/**
 * Translates all characters of a string into their respective case values.
 * Unlike strtolower() and strtoupper() this method is locale independent.
 * Note that the string length may change!
 * eg. lower case German sharp S becomes upper case "SS"
 * Real case folding is language dependent, this method ignores this fact.
 *
 * @param	string		Character set of string
 * @param	string		Input string to convert case for
 * @param	string		Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
 * @return	string		The converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper()
 */
function conv_case($charset,$string,$case) {
		// mbstring handles every charset itself
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return $case == 'toLower'
			? mb_strtolower($string,$charset)
			: mb_strtoupper($string,$charset);
	}

	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string,'case',$case);
	}

	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string,$charset,'case',$case);
	}

		// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string,$charset,'case',$case);
}
1582
/**
 * Converts special chars (like accented letters, umlauts etc) to ascii equivalents (usually double-bytes, like a-umlaut => ae etc.)
 *
 * @param	string		Character set of string
 * @param	string		Input string to convert
 * @return	string		The converted string
 */
function specCharsToASCII($charset,$string) {
	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string,'ascii');
	}

	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string,$charset,'ascii');
	}

		// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string,$charset,'ascii');
}
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614 /********************************************
1615 *
1616 * Internal string operation functions
1617 *
1618 ********************************************/
1619
/**
 * Maps all characters of a string in a single byte charset.
 *
 * The string is returned unchanged if the mode is unknown or if the mapping table for the
 * charset cannot be initialized.
 *
 * @param	string		the string
 * @param	string		the charset
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function sb_char_mapping($str,$charset,$mode,$opt='') {
	switch($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) return $str;	// do nothing
			$map =& $this->caseFolding[$charset][$opt];
		break;

		case 'ascii':
			if (!$this->initToASCII($charset)) return $str;	// do nothing
			$map =& $this->toASCII[$charset];
		break;

		default:
			return $str;
	}

		// Walk the bytes up to the real string length instead of probing for a missing offset
	$out = '';
	$byteLen = strlen($str);
	for ($i=0; $i<$byteLen; $i++) {
		$c = $str[$i];
			// bytes without an entry in the table are copied as they are
		$out .= isset($map[$c]) ? $map[$c] : $c;
	}

	return $out;
}
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668 /********************************************
1669 *
1670 * Internal UTF-8 string operation functions
1671 *
1672 ********************************************/
1673
/**
 * Returns a part of a UTF-8 string.
 *
 * @param	string		UTF-8 string
 * @param	integer		Start position (character position)
 * @param	integer		Length (in characters). If omitted, the rest of the string is returned.
 * @return	string		The substring; FALSE if a positive $start lies outside the string
 * @see substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_substr($str,$start,$len=null) {
		// An explicit length of zero always gives an empty string (NULL means "not given")
	if ($len !== null && !strcmp($len,'0')) return '';

	$byte_start = $this->utf8_char2byte_pos($str,$start);
	if ($byte_start === false) {
		if ($start > 0) {
			return false;	// $start outside string length
		}
			// a negative start reaching beyond the beginning means: start at the first byte
		$byte_start = 0;
	}

	$str = substr($str,$byte_start);

	if ($len == null) {
		return $str;
	}

	$byte_end = $this->utf8_char2byte_pos($str,$len);
	if ($byte_end === false) {	// $len outside actual string length
		return $len<0 ? '' : $str;	// When length is less than zero and exceeds, then we return blank string.
	}
	return substr($str,0,$byte_end);
}
1708
/**
 * Counts the number of characters of a string in UTF-8.
 *
 * Every byte that is not a continuation byte (10xxxxxx) starts a character, so single bytes
 * (0xxxxxxx) and multi-byte lead bytes (11xxxxxx) are counted.
 *
 * @param	string		UTF-8 multibyte character string
 * @return	integer		The number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strlen($str) {
	$n = 0;
	$byteLen = strlen($str);
	for ($i=0; $i<$byteLen; $i++) {
		if ((ord($str[$i]) & 0xC0) != 0x80) {	// anything but a continuation byte
			$n++;
		}
	}
	return $n;
}
1729
/**
 * Truncates a string in UTF-8 short at a given byte length.
 * A multibyte character that does not fit completely into the byte length is dropped.
 *
 * @param	string		UTF-8 multibyte character string
 * @param	integer		the byte length
 * @return	string		the shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strtrunc($str,$len) {
		// Cut position outside the string: there is no byte to inspect
	if ($len <= 0 || $len > strlen($str)) return substr($str,0,$len);

	$i = $len-1;
	if (ord($str[$i]) & 0x80) {	// part of a multibyte sequence
		for (; $i>0 && !(ord($str[$i]) & 0x40); $i--)	;	// find the first byte
			// Reached the start without finding a lead byte: malformed sequence.
			// (A lead byte AT position 0 is fine and must not be rejected.)
		if ($i == 0 && (ord($str[0]) & 0xC0) != 0xC0) return '';
		for ($bc=0, $mbs=ord($str[$i]); $mbs & 0x80; $mbs = $mbs << 1)	$bc++;	// calculate number of bytes
		if ($bc+$i > $len) return substr($str,0,$i);
			// fallthru: multibyte char fits into length
	}
	return substr($str,0,$len);
}
1750
/**
 * Find position of first occurrence of a string, both arguments are in UTF-8.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 string to search for
 * @param	integer		Position to start the search (character position)
 * @return	integer		The character position; FALSE if the needle was not found or the offset is outside the string
 * @see strpos()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strpos($haystack,$needle,$offset=0) {
		// Let an extension do the work if one is configured
	switch ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) {
		case 'mbstring':
			return mb_strpos($haystack,$needle,$offset,'utf-8');
		case 'iconv':
			return iconv_strpos($haystack,$needle,$offset,'utf-8');
	}

		// Search on byte level, starting at the byte the character offset translates to
	$byte_offset = $this->utf8_char2byte_pos($haystack,$offset);
	if ($byte_offset === false) {
		return false;	// offset beyond string length
	}

	$byte_pos = strpos($haystack,$needle,$byte_offset);

		// Translate the byte position of the hit back into a character position
	return $byte_pos === false ? false : $this->utf8_byte2char_pos($haystack,$byte_pos);
}
1776
/**
 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 character to search for (single character)
 * @return	integer		The character position; FALSE if the needle was not found
 * @see strrpos()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strrpos($haystack,$needle) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
			// the third argument of mb_strrpos() is the offset, the encoding comes fourth
		return mb_strrpos($haystack,$needle,0,'utf-8');
	} elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
		return iconv_strrpos($haystack,$needle,'utf-8');
	}

	$byte_pos = strrpos($haystack,$needle);
	if ($byte_pos === false) return false;	// needle not found

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
1798
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * For a position >= 0 the string is scanned from the start, for a negative position from the end.
 * FALSE is returned if the scan runs off the string before a byte position could be determined.
 *
 * @param	string		UTF-8 string
 * @param	integer		Character position (negative values start from the end)
 * @return	integer		Byte position (FALSE if the position is outside the string)
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char2byte_pos($str,$pos) {
	$byteLen = strlen($str);
	$n = 0;			// number of characters found
	$p = abs($pos);		// number of characters wanted

	if ($pos >= 0) {
		$i = 0;
		$d = 1;
	} else {
		$i = $byteLen-1;
		$d = -1;
	}

		// Bounds are checked explicitly; reading a non-existing or negative offset is not a
		// reliable end marker (negative offsets count from the end in newer PHP versions)
	for ( ; $i>=0 && $i<$byteLen && $n<$p; $i+=$d) {
		if ((ord($str[$i]) & 0xC0) != 0x80) {	// single byte or multi-byte starting byte
			$n++;
		}
	}
	if ($i<0 || $i>=$byteLen) return false;	// offset beyond string length

	if ($pos >= 0) {
			// skip trailing multi-byte data bytes
		while ($i<$byteLen && (ord($str[$i]) & 0xC0) == 0x80) { $i++; }
	} else {
			// correct offset
		$i++;
	}

	return $i;
}
1839
/**
 * Translates an 'absolute' byte position into a character position.
 *
 * Counts the character starting bytes (0xxxxxxx or 11xxxxxx) found at byte offsets
 * 1 up to and including the given position. For a string that begins on a character
 * boundary this is the index of the character the byte belongs to.
 *
 * @param	string		UTF-8 string
 * @param	integer		byte position
 * @return	integer		character position, or false if the string is empty or the byte position lies outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_byte2char_pos($str,$pos) {
	$size = strlen($str);

		// Reject invalid positions up front. Checking after the counting loop is too late,
		// because the loop index has already run down to 0 by then.
	if ($size == 0 || $pos < 0 || $pos > $size)	return false;	// offset beyond string length

	$n = 0;	// number of characters
	$i = $pos;
	if ($i == $size) {
			// A position directly behind the last byte counts the final character as well
		$n = 1;
		$i--;
	}

	for ( ; $i>0; $i--) {
		$c = ord($str[$i]);
		if (!($c & 0x80) || ($c & 0xC0) == 0xC0) {	// single-byte (0xxxxxx) or multi-byte starting byte (11xxxxxx)
			$n++;
		}
	}

	return $n;
}
1862
/**
 * Maps all characters of an UTF-8 string.
 *
 * The string is split into its UTF-8 characters; every character that has an entry in
 * the selected mapping table is replaced by that entry, all others are copied unchanged.
 * If the Unicode data for the mode cannot be initialized, or the mode is unknown, the
 * string is returned as it is.
 *
 * @param	string		UTF-8 string
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char_mapping($str,$mode,$opt='') {
	if (!$this->initUnicodeData($mode))	return $str;	// do nothing

	switch($mode) {
		case 'case':
			$map =& $this->caseFolding['utf-8'][$opt];
		break;

		case 'ascii':
			$map =& $this->toASCII['utf-8'];
		break;

		default:
			return $str;
	}

	$out = '';
	$size = strlen($str);
	for ($i=0; $i<$size; $i++) {
		$c = ord($str[$i]);
		if (($c & 0xC0) == 0xC0) {	// multi-byte starting byte (11xxxxxx)
			for ($bc=0; $c & 0x80; $c = $c << 1) { $bc++; }	// the number of leading 1-bits is the number of bytes
			$mbc = substr($str,$i,$bc);
			$i += $bc-1;
		} else {
				// single-byte (0xxxxxx), or a stray data byte (10xxxxxx) of malformed input.
				// The stray byte is passed through instead of repeating the previous character.
			$mbc = $str[$i];
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
/********************************************
 *
 * Internal EUC string operation functions
 *
 * Extended Unix Code:
 *  ASCII-compatible 7-bit single-byte characters
 *  8-bit double-byte characters
 *
 * Shift-JIS is treated as a special case.
 *
 ********************************************/
1937
/**
 * Cuts a string in the EUC charset family short at a given byte length.
 *
 * The result never exceeds the given number of bytes and never ends in the middle of
 * a double-byte character: if the limit would split a character, the cut is made one
 * byte earlier. A string that already fits is returned unchanged, a non-positive
 * length yields an empty string.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		the byte length
 * @param	string		the charset
 * @return	string		the shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strtrunc($str,$len,$charset) {
	if ($len <= 0)	return '';
	if (strlen($str) <= $len)	return $str;	// string not longer than supplied length

		// From here on the string is longer than $len, so every offset below $len exists
	$sjis = ($charset == 'shift_jis');
	for ($i=0; $i<$len; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))	$i++;	// advance a double-byte char
		} else {
			if ($c >= 0x80)	$i++;	// advance a double-byte char
		}
	}

		// $i is $len after a clean character boundary and $len+1 if the last character
		// started on the final permitted byte, in which case that byte is dropped
	return substr($str,0,($i > $len) ? $len-1 : $len);
}
1967
/**
 * Returns a part of a string in the EUC charset family.
 *
 * Start and length are given in characters and are translated into byte positions
 * with euc_char2byte_pos() before the string is cut.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		start position (character position)
 * @param	string		the charset
 * @param	integer		length (in characters); null returns everything from the start position on
 * @return	string		the substring, or false if the start position lies outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_substr($str,$start,$charset,$len=null) {
	$byte_start = $this->euc_char2byte_pos($str,$start,$charset);
	if ($byte_start === false)	return false;	// $start outside string length

	$str = substr($str,$byte_start);

		// An explicit length of zero must not be mistaken for "no length given",
		// which is what the loose comparison against null below would do
	if ($len === 0 || $len === '0')	return '';
	if ($len == null)	return $str;

	$byte_end = $this->euc_char2byte_pos($str,$len,$charset);
	if ($byte_end === false) {
			// $len outside actual string length: a positive length keeps everything,
			// a negative length cuts off more than there is and leaves nothing
		return ($len < 0) ? '' : $str;
	}

	return substr($str,0,$byte_end);
}
1993
/**
 * Counts the number of characters of a string in the EUC charset family.
 *
 * For 'shift_jis' a byte in the ranges 0x80-0x9F and 0xE0-0xFF starts a double-byte
 * character; for every other charset any byte from 0x80 upwards does.
 *
 * @param	string		EUC multibyte character string
 * @param	string		the charset
 * @return	integer		the number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strlen($str,$charset) {
	$sjis = ($charset == 'shift_jis');
	$size = strlen($str);	// byte length, used as loop bound instead of probing past the end of the string
	$n = 0;

	for ($i=0; $i<$size; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))	$i++;	// advance a double-byte char
		} else {
			if ($c >= 0x80)	$i++;	// advance a double-byte char
		}

		$n++;
	}

	return $n;
}
2020
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * The string is always scanned from its beginning, also for negative positions:
 * whether a byte is the first or the second byte of a double-byte character can only
 * be decided reliably in reading direction (a Shift-JIS trail byte may have any value
 * a single-byte character can have).
 *
 * @param	string		EUC multibyte character string
 * @param	integer		character position (negative values start from the end)
 * @param	string		the charset
 * @return	integer		byte position, or false if the position lies outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char2byte_pos($str,$pos,$charset) {
	$sjis = ($charset == 'shift_jis');
	$size = strlen($str);

	if ($pos >= 0) {
			// Skip $pos characters, stopping early at the end of the string
		$i = 0;
		for ($n=0; $i<$size && $n<$pos; $n++) {
			$c = ord($str[$i]);
			if ($sjis) {
				$i += (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) ? 2 : 1;
			} else {
				$i += ($c >= 0x80) ? 2 : 1;
			}
		}
		return ($i < $size) ? $i : false;	// false: offset beyond string length
	}

		// Negative position: record the byte offset of every character, then count back from the end
	$starts = array();
	for ($i=0; $i<$size; $i++) {
		$starts[] = $i;
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))	$i++;	// advance a double-byte char
		} else {
			if ($c >= 0x80)	$i++;	// advance a double-byte char
		}
	}

	$count = count($starts);
	$wanted = abs($pos);	// number of characters wanted
	if ($wanted > $count)	return false;	// offset beyond string length

	return $starts[$count-$wanted];
}
2060
/**
 * Maps all characters of a string in the EUC charset family.
 *
 * The string is split into single-byte and double-byte characters; every character
 * that has an entry in the selected mapping table is replaced by that entry, all
 * others are copied unchanged. If the table for the charset cannot be initialized,
 * or the mode is unknown, the string is returned as it is.
 *
 * @param	string		EUC multibyte character string
 * @param	string		the charset
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char_mapping($str,$charset,$mode,$opt='') {
	switch($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset))	return $str;	// do nothing
			$map =& $this->caseFolding[$charset][$opt];
		break;

		case 'ascii':
			if (!$this->initToASCII($charset))	return $str;	// do nothing
			$map =& $this->toASCII[$charset];
		break;

		default:
			return $str;
	}

	$sjis = ($charset == 'shift_jis');
	$size = strlen($str);	// byte length, used as loop bound instead of probing past the end of the string
	$out = '';
	for ($i=0; $i<$size; $i++) {
		$mbc = $str[$i];
		$c = ord($mbc);

			// Shift-JIS has single-byte characters in 0xA0-0xDF, all other charsets
			// treat every byte from 0x80 upwards as the start of a double-byte char
		if ($sjis) {
			$double = (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0));
		} else {
			$double = ($c >= 0x80);
		}

		if ($double) {	// a double-byte char
			$mbc = substr($str,$i,2);
			$i++;
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
2115
2116 }
2117
2118 if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']) {
2119 include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
2120 }
2121
2122 ?>