2 /***************************************************************
5 * (c) 2003-2008 Kasper Skaarhoj (kasperYYYY@typo3.com)
8 * This script is part of the Typo3 project. The Typo3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
17 * This script is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
22 * This copyright notice MUST APPEAR in all copies of the script!
23 ***************************************************************/
25 * Class for conversion between charsets.
29 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
30 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
33 * [CLASS/FUNCTION INDEX of SCRIPT]
38 * 488: function parse_charset($charset)
39 * 507: function get_locale_charset($locale)
41 * SECTION: Charset Conversion functions
42 * 560: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
43 * 600: function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0)
44 * 617: function utf8_encode($str,$charset)
45 * 663: function utf8_decode($str,$charset,$useEntityForNoChar=0)
46 * 706: function utf8_to_entities($str)
47 * 739: function entities_to_utf8($str,$alsoStdHtmlEnt=0)
48 * 773: function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
49 * 823: function UnumberToChar($cbyte)
50 * 868: function utf8CharToUnumber($str,$hex=0)
52 * SECTION: Init functions
53 * 911: function initCharset($charset)
54 * 973: function initUnicodeData($mode=null)
55 * 1198: function initCaseFolding($charset)
56 * 1260: function initToASCII($charset)
58 * SECTION: String operation functions
59 * 1331: function substr($charset,$string,$start,$len=null)
60 * 1384: function strlen($charset,$string)
61 * 1414: function crop($charset,$string,$len,$crop='')
62 * 1467: function strtrunc($charset,$string,$len)
63 * 1501: function conv_case($charset,$string,$case)
64 * 1527: function specCharsToASCII($charset,$string)
66 * SECTION: Internal string operation functions
67 * 1567: function sb_char_mapping($str,$charset,$mode,$opt='')
69 * SECTION: Internal UTF-8 string operation functions
70 * 1622: function utf8_substr($str,$start,$len=null)
71 * 1655: function utf8_strlen($str)
72 * 1676: function utf8_strtrunc($str,$len)
73 * 1698: function utf8_strpos($haystack,$needle,$offset=0)
74 * 1723: function utf8_strrpos($haystack,$needle)
75 * 1745: function utf8_char2byte_pos($str,$pos)
76 * 1786: function utf8_byte2char_pos($str,$pos)
77 * 1809: function utf8_char_mapping($str,$mode,$opt='')
79 * SECTION: Internal EUC string operation functions
80 * 1885: function euc_strtrunc($str,$len,$charset)
81 * 1914: function euc_substr($str,$start,$charset,$len=null)
82 * 1939: function euc_strlen($str,$charset)
83 * 1966: function euc_char2byte_pos($str,$pos,$charset)
84 * 2007: function euc_char_mapping($str,$charset,$mode,$opt='')
87 * (This index is automatically created/updated by the extension "extdeveval")
101 * Functions working on UTF-8 strings:
106 * - implode/explode/join
108 * Functions nearly working on UTF-8 strings:
110 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
111 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
112 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
113 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
115 * Functions NOT working on UTF-8 strings:
129 * Class for conversion between charsets
131 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
132 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
/**
 * Byte value emitted for characters that have no equivalent in the
 * target charset (63 is the ASCII code of "?").
 */
var $noCharByteVal = 63;

/**
 * Cache of parsed conversion tables, indexed by charset name.
 */
var $parsedCharsets = array();

/**
 * Cache of case folding data, indexed by charset name.
 */
var $caseFolding = array();

/**
 * Cache of charset-to-ASCII mappings, indexed by charset name.
 */
var $toASCII = array();
/**
 * Charsets that use exactly two bytes per character.
 * Only the keys are consulted (via isset()).
 */
var $twoByteSets = array(
	'ucs-2' => 1,	// 2-byte Unicode
);
/**
 * Charsets that use exactly four bytes per character.
 */
var $fourByteSets = array(
	'ucs-4'  => 1,	// 4-byte Unicode
	'utf-32' => 1,	// 4-byte Unicode (limited to the 21-bits of UTF-16)
);
/**
 * Charsets that use a scheme like the Extended Unix Code
 * (two bytes for characters above 127).
 */
var $eucBasedSets = array(
	'gb2312'    => 1,	// Chinese, simplified.
	'big5'      => 1,	// Chinese, traditional.
	'euc-kr'    => 1,	// Korean
	'shift_jis' => 1,	// Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
);
/**
 * Alternative charset names mapped to the canonical (lowercase) name
 * used by this class. Consulted by parse_charset().
 *
 * See http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
 * and http://czyborra.com/charsets/iso8859.html
 */
var $synonyms = array(
	'us-ascii'        => 'ascii',
	'cp819'           => 'iso-8859-1',
	'ibm819'          => 'iso-8859-1',
	'iso-ir-100'      => 'iso-8859-1',
	'iso-ir-109'      => 'iso-8859-2',
	'iso-ir-148'      => 'iso-8859-9',
	'iso-ir-199'      => 'iso-8859-14',
	'iso-ir-203'      => 'iso-8859-15',
	'csisolatin1'     => 'iso-8859-1',
	'csisolatin2'     => 'iso-8859-2',
	'csisolatin3'     => 'iso-8859-3',
	'csisolatin5'     => 'iso-8859-9',
	'csisolatin8'     => 'iso-8859-14',
	'csisolatin9'     => 'iso-8859-15',
	'csisolatingreek' => 'iso-8859-7',
	'iso-celtic'      => 'iso-8859-14',
	'latin1'          => 'iso-8859-1',
	'latin2'          => 'iso-8859-2',
	'latin3'          => 'iso-8859-3',
	'latin5'          => 'iso-8859-9',
	'latin6'          => 'iso-8859-10',
	'latin8'          => 'iso-8859-14',
	'latin9'          => 'iso-8859-15',
	'l1'              => 'iso-8859-1',
	'l2'              => 'iso-8859-2',
	'l3'              => 'iso-8859-3',
	'l5'              => 'iso-8859-9',
	'l6'              => 'iso-8859-10',
	'l8'              => 'iso-8859-14',
	'l9'              => 'iso-8859-15',
	'cyrillic'        => 'iso-8859-5',
	'arabic'          => 'iso-8859-6',
	'tis-620'         => 'iso-8859-11',
	'win874'          => 'windows-874',
	'win1250'         => 'windows-1250',
	'win1251'         => 'windows-1251',
	'win1252'         => 'windows-1252',
	'win1253'         => 'windows-1253',
	'win1254'         => 'windows-1254',
	'win1255'         => 'windows-1255',
	'win1256'         => 'windows-1256',
	'win1257'         => 'windows-1257',
	'win1258'         => 'windows-1258',
	'cp1250'          => 'windows-1250',
	'cp1251'          => 'windows-1251',
	'cp1252'          => 'windows-1252',
	'ms-ee'           => 'windows-1250',
	'ms-ansi'         => 'windows-1252',
	'ms-greek'        => 'windows-1253',
	'ms-turk'         => 'windows-1254',
	'winbaltrim'      => 'windows-1257',
	'koi-8ru'         => 'koi-8r',
	'macintosh'       => 'macroman',
	'euc-cn'          => 'gb2312',
	'x-euc-cn'        => 'gb2312',
	'sjis'            => 'shift_jis',
	'shift-jis'       => 'shift_jis',
	'cp932'           => 'shift_jis',
);
/**
 * Mapping of language identifiers to script (language family) names.
 *
 * The resulting script name is the key used to look up a charset in
 * $script_to_charset_unix / $script_to_charset_windows (see
 * get_locale_charset()). Keys must be lowercase because the locale is
 * lowercased before lookup.
 */
var $lang_to_script = array(
		// Two-letter ISO 639 language codes, see:
		// http://www.w3.org/WAI/ER/IG/ert/iso639.htm
		// http://www.loc.gov/standards/iso639-2/langcodes.html
		// http://www.unicode.org/onlinedat/languages.html
	'bg' => 'cyrillic',		// Bulgarian
	'bs' => 'east_european',	// Bosnian
	'cs' => 'east_european',	// Czech
	'da' => 'west_european',	// Danish
	'de' => 'west_european',	// German
	'es' => 'west_european',	// Spanish
	'eo' => 'unicode',		// Esperanto
	'eu' => 'west_european',	// Basque
	'fa' => 'arabic',		// Persian
	'fi' => 'west_european',	// Finnish
	'fo' => 'west_european',	// Faroese
	'fr' => 'west_european',	// French
	'ga' => 'west_european',	// Irish ("ga" is the ISO 639 code for Irish; formerly labelled Galician)
	'gl' => 'west_european',	// Galician
	'ge' => 'unicode',		// Georgian (non-standard code, kept for backwards compatibility)
	'ka' => 'unicode',		// Georgian
	'he' => 'hebrew',		// Hebrew (since 1998)
	'hi' => 'unicode',		// Hindi
	'hr' => 'east_european',	// Croatian
	'hu' => 'east_european',	// Hungarian
	'iw' => 'hebrew',		// Hebrew (til 1998)
	'is' => 'west_european',	// Icelandic
	'it' => 'west_european',	// Italian
	'kl' => 'west_european',	// Greenlandic
	'lt' => 'lithuanian',
	'lv' => 'west_european',	// Latvian/Lettish
	'nl' => 'west_european',	// Dutch
	'no' => 'west_european',	// Norwegian
	'pl' => 'east_european',	// Polish
	'pt' => 'west_european',	// Portuguese
	'ro' => 'east_european',	// Romanian
	'ru' => 'cyrillic',		// Russian
	'sk' => 'east_european',	// Slovak
	'sl' => 'east_european',	// Slovenian
	'sr' => 'cyrillic',		// Serbian
	'sv' => 'west_european',	// Swedish
	'sq' => 'albanian',		// Albanian
	'uk' => 'cyrillic',		// Ukrainian
	'vi' => 'vietnamese',
		// MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
		// http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
	'bgr' => 'cyrillic',		// Bulgarian
	'cat' => 'west_european',	// Catalan
	'chs' => 'simpl_chinese',
	'cht' => 'trad_chinese',
	'csy' => 'east_european',	// Czech
	'dan' => 'west_european',	// Danish
	'deu' => 'west_european',	// German
	'dea' => 'west_european',	// German (Austrian)
	'des' => 'west_european',	// German (Swiss)
	'ena' => 'west_european',	// English (Australian)
	'enc' => 'west_european',	// English (Canadian)
	'eng' => 'west_european',	// English
	'enz' => 'west_european',	// English (New Zealand)
	'enu' => 'west_european',	// English (United States)
	'euq' => 'west_european',	// Basque
	'fos' => 'west_european',	// Faroese
	'far' => 'arabic',		// Persian
	'fin' => 'west_european',	// Finnish
	'fra' => 'west_european',	// French
	'frb' => 'west_european',	// French (Belgian)
	'frc' => 'west_european',	// French (Canadian)
	'frs' => 'west_european',	// French (Swiss)
	'geo' => 'unicode',		// Georgian
	'glg' => 'west_european',	// Galician
	'hin' => 'unicode',		// Hindi
	'hun' => 'east_european',	// Hungarian
	'isl' => 'west_european',	// Icelandic (value was misspelled 'west_euorpean')
	'ita' => 'west_european',	// Italian
	'its' => 'west_european',	// Italian (Swiss)
	'lth' => 'lithuanian',
	'lvi' => 'west_european',	// Latvian/Lettish
	'msl' => 'west_european',	// Malay
	'nlb' => 'west_european',	// Dutch (Belgian)
	'nld' => 'west_european',	// Dutch
	'nor' => 'west_european',	// Norwegian (bokmal)
	'non' => 'west_european',	// Norwegian (nynorsk)
	'plk' => 'east_european',	// Polish
	'ptg' => 'west_european',	// Portuguese
	'ptb' => 'west_european',	// Portuguese (Brazil)
	'rom' => 'east_european',	// Romanian
	'rus' => 'cyrillic',		// Russian
	'slv' => 'east_european',	// Slovenian
	'sky' => 'east_european',	// Slovak
	'srl' => 'east_european',	// Serbian (Latin)
	'srb' => 'cyrillic',		// Serbian (Cyrillic)
	'esp' => 'west_european',	// Spanish (trad. sort)
	'esm' => 'west_european',	// Spanish (Mexican)
	'esn' => 'west_european',	// Spanish (internat. sort)
	'sve' => 'west_european',	// Swedish
	'sqi' => 'albanian',		// Albanian
	'ukr' => 'cyrillic',		// Ukrainian
		// English language names
	'albanian' => 'albanian',
	'arabic' => 'arabic',
	'basque' => 'west_european',
	'bosnian' => 'east_european',
	'bulgarian' => 'cyrillic',	// consistent with 'bg' and 'bgr' above (was 'east_european')
	'catalan' => 'west_european',
	'croatian' => 'east_european',
	'czech' => 'east_european',
	'danish' => 'west_european',
	'dutch' => 'west_european',
	'english' => 'west_european',
	'esperanto' => 'unicode',
	'estonian' => 'estonian',
	'faroese' => 'west_european',
	'finnish' => 'west_european',
	'french' => 'west_european',
	'galician' => 'west_european',
	'georgian' => 'unicode',
	'german' => 'west_european',
	'greenlandic' => 'west_european',
	'hebrew' => 'hebrew',
	'hindi' => 'unicode',
	'hungarian' => 'east_european',
	'icelandic' => 'west_european',
	'italian' => 'west_european',
	'latvian' => 'west_european',
	'lettish' => 'west_european',
	'lithuanian' => 'lithuanian',
	'malay' => 'west_european',
	'norwegian' => 'west_european',
	'persian' => 'arabic',
	'polish' => 'east_european',
	'portuguese' => 'west_european',
	'russian' => 'cyrillic',
	'romanian' => 'east_european',
	'serbian' => 'cyrillic',
	'slovak' => 'east_european',
	'slovenian' => 'east_european',
	'spanish' => 'west_european',
	'swedish' => 'west_european',
	'svedish' => 'west_european',	// historic misspelling of "swedish", kept for backwards compatibility
	'turkish' => 'turkish',
	'ukrainian' => 'cyrillic',
);
/**
 * Charset used for each script (language family) on Unix systems.
 * get_locale_charset() falls back to 'iso-8859-1' for scripts not
 * listed here.
 */
var $script_to_charset_unix = array(
	'west_european' => 'iso-8859-1',
	'estonian'      => 'iso-8859-1',
	'east_european' => 'iso-8859-2',
	'baltic'        => 'iso-8859-4',
	'cyrillic'      => 'iso-8859-5',
	'arabic'        => 'iso-8859-6',
	'greek'         => 'iso-8859-7',
	'hebrew'        => 'iso-8859-8',
	'turkish'       => 'iso-8859-9',
	'thai'          => 'iso-8859-11',	// = TIS-620
	'lithuanian'    => 'iso-8859-13',
	'chinese'       => 'gb2312',	// = euc-cn
	'japanese'      => 'euc-jp',
	'korean'        => 'euc-kr',
	'simpl_chinese' => 'gb2312',
	'trad_chinese'  => 'big5',
	'unicode'       => 'utf-8',
	'albanian'      => 'utf-8'
);
/**
 * Charset used for each script (language family) on Windows systems.
 * get_locale_charset() falls back to 'windows-1252' for scripts not
 * listed here.
 */
var $script_to_charset_windows = array(
	'east_european' => 'windows-1250',
	'cyrillic'      => 'windows-1251',
	'west_european' => 'windows-1252',
	'greek'         => 'windows-1253',
	'turkish'       => 'windows-1254',
	'hebrew'        => 'windows-1255',
	'arabic'        => 'windows-1256',
	'baltic'        => 'windows-1257',
	'estonian'      => 'windows-1257',
	'lithuanian'    => 'windows-1257',
	'vietnamese'    => 'windows-1258',
	'chinese'       => 'gb2312',
	'japanese'      => 'shift_jis',
	'simpl_chinese' => 'gb2312',
	'trad_chinese'  => 'big5',
	'albanian'      => 'windows-1250',
);
/**
 * Mapping of complete locale names to charsets.
 *
 * get_locale_charset() lowercases the locale before looking it up
 * here, so only all-lowercase keys can ever match.
 */
var $locale_to_charset = array(
	'japanese.euc' => 'euc-jp',
	'ja_jp.ujis' => 'euc-jp',
	'korean.euc' => 'euc-kr',
	'sr@latn' => 'iso-8859-2',	// lowercase form: the one that actually matches after strtolower()
	'sr@Latn' => 'iso-8859-2',	// original mixed-case key, kept for code that reads this array directly
);
/**
 * TYPO3 specific: system charset used for each TYPO3 system language.
 * Empty values mean "iso-8859-1".
 */
var $charSetArray = array(
	'cz' => 'windows-1250',
	'pl' => 'iso-8859-2',
	'si' => 'windows-1250',
	'tr' => 'iso-8859-9',
	'ru' => 'windows-1251',
	'ro' => 'iso-8859-2',
	'sk' => 'windows-1250',
	'lt' => 'windows-1257',
	'hr' => 'windows-1250',
	'hu' => 'iso-8859-2',
	'th' => 'iso-8859-11',
	'gr' => 'iso-8859-7',
	'bg' => 'windows-1251',
	'et' => 'iso-8859-4',
	'ar' => 'iso-8859-6',
	'ua' => 'windows-1251',
	'ca' => 'iso-8859-15',
	'ba' => 'iso-8859-2',
);
515 // TYPO3 specific: Array with the iso names used for each system language in TYPO3:
516 // Missing keys means: same as Typo3
517 var $isoArray = array(
/**
 * Normalizes a charset name: trims it, lowercases it and replaces a
 * known synonym by its canonical name.
 *
 * @param	string		Input charset
 * @return	string		Normalized charset
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function parse_charset($charset)	{
	$normalized = trim(strtolower($charset));

	return isset($this->synonyms[$normalized])
		? $this->synonyms[$normalized]
		: $normalized;
}
/**
 * Get the charset of a locale.
 *
 * Accepted locale forms:
 * ln_CN		language / country
 * ln_CN.cs		language / country / charset
 * ln_CN.cs@mod		language / country / charset / modifier
 *
 * Resolution order: exact match in $locale_to_charset, explicit charset
 * in the locale, the "euro" modifier, then the script of the language
 * mapped to a platform specific charset (TYPO3_OS decides which map).
 *
 * @param	string		Locale string
 * @return	string		Charset resolved for locale string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function get_locale_charset($locale)	{
	$locale = strtolower($locale);

		// exact locale specific charset?
	if (isset($this->locale_to_charset[$locale]))	{
		return $this->locale_to_charset[$locale];
	}

		// get modifier; array_pad() keeps list() from reading a missing offset when there is no "@"
	list($locale, $modifier) = array_pad(explode('@', $locale), 2, '');

		// locale contains charset: use it
	list($locale, $charset) = array_pad(explode('.', $locale), 2, '');
	if ($charset)	{
		return $this->parse_charset($charset);
	}

		// modifier is 'euro' (after charset check, because of xx.utf-8@euro)
	if ($modifier == 'euro')	{
		return 'iso-8859-15';
	}

		// get language; the country part is not used
	list($language) = explode('_', $locale);
	$script = isset($this->lang_to_script[$language]) ? $this->lang_to_script[$language] : '';

		// unknown languages/scripts fall back to the west european default of the platform
	if (TYPO3_OS == 'WIN')	{
		$cs = !empty($this->script_to_charset_windows[$script]) ? $this->script_to_charset_windows[$script] : 'windows-1252';
	} else {
		$cs = !empty($this->script_to_charset_unix[$script]) ? $this->script_to_charset_unix[$script] : 'iso-8859-1';
	}

	return $cs;
}
597 /********************************************
599 * Charset Conversion functions
601 ********************************************/
/**
 * Convert from one charset to another charset.
 *
 * When the target is UTF-8 or no entity fallback is requested, the
 * PHP extension selected in
 * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'] is tried
 * first; if it is not configured or reports failure, the built-in
 * table based conversion is used.
 *
 * @param	string		Input string
 * @param	string		From charset (the current charset of the string)
 * @param	string		To charset (the output charset wanted)
 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return	string		Converted string
 */
function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)	{
	if ($fromCS==$toCS)	{
		return $str;
	}

		// PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
	if ($toCS=='utf-8' || !$useEntityForNoChar)	{
		$method = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'];
		$converted = false;

		if ($method == 'mbstring')	{
			$converted = mb_convert_encoding($str,$toCS,$fromCS);	// returns false for unsupported charsets
		} elseif ($method == 'iconv')	{
			$converted = iconv($fromCS,$toCS.'//TRANSLIT',$str);
		} elseif ($method == 'recode')	{
			$converted = recode_string($fromCS.'..'.$toCS,$str);
		}

		if (false !== $converted)	{
			return $converted;
		}
			// otherwise: fallback to TYPO3 conversion
	}

	if ($fromCS!='utf-8')	{
		$str = $this->utf8_encode($str,$fromCS);
	}
	if ($toCS!='utf-8')	{
		$str = $this->utf8_decode($str,$toCS,$useEntityForNoChar);
	}

	return $str;
}
/**
 * Convert all elements in ARRAY from one charset to another charset.
 * Nested arrays are converted recursively.
 * NOTICE: Array is passed by reference and modified in place!
 *
 * @param	array		Input array, possibly multidimensional
 * @param	string		From charset (the current charset of the string)
 * @param	string		To charset (the output charset wanted)
 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return	void
 */
function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0)	{
	foreach (array_keys($array) as $key)	{
		if (is_array($array[$key]))	{
			$this->convArray($array[$key],$fromCS,$toCS,$useEntityForNoChar);
			continue;
		}
		$array[$key] = $this->conv($array[$key],$fromCS,$toCS,$useEntityForNoChar);
	}
}
/**
 * Converts $str from $charset to UTF-8
 *
 * Bytes 0-127 are passed through unchanged (except for two-byte
 * charsets, where every pair of bytes is looked up). Characters that
 * are not found in the conversion table are replaced by
 * chr($this->noCharByteVal).
 *
 * @param	string		String in local charset to convert to UTF-8
 * @param	string		Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return	string		Output string, converted to UTF-8. Nothing (NULL) is returned if the conversion table could not be initialized.
 */
function utf8_encode($str,$charset)	{

	if ($charset === 'utf-8')	return $str;

		// Charset is case-insensitive.
	if ($this->initCharset($charset))	{	// Parse conv. table if not already...
		$strLen = strlen($str);
		$outStr = '';

		for ($a=0; $a<$strLen; $a++)	{	// Traverse each char in string.
			$chr = substr($str,$a,1);
			$ord = ord($chr);

			if (isset($this->twoByteSets[$charset]))	{	// If the charset has two bytes per char
					// substr() instead of the removed $str{...} offset syntax; it also yields 0 (not an error) for a dangling last byte
				$ord2 = ord(substr($str,$a+1,1));
				$ord = $ord<<8 | $ord2;	// assume big endian

				if (isset($this->parsedCharsets[$charset]['local'][$ord]))	{	// If the local char-number was found in parsed conv. table then we use that
					$outStr .= $this->parsedCharsets[$charset]['local'][$ord];
				} else {
					$outStr .= chr($this->noCharByteVal);	// No char exists
				}
				$a++;	// skip the second byte of the pair
			} elseif ($ord>127)	{	// If char has value over 127 it's a multibyte char in UTF-8
				if (isset($this->eucBasedSets[$charset]))	{	// EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
					if ($charset != 'shift_jis' || ($ord < 0xA0 || $ord > 0xDF))	{	// Shift-JIS: chars between 160 and 223 are single byte
						$a++;
						$ord2 = ord(substr($str,$a,1));
						$ord = $ord*256 + $ord2;
					}
				}

				if (isset($this->parsedCharsets[$charset]['local'][$ord]))	{	// If the local char-number was found in parsed conv. table then we use that
					$outStr .= $this->parsedCharsets[$charset]['local'][$ord];
				} else {
					$outStr .= chr($this->noCharByteVal);	// No char exists
				}
			} else {
				$outStr .= $chr;	// ... otherwise it's just ASCII 0-127 and one byte. Transparent
			}
		}
		return $outStr;
	}
}
/**
 * Converts $str from UTF-8 to $charset
 *
 * UTF-8 sequences without an entry in the conversion table become a
 * numeric entity (if $useEntityForNoChar is set) or
 * chr($this->noCharByteVal). A continuation byte met outside of a
 * sequence also becomes chr($this->noCharByteVal).
 *
 * @param	string		String in UTF-8 to convert to local charset
 * @param	string		Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return	string		Output string, converted to local charset. Nothing (NULL) is returned if the conversion table could not be initialized.
 */
function utf8_decode($str,$charset,$useEntityForNoChar=0)	{
		// Parse conv. table if not already done; without a table there is nothing to return
	if (!$this->initCharset($charset))	{
		return;
	}

	$length = strlen($str);
	$result = '';

	for ($pos=0; $pos<$length; $pos++)	{
		$byte = substr($str,$pos,1);
		$code = ord($byte);

			// ASCII 0-127: one byte, transparent
		if ($code<=127)	{
			$result .= $byte;
			continue;
		}
			// A lead byte must have the 7th bit set; otherwise we are in the MIDDLE of a sequence
		if (!($code & 64))	{
			$result .= chr($this->noCharByteVal);
			continue;
		}

			// Collect one further byte for each additional high bit set in the lead byte
		$sequence = $byte;
		for ($n=0; $n<8; $n++)	{
			$code = $code << 1;
			if (!($code & 128))	{
				break;
			}
			$pos++;
			$sequence .= substr($str,$pos,1);
		}

		if (isset($this->parsedCharsets[$charset]['utf8'][$sequence]))	{
			$local = $this->parsedCharsets[$charset]['utf8'][$sequence];	// The local number
				// Numbers above 255 are written as two bytes (16bit word assumed)
			$result .= ($local>255)
				? chr(($local >> 8) & 255).chr($local & 255)
				: chr($local);
		} elseif ($useEntityForNoChar)	{
			$result .= '&#'.$this->utf8CharToUnumber($sequence,1).';';
		} else {
			$result .= chr($this->noCharByteVal);	// No char exists
		}
	}

	return $result;
}
/**
 * Converts all chars > 127 of a UTF-8 string to numeric (hexadecimal)
 * entities. A continuation byte met outside of a sequence is replaced
 * by chr($this->noCharByteVal).
 *
 * @param	string		Input string
 * @return	string		Output string
 */
function utf8_to_entities($str)	{
	$length = strlen($str);
	$result = '';

	for ($pos=0; $pos<$length; $pos++)	{
		$byte = substr($str,$pos,1);
		$code = ord($byte);

			// ASCII 0-127: one byte, transparent
		if ($code<=127)	{
			$result .= $byte;
			continue;
		}
			// A lead byte must have the 7th bit set; otherwise we are in the MIDDLE of a sequence
		if (!($code & 64))	{
			$result .= chr($this->noCharByteVal);
			continue;
		}

			// Collect one further byte for each additional high bit set in the lead byte
		$sequence = $byte;
		for ($n=0; $n<8; $n++)	{
			$code = $code << 1;
			if (!($code & 128))	{
				break;
			}
			$pos++;
			$sequence .= substr($str,$pos,1);
		}

		$result .= '&#'.$this->utf8CharToUnumber($sequence,1).';';
	}

	return $result;
}
/**
 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
 *
 * Anything of the form "&...;" that is neither a numeric entity nor
 * (with $alsoStdHtmlEnt) a known named entity is left untouched.
 *
 * @param	string		Input string, UTF-8
 * @param	boolean		If set, then all string-HTML entities (like &amp; or &pound;) will be converted as well
 * @return	string		Output string
 */
function entities_to_utf8($str,$alsoStdHtmlEnt=0)	{
	$trans_tbl = array();
	if ($alsoStdHtmlEnt)	{
			// The table values are handled as iso-8859-1 below.
			// NOTE(review): since PHP 5.4 get_html_translation_table() defaults to UTF-8; confirm whether
			// the encoding must be passed explicitly on the PHP versions this code has to support.
		$trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES));
	}

		// Split into alternating [text, entity name, text, entity name, ...]; replaces the removed ereg_replace() + token trick
	$parts = preg_split('/&([#[:alnum:]]*);/', $str, -1, PREG_SPLIT_DELIM_CAPTURE);
	if ($parts === false)	{
		return $str;	// regex engine failure: leave the input untouched
	}

	foreach ($parts as $k => $v)	{
		if ($k%2)	{	// odd indexes hold the entity names
			if (substr($v,0,1)=='#')	{	// Dec or hex entities:
				if (strtolower(substr($v,1,1))=='x')	{
					$parts[$k] = $this->UnumberToChar(hexdec(substr($v,2)));
				} else {
					$parts[$k] = $this->UnumberToChar(intval(substr($v,1)));
				}
			} elseif ($alsoStdHtmlEnt && isset($trans_tbl['&'.$v.';']))	{	// Other entities:
				$parts[$k] = $this->utf8_encode($trans_tbl['&'.$v.';'],'iso-8859-1');
			} else {	// No conversion:
				$parts[$k] = '&'.$v.';';
			}
		}
	}

	return implode('',$parts);
}
/**
 * Converts all chars in the input UTF-8 string into integer numbers returned in an array
 *
 * @param	string		Input string, UTF-8
 * @param	boolean		If set, then all HTML entities (named like &amp; or &pound;, or numeric like &#123; or &#x3f5d;) will be detected as characters.
 * @param	boolean		If set, then instead of integer numbers the real UTF-8 char is returned.
 * @return	array		Output array with the char numbers
 */
function utf8_to_numberarray($str,$convEntities=0,$retChar=0)	{
		// If entities must be registered as well...:
	if ($convEntities)	{
		$str = $this->entities_to_utf8($str,1);
	}

	$length = strlen($str);
	$result = array();

	for ($pos=0; $pos<$length; $pos++)	{
		$byte = substr($str,$pos,1);
		$code = ord($byte);

			// ASCII 0-127: one byte, transparent
		if ($code<=127)	{
			$result[] = $retChar ? chr($code) : $code;
			continue;
		}
			// A lead byte must have the 7th bit set; otherwise we are in the MIDDLE of a sequence
		if (!($code & 64))	{
			$result[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
			continue;
		}

			// Collect one further byte for each additional high bit set in the lead byte
		$sequence = $byte;
		for ($n=0; $n<8; $n++)	{
			$code = $code << 1;
			if (!($code & 128))	{
				break;
			}
			$pos++;
			$sequence .= substr($str,$pos,1);
		}

		$result[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
	}

	return $result;
}
/**
 * Converts a UNICODE number to a UTF-8 multibyte character
 * Algorithm based on script found at From: http://czyborra.com/utf/
 *
 * The binary representation of the character's integer value is spread
 * across the bytes, and the number of high bits set in the lead byte
 * announces the number of bytes in the multibyte sequence:
 *
 *  bytes | bits | representation
 *      1 |    7 | 0vvvvvvv
 *      2 |   11 | 110vvvvv 10vvvvvv
 *      3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
 *      4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * @param	integer		UNICODE integer
 * @return	string		UTF-8 multibyte character string
 * @see utf8CharToUnumber()
 */
function UnumberToChar($cbyte)	{
		// Single byte: plain ASCII
	if ($cbyte < 0x80)	{
		return chr($cbyte);
	}

		// Pick the lead byte marker and the number of continuation bytes
	if ($cbyte < 0x800)	{
		$lead = 0xC0;
		$trailing = 1;
	} elseif ($cbyte < 0x10000)	{
		$lead = 0xE0;
		$trailing = 2;
	} elseif ($cbyte < 0x200000)	{
		$lead = 0xF0;
		$trailing = 3;
	} elseif ($cbyte < 0x4000000)	{
		$lead = 0xF8;
		$trailing = 4;
	} elseif ($cbyte < 0x80000000)	{
		$lead = 0xFC;
		$trailing = 5;
	} else {	// Cannot express a 32-bit character in UTF-8
		return chr($this->noCharByteVal);
	}

		// Lead byte carries the topmost bits, each continuation byte the next six
	$char = chr($lead | ($cbyte >> (6 * $trailing)));
	for ($shift = 6 * ($trailing - 1); $shift >= 0; $shift -= 6)	{
		$char .= chr(0x80 | (($cbyte >> $shift) & 0x3F));
	}

	return $char;
}
/**
 * Converts a UTF-8 Multibyte character to a UNICODE number
 *
 * Only the first character of $str is evaluated. A first byte that is
 * not a multibyte lead byte is returned as its own byte value.
 *
 * @param	string		UTF-8 multibyte character string
 * @param	boolean		If set, then a hex. number is returned (as string, prefixed with "x").
 * @return	integer		UNICODE integer
 * @see UnumberToChar()
 */
function utf8CharToUnumber($str,$hex=0)	{
	$lead = ord(substr($str,0,1));	// First char

	if (($lead & 192) != 192)	{
			// Not a multibyte lead byte: the byte value is the number
		$number = $lead;
	} else {
		$payload = '';
		$shifted = $lead;
		for ($count=0; $count<8; $count++)	{
			$shifted = $shifted << 1;
			if (!($shifted & 128))	{	// no further high bit set: no more bytes in the sequence
				break;
			}
				// append the low six bits of the next continuation byte
			$payload .= substr('00000000'.decbin(ord(substr($str,$count+1,1))),-6);
		}
			// prepend the value bits of the lead byte
		$payload = substr('00000000'.decbin($lead),-(6-$count)).$payload;

		$number = bindec($payload);
	}

	if ($hex)	{
		return 'x'.dechex($number);
	}
	return $number;
}
948 /********************************************
952 ********************************************/
/**
 * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
 * This function is automatically called by the conversion functions
 *
 * The parsed table is stored in $this->parsedCharsets[$charset] with the
 * keys 'local' (local char number => UTF-8 char) and 'utf8' (UTF-8 char
 * => local char number) and is cached as a serialized file below
 * typo3temp/cs/. Only local char numbers above 127 are stored.
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param	string		The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return	integer		Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed.
 */
function initCharset($charset)	{
		// Only process if the charset is not yet loaded:
	if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset]))	{
		return 1;
	}

		// Conversion table filename:
	$charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';

		// If the conversion table is not found:
	if (!($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile)))	{
		return false;
	}

		// Cache file for charsets:
		// Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile))	{
			// NOTE(review): unserialize() of a file from typo3temp/ - assumes that directory is not writable by untrusted parties
		$this->parsedCharsets[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

		// Parse conversion table into lines:
	$lines = t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1);
		// Initialize the internal variable holding the conv. table:
	$this->parsedCharsets[$charset] = array('local'=>array(),'utf8'=>array());

		// PCRE replacement for the removed ereg() pattern of the "whitespaced" format
	$whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
	$detectedType = '';

		// traverse the lines:
	foreach ($lines as $value)	{
		if (!trim($value) || substr($value,0,1)=='#')	{	// Comment line or blanks are ignored.
			continue;
		}

			// Detect type if not done yet: (Done on first real line)
			// The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
		if (!$detectedType)	{
			$detectedType = preg_match($whitespacedPattern,$value) ? 'whitespaced' : 'ms-token';
		}

		$hexbyte = '';
		$utf8 = '';
		if ($detectedType=='ms-token')	{
				// preg_split() replaces the removed split(); array_pad() guards lines without "=" or ":"
			list($hexbyte,$utf8) = array_pad(preg_split('/=|:/',$value,3),2,'');
		} elseif ($detectedType=='whitespaced')	{
			$regA = array();
			if (preg_match($whitespacedPattern,$value,$regA))	{
				$hexbyte = $regA[1];
				$utf8 = 'U+'.$regA[2];
			}
		}

		$decval = hexdec(trim($hexbyte));
		if ($decval>127)	{
			$utf8decval = hexdec(substr(trim($utf8),2));	// strip the leading "U+"
			$this->parsedCharsets[$charset]['local'][$decval] = $this->UnumberToChar($utf8decval);
			$this->parsedCharsets[$charset]['utf8'][$this->parsedCharsets[$charset]['local'][$decval]] = $decval;
		}
	}

	if ($cacheFile)	{
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->parsedCharsets[$charset]));
	}

	return 2;
}
1018 * This function initializes all UTF-8 character data tables.
1020 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
1022 * @param string Mode ("case", "ascii", ...)
1023 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
1026 function initUnicodeData($mode=null
) {
// Loads from cache, or builds from the Unicode data files, the UTF-8 case folding table
// ($this->caseFolding['utf-8']) and the UTF-8 to-ASCII transliteration table ($this->toASCII['utf-8']).
// NOTE(review): ereg() and split() used below were removed in PHP 7.0, and the $line{0} string offset
// syntax was removed in PHP 8.0 - this code only runs on older PHP versions.
// Absolute paths of the two cache files below typo3temp/cs/:
1028 $cacheFileCase = t3lib_div
::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
1029 $cacheFileASCII = t3lib_div
::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');
1031 // Only process if the tables are not yet loaded
1034 if (is_array($this->caseFolding
['utf-8'])) return 1;
1036 // Use cached version if possible
1037 if ($cacheFileCase && @is_file
($cacheFileCase)) {
1038 $this->caseFolding
['utf-8'] = unserialize(t3lib_div
::getUrl($cacheFileCase));
1044 if (is_array($this->toASCII
['utf-8'])) return 1;
1046 // Use cached version if possible
1047 if ($cacheFileASCII && @is_file
($cacheFileASCII)) {
1048 $this->toASCII
['utf-8'] = unserialize(t3lib_div
::getUrl($cacheFileASCII));
// From here on both tables are built from the data files below PATH_t3lib.'unidata/'.
1054 // process main Unicode data file
1055 $unicodeDataFile = PATH_t3lib
.'unidata/UnicodeData.txt';
1056 if (!(t3lib_div
::validPathStr($unicodeDataFile) && @is_file
($unicodeDataFile))) return false
;
1058 $fh = fopen($unicodeDataFile,'rb');
1059 if (!$fh) return false
;
1061 // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
1062 // note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
1063 $this->caseFolding
['utf-8'] = array();
1064 $utf8CaseFolding =& $this->caseFolding
['utf-8']; // a shorthand
1065 $utf8CaseFolding['toUpper'] = array();
1066 $utf8CaseFolding['toLower'] = array();
1067 $utf8CaseFolding['toTitle'] = array();
// The temporary arrays below are keyed by "U+<hex code point>" strings.
1069 $decomposition = array(); // array of temp. decompositions
1070 $mark = array(); // array of chars that are marks (eg. composing accents)
1071 $number = array(); // array of chars that are numbers (eg. digits)
1072 $omit = array(); // array of chars to be omitted (eg. Russian hard sign)
1074 while (!feof($fh)) {
1075 $line = fgets($fh,4096);
1076 // has a lot of info
// Each line is a semicolon separated record; only code point, name, category, decomposition,
// numeric value and the upper/lower/title case mappings are picked out, the other fields are skipped.
1077 list($char,$name,$cat,,,$decomp,,,$num,,,,$upper,$lower,$title,) = split(';', rtrim($line));
1079 $ord = hexdec($char);
1080 if ($ord > 0xFFFF) break; // only process the BMP
1082 $utf8_char = $this->UnumberToChar($ord);
// Simple (one character to one character) case mappings:
1084 if ($upper) $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
1085 if ($lower) $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
1086 // store "title" only when different from "upper" (only a few)
1087 if ($title && $title != $upper) $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
1090 case 'M': // mark (accent, umlaut, ...)
1091 $mark["U+$char"] = 1;
1094 case 'N': // numeric value
1095 if ($ord > 0x80 && $num != '') $number["U+$char"] = $num;
1098 // accented Latin letters without "official" decomposition
1100 if (ereg('^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH',$name,$match) && !$decomp) {
// The base letter is taken from the character name; ASCII lower case letters are 32 above the upper case ones.
1101 $c = ord($match[2]);
1102 if ($match[1] == 'SMALL') $c +
= 32;
1104 $decomposition["U+$char"] = array(dechex($c));
// Split the decomposition field into an optional "<tag>" ($match[1]) and the code point list ($match[2]).
1109 if (ereg('(<.*>)? *(.+)',$decomp,$match)) {
1111 case '<circle>': // add parenthesis as circle replacement, eg (1)
1112 $match[2] = '0028 '.$match[2].' 0029';
1115 case '<square>': // add square brackets as square replacement, eg [1]
1116 $match[2] = '005B '.$match[2].' 005D';
1119 case '<compat>': // ignore multi char decompositions that start with a space
1120 if (ereg('^0020 ',$match[2])) continue 2;
1123 // ignore Arabic and vertical layout presentation decomposition
1131 $decomposition["U+$char"] = split(' ',$match[2]);
1136 // process additional Unicode data for casing (allow folded characters to expand into a sequence)
1137 $specialCasingFile = PATH_t3lib
.'unidata/SpecialCasing.txt';
1138 if (t3lib_div
::validPathStr($specialCasingFile) && @is_file
($specialCasingFile)) {
1139 $fh = fopen($specialCasingFile,'rb');
1141 while (!feof($fh)) {
1142 $line = fgets($fh,4096);
1143 if ($line{0} != '#' && trim($line) != '') {
1145 list($char,$lower,$title,$upper,$cond) = t3lib_div
::trimExplode(';', $line);
// Only entries whose condition field is empty or is just a trailing "#" comment are used.
1146 if ($cond == '' ||
$cond{0} == '#') {
1147 $utf8_char = $this->UnumberToChar(hexdec($char));
// Each mapping is a space separated list of code points; it is converted to a UTF-8 string
// and overwrites the simple mapping stored earlier for the same character.
1148 if ($char != $lower) {
1149 $arr = split(' ',$lower);
1150 for ($i=0; isset($arr[$i]); $i++
) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
1151 $utf8CaseFolding['toLower'][$utf8_char] = implode('',$arr);
1153 if ($char != $title && $title != $upper) {
1154 $arr = split(' ',$title);
1155 for ($i=0; isset($arr[$i]); $i++
) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
1156 $utf8CaseFolding['toTitle'][$utf8_char] = implode('',$arr);
1158 if ($char != $upper) {
1159 $arr = split(' ',$upper);
1160 for ($i=0; isset($arr[$i]); $i++
) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
1161 $utf8CaseFolding['toUpper'][$utf8_char] = implode('',$arr);
1170 // process custom decompositions
1171 $customTranslitFile = PATH_t3lib
.'unidata/Translit.txt';
1172 if (t3lib_div
::validPathStr($customTranslitFile) && @is_file
($customTranslitFile)) {
1173 $fh = fopen($customTranslitFile,'rb');
1175 while (!feof($fh)) {
1176 $line = fgets($fh,4096);
1177 if ($line{0} != '#' && trim($line) != '') {
1178 list($char,$translit) = t3lib_div
::trimExplode(';', $line);
// An empty transliteration flags the character for omission; custom entries replace any decomposition found so far.
1179 if (!$translit) $omit["U+$char"] = 1;
1180 $decomposition["U+$char"] = split(' ', $translit);
1188 // decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
// Work list expansion: code values are taken off the front of $to; a value that has a decomposition of
// its own is replaced by that decomposition, values flagged in $mark are dropped, all others are kept.
// Note that the while loop also ends as soon as array_shift() yields a falsy value.
1189 foreach($decomposition as $from => $to) {
1190 $code_decomp = array();
1192 while ($code_value = array_shift($to)) {
1193 if (isset($decomposition["U+$code_value"])) { // do recursive decomposition
1194 foreach(array_reverse($decomposition["U+$code_value"]) as $cv) {
1195 array_unshift($to, $cv);
1197 } elseif (!isset($mark["U+$code_value"])) { // remove mark
1198 array_push($code_decomp, $code_value);
1201 if (count($code_decomp) ||
isset($omit[$from])) {
1202 $decomposition[$from] = $code_decomp;
1204 unset($decomposition[$from]);
1208 // create ascii only mapping
1209 $this->toASCII
['utf-8'] = array();
1210 $ascii =& $this->toASCII
['utf-8'];
// Key = UTF-8 character, value = string of ASCII characters it is replaced with.
1212 foreach($decomposition as $from => $to) {
1213 $code_decomp = array();
1214 while ($code_value = array_shift($to)) {
1215 $ord = hexdec($code_value);
1217 continue 2; // skip decompositions containing non-ASCII chars
1219 array_push($code_decomp,chr($ord));
1221 $ascii[$this->UnumberToChar(hexdec($from))] = join('',$code_decomp);
1224 // add numeric decompositions
// A numeric value is only used when the character has no transliteration yet.
1225 foreach($number as $from => $to) {
1226 $utf8_char = $this->UnumberToChar(hexdec($from));
1227 if (!isset($ascii[$utf8_char])) {
1228 $ascii[$utf8_char] = $to;
// Store both tables serialized in the cache files so that later calls can unserialize them (see top of function).
1232 if ($cacheFileCase) {
1233 t3lib_div
::writeFileToTypo3tempDir($cacheFileCase,serialize($utf8CaseFolding));
1236 if ($cacheFileASCII) {
1237 t3lib_div
::writeFileToTypo3tempDir($cacheFileASCII,serialize($ascii));
1244 * This function initializes the folding table for a charset other than UTF-8.
1245 * This function is automatically called by the case folding functions.
1247 * @param string Charset for which to initialize case folding.
1248 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
1251 function initCaseFolding($charset) {
// Derives the case folding table for $charset from the UTF-8 case folding table and keeps it in $this->caseFolding[$charset].
1252 // Only process if the case table is not yet loaded:
1253 if (is_array($this->caseFolding
[$charset])) return 1;
1255 // Use cached version if possible
1256 $cacheFile = t3lib_div
::getFileAbsFileName('typo3temp/cs/cscase_'.$charset.'.tbl');
1257 if ($cacheFile && @is_file
($cacheFile)) {
1258 $this->caseFolding
[$charset] = unserialize(t3lib_div
::getUrl($cacheFile));
1262 // init UTF-8 conversion for this charset
1263 if (!$this->initCharset($charset)) {
1267 // UTF-8 case folding is used as the base conversion table
1268 if (!$this->initUnicodeData('case')) {
// Byte standing for "no character"; conversion results equal to it are not stored in the table.
1272 $nochar = chr($this->noCharByteVal
);
// Walk the charset's conversion table: $ci is the character's numeric code in $charset (not used below),
// $utf8 its UTF-8 representation. Each UTF-8 case mapping is converted back to $charset and stored
// only if the result is neither empty nor the "no character" byte.
1273 foreach ($this->parsedCharsets
[$charset]['local'] as $ci => $utf8) {
1274 // reconvert to charset (don't use chr() of numeric value, might be multi-byte)
1275 $c = $this->utf8_decode($utf8, $charset);
1277 // $cc = $this->conv($this->caseFolding['utf-8']['toUpper'][$utf8], 'utf-8', $charset);
1278 $cc = $this->utf8_decode($this->caseFolding
['utf-8']['toUpper'][$utf8], $charset);
1279 if ($cc != '' && $cc != $nochar) $this->caseFolding
[$charset]['toUpper'][$c] = $cc;
1281 // $cc = $this->conv($this->caseFolding['utf-8']['toLower'][$utf8], 'utf-8', $charset);
1282 $cc = $this->utf8_decode($this->caseFolding
['utf-8']['toLower'][$utf8], $charset);
1283 if ($cc != '' && $cc != $nochar) $this->caseFolding
[$charset]['toLower'][$c] = $cc;
1285 // $cc = $this->conv($this->caseFolding['utf-8']['toTitle'][$utf8], 'utf-8', $charset);
1286 $cc = $this->utf8_decode($this->caseFolding
['utf-8']['toTitle'][$utf8], $charset);
1287 if ($cc != '' && $cc != $nochar) $this->caseFolding
[$charset]['toTitle'][$c] = $cc;
1290 // add the ASCII case table
// 'a'..'z' and 'A'..'Z' are 32 apart in ASCII; these entries are written after (and therefore override) the loop above.
1291 for ($i=ord('a'); $i<=ord('z'); $i++
) {
1292 $this->caseFolding
[$charset]['toUpper'][chr($i)] = chr($i-32);
1294 for ($i=ord('A'); $i<=ord('Z'); $i++
) {
1295 $this->caseFolding
[$charset]['toLower'][chr($i)] = chr($i+
32);
// Store the finished table serialized in the cache file that is looked up at the top of this function.
1299 t3lib_div
::writeFileToTypo3tempDir($cacheFile,serialize($this->caseFolding
[$charset]));
1306 * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
1307 * This function is automatically called by the ASCII transliteration functions.
1309 * @param string Charset for which to initialize conversion.
1310 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
1313 function initToASCII($charset) {
// Derives the to-ASCII transliteration table for $charset from the UTF-8 one and keeps it in $this->toASCII[$charset].
1314 // Only process if the to-ASCII table is not yet loaded:
1315 if (is_array($this->toASCII
[$charset])) return 1;
1317 // Use cached version if possible
1318 $cacheFile = t3lib_div
::getFileAbsFileName('typo3temp/cs/csascii_'.$charset.'.tbl');
1319 if ($cacheFile && @is_file
($cacheFile)) {
1320 $this->toASCII
[$charset] = unserialize(t3lib_div
::getUrl($cacheFile));
1324 // init UTF-8 conversion for this charset
1325 if (!$this->initCharset($charset)) {
1329 // UTF-8/ASCII transliteration is used as the base conversion table
1330 if (!$this->initUnicodeData('ascii')) {
// NOTE(review): $nochar is assigned here but not used in any of the statements visible in this function.
1334 $nochar = chr($this->noCharByteVal
);
// Walk the charset's conversion table ($ci = numeric code in $charset, $utf8 = UTF-8 representation) and
// copy the transliteration of every character that has an entry in the UTF-8 table.
1335 foreach ($this->parsedCharsets
[$charset]['local'] as $ci => $utf8) {
1336 // reconvert to charset (don't use chr() of numeric value, might be multi-byte)
1337 $c = $this->utf8_decode($utf8, $charset);
1339 if (isset($this->toASCII
['utf-8'][$utf8])) {
1340 $this->toASCII
[$charset][$c] = $this->toASCII
['utf-8'][$utf8];
// Store the finished table serialized in the cache file that is looked up at the top of this function.
1345 t3lib_div
::writeFileToTypo3tempDir($cacheFile,serialize($this->toASCII
[$charset]));
1366 /********************************************
1368 * String operation functions
1370 ********************************************/
1373 * Returns a part of a string.
1374 * Unit-tested by Kasper (single byte charsets only)
1376 * @param string The character set
1377 * @param string Character string
1378 * @param integer Start position (character position)
1379 * @param integer Length (in characters)
1380 * @return string The substring
1381 * @see substr(), mb_substr()
1382 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
function substr($charset,$string,$start,$len=null)	{
		// An explicit length of 0 always gives an empty string
	if ($len===0)	return '';

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
			// cannot omit $len, when specifying charset
		if ($len === null)	{
			$enc = mb_internal_encoding();	// save internal encoding
			mb_internal_encoding($charset);
			$str = mb_substr($string,$start);
			mb_internal_encoding($enc);	// restore internal encoding

			return $str;
		}
		return mb_substr($string,$start,$len,$charset);
	} elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv')	{
			// cannot omit $len, when specifying charset
		if ($len === null)	{
			$enc = iconv_get_encoding('internal_encoding');	// save internal encoding
			iconv_set_encoding('internal_encoding',$charset);
			$str = iconv_substr($string,$start);
			iconv_set_encoding('internal_encoding',$enc);	// restore internal encoding

			return $str;
		}
		return iconv_substr($string,$start,$len,$charset);
	} elseif ($charset == 'utf-8')	{
		return $this->utf8_substr($string,$start,$len);
	} elseif ($this->eucBasedSets[$charset])	{
		return $this->euc_substr($string,$start,$charset,$len);
	} elseif ($this->twoByteSets[$charset])	{
			// Fixed width, two bytes per character. An omitted length must not be multiplied:
			// null*2 is 0, which would cut the result down to an empty string.
		return $len === null ? substr($string,$start*2) : substr($string,$start*2,$len*2);
	} elseif ($this->fourByteSets[$charset])	{
			// Fixed width, four bytes per character; same handling of an omitted length as above.
		return $len === null ? substr($string,$start*4) : substr($string,$start*4,$len*4);
	}

		// treat everything else as single-byte encoding
	return $len === null ? substr($string,$start) : substr($string,$start,$len);
}
1428 * Counts the number of characters.
1429 * Unit-tested by Kasper (single byte charsets only)
1431 * @param string The character set
1432 * @param string Character string
1433 * @return integer The number of characters
1435 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
function strlen($charset,$string)	{
		// Configured string backend: 'mbstring', 'iconv' or anything else for the built-in fallbacks
	$backend = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];

	if ($backend == 'mbstring')	{
		return mb_strlen($string,$charset);
	}
	if ($backend == 'iconv')	{
		return iconv_strlen($string,$charset);
	}

		// Variable width encodings are counted by the charset specific helpers
	if ($charset == 'utf-8')	{
		return $this->utf8_strlen($string);
	}
	if ($this->eucBasedSets[$charset])	{
		return $this->euc_strlen($string,$charset);
	}

		// Fixed width encodings: byte length divided by the character width
	$byteCount = strlen($string);
	if ($this->twoByteSets[$charset])	{
		return $byteCount/2;
	}
	if ($this->fourByteSets[$charset])	{
		return $byteCount/4;
	}

		// everything else is treated as single-byte encoding
	return $byteCount;
}
1456 * Truncates a string and pre-/appends a string.
1457 * Unit tested by Kasper
1459 * @param string The character set
1460 * @param string Character string
1461 * @param integer Length (in characters)
1462 * @param string Crop signifier
1463 * @return string The shortened string
1464 * @see substr(), mb_strimwidth()
1465 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1467 function crop($charset,$string,$len,$crop='') {
// A length that evaluates to 0 means "do not crop": the string is returned unchanged.
1468 if (intval($len) == 0) return $string;
// Translate the character count $len into a byte offset $i; utf-8 and EUC based charsets use their own position helpers.
1470 if ($charset == 'utf-8') {
1471 $i = $this->utf8_char2byte_pos($string,$len);
1472 } elseif ($this->eucBasedSets
[$charset]) {
1473 $i = $this->euc_char2byte_pos($string,$len,$charset);
// Offset measured from the end of the string ($len is added to the byte length); an offset at or before the
// start of the string is flagged with false. NOTE(review): presumably only reached for negative $len - confirm.
1478 $i = strlen($string)+
$len;
1479 if ($i<=0) $i = false
;
1483 if ($i === false
) { // $len outside actual string length
// Keep the first $i bytes and append the crop signifier, but only if there is a byte at offset $i (i.e. something is cut off).
1487 if (strlen($string{$i})) {
1488 return substr($string,0,$i).$crop;
// Keep the bytes from offset $i on and prepend the crop signifier, but only if there is a byte before offset $i.
1492 if (strlen($string{$i-1})) {
1493 return $crop.substr($string,$i);
1498 if (abs($len)<$this->strlen($charset,$string)) { // Has to use ->strlen() - otherwise multibyte strings ending with a multibyte char will return true here (which is not a catastrophe, but...)
1500 return substr($string,0,$i).$crop;
1502 return $crop.substr($string,$i);
1511 * Cuts a string short at a given byte length.
1513 * @param string The character set
1514 * @param string Character string
1515 * @param integer The byte length
1516 * @return string The shortened string
1518 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
function strtrunc($charset,$string,$len)	{
		// Nothing fits into a non-positive byte length
	if ($len <= 0)	return '';

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
		return mb_strcut($string,0,$len,$charset);
	} elseif ($charset == 'utf-8')	{
		return $this->utf8_strtrunc($string,$len);
	} elseif ($this->eucBasedSets[$charset])	{
			// euc_strtrunc() expects ($str,$len,$charset): the byte length has to be passed along,
			// otherwise the charset name ends up in the $len parameter.
		return $this->euc_strtrunc($string,$len,$charset);
	} elseif ($this->twoByteSets[$charset])	{
		if ($len % 2)	$len--;		// don't cut at odd positions
	} elseif ($this->fourByteSets[$charset])	{
		$len -= $len % 4;	// realign to position dividable by four
	}

		// treat everything else as single-byte encoding
		// (for the fixed width sets $len has been realigned to a character boundary above)
	return substr($string,0,$len);
}
1540 * Translates all characters of a string into their respective case values.
1541 * Unlike strtolower() and strtoupper() this method is locale independent.
1542 * Note that the string length may change!
1543 * eg. lower case German ß (sharp S) becomes upper case "SS"
1544 * Unit-tested by Kasper
1545 * Real case folding is language dependent, this method ignores this fact.
1547 * @param string Character set of string
1548 * @param string Input string to convert case for
1549 * @param string Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
1550 * @return string The converted string
1551 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1552 * @see strtolower(), strtoupper()
function conv_case($charset,$string,$case)	{
		// mbstring handles every charset itself; anything but "toLower" is treated as upper case conversion
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
		return $case == 'toLower' ? mb_strtolower($string,$charset) : mb_strtoupper($string,$charset);
	}

		// Otherwise the internal mapping tables are used, selected by charset family
	if ($charset == 'utf-8')	{
		return $this->utf8_char_mapping($string,'case',$case);
	}
	if (isset($this->eucBasedSets[$charset]))	{
		return $this->euc_char_mapping($string,$charset,'case',$case);
	}

		// everything else is treated as single-byte encoding
	return $this->sb_char_mapping($string,$charset,'case',$case);
}
1574 * Converts special chars (like æ, ø, å, umlauts etc) to ascii equivalents (usually double-bytes, like æ => ae etc.)
1576 * @param string Character set of string
1577 * @param string Input string to convert
1578 * @return string The converted string
function specCharsToASCII($charset,$string)	{
		// Pick the 'ascii' character mapping routine that matches the charset family
	if ($charset == 'utf-8')	{
		return $this->utf8_char_mapping($string,'ascii');
	}
	if (isset($this->eucBasedSets[$charset]))	{
		return $this->euc_char_mapping($string,$charset,'ascii');
	}

		// everything else is treated as single-byte encoding
	return $this->sb_char_mapping($string,$charset,'ascii');
}
1604 /********************************************
1606 * Internal string operation functions
1608 ********************************************/
1611 * Maps all characters of a string in a single byte charset.
1613 * @param string the string
1614 * @param string the charset
1615 * @param string mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
1616 * @param string 'case': conversion 'toLower' or 'toUpper'
1617 * @return string the converted string
1618 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1620 function sb_char_mapping($str,$charset,$mode,$opt='') {
// $map is bound by reference to the lookup table for the requested mapping: the case folding table of
// $charset for conversion $opt, or the to-ASCII table of $charset. If the table cannot be initialised
// the input string is returned unchanged.
1623 if (!$this->initCaseFolding($charset)) return $str; // do nothing
1624 $map =& $this->caseFolding
[$charset][$opt];
1628 if (!$this->initToASCII($charset)) return $str; // do nothing
1629 $map =& $this->toASCII
[$charset];
// Walk the string one byte at a time; the loop ends at the first offset that holds no byte.
1637 for($i=0; strlen($str{$i}); $i++
) {
1639 if (isset($map[$c])) {
1658 /********************************************
1660 * Internal UTF-8 string operation functions
1662 ********************************************/
1665 * Returns a part of a UTF-8 string.
1666 * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
1668 * @param string UTF-8 string
1669 * @param integer Start position (character position)
1670 * @param integer Length (in characters)
1671 * @return string The substring
1673 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1675 function utf8_substr($str,$start,$len=null
) {
// strcmp() compares $len as a string, so both 0 and '0' give an empty result while an omitted length (null) does not.
1676 if (!strcmp($len,'0')) return '';
// Convert the character position $start into a byte offset.
1678 $byte_start = $this->utf8_char2byte_pos($str,$start);
1679 if ($byte_start === false
) {
1681 return false
; // $start outside string length
// Drop everything before the start offset; $len is then resolved relative to the shortened string.
1687 $str = substr($str,$byte_start);
1690 $byte_end = $this->utf8_char2byte_pos($str,$len);
1691 if ($byte_end === false
) // $len outside actual string length
1692 return $len<0 ?
'' : $str; // When length is less than zero and exceeds, then we return blank string.
1694 return substr($str,0,$byte_end);
1700 * Counts the number of characters of a string in UTF-8.
1701 * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
1703 * @param string UTF-8 multibyte character string
1704 * @return integer The number of characters
1706 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1708 function utf8_strlen($str) {
// Scans the string byte by byte. The two tests below recognise the bytes that begin a character
// (plain ASCII bytes and multi-byte lead bytes); continuation bytes (10xxxxxx) match neither test.
1710 for($i=0; strlen($str{$i}); $i++
) {
1712 if (!($c & 0x80)) // single-byte (0xxxxxx)
1714 elseif (($c & 0xC0) == 0xC0) // multi-byte starting byte (11xxxxxx)
1721 * Truncates a string in UTF-8 short at a given byte length.
1723 * @param string UTF-8 multibyte character string
1724 * @param integer the byte length
1725 * @return string the shortened string
1727 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1729 function utf8_strtrunc($str,$len) {
// Only when the byte at index $i belongs to a multi-byte sequence does the cut position need checking.
1731 if (ord($str{$i}) & 0x80) { // part of a multibyte sequence
// Continuation bytes have bit 0x40 cleared (10xxxxxx): step back until a byte with that bit set is reached.
1732 for (; $i>0 && !(ord($str{$i}) & 0x40); $i--) ; // find the first byte
1733 if ($i <= 0) return ''; // sanity check
// The number of leading 1 bits of the first byte is the length of the sequence in bytes.
1734 for ($bc=0, $mbs=ord($str{$i}); $mbs & 0x80; $mbs = $mbs << 1) $bc++
; // calculate number of bytes
// If the sequence would reach beyond $len, cut in front of its first byte so that no partial character remains.
1735 if ($bc+
$i > $len) return substr($str,0,$i);
1736 // fallthru: multibyte char fits into length
1738 return substr($str,0,$len);
1742 * Find position of first occurrence of a string, both arguments are in UTF-8.
1744 * @param string UTF-8 string to search in
1745 * @param string UTF-8 string to search for
1746 * @param integer Position to start the search
1747 * @return integer The character position
1749 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
function utf8_strpos($haystack,$needle,$offset=0)	{
		// Delegate to the configured extension when one is selected
	switch ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'])	{
		case 'mbstring':
			return mb_strpos($haystack,$needle,$offset,'utf-8');
		case 'iconv':
			return iconv_strpos($haystack,$needle,$offset,'utf-8');
	}

		// Fallback: search on byte level and translate the positions
	$startByte = $this->utf8_char2byte_pos($haystack,$offset);
	if ($startByte === false)	{
		return false;	// offset beyond string length
	}

	$foundByte = strpos($haystack,$needle,$startByte);
	if ($foundByte === false)	{
		return false;	// needle not found
	}

		// Convert the byte position of the match back into a character position
	return $this->utf8_byte2char_pos($haystack,$foundByte);
}
1768 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
1770 * @param string UTF-8 string to search in
1771 * @param string UTF-8 character to search for (single character)
1772 * @return integer The character position
1774 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
function utf8_strrpos($haystack,$needle)	{
		// Delegate to the configured extension when one is selected
	switch ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'])	{
		case 'mbstring':
			return mb_strrpos($haystack,$needle,'utf-8');
		case 'iconv':
			return iconv_strrpos($haystack,$needle,'utf-8');
	}

		// Fallback: search on byte level, then translate the byte position into a character position
	$foundByte = strrpos($haystack,$needle);
	if ($foundByte === false)	{
		return false;	// needle not found
	}

	return $this->utf8_byte2char_pos($haystack,$foundByte);
}
1790 * Translates a character position into an 'absolute' byte position.
1791 * Unit tested by Kasper.
1793 * @param string UTF-8 string
1794 * @param integer Character position (negative values start from the end)
1795 * @return integer Byte position
1796 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1798 function utf8_char2byte_pos($str,$pos) {
1799 $n = 0; // number of characters found
1800 $p = abs($pos); // number of characters wanted
// Index of the last byte of the string. NOTE(review): presumably the starting point when $pos is negative - confirm.
1806 $i = strlen($str)-1;
// Scan in steps of $d until $p characters have been seen or an offset without a byte is reached.
1810 for( ; strlen($str{$i}) && $n<$p; $i+
=$d) {
1811 $c = (int)ord($str{$i});
1812 if (!($c & 0x80)) // single-byte (0xxxxxx)
1814 elseif (($c & 0xC0) == 0xC0) // multi-byte starting byte (11xxxxxx)
1817 if (!strlen($str{$i})) return false
; // offset beyond string length
1820 // skip trailing multi-byte data bytes
// (bytes with bit 0x80 set and bit 0x40 cleared, i.e. 10xxxxxx)
1821 while ((ord($str{$i}) & 0x80) && !(ord($str{$i}) & 0x40)) { $i++
; }
1831 * Translates an 'absolute' byte position into a character position.
1832 * Unit tested by Kasper.
1834 * @param string UTF-8 string
1835 * @param integer byte position
1836 * @return integer character position
1837 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1839 function utf8_byte2char_pos($str,$pos) {
1840 $n = 0; // number of characters
// Walk backwards from byte $pos down to byte 1 (byte 0 is not visited by this loop); the two tests
// recognise the bytes that begin a character, continuation bytes (10xxxxxx) match neither.
1841 for($i=$pos; $i>0; $i--) {
1842 $c = (int)ord($str{$i});
1843 if (!($c & 0x80)) // single-byte (0xxxxxx)
1845 elseif (($c & 0xC0) == 0xC0) // multi-byte starting byte (11xxxxxx)
1848 if (!strlen($str{$i})) return false
; // offset beyond string length
1854 * Maps all characters of an UTF-8 string.
1856 * @param string UTF-8 string
1857 * @param string mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
1858 * @param string 'case': conversion 'toLower' or 'toUpper'
1859 * @return string the converted string
1860 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1862 function utf8_char_mapping($str,$mode,$opt='') {
// The UTF-8 tables have to be available; otherwise the input string is returned unchanged.
1863 if (!$this->initUnicodeData($mode)) return $str; // do nothing
// $map is bound by reference to either the case folding table for conversion $opt or the to-ASCII table.
1868 $map =& $this->caseFolding
['utf-8'][$opt];
1872 $map =& $this->toASCII
['utf-8'];
// Walk the string byte by byte; the loop ends at the first offset that holds no byte.
1879 for($i=0; strlen($str{$i}); $i++
) {
1881 if (!($c & 0x80)) // single-byte (0xxxxxx)
1883 elseif (($c & 0xC0) == 0xC0) { // multi-byte starting byte (11xxxxxx)
// The number of leading 1 bits of the first byte gives the length of the sequence, which is then cut out as one character.
1884 for ($bc=0; $c & 0x80; $c = $c << 1) { $bc++
; } // calculate number of bytes
1885 $mbc = substr($str,$i,$bc);
// Look the whole (multi-byte) character up in the mapping table.
1889 if (isset($map[$mbc])) {
1916 /********************************************
1918 * Internal EUC string operation functions
1920 * Extended Unix Code:
1921 * ASCII compatible 7bit single bytes chars
1922 * 8bit two byte chars
1924 * Shift-JIS is treated as a special case.
1926 ********************************************/
1929 * Cuts a string in the EUC charset family short at a given byte length.
1931 * @param string EUC multibyte character string
1932 * @param integer the byte length
1933 * @param string the charset
1934 * @return string the shortened string
1936 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1938 function euc_strtrunc($str,$len,$charset) {
// Shift-JIS uses different first-byte ranges than the other charsets handled here, hence the flag.
1939 $sjis = ($charset == 'shift_jis');
// Walk over the string character by character until $len bytes are covered or the string ends.
1940 for ($i=0; strlen($str{$i}) && $i<$len; $i++
) {
// NOTE(review): presumably the first test below is the Shift-JIS variant and the second one the generic variant - confirm.
1943 if (($c >= 0x80 && $c < 0xA0) ||
($c >= 0xE0)) $i++
; // advance a double-byte char
1946 if ($c >= 0x80) $i++
; // advance a double-byte char
1949 if (!strlen($str{$i})) return $str; // string shorter than supplied length
// Cutting at $len would split a double-byte character, so one byte less is kept.
1952 return substr($str,0,$len-1); // we ended on a first byte
1954 return substr($str,0,$len);
1958 * Returns a part of a string in the EUC charset family.
1960 * @param string EUC multibyte character string
1961 * @param integer start position (character position)
1962 * @param string the charset
1963 * @param integer length (in characters)
1964 * @return string the substring
1965 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1967 function euc_substr($str,$start,$charset,$len=null
) {
// Convert the character position $start into a byte offset.
1968 $byte_start = $this->euc_char2byte_pos($str,$start,$charset);
1969 if ($byte_start === false
) return false
; // $start outside string length
// Drop everything before the start offset; $len is then resolved relative to the shortened string.
1971 $str = substr($str,$byte_start);
1974 $byte_end = $this->euc_char2byte_pos($str,$len,$charset);
1975 if ($byte_end === false
) // $len outside actual string length
1978 return substr($str,0,$byte_end);
1984 * Counts the number of characters of a string in the EUC charset family.
1986 * @param string EUC multibyte character string
1987 * @param string the charset
1988 * @return integer the number of characters
1990 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1992 function euc_strlen($str,$charset) {
// Shift-JIS uses different first-byte ranges than the other charsets handled here, hence the flag.
1993 $sjis = ($charset == 'shift_jis');
// Walk over the string; the first byte of a double-byte character makes the loop skip one extra byte.
1995 for ($i=0; strlen($str{$i}); $i++
) {
// NOTE(review): presumably the first test below is the Shift-JIS variant and the second one the generic variant - confirm.
1998 if (($c >= 0x80 && $c < 0xA0) ||
($c >= 0xE0)) $i++
; // advance a double-byte char
2001 if ($c >= 0x80) $i++
; // advance a double-byte char
2011 * Translates a character position into an 'absolute' byte position.
2013 * @param string EUC multibyte character string
2014 * @param integer character position (negative values start from the end)
2015 * @param string the charset
2016 * @return integer byte position
2017 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
2019 function euc_char2byte_pos($str,$pos,$charset) {
// Shift-JIS uses different first-byte ranges than the other charsets handled here, hence the flag.
2020 $sjis = ($charset == 'shift_jis');
2021 $n = 0; // number of characters seen
2022 $p = abs($pos); // number of characters wanted
// Index of the last byte of the string. NOTE(review): presumably the starting point when $pos is negative - confirm.
2028 $i = strlen($str)-1;
// Scan in steps of $d until $p characters have been seen or an offset without a byte is reached;
// a double-byte character moves the index one additional step in the same direction.
2032 for ( ; strlen($str{$i}) && $n<$p; $i+
=$d) {
2035 if (($c >= 0x80 && $c < 0xA0) ||
($c >= 0xE0)) $i+
=$d; // advance a double-byte char
2038 if ($c >= 0x80) $i+
=$d; // advance a double-byte char
2043 if (!strlen($str{$i})) return false
; // offset beyond string length
2045 if ($pos < 0) $i++
; // correct offset
2051 * Maps all characters of a string in the EUC charset family.
2053 * @param string EUC multibyte character string
2054 * @param string the charset
2055 * @param string mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
2056 * @param string 'case': conversion 'toLower' or 'toUpper'
2057 * @return string the converted string
2058 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
2060 function euc_char_mapping($str,$charset,$mode,$opt='') {
// $map is bound by reference to the lookup table for the requested mapping: the case folding table of
// $charset for conversion $opt, or the to-ASCII table of $charset. If the table cannot be initialised
// the input string is returned unchanged.
2063 if (!$this->initCaseFolding($charset)) return $str; // do nothing
2064 $map =& $this->caseFolding
[$charset][$opt];
2068 if (!$this->initToASCII($charset)) return $str; // do nothing
2069 $map =& $this->toASCII
[$charset];
// Shift-JIS uses different first-byte ranges than the other charsets handled here, hence the flag.
2076 $sjis = ($charset == 'shift_jis');
// Walk over the string; a double-byte character is cut out as a two byte string before the table lookup.
2078 for($i=0; strlen($str{$i}); $i++
) {
// NOTE(review): presumably the first test below is the Shift-JIS variant and the second one the generic variant - confirm.
2083 if (($c >= 0x80 && $c < 0xA0) ||
($c >= 0xE0)) { // a double-byte char
2084 $mbc = substr($str,$i,2);
2089 if ($c >= 0x80) { // a double-byte char
2090 $mbc = substr($str,$i,2);
2095 if (isset($map[$mbc])) {
2107 if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE
]['XCLASS']['t3lib/class.t3lib_cs.php']) {
2108 include_once($TYPO3_CONF_VARS[TYPO3_MODE
]['XCLASS']['t3lib/class.t3lib_cs.php']);