2 /***************************************************************
5 * (c) 2003-2004 Kasper Skaarhoj (kasperYYYY@typo3.com)
8 * This script is part of the Typo3 project. The Typo3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
17 * This script is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
22 * This copyright notice MUST APPEAR in all copies of the script!
23 ***************************************************************/
25 * Class for conversion between charsets.
29 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
30 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
33 * [CLASS/FUNCTION INDEX of SCRIPT]
38 * 502: function parse_charset($charset)
39 * 521: function get_locale_charset($locale)
41 * SECTION: Charset Conversion functions
42 * 574: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
43 * 614: function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0)
44 * 631: function utf8_encode($str,$charset)
45 * 678: function utf8_decode($str,$charset,$useEntityForNoChar=0)
46 * 721: function utf8_to_entities($str)
47 * 754: function entities_to_utf8($str,$alsoStdHtmlEnt=0)
48 * 788: function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
49 * 838: function UnumberToChar($cbyte)
50 * 883: function utf8CharToUnumber($str,$hex=0)
52 * SECTION: Init functions
53 * 926: function initCharset($charset)
54 * 988: function initUnicodeData($mode=null)
55 * 1213: function initCaseFolding($charset)
56 * 1275: function initToASCII($charset)
58 * SECTION: String operation functions
59 * 1346: function substr($charset,$string,$start,$len=null)
60 * 1384: function strlen($charset,$string)
61 * 1412: function crop($charset,$string,$len,$crop='')
62 * 1465: function strtrunc($charset,$string,$len)
63 * 1499: function conv_case($charset,$string,$case)
64 * 1525: function specCharsToASCII($charset,$string)
66 * SECTION: Internal string operation functions
67 * 1565: function sb_char_mapping($str,$charset,$mode,$opt='')
69 * SECTION: Internal UTF-8 string operation functions
70 * 1620: function utf8_substr($str,$start,$len=null)
71 * 1653: function utf8_strlen($str)
72 * 1674: function utf8_strtrunc($str,$len)
73 * 1696: function utf8_strpos($haystack,$needle,$offset=0)
74 * 1719: function utf8_strrpos($haystack,$needle)
75 * 1739: function utf8_char2byte_pos($str,$pos)
76 * 1780: function utf8_byte2char_pos($str,$pos)
77 * 1803: function utf8_char_mapping($str,$mode,$opt='')
79 * SECTION: Internal EUC string operation functions
80 * 1879: function euc_strtrunc($str,$len,$charset)
81 * 1908: function euc_substr($str,$start,$charset,$len=null)
82 * 1933: function euc_strlen($str,$charset)
83 * 1960: function euc_char2byte_pos($str,$pos,$charset)
84 * 2001: function euc_char_mapping($str,$charset,$mode,$opt='')
87 * (This index is automatically created/updated by the extension "extdeveval")
101 * Functions working on UTF-8 strings:
106 * - implode/explode/join
108 * Functions nearly working on UTF-8 strings:
110 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
111 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
112 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
113 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
115 * Functions NOT working on UTF-8 strings:
129 * Class for conversion between charsets
131 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
132 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
137 var $noCharByteVal=63; // ASCII Value for chars with no equivalent.
139 // This is the array where parsed conversion tables are stored (cached)
140 var $parsedCharsets=array();
142 // An array where case folding data will be stored (cached)
143 var $caseFolding=array();
145 // An array where charset-to-ASCII mappings are stored (cached)
146 var $toASCII=array();
148 // This tells the converter which charsets have two bytes per char:
149 var $twoByteSets=array(
150 'ucs-2'=>1, // 2-byte Unicode
153 // This tells the converter which charsets have four bytes per char:
154 var $fourByteSets=array(
155 'ucs-4'=>1, // 4-byte Unicode
156 'utf-32'=>1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
159 // This tells the converter which charsets use a scheme like the Extended Unix Code:
160 var $eucBasedSets=array(
161 'gb2312'=>1, // Chinese, simplified.
162 'big5'=>1, // Chinese, traditional.
163 'euc-kr'=>1, // Korean
164 'shift_jis'=>1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
167 // see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
168 // http://czyborra.com/charsets/iso8859.html
171 'us-ascii'=> 'ascii',
172 'cp819' => 'iso-8859-1',
173 'ibm819' => 'iso-8859-1',
174 'iso-ir-100' => 'iso-8859-1',
175 'iso-ir-109' => 'iso-8859-2',
176 'iso-ir-148' => 'iso-8859-9',
177 'iso-ir-199' => 'iso-8859-14',
178 'iso-ir-203' => 'iso-8859-15',
179 'csisolatin1' => 'iso-8859-1',
180 'csisolatin2' => 'iso-8859-2',
181 'csisolatin3' => 'iso-8859-3',
182 'csisolatin5' => 'iso-8859-9',
183 'csisolatin8' => 'iso-8859-14',
184 'csisolatin9' => 'iso-8859-15',
185 'csisolatingreek' => 'iso-8859-7',
186 'iso-celtic' => 'iso-8859-14',
187 'latin1' => 'iso-8859-1',
188 'latin2' => 'iso-8859-2',
189 'latin3' => 'iso-8859-3',
190 'latin5' => 'iso-8859-9',
191 'latin6' => 'iso-8859-10',
192 'latin8' => 'iso-8859-14',
193 'latin9' => 'iso-8859-15',
194 'l1' => 'iso-8859-1',
195 'l2' => 'iso-8859-2',
196 'l3' => 'iso-8859-3',
197 'l5' => 'iso-8859-9',
198 'l6' => 'iso-8859-10',
199 'l8' => 'iso-8859-14',
200 'l9' => 'iso-8859-15',
201 'cyrillic' => 'iso-8859-5',
202 'arabic' => 'iso-8859-6',
203 'tis-620' => 'iso-8859-11',
204 'win874' => 'windows-874',
205 'win1250' => 'windows-1250',
206 'win1251' => 'windows-1251',
207 'win1252' => 'windows-1252',
208 'win1253' => 'windows-1253',
209 'win1254' => 'windows-1254',
210 'win1255' => 'windows-1255',
211 'win1256' => 'windows-1256',
212 'win1257' => 'windows-1257',
213 'win1258' => 'windows-1258',
214 'cp1250' => 'windows-1250',
215 'cp1251' => 'windows-1251',
216 'cp1252' => 'windows-1252',
217 'ms-ee' => 'windows-1250',
218 'ms-ansi' => 'windows-1252',
219 'ms-greek' => 'windows-1253',
220 'ms-turk' => 'windows-1254',
221 'winbaltrim' => 'windows-1257',
222 'koi-8ru' => 'koi-8r',
226 'macintosh' => 'macroman',
227 'euc-cn' => 'gb2312',
228 'x-euc-cn' => 'gb2312',
234 'sjis' => 'shift_jis',
235 'shift-jis' => 'shift_jis',
236 'cp932' => 'shift_jis',
247 // mapping of iso-639:2 language codes to language (family) names
248 var $lang_to_langfamily=array(
249 // iso-639:2 language codes, see:
250 // http://www.w3.org/WAI/ER/IG/ert/iso639.htm
251 // http://www.unicode.org/onlinedat/languages.html
254 'cs' => 'east_european',
255 'da' => 'west_european',
256 'de' => 'west_european',
257 'es' => 'west_european',
259 'eu' => 'west_european',
260 'fi' => 'west_european',
261 'fr' => 'west_european',
263 'hr' => 'east_european',
264 'hu' => 'east_european',
266 'is' => 'west_european',
267 'it' => 'west_european',
269 'kl' => 'west_european',
271 'lt' => 'lithuanian',
272 'lv' => 'west_european', // Latvian/Lettish
273 'nl' => 'west_european',
274 'no' => 'west_european',
275 'pl' => 'east_european',
276 'pt' => 'west_european',
277 'ro' => 'east_european',
279 'sk' => 'east_european',
280 'sl' => 'east_european',
281 'sv' => 'west_european',
284 'vi' => 'vietnamese',
286 // MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
287 'chs' => 'simpl_chinese',
288 'cht' => 'trad_chinese',
289 'csy' => 'east_european',
290 'dan' => 'west_european',
291 'deu' => 'west_european',
292 'dea' => 'west_european',
293 'des' => 'west_european',
294 'ena' => 'west_european',
295 'enc' => 'west_european',
296 'eng' => 'west_european',
297 'enz' => 'west_european',
298 'enu' => 'west_european',
299 'nld' => 'west_european',
300 'nlb' => 'west_european',
301 'fin' => 'west_european',
302 'fra' => 'west_european',
303 'frb' => 'west_european',
304 'frc' => 'west_european',
305 'frs' => 'west_european',
307 'hun' => 'east_european',
308 'isl' => 'west_european',
309 'ita' => 'west_european',
310 'its' => 'west_european',
313 'nor' => 'west_european',
314 'non' => 'west_european',
315 'plk' => 'east_european',
316 'ptg' => 'west_european',
317 'ptb' => 'west_european',
318 'rus' => 'cyrillic',
319 'sky' => 'east_european',
320 'esp' => 'west_european',
321 'esm' => 'west_european',
322 'esn' => 'west_european',
323 'sve' => 'west_european',
325 // English language names
326 'bulgarian' => 'east_european',
327 'catalan' => 'west_european',
328 'croatian' => 'east_european',
329 'czech' => 'east_european',
330 'danish' => 'west_european',
331 'dutch' => 'west_european',
332 'english' => 'west_european',
333 'finnish' => 'west_european',
334 'french' => 'west_european',
335 'galician' => 'west_european',
336 'german' => 'west_european',
337 'hungarian' => 'east_european',
338 'icelandic' => 'west_european',
339 'italian' => 'west_european',
340 'latvian' => 'west_european',
341 'lettish' => 'west_european',
342 'norwegian' => 'west_european',
343 'polish' => 'east_european',
344 'portuguese' => 'west_european',
345 'russian' => 'cyrillic',
346 'romanian' => 'east_european',
347 'slovak' => 'east_european',
348 'slovenian' => 'east_european',
349 'spanish' => 'west_european',
350 'swedish' => 'west_european',
351 'turkish' => 'turkish',
352 'ukrainian' => 'cyrillic',
355 // mapping of language (family) names to charsets on Unix
356 var $lang_to_charset_unix=array(
357 'west_european' => 'iso-8859-1',
358 'estonian' => 'iso-8859-1',
359 'east_european' => 'iso-8859-2',
360 'baltic' => 'iso-8859-4',
361 'cyrillic' => 'iso-8859-5',
362 'arabic' => 'iso-8859-6',
363 'greek' => 'iso-8859-7',
364 'hebrew' => 'iso-8859-8',
365 'turkish' => 'iso-8859-9',
366 'thai' => 'iso-8859-11', // = TIS-620
367 'lithuanian' => 'iso-8859-13',
368 'chinese' => 'gb2312', // = euc-cn
369 'japanese' => 'euc-jp',
370 'korean' => 'euc-kr',
371 'simpl_chinese' => 'gb2312',
372 'trad_chinese' => 'big5',
376 // mapping of language (family) names to charsets on Windows
377 var $lang_to_charset_windows=array(
378 'east_european' => 'windows-1250',
379 'cyrillic' => 'windows-1251',
380 'west_european' => 'windows-1252',
381 'greek' => 'windows-1253',
382 'turkish' => 'windows-1254',
383 'hebrew' => 'windows-1255',
384 'arabic' => 'windows-1256',
385 'baltic' => 'windows-1257',
386 'estonian' => 'windows-1257',
387 'lithuanian' => 'windows-1257',
388 'vietnamese' => 'windows-1258',
391 'chinese' => 'gb2312',
392 'japanese' => 'shift_jis',
393 'simpl_chinese' => 'gb2312',
394 'trad_chinese' => 'big5',
397 // mapping of locale names to charsets
398 var $locale_to_charset=array(
399 'japanese.euc' => 'euc-jp',
400 'ja_jp.ujis' => 'euc-jp',
401 'korean.euc' => 'euc-kr',
407 // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
408 // Empty values means "iso-8859-1"
409 var $charSetArray = array(
417 'cz' => 'windows-1250',
418 'pl' => 'iso-8859-2',
419 'si' => 'windows-1250',
421 'tr' => 'iso-8859-9',
424 'ru' => 'windows-1251',
425 'ro' => 'iso-8859-2',
427 'sk' => 'windows-1250',
428 'lt' => 'windows-1257',
430 'hr' => 'windows-1250',
431 'hu' => 'iso-8859-2',
433 'th' => 'iso-8859-11',
434 'gr' => 'iso-8859-7',
437 'bg' => 'windows-1251',
439 'et' => 'iso-8859-4',
440 'ar' => 'iso-8859-6',
442 'ua' => 'windows-1251',
446 'ca' => 'iso-8859-15',
447 'ba' => 'iso-8859-2',
451 // TYPO3 specific: Array with the iso names used for each system language in TYPO3:
452 // An empty value means the iso name is the same as the TYPO3 language key
453 var $isoArray = array(
476 'gl' => '', // Greenlandic
491 'ba' => '', // Bosnian
496 * Normalizes a charset name: converts it to lowercase and maps known synonyms to their canonical name.
498 * @param string Input charset
499 * @return string Normalized charset
500 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
502 function parse_charset($charset) {
503 $charset = strtolower($charset);
504 if (isset($this->synonyms
[$charset])) $charset = $this->synonyms
[$charset];
510 * Get the charset of a locale.
513 * ln_CN language / country
514 * ln_CN.cs language / country / charset
515 * ln_CN.cs@mod language / country / charset / modifier
517 * @param string Locale string
518 * @return string Charset resolved for locale string
519 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
521 function get_locale_charset($locale) {
522 $locale = strtolower($locale);
524 // exact locale specific charset?
525 if (isset($this->locale_to_charset[$locale])) return $this->locale_to_charset[$locale];
528 list($locale,$modifier) = array_pad(explode('@',$locale),2,''); // the '@modifier' part is optional: pad so list() never reads a missing offset
530 // locale contains charset: use it
531 list($locale,$charset) = array_pad(explode('.',$locale),2,''); // the '.charset' part is optional as well
532 if ($charset) return $this->parse_charset($charset);
534 // modifier is 'euro' (after charset check, because of xx.utf-8@euro)
535 if ($modifier == 'euro') return 'iso-8859-15';
538 list($language,$country) = array_pad(explode('_',$locale),2,''); // the '_COUNTRY' part is optional; $country is not used further
539 if (isset($this->lang_to_langfamily[$language])) $language = $this->lang_to_langfamily[$language];
541 if (TYPO3_OS == 'WIN') {
542 $cs = isset($this->lang_to_charset_windows[$language]) ? $this->lang_to_charset_windows[$language] : ''; // unknown language: leave empty, the default below applies
544 $cs = isset($this->lang_to_charset_unix[$language]) ? $this->lang_to_charset_unix[$language] : ''; // unknown language: leave empty, the default below applies
547 return $cs ? $cs : 'iso-8859-1';
558 /********************************************
560 * Charset Conversion functions
562 ********************************************/
565 * Convert from one charset to another charset.
567 * @param string Input string
568 * @param string From charset (the current charset of the string)
569 * @param string To charset (the output charset wanted)
570 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
571 * @return string Converted string
574 function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) {
575 if ($fromCS==$toCS) return $str;
577 // PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
578 if ($toCS=='utf-8' ||
!$useEntityForNoChar) {
579 switch($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']) {
581 $conv_str = mb_convert_encoding($str,$toCS,$fromCS);
582 if (false !== $conv_str) return $conv_str; // returns false for unsupported charsets
586 $conv_str = iconv($fromCS,$toCS.'//TRANSLIT',$str);
587 if (false !== $conv_str) return $conv_str;
591 $conv_str = recode_string($fromCS.'..'.$toCS,$str);
592 if (false !== $conv_str) return $conv_str;
595 // fallback to TYPO3 conversion
598 if ($fromCS!='utf-8') $str=$this->utf8_encode($str,$fromCS);
599 if ($toCS!='utf-8') $str=$this->utf8_decode($str,$toCS,$useEntityForNoChar);
604 * Convert all elements in ARRAY from one charset to another charset.
605 * NOTICE: Array is passed by reference!
607 * @param string Input array, possibly multidimensional
608 * @param string From charset (the current charset of the string)
609 * @param string To charset (the output charset wanted)
610 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
614 function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0) {
615 foreach($array as $key => $value) {
616 if (is_array($array[$key])) {
617 $this->convArray($array[$key],$fromCS,$toCS,$useEntityForNoChar);
619 $array[$key] = $this->conv($array[$key],$fromCS,$toCS,$useEntityForNoChar);
625 * Converts $str from $charset to UTF-8
627 * @param string String in local charset to convert to UTF-8
628 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
629 * @return string Output string, converted to UTF-8
631 function utf8_encode($str,$charset) {
633 // Charset is case-insensitive.
634 if ($this->initCharset($charset)) { // Parse conv. table if not already...
635 $strLen = strlen($str);
638 for ($a=0;$a<$strLen;$a++) { // Traverse each char in string.
639 $chr=substr($str,$a,1);
641 if (isset($this->twoByteSets[$charset])) { // If the charset has two bytes per char
642 $ord2 = ord($str[$a+1]); // square-bracket offset: the curly-brace form is no longer valid in current PHP
643 $ord = ($ord<<8) | $ord2; // assume big endian; the two bytes must be OR-ed (an AND would mask the high byte away)
645 if (isset($this->parsedCharsets[$charset]['local'][$ord])) { // If the local char-number was found in parsed conv. table then we use that, otherwise the replacement byte $this->noCharByteVal
646 $outStr.=$this->parsedCharsets[$charset]['local'][$ord];
647 } else $outStr.=chr($this->noCharByteVal); // No char exists
649 } elseif ($ord>127) { // Byte value over 127: not plain ASCII in the local charset (high single byte or lead byte of a two-byte char)
650 if (isset($this->eucBasedSets[$charset]) && ($charset != 'shift_jis' || $ord<160 || $ord>223)) { // EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int. Shift-JIS is listed as EUC-based too, but its bytes 160-223 are single-byte chars and must not swallow the next byte.
652 $ord2=ord(substr($str,$a,1));
653 $ord = $ord*256+$ord2;
655 elseif ($charset == 'shift_jis' && ($ord <160 || $ord>223)) { // Shift-JIS two-byte case; already handled by the branch above because shift_jis is part of $this->eucBasedSets
657 $ord2=ord(substr($str,$a,1));
658 $ord = $ord*256+$ord2;
661 if (isset($this->parsedCharsets[$charset]['local'][$ord])) { // If the local char-number was found in parsed conv. table then we use that, otherwise the replacement byte $this->noCharByteVal
662 $outStr.=$this->parsedCharsets[$charset]['local'][$ord];
663 } else $outStr.=chr($this->noCharByteVal); // No char exists
664 } else $outStr.=$chr; // ... otherwise it's just ASCII 0-127 and one byte. Transparent
671 * Converts $str from UTF-8 to $charset
673 * @param string String in UTF-8 to convert to local charset
674 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
675 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
676 * @return string Output string, converted to local charset
678 function utf8_decode($str,$charset,$useEntityForNoChar=0) {
680 // Charset is case-insensitive.
681 if ($this->initCharset($charset)) { // Parse conv. table if not already...
682 $strLen = strlen($str);
685 for ($a=0,$i=0;$a<$strLen;$a++
,$i++
) { // Traverse each char in UTF-8 string.
686 $chr=substr($str,$a,1);
688 if ($ord>127) { // This means multibyte! (first byte!)
689 if ($ord & 64) { // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
691 $buf=$chr; // Add first byte
692 for ($b=0;$b<8;$b++
) { // for each byte in multibyte string...
693 $ord = $ord << 1; // Shift it left and ...
694 if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence.
695 $a++
; // Increase pointer...
696 $buf.=substr($str,$a,1); // ... and add the next char.
700 if (isset($this->parsedCharsets
[$charset]['utf8'][$buf])) { // If the UTF-8 char-sequence is found then...
701 $mByte = $this->parsedCharsets
[$charset]['utf8'][$buf]; // The local number
702 if ($mByte>255) { // If the local number is greater than 255 we will need to split the byte (16bit word assumed) in two chars.
703 $outStr.= chr(($mByte >> 8) & 255).chr($mByte & 255);
704 } else $outStr.= chr($mByte);
705 } elseif ($useEntityForNoChar) { // Create num entity:
706 $outStr.='&#'.$this->utf8CharToUnumber($buf,1).';';
707 } else $outStr.=chr($this->noCharByteVal
); // No char exists
708 } else $outStr.=chr($this->noCharByteVal
); // No char exists (MIDDLE of MB sequence!)
709 } else $outStr.=$chr; // ... otherwise it's just ASCII 0-127 and one byte. Transparent
716 * Converts all chars > 127 to numeric entities.
718 * @param string Input string
719 * @return string Output string
721 function utf8_to_entities($str) {
722 $strLen = strlen($str);
725 for ($a=0;$a<$strLen;$a++
) { // Traverse each char in UTF-8 string.
726 $chr=substr($str,$a,1);
728 if ($ord>127) { // This means multibyte! (first byte!)
729 if ($ord & 64) { // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
730 $buf=$chr; // Add first byte
731 for ($b=0;$b<8;$b++
) { // for each byte in multibyte string...
732 $ord = $ord << 1; // Shift it left and ...
733 if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence.
734 $a++
; // Increase pointer...
735 $buf.=substr($str,$a,1); // ... and add the next char.
739 $outStr.='&#'.$this->utf8CharToUnumber($buf,1).';';
740 } else $outStr.=chr($this->noCharByteVal
); // No char exists (MIDDLE of MB sequence!)
741 } else $outStr.=$chr; // ... otherwise it's just ASCII 0-127 and one byte. Transparent
748 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
750 * @param string Input string, UTF-8
751 * @param boolean If set, then all string-HTML entities (like &amp; or &pound;) will be converted as well
752 * @return string Output string
754 function entities_to_utf8($str,$alsoStdHtmlEnt=0) {
755 if ($alsoStdHtmlEnt) {
756 $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES)); // Getting them in iso-8859-1 - but thats ok since this is observed below. NOTE(review): newer PHP versions default this table to UTF-8 - confirm the encoding on the target PHP version.
759 $token = md5(microtime());
760 $parts = explode($token,preg_replace('/(&([#[:alnum:]]*);)/',$token.'${2}'.$token,$str)); // PCRE replaces the removed ereg_replace(); '${2}' keeps the backreference from merging with a leading digit of $token
761 foreach($parts as $k => $v) {
763 if (substr($v,0,1)=='#') { // Dec or hex entities:
764 if (substr($v,1,1)=='x') {
765 $parts[$k] = $this->UnumberToChar(hexdec(substr($v,2)));
767 $parts[$k] = $this->UnumberToChar(substr($v,1));
769 } elseif ($alsoStdHtmlEnt && isset($trans_tbl['&'.$v.';'])) { // Other entities (isset avoids reading an undefined index for unknown names):
770 $parts[$k] = $this->utf8_encode($trans_tbl['&'.$v.';'],'iso-8859-1');
771 } else { // No conversion:
772 $parts[$k] ='&'.$v.';';
777 return implode('',$parts);
781 * Converts all chars in the input UTF-8 string into integer numbers returned in an array
783 * @param string Input string, UTF-8
784 * @param boolean If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
785 * @param boolean If set, then instead of integer numbers the real UTF-8 char is returned.
786 * @return array Output array with the char numbers
788 function utf8_to_numberarray($str,$convEntities=0,$retChar=0) {
789 // If entities must be registered as well...:
791 $str = $this->entities_to_utf8($str,1);
794 $strLen = strlen($str);
797 for ($a=0;$a<$strLen;$a++
) { // Traverse each char in UTF-8 string.
798 $chr=substr($str,$a,1);
800 if ($ord>127) { // This means multibyte! (first byte!)
801 if ($ord & 64) { // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
802 $buf=$chr; // Add first byte
803 for ($b=0;$b<8;$b++
) { // for each byte in multibyte string...
804 $ord = $ord << 1; // Shift it left and ...
805 if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence.
806 $a++
; // Increase pointer...
807 $buf.=substr($str,$a,1); // ... and add the next char.
811 $outArr[]=$retChar?
$buf:$this->utf8CharToUnumber($buf);
812 } else $outArr[]=$retChar?
chr($this->noCharByteVal
):$this->noCharByteVal
; // No char exists (MIDDLE of MB sequence!)
813 } else $outArr[]=$retChar?
chr($ord):$ord; // ... otherwise it's just ASCII 0-127 and one byte. Transparent
820 * Converts a UNICODE number to a UTF-8 multibyte character
821 * Algorithm based on script found at From: http://czyborra.com/utf/
822 * Unit-tested by Kasper
824 * The binary representation of the character's integer value is thus simply spread across the bytes and the number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
826 * bytes | bits | representation
828 * 2 | 11 | 110vvvvv 10vvvvvv
829 * 3 | 16 | 1110vvvv 10vvvvvv 10vvvvvv
830 * 4 | 21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
831 * 5 | 26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
832 * 6 | 31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
834 * @param integer UNICODE integer
835 * @return string UTF-8 multibyte character string
836 * @see utf8CharToUnumber()
838 function UnumberToChar($cbyte) {
843 } else if ($cbyte < 0x800) {
844 $str.=chr(0xC0 |
($cbyte >> 6));
845 $str.=chr(0x80 |
($cbyte & 0x3F));
846 } else if ($cbyte < 0x10000) {
847 $str.=chr(0xE0 |
($cbyte >> 12));
848 $str.=chr(0x80 |
(($cbyte >> 6) & 0x3F));
849 $str.=chr(0x80 |
($cbyte & 0x3F));
850 } else if ($cbyte < 0x200000) {
851 $str.=chr(0xF0 |
($cbyte >> 18));
852 $str.=chr(0x80 |
(($cbyte >> 12) & 0x3F));
853 $str.=chr(0x80 |
(($cbyte >> 6) & 0x3F));
854 $str.=chr(0x80 |
($cbyte & 0x3F));
855 } else if ($cbyte < 0x4000000) {
856 $str.=chr(0xF8 |
($cbyte >> 24));
857 $str.=chr(0x80 |
(($cbyte >> 18) & 0x3F));
858 $str.=chr(0x80 |
(($cbyte >> 12) & 0x3F));
859 $str.=chr(0x80 |
(($cbyte >> 6) & 0x3F));
860 $str.=chr(0x80 |
($cbyte & 0x3F));
861 } else if ($cbyte < 0x80000000) {
862 $str.=chr(0xFC |
($cbyte >> 30));
863 $str.=chr(0x80 |
(($cbyte >> 24) & 0x3F));
864 $str.=chr(0x80 |
(($cbyte >> 18) & 0x3F));
865 $str.=chr(0x80 |
(($cbyte >> 12) & 0x3F));
866 $str.=chr(0x80 |
(($cbyte >> 6) & 0x3F));
867 $str.=chr(0x80 |
($cbyte & 0x3F));
868 } else { // Cannot express a 32-bit character in UTF-8
869 $str .= chr($this->noCharByteVal
);
875 * Converts a UTF-8 Multibyte character to a UNICODE number
876 * Unit-tested by Kasper
878 * @param string UTF-8 multibyte character string
879 * @param boolean If set, then a hex. number is returned.
880 * @return integer UNICODE integer
881 * @see UnumberToChar()
883 function utf8CharToUnumber($str,$hex=0) {
884 $ord=ord(substr($str,0,1)); // Byte value of the first (lead) byte
886 if (($ord & 192) == 192) { // This verifies that it IS a multi byte string: the two highest bits of the lead byte are set (11xxxxxx)
888 for ($b=0;$b<8;$b++
) { // for each byte in multibyte string...
889 $ord = $ord << 1; // Shift it left and ...
890 if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence.
891 $binBuf.=substr('00000000'.decbin(ord(substr($str,$b+
1,1))),-6); // Append the low 6 bits (the payload) of the following byte as a binary string
894 $binBuf=substr('00000000'.decbin(ord(substr($str,0,1))),-(6-$b)).$binBuf; // Prepend the low (6-$b) bits of the lead byte
896 $int = bindec($binBuf); // Binary string -> integer code point
899 return $hex ?
'x'.dechex($int) : $int; // Hex form is prefixed with 'x' so callers can wrap it directly as '&#' ... ';'
910 /********************************************
914 ********************************************/
917 * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
918 * This function is automatically called by the conversion functions
920 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
922 * @param string The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
923 * @return integer Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed.
926 function initCharset($charset) {
927 // Only process if the charset is not yet loaded:
928 if (!isset($this->parsedCharsets[$charset]) || !is_array($this->parsedCharsets[$charset])) { // isset first: avoids reading an undefined index for a charset seen for the first time
930 // Conversion table filename:
931 $charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';
933 // If the conversion table is found:
934 if ($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile)) {
935 // Cache file for charsets:
936 // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
937 $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_'.$charset.'.tbl');
938 if ($cacheFile && @is_file($cacheFile)) {
939 $this->parsedCharsets[$charset]=unserialize(t3lib_div::getUrl($cacheFile));
941 // Parse conversion table into lines:
942 $lines=t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1);
943 // Initialize the internal variable holding the conv. table:
944 $this->parsedCharsets[$charset]=array('local'=>array(),'utf8'=>array());
945 // traverse the lines:
947 foreach($lines as $value) {
948 if (trim($value) && substr($value,0,1)!='#') { // Comment line or blanks are ignored.
950 // Detect type if not done yet: (Done on first real line)
951 // The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
952 if (!$detectedType) $detectedType = preg_match('/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/',$value) ? 'whitespaced' : 'ms-token'; // PCRE replaces the removed ereg()
954 if ($detectedType=='ms-token') {
955 list($hexbyte,$utf8) = preg_split('/=|:/',$value,3); // PCRE replaces the removed split()
956 } elseif ($detectedType=='whitespaced') {
958 preg_match('/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/',$value,$regA); // $regA[1] = local byte value (hex), $regA[2] = Unicode number (hex)
960 $utf8 = 'U+'.$regA[2];
962 $decval = hexdec(trim($hexbyte));
964 $utf8decval = hexdec(substr(trim($utf8),2)); // strip the leading "U+"
965 $this->parsedCharsets[$charset]['local'][$decval]=$this->UnumberToChar($utf8decval); // local number => UTF-8 char
966 $this->parsedCharsets[$charset]['utf8'][$this->parsedCharsets[$charset]['local'][$decval]]=$decval; // UTF-8 char => local number
971 t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->parsedCharsets[$charset]));
980 * This function initializes all UTF-8 character data tables.
982 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
984 * @param string Mode ("case", "ascii", ...)
985 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
988 function initUnicodeData($mode=null) {
990 $cacheFileCase = t3lib_div
::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
991 $cacheFileASCII = t3lib_div
::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');
993 // Only process if the tables are not yet loaded
996 if (is_array($this->caseFolding
['utf-8'])) return 1;
998 // Use cached version if possible
999 if ($cacheFileCase && @is_file
($cacheFileCase)) {
1000 $this->caseFolding
['utf-8'] = unserialize(t3lib_div
::getUrl($cacheFileCase));
1006 if (is_array($this->toASCII
['utf-8'])) return 1;
1008 // Use cached version if possible
1009 if ($cacheFileASCII && @is_file
($cacheFileASCII)) {
1010 $this->toASCII
['utf-8'] = unserialize(t3lib_div
::getUrl($cacheFileASCII));
1016 // process main Unicode data file
1017 $unicodeDataFile = PATH_t3lib
.'unidata/UnicodeData.txt';
1018 if (!(t3lib_div
::validPathStr($unicodeDataFile) && @is_file
($unicodeDataFile))) return false;
1020 $fh = fopen($unicodeDataFile,'rb');
1021 if (!$fh) return false;
1023 // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
1024 // note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
1025 $this->caseFolding
['utf-8'] = array();
1026 $utf8CaseFolding =& $this->caseFolding
['utf-8']; // a shorthand
1027 $utf8CaseFolding['toUpper'] = array();
1028 $utf8CaseFolding['toLower'] = array();
1029 $utf8CaseFolding['toTitle'] = array();
1031 $decomposition = array(); // array of temp. decompositions
1032 $mark = array(); // array of chars that are marks (eg. composing accents)
1033 $number = array(); // array of chars that are numbers (eg. digits)
1034 $omit = array(); // array of chars to be omitted (eg. Russian hard sign)
1036 while (!feof($fh)) {
1037 $line = fgets($fh,4096);
1038 // has a lot of info
1039 list($char,$name,$cat,,,$decomp,,,$num,,,,$upper,$lower,$title,) = split(';', rtrim($line));
1041 $ord = hexdec($char);
1042 if ($ord > 0xFFFF) break; // only process the BMP
1044 $utf8_char = $this->UnumberToChar($ord);
1046 if ($upper) $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
1047 if ($lower) $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
1048 // store "title" only when different from "upper" (only a few)
1049 if ($title && $title != $upper) $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
1052 case 'M': // mark (accent, umlaut, ...)
1053 $mark["U+$char"] = 1;
1056 case 'N': // numeric value
1057 if ($ord > 0x80 && $num != '') $number["U+$char"] = $num;
1060 // accented Latin letters without "official" decomposition
1062 if (ereg('^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH',$name,$match) && !$decomp) {
1063 $c = ord($match[2]);
1064 if ($match[1] == 'SMALL') $c +
= 32;
1066 $decomposition["U+$char"] = array(dechex($c));
1071 if (ereg('(<.*>)? *(.+)',$decomp,$match)) {
1073 case '<circle>': // add parenthesis as circle replacement, eg (1)
1074 $match[2] = '0028 '.$match[2].' 0029';
1077 case '<square>': // add square brackets as square replacement, eg [1]
1078 $match[2] = '005B '.$match[2].' 005D';
1081 case '<compat>': // ignore multi char decompositions that start with a space
1082 if (ereg('^0020 ',$match[2])) continue 2;
1085 // ignore Arabic and vertical layout presentation decomposition
1093 $decomposition["U+$char"] = split(' ',$match[2]);
1098 // process additional Unicode data for casing (allow folded characters to expand into a sequence)
1099 $specialCasingFile = PATH_t3lib
.'unidata/SpecialCasing.txt';
1100 if (t3lib_div
::validPathStr($specialCasingFile) && @is_file
($specialCasingFile)) {
1101 $fh = fopen($specialCasingFile,'rb');
1103 while (!feof($fh)) {
1104 $line = fgets($fh,4096);
1105 if ($line{0} != '#' && trim($line) != '') {
1107 list($char,$lower,$title,$upper,$cond) = t3lib_div
::trimExplode(';', $line);
1108 if ($cond == '' ||
$cond{0} == '#') {
1109 $utf8_char = $this->UnumberToChar(hexdec($char));
1110 if ($char != $lower) {
1111 $arr = split(' ',$lower);
1112 for ($i=0; isset($arr[$i]); $i++
) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
1113 $utf8CaseFolding['toLower'][$utf8_char] = implode('',$arr);
1115 if ($char != $title && $title != $upper) {
1116 $arr = split(' ',$title);
1117 for ($i=0; isset($arr[$i]); $i++
) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
1118 $utf8CaseFolding['toTitle'][$utf8_char] = implode('',$arr);
1120 if ($char != $upper) {
1121 $arr = split(' ',$upper);
1122 for ($i=0; isset($arr[$i]); $i++
) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
1123 $utf8CaseFolding['toUpper'][$utf8_char] = implode('',$arr);
1132 // process custom decompositions
1133 $customTranslitFile = PATH_t3lib
.'unidata/Translit.txt';
1134 if (t3lib_div
::validPathStr($customTranslitFile) && @is_file
($customTranslitFile)) {
1135 $fh = fopen($customTranslitFile,'rb');
1137 while (!feof($fh)) {
1138 $line = fgets($fh,4096);
1139 if ($line{0} != '#' && trim($line) != '') {
1140 list($char,$translit) = t3lib_div
::trimExplode(';', $line);
1141 if (!$translit) $omit["U+$char"] = 1;
1142 $decomposition["U+$char"] = split(' ', $translit);
1150 // decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
1151 foreach($decomposition as $from => $to) {
1152 $code_decomp = array();
1154 while ($code_value = array_shift($to)) {
1155 if (isset($decomposition["U+$code_value"])) { // do recursive decomposition
1156 foreach(array_reverse($decomposition["U+$code_value"]) as $cv) {
1157 array_unshift($to, $cv);
1159 } elseif (!isset($mark["U+$code_value"])) { // remove mark
1160 array_push($code_decomp, $code_value);
1163 if (count($code_decomp) ||
isset($omit[$from])) {
1164 $decomposition[$from] = $code_decomp;
1166 unset($decomposition[$from]);
1170 // create ascii only mapping
1171 $this->toASCII
['utf-8'] = array();
1172 $ascii =& $this->toASCII
['utf-8'];
1174 foreach($decomposition as $from => $to) {
1175 $code_decomp = array();
1176 while ($code_value = array_shift($to)) {
1177 $ord = hexdec($code_value);
1179 continue 2; // skip decompositions containing non-ASCII chars
1181 array_push($code_decomp,chr($ord));
1183 $ascii[$this->UnumberToChar(hexdec($from))] = join('',$code_decomp);
1186 // add numeric decompositions
1187 foreach($number as $from => $to) {
1188 $utf8_char = $this->UnumberToChar(hexdec($from));
1189 if (!isset($ascii[$utf8_char])) {
1190 $ascii[$utf8_char] = $to;
1194 if ($cacheFileCase) {
1195 t3lib_div
::writeFileToTypo3tempDir($cacheFileCase,serialize($utf8CaseFolding));
1198 if ($cacheFileASCII) {
1199 t3lib_div
::writeFileToTypo3tempDir($cacheFileASCII,serialize($ascii));
1206 * This function initializes the folding table for a charset other than UTF-8.
1207 * This function is automatically called by the case folding functions.
1209 * @param string Charset for which to initialize case folding.
1210 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
/**
 * Initializes the case folding table for a charset other than UTF-8.
 * The table is derived from the UTF-8 folding table by converting every
 * character of the charset (and its folded counterpart) back to the charset.
 *
 * @param	string		Charset for which to initialize case folding.
 * @return	integer		FALSE on error, otherwise: 1 table already loaded, 2 cached version used, 3 table built (and cached).
 */
function initCaseFolding($charset)	{
		// Only process if the case table is not yet loaded:
	if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset]))	return 1;

		// Use cached version if possible
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile))	{
		$cached = unserialize(t3lib_div::getUrl($cacheFile));
		if (is_array($cached))	{
			$this->caseFolding[$charset] = $cached;
			return 2;
		}
			// Unreadable or corrupt cache file: fall through and rebuild the table
	}

		// init UTF-8 conversion for this charset
	if (!$this->initCharset($charset))	{
		return false;
	}

		// UTF-8 case folding is used as the base conversion table
	if (!$this->initUnicodeData('case'))	{
		return false;
	}

	$nochar = chr($this->noCharByteVal);
	$table = array('toUpper' => array(), 'toLower' => array(), 'toTitle' => array());
	foreach ($this->parsedCharsets[$charset]['local'] as $utf8)	{
			// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);

		foreach (array('toUpper','toLower','toTitle') as $direction)	{
				// characters without a folding in this direction are simply left out
			if (!isset($this->caseFolding['utf-8'][$direction][$utf8]))	continue;

			$cc = $this->utf8_decode($this->caseFolding['utf-8'][$direction][$utf8], $charset);
				// skip foldings which cannot be represented in the target charset
			if ($cc != '' && $cc != $nochar)	$table[$direction][$c] = $cc;
		}
	}

		// add the ASCII case table
	for ($i=ord('a'); $i<=ord('z'); $i++)	{
		$table['toUpper'][chr($i)] = chr($i-32);
	}
	for ($i=ord('A'); $i<=ord('Z'); $i++)	{
		$table['toLower'][chr($i)] = chr($i+32);
	}

	$this->caseFolding[$charset] = $table;

	if ($cacheFile)	{
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->caseFolding[$charset]));
	}

	return 3;
}
1268 * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
1269 * This function is automatically called by the ASCII transliteration functions.
1271 * @param string Charset for which to initialize conversion.
1272 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
/**
 * Initializes the to-ASCII transliteration table for a charset other than UTF-8.
 * The table is derived from the UTF-8 transliteration table.
 *
 * @param	string		Charset for which to initialize conversion.
 * @return	integer		FALSE on error, otherwise: 1 table already loaded, 2 cached version used, 3 table built (and cached).
 */
function initToASCII($charset)	{
		// Only process if the transliteration table is not yet loaded:
	if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset]))	return 1;

		// Use cached version if possible
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile))	{
		$cached = unserialize(t3lib_div::getUrl($cacheFile));
		if (is_array($cached))	{
			$this->toASCII[$charset] = $cached;
			return 2;
		}
			// Unreadable or corrupt cache file: fall through and rebuild the table
	}

		// init UTF-8 conversion for this charset
	if (!$this->initCharset($charset))	{
		return false;
	}

		// UTF-8/ASCII transliteration is used as the base conversion table
	if (!$this->initUnicodeData('ascii'))	{
		return false;
	}

		// Start from an empty array so that a charset without any transliterable
		// character still yields (and caches) a valid table.
	$table = array();
	foreach ($this->parsedCharsets[$charset]['local'] as $utf8)	{
			// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);

		if (isset($this->toASCII['utf-8'][$utf8]))	{
			$table[$c] = $this->toASCII['utf-8'][$utf8];
		}
	}
	$this->toASCII[$charset] = $table;

	if ($cacheFile)	{
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->toASCII[$charset]));
	}

	return 3;
}
1328 /********************************************
1330 * String operation functions
1332 ********************************************/
1335 * Returns a part of a string.
1336 * Unit-tested by Kasper (single byte charsets only)
1338 * @param string The character set
1339 * @param string Character string
1340 * @param integer Start position (character position)
1341 * @param integer Length (in characters)
1342 * @return string The substring
1343 * @see substr(), mb_substr()
1344 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
/**
 * Returns a part of a string, counted in characters of the given charset.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Start position (character position)
 * @param	integer		Length (in characters); NULL means "to the end of the string"
 * @return	string		The substring
 * @see substr(), mb_substr()
 */
function substr($charset,$string,$start,$len=null)	{
	if ($len === 0)	return '';

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
			// mb_substr() needs a length as soon as a charset is given; the full
			// character count is always enough to reach the end of the string.
		if ($len === null)	$len = mb_strlen($string,$charset);
		return mb_substr($string,$start,$len,$charset);
	} elseif ($charset == 'utf-8')	{
		return $this->utf8_substr($string,$start,$len);
	} elseif ($this->eucBasedSets[$charset])	{
		return $this->euc_substr($string,$start,$charset,$len);
	} elseif ($this->twoByteSets[$charset])	{
			// fixed width: two bytes per character
		return $len === null ? substr($string,$start*2) : substr($string,$start*2,$len*2);
	} elseif ($this->fourByteSets[$charset])	{
			// fixed width: four bytes per character
		return $len === null ? substr($string,$start*4) : substr($string,$start*4,$len*4);
	}

		// treat everything else as single-byte encoding
	return $len === null ? substr($string,$start) : substr($string,$start,$len);
}
1375 * Counts the number of characters.
1376 * Unit-tested by Kasper (single byte charsets only)
1378 * @param string The character set
1379 * @param string Character string
1380 * @return integer The number of characters
1382 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
/**
 * Counts the characters of a string in the given charset.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @return	integer		The number of characters
 * @see strlen(), mb_strlen()
 */
function strlen($charset,$string)	{
		// mbstring handles every charset on its own
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
		return mb_strlen($string,$charset);
	}

	if ($charset == 'utf-8')	{
		return $this->utf8_strlen($string);
	}

	if ($this->eucBasedSets[$charset])	{
		return $this->euc_strlen($string,$charset);
	}

		// fixed-width encodings: byte count divided by the character width
	if ($this->twoByteSets[$charset])	{
		return strlen($string)/2;
	}

	if ($this->fourByteSets[$charset])	{
		return strlen($string)/4;
	}

		// treat everything else as single-byte encoding
	return strlen($string);
}
1401 * Truncates a string and pre-/appends a string.
1402 * Unit tested by Kasper
1404 * @param string The character set
1405 * @param string Character string
1406 * @param integer Length (in characters)
1407 * @param string Crop signifier
1408 * @return string The shortened string
1409 * @see substr(), mb_strimwidth()
1410 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
/**
 * Truncates a string to a number of characters and adds a crop signifier.
 * A positive length keeps the start of the string and appends $crop, a
 * negative length keeps the end of the string and prepends $crop.
 * The string is returned unchanged if it is not longer than the length.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Length (in characters); negative values count from the end
 * @param	string		Crop signifier
 * @return	string		The shortened string
 * @see substr(), mb_strimwidth()
 */
function crop($charset,$string,$len,$crop='')	{
	if (intval($len) == 0)	return $string;

		// Translate the character length into a byte position
	if ($charset == 'utf-8')	{
		$i = $this->utf8_char2byte_pos($string,$len);
	} elseif ($this->eucBasedSets[$charset])	{
		$i = $this->euc_char2byte_pos($string,$len,$charset);
	} else {
		if ($len > 0)	{
			$i = $len;
		} else {
			$i = strlen($string)+$len;
			if ($i<=0)	$i = false;
		}
	}

	if ($i === false)	return $string;	// $len outside actual string length

	$byteLength = strlen($string);
	if ($len > 0)	{
			// only crop if there is at least one byte left behind the cut
		if ($i < $byteLength)	return substr($string,0,$i).$crop;
	} elseif ($i > 0 && $i <= $byteLength)	{
			// only crop if there is at least one byte in front of the cut
		return $crop.substr($string,$i);
	}

	return $string;
}
1456 * Cuts a string short at a given byte length.
1458 * @param string The character set
1459 * @param string Character string
1460 * @param integer The byte length
1461 * @return string The shortened string
1463 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
/**
 * Cuts a string short at a given byte length without splitting a character.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		The byte length
 * @return	string		The shortened string
 * @see mb_strcut()
 */
function strtrunc($charset,$string,$len)	{
	if ($len <= 0)	return '';

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
		return mb_strcut($string,0,$len,$charset);
	} elseif ($charset == 'utf-8')	{
		return $this->utf8_strtrunc($string,$len);
	} elseif ($this->eucBasedSets[$charset])	{
			// euc_strtrunc() expects ($str,$len,$charset)
		return $this->euc_strtrunc($string,$len,$charset);
	} elseif ($this->twoByteSets[$charset])	{
		if ($len % 2)	$len--;		// don't cut at odd positions
	} elseif ($this->fourByteSets[$charset])	{
		$len -= $len % 4;	// realign to position dividable by four
	}

		// treat everything else as single-byte encoding
	return substr($string,0,$len);
}
1485 * Translates all characters of a string into their respective case values.
1486 * Unlike strtolower() and strtoupper() this method is locale independent.
1487 * Note that the string length may change!
1488 * eg. lower case German ß (sharp S) becomes upper case "SS"
1489 * Unit-tested by Kasper
1490 * Real case folding is language dependent, this method ignores this fact.
1492 * @param string Character set of string
1493 * @param string Input string to convert case for
1494 * @param string Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
1495 * @return string The converted string
1496 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1497 * @see strtolower(), strtoupper()
/**
 * Translates all characters of a string into their respective case values.
 * Unlike strtolower() and strtoupper() this method is locale independent.
 * Note that the string length may change.
 *
 * @param	string		Character set of string
 * @param	string		Input string to convert case for
 * @param	string		Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper")
 * @return	string		The converted string
 * @see strtolower(), strtoupper()
 */
function conv_case($charset,$string,$case)	{
		// mb_strtolower()/mb_strtoupper() are available from PHP 4.3 on
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring' && version_compare(PHP_VERSION,'4.3.0','>='))	{
		if ($case == 'toLower')	{
			return mb_strtolower($string,$charset);
		} else {
			return mb_strtoupper($string,$charset);
		}
	} elseif ($charset == 'utf-8')	{
		return $this->utf8_char_mapping($string,'case',$case);
	} elseif (isset($this->eucBasedSets[$charset]))	{
		return $this->euc_char_mapping($string,$charset,'case',$case);
	}

		// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string,$charset,'case',$case);
}
1519 * Converts special chars (like æøå, umlauts etc) to ascii equivalents (usually double-bytes, like æ => ae etc.)
1521 * @param string Character set of string
1522 * @param string Input string to convert
1523 * @return string The converted string
/**
 * Transliterates special characters (accented letters, umlauts, ligatures)
 * into their plain ASCII equivalents.
 *
 * @param	string		Character set of string
 * @param	string		Input string to convert
 * @return	string		The converted string
 */
function specCharsToASCII($charset,$string)	{
	if ($charset == 'utf-8')	{
		return $this->utf8_char_mapping($string,'ascii');
	}

	if (isset($this->eucBasedSets[$charset]))	{
		return $this->euc_char_mapping($string,$charset,'ascii');
	}

		// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string,$charset,'ascii');
}
1549 /********************************************
1551 * Internal string operation functions
1553 ********************************************/
1556 * Maps all characters of a string in a single byte charset.
1558 * @param string the string
1559 * @param string the charset
1560 * @param string mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
1561 * @param string 'case': conversion 'toLower' or 'toUpper'
1562 * @return string the converted string
1563 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
/**
 * Maps all characters of a string in a single byte charset.
 *
 * @param	string		the string
 * @param	string		the charset
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string (the input is returned unchanged for an unknown mode or if the table cannot be loaded)
 */
function sb_char_mapping($str,$charset,$mode,$opt='')	{
	switch($mode)	{
		case 'case':
			if (!$this->initCaseFolding($charset))	return $str;	// do nothing
			$map =& $this->caseFolding[$charset][$opt];
			break;

		case 'ascii':
			if (!$this->initToASCII($charset))	return $str;	// do nothing
			$map =& $this->toASCII[$charset];
			break;

		default:
			return $str;
	}

	$out = '';
	$length = strlen($str);
	for ($i=0; $i<$length; $i++)	{
		$c = $str[$i];
			// characters without a mapping are copied unchanged
		$out .= isset($map[$c]) ? $map[$c] : $c;
	}

	return $out;
}
1603 /********************************************
1605 * Internal UTF-8 string operation functions
1607 ********************************************/
1610 * Returns a part of a UTF-8 string.
1611 * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
1613 * @param string UTF-8 string
1614 * @param integer Start position (character position)
1615 * @param integer Length (in characters)
1616 * @return string The substring
1618 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
/**
 * Returns a part of a UTF-8 string.
 *
 * @param	string		UTF-8 string
 * @param	integer		Start position (character position)
 * @param	integer		Length (in characters)
 * @return	string		The substring, or FALSE if a positive $start lies outside the string
 * @see substr()
 */
function utf8_substr($str,$start,$len=null)	{
	if ($len !== null && !strcmp($len,'0'))	return '';

	$byte_start = $this->utf8_char2byte_pos($str,$start);
	if ($byte_start === false)	{
		if ($start > 0)	{
			return false;	// $start outside string length
		}
			// zero/negative start reaching beyond the beginning: take the string from its first byte
		$byte_start = 0;
	}

	$str = substr($str,$byte_start);

		// loose comparison kept on purpose: NULL and '' both mean "no length given"
	if ($len != null)	{
		$byte_end = $this->utf8_char2byte_pos($str,$len);
		if ($byte_end === false)	{	// $len outside actual string length
				// When length is less than zero and exceeds the string, a blank string is returned.
			return $len<0 ? '' : $str;
		}
		return substr($str,0,$byte_end);
	}

	return $str;
}
1645 * Counts the number of characters of a string in UTF-8.
1646 * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
1648 * @param string UTF-8 multibyte character string
1649 * @return integer The number of characters
1651 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
/**
 * Counts the number of characters of a string in UTF-8.
 * Every byte which is not a continuation byte (10xxxxxx) starts a character.
 *
 * @param	string		UTF-8 multibyte character string
 * @return	integer		The number of characters
 * @see strlen()
 */
function utf8_strlen($str)	{
	$n = 0;
	$length = strlen($str);
	for ($i=0; $i<$length; $i++)	{
		$c = ord($str[$i]);
			// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		if (!($c & 0x80) || ($c & 0xC0) == 0xC0)	$n++;
	}
	return $n;
}
1666 * Truncates a string in UTF-8 short at a given byte length.
1668 * @param string UTF-8 multibyte character string
1669 * @param integer the byte length
1670 * @return string the shortened string
1672 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
/**
 * Truncates a string in UTF-8 short at a given byte length.
 * A multibyte character which does not fit completely is dropped.
 *
 * @param	string		UTF-8 multibyte character string
 * @param	integer		the byte length
 * @return	string		the shortened string
 * @see mb_strcut()
 */
function utf8_strtrunc($str,$len)	{
	$len = intval($len);
	if ($len <= 0)	return '';
	if ($len >= strlen($str))	return $str;	// nothing to cut

	$i = $len-1;
	if (ord($str[$i]) & 0x80)	{	// the last kept byte is part of a multibyte sequence
			// walk back over continuation bytes (10xxxxxx) to the first byte of the sequence
		while ($i > 0 && (ord($str[$i]) & 0xC0) == 0x80)	$i--;

		$lead = ord($str[$i]);
		if (($lead & 0xC0) == 0x80)	return '';	// sanity check: only continuation bytes in front of the cut

		if (($lead & 0xC0) == 0xC0)	{
				// the number of leading 1-bits is the byte count of the sequence
			for ($bc=0; $lead & 0x80; $lead = $lead << 1)	$bc++;
			if ($i+$bc > $len)	return substr($str,0,$i);
				// fallthru: multibyte char fits into length
		}
	}
	return substr($str,0,$len);
}
1687 * Find position of first occurrence of a string, both arguments are in UTF-8.
1689 * @param string UTF-8 string to search in
1690 * @param string UTF-8 string to search for
1691 * @param integer Position to start the search
1692 * @return integer The character position
1694 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
/**
 * Finds the position of the first occurrence of a string; both arguments are in UTF-8.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 string to search for
 * @param	integer		Position to start the search (character position)
 * @return	integer		The character position, or FALSE if the needle was not found
 * @see strpos()
 */
function utf8_strpos($haystack,$needle,$offset=0)	{
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
			// the third argument of mb_strpos() is the offset, the charset comes fourth
		return mb_strpos($haystack,$needle,$offset,'utf-8');
	}

	$byte_offset = $this->utf8_char2byte_pos($haystack,$offset);
	if ($byte_offset === false)	return false; // offset beyond string length

	$byte_pos = strpos($haystack,$needle,$byte_offset);
	if ($byte_pos === false)	return false; // needle not found

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
1711 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
1713 * @param string UTF-8 string to search in
1714 * @param string UTF-8 character to search for (single character)
1715 * @return integer The character position
1717 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
/**
 * Finds the position of the last occurrence of a char in a string; both arguments are in UTF-8.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 character to search for (single character)
 * @return	integer		The character position, or FALSE if the needle was not found
 * @see strrpos()
 */
function utf8_strrpos($haystack,$needle)	{
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
			// The position of the charset argument of mb_strrpos() differs between
			// PHP versions, so the internal encoding is switched temporarily instead.
		$enc = mb_internal_encoding();	// save internal encoding
		mb_internal_encoding('utf-8');
		$pos = mb_strrpos($haystack,$needle);
		mb_internal_encoding($enc);	// restore internal encoding
		return $pos;
	}

	$byte_pos = strrpos($haystack,$needle);
	if ($byte_pos === false)	return false; // needle not found

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
1731 * Translates a character position into an 'absolute' byte position.
1732 * Unit tested by Kasper.
1734 * @param string UTF-8 string
1735 * @param integer Character position (negative values start from the end)
1736 * @return integer Byte position
1737 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * @param	string		UTF-8 string
 * @param	integer		Character position (negative values start from the end)
 * @return	integer		Byte position, or FALSE if the position lies outside the string
 */
function utf8_char2byte_pos($str,$pos)	{
	$length = strlen($str);
	$n = 0;				// number of characters found
	$p = abs($pos);		// number of characters wanted

	if ($pos >= 0)	{
		$i = 0;
		$d = 1;
	} else {
		$i = $length-1;
		$d = -1;
	}

		// Explicit bounds: a negative index must end the backward scan
		// (negative string offsets address bytes from the end in newer PHP versions).
	for ( ; $i>=0 && $i<$length && $n<$p; $i+=$d)	{
		$c = ord($str[$i]);
			// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		if (!($c & 0x80) || ($c & 0xC0) == 0xC0)	$n++;
	}
	if ($i<0 || $i>=$length)	return false; // offset beyond string length

	if ($pos >= 0)	{
			// skip trailing multi-byte data bytes
		while ($i<$length && (ord($str[$i]) & 0xC0) == 0x80)	{ $i++; }
	} else {
			// correct offset: the scan stopped one byte in front of the wanted character
		$i++;
	}

	return $i;
}
1772 * Translates an 'absolute' byte position into a character position.
1773 * Unit tested by Kasper.
1775 * @param string UTF-8 string
1776 * @param integer byte position
1777 * @return integer character position
1778 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
/**
 * Translates an 'absolute' byte position into a character position.
 * The byte position is expected to point at the first byte of a character.
 *
 * @param	string		UTF-8 string
 * @param	integer		byte position
 * @return	integer		character position, or FALSE if the byte position lies outside the string
 */
function utf8_byte2char_pos($str,$pos)	{
		// offset beyond string length (this also covers the empty string)
	if ($pos < 0 || $pos >= strlen($str))	return false;

	$n = 0;				// number of characters
		// count the character starts in front of (and including) $pos; byte 0 is
		// left out, which makes the result the zero-based character index
	for ($i=$pos; $i>0; $i--)	{
		$c = ord($str[$i]);
			// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		if (!($c & 0x80) || ($c & 0xC0) == 0xC0)	$n++;
	}

	return $n;
}
1795 * Maps all characters of an UTF-8 string.
1797 * @param string UTF-8 string
1798 * @param string mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
1799 * @param string 'case': conversion 'toLower' or 'toUpper'
1800 * @return string the converted string
1801 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
/**
 * Maps all characters of an UTF-8 string.
 *
 * @param	string		UTF-8 string
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string (the input is returned unchanged for an unknown mode or if the table cannot be loaded)
 */
function utf8_char_mapping($str,$mode,$opt='')	{
	if (!$this->initUnicodeData($mode))	return $str;	// do nothing

	switch($mode)	{
		case 'case':
			$map =& $this->caseFolding['utf-8'][$opt];
			break;

		case 'ascii':
			$map =& $this->toASCII['utf-8'];
			break;

		default:
			return $str;
	}

	$out = '';
	$length = strlen($str);
	for ($i=0; $i<$length; $i++)	{
		$c = ord($str[$i]);
		if (($c & 0xC0) == 0xC0)	{	// multi-byte starting byte (11xxxxxx)
			for ($bc=0; $c & 0x80; $c = $c << 1)	{ $bc++; }	// calculate number of bytes
			$mbc = substr($str,$i,$bc);
			$i += $bc-1;
		} else {
				// single-byte (0xxxxxxx); a stray continuation byte of a malformed
				// string is passed through as it is
			$mbc = $str[$i];
		}

			// characters without a mapping are copied unchanged
		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
1857 /********************************************
1859 * Internal EUC string operation functions
1861 * Extended Unix Code:
1862 * ASCII compatible 7bit single bytes chars
1863 * 8bit two byte chars
1865 * Shift-JIS is treated as a special case.
1867 ********************************************/
1870 * Cuts a string in the EUC charset family short at a given byte length.
1872 * @param string EUC multibyte character string
1873 * @param integer the byte length
1874 * @param string the charset
1875 * @return string the shortened string
1877 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
/**
 * Cuts a string in the EUC charset family short at a given byte length.
 * A double-byte character which does not fit completely is dropped.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		the byte length
 * @param	string		the charset
 * @return	string		the shortened string
 * @see mb_strcut()
 */
function euc_strtrunc($str,$len,$charset)	{
	if (strlen($str) <= $len)	return $str;	// string not longer than supplied length

	$sjis = ($charset == 'shift_jis');
		// $len is smaller than the byte length here, so $i always addresses an existing byte
	for ($i=0; $i<$len; $i++)	{
		$c = ord($str[$i]);
		if ($sjis)	{
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))	$i++;	// advance a double-byte char
		} elseif ($c >= 0x80)	{
			$i++;	// advance a double-byte char
		}
	}

	if ($i > $len)	{
		return substr($str,0,$len-1);	// we ended on a first byte
	}
	return substr($str,0,$len);
}
1899 * Returns a part of a string in the EUC charset family.
1901 * @param string EUC multibyte character string
1902 * @param integer start position (character position)
1903 * @param string the charset
1904 * @param integer length (in characters)
1905 * @return string the substring
1906 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
/**
 * Returns a part of a string in the EUC charset family.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		start position (character position)
 * @param	string		the charset
 * @param	integer		length (in characters); NULL means "to the end of the string"
 * @return	string		the substring, or FALSE if $start lies outside the string
 */
function euc_substr($str,$start,$charset,$len=null)	{
	$byte_start = $this->euc_char2byte_pos($str,$start,$charset);
	if ($byte_start === false)	return false;	// $start outside string length

	$str = substr($str,$byte_start);

		// strict test: a length of 0 is a real length and must not return the rest of the string
	if ($len === null)	return $str;

	$byte_end = $this->euc_char2byte_pos($str,$len,$charset);
	if ($byte_end === false)	{	// $len outside actual string length
		return $str;
	}
	return substr($str,0,$byte_end);
}
1925 * Counts the number of characters of a string in the EUC charset family.
1927 * @param string EUC multibyte character string
1928 * @param string the charset
1929 * @return integer the number of characters
1931 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
/**
 * Counts the number of characters of a string in the EUC charset family.
 *
 * @param	string		EUC multibyte character string
 * @param	string		the charset
 * @return	integer		the number of characters
 * @see strlen()
 */
function euc_strlen($str,$charset)	{
	$sjis = ($charset == 'shift_jis');
	$n = 0;
	$length = strlen($str);
	for ($i=0; $i<$length; $i++)	{
		$c = ord($str[$i]);
		if ($sjis)	{
				// Shift-JIS: only 0x80-0x9F and 0xE0-0xFF start a double-byte char
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))	$i++;	// advance a double-byte char
		} elseif ($c >= 0x80)	{
			$i++;	// advance a double-byte char
		}

		$n++;
	}

	return $n;
}
1952 * Translates a character position into an 'absolute' byte position.
1954 * @param string EUC multibyte character string
1955 * @param integer character position (negative values start from the end)
1956 * @param string the charset
1957 * @return integer byte position
1958 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		character position (negative values start from the end)
 * @param	string		the charset
 * @return	integer		byte position, or FALSE if the position lies outside the string
 */
function euc_char2byte_pos($str,$pos,$charset)	{
	$sjis = ($charset == 'shift_jis');
	$length = strlen($str);
	$n = 0;	// number of characters seen
	$p = abs($pos);	// number of characters wanted

	if ($pos >= 0)	{
		$i = 0;
		$d = 1;
	} else {
		$i = $length-1;
		$d = -1;
	}

		// Explicit bounds: a negative index must end the backward scan
		// (negative string offsets address bytes from the end in newer PHP versions).
	for ( ; $i>=0 && $i<$length && $n<$p; $i+=$d)	{
		$c = ord($str[$i]);
		if ($sjis)	{
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))	$i+=$d;	// advance a double-byte char
		} elseif ($c >= 0x80)	{
			$i+=$d;	// advance a double-byte char
		}

		$n++;
	}
	if ($i<0 || $i>=$length)	return false; // offset beyond string length

	if ($pos < 0)	$i++;	// correct offset

	return $i;
}
1992 * Maps all characters of a string in the EUC charset family.
1994 * @param string EUC multibyte character string
1995 * @param string the charset
1996 * @param string mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
1997 * @param string 'case': conversion 'toLower' or 'toUpper'
1998 * @return string the converted string
1999 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
/**
 * Maps all characters of a string in the EUC charset family.
 *
 * @param	string		EUC multibyte character string
 * @param	string		the charset
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string (the input is returned unchanged for an unknown mode or if the table cannot be loaded)
 */
function euc_char_mapping($str,$charset,$mode,$opt='')	{
	switch($mode)	{
		case 'case':
			if (!$this->initCaseFolding($charset))	return $str;	// do nothing
			$map =& $this->caseFolding[$charset][$opt];
			break;

		case 'ascii':
			if (!$this->initToASCII($charset))	return $str;	// do nothing
			$map =& $this->toASCII[$charset];
			break;

		default:
			return $str;
	}

	$sjis = ($charset == 'shift_jis');
	$out = '';
	$length = strlen($str);
	for ($i=0; $i<$length; $i++)	{
		$mbc = $str[$i];
		$c = ord($mbc);

		if ($sjis)	{
			$isDoubleByte = (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0));
		} else {
			$isDoubleByte = ($c >= 0x80);
		}

		if ($isDoubleByte)	{	// a double-byte char
			$mbc = substr($str,$i,2);
			$i++;
		}

			// characters without a mapping are copied unchanged
		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
// XCLASS hook: include the extension class registered for this file, if any.
// !empty() is used so that a missing configuration entry does not raise a notice.
if (defined('TYPO3_MODE') && !empty($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']))	{
	include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
}