41defbc502114edc27c57de9936f099e678fb8ce
2 /***************************************************************
5 * (c) 2003-2004 Kasper Skaarhoj (kasper@typo3.com)
8 * This script is part of the Typo3 project. The Typo3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
17 * This script is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
22 * This copyright notice MUST APPEAR in all copies of the script!
23 ***************************************************************/
25 * Class for conversion between charsets.
29 * @author Kasper Skaarhoj <kasper@typo3.com>
30 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
33 * [CLASS/FUNCTION INDEX of SCRIPT]
38 * 442: function parse_charset($charset)
39 * 460: function get_locale_charset($locale)
40 * 492: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
41 * 529: function utf8_encode($str,$charset)
42 * 576: function utf8_decode($str,$charset,$useEntityForNoChar=0)
43 * 619: function utf8_to_entities($str)
44 * 652: function entities_to_utf8($str,$alsoStdHtmlEnt=0)
45 * 686: function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
46 * 736: function UnumberToChar($cbyte)
47 * 781: function utf8CharToUnumber($str,$hex=0)
49 * SECTION: Init functions
50 * 824: function initCharset($charset)
51 * 885: function initCaseFoldingUTF8()
52 * 973: function initCaseFolding($charset)
54 * SECTION: String operation functions
55 * 1058: function substr($charset,$string,$start,$len=null)
56 * 1096: function strlen($charset,$string)
57 * 1124: function crop($charset,$string,$len,$crop='')
58 * 1165: function strtrunc($charset,$string,$len)
59 * 1197: function conv_case($charset,$string,$case)
61 * SECTION: Internal UTF-8 string operation functions
62 * 1264: function utf8_substr($str,$start,$len=null)
63 * 1297: function utf8_strlen($str)
64 * 1318: function utf8_strtrunc($str,$len)
65 * 1340: function utf8_strpos($haystack,$needle,$offset=0)
66 * 1363: function utf8_strrpos($haystack,$needle)
67 * 1383: function utf8_char2byte_pos($str,$pos)
68 * 1424: function utf8_byte2char_pos($str,$pos)
69 * 1448: function utf8_conv_case($str,$case)
71 * SECTION: Internal EUC string operation functions
72 * 1514: function euc_strtrunc($str,$len,$charset)
73 * 1543: function euc_substr($str,$start,$charset,$len=null)
74 * 1568: function euc_strlen($str,$charset)
75 * 1595: function euc_char2byte_pos($str,$pos,$charset)
76 * 1636: function euc_conv_case($str,$case,$charset)
79 * (This index is automatically created/updated by the extension "extdeveval")
93 * Functions working on UTF-8 strings:
98 * - implode/explode/join
100 * Functions nearly working on UTF-8 strings:
102 * - strlen: returns the length in BYTES; if you need the length in CHARACTERS use utf8_strlen
103 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
104 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
105 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
107 * Functions NOT working on UTF-8 strings:
121 * Class for conversion between charsets
123 * @author Kasper Skaarhoj <kasper@typo3.com>
124 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
129 var $noCharByteVal=63; // ASCII Value for chars with no equivalent.
131 // This is the array where parsed conversion tables are stored (cached)
132 var $parsedCharsets=array();
134 // An array where case folding data will be stored (cached)
135 var $caseFolding=array();
137 // An array where charset-to-ASCII mappings are stored (cached)
138 var $toASCII=array();
140 // This tells the converter which charsets have two bytes per char:
141 var $twoByteSets=array(
142 'ucs-2'=>1, // 2-byte Unicode
145 // This tells the converter which charsets have four bytes per char:
146 var $fourByteSets=array(
147 'ucs-4'=>1, // 4-byte Unicode
148 'utf-32'=>1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
151 // This tells the converter which charsets use a scheme like the Extended Unix Code:
152 var $eucBasedSets=array(
153 'gb2312'=>1, // Chinese, simplified.
154 'big5'=>1, // Chinese, traditional.
155 'shift_jis'=>1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
158 // see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
159 // http://czyborra.com/charsets/iso8859.html
162 'us-ascii'=> 'ascii',
163 'cp819' => 'iso-8859-1',
164 'ibm819' => 'iso-8859-1',
165 'iso-ir-100' => 'iso-8859-1',
166 'iso-ir-109' => 'iso-8859-2',
167 'iso-ir-148' => 'iso-8859-9',
168 'iso-ir-199' => 'iso-8859-14',
169 'iso-ir-203' => 'iso-8859-15',
170 'csisolatin1' => 'iso-8859-1',
171 'csisolatin2' => 'iso-8859-2',
172 'csisolatin3' => 'iso-8859-3',
173 'csisolatin5' => 'iso-8859-9',
174 'csisolatin8' => 'iso-8859-14',
175 'csisolatin9' => 'iso-8859-15',
176 'csisolatingreek' => 'iso-8859-7',
177 'iso-celtic' => 'iso-8859-14',
178 'latin1' => 'iso-8859-1',
179 'latin2' => 'iso-8859-2',
180 'latin3' => 'iso-8859-3',
181 'latin5' => 'iso-8859-9',
182 'latin6' => 'iso-8859-10',
183 'latin8' => 'iso-8859-14',
184 'latin9' => 'iso-8859-15',
185 'l1' => 'iso-8859-1',
186 'l2' => 'iso-8859-2',
187 'l3' => 'iso-8859-3',
188 'l5' => 'iso-8859-9',
189 'l6' => 'iso-8859-10',
190 'l8' => 'iso-8859-14',
191 'l9' => 'iso-8859-15',
192 'cyrillic' => 'iso-8859-5',
193 'arabic' => 'iso-8859-6',
194 'tis-620' => 'iso-8859-11',
195 'win874' => 'windows-874',
196 'win1250' => 'windows-1250',
197 'win1251' => 'windows-1251',
198 'win1252' => 'windows-1252',
199 'win1253' => 'windows-1253',
200 'win1254' => 'windows-1254',
201 'win1255' => 'windows-1255',
202 'win1256' => 'windows-1256',
203 'win1257' => 'windows-1257',
204 'win1258' => 'windows-1258',
205 'cp1250' => 'windows-1250',
206 'cp1251' => 'windows-1251',
207 'cp1252' => 'windows-1252',
208 'ms-ee' => 'windows-1250',
209 'ms-ansi' => 'windows-1252',
210 'ms-greek' => 'windows-1253',
211 'ms-turk' => 'windows-1254',
212 'winbaltrim' => 'windows-1257',
213 'koi-8ru' => 'koi-8r',
217 'macintosh' => 'macroman',
218 'euc-cn' => 'gb2312',
219 'x-euc-cn' => 'gb2312',
225 'sjis' => 'shift_jis',
226 'shift-jis' => 'shift_jis',
227 'cp932' => 'shift_jis',
237 // mapping of iso-639:2 language codes to language (family) names
238 var $lang_to_langfamily=array(
239 // iso-639:2 language codes, see:
240 // http://www.w3.org/WAI/ER/IG/ert/iso639.htm
241 // http://www.unicode.org/onlinedat/languages.html
244 'cs' => 'east_european',
245 'da' => 'west_european',
246 'de' => 'west_european',
247 'es' => 'west_european',
249 'eu' => 'west_european',
250 'fi' => 'west_european',
251 'fr' => 'west_european',
253 'hr' => 'east_european',
254 'hu' => 'east_european',
256 'is' => 'west_european',
257 'it' => 'west_european',
259 'kl' => 'west_european',
261 'lt' => 'lithuanian',
262 'lv' => 'west_european', // Latvian/Lettish
263 'nl' => 'west_european',
264 'no' => 'west_european',
265 'pl' => 'east_european',
266 'pt' => 'west_european',
267 'ro' => 'east_european',
269 'sk' => 'east_european',
270 'sl' => 'east_european',
271 'sv' => 'west_european',
274 'vi' => 'vietnamese',
276 // MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
277 'chs' => 'simpl_chinese',
278 'cht' => 'trad_chinese',
279 'csy' => 'east_european',
280 'dan' => 'west_european',
281 'deu' => 'west_european',
282 'dea' => 'west_european',
283 'des' => 'west_european',
284 'ena' => 'west_european',
285 'enc' => 'west_european',
286 'eng' => 'west_european',
287 'enz' => 'west_european',
288 'enu' => 'west_european',
289 'nld' => 'west_european',
290 'nlb' => 'west_european',
291 'fin' => 'west_european',
292 'fra' => 'west_european',
293 'frb' => 'west_european',
294 'frc' => 'west_european',
295 'frs' => 'west_european',
297 'hun' => 'east_european',
298 'isl' => 'west_european', // Icelandic (family name must match a key of the lang_to_charset_* maps)
299 'ita' => 'west_european',
300 'its' => 'west_european',
303 'nor' => 'west_european',
304 'non' => 'west_european',
305 'plk' => 'east_european',
306 'ptg' => 'west_european',
307 'ptb' => 'west_european',
308 'rus' => 'cyrillic', // Russian uses a Cyrillic charset, consistent with the 'russian' entry below
309 'sky' => 'east_european',
310 'esp' => 'west_european',
311 'esm' => 'west_european',
312 'esn' => 'west_european',
313 'sve' => 'west_european',
315 // English language names
316 'bulgarian' => 'east_european',
317 'catalan' => 'west_european',
318 'croatian' => 'east_european',
319 'czech' => 'east_european',
320 'danish' => 'west_european',
321 'dutch' => 'west_european',
322 'english' => 'west_european',
323 'finnish' => 'west_european',
324 'french' => 'west_european',
325 'galician' => 'west_european',
326 'german' => 'west_european',
327 'hungarian' => 'east_european',
328 'icelandic' => 'west_european',
329 'italian' => 'west_european',
330 'latvian' => 'west_european',
331 'lettish' => 'west_european',
332 'norwegian' => 'west_european',
333 'polish' => 'east_european',
334 'portuguese' => 'west_european',
335 'russian' => 'cyrillic',
336 'romanian' => 'east_european',
337 'slovak' => 'east_european',
338 'slovenian' => 'east_european',
339 'spanish' => 'west_european',
340 'swedish' => 'west_european',
341 'turkish' => 'turkish', // both lang_to_charset_* maps define a dedicated 'turkish' family
342 'ukrainian' => 'cyrillic',
345 // mapping of language (family) names to charsets on Unix
346 var $lang_to_charset_unix=array(
347 'west_european' => 'iso-8859-1',
348 'estonian' => 'iso-8859-1',
349 'east_european' => 'iso-8859-2',
350 'baltic' => 'iso-8859-4',
351 'cyrillic' => 'iso-8859-5',
352 'arabic' => 'iso-8859-6',
353 'greek' => 'iso-8859-7',
354 'hebrew' => 'iso-8859-8',
355 'turkish' => 'iso-8859-9',
356 'thai' => 'iso-8859-11', // = TIS-620
357 'lithuanian' => 'iso-8859-13',
358 'chinese' => 'gb2312', // = euc-cn
359 'japanese' => 'euc-jp',
360 'korean' => 'euc-kr',
361 'simpl_chinese' => 'gb2312',
362 'trad_chinese' => 'big5',
366 // mapping of language (family) names to charsets on Windows
367 var $lang_to_charset_windows=array(
368 'east_european' => 'windows-1250',
369 'cyrillic' => 'windows-1251',
370 'west_european' => 'windows-1252',
371 'greek' => 'windows-1253',
372 'turkish' => 'windows-1254',
373 'hebrew' => 'windows-1255',
374 'arabic' => 'windows-1256',
375 'baltic' => 'windows-1257',
376 'estonian' => 'windows-1257',
377 'lithuanian' => 'windows-1257',
378 'vietnamese' => 'windows-1258',
381 'chinese' => 'gb2312',
382 'japanese' => 'shift_jis',
383 'simpl_chinese' => 'gb2312',
384 'trad_chinese' => 'big5',
387 // mapping of locale names to charsets
388 var $locale_to_charset=array(
389 'japanese.euc' => 'euc-jp',
390 'ja_jp.ujis' => 'euc-jp',
391 'korean.euc' => 'euc-kr',
397 // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
398 // Empty values mean "iso-8859-1"
399 var $charSetArray = array(
407 'cz' => 'windows-1250',
408 'pl' => 'iso-8859-2',
409 'si' => 'windows-1250',
411 'tr' => 'iso-8859-9',
414 'ru' => 'windows-1251',
415 'ro' => 'iso-8859-2',
417 'sk' => 'windows-1250',
418 'lt' => 'windows-1257',
420 'hr' => 'windows-1250',
421 'hu' => 'iso-8859-2',
423 'th' => 'iso-8859-11',
424 'gr' => 'iso-8859-7',
427 'bg' => 'windows-1251',
429 'et' => 'iso-8859-4',
430 'ar' => 'iso-8859-6',
432 'ua' => 'windows-1251',
436 'ca' => 'iso-8859-15',
437 'ba' => 'iso-8859-2',
442 * Normalizes a charset name: converts it to lowercase letters and resolves known synonyms.
444 * @param string Input charset
445 * @return string Normalized charset
446 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
448 function parse_charset($charset) {
449 $charset = strtolower($charset);
450 if (isset($this->synonyms
[$charset])) $charset = $this->synonyms
[$charset];
456 * Get the charset of a locale.
459 * ln_CN language / country
460 * ln_CN.cs language / country / charset
462 * @param string Locale string
463 * @return string Charset resolved for locale string
464 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
466 function get_locale_charset($locale) {
467 $locale = strtolower($locale);
469 // exact locale specific charset?
470 if (isset($this->locale_to_charset
[$locale])) return $this->locale_to_charset
[$locale];
472 // locale contains charset: use it
473 list($locale,$charset) = explode('.',$locale);
474 if ($charset) return $this->parse_charset($charset);
477 list($language,$country) = explode('_',$locale);
478 if (isset($this->lang_to_langfamily
[$language])) $language = $this->lang_to_langfamily
[$language];
480 if (TYPO3_OS
== 'WIN') {
481 $cs = $this->lang_to_charset_windows
[$language];
483 $cs = $this->lang_to_charset_unix
[$language];
486 return $cs ?
$cs : 'iso-8859-1';
497 /********************************************
499 * Charset Conversion functions
501 ********************************************/
504 * Convert from one charset to another charset.
506 * @param string Input string
507 * @param string From charset (the current charset of the string)
508 * @param string To charset (the output charset wanted)
509 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
510 * @return string Converted string
513 function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) {
514 if ($fromCS==$toCS) return $str;
516 // PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
517 if ($toCS=='utf-8' ||
!$useEntityForNoChar) {
518 switch($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']) {
520 $conv_str = mb_convert_encoding($str,$toCS,$fromCS);
521 if (false
!== $conv_str) return $conv_str; // returns false for unsupported charsets
525 $conv_str = iconv($fromCS,$toCS.'//TRANSLIT',$str);
526 if (false
!== $conv_str) return $conv_str;
530 $conv_str = recode_string($fromCS.'..'.$toCS,$str);
531 if (false
!== $conv_str) return $conv_str;
534 // fallback to TYPO3 conversion
537 if ($fromCS!='utf-8') $str=$this->utf8_encode($str,$fromCS);
538 if ($toCS!='utf-8') $str=$this->utf8_decode($str,$toCS,$useEntityForNoChar);
543 * Convert all elements in ARRAY from one charset to another charset.
544 * NOTICE: Array is passed by reference!
546 * @param array Input array, possibly multidimensional
547 * @param string From charset (the current charset of the string)
548 * @param string To charset (the output charset wanted)
549 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
553 function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0) {
554 foreach($array as $key => $value) {
555 if (is_array($array[$key])) {
556 $this->convArray($array[$key],$fromCS,$toCS,$useEntityForNoChar);
558 $array[$key] = $this->conv($array[$key],$fromCS,$toCS,$useEntityForNoChar);
564 * Converts $str from $charset to UTF-8
566 * @param string String in local charset to convert to UTF-8
567 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
568 * @return string Output string, converted to UTF-8
570 function utf8_encode($str,$charset) {
572 // Charset is case-insensitive.
573 if ($this->initCharset($charset)) { // Parse conv. table if not already...
574 $strLen = strlen($str);
577 for ($a=0;$a<$strLen;$a++
) { // Traverse each char in string.
578 $chr=substr($str,$a,1);
580 if (isset($this->twoByteSets
[$charset])) { // If the charset has two bytes per char
581 $ord2 = ord($str{$a+
1});
582 $ord = $ord<<8 | $ord2; // assume big endian: high byte shifted up, low byte OR-ed in (AND would always yield 0)
584 if (isset($this->parsedCharsets
[$charset]['local'][$ord])) { // If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?)
585 $outStr.=$this->parsedCharsets
[$charset]['local'][$ord];
586 } else $outStr.=chr($this->noCharByteVal
); // No char exists
588 } elseif ($ord>127) { // If char has value over 127 it's a multibyte char in UTF-8
589 if (isset($this->eucBasedSets
[$charset]) && $charset != 'shift_jis') { // EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int. Shift-JIS is excluded here so that the dedicated branch below (single-byte chars between 160 and 223) is reachable.
591 $ord2=ord(substr($str,$a,1));
592 $ord = $ord*256+
$ord2;
594 elseif ($charset == 'shift_jis' && ($ord <160 ||
$ord>223)) { // Shift-JIS is like EUC, but chars between 160 and 223 are single byte
596 $ord2=ord(substr($str,$a,1));
597 $ord = $ord*256+
$ord2;
600 if (isset($this->parsedCharsets
[$charset]['local'][$ord])) { // If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?)
601 $outStr.=$this->parsedCharsets
[$charset]['local'][$ord];
602 } else $outStr.=chr($this->noCharByteVal
); // No char exists
603 } else $outStr.=$chr; // ... otherwise it's just ASCII 0-127 and one byte. Transparent
610 * Converts $str from UTF-8 to $charset
612 * @param string String in UTF-8 to convert to local charset
613 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
614 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
615 * @return string Output string, converted to local charset
617 function utf8_decode($str,$charset,$useEntityForNoChar=0) {
619 // Charset is case-insensitive.
620 if ($this->initCharset($charset)) { // Parse conv. table if not already...
621 $strLen = strlen($str);
624 for ($a=0,$i=0;$a<$strLen;$a++
,$i++
) { // Traverse each char in UTF-8 string.
625 $chr=substr($str,$a,1);
627 if ($ord>127) { // This means multibyte! (first byte!)
628 if ($ord & 64) { // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
630 $buf=$chr; // Add first byte
631 for ($b=0;$b<8;$b++
) { // for each byte in multibyte string...
632 $ord = $ord << 1; // Shift it left and ...
633 if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence.
634 $a++
; // Increase pointer...
635 $buf.=substr($str,$a,1); // ... and add the next char.
639 if (isset($this->parsedCharsets
[$charset]['utf8'][$buf])) { // If the UTF-8 char-sequence is found then...
640 $mByte = $this->parsedCharsets
[$charset]['utf8'][$buf]; // The local number
641 if ($mByte>255) { // If the local number is greater than 255 we will need to split the byte (16bit word assumed) in two chars.
642 $outStr.= chr(($mByte >> 8) & 255).chr($mByte & 255);
643 } else $outStr.= chr($mByte);
644 } elseif ($useEntityForNoChar) { // Create num entity:
645 $outStr.='&#'.$this->utf8CharToUnumber($buf,1).';';
646 } else $outStr.=chr($this->noCharByteVal
); // No char exists
647 } else $outStr.=chr($this->noCharByteVal
); // No char exists (MIDDLE of MB sequence!)
648 } else $outStr.=$chr; // ... otherwise it's just ASCII 0-127 and one byte. Transparent
655 * Converts all chars > 127 to numeric entities.
657 * @param string Input string
658 * @return string Output string
660 function utf8_to_entities($str) {
661 $strLen = strlen($str);
664 for ($a=0;$a<$strLen;$a++
) { // Traverse each char in UTF-8 string.
665 $chr=substr($str,$a,1);
667 if ($ord>127) { // This means multibyte! (first byte!)
668 if ($ord & 64) { // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
669 $buf=$chr; // Add first byte
670 for ($b=0;$b<8;$b++
) { // for each byte in multibyte string...
671 $ord = $ord << 1; // Shift it left and ...
672 if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence.
673 $a++
; // Increase pointer...
674 $buf.=substr($str,$a,1); // ... and add the next char.
678 $outStr.='&#'.$this->utf8CharToUnumber($buf,1).';';
679 } else $outStr.=chr($this->noCharByteVal
); // No char exists (MIDDLE of MB sequence!)
680 } else $outStr.=$chr; // ... otherwise it's just ASCII 0-127 and one byte. Transparent
687 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
689 * @param string Input string, UTF-8
690 * @param boolean If set, then all string-HTML entities (like &amp; or &pound;) will be converted as well
691 * @return string Output string
693 function entities_to_utf8($str,$alsoStdHtmlEnt=0) {
694 if ($alsoStdHtmlEnt) {
695 $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES
)); // Getting them in iso-8859-1 - but thats ok since this is observed below.
698 $token = md5(microtime());
699 $parts = explode($token,ereg_replace('(&([#[:alnum:]]*);)',$token.'\2'.$token,$str));
700 foreach($parts as $k => $v) {
702 if (substr($v,0,1)=='#') { // Dec or hex entities:
703 if (substr($v,1,1)=='x') {
704 $parts[$k] = $this->UnumberToChar(hexdec(substr($v,2)));
706 $parts[$k] = $this->UnumberToChar(substr($v,1));
708 } elseif ($alsoStdHtmlEnt && $trans_tbl['&'.$v.';']) { // Other entities:
709 $parts[$k] = $this->utf8_encode($trans_tbl['&'.$v.';'],'iso-8859-1');
710 } else { // No conversion:
711 $parts[$k] ='&'.$v.';';
716 return implode('',$parts);
720 * Converts all chars in the input UTF-8 string into integer numbers returned in an array
722 * @param string Input string, UTF-8
723 * @param boolean If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
724 * @param boolean If set, then instead of integer numbers the real UTF-8 char is returned.
725 * @return array Output array with the char numbers
727 function utf8_to_numberarray($str,$convEntities=0,$retChar=0) {
728 // If entities must be registered as well...:
730 $str = $this->entities_to_utf8($str,1);
733 $strLen = strlen($str);
736 for ($a=0;$a<$strLen;$a++
) { // Traverse each char in UTF-8 string.
737 $chr=substr($str,$a,1);
739 if ($ord>127) { // This means multibyte! (first byte!)
740 if ($ord & 64) { // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
741 $buf=$chr; // Add first byte
742 for ($b=0;$b<8;$b++
) { // for each byte in multibyte string...
743 $ord = $ord << 1; // Shift it left and ...
744 if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence.
745 $a++
; // Increase pointer...
746 $buf.=substr($str,$a,1); // ... and add the next char.
750 $outArr[]=$retChar?
$buf:$this->utf8CharToUnumber($buf);
751 } else $outArr[]=$retChar?
chr($this->noCharByteVal
):$this->noCharByteVal
; // No char exists (MIDDLE of MB sequence!)
752 } else $outArr[]=$retChar?
chr($ord):$ord; // ... otherwise it's just ASCII 0-127 and one byte. Transparent
759 * Converts a UNICODE number to a UTF-8 multibyte character
760 * Algorithm based on a script found at http://czyborra.com/utf/
761 * Unit-tested by Kasper
763 * The binary representation of the character's integer value is thus simply spread across the bytes and the number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
765 * bytes | bits | representation
767 * 2 | 11 | 110vvvvv 10vvvvvv
768 * 3 | 16 | 1110vvvv 10vvvvvv 10vvvvvv
769 * 4 | 21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
770 * 5 | 26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
771 * 6 | 31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
773 * @param integer UNICODE integer
774 * @return string UTF-8 multibyte character string
775 * @see utf8CharToUnumber()
777 function UnumberToChar($cbyte) {
782 } else if ($cbyte < 0x800) {
783 $str.=chr(0xC0 |
($cbyte >> 6));
784 $str.=chr(0x80 |
($cbyte & 0x3F));
785 } else if ($cbyte < 0x10000) {
786 $str.=chr(0xE0 |
($cbyte >> 12));
787 $str.=chr(0x80 |
(($cbyte >> 6) & 0x3F));
788 $str.=chr(0x80 |
($cbyte & 0x3F));
789 } else if ($cbyte < 0x200000) {
790 $str.=chr(0xF0 |
($cbyte >> 18));
791 $str.=chr(0x80 |
(($cbyte >> 12) & 0x3F));
792 $str.=chr(0x80 |
(($cbyte >> 6) & 0x3F));
793 $str.=chr(0x80 |
($cbyte & 0x3F));
794 } else if ($cbyte < 0x4000000) {
795 $str.=chr(0xF8 |
($cbyte >> 24));
796 $str.=chr(0x80 |
(($cbyte >> 18) & 0x3F));
797 $str.=chr(0x80 |
(($cbyte >> 12) & 0x3F));
798 $str.=chr(0x80 |
(($cbyte >> 6) & 0x3F));
799 $str.=chr(0x80 |
($cbyte & 0x3F));
800 } else if ($cbyte < 0x80000000) {
801 $str.=chr(0xFC |
($cbyte >> 30));
802 $str.=chr(0x80 |
(($cbyte >> 24) & 0x3F));
803 $str.=chr(0x80 |
(($cbyte >> 18) & 0x3F));
804 $str.=chr(0x80 |
(($cbyte >> 12) & 0x3F));
805 $str.=chr(0x80 |
(($cbyte >> 6) & 0x3F));
806 $str.=chr(0x80 |
($cbyte & 0x3F));
807 } else { // Cannot express a 32-bit character in UTF-8
808 $str .= chr($this->noCharByteVal
);
814 * Converts a UTF-8 Multibyte character to a UNICODE number
815 * Unit-tested by Kasper
817 * @param string UTF-8 multibyte character string
818 * @param boolean If set, then a hex. number is returned.
819 * @return integer UNICODE integer
820 * @see UnumberToChar()
822 function utf8CharToUnumber($str,$hex=0) {
823 $ord=ord(substr($str,0,1)); // First char
825 if (($ord & 192) == 192) { // This verifyes that it IS a multi byte string
827 for ($b=0;$b<8;$b++
) { // for each byte in multibyte string...
828 $ord = $ord << 1; // Shift it left and ...
829 if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence.
830 $binBuf.=substr('00000000'.decbin(ord(substr($str,$b+
1,1))),-6);
833 $binBuf=substr('00000000'.decbin(ord(substr($str,0,1))),-(6-$b)).$binBuf;
835 $int = bindec($binBuf);
838 return $hex ?
'x'.dechex($int) : $int;
849 /********************************************
853 ********************************************/
856 * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
857 * This function is automatically called by the conversion functions
859 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
861 * @param string The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
862 * @return integer Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed.
865 function initCharset($charset) {
866 // Only process if the charset is not yet loaded:
867 if (!is_array($this->parsedCharsets
[$charset])) {
869 // Conversion table filename:
870 $charsetConvTableFile = PATH_t3lib
.'csconvtbl/'.$charset.'.tbl';
872 // If the conversion table is found:
873 if ($charset && t3lib_div
::validPathStr($charsetConvTableFile) && @is_file
($charsetConvTableFile)) {
874 // Cache file for charsets:
875 // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
876 $cacheFile = t3lib_div
::getFileAbsFileName('typo3temp/cs/charset_'.$charset.'.tbl');
877 if ($cacheFile && @is_file
($cacheFile)) {
878 $this->parsedCharsets
[$charset]=unserialize(t3lib_div
::getUrl($cacheFile));
880 // Parse conversion table into lines:
881 $lines=t3lib_div
::trimExplode(chr(10),t3lib_div
::getUrl($charsetConvTableFile),1);
882 // Initialize the internal variable holding the conv. table:
883 $this->parsedCharsets
[$charset]=array('local'=>array(),'utf8'=>array());
884 // traverse the lines:
886 foreach($lines as $value) {
887 if (trim($value) && substr($value,0,1)!='#') { // Comment line or blanks are ignored.
889 // Detect type if not done yet: (Done on first real line)
890 // The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
891 if (!$detectedType) $detectedType = ereg('[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+',$value) ?
'whitespaced' : 'ms-token';
893 if ($detectedType=='ms-token') {
894 list($hexbyte,$utf8) = split('=|:',$value,3);
895 } elseif ($detectedType=='whitespaced') {
897 ereg('[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+',$value,$regA);
899 $utf8 = 'U+'.$regA[2];
901 $decval = hexdec(trim($hexbyte));
903 $utf8decval = hexdec(substr(trim($utf8),2));
904 $this->parsedCharsets
[$charset]['local'][$decval]=$this->UnumberToChar($utf8decval);
905 $this->parsedCharsets
[$charset]['utf8'][$this->parsedCharsets
[$charset]['local'][$decval]]=$decval;
910 t3lib_div
::writeFileToTypo3tempDir($cacheFile,serialize($this->parsedCharsets
[$charset]));
919 * This function initializes all UTF-8 character data tables.
921 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
923 * @param string Mode ("case", "ascii", ...)
924 * @return integer Returns FALSE on error, a TRUE value on success: 1 = table already loaded, 2 = cached version loaded, 3 = table parsed (and cached).
927 function initUnicodeData($mode=null
) {
929 $cacheFileCase = t3lib_div
::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
930 $cacheFileASCII = t3lib_div
::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');
932 // Only process if the tables are not yet loaded
935 if (is_array($this->caseFolding
['utf-8'])) return 1;
937 // Use cached version if possible
938 if ($cacheFileCase && @is_file
($cacheFileCase)) {
939 $this->caseFolding
['utf-8'] = unserialize(t3lib_div
::getUrl($cacheFileCase));
945 if (is_array($this->toASCII
['utf-8'])) return 1;
947 // Use cached version if possible
948 if ($cacheFileASCII && @is_file
($cacheFileASCII)) {
949 $this->toASCII
['utf-8'] = unserialize(t3lib_div
::getUrl($cacheFileASCII));
955 // process main Unicode data file
956 $unicodeDataFile = PATH_t3lib
.'unidata/UnicodeData.txt';
957 if (!(t3lib_div
::validPathStr($unicodeDataFile) && @is_file
($unicodeDataFile))) return false
;
959 $fh = fopen($unicodeDataFile,'rb');
960 if (!$fh) return false
;
962 // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
963 // note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
964 $this->caseFolding
['utf-8'] = array();
965 $utf8CaseFolding =& $this->caseFolding
['utf-8']; // a shorthand
966 $utf8CaseFolding['toUpper'] = array();
967 $utf8CaseFolding['toLower'] = array();
968 $utf8CaseFolding['toTitle'] = array();
970 $decomposition = array(); // array of temp. decompositions
971 $mark = array(); // array of chars that are marks (eg. composing accents)
972 $number = array(); // array of chars that are numbers (eg. digits)
975 $line = fgets($fh,4096);
977 list($char,$name,$cat,,,$decomp,,,$num,,,,$upper,$lower,$title,) = split(';', rtrim($line));
979 $ord = hexdec($char);
980 if ($ord > 0xFFFF) break; // only process the BMP
982 $utf8_char = $this->UnumberToChar($ord);
984 if ($upper) $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
985 if ($lower) $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
986 // store "title" only when different from "upper" (only a few)
987 if ($title && $title != $upper) $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
990 case 'M': // mark (accent, umlaut, ...)
991 $mark["U+$char"] = 1;
994 case 'N': // numeric value
995 if ($ord > 0x80 && $num != '') $number["U+$char"] = $num;
998 // accented Latin letters without "official" decomposition
1000 if (ereg('^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH',$name,$match) && !$decomp) {
1001 $c = ord($match[2]);
1002 if ($match[1] == 'SMALL') $c +
= 32;
1004 $decomposition["U+$char"] = array(dechex($c));
1009 if (ereg('(<.*>)? *(.+)',$decomp,$match)) {
1011 case '<circle>': // add parenthesis as circle replacement, eg (1)
1012 $match[2] = '0028 '.$match[2].' 0029';
1015 case '<square>': // add square brackets as square replacement, eg [1]
1016 $match[2] = '005B '.$match[2].' 005D';
1019 case '<compat>': // ignore multi char decompositions that start with a space
1020 if (ereg('^0020 ',$match[2])) continue 2;
1023 // ignore Arabic and vertical layout presentation decomposition
1031 $decomposition["U+$char"] = split(' ',$match[2]);
1036 // process additional Unicode data for casing (allow folded characters to expand into a sequence)
1037 $specialCasingFile = PATH_t3lib
.'unidata/SpecialCasing.txt';
1038 if (t3lib_div
::validPathStr($specialCasingFile) && @is_file
($specialCasingFile)) {
1039 $fh = fopen($specialCasingFile,'rb');
1041 while (!feof($fh)) {
1042 $line = fgets($fh,4096);
1043 if ($line{0} != '#' && trim($line) != '') {
1045 list($char,$lower,$title,$upper,$cond) = t3lib_div
::trimExplode(';', $line);
1046 if ($cond == '' ||
$cond{0} == '#') {
1047 $utf8_char = $this->UnumberToChar(hexdec($char));
1048 if ($char != $lower) {
1049 $arr = split(' ',$lower);
1050 for ($i=0; isset($arr[$i]); $i++
) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
1051 $utf8CaseFolding['toLower'][$utf8_char] = implode('',$arr);
1053 if ($char != $title && $title != $upper) {
1054 $arr = split(' ',$title);
1055 for ($i=0; isset($arr[$i]); $i++
) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
1056 $utf8CaseFolding['toTitle'][$utf8_char] = implode('',$arr);
1058 if ($char != $upper) {
1059 $arr = split(' ',$upper);
1060 for ($i=0; isset($arr[$i]); $i++
) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
1061 $utf8CaseFolding['toUpper'][$utf8_char] = implode('',$arr);
1070 // process custom decompositions
1071 $customTranslitFile = PATH_t3lib
.'unidata/Translit.txt';
1072 if (t3lib_div
::validPathStr($customTranslitFile) && @is_file
($customTranslitFile)) {
1073 $fh = fopen($customTranslitFile,'rb');
1075 while (!feof($fh)) {
1076 $line = fgets($fh,4096);
1077 if ($line{0} != '#' && trim($line) != '') {
1078 list($char,$translit) = t3lib_div
::trimExplode(';', $line);
1079 $decomposition["U+$char"] = split(' ', $translit);
1086 // decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
1087 foreach($decomposition as $from => $to) {
1088 $code_decomp = array();
1090 while ($code_value = array_shift($to)) {
1091 if (isset($decomposition["U+$code_value"])) { // do recursive decomposition
1092 foreach(array_reverse($decomposition["U+$code_value"]) as $cv) {
1093 array_unshift($to, $cv);
1095 } elseif (!isset($mark["U+$code_value"])) { // remove mark
1096 array_push($code_decomp, $code_value);
1099 if (count($code_decomp)) {
1100 $decomposition[$from] = $code_decomp;
1102 unset($decomposition[$from]);
1106 // create ascii only mapping
1107 $this->toASCII
['utf-8'] = array();
1108 $ascii =& $this->toASCII
['utf-8'];
1110 foreach($decomposition as $from => $to) {
1111 $code_decomp = array();
1112 while ($code_value = array_shift($to)) {
1113 $ord = hexdec($code_value);
1115 continue 2; // skip decompositions containing non-ASCII chars
1117 array_push($code_decomp,chr($ord));
1119 $ascii[$this->UnumberToChar(hexdec($from))] = join('',$code_decomp);
1122 // add numeric decompositions
1123 foreach($number as $from => $to) {
1124 $utf8_char = $this->UnumberToChar(hexdec($from));
1125 if (!isset($ascii[$utf8_char])) {
1126 $ascii[$utf8_char] = $to;
1130 if ($cacheFileCase) {
1131 t3lib_div
::writeFileToTypo3tempDir($cacheFileCase,serialize($utf8CaseFolding));
1134 if ($cacheFileASCII) {
1135 t3lib_div
::writeFileToTypo3tempDir($cacheFileASCII,serialize($ascii));
/**
 * Initializes the case folding table for a charset other than UTF-8.
 * This function is automatically called by the case folding functions.
 *
 * The table is derived from the UTF-8 case folding table: every character of
 * the charset is looked up there and the result is converted back to the charset.
 *
 * @param	string		Charset for which to initialize case folding.
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 */
function initCaseFolding($charset)	{
		// Only process if the case table is not yet loaded:
	if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset]))	return 1;

		// Use cached version if possible
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile))	{
		$cached = unserialize(t3lib_div::getUrl($cacheFile));
		if (is_array($cached))	{
			$this->caseFolding[$charset] = $cached;
			return 2;
		}
			// An unreadable cache file is ignored; the table is rebuilt (and the cache rewritten) below
	}

		// init UTF-8 conversion for this charset
	if (!$this->initCharset($charset))	{
		return false;
	}

		// UTF-8 case folding is used as the base conversion table
	if (!$this->initUnicodeData('case'))	{
		return false;
	}

	$nochar = chr($this->noCharByteVal);
	$variants = array('toUpper','toLower','toTitle');
	foreach ($this->parsedCharsets[$charset]['local'] as $utf8)	{
			// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);

		foreach ($variants as $variant)	{
				// characters without a mapping for this variant are simply skipped
			if (!isset($this->caseFolding['utf-8'][$variant][$utf8]))	continue;

			$cc = $this->utf8_decode($this->caseFolding['utf-8'][$variant][$utf8], $charset);
				// only keep mappings that can be represented in the target charset
			if ($cc != '' && $cc != $nochar)	$this->caseFolding[$charset][$variant][$c] = $cc;
		}
	}

		// add the ASCII case table
	for ($i=ord('a'); $i<=ord('z'); $i++)	{
		$this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i-32);
	}
	for ($i=ord('A'); $i<=ord('Z'); $i++)	{
		$this->caseFolding[$charset]['toLower'][chr($i)] = chr($i+32);
	}

	if ($cacheFile)	{
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->caseFolding[$charset]));
	}

	return 3;
}
/**
 * Initializes the to-ASCII conversion table for a charset other than UTF-8.
 * This function is automatically called by the ASCII transliteration functions.
 *
 * The table is derived from the UTF-8 to-ASCII table: every character of the
 * charset that has a UTF-8 transliteration gets the same transliteration.
 *
 * @param	string		Charset for which to initialize conversion.
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 */
function initToASCII($charset)	{
		// Only process if the table is not yet loaded:
	if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset]))	return 1;

		// Use cached version if possible
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile))	{
		$cached = unserialize(t3lib_div::getUrl($cacheFile));
		if (is_array($cached))	{
			$this->toASCII[$charset] = $cached;
			return 2;
		}
			// An unreadable cache file is ignored; the table is rebuilt (and the cache rewritten) below
	}

		// init UTF-8 conversion for this charset
	if (!$this->initCharset($charset))	{
		return false;
	}

		// UTF-8/ASCII transliteration is used as the base conversion table
	if (!$this->initUnicodeData('ascii'))	{
		return false;
	}

		// Start from an empty table so that a charset without any transliterable
		// character still counts as "loaded" on the next call
	$this->toASCII[$charset] = array();
	foreach ($this->parsedCharsets[$charset]['local'] as $utf8)	{
			// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);

		if (isset($this->toASCII['utf-8'][$utf8]))	{
			$this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
		}
	}

	if ($cacheFile)	{
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->toASCII[$charset]));
	}

	return 3;
}
1264 /********************************************
1266 * String operation functions
1268 ********************************************/
/**
 * Returns a part of a string.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Start position (character position)
 * @param	integer		Length (in characters); NULL (default) means "up to the end of the string"
 * @return	string		The substring
 * @see substr(), mb_substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function substr($charset,$string,$start,$len=null)	{
	if ($len===0)	return '';

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
			// The length cannot be omitted when a charset is passed to mb_substr(),
			// so "up to the end" is expressed as the full character length of the string.
		if ($len==null)	$len = mb_strlen($string,$charset);
		return mb_substr($string,$start,$len,$charset);
	} elseif ($charset == 'utf-8')	{
		return $this->utf8_substr($string,$start,$len);
	} elseif (!empty($this->eucBasedSets[$charset]))	{
		return $this->euc_substr($string,$start,$charset,$len);
	} elseif (!empty($this->twoByteSets[$charset]))	{
			// fixed width of two bytes per character; a NULL length must not be multiplied (it would become 0)
		return $len === null ? substr($string,$start*2) : substr($string,$start*2,$len*2);
	} elseif (!empty($this->fourByteSets[$charset]))	{
			// fixed width of four bytes per character
		return $len === null ? substr($string,$start*4) : substr($string,$start*4,$len*4);
	}

		// treat everything else as single-byte encoding
	return $len === null ? substr($string,$start) : substr($string,$start,$len);
}
/**
 * Counts the number of characters.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * @param	string		The character set
 * @param	string		Character string
 * @return	integer		The number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strlen($charset,$string)	{
		// mbstring knows how to count in the given charset
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
		return mb_strlen($string,$charset);
	}

		// variable width charsets need a real scan of the string
	if ($charset == 'utf-8')	{
		return $this->utf8_strlen($string);
	}
	if ($this->eucBasedSets[$charset])	{
		return $this->euc_strlen($string,$charset);
	}

		// fixed width charsets: the byte length divided by the character width
	if ($this->twoByteSets[$charset])	{
		return strlen($string)/2;
	}
	if ($this->fourByteSets[$charset])	{
		return strlen($string)/4;
	}

		// treat everything else as single-byte encoding
	return strlen($string);
}
/**
 * Truncates a string and pre-/appends a string.
 * Unit tested by Kasper
 *
 * A positive length keeps that many characters from the start and appends the
 * crop signifier; a negative length keeps that many characters from the end and
 * prepends it. Strings that are not longer than the length are returned unchanged.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Length (in characters)
 * @param	string		Crop signifier
 * @return	string		The shortened string
 * @see substr(), mb_strimwidth()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function crop($charset,$string,$len,$crop='')	{
	if (intval($len) == 0)	return $string;

		// Translate the character length into a byte position
	if ($charset == 'utf-8')	{
		$i = $this->utf8_char2byte_pos($string,$len);
	} elseif (!empty($this->eucBasedSets[$charset]))	{
		$i = $this->euc_char2byte_pos($string,$len,$charset);
	} elseif ($len > 0)	{
		$i = $len;
	} else {
		$i = strlen($string)+$len;
		if ($i<=0)	$i = false;
	}

	if ($i === false)	{	// $len outside actual string length
		return $string;
	}

	$byteLength = strlen($string);
	if ($len > 0)	{
			// only crop if there is something beyond the cut position
		if ($i < $byteLength)	{
			return substr($string,0,$i).$crop;
		}
	} elseif ($i >= 1 && $i <= $byteLength)	{
			// only crop if there is something in front of the cut position
		return $crop.substr($string,$i);
	}

	return $string;
}
/**
 * Cuts a string short at a given byte length.
 * The cut never happens in the middle of a multi-byte character.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		The byte length
 * @return	string		The shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strtrunc($charset,$string,$len)	{
	if ($len <= 0)	return '';

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
		return mb_strcut($string,0,$len,$charset);
	} elseif ($charset == 'utf-8')	{
		return $this->utf8_strtrunc($string,$len);
	} elseif (!empty($this->eucBasedSets[$charset]))	{
			// argument order of euc_strtrunc() is: string, byte length, charset
		return $this->euc_strtrunc($string,$len,$charset);
	} elseif (!empty($this->twoByteSets[$charset]))	{
		if ($len % 2)	$len--;		// don't cut at odd positions
	} elseif (!empty($this->fourByteSets[$charset]))	{
		$len -= $len % 4;	// realign to position dividable by four
	}

		// treat everything else as single-byte encoding
	return substr($string,0,$len);
}
/**
 * Translates all characters of a string into their respective case values.
 * Unlike strtolower() and strtoupper() this method is locale independent.
 * Note that the string length may change!
 * eg. lower case German sharp s becomes upper case "SS"
 * Unit-tested by Kasper
 * Real case folding is language dependent, this method ignores this fact.
 *
 * @param	string		Character set of string
 * @param	string		Input string to convert case for
 * @param	string		Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
 * @return	string		The converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper()
 */
function conv_case($charset,$string,$case)	{
		// Use mbstring when configured and when its case functions are actually available
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring' && function_exists('mb_strtolower'))	{
		if ($case == 'toLower')	{
			return mb_strtolower($string,$charset);
		} else {
			return mb_strtoupper($string,$charset);
		}
	} elseif ($charset == 'utf-8')	{
		return $this->utf8_char_mapping($string,'case',$case);
	} elseif (isset($this->eucBasedSets[$charset]))	{
		return $this->euc_char_mapping($string,$charset,'case',$case);
	}

		// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string,$charset,'case',$case);
}
/**
 * Converts special chars (like accented letters, umlauts etc) to ascii equivalents (usually double-bytes, like "ae" for a-umlaut etc.)
 *
 * @param	string		Character set of string
 * @param	string		Input string to convert
 * @return	string		The converted string
 */
function specCharsToASCII($charset,$string)	{
		// UTF-8 has its own mapping routine
	if ($charset == 'utf-8')	{
		return $this->utf8_char_mapping($string,'ascii');
	}

		// charsets of the EUC family
	if (isset($this->eucBasedSets[$charset]))	{
		return $this->euc_char_mapping($string,$charset,'ascii');
	}

		// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string,$charset,'ascii');
}
1485 /********************************************
1487 * Internal string operation functions
1489 ********************************************/
/**
 * Maps all characters of a string in a single byte charset.
 *
 * @param	string		the string
 * @param	string		the charset
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string; the input is returned unchanged for an unknown mode or when no table is available
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function sb_char_mapping($str,$charset,$mode,$opt='')	{
	switch($mode)	{
		case 'case':
			if (!$this->initCaseFolding($charset))	return $str;	// do nothing
				// without a table for this conversion nothing can be mapped
			if (!isset($this->caseFolding[$charset][$opt]))	return $str;
			$map =& $this->caseFolding[$charset][$opt];
			break;

		case 'ascii':
			if (!$this->initToASCII($charset))	return $str;	// do nothing
			if (!isset($this->toASCII[$charset]))	return $str;
			$map =& $this->toASCII[$charset];
			break;

		default:
			return $str;
	}

	$out = '';
	$length = strlen($str);
	for ($i=0; $i<$length; $i++)	{
		$c = $str[$i];
			// characters without a mapping are copied as they are
		$out .= isset($map[$c]) ? $map[$c] : $c;
	}

	return $out;
}
1539 /********************************************
1541 * Internal UTF-8 string operation functions
1543 ********************************************/
/**
 * Returns a part of a UTF-8 string.
 * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
 *
 * @param	string		UTF-8 string
 * @param	integer		Start position (character position)
 * @param	integer		Length (in characters); NULL (default) means "up to the end of the string"
 * @return	string		The substring; FALSE if a positive $start lies outside the string
 * @see substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_substr($str,$start,$len=null)	{
		// an explicit length of zero (integer or string) always yields an empty string
	if ($len !== null && !strcmp($len,'0'))	return '';

	$byte_start = $this->utf8_char2byte_pos($str,$start);
	if ($byte_start === false)	{
		if ($start > 0)	{
			return false;	// $start outside string length
		}
			// a negative $start reaching beyond the beginning means: start at the first byte
		$byte_start = 0;
	}

	$str = substr($str,$byte_start);

	if ($len == null)	return $str;

	$byte_end = $this->utf8_char2byte_pos($str,$len);
	if ($byte_end === false)	{	// $len outside actual string length
		return $len<0 ? '' : $str;	// When length is less than zero and exceeds, then we return blank string.
	}

	return substr($str,0,$byte_end);
}
/**
 * Counts the number of characters of a string in UTF-8.
 * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
 *
 * @param	string		UTF-8 multibyte character string
 * @return	integer		The number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strlen($str)	{
	$length = strlen($str);
	$n = 0;
	for ($i=0; $i<$length; $i++)	{
			// Every byte except a continuation byte (10xxxxxx) starts a character:
			// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		if ((ord($str[$i]) & 0xC0) != 0x80)	$n++;
	}
	return $n;
}
/**
 * Truncates a string in UTF-8 short at a given byte length.
 * A multi-byte character that does not fit completely is dropped.
 *
 * @param	string		UTF-8 multibyte character string
 * @param	integer		the byte length
 * @return	string		the shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strtrunc($str,$len)	{
	if ($len <= 0)	return '';
	if ($len > strlen($str))	return $str;	// nothing to cut

	$i = $len-1;	// index of the last byte that would be kept
	if (ord($str[$i]) & 0x80)	{	// part of a multibyte sequence
			// find the first byte of that sequence
		while ($i > 0 && (ord($str[$i]) & 0xC0) == 0x80)	$i--;

		$lead = ord($str[$i]);
		if (($lead & 0xC0) != 0xC0)	{
				// malformed input (continuation bytes without a starting byte): drop the stray bytes
			return ($lead & 0x80) ? '' : substr($str,0,$i+1);
		}

		for ($bc=0, $mbs=$lead; $mbs & 0x80; $mbs = $mbs << 1)	$bc++;	// calculate number of bytes
		if ($bc+$i > $len)	return substr($str,0,$i);	// the character would be cut, so drop it completely
			// fallthru: multibyte char fits into length
	}

	return substr($str,0,$len);
}
/**
 * Find position of first occurrence of a string, both arguments are in UTF-8.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 string to search for
 * @param	integer		Position to start the search (character position)
 * @return	integer		The character position; FALSE if the needle is not found or the offset lies beyond the string
 * @see strpos()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strpos($haystack,$needle,$offset=0)	{
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
			// the third argument of mb_strpos() is the offset, the encoding comes fourth
		return mb_strpos($haystack,$needle,$offset,'utf-8');
	}

	$byte_offset = $this->utf8_char2byte_pos($haystack,$offset);
	if ($byte_offset === false)	return false; // offset beyond string length

	$byte_pos = strpos($haystack,$needle,$byte_offset);
	if ($byte_pos === false)	return false; // needle not found

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
/**
 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 character to search for (single character)
 * @return	integer		The character position; FALSE if the needle is not found
 * @see strrpos()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strrpos($haystack,$needle)	{
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
			// The position of the encoding argument of mb_strrpos() differs between PHP versions,
			// so the encoding is selected through the internal encoding instead.
		$enc = mb_internal_encoding();	// save internal encoding
		mb_internal_encoding('utf-8');
		$pos = mb_strrpos($haystack,$needle);
		mb_internal_encoding($enc);	// restore internal encoding

		return $pos;
	}

	$byte_pos = strrpos($haystack,$needle);
	if ($byte_pos === false)	return false; // needle not found

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
/**
 * Translates a character position into an 'absolute' byte position.
 * Unit tested by Kasper.
 *
 * @param	string		UTF-8 string
 * @param	integer		Character position (negative values start from the end)
 * @return	integer		Byte position; FALSE if the position lies at or beyond the end of the string or, for negative values, at or before its beginning
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char2byte_pos($str,$pos)	{
	$length = strlen($str);
	$n = 0;				// number of characters found
	$p = abs($pos);		// number of characters wanted

	if ($pos >= 0)	{
		$i = 0;
		$d = 1;
	} else {
		$i = $length-1;
		$d = -1;
	}

		// The explicit bounds matter: a negative string offset addresses bytes from the
		// end of the string in newer PHP versions, so it can't be used as the stop condition
	for ( ; $i>=0 && $i<$length && $n<$p; $i+=$d)	{
			// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		if ((ord($str[$i]) & 0xC0) != 0x80)	$n++;
	}
	if ($i<0 || $i>=$length)	return false; // offset beyond string length

	if ($pos >= 0)	{
			// skip trailing multi-byte data bytes
		while ($i<$length && (ord($str[$i]) & 0xC0) == 0x80)	{ $i++; }
	} else {
			// correct offset
		$i++;
	}

	return $i;
}
/**
 * Translates an 'absolute' byte position into a character position.
 * Unit tested by Kasper.
 *
 * @param	string		UTF-8 string
 * @param	integer		byte position
 * @return	integer		character position; FALSE if the string is empty or the byte position lies outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_byte2char_pos($str,$pos)	{
	$length = strlen($str);
	if ($length == 0 || $pos < 0 || $pos > $length)	return false; // offset beyond string length

	$n = 0;	// number of characters
	for ($i=$pos; $i>0; $i--)	{
			// Count the character boundaries between the first byte and $pos: the end of the string,
			// single-byte (0xxxxxxx) and multi-byte starting bytes (11xxxxxx)
		if ($i == $length || (ord($str[$i]) & 0xC0) != 0x80)	$n++;
	}

	return $n;
}
/**
 * Maps all characters of an UTF-8 string.
 *
 * @param	string		UTF-8 string
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string; the input is returned unchanged for an unknown mode or when no table is available
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char_mapping($str,$mode,$opt='')	{
	if (!$this->initUnicodeData($mode))	return $str;	// do nothing

	switch($mode)	{
		case 'case':
				// without a table for this conversion nothing can be mapped
			if (!isset($this->caseFolding['utf-8'][$opt]))	return $str;
			$map =& $this->caseFolding['utf-8'][$opt];
			break;

		case 'ascii':
			if (!isset($this->toASCII['utf-8']))	return $str;
			$map =& $this->toASCII['utf-8'];
			break;

		default:
			return $str;
	}

	$out = '';
	$length = strlen($str);
	for ($i=0; $i<$length; $i++)	{
		$c = ord($str[$i]);
			// single-byte (0xxxxxxx); a stray continuation byte (10xxxxxx) is passed through as it is
		$mbc = $str[$i];
		if (($c & 0xC0) == 0xC0)	{	// multi-byte starting byte (11xxxxxx)
			for ($bc=0; $c & 0x80; $c = $c << 1)	{ $bc++; }	// calculate number of bytes
			$mbc = substr($str,$i,$bc);
			$i += $bc-1;
		}

			// characters without a mapping are copied as they are
		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
1793 /********************************************
1795 * Internal EUC string operation functions
1797 * Extended Unix Code:
1798 * ASCII compatible 7bit single bytes chars
1799 * 8bit two byte chars
1801 * Shift-JIS is treated as a special case.
1803 ********************************************/
/**
 * Cuts a string in the EUC charset family short at a given byte length.
 * A double-byte character that does not fit completely is dropped.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		the byte length
 * @param	string		the charset
 * @return	string		the shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strtrunc($str,$len,$charset)	{
	if ($len <= 0)	return '';

	$length = strlen($str);
	if ($length <= $len)	return $str;	// string not longer than supplied length

	$sjis = ($charset == 'shift_jis');
	for ($i=0; $i<$length && $i<$len; $i++)	{
		$c = ord($str[$i]);
		if ($sjis)	{
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))	$i++;	// advance a double-byte char
		} elseif ($c >= 0x80)	{
			$i++;	// advance a double-byte char
		}
	}

	if ($i > $len)	{
		return substr($str,0,$len-1);	// we ended on a first byte
	}
	return substr($str,0,$len);
}
/**
 * Returns a part of a string in the EUC charset family.
 * The handling of $start and $len follows utf8_substr().
 *
 * @param	string		EUC multibyte character string
 * @param	integer		start position (character position)
 * @param	string		the charset
 * @param	integer		length (in characters); NULL (default) means "up to the end of the string"
 * @return	string		the substring; FALSE if a positive $start lies outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_substr($str,$start,$charset,$len=null)	{
		// an explicit length of zero (integer or string) always yields an empty string
	if ($len !== null && !strcmp($len,'0'))	return '';

	$byte_start = $this->euc_char2byte_pos($str,$start,$charset);
	if ($byte_start === false)	{
		if ($start > 0)	{
			return false;	// $start outside string length
		}
			// a negative $start reaching beyond the beginning means: start at the first byte
		$byte_start = 0;
	}

	$str = substr($str,$byte_start);

	if ($len == null)	return $str;

	$byte_end = $this->euc_char2byte_pos($str,$len,$charset);
	if ($byte_end === false)	{	// $len outside actual string length
		return $len<0 ? '' : $str;	// a negative length that exceeds the string leaves nothing
	}

	return substr($str,0,$byte_end);
}
/**
 * Counts the number of characters of a string in the EUC charset family.
 *
 * @param	string		EUC multibyte character string
 * @param	string		the charset
 * @return	integer		the number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strlen($str,$charset)	{
	$sjis = ($charset == 'shift_jis');
	$length = strlen($str);
	$n = 0;
	for ($i=0; $i<$length; $i++)	{
		$c = ord($str[$i]);
		if ($sjis)	{
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))	$i++;	// advance a double-byte char
		} elseif ($c >= 0x80)	{
			$i++;	// advance a double-byte char
		}

		$n++;
	}

	return $n;
}
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * The string is always scanned from its beginning, also for negative positions:
 * whether a byte starts a character can only be told by reading forwards,
 * because the second byte of a Shift-JIS character may look like a single-byte character.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		character position (negative values start from the end)
 * @param	string		the charset
 * @return	integer		byte position; FALSE if the position lies at or beyond the end of the string or, for negative values, at or before its beginning
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char2byte_pos($str,$pos,$charset)	{
	$sjis = ($charset == 'shift_jis');
	$length = strlen($str);
	$pos = intval($pos);

	if ($pos >= 0)	{
		$n = 0;	// number of characters seen
		for ($i=0; $i<$length; $i++)	{
			if ($n == $pos)	return $i;	// the wanted character starts here
			$n++;

			$c = ord($str[$i]);
			if ($sjis)	{
				if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))	$i++;	// advance a double-byte char
			} elseif ($c >= 0x80)	{
				$i++;	// advance a double-byte char
			}
		}
		return false; // offset beyond string length
	}

		// negative position: collect the byte offsets of all characters, then count from the end
	$starts = array();
	for ($i=0; $i<$length; $i++)	{
		$starts[] = $i;

		$c = ord($str[$i]);
		if ($sjis)	{
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))	$i++;	// advance a double-byte char
		} elseif ($c >= 0x80)	{
			$i++;	// advance a double-byte char
		}
	}

	$index = count($starts)+$pos;
	if ($index <= 0)	return false; // offset beyond string length

	return $starts[$index];
}
/**
 * Maps all characters of a string in the EUC charset family.
 *
 * @param	string		EUC multibyte character string
 * @param	string		the charset
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string; the input is returned unchanged for an unknown mode or when no table is available
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char_mapping($str,$charset,$mode,$opt='')	{
	switch($mode)	{
		case 'case':
			if (!$this->initCaseFolding($charset))	return $str;	// do nothing
				// without a table for this conversion nothing can be mapped
			if (!isset($this->caseFolding[$charset][$opt]))	return $str;
			$map =& $this->caseFolding[$charset][$opt];
			break;

		case 'ascii':
			if (!$this->initToASCII($charset))	return $str;	// do nothing
			if (!isset($this->toASCII[$charset]))	return $str;
			$map =& $this->toASCII[$charset];
			break;

		default:
			return $str;
	}

	$sjis = ($charset == 'shift_jis');
	$out = '';
	$length = strlen($str);
	for ($i=0; $i<$length; $i++)	{
		$mbc = $str[$i];
		$c = ord($mbc);

		if ($sjis)	{
			$double = (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0));
		} else {
			$double = ($c >= 0x80);
		}
		if ($double)	{	// a double-byte char
			$mbc = substr($str,$i,2);
			$i++;
		}

			// characters without a mapping are copied as they are
		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
1984 if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE
]['XCLASS']['t3lib/class.t3lib_cs.php']) {
1985 include_once($TYPO3_CONF_VARS[TYPO3_MODE
]['XCLASS']['t3lib/class.t3lib_cs.php']);