2 /***************************************************************
5 * (c) 2003-2004 Kasper Skaarhoj (kasper@typo3.com)
8 * This script is part of the Typo3 project. The Typo3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
17 * This script is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
22 * This copyright notice MUST APPEAR in all copies of the script!
23 ***************************************************************/
25 * Class for conversion between charsets.
29 * @author Kasper Skaarhoj <kasper@typo3.com>
30 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
33 * [CLASS/FUNCTION INDEX of SCRIPT]
38 * 261: function parse_charset($charset)
39 * 278: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
40 * 312: function utf8_encode($str,$charset)
41 * 359: function utf8_decode($str,$charset,$useEntityForNoChar=0)
42 * 407: function utf8_to_entities($str)
43 * 440: function entities_to_utf8($str,$alsoStdHtmlEnt=0)
44 * 474: function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
45 * 515: function initCharset($charset)
46 * 586: function UnumberToChar($cbyte)
47 * 630: function utf8CharToUnumber($str,$hex=0)
49 * SECTION: String operation functions
50 * 682: function strtrunc($charset,$string,$len)
51 * 716: function substr($charset,$str,$start,$len=null)
52 * 755: function strlen($charset,$string)
54 * SECTION: UTF-8 String operation functions
55 * 803: function utf8_strtrunc($str,$len)
56 * 831: function utf8_substr($str,$start,$len=null)
57 * 857: function utf8_strlen($str)
58 * 879: function utf8_strpos($haystack,$needle,$offset=0)
59 * 902: function utf8_strrpos($haystack,$needle)
60 * 921: function utf8_char2byte_pos($str,$pos)
61 * 946: function utf8_byte2char_pos($str,$pos)
63 * SECTION: EUC String operation functions
64 * 994: function euc_strtrunc($str,$len,$charset)
65 * 1028: function euc_substr($str,$start,$charset,$len=null)
66 * 1055: function euc_strlen($str,$charset)
67 * 1082: function euc_char2byte_pos($str,$pos,$charset)
70 * (This index is automatically created/updated by the extension "extdeveval")
84 * Functions working on UTF-8 strings:
89 * - implode/explode/join
91 * Functions nearly working on UTF-8 strings:
93 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
94 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained 7-bit ASCII
95 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
96 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
98 * Functions NOT working on UTF-8 strings:
112 * Class for conversion between charsets.
114 * @author Kasper Skaarhoj <kasper@typo3.com>
115 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
120 var $noCharByteVal=127; // ASCII Value for chars with no equalent.
122 // This is the array where parsed conversion tables are stored (cached)
123 var $parsedCharsets=array();
125 // An array where case folding data will be stored (cached)
126 var $caseFolding=array();
128 // This tells the converter which charsets has two bytes per char:
129 var $twoByteSets=array(
130 'ucs-2'=>1, // 2-byte Unicode
133 // This tells the converter which charsets has four bytes per char:
134 var $fourByteSets=array(
135 'ucs-4'=>1, // 4-byte Unicode
136 'utf-32'=>1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
139 // This tells the converter which charsets use a scheme like the Extended Unix Code:
140 var $eucBasedSets=array(
141 'gb2312'=>1, // Chinese, simplified.
142 'big5'=>1, // Chinese, traditional.
145 // see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
146 // http://czyborra.com/charsets/iso8859.html
149 'us-ascii'=> 'ascii',
150 'cp819' => 'iso-8859-1',
151 'ibm819' => 'iso-8859-1',
152 'iso-ir-100' => 'iso-8859-1',
153 'iso-ir-109' => 'iso-8859-2',
154 'iso-ir-148' => 'iso-8859-9',
155 'iso-ir-199' => 'iso-8859-14',
156 'iso-ir-203' => 'iso-8859-15',
157 'csisolatin1' => 'iso-8859-1',
158 'csisolatin2' => 'iso-8859-2',
159 'csisolatin3' => 'iso-8859-3',
160 'csisolatin5' => 'iso-8859-9',
161 'csisolatin8' => 'iso-8859-14',
162 'csisolatin9' => 'iso-8859-15',
163 'csisolatingreek' => 'iso-8859-7',
164 'iso-celtic' => 'iso-8859-14',
165 'latin1' => 'iso-8859-1',
166 'latin2' => 'iso-8859-2',
167 'latin3' => 'iso-8859-3',
168 'latin5' => 'iso-8859-9',
169 'latin6' => 'iso-8859-10',
170 'latin8' => 'iso-8859-14',
171 'latin9' => 'iso-8859-15',
172 'l1' => 'iso-8859-1',
173 'l2' => 'iso-8859-2',
174 'l3' => 'iso-8859-3',
175 'l5' => 'iso-8859-9',
176 'l6' => 'iso-8859-10',
177 'l8' => 'iso-8859-14',
178 'l9' => 'iso-8859-15',
179 'cyrillic' => 'iso-8859-5',
180 'arabic' => 'iso-8859-6',
181 'win874' => 'windows-874',
182 'win1250' => 'windows-1250',
183 'win1251' => 'windows-1251',
184 'win1252' => 'windows-1252',
185 'win1253' => 'windows-1253',
186 'win1254' => 'windows-1254',
187 'win1255' => 'windows-1255',
188 'win1256' => 'windows-1256',
189 'win1257' => 'windows-1257',
190 'win1258' => 'windows-1258',
191 'cp1250' => 'windows-1250',
192 'cp1252' => 'windows-1252',
193 'ms-ee' => 'windows-1250',
194 'ms-ansi' => 'windows-1252',
195 'ms-greek' => 'windows-1253',
196 'ms-turk' => 'windows-1254',
197 'winbaltrim' => 'windows-1257',
198 'koi-8ru' => 'koi-8r',
201 'macintosh' => 'macRoman',
202 'euc-cn' => 'gb2312',
203 'x-euc-cn' => 'gb2312',
207 'sjis' => 'shift_jis',
208 'shift-jis' => 'shift_jis',
209 'cp932' => 'shift_jis',
219 // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
220 // Empty values means "iso-8859-1"
221 var $charSetArray = array(
229 'cz' => 'windows-1250',
230 'pl' => 'iso-8859-2',
231 'si' => 'windows-1250',
233 'tr' => 'iso-8859-9',
236 'ru' => 'windows-1251',
237 'ro' => 'iso-8859-2',
239 'sk' => 'windows-1250',
240 'lt' => 'windows-1257',
242 'hr' => 'windows-1250',
243 'hu' => 'iso-8859-2',
245 'th' => 'iso-8859-11',
246 'gr' => 'iso-8859-7',
249 'bg' => 'windows-1251',
251 'et' => 'iso-8859-4',
252 'ar' => 'iso-8859-6',
254 'ua' => 'windows-1251',
258 * Normalize - changes input character set to lowercase letters.
260 * @param string Input charset
261 * @return string Normalized charset
262 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
/**
 * Normalizes a charset name.
 * The name is trimmed and lowercased; if the result is a key of
 * $this->synonyms it is replaced by the value stored there.
 *
 * @param	string		Input charset
 * @return	string		Normalized charset
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function parse_charset($charset)	{
		// Surrounding whitespace would otherwise defeat the synonym lookup
	$charset = strtolower(trim($charset));
	if (isset($this->synonyms[$charset]))	{
		$charset = $this->synonyms[$charset];
	}

	return $charset;
}
273 * Convert from one charset to another charset.
275 * @param string Input string
276 * @param string From charset (the current charset of the string)
277 * @param string To charset (the output charset wanted)
278 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
279 * @return string Converted string
/**
 * Convert from one charset to another charset.
 * Depending on $TYPO3_CONF_VARS['SYS']['t3lib_cs_convMethod'] the work is
 * delegated to iconv, recode or mbstring; if that is not configured, fails,
 * or an entity fallback is requested, the internal conversion via UTF-8 is used.
 *
 * @param	string		Input string
 * @param	string		From charset (the current charset of the string)
 * @param	string		To charset (the output charset wanted)
 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return	string		Converted string
 */
function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)	{
	global $TYPO3_CONF_VARS;

	if ($fromCS==$toCS)	return $str;

		// iconv, recode and mbstring don't support fallback to SGML entities
	if (!$useEntityForNoChar)	{
		$method = isset($TYPO3_CONF_VARS['SYS']['t3lib_cs_convMethod']) ? $TYPO3_CONF_VARS['SYS']['t3lib_cs_convMethod'] : '';
		$conv_str = false;

		if ($method == 'iconv')	{
				// Signature is iconv(from, to, string)
			$conv_str = iconv($fromCS,$toCS.'//TRANSLIT',$str);
		} elseif ($method == 'recode')	{
				// A recode request reads "from..to"
			$conv_str = recode_string($fromCS.'..'.$toCS,$str);
		} elseif ($method == 'mbstring')	{
				// Note the argument order: target charset first
			$conv_str = mb_convert_encoding($str,$toCS,$fromCS);
		}

		if (false !== $conv_str)	return $conv_str;
			// otherwise: fallback to TYPO3 conversion
	}

	if ($fromCS!='utf-8')	$str=$this->utf8_encode($str,$fromCS);
	if ($toCS!='utf-8')	$str=$this->utf8_decode($str,$toCS,$useEntityForNoChar);

	return $str;
}
309 * Converts $str from $charset to UTF-8
311 * @param string String in local charset to convert to UTF-8
312 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
313 * @return string Output string, converted to UTF-8
/**
 * Converts $str from $charset to UTF-8
 * Characters that have no entry in the parsed conversion table are replaced
 * by chr($this->noCharByteVal). Nothing is returned if the conversion table
 * for $charset cannot be initialized.
 *
 * @param	string		String in local charset to convert to UTF-8
 * @param	string		Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return	string		Output string, converted to UTF-8
 */
function utf8_encode($str,$charset)	{
		// Parse conv. table if not already done
	if ($this->initCharset($charset))	{
		$strLen = strlen($str);
		$outStr = '';
		$noChar = chr($this->noCharByteVal);
		$isTwoByte = !empty($this->twoByteSets[$charset]);
		$isEuc = !empty($this->eucBasedSets[$charset]);

		for ($a=0;$a<$strLen;$a++)	{	// Traverse each char in string.
			$chr = substr($str,$a,1);
			$ord = ord($chr);

			if ($isTwoByte)	{	// The charset has two bytes per char
				$a++;	// the second byte belongs to this char, so it must not be visited again
				$ord2 = ord(substr($str,$a,1));
				$ord = ($ord<<8) | $ord2;	// assume big endian; the bytes are combined with OR

				$outStr.= isset($this->parsedCharsets[$charset]['local'][$ord]) ? $this->parsedCharsets[$charset]['local'][$ord] : $noChar;
			} elseif ($ord>127)	{	// Char has a value over 127: needs lookup and may be the first of two bytes
					// EUC uses two bytes above 127. Shift-JIS is like EUC, but chars between 160 and 223 are single byte.
				if ($isEuc || ($charset == 'shift_jis' && ($ord<160 || $ord>223)))	{
					$a++;
					$ord2 = ord(substr($str,$a,1));
					$ord = $ord*256+$ord2;	// make $ord a 16 bit int
				}

				$outStr.= isset($this->parsedCharsets[$charset]['local'][$ord]) ? $this->parsedCharsets[$charset]['local'][$ord] : $noChar;
			} else {
				$outStr.=$chr;	// ASCII 0-127 and one byte. Transparent
			}
		}
		return $outStr;
	}
}
355 * Converts $str from UTF-8 to $charset
357 * @param string String in UTF-8 to convert to local charset
358 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
359 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
360 * @return string Output string, converted to local charset
/**
 * Converts $str from UTF-8 to $charset
 * Every UTF-8 sequence is looked up in the parsed conversion table of the
 * charset. Unknown sequences become a numeric entity (if requested) or
 * chr($this->noCharByteVal). Nothing is returned if the conversion table
 * cannot be initialized.
 *
 * @param	string		String in UTF-8 to convert to local charset
 * @param	string		Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return	string		Output string, converted to local charset
 */
function utf8_decode($str,$charset,$useEntityForNoChar=0)	{
		// Parse conv. table if not already done; without a table there is nothing to return
	if (!$this->initCharset($charset))	return;

	$fallback = chr($this->noCharByteVal);
	$result = '';
	$total = strlen($str);
	$pos = 0;

	while ($pos < $total)	{
		$byte = substr($str,$pos,1);
		$code = ord($byte);

		if ($code <= 127)	{
				// Plain ASCII passes through untouched
			$result.= $byte;
		} elseif (!($code & 64))	{
				// 10xxxxxx: we are in the MIDDLE of a multibyte sequence
			$result.= $fallback;
		} else {
				// Lead byte: every further high bit announces one more byte of the sequence
			$seq = $byte;
			for ($n=0;$n<8;$n++)	{
				$code = $code << 1;
				if (!($code & 128))	break;
				$pos++;
				$seq.= substr($str,$pos,1);
			}

			if (isset($this->parsedCharsets[$charset]['utf8'][$seq]))	{
				$local = $this->parsedCharsets[$charset]['utf8'][$seq];	// The local number
					// Numbers above 255 are written as two bytes (16 bit word assumed)
				$result.= $local>255 ? chr(($local >> 8) & 255).chr($local & 255) : chr($local);
			} elseif ($useEntityForNoChar)	{
				$result.= '&#'.$this->utf8CharToUnumber($seq,1).';';
			} else {
				$result.= $fallback;
			}
		}
		$pos++;
	}

	return $result;
}
405 * Converts all chars > 127 to numeric entities.
407 * @param string Input string
408 * @return string Output string
/**
 * Converts all chars > 127 to numeric entities.
 * Each UTF-8 multibyte sequence is replaced by a hexadecimal entity built from
 * utf8CharToUnumber(); a stray continuation byte becomes chr($this->noCharByteVal).
 *
 * @param	string		Input string
 * @return	string		Output string
 */
function utf8_to_entities($str)	{
	$result = '';
	$total = strlen($str);
	$pos = 0;

	while ($pos < $total)	{
		$byte = substr($str,$pos,1);
		$code = ord($byte);

		if ($code <= 127)	{
				// Plain ASCII passes through untouched
			$result.= $byte;
		} elseif (!($code & 64))	{
				// 10xxxxxx: we are in the MIDDLE of a multibyte sequence
			$result.= chr($this->noCharByteVal);
		} else {
				// Lead byte: every further high bit announces one more byte of the sequence
			$seq = $byte;
			for ($n=0;$n<8;$n++)	{
				$code = $code << 1;
				if (!($code & 128))	break;
				$pos++;
				$seq.= substr($str,$pos,1);
			}
			$result.= '&#'.$this->utf8CharToUnumber($seq,1).';';
		}
		$pos++;
	}

	return $result;
}
437 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x4D2;)) to UTF-8 multibyte chars
439 * @param string Input string, UTF-8
440 * @param boolean If set, then all string-HTML entities (like &amp; or &pound;) will be converted as well
441 * @return string Output string
/**
 * Converts numeric entities (UNICODE, decimal like "&#1234;" or hexadecimal
 * like "&#x4D2;") to UTF-8 multibyte chars.
 * Entities that are not converted are left in the string unchanged.
 *
 * @param	string		Input string, UTF-8
 * @param	boolean		If set, then all named HTML entities known to get_html_translation_table() are converted as well
 * @return	string		Output string
 */
function entities_to_utf8($str,$alsoStdHtmlEnt=0)	{
	$trans_tbl = array();
	if ($alsoStdHtmlEnt)	{
		$trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES));
	}

		// Split on entities and keep them: even indexes hold plain text, odd indexes hold a complete entity ("&...;")
	$parts = preg_split('/(&[#[:alnum:]]*;)/',$str,-1,PREG_SPLIT_DELIM_CAPTURE);
	foreach($parts as $k => $entity)	{
		if ($k%2)	{
			$v = substr($entity,1,-1);	// the entity without "&" and ";"

			if (substr($v,0,1)=='#')	{	// Dec or hex entities:
				$marker = substr($v,1,1);
				if ($marker=='x' || $marker=='X')	{
					$parts[$k] = $this->UnumberToChar(hexdec(substr($v,2)));
				} else {
						// Cast, since the pattern also lets non-digits through
					$parts[$k] = $this->UnumberToChar((int)substr($v,1));
				}
			} elseif ($alsoStdHtmlEnt && isset($trans_tbl[$entity]))	{	// Other entities:
				$char = $trans_tbl[$entity];
					// Newer PHP versions deliver the table in UTF-8 already (multibyte values);
					// a single byte is either ASCII or Latin-1 and is converted as before.
				$parts[$k] = strlen($char)>1 ? $char : $this->utf8_encode($char,'iso-8859-1');
			}
				// else: no conversion, $parts[$k] still holds the original entity
		}
	}

	return implode('',$parts);
}
470 * Converts all chars in the input UTF-8 string into integer numbers returned in an array
472 * @param string Input string, UTF-8
473 * @param boolean If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
474 * @param boolean If set, then instead of integer numbers the real UTF-8 char is returned.
475 * @return array Output array with the char numbers
/**
 * Converts all chars in the input UTF-8 string into integer numbers returned in an array
 * A stray continuation byte is reported as $this->noCharByteVal
 * (or chr() of it when characters are requested).
 *
 * @param	string		Input string, UTF-8
 * @param	boolean		If set, then all HTML entities (named and numeric) will be detected as characters.
 * @param	boolean		If set, then instead of integer numbers the real UTF-8 char is returned.
 * @return	array		Output array with the char numbers
 */
function utf8_to_numberarray($str,$convEntities=0,$retChar=0)	{
		// If entities must be registered as well, resolve them first
	if ($convEntities)	{
		$str = $this->entities_to_utf8($str,1);
	}

	$numbers = array();
	$total = strlen($str);
	$pos = 0;

	while ($pos < $total)	{
		$byte = substr($str,$pos,1);
		$code = ord($byte);

		if ($code <= 127)	{
				// Plain ASCII, one byte
			$numbers[] = $retChar ? chr($code) : $code;
		} elseif (!($code & 64))	{
				// 10xxxxxx: we are in the MIDDLE of a multibyte sequence
			$numbers[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
		} else {
				// Lead byte: every further high bit announces one more byte of the sequence
			$seq = $byte;
			for ($n=0;$n<8;$n++)	{
				$code = $code << 1;
				if (!($code & 128))	break;
				$pos++;
				$seq.= substr($str,$pos,1);
			}
			$numbers[] = $retChar ? $seq : $this->utf8CharToUnumber($seq);
		}
		$pos++;
	}

	return $numbers;
}
509 * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
510 * This function is automatically called by the conversion functions
512 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
514 * @param string The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
515 * @return integer Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed.
518 function initCharset($charset) {
519 // Only process if the charset is not yet loaded:
520 if (!is_array($this->parsedCharsets
[$charset])) {
522 // Conversion table filename:
523 $charsetConvTableFile = PATH_t3lib
.'csconvtbl/'.$charset.'.tbl';
525 // If the conversion table is found:
526 if ($charset && t3lib_div
::validPathStr($charsetConvTableFile) && @is_file
($charsetConvTableFile)) {
527 // Cache file for charsets:
528 // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
529 $cacheFile = t3lib_div
::getFileAbsFileName('typo3temp/charset_'.$charset.'.tbl');
530 if ($cacheFile && @is_file
($cacheFile)) {
531 $this->parsedCharsets
[$charset]=unserialize(t3lib_div
::getUrl($cacheFile));
533 // Parse conversion table into lines:
534 $lines=t3lib_div
::trimExplode(chr(10),t3lib_div
::getUrl($charsetConvTableFile),1);
535 // Initialize the internal variable holding the conv. table:
536 $this->parsedCharsets
[$charset]=array('local'=>array(),'utf8'=>array());
537 // traverse the lines:
539 foreach($lines as $value) {
540 if (trim($value) && substr($value,0,1)!='#') { // Comment line or blanks are ignored.
542 // Detect type if not done yet: (Done on first real line)
543 // The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
544 if (!$detectedType) $detectedType = ereg('[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+',$value) ?
'whitespaced' : 'ms-token';
546 if ($detectedType=='ms-token') {
547 list($hexbyte,$utf8) = split('=|:',$value,3);
548 } elseif ($detectedType=='whitespaced') {
550 ereg('[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+',$value,$regA);
552 $utf8 = 'U+'.$regA[2];
554 $decval = hexdec(trim($hexbyte));
556 $utf8decval = hexdec(substr(trim($utf8),2));
557 $this->parsedCharsets
[$charset]['local'][$decval]=$this->UnumberToChar($utf8decval);
558 $this->parsedCharsets
[$charset]['utf8'][$this->parsedCharsets
[$charset]['local'][$decval]]=$decval;
563 t3lib_div
::writeFile($cacheFile,serialize($this->parsedCharsets
[$charset]));
572 * Converts a UNICODE number to a UTF-8 multibyte character
573 * Algorithm based on script found at From: http://czyborra.com/utf/
575 * The binary representation of the character's integer value is thus simply spread across the bytes and the number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
577 * bytes | bits | representation
579 * 2 | 11 | 110vvvvv 10vvvvvv
580 * 3 | 16 | 1110vvvv 10vvvvvv 10vvvvvv
581 * 4 | 21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
582 * 5 | 26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
583 * 6 | 31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
585 * @param integer UNICODE integer
586 * @return string UTF-8 multibyte character string
587 * @see utf8CharToUnumber()
/**
 * Converts a UNICODE number to a UTF-8 multibyte character
 *
 * The binary representation of the number is spread across the bytes; the
 * number of high bits set in the lead byte announces the length of the sequence:
 *
 *  bytes | bits | representation
 *      1 |    7 | 0vvvvvvv
 *      2 |   11 | 110vvvvv 10vvvvvv
 *      3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
 *      4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * @param	integer		UNICODE integer (numeric strings are accepted and cast)
 * @return	string		UTF-8 multibyte character string; chr($this->noCharByteVal) for numbers that cannot be expressed
 * @see utf8CharToUnumber()
 */
function UnumberToChar($cbyte)	{
		// Callers hand over text taken from entities like "&#1234;"; shifting needs a real integer
	if (is_string($cbyte))	$cbyte = (int)$cbyte;

	$str = '';
	if ($cbyte < 0)	{	// Negative numbers are no code points
		$str.=chr($this->noCharByteVal);
	} elseif ($cbyte < 0x80)	{
		$str.=chr($cbyte);
	} elseif ($cbyte < 0x800)	{
		$str.=chr(0xC0 | ($cbyte >> 6));
		$str.=chr(0x80 | ($cbyte & 0x3F));
	} elseif ($cbyte < 0x10000)	{
		$str.=chr(0xE0 | ($cbyte >> 12));
		$str.=chr(0x80 | (($cbyte >> 6) & 0x3F));
		$str.=chr(0x80 | ($cbyte & 0x3F));
	} elseif ($cbyte < 0x200000)	{
		$str.=chr(0xF0 | ($cbyte >> 18));
		$str.=chr(0x80 | (($cbyte >> 12) & 0x3F));
		$str.=chr(0x80 | (($cbyte >> 6) & 0x3F));
		$str.=chr(0x80 | ($cbyte & 0x3F));
	} elseif ($cbyte < 0x4000000)	{
		$str.=chr(0xF8 | ($cbyte >> 24));
		$str.=chr(0x80 | (($cbyte >> 18) & 0x3F));
		$str.=chr(0x80 | (($cbyte >> 12) & 0x3F));
		$str.=chr(0x80 | (($cbyte >> 6) & 0x3F));
		$str.=chr(0x80 | ($cbyte & 0x3F));
	} elseif ($cbyte < 0x80000000)	{
		$str.=chr(0xFC | ($cbyte >> 30));
		$str.=chr(0x80 | (($cbyte >> 24) & 0x3F));
		$str.=chr(0x80 | (($cbyte >> 18) & 0x3F));
		$str.=chr(0x80 | (($cbyte >> 12) & 0x3F));
		$str.=chr(0x80 | (($cbyte >> 6) & 0x3F));
		$str.=chr(0x80 | ($cbyte & 0x3F));
	} else {	// Cannot express a 32-bit character in UTF-8
		$str.=chr($this->noCharByteVal);
	}

	return $str;
}
626 * Converts a UTF-8 Multibyte character to a UNICODE number
628 * @param string UTF-8 multibyte character string
629 * @param boolean If set, then a hex. number is returned.
630 * @return integer UNICODE integer
631 * @see UnumberToChar()
/**
 * Converts a UTF-8 Multibyte character to a UNICODE number
 * Only the first character of $str is evaluated. A first byte that is not a
 * lead byte (ASCII or a stray continuation byte) is returned as its byte value.
 *
 * @param	string		UTF-8 multibyte character string
 * @param	boolean		If set, then a hex. number is returned (as string, prefixed with "x").
 * @return	integer		UNICODE integer
 * @see UnumberToChar()
 */
function utf8CharToUnumber($str,$hex=0)	{
	$ord = ord(substr($str,0,1));	// First char

	if (($ord & 0xC0) == 0xC0)	{	// This verifies that it IS a multi byte string
			// Each high bit below the top one announces one continuation byte.
			// The scan stops at five, so the invalid lead bytes 0xFE/0xFF cannot run past the byte.
		$extra = 0;
		for ($mask=0x40; $mask>0x02 && ($ord & $mask); $mask>>=1)	$extra++;

			// $mask now sits on the first bit after the length marker; everything below it is payload
		$int = $ord & ($mask-1);
		for ($b=1;$b<=$extra;$b++)	{
				// every continuation byte contributes its lower six bits
			$int = ($int << 6) | (ord(substr($str,$b,1)) & 0x3F);
		}
	} else {
		$int = $ord;
	}

	return $hex ? 'x'.dechex($int) : $int;
}
653 * This function initializes the UTF-8 case folding table.
655 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
657 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
660 function initCaseFoldingUTF8() {
661 // Only process if the case table is not yet loaded:
662 if (is_array($this->caseFolding
['utf-8'])) return 1;
664 // Use cached version if possible
665 $cacheFile = t3lib_div
::getFileAbsFileName('typo3temp/cscase_utf-8.tbl');
666 if ($cacheFile && @is_file
($cacheFile)) {
667 $this->caseFolding
['utf-8'] = unserialize(t3lib_div
::getUrl($cacheFile));
671 // process main Unicode data file
672 $unicodeDataFile = PATH_t3lib
.'unidata/UnicodeData.txt';
673 if (!(t3lib_div
::validPathStr($unicodeDataFile) && @is_file
($unicodeDataFile))) return false
;
675 $fh = fopen($unicodeDataFile,'r');
676 if (!$fh) return false
;
678 // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
679 // note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
680 $this->caseFolding
['utf-8'] = array();
681 $utf8CaseFolding =& $this->caseFolding
['utf-8']; // a shorthand
682 $utf8CaseFolding['toUpper'] = array();
683 $utf8CaseFolding['toLower'] = array();
684 $utf8CaseFolding['toTitle'] = array();
688 // has also other info like character class (digit, white space, etc.) and more
689 list($char,,,,,,,,,,,,$upper,$lower,$title,) = split(';', rtrim($line));
690 $char = $this->UnumberToChar(hexdec($char));
691 if ($upper) $utf8CaseFolding['toUpper'][$char] = $this->UnumberToChar(hexdec($upper));
692 if ($lower) $utf8CaseFolding['toLower'][$char] = $this->UnumberToChar(hexdec($lower));
693 // store "title" only when different from "upper" (only a few)
694 if ($title && $title != $upper) $utf8CaseFolding['toTitle'][$char] = $this->UnumberToChar(hexdec($title));
698 // process additional Unicode data for casing (allow folded characters to expand into a sequence)
699 $specialCasingFile = PATH_t3lib
.'unidata/SpecialCasing.txt';
700 if (t3lib_div
::validPathStr($specialCasingFile) && @is_file
($specialCasingFile)) {
702 $fh = fopen($specialCasingFile,'r');
706 if ($line{0} != '#' && trim($line) != '') {
708 list($char,$lower,$title,$upper,$cond) = t3lib_div
::trimExplode(';', $line);
709 if ($cond == '' ||
$cond{0} == '#') {
710 $utf8_char = $this->UnumberToChar(hexdec($char));
711 if ($char != $lower) {
712 $arr = split(' ',$lower);
713 for ($i=0; isset($arr[$i]); $i++
) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
714 $utf8CaseFolding['toLower'][$utf8_char] = implode($arr);
716 if ($char != $title && $title != $upper) {
717 $arr = split(' ',$title);
718 for ($i=0; isset($arr[$i]); $i++
) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
719 $utf8CaseFolding['toTitle'][$utf8_char] = implode($arr);
721 if ($char != $upper) {
722 $arr = split(' ',$upper);
723 for ($i=0; isset($arr[$i]); $i++
) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
724 $utf8CaseFolding['toUpper'][$utf8_char] = implode($arr);
734 t3lib_div
::writeFile($cacheFile,serialize($utf8CaseFolding));
741 * This function initializes the folding table for a charset other than UTF-8.
742 * This function is automatically called by the case folding functions.
744 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
747 function initCaseFolding($charset) {
748 // Only process if the case table is not yet loaded:
749 if (is_array($this->caseFolding
[$charset])) return 1;
751 // Use cached version if possible
752 $cacheFile = t3lib_div
::getFileAbsFileName('typo3temp/cscase_'.$charset.'.tbl');
753 if ($cacheFile && @is_file
($cacheFile)) {
754 $this->caseFolding
[$charset] = unserialize(t3lib_div
::getUrl($cacheFile));
758 // init UTF-8 conversion for this charset
759 if (!$this->initCharset($charset)) {
763 // UTF-8 case folding is used as the base conversion table
764 if (!$this->initCaseFoldingUTF8()) {
768 $nochar = chr($this->noCharByteVal
);
769 foreach ($this->parsedCharsets
[$charset]['local'] as $ci => $utf8) {
770 // reconvert to charset (don't use chr() of numeric value, might be muli-byte)
771 $c = $this->conv($utf8, 'utf-8', $charset);
773 $cc = $this->conv($this->caseFolding
['utf-8']['toUpper'][$utf8], 'utf-8', $charset);
774 if ($cc && $cc != $nochar) $this->caseFolding
[$charset]['toUpper'][$c] = $cc;
776 $cc = $this->conv($this->caseFolding
['utf-8']['toLower'][$utf8], 'utf-8', $charset);
777 if ($cc && $cc != $nochar) $this->caseFolding
[$charset]['toLower'][$c] = $cc;
779 $cc = $this->conv($this->caseFolding
['utf-8']['toTitle'][$utf8], 'utf-8', $charset);
780 if ($cc && $cc != $nochar) $this->caseFolding
[$charset]['toTitle'][$c] = $cc;
783 // add the ASCII case table
784 for ($i=ord('a'); $i<=ord('z'); $i++
) {
785 $this->caseFolding
[$charset]['toUpper'][chr($i)] = chr($i-32);
787 for ($i=ord('A'); $i<=ord('Z'); $i++
) {
788 $this->caseFolding
[$charset]['toLower'][chr($i)] = chr($i+
32);
792 t3lib_div
::writeFile($cacheFile,serialize($this->caseFolding
[$charset]));
814 /********************************************
816 * String operation functions
818 ********************************************/
821 * Cuts a string short at a given byte length.
823 * @param string the character set
824 * @param string character string
825 * @param integer the byte length
826 * @return string the shortened string
828 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
/**
 * Cuts a string short at a given byte length.
 * The cut never splits a character: multibyte charsets are handled by
 * mb_strcut() (if configured) or the UTF-8/EUC helpers, fixed-width charsets
 * are realigned to a multiple of their character width.
 *
 * @param	string		the character set
 * @param	string		character string
 * @param	integer		the byte length
 * @return	string		the shortened string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strtrunc($charset,$string,$len)	{
		// A negative length would make substr() cut from the end instead
	if ($len <= 0)	return '';

	$utils = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] : '';

	if ($utils == 'mbstring')	{
		return mb_strcut($string,0,$len,$charset);
	} elseif ($charset == 'utf-8')	{
		return $this->utf8_strtrunc($string,$len);
	} elseif ($charset == 'shift_jis')	{
		return $this->euc_strtrunc($string,$len,'shift_jis');
	} elseif (!empty($this->eucBasedSets[$charset]))	{
		return $this->euc_strtrunc($string,$len,$charset);
	} elseif (!empty($this->twoByteSets[$charset]))	{
		if ($len % 2)	$len--;	// don't cut at odd positions
	} elseif (!empty($this->fourByteSets[$charset]))	{
		$len -= $len % 4;	// realign to position dividable by four
	}

		// treat everything else as single-byte encoding
	return substr($string,0,$len);
}
850 * Returns a part of a string.
853 * Negative values for @arg $start and @arg $len are currently not supported.
855 * @param string the character set
856 * @param string character string
857 * @param int $start start position (character position)
858 * @param int length (in characters)
859 * @return string the substring
861 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
/**
 * Returns a part of a string.
 * Negative values for $start and $len are currently not supported.
 *
 * @param	string		the character set
 * @param	string		character string
 * @param	int		$start start position (character position)
 * @param	int		length (in characters); if omitted the rest of the string is returned
 * @return	string		the substring
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function substr($charset,$string,$start,$len=null)	{
	if ($len===0)	return '';

	$toEnd = is_null($len);
	$utils = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] : '';

	if ($utils == 'mbstring')	{
		if ($toEnd)	{
				// mb_substr() cannot take a charset without a length, so the
				// internal encoding is switched to the requested charset for this call
			$enc = mb_internal_encoding();	// save internal encoding
			mb_internal_encoding($charset);
			$str = mb_substr($string,$start);
			mb_internal_encoding($enc);	// restore internal encoding

			return $str;
		}
		return mb_substr($string,$start,$len,$charset);
	} elseif ($charset == 'utf-8')	{
		return $this->utf8_substr($string,$start,$len);
	} elseif ($charset == 'shift_jis')	{
		return $this->euc_substr($string,$start,'shift_jis',$len);
	} elseif (!empty($this->eucBasedSets[$charset]))	{
		return $this->euc_substr($string,$start,$charset,$len);
	}

		// Fixed-width charsets: character positions scale by the character width.
		// Everything else is treated as single-byte encoding.
	$width = 1;
	if (!empty($this->twoByteSets[$charset]))	{
		$width = 2;
	} elseif (!empty($this->fourByteSets[$charset]))	{
		$width = 4;
	}

		// An omitted length must not be passed on (or multiplied): older PHP versions read NULL as 0
	if ($toEnd)	return substr($string,$start*$width);
	return substr($string,$start*$width,$len*$width);
}
895 * Counts the number of characters.
897 * @param string the character set
898 * @param string character string
899 * @return integer the number of characters
901 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
/**
 * Counts the number of characters.
 *
 * @param	string		the character set
 * @param	string		character string
 * @return	integer		the number of characters
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strlen($charset,$string)	{
	$utils = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] : '';

	if ($utils == 'mbstring')	{
		return mb_strlen($string,$charset);
	} elseif ($charset == 'utf-8')	{
		return $this->utf8_strlen($string);
	} elseif ($charset == 'shift_jis')	{
		return $this->euc_strlen($string,'shift_jis');
	} elseif (!empty($this->eucBasedSets[$charset]))	{
		return $this->euc_strlen($string,$charset);
	} elseif (!empty($this->twoByteSets[$charset]))	{
			// Cast: a truncated string must not yield a fractional character count
		return (int)(strlen($string)/2);
	} elseif (!empty($this->fourByteSets[$charset]))	{
		return (int)(strlen($string)/4);
	}

		// treat everything else as single-byte encoding
	return strlen($string);
}
922 * Translates all characters of a string into their respective case values.
923 * Unlike strtolower() and strtoupper() this method is locale independent.
925 * Real case folding is language dependent, this method ignores this fact.
927 * @param string string
928 * @return string the converted string
929 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
930 * @see strtolower(), strtoupper(), mb_convert_case()
/**
 * Translates all characters of a string into their respective case values.
 * Unlike strtolower() and strtoupper() this method is locale independent.
 * Real case folding is language dependent, this method ignores this fact.
 *
 * @param	string		the character set
 * @param	string		the string to convert
 * @param	string		key of the case folding table to use, e.g. 'toLower' or 'toUpper'
 * @return	string		the converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper(), mb_convert_case()
 */
function conv_case($charset,$string,$case)	{
	$utils = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] : '';

	if ($utils == 'mbstring' && (float)phpversion() >= 4.3)	{
			// Convert in the charset of the string, like the other mbstring branches of this class
		if ($case == 'toLower')	{
			return mb_strtolower($string,$charset);
		} else {
			return mb_strtoupper($string,$charset);
		}
	} elseif ($charset == 'utf-8')	{
		return $this->utf8_conv_case($string,$case);
	} elseif ($charset == 'shift_jis')	{
		return $this->euc_conv_case($string,$case,'shift_jis');
	} elseif (!empty($this->eucBasedSets[$charset]))	{
		return $this->euc_conv_case($string,$case,$charset);
	}

		// treat everything else as single-byte encoding
	if (!$this->initCaseFolding($charset))	return $string;	// do nothing

	$caseConv = isset($this->caseFolding[$charset][$case]) ? $this->caseFolding[$charset][$case] : array();
	$out = '';
	$strLen = strlen($string);
		// Iterate by length: testing the character itself would stop at the first "0"
	for ($i=0;$i<$strLen;$i++)	{
		$c = $string[$i];
		$out.= isset($caseConv[$c]) ? $caseConv[$c] : $c;
	}

	return $out;
}
983 /********************************************
985 * UTF-8 string operation functions
987 ********************************************/
/**
 * Truncates a string in UTF-8 short at a given byte length.
 * A multibyte character that would be cut in half is dropped completely.
 *
 * @param	string		UTF-8 multibyte character string
 * @param	integer		the byte length
 * @return	string		the shortened string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strtrunc($str,$len)	{
	if ($len <= 0)	return '';
	if ($len >= strlen($str))	return $str;	// nothing to cut off

	$i = $len-1;	// last byte that would be kept
	if (ord($str[$i]) & 0x80)	{	// part of a multibyte sequence
		for (; $i>0 && !(ord($str[$i]) & 0x40); $i--)	;	// find the first byte
		if (!(ord($str[$i]) & 0x40))	return '';	// sanity check: no starting byte found
		for ($bc=0, $mbs=ord($str[$i]); $mbs & 0x80; $mbs = $mbs << 1)	$bc++;	// calculate number of bytes
		if ($bc+$i > $len)	return substr($str,0,$i);	// character does not fit, cut before it
			// fallthru: multibyte char fits into length
	}
	return substr($str,0,$len);
}
/**
 * Returns a part of a UTF-8 string.
 * Negative values for $start and $len are currently not supported.
 *
 * @param	string		UTF-8 string
 * @param	int		start position (character position)
 * @param	int		length (in characters); when omitted the rest of the string is returned
 * @return	string		the substring, or false if $start lies outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_substr($str,$start,$len=null)	{
	if ($len === 0)	{
		return '';
	}

		// translate the character offset into a byte offset
	$firstByte = $this->utf8_char2byte_pos($str,$start);
	if ($firstByte === false)	{
		return false;	// $start outside string length
	}

	$tail = substr($str,$firstByte);
	if ($len == null)	{
		return $tail;
	}

		// translate the character count into a byte count relative to the tail
	$byteCount = $this->utf8_char2byte_pos($tail,$len);
	if ($byteCount === false)	{
		return $tail;	// $len outside actual string length
	}
	return substr($tail,0,$byteCount);
}
/**
 * Counts the number of characters of a string in UTF-8.
 *
 * @param	string		UTF-8 multibyte character string
 * @return	int		the number of characters
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strlen($str)	{
	$n = 0;
	$byteLen = strlen($str);
		// iterate by byte length: testing the byte itself would stop at a "0" character
	for ($i=0; $i<$byteLen; $i++)	{
			// count single bytes (0xxxxxxx) and multi-byte starting bytes (11xxxxxx),
			// i.e. everything that is not a continuation byte (10xxxxxx)
		if ((ord($str[$i]) & 0xC0) != 0x80)	$n++;
	}
	return $n;
}
/**
 * Find position of first occurrence of a string, both arguments are in UTF-8.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 string to search for
 * @param	int		position to start the search (character position)
 * @return	int		the character position, or false if the needle was not found
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strpos($haystack,$needle,$offset=0)	{
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
			// third argument is the offset, the encoding comes fourth
		return mb_strpos($haystack,$needle,$offset,'utf-8');
	}

	$byte_offset = $this->utf8_char2byte_pos($haystack,$offset);
	if ($byte_offset === false)	return false;	// offset beyond string length

	$byte_pos = strpos($haystack,$needle,$byte_offset);
	if ($byte_pos === false)	return false;	// needle not found

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
/**
 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
 *
 * @param	string		UTF-8 string to search in
 * @param	char		UTF-8 character to search for
 * @return	int		the character position, or false if the needle was not found
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strrpos($haystack,$needle)	{
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
			// pass an explicit offset; the encoding is the fourth argument
		return mb_strrpos($haystack,$needle,0,'utf-8');
	}

	$byte_pos = strrpos($haystack,$needle);
	if ($byte_pos === false)	return false;	// needle not found

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * @param	string		UTF-8 string
 * @param	int		character position
 * @return	int		byte position, or false if the position lies beyond the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char2byte_pos($str,$pos)	{
	$n = 0;	// number of characters
	$byteLen = strlen($str);
		// iterate by byte length: testing the byte itself would stop at a "0" character
	for ($i=0; $i<$byteLen && $n<$pos; $i++)	{
			// single bytes (0xxxxxxx) and multi-byte starting bytes (11xxxxxx) begin a character
		if ((ord($str[$i]) & 0xC0) != 0x80)	$n++;
	}
	if ($i >= $byteLen)	return false;	// offset beyond string length

		// skip trailing multi-byte data bytes
	while ($i<$byteLen && (ord($str[$i]) & 0xC0) == 0x80)	{ $i++; }

	return $i;
}
/**
 * Translates an 'absolute' byte position into a character position.
 *
 * @param	string		UTF-8 string
 * @param	int		byte position
 * @return	int		character position, or false if the byte position lies outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_byte2char_pos($str,$pos)	{
		// check the requested offset itself; looking at the first byte would
		// wrongly reject strings that start with a "0" character
	if ($pos < 0 || $pos >= strlen($str))	return false;	// offset beyond string length

	$n = 0;	// number of characters
		// walk back from the byte position; byte 0 is not inspected, so the
		// count equals the number of characters in front of $pos
	for ($i=$pos; $i>0; $i--)	{
			// single bytes (0xxxxxxx) and multi-byte starting bytes (11xxxxxx) begin a character
		if ((ord($str[$i]) & 0xC0) != 0x80)	$n++;
	}

	return $n;
}
/**
 * Translates all characters of an UTF-8 string into their respective case values.
 *
 * @param	string		UTF-8 string
 * @param	string		conversion: 'toLower' or 'toUpper'
 * @return	string		the converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper(), mb_convert_case()
 */
function utf8_conv_case($str,$case)	{
	if (!$this->initCaseFoldingUTF8())	return $str;	// do nothing

	$out = '';
	$caseConv =& $this->caseFolding['utf-8'][$case];
	$byteLen = strlen($str);
		// iterate by byte length: testing the byte itself would stop at a "0" character
	for ($i=0; $i<$byteLen; $i++)	{
		$c = ord($str[$i]);
			// default: a single byte; this also lets a stray continuation byte pass through unchanged
		$mbc = $str[$i];
		if (($c & 0xC0) == 0xC0)	{	// multi-byte starting byte (11xxxxxx)
			for ($bc=0; $c & 0x80; $c = $c << 1)	{ $bc++; }	// calculate number of bytes
			$mbc = substr($str,$i,$bc);
			$i += $bc-1;	// continue behind the sequence
		}

		$out .= !empty($caseConv[$mbc]) ? $caseConv[$mbc] : $mbc;	// characters without a mapping are kept
	}

	return $out;
}
1206 /********************************************
1208 * EUC string operation functions
1210 * Extended Unix Code:
1211 * ASCII compatible 7bit single bytes chars
1212 * 8bit two byte chars
1214 * Shift-JIS is treated as a special case.
1216 ********************************************/
/**
 * Cuts a string in the EUC charset family short at a given byte length.
 * A double-byte character that would be cut in half is dropped completely.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		the byte length
 * @param	string		the charset
 * @return	string		the shortened string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strtrunc($str,$len,$charset)	{
	if ($len <= 0)	return '';
	if (strlen($str) <= $len)	return $str;	// string not longer than supplied length

	$sjis = ($charset == 'shift_jis');
		// the string is longer than $len, so every index below $len is valid
	for ($i=0; $i<$len; $i++)	{
		$c = ord($str[$i]);
		if ($sjis)	{
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))	$i++;	// advance a double-byte char
		} else {
			if ($c >= 0x80)	$i++;	// advance a double-byte char
		}
	}

	if ($i > $len)	{
		return substr($str,0,$len-1);	// we ended on a first byte
	}
	return substr($str,0,$len);
}
/**
 * Returns a part of a string in the EUC charset family.
 * Negative values for $start and $len are currently not supported.
 *
 * @param	string		EUC multibyte character string
 * @param	int		start position (character position)
 * @param	string		the charset
 * @param	int		length (in characters); when omitted the rest of the string is returned
 * @return	string		the substring, or false if $start lies outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_substr($str,$start,$charset,$len=null)	{
	if ($len === 0)	{
		return '';
	}

		// translate the character offset into a byte offset
	$firstByte = $this->euc_char2byte_pos($str,$start,$charset);
	if ($firstByte === false)	{
		return false;	// $start outside string length
	}

	$tail = substr($str,$firstByte);
	if ($len == null)	{
		return $tail;
	}

		// translate the character count into a byte count relative to the tail
	$byteCount = $this->euc_char2byte_pos($tail,$len,$charset);
	if ($byteCount === false)	{
		return $tail;	// $len outside actual string length
	}
	return substr($tail,0,$byteCount);
}
/**
 * Counts the number of characters of a string in the EUC charset family.
 *
 * @param	string		EUC multibyte character string
 * @param	string		the charset
 * @return	int		the number of characters
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strlen($str,$charset)	{
	$sjis = ($charset == 'shift_jis');
	$n = 0;
	$byteLen = strlen($str);
		// iterate by byte length: testing the byte itself would stop at a "0" character
	for ($i=0; $i<$byteLen; $i++)	{
		$c = ord($str[$i]);
		if ($sjis)	{
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))	$i++;	// advance a double-byte char
		} else {
			if ($c >= 0x80)	$i++;	// advance a double-byte char
		}

		$n++;
	}

	return $n;
}
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * @param	string		EUC multibyte character string
 * @param	int		character position
 * @param	string		the charset
 * @return	int		byte position, or false if the position lies beyond the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char2byte_pos($str,$pos,$charset)	{
	$sjis = ($charset == 'shift_jis');
	$n = 0;	// number of characters seen
	$byteLen = strlen($str);
		// iterate by byte length: testing the byte itself would stop at a "0" character
	for ($i=0; $i<$byteLen && $n<$pos; $i++)	{
		$c = ord($str[$i]);
		if ($sjis)	{
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))	$i++;	// advance a double-byte char
		} else {
			if ($c >= 0x80)	$i++;	// advance a double-byte char
		}

		$n++;
	}
	if ($i >= $byteLen)	return false;	// offset beyond string length

	return $i;
}
/**
 * Translates all characters of a string in the EUC charset family into their respective case values.
 *
 * @param	string		EUC multibyte character string
 * @param	string		conversion: 'toLower' or 'toUpper'
 * @param	string		the charset
 * @return	string		the converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper(), mb_convert_case()
 */
function euc_conv_case($str,$case,$charset)	{
	if (!$this->initCaseFolding($charset))	return $str;	// do nothing

	$sjis = ($charset == 'shift_jis');
	$out = '';
	$caseConv =& $this->caseFolding[$charset][$case];
	$byteLen = strlen($str);
		// iterate by byte length: testing the byte itself would stop at a "0" character
	for ($i=0; $i<$byteLen; $i++)	{
		$mbc = $str[$i];
		$c = ord($mbc);

		if ($sjis)	{
			$isDoubleByte = (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0));
		} else {
			$isDoubleByte = ($c >= 0x80);
		}
		if ($isDoubleByte)	{	// a double-byte char
			$mbc = substr($str,$i,2);
			$i++;
		}

		$out .= !empty($caseConv[$mbc]) ? $caseConv[$mbc] : $mbc;	// characters without a mapping are kept
	}

	return $out;
}
1380 if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE
]['XCLASS']['t3lib/class.t3lib_cs.php']) {
1381 include_once($TYPO3_CONF_VARS[TYPO3_MODE
]['XCLASS']['t3lib/class.t3lib_cs.php']);