UTF-8 string support:
[Packages/TYPO3.CMS.git] / t3lib / class.t3lib_cs.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2003-2004 Kasper Skaarhoj (kasper@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the Typo3 project. The Typo3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 *
17 * This script is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * This copyright notice MUST APPEAR in all copies of the script!
23 ***************************************************************/
24 /**
25 * Class for conversion between charsets.
26 *
27 * $Id$
28 *
29 * @author Kasper Skaarhoj <kasper@typo3.com>
30 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
31 */
32 /**
33 * [CLASS/FUNCTION INDEX of SCRIPT]
34 *
35 *
36 *
37 * 104: class t3lib_cs
38 * 233: function parse_charset($charset)
39 * 250: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
40 * 284: function utf8_encode($str,$charset)
41 * 325: function utf8_decode($str,$charset,$useEntityForNoChar=0)
42 * 373: function utf8_to_entities($str)
43 * 406: function entities_to_utf8($str,$alsoStdHtmlEnt=0)
44 * 437: function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
45 * 478: function initCharset($charset)
46 * 549: function UnumberToChar($cbyte)
47 * 593: function utf8CharToUnumber($str,$hex=0)
48 * 622: function utf8_strtrunc($str,$len)
49 * 662: function utf_strlen($str)
50 * 675: function utf_substr($str,$start,$len=0)
51 * 689: function utf_strpos($haystack,$needle,$offset=0)
52 * 702: function utf_strrpos($haystack,$needle,$offset=0)
53 *
54 * TOTAL FUNCTIONS: 15
55 * (This index is automatically created/updated by the extension "extdeveval")
56 *
57 */
58
59
60
61
62
63
64
65
66 /**
67 * Notes on UTF-8
68 *
69 * Functions working on UTF-8 strings:
70 *
71 * - strchr/strstr
72 * - strrchr
73 * - substr_count
74 * - implode/explode/join
75 *
76 * Functions nearly working on UTF-8 strings:
77 *
78 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
79 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained 7-bit ASCII
80 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
81 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
82 *
83 * Functions NOT working on UTF-8 strings:
84 *
85 * - str*cmp
86 * - stristr
87 * - stripos
88 * - substr
89 * - strrev
90 * - ereg/eregi
91 * - split/spliti
92 * - preg_*
93 * - ...
94 *
95 */
96 /**
97 * Class for conversion between charsets.
98 *
99 * @author Kasper Skaarhoj <kasper@typo3.com>
100 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
101 * @package TYPO3
102 * @subpackage t3lib
103 */
104 class t3lib_cs {
105 var $noCharByteVal=127; // ASCII Value for chars with no equalent.
106
107 // This is the array where parsed conversion tables are stored (cached)
108 var $parsedCharsets=array();
109
110 // This tells the converter which charsets has two bytes per char:
111 var $twoByteSets=array(
112 'ucs-2'=>1, // 2-byte Unicode
113 'utf-16'=>1 // 2-byte Unicode with surrogates
114 );
115
116 // This tells the converter which charset use the Extended Unix Code scheme:
117 var $eucBasedSets=array(
118 'gb2312'=>1, // Chinese, simplified.
119 );
120
121 // see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
122 // http://czyborra.com/charsets/iso8859.html
123 var $synonyms=array(
124 'us' => 'ascii',
125 'us-ascii'=> 'ascii',
126 'cp819' => 'iso-8859-1',
127 'ibm819' => 'iso-8859-1',
128 'iso-ir-100' => 'iso-8859-1',
129 'iso-ir-109' => 'iso-8859-2',
130 'iso-ir-148' => 'iso-8859-9',
131 'iso-ir-199' => 'iso-8859-14',
132 'iso-ir-203' => 'iso-8859-15',
133 'csisolatin1' => 'iso-8859-1',
134 'csisolatin2' => 'iso-8859-2',
135 'csisolatin3' => 'iso-8859-3',
136 'csisolatin5' => 'iso-8859-9',
137 'csisolatin8' => 'iso-8859-14',
138 'csisolatin9' => 'iso-8859-15',
139 'csisolatingreek' => 'iso-8859-7',
140 'iso-celtic' => 'iso-8859-14',
141 'latin1' => 'iso-8859-1',
142 'latin2' => 'iso-8859-2',
143 'latin3' => 'iso-8859-3',
144 'latin5' => 'iso-8859-9',
145 'latin6' => 'iso-8859-10',
146 'latin8' => 'iso-8859-14',
147 'latin9' => 'iso-8859-15',
148 'l1' => 'iso-8859-1',
149 'l2' => 'iso-8859-2',
150 'l3' => 'iso-8859-3',
151 'l5' => 'iso-8859-9',
152 'l6' => 'iso-8859-10',
153 'l8' => 'iso-8859-14',
154 'l9' => 'iso-8859-15',
155 'cyrillic' => 'iso-8859-5',
156 'arabic' => 'iso-8859-6',
157 'win874' => 'windows-874',
158 'win1250' => 'windows-1250',
159 'win1251' => 'windows-1251',
160 'win1252' => 'windows-1252',
161 'win1253' => 'windows-1253',
162 'win1254' => 'windows-1254',
163 'win1255' => 'windows-1255',
164 'win1256' => 'windows-1256',
165 'win1257' => 'windows-1257',
166 'win1258' => 'windows-1258',
167 'cp1250' => 'windows-1250',
168 'cp1252' => 'windows-1252',
169 'ms-ee' => 'windows-1250',
170 'ms-ansi' => 'windows-1252',
171 'ms-greek' => 'windows-1253',
172 'ms-turk' => 'windows-1254',
173 'winbaltrim' => 'windows-1257',
174 'mac' => 'macRoman',
175 'macintosh' => 'macRoman',
176 'euc-cn' => 'gb2312',
177 'x-euc-cn' => 'gb2312',
178 'utf8' => 'utf-8',
179 'utf-2' => 'utf-8',
180 'utf2' => 'utf-8',
181 );
182 /*
183 JIS X 0208 (euc-jp)
184 CNS 11643 (EUC-TW)
185 KS C 5601 (EUC-KR)
186 */
187
188 // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
189 // Empty values means "iso-8859-1"
190 var $charSetArray = array(
191 'dk' => '',
192 'de' => '',
193 'no' => '',
194 'it' => '',
195 'fr' => '',
196 'es' => '',
197 'nl' => '',
198 'cz' => 'windows-1250',
199 'pl' => 'iso-8859-2',
200 'si' => 'windows-1250',
201 'fi' => '',
202 'tr' => 'iso-8859-9',
203 'se' => '',
204 'pt' => '',
205 'ru' => 'windows-1251',
206 'ro' => 'iso-8859-2',
207 'ch' => 'gb2312',
208 'sk' => 'windows-1250',
209 'lt' => 'windows-1257',
210 'is' => 'utf-8',
211 'hr' => 'windows-1250',
212 'hu' => 'iso-8859-2',
213 'gl' => '',
214 'th' => 'iso-8859-11',
215 'gr' => 'iso-8859-7',
216 'hk' => 'big5',
217 'eu' => '',
218 'bg' => 'windows-1251',
219 'br' => '',
220 'et' => 'iso-8859-4',
221 'ar' => 'iso-8859-6',
222 'he' => 'utf-8',
223 'ua' => 'windows-1251',
224 );
225
/**
 * Normalizes a charset name: the name is lowercased and, if the lowercased
 * name is a key of $this->synonyms, replaced by the value registered there.
 *
 * @param	string		Input charset
 * @return	string		Normalized charset
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function parse_charset($charset)	{
	$normalized = strtolower($charset);

		// Resolve aliases (e.g. "latin1") to the name used internally:
	return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
}
239
240
/**
 * Convert from one charset to another charset.
 *
 * If $TYPO3_CONF_VARS['SYS']['t3lib_cs_convMethod'] names one of "iconv", "recode" or
 * "mbstring" and entity fallback is not requested, that extension is tried first.
 * When it is unavailable or reports failure, the built-in table based conversion
 * (via UTF-8 as intermediate charset) is used.
 *
 * @param	string		Input string
 * @param	string		From charset (the current charset of the string)
 * @param	string		To charset (the output charset wanted)
 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return	string		Converted string
 */
function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)	{
	global $TYPO3_CONF_VARS;

	if ($fromCS==$toCS)	return $str;

	if (!$useEntityForNoChar)	{	// iconv, recode and mbstring don't support fallback to SGML entities
		$method = isset($TYPO3_CONF_VARS['SYS']['t3lib_cs_convMethod']) ? $TYPO3_CONF_VARS['SYS']['t3lib_cs_convMethod'] : '';
		$conv_str = false;

		if ($method=='iconv' && function_exists('iconv'))	{
				// Signature is iconv(from, to, string)
			$conv_str = iconv($fromCS,$toCS.'//TRANSLIT',$str);
		} elseif ($method=='recode' && function_exists('recode_string'))	{
				// A recode request has the form "from..to"
			$conv_str = recode_string($fromCS.'..'.$toCS,$str);
		} elseif ($method=='mbstring' && function_exists('mb_convert_encoding'))	{
			$conv_str = mb_convert_encoding($str,$toCS,$fromCS);
		}

		if (false !== $conv_str)	return $conv_str;
			// ... otherwise fall back to the TYPO3 conversion below
	}

	if ($fromCS!='utf-8')	$str = $this->utf8_encode($str,$fromCS);
	if ($toCS!='utf-8')	$str = $this->utf8_decode($str,$toCS,$useEntityForNoChar);
	return $str;
}
275
276
/**
 * Converts $str from $charset to UTF-8
 *
 * Bytes 0-127 of single byte and EUC charsets are passed through unchanged. All other
 * values are looked up in the parsed conversion table of the charset; values without
 * an entry are replaced by the character chr($this->noCharByteVal).
 * If the charset cannot be initialized, nothing (NULL) is returned.
 *
 * @param	string		String in local charset to convert to UTF-8
 * @param	string		Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return	string		Output string, converted to UTF-8
 */
function utf8_encode($str,$charset)	{

	if ($this->initCharset($charset))	{	// Parse conv. table if not already...
		$strLen = strlen($str);
		$outStr = '';
		$noChar = chr($this->noCharByteVal);
		$isTwoByteSet = !empty($this->twoByteSets[$charset]);
		$isEucSet = !empty($this->eucBasedSets[$charset]);

		for ($a=0; $a<$strLen; $a++)	{	// Traverse each byte in string.
			$chr = substr($str,$a,1);
			$ord = ord($chr);

			if ($isTwoByteSet)	{	// If the charset has two bytes per char
				if ($a+1 >= $strLen)	{	// Dangling single byte at the end: not a complete character
					$outStr.= $noChar;
					break;
				}
				$a++;	// The second byte belongs to this character, so it is consumed here
				$ord = ($ord << 8) | ord(substr($str,$a,1));	// assume big endian

				if ($ord < 128)	{	// The conversion tables only hold values above 127; lower values are plain ASCII
					$outStr.= chr($ord);
				} elseif (isset($this->parsedCharsets[$charset]['local'][$ord]))	{
					$outStr.= $this->parsedCharsets[$charset]['local'][$ord];
				} else $outStr.= $noChar;	// No char exists
			} elseif ($ord>127)	{	// Values over 127 are charset specific and must be looked up
				if ($isEucSet)	{	// EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
					$a++;
					$ord2 = ord(substr($str,$a,1));
					$ord = $ord*256+$ord2;
				}

				if (isset($this->parsedCharsets[$charset]['local'][$ord]))	{
					$outStr.= $this->parsedCharsets[$charset]['local'][$ord];
				} else $outStr.= $noChar;	// No char exists
			} else $outStr.= $chr;	// ... otherwise it's just ASCII 0-127 and one byte. Transparent
		}
		return $outStr;
	}
}
316
/**
 * Converts $str from UTF-8 to $charset
 *
 * ASCII bytes are copied unchanged. Each multibyte sequence is looked up in the parsed
 * conversion table of the charset. Sequences without an entry become a numeric entity
 * (if $useEntityForNoChar is set) or the character chr($this->noCharByteVal).
 * If the charset cannot be initialized, nothing (NULL) is returned.
 *
 * @param	string		String in UTF-8 to convert to local charset
 * @param	string		Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return	string		Output string, converted to local charset
 */
function utf8_decode($str,$charset,$useEntityForNoChar=0)	{

		// Parse conv. table if not already; without a table there is nothing to return.
	if (!$this->initCharset($charset))	return;

	$length = strlen($str);
	$result = '';
	$pos = 0;

	while ($pos < $length)	{
		$byte = substr($str,$pos,1);
		$code = ord($byte);

		if ($code <= 127)	{
				// Plain ASCII, one byte. Transparent.
			$result.= $byte;
		} elseif (!($code & 64))	{
				// A byte of the form 10xxxxxx here means we are in the MIDDLE of a sequence.
			$result.= chr($this->noCharByteVal);
		} else {
				// Lead byte: every further set high bit announces one more byte of the sequence.
			$sequence = $byte;
			for ($n=0; $n<8; $n++)	{
				$code = $code << 1;
				if (!($code & 128))	break;
				$pos++;
				$sequence.= substr($str,$pos,1);
			}

			if (isset($this->parsedCharsets[$charset]['utf8'][$sequence]))	{
				$local = $this->parsedCharsets[$charset]['utf8'][$sequence];	// The local number
					// Numbers above 255 are written as two bytes (16 bit word assumed).
				$result.= $local>255 ? chr(($local >> 8) & 255).chr($local & 255) : chr($local);
			} elseif ($useEntityForNoChar)	{
				$result.= '&#'.$this->utf8CharToUnumber($sequence,1).';';
			} else {
				$result.= chr($this->noCharByteVal);	// No char exists
			}
		}
		$pos++;
	}

	return $result;
}
366
/**
 * Converts all chars > 127 to numeric entities.
 *
 * Every UTF-8 multibyte sequence is replaced by "&#" + the value returned by
 * utf8CharToUnumber() in hex mode + ";". A continuation byte found where a lead byte
 * is expected is replaced by chr($this->noCharByteVal).
 *
 * @param	string		Input string
 * @return	string		Output string
 */
function utf8_to_entities($str)	{
	$length = strlen($str);
	$result = '';
	$pos = 0;

	while ($pos < $length)	{
		$byte = substr($str,$pos,1);
		$code = ord($byte);

		if ($code <= 127)	{
				// Plain ASCII, one byte. Transparent.
			$result.= $byte;
		} elseif (!($code & 64))	{
				// 10xxxxxx: MIDDLE of a multibyte sequence, no char exists
			$result.= chr($this->noCharByteVal);
		} else {
				// Collect the sequence: each further set high bit of the lead byte means one more byte.
			$sequence = $byte;
			for ($n=0; $n<8; $n++)	{
				$code = $code << 1;
				if (!($code & 128))	break;
				$pos++;
				$sequence.= substr($str,$pos,1);
			}
			$result.= '&#'.$this->utf8CharToUnumber($sequence,1).';';
		}
		$pos++;
	}

	return $result;
}
398
/**
 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
 *
 * Entities which are neither well-formed numeric entities nor (with $alsoStdHtmlEnt set)
 * known named HTML entities are left untouched in the output.
 *
 * @param	string		Input string, UTF-8
 * @param	boolean		If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
 * @return	string		Output string
 */
function entities_to_utf8($str,$alsoStdHtmlEnt=0)	{
	$trans_tbl = array();
	if ($alsoStdHtmlEnt)	{
			// The replacement characters are converted from iso-8859-1 below, so the table must be
			// requested in that charset where PHP would otherwise default to UTF-8 (PHP 5.4+).
		if (version_compare(PHP_VERSION,'5.4.0','>='))	{
			$table = get_html_translation_table(HTML_ENTITIES,ENT_COMPAT,'ISO-8859-1');
		} else {
			$table = get_html_translation_table(HTML_ENTITIES);
		}
		$trans_tbl = array_flip($table);
	}

		// Split on entities; with DELIM_CAPTURE every odd index holds the entity name without "&" and ";"
	$parts = preg_split('/&([#[:alnum:]]*);/',$str,-1,PREG_SPLIT_DELIM_CAPTURE);
	foreach($parts as $k => $v)	{
		if ($k%2)	{
			$reg = array();
			if (preg_match('/^#[xX]([0-9a-fA-F]+)$/',$v,$reg))	{	// Hex entity:
				$parts[$k] = $this->UnumberToChar(hexdec($reg[1]));
			} elseif (preg_match('/^#([0-9]+)$/',$v,$reg))	{	// Dec entity:
				$parts[$k] = $this->UnumberToChar(intval($reg[1]));
			} elseif ($alsoStdHtmlEnt && isset($trans_tbl['&'.$v.';']))	{	// Other entities:
				$parts[$k] = $this->utf8_encode($trans_tbl['&'.$v.';'],'iso-8859-1');
			} else {	// No conversion:
				$parts[$k] = '&'.$v.';';
			}
		}
	}

	return implode('',$parts);
}
428
/**
 * Converts all chars in the input UTF-8 string into integer numbers returned in an array
 *
 * A continuation byte found where a lead byte is expected yields $this->noCharByteVal
 * (or the corresponding character when $retChar is set).
 *
 * @param	string		Input string, UTF-8
 * @param	boolean		If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
 * @param	boolean		If set, then instead of integer numbers the real UTF-8 char is returned.
 * @return	array		Output array with the char numbers
 */
function utf8_to_numberarray($str,$convEntities=0,$retChar=0)	{
		// If entities must be registered as well, resolve them to UTF-8 first:
	if ($convEntities)	{
		$str = $this->entities_to_utf8($str,1);
	}

	$length = strlen($str);
	$numbers = array();
	$pos = 0;

	while ($pos < $length)	{
		$byte = substr($str,$pos,1);
		$code = ord($byte);

		if ($code <= 127)	{
				// Plain ASCII, one byte.
			if ($retChar)	{
				$numbers[] = chr($code);
			} else {
				$numbers[] = $code;
			}
		} elseif (!($code & 64))	{
				// 10xxxxxx: MIDDLE of a multibyte sequence, no char exists
			if ($retChar)	{
				$numbers[] = chr($this->noCharByteVal);
			} else {
				$numbers[] = $this->noCharByteVal;
			}
		} else {
				// Collect the sequence: each further set high bit of the lead byte means one more byte.
			$sequence = $byte;
			for ($n=0; $n<8; $n++)	{
				$code = $code << 1;
				if (!($code & 128))	break;
				$pos++;
				$sequence.= substr($str,$pos,1);
			}

			if ($retChar)	{
				$numbers[] = $sequence;
			} else {
				$numbers[] = $this->utf8CharToUnumber($sequence);
			}
		}
		$pos++;
	}

	return $numbers;
}
467
/**
 * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
 * This function is automatically called by the conversion functions
 *
 * The parsed table is stored in $this->parsedCharsets[$charset] with two sub-arrays:
 * 'local' (local char number => UTF-8 sequence) and 'utf8' (UTF-8 sequence => local char number).
 * Only local char numbers above 127 are registered.
 * A serialized copy is written to typo3temp/ and reused on later calls. A cache file
 * which does not unserialize to the expected structure is ignored and rebuilt.
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param	string		The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return	integer		Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed.
 * @access private
 */
function initCharset($charset)	{
		// Only process if the charset is not yet loaded:
	if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset]))	return 1;

		// Conversion table filename:
	$charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';

		// If the conversion table is not found:
	if (!$charset || !t3lib_div::validPathStr($charsetConvTableFile) || !@is_file($charsetConvTableFile))	return false;

		// Cache file for charsets:
		// Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/charset_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile))	{
		$cached = unserialize(t3lib_div::getUrl($cacheFile));
		if (is_array($cached) && isset($cached['local']) && is_array($cached['local']) && isset($cached['utf8']) && is_array($cached['utf8']))	{
			$this->parsedCharsets[$charset] = $cached;
			return 2;
		}
			// Unusable cache content: fall through and parse the table again.
	}

		// Parse conversion table into lines:
	$lines = t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1);
		// Initialize the internal variable holding the conv. table:
	$this->parsedCharsets[$charset] = array('local'=>array(),'utf8'=>array());

		// The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
	$whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
	$detectedType = '';

	foreach($lines as $value)	{
		if (!trim($value) || substr($value,0,1)=='#')	continue;	// Comment line or blanks are ignored.

			// Detect type if not done yet: (Done on first real line)
		if (!$detectedType)	$detectedType = preg_match($whitespacedPattern,$value) ? 'whitespaced' : 'ms-token';

		if ($detectedType=='ms-token')	{
			$tokens = preg_split('/[=:]/',$value,3);
			if (count($tokens) < 2)	continue;	// No mapping on this line
			$hexbyte = $tokens[0];
			$utf8 = $tokens[1];
		} else {
			$regA = array();
			if (!preg_match($whitespacedPattern,$value,$regA))	continue;	// No mapping on this line (e.g. undefined position)
			$hexbyte = $regA[1];
			$utf8 = 'U+'.$regA[2];
		}

		$decval = hexdec(trim($hexbyte));
		if ($decval>127)	{
			$utf8decval = hexdec(substr(trim($utf8),2));	// Strip the "U+" prefix
			$utf8Char = $this->UnumberToChar($utf8decval);
			$this->parsedCharsets[$charset]['local'][$decval] = $utf8Char;
			$this->parsedCharsets[$charset]['utf8'][$utf8Char] = $decval;
		}
	}

	if ($cacheFile)	{
		t3lib_div::writeFile($cacheFile,serialize($this->parsedCharsets[$charset]));
	}
	return 2;
}
530
/**
 * Converts a UNICODE number to a UTF-8 multibyte character
 * Algorithm based on script found at From: http://czyborra.com/utf/
 *
 * The bits of the number are spread over the bytes of the sequence; the number of
 * high bits set in the lead byte announces the number of bytes in the sequence:
 *
 *  bytes | bits | representation
 *      1 |    7 | 0vvvvvvv
 *      2 |   11 | 110vvvvv 10vvvvvv
 *      3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
 *      4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * Numbers which need more than 31 bits yield chr($this->noCharByteVal).
 *
 * @param	integer		UNICODE integer
 * @return	string		UTF-8 multibyte character string
 * @see utf8CharToUnumber()
 */
function UnumberToChar($cbyte)	{
		// One byte: the number is the character itself.
	if ($cbyte < 0x80)	return chr($cbyte);

		// Find the number of continuation bytes and the marker bits of the lead byte:
	if ($cbyte < 0x800)	{
		$trailing = 1;
		$leadMark = 0xC0;
	} elseif ($cbyte < 0x10000)	{
		$trailing = 2;
		$leadMark = 0xE0;
	} elseif ($cbyte < 0x200000)	{
		$trailing = 3;
		$leadMark = 0xF0;
	} elseif ($cbyte < 0x4000000)	{
		$trailing = 4;
		$leadMark = 0xF8;
	} elseif ($cbyte < 0x80000000)	{
		$trailing = 5;
		$leadMark = 0xFC;
	} else {	// Cannot express a 32-bit character in UTF-8
		return chr($this->noCharByteVal);
	}

		// Lead byte takes the topmost bits, every continuation byte the next 6 bits:
	$sequence = chr($leadMark | ($cbyte >> (6*$trailing)));
	for ($shift=6*($trailing-1); $shift>=0; $shift-=6)	{
		$sequence.= chr(0x80 | (($cbyte >> $shift) & 0x3F));
	}

	return $sequence;
}
584
/**
 * Converts a UTF-8 Multibyte character to a UNICODE number
 *
 * Only the first character of $str is evaluated. If the first byte is not the lead
 * byte of a multibyte sequence, its byte value is returned.
 *
 * @param	string		UTF-8 multibyte character string
 * @param	boolean		If set, then a hex. number is returned (as string prefixed with "x").
 * @return	integer		UNICODE integer
 * @see UnumberToChar()
 */
function utf8CharToUnumber($str,$hex=0)	{
	$lead = ord(substr($str,0,1));	// First char

	if (($lead & 192) != 192)	{
			// Not the start of a multi byte sequence: the byte value is the number.
		$number = $lead;
	} else {
			// Gather the 6 value bits of each continuation byte as a string of binary digits.
		$shifted = $lead;
		$trailingBits = '';
		$count = 0;
		while ($count < 8)	{
			$shifted = $shifted << 1;
			if (!($shifted & 128))	break;	// No more bytes announced by the lead byte
			$trailingBits.= substr('00000000'.decbin(ord(substr($str,$count+1,1))),-6);
			$count++;
		}
			// The lead byte contributes its lowest (6 - number of continuation bytes) bits.
		$leadBits = substr('00000000'.decbin($lead),-(6-$count));

		$number = bindec($leadBits.$trailingBits);
	}

	return $hex ? 'x'.dechex($number) : $number;
}
611
612 /********************************************
613 *
614 * UTF-8 String operation functions
615 *
616 ********************************************/
617
/**
 * Truncates a string in UTF-8 short at a given byte length.
 *
 * The result is never longer than $len bytes and never ends in the middle of a
 * multibyte character: if the cut would split a character, that character is dropped.
 * With $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'] set to "mbstring" the work is delegated to mb_strcut().
 *
 * @param	string		UTF-8 multibyte character string
 * @param	integer		the byte length
 * @return	string		the shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strtrunc($str,$len)	{
	if ($len <= 0)	return '';

	if (isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) && $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
		return mb_strcut($str,0,$len,'utf-8');
	}

		// Nothing to cut off:
	if ($len >= strlen($str))	return $str;

		// $i is the index of the first byte to be dropped. As long as that byte is a
		// continuation byte (10xxxxxx), the cut is inside a character: move it back to the lead byte.
	$i = $len;
	while ($i > 0 && (ord($str[$i]) & 0xC0) == 0x80)	{
		$i--;
	}

	return substr($str,0,$i);
}
644
/**
 * Returns a part of a UTF-8 string.
 *
 * With $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'] set to "mbstring" the work is delegated
 * to mb_substr(). Otherwise character positions are translated to byte positions with
 * utf8_char2byte_pos() and the string is cut with substr().
 *
 * @param	string		$str	UTF-8 string
 * @param	int		$start	start position (character position)
 * @param	int		$len	length (in characters); if omitted, the rest of the string is returned
 * @return	string		the substring; FALSE if $start is outside the string (fallback implementation)
 * @see substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 *
 * @bug
 * Negative values for @arg $start and @arg $len are currently not supported.
 */
function utf8_substr($str,$start,$len=null)	{
	if ($len===0)	return '';

	if (isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) && $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
			// $len cannot be omitted when the charset is specified, so "rest of the string"
			// is expressed as the full character length.
		if ($len==null)	$len = mb_strlen($str,'utf-8');

		return mb_substr($str,$start,$len,'utf-8');
	}

	$byte_start = $this->utf8_char2byte_pos($str,$start);
	if ($byte_start === false)	return false;	// $start outside string length

	$str = substr($str,$byte_start);

	if ($len==null)	return $str;

		// The byte position of character number $len is where the wanted part ends.
	$byte_end = $this->utf8_char2byte_pos($str,$len);
	if ($byte_end === false)	{	// $len outside actual string length
		return $str;
	}

	return substr($str,0,$byte_end);
}
688
/**
 * Counts the number of characters of a string in UTF-8.
 *
 * Every byte which is not a continuation byte (10xxxxxx) starts a character and is counted.
 * With $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'] set to "mbstring" the work is delegated to mb_strlen().
 *
 * @param	string		UTF-8 multibyte character string
 * @return	int		the number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strlen($str)	{
	if (isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) && $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
		return mb_strlen($str,'utf-8');
	}

	$n = 0;
	$byteLen = strlen($str);	// Loop on the byte length: testing the byte itself would stop at a "0" character
	for ($i=0; $i<$byteLen; $i++)	{
		if ((ord($str[$i]) & 0xC0) != 0x80)	{	// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
			$n++;
		}
	}
	return $n;
}
712
/**
 * Find position of first occurrence of a string, both arguments are in UTF-8.
 *
 * With $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'] set to "mbstring" the work is delegated
 * to mb_strpos(). Otherwise the search is done on bytes with strpos() and the positions
 * are translated with utf8_char2byte_pos() / utf8_byte2char_pos().
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 string to search for
 * @param	int		position (in characters) to start the search
 * @return	int		the character position; FALSE if the needle was not found or the offset is beyond the string
 * @see strpos()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strpos($haystack,$needle,$offset=0)	{
	if (isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) && $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
			// Signature is mb_strpos(haystack, needle, offset, encoding)
		return mb_strpos($haystack,$needle,$offset,'utf-8');
	}

	$byte_offset = $this->utf8_char2byte_pos($haystack,$offset);
	if ($byte_offset === false)	return false;	// offset beyond string length

	$byte_pos = strpos($haystack,$needle,$byte_offset);
	if ($byte_pos === false)	return false;	// needle not found

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
736
/**
 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
 *
 * With $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'] set to "mbstring" the work is delegated
 * to mb_strrpos(). Otherwise the search is done on bytes with strrpos() and the position
 * is translated with utf8_byte2char_pos().
 *
 * @param	string		UTF-8 string to search in
 * @param	char		UTF-8 character to search for
 * @return	int		the character position; FALSE if the needle was not found
 * @see strrpos()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strrpos($haystack,$needle)	{
	if (isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) && $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
			// Since PHP 5.2 the third argument of mb_strrpos() is the offset, the encoding comes fourth.
		if (version_compare(PHP_VERSION,'5.2.0','>='))	{
			return mb_strrpos($haystack,$needle,0,'utf-8');
		}
		return mb_strrpos($haystack,$needle,'utf-8');
	}

	$byte_pos = strrpos($haystack,$needle);
	if ($byte_pos === false)	return false;	// needle not found

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
756
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * Returns the index of the first byte of character number $pos (counted from 0).
 * Positions at or behind the end of the string yield FALSE.
 *
 * @param	string		UTF-8 string
 * @param	int		character position
 * @return	int		byte position; FALSE if the position is beyond the string length
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char2byte_pos($str,$pos)	{
	$byteLen = strlen($str);
	$n = 0;	// number of characters passed

		// Walk over the bytes until $pos characters have been started.
		// Bounded by the byte length: testing the byte itself would stop at a "0" character.
	for ($i=0; $i<$byteLen && $n<$pos; $i++)	{
		if ((ord($str[$i]) & 0xC0) != 0x80)	{	// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
			$n++;
		}
	}

		// skip trailing multi-byte data bytes (10xxxxxx) of the last character passed
	while ($i<$byteLen && (ord($str[$i]) & 0xC0) == 0x80)	{
		$i++;
	}

	if ($i >= $byteLen)	return false;	// offset beyond string length

	return $i;
}
781
/**
 * Translates an 'absolute' byte position into a character position.
 *
 * Counts the characters which start in the bytes 1 to $pos. For a byte position which is
 * the start of a character this is the number of characters in front of it.
 *
 * @param	string		UTF-8 string
 * @param	int		byte position
 * @return	int		character position; FALSE if the byte position is outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_byte2char_pos($str,$pos)	{
		// Validate against the byte length: testing the byte itself would reject a "0" character
	if ($pos < 0 || $pos >= strlen($str))	return false;	// offset beyond string length

	$n = 0;	// number of characters
	for ($i=$pos; $i>0; $i--)	{
		if ((ord($str[$i]) & 0xC0) != 0x80)	{	// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
			$n++;
		}
	}

	return $n;
}
803
804 }
805
806 if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']) {
807 include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
808 }
809 ?>