Reverted to old way of dealing with utf8-decoding
[Packages/TYPO3.CMS.git] / t3lib / class.t3lib_cs.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2003 Kasper Skårhøj (kasper@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the Typo3 project. The Typo3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 *
17 * This script is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * This copyright notice MUST APPEAR in all copies of the script!
23 ***************************************************************/
24 /**
25 * Class for conversion between charsets.
26 *
27 * @author Kasper Skårhøj <kasper@typo3.com>
28 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
29 */
30 /**
31 * [CLASS/FUNCTION INDEX of SCRIPT]
32 *
33 *
34 *
35 * 102: class t3lib_cs
36 * 194: function parse_charset($charset)
37 * 211: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
38 * 245: function utf8_encode($str,$charset)
39 * 286: function utf8_decode($str,$charset,$useEntityForNoChar=0)
40 * 341: function utf8_to_entities($str)
41 * 374: function entities_to_utf8($str,$alsoStdHtmlEnt=0)
42 * 405: function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
43 * 446: function initCharset($charset)
44 * 517: function UnumberToChar($cbyte)
45 * 561: function utf8CharToUnumber($str,$hex=0)
46 * 590: function utf8_strtrunc($str,$len)
47 * 612: function utf_strlen($str)
48 * 625: function utf_substr($str,$start,$len=0)
49 * 639: function utf_strpos($haystack,$needle,$offset=0)
50 * 652: function utf_strrpos($haystack,$needle,$offset=0)
51 *
52 * TOTAL FUNCTIONS: 15
53 * (This index is automatically created/updated by the extension "extdeveval")
54 *
55 */
56
57
58
59
60
61
62
63
64 /**
65 * Notes on UTF-8
66 *
67 * Functions working on UTF-8 strings:
68 *
69 * - strchr/strstr
70 * - strrchr
71 * - substr_count
72 * - implode/explode/join
73 *
74 * Functions nearly working on UTF-8 strings:
75 *
76 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf_strlen
77 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
78 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf_strpos/utf_strrpos
79 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
80 *
81 * Functions NOT working on UTF-8 strings:
82 *
83 * - str*cmp
84 * - stristr
85 * - stripos
86 * - substr
87 * - strrev
88 * - ereg/eregi
89 * - split/spliti
90 * - preg_*
91 * - ...
92 *
93 */
94 /**
95 * Class for conversion between charsets.
96 *
97 * @author Kasper Skårhøj <kasper@typo3.com>
98 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
99 * @package TYPO3
100 * @subpackage t3lib
101 */
102 class t3lib_cs {
// Byte value (ASCII) emitted in place of a character that has no equivalent in the target charset.
var $noCharByteVal=127;		// ASCII Value for chars with no equivalent.

// This is the array where parsed conversion tables are stored (cached).
// initCharset() fills it per charset name with the sub-arrays 'local' and 'utf8'.
var $parsedCharsets=array();

// This tells the converter which charsets have two bytes per char (consulted by utf8_encode()):
var $twoByteSets=array(
	'ucs-2'=>1,	// 2-byte Unicode
	'utf-16'=>1	// 2-byte Unicode with surrogates
);

// This tells the converter which charsets use the Extended Unix Code scheme (consulted by utf8_encode()):
var $eucBasedSets=array(
	'gb2312'=>1,	// Chinese, simplified.
);

// Map of charset aliases (keys, lowercase) to the charset names used by this class (values).
// see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
// http://czyborra.com/charsets/iso8859.html
var $synonyms=array(
	'us' => 'ascii',
	'us-ascii'=> 'ascii',
	'cp819' => 'iso-8859-1',
	'ibm819' => 'iso-8859-1',
	'iso-ir-100' => 'iso-8859-1',
	'iso-ir-109' => 'iso-8859-2',
	'iso-ir-148' => 'iso-8859-9',
	'iso-ir-199' => 'iso-8859-14',
	'iso-ir-203' => 'iso-8859-15',
	'csisolatin1' => 'iso-8859-1',
	'csisolatin2' => 'iso-8859-2',
	'csisolatin3' => 'iso-8859-3',
	'csisolatin5' => 'iso-8859-9',
	'csisolatin8' => 'iso-8859-14',
	'csisolatin9' => 'iso-8859-15',
	'csisolatingreek' => 'iso-8859-7',
	'iso-celtic' => 'iso-8859-14',
	'latin1' => 'iso-8859-1',
	'latin2' => 'iso-8859-2',
	'latin3' => 'iso-8859-3',
	'latin5' => 'iso-8859-9',
	'latin6' => 'iso-8859-10',
	'latin8' => 'iso-8859-14',
	'latin9' => 'iso-8859-15',
	'l1' => 'iso-8859-1',
	'l2' => 'iso-8859-2',
	'l3' => 'iso-8859-3',
	'l5' => 'iso-8859-9',
	'l6' => 'iso-8859-10',
	'l8' => 'iso-8859-14',
	'l9' => 'iso-8859-15',
	'cyrillic' => 'iso-8859-5',
	'arabic' => 'iso-8859-6',
	'win874' => 'windows-874',
	'win1250' => 'windows-1250',
	'win1251' => 'windows-1251',
	'win1252' => 'windows-1252',
	'win1253' => 'windows-1253',
	'win1254' => 'windows-1254',
	'win1255' => 'windows-1255',
	'win1256' => 'windows-1256',
	'win1257' => 'windows-1257',
	'win1258' => 'windows-1258',
	'cp1250' => 'windows-1250',
	'cp1252' => 'windows-1252',
	'ms-ee' => 'windows-1250',
	'ms-ansi' => 'windows-1252',
	'ms-greek' => 'windows-1253',
	'ms-turk' => 'windows-1254',
	'winbaltrim' => 'windows-1257',
	'mac' => 'macRoman',
	'macintosh' => 'macRoman',
	'euc-cn' => 'gb2312',
	'x-euc-cn' => 'gb2312',
	'utf8' => 'utf-8',
	'utf-2' => 'utf-8',
	'utf2' => 'utf-8',
);
180 /*
181 JIS X 0208 (euc-jp)
182 CNS 11643 (EUC-TW)
183 KS C 5601 (EUC-KR)
184 */
185
186
/**
 * Normalizes a charset name.
 * The name is lowercased and, if it is a known alias listed in $this->synonyms,
 * replaced by the name that alias maps to.
 *
 * @param	string		Input charset name (any letter case)
 * @return	string		Normalized charset name; names without a synonym entry are returned lowercased
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function parse_charset($charset) {
	$encoding = strtolower($charset);

		// The alias table is keyed by lowercase names, so the lookup must use the lowercased value.
	if (isset($this->synonyms[$encoding])) {
		$encoding = $this->synonyms[$encoding];
	}

	return $encoding;
}
200
201
/**
 * Convert from one charset to another charset.
 *
 * If $TYPO3_CONF_VARS['SYS']['t3lib_cs_convMethod'] is set to 'iconv', 'recode' or 'mbstring'
 * (and the matching PHP function exists) that library is tried first. If it is not configured,
 * not available or reports failure, the built-in table based conversion via UTF-8 is used.
 * The libraries are skipped when $useEntityForNoChar is set because they cannot fall back to entities.
 *
 * @param	string		Input string
 * @param	string		From charset (the current charset of the string)
 * @param	string		To charset (the output charset wanted)
 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return	string		Converted string
 */
function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) {
	if ($fromCS==$toCS) return $str;

	if (!$useEntityForNoChar) {	// iconv, recode and mbstring don't support fallback to SGML entities
		$method = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']) ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'] : '';
		$conv_str = false;

		switch ($method) {
			case 'iconv':
					// Argument order is (from, to, string)
				if (function_exists('iconv')) $conv_str = iconv($fromCS,$toCS.'//TRANSLIT',$str);
			break;
			case 'recode':
					// A recode request is written "from..to"
				if (function_exists('recode_string')) $conv_str = recode_string($fromCS.'..'.$toCS,$str);
			break;
			case 'mbstring':
					// Argument order is (string, to, from)
				if (function_exists('mb_convert_encoding')) $conv_str = mb_convert_encoding($str,$toCS,$fromCS);
			break;
		}
		if (false !== $conv_str) return $conv_str;
			// otherwise: fallback to TYPO3 conversion
	}

	if ($fromCS!='utf-8') $str=$this->utf8_encode($str,$fromCS);
	if ($toCS!='utf-8') $str=$this->utf8_decode($str,$toCS,$useEntityForNoChar);
	return $str;
}
236
237
/**
 * Converts $str from $charset to UTF-8
 *
 * Bytes 0-127 are passed through unchanged (except for the two-byte charsets listed in
 * $this->twoByteSets, where every character is read as a big-endian 16 bit value).
 * For the charsets in $this->eucBasedSets a byte above 127 is combined with the following byte.
 * Characters missing from the conversion table are replaced by chr($this->noCharByteVal).
 *
 * @param	string		String in local charset to convert to UTF-8
 * @param	string		Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return	string		Output string, converted to UTF-8. Nothing (NULL) is returned if the charset could not be initialized.
 */
function utf8_encode($str,$charset) {

		// Charset is case-insensitive.
	if ($this->initCharset($charset)) {	// Parse conv. table if not already...
		$strLen = strlen($str);
		$outStr = '';
		$isTwoByteSet = !empty($this->twoByteSets[$charset]);
		$isEucSet = !empty($this->eucBasedSets[$charset]);
		$noChar = chr($this->noCharByteVal);

		for ($a=0; $a<$strLen; $a++) {	// Traverse each char in string.
			$chr = substr($str,$a,1);
			$ord = ord($chr);

			if ($isTwoByteSet) {	// If the charset has two bytes per char
				$a++;	// The second byte belongs to this char, so skip it in the next round
				$ord = ($ord << 8) | ord(substr($str,$a,1));	// assume big endian

				if (isset($this->parsedCharsets[$charset]['local'][$ord])) {	// If the local char-number was found in parsed conv. table then we use that...
					$outStr .= $this->parsedCharsets[$charset]['local'][$ord];
				} else $outStr .= $noChar;	// No char exists
			} elseif ($ord>127) {	// If char has value over 127 it needs to be looked up
				if ($isEucSet) {	// EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
					$a++;
					$ord = $ord*256 + ord(substr($str,$a,1));
				}

				if (isset($this->parsedCharsets[$charset]['local'][$ord])) {	// If the local char-number was found in parsed conv. table then we use that...
					$outStr .= $this->parsedCharsets[$charset]['local'][$ord];
				} else $outStr .= $noChar;	// No char exists
			} else $outStr .= $chr;	// ... otherwise it's just ASCII 0-127 and one byte. Transparent
		}
		return $outStr;
	}
}
277
/**
 * Converts $str from UTF-8 to $charset
 *
 * Every multibyte sequence is looked up in the parsed conversion table of the charset.
 * Unknown sequences become a numeric entity (if $useEntityForNoChar is set) or chr($this->noCharByteVal).
 * A byte above 127 that cannot start a sequence (bit 64 not set) becomes chr($this->noCharByteVal).
 *
 * @param	string		String in UTF-8 to convert to local charset
 * @param	string		Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return	string		Output string, converted to local charset. Nothing (NULL) is returned if the charset could not be initialized.
 */
function utf8_decode($str,$charset,$useEntityForNoChar=0) {

		// Parse conv. table if not already done; give up if the charset is unknown.
	if (!$this->initCharset($charset)) {
		return;
	}

	$result = '';
	$total = strlen($str);

	for ($pos=0; $pos<$total; $pos++) {	// Traverse each byte in the UTF-8 string.
		$byte = substr($str,$pos,1);
		$code = ord($byte);

			// ASCII 0-127 is one byte and passed on transparently:
		if ($code<=127) {
			$result .= $byte;
			continue;
		}

			// A first byte must have the 7th bit set, otherwise we are in the MIDDLE of a sequence:
		if (!($code & 64)) {
			$result .= chr($this->noCharByteVal);
			continue;
		}

			// Collect the sequence byte by byte: each further high bit set in the first byte announces one more byte.
			// (Pre-calculating the number of bytes and cutting them out with one substr() was tried before
			// and reverted, because Russian UTF-8 converted back to windows-1251 failed with it.)
		$sequence = $byte;
		for ($n=0; $n<8; $n++) {
			$code = $code << 1;
			if (!($code & 128)) break;
			$pos++;
			$sequence .= substr($str,$pos,1);
		}

		if (isset($this->parsedCharsets[$charset]['utf8'][$sequence])) {	// The UTF-8 char-sequence is known:
			$localNumber = $this->parsedCharsets[$charset]['utf8'][$sequence];
			if ($localNumber>255) {	// Greater than 255: split the number (16bit word assumed) in two chars.
				$result .= chr(($localNumber >> 8) & 255).chr($localNumber & 255);
			} else {
				$result .= chr($localNumber);
			}
		} elseif ($useEntityForNoChar) {	// Create num entity:
			$result .= '&#'.$this->utf8CharToUnumber($sequence,1).';';
		} else {	// No char exists
			$result .= chr($this->noCharByteVal);
		}
	}

	return $result;
}
327
/**
 * Converts all chars > 127 to numeric entities.
 * Multibyte sequences become hexadecimal entities built from utf8CharToUnumber();
 * a byte above 127 that cannot start a sequence becomes chr($this->noCharByteVal).
 *
 * @param	string		Input string
 * @return	string		Output string
 */
function utf8_to_entities($str) {
	$result = '';
	$total = strlen($str);
	$pos = 0;

	while ($pos<$total) {	// Traverse each byte in the UTF-8 string.
		$byte = substr($str,$pos,1);
		$code = ord($byte);

		if ($code<=127) {	// ASCII 0-127 and one byte. Transparent
			$result .= $byte;
		} elseif (!($code & 64)) {	// 7th bit not set: MIDDLE of a multibyte sequence, no char exists
			$result .= chr($this->noCharByteVal);
		} else {	// First byte of a multibyte sequence
			$sequence = $byte;
			for ($n=0; $n<8; $n++) {	// Each further high bit set in the first byte announces one more byte
				$code = $code << 1;
				if (!($code & 128)) break;
				$pos++;
				$sequence .= substr($str,$pos,1);
			}
			$result .= '&#'.$this->utf8CharToUnumber($sequence,1).';';
		}

		$pos++;
	}

	return $result;
}
359
/**
 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
 *
 * Entities which are neither well-formed numeric entities nor (with $alsoStdHtmlEnt) found in
 * PHP's HTML entity translation table are left untouched.
 *
 * @param	string		Input string, UTF-8
 * @param	boolean		If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
 * @return	string		Output string
 */
function entities_to_utf8($str,$alsoStdHtmlEnt=0) {
		// NOTE(review): the table values are passed through utf8_encode(...,'iso-8859-1') below, which assumes
		// get_html_translation_table() returns ISO-8859-1 bytes - confirm for the PHP version in use.
	$trans_tbl = $alsoStdHtmlEnt ? array_flip(get_html_translation_table(HTML_ENTITIES)) : array();

		// Split on entities; with PREG_SPLIT_DELIM_CAPTURE every odd element is the entity body (text between "&" and ";")
	$parts = preg_split('/&([#[:alnum:]]*);/',$str,-1,PREG_SPLIT_DELIM_CAPTURE);

	foreach ($parts as $k => $v) {
		if ($k%2) {
			$reg = array();
			if (preg_match('/^#[xX]([[:xdigit:]]+)$/',$v,$reg)) {	// Hex entity: decode the digits after "#x"
				$parts[$k] = $this->UnumberToChar(hexdec($reg[1]));
			} elseif (preg_match('/^#([0-9]+)$/',$v,$reg)) {	// Dec entity
				$parts[$k] = $this->UnumberToChar(intval($reg[1]));
			} elseif ($alsoStdHtmlEnt && isset($trans_tbl['&'.$v.';'])) {	// Other entities:
				$parts[$k] = $this->utf8_encode($trans_tbl['&'.$v.';'],'iso-8859-1');
			} else {	// No conversion:
				$parts[$k] = '&'.$v.';';
			}
		}
	}

	return implode('',$parts);
}
389
/**
 * Converts all chars in the input UTF-8 string into integer numbers returned in an array
 *
 * @param	string		Input string, UTF-8
 * @param	boolean		If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
 * @param	boolean		If set, then instead of integer numbers the real UTF-8 char is returned.
 * @return	array		Output array with the char numbers (or the chars themselves if $retChar is set)
 */
function utf8_to_numberarray($str,$convEntities=0,$retChar=0) {
		// Entities are turned into real UTF-8 chars first, if requested:
	if ($convEntities) {
		$str = $this->entities_to_utf8($str,1);
	}

	$collected = array();
	$total = strlen($str);

	for ($pos=0; $pos<$total; $pos++) {	// Traverse each byte in the UTF-8 string.
		$byte = substr($str,$pos,1);
		$code = ord($byte);

			// ASCII 0-127 and one byte. Transparent
		if ($code<=127) {
			if ($retChar) {
				$collected[] = chr($code);
			} else {
				$collected[] = $code;
			}
			continue;
		}

			// 7th bit not set: MIDDLE of a multibyte sequence, no char exists
		if (!($code & 64)) {
			if ($retChar) {
				$collected[] = chr($this->noCharByteVal);
			} else {
				$collected[] = $this->noCharByteVal;
			}
			continue;
		}

			// First byte of a multibyte sequence: each further high bit set announces one more byte
		$sequence = $byte;
		for ($n=0; $n<8; $n++) {
			$code = $code << 1;
			if (!($code & 128)) break;
			$pos++;
			$sequence .= substr($str,$pos,1);
		}

		if ($retChar) {
			$collected[] = $sequence;
		} else {
			$collected[] = $this->utf8CharToUnumber($sequence);
		}
	}

	return $collected;
}
428
/**
 * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
 * This function is automatically called by the conversion functions
 *
 * The parsed table is stored in $this->parsedCharsets[$charset] with two sub-arrays:
 * 'local' (local char number => UTF-8 char) and 'utf8' (UTF-8 char => local char number).
 * Only local char numbers above 127 are stored. A serialized copy is cached in typo3temp/.
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param	string		The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return	integer		Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed (or read from the cache file).
 * @access private
 */
function initCharset($charset) {
		// Only process if the charset is not yet loaded:
	if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
		return 1;
	}

		// Conversion table filename:
	$charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';

		// Stop if the conversion table is not found:
	if (!$charset || !t3lib_div::validPathStr($charsetConvTableFile) || !@is_file($charsetConvTableFile)) {
		return false;
	}

		// Cache file for charsets:
		// Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/charset_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$cached = unserialize(t3lib_div::getUrl($cacheFile));
			// Only accept a cache with the expected structure; a broken cache file is re-created below.
		if (is_array($cached) && isset($cached['local']) && isset($cached['utf8'])) {
			$this->parsedCharsets[$charset] = $cached;
			return 2;
		}
	}

		// Parse conversion table into lines:
	$lines = t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1);
		// Initialize the internal variable holding the conv. table:
	$this->parsedCharsets[$charset] = array('local'=>array(),'utf8'=>array());

		// The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
	$whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
	$detectedType = '';

	foreach ($lines as $value) {
			// Comment lines or blanks are ignored.
		if (!trim($value) || substr($value,0,1)=='#') continue;

			// Detect type if not done yet: (Done on first real line)
		if (!$detectedType) $detectedType = preg_match($whitespacedPattern,$value) ? 'whitespaced' : 'ms-token';

		if ($detectedType=='ms-token') {
			$tokens = preg_split('/=|:/',$value,3);
			$hexbyte = $tokens[0];
			$utf8 = isset($tokens[1]) ? $tokens[1] : '';
		} else {
			$regA = array();
			if (!preg_match($whitespacedPattern,$value,$regA)) continue;	// Line does not follow the detected syntax
			$hexbyte = $regA[1];
			$utf8 = 'U+'.$regA[2];
		}

		$decval = hexdec(trim($hexbyte));
		if ($decval>127) {
			$utf8decval = hexdec(substr(trim($utf8),2));	// Strip the "U+" prefix
			$utf8char = $this->UnumberToChar($utf8decval);
			$this->parsedCharsets[$charset]['local'][$decval] = $utf8char;
			$this->parsedCharsets[$charset]['utf8'][$utf8char] = $decval;
		}
	}

	if ($cacheFile) {
		t3lib_div::writeFile($cacheFile,serialize($this->parsedCharsets[$charset]));
	}
	return 2;
}
491
/**
 * Converts a UNICODE number to a UTF-8 multibyte character
 * Algorithm based on script found at From: http://czyborra.com/utf/
 *
 * The binary representation of the character's integer value is spread across the bytes and the
 * number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
 *
 *  bytes | bits | representation
 *      1 |    7 | 0vvvvvvv
 *      2 |   11 | 110vvvvv 10vvvvvv
 *      3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
 *      4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * @param	integer		UNICODE integer
 * @return	string		UTF-8 multibyte character string
 * @see utf8CharToUnumber()
 */
function UnumberToChar($cbyte) {
		// One byte is enough for 7 bit values:
	if ($cbyte < 0x80) {
		return chr($cbyte);
	}

		// Find the marker bits of the lead byte and the number of trailing bytes:
	if ($cbyte < 0x800) {
		$leadMarker = 0xC0;
		$trailing = 1;
	} elseif ($cbyte < 0x10000) {
		$leadMarker = 0xE0;
		$trailing = 2;
	} elseif ($cbyte < 0x200000) {
		$leadMarker = 0xF0;
		$trailing = 3;
	} elseif ($cbyte < 0x4000000) {
		$leadMarker = 0xF8;
		$trailing = 4;
	} elseif ($cbyte < 0x80000000) {
		$leadMarker = 0xFC;
		$trailing = 5;
	} else {	// Cannot express a 32-bit character in UTF-8
		return chr($this->noCharByteVal);
	}

		// Lead byte carries the topmost bits, every trailing byte the next 6 bits:
	$char = chr($leadMarker | ($cbyte >> (6*$trailing)));
	for ($shift=6*($trailing-1); $shift>=0; $shift-=6) {
		$char .= chr(0x80 | (($cbyte >> $shift) & 0x3F));
	}

	return $char;
}
545
/**
 * Converts a UTF-8 Multibyte character to a UNICODE number
 *
 * If the first byte does not start a multibyte sequence, its byte value is the result.
 *
 * @param	string		UTF-8 multibyte character string
 * @param	boolean		If set, then a hex. number is returned (as a string prefixed with "x").
 * @return	integer		UNICODE integer
 * @see UnumberToChar()
 */
function utf8CharToUnumber($str,$hex=0) {
	$firstByte = ord(substr($str,0,1));

	if (($firstByte & 192) != 192) {	// Not the start of a multibyte string: the byte value is the number
		$number = $firstByte;
	} else {
		$trailingBits = '';
		$shifted = $firstByte;
		$trailing = 0;

			// Each further high bit set in the first byte announces one more byte, which contributes its lower 6 bits:
		while ($trailing<8) {
			$shifted = $shifted << 1;
			if (!($shifted & 128)) break;
			$trailingBits .= sprintf('%06b',ord(substr($str,$trailing+1,1)) & 0x3F);
			$trailing++;
		}

			// The first byte contributes its lowest (6 - number of trailing bytes) bits:
		$leadBits = substr('00000000'.decbin($firstByte),-(6-$trailing));
		$number = bindec($leadBits.$trailingBits);
	}

	if ($hex) {
		return 'x'.dechex($number);
	}
	return $number;
}
572
573
/**
 * Truncates a string in UTF-8 short at a given byte length
 * The cut is moved backwards if it would otherwise split a multibyte character.
 *
 * @param	string		UTF-8 multibyte character string
 * @param	integer		the byte length
 * @return	string		the shortened string (at most $len bytes)
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strtrunc($str,$len) {
	if (isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) && $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strcut($str,0,$len,'utf-8');
	}

		// Nothing to keep / nothing to cut:
	if ($len <= 0) return '';
	if ($len >= strlen($str)) return $str;

	$i = $len-1;
	if (ord($str[$i]) & 0x80) {	// last kept byte is part of a multibyte sequence
			// Walk back over continuation bytes (10xxxxxx) to the first byte of the sequence
		while ($i > 0 && (ord($str[$i]) & 0xC0) == 0x80) $i--;

		$lead = ord($str[$i]);
		if (($lead & 0xC0) == 0xC0) {	// a real lead byte (11xxxxxx)
			for ($bc=0, $mbs=$lead; $mbs & 0x80; $mbs = $mbs << 1) $bc++;	// calculate number of bytes
			if ($bc+$i > $len) return substr($str,0,$i);	// the char does not fit: cut before it
				// fallthru: multibyte char fits into length
		}
	}
	return substr($str,0,$len);
}
597
598 // a few stubs of possibly useful functions, which may be implemented in PHP
599
/**
 * Counts the characters (not bytes) of a UTF-8 string.
 * Uses mb_strlen() if $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'] is 'mbstring',
 * otherwise counts every byte that is not a continuation byte (10xxxxxx).
 *
 * @param	string		UTF-8 string
 * @return	integer		Number of characters
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf_strlen($str) {
	if (isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) && $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strlen($str,'utf-8');
	}

	$str = (string)$str;
	$chars = 0;
	for ($i=0, $bytes=strlen($str); $i<$bytes; $i++) {
		if ((ord($str[$i]) & 0xC0) != 0x80) $chars++;	// ASCII or first byte of a sequence
	}
	return $chars;
}
610
/**
 * Returns a part of a UTF-8 string, counted in characters (not bytes).
 * Uses mb_substr() if $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'] is 'mbstring', otherwise a PHP implementation.
 * If $len is omitted the rest of the string is returned; an explicitly passed 0 yields an empty string.
 *
 * @param	string		UTF-8 string
 * @param	integer		Start position in characters (negative values count from the end)
 * @param	integer		Number of characters (negative values stop that many characters before the end)
 * @return	string		The requested part of the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf_substr($str,$start,$len=0) {
		// The default of $len cannot express "until the end", so detect whether it was passed at all.
	$untilEnd = func_num_args() < 3;

	if (isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) && $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
			// The charset can only be given together with a length, so use the full length for "until the end".
		if ($untilEnd) $len = mb_strlen($str,'utf-8');
		return mb_substr($str,$start,$len,'utf-8');
	}

		// Split the string into characters: continuation bytes (10xxxxxx) are added to the char before them.
	$str = (string)$str;
	$chars = array();
	$last = -1;
	for ($i=0, $bytes=strlen($str); $i<$bytes; $i++) {
		$byte = $str[$i];
		if ($last >= 0 && (ord($byte) & 0xC0) == 0x80) {
			$chars[$last] .= $byte;
		} else {
			$chars[++$last] = $byte;
		}
	}

		// array_slice() treats negative start/length like substr() does.
	$part = $untilEnd ? array_slice($chars,$start) : array_slice($chars,$start,$len);
	return implode('',$part);
}
624
/**
 * Finds the first occurrence of $needle in a UTF-8 string; positions are counted in characters (not bytes).
 * Uses mb_strpos() if $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'] is 'mbstring', otherwise a PHP implementation.
 * In the PHP implementation a negative $offset is treated as 0 and an empty $needle is never found.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 string to search for
 * @param	integer		Character position to start the search at
 * @return	integer		Character position of the first match, or FALSE if there is none
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf_strpos($haystack,$needle,$offset=0) {
	if (isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) && $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
			// The third argument of mb_strpos() is the offset, the charset comes fourth.
		return mb_strpos($haystack,$needle,$offset,'utf-8');
	}

	$haystack = (string)$haystack;
	$needle = (string)$needle;
	if ($needle === '') return false;
	$bytes = strlen($haystack);

		// Translate the character offset into a byte offset by skipping whole characters:
	$byteOffset = 0;
	for ($skipped=0; $skipped<$offset && $byteOffset<$bytes; $skipped++) {
		$byteOffset++;
		while ($byteOffset<$bytes && (ord($haystack[$byteOffset]) & 0xC0) == 0x80) $byteOffset++;
	}
	if ($skipped < $offset) return false;	// Offset is behind the end of the string

		// A byte-wise search is sufficient; the byte position is converted to a character position afterwards.
	$bytePos = strpos($haystack,$needle,$byteOffset);
	if ($bytePos === false) return false;

	$charPos = 0;
	for ($i=0; $i<$bytePos; $i++) {
		if ((ord($haystack[$i]) & 0xC0) != 0x80) $charPos++;
	}
	return $charPos;
}
637
/**
 * Finds the last occurrence of $needle in a UTF-8 string; positions are counted in characters (not bytes).
 * Uses mb_strrpos() if $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'] is 'mbstring', otherwise a PHP implementation.
 * In the PHP implementation a negative $offset is treated as 0 and an empty $needle is never found.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 string to search for
 * @param	integer		Character position to start the search at
 * @return	integer		Character position of the last match, or FALSE if there is none
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf_strrpos($haystack,$needle,$offset=0) {
	if (isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) && $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
			// The third argument of mb_strrpos() is the offset, the charset comes fourth.
		return mb_strrpos($haystack,$needle,$offset,'utf-8');
	}

	$haystack = (string)$haystack;
	$needle = (string)$needle;
	if ($needle === '') return false;
	$bytes = strlen($haystack);

		// Translate the character offset into a byte offset by skipping whole characters:
	$byteOffset = 0;
	for ($skipped=0; $skipped<$offset && $byteOffset<$bytes; $skipped++) {
		$byteOffset++;
		while ($byteOffset<$bytes && (ord($haystack[$byteOffset]) & 0xC0) == 0x80) $byteOffset++;
	}
	if ($skipped < $offset) return false;	// Offset is behind the end of the string

		// Repeat the byte-wise forward search until no further match is found; the last hit wins.
	$bytePos = false;
	while (($hit = strpos($haystack,$needle,$byteOffset)) !== false) {
		$bytePos = $hit;
		$byteOffset = $hit+1;
	}
	if ($bytePos === false) return false;

		// Convert the byte position into a character position:
	$charPos = 0;
	for ($i=0; $i<$bytePos; $i++) {
		if ((ord($haystack[$i]) & 0xC0) != 0x80) $charPos++;
	}
	return $charPos;
}
650 }
651
// If TYPO3_MODE is defined and a file is registered for this script under the 'XCLASS' key
// of $TYPO3_CONF_VARS for that mode, include that file once.
if (defined('TYPO3_MODE')) {
	if ($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']) {
		include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
	}
}
655 ?>