Added $Id$ keywords, cleaned up comment tags
[Packages/TYPO3.CMS.git] / t3lib / class.t3lib_cs.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2003 Kasper Skaarhoj (kasper@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the Typo3 project. The Typo3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 *
17 * This script is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * This copyright notice MUST APPEAR in all copies of the script!
23 ***************************************************************/
24 /**
25 * Class for conversion between charsets.
26 *
27 * $Id$
28 *
29 * @author Kasper Skaarhoj <kasper@typo3.com>
30 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
31 */
32 /**
33 * [CLASS/FUNCTION INDEX of SCRIPT]
34 *
35 *
36 *
37 * 104: class t3lib_cs
38 * 196: function parse_charset($charset)
39 * 213: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
40 * 247: function utf8_encode($str,$charset)
41 * 288: function utf8_decode($str,$charset,$useEntityForNoChar=0)
42 * 336: function utf8_to_entities($str)
43 * 369: function entities_to_utf8($str,$alsoStdHtmlEnt=0)
44 * 400: function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
45 * 441: function initCharset($charset)
46 * 512: function UnumberToChar($cbyte)
47 * 556: function utf8CharToUnumber($str,$hex=0)
48 * 585: function utf8_strtrunc($str,$len)
49 * 624: function utf_strlen($str)
50 * 637: function utf_substr($str,$start,$len=0)
51 * 651: function utf_strpos($haystack,$needle,$offset=0)
52 * 664: function utf_strrpos($haystack,$needle,$offset=0)
53 *
54 * TOTAL FUNCTIONS: 15
55 * (This index is automatically created/updated by the extension "extdeveval")
56 *
57 */
58
59
60
61
62
63
64
65
66 /**
67 * Notes on UTF-8
68 *
69 * Functions working on UTF-8 strings:
70 *
71 * - strchr/strstr
72 * - strrchr
73 * - substr_count
74 * - implode/explode/join
75 *
76 * Functions nearly working on UTF-8 strings:
77 *
78 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf_strlen
79 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
80 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf_strpos/utf_strrpos
81 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
82 *
83 * Functions NOT working on UTF-8 strings:
84 *
85 * - str*cmp
86 * - stristr
87 * - stripos
88 * - substr
89 * - strrev
90 * - ereg/eregi
91 * - split/spliti
92 * - preg_*
93 * - ...
94 *
95 */
96 /**
97 * Class for conversion between charsets.
98 *
99 * @author Kasper Skaarhoj <kasper@typo3.com>
100 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
101 * @package TYPO3
102 * @subpackage t3lib
103 */
class t3lib_cs {
	var $noCharByteVal=127;		// ASCII value emitted for chars with no equivalent.

		// This is the array where parsed conversion tables are stored (cached)
	var $parsedCharsets=array();

		// This tells the converter which charsets have two bytes per char:
	var $twoByteSets=array(
		'ucs-2'=>1,	// 2-byte Unicode
		'utf-16'=>1	// 2-byte Unicode with surrogates
	);

		// This tells the converter which charsets use the Extended Unix Code scheme:
	var $eucBasedSets=array(
		'gb2312'=>1,	// Chinese, simplified.
	);

		// see	http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
		//	http://czyborra.com/charsets/iso8859.html
	var $synonyms=array(
		'us' => 'ascii',
		'us-ascii'=> 'ascii',
		'cp819' => 'iso-8859-1',
		'ibm819' => 'iso-8859-1',
		'iso-ir-100' => 'iso-8859-1',
		'iso-ir-109' => 'iso-8859-2',
		'iso-ir-148' => 'iso-8859-9',
		'iso-ir-199' => 'iso-8859-14',
		'iso-ir-203' => 'iso-8859-15',
		'csisolatin1' => 'iso-8859-1',
		'csisolatin2' => 'iso-8859-2',
		'csisolatin3' => 'iso-8859-3',
		'csisolatin5' => 'iso-8859-9',
		'csisolatin8' => 'iso-8859-14',
		'csisolatin9' => 'iso-8859-15',
		'csisolatingreek' => 'iso-8859-7',
		'iso-celtic' => 'iso-8859-14',
		'latin1' => 'iso-8859-1',
		'latin2' => 'iso-8859-2',
		'latin3' => 'iso-8859-3',
		'latin5' => 'iso-8859-9',
		'latin6' => 'iso-8859-10',
		'latin8' => 'iso-8859-14',
		'latin9' => 'iso-8859-15',
		'l1' => 'iso-8859-1',
		'l2' => 'iso-8859-2',
		'l3' => 'iso-8859-3',
		'l5' => 'iso-8859-9',
		'l6' => 'iso-8859-10',
		'l8' => 'iso-8859-14',
		'l9' => 'iso-8859-15',
		'cyrillic' => 'iso-8859-5',
		'arabic' => 'iso-8859-6',
		'win874' => 'windows-874',
		'win1250' => 'windows-1250',
		'win1251' => 'windows-1251',
		'win1252' => 'windows-1252',
		'win1253' => 'windows-1253',
		'win1254' => 'windows-1254',
		'win1255' => 'windows-1255',
		'win1256' => 'windows-1256',
		'win1257' => 'windows-1257',
		'win1258' => 'windows-1258',
		'cp1250' => 'windows-1250',
		'cp1252' => 'windows-1252',
		'ms-ee' => 'windows-1250',
		'ms-ansi' => 'windows-1252',
		'ms-greek' => 'windows-1253',
		'ms-turk' => 'windows-1254',
		'winbaltrim' => 'windows-1257',
		'mac' => 'macRoman',
		'macintosh' => 'macRoman',
		'euc-cn' => 'gb2312',
		'x-euc-cn' => 'gb2312',
		'utf8' => 'utf-8',
		'utf-2' => 'utf-8',
		'utf2' => 'utf-8',
	);
	/*
		JIS X 0208 (euc-jp)
		CNS 11643 (EUC-TW)
		KS C 5601 (EUC-KR)
	*/


	/**
	 * Normalizes a charset name: lowercases it and resolves known aliases
	 * (see $this->synonyms) to the canonical name.
	 *
	 * @param	string		Input charset
	 * @return	string		Normalized charset
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function parse_charset($charset)	{
		$encoding = strtolower($charset);
			// The synonym table is keyed by lowercase names, so the lookup must use the lowercased value.
		if (isset($this->synonyms[$encoding]))	$encoding = $this->synonyms[$encoding];

		return $encoding;
	}


	/**
	 * Convert from one charset to another charset.
	 *
	 * If $TYPO3_CONF_VARS['SYS']['t3lib_cs_convMethod'] is 'iconv', 'recode' or 'mbstring' (and the
	 * corresponding PHP function exists) that library is tried first; if it fails, or if entity
	 * fallback is requested, the table based conversion via UTF-8 is used.
	 *
	 * @param	string		Input string
	 * @param	string		From charset (the current charset of the string)
	 * @param	string		To charset (the output charset wanted)
	 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
	 * @return	string		Converted string
	 */
	function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)	{
		global $TYPO3_CONF_VARS;

		if ($fromCS==$toCS)	return $str;

		if (!$useEntityForNoChar)	{	// iconv, recode and mbstring don't support fallback to SGML entities
			$method = isset($TYPO3_CONF_VARS['SYS']['t3lib_cs_convMethod']) ? $TYPO3_CONF_VARS['SYS']['t3lib_cs_convMethod'] : '';
			$conv_str = false;

			if ($method=='iconv' && function_exists('iconv'))	{
					// Signature is iconv(from, to, string)
				$conv_str = iconv($fromCS,$toCS.'//TRANSLIT',$str);
			} elseif ($method=='recode' && function_exists('recode_string'))	{
					// A recode request has the form "from..to"
				$conv_str = recode_string($fromCS.'..'.$toCS,$str);
			} elseif ($method=='mbstring' && function_exists('mb_convert_encoding'))	{
				$conv_str = mb_convert_encoding($str,$toCS,$fromCS);
			}

			if (false !== $conv_str)	return $conv_str;
				// ... otherwise fall back to TYPO3 conversion
		}

		if ($fromCS!='utf-8')	$str=$this->utf8_encode($str,$fromCS);
		if ($toCS!='utf-8')		$str=$this->utf8_decode($str,$toCS,$useEntityForNoChar);
		return $str;
	}


	/**
	 * Converts $str from $charset to UTF-8
	 *
	 * @param	string		String in local charset to convert to UTF-8
	 * @param	string		Charset, lowercase. Must be found in csconvtbl/ folder.
	 * @return	string		Output string, converted to UTF-8. Nothing (NULL) is returned if the conversion table for $charset could not be initialized.
	 */
	function utf8_encode($str,$charset)	{

		if ($this->initCharset($charset))	{	// Parse conv. table if not already...
			$strLen = strlen($str);
			$outStr = '';
			$isTwoByteSet = !empty($this->twoByteSets[$charset]);
			$isEucSet = !empty($this->eucBasedSets[$charset]);

			for ($a=0;$a<$strLen;$a++)	{	// Traverse each char in string.
				$chr = substr($str,$a,1);
				$ord = ord($chr);

				if ($isTwoByteSet)	{	// If the charset has two bytes per char
					$a++;	// The second byte belongs to this char, so step past it.
					$ord2 = ord(substr($str,$a,1));
					$ord = ($ord << 8) | $ord2;	// assume big endian

					if (isset($this->parsedCharsets[$charset]['local'][$ord]))	{	// If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?)
						$outStr.=$this->parsedCharsets[$charset]['local'][$ord];
					} else $outStr.=chr($this->noCharByteVal);	// No char exists
				} elseif ($ord>127)	{	// Values over 127 need a table lookup (and become multibyte in UTF-8)
					if ($isEucSet)	{	// EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
						$a++;
						$ord2 = ord(substr($str,$a,1));
						$ord = $ord*256+$ord2;
					}

					if (isset($this->parsedCharsets[$charset]['local'][$ord]))	{	// If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?)
						$outStr.=$this->parsedCharsets[$charset]['local'][$ord];
					} else $outStr.=chr($this->noCharByteVal);	// No char exists
				} else $outStr.=$chr;	// ... otherwise it's just ASCII 0-127 and one byte. Transparent
			}
			return $outStr;
		}
	}

	/**
	 * Converts $str from UTF-8 to $charset
	 *
	 * @param	string		String in UTF-8 to convert to local charset
	 * @param	string		Charset, lowercase. Must be found in csconvtbl/ folder.
	 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
	 * @return	string		Output string, converted to local charset. Nothing (NULL) is returned if the conversion table for $charset could not be initialized.
	 */
	function utf8_decode($str,$charset,$useEntityForNoChar=0)	{

		if ($this->initCharset($charset))	{	// Parse conv. table if not already...
			$strLen = strlen($str);
			$outStr = '';

			for ($a=0;$a<$strLen;$a++)	{	// Traverse each char in UTF-8 string.
				$chr = substr($str,$a,1);
				$ord = ord($chr);
				if ($ord>127)	{	// This means multibyte! (first byte!)
					if ($ord & 64)	{	// Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.

						$buf = $chr;	// Add first byte
						for ($b=0;$b<8;$b++)	{	// for each byte in multibyte string...
							$ord = $ord << 1;	// Shift it left and ...
							if ($ord & 128)	{	// ... and with 8th bit - if that is set, then there are still bytes in sequence.
								$a++;	// Increase pointer...
								$buf.=substr($str,$a,1);	// ... and add the next char.
							} else break;
						}
							// Note: a shortcut that pre-computed the byte count was tried here and failed for
							// russian UTF-8 converted back to windows-1251, which is why the loop above is kept.

						if (isset($this->parsedCharsets[$charset]['utf8'][$buf]))	{	// If the UTF-8 char-sequence is found then...
							$mByte = $this->parsedCharsets[$charset]['utf8'][$buf];	// The local number
							if ($mByte>255)	{	// If the local number is greater than 255 we will need to split the byte (16bit word assumed) in two chars.
								$outStr.= chr(($mByte >> 8) & 255).chr($mByte & 255);
							} else $outStr.= chr($mByte);
						} elseif ($useEntityForNoChar) {	// Create num entity:
							$outStr.='&#'.$this->utf8CharToUnumber($buf,1).';';
						} else $outStr.=chr($this->noCharByteVal);	// No char exists
					} else $outStr.=chr($this->noCharByteVal);	// No char exists (MIDDLE of MB sequence!)
				} else $outStr.=$chr;	// ... otherwise it's just ASCII 0-127 and one byte. Transparent
			}
			return $outStr;
		}
	}

	/**
	 * Converts all chars > 127 to numeric (hexadecimal) entities.
	 *
	 * @param	string		Input string, UTF-8
	 * @return	string		Output string
	 */
	function utf8_to_entities($str)	{
		$strLen = strlen($str);
		$outStr = '';

		for ($a=0;$a<$strLen;$a++)	{	// Traverse each char in UTF-8 string.
			$chr = substr($str,$a,1);
			$ord = ord($chr);
			if ($ord>127)	{	// This means multibyte! (first byte!)
				if ($ord & 64)	{	// Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
					$buf = $chr;	// Add first byte
					for ($b=0;$b<8;$b++)	{	// for each byte in multibyte string...
						$ord = $ord << 1;	// Shift it left and ...
						if ($ord & 128)	{	// ... and with 8th bit - if that is set, then there are still bytes in sequence.
							$a++;	// Increase pointer...
							$buf.=substr($str,$a,1);	// ... and add the next char.
						} else break;
					}

					$outStr.='&#'.$this->utf8CharToUnumber($buf,1).';';
				} else $outStr.=chr($this->noCharByteVal);	// No char exists (MIDDLE of MB sequence!)
			} else $outStr.=$chr;	// ... otherwise it's just ASCII 0-127 and one byte. Transparent
		}

		return $outStr;
	}

	/**
	 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
	 * Entities which are neither valid numeric entities nor (optionally) known named entities are left untouched.
	 *
	 * @param	string		Input string, UTF-8
	 * @param	boolean		If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
	 * @return	string		Output string
	 */
	function entities_to_utf8($str,$alsoStdHtmlEnt=0)	{
		$trans_tbl = array();
		if ($alsoStdHtmlEnt)	{
				// Ask explicitly for the ISO-8859-1 table where PHP supports the charset argument,
				// because the default charset of this function differs between PHP versions.
			if (version_compare(PHP_VERSION,'5.3.4','>='))	{
				$trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES,ENT_COMPAT,'ISO-8859-1'));
			} else {
				$trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES));
			}
		}

			// Splitting with the capture flag yields plain text on even indexes and the entity names (without "&" and ";") on odd indexes.
		$parts = preg_split('/&([#[:alnum:]]+);/',$str,-1,PREG_SPLIT_DELIM_CAPTURE);
		foreach($parts as $k => $v)	{
			if ($k%2)	{
				$regs = array();
				if (preg_match('/^#([0-9]+)$/',$v,$regs))	{	// Decimal entity
					$parts[$k] = $this->UnumberToChar(intval($regs[1]));
				} elseif (preg_match('/^#[xX]([0-9a-fA-F]+)$/',$v,$regs))	{	// Hexadecimal entity
					$parts[$k] = $this->UnumberToChar(hexdec($regs[1]));
				} elseif ($alsoStdHtmlEnt && isset($trans_tbl['&'.$v.';']))	{	// Other entities:
						// In ISO-8859-1 the byte value equals the UNICODE number, so no conversion table is needed.
					$parts[$k] = $this->UnumberToChar(ord($trans_tbl['&'.$v.';']));
				} else {	// No conversion:
					$parts[$k] ='&'.$v.';';
				}
			}
		}

		return implode('',$parts);
	}

	/**
	 * Converts all chars in the input UTF-8 string into integer numbers returned in an array
	 *
	 * @param	string		Input string, UTF-8
	 * @param	boolean		If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
	 * @param	boolean		If set, then instead of integer numbers the real UTF-8 char is returned.
	 * @return	array		Output array with the char numbers
	 */
	function utf8_to_numberarray($str,$convEntities=0,$retChar=0)	{
			// If entities must be registered as well...:
		if ($convEntities)	{
			$str = $this->entities_to_utf8($str,1);
		}
			// Do conversion:
		$strLen = strlen($str);
		$outArr = array();

		for ($a=0;$a<$strLen;$a++)	{	// Traverse each char in UTF-8 string.
			$chr = substr($str,$a,1);
			$ord = ord($chr);
			if ($ord>127)	{	// This means multibyte! (first byte!)
				if ($ord & 64)	{	// Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
					$buf = $chr;	// Add first byte
					for ($b=0;$b<8;$b++)	{	// for each byte in multibyte string...
						$ord = $ord << 1;	// Shift it left and ...
						if ($ord & 128)	{	// ... and with 8th bit - if that is set, then there are still bytes in sequence.
							$a++;	// Increase pointer...
							$buf.=substr($str,$a,1);	// ... and add the next char.
						} else break;
					}

					$outArr[]=$retChar?$buf:$this->utf8CharToUnumber($buf);
				} else $outArr[]=$retChar?chr($this->noCharByteVal):$this->noCharByteVal;	// No char exists (MIDDLE of MB sequence!)
			} else $outArr[]=$retChar?chr($ord):$ord;	// ... otherwise it's just ASCII 0-127 and one byte. Transparent
		}

		return $outArr;
	}

	/**
	 * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
	 * This function is automatically called by the conversion functions
	 *
	 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
	 *
	 * @param	string		The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
	 * @return	integer		Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and loaded (parsed, or read from the cache file).
	 * @access private
	 */
	function initCharset($charset)	{
			// Only process if the charset is not yet loaded:
		if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset]))	return 1;

			// Conversion table filename:
		$charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';

			// If the conversion table is NOT found:
		if (!($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile)))	return false;

			// Cache file for charsets:
			// Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
		$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/charset_'.$charset.'.tbl');
		if ($cacheFile && @is_file($cacheFile))	{
				// NOTE(review): unserialize() trusts the content of the cache file; it must only be writable by the application itself.
			$this->parsedCharsets[$charset]=unserialize(t3lib_div::getUrl($cacheFile));
		} else {
				// Pattern for the "whitespaced" table syntax, eg. "0x0A 0x000A #LINE FEED"
			$whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';

				// Parse conversion table into lines:
			$lines=t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1);
				// Initialize the internal variable holding the conv. table:
			$this->parsedCharsets[$charset]=array('local'=>array(),'utf8'=>array());
				// traverse the lines:
			$detectedType='';
			foreach($lines as $value)	{
				if (trim($value) && substr($value,0,1)!='#')	{	// Comment line or blanks are ignored.

						// Detect type if not done yet: (Done on first real line)
						// The "whitespaced" type is on the syntax 	"0x0A	0x000A	#LINE FEED" 	while 	"ms-token" is like 		"B9 = U+00B9 : SUPERSCRIPT ONE"
					if (!$detectedType)		$detectedType = preg_match($whitespacedPattern,$value) ? 'whitespaced' : 'ms-token';

					if ($detectedType=='ms-token')	{
						$tokens = preg_split('/=|:/',$value,3);
						$hexbyte = $tokens[0];
						$utf8 = isset($tokens[1]) ? $tokens[1] : '';
					} else {
						$regA=array();
						if (!preg_match($whitespacedPattern,$value,$regA))	continue;	// Line does not follow the detected syntax; nothing to register.
						$hexbyte = $regA[1];
						$utf8 = 'U+'.$regA[2];
					}
					$decval = hexdec(trim($hexbyte));
					if ($decval>127)	{	// Values 0-127 are passed through untouched by the converters, so they are not stored.
						$utf8decval = hexdec(substr(trim($utf8),2));
						$utf8Char = $this->UnumberToChar($utf8decval);
						$this->parsedCharsets[$charset]['local'][$decval]=$utf8Char;
						$this->parsedCharsets[$charset]['utf8'][$utf8Char]=$decval;
					}
				}
			}
			if ($cacheFile)	{
				t3lib_div::writeFile($cacheFile,serialize($this->parsedCharsets[$charset]));
			}
		}
		return 2;
	}

	/**
	 * Converts a UNICODE number to a UTF-8 multibyte character
	 * Algorithm based on script found at From: http://czyborra.com/utf/
	 *
	 * The binary representation of the character's integer value is thus simply spread across the bytes and the number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
	 *
	 *  bytes | bits | representation
	 *      1 |    7 | 0vvvvvvv
	 *      2 |   11 | 110vvvvv 10vvvvvv
	 *      3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
	 *      4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
	 *      5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
	 *      6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
	 *
	 * @param	integer		UNICODE integer
	 * @return	string		UTF-8 multibyte character string
	 * @see utf8CharToUnumber()
	 */
	function UnumberToChar($cbyte)	{
		$str='';

		if ($cbyte < 0x80) {
			$str.=chr($cbyte);
		} else if ($cbyte < 0x800) {
			$str.=chr(0xC0 | ($cbyte >> 6));
			$str.=chr(0x80 | ($cbyte & 0x3F));
		} else if ($cbyte < 0x10000) {
			$str.=chr(0xE0 | ($cbyte >> 12));
			$str.=chr(0x80 | (($cbyte >> 6) & 0x3F));
			$str.=chr(0x80 | ($cbyte & 0x3F));
		} else if ($cbyte < 0x200000) {
			$str.=chr(0xF0 | ($cbyte >> 18));
			$str.=chr(0x80 | (($cbyte >> 12) & 0x3F));
			$str.=chr(0x80 | (($cbyte >> 6) & 0x3F));
			$str.=chr(0x80 | ($cbyte & 0x3F));
		} else if ($cbyte < 0x4000000) {
			$str.=chr(0xF8 | ($cbyte >> 24));
			$str.=chr(0x80 | (($cbyte >> 18) & 0x3F));
			$str.=chr(0x80 | (($cbyte >> 12) & 0x3F));
			$str.=chr(0x80 | (($cbyte >> 6) & 0x3F));
			$str.=chr(0x80 | ($cbyte & 0x3F));
		} else if ($cbyte < 0x80000000) {
			$str.=chr(0xFC | ($cbyte >> 30));
			$str.=chr(0x80 | (($cbyte >> 24) & 0x3F));
			$str.=chr(0x80 | (($cbyte >> 18) & 0x3F));
			$str.=chr(0x80 | (($cbyte >> 12) & 0x3F));
			$str.=chr(0x80 | (($cbyte >> 6) & 0x3F));
			$str.=chr(0x80 | ($cbyte & 0x3F));
		} else { // Cannot express a 32-bit character in UTF-8
			$str .= chr($this->noCharByteVal);
		}
		return $str;
	}

	/**
	 * Converts a UTF-8 Multibyte character to a UNICODE number
	 *
	 * @param	string		UTF-8 multibyte character string
	 * @param	boolean		If set, then a hex. number is returned (as a string prefixed with "x", eg. "x20ac").
	 * @return	integer		UNICODE integer
	 * @see UnumberToChar()
	 */
	function utf8CharToUnumber($str,$hex=0)	{
		$ord=ord(substr($str,0,1));	// First char

		if (($ord & 192) == 192)	{	// This verifies that it IS a multi byte string
			$binBuf='';
			for ($b=0;$b<8;$b++)	{	// for each byte in multibyte string...
				$ord = $ord << 1;	// Shift it left and ...
				if ($ord & 128)	{	// ... and with 8th bit - if that is set, then there are still bytes in sequence.
					$binBuf.=substr('00000000'.decbin(ord(substr($str,$b+1,1))),-6);	// The six payload bits of the following byte
				} else break;
			}
				// $b now holds the number of following bytes; the lead byte carries (6-$b) payload bits.
			$binBuf=substr('00000000'.decbin(ord(substr($str,0,1))),-(6-$b)).$binBuf;

			$int = bindec($binBuf);
		} else $int = $ord;

		return $hex ? 'x'.dechex($int) : $int;
	}


	/**
	 * Truncates a string in UTF-8 short at a given byte length, never cutting a multibyte char in half.
	 *
	 * @param	string		UTF-8 multibyte character string
	 * @param	integer		the byte length
	 * @return	string		the shortened string
	 * @see strcut()
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function utf8_strtrunc($str,$len)	{
		$utils = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] : '';
		if ($utils == 'mbstring' && function_exists('mb_strcut'))	{
			return mb_strcut($str,0,$len,'utf-8');
		}

		$len = intval($len);
		if ($len <= 0)	return '';
		if ($len >= strlen($str))	return $str;	// Nothing to cut, and $str[$len-1] below is guaranteed to exist

		$i = $len-1;
		if (ord($str[$i]) & 0x80) { // part of a multibyte sequence
			for (; $i>0 && !(ord($str[$i]) & 0x40); $i--)	;	// find the first byte
			for ($bc=0, $mbs=ord($str[$i]); $mbs & 0x80; $mbs = $mbs << 1)	$bc++;	// calculate number of bytes
			if ($bc+$i > $len)	return substr($str,0,$i);	// the char would be cut in half, so drop it completely
				// fallthru: multibyte char fits into length
		}
		return substr($str,0,$len);
	}


		// a few stubs of possibly useful functions, which may be implemented in PHP
		// They only work if $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'] is set to 'mbstring'; otherwise nothing (NULL) is returned.

	/**
	 * Returns the length of a UTF-8 string in characters.
	 *
	 * @param	string		UTF-8 string
	 * @return	integer		Number of characters. Nothing (NULL) if mbstring is not configured.
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function utf_strlen($str)	{
		$utils = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] : '';
		if ($utils == 'mbstring')	{
			return mb_strlen($str,'utf-8');
		}
	}

	/**
	 * Returns a part of a UTF-8 string, counted in characters.
	 *
	 * @param	string		UTF-8 string
	 * @param	integer		Start position (character position)
	 * @param	integer		Length (in characters). If the argument is omitted the rest of the string is returned.
	 * @return	string		The substring. Nothing (NULL) if mbstring is not configured.
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function utf_substr($str,$start,$len=0)	{
		$utils = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] : '';
		if ($utils == 'mbstring')	{
				// The length cannot be skipped when a charset is given, so for an omitted length
				// the full character count is passed, which returns everything from $start on.
			if (func_num_args() < 3 || $len === NULL)	{
				$len = mb_strlen($str,'utf-8');
			}
			return mb_substr($str,$start,$len,'utf-8');
		}
	}

	/**
	 * Finds the character position of the first occurrence of a string in a UTF-8 string.
	 *
	 * @param	string		UTF-8 string to search in
	 * @param	string		UTF-8 string to search for
	 * @param	integer		Character position to start the search from
	 * @return	integer		The character position, or the result of mb_strpos() if not found. Nothing (NULL) if mbstring is not configured.
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function utf_strpos($haystack,$needle,$offset=0)	{
		$utils = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] : '';
		if ($utils == 'mbstring')	{
				// The charset is the FOURTH argument; the third one is the offset.
			return mb_strpos($haystack,$needle,intval($offset),'utf-8');
		}
	}

	/**
	 * Finds the character position of the last occurrence of a string in a UTF-8 string.
	 *
	 * @param	string		UTF-8 string to search in
	 * @param	string		UTF-8 string to search for
	 * @param	integer		Character position to start the search from
	 * @return	integer		The character position, or the result of mb_strrpos() if not found. Nothing (NULL) if mbstring is not configured.
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function utf_strrpos($haystack,$needle,$offset=0)	{
		$utils = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] : '';
		if ($utils == 'mbstring')	{
				// The charset is the FOURTH argument; the third one is the offset.
			return mb_strrpos($haystack,$needle,intval($offset),'utf-8');
		}
	}
}
670
671 if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']) {
672 include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
673 }
674 ?>