*** empty log message ***
[Packages/TYPO3.CMS.git] / t3lib / class.t3lib_cs.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2003-2004 Kasper Skaarhoj (kasper@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the Typo3 project. The Typo3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 *
17 * This script is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * This copyright notice MUST APPEAR in all copies of the script!
23 ***************************************************************/
24 /**
25 * Class for conversion between charsets.
26 *
27 * $Id$
28 *
29 * @author Kasper Skaarhoj <kasper@typo3.com>
30 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
31 */
32 /**
33 * [CLASS/FUNCTION INDEX of SCRIPT]
34 *
35 *
36 *
37 * 104: class t3lib_cs
38 * 233: function parse_charset($charset)
39 * 250: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
40 * 284: function utf8_encode($str,$charset)
41 * 325: function utf8_decode($str,$charset,$useEntityForNoChar=0)
42 * 373: function utf8_to_entities($str)
43 * 406: function entities_to_utf8($str,$alsoStdHtmlEnt=0)
44 * 437: function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
45 * 478: function initCharset($charset)
46 * 549: function UnumberToChar($cbyte)
47 * 593: function utf8CharToUnumber($str,$hex=0)
48 * 622: function utf8_strtrunc($str,$len)
49 * 662: function utf_strlen($str)
50 * 675: function utf_substr($str,$start,$len=0)
51 * 689: function utf_strpos($haystack,$needle,$offset=0)
52 * 702: function utf_strrpos($haystack,$needle,$offset=0)
53 *
54 * TOTAL FUNCTIONS: 15
55 * (This index is automatically created/updated by the extension "extdeveval")
56 *
57 */
58
59
60
61
62
63
64
65
66 /**
67 * Notes on UTF-8
68 *
69 * Functions working on UTF-8 strings:
70 *
71 * - strchr/strstr
72 * - strrchr
73 * - substr_count
74 * - implode/explode/join
75 *
76 * Functions nearly working on UTF-8 strings:
77 *
78 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf_strlen
79 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
80 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf_strpos/utf_strrpos
81 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
82 *
83 * Functions NOT working on UTF-8 strings:
84 *
85 * - str*cmp
86 * - stristr
87 * - stripos
88 * - substr
89 * - strrev
90 * - ereg/eregi
91 * - split/spliti
92 * - preg_*
93 * - ...
94 *
95 */
96 /**
97 * Class for conversion between charsets.
98 *
99 * @author Kasper Skaarhoj <kasper@typo3.com>
100 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
101 * @package TYPO3
102 * @subpackage t3lib
103 */
class t3lib_cs {
	var $noCharByteVal=127;		// ASCII value emitted for chars with no equivalent in the target charset.

		// This is the array where parsed conversion tables are stored (cached)
		// Structure: [charset] => array('local' => array(localNumber => utf8Char), 'utf8' => array(utf8Char => localNumber))
	var $parsedCharsets=array();

		// This tells the converter which charsets have two bytes per char:
	var $twoByteSets=array(
		'ucs-2'=>1,	// 2-byte Unicode
		'utf-16'=>1	// 2-byte Unicode with surrogates
	);

		// This tells the converter which charsets use the Extended Unix Code scheme:
	var $eucBasedSets=array(
		'gb2312'=>1,	// Chinese, simplified.
	);

		// see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
		// http://czyborra.com/charsets/iso8859.html
	var $synonyms=array(
		'us' => 'ascii',
		'us-ascii'=> 'ascii',
		'cp819' => 'iso-8859-1',
		'ibm819' => 'iso-8859-1',
		'iso-ir-100' => 'iso-8859-1',
		'iso-ir-109' => 'iso-8859-2',
		'iso-ir-148' => 'iso-8859-9',
		'iso-ir-199' => 'iso-8859-14',
		'iso-ir-203' => 'iso-8859-15',
		'csisolatin1' => 'iso-8859-1',
		'csisolatin2' => 'iso-8859-2',
		'csisolatin3' => 'iso-8859-3',
		'csisolatin5' => 'iso-8859-9',
		'csisolatin8' => 'iso-8859-14',
		'csisolatin9' => 'iso-8859-15',
		'csisolatingreek' => 'iso-8859-7',
		'iso-celtic' => 'iso-8859-14',
		'latin1' => 'iso-8859-1',
		'latin2' => 'iso-8859-2',
		'latin3' => 'iso-8859-3',
		'latin5' => 'iso-8859-9',
		'latin6' => 'iso-8859-10',
		'latin8' => 'iso-8859-14',
		'latin9' => 'iso-8859-15',
		'l1' => 'iso-8859-1',
		'l2' => 'iso-8859-2',
		'l3' => 'iso-8859-3',
		'l5' => 'iso-8859-9',
		'l6' => 'iso-8859-10',
		'l8' => 'iso-8859-14',
		'l9' => 'iso-8859-15',
		'cyrillic' => 'iso-8859-5',
		'arabic' => 'iso-8859-6',
		'win874' => 'windows-874',
		'win1250' => 'windows-1250',
		'win1251' => 'windows-1251',
		'win1252' => 'windows-1252',
		'win1253' => 'windows-1253',
		'win1254' => 'windows-1254',
		'win1255' => 'windows-1255',
		'win1256' => 'windows-1256',
		'win1257' => 'windows-1257',
		'win1258' => 'windows-1258',
		'cp1250' => 'windows-1250',
		'cp1252' => 'windows-1252',
		'ms-ee' => 'windows-1250',
		'ms-ansi' => 'windows-1252',
		'ms-greek' => 'windows-1253',
		'ms-turk' => 'windows-1254',
		'winbaltrim' => 'windows-1257',
		'mac' => 'macRoman',
		'macintosh' => 'macRoman',
		'euc-cn' => 'gb2312',
		'x-euc-cn' => 'gb2312',
		'utf8' => 'utf-8',
		'utf-2' => 'utf-8',
		'utf2' => 'utf-8',
	);
	/*
		JIS X 0208 (euc-jp)
		CNS 11643 (EUC-TW)
		KS C 5601 (EUC-KR)
	*/

		// TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
		// Empty values mean "iso-8859-1"
	var $charSetArray = array(
		'dk' => '',
		'de' => '',
		'no' => '',
		'it' => '',
		'fr' => '',
		'es' => '',
		'nl' => '',
		'cz' => 'windows-1250',
		'pl' => 'iso-8859-2',
		'si' => 'windows-1250',
		'fi' => '',
		'tr' => 'iso-8859-9',
		'se' => '',
		'pt' => '',
		'ru' => 'windows-1251',
		'ro' => 'iso-8859-2',
		'ch' => 'gb2312',
		'sk' => 'windows-1250',
		'lt' => 'windows-1257',
		'is' => 'utf-8',
		'hr' => 'windows-1250',
		'hu' => 'iso-8859-2',
		'gl' => '',
		'th' => 'iso-8859-11',
		'gr' => 'iso-8859-7',
		'hk' => 'big5',
		'eu' => '',
		'bg' => 'windows-1251',
		'br' => '',
		'et' => 'iso-8859-4',
		'ar' => 'iso-8859-6',
		'he' => 'utf-8',
		'ua' => 'windows-1251',
	);

	/**
	 * Normalizes a charset name: lowercases it and, if the lowercased name is a key in $this->synonyms, replaces it by the mapped name.
	 *
	 * @param	string		Input charset
	 * @return	string		Normalized charset
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function parse_charset($charset) {
		$charset = strtolower($charset);
		if (isset($this->synonyms[$charset])) {
			$charset = $this->synonyms[$charset];
		}

		return $charset;
	}


	/**
	 * Convert from one charset to another charset.
	 * If $TYPO3_CONF_VARS['SYS']['t3lib_cs_convMethod'] selects 'iconv', 'recode' or 'mbstring' that extension is tried first
	 * (only when no entity fallback is requested); if it reports failure the built-in table based conversion via UTF-8 is used.
	 *
	 * @param	string		Input string
	 * @param	string		From charset (the current charset of the string)
	 * @param	string		To charset (the output charset wanted)
	 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
	 * @return	string		Converted string
	 */
	function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) {
		global $TYPO3_CONF_VARS;

		if ($fromCS==$toCS) return $str;

		if (!$useEntityForNoChar) {	// iconv and recode don't support fallback to SGML entities
				// Read the setting defensively; the key is optional.
			$convMethod = isset($TYPO3_CONF_VARS['SYS']['t3lib_cs_convMethod']) ? (string)$TYPO3_CONF_VARS['SYS']['t3lib_cs_convMethod'] : '';

			if ($convMethod === 'iconv') {
					// iconv() takes (from, to, string) - the string is the LAST argument.
				$conv_str = iconv($fromCS,$toCS.'//TRANSLIT',$str);
				if (false !== $conv_str) return $conv_str;
			} elseif ($convMethod === 'recode') {
					// A recode request reads "BEFORE..AFTER", i.e. source charset first.
				$conv_str = recode_string($fromCS.'..'.$toCS,$str);
				if (false !== $conv_str) return $conv_str;
			} elseif ($convMethod === 'mbstring') {
				$conv_str = mb_convert_encoding($str,$toCS,$fromCS);
				if (false !== $conv_str) return $conv_str;	// returns false for unsupported charsets
			}
				// fallback to TYPO3 conversion
		}

		if ($fromCS!='utf-8') $str=$this->utf8_encode($str,$fromCS);
		if ($toCS!='utf-8') $str=$this->utf8_decode($str,$toCS,$useEntityForNoChar);
		return $str;
	}


	/**
	 * Converts $str from $charset to UTF-8
	 * Returns nothing (NULL) if the conversion table for $charset cannot be initialized.
	 *
	 * @param	string		String in local charset to convert to UTF-8
	 * @param	string		Charset, lowercase. Must be found in csconvtbl/ folder.
	 * @return	string		Output string, converted to UTF-8
	 */
	function utf8_encode($str,$charset) {

		if ($this->initCharset($charset)) {	// Parse conv. table if not already...
			$strLen = strlen($str);
			$outStr = '';
			$isTwoByteSet = !empty($this->twoByteSets[$charset]);
			$isEucSet = !empty($this->eucBasedSets[$charset]);

			for ($a=0;$a<$strLen;$a++) {	// Traverse each byte in string.
				$chr = substr($str,$a,1);
				$ord = ord($chr);

				if ($isTwoByteSet) {	// If the charset has two bytes per char
						// Combine this byte and the following one into a 16 bit number (big endian assumed) and skip the second byte.
					$ord2 = ord(substr($str,$a+1,1));
					$ord = ($ord<<8) | $ord2;
					$a++;

					if (isset($this->parsedCharsets[$charset]['local'][$ord])) {	// If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?)
						$outStr.= $this->parsedCharsets[$charset]['local'][$ord];
					} else {
						$outStr.= chr($this->noCharByteVal);	// No char exists
					}
				} elseif ($ord>127) {	// Values over 127 are not ASCII and must be looked up in the table
					if ($isEucSet) {	// EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
						$a++;
						$ord2 = ord(substr($str,$a,1));
						$ord = $ord*256+$ord2;
					}

					if (isset($this->parsedCharsets[$charset]['local'][$ord])) {	// If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?)
						$outStr.= $this->parsedCharsets[$charset]['local'][$ord];
					} else {
						$outStr.= chr($this->noCharByteVal);	// No char exists
					}
				} else {
					$outStr.= $chr;	// ... otherwise it's just ASCII 0-127 and one byte. Transparent
				}
			}
			return $outStr;
		}
	}

	/**
	 * Converts $str from UTF-8 to $charset
	 * Returns nothing (NULL) if the conversion table for $charset cannot be initialized.
	 *
	 * @param	string		String in UTF-8 to convert to local charset
	 * @param	string		Charset, lowercase. Must be found in csconvtbl/ folder.
	 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
	 * @return	string		Output string, converted to local charset
	 */
	function utf8_decode($str,$charset,$useEntityForNoChar=0) {

		if ($this->initCharset($charset)) {	// Parse conv. table if not already...
			$strLen = strlen($str);
			$outStr = '';
			$buf = '';
			for ($a=0;$a<$strLen;$a++) {	// Traverse each byte in UTF-8 string.
				$chr = substr($str,$a,1);
				$ord = ord($chr);
				if ($ord>127) {	// This means multibyte! (first byte!)
					if ($ord & 64) {	// Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.

						$buf = $chr;	// Add first byte
						for ($b=0;$b<8;$b++) {	// for each byte in multibyte string...
							$ord = $ord << 1;	// Shift it left and ...
							if ($ord & 128) {	// ... and with 8th bit - if that is set, then there are still bytes in sequence.
								$a++;	// Increase pointer...
								$buf.= substr($str,$a,1);	// ... and add the next char.
							} else break;
						}

						if (isset($this->parsedCharsets[$charset]['utf8'][$buf])) {	// If the UTF-8 char-sequence is found then...
							$mByte = $this->parsedCharsets[$charset]['utf8'][$buf];	// The local number
							if ($mByte>255) {	// If the local number is greater than 255 we will need to split the byte (16bit word assumed) in two chars.
								$outStr.= chr(($mByte >> 8) & 255).chr($mByte & 255);
							} else {
								$outStr.= chr($mByte);
							}
						} elseif ($useEntityForNoChar) {	// Create num entity:
							$outStr.= '&#'.$this->utf8CharToUnumber($buf,1).';';
						} else {
							$outStr.= chr($this->noCharByteVal);	// No char exists
						}
					} else {
						$outStr.= chr($this->noCharByteVal);	// No char exists (MIDDLE of MB sequence!)
					}
				} else {
					$outStr.= $chr;	// ... otherwise it's just ASCII 0-127 and one byte. Transparent
				}
			}
			return $outStr;
		}
	}

	/**
	 * Converts all chars > 127 to numeric (hexadecimal) entities.
	 *
	 * @param	string		Input string
	 * @return	string		Output string
	 */
	function utf8_to_entities($str) {
		$strLen = strlen($str);
		$outStr = '';
		$buf = '';
		for ($a=0;$a<$strLen;$a++) {	// Traverse each byte in UTF-8 string.
			$chr = substr($str,$a,1);
			$ord = ord($chr);
			if ($ord>127) {	// This means multibyte! (first byte!)
				if ($ord & 64) {	// Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
					$buf = $chr;	// Add first byte
					for ($b=0;$b<8;$b++) {	// for each byte in multibyte string...
						$ord = $ord << 1;	// Shift it left and ...
						if ($ord & 128) {	// ... and with 8th bit - if that is set, then there are still bytes in sequence.
							$a++;	// Increase pointer...
							$buf.= substr($str,$a,1);	// ... and add the next char.
						} else break;
					}

					$outStr.= '&#'.$this->utf8CharToUnumber($buf,1).';';
				} else {
					$outStr.= chr($this->noCharByteVal);	// No char exists (MIDDLE of MB sequence!)
				}
			} else {
				$outStr.= $chr;	// ... otherwise it's just ASCII 0-127 and one byte. Transparent
			}
		}

		return $outStr;
	}

	/**
	 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
	 * Entities that are neither well-formed numeric entities nor (when requested) known named entities are left untouched.
	 *
	 * @param	string		Input string, UTF-8
	 * @param	boolean		If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
	 * @return	string		Output string
	 */
	function entities_to_utf8($str,$alsoStdHtmlEnt=0) {
		$trans_tbl = array();
		if ($alsoStdHtmlEnt) {
				// The table values are passed through utf8_encode(..., 'iso-8859-1') below, so they must be ISO-8859-1.
				// The charset argument exists since PHP 5.3.4 and the default became UTF-8 in PHP 5.4; older versions always return ISO-8859-1.
			if (version_compare(PHP_VERSION,'5.3.4','>=')) {
				$entityTable = get_html_translation_table(HTML_ENTITIES,ENT_COMPAT,'ISO-8859-1');
			} else {
				$entityTable = get_html_translation_table(HTML_ENTITIES);
			}
			$trans_tbl = array_flip($entityTable);
		}

			// Split on entities; with PREG_SPLIT_DELIM_CAPTURE every odd element is the entity body (text between "&" and ";").
		$parts = preg_split('/&([#[:alnum:]]*);/',$str,-1,PREG_SPLIT_DELIM_CAPTURE);
		foreach($parts as $k => $v) {
			if ($k%2) {
				$reg = array();
				if (preg_match('/^#x([0-9a-f]+)$/i',$v,$reg)) {	// Hex entity: convert the hex digits directly to the code point
					$parts[$k] = $this->UnumberToChar(hexdec($reg[1]));
				} elseif (preg_match('/^#([0-9]+)$/',$v,$reg)) {	// Decimal entity
					$parts[$k] = $this->UnumberToChar(intval($reg[1]));
				} elseif ($alsoStdHtmlEnt && isset($trans_tbl['&'.$v.';'])) {	// Other entities:
					$parts[$k] = $this->utf8_encode($trans_tbl['&'.$v.';'],'iso-8859-1');
				} else {	// No conversion:
					$parts[$k] = '&'.$v.';';
				}
			}
		}

		return implode('',$parts);
	}

	/**
	 * Converts all chars in the input UTF-8 string into integer numbers returned in an array
	 *
	 * @param	string		Input string, UTF-8
	 * @param	boolean		If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
	 * @param	boolean		If set, then instead of integer numbers the real UTF-8 char is returned.
	 * @return	array		Output array with the char numbers
	 */
	function utf8_to_numberarray($str,$convEntities=0,$retChar=0) {
			// If entities must be registered as well...:
		if ($convEntities) {
			$str = $this->entities_to_utf8($str,1);
		}
			// Do conversion:
		$strLen = strlen($str);
		$outArr = array();
		$buf = '';
		for ($a=0;$a<$strLen;$a++) {	// Traverse each byte in UTF-8 string.
			$chr = substr($str,$a,1);
			$ord = ord($chr);
			if ($ord>127) {	// This means multibyte! (first byte!)
				if ($ord & 64) {	// Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
					$buf = $chr;	// Add first byte
					for ($b=0;$b<8;$b++) {	// for each byte in multibyte string...
						$ord = $ord << 1;	// Shift it left and ...
						if ($ord & 128) {	// ... and with 8th bit - if that is set, then there are still bytes in sequence.
							$a++;	// Increase pointer...
							$buf.= substr($str,$a,1);	// ... and add the next char.
						} else break;
					}

					$outArr[] = $retChar ? $buf : $this->utf8CharToUnumber($buf);
				} else {
					$outArr[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;	// No char exists (MIDDLE of MB sequence!)
				}
			} else {
				$outArr[] = $retChar ? chr($ord) : $ord;	// ... otherwise it's just ASCII 0-127 and one byte. Transparent
			}
		}

		return $outArr;
	}

	/**
	 * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
	 * This function is automatically called by the conversion functions
	 * Only table entries with a local value above 127 are registered; lower values are passed through unchanged by the converters.
	 *
	 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
	 *
	 * @param	string		The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
	 * @return	integer		Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed.
	 * @access private
	 */
	function initCharset($charset) {
			// Nothing to do if the charset is already loaded:
		if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
			return 1;
		}

			// Conversion table filename:
		$charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';

			// If the conversion table is not found:
		if (!($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile))) {
			return false;
		}

			// Cache file for charsets:
			// Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
		$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/charset_'.$charset.'.tbl');
		$parsed = false;
		if ($cacheFile && @is_file($cacheFile)) {
				// NOTE(review): unserialize() is only acceptable here because the cache file is written by this method itself; typo3temp/ must not be writable by untrusted parties.
			$parsed = unserialize(t3lib_div::getUrl($cacheFile));
		}

			// No cache, or the cache did not contain a usable array: parse the table and (re)write the cache.
		if (!is_array($parsed)) {
				// Parse conversion table into lines:
			$lines = t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1);
				// Initialize the variable holding the conv. table:
			$parsed = array('local'=>array(),'utf8'=>array());
				// The "whitespaced" type is on the syntax "0x0A	0x000A	#LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
			$whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
			$detectedType = '';

			foreach($lines as $value) {
				if (!trim($value) || substr($value,0,1)=='#') continue;	// Comment line or blanks are ignored.

					// Detect type if not done yet: (Done on first real line)
				if (!$detectedType) {
					$detectedType = preg_match($whitespacedPattern,$value) ? 'whitespaced' : 'ms-token';
				}

				$hexbyte = '';
				$utf8 = '';
				if ($detectedType=='ms-token') {
					$fields = preg_split('/=|:/',$value,3);
					$hexbyte = $fields[0];
					if (isset($fields[1])) $utf8 = $fields[1];
				} elseif ($detectedType=='whitespaced') {
					$regA = array();
					if (preg_match($whitespacedPattern,$value,$regA)) {
						$hexbyte = $regA[1];
						$utf8 = 'U+'.$regA[2];
					}
				}

				$decval = hexdec(trim($hexbyte));
				if ($decval>127) {
					$utf8decval = hexdec(substr(trim($utf8),2));	// Strip the leading "U+"
					$utf8Char = $this->UnumberToChar($utf8decval);
					$parsed['local'][$decval] = $utf8Char;
					$parsed['utf8'][$utf8Char] = $decval;
				}
			}

			if ($cacheFile) {
				t3lib_div::writeFile($cacheFile,serialize($parsed));
			}
		}

		$this->parsedCharsets[$charset] = $parsed;
		return 2;
	}

	/**
	 * Converts a UNICODE number to a UTF-8 multibyte character
	 * Algorithm based on script found at From: http://czyborra.com/utf/
	 *
	 * The binary representation of the character's integer value is thus simply spread across the bytes and the number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
	 *
	 *  bytes | bits | representation
	 *      1 |    7 | 0vvvvvvv
	 *      2 |   11 | 110vvvvv 10vvvvvv
	 *      3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
	 *      4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
	 *      5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
	 *      6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
	 *
	 * @param	integer		UNICODE integer
	 * @return	string		UTF-8 multibyte character string
	 * @see utf8CharToUnumber()
	 */
	function UnumberToChar($cbyte) {
		$str = '';

		if ($cbyte < 0x80) {
			$str.= chr($cbyte);
		} elseif ($cbyte < 0x800) {
			$str.= chr(0xC0 | ($cbyte >> 6));
			$str.= chr(0x80 | ($cbyte & 0x3F));
		} elseif ($cbyte < 0x10000) {
			$str.= chr(0xE0 | ($cbyte >> 12));
			$str.= chr(0x80 | (($cbyte >> 6) & 0x3F));
			$str.= chr(0x80 | ($cbyte & 0x3F));
		} elseif ($cbyte < 0x200000) {
			$str.= chr(0xF0 | ($cbyte >> 18));
			$str.= chr(0x80 | (($cbyte >> 12) & 0x3F));
			$str.= chr(0x80 | (($cbyte >> 6) & 0x3F));
			$str.= chr(0x80 | ($cbyte & 0x3F));
		} elseif ($cbyte < 0x4000000) {
			$str.= chr(0xF8 | ($cbyte >> 24));
			$str.= chr(0x80 | (($cbyte >> 18) & 0x3F));
			$str.= chr(0x80 | (($cbyte >> 12) & 0x3F));
			$str.= chr(0x80 | (($cbyte >> 6) & 0x3F));
			$str.= chr(0x80 | ($cbyte & 0x3F));
		} elseif ($cbyte < 0x80000000) {
			$str.= chr(0xFC | ($cbyte >> 30));
			$str.= chr(0x80 | (($cbyte >> 24) & 0x3F));
			$str.= chr(0x80 | (($cbyte >> 18) & 0x3F));
			$str.= chr(0x80 | (($cbyte >> 12) & 0x3F));
			$str.= chr(0x80 | (($cbyte >> 6) & 0x3F));
			$str.= chr(0x80 | ($cbyte & 0x3F));
		} else {	// Cannot express a 32-bit character in UTF-8
			$str.= chr($this->noCharByteVal);
		}
		return $str;
	}

	/**
	 * Converts a UTF-8 Multibyte character to a UNICODE number
	 *
	 * @param	string		UTF-8 multibyte character string
	 * @param	boolean		If set, then a hex. number is returned (as a string prefixed with "x", ready for use in "&#...;").
	 * @return	integer		UNICODE integer
	 * @see UnumberToChar()
	 */
	function utf8CharToUnumber($str,$hex=0) {
		$ord = ord(substr($str,0,1));	// First char

		if (($ord & 192) == 192) {	// This verifies that it IS a multi byte string
			$binBuf = '';
			for ($b=0;$b<8;$b++) {	// for each byte in multibyte string...
				$ord = $ord << 1;	// Shift it left and ...
				if ($ord & 128) {	// ... and with 8th bit - if that is set, then there are still bytes in sequence.
						// Each continuation byte contributes its lower 6 bits
					$binBuf.= substr('00000000'.decbin(ord(substr($str,$b+1,1))),-6);
				} else break;
			}
				// The lead byte contributes the bits that are left after its length marker
			$binBuf = substr('00000000'.decbin(ord(substr($str,0,1))),-(6-$b)).$binBuf;

			$int = bindec($binBuf);
		} else {
			$int = $ord;
		}

		return $hex ? 'x'.dechex($int) : $int;
	}


	/**
	 * Truncates a string in UTF-8 short at a given byte length
	 * The result never exceeds $len bytes and never ends in the middle of a multibyte character.
	 *
	 * @param	string		UTF-8 multibyte character string
	 * @param	integer		the byte length
	 * @return	string		the shortened string
	 * @see strcut()
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function utf8_strtrunc($str,$len) {
		if (isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) && $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
			return mb_strcut($str,0,$len,'utf-8');
		}

			// Boundary cases: nothing to keep, or nothing to cut.
		if ($len <= 0) return '';
		if ($len >= strlen($str)) return $str;

		$i = $len-1;
		if (ord($str[$i]) & 0x80) {	// part of a multibyte sequence
			while ($i > 0 && !(ord($str[$i]) & 0x40)) $i--;	// find the first byte
			for ($bc=0, $mbs=ord($str[$i]); $mbs & 0x80; $mbs = $mbs << 1) $bc++;	// calculate number of bytes
			if ($bc+$i > $len) return substr($str,0,$i);	// the char would be cut in half: drop it completely
				// fallthru: multibyte char fits into length
		}
		return substr($str,0,$len);
	}













	/********************************************
	 *
	 * String operation functions
	 *
	 ********************************************/

	// a few stubs of possibly useful functions, which may be implemented in PHP
	// NOTE: they only do something if $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'] is set to 'mbstring'; otherwise nothing (NULL) is returned.

	/**
	 * Returns the length of a UTF-8 string in characters (mbstring only, see note above).
	 *
	 * @param	string		UTF-8 string
	 * @return	integer		Number of characters
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function utf_strlen($str) {
		if (isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) && $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
			return mb_strlen($str,'utf-8');
		}
	}

	/**
	 * Returns a part of a UTF-8 string, counted in characters (mbstring only, see note above).
	 * If $len is omitted the rest of the string is returned; an explicitly passed 0 returns an empty string.
	 *
	 * @param	string		UTF-8 string
	 * @param	integer		Start position (character position)
	 * @param	integer		Length (in characters)
	 * @return	string		The substring
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function utf_substr($str,$start,$len=0) {
		if (isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) && $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
				// The length cannot be skipped when a charset is passed, so use the full length as "up to the end".
			if (func_num_args() < 3) {
				$len = mb_strlen($str,'utf-8');
			}
			return mb_substr($str,$start,$len,'utf-8');
		}
	}

	/**
	 * Finds the character position of the first occurrence of a string in a UTF-8 string (mbstring only, see note above).
	 *
	 * @param	string		UTF-8 string to search in
	 * @param	string		UTF-8 string to search for
	 * @param	integer		Character position to start the search at
	 * @return	integer		The character position, or false if not found
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function utf_strpos($haystack,$needle,$offset=0) {
		if (isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) && $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
				// The encoding is the FOURTH argument of mb_strpos(); the third is the offset.
			return mb_strpos($haystack,$needle,$offset,'utf-8');
		}
	}

	/**
	 * Finds the character position of the last occurrence of a string in a UTF-8 string (mbstring only, see note above).
	 *
	 * @param	string		UTF-8 string to search in
	 * @param	string		UTF-8 string to search for
	 * @param	integer		Character position to start the search at (ignored before PHP 5.2, where mb_strrpos() has no offset argument)
	 * @return	integer		The character position, or false if not found
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function utf_strrpos($haystack,$needle,$offset=0) {
		if (isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) && $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
				// Since PHP 5.2 the signature is (haystack, needle, offset, encoding); before that it was (haystack, needle, encoding).
			if (version_compare(PHP_VERSION,'5.2.0','>=')) {
				return mb_strrpos($haystack,$needle,$offset,'utf-8');
			}
			return mb_strrpos($haystack,$needle,'utf-8');
		}
	}
}
708
// XCLASS hook: if an extension class file is registered for this script under
// $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS'], include it once. !empty() is used so that a missing
// registration does not raise an undefined-index notice.
if (defined('TYPO3_MODE') && !empty($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php'])) {
	include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
}
712 ?>