General string handling:
[Packages/TYPO3.CMS.git] / t3lib / class.t3lib_cs.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2003-2004 Kasper Skaarhoj (kasper@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the Typo3 project. The Typo3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 *
17 * This script is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * This copyright notice MUST APPEAR in all copies of the script!
23 ***************************************************************/
24 /**
25 * Class for conversion between charsets.
26 *
27 * $Id$
28 *
29 * @author Kasper Skaarhoj <kasper@typo3.com>
30 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
31 */
32 /**
33 * [CLASS/FUNCTION INDEX of SCRIPT]
34 *
35 *
36 *
37 * 108: class t3lib_cs
38 * 237: function parse_charset($charset)
39 * 254: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
40 * 288: function utf8_encode($str,$charset)
41 * 329: function utf8_decode($str,$charset,$useEntityForNoChar=0)
42 * 377: function utf8_to_entities($str)
43 * 410: function entities_to_utf8($str,$alsoStdHtmlEnt=0)
44 * 441: function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
45 * 482: function initCharset($charset)
46 * 553: function UnumberToChar($cbyte)
47 * 597: function utf8CharToUnumber($str,$hex=0)
48 *
49 * SECTION: UTF-8 String operation functions
50 * 648: function utf8_strtrunc($str,$len)
51 * 680: function utf8_substr($str,$start,$len=null)
52 * 719: function utf8_strlen($str)
53 * 745: function utf8_strpos($haystack,$needle,$offset=0)
54 * 768: function utf8_strrpos($haystack,$needle)
55 * 787: function utf8_char2byte_pos($str,$pos)
56 * 812: function utf8_byte2char_pos($str,$pos)
57 *
58 * TOTAL FUNCTIONS: 17
59 * (This index is automatically created/updated by the extension "extdeveval")
60 *
61 */
62
63
64
65
66
67
68
69
70 /**
71 * Notes on UTF-8
72 *
73 * Functions working on UTF-8 strings:
74 *
75 * - strchr/strstr
76 * - strrchr
77 * - substr_count
78 * - implode/explode/join
79 *
80 * Functions nearly working on UTF-8 strings:
81 *
82 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
83 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
84 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
85 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
86 *
87 * Functions NOT working on UTF-8 strings:
88 *
89 * - str*cmp
90 * - stristr
91 * - stripos
92 * - substr
93 * - strrev
94 * - ereg/eregi
95 * - split/spliti
96 * - preg_*
97 * - ...
98 *
99 */
100 /**
101 * Class for conversion between charsets.
102 *
103 * @author Kasper Skaarhoj <kasper@typo3.com>
104 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
105 * @package TYPO3
106 * @subpackage t3lib
107 */
108 class t3lib_cs {
// ASCII value emitted for characters that have no equivalent in the target charset.
var $noCharByteVal=127;

// Cache of parsed conversion tables, keyed by charset name (filled by initCharset()).
var $parsedCharsets=array();

// Charsets that use two bytes per character:
var $twoByteSets=array(
	'ucs-2'=>1,	// 2-byte Unicode
);

// Charsets that use four bytes per character:
var $fourByteSets=array(
	'ucs-4'=>1,	// 4-byte Unicode
	'utf-32'=>1,	// 4-byte Unicode (limited to the 21-bits of UTF-16)
);

// Charsets that use a scheme like the Extended Unix Code:
var $eucBasedSets=array(
	'gb2312'=>1,	// Chinese, simplified.
	'big5'=>1,	// Chinese, traditional.
);

// Alias => canonical charset name, looked up by parse_charset() after lowercasing the input.
// see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
// http://czyborra.com/charsets/iso8859.html
var $synonyms=array(
	'us' => 'ascii',
	'us-ascii'=> 'ascii',
	'cp819' => 'iso-8859-1',
	'ibm819' => 'iso-8859-1',
	'iso-ir-100' => 'iso-8859-1',
	'iso-ir-109' => 'iso-8859-2',
	'iso-ir-148' => 'iso-8859-9',
	'iso-ir-199' => 'iso-8859-14',
	'iso-ir-203' => 'iso-8859-15',
	'csisolatin1' => 'iso-8859-1',
	'csisolatin2' => 'iso-8859-2',
	'csisolatin3' => 'iso-8859-3',
	'csisolatin5' => 'iso-8859-9',
	'csisolatin8' => 'iso-8859-14',
	'csisolatin9' => 'iso-8859-15',
	'csisolatingreek' => 'iso-8859-7',
	'iso-celtic' => 'iso-8859-14',
	'latin1' => 'iso-8859-1',
	'latin2' => 'iso-8859-2',
	'latin3' => 'iso-8859-3',
	'latin5' => 'iso-8859-9',
	'latin6' => 'iso-8859-10',
	'latin8' => 'iso-8859-14',
	'latin9' => 'iso-8859-15',
	'l1' => 'iso-8859-1',
	'l2' => 'iso-8859-2',
	'l3' => 'iso-8859-3',
	'l5' => 'iso-8859-9',
	'l6' => 'iso-8859-10',
	'l8' => 'iso-8859-14',
	'l9' => 'iso-8859-15',
	'cyrillic' => 'iso-8859-5',
	'arabic' => 'iso-8859-6',
	'win874' => 'windows-874',
	'win1250' => 'windows-1250',
	'win1251' => 'windows-1251',
	'win1252' => 'windows-1252',
	'win1253' => 'windows-1253',
	'win1254' => 'windows-1254',
	'win1255' => 'windows-1255',
	'win1256' => 'windows-1256',
	'win1257' => 'windows-1257',
	'win1258' => 'windows-1258',
	'cp1250' => 'windows-1250',
	'cp1252' => 'windows-1252',
	'ms-ee' => 'windows-1250',
	'ms-ansi' => 'windows-1252',
	'ms-greek' => 'windows-1253',
	'ms-turk' => 'windows-1254',
	'winbaltrim' => 'windows-1257',
	'mac' => 'macRoman',
	'macintosh' => 'macRoman',
	'euc-cn' => 'gb2312',
	'x-euc-cn' => 'gb2312',
	'cp936' => 'gb2312',
	'big-5' => 'big5',
	'cp950' => 'big5',
	'sjis' => 'shift_jis',
	'shift-jis' => 'shift_jis',
	'cp932' => 'shift_jis',
	'utf7' => 'utf-7',
	'utf8' => 'utf-8',
	'utf16' => 'utf-16',
	'utf32' => 'utf-32',
	'ucs2' => 'ucs-2',
	'ucs4' => 'ucs-4',
);

// TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
// Empty values mean "iso-8859-1"
var $charSetArray = array(
	'dk' => '',
	'de' => '',
	'no' => '',
	'it' => '',
	'fr' => '',
	'es' => '',
	'nl' => '',
	'cz' => 'windows-1250',
	'pl' => 'iso-8859-2',
	'si' => 'windows-1250',
	'fi' => '',
	'tr' => 'iso-8859-9',
	'se' => '',
	'pt' => '',
	'ru' => 'windows-1251',
	'ro' => 'iso-8859-2',
	'ch' => 'gb2312',
	'sk' => 'windows-1250',
	'lt' => 'windows-1257',
	'is' => 'utf-8',
	'hr' => 'windows-1250',
	'hu' => 'iso-8859-2',
	'gl' => '',
	'th' => 'iso-8859-11',
	'gr' => 'iso-8859-7',
	'hk' => 'big5',
	'eu' => '',
	'bg' => 'windows-1251',
	'br' => '',
	'et' => 'iso-8859-4',
	'ar' => 'iso-8859-6',
	'he' => 'utf-8',
	'ua' => 'windows-1251',
);
240
/**
 * Normalizes a charset name: lowercases it and resolves known aliases.
 *
 * @param	string		Input charset
 * @return	string		The canonical name from $this->synonyms if the lowercased input is a known alias, otherwise the lowercased input itself
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function parse_charset($charset)	{
	$normalized = strtolower($charset);

	return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
}
254
255
/**
 * Convert from one charset to another charset.
 *
 * If $TYPO3_CONF_VARS['SYS']['t3lib_cs_convMethod'] names an available extension
 * ('iconv', 'recode' or 'mbstring') and no entity fallback is requested, that
 * extension is tried first. When it is not configured, not loaded or fails, the
 * string is converted via UTF-8 with the conversion tables of this class.
 *
 * @param	string		Input string
 * @param	string		From charset (the current charset of the string)
 * @param	string		To charset (the output charset wanted)
 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return	string		Converted string
 */
function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)	{
	global $TYPO3_CONF_VARS;

	if ($fromCS==$toCS)	return $str;

	if (!$useEntityForNoChar)	{	// iconv, recode and mbstring don't support fallback to SGML entities
		$method = isset($TYPO3_CONF_VARS['SYS']['t3lib_cs_convMethod']) ? $TYPO3_CONF_VARS['SYS']['t3lib_cs_convMethod'] : '';
		$conv_str = false;

		if ($method == 'iconv')	{
				// Argument order is (from charset, to charset, string)
			if (function_exists('iconv'))	$conv_str = iconv($fromCS,$toCS.'//TRANSLIT',$str);
		} elseif ($method == 'recode')	{
				// A recode request reads "from..to"
			if (function_exists('recode_string'))	$conv_str = recode_string($fromCS.'..'.$toCS,$str);
		} elseif ($method == 'mbstring')	{
			if (function_exists('mb_convert_encoding'))	$conv_str = mb_convert_encoding($str,$toCS,$fromCS);
		}

		if (false !== $conv_str)	return $conv_str;
			// otherwise: fallback to TYPO3 conversion
	}

	if ($fromCS!='utf-8')	$str = $this->utf8_encode($str,$fromCS);
	if ($toCS!='utf-8')	$str = $this->utf8_decode($str,$toCS,$useEntityForNoChar);
	return $str;
}
290
291
/**
 * Converts $str from $charset to UTF-8
 *
 * Bytes 0-127 of single-byte and EUC-like charsets are passed through unchanged.
 * Everything else is looked up in the parsed conversion table; characters that
 * are not found are replaced by chr($this->noCharByteVal).
 *
 * @param	string		String in local charset to convert to UTF-8
 * @param	string		Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return	string		Output string, converted to UTF-8. Nothing (NULL) is returned if the charset could not be initialized.
 */
function utf8_encode($str,$charset)	{
	if (!$this->initCharset($charset))	return;	// No conversion table available

	$isTwoByte = !empty($this->twoByteSets[$charset]);
	$isEuc = !empty($this->eucBasedSets[$charset]);
	$isShiftJis = ($charset == 'shift_jis');
	$noChar = chr($this->noCharByteVal);

	$strLen = strlen($str);
	$outStr = '';

	for ($a=0; $a<$strLen; $a++)	{	// Traverse each char in string.
		$chr = substr($str,$a,1);
		$ord = ord($chr);

		if ($isTwoByte)	{	// Charset has two bytes per char: combine both bytes, assuming big endian
			$a++;
			$ord = ($ord << 8) | ord(substr($str,$a,1));
			if ($ord < 128)	{	// The parsed table holds no entries below 128; that range is plain ASCII
				$outStr .= chr($ord);
				continue;
			}
		} elseif ($ord > 127)	{
				// EUC uses two bytes above 127. Shift-JIS is like EUC, but chars between 160 and 223 are single byte.
			if ($isEuc || ($isShiftJis && ($ord < 160 || $ord > 223)))	{
				$a++;
				$ord = $ord*256 + ord(substr($str,$a,1));
			}
		} else {	// ASCII 0-127 and one byte. Transparent
			$outStr .= $chr;
			continue;
		}

			// Use the UTF-8 sequence from the parsed conv. table if the local char-number is known
		if (isset($this->parsedCharsets[$charset]['local'][$ord]))	{
			$outStr .= $this->parsedCharsets[$charset]['local'][$ord];
		} else {
			$outStr .= $noChar;	// No char exists
		}
	}

	return $outStr;
}
337
/**
 * Converts $str from UTF-8 to $charset
 *
 * @param	string		String in UTF-8 to convert to local charset
 * @param	string		Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return	string		Output string, converted to local charset. Nothing (NULL) is returned if the charset could not be initialized.
 */
function utf8_decode($str,$charset,$useEntityForNoChar=0)	{
	if (!$this->initCharset($charset))	return;	// No conversion table available

	$length = strlen($str);
	$result = '';

	for ($pos=0; $pos<$length; $pos++)	{	// Traverse each byte in UTF-8 string.
		$byte = substr($str,$pos,1);
		$code = ord($byte);

		if ($code <= 127)	{	// ASCII 0-127 and one byte. Transparent
			$result .= $byte;
			continue;
		}
		if (!($code & 64))	{	// A continuation byte without a lead byte (MIDDLE of MB sequence!)
			$result .= chr($this->noCharByteVal);
			continue;
		}

			// Collect the sequence: every further 1-bit after the top bit of the lead byte announces one more byte.
		$sequence = $byte;
		for ($bits = $code << 1; $bits & 128; $bits = $bits << 1)	{
			$pos++;
			$sequence .= substr($str,$pos,1);
		}

		if (isset($this->parsedCharsets[$charset]['utf8'][$sequence]))	{
			$local = $this->parsedCharsets[$charset]['utf8'][$sequence];	// The local number
				// Numbers above 255 are written as two bytes (16 bit word assumed)
			$result .= ($local > 255) ? chr(($local >> 8) & 255).chr($local & 255) : chr($local);
		} elseif ($useEntityForNoChar)	{	// Create num entity:
			$result .= '&#'.$this->utf8CharToUnumber($sequence,1).';';
		} else {
			$result .= chr($this->noCharByteVal);	// No char exists
		}
	}

	return $result;
}
387
/**
 * Converts all chars > 127 to numeric entities.
 *
 * Stray continuation bytes (not preceded by a lead byte) are replaced by chr($this->noCharByteVal).
 *
 * @param	string		Input string, UTF-8
 * @return	string		Output string
 */
function utf8_to_entities($str)	{
	$length = strlen($str);
	$result = '';

	for ($pos=0; $pos<$length; $pos++)	{	// Traverse each byte in UTF-8 string.
		$byte = substr($str,$pos,1);
		$code = ord($byte);

		if ($code <= 127)	{	// ASCII 0-127 and one byte. Transparent
			$result .= $byte;
		} elseif (!($code & 64))	{	// MIDDLE of MB sequence!
			$result .= chr($this->noCharByteVal);
		} else {
				// Every further 1-bit after the top bit of the lead byte announces one more byte.
			$sequence = $byte;
			for ($bits = $code << 1; $bits & 128; $bits = $bits << 1)	{
				$pos++;
				$sequence .= substr($str,$pos,1);
			}
			$result .= '&#'.$this->utf8CharToUnumber($sequence,1).';';
		}
	}

	return $result;
}
419
/**
 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
 *
 * Numeric entities that are not well-formed (e.g. "&#zz;") and unknown named entities are left untouched.
 *
 * @param	string		Input string, UTF-8
 * @param	boolean		If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
 * @return	string		Output string
 */
function entities_to_utf8($str,$alsoStdHtmlEnt=0)	{
	$trans_tbl = array();
	if ($alsoStdHtmlEnt)	{
			// The table values are fed through utf8_encode(...,'iso-8859-1') below, so they must be
			// ISO-8859-1. Newer PHP versions default to UTF-8 here; the charset argument exists since PHP 5.3.4.
		$table = version_compare(PHP_VERSION,'5.3.4','>=')
			? get_html_translation_table(HTML_ENTITIES,ENT_COMPAT,'ISO-8859-1')
			: get_html_translation_table(HTML_ENTITIES);
		$trans_tbl = array_flip($table);
	}

		// Odd-numbered parts hold the entity names (the text between "&" and ";")
	$parts = preg_split('/&([#[:alnum:]]*);/',$str,-1,PREG_SPLIT_DELIM_CAPTURE);
	foreach ($parts as $k => $v)	{
		if (!($k%2))	continue;	// Plain text between entities

		$entity = '&'.$v.';';
		$match = array();
		if (substr($v,0,1)=='#')	{	// Dec or hex entities:
			$number = substr($v,1);
			if (preg_match('/^[xX]([0-9a-fA-F]+)$/',$number,$match))	{
				$parts[$k] = $this->UnumberToChar(hexdec($match[1]));
			} elseif (preg_match('/^[0-9]+$/',$number))	{
				$parts[$k] = $this->UnumberToChar((int)$number);
			} else {	// Not a valid number: no conversion
				$parts[$k] = $entity;
			}
		} elseif ($alsoStdHtmlEnt && isset($trans_tbl[$entity]))	{	// Other entities:
			$parts[$k] = $this->utf8_encode($trans_tbl[$entity],'iso-8859-1');
		} else {	// No conversion:
			$parts[$k] = $entity;
		}
	}

	return implode('',$parts);
}
449
/**
 * Converts all chars in the input UTF-8 string into integer numbers returned in an array
 *
 * @param	string		Input string, UTF-8
 * @param	boolean		If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
 * @param	boolean		If set, then instead of integer numbers the real UTF-8 char is returned.
 * @return	array		Output array with the char numbers (or the chars themselves if $retChar is set)
 */
function utf8_to_numberarray($str,$convEntities=0,$retChar=0)	{
		// If entities must be registered as well...:
	if ($convEntities)	{
		$str = $this->entities_to_utf8($str,1);
	}

	$numbers = array();
	$length = strlen($str);

	for ($pos=0; $pos<$length; $pos++)	{	// Traverse each byte in UTF-8 string.
		$byte = substr($str,$pos,1);
		$code = ord($byte);

		if ($code <= 127)	{	// ASCII 0-127 and one byte. Transparent
			$numbers[] = $retChar ? chr($code) : $code;
		} elseif (!($code & 64))	{	// MIDDLE of MB sequence!
			$numbers[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
		} else {
				// Every further 1-bit after the top bit of the lead byte announces one more byte.
			$sequence = $byte;
			for ($bits = $code << 1; $bits & 128; $bits = $bits << 1)	{
				$pos++;
				$sequence .= substr($str,$pos,1);
			}
			$numbers[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
		}
	}

	return $numbers;
}
488
/**
 * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
 * This function is automatically called by the conversion functions
 *
 * The parsed table is cached as a serialized array in typo3temp/charset_[charset].tbl.
 * A cache file that does not unserialize to an array is ignored and rebuilt.
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param	string		The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return	integer		Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed.
 * @access private
 */
function initCharset($charset)	{
		// Only process if the charset is not yet loaded:
	if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset]))	return 1;

		// Conversion table filename:
	$charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';
	if (!$charset || !t3lib_div::validPathStr($charsetConvTableFile) || !@is_file($charsetConvTableFile))	return false;

		// Cache file for charsets:
		// Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/charset_'.$charset.'.tbl');
	$parsed = false;
	if ($cacheFile && @is_file($cacheFile))	{
			// NOTE(review): unserialize() trusts the content of the cache file; typo3temp/ must not be writable by untrusted parties.
		$parsed = unserialize(t3lib_div::getUrl($cacheFile));
	}

	if (!is_array($parsed))	{	// No usable cache: parse the conversion table
		$parsed = array('local'=>array(),'utf8'=>array());
		$lines = t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1);

			// The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
		$whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
		$detectedType = '';

		foreach ($lines as $value)	{
			if (!trim($value) || substr($value,0,1)=='#')	continue;	// Comment lines or blanks are ignored.

				// Detect type if not done yet: (Done on first real line)
			if (!$detectedType)	$detectedType = preg_match($whitespacedPattern,$value) ? 'whitespaced' : 'ms-token';

			$hexbyte = '';
			$utf8 = '';
			if ($detectedType=='ms-token')	{
				list($hexbyte,$utf8) = array_pad(preg_split('/=|:/',$value,3),2,'');
			} else {
				$regA = array();
				if (preg_match($whitespacedPattern,$value,$regA))	{
					$hexbyte = $regA[1];
					$utf8 = 'U+'.$regA[2];
				}
			}

				// Only values above 127 are stored; lines that did not parse give 0 and are skipped here as well.
			$decval = hexdec(trim($hexbyte));
			if ($decval>127)	{
				$utf8decval = hexdec(substr(trim($utf8),2));	// Strip the "U+" prefix
				$utf8Char = $this->UnumberToChar($utf8decval);
				$parsed['local'][$decval] = $utf8Char;
				$parsed['utf8'][$utf8Char] = $decval;
			}
		}

		if ($cacheFile)	{
			t3lib_div::writeFile($cacheFile,serialize($parsed));
		}
	}

	$this->parsedCharsets[$charset] = $parsed;
	return 2;
}
551
/**
 * Converts a UNICODE number to a UTF-8 multibyte character
 * Algorithm based on script found at From: http://czyborra.com/utf/
 *
 * The binary representation of the character's integer value is spread across the bytes and the number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
 *
 *  bytes | bits | representation
 *      1 |    7 | 0vvvvvvv
 *      2 |   11 | 110vvvvv 10vvvvvv
 *      3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
 *      4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * @param	integer		UNICODE integer
 * @return	string		UTF-8 multibyte character string; chr($this->noCharByteVal) if the number needs more than 31 bits
 * @see utf8CharToUnumber()
 */
function UnumberToChar($cbyte)	{
	if ($cbyte < 0x80)	return chr($cbyte);	// One byte, no markers needed

		// Pick the lead byte marker and the number of trailing bytes:
	if ($cbyte < 0x800)	{
		$leadMarker = 0xC0;
		$trailing = 1;
	} elseif ($cbyte < 0x10000)	{
		$leadMarker = 0xE0;
		$trailing = 2;
	} elseif ($cbyte < 0x200000)	{
		$leadMarker = 0xF0;
		$trailing = 3;
	} elseif ($cbyte < 0x4000000)	{
		$leadMarker = 0xF8;
		$trailing = 4;
	} elseif ($cbyte < 0x80000000)	{
		$leadMarker = 0xFC;
		$trailing = 5;
	} else {	// Cannot express a 32-bit character in UTF-8
		return chr($this->noCharByteVal);
	}

		// Lead byte takes the topmost bits, each trailing byte the next six bits:
	$char = chr($leadMarker | ($cbyte >> (6*$trailing)));
	for ($shift = 6*($trailing-1); $shift >= 0; $shift -= 6)	{
		$char .= chr(0x80 | (($cbyte >> $shift) & 0x3F));
	}

	return $char;
}
605
/**
 * Converts a UTF-8 Multibyte character to a UNICODE number
 *
 * @param	string		UTF-8 multibyte character string
 * @param	boolean		If set, then a hex. number is returned.
 * @return	integer		UNICODE integer (or a string like "x20ac" if $hex is set)
 * @see UnumberToChar()
 */
function utf8CharToUnumber($str,$hex=0)	{
	$lead = ord(substr($str,0,1));	// First char

	if (($lead & 192) != 192)	{	// Not the start of a multi byte string: the byte value is the number
		$int = $lead;
	} else {
			// Every further 1-bit after the top bit of the lead byte announces one trailing byte,
			// each contributing its lower six bits.
		$trailing = 0;
		$trailingBits = '';
		$shifted = $lead << 1;
		while ($shifted & 128)	{
			$trailing++;
			$trailingBits .= sprintf('%06b',ord(substr($str,$trailing,1)) & 63);
			$shifted = $shifted << 1;
		}

			// The lead byte contributes its lowest (6 - number of trailing bytes) bits.
		$leadBits = substr('00000000'.decbin($lead),-(6-$trailing));

		$int = bindec($leadBits.$trailingBits);
	}

	return $hex ? 'x'.dechex($int) : $int;
}
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650 /********************************************
651 *
652 * String operation functions
653 *
654 ********************************************/
655
/**
 * Cuts a string short at a given byte length.
 *
 * The cut never splits a character: for multi-byte charsets the result may be
 * shorter than $len bytes.
 *
 * @param	string		the character set
 * @param	string		character string
 * @param	integer		the byte length
 * @return	string		the shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strtrunc($charset,$string,$len)	{
	if (isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) && $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
		return mb_strcut($string,0,$len,$charset);
	}
	if ($charset == 'utf-8')	{
		return $this->utf8_strtrunc($string,$len);
	}
	if ($charset == 'shift_jis' || !empty($this->eucBasedSets[$charset]))	{
			// euc_strtrunc() takes (string, byte length, charset)
		return $this->euc_strtrunc($string,$len,$charset);
	}

	if (!empty($this->twoByteSets[$charset]))	{
		if ($len % 2)	$len--;	// don't cut at odd positions
	} elseif (!empty($this->fourByteSets[$charset]))	{
		$len -= $len % 4;	// realign to position dividable by four
	}

		// treat everything else as single-byte encoding
	return substr($string,0,$len);
}
684
/**
 * Returns a part of a string.
 *
 * Negative values for @arg $start and @arg $len are currently not supported.
 *
 * @param	string		the character set
 * @param	string		character string
 * @param	int		start position (character position)
 * @param	int		length (in characters); if omitted, everything from $start to the end is returned
 * @return	string		the substring
 * @see substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function substr($charset,$str,$start,$len=null)	{
	if ($len===0)	return '';

		// Loose comparison on purpose: callers may pass '' or FALSE for "no length" (integer 0 was handled above)
	$toEnd = ($len == null);

	if (isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) && $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
		if ($toEnd)	{
				// cannot omit $len when specifying the charset, so switch the internal encoding temporarily
			$enc = mb_internal_encoding();	// save internal encoding
			mb_internal_encoding($charset);
			$str = mb_substr($str,$start);
			mb_internal_encoding($enc);	// restore internal encoding

			return $str;
		}
		return mb_substr($str,$start,$len,$charset);
	}

	if ($charset == 'utf-8')	{
		return $this->utf8_substr($str,$start,$len);
	}
	if ($charset == 'shift_jis' || !empty($this->eucBasedSets[$charset]))	{
			// NOTE(review): euc_substr() is defined outside this part of the file; the argument order follows
			// the existing shift_jis call - confirm against its signature.
		return $this->euc_substr($str,$start,$len,$charset);
	}

		// Fixed-width encodings: translate character positions into byte positions
	$bytesPerChar = 1;	// treat everything else as single-byte encoding
	if (!empty($this->twoByteSets[$charset]))	{
		$bytesPerChar = 2;
	} elseif (!empty($this->fourByteSets[$charset]))	{
		$bytesPerChar = 4;
	}

	if ($toEnd)	return substr($str,$start*$bytesPerChar);
	return substr($str,$start*$bytesPerChar,$len*$bytesPerChar);
}
729
/**
 * Counts the number of characters.
 *
 * @param	string		the character set
 * @param	string		character string
 * @return	integer		the number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strlen($charset,$string)	{
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	return mb_strlen($string,$charset);

		// Variable-width charsets need a real character count:
	if ($charset == 'utf-8')	return $this->utf8_strlen($string);
	if ($charset == 'shift_jis')	return $this->euc_strlen($string,'shift_jis');
	if ($this->eucBasedSets[$charset])	return $this->euc_strlen($string,$charset);

		// Fixed-width charsets: derive the count from the byte length
	if ($this->twoByteSets[$charset])	return strlen($string)/2;
	if ($this->fourByteSets[$charset])	return strlen($string)/4;

		// treat everything else as single-byte encoding
	return strlen($string);
}
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772 /********************************************
773 *
774 * UTF-8 String operation functions
775 *
776 ********************************************/
777
/**
 * Truncates a string in UTF-8 short at a given byte length.
 *
 * The string is cut in front of a multi-byte character that would not fit
 * completely, so the result may be shorter than $len bytes.
 *
 * @param	string		UTF-8 multibyte character string
 * @param	integer		the byte length
 * @return	string		the shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strtrunc($str,$len)	{
	$str = (string)$str;
	$len = (int)$len;

	if ($len <= 0)	return '';
	if ($len >= strlen($str))	return $str;	// nothing to cut

	$i = $len-1;
	if (ord($str[$i]) & 0x80)	{	// last byte to keep is part of a multibyte sequence
			// walk back over the continuation bytes (10xxxxxx) to the byte that starts the sequence
		while ($i > 0 && (ord($str[$i]) & 0xC0) == 0x80)	$i--;

		$lead = ord($str[$i]);
		if (($lead & 0xC0) == 0xC0)	{	// a real lead byte (11xxxxxx); malformed data is simply cut at $len
			for ($bc=0, $mbs=$lead; $mbs & 0x80; $mbs = $mbs << 1)	$bc++;	// calculate number of bytes
			if ($i+$bc > $len)	return substr($str,0,$i);	// char does not fit: cut in front of it
				// otherwise the multibyte char fits into length
		}
	}

	return substr($str,0,$len);
}
800
/**
 * Returns a part of a UTF-8 string.
 *
 * Negative values for @arg $start and @arg $len are currently not supported.
 *
 * @param	string		UTF-8 string
 * @param	int		start position (character position)
 * @param	int		length (in characters)
 * @return	string		the substring; FALSE if utf8_char2byte_pos() reports $start as outside the string
 * @see substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_substr($str,$start,$len=null)	{
	if ($len===0)	return '';

	$startByte = $this->utf8_char2byte_pos($str,$start);
	if ($startByte === false)	return false;	// $start outside string length

	$remainder = substr($str,$startByte);
	if ($len==null)	return $remainder;	// no length given: everything from $start on

		// Find the byte at which the wanted number of characters ends
	$endByte = $this->utf8_char2byte_pos($remainder,$len);

	return ($endByte === false)
		? $remainder	// $len outside actual string length
		: substr($remainder,0,$endByte);
}
832
/**
 * Counts the number of characters of a string in UTF-8.
 *
 * Every byte that is not a continuation byte (10xxxxxx) starts a character
 * and is counted.
 *
 * @param	string		UTF-8 multibyte character string
 * @return	int		the number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strlen($str)	{
	$str = (string)$str;
	$byteLength = strlen($str);	// iterate by byte length; testing the byte itself would stop at a "0" character

	$n = 0;
	for ($i=0; $i<$byteLength; $i++)	{
		if ((ord($str[$i]) & 0xC0) != 0x80)	$n++;	// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
	}

	return $n;
}
852
/**
 * Find position of first occurrence of a string, both arguments are in UTF-8.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 string to search for
 * @param	int		position to start the search (character position)
 * @return	int		the character position; FALSE if the needle was not found or the offset is beyond the string
 * @see strpos()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strpos($haystack,$needle,$offset=0)	{
	if (isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) && $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
			// third argument is the offset, the encoding comes fourth
		return mb_strpos($haystack,$needle,$offset,'utf-8');
	}

	$byte_offset = $this->utf8_char2byte_pos($haystack,$offset);
	if ($byte_offset === false)	return false;	// offset beyond string length

	$byte_pos = strpos($haystack,$needle,$byte_offset);
	if ($byte_pos === false)	return false;	// needle not found

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
876
/**
 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
 *
 * @param	string		UTF-8 string to search in
 * @param	char		UTF-8 character to search for
 * @return	int		the character position; FALSE if the needle was not found
 * @see strrpos()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strrpos($haystack,$needle)	{
	if (isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) && $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
			// The third argument of mb_strrpos() is an offset in current PHP versions, so the encoding
			// is selected through the internal encoding, which works with every version.
		$enc = mb_internal_encoding();	// save internal encoding
		mb_internal_encoding('utf-8');
		$char_pos = mb_strrpos($haystack,$needle);
		mb_internal_encoding($enc);	// restore internal encoding

		return $char_pos;
	}

	$byte_pos = strrpos($haystack,$needle);
	if ($byte_pos === false)	return false;	// needle not found

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
896
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * @param	string		UTF-8 string
 * @param	int		character position
 * @return	int		byte position; FALSE if the end of the string is reached before the character position
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char2byte_pos($str,$pos)	{
	$str = (string)$str;
	$byteLength = strlen($str);	// iterate by byte length; testing the byte itself would stop at a "0" character

	$n = 0;	// number of characters
	for ($i=0; $i<$byteLength && $n<$pos; $i++)	{
		if ((ord($str[$i]) & 0xC0) != 0x80)	$n++;	// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
	}
	if ($i >= $byteLength)	return false;	// offset beyond string length

		// skip trailing multi-byte data bytes
	while ($i < $byteLength && (ord($str[$i]) & 0xC0) == 0x80)	{ $i++; }

	return $i;
}
921
/**
 * Translates an 'absolute' byte position into a character position.
 *
 * Counts the character boundaries found at the byte positions $pos down to 1. A byte
 * is a boundary unless it is a UTF-8 continuation byte (10xxxxxx). The position just
 * past the last byte is accepted and counts as a boundary as well.
 *
 * @param	string		UTF-8 string
 * @param	int		Byte position
 * @return	int		Character position; FALSE if the string is empty or the byte position lies outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_byte2char_pos($str,$pos) {
	$len = strlen($str);
		// The range is checked against the byte length. Testing the first byte for
		// truth would reject any string that begins with the character "0".
	if ($len == 0 || $pos < 0 || $pos > $len) return false;	// offset beyond string length

	$n = 0;	// number of characters
	for ($i=$pos; $i>0; $i--) {
			// single-byte (0xxxxxxx), multi-byte starting byte (11xxxxxx) or end of string
		if ($i == $len || (ord($str[$i]) & 0xC0) != 0x80) {
			$n++;
		}
	}

	return $n;
}
943
944
945
946
947
948
949
950
951
952
953
954
955
956 /********************************************
957 *
958 * EUC String operation functions
959 *
960 * Extended Unix Code:
961 * ASCII compatible 7bit single bytes chars
962 * 8bit two byte chars
963 *
964 * Shift-JIS is treated as a special case.
965 *
966 ********************************************/
967
/**
 * Cuts a string in the EUC charset family short at a given byte length.
 *
 * The result never ends in the middle of a double-byte character: if the cut would
 * fall between the two bytes of a character, that character is dropped completely.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		The byte length
 * @param	string		The charset; 'shift_jis' selects the Shift-JIS lead byte ranges
 * @return	string		The shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strtrunc($str,$len,$charset) {
	if ($len <= 0) return '';

		// Compare byte lengths. Probing $str at the scan position would mistake
		// a "0" character for the end of the string.
	if (strlen($str) <= $len) return $str;	// string not longer than supplied length

	$sjis = ($charset == 'shift_jis');
	for ($i=0; $i<$len; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++;	// advance a double-byte char
		}
		else {
			if ($c >= 0x80) $i++;	// advance a double-byte char
		}
	}

	if ($i>$len) {
		return substr($str,0,$len-1);	// we ended on a first byte
	}
	return substr($str,0,$len);
}
998
/**
 * Returns a part of a string in the EUC charset family.
 *
 * Negative values for @arg $start and @arg $len are currently not supported.
 *
 * @param	string		EUC multibyte character string
 * @param	int		Start position (character position)
 * @param	string		The charset
 * @param	int		Length (in characters); if omitted the rest of the string is returned
 * @return	string		The substring; FALSE if $start could not be translated into a byte position
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_substr($str,$start,$charset,$len=null) {
	if ($len === 0) {
		return '';
	}

		// locate the first byte of the requested part
	$firstByte = $this->euc_char2byte_pos($str,$start,$charset);
	if ($firstByte === false) {
		return false;	// $start outside string length
	}
	$tail = substr($str,$firstByte);

		// no length given: everything from $start on
	if ($len == null) {
		return $tail;
	}

		// translate the character length into a byte length relative to the tail
	$cutAt = $this->euc_char2byte_pos($tail,$len,$charset);
	if ($cutAt === false) {
		return $tail;	// $len outside actual string length
	}
	return substr($tail,0,$cutAt);
}
1029
/**
 * Counts the number of characters of a string in the EUC charset family.
 *
 * A byte that qualifies as the first byte of a double-byte character is counted
 * together with the byte following it as one character.
 *
 * @param	string		EUC multibyte character string
 * @param	string		The charset; 'shift_jis' selects the Shift-JIS lead byte ranges
 * @return	int		The number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strlen($str,$charset) {
	$sjis = ($charset == 'shift_jis');
		// Bound the scan by the byte length. Testing the byte itself for truth
		// would stop counting at the first "0" character.
	$len = strlen($str);
	$n = 0;
	for ($i=0; $i<$len; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++;	// advance a double-byte char
		}
		else {
			if ($c >= 0x80) $i++;	// advance a double-byte char
		}

		$n++;
	}

	return $n;
}
1056
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * @param	string		EUC multibyte character string
 * @param	int		Character position
 * @param	string		The charset; 'shift_jis' selects the Shift-JIS lead byte ranges
 * @return	int		Byte position; FALSE if the end of the string is reached while counting characters
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char2byte_pos($str,$pos,$charset) {
	$sjis = ($charset == 'shift_jis');
		// Bound the scan by the byte length. Testing the byte itself for truth
		// would treat a literal "0" character as the end of the string.
	$len = strlen($str);
	$n = 0;	// number of characters seen
	for ($i=0; $i<$len && $n<$pos; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++;	// advance a double-byte char
		}
		else {
			if ($c >= 0x80) $i++;	// advance a double-byte char
		}

		$n++;
	}
	if ($i >= $len) return false;	// offset beyond string length

	return $i;
}
1084
1085 }
1086
// Include the file registered for this script under $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS'], if any.
// empty() keeps the lookup silent when no such entry has been configured.
if (defined('TYPO3_MODE') && !empty($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php'])) {
	include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
}
1090 ?>