Support for Shift-JIS:
[Packages/TYPO3.CMS.git] / t3lib / class.t3lib_cs.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2003-2004 Kasper Skaarhoj (kasper@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the Typo3 project. The Typo3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 *
17 * This script is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * This copyright notice MUST APPEAR in all copies of the script!
23 ***************************************************************/
24 /**
25 * Class for conversion between charsets.
26 *
27 * $Id$
28 *
29 * @author Kasper Skaarhoj <kasper@typo3.com>
30 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
31 */
32 /**
33 * [CLASS/FUNCTION INDEX of SCRIPT]
34 *
35 *
36 *
37 * 108: class t3lib_cs
38 * 237: function parse_charset($charset)
39 * 254: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
40 * 288: function utf8_encode($str,$charset)
41 * 329: function utf8_decode($str,$charset,$useEntityForNoChar=0)
42 * 377: function utf8_to_entities($str)
43 * 410: function entities_to_utf8($str,$alsoStdHtmlEnt=0)
44 * 441: function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
45 * 482: function initCharset($charset)
46 * 553: function UnumberToChar($cbyte)
47 * 597: function utf8CharToUnumber($str,$hex=0)
48 *
49 * SECTION: UTF-8 String operation functions
50 * 648: function utf8_strtrunc($str,$len)
51 * 680: function utf8_substr($str,$start,$len=null)
52 * 719: function utf8_strlen($str)
53 * 745: function utf8_strpos($haystack,$needle,$offset=0)
54 * 768: function utf8_strrpos($haystack,$needle)
55 * 787: function utf8_char2byte_pos($str,$pos)
56 * 812: function utf8_byte2char_pos($str,$pos)
57 *
58 * TOTAL FUNCTIONS: 17
59 * (This index is automatically created/updated by the extension "extdeveval")
60 *
61 */
62
63
64
65
66
67
68
69
70 /**
71 * Notes on UTF-8
72 *
73 * Functions working on UTF-8 strings:
74 *
75 * - strchr/strstr
76 * - strrchr
77 * - substr_count
78 * - implode/explode/join
79 *
80 * Functions nearly working on UTF-8 strings:
81 *
82 * - strlen: returns the length in BYTES; if you need the length in CHARACTERS use utf8_strlen
83 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained 7-bit ASCII
84 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
85 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
86 *
87 * Functions NOT working on UTF-8 strings:
88 *
89 * - str*cmp
90 * - stristr
91 * - stripos
92 * - substr
93 * - strrev
94 * - ereg/eregi
95 * - split/spliti
96 * - preg_*
97 * - ...
98 *
99 */
100 /**
101 * Class for conversion between charsets.
102 *
103 * @author Kasper Skaarhoj <kasper@typo3.com>
104 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
105 * @package TYPO3
106 * @subpackage t3lib
107 */
108 class t3lib_cs {
// ASCII value emitted for characters that have no equivalent in the target charset.
var $noCharByteVal=127;

	// Cache of parsed conversion tables, keyed by charset name.
	var $parsedCharsets=array();

	// Charsets that use exactly two bytes per character:
	var $twoByteSets=array(
		'ucs-2'=>1,	// 2-byte Unicode
	);

	// Charsets that use exactly four bytes per character:
	var $fourByteSets=array(
		'ucs-4'=>1,	// 4-byte Unicode
		'utf-32'=>1,	// 4-byte Unicode (limited to the 21-bits of UTF-16)
	);

	// Charsets that use a scheme like the Extended Unix Code (7-bit bytes are ASCII, 8-bit bytes start a two-byte char).
	// The keys must be the canonical names produced by parse_charset(); the synonym table and $charSetArray
	// both use "big5", so that is the key used here.
	var $eucBasedSets=array(
		'gb2312'=>1,	// Chinese, simplified.
		'big5'=>1,	// Chinese, traditional.
	);
130
// Alias => canonical charset name. Lookup keys are expected in lowercase (see parse_charset()).
	// Sources:
	//   http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
	//   http://czyborra.com/charsets/iso8859.html
	var $synonyms=array(
			// ASCII
		'us' => 'ascii',
		'us-ascii'=> 'ascii',
			// ISO-8859 family
		'cp819' => 'iso-8859-1',
		'ibm819' => 'iso-8859-1',
		'iso-ir-100' => 'iso-8859-1',
		'iso-ir-109' => 'iso-8859-2',
		'iso-ir-148' => 'iso-8859-9',
		'iso-ir-199' => 'iso-8859-14',
		'iso-ir-203' => 'iso-8859-15',
		'csisolatin1' => 'iso-8859-1',
		'csisolatin2' => 'iso-8859-2',
		'csisolatin3' => 'iso-8859-3',
		'csisolatin5' => 'iso-8859-9',
		'csisolatin8' => 'iso-8859-14',
		'csisolatin9' => 'iso-8859-15',
		'csisolatingreek' => 'iso-8859-7',
		'iso-celtic' => 'iso-8859-14',
		'latin1' => 'iso-8859-1',
		'latin2' => 'iso-8859-2',
		'latin3' => 'iso-8859-3',
		'latin5' => 'iso-8859-9',
		'latin6' => 'iso-8859-10',
		'latin8' => 'iso-8859-14',
		'latin9' => 'iso-8859-15',
		'l1' => 'iso-8859-1',
		'l2' => 'iso-8859-2',
		'l3' => 'iso-8859-3',
		'l5' => 'iso-8859-9',
		'l6' => 'iso-8859-10',
		'l8' => 'iso-8859-14',
		'l9' => 'iso-8859-15',
		'cyrillic' => 'iso-8859-5',
		'arabic' => 'iso-8859-6',
			// Windows codepages
		'win874' => 'windows-874',
		'win1250' => 'windows-1250',
		'win1251' => 'windows-1251',
		'win1252' => 'windows-1252',
		'win1253' => 'windows-1253',
		'win1254' => 'windows-1254',
		'win1255' => 'windows-1255',
		'win1256' => 'windows-1256',
		'win1257' => 'windows-1257',
		'win1258' => 'windows-1258',
		'cp1250' => 'windows-1250',
		'cp1252' => 'windows-1252',
		'ms-ee' => 'windows-1250',
		'ms-ansi' => 'windows-1252',
		'ms-greek' => 'windows-1253',
		'ms-turk' => 'windows-1254',
		'winbaltrim' => 'windows-1257',
			// Macintosh
		'mac' => 'macRoman',
		'macintosh' => 'macRoman',
			// East Asian multibyte sets
		'euc-cn' => 'gb2312',
		'x-euc-cn' => 'gb2312',
		'cp936' => 'gb2312',
		'big-5' => 'big5',
		'cp950' => 'big5',
		'sjis' => 'shift_jis',
		'shift-jis' => 'shift_jis',
		'cp932' => 'shift_jis',
			// Unicode encodings
		'utf7' => 'utf-7',
		'utf8' => 'utf-8',
		'utf16' => 'utf-16',
		'utf32' => 'utf-32',
		'ucs2' => 'ucs-2',
		'ucs4' => 'ucs-4',
	);
202
// TYPO3 specific: maps each TYPO3 system language key to the charset used for it.
	// An empty string stands for the default, "iso-8859-1".
	var $charSetArray = array(
		'dk' => '',
		'de' => '',
		'no' => '',
		'it' => '',
		'fr' => '',
		'es' => '',
		'nl' => '',
		'cz' => 'windows-1250',
		'pl' => 'iso-8859-2',
		'si' => 'windows-1250',
		'fi' => '',
		'tr' => 'iso-8859-9',
		'se' => '',
		'pt' => '',
		'ru' => 'windows-1251',
		'ro' => 'iso-8859-2',
		'ch' => 'gb2312',
		'sk' => 'windows-1250',
		'lt' => 'windows-1257',
		'is' => 'utf-8',
		'hr' => 'windows-1250',
		'hu' => 'iso-8859-2',
		'gl' => '',
		'th' => 'iso-8859-11',
		'gr' => 'iso-8859-7',
		'hk' => 'big5',
		'eu' => '',
		'bg' => 'windows-1251',
		'br' => '',
		'et' => 'iso-8859-4',
		'ar' => 'iso-8859-6',
		'he' => 'utf-8',
		'ua' => 'windows-1251',
	);
240
/**
	 * Normalizes a charset name: the name is lowercased and, if it is a known alias
	 * in $this->synonyms, replaced by its canonical name.
	 *
	 * @param	string		Input charset
	 * @return	string		Normalized charset
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function parse_charset($charset)	{
		$normalized = strtolower($charset);

		return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
	}
254
255
/**
	 * Convert from one charset to another charset.
	 *
	 * If $TYPO3_CONF_VARS['SYS']['t3lib_cs_convMethod'] is 'iconv', 'recode' or 'mbstring' the
	 * respective PHP extension is tried first (only when no entity fallback is requested, because
	 * those extensions cannot emit SGML entities). If the extension reports failure, or none is
	 * configured, the built-in table based conversion via UTF-8 is used.
	 *
	 * @param	string		Input string
	 * @param	string		From charset (the current charset of the string)
	 * @param	string		To charset (the output charset wanted)
	 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
	 * @return	string		Converted string
	 */
	function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)	{
		global $TYPO3_CONF_VARS;

		if ($fromCS==$toCS)	return $str;

		if (!$useEntityForNoChar)	{	// iconv and recode don't support fallback to SGML entities
			$convMethod = isset($TYPO3_CONF_VARS['SYS']['t3lib_cs_convMethod']) ? $TYPO3_CONF_VARS['SYS']['t3lib_cs_convMethod'] : '';
			$conv_str = false;

			switch ($convMethod)	{
				case 'iconv':
						// iconv() takes (from, to, string) - in that order.
					$conv_str = iconv($fromCS,$toCS.'//TRANSLIT',$str);
				break;
				case 'recode':
						// A recode request has the form "from..to".
					$conv_str = recode_string($fromCS.'..'.$toCS,$str);
				break;
				case 'mbstring':
						// mb_convert_encoding() takes (string, to, from); returns false for unsupported charsets.
					$conv_str = mb_convert_encoding($str,$toCS,$fromCS);
				break;
			}
			if (false !== $conv_str)	return $conv_str;
			// otherwise: fall back to TYPO3 conversion
		}

			// Built-in conversion, always going through UTF-8 as the intermediate form:
		if ($fromCS!='utf-8')	$str=$this->utf8_encode($str,$fromCS);
		if ($toCS!='utf-8')	$str=$this->utf8_decode($str,$toCS,$useEntityForNoChar);
		return $str;
	}
290
291
/**
	 * Converts $str from $charset to UTF-8
	 *
	 * Bytes 0-127 are passed through unchanged (except for fixed two-byte charsets, where every
	 * character is a 16 bit big endian value). Every other local character code is looked up in the
	 * parsed conversion table; codes without an entry are replaced by chr($this->noCharByteVal).
	 *
	 * @param	string		String in local charset to convert to UTF-8
	 * @param	string		Charset, lowercase. Must be found in csconvtbl/ folder.
	 * @return	string		Output string, converted to UTF-8. Nothing (NULL) is returned if the charset could not be initialized.
	 */
	function utf8_encode($str,$charset)	{

		if ($this->initCharset($charset))	{	// Parse conv. table if not already...
			$strLen = strlen($str);
			$outStr = '';

				// Loop-invariant properties of the source charset:
			$isTwoByte = !empty($this->twoByteSets[$charset]);
			$isEuc = !empty($this->eucBasedSets[$charset]);
			$isShiftJis = ($charset == 'shift_jis');

			for ($a=0;$a<$strLen;$a++)	{	// Traverse each char in string.
				$chr = substr($str,$a,1);
				$ord = ord($chr);

				if ($isTwoByte)	{	// The charset has two bytes per char
					$ord2 = ord(substr($str,$a+1,1));
					$ord = ($ord << 8) | $ord2;	// Combine both bytes into one 16 bit number; assume big endian
					$a++;
				} elseif ($ord>127)	{	// Byte with the high bit set: not ASCII, must be translated
						// EUC uses two bytes above 127. Shift-JIS is like EUC, but bytes between 160 and 223 are single byte chars.
					if ($isEuc || ($isShiftJis && ($ord<160 || $ord>223)))	{
						$a++;
						$ord2 = ord(substr($str,$a,1));
						$ord = $ord*256+$ord2;	// Make $ord a 16 bit number
					}
				} else {	// ASCII 0-127 and one byte. Transparent
					$outStr.=$chr;
					continue;
				}

					// If the local char-number was found in the parsed conv. table we use that, otherwise the "no char" byte
				if (isset($this->parsedCharsets[$charset]['local'][$ord]))	{
					$outStr.=$this->parsedCharsets[$charset]['local'][$ord];
				} else {
					$outStr.=chr($this->noCharByteVal);
				}
			}
			return $outStr;
		}
	}
337
/**
	 * Converts $str from UTF-8 to $charset
	 *
	 * ASCII bytes are copied unchanged. Each UTF-8 multibyte sequence is looked up in the parsed
	 * conversion table of the charset; local codes above 255 are written as two bytes (high byte first).
	 * Sequences without a table entry become a numeric entity (if requested) or chr($this->noCharByteVal).
	 *
	 * @param	string		String in UTF-8 to convert to local charset
	 * @param	string		Charset, lowercase. Must be found in csconvtbl/ folder.
	 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
	 * @return	string		Output string, converted to local charset. Nothing (NULL) is returned if the charset could not be initialized.
	 */
	function utf8_decode($str,$charset,$useEntityForNoChar=0)	{

		if ($this->initCharset($charset))	{	// Parse conv. table if not already...
			$byteCount = strlen($str);
			$result = '';
			$pos = 0;

			while ($pos < $byteCount)	{
				$byte = substr($str,$pos,1);
				$code = ord($byte);

				if ($code <= 127)	{
						// Plain ASCII, one byte. Transparent
					$result.=$byte;
				} elseif (!($code & 64))	{
						// A first byte must have the 7th bit set; this one has not, so we are in the MIDDLE of a sequence.
					$result.=chr($this->noCharByteVal);
				} else {
						// Collect the whole sequence: every additional leading 1-bit of the first byte announces one more byte.
					$seq = $byte;
					for ($n=0;$n<8;$n++)	{
						$code = $code << 1;
						if (!($code & 128))	break;
						$pos++;
						$seq.=substr($str,$pos,1);
					}

					if (isset($this->parsedCharsets[$charset]['utf8'][$seq]))	{
						$localCode = $this->parsedCharsets[$charset]['utf8'][$seq];
						if ($localCode>255)	{	// 16 bit word assumed: split into two chars.
							$result.=chr(($localCode >> 8) & 255).chr($localCode & 255);
						} else {
							$result.=chr($localCode);
						}
					} elseif ($useEntityForNoChar)	{
						$result.='&#'.$this->utf8CharToUnumber($seq,1).';';
					} else {
						$result.=chr($this->noCharByteVal);
					}
				}
				$pos++;
			}
			return $result;
		}
	}
387
/**
	 * Converts all chars > 127 to numeric entities.
	 *
	 * ASCII bytes are copied unchanged, every UTF-8 multibyte sequence is replaced by an entity built
	 * from utf8CharToUnumber() in hex mode, and a stray continuation byte becomes chr($this->noCharByteVal).
	 *
	 * @param	string		Input string
	 * @return	string		Output string
	 */
	function utf8_to_entities($str)	{
		$byteCount = strlen($str);
		$result = '';
		$pos = 0;

		while ($pos < $byteCount)	{
			$byte = substr($str,$pos,1);
			$code = ord($byte);

			if ($code <= 127)	{
					// Plain ASCII, one byte. Transparent
				$result.=$byte;
			} elseif (!($code & 64))	{
					// Not a valid first byte: we are in the MIDDLE of a multibyte sequence.
				$result.=chr($this->noCharByteVal);
			} else {
					// Collect the sequence: each further leading 1-bit of the first byte means one more byte.
				$seq = $byte;
				for ($n=0;$n<8;$n++)	{
					$code = $code << 1;
					if (!($code & 128))	break;
					$pos++;
					$seq.=substr($str,$pos,1);
				}
				$result.='&#'.$this->utf8CharToUnumber($seq,1).';';
			}
			$pos++;
		}

		return $result;
	}
419
/**
	 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
	 *
	 * @param	string		Input string, UTF-8
	 * @param	boolean		If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
	 * @return	string		Output string
	 */
	function entities_to_utf8($str,$alsoStdHtmlEnt=0)	{
		$trans_tbl = array();
		if ($alsoStdHtmlEnt)	{
				// NOTE(review): the values of this table are treated as iso-8859-1 below; newer PHP versions may
				// return the table in another default charset - confirm for the PHP version in use.
			$trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES));
		}

			// Split on entities; with PREG_SPLIT_DELIM_CAPTURE every odd element holds the entity name (without "&" and ";").
		$parts = preg_split('/&([#[:alnum:]]*);/',$str,-1,PREG_SPLIT_DELIM_CAPTURE);
		foreach($parts as $k => $v)	{
			if ($k%2)	{
				if (substr($v,0,1)=='#')	{	// Dec or hex entities:
					$number = substr($v,1);	// Strip the "#"
					if (strtolower(substr($number,0,1))=='x')	{
						$number = hexdec(substr($number,1));	// Hexadecimal: strip the "x" and decode
					} else {
						$number = intval($number);
					}
					$parts[$k] = $this->UnumberToChar($number);
				} elseif ($alsoStdHtmlEnt && isset($trans_tbl['&'.$v.';']))	{	// Other entities:
					$parts[$k] = $this->utf8_encode($trans_tbl['&'.$v.';'],'iso-8859-1');
				} else {	// No conversion:
					$parts[$k] ='&'.$v.';';
				}
			}
		}

		return implode('',$parts);
	}
449
/**
	 * Converts all chars in the input UTF-8 string into integer numbers returned in an array
	 *
	 * @param	string		Input string, UTF-8
	 * @param	boolean		If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
	 * @param	boolean		If set, then instead of integer numbers the real UTF-8 char is returned.
	 * @return	array		Output array with the char numbers
	 */
	function utf8_to_numberarray($str,$convEntities=0,$retChar=0)	{
			// Entities are turned into real UTF-8 characters first, if requested:
		if ($convEntities)	{
			$str = $this->entities_to_utf8($str,1);
		}

		$byteCount = strlen($str);
		$result = array();
		$pos = 0;

		while ($pos < $byteCount)	{
			$byte = substr($str,$pos,1);
			$code = ord($byte);

			if ($code <= 127)	{
					// Plain ASCII, one byte.
				$result[] = $retChar ? chr($code) : $code;
			} elseif (!($code & 64))	{
					// Not a valid first byte: we are in the MIDDLE of a multibyte sequence.
				$result[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
			} else {
					// Collect the sequence: each further leading 1-bit of the first byte means one more byte.
				$seq = $byte;
				for ($n=0;$n<8;$n++)	{
					$code = $code << 1;
					if (!($code & 128))	break;
					$pos++;
					$seq.=substr($str,$pos,1);
				}
				$result[] = $retChar ? $seq : $this->utf8CharToUnumber($seq);
			}
			$pos++;
		}

		return $result;
	}
488
/**
	 * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
	 * This function is automatically called by the conversion functions
	 *
	 * The parsed table is stored in $this->parsedCharsets[$charset] with two sub-arrays:
	 * 'local' (local char number => UTF-8 string) and 'utf8' (UTF-8 string => local char number).
	 * Only local char numbers above 127 are stored. The parsed table is cached as a serialized
	 * file in typo3temp/; a cache file that cannot be unserialized into an array is ignored and
	 * the table is parsed again.
	 *
	 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
	 *
	 * @param	string		The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
	 * @return	integer		Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed.
	 * @access private
	 */
	function initCharset($charset)	{
			// Only process if the charset is not yet loaded:
		if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset]))	return 1;

			// Conversion table filename:
		$charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';
		if (!($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile)))	{
			return false;
		}

			// Cache file for charsets:
			// Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
		$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/charset_'.$charset.'.tbl');
		$parsed = false;
		if ($cacheFile && @is_file($cacheFile))	{
			$parsed = unserialize(t3lib_div::getUrl($cacheFile));
		}

		if (!is_array($parsed))	{	// No cache file, or a cache file that could not be read back
			$parsed = array('local'=>array(),'utf8'=>array());

				// The "whitespaced" type is on the syntax "0x0A	0x000A	#LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
			$whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
			$detectedType = '';

				// Parse conversion table into lines and traverse them:
			$lines = t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1);
			foreach($lines as $value)	{
				if (!trim($value) || substr($value,0,1)=='#')	continue;	// Comment line or blanks are ignored.

					// Detect type if not done yet: (Done on first real line)
				if (!$detectedType)	{
					$detectedType = preg_match($whitespacedPattern,$value) ? 'whitespaced' : 'ms-token';
				}

				$hexbyte = '';
				$utf8 = '';
				if ($detectedType=='ms-token')	{
					$tokens = preg_split('/=|:/',$value,3);
					$hexbyte = $tokens[0];
					if (isset($tokens[1]))	$utf8 = $tokens[1];
				} else {
					$regA = array();
					if (preg_match($whitespacedPattern,$value,$regA))	{
						$hexbyte = $regA[1];
						$utf8 = 'U+'.$regA[2];
					}
				}

				$decval = hexdec(trim($hexbyte));
				if ($decval>127)	{	// 0-127 is ASCII and handled transparently by the converters
					$utf8decval = hexdec(substr(trim($utf8),2));	// Strip the "U+" prefix
					$utf8Char = $this->UnumberToChar($utf8decval);
					$parsed['local'][$decval] = $utf8Char;
					$parsed['utf8'][$utf8Char] = $decval;
				}
			}

			if ($cacheFile)	{
				t3lib_div::writeFile($cacheFile,serialize($parsed));
			}
		}

		$this->parsedCharsets[$charset] = $parsed;
		return 2;
	}
551
/**
	 * Converts a UNICODE number to a UTF-8 multibyte character
	 * Algorithm based on script found at From: http://czyborra.com/utf/
	 *
	 * The binary representation of the character's integer value is spread across the bytes and the
	 * number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
	 *
	 *  bytes | bits | representation
	 *      1 |    7 | 0vvvvvvv
	 *      2 |   11 | 110vvvvv 10vvvvvv
	 *      3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
	 *      4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
	 *      5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
	 *      6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
	 *
	 * @param	integer		UNICODE integer
	 * @return	string		UTF-8 multibyte character string
	 * @see utf8CharToUnumber()
	 */
	function UnumberToChar($cbyte)	{
			// One byte: plain 7 bit value
		if ($cbyte < 0x80)	return chr($cbyte);

			// Pick the lead byte marker and the number of continuation bytes from the value range:
		if ($cbyte < 0x800)	{
			$leadMarker = 0xC0;
			$tailBytes = 1;
		} elseif ($cbyte < 0x10000)	{
			$leadMarker = 0xE0;
			$tailBytes = 2;
		} elseif ($cbyte < 0x200000)	{
			$leadMarker = 0xF0;
			$tailBytes = 3;
		} elseif ($cbyte < 0x4000000)	{
			$leadMarker = 0xF8;
			$tailBytes = 4;
		} elseif ($cbyte < 0x80000000)	{
			$leadMarker = 0xFC;
			$tailBytes = 5;
		} else {	// Cannot express a 32-bit character in UTF-8
			return chr($this->noCharByteVal);
		}

			// Lead byte carries the topmost bits, each continuation byte the next 6 bits:
		$char = chr($leadMarker | ($cbyte >> (6*$tailBytes)));
		for ($shift=6*($tailBytes-1); $shift>=0; $shift-=6)	{
			$char.=chr(0x80 | (($cbyte >> $shift) & 0x3F));
		}
		return $char;
	}
605
/**
	 * Converts a UTF-8 Multibyte character to a UNICODE number
	 *
	 * @param	string		UTF-8 multibyte character string
	 * @param	boolean		If set, then a hex. number is returned (as a string prefixed with "x").
	 * @return	integer		UNICODE integer
	 * @see UnumberToChar()
	 */
	function utf8CharToUnumber($str,$hex=0)	{
		$lead = ord(substr($str,0,1));	// First char

		if (($lead & 192) != 192)	{
				// Not the start of a multibyte sequence: the byte value is the number
			$int = $lead;
		} else {
			$shifted = $lead;
			$tailBits = '';
			$b = 0;
			while ($b < 8)	{	// One round for each announced continuation byte
				$shifted = $shifted << 1;
				if (!($shifted & 128))	break;	// No more leading 1-bits: no more bytes in sequence
				$tailBits.=sprintf('%06b',ord(substr($str,$b+1,1)) & 63);	// Lower 6 bits of the continuation byte
				$b++;
			}
				// The lead byte contributes its lowest (6 - number of continuation bytes) bits:
			$leadBits = substr('00000000'.decbin($lead),-(6-$b));

			$int = bindec($leadBits.$tailBits);
		}

		return $hex ? 'x'.dechex($int) : $int;
	}
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650 /********************************************
651 *
652 * String operation functions
653 *
654 ********************************************/
655
/**
	 * Cuts a string short at a given byte length.
	 *
	 * Multibyte characters are never cut in the middle: UTF-8, Shift-JIS and EUC based charsets are
	 * delegated to the specialised methods of this class, fixed-width charsets are cut at a character
	 * boundary and everything else is treated as a single-byte encoding.
	 *
	 * @param	string		character string
	 * @param	integer		the byte length
	 * @param	string		the character set
	 * @return	string		the shortened string
	 * @see mb_strcut()
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function strtrunc($string,$len,$charset)	{
		if (isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) && $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
			return mb_strcut($string,0,$len,$charset);
		} elseif ($charset == 'utf-8')	{
			return $this->utf8_strtrunc($string,$len);
		} elseif ($charset == 'shift_jis')	{
			return $this->euc_strtrunc($string,$len,'shift_jis');
		} elseif (!empty($this->eucBasedSets[$charset]))	{
			return $this->euc_strtrunc($string,$len,$charset);
		} elseif (!empty($this->twoByteSets[$charset]))	{
			if ($len % 2)	$len--;		// don't cut at odd positions
		} elseif (!empty($this->fourByteSets[$charset]))	{
			$len -= $len % 4;	// realign to position dividable by four
		}
			// treat everything else as single-byte encoding
		return substr($string,0,$len);
	}
684
/**
	 * Counts the number of characters.
	 *
	 * UTF-8, Shift-JIS and EUC based charsets are delegated to the specialised methods of this class.
	 * For fixed-width charsets the byte length is divided by the character width (an incomplete
	 * trailing character is not counted). Everything else is treated as a single-byte encoding.
	 *
	 * @param	string		character string
	 * @param	string		the character set
	 * @return	integer		the number of characters
	 * @see strlen()
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function strlen($string,$charset)	{
		if (isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) && $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
			return mb_strlen($string,$charset);
		} elseif ($charset == 'utf-8')	{
			return $this->utf8_strlen($string);
		} elseif ($charset == 'shift_jis')	{
			return $this->euc_strlen($string,'shift_jis');
		} elseif (!empty($this->eucBasedSets[$charset]))	{
			return $this->euc_strlen($string,$charset);
		} elseif (!empty($this->twoByteSets[$charset]))	{
			return intval(strlen($string)/2);
		} elseif (!empty($this->fourByteSets[$charset]))	{
			return intval(strlen($string)/4);
		}
			// treat everything else as single-byte encoding
		return strlen($string);
	}
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726 /********************************************
727 *
728 * UTF-8 String operation functions
729 *
730 ********************************************/
731
/**
	 * Truncates a string in UTF-8 short at a given byte length.
	 *
	 * The result is never longer than $len bytes and never ends in the middle of a multibyte
	 * sequence: if the cut would split a character, that character is dropped completely.
	 *
	 * @param	string		UTF-8 multibyte character string
	 * @param	integer		the byte length
	 * @return	string		the shortened string
	 * @see mb_strcut()
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function utf8_strtrunc($str,$len)	{
		if ($len <= 0)	return '';

		if (isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) && $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
			return mb_strcut($str,0,$len,'utf-8');
		}

		$str = (string)$str;
		if ($len >= strlen($str))	return $str;	// nothing to cut

		$i = $len-1;	// index of the last byte that would be kept
		if (ord($str[$i]) & 0x80)	{	// part of a multibyte sequence
				// walk back over continuation bytes (10xxxxxx) to find the first byte of the sequence
			while ($i>0 && (ord($str[$i]) & 0xC0) == 0x80)	$i--;

			$lead = ord($str[$i]);
			if (($lead & 0xC0) == 0xC0)	{	// a real lead byte (11xxxxxx); otherwise the input is malformed and is cut bytewise
				for ($bc=0, $mbs=$lead; $mbs & 0x80; $mbs = $mbs << 1)	$bc++;	// number of leading 1-bits = number of bytes in the sequence
				if ($bc+$i > $len)	return substr($str,0,$i);	// the character does not fit: drop it completely
			}
			// fallthru: multibyte char fits into length
		}
		return substr($str,0,$len);
	}
758
/**
	 * Returns a part of a UTF-8 string.
	 *
	 * Negative values for @arg $start and @arg $len are currently not supported.
	 *
	 * @param	string		$str UTF-8 string
	 * @param	int		$start start position (character position)
	 * @param	int		$len length (in characters); if omitted the rest of the string is returned
	 * @return	string		the substring; FALSE if $start lies outside of the string (non-mbstring mode)
	 * @see substr()
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function utf8_substr($str,$start,$len=null)	{
		if ($len===0)	return '';

		if (isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) && $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
				// cannot omit $len, when specifying charset
			if ($len==null)	{
				$enc = mb_internal_encoding();	// save internal encoding
				mb_internal_encoding('utf-8');
				$str = mb_substr($str,$start);
				mb_internal_encoding($enc);	// restore internal encoding

				return $str;
			}
			return mb_substr($str,$start,$len,'utf-8');
		}

		$byte_start = $this->utf8_char2byte_pos($str,$start);
		if ($byte_start === false)	return false;	// $start outside string length

		$str = substr($str,$byte_start);

		if ($len!=null)	{
				// The byte position of character number $len (counted from 0) is the byte length of the first $len characters.
			$byte_end = $this->utf8_char2byte_pos($str,$len);
			if ($byte_end === false)	return $str;	// $len outside actual string length
			return substr($str,0,$byte_end);
		}
		return $str;
	}
803
/**
	 * Counts the number of characters of a string in UTF-8.
	 *
	 * Every byte that is not a continuation byte (10xxxxxx) starts a character and is counted.
	 *
	 * @param	string		UTF-8 multibyte character string
	 * @return	int		the number of characters
	 * @see strlen()
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function utf8_strlen($str)	{
		if (isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) && $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
			return mb_strlen($str,'utf-8');
		}

		$str = (string)$str;
		$byteLen = strlen($str);	// loop on the byte length; testing the byte itself would stop at a "0" character
		$n = 0;
		for ($i=0; $i<$byteLen; $i++)	{
			if ((ord($str[$i]) & 0xC0) != 0x80)	$n++;	// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		}
		return $n;
	}
827
/**
	 * Find position of first occurrence of a string, both arguments are in UTF-8.
	 *
	 * @param	string		UTF-8 string to search in
	 * @param	string		UTF-8 string to search for
	 * @param	int		position to start the search (character position)
	 * @return	int		the character position; FALSE if the needle was not found or the offset lies beyond the string
	 * @see strpos()
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function utf8_strpos($haystack,$needle,$offset=0)	{
		if (isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) && $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
				// The third argument of mb_strpos() is the offset, the charset comes fourth.
			return mb_strpos($haystack,$needle,$offset,'utf-8');
		}

		$byte_offset = $this->utf8_char2byte_pos($haystack,$offset);
		if ($byte_offset === false)	return false;	// offset beyond string length

		$byte_pos = strpos($haystack,$needle,$byte_offset);
		if ($byte_pos === false)	return false;	// needle not found

		return $this->utf8_byte2char_pos($haystack,$byte_pos);
	}
851
/**
	 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
	 *
	 * @param	string		UTF-8 string to search in
	 * @param	char		UTF-8 character to search for
	 * @return	int		the character position; FALSE if the needle was not found
	 * @see strrpos()
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function utf8_strrpos($haystack,$needle)	{
		if (isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) && $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
				// The position of the charset argument of mb_strrpos() differs between PHP versions,
				// so the internal encoding is switched temporarily instead of passing the charset.
			$enc = mb_internal_encoding();	// save internal encoding
			mb_internal_encoding('utf-8');
			$char_pos = mb_strrpos($haystack,$needle);
			mb_internal_encoding($enc);	// restore internal encoding

			return $char_pos;
		}

		$byte_pos = strrpos($haystack,$needle);
		if ($byte_pos === false)	return false;	// needle not found

		return $this->utf8_byte2char_pos($haystack,$byte_pos);
	}
871
/**
	 * Translates a character position into an 'absolute' byte position.
	 *
	 * @param	string		UTF-8 string
	 * @param	int		character position (counted from 0)
	 * @return	int		byte position of the first byte of that character; FALSE if the position lies beyond the string
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function utf8_char2byte_pos($str,$pos)	{
		$str = (string)$str;
		$byteLen = strlen($str);	// loop on the byte length; testing the byte itself would stop at a "0" character

		$n = 0;	// number of characters passed
		for ($i=0; $i<$byteLen && $n<$pos; $i++)	{
			if ((ord($str[$i]) & 0xC0) != 0x80)	$n++;	// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		}

			// skip trailing multi-byte data bytes of the last character passed
		while ($i<$byteLen && (ord($str[$i]) & 0xC0) == 0x80)	{ $i++; }

		if ($i >= $byteLen)	return false;	// offset beyond string length

		return $i;
	}
896
/**
	 * Translates an 'absolute' byte position into a character position.
	 *
	 * The result is the index of the character that the byte at $pos belongs to.
	 *
	 * @param	string		UTF-8 string
	 * @param	int		byte position
	 * @return	int		character position; FALSE if the byte position lies beyond the string
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function utf8_byte2char_pos($str,$pos)	{
		$str = (string)$str;
		if ($pos >= strlen($str))	return false;	// offset beyond string length

			// Count the character starts in the bytes 1..$pos. The start at byte 0 is deliberately left out,
			// which makes the count equal to the 0-based index of the character containing byte $pos.
		$n = 0;	// number of characters
		for ($i=$pos; $i>0; $i--)	{
			if ((ord($str[$i]) & 0xC0) != 0x80)	$n++;	// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		}

		return $n;
	}
918
919
920
921
922
923
924
925
926
927
928
929
930
931 /********************************************
932 *
933 * EUC String operation functions
934 *
935 * Extended Unix Code:
936 * ASCII compatible 7bit single bytes chars
937 * 8bit two byte chars
938 *
939 * Shift-JIS is handled as a special case
940 *
941 ********************************************/
942
/**
	 * Cuts a string in the EUC charset family short at a given byte length.
	 *
	 * The result is never longer than $len bytes; a double-byte character that would be split by
	 * the cut is dropped completely. For Shift-JIS the bytes 0x80-0x9F and 0xE0-0xFF start a
	 * double-byte character, for all other charsets every byte from 0x80 does.
	 *
	 * @param	string		EUC multibyte character string
	 * @param	integer		the byte length
	 * @param	string		the charset
	 * @return	string		the shortened string
	 * @see mb_strcut()
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function euc_strtrunc($str,$len,$charset)	{
		if ($len <= 0)	return '';

		if (isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) && $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
			return mb_strcut($str,0,$len,$charset);
		}

		$str = (string)$str;
		if (strlen($str) <= $len)	return $str;	// string not longer than supplied length

		$sjis = ($charset == 'shift_jis');
		for ($i=0; $i<$len; $i++)	{	// $i always stays below the byte length here, see the check above
			$c = ord($str[$i]);
			if ($sjis)	{
				if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))	$i++;	// advance a double-byte char
			} else {
				if ($c >= 0x80)	$i++;	// advance a double-byte char
			}
		}

		if ($i>$len)	{
			return substr($str,0,$len-1);	// the byte at $len-1 is the first byte of a double-byte char
		}
		return substr($str,0,$len);
	}
977
/**
	 * Counts the number of characters of a string in the EUC charset family.
	 *
	 * For Shift-JIS the bytes 0x80-0x9F and 0xE0-0xFF start a double-byte character,
	 * for all other charsets every byte from 0x80 does.
	 *
	 * @param	string		EUC multibyte character string
	 * @param	string		the charset
	 * @return	int		the number of characters
	 * @see strlen()
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function euc_strlen($str,$charset)	{
		if (isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) && $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
			return mb_strlen($str,$charset);
		}

		$str = (string)$str;
		$byteLen = strlen($str);	// loop on the byte length; testing the byte itself would stop at a "0" character
		$sjis = ($charset == 'shift_jis');
		$n = 0;
		for ($i=0; $i<$byteLen; $i++)	{
			$c = ord($str[$i]);
			if ($sjis)	{
				if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))	$i++;	// advance a double-byte char
			} else {
				if ($c >= 0x80)	$i++;	// advance a double-byte char
			}

			$n++;
		}

		return $n;
	}
1008
1009 }
1010
// Load an extension class (XCLASS) for this file, if one is registered for the current TYPO3_MODE:
if (defined('TYPO3_MODE'))	{
	if ($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php'])	{
		include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
	}
}
1014 ?>