Support for KOI-8R added
[Packages/TYPO3.CMS.git] / t3lib / class.t3lib_cs.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2003-2004 Kasper Skaarhoj (kasper@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the Typo3 project. The Typo3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 *
17 * This script is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * This copyright notice MUST APPEAR in all copies of the script!
23 ***************************************************************/
24 /**
25 * Class for conversion between charsets.
26 *
27 * $Id$
28 *
29 * @author Kasper Skaarhoj <kasper@typo3.com>
30 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
31 */
32 /**
33 * [CLASS/FUNCTION INDEX of SCRIPT]
34 *
35 *
36 *
37 * 108: class t3lib_cs
38 * 237: function parse_charset($charset)
39 * 254: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
40 * 288: function utf8_encode($str,$charset)
41 * 329: function utf8_decode($str,$charset,$useEntityForNoChar=0)
42 * 377: function utf8_to_entities($str)
43 * 410: function entities_to_utf8($str,$alsoStdHtmlEnt=0)
44 * 441: function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
45 * 482: function initCharset($charset)
46 * 553: function UnumberToChar($cbyte)
47 * 597: function utf8CharToUnumber($str,$hex=0)
48 *
49 * SECTION: UTF-8 String operation functions
50 * 648: function utf8_strtrunc($str,$len)
51 * 680: function utf8_substr($str,$start,$len=null)
52 * 719: function utf8_strlen($str)
53 * 745: function utf8_strpos($haystack,$needle,$offset=0)
54 * 768: function utf8_strrpos($haystack,$needle)
55 * 787: function utf8_char2byte_pos($str,$pos)
56 * 812: function utf8_byte2char_pos($str,$pos)
57 *
58 * TOTAL FUNCTIONS: 17
59 * (This index is automatically created/updated by the extension "extdeveval")
60 *
61 */
62
63
64
65
66
67
68
69
70 /**
71 * Notes on UTF-8
72 *
73 * Functions working on UTF-8 strings:
74 *
75 * - strchr/strstr
76 * - strrchr
77 * - substr_count
78 * - implode/explode/join
79 *
80 * Functions nearly working on UTF-8 strings:
81 *
82 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
83 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
84 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
85 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
86 *
87 * Functions NOT working on UTF-8 strings:
88 *
89 * - str*cmp
90 * - stristr
91 * - stripos
92 * - substr
93 * - strrev
94 * - ereg/eregi
95 * - split/spliti
96 * - preg_*
97 * - ...
98 *
99 */
100 /**
101 * Class for conversion between charsets.
102 *
103 * @author Kasper Skaarhoj <kasper@typo3.com>
104 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
105 * @package TYPO3
106 * @subpackage t3lib
107 */
108 class t3lib_cs {
109 var $noCharByteVal=127; // ASCII value emitted for characters that have no equivalent.
110
111 // Cache of parsed conversion tables, keyed by charset name (filled by initCharset()).
112 var $parsedCharsets=array();
113
114 // This tells the converter which charsets have two bytes per char:
115 var $twoByteSets=array(
116 'ucs-2'=>1, // 2-byte Unicode
117 );
118
119 // This tells the converter which charsets have four bytes per char:
120 var $fourByteSets=array(
121 'ucs-4'=>1, // 4-byte Unicode
122 'utf-32'=>1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
123 );
124
125 // This tells the converter which charsets use a scheme like the Extended Unix Code:
126 var $eucBasedSets=array(
127 'gb2312'=>1, // Chinese, simplified.
128 'big5'=>1, // Chinese, traditional.
129 );
130
// Maps lowercase charset aliases to the canonical charset names used by this class.
// Looked up by parse_charset() after the input has been lowercased.
// see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
// http://czyborra.com/charsets/iso8859.html
var $synonyms=array(
	'us' => 'ascii',
	'us-ascii'=> 'ascii',
	'cp819' => 'iso-8859-1',
	'ibm819' => 'iso-8859-1',
	'iso-ir-100' => 'iso-8859-1',
	'iso-ir-109' => 'iso-8859-2',
	'iso-ir-148' => 'iso-8859-9',
	'iso-ir-199' => 'iso-8859-14',
	'iso-ir-203' => 'iso-8859-15',
	'csisolatin1' => 'iso-8859-1',
	'csisolatin2' => 'iso-8859-2',
	'csisolatin3' => 'iso-8859-3',
	'csisolatin5' => 'iso-8859-9',
	'csisolatin8' => 'iso-8859-14',
	'csisolatin9' => 'iso-8859-15',
	'csisolatingreek' => 'iso-8859-7',
	'iso-celtic' => 'iso-8859-14',
	'latin1' => 'iso-8859-1',
	'latin2' => 'iso-8859-2',
	'latin3' => 'iso-8859-3',
	'latin5' => 'iso-8859-9',
	'latin6' => 'iso-8859-10',
	'latin8' => 'iso-8859-14',
	'latin9' => 'iso-8859-15',
	'l1' => 'iso-8859-1',
	'l2' => 'iso-8859-2',
	'l3' => 'iso-8859-3',
	'l5' => 'iso-8859-9',
	'l6' => 'iso-8859-10',
	'l8' => 'iso-8859-14',
	'l9' => 'iso-8859-15',
	'cyrillic' => 'iso-8859-5',
	'arabic' => 'iso-8859-6',
	'win874' => 'windows-874',
	'win1250' => 'windows-1250',
	'win1251' => 'windows-1251',
	'win1252' => 'windows-1252',
	'win1253' => 'windows-1253',
	'win1254' => 'windows-1254',
	'win1255' => 'windows-1255',
	'win1256' => 'windows-1256',
	'win1257' => 'windows-1257',
	'win1258' => 'windows-1258',
	'cp1250' => 'windows-1250',
	'cp1252' => 'windows-1252',
	'ms-ee' => 'windows-1250',
	'ms-ansi' => 'windows-1252',
	'ms-greek' => 'windows-1253',
	'ms-turk' => 'windows-1254',
	'winbaltrim' => 'windows-1257',
	'koi-8ru' => 'koi-8r',
	'koi8r' => 'koi-8r',
	'mac' => 'macRoman',
	'macintosh' => 'macRoman',
	'euc-cn' => 'gb2312',
	'x-euc-cn' => 'gb2312',
	'cp936' => 'gb2312',
	'big-5' => 'big5',
	'cp950' => 'big5',
	'sjis' => 'shift_jis',
	'shift-jis' => 'shift_jis',
	'cp932' => 'shift_jis',
	'utf7' => 'utf-7',
	'utf8' => 'utf-8',	// listed once; the original table repeated this key
	'utf16' => 'utf-16',
	'utf32' => 'utf-32',
	'ucs2' => 'ucs-2',
	'ucs4' => 'ucs-4',
);
204
205 // TYPO3 specific: maps each TYPO3 system language key to the charset used for that language.
206 // An empty value means "iso-8859-1".
207 var $charSetArray = array(
208 'dk' => '',
209 'de' => '',
210 'no' => '',
211 'it' => '',
212 'fr' => '',
213 'es' => '',
214 'nl' => '',
215 'cz' => 'windows-1250',
216 'pl' => 'iso-8859-2',
217 'si' => 'windows-1250',
218 'fi' => '',
219 'tr' => 'iso-8859-9',
220 'se' => '',
221 'pt' => '',
222 'ru' => 'windows-1251',
223 'ro' => 'iso-8859-2',
224 'ch' => 'gb2312',
225 'sk' => 'windows-1250',
226 'lt' => 'windows-1257',
227 'is' => 'utf-8',
228 'hr' => 'windows-1250',
229 'hu' => 'iso-8859-2',
230 'gl' => '',
231 'th' => 'iso-8859-11',
232 'gr' => 'iso-8859-7',
233 'hk' => 'big5',
234 'eu' => '',
235 'bg' => 'windows-1251',
236 'br' => '',
237 'et' => 'iso-8859-4',
238 'ar' => 'iso-8859-6',
239 'he' => 'utf-8',
240 'ua' => 'windows-1251',
241 );
242
/**
 * Normalizes a charset name: the name is lowercased and, if it is a known
 * alias in $this->synonyms, replaced by the canonical name.
 *
 * @param string $charset Input charset
 * @return string Normalized charset
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function parse_charset($charset) {
	$normalized = strtolower($charset);

	return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
}
256
257
/**
 * Convert from one charset to another charset.
 *
 * Unless entity fallback is requested, the conversion is first attempted with
 * the method named in $TYPO3_CONF_VARS['SYS']['t3lib_cs_convMethod']
 * ('iconv', 'recode' or 'mbstring'). If no method is configured, or the
 * method returns FALSE, the table based conversion via UTF-8 is used.
 *
 * @param string $str Input string
 * @param string $fromCS From charset (the current charset of the string)
 * @param string $toCS To charset (the output charset wanted)
 * @param boolean $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Converted string
 */
function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) {
	global $TYPO3_CONF_VARS;

	if ($fromCS==$toCS) return $str;

	if (!$useEntityForNoChar) {	// iconv and recode don't support fallback to SGML entities
		$method = isset($TYPO3_CONF_VARS['SYS']['t3lib_cs_convMethod']) ? $TYPO3_CONF_VARS['SYS']['t3lib_cs_convMethod'] : '';
		$conv_str = false;

		if ($method == 'iconv') {
				// iconv() takes (in_charset, out_charset, string)
			$conv_str = iconv($fromCS,$toCS.'//TRANSLIT',$str);
		} elseif ($method == 'recode') {
				// A recode request reads "before..after"
			$conv_str = recode_string($fromCS.'..'.$toCS,$str);
		} elseif ($method == 'mbstring') {
				// mb_convert_encoding() takes (string, to, from)
			$conv_str = mb_convert_encoding($str,$toCS,$fromCS);
		}

		if (false !== $conv_str) return $conv_str;
			// otherwise: fallback to TYPO3 conversion
	}

	if ($fromCS!='utf-8') $str = $this->utf8_encode($str,$fromCS);
	if ($toCS!='utf-8') $str = $this->utf8_decode($str,$toCS,$useEntityForNoChar);
	return $str;
}
292
293
/**
 * Converts $str from $charset to UTF-8
 *
 * Bytes 0-127 are copied unchanged. Other byte values (or byte pairs, for
 * two-byte, EUC based and Shift-JIS charsets) are looked up in the parsed
 * conversion table; values without an entry become chr($this->noCharByteVal).
 *
 * @param string $str String in local charset to convert to UTF-8
 * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return string Output string, converted to UTF-8. Nothing (NULL) is returned if the charset could not be initialized.
 */
function utf8_encode($str,$charset) {
		// Parse conv. table if not already done. Without a table nothing is returned.
	if (!$this->initCharset($charset)) return;

	$isTwoByte = !empty($this->twoByteSets[$charset]);
	$isEuc = !empty($this->eucBasedSets[$charset]);
	$isSjis = ($charset == 'shift_jis');
	$noChar = chr($this->noCharByteVal);

	$strLen = strlen($str);
	$outStr = '';

	for ($a=0; $a<$strLen; $a++) {	// Traverse each char in string.
		$chr = substr($str,$a,1);
		$ord = ord($chr);

		if ($isTwoByte) {	// The charset has two bytes per char
			$ord2 = ($a+1 < $strLen) ? ord($str[$a+1]) : 0;
			$ord = ($ord << 8) | $ord2;	// assume big endian; the bytes are combined with OR
			$a++;
		} elseif ($ord > 127) {
				// EUC uses two bytes above 127. Shift-JIS is like EUC, but chars between 160 and 223 are single byte.
			if ($isEuc || ($isSjis && ($ord<160 || $ord>223))) {
				$a++;
				$ord2 = ord(substr($str,$a,1));
				$ord = $ord*256+$ord2;
			}
		} else {	// Just ASCII 0-127 and one byte. Transparent
			$outStr .= $chr;
			continue;
		}

			// Use the parsed conv. table if the local char-number is found there, otherwise the "no char" byte.
		if (isset($this->parsedCharsets[$charset]['local'][$ord])) {
			$outStr .= $this->parsedCharsets[$charset]['local'][$ord];
		} else {
			$outStr .= $noChar;
		}
	}
	return $outStr;
}
339
/**
 * Converts $str from UTF-8 to $charset
 *
 * @param string $str String in UTF-8 to convert to local charset
 * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param boolean $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Output string, converted to local charset. Nothing (NULL) is returned if the charset could not be initialized.
 */
function utf8_decode($str,$charset,$useEntityForNoChar=0) {
		// Parse conv. table if not already done.
	if (!$this->initCharset($charset)) return;

	$noChar = chr($this->noCharByteVal);
	$total = strlen($str);
	$result = '';

	for ($pos=0; $pos<$total; $pos++) {	// Traverse each byte in the UTF-8 string.
		$byte = substr($str,$pos,1);
		$code = ord($byte);

		if ($code <= 127) {	// Plain ASCII, one byte. Transparent
			$result .= $byte;
			continue;
		}
		if (!($code & 64)) {	// A first byte must have the 7th bit set; this is the MIDDLE of a sequence.
			$result .= $noChar;
			continue;
		}

			// Collect the sequence: every further high bit set in the first byte announces one more byte.
		$seq = $byte;
		for ($n=0; $n<8; $n++) {
			$code = $code << 1;
			if (!($code & 128)) break;
			$pos++;
			$seq .= substr($str,$pos,1);
		}

		if (isset($this->parsedCharsets[$charset]['utf8'][$seq])) {
			$local = $this->parsedCharsets[$charset]['utf8'][$seq];	// The local number
				// Numbers above 255 are written as two bytes (16 bit word assumed).
			$result .= ($local > 255) ? chr(($local >> 8) & 255).chr($local & 255) : chr($local);
		} elseif ($useEntityForNoChar) {	// Create num entity:
			$result .= '&#'.$this->utf8CharToUnumber($seq,1).';';
		} else {	// No char exists
			$result .= $noChar;
		}
	}
	return $result;
}
389
/**
 * Converts all chars > 127 to numeric entities.
 *
 * Each multibyte sequence becomes one hexadecimal entity. A byte that can
 * only be the middle of a sequence becomes chr($this->noCharByteVal).
 *
 * @param string $str Input string
 * @return string Output string
 */
function utf8_to_entities($str) {
	$total = strlen($str);
	$result = '';
	$pos = 0;

	while ($pos < $total) {	// Traverse each byte in the UTF-8 string.
		$byte = substr($str,$pos,1);
		$code = ord($byte);

		if ($code <= 127) {	// Plain ASCII, one byte. Transparent
			$result .= $byte;
		} elseif (!($code & 64)) {	// Not a valid first byte (MIDDLE of a sequence)
			$result .= chr($this->noCharByteVal);
		} else {
				// Collect the sequence: every further high bit set in the first byte announces one more byte.
			$seq = $byte;
			for ($n=0; $n<8; $n++) {
				$code = $code << 1;
				if (!($code & 128)) break;
				$pos++;
				$seq .= substr($str,$pos,1);
			}
			$result .= '&#'.$this->utf8CharToUnumber($seq,1).';';
		}
		$pos++;
	}

	return $result;
}
421
/**
 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
 *
 * Entities that are neither well-formed numeric entities nor (when requested)
 * known named entities are left untouched.
 *
 * @param string $str Input string, UTF-8
 * @param boolean $alsoStdHtmlEnt If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
 * @return string Output string
 */
function entities_to_utf8($str,$alsoStdHtmlEnt=0) {
	$trans_tbl = array();
	if ($alsoStdHtmlEnt) {
			// NOTE(review): the table values are passed through utf8_encode(...,'iso-8859-1') below, which
			// presumes an ISO-8859-1 table; newer PHP versions may return UTF-8 values by default - confirm.
		$trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES));
	}

		// Split on entities and keep the entity names: even indexes hold plain text, odd indexes the captured name.
	$parts = preg_split('/&([#[:alnum:]]*);/',$str,-1,PREG_SPLIT_DELIM_CAPTURE);

	foreach ($parts as $k => $v) {
		if (!($k%2)) continue;

		$match = array();
		if (preg_match('/^#[xX]([0-9a-fA-F]+)$/',$v,$match)) {	// Hex entity; the number is converted exactly once
			$parts[$k] = $this->UnumberToChar(hexdec($match[1]));
		} elseif (preg_match('/^#([0-9]+)$/',$v,$match)) {	// Dec entity
			$parts[$k] = $this->UnumberToChar(intval($match[1]));
		} elseif ($alsoStdHtmlEnt && substr($v,0,1)!='#' && isset($trans_tbl['&'.$v.';'])) {	// Other entities:
			$parts[$k] = $this->utf8_encode($trans_tbl['&'.$v.';'],'iso-8859-1');
		} else {	// No conversion:
			$parts[$k] = '&'.$v.';';
		}
	}

	return implode('',$parts);
}
451
/**
 * Converts all chars in the input UTF-8 string into integer numbers returned in an array
 *
 * @param string $str Input string, UTF-8
 * @param boolean $convEntities If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
 * @param boolean $retChar If set, then instead of integer numbers the real UTF-8 char is returned.
 * @return array Output array with the char numbers
 */
function utf8_to_numberarray($str,$convEntities=0,$retChar=0) {
		// If entities must be registered as well, resolve them first:
	if ($convEntities) $str = $this->entities_to_utf8($str,1);

	$total = strlen($str);
	$result = array();

	for ($pos=0; $pos<$total; $pos++) {	// Traverse each byte in the UTF-8 string.
		$byte = substr($str,$pos,1);
		$code = ord($byte);

		if ($code <= 127) {	// Plain ASCII, one byte. Transparent
			$result[] = $retChar ? chr($code) : $code;
			continue;
		}
		if (!($code & 64)) {	// Not a valid first byte (MIDDLE of a sequence)
			$result[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
			continue;
		}

			// Collect the sequence: every further high bit set in the first byte announces one more byte.
		$seq = $byte;
		for ($n=0; $n<8; $n++) {
			$code = $code << 1;
			if (!($code & 128)) break;
			$pos++;
			$seq .= substr($str,$pos,1);
		}
		$result[] = $retChar ? $seq : $this->utf8CharToUnumber($seq);
	}

	return $result;
}
490
/**
 * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
 * This function is automatically called by the conversion functions
 *
 * The parsed table is stored in $this->parsedCharsets[$charset] as
 * array('local' => local number => UTF-8 char, 'utf8' => UTF-8 char => local number).
 * Only local numbers above 127 are stored. A serialized copy is cached in typo3temp/.
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param string $charset The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return integer Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed.
 * @access private
 */
function initCharset($charset) {
		// Only process if the charset is not yet loaded:
	if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) return 1;

		// Conversion table filename:
	$charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';
	if (!$charset || !t3lib_div::validPathStr($charsetConvTableFile) || !@is_file($charsetConvTableFile)) return false;

		// Cache file for charsets:
		// Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/charset_'.$charset.'.tbl');
	$parsed = false;
	if ($cacheFile && @is_file($cacheFile)) {
			// NOTE(review): unserialize() on file content; this presumes typo3temp/ is only written by trusted code.
		$parsed = unserialize(t3lib_div::getUrl($cacheFile));
	}

		// No cache, or the cache did not hold an array: parse the conversion table.
	if (!is_array($parsed)) {
			// Matches the "whitespaced" syntax "0x0A 0x000A #LINE FEED"
		$linePattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';

		$parsed = array('local'=>array(),'utf8'=>array());
		$lines = t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1);
		$detectedType = '';

		foreach ($lines as $value) {
				// Comment lines or blanks are ignored.
			if (!trim($value) || substr($value,0,1)=='#') continue;

				// Detect type if not done yet: (Done on first real line)
				// The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
			if (!$detectedType) $detectedType = preg_match($linePattern,$value) ? 'whitespaced' : 'ms-token';

			$hexbyte = '';
			$utf8 = '';
			if ($detectedType=='ms-token') {
				$tokens = preg_split('/[=:]/',$value,3);
				$hexbyte = $tokens[0];
				if (isset($tokens[1])) $utf8 = $tokens[1];
			} else {
				$regA = array();
				if (preg_match($linePattern,$value,$regA)) {
					$hexbyte = $regA[1];
					$utf8 = 'U+'.$regA[2];
				}
			}

			$decval = hexdec(trim($hexbyte));
			if ($decval>127) {
				$utf8decval = hexdec(substr(trim($utf8),2));	// strip the leading "U+"
				$utf8Char = $this->UnumberToChar($utf8decval);
				$parsed['local'][$decval] = $utf8Char;
				$parsed['utf8'][$utf8Char] = $decval;
			}
		}

		if ($cacheFile) {
			t3lib_div::writeFile($cacheFile,serialize($parsed));
		}
	}

	$this->parsedCharsets[$charset] = $parsed;
	return 2;
}
553
/**
 * Converts a UNICODE number to a UTF-8 multibyte character
 * Algorithm based on script found at From: http://czyborra.com/utf/
 *
 * The binary representation of the character's integer value is spread across the bytes, and the number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
 *
 * bytes | bits | representation
 *     1 |    7 | 0vvvvvvv
 *     2 |   11 | 110vvvvv 10vvvvvv
 *     3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
 *     4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 *     5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *     6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * @param integer $cbyte UNICODE integer
 * @return string UTF-8 multibyte character string
 * @see utf8CharToUnumber()
 */
function UnumberToChar($cbyte) {
	if ($cbyte < 0x80) {	// 7 bits: one byte, as is
		return chr($cbyte);
	}

		// Pick the lead byte marker and the number of trailing bytes from the value range
	if ($cbyte < 0x800) {
		$leadMark = 0xC0;
		$trailing = 1;
	} elseif ($cbyte < 0x10000) {
		$leadMark = 0xE0;
		$trailing = 2;
	} elseif ($cbyte < 0x200000) {
		$leadMark = 0xF0;
		$trailing = 3;
	} elseif ($cbyte < 0x4000000) {
		$leadMark = 0xF8;
		$trailing = 4;
	} elseif ($cbyte < 0x80000000) {
		$leadMark = 0xFC;
		$trailing = 5;
	} else {	// Cannot express a 32-bit character in UTF-8
		return chr($this->noCharByteVal);
	}

		// Lead byte carries the topmost bits, each trailing byte the next six bits
	$char = chr($leadMark | ($cbyte >> (6*$trailing)));
	for ($n=$trailing-1; $n>=0; $n--) {
		$char .= chr(0x80 | (($cbyte >> (6*$n)) & 0x3F));
	}
	return $char;
}
607
/**
 * Converts a UTF-8 Multibyte character to a UNICODE number
 *
 * If the first byte is not a multibyte lead byte, its byte value is used as the number.
 *
 * @param string $str UTF-8 multibyte character string
 * @param boolean $hex If set, then a hex. number is returned (as string with an "x" prefix).
 * @return integer UNICODE integer
 * @see UnumberToChar()
 */
function utf8CharToUnumber($str,$hex=0) {
	$lead = ord(substr($str,0,1));	// First char

	if (($lead & 192) != 192) {	// Not the start of a multi byte sequence
		$int = $lead;
	} else {
		$shifted = $lead;
		$tailBits = '';
		$b = 0;
		while ($b < 8) {	// Every further high bit set in the lead byte announces one more byte
			$shifted = $shifted << 1;
			if (!($shifted & 128)) break;
				// Each following byte contributes its lower six bits
			$tailBits .= sprintf('%06b',ord(substr($str,$b+1,1)) & 63);
			$b++;
		}
			// The lead byte contributes its lowest (6 - number of following bytes) bits
		$leadBits = substr('00000000'.decbin($lead),-(6-$b));

		$int = bindec($leadBits.$tailBits);
	}

	if ($hex) return 'x'.dechex($int);
	return $int;
}
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652 /********************************************
653 *
654 * String operation functions
655 *
656 ********************************************/
657
/**
 * Cuts a string short at a given byte length.
 *
 * Multibyte characters are never cut in the middle: for fixed-width charsets
 * the length is rounded down to a whole character, for UTF-8, Shift-JIS and
 * EUC based charsets the charset specific functions are used.
 *
 * @param string $charset the character set
 * @param string $string character string
 * @param integer $len the byte length
 * @return string the shortened string
 * @see mb_strcut()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strtrunc($charset,$string,$len) {
	$useMbstring = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) && $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring';

	if ($useMbstring) {
		return mb_strcut($string,0,$len,$charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_strtrunc($string,$len);
	} elseif ($charset == 'shift_jis') {
		return $this->euc_strtrunc($string,$len,'shift_jis');
	} elseif (!empty($this->eucBasedSets[$charset])) {
		return $this->euc_strtrunc($string,$len,$charset);
	} elseif (!empty($this->twoByteSets[$charset])) {
		if ($len % 2) $len--;	// don't cut at odd positions
	} elseif (!empty($this->fourByteSets[$charset])) {
		$len -= $len % 4;	// realign to position dividable by four
	}
		// treat everything else as single-byte encoding
	return substr($string,0,$len);
}
686
/**
 * Returns a part of a string.
 *
 * Negative values for @arg $start and @arg $len are currently not supported.
 *
 * @param string $charset the character set
 * @param string $str character string
 * @param int $start start position (character position)
 * @param int $len length (in characters); if omitted, the rest of the string is returned
 * @return string the substring
 * @see substr()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function substr($charset,$str,$start,$len=null) {
	if ($len===0) return '';

	$toEnd = ($len === null);
	$useMbstring = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) && $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring';

	if ($useMbstring) {
			// cannot omit $len, when specifying charset
		if ($toEnd) {
			$enc = mb_internal_encoding();	// save internal encoding
			mb_internal_encoding($charset);
			$part = mb_substr($str,$start);
			mb_internal_encoding($enc);	// restore internal encoding

			return $part;
		}
		return mb_substr($str,$start,$len,$charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_substr($str,$start,$len);
	} elseif ($charset == 'shift_jis') {
		return $this->euc_substr($str,$start,$len,'shift_jis');
	} elseif (!empty($this->eucBasedSets[$charset])) {
			// NOTE(review): euc_substr() is not visible here; the charset is passed in the same
			// position as in the shift_jis call above - confirm against its signature.
		return $this->euc_substr($str,$start,$len,$charset);
	} elseif (!empty($this->twoByteSets[$charset])) {
		return $toEnd ? substr($str,$start*2) : substr($str,$start*2,$len*2);
	} elseif (!empty($this->fourByteSets[$charset])) {
		return $toEnd ? substr($str,$start*4) : substr($str,$start*4,$len*4);
	}

		// treat everything else as single-byte encoding
	return $toEnd ? substr($str,$start) : substr($str,$start,$len);
}
731
/**
 * Counts the number of characters.
 *
 * Uses mbstring when configured in $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'];
 * otherwise the charset decides which counting function is used.
 *
 * @param string $charset the character set
 * @param string $string character string
 * @return integer the number of characters
 * @see strlen()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strlen($charset,$string) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strlen($string,$charset);
	}
	if ($charset == 'utf-8') {
		return $this->utf8_strlen($string);
	}
	if ($charset == 'shift_jis') {
		return $this->euc_strlen($string,'shift_jis');
	}
	if ($this->eucBasedSets[$charset]) {
		return $this->euc_strlen($string,$charset);
	}

		// Fixed-width charsets: byte length divided by the character width
	if ($this->twoByteSets[$charset]) {
		return strlen($string)/2;
	}
	if ($this->fourByteSets[$charset]) {
		return strlen($string)/4;
	}

		// treat everything else as single-byte encoding
	return strlen($string);
}
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774 /********************************************
775 *
776 * UTF-8 String operation functions
777 *
778 ********************************************/
779
/**
 * Truncates a string in UTF-8 short at a given byte length.
 *
 * The result never ends in the middle of a multibyte character: a character
 * that does not fit completely into $len bytes is dropped.
 *
 * @param string $str UTF-8 multibyte character string
 * @param integer $len the byte length
 * @return string the shortened string
 * @see mb_strcut()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strtrunc($str,$len) {
	if ($len <= 0) return '';
	if ($len >= strlen($str)) return $str;	// nothing to cut

	$i = $len-1;
	if (ord($str[$i]) & 0x80) {	// part of a multibyte sequence
			// step back over continuation bytes (10xxxxxx) to find the first byte
		while ($i > 0 && !(ord($str[$i]) & 0x40)) $i--;

		$lead = ord($str[$i]);
		if (($lead & 0xC0) != 0xC0) return '';	// sanity check: no lead byte found

			// calculate number of bytes announced by the lead byte
		for ($bc=0; $lead & 0x80; $lead = $lead << 1) $bc++;
		if ($bc+$i > $len) return substr($str,0,$i);
			// fallthru: multibyte char fits into length
	}
	return substr($str,0,$len);
}
802
/**
 * Returns a part of a UTF-8 string.
 *
 * Negative values for @arg $start and @arg $len are currently not supported.
 *
 * @param string $str UTF-8 string
 * @param int $start start position (character position)
 * @param int $len length (in characters)
 * @return string the substring, or FALSE if no byte position could be found for $start
 * @see substr()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_substr($str,$start,$len=null) {
	if ($len===0) return '';

		// Translate the character start position into a byte position
	$from = $this->utf8_char2byte_pos($str,$start);
	if ($from === false) return false;	// $start outside string length

	$tail = substr($str,$from);
	if ($len==null) return $tail;

		// Translate the character length into a byte length, relative to the tail
	$cut = $this->utf8_char2byte_pos($tail,$len);

		// $len outside actual string length: everything that is left is returned
	return ($cut === false) ? $tail : substr($tail,0,$cut);
}
834
/**
 * Counts the number of characters of a string in UTF-8.
 *
 * Every byte that is not a continuation byte (10xxxxxx) starts a character,
 * so single bytes and multibyte lead bytes are counted.
 *
 * @param string $str UTF-8 multibyte character string
 * @return int the number of characters
 * @see strlen()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strlen($str) {
	$str = (string)$str;
	$byteLen = strlen($str);	// iterate by length, so a '0' character does not end the loop

	$n = 0;
	for ($i=0; $i<$byteLen; $i++) {
		if ((ord($str[$i]) & 0xC0) != 0x80) $n++;	// single-byte (0xxxxxx) or multi-byte starting byte (11xxxxxx)
	}
	return $n;
}
854
/**
 * Find position of first occurrence of a string, both arguments are in UTF-8.
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 string to search for
 * @param int $offset position (in characters) to start the search
 * @return int the character position, or FALSE if the needle was not found or the offset is beyond the string
 * @see strpos()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strpos($haystack,$needle,$offset=0) {
	if (isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) && $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
			// mb_strpos() takes the offset as third and the encoding as fourth argument
		return mb_strpos($haystack,$needle,$offset,'utf-8');
	}

	$byte_offset = $this->utf8_char2byte_pos($haystack,$offset);
	if ($byte_offset === false) return false;	// offset beyond string length

	$byte_pos = strpos($haystack,$needle,$byte_offset);
	if ($byte_pos === false) return false;	// needle not found

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
878
/**
 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
 *
 * @param string $haystack UTF-8 string to search in
 * @param char $needle UTF-8 character to search for
 * @return int the character position, or FALSE if the needle was not found
 * @see strrpos()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strrpos($haystack,$needle) {
	if (isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) && $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
			// The encoding is set via the internal encoding instead of an argument,
			// so the call does not depend on the argument position of the encoding.
		$enc = mb_internal_encoding();	// save internal encoding
		mb_internal_encoding('utf-8');
		$pos = mb_strrpos($haystack,$needle);
		mb_internal_encoding($enc);	// restore internal encoding

		return $pos;
	}

	$byte_pos = strrpos($haystack,$needle);
	if ($byte_pos === false) return false;	// needle not found

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
898
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * @param string $str UTF-8 string
 * @param int $pos character position
 * @return int byte position of the character, or FALSE if the position is at or beyond the end of the string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char2byte_pos($str,$pos) {
	$str = (string)$str;
	$byteLen = strlen($str);	// iterate by length, so a '0' character is not mistaken for the end

		// Walk forward until $pos character start bytes have been passed
	$n = 0;	// number of characters
	for ($i=0; $i<$byteLen && $n<$pos; $i++) {
		if ((ord($str[$i]) & 0xC0) != 0x80) $n++;	// single-byte (0xxxxxx) or multi-byte starting byte (11xxxxxx)
	}

		// skip trailing multi-byte data bytes (10xxxxxx) of the last passed character
	while ($i<$byteLen && (ord($str[$i]) & 0xC0) == 0x80) $i++;

	if ($i >= $byteLen) return false;	// offset beyond string length

	return $i;
}
923
/**
 * Translates an 'absolute' byte position into a character position.
 *
 * @param	string		UTF-8 string
 * @param	int		Byte position
 * @return	int		Character position; false if the byte position lies outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_byte2char_pos($str,$pos) {
		// Reject positions that do not address a byte of the string. The bounds are checked
		// against the byte length so that a leading "0" character is not mistaken for an
		// empty string and out-of-range bytes are never counted as characters.
	if ($pos < 0 || $pos >= strlen($str)) return false;	// offset beyond string length

	$n = 0;	// number of characters
		// Walk back from the given byte to (but excluding) the first byte and count every
		// byte that starts a character, i.e. every byte that is not a continuation byte
		// (10xxxxxx). Byte 0 is taken to start character 0, so it is not counted.
	for ($i=$pos; $i>0; $i--) {
		if ((ord($str[$i]) & 0xC0) != 0x80) $n++;
	}

	return $n;
}
945
946
947
948
949
950
951
952
953
954
955
956
957
958 /********************************************
959 *
960 * EUC String operation functions
961 *
962 * Extended Unix Code:
963 * ASCII compatible 7bit single bytes chars
964 * 8bit two byte chars
965 *
966 * Shift-JIS is treated as a special case.
967 *
968 ********************************************/
969
/**
 * Cuts a string in the EUC charset family short at a given byte length.
 * A double-byte character that would straddle the limit is dropped entirely.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		The byte length
 * @param	string		The charset ('shift_jis' selects the Shift-JIS lead byte ranges)
 * @return	string		The shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strtrunc($str,$len,$charset) {
	if ($len <= 0) return '';

		// The end of the string is detected by its byte length; testing the truthiness of
		// a single byte would treat a literal "0" character as the end of the string.
	if (strlen($str) <= $len) return $str;	// string not longer than supplied length

	$sjis = ($charset == 'shift_jis');
	for ($i=0; $i<$len; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++;	// advance a double-byte char
		}
		else {
			if ($c >= 0x80) $i++;	// advance a double-byte char
		}
	}

	if ($i>$len)
		return substr($str,0,$len-1);	// the byte at $len-1 is the first byte of a double-byte char
	else
		return substr($str,0,$len);
}
1000
/**
 * Returns a part of a string in the EUC charset family.
 *
 * Negative values for @arg $start and @arg $len are currently not supported.
 *
 * @param	string		EUC multibyte character string
 * @param	int		Start position (character position)
 * @param	string		The charset
 * @param	int		Length (in characters); when omitted the rest of the string is returned
 * @return	string		The substring; false if the start position lies outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_substr($str,$start,$charset,$len=null) {
	if ($len === 0) {
		return '';
	}

		// locate the byte at which the requested substring begins
	$firstByte = $this->euc_char2byte_pos($str, $start, $charset);
	if ($firstByte === false) {
		return false;	// $start outside string length
	}

	$tail = substr($str, $firstByte);
	if ($len == null) {
		return $tail;	// no length given: everything from the start position on
	}

		// locate the byte at which the requested number of characters ends
	$cutAt = $this->euc_char2byte_pos($tail, $len, $charset);
	if ($cutAt === false) {
		return $tail;	// $len outside actual string length
	}

	return substr($tail, 0, $cutAt);
}
1031
/**
 * Counts the number of characters of a string in the EUC charset family.
 *
 * @param	string		EUC multibyte character string
 * @param	string		The charset ('shift_jis' selects the Shift-JIS lead byte ranges)
 * @return	int		The number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strlen($str,$charset) {
	$sjis = ($charset == 'shift_jis');
		// Iterate by byte length; testing the truthiness of a single byte would end the
		// count prematurely at a literal "0" character.
	$byteLen = strlen($str);

	$n = 0;	// number of characters
	for ($i=0; $i<$byteLen; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++;	// advance a double-byte char
		}
		else {
			if ($c >= 0x80) $i++;	// advance a double-byte char
		}

		$n++;
	}

	return $n;
}
1058
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * @param	string		EUC multibyte character string
 * @param	int		Character position
 * @param	string		The charset ('shift_jis' selects the Shift-JIS lead byte ranges)
 * @return	int		Byte position at which the character starts; false if the position lies at or beyond the end of the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char2byte_pos($str,$pos,$charset) {
	$sjis = ($charset == 'shift_jis');
		// The end of the string is detected by its byte length; testing the truthiness of
		// a single byte would mistake a literal "0" character for the end of the string.
	$byteLen = strlen($str);

	$n = 0;	// number of characters seen
	for ($i=0; $i<$byteLen && $n<$pos; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++;	// advance a double-byte char
		}
		else {
			if ($c >= 0x80) $i++;	// advance a double-byte char
		}

		$n++;
	}
	if ($i >= $byteLen) return false;	// offset beyond string length

	return $i;
}
1086
1087 }
1088
// Extension hook: when running inside TYPO3 and an XCLASS file is registered for this
// script under the current TYPO3_MODE, load that file (once).
if (defined('TYPO3_MODE')) {
	if ($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']) {
		include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
	}
}
1092 ?>