Replacing string index test strlen() with isset().
[Packages/TYPO3.CMS.git] / t3lib / class.t3lib_cs.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2003-2004 Kasper Skaarhoj (kasper@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the Typo3 project. The Typo3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 *
17 * This script is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * This copyright notice MUST APPEAR in all copies of the script!
23 ***************************************************************/
24 /**
25 * Class for conversion between charsets.
26 *
27 * $Id$
28 *
29 * @author Kasper Skaarhoj <kasper@typo3.com>
30 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
31 */
32 /**
33 * [CLASS/FUNCTION INDEX of SCRIPT]
34 *
35 *
36 *
37 * 128: class t3lib_cs
38 * 442: function parse_charset($charset)
39 * 460: function get_locale_charset($locale)
40 * 492: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
41 * 529: function utf8_encode($str,$charset)
42 * 576: function utf8_decode($str,$charset,$useEntityForNoChar=0)
43 * 619: function utf8_to_entities($str)
44 * 652: function entities_to_utf8($str,$alsoStdHtmlEnt=0)
45 * 686: function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
46 * 736: function UnumberToChar($cbyte)
47 * 781: function utf8CharToUnumber($str,$hex=0)
48 *
49 * SECTION: Init functions
50 * 824: function initCharset($charset)
51 * 885: function initCaseFoldingUTF8()
52 * 973: function initCaseFolding($charset)
53 *
54 * SECTION: String operation functions
55 * 1058: function substr($charset,$string,$start,$len=null)
56 * 1096: function strlen($charset,$string)
57 * 1124: function crop($charset,$string,$len,$crop='')
58 * 1165: function strtrunc($charset,$string,$len)
59 * 1197: function conv_case($charset,$string,$case)
60 *
61 * SECTION: Internal UTF-8 string operation functions
62 * 1264: function utf8_substr($str,$start,$len=null)
63 * 1297: function utf8_strlen($str)
64 * 1318: function utf8_strtrunc($str,$len)
65 * 1340: function utf8_strpos($haystack,$needle,$offset=0)
66 * 1363: function utf8_strrpos($haystack,$needle)
67 * 1383: function utf8_char2byte_pos($str,$pos)
68 * 1424: function utf8_byte2char_pos($str,$pos)
69 * 1448: function utf8_conv_case($str,$case)
70 *
71 * SECTION: Internal EUC string operation functions
72 * 1514: function euc_strtrunc($str,$len,$charset)
73 * 1543: function euc_substr($str,$start,$charset,$len=null)
74 * 1568: function euc_strlen($str,$charset)
75 * 1595: function euc_char2byte_pos($str,$pos,$charset)
76 * 1636: function euc_conv_case($str,$case,$charset)
77 *
78 * TOTAL FUNCTIONS: 31
79 * (This index is automatically created/updated by the extension "extdeveval")
80 *
81 */
82
83
84
85
86
87
88
89
90 /**
91 * Notes on UTF-8
92 *
93 * Functions working on UTF-8 strings:
94 *
95 * - strchr/strstr
96 * - strrchr
97 * - substr_count
98 * - implode/explode/join
99 *
100 * Functions nearly working on UTF-8 strings:
101 *
102 * - strlen: returns the length in BYTES; if you need the length in CHARACTERS use utf8_strlen
103 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
104 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
105 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
106 *
107 * Functions NOT working on UTF-8 strings:
108 *
109 * - str*cmp
110 * - stristr
111 * - stripos
112 * - substr
113 * - strrev
114 * - ereg/eregi
115 * - split/spliti
116 * - preg_*
117 * - ...
118 *
119 */
120 /**
121 * Class for conversion between charsets
122 *
123 * @author Kasper Skaarhoj <kasper@typo3.com>
124 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
125 * @package TYPO3
126 * @subpackage t3lib
127 */
128 class t3lib_cs {
129 var $noCharByteVal=63; // ASCII Value for chars with no equivalent.
130
131 // This is the array where parsed conversion tables are stored (cached)
132 var $parsedCharsets=array();
133
134 // An array where case folding data will be stored (cached)
135 var $caseFolding=array();
136
137 // This tells the converter which charsets have two bytes per char:
138 var $twoByteSets=array(
139 'ucs-2'=>1, // 2-byte Unicode
140 );
141
142 // This tells the converter which charsets have four bytes per char:
143 var $fourByteSets=array(
144 'ucs-4'=>1, // 4-byte Unicode
145 'utf-32'=>1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
146 );
147
148 // This tells the converter which charsets use a scheme like the Extended Unix Code:
149 var $eucBasedSets=array(
150 'gb2312'=>1, // Chinese, simplified.
151 'big5'=>1, // Chinese, traditional.
152 'shift_jis'=>1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
153 );
154
155 // see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
156 // http://czyborra.com/charsets/iso8859.html
157 var $synonyms=array(
158 'us' => 'ascii',
159 'us-ascii'=> 'ascii',
160 'cp819' => 'iso-8859-1',
161 'ibm819' => 'iso-8859-1',
162 'iso-ir-100' => 'iso-8859-1',
163 'iso-ir-109' => 'iso-8859-2',
164 'iso-ir-148' => 'iso-8859-9',
165 'iso-ir-199' => 'iso-8859-14',
166 'iso-ir-203' => 'iso-8859-15',
167 'csisolatin1' => 'iso-8859-1',
168 'csisolatin2' => 'iso-8859-2',
169 'csisolatin3' => 'iso-8859-3',
170 'csisolatin5' => 'iso-8859-9',
171 'csisolatin8' => 'iso-8859-14',
172 'csisolatin9' => 'iso-8859-15',
173 'csisolatingreek' => 'iso-8859-7',
174 'iso-celtic' => 'iso-8859-14',
175 'latin1' => 'iso-8859-1',
176 'latin2' => 'iso-8859-2',
177 'latin3' => 'iso-8859-3',
178 'latin5' => 'iso-8859-9',
179 'latin6' => 'iso-8859-10',
180 'latin8' => 'iso-8859-14',
181 'latin9' => 'iso-8859-15',
182 'l1' => 'iso-8859-1',
183 'l2' => 'iso-8859-2',
184 'l3' => 'iso-8859-3',
185 'l5' => 'iso-8859-9',
186 'l6' => 'iso-8859-10',
187 'l8' => 'iso-8859-14',
188 'l9' => 'iso-8859-15',
189 'cyrillic' => 'iso-8859-5',
190 'arabic' => 'iso-8859-6',
191 'tis-620' => 'iso-8859-11',
192 'win874' => 'windows-874',
193 'win1250' => 'windows-1250',
194 'win1251' => 'windows-1251',
195 'win1252' => 'windows-1252',
196 'win1253' => 'windows-1253',
197 'win1254' => 'windows-1254',
198 'win1255' => 'windows-1255',
199 'win1256' => 'windows-1256',
200 'win1257' => 'windows-1257',
201 'win1258' => 'windows-1258',
202 'cp1250' => 'windows-1250',
203 'cp1251' => 'windows-1251',
204 'cp1252' => 'windows-1252',
205 'ms-ee' => 'windows-1250',
206 'ms-ansi' => 'windows-1252',
207 'ms-greek' => 'windows-1253',
208 'ms-turk' => 'windows-1254',
209 'winbaltrim' => 'windows-1257',
210 'koi-8ru' => 'koi-8r',
211 'koi8r' => 'koi-8r',
212 'cp878' => 'koi-8r',
213 'mac' => 'macroman',
214 'macintosh' => 'macroman',
215 'euc-cn' => 'gb2312',
216 'x-euc-cn' => 'gb2312',
217 'euccn' => 'gb2312',
218 'cp936' => 'gb2312',
219 'big-5' => 'big5',
220 'cp950' => 'big5',
221 'eucjp' => 'euc-jp',
222 'sjis' => 'shift_jis',
223 'shift-jis' => 'shift_jis',
224 'cp932' => 'shift_jis',
225 'utf7' => 'utf-7',
226 'utf8' => 'utf-8',
227 'utf16' => 'utf-16',
228 'utf32' => 'utf-32',
229 'utf8' => 'utf-8',
230 'ucs2' => 'ucs-2',
231 'ucs4' => 'ucs-4',
232 );
233
234 // mapping of iso-639:2 language codes to language (family) names
var $lang_to_langfamily=array(
		// ISO 639 two-letter language codes, see:
		// http://www.w3.org/WAI/ER/IG/ert/iso639.htm
		// http://www.unicode.org/onlinedat/languages.html
	'ar' => 'arabic',
	'bg' => 'cyrillic',
	'cs' => 'east_european',
	'da' => 'west_european',
	'de' => 'west_european',
	'el' => 'greek',		// official ISO code for Greek ('gr' is kept below for backward compatibility)
	'es' => 'west_european',
	'et' => 'estonian',
	'eu' => 'west_european',
	'fi' => 'west_european',
	'fr' => 'west_european',
	'gr' => 'greek',
	'he' => 'hebrew',		// current ISO code for Hebrew ('iw' is the withdrawn one)
	'hr' => 'east_european',
	'hu' => 'east_european',
	'iw' => 'hebrew',
	'is' => 'west_european',
	'it' => 'west_european',
	'ja' => 'japanese',
	'kl' => 'west_european',
	'ko' => 'korean',
	'lt' => 'lithuanian',
	'lv' => 'west_european',	// Latvian/Lettish
	'nl' => 'west_european',
	'no' => 'west_european',
	'pl' => 'east_european',
	'pt' => 'west_european',
	'ro' => 'east_european',
	'ru' => 'cyrillic',
	'sk' => 'east_european',
	'sl' => 'east_european',
	'sv' => 'west_european',
	'th' => 'thai',
	'tr' => 'turkish',
	'uk' => 'cyrillic',
	'vi' => 'vietnamese',
	'zh' => 'chinese',
		// MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
	'chs' => 'simpl_chinese',
	'cht' => 'trad_chinese',
	'csy' => 'east_european',
	'dan' => 'west_european',
	'deu' => 'west_european',
	'dea' => 'west_european',
	'des' => 'west_european',
	'ena' => 'west_european',
	'enc' => 'west_european',
	'eng' => 'west_european',
	'enz' => 'west_european',
	'enu' => 'west_european',
	'nld' => 'west_european',
	'nlb' => 'west_european',
	'fin' => 'west_european',
	'fra' => 'west_european',
	'frb' => 'west_european',
	'frc' => 'west_european',
	'frs' => 'west_european',
	'ell' => 'greek',
	'hun' => 'east_european',
	'isl' => 'west_european',	// was misspelt 'west_euorpean', which never matched a charset entry
	'ita' => 'west_european',
	'its' => 'west_european',
	'jpn' => 'japanese',
	'kor' => 'korean',
	'nor' => 'west_european',
	'non' => 'west_european',
	'plk' => 'east_european',
	'ptg' => 'west_european',
	'ptb' => 'west_european',
	'rus' => 'cyrillic',		// consistent with 'ru' and 'russian'
	'sky' => 'east_european',
	'esp' => 'west_european',
	'esm' => 'west_european',
	'esn' => 'west_european',
	'sve' => 'west_european',
	'trk' => 'turkish',
		// English language names
	'bulgarian' => 'cyrillic',	// consistent with 'bg'
	'catalan' => 'west_european',
	'croatian' => 'east_european',
	'czech' => 'east_european',
	'danish' => 'west_european',
	'dutch' => 'west_european',
	'english' => 'west_european',
	'finnish' => 'west_european',
	'french' => 'west_european',
	'galician' => 'west_european',
	'german' => 'west_european',
	'hungarian' => 'east_european',
	'icelandic' => 'west_european',
	'italian' => 'west_european',
	'latvian' => 'west_european',
	'lettish' => 'west_european',
	'norwegian' => 'west_european',
	'polish' => 'east_european',
	'portuguese' => 'west_european',
	'russian' => 'cyrillic',
	'romanian' => 'east_european',
	'slovak' => 'east_european',
	'slovenian' => 'east_european',
	'spanish' => 'west_european',
	'svedish' => 'west_european',	// historic misspelling, kept for backward compatibility
	'swedish' => 'west_european',
	'turkish' => 'turkish',		// consistent with 'trk'
	'ukrainian' => 'cyrillic',
);
341
342 // mapping of language (family) names to charsets on Unix
343 var $lang_to_charset_unix=array(
344 'west_european' => 'iso-8859-1',
345 'estonian' => 'iso-8859-1',
346 'east_european' => 'iso-8859-2',
347 'baltic' => 'iso-8859-4',
348 'cyrillic' => 'iso-8859-5',
349 'arabic' => 'iso-8859-6',
350 'greek' => 'iso-8859-7',
351 'hebrew' => 'iso-8859-8',
352 'turkish' => 'iso-8859-9',
353 'thai' => 'iso-8859-11', // = TIS-620
354 'lithuanian' => 'iso-8859-13',
355 'chinese' => 'gb2312', // = euc-cn
356 'japanese' => 'euc-jp',
357 'korean' => 'euc-kr',
358 'simpl_chinese' => 'gb2312',
359 'trad_chinese' => 'big5',
360 'vietnamese' => '',
361 );
362
363 // mapping of language (family) names to charsets on Windows
var $lang_to_charset_windows=array(
	'east_european' => 'windows-1250',
	'cyrillic' => 'windows-1251',
	'west_european' => 'windows-1252',
	'greek' => 'windows-1253',
	'turkish' => 'windows-1254',
	'hebrew' => 'windows-1255',
	'arabic' => 'windows-1256',
	'baltic' => 'windows-1257',
	'estonian' => 'windows-1257',
	'lithuanian' => 'windows-1257',
	'vietnamese' => 'windows-1258',
	'thai' => 'cp874',
	'korean' => 'cp949',		// code page 949 is Korean; cp950 (used here before) is Big5 / Traditional Chinese
	'chinese' => 'gb2312',
	'japanese' => 'shift_jis',
	'simpl_chinese' => 'gb2312',
	'trad_chinese' => 'big5',
);
383
384 // mapping of locale names to charsets
385 var $locale_to_charset=array(
386 'japanese.euc' => 'euc-jp',
387 'ja_jp.ujis' => 'euc-jp',
388 'korean.euc' => 'euc-kr',
389 'zh_cn' => 'gb2312',
390 'zh_hk' => 'big5',
391 'zh_tw' => 'big5',
392 );
393
394 // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
395 // Empty values means "iso-8859-1"
396 var $charSetArray = array(
397 'dk' => '',
398 'de' => '',
399 'no' => '',
400 'it' => '',
401 'fr' => '',
402 'es' => '',
403 'nl' => '',
404 'cz' => 'windows-1250',
405 'pl' => 'iso-8859-2',
406 'si' => 'windows-1250',
407 'fi' => '',
408 'tr' => 'iso-8859-9',
409 'se' => '',
410 'pt' => '',
411 'ru' => 'windows-1251',
412 'ro' => 'iso-8859-2',
413 'ch' => 'gb2312',
414 'sk' => 'windows-1250',
415 'lt' => 'windows-1257',
416 'is' => 'utf-8',
417 'hr' => 'windows-1250',
418 'hu' => 'iso-8859-2',
419 'gl' => '',
420 'th' => 'iso-8859-11',
421 'gr' => 'iso-8859-7',
422 'hk' => 'big5',
423 'eu' => '',
424 'bg' => 'windows-1251',
425 'br' => '',
426 'et' => 'iso-8859-4',
427 'ar' => 'iso-8859-6',
428 'he' => 'utf-8',
429 'ua' => 'windows-1251',
430 'jp' => 'shift_jis',
431 'lv' => 'utf-8',
432 'vn' => 'utf-8',
433 );
434
435 /**
436 * Normalize - changes input character set to lowercase letters.
437 *
438 * @param string Input charset
439 * @return string Normalized charset
440 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
441 */
function parse_charset($charset) {
		// Charset names are matched case-insensitively, so fold to lowercase first.
	$normalized = strtolower($charset);

		// A known alias (see $this->synonyms) is replaced by its canonical name;
		// any other name is returned in its lowercased form.
	return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
}
448
449 /**
450 * Get the charset of a locale.
451 *
452 * ln language
453 * ln_CN language / country
454 * ln_CN.cs language / country / charset
455 *
456 * @param string Locale string
457 * @return string Charset resolved for locale string
458 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
459 */
function get_locale_charset($locale) {
		// Resolves a locale string of the form "ln", "ln_CN", "ln_CN.cs" or
		// "ln_CN.cs@modifier" to a charset name. Falls back to "iso-8859-1".
	$locale = strtolower($locale);

		// Exact locale specific charset?
	if (isset($this->locale_to_charset[$locale])) return $this->locale_to_charset[$locale];

		// A trailing "@modifier" (e.g. "@euro") is neither part of the charset nor of the
		// language; drop it and retry the exact lookup without it.
	$modifierPos = strpos($locale,'@');
	if ($modifierPos !== false) {
		$locale = (string)substr($locale,0,$modifierPos);
		if (isset($this->locale_to_charset[$locale])) return $this->locale_to_charset[$locale];
	}

		// Locale contains a charset: use it.
		// (Indexing the explode() result avoids the notice list() raises when there is no ".")
	$localeParts = explode('.',$locale);
	if (isset($localeParts[1]) && $localeParts[1]) return $this->parse_charset($localeParts[1]);

		// Get the language (part before "_") and map it to its language family if known.
	$languageParts = explode('_',$localeParts[0]);
	$language = $languageParts[0];
	if (isset($this->lang_to_langfamily[$language])) $language = $this->lang_to_langfamily[$language];

		// Pick the charset table for the operating system PHP runs on.
	if (TYPO3_OS == 'WIN') {
		$charsets = $this->lang_to_charset_windows;
	} else {
		$charsets = $this->lang_to_charset_unix;
	}

		// Unknown families and families with an empty charset get the default.
	return !empty($charsets[$language]) ? $charsets[$language] : 'iso-8859-1';
}
482
483
484
485
486
487
488
489
490
491 /********************************************
492 *
493 * Charset Conversion functions
494 *
495 ********************************************/
496
497 /**
498 * Convert from one charset to another charset.
499 *
500 * @param string Input string
501 * @param string From charset (the current charset of the string)
502 * @param string To charset (the output charset wanted)
503 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
504 * @return string Converted string
505 */
function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) {
		// Same charset on both sides: nothing to convert.
	if ($fromCS==$toCS) return $str;

		// The PHP extensions cannot fall back to SGML entities for unmappable characters.
		// They are therefore only tried when no entity fallback is requested, or when the
		// target is UTF-8 (which can represent everything).
	if ($toCS=='utf-8' || !$useEntityForNoChar) {
		$method = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'];
		$converted = false;

		if ($method == 'mbstring') {
			$converted = mb_convert_encoding($str,$toCS,$fromCS);	// false for unsupported charsets
		} elseif ($method == 'iconv') {
			$converted = iconv($fromCS,$toCS.'//TRANSLIT',$str);
		} elseif ($method == 'recode') {
			$converted = recode_string($fromCS.'..'.$toCS,$str);
		}

		if (false !== $converted) return $converted;
			// ... otherwise fall back to the TYPO3 conversion below.
	}

		// TYPO3 conversion: go through UTF-8 as the intermediate representation.
	$result = $str;
	if ($fromCS!='utf-8') {
		$result = $this->utf8_encode($result,$fromCS);
	}
	if ($toCS!='utf-8') {
		$result = $this->utf8_decode($result,$toCS,$useEntityForNoChar);
	}
	return $result;
}
534
535
536 /**
537 * Converts $str from $charset to UTF-8
538 *
539 * @param string String in local charset to convert to UTF-8
540 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
541 * @return string Output string, converted to UTF-8
542 */
function utf8_encode($str,$charset) {
		// Converts $str from the local $charset to UTF-8 using the parsed conversion table.
		// NOTE: if no conversion table can be initialized for $charset, nothing (NULL) is returned.

	if ($this->initCharset($charset)) {	// Parse conv. table if not already...
		$strLen = strlen($str);
		$outStr = '';

			// Loop-invariant charset properties. !empty() avoids notices for charsets not listed in the sets.
		$isTwoByteSet = !empty($this->twoByteSets[$charset]);
		$isShiftJis = ($charset == 'shift_jis');
		$isEucBased = !empty($this->eucBasedSets[$charset]);

		for ($a=0;$a<$strLen;$a++) {	// Traverse each char in string.
			$chr = substr($str,$a,1);
			$ord = ord($chr);

			if ($isTwoByteSet) {	// If the charset has two bytes per char
				$a++;
					// A truncated trailing character is completed with a zero low byte.
				$ord2 = ($a<$strLen) ? ord(substr($str,$a,1)) : 0;
					// Combine both bytes, assuming big endian. (This used "&", which always yielded 0.)
				$ord = ($ord << 8) | $ord2;

				if ($ord < 128) {	// The table only holds values above 127; ASCII is transparent.
					$outStr.=chr($ord);
				} elseif (isset($this->parsedCharsets[$charset]['local'][$ord])) {	// Local char-number found in parsed conv. table
					$outStr.=$this->parsedCharsets[$charset]['local'][$ord];
				} else $outStr.=chr($this->noCharByteVal);	// No char exists
			} elseif ($ord>127) {	// Char above 127: needs a table lookup and may be the first of two bytes
				if ($isShiftJis) {
						// Shift-JIS is like EUC, but bytes between 160 and 223 are single-byte characters.
						// This must be tested before the generic EUC rule, because shift_jis is
						// listed in $eucBasedSets as well.
					if ($ord<160 || $ord>223) {
						$a++;
						$ord2 = ord(substr($str,$a,1));
						$ord = $ord*256+$ord2;
					}
				} elseif ($isEucBased) {	// EUC uses two bytes above 127; get both, advance pointer and make $ord a 16bit int.
					$a++;
					$ord2 = ord(substr($str,$a,1));
					$ord = $ord*256+$ord2;
				}

				if (isset($this->parsedCharsets[$charset]['local'][$ord])) {	// Local char-number found in parsed conv. table
					$outStr.=$this->parsedCharsets[$charset]['local'][$ord];
				} else $outStr.=chr($this->noCharByteVal);	// No char exists
			} else $outStr.=$chr;	// ... otherwise it's just ASCII 0-127 and one byte. Transparent
		}
		return $outStr;
	}
}
581
582 /**
583 * Converts $str from UTF-8 to $charset
584 *
585 * @param string String in UTF-8 to convert to local charset
586 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
587 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
588 * @return string Output string, converted to local charset
589 */
function utf8_decode($str,$charset,$useEntityForNoChar=0) {
		// Converts the UTF-8 string $str to the local $charset via the parsed conversion table.
		// Without a conversion table for $charset nothing (NULL) is returned.
	if (!$this->initCharset($charset)) return;

	$strLen = strlen($str);
	$outStr = '';
	$pos = 0;

	while ($pos<$strLen) {	// Traverse each char in UTF-8 string.
		$chr = substr($str,$pos,1);
		$ord = ord($chr);
		$pos++;

			// ASCII 0-127 is a single byte and passes through transparently.
		if ($ord<=127) {
			$outStr.=$chr;
			continue;
		}

			// A lead byte must have the 7th bit set; otherwise we are in the MIDDLE of a multibyte sequence.
		if (!($ord & 64)) {
			$outStr.=chr($this->noCharByteVal);
			continue;
		}

			// Collect the whole sequence: every further high bit set in the lead byte announces one more byte.
		$buf = $chr;
		for ($shift=0;$shift<8;$shift++) {
			$ord = $ord << 1;
			if (!($ord & 128)) break;
			$buf.=substr($str,$pos,1);
			$pos++;
		}

		if (isset($this->parsedCharsets[$charset]['utf8'][$buf])) {	// The UTF-8 char-sequence is known in the local charset
			$localNumber = $this->parsedCharsets[$charset]['utf8'][$buf];
			if ($localNumber>255) {	// Larger than one byte: split (16bit word assumed) into two chars.
				$outStr.=chr(($localNumber >> 8) & 255).chr($localNumber & 255);
			} else {
				$outStr.=chr($localNumber);
			}
		} elseif ($useEntityForNoChar) {	// Create numeric entity
			$outStr.='&#'.$this->utf8CharToUnumber($buf,1).';';
		} else {	// No char exists
			$outStr.=chr($this->noCharByteVal);
		}
	}
	return $outStr;
}
626
627 /**
628 * Converts all chars > 127 to numeric entities.
629 *
630 * @param string Input string
631 * @return string Output string
632 */
function utf8_to_entities($str) {
		// Replaces every multibyte UTF-8 character by a hexadecimal numeric entity; ASCII is kept as is.
	$strLen = strlen($str);
	$outStr = '';
	$pos = 0;

	while ($pos<$strLen) {	// Traverse each char in UTF-8 string.
		$chr = substr($str,$pos,1);
		$ord = ord($chr);
		$pos++;

			// ASCII 0-127 is a single byte and passes through transparently.
		if ($ord<=127) {
			$outStr.=$chr;
			continue;
		}

			// A lead byte must have the 7th bit set; otherwise we are in the MIDDLE of a multibyte sequence.
		if (!($ord & 64)) {
			$outStr.=chr($this->noCharByteVal);
			continue;
		}

			// Collect the whole sequence: every further high bit set in the lead byte announces one more byte.
		$buf = $chr;
		for ($shift=0;$shift<8;$shift++) {
			$ord = $ord << 1;
			if (!($ord & 128)) break;
			$buf.=substr($str,$pos,1);
			$pos++;
		}

		$outStr.='&#'.$this->utf8CharToUnumber($buf,1).';';
	}

	return $outStr;
}
658
659 /**
660 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
661 *
662 * @param string Input string, UTF-8
663 * @param boolean If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
664 * @return string Output string
665 */
function entities_to_utf8($str,$alsoStdHtmlEnt=0) {
		// Converts numeric entities (decimal "&#1234;" or hexadecimal "&#x1b;") in $str to UTF-8
		// characters and, if $alsoStdHtmlEnt is set, named HTML entities as well.
		// Anything that is not a well-formed, known entity is left untouched.

	$trans_tbl = array();
	if ($alsoStdHtmlEnt) {
			// The table must be in iso-8859-1 because its values are run through utf8_encode() below.
			// From PHP 5.4 on the default table is UTF-8, so the charset is requested explicitly
			// where the parameter exists (PHP >= 5.3.4).
		if (version_compare(PHP_VERSION,'5.3.4','>=')) {
			$table = get_html_translation_table(HTML_ENTITIES,ENT_COMPAT,'ISO-8859-1');
		} else {
			$table = get_html_translation_table(HTML_ENTITIES);
		}
		$trans_tbl = array_flip($table);
	}

		// Split on entity-like tokens; the captured entity names end up on the odd indexes.
		// (PCRE replaces the ereg_replace()/token approach; the ereg functions no longer exist in PHP 7.)
	$parts = preg_split('/&([#[:alnum:]]*);/',$str,-1,PREG_SPLIT_DELIM_CAPTURE);
	if (!is_array($parts)) return $str;

	foreach($parts as $k => $v) {
		if ($k%2) {
			$hexMatch = array();
			if (substr($v,0,1)=='#') {	// Dec or hex entities:
				$number = substr($v,1);
				if (preg_match('/^[xX]([0-9a-fA-F]+)$/',$number,$hexMatch)) {
					$parts[$k] = $this->UnumberToChar(hexdec($hexMatch[1]));
				} elseif (preg_match('/^[0-9]+$/',$number)) {
					$parts[$k] = $this->UnumberToChar(intval($number));
				} else {	// Malformed number ("&#;", "&#zz;"): keep as is instead of emitting a NUL byte
					$parts[$k] = '&'.$v.';';
				}
			} elseif ($alsoStdHtmlEnt && isset($trans_tbl['&'.$v.';'])) {	// Other entities:
				$parts[$k] = $this->utf8_encode($trans_tbl['&'.$v.';'],'iso-8859-1');
			} else {	// No conversion:
				$parts[$k] = '&'.$v.';';
			}
		}
	}

	return implode('',$parts);
}
691
692 /**
693 * Converts all chars in the input UTF-8 string into integer numbers returned in an array
694 *
695 * @param string Input string, UTF-8
696 * @param boolean If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
697 * @param boolean If set, then instead of integer numbers the real UTF-8 char is returned.
698 * @return array Output array with the char numbers
699 */
function utf8_to_numberarray($str,$convEntities=0,$retChar=0) {
		// Splits the UTF-8 string $str into its characters and returns them as an array of
		// UNICODE numbers or, if $retChar is set, of the UTF-8 characters themselves.

		// If entities must be registered as well, resolve them to real characters first.
	if ($convEntities) {
		$str = $this->entities_to_utf8($str,1);
	}

	$strLen = strlen($str);
	$outArr = array();
	$pos = 0;

	while ($pos<$strLen) {	// Traverse each char in UTF-8 string.
		$chr = substr($str,$pos,1);
		$ord = ord($chr);
		$pos++;

			// ASCII 0-127 is a single byte and is taken over directly.
		if ($ord<=127) {
			$outArr[] = $retChar ? chr($ord) : $ord;
			continue;
		}

			// A lead byte must have the 7th bit set; otherwise we are in the MIDDLE of a multibyte sequence.
		if (!($ord & 64)) {
			$outArr[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
			continue;
		}

			// Collect the whole sequence: every further high bit set in the lead byte announces one more byte.
		$buf = $chr;
		for ($shift=0;$shift<8;$shift++) {
			$ord = $ord << 1;
			if (!($ord & 128)) break;
			$buf.=substr($str,$pos,1);
			$pos++;
		}

		$outArr[] = $retChar ? $buf : $this->utf8CharToUnumber($buf);
	}

	return $outArr;
}
730
731 /**
732 * Converts a UNICODE number to a UTF-8 multibyte character
733 * Algorithm based on script found at From: http://czyborra.com/utf/
734 * Unit-tested by Kasper
735 *
736 * The binary representation of the character's integer value is thus simply spread across the bytes and the number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
737 *
738 * bytes | bits | representation
739 * 1 | 7 | 0vvvvvvv
740 * 2 | 11 | 110vvvvv 10vvvvvv
741 * 3 | 16 | 1110vvvv 10vvvvvv 10vvvvvv
742 * 4 | 21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
743 * 5 | 26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
744 * 6 | 31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
745 *
746 * @param integer UNICODE integer
747 * @return string UTF-8 multibyte character string
748 * @see utf8CharToUnumber()
749 */
function UnumberToChar($cbyte) {
		// Encodes the UNICODE number $cbyte as a UTF-8 byte sequence (1 to 6 bytes).

		// 7 bits: plain single byte.
	if ($cbyte < 0x80) {
		return chr($cbyte);
	}

		// Select the lead-byte marker and the number of continuation bytes for the value range.
	if ($cbyte < 0x800) {
		$leadMarker = 0xC0;
		$tailBytes = 1;
	} elseif ($cbyte < 0x10000) {
		$leadMarker = 0xE0;
		$tailBytes = 2;
	} elseif ($cbyte < 0x200000) {
		$leadMarker = 0xF0;
		$tailBytes = 3;
	} elseif ($cbyte < 0x4000000) {
		$leadMarker = 0xF8;
		$tailBytes = 4;
	} elseif ($cbyte < 0x80000000) {
		$leadMarker = 0xFC;
		$tailBytes = 5;
	} else {	// Cannot express a 32-bit character in UTF-8
		return chr($this->noCharByteVal);
	}

		// Lead byte: marker plus the topmost value bits ...
	$sequence = chr($leadMarker | ($cbyte >> (6*$tailBytes)));
		// ... followed by 6 value bits per continuation byte (10vvvvvv), highest group first.
	for ($group=$tailBytes-1;$group>=0;$group--) {
		$sequence.=chr(0x80 | (($cbyte >> (6*$group)) & 0x3F));
	}
	return $sequence;
}
785
786 /**
787 * Converts a UTF-8 Multibyte character to a UNICODE number
788 * Unit-tested by Kasper
789 *
790 * @param string UTF-8 multibyte character string
791 * @param boolean If set, then a hex. number is returned.
792 * @return integer UNICODE integer
793 * @see UnumberToChar()
794 */
function utf8CharToUnumber($str,$hex=0) {
		// Decodes the (first) UTF-8 character in $str to its UNICODE number.
		// With $hex set, the number is returned as a string: "x" followed by lowercase hex digits.
	$leadByte = ord(substr($str,0,1));

	if (($leadByte & 192) != 192) {
			// Not the lead byte of a multibyte sequence: the byte value is the number.
		$number = $leadByte;
	} else {
			// Every further high bit set in the lead byte announces one continuation byte,
			// each of which contributes its lower 6 bits.
		$shifted = $leadByte;
		$tailBits = '';
		$tailCount = 0;
		while ($tailCount<8) {
			$shifted = $shifted << 1;
			if (!($shifted & 128)) break;
			$tailCount++;
			$tailBits.=str_pad(decbin(ord(substr($str,$tailCount,1)) & 63),6,'0',STR_PAD_LEFT);
		}

			// The lead byte contributes its lowest (6 - number of continuation bytes) bits.
		$leadBits = substr('00000000'.decbin($leadByte),-(6-$tailCount));

		$number = bindec($leadBits.$tailBits);
	}

	if ($hex) {
		return 'x'.dechex($number);
	}
	return $number;
}
813
814
815
816
817
818
819
820
821
822 /********************************************
823 *
824 * Init functions
825 *
826 ********************************************/
827
828 /**
829 * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
830 * This function is automatically called by the conversion functions
831 *
832 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
833 *
834 * @param string The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
835 * @return integer Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed.
836 * @access private
837 */
function initCharset($charset) {
		// Loads the conversion table for $charset into $this->parsedCharsets[$charset], either from
		// the serialized cache file in typo3temp/ or by parsing PATH_t3lib.'csconvtbl/[charset].tbl'.
		// Returns 1 if the table was already loaded, 2 if it was loaded now and
		// FALSE if no conversion table file exists for the charset.

		// Only process if the charset is not yet loaded:
	if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) return 1;

		// Conversion table filename:
	$charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';

		// Give up if the conversion table is not found:
	if (!($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile))) return false;

		// Cache file for charsets:
		// Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/charset_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
			// NOTE(review): unserialize() is applied to a file from typo3temp/; this assumes that folder is only written by TYPO3 itself.
		$cached = unserialize(t3lib_div::getUrl($cacheFile));
		if (is_array($cached) && isset($cached['local']) && isset($cached['utf8'])) {
			$this->parsedCharsets[$charset] = $cached;
			return 2;
		}
			// An unreadable or corrupt cache file is ignored; the table is parsed again below and the cache rewritten.
	}

		// Parse conversion table into lines:
	$lines = t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1);
		// Initialize the internal variable holding the conv. table:
	$this->parsedCharsets[$charset] = array('local'=>array(),'utf8'=>array());

		// The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
		// (PCRE replaces the former ereg()/split() calls; those functions no longer exist in PHP 7.)
	$whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
	$detectedType = '';

	foreach($lines as $value) {
			// Comment lines or blanks are ignored.
		if (!trim($value) || substr($value,0,1)=='#') continue;

			// Detect type if not done yet: (Done on first real line)
		if (!$detectedType) $detectedType = preg_match($whitespacedPattern,$value) ? 'whitespaced' : 'ms-token';

		if ($detectedType=='ms-token') {
			$tokens = preg_split('/=|:/',$value,3);
			if (!is_array($tokens) || count($tokens)<2) continue;	// No mapping on this line
			$hexbyte = $tokens[0];
			$utf8 = $tokens[1];
		} else {
			$regA = array();
			if (!preg_match($whitespacedPattern,$value,$regA)) continue;	// No mapping on this line (e.g. undefined code position)
			$hexbyte = $regA[1];
			$utf8 = 'U+'.$regA[2];
		}

			// Only values above 127 are stored; ASCII is treated as transparent by the converters.
		$decval = hexdec(trim($hexbyte));
		if ($decval>127) {
			$utf8decval = hexdec(substr(trim($utf8),2));
			$utf8Char = $this->UnumberToChar($utf8decval);
			$this->parsedCharsets[$charset]['local'][$decval] = $utf8Char;
			$this->parsedCharsets[$charset]['utf8'][$utf8Char] = $decval;
		}
	}

	if ($cacheFile) {
		t3lib_div::writeFile($cacheFile,serialize($this->parsedCharsets[$charset]));
	}
	return 2;
}
890
/**
 * This function initializes the UTF-8 case folding table.
 * The table is taken from the cache file in typo3temp/ if that holds a valid array,
 * otherwise it is parsed from unidata/UnicodeData.txt (plus unidata/SpecialCasing.txt if present) and cached.
 *
 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
 *
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initCaseFoldingUTF8() {
		// Only process if the case table is not yet loaded:
	if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8'])) return 1;

		// Use cached version if possible. A cache file that does not unserialize to an array is ignored and rebuilt.
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cscase_utf-8.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$cached = unserialize(t3lib_div::getUrl($cacheFile));
		if (is_array($cached)) {
			$this->caseFolding['utf-8'] = $cached;
			return 2;
		}
	}

		// process main Unicode data file
	$unicodeDataFile = PATH_t3lib.'unidata/UnicodeData.txt';
	if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) return false;

	$fh = fopen($unicodeDataFile,'r');
	if (!$fh) return false;

		// key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
		// note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
	$this->caseFolding['utf-8'] = array();
	$utf8CaseFolding =& $this->caseFolding['utf-8'];	// a shorthand
	$utf8CaseFolding['toUpper'] = array();
	$utf8CaseFolding['toLower'] = array();
	$utf8CaseFolding['toTitle'] = array();

	while (!feof($fh)) {
		$line = fgets($fh);
		if ($line === false) break;	// nothing more to read

			// The line has also other info like character class (digit, white space, etc.) and more;
			// only field 0 (code point) and fields 12-14 (upper, lower, title mapping) are used here.
		$fields = explode(';', rtrim($line));
		$upper = isset($fields[12]) ? $fields[12] : '';
		$lower = isset($fields[13]) ? $fields[13] : '';
		$title = isset($fields[14]) ? $fields[14] : '';

			// Blank lines and characters without any case mapping add nothing to the tables
		if (!$upper && !$lower && !$title) continue;

		$char = $this->UnumberToChar(hexdec($fields[0]));
		if ($upper) $utf8CaseFolding['toUpper'][$char] = $this->UnumberToChar(hexdec($upper));
		if ($lower) $utf8CaseFolding['toLower'][$char] = $this->UnumberToChar(hexdec($lower));
			// store "title" only when different from "upper" (only a few)
		if ($title && $title !== $upper) $utf8CaseFolding['toTitle'][$char] = $this->UnumberToChar(hexdec($title));
	}
	fclose($fh);

		// process additional Unicode data for casing (allow folded characters to expand into a sequence)
	$specialCasingFile = PATH_t3lib.'unidata/SpecialCasing.txt';
	if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {

		$fh = fopen($specialCasingFile,'r');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh);
				if ($line === false) break;

					// Comment lines and blanks are ignored
				$line = trim($line);
				if ($line === '' || $line[0] === '#') continue;

					// Pad to five fields so short lines do not produce undefined offsets
				$parts = array_pad(t3lib_div::trimExplode(';', $line), 5, '');
				list($char,$lower,$title,$upper,$cond) = $parts;

					// Only unconditional mappings are used: the fifth field is either empty or already the trailing comment
				if ($cond !== '' && $cond[0] !== '#') continue;

				$utf8_char = $this->UnumberToChar(hexdec($char));

					// Collect the mappings that differ from the character itself (strict string comparison:
					// hex codes such as "00E0" must not be compared as numbers)
				$targets = array();
				if ($char !== $lower) $targets['toLower'] = $lower;
				if ($char !== $title && $title !== $upper) $targets['toTitle'] = $title;
				if ($char !== $upper) $targets['toUpper'] = $upper;

				foreach ($targets as $table => $hexSequence) {
						// A mapping may be a space separated sequence of code points
					$sequence = '';
					foreach (explode(' ', $hexSequence) as $hex) {
						$sequence .= $this->UnumberToChar(hexdec($hex));
					}
					$utf8CaseFolding[$table][$utf8_char] = $sequence;
				}
			}
			fclose($fh);
		}
	}

	if ($cacheFile) {
		t3lib_div::writeFile($cacheFile,serialize($utf8CaseFolding));
	}

	return 3;
}
978
/**
 * This function initializes the folding table for a charset other than UTF-8.
 * This function is automatically called by the case folding functions.
 * The table is derived from the UTF-8 case folding table by converting every character
 * of the charset's parsed conversion table (and its case mappings) back to the charset.
 *
 * @param	string		Charset for which to initialize case folding.
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initCaseFolding($charset) {
		// Only process if the case table is not yet loaded:
	if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset])) return 1;

		// Use cached version if possible. A cache file that does not unserialize to an array is ignored and rebuilt.
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cscase_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$cached = unserialize(t3lib_div::getUrl($cacheFile));
		if (is_array($cached)) {
			$this->caseFolding[$charset] = $cached;
			return 2;
		}
	}

		// init UTF-8 conversion for this charset
	if (!$this->initCharset($charset)) {
		return false;
	}

		// UTF-8 case folding is used as the base conversion table
	if (!$this->initCaseFoldingUTF8()) {
		return false;
	}

	$nochar = chr($this->noCharByteVal);
	$localTable = isset($this->parsedCharsets[$charset]['local']) ? $this->parsedCharsets[$charset]['local'] : array();
	if (is_array($localTable)) {
		foreach ($localTable as $utf8) {
				// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
			$c = $this->conv($utf8, 'utf-8', $charset);

			foreach (array('toUpper','toLower','toTitle') as $table) {
					// Characters without a mapping are skipped; the original converted an undefined value here.
					// NOTE(review): this assumes conv() yields an empty result for empty input, as the original code already relied on.
				if (!isset($this->caseFolding['utf-8'][$table][$utf8])) continue;

				$cc = $this->conv($this->caseFolding['utf-8'][$table][$utf8], 'utf-8', $charset);
					// Only keep mappings that exist in the target charset
				if ($cc && $cc != $nochar) $this->caseFolding[$charset][$table][$c] = $cc;
			}
		}
	}

		// add the ASCII case table
	for ($i=ord('a'); $i<=ord('z'); $i++) {
		$this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i-32);
	}
	for ($i=ord('A'); $i<=ord('Z'); $i++) {
		$this->caseFolding[$charset]['toLower'][chr($i)] = chr($i+32);
	}

	if ($cacheFile) {
		t3lib_div::writeFile($cacheFile,serialize($this->caseFolding[$charset]));
	}

	return 3;
}
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054 /********************************************
1055 *
1056 * String operation functions
1057 *
1058 ********************************************/
1059
/**
 * Returns a part of a string.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Start position (character position)
 * @param	integer		Length (in characters). If omitted, the rest of the string is returned.
 * @return	string		The substring
 * @see substr(), mb_substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function substr($charset,$string,$start,$len=null) {
	if ($len===0) return '';

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
			// cannot omit $len, when specifying charset - so switch the internal encoding for the call instead
		if ($len==null) {
			$enc = mb_internal_encoding();	// save internal encoding
			mb_internal_encoding($charset);	// use the charset of the string, not a fixed 'utf-8'
			$str = mb_substr($string,$start);
			mb_internal_encoding($enc);	// restore internal encoding

			return $str;
		}
		return mb_substr($string,$start,$len,$charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_substr($string,$start,$len);
	} elseif (!empty($this->eucBasedSets[$charset])) {
		return $this->euc_substr($string,$start,$charset,$len);
	} elseif (!empty($this->twoByteSets[$charset])) {
			// an omitted length must not be multiplied (NULL*2 is 0, which would yield an empty string)
		return $len === null ? substr($string,$start*2) : substr($string,$start*2,$len*2);
	} elseif (!empty($this->fourByteSets[$charset])) {
		return $len === null ? substr($string,$start*4) : substr($string,$start*4,$len*4);
	}

		// treat everything else as single-byte encoding
	return $len === null ? substr($string,$start) : substr($string,$start,$len);
}
1099
/**
 * Counts the number of characters.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * @param	string		The character set
 * @param	string		Character string
 * @return	integer		The number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strlen($charset,$string) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strlen($string,$charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_strlen($string);
	} elseif (!empty($this->eucBasedSets[$charset])) {
		return $this->euc_strlen($string,$charset);
	} elseif (!empty($this->twoByteSets[$charset])) {
			// cast so that a string with a dangling odd byte still yields an integer count
		return (int)(strlen($string)/2);
	} elseif (!empty($this->fourByteSets[$charset])) {
		return (int)(strlen($string)/4);
	}
		// treat everything else as single-byte encoding
	return strlen($string);
}
1125
/**
 * Truncates a string and pre-/appends a string.
 * A positive length keeps the first characters and appends the crop signifier,
 * a negative length keeps the last characters and prepends it.
 * The string is returned unchanged if it is not longer than the requested length.
 * Unit tested by Kasper
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Length (in characters)
 * @param	string		Crop signifier
 * @return	string		The shortened string
 * @see substr(), mb_strimwidth()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function crop($charset,$string,$len,$crop='') {
	if ($len == 0) return $crop;

		// Translate the character length into a byte position
	if ($charset == 'utf-8') {
		$i = $this->utf8_char2byte_pos($string,$len);
	} elseif (!empty($this->eucBasedSets[$charset])) {
		$i = $this->euc_char2byte_pos($string,$len,$charset);
	} else {
			// everything else is treated as single-byte encoding
		if ($len > 0) {
			$i = $len;
		} else {
			$i = strlen($string)+$len;
			if ($i<=0) $i = false;
		}
	}

	if ($i === false) {	// $len outside actual string length
		return $string;
	}

		// Only crop if there really is a byte at the cut position. The byte positions returned for multi-byte
		// strings ending with a multi-byte char may point just behind the string, hence the explicit range check.
	$byteLength = strlen($string);
	if ($len > 0) {
		if ($i >= 0 && $i < $byteLength) {
			return substr($string,0,$i).$crop;
		}
	} else {
		if ($i-1 >= 0 && $i-1 < $byteLength) {
			return $crop.substr($string,$i);
		}
	}

	return $string;
}
1179
/**
 * Cuts a string short at a given byte length.
 * Multi-byte characters are never split: the cut position is moved back to a character boundary.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		The byte length
 * @return	string		The shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strtrunc($charset,$string,$len) {
	if ($len <= 0) return '';

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strcut($string,0,$len,$charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_strtrunc($string,$len);
	} elseif (!empty($this->eucBasedSets[$charset])) {
			// euc_strtrunc() takes ($str,$len,$charset) - the length must not be left out
		return $this->euc_strtrunc($string,$len,$charset);
	} elseif (!empty($this->twoByteSets[$charset])) {
		if ($len % 2) $len--;	// don't cut at odd positions
	} elseif (!empty($this->fourByteSets[$charset])) {
		$len -= $len % 4;	// realign to position dividable by four
	}
		// treat everything else as single-byte encoding
	return substr($string,0,$len);
}
1208
/**
 * Translates all characters of a string into their respective case values.
 * Unlike strtolower() and strtoupper() this method is locale independent.
 * Note that the string length may change!
 * eg. lower case German "ß" (scharfes S) becomes upper case "SS"
 * Unit-tested by Kasper
 * Real case folding is language dependent, this method ignores this fact.
 *
 * @param	string		Character set of string
 * @param	string		Input string to convert case for
 * @param	string		Case keyword: "toLower" means lowercase conversion, use "toUpper" for uppercase. With mbstring any other value is treated as uppercase; the table based conversions use the keyword as table name, so unknown keywords leave the string unchanged.
 * @return	string		The converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper()
 */
function conv_case($charset,$string,$case) {
		// mb_strtolower()/mb_strtoupper() are only used from PHP 4.3 on
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring' &&
		version_compare(phpversion(),'4.3.0','>=')) {
		if ($case == 'toLower') {
			return mb_strtolower($string,$charset);
		} else {
			return mb_strtoupper($string,$charset);
		}
	} elseif ($charset == 'utf-8') {
		return $this->utf8_conv_case($string,$case);
	} elseif (!empty($this->eucBasedSets[$charset])) {
		return $this->euc_conv_case($string,$case,$charset);
	}

		// treat everything else as single-byte encoding
	if (!$this->initCaseFolding($charset)) return $string;	// do nothing
	$out = '';
	$caseConv =& $this->caseFolding[$charset][$case];

	$byteLength = strlen($string);
	for ($i=0; $i<$byteLength; $i++) {
		$c = $string[$i];
			// use the mapped value if there is one, otherwise keep the byte as it is
		$out .= !empty($caseConv[$c]) ? $caseConv[$c] : $c;
	}

		// is a simple strtr() faster or slower than the code above?
		// perhaps faster for small single-byte tables but slower for large multi-byte tables?

	return $out;
}
1260
/**
 * Converts special chars (like ÆØÅæøå, umlauts etc) to ascii equivalents (usually two characters, like æ => ae etc.)
 * CURRENTLY IT IS NOT FULLY IMPLEMENTED!!!
 * For UTF-8 only the six Danish/Norwegian letters listed below are replaced,
 * all other charsets are passed on to t3lib_div::convUmlauts().
 *
 * @param	string		Character set of string
 * @param	string		Input string to convert
 * @return	string		The converted string
 */
function specCharsToASCII($charset,$string) {
	if ($charset != 'utf-8') {
		return t3lib_div::convUmlauts($string);
	}

		// special character => ASCII replacement
	$asciiFor = array(
		'æ' => 'ae',
		'ø' => 'oe',
		'å' => 'aa',
		'Æ' => 'AE',
		'Ø' => 'OE',
		'Å' => 'AA',
	);

		// build the search patterns from the UTF-8 form of each character
	$pat = array();
	$repl = array();
	foreach ($asciiFor as $special => $ascii) {
		$pat[] = '/'.$this->utf8_encode($special, 'iso-8859-1').'/';
		$repl[] = $ascii;
	}

	return preg_replace($pat,$repl,$string);
}
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299 /********************************************
1300 *
1301 * Internal UTF-8 string operation functions
1302 *
1303 ********************************************/
1304
/**
 * Returns a part of a UTF-8 string.
 * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
 *
 * @param	string		UTF-8 string
 * @param	integer		Start position (character position)
 * @param	integer		Length (in characters). If omitted, the rest of the string is returned.
 * @return	string		The substring, FALSE if a positive $start lies outside the string
 * @see substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_substr($str,$start,$len=null) {
		// explicit zero length (0 or '0'); a NULL length means "not given" and must not match here
	if ((string)$len === '0') return '';

	$byte_start = $this->utf8_char2byte_pos($str,$start);
	if ($byte_start === false) {
		if ($start > 0) {
			return false;	// $start outside string length
		}
			// zero or negative start that could not be resolved: begin at the first byte
		$byte_start = 0;
	}

	$str = substr($str,$byte_start);

		// loose comparison on purpose: NULL, '' and FALSE all mean "no length given" (0 was handled above)
	if ($len != null) {
		$byte_end = $this->utf8_char2byte_pos($str,$len);
		if ($byte_end === false) {	// $len outside actual string length
			return $len<0 ? '' : $str;	// When length is less than zero and exceeds, then we return blank string.
		}
		return substr($str,0,$byte_end);
	}

	return $str;
}
1339
/**
 * Counts the number of characters of a string in UTF-8.
 * Every byte that is not a continuation byte (10xxxxxx) starts a character and is counted.
 * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
 *
 * @param	string		UTF-8 multibyte character string
 * @return	integer		The number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strlen($str) {
	$byteLength = strlen($str);
	$n = 0;
	for ($i=0; $i<$byteLength; $i++) {
			// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		if ((ord($str[$i]) & 0xC0) != 0x80) $n++;
	}
	return $n;
}
1360
/**
 * Truncates a string in UTF-8 short at a given byte length.
 * The result holds at most $len bytes; a multi-byte character that would be split by the cut is dropped completely.
 *
 * @param	string		UTF-8 multibyte character string
 * @param	integer		the byte length
 * @return	string		the shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strtrunc($str,$len) {
	if ($len <= 0) return '';
	if ($len >= strlen($str)) return $str;	// nothing to cut

	$i = $len-1;
	if (ord($str[$i]) & 0x80) {	// part of a multibyte sequence
			// walk back over continuation bytes (10xxxxxx) to the first byte of the sequence
		while ($i >= 0 && (ord($str[$i]) & 0xC0) == 0x80) $i--;
		if ($i < 0) return '';	// sanity check: only continuation bytes found

			// the number of leading 1-bits of the first byte is the number of bytes of the character
		for ($bc=0, $mbs=ord($str[$i]); $mbs & 0x80; $mbs = $mbs << 1) $bc++;
		if ($bc+$i > $len) return substr($str,0,$i);	// character does not fit: cut in front of it
			// fallthru: multibyte char fits into length
	}
	return substr($str,0,$len);
}
1381
/**
 * Find position of first occurrence of a string, both arguments are in UTF-8.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 string to search for
 * @param	integer		Position to start the search (character position)
 * @return	integer		The character position, FALSE if the needle was not found or the offset is beyond the string
 * @see strpos()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strpos($haystack,$needle,$offset=0) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
			// third argument of mb_strpos() is the offset, the encoding comes fourth
		return mb_strpos($haystack,$needle,$offset,'utf-8');
	}

	$byte_offset = $this->utf8_char2byte_pos($haystack,$offset);
	if ($byte_offset === false) return false;	// offset beyond string length

	$byte_pos = strpos($haystack,$needle,$byte_offset);
	if ($byte_pos === false) return false;	// needle not found

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
1405
/**
 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 character to search for (single character)
 * @return	integer		The character position, FALSE if the needle was not found
 * @see strrpos()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strrpos($haystack,$needle) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
			// The third argument of mb_strrpos() is an offset in current PHP versions, so the encoding
			// is set through the internal encoding instead (same technique as in substr() of this class).
		$enc = mb_internal_encoding();	// save internal encoding
		mb_internal_encoding('utf-8');
		$pos = mb_strrpos($haystack,$needle);
		mb_internal_encoding($enc);	// restore internal encoding

		return $pos;
	}

	$byte_pos = strrpos($haystack,$needle);
	if ($byte_pos === false) return false;	// needle not found

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
1425
/**
 * Translates a character position into an 'absolute' byte position.
 * Unit tested by Kasper.
 *
 * Returns FALSE when the scan runs off either end of the string. Note that this includes a negative
 * position equal to the full character count (the scan stops in front of the first byte).
 * NOTE(review): for a positive position equal to the character count the result is FALSE if the string
 * ends with a single-byte character, but the byte length if it ends with a multi-byte character.
 * This is kept as it was because crop() compensates for it - confirm before changing.
 *
 * @param	string		UTF-8 string
 * @param	integer		Character position (negative values start from the end)
 * @return	integer		Byte position
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char2byte_pos($str,$pos) {
	$byteLength = strlen($str);
	$n = 0;	// number of characters found
	$p = abs($pos);	// number of characters wanted

	if ($pos >= 0) {
		$i = 0;
		$d = 1;
	} else {
		$i = $byteLength-1;
		$d = -1;
	}

		// Explicit range checks instead of isset() on the offset: since PHP 7.1 a negative string offset
		// is valid (counts from the end), which would let the backward scan wrap around.
	for ( ; $i >= 0 && $i < $byteLength && $n < $p; $i += $d) {
		$c = ord($str[$i]);
			// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		if (!($c & 0x80) || ($c & 0xC0) == 0xC0) $n++;
	}
	if ($i < 0 || $i >= $byteLength) return false;	// offset beyond string length

	if ($pos >= 0) {
			// skip trailing multi-byte data bytes
		while ($i < $byteLength && (ord($str[$i]) & 0xC0) == 0x80) $i++;
	} else {
			// correct offset
		$i++;
	}

	return $i;
}
1466
/**
 * Translates an 'absolute' byte position into a character position.
 * Unit tested by Kasper.
 *
 * @param	string		UTF-8 string
 * @param	integer		byte position
 * @return	integer		character position, FALSE if the string is empty or the byte position lies outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_byte2char_pos($str,$pos) {
	$byteLength = strlen($str);
	if ($byteLength == 0 || $pos < 0 || $pos > $byteLength) return false;	// offset beyond string length

	$n = 0;	// number of characters
		// Count the character boundaries at byte positions 1..$pos; that equals the number of characters in front of $pos
	for ($i=$pos; $i>0; $i--) {
		if ($i == $byteLength) {
				// the position directly behind the last byte is a boundary as well
			$n++;
			continue;
		}
		$c = ord($str[$i]);
			// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		if (!($c & 0x80) || ($c & 0xC0) == 0xC0) $n++;
	}

	return $n;
}
1489
/**
 * Translates all characters of an UTF-8 string into their respective case values.
 * Characters without an entry in the case folding table are copied unchanged.
 * Unit-tested by Kasper
 *
 * @param	string		UTF-8 string
 * @param	string		conversion: 'toLower' or 'toUpper'
 * @return	string		the converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper(), mb_convert_case()
 */
function utf8_conv_case($str,$case) {
	if (!$this->initCaseFoldingUTF8()) return $str;	// do nothing

	$out = '';
	$caseConv =& $this->caseFolding['utf-8'][$case];

	$byteLength = strlen($str);
	for ($i=0; $i<$byteLength; $i++) {
		$c = ord($str[$i]);
		if (($c & 0xC0) == 0xC0) {	// multi-byte starting byte (11xxxxxx)
			for ($bc=0; $c & 0x80; $c = $c << 1) { $bc++; }	// calculate number of bytes
			$mbc = substr($str,$i,$bc);
			$i += $bc-1;
		} else {
				// single-byte (0xxxxxxx); a stray continuation byte of malformed input is passed through
				// as well instead of re-using the character of the previous round
			$mbc = $str[$i];
		}

		$out .= !empty($caseConv[$mbc]) ? $caseConv[$mbc] : $mbc;
	}

	return $out;
}
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544 /********************************************
1545 *
1546 * Internal EUC string operation functions
1547 *
1548 * Extended Unix Code:
1549 * ASCII compatible 7bit single bytes chars
1550 * 8bit two byte chars
1551 *
1552 * Shift-JIS is treated as a special case.
1553 *
1554 ********************************************/
1555
/**
 * Cuts a string in the EUC charset family short at a given byte length.
 * If the cut would split a double-byte character, that character is dropped completely.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		the byte length
 * @param	string		the charset
 * @return	string		the shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strtrunc($str,$len,$charset) {
	if ($len <= 0) return '';
	if (strlen($str) <= $len) return $str;	// string shorter than supplied length

	$sjis = ($charset == 'shift_jis');
		// Step through the characters until the cut position is reached or passed.
		// $i stays inside the string here because the string is longer than $len.
	for ($i=0; $i<$len; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++;	// advance a double-byte char
		} else {
			if ($c >= 0x80) $i++;	// advance a double-byte char
		}
	}

	if ($i > $len) {
		return substr($str,0,$len-1);	// we ended on a first byte
	}
	return substr($str,0,$len);
}
1584
/**
 * Returns a part of a string in the EUC charset family.
 * Handles the border cases the same way as utf8_substr() does.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		start position (character position)
 * @param	string		the charset
 * @param	integer		length (in characters). If omitted, the rest of the string is returned.
 * @return	string		the substring, FALSE if a positive $start lies outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_substr($str,$start,$charset,$len=null) {
		// explicit zero length (0 or '0'); a NULL length means "not given"
	if ((string)$len === '0') return '';

	$byte_start = $this->euc_char2byte_pos($str,$start,$charset);
	if ($byte_start === false) {
		if ($start > 0) {
			return false;	// $start outside string length
		}
			// zero or negative start that could not be resolved: begin at the first byte
		$byte_start = 0;
	}

	$str = substr($str,$byte_start);

		// loose comparison on purpose: NULL, '' and FALSE all mean "no length given" (0 was handled above)
	if ($len != null) {
		$byte_end = $this->euc_char2byte_pos($str,$len,$charset);
		if ($byte_end === false) {	// $len outside actual string length
			return $len<0 ? '' : $str;	// a negative length reaching beyond the start leaves nothing
		}
		return substr($str,0,$byte_end);
	}

	return $str;
}
1610
/**
 * Counts the number of characters of a string in the EUC charset family.
 * A byte that starts a double-byte character is counted once and the following byte is skipped.
 *
 * @param	string		EUC multibyte character string
 * @param	string		the charset
 * @return	integer		the number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strlen($str,$charset) {
	$sjis = ($charset == 'shift_jis');
	$byteLength = strlen($str);
	$n = 0;
	for ($i=0; $i<$byteLength; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			$doubleByte = (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0));
		} else {
			$doubleByte = ($c >= 0x80);
		}
		if ($doubleByte) $i++;	// advance a double-byte char

		$n++;
	}

	return $n;
}
1637
/**
 * Translates a character position into an 'absolute' byte position.
 * Returns FALSE when the scan runs off either end of the string.
 *
 * NOTE(review): when scanning backwards (negative position) the byte that is tested is the last byte
 * of a character, while the value ranges used here are those of a first byte - verify that this is
 * adequate for shift_jis.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		character position (negative values start from the end)
 * @param	string		the charset
 * @return	integer		byte position
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char2byte_pos($str,$pos,$charset) {
	$sjis = ($charset == 'shift_jis');
	$byteLength = strlen($str);
	$n = 0;	// number of characters seen
	$p = abs($pos);	// number of characters wanted

	if ($pos >= 0) {
		$i = 0;
		$d = 1;
	} else {
		$i = $byteLength-1;
		$d = -1;
	}

		// Explicit range checks instead of isset() on the offset: since PHP 7.1 a negative string offset
		// is valid (counts from the end), which would let the backward scan wrap around.
	for ( ; $i >= 0 && $i < $byteLength && $n < $p; $i += $d) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i += $d;	// advance a double-byte char
		} else {
			if ($c >= 0x80) $i += $d;	// advance a double-byte char
		}

		$n++;
	}
	if ($i < 0 || $i >= $byteLength) return false;	// offset beyond string length

	if ($pos < 0) $i++;	// correct offset

	return $i;
}
1677
/**
 * Translates all characters of a string in the EUC charset family into their respective case values.
 * Characters without an entry in the case folding table are copied unchanged.
 *
 * @param	string		EUC multibyte character string
 * @param	string		conversion: 'toLower' or 'toUpper'
 * @param	string		the charset
 * @return	string		the converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper(), mb_convert_case()
 */
function euc_conv_case($str,$case,$charset) {
	if (!$this->initCaseFolding($charset)) return $str;	// do nothing

	$sjis = ($charset == 'shift_jis');
	$out = '';
	$caseConv =& $this->caseFolding[$charset][$case];

		// Loop on the byte length: testing the current byte for truth would end the loop at the first "0" character
	$byteLength = strlen($str);
	for ($i=0; $i<$byteLength; $i++) {
		$mbc = $str[$i];
		$c = ord($mbc);

		if ($sjis) {
			$doubleByte = (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0));
		} else {
			$doubleByte = ($c >= 0x80);
		}
		if ($doubleByte) {	// a double-byte char
			$mbc = substr($str,$i,2);
			$i++;
		}

		$out .= !empty($caseConv[$mbc]) ? $caseConv[$mbc] : $mbc;
	}

	return $out;
}
1720 }
1721
// Include the extension class for this file if one is configured for the current TYPO3_MODE
if (defined('TYPO3_MODE')) {
	if ($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']) {
		include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
	}
}
1725 ?>