c093c53a8799c579b945e944e3ca76a925086698
[Packages/TYPO3.CMS.git] / t3lib / class.t3lib_cs.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2003-2004 Kasper Skaarhoj (kasper@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the Typo3 project. The Typo3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 *
17 * This script is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * This copyright notice MUST APPEAR in all copies of the script!
23 ***************************************************************/
24 /**
25 * Class for conversion between charsets.
26 *
27 * $Id$
28 *
29 * @author Kasper Skaarhoj <kasper@typo3.com>
30 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
31 */
32 /**
33 * [CLASS/FUNCTION INDEX of SCRIPT]
34 *
35 *
36 *
37 * 119: class t3lib_cs
38 * 261: function parse_charset($charset)
39 * 278: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
40 * 312: function utf8_encode($str,$charset)
41 * 359: function utf8_decode($str,$charset,$useEntityForNoChar=0)
42 * 407: function utf8_to_entities($str)
43 * 440: function entities_to_utf8($str,$alsoStdHtmlEnt=0)
44 * 474: function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
45 * 515: function initCharset($charset)
46 * 586: function UnumberToChar($cbyte)
47 * 630: function utf8CharToUnumber($str,$hex=0)
48 *
49 * SECTION: String operation functions
50 * 682: function strtrunc($charset,$string,$len)
51 * 716: function substr($charset,$str,$start,$len=null)
52 * 755: function strlen($charset,$string)
53 *
54 * SECTION: UTF-8 String operation functions
55 * 803: function utf8_strtrunc($str,$len)
56 * 831: function utf8_substr($str,$start,$len=null)
57 * 857: function utf8_strlen($str)
58 * 879: function utf8_strpos($haystack,$needle,$offset=0)
59 * 902: function utf8_strrpos($haystack,$needle)
60 * 921: function utf8_char2byte_pos($str,$pos)
61 * 946: function utf8_byte2char_pos($str,$pos)
62 *
63 * SECTION: EUC String operation functions
64 * 994: function euc_strtrunc($str,$len,$charset)
65 * 1028: function euc_substr($str,$start,$charset,$len=null)
66 * 1055: function euc_strlen($str,$charset)
67 * 1082: function euc_char2byte_pos($str,$pos,$charset)
68 *
69 * TOTAL FUNCTIONS: 24
70 * (This index is automatically created/updated by the extension "extdeveval")
71 *
72 */
73
74
75
76
77
78
79
80
81 /**
82 * Notes on UTF-8
83 *
84 * Functions working on UTF-8 strings:
85 *
86 * - strchr/strstr
87 * - strrchr
88 * - substr_count
89 * - implode/explode/join
90 *
91 * Functions nearly working on UTF-8 strings:
92 *
93 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
94 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
95 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
96 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
97 *
98 * Functions NOT working on UTF-8 strings:
99 *
100 * - str*cmp
101 * - stristr
102 * - stripos
103 * - substr
104 * - strrev
105 * - ereg/eregi
106 * - split/spliti
107 * - preg_*
108 * - ...
109 *
110 */
111 /**
112 * Class for conversion between charsets.
113 *
114 * @author Kasper Skaarhoj <kasper@typo3.com>
115 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
116 * @package TYPO3
117 * @subpackage t3lib
118 */
119 class t3lib_cs {
120 var $noCharByteVal=127; // ASCII Value for chars with no equalent.
121
122 // This is the array where parsed conversion tables are stored (cached)
123 var $parsedCharsets=array();
124
125 // An array where case folding data will be stored (cached)
126 var $caseFolding=array();
127
128 // This tells the converter which charsets has two bytes per char:
129 var $twoByteSets=array(
130 'ucs-2'=>1, // 2-byte Unicode
131 );
132
133 // This tells the converter which charsets has four bytes per char:
134 var $fourByteSets=array(
135 'ucs-4'=>1, // 4-byte Unicode
136 'utf-32'=>1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
137 );
138
139 // This tells the converter which charsets use a scheme like the Extended Unix Code:
140 var $eucBasedSets=array(
141 'gb2312'=>1, // Chinese, simplified.
142 'big5'=>1, // Chinese, traditional.
143 );
144
145 // see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
146 // http://czyborra.com/charsets/iso8859.html
var $synonyms=array(
		// ASCII
	'us' => 'ascii',
	'us-ascii'=> 'ascii',
		// ISO-8859-x aliases
	'cp819' => 'iso-8859-1',
	'ibm819' => 'iso-8859-1',
	'iso-ir-100' => 'iso-8859-1',
	'iso-ir-109' => 'iso-8859-2',
	'iso-ir-148' => 'iso-8859-9',
	'iso-ir-199' => 'iso-8859-14',
	'iso-ir-203' => 'iso-8859-15',
	'csisolatin1' => 'iso-8859-1',
	'csisolatin2' => 'iso-8859-2',
	'csisolatin3' => 'iso-8859-3',
	'csisolatin5' => 'iso-8859-9',
	'csisolatin8' => 'iso-8859-14',
	'csisolatin9' => 'iso-8859-15',
	'csisolatingreek' => 'iso-8859-7',
	'iso-celtic' => 'iso-8859-14',
	'latin1' => 'iso-8859-1',
	'latin2' => 'iso-8859-2',
	'latin3' => 'iso-8859-3',
	'latin5' => 'iso-8859-9',
	'latin6' => 'iso-8859-10',
	'latin8' => 'iso-8859-14',
	'latin9' => 'iso-8859-15',
	'l1' => 'iso-8859-1',
	'l2' => 'iso-8859-2',
	'l3' => 'iso-8859-3',
	'l5' => 'iso-8859-9',
	'l6' => 'iso-8859-10',
	'l8' => 'iso-8859-14',
	'l9' => 'iso-8859-15',
	'cyrillic' => 'iso-8859-5',
	'arabic' => 'iso-8859-6',
		// Windows code pages
	'win874' => 'windows-874',
	'win1250' => 'windows-1250',
	'win1251' => 'windows-1251',
	'win1252' => 'windows-1252',
	'win1253' => 'windows-1253',
	'win1254' => 'windows-1254',
	'win1255' => 'windows-1255',
	'win1256' => 'windows-1256',
	'win1257' => 'windows-1257',
	'win1258' => 'windows-1258',
	'cp1250' => 'windows-1250',
	'cp1252' => 'windows-1252',
	'ms-ee' => 'windows-1250',
	'ms-ansi' => 'windows-1252',
	'ms-greek' => 'windows-1253',
	'ms-turk' => 'windows-1254',
	'winbaltrim' => 'windows-1257',
		// KOI8 / Macintosh
	'koi-8ru' => 'koi-8r',
	'koi8r' => 'koi-8r',
	'mac' => 'macRoman',
	'macintosh' => 'macRoman',
		// East Asian multi-byte sets
	'euc-cn' => 'gb2312',
	'x-euc-cn' => 'gb2312',
	'cp936' => 'gb2312',
	'big-5' => 'big5',
	'cp950' => 'big5',
	'sjis' => 'shift_jis',
	'shift-jis' => 'shift_jis',
	'cp932' => 'shift_jis',
		// Unicode encodings (each alias listed exactly once)
	'utf7' => 'utf-7',
	'utf8' => 'utf-8',
	'utf16' => 'utf-16',
	'utf32' => 'utf-32',
	'ucs2' => 'ucs-2',
	'ucs4' => 'ucs-4',
);
218
219 // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
220 // Empty values means "iso-8859-1"
221 var $charSetArray = array(
222 'dk' => '',
223 'de' => '',
224 'no' => '',
225 'it' => '',
226 'fr' => '',
227 'es' => '',
228 'nl' => '',
229 'cz' => 'windows-1250',
230 'pl' => 'iso-8859-2',
231 'si' => 'windows-1250',
232 'fi' => '',
233 'tr' => 'iso-8859-9',
234 'se' => '',
235 'pt' => '',
236 'ru' => 'windows-1251',
237 'ro' => 'iso-8859-2',
238 'ch' => 'gb2312',
239 'sk' => 'windows-1250',
240 'lt' => 'windows-1257',
241 'is' => 'utf-8',
242 'hr' => 'windows-1250',
243 'hu' => 'iso-8859-2',
244 'gl' => '',
245 'th' => 'iso-8859-11',
246 'gr' => 'iso-8859-7',
247 'hk' => 'big5',
248 'eu' => '',
249 'bg' => 'windows-1251',
250 'br' => '',
251 'et' => 'iso-8859-4',
252 'ar' => 'iso-8859-6',
253 'he' => 'utf-8',
254 'ua' => 'windows-1251',
255 );
256
/**
 * Normalizes a charset name: the name is lowercased and, if the lowercased
 * name is a key of $this->synonyms, replaced by the value stored there.
 *
 * @param	string		Input charset name (any case)
 * @return	string		Normalized charset name
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function parse_charset($charset) {
	$normalized = strtolower($charset);

	return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
}
270
271
/**
 * Convert from one charset to another charset.
 *
 * If $useEntityForNoChar is not set and $TYPO3_CONF_VARS['SYS']['t3lib_cs_convMethod']
 * names one of 'iconv', 'recode' or 'mbstring' (and the matching PHP function is available),
 * that extension is tried first. If it is not configured, not available or reports failure
 * (FALSE), the built-in conversion via UTF-8 (utf8_encode()/utf8_decode()) is used.
 *
 * @param	string		Input string
 * @param	string		From charset (the current charset of the string)
 * @param	string		To charset (the output charset wanted)
 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return	string		Converted string
 */
function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) {
	if ($fromCS==$toCS) return $str;

	if (!$useEntityForNoChar) { // iconv and recode don't support fallback to SGML entities
		$method = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']) ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'] : '';
		$conv_str = false;

		if ($method == 'iconv' && function_exists('iconv')) {
				// iconv() takes (from-charset, to-charset, string) in that order
			$conv_str = iconv($fromCS,$toCS.'//TRANSLIT',$str);
		} elseif ($method == 'recode' && function_exists('recode_string')) {
				// a recode request reads "from..to"
			$conv_str = recode_string($fromCS.'..'.$toCS,$str);
		} elseif ($method == 'mbstring' && function_exists('mb_convert_encoding')) {
				// mb_convert_encoding() takes (string, to-charset, from-charset)
			$conv_str = mb_convert_encoding($str,$toCS,$fromCS);
		}

		if (false !== $conv_str) return $conv_str;
			// otherwise: fallback to TYPO3 conversion
	}

	if ($fromCS!='utf-8') $str=$this->utf8_encode($str,$fromCS);
	if ($toCS!='utf-8') $str=$this->utf8_decode($str,$toCS,$useEntityForNoChar);
	return $str;
}
306
307
/**
 * Converts $str from $charset to UTF-8
 *
 * Bytes 0-127 are copied unchanged unless the charset is listed in $this->twoByteSets.
 * For two-byte sets every pair of bytes is combined (big endian) into one local code.
 * For sets in $this->eucBasedSets, and for 'shift_jis' lead bytes outside 160-223,
 * a byte above 127 is combined with the following byte into a 16 bit local code.
 * Local codes missing from the conversion table become chr($this->noCharByteVal).
 *
 * @param	string		String in local charset to convert to UTF-8
 * @param	string		Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return	string		Output string, converted to UTF-8. Nothing (NULL) is returned if initCharset() fails for $charset.
 */
function utf8_encode($str,$charset) {

		// Charset is case-insensitive.
	if ($this->initCharset($charset)) { // Parse conv. table if not already...
		$strLen = strlen($str);
		$outStr = '';
		$noChar = chr($this->noCharByteVal);
		$isTwoByte = !empty($this->twoByteSets[$charset]);
		$isEuc = !empty($this->eucBasedSets[$charset]);

		for ($a=0;$a<$strLen;$a++) { // Traverse each char in string.
			$chr = substr($str,$a,1);
			$ord = ord($chr);

			if ($isTwoByte) { // If the charset has two bytes per char
				$ord2 = ord(substr($str,$a+1,1));
				$ord = ($ord<<8) | $ord2; // assume big endian; the bytes must be OR'ed together (AND would always give 0)
				$a++;

				$outStr.= isset($this->parsedCharsets[$charset]['local'][$ord]) ? $this->parsedCharsets[$charset]['local'][$ord] : $noChar;
			} elseif ($ord>127) { // Byte above 127: needs a table lookup (and possibly a second byte)
				if ($isEuc || ($charset == 'shift_jis' && ($ord<160 || $ord>223))) {
						// EUC uses two bytes above 127. Shift-JIS is like EUC, but chars between 160 and 223 are single byte.
					$a++;
					$ord2 = ord(substr($str,$a,1));
					$ord = $ord*256+$ord2;
				}

				$outStr.= isset($this->parsedCharsets[$charset]['local'][$ord]) ? $this->parsedCharsets[$charset]['local'][$ord] : $noChar;
			} else $outStr.=$chr; // ... otherwise it's just ASCII 0-127 and one byte. Transparent
		}
		return $outStr;
	}
}
353
/**
 * Converts $str from UTF-8 to $charset
 *
 * Each UTF-8 byte sequence is looked up in the parsed conversion table of $charset.
 * Local codes above 255 are written as two bytes (high byte first).
 *
 * @param	string		String in UTF-8 to convert to local charset
 * @param	string		Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return	string		Output string, converted to local charset. Nothing (NULL) is returned if initCharset() fails for $charset.
 */
function utf8_decode($str,$charset,$useEntityForNoChar=0) {

		// Parse conv. table if not already done; without a table nothing is returned.
	if (!$this->initCharset($charset)) return;

	$total = strlen($str);
	$result = '';

	for ($pos=0; $pos<$total; $pos++) { // Walk through the UTF-8 string byte by byte.
		$first = substr($str,$pos,1);
		$bits = ord($first);

			// ASCII 0-127: one byte, passed through unchanged.
		if ($bits<=127) {
			$result.=$first;
			continue;
		}

			// A lead byte must have the 7th bit set; otherwise we are in the MIDDLE of a sequence.
		if (!($bits & 64)) {
			$result.=chr($this->noCharByteVal);
			continue;
		}

			// Collect the sequence: every further leading 1-bit of the first byte announces one more byte.
			// (A variant that pre-computed the byte count was, according to the original author's note,
			// tried and reverted because it failed for russian UTF-8 converted back to windows-1251.)
		$seq = $first;
		for ($n=0; $n<8; $n++) {
			$bits = $bits << 1;
			if (!($bits & 128)) break;
			$pos++;
			$seq.=substr($str,$pos,1);
		}

		if (isset($this->parsedCharsets[$charset]['utf8'][$seq])) { // Sequence known in the target charset
			$local = $this->parsedCharsets[$charset]['utf8'][$seq];
			if ($local>255) { // 16bit word assumed: split into two chars
				$result.=chr(($local >> 8) & 255).chr($local & 255);
			} else {
				$result.=chr($local);
			}
		} elseif ($useEntityForNoChar) { // Create num entity
			$result.='&#'.$this->utf8CharToUnumber($seq,1).';';
		} else { // No char exists
			$result.=chr($this->noCharByteVal);
		}
	}

	return $result;
}
403
/**
 * Converts all chars > 127 to numeric entities.
 *
 * Every UTF-8 multibyte sequence is replaced by '&#' + utf8CharToUnumber(sequence, 1) + ';'.
 * A byte above 127 that cannot start a sequence is replaced by chr($this->noCharByteVal).
 *
 * @param	string		Input string
 * @return	string		Output string
 */
function utf8_to_entities($str) {
	$total = strlen($str);
	$result = '';

	for ($pos=0; $pos<$total; $pos++) { // Walk through the UTF-8 string byte by byte.
		$first = substr($str,$pos,1);
		$bits = ord($first);

			// ASCII 0-127: one byte, passed through unchanged.
		if ($bits<=127) {
			$result.=$first;
			continue;
		}

			// A lead byte must have the 7th bit set; otherwise we are in the MIDDLE of a sequence.
		if (!($bits & 64)) {
			$result.=chr($this->noCharByteVal);
			continue;
		}

			// Every further leading 1-bit of the first byte announces one more byte of the sequence.
		$seq = $first;
		for ($n=0; $n<8; $n++) {
			$bits = $bits << 1;
			if (!($bits & 128)) break;
			$pos++;
			$seq.=substr($str,$pos,1);
		}

		$result.='&#'.$this->utf8CharToUnumber($seq,1).';';
	}

	return $result;
}
435
/**
 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
 *
 * Anything that matches "&...;" but is neither a well-formed numeric entity nor
 * (with $alsoStdHtmlEnt set) a known named entity is left in the string unchanged.
 *
 * @param	string		Input string, UTF-8
 * @param	boolean		If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
 * @return	string		Output string
 */
function entities_to_utf8($str,$alsoStdHtmlEnt=0) {
	$trans_tbl = array();
	if ($alsoStdHtmlEnt) {
			// The table values are converted below with utf8_encode(...,'iso-8859-1'), so they must be ISO-8859-1.
			// The encoding argument exists since PHP 5.3.4 and the default became UTF-8 in PHP 5.4.0, so it is
			// requested explicitly where possible. Older versions only know the two-argument form.
		if (version_compare(PHP_VERSION,'5.3.4','>=')) {
			$trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES,ENT_COMPAT,'ISO-8859-1'));
		} else {
			$trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES));
		}
	}

		// With PREG_SPLIT_DELIM_CAPTURE the even indexes hold the plain text and the odd indexes hold the
		// entity names (the text between "&" and ";").
	$parts = preg_split('/&([#[:alnum:]]*);/',$str,-1,PREG_SPLIT_DELIM_CAPTURE);

	foreach($parts as $k => $v) {
		if ($k%2) {
			$entity = '&'.$v.';';
			if (substr($v,0,1)=='#') { // Dec or hex entities:
				$number = substr($v,1);
				$match = array();
				if (preg_match('/^[xX]([0-9a-fA-F]+)$/',$number,$match)) {
					$parts[$k] = $this->UnumberToChar(hexdec($match[1]));
				} elseif (preg_match('/^[0-9]+$/',$number)) {
					$parts[$k] = $this->UnumberToChar((int)$number);
				} else { // Malformed numeric entity: no conversion
					$parts[$k] = $entity;
				}
			} elseif (isset($trans_tbl[$entity])) { // Other entities:
				$parts[$k] = $this->utf8_encode($trans_tbl[$entity],'iso-8859-1');
			} else { // No conversion:
				$parts[$k] = $entity;
			}
		}
	}

	return implode('',$parts);
}
468
/**
 * Converts all chars in the input UTF-8 string into integer numbers returned in an array
 *
 * One array element is produced per character. A byte above 127 that cannot start a
 * sequence yields $this->noCharByteVal (or the matching char if $retChar is set).
 *
 * @param	string		Input string, UTF-8
 * @param	boolean		If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
 * @param	boolean		If set, then instead of integer numbers the real UTF-8 char is returned.
 * @return	array		Output array with the char numbers
 */
function utf8_to_numberarray($str,$convEntities=0,$retChar=0) {
		// Entities are resolved to UTF-8 first if requested:
	if ($convEntities) {
		$str = $this->entities_to_utf8($str,1);
	}

	$total = strlen($str);
	$list = array();

	for ($pos=0; $pos<$total; $pos++) { // Walk through the UTF-8 string byte by byte.
		$first = substr($str,$pos,1);
		$bits = ord($first);

			// ASCII 0-127: one byte, taken as it is.
		if ($bits<=127) {
			$list[] = $retChar ? chr($bits) : $bits;
			continue;
		}

			// A lead byte must have the 7th bit set; otherwise we are in the MIDDLE of a sequence.
		if (!($bits & 64)) {
			$list[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
			continue;
		}

			// Every further leading 1-bit of the first byte announces one more byte of the sequence.
		$seq = $first;
		for ($n=0; $n<8; $n++) {
			$bits = $bits << 1;
			if (!($bits & 128)) break;
			$pos++;
			$seq.=substr($str,$pos,1);
		}

		$list[] = $retChar ? $seq : $this->utf8CharToUnumber($seq);
	}

	return $list;
}
507
/**
 * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
 * This function is automatically called by the conversion functions
 *
 * The parsed table is stored in $this->parsedCharsets[$charset] with two maps:
 * 'local' (local code => UTF-8 char) and 'utf8' (UTF-8 char => local code).
 * Only local codes above 127 are stored.
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param	string		The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return	integer		Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and loaded (from the cache file or by parsing).
 * @access private
 */
function initCharset($charset) {
		// Only process if the charset is not yet loaded:
	if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) return 1;

		// Conversion table filename:
	$charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';

		// If the conversion table is not found:
	if (!($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile))) return false;

		// Cache file for charsets:
		// Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/charset_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
			// NOTE(review): unserialize() is applied to the content of a file in typo3temp/; this is only safe as long as nobody untrusted can write there.
		$cached = unserialize(t3lib_div::getUrl($cacheFile));
		if (is_array($cached)) {
			$this->parsedCharsets[$charset] = $cached;
			return 2;
		}
			// Unreadable/corrupt cache: fall through, parse the table again and rewrite the cache.
	}

		// Parse conversion table into lines:
	$lines = t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1);
		// Initialize the internal variable holding the conv. table:
	$this->parsedCharsets[$charset] = array('local'=>array(),'utf8'=>array());

		// The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
	$whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
	$detectedType = '';

	foreach($lines as $value) {
			// Comment lines or blanks are ignored.
		if (!trim($value) || substr($value,0,1)=='#') continue;

			// Detect type if not done yet: (Done on first real line)
		if (!$detectedType) $detectedType = preg_match($whitespacedPattern,$value) ? 'whitespaced' : 'ms-token';

		if ($detectedType=='ms-token') {
			$tokens = preg_split('/=|:/',$value,3);
			if (count($tokens)<2) continue; // no "code = U+xxxx" pair on this line
			$hexbyte = $tokens[0];
			$utf8 = $tokens[1];
		} else {
			$regA = array();
			if (!preg_match($whitespacedPattern,$value,$regA)) continue; // line carries no mapping
			$hexbyte = $regA[1];
			$utf8 = 'U+'.$regA[2];
		}

		$decval = hexdec(trim($hexbyte));
		if ($decval>127) {
			$utf8decval = hexdec(substr(trim($utf8),2)); // strip the "U+" prefix
			$utf8char = $this->UnumberToChar($utf8decval);
			$this->parsedCharsets[$charset]['local'][$decval] = $utf8char;
			$this->parsedCharsets[$charset]['utf8'][$utf8char] = $decval;
		}
	}

	if ($cacheFile) {
		t3lib_div::writeFile($cacheFile,serialize($this->parsedCharsets[$charset]));
	}

	return 2;
}
570
/**
 * Converts a UNICODE number to a UTF-8 multibyte character
 * Algorithm based on script found at From: http://czyborra.com/utf/
 *
 * The bits of the number are spread over the bytes of the sequence; the number of
 * high bits set in the lead byte announces the number of bytes in the sequence:
 *
 *  bytes | bits | representation
 *      1 |    7 | 0vvvvvvv
 *      2 |   11 | 110vvvvv 10vvvvvv
 *      3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
 *      4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * @param	integer		UNICODE integer
 * @return	string		UTF-8 multibyte character string; chr($this->noCharByteVal) if the number does not fit into 31 bits
 * @see utf8CharToUnumber()
 */
function UnumberToChar($cbyte) {
		// One byte: plain 7-bit value
	if ($cbyte < 0x80) return chr($cbyte);

		// Pick the lead byte marker and the number of continuation bytes
	if ($cbyte < 0x800) {
		$marker = 0xC0;
		$trailing = 1;
	} elseif ($cbyte < 0x10000) {
		$marker = 0xE0;
		$trailing = 2;
	} elseif ($cbyte < 0x200000) {
		$marker = 0xF0;
		$trailing = 3;
	} elseif ($cbyte < 0x4000000) {
		$marker = 0xF8;
		$trailing = 4;
	} elseif ($cbyte < 0x80000000) {
		$marker = 0xFC;
		$trailing = 5;
	} else { // Cannot express a 32-bit character in UTF-8
		return chr($this->noCharByteVal);
	}

		// Lead byte carries the topmost bits ...
	$utf8 = chr($marker | ($cbyte >> (6*$trailing)));
		// ... and each continuation byte the next lower group of 6 bits.
	for ($shift=6*($trailing-1); $shift>=0; $shift-=6) {
		$utf8.=chr(0x80 | (($cbyte >> $shift) & 0x3F));
	}

	return $utf8;
}
624
/**
 * Converts a UTF-8 Multibyte character to a UNICODE number
 *
 * If the first byte does not have its two highest bits set, its byte value is returned as it is.
 *
 * @param	string		UTF-8 multibyte character string
 * @param	boolean		If set, then a hex. number is returned (as string, prefixed with "x").
 * @return	integer		UNICODE integer
 * @see UnumberToChar()
 */
function utf8CharToUnumber($str,$hex=0) {
	$lead = ord(substr($str,0,1)); // First char
	$int = $lead;

	if (($lead & 192) == 192) { // This verifies that it IS a multi byte string
		$probe = $lead;
		$tailBits = '';
		$b = 0;
		while ($b<8) { // one round per continuation byte
			$probe = $probe << 1; // Shift it left and ...
			if (!($probe & 128)) break; // ... stop when the 8th bit is clear: no more bytes in sequence.
			$tailBits.=substr(sprintf('%08b',ord(substr($str,$b+1,1))),-6); // the lower 6 bits of the continuation byte
			$b++;
		}
			// With $b continuation bytes the lead byte contributes its lowest (6-$b) bits.
		$leadBits = substr(sprintf('%016b',$lead),-(6-$b));

		$int = bindec($leadBits.$tailBits);
	}

	return $hex ? 'x'.dechex($int) : $int;
}
651
/**
 * This function initializes the UTF-8 case folding table.
 *
 * The table is stored in $this->caseFolding['utf-8'] with the maps 'toUpper', 'toLower'
 * and 'toTitle'. Keys are single UTF-8 chars, values are UTF-8 strings (which may be
 * longer than one char for entries coming from SpecialCasing.txt).
 *
 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
 *
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initCaseFoldingUTF8() {
		// Only process if the case table is not yet loaded:
	if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8'])) return 1;

		// Use cached version if possible
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cscase_utf-8.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
			// NOTE(review): unserialize() is applied to the content of a file in typo3temp/; this is only safe as long as nobody untrusted can write there.
		$cached = unserialize(t3lib_div::getUrl($cacheFile));
		if (is_array($cached)) {
			$this->caseFolding['utf-8'] = $cached;
			return 2;
		}
			// Unreadable/corrupt cache: fall through and build the table again.
	}

		// process main Unicode data file
	$unicodeDataFile = PATH_t3lib.'unidata/UnicodeData.txt';
	if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) return false;

	$fh = fopen($unicodeDataFile,'r');
	if (!$fh) return false;

		// key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
		// note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
	$this->caseFolding['utf-8'] = array();
	$utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand
	$utf8CaseFolding['toUpper'] = array();
	$utf8CaseFolding['toLower'] = array();
	$utf8CaseFolding['toTitle'] = array();

	while (!feof($fh)) {
		$line = fgets($fh);
		if ($line === false) break; // nothing more to read

			// has also other info like character class (digit, white space, etc.) and more;
			// field 0 is the code point, fields 12/13/14 are the upper/lower/title mappings
		$fields = array_pad(explode(';',rtrim($line)),15,'');
		if ($fields[0] === '') continue; // blank line

		$upper = $fields[12];
		$lower = $fields[13];
		$title = $fields[14];

		$char = $this->UnumberToChar(hexdec($fields[0]));
		if ($upper) $utf8CaseFolding['toUpper'][$char] = $this->UnumberToChar(hexdec($upper));
		if ($lower) $utf8CaseFolding['toLower'][$char] = $this->UnumberToChar(hexdec($lower));
			// store "title" only when different from "upper" (only a few)
		if ($title && $title !== $upper) $utf8CaseFolding['toTitle'][$char] = $this->UnumberToChar(hexdec($title));
	}
	fclose($fh);

		// process additional Unicode data for casing (allow folded characters to expand into a sequence)
	$specialCasingFile = PATH_t3lib.'unidata/SpecialCasing.txt';
	if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {

		$fh = fopen($specialCasingFile,'r');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh);
				if ($line === false) break; // nothing more to read
				if (substr($line,0,1) == '#' || trim($line) == '') continue; // comment or blank line

				list($char,$lower,$title,$upper,$cond) = array_pad(t3lib_div::trimExplode(';',$line),5,'');

					// Only unconditional mappings are used: the 5th field is empty or already the trailing comment.
				if ($cond != '' && substr($cond,0,1) != '#') continue;

				$utf8_char = $this->UnumberToChar(hexdec($char));
				$mappings = array('toLower' => $lower, 'toTitle' => $title, 'toUpper' => $upper);
				foreach ($mappings as $type => $hexSequence) {
						// The hex fields are compared as strings (strict) so that values like "00E0" are never treated as numbers.
					if ($char === $hexSequence) continue;
					if ($type == 'toTitle' && $title === $upper) continue;

					$arr = explode(' ',$hexSequence);
					for ($i=0; isset($arr[$i]); $i++) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
					$utf8CaseFolding[$type][$utf8_char] = implode('',$arr);
				}
			}
			fclose($fh);
		}
	}

	if ($cacheFile) {
		t3lib_div::writeFile($cacheFile,serialize($utf8CaseFolding));
	}

	return 3;
}
739
/**
 * This function initializes the folding table for a charset other than UTF-8.
 * This function is automatically called by the case folding functions.
 *
 * The table is derived from the UTF-8 case folding table: every char of the charset's
 * conversion table is looked up there and the result is converted back to the charset.
 * The ASCII letters a-z/A-Z are added afterwards.
 *
 * @param	string		Charset to build the table for (must be usable with initCharset())
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initCaseFolding($charset) {
		// Only process if the case table is not yet loaded:
	if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset])) return 1;

		// Use cached version if possible
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cscase_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
			// NOTE(review): unserialize() is applied to the content of a file in typo3temp/; this is only safe as long as nobody untrusted can write there.
		$cached = unserialize(t3lib_div::getUrl($cacheFile));
		if (is_array($cached)) {
			$this->caseFolding[$charset] = $cached;
			return 2;
		}
			// Unreadable/corrupt cache: fall through and build the table again.
	}

		// init UTF-8 conversion for this charset
	if (!$this->initCharset($charset)) {
		return false;
	}

		// UTF-8 case folding is used as the base conversion table
	if (!$this->initCaseFoldingUTF8()) {
		return false;
	}

	$nochar = chr($this->noCharByteVal);
	$types = array('toUpper','toLower','toTitle');
	$this->caseFolding[$charset] = array('toUpper'=>array(),'toLower'=>array(),'toTitle'=>array());

	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
			// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->conv($utf8,'utf-8',$charset);

		foreach ($types as $type) {
				// chars without a mapping of this type have nothing to convert
			if (!isset($this->caseFolding['utf-8'][$type][$utf8])) continue;

			$cc = $this->conv($this->caseFolding['utf-8'][$type][$utf8],'utf-8',$charset);
				// keep the mapping only if the target exists in this charset
			if ($cc && $cc != $nochar) $this->caseFolding[$charset][$type][$c] = $cc;
		}
	}

		// add the ASCII case table
	for ($i=ord('a'); $i<=ord('z'); $i++) {
		$this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i-32);
	}
	for ($i=ord('A'); $i<=ord('Z'); $i++) {
		$this->caseFolding[$charset]['toLower'][chr($i)] = chr($i+32);
	}

	if ($cacheFile) {
		t3lib_div::writeFile($cacheFile,serialize($this->caseFolding[$charset]));
	}

	return 3;
}
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814 /********************************************
815 *
816 * String operation functions
817 *
818 ********************************************/
819
/**
 * Cuts a string short at a given byte length.
 *
 * With $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'] set to 'mbstring' the work is done by mb_strcut().
 * Otherwise UTF-8, Shift-JIS and the sets in $this->eucBasedSets are handled by
 * utf8_strtrunc()/euc_strtrunc(); for two- and four-byte sets the length is rounded
 * down to a multiple of the character width; everything else is cut as single-byte.
 *
 * @param	string		the character set
 * @param	string		character string
 * @param	integer		the byte length
 * @return	string		the shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strtrunc($charset,$string,$len) {
	$utils = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] : '';

	if ($utils == 'mbstring') {
		return mb_strcut($string,0,$len,$charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_strtrunc($string,$len); // the length must be handed on
	} elseif ($charset == 'shift_jis') {
		return $this->euc_strtrunc($string,$len,'shift_jis'); // argument order: string, length, charset
	} elseif (!empty($this->eucBasedSets[$charset])) {
		return $this->euc_strtrunc($string,$len,$charset);
	} elseif (!empty($this->twoByteSets[$charset])) {
		if ($len % 2) $len--; // don't cut at odd positions
	} elseif (!empty($this->fourByteSets[$charset])) {
		$len -= $len % 4; // realign to position dividable by four
	}

		// treat everything else as single-byte encoding
	return substr($string,0,$len);
}
848
/**
 * Returns a part of a string.
 *
 * Negative values for @arg $start and @arg $len are currently not supported.
 * If $len is omitted (NULL) the rest of the string from $start on is returned.
 *
 * @param	string		the character set
 * @param	string		character string
 * @param	int		$start start position (character position)
 * @param	int		length (in characters)
 * @return	string		the substring
 * @see substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function substr($charset,$string,$start,$len=null) {
	if ($len===0) return '';

	$utils = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] : '';

	if ($utils == 'mbstring') {
			// cannot omit $len, when specifying charset
		if ($len==null) {
			$enc = mb_internal_encoding(); // save internal encoding
			mb_internal_encoding($charset); // work in the charset of the string, not a fixed one
			$str = mb_substr($string,$start);
			mb_internal_encoding($enc); // restore internal encoding

			return $str;
		}
		return mb_substr($string,$start,$len,$charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_substr($string,$start,$len);
	} elseif ($charset == 'shift_jis') {
		return $this->euc_substr($string,$start,'shift_jis',$len);
	} elseif (!empty($this->eucBasedSets[$charset])) {
		return $this->euc_substr($string,$start,$charset,$len);
	} elseif (!empty($this->twoByteSets[$charset])) {
			// an omitted length must not be turned into a length of 0
		return $len===null ? substr($string,$start*2) : substr($string,$start*2,$len*2);
	} elseif (!empty($this->fourByteSets[$charset])) {
		return $len===null ? substr($string,$start*4) : substr($string,$start*4,$len*4);
	}

		// treat everything else as single-byte encoding
		// (before PHP 8 a NULL length passed to substr() is taken as 0, so it has to be left out)
	return $len===null ? substr($string,$start) : substr($string,$start,$len);
}
893
/**
 * Returns the length of a string measured in characters of the given charset.
 *
 * @param string the character set
 * @param string character string
 * @return integer the number of characters
 * @see strlen()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strlen($charset,$string) {
	$backend = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];

	if ($backend == 'mbstring') {
		return mb_strlen($string,$charset);
	}
	if ($charset == 'utf-8') {
		return $this->utf8_strlen($string);
	}
	if ($charset == 'shift_jis') {
		return $this->euc_strlen($string,'shift_jis');
	}
	if ($this->eucBasedSets[$charset]) {
		return $this->euc_strlen($string,$charset);
	}

	// fixed-width encodings: derive the character count from the byte count
	if ($this->twoByteSets[$charset]) {
		return strlen($string)/2;
	}
	if ($this->fourByteSets[$charset]) {
		return strlen($string)/4;
	}

	// anything else is taken to be a single-byte encoding
	return strlen($string);
}
920
/**
 * Translates all characters of a string into their respective case values.
 * Unlike strtolower() and strtoupper() this method is locale independent.
 *
 * Real case folding is language dependent, this method ignores this fact.
 *
 * @param string the character set
 * @param string the string to convert
 * @param string conversion: 'toLower' or 'toUpper'
 * @return string the converted string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper(), mb_convert_case()
 */
function conv_case($charset,$string,$case) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring' &&
		version_compare(phpversion(),'4.3.0','>=')) {
		if ($case == 'toLower') {
			return mb_strtolower($string,$charset);
		} else {
			return mb_strtoupper($string,$charset);
		}
	} elseif ($charset == 'utf-8') {
		return $this->utf8_conv_case($string,$case);
	}

	// shift_jis and the EUC based sets have no dedicated implementation here;
	// they take the byte-wise table lookup below like every other charset.

	// treat everything else as single-byte encoding
	if (!$this->initCaseFolding($charset)) return $string;	// do nothing

	$out = '';
	$caseConv =& $this->caseFolding[$charset][$case];
	$byteLen = strlen($string);
	// iterate by length: testing the character itself would stop at the first "0"
	for ($i=0; $i<$byteLen; $i++) {
		$c = $string[$i];
		if (!empty($caseConv[$c])) {
			$out .= $caseConv[$c];
		} else {
			$out .= $c;	// no mapping for this byte, keep it
		}
	}

	return $out;
}
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986 /********************************************
987 *
988 * UTF-8 string operation functions
989 *
990 ********************************************/
991
/**
 * Truncates a string in UTF-8 short at a given byte length.
 * If the cut would fall inside a multi-byte character, that character is
 * dropped completely, so the result may be shorter than $len bytes.
 *
 * @param string UTF-8 multibyte character string
 * @param integer the byte length
 * @return string the shortened string
 * @see mb_strcut()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strtrunc($str,$len) {
	if ($len <= 0) return '';
	if ($len >= strlen($str)) return $str;	// nothing to cut

	$i = $len-1;	// last byte that would be kept
	if (ord($str[$i]) & 0x80) {	// part of a multibyte sequence
		// walk back over continuation bytes (10xxxxxx) to the first byte of the sequence
		while ($i > 0 && (ord($str[$i]) & 0xC0) == 0x80) $i--;

		$lead = ord($str[$i]);
		if (($lead & 0xC0) == 0xC0) {
			// the number of leading 1-bits of the first byte is the sequence length
			for ($bc=0; $lead & 0x80; $lead = $lead << 1) $bc++;
			if ($i+$bc > $len) return substr($str,0,$i);	// sequence would be split: drop it
		}
		// otherwise the sequence fits into the length, or the input is malformed
		// and a plain byte cut is all that can be done
	}
	return substr($str,0,$len);
}
1014
/**
 * Extracts a run of characters from a UTF-8 string.
 *
 * Negative values for @arg $start and @arg $len are currently not supported.
 *
 * @param string $str UTF-8 string
 * @param int $start start position (character position)
 * @param int $len length (in characters)
 * @return string the substring, or false if $start lies outside the string
 * @see substr()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_substr($str,$start,$len=null) {
	if ($len===0) {
		return '';
	}

	$firstByte = $this->utf8_char2byte_pos($str,$start);
	if ($firstByte === false) {
		return false;	// $start outside string length
	}

	$tail = substr($str,$firstByte);
	if ($len == null) {
		return $tail;	// no length given: everything from $start on
	}

	$cutAt = $this->utf8_char2byte_pos($tail,$len);
	// false means $len reaches beyond the end of the remaining string
	return ($cutAt === false) ? $tail : substr($tail,0,$cutAt);
}
1045
/**
 * Counts the number of characters of a string in UTF-8.
 *
 * @param string UTF-8 multibyte character string
 * @return int the number of characters
 * @see strlen()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strlen($str) {
	$n = 0;
	$byteLen = strlen($str);
	// iterate by byte length: testing the byte itself would stop at the first "0"
	for ($i=0; $i<$byteLen; $i++) {
		// every byte except a continuation byte (10xxxxxx) starts a character:
		// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		if ((ord($str[$i]) & 0xC0) != 0x80) $n++;
	}
	return $n;
}
1065
/**
 * Find position of first occurrence of a string, both arguments are in UTF-8.
 *
 * @param string UTF-8 string to search in
 * @param string UTF-8 string to search for
 * @param int position to start the search (character position)
 * @return int the character position, or false if the needle was not found
 * @see strpos()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strpos($haystack,$needle,$offset=0) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		// third argument of mb_strpos() is the offset, the encoding comes fourth
		return mb_strpos($haystack,$needle,$offset,'utf-8');
	}

	$byte_offset = 0;
	if ($offset > 0) {	// a search from the very start needs no position translation
		$byte_offset = $this->utf8_char2byte_pos($haystack,$offset);
		if ($byte_offset === false) return false;	// offset beyond string length
	}

	$byte_pos = strpos($haystack,$needle,$byte_offset);
	if ($byte_pos === false) return false;	// needle not found

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
1089
/**
 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
 *
 * @param string UTF-8 string to search in
 * @param char UTF-8 character to search for
 * @return int the character position, or false if the needle was not found
 * @see strrpos()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strrpos($haystack,$needle) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		// The third argument of mb_strrpos() changed meaning between PHP versions
		// (encoding in old versions, offset in newer ones). The two-argument form
		// with a temporarily switched internal encoding works with either.
		$enc = mb_internal_encoding();	// save internal encoding
		mb_internal_encoding('utf-8');
		$pos = mb_strrpos($haystack,$needle);
		mb_internal_encoding($enc);	// restore internal encoding

		return $pos;
	}

	$byte_pos = strrpos($haystack,$needle);
	if ($byte_pos === false) return false;	// needle not found

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
1109
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * @param string UTF-8 string
 * @param int character position
 * @return int byte position, or false if the string ends before the position is reached
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char2byte_pos($str,$pos) {
	$byteLen = strlen($str);
	$n = 0;	// number of characters seen
	// iterate by byte length: testing the byte itself would stop at the first "0"
	for ($i=0; $i<$byteLen && $n<$pos; $i++) {
		// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		if ((ord($str[$i]) & 0xC0) != 0x80) $n++;
	}
	if ($i >= $byteLen) return false;	// offset beyond string length

	// $i is just behind the first byte of the last character counted:
	// skip its trailing multi-byte data bytes (10xxxxxx)
	while ($i<$byteLen && (ord($str[$i]) & 0xC0) == 0x80) { $i++; }

	return $i;
}
1134
/**
 * Translates an 'absolute' byte position into a character position.
 *
 * @param string UTF-8 string
 * @param int byte position
 * @return int character position, or false if the byte position lies outside the string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_byte2char_pos($str,$pos) {
	$byteLen = strlen($str);
	// validate the position itself; looking at the first byte of the string
	// says nothing about whether $pos is in range
	if ($pos < 0 || $pos > $byteLen) return false;	// offset beyond string length

	$n = 0;	// number of characters
	for ($i=$pos; $i>0; $i--) {
		// count the positions down to byte 1 that start a character: single-byte
		// (0xxxxxxx) or multi-byte starting byte (11xxxxxx); the end of the
		// string counts as a boundary as well
		if ($i == $byteLen || (ord($str[$i]) & 0xC0) != 0x80) $n++;
	}

	return $n;
}
1156
/**
 * Translates all characters of an UTF-8 string into their respective case values.
 * Characters without an entry in the case folding table are copied unchanged.
 *
 * @param string UTF-8 string
 * @param string conversion: 'toLower' or 'toUpper'
 * @return string the converted string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower()
 */
function utf8_conv_case($str,$case) {
	if (!$this->initCaseFoldingUTF8()) return $str;	// do nothing

	$out = '';
	$caseConv =& $this->caseFolding['utf-8'][$case];
	$byteLen = strlen($str);
	// iterate by byte length: testing the byte itself would stop at the first "0"
	for ($i=0; $i<$byteLen; $i++) {
		$c = ord($str[$i]);
		if (($c & 0xC0) == 0xC0) {	// multi-byte starting byte (11xxxxxx)
			// the number of leading 1-bits is the length of the sequence
			for ($bc=0; $c & 0x80; $c = $c << 1) { $bc++; }
			$mbc = substr($str,$i,$bc);
			$i += $bc-1;
		} else {
			// single-byte (0xxxxxxx), or a stray data byte of malformed input
			// which is passed through instead of repeating the previous character
			$mbc = $str[$i];
		}

		if (!empty($caseConv[$mbc])) {
			$out .= $caseConv[$mbc];
		} else {
			$out .= $mbc;
		}
	}

	return $out;
}
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209 /********************************************
1210 *
1211 * EUC string operation functions
1212 *
1213 * Extended Unix Code:
1214 * ASCII compatible 7bit single bytes chars
1215 * 8bit two byte chars
1216 *
1217 * Shift-JIS is treated as a special case.
1218 *
1219 ********************************************/
1220
/**
 * Cuts a string in the EUC charset family short at a given byte length.
 * If the cut would fall between the two bytes of a double-byte character,
 * that character is dropped, so the result may be one byte shorter than $len.
 *
 * @param string EUC multibyte character string
 * @param integer the byte length
 * @param string the charset
 * @return string the shortened string
 * @see mb_strcut()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strtrunc($str,$len,$charset) {
	if ($len <= 0) return '';
	if ($len >= strlen($str)) return $str;	// string not longer than supplied length

	$sjis = ($charset == 'shift_jis');
	// $len is smaller than the byte length here, so every offset read is valid
	for ($i=0; $i<$len; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++;	// advance a double-byte char
		}
		else {
			if ($c >= 0x80) $i++;	// advance a double-byte char
		}
	}

	if ($i>$len)
		return substr($str,0,$len-1);	// we ended on a first byte
	else
		return substr($str,0,$len);
}
1251
/**
 * Extracts a run of characters from a string in the EUC charset family.
 *
 * Negative values for @arg $start and @arg $len are currently not supported.
 *
 * @param string EUC multibyte character string
 * @param int start position (character position)
 * @param string the charset
 * @param int length (in characters)
 * @return string the substring, or false if the start position lies outside the string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_substr($str,$start,$charset,$len=null) {
	if ($len===0) {
		return '';
	}

	$firstByte = $this->euc_char2byte_pos($str,$start,$charset);
	if ($firstByte === false) {
		return false;	// $start outside string length
	}

	$tail = substr($str,$firstByte);
	if ($len == null) {
		return $tail;	// no length given: everything from $start on
	}

	$cutAt = $this->euc_char2byte_pos($tail,$len,$charset);
	// false means $len reaches beyond the end of the remaining string
	return ($cutAt === false) ? $tail : substr($tail,0,$cutAt);
}
1282
/**
 * Counts the number of characters of a string in the EUC charset family.
 *
 * @param string EUC multibyte character string
 * @param string the charset
 * @return int the number of characters
 * @see strlen()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strlen($str,$charset) {
	$sjis = ($charset == 'shift_jis');
	$byteLen = strlen($str);
	$n = 0;
	// iterate by byte length: testing the byte itself would stop at the first "0"
	for ($i=0; $i<$byteLen; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++;	// advance a double-byte char
		}
		else {
			if ($c >= 0x80) $i++;	// advance a double-byte char
		}

		$n++;
	}

	return $n;
}
1309
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * @param string EUC multibyte character string
 * @param int character position
 * @param string the charset
 * @return int byte position, or false if the string ends before the position is reached
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char2byte_pos($str,$pos,$charset) {
	$sjis = ($charset == 'shift_jis');
	$byteLen = strlen($str);
	$n = 0;	// number of characters seen
	// iterate by byte length: testing the byte itself would stop at the first "0"
	for ($i=0; $i<$byteLen && $n<$pos; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++;	// advance a double-byte char
		}
		else {
			if ($c >= 0x80) $i++;	// advance a double-byte char
		}

		$n++;
	}
	if ($i >= $byteLen) return false;	// offset beyond string length

	return $i;
}
1337
1338 }
1339
// Include the file registered for this script under the XCLASS configuration key, if there is one.
// !empty() keeps the check silent when no such entry has been configured.
if (defined('TYPO3_MODE') && !empty($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php'])) {
	include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
}
1343 ?>