Added case folding for EUC charsets.
[Packages/TYPO3.CMS.git] / t3lib / class.t3lib_cs.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2003-2004 Kasper Skaarhoj (kasper@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the Typo3 project. The Typo3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 *
17 * This script is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * This copyright notice MUST APPEAR in all copies of the script!
23 ***************************************************************/
24 /**
25 * Class for conversion between charsets.
26 *
27 * $Id$
28 *
29 * @author Kasper Skaarhoj <kasper@typo3.com>
30 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
31 */
32 /**
33 * [CLASS/FUNCTION INDEX of SCRIPT]
34 *
35 *
36 *
37 * 119: class t3lib_cs
38 * 261: function parse_charset($charset)
39 * 278: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
40 * 312: function utf8_encode($str,$charset)
41 * 359: function utf8_decode($str,$charset,$useEntityForNoChar=0)
42 * 407: function utf8_to_entities($str)
43 * 440: function entities_to_utf8($str,$alsoStdHtmlEnt=0)
44 * 474: function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
45 * 515: function initCharset($charset)
46 * 586: function UnumberToChar($cbyte)
47 * 630: function utf8CharToUnumber($str,$hex=0)
48 *
49 * SECTION: String operation functions
50 * 682: function strtrunc($charset,$string,$len)
51 * 716: function substr($charset,$str,$start,$len=null)
52 * 755: function strlen($charset,$string)
53 *
54 * SECTION: UTF-8 String operation functions
55 * 803: function utf8_strtrunc($str,$len)
56 * 831: function utf8_substr($str,$start,$len=null)
57 * 857: function utf8_strlen($str)
58 * 879: function utf8_strpos($haystack,$needle,$offset=0)
59 * 902: function utf8_strrpos($haystack,$needle)
60 * 921: function utf8_char2byte_pos($str,$pos)
61 * 946: function utf8_byte2char_pos($str,$pos)
62 *
63 * SECTION: EUC String operation functions
64 * 994: function euc_strtrunc($str,$len,$charset)
65 * 1028: function euc_substr($str,$start,$charset,$len=null)
66 * 1055: function euc_strlen($str,$charset)
67 * 1082: function euc_char2byte_pos($str,$pos,$charset)
68 *
69 * TOTAL FUNCTIONS: 24
70 * (This index is automatically created/updated by the extension "extdeveval")
71 *
72 */
73
74
75
76
77
78
79
80
81 /**
82 * Notes on UTF-8
83 *
84 * Functions working on UTF-8 strings:
85 *
86 * - strchr/strstr
87 * - strrchr
88 * - substr_count
89 * - implode/explode/join
90 *
91 * Functions nearly working on UTF-8 strings:
92 *
93 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
94 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
95 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
96 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
97 *
98 * Functions NOT working on UTF-8 strings:
99 *
100 * - str*cmp
101 * - stristr
102 * - stripos
103 * - substr
104 * - strrev
105 * - ereg/eregi
106 * - split/spliti
107 * - preg_*
108 * - ...
109 *
110 */
111 /**
112 * Class for conversion between charsets.
113 *
114 * @author Kasper Skaarhoj <kasper@typo3.com>
115 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
116 * @package TYPO3
117 * @subpackage t3lib
118 */
119 class t3lib_cs {
120 var $noCharByteVal=127; // ASCII Value for chars with no equalent.
121
122 // This is the array where parsed conversion tables are stored (cached)
123 var $parsedCharsets=array();
124
125 // An array where case folding data will be stored (cached)
126 var $caseFolding=array();
127
128 // This tells the converter which charsets has two bytes per char:
129 var $twoByteSets=array(
130 'ucs-2'=>1, // 2-byte Unicode
131 );
132
133 // This tells the converter which charsets has four bytes per char:
134 var $fourByteSets=array(
135 'ucs-4'=>1, // 4-byte Unicode
136 'utf-32'=>1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
137 );
138
139 // This tells the converter which charsets use a scheme like the Extended Unix Code:
140 var $eucBasedSets=array(
141 'gb2312'=>1, // Chinese, simplified.
142 'big5'=>1, // Chinese, traditional.
143 );
144
145 // see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
146 // http://czyborra.com/charsets/iso8859.html
147 var $synonyms=array(
148 'us' => 'ascii',
149 'us-ascii'=> 'ascii',
150 'cp819' => 'iso-8859-1',
151 'ibm819' => 'iso-8859-1',
152 'iso-ir-100' => 'iso-8859-1',
153 'iso-ir-109' => 'iso-8859-2',
154 'iso-ir-148' => 'iso-8859-9',
155 'iso-ir-199' => 'iso-8859-14',
156 'iso-ir-203' => 'iso-8859-15',
157 'csisolatin1' => 'iso-8859-1',
158 'csisolatin2' => 'iso-8859-2',
159 'csisolatin3' => 'iso-8859-3',
160 'csisolatin5' => 'iso-8859-9',
161 'csisolatin8' => 'iso-8859-14',
162 'csisolatin9' => 'iso-8859-15',
163 'csisolatingreek' => 'iso-8859-7',
164 'iso-celtic' => 'iso-8859-14',
165 'latin1' => 'iso-8859-1',
166 'latin2' => 'iso-8859-2',
167 'latin3' => 'iso-8859-3',
168 'latin5' => 'iso-8859-9',
169 'latin6' => 'iso-8859-10',
170 'latin8' => 'iso-8859-14',
171 'latin9' => 'iso-8859-15',
172 'l1' => 'iso-8859-1',
173 'l2' => 'iso-8859-2',
174 'l3' => 'iso-8859-3',
175 'l5' => 'iso-8859-9',
176 'l6' => 'iso-8859-10',
177 'l8' => 'iso-8859-14',
178 'l9' => 'iso-8859-15',
179 'cyrillic' => 'iso-8859-5',
180 'arabic' => 'iso-8859-6',
181 'win874' => 'windows-874',
182 'win1250' => 'windows-1250',
183 'win1251' => 'windows-1251',
184 'win1252' => 'windows-1252',
185 'win1253' => 'windows-1253',
186 'win1254' => 'windows-1254',
187 'win1255' => 'windows-1255',
188 'win1256' => 'windows-1256',
189 'win1257' => 'windows-1257',
190 'win1258' => 'windows-1258',
191 'cp1250' => 'windows-1250',
192 'cp1252' => 'windows-1252',
193 'ms-ee' => 'windows-1250',
194 'ms-ansi' => 'windows-1252',
195 'ms-greek' => 'windows-1253',
196 'ms-turk' => 'windows-1254',
197 'winbaltrim' => 'windows-1257',
198 'koi-8ru' => 'koi-8r',
199 'koi8r' => 'koi-8r',
200 'mac' => 'macRoman',
201 'macintosh' => 'macRoman',
202 'euc-cn' => 'gb2312',
203 'x-euc-cn' => 'gb2312',
204 'cp936' => 'gb2312',
205 'big-5' => 'big5',
206 'cp950' => 'big5',
207 'sjis' => 'shift_jis',
208 'shift-jis' => 'shift_jis',
209 'cp932' => 'shift_jis',
210 'utf7' => 'utf-7',
211 'utf8' => 'utf-8',
212 'utf16' => 'utf-16',
213 'utf32' => 'utf-32',
214 'utf8' => 'utf-8',
215 'ucs2' => 'ucs-2',
216 'ucs4' => 'ucs-4',
217 );
218
219 // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
220 // Empty values means "iso-8859-1"
221 var $charSetArray = array(
222 'dk' => '',
223 'de' => '',
224 'no' => '',
225 'it' => '',
226 'fr' => '',
227 'es' => '',
228 'nl' => '',
229 'cz' => 'windows-1250',
230 'pl' => 'iso-8859-2',
231 'si' => 'windows-1250',
232 'fi' => '',
233 'tr' => 'iso-8859-9',
234 'se' => '',
235 'pt' => '',
236 'ru' => 'windows-1251',
237 'ro' => 'iso-8859-2',
238 'ch' => 'gb2312',
239 'sk' => 'windows-1250',
240 'lt' => 'windows-1257',
241 'is' => 'utf-8',
242 'hr' => 'windows-1250',
243 'hu' => 'iso-8859-2',
244 'gl' => '',
245 'th' => 'iso-8859-11',
246 'gr' => 'iso-8859-7',
247 'hk' => 'big5',
248 'eu' => '',
249 'bg' => 'windows-1251',
250 'br' => '',
251 'et' => 'iso-8859-4',
252 'ar' => 'iso-8859-6',
253 'he' => 'utf-8',
254 'ua' => 'windows-1251',
255 );
256
/**
 * Normalizes a charset name: the name is lowercased and, if it is a known
 * alias, replaced by the name registered in the synonym table.
 *
 * @param string Input charset
 * @return string Normalized charset
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function parse_charset($charset)	{
	$normalized = strtolower($charset);

		// Resolve an alias if there is one, otherwise keep the lowercased name.
	return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
}
270
271
/**
 * Convert from one charset to another charset.
 *
 * If $TYPO3_CONF_VARS['SYS']['t3lib_cs_convMethod'] names an available
 * extension ("iconv", "recode" or "mbstring") that extension is tried first.
 * When it is not available, fails, or when entity fallback is requested, the
 * conversion is done with the internal tables via UTF-8.
 *
 * @param string Input string
 * @param string From charset (the current charset of the string)
 * @param string To charset (the output charset wanted)
 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Converted string
 */
function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)	{
	global $TYPO3_CONF_VARS;

	if ($fromCS==$toCS)	return $str;

	if (!$useEntityForNoChar)	{	// iconv, recode and mbstring don't support fallback to SGML entities
		$method = isset($TYPO3_CONF_VARS['SYS']['t3lib_cs_convMethod']) ? $TYPO3_CONF_VARS['SYS']['t3lib_cs_convMethod'] : '';
		$conv_str = false;

		if ($method == 'iconv' && function_exists('iconv'))	{
				// iconv() takes (from, to, string) - in that order
			$conv_str = iconv($fromCS,$toCS.'//TRANSLIT',$str);
		} elseif ($method == 'recode' && function_exists('recode_string'))	{
				// a recode request reads "before..after"
			$conv_str = recode_string($fromCS.'..'.$toCS,$str);
		} elseif ($method == 'mbstring' && function_exists('mb_convert_encoding'))	{
				// NOTE(review): newer PHP versions may raise an error instead of returning false for unknown charsets - confirm
			$conv_str = mb_convert_encoding($str,$toCS,$fromCS);
		}

		if (false !== $conv_str)	return $conv_str;
			// otherwise: fallback to TYPO3 conversion
	}

	if ($fromCS!='utf-8')	$str = $this->utf8_encode($str,$fromCS);
	if ($toCS!='utf-8')	$str = $this->utf8_decode($str,$toCS,$useEntityForNoChar);
	return $str;
}
306
307
/**
 * Converts $str from $charset to UTF-8
 *
 * Bytes 0-127 are passed through unchanged (except for the fixed two-byte
 * charsets, where every character is looked up). Characters without an entry
 * in the conversion table are replaced by the byte $this->noCharByteVal.
 *
 * @param string String in local charset to convert to UTF-8
 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return string Output string, converted to UTF-8. Nothing (NULL) is returned if the conversion table could not be initialized.
 */
function utf8_encode($str,$charset)	{

	if ($this->initCharset($charset))	{	// Parse conv. table if not already...
		$strLen = strlen($str);
		$outStr = '';
		$isTwoByteSet = !empty($this->twoByteSets[$charset]);
		$isEucSet = !empty($this->eucBasedSets[$charset]);

		for ($a=0;$a<$strLen;$a++)	{	// Traverse each byte in string.
			$chr = substr($str,$a,1);
			$ord = ord($chr);

			if ($isTwoByteSet)	{	// The charset always has two bytes per char
				$a++;
				$ord = ($ord << 8) | ord(substr($str,$a,1));	// assume big endian
			} elseif ($ord>127)	{	// A byte above 127 is (the start of) a non-ASCII char in the local charset
					// EUC uses two bytes above 127. Shift-JIS is like EUC, but chars between 160 and 223 are single byte.
				if ($isEucSet || ($charset == 'shift_jis' && ($ord<160 || $ord>223)))	{
					$a++;
					$ord = $ord*256 + ord(substr($str,$a,1));	// make $ord a 16 bit int
				}
			} else {	// ASCII 0-127 and one byte. Transparent
				$outStr .= $chr;
				continue;
			}

				// Use the UTF-8 sequence from the parsed conv. table, otherwise the "no char" byte
			if (isset($this->parsedCharsets[$charset]['local'][$ord]))	{
				$outStr .= $this->parsedCharsets[$charset]['local'][$ord];
			} else {
				$outStr .= chr($this->noCharByteVal);
			}
		}
		return $outStr;
	}
}
353
/**
 * Converts $str from UTF-8 to $charset
 *
 * @param string String in UTF-8 to convert to local charset
 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Output string, converted to local charset. Nothing (NULL) is returned if the conversion table could not be initialized.
 */
function utf8_decode($str,$charset,$useEntityForNoChar=0)	{

		// Parse conv. table if not already done; without a table there is nothing to return.
	if (!$this->initCharset($charset))	return;

	$placeholder = chr($this->noCharByteVal);
	$result = '';
	$total = strlen($str);

	for ($pos=0; $pos<$total; $pos++)	{	// Traverse each byte in UTF-8 string.
		$byte = substr($str,$pos,1);
		$code = ord($byte);

		if ($code<=127)	{	// ASCII 0-127 and one byte. Transparent
			$result .= $byte;
			continue;
		}
		if (!($code & 64))	{	// A lead byte must have the 7th bit set; this byte is from the MIDDLE of a sequence.
			$result .= $placeholder;
			continue;
		}

			// Collect the whole sequence: every further high bit set in the lead byte announces one more byte.
			// (A variant that calculated the byte count up front was reported to fail for russian UTF-8 -> windows-1251, so this loop is kept.)
		$sequence = $byte;
		for ($n=0; $n<8; $n++)	{
			$code = $code << 1;
			if (!($code & 128))	break;
			$pos++;
			$sequence .= substr($str,$pos,1);
		}

		if (isset($this->parsedCharsets[$charset]['utf8'][$sequence]))	{
			$local = $this->parsedCharsets[$charset]['utf8'][$sequence];	// The local number
				// A local number above 255 is split (16 bit word assumed) in two chars.
			$result .= ($local>255) ? chr(($local >> 8) & 255).chr($local & 255) : chr($local);
		} elseif ($useEntityForNoChar)	{	// Create num entity:
			$result .= '&#'.$this->utf8CharToUnumber($sequence,1).';';
		} else {	// No char exists
			$result .= $placeholder;
		}
	}

	return $result;
}
403
/**
 * Converts all chars > 127 to numeric entities.
 *
 * @param string Input string, UTF-8
 * @return string Output string
 */
function utf8_to_entities($str)	{
	$result = '';
	$total = strlen($str);
	$pos = 0;

	while ($pos<$total)	{	// Traverse each byte in UTF-8 string.
		$byte = substr($str,$pos,1);
		$code = ord($byte);

		if ($code<=127)	{	// ASCII 0-127 and one byte. Transparent
			$result .= $byte;
		} elseif (!($code & 64))	{	// A lead byte must have the 7th bit set; this byte is from the MIDDLE of a sequence.
			$result .= chr($this->noCharByteVal);
		} else {
				// Every further high bit set in the lead byte announces one more byte of the sequence.
			$sequence = $byte;
			for ($n=0; $n<8; $n++)	{
				$code = $code << 1;
				if (!($code & 128))	break;
				$pos++;
				$sequence .= substr($str,$pos,1);
			}
			$result .= '&#'.$this->utf8CharToUnumber($sequence,1).';';
		}
		$pos++;
	}

	return $result;
}
435
/**
 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
 *
 * @param string Input string, UTF-8
 * @param boolean If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
 * @return string Output string
 */
function entities_to_utf8($str,$alsoStdHtmlEnt=0)	{
	$trans_tbl = array();
	if ($alsoStdHtmlEnt)	{
			// The table values are re-encoded from iso-8859-1 below, so ask for that encoding
			// explicitly where PHP supports the argument (the default encoding is UTF-8 in later versions).
		$entityTable = version_compare(PHP_VERSION,'5.3.4','>=')
			? get_html_translation_table(HTML_ENTITIES,ENT_COMPAT,'ISO-8859-1')
			: get_html_translation_table(HTML_ENTITIES);
		$trans_tbl = array_flip($entityTable);
	}

		// Split on "&...;" and keep the entity name: odd keys hold entity names, even keys the text in between.
	$parts = preg_split('/&([#[:alnum:]]*);/',$str,-1,PREG_SPLIT_DELIM_CAPTURE);
	foreach ($parts as $k => $v)	{
		if ($k%2)	{
			if (substr($v,0,1)=='#')	{	// Dec or hex entities:
				$radixMark = substr($v,1,1);
				if ($radixMark=='x' || $radixMark=='X')	{
					$parts[$k] = $this->UnumberToChar(hexdec(substr($v,2)));
				} else {
					$parts[$k] = $this->UnumberToChar(intval(substr($v,1)));
				}
			} elseif ($alsoStdHtmlEnt && isset($trans_tbl['&'.$v.';']))	{	// Other entities:
				$parts[$k] = $this->utf8_encode($trans_tbl['&'.$v.';'],'iso-8859-1');
			} else {	// No conversion:
				$parts[$k] = '&'.$v.';';
			}
		}
	}

	return implode('',$parts);
}
468
/**
 * Converts all chars in the input UTF-8 string into integer numbers returned in an array
 *
 * @param string Input string, UTF-8
 * @param boolean If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
 * @param boolean If set, then instead of integer numbers the real UTF-8 char is returned.
 * @return array Output array with the char numbers
 */
function utf8_to_numberarray($str,$convEntities=0,$retChar=0)	{
		// Entities are turned into real characters first, if requested:
	if ($convEntities)	{
		$str = $this->entities_to_utf8($str,1);
	}

	$numbers = array();
	$total = strlen($str);
	$pos = 0;

	while ($pos<$total)	{	// Traverse each byte in UTF-8 string.
		$byte = substr($str,$pos,1);
		$code = ord($byte);

		if ($code<=127)	{	// ASCII 0-127 and one byte. Transparent
			$numbers[] = $retChar ? chr($code) : $code;
		} elseif (!($code & 64))	{	// A lead byte must have the 7th bit set; this byte is from the MIDDLE of a sequence.
			$numbers[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
		} else {
				// Every further high bit set in the lead byte announces one more byte of the sequence.
			$sequence = $byte;
			for ($n=0; $n<8; $n++)	{
				$code = $code << 1;
				if (!($code & 128))	break;
				$pos++;
				$sequence .= substr($str,$pos,1);
			}
			$numbers[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
		}
		$pos++;
	}

	return $numbers;
}
507
/**
 * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
 * This function is automatically called by the conversion functions
 *
 * Only characters with a local value above 127 are stored in the table.
 * The parsed table is cached in typo3temp/; a cache file that does not
 * unserialize to an array is ignored and rebuilt.
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param string The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return integer Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and loaded (from the cache file or by parsing).
 * @access private
 */
function initCharset($charset)	{
		// Only process if the charset is not yet loaded:
	if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset]))	{
		return 1;
	}

		// Conversion table filename:
	$charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';
	if (!$charset || !t3lib_div::validPathStr($charsetConvTableFile) || !@is_file($charsetConvTableFile))	{
		return false;
	}

		// Cache file for charsets:
		// Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/charset_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile))	{
		$cached = unserialize(t3lib_div::getUrl($cacheFile));
		if (is_array($cached))	{
			$this->parsedCharsets[$charset] = $cached;
			return 2;
		}
			// Unusable cache file: fall through and parse the table again.
	}

		// Parse conversion table into lines:
	$lines = t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1);
		// Initialize the internal variable holding the conv. table:
	$this->parsedCharsets[$charset] = array('local'=>array(),'utf8'=>array());

		// The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
	$whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
	$detectedType = '';

	foreach ($lines as $value)	{
			// Comment line or blanks are ignored.
		if (trim($value)==='' || substr($value,0,1)=='#')	continue;

			// Detect type if not done yet: (Done on first real line)
		if (!$detectedType)	{
			$detectedType = preg_match($whitespacedPattern,$value) ? 'whitespaced' : 'ms-token';
		}

		if ($detectedType=='ms-token')	{
			$fields = preg_split('/=|:/',$value,3);
			$hexbyte = $fields[0];
			$utf8 = isset($fields[1]) ? $fields[1] : '';
		} else {
			$regA = array();
			if (!preg_match($whitespacedPattern,$value,$regA))	continue;	// Not a mapping line
			$hexbyte = $regA[1];
			$utf8 = 'U+'.$regA[2];
		}

		$decval = hexdec(trim($hexbyte));
		if ($decval>127)	{
			$utf8decval = hexdec(substr(trim($utf8),2));	// strip the leading "U+"
			$utf8Char = $this->UnumberToChar($utf8decval);
			$this->parsedCharsets[$charset]['local'][$decval] = $utf8Char;
			$this->parsedCharsets[$charset]['utf8'][$utf8Char] = $decval;
		}
	}

	if ($cacheFile)	{
		t3lib_div::writeFile($cacheFile,serialize($this->parsedCharsets[$charset]));
	}

	return 2;
}
570
/**
 * Converts a UNICODE number to a UTF-8 multibyte character
 * Algorithm based on script found at From: http://czyborra.com/utf/
 *
 * The binary representation of the character's integer value is thus simply spread across the bytes and the number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
 *
 * bytes | bits | representation
 * 1 | 7 | 0vvvvvvv
 * 2 | 11 | 110vvvvv 10vvvvvv
 * 3 | 16 | 1110vvvv 10vvvvvv 10vvvvvv
 * 4 | 21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 * 5 | 26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 * 6 | 31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * @param integer UNICODE integer
 * @return string UTF-8 multibyte character string
 * @see utf8CharToUnumber()
 */
function UnumberToChar($cbyte)	{
		// One byte: the value is the character.
	if ($cbyte < 0x80)	return chr($cbyte);

		// Pick the number of trailing bytes and the marker bits of the lead byte.
	if ($cbyte < 0x800)	{
		$trailing = 1;
		$leadMarker = 0xC0;
	} elseif ($cbyte < 0x10000)	{
		$trailing = 2;
		$leadMarker = 0xE0;
	} elseif ($cbyte < 0x200000)	{
		$trailing = 3;
		$leadMarker = 0xF0;
	} elseif ($cbyte < 0x4000000)	{
		$trailing = 4;
		$leadMarker = 0xF8;
	} elseif ($cbyte < 0x80000000)	{
		$trailing = 5;
		$leadMarker = 0xFC;
	} else {	// Cannot express a 32-bit character in UTF-8
		return chr($this->noCharByteVal);
	}

		// Lead byte carries the topmost bits, each trailing byte the next six bits.
	$char = chr($leadMarker | ($cbyte >> (6*$trailing)));
	for ($shift=6*($trailing-1); $shift>=0; $shift-=6)	{
		$char .= chr(0x80 | (($cbyte >> $shift) & 0x3F));
	}

	return $char;
}
624
/**
 * Converts a UTF-8 Multibyte character to a UNICODE number
 *
 * @param string UTF-8 multibyte character string
 * @param boolean If set, then a hex. number is returned (as a string prefixed with "x").
 * @return integer UNICODE integer
 * @see UnumberToChar()
 */
function utf8CharToUnumber($str,$hex=0)	{
	$lead = ord(substr($str,0,1));	// First char
	$int = $lead;	// Single byte: the byte value is the number

	if (($lead & 192) == 192)	{	// This verifies that it IS a multi byte string
		$probe = $lead;
		$tailBits = 0;
		for ($count=0; $count<8; $count++)	{	// for each byte announced by the lead byte...
			$probe = $probe << 1;
			if (!($probe & 128))	break;	// no more high bits set: sequence is complete
				// every following byte contributes its six low bits
			$tailBits = ($tailBits << 6) | (ord(substr($str,$count+1,1)) & 0x3F);
		}

			// The lead byte contributes the bits below its marker; a lead byte made of marker bits only is used as a whole.
		$leadBits = ($count<6) ? ($lead & ((1 << (6-$count)) - 1)) : $lead;
		$int = ($leadBits << (6*$count)) | $tailBits;
	}

	return $hex ? 'x'.dechex($int) : $int;
}
651
/**
 * This function initializes the UTF-8 case folding table.
 *
 * The table is built from PATH_t3lib.'unidata/UnicodeData.txt' and, if present,
 * 'unidata/SpecialCasing.txt' (unconditional mappings only), then cached in typo3temp/.
 *
 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
 *
 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initCaseFoldingUTF8()	{
		// Only process if the case table is not yet loaded:
	if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8']))	return 1;

		// Use cached version if possible (a cache file that does not hold an array is ignored)
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cscase_utf-8.tbl');
	if ($cacheFile && @is_file($cacheFile))	{
		$cached = unserialize(t3lib_div::getUrl($cacheFile));
		if (is_array($cached))	{
			$this->caseFolding['utf-8'] = $cached;
			return 2;
		}
	}

		// process main Unicode data file
	$unicodeDataFile = PATH_t3lib.'unidata/UnicodeData.txt';
	if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile)))	return false;

	$fh = fopen($unicodeDataFile,'r');
	if (!$fh)	return false;

		// key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
		// note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
	$this->caseFolding['utf-8'] = array();
	$utf8CaseFolding =& $this->caseFolding['utf-8'];	// a shorthand
	$utf8CaseFolding['toUpper'] = array();
	$utf8CaseFolding['toLower'] = array();
	$utf8CaseFolding['toTitle'] = array();

	while (!feof($fh))	{
		$line = fgets($fh);
		if ($line === false)	break;	// nothing more to read
		$line = rtrim($line);
		if ($line === '')	continue;

			// the line has also other info like character class (digit, white space, etc.) and more;
			// only field 0 (code point) and fields 12-14 (upper, lower, title mapping) are used
		$fields = explode(';',$line);
		if (count($fields) < 15)	continue;

		$char = $this->UnumberToChar(hexdec($fields[0]));
		$upper = $fields[12];
		$lower = $fields[13];
		$title = $fields[14];

		if ($upper !== '')	$utf8CaseFolding['toUpper'][$char] = $this->UnumberToChar(hexdec($upper));
		if ($lower !== '')	$utf8CaseFolding['toLower'][$char] = $this->UnumberToChar(hexdec($lower));
			// store "title" only when different from "upper" (only a few)
		if ($title !== '' && $title !== $upper)	$utf8CaseFolding['toTitle'][$char] = $this->UnumberToChar(hexdec($title));
	}
	fclose($fh);

		// process additional Unicode data for casing (allow folded characters to expand into a sequence)
	$specialCasingFile = PATH_t3lib.'unidata/SpecialCasing.txt';
	if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile))	{

		$fh = fopen($specialCasingFile,'r');
		if ($fh)	{
			while (!feof($fh))	{
				$line = fgets($fh);
				if ($line === false)	break;	// nothing more to read
				if (trim($line) === '' || substr($line,0,1) == '#')	continue;	// blank or comment line

				list($char,$lower,$title,$upper,$cond) = array_pad(t3lib_div::trimExplode(';',$line),5,'');

					// Only mappings without a condition are used: the fifth field is then empty or already the trailing comment.
				if ($cond !== '' && substr($cond,0,1) != '#')	continue;

				$utf8_char = $this->UnumberToChar(hexdec($char));

					// Each mapping is a space separated list of code points (compared as strings, not as numbers)
				if ($char !== $lower)	{
					$sequence = '';
					foreach (explode(' ',$lower) as $hexCode)	$sequence .= $this->UnumberToChar(hexdec($hexCode));
					$utf8CaseFolding['toLower'][$utf8_char] = $sequence;
				}
				if ($char !== $title && $title !== $upper)	{
					$sequence = '';
					foreach (explode(' ',$title) as $hexCode)	$sequence .= $this->UnumberToChar(hexdec($hexCode));
					$utf8CaseFolding['toTitle'][$utf8_char] = $sequence;
				}
				if ($char !== $upper)	{
					$sequence = '';
					foreach (explode(' ',$upper) as $hexCode)	$sequence .= $this->UnumberToChar(hexdec($hexCode));
					$utf8CaseFolding['toUpper'][$utf8_char] = $sequence;
				}
			}
			fclose($fh);
		}
	}

	if ($cacheFile)	{
		t3lib_div::writeFile($cacheFile,serialize($utf8CaseFolding));
	}

	return 3;
}
739
/**
 * This function initializes the folding table for a charset other than UTF-8.
 * This function is automatically called by the case folding functions.
 *
 * The table is derived from the UTF-8 case folding table: every character of
 * the charset's conversion table that has a case mapping gets that mapping
 * converted back into the charset. The ASCII letters are added afterwards.
 *
 * @param string The charset to build the table for (must have a conversion table, see initCharset())
 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initCaseFolding($charset)	{
		// Only process if the case table is not yet loaded:
	if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset]))	return 1;

		// Use cached version if possible
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cscase_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile))	{
		$this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

		// init UTF-8 conversion for this charset
	if (!$this->initCharset($charset))	{
		return false;
	}

		// UTF-8 case folding is used as the base conversion table
	if (!$this->initCaseFoldingUTF8())	{
		return false;
	}

	$nochar = chr($this->noCharByteVal);
	$modes = array('toUpper','toLower','toTitle');
	$this->caseFolding[$charset] = array('toUpper'=>array(),'toLower'=>array(),'toTitle'=>array());

	foreach ($this->parsedCharsets[$charset]['local'] as $utf8)	{
		$c = null;	// the character in the local charset, converted only when a mapping exists

		foreach ($modes as $mode)	{
				// most characters have no case mapping at all
			if (!isset($this->caseFolding['utf-8'][$mode][$utf8]))	continue;

				// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
			if ($c === null)	$c = $this->conv($utf8,'utf-8',$charset);

				// keep the mapping only if its target can be expressed in the charset
			$cc = $this->conv($this->caseFolding['utf-8'][$mode][$utf8],'utf-8',$charset);
			if ($cc && $cc !== $nochar)	$this->caseFolding[$charset][$mode][$c] = $cc;
		}
	}

		// add the ASCII case table
	for ($i=ord('a'); $i<=ord('z'); $i++)	{
		$this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i-32);
	}
	for ($i=ord('A'); $i<=ord('Z'); $i++)	{
		$this->caseFolding[$charset]['toLower'][chr($i)] = chr($i+32);
	}

	if ($cacheFile)	{
		t3lib_div::writeFile($cacheFile,serialize($this->caseFolding[$charset]));
	}

	return 3;
}
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814 /********************************************
815 *
816 * String operation functions
817 *
818 ********************************************/
819
/**
 * Cuts a string short at a given byte length.
 *
 * The cut is moved back where needed so that no multi-byte character is split.
 *
 * @param string the character set
 * @param string character string
 * @param integer the byte length
 * @return string the shortened string
 * @see mb_strcut()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strtrunc($charset,$string,$len)	{
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
		return mb_strcut($string,0,$len,$charset);
	} elseif ($charset == 'utf-8')	{
		return $this->utf8_strtrunc($string,$len);
	} elseif ($charset == 'shift_jis')	{
		return $this->euc_strtrunc($string,$len,'shift_jis');
	} elseif (!empty($this->eucBasedSets[$charset]))	{
		return $this->euc_strtrunc($string,$len,$charset);
	} elseif (!empty($this->twoByteSets[$charset]))	{
		if ($len % 2)	$len--;	// don't cut at odd positions
	} elseif (!empty($this->fourByteSets[$charset]))	{
		$len -= $len % 4;	// realign to position dividable by four
	}

		// fixed-width sets (with the realigned length) and everything else, which is treated as single-byte encoding
	return substr($string,0,$len);
}
848
/**
 * Returns a part of a string.
 *
 * Negative values for @arg $start and @arg $len are currently not supported.
 *
 * @param string the character set
 * @param string character string
 * @param int $start start position (character position)
 * @param int length (in characters); if omitted the rest of the string is returned
 * @return string the substring
 * @see substr()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function substr($charset,$string,$start,$len=null)	{
	if ($len===0)	return '';

		// No (usable) length given: return everything from $start on.
	$toEnd = ($len==null);

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
			// cannot omit $len, when specifying charset
		if ($toEnd)	{
			$enc = mb_internal_encoding();	// save internal encoding
			mb_internal_encoding($charset);
			$str = mb_substr($string,$start);
			mb_internal_encoding($enc);	// restore internal encoding

			return $str;
		}
		return mb_substr($string,$start,$len,$charset);
	} elseif ($charset == 'utf-8')	{
		return $this->utf8_substr($string,$start,$len);
	} elseif ($charset == 'shift_jis')	{
		return $this->euc_substr($string,$start,'shift_jis',$len);
	} elseif (!empty($this->eucBasedSets[$charset]))	{
		return $this->euc_substr($string,$start,$charset,$len);
	}

		// Fixed-width sets: scale character positions to byte positions.
		// Everything else is treated as single-byte encoding.
	$width = 1;
	if (!empty($this->twoByteSets[$charset]))	{
		$width = 2;
	} elseif (!empty($this->fourByteSets[$charset]))	{
		$width = 4;
	}

		// The length argument is left out entirely when reading to the end: an explicit NULL is not "no limit" in every PHP version.
	return $toEnd ? substr($string,$start*$width) : substr($string,$start*$width,$len*$width);
}
893
/**
 * Determines how many characters a string consists of in the given charset.
 *
 * @param	string		the character set
 * @param	string		character string
 * @return	integer		the number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strlen($charset,$string) {
	$useMbstring = ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring');
	if ($useMbstring) {
		return mb_strlen($string,$charset);
	}
	if ($charset == 'utf-8') {
		return $this->utf8_strlen($string);
	}
	if ($charset == 'shift_jis') {
		return $this->euc_strlen($string,'shift_jis');
	}
	if ($this->eucBasedSets[$charset]) {
		return $this->euc_strlen($string,$charset);
	}

		// fixed-width charsets: the character count follows from the byte count
	$byteCount = strlen($string);
	if ($this->twoByteSets[$charset]) {
		return $byteCount/2;
	}
	if ($this->fourByteSets[$charset]) {
		return $byteCount/4;
	}

		// anything else is handled as a single-byte encoding
	return $byteCount;
}
920
/**
 * Translates all characters of a string into their respective case values.
 * Unlike strtolower() and strtoupper() this method is locale independent.
 *
 * Real case folding is language dependent, this method ignores this fact.
 *
 * @param	string		the character set
 * @param	string		character string
 * @param	string		conversion: 'toLower' or 'toUpper'
 * @return	string		the converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper(), mb_convert_case()
 */
function conv_case($charset,$string,$case) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring' &&
		version_compare(phpversion(),'4.3.0','>=')) {
			// operate on the passed string in its own charset
		if ($case == 'toLower') {
			return mb_strtolower($string,$charset);
		} else {
			return mb_strtoupper($string,$charset);
		}
	} elseif ($charset == 'utf-8') {
		return $this->utf8_conv_case($string,$case);
	} elseif ($charset == 'shift_jis') {
		return $this->euc_conv_case($string,$case,'shift_jis');
	} elseif (!empty($this->eucBasedSets[$charset])) {
		return $this->euc_conv_case($string,$case,$charset);
	}

		// treat everything else as single-byte encoding
	if (!$this->initCaseFolding($charset)) return $string;	// do nothing

	$out = '';
	$caseConv =& $this->caseFolding[$charset][$case];
		// iterate by length: testing the character itself would stop at a '0'
	$length = strlen($string);
	for ($i=0; $i<$length; $i++) {
		$c = $string[$i];
		$out .= !empty($caseConv[$c]) ? $caseConv[$c] : $c;
	}

		// is a simple strtr() faster or slower than the code above?
		// perhaps faster for small single-byte tables but slower for large multi-byte tables?

	return $out;
}
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983 /********************************************
984 *
985 * UTF-8 string operation functions
986 *
987 ********************************************/
988
/**
 * Truncates a string in UTF-8 short at a given byte length.
 * A multi-byte character that does not fit completely into the
 * given length is dropped.
 *
 * @param	string		UTF-8 multibyte character string
 * @param	integer		the byte length
 * @return	string		the shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strtrunc($str,$len) {
	if ($len <= 0) return '';
	if ($len >= strlen($str)) return $str;	// nothing to cut

	$i = $len-1;
	if (ord($str[$i]) & 0x80) {	// part of a multibyte sequence
			// walk back over continuation bytes (10xxxxxx) to the first byte of the sequence
		while ($i > 0 && (ord($str[$i]) & 0xC0) == 0x80) $i--;

		$lead = ord($str[$i]);
		if (($lead & 0xC0) == 0xC0) {	// valid starting byte (11xxxxxx)
				// the number of leading 1-bits is the number of bytes of the sequence
			for ($bc=0, $mbs=$lead; $mbs & 0x80; $mbs = $mbs << 1) $bc++;
			if ($i+$bc > $len) return substr($str,0,$i);	// character would be split: drop it
		}
		// fallthru: multibyte char fits into length (or malformed input is cut bytewise)
	}
	return substr($str,0,$len);
}
1011
/**
 * Extracts a range of characters from a UTF-8 string.
 *
 * Negative values for @arg $start and @arg $len are currently not supported.
 *
 * @param	string		UTF-8 string
 * @param	int		start position (character position)
 * @param	int		length (in characters)
 * @return	string		the substring, false if the start lies outside the string
 * @see substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_substr($str,$start,$len=null) {
	if ($len===0) {
		return '';
	}

	$firstByte = $this->utf8_char2byte_pos($str,$start);
	if ($firstByte === false) {
		return false;	// $start outside string length
	}

	$tail = substr($str,$firstByte);
	if ($len==null) {
		return $tail;	// no length given: everything from $start on
	}

	$endByte = $this->utf8_char2byte_pos($tail,$len);

		// a length reaching beyond the end of the string yields the complete remainder
	return ($endByte === false) ? $tail : substr($tail,0,$endByte);
}
1042
/**
 * Counts the number of characters of a string in UTF-8.
 *
 * @param	string		UTF-8 multibyte character string
 * @return	int		the number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strlen($str) {
	$n = 0;
		// iterate by byte length: testing the byte itself would stop at a '0' character
	$length = strlen($str);
	for ($i=0; $i<$length; $i++) {
			// count single bytes (0xxxxxxx) and starting bytes (11xxxxxx), skip continuation bytes (10xxxxxx)
		if ((ord($str[$i]) & 0xC0) != 0x80) $n++;
	}
	return $n;
}
1062
/**
 * Find position of first occurrence of a string, both arguments are in UTF-8.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 string to search for
 * @param	int		position to start the search (character position)
 * @return	int		the character position, false if the needle was not found
 * @see strpos()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strpos($haystack,$needle,$offset=0) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
			// third argument is the offset, the encoding comes fourth
		return mb_strpos($haystack,$needle,$offset,'utf-8');
	}

	$byte_offset = $this->utf8_char2byte_pos($haystack,$offset);
	if ($byte_offset === false) return false;	// offset beyond string length

	$byte_pos = strpos($haystack,$needle,$byte_offset);
	if ($byte_pos === false) return false;	// needle not found

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
1086
/**
 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
 *
 * @param	string		UTF-8 string to search in
 * @param	char		UTF-8 character to search for
 * @return	int		the character position, false if the needle was not found
 * @see strrpos()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strrpos($haystack,$needle) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
			// mb_strrpos() takes (haystack, needle, offset, encoding); passing the
			// encoding as third argument is no longer accepted by current PHP versions
		return mb_strrpos($haystack,$needle,0,'utf-8');
	}

	$byte_pos = strrpos($haystack,$needle);
	if ($byte_pos === false) return false;	// needle not found

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
1106
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * @param	string		UTF-8 string
 * @param	int		character position
 * @return	int		byte position, false if the position lies beyond the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char2byte_pos($str,$pos) {
	$length = strlen($str);
	$n = 0;	// number of characters found

		// bounded by the byte length: testing the byte itself would stop at a '0' character
	for ($i=0; $i<$length && $n<$pos; $i++) {
		$c = ord($str[$i]);
		if (!($c & 0x80))	// single-byte (0xxxxxx)
			$n++;
		elseif (($c & 0xC0) == 0xC0)	// multi-byte starting byte (11xxxxxx)
			$n++;
	}
	if ($i >= $length) return false;	// offset beyond string length

		// skip trailing multi-byte data bytes (10xxxxxx) of the last counted character
	while ($i<$length && (ord($str[$i]) & 0xC0) == 0x80) { $i++; }

	return $i;
}
1131
/**
 * Translates an 'absolute' byte position into a character position.
 *
 * @param	string		UTF-8 string
 * @param	int		byte position
 * @return	int		character position, false if the byte position lies outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_byte2char_pos($str,$pos) {
		// check the position against the real length instead of testing a byte for
		// truthiness (which wrongly fails for strings beginning with '0')
	if ($pos < 0 || $pos > strlen($str)) return false;	// offset beyond string length

	$n = 0;	// number of characters starting before the byte position
	for ($i=0; $i<$pos; $i++) {
			// single bytes (0xxxxxxx) and starting bytes (11xxxxxx) begin a character
		if ((ord($str[$i]) & 0xC0) != 0x80) $n++;
	}

	return $n;
}
1153
/**
 * Translates all characters of an UTF-8 string into their respective case values.
 *
 * @param	string		UTF-8 string
 * @param	string		conversion: 'toLower' or 'toUpper'
 * @return	string		the converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper(), mb_convert_case()
 */
function utf8_conv_case($str,$case) {
	if (!$this->initCaseFoldingUTF8()) return $str;	// do nothing

	$out = '';
	$caseConv =& $this->caseFolding['utf-8'][$case];
		// bounded by the byte length: testing the byte itself would stop at a '0' character
	$length = strlen($str);
	for ($i=0; $i<$length; $i++) {
		$c = ord($str[$i]);
		if (($c & 0xC0) == 0xC0) {	// multi-byte starting byte (11xxxxxx)
			for ($bc=0; $c & 0x80; $c = $c << 1) { $bc++; }	// calculate number of bytes
			$mbc = substr($str,$i,$bc);
			$i += $bc-1;
		} else {
				// single-byte (0xxxxxxx); a stray data byte (10xxxxxx) is passed through
				// unchanged instead of repeating the previous character
			$mbc = $str[$i];
		}

		$out .= !empty($caseConv[$mbc]) ? $caseConv[$mbc] : $mbc;
	}

	return $out;
}
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206 /********************************************
1207 *
1208 * EUC string operation functions
1209 *
1210 * Extended Unix Code:
1211 * ASCII compatible 7bit single bytes chars
1212 * 8bit two byte chars
1213 *
1214 * Shift-JIS is treated as a special case.
1215 *
1216 ********************************************/
1217
/**
 * Cuts a string in the EUC charset family short at a given byte length.
 * A double-byte character that does not fit completely into the
 * given length is dropped.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		the byte length
 * @param	string		the charset
 * @return	string		the shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strtrunc($str,$len,$charset) {
	if ($len <= 0) return '';
	if ($len >= strlen($str)) return $str;	// string not longer than supplied length

	$sjis = ($charset == 'shift_jis');
		// step through the characters until the byte limit is reached or overstepped
	for ($i=0; $i<$len; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++;	// advance a double-byte char
		}
		else {
			if ($c >= 0x80) $i++;	// advance a double-byte char
		}
	}

	if ($i>$len)
		return substr($str,0,$len-1);	// the limit falls between the two bytes of a character
	else
		return substr($str,0,$len);
}
1248
/**
 * Extracts a range of characters from a string in the EUC charset family.
 *
 * Negative values for @arg $start and @arg $len are currently not supported.
 *
 * @param	string		EUC multibyte character string
 * @param	int		start position (character position)
 * @param	string		the charset
 * @param	int		length (in characters)
 * @return	string		the substring, false if the start lies outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_substr($str,$start,$charset,$len=null) {
	if ($len===0) {
		return '';
	}

	$firstByte = $this->euc_char2byte_pos($str,$start,$charset);
	if ($firstByte === false) {
		return false;	// $start outside string length
	}

	$tail = substr($str,$firstByte);
	if ($len==null) {
		return $tail;	// no length given: everything from $start on
	}

	$endByte = $this->euc_char2byte_pos($tail,$len,$charset);

		// a length reaching beyond the end of the string yields the complete remainder
	return ($endByte === false) ? $tail : substr($tail,0,$endByte);
}
1279
/**
 * Counts the number of characters of a string in the EUC charset family.
 *
 * @param	string		EUC multibyte character string
 * @param	string		the charset
 * @return	int		the number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strlen($str,$charset) {
	$sjis = ($charset == 'shift_jis');
	$n = 0;
		// bounded by the byte length: testing the byte itself would stop at a '0' character
	$length = strlen($str);
	for ($i=0; $i<$length; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++;	// advance a double-byte char
		}
		else {
			if ($c >= 0x80) $i++;	// advance a double-byte char
		}

		$n++;
	}

	return $n;
}
1306
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * @param	string		EUC multibyte character string
 * @param	int		character position
 * @param	string		the charset
 * @return	int		byte position, false if the position lies beyond the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char2byte_pos($str,$pos,$charset) {
	$sjis = ($charset == 'shift_jis');
	$length = strlen($str);
	$n = 0;	// number of characters seen

		// bounded by the byte length: testing the byte itself would stop at a '0' character
	for ($i=0; $i<$length && $n<$pos; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++;	// advance a double-byte char
		}
		else {
			if ($c >= 0x80) $i++;	// advance a double-byte char
		}

		$n++;
	}
	if ($i >= $length) return false;	// offset beyond string length

	return $i;
}
1334
/**
 * Translates all characters of a string in the EUC charset family into their respective case values.
 *
 * @param	string		EUC multibyte character string
 * @param	string		conversion: 'toLower' or 'toUpper'
 * @param	string		the charset
 * @return	string		the converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper(), mb_convert_case()
 */
function euc_conv_case($str,$case,$charset) {
	if (!$this->initCaseFolding($charset)) return $str;	// do nothing

	$sjis = ($charset == 'shift_jis');
	$out = '';
	$caseConv =& $this->caseFolding[$charset][$case];
		// bounded by the byte length: testing the byte itself would stop at a '0' character
	$length = strlen($str);
	for ($i=0; $i<$length; $i++) {
		$c = ord($str[$i]);

			// decide whether this byte starts a double-byte char
		if ($sjis) {
			$isDoubleByte = (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0));
		} else {
			$isDoubleByte = ($c >= 0x80);
		}

		if ($isDoubleByte) {
			$mbc = substr($str,$i,2);
			$i++;
		} else {
			$mbc = $str[$i];
		}

			// characters without an entry in the folding table are copied unchanged
		$out .= !empty($caseConv[$mbc]) ? $caseConv[$mbc] : $mbc;
	}

	return $out;
}
1377
1378 }
1379
// Load an extension class (XCLASS) for this file if one is registered for the current TYPO3 mode.
if (defined('TYPO3_MODE')) {
	if ($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']) {
		include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
	}
}
1383 ?>