386d2ecc425f931d733a217c0bb863008899aef9
[Packages/TYPO3.CMS.git] / t3lib / class.t3lib_cs.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2003-2004 Kasper Skaarhoj (kasper@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the Typo3 project. The Typo3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 *
17 * This script is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * This copyright notice MUST APPEAR in all copies of the script!
23 ***************************************************************/
24 /**
25 * Class for conversion between charsets.
26 *
27 * $Id$
28 *
29 * @author Kasper Skaarhoj <kasper@typo3.com>
30 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
31 */
32 /**
33 * [CLASS/FUNCTION INDEX of SCRIPT]
34 *
35 *
36 *
37 * 119: class t3lib_cs
38 * 261: function parse_charset($charset)
39 * 278: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
40 * 312: function utf8_encode($str,$charset)
41 * 359: function utf8_decode($str,$charset,$useEntityForNoChar=0)
42 * 407: function utf8_to_entities($str)
43 * 440: function entities_to_utf8($str,$alsoStdHtmlEnt=0)
44 * 474: function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
45 * 515: function initCharset($charset)
46 * 586: function UnumberToChar($cbyte)
47 * 630: function utf8CharToUnumber($str,$hex=0)
48 *
49 * SECTION: String operation functions
50 * 682: function strtrunc($charset,$string,$len)
51 * 716: function substr($charset,$str,$start,$len=null)
52 * 755: function strlen($charset,$string)
53 *
54 * SECTION: UTF-8 String operation functions
55 * 803: function utf8_strtrunc($str,$len)
56 * 831: function utf8_substr($str,$start,$len=null)
57 * 857: function utf8_strlen($str)
58 * 879: function utf8_strpos($haystack,$needle,$offset=0)
59 * 902: function utf8_strrpos($haystack,$needle)
60 * 921: function utf8_char2byte_pos($str,$pos)
61 * 946: function utf8_byte2char_pos($str,$pos)
62 *
63 * SECTION: EUC String operation functions
64 * 994: function euc_strtrunc($str,$len,$charset)
65 * 1028: function euc_substr($str,$start,$charset,$len=null)
66 * 1055: function euc_strlen($str,$charset)
67 * 1082: function euc_char2byte_pos($str,$pos,$charset)
68 *
69 * TOTAL FUNCTIONS: 24
70 * (This index is automatically created/updated by the extension "extdeveval")
71 *
72 */
73
74
75
76
77
78
79
80
81 /**
82 * Notes on UTF-8
83 *
84 * Functions working on UTF-8 strings:
85 *
86 * - strchr/strstr
87 * - strrchr
88 * - substr_count
89 * - implode/explode/join
90 *
91 * Functions nearly working on UTF-8 strings:
92 *
93 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
94 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
95 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
96 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
97 *
98 * Functions NOT working on UTF-8 strings:
99 *
100 * - str*cmp
101 * - stristr
102 * - stripos
103 * - substr
104 * - strrev
105 * - ereg/eregi
106 * - split/spliti
107 * - preg_*
108 * - ...
109 *
110 */
111 /**
112 * Class for conversion between charsets.
113 *
114 * @author Kasper Skaarhoj <kasper@typo3.com>
115 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
116 * @package TYPO3
117 * @subpackage t3lib
118 */
119 class t3lib_cs {
var $noCharByteVal=127;		// ASCII value used for chars with no equivalent.

	// This is the array where parsed conversion tables are stored (cached)
var $parsedCharsets=array();

	// An array where case folding data will be stored (cached)
var $caseFolding=array();

	// This tells the converter which charsets have two bytes per char:
var $twoByteSets=array(
	'ucs-2'=>1,	// 2-byte Unicode
);

	// This tells the converter which charsets have four bytes per char:
var $fourByteSets=array(
	'ucs-4'=>1,	// 4-byte Unicode
	'utf-32'=>1,	// 4-byte Unicode (limited to the 21-bits of UTF-16)
);

	// This tells the converter which charsets use a scheme like the Extended Unix Code:
var $eucBasedSets=array(
	'gb2312'=>1,	// Chinese, simplified.
	'big5'=>1,	// Chinese, traditional.
	'shift_jis'=>1,	// Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
);

	// see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
	// http://czyborra.com/charsets/iso8859.html
	// Maps alias names (lowercase) to the charset name used internally; consulted by parse_charset().
var $synonyms=array(
	'us' => 'ascii',
	'us-ascii'=> 'ascii',
	'cp819' => 'iso-8859-1',
	'ibm819' => 'iso-8859-1',
	'iso-ir-100' => 'iso-8859-1',
	'iso-ir-109' => 'iso-8859-2',
	'iso-ir-148' => 'iso-8859-9',
	'iso-ir-199' => 'iso-8859-14',
	'iso-ir-203' => 'iso-8859-15',
	'csisolatin1' => 'iso-8859-1',
	'csisolatin2' => 'iso-8859-2',
	'csisolatin3' => 'iso-8859-3',
	'csisolatin5' => 'iso-8859-9',
	'csisolatin8' => 'iso-8859-14',
	'csisolatin9' => 'iso-8859-15',
	'csisolatingreek' => 'iso-8859-7',
	'iso-celtic' => 'iso-8859-14',
	'latin1' => 'iso-8859-1',
	'latin2' => 'iso-8859-2',
	'latin3' => 'iso-8859-3',
	'latin5' => 'iso-8859-9',
	'latin6' => 'iso-8859-10',
	'latin8' => 'iso-8859-14',
	'latin9' => 'iso-8859-15',
	'l1' => 'iso-8859-1',
	'l2' => 'iso-8859-2',
	'l3' => 'iso-8859-3',
	'l5' => 'iso-8859-9',
	'l6' => 'iso-8859-10',
	'l8' => 'iso-8859-14',
	'l9' => 'iso-8859-15',
	'cyrillic' => 'iso-8859-5',
	'arabic' => 'iso-8859-6',
	'win874' => 'windows-874',
	'win1250' => 'windows-1250',
	'win1251' => 'windows-1251',
	'win1252' => 'windows-1252',
	'win1253' => 'windows-1253',
	'win1254' => 'windows-1254',
	'win1255' => 'windows-1255',
	'win1256' => 'windows-1256',
	'win1257' => 'windows-1257',
	'win1258' => 'windows-1258',
	'cp1250' => 'windows-1250',
	'cp1252' => 'windows-1252',
	'ms-ee' => 'windows-1250',
	'ms-ansi' => 'windows-1252',
	'ms-greek' => 'windows-1253',
	'ms-turk' => 'windows-1254',
	'winbaltrim' => 'windows-1257',
	'koi-8ru' => 'koi-8r',
	'koi8r' => 'koi-8r',
	'cp878' => 'koi-8r',
	'mac' => 'macRoman',
	'macintosh' => 'macRoman',
	'euc-cn' => 'gb2312',
	'x-euc-cn' => 'gb2312',
	'cp936' => 'gb2312',
	'big-5' => 'big5',
	'cp950' => 'big5',
	'sjis' => 'shift_jis',
	'shift-jis' => 'shift_jis',
	'cp932' => 'shift_jis',
	'utf7' => 'utf-7',
	'utf8' => 'utf-8',
	'utf16' => 'utf-16',
	'utf32' => 'utf-32',
	'ucs2' => 'ucs-2',
	'ucs4' => 'ucs-4',
);

	// TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
	// Empty values means "iso-8859-1"
var $charSetArray = array(
	'dk' => '',
	'de' => '',
	'no' => '',
	'it' => '',
	'fr' => '',
	'es' => '',
	'nl' => '',
	'cz' => 'windows-1250',
	'pl' => 'iso-8859-2',
	'si' => 'windows-1250',
	'fi' => '',
	'tr' => 'iso-8859-9',
	'se' => '',
	'pt' => '',
	'ru' => 'windows-1251',
	'ro' => 'iso-8859-2',
	'ch' => 'gb2312',
	'sk' => 'windows-1250',
	'lt' => 'windows-1257',
	'is' => 'utf-8',
	'hr' => 'windows-1250',
	'hu' => 'iso-8859-2',
	'gl' => '',
	'th' => 'iso-8859-11',
	'gr' => 'iso-8859-7',
	'hk' => 'big5',
	'eu' => '',
	'bg' => 'windows-1251',
	'br' => '',
	'et' => 'iso-8859-4',
	'ar' => 'iso-8859-6',
	'he' => 'utf-8',
	'ua' => 'windows-1251',
);
258
259 /**
260 * Normalize - changes input character set to lowercase letters.
261 *
262 * @param string Input charset
263 * @return string Normalized charset
264 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
265 */
function parse_charset($charset)	{
		// Charset names are compared in lowercase, so fold the input first.
	$normalized = strtolower($charset);

		// A known alias is replaced by the name registered in the synonym table; anything else passes through.
	return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
}
272
273
274 /**
275 * Convert from one charset to another charset.
276 *
277 * @param string Input string
278 * @param string From charset (the current charset of the string)
279 * @param string To charset (the output charset wanted)
280 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
281 * @return string Converted string
282 */
function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)	{
	global $TYPO3_CONF_VARS;

		// Nothing to do when source and target charset are the same.
	if ($fromCS==$toCS)	return $str;

		// iconv, recode and mbstring don't support fallback to SGML entities, so they are only tried when no entity fallback is requested.
	if (!$useEntityForNoChar)	{
		$convMethod = isset($TYPO3_CONF_VARS['SYS']['t3lib_cs_convMethod']) ? (string)$TYPO3_CONF_VARS['SYS']['t3lib_cs_convMethod'] : '';
		$conv_str = false;

			// The function_exists() checks let a configured but unavailable extension fall through to the TYPO3 conversion instead of causing a fatal error.
		if ($convMethod === 'iconv' && function_exists('iconv'))	{
				// Argument order of iconv() is: from-charset, to-charset, string.
			$conv_str = iconv($fromCS,$toCS.'//TRANSLIT',$str);
		} elseif ($convMethod === 'recode' && function_exists('recode_string'))	{
				// A recode request has the form "BEFORE..AFTER", i.e. source charset first.
			$conv_str = recode_string($fromCS.'..'.$toCS,$str);
		} elseif ($convMethod === 'mbstring' && function_exists('mb_convert_encoding'))	{
				// mb_convert_encoding() takes the target charset before the source charset; it returns false for unsupported charsets.
			$conv_str = mb_convert_encoding($str,$toCS,$fromCS);
		}

		if (false !== $conv_str)	return $conv_str;
			// otherwise: fallback to TYPO3 conversion
	}

		// TYPO3 conversion: always go via UTF-8 as the intermediate charset.
	if ($fromCS!='utf-8')	$str=$this->utf8_encode($str,$fromCS);
	if ($toCS!='utf-8')	$str=$this->utf8_decode($str,$toCS,$useEntityForNoChar);
	return $str;
}
308
309
310 /**
311 * Converts $str from $charset to UTF-8
312 *
313 * @param string String in local charset to convert to UTF-8
314 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
315 * @return string Output string, converted to UTF-8
316 */
function utf8_encode($str,$charset)	{

		// Parse the conversion table if not already done. If no table is available for $charset, no value is returned.
	if ($this->initCharset($charset))	{
		$strLen = strlen($str);
		$outStr = '';
		$noChar = chr($this->noCharByteVal);
		$isTwoByteSet = !empty($this->twoByteSets[$charset]);
		$isEucBasedSet = !empty($this->eucBasedSets[$charset]);

		for ($a=0;$a<$strLen;$a++)	{	// Traverse each char in string.
			$chr = substr($str,$a,1);
			$ord = ord($chr);

			if ($isTwoByteSet)	{	// If the charset has two bytes per char
					// Combine both bytes into one 16 bit number (big endian assumed). The bytes must be OR'ed together; AND'ing them always yields 0.
				$a++;
				$ord2 = ord(substr($str,$a,1));
				$ord = ($ord << 8) | $ord2;
			} elseif ($ord>127)	{	// A byte above 127 is either a single-byte char of the charset or the first byte of a double-byte char
					// EUC based charsets use two bytes above 127. Shift-JIS is like EUC, except that bytes between 160 and 223 are single byte chars.
				if ($isEucBasedSet && ($charset != 'shift_jis' || $ord < 160 || $ord > 223))	{
					$a++;
					$ord2 = ord(substr($str,$a,1));
					$ord = $ord*256+$ord2;
				}
			} else {	// ... otherwise it's just ASCII 0-127 and one byte. Transparent
				$outStr.= $chr;
				continue;
			}

				// If the local char-number is found in the parsed conversion table we use that, otherwise the "no char" byte.
			if (isset($this->parsedCharsets[$charset]['local'][$ord]))	{
				$outStr.= $this->parsedCharsets[$charset]['local'][$ord];
			} else {
				$outStr.= $noChar;
			}
		}
		return $outStr;
	}
}
355
356 /**
357 * Converts $str from UTF-8 to $charset
358 *
359 * @param string String in UTF-8 to convert to local charset
360 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
361 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
362 * @return string Output string, converted to local charset
363 */
function utf8_decode($str,$charset,$useEntityForNoChar=0)	{
		// Without a conversion table for $charset nothing is converted and no value is returned.
	if (!$this->initCharset($charset))	return;

	$length = strlen($str);
	$converted = '';
	$pos = 0;	// Index of the next unread byte

	while ($pos < $length)	{
		$byte = substr($str,$pos,1);
		$code = ord($byte);
		$pos++;

			// ASCII 0-127 is one byte and passed through transparently.
		if ($code <= 127)	{
			$converted.= $byte;
			continue;
		}

			// A first byte must have bit 7 (value 64) set; if not, we are in the MIDDLE of a multibyte sequence.
		if (!($code & 64))	{
			$converted.= chr($this->noCharByteVal);
			continue;
		}

			// Every consecutive high bit of the first byte (starting at value 64) announces one more byte in the sequence.
			// The bytes are collected one by one; an earlier attempt to cut the whole sequence at once failed for russian UTF-8 converted to windows-1251.
		$sequence = $byte;
		for ($mask=64; $mask>0 && ($code & $mask); $mask>>=1)	{
			$sequence.= substr($str,$pos,1);
			$pos++;
		}

		if (isset($this->parsedCharsets[$charset]['utf8'][$sequence]))	{	// The UTF-8 sequence has a local number:
			$localCode = $this->parsedCharsets[$charset]['utf8'][$sequence];
				// Local numbers above 255 are split into two chars (16 bit word assumed).
			$converted.= ($localCode > 255) ? chr(($localCode >> 8) & 255).chr($localCode & 255) : chr($localCode);
		} elseif ($useEntityForNoChar)	{	// Create num entity:
			$converted.= '&#'.$this->utf8CharToUnumber($sequence,1).';';
		} else {	// No char exists
			$converted.= chr($this->noCharByteVal);
		}
	}

	return $converted;
}
405
406 /**
407 * Converts all chars > 127 to numeric entities.
408 *
409 * @param string Input string
410 * @return string Output string
411 */
function utf8_to_entities($str)	{
	$length = strlen($str);
	$result = '';
	$pos = 0;	// Index of the next unread byte

	while ($pos < $length)	{
		$byte = substr($str,$pos,1);
		$code = ord($byte);
		$pos++;

		if ($code <= 127)	{
				// ASCII 0-127: one byte, passed through transparently.
			$result.= $byte;
		} elseif (!($code & 64))	{
				// Bit 7 (value 64) not set: not a valid first byte, so we are in the MIDDLE of a multibyte sequence.
			$result.= chr($this->noCharByteVal);
		} else {
				// Every consecutive high bit of the first byte (starting at value 64) announces one more byte in the sequence.
			$sequence = $byte;
			for ($mask=64; $mask>0 && ($code & $mask); $mask>>=1)	{
				$sequence.= substr($str,$pos,1);
				$pos++;
			}
				// utf8CharToUnumber() with the hex flag returns "x..." so this becomes a hexadecimal entity.
			$result.= '&#'.$this->utf8CharToUnumber($sequence,1).';';
		}
	}

	return $result;
}
437
438 /**
439 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
440 *
441 * @param string Input string, UTF-8
442 * @param boolean If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
443 * @return string Output string
444 */
function entities_to_utf8($str,$alsoStdHtmlEnt=0)	{
		// Lookup table "entity => character" for named entities, only needed on request.
		// NOTE(review): the characters of this table are passed through utf8_encode(...,'iso-8859-1') below, i.e. the table is assumed to be in iso-8859-1 - confirm for the PHP version in use.
	$trans_tbl = array();
	if ($alsoStdHtmlEnt)	{
		$trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES));
	}

		// Split the string around all "&...;" tokens. Due to the captured delimiter the odd indexes of $parts hold the entity names (without "&" and ";"), the even indexes the text in between.
	$parts = preg_split('/&([#[:alnum:]]*);/',$str,-1,PREG_SPLIT_DELIM_CAPTURE);
	if (!is_array($parts))	return $str;	// Splitting failed; leave input untouched

	foreach($parts as $k => $v)	{
		if ($k%2)	{
			$number = substr($v,1);
			if (substr($v,0,1)=='#' && preg_match('/^[xX][0-9a-fA-F]+$/',$number))	{	// Hex entity ("x" in either case):
				$parts[$k] = $this->UnumberToChar(hexdec(substr($number,1)));
			} elseif (substr($v,0,1)=='#' && preg_match('/^[0-9]+$/',$number))	{	// Dec entity:
					// "+0" turns the digit string into a number
				$parts[$k] = $this->UnumberToChar($number+0);
			} elseif (substr($v,0,1)!='#' && $alsoStdHtmlEnt && isset($trans_tbl['&'.$v.';']))	{	// Other entities:
				$parts[$k] = $this->utf8_encode($trans_tbl['&'.$v.';'],'iso-8859-1');
			} else {	// No conversion (unknown or malformed entity): restore the original token
				$parts[$k] = '&'.$v.';';
			}
		}
	}

	return implode('',$parts);
}
470
471 /**
472 * Converts all chars in the input UTF-8 string into integer numbers returned in an array
473 *
474 * @param string Input string, UTF-8
475 * @param boolean If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
476 * @param boolean If set, then instead of integer numbers the real UTF-8 char is returned.
477 * @return array Output array with the char numbers
478 */
function utf8_to_numberarray($str,$convEntities=0,$retChar=0)	{
		// If entities must be registered as well, they are turned into real UTF-8 chars first:
	if ($convEntities)	{
		$str = $this->entities_to_utf8($str,1);
	}

	$length = strlen($str);
	$result = array();
	$pos = 0;	// Index of the next unread byte

	while ($pos < $length)	{
		$byte = substr($str,$pos,1);
		$code = ord($byte);
		$pos++;

		if ($code <= 127)	{
				// ASCII 0-127: one byte, used as it is.
			$result[] = $retChar ? $byte : $code;
		} elseif (!($code & 64))	{
				// Bit 7 (value 64) not set: not a valid first byte, so we are in the MIDDLE of a multibyte sequence.
			$result[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
		} else {
				// Every consecutive high bit of the first byte (starting at value 64) announces one more byte in the sequence.
			$sequence = $byte;
			for ($mask=64; $mask>0 && ($code & $mask); $mask>>=1)	{
				$sequence.= substr($str,$pos,1);
				$pos++;
			}
			$result[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
		}
	}

	return $result;
}
509
510 /**
511 * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
512 * This function is automatically called by the conversion functions
513 *
514 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
515 *
516 * @param string The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
517 * @return integer Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed.
518 * @access private
519 */
function initCharset($charset)	{
		// Only process if the charset is not yet loaded:
	if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset]))	return 1;

		// Conversion table filename:
	$charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';

		// Give up if the conversion table is not found:
	if (!($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile)))	return false;

		// Cache file for charsets:
		// Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/charset_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile))	{
		$cached = unserialize(t3lib_div::getUrl($cacheFile));
			// Only trust the cache if it really holds a table; otherwise the table is parsed (and cached) again below.
		if (is_array($cached))	{
			$this->parsedCharsets[$charset] = $cached;
			return 2;
		}
	}

		// Parse conversion table into lines:
	$lines = t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1);
		// Initialize the internal variable holding the conv. table:
	$this->parsedCharsets[$charset] = array('local'=>array(),'utf8'=>array());

		// The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
	$whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
	$detectedType = '';

	foreach($lines as $value)	{
			// Comment lines or blanks are ignored.
		if (!trim($value) || substr($value,0,1)=='#')	continue;

			// Detect type if not done yet: (Done on first real line)
		if (!$detectedType)	$detectedType = preg_match($whitespacedPattern,$value) ? 'whitespaced' : 'ms-token';

		$hexbyte = '';
		$utf8 = '';
		if ($detectedType=='ms-token')	{
			$tokens = preg_split('/[=:]/',$value,3);
			$hexbyte = $tokens[0];
			$utf8 = isset($tokens[1]) ? $tokens[1] : '';
		} else {
			$regA = array();
			if (preg_match($whitespacedPattern,$value,$regA))	{
				$hexbyte = $regA[1];
				$utf8 = 'U+'.$regA[2];
			}
		}

			// Only local numbers above 127 are registered; a line that could not be parsed yields 0 here and is skipped.
		$decval = hexdec(trim($hexbyte));
		if ($decval>127)	{
			$utf8decval = hexdec(substr(trim($utf8),2));	// Strip the "U+" prefix
			$utf8Char = $this->UnumberToChar($utf8decval);
			$this->parsedCharsets[$charset]['local'][$decval] = $utf8Char;
			$this->parsedCharsets[$charset]['utf8'][$utf8Char] = $decval;
		}
	}

	if ($cacheFile)	{
		t3lib_div::writeFile($cacheFile,serialize($this->parsedCharsets[$charset]));
	}
	return 2;
}
572
573 /**
574 * Converts a UNICODE number to a UTF-8 multibyte character
575 * Algorithm based on script found at From: http://czyborra.com/utf/
576 *
577 * The binary representation of the character's integer value is thus simply spread across the bytes and the number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
578 *
579 * bytes | bits | representation
580 * 1 | 7 | 0vvvvvvv
581 * 2 | 11 | 110vvvvv 10vvvvvv
582 * 3 | 16 | 1110vvvv 10vvvvvv 10vvvvvv
583 * 4 | 21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
584 * 5 | 26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
585 * 6 | 31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
586 *
587 * @param integer UNICODE integer
588 * @return string UTF-8 multibyte character string
589 * @see utf8CharToUnumber()
590 */
function UnumberToChar($cbyte)	{
		// 7 bit values are represented by a single byte.
	if ($cbyte < 0x80)	{
		return chr($cbyte);
	}

		// Exclusive upper limit of the numbers that fit into a sequence of 2 to 6 bytes, and the marker bits of the matching lead byte.
	$limits = array(2 => 0x800, 3 => 0x10000, 4 => 0x200000, 5 => 0x4000000, 6 => 0x80000000);
	$leadMarkers = array(2 => 0xC0, 3 => 0xE0, 4 => 0xF0, 5 => 0xF8, 6 => 0xFC);

	foreach ($limits as $numBytes => $limit)	{
		if ($cbyte < $limit)	{
				// The lead byte takes the topmost bits, each following byte carries the next 6 bits behind a "10" marker.
			$shift = 6*($numBytes-1);
			$sequence = chr($leadMarkers[$numBytes] | ($cbyte >> $shift));
			while ($shift > 0)	{
				$shift-= 6;
				$sequence.= chr(0x80 | (($cbyte >> $shift) & 0x3F));
			}
			return $sequence;
		}
	}

		// Cannot express a 32-bit character in UTF-8
	return chr($this->noCharByteVal);
}
626
627 /**
628 * Converts a UTF-8 Multibyte character to a UNICODE number
629 *
630 * @param string UTF-8 multibyte character string
631 * @param boolean If set, then a hex. number is returned.
632 * @return integer UNICODE integer
633 * @see UnumberToChar()
634 */
function utf8CharToUnumber($str,$hex=0)	{
	$leadByte = ord(substr($str,0,1));	// First char

	if (($leadByte & 192) != 192)	{
			// Not the first byte of a multibyte sequence: the byte value itself is the number.
		$int = $leadByte;
	} else {
			// Every consecutive high bit of the first byte (starting at value 64) announces one following byte, which contributes its lower 6 bits.
		$payloadBits = '';
		$followers = 0;
		for ($mask=64; $mask>0 && ($leadByte & $mask); $mask>>=1)	{
			$followers++;
			$payloadBits.= sprintf('%06b',ord(substr($str,$followers,1)) & 63);
		}
			// The first byte contributes the bits that are left below its length marker.
		$leadBits = substr('00000000'.decbin($leadByte),-(6-$followers));

		$int = bindec($leadBits.$payloadBits);
	}

	return $hex ? 'x'.dechex($int) : $int;
}
653
654 /**
655 * This function initializes the UTF-8 case folding table.
656 *
657 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
658 *
659 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
660 * @access private
661 */
function initCaseFoldingUTF8()	{
		// Only process if the case table is not yet loaded:
	if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8']))	return 1;

		// Use cached version if possible
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cscase_utf-8.tbl');
	if ($cacheFile && @is_file($cacheFile))	{
		$cached = unserialize(t3lib_div::getUrl($cacheFile));
			// Only trust the cache if it really holds a table; otherwise the table is parsed (and cached) again below.
		if (is_array($cached))	{
			$this->caseFolding['utf-8'] = $cached;
			return 2;
		}
	}

		// process main Unicode data file
	$unicodeDataFile = PATH_t3lib.'unidata/UnicodeData.txt';
	if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile)))	return false;

	$fh = fopen($unicodeDataFile,'r');
	if (!$fh)	return false;

		// key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
		// note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
	$this->caseFolding['utf-8'] = array();
	$utf8CaseFolding =& $this->caseFolding['utf-8'];	// a shorthand
	$utf8CaseFolding['toUpper'] = array();
	$utf8CaseFolding['toLower'] = array();
	$utf8CaseFolding['toTitle'] = array();

	while (!feof($fh))	{
		$line = fgets($fh);
			// fgets() returns false at the end of the file; such reads and blank lines carry no data.
		if ($line === false || trim($line) == '')	continue;

			// The line has also other info like character class (digit, white space, etc.) and more.
			// Field 0 is the code point, fields 12, 13 and 14 are read as the upper, lower and title case mappings.
		$fields = explode(';',rtrim($line));
		$upper = isset($fields[12]) ? $fields[12] : '';
		$lower = isset($fields[13]) ? $fields[13] : '';
		$title = isset($fields[14]) ? $fields[14] : '';

		$char = $this->UnumberToChar(hexdec($fields[0]));
		if ($upper)	$utf8CaseFolding['toUpper'][$char] = $this->UnumberToChar(hexdec($upper));
		if ($lower)	$utf8CaseFolding['toLower'][$char] = $this->UnumberToChar(hexdec($lower));
			// store "title" only when different from "upper" (only a few)
		if ($title && $title !== $upper)	$utf8CaseFolding['toTitle'][$char] = $this->UnumberToChar(hexdec($title));
	}
	fclose($fh);

		// process additional Unicode data for casing (allow folded characters to expand into a sequence)
	$specialCasingFile = PATH_t3lib.'unidata/SpecialCasing.txt';
	if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile))	{

		$fh = fopen($specialCasingFile,'r');
		if ($fh)	{
			while (!feof($fh))	{
				$line = fgets($fh);
					// Skip failed reads, blank lines and comment lines.
				if ($line === false || trim($line) == '' || substr($line,0,1) == '#')	continue;

					// Fields as read here: code point; lower; title; upper; then either a condition or the trailing "#" comment.
				$fields = t3lib_div::trimExplode(';',$line);
				$char = $fields[0];
				$lower = isset($fields[1]) ? $fields[1] : '';
				$title = isset($fields[2]) ? $fields[2] : '';
				$upper = isset($fields[3]) ? $fields[3] : '';
				$cond = isset($fields[4]) ? $fields[4] : '';

					// Mappings bound to a condition are ignored; only unconditional ones are used.
				if ($cond != '' && substr($cond,0,1) != '#')	continue;

					// Collect the mappings that differ from the character itself. Strict comparison, since hex strings like "01E0" would otherwise be compared as numbers.
				$mappings = array();
				if ($char !== $lower)	$mappings['toLower'] = $lower;
				if ($char !== $title && $title !== $upper)	$mappings['toTitle'] = $title;
				if ($char !== $upper)	$mappings['toUpper'] = $upper;

				$utf8_char = $this->UnumberToChar(hexdec($char));
				foreach ($mappings as $case => $codepoints)	{
						// A mapping may be a space separated sequence of code points
					$sequence = '';
					foreach (explode(' ',$codepoints) as $codepoint)	{
						if ($codepoint !== '')	$sequence.= $this->UnumberToChar(hexdec($codepoint));
					}
					$utf8CaseFolding[$case][$utf8_char] = $sequence;
				}
			}
			fclose($fh);
		}
	}

	if ($cacheFile)	{
		t3lib_div::writeFile($cacheFile,serialize($utf8CaseFolding));
	}

	return 3;
}
741
742 /**
743 * This function initializes the folding table for a charset other than UTF-8.
744 * This function is automatically called by the case folding functions.
745 *
746 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
747 * @access private
748 */
function initCaseFolding($charset)	{
		// Only process if the case table is not yet loaded:
	if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset]))	return 1;

		// Use cached version if possible
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cscase_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile))	{
		$cached = unserialize(t3lib_div::getUrl($cacheFile));
			// Only trust the cache if it really holds a table; otherwise the table is built (and cached) again below.
		if (is_array($cached))	{
			$this->caseFolding[$charset] = $cached;
			return 2;
		}
	}

		// init UTF-8 conversion for this charset
	if (!$this->initCharset($charset))	{
		return false;
	}

		// UTF-8 case folding is used as the base conversion table
	if (!$this->initCaseFoldingUTF8())	{
		return false;
	}

	$this->caseFolding[$charset] = array('toUpper'=>array(),'toLower'=>array(),'toTitle'=>array());
	$caseTypes = array('toUpper','toLower','toTitle');
	$nochar = chr($this->noCharByteVal);

	foreach ($this->parsedCharsets[$charset]['local'] as $ci => $utf8)	{
			// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->conv($utf8,'utf-8',$charset);

		foreach ($caseTypes as $case)	{
				// Characters without a mapping in the UTF-8 table have none in this charset either; skipping them avoids a needless conversion.
			if (!isset($this->caseFolding['utf-8'][$case][$utf8]))	continue;

				// The mapping is only kept if its result exists in the charset.
			$cc = $this->conv($this->caseFolding['utf-8'][$case][$utf8],'utf-8',$charset);
			if ($cc && $cc !== $nochar)	$this->caseFolding[$charset][$case][$c] = $cc;
		}
	}

		// add the ASCII case table
	for ($i=ord('a'); $i<=ord('z'); $i++)	{
		$this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i-32);
	}
	for ($i=ord('A'); $i<=ord('Z'); $i++)	{
		$this->caseFolding[$charset]['toLower'][chr($i)] = chr($i+32);
	}

	if ($cacheFile)	{
		t3lib_div::writeFile($cacheFile,serialize($this->caseFolding[$charset]));
	}

	return 3;
}
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816 /********************************************
817 *
818 * String operation functions
819 *
820 ********************************************/
821
822 /**
823 * Cuts a string short at a given byte length.
824 *
825 * @param string the character set
826 * @param string character string
827 * @param integer the byte length
828 * @return string the shortened string
829 * @see mb_strcut()
830 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
831 */
function strtrunc($charset,$string,$len)	{
	if ($len <= 0)	return '';

	$useMbstring = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) && $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring';

	if ($useMbstring)	{
		return mb_strcut($string,0,$len,$charset);
	} elseif ($charset == 'utf-8')	{
			// The byte length must be handed on, otherwise the helper has nothing to cut at.
		return $this->utf8_strtrunc($string,$len);
	} elseif (!empty($this->eucBasedSets[$charset]))	{
			// euc_strtrunc() takes the length before the charset.
		return $this->euc_strtrunc($string,$len,$charset);
	} elseif (!empty($this->twoByteSets[$charset]))	{
		if ($len % 2)	$len--;		// don't cut at odd positions
	} elseif (!empty($this->fourByteSets[$charset]))	{
		$len-= $len % 4;	// realign to position dividable by four
	}

		// treat everything else as single-byte encoding
	return substr($string,0,$len);
}
850
851 /**
852 * Returns a part of a string.
853 *
854 * @param string the character set
855 * @param string character string
856 * @param int start position (character position)
857 * @param int length (in characters)
858 * @return string the substring
859 * @see substr(), mb_substr()
860 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
861 */
function substr($charset,$string,$start,$len=null)	{
	if ($len===0)	return '';

	$useMbstring = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) && $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring';

	if ($useMbstring)	{
			// cannot omit $len, when specifying charset
		if ($len==null)	{
			$enc = mb_internal_encoding();	// save internal encoding
			mb_internal_encoding($charset);	// work in the charset of the string, not in a fixed one
			$str = mb_substr($string,$start);
			mb_internal_encoding($enc);	// restore internal encoding

			return $str;
		}
		return mb_substr($string,$start,$len,$charset);
	} elseif ($charset == 'utf-8')	{
		return $this->utf8_substr($string,$start,$len);
	} elseif (!empty($this->eucBasedSets[$charset]))	{
		return $this->euc_substr($string,$start,$charset,$len);
	}

		// Fixed-width encodings: character positions are scaled to byte positions. Everything else is treated as single-byte encoding.
	$bytesPerChar = 1;
	if (!empty($this->twoByteSets[$charset]))	{
		$bytesPerChar = 2;
	} elseif (!empty($this->fourByteSets[$charset]))	{
		$bytesPerChar = 4;
	}

		// An omitted length means "until the end of the string". The native substr() must then be called without
		// the length argument, since a null length (or null multiplied by the char width) is taken as 0 before PHP 8.
	if ($len === null)	{
		return substr($string,$start*$bytesPerChar);
	}
	return substr($string,$start*$bytesPerChar,$len*$bytesPerChar);
}
889
/**
 * Truncates a string and pre-/appends a string.
 * A positive length keeps the first characters and appends the crop signifier,
 * a negative length keeps the last characters and prepends it.
 * Strings that are not longer than the requested length are returned unchanged.
 *
 * @param	string		the character set
 * @param	string		character string
 * @param	int		length (in characters)
 * @param	string		crop signifier
 * @return	string		the shortened string
 * @see substr(), mb_strimwidth()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function crop($charset,$string,$len,$crop='') {
	if ($len == 0) return $crop;

		// $i becomes the byte position of the cut
	if ($charset == 'utf-8') {
		$i = $this->utf8_char2byte_pos($string,$len);
	} elseif (!empty($this->eucBasedSets[$charset])) {
		$i = $this->euc_char2byte_pos($string,$len,$charset);
	} elseif ($len > 0) {
		$i = $len;
	} else {
		$i = strlen($string)+$len;
	}

		// $len outside actual string length: nothing would be cut off.
		// Compare positions instead of testing a byte for truthiness, which
		// was off by one and failed for a '0' character.
	if ($i === false || $i <= 0 || $i >= strlen($string)) {
		return $string;
	}

	if ($len > 0) {
		return substr($string,0,$i).$crop;
	}
	return $crop.substr($string,$i);
}
933
/**
 * Returns the length of a string measured in characters of the given charset.
 *
 * @param	string		the character set
 * @param	string		character string
 * @return	integer		the number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strlen($charset,$string) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strlen($string,$charset);
	}
	if ($charset == 'utf-8') {
		return $this->utf8_strlen($string);
	}
	if ($this->eucBasedSets[$charset]) {
		return $this->euc_strlen($string,$charset);
	}

		// remaining charsets have a fixed character width
	$byteCount = strlen($string);
	if ($this->twoByteSets[$charset]) {
		return $byteCount/2;
	}
	if ($this->fourByteSets[$charset]) {
		return $byteCount/4;
	}

		// anything else counts as a single-byte encoding
	return $byteCount;
}
958
/**
 * Translates all characters of a string into their respective case values.
 * Unlike strtolower() and strtoupper() this method is locale independent.
 *
 * Real case folding is language dependent, this method ignores this fact.
 *
 * @param	string		the character set
 * @param	string		character string
 * @param	string		conversion: 'toLower' or 'toUpper'
 * @return	string		the converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper()
 */
function conv_case($charset,$string,$case) {
		// test for the function itself instead of parsing the PHP version
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring' &&
		function_exists('mb_strtolower')) {
		if ($case == 'toLower') {
			return mb_strtolower($string,$charset);
		}
		return mb_strtoupper($string,$charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_conv_case($string,$case);
	} elseif (!empty($this->eucBasedSets[$charset])) {
		return $this->euc_conv_case($string,$case,$charset);
	}

		// treat everything else as single-byte encoding
	if (!$this->initCaseFolding($charset)) return $string;	// do nothing

	$out = '';
	$caseConv =& $this->caseFolding[$charset][$case];
		// iterate by length: a '0' character must not end the loop
	$size = strlen($string);
	for ($i=0; $i<$size; $i++) {
		$c = $string[$i];
		if (!empty($caseConv[$c])) {
			$out .= $caseConv[$c];
		} else {
			$out .= $c;
		}
	}

		// open question from the original author: is a simple strtr() faster
		// for small single-byte tables but slower for large multi-byte tables?

	return $out;
}
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019 /********************************************
1020 *
1021 * Internal UTF-8 string operation functions
1022 *
1023 ********************************************/
1024
/**
 * Truncates a string in UTF-8 short at a given byte length.
 * A multibyte character that would be split by the cut is dropped completely.
 *
 * @param	string		UTF-8 multibyte character string
 * @param	integer		the byte length
 * @return	string		the shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strtrunc($str,$len) {
	if ($len <= 0) return '';
	if ($len >= strlen($str)) return $str;		// nothing to cut

	$i = $len-1;
	if (ord($str[$i]) & 0x80) {	// last kept byte is part of a multibyte sequence
			// walk back over data bytes (10xxxxxx) to the first byte of the sequence
		while ($i > 0 && (ord($str[$i]) & 0xC0) == 0x80) $i--;

		$first = ord($str[$i]);
		if (($first & 0xC0) == 0x80) return substr($str,0,$i);	// malformed: no starting byte found

		for ($bc=0, $mbs=$first; $mbs & 0x80; $mbs = $mbs << 1) $bc++;	// calculate number of bytes
		if ($bc+$i > $len) return substr($str,0,$i);	// sequence would be split: drop it
			// fallthru: multibyte char fits into length
	}
	return substr($str,0,$len);
}
1045
/**
 * Returns a part of a UTF-8 string.
 *
 * @param	string		$str UTF-8 string
 * @param	int		$start start position (character position)
 * @param	int		$len length (in characters); if omitted the rest of the string is returned
 * @return	string		the substring, false if $start lies outside the string
 * @see substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_substr($str,$start,$len=null) {
	$byte_start = $this->utf8_char2byte_pos($str,$start);
	if ($byte_start === false) return false;	// $start outside string length

	$str = substr($str,$byte_start);

		// only a missing length means "up to the end"; a length of 0 must
		// not be mistaken for it
	if ($len === null) return $str;
	if (intval($len) == 0) return '';

	$byte_end = $this->utf8_char2byte_pos($str,$len);
	if ($byte_end === false) {	// $len outside actual string length
		return $str;
	}
	return substr($str,0,$byte_end);
}
1071
/**
 * Counts the number of characters of a string in UTF-8.
 * Every byte that is not a multibyte data byte (10xxxxxx) starts a character.
 *
 * @param	string		UTF-8 multibyte character string
 * @return	int		the number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strlen($str) {
	$n = 0;
		// iterate by byte length: a '0' character must not end the loop
	$size = strlen($str);
	for ($i=0; $i<$size; $i++) {
			// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		if ((ord($str[$i]) & 0xC0) != 0x80) $n++;
	}
	return $n;
}
1091
/**
 * Find position of first occurrence of a string, both arguments are in UTF-8.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 string to search for
 * @param	int		position to start the search
 * @return	int		the character position, false if the needle was not found
 * @see strpos()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strpos($haystack,$needle,$offset=0) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
			// third argument is the offset, the encoding comes fourth
		return mb_strpos($haystack,$needle,$offset,'utf-8');
	}

	$byte_offset = $this->utf8_char2byte_pos($haystack,$offset);
	if ($byte_offset === false) return false;	// offset beyond string length

	$byte_pos = strpos($haystack,$needle,$byte_offset);
	if ($byte_pos === false) return false;	// needle not found

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
1115
/**
 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
 *
 * @param	string		UTF-8 string to search in
 * @param	char		UTF-8 character to search for
 * @return	int		the character position, false if the needle was not found
 * @see strrpos()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strrpos($haystack,$needle) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
			// The position of the encoding argument of mb_strrpos() differs
			// between PHP versions, so switch the internal encoding instead.
		$enc = mb_internal_encoding();		// save internal encoding
		mb_internal_encoding('utf-8');
		$pos = mb_strrpos($haystack,$needle);
		mb_internal_encoding($enc);		// restore internal encoding

		return $pos;
	}

	$byte_pos = strrpos($haystack,$needle);
	if ($byte_pos === false) return false;	// needle not found

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
1135
/**
 * Translates a character position into an 'absolute' byte position.
 * For a positive position the result is the byte offset at which that character starts.
 * For a negative position it is the byte offset at which the last abs($pos) characters start.
 *
 * @param	string		UTF-8 string
 * @param	int		character position (negative values start from the end)
 * @return	int		byte position, false if the position lies outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char2byte_pos($str,$pos) {
	$size = strlen($str);
	$n = 0;				// number of characters found
	$p = abs($pos);		// number of characters wanted

	if ($pos >= 0) {
			// walk forward; every byte that is not a data byte (10xxxxxx) starts a character
		for ($i=0; $i<$size && $n<$p; $i++) {
			if ((ord($str[$i]) & 0xC0) != 0x80) $n++;
		}
			// skip trailing multi-byte data bytes of the last counted character
		while ($i<$size && (ord($str[$i]) & 0xC0) == 0x80) $i++;

		if ($i >= $size) return false;	// offset beyond string length
		return $i;
	}

		// walk backward from the last byte, counting starting bytes
	for ($i=$size-1; $i>=0 && $n<$p; $i--) {
		if ((ord($str[$i]) & 0xC0) != 0x80) $n++;
	}
	if ($i < 0) return false;	// offset beyond string length

		// correct offset: $i is one byte before the starting byte found last
	return $i+1;
}
1175
/**
 * Translates an 'absolute' byte position into a character position.
 *
 * @param	string		UTF-8 string
 * @param	int		byte position
 * @return	int		character position, false if the byte position lies outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_byte2char_pos($str,$pos) {
		// check the range explicitly instead of testing a byte for truthiness,
		// which reported strings starting with '0' as out of range
	if ($pos < 0 || $pos >= strlen($str)) return false;	// offset beyond string length

	$n = 0;	// number of characters
		// count the starting bytes between the given position and byte 1;
		// byte 0 belongs to character 0 and is therefore not counted
	for ($i=$pos; $i>0; $i--) {
			// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		if ((ord($str[$i]) & 0xC0) != 0x80) $n++;
	}

	return $n;
}
1197
/**
 * Translates all characters of an UTF-8 string into their respective case values.
 * Characters without an entry in the case folding table are copied unchanged.
 *
 * @param	string		UTF-8 string
 * @param	string		conversion: 'toLower' or 'toUpper'
 * @return	string		the converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper(), mb_convert_case()
 */
function utf8_conv_case($str,$case) {
	if (!$this->initCaseFoldingUTF8()) return $str;	// do nothing

	$out = '';
	$caseConv =& $this->caseFolding['utf-8'][$case];
		// iterate by byte length: a '0' character must not end the loop
	$size = strlen($str);
	for ($i=0; $i<$size; $i++) {
		$c = ord($str[$i]);
		if (($c & 0xC0) == 0xC0) {	// multi-byte starting byte (11xxxxxx)
			for ($bc=0; $c & 0x80; $c = $c << 1) { $bc++; }	// calculate number of bytes
			$mbc = substr($str,$i,$bc);
			$i += $bc-1;
		} else {
				// single-byte (0xxxxxxx); a stray data byte is copied as is
				// instead of repeating the previous character
			$mbc = $str[$i];
		}

		if (!empty($caseConv[$mbc])) {
			$out .= $caseConv[$mbc];
		} else {
			$out .= $mbc;
		}
	}

	return $out;
}
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250 /********************************************
1251 *
1252 * Internal EUC string operation functions
1253 *
1254 * Extended Unix Code:
1255 * ASCII compatible 7bit single bytes chars
1256 * 8bit two byte chars
1257 *
1258 * Shift-JIS is treated as a special case.
1259 *
1260 ********************************************/
1261
/**
 * Cuts a string in the EUC charset family short at a given byte length.
 * A double-byte character that would be split by the cut is dropped completely.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		the byte length
 * @param	string		the charset
 * @return	string		the shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strtrunc($str,$len,$charset) {
	if ($len <= 0) return '';
	if (strlen($str) <= $len) return $str;	// string not longer than supplied length

	$sjis = ($charset == 'shift_jis');
		// $i < $len < strlen($str), so every access is inside the string
	for ($i=0; $i<$len; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++;	// advance a double-byte char
		} else {
			if ($c >= 0x80) $i++;	// advance a double-byte char
		}
	}

	if ($i > $len) {
		return substr($str,0,$len-1);	// byte $len-1 is a first byte, its second byte would be cut off
	}
	return substr($str,0,$len);
}
1290
/**
 * Returns a part of a string in the EUC charset family.
 *
 * @param	string		EUC multibyte character string
 * @param	int		start position (character position)
 * @param	string		the charset
 * @param	int		length (in characters); if omitted the rest of the string is returned
 * @return	string		the substring, false if the start position lies outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_substr($str,$start,$charset,$len=null) {
	$byte_start = $this->euc_char2byte_pos($str,$start,$charset);
	if ($byte_start === false) return false;	// $start outside string length

	$str = substr($str,$byte_start);

		// only a missing length means "up to the end"; a length of 0 must
		// not be mistaken for it
	if ($len === null) return $str;
	if (intval($len) == 0) return '';

	$byte_end = $this->euc_char2byte_pos($str,$len,$charset);
	if ($byte_end === false) {	// $len outside actual string length
		return $str;
	}
	return substr($str,0,$byte_end);
}
1316
/**
 * Counts the number of characters of a string in the EUC charset family.
 *
 * @param	string		EUC multibyte character string
 * @param	string		the charset
 * @return	int		the number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strlen($str,$charset) {
	$sjis = ($charset == 'shift_jis');
	$n = 0;
		// iterate by byte length: a '0' character must not end the loop
	$size = strlen($str);
	for ($i=0; $i<$size; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++;	// advance a double-byte char
		} else {
			if ($c >= 0x80) $i++;	// advance a double-byte char
		}

		$n++;
	}

	return $n;
}
1343
/**
 * Translates a character position into an 'absolute' byte position.
 * For a positive position the result is the byte offset at which that character starts.
 * For a negative position it is the byte offset at which the last abs($pos) characters start.
 *
 * @param	string		EUC multibyte character string
 * @param	int		character position (negative values start from the end)
 * @param	string		the charset
 * @return	int		byte position, false if the position lies outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char2byte_pos($str,$pos,$charset) {
	$sjis = ($charset == 'shift_jis');
	$size = strlen($str);

	if ($pos >= 0) {
		$n = 0;	// number of characters seen
		for ($i=0; $i<$size && $n<$pos; $i++) {
			$c = ord($str[$i]);
			if ($sjis) {
				if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++;	// advance a double-byte char
			} else {
				if ($c >= 0x80) $i++;	// advance a double-byte char
			}
			$n++;
		}
		if ($i >= $size) return false;	// offset beyond string length

		return $i;
	}

		// Negative position: a first byte can only be recognised reliably when
		// reading forward, so collect the start offset of every character
		// and pick the wanted one counted from the end.
	$starts = array();
	for ($i=0; $i<$size; $i++) {
		$starts[] = $i;
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++;	// advance a double-byte char
		} else {
			if ($c >= 0x80) $i++;	// advance a double-byte char
		}
	}

	$index = count($starts)+$pos;
	if ($index <= 0) return false;	// offset beyond string length

	return $starts[$index];
}
1383
/**
 * Translates all characters of a string in the EUC charset family into their respective case values.
 * Characters without an entry in the case folding table are copied unchanged.
 *
 * @param	string		EUC multibyte character string
 * @param	string		conversion: 'toLower' or 'toUpper'
 * @param	string		the charset
 * @return	string		the converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper(), mb_convert_case()
 */
function euc_conv_case($str,$case,$charset) {
	if (!$this->initCaseFolding($charset)) return $str;	// do nothing

	$sjis = ($charset == 'shift_jis');
	$out = '';
	$caseConv =& $this->caseFolding[$charset][$case];
		// iterate by byte length: a '0' character must not end the loop
	$size = strlen($str);
	for ($i=0; $i<$size; $i++) {
		$mbc = $str[$i];
		$c = ord($mbc);

		if ($sjis) {
			$double = (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0));
		} else {
			$double = ($c >= 0x80);
		}
		if ($double) {	// a double-byte char
			$mbc = substr($str,$i,2);
			$i++;
		}

		if (!empty($caseConv[$mbc])) {
			$out .= $caseConv[$mbc];
		} else {
			$out .= $mbc;
		}
	}

	return $out;
}
1426
1427 }
1428
1429 if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']) {
1430 include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
1431 }
1432 ?>