Added crop()
[Packages/TYPO3.CMS.git] / t3lib / class.t3lib_cs.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2003-2004 Kasper Skaarhoj (kasper@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the Typo3 project. The Typo3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 *
17 * This script is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * This copyright notice MUST APPEAR in all copies of the script!
23 ***************************************************************/
24 /**
25 * Class for conversion between charsets.
26 *
27 * $Id$
28 *
29 * @author Kasper Skaarhoj <kasper@typo3.com>
30 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
31 */
32 /**
33 * [CLASS/FUNCTION INDEX of SCRIPT]
34 *
35 *
36 *
37 * 119: class t3lib_cs
38 * 261: function parse_charset($charset)
39 * 278: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
40 * 312: function utf8_encode($str,$charset)
41 * 359: function utf8_decode($str,$charset,$useEntityForNoChar=0)
42 * 407: function utf8_to_entities($str)
43 * 440: function entities_to_utf8($str,$alsoStdHtmlEnt=0)
44 * 474: function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
45 * 515: function initCharset($charset)
46 * 586: function UnumberToChar($cbyte)
47 * 630: function utf8CharToUnumber($str,$hex=0)
48 *
49 * SECTION: String operation functions
50 * 682: function strtrunc($charset,$string,$len)
51 * 716: function substr($charset,$str,$start,$len=null)
52 * 755: function strlen($charset,$string)
53 *
54 * SECTION: UTF-8 String operation functions
55 * 803: function utf8_strtrunc($str,$len)
56 * 831: function utf8_substr($str,$start,$len=null)
57 * 857: function utf8_strlen($str)
58 * 879: function utf8_strpos($haystack,$needle,$offset=0)
59 * 902: function utf8_strrpos($haystack,$needle)
60 * 921: function utf8_char2byte_pos($str,$pos)
61 * 946: function utf8_byte2char_pos($str,$pos)
62 *
63 * SECTION: EUC String operation functions
64 * 994: function euc_strtrunc($str,$len,$charset)
65 * 1028: function euc_substr($str,$start,$charset,$len=null)
66 * 1055: function euc_strlen($str,$charset)
67 * 1082: function euc_char2byte_pos($str,$pos,$charset)
68 *
69 * TOTAL FUNCTIONS: 24
70 * (This index is automatically created/updated by the extension "extdeveval")
71 *
72 */
73
74
75
76
77
78
79
80
81 /**
82 * Notes on UTF-8
83 *
84 * Functions working on UTF-8 strings:
85 *
86 * - strchr/strstr
87 * - strrchr
88 * - substr_count
89 * - implode/explode/join
90 *
91 * Functions nearly working on UTF-8 strings:
92 *
93 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
94 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
95 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
96 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
97 *
98 * Functions NOT working on UTF-8 strings:
99 *
100 * - str*cmp
101 * - stristr
102 * - stripos
103 * - substr
104 * - strrev
105 * - ereg/eregi
106 * - split/spliti
107 * - preg_*
108 * - ...
109 *
110 */
111 /**
112 * Class for conversion between charsets.
113 *
114 * @author Kasper Skaarhoj <kasper@typo3.com>
115 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
116 * @package TYPO3
117 * @subpackage t3lib
118 */
119 class t3lib_cs {
var $noCharByteVal=127;		// ASCII value output for chars with no equivalent.

	// This is the array where parsed conversion tables are stored (cached)
var $parsedCharsets=array();

	// An array where case folding data will be stored (cached)
var $caseFolding=array();

	// This tells the converter which charsets have two bytes per char:
var $twoByteSets=array(
	'ucs-2'=>1,	// 2-byte Unicode
);

	// This tells the converter which charsets have four bytes per char:
var $fourByteSets=array(
	'ucs-4'=>1,	// 4-byte Unicode
	'utf-32'=>1,	// 4-byte Unicode (limited to the 21-bits of UTF-16)
);

	// This tells the converter which charsets use a scheme like the Extended Unix Code:
var $eucBasedSets=array(
	'gb2312'=>1,	// Chinese, simplified.
	'big5'=>1,	// Chinese, traditional.
);

	// Alternative charset names (keys, lowercase) and the name used internally (values).
	// see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
	// http://czyborra.com/charsets/iso8859.html
var $synonyms=array(
	'us' => 'ascii',
	'us-ascii'=> 'ascii',
	'cp819' => 'iso-8859-1',
	'ibm819' => 'iso-8859-1',
	'iso-ir-100' => 'iso-8859-1',
	'iso-ir-109' => 'iso-8859-2',
	'iso-ir-148' => 'iso-8859-9',
	'iso-ir-199' => 'iso-8859-14',
	'iso-ir-203' => 'iso-8859-15',
	'csisolatin1' => 'iso-8859-1',
	'csisolatin2' => 'iso-8859-2',
	'csisolatin3' => 'iso-8859-3',
	'csisolatin5' => 'iso-8859-9',
	'csisolatin8' => 'iso-8859-14',
	'csisolatin9' => 'iso-8859-15',
	'csisolatingreek' => 'iso-8859-7',
	'iso-celtic' => 'iso-8859-14',
	'latin1' => 'iso-8859-1',
	'latin2' => 'iso-8859-2',
	'latin3' => 'iso-8859-3',
	'latin5' => 'iso-8859-9',
	'latin6' => 'iso-8859-10',
	'latin8' => 'iso-8859-14',
	'latin9' => 'iso-8859-15',
	'l1' => 'iso-8859-1',
	'l2' => 'iso-8859-2',
	'l3' => 'iso-8859-3',
	'l5' => 'iso-8859-9',
	'l6' => 'iso-8859-10',
	'l8' => 'iso-8859-14',
	'l9' => 'iso-8859-15',
	'cyrillic' => 'iso-8859-5',
	'arabic' => 'iso-8859-6',
	'win874' => 'windows-874',
	'win1250' => 'windows-1250',
	'win1251' => 'windows-1251',
	'win1252' => 'windows-1252',
	'win1253' => 'windows-1253',
	'win1254' => 'windows-1254',
	'win1255' => 'windows-1255',
	'win1256' => 'windows-1256',
	'win1257' => 'windows-1257',
	'win1258' => 'windows-1258',
	'cp1250' => 'windows-1250',
	'cp1252' => 'windows-1252',
	'ms-ee' => 'windows-1250',
	'ms-ansi' => 'windows-1252',
	'ms-greek' => 'windows-1253',
	'ms-turk' => 'windows-1254',
	'winbaltrim' => 'windows-1257',
	'koi-8ru' => 'koi-8r',
	'koi8r' => 'koi-8r',
	'mac' => 'macRoman',
	'macintosh' => 'macRoman',
	'euc-cn' => 'gb2312',
	'x-euc-cn' => 'gb2312',
	'cp936' => 'gb2312',
	'big-5' => 'big5',
	'cp950' => 'big5',
	'sjis' => 'shift_jis',
	'shift-jis' => 'shift_jis',
	'cp932' => 'shift_jis',
	'utf7' => 'utf-7',
	'utf8' => 'utf-8',
	'utf16' => 'utf-16',
	'utf32' => 'utf-32',
	'ucs2' => 'ucs-2',
	'ucs4' => 'ucs-4',
);

	// TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
	// Empty values mean "iso-8859-1"
var $charSetArray = array(
	'dk' => '',
	'de' => '',
	'no' => '',
	'it' => '',
	'fr' => '',
	'es' => '',
	'nl' => '',
	'cz' => 'windows-1250',
	'pl' => 'iso-8859-2',
	'si' => 'windows-1250',
	'fi' => '',
	'tr' => 'iso-8859-9',
	'se' => '',
	'pt' => '',
	'ru' => 'windows-1251',
	'ro' => 'iso-8859-2',
	'ch' => 'gb2312',
	'sk' => 'windows-1250',
	'lt' => 'windows-1257',
	'is' => 'utf-8',
	'hr' => 'windows-1250',
	'hu' => 'iso-8859-2',
	'gl' => '',
	'th' => 'iso-8859-11',
	'gr' => 'iso-8859-7',
	'hk' => 'big5',
	'eu' => '',
	'bg' => 'windows-1251',
	'br' => '',
	'et' => 'iso-8859-4',
	'ar' => 'iso-8859-6',
	'he' => 'utf-8',
	'ua' => 'windows-1251',
);
256
/**
 * Normalizes a charset name: the name is lowercased and, if the lowercased name
 * is a key in $this->synonyms, replaced by the mapped name.
 *
 * @param	string		Input charset
 * @return	string		Normalized charset
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function parse_charset($charset) {
	$normalized = strtolower($charset);

	if (!isset($this->synonyms[$normalized])) {
		return $normalized;
	}
	return $this->synonyms[$normalized];
}
270
271
/**
 * Convert from one charset to another charset.
 *
 * Unless $useEntityForNoChar is set, the conversion is first attempted with the method
 * selected in $TYPO3_CONF_VARS['SYS']['t3lib_cs_convMethod'] ('iconv', 'recode' or 'mbstring').
 * If no method is configured, or the selected one returns FALSE, the table based
 * conversion of this class is used, which goes via UTF-8.
 *
 * @param	string		Input string
 * @param	string		From charset (the current charset of the string)
 * @param	string		To charset (the output charset wanted)
 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return	string		Converted string
 */
function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) {
	global $TYPO3_CONF_VARS;

	if ($fromCS==$toCS) return $str;

	if (!$useEntityForNoChar) {	// iconv and recode don't support fallback to SGML entities
		$method = isset($TYPO3_CONF_VARS['SYS']['t3lib_cs_convMethod']) ? $TYPO3_CONF_VARS['SYS']['t3lib_cs_convMethod'] : '';

		if ($method == 'iconv') {
				// iconv() expects (input charset, output charset, string) - the string goes last
			$conv_str = iconv($fromCS,$toCS.'//TRANSLIT',$str);
			if (false !== $conv_str) return $conv_str;
		} elseif ($method == 'recode') {
				// A recode request reads "before..after", i.e. source charset first
			$conv_str = recode_string($fromCS.'..'.$toCS,$str);
			if (false !== $conv_str) return $conv_str;
		} elseif ($method == 'mbstring') {
			$conv_str = mb_convert_encoding($str,$toCS,$fromCS);
			if (false !== $conv_str) return $conv_str;	// returns false for unsupported charsets
		}
			// fallback to TYPO3 conversion
	}

	if ($fromCS!='utf-8') $str=$this->utf8_encode($str,$fromCS);
	if ($toCS!='utf-8') $str=$this->utf8_decode($str,$toCS,$useEntityForNoChar);
	return $str;
}
306
307
/**
 * Converts $str from $charset to UTF-8
 *
 * The conversion table for $charset is loaded through initCharset(). If that fails
 * the function has no return statement to reach, so the result is NULL.
 * Bytes/byte pairs without an entry in the table are output as chr($this->noCharByteVal).
 *
 * @param	string		String in local charset to convert to UTF-8
 * @param	string		Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return	string		Output string, converted to UTF-8
 */
function utf8_encode($str,$charset) {

		// Charset is case-insensitive.
	if ($this->initCharset($charset)) {	// Parse conv. table if not already...
		$strLen = strlen($str);
		$outStr = '';
		$noChar = chr($this->noCharByteVal);
		$isTwoByte = !empty($this->twoByteSets[$charset]);
		$isEuc = !empty($this->eucBasedSets[$charset]);

		for ($a=0;$a<$strLen;$a++) {	// Traverse each char in string.
			$chr = substr($str,$a,1);
			$ord = ord($chr);

			if ($isTwoByte) {	// If the charset has two bytes per char
				$ord2 = ord(substr($str,$a+1,1));
					// Combine high and low byte (assume big endian). This must be a bitwise OR:
					// AND-ing the shifted high byte with the low byte always yields 0.
				$ord = ($ord<<8) | $ord2;

				$outStr.= isset($this->parsedCharsets[$charset]['local'][$ord]) ? $this->parsedCharsets[$charset]['local'][$ord] : $noChar;
				$a++;
			} elseif ($ord>127) {	// Bytes above 127 have to be looked up in the conversion table
					// EUC uses two bytes above 127. Shift-JIS is like EUC, but chars between 160 and 223 are single byte.
					// In both cases we get the second byte, advance the pointer and make $ord a 16bit int.
				if ($isEuc || ($charset == 'shift_jis' && ($ord<160 || $ord>223))) {
					$a++;
					$ord2 = ord(substr($str,$a,1));
					$ord = $ord*256+$ord2;
				}

				$outStr.= isset($this->parsedCharsets[$charset]['local'][$ord]) ? $this->parsedCharsets[$charset]['local'][$ord] : $noChar;
			} else $outStr.=$chr;	// ... otherwise it's just ASCII 0-127 and one byte. Transparent
		}
		return $outStr;
	}
}
353
/**
 * Converts $str from UTF-8 to $charset
 *
 * Nothing (NULL) is returned if initCharset() does not succeed for $charset.
 *
 * @param	string		String in UTF-8 to convert to local charset
 * @param	string		Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return	string		Output string, converted to local charset
 */
function utf8_decode($str,$charset,$useEntityForNoChar=0) {

		// Make sure the conversion table is parsed; without it there is nothing to return.
	if (!$this->initCharset($charset)) return;

	$result = '';
	$total = strlen($str);
	$pos = 0;

	while ($pos < $total) {	// Walk through the UTF-8 string byte by byte
		$byte = substr($str,$pos,1);
		$code = ord($byte);

		if ($code <= 127) {
				// Plain ASCII passes through unchanged
			$result.= $byte;
		} elseif (!($code & 64)) {
				// 10xxxxxx without a preceding lead byte: we are in the MIDDLE of a sequence
			$result.= chr($this->noCharByteVal);
		} else {
				// Lead byte: every further high bit that is set (starting with bit 7) announces one more byte
			$seq = $byte;
			for ($mask=64; $mask>0 && ($code & $mask); $mask>>=1) {
				$pos++;
				$seq.= substr($str,$pos,1);
			}

			if (isset($this->parsedCharsets[$charset]['utf8'][$seq])) {
				$local = $this->parsedCharsets[$charset]['utf8'][$seq];	// The local number
					// Numbers above 255 are written as two bytes (16bit word assumed)
				$result.= $local>255 ? chr(($local >> 8) & 255).chr($local & 255) : chr($local);
			} elseif ($useEntityForNoChar) {
				$result.= '&#'.$this->utf8CharToUnumber($seq,1).';';	// Create num entity
			} else {
				$result.= chr($this->noCharByteVal);	// No char exists
			}
		}
		$pos++;
	}

	return $result;
}
403
/**
 * Converts all chars > 127 to numeric entities.
 *
 * Each multibyte sequence becomes "&#x[hex];". A byte above 127 that is not a
 * lead byte is replaced by chr($this->noCharByteVal).
 *
 * @param	string		Input string
 * @return	string		Output string
 */
function utf8_to_entities($str) {
	$result = '';
	$total = strlen($str);

	for ($pos=0; $pos<$total; $pos++) {	// Walk through the UTF-8 string byte by byte
		$byte = substr($str,$pos,1);
		$code = ord($byte);

		if ($code <= 127) {	// ASCII is transparent
			$result.= $byte;
			continue;
		}
		if (!($code & 64)) {	// MIDDLE of a multibyte sequence - no char exists
			$result.= chr($this->noCharByteVal);
			continue;
		}

			// Lead byte: every further high bit that is set (starting with bit 7) announces one more byte
		$seq = $byte;
		for ($mask=64; $mask>0 && ($code & $mask); $mask>>=1) {
			$pos++;
			$seq.= substr($str,$pos,1);
		}
		$result.= '&#'.$this->utf8CharToUnumber($seq,1).';';
	}

	return $result;
}
435
/**
 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
 *
 * The string is split with preg_split() on "&name;" tokens, so no placeholder has to be
 * injected into the string and the POSIX regex functions (ereg_*) are not needed.
 *
 * @param	string		Input string, UTF-8
 * @param	boolean		If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
 * @return	string		Output string
 */
function entities_to_utf8($str,$alsoStdHtmlEnt=0) {
	$trans_tbl = array();
	if ($alsoStdHtmlEnt) {
			// NOTE(review): the values are passed through utf8_encode(..., 'iso-8859-1') below, which
			// presumes this table is delivered in ISO-8859-1 - confirm for the PHP version in use.
		$trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES));
	}

		// With PREG_SPLIT_DELIM_CAPTURE the result alternates: even keys hold the text between
		// entities, odd keys hold the entity name (the part between "&" and ";")
	$parts = preg_split('/&([#[:alnum:]]*);/',$str,-1,PREG_SPLIT_DELIM_CAPTURE);
	foreach($parts as $k => $v) {
		if ($k%2) {
			if (substr($v,0,1)=='#') {	// Dec or hex entities:
				if (substr($v,1,1)=='x') {
					$parts[$k] = $this->UnumberToChar(hexdec(substr($v,2)));
				} else {
					$parts[$k] = $this->UnumberToChar(substr($v,1));
				}
			} elseif ($alsoStdHtmlEnt && !empty($trans_tbl['&'.$v.';'])) {	// Other entities:
				$parts[$k] = $this->utf8_encode($trans_tbl['&'.$v.';'],'iso-8859-1');
			} else {	// No conversion:
				$parts[$k] ='&'.$v.';';
			}
		}
	}

	return implode('',$parts);
}
468
/**
 * Converts all chars in the input UTF-8 string into integer numbers returned in an array
 *
 * A byte above 127 that is not a lead byte is reported as $this->noCharByteVal
 * (or as the corresponding character if $retChar is set).
 *
 * @param	string		Input string, UTF-8
 * @param	boolean		If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
 * @param	boolean		If set, then instead of integer numbers the real UTF-8 char is returned.
 * @return	array		Output array with the char numbers
 */
function utf8_to_numberarray($str,$convEntities=0,$retChar=0) {
		// Entities are resolved to real UTF-8 chars first, if requested
	if ($convEntities) {
		$str = $this->entities_to_utf8($str,1);
	}

	$items = array();
	$total = strlen($str);
	$pos = 0;

	while ($pos < $total) {	// Walk through the UTF-8 string byte by byte
		$byte = substr($str,$pos,1);
		$code = ord($byte);

		if ($code <= 127) {
				// ASCII 0-127, one byte
			$items[] = $retChar ? $byte : $code;
		} elseif ($code & 64) {
				// Lead byte: every further high bit that is set (starting with bit 7) announces one more byte
			$seq = $byte;
			for ($mask=64; $mask>0 && ($code & $mask); $mask>>=1) {
				$pos++;
				$seq.= substr($str,$pos,1);
			}
			$items[] = $retChar ? $seq : $this->utf8CharToUnumber($seq);
		} else {
				// MIDDLE of a multibyte sequence - no char exists
			$items[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
		}
		$pos++;
	}

	return $items;
}
507
/**
 * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
 * This function is automatically called by the conversion functions
 *
 * The parsed table is stored in $this->parsedCharsets[$charset] with two sub-arrays:
 * 'local' (local char number => UTF-8 char) and 'utf8' (UTF-8 char => local char number).
 * Only local numbers above 127 are stored.
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param	string		The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return	integer		Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and loaded (from the cache file or by parsing).
 * @access private
 */
function initCharset($charset) {
		// Only process if the charset is not yet loaded:
	if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) return 1;

		// Conversion table filename:
	$charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';

		// Give up if the conversion table is not found:
	if (!$charset || !t3lib_div::validPathStr($charsetConvTableFile) || !@is_file($charsetConvTableFile)) return false;

		// Cache file for charsets:
		// Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/charset_'.$charset.'.tbl');
	$table = false;
	if ($cacheFile && @is_file($cacheFile)) {
		$table = unserialize(t3lib_div::getUrl($cacheFile));
	}

		// No cache file, or its content could not be unserialized into an array: parse the table
	if (!is_array($table)) {
		$table = array('local'=>array(),'utf8'=>array());
		$lines = t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1);

			// The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
		$whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
		$detectedType = '';

		foreach($lines as $value) {
				// Comment line or blanks are ignored.
			if (!trim($value) || substr($value,0,1)=='#') continue;

				// Detect type if not done yet: (Done on first real line)
			if (!$detectedType) $detectedType = preg_match($whitespacedPattern,$value) ? 'whitespaced' : 'ms-token';

			$hexbyte = '';
			$utf8 = '';
			if ($detectedType=='ms-token') {
				$tokens = preg_split('/=|:/',$value,3);
				$hexbyte = $tokens[0];
				$utf8 = isset($tokens[1]) ? $tokens[1] : '';
			} elseif ($detectedType=='whitespaced') {
				$regA = array();
					// A line that does not match carries no mapping
				if (!preg_match($whitespacedPattern,$value,$regA)) continue;
				$hexbyte = $regA[1];
				$utf8 = 'U+'.$regA[2];
			}

			$decval = hexdec(trim($hexbyte));
			if ($decval>127) {
					// Strip the leading "U+" and convert the Unicode number to its UTF-8 char
				$utf8decval = hexdec(substr(trim($utf8),2));
				$utf8Char = $this->UnumberToChar($utf8decval);
				$table['local'][$decval] = $utf8Char;
				$table['utf8'][$utf8Char] = $decval;
			}
		}

		if ($cacheFile) {
			t3lib_div::writeFile($cacheFile,serialize($table));
		}
	}

	$this->parsedCharsets[$charset] = $table;
	return 2;
}
570
/**
 * Converts a UNICODE number to a UTF-8 multibyte character
 * Algorithm based on script found at From: http://czyborra.com/utf/
 *
 * The binary representation of the character's integer value is thus simply spread across the bytes and the number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
 *
 *  bytes | bits | representation
 *      1 |    7 | 0vvvvvvv
 *      2 |   11 | 110vvvvv 10vvvvvv
 *      3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
 *      4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * @param	integer		UNICODE integer
 * @return	string		UTF-8 multibyte character string
 * @see utf8CharToUnumber()
 */
function UnumberToChar($cbyte) {
		// One byte is enough for 7 bits
	if ($cbyte < 0x80) return chr($cbyte);

		// Pick the lead byte marker and the number of trailing bytes from the table above
	if ($cbyte < 0x800) {
		$marker = 0xC0;
		$trailing = 1;
	} elseif ($cbyte < 0x10000) {
		$marker = 0xE0;
		$trailing = 2;
	} elseif ($cbyte < 0x200000) {
		$marker = 0xF0;
		$trailing = 3;
	} elseif ($cbyte < 0x4000000) {
		$marker = 0xF8;
		$trailing = 4;
	} elseif ($cbyte < 0x80000000) {
		$marker = 0xFC;
		$trailing = 5;
	} else {	// Cannot express a 32-bit character in UTF-8
		return chr($this->noCharByteVal);
	}

		// Lead byte carries the topmost bits, each trailing byte the next six
	$char = chr($marker | ($cbyte >> (6*$trailing)));
	for ($shift=6*($trailing-1); $shift>=0; $shift-=6) {
		$char.= chr(0x80 | (($cbyte >> $shift) & 0x3F));
	}

	return $char;
}
624
/**
 * Converts a UTF-8 Multibyte character to a UNICODE number
 *
 * If the first byte is not a lead byte (its two topmost bits are not both set)
 * the byte value itself is used as the number.
 *
 * @param	string		UTF-8 multibyte character string
 * @param	boolean		If set, then a hex. number is returned (as string with "x" prepended).
 * @return	integer		UNICODE integer
 * @see UnumberToChar()
 */
function utf8CharToUnumber($str,$hex=0) {
	$lead = ord(substr($str,0,1));	// First char

	if (($lead & 192) != 192) {
			// Not a multi byte string
		$number = $lead;
	} else {
		$trailBits = '';
		$shifted = $lead;
		$count = 0;	// number of trailing bytes found so far

		while ($count < 8) {
			$shifted = $shifted << 1;
				// As long as bit 8 is set after the shift there is one more byte in the sequence
			if (!($shifted & 128)) break;
				// Every trailing byte contributes its lower six bits
			$trailBits.= substr('00000000'.decbin(ord(substr($str,$count+1,1))),-6);
			$count++;
		}

			// The lead byte contributes the bits that remain after the length marker
		$leadBits = substr('00000000'.decbin($lead),-(6-$count));
		$number = bindec($leadBits.$trailBits);
	}

	if ($hex) {
		return 'x'.dechex($number);
	}
	return $number;
}
651
/**
 * This function initializes the UTF-8 case folding table.
 *
 * The table is read from PATH_t3lib.'unidata/UnicodeData.txt' and, if present,
 * extended with the unconditional mappings of PATH_t3lib.'unidata/SpecialCasing.txt'.
 * The result is stored in $this->caseFolding['utf-8'] and written to a cache file.
 *
 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
 *
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initCaseFoldingUTF8() {
		// Only process if the case table is not yet loaded:
	if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8'])) return 1;

		// Use cached version if possible
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cscase_utf-8.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

		// process main Unicode data file
	$unicodeDataFile = PATH_t3lib.'unidata/UnicodeData.txt';
	if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) return false;

	$fh = fopen($unicodeDataFile,'r');
	if (!$fh) return false;

		// key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
		// note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
	$this->caseFolding['utf-8'] = array();
	$utf8CaseFolding =& $this->caseFolding['utf-8'];	// a shorthand
	$utf8CaseFolding['toUpper'] = array();
	$utf8CaseFolding['toLower'] = array();
	$utf8CaseFolding['toTitle'] = array();

	while (($line = fgets($fh)) !== false) {
			// has also other info like character class (digit, white space, etc.) and more
		$fields = explode(';', rtrim($line));
			// The case mappings are read from fields 12 (upper), 13 (lower) and 14 (title); shorter lines carry none
		if (count($fields) < 15) continue;

		$upper = $fields[12];
		$lower = $fields[13];
		$title = $fields[14];
		$char = $this->UnumberToChar(hexdec($fields[0]));

		if ($upper) $utf8CaseFolding['toUpper'][$char] = $this->UnumberToChar(hexdec($upper));
		if ($lower) $utf8CaseFolding['toLower'][$char] = $this->UnumberToChar(hexdec($lower));
			// store "title" only when different from "upper" (only a few)
		if ($title && $title != $upper) $utf8CaseFolding['toTitle'][$char] = $this->UnumberToChar(hexdec($title));
	}
	fclose($fh);

		// process additional Unicode data for casing (allow folded characters to expand into a sequence)
	$specialCasingFile = PATH_t3lib.'unidata/SpecialCasing.txt';
	if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {

		$fh = fopen($specialCasingFile,'r');
		if ($fh) {
			while (($line = fgets($fh)) !== false) {
					// Skip comment lines and blank lines
				if (substr($line,0,1) == '#' || trim($line) == '') continue;

				list($char,$lower,$title,$upper,$cond) = array_pad(t3lib_div::trimExplode(';', $line),5,'');

					// Only mappings without a condition are used (the 5th field is empty or the trailing comment)
				if ($cond != '' && substr($cond,0,1) != '#') continue;

				$utf8_char = $this->UnumberToChar(hexdec($char));
				$mappings = array('toLower' => $lower, 'toTitle' => $title, 'toUpper' => $upper);
				foreach ($mappings as $mode => $sequence) {
						// Nothing to store if the character maps to itself
					if ($char == $sequence) continue;
						// "title" is only stored when different from "upper"
					if ($mode == 'toTitle' && $title == $upper) continue;

						// The mapping is a space separated list of hex codepoints
					$folded = '';
					foreach (explode(' ',$sequence) as $hexCode) {
						$folded.= $this->UnumberToChar(hexdec($hexCode));
					}
					$utf8CaseFolding[$mode][$utf8_char] = $folded;
				}
			}
			fclose($fh);
		}
	}

	if ($cacheFile) {
		t3lib_div::writeFile($cacheFile,serialize($utf8CaseFolding));
	}

	return 3;
}
739
/**
 * This function initializes the folding table for a charset other than UTF-8.
 * This function is automatically called by the case folding functions.
 *
 * The table is derived from the UTF-8 case folding table: every char of the charset's
 * conversion table is folded in UTF-8 and the result converted back to the charset.
 *
 * @param	string		Charset for which the folding table is built
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initCaseFolding($charset) {
		// Nothing to do if the case table is loaded already:
	if (is_array($this->caseFolding[$charset])) return 1;

		// A cached version is preferred
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cscase_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

		// Both the UTF-8 conversion table of this charset and the UTF-8 case folding
		// (used as the base conversion table) are required
	if (!$this->initCharset($charset) || !$this->initCaseFoldingUTF8()) {
		return false;
	}

	$nochar = chr($this->noCharByteVal);
	$modes = array('toUpper','toLower','toTitle');

	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
			// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->conv($utf8, 'utf-8', $charset);

		foreach ($modes as $mode) {
			$cc = $this->conv($this->caseFolding['utf-8'][$mode][$utf8], 'utf-8', $charset);
				// Keep the mapping only if the folded char exists in the charset
			if ($cc && $cc != $nochar) $this->caseFolding[$charset][$mode][$c] = $cc;
		}
	}

		// add the ASCII case table (upper and lower case letters are 32 apart)
	foreach (range('a','z') as $small) {
		$this->caseFolding[$charset]['toUpper'][$small] = chr(ord($small)-32);
	}
	foreach (range('A','Z') as $capital) {
		$this->caseFolding[$charset]['toLower'][$capital] = chr(ord($capital)+32);
	}

	if ($cacheFile) {
		t3lib_div::writeFile($cacheFile,serialize($this->caseFolding[$charset]));
	}

	return 3;
}
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814 /********************************************
815 *
816 * String operation functions
817 *
818 ********************************************/
819
/**
 * Cuts a string short at a given byte length.
 *
 * With the 'mbstring' setting in $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'] the work is done by mb_strcut().
 * Otherwise UTF-8, Shift-JIS and the EUC based charsets are handed to their own truncation functions,
 * two- and four-byte charsets are cut at a position aligned to the character size and
 * everything else is treated as a single-byte encoding.
 *
 * @param	string		the character set
 * @param	string		character string
 * @param	integer		the byte length
 * @return	string		the shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strtrunc($charset,$string,$len) {
	if ($len <= 0) return '';

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strcut($string,0,$len,$charset);
	} elseif ($charset == 'utf-8') {
			// The byte length must be handed on: utf8_strtrunc($str,$len)
		return $this->utf8_strtrunc($string,$len);
	} elseif ($charset == 'shift_jis') {
			// The charset is the third argument: euc_strtrunc($str,$len,$charset)
		return $this->euc_strtrunc($string,$len,'shift_jis');
	} elseif ($this->eucBasedSets[$charset]) {
		return $this->euc_strtrunc($string,$len,$charset);
	} elseif ($this->twoByteSets[$charset]) {
		if ($len % 2) $len--;		// don't cut at odd positions
	} elseif ($this->fourByteSets[$charset]) {
		$len -= $len % 4;	// realign to position dividable by four
	}

		// treat everything else as single-byte encoding
	return substr($string,0,$len);
}
850
/**
 * Returns a part of a string.
 *
 * With the 'mbstring' setting in $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'] the work is done by mb_substr()
 * using the given charset. Otherwise UTF-8, Shift-JIS and the EUC based charsets are handed to their own
 * substring functions, and all other charsets are cut by bytes with a fixed number of bytes per
 * character (2, 4 or 1). If the length is omitted (NULL) the rest of the string is returned.
 *
 * @param	string		the character set
 * @param	string		character string
 * @param	int		start position (character position)
 * @param	int		length (in characters)
 * @return	string		the substring
 * @see substr(), mb_substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function substr($charset,$string,$start,$len=null) {
	if ($len===0) return '';

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
			// cannot omit $len, when specifying charset
		if ($len==null) {
			$enc = mb_internal_encoding();	// save internal encoding
			mb_internal_encoding($charset);	// work in the charset of the string, not in a fixed one
			$str = mb_substr($string,$start);
			mb_internal_encoding($enc);	// restore internal encoding

			return $str;
		}
		return mb_substr($string,$start,$len,$charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_substr($string,$start,$len);
	} elseif ($charset == 'shift_jis') {
		return $this->euc_substr($string,$start,'shift_jis',$len);
	} elseif ($this->eucBasedSets[$charset]) {
		return $this->euc_substr($string,$start,$charset,$len);
	}

		// Fixed width charsets: character positions are translated into byte positions.
		// Everything that is not a two- or four-byte set is treated as single-byte encoding.
	if ($this->twoByteSets[$charset]) {
		$width = 2;
	} elseif ($this->fourByteSets[$charset]) {
		$width = 4;
	} else {
		$width = 1;
	}

		// An omitted length must not be handed on: NULL*$width is 0, and a NULL length is
		// treated as 0 before PHP 8, so both would give an empty string instead of the rest.
	if ($len === null) {
		return substr($string,$start*$width);
	}
	return substr($string,$start*$width,$len*$width);
}
892
/**
 * Returns the length of a string in characters of the given charset.
 *
 * Uses mbstring when selected in $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'],
 * the internal UTF-8 / EUC helpers for those charsets, and the byte length
 * divided by the character width for fixed two- and four-byte sets.
 *
 * @param	string		the character set
 * @param	string		character string
 * @return	integer		the number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strlen($charset,$string) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strlen($string,$charset);
	}
	if ($charset == 'utf-8') {
		return $this->utf8_strlen($string);
	}
	if ($charset == 'shift_jis') {
		return $this->euc_strlen($string,'shift_jis');
	}
	if ($this->eucBasedSets[$charset]) {
		return $this->euc_strlen($string,$charset);
	}

	// fixed-width encodings: derive the count from the byte length
	$byteCount = strlen($string);
	if ($this->twoByteSets[$charset]) {
		return $byteCount/2;
	}
	if ($this->fourByteSets[$charset]) {
		return $byteCount/4;
	}

	// anything else counts as a single-byte encoding
	return $byteCount;
}
919
/**
 * Translates all characters of a string into their respective case values.
 * Unlike strtolower() and strtoupper() this method is locale independent.
 *
 * Real case folding is language dependent, this method ignores this fact.
 *
 * @param	string		the character set
 * @param	string		character string
 * @param	string		conversion: 'toLower' or 'toUpper'
 * @return	string		the converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper()
 */
function conv_case($charset,$string,$case) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring' &&
			version_compare(phpversion(),'4.3.0','>=')) {
		if ($case == 'toLower') {
			return mb_strtolower($string,$charset);
		} else {
			return mb_strtoupper($string,$charset);
		}
	} elseif ($charset == 'utf-8') {
		return $this->utf8_conv_case($string,$case);
	} elseif ($charset == 'shift_jis') {
		return $this->euc_conv_case($string,$case,'shift_jis');
	} elseif ($this->eucBasedSets[$charset]) {
		return $this->euc_conv_case($string,$case,$charset);
	}

	// treat everything else as single-byte encoding
	if (!$this->initCaseFolding($charset)) return $string;	// do nothing

	$out = '';
	$caseConv =& $this->caseFolding[$charset][$case];
	$size = strlen($string);
	// iterate by length: testing the character itself would stop at a "0"
	for ($i=0; $i<$size; $i++) {
		$c = $string[$i];
		if (!empty($caseConv[$c])) {
			$out .= $caseConv[$c];
		} else {
			$out .= $c;	// no mapping: keep the character
		}
	}

	// is a simple strtr() faster or slower than the code above?
	// perhaps faster for small single-byte tables but slower for large multi-byte tables?
	//
	// return strtr($string,$this->caseFolding[$charset][$case]);

	return $out;
}
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982 /********************************************
983 *
984 * Internal UTF-8 string operation functions
985 *
986 ********************************************/
987
/**
 * Truncates a string in UTF-8 short at a given byte length.
 * A multibyte character that would be split by the cut is dropped entirely.
 *
 * @param	string		UTF-8 multibyte character string
 * @param	integer		the byte length
 * @return	string		the shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strtrunc($str,$len) {
	if ($len <= 0) return '';
	if ($len >= strlen($str)) return $str;	// nothing to cut

	$i = $len-1;
	if (ord($str[$i]) & 0x80) {	// part of a multibyte sequence
		// walk back over continuation bytes (10xxxxxx) to the first byte
		while ($i >= 0 && (ord($str[$i]) & 0xC0) == 0x80) $i--;
		if ($i < 0) return '';	// sanity check: only continuation bytes found

		// the number of leading 1-bits of the first byte is the sequence length
		for ($bc=0, $mbs=ord($str[$i]); $mbs & 0x80; $mbs = $mbs << 1) $bc++;
		if ($bc+$i > $len) return substr($str,0,$i);	// character does not fit
		// fallthru: multibyte char fits into length
	}
	return substr($str,0,$len);
}
1008
/**
 * Returns a part of a UTF-8 string.
 *
 * @param	string		$str UTF-8 string
 * @param	int		$start start position (character position)
 * @param	int		$len length (in characters), null for "up to the end"
 * @return	string		the substring, false if $start is outside the string
 * @see substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_substr($str,$start,$len=null) {
	$byte_start = $this->utf8_char2byte_pos($str,$start);
	if ($byte_start === false) return false;	// $start outside string length

	// a length of zero is a real length, not "omitted"
	if ($len === 0) return '';

	$str = substr($str,$byte_start);
	if ($len === null) return $str;

	$byte_end = $this->utf8_char2byte_pos($str,$len);
	if ($byte_end === false) {	// $len outside actual string length
		return $str;
	}
	return substr($str,0,$byte_end);
}
1034
/**
 * Counts the number of characters of a string in UTF-8.
 * Every byte that is not a continuation byte (10xxxxxx) starts a character.
 *
 * @param	string		UTF-8 multibyte character string
 * @return	int		the number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strlen($str) {
	$n = 0;
	$size = strlen($str);
	// iterate by byte length: testing the byte itself would stop at a "0"
	for ($i=0; $i<$size; $i++) {
		$c = ord($str[$i]);
		if (!($c & 0x80)) {	// single-byte (0xxxxxx)
			$n++;
		} elseif (($c & 0xC0) == 0xC0) {	// multi-byte starting byte (11xxxxxx)
			$n++;
		}
	}
	return $n;
}
1054
/**
 * Find position of first occurrence of a string, both arguments are in UTF-8.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 string to search for
 * @param	int		position to start the search (character position)
 * @return	int		the character position, false if not found
 * @see strpos()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strpos($haystack,$needle,$offset=0) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		// third argument is the offset, the encoding comes fourth
		return mb_strpos($haystack,$needle,$offset,'utf-8');
	}

	$byte_offset = $this->utf8_char2byte_pos($haystack,$offset);
	if ($byte_offset === false) return false;	// offset beyond string length

	$byte_pos = strpos($haystack,$needle,$byte_offset);
	if ($byte_pos === false) return false;	// needle not found

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
1078
/**
 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
 *
 * @param	string		UTF-8 string to search in
 * @param	char		UTF-8 character to search for
 * @return	int		the character position, false if not found
 * @see strrpos()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strrpos($haystack,$needle) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		// The position of the encoding argument of mb_strrpos() differs
		// between PHP versions, so set the internal encoding instead.
		$enc = mb_internal_encoding();	// save internal encoding
		mb_internal_encoding('utf-8');
		$pos = mb_strrpos($haystack,$needle);
		mb_internal_encoding($enc);	// restore internal encoding

		return $pos;
	}

	$byte_pos = strrpos($haystack,$needle);
	if ($byte_pos === false) return false;	// needle not found

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
1098
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * For a position >= 0 the result is the byte offset where that character
 * starts. For a negative position the characters are counted from the end
 * of the string and the result is the byte offset of the character reached.
 *
 * @param	string		UTF-8 string
 * @param	int		character position (negative values start from the end)
 * @return	int		byte position, false if the position is outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char2byte_pos($str,$pos) {
	$n = 0;			// number of characters found
	$p = abs($pos);		// number of characters wanted
	$size = strlen($str);

	if ($pos >= 0) {
		$i = 0;
		$d = 1;
	} else {
		$i = $size-1;
		$d = -1;
	}

	// bounds are checked on the index: testing the byte itself would stop at a "0"
	for ( ; $i>=0 && $i<$size && $n<$p; $i+=$d) {
		$c = ord($str[$i]);
		if (!($c & 0x80)) {	// single-byte (0xxxxxx)
			$n++;
		} elseif (($c & 0xC0) == 0xC0) {	// multi-byte starting byte (11xxxxxx)
			$n++;
		}
	}

	if ($pos >= 0) {
		// skip trailing multi-byte data bytes
		while ($i<$size && (ord($str[$i]) & 0xC0) == 0x80) { $i++; }
		if ($i >= $size) return false;	// offset beyond string length
	} else {
		if ($n < $p) return false;	// string has fewer characters than requested
		// correct offset: the loop stepped one byte past the starting byte
		$i++;
	}

	return $i;
}
1138
/**
 * Translates an 'absolute' byte position into a character position.
 * Counts the character starting bytes between the position and byte 1
 * (byte 0 is not counted).
 *
 * @param	string		UTF-8 string
 * @param	int		byte position
 * @return	int		character position, false if the byte position is outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_byte2char_pos($str,$pos) {
	// validate the requested position itself, not the first byte of the string
	if ($pos < 0 || $pos >= strlen($str)) return false;	// offset beyond string length

	$n = 0;		// number of characters
	for ($i=$pos; $i>0; $i--) {
		$c = ord($str[$i]);
		if (!($c & 0x80)) {	// single-byte (0xxxxxx)
			$n++;
		} elseif (($c & 0xC0) == 0xC0) {	// multi-byte starting byte (11xxxxxx)
			$n++;
		}
	}

	return $n;
}
1160
/**
 * Translates all characters of an UTF-8 string into their respective case values.
 * Characters without an entry in the case folding table are copied unchanged.
 *
 * @param	string		UTF-8 string
 * @param	string		conversion: 'toLower' or 'toUpper'
 * @return	string		the converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper(), mb_convert_case()
 */
function utf8_conv_case($str,$case) {
	if (!$this->initCaseFoldingUTF8()) return $str;	// do nothing

	$out = '';
	$caseConv =& $this->caseFolding['utf-8'][$case];
	$size = strlen($str);
	// iterate by byte length: testing the byte itself would stop at a "0"
	for ($i=0; $i<$size; $i++) {
		$c = ord($str[$i]);
		if (($c & 0xC0) == 0xC0) {	// multi-byte starting byte (11xxxxxx)
			for ($bc=0; $c & 0x80; $c = $c << 1) { $bc++; }	// calculate number of bytes
			$mbc = substr($str,$i,$bc);
			$i += $bc-1;
		} else {
			// single-byte (0xxxxxx), or a stray continuation byte which is
			// passed through instead of repeating the previous character
			$mbc = $str[$i];
		}

		if (!empty($caseConv[$mbc])) {
			$out .= $caseConv[$mbc];
		} else {
			$out .= $mbc;
		}
	}

	return $out;
}
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213 /********************************************
1214 *
1215 * Internal EUC string operation functions
1216 *
1217 * Extended Unix Code:
1218 * ASCII compatible 7bit single bytes chars
1219 * 8bit two byte chars
1220 *
1221 * Shift-JIS is treated as a special case.
1222 *
1223 ********************************************/
1224
/**
 * Cuts a string in the EUC charset family short at a given byte length.
 * A double-byte character that would be split by the cut is dropped entirely.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		the byte length
 * @param	string		the charset
 * @return	string		the shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strtrunc($str,$len,$charset) {
	if ($len <= 0) return '';
	if ($len >= strlen($str)) return $str;	// string not longer than supplied length

	$sjis = ($charset == 'shift_jis');
	// walk character by character until the byte limit is reached or passed
	for ($i=0; $i<$len; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++;	// advance a double-byte char
		} else {
			if ($c >= 0x80) $i++;	// advance a double-byte char
		}
	}

	if ($i > $len) {
		return substr($str,0,$len-1);	// the limit falls between the two bytes of a char
	}
	return substr($str,0,$len);
}
1253
/**
 * Returns a part of a string in the EUC charset family.
 *
 * @param	string		EUC multibyte character string
 * @param	int		start position (character position)
 * @param	string		the charset
 * @param	int		length (in characters), null for "up to the end"
 * @return	string		the substring, false if the start is outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_substr($str,$start,$charset,$len=null) {
	$byte_start = $this->euc_char2byte_pos($str,$start,$charset);
	if ($byte_start === false) return false;	// $start outside string length

	// a length of zero is a real length, not "omitted"
	if ($len === 0) return '';

	$str = substr($str,$byte_start);
	if ($len === null) return $str;

	$byte_end = $this->euc_char2byte_pos($str,$len,$charset);
	if ($byte_end === false) {	// $len outside actual string length
		return $str;
	}
	return substr($str,0,$byte_end);
}
1279
/**
 * Counts the number of characters of a string in the EUC charset family.
 *
 * @param	string		EUC multibyte character string
 * @param	string		the charset
 * @return	int		the number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strlen($str,$charset) {
	$sjis = ($charset == 'shift_jis');
	$n = 0;
	$size = strlen($str);
	// iterate by byte length: testing the byte itself would stop at a "0"
	for ($i=0; $i<$size; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++;	// advance a double-byte char
		} else {
			if ($c >= 0x80) $i++;	// advance a double-byte char
		}

		$n++;
	}

	return $n;
}
1306
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * For a position >= 0 the result is the byte offset where that character
 * starts. For a negative position the characters are counted from the end
 * of the string and the result is the byte offset of the character reached.
 *
 * @param	string		EUC multibyte character string
 * @param	int		character position (negative values start from the end)
 * @param	string		the charset
 * @return	int		byte position, false if the position is outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char2byte_pos($str,$pos,$charset) {
	$sjis = ($charset == 'shift_jis');
	$n = 0;			// number of characters seen
	$p = abs($pos);		// number of characters wanted
	$size = strlen($str);

	if ($pos >= 0) {
		$i = 0;
		$d = 1;
	} else {
		$i = $size-1;
		$d = -1;
	}

	// bounds are checked on the index: testing the byte itself would stop at a "0"
	for ( ; $i>=0 && $i<$size && $n<$p; $i+=$d) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i+=$d;	// advance a double-byte char
		} else {
			if ($c >= 0x80) $i+=$d;	// advance a double-byte char
		}

		$n++;
	}

	if ($pos >= 0) {
		if ($i >= $size) return false;	// offset beyond string length
		return $i;
	}

	if ($n < $p) return false;	// string has fewer characters than requested
	$i++;	// correct offset: the loop stepped one byte past the character
	return ($i < 0) ? 0 : $i;
}
1346
/**
 * Translates all characters of a string in the EUC charset family into their respective case values.
 * Characters without an entry in the case folding table are copied unchanged.
 *
 * @param	string		EUC multibyte character string
 * @param	string		conversion: 'toLower' or 'toUpper'
 * @param	string		the charset
 * @return	string		the converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper(), mb_convert_case()
 */
function euc_conv_case($str,$case,$charset) {
	if (!$this->initCaseFolding($charset)) return $str;	// do nothing

	$sjis = ($charset == 'shift_jis');
	$out = '';
	$caseConv =& $this->caseFolding[$charset][$case];
	$size = strlen($str);
	// iterate by byte length: testing the byte itself would stop at a "0"
	for ($i=0; $i<$size; $i++) {
		$mbc = $str[$i];
		$c = ord($mbc);

		if ($sjis) {
			$twoBytes = (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0));
		} else {
			$twoBytes = ($c >= 0x80);
		}
		if ($twoBytes) {	// a double-byte char
			$mbc = substr($str,$i,2);
			$i++;
		}

		if (!empty($caseConv[$mbc])) {
			$out .= $caseConv[$mbc];
		} else {
			$out .= $mbc;
		}
	}

	return $out;
}
1389
1390 }
1391
// Include the extension class file configured for this script, if any.
if (defined('TYPO3_MODE')) {
	if ($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']) {
		include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
	}
}
1395 ?>