Updating version number to 4.2-dev after release of 4.2.0beta2a
[Packages/TYPO3.CMS.git] / t3lib / class.t3lib_cs.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2003-2007 Kasper Skaarhoj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the Typo3 project. The Typo3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 *
17 * This script is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * This copyright notice MUST APPEAR in all copies of the script!
23 ***************************************************************/
24 /**
25 * Class for conversion between charsets.
26 *
27 * $Id$
28 *
29 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
30 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
31 */
32 /**
33 * [CLASS/FUNCTION INDEX of SCRIPT]
34 *
35 *
36 *
37 * 136: class t3lib_cs
38 * 488: function parse_charset($charset)
39 * 507: function get_locale_charset($locale)
40 *
41 * SECTION: Charset Conversion functions
42 * 560: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
43 * 600: function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0)
44 * 617: function utf8_encode($str,$charset)
45 * 663: function utf8_decode($str,$charset,$useEntityForNoChar=0)
46 * 706: function utf8_to_entities($str)
47 * 739: function entities_to_utf8($str,$alsoStdHtmlEnt=0)
48 * 773: function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
49 * 823: function UnumberToChar($cbyte)
50 * 868: function utf8CharToUnumber($str,$hex=0)
51 *
52 * SECTION: Init functions
53 * 911: function initCharset($charset)
54 * 973: function initUnicodeData($mode=null)
55 * 1198: function initCaseFolding($charset)
56 * 1260: function initToASCII($charset)
57 *
58 * SECTION: String operation functions
59 * 1331: function substr($charset,$string,$start,$len=null)
60 * 1384: function strlen($charset,$string)
61 * 1414: function crop($charset,$string,$len,$crop='')
62 * 1467: function strtrunc($charset,$string,$len)
63 * 1501: function conv_case($charset,$string,$case)
64 * 1527: function specCharsToASCII($charset,$string)
65 *
66 * SECTION: Internal string operation functions
67 * 1567: function sb_char_mapping($str,$charset,$mode,$opt='')
68 *
69 * SECTION: Internal UTF-8 string operation functions
70 * 1622: function utf8_substr($str,$start,$len=null)
71 * 1655: function utf8_strlen($str)
72 * 1676: function utf8_strtrunc($str,$len)
73 * 1698: function utf8_strpos($haystack,$needle,$offset=0)
74 * 1723: function utf8_strrpos($haystack,$needle)
75 * 1745: function utf8_char2byte_pos($str,$pos)
76 * 1786: function utf8_byte2char_pos($str,$pos)
77 * 1809: function utf8_char_mapping($str,$mode,$opt='')
78 *
79 * SECTION: Internal EUC string operation functions
80 * 1885: function euc_strtrunc($str,$len,$charset)
81 * 1914: function euc_substr($str,$start,$charset,$len=null)
82 * 1939: function euc_strlen($str,$charset)
83 * 1966: function euc_char2byte_pos($str,$pos,$charset)
84 * 2007: function euc_char_mapping($str,$charset,$mode,$opt='')
85 *
86 * TOTAL FUNCTIONS: 35
87 * (This index is automatically created/updated by the extension "extdeveval")
88 *
89 */
90
91
92
93
94
95
96
97
98 /**
99 * Notes on UTF-8
100 *
101 * Functions working on UTF-8 strings:
102 *
103 * - strchr/strstr
104 * - strrchr
105 * - substr_count
106 * - implode/explode/join
107 *
108 * Functions nearly working on UTF-8 strings:
109 *
110 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
111 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
112 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
113 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
114 *
115 * Functions NOT working on UTF-8 strings:
116 *
117 * - str*cmp
118 * - stristr
119 * - stripos
120 * - substr
121 * - strrev
122 * - ereg/eregi
123 * - split/spliti
124 * - preg_*
125 * - ...
126 *
127 */
128 /**
129 * Class for conversion between charsets
130 *
131 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
132 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
133 * @package TYPO3
134 * @subpackage t3lib
135 */
136 class t3lib_cs {
137 var $noCharByteVal=63; // ASCII Value for chars with no equivalent.
138
139 // This is the array where parsed conversion tables are stored (cached)
140 var $parsedCharsets=array();
141
142 // An array where case folding data will be stored (cached)
143 var $caseFolding=array();
144
145 // An array where charset-to-ASCII mappings are stored (cached)
146 var $toASCII=array();
147
148 // This tells the converter which charsets have two bytes per char:
149 var $twoByteSets=array(
150 'ucs-2'=>1, // 2-byte Unicode
151 );
152
153 // This tells the converter which charsets have four bytes per char:
154 var $fourByteSets=array(
155 'ucs-4'=>1, // 4-byte Unicode
156 'utf-32'=>1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
157 );
158
159 // This tells the converter which charsets use a scheme like the Extended Unix Code:
160 var $eucBasedSets=array(
161 'gb2312'=>1, // Chinese, simplified.
162 'big5'=>1, // Chinese, traditional.
163 'euc-kr'=>1, // Korean
164 'shift_jis'=>1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
165 );
166
167 // see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
168 // http://czyborra.com/charsets/iso8859.html
169 var $synonyms=array(
170 'us' => 'ascii',
171 'us-ascii'=> 'ascii',
172 'cp819' => 'iso-8859-1',
173 'ibm819' => 'iso-8859-1',
174 'iso-ir-100' => 'iso-8859-1',
175 'iso-ir-109' => 'iso-8859-2',
176 'iso-ir-148' => 'iso-8859-9',
177 'iso-ir-199' => 'iso-8859-14',
178 'iso-ir-203' => 'iso-8859-15',
179 'csisolatin1' => 'iso-8859-1',
180 'csisolatin2' => 'iso-8859-2',
181 'csisolatin3' => 'iso-8859-3',
182 'csisolatin5' => 'iso-8859-9',
183 'csisolatin8' => 'iso-8859-14',
184 'csisolatin9' => 'iso-8859-15',
185 'csisolatingreek' => 'iso-8859-7',
186 'iso-celtic' => 'iso-8859-14',
187 'latin1' => 'iso-8859-1',
188 'latin2' => 'iso-8859-2',
189 'latin3' => 'iso-8859-3',
190 'latin5' => 'iso-8859-9',
191 'latin6' => 'iso-8859-10',
192 'latin8' => 'iso-8859-14',
193 'latin9' => 'iso-8859-15',
194 'l1' => 'iso-8859-1',
195 'l2' => 'iso-8859-2',
196 'l3' => 'iso-8859-3',
197 'l5' => 'iso-8859-9',
198 'l6' => 'iso-8859-10',
199 'l8' => 'iso-8859-14',
200 'l9' => 'iso-8859-15',
201 'cyrillic' => 'iso-8859-5',
202 'arabic' => 'iso-8859-6',
203 'tis-620' => 'iso-8859-11',
204 'win874' => 'windows-874',
205 'win1250' => 'windows-1250',
206 'win1251' => 'windows-1251',
207 'win1252' => 'windows-1252',
208 'win1253' => 'windows-1253',
209 'win1254' => 'windows-1254',
210 'win1255' => 'windows-1255',
211 'win1256' => 'windows-1256',
212 'win1257' => 'windows-1257',
213 'win1258' => 'windows-1258',
214 'cp1250' => 'windows-1250',
215 'cp1251' => 'windows-1251',
216 'cp1252' => 'windows-1252',
217 'ms-ee' => 'windows-1250',
218 'ms-ansi' => 'windows-1252',
219 'ms-greek' => 'windows-1253',
220 'ms-turk' => 'windows-1254',
221 'winbaltrim' => 'windows-1257',
222 'koi-8ru' => 'koi-8r',
223 'koi8r' => 'koi-8r',
224 'cp878' => 'koi-8r',
225 'mac' => 'macroman',
226 'macintosh' => 'macroman',
227 'euc-cn' => 'gb2312',
228 'x-euc-cn' => 'gb2312',
229 'euccn' => 'gb2312',
230 'cp936' => 'gb2312',
231 'big-5' => 'big5',
232 'cp950' => 'big5',
233 'eucjp' => 'euc-jp',
234 'sjis' => 'shift_jis',
235 'shift-jis' => 'shift_jis',
236 'cp932' => 'shift_jis',
237 'cp949' => 'euc-kr',
238 'utf7' => 'utf-7',
239 'utf8' => 'utf-8',
240 'utf16' => 'utf-16',
241 'utf32' => 'utf-32',
242 'utf8' => 'utf-8',
243 'ucs2' => 'ucs-2',
244 'ucs4' => 'ucs-4',
245 );
246
247 // mapping of iso-639:2 language codes to script names
var $lang_to_script=array(
		// Two-letter language codes (ISO 639-1, plus a few TYPO3/legacy codes), see:
		// http://www.w3.org/WAI/ER/IG/ert/iso639.htm
		// http://www.loc.gov/standards/iso639-2/langcodes.html
		// http://www.unicode.org/onlinedat/languages.html
		// The values are keys of $script_to_charset_unix / $script_to_charset_windows.
	'ar' => 'arabic',
	'bg' => 'cyrillic',		// Bulgarian
	'bs' => 'east_european',	// Bosnian
	'cs' => 'east_european',	// Czech
	'da' => 'west_european',	// Danish
	'de' => 'west_european',	// German
	'el' => 'greek',		// Greek (ISO code; $isoArray maps the TYPO3 key "gr" to "el")
	'es' => 'west_european',	// Spanish
	'et' => 'estonian',
	'eo' => 'unicode',		// Esperanto
	'eu' => 'west_european',	// Basque
	'fa' => 'arabic',		// Persian
	'fi' => 'west_european',	// Finnish
	'fo' => 'west_european',	// Faroese
	'fr' => 'west_european',	// French
	'ga' => 'west_european',	// Irish
	'ge' => 'unicode',		// Georgian (legacy code, the ISO code is "ka")
	'gl' => 'west_european',	// Galician
	'gr' => 'greek',		// Greek (legacy code, the ISO code is "el")
	'he' => 'hebrew',		// Hebrew (since 1998)
	'hi' => 'unicode',		// Hindi
	'hr' => 'east_european',	// Croatian
	'hu' => 'east_european',	// Hungarian
	'iw' => 'hebrew',		// Hebrew (til 1998)
	'is' => 'west_european',	// Icelandic
	'it' => 'west_european',	// Italian
	'ja' => 'japanese',
	'ka' => 'unicode',		// Georgian
	'kl' => 'west_european',	// Greenlandic
	'ko' => 'korean',
	'lt' => 'lithuanian',
	'lv' => 'west_european',	// Latvian/Lettish
	'nl' => 'west_european',	// Dutch
	'no' => 'west_european',	// Norwegian
	'pl' => 'east_european',	// Polish
	'pt' => 'west_european',	// Portuguese
	'ro' => 'east_european',	// Romanian
	'ru' => 'cyrillic',		// Russian
	'sk' => 'east_european',	// Slovak
	'sl' => 'east_european',	// Slovenian
	'sr' => 'cyrillic',		// Serbian
	'sv' => 'west_european',	// Swedish
	'sq' => 'albanian',		// Albanian
	'th' => 'thai',
	'tr' => 'turkish',
	'uk' => 'cyrillic',		// Ukrainian
	'vi' => 'vietnamese',
	'zh' => 'chinese',
		// MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
		// http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
	'ara' => 'arabic',
	'bgr' => 'cyrillic',		// Bulgarian
	'cat' => 'west_european',	// Catalan
	'chs' => 'simpl_chinese',
	'cht' => 'trad_chinese',
	'csy' => 'east_european',	// Czech
	'dan' => 'west_european',	// Danish
	'deu' => 'west_european',	// German
	'dea' => 'west_european',	// German (Austrian)
	'des' => 'west_european',	// German (Swiss)
	'ena' => 'west_european',	// English (Australian)
	'enc' => 'west_european',	// English (Canadian)
	'eng' => 'west_european',	// English
	'enz' => 'west_european',	// English (New Zealand)
	'enu' => 'west_european',	// English (United States)
	'euq' => 'west_european',	// Basque
	'fos' => 'west_european',	// Faroese
	'far' => 'arabic',		// Persian
	'fin' => 'west_european',	// Finnish
	'fra' => 'west_european',	// French
	'frb' => 'west_european',	// French (Belgian)
	'frc' => 'west_european',	// French (Canadian)
	'frs' => 'west_european',	// French (Swiss)
	'geo' => 'unicode',		// Georgian
	'glg' => 'west_european',	// Galician
	'ell' => 'greek',
	'heb' => 'hebrew',
	'hin' => 'unicode',		// Hindi
	'hun' => 'east_european',	// Hungarian
	'isl' => 'west_european',	// Icelandic
	'ita' => 'west_european',	// Italian
	'its' => 'west_european',	// Italian (Swiss)
	'jpn' => 'japanese',
	'kor' => 'korean',
	'lth' => 'lithuanian',
	'lvi' => 'west_european',	// Latvian/Lettish
	'msl' => 'west_european',	// Malay
	'nlb' => 'west_european',	// Dutch (Belgian)
	'nld' => 'west_european',	// Dutch
	'nor' => 'west_european',	// Norwegian (bokmal)
	'non' => 'west_european',	// Norwegian (nynorsk)
	'plk' => 'east_european',	// Polish
	'ptg' => 'west_european',	// Portuguese
	'ptb' => 'west_european',	// Portuguese (Brazil)
	'rom' => 'east_european',	// Romanian
	'rus' => 'cyrillic',		// Russian
	'slv' => 'east_european',	// Slovenian
	'sky' => 'east_european',	// Slovak
	'srl' => 'east_european',	// Serbian (Latin)
	'srb' => 'cyrillic',		// Serbian (Cyrillic)
	'esp' => 'west_european',	// Spanish (trad. sort)
	'esm' => 'west_european',	// Spanish (Mexican)
	'esn' => 'west_european',	// Spanish (internat. sort)
	'sve' => 'west_european',	// Swedish
	'sqi' => 'albanian',		// Albanian
	'tha' => 'thai',
	'trk' => 'turkish',
	'ukr' => 'cyrillic',		// Ukrainian
		// English language names
	'albanian' => 'albanian',
	'arabic' => 'arabic',
	'basque' => 'west_european',
	'bosnian' => 'east_european',
	'bulgarian' => 'cyrillic',	// was "east_european"; Bulgarian is written in Cyrillic (cf. "bg" and "bgr" above)
	'catalan' => 'west_european',
	'croatian' => 'east_european',
	'czech' => 'east_european',
	'danish' => 'west_european',
	'dutch' => 'west_european',
	'english' => 'west_european',
	'esperanto' => 'unicode',
	'estonian' => 'estonian',
	'faroese' => 'west_european',
	'farsi' => 'arabic',
	'finnish' => 'west_european',
	'french' => 'west_european',
	'galician' => 'west_european',
	'georgian' => 'unicode',
	'german' => 'west_european',
	'greek' => 'greek',
	'greenlandic' => 'west_european',
	'hebrew' => 'hebrew',
	'hindi' => 'unicode',
	'hungarian' => 'east_european',
	'icelandic' => 'west_european',
	'italian' => 'west_european',
	'latvian' => 'west_european',
	'lettish' => 'west_european',
	'lithuanian' => 'lithuanian',
	'malay' => 'west_european',
	'norwegian' => 'west_european',
	'persian' => 'arabic',
	'polish' => 'east_european',
	'portuguese' => 'west_european',
	'russian' => 'cyrillic',
	'romanian' => 'east_european',
	'serbian' => 'cyrillic',
	'slovak' => 'east_european',
	'slovenian' => 'east_european',
	'spanish' => 'west_european',
	'swedish' => 'west_european',
	'svedish' => 'west_european',	// misspelled legacy key, kept for backward compatibility
	'thai' => 'thai',
	'that' => 'thai',		// misspelled legacy key, kept for backward compatibility
	'turkish' => 'turkish',
	'ukrainian' => 'cyrillic',
);
404
405 // mapping of language (family) names to charsets on Unix
406 var $script_to_charset_unix=array(
407 'west_european' => 'iso-8859-1',
408 'estonian' => 'iso-8859-1',
409 'east_european' => 'iso-8859-2',
410 'baltic' => 'iso-8859-4',
411 'cyrillic' => 'iso-8859-5',
412 'arabic' => 'iso-8859-6',
413 'greek' => 'iso-8859-7',
414 'hebrew' => 'iso-8859-8',
415 'turkish' => 'iso-8859-9',
416 'thai' => 'iso-8859-11', // = TIS-620
417 'lithuanian' => 'iso-8859-13',
418 'chinese' => 'gb2312', // = euc-cn
419 'japanese' => 'euc-jp',
420 'korean' => 'euc-kr',
421 'simpl_chinese' => 'gb2312',
422 'trad_chinese' => 'big5',
423 'vietnamese' => '',
424 'unicode' => 'utf-8',
425 'albanian' => 'utf-8'
426 );
427
428 // mapping of language (family) names to charsets on Windows
429 var $script_to_charset_windows=array(
430 'east_european' => 'windows-1250',
431 'cyrillic' => 'windows-1251',
432 'west_european' => 'windows-1252',
433 'greek' => 'windows-1253',
434 'turkish' => 'windows-1254',
435 'hebrew' => 'windows-1255',
436 'arabic' => 'windows-1256',
437 'baltic' => 'windows-1257',
438 'estonian' => 'windows-1257',
439 'lithuanian' => 'windows-1257',
440 'vietnamese' => 'windows-1258',
441 'thai' => 'cp874',
442 'korean' => 'cp949',
443 'chinese' => 'gb2312',
444 'japanese' => 'shift_jis',
445 'simpl_chinese' => 'gb2312',
446 'trad_chinese' => 'big5',
447 'albanian' => 'windows-1250',
448 'unicode' => 'utf-8'
449 );
450
451 // mapping of locale names to charsets
var $locale_to_charset=array(
		// NOTE: get_locale_charset() lowercases the locale before looking it up in this table,
		// so every key MUST be written in lowercase (the former key "sr@Latn" could never match).
	'japanese.euc' => 'euc-jp',
	'ja_jp.ujis' => 'euc-jp',
	'korean.euc' => 'euc-kr',
	'sr@latn' => 'iso-8859-2',
	'zh_cn' => 'gb2312',
	'zh_hk' => 'big5',
	'zh_tw' => 'big5',
);
461
462 // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
463 // Empty values means "iso-8859-1"
464 var $charSetArray = array(
465 'dk' => '',
466 'de' => '',
467 'no' => '',
468 'it' => '',
469 'fr' => '',
470 'es' => '',
471 'nl' => '',
472 'cz' => 'windows-1250',
473 'pl' => 'iso-8859-2',
474 'si' => 'windows-1250',
475 'fi' => '',
476 'tr' => 'iso-8859-9',
477 'se' => '',
478 'pt' => '',
479 'ru' => 'windows-1251',
480 'ro' => 'iso-8859-2',
481 'ch' => 'gb2312',
482 'sk' => 'windows-1250',
483 'lt' => 'windows-1257',
484 'is' => 'utf-8',
485 'hr' => 'windows-1250',
486 'hu' => 'iso-8859-2',
487 'gl' => '',
488 'th' => 'iso-8859-11',
489 'gr' => 'iso-8859-7',
490 'hk' => 'big5',
491 'eu' => '',
492 'bg' => 'windows-1251',
493 'br' => '',
494 'et' => 'iso-8859-4',
495 'ar' => 'iso-8859-6',
496 'he' => 'utf-8',
497 'ua' => 'windows-1251',
498 'jp' => 'shift_jis',
499 'lv' => 'utf-8',
500 'vn' => 'utf-8',
501 'ca' => 'iso-8859-15',
502 'ba' => 'iso-8859-2',
503 'kr' => 'euc-kr',
504 'eo' => 'utf-8',
505 'my' => '',
506 'hi' => 'utf-8',
507 'fo' => 'utf-8',
508 'fa' => 'utf-8',
509 'sr' => 'utf-8',
510 'sq' => 'utf-8',
511 'ge' => 'utf-8',
512 'ga' => '',
513 );
514
515 // TYPO3 specific: Array with the iso names used for each system language in TYPO3:
516 // Missing keys means: same as Typo3
517 var $isoArray = array(
518 'ba' => 'bs',
519 'br' => 'pt_BR',
520 'ch' => 'zh_CN',
521 'cz' => 'cs',
522 'dk' => 'da',
523 'si' => 'sl',
524 'se' => 'sv',
525 'gl' => 'kl',
526 'gr' => 'el',
527 'hk' => 'zh_HK',
528 'kr' => 'ko',
529 'ua' => 'uk',
530 'jp' => 'ja',
531 'vn' => 'vi',
532 );
533
534 /**
535 * Normalizes a charset name: converts it to lowercase, trims surrounding whitespace and resolves known synonyms (eg. "latin1" becomes "iso-8859-1").
536 *
537 * @param string Input charset
538 * @return string Normalized charset
539 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
540 */
function parse_charset($charset) {
		// Canonical form is lowercase without surrounding whitespace.
	$normalized = strtolower($charset);
	$normalized = trim($normalized);

		// Known aliases are replaced by their canonical name, everything else passes through.
	return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
}
547
548 /**
549 * Get the charset of a locale.
550 *
551 * ln language
552 * ln_CN language / country
553 * ln_CN.cs language / country / charset
554 * ln_CN.cs@mod language / country / charset / modifier
555 *
556 * @param string Locale string
557 * @return string Charset resolved for locale string
558 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
559 */
function get_locale_charset($locale) {
	$locale = strtolower($locale);

		// An exact match in the locale table wins over every heuristic below.
	if (isset($this->locale_to_charset[$locale])) {
		return $this->locale_to_charset[$locale];
	}

		// Split off the optional "@modifier". array_pad() supplies '' when the part is absent,
		// so list() never reads an undefined offset.
	list($locale, $modifier) = array_pad(explode('@', $locale), 2, '');

		// Split off the optional ".charset"; an explicit charset is used as is (after normalization).
	list($locale, $charset) = array_pad(explode('.', $locale), 2, '');
	if ($charset) {
		return $this->parse_charset($charset);
	}

		// Modifier 'euro' is checked after the charset because of locales like xx.utf-8@euro.
	if ($modifier == 'euro') {
		return 'iso-8859-15';
	}

		// Only the language part (before "_") selects the script; the country part is not needed.
	list($language) = explode('_', $locale);
	$script = isset($this->lang_to_script[$language]) ? $this->lang_to_script[$language] : '';

		// Pick the OS specific table together with its default charset.
	if (TYPO3_OS == 'WIN') {
		$charsetTable = $this->script_to_charset_windows;
		$defaultCharset = 'windows-1252';
	} else {
		$charsetTable = $this->script_to_charset_unix;
		$defaultCharset = 'iso-8859-1';
	}

		// Unknown scripts and empty table entries (eg. "vietnamese" on Unix) fall back to the default.
	return !empty($charsetTable[$script]) ? $charsetTable[$script] : $defaultCharset;
}
588
589
590
591
592
593
594
595
596
597 /********************************************
598 *
599 * Charset Conversion functions
600 *
601 ********************************************/
602
603 /**
604 * Convert from one charset to another charset.
605 *
606 * @param string Input string
607 * @param string From charset (the current charset of the string)
608 * @param string To charset (the output charset wanted)
609 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
610 * @return string Converted string
611 * @see convArray()
612 */
function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) {
		// Nothing to do when source and target charset are the same.
	if ($fromCS==$toCS) return $str;

		// PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything,
		// so the external libraries are only tried when no entity fallback can be needed.
	if ($toCS=='utf-8' || !$useEntityForNoChar) {
		$method = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'];
		$converted = false;

		if ($method == 'mbstring') {
			$converted = mb_convert_encoding($str,$toCS,$fromCS);	// returns false for unsupported charsets
		} elseif ($method == 'iconv') {
			$converted = iconv($fromCS,$toCS.'//TRANSLIT',$str);
		} elseif ($method == 'recode') {
			$converted = recode_string($fromCS.'..'.$toCS,$str);
		}

		if (false !== $converted) return $converted;
			// otherwise: fallback to TYPO3 conversion
	}

		// TYPO3 conversion always goes through UTF-8 as the intermediate charset.
	$utf8 = ($fromCS!='utf-8') ? $this->utf8_encode($str,$fromCS) : $str;
	return ($toCS!='utf-8') ? $this->utf8_decode($utf8,$toCS,$useEntityForNoChar) : $utf8;
}
641
642 /**
643 * Convert all elements in ARRAY from one charset to another charset.
644 * NOTICE: Array is passed by reference!
645 *
646 * @param array Input array, possibly multidimensional
647 * @param string From charset (the current charset of the string)
648 * @param string To charset (the output charset wanted)
649 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
650 * @return void
651 * @see conv()
652 */
function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0) {
		// Elements are addressed through $array[$idx] so that the caller's array is modified in place.
	foreach ($array as $idx => $unusedValue) {
		if (!is_array($array[$idx])) {
				// Leaf value: convert it directly.
			$array[$idx] = $this->conv($array[$idx],$fromCS,$toCS,$useEntityForNoChar);
			continue;
		}
			// Nested array: descend recursively.
		$this->convArray($array[$idx],$fromCS,$toCS,$useEntityForNoChar);
	}
}
662
663 /**
664 * Converts $str from $charset to UTF-8
665 *
666 * @param string String in local charset to convert to UTF-8
667 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
668 * @return string Output string, converted to UTF-8
669 */
function utf8_encode($str,$charset) {

		// UTF-8 input needs no conversion.
	if ($charset === 'utf-8') return $str;

		// Parse the conversion table if not done already. $charset is used verbatim as the table key
		// (no case folding happens here). Without a table nothing can be converted and, as before,
		// NULL is returned.
	if (!$this->initCharset($charset)) return NULL;

	$strLen = strlen($str);
	$outStr = '';
	$isTwoByteSet = isset($this->twoByteSets[$charset]);	// loop invariant
	$isEucBasedSet = isset($this->eucBasedSets[$charset]);	// loop invariant

	for ($a=0;$a<$strLen;$a++) {	// Traverse each char in string.
		$chr = substr($str,$a,1);
		$ord = ord($chr);

		if ($isTwoByteSet) {	// If the charset has two bytes per char
				// substr() replaces the former $str{$a+1}: the curly-brace offset syntax is a parse error
				// in PHP 8, and substr() yields an empty value (=> byte 0) instead of an
				// "uninitialized string offset" error when the last character is truncated.
			$ord2 = ord(substr($str,$a+1,1));
			$ord = $ord<<8 | $ord2;	// assume big endian

			if (isset($this->parsedCharsets[$charset]['local'][$ord])) {	// Local char number found in the parsed conv. table
				$outStr.= $this->parsedCharsets[$charset]['local'][$ord];
			} else $outStr.= chr($this->noCharByteVal);	// No char exists: use the replacement byte
			$a++;
		} elseif ($ord>127) {	// Bytes above 127 are not ASCII and must be mapped through the table
			if ($isEucBasedSet) {	// EUC uses two bytes above 127; we get both, advance the pointer and make $ord a 16bit int.
				if ($charset != 'shift_jis' || ($ord < 0xA0 || $ord > 0xDF)) {	// Shift-JIS: chars between 160 and 223 are single byte
					$a++;
					$ord2 = ord(substr($str,$a,1));
					$ord = $ord*256+$ord2;
				}
			}

			if (isset($this->parsedCharsets[$charset]['local'][$ord])) {	// Local char number found in the parsed conv. table
				$outStr.= $this->parsedCharsets[$charset]['local'][$ord];
			} else $outStr.= chr($this->noCharByteVal);	// No char exists: use the replacement byte
		} else $outStr.= $chr;	// ... otherwise it's just ASCII 0-127 and one byte. Transparent
	}
	return $outStr;
}
707
708 /**
709 * Converts $str from UTF-8 to $charset
710 *
711 * @param string String in UTF-8 to convert to local charset
712 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
713 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
714 * @return string Output string, converted to local charset
715 */
function utf8_decode($str,$charset,$useEntityForNoChar=0) {

		// Parse conv. table if not already; without a table there is no result (NULL).
	if (!$this->initCharset($charset)) return NULL;

	$byteCount = strlen($str);
	$result = '';
	$pos = 0;

	while ($pos < $byteCount) {	// Traverse each char in UTF-8 string.
		$byte = substr($str,$pos,1);
		$code = ord($byte);

		if ($code <= 127) {
				// ASCII 0-127 is one byte and passes through unchanged.
			$result.= $byte;
		} elseif (!($code & 64)) {
				// A byte > 127 without the 7th bit cannot start a sequence (MIDDLE of MB sequence!)
			$result.= chr($this->noCharByteVal);
		} else {
				// Lead byte: every further 1-bit below the top bit announces one more byte of the sequence.
			$seq = $byte;
			for ($mask=64; $mask>0 && ($code & $mask); $mask>>=1) {
				$pos++;
				$seq.= substr($str,$pos,1);
			}

			if (isset($this->parsedCharsets[$charset]['utf8'][$seq])) {	// The UTF-8 sequence is known in the target charset
				$local = $this->parsedCharsets[$charset]['utf8'][$seq];	// The local number
					// Local numbers above 255 are written as two bytes (16bit word assumed).
				$result.= ($local>255) ? chr(($local >> 8) & 255).chr($local & 255) : chr($local);
			} elseif ($useEntityForNoChar) {	// Create num entity:
				$result.= '&#'.$this->utf8CharToUnumber($seq,1).';';
			} else {	// No char exists
				$result.= chr($this->noCharByteVal);
			}
		}
		$pos++;
	}
	return $result;
}
752
753 /**
754 * Converts all chars > 127 to numeric entities.
755 *
756 * @param string Input string
757 * @return string Output string
758 */
function utf8_to_entities($str) {
	$byteCount = strlen($str);
	$result = '';

	for ($pos=0; $pos<$byteCount; $pos++) {	// Traverse each char in UTF-8 string.
		$byte = substr($str,$pos,1);
		$code = ord($byte);

			// ASCII 0-127 is one byte and passes through unchanged.
		if ($code <= 127) {
			$result.= $byte;
			continue;
		}

			// A byte > 127 without the 7th bit cannot start a sequence (MIDDLE of MB sequence!)
		if (!($code & 64)) {
			$result.= chr($this->noCharByteVal);
			continue;
		}

			// Lead byte: every further 1-bit below the top bit announces one more byte of the sequence.
		$seq = $byte;
		for ($mask=64; $mask>0 && ($code & $mask); $mask>>=1) {
			$pos++;
			$seq.= substr($str,$pos,1);
		}

			// The collected sequence is written as a hexadecimal numeric entity.
		$result.= '&#'.$this->utf8CharToUnumber($seq,1).';';
	}

	return $result;
}
784
785 /**
786 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
787 *
788 * @param string Input string, UTF-8
789 * @param boolean If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
790 * @return string Output string
791 */
function entities_to_utf8($str,$alsoStdHtmlEnt=0) {
		// Split the string at every "&...;" candidate. With PREG_SPLIT_DELIM_CAPTURE the captured entity
		// bodies end up at the odd indexes and the plain text between them at the even ones.
		// (PCRE replaces the former ereg_replace() + random token approach; the ereg functions no longer exist in PHP 7+.)
	$parts = preg_split('/&([#[:alnum:]]*);/', $str, -1, PREG_SPLIT_DELIM_CAPTURE);

	foreach($parts as $k => $v) {
		if (!($k%2)) continue;	// plain text, nothing to convert

		if (preg_match('/^#[0-9]+$/', $v)) {	// Decimal entity
				// "+ 0" yields an integer, or a float for numbers beyond the integer range
			$parts[$k] = $this->UnumberToChar(substr($v,1) + 0);
		} elseif (preg_match('/^#[xX][0-9a-fA-F]+$/', $v)) {	// Hexadecimal entity
			$parts[$k] = $this->UnumberToChar(hexdec(substr($v,2)));
		} elseif ($alsoStdHtmlEnt && substr($v,0,1)!='#') {	// Named entity
				// Decoding straight to UTF-8 does not depend on the charset which get_html_translation_table()
				// happens to use in the running PHP version. Unknown names are returned unchanged.
			$parts[$k] = html_entity_decode('&'.$v.';', ENT_COMPAT, 'UTF-8');
		} else {	// No conversion (this includes malformed numeric entities like "&#;" or "&#x;"):
			$parts[$k] = '&'.$v.';';
		}
	}

	return implode('',$parts);
}
817
818 /**
819 * Converts all chars in the input UTF-8 string into integer numbers returned in an array
820 *
821 * @param string Input string, UTF-8
822 * @param boolean If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
823 * @param boolean If set, then instead of integer numbers the real UTF-8 char is returned.
824 * @return array Output array with the char numbers
825 */
function utf8_to_numberarray($str,$convEntities=0,$retChar=0) {
		// If entities must be registered as well, they are turned into real UTF-8 chars first.
	if ($convEntities) $str = $this->entities_to_utf8($str,1);

	$byteCount = strlen($str);
	$list = array();

	for ($pos=0; $pos<$byteCount; $pos++) {	// Traverse each char in UTF-8 string.
		$byte = substr($str,$pos,1);
		$code = ord($byte);

			// ASCII 0-127 is one byte and is taken over as it is.
		if ($code <= 127) {
			$list[] = $retChar ? chr($code) : $code;
			continue;
		}

			// A byte > 127 without the 7th bit cannot start a sequence (MIDDLE of MB sequence!)
		if (!($code & 64)) {
			$list[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
			continue;
		}

			// Lead byte: every further 1-bit below the top bit announces one more byte of the sequence.
		$seq = $byte;
		for ($mask=64; $mask>0 && ($code & $mask); $mask>>=1) {
			$pos++;
			$seq.= substr($str,$pos,1);
		}

		$list[] = $retChar ? $seq : $this->utf8CharToUnumber($seq);
	}

	return $list;
}
856
857 /**
858 * Converts a UNICODE number to a UTF-8 multibyte character
859 * Algorithm based on script found at From: http://czyborra.com/utf/
860 * Unit-tested by Kasper
861 *
862 * The binary representation of the character's integer value is thus simply spread across the bytes and the number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
863 *
864 * bytes | bits | representation
865 * 1 | 7 | 0vvvvvvv
866 * 2 | 11 | 110vvvvv 10vvvvvv
867 * 3 | 16 | 1110vvvv 10vvvvvv 10vvvvvv
868 * 4 | 21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
869 * 5 | 26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
870 * 6 | 31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
871 *
872 * @param integer UNICODE integer
873 * @return string UTF-8 multibyte character string
874 * @see utf8CharToUnumber()
875 */
function UnumberToChar($cbyte) {
		// 1 byte, 7 bits: plain ASCII
	if ($cbyte < 0x80) {
		return chr($cbyte);
	}

		// 2 bytes, 11 bits
	if ($cbyte < 0x800) {
		return chr(0xC0 | ($cbyte >> 6)).
			chr(0x80 | ($cbyte & 0x3F));
	}

		// 3 bytes, 16 bits
	if ($cbyte < 0x10000) {
		return chr(0xE0 | ($cbyte >> 12)).
			chr(0x80 | (($cbyte >> 6) & 0x3F)).
			chr(0x80 | ($cbyte & 0x3F));
	}

		// 4 bytes, 21 bits
	if ($cbyte < 0x200000) {
		return chr(0xF0 | ($cbyte >> 18)).
			chr(0x80 | (($cbyte >> 12) & 0x3F)).
			chr(0x80 | (($cbyte >> 6) & 0x3F)).
			chr(0x80 | ($cbyte & 0x3F));
	}

		// 5 bytes, 26 bits
	if ($cbyte < 0x4000000) {
		return chr(0xF8 | ($cbyte >> 24)).
			chr(0x80 | (($cbyte >> 18) & 0x3F)).
			chr(0x80 | (($cbyte >> 12) & 0x3F)).
			chr(0x80 | (($cbyte >> 6) & 0x3F)).
			chr(0x80 | ($cbyte & 0x3F));
	}

		// 6 bytes, 31 bits
	if ($cbyte < 0x80000000) {
		return chr(0xFC | ($cbyte >> 30)).
			chr(0x80 | (($cbyte >> 24) & 0x3F)).
			chr(0x80 | (($cbyte >> 18) & 0x3F)).
			chr(0x80 | (($cbyte >> 12) & 0x3F)).
			chr(0x80 | (($cbyte >> 6) & 0x3F)).
			chr(0x80 | ($cbyte & 0x3F));
	}

		// Cannot express a 32-bit character in UTF-8
	return chr($this->noCharByteVal);
}
911
/**
 * Converts a UTF-8 multibyte character to a UNICODE number.
 *
 * Only the first character of the string is decoded. A first byte that is not a valid
 * lead byte (plain ASCII, a stray continuation byte 10xxxxxx, or the bytes 0xFE / 0xFF
 * which cannot start a sequence) is returned as its own byte value.
 * Continuation bytes missing at the end of the string are read as zero.
 *
 * @param	string		UTF-8 multibyte character string
 * @param	boolean		If set, then a hex. number prefixed with "x" is returned (as string).
 * @return	integer		UNICODE integer (or string like "x20ac" if $hex is set)
 * @see UnumberToChar()
 */
function utf8CharToUnumber($str,$hex=0) {
	$lead = ord(substr($str,0,1));	// First byte

		// 11xxxxxx marks the start of a multi byte sequence; 0xFE and 0xFF would announce
		// 7 or 8 bytes, leave no payload bits and are therefore treated like any other single byte
	if (($lead & 0xC0) == 0xC0 && $lead < 0xFE) {
			// Every set bit below the topmost one announces one continuation byte
		$trail = 0;
		for ($mask = 0x40; $lead & $mask; $mask >>= 1) {
			$trail++;
		}

			// The lead byte keeps its lowest (6 - $trail) bits as payload
		$int = $lead & ((1 << (6 - $trail)) - 1);

			// Each continuation byte contributes its lower six bits
		for ($b = 1; $b <= $trail; $b++) {
			$int = ($int << 6) | (ord(substr($str,$b,1)) & 0x3F);
		}
	} else {
		$int = $lead;
	}

	return $hex ? 'x'.dechex($int) : $int;
}
939
940
941
942
943
944
945
946
947
948 /********************************************
949 *
950 * Init functions
951 *
952 ********************************************/
953
/**
 * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
 * This function is automatically called by the conversion functions
 *
 * The parsed table is stored in $this->parsedCharsets[$charset] with two maps:
 * 'local' (byte value => UTF-8 character) and 'utf8' (UTF-8 character => byte value).
 * Only byte values above 127 are stored. The result is cached in typo3temp/cs/.
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param	string		The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return	integer		Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and loaded (from cache or by parsing).
 * @access private
 */
function initCharset($charset) {
		// Only process if the charset is not yet loaded:
	if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
		return 1;
	}

		// Conversion table filename:
	$charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';
	if (!$charset || !t3lib_div::validPathStr($charsetConvTableFile) || !@is_file($charsetConvTableFile)) {
		return false;
	}

		// Cache file for charsets:
		// Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$cachedTable = unserialize(t3lib_div::getUrl($cacheFile));
			// A truncated or corrupted cache file does not unserialize into an array; re-parse in that case
		if (is_array($cachedTable)) {
			$this->parsedCharsets[$charset] = $cachedTable;
			return 2;
		}
	}

		// The "whitespaced" type is on the syntax "0x0A	0x000A	#LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
	$whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';

		// Parse conversion table into lines:
	$lines = t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1);
		// Initialize the internal variable holding the conv. table:
	$this->parsedCharsets[$charset] = array('local'=>array(),'utf8'=>array());

	$detectedType = '';
	foreach ($lines as $value) {
			// Comment line or blanks are ignored.
		if (!trim($value) || substr($value,0,1)=='#') {
			continue;
		}

			// Detect type if not done yet: (Done on first real line)
		if (!$detectedType) {
			$detectedType = preg_match($whitespacedPattern,$value) ? 'whitespaced' : 'ms-token';
		}

		$hexbyte = '';
		$utf8 = '';
		if ($detectedType=='ms-token') {
				// Pad so that a line without "=" does not raise an undefined offset
			list($hexbyte,$utf8) = array_pad(preg_split('/=|:/',$value,3),2,'');
		} elseif ($detectedType=='whitespaced') {
			$regA = array();
			if (preg_match($whitespacedPattern,$value,$regA)) {
				$hexbyte = $regA[1];
				$utf8 = 'U+'.$regA[2];
			}
		}

		$decval = hexdec(trim($hexbyte));
		if ($decval>127) {
				// Strip the leading "U+" before converting the code point
			$utf8decval = hexdec(substr(trim($utf8),2));
			$utf8Char = $this->UnumberToChar($utf8decval);
			$this->parsedCharsets[$charset]['local'][$decval] = $utf8Char;
			$this->parsedCharsets[$charset]['utf8'][$utf8Char] = $decval;
		}
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->parsedCharsets[$charset]));
	}

	return 2;
}
1016
/**
 * This function initializes all UTF-8 character data tables.
 *
 * Reads PATH_t3lib.'unidata/UnicodeData.txt' (mandatory) plus the optional files
 * SpecialCasing.txt and Translit.txt from the same folder and fills
 * $this->caseFolding['utf-8'] ('toUpper', 'toLower', 'toTitle') and $this->toASCII['utf-8'].
 * Both tables are always built together and written to their cache files in typo3temp/cs/.
 *
 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
 *
 * @param	string		Mode ("case", "ascii", ...). Selects which in-memory table / cache file is checked before parsing.
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initUnicodeData($mode=null) {
		// cache files
	$cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
	$cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');

		// Only process if the tables are not yet loaded
	switch($mode) {
		case 'case':
			if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8'])) return 1;

				// Use cached version if possible
			if ($cacheFileCase && @is_file($cacheFileCase)) {
				$this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase));
				return 2;
			}
		break;

		case 'ascii':
			if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8'])) return 1;

				// Use cached version if possible
			if ($cacheFileASCII && @is_file($cacheFileASCII)) {
				$this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII));
				return 2;
			}
		break;
	}

		// process main Unicode data file
	$unicodeDataFile = PATH_t3lib.'unidata/UnicodeData.txt';
	if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) return false;

	$fh = fopen($unicodeDataFile,'rb');
	if (!$fh) return false;

		// key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
		// note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
	$this->caseFolding['utf-8'] = array();
	$utf8CaseFolding =& $this->caseFolding['utf-8'];	// a shorthand
	$utf8CaseFolding['toUpper'] = array();
	$utf8CaseFolding['toLower'] = array();
	$utf8CaseFolding['toTitle'] = array();

	$decomposition = array();	// array of temp. decompositions
	$mark = array();		// array of chars that are marks (eg. composing accents)
	$number = array();		// array of chars that are numbers (eg. digits)
	$omit = array();		// array of chars to be omitted (eg. Russian hard sign)

	while (!feof($fh)) {
		$line = fgets($fh,4096);
		if ($line === false) break;		// nothing more to read (also prevents spinning on a read error)
		if (trim($line) == '') continue;	// blank lines carry no data

			// has a lot of info; pad to the 15 fields of UnicodeData.txt so short lines destructure cleanly
		list($char,$name,$cat,,,$decomp,,,$num,,,,$upper,$lower,$title) = array_pad(explode(';', rtrim($line)), 15, '');

		$ord = hexdec($char);
		if ($ord > 0xFFFF) break;	// only process the BMP

		$utf8_char = $this->UnumberToChar($ord);

		if ($upper) $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
		if ($lower) $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
			// store "title" only when different from "upper" (only a few)
		if ($title && $title != $upper) $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));

			// first letter of the general category
		switch (substr($cat,0,1)) {
			case 'M':	// mark (accent, umlaut, ...)
				$mark["U+$char"] = 1;
			break;

			case 'N':	// numeric value
				if ($ord > 0x80 && $num != '') $number["U+$char"] = $num;
			break;
		}

			// accented Latin letters without "official" decomposition
		$match = array();
		if (!$decomp && preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/',$name,$match)) {
			$c = ord($match[2]);
			if ($match[1] == 'SMALL') $c += 32;

			$decomposition["U+$char"] = array(dechex($c));
			continue;
		}

			// optional "<tag>" followed by the space separated code points
		$match = array();
		if (preg_match('/(<.*>)? *(.+)/',$decomp,$match)) {
			switch($match[1]) {
				case '<circle>':	// add parenthesis as circle replacement, eg (1)
					$match[2] = '0028 '.$match[2].' 0029';
				break;

				case '<square>':	// add square brackets as square replacement, eg [1]
					$match[2] = '005B '.$match[2].' 005D';
				break;

				case '<compat>':	// ignore multi char decompositions that start with a space
					if (strncmp($match[2],'0020 ',5) == 0) continue 2;
				break;

					// ignore Arabic and vertical layout presentation decomposition
				case '<initial>':
				case '<medial>':
				case '<final>':
				case '<isolated>':
				case '<vertical>':
					continue 2;
			}
			$decomposition["U+$char"] = explode(' ',$match[2]);
		}
	}
	fclose($fh);

		// process additional Unicode data for casing (allow folded characters to expand into a sequence)
	$specialCasingFile = PATH_t3lib.'unidata/SpecialCasing.txt';
	if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
		$fh = fopen($specialCasingFile,'rb');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh,4096);
				if ($line === false) break;
				if (substr($line,0,1) == '#' || trim($line) == '') continue;	// comment or blank line

				list($char,$lower,$title,$upper,$cond) = array_pad(t3lib_div::trimExplode(';', $line), 5, '');
					// entries carrying a condition (anything but a trailing comment) are skipped
				if ($cond == '' || substr($cond,0,1) == '#') {
					$utf8_char = $this->UnumberToChar(hexdec($char));

						// collect the mappings that differ from the character itself
					$variants = array();
					if ($char != $lower) $variants['toLower'] = $lower;
					if ($char != $title && $title != $upper) $variants['toTitle'] = $title;
					if ($char != $upper) $variants['toUpper'] = $upper;

					foreach ($variants as $direction => $codes) {
						$sequence = '';
						foreach (explode(' ',$codes) as $code) {
							$sequence .= $this->UnumberToChar(hexdec($code));
						}
						$utf8CaseFolding[$direction][$utf8_char] = $sequence;
					}
				}
			}
			fclose($fh);
		}
	}

		// process custom decompositions
	$customTranslitFile = PATH_t3lib.'unidata/Translit.txt';
	if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
		$fh = fopen($customTranslitFile,'rb');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh,4096);
				if ($line === false) break;
				if (substr($line,0,1) == '#' || trim($line) == '') continue;	// comment or blank line

				list($char,$translit) = array_pad(t3lib_div::trimExplode(';', $line), 2, '');
				if (!$translit) $omit["U+$char"] = 1;
				$decomposition["U+$char"] = explode(' ', $translit);
			}
			fclose($fh);
		}
	}

		// decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
	foreach($decomposition as $from => $to) {
		$code_decomp = array();

		while ($code_value = array_shift($to)) {
			if (isset($decomposition["U+$code_value"])) {	// do recursive decomposition
				foreach(array_reverse($decomposition["U+$code_value"]) as $cv) {
					array_unshift($to, $cv);
				}
			} elseif (!isset($mark["U+$code_value"])) {	// remove mark
				array_push($code_decomp, $code_value);
			}
		}
		if (count($code_decomp) || isset($omit[$from])) {
			$decomposition[$from] = $code_decomp;
		} else {
			unset($decomposition[$from]);
		}
	}

		// create ascii only mapping
	$this->toASCII['utf-8'] = array();
	$ascii =& $this->toASCII['utf-8'];

	foreach($decomposition as $from => $to) {
		$code_decomp = array();
		while ($code_value = array_shift($to)) {
			$ord = hexdec($code_value);
			if ($ord > 127) {
				continue 2;	// skip decompositions containing non-ASCII chars
			}
			array_push($code_decomp,chr($ord));
		}
			// keys look like "U+00C0": strip the prefix instead of feeding non-hex characters to hexdec()
		$ascii[$this->UnumberToChar(hexdec(substr($from,2)))] = implode('',$code_decomp);
	}

		// add numeric decompositions
	foreach($number as $from => $to) {
		$utf8_char = $this->UnumberToChar(hexdec(substr($from,2)));
		if (!isset($ascii[$utf8_char])) {
			$ascii[$utf8_char] = $to;
		}
	}

	if ($cacheFileCase) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileCase,serialize($utf8CaseFolding));
	}

	if ($cacheFileASCII) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileASCII,serialize($ascii));
	}

	return 3;
}
1242
/**
 * Prepares the case folding table of a charset other than UTF-8.
 * Called automatically by the case folding functions.
 *
 * The table is derived from the UTF-8 case folding table: every character of the charset's
 * conversion table is folded in UTF-8 and converted back to the charset. The plain ASCII
 * letters a-z / A-Z are added afterwards and the result is cached in typo3temp/cs/.
 *
 * @param	string		Charset for which to initialize case folding.
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initCaseFolding($charset) {
		// Nothing to do when the table is in memory already
	if (is_array($this->caseFolding[$charset])) {
		return 1;
	}

		// Try the cache file first
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

		// Both the charset's conversion table and the UTF-8 case folding table are needed
	if (!$this->initCharset($charset)) {
		return false;
	}
	if (!$this->initUnicodeData('case')) {
		return false;
	}

	$undefinedChar = chr($this->noCharByteVal);
	$directions = array('toUpper','toLower','toTitle');

	foreach ($this->parsedCharsets[$charset]['local'] as $utf8Char) {
			// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$localChar = $this->utf8_decode($utf8Char, $charset);

		foreach ($directions as $direction) {
			$folded = $this->utf8_decode($this->caseFolding['utf-8'][$direction][$utf8Char], $charset);
				// keep only results that exist in the target charset
			if ($folded != '' && $folded != $undefinedChar) {
				$this->caseFolding[$charset][$direction][$localChar] = $folded;
			}
		}
	}

		// add the ASCII case table
	foreach (range('a','z') as $lowerLetter) {
		$this->caseFolding[$charset]['toUpper'][$lowerLetter] = chr(ord($lowerLetter)-32);
	}
	foreach (range('A','Z') as $upperLetter) {
		$this->caseFolding[$charset]['toLower'][$upperLetter] = chr(ord($upperLetter)+32);
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->caseFolding[$charset]));
	}

	return 3;
}
1304
/**
 * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
 * This function is automatically called by the ASCII transliteration functions.
 *
 * The table maps every character of the charset that has an entry in the UTF-8
 * transliteration table to that entry. It is always an array afterwards (possibly empty),
 * so the "already loaded" check works on the next call.
 *
 * @param	string		Charset for which to initialize conversion.
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initToASCII($charset) {
		// Only process if the transliteration table is not yet loaded:
	if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) return 1;

		// Use cached version if possible
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->toASCII[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

		// init UTF-8 conversion for this charset
	if (!$this->initCharset($charset)) {
		return false;
	}

		// UTF-8/ASCII transliteration is used as the base conversion table
	if (!$this->initUnicodeData('ascii')) {
		return false;
	}

		// Start from an empty table: without this, a charset with no transliterable character
		// would leave the entry undefined and NULL would be written to the cache
	$this->toASCII[$charset] = array();

	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
		if (isset($this->toASCII['utf-8'][$utf8])) {
				// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
			$c = $this->utf8_decode($utf8, $charset);
			$this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
		}
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->toASCII[$charset]));
	}

	return 3;
}
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366 /********************************************
1367 *
1368 * String operation functions
1369 *
1370 ********************************************/
1371
/**
 * Returns a part of a string.
 *
 * Depending on $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'] the work is delegated to mbstring
 * or iconv; otherwise the charset decides which internal implementation is used.
 * Charsets that are not known as UTF-8, EUC based, two-byte or four-byte are treated as
 * single-byte encodings.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Start position (character position)
 * @param	integer		Length (in characters). If omitted (NULL) the rest of the string is returned.
 * @return	string		The substring
 * @see substr(), mb_substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function substr($charset,$string,$start,$len=null) {
	if ($len===0) return '';

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
			// cannot omit $len, when specifying charset
		if ($len==null) {
			$enc = mb_internal_encoding();	// save internal encoding
			mb_internal_encoding($charset);
			$str = mb_substr($string,$start);
			mb_internal_encoding($enc);	// restore internal encoding

			return $str;
		}
		return mb_substr($string,$start,$len,$charset);
	} elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
			// cannot omit $len, when specifying charset
		if ($len==null) {
			$enc = iconv_get_encoding('internal_encoding');	// save internal encoding
			iconv_set_encoding('internal_encoding',$charset);
			$str = iconv_substr($string,$start);
			iconv_set_encoding('internal_encoding',$enc);	// restore internal encoding

			return $str;
		}
		return iconv_substr($string,$start,$len,$charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_substr($string,$start,$len);
	} elseif ($this->eucBasedSets[$charset]) {
		return $this->euc_substr($string,$start,$charset,$len);
	} elseif ($this->twoByteSets[$charset]) {
			// An omitted length must not be multiplied: NULL*2 is 0 and would yield an empty string
		return $len === null ? substr($string,$start*2) : substr($string,$start*2,$len*2);
	} elseif ($this->fourByteSets[$charset]) {
		return $len === null ? substr($string,$start*4) : substr($string,$start*4,$len*4);
	}

		// treat everything else as single-byte encoding
	return $len === null ? substr($string,$start) : substr($string,$start,$len);
}
1426
/**
 * Returns the length of a string, measured in characters of the given charset.
 *
 * mbstring or iconv is used when configured in $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'];
 * otherwise the charset selects the internal implementation, with single-byte as fallback.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @return	integer		The number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strlen($charset,$string) {
	$utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];

		// Configured PHP extensions take precedence
	if ($utils == 'mbstring') {
		return mb_strlen($string,$charset);
	}
	if ($utils == 'iconv') {
		return iconv_strlen($string,$charset);
	}

		// Variable-width encodings need a real character count
	if ($charset == 'utf-8') {
		return $this->utf8_strlen($string);
	}
	if ($this->eucBasedSets[$charset]) {
		return $this->euc_strlen($string,$charset);
	}

		// Fixed-width encodings: byte count divided by the character width
	if ($this->twoByteSets[$charset]) {
		return strlen($string)/2;
	}
	if ($this->fourByteSets[$charset]) {
		return strlen($string)/4;
	}

		// treat everything else as single-byte encoding
	return strlen($string);
}
1454
/**
 * Truncates a string and pre-/appends a string.
 *
 * A positive $len keeps the first $len characters and appends $crop, a negative $len keeps
 * the last abs($len) characters and prepends $crop. The string is returned unchanged when
 * $len is 0 (or not numeric) or when the string is not longer than the requested length.
 * Charsets other than UTF-8 and the EUC based ones are handled as single-byte encodings.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Length (in characters)
 * @param	string		Crop signifier
 * @return	string		The shortened string
 * @see substr(), mb_strimwidth()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function crop($charset,$string,$len,$crop='') {
	if (intval($len) == 0) return $string;

		// Translate the character position into a byte position ($i)
	if ($charset == 'utf-8') {
		$i = $this->utf8_char2byte_pos($string,$len);
	} elseif ($this->eucBasedSets[$charset]) {
		$i = $this->euc_char2byte_pos($string,$len,$charset);
	} elseif ($len > 0) {
		$i = $len;
	} else {
		$i = strlen($string)+$len;
		if ($i<=0) $i = false;
	}

	if ($i === false) {	// $len outside actual string length
		return $string;
	}

		// Compare against the byte length instead of probing string offsets that may not exist
	$byteLength = strlen($string);
	if ($len > 0) {
			// only crop if there is at least one byte behind the cut position
		if ($i < $byteLength) {
			return substr($string,0,$i).$crop;
		}
	} elseif ($i >= 1 && $i <= $byteLength) {
			// only crop if there is at least one byte in front of the cut position
		return $crop.substr($string,$i);
	}

	return $string;
}
1509
/**
 * Cuts a string short at a given byte length.
 *
 * The cut never splits a character: for multi-byte charsets the result may be shorter
 * than $len bytes. A $len of zero or less yields an empty string.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		The byte length
 * @return	string		The shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strtrunc($charset,$string,$len) {
	if ($len <= 0) return '';

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strcut($string,0,$len,$charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_strtrunc($string,$len);
	} elseif ($this->eucBasedSets[$charset]) {
			// The byte length has to be handed over as well (it was missing from this call).
			// NOTE(review): euc_strtrunc() is not visible in this part of the file; the argument order
			// (string, length, charset) is assumed to mirror euc_char2byte_pos() - confirm against its definition.
		return $this->euc_strtrunc($string,$len,$charset);
	} elseif ($this->twoByteSets[$charset]) {
		if ($len % 2) $len--;		// don't cut at odd positions
	} elseif ($this->fourByteSets[$charset]) {
		$x = $len % 4;
		$len -= $x;	// realign to position dividable by four
	}
		// treat everything else as single-byte encoding
	return substr($string,0,$len);
}
1538
/**
 * Converts every character of a string to lower or upper case.
 * In contrast to strtolower() and strtoupper() the result does not depend on the locale.
 * Note that the string length may change!
 * eg. lower case German sharp s (U+00DF) becomes upper case "SS"
 * Real case folding is language dependent, this method ignores this fact.
 *
 * @param	string		Character set of string
 * @param	string		Input string to convert case for
 * @param	string		Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
 * @return	string		The converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper()
 */
function conv_case($charset,$string,$case) {
		// Let mbstring do the work when it is the configured utility
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return $case == 'toLower'
			? mb_strtolower($string,$charset)
			: mb_strtoupper($string,$charset);
	}

	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string,'case',$case);
	}

	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string,$charset,'case',$case);
	}

		// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string,$charset,'case',$case);
}
1572
/**
 * Replaces special characters (accented letters, umlauts etc.) by their ASCII equivalents.
 * A single character may be replaced by several ASCII characters (eg. a-umlaut => "ae").
 *
 * @param	string		Character set of string
 * @param	string		Input string to convert
 * @return	string		The converted string
 */
function specCharsToASCII($charset,$string) {
	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string,'ascii');
	}

	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string,$charset,'ascii');
	}

		// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string,$charset,'ascii');
}
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604 /********************************************
1605 *
1606 * Internal string operation functions
1607 *
1608 ********************************************/
1609
/**
 * Maps all characters of a string in a single byte charset.
 *
 * Every byte that has an entry in the selected table is replaced by that entry, all other
 * bytes are copied unchanged. The input is returned untouched if the table cannot be
 * initialized or the mode is unknown.
 *
 * @param	string		the string
 * @param	string		the charset
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function sb_char_mapping($str,$charset,$mode,$opt='') {
	switch($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) return $str;	// do nothing
			$map =& $this->caseFolding[$charset][$opt];
		break;

		case 'ascii':
			if (!$this->initToASCII($charset)) return $str;	// do nothing
			$map =& $this->toASCII[$charset];
		break;

		default:
			return $str;
	}

		// Work on a real string so that numeric input is mapped byte by byte as well
	$str = (string)$str;
	$byteCount = strlen($str);

	$out = '';
	for ($i=0; $i<$byteCount; $i++) {
		$c = $str[$i];
		$out .= isset($map[$c]) ? $map[$c] : $c;
	}

	return $out;
}
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658 /********************************************
1659 *
1660 * Internal UTF-8 string operation functions
1661 *
1662 ********************************************/
1663
/**
 * Returns a part of a UTF-8 string.
 *
 * Positions and lengths are given in characters; negative values count from the end
 * of the string like in substr().
 *
 * @param	string		UTF-8 string
 * @param	integer		Start position (character position)
 * @param	integer		Length (in characters). If omitted (NULL) the rest of the string is returned.
 * @return	string		The substring. FALSE if a positive $start lies outside of the string.
 * @see substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_substr($str,$start,$len=null) {
		// An explicit length of zero (integer or string) always yields an empty string
	if ((string)$len === '0') return '';

	$byte_start = $this->utf8_char2byte_pos($str,$start);
	if ($byte_start === false) {
		if ($start > 0) {
			return false;	// $start outside string length
		}
			// a negative start reaching beyond the beginning means "from the first byte"
		$byte_start = 0;
	}

	$str = substr($str,$byte_start);

	if ($len != null) {
		$byte_end = $this->utf8_char2byte_pos($str,$len);
		if ($byte_end === false) {	// $len outside actual string length
			return $len<0 ? '' : $str;	// When length is less than zero and exceeds, then we return blank string.
		}
		return substr($str,0,$byte_end);
	}

	return $str;
}
1698
/**
 * Counts the number of characters of a string in UTF-8.
 *
 * Every byte that is not a continuation byte (10xxxxxx) starts a character and is counted;
 * the string is not validated.
 *
 * @param	string		UTF-8 multibyte character string
 * @return	integer		The number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strlen($str) {
	$str = (string)$str;
	$byteCount = strlen($str);

	$n = 0;
	for ($i=0; $i<$byteCount; $i++) {
			// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		if ((ord($str[$i]) & 0xC0) != 0x80) {
			$n++;
		}
	}
	return $n;
}
1719
/**
 * Truncates a string in UTF-8 short at a given byte length.
 *
 * The result never ends inside a multi-byte character: if the character at the cut position
 * does not fit completely, the string is cut in front of it. A multi-byte character at the
 * very beginning of the string is kept when it fits into $len bytes.
 *
 * @param	string		UTF-8 multibyte character string
 * @param	integer		the byte length
 * @return	string		the shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strtrunc($str,$len) {
	$str = (string)$str;
	if ($len <= 0) return '';
	if ($len >= strlen($str)) return $str;	// nothing to cut

	$i = $len-1;
	if (ord($str[$i]) & 0x80) {	// part of a multibyte sequence
			// walk back to the first byte of the sequence (bit 0x40 set)
		while ($i > 0 && !(ord($str[$i]) & 0x40)) {
			$i--;
		}
		$lead = ord($str[$i]);

			// sanity check: reached the start of the string without finding a lead byte
		if ($i <= 0 && ($lead & 0xC0) != 0xC0) return '';

			// calculate number of bytes announced by the lead byte
		for ($bc=0; $lead & 0x80; $lead = $lead << 1) {
			$bc++;
		}
		if ($bc+$i > $len) return substr($str,0,$i);
			// fallthru: multibyte char fits into length
	}
	return substr($str,0,$len);
}
1740
/**
 * Finds the first occurrence of a string within another one; both are UTF-8 encoded.
 *
 * Uses mbstring or iconv if one of them is configured in
 * $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'], otherwise a byte search whose offset and
 * result are translated from / to character positions.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 string to search for
 * @param	integer		Position to start the search
 * @return	integer		The character position
 * @see strpos()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strpos($haystack,$needle,$offset=0) {
	switch ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) {
		case 'mbstring':
			return mb_strpos($haystack,$needle,$offset,'utf-8');
		case 'iconv':
			return iconv_strpos($haystack,$needle,$offset,'utf-8');
	}

	$byteOffset = $this->utf8_char2byte_pos($haystack,$offset);
	if ($byteOffset !== false) {
		$bytePos = strpos($haystack,$needle,$byteOffset);
		if ($bytePos !== false) {
			return $this->utf8_byte2char_pos($haystack,$bytePos);
		}
			// needle not found
	}
		// offset beyond string length or needle not found
	return false;
}
1766
/**
 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
 *
 * Uses mbstring or iconv if one of them is configured in
 * $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'], otherwise a byte search whose result is
 * translated into a character position.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 character to search for (single character)
 * @return	integer		The character position
 * @see strrpos()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strrpos($haystack,$needle) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
			// third argument of mb_strrpos() is the offset, the encoding comes fourth
		return mb_strrpos($haystack,$needle,0,'utf-8');
	} elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
		return iconv_strrpos($haystack,$needle,'utf-8');
	}

	$byte_pos = strrpos($haystack,$needle);
	if ($byte_pos === false) return false;	// needle not found

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
1788
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * For a position >= 0 the string is scanned from the start and the byte offset of the
 * character with that index is returned. For a negative position the string is scanned
 * from the end and the byte offset of the abs($pos)-th last character is returned.
 * FALSE is returned when the scan runs off either end of the string; this includes a
 * negative position that reaches exactly the first character.
 *
 * @param	string		UTF-8 string
 * @param	integer		Character position (negative values start from the end)
 * @return	integer		Byte position (FALSE if the position is outside of the string)
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char2byte_pos($str,$pos) {
	$str = (string)$str;
	$size = strlen($str);

	$n = 0;			// number of characters found
	$p = abs($pos);		// number of characters wanted

	if ($pos >= 0) {
		$i = 0;
		$d = 1;
	} else {
		$i = $size-1;
		$d = -1;
	}

		// Explicit bounds: string offsets outside 0..$size-1 must end the scan
		// (negative offsets address bytes from the end of the string since PHP 7.1)
	for ( ; $i >= 0 && $i < $size && $n < $p; $i += $d) {
			// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		if ((ord($str[$i]) & 0xC0) != 0x80) {
			$n++;
		}
	}
	if ($i < 0 || $i >= $size) return false;	// offset beyond string length

	if ($pos >= 0) {
			// skip trailing multi-byte data bytes
		while ($i < $size && (ord($str[$i]) & 0xC0) == 0x80) {
			$i++;
		}
	} else {
			// correct offset
		$i++;
	}

	return $i;
}
1829
/**
 * Translates an 'absolute' byte position into a character position.
 *
 * Counts the character starting bytes (every byte that is not a UTF-8 data byte,
 * 10xxxxxx) at the offsets 1..$pos. For a string that begins with a starting byte
 * this is the index of the character the byte at $pos belongs to.
 *
 * The position just past the last byte ($pos == strlen($str)) is accepted and is
 * counted like a starting byte, so it yields the number of characters in the string.
 *
 * @param	string		UTF-8 string
 * @param	integer		byte position
 * @return	integer		character position, or false for an empty string or a position outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_byte2char_pos($str,$pos) {
	$len = strlen($str);
	if ($len == 0 || $pos < 0 || $pos > $len) return false; // offset outside the string

	$n = 0; // number of characters
	for ($i=$pos; $i>0; $i--) {
		// $i == $len has no byte to inspect; it is treated as a character boundary
		if ($i == $len || (ord($str[$i]) & 0xC0) != 0x80) $n++;
	}

	return $n;
}
1852
/**
 * Maps all characters of an UTF-8 string.
 *
 * The string is split into characters; each character that is a key of the selected
 * mapping table is replaced by the table's value, all others are copied unchanged.
 * A data byte (10xxxxxx) that does not follow a starting byte is copied through as is.
 * The string is returned untouched if initUnicodeData() reports failure or the mode
 * is unknown.
 *
 * @param	string		UTF-8 string
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char_mapping($str,$mode,$opt='') {
	if (!$this->initUnicodeData($mode)) return $str; // do nothing

	switch($mode) {
		case 'case':
			$map =& $this->caseFolding['utf-8'][$opt];
			break;

		case 'ascii':
			$map =& $this->toASCII['utf-8'];
			break;

		default:
			return $str;
	}

	$out = '';
	$len = strlen($str);
	for ($i=0; $i<$len; $i++) {
		$c = ord($str[$i]);
		if (($c & 0xC0) == 0xC0) { // multi-byte starting byte (11xxxxxx)
			for ($bc=0; $c & 0x80; $c = $c << 1) { $bc++; } // calculate number of bytes
			$mbc = substr($str,$i,$bc);
			$i += $bc-1;
		} else {
			// single-byte (0xxxxxxx), or a stray data byte (10xxxxxx) which is kept as it is
			$mbc = $str[$i];
		}

		if (isset($map[$mbc])) {
			$out .= $map[$mbc];
		} else {
			$out .= $mbc;
		}
	}

	return $out;
}
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916 /********************************************
1917 *
1918 * Internal EUC string operation functions
1919 *
1920 * Extended Unix Code:
1921 * ASCII compatible 7bit single bytes chars
1922 * 8bit two byte chars
1923 *
1924 * Shift-JIS is treated as a special case.
1925 *
1926 ********************************************/
1927
/**
 * Cuts a string in the EUC charset family short at a given byte length.
 *
 * The result never ends in the middle of a double-byte character: if the byte limit
 * falls between the two bytes of a character, that character is dropped completely.
 * A string that is not longer than the limit is returned unchanged.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		the byte length
 * @param	string		the charset
 * @return	string		the shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strtrunc($str,$len,$charset) {
	if (strlen($str) <= $len) return $str; // string not longer than supplied length

	$sjis = ($charset == 'shift_jis');
	for ($i=0; $i<$len; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++; // advance a double-byte char
		}
		else {
			if ($c >= 0x80) $i++; // advance a double-byte char
		}
	}

	if ($i>$len)
		return substr($str,0,$len-1); // we ended on a first byte
	else
		return substr($str,0,$len);
}
1956
/**
 * Returns a part of a string in the EUC charset family.
 *
 * Start and length are character counts; both are translated into byte offsets
 * with euc_char2byte_pos(). A numeric length of 0 yields an empty string; only a
 * missing (non-numeric) length means "up to the end of the string".
 *
 * @param	string		EUC multibyte character string
 * @param	integer		start position (character position)
 * @param	string		the charset
 * @param	integer		length (in characters)
 * @return	string		the substring, or false if the start position lies outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_substr($str,$start,$charset,$len=null) {
	$byte_start = $this->euc_char2byte_pos($str,$start,$charset);
	if ($byte_start === false) return false; // $start outside string length

	$str = substr($str,$byte_start);

	// is_numeric() keeps 0 as a real length; a loose comparison with null would drop it
	if (!is_numeric($len)) return $str;

	$byte_end = $this->euc_char2byte_pos($str,$len,$charset);
	if ($byte_end === false) {
		// $len outside actual string length: a negative length cuts off everything,
		// a positive one simply reaches the end of the string
		return $len<0 ? '' : $str;
	}

	return substr($str,0,$byte_end);
}
1982
/**
 * Counts the number of characters of a string in the EUC charset family.
 *
 * Every byte of 0x80 and above is taken as the first byte of a double-byte character.
 * For 'shift_jis' only the bytes 0x80-0x9F and 0xE0 and above are; every other byte
 * counts as a single-byte character.
 *
 * @param	string		EUC multibyte character string
 * @param	string		the charset
 * @return	integer		the number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strlen($str,$charset) {
	$sjis = ($charset == 'shift_jis');
	$len = strlen($str);
	$n = 0;
	for ($i=0; $i<$len; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++; // advance a double-byte char
		}
		else {
			if ($c >= 0x80) $i++; // advance a double-byte char
		}

		$n++;
	}

	return $n;
}
2009
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * For a non-negative position the string is scanned from the start, for a negative
 * position from the end. The string bounds are checked against strlen() instead of
 * probing offsets, because reading $str[-1] yields the last byte since PHP 7.1.
 *
 * NOTE(review): the backward scan applies the first-byte test to whatever byte it
 * lands on. This presumably relies on both bytes of a double-byte character being
 * 0x80 or above, which may not hold for Shift-JIS second bytes - confirm before
 * using negative positions with 'shift_jis'.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		character position (negative values start from the end)
 * @param	string		the charset
 * @return	integer		byte position, or false if the position lies outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char2byte_pos($str,$pos,$charset) {
	$sjis = ($charset == 'shift_jis');
	$len = strlen($str);
	$n = 0; // number of characters seen
	$p = abs($pos); // number of characters wanted

	if ($pos >= 0) {
		$i = 0;
		$d = 1;
	} else {
		$i = $len-1;
		$d = -1;
	}

	for ( ; $i>=0 && $i<$len && $n<$p; $i+=$d) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i+=$d; // advance a double-byte char
		}
		else {
			if ($c >= 0x80) $i+=$d; // advance a double-byte char
		}

		$n++;
	}

	if ($pos >= 0) {
		if ($i >= $len) return false; // offset beyond string length
		return $i;
	}

	if ($n < $p) return false; // fewer characters in the string than requested

	// correct offset; never negative, even if a lone double-byte half starts the string
	return max($i+1, 0);
}
2049
/**
 * Maps all characters of a string in the EUC charset family.
 *
 * The string is split into single- and double-byte characters; each character that
 * is a key of the selected mapping table is replaced by the table's value, all
 * others are copied unchanged. The string is returned untouched if the table for
 * the charset cannot be initialised or the mode is unknown.
 *
 * @param	string		EUC multibyte character string
 * @param	string		the charset
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char_mapping($str,$charset,$mode,$opt='') {
	switch($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) return $str; // do nothing
			$map =& $this->caseFolding[$charset][$opt];
			break;

		case 'ascii':
			if (!$this->initToASCII($charset)) return $str; // do nothing
			$map =& $this->toASCII[$charset];
			break;

		default:
			return $str;
	}

	$sjis = ($charset == 'shift_jis');
	$out = '';
	$len = strlen($str);
	for ($i=0; $i<$len; $i++) {
		$mbc = $str[$i];
		$c = ord($mbc);

		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) { // a double-byte char
				$mbc = substr($str,$i,2);
				$i++;
			}
		}
		else {
			if ($c >= 0x80) { // a double-byte char
				$mbc = substr($str,$i,2);
				$i++;
			}
		}

		if (isset($map[$mbc])) {
			$out .= $map[$mbc];
		} else {
			$out .= $mbc;
		}
	}

	return $out;
}
2104
2105 }
2106
// Load the extension class registered for this file, if any. empty() is used because
// the XCLASS entry is usually not set and must not raise an "undefined index" notice.
if (defined('TYPO3_MODE') && !empty($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php'])) {
	include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
}
2110 ?>