[TASK] Clean-up several whitespaces and PHPdoc comments
[Packages/TYPO3.CMS.git] / t3lib / class.t3lib_cs.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2003-2011 Kasper Skårhøj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the Typo3 project. The Typo3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 *
17 * This script is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * This copyright notice MUST APPEAR in all copies of the script!
23 ***************************************************************/
24 /**
25 * Class for conversion between charsets.
26 *
27 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
28 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
29 */
30
31
32 /**
33 * Notes on UTF-8
34 *
35 * Functions working on UTF-8 strings:
36 *
37 * - strchr/strstr
38 * - strrchr
39 * - substr_count
40 * - implode/explode/join
41 *
42 * Functions nearly working on UTF-8 strings:
43 *
44 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
45 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
46 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
47 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
48 * - preg_*: Support compiled into PHP by default nowadays, but could be unavailable, need to use modifier
49 *
50 * Functions NOT working on UTF-8 strings:
51 *
52 * - str*cmp
53 * - stristr
54 * - stripos
55 * - substr
56 * - strrev
57 * - split/spliti
58 * - ...
59 *
60 */
61 /**
62 * Class for conversion between charsets
63 *
64 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
65 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
66 * @package TYPO3
67 * @subpackage t3lib
68 */
69 class t3lib_cs {
70
71 /**
72 * @var t3lib_l10n_Locales
73 */
74 protected $locales;
75
76 var $noCharByteVal = 63; // ASCII Value for chars with no equivalent.
77
78 // This is the array where parsed conversion tables are stored (cached)
79 var $parsedCharsets = array();
80
81 // An array where case folding data will be stored (cached)
82 var $caseFolding = array();
83
84 // An array where charset-to-ASCII mappings are stored (cached)
85 var $toASCII = array();
86
87 // This tells the converter which charsets has two bytes per char:
88 var $twoByteSets = array(
89 'ucs-2' => 1, // 2-byte Unicode
90 );
91
92 // This tells the converter which charsets has four bytes per char:
93 var $fourByteSets = array(
94 'ucs-4' => 1, // 4-byte Unicode
95 'utf-32' => 1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
96 );
97
98 // This tells the converter which charsets use a scheme like the Extended Unix Code:
99 var $eucBasedSets = array(
100 'gb2312' => 1, // Chinese, simplified.
101 'big5' => 1, // Chinese, traditional.
102 'euc-kr' => 1, // Korean
103 'shift_jis' => 1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
104 );
105
// Maps lowercase charset aliases to the charset name parse_charset() resolves them to.
// Every key must be unique; PHP silently keeps only the last value of a duplicated key.
// see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
// http://czyborra.com/charsets/iso8859.html
var $synonyms = array(
	'us' => 'ascii',
	'us-ascii' => 'ascii',
	'cp819' => 'iso-8859-1',
	'ibm819' => 'iso-8859-1',
	'iso-ir-100' => 'iso-8859-1',
	'iso-ir-101' => 'iso-8859-2',
	'iso-ir-109' => 'iso-8859-3',
	'iso-ir-110' => 'iso-8859-4',
	'iso-ir-144' => 'iso-8859-5',
	'iso-ir-127' => 'iso-8859-6',
	'iso-ir-126' => 'iso-8859-7',
	'iso-ir-138' => 'iso-8859-8',
	'iso-ir-148' => 'iso-8859-9',
	'iso-ir-157' => 'iso-8859-10',
	'iso-ir-179' => 'iso-8859-13',
	'iso-ir-199' => 'iso-8859-14',
	'iso-ir-203' => 'iso-8859-15',
	'csisolatin1' => 'iso-8859-1',
	'csisolatin2' => 'iso-8859-2',
	'csisolatin3' => 'iso-8859-3',
	'csisolatin5' => 'iso-8859-9',
	'csisolatin8' => 'iso-8859-14',
	'csisolatin9' => 'iso-8859-15',
	'csisolatingreek' => 'iso-8859-7',
	'iso-celtic' => 'iso-8859-14',
	'latin1' => 'iso-8859-1',
	'latin2' => 'iso-8859-2',
	'latin3' => 'iso-8859-3',
	'latin5' => 'iso-8859-9',
	'latin6' => 'iso-8859-10',
	'latin8' => 'iso-8859-14',
	'latin9' => 'iso-8859-15',
	'l1' => 'iso-8859-1',
	'l2' => 'iso-8859-2',
	'l3' => 'iso-8859-3',
	'l5' => 'iso-8859-9',
	'l6' => 'iso-8859-10',
	'l8' => 'iso-8859-14',
	'l9' => 'iso-8859-15',
	'cyrillic' => 'iso-8859-5',
	'arabic' => 'iso-8859-6',
	'tis-620' => 'iso-8859-11',
	'win874' => 'windows-874',
	'win1250' => 'windows-1250',
	'win1251' => 'windows-1251',
	'win1252' => 'windows-1252',
	'win1253' => 'windows-1253',
	'win1254' => 'windows-1254',
	'win1255' => 'windows-1255',
	'win1256' => 'windows-1256',
	'win1257' => 'windows-1257',
	'win1258' => 'windows-1258',
	'cp1250' => 'windows-1250',
	'cp1251' => 'windows-1251',
	'cp1252' => 'windows-1252',
	'ms-ee' => 'windows-1250',
	'ms-ansi' => 'windows-1252',
	'ms-greek' => 'windows-1253',
	'ms-turk' => 'windows-1254',
	'winbaltrim' => 'windows-1257',
	'koi-8ru' => 'koi-8r',
	'koi8r' => 'koi-8r',
	'cp878' => 'koi-8r',
	'mac' => 'macroman',
	'macintosh' => 'macroman',
	'euc-cn' => 'gb2312',
	'x-euc-cn' => 'gb2312',
	'euccn' => 'gb2312',
	'cp936' => 'gb2312',
	'big-5' => 'big5',
	'cp950' => 'big5',
	'eucjp' => 'euc-jp',
	'sjis' => 'shift_jis',
	'shift-jis' => 'shift_jis',
	'cp932' => 'shift_jis',
	'cp949' => 'euc-kr',
	'utf7' => 'utf-7',
	'utf8' => 'utf-8',
	'utf16' => 'utf-16',
	'utf32' => 'utf-32',
	'ucs2' => 'ucs-2',
	'ucs4' => 'ucs-4',
);
193
// Mapping of language identifiers to script names.
// The script names are the keys of $script_to_charset_unix / $script_to_charset_windows;
// get_locale_charset() falls back to a western default for any value that is not a key there.
var $lang_to_script = array(
	// iso-639-1 language codes, see http://www.loc.gov/standards/iso639-2/php/code_list.php
	'af' => 'west_european', // Afrikaans
	'ar' => 'arabic',
	'bg' => 'cyrillic', // Bulgarian
	'bs' => 'east_european', // Bosnian
	'cs' => 'east_european', // Czech
	'da' => 'west_european', // Danish
	'de' => 'west_european', // German
	'el' => 'greek', // Greek (the ISO 639-1 code; 'gr' below is kept for backwards compatibility)
	'es' => 'west_european', // Spanish
	'et' => 'estonian',
	'eo' => 'unicode', // Esperanto
	'eu' => 'west_european', // Basque
	'fa' => 'arabic', // Persian
	'fi' => 'west_european', // Finnish
	'fo' => 'west_european', // Faroese
	'fr' => 'west_european', // French
	'ga' => 'west_european', // Irish
	'gl' => 'west_european', // Galician
	'gr' => 'greek',
	'he' => 'hebrew', // Hebrew (since 1998)
	'hi' => 'unicode', // Hindi
	'hr' => 'east_european', // Croatian
	'hu' => 'east_european', // Hungarian
	'iw' => 'hebrew', // Hebrew (til 1998)
	'is' => 'west_european', // Icelandic
	'it' => 'west_european', // Italian
	'ja' => 'japanese',
	'ka' => 'unicode', // Georgian
	'kl' => 'west_european', // Greenlandic
	'km' => 'unicode', // Khmer
	'ko' => 'korean',
	'lt' => 'lithuanian',
	'lv' => 'west_european', // Latvian/Lettish
	'nl' => 'west_european', // Dutch
	'no' => 'west_european', // Norwegian
	'nb' => 'west_european', // Norwegian Bokmal
	'nn' => 'west_european', // Norwegian Nynorsk
	'pl' => 'east_european', // Polish
	'pt' => 'west_european', // Portuguese
	'ro' => 'east_european', // Romanian
	'ru' => 'cyrillic', // Russian
	'sk' => 'east_european', // Slovak
	'sl' => 'east_european', // Slovenian
	'sr' => 'cyrillic', // Serbian
	'sv' => 'west_european', // Swedish
	'sq' => 'albanian', // Albanian
	'th' => 'thai',
	'tr' => 'turkish', // Turkish
	'uk' => 'cyrillic', // Ukrainian
	'vi' => 'vietnamese',
	'zh' => 'chinese',
	// MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
	// http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
	'afk' => 'west_european', // Afrikaans
	'ara' => 'arabic',
	'bgr' => 'cyrillic', // Bulgarian
	'cat' => 'west_european', // Catalan
	'chs' => 'simpl_chinese',
	'cht' => 'trad_chinese',
	'csy' => 'east_european', // Czech
	'dan' => 'west_european', // Danish
	'deu' => 'west_european', // German
	'dea' => 'west_european', // German (Austrian)
	'des' => 'west_european', // German (Swiss)
	'ena' => 'west_european', // English (Australian)
	'enc' => 'west_european', // English (Canadian)
	'eng' => 'west_european', // English
	'enz' => 'west_european', // English (New Zealand)
	'enu' => 'west_european', // English (United States)
	'euq' => 'west_european', // Basque
	'fos' => 'west_european', // Faroese
	'far' => 'arabic', // Persian
	'fin' => 'west_european', // Finnish
	'fra' => 'west_european', // French
	'frb' => 'west_european', // French (Belgian)
	'frc' => 'west_european', // French (Canadian)
	'frs' => 'west_european', // French (Swiss)
	'geo' => 'unicode', // Georgian
	'glg' => 'west_european', // Galician
	'ell' => 'greek',
	'heb' => 'hebrew',
	'hin' => 'unicode', // Hindi
	'hun' => 'east_european', // Hungarian
	'isl' => 'west_european', // Icelandic
	'ita' => 'west_european', // Italian
	'its' => 'west_european', // Italian (Swiss)
	'jpn' => 'japanese',
	'khm' => 'unicode', // Khmer
	'kor' => 'korean',
	'lth' => 'lithuanian',
	'lvi' => 'west_european', // Latvian/Lettish
	'msl' => 'west_european', // Malay
	'nlb' => 'west_european', // Dutch (Belgian)
	'nld' => 'west_european', // Dutch
	'nor' => 'west_european', // Norwegian (bokmal)
	'non' => 'west_european', // Norwegian (nynorsk)
	'plk' => 'east_european', // Polish
	'ptg' => 'west_european', // Portuguese
	'ptb' => 'west_european', // Portuguese (Brazil)
	'rom' => 'east_european', // Romanian
	'rus' => 'cyrillic', // Russian
	'slv' => 'east_european', // Slovenian
	'sky' => 'east_european', // Slovak
	'srl' => 'east_european', // Serbian (Latin)
	'srb' => 'cyrillic', // Serbian (Cyrillic)
	'esp' => 'west_european', // Spanish (trad. sort)
	'esm' => 'west_european', // Spanish (Mexican)
	'esn' => 'west_european', // Spanish (internat. sort)
	'sve' => 'west_european', // Swedish
	'sqi' => 'albanian', // Albanian
	'tha' => 'thai',
	'trk' => 'turkish',
	'ukr' => 'cyrillic', // Ukrainian
	// English language names
	'afrikaans' => 'west_european',
	'albanian' => 'albanian',
	'arabic' => 'arabic',
	'basque' => 'west_european',
	'bosnian' => 'east_european',
	'bulgarian' => 'cyrillic', // same script as the 'bg' / 'bgr' codes above
	'catalan' => 'west_european',
	'croatian' => 'east_european',
	'czech' => 'east_european',
	'danish' => 'west_european',
	'dutch' => 'west_european',
	'english' => 'west_european',
	'esperanto' => 'unicode',
	'estonian' => 'estonian',
	'faroese' => 'west_european',
	'farsi' => 'arabic',
	'finnish' => 'west_european',
	'french' => 'west_european',
	'galician' => 'west_european',
	'georgian' => 'unicode',
	'german' => 'west_european',
	'greek' => 'greek',
	'greenlandic' => 'west_european',
	'hebrew' => 'hebrew',
	'hindi' => 'unicode',
	'hungarian' => 'east_european',
	'icelandic' => 'west_european',
	'italian' => 'west_european',
	'khmer' => 'unicode',
	'latvian' => 'west_european',
	'lettish' => 'west_european',
	'lithuanian' => 'lithuanian',
	'malay' => 'west_european',
	'norwegian' => 'west_european',
	'persian' => 'arabic',
	'polish' => 'east_european',
	'portuguese' => 'west_european',
	'russian' => 'cyrillic',
	'romanian' => 'east_european',
	'serbian' => 'cyrillic',
	'slovak' => 'east_european',
	'slovenian' => 'east_european',
	'spanish' => 'west_european',
	'swedish' => 'west_european',
	'thai' => 'thai',
	'turkish' => 'turkish',
	'ukrainian' => 'cyrillic',
);
357
358 // mapping of language (family) names to charsets on Unix
359 var $script_to_charset_unix = array(
360 'west_european' => 'iso-8859-1',
361 'estonian' => 'iso-8859-1',
362 'east_european' => 'iso-8859-2',
363 'baltic' => 'iso-8859-4',
364 'cyrillic' => 'iso-8859-5',
365 'arabic' => 'iso-8859-6',
366 'greek' => 'iso-8859-7',
367 'hebrew' => 'iso-8859-8',
368 'turkish' => 'iso-8859-9',
369 'thai' => 'iso-8859-11', // = TIS-620
370 'lithuanian' => 'iso-8859-13',
371 'chinese' => 'gb2312', // = euc-cn
372 'japanese' => 'euc-jp',
373 'korean' => 'euc-kr',
374 'simpl_chinese' => 'gb2312',
375 'trad_chinese' => 'big5',
376 'vietnamese' => '',
377 'unicode' => 'utf-8',
378 'albanian' => 'utf-8'
379 );
380
381 // mapping of language (family) names to charsets on Windows
382 var $script_to_charset_windows = array(
383 'east_european' => 'windows-1250',
384 'cyrillic' => 'windows-1251',
385 'west_european' => 'windows-1252',
386 'greek' => 'windows-1253',
387 'turkish' => 'windows-1254',
388 'hebrew' => 'windows-1255',
389 'arabic' => 'windows-1256',
390 'baltic' => 'windows-1257',
391 'estonian' => 'windows-1257',
392 'lithuanian' => 'windows-1257',
393 'vietnamese' => 'windows-1258',
394 'thai' => 'cp874',
395 'korean' => 'cp949',
396 'chinese' => 'gb2312',
397 'japanese' => 'shift_jis',
398 'simpl_chinese' => 'gb2312',
399 'trad_chinese' => 'big5',
400 'albanian' => 'windows-1250',
401 'unicode' => 'utf-8'
402 );
403
// Mapping of locale names to charsets.
// Keys are written in lowercase because get_locale_charset() lowercases the
// locale string before looking it up here; a mixed-case key would never match.
var $locale_to_charset = array(
	'japanese.euc' => 'euc-jp',
	'ja_jp.ujis' => 'euc-jp',
	'korean.euc' => 'euc-kr',
	'sr@latn' => 'iso-8859-2',
	'zh_cn' => 'gb2312',
	'zh_hk' => 'big5',
	'zh_tw' => 'big5',
);
414
415 // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
416 // Empty values means "iso-8859-1"
417 var $charSetArray = array(
418 'af' => '',
419 'ar' => 'iso-8859-6',
420 'ba' => 'iso-8859-2',
421 'bg' => 'windows-1251',
422 'br' => '',
423 'ca' => 'iso-8859-15',
424 'ch' => 'gb2312',
425 'cs' => 'windows-1250',
426 'cz' => 'windows-1250',
427 'da' => '',
428 'de' => '',
429 'dk' => '',
430 'el' => 'iso-8859-7',
431 'eo' => 'utf-8',
432 'es' => '',
433 'et' => 'iso-8859-4',
434 'eu' => '',
435 'fa' => 'utf-8',
436 'fi' => '',
437 'fo' => 'utf-8',
438 'fr' => '',
439 'fr_CA' => '',
440 'ga' => '',
441 'ge' => 'utf-8',
442 'gl' => '',
443 'gr' => 'iso-8859-7',
444 'he' => 'utf-8',
445 'hi' => 'utf-8',
446 'hk' => 'big5',
447 'hr' => 'windows-1250',
448 'hu' => 'iso-8859-2',
449 'is' => 'utf-8',
450 'it' => '',
451 'ja' => 'shift_jis',
452 'jp' => 'shift_jis',
453 'ka' => 'utf-8',
454 'kl' => 'utf-8',
455 'km' => 'utf-8',
456 'ko' => 'euc-kr',
457 'kr' => 'euc-kr',
458 'lt' => 'windows-1257',
459 'lv' => 'utf-8',
460 'ms' => '',
461 'my' => '',
462 'nl' => '',
463 'no' => '',
464 'pl' => 'iso-8859-2',
465 'pt' => '',
466 'pt_BR' => '',
467 'qc' => '',
468 'ro' => 'iso-8859-2',
469 'ru' => 'windows-1251',
470 'se' => '',
471 'si' => 'windows-1250',
472 'sk' => 'windows-1250',
473 'sl' => 'windows-1250',
474 'sq' => 'utf-8',
475 'sr' => 'utf-8',
476 'sv' => '',
477 'th' => 'iso-8859-11',
478 'tr' => 'iso-8859-9',
479 'ua' => 'windows-1251',
480 'uk' => 'windows-1251',
481 'vi' => 'utf-8',
482 'vn' => 'utf-8',
483 'zh' => 'big5',
484 );
485
486 // TYPO3 specific: Array with the iso names used for each system language in TYPO3:
487 // Missing keys means: same as TYPO3
488 // @deprecated since TYPO3 4.6, will be removed in TYPO3 4.8 - use t3lib_l10n_Locales::getIsoMapping()
489 var $isoArray = array(
490 'ba' => 'bs',
491 'br' => 'pt_BR',
492 'ch' => 'zh_CN',
493 'cz' => 'cs',
494 'dk' => 'da',
495 'si' => 'sl',
496 'se' => 'sv',
497 'gl' => 'kl',
498 'gr' => 'el',
499 'hk' => 'zh_HK',
500 'kr' => 'ko',
501 'ua' => 'uk',
502 'jp' => 'ja',
503 'qc' => 'fr_CA',
504 'vn' => 'vi',
505 'ge' => 'ka',
506 'ga' => 'gl',
507 );
508
/**
 * Default constructor.
 *
 * Obtains the t3lib_l10n_Locales instance through t3lib_div::makeInstance()
 * and keeps it in $this->locales.
 */
public function __construct() {
	$localesService = t3lib_div::makeInstance('t3lib_l10n_Locales');
	$this->locales = $localesService;
}
515
/**
 * Normalizes a charset name: lowercases it, trims surrounding whitespace and
 * replaces it by its entry in $this->synonyms if one exists.
 *
 * @param string Input charset
 * @return string Normalized charset
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function parse_charset($charset) {
	$normalized = strtolower($charset);
	$normalized = trim($normalized);

	// Known alias: hand back the name it stands for
	return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
}
531
/**
 * Get the charset of a locale.
 *
 * ln            language
 * ln_CN         language / country
 * ln_CN.cs      language / country / charset
 * ln_CN.cs@mod  language / country / charset / modifier
 *
 * Resolution order:
 *  1. an entry of $this->locale_to_charset matching the whole locale (case-insensitive)
 *  2. the charset part of the locale, normalized through parse_charset()
 *  3. 'iso-8859-15' if the modifier is 'euro'
 *  4. the charset of the language's script, taken from $this->script_to_charset_windows
 *     if TYPO3_OS is 'WIN' and from $this->script_to_charset_unix otherwise
 *  5. 'windows-1252' (WIN) or 'iso-8859-1' (otherwise) if nothing else matched
 *
 * @param string Locale string
 * @return string Charset resolved for locale string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function get_locale_charset($locale) {
	$locale = strtolower($locale);

	// Exact locale specific charset? The locale has been lowercased above,
	// so the table keys are lowercased for the comparison as well.
	foreach ($this->locale_to_charset as $knownLocale => $knownCharset) {
		if (strtolower($knownLocale) === $locale) {
			return $knownCharset;
		}
	}

	// Get modifier (the part is optional, so do not rely on it being present)
	$parts = explode('@', $locale);
	$locale = $parts[0];
	$modifier = isset($parts[1]) ? $parts[1] : '';

	// Locale contains charset: use it
	$parts = explode('.', $locale);
	$locale = $parts[0];
	$charset = isset($parts[1]) ? $parts[1] : '';
	if ($charset) {
		return $this->parse_charset($charset);
	}

	// Modifier is 'euro' (after charset check, because of xx.utf-8@euro)
	if ($modifier == 'euro') {
		return 'iso-8859-15';
	}

	// Get language; the country part is not needed for the lookup
	$parts = explode('_', $locale);
	$language = $parts[0];
	$script = isset($this->lang_to_script[$language]) ? $this->lang_to_script[$language] : '';

	if (TYPO3_OS == 'WIN') {
		$scriptCharsets = $this->script_to_charset_windows;
		$fallbackCharset = 'windows-1252';
	} else {
		$scriptCharsets = $this->script_to_charset_unix;
		$fallbackCharset = 'iso-8859-1';
	}

	// Unknown scripts and scripts with an empty charset entry use the fallback
	return !empty($scriptCharsets[$script]) ? $scriptCharsets[$script] : $fallbackCharset;
}
580
581
582 /********************************************
583 *
584 * Charset Conversion functions
585 *
586 ********************************************/
587
/**
 * Convert from one charset to another charset.
 *
 * If the target is 'utf-8' or no entity fallback is requested, the method
 * configured in $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']
 * ('mbstring', 'iconv' or 'recode') is tried first. If none is configured or
 * it returns FALSE, the conversion is done via utf8_encode() / utf8_decode().
 *
 * @param string Input string
 * @param string From charset (the current charset of the string)
 * @param string To charset (the output charset wanted)
 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Converted string
 * @see convArray()
 */
function conv($str, $fromCS, $toCS, $useEntityForNoChar = 0) {
	// Nothing to do when source and target are the same
	if ($fromCS == $toCS) {
		return $str;
	}

	// PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
	$mayUsePhpLibs = ($toCS == 'utf-8' || !$useEntityForNoChar);
	if ($mayUsePhpLibs) {
		$method = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'];
		$converted = FALSE;

		if ($method == 'mbstring') {
			// Returns FALSE for unsupported charsets
			$converted = mb_convert_encoding($str, $toCS, $fromCS);
		} elseif ($method == 'iconv') {
			$converted = iconv($fromCS, $toCS . '//TRANSLIT', $str);
		} elseif ($method == 'recode') {
			$converted = recode_string($fromCS . '..' . $toCS, $str);
		}

		if ($converted !== FALSE) {
			return $converted;
		}
		// Otherwise fall back to TYPO3 conversion
	}

	// TYPO3 conversion: source charset -> UTF-8 -> target charset
	$utf8 = ($fromCS != 'utf-8') ? $this->utf8_encode($str, $fromCS) : $str;

	return ($toCS != 'utf-8') ? $this->utf8_decode($utf8, $toCS, $useEntityForNoChar) : $utf8;
}
638
/**
 * Convert all elements in ARRAY with type string from one charset to another charset.
 * Nested arrays are processed recursively; values that are neither string nor array are left untouched.
 * NOTICE: Array is passed by reference!
 *
 * @param array Input array, possibly multidimensional
 * @param string From charset (the current charset of the string)
 * @param string To charset (the output charset wanted)
 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return void
 * @see conv()
 */
function convArray(&$array, $fromCS, $toCS, $useEntityForNoChar = 0) {
	foreach ($array as $key => $value) {
		if (is_string($value)) {
			// Leaf value: convert in place
			$array[$key] = $this->conv($value, $fromCS, $toCS, $useEntityForNoChar);
		} elseif (is_array($value)) {
			// Sub-array: descend, handing over the original element by reference
			$this->convArray($array[$key], $fromCS, $toCS, $useEntityForNoChar);
		}
	}
}
659
/**
 * Converts $str from $charset to UTF-8
 *
 * Bytes 0-127 are copied unchanged (except for the charsets in $this->twoByteSets,
 * where every character is read as a big endian byte pair). All other characters
 * are looked up in the parsed conversion table of the charset; characters without
 * a table entry are replaced by chr($this->noCharByteVal).
 *
 * @param string String in local charset to convert to UTF-8
 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return string Output string, converted to UTF-8. NULL if initCharset() reports that the charset could not be initialized.
 */
function utf8_encode($str, $charset) {

	if ($charset === 'utf-8') {
		return $str;
	}

	// Parse conv. table if not already done
	if (!$this->initCharset($charset)) {
		return NULL;
	}

	$strLen = strlen($str);
	$outStr = '';

	// Traverse each char in string
	for ($a = 0; $a < $strLen; $a++) {
		$chr = substr($str, $a, 1);
		$ord = ord($chr);

		if (isset($this->twoByteSets[$charset])) {
			// The charset has two bytes per char; assume big endian.
			// substr() is used instead of the removed $str{...} offset syntax and
			// also copes with a missing second byte at the end of the string.
			$a++;
			$ord = $ord << 8 | ord(substr($str, $a, 1));
		} elseif ($ord > 127) {
			// EUC uses two bytes above 127; we get both, advance the pointer and make $ord a 16bit int.
			// Shift-JIS: chars between 160 and 223 are single byte.
			if (isset($this->eucBasedSets[$charset]) && ($charset != 'shift_jis' || ($ord < 0xA0 || $ord > 0xDF))) {
				$a++;
				$ord = $ord * 256 + ord(substr($str, $a, 1));
			}
		} else {
			// Just ASCII 0-127 and one byte. Transparent
			$outStr .= $chr;
			continue;
		}

		// Use the conv. table entry of the local char number if there is one, otherwise the "no char" byte
		if (isset($this->parsedCharsets[$charset]['local'][$ord])) {
			$outStr .= $this->parsedCharsets[$charset]['local'][$ord];
		} else {
			$outStr .= chr($this->noCharByteVal);
		}
	}

	return $outStr;
}
712
/**
 * Converts $str from UTF-8 to $charset
 *
 * Bytes 0-127 are copied unchanged. Multibyte sequences are looked up in the parsed
 * conversion table of the charset. A sequence without a table entry becomes a
 * hexadecimal numeric entity if $useEntityForNoChar is set, otherwise
 * chr($this->noCharByteVal). A byte above 127 that cannot start a sequence is
 * replaced by chr($this->noCharByteVal) as well.
 *
 * @param string String in UTF-8 to convert to local charset
 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Output string, converted to local charset. NULL if initCharset() reports that the charset could not be initialized.
 */
function utf8_decode($str, $charset, $useEntityForNoChar = 0) {

	if ($charset === 'utf-8') {
		return $str;
	}

	// Parse conv. table if not already done
	if (!$this->initCharset($charset)) {
		return NULL;
	}

	$strLen = strlen($str);
	$outStr = '';

	// Traverse each char in UTF-8 string
	for ($a = 0; $a < $strLen; $a++) {
		$chr = substr($str, $a, 1);
		$ord = ord($chr);

		// Just ASCII 0-127 and one byte. Transparent
		if ($ord <= 127) {
			$outStr .= $chr;
			continue;
		}

		// The first byte of a sequence must have the 7th bit set.
		// Otherwise we are in the MIDDLE of a multibyte sequence: no char exists.
		if (!($ord & 64)) {
			$outStr .= chr($this->noCharByteVal);
			continue;
		}

		// Collect the sequence: every leading 1-bit after the first announces one more byte
		$buf = $chr;
		for ($b = 0; $b < 8; $b++) {
			$ord = $ord << 1;
			if (!($ord & 128)) {
				break;
			}
			$a++;
			$buf .= substr($str, $a, 1);
		}

		if (isset($this->parsedCharsets[$charset]['utf8'][$buf])) {
			// The local number of the UTF-8 sequence
			$mByte = $this->parsedCharsets[$charset]['utf8'][$buf];
			if ($mByte > 255) {
				// Greater than 255: split the number (16bit word assumed) in two chars
				$outStr .= chr(($mByte >> 8) & 255) . chr($mByte & 255);
			} else {
				$outStr .= chr($mByte);
			}
		} elseif ($useEntityForNoChar) {
			// Create num entity
			$outStr .= '&#' . $this->utf8CharToUnumber($buf, 1) . ';';
		} else {
			// No char exists
			$outStr .= chr($this->noCharByteVal);
		}
	}

	return $outStr;
}
771
/**
 * Converts all chars > 127 to numeric entities.
 *
 * Every multibyte sequence is replaced by '&#' . utf8CharToUnumber(sequence, 1) . ';'.
 * A byte above 127 that cannot start a sequence is replaced by chr($this->noCharByteVal).
 *
 * @param string Input string
 * @return string Output string
 */
function utf8_to_entities($str) {
	$length = strlen($str);
	$result = '';

	// Walk through the UTF-8 string byte by byte
	for ($pos = 0; $pos < $length; $pos++) {
		$char = substr($str, $pos, 1);
		$code = ord($char);

		// Plain ASCII 0-127: copied as is
		if ($code <= 127) {
			$result .= $char;
			continue;
		}

		// A lead byte needs its 7th bit set; otherwise this is the middle of a sequence
		if (!($code & 64)) {
			$result .= chr($this->noCharByteVal);
			continue;
		}

		// Gather the whole sequence: one further byte per leading 1-bit after the first
		$sequence = $char;
		for ($n = 0; $n < 8; $n++) {
			$code = $code << 1;
			if (!($code & 128)) {
				break;
			}
			$pos++;
			$sequence .= substr($str, $pos, 1);
		}

		$result .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
	}

	return $result;
}
809
/**
 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
 *
 * Numeric entities that are not well-formed (e.g. "&#;" or "&#x;") are left untouched.
 * Named entities are decoded with html_entity_decode() straight to UTF-8, so the
 * result does not depend on the default charset of the PHP version in use.
 *
 * @param string Input string, UTF-8
 * @param boolean If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
 * @return string Output string
 */
function entities_to_utf8($str, $alsoStdHtmlEnt = 0) {
	// Odd indexes hold the text between "&" and ";" of each entity, even indexes the text in between
	$parts = preg_split('/&([#[:alnum:]]*);/', $str, -1, PREG_SPLIT_DELIM_CAPTURE);
	if ($parts === FALSE) {
		// The string could not be split: nothing can be converted
		return $str;
	}

	foreach ($parts as $k => $v) {
		if (!($k % 2)) {
			continue;
		}

		$entity = '&' . $v . ';';
		$matches = array();

		if (substr($v, 0, 1) == '#') {
			// Dec or hex entities:
			if (preg_match('/^#[xX]([0-9a-fA-F]+)$/', $v, $matches)) {
				$parts[$k] = $this->UnumberToChar(hexdec($matches[1]));
			} elseif (preg_match('/^#([0-9]+)$/', $v, $matches)) {
				// "+ 0" turns the digits into a number without truncating large values
				$parts[$k] = $this->UnumberToChar($matches[1] + 0);
			} else {
				// Malformed numeric entity. No conversion:
				$parts[$k] = $entity;
			}
		} elseif ($alsoStdHtmlEnt) {
			// Other entities; html_entity_decode() returns unknown entities unchanged
			$parts[$k] = html_entity_decode($entity, ENT_COMPAT, 'UTF-8');
		} else {
			// No conversion:
			$parts[$k] = $entity;
		}
	}

	return implode('', $parts);
}
842
/**
 * Converts all chars in the input UTF-8 string into integer numbers returned in an array
 *
 * A byte above 127 that cannot start a multibyte sequence is reported as
 * $this->noCharByteVal (or the corresponding char if $retChar is set).
 *
 * @param string Input string, UTF-8
 * @param boolean If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
 * @param boolean If set, then instead of integer numbers the real UTF-8 char is returned.
 * @return array Output array with the char numbers
 */
function utf8_to_numberarray($str, $convEntities = 0, $retChar = 0) {
	// Entities are turned into real characters first, if requested
	if ($convEntities) {
		$str = $this->entities_to_utf8($str, 1);
	}

	$length = strlen($str);
	$result = array();

	// Walk through the UTF-8 string byte by byte
	for ($pos = 0; $pos < $length; $pos++) {
		$char = substr($str, $pos, 1);
		$code = ord($char);

		// Plain ASCII 0-127: one byte, one character
		if ($code <= 127) {
			$result[] = $retChar ? chr($code) : $code;
			continue;
		}

		// A lead byte needs its 7th bit set; otherwise this is the middle of a sequence
		if (!($code & 64)) {
			$result[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
			continue;
		}

		// Gather the whole sequence: one further byte per leading 1-bit after the first
		$sequence = $char;
		for ($n = 0; $n < 8; $n++) {
			$code = $code << 1;
			if (!($code & 128)) {
				break;
			}
			$pos++;
			$sequence .= substr($str, $pos, 1);
		}

		$result[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
	}

	return $result;
}
887
/**
 * Converts a UNICODE number to a UTF-8 multibyte character
 * Algorithm based on script found at From: http://czyborra.com/utf/
 * Unit-tested by Kasper
 *
 * The binary representation of the character's integer value is spread across the bytes
 * and the number of high bits set in the lead byte announces the number of bytes in the
 * multibyte sequence:
 *
 *  bytes | bits | representation
 *      1 |    7 | 0vvvvvvv
 *      2 |   11 | 110vvvvv 10vvvvvv
 *      3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
 *      4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * Numbers of 0x80000000 and above yield chr($this->noCharByteVal).
 *
 * @param integer UNICODE integer
 * @return string UTF-8 multibyte character string
 * @see utf8CharToUnumber()
 */
function UnumberToChar($cbyte) {
	// 1 byte
	if ($cbyte < 0x80) {
		return chr($cbyte);
	}

	// 2 bytes
	if ($cbyte < 0x800) {
		return chr(0xC0 | ($cbyte >> 6))
			. chr(0x80 | ($cbyte & 0x3F));
	}

	// 3 bytes
	if ($cbyte < 0x10000) {
		return chr(0xE0 | ($cbyte >> 12))
			. chr(0x80 | (($cbyte >> 6) & 0x3F))
			. chr(0x80 | ($cbyte & 0x3F));
	}

	// 4 bytes
	if ($cbyte < 0x200000) {
		return chr(0xF0 | ($cbyte >> 18))
			. chr(0x80 | (($cbyte >> 12) & 0x3F))
			. chr(0x80 | (($cbyte >> 6) & 0x3F))
			. chr(0x80 | ($cbyte & 0x3F));
	}

	// 5 bytes
	if ($cbyte < 0x4000000) {
		return chr(0xF8 | ($cbyte >> 24))
			. chr(0x80 | (($cbyte >> 18) & 0x3F))
			. chr(0x80 | (($cbyte >> 12) & 0x3F))
			. chr(0x80 | (($cbyte >> 6) & 0x3F))
			. chr(0x80 | ($cbyte & 0x3F));
	}

	// 6 bytes
	if ($cbyte < 0x80000000) {
		return chr(0xFC | ($cbyte >> 30))
			. chr(0x80 | (($cbyte >> 24) & 0x3F))
			. chr(0x80 | (($cbyte >> 18) & 0x3F))
			. chr(0x80 | (($cbyte >> 12) & 0x3F))
			. chr(0x80 | (($cbyte >> 6) & 0x3F))
			. chr(0x80 | ($cbyte & 0x3F));
	}

	// Cannot express a 32-bit character in UTF-8
	return chr($this->noCharByteVal);
}
952
/**
 * Converts the first UTF-8 (multibyte) character of a string to its UNICODE number.
 *
 * The number of leading 1-bits of the first byte tells how many continuation bytes follow; the
 * payload bits of the lead byte and the low six bits of each continuation byte are concatenated
 * as a binary string and converted to an integer.
 *
 * @param string $str UTF-8 multibyte character string
 * @param boolean $hex If set, the number is returned as a hexadecimal string prefixed with "x"
 * @return integer|string UNICODE integer (or "x" + hex digits if $hex is set)
 * @see UnumberToChar()
 */
function utf8CharToUnumber($str, $hex = 0) {
	$leadByte = ord(substr($str, 0, 1));

	if (($leadByte & 192) != 192) {
		// Not the start of a multibyte sequence: the byte value is the number
		$int = $leadByte;
	} else {
		$continuationBits = '';
		$shifted = $leadByte;
		$b = 0;
		// Every further 1-bit behind the top bit of the lead byte announces one continuation byte
		while ($b < 8) {
			$shifted = $shifted << 1;
			if (!($shifted & 128)) {
				break;
			}
			// Each continuation byte contributes its low six bits
			$continuationBits .= substr('00000000' . decbin(ord(substr($str, $b + 1, 1))), -6);
			$b++;
		}
		// The lead byte contributes the (6 - $b) bits that follow its length marker
		$leadBits = substr('00000000' . decbin($leadByte), -(6 - $b));

		$int = bindec($leadBits . $continuationBits);
	}

	if ($hex) {
		return 'x' . dechex($int);
	}
	return $int;
}
984
985
986 /********************************************
987 *
988 * Init functions
989 *
990 ********************************************/
991
/**
 * Initializes a charset for use if a conversion table exists in the PATH_t3lib.'csconvtbl/' folder.
 * This function is automatically called by the conversion functions.
 *
 * The parsed table is stored in $this->parsedCharsets[$charset] with the sub-arrays 'local'
 * (byte value => UTF-8 character) and 'utf8' (UTF-8 character => byte value). Only byte values
 * above 127 are stored. A serialized copy is cached in typo3temp/cs/.
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param string $charset The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return integer Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed (or read from cache).
 * @access private
 */
function initCharset($charset) {
	// Nothing to do if the charset has been loaded before
	if (is_array($this->parsedCharsets[$charset])) {
		return 1;
	}

	// The conversion table must exist
	$charsetConvTableFile = PATH_t3lib . 'csconvtbl/' . $charset . '.tbl';
	if (!$charset || !t3lib_div::validPathStr($charsetConvTableFile) || !@is_file($charsetConvTableFile)) {
		return FALSE;
	}

	// Cache file for charsets:
	// Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_' . $charset . '.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->parsedCharsets[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

	// The "whitespaced" syntax is "0x0A 0x000A #LINE FEED", while "ms-token" looks like "B9 = U+00B9 : SUPERSCRIPT ONE"
	$whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';

	$lines = t3lib_div::trimExplode(LF, t3lib_div::getUrl($charsetConvTableFile), 1);
	$this->parsedCharsets[$charset] = array('local' => array(), 'utf8' => array());
	$detectedType = '';

	foreach ($lines as $value) {
		// Blank lines and comment lines are ignored
		if (!trim($value) || substr($value, 0, 1) == '#') {
			continue;
		}

		// The file type is detected once, on the first real line
		if (!$detectedType) {
			$detectedType = preg_match($whitespacedPattern, $value) ? 'whitespaced' : 'ms-token';
		}

		if ($detectedType == 'ms-token') {
			list($hexbyte, $utf8) = preg_split('/[=:]/', $value, 3);
		} elseif ($detectedType == 'whitespaced') {
			$regA = array();
			preg_match($whitespacedPattern, $value, $regA);
			$hexbyte = $regA[1];
			$utf8 = 'U+' . $regA[2];
		}

		$decval = hexdec(trim($hexbyte));
		if ($decval > 127) {
			// Strip the leading "U+" before converting the code point
			$utf8decval = hexdec(substr(trim($utf8), 2));
			$utf8Char = $this->UnumberToChar($utf8decval);
			$this->parsedCharsets[$charset]['local'][$decval] = $utf8Char;
			$this->parsedCharsets[$charset]['utf8'][$utf8Char] = $decval;
		}
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->parsedCharsets[$charset]));
	}

	return 2;
}
1060
/**
 * This function initializes all UTF-8 character data tables.
 *
 * For the modes "case" and "ascii" an already loaded table or a cached table from typo3temp/cs/ is
 * used if available. Otherwise unidata/UnicodeData.txt (and, if present, unidata/SpecialCasing.txt
 * and unidata/Translit.txt) below PATH_t3lib is parsed, both $this->caseFolding['utf-8'] and
 * $this->toASCII['utf-8'] are rebuilt and written to their cache files.
 *
 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
 *
 * @param string $mode Mode ("case", "ascii", ...)
 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initUnicodeData($mode = NULL) {
	// Cache files
	$cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
	$cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');

	// Only process if the tables are not yet loaded
	switch ($mode) {
		case 'case':
			if (is_array($this->caseFolding['utf-8'])) {
				return 1;
			}

			// Use cached version if possible
			if ($cacheFileCase && @is_file($cacheFileCase)) {
				$this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase));
				return 2;
			}
			break;

		case 'ascii':
			if (is_array($this->toASCII['utf-8'])) {
				return 1;
			}

			// Use cached version if possible
			if ($cacheFileASCII && @is_file($cacheFileASCII)) {
				$this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII));
				return 2;
			}
			break;
	}

	// Process main Unicode data file
	$unicodeDataFile = PATH_t3lib . 'unidata/UnicodeData.txt';
	if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
		return FALSE;
	}

	$fh = fopen($unicodeDataFile, 'rb');
	if (!$fh) {
		return FALSE;
	}

	// key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
	// note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
	$this->caseFolding['utf-8'] = array();
	$utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand
	$utf8CaseFolding['toUpper'] = array();
	$utf8CaseFolding['toLower'] = array();
	$utf8CaseFolding['toTitle'] = array();

	$decomposition = array(); // array of temp. decompositions
	$mark = array(); // array of chars that are marks (eg. composing accents)
	$number = array(); // array of chars that are numbers (eg. digits)
	$omit = array(); // array of chars to be omitted (eg. Russian hard sign)

	while (!feof($fh)) {
		$line = fgets($fh, 4096);
		if ($line === FALSE) {
			// End of file or read error: stop instead of looping on a handle that never reaches EOF
			break;
		}
		if (trim($line) === '') {
			// Blank lines carry no data
			continue;
		}
		// has a lot of info
		list($char, $name, $cat, , , $decomp, , , $num, , , , $upper, $lower, $title,) = explode(';', rtrim($line));

		$ord = hexdec($char);
		if ($ord > 0xFFFF) {
			// only process the BMP
			break;
		}

		$utf8_char = $this->UnumberToChar($ord);

		if ($upper) {
			$utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
		}
		if ($lower) {
			$utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
		}
		// store "title" only when different from "upper" (only a few)
		if ($title && $title != $upper) {
			$utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
		}

		// First letter of the general category; substr() instead of the removed {}-offset syntax
		switch (substr($cat, 0, 1)) {
			case 'M': // mark (accent, umlaut, ...)
				$mark["U+$char"] = 1;
				break;

			case 'N': // numeric value
				if ($ord > 0x80 && $num != '') {
					$number["U+$char"] = $num;
				}
		}

		// accented Latin letters without "official" decomposition
		$match = array();
		if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
			$c = ord($match[2]);
			if ($match[1] == 'SMALL') {
				$c += 32;
			}

			$decomposition["U+$char"] = array(dechex($c));
			continue;
		}

		$match = array();
		if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
			switch ($match[1]) {
				case '<circle>': // add parenthesis as circle replacement, eg (1)
					$match[2] = '0028 ' . $match[2] . ' 0029';
					break;

				case '<square>': // add square brackets as square replacement, eg [1]
					$match[2] = '005B ' . $match[2] . ' 005D';
					break;

				case '<compat>': // ignore multi char decompositions that start with a space
					if (preg_match('/^0020 /', $match[2])) {
						continue 2;
					}
					break;

				// ignore Arabic and vertical layout presentation decomposition
				case '<initial>':
				case '<medial>':
				case '<final>':
				case '<isolated>':
				case '<vertical>':
					continue 2;
			}
			$decomposition["U+$char"] = explode(' ', $match[2]);
		}
	}
	fclose($fh);

	// Process additional Unicode data for casing (allow folded characters to expand into a sequence)
	$specialCasingFile = PATH_t3lib . 'unidata/SpecialCasing.txt';
	if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
		$fh = fopen($specialCasingFile, 'rb');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh, 4096);
				if ($line === FALSE) {
					break;
				}
				if (substr($line, 0, 1) != '#' && trim($line) != '') {

					list($char, $lower, $title, $upper, $cond) = t3lib_div::trimExplode(';', $line);
					// Only unconditional mappings are used (the fifth field is empty or a comment)
					if ($cond == '' || substr($cond, 0, 1) == '#') {
						$utf8_char = $this->UnumberToChar(hexdec($char));
						if ($char != $lower) {
							$arr = explode(' ', $lower);
							for ($i = 0; isset($arr[$i]); $i++) {
								$arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
							}
							$utf8CaseFolding['toLower'][$utf8_char] = implode('', $arr);
						}
						if ($char != $title && $title != $upper) {
							$arr = explode(' ', $title);
							for ($i = 0; isset($arr[$i]); $i++) {
								$arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
							}
							$utf8CaseFolding['toTitle'][$utf8_char] = implode('', $arr);
						}
						if ($char != $upper) {
							$arr = explode(' ', $upper);
							for ($i = 0; isset($arr[$i]); $i++) {
								$arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
							}
							$utf8CaseFolding['toUpper'][$utf8_char] = implode('', $arr);
						}
					}
				}
			}
			fclose($fh);
		}
	}

	// Process custom decompositions
	$customTranslitFile = PATH_t3lib . 'unidata/Translit.txt';
	if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
		$fh = fopen($customTranslitFile, 'rb');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh, 4096);
				if ($line === FALSE) {
					break;
				}
				if (substr($line, 0, 1) != '#' && trim($line) != '') {
					list($char, $translit) = t3lib_div::trimExplode(';', $line);
					// An empty transliteration means: drop the character entirely
					if (!$translit) {
						$omit["U+$char"] = 1;
					}
					$decomposition["U+$char"] = explode(' ', $translit);
				}
			}
			fclose($fh);
		}
	}

	// Decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
	foreach ($decomposition as $from => $to) {
		$code_decomp = array();

		while ($code_value = array_shift($to)) {
			if (isset($decomposition["U+$code_value"])) { // do recursive decomposition
				foreach (array_reverse($decomposition["U+$code_value"]) as $cv) {
					array_unshift($to, $cv);
				}
			} elseif (!isset($mark["U+$code_value"])) { // remove mark
				array_push($code_decomp, $code_value);
			}
		}
		if (count($code_decomp) || isset($omit[$from])) {
			$decomposition[$from] = $code_decomp;
		} else {
			unset($decomposition[$from]);
		}
	}

	// Create ascii only mapping
	$this->toASCII['utf-8'] = array();
	$ascii =& $this->toASCII['utf-8'];

	foreach ($decomposition as $from => $to) {
		$code_decomp = array();
		while ($code_value = array_shift($to)) {
			$ord = hexdec($code_value);
			if ($ord > 127) {
				// skip decompositions containing non-ASCII chars
				continue 2;
			}
			array_push($code_decomp, chr($ord));
		}
		// Keys look like "U+00E4": strip the prefix so hexdec() only sees hex digits
		$ascii[$this->UnumberToChar(hexdec(substr($from, 2)))] = implode('', $code_decomp);
	}

	// Add numeric decompositions
	foreach ($number as $from => $to) {
		$utf8_char = $this->UnumberToChar(hexdec(substr($from, 2)));
		if (!isset($ascii[$utf8_char])) {
			$ascii[$utf8_char] = $to;
		}
	}

	if ($cacheFileCase) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
	}

	if ($cacheFileASCII) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
	}

	return 3;
}
1319
/**
 * This function initializes the folding table for a charset other than UTF-8.
 * This function is automatically called by the case folding functions.
 *
 * The table is derived from the UTF-8 case folding table: every non-ASCII character of the charset
 * is looked up there and the result is converted back to the charset. The ASCII letters are added
 * afterwards. The result is cached in typo3temp/cs/.
 *
 * @param string $charset Charset for which to initialize case folding.
 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initCaseFolding($charset) {
	// Only process if the case table is not yet loaded
	if (is_array($this->caseFolding[$charset])) {
		return 1;
	}

	// Use cached version if possible
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_' . $charset . '.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

	// Both the charset's UTF-8 conversion table and the UTF-8 case folding (the base table) are required
	if (!$this->initCharset($charset) || !$this->initUnicodeData('case')) {
		return FALSE;
	}

	$nochar = chr($this->noCharByteVal);
	$foldingTypes = array('toUpper', 'toLower', 'toTitle');
	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
		// Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);

		foreach ($foldingTypes as $foldingType) {
			$cc = $this->utf8_decode($this->caseFolding['utf-8'][$foldingType][$utf8], $charset);
			// Keep only mappings that can be expressed in the target charset
			if ($cc != '' && $cc != $nochar) {
				$this->caseFolding[$charset][$foldingType][$c] = $cc;
			}
		}
	}

	// Add the ASCII case table
	foreach (range('a', 'z') as $lowerLetter) {
		$this->caseFolding[$charset]['toUpper'][$lowerLetter] = chr(ord($lowerLetter) - 32);
	}
	foreach (range('A', 'Z') as $upperLetter) {
		$this->caseFolding[$charset]['toLower'][$upperLetter] = chr(ord($upperLetter) + 32);
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->caseFolding[$charset]));
	}

	return 3;
}
1389
/**
 * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
 * This function is automatically called by the ASCII transliteration functions.
 *
 * The table is derived from the UTF-8 to-ASCII table by converting each character of the charset
 * that has a transliteration back to the charset. The result is cached in typo3temp/cs/.
 *
 * @param string $charset Charset for which to initialize conversion.
 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initToASCII($charset) {
	// Only process if the to-ASCII table is not yet loaded
	if (is_array($this->toASCII[$charset])) {
		return 1;
	}

	// Use cached version if possible
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_' . $charset . '.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->toASCII[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

	// Both the charset's UTF-8 conversion table and the UTF-8/ASCII transliteration (the base table) are required
	if (!$this->initCharset($charset) || !$this->initUnicodeData('ascii')) {
		return FALSE;
	}

	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
		// Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);

		if (isset($this->toASCII['utf-8'][$utf8])) {
			$this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
		}
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->toASCII[$charset]));
	}

	return 3;
}
1437
1438
1439 /********************************************
1440 *
1441 * String operation functions
1442 *
1443 ********************************************/
1444
/**
 * Returns a part of a string.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * Depending on $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] the work is delegated to mbstring
 * or iconv; otherwise the internal UTF-8/EUC implementations or plain byte arithmetic are used.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param integer $start Start position (character position)
 * @param integer $len Length (in characters); NULL means "up to the end of the string"
 * @return string The substring
 * @see substr(), mb_substr()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function substr($charset, $string, $start, $len = NULL) {
	if ($len === 0 || $string === '') {
		return '';
	}

	$utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];

	if ($utils == 'mbstring') {
		// cannot omit $len, when specifying charset
		if ($len == NULL) {
			$enc = mb_internal_encoding(); // save internal encoding
			mb_internal_encoding($charset);
			$str = mb_substr($string, $start);
			mb_internal_encoding($enc); // restore internal encoding

			return $str;
		}
		return mb_substr($string, $start, $len, $charset);
	} elseif ($utils == 'iconv') {
		// cannot omit $len, when specifying charset
		if ($len == NULL) {
			$enc = iconv_get_encoding('internal_encoding'); // save internal encoding
			iconv_set_encoding('internal_encoding', $charset);
			$str = iconv_substr($string, $start);
			iconv_set_encoding('internal_encoding', $enc); // restore internal encoding

			return $str;
		}
		return iconv_substr($string, $start, $len, $charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_substr($string, $start, $len);
	} elseif ($this->eucBasedSets[$charset]) {
		return $this->euc_substr($string, $start, $charset, $len);
	} elseif ($this->twoByteSets[$charset]) {
		// An omitted length must not be multiplied: NULL * 2 is 0 and would yield an empty string
		return $len === NULL ? substr($string, $start * 2) : substr($string, $start * 2, $len * 2);
	} elseif ($this->fourByteSets[$charset]) {
		return $len === NULL ? substr($string, $start * 4) : substr($string, $start * 4, $len * 4);
	}

	// treat everything else as single-byte encoding
	return $len === NULL ? substr($string, $start) : substr($string, $start, $len);
}
1501
/**
 * Counts the number of characters.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @return integer The number of characters
 * @see strlen()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strlen($charset, $string) {
	$utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];

	// Delegate to a PHP extension if one is configured
	if ($utils == 'mbstring') {
		return mb_strlen($string, $charset);
	}
	if ($utils == 'iconv') {
		return iconv_strlen($string, $charset);
	}

	// Variable-width charsets handled by this class
	if ($charset == 'utf-8') {
		return $this->utf8_strlen($string);
	}
	if ($this->eucBasedSets[$charset]) {
		return $this->euc_strlen($string, $charset);
	}

	// Fixed-width charsets: byte length divided by the character width
	if ($this->twoByteSets[$charset]) {
		return strlen($string) / 2;
	}
	if ($this->fourByteSets[$charset]) {
		return strlen($string) / 4;
	}

	// treat everything else as single-byte encoding
	return strlen($string);
}
1529
/**
 * Method to crop strings using the mb_substr function.
 *
 * A positive $len keeps the first $len characters and appends $crop, a negative $len keeps the
 * last abs($len) characters and prepends $crop. The string is returned untouched if $len is zero
 * or the string is not longer than abs($len) characters.
 *
 * @param string $charset The character set
 * @param string $string String to be cropped
 * @param integer $len Crop length (in characters)
 * @param string $crop Crop signifier
 * @return string The shortened string
 * @see mb_strlen(), mb_substr()
 */
protected function cropMbstring($charset, $string, $len, $crop = '') {
	if (intval($len) === 0) {
		return $string;
	}

	$charCount = mb_strlen($string, $charset);
	if ($charCount <= abs($len)) {
		// Short enough already
		return $string;
	}

	return $len > 0
		? mb_substr($string, 0, $len, $charset) . $crop
		: $crop . mb_substr($string, $len, $charCount, $charset);
}
1553
/**
 * Truncates a string and pre-/appends a string.
 * Unit tested by Kasper
 *
 * A positive $len keeps the first $len characters and appends $crop, a negative $len keeps the
 * last abs($len) characters and prepends $crop. If the string does not need to be shortened it is
 * returned unchanged (without $crop).
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param integer $len Length (in characters)
 * @param string $crop Crop signifier
 * @return string The shortened string
 * @see substr(), mb_strimwidth()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function crop($charset, $string, $len, $crop = '') {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return $this->cropMbstring($charset, $string, $len, $crop);
	}

	if (intval($len) == 0) {
		return $string;
	}

	// Translate the character position into a byte position ($i)
	if ($charset == 'utf-8') {
		$i = $this->utf8_char2byte_pos($string, $len);
	} elseif ($this->eucBasedSets[$charset]) {
		$i = $this->euc_char2byte_pos($string, $len, $charset);
	} else {
		if ($len > 0) {
			$i = $len;
		} else {
			$i = strlen($string) + $len;
			if ($i <= 0) {
				$i = FALSE;
			}
		}
	}

	if ($i === FALSE) { // $len outside actual string length
		return $string;
	}

	$byteLength = strlen($string);
	if ($len > 0) {
		// Crop only if there is at least one byte at or behind the cut position
		if ($i >= 0 && $i < $byteLength) {
			return substr($string, 0, $i) . $crop;
		}
	} elseif ($i >= 1 && $i <= $byteLength) {
		// Crop only if there is at least one byte in front of the cut position
		return $crop . substr($string, $i);
	}

	return $string;
}
1616
/**
 * Cuts a string short at a given byte length.
 *
 * The cut never splits a character: for multi-byte charsets the length is reduced to the nearest
 * character boundary.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param integer $len The byte length
 * @return string The shortened string
 * @see mb_strcut()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strtrunc($charset, $string, $len) {
	if ($len <= 0) {
		return '';
	}

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strcut($string, 0, $len, $charset);
	}
	if ($charset == 'utf-8') {
		return $this->utf8_strtrunc($string, $len);
	}
	if ($this->eucBasedSets[$charset]) {
		return $this->euc_strtrunc($string, $len, $charset);
	}

	if ($this->twoByteSets[$charset]) {
		// don't cut at odd positions
		$len -= $len % 2;
	} elseif ($this->fourByteSets[$charset]) {
		// realign to position dividable by four
		$len -= $len % 4;
	}

	// treat everything else as single-byte encoding
	return substr($string, 0, $len);
}
1649
/**
 * Translates all characters of a string into their respective case values.
 * Unlike strtolower() and strtoupper() this method is locale independent.
 * Note that the string length may change!
 * eg. lower case German "ß" (sharp S) becomes upper case "SS"
 * Unit-tested by Kasper
 * Real case folding is language dependent, this method ignores this fact.
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert case for
 * @param string $case Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
 * @return string The converted string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper()
 */
function conv_case($charset, $string, $case) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return $case == 'toLower'
			? mb_strtolower($string, $charset)
			: mb_strtoupper($string, $charset);
	}

	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string, 'case', $case);
	}

	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string, $charset, 'case', $case);
	}

	// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string, $charset, 'case', $case);
}
1683
/**
 * Equivalent of lcfirst/ucfirst but using character set.
 *
 * Only the first character is converted, the rest of the string is returned as is.
 *
 * @param string $charset Character set of string
 * @param string $string Input string whose first character is converted
 * @param string $case Case keyword as accepted by conv_case()
 * @return string The string with its first character converted
 * @see t3lib_cs::conv_case()
 */
public function convCaseFirst($charset, $string, $case) {
	$head = $this->conv_case($charset, $this->substr($charset, $string, 0, 1), $case);
	$tail = $this->substr($charset, $string, 1);

	return $head . $tail;
}
1699
/**
 * Converts special chars (like æøåÆØÅ, umlauts etc) to ascii equivalents (usually double-bytes, like æ => ae etc.)
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert
 * @return string The converted string
 */
function specCharsToASCII($charset, $string) {
	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string, 'ascii');
	}

	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string, $charset, 'ascii');
	}

	// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string, $charset, 'ascii');
}
1719
1720
/**
 * Converts the language codes that we get from the client (usually HTTP_ACCEPT_LANGUAGE)
 * into a TYPO3-readable language code.
 *
 * The languages are tried in order of descending quality value; languages with the same quality
 * are tried in the order in which they appear in the list. For each language the full tag
 * (e.g. "de-de") is tried first, then the tag without its country part (e.g. "de").
 *
 * @param string $languageCodesList List of language codes. Something like 'de,en-us;q=0.9,de-de;q=0.7,es-cl;q=0.6,en;q=0.4,es;q=0.3,zh;q=0.1', see http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4
 * @return string A preferred language that TYPO3 supports, or "default" if none found
 * @author Benjamin Mack (benni.typo3.org)
 */
public function getPreferredClientLanguage($languageCodesList) {
	$allLanguageCodes = array();
	$selectedLanguage = 'default';

	// Get all languages where TYPO3 code is the same as the ISO code
	foreach ($this->charSetArray as $typo3Lang => $charSet) {
		$allLanguageCodes[$typo3Lang] = $typo3Lang;
	}

	// Get all languages where TYPO3 code differs from ISO code
	// or needs the country part
	// the iso codes will here overwrite the default typo3 language in the key
	foreach ($this->locales->getIsoMapping() as $typo3Lang => $isoLang) {
		$allLanguageCodes[$typo3Lang] = str_replace('_', '-', $isoLang);
	}

	// Move the iso codes to the keys (because we're comparing the keys with "isset" later on)
	$allLanguageCodes = array_flip($allLanguageCodes);

	// Collect the quality value of every requested language (defaults to 1.0)
	$preferredLanguages = t3lib_div::trimExplode(',', $languageCodesList);
	$qualityByLanguage = array();
	foreach ($preferredLanguages as $preferredLanguage) {
		$quality = 1.0;
		if (strpos($preferredLanguage, ';q=') !== FALSE) {
			list($preferredLanguage, $quality) = explode(';q=', $preferredLanguage);
		}
		$qualityByLanguage[$preferredLanguage] = $quality;
	}

	// Order by quality (highest first); the list position breaks ties so that the
	// result does not depend on the sort algorithm of the PHP version in use
	$orderedLanguages = array_keys($qualityByLanguage);
	if (count($orderedLanguages) > 1) {
		$qualities = array_map('floatval', array_values($qualityByLanguage));
		$positions = array_keys($orderedLanguages);
		array_multisort($qualities, SORT_DESC, SORT_NUMERIC, $positions, SORT_ASC, SORT_NUMERIC, $orderedLanguages);
	}

	// Loop through the languages, with the highest priority first
	foreach ($orderedLanguages as $preferredLanguage) {
		if (isset($allLanguageCodes[$preferredLanguage])) {
			$selectedLanguage = $allLanguageCodes[$preferredLanguage];
			break;
		}

		// Strip the country code from the end (tags without a country part stay as they are)
		$languageParts = explode('-', $preferredLanguage, 2);
		$baseLanguage = $languageParts[0];
		if (isset($allLanguageCodes[$baseLanguage])) {
			$selectedLanguage = $allLanguageCodes[$baseLanguage];
			break;
		}
	}

	// English is the TYPO3 default language
	if (!$selectedLanguage || $selectedLanguage == 'en') {
		$selectedLanguage = 'default';
	}
	return $selectedLanguage;
}
1781
1782
1783 /********************************************
1784 *
1785 * Internal string operation functions
1786 *
1787 ********************************************/
1788
/**
 * Maps all characters of a string in a single byte charset.
 *
 * Every byte of the string is looked up in the mapping table of the charset; bytes without an
 * entry are copied unchanged. If the table cannot be initialized or $mode is unknown, the string
 * is returned as is.
 *
 * @param string $str The string
 * @param string $charset The charset
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function sb_char_mapping($str, $charset, $mode, $opt = '') {
	switch ($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) {
				// do nothing
				return $str;
			}
			$map =& $this->caseFolding[$charset][$opt];
			break;

		case 'ascii':
			if (!$this->initToASCII($charset)) {
				// do nothing
				return $str;
			}
			$map =& $this->toASCII[$charset];
			break;

		default:
			return $str;
	}

	$out = '';
	// Bound the loop by the byte length instead of probing for an out-of-range offset
	$byteLength = strlen($str);
	for ($i = 0; $i < $byteLength; $i++) {
		$c = $str[$i];
		$out .= isset($map[$c]) ? $map[$c] : $c;
	}

	return $out;
}
1831
1832
1833 /********************************************
1834 *
1835 * Internal UTF-8 string operation functions
1836 *
1837 ********************************************/
1838
/**
 * Returns a part of a UTF-8 string.
 * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
 *
 * @param string $str UTF-8 string
 * @param integer $start Start position (character position)
 * @param integer $len Length (in characters); NULL means "up to the end of the string"
 * @return string|boolean The substring, or FALSE if a positive $start lies outside the string
 * @see substr()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_substr($str, $start, $len = NULL) {
	// A length of zero (integer or string) always yields an empty string
	if ($len !== NULL && !strcmp((string) $len, '0')) {
		return '';
	}

	$byte_start = $this->utf8_char2byte_pos($str, $start);
	if ($byte_start === FALSE) {
		if ($start > 0) {
			// $start outside string length
			return FALSE;
		}
		// A negative $start reaching beyond the beginning means: start at the first byte
		$byte_start = 0;
	}

	$str = substr($str, $byte_start);

	if ($len == NULL) {
		return $str;
	}

	$byte_end = $this->utf8_char2byte_pos($str, $len);
	if ($byte_end === FALSE) {
		// $len outside actual string length:
		// when length is less than zero and exceeds, then we return blank string.
		return $len < 0 ? '' : $str;
	}

	return substr($str, 0, $byte_end);
}
1881
1882 /**
1883 * Counts the number of characters of a string in UTF-8.
1884 * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
1885 *
1886 * @param string UTF-8 multibyte character string
1887 * @return integer The number of characters
1888 * @see strlen()
1889 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1890 */
function utf8_strlen($str) {
	// Counts characters by counting every byte that is NOT a continuation
	// byte (10xxxxxx): single-byte characters (0xxxxxxx) and multi-byte
	// starting bytes (11xxxxxx) each mark the beginning of one character.
	$str = (string) $str;
	$length = strlen($str);
	$n = 0;
	for ($i = 0; $i < $length; $i++) {
		if ((ord($str[$i]) & 0xC0) != 0x80) {
			$n++;
		}
	}
	return $n;
}
1906
1907 /**
1908 * Truncates a string in UTF-8 short at a given byte length.
1909 *
1910 * @param string UTF-8 multibyte character string
1911 * @param integer the byte length
1912 * @return string the shortened string
1913 * @see mb_strcut()
1914 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1915 */
function utf8_strtrunc($str, $len) {
	// Cuts $str after at most $len bytes without splitting a multi-byte
	// character: if the character touched by the cut does not fit
	// completely, the string is cut in front of that character.
	$str = (string) $str;
	$len = (int) $len;

	// The cut lies outside the string, so there is no byte to inspect
	if ($len <= 0 || $len > strlen($str)) {
		return substr($str, 0, $len);
	}

	$i = $len - 1;
	if (ord($str[$i]) & 0x80) {
			// Last kept byte belongs to a multi-byte sequence:
			// walk back over continuation bytes (10xxxxxx) to its first byte
		while ($i > 0 && (ord($str[$i]) & 0xC0) == 0x80) {
			$i--;
		}

		$lead = ord($str[$i]);
		if (($lead & 0xC0) == 0x80) {
				// Sanity check: only continuation bytes up to the string start
			return '';
		}
		if (($lead & 0xC0) == 0xC0) {
				// Number of bytes of the sequence = number of leading 1-bits
			for ($bc = 0; $lead & 0x80; $lead = $lead << 1) {
				$bc++;
			}
			if ($i + $bc > $len) {
					// Character would be split: cut in front of it.
					// A character starting at byte 0 that fits is kept
					// (this used to return an empty string).
				return substr($str, 0, $i);
			}
		}
			// Otherwise the multi-byte character fits into the length
	}

	return substr($str, 0, $len);
}
1937
1938 /**
1939 * Find position of first occurrence of a string, both arguments are in UTF-8.
1940 *
1941 * @param string UTF-8 string to search in
1942 * @param string UTF-8 string to search for
1943 * @param integer Position to start the search
1944 * @return integer The character position
1945 * @see strpos()
1946 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1947 */
function utf8_strpos($haystack, $needle, $offset = 0) {
	// Delegate to the configured multibyte extension when one is selected
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strpos($haystack, $needle, $offset, 'utf-8');
	}
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
		return iconv_strpos($haystack, $needle, $offset, 'utf-8');
	}

	// Fallback: translate the character offset into a byte offset, search
	// byte-wise and translate the hit back into a character position.
	$startByte = $this->utf8_char2byte_pos($haystack, $offset);
	if ($startByte !== FALSE) {
		$matchByte = strpos($haystack, $needle, $startByte);
		if ($matchByte !== FALSE) {
			return $this->utf8_byte2char_pos($haystack, $matchByte);
		}
	}

	// Offset beyond string length, or needle not found
	return FALSE;
}
1967
1968 /**
1969 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
1970 *
1971 * @param string UTF-8 string to search in
1972 * @param string UTF-8 character to search for (single character)
1973 * @return integer The character position
1974 * @see strrpos()
1975 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1976 */
function utf8_strrpos($haystack, $needle) {
	// Delegate to the configured multibyte extension when one is selected
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
			// mb_strrpos() takes the offset as third and the encoding as
			// fourth argument; passing the encoding in the third slot is
			// rejected by current PHP versions.
		return mb_strrpos($haystack, $needle, 0, 'utf-8');
	} elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
		return iconv_strrpos($haystack, $needle, 'utf-8');
	}

	// Fallback: byte-wise search, then translate the byte position of the
	// hit into a character position.
	$byte_pos = strrpos($haystack, $needle);
	if ($byte_pos === FALSE) {
			// needle not found
		return FALSE;
	}

	return $this->utf8_byte2char_pos($haystack, $byte_pos);
}
1991
1992 /**
1993 * Translates a character position into an 'absolute' byte position.
1994 * Unit tested by Kasper.
1995 *
1996 * @param string UTF-8 string
1997 * @param integer Character position (negative values start from the end)
1998 * @return integer Byte position
1999 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
2000 */
function utf8_char2byte_pos($str, $pos) {
	// Walks over the string (forwards for $pos >= 0, backwards from the
	// last byte otherwise) until abs($pos) character starts have been
	// seen, and returns the byte index of the character addressed by $pos.
	// Returns FALSE when the walk leaves the string before that.
	$str = (string) $str;
	$length = strlen($str);
	$found = 0;				// number of characters found
	$wanted = abs($pos);	// number of characters wanted

	if ($pos >= 0) {
		$i = 0;
		$step = 1;
	} else {
		$i = $length - 1;
		$step = -1;
	}

	// Explicit bounds: reading an offset outside the string must end the
	// walk (a negative offset would otherwise wrap to the string end).
	for (; $i >= 0 && $i < $length && $found < $wanted; $i += $step) {
			// every byte except a continuation byte (10xxxxxx) starts a character
		if ((ord($str[$i]) & 0xC0) != 0x80) {
			$found++;
		}
	}
	if ($i < 0 || $i >= $length) {
			// offset beyond string length
		return FALSE;
	}

	if ($pos >= 0) {
			// skip trailing multi-byte data bytes
		while ($i < $length && (ord($str[$i]) & 0xC0) == 0x80) {
			$i++;
		}
	} else {
			// correct offset: the backward walk stopped one byte in front
			// of the character start
		$i++;
	}

	return $i;
}
2040
2041 /**
2042 * Translates an 'absolute' byte position into a character position.
2043 * Unit tested by Kasper.
2044 *
2045 * @param string UTF-8 string
2046 * @param integer byte position
2047 * @return integer character position
2048 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
2049 */
function utf8_byte2char_pos($str, $pos) {
	// Counts the character starts in the byte range 1..$pos, which is the
	// character index of the character located at byte $pos.
	$str = (string) $str;
	$pos = (int) $pos;
	$length = strlen($str);

	// Reject positions outside the string up front. The former check ran
	// after the loop and therefore always looked at byte 0, so a position
	// past the end produced a bogus count instead of FALSE.
	if ($length === 0 || $pos < 0 || $pos > $length) {
		return FALSE;
	}

	$n = 0; // number of characters
	for ($i = $pos; $i > 0; $i--) {
			// The position directly behind the last byte counts like a
			// character start; apart from that every byte that is not a
			// continuation byte (10xxxxxx) starts a character.
		if ($i === $length || (ord($str[$i]) & 0xC0) != 0x80) {
			$n++;
		}
	}

	return $n;
}
2069
2070 /**
2071 * Maps all characters of an UTF-8 string.
2072 *
2073 * @param string UTF-8 string
2074 * @param string mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
2075 * @param string 'case': conversion 'toLower' or 'toUpper'
2076 * @return string the converted string
2077 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
2078 */
function utf8_char_mapping($str, $mode, $opt = '') {
	// Maps every character of a UTF-8 string through the table selected by
	// $mode. If initUnicodeData() returns a falsy value, or $mode is
	// unknown, the input is returned unchanged.
	if (!$this->initUnicodeData($mode)) {
		return $str;
	}

	switch ($mode) {
		case 'case':
			$map =& $this->caseFolding['utf-8'][$opt];
			break;

		case 'ascii':
			$map =& $this->toASCII['utf-8'];
			break;

		default:
			return $str;
	}

	$str = (string) $str;
	$length = strlen($str);
	$out = '';
	for ($i = 0; $i < $length; $i++) {
		$c = ord($str[$i]);
		if (($c & 0xC0) == 0xC0) {
				// multi-byte starting byte (11xxxxxx): the number of
				// leading 1-bits is the number of bytes of the sequence
			for ($bc = 0; $c & 0x80; $c = $c << 1) {
				$bc++;
			}
			$mbc = substr($str, $i, $bc);
			$i += $bc - 1;
		} else {
				// single-byte (0xxxxxxx), or a stray continuation byte that
				// is handled as a unit of its own instead of repeating the
				// previously seen character
			$mbc = $str[$i];
		}

		// Characters without a table entry are copied verbatim
		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
2121
2122
2123 /********************************************
2124 *
2125 * Internal EUC string operation functions
2126 *
2127 * Extended Unix Code:
2128 * ASCII compatible 7bit single bytes chars
2129 * 8bit two byte chars
2130 *
2131 * Shift-JIS is treated as a special case.
2132 *
2133 ********************************************/
2134
2135 /**
2136 * Cuts a string in the EUC charset family short at a given byte length.
2137 *
2138 * @param string EUC multibyte character string
2139 * @param integer the byte length
2140 * @param string the charset
2141 * @return string the shortened string
2142 * @see mb_strcut()
2143 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
2144 */
function euc_strtrunc($str, $len, $charset) {
	// Cuts $str after at most $len bytes without splitting a double-byte
	// character of the EUC family (Shift-JIS has its own lead byte ranges).
	$str = (string) $str;
	$length = strlen($str);

	// string not longer than the supplied length: nothing to cut
	if ($length === 0 || $length <= $len) {
		return $str;
	}

	$sjis = ($charset == 'shift_jis');
	// $len < $length here, so every inspected byte exists
	for ($i = 0; $i < $len; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			$isLeadByte = ($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0);
		} else {
			$isLeadByte = ($c >= 0x80);
		}
		if ($isLeadByte) {
				// advance a double-byte char
			$i++;
		}
	}

	// $i > $len means the byte at $len - 1 was the first byte of a
	// double-byte character, which is dropped. This is decided before any
	// "string ended" shortcut, so a cut inside the LAST character no longer
	// returns the whole, too long string.
	if ($i > $len) {
		return substr($str, 0, $len - 1);
	}

	return substr($str, 0, $len);
}
2170
2171 /**
2172 * Returns a part of a string in the EUC charset family.
2173 *
2174 * @param string EUC multibyte character string
2175 * @param integer start position (character position)
2176 * @param string the charset
2177 * @param integer length (in characters)
2178 * @return string the substring
2179 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
2180 */
function euc_substr($str, $start, $charset, $len = NULL) {
	// Resolves the character based $start / $len into byte positions via
	// euc_char2byte_pos() and cuts the string byte-wise.
	$byte_start = $this->euc_char2byte_pos($str, $start, $charset);
	if ($byte_start === FALSE) {
			// $start outside string length
		return FALSE;
	}

	// A length of zero (integer 0 or string '0') yields an empty string,
	// like utf8_substr(); it used to be treated as "no length given" and
	// returned the whole remainder.
	if ((string) $len === '0') {
		return '';
	}

	$str = substr($str, $byte_start);

	// No usable length given: return everything from the start position
	if ($len == NULL) {
		return $str;
	}

	$byte_end = $this->euc_char2byte_pos($str, $len, $charset);
	if ($byte_end === FALSE) {
			// $len outside actual string length: a negative length that
			// exceeds the string gives a blank string (as in utf8_substr()),
			// a positive one the whole remainder.
		return $len < 0 ? '' : $str;
	}

	return substr($str, 0, $byte_end);
}
2204
2205 /**
2206 * Counts the number of characters of a string in the EUC charset family.
2207 *
2208 * @param string EUC multibyte character string
2209 * @param string the charset
2210 * @return integer the number of characters
2211 * @see strlen()
2212 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
2213 */
function euc_strlen($str, $charset) {
	// Counts characters by stepping over the string: a lead byte consumes
	// two bytes, any other byte one. Shift-JIS uses its own lead byte
	// ranges, all other charsets treat every byte >= 0x80 as lead byte.
	$str = (string) $str;
	$length = strlen($str);
	$sjis = ($charset == 'shift_jis');

	$n = 0;
	for ($i = 0; $i < $length; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			$isLeadByte = ($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0);
		} else {
			$isLeadByte = ($c >= 0x80);
		}
		if ($isLeadByte) {
				// advance a double-byte char
			$i++;
		}

		$n++;
	}

	return $n;
}
2235
2236 /**
2237 * Translates a character position into an 'absolute' byte position.
2238 *
2239 * @param string EUC multibyte character string
2240 * @param integer character position (negative values start from the end)
2241 * @param string the charset
2242 * @return integer byte position
2243 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
2244 */
function euc_char2byte_pos($str, $pos, $charset) {
	// Walks over the string (forwards for $pos >= 0, backwards from the
	// last byte otherwise) until abs($pos) characters have been seen and
	// returns the byte index reached. Returns FALSE when the walk leaves
	// the string before that.
	$str = (string) $str;
	$length = strlen($str);
	$sjis = ($charset == 'shift_jis');
	$seen = 0;				// number of characters seen
	$wanted = abs($pos);	// number of characters wanted

	if ($pos >= 0) {
		$i = 0;
		$step = 1;
	} else {
		$i = $length - 1;
		$step = -1;
	}

	// Explicit bounds: reading an offset outside the string must end the
	// walk (a negative offset would otherwise wrap to the string end).
	for (; $i >= 0 && $i < $length && $seen < $wanted; $i += $step) {
		$c = ord($str[$i]);
		if ($sjis) {
			$isDoubleByte = ($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0);
		} else {
			$isDoubleByte = ($c >= 0x80);
		}
		if ($isDoubleByte) {
				// advance a double-byte char
			$i += $step;
		}

		$seen++;
	}
	if ($i < 0 || $i >= $length) {
			// offset beyond string length
		return FALSE;
	}

	if ($pos < 0) {
			// correct offset: the backward walk stopped one byte in front
			// of the character
		$i++;
	}

	return $i;
}
2283
2284 /**
2285 * Maps all characters of a string in the EUC charset family.
2286 *
2287 * @param string EUC multibyte character string
2288 * @param string the charset
2289 * @param string mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
2290 * @param string 'case': conversion 'toLower' or 'toUpper'
2291 * @return string the converted string
2292 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
2293 */
function euc_char_mapping($str, $charset, $mode, $opt = '') {
	// Maps every character of an EUC family string through the table
	// selected by $mode. Whenever the matching init method returns a falsy
	// value, or $mode is unknown, the input is returned unchanged.
	switch ($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) {
				return $str;
			}
			$map =& $this->caseFolding[$charset][$opt];
			break;

		case 'ascii':
			if (!$this->initToASCII($charset)) {
				return $str;
			}
			$map =& $this->toASCII[$charset];
			break;

		default:
			return $str;
	}

	// Iterate with an explicit length bound instead of probing offsets past
	// the end of the string (curly-brace offsets no longer parse on PHP 8).
	$str = (string) $str;
	$length = strlen($str);
	$sjis = ($charset == 'shift_jis');
	$out = '';
	for ($i = 0; $i < $length; $i++) {
		$mbc = $str[$i];
		$c = ord($mbc);

		if ($sjis) {
			$isDoubleByte = ($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0);
		} else {
			$isDoubleByte = ($c >= 0x80);
		}
		if ($isDoubleByte) {
				// a double-byte char: take both bytes as one unit
			$mbc = substr($str, $i, 2);
			$i++;
		}

		// Characters without a table entry are copied verbatim
		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
2342
2343 }
2344
// Include the file registered under the XCLASS key for this class file,
// if TYPO3_MODE is defined and such an entry is set.
if (defined('TYPO3_MODE')) {
	if (isset($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php'])) {
		include_once($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
	}
}
2348
2349 ?>