[BUGFIX] Missing column in t3lib_TCEmain::getPreviousLocalizedRecordUid
[Packages/TYPO3.CMS.git] / t3lib / class.t3lib_cs.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2003-2011 Kasper Skårhøj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the Typo3 project. The Typo3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 *
17 * This script is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * This copyright notice MUST APPEAR in all copies of the script!
23 ***************************************************************/
24 /**
25 * Class for conversion between charsets.
26 *
27 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
28 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
29 */
30
31
32 /**
33 * Notes on UTF-8
34 *
35 * Functions working on UTF-8 strings:
36 *
37 * - strchr/strstr
38 * - strrchr
39 * - substr_count
40 * - implode/explode/join
41 *
42 * Functions nearly working on UTF-8 strings:
43 *
44 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
45 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
46 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
47 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
48 * - preg_*: Support compiled into PHP by default nowadays, but could be unavailable, need to use modifier
49 *
50 * Functions NOT working on UTF-8 strings:
51 *
52 * - str*cmp
53 * - stristr
54 * - stripos
55 * - substr
56 * - strrev
57 * - split/spliti
58 * - ...
59 *
60 */
61 /**
62 * Class for conversion between charsets
63 *
64 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
65 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
66 * @package TYPO3
67 * @subpackage t3lib
68 */
69 class t3lib_cs {
70
/**
 * Locale service instance; it is assigned in the constructor.
 *
 * @var t3lib_l10n_Locales
 */
protected $locales;

/**
 * Byte value written in place of characters that have no equivalent
 * (63 is the ASCII code of the question mark).
 *
 * @var integer
 */
public $noCharByteVal = 63;

/**
 * Cache of parsed conversion tables, keyed by charset name.
 *
 * @var array
 */
public $parsedCharsets = array();

/**
 * Cache for case folding data.
 *
 * @var array
 */
public $caseFolding = array();

/**
 * Cache for charset-to-ASCII mappings.
 *
 * @var array
 */
public $toASCII = array();

/**
 * Charsets that use two bytes per character (charset name => 1).
 *
 * @var array
 */
public $twoByteSets = array(
	'ucs-2' => 1, // 2-byte Unicode
);

/**
 * Charsets that use four bytes per character (charset name => 1).
 *
 * @var array
 */
public $fourByteSets = array(
	'ucs-4' => 1, // 4-byte Unicode
	'utf-32' => 1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
);

/**
 * Charsets that use a scheme like the Extended Unix Code (charset name => 1).
 *
 * @var array
 */
public $eucBasedSets = array(
	'gb2312' => 1, // Chinese, simplified.
	'big5' => 1, // Chinese, traditional.
	'euc-kr' => 1, // Korean
	'shift_jis' => 1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
);
105
/**
 * Alternative charset names mapped to the name used internally.
 * Keys are lowercase because parse_charset() lowercases its input before the lookup.
 *
 * See http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
 * and http://czyborra.com/charsets/iso8859.html
 *
 * @var array
 */
var $synonyms = array(
	'us' => 'ascii',
	'us-ascii' => 'ascii',
	'cp819' => 'iso-8859-1',
	'ibm819' => 'iso-8859-1',
	'iso-ir-100' => 'iso-8859-1',
	'iso-ir-101' => 'iso-8859-2',
	'iso-ir-109' => 'iso-8859-3',
	'iso-ir-110' => 'iso-8859-4',
	'iso-ir-144' => 'iso-8859-5',
	'iso-ir-127' => 'iso-8859-6',
	'iso-ir-126' => 'iso-8859-7',
	'iso-ir-138' => 'iso-8859-8',
	'iso-ir-148' => 'iso-8859-9',
	'iso-ir-157' => 'iso-8859-10',
	'iso-ir-179' => 'iso-8859-13',
	'iso-ir-199' => 'iso-8859-14',
	'iso-ir-203' => 'iso-8859-15',
	'csisolatin1' => 'iso-8859-1',
	'csisolatin2' => 'iso-8859-2',
	'csisolatin3' => 'iso-8859-3',
	'csisolatin5' => 'iso-8859-9',
	'csisolatin8' => 'iso-8859-14',
	'csisolatin9' => 'iso-8859-15',
	'csisolatingreek' => 'iso-8859-7',
	'iso-celtic' => 'iso-8859-14',
	'latin1' => 'iso-8859-1',
	'latin2' => 'iso-8859-2',
	'latin3' => 'iso-8859-3',
	'latin5' => 'iso-8859-9',
	'latin6' => 'iso-8859-10',
	'latin8' => 'iso-8859-14',
	'latin9' => 'iso-8859-15',
	'l1' => 'iso-8859-1',
	'l2' => 'iso-8859-2',
	'l3' => 'iso-8859-3',
	'l5' => 'iso-8859-9',
	'l6' => 'iso-8859-10',
	'l8' => 'iso-8859-14',
	'l9' => 'iso-8859-15',
	'cyrillic' => 'iso-8859-5',
	'arabic' => 'iso-8859-6',
	'tis-620' => 'iso-8859-11',
	'win874' => 'windows-874',
	'win1250' => 'windows-1250',
	'win1251' => 'windows-1251',
	'win1252' => 'windows-1252',
	'win1253' => 'windows-1253',
	'win1254' => 'windows-1254',
	'win1255' => 'windows-1255',
	'win1256' => 'windows-1256',
	'win1257' => 'windows-1257',
	'win1258' => 'windows-1258',
	'cp1250' => 'windows-1250',
	'cp1251' => 'windows-1251',
	'cp1252' => 'windows-1252',
	'ms-ee' => 'windows-1250',
	'ms-ansi' => 'windows-1252',
	'ms-greek' => 'windows-1253',
	'ms-turk' => 'windows-1254',
	'winbaltrim' => 'windows-1257',
	'koi-8ru' => 'koi-8r',
	'koi8r' => 'koi-8r',
	'cp878' => 'koi-8r',
	'mac' => 'macroman',
	'macintosh' => 'macroman',
	'euc-cn' => 'gb2312',
	'x-euc-cn' => 'gb2312',
	'euccn' => 'gb2312',
	'cp936' => 'gb2312',
	'big-5' => 'big5',
	'cp950' => 'big5',
	'eucjp' => 'euc-jp',
	'sjis' => 'shift_jis',
	'shift-jis' => 'shift_jis',
	'cp932' => 'shift_jis',
	'cp949' => 'euc-kr',
	'utf7' => 'utf-7',
	'utf8' => 'utf-8', // was listed twice; the redundant second entry has been removed
	'utf16' => 'utf-16',
	'utf32' => 'utf-32',
	'ucs2' => 'ucs-2',
	'ucs4' => 'ucs-4',
);
193
/**
 * Mapping of language identifiers to script names.
 * The script names are the keys of $script_to_charset_unix / $script_to_charset_windows;
 * a value that is not a key there makes get_locale_charset() fall back to its default charset.
 *
 * @var array
 */
var $lang_to_script = array(
		// iso-639-1 language codes, see http://www.loc.gov/standards/iso639-2/php/code_list.php
	'af' => 'west_european', // Afrikaans
	'ar' => 'arabic',
	'bg' => 'cyrillic', // Bulgarian
	'bs' => 'east_european', // Bosnian
	'cs' => 'east_european', // Czech
	'da' => 'west_european', // Danish
	'de' => 'west_european', // German
	'es' => 'west_european', // Spanish
	'et' => 'estonian',
	'eo' => 'unicode', // Esperanto
	'eu' => 'west_european', // Basque
	'fa' => 'arabic', // Persian
	'fi' => 'west_european', // Finnish
	'fo' => 'west_european', // Faroese
	'fr' => 'west_european', // French
	'ga' => 'west_european', // Irish
	'gl' => 'west_european', // Galician
	'gr' => 'greek',
	'he' => 'hebrew', // Hebrew (since 1998)
	'hi' => 'unicode', // Hindi
	'hr' => 'east_european', // Croatian
	'hu' => 'east_european', // Hungarian
	'iw' => 'hebrew', // Hebrew (til 1998)
	'is' => 'west_european', // Icelandic
	'it' => 'west_european', // Italian
	'ja' => 'japanese',
	'ka' => 'unicode', // Georgian
	'kl' => 'west_european', // Greenlandic
	'km' => 'unicode', // Khmer
	'ko' => 'korean',
	'lt' => 'lithuanian',
	'lv' => 'west_european', // Latvian/Lettish
	'nl' => 'west_european', // Dutch
	'no' => 'west_european', // Norwegian
	'nb' => 'west_european', // Norwegian Bokmal
	'nn' => 'west_european', // Norwegian Nynorsk
	'pl' => 'east_european', // Polish
	'pt' => 'west_european', // Portuguese
	'ro' => 'east_european', // Romanian
	'ru' => 'cyrillic', // Russian
	'sk' => 'east_european', // Slovak
	'sl' => 'east_european', // Slovenian
	'sr' => 'cyrillic', // Serbian
	'sv' => 'west_european', // Swedish
	'sq' => 'albanian', // Albanian
	'th' => 'thai',
	'uk' => 'cyrillic', // Ukrainian
	'vi' => 'vietnamese',
	'zh' => 'chinese',
		// MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
		// http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
	'afk' => 'west_european', // Afrikaans
	'ara' => 'arabic',
	'bgr' => 'cyrillic', // Bulgarian
	'cat' => 'west_european', // Catalan
	'chs' => 'simpl_chinese',
	'cht' => 'trad_chinese',
	'csy' => 'east_european', // Czech
	'dan' => 'west_european', // Danish
	'deu' => 'west_european', // German
	'dea' => 'west_european', // German (Austrian)
	'des' => 'west_european', // German (Swiss)
	'ena' => 'west_european', // English (Australian)
	'enc' => 'west_european', // English (Canadian)
	'eng' => 'west_european', // English
	'enz' => 'west_european', // English (New Zealand)
	'enu' => 'west_european', // English (United States)
	'euq' => 'west_european', // Basque
	'fos' => 'west_european', // Faroese
	'far' => 'arabic', // Persian
	'fin' => 'west_european', // Finnish
	'fra' => 'west_european', // French
	'frb' => 'west_european', // French (Belgian)
	'frc' => 'west_european', // French (Canadian)
	'frs' => 'west_european', // French (Swiss)
	'geo' => 'unicode', // Georgian
	'glg' => 'west_european', // Galician
	'ell' => 'greek',
	'heb' => 'hebrew',
	'hin' => 'unicode', // Hindi
	'hun' => 'east_european', // Hungarian
	'isl' => 'west_european', // Icelandic (value was misspelled 'west_euorpean', which matches no script)
	'ita' => 'west_european', // Italian
	'its' => 'west_european', // Italian (Swiss)
	'jpn' => 'japanese',
	'khm' => 'unicode', // Khmer
	'kor' => 'korean',
	'lth' => 'lithuanian',
	'lvi' => 'west_european', // Latvian/Lettish
	'msl' => 'west_european', // Malay
	'nlb' => 'west_european', // Dutch (Belgian)
	'nld' => 'west_european', // Dutch
	'nor' => 'west_european', // Norwegian (bokmal)
	'non' => 'west_european', // Norwegian (nynorsk)
	'plk' => 'east_european', // Polish
	'ptg' => 'west_european', // Portuguese
	'ptb' => 'west_european', // Portuguese (Brazil)
	'rom' => 'east_european', // Romanian
	'rus' => 'cyrillic', // Russian
	'slv' => 'east_european', // Slovenian
	'sky' => 'east_european', // Slovak
	'srl' => 'east_european', // Serbian (Latin)
	'srb' => 'cyrillic', // Serbian (Cyrillic)
	'esp' => 'west_european', // Spanish (trad. sort)
	'esm' => 'west_european', // Spanish (Mexican)
	'esn' => 'west_european', // Spanish (internat. sort)
	'sve' => 'west_european', // Swedish
	'sqi' => 'albanian', // Albanian
	'tha' => 'thai',
	'trk' => 'turkish',
	'ukr' => 'cyrillic', // Ukrainian
		// English language names
	'afrikaans' => 'west_european',
	'albanian' => 'albanian',
	'arabic' => 'arabic',
	'basque' => 'west_european',
	'bosnian' => 'east_european',
	'bulgarian' => 'cyrillic', // was 'east_european', contradicting the 'bg' and 'bgr' entries
	'catalan' => 'west_european',
	'croatian' => 'east_european',
	'czech' => 'east_european',
	'danish' => 'west_european',
	'dutch' => 'west_european',
	'english' => 'west_european',
	'esperanto' => 'unicode',
	'estonian' => 'estonian',
	'faroese' => 'west_european',
	'farsi' => 'arabic',
	'finnish' => 'west_european',
	'french' => 'west_european',
	'galician' => 'west_european',
	'georgian' => 'unicode',
	'german' => 'west_european',
	'greek' => 'greek',
	'greenlandic' => 'west_european',
	'hebrew' => 'hebrew',
	'hindi' => 'unicode',
	'hungarian' => 'east_european',
	'icelandic' => 'west_european',
	'italian' => 'west_european',
	'khmer' => 'unicode',
	'latvian' => 'west_european',
	'lettish' => 'west_european',
	'lithuanian' => 'lithuanian',
	'malay' => 'west_european',
	'norwegian' => 'west_european',
	'persian' => 'arabic',
	'polish' => 'east_european',
	'portuguese' => 'west_european',
	'russian' => 'cyrillic',
	'romanian' => 'east_european',
	'serbian' => 'cyrillic',
	'slovak' => 'east_european',
	'slovenian' => 'east_european',
	'spanish' => 'west_european',
	'swedish' => 'west_european',
	'svedish' => 'west_european', // misspelled key, kept for backward compatibility
	'thai' => 'thai',
	'that' => 'thai', // misspelled key, kept for backward compatibility
	'turkish' => 'turkish',
	'ukrainian' => 'cyrillic',
);
357
/**
 * Script (language family) name => charset used on Unix.
 * An empty value makes get_locale_charset() use its Unix default charset.
 *
 * @var array
 */
public $script_to_charset_unix = array(
	'west_european' => 'iso-8859-1',
	'estonian' => 'iso-8859-1',
	'east_european' => 'iso-8859-2',
	'baltic' => 'iso-8859-4',
	'cyrillic' => 'iso-8859-5',
	'arabic' => 'iso-8859-6',
	'greek' => 'iso-8859-7',
	'hebrew' => 'iso-8859-8',
	'turkish' => 'iso-8859-9',
	'thai' => 'iso-8859-11', // = TIS-620
	'lithuanian' => 'iso-8859-13',
	'chinese' => 'gb2312', // = euc-cn
	'japanese' => 'euc-jp',
	'korean' => 'euc-kr',
	'simpl_chinese' => 'gb2312',
	'trad_chinese' => 'big5',
	'vietnamese' => '',
	'unicode' => 'utf-8',
	'albanian' => 'utf-8'
);

/**
 * Script (language family) name => charset used on Windows.
 *
 * @var array
 */
public $script_to_charset_windows = array(
	'east_european' => 'windows-1250',
	'cyrillic' => 'windows-1251',
	'west_european' => 'windows-1252',
	'greek' => 'windows-1253',
	'turkish' => 'windows-1254',
	'hebrew' => 'windows-1255',
	'arabic' => 'windows-1256',
	'baltic' => 'windows-1257',
	'estonian' => 'windows-1257',
	'lithuanian' => 'windows-1257',
	'vietnamese' => 'windows-1258',
	'thai' => 'cp874',
	'korean' => 'cp949',
	'chinese' => 'gb2312',
	'japanese' => 'shift_jis',
	'simpl_chinese' => 'gb2312',
	'trad_chinese' => 'big5',
	'albanian' => 'windows-1250',
	'unicode' => 'utf-8'
);
403
/**
 * Mapping of complete locale names to charsets.
 * get_locale_charset() lowercases the locale before looking it up here, so keys must be
 * lowercase to be reachable from there.
 *
 * @var array
 */
var $locale_to_charset = array(
	'japanese.euc' => 'euc-jp',
	'ja_jp.ujis' => 'euc-jp',
	'korean.euc' => 'euc-kr',
	'sr@latn' => 'iso-8859-2',
	'sr@Latn' => 'iso-8859-2', // mixed-case key never matches a lowercased locale; kept for backward compatibility
	'zh_cn' => 'gb2312',
	'zh_hk' => 'big5',
	'zh_tw' => 'big5',
);
414
/**
 * TYPO3 specific: system charset used for each system language key in TYPO3.
 * An empty value means "iso-8859-1".
 *
 * @var array
 */
public $charSetArray = array(
	'af' => '',
	'ar' => 'iso-8859-6',
	'ba' => 'iso-8859-2',
	'bg' => 'windows-1251',
	'br' => '',
	'ca' => 'iso-8859-15',
	'ch' => 'gb2312',
	'cs' => 'windows-1250',
	'cz' => 'windows-1250',
	'da' => '',
	'de' => '',
	'dk' => '',
	'el' => 'iso-8859-7',
	'eo' => 'utf-8',
	'es' => '',
	'et' => 'iso-8859-4',
	'eu' => '',
	'fa' => 'utf-8',
	'fi' => '',
	'fo' => 'utf-8',
	'fr' => '',
	'fr_CA' => '',
	'ga' => '',
	'ge' => 'utf-8',
	'gl' => '',
	'gr' => 'iso-8859-7',
	'he' => 'utf-8',
	'hi' => 'utf-8',
	'hk' => 'big5',
	'hr' => 'windows-1250',
	'hu' => 'iso-8859-2',
	'is' => 'utf-8',
	'it' => '',
	'ja' => 'shift_jis',
	'jp' => 'shift_jis',
	'ka' => 'utf-8',
	'kl' => 'utf-8',
	'km' => 'utf-8',
	'ko' => 'euc-kr',
	'kr' => 'euc-kr',
	'lt' => 'windows-1257',
	'lv' => 'utf-8',
	'ms' => '',
	'my' => '',
	'nl' => '',
	'no' => '',
	'pl' => 'iso-8859-2',
	'pt' => '',
	'pt_BR' => '',
	'qc' => '',
	'ro' => 'iso-8859-2',
	'ru' => 'windows-1251',
	'se' => '',
	'si' => 'windows-1250',
	'sk' => 'windows-1250',
	'sl' => 'windows-1250',
	'sq' => 'utf-8',
	'sr' => 'utf-8',
	'sv' => '',
	'th' => 'iso-8859-11',
	'tr' => 'iso-8859-9',
	'ua' => 'windows-1251',
	'uk' => 'windows-1251',
	'vi' => 'utf-8',
	'vn' => 'utf-8',
	'zh' => 'big5',
);
485
/**
 * TYPO3 specific: ISO name used for each system language key in TYPO3.
 * A missing key means the ISO name is the same as the TYPO3 key.
 *
 * @var array
 * @deprecated since TYPO3 4.6, will be removed in TYPO3 4.8 - use t3lib_l10n_Locales::getIsoMapping()
 */
public $isoArray = array(
	'ba' => 'bs',
	'br' => 'pt_BR',
	'ch' => 'zh_CN',
	'cz' => 'cs',
	'dk' => 'da',
	'si' => 'sl',
	'se' => 'sv',
	'gl' => 'kl',
	'gr' => 'el',
	'hk' => 'zh_HK',
	'kr' => 'ko',
	'ua' => 'uk',
	'jp' => 'ja',
	'qc' => 'fr_CA',
	'vn' => 'vi',
	'ge' => 'ka',
	'ga' => 'gl',
);
508
/**
 * Constructor: obtains the t3lib_l10n_Locales instance through
 * t3lib_div::makeInstance() and keeps it in $this->locales.
 */
public function __construct() {
	$localesService = t3lib_div::makeInstance('t3lib_l10n_Locales');
	$this->locales = $localesService;
}
515
/**
 * Normalizes a charset name: the input is lowercased and trimmed, and a name
 * listed in $this->synonyms is replaced by the name it maps to.
 *
 * @param string Input charset
 * @return string Normalized charset
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function parse_charset($charset) {
	$normalized = trim(strtolower($charset));

	return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
}
531
/**
 * Get the charset of a locale.
 *
 * Accepted locale formats:
 *   ln             language
 *   ln_CN          language / country
 *   ln_CN.cs       language / country / charset
 *   ln_CN.cs@mod   language / country / charset / modifier
 *
 * Resolution order: exact entry in $this->locale_to_charset, charset part of the locale,
 * the 'euro' modifier, then the script of the language looked up in the table for the
 * current OS. Unknown languages/scripts yield 'windows-1252' on Windows and 'utf-8' otherwise.
 *
 * @param string Locale string
 * @return string Charset resolved for locale string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function get_locale_charset($locale) {
	$locale = strtolower($locale);

		// Exact locale specific charset? The table keys are lowercased for the comparison because
		// $locale was lowercased above, otherwise mixed-case keys (e.g. 'sr@Latn') could never match.
	$exactMatches = array_change_key_case($this->locale_to_charset, CASE_LOWER);
	if (isset($exactMatches[$locale])) {
		return $exactMatches[$locale];
	}

		// Get modifier; array_pad() supplies NULL for an absent part so list() raises no notice
	list($locale, $modifier) = array_pad(explode('@', $locale), 2, NULL);

		// Locale contains charset: use it
	list($locale, $charset) = array_pad(explode('.', $locale), 2, NULL);
	if ($charset) {
		return $this->parse_charset($charset);
	}

		// Modifier is 'euro' (after charset check, because of xx.utf-8@euro)
	if ($modifier == 'euro') {
		return 'iso-8859-15';
	}

		// Get language (the country part after "_" is not used)
	list($language) = explode('_', $locale);
	$script = isset($this->lang_to_script[$language]) ? $this->lang_to_script[$language] : NULL;

	if (TYPO3_OS == 'WIN') {
		$scriptToCharset = $this->script_to_charset_windows;
		$defaultCharset = 'windows-1252';
	} else {
		$scriptToCharset = $this->script_to_charset_unix;
		$defaultCharset = 'utf-8';
	}

		// Unknown script or empty table entry: use the default of the OS
	if ($script !== NULL && !empty($scriptToCharset[$script])) {
		return $scriptToCharset[$script];
	}

	return $defaultCharset;
}
580
581
582 /********************************************
583 *
584 * Charset Conversion functions
585 *
586 ********************************************/
587
/**
 * Convert a string from one charset to another charset.
 *
 * The PHP library selected in $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']
 * (mbstring, iconv or recode) is tried first when the target is UTF-8 or no entity
 * fallback is requested; if it is not used or returns FALSE, the conversion is done
 * with utf8_encode() / utf8_decode() of this class.
 *
 * @param string Input string
 * @param string From charset (the current charset of the string)
 * @param string To charset (the output charset wanted)
 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Converted string
 * @see convArray()
 */
function conv($str, $fromCS, $toCS, $useEntityForNoChar = 0) {
	if ($fromCS == $toCS) {
		return $str;
	}

		// PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
	if ($toCS == 'utf-8' || !$useEntityForNoChar) {
		$method = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'];
		$converted = FALSE;

			// Loose comparison on purpose: it mirrors the matching rules of a switch statement
		if ($method == 'mbstring') {
				// returns FALSE for unsupported charsets
			$converted = mb_convert_encoding($str, $toCS, $fromCS);
		} elseif ($method == 'iconv') {
			$converted = iconv($fromCS, $toCS . '//TRANSLIT', $str);
		} elseif ($method == 'recode') {
			$converted = recode_string($fromCS . '..' . $toCS, $str);
		}

		if (FALSE !== $converted) {
			return $converted;
		}
			// otherwise: fallback to TYPO3 conversion
	}

		// TYPO3 conversion goes through UTF-8 as intermediate charset
	if ($fromCS != 'utf-8') {
		$str = $this->utf8_encode($str, $fromCS);
	}
	if ($toCS != 'utf-8') {
		$str = $this->utf8_decode($str, $toCS, $useEntityForNoChar);
	}

	return $str;
}
638
/**
 * Convert all string elements of an array from one charset to another charset.
 * Nested arrays are processed recursively; elements of other types are left untouched.
 * NOTICE: Array is passed by reference!
 *
 * @param array Input array, possibly multidimensional
 * @param string From charset (the current charset of the string)
 * @param string To charset (the output charset wanted)
 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return void
 * @see conv()
 */
function convArray(&$array, $fromCS, $toCS, $useEntityForNoChar = 0) {
	foreach ($array as $index => $entry) {
		if (is_array($entry)) {
				// Recurse on the element itself (not on the loop copy) so the change is kept
			$this->convArray($array[$index], $fromCS, $toCS, $useEntityForNoChar);
			continue;
		}
		if (is_string($entry)) {
			$array[$index] = $this->conv($array[$index], $fromCS, $toCS, $useEntityForNoChar);
		}
	}
}
659
/**
 * Converts $str from $charset to UTF-8
 *
 * Bytes 0-127 are copied unchanged (except for charsets listed in $this->twoByteSets,
 * where every character is read as a big endian 16 bit number). All other characters are
 * looked up in the parsed conversion table; characters without an entry are replaced
 * by chr($this->noCharByteVal).
 *
 * @param string String in local charset to convert to UTF-8
 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return string Output string, converted to UTF-8. NULL if initCharset() returns a false value for the charset.
 */
function utf8_encode($str, $charset) {
	if ($charset === 'utf-8') {
		return $str;
	}

		// Parse conv. table if not already...
	if (!$this->initCharset($charset)) {
		return NULL;
	}

		// These lookups do not change while the string is traversed
	$isTwoByteSet = isset($this->twoByteSets[$charset]);
	$isEucBasedSet = isset($this->eucBasedSets[$charset]);

	$strLen = strlen($str);
	$outStr = '';

	for ($a = 0; $a < $strLen; $a++) { // Traverse each char in string.
		$chr = substr($str, $a, 1);
		$ord = ord($chr);

		if ($isTwoByteSet) {
				// Two bytes per char, big endian assumed. substr() replaces the $str{...} offset
				// syntax (removed in PHP 8) and yields 0 instead of a notice for a missing second byte.
			$a++;
			$ord = $ord << 8 | ord(substr($str, $a, 1));
		} elseif ($ord > 127) {
				// EUC uses two bytes above 127: get both, advance the pointer and make $ord a 16bit int.
				// Shift-JIS: chars between 160 and 223 are single byte.
			if ($isEucBasedSet && ($charset != 'shift_jis' || $ord < 0xA0 || $ord > 0xDF)) {
				$a++;
				$ord = $ord * 256 + ord(substr($str, $a, 1));
			}
		} else {
				// ASCII 0-127 and one byte. Transparent
			$outStr .= $chr;
			continue;
		}

			// Use the UTF-8 sequence from the parsed conv. table if the local char number is found there
		if (isset($this->parsedCharsets[$charset]['local'][$ord])) {
			$outStr .= $this->parsedCharsets[$charset]['local'][$ord];
		} else { // No char exists
			$outStr .= chr($this->noCharByteVal);
		}
	}

	return $outStr;
}
712
/**
 * Converts $str from UTF-8 to $charset
 *
 * ASCII bytes are copied unchanged. Each multibyte sequence is looked up in the parsed
 * conversion table; a sequence without an entry becomes a numeric entity (if requested)
 * or chr($this->noCharByteVal).
 *
 * @param string String in UTF-8 to convert to local charset
 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Output string, converted to local charset. NULL if initCharset() returns a false value for the charset.
 */
function utf8_decode($str, $charset, $useEntityForNoChar = 0) {
	if ($charset === 'utf-8') {
		return $str;
	}

		// Parse conv. table if not already...
	if (!$this->initCharset($charset)) {
		return NULL;
	}

	$length = strlen($str);
	$result = '';

	for ($pos = 0; $pos < $length; $pos++) { // Traverse each byte of the UTF-8 string.
		$byte = substr($str, $pos, 1);
		$code = ord($byte);

		if ($code <= 127) { // ASCII 0-127 and one byte. Transparent
			$result .= $byte;
			continue;
		}
		if (!($code & 64)) { // A first byte must have the 7th bit set, so this is the MIDDLE of a sequence
			$result .= chr($this->noCharByteVal);
			continue;
		}

			// Collect the sequence: every further high bit set in the first byte announces one more byte
		$sequence = $byte;
		for ($n = 0; $n < 8; $n++) {
			$code = $code << 1;
			if (!($code & 128)) {
				break;
			}
			$pos++;
			$sequence .= substr($str, $pos, 1);
		}

		if (isset($this->parsedCharsets[$charset]['utf8'][$sequence])) {
			$local = $this->parsedCharsets[$charset]['utf8'][$sequence]; // The local number
				// A local number greater than 255 is split into two chars (16bit word assumed)
			$result .= $local > 255
				? chr(($local >> 8) & 255) . chr($local & 255)
				: chr($local);
		} elseif ($useEntityForNoChar) { // Create num entity:
			$result .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
		} else { // No char exists
			$result .= chr($this->noCharByteVal);
		}
	}

	return $result;
}
771
/**
 * Converts all chars > 127 to numeric entities.
 * Each UTF-8 multibyte sequence becomes a hexadecimal entity built with utf8CharToUnumber();
 * a stray continuation byte becomes chr($this->noCharByteVal).
 *
 * @param string Input string
 * @return string Output string
 */
function utf8_to_entities($str) {
	$length = strlen($str);
	$result = '';

	for ($pos = 0; $pos < $length; $pos++) { // Traverse each byte of the UTF-8 string.
		$byte = substr($str, $pos, 1);
		$code = ord($byte);

		if ($code <= 127) { // ASCII 0-127 and one byte. Transparent
			$result .= $byte;
			continue;
		}
		if (!($code & 64)) { // A first byte must have the 7th bit set, so this is the MIDDLE of a sequence
			$result .= chr($this->noCharByteVal);
			continue;
		}

			// Collect the sequence: every further high bit set in the first byte announces one more byte
		$sequence = $byte;
		for ($n = 0; $n < 8; $n++) {
			$code = $code << 1;
			if (!($code & 128)) {
				break;
			}
			$pos++;
			$sequence .= substr($str, $pos, 1);
		}

		$result .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
	}

	return $result;
}
809
/**
 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
 *
 * Numeric entities are only converted when they are well-formed ("#" followed by decimal
 * digits, or "#x" / "#X" followed by hexadecimal digits). Everything else that looks like an
 * entity is left untouched, unless it is a named entity and $alsoStdHtmlEnt is set.
 *
 * @param string Input string, UTF-8
 * @param boolean If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
 * @return string Output string
 */
function entities_to_utf8($str, $alsoStdHtmlEnt = FALSE) {
	$trans_tbl = array();
	if ($alsoStdHtmlEnt) {
		$trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8'));
	}

		// With PREG_SPLIT_DELIM_CAPTURE the result alternates between plain text (even keys)
		// and the entity names without "&" and ";" (odd keys)
	$parts = preg_split('/&([#[:alnum:]]*);/', $str, -1, PREG_SPLIT_DELIM_CAPTURE);
	foreach ($parts as $k => $v) {
			// only take every second element
		if ($k % 2 === 0) {
			continue;
		}

		$match = array();
		if (preg_match('/^#[xX]([0-9a-fA-F]+)$/', $v, $match)) { // Hex entity:
			$parts[$k] = $this->UnumberToChar(hexdec($match[1]));
		} elseif (preg_match('/^#([0-9]+)$/', $v, $match)) { // Dec entity:
			$parts[$k] = $this->UnumberToChar($match[1]);
		} elseif ($alsoStdHtmlEnt && isset($trans_tbl['&' . $v . ';'])) { // Other entities:
			$parts[$k] = $trans_tbl['&' . $v . ';'];
		} else { // No conversion (unknown name or malformed number):
			$parts[$k] = '&' . $v . ';';
		}
	}

	return implode('', $parts);
}
848
/**
 * Converts all chars in the input UTF-8 string into integer numbers returned in an array
 * A stray continuation byte is reported as $this->noCharByteVal.
 *
 * @param string Input string, UTF-8
 * @param boolean If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
 * @param boolean If set, then instead of integer numbers the real UTF-8 char is returned.
 * @return array Output array with the char numbers
 */
function utf8_to_numberarray($str, $convEntities = 0, $retChar = 0) {
		// If entities must be registered as well...:
	if ($convEntities) {
		$str = $this->entities_to_utf8($str, 1);
	}

	$length = strlen($str);
	$result = array();

	for ($pos = 0; $pos < $length; $pos++) { // Traverse each byte of the UTF-8 string.
		$byte = substr($str, $pos, 1);
		$code = ord($byte);

		if ($code <= 127) { // ASCII 0-127 and one byte. Transparent
			$result[] = $retChar ? chr($code) : $code;
			continue;
		}
		if (!($code & 64)) { // A first byte must have the 7th bit set, so this is the MIDDLE of a sequence
			$result[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
			continue;
		}

			// Collect the sequence: every further high bit set in the first byte announces one more byte
		$sequence = $byte;
		for ($n = 0; $n < 8; $n++) {
			$code = $code << 1;
			if (!($code & 128)) {
				break;
			}
			$pos++;
			$sequence .= substr($str, $pos, 1);
		}

		$result[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
	}

	return $result;
}
893
/**
 * Converts a UNICODE number to a UTF-8 multibyte character
 * Algorithm based on script found at From: http://czyborra.com/utf/
 *
 * The binary representation of the character's integer value is spread across the bytes and
 * the number of high bits set in the lead byte announces the number of bytes in the sequence:
 *
 *  bytes | bits | representation
 *      1 |    7 | 0vvvvvvv
 *      2 |   11 | 110vvvvv 10vvvvvv
 *      3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
 *      4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * Numbers of 0x80000000 and above cannot be expressed and yield chr($this->noCharByteVal).
 *
 * @param integer UNICODE integer
 * @return string UTF-8 multibyte character string
 * @see utf8CharToUnumber()
 */
function UnumberToChar($cbyte) {
	if ($cbyte < 0x80) {
		return chr($cbyte);
	}

		// One row per sequence length: exclusive upper limit, lead byte marker, number of continuation bytes
	$layouts = array(
		array(0x800, 0xC0, 1),
		array(0x10000, 0xE0, 2),
		array(0x200000, 0xF0, 3),
		array(0x4000000, 0xF8, 4),
		array(0x80000000, 0xFC, 5),
	);

	foreach ($layouts as $layout) {
		list($limit, $leadMarker, $continuationBytes) = $layout;
		if ($cbyte < $limit) {
				// The lead byte carries the highest bits, each continuation byte the next 6 bits
			$char = chr($leadMarker | ($cbyte >> (6 * $continuationBytes)));
			for ($n = $continuationBytes - 1; $n >= 0; $n--) {
				$char .= chr(0x80 | (($cbyte >> (6 * $n)) & 0x3F));
			}
			return $char;
		}
	}

		// Cannot express a 32-bit character in UTF-8
	return chr($this->noCharByteVal);
}
958
959 /**
960 * Converts a UTF-8 Multibyte character to a UNICODE number
961 * Unit-tested by Kasper
962 *
963 * @param string UTF-8 multibyte character string
964 * @param boolean If set, then a hex. number is returned.
965 * @return integer UNICODE integer
966 * @see UnumberToChar()
967 */
function utf8CharToUnumber($str, $hex = 0) {
    // Byte value of the first byte of the input
    $lead = ord(substr($str, 0, 1));

    if (($lead & 192) != 192) {
        // Top two bits are not both set: the byte value itself is the result.
        $codePoint = $lead;
    } else {
        // Lead byte of a multibyte sequence: every additional high bit that is set
        // (after the first) announces one more continuation byte.
        $payloadBits = '';
        $shifted = $lead;
        $trailing = 0;
        while ($trailing < 8) {
            $shifted = $shifted << 1;
            if (!($shifted & 128)) {
                break;
            }
            // Append the lower six bits of the next byte as a binary string.
            $payloadBits .= substr('00000000' . decbin(ord(substr($str, $trailing + 1, 1))), -6);
            $trailing++;
        }
        // The lead byte contributes its lowest (6 - number of continuation bytes) bits.
        $leadBits = substr('00000000' . decbin($lead), -(6 - $trailing));

        $codePoint = bindec($leadBits . $payloadBits);
    }

    // Optionally return the value as a hex string prefixed with "x".
    return $hex ? 'x' . dechex($codePoint) : $codePoint;
}
990
991
992 /********************************************
993 *
994 * Init functions
995 *
996 ********************************************/
997
998 /**
999 * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
1000 * This function is automatically called by the conversion functions
1001 *
1002 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
1003 *
1004 * @param string The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
1005 * @return integer Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed.
1006 * @access private
1007 */
function initCharset($charset) {
    // Nothing to do when the table for this charset is already held in memory.
    if (is_array($this->parsedCharsets[$charset])) {
        return 1;
    }

    // The conversion table must exist as csconvtbl/[charset].tbl below PATH_t3lib.
    $tableFile = PATH_t3lib . 'csconvtbl/' . $charset . '.tbl';
    if (!($charset && t3lib_div::validPathStr($tableFile) && @is_file($tableFile))) {
        return FALSE;
    }

    // A serialized copy of the parsed table is kept in typo3temp/cs/; use it when present.
    // (Caching brought parsing time for gb2312 down from 2400 ms to 150 ms.)
    $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $this->parsedCharsets[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
        return 2;
    }

    // No cache: read the table file, one entry per line.
    $rows = t3lib_div::trimExplode(LF, t3lib_div::getUrl($tableFile), 1);
    $this->parsedCharsets[$charset] = array('local' => array(), 'utf8' => array());

    // "whitespaced" lines look like "0x0A 0x000A #LINE FEED",
    // "ms-token" lines look like "B9 = U+00B9 : SUPERSCRIPT ONE".
    $whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
    $format = '';

    foreach ($rows as $row) {
        // Blank lines and comment lines are ignored.
        if (!trim($row) || substr($row, 0, 1) == '#') {
            continue;
        }

        // The file format is decided once, on the first real line.
        if (!$format) {
            $format = preg_match($whitespacedPattern, $row) ? 'whitespaced' : 'ms-token';
        }

        if ($format == 'ms-token') {
            list($hexbyte, $utf8) = preg_split('/[=:]/', $row, 3);
        } else {
            $regA = array();
            preg_match($whitespacedPattern, $row, $regA);
            $hexbyte = $regA[1];
            $utf8 = 'U+' . $regA[2];
        }

        // Only values above 127 are stored in the table.
        $localCode = hexdec(trim($hexbyte));
        if ($localCode > 127) {
            $utf8Char = $this->UnumberToChar(hexdec(substr(trim($utf8), 2)));
            $this->parsedCharsets[$charset]['local'][$localCode] = $utf8Char;
            $this->parsedCharsets[$charset]['utf8'][$utf8Char] = $localCode;
        }
    }

    // Store the parsed table for subsequent requests.
    if ($cacheFile) {
        t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->parsedCharsets[$charset]));
    }

    return 2;
}
1066
1067 /**
1068 * This function initializes all UTF-8 character data tables.
1069 *
1070 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
1071 *
1072 * @param string Mode ("case", "ascii", ...)
1073 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
1074 * @access private
1075 */
function initUnicodeData($mode = NULL) {
    // cache files
    $cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
    $cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');

    // Only process if the requested table is neither loaded nor cached
    switch ($mode) {
        case 'case':
            if (is_array($this->caseFolding['utf-8'])) {
                return 1;
            }

            // Use cached version if possible
            if ($cacheFileCase && @is_file($cacheFileCase)) {
                $this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase));
                return 2;
            }
            break;

        case 'ascii':
            if (is_array($this->toASCII['utf-8'])) {
                return 1;
            }

            // Use cached version if possible
            if ($cacheFileASCII && @is_file($cacheFileASCII)) {
                $this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII));
                return 2;
            }
            break;
    }

    // process main Unicode data file
    $unicodeDataFile = PATH_t3lib . 'unidata/UnicodeData.txt';
    if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
        return FALSE;
    }

    $fh = fopen($unicodeDataFile, 'rb');
    if (!$fh) {
        return FALSE;
    }

    // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
    // note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
    $this->caseFolding['utf-8'] = array();
    $utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand
    $utf8CaseFolding['toUpper'] = array();
    $utf8CaseFolding['toLower'] = array();
    $utf8CaseFolding['toTitle'] = array();

    $decomposition = array(); // array of temp. decompositions
    $mark = array(); // array of chars that are marks (eg. composing accents)
    $number = array(); // array of chars that are numbers (eg. digits)
    $omit = array(); // array of chars to be omitted (eg. Russian hard sign)

    while (!feof($fh)) {
        $line = rtrim((string) fgets($fh, 4096));
        // A blank line (typically the read at end of file) carries no fields;
        // skip it instead of destructuring a one-element array.
        if ($line === '') {
            continue;
        }

        // has a lot of info
        list($char, $name, $cat, , , $decomp, , , $num, , , , $upper, $lower, $title,) = explode(';', $line);

        $ord = hexdec($char);
        if ($ord > 0xFFFF) {
            break;
        } // only process the BMP

        $utf8_char = $this->UnumberToChar($ord);

        if ($upper) {
            $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
        }
        if ($lower) {
            $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
        }
        // store "title" only when different from "upper" (only a few)
        if ($title && $title != $upper) {
            $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
        }

        // The first letter of the general category selects the class.
        // substr() is used instead of the removed curly-brace offset syntax.
        switch (substr((string) $cat, 0, 1)) {
            case 'M': // mark (accent, umlaut, ...)
                $mark["U+$char"] = 1;
                break;

            case 'N': // numeric value
                if ($ord > 0x80 && $num != '') {
                    $number["U+$char"] = $num;
                }
                break;
        }

        // accented Latin letters without "official" decomposition
        $match = array();
        if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
            $c = ord($match[2]);
            if ($match[1] == 'SMALL') {
                $c += 32;
            }

            $decomposition["U+$char"] = array(dechex($c));
            continue;
        }

        $match = array();
        if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
            switch ($match[1]) {
                case '<circle>': // add parenthesis as circle replacement, eg (1)
                    $match[2] = '0028 ' . $match[2] . ' 0029';
                    break;

                case '<square>': // add square brackets as square replacement, eg [1]
                    $match[2] = '005B ' . $match[2] . ' 005D';
                    break;

                case '<compat>': // ignore multi char decompositions that start with a space
                    if (preg_match('/^0020 /', $match[2])) {
                        continue 2;
                    }
                    break;

                // ignore Arabic and vertical layout presentation decomposition
                case '<initial>':
                case '<medial>':
                case '<final>':
                case '<isolated>':
                case '<vertical>':
                    continue 2;
            }
            $decomposition["U+$char"] = explode(' ', $match[2]);
        }
    }
    fclose($fh);

    // process additional Unicode data for casing (allow folded characters to expand into a sequence)
    $specialCasingFile = PATH_t3lib . 'unidata/SpecialCasing.txt';
    if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
        $fh = fopen($specialCasingFile, 'rb');
        if ($fh) {
            while (!feof($fh)) {
                $line = (string) fgets($fh, 4096);
                // Skip comment lines (first byte is "#") and blank lines
                if (substr($line, 0, 1) != '#' && trim($line) != '') {

                    list($char, $lower, $title, $upper, $cond) = t3lib_div::trimExplode(';', $line);
                    // Only unconditional mappings are used: the 5th field is empty or a trailing comment
                    if ($cond == '' || substr($cond, 0, 1) == '#') {
                        $utf8_char = $this->UnumberToChar(hexdec($char));
                        if ($char != $lower) {
                            $arr = explode(' ', $lower);
                            for ($i = 0; isset($arr[$i]); $i++) {
                                $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
                            }
                            $utf8CaseFolding['toLower'][$utf8_char] = implode('', $arr);
                        }
                        if ($char != $title && $title != $upper) {
                            $arr = explode(' ', $title);
                            for ($i = 0; isset($arr[$i]); $i++) {
                                $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
                            }
                            $utf8CaseFolding['toTitle'][$utf8_char] = implode('', $arr);
                        }
                        if ($char != $upper) {
                            $arr = explode(' ', $upper);
                            for ($i = 0; isset($arr[$i]); $i++) {
                                $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
                            }
                            $utf8CaseFolding['toUpper'][$utf8_char] = implode('', $arr);
                        }
                    }
                }
            }
            fclose($fh);
        }
    }

    // process custom decompositions
    $customTranslitFile = PATH_t3lib . 'unidata/Translit.txt';
    if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
        $fh = fopen($customTranslitFile, 'rb');
        if ($fh) {
            while (!feof($fh)) {
                $line = (string) fgets($fh, 4096);
                // Skip comment lines (first byte is "#") and blank lines
                if (substr($line, 0, 1) != '#' && trim($line) != '') {
                    list($char, $translit) = t3lib_div::trimExplode(';', $line);
                    // An empty transliteration means the character is to be dropped entirely
                    if (!$translit) {
                        $omit["U+$char"] = 1;
                    }
                    $decomposition["U+$char"] = explode(' ', $translit);

                }
            }
            fclose($fh);
        }
    }

    // decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
    foreach ($decomposition as $from => $to) {
        $code_decomp = array();

        while ($code_value = array_shift($to)) {
            if (isset($decomposition["U+$code_value"])) { // do recursive decomposition
                foreach (array_reverse($decomposition["U+$code_value"]) as $cv) {
                    array_unshift($to, $cv);
                }
            } elseif (!isset($mark["U+$code_value"])) { // remove mark
                array_push($code_decomp, $code_value);
            }
        }
        if (count($code_decomp) || isset($omit[$from])) {
            $decomposition[$from] = $code_decomp;
        } else {
            unset($decomposition[$from]);
        }
    }

    // create ascii only mapping
    $this->toASCII['utf-8'] = array();
    $ascii =& $this->toASCII['utf-8'];

    foreach ($decomposition as $from => $to) {
        $code_decomp = array();
        while ($code_value = array_shift($to)) {
            $ord = hexdec($code_value);
            if ($ord > 127) {
                continue 2;
            } // skip decompositions containing non-ASCII chars
            else {
                array_push($code_decomp, chr($ord));
            }
        }
        $ascii[$this->UnumberToChar(hexdec($from))] = join('', $code_decomp);
    }

    // add numeric decompositions
    foreach ($number as $from => $to) {
        $utf8_char = $this->UnumberToChar(hexdec($from));
        if (!isset($ascii[$utf8_char])) {
            $ascii[$utf8_char] = $to;
        }
    }

    // Both tables are always built together, so both caches are written
    if ($cacheFileCase) {
        t3lib_div::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
    }

    if ($cacheFileASCII) {
        t3lib_div::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
    }

    return 3;
}
1325
1326 /**
1327 * This function initializes the folding table for a charset other than UTF-8.
1328 * This function is automatically called by the case folding functions.
1329 *
1330 * @param string Charset for which to initialize case folding.
1331 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
1332 * @access private
1333 */
function initCaseFolding($charset) {
    // Table already in memory?
    if (is_array($this->caseFolding[$charset])) {
        return 1;
    }

    // A serialized table from an earlier request?
    $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
        return 2;
    }

    // The table is derived from the charset's UTF-8 conversion table and
    // the UTF-8 case folding table; both have to be available.
    if (!$this->initCharset($charset) || !$this->initUnicodeData('case')) {
        return FALSE;
    }

    $nochar = chr($this->noCharByteVal);
    $directions = array('toUpper', 'toLower', 'toTitle');

    foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
        // reconvert to charset (don't use chr() of numeric value, might be multi-byte)
        $localChar = $this->utf8_decode($utf8, $charset);

        foreach ($directions as $direction) {
            // Translate the UTF-8 folding result back into the local charset and
            // keep it only when it is non-empty and not the "no character" byte.
            $folded = $this->utf8_decode($this->caseFolding['utf-8'][$direction][$utf8], $charset);
            if ($folded != '' && $folded != $nochar) {
                $this->caseFolding[$charset][$direction][$localChar] = $folded;
            }
        }
    }

    // add the ASCII case table (a-z <-> A-Z differ by 32)
    foreach (range(ord('a'), ord('z')) as $lowerCode) {
        $upperCode = $lowerCode - 32;
        $this->caseFolding[$charset]['toUpper'][chr($lowerCode)] = chr($upperCode);
        $this->caseFolding[$charset]['toLower'][chr($upperCode)] = chr($lowerCode);
    }

    if ($cacheFile) {
        t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->caseFolding[$charset]));
    }

    return 3;
}
1395
1396 /**
1397 * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
1398 * This function is automatically called by the ASCII transliteration functions.
1399 *
1400 * @param string Charset for which to initialize conversion.
1401 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
1402 * @access private
1403 */
function initToASCII($charset) {
    // Table already in memory?
    if (is_array($this->toASCII[$charset])) {
        return 1;
    }

    // A serialized table from an earlier request?
    $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $this->toASCII[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
        return 2;
    }

    // The table is derived from the charset's UTF-8 conversion table and
    // the UTF-8 to ASCII transliteration table; both have to be available.
    if (!$this->initCharset($charset) || !$this->initUnicodeData('ascii')) {
        return FALSE;
    }

    foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
        // reconvert to charset (don't use chr() of numeric value, might be multi-byte)
        $localChar = $this->utf8_decode($utf8, $charset);

        // Copy the transliteration over when the UTF-8 table has one for this character.
        if (isset($this->toASCII['utf-8'][$utf8])) {
            $this->toASCII[$charset][$localChar] = $this->toASCII['utf-8'][$utf8];
        }
    }

    if ($cacheFile) {
        t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->toASCII[$charset]));
    }

    return 3;
}
1443
1444
1445 /********************************************
1446 *
1447 * String operation functions
1448 *
1449 ********************************************/
1450
1451 /**
1452 * Returns a part of a string.
1453 * Unit-tested by Kasper (single byte charsets only)
1454 *
1455 * @param string The character set
1456 * @param string Character string
1457 * @param integer Start position (character position)
1458 * @param integer Length (in characters)
1459 * @return string The substring
1460 * @see substr(), mb_substr()
1461 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1462 */
function substr($charset, $string, $start, $len = NULL) {
    if ($len === 0 || $string === '') {
        return '';
    }

    $backend = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];

    if ($backend == 'mbstring') {
        // cannot omit $len, when specifying charset
        // (loose comparison kept as in the original implementation)
        if ($len == NULL) {
            $enc = mb_internal_encoding(); // save internal encoding
            mb_internal_encoding($charset);
            $str = mb_substr($string, $start);
            mb_internal_encoding($enc); // restore internal encoding

            return $str;
        }
        return mb_substr($string, $start, $len, $charset);
    }

    if ($backend == 'iconv') {
        // cannot omit $len, when specifying charset
        // (loose comparison kept as in the original implementation)
        if ($len == NULL) {
            $enc = iconv_get_encoding('internal_encoding'); // save internal encoding
            iconv_set_encoding('internal_encoding', $charset);
            $str = iconv_substr($string, $start);
            iconv_set_encoding('internal_encoding', $enc); // restore internal encoding

            return $str;
        }
        return iconv_substr($string, $start, $len, $charset);
    }

    if ($charset == 'utf-8') {
        return $this->utf8_substr($string, $start, $len);
    }
    if ($this->eucBasedSets[$charset]) {
        return $this->euc_substr($string, $start, $charset, $len);
    }

    // Fixed-width encodings: scale character positions to byte positions.
    // An omitted length must stay omitted; NULL * width would be 0 and
    // native substr() with length 0 returns an empty string.
    if ($this->twoByteSets[$charset]) {
        return $len === NULL ? substr($string, $start * 2) : substr($string, $start * 2, $len * 2);
    }
    if ($this->fourByteSets[$charset]) {
        return $len === NULL ? substr($string, $start * 4) : substr($string, $start * 4, $len * 4);
    }

    // treat everything else as single-byte encoding
    return $len === NULL ? substr($string, $start) : substr($string, $start, $len);
}
1507
1508 /**
1509 * Counts the number of characters.
1510 * Unit-tested by Kasper (single byte charsets only)
1511 *
1512 * @param string The character set
1513 * @param string Character string
1514 * @return integer The number of characters
1515 * @see strlen()
1516 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1517 */
function strlen($charset, $string) {
    // Configured helper extension, if any
    $backend = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];
    if ($backend == 'mbstring') {
        return mb_strlen($string, $charset);
    }
    if ($backend == 'iconv') {
        return iconv_strlen($string, $charset);
    }

    // Variable-width charsets are counted by the internal helpers
    if ($charset == 'utf-8') {
        return $this->utf8_strlen($string);
    }
    if ($this->eucBasedSets[$charset]) {
        return $this->euc_strlen($string, $charset);
    }

    // Fixed-width charsets: byte length divided by the character width
    $byteLength = strlen($string);
    if ($this->twoByteSets[$charset]) {
        return $byteLength / 2;
    }
    if ($this->fourByteSets[$charset]) {
        return $byteLength / 4;
    }

    // treat everything else as single-byte encoding
    return $byteLength;
}
1535
1536 /**
1537 * Method to crop strings using the mb_substr function.
1538 *
1539 * @param string The character set
1540 * @param string String to be cropped
1541 * @param integer Crop length (in characters)
1542 * @param string Crop signifier
1543 * @return string The shortened string
1544 * @see mb_strlen(), mb_substr()
1545 */
protected function cropMbstring($charset, $string, $len, $crop = '') {
    // Nothing to crop: no usable length given, or the string is short enough already.
    $fitsAlready = intval($len) === 0 || mb_strlen($string, $charset) <= abs($len);
    if ($fitsAlready) {
        return $string;
    }

    // Positive length: keep the head and append the crop signifier.
    if ($len > 0) {
        return mb_substr($string, 0, $len, $charset) . $crop;
    }

    // Negative length: keep the tail and prepend the crop signifier.
    return $crop . mb_substr($string, $len, mb_strlen($string, $charset), $charset);
}
1559
1560 /**
1561 * Truncates a string and pre-/appends a string.
1562 * Unit tested by Kasper
1563 *
1564 * @param string The character set
1565 * @param string Character string
1566 * @param integer Length (in characters)
1567 * @param string Crop signifier
1568 * @return string The shortened string
1569 * @see substr(), mb_strimwidth()
1570 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1571 */
function crop($charset, $string, $len, $crop = '') {
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
        return $this->cropMbstring($charset, $string, $len, $crop);
    }

    // A length of zero means "do not crop"
    if (intval($len) == 0) {
        return $string;
    }

    // Translate the character position into a byte position ($i);
    // FALSE signals that the position lies outside the string.
    if ($charset == 'utf-8') {
        $i = $this->utf8_char2byte_pos($string, $len);
    } elseif ($this->eucBasedSets[$charset]) {
        $i = $this->euc_char2byte_pos($string, $len, $charset);
    } else {
        // everything else is treated as single-byte encoding
        if ($len > 0) {
            $i = $len;
        } else {
            $i = strlen($string) + $len;
            if ($i <= 0) {
                $i = FALSE;
            }
        }
    }

    if ($i === FALSE) { // $len outside actual string length
        return $string;
    }

    // isset() on a string offset tells whether a byte exists at that position.
    // It replaces the former strlen($string{...}) test, which used the removed
    // curly-brace offset syntax and raised a notice for out-of-range reads.
    if ($len > 0) {
        // Crop only when there is something after the cut position
        if (isset($string[$i])) {
            return substr($string, 0, $i) . $crop;
        }
    } else {
        // Crop only when there is something before the cut position
        if (isset($string[$i - 1])) {
            return $crop . substr($string, $i);
        }
    }

    return $string;
}
1622
1623 /**
1624 * Cuts a string short at a given byte length.
1625 *
1626 * @param string The character set
1627 * @param string Character string
1628 * @param integer The byte length
1629 * @return string The shortened string
1630 * @see mb_strcut()
1631 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1632 */
function strtrunc($charset, $string, $len) {
    // Nothing can be kept for a non-positive byte length
    if ($len <= 0) {
        return '';
    }

    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
        return mb_strcut($string, 0, $len, $charset);
    }
    if ($charset == 'utf-8') {
        return $this->utf8_strtrunc($string, $len);
    }
    if ($this->eucBasedSets[$charset]) {
        return $this->euc_strtrunc($string, $len, $charset);
    }

    // Fixed-width charsets: move the cut back to a character boundary
    if ($this->twoByteSets[$charset]) {
        $len -= $len % 2; // don't cut at odd positions
    } elseif ($this->fourByteSets[$charset]) {
        $len -= $len % 4; // realign to position dividable by four
    }

    // treat everything else as single-byte encoding
    return substr($string, 0, $len);
}
1655
1656 /**
1657 * Translates all characters of a string into their respective case values.
1658 * Unlike strtolower() and strtoupper() this method is locale independent.
1659 * Note that the string length may change!
1660 * eg. lower case German "ß" (sharp S) becomes upper case "SS"
1661 * Unit-tested by Kasper
1662 * Real case folding is language dependent, this method ignores this fact.
1663 *
1664 * @param string Character set of string
1665 * @param string Input string to convert case for
1666 * @param string Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
1667 * @return string The converted string
1668 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1669 * @see strtolower(), strtoupper()
1670 */
function conv_case($charset, $string, $case) {
    // mbstring handles the conversion itself; anything but "toLower" means upper case
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
        return $case == 'toLower'
            ? mb_strtolower($string, $charset)
            : mb_strtoupper($string, $charset);
    }

    // Otherwise dispatch to the mapping routine matching the charset family
    if ($charset == 'utf-8') {
        return $this->utf8_char_mapping($string, 'case', $case);
    }
    if (isset($this->eucBasedSets[$charset])) {
        return $this->euc_char_mapping($string, $charset, 'case', $case);
    }

    // treat everything else as single-byte encoding
    return $this->sb_char_mapping($string, $charset, 'case', $case);
}
1689
1690 /**
1691 * Equivalent of lcfirst/ucfirst but using character set.
1692 *
1693 * @param string $charset
1694 * @param string $string
1695 * @param string $case
1696 * @return string
1697 * @see t3lib_cs::conv_case()
1698 */
public function convCaseFirst($charset, $string, $case) {
    // Convert the case of the first character only, then re-attach the untouched rest.
    $head = $this->conv_case($charset, $this->substr($charset, $string, 0, 1), $case);
    $tail = $this->substr($charset, $string, 1);

    return $head . $tail;
}
1705
1706 /**
1707 * Converts special chars (like æøåÆØÅ, umlauts etc) to ascii equivalents (usually double-bytes, like æ => ae etc.)
1708 *
1709 * @param string $charset Character set of string
1710 * @param string $string Input string to convert
1711 * @return string The converted string
1712 */
function specCharsToASCII($charset, $string) {
    // Dispatch to the 'ascii' mapping routine matching the charset family
    if ($charset == 'utf-8') {
        return $this->utf8_char_mapping($string, 'ascii');
    }
    if (isset($this->eucBasedSets[$charset])) {
        return $this->euc_char_mapping($string, $charset, 'ascii');
    }

    // treat everything else as single-byte encoding
    return $this->sb_char_mapping($string, $charset, 'ascii');
}
1725
1726
1727 /**
1728 * converts the language codes that we get from the client (usually HTTP_ACCEPT_LANGUAGE)
1729 * into a TYPO3-readable language code
1730 * @param $languageCodesList list of language codes. something like 'de,en-us;q=0.9,de-de;q=0.7,es-cl;q=0.6,en;q=0.4,es;q=0.3,zh;q=0.1'
1731 * see http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4
1732 * @return string a preferred language that TYPO3 supports, or "default" if none found
1733 * @author Benjamin Mack (benni.typo3.org)
1734 */
public function getPreferredClientLanguage($languageCodesList) {
    $allLanguageCodes = array();
    $selectedLanguage = 'default';

    // get all languages where TYPO3 code is the same as the ISO code
    foreach ($this->charSetArray as $typo3Lang => $charSet) {
        $allLanguageCodes[$typo3Lang] = $typo3Lang;
    }

    // get all languages where TYPO3 code differs from ISO code
    // or needs the country part
    // the iso codes will here overwrite the default typo3 language in the key
    foreach ($this->locales->getIsoMapping() as $typo3Lang => $isoLang) {
        // the mapping uses "_" as separator, the client sends "-"
        $allLanguageCodes[$typo3Lang] = str_replace('_', '-', $isoLang);
    }

    // move the iso codes to the keys (because we're comparing the keys with "isset" later on)
    $allLanguageCodes = array_flip($allLanguageCodes);

    $preferredLanguages = t3lib_div::trimExplode(',', $languageCodesList);
    // collect the preferred languages with their quality value (defaults to 1.0)
    $sortedPreferredLanguages = array();
    foreach ($preferredLanguages as $preferredLanguage) {
        $quality = 1.0;
        if (strpos($preferredLanguage, ';q=') !== FALSE) {
            list($preferredLanguage, $quality) = explode(';q=', $preferredLanguage);
        }
        $sortedPreferredLanguages[$preferredLanguage] = $quality;
    }

    // loop through the languages, with the highest priority first
    arsort($sortedPreferredLanguages, SORT_NUMERIC);
    foreach ($sortedPreferredLanguages as $preferredLanguage => $quality) {
        // first try the full code (language plus country) ...
        if (isset($allLanguageCodes[$preferredLanguage])) {
            $selectedLanguage = $allLanguageCodes[$preferredLanguage];
            break;
        }

        // ... then the language part alone. Only the first segment is read, so a
        // code without a country part ("de") no longer triggers an undefined offset.
        $codeParts = explode('-', $preferredLanguage);
        $languageOnly = $codeParts[0];
        if (isset($allLanguageCodes[$languageOnly])) {
            $selectedLanguage = $allLanguageCodes[$languageOnly];
            break;
        }
    }

    // English is reported as "default"
    if (!$selectedLanguage || $selectedLanguage == 'en') {
        $selectedLanguage = 'default';
    }
    return $selectedLanguage;
}
1787
1788
1789 /********************************************
1790 *
1791 * Internal string operation functions
1792 *
1793 ********************************************/
1794
1795 /**
1796 * Maps all characters of a string in a single byte charset.
1797 *
1798 * @param string the string
1799 * @param string the charset
1800 * @param string mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
1801 * @param string 'case': conversion 'toLower' or 'toUpper'
1802 * @return string the converted string
1803 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1804 */
function sb_char_mapping($str, $charset, $mode, $opt = '') {
    // Select the mapping table; when it cannot be initialized the string is returned untouched.
    switch ($mode) {
        case 'case':
            if (!$this->initCaseFolding($charset)) {
                return $str;
            } // do nothing
            $map =& $this->caseFolding[$charset][$opt];
            break;

        case 'ascii':
            if (!$this->initToASCII($charset)) {
                return $str;
            } // do nothing
            $map =& $this->toASCII[$charset];
            break;

        default:
            return $str;
    }

    // Map byte by byte; bytes without a table entry are copied as they are.
    // The loop is bounded by the byte length instead of probing $str{$i} until an
    // out-of-range read: that syntax is removed in PHP 8 and raised a notice.
    $out = '';
    $byteCount = strlen($str);
    for ($i = 0; $i < $byteCount; $i++) {
        $c = $str[$i];
        $out .= isset($map[$c]) ? $map[$c] : $c;
    }

    return $out;
}
1837
1838
1839 /********************************************
1840 *
1841 * Internal UTF-8 string operation functions
1842 *
1843 ********************************************/
1844
1845 /**
1846 * Returns a part of a UTF-8 string.
1847 * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
1848 *
1849 * @param string UTF-8 string
1850 * @param integer Start position (character position)
1851 * @param integer Length (in characters)
1852 * @return string The substring
1853 * @see substr()
1854 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
1855 */
function utf8_substr($str, $start, $len = NULL) {
    // A length of zero (0 or '0') always yields an empty string.
    // The string cast gives the same result as the former strcmp() test
    // without passing NULL to a string function.
    if ((string) $len === '0') {
        return '';
    }

    // Translate the character position into a byte position
    $byte_start = $this->utf8_char2byte_pos($str, $start);
    if ($byte_start === FALSE) {
        if ($start > 0) {
            return FALSE; // $start outside string length
        }
        // Negative start reaching before the beginning: begin at byte 0.
        // (Previously $start was reset here and the FALSE in $byte_start was
        // silently coerced to 0 by substr().)
        $byte_start = 0;
    }

    $str = substr($str, $byte_start);

    // No length given: return everything from the start position
    if ($len == NULL) {
        return $str;
    }

    $byte_end = $this->utf8_char2byte_pos($str, $len);
    if ($byte_end === FALSE) { // $len outside actual string length
        // When length is less than zero and exceeds, then we return blank string.
        return $len < 0 ? '' : $str;
    }

    return substr($str, 0, $byte_end);
}
1887
/**
 * Counts the number of characters of a string in UTF-8.
 * Every byte that is not a continuation byte (10xxxxxx) is counted as the start of one character.
 *
 * @param string $str UTF-8 multibyte character string
 * @return integer The number of characters
 * @see strlen()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strlen($str) {
	$str = (string) $str;
	$byteLength = strlen($str);
	$n = 0;
		// Loop over the real byte length; reading one offset past the end to detect
		// the end of the string raises a warning and the curly offset form no
		// longer parses on PHP 8.
	for ($i = 0; $i < $byteLength; $i++) {
			// single-byte (0xxxxxxx) and multi-byte starting bytes (11xxxxxx) count,
			// continuation bytes (10xxxxxx) do not
		if ((ord($str[$i]) & 0xC0) != 0x80) {
			$n++;
		}
	}
	return $n;
}
1912
/**
 * Truncates a string in UTF-8 short at a given byte length.
 * The result is at most $len bytes long and never ends inside a multi-byte character:
 * a character that would be split by the limit is dropped completely.
 *
 * @param string $str UTF-8 multibyte character string
 * @param integer $len the byte length
 * @return string the shortened string
 * @see mb_strcut()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strtrunc($str, $len) {
	$str = (string) $str;
	$len = (int) $len;

	if ($len <= 0) {
		return ''; // nothing fits; also avoids reading offset -1
	}
	if ($len > strlen($str)) {
		return $str; // string shorter than supplied length
	}

	$i = $len - 1;
	if (ord($str[$i]) & 0x80) { // last kept byte is part of a multibyte sequence
			// walk back over continuation bytes (10xxxxxx) to the byte that starts the character
		while ($i > 0 && (ord($str[$i]) & 0xC0) == 0x80) {
			$i--;
		}
		$lead = ord($str[$i]);
		if (($lead & 0xC0) == 0x80) {
			return ''; // only continuation bytes up to the start of the string: no complete character
		}
			// The number of leading 1-bits of the starting byte is the byte count of
			// the character. A starting byte at offset 0 is valid and must be measured too.
		for ($bc = 0; $lead & 0x80; $lead = $lead << 1) {
			$bc++;
		}
		if ($bc + $i > $len) {
			return substr($str, 0, $i); // character would be split: cut in front of it
		}
			// fallthru: multibyte char fits into length
	}
	return substr($str, 0, $len);
}
1943
/**
 * Find position of first occurrence of a string, both arguments are in UTF-8.
 * Delegates to mbstring or iconv when $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'] selects one of them,
 * otherwise the position is computed with the byte/character translation methods of this class.
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 string to search for
 * @param integer $offset Position to start the search (character position)
 * @return integer The character position, FALSE if the needle was not found or $offset is outside the string
 * @see strpos()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strpos($haystack, $needle, $offset = 0) {
	$backend = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];
	if ($backend == 'mbstring') {
		return mb_strpos($haystack, $needle, $offset, 'utf-8');
	}
	if ($backend == 'iconv') {
		return iconv_strpos($haystack, $needle, $offset, 'utf-8');
	}

		// Pure PHP fallback: search on byte positions, then translate the hit back
		// into a character position.
	$startByte = $this->utf8_char2byte_pos($haystack, $offset);
	if ($startByte !== FALSE) { // FALSE: offset beyond string length
		$foundByte = strpos($haystack, $needle, $startByte);
		if ($foundByte !== FALSE) { // FALSE: needle not found
			return $this->utf8_byte2char_pos($haystack, $foundByte);
		}
	}

	return FALSE;
}
1973
/**
 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
 * Delegates to mbstring or iconv when $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'] selects one of them,
 * otherwise the byte position found by strrpos() is translated into a character position.
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 character to search for (single character)
 * @return integer The character position, FALSE if the needle was not found
 * @see strrpos()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strrpos($haystack, $needle) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
			// The third argument of mb_strrpos() is the search offset; the charset is
			// the fourth. Passing the charset in third position is rejected by current PHP.
		return mb_strrpos($haystack, $needle, 0, 'utf-8');
	} elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
		return iconv_strrpos($haystack, $needle, 'utf-8');
	}

	$byte_pos = strrpos($haystack, $needle);
	if ($byte_pos === FALSE) {
		return FALSE; // needle not found
	}

	return $this->utf8_byte2char_pos($haystack, $byte_pos);
}
1998 /**
1999 * Translates a character position into an 'absolute' byte position.
2000 * Unit tested by Kasper.
2001 *
2002 * @param string UTF-8 string
2003 * @param integer Character position (negative values start from the end)
2004 * @return integer Byte position
2005 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
2006 */
2007 function utf8_char2byte_pos($str, $pos) {
2008 $n = 0; // number of characters found
2009 $p = abs($pos); // number of characters wanted
2010
2011 if ($pos >= 0) {
2012 $i = 0;
2013 $d = 1;
2014 } else {
2015 $i = strlen($str) - 1;
2016 $d = -1;
2017 }
2018
2019 for (; strlen($str{$i}) && $n < $p; $i += $d) {
2020 $c = (int) ord($str{$i});
2021 if (!($c & 0x80)) // single-byte (0xxxxxx)
2022 {
2023 $n++;
2024 }
2025 elseif (($c & 0xC0) == 0xC0) // multi-byte starting byte (11xxxxxx)
2026 {
2027 $n++;
2028 }
2029 }
2030 if (!strlen($str{$i})) {
2031 return FALSE;
2032 } // offset beyond string length
2033
2034 if ($pos >= 0) {
2035 // skip trailing multi-byte data bytes
2036 while ((ord($str{$i}) & 0x80) && !(ord($str{$i}) & 0x40)) {
2037 $i++;
2038 }
2039 } else {
2040 // correct offset
2041 $i++;
2042 }
2043
2044 return $i;
2045 }
2046
/**
 * Translates an 'absolute' byte position into a character position.
 * Counts the character starts found in the bytes 1..$pos, which gives the index of the
 * character containing byte $pos as long as byte 0 starts a character.
 *
 * @param string $str UTF-8 string
 * @param integer $pos byte position
 * @return integer character position, FALSE if $pos is outside the string or the string is empty
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_byte2char_pos($str, $pos) {
	$str = (string) $str;
	$pos = (int) $pos;
	$byteLength = strlen($str);

		// Validate the requested position itself. Checking the offset the loop ends on
		// (always 0 for a positive $pos) lets any out-of-range position through.
	if ($byteLength === 0 || $pos < 0 || $pos > $byteLength) {
		return FALSE; // offset beyond string length
	}

	$n = 0; // number of characters
	for ($i = $pos; $i > 0; $i--) {
			// $pos may point directly behind the last byte; that end position counts
			// like a character start. Otherwise single-byte (0xxxxxxx) and multi-byte
			// starting bytes (11xxxxxx) are counted.
		if ($i >= $byteLength || (ord($str[$i]) & 0xC0) != 0x80) {
			$n++;
		}
	}

	return $n;
}
2075
/**
 * Maps all characters of an UTF-8 string.
 * Each character is looked up in the table selected by $mode; characters without an entry
 * are copied unchanged. The string is returned untouched if the Unicode data for $mode
 * cannot be initialised or $mode is unknown.
 *
 * @param string $str UTF-8 string
 * @param string $mode mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
 * @return string the converted string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char_mapping($str, $mode, $opt = '') {
	if (!$this->initUnicodeData($mode)) {
		return $str; // do nothing
	}

	switch ($mode) {
		case 'case':
			$map =& $this->caseFolding['utf-8'][$opt];
			break;

		case 'ascii':
			$map =& $this->toASCII['utf-8'];
			break;

		default:
			return $str;
	}

	$str = (string) $str;
	$byteLength = strlen($str);
	$out = '';
	for ($i = 0; $i < $byteLength; $i++) {
		$c = ord($str[$i]);
		if (($c & 0xC0) == 0xC0) { // multi-byte starting byte (11xxxxxx)
				// the number of leading 1-bits is the number of bytes of the character
			for ($bc = 0; $c & 0x80; $c = $c << 1) {
				$bc++;
			}
			$mbc = substr($str, $i, $bc);
			$i += $bc - 1;
		} else {
				// Single-byte (0xxxxxxx), or a stray continuation byte in malformed
				// input. The stray byte is passed through as it is instead of
				// repeating the previously mapped character.
			$mbc = $str[$i];
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
2127
2128
2129 /********************************************
2130 *
2131 * Internal EUC string operation functions
2132 *
2133 * Extended Unix Code:
2134 * ASCII compatible 7bit single bytes chars
2135 * 8bit two byte chars
2136 *
2137 * Shift-JIS is treated as a special case.
2138 *
2139 ********************************************/
2140
/**
 * Cuts a string in the EUC charset family short at a given byte length.
 * The result is at most $len bytes long; a double-byte character that would be split
 * by the limit is dropped completely.
 *
 * @param string $str EUC multibyte character string
 * @param integer $len the byte length
 * @param string $charset the charset ('shift_jis' uses its own first-byte ranges)
 * @return string the shortened string
 * @see mb_strcut()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strtrunc($str, $len, $charset) {
	$str = (string) $str;
	$len = (int) $len;

	if ($len <= 0) {
		return '';
	}
		// Decide "string shorter than supplied length" on the real byte length. Deriving
		// it from the scan position fails when the character straddling the limit is
		// the last one of the string, because the scan then ends behind the string as well.
	if (strlen($str) <= $len) {
		return $str;
	}

	$sjis = ($charset == 'shift_jis');
	for ($i = 0; $i < $len; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			$isFirstByte = ($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0);
		} else {
			$isFirstByte = ($c >= 0x80);
		}
		if ($isFirstByte) {
			$i++; // advance a double-byte char
		}
	}

	if ($i > $len) {
		return substr($str, 0, $len - 1); // we ended on a first byte
	}
	return substr($str, 0, $len);
}
2176
/**
 * Returns a part of a string in the EUC charset family.
 * Handles the boundary cases the same way as utf8_substr(): a length of 0 yields an empty
 * string, an unresolvable zero or negative $start begins at the first byte, and a negative
 * $len reaching outside the string yields an empty string.
 *
 * @param string $str EUC multibyte character string
 * @param integer $start start position (character position)
 * @param string $charset the charset
 * @param integer $len length (in characters); NULL returns everything from $start to the end
 * @return string the substring; FALSE if a positive $start lies outside the string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_substr($str, $start, $charset, $len = NULL) {
		// An explicit length of 0 must not be mistaken for "no length given": the
		// loose NULL comparison further down cannot tell the two apart.
	if ((string) $len === '0') {
		return '';
	}

	$byte_start = $this->euc_char2byte_pos($str, $start, $charset);
	if ($byte_start === FALSE) {
		if ($start > 0) {
			return FALSE; // $start outside string length
		}
			// zero or negative start outside the string: begin at the first byte, like substr()
		$byte_start = 0;
	}

	$str = substr($str, $byte_start);

	if ($len == NULL) {
		return $str;
	}

	$byte_end = $this->euc_char2byte_pos($str, $len, $charset);
	if ($byte_end === FALSE) {
			// $len outside actual string length: a negative length leaves nothing,
			// a positive one takes the rest
		return $len < 0 ? '' : $str;
	}

	return substr($str, 0, $byte_end);
}
2210
/**
 * Counts the number of characters of a string in the EUC charset family.
 * A byte recognised as the first byte of a double-byte character consumes the following byte as well.
 *
 * @param string $str EUC multibyte character string
 * @param string $charset the charset ('shift_jis' uses its own first-byte ranges)
 * @return integer the number of characters
 * @see strlen()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strlen($str, $charset) {
	$str = (string) $str;
	$byteLength = strlen($str);
	$sjis = ($charset == 'shift_jis');
	$n = 0;
		// Bounded by the byte length: probing the offset behind the string to find
		// its end raises a warning, and the curly offset form no longer parses on PHP 8.
	for ($i = 0; $i < $byteLength; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			$isFirstByte = ($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0);
		} else {
			$isFirstByte = ($c >= 0x80);
		}
		if ($isFirstByte) {
			$i++; // advance a double-byte char
		}

		$n++;
	}

	return $n;
}
2241
/**
 * Translates a character position into an 'absolute' byte position.
 * For $pos >= 0 the string is scanned forwards, for negative $pos backwards from the last byte.
 * FALSE is returned when the scan leaves the string before reaching a byte inside it.
 *
 * @param string $str EUC multibyte character string
 * @param integer $pos character position (negative values start from the end)
 * @param string $charset the charset ('shift_jis' uses its own first-byte ranges)
 * @return integer byte position, FALSE if the position is outside the string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char2byte_pos($str, $pos, $charset) {
	$str = (string) $str;
	$byteLength = strlen($str);
	$sjis = ($charset == 'shift_jis');
	$n = 0; // number of characters seen
	$p = abs($pos); // number of characters wanted

	if ($pos >= 0) {
		$i = 0;
		$d = 1;
	} else {
		$i = $byteLength - 1;
		$d = -1;
	}

		// Explicit bounds are required: since PHP 7.1 a negative string offset reads
		// from the end of the string, so "offset is empty" no longer detects that the
		// backward scan has left the string.
	for (; $i >= 0 && $i < $byteLength && $n < $p; $i += $d) {
		$c = ord($str[$i]);
		if ($sjis) {
			$isDoubleByte = ($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0);
		} else {
			$isDoubleByte = ($c >= 0x80);
		}
		if ($isDoubleByte) {
			$i += $d; // advance a double-byte char
		}

		$n++;
	}
	if ($i < 0 || $i >= $byteLength) {
		return FALSE; // offset beyond string length
	}

	if ($pos < 0) {
		$i++; // correct offset: the loop stepped one byte in front of the character start
	}

	return $i;
}
2289
/**
 * Maps all characters of a string in the EUC charset family.
 * Each single- or double-byte character is looked up in the table selected by $mode;
 * characters without an entry are copied unchanged. The string is returned untouched if
 * the table cannot be initialised or $mode is unknown.
 *
 * @param string $str EUC multibyte character string
 * @param string $charset the charset ('shift_jis' uses its own first-byte ranges)
 * @param string $mode mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
 * @return string the converted string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char_mapping($str, $charset, $mode, $opt = '') {
	switch ($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) {
				return $str; // do nothing
			}
			$map =& $this->caseFolding[$charset][$opt];
			break;

		case 'ascii':
			if (!$this->initToASCII($charset)) {
				return $str; // do nothing
			}
			$map =& $this->toASCII[$charset];
			break;

		default:
			return $str;
	}

	$str = (string) $str;
	$byteLength = strlen($str);
	$sjis = ($charset == 'shift_jis');
	$out = '';
		// Bounded by the byte length; bracket offsets are used because the curly
		// offset form was removed in PHP 8.
	for ($i = 0; $i < $byteLength; $i++) {
		$mbc = $str[$i];
		$c = ord($mbc);

		if ($sjis) {
			$isDoubleByte = ($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0);
		} else {
			$isDoubleByte = ($c >= 0x80);
		}
		if ($isDoubleByte) { // a double-byte char: take the following byte along
			$mbc = substr($str, $i, 2);
			$i++;
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
2348
2349 }
2350
// Include the XCLASS extension file registered for this class, if TYPO3_MODE is defined
// and a file is configured for it.
if (defined('TYPO3_MODE')) {
	if (isset($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php'])) {
		include_once($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
	}
}
2354
2355 ?>