[TASK] Introduce unit tests for CharsetConverter 58/62758/4
author Alexander Schnitzler <git@alexanderschnitzler.de>
Sun, 22 Dec 2019 15:58:14 +0000 (16:58 +0100)
committer Susanne Moog <look@susi.dev>
Thu, 16 Jan 2020 15:18:43 +0000 (16:18 +0100)
Releases: master
Resolves: #90090
Change-Id: Iba1fcf663df0f11d1227d77d5ead2b5f473f7bc6
Reviewed-on: https://review.typo3.org/c/Packages/TYPO3.CMS/+/62758
Tested-by: TYPO3com <noreply@typo3.com>
Tested-by: Benni Mack <benni@typo3.org>
Tested-by: Susanne Moog <look@susi.dev>
Reviewed-by: Benni Mack <benni@typo3.org>
Reviewed-by: Susanne Moog <look@susi.dev>
typo3/sysext/core/Classes/Charset/CharsetConverter.php
typo3/sysext/core/Tests/Unit/Charset/CharsetConverterTest.php

index 9a2b30a..b38e460 100644 (file)
@@ -151,6 +151,7 @@ class CharsetConverter implements SingletonInterface
                 $ord = ord($chr);
                 // If the charset has two bytes per char
                 if (isset($this->twoByteSets[$charset])) {
+                    // TYPO3 cannot convert from ucs-2 as the corresponding conversion table is not present
                     $ord2 = ord($str[$a + 1]);
                     // Assume big endian
                     $ord = $ord << 8 | $ord2;
index 557319e..341d4bc 100644 (file)
@@ -25,6 +25,205 @@ use TYPO3\TestingFramework\Core\Unit\UnitTestCase;
 class CharsetConverterTest extends UnitTestCase
 {
     /**
+     * @test
+     */
+    public function utf8DecodeACharacterToAscii()
+    {
+        $subject = new CharsetConverter();
+        $input = "\x41"; // latin capital letter A
+
+        self::assertSame(1, mb_strlen($input));
+        self::assertSame(1, strlen($input));
+        self::assertSame('UTF-8', mb_detect_encoding($input, ['UTF-8', 'ASCII']));
+
+        // the letter survives decoding to ascii, whether or not the third argument is set
+        self::assertSame('A', $subject->utf8_decode($input, 'ascii'));
+        self::assertSame('A', $subject->utf8_decode($input, 'ascii', true));
+
+        $decoded = $subject->utf8_decode($input, 'ascii');
+        self::assertSame('ASCII', mb_detect_encoding($decoded, ['ASCII', 'UTF-8']));
+    }
+
+    /**
+     * @test
+     */
+    public function utf8DecodeACharacterToIso885915()
+    {
+        $charsetConverter = new CharsetConverter();
+
+        $string = "\x41"; // A
+        self::assertSame(1, mb_strlen($string));
+        self::assertSame(1, strlen($string));
+        self::assertSame('UTF-8', mb_detect_encoding($string, ['UTF-8', 'ASCII']));
+
+        // "A" is plain ASCII, so decoding to iso-8859-15 must keep the very same single byte
+        self::assertSame('A', $charsetConverter->utf8_decode($string, 'iso-8859-15'));
+        self::assertSame('A', $charsetConverter->utf8_decode($string, 'iso-8859-15', true));
+
+        // a lone ASCII byte matches the first detection candidate, which is ASCII
+        $targetString = $charsetConverter->utf8_decode($string, 'iso-8859-15');
+        self::assertSame('ASCII', mb_detect_encoding($targetString, ['ASCII', 'UTF-8', 'ISO-8859-15']));
+        self::assertSame($string, $targetString);
+    }
+
+    /**
+     * @test
+     */
+    public function utf8DecodeEuroSignCharacterToIso885915()
+    {
+        $subject = new CharsetConverter();
+        $euroSign = "\xE2\x82\xAC"; // € as UTF-8 byte sequence
+
+        self::assertSame(1, mb_strlen($euroSign));
+        self::assertSame(3, strlen($euroSign));
+        self::assertSame('UTF-8', mb_detect_encoding($euroSign, ['ASCII', 'UTF-8']));
+
+        // decoding to ascii: expect "?" or, with the third argument set, an entity
+        self::assertSame('?', $subject->utf8_decode($euroSign, 'ascii'));
+        self::assertSame('&#x20ac;', $subject->utf8_decode($euroSign, 'ascii', true));
+
+        // decoding to iso-8859-15 must yield a different byte sequence detected as ISO-8859-15
+        $decoded = $subject->utf8_decode($euroSign, 'iso-8859-15');
+        self::assertSame('ISO-8859-15', mb_detect_encoding($decoded, ['ASCII', 'UTF-8', 'ISO-8859-15']));
+        self::assertNotSame($euroSign, $decoded);
+    }
+
+    /**
+     * @test
+     */
+    public function utf8DecodeAKanjiToBig5()
+    {
+        $subject = new CharsetConverter();
+        $kanji = "\xE6\xBC\x80"; // 漀 as UTF-8 byte sequence
+
+        self::assertSame(1, mb_strlen($kanji));
+        self::assertSame(3, strlen($kanji));
+        self::assertSame('UTF-8', mb_detect_encoding($kanji, ['ASCII', 'UTF-8']));
+
+        // decoding to ascii: expect "?" or, with the third argument set, an entity
+        self::assertSame('?', $subject->utf8_decode($kanji, 'ascii'));
+        self::assertSame('&#x6f00;', $subject->utf8_decode($kanji, 'ascii', true));
+
+        // decoding to big5 must yield a different byte sequence that is detected as BIG-5
+        $decoded = $subject->utf8_decode($kanji, 'big5');
+        self::assertSame('BIG-5', mb_detect_encoding($decoded, ['ASCII', 'UTF-8', 'BIG-5']));
+        self::assertNotSame($kanji, $decoded);
+    }
+
+    /**
+     * @test
+     */
+    public function convertingAUtf8EmojiSignToNonExistingAsciiRepresentationResultsInAQuestionMarkSign()
+    {
+        $subject = new CharsetConverter();
+        $emoji = "\xF0\x9F\x98\x82"; // 😂 as UTF-8 byte sequence
+
+        self::assertSame(1, mb_strlen($emoji));
+        self::assertSame(4, strlen($emoji));
+        self::assertSame('UTF-8', mb_detect_encoding($emoji, ['ASCII', 'UTF-8']));
+
+        // decoding to ascii: expect "?" or, with the third argument set, an entity
+        self::assertSame('?', $subject->utf8_decode($emoji, 'ascii'));
+        self::assertSame('&#x1f602;', $subject->utf8_decode($emoji, 'ascii', true));
+    }
+
+    /**
+     * @test
+     */
+    public function utf8DecodeToUtf8ReturnsTheSameSign()
+    {
+        // the four bytes of 😂 must come back untouched when the target charset is utf-8
+        $emoji = "\xF0\x9F\x98\x82";
+        $decoded = (new CharsetConverter())->utf8_decode($emoji, 'utf-8');
+        self::assertSame($emoji, $decoded);
+    }
+
+    /**
+     * @test
+     */
+    public function utf8EncodeIso885915ACharacter()
+    {
+        $input = "\x41"; // latin capital letter A
+        $encoded = (new CharsetConverter())->utf8_encode($input, 'iso-8859-15');
+        // one byte goes in and the very same single byte is expected back
+        self::assertSame(1, strlen($input));
+        self::assertSame('A', $encoded);
+        self::assertSame(1, mb_strlen($encoded));
+        self::assertSame(1, strlen($encoded));
+        self::assertSame($input, $encoded);
+    }
+
+    /**
+     * @test
+     */
+    public function utf8EncodeIso885915EuroSign()
+    {
+        $input = "\xA4"; // the one-byte iso-8859-15 representation of €
+        $encoded = (new CharsetConverter())->utf8_encode($input, 'iso-8859-15');
+
+        self::assertSame('€', $encoded);
+        self::assertSame(1, mb_strlen($encoded));
+        self::assertSame(3, strlen($encoded));
+        self::assertNotSame($input, $encoded);
+    }
+
+    /**
+     * @test
+     */
+    public function utf8EncodeABig5EncodedSign()
+    {
+        $input = "\xA2\xC5"; // the two-byte big5 representation of 〣
+        $encoded = (new CharsetConverter())->utf8_encode($input, 'big5');
+
+        self::assertSame(2, strlen($input));
+        self::assertSame('〣', $encoded);
+        self::assertSame(1, mb_strlen($encoded));
+        self::assertSame(3, strlen($encoded));
+        self::assertNotSame($input, $encoded);
+    }
+
+    /**
+     * @test
+     */
+    public function utf8EncodeAlredyUtf8EncodedSign()
+    {
+        // the four bytes of 😂 must come back untouched when the source charset is utf-8
+        $emoji = "\xF0\x9F\x98\x82";
+        $encoded = (new CharsetConverter())->utf8_encode($emoji, 'utf-8');
+        self::assertSame($emoji, $encoded);
+    }
+
+    /**
+     * @test
+     */
+    public function utf8ToNumberArray()
+    {
+        $input = "\xF0\x9F\x98\x82 &ndash; a joyful emoji";
+        $expectedCharacters = [
+            '😂',
+            ' ',
+            '–',
+            ' ',
+            'a',
+            ' ',
+            'j',
+            'o',
+            'y',
+            'f',
+            'u',
+            'l',
+            ' ',
+            'e',
+            'm',
+            'o',
+            'j',
+            'i',
+        ];
+        $actualCharacters = (new CharsetConverter())->utf8_to_numberarray($input);
+        self::assertSame($expectedCharacters, $actualCharacters);
+    }
+
+    /**
      * Data provider for specialCharactersToAsciiConvertsUmlautsToAscii()
      *
      * @return string[][]