[TASK] Consolidate regexp for utf8 and non-utf8 file systems 39/23439/13
author Lucas Bremgartner <lb@bexa.ch>
Thu, 14 Nov 2013 23:45:26 +0000 (00:45 +0100)
committer Wouter Wolters <typo3@wouterwolters.nl>
Thu, 6 Mar 2014 22:29:43 +0000 (23:29 +0100)
This patch makes the @ character usable in file names
on non-UTF-8 file systems as well.

Resolves: #50538
Releases: 6.2, 6.1
Change-Id: I72ce24393003af8733af6fc650e69781df4a272c
Reviewed-on: https://review.typo3.org/23439
Reviewed-by: Markus Klein
Tested-by: Markus Klein
Reviewed-by: Stefan Neufeind
Reviewed-by: Wouter Wolters
Tested-by: Wouter Wolters
typo3/sysext/core/Classes/Charset/CharsetConverter.php
typo3/sysext/core/Classes/Resource/Driver/LocalDriver.php
typo3/sysext/core/Classes/Utility/File/BasicFileUtility.php
typo3/sysext/core/Tests/Unit/Resource/Driver/LocalDriverTest.php

index c638c08..4c0a70d 100644 (file)
@@ -1791,7 +1791,7 @@ class CharsetConverter {
         * @todo Define visibility
         */
        public function specCharsToASCII($charset, $string) {
-               if ($charset == 'utf-8') {
+               if ($charset === 'utf-8') {
                        $string = $this->utf8_char_mapping($string, 'ascii');
                } elseif (isset($this->eucBasedSets[$charset])) {
                        $string = $this->euc_char_mapping($string, $charset, 'ascii');
index 7825b62..17d4071 100644 (file)
@@ -40,6 +40,11 @@ use TYPO3\CMS\Core\Utility\PathUtility;
 class LocalDriver extends AbstractHierarchicalFilesystemDriver {
 
        /**
+        * @var string
+        */
+       const UNSAFE_FILENAME_CHARACTER_EXPRESSION = '\\x00-\\x2C\\/\\x3A-\\x3F\\x5B-\\x60\\x7B-\\xBF';
+
+       /**
         * The absolute base path. It always contains a trailing slash.
         *
         * @var string
@@ -279,7 +284,7 @@ class LocalDriver extends AbstractHierarchicalFilesystemDriver {
                // Handle UTF-8 characters
                if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['UTF8filesystem']) {
                        // Allow ".", "-", 0-9, a-z, A-Z and everything beyond U+C0 (latin capital letter a with grave)
-                       $cleanFileName = preg_replace('/[\\x00-\\x2C\\/\\x3A-\\x3F\\x5B-\\x60\\x7B-\\xBF]/u', '_', trim($fileName));
+                       $cleanFileName = preg_replace('/[' . self::UNSAFE_FILENAME_CHARACTER_EXPRESSION . ']/u', '_', trim($fileName));
                } else {
                        // Define character set
                        if (!$charset) {
@@ -295,7 +300,7 @@ class LocalDriver extends AbstractHierarchicalFilesystemDriver {
                                $fileName = $this->getCharsetConversion()->specCharsToASCII($charset, $fileName);
                        }
                        // Replace unwanted characters by underscores
-                       $cleanFileName = preg_replace('/[^.[:alnum:]_-]/', '_', trim($fileName));
+                       $cleanFileName = preg_replace('/[' . self::UNSAFE_FILENAME_CHARACTER_EXPRESSION . '\\xC0-\\xFF]/', '_', trim($fileName));
                }
                // Strip trailing dots and return
                $cleanFileName = preg_replace('/\\.*$/', '', $cleanFileName);
index 6e9ab60..beb8bb4 100644 (file)
@@ -40,6 +40,10 @@ use TYPO3\CMS\Core\Utility\PathUtility;
  * @author     Kasper Skårhøj <kasperYYYY@typo3.com>
  */
 class BasicFileUtility {
+       /**
+        * @var string
+        */
+       const UNSAFE_FILENAME_CHARACTER_EXPRESSION = '\\x00-\\x2C\\/\\x3A-\\x3F\\x5B-\\x60\\x7B-\\xBF';
 
        /**
         * @todo Define visibility
@@ -95,6 +99,11 @@ class BasicFileUtility {
         */
        public $isInit = 0;
 
+       /**
+        * @var \TYPO3\CMS\Core\Charset\CharsetConverter
+        */
+       public $csConvObj;
+
        // Set to TRUE after init()/start();
        /**********************************
         *
@@ -490,16 +499,16 @@ class BasicFileUtility {
         * Returns a string where any character not matching [.a-zA-Z0-9_-] is substituted by '_'
         * Trailing dots are removed
         *
-        * @param       string          Input string, typically the body of a filename
-        * @param       string          Charset of the a filename (defaults to current charset; depending on context)
-        * @return      string          Output string with any characters not matching [.a-zA-Z0-9_-] is substituted by '_' and trailing dots removed
+        * @param string $fileName Input string, typically the body of a filename
+        * @param string $charset Charset of the filename (defaults to current charset; depending on context)
+        * @return string Output string in which any character not matching [.a-zA-Z0-9_@-] is substituted by '_' and trailing dots are removed
         * @todo Define visibility
         */
        public function cleanFileName($fileName, $charset = '') {
                // Handle UTF-8 characters
                if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['UTF8filesystem']) {
                        // allow ".", "-", 0-9, a-z, A-Z and everything beyond U+C0 (latin capital letter a with grave)
-                       $cleanFileName = preg_replace('/[\\x00-\\x2C\\/\\x3A-\\x3F\\x5B-\\x60\\x7B-\\xBF]/u', '_', trim($fileName));
+                       $cleanFileName = preg_replace('/[' . self::UNSAFE_FILENAME_CHARACTER_EXPRESSION . ']/u', '_', trim($fileName));
                } else {
                        // Get conversion object or initialize if needed
                        if (!is_object($this->csConvObj)) {
@@ -527,7 +536,7 @@ class BasicFileUtility {
                                $fileName = $this->csConvObj->specCharsToASCII($charset, $fileName);
                        }
                        // Replace unwanted characters by underscores
-                       $cleanFileName = preg_replace('/[^.[:alnum:]_-]/', '_', trim($fileName));
+                       $cleanFileName = preg_replace('/[' . self::UNSAFE_FILENAME_CHARACTER_EXPRESSION . '\\xC0-\\xFF]/', '_', trim($fileName));
                }
                // Strip trailing dots and return
                return preg_replace('/\\.*$/', '', $cleanFileName);
index 18cf42a..abb1043 100644 (file)
@@ -54,6 +54,21 @@ class LocalDriverTest extends \TYPO3\CMS\Core\Tests\Unit\Resource\BaseTestCase {
        protected $testDirs = array();
 
        /**
+        * @var string
+        */
+       protected $iso88591GreaterThan127 = '';
+
+       /**
+        * @var string
+        */
+       protected $utf8Latin1Supplement = '';
+
+       /**
+        * @var string
+        */
+       protected $utf8Latin1ExtendedA = '';
+
+       /**
         * Tear down
         */
        public function tearDown() {
@@ -1199,4 +1214,189 @@ class LocalDriverTest extends \TYPO3\CMS\Core\Tests\Unit\Resource\BaseTestCase {
                $this->assertTrue(is_file($basePath . '/targetFolder/newFolderName/subFolder/file'));
        }
 
+       ///////////////////////
+       // Tests concerning sanitizeFileName
+       ///////////////////////
+
+       /**
+        * Set up data for sanitizeFileName tests
+        */
+       public function setUpCharacterStrings() {
+               // Generate string containing the iso-8859-1 characters with char codes 0xA0 to 0xFF (codes 0x80 to 0x9F are not included)
+               $this->iso88591GreaterThan127 = '';
+               for ($i = 0xA0; $i <= 0xFF; $i++) {
+                       $this->iso88591GreaterThan127 .= chr($i);
+               }
+
+               // Generate string containing all characters for the utf-8 Latin-1 Supplement (U+0080 to U+00FF)
+               // without U+0080 to U+009F: control characters
+               // Based on http://www.utf8-chartable.de/unicode-utf8-table.pl
+               $this->utf8Latin1Supplement = '';
+               for ($i = 0xA0; $i <= 0xBF; $i++) {
+                       $this->utf8Latin1Supplement .= chr(0xC2) . chr($i);
+               }
+               for ($i = 0x80; $i <= 0xBF; $i++) {
+                       $this->utf8Latin1Supplement .= chr(0xC3) . chr($i);
+               }
+
+               // Generate string containing all characters for the utf-8 Latin-1 Extended-A (U+0100 to U+017F)
+               $this->utf8Latin1ExtendedA = '';
+               for ($i = 0x80; $i <= 0xBF; $i++) {
+                       $this->utf8Latin1ExtendedA .= chr(0xC4) . chr($i);
+               }
+               for ($i = 0x80; $i <= 0xBF; $i++) {
+                       $this->utf8Latin1ExtendedA .= chr(0xC5) . chr($i);
+               }
+       }
+
+       /**
+        * Data provider for sanitizeFileNameUTF8Filesystem
+        *
+        * Every array splits into:
+        * - String value fileName
+        * - Expected result (cleaned fileName)
+        *   (no charset element: the UTF-8 filesystem test passes only the file name)
+        *
+        * @return array
+        */
+       public function sanitizeFileNameUTF8FilesystemDataProvider() {
+               $this->setUpCharacterStrings();
+               return array(
+                       // Characters ordered by ASCII table
+                       'allowed characters utf-8 (ASCII part)' => array(
+                               '-.0123456789@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz',
+                               '-.0123456789@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz'
+                       ),
+                       // Characters ordered by ASCII table (except for the space character, placed after "!" because a leading space is trimmed)
+                       'replace special characters with _ (not allowed characters) utf-8 (ASCII part)' => array(
+                               '! "#$%&\'()*+,/:;<=>?[\\]^`{|}~',
+                               '_____________________________'
+                       ),
+                       'utf-8 (Latin-1 Supplement)' => array(
+                               $this->utf8Latin1Supplement,
+                               '________________________________ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ'
+                       ),
+                       'trim leading and tailing spaces utf-8' => array(
+                               ' test.txt  ',
+                               'test.txt'
+                       ),
+                       'remove tailing dot' => array(
+                               'test.txt.',
+                               'test.txt'
+                       ),
+               );
+       }
+
+       /**
+        * @test
+        * @dataProvider sanitizeFileNameUTF8FilesystemDataProvider
+        */
+       public function sanitizeFileNameUTF8Filesystem($fileName, $expectedResult) {
+               $GLOBALS['TYPO3_CONF_VARS']['SYS']['UTF8filesystem'] = 1;
+               $this->assertEquals(
+                       $expectedResult,
+                       $this->createDriverFixture()->sanitizeFileName($fileName)
+               );
+       }
+
+
+       /**
+        * Data provider for sanitizeFileNameNonUTF8Filesystem
+        *
+        * Every array splits into:
+        * - String value fileName
+        * - String value charset (none = '', utf-8, latin1, etc.)
+        * - Expected result (cleaned fileName)
+        *
+        * @return array
+        */
+       public function sanitizeFileNameNonUTF8FilesystemDataProvider() {
+               $this->setUpCharacterStrings();
+               return array(
+                       // Characters ordered by ASCII table
+                       'allowed characters iso-8859-1' => array(
+                               '-.0123456789@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz',
+                               'iso-8859-1',
+                               '-.0123456789@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz'
+                       ),
+                       // Characters ordered by ASCII table
+                       'allowed characters utf-8' => array(
+                               '-.0123456789@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz',
+                               'utf-8',
+                               '-.0123456789@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz'
+                       ),
+                       // Characters ordered by ASCII table (except for the space character, placed after "!" because a leading space is trimmed)
+                       'replace special characters with _ (not allowed characters) iso-8859-1' => array(
+                               '! "#$%&\'()*+,/:;<=>?[\\]^`{|}~',
+                               'iso-8859-1',
+                               '_____________________________'
+                       ),
+                       // Characters ordered by ASCII table (except for the space character, placed after "!" because a leading space is trimmed)
+                       'replace special characters with _ (not allowed characters) utf-8' => array(
+                               '! "#$%&\'()*+,/:;<=>?[\\]^`{|}~',
+                               'utf-8',
+                               '_____________________________'
+                       ),
+                       'iso-8859-1 (code > 127)' => array(
+                               // http://de.wikipedia.org/wiki/ISO_8859-1
+                               // chr(0xA0) = NBSP (no-break space) => gets trimmed
+                               $this->iso88591GreaterThan127,
+                               'iso-8859-1',
+                               '_centpound_yen____c_a_____R_____-23_u___1o__1_41_23_4_AAAAAEAAAECEEEEIIIIDNOOOOOExOEUUUUEYTHssaaaaaeaaaeceeeeiiiidnoooooe_oeuuuueythy'
+                       ),
+                       'utf-8 (Latin-1 Supplement)' => array(
+                               // chr(0xC2) . chr(0xA0) = NBSP (no-break space) => gets trimmed
+                               $this->utf8Latin1Supplement,
+                               'utf-8',
+                               '_centpound__yen______c_a_______R_______-23__u_____1o__1_41_23_4_AAAAAEAAAECEEEEIIIIDNOOOOOExOEUUUUEYTHssaaaaaeaaaeceeeeiiiidnoooooe_oeuuuueythy'
+                       ),
+                       'utf-8 (Latin-1 Extended A)' => array(
+                               $this->utf8Latin1ExtendedA,
+                               'utf-8',
+                               'AaAaAaCcCcCcCcDdDdEeEeEeEeEeGgGgGgGgHhHhIiIiIiIiIiIJijJjKk__LlLlLlL_l_LlNnNnNn_n____OOooOoOoOEoeRrRrRrSsSsSsSsTtTtTtUuUuUuUuUuUuWwYyYZzZzZzs'
+                       ),
+                       'trim leading and tailing spaces iso-8859-1' => array(
+                               ' test.txt  ',
+                               'iso-8859-1',
+                               'test.txt'
+                       ),
+                       'trim leading and tailing spaces utf-8' => array(
+                               ' test.txt  ',
+                               'utf-8',
+                               'test.txt'
+                       ),
+                       'remove tailing dot iso-8859-1' => array(
+                               'test.txt.',
+                               'iso-8859-1',
+                               'test.txt'
+                       ),
+                       'remove tailing dot utf-8' => array(
+                               'test.txt.',
+                               'utf-8',
+                               'test.txt'
+                       ),
+               );
+       }
+
+       /**
+        * @test
+        * @dataProvider sanitizeFileNameNonUTF8FilesystemDataProvider
+        */
+       public function sanitizeFileNameNonUTF8Filesystem($fileName, $charset, $expectedResult) {
+               $GLOBALS['TYPO3_CONF_VARS']['SYS']['UTF8filesystem'] = 0;
+               $this->assertEquals(
+                       $expectedResult,
+                       $this->createDriverFixture()->sanitizeFileName($fileName, $charset)
+               );
+       }
+
+       /**
+        * @test
+        * @expectedException \TYPO3\CMS\Core\Resource\Exception\InvalidFileNameException
+        */
+       public function sanitizeFileNameThrowsExceptionOnInvalidFileName() {
+               $GLOBALS['TYPO3_CONF_VARS']['SYS']['UTF8filesystem'] = 1;
+               $this->createDriverFixture()->sanitizeFileName('');
+       }
+
 }