[BUGFIX] Use system locale when indexing external documents 12/5312/15
author Jigal van Hemert <jigal@xs4all.nl>
Sun, 25 Sep 2011 09:58:14 +0000 (11:58 +0200)
committer Stefan Neufeind <typo3.neufeind@speedpartner.de>
Thu, 16 Jan 2014 23:19:57 +0000 (00:19 +0100)
If paths or filenames of external documents contain UTF-8 characters, the
system locale must be used, because functions like escapeshellarg() and
basename() are locale-aware.

Change-Id: I50a73a42d60de569c63e5ba27ad6a6a3a66fd6c8
Fixes: #30244
Releases: 4.5, 6.0, 6.1, 6.2
Reviewed-on: https://review.typo3.org/5312
Reviewed-by: Markus Klein
Reviewed-by: Sebastian Fischer
Reviewed-by: Xavier Perseguers
Tested-by: Markus Klein
Reviewed-by: Stefan Neufeind
Tested-by: Stefan Neufeind
typo3/sysext/indexed_search/Classes/FileContentParser.php

index 8a3163c..33a9fa4 100644 (file)
@@ -447,10 +447,10 @@ class FileContentParser {
        /**
         * Reads the content of an external file being indexed.
         *
-        * @param       string          File extension, eg. "pdf", "doc" etc.
-        * @param       string          Absolute filename of file (must exist and be validated OK before calling function)
-        * @param       string          Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be splitted.)
-        * @return      array           Standard content array (title, description, keywords, body keys)
+        * @param string $ext File extension, eg. "pdf", "doc" etc.
+        * @param string $absFile Absolute filename of file (must exist and be validated OK before calling function)
+        * @param string $cPKey Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be split.)
+        * @return array Standard content array (title, description, keywords, body keys)
         * @todo Define visibility
         */
        public function readFileContent($ext, $absFile, $cPKey) {
@@ -463,6 +463,7 @@ class FileContentParser {
                switch ($ext) {
                        case 'pdf':
                                if ($this->app['pdfinfo']) {
+                                       $this->setLocaleForServerFileSystem();
                                        // Getting pdf-info:
                                        $cmd = $this->app['pdfinfo'] . ' ' . escapeshellarg($absFile);
                                        \TYPO3\CMS\Core\Utility\CommandUtility::exec($cmd, $res);
@@ -487,21 +488,25 @@ class FileContentParser {
                                                        $contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
                                                }
                                        }
+                                       $this->setLocaleForServerFileSystem(TRUE);
                                }
                                break;
                        case 'doc':
                                if ($this->app['catdoc']) {
+                                       $this->setLocaleForServerFileSystem();
                                        $cmd = $this->app['catdoc'] . ' -d utf-8 ' . escapeshellarg($absFile);
                                        \TYPO3\CMS\Core\Utility\CommandUtility::exec($cmd, $res);
                                        $content = implode(LF, $res);
                                        unset($res);
                                        $contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
+                                       $this->setLocaleForServerFileSystem(TRUE);
                                }
                                break;
                        case 'pps':
 
                        case 'ppt':
                                if ($this->app['ppthtml']) {
+                                       $this->setLocaleForServerFileSystem();
                                        $cmd = $this->app['ppthtml'] . ' ' . escapeshellarg($absFile);
                                        \TYPO3\CMS\Core\Utility\CommandUtility::exec($cmd, $res);
                                        $content = implode(LF, $res);
@@ -509,10 +514,12 @@ class FileContentParser {
                                        $content = $this->pObj->convertHTMLToUtf8($content);
                                        $contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
                                        $contentArr['title'] = basename($absFile);
+                                       $this->setLocaleForServerFileSystem(TRUE);
                                }
                                break;
                        case 'xls':
                                if ($this->app['xlhtml']) {
+                                       $this->setLocaleForServerFileSystem();
                                        $cmd = $this->app['xlhtml'] . ' -nc -te ' . escapeshellarg($absFile);
                                        \TYPO3\CMS\Core\Utility\CommandUtility::exec($cmd, $res);
                                        $content = implode(LF, $res);
@@ -520,6 +527,7 @@ class FileContentParser {
                                        $content = $this->pObj->convertHTMLToUtf8($content);
                                        $contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
                                        $contentArr['title'] = basename($absFile);
+                                       $this->setLocaleForServerFileSystem(TRUE);
                                }
                                break;
                        case 'sxi':
@@ -534,6 +542,7 @@ class FileContentParser {
 
                        case 'odt':
                                if ($this->app['unzip']) {
+                                       $this->setLocaleForServerFileSystem();
                                        // Read content.xml:
                                        $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' content.xml';
                                        \TYPO3\CMS\Core\Utility\CommandUtility::exec($cmd, $res);
@@ -561,21 +570,25 @@ class FileContentParser {
                                                        }
                                                }
                                        }
+                                       $this->setLocaleForServerFileSystem(TRUE);
                                }
                                break;
                        case 'rtf':
                                if ($this->app['unrtf']) {
+                                       $this->setLocaleForServerFileSystem();
                                        $cmd = $this->app['unrtf'] . ' ' . escapeshellarg($absFile);
                                        \TYPO3\CMS\Core\Utility\CommandUtility::exec($cmd, $res);
                                        $fileContent = implode(LF, $res);
                                        unset($res);
                                        $fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
                                        $contentArr = $this->pObj->splitHTMLContent($fileContent);
+                                       $this->setLocaleForServerFileSystem(TRUE);
                                }
                                break;
                        case 'txt':
 
                        case 'csv':
+                               $this->setLocaleForServerFileSystem();
                                // Raw text
                                $content = \TYPO3\CMS\Core\Utility\GeneralUtility::getUrl($absFile);
                                // TODO: Implement auto detection of charset (currently assuming utf-8)
@@ -584,6 +597,7 @@ class FileContentParser {
                                $contentArr = $this->pObj->splitRegularContent($content);
                                $contentArr['title'] = basename($absFile);
                                // Make sure the title doesn't expose the absolute path!
+                               $this->setLocaleForServerFileSystem(TRUE);
                                break;
                        case 'html':
 
@@ -593,6 +607,7 @@ class FileContentParser {
                                $contentArr = $this->pObj->splitHTMLContent($fileContent);
                                break;
                        case 'xml':
+                               $this->setLocaleForServerFileSystem();
                                // PHP strip-tags()
                                $fileContent = \TYPO3\CMS\Core\Utility\GeneralUtility::getUrl($absFile);
                                // Finding charset:
@@ -603,12 +618,14 @@ class FileContentParser {
                                $contentArr = $this->pObj->splitRegularContent($fileContent);
                                $contentArr['title'] = basename($absFile);
                                // Make sure the title doesn't expose the absolute path!
+                               $this->setLocaleForServerFileSystem(TRUE);
                                break;
                        case 'jpg':
 
                        case 'jpeg':
 
                        case 'tif':
+                               $this->setLocaleForServerFileSystem();
                                // PHP EXIF
                                if (function_exists('exif_read_data')) {
                                        $exif = exif_read_data($absFile, 'IFD0');
@@ -623,6 +640,7 @@ class FileContentParser {
                                $contentArr = $this->pObj->splitRegularContent($comment);
                                $contentArr['title'] = basename($absFile);
                                // Make sure the title doesn't expose the absolute path!
+                               $this->setLocaleForServerFileSystem(TRUE);
                                break;
                        default:
                                return FALSE;
@@ -636,18 +654,53 @@ class FileContentParser {
        }
 
        /**
+        * Sets the locale for LC_CTYPE to $TYPO3_CONF_VARS['SYS']['systemLocale']
+        * if $TYPO3_CONF_VARS['SYS']['UTF8filesystem'] is set; otherwise this method does nothing.
+        *
+        * Parameter <code>$resetLocale</code> has to be FALSE and TRUE alternating for all calls.
+        *
+        * @staticvar string $lastLocale Stores the locale used before it is overridden by this method.
+        * @param boolean $resetLocale TRUE resets the locale to $lastLocale.
+        * @return void
+        * @throws \RuntimeException if set and reset calls do not strictly alternate
+        */
+       protected function setLocaleForServerFileSystem($resetLocale = FALSE) {
+               static $lastLocale = NULL; // shared by all calls; NULL means "locale not overridden right now"
+               if (!$GLOBALS['TYPO3_CONF_VARS']['SYS']['UTF8filesystem']) {
+                       return;
+               }
+
+               if ($resetLocale) {
+                       if ($lastLocale == NULL) { // loose comparison on purpose: also catches FALSE from a failed setlocale() query
+                               throw new \RuntimeException('Cannot reset locale to NULL.', 1357064326);
+                       }
+                       setlocale(LC_CTYPE, $lastLocale);
+                       $lastLocale = NULL;
+               } else {
+                       if ($lastLocale !== NULL) {
+                               throw new \RuntimeException('Cannot set new locale as locale has already been changed before.', 1357064437);
+                       }
+                       $lastLocale = setlocale(LC_CTYPE, 0); // locale "0" only queries the current setting
+                       setlocale(LC_CTYPE, $GLOBALS['TYPO3_CONF_VARS']['SYS']['systemLocale']);
+               }
+       }
+
+       /**
         * Creates an array with pointers to divisions of document.
-        * ONLY for PDF files at this point. All other types will have an array with a single element with the value "0" (zero) coming back.
         *
-        * @param       string          File extension
-        * @param       string          Absolute filename (must exist and be validated OK before calling function)
-        * @return      array           Array of pointers to sections that the document should be divided into
+        * ONLY for PDF files at this point. All other types will have an array with a single element with the value "0" (zero)
+        * coming back.
+        *
+        * @param string $ext File extension
+        * @param string $absFile Absolute filename (must exist and be validated OK before calling function)
+        * @return array Array of pointers to sections that the document should be divided into
         * @todo Define visibility
         */
        public function fileContentParts($ext, $absFile) {
                $cParts = array(0);
                switch ($ext) {
                        case 'pdf':
+                               $this->setLocaleForServerFileSystem();
                                // Getting pdf-info:
                                $cmd = $this->app['pdfinfo'] . ' ' . escapeshellarg($absFile);
                                \TYPO3\CMS\Core\Utility\CommandUtility::exec($cmd, $res);
@@ -668,7 +721,9 @@ class FileContentParser {
                                                $cParts[] = $low . '-' . $high;
                                        }
                                }
+                               $this->setLocaleForServerFileSystem(TRUE);
                                break;
+                       default:
                }
                return $cParts;
        }