[FEATURE] Index docx, xlsx, ... from MS Office 2007 and above 63/42863/6
author: Xavier Perseguers <xavier@typo3.org>
Tue, 25 Aug 2015 09:28:55 +0000 (11:28 +0200)
committer: Tymoteusz Motylewski <t.motylewski@gmail.com>
Sun, 30 Aug 2015 20:50:33 +0000 (22:50 +0200)
* Add the new Microsoft Office formats (docx, dotx, pptx, ppsx, potx, xlsx, xltx) to indexed_search
* Use unzip to extract the XML content

Change-Id: I6d4481e09c34701ef846bb48f16b2a10d3fac38a
Resolves: #23669
Releases: master
Reviewed-on: http://review.typo3.org/42863
Reviewed-by: Markus Sommer <markussom@posteo.de>
Tested-by: Markus Sommer <markussom@posteo.de>
Reviewed-by: Philipp Gampe <philipp.gampe@typo3.org>
Reviewed-by: Tymoteusz Motylewski <t.motylewski@gmail.com>
Tested-by: Tymoteusz Motylewski <t.motylewski@gmail.com>
typo3/sysext/indexed_search/Classes/FileContentParser.php
typo3/sysext/indexed_search/Resources/Public/Icons/FileTypes/docx.gif [new file with mode: 0644]
typo3/sysext/indexed_search/Resources/Public/Icons/FileTypes/dotx.gif [new file with mode: 0644]
typo3/sysext/indexed_search/Resources/Public/Icons/FileTypes/potx.gif [new file with mode: 0644]
typo3/sysext/indexed_search/Resources/Public/Icons/FileTypes/ppsx.gif [new file with mode: 0644]
typo3/sysext/indexed_search/Resources/Public/Icons/FileTypes/pptx.gif [new file with mode: 0644]
typo3/sysext/indexed_search/Resources/Public/Icons/FileTypes/xlsx.gif [new file with mode: 0644]
typo3/sysext/indexed_search/Resources/Public/Icons/FileTypes/xltx.gif [new file with mode: 0644]
typo3/sysext/indexed_search/ext_localconf.php

index 278b239..daa8317 100644 (file)
@@ -150,6 +150,25 @@ class FileContentParser {
                                        $this->pObj->log_setTSlogMessage($this->sL('LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:xlhtmlDisabled'), 1);
                                }
                                break;
+                       case 'docx':    // Microsoft Word >= 2007
+                       case 'dotx':
+                       case 'pptx':    // Microsoft PowerPoint >= 2007
+                       case 'ppsx':
+                       case 'potx':
+                       case 'xlsx':    // Microsoft Excel >= 2007
+                       case 'xltx':
+                               if ($indexerConfig['unzip']) {
+                                       $unzipPath = rtrim($indexerConfig['unzip'], '/') . '/';
+                                       if (@is_file($unzipPath . 'unzip' . $exe)) {
+                                               $this->app['unzip'] = $unzipPath . 'unzip' . $exe;
+                                               $extOK = TRUE;
+                                       } else {
+                                               $this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:unzipNotFound'), $unzipPath), 3);
+                                       }
+                               } else {
+                                       $this->pObj->log_setTSlogMessage($this->sL('LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:unzipDisabled'), 1);
+                               }
+                               break;
                        case 'sxc':
                        case 'sxi':
                        case 'sxw':
@@ -225,9 +244,16 @@ class FileContentParser {
                switch ($extension) {
                        case 'pdf':
                        case 'doc':
+                       case 'docx':
+                       case 'dotx':
                        case 'pps':
+                       case 'ppsx':
                        case 'ppt':
+                       case 'pptx':
+                       case 'potx':
                        case 'xls':
+                       case 'xlsx':
+                       case 'xltx':
                        case 'sxc':
                        case 'sxi':
                        case 'sxw':
@@ -293,6 +319,26 @@ class FileContentParser {
                                        return sprintf($this->sL('LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:extension.XLS'), $extension);
                                }
                                break;
+                       case 'docx':
+                       case 'dotx':
+                               // Microsoft Word >= 2007
+                               if ($indexerConfig['unzip']) {
+                                       return sprintf($this->sL('LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:extension.DOC'), $extension);
+                               }
+                               break;
+                       case 'pptx':    // Microsoft PowerPoint >= 2007
+                       case 'ppsx':
+                       case 'potx':
+                               if ($indexerConfig['unzip']) {
+                                       return sprintf($this->sL('LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:extension.PP'), $extension);
+                               }
+                               break;
+                       case 'xlsx':    // Microsoft Excel >= 2007
+                       case 'xltx':
+                               if ($indexerConfig['unzip']) {
+                                       return sprintf($this->sL('LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:extension.XLS'), $extension);
+                               }
+                               break;
                        case 'sxc':
                                // Open Office Calc.
                                if ($indexerConfig['unzip']) {
@@ -481,6 +527,55 @@ class FileContentParser {
                                        $this->setLocaleForServerFileSystem(TRUE);
                                }
                                break;
+                       case 'docx':
+                       case 'dotx':
+                       case 'pptx':
+                       case 'ppsx':
+                       case 'potx':
+                       case 'xlsx':
+                       case 'xltx':
+                               if ($this->app['unzip']) {
+                                       $this->setLocaleForServerFileSystem();
+                                       switch ($ext) {
+                                               case 'docx':
+                                               case 'dotx':
+                                                       // Read document.xml:
+                                                       $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' word/document.xml';
+                                                       break;
+                                               case 'ppsx':
+                                               case 'pptx':
+                                               case 'potx':
+                                                       // Read slide1.xml:
+                                                       $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' ppt/slides/slide1.xml';
+                                                       break;
+                                               case 'xlsx':
+                                               case 'xltx':
+                                                       // Read sheet1.xml:
+                                                       $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' xl/worksheets/sheet1.xml';
+                                                       break;
+                                       }
+                                       CommandUtility::exec($cmd, $res);
+                                       $content_xml = implode(LF, $res);
+                                       unset($res);
+                                       $utf8_content = trim(strip_tags(str_replace('<', ' <', $content_xml)));
+                                       $contentArr = $this->pObj->splitRegularContent($utf8_content);
+                                       // Make sure the title doesn't expose the absolute path!
+                                       $contentArr['title'] = basename($absFile);
+                                       // Meta information
+                                       $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' docProps/core.xml';
+                                       CommandUtility::exec($cmd, $res);
+                                       $meta_xml = implode(LF, $res);
+                                       unset($res);
+                                       $metaContent = GeneralUtility::xml2tree($meta_xml);
+                                       if (is_array($metaContent)) {
+                                               $contentArr['title'] .= ' ' . $metaContent['cp:coreProperties'][0]['ch']['dc:title'][0]['values'][0];
+                                               $contentArr['description'] = $metaContent['cp:coreProperties'][0]['ch']['dc:subject'][0]['values'][0];
+                                               $contentArr['description'] .= ' ' . $metaContent['cp:coreProperties'][0]['ch']['dc:description'][0]['values'][0];
+                                               $contentArr['keywords'] = $metaContent['cp:coreProperties'][0]['ch']['cp:keywords'][0]['values'][0];
+                                       }
+                                       $this->setLocaleForServerFileSystem(TRUE);
+                               }
+                               break;
                        case 'sxi':
                        case 'sxc':
                        case 'sxw':
diff --git a/typo3/sysext/indexed_search/Resources/Public/Icons/FileTypes/docx.gif b/typo3/sysext/indexed_search/Resources/Public/Icons/FileTypes/docx.gif
new file mode 100644 (file)
index 0000000..6961dfd
Binary files /dev/null and b/typo3/sysext/indexed_search/Resources/Public/Icons/FileTypes/docx.gif differ
diff --git a/typo3/sysext/indexed_search/Resources/Public/Icons/FileTypes/dotx.gif b/typo3/sysext/indexed_search/Resources/Public/Icons/FileTypes/dotx.gif
new file mode 100644 (file)
index 0000000..6961dfd
Binary files /dev/null and b/typo3/sysext/indexed_search/Resources/Public/Icons/FileTypes/dotx.gif differ
diff --git a/typo3/sysext/indexed_search/Resources/Public/Icons/FileTypes/potx.gif b/typo3/sysext/indexed_search/Resources/Public/Icons/FileTypes/potx.gif
new file mode 100644 (file)
index 0000000..1e212aa
Binary files /dev/null and b/typo3/sysext/indexed_search/Resources/Public/Icons/FileTypes/potx.gif differ
diff --git a/typo3/sysext/indexed_search/Resources/Public/Icons/FileTypes/ppsx.gif b/typo3/sysext/indexed_search/Resources/Public/Icons/FileTypes/ppsx.gif
new file mode 100644 (file)
index 0000000..d8cd788
Binary files /dev/null and b/typo3/sysext/indexed_search/Resources/Public/Icons/FileTypes/ppsx.gif differ
diff --git a/typo3/sysext/indexed_search/Resources/Public/Icons/FileTypes/pptx.gif b/typo3/sysext/indexed_search/Resources/Public/Icons/FileTypes/pptx.gif
new file mode 100644 (file)
index 0000000..1e212aa
Binary files /dev/null and b/typo3/sysext/indexed_search/Resources/Public/Icons/FileTypes/pptx.gif differ
diff --git a/typo3/sysext/indexed_search/Resources/Public/Icons/FileTypes/xlsx.gif b/typo3/sysext/indexed_search/Resources/Public/Icons/FileTypes/xlsx.gif
new file mode 100644 (file)
index 0000000..a397907
Binary files /dev/null and b/typo3/sysext/indexed_search/Resources/Public/Icons/FileTypes/xlsx.gif differ
diff --git a/typo3/sysext/indexed_search/Resources/Public/Icons/FileTypes/xltx.gif b/typo3/sysext/indexed_search/Resources/Public/Icons/FileTypes/xltx.gif
new file mode 100644 (file)
index 0000000..a397907
Binary files /dev/null and b/typo3/sysext/indexed_search/Resources/Public/Icons/FileTypes/xltx.gif differ
index a0e9aa9..46b64ae 100644 (file)
@@ -35,9 +35,16 @@ $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['t3lib/class.t3lib_tcemain.php']['proc
 $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'] = array(
        'pdf'  => \TYPO3\CMS\IndexedSearch\FileContentParser::class,
        'doc'  => \TYPO3\CMS\IndexedSearch\FileContentParser::class,
+       'docx' => \TYPO3\CMS\IndexedSearch\FileContentParser::class,
+       'dotx' => \TYPO3\CMS\IndexedSearch\FileContentParser::class,
        'pps'  => \TYPO3\CMS\IndexedSearch\FileContentParser::class,
+       'ppsx' => \TYPO3\CMS\IndexedSearch\FileContentParser::class,
        'ppt'  => \TYPO3\CMS\IndexedSearch\FileContentParser::class,
+       'pptx' => \TYPO3\CMS\IndexedSearch\FileContentParser::class,
+       'potx' => \TYPO3\CMS\IndexedSearch\FileContentParser::class,
        'xls'  => \TYPO3\CMS\IndexedSearch\FileContentParser::class,
+       'xlsx' => \TYPO3\CMS\IndexedSearch\FileContentParser::class,
+       'xltx' => \TYPO3\CMS\IndexedSearch\FileContentParser::class,
        'sxc'  => \TYPO3\CMS\IndexedSearch\FileContentParser::class,
        'sxi'  => \TYPO3\CMS\IndexedSearch\FileContentParser::class,
        'sxw'  => \TYPO3\CMS\IndexedSearch\FileContentParser::class,