Commit f57782b7 authored by Xavier Perseguers, committed by Tymoteusz Motylewski
Browse files

[FEATURE] Index docx, xlsx, ... from MS Office 2007 and above

* Add the new Microsoft Office (2007 and above) file formats to indexed_search
* Use unzip to extract the XML content

Change-Id: I6d4481e09c34701ef846bb48f16b2a10d3fac38a
Resolves: #23669
Releases: master
Reviewed-on: http://review.typo3.org/42863

Reviewed-by: Markus Sommer <markussom@posteo.de>
Tested-by: Markus Sommer <markussom@posteo.de>
Reviewed-by: Philipp Gampe <philipp.gampe@typo3.org>
Reviewed-by: Tymoteusz Motylewski <t.motylewski@gmail.com>
Tested-by: Tymoteusz Motylewski <t.motylewski@gmail.com>
parent ff6748ae
......@@ -150,6 +150,25 @@ class FileContentParser {
$this->pObj->log_setTSlogMessage($this->sL('LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:xlhtmlDisabled'), 1);
}
break;
case 'docx': // Microsoft Word >= 2007
case 'dotx':
case 'pptx': // Microsoft PowerPoint >= 2007
case 'ppsx':
case 'potx':
case 'xlsx': // Microsoft Excel >= 2007
case 'xltx':
if ($indexerConfig['unzip']) {
$unzipPath = rtrim($indexerConfig['unzip'], '/') . '/';
if (@is_file($unzipPath . 'unzip' . $exe)) {
$this->app['unzip'] = $unzipPath . 'unzip' . $exe;
$extOK = TRUE;
} else {
$this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:unzipNotFound'), $unzipPath), 3);
}
} else {
$this->pObj->log_setTSlogMessage($this->sL('LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:unzipDisabled'), 1);
}
break;
case 'sxc':
case 'sxi':
case 'sxw':
......@@ -225,9 +244,16 @@ class FileContentParser {
switch ($extension) {
case 'pdf':
case 'doc':
case 'docx':
case 'dotx':
case 'pps':
case 'ppsx':
case 'ppt':
case 'pptx':
case 'potx':
case 'xls':
case 'xlsx':
case 'xltx':
case 'sxc':
case 'sxi':
case 'sxw':
......@@ -293,6 +319,26 @@ class FileContentParser {
return sprintf($this->sL('LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:extension.XLS'), $extension);
}
break;
case 'docx':
case 'dotx':
// Microsoft Word >= 2007
if ($indexerConfig['unzip']) {
return sprintf($this->sL('LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:extension.DOC'), $extension);
}
break;
case 'pptx': // Microsoft PowerPoint >= 2007
case 'ppsx':
case 'potx':
if ($indexerConfig['unzip']) {
return sprintf($this->sL('LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:extension.PP'), $extension);
}
break;
case 'xlsx': // Microsoft Excel >= 2007
case 'xltx':
if ($indexerConfig['unzip']) {
return sprintf($this->sL('LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:extension.XLS'), $extension);
}
break;
case 'sxc':
// Open Office Calc.
if ($indexerConfig['unzip']) {
......@@ -481,6 +527,55 @@ class FileContentParser {
$this->setLocaleForServerFileSystem(TRUE);
}
break;
case 'docx':
case 'dotx':
case 'pptx':
case 'ppsx':
case 'potx':
case 'xlsx':
case 'xltx':
if ($this->app['unzip']) {
$this->setLocaleForServerFileSystem();
switch ($ext) {
case 'docx':
case 'dotx':
// Read document.xml:
$cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' word/document.xml';
break;
case 'ppsx':
case 'pptx':
case 'potx':
// Read slide1.xml:
$cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' ppt/slides/slide1.xml';
break;
case 'xlsx':
case 'xltx':
// Read sheet1.xml:
$cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' xl/worksheets/sheet1.xml';
break;
}
CommandUtility::exec($cmd, $res);
$content_xml = implode(LF, $res);
unset($res);
$utf8_content = trim(strip_tags(str_replace('<', ' <', $content_xml)));
$contentArr = $this->pObj->splitRegularContent($utf8_content);
// Make sure the title doesn't expose the absolute path!
$contentArr['title'] = basename($absFile);
// Meta information
$cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' docProps/core.xml';
CommandUtility::exec($cmd, $res);
$meta_xml = implode(LF, $res);
unset($res);
$metaContent = GeneralUtility::xml2tree($meta_xml);
if (is_array($metaContent)) {
$contentArr['title'] .= ' ' . $metaContent['cp:coreProperties'][0]['ch']['dc:title'][0]['values'][0];
$contentArr['description'] = $metaContent['cp:coreProperties'][0]['ch']['dc:subject'][0]['values'][0];
$contentArr['description'] .= ' ' . $metaContent['cp:coreProperties'][0]['ch']['dc:description'][0]['values'][0];
$contentArr['keywords'] = $metaContent['cp:coreProperties'][0]['ch']['cp:keywords'][0]['values'][0];
}
$this->setLocaleForServerFileSystem(TRUE);
}
break;
case 'sxi':
case 'sxc':
case 'sxw':
......
......@@ -35,9 +35,16 @@ $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['t3lib/class.t3lib_tcemain.php']['proc
$GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'] = array(
'pdf' => \TYPO3\CMS\IndexedSearch\FileContentParser::class,
'doc' => \TYPO3\CMS\IndexedSearch\FileContentParser::class,
'docx' => \TYPO3\CMS\IndexedSearch\FileContentParser::class,
'dotx' => \TYPO3\CMS\IndexedSearch\FileContentParser::class,
'pps' => \TYPO3\CMS\IndexedSearch\FileContentParser::class,
'ppsx' => \TYPO3\CMS\IndexedSearch\FileContentParser::class,
'ppt' => \TYPO3\CMS\IndexedSearch\FileContentParser::class,
'pptx' => \TYPO3\CMS\IndexedSearch\FileContentParser::class,
'potx' => \TYPO3\CMS\IndexedSearch\FileContentParser::class,
'xls' => \TYPO3\CMS\IndexedSearch\FileContentParser::class,
'xlsx' => \TYPO3\CMS\IndexedSearch\FileContentParser::class,
'xltx' => \TYPO3\CMS\IndexedSearch\FileContentParser::class,
'sxc' => \TYPO3\CMS\IndexedSearch\FileContentParser::class,
'sxi' => \TYPO3\CMS\IndexedSearch\FileContentParser::class,
'sxw' => \TYPO3\CMS\IndexedSearch\FileContentParser::class,
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment