2 /***************************************************************
5 * (c) 2001-2008 Kasper Skaarhoj (kasperYYYY@typo3.com)
8 * This script is part of the TYPO3 project. The TYPO3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 * A copy is found in the textfile GPL.txt and important notices to the license
17 * from the author is found in LICENSE.txt distributed with these scripts.
20 * This script is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
25 * This copyright notice MUST APPEAR in all copies of the script!
26 ***************************************************************/
28 * External standard parsers for indexed_search
30 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
31 * @coauthor Olivier Simah <noname_paris@yahoo.fr>
34 * [CLASS/FUNCTION INDEX of SCRIPT]
38 * 75: class tx_indexed_search_extparse
39 * 94: function initParser($extension)
40 * 214: function softInit($extension)
41 * 247: function searchTypeMediaTitle($extension)
42 * 323: function isMultiplePageExtension($extension)
44 * SECTION: Reading documents (for parsing)
45 * 354: function readFileContent($ext,$absFile,$cPKey)
46 * 521: function fileContentParts($ext,$absFile)
47 * 560: function splitPdfInfo($pdfInfoArray)
48 * 579: function removeEndJunk($string)
50 * SECTION: Backend analyzer
51 * 606: function getIcon($extension)
54 * (This index is automatically created/updated by the extension "extdeveval")
68 * External standard parsers for indexed_search
69 * MUST RETURN utf-8 content!
71 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
73 * @subpackage tx_indexedsearch
// External standard parsers for indexed_search; all returned content must be utf-8.
// NOTE(review): this listing carries stray source line numbers and omits lines; no declaration
// of $app is visible although the methods read and write $this->app - confirm against the original file.
39 class tx_indexed_search_extparse
{
77 // This value is also overridden from config (initParser() clamps the configured value to -100..100).
78 var $pdf_mode = -20; // Zero: whole PDF file is indexed in one. Positive value: number of pages per section, eg. "5" means 1-5, 6-10, ... Negative value: the absolute value is the number of sections, eg. "-3" for a 10 page document gives 1-3, 4-6, 7-10 (see the interval arithmetic in fileContentParts()).
80 // This array is configured in initialization:
// Maps a file extension to the item_type it is registered under; written by initParser().
82 var $ext2itemtype_map = array();
// Extensions accepted by initParser() (extension => TRUE); readFileContent() returns FALSE for anything not listed here.
83 var $supportedExtensions = array();
85 var $pObj; // Reference to parent object (indexer class); used for logging and for splitting/converting content.
89 * Initialize external parser for parsing content.
91 * @param string File extension
92 * @return boolean Returns true if extension is supported/enabled, otherwise false.
// Initializes the parser for one file extension.
// Reads the serialized indexed_search extension configuration, logs a message when the
// extension is listed in "ignoreExtensions", and for tool-backed formats stores the path of
// each external binary in $this->app when the tool is configured and its executable is found.
// Accepted extensions are registered in $this->supportedExtensions and $this->ext2itemtype_map.
// NOTE(review): the switch/case/break/return lines are absent from this listing, so which
// branch serves which extension is only partly visible - confirm against the original file.
// NOTE(review): ereg_replace() is deprecated since PHP 5.3 and removed in PHP 7; the unzip
// branch below already uses preg_replace() for the same trailing-slash cleanup.
94 function initParser($extension) {
96 // Then read indexer-config and set if appropriate:
97 $indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
99 // If windows, apply extension to tool name:
100 $exe = (TYPO3_OS
== 'WIN') ?
'.exe' : ''; // lg
// Extensions the administrator excluded (comma list, compared in lowercase):
105 $ignoreExtensions = t3lib_div
::trimExplode(',', strtolower($indexerConfig['ignoreExtensions']),1);
106 if (in_array($extension, $ignoreExtensions)) {
107 $this->pObj
->log_setTSlogMessage(sprintf($GLOBALS['LANG']->sL('LLL:EXT:indexed_search/pi/locallang.xml:ignoreExtensions'), $extension), 1);
111 // Switch on file extension:
// PDF: needs both pdftotext and pdfinfo in the configured directory.
115 if ($indexerConfig['pdftools']) {
// Strip one trailing slash, then append one, so the path always ends in "/".
116 $pdfPath = ereg_replace("\/$",'',$indexerConfig['pdftools']).'/';
// With safe_mode on, the is_file() checks are skipped (short-circuit) and the tools are assumed present.
117 if (ini_get('safe_mode') ||
(@is_file
($pdfPath.'pdftotext'.$exe) && @is_file
($pdfPath.'pdfinfo'.$exe))) {
118 $this->app
['pdfinfo'] = $pdfPath.'pdfinfo'.$exe;
119 $this->app
['pdftotext'] = $pdfPath.'pdftotext'.$exe;
// Configured sectioning mode, clamped to -100..100.
121 $this->pdf_mode
= t3lib_div
::intInRange($indexerConfig['pdf_mode'],-100,100);
123 } else $this->pObj
->log_setTSlogMessage(sprintf($GLOBALS['LANG']->sL('LLL:EXT:indexed_search/pi/locallang.xml:pdfToolsNotFound'), $pdfPath), 3);
124 } else $this->pObj
->log_setTSlogMessage($GLOBALS['LANG']->sL('LLL:EXT:indexed_search/pi/locallang.xml:pdfToolsDisabled'), 1);
// catdoc tool (same configured / found / logged pattern as above).
128 if ($indexerConfig['catdoc']) {
129 $catdocPath = ereg_replace("\/$",'',$indexerConfig['catdoc']).'/';
130 if (ini_get('safe_mode') || @is_file
($catdocPath.'catdoc'.$exe)) {
131 $this->app
['catdoc'] = $catdocPath.'catdoc'.$exe;
133 } else $this->pObj
->log_setTSlogMessage(sprintf($GLOBALS['LANG']->sL('LLL:EXT:indexed_search/pi/locallang.xml:catdocNotFound'), $catdocPath), 3);
134 } else $this->pObj
->log_setTSlogMessage($GLOBALS['LANG']->sL('LLL:EXT:indexed_search/pi/locallang.xml:catdocDisabled'), 1);
136 case 'pps': // MS PowerPoint(?)
137 case 'ppt': // MS PowerPoint
139 if ($indexerConfig['ppthtml']) {
140 $ppthtmlPath = ereg_replace('\/$','',$indexerConfig['ppthtml']).'/';
141 if (ini_get('safe_mode') || @is_file
($ppthtmlPath.'ppthtml'.$exe)){
142 $this->app
['ppthtml'] = $ppthtmlPath.'ppthtml'.$exe;
144 } else $this->pObj
->log_setTSlogMessage(sprintf($GLOBALS['LANG']->sL('LLL:EXT:indexed_search/pi/locallang.xml:ppthtmlNotFound'), $ppthtmlPath), 3);
145 } else $this->pObj
->log_setTSlogMessage($GLOBALS['LANG']->sL('LLL:EXT:indexed_search/pi/locallang.xml:ppthtmlDisabled'), 1);
147 case 'xls': // MS Excel
149 if ($indexerConfig['xlhtml']) {
150 $xlhtmlPath = ereg_replace('\/$','',$indexerConfig['xlhtml']).'/';
151 if (ini_get('safe_mode') || @is_file
($xlhtmlPath.'xlhtml'.$exe)){
152 $this->app
['xlhtml'] = $xlhtmlPath.'xlhtml'.$exe;
154 } else $this->pObj
->log_setTSlogMessage(sprintf($GLOBALS['LANG']->sL('LLL:EXT:indexed_search/pi/locallang.xml:xlhtmlNotFound'), $xlhtmlPath), 3);
155 } else $this->pObj
->log_setTSlogMessage($GLOBALS['LANG']->sL('LLL:EXT:indexed_search/pi/locallang.xml:xlhtmlDisabled'), 1);
// OpenOffice / OpenDocument formats are all read through the unzip tool.
157 case 'sxc': // Open Office Calc.
158 case 'sxi': // Open Office Impress
159 case 'sxw': // Open Office Writer
160 case 'ods': // Oasis OpenDocument Spreadsheet
161 case 'odp': // Oasis OpenDocument Presentation
162 case 'odt': // Oasis OpenDocument Text
163 if ($indexerConfig['unzip']) {
164 $unzipPath = preg_replace('/\/$/','',$indexerConfig['unzip']).'/';
165 if (ini_get('safe_mode') || @is_file
($unzipPath.'unzip'.$exe)) {
166 $this->app
['unzip'] = $unzipPath.'unzip'.$exe;
168 } else $this->pObj
->log_setTSlogMessage(sprintf($GLOBALS['LANG']->sL('LLL:EXT:indexed_search/pi/locallang.xml:unzipNotFound'), $unzipPath), 3);
169 } else $this->pObj
->log_setTSlogMessage($GLOBALS['LANG']->sL('LLL:EXT:indexed_search/pi/locallang.xml:unzipDisabled'), 1);
// unrtf tool.
173 if ($indexerConfig['unrtf']) {
174 $unrtfPath = ereg_replace("\/$",'',$indexerConfig['unrtf']).'/';
175 if (ini_get('safe_mode') || @is_file
($unrtfPath.'unrtf'.$exe)) {
176 $this->app
['unrtf'] = $unrtfPath.'unrtf'.$exe;
178 } else $this->pObj
->log_setTSlogMessage(sprintf($GLOBALS['LANG']->sL('LLL:EXT:indexed_search/pi/locallang.xml:unrtfNotFound'), $unrtfPath), 3);
179 } else $this->pObj
->log_setTSlogMessage($GLOBALS['LANG']->sL('LLL:EXT:indexed_search/pi/locallang.xml:unrtfDisabled'), 1);
// Formats handled by PHP itself; no external tool path is recorded for them here.
181 case 'txt': // Raw text
182 case 'csv': // Raw text
183 case 'xml': // PHP strip-tags()
184 case 'tif': // PHP EXIF
187 case 'html': // PHP strip-tags()
188 case 'htm': // PHP strip-tags()
190 $mainExtension = 'html'; // making "html" the common "item_type"
192 case 'jpg': // PHP EXIF
193 case 'jpeg': // PHP EXIF
195 $mainExtension = 'jpeg'; // making "jpeg" the common item_type
199 // If extension was OK:
// NOTE(review): the condition guarding the two assignments below is not visible in this listing.
201 $this->supportedExtensions
[$extension] = TRUE;
// Falls back to the extension itself when no common item_type was chosen above.
202 $this->ext2itemtype_map
[$extension] = $mainExtension ?
$mainExtension : $extension;
208 * Initialize external parser for backend modules
209 * Doesn't evaluate if parser is configured right - more like returning POSSIBLE supported extensions (for showing icons etc) in backend and frontend plugin
211 * @param string File extension to initialize for.
212 * @return boolean Returns true if the extension is supported and enabled, otherwise false.
// Soft initialization for backend modules and the frontend plugin: per the file's own description
// it reports POSSIBLY supported extensions without checking that the parser is configured.
// NOTE(review): only the case labels are visible in this listing; the switch header, any further
// labels, the shared branch body and the return statements are missing - confirm against the original file.
214 function softInit($extension) {
217 case 'doc': // MS Word files
218 case 'pps': // MS PowerPoint
219 case 'ppt': // MS PowerPoint
220 case 'xls': // MS Excel
221 case 'sxc': // Open Office Calc.
222 case 'sxi': // Open Office Impress
223 case 'sxw': // Open Office Writer
224 case 'ods': // Oasis OpenDocument Spreadsheet
225 case 'odp': // Oasis OpenDocument Presentation
226 case 'odt': // Oasis OpenDocument Text
227 case 'rtf': // RTF documents
228 case 'txt': // ASCII Text documents
231 case 'csv': // Comma Separated Values
232 case 'xml': // Generic XML
233 case 'jpg': // Jpeg images (EXIF comment)
234 case 'jpeg': // Jpeg images (EXIF comment)
235 case 'tif': // TIF images (EXIF comment)
242 * Return title of entry in media type selector box.
244 * @param string File extension
245 * @return string String with label value of entry in media type search selector box (frontend plugin).
// Returns the label shown for an extension in the media type selector box of the frontend plugin.
// Labels are fetched from locallang.xml and passed through sprintf() with the extension.
// Tool-backed formats only return a label when the matching tool path is set in the indexer
// configuration; the image, HTML, text, CSV and XML labels below have no such condition.
// NOTE(review): the switch header, several case labels, the handling of ignored extensions and the
// fall-through return are missing from this listing - confirm against the original file.
247 function searchTypeMediaTitle($extension) {
249 // Read indexer-config
250 $indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
// Extensions the administrator excluded (comma list, compared in lowercase):
253 $ignoreExtensions = t3lib_div
::trimExplode(',', strtolower($indexerConfig['ignoreExtensions']),1);
254 if (in_array($extension, $ignoreExtensions)) {
258 // Switch on file extension:
262 if ($indexerConfig['pdftools']) {
263 return sprintf($GLOBALS['LANG']->sL('LLL:EXT:indexed_search/pi/locallang.xml:extensionPDF'), $extension);
268 if ($indexerConfig['catdoc']) {
269 return sprintf($GLOBALS['LANG']->sL('LLL:EXT:indexed_search/pi/locallang.xml:extension.DOC'), $extension);
272 case 'pps': // MS PowerPoint(?)
273 case 'ppt': // MS PowerPoint
275 if ($indexerConfig['ppthtml']) {
276 return sprintf($GLOBALS['LANG']->sL('LLL:EXT:indexed_search/pi/locallang.xml:extension.PP'), $extension);
279 case 'xls': // MS Excel
281 if ($indexerConfig['xlhtml']) {
282 return sprintf($GLOBALS['LANG']->sL('LLL:EXT:indexed_search/pi/locallang.xml:extension.XLS'), $extension);
// The OpenOffice / OpenDocument formats each have their own label but all depend on the unzip tool.
285 case 'sxc': // Open Office Calc.
286 if ($indexerConfig['unzip']) {
287 return sprintf($GLOBALS['LANG']->sL('LLL:EXT:indexed_search/pi/locallang.xml:extension.SXC'), $extension);
290 case 'sxi': // Open Office Impress
291 if ($indexerConfig['unzip']) {
292 return sprintf($GLOBALS['LANG']->sL('LLL:EXT:indexed_search/pi/locallang.xml:extension.SXI'), $extension);
295 case 'sxw': // Open Office Writer
296 if ($indexerConfig['unzip']) {
297 return sprintf($GLOBALS['LANG']->sL('LLL:EXT:indexed_search/pi/locallang.xml:extension.SXW'), $extension);
300 case 'ods': // Oasis OpenDocument Spreadsheet
301 if ($indexerConfig['unzip']) {
302 return sprintf($GLOBALS['LANG']->sL('LLL:EXT:indexed_search/pi/locallang.xml:extension.ODS'), $extension);
305 case 'odp': // Oasis OpenDocument Presentation
306 if ($indexerConfig['unzip']) {
307 return sprintf($GLOBALS['LANG']->sL('LLL:EXT:indexed_search/pi/locallang.xml:extension.ODP'), $extension);
310 case 'odt': // Oasis OpenDocument Text
311 if ($indexerConfig['unzip']) {
312 return sprintf($GLOBALS['LANG']->sL('LLL:EXT:indexed_search/pi/locallang.xml:extension.ODT'), $extension);
317 if ($indexerConfig['unrtf']) {
318 return sprintf($GLOBALS['LANG']->sL('LLL:EXT:indexed_search/pi/locallang.xml:extension.RTF'), $extension);
// Formats handled by PHP itself: no tool configuration is consulted.
321 case 'jpeg': // PHP EXIF
322 case 'tif': // PHP EXIF
323 return sprintf($GLOBALS['LANG']->sL('LLL:EXT:indexed_search/pi/locallang.xml:extension.Images'), $extension);
325 case 'html': // PHP strip-tags()
326 return sprintf($GLOBALS['LANG']->sL('LLL:EXT:indexed_search/pi/locallang.xml:extension.HTML'), $extension);
328 case 'txt': // Raw text
329 return sprintf($GLOBALS['LANG']->sL('LLL:EXT:indexed_search/pi/locallang.xml:extension.TXT'), $extension);
331 case 'csv': // Raw text
332 return sprintf($GLOBALS['LANG']->sL('LLL:EXT:indexed_search/pi/locallang.xml:extension.CSV'), $extension);
334 case 'xml': // PHP strip-tags()
335 return sprintf($GLOBALS['LANG']->sL('LLL:EXT:indexed_search/pi/locallang.xml:extension.XML'), $extension);
337 // NO entry (duplicates or blank):
// "htm" and "jpg" are the duplicates of "html" and "jpeg" above.
338 case 'htm': // PHP strip-tags()
339 case 'jpg': // PHP EXIF
346 * Returns true if the input extension (item_type) is potentially a multi-page extension
348 * @param string Extension / item_type string
349 * @return boolean Return true if multi-page
// Tells whether an extension / item_type is potentially a multi-page one.
// The input is cast to string before it is switched on.
// NOTE(review): the case labels and return statements are missing from this listing; presumably
// only "pdf" qualifies, since fileContentParts() sections PDF files only - confirm against the original file.
351 function isMultiplePageExtension($extension) {
352 // Switch on file extension:
353 switch((string)$extension) {
368 /************************
370 * Reading documents (for parsing)
372 ************************/
375 * Reads the content of an external file being indexed.
377 * @param string File extension, eg. "pdf", "doc" etc.
378 * @param string Absolute filename of file (must exist and be validated OK before calling function)
379 * @param string Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be split.)
380 * @return array Standard content array (title, description, keywords, body keys)
// Reads an external file and returns its content as a content array (title, description,
// keywords, body), or FALSE when initParser() did not register the extension as supported.
// Tool-backed formats build a shell command from the path stored in $this->app; the other
// formats are read with t3lib_div::getUrl() or exif_read_data(). The parent object ($this->pObj)
// does the charset conversion and the splitting into the content array.
// NOTE(review): the switch header, most case labels and the lines that run $cmd and fill $res are
// missing from this listing (presumably exec() calls) - confirm against the original file.
// NOTE(review): $absFile is only wrapped in double quotes when commands are built; escapeshellarg()
// would be the safe way to quote it. The parameter doc requires the path to be validated by the caller.
382 function readFileContent($ext,$absFile,$cPKey) {
385 // Return immediately if initialization didn't set support up:
386 if (!$this->supportedExtensions
[$ext]) return FALSE;
388 // Switch by file extension
// PDF: pdfinfo gives the page count, pdftotext extracts the page range named by $cPKey.
391 if ($this->app
['pdfinfo']) {
393 $cmd = $this->app
['pdfinfo'].' "'.$absFile.'"';
395 $pdfInfo = $this->splitPdfInfo($res);
397 if (intval($pdfInfo['pages'])) {
// $cPKey has the form "low-high", as produced by fileContentParts().
398 list($low,$high) = explode('-',$cPKey);
401 $tempFileName = t3lib_div
::tempnam('Typo3_indexer'); // Create temporary name
402 @unlink
($tempFileName); // Delete if exists, just to be safe.
// pdftotext writes utf-8 text for pages $low..$high into the temporary file.
403 $cmd = $this->app
['pdftotext'].' -f '.$low.' -l '.$high.' -enc UTF-8 -q "'.$absFile.'" '.$tempFileName;
405 if (@is_file
($tempFileName)) {
406 $content = t3lib_div
::getUrl($tempFileName);
407 unlink($tempFileName);
// NOTE(review): presumably the else-branch of the is_file() check above; the "} else {" line is not visible.
409 $this->pObj
->log_setTSlogMessage(sprintf($GLOBALS['LANG']->sL('LLL:EXT:indexed_search/pi/locallang.xml:pdfToolsFailed'), $absFile), 2);
411 if (strlen($content)) {
412 $contentArr = $this->pObj
->splitRegularContent($this->removeEndJunk($content));
// catdoc: asked for utf-8 output directly.
418 if ($this->app
['catdoc']) {
419 $cmd = $this->app
['catdoc'].' -d utf-8 "'.$absFile.'"';
421 $content = implode(chr(10),$res);
423 $contentArr = $this->pObj
->splitRegularContent($this->removeEndJunk($content));
// ppthtml: produces HTML, which is converted to utf-8 and split as HTML.
428 if ($this->app
['ppthtml']) {
429 $cmd = $this->app
['ppthtml'].' "'.$absFile.'"';
431 $content = implode(chr(10),$res);
433 $content = $this->pObj
->convertHTMLToUtf8($content);
434 $contentArr = $this->pObj
->splitHTMLContent($this->removeEndJunk($content));
435 $contentArr['title'] = basename($absFile); // Make sure the title doesn't expose the absolute path!
// xlhtml: same HTML route as ppthtml.
439 if ($this->app
['xlhtml']) {
440 $cmd = $this->app
['xlhtml'].' -nc -te "'.$absFile.'"';
442 $content = implode(chr(10),$res);
444 $content = $this->pObj
->convertHTMLToUtf8($content);
445 $contentArr = $this->pObj
->splitHTMLContent($this->removeEndJunk($content));
446 $contentArr['title'] = basename($absFile); // Make sure the title doesn't expose the absolute path!
// unzip: content.xml supplies the body text, meta.xml supplies title, description and keywords.
455 if ($this->app
['unzip']) {
457 $cmd = $this->app
['unzip'].' -p "'.$absFile.'" content.xml';
459 $content_xml = implode(chr(10),$res);
463 $cmd = $this->app
['unzip'].' -p "'.$absFile.'" meta.xml';
465 $meta_xml = implode(chr(10),$res);
// A space is put before every "<" so that words in adjacent tags do not merge once the tags are stripped.
468 $utf8_content = trim(strip_tags(str_replace('<',' <',$content_xml)));
469 $contentArr = $this->pObj
->splitRegularContent($utf8_content);
470 $contentArr['title'] = basename($absFile); // Make sure the title doesn't expose the absolute path!
473 $metaContent = t3lib_div
::xml2tree($meta_xml);
474 $metaContent = $metaContent['office:document-meta'][0]['ch']['office:meta'][0]['ch'];
475 if (is_array($metaContent)) {
// Prefer the document's own dc:title; keep the file name when it is empty.
476 $contentArr['title'] = $metaContent['dc:title'][0]['values'][0] ?
$metaContent['dc:title'][0]['values'][0] : $contentArr['title'];
477 $contentArr['description'] = $metaContent['dc:subject'][0]['values'][0].' '.$metaContent['dc:description'][0]['values'][0];
479 // Keywords collected:
480 if (is_array($metaContent['meta:keywords'][0]['ch']['meta:keyword'])) {
481 foreach ($metaContent['meta:keywords'][0]['ch']['meta:keyword'] as $kwDat) {
482 $contentArr['keywords'].= $kwDat['values'][0].' ';
// unrtf: output is treated as HTML.
489 if ($this->app
['unrtf']) {
490 $cmd = $this->app
['unrtf'].' "'.$absFile.'"';
492 $fileContent = implode(chr(10),$res);
494 $fileContent = $this->pObj
->convertHTMLToUtf8($fileContent);
495 $contentArr = $this->pObj
->splitHTMLContent($fileContent);
499 case 'csv': // Raw text
500 $content = t3lib_div
::getUrl($absFile);
501 // TODO: Auto-registration of charset???? -> utf-8 (Current assuming western europe...)
502 $content = $this->pObj
->convertHTMLToUtf8($content, 'iso-8859-1');
503 $contentArr = $this->pObj
->splitRegularContent($content);
504 $contentArr['title'] = basename($absFile); // Make sure the title doesn't expose the absolute path!
// File is read directly and split as HTML (the case labels for this branch are not visible here).
508 $fileContent = t3lib_div
::getUrl($absFile);
509 $fileContent = $this->pObj
->convertHTMLToUtf8($fileContent);
510 $contentArr = $this->pObj
->splitHTMLContent($fileContent);
512 case 'xml': // PHP strip-tags()
513 $fileContent = t3lib_div
::getUrl($absFile);
// Look for an encoding="..." attribute of the XML declaration within the first 200 bytes; utf-8 is the fallback.
// NOTE(review): eregi() is deprecated since PHP 5.3 and removed in PHP 7.
516 eregi('^[[:space:]]*<\?xml[^>]+encoding[[:space:]]*=[[:space:]]*["\'][[:space:]]*([[:alnum:]_-]+)[[:space:]]*["\']',substr($fileContent,0,200),$reg);
517 $charset = $reg[1] ?
$this->pObj
->csObj
->parse_charset($reg[1]) : 'utf-8';
519 // Converting content:
520 $fileContent = $this->pObj
->convertHTMLToUtf8(strip_tags(str_replace('<',' <',$fileContent)), $charset);
521 $contentArr = $this->pObj
->splitRegularContent($fileContent);
522 $contentArr['title'] = basename($absFile); // Make sure the title doesn't expose the absolute path!
524 case 'jpg': // PHP EXIF
525 case 'jpeg': // PHP EXIF
526 case 'tif': // PHP EXIF
// Images contribute only their EXIF comment / description, and only if the exif extension is loaded.
527 if (function_exists('exif_read_data')) {
528 $exif = exif_read_data($absFile, 'IFD0');
534 $comment = trim($exif['COMMENT'][0].' '.$exif['ImageDescription']); // The comments in JPEG files are utf-8, while in Tif files they are 7-bit ascii.
538 $contentArr = $this->pObj
->splitRegularContent($comment);
539 $contentArr['title'] = basename($absFile); // Make sure the title doesn't expose the absolute path!
545 // If no title (and why should there be...) then the file-name is set as title. This will raise the hits considerably if the search matches the document name.
546 if (is_array($contentArr) && !$contentArr['title']) {
547 $contentArr['title'] = str_replace('_',' ',basename($absFile)); // Substituting "_" for " " because many filenames may have this instead of a space char.
554 * Creates an array with pointers to divisions of document.
555 * ONLY for PDF files at this point. All other types will have an array with a single element with the value "0" (zero) coming back.
557 * @param string File extension
558 * @param string Absolute filename (must exist and be validated OK before calling function)
559 * @return array Array of pointers to sections that the document should be divided into
// Builds the list of section pointers ("low-high" page ranges) a document is divided into.
// Per the file's own description only PDF files are sectioned; other types get a single "0" element.
// NOTE(review): the switch on $ext, the initialisation of $cParts, the line that runs $cmd and fills
// $res, the "else" between the two $iter assignments and the return are missing from this listing -
// confirm against the original file.
561 function fileContentParts($ext,$absFile) {
// Ask pdfinfo for the document properties; the page count is taken from them.
566 $cmd = $this->app
['pdfinfo'].' "'.$absFile.'"';
568 $pdfInfo = $this->splitPdfInfo($res);
571 if (intval($pdfInfo['pages'])) {
// Positive pdf_mode = pages per section, so the number of sections is pages / pdf_mode rounded up.
575 if ($this->pdf_mode
>0) {
576 $iter = ceil($pdfInfo['pages']/$this->pdf_mode
);
// Otherwise abs(pdf_mode) is the number of sections, limited to between 1 and the page count.
578 $iter = t3lib_div
::intInRange(abs($this->pdf_mode
),1,$pdfInfo['pages']);
581 // Traverse and create intervals.
// Section $a covers pages floor($a * pages/$iter) + 1 through floor(($a + 1) * pages/$iter).
582 for ($a=0;$a<$iter;$a++
) {
583 $low = floor($a*($pdfInfo['pages']/$iter))+
1;
584 $high = floor(($a+
1)*($pdfInfo['pages']/$iter));
585 $cParts[] = $low.'-'.$high;
594 * Analysing PDF info into a useable format.
596 * @param array Array of PDF content, coming from the pdfinfo tool
597 * @return array Result array
599 * @see fileContentParts()
/**
 * Analysing PDF info into a usable format.
 *
 * Every input line is split at its first colon into a name and a value. The name is
 * trimmed and lowercased and becomes the array key, the trimmed value becomes the entry.
 * Lines without a colon, or with an empty name, are skipped.
 *
 * @param	array		Array of PDF content, coming from the pdfinfo tool (one output line per element)
 * @return	array		Result array, eg. array('pages' => '12', 'title' => '...'); empty if the input is not an array
 * @see fileContentParts()
 */
function splitPdfInfo($pdfInfoArray) {
	// Always hand back an array so that callers can index the result without further checks.
	$res = array();
	if (is_array($pdfInfoArray)) {
		foreach ($pdfInfoArray as $line) {
			// Limit of 2 keeps any further colons (eg. in dates or titles) inside the value.
			$parts = explode(':', $line, 2);
			if (count($parts) > 1 && trim($parts[0])) {
				$res[strtolower(trim($parts[0]))] = trim($parts[1]);
			}
		}
	}
	return $res;
}
615 * Removes some strange char(12) characters and line breaks that tend to occur at the end of the string from external files.
617 * @param string String to clean up
618 * @return string String
/**
 * Removes the form feed (char 12) and line feed (char 10) characters that tend to occur
 * at the end of the output from external tools, then trims the string.
 *
 * @param	string		String to clean up
 * @return	string		Cleaned string
 */
function removeEndJunk($string) {
	// preg_replace() replaces ereg_replace(), which no longer exists as of PHP 7.
	// \z anchors at the very end of the string, like "$" did in the POSIX expression.
	return trim(preg_replace('/[\n\f]*\z/', '', $string));
}
635 /************************
639 ************************/
642 * Return icon for file extension
644 * @param string File extension, lowercase.
645 * @return string Relative file reference, resolvable by t3lib_div::getFileAbsFileName()
/**
 * Return icon for file extension.
 *
 * "htm" and "jpeg" are mapped onto "html" and "jpg" before the reference is built,
 * so each pair of extensions shares one icon file.
 *
 * @param	string		File extension, lowercase.
 * @return	string		Relative file reference, resolvable by t3lib_div::getFileAbsFileName()
 */
function getIcon($extension) {
	if ($extension == 'htm') {
		$extension = 'html';
	}
	if ($extension == 'jpeg') {
		$extension = 'jpg';
	}
	return 'EXT:indexed_search/pi/res/' . $extension . '.gif';
}
// XCLASS hook: when an extension class file is registered for this script in the
// configuration of the current TYPO3_MODE, include it once.
if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php']) {
	include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php']);
}