7bcb686994d3196d6fa3fb6ada13f93a7bb1529e
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / class.external_parser.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2001-2004 Kasper Skaarhoj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the TYPO3 project. The TYPO3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 * A copy is found in the textfile GPL.txt and important notices to the license
17 * from the author is found in LICENSE.txt distributed with these scripts.
18 *
19 *
20 * This script is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
24 *
25 * This copyright notice MUST APPEAR in all copies of the script!
26 ***************************************************************/
27 /**
28 * External standard parsers for indexed_search
29 *
30 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
31 * @coauthor Olivier Simah <noname_paris@yahoo.fr>
32 */
33 /**
34 * [CLASS/FUNCTION INDEX of SCRIPT]
35 *
36 *
37 *
38 * 73: class tx_indexed_search_extparse
39 * 90: function initParser($extension)
40 * 215: function initBackend($extension)
41 *
42 * SECTION: Reading documents (for parsing)
43 * 261: function readFileContent($ext,$absFile,$cPKey)
44 * 441: function fileContentParts($ext,$absFile)
45 * 480: function splitPdfInfo($pdfInfoArray)
46 * 499: function removeEndJunk($string)
47 *
48 * SECTION: Backend analyzer
49 * 526: function getIcon($extension)
50 *
51 * TOTAL FUNCTIONS: 7
52 * (This index is automatically created/updated by the extension "extdeveval")
53 *
54 */
55
56
57
58
59
60
61
62
63
64
65 /**
66 * External standard parsers for indexed_search
67 * MUST RETURN utf-8 content!
68 *
69 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
70 * @package TYPO3
71 * @subpackage tx_indexedsearch
72 */
class tx_indexed_search_extparse {

		// PDF splitting mode. Overwritten from the extension configuration key "pdf_mode" in initParser().
		// Zero: the whole PDF file is indexed as one part.
		// Positive value: the pages are spread evenly over ceil(pages/value) parts, eg. "5" and 10 pages gives 1-5,6-10.
		// Negative value: the absolute value is the number of parts (limited to the page count), eg. "-3" and 10 pages gives 1-3,4-6,7-10.
	var $pdf_mode = -20;

		// Paths of the external tools (plus the "nativeOOMethod" flag); filled by initParser().
	var $app = array();

		// Extensions for which initParser() succeeded (extension => TRUE). readFileContent() refuses everything else.
	var $supportedExtensions = array();

		// Reference to parent object (indexer class). Used for logging and for splitting/converting content.
	var $pObj;


	/**
	 * Initialize external parser for parsing content.
	 * Looks up the tool paths in the (serialized) extension configuration of "indexed_search",
	 * stores them in $this->app and registers the extension in $this->supportedExtensions on success.
	 * Reasons for not supporting an extension are logged through $this->pObj->log_setTSlogMessage().
	 *
	 * @param	string		File extension
	 * @return	boolean		Returns true if extension is supported/enabled, otherwise false.
	 */
	function initParser($extension) {

			// Read the indexer configuration. Every key used below gets a default, so a missing or broken configuration simply disables the tools.
		$indexerConfig = array();
		if (isset($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search'])) {
			$storedConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
			if (is_array($storedConfig)) {
				$indexerConfig = $storedConfig;
			}
		}
		$indexerConfig = array_merge(
			array(
				'ignoreExtensions' => '',
				'pdftools' => '',
				'pdf_mode' => 0,
				'catdoc' => '',
				'ppthtml' => '',
				'xlhtml' => '',
				'nativeOOMethod' => '',
				'ruby' => '',
				'OOoExtract' => '',
				'unrtf' => ''
			),
			$indexerConfig
		);

			// If windows, apply extension to tool name:
		$exe = (TYPO3_OS == 'WIN') ? '.exe' : '';
		$extOK = FALSE;

			// Ignore extensions
		$ignoreExtensions = t3lib_div::trimExplode(',', strtolower($indexerConfig['ignoreExtensions']),1);
		if (in_array($extension, $ignoreExtensions)) {
			$this->pObj->log_setTSlogMessage('Extension "'.$extension.'" was set to be ignored.',1);

			return FALSE;
		}

			// Switch on file extension.
			// Each configured path gets exactly one trailing slash; the tool must exist there unless safe_mode prevents checking.
		switch($extension) {
			case 'pdf':
					// PDF (pdftotext + pdfinfo)
				if ($indexerConfig['pdftools']) {
					$pdfPath = preg_replace('#/$#D','',$indexerConfig['pdftools']).'/';
					if (ini_get('safe_mode') || (@is_file($pdfPath.'pdftotext'.$exe) && @is_file($pdfPath.'pdfinfo'.$exe))) {
						$this->app['pdfinfo'] = $pdfPath.'pdfinfo'.$exe;
						$this->app['pdftotext'] = $pdfPath.'pdftotext'.$exe;
							// PDF mode:
						$this->pdf_mode = t3lib_div::intInRange($indexerConfig['pdf_mode'],-100,100);
						$extOK = TRUE;
					} else {
						$this->pObj->log_setTSlogMessage("PDF tools was not found in paths '".$pdfPath."pdftotext' and/or '".$pdfPath."pdfinfo'",3);
					}
				} else {
					$this->pObj->log_setTSlogMessage('PDF tools disabled',1);
				}
			break;
			case 'doc':
					// Catdoc
				if ($indexerConfig['catdoc']) {
					$catdocPath = preg_replace('#/$#D','',$indexerConfig['catdoc']).'/';
					if (ini_get('safe_mode') || @is_file($catdocPath.'catdoc'.$exe)) {
						$this->app['catdoc'] = $catdocPath.'catdoc'.$exe;
						$extOK = TRUE;
					} else {
						$this->pObj->log_setTSlogMessage("'catdoc' tool for reading Word-files was not found in paths '".$catdocPath."catdoc'",3);
					}
				} else {
					$this->pObj->log_setTSlogMessage('catdoc tools (Word-files) disabled',1);
				}
			break;
			case 'pps':		// MS PowerPoint(?)
			case 'ppt':		// MS PowerPoint
					// ppthtml
				if ($indexerConfig['ppthtml']) {
					$ppthtmlPath = preg_replace('#/$#D','',$indexerConfig['ppthtml']).'/';
					if (ini_get('safe_mode') || @is_file($ppthtmlPath.'ppthtml'.$exe)) {
						$this->app['ppthtml'] = $ppthtmlPath.'ppthtml'.$exe;
						$extOK = TRUE;
					} else {
						$this->pObj->log_setTSlogMessage("'ppthtml' tool for reading Powerpoint-files was not found in paths '".$ppthtmlPath."ppthtml'",3);
					}
				} else {
					$this->pObj->log_setTSlogMessage('ppthtml tools (Powerpoint-files) disabled',1);
				}
			break;
			case 'xls':		// MS Excel
					// Xlhtml
				if ($indexerConfig['xlhtml']) {
					$xlhtmlPath = preg_replace('#/$#D','',$indexerConfig['xlhtml']).'/';
					if (ini_get('safe_mode') || @is_file($xlhtmlPath.'xlhtml'.$exe)) {
						$this->app['xlhtml'] = $xlhtmlPath.'xlhtml'.$exe;
						$extOK = TRUE;
					} else {
						$this->pObj->log_setTSlogMessage("'xlhtml' tool for reading Excel-files was not found in paths '".$xlhtmlPath."xlhtml'",3);
					}
				} else {
					$this->pObj->log_setTSlogMessage('xlhtml tools (Excel-files) disabled',1);
				}
			break;
			case 'sxc':		// Open Office Calc.
			case 'sxi':		// Open Office Impress
			case 'sxw':		// Open Office Writer
					// ooo_extract.rb can be found at: http://www.math.umd.edu/~dcarrera/openoffice/misc/tools/ooo_extract.html
					// I had to run this on debian before I could run the ooo_extract.rb script:
					// apt-get install libzlib-ruby1.8
					// apt-get install librexml-ruby1.8
				if ($indexerConfig['nativeOOMethod']) {
						// Native method: unzip the document with the "libunzipped" extension
					if (t3lib_extMgm::isLoaded('libunzipped')) {
						$this->app['nativeOOMethod'] = TRUE;
						$extOK = TRUE;
						$this->pObj->log_setTSlogMessage('Using "libunzipped" for extraction of Open Office files, "'.$extension.'".',1);
					} else {
						$this->pObj->log_setTSlogMessage('The extension "libunzipped" was not loaded (for extraction of Open Office files, "'.$extension.'")',2);
					}
				} else {
						// ruby + ooo_extract
					if ($indexerConfig['ruby']) {
						$rubyPath = preg_replace('#/$#D','',$indexerConfig['ruby']).'/';
						$oooExPath = preg_replace('#/$#D','',$indexerConfig['OOoExtract']).'/';
						if (ini_get('safe_mode') || (@is_file($rubyPath.'ruby'.$exe) && @is_file($oooExPath.'ooo_extract.rb'))) {
							$this->app['ruby'] = $rubyPath.'ruby'.$exe;
							$this->app['OOo'] = $oooExPath.'ooo_extract.rb';
							$extOK = TRUE;
						} else {
							$this->pObj->log_setTSlogMessage("'Ruby and OOo_extract' tools for reading OOo documents were not found in paths '".$rubyPath."ruby' OR '".$oooExPath."ooo_extract.rb'",3);
						}
					} else {
						$this->pObj->log_setTSlogMessage('Ruby & OOo_extract tools (OpenOffice-files) disabled',1);
					}
				}
			break;
			case 'rtf':
					// unrtf
				if ($indexerConfig['unrtf']) {
					$unrtfPath = preg_replace('#/$#D','',$indexerConfig['unrtf']).'/';
					if (ini_get('safe_mode') || @is_file($unrtfPath.'unrtf'.$exe)) {
						$this->app['unrtf'] = $unrtfPath.'unrtf'.$exe;
						$extOK = TRUE;
					} else {
						$this->pObj->log_setTSlogMessage("'unrtf' tool for reading RTF-files was not found in paths '".$unrtfPath."unrtf'",3);
					}
				} else {
					$this->pObj->log_setTSlogMessage('unrtf tool (RTF-files) disabled',1);
				}
			break;
			case 'txt':		// Raw text
			case 'html':	// PHP strip-tags()
			case 'htm':		// PHP strip-tags()
			case 'csv':		// Raw text
			case 'xml':		// PHP strip-tags()
			case 'jpg':		// PHP EXIF
			case 'jpeg':	// PHP EXIF
			case 'tif':		// PHP EXIF
					// Handled by PHP itself, no external tool needed
				$extOK = TRUE;
			break;
		}

			// If extension was OK, remember it so readFileContent() will accept it:
		if ($extOK) {
			$this->supportedExtensions[$extension] = TRUE;
			return TRUE;
		}

		return FALSE;
	}

	/**
	 * Initialize external parser for backend modules
	 * Doesn't evaluate if parser is configured right - more like returning POSSIBLE supported extensions (for showing icons etc).
	 *
	 * @param	string		File extension to initialize for.
	 * @return	boolean		Returns true if the extension is one of the known ones, otherwise false.
	 */
	function initBackend($extension) {
		switch($extension) {
			case 'pdf':		// PDF
			case 'doc':		// MS Word files
			case 'pps':		// MS PowerPoint
			case 'ppt':		// MS PowerPoint
			case 'xls':		// MS Excel
			case 'sxc':		// Open Office Calc.
			case 'sxi':		// Open Office Impress
			case 'sxw':		// Open Office Writer
			case 'rtf':		// RTF documents
			case 'txt':		// ASCII Text documents
			case 'html':	// HTML
			case 'htm':		// HTML
			case 'csv':		// Comma Separated Values
			case 'xml':		// Generic XML
			case 'jpg':		// Jpeg images (EXIF comment)
			case 'jpeg':	// Jpeg images (EXIF comment)
			case 'tif':		// TIFF images (EXIF comment)
				return TRUE;
		}

		return FALSE;
	}









	/************************
	 *
	 * Reading documents (for parsing)
	 *
	 ************************/

	/**
	 * Reads the content of an external file being indexed.
	 * Only works for extensions that initParser() accepted before.
	 *
	 * @param	string		File extension, eg. "pdf", "doc" etc.
	 * @param	string		Absolute filename of file (must exist and be validated OK before calling function)
	 * @param	string		Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be splitted, eg. "1-4")
	 * @return	array		Standard content array (title, description, keywords, body keys) as returned by the parent object's split functions. FALSE if the extension is not supported, NULL if no content was read (eg. tool not configured).
	 */
	function readFileContent($ext,$absFile,$cPKey) {
		$contentArr = NULL;

			// Return immediately if initialization didn't set support up:
		if (empty($this->supportedExtensions[$ext])) return FALSE;

			// The file name ends up on shell command lines below, so it must be quoted (spaces, shell meta characters).
			// NOTE(review): escapeshellarg() depends on the current locale for non-ASCII file names - confirm LC_CTYPE on the target system.
		$fileArg = escapeshellarg($absFile);

			// Switch by file extension
		switch ($ext) {
			case 'pdf':
				if (!empty($this->app['pdfinfo'])) {
						// Getting pdf-info:
					$infoLines = array();
					exec($this->app['pdfinfo'].' '.$fileArg,$infoLines);
					$pdfInfo = $this->splitPdfInfo($infoLines);
					if (isset($pdfInfo['pages']) && intval($pdfInfo['pages'])) {
							// Section pointer has the form "low-high". Cast to integers; only pass page limits that are actually set.
						$pageRange = explode('-',$cPKey,2);
						$low = intval($pageRange[0]);
						$high = isset($pageRange[1]) ? intval($pageRange[1]) : 0;
						$pageOptions = '';
						if ($low>0) $pageOptions.= ' -f '.$low;
						if ($high>0) $pageOptions.= ' -l '.$high;

							// Get pdf content:
						$tempFileName = t3lib_div::tempnam('Typo3_indexer');		// Create temporary name
						@unlink ($tempFileName);	// Delete if exists, just to be safe.
						$cmd = $this->app['pdftotext'].$pageOptions.' -enc UTF-8 -q '.$fileArg.' '.escapeshellarg($tempFileName);
						$toolOutput = array();
						exec($cmd,$toolOutput);
						$content = '';
						if (@is_file($tempFileName)) {
							$content = (string)t3lib_div::getUrl($tempFileName);
							unlink($tempFileName);
						} else {
							$this->pObj->log_setTSlogMessage('PDFtoText Failed on this document: '.$absFile.". Maybe the PDF file is locked for printing or encrypted.",2);
						}
						$contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
					}
				}
			break;
			case 'doc':
				if (!empty($this->app['catdoc'])) {
					$lines = array();
					exec($this->app['catdoc'].' -d utf-8 '.$fileArg,$lines);
					$content = implode(chr(10),$lines);
					$contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
				}
			break;
			case 'pps':
			case 'ppt':
				if (!empty($this->app['ppthtml'])) {
					$lines = array();
					exec($this->app['ppthtml'].' '.$fileArg,$lines);
					$content = implode(chr(10),$lines);
					$content = $this->pObj->convertHTMLToUtf8($content);
					$contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
					$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
				}
			break;
			case 'xls':
				if (!empty($this->app['xlhtml'])) {
					$lines = array();
					exec($this->app['xlhtml'].' -nc -te '.$fileArg,$lines);
					$content = implode(chr(10),$lines);
					$content = $this->pObj->convertHTMLToUtf8($content);
					$contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
					$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
				}
			break;
			case 'sxi':
			case 'sxc':
			case 'sxw':
				if (!empty($this->app['nativeOOMethod'])) {
					if (t3lib_extMgm::isLoaded('libunzipped')) {

							// The class file is included in function scope; presumably it needs $TYPO3_CONF_VARS for its own XCLASS check (same pattern as at the end of this script).
						global $TYPO3_CONF_VARS;
						require_once(t3lib_extMgm::extPath('libunzipped').'class.tx_libunzipped.php');

							// Initialize Unzip object:
						$unzip = t3lib_div::makeInstance('tx_libunzipped');
						$ooFiles = $unzip->init($absFile);
						if (is_array($ooFiles)) {
								// Read content.xml and strip the markup (a space is put before each tag so words don't melt together):
							$content_xml = $unzip->getFileFromArchive('content.xml');
							$meta_xml = $unzip->getFileFromArchive('meta.xml');
							$utf8_content = trim(strip_tags(str_replace('<',' <',$content_xml['content'])));
							$contentArr = $this->pObj->splitRegularContent($utf8_content);
							$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!

								// Meta information (only if meta.xml could be parsed into a tree):
							$metaTree = t3lib_div::xml2tree($meta_xml['content']);
							$metaContent = NULL;
							if (is_array($metaTree) && isset($metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'])) {
								$metaContent = $metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'];
							}
							if (is_array($metaContent)) {
								if (!empty($metaContent['dc:title'][0]['values'][0])) {
									$contentArr['title'] = $metaContent['dc:title'][0]['values'][0];
								}
								$subject = isset($metaContent['dc:subject'][0]['values'][0]) ? $metaContent['dc:subject'][0]['values'][0] : '';
								$description = isset($metaContent['dc:description'][0]['values'][0]) ? $metaContent['dc:description'][0]['values'][0] : '';
								$contentArr['description'] = $subject.' '.$description;

									// Keywords collected:
								if (isset($metaContent['meta:keywords'][0]['ch']['meta:keyword']) && is_array($metaContent['meta:keywords'][0]['ch']['meta:keyword'])) {
									if (!isset($contentArr['keywords'])) $contentArr['keywords'] = '';
									foreach($metaContent['meta:keywords'][0]['ch']['meta:keyword'] as $kwDat) {
										if (isset($kwDat['values'][0])) {
											$contentArr['keywords'].= $kwDat['values'][0].' ';
										}
									}
								}
							}
						}
					}
				} else {
					if (!empty($this->app['ruby'])) {
							// Extracting document headers:
						$headings = array();
						exec($this->app['ruby'].' '.$this->app['OOo'].' --heading '.$fileArg,$headings);

							// Extracting document text:
						$texts = array();
						exec($this->app['ruby'].' '.$this->app['OOo'].' '.$fileArg,$texts);

						$content = implode(chr(10),$headings).' '.implode(chr(10),$texts);
						$contentArr = $this->pObj->splitRegularContent($content);
						$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
					}
				}
			break;
			case 'rtf':
				if (!empty($this->app['unrtf'])) {
					$lines = array();
					exec($this->app['unrtf'].' '.$fileArg,$lines);
					$fileContent = implode(chr(10),$lines);
					$fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
					$contentArr = $this->pObj->splitHTMLContent($fileContent);
				}
			break;
			case 'txt':
			case 'csv':		// Raw text
				$content = t3lib_div::getUrl($absFile);
					// TODO: Auto-registration of charset???? -> utf-8 (Current assuming western europe...)
				$content = $this->pObj->convertHTMLToUtf8($content, 'iso-8859-1');
				$contentArr = $this->pObj->splitRegularContent($content);
				$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
			break;
			case 'html':
			case 'htm':
				$fileContent = t3lib_div::getUrl($absFile);
				$fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
				$contentArr = $this->pObj->splitHTMLContent($fileContent);
			break;
			case 'xml':		// PHP strip-tags()
				$fileContent = t3lib_div::getUrl($absFile);

					// Finding charset in the XML declaration (looked for in the first 200 bytes only); utf-8 if none is declared:
				$reg = array();
				preg_match('/^[[:space:]]*<\?xml[^>]+encoding[[:space:]]*=[[:space:]]*["\'][[:space:]]*([[:alnum:]_-]+)[[:space:]]*["\']/i',substr($fileContent,0,200),$reg);
				$charset = !empty($reg[1]) ? $this->pObj->csObj->parse_charset($reg[1]) : 'utf-8';

					// Converting content:
				$fileContent = $this->pObj->convertHTMLToUtf8(strip_tags(str_replace('<',' <',$fileContent)), $charset);
				$contentArr = $this->pObj->splitRegularContent($fileContent);
				$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
			break;
			case 'jpg':		// PHP EXIF
			case 'jpeg':	// PHP EXIF
			case 'tif':		// PHP EXIF
					// The exif extension is optional in PHP; without it the image is indexed with an empty comment.
				$exif = function_exists('exif_read_data') ? exif_read_data($absFile, 'IFD0') : FALSE;
				$comment = '';
				if ($exif) {
					$exifComment = isset($exif['COMMENT'][0]) ? $exif['COMMENT'][0] : '';
					$imageDescription = isset($exif['ImageDescription']) ? $exif['ImageDescription'] : '';
					$comment = trim($exifComment.' '.$imageDescription);	// The comments in JPEG files are utf-8, while in Tif files they are 7-bit ascii.
				}
				$contentArr = $this->pObj->splitRegularContent($comment);
				$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
			break;
			default:
				return FALSE;
		}

			// If no title (and why should there be...) then the file-name is set as title. This will raise the hits considerably if the search matches the document name.
		if (is_array($contentArr) && empty($contentArr['title'])) {
			$contentArr['title'] = str_replace('_',' ',basename($absFile));	// Substituting "_" for " " because many filenames may have this instead of a space char.
		}

		return $contentArr;
	}

	/**
	 * Creates an array with pointers to divisions of document.
	 * ONLY for PDF files at this point. All other types will have an array with a single element with the value "0" (zero) coming back.
	 * The same single-element array is returned for a PDF when the pdfinfo tool is not configured or reports no pages.
	 *
	 * @param	string		File extension
	 * @param	string		Absolute filename (must exist and be validated OK before calling function)
	 * @return	array		Array of pointers to sections that the document should be divided into (for PDF: strings like "1-4")
	 */
	function fileContentParts($ext,$absFile) {
		$cParts = array(0);
		switch ($ext) {
			case 'pdf':
					// Without a configured pdfinfo tool there is nothing to ask (and no command must be run):
				if (empty($this->app['pdfinfo'])) break;

					// Getting pdf-info:
				$infoLines = array();
				exec($this->app['pdfinfo'].' '.escapeshellarg($absFile),$infoLines);
				$pdfInfo = $this->splitPdfInfo($infoLines);
				$pageCount = isset($pdfInfo['pages']) ? intval($pdfInfo['pages']) : 0;

				if ($pageCount>0) {
					$cParts = array();

						// Calculate number of parts according to mode (see $this->pdf_mode):
					if ($this->pdf_mode>0) {
						$iter = intval(ceil($pageCount/$this->pdf_mode));
					} else {
						$iter = t3lib_div::intInRange(abs($this->pdf_mode),1,$pageCount);
					}

						// Traverse and create intervals. Multiplying before dividing keeps the boundaries free of float rounding errors.
					for ($a=0;$a<$iter;$a++) {
						$low = intval(floor(($a*$pageCount)/$iter))+1;
						$high = intval(floor((($a+1)*$pageCount)/$iter));
						$cParts[] = $low.'-'.$high;
					}
				}
			break;
		}
		return $cParts;
	}

	/**
	 * Analysing PDF info into a useable format.
	 * Each "Key: value" line becomes an entry with the lowercased, trimmed key; lines without a colon or with an empty key are skipped.
	 *
	 * @param	array		Array of PDF content (output lines), coming from the pdfinfo tool
	 * @return	array		Result array (lowercase key => trimmed value)
	 * @access private
	 * @see fileContentParts()
	 */
	function splitPdfInfo($pdfInfoArray) {
		$res = array();
		if (is_array($pdfInfoArray)) {
			foreach($pdfInfoArray as $line) {
					// Split at the first colon only, values may contain colons themselves:
				$parts = explode(':',$line,2);
				if (count($parts)>1 && trim($parts[0])) {
					$res[strtolower(trim($parts[0]))] = trim($parts[1]);
				}
			}
		}
		return $res;
	}

	/**
	 * Removes some strange char(12) characters and line breaks that tend to occur in the end of the string from external files.
	 * The result is trimmed as well.
	 *
	 * @param	string		String to clean up
	 * @return	string		String
	 */
	function removeEndJunk($string) {
			// trim() alone is not enough since it leaves form feeds (char 12) in place.
		return trim(preg_replace('/[\n\f]*$/D','',(string)$string));
	}












	/************************
	 *
	 * Backend analyzer
	 *
	 ************************/

	/**
	 * Return icon for file extension
	 * "htm" and "jpeg" share the icons of "html" and "jpg".
	 *
	 * @param	string		File extension, lowercase.
	 * @return	string		Relative file reference, resolvable by t3lib_div::getFileAbsFileName()
	 */
	function getIcon($extension) {
		if ($extension=='htm')	$extension = 'html';
		if ($extension=='jpeg')	$extension = 'jpg';
		return 'EXT:indexed_search/pi/res/'.$extension.'.gif';
	}
}
532
533 if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php']) {
534 include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php']);
535 }
536 ?>