Changes to indexed_search
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / class.external_parser.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2001-2004 Kasper Skaarhoj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the TYPO3 project. The TYPO3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 * A copy is found in the textfile GPL.txt and important notices to the license
17 * from the author is found in LICENSE.txt distributed with these scripts.
18 *
19 *
20 * This script is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
24 *
25 * This copyright notice MUST APPEAR in all copies of the script!
26 ***************************************************************/
27 /**
28 * External standard parsers for indexed_search
29 *
30 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
31 * @coauthor Olivier Simah <noname_paris@yahoo.fr>
32 */
33 /**
34 * [CLASS/FUNCTION INDEX of SCRIPT]
35 *
36 *
37 *
38 * 73: class tx_indexed_search_extparse
39 * 90: function initParser($extension)
40 * 215: function initBackend($extension)
41 *
42 * SECTION: Reading documents (for parsing)
43 * 261: function readFileContent($ext,$absFile,$cPKey)
44 * 441: function fileContentParts($ext,$absFile)
45 * 480: function splitPdfInfo($pdfInfoArray)
46 * 499: function removeEndJunk($string)
47 *
48 * SECTION: Backend analyzer
49 * 526: function getIcon($extension)
50 *
51 * TOTAL FUNCTIONS: 7
52 * (This index is automatically created/updated by the extension "extdeveval")
53 *
54 */
55
56
57
58
59
60
61
62
63
64
65 /**
66 * External standard parsers for indexed_search
67 * MUST RETURN utf-8 content!
68 *
69 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
70 * @package TYPO3
71 * @subpackage tx_indexedsearch
72 */
73 class tx_indexed_search_extparse {
74
75 // This value is also overridden from config.
76 var $pdf_mode = -20; // zero: whole PDF file is indexed in one. positive value: Indicates number of pages at a time, eg. "5" would means 1-5,6-10,.... Negative integer would indicate (abs value) number of groups. Eg "3" groups of 10 pages would be 1-4,5-8,9-10
77
78 // This array is configured in initialization:
79 var $app = array();
80 var $ext2itemtype_map = array();
81 var $supportedExtensions = array();
82
83 var $pObj; // Reference to parent object (indexer class)
84
85
/**
 * Initialize external parser for parsing content.
 *
 * Reads the serialized "indexed_search" extension configuration, verifies that
 * the external tool(s) required for the given file extension are configured
 * (and, unless safe_mode is on, present on disk) and stores their paths in
 * $this->app. On success the extension is registered in
 * $this->supportedExtensions and $this->ext2itemtype_map.
 *
 * @param	string		File extension
 * @return	boolean		Returns true if extension is supported/enabled, otherwise false.
 */
function initParser($extension) {

		// Read indexer-config:
	$indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);

		// If windows, apply extension to tool name:
	$exe = (TYPO3_OS == 'WIN') ? '.exe' : '';
	$extOK = FALSE;
	$mainExtension = '';

		// Ignore extensions
	$ignoreExtensions = t3lib_div::trimExplode(',', strtolower($indexerConfig['ignoreExtensions']), 1);
	if (in_array($extension, $ignoreExtensions)) {
		$this->pObj->log_setTSlogMessage('Extension "'.$extension.'" was set to be ignored.',1);

		return FALSE;
	}

		// Switch on file extension.
		// Tool directories are normalized to end with exactly one slash. rtrim() is used
		// instead of ereg_replace(), which no longer exists in PHP 7+.
	switch($extension) {
		case 'pdf':
				// PDF
			if ($indexerConfig['pdftools']) {
				$pdfPath = rtrim($indexerConfig['pdftools'], '/').'/';
				if (ini_get('safe_mode') || (@is_file($pdfPath.'pdftotext'.$exe) && @is_file($pdfPath.'pdfinfo'.$exe))) {
					$this->app['pdfinfo'] = $pdfPath.'pdfinfo'.$exe;
					$this->app['pdftotext'] = $pdfPath.'pdftotext'.$exe;
						// PDF mode:
					$this->pdf_mode = t3lib_div::intInRange($indexerConfig['pdf_mode'],-100,100);
					$extOK = TRUE;
				} else {
					$this->pObj->log_setTSlogMessage("PDF tools was not found in paths '".$pdfPath."pdftotext' and/or '".$pdfPath."pdfinfo'",3);
				}
			} else {
				$this->pObj->log_setTSlogMessage('PDF tools disabled',1);
			}
		break;
		case 'doc':
				// Catdoc
			if ($indexerConfig['catdoc']) {
				$catdocPath = rtrim($indexerConfig['catdoc'], '/').'/';
				if (ini_get('safe_mode') || @is_file($catdocPath.'catdoc'.$exe)) {
					$this->app['catdoc'] = $catdocPath.'catdoc'.$exe;
					$extOK = TRUE;
				} else {
					$this->pObj->log_setTSlogMessage("'catdoc' tool for reading Word-files was not found in paths '".$catdocPath."catdoc'",3);
				}
			} else {
				$this->pObj->log_setTSlogMessage('catdoc tools (Word-files) disabled',1);
			}
		break;
		case 'pps':		// MS PowerPoint(?)
		case 'ppt':		// MS PowerPoint
				// ppthtml
			if ($indexerConfig['ppthtml']) {
				$ppthtmlPath = rtrim($indexerConfig['ppthtml'], '/').'/';
				if (ini_get('safe_mode') || @is_file($ppthtmlPath.'ppthtml'.$exe)) {
					$this->app['ppthtml'] = $ppthtmlPath.'ppthtml'.$exe;
					$extOK = TRUE;
				} else {
					$this->pObj->log_setTSlogMessage("'ppthtml' tool for reading Powerpoint-files was not found in paths '".$ppthtmlPath."ppthtml'",3);
				}
			} else {
				$this->pObj->log_setTSlogMessage('ppthtml tools (Powerpoint-files) disabled',1);
			}
		break;
		case 'xls':		// MS Excel
				// Xlhtml
			if ($indexerConfig['xlhtml']) {
				$xlhtmlPath = rtrim($indexerConfig['xlhtml'], '/').'/';
				if (ini_get('safe_mode') || @is_file($xlhtmlPath.'xlhtml'.$exe)) {
					$this->app['xlhtml'] = $xlhtmlPath.'xlhtml'.$exe;
					$extOK = TRUE;
				} else {
					$this->pObj->log_setTSlogMessage("'xlhtml' tool for reading Excel-files was not found in paths '".$xlhtmlPath."xlhtml'",3);
				}
			} else {
				$this->pObj->log_setTSlogMessage('xlhtml tools (Excel-files) disabled',1);
			}
		break;
		case 'sxc':		// Open Office Calc.
		case 'sxi':		// Open Office Impress
		case 'sxw':		// Open Office Writer
				// ooo_extract.rb can be found at: http://www.math.umd.edu/~dcarrera/openoffice/misc/tools/ooo_extract.html
				// I had to run this on debian before I could run the ooo_extract.rb script:
				// apt-get install libzlib-ruby1.8
				// apt-get install librexml-ruby1.8
			if ($indexerConfig['nativeOOMethod']) {
					// Native method: unzip the document through the "libunzipped" extension.
				if (t3lib_extMgm::isLoaded('libunzipped')) {
					$this->app['nativeOOMethod'] = TRUE;
					$extOK = TRUE;
					$this->pObj->log_setTSlogMessage('Using "libunzipped" for extraction of Open Office files, "'.$extension.'".',1);
				} else {
					$this->pObj->log_setTSlogMessage('The extension "libunzipped" was not loaded (for extraction of Open Office files, "'.$extension.'")',2);
				}
			} else {
					// ruby + ooo_extract
				if ($indexerConfig['ruby']) {
					$rubyPath = rtrim($indexerConfig['ruby'], '/').'/';
					$oooExPath = rtrim($indexerConfig['OOoExtract'], '/').'/';
					if (ini_get('safe_mode') || (@is_file($rubyPath.'ruby'.$exe) && @is_file($oooExPath.'ooo_extract.rb'))) {
						$this->app['ruby'] = $rubyPath.'ruby'.$exe;
						$this->app['OOo'] = $oooExPath.'ooo_extract.rb';
						$extOK = TRUE;
					} else {
						$this->pObj->log_setTSlogMessage("'Ruby and OOo_extract' tools for reading OOo documents were not found in paths '".$rubyPath."ruby' OR '".$oooExPath."ooo_extract.rb'",3);
					}
				} else {
					$this->pObj->log_setTSlogMessage('Ruby & OOo_extract tools (OpenOffice-files) disabled',1);
				}
			}
		break;
		case 'rtf':
				// unrtf
			if ($indexerConfig['unrtf']) {
				$unrtfPath = rtrim($indexerConfig['unrtf'], '/').'/';
				if (ini_get('safe_mode') || @is_file($unrtfPath.'unrtf'.$exe)) {
					$this->app['unrtf'] = $unrtfPath.'unrtf'.$exe;
					$extOK = TRUE;
				} else {
					$this->pObj->log_setTSlogMessage("'unrtf' tool for reading RTF-files was not found in paths '".$unrtfPath."unrtf'",3);
				}
			} else {
				$this->pObj->log_setTSlogMessage('unrtf tool (RTF-files) disabled',1);
			}
		break;
		case 'txt':		// Raw text
		case 'csv':		// Raw text
		case 'xml':		// PHP strip-tags()
		case 'tif':		// PHP EXIF
			$extOK = TRUE;
		break;
		case 'html':	// PHP strip-tags()
		case 'htm':		// PHP strip-tags()
			$extOK = TRUE;
			$mainExtension = 'html';	// making "html" the common "item_type"
		break;
		case 'jpg':		// PHP EXIF
		case 'jpeg':	// PHP EXIF
			$extOK = TRUE;
			$mainExtension = 'jpeg';	// making "jpeg" the common item_type
		break;
	}

		// If extension was OK, register it together with its item_type:
	if ($extOK) {
		$this->supportedExtensions[$extension] = TRUE;
		$this->ext2itemtype_map[$extension] = $mainExtension ? $mainExtension : $extension;
		return TRUE;
	}

		// Unknown extension, or tool disabled/not found: report failure explicitly, as documented.
	return FALSE;
}
217
/**
 * Initialize external parser for backend modules
 * Doesn't evaluate if parser is configured right - more like returning POSSIBLE supported extensions (for showing icons etc) in backend and frontend plugin
 *
 * @param	string		File extension to initialize for.
 * @return	boolean		Returns true if the extension is one of the known (possibly supported) extensions, otherwise false.
 */
function softInit($extension) {
	$knownExtensions = array(
		'pdf',		// PDF
		'doc',		// MS Word files
		'pps',		// MS PowerPoint
		'ppt',		// MS PowerPoint
		'xls',		// MS Excel
		'sxc',		// Open Office Calc.
		'sxi',		// Open Office Impress
		'sxw',		// Open Office Writer
		'rtf',		// RTF documents
		'txt',		// ASCII Text documents
		'html',		// HTML
		'htm',		// HTML
		'csv',		// Comma Separated Values
		'xml',		// Generic XML
		'jpg',		// Jpeg images (EXIF comment)
		'jpeg',		// Jpeg images (EXIF comment)
		'tif'		// TIFF images (EXIF comment)
	);

		// Always return a real boolean, so unknown extensions yield FALSE rather than NULL.
	return in_array((string)$extension, $knownExtensions, TRUE);
}
248
/**
 * Return title of entry in media type selector box.
 *
 * Ignored extensions yield FALSE. Extensions that depend on an external tool
 * only get a label when that tool is configured. Extensions without a label
 * of their own ("htm", "jpg", unknown ones) yield no value.
 *
 * @param	string		File extension
 * @return	string		String with label value of entry in media type search selector box (frontend plugin).
 */
function searchTypeMediaTitle($extension) {

		// Fetch the indexer settings:
	$settings = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);

		// Extensions configured to be ignored never get a label:
	$skipList = t3lib_div::trimExplode(',', strtolower($settings['ignoreExtensions']),1);
	if (in_array($extension, $skipList)) {
		return FALSE;
	}

		// Resolve label by file extension:
	if ($extension == 'pdf') {
			// PDF
		if ($settings['pdftools']) {
			return 'PDF';
		}
	} elseif ($extension == 'doc') {
			// Catdoc
		if ($settings['catdoc']) {
			return 'MS Word';
		}
	} elseif ($extension == 'pps' || $extension == 'ppt') {
			// ppthtml
		if ($settings['ppthtml']) {
			return 'MS Powerpoint';
		}
	} elseif ($extension == 'xls') {
			// Xlhtml
		if ($settings['xlhtml']) {
			return 'MS Excel';
		}
	} elseif ($extension == 'sxc' || $extension == 'sxi' || $extension == 'sxw') {
			// Open Office Calc / Impress / Writer
		if ($settings['nativeOOMethod'] || $settings['ruby']) {
			return 'Open Office';
		}
	} elseif ($extension == 'rtf') {
			// unrtf
		if ($settings['unrtf']) {
			return 'RTF';
		}
	} elseif (in_array($extension, array('html', 'jpeg', 'txt', 'csv', 'xml', 'tif'))) {
			// Handled by PHP itself; the label is the upper-cased extension.
		return strtoupper($extension);
	}

		// NO entry (duplicates like "htm"/"jpg", unknown extensions, or tool not configured).
}
321
/**
 * Returns true if the input extension (item_type) is potentially a multi-page extension.
 * Currently only "pdf" qualifies.
 *
 * @param	string		Extension / item_type string
 * @return	boolean		Return true if multi-page, otherwise false.
 */
function isMultiplePageExtension($extension) {
		// Always return a real boolean, so non-PDF extensions yield FALSE rather than NULL.
	return (string)$extension === 'pdf';
}
336
337
338
339
340
341
342
343
344
345 /************************
346 *
347 * Reading documents (for parsing)
348 *
349 ************************/
350
/**
 * Reads the content of an external file being indexed.
 *
 * Depending on the file extension the content is extracted with an external
 * command line tool (registered in $this->app), with the "libunzipped"
 * extension, or with PHP functions. The result is split into the standard
 * content array by the parent indexer object ($this->pObj).
 *
 * @param	string		File extension, eg. "pdf", "doc" etc.
 * @param	string		Absolute filename of file (must exist and be validated OK before calling function)
 * @param	string		Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be splitted.)
 * @return	array		Standard content array (title, description, keywords, body keys). FALSE if the extension is not supported; NULL if nothing could be extracted.
 */
function readFileContent($ext,$absFile,$cPKey) {
	$contentArr = NULL;

		// Return immediately if initialization didn't set support up:
	if (empty($this->supportedExtensions[$ext])) return FALSE;

		// The file name is passed to external tools on a shell command line. It must be
		// escaped, otherwise names containing spaces or shell metacharacters break the command.
	$shellFile = escapeshellarg($absFile);

		// Switch by file extension
	switch ($ext) {
		case 'pdf':
			if (!empty($this->app['pdfinfo'])) {
					// Getting pdf-info:
				$pdfInfoLines = array();
				exec($this->app['pdfinfo'].' '.$shellFile, $pdfInfoLines);
				$pdfInfo = $this->splitPdfInfo($pdfInfoLines);
				if (isset($pdfInfo['pages']) && intval($pdfInfo['pages'])) {
						// $cPKey has the form "low-high". Force integers, since the values end up on the command line.
					$pageRange = explode('-', (string)$cPKey);
					$low = intval($pageRange[0]);
					$high = isset($pageRange[1]) ? intval($pageRange[1]) : 0;

						// Get pdf content:
					$tempFileName = t3lib_div::tempnam('Typo3_indexer');		// Create temporary name
					@unlink ($tempFileName);	// Delete if exists, just to be safe.
					$cmd = $this->app['pdftotext'].' -f '.$low.' -l '.$high.' -enc UTF-8 -q '.$shellFile.' '.escapeshellarg($tempFileName);
					$unusedOutput = array();
					exec($cmd, $unusedOutput);

						// Default to empty content, so a failed conversion does not leave $content undefined:
					$content = '';
					if (@is_file($tempFileName)) {
						$content = t3lib_div::getUrl($tempFileName);
						unlink($tempFileName);
					} else {
						$this->pObj->log_setTSlogMessage('PDFtoText Failed on this document: '.$absFile.". Maybe the PDF file is locked for printing or encrypted.",2);
					}
					$contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
				}
			}
		break;
		case 'doc':
			if (!empty($this->app['catdoc'])) {
				$res = array();
				exec($this->app['catdoc'].' -d utf-8 '.$shellFile, $res);
				$content = implode(chr(10),$res);
				$contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
			}
		break;
		case 'pps':
		case 'ppt':
			if (!empty($this->app['ppthtml'])) {
				$res = array();
				exec($this->app['ppthtml'].' '.$shellFile, $res);
				$content = implode(chr(10),$res);
				$content = $this->pObj->convertHTMLToUtf8($content);
				$contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
				$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
			}
		break;
		case 'xls':
			if (!empty($this->app['xlhtml'])) {
				$res = array();
				exec($this->app['xlhtml'].' -nc -te '.$shellFile, $res);
				$content = implode(chr(10),$res);
				$content = $this->pObj->convertHTMLToUtf8($content);
				$contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
				$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
			}
		break;
		case 'sxi':
		case 'sxc':
		case 'sxw':
			if (!empty($this->app['nativeOOMethod'])) {
				if (t3lib_extMgm::isLoaded('libunzipped')) {

						// The included file is evaluated in this function's scope.
						// NOTE(review): presumably it reads $TYPO3_CONF_VARS (XCLASS registration) - confirm.
					global $TYPO3_CONF_VARS;
					require_once(t3lib_extMgm::extPath('libunzipped').'class.tx_libunzipped.php');

						// Initialize Unzip object:
					$unzip = t3lib_div::makeInstance('tx_libunzipped');
					$ooFiles = $unzip->init($absFile);
					if (is_array($ooFiles)) {
							// Read content.xml (text) and meta.xml (document properties):
						$content_xml = $unzip->getFileFromArchive('content.xml');
						$meta_xml = $unzip->getFileFromArchive('meta.xml');
							// A space is put in front of every tag so words do not run together when tags are stripped:
						$utf8_content = trim(strip_tags(str_replace('<',' <',$content_xml['content'])));
						$contentArr = $this->pObj->splitRegularContent($utf8_content);
						$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!

							// Meta information
						$metaContent = t3lib_div::xml2tree($meta_xml['content']);
						$metaContent = $metaContent['office:document-meta'][0]['ch']['office:meta'][0]['ch'];
						if (is_array($metaContent)) {
							$contentArr['title'] = $metaContent['dc:title'][0]['values'][0] ? $metaContent['dc:title'][0]['values'][0] : $contentArr['title'];
							$contentArr['description'] = $metaContent['dc:subject'][0]['values'][0].' '.$metaContent['dc:description'][0]['values'][0];

								// Keywords collected:
							if (is_array($metaContent['meta:keywords'][0]['ch']['meta:keyword'])) {
								foreach($metaContent['meta:keywords'][0]['ch']['meta:keyword'] as $kwDat) {
									$contentArr['keywords'].= $kwDat['values'][0].' ';
								}
							}
						}
					}
				}
			} else {
				if (!empty($this->app['ruby'])) {
					$rubyCmd = $this->app['ruby'].' '.$this->app['OOo'];

						// Extracting document headers:
					$headings = array();
					exec($rubyCmd.' --heading '.$shellFile, $headings);

						// Extracting document text:
					$texts = array();
					exec($rubyCmd.' '.$shellFile, $texts);

					$content = implode(chr(10),$headings).' '.implode(chr(10),$texts);
					$contentArr = $this->pObj->splitRegularContent($content);
					$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
				}
			}
		break;
		case 'rtf':
			if (!empty($this->app['unrtf'])) {
				$res = array();
				exec($this->app['unrtf'].' '.$shellFile, $res);
				$fileContent = implode(chr(10),$res);
				$fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
				$contentArr = $this->pObj->splitHTMLContent($fileContent);
			}
		break;
		case 'txt':
		case 'csv':		// Raw text
			$content = t3lib_div::getUrl($absFile);
				// TODO: Auto-registration of charset???? -> utf-8 (Current assuming western europe...)
			$content = $this->pObj->convertHTMLToUtf8($content, 'iso-8859-1');
			$contentArr = $this->pObj->splitRegularContent($content);
			$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
		break;
		case 'html':
		case 'htm':
			$fileContent = t3lib_div::getUrl($absFile);
			$fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
			$contentArr = $this->pObj->splitHTMLContent($fileContent);
		break;
		case 'xml':		// PHP strip-tags()
			$fileContent = t3lib_div::getUrl($absFile);

				// Finding charset from the XML declaration within the first 200 bytes.
				// preg_match() with the "i" modifier replaces eregi(), which no longer exists in PHP 7+.
			$reg = array();
			preg_match('/^[[:space:]]*<\?xml[^>]+encoding[[:space:]]*=[[:space:]]*["\'][[:space:]]*([[:alnum:]_-]+)[[:space:]]*["\']/i', substr($fileContent,0,200), $reg);
			$charset = !empty($reg[1]) ? $this->pObj->csObj->parse_charset($reg[1]) : 'utf-8';

				// Converting content:
			$fileContent = $this->pObj->convertHTMLToUtf8(strip_tags(str_replace('<',' <',$fileContent)), $charset);
			$contentArr = $this->pObj->splitRegularContent($fileContent);
			$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
		break;
		case 'jpg':		// PHP EXIF
		case 'jpeg':	// PHP EXIF
		case 'tif':		// PHP EXIF
				// The EXIF extension is optional in PHP; without it only the file name is indexed instead of a fatal error.
			$exif = function_exists('exif_read_data') ? exif_read_data($absFile, 'IFD0') : FALSE;
			if ($exif) {
				$exifComment = isset($exif['COMMENT'][0]) ? $exif['COMMENT'][0] : '';
				$exifDescription = isset($exif['ImageDescription']) ? $exif['ImageDescription'] : '';
				$comment = trim($exifComment.' '.$exifDescription);	// The comments in JPEG files are utf-8, while in Tif files they are 7-bit ascii.
			} else {
				$comment = '';
			}
			$contentArr = $this->pObj->splitRegularContent($comment);
			$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
		break;
		default:
			return FALSE;
		break;
	}

		// If no title (and why should there be...) then the file-name is set as title. This will raise the hits considerably if the search matches the document name.
	if (is_array($contentArr) && !$contentArr['title']) {
		$contentArr['title'] = str_replace('_',' ',basename($absFile));	// Substituting "_" for " " because many filenames may have this instead of a space char.
	}

	return $contentArr;
}
530
/**
 * Creates an array with pointers to divisions of document.
 * ONLY for PDF files at this point. All other types will have an array with a single element with the value "0" (zero) coming back.
 *
 * For PDF files the page count is read with the "pdfinfo" tool and the pages
 * are divided into "low-high" ranges according to $this->pdf_mode: a positive
 * value sets the number of ranges to ceil(pages / pdf_mode), otherwise
 * abs(pdf_mode) is the number of ranges (limited to between 1 and the number of pages).
 *
 * @param	string		File extension
 * @param	string		Absolute filename (must exist and be validated OK before calling function)
 * @return	array		Array of pointers to sections that the document should be divided into
 */
function fileContentParts($ext,$absFile) {
	$cParts = array(0);
	switch ($ext) {
		case 'pdf':
				// Without a registered pdfinfo tool the command would consist of the file path alone; skip instead.
			if (empty($this->app['pdfinfo'])) {
				break;
			}

				// Getting pdf-info. The file name is escaped so spaces/shell metacharacters cannot break the command:
			$pdfInfoLines = array();
			exec($this->app['pdfinfo'].' '.escapeshellarg($absFile), $pdfInfoLines);
			$pdfInfo = $this->splitPdfInfo($pdfInfoLines);
			$pages = isset($pdfInfo['pages']) ? intval($pdfInfo['pages']) : 0;

			if ($pages) {
				$cParts = array();

					// Calculate mode
				if ($this->pdf_mode>0) {
					$iter = ceil($pages/$this->pdf_mode);
				} else {
					$iter = t3lib_div::intInRange(abs($this->pdf_mode),1,$pages);
				}

					// Traverse and create intervals.
				for ($a=0;$a<$iter;$a++) {
					$low = floor($a*($pages/$iter))+1;
					$high = floor(($a+1)*($pages/$iter));
					$cParts[] = $low.'-'.$high;
				}
			}
		break;
	}
	return $cParts;
}
569
/**
 * Turns the raw output lines of the pdfinfo tool into a key/value array.
 * Each line is split at its first colon; the part before becomes the
 * (trimmed, lowercased) key and the part after the (trimmed) value.
 * Lines without a colon or with an empty key are skipped.
 *
 * @param	array		Array of PDF content, coming from the pdfinfo tool
 * @return	array		Result array
 * @access private
 * @see fileContentParts()
 */
function splitPdfInfo($pdfInfoArray) {
	$info = array();
	if (!is_array($pdfInfoArray)) {
		return $info;
	}

	foreach ($pdfInfoArray as $infoLine) {
			// Only lines of the form "Key: value" are of interest:
		$colonPos = strpos($infoLine, ':');
		if ($colonPos === FALSE) {
			continue;
		}

		$key = trim(substr($infoLine, 0, $colonPos));
		if (!$key) {
			continue;
		}

		$info[strtolower($key)] = trim(substr($infoLine, $colonPos + 1));
	}

	return $info;
}
590
/**
 * Removes some strange char(12) characters (form feeds) and line breaks that tend to occur at the end of the string from external files.
 * Afterwards surrounding whitespace is trimmed from both ends.
 *
 * @param	string		String to clean up
 * @return	string		String
 */
function removeEndJunk($string) {
		// rtrim() with an explicit character list strips the same trailing run of chr(10)/chr(12)
		// as the former ereg_replace() call, which no longer exists in PHP 7+.
	return trim(rtrim((string)$string, chr(10).chr(12)));
}
600
601
602
603
604
605
606
607
608
609
610
611
612 /************************
613 *
614 * Backend analyzer
615 *
616 ************************/
617
/**
 * Builds the icon file reference for a file extension.
 * "htm" shares the icon of "html" and "jpeg" the icon of "jpg".
 *
 * @param	string		File extension, lowercase.
 * @return	string		Relative file reference, resolvable by t3lib_div::getFileAbsFileName()
 */
function getIcon($extension) {
		// Map alias extensions onto the extension that owns the icon file:
	switch ($extension) {
		case 'htm':
			$extension = 'html';
		break;
		case 'jpeg':
			$extension = 'jpg';
		break;
	}

	return 'EXT:indexed_search/pi/res/'.$extension.'.gif';
}
629 }
630
// Include the extending class file, if one is registered for this script in the current TYPO3 mode:
if (defined('TYPO3_MODE')) {
	if ($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php']) {
		include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php']);
	}
}
634 ?>