Fixed bug #5826: indexed_search: specify media types in advanced search
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / class.external_parser.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2001-2008 Kasper Skaarhoj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the TYPO3 project. The TYPO3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 * A copy is found in the textfile GPL.txt and important notices to the license
17 * from the author is found in LICENSE.txt distributed with these scripts.
18 *
19 *
20 * This script is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
24 *
25 * This copyright notice MUST APPEAR in all copies of the script!
26 ***************************************************************/
27 /**
28 * External standard parsers for indexed_search
29 *
30 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
31 * @coauthor Olivier Simah <noname_paris@yahoo.fr>
32 */
33 /**
34 * [CLASS/FUNCTION INDEX of SCRIPT]
35 *
36 *
37 *
38 * 75: class tx_indexed_search_extparse
39 * 94: function initParser($extension)
40 * 214: function softInit($extension)
41 * 247: function searchTypeMediaTitle($extension)
42 * 323: function isMultiplePageExtension($extension)
43 *
44 * SECTION: Reading documents (for parsing)
45 * 354: function readFileContent($ext,$absFile,$cPKey)
46 * 521: function fileContentParts($ext,$absFile)
47 * 560: function splitPdfInfo($pdfInfoArray)
48 * 579: function removeEndJunk($string)
49 *
50 * SECTION: Backend analyzer
51 * 606: function getIcon($extension)
52 *
53 * TOTAL FUNCTIONS: 9
54 * (This index is automatically created/updated by the extension "extdeveval")
55 *
56 */
57
58
59
60
61
62
63
64
65
66
67 /**
68 * External standard parsers for indexed_search
69 * MUST RETURN utf-8 content!
70 *
71 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
72 * @package TYPO3
73 * @subpackage tx_indexedsearch
74 */
class tx_indexed_search_extparse {

		// Prefix of all locallang labels used by this class.
	var $locallangPrefix = 'LLL:EXT:indexed_search/pi/locallang.xml:';

		// This value is also overridden from config (see initParser()).
		// It controls how fileContentParts() divides a PDF into page ranges:
		//   zero:     the whole PDF is indexed as one part.
		//   positive: upper limit of pages per part; the document is split into ceil(pages / value) parts of (nearly) equal size, eg. "5" turns 12 pages into 1-4,5-8,9-12.
		//   negative: the absolute value is the number of parts (never more than there are pages), eg. "-3" turns 10 pages into 1-3,4-6,7-10.
	var $pdf_mode = -20;

		// These arrays are filled during initialization (initParser()):
	var $app = array();					// Tool name => path of the executable
	var $ext2itemtype_map = array();	// File extension => item_type it is stored under
	var $supportedExtensions = array();	// File extension => TRUE for every successfully initialized extension

	var $pObj;		// Reference to parent object (indexer class)


	/**
	 * Initialize external parser for parsing content.
	 * For extensions that need an external tool the tool is looked up in the directory given in the
	 * extension configuration and registered in $this->app. Problems are reported through the parent
	 * object's log_setTSlogMessage().
	 *
	 * @param	string		File extension
	 * @return	boolean		Returns true if extension is supported/enabled, otherwise false.
	 */
	function initParser($extension) {
		$indexerConfig = $this->getIndexerConfig();

			// Ignore extensions
		if ($this->isIgnoredExtension($extension, $indexerConfig)) {
			$this->pObj->log_setTSlogMessage(sprintf($GLOBALS['LANG']->sL($this->locallangPrefix . 'ignoreExtensions'), $extension), 1);
			return FALSE;
		}

		$extOK = FALSE;
		$mainExtension = '';

			// Switch on file extension:
		switch ((string)$extension) {
			case 'pdf':		// PDF (pdfinfo + pdftotext)
				$extOK = $this->registerExternalTools($indexerConfig['pdftools'], array('pdfinfo', 'pdftotext'), 'pdfTools');
				if ($extOK) {
						// PDF mode:
					$this->pdf_mode = t3lib_div::intInRange($indexerConfig['pdf_mode'], -100, 100);
				}
			break;
			case 'doc':		// MS Word (catdoc)
				$extOK = $this->registerExternalTools($indexerConfig['catdoc'], array('catdoc'), 'catdoc');
			break;
			case 'pps':		// MS PowerPoint(?)
			case 'ppt':		// MS PowerPoint (ppthtml)
				$extOK = $this->registerExternalTools($indexerConfig['ppthtml'], array('ppthtml'), 'ppthtml');
			break;
			case 'xls':		// MS Excel (xlhtml)
				$extOK = $this->registerExternalTools($indexerConfig['xlhtml'], array('xlhtml'), 'xlhtml');
			break;
			case 'sxc':		// Open Office Calc.
			case 'sxi':		// Open Office Impress
			case 'sxw':		// Open Office Writer
			case 'ods':		// Oasis OpenDocument Spreadsheet
			case 'odp':		// Oasis OpenDocument Presentation
			case 'odt':		// Oasis OpenDocument Text
				$extOK = $this->registerExternalTools($indexerConfig['unzip'], array('unzip'), 'unzip');
			break;
			case 'rtf':		// RTF (unrtf)
				$extOK = $this->registerExternalTools($indexerConfig['unrtf'], array('unrtf'), 'unrtf');
			break;
			case 'txt':		// Raw text
			case 'csv':		// Raw text
			case 'xml':		// PHP strip-tags()
			case 'tif':		// PHP EXIF
				$extOK = TRUE;
			break;
			case 'html':	// PHP strip-tags()
			case 'htm':		// PHP strip-tags()
				$extOK = TRUE;
				$mainExtension = 'html';	// making "html" the common "item_type"
			break;
			case 'jpg':		// PHP EXIF
			case 'jpeg':	// PHP EXIF
				$extOK = TRUE;
				$mainExtension = 'jpeg';	// making "jpeg" the common item_type
			break;
		}

		if (!$extOK) {
			return FALSE;
		}

			// Extension was OK: remember it and the item_type it maps to.
		$this->supportedExtensions[$extension] = TRUE;
		$this->ext2itemtype_map[$extension] = $mainExtension ? $mainExtension : $extension;
		return TRUE;
	}

	/**
	 * Initialize external parser for backend modules
	 * Doesn't evaluate if parser is configured right - more like returning POSSIBLE supported extensions (for showing icons etc) in backend and frontend plugin
	 *
	 * @param	string		File extension to initialize for.
	 * @return	boolean		Returns true if the extension is one of the known file types, otherwise false.
	 */
	function softInit($extension) {
		$knownExtensions = array(
			'pdf',		// PDF
			'doc',		// MS Word files
			'pps',		// MS PowerPoint
			'ppt',		// MS PowerPoint
			'xls',		// MS Excel
			'sxc',		// Open Office Calc.
			'sxi',		// Open Office Impress
			'sxw',		// Open Office Writer
			'ods',		// Oasis OpenDocument Spreadsheet
			'odp',		// Oasis OpenDocument Presentation
			'odt',		// Oasis OpenDocument Text
			'rtf',		// RTF documents
			'txt',		// ASCII Text documents
			'html',		// HTML
			'htm',		// HTML
			'csv',		// Comma Separated Values
			'xml',		// Generic XML
			'jpg',		// Jpeg images (EXIF comment)
			'jpeg',		// Jpeg images (EXIF comment)
			'tif',		// TIF images (EXIF comment)
		);
		return in_array((string)$extension, $knownExtensions, TRUE);
	}

	/**
	 * Return title of entry in media type selector box.
	 * Types which depend on an external tool only get a title if that tool is configured.
	 * "htm" and "jpg" deliberately have no entry of their own (they are duplicates of "html" / "jpeg").
	 *
	 * @param	string		File extension
	 * @return	mixed		String with label value of entry in media type search selector box (frontend plugin), or FALSE if there is no entry for the extension.
	 */
	function searchTypeMediaTitle($extension) {
		$indexerConfig = $this->getIndexerConfig();

			// Ignore extensions
		if ($this->isIgnoredExtension($extension, $indexerConfig)) {
			return FALSE;
		}

			// Extension => array(configuration option of the required tool or '' if none is needed, label suffix).
			// All labels follow the scheme "extension.<SUFFIX>".
		$titleMap = array(
			'pdf' => array('pdftools', 'PDF'),
			'doc' => array('catdoc', 'DOC'),
			'pps' => array('ppthtml', 'PP'),	// MS PowerPoint(?)
			'ppt' => array('ppthtml', 'PP'),	// MS PowerPoint
			'xls' => array('xlhtml', 'XLS'),	// MS Excel
			'sxc' => array('unzip', 'SXC'),		// Open Office Calc.
			'sxi' => array('unzip', 'SXI'),		// Open Office Impress
			'sxw' => array('unzip', 'SXW'),		// Open Office Writer
			'ods' => array('unzip', 'ODS'),		// Oasis OpenDocument Spreadsheet
			'odp' => array('unzip', 'ODP'),		// Oasis OpenDocument Presentation
			'odt' => array('unzip', 'ODT'),		// Oasis OpenDocument Text
			'rtf' => array('unrtf', 'RTF'),
			'jpeg' => array('', 'Images'),		// PHP EXIF
			'tif' => array('', 'Images'),		// PHP EXIF
			'html' => array('', 'HTML'),		// PHP strip-tags()
			'txt' => array('', 'TXT'),			// Raw text
			'csv' => array('', 'CSV'),			// Raw text
			'xml' => array('', 'XML'),			// PHP strip-tags()
		);

		$key = (string)$extension;
		if (!isset($titleMap[$key])) {
			return FALSE;
		}

		list($requiredTool, $labelSuffix) = $titleMap[$key];
		if ($requiredTool !== '' && !$indexerConfig[$requiredTool]) {
			return FALSE;
		}

		return sprintf($GLOBALS['LANG']->sL($this->locallangPrefix . 'extension.' . $labelSuffix), $extension);
	}

	/**
	 * Returns true if the input extension (item_type) is potentially a multi-page extension
	 *
	 * @param	string		Extension / item_type string
	 * @return	boolean		Return true if multi-page (currently only "pdf")
	 */
	function isMultiplePageExtension($extension) {
		return (string)$extension === 'pdf';
	}

	/**
	 * Reads the indexed_search extension configuration.
	 *
	 * @return	array		Configuration array; every option read by this class is present (empty if not configured).
	 * @access private
	 */
	function getIndexerConfig() {
		$defaults = array(
			'pdftools' => '',
			'pdf_mode' => 0,
			'catdoc' => '',
			'ppthtml' => '',
			'xlhtml' => '',
			'unzip' => '',
			'unrtf' => '',
			'ignoreExtensions' => '',
		);

		$config = FALSE;
		if (isset($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search'])) {
			$config = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
		}

			// A missing or broken configuration string is treated as "nothing configured".
		return is_array($config) ? array_merge($defaults, $config) : $defaults;
	}

	/**
	 * Tells whether an extension is listed in the "ignoreExtensions" option (comma list, compared in lowercase).
	 *
	 * @param	string		File extension
	 * @param	array		Configuration from getIndexerConfig()
	 * @return	boolean		True if the extension must be ignored
	 * @access private
	 */
	function isIgnoredExtension($extension, $indexerConfig) {
		$ignoreExtensions = t3lib_div::trimExplode(',', strtolower($indexerConfig['ignoreExtensions']), 1);
		return in_array((string)$extension, $ignoreExtensions, TRUE);
	}

	/**
	 * Registers the executables of an external tool in $this->app.
	 * Writes a log message through the parent object if the tool is disabled or cannot be found.
	 *
	 * @param	string		Directory of the tool as configured; an empty value means the tool is disabled.
	 * @param	array		Names of the executables (without ".exe") which must all be present in the directory.
	 * @param	string		Label prefix; the labels "<prefix>NotFound" and "<prefix>Disabled" are used for logging.
	 * @return	boolean		True if the executables were registered
	 * @access private
	 */
	function registerExternalTools($configuredPath, $binaries, $labelPrefix) {
		if (!$configuredPath) {
			$this->pObj->log_setTSlogMessage($GLOBALS['LANG']->sL($this->locallangPrefix . $labelPrefix . 'Disabled'), 1);
			return FALSE;
		}

			// If windows, apply extension to tool name:
		$exe = (TYPO3_OS == 'WIN') ? '.exe' : '';
			// Make sure the directory ends with exactly the one slash appended here:
		$toolPath = preg_replace('/\/$/', '', $configuredPath) . '/';

			// With safe_mode on, the existence check is skipped and the tools are registered unchecked.
		if (!ini_get('safe_mode')) {
			foreach ($binaries as $binary) {
				if (!@is_file($toolPath . $binary . $exe)) {
					$this->pObj->log_setTSlogMessage(sprintf($GLOBALS['LANG']->sL($this->locallangPrefix . $labelPrefix . 'NotFound'), $toolPath), 3);
					return FALSE;
				}
			}
		}

		foreach ($binaries as $binary) {
			$this->app[$binary] = $toolPath . $binary . $exe;
		}
		return TRUE;
	}









	/************************
	 *
	 * Reading documents (for parsing)
	 *
	 ************************/

	/**
	 * Reads the content of an external file being indexed.
	 * File names are passed to the external tools through escapeshellarg().
	 * NOTE(review): escapeshellarg() depends on the current locale for non-ASCII file names - confirm LC_CTYPE is configured suitably.
	 *
	 * @param	string		File extension, eg. "pdf", "doc" etc.
	 * @param	string		Absolute filename of file (must exist and be validated OK before calling function)
	 * @param	string		Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be split, eg. "1-4".)
	 * @return	mixed		Standard content array (title, description, keywords, body keys). FALSE if the extension is not supported/initialized, NULL if no content could be read.
	 */
	function readFileContent($ext,$absFile,$cPKey) {
		$contentArr = NULL;

			// Return immediately if initialization didn't set support up:
		if (empty($this->supportedExtensions[$ext])) {
			return FALSE;
		}

		$fileArgument = escapeshellarg($absFile);
		$fileName = basename($absFile);

			// Switch by file extension
		switch ($ext) {
			case 'pdf':
				if (!empty($this->app['pdfinfo'])) {
						// Getting pdf-info:
					$pdfInfo = $this->splitPdfInfo($this->execCommand($this->app['pdfinfo'] . ' ' . $fileArgument));
					if (isset($pdfInfo['pages']) && intval($pdfInfo['pages'])) {
							// Page range "low-high"; forced to integers because it ends up in a shell command.
						$range = explode('-', $cPKey);
						$low = intval($range[0]);
						$high = isset($range[1]) ? intval($range[1]) : 0;

							// Get pdf content:
						$tempFileName = t3lib_div::tempnam('Typo3_indexer');	// Create temporary name
						@unlink($tempFileName);	// Delete if exists, just to be safe.
						exec($this->app['pdftotext'] . ' -f ' . $low . ' -l ' . $high . ' -enc UTF-8 -q ' . $fileArgument . ' ' . escapeshellarg($tempFileName));

						$content = '';
						if (@is_file($tempFileName)) {
							$content = (string)t3lib_div::getUrl($tempFileName);
							unlink($tempFileName);
						} else {
							$this->pObj->log_setTSlogMessage(sprintf($GLOBALS['LANG']->sL($this->locallangPrefix . 'pdfToolsFailed'), $absFile), 2);
						}
						if (strlen($content)) {
							$contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
						}
					}
				}
			break;
			case 'doc':
				if (!empty($this->app['catdoc'])) {
					$content = $this->execAndJoin($this->app['catdoc'] . ' -d utf-8 ' . $fileArgument);
					$contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
				}
			break;
			case 'pps':
			case 'ppt':
				if (!empty($this->app['ppthtml'])) {
					$content = $this->execAndJoin($this->app['ppthtml'] . ' ' . $fileArgument);
					$content = $this->pObj->convertHTMLToUtf8($content);
					$contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
					$contentArr['title'] = $fileName;	// Make sure the title doesn't expose the absolute path!
				}
			break;
			case 'xls':
				if (!empty($this->app['xlhtml'])) {
					$content = $this->execAndJoin($this->app['xlhtml'] . ' -nc -te ' . $fileArgument);
					$content = $this->pObj->convertHTMLToUtf8($content);
					$contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
					$contentArr['title'] = $fileName;	// Make sure the title doesn't expose the absolute path!
				}
			break;
			case 'sxi':
			case 'sxc':
			case 'sxw':
			case 'ods':
			case 'odp':
			case 'odt':
				if (!empty($this->app['unzip'])) {
						// Read content.xml and meta.xml from the archive:
					$content_xml = $this->execAndJoin($this->app['unzip'] . ' -p ' . $fileArgument . ' content.xml');
					$meta_xml = $this->execAndJoin($this->app['unzip'] . ' -p ' . $fileArgument . ' meta.xml');

						// A space is put in front of every tag so that words from adjacent elements don't get glued together when the tags are stripped.
					$utf8_content = trim(strip_tags(str_replace('<', ' <', $content_xml)));
					$contentArr = $this->pObj->splitRegularContent($utf8_content);
					$contentArr['title'] = $fileName;	// Make sure the title doesn't expose the absolute path!

						// Meta information (only used if the parsed tree contains the expected structure):
					$metaTree = t3lib_div::xml2tree($meta_xml);
					if (is_array($metaTree) && isset($metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch']) && is_array($metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'])) {
						$metaContent = $metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'];

						$metaTitle = $this->firstMetaValue($metaContent, 'dc:title');
						if ($metaTitle) {
							$contentArr['title'] = $metaTitle;
						}
						$contentArr['description'] = $this->firstMetaValue($metaContent, 'dc:subject') . ' ' . $this->firstMetaValue($metaContent, 'dc:description');

							// Keywords collected:
						if (isset($metaContent['meta:keywords'][0]['ch']['meta:keyword']) && is_array($metaContent['meta:keywords'][0]['ch']['meta:keyword'])) {
							if (!isset($contentArr['keywords'])) {
								$contentArr['keywords'] = '';
							}
							foreach ($metaContent['meta:keywords'][0]['ch']['meta:keyword'] as $kwDat) {
								$contentArr['keywords'] .= (isset($kwDat['values'][0]) ? $kwDat['values'][0] : '') . ' ';
							}
						}
					}
				}
			break;
			case 'rtf':
				if (!empty($this->app['unrtf'])) {
					$fileContent = $this->execAndJoin($this->app['unrtf'] . ' ' . $fileArgument);
					$fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
					$contentArr = $this->pObj->splitHTMLContent($fileContent);
				}
			break;
			case 'txt':
			case 'csv':		// Raw text
				$content = t3lib_div::getUrl($absFile);
					// TODO: Auto-registration of charset???? -> utf-8 (Current assuming western europe...)
				$content = $this->pObj->convertHTMLToUtf8($content, 'iso-8859-1');
				$contentArr = $this->pObj->splitRegularContent($content);
				$contentArr['title'] = $fileName;	// Make sure the title doesn't expose the absolute path!
			break;
			case 'html':
			case 'htm':
				$fileContent = t3lib_div::getUrl($absFile);
				$fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
				$contentArr = $this->pObj->splitHTMLContent($fileContent);
			break;
			case 'xml':		// PHP strip-tags()
				$fileContent = (string)t3lib_div::getUrl($absFile);

					// Finding charset in the XML declaration (only the first 200 bytes are inspected); utf-8 is used if none is declared:
				$reg = array();
				preg_match('/^\s*<\?xml[^>]+encoding\s*=\s*["\']\s*([[:alnum:]_-]+)\s*["\']/i', substr($fileContent, 0, 200), $reg);
				$charset = !empty($reg[1]) ? $this->pObj->csObj->parse_charset($reg[1]) : 'utf-8';

					// Converting content:
				$fileContent = $this->pObj->convertHTMLToUtf8(strip_tags(str_replace('<', ' <', $fileContent)), $charset);
				$contentArr = $this->pObj->splitRegularContent($fileContent);
				$contentArr['title'] = $fileName;	// Make sure the title doesn't expose the absolute path!
			break;
			case 'jpg':		// PHP EXIF
			case 'jpeg':	// PHP EXIF
			case 'tif':		// PHP EXIF
				$exif = function_exists('exif_read_data') ? exif_read_data($absFile, 'IFD0') : FALSE;

				$comment = '';
				if ($exif) {
						// The comments in JPEG files are utf-8, while in Tif files they are 7-bit ascii.
					$jpegComment = isset($exif['COMMENT'][0]) ? $exif['COMMENT'][0] : '';
					$imageDescription = isset($exif['ImageDescription']) ? $exif['ImageDescription'] : '';
					$comment = trim($jpegComment . ' ' . $imageDescription);
				}
				$contentArr = $this->pObj->splitRegularContent($comment);
				$contentArr['title'] = $fileName;	// Make sure the title doesn't expose the absolute path!
			break;
			default:
				return FALSE;
			break;
		}

			// If no title (and why should there be...) then the file-name is set as title. This will raise the hits considerably if the search matches the document name.
		if (is_array($contentArr) && empty($contentArr['title'])) {
			$contentArr['title'] = str_replace('_', ' ', $fileName);	// Substituting "_" for " " because many filenames may have this instead of a space char.
		}

		return $contentArr;
	}

	/**
	 * Creates an array with pointers to divisions of document.
	 * ONLY for PDF files at this point. All other types will have an array with a single element with the value "0" (zero) coming back.
	 * The same single-element array is returned for a PDF if pdfinfo is not registered or reports no pages.
	 *
	 * @param	string		File extension
	 * @param	string		Absolute filename (must exist and be validated OK before calling function)
	 * @return	array		Array of pointers to sections that the document should be divided into (for PDF: "low-high" page ranges)
	 */
	function fileContentParts($ext,$absFile) {
		$cParts = array(0);

		if ((string)$ext !== 'pdf' || empty($this->app['pdfinfo'])) {
			return $cParts;
		}

			// Getting pdf-info:
		$pdfInfo = $this->splitPdfInfo($this->execCommand($this->app['pdfinfo'] . ' ' . escapeshellarg($absFile)));
		$pages = isset($pdfInfo['pages']) ? intval($pdfInfo['pages']) : 0;
		if ($pages <= 0) {
			return $cParts;
		}

			// Calculate number of parts from the mode (see $pdf_mode):
		if ($this->pdf_mode > 0) {
			$iter = (int)ceil($pages / $this->pdf_mode);
		} else {
			$iter = t3lib_div::intInRange(abs($this->pdf_mode), 1, $pages);
		}

			// Traverse and create intervals. The product is calculated before dividing so that the
			// boundaries are derived from exact integers; the last interval then always ends on the last page.
		$cParts = array();
		for ($a = 0; $a < $iter; $a++) {
			$low = (int)floor(($a * $pages) / $iter) + 1;
			$high = (int)floor((($a + 1) * $pages) / $iter);
			$cParts[] = $low . '-' . $high;
		}

		return $cParts;
	}

	/**
	 * Analysing PDF info into a useable format.
	 * Every "Key: value" line becomes an entry with the lowercased, trimmed key; lines without a colon or with an empty key are skipped.
	 *
	 * @param	array		Array of PDF content, coming from the pdfinfo tool
	 * @return	array		Result array
	 * @access private
	 * @see fileContentParts()
	 */
	function splitPdfInfo($pdfInfoArray) {
		$info = array();
		if (!is_array($pdfInfoArray)) {
			return $info;
		}

		foreach ($pdfInfoArray as $line) {
				// Split at the first colon only, the value may contain colons itself.
			$keyValue = explode(':', $line, 2);
			if (count($keyValue) < 2) {
				continue;
			}
			$key = trim($keyValue[0]);
			if ($key) {
				$info[strtolower($key)] = trim($keyValue[1]);
			}
		}
		return $info;
	}

	/**
	 * Removes some strange char(12) characters and line breaks that tend to occur in the end of the string from external files.
	 *
	 * @param	string		String to clean up
	 * @return	string		String
	 */
	function removeEndJunk($string) {
			// First the trailing run of line feeds / form feeds (trim() alone does not remove form feeds), then surrounding whitespace.
		return trim(rtrim($string, chr(10) . chr(12)));
	}

	/**
	 * Executes a shell command and returns the lines it printed.
	 *
	 * @param	string		Command line; arguments must already be escaped.
	 * @return	array		Output lines
	 * @access private
	 */
	function execCommand($cmd) {
		$lines = array();
		exec($cmd, $lines);
		return $lines;
	}

	/**
	 * Executes a shell command and returns its output as one string with lines joined by line feeds.
	 *
	 * @param	string		Command line; arguments must already be escaped.
	 * @return	string		Output
	 * @access private
	 */
	function execAndJoin($cmd) {
		return implode(chr(10), $this->execCommand($cmd));
	}

	/**
	 * Returns the first value of the first element with the given tag name from a branch of the parsed meta.xml tree.
	 *
	 * @param	array		Child elements of the "office:meta" element
	 * @param	string		Tag name, eg. "dc:title"
	 * @return	string		The value or an empty string if not present
	 * @access private
	 */
	function firstMetaValue($metaContent, $tagName) {
		return isset($metaContent[$tagName][0]['values'][0]) ? $metaContent[$tagName][0]['values'][0] : '';
	}












	/************************
	 *
	 * Backend analyzer
	 *
	 ************************/

	/**
	 * Return icon for file extension
	 * "htm" shares the icon of "html" and "jpeg" the one of "jpg".
	 *
	 * @param	string		File extension, lowercase.
	 * @return	string		Relative file reference, resolvable by t3lib_div::getFileAbsFileName()
	 */
	function getIcon($extension) {
		$iconAliases = array(
			'htm' => 'html',
			'jpeg' => 'jpg',
		);
		$key = (string)$extension;
		if (isset($iconAliases[$key])) {
			$extension = $iconAliases[$key];
		}
		return 'EXT:indexed_search/pi/res/' . $extension . '.gif';
	}
}
653
// XCLASS hook: if an extending class file is registered for this script in TYPO3_CONF_VARS, include it.
// The configuration is read through $GLOBALS (as the class above does) so the check also works when
// this file is included from within a function, and !empty() avoids reading an undefined array key.
if (defined('TYPO3_MODE') && !empty($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php'])) {
	include_once($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php']);
}
657 ?>