[TASK] Rename ExtensionManager class Part 2
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / Classes / FileContentParser.php
1 <?php
2 namespace TYPO3\CMS\IndexedSearch;
3
4 /***************************************************************
5 * Copyright notice
6 *
7 * (c) 2001-2011 Kasper Skårhøj (kasperYYYY@typo3.com)
8 * All rights reserved
9 *
10 * This script is part of the TYPO3 project. The TYPO3 project is
11 * free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * The GNU General Public License can be found at
17 * http://www.gnu.org/copyleft/gpl.html.
18 * A copy is found in the textfile GPL.txt and important notices to the license
19 * from the author is found in LICENSE.txt distributed with these scripts.
20 *
21 *
22 * This script is distributed in the hope that it will be useful,
23 * but WITHOUT ANY WARRANTY; without even the implied warranty of
24 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25 * GNU General Public License for more details.
26 *
27 * This copyright notice MUST APPEAR in all copies of the script!
28 ***************************************************************/
29 /**
30 * External standard parsers for indexed_search
31 *
32 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
33 * @coauthor Olivier Simah <noname_paris@yahoo.fr>
34 */
/**
 * External standard parsers for indexed_search.
 *
 * Detects which file types can be indexed (depending on the external tools
 * configured for the extension), extracts their textual content and hands it
 * to the parent indexer object for splitting.
 * MUST RETURN utf-8 content!
 *
 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
 * @package TYPO3
 * @subpackage tx_indexedsearch
 */
class FileContentParser {

	/**
	 * PDF splitting mode. This default is overridden from the extension
	 * configuration in initParser().
	 *
	 * - Zero: the whole PDF file is indexed as one part.
	 * - Positive value: upper limit of pages per part; the pages are spread
	 *   evenly over ceil(pages / value) parts.
	 * - Negative value: its absolute value is the number of parts (limited to
	 *   the page count), eg. "-3" on a 10 page document gives 1-3,4-6,7-10.
	 *
	 * @var integer
	 */
	public $pdf_mode = -20;

	/**
	 * Paths of the external tools found by initParser(), keyed by tool name
	 * ("pdfinfo", "pdftotext", "catdoc", "ppthtml", "xlhtml", "unzip", "unrtf").
	 *
	 * @var array
	 */
	public $app = array();

	/**
	 * Maps an initialized file extension to its common item type
	 * (eg. "htm" => "html", "jpg" => "jpeg").
	 *
	 * @var array
	 */
	public $ext2itemtype_map = array();

	/**
	 * File extensions that were successfully initialized (extension => TRUE).
	 *
	 * @var array
	 */
	public $supportedExtensions = array();

	/**
	 * Reference to parent object (indexer class). Must be set by the caller
	 * before initParser() or readFileContent() are used.
	 *
	 * @var object
	 */
	public $pObj;

	/**
	 * Reference to LANG-Object
	 *
	 * @var object
	 */
	protected $langObject;

	/**
	 * Constructs this external parsers object
	 */
	public function __construct() {
		// Set the language object to be used accordant to current TYPO3_MODE:
		$this->langObject = TYPO3_MODE === 'FE' ? $GLOBALS['TSFE'] : $GLOBALS['LANG'];
	}

	/**
	 * Initialize external parser for parsing content.
	 *
	 * On success the extension is recorded in $supportedExtensions and
	 * $ext2itemtype_map; reasons for failure are written to the parent
	 * object's TS log.
	 *
	 * @param string $extension File extension
	 * @return boolean Returns TRUE if extension is supported/enabled, otherwise FALSE.
	 */
	public function initParser($extension) {
		$indexerConfig = $this->getIndexerConfiguration();
		if ($this->isIgnoredExtension($indexerConfig, $extension)) {
			$this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/locallang.xml:ignoreExtensions'), $extension), 1);
			return FALSE;
		}
		$extOK = FALSE;
		$mainExtension = '';
		// Switch on file extension:
		switch ((string) $extension) {
			case 'pdf':
				$extOK = $this->registerExternalTools($indexerConfig, 'pdftools', array('pdftotext', 'pdfinfo'), 'pdfTools');
				if ($extOK) {
					$configuredMode = isset($indexerConfig['pdf_mode']) ? $indexerConfig['pdf_mode'] : 0;
					$this->pdf_mode = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($configuredMode, -100, 100);
				}
				break;
			case 'doc':
				$extOK = $this->registerExternalTools($indexerConfig, 'catdoc', array('catdoc'), 'catdoc');
				break;
			case 'pps':
			case 'ppt':
				// MS PowerPoint
				$extOK = $this->registerExternalTools($indexerConfig, 'ppthtml', array('ppthtml'), 'ppthtml');
				break;
			case 'xls':
				// MS Excel
				$extOK = $this->registerExternalTools($indexerConfig, 'xlhtml', array('xlhtml'), 'xlhtml');
				break;
			case 'sxc':
			case 'sxi':
			case 'sxw':
			case 'ods':
			case 'odp':
			case 'odt':
				// OpenOffice.org / OpenDocument files are zip archives
				$extOK = $this->registerExternalTools($indexerConfig, 'unzip', array('unzip'), 'unzip');
				break;
			case 'rtf':
				$extOK = $this->registerExternalTools($indexerConfig, 'unrtf', array('unrtf'), 'unrtf');
				break;
			case 'txt':
			case 'csv':
			case 'xml':
			case 'tif':
				// Handled with plain PHP, no external tool required
				$extOK = TRUE;
				break;
			case 'html':
			case 'htm':
				$extOK = TRUE;
				// making "html" the common "item_type"
				$mainExtension = 'html';
				break;
			case 'jpg':
			case 'jpeg':
				$extOK = TRUE;
				// making "jpeg" the common item_type
				$mainExtension = 'jpeg';
				break;
		}
		if (!$extOK) {
			return FALSE;
		}
		$this->supportedExtensions[$extension] = TRUE;
		$this->ext2itemtype_map[$extension] = $mainExtension ? $mainExtension : $extension;
		return TRUE;
	}

	/**
	 * Initialize external parser for backend modules
	 * Doesn't evaluate if parser is configured right - more like returning POSSIBLE supported extensions (for showing icons etc) in backend and frontend plugin
	 *
	 * @param string $extension File extension to initialize for.
	 * @return boolean Returns TRUE if the extension is one this class knows how to handle, otherwise FALSE.
	 */
	public function softInit($extension) {
		$knownExtensions = array(
			'pdf', 'doc', 'pps', 'ppt', 'xls',
			'sxc', 'sxi', 'sxw', 'ods', 'odp', 'odt',
			'rtf', 'txt', 'html', 'htm', 'csv', 'xml',
			'jpg', 'jpeg', 'tif'
		);
		return in_array((string) $extension, $knownExtensions, TRUE);
	}

	/**
	 * Return title of entry in media type selector box.
	 *
	 * @param string $extension File extension
	 * @return string|boolean Label of the entry in the media type search selector box (frontend plugin); FALSE if the extension is ignored, its tool is not configured, or it has no entry of its own.
	 */
	public function searchTypeMediaTitle($extension) {
		$indexerConfig = $this->getIndexerConfiguration();
		if ($this->isIgnoredExtension($indexerConfig, $extension)) {
			return FALSE;
		}
		$extension = (string) $extension;
		// Types that need an external tool: extension => array(config key, label suffix)
		$toolBackedTypes = array(
			'pdf' => array('pdftools', 'PDF'),
			'doc' => array('catdoc', 'DOC'),
			'pps' => array('ppthtml', 'PP'),
			'ppt' => array('ppthtml', 'PP'),
			'xls' => array('xlhtml', 'XLS'),
			'sxc' => array('unzip', 'SXC'),
			'sxi' => array('unzip', 'SXI'),
			'sxw' => array('unzip', 'SXW'),
			'ods' => array('unzip', 'ODS'),
			'odp' => array('unzip', 'ODP'),
			'odt' => array('unzip', 'ODT'),
			'rtf' => array('unrtf', 'RTF')
		);
		// Types handled by PHP itself: extension => label suffix.
		// "htm" and "jpg" are deliberately absent, they get no entry of their own.
		$builtInTypes = array(
			'jpeg' => 'Images',
			'tif' => 'Images',
			'html' => 'HTML',
			'txt' => 'TXT',
			'csv' => 'CSV',
			'xml' => 'XML'
		);
		$labelSuffix = '';
		if (isset($toolBackedTypes[$extension])) {
			list($configKey, $suffix) = $toolBackedTypes[$extension];
			if (!empty($indexerConfig[$configKey])) {
				$labelSuffix = $suffix;
			}
		} elseif (isset($builtInTypes[$extension])) {
			$labelSuffix = $builtInTypes[$extension];
		}
		if ($labelSuffix === '') {
			return FALSE;
		}
		return sprintf($this->sL('LLL:EXT:indexed_search/locallang.xml:extension.' . $labelSuffix), $extension);
	}

	/**
	 * Returns TRUE if the input extension (item_type) is a potentially a multi-page extension
	 *
	 * @param string $extension Extension / item_type string
	 * @return boolean TRUE if multi-page (currently only "pdf"), otherwise FALSE
	 */
	public function isMultiplePageExtension($extension) {
		return (string) $extension === 'pdf';
	}

	/**
	 * Wraps the "splitLabel function" of the language object.
	 *
	 * @param string $reference Reference/key of the label
	 * @param boolean $useHtmlSpecialChar Convert special chars to HTML entities (default: FALSE)
	 * @return string The label of the reference/key to be fetched
	 */
	protected function sL($reference, $useHtmlSpecialChar = FALSE) {
		return $this->langObject->sL($reference, $useHtmlSpecialChar);
	}

	/**
	 * Reads the indexed_search extension configuration.
	 *
	 * The serialized string comes from the installation's own configuration
	 * ($GLOBALS['TYPO3_CONF_VARS']), not from request data.
	 *
	 * @return array Configuration array; empty if nothing usable is configured
	 */
	protected function getIndexerConfiguration() {
		if (!isset($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search'])) {
			return array();
		}
		$serialized = $GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search'];
		$config = is_string($serialized) && $serialized !== '' ? unserialize($serialized) : FALSE;
		return is_array($config) ? $config : array();
	}

	/**
	 * Tells whether an extension is on the configured "ignoreExtensions" list.
	 *
	 * @param array $indexerConfig Extension configuration of indexed_search
	 * @param string $extension File extension
	 * @return boolean TRUE if the extension must not be indexed
	 */
	protected function isIgnoredExtension(array $indexerConfig, $extension) {
		$list = isset($indexerConfig['ignoreExtensions']) ? strtolower($indexerConfig['ignoreExtensions']) : '';
		$ignoreExtensions = \TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode(',', $list, TRUE);
		// Strict comparison, so numeric looking strings are not matched by value
		return in_array((string) $extension, $ignoreExtensions, TRUE);
	}

	/**
	 * Looks up the external tool(s) needed for a file type and registers
	 * them in $this->app. Nothing is registered unless all tools are found.
	 * Failures are written to the parent object's TS log.
	 *
	 * @param array $indexerConfig Extension configuration of indexed_search
	 * @param string $configKey Key in $indexerConfig holding the directory of the tool(s)
	 * @param array $binaries Names of the executables (without ".exe") that must all exist in that directory
	 * @param string $labelPrefix Prefix of the locallang labels "<prefix>NotFound" and "<prefix>Disabled"
	 * @return boolean TRUE if all tools were found and registered, otherwise FALSE
	 */
	protected function registerExternalTools(array $indexerConfig, $configKey, array $binaries, $labelPrefix) {
		if (empty($indexerConfig[$configKey])) {
			$this->pObj->log_setTSlogMessage($this->sL('LLL:EXT:indexed_search/locallang.xml:' . $labelPrefix . 'Disabled'), 1);
			return FALSE;
		}
		// If windows, apply extension to tool name:
		$exe = TYPO3_OS == 'WIN' ? '.exe' : '';
		$toolPath = rtrim($indexerConfig[$configKey], '/') . '/';
		$found = array();
		foreach ($binaries as $binary) {
			$file = $toolPath . $binary . $exe;
			if (!@is_file($file)) {
				$this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/locallang.xml:' . $labelPrefix . 'NotFound'), $toolPath), 3);
				return FALSE;
			}
			$found[$binary] = $file;
		}
		foreach ($found as $binary => $file) {
			$this->app[$binary] = $file;
		}
		return TRUE;
	}

	/************************
	 *
	 * Reading documents (for parsing)
	 *
	 ************************/
	/**
	 * Reads the content of an external file being indexed.
	 *
	 * @param string $ext File extension, eg. "pdf", "doc" etc.
	 * @param string $absFile Absolute filename of file (must exist and be validated OK before calling function)
	 * @param string $cPKey Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be splitted.)
	 * @return array|boolean|NULL Standard content array (title, description, keywords, body keys); FALSE if the extension was not initialized or is unknown; NULL if the required tool is missing or no content could be extracted
	 */
	public function readFileContent($ext, $absFile, $cPKey) {
		// Return immediately if initialization didn't set support up:
		if (empty($this->supportedExtensions[$ext])) {
			return FALSE;
		}
		$contentArr = NULL;
		// Switch by file extension
		switch ((string) $ext) {
			case 'pdf':
				$contentArr = $this->readPdfContent($absFile, $cPKey);
				break;
			case 'doc':
				if (!empty($this->app['catdoc'])) {
					$content = $this->captureCommandOutput($this->app['catdoc'] . ' -d utf-8 ' . escapeshellarg($absFile));
					$contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
				}
				break;
			case 'pps':
			case 'ppt':
				if (!empty($this->app['ppthtml'])) {
					$contentArr = $this->readHtmlToolContent($this->app['ppthtml'] . ' ' . escapeshellarg($absFile), $absFile);
				}
				break;
			case 'xls':
				if (!empty($this->app['xlhtml'])) {
					$contentArr = $this->readHtmlToolContent($this->app['xlhtml'] . ' -nc -te ' . escapeshellarg($absFile), $absFile);
				}
				break;
			case 'sxi':
			case 'sxc':
			case 'sxw':
			case 'ods':
			case 'odp':
			case 'odt':
				if (!empty($this->app['unzip'])) {
					$contentArr = $this->readOpenDocumentContent($absFile);
				}
				break;
			case 'rtf':
				if (!empty($this->app['unrtf'])) {
					$fileContent = $this->captureCommandOutput($this->app['unrtf'] . ' ' . escapeshellarg($absFile));
					$fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
					$contentArr = $this->pObj->splitHTMLContent($fileContent);
				}
				break;
			case 'txt':
			case 'csv':
				// Raw text
				$content = \TYPO3\CMS\Core\Utility\GeneralUtility::getUrl($absFile);
				// TODO: Implement auto detection of charset (currently assuming utf-8)
				$contentCharset = 'utf-8';
				$content = $this->pObj->convertHTMLToUtf8($content, $contentCharset);
				$contentArr = $this->pObj->splitRegularContent($content);
				// Make sure the title doesn't expose the absolute path!
				$contentArr['title'] = basename($absFile);
				break;
			case 'html':
			case 'htm':
				$fileContent = \TYPO3\CMS\Core\Utility\GeneralUtility::getUrl($absFile);
				$fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
				$contentArr = $this->pObj->splitHTMLContent($fileContent);
				break;
			case 'xml':
				$fileContent = \TYPO3\CMS\Core\Utility\GeneralUtility::getUrl($absFile);
				// Finding charset in the XML declaration (only the first 200 bytes are inspected):
				$reg = array();
				preg_match('/^[[:space:]]*<\\?xml[^>]+encoding[[:space:]]*=[[:space:]]*["\'][[:space:]]*([[:alnum:]_-]+)[[:space:]]*["\']/i', substr($fileContent, 0, 200), $reg);
				$charset = !empty($reg[1]) ? $this->pObj->csObj->parse_charset($reg[1]) : 'utf-8';
				// Converting content; a space is put in front of every tag so words do not merge when tags are stripped:
				$fileContent = $this->pObj->convertHTMLToUtf8(strip_tags(str_replace('<', ' <', $fileContent)), $charset);
				$contentArr = $this->pObj->splitRegularContent($fileContent);
				// Make sure the title doesn't expose the absolute path!
				$contentArr['title'] = basename($absFile);
				break;
			case 'jpg':
			case 'jpeg':
			case 'tif':
				$contentArr = $this->readImageContent($absFile);
				break;
			default:
				return FALSE;
		}
		// If no title (and why should there be...) then the file-name is set as title. This will raise the hits considerably if the search matches the document name.
		if (is_array($contentArr) && empty($contentArr['title'])) {
			// Substituting "_" for " " because many filenames may have this instead of a space char.
			$contentArr['title'] = str_replace('_', ' ', basename($absFile));
		}
		return $contentArr;
	}

	/**
	 * Extracts the text of a page range of a PDF file with pdftotext.
	 *
	 * @param string $absFile Absolute filename of the PDF file
	 * @param string $cPKey Page range in the form "low-high", as created by fileContentParts()
	 * @return array|NULL Content array, or NULL if the tools are missing, pdfinfo reports no pages or no text was extracted
	 */
	protected function readPdfContent($absFile, $cPKey) {
		if (empty($this->app['pdfinfo']) || empty($this->app['pdftotext'])) {
			return NULL;
		}
		$pdfInfo = $this->readPdfInfo($absFile);
		if (empty($pdfInfo['pages']) || !intval($pdfInfo['pages'])) {
			return NULL;
		}
		// The bounds end up in a shell command, so they are forced to integers.
		// A missing bound becomes 0 rather than an empty argument, which would
		// make the following option be consumed as the page number.
		$range = explode('-', (string) $cPKey, 2);
		$low = intval($range[0]);
		$high = isset($range[1]) ? intval($range[1]) : 0;
		// Create temporary name and delete the file if it exists, just to be safe.
		$tempFileName = \TYPO3\CMS\Core\Utility\GeneralUtility::tempnam('Typo3_indexer');
		@unlink($tempFileName);
		$cmd = $this->app['pdftotext'] . ' -f ' . $low . ' -l ' . $high . ' -enc UTF-8 -q ' . escapeshellarg($absFile) . ' ' . escapeshellarg($tempFileName);
		\TYPO3\CMS\Core\Utility\CommandUtility::exec($cmd);
		$content = '';
		if (@is_file($tempFileName)) {
			$content = \TYPO3\CMS\Core\Utility\GeneralUtility::getUrl($tempFileName);
			unlink($tempFileName);
		} else {
			$this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/locallang.xml:pdfToolsFailed'), $absFile), 2);
		}
		if (!is_string($content) || $content === '') {
			return NULL;
		}
		return $this->pObj->splitRegularContent($this->removeEndJunk($content));
	}

	/**
	 * Runs a converter that prints HTML (ppthtml, xlhtml) and turns its
	 * output into a content array titled with the file name.
	 *
	 * @param string $cmd Complete shell command, arguments already escaped
	 * @param string $absFile Absolute filename of the converted file
	 * @return array Content array
	 */
	protected function readHtmlToolContent($cmd, $absFile) {
		$content = $this->pObj->convertHTMLToUtf8($this->captureCommandOutput($cmd));
		$contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
		$contentArr['title'] = basename($absFile);
		return $contentArr;
	}

	/**
	 * Reads content.xml and meta.xml out of an OpenOffice.org/OpenDocument
	 * archive and builds the content array from them.
	 *
	 * @param string $absFile Absolute filename of the document
	 * @return array Content array
	 */
	protected function readOpenDocumentContent($absFile) {
		$unzipCommand = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile);
		$content_xml = $this->captureCommandOutput($unzipCommand . ' content.xml');
		$meta_xml = $this->captureCommandOutput($unzipCommand . ' meta.xml');
		// A space is put in front of every tag so words do not merge when tags are stripped
		$utf8_content = trim(strip_tags(str_replace('<', ' <', $content_xml)));
		$contentArr = $this->pObj->splitRegularContent($utf8_content);
		// Make sure the title doesn't expose the absolute path!
		$contentArr['title'] = basename($absFile);
		// Meta information. The result is checked before use since the expected
		// structure is not guaranteed (eg. when meta.xml could not be read).
		$metaTree = \TYPO3\CMS\Core\Utility\GeneralUtility::xml2tree($meta_xml);
		if (!is_array($metaTree) || !isset($metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'])) {
			return $contentArr;
		}
		$metaContent = $metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'];
		if (!is_array($metaContent)) {
			return $contentArr;
		}
		$metaTitle = $this->firstMetaValue($metaContent, 'dc:title');
		if ($metaTitle) {
			$contentArr['title'] = $metaTitle;
		}
		$contentArr['description'] = $this->firstMetaValue($metaContent, 'dc:subject') . ' ' . $this->firstMetaValue($metaContent, 'dc:description');
		// Keywords collected:
		if (isset($metaContent['meta:keywords'][0]['ch']['meta:keyword']) && is_array($metaContent['meta:keywords'][0]['ch']['meta:keyword'])) {
			if (!isset($contentArr['keywords'])) {
				$contentArr['keywords'] = '';
			}
			foreach ($metaContent['meta:keywords'][0]['ch']['meta:keyword'] as $kwDat) {
				$contentArr['keywords'] .= (isset($kwDat['values'][0]) ? $kwDat['values'][0] : '') . ' ';
			}
		}
		return $contentArr;
	}

	/**
	 * Returns the first value of a tag in the parsed meta.xml structure.
	 *
	 * @param array $metaContent Children of the "office:meta" element
	 * @param string $tag Tag name, eg. "dc:title"
	 * @return string The value, or an empty string if the tag is not present
	 */
	protected function firstMetaValue(array $metaContent, $tag) {
		return isset($metaContent[$tag][0]['values'][0]) ? $metaContent[$tag][0]['values'][0] : '';
	}

	/**
	 * Builds the content array of an image from its EXIF comment and
	 * description; the body is empty if the EXIF functions are unavailable
	 * or no EXIF data could be read.
	 *
	 * @param string $absFile Absolute filename of the image
	 * @return array Content array
	 */
	protected function readImageContent($absFile) {
		$exif = function_exists('exif_read_data') ? exif_read_data($absFile, 'IFD0') : FALSE;
		$comment = '';
		if ($exif) {
			$exifComment = isset($exif['COMMENT'][0]) ? $exif['COMMENT'][0] : '';
			$exifDescription = isset($exif['ImageDescription']) ? $exif['ImageDescription'] : '';
			$comment = trim($exifComment . ' ' . $exifDescription);
		}
		$contentArr = $this->pObj->splitRegularContent($comment);
		// Make sure the title doesn't expose the absolute path!
		$contentArr['title'] = basename($absFile);
		return $contentArr;
	}

	/**
	 * Executes a shell command and returns its output as one string.
	 *
	 * @param string $cmd Complete shell command, arguments already escaped
	 * @return string Output lines joined with LF
	 */
	protected function captureCommandOutput($cmd) {
		$lines = array();
		\TYPO3\CMS\Core\Utility\CommandUtility::exec($cmd, $lines);
		return implode(LF, is_array($lines) ? $lines : array());
	}

	/**
	 * Runs pdfinfo on a file and returns the parsed result.
	 *
	 * @param string $absFile Absolute filename of the PDF file
	 * @return array Lowercased pdfinfo keys => values, see splitPdfInfo()
	 */
	protected function readPdfInfo($absFile) {
		$lines = array();
		\TYPO3\CMS\Core\Utility\CommandUtility::exec($this->app['pdfinfo'] . ' ' . escapeshellarg($absFile), $lines);
		return $this->splitPdfInfo($lines);
	}

	/**
	 * Creates an array with pointers to divisions of document.
	 * ONLY for PDF files at this point. All other types will have an array with a single element with the value "0" (zero) coming back.
	 *
	 * @param string $ext File extension
	 * @param string $absFile Absolute filename (must exist and be validated OK before calling function)
	 * @return array Array of pointers to sections that the document should be divided into ("low-high" page ranges for PDF)
	 */
	public function fileContentParts($ext, $absFile) {
		$cParts = array(0);
		// Without a registered pdfinfo tool there is nothing to execute
		if ((string) $ext !== 'pdf' || empty($this->app['pdfinfo'])) {
			return $cParts;
		}
		$pdfInfo = $this->readPdfInfo($absFile);
		$pageCount = isset($pdfInfo['pages']) ? intval($pdfInfo['pages']) : 0;
		if ($pageCount < 1) {
			return $cParts;
		}
		// Calculate the number of parts from the mode
		if ($this->pdf_mode > 0) {
			$partCount = (int) ceil($pageCount / $this->pdf_mode);
		} else {
			$partCount = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange(abs($this->pdf_mode), 1, $pageCount);
		}
		// Traverse and create intervals. The multiplication is done before the
		// division so that boundaries which are whole numbers (notably the last
		// page) are not pushed below their value by floating point rounding.
		$cParts = array();
		for ($a = 0; $a < $partCount; $a++) {
			$low = (int) floor($a * $pageCount / $partCount) + 1;
			$high = (int) floor(($a + 1) * $pageCount / $partCount);
			$cParts[] = $low . '-' . $high;
		}
		return $cParts;
	}

	/**
	 * Analysing PDF info into a useable format.
	 *
	 * Every "Key: value" line becomes an entry with the lowercased, trimmed
	 * key; lines without a colon or without a key are skipped. Kept public
	 * for backwards compatibility although it is meant for internal use.
	 *
	 * @param array $pdfInfoArray Array of output lines coming from the pdfinfo tool
	 * @return array Result array
	 * @see fileContentParts()
	 */
	public function splitPdfInfo($pdfInfoArray) {
		$info = array();
		if (!is_array($pdfInfoArray)) {
			return $info;
		}
		foreach ($pdfInfoArray as $line) {
			// Split at the first colon only, values may contain colons themselves
			$parts = explode(':', $line, 2);
			if (count($parts) < 2) {
				continue;
			}
			$key = trim($parts[0]);
			if ($key) {
				$info[strtolower($key)] = trim($parts[1]);
			}
		}
		return $info;
	}

	/**
	 * Removes some strange char(12) characters and line breaks that tend to occur in the end of the string from external files.
	 *
	 * @param string $string String to clean up
	 * @return string Cleaned and trimmed string
	 */
	public function removeEndJunk($string) {
		$trailingJunkPattern = '/[' . LF . chr(12) . ']*$/';
		return trim(preg_replace($trailingJunkPattern, '', $string));
	}

	/************************
	 *
	 * Backend analyzer
	 *
	 ************************/
	/**
	 * Return icon for file extension
	 *
	 * @param string $extension File extension, lowercase.
	 * @return string Relative file reference ("EXT:..." syntax)
	 */
	public function getIcon($extension) {
		// "htm" and "jpeg" share the icon of "html" and "jpg"
		if ($extension == 'htm') {
			$iconName = 'html';
		} elseif ($extension == 'jpeg') {
			$iconName = 'jpg';
		} else {
			$iconName = $extension;
		}
		return 'EXT:indexed_search/pi/res/' . $iconName . '.gif';
	}

}
735
736
737 ?>