8a3163c4c1b9d18141ded59eea6e0851d48ee587
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / Classes / FileContentParser.php
1 <?php
2 namespace TYPO3\CMS\IndexedSearch;
3
4 /***************************************************************
5 * Copyright notice
6 *
7 * (c) 2001-2013 Kasper Skårhøj (kasperYYYY@typo3.com)
8 * All rights reserved
9 *
10 * This script is part of the TYPO3 project. The TYPO3 project is
11 * free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * The GNU General Public License can be found at
17 * http://www.gnu.org/copyleft/gpl.html.
18 * A copy is found in the text file GPL.txt and important notices to the license
19 * from the author is found in LICENSE.txt distributed with these scripts.
20 *
21 *
22 * This script is distributed in the hope that it will be useful,
23 * but WITHOUT ANY WARRANTY; without even the implied warranty of
24 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25 * GNU General Public License for more details.
26 *
27 * This copyright notice MUST APPEAR in all copies of the script!
28 ***************************************************************/
29 /**
30 * External standard parsers for indexed_search
31 *
32 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
33 * @coauthor Olivier Simah <noname_paris@yahoo.fr>
34 */
35 /**
36 * External standard parsers for indexed_search
37 * MUST RETURN utf-8 content!
38 *
39 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
40 */
41 class FileContentParser {
42
43 // This value is also overridden from config.
44 /**
45 * @todo Define visibility
46 */
47 public $pdf_mode = -20;
48
	// Meaning of $pdf_mode: zero means the whole PDF file is indexed in one. A positive value indicates the number of pages at a time, e.g. "5" would mean 1-5,6-10,... A negative integer indicates (by its absolute value) the number of groups, e.g. "-3" for a 10-page file gives 1-3,4-6,7-10.
50 // This array is configured in initialization:
51 /**
52 * @todo Define visibility
53 */
54 public $app = array();
55
56 /**
57 * @todo Define visibility
58 */
59 public $ext2itemtype_map = array();
60
61 /**
62 * @todo Define visibility
63 */
64 public $supportedExtensions = array();
65
66 /**
67 * @todo Define visibility
68 */
69 public $pObj;
70
71 // Reference to parent object (indexer class)
72 protected $langObject;
73
74 // Reference to LANG-Object
/**
 * Creates the parser and picks the object used for label lookups.
 *
 * In frontend context (TYPO3_MODE is 'FE') $GLOBALS['TSFE'] is used,
 * in every other context $GLOBALS['LANG'].
 */
public function __construct() {
	if (TYPO3_MODE == 'FE') {
		$languageProvider = $GLOBALS['TSFE'];
	} else {
		$languageProvider = $GLOBALS['LANG'];
	}
	$this->langObject = $languageProvider;
}
82
/**
 * Initialize external parser for parsing content.
 *
 * Reads the serialized indexed_search extension configuration, rejects
 * extensions listed in "ignoreExtensions", and for extensions that need an
 * external tool verifies that the tool is configured and present on disk.
 * On success the extension is registered in $this->supportedExtensions and
 * $this->ext2itemtype_map ("htm" maps to "html", "jpg" maps to "jpeg").
 *
 * @param string $extension File extension
 * @return boolean Returns TRUE if extension is supported/enabled, otherwise FALSE.
 * @todo Define visibility
 */
public function initParser($extension) {
	// Read indexer-config; fall back to an empty configuration if it is missing or broken.
	$indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
	if (!is_array($indexerConfig)) {
		$indexerConfig = array();
	}
	// Make sure every key read below exists (array union keeps configured values).
	$indexerConfig += array(
		'ignoreExtensions' => '',
		'pdftools' => '',
		'pdf_mode' => NULL,
		'catdoc' => '',
		'ppthtml' => '',
		'xlhtml' => '',
		'unzip' => '',
		'unrtf' => ''
	);
	$extOK = FALSE;
	$mainExtension = '';
	// Ignore extensions
	$ignoreExtensions = \TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode(',', strtolower($indexerConfig['ignoreExtensions']), TRUE);
	if (in_array((string) $extension, $ignoreExtensions, TRUE)) {
		$this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/locallang.xlf:ignoreExtensions'), $extension), 1);
		return FALSE;
	}
	// Switch on file extension:
	switch ($extension) {
		case 'pdf':
			// PDF: needs both pdftotext and pdfinfo
			$extOK = $this->registerExternalTools($indexerConfig['pdftools'], array('pdftotext', 'pdfinfo'), 'pdfTools');
			if ($extOK) {
				// PDF mode:
				$this->pdf_mode = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($indexerConfig['pdf_mode'], -100, 100);
			}
			break;
		case 'doc':
			// Catdoc
			$extOK = $this->registerExternalTools($indexerConfig['catdoc'], array('catdoc'), 'catdoc');
			break;
		case 'pps':
		case 'ppt':
			// MS PowerPoint: ppthtml
			$extOK = $this->registerExternalTools($indexerConfig['ppthtml'], array('ppthtml'), 'ppthtml');
			break;
		case 'xls':
			// MS Excel: xlhtml
			$extOK = $this->registerExternalTools($indexerConfig['xlhtml'], array('xlhtml'), 'xlhtml');
			break;
		case 'sxc':
		case 'sxi':
		case 'sxw':
		case 'ods':
		case 'odp':
		case 'odt':
			// OpenOffice.org / OpenDocument files are read through unzip
			$extOK = $this->registerExternalTools($indexerConfig['unzip'], array('unzip'), 'unzip');
			break;
		case 'rtf':
			// unrtf
			$extOK = $this->registerExternalTools($indexerConfig['unrtf'], array('unrtf'), 'unrtf');
			break;
		case 'txt':
		case 'csv':
		case 'xml':
		case 'tif':
			// Handled with PHP only, no external tool required
			$extOK = TRUE;
			break;
		case 'html':
		case 'htm':
			// PHP strip-tags()
			$extOK = TRUE;
			// making "html" the common "item_type"
			$mainExtension = 'html';
			break;
		case 'jpg':
		case 'jpeg':
			// PHP EXIF
			$extOK = TRUE;
			// making "jpeg" the common item_type
			$mainExtension = 'jpeg';
			break;
	}
	if (!$extOK) {
		// Explicit FALSE, as documented (previously the method fell off the end and returned NULL).
		return FALSE;
	}
	$this->supportedExtensions[$extension] = TRUE;
	$this->ext2itemtype_map[$extension] = $mainExtension ? $mainExtension : $extension;
	return TRUE;
}

/**
 * Verifies that a set of external tools exists in the configured directory
 * and registers their full paths in $this->app (keyed by tool name).
 *
 * Nothing is registered unless all tools are found. A log message is written
 * through the parent object when the tool directory is not configured
 * ("<labelPrefix>Disabled") or a binary is missing ("<labelPrefix>NotFound").
 *
 * @param string $toolDirectory Configured directory of the tool(s); empty means "disabled"
 * @param array $toolNames Names of the executables (without ".exe") that must all exist
 * @param string $labelPrefix Prefix of the locallang keys used for the log messages
 * @return boolean TRUE if all tools were found and registered, otherwise FALSE
 */
protected function registerExternalTools($toolDirectory, array $toolNames, $labelPrefix) {
	$labelBase = 'LLL:EXT:indexed_search/locallang.xlf:' . $labelPrefix;
	if (!$toolDirectory) {
		$this->pObj->log_setTSlogMessage($this->sL($labelBase . 'Disabled'), 1);
		return FALSE;
	}
	// If windows, apply extension to tool name:
	$exe = TYPO3_OS == 'WIN' ? '.exe' : '';
	$toolDirectory = rtrim($toolDirectory, '/') . '/';
	$executables = array();
	foreach ($toolNames as $toolName) {
		$executable = $toolDirectory . $toolName . $exe;
		if (!@is_file($executable)) {
			$this->pObj->log_setTSlogMessage(sprintf($this->sL($labelBase . 'NotFound'), $toolDirectory), 3);
			return FALSE;
		}
		$executables[$toolName] = $executable;
	}
	// Only register once every required executable has been found.
	foreach ($executables as $toolName => $executable) {
		$this->app[$toolName] = $executable;
	}
	return TRUE;
}
241
/**
 * Initialize external parser for backend modules
 * Doesn't evaluate if parser is configured right - more like returning POSSIBLE supported extensions (for showing icons etc) in backend and frontend plugin
 *
 * @param string $extension File extension to initialize for.
 * @return boolean Returns TRUE if the extension is one of the potentially supported extensions, otherwise FALSE.
 * @todo Define visibility
 */
public function softInit($extension) {
	$possibleExtensions = array(
		'pdf', 'doc', 'pps', 'ppt', 'xls',
		'sxc', 'sxi', 'sxw', 'ods', 'odp', 'odt',
		'rtf', 'txt', 'html', 'htm', 'csv', 'xml',
		'jpg', 'jpeg', 'tif'
	);
	// Strict comparison on the string value; always returns a real boolean
	// (previously unknown extensions yielded NULL).
	return in_array((string) $extension, $possibleExtensions, TRUE);
}
296
/**
 * Return title of entry in media type selector box.
 *
 * The title is only returned if the extension is not listed in the
 * "ignoreExtensions" configuration and, for extensions that need an external
 * tool, that tool is configured in the indexed_search extension configuration.
 *
 * @param string $extension File extension
 * @return string|boolean Label of the entry in the media type search selector box (frontend plugin), or FALSE if the extension is ignored, unknown or its tool is not configured.
 * @todo Define visibility
 */
public function searchTypeMediaTitle($extension) {
	// Read indexer-config
	$indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
	if (!is_array($indexerConfig)) {
		$indexerConfig = array();
	}
	$extension = (string) $extension;
	// Ignore extensions
	$ignoreList = isset($indexerConfig['ignoreExtensions']) ? $indexerConfig['ignoreExtensions'] : '';
	$ignoreExtensions = \TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode(',', strtolower($ignoreList), TRUE);
	if (in_array($extension, $ignoreExtensions, TRUE)) {
		return FALSE;
	}
	// Extension => array(config key of the required tool or NULL, suffix of the label key)
	$mediaTypes = array(
		'pdf' => array('pdftools', 'PDF'),
		'doc' => array('catdoc', 'DOC'),
		'pps' => array('ppthtml', 'PP'),
		'ppt' => array('ppthtml', 'PP'),
		'xls' => array('xlhtml', 'XLS'),
		'sxc' => array('unzip', 'SXC'),
		'sxi' => array('unzip', 'SXI'),
		'sxw' => array('unzip', 'SXW'),
		'ods' => array('unzip', 'ODS'),
		'odp' => array('unzip', 'ODP'),
		'odt' => array('unzip', 'ODT'),
		'rtf' => array('unrtf', 'RTF'),
		'jpeg' => array(NULL, 'Images'),
		'jpg' => array(NULL, 'Images'),
		'tif' => array(NULL, 'Images'),
		'html' => array(NULL, 'HTML'),
		'htm' => array(NULL, 'HTML'),
		'txt' => array(NULL, 'TXT'),
		'csv' => array(NULL, 'CSV'),
		'xml' => array(NULL, 'XML')
	);
	if (!isset($mediaTypes[$extension])) {
		return FALSE;
	}
	list($requiredTool, $labelSuffix) = $mediaTypes[$extension];
	// Tool-dependent types are only offered when the tool path is configured.
	if ($requiredTool !== NULL && empty($indexerConfig[$requiredTool])) {
		return FALSE;
	}
	return sprintf($this->sL('LLL:EXT:indexed_search/locallang.xlf:extension.' . $labelSuffix), $extension);
}
414
/**
 * Returns TRUE if the input extension (item_type) is potentially a multi-page extension.
 * Currently only "pdf" qualifies.
 *
 * @param string $extension Extension / item_type string
 * @return boolean TRUE if multi-page, otherwise FALSE
 * @todo Define visibility
 */
public function isMultiplePageExtension($extension) {
	// Always return a real boolean (previously non-PDF extensions yielded NULL).
	return (string) $extension === 'pdf';
}
430
/**
 * Resolves a label by delegating to the sL() method of the language object
 * selected in the constructor.
 *
 * @param string $reference Reference/key of the label
 * @param boolean $useHtmlSpecialChar Passed through to the language object (default: FALSE)
 * @return string The label as returned by the language object
 */
protected function sL($reference, $useHtmlSpecialChar = FALSE) {
	$label = $this->langObject->sL($reference, $useHtmlSpecialChar);
	return $label;
}
441
442 /************************
443 *
444 * Reading documents (for parsing)
445 *
446 ************************/
/**
 * Reads the content of an external file being indexed.
 *
 * Depending on the extension the content is obtained through an external tool
 * registered in $this->app or through PHP functions, and then split into the
 * standard content array by the parent object.
 *
 * @param string $ext File extension, eg. "pdf", "doc" etc.
 * @param string $absFile Absolute filename of file (must exist and be validated OK before calling function)
 * @param string $cPKey Pointer to section (zero for all other than PDF, where it is a page range "low-high" as produced by fileContentParts())
 * @return array|boolean|NULL Standard content array (title, description, keywords, body keys); FALSE if the extension is not supported; NULL if no content could be read
 * @todo Define visibility
 */
public function readFileContent($ext, $absFile, $cPKey) {
	$contentArr = NULL;
	// Return immediately if initialization didn't set support up:
	if (empty($this->supportedExtensions[$ext])) {
		return FALSE;
	}
	// Switch by file extension
	switch ($ext) {
		case 'pdf':
			if (!empty($this->app['pdfinfo'])) {
				// Getting pdf-info:
				$cmd = $this->app['pdfinfo'] . ' ' . escapeshellarg($absFile);
				\TYPO3\CMS\Core\Utility\CommandUtility::exec($cmd, $res);
				$pdfInfo = $this->splitPdfInfo($res);
				unset($res);
				if (isset($pdfInfo['pages']) && intval($pdfInfo['pages'])) {
					// The page bounds end up in a shell command, so force them to integers.
					// A key without "-" (e.g. "0") yields 0 for the upper bound.
					// NOTE(review): pdftotext is expected to treat 0 as "first/last page" - confirm against the installed version.
					$pageRange = explode('-', $cPKey);
					$low = intval($pageRange[0]);
					$high = isset($pageRange[1]) ? intval($pageRange[1]) : 0;
					// Create temporary name
					$tempFileName = \TYPO3\CMS\Core\Utility\GeneralUtility::tempnam('Typo3_indexer');
					// Delete if exists, just to be safe.
					@unlink($tempFileName);
					// Get pdf content:
					$cmd = $this->app['pdftotext'] . ' -f ' . $low . ' -l ' . $high . ' -enc UTF-8 -q ' . escapeshellarg($absFile) . ' ' . escapeshellarg($tempFileName);
					\TYPO3\CMS\Core\Utility\CommandUtility::exec($cmd);
					$content = '';
					if (@is_file($tempFileName)) {
						$content = \TYPO3\CMS\Core\Utility\GeneralUtility::getUrl($tempFileName);
						unlink($tempFileName);
					} else {
						$this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/locallang.xlf:pdfToolsFailed'), $absFile), 2);
					}
					if (strlen($content)) {
						$contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
					}
				}
			}
			break;
		case 'doc':
			if (!empty($this->app['catdoc'])) {
				$cmd = $this->app['catdoc'] . ' -d utf-8 ' . escapeshellarg($absFile);
				\TYPO3\CMS\Core\Utility\CommandUtility::exec($cmd, $res);
				$content = implode(LF, $res);
				unset($res);
				$contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
			}
			break;
		case 'pps':
		case 'ppt':
			if (!empty($this->app['ppthtml'])) {
				$cmd = $this->app['ppthtml'] . ' ' . escapeshellarg($absFile);
				\TYPO3\CMS\Core\Utility\CommandUtility::exec($cmd, $res);
				$content = implode(LF, $res);
				unset($res);
				$content = $this->pObj->convertHTMLToUtf8($content);
				$contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
				$contentArr['title'] = basename($absFile);
			}
			break;
		case 'xls':
			if (!empty($this->app['xlhtml'])) {
				$cmd = $this->app['xlhtml'] . ' -nc -te ' . escapeshellarg($absFile);
				\TYPO3\CMS\Core\Utility\CommandUtility::exec($cmd, $res);
				$content = implode(LF, $res);
				unset($res);
				$content = $this->pObj->convertHTMLToUtf8($content);
				$contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
				$contentArr['title'] = basename($absFile);
			}
			break;
		case 'sxi':
		case 'sxc':
		case 'sxw':
		case 'ods':
		case 'odp':
		case 'odt':
			if (!empty($this->app['unzip'])) {
				// Read content.xml:
				$cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' content.xml';
				\TYPO3\CMS\Core\Utility\CommandUtility::exec($cmd, $res);
				$content_xml = implode(LF, $res);
				unset($res);
				// Read meta.xml:
				$cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' meta.xml';
				\TYPO3\CMS\Core\Utility\CommandUtility::exec($cmd, $res);
				$meta_xml = implode(LF, $res);
				unset($res);
				$utf8_content = trim(strip_tags(str_replace('<', ' <', $content_xml)));
				$contentArr = $this->pObj->splitRegularContent($utf8_content);
				// Make sure the title doesn't expose the absolute path!
				$contentArr['title'] = basename($absFile);
				// Meta information; the parsed tree is only used if it really is an array with the expected path.
				$metaTree = \TYPO3\CMS\Core\Utility\GeneralUtility::xml2tree($meta_xml);
				$metaContent = NULL;
				if (is_array($metaTree) && isset($metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'])) {
					$metaContent = $metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'];
				}
				if (is_array($metaContent)) {
					if (!empty($metaContent['dc:title'][0]['values'][0])) {
						$contentArr['title'] = $metaContent['dc:title'][0]['values'][0];
					}
					$subject = isset($metaContent['dc:subject'][0]['values'][0]) ? $metaContent['dc:subject'][0]['values'][0] : '';
					$description = isset($metaContent['dc:description'][0]['values'][0]) ? $metaContent['dc:description'][0]['values'][0] : '';
					$contentArr['description'] = $subject . ' ' . $description;
					// Keywords collected:
					if (isset($metaContent['meta:keywords'][0]['ch']['meta:keyword']) && is_array($metaContent['meta:keywords'][0]['ch']['meta:keyword'])) {
						if (!isset($contentArr['keywords'])) {
							$contentArr['keywords'] = '';
						}
						foreach ($metaContent['meta:keywords'][0]['ch']['meta:keyword'] as $kwDat) {
							$contentArr['keywords'] .= (isset($kwDat['values'][0]) ? $kwDat['values'][0] : '') . ' ';
						}
					}
				}
			}
			break;
		case 'rtf':
			if (!empty($this->app['unrtf'])) {
				$cmd = $this->app['unrtf'] . ' ' . escapeshellarg($absFile);
				\TYPO3\CMS\Core\Utility\CommandUtility::exec($cmd, $res);
				$fileContent = implode(LF, $res);
				unset($res);
				$fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
				$contentArr = $this->pObj->splitHTMLContent($fileContent);
			}
			break;
		case 'txt':
		case 'csv':
			// Raw text
			$content = \TYPO3\CMS\Core\Utility\GeneralUtility::getUrl($absFile);
			// TODO: Implement auto detection of charset (currently assuming utf-8)
			$contentCharset = 'utf-8';
			$content = $this->pObj->convertHTMLToUtf8($content, $contentCharset);
			$contentArr = $this->pObj->splitRegularContent($content);
			// Make sure the title doesn't expose the absolute path!
			$contentArr['title'] = basename($absFile);
			break;
		case 'html':
		case 'htm':
			$fileContent = \TYPO3\CMS\Core\Utility\GeneralUtility::getUrl($absFile);
			$fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
			$contentArr = $this->pObj->splitHTMLContent($fileContent);
			break;
		case 'xml':
			// PHP strip-tags()
			$fileContent = \TYPO3\CMS\Core\Utility\GeneralUtility::getUrl($absFile);
			// Finding charset from the XML declaration (within the first 200 bytes); default is utf-8:
			$reg = array();
			preg_match('/^[[:space:]]*<\\?xml[^>]+encoding[[:space:]]*=[[:space:]]*["\'][[:space:]]*([[:alnum:]_-]+)[[:space:]]*["\']/i', substr($fileContent, 0, 200), $reg);
			$charset = !empty($reg[1]) ? $this->pObj->csObj->parse_charset($reg[1]) : 'utf-8';
			// Converting content:
			$fileContent = $this->pObj->convertHTMLToUtf8(strip_tags(str_replace('<', ' <', $fileContent)), $charset);
			$contentArr = $this->pObj->splitRegularContent($fileContent);
			// Make sure the title doesn't expose the absolute path!
			$contentArr['title'] = basename($absFile);
			break;
		case 'jpg':
		case 'jpeg':
		case 'tif':
			// PHP EXIF: index the comment and image description, if available
			$exif = function_exists('exif_read_data') ? exif_read_data($absFile, 'IFD0') : FALSE;
			$comment = '';
			if ($exif) {
				$exifComment = isset($exif['COMMENT'][0]) ? $exif['COMMENT'][0] : '';
				$imageDescription = isset($exif['ImageDescription']) ? $exif['ImageDescription'] : '';
				$comment = trim($exifComment . ' ' . $imageDescription);
			}
			$contentArr = $this->pObj->splitRegularContent($comment);
			// Make sure the title doesn't expose the absolute path!
			$contentArr['title'] = basename($absFile);
			break;
		default:
			return FALSE;
	}
	// If no title (and why should there be...) then the file-name is set as title. This will raise the hits considerably if the search matches the document name.
	if (is_array($contentArr) && empty($contentArr['title'])) {
		// Substituting "_" for " " because many filenames may have this instead of a space char.
		$contentArr['title'] = str_replace('_', ' ', basename($absFile));
	}
	return $contentArr;
}
637
/**
 * Creates an array with pointers to divisions of document.
 * ONLY for PDF files at this point. All other types will have an array with a single element with the value "0" (zero) coming back.
 * The same single-element array is returned for a PDF when pdfinfo is not registered or reports no pages.
 *
 * For PDF files the page count is read with pdfinfo and split according to
 * $this->pdf_mode into "low-high" page ranges.
 *
 * @param string $ext File extension
 * @param string $absFile Absolute filename (must exist and be validated OK before calling function)
 * @return array Array of pointers to sections that the document should be divided into
 * @todo Define visibility
 */
public function fileContentParts($ext, $absFile) {
	$cParts = array(0);
	// Without a registered pdfinfo tool there is nothing sensible to execute.
	if ((string) $ext !== 'pdf' || empty($this->app['pdfinfo'])) {
		return $cParts;
	}
	// Getting pdf-info:
	$cmd = $this->app['pdfinfo'] . ' ' . escapeshellarg($absFile);
	\TYPO3\CMS\Core\Utility\CommandUtility::exec($cmd, $res);
	$pdfInfo = $this->splitPdfInfo($res);
	unset($res);
	$pageCount = isset($pdfInfo['pages']) ? intval($pdfInfo['pages']) : 0;
	if ($pageCount < 1) {
		return $cParts;
	}
	// Calculate the number of sections from the mode
	if ($this->pdf_mode > 0) {
		$iter = (int) ceil($pageCount / $this->pdf_mode);
	} else {
		$iter = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange(abs($this->pdf_mode), 1, $pageCount);
	}
	// Traverse and create intervals. Multiplying before dividing keeps the
	// boundaries exact, so the last interval always ends on the last page.
	$cParts = array();
	for ($a = 0; $a < $iter; $a++) {
		$low = (int) floor($a * $pageCount / $iter) + 1;
		$high = (int) floor(($a + 1) * $pageCount / $iter);
		$cParts[] = $low . '-' . $high;
	}
	return $cParts;
}
675
/**
 * Turns the line-wise output of the pdfinfo tool into a key/value array.
 *
 * Each line is split at the first colon; the part before it (trimmed and
 * lowercased) becomes the key, the trimmed remainder the value. Lines without
 * a colon or with an empty key are skipped.
 *
 * @param array $pdfInfoArray Array of PDF content, coming from the pdfinfo tool
 * @return array Result array (empty if the input is not an array)
 * @access private
 * @see fileContentParts()
 * @todo Define visibility
 */
public function splitPdfInfo($pdfInfoArray) {
	$info = array();
	if (!is_array($pdfInfoArray)) {
		return $info;
	}
	foreach ($pdfInfoArray as $infoLine) {
		$segments = explode(':', $infoLine, 2);
		if (count($segments) <= 1) {
			continue;
		}
		$label = trim($segments[0]);
		if (!$label) {
			continue;
		}
		$info[strtolower($label)] = trim($segments[1]);
	}
	return $info;
}
697
/**
 * Strips the line breaks and char(12) characters that tend to occur at the
 * end of strings coming from external files, then trims the result.
 *
 * @param string $string String to clean up
 * @return string Cleaned string
 * @todo Define visibility
 */
public function removeEndJunk($string) {
	$trailingJunkPattern = '/[' . LF . chr(12) . ']*$/';
	$withoutTrailingJunk = preg_replace($trailingJunkPattern, '', $string);
	return trim($withoutTrailingJunk);
}
708
709 /************************
710 *
711 * Backend analyzer
712 *
713 ************************/
/**
 * Builds the icon file reference for a file extension.
 * "htm" shares the icon of "html" and "jpeg" the icon of "jpg".
 *
 * @param string $extension File extension, lowercase.
 * @return string Relative file reference, resolvable by \TYPO3\CMS\Core\Utility\GeneralUtility::getFileAbsFileName()
 * @todo Define visibility
 */
public function getIcon($extension) {
	if ($extension == 'htm') {
		$iconName = 'html';
	} elseif ($extension == 'jpeg') {
		$iconName = 'jpg';
	} else {
		$iconName = $extension;
	}
	return 'EXT:indexed_search/pi/res/' . $iconName . '.gif';
}
730
731 }