[BUGFIX] Suppress EXIF warnings indexing images
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / Classes / FileContentParser.php
1 <?php
2 namespace TYPO3\CMS\IndexedSearch;
3
4 /***************************************************************
5 * Copyright notice
6 *
7 * (c) 2001-2013 Kasper Skårhøj (kasperYYYY@typo3.com)
8 * All rights reserved
9 *
10 * This script is part of the TYPO3 project. The TYPO3 project is
11 * free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * The GNU General Public License can be found at
17 * http://www.gnu.org/copyleft/gpl.html.
18 * A copy is found in the text file GPL.txt and important notices to the license
19 * from the author is found in LICENSE.txt distributed with these scripts.
20 *
21 *
22 * This script is distributed in the hope that it will be useful,
23 * but WITHOUT ANY WARRANTY; without even the implied warranty of
24 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25 * GNU General Public License for more details.
26 *
27 * This copyright notice MUST APPEAR in all copies of the script!
28 ***************************************************************/
29 /**
30 * External standard parsers for indexed_search
31 *
32 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
33 * @coauthor Olivier Simah <noname_paris@yahoo.fr>
34 */
35 /**
36 * External standard parsers for indexed_search
37 * MUST RETURN utf-8 content!
38 *
39 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
40 */
41 class FileContentParser {
42
/**
 * Page grouping mode for PDF files. This default is overridden in
 * initParser() with the "pdf_mode" value of the extension configuration
 * (forced into the range -100..100).
 *
 * - zero: the whole PDF file is indexed in one section.
 * - positive value: number of pages per section, e.g. "5" on a 10-page
 *   file gives the sections 1-5 and 6-10.
 * - negative value: the absolute value is the number of sections, e.g.
 *   "-3" on a 10-page file gives the sections 1-3, 4-6 and 7-10.
 *
 * @todo Define visibility
 */
public $pdf_mode = -20;

/**
 * Paths of the external tools keyed by tool name (e.g. "pdfinfo",
 * "pdftotext", "catdoc"). Filled by initParser().
 *
 * @todo Define visibility
 */
public $app = array();

/**
 * Maps every initialized file extension to its item_type, e.g.
 * "htm" => "html" and "jpg" => "jpeg". Filled by initParser().
 *
 * @todo Define visibility
 */
public $ext2itemtype_map = array();

/**
 * File extensions accepted by initParser(), stored as extension => TRUE.
 *
 * @todo Define visibility
 */
public $supportedExtensions = array();

/**
 * Reference to parent object (indexer class). Used for logging and for
 * splitting/converting the extracted content.
 *
 * @todo Define visibility
 */
public $pObj;

/**
 * Reference to the language object used for resolving labels:
 * $GLOBALS['TSFE'] in frontend mode, $GLOBALS['LANG'] otherwise.
 */
protected $langObject;

/**
 * Creates the parser and selects the language object that sL() uses for
 * resolving labels, depending on the current TYPO3_MODE.
 */
public function __construct() {
	if (TYPO3_MODE == 'FE') {
		// Frontend requests resolve labels through the TSFE object.
		$this->langObject = $GLOBALS['TSFE'];
	} else {
		// Every other mode uses the global LANG object.
		$this->langObject = $GLOBALS['LANG'];
	}
}
82
/**
 * Initialize external parser for parsing content.
 *
 * Reads the indexed_search extension configuration, verifies that the
 * external tool(s) needed for the given file type are configured and exist,
 * and on success registers the extension in $this->supportedExtensions and
 * $this->ext2itemtype_map. Problems are reported through the TS log of the
 * parent indexer object.
 *
 * @param string $extension File extension
 * @return boolean Returns TRUE if extension is supported/enabled, otherwise FALSE.
 * @todo Define visibility
 */
public function initParser($extension) {
	// Read indexer-config; a missing or corrupt configuration is treated as empty.
	$indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
	if (!is_array($indexerConfig)) {
		$indexerConfig = array();
	}
	// If windows, apply extension to tool name:
	$exe = TYPO3_OS == 'WIN' ? '.exe' : '';
	// Ignore extensions
	$ignoreList = isset($indexerConfig['ignoreExtensions']) ? strtolower($indexerConfig['ignoreExtensions']) : '';
	$ignoreExtensions = \TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode(',', $ignoreList, TRUE);
	if (in_array((string)$extension, $ignoreExtensions, TRUE)) {
		$this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/locallang.xlf:ignoreExtensions'), $extension), 1);
		return FALSE;
	}
	$extOK = FALSE;
	// Common "item_type" for extensions that are aliases of each other.
	$mainExtension = '';
	// Switch on file extension:
	switch ($extension) {
		case 'pdf':
			// PDF: needs both pdfinfo and pdftotext
			$extOK = $this->registerExternalTools($indexerConfig, 'pdftools', array('pdfinfo', 'pdftotext'), 'pdfTools', $exe);
			if ($extOK) {
				// PDF mode:
				$configuredPdfMode = isset($indexerConfig['pdf_mode']) ? $indexerConfig['pdf_mode'] : 0;
				$this->pdf_mode = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($configuredPdfMode, -100, 100);
			}
			break;
		case 'doc':
			// MS Word: catdoc
			$extOK = $this->registerExternalTools($indexerConfig, 'catdoc', array('catdoc'), 'catdoc', $exe);
			break;
		case 'pps':
		case 'ppt':
			// MS PowerPoint: ppthtml
			$extOK = $this->registerExternalTools($indexerConfig, 'ppthtml', array('ppthtml'), 'ppthtml', $exe);
			break;
		case 'xls':
			// MS Excel: xlhtml
			$extOK = $this->registerExternalTools($indexerConfig, 'xlhtml', array('xlhtml'), 'xlhtml', $exe);
			break;
		case 'sxc':
		case 'sxi':
		case 'sxw':
		case 'ods':
		case 'odp':
		case 'odt':
			// OpenOffice.org / Oasis OpenDocument: unzip
			$extOK = $this->registerExternalTools($indexerConfig, 'unzip', array('unzip'), 'unzip', $exe);
			break;
		case 'rtf':
			// Rich text: unrtf
			$extOK = $this->registerExternalTools($indexerConfig, 'unrtf', array('unrtf'), 'unrtf', $exe);
			break;
		case 'txt':
		case 'csv':
		case 'xml':
		case 'tif':
			// Handled with PHP only (raw text, strip_tags(), EXIF)
			$extOK = TRUE;
			break;
		case 'html':
		case 'htm':
			// PHP strip-tags(); making "html" the common "item_type"
			$extOK = TRUE;
			$mainExtension = 'html';
			break;
		case 'jpg':
		case 'jpeg':
			// PHP EXIF; making "jpeg" the common item_type
			$extOK = TRUE;
			$mainExtension = 'jpeg';
			break;
	}
	if (!$extOK) {
		// Unknown extension, or the required tool is disabled/missing.
		return FALSE;
	}
	$this->supportedExtensions[$extension] = TRUE;
	$this->ext2itemtype_map[$extension] = $mainExtension ?: $extension;
	return TRUE;
}

/**
 * Registers the external command line tool(s) required for a file type.
 *
 * All binaries must exist in the configured directory. If so, their paths
 * are stored in $this->app keyed by binary name. Otherwise the matching
 * "<labelPrefix>Disabled" or "<labelPrefix>NotFound" label is written to the
 * TS log and $this->app is left untouched.
 *
 * @param array $indexerConfig Unserialized indexed_search extension configuration
 * @param string $configKey Key in $indexerConfig that holds the directory of the tool(s)
 * @param array $binaries Names of the binaries (without ".exe") that must exist in that directory
 * @param string $labelPrefix Prefix of the locallang labels used for the log messages
 * @param string $exe Suffix appended to each binary name (".exe" on Windows, empty otherwise)
 * @return boolean TRUE if all binaries were found and registered, otherwise FALSE
 */
protected function registerExternalTools(array $indexerConfig, $configKey, array $binaries, $labelPrefix, $exe) {
	if (empty($indexerConfig[$configKey])) {
		$this->pObj->log_setTSlogMessage($this->sL('LLL:EXT:indexed_search/locallang.xlf:' . $labelPrefix . 'Disabled'), 1);
		return FALSE;
	}
	$toolPath = rtrim($indexerConfig[$configKey], '/') . '/';
	foreach ($binaries as $binary) {
		// "@" keeps is_file() quiet, as in the original checks.
		if (!@is_file($toolPath . $binary . $exe)) {
			$this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/locallang.xlf:' . $labelPrefix . 'NotFound'), $toolPath), 3);
			return FALSE;
		}
	}
	foreach ($binaries as $binary) {
		$this->app[$binary] = $toolPath . $binary . $exe;
	}
	return TRUE;
}
241
/**
 * Initialize external parser for backend modules
 * Doesn't evaluate if parser is configured right - more like returning POSSIBLE supported extensions (for showing icons etc) in backend and frontend plugin
 *
 * @param string $extension File extension to initialize for.
 * @return boolean Returns TRUE if the extension is one of the known file types, otherwise FALSE.
 * @todo Define visibility
 */
public function softInit($extension) {
	$knownExtensions = array(
		'pdf', 'doc', 'pps', 'ppt', 'xls',
		'sxc', 'sxi', 'sxw', 'ods', 'odp', 'odt',
		'rtf', 'txt', 'html', 'htm', 'csv', 'xml',
		'jpg', 'jpeg', 'tif'
	);
	// Strict lookup: always yields a real boolean and avoids type juggling.
	return in_array((string)$extension, $knownExtensions, TRUE);
}
296
/**
 * Return title of entry in media type selector box.
 *
 * A title is only returned if the extension is not listed in the
 * "ignoreExtensions" setting and, for file types that need an external tool,
 * the tool is set in the extension configuration.
 *
 * @param string $extension File extension
 * @return string|boolean String with label value of entry in media type search selector box (frontend plugin), or FALSE if the file type is ignored, unknown or disabled.
 * @todo Define visibility
 */
public function searchTypeMediaTitle($extension) {
	$extension = (string)$extension;
	// Read indexer-config; a missing or corrupt configuration is treated as empty.
	$indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
	if (!is_array($indexerConfig)) {
		$indexerConfig = array();
	}
	// Ignore extensions
	$ignoreList = isset($indexerConfig['ignoreExtensions']) ? strtolower($indexerConfig['ignoreExtensions']) : '';
	$ignoreExtensions = \TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode(',', $ignoreList, TRUE);
	if (in_array($extension, $ignoreExtensions, TRUE)) {
		return FALSE;
	}
	// extension => array(configuration key of the required tool or NULL, label suffix)
	$mediaTypes = array(
		'pdf' => array('pdftools', 'PDF'),
		'doc' => array('catdoc', 'DOC'),
		// MS PowerPoint
		'pps' => array('ppthtml', 'PP'),
		'ppt' => array('ppthtml', 'PP'),
		// MS Excel
		'xls' => array('xlhtml', 'XLS'),
		// Open Office Calc / Impress / Writer
		'sxc' => array('unzip', 'SXC'),
		'sxi' => array('unzip', 'SXI'),
		'sxw' => array('unzip', 'SXW'),
		// Oasis OpenDocument Spreadsheet / Presentation / Text
		'ods' => array('unzip', 'ODS'),
		'odp' => array('unzip', 'ODP'),
		'odt' => array('unzip', 'ODT'),
		'rtf' => array('unrtf', 'RTF'),
		// PHP EXIF
		'jpeg' => array(NULL, 'Images'),
		'jpg' => array(NULL, 'Images'),
		'tif' => array(NULL, 'Images'),
		// PHP strip-tags()
		'html' => array(NULL, 'HTML'),
		'htm' => array(NULL, 'HTML'),
		'xml' => array(NULL, 'XML'),
		// Raw text
		'txt' => array(NULL, 'TXT'),
		'csv' => array(NULL, 'CSV')
	);
	if (!isset($mediaTypes[$extension])) {
		return FALSE;
	}
	list($requiredTool, $labelSuffix) = $mediaTypes[$extension];
	if ($requiredTool !== NULL && empty($indexerConfig[$requiredTool])) {
		// The external tool for this file type is not configured.
		return FALSE;
	}
	return sprintf($this->sL('LLL:EXT:indexed_search/locallang.xlf:extension.' . $labelSuffix), $extension);
}
414
/**
 * Returns TRUE if the input extension (item_type) is potentially a multi-page extension
 *
 * Currently only "pdf" is treated as multi-page.
 *
 * @param string $extension Extension / item_type string
 * @return boolean TRUE if multi-page, otherwise FALSE
 * @todo Define visibility
 */
public function isMultiplePageExtension($extension) {
	return (string)$extension === 'pdf';
}
430
/**
 * Resolves a label by delegating to the sL() method of the language object
 * chosen in the constructor.
 *
 * @param string $reference Reference/key of the label
 * @param boolean $useHtmlSpecialChar Passed through to the language object: convert special chars to HTML entities (default: FALSE)
 * @return string The label of the reference/key to be fetched
 */
protected function sL($reference, $useHtmlSpecialChar = FALSE) {
	$label = $this->langObject->sL($reference, $useHtmlSpecialChar);
	return $label;
}
441
442 /************************
443 *
444 * Reading documents (for parsing)
445 *
446 ************************/
/**
 * Reads the content of an external file being indexed.
 *
 * The file type must have been enabled through initParser() before. Most
 * types are converted to text with the external tool registered in
 * $this->app; the text is then split into the standard content array by the
 * parent indexer object ($this->pObj).
 *
 * @param string $ext File extension, eg. "pdf", "doc" etc.
 * @param string $absFile Absolute filename of file (must exist and be validated OK before calling function)
 * @param string $cPKey Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be split, formatted "<first page>-<last page>")
 * @return array|boolean|NULL Standard content array (title, description, keywords, body keys); FALSE if the extension is not supported; NULL if the required tool is not registered or no content could be extracted
 * @todo Define visibility
 */
public function readFileContent($ext, $absFile, $cPKey) {
	$contentArr = NULL;
	// Return immediately if initialization didn't set support up:
	if (empty($this->supportedExtensions[$ext])) {
		return FALSE;
	}
	// Switch by file extension
	switch ($ext) {
		case 'pdf':
			if (!empty($this->app['pdfinfo'])) {
				$this->setLocaleForServerFileSystem();
				// Getting pdf-info:
				$cmd = $this->app['pdfinfo'] . ' ' . escapeshellarg($absFile);
				\TYPO3\CMS\Core\Utility\CommandUtility::exec($cmd, $res);
				$pdfInfo = $this->splitPdfInfo($res);
				unset($res);
				if (isset($pdfInfo['pages']) && (int)$pdfInfo['pages']) {
					// The page numbers end up on a shell command line, so force them to integers.
					// NOTE(review): a missing last page becomes 0, which pdftotext presumably treats as "until the last page" - confirm.
					$pageRange = explode('-', $cPKey);
					$low = (int)$pageRange[0];
					$high = isset($pageRange[1]) ? (int)$pageRange[1] : 0;
					// Create temporary name and delete the file if it exists, just to be safe.
					$tempFileName = \TYPO3\CMS\Core\Utility\GeneralUtility::tempnam('Typo3_indexer');
					@unlink($tempFileName);
					// Get pdf content:
					$cmd = $this->app['pdftotext'] . ' -f ' . $low . ' -l ' . $high . ' -enc UTF-8 -q ' . escapeshellarg($absFile) . ' ' . escapeshellarg($tempFileName);
					\TYPO3\CMS\Core\Utility\CommandUtility::exec($cmd);
					$content = '';
					if (@is_file($tempFileName)) {
						$content = \TYPO3\CMS\Core\Utility\GeneralUtility::getUrl($tempFileName);
						unlink($tempFileName);
					} else {
						$this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/locallang.xlf:pdfToolsFailed'), $absFile), 2);
					}
					if (strlen($content)) {
						$contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
					}
				}
				$this->setLocaleForServerFileSystem(TRUE);
			}
			break;
		case 'doc':
			if (!empty($this->app['catdoc'])) {
				$this->setLocaleForServerFileSystem();
				$cmd = $this->app['catdoc'] . ' -d utf-8 ' . escapeshellarg($absFile);
				\TYPO3\CMS\Core\Utility\CommandUtility::exec($cmd, $res);
				$content = implode(LF, $res);
				unset($res);
				$contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
				$this->setLocaleForServerFileSystem(TRUE);
			}
			break;
		case 'pps':
		case 'ppt':
			if (!empty($this->app['ppthtml'])) {
				$this->setLocaleForServerFileSystem();
				$cmd = $this->app['ppthtml'] . ' ' . escapeshellarg($absFile);
				\TYPO3\CMS\Core\Utility\CommandUtility::exec($cmd, $res);
				$content = implode(LF, $res);
				unset($res);
				$content = $this->pObj->convertHTMLToUtf8($content);
				$contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
				// Make sure the title doesn't expose the absolute path!
				$contentArr['title'] = basename($absFile);
				$this->setLocaleForServerFileSystem(TRUE);
			}
			break;
		case 'xls':
			if (!empty($this->app['xlhtml'])) {
				$this->setLocaleForServerFileSystem();
				$cmd = $this->app['xlhtml'] . ' -nc -te ' . escapeshellarg($absFile);
				\TYPO3\CMS\Core\Utility\CommandUtility::exec($cmd, $res);
				$content = implode(LF, $res);
				unset($res);
				$content = $this->pObj->convertHTMLToUtf8($content);
				$contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
				// Make sure the title doesn't expose the absolute path!
				$contentArr['title'] = basename($absFile);
				$this->setLocaleForServerFileSystem(TRUE);
			}
			break;
		case 'sxi':
		case 'sxc':
		case 'sxw':
		case 'ods':
		case 'odp':
		case 'odt':
			if (!empty($this->app['unzip'])) {
				$this->setLocaleForServerFileSystem();
				// Read content.xml:
				$cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' content.xml';
				\TYPO3\CMS\Core\Utility\CommandUtility::exec($cmd, $res);
				$content_xml = implode(LF, $res);
				unset($res);
				// Read meta.xml:
				$cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' meta.xml';
				\TYPO3\CMS\Core\Utility\CommandUtility::exec($cmd, $res);
				$meta_xml = implode(LF, $res);
				unset($res);
				// Put a space before every tag so words of adjacent elements don't get glued together.
				$utf8_content = trim(strip_tags(str_replace('<', ' <', $content_xml)));
				$contentArr = $this->pObj->splitRegularContent($utf8_content);
				// Make sure the title doesn't expose the absolute path!
				$contentArr['title'] = basename($absFile);
				// Meta information
				$metaContent = \TYPO3\CMS\Core\Utility\GeneralUtility::xml2tree($meta_xml);
				$metaContent = $metaContent['office:document-meta'][0]['ch']['office:meta'][0]['ch'];
				if (is_array($metaContent)) {
					$contentArr['title'] = $metaContent['dc:title'][0]['values'][0] ? $metaContent['dc:title'][0]['values'][0] : $contentArr['title'];
					$contentArr['description'] = $metaContent['dc:subject'][0]['values'][0] . ' ' . $metaContent['dc:description'][0]['values'][0];
					// Keywords collected:
					if (is_array($metaContent['meta:keywords'][0]['ch']['meta:keyword'])) {
						foreach ($metaContent['meta:keywords'][0]['ch']['meta:keyword'] as $kwDat) {
							$contentArr['keywords'] .= $kwDat['values'][0] . ' ';
						}
					}
				}
				$this->setLocaleForServerFileSystem(TRUE);
			}
			break;
		case 'rtf':
			if (!empty($this->app['unrtf'])) {
				$this->setLocaleForServerFileSystem();
				$cmd = $this->app['unrtf'] . ' ' . escapeshellarg($absFile);
				\TYPO3\CMS\Core\Utility\CommandUtility::exec($cmd, $res);
				$fileContent = implode(LF, $res);
				unset($res);
				$fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
				$contentArr = $this->pObj->splitHTMLContent($fileContent);
				$this->setLocaleForServerFileSystem(TRUE);
			}
			break;
		case 'txt':
		case 'csv':
			$this->setLocaleForServerFileSystem();
			// Raw text
			$content = \TYPO3\CMS\Core\Utility\GeneralUtility::getUrl($absFile);
			// TODO: Implement auto detection of charset (currently assuming utf-8)
			$contentCharset = 'utf-8';
			$content = $this->pObj->convertHTMLToUtf8($content, $contentCharset);
			$contentArr = $this->pObj->splitRegularContent($content);
			// Make sure the title doesn't expose the absolute path!
			$contentArr['title'] = basename($absFile);
			$this->setLocaleForServerFileSystem(TRUE);
			break;
		case 'html':
		case 'htm':
			$fileContent = \TYPO3\CMS\Core\Utility\GeneralUtility::getUrl($absFile);
			$fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
			$contentArr = $this->pObj->splitHTMLContent($fileContent);
			break;
		case 'xml':
			$this->setLocaleForServerFileSystem();
			// PHP strip-tags()
			$fileContent = \TYPO3\CMS\Core\Utility\GeneralUtility::getUrl($absFile);
			// Finding charset in the XML declaration within the first 200 bytes:
			preg_match('/^[[:space:]]*<\\?xml[^>]+encoding[[:space:]]*=[[:space:]]*["\'][[:space:]]*([[:alnum:]_-]+)[[:space:]]*["\']/i', substr($fileContent, 0, 200), $reg);
			$charset = !empty($reg[1]) ? $this->pObj->csObj->parse_charset($reg[1]) : 'utf-8';
			// Converting content:
			$fileContent = $this->pObj->convertHTMLToUtf8(strip_tags(str_replace('<', ' <', $fileContent)), $charset);
			$contentArr = $this->pObj->splitRegularContent($fileContent);
			// Make sure the title doesn't expose the absolute path!
			$contentArr['title'] = basename($absFile);
			$this->setLocaleForServerFileSystem(TRUE);
			break;
		case 'jpg':
		case 'jpeg':
		case 'tif':
			$this->setLocaleForServerFileSystem();
			// PHP EXIF; warnings for files without usable EXIF data are suppressed.
			$exif = FALSE;
			if (function_exists('exif_read_data')) {
				$exif = @exif_read_data($absFile, 'IFD0');
			}
			$comment = '';
			if ($exif) {
				// Both fields are optional in the EXIF data.
				$exifComment = isset($exif['COMMENT'][0]) ? $exif['COMMENT'][0] : '';
				$exifDescription = isset($exif['ImageDescription']) ? $exif['ImageDescription'] : '';
				$comment = trim($exifComment . ' ' . $exifDescription);
			}
			$contentArr = $this->pObj->splitRegularContent($comment);
			// Make sure the title doesn't expose the absolute path!
			$contentArr['title'] = basename($absFile);
			$this->setLocaleForServerFileSystem(TRUE);
			break;
		default:
			return FALSE;
	}
	// If no title (and why should there be...) then the file-name is set as title. This will raise the hits considerably if the search matches the document name.
	if (is_array($contentArr) && empty($contentArr['title'])) {
		// Substituting "_" for " " because many filenames may have this instead of a space char.
		$contentArr['title'] = str_replace('_', ' ', basename($absFile));
	}
	return $contentArr;
}
655
/**
 * Switches LC_CTYPE to $TYPO3_CONF_VARS['SYS']['systemLocale'] and back again.
 * Does nothing unless $TYPO3_CONF_VARS['SYS']['UTF8filesystem'] is set.
 *
 * Calls must alternate: first without argument (switch to the system
 * locale), then with <code>$resetLocale = TRUE</code> (restore the previous
 * locale). The previous locale is kept in a static variable between calls.
 *
 * NOTE(review): the reset check compares loosely against NULL, so a falsy
 * stored locale is treated like "nothing stored".
 *
 * @staticvar string $lastLocale Stores the locale used before it is overridden by this method.
 * @param boolean $resetLocale TRUE resets the locale to $lastLocale.
 * @return void
 * @throws \RuntimeException if the calls do not alternate as described
 */
protected function setLocaleForServerFileSystem($resetLocale = FALSE) {
	static $lastLocale = NULL;
	$utf8FilesystemEnabled = $GLOBALS['TYPO3_CONF_VARS']['SYS']['UTF8filesystem'];
	if (!$utf8FilesystemEnabled) {
		return;
	}

	if (!$resetLocale) {
		// Switch: remember the current locale, then activate the system locale.
		if ($lastLocale !== NULL) {
			throw new \RuntimeException('Cannot set new locale as locale has already been changed before.', 1357064437);
		}
		$lastLocale = setlocale(LC_CTYPE, 0);
		setlocale(LC_CTYPE, $GLOBALS['TYPO3_CONF_VARS']['SYS']['systemLocale']);
		return;
	}

	// Reset: restore the remembered locale and clear the memory.
	if ($lastLocale == NULL) {
		throw new \RuntimeException('Cannot reset locale to NULL.', 1357064326);
	}
	setlocale(LC_CTYPE, $lastLocale);
	$lastLocale = NULL;
}
687
/**
 * Creates an array with pointers to divisions of document.
 *
 * ONLY for PDF files at this point. All other types will have an array with a single element with the value "0" (zero)
 * coming back. The same single-element array is returned for PDF files if the
 * pdfinfo tool has not been registered or reports no pages.
 *
 * The page ranges ("<first page>-<last page>") are derived from
 * $this->pdf_mode: a positive value is the number of pages per section, any
 * other value is used (as absolute value) as the number of sections.
 *
 * @param string $ext File extension
 * @param string $absFile Absolute filename (must exist and be validated OK before calling function)
 * @return array Array of pointers to sections that the document should be divided into
 * @todo Define visibility
 */
public function fileContentParts($ext, $absFile) {
	$cParts = array(0);
	if ((string)$ext !== 'pdf') {
		return $cParts;
	}
	// Without a registered pdfinfo tool the command line below would start with the
	// file path; readFileContent() applies the same guard.
	if (empty($this->app['pdfinfo'])) {
		return $cParts;
	}
	$this->setLocaleForServerFileSystem();
	// Getting pdf-info:
	$cmd = $this->app['pdfinfo'] . ' ' . escapeshellarg($absFile);
	\TYPO3\CMS\Core\Utility\CommandUtility::exec($cmd, $res);
	$pdfInfo = $this->splitPdfInfo($res);
	unset($res);
	$pageCount = isset($pdfInfo['pages']) ? (int)$pdfInfo['pages'] : 0;
	if ($pageCount > 0) {
		$cParts = array();
		// Calculate the number of sections
		if ($this->pdf_mode > 0) {
			$iter = ceil($pageCount / $this->pdf_mode);
		} else {
			$iter = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange(abs($this->pdf_mode), 1, $pageCount);
		}
		// Traverse and create intervals of (nearly) equal size.
		$pagesPerSection = $pageCount / $iter;
		for ($a = 0; $a < $iter; $a++) {
			$low = (int)floor($a * $pagesPerSection) + 1;
			$high = (int)floor(($a + 1) * $pagesPerSection);
			$cParts[] = $low . '-' . $high;
		}
	}
	$this->setLocaleForServerFileSystem(TRUE);
	return $cParts;
}
730
/**
 * Turns the output lines of the pdfinfo tool into a key => value array.
 *
 * Each line is split at its first colon. The part before it (trimmed and
 * lowercased) becomes the key, the rest (trimmed) the value. Lines without a
 * colon or with an empty key are skipped.
 *
 * @param array $pdfInfoArray Array of PDF content, coming from the pdfinfo tool
 * @return array Result array; empty if the input is not an array
 * @access private
 * @see fileContentParts()
 * @todo Define visibility
 */
public function splitPdfInfo($pdfInfoArray) {
	$info = array();
	if (!is_array($pdfInfoArray)) {
		return $info;
	}
	foreach ($pdfInfoArray as $infoLine) {
		$keyAndValue = explode(':', $infoLine, 2);
		if (count($keyAndValue) <= 1) {
			// No colon in this line
			continue;
		}
		$key = trim($keyAndValue[0]);
		if (!$key) {
			continue;
		}
		$info[strtolower($key)] = trim($keyAndValue[1]);
	}
	return $info;
}
752
/**
 * Strips the line breaks and form feed characters (char 12) that tend to occur
 * at the end of content coming from external tools, then trims the string.
 *
 * @param string $string String to clean up
 * @return string Cleaned up string
 * @todo Define visibility
 */
public function removeEndJunk($string) {
	// Any run of line feeds / form feeds at the very end of the string
	$trailingJunkPattern = '/[' . LF . chr(12) . ']*$/';
	$withoutJunk = preg_replace($trailingJunkPattern, '', $string);
	return trim($withoutJunk);
}
763
764 /************************
765 *
766 * Backend analyzer
767 *
768 ************************/
/**
 * Returns the icon file for a file extension.
 *
 * "htm" shares the icon of "html" and "jpeg" the icon of "jpg".
 *
 * @param string $extension File extension, lowercase.
 * @return string Relative file reference, resolvable by \TYPO3\CMS\Core\Utility\GeneralUtility::getFileAbsFileName()
 * @todo Define visibility
 */
public function getIcon($extension) {
	$iconName = $extension;
	if ($iconName == 'htm') {
		$iconName = 'html';
	} elseif ($iconName == 'jpeg') {
		$iconName = 'jpg';
	}
	return 'EXT:indexed_search/pi/res/' . $iconName . '.gif';
}
785
786 }