09b39929906f720e19701b9fd069f6b0dbdaa6a6
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / Classes / FileContentParser.php
1 <?php
2 namespace TYPO3\CMS\IndexedSearch;
3
4 /*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use TYPO3\CMS\Core\Utility\GeneralUtility;
18 use TYPO3\CMS\Core\Utility\CommandUtility;
19 use TYPO3\CMS\Core\Utility\MathUtility;
20
21 /**
22 * External standard parsers for indexed_search
23 * MUST RETURN utf-8 content!
24 */
25 class FileContentParser {
26
27 /**
28 * This value is also overridden from config.
29 * Zero: the whole PDF file is indexed in one go. Positive value: maximum number of pages per section, e.g. "5" splits 10 pages into 1-5, 6-10.
30 * Negative integer: its absolute value is the number of sections, e.g. "-3" splits 10 pages into 1-3, 4-6, 7-10.
31 *
32 * @var int
33 */
34 public $pdf_mode = -20;
35
36 /**
37 * @var array
38 */
39 public $app = array();
40
41 /**
42 * @var array
43 */
44 public $ext2itemtype_map = array();
45
46 /**
47 * @var array
48 */
49 public $supportedExtensions = array();
50
51 /**
52 * @var \TYPO3\CMS\IndexedSearch\Indexer
53 */
54 public $pObj;
55
56 /**
57 * @var \TYPO3\CMS\Lang\LanguageService|\TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController
58 */
59 protected $langObject;
60
/**
 * Creates the parser and selects the language object used for label lookups.
 *
 * When TYPO3_MODE is 'FE' the object in $GLOBALS['TSFE'] is used,
 * in every other mode the object in $GLOBALS['LANG'].
 */
public function __construct() {
    if (TYPO3_MODE === 'FE') {
        $this->langObject = $GLOBALS['TSFE'];
    } else {
        $this->langObject = $GLOBALS['LANG'];
    }
}
68
/**
 * Initialize external parser for parsing content.
 *
 * Reads the indexed_search extension configuration, rejects extensions listed in
 * "ignoreExtensions" and, for formats that need an external command line tool,
 * verifies that the tool is enabled and its binaries exist. On success the
 * extension is registered in $this->supportedExtensions and its item type in
 * $this->ext2itemtype_map ("htm" maps to "html", "jpg" to "jpeg").
 *
 * @param string $extension File extension
 * @return bool Returns TRUE if extension is supported/enabled, otherwise FALSE.
 */
public function initParser($extension) {
    $extension = (string)$extension;
    $labelFile = 'LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:';
    // Read indexer-config; fall back to an empty configuration if it is missing or broken.
    $indexerConfig = isset($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search'])
        ? unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search'])
        : array();
    if (!is_array($indexerConfig)) {
        $indexerConfig = array();
    }
    // Ignore extensions. Strict comparison: a loose one treats numeric-looking
    // strings such as "1e1" and "10" as equal.
    $ignoreList = isset($indexerConfig['ignoreExtensions']) ? strtolower($indexerConfig['ignoreExtensions']) : '';
    $ignoreExtensions = GeneralUtility::trimExplode(',', $ignoreList, TRUE);
    if (in_array($extension, $ignoreExtensions, TRUE)) {
        $this->pObj->log_setTSlogMessage(sprintf($this->sL($labelFile . 'ignoreExtensions'), $extension), 1);
        return FALSE;
    }
    $extOK = FALSE;
    $mainExtension = '';
    // Switch on file extension:
    switch ($extension) {
        case 'pdf':
            // PDF needs both pdfinfo and pdftotext
            $extOK = $this->initExternalTool($indexerConfig, 'pdftools', array('pdfinfo', 'pdftotext'), 'pdfTools');
            if ($extOK) {
                // PDF mode:
                $pdfMode = isset($indexerConfig['pdf_mode']) ? $indexerConfig['pdf_mode'] : 0;
                $this->pdf_mode = MathUtility::forceIntegerInRange($pdfMode, -100, 100);
            }
            break;
        case 'doc':
            // Catdoc
            $extOK = $this->initExternalTool($indexerConfig, 'catdoc', array('catdoc'), 'catdoc');
            break;
        case 'pps':
        case 'ppt':
            // MS PowerPoint: ppthtml
            $extOK = $this->initExternalTool($indexerConfig, 'ppthtml', array('ppthtml'), 'ppthtml');
            break;
        case 'xls':
            // MS Excel: xlhtml
            $extOK = $this->initExternalTool($indexerConfig, 'xlhtml', array('xlhtml'), 'xlhtml');
            break;
        case 'sxc':
        case 'sxi':
        case 'sxw':
        case 'ods':
        case 'odp':
        case 'odt':
            // OpenOffice.org / OpenDocument files are zip archives
            $extOK = $this->initExternalTool($indexerConfig, 'unzip', array('unzip'), 'unzip');
            break;
        case 'rtf':
            // unrtf
            $extOK = $this->initExternalTool($indexerConfig, 'unrtf', array('unrtf'), 'unrtf');
            break;
        case 'txt':
        case 'csv':
        case 'xml':
        case 'tif':
            // Handled with PHP functions only, no external tool needed
            $extOK = TRUE;
            break;
        case 'html':
        case 'htm':
            // PHP strip-tags(); "html" is the common item_type
            $extOK = TRUE;
            $mainExtension = 'html';
            break;
        case 'jpg':
        case 'jpeg':
            // PHP EXIF; "jpeg" is the common item_type
            $extOK = TRUE;
            $mainExtension = 'jpeg';
            break;
    }
    if (!$extOK) {
        return FALSE;
    }
    $this->supportedExtensions[$extension] = TRUE;
    $this->ext2itemtype_map[$extension] = $mainExtension ?: $extension;
    return TRUE;
}

/**
 * Checks that an external tool is enabled in the configuration and that all of
 * its binaries exist; registers the binaries in $this->app when they do.
 * Logs a message through $this->pObj when the tool is disabled or a binary is
 * missing. Nothing is registered unless every binary was found.
 *
 * @param array $indexerConfig indexed_search extension configuration
 * @param string $configKey Configuration key holding the directory of the tool
 * @param array $binaries Binary names (without path and without ".exe") that must all exist
 * @param string $labelPrefix Prefix of the "<prefix>NotFound" / "<prefix>Disabled" language labels
 * @return bool TRUE if all binaries were found and registered, otherwise FALSE
 */
protected function initExternalTool(array $indexerConfig, $configKey, array $binaries, $labelPrefix) {
    $labelFile = 'LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:';
    if (empty($indexerConfig[$configKey])) {
        $this->pObj->log_setTSlogMessage($this->sL($labelFile . $labelPrefix . 'Disabled'), 1);
        return FALSE;
    }
    // If windows, apply extension to tool name:
    $exe = TYPO3_OS === 'WIN' ? '.exe' : '';
    $toolPath = rtrim($indexerConfig[$configKey], '/') . '/';
    $executables = array();
    foreach ($binaries as $binary) {
        $executable = $toolPath . $binary . $exe;
        // "@" kept on purpose: is_file() may emit warnings (e.g. open_basedir) for paths it cannot inspect
        if (!@is_file($executable)) {
            $this->pObj->log_setTSlogMessage(sprintf($this->sL($labelFile . $labelPrefix . 'NotFound'), $toolPath), 3);
            return FALSE;
        }
        $executables[$binary] = $executable;
    }
    foreach ($executables as $binary => $executable) {
        $this->app[$binary] = $executable;
    }
    return TRUE;
}
216
/**
 * Initialize external parser for backend modules
 *
 * Does not look at the configuration or at installed tools: it only tells whether
 * the extension is one this parser could POSSIBLY handle (for showing icons etc.
 * in backend and frontend plugin).
 *
 * @param string $extension File extension to initialize for.
 * @return bool Returns TRUE if the extension is one of the known extensions, otherwise FALSE.
 */
public function softInit($extension) {
    $knownExtensions = array(
        'pdf', 'doc', 'pps', 'ppt', 'xls',
        'sxc', 'sxi', 'sxw', 'ods', 'odp', 'odt',
        'rtf', 'txt', 'html', 'htm', 'csv', 'xml',
        'jpg', 'jpeg', 'tif'
    );
    // Non-strict on purpose: compares loosely, exactly like a switch statement does.
    return in_array($extension, $knownExtensions);
}
252
/**
 * Return title of entry in media type selector box.
 *
 * Extensions that need an external tool only get a title if the directory of
 * that tool is set in the indexed_search extension configuration.
 *
 * @param string $extension File extension
 * @return string|bool Label of the entry in the media type search selector box (frontend plugin); empty string if the extension is unknown or its tool is not configured; FALSE if the extension is listed in "ignoreExtensions".
 */
public function searchTypeMediaTitle($extension) {
    // Read indexer-config; fall back to an empty configuration if it is missing or broken.
    $indexerConfig = isset($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search'])
        ? unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search'])
        : array();
    if (!is_array($indexerConfig)) {
        $indexerConfig = array();
    }
    // Ignore extensions. Strict comparison: a loose one treats numeric-looking
    // strings such as "1e1" and "10" as equal.
    $ignoreList = isset($indexerConfig['ignoreExtensions']) ? strtolower($indexerConfig['ignoreExtensions']) : '';
    $ignoreExtensions = GeneralUtility::trimExplode(',', $ignoreList, TRUE);
    if (in_array((string)$extension, $ignoreExtensions, TRUE)) {
        return FALSE;
    }
    // extension => array(label suffix, configuration key of the required tool or NULL if none is needed)
    $mediaTypes = array(
        'pdf' => array('PDF', 'pdftools'),
        'doc' => array('DOC', 'catdoc'),
        'pps' => array('PP', 'ppthtml'),
        'ppt' => array('PP', 'ppthtml'),
        'xls' => array('XLS', 'xlhtml'),
        'sxc' => array('SXC', 'unzip'),
        'sxi' => array('SXI', 'unzip'),
        'sxw' => array('SXW', 'unzip'),
        'ods' => array('ODS', 'unzip'),
        'odp' => array('ODP', 'unzip'),
        'odt' => array('ODT', 'unzip'),
        'rtf' => array('RTF', 'unrtf'),
        'jpeg' => array('Images', NULL),
        'jpg' => array('Images', NULL),
        'tif' => array('Images', NULL),
        'html' => array('HTML', NULL),
        'htm' => array('HTML', NULL),
        'txt' => array('TXT', NULL),
        'csv' => array('CSV', NULL),
        'xml' => array('XML', NULL)
    );
    $extensionKey = (string)$extension;
    if (!isset($mediaTypes[$extensionKey])) {
        return '';
    }
    list($labelSuffix, $requiredTool) = $mediaTypes[$extensionKey];
    if ($requiredTool !== NULL && empty($indexerConfig[$requiredTool])) {
        return '';
    }
    $label = 'LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:extension.' . $labelSuffix;
    return sprintf($this->sL($label), $extension);
}
366
/**
 * Tells whether files of the given extension (item_type) may be split into several page sections.
 * Only "pdf" is.
 *
 * @param string $extension Extension / item_type string
 * @return bool Return TRUE if multi-page
 */
public function isMultiplePageExtension($extension) {
    return (string)$extension === 'pdf';
}
382
/**
 * Fetches a label by delegating to the sL() method of the language object chosen in the constructor.
 *
 * @param string $reference Reference/key of the label
 * @param bool $useHtmlSpecialChar Passed through to the language object: convert special chars to HTML entities (default: FALSE)
 * @return string The label of the reference/key to be fetched
 */
protected function sL($reference, $useHtmlSpecialChar = FALSE) {
    $label = $this->langObject->sL($reference, $useHtmlSpecialChar);
    return $label;
}
393
394 /************************
395 *
396 * Reading documents (for parsing)
397 *
398 ************************/
/**
 * Reads the content of an external file being indexed.
 *
 * The extension must have been initialized with initParser() before, otherwise
 * FALSE is returned. Content is extracted either with the external tool
 * registered in $this->app or with plain PHP functions and is then split into
 * the standard content array by the parent indexer ($this->pObj).
 *
 * @param string $ext File extension, eg. "pdf", "doc" etc.
 * @param string $absFile Absolute filename of file (must exist and be validated OK before calling function)
 * @param string $cPKey Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be split, e.g. "1-5".)
 * @return array|bool|NULL Standard content array (title, description, keywords, body keys); FALSE if the extension is not supported; NULL if no content could be extracted
 */
public function readFileContent($ext, $absFile, $cPKey) {
    $contentArr = NULL;
    // Return immediately if initialization didn't set support up:
    if (empty($this->supportedExtensions[$ext])) {
        return FALSE;
    }
    // Switch by file extension
    switch ($ext) {
        case 'pdf':
            if (!empty($this->app['pdfinfo'])) {
                $this->setLocaleForServerFileSystem();
                // Getting pdf-info:
                $res = array();
                CommandUtility::exec($this->app['pdfinfo'] . ' ' . escapeshellarg($absFile), $res);
                $pdfInfo = $this->splitPdfInfo($res);
                if (isset($pdfInfo['pages']) && (int)$pdfInfo['pages']) {
                    // The page range ends up in a shell command, so only integers may pass.
                    // A pointer without "-" yields 0 as last page.
                    // NOTE(review): pdftotext appears to treat a last page of 0 as "up to the end" - confirm against the pdftotext manual.
                    $pageRange = explode('-', (string)$cPKey);
                    $low = (int)$pageRange[0];
                    $high = isset($pageRange[1]) ? (int)$pageRange[1] : 0;
                    // Create temporary name and delete the file behind it, so that its
                    // existence after the call shows whether pdftotext wrote any output.
                    $tempFileName = GeneralUtility::tempnam('Typo3_indexer');
                    @unlink($tempFileName);
                    // Get pdf content:
                    $cmd = $this->app['pdftotext'] . ' -f ' . $low . ' -l ' . $high . ' -enc UTF-8 -q '
                        . escapeshellarg($absFile) . ' ' . escapeshellarg($tempFileName);
                    CommandUtility::exec($cmd);
                    if (@is_file($tempFileName)) {
                        $content = GeneralUtility::getUrl($tempFileName);
                        unlink($tempFileName);
                    } else {
                        $content = '';
                        $this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:pdfToolsFailed'), $absFile), 2);
                    }
                    if ((string)$content !== '') {
                        $contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
                    }
                }
                $this->setLocaleForServerFileSystem(TRUE);
            }
            break;
        case 'doc':
            if (!empty($this->app['catdoc'])) {
                $this->setLocaleForServerFileSystem();
                $res = array();
                CommandUtility::exec($this->app['catdoc'] . ' -d utf-8 ' . escapeshellarg($absFile), $res);
                $content = implode(LF, $res);
                $contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
                $this->setLocaleForServerFileSystem(TRUE);
            }
            break;
        case 'pps':
        case 'ppt':
            if (!empty($this->app['ppthtml'])) {
                $this->setLocaleForServerFileSystem();
                $res = array();
                CommandUtility::exec($this->app['ppthtml'] . ' ' . escapeshellarg($absFile), $res);
                $content = $this->pObj->convertHTMLToUtf8(implode(LF, $res));
                $contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
                // Make sure the title doesn't expose the absolute path!
                $contentArr['title'] = basename($absFile);
                $this->setLocaleForServerFileSystem(TRUE);
            }
            break;
        case 'xls':
            if (!empty($this->app['xlhtml'])) {
                $this->setLocaleForServerFileSystem();
                $res = array();
                CommandUtility::exec($this->app['xlhtml'] . ' -nc -te ' . escapeshellarg($absFile), $res);
                $content = $this->pObj->convertHTMLToUtf8(implode(LF, $res));
                $contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
                // Make sure the title doesn't expose the absolute path!
                $contentArr['title'] = basename($absFile);
                $this->setLocaleForServerFileSystem(TRUE);
            }
            break;
        case 'sxi':
        case 'sxc':
        case 'sxw':
        case 'ods':
        case 'odp':
        case 'odt':
            if (!empty($this->app['unzip'])) {
                $this->setLocaleForServerFileSystem();
                // Read content.xml:
                $res = array();
                CommandUtility::exec($this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' content.xml', $res);
                $content_xml = implode(LF, $res);
                // Read meta.xml:
                $res = array();
                CommandUtility::exec($this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' meta.xml', $res);
                $meta_xml = implode(LF, $res);
                // A space is put in front of every tag so that words of adjacent elements don't get glued together
                $utf8_content = trim(strip_tags(str_replace('<', ' <', $content_xml)));
                $contentArr = $this->pObj->splitRegularContent($utf8_content);
                // Make sure the title doesn't expose the absolute path!
                $contentArr['title'] = basename($absFile);
                // Meta information. xml2tree() may hand back an error string instead of an array
                // (e.g. for a missing or broken meta.xml), so the structure is checked before use.
                $metaTree = GeneralUtility::xml2tree($meta_xml);
                $metaContent = NULL;
                if (is_array($metaTree) && isset($metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'])) {
                    $metaContent = $metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'];
                }
                if (is_array($metaContent)) {
                    if (!empty($metaContent['dc:title'][0]['values'][0])) {
                        $contentArr['title'] = $metaContent['dc:title'][0]['values'][0];
                    }
                    $subject = isset($metaContent['dc:subject'][0]['values'][0]) ? $metaContent['dc:subject'][0]['values'][0] : '';
                    $description = isset($metaContent['dc:description'][0]['values'][0]) ? $metaContent['dc:description'][0]['values'][0] : '';
                    $contentArr['description'] = $subject . ' ' . $description;
                    // Keywords collected:
                    if (isset($metaContent['meta:keywords'][0]['ch']['meta:keyword']) && is_array($metaContent['meta:keywords'][0]['ch']['meta:keyword'])) {
                        if (!isset($contentArr['keywords'])) {
                            $contentArr['keywords'] = '';
                        }
                        foreach ($metaContent['meta:keywords'][0]['ch']['meta:keyword'] as $kwDat) {
                            $contentArr['keywords'] .= (isset($kwDat['values'][0]) ? $kwDat['values'][0] : '') . ' ';
                        }
                    }
                }
                $this->setLocaleForServerFileSystem(TRUE);
            }
            break;
        case 'rtf':
            if (!empty($this->app['unrtf'])) {
                $this->setLocaleForServerFileSystem();
                $res = array();
                CommandUtility::exec($this->app['unrtf'] . ' ' . escapeshellarg($absFile), $res);
                $fileContent = $this->pObj->convertHTMLToUtf8(implode(LF, $res));
                $contentArr = $this->pObj->splitHTMLContent($fileContent);
                $this->setLocaleForServerFileSystem(TRUE);
            }
            break;
        case 'txt':
        case 'csv':
            $this->setLocaleForServerFileSystem();
            // Raw text
            $content = GeneralUtility::getUrl($absFile);
            // @todo Implement auto detection of charset (currently assuming utf-8)
            $contentCharset = 'utf-8';
            $content = $this->pObj->convertHTMLToUtf8($content, $contentCharset);
            $contentArr = $this->pObj->splitRegularContent($content);
            // Make sure the title doesn't expose the absolute path!
            $contentArr['title'] = basename($absFile);
            $this->setLocaleForServerFileSystem(TRUE);
            break;
        case 'html':
        case 'htm':
            $fileContent = GeneralUtility::getUrl($absFile);
            $fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
            $contentArr = $this->pObj->splitHTMLContent($fileContent);
            break;
        case 'xml':
            $this->setLocaleForServerFileSystem();
            // PHP strip-tags()
            $fileContent = GeneralUtility::getUrl($absFile);
            // Finding charset in the XML declaration (looked for in the first 200 bytes only):
            $reg = array();
            preg_match('/^[[:space:]]*<\\?xml[^>]+encoding[[:space:]]*=[[:space:]]*["\'][[:space:]]*([[:alnum:]_-]+)[[:space:]]*["\']/i', substr($fileContent, 0, 200), $reg);
            $charset = !empty($reg[1]) ? $this->pObj->csObj->parse_charset($reg[1]) : 'utf-8';
            // Converting content:
            $fileContent = $this->pObj->convertHTMLToUtf8(strip_tags(str_replace('<', ' <', $fileContent)), $charset);
            $contentArr = $this->pObj->splitRegularContent($fileContent);
            // Make sure the title doesn't expose the absolute path!
            $contentArr['title'] = basename($absFile);
            $this->setLocaleForServerFileSystem(TRUE);
            break;
        case 'jpg':
        case 'jpeg':
        case 'tif':
            $this->setLocaleForServerFileSystem();
            // PHP EXIF; "@" kept on purpose, files without readable EXIF data are simply indexed without comment
            $exif = function_exists('exif_read_data') ? @exif_read_data($absFile, 'IFD0') : FALSE;
            $comment = '';
            if ($exif) {
                $exifComment = isset($exif['COMMENT'][0]) ? $exif['COMMENT'][0] : '';
                $imageDescription = isset($exif['ImageDescription']) ? $exif['ImageDescription'] : '';
                $comment = trim($exifComment . ' ' . $imageDescription);
            }
            $contentArr = $this->pObj->splitRegularContent($comment);
            // Make sure the title doesn't expose the absolute path!
            $contentArr['title'] = basename($absFile);
            $this->setLocaleForServerFileSystem(TRUE);
            break;
        default:
            return FALSE;
    }
    // If no title (and why should there be...) then the file-name is set as title. This will raise the hits considerably if the search matches the document name.
    if (is_array($contentArr) && empty($contentArr['title'])) {
        // Substituting "_" for " " because many filenames may have this instead of a space char.
        $contentArr['title'] = str_replace('_', ' ', basename($absFile));
    }
    return $contentArr;
}
597
/**
 * Switches LC_CTYPE to $TYPO3_CONF_VARS['SYS']['systemLocale'] and back again.
 * Does nothing at all unless $TYPO3_CONF_VARS['SYS']['UTF8filesystem'] is set.
 *
 * Calls have to come in pairs: first with $resetLocale = FALSE (switch), then with
 * $resetLocale = TRUE (restore). The previous locale is remembered in a static local
 * variable, i.e. not per object instance.
 *
 * @staticvar string $lastLocale Stores the locale used before it is overridden by this method.
 * @param bool $resetLocale TRUE resets the locale to $lastLocale.
 * @return void
 * @throws \RuntimeException if the locale is switched twice in a row, or reset without having been switched
 */
protected function setLocaleForServerFileSystem($resetLocale = FALSE) {
    static $lastLocale = NULL;
    $utf8FileSystem = $GLOBALS['TYPO3_CONF_VARS']['SYS']['UTF8filesystem'];
    if (!$utf8FileSystem) {
        return;
    }

    if (!$resetLocale) {
        // Switch: only allowed when no previous switch is pending.
        if ($lastLocale !== NULL) {
            throw new \RuntimeException('Cannot set new locale as locale has already been changed before.', 1357064437);
        }
        // Passing 0 only queries the current setting.
        $lastLocale = setlocale(LC_CTYPE, 0);
        setlocale(LC_CTYPE, $GLOBALS['TYPO3_CONF_VARS']['SYS']['systemLocale']);
        return;
    }

    // Restore: needs a remembered locale.
    if ($lastLocale == NULL) {
        throw new \RuntimeException('Cannot reset locale to NULL.', 1357064326);
    }
    setlocale(LC_CTYPE, $lastLocale);
    $lastLocale = NULL;
}
629
/**
 * Creates an array with pointers to divisions of document.
 *
 * ONLY for PDF files at this point. All other types will have an array with a single element with the value "0" (zero)
 * coming back. The same single-zero array is returned for PDF files when the pdfinfo tool has not been
 * registered in $this->app or when it reports no pages.
 *
 * For PDF files the page count reported by pdfinfo is divided into sections according to $this->pdf_mode:
 * a positive value is the maximum number of pages per section, otherwise the absolute value is the
 * number of sections (at least one, at most one per page).
 *
 * @param string $ext File extension
 * @param string $absFile Absolute filename (must exist and be validated OK before calling function)
 * @return array Array of pointers to sections that the document should be divided into, for PDF in the form "<first page>-<last page>"
 */
public function fileContentParts($ext, $absFile) {
    $cParts = array(0);
    // Without a registered pdfinfo binary there is no command to run.
    if ((string)$ext !== 'pdf' || empty($this->app['pdfinfo'])) {
        return $cParts;
    }
    $this->setLocaleForServerFileSystem();
    // Getting pdf-info:
    $res = array();
    CommandUtility::exec($this->app['pdfinfo'] . ' ' . escapeshellarg($absFile), $res);
    $pdfInfo = $this->splitPdfInfo($res);
    $pages = isset($pdfInfo['pages']) ? (int)$pdfInfo['pages'] : 0;
    if ($pages > 0) {
        $cParts = array();
        // Calculate number of sections
        if ($this->pdf_mode > 0) {
            $iter = (int)ceil($pages / $this->pdf_mode);
        } else {
            $iter = MathUtility::forceIntegerInRange(abs($this->pdf_mode), 1, $pages);
        }
        // Traverse and create intervals. The product is calculated with integers before dividing,
        // so that section boundaries don't depend on the rounding of a fractional quotient.
        for ($a = 0; $a < $iter; $a++) {
            $low = (int)floor($a * $pages / $iter) + 1;
            $high = (int)floor(($a + 1) * $pages / $iter);
            $cParts[] = $low . '-' . $high;
        }
    }
    $this->setLocaleForServerFileSystem(TRUE);
    return $cParts;
}
671
/**
 * Turns the output lines of the pdfinfo tool into a key/value array.
 *
 * Every line is split at its first colon. The part before it, trimmed and lowercased, becomes
 * the key, the trimmed rest the value. Lines without a colon or with an empty key are skipped.
 *
 * @param array $pdfInfoArray Array of PDF content, coming from the pdfinfo tool
 * @return array Result array; empty if the input is not an array
 * @access private
 * @see fileContentParts()
 */
public function splitPdfInfo($pdfInfoArray) {
    $info = array();
    if (!is_array($pdfInfoArray)) {
        return $info;
    }
    foreach ($pdfInfoArray as $infoLine) {
        $segments = explode(':', $infoLine, 2);
        if (count($segments) < 2) {
            continue;
        }
        $key = trim($segments[0]);
        if (!$key) {
            continue;
        }
        $info[strtolower($key)] = trim($segments[1]);
    }
    return $info;
}
692
/**
 * Removes line breaks and form feed characters (chr(12)) that tend to occur at the end of
 * strings coming from external files, then trims the result.
 *
 * @param string $string String to clean up
 * @return string String
 */
public function removeEndJunk($string) {
    $junkCharacters = LF . chr(12);
    $withoutJunk = preg_replace('/[' . $junkCharacters . ']*$/', '', $string);
    return trim($withoutJunk);
}
702
703 /************************
704 *
705 * Backend analyzer
706 *
707 ************************/
/**
 * Return icon for file extension
 *
 * "htm" shares the icon of "html" and "jpeg" the icon of "jpg"; every other extension
 * is used as given to build the file name.
 *
 * @param string $extension File extension, lowercase.
 * @return string Relative file reference, resolvable by GeneralUtility::getFileAbsFileName()
 */
public function getIcon($extension) {
    $iconAliases = array(
        'htm' => 'html',
        'jpeg' => 'jpg'
    );
    $iconName = $extension;
    if (is_string($extension) && isset($iconAliases[$extension])) {
        $iconName = $iconAliases[$extension];
    }
    return 'EXT:indexed_search/pi/res/' . $iconName . '.gif';
}
722
723 }