[BUGFIX] Remove "Extbase & fluid" suffix from indexed search
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / Classes / FileContentParser.php
1 <?php
2 namespace TYPO3\CMS\IndexedSearch;
3
4 /*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use TYPO3\CMS\Core\Configuration\ExtensionConfiguration;
18 use TYPO3\CMS\Core\Core\Environment;
19 use TYPO3\CMS\Core\Utility\CommandUtility;
20 use TYPO3\CMS\Core\Utility\GeneralUtility;
21 use TYPO3\CMS\Core\Utility\MathUtility;
22 use TYPO3\CMS\Core\Utility\PathUtility;
23
24 /**
25 * External standard parsers for indexed_search
26 * MUST RETURN utf-8 content!
27 */
28 class FileContentParser
29 {
30 /**
31 * This value is also overridden from config.
32 * Zero: the whole PDF file is indexed in one go. Positive value: number of pages indexed at a time, e.g. "5" would mean 1-5, 6-10, ...
33 * Negative value: its absolute value is the number of groups, e.g. "-3" for a document of 10 pages would give 1-3, 4-6, 7-10
34 *
35 * @var int
36 */
37 public $pdf_mode = -20;
38
39 /**
40 * @var array
41 */
42 public $app = [];
43
44 /**
45 * @var array
46 */
47 public $ext2itemtype_map = [];
48
49 /**
50 * @var array
51 */
52 public $supportedExtensions = [];
53
54 /**
55 * @var \TYPO3\CMS\IndexedSearch\Indexer
56 */
57 public $pObj;
58
59 /**
60 * @var \TYPO3\CMS\Core\Localization\LanguageService|\TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController
61 */
62 protected $langObject;
63
64 /**
65 * @var string Backup for setLocaleForServerFileSystem()
66 */
67 protected $lastLocale;
68
/**
 * Creates the parser and selects the object used to resolve language labels.
 */
public function __construct()
{
    // In frontend context (TYPO3_MODE "FE") labels are resolved through $GLOBALS['TSFE'],
    // in every other context through $GLOBALS['LANG'].
    if (TYPO3_MODE === 'FE') {
        $this->langObject = $GLOBALS['TSFE'];
    } else {
        $this->langObject = $GLOBALS['LANG'];
    }
}
77
/**
 * Initializes the external parser for one file extension.
 *
 * Reads the "indexed_search" extension configuration, rejects extensions listed in its
 * "ignoreExtensions" setting and, for formats that need an external program, verifies that
 * the configured tool directory contains the required binaries. On success the extension is
 * registered in $this->supportedExtensions and $this->ext2itemtype_map.
 *
 * @param string $extension File extension
 * @return bool Returns TRUE if extension is supported/enabled, otherwise FALSE.
 */
public function initParser($extension)
{
    $extension = (string)$extension;
    $indexerConfig = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get('indexed_search');

    // Ignored extensions: the list is optional, so read it without raising a notice and
    // compare strictly to avoid numeric-string coincidences such as "1e1" == "10".
    $ignoreList = isset($indexerConfig['ignoreExtensions']) ? (string)$indexerConfig['ignoreExtensions'] : '';
    $ignoreExtensions = GeneralUtility::trimExplode(',', strtolower($ignoreList), true);
    if (in_array($extension, $ignoreExtensions, true)) {
        $this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:ignoreExtensions'), $extension), 1);
        return false;
    }

    $extOK = false;
    // Common "item_type" for extensions that are aliases of each other (empty: use the extension itself)
    $mainExtension = '';
    switch ($extension) {
        case 'pdf':
            // PDF: needs both pdfinfo and pdftotext
            $extOK = $this->registerExternalTools($indexerConfig, 'pdftools', ['pdfinfo', 'pdftotext'], 'pdfTools');
            if ($extOK) {
                $configuredPdfMode = isset($indexerConfig['pdf_mode']) ? $indexerConfig['pdf_mode'] : 0;
                $this->pdf_mode = MathUtility::forceIntegerInRange($configuredPdfMode, -100, 100);
            }
            break;
        case 'doc':
            // MS Word via catdoc
            $extOK = $this->registerExternalTools($indexerConfig, 'catdoc', ['catdoc'], 'catdoc');
            break;
        case 'pps':
        case 'ppt':
            // MS PowerPoint via ppthtml
            $extOK = $this->registerExternalTools($indexerConfig, 'ppthtml', ['ppthtml'], 'ppthtml');
            break;
        case 'xls':
            // MS Excel via xlhtml
            $extOK = $this->registerExternalTools($indexerConfig, 'xlhtml', ['xlhtml'], 'xlhtml');
            break;
        case 'docx': // Microsoft Word >= 2007
        case 'dotx':
        case 'pptx': // Microsoft PowerPoint >= 2007
        case 'ppsx':
        case 'potx':
        case 'xlsx': // Microsoft Excel >= 2007
        case 'xltx':
        case 'sxc': // OpenOffice / OpenDocument formats
        case 'sxi':
        case 'sxw':
        case 'ods':
        case 'odp':
        case 'odt':
            // All of these are ZIP containers and are read with unzip
            $extOK = $this->registerExternalTools($indexerConfig, 'unzip', ['unzip'], 'unzip');
            break;
        case 'rtf':
            // Rich text via unrtf
            $extOK = $this->registerExternalTools($indexerConfig, 'unrtf', ['unrtf'], 'unrtf');
            break;
        case 'txt':
        case 'csv':
        case 'xml':
        case 'tif':
            // Handled by PHP itself, no external tool required
            $extOK = true;
            break;
        case 'html':
        case 'htm':
            // Handled by PHP itself; "html" is the common item_type
            $extOK = true;
            $mainExtension = 'html';
            break;
        case 'jpg':
        case 'jpeg':
            // Handled by PHP itself; "jpeg" is the common item_type
            $extOK = true;
            $mainExtension = 'jpeg';
            break;
    }

    if (!$extOK) {
        return false;
    }
    $this->supportedExtensions[$extension] = true;
    $this->ext2itemtype_map[$extension] = $mainExtension ?: $extension;
    return true;
}

/**
 * Checks that an external tool is configured and present and stores its binaries in $this->app.
 *
 * Nothing is written to $this->app unless every requested binary exists. Problems are reported
 * through $this->pObj->log_setTSlogMessage() with the label "<labelPrefix>Disabled" (second
 * argument 1) when no directory is configured, or "<labelPrefix>NotFound" (second argument 3)
 * when a binary is missing.
 *
 * @param array $indexerConfig Extension configuration of indexed_search
 * @param string $configKey Configuration key holding the directory of the tool
 * @param string[] $binaries Names of the binaries expected in that directory (without ".exe")
 * @param string $labelPrefix Prefix of the "...Disabled" / "...NotFound" label keys in locallang_main.xlf
 * @return bool TRUE if all binaries were found and registered
 */
private function registerExternalTools($indexerConfig, $configKey, array $binaries, $labelPrefix)
{
    $labelFile = 'LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:';
    if (empty($indexerConfig[$configKey])) {
        $this->pObj->log_setTSlogMessage($this->sL($labelFile . $labelPrefix . 'Disabled'), 1);
        return false;
    }

    // If windows, apply extension to tool name:
    $exe = Environment::isWindows() ? '.exe' : '';
    $toolPath = rtrim($indexerConfig[$configKey], '/') . '/';
    $resolvedBinaries = [];
    foreach ($binaries as $binary) {
        $binaryFile = $toolPath . $binary . $exe;
        if (!@is_file($binaryFile)) {
            $this->pObj->log_setTSlogMessage(sprintf($this->sL($labelFile . $labelPrefix . 'NotFound'), $toolPath), 3);
            return false;
        }
        $resolvedBinaries[$binary] = $binaryFile;
    }
    foreach ($resolvedBinaries as $binary => $binaryFile) {
        $this->app[$binary] = $binaryFile;
    }
    return true;
}
245
/**
 * Initialize external parser for backend modules
 *
 * Does not evaluate whether the parser is configured correctly; it only tells whether the
 * extension is one of the POSSIBLY supported ones (for showing icons etc. in backend and
 * frontend plugin).
 *
 * @param string $extension File extension to initialize for.
 * @return bool Returns TRUE if the extension is one of the known extensions, otherwise FALSE.
 */
public function softInit($extension)
{
    static $knownExtensions = [
        'pdf',
        'doc', 'docx', 'dotx',
        'pps', 'ppsx', 'ppt', 'pptx', 'potx',
        'xls', 'xlsx', 'xltx',
        'sxc', 'sxi', 'sxw',
        'ods', 'odp', 'odt',
        'rtf', 'txt',
        'html', 'htm',
        'csv', 'xml',
        'jpg', 'jpeg', 'tif',
    ];
    // Cast and compare strictly: a loose comparison would let values such as TRUE match 'pdf'.
    return in_array((string)$extension, $knownExtensions, true);
}
288
/**
 * Return title of entry in media type selector box.
 *
 * @param string $extension File extension
 * @return string|false Label of the entry in the media type search selector box (frontend plugin);
 *                      empty string if the extension is unknown or its external tool is not
 *                      configured; FALSE if the extension is listed in "ignoreExtensions".
 */
public function searchTypeMediaTitle($extension)
{
    $extension = (string)$extension;
    // Read indexer-config
    $indexerConfig = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get('indexed_search');

    // Ignore extensions (optional setting, therefore read defensively and compared strictly)
    $ignoreList = isset($indexerConfig['ignoreExtensions']) ? (string)$indexerConfig['ignoreExtensions'] : '';
    $ignoreExtensions = GeneralUtility::trimExplode(',', strtolower($ignoreList), true);
    if (in_array($extension, $ignoreExtensions, true)) {
        return false;
    }

    // extension => [configuration key of the required external tool ('' if PHP handles it), label suffix]
    $mediaTypes = [
        'pdf' => ['pdftools', 'PDF'],
        'doc' => ['catdoc', 'DOC'],
        'pps' => ['ppthtml', 'PP'],
        'ppt' => ['ppthtml', 'PP'],
        'xls' => ['xlhtml', 'XLS'],
        // Microsoft Office >= 2007
        'docx' => ['unzip', 'DOC'],
        'dotx' => ['unzip', 'DOC'],
        'pptx' => ['unzip', 'PP'],
        'ppsx' => ['unzip', 'PP'],
        'potx' => ['unzip', 'PP'],
        'xlsx' => ['unzip', 'XLS'],
        'xltx' => ['unzip', 'XLS'],
        // Open Office Calc / Impress / Writer
        'sxc' => ['unzip', 'SXC'],
        'sxi' => ['unzip', 'SXI'],
        'sxw' => ['unzip', 'SXW'],
        // Oasis OpenDocument Spreadsheet / Presentation / Text
        'ods' => ['unzip', 'ODS'],
        'odp' => ['unzip', 'ODP'],
        'odt' => ['unzip', 'ODT'],
        'rtf' => ['unrtf', 'RTF'],
        // Handled by PHP itself
        'jpeg' => ['', 'Images'],
        'jpg' => ['', 'Images'],
        'tif' => ['', 'Images'],
        'html' => ['', 'HTML'],
        'htm' => ['', 'HTML'],
        'txt' => ['', 'TXT'],
        'csv' => ['', 'CSV'],
        'xml' => ['', 'XML'],
    ];
    if (!isset($mediaTypes[$extension])) {
        return '';
    }

    list($requiredTool, $labelSuffix) = $mediaTypes[$extension];
    // A media type backed by an external tool is only offered when that tool is configured.
    if ($requiredTool !== '' && empty($indexerConfig[$requiredTool])) {
        return '';
    }
    return sprintf($this->sL('LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:extension.' . $labelSuffix), $extension);
}
418
/**
 * Tells whether files of the given extension (item_type) may be split into several page ranges.
 *
 * Only "pdf" qualifies.
 *
 * @param string $extension Extension / item_type string
 * @return bool TRUE for a potentially multi-page extension, otherwise FALSE
 */
public function isMultiplePageExtension($extension)
{
    return (string)$extension === 'pdf';
}
434
/**
 * Resolves a label reference by delegating to the sL() method of the language object.
 *
 * @param string $reference Reference/key of the label
 * @return string The label of the reference/key to be fetched
 */
protected function sL($reference)
{
    $label = $this->langObject->sL($reference);
    return $label;
}
445
446 /************************
447 *
448 * Reading documents (for parsing)
449 *
450 ************************/
/**
 * Reads the content of an external file being indexed.
 *
 * The extension must have been registered in $this->supportedExtensions (done by initParser())
 * and, for formats read through an external program, the program must be present in $this->app.
 *
 * @param string $ext File extension, eg. "pdf", "doc" etc.
 * @param string $absFile Absolute filename of file (must exist and be validated OK before calling function)
 * @param string $cPKey Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be split, e.g. "1-4")
 * @return array|false|null Standard content array (title, description, keywords, body keys);
 *                          FALSE if the extension is not supported; NULL if nothing was read
 *                          (e.g. the external tool is not registered)
 */
public function readFileContent($ext, $absFile, $cPKey)
{
    $contentArr = null;
    // Return immediately if initialization didn't set support up:
    if (empty($this->supportedExtensions[$ext])) {
        return false;
    }
    // Reads the first value of a tag from an xml2tree() child list without raising notices
    // when the tag is missing; yields an empty string in that case.
    $readMetaValue = static function ($nodes, $tag) {
        return isset($nodes[$tag][0]['values'][0]) ? $nodes[$tag][0]['values'][0] : '';
    };
    // Switch by file extension
    switch ($ext) {
        case 'pdf':
            if (!empty($this->app['pdfinfo'])) {
                $this->setLocaleForServerFileSystem();
                // Getting pdf-info:
                $cmd = $this->app['pdfinfo'] . ' ' . escapeshellarg($absFile);
                CommandUtility::exec($cmd, $res);
                $pdfInfo = $this->splitPdfInfo($res);
                unset($res);
                if (isset($pdfInfo['pages']) && (int)$pdfInfo['pages']) {
                    // The page range ends up in a shell command: force both bounds to integers so
                    // that nothing but numbers can be injected and a key without "-" still yields
                    // a well-formed command line.
                    $pageRange = explode('-', (string)$cPKey, 2);
                    $low = (int)$pageRange[0];
                    $high = isset($pageRange[1]) ? (int)$pageRange[1] : 0;
                    // Create temporary name
                    $tempFileName = GeneralUtility::tempnam('Typo3_indexer');
                    // Delete if exists, just to be safe.
                    @unlink($tempFileName);
                    // Get pdf content:
                    $cmd = $this->app['pdftotext'] . ' -f ' . $low . ' -l ' . $high . ' -enc UTF-8 -q ' . escapeshellarg($absFile) . ' ' . escapeshellarg($tempFileName);
                    CommandUtility::exec($cmd);
                    if (@is_file($tempFileName)) {
                        $content = file_get_contents($tempFileName);
                        unlink($tempFileName);
                    } else {
                        $content = '';
                        $this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:pdfToolsFailed'), $absFile), 2);
                    }
                    if ((string)$content !== '') {
                        $contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
                    }
                }
                if (!empty($pdfInfo['title'])) {
                    $contentArr['title'] = $pdfInfo['title'];
                }
                $this->setLocaleForServerFileSystem(true);
            }
            break;
        case 'doc':
            if (!empty($this->app['catdoc'])) {
                $this->setLocaleForServerFileSystem();
                $cmd = $this->app['catdoc'] . ' -d utf-8 ' . escapeshellarg($absFile);
                CommandUtility::exec($cmd, $res);
                $content = implode(LF, $res);
                unset($res);
                $contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
                $this->setLocaleForServerFileSystem(true);
            }
            break;
        case 'pps':
        case 'ppt':
            if (!empty($this->app['ppthtml'])) {
                $this->setLocaleForServerFileSystem();
                $cmd = $this->app['ppthtml'] . ' ' . escapeshellarg($absFile);
                CommandUtility::exec($cmd, $res);
                $content = implode(LF, $res);
                unset($res);
                $content = $this->pObj->convertHTMLToUtf8($content);
                $contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
                // Make sure the title doesn't expose the absolute path!
                $contentArr['title'] = PathUtility::basename($absFile);
                $this->setLocaleForServerFileSystem(true);
            }
            break;
        case 'xls':
            if (!empty($this->app['xlhtml'])) {
                $this->setLocaleForServerFileSystem();
                $cmd = $this->app['xlhtml'] . ' -nc -te ' . escapeshellarg($absFile);
                CommandUtility::exec($cmd, $res);
                $content = implode(LF, $res);
                unset($res);
                $content = $this->pObj->convertHTMLToUtf8($content);
                $contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
                // Make sure the title doesn't expose the absolute path!
                $contentArr['title'] = PathUtility::basename($absFile);
                $this->setLocaleForServerFileSystem(true);
            }
            break;
        case 'docx':
        case 'dotx':
        case 'pptx':
        case 'ppsx':
        case 'potx':
        case 'xlsx':
        case 'xltx':
            if (!empty($this->app['unzip'])) {
                $this->setLocaleForServerFileSystem();
                switch ($ext) {
                    case 'docx':
                    case 'dotx':
                        // Read document.xml:
                        $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' word/document.xml';
                        break;
                    case 'ppsx':
                    case 'pptx':
                    case 'potx':
                        // Read slide1.xml (only the first slide is read):
                        $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' ppt/slides/slide1.xml';
                        break;
                    case 'xlsx':
                    case 'xltx':
                        // Read sheet1.xml (only the first sheet is read):
                        $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' xl/worksheets/sheet1.xml';
                        break;
                }
                CommandUtility::exec($cmd, $res);
                $content_xml = implode(LF, $res);
                unset($res);
                // A space is put in front of every tag so that words separated only by markup
                // do not run together once the tags are stripped.
                $utf8_content = trim(strip_tags(str_replace('<', ' <', $content_xml)));
                $contentArr = $this->pObj->splitRegularContent($utf8_content);
                // Make sure the title doesn't expose the absolute path!
                $contentArr['title'] = PathUtility::basename($absFile);
                // Meta information
                $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' docProps/core.xml';
                CommandUtility::exec($cmd, $res);
                $meta_xml = implode(LF, $res);
                unset($res);
                $metaContent = GeneralUtility::xml2tree($meta_xml);
                if (is_array($metaContent)) {
                    $coreProperties = isset($metaContent['cp:coreProperties'][0]['ch']) && is_array($metaContent['cp:coreProperties'][0]['ch'])
                        ? $metaContent['cp:coreProperties'][0]['ch']
                        : [];
                    $contentArr['title'] .= ' ' . $readMetaValue($coreProperties, 'dc:title');
                    $contentArr['description'] = $readMetaValue($coreProperties, 'dc:subject')
                        . ' ' . $readMetaValue($coreProperties, 'dc:description');
                    $contentArr['keywords'] = $readMetaValue($coreProperties, 'cp:keywords');
                }
                $this->setLocaleForServerFileSystem(true);
            }
            break;
        case 'sxi':
        case 'sxc':
        case 'sxw':
        case 'ods':
        case 'odp':
        case 'odt':
            if (!empty($this->app['unzip'])) {
                $this->setLocaleForServerFileSystem();
                // Read content.xml:
                $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' content.xml';
                CommandUtility::exec($cmd, $res);
                $content_xml = implode(LF, $res);
                unset($res);
                // Read meta.xml:
                $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' meta.xml';
                CommandUtility::exec($cmd, $res);
                $meta_xml = implode(LF, $res);
                unset($res);
                $utf8_content = trim(strip_tags(str_replace('<', ' <', $content_xml)));
                $contentArr = $this->pObj->splitRegularContent($utf8_content);
                // Make sure the title doesn't expose the absolute path!
                $contentArr['title'] = PathUtility::basename($absFile);
                // Meta information; the parsed tree is only descended into when it really is an
                // array with the expected nodes (the result is checked with is_array() because
                // parsing may not yield a tree).
                $metaTree = GeneralUtility::xml2tree($meta_xml);
                $metaContent = is_array($metaTree) && isset($metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'])
                    ? $metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch']
                    : null;
                if (is_array($metaContent)) {
                    $documentTitle = $readMetaValue($metaContent, 'dc:title');
                    if ($documentTitle) {
                        $contentArr['title'] = $documentTitle;
                    }
                    $contentArr['description'] = $readMetaValue($metaContent, 'dc:subject')
                        . ' ' . $readMetaValue($metaContent, 'dc:description');
                    // Keywords collected:
                    if (isset($metaContent['meta:keywords'][0]['ch']['meta:keyword'])
                        && is_array($metaContent['meta:keywords'][0]['ch']['meta:keyword'])
                    ) {
                        foreach ($metaContent['meta:keywords'][0]['ch']['meta:keyword'] as $kwDat) {
                            $keyword = isset($kwDat['values'][0]) ? $kwDat['values'][0] : '';
                            $collectedKeywords = isset($contentArr['keywords']) ? $contentArr['keywords'] : '';
                            $contentArr['keywords'] = $collectedKeywords . $keyword . ' ';
                        }
                    }
                }
                $this->setLocaleForServerFileSystem(true);
            }
            break;
        case 'rtf':
            if (!empty($this->app['unrtf'])) {
                $this->setLocaleForServerFileSystem();
                $cmd = $this->app['unrtf'] . ' ' . escapeshellarg($absFile);
                CommandUtility::exec($cmd, $res);
                $fileContent = implode(LF, $res);
                unset($res);
                $fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
                $contentArr = $this->pObj->splitHTMLContent($fileContent);
                $this->setLocaleForServerFileSystem(true);
            }
            break;
        case 'txt':
        case 'csv':
            $this->setLocaleForServerFileSystem();
            // Raw text
            $content = GeneralUtility::getUrl($absFile);
            // @todo Implement auto detection of charset (currently assuming utf-8)
            $contentCharset = 'utf-8';
            $content = $this->pObj->convertHTMLToUtf8($content, $contentCharset);
            $contentArr = $this->pObj->splitRegularContent($content);
            // Make sure the title doesn't expose the absolute path!
            $contentArr['title'] = PathUtility::basename($absFile);
            $this->setLocaleForServerFileSystem(true);
            break;
        case 'html':
        case 'htm':
            $fileContent = GeneralUtility::getUrl($absFile);
            $fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
            $contentArr = $this->pObj->splitHTMLContent($fileContent);
            break;
        case 'xml':
            $this->setLocaleForServerFileSystem();
            // PHP strip-tags()
            $fileContent = GeneralUtility::getUrl($absFile);
            // Finding charset in the XML declaration (searched in the first 200 bytes only);
            // utf-8 is assumed when no encoding is declared.
            $reg = [];
            preg_match('/^[[:space:]]*<\\?xml[^>]+encoding[[:space:]]*=[[:space:]]*["\'][[:space:]]*([[:alnum:]_-]+)[[:space:]]*["\']/i', substr($fileContent, 0, 200), $reg);
            $charset = !empty($reg[1]) ? trim(strtolower($reg[1])) : 'utf-8';
            // Converting content:
            $fileContent = $this->pObj->convertHTMLToUtf8(strip_tags(str_replace('<', ' <', $fileContent)), $charset);
            $contentArr = $this->pObj->splitRegularContent($fileContent);
            // Make sure the title doesn't expose the absolute path!
            $contentArr['title'] = PathUtility::basename($absFile);
            $this->setLocaleForServerFileSystem(true);
            break;
        case 'jpg':
        case 'jpeg':
        case 'tif':
            $this->setLocaleForServerFileSystem();
            // PHP EXIF (optional PHP extension, hence the function_exists() check)
            if (function_exists('exif_read_data')) {
                $exif = @exif_read_data($absFile, 'IFD0');
            } else {
                $exif = false;
            }
            if ($exif) {
                // Either entry may be absent from the EXIF data
                $exifComment = isset($exif['COMMENT'][0]) ? $exif['COMMENT'][0] : '';
                $imageDescription = isset($exif['ImageDescription']) ? $exif['ImageDescription'] : '';
                $comment = trim($exifComment . ' ' . $imageDescription);
            } else {
                $comment = '';
            }
            $contentArr = $this->pObj->splitRegularContent($comment);
            // Make sure the title doesn't expose the absolute path!
            $contentArr['title'] = PathUtility::basename($absFile);
            $this->setLocaleForServerFileSystem(true);
            break;
        default:
            return false;
    }
    // If no title (and why should there be...) then the file-name is set as title. This will raise the hits considerably if the search matches the document name.
    if (is_array($contentArr) && empty($contentArr['title'])) {
        // Substituting "_" for " " because many filenames may have this instead of a space char.
        $contentArr['title'] = str_replace('_', ' ', PathUtility::basename($absFile));
    }
    return $contentArr;
}
702
/**
 * Switches LC_CTYPE to $TYPO3_CONF_VARS['SYS']['systemLocale'] or restores the previous locale.
 *
 * Does nothing unless $TYPO3_CONF_VARS['SYS']['UTF8filesystem'] is set. The locale that was
 * active before the switch is kept in $this->lastLocale, so calls have to alternate strictly
 * between $resetLocale = FALSE (switch) and $resetLocale = TRUE (restore).
 *
 * @param bool $resetLocale TRUE restores the locale stored in $this->lastLocale.
 * @throws \RuntimeException If a restore is requested without a stored locale, or a switch is requested while one is still stored
 */
protected function setLocaleForServerFileSystem($resetLocale = false)
{
    if (!$GLOBALS['TYPO3_CONF_VARS']['SYS']['UTF8filesystem']) {
        return;
    }

    if (!$resetLocale) {
        // Switch: remember the current locale first, then activate the system locale.
        if ($this->lastLocale !== null) {
            throw new \RuntimeException('Cannot set new locale as locale has already been changed before.', 1357064437);
        }
        $this->lastLocale = setlocale(LC_CTYPE, 0);
        setlocale(LC_CTYPE, $GLOBALS['TYPO3_CONF_VARS']['SYS']['systemLocale']);
        return;
    }

    // Restore: only possible if a previous switch stored a locale.
    if ($this->lastLocale == null) {
        throw new \RuntimeException('Cannot reset locale to NULL.', 1357064326);
    }
    setlocale(LC_CTYPE, $this->lastLocale);
    $this->lastLocale = null;
}
733
/**
 * Creates an array with pointers to divisions of document.
 *
 * ONLY for PDF files at this point. All other types will have an array with a single element
 * with the value "0" (zero) coming back. The same single-element array is returned for a PDF
 * when the pdfinfo tool is not registered in $this->app or reports no pages.
 *
 * $this->pdf_mode controls the split: a positive value is the number of pages per part,
 * otherwise its absolute value (at least 1, at most the page count) is the number of parts.
 *
 * @param string $ext File extension
 * @param string $absFile Absolute filename (must exist and be validated OK before calling function)
 * @return array Array of pointers to sections that the document should be divided into, e.g. ["1-5", "6-10"]
 */
public function fileContentParts($ext, $absFile)
{
    $cParts = [0];
    if ((string)$ext !== 'pdf') {
        return $cParts;
    }
    // Without a registered pdfinfo binary the command line would consist of the file path
    // alone, so do not run anything in that case.
    if (empty($this->app['pdfinfo'])) {
        return $cParts;
    }

    $this->setLocaleForServerFileSystem();
    // Getting pdf-info:
    $cmd = $this->app['pdfinfo'] . ' ' . escapeshellarg($absFile);
    CommandUtility::exec($cmd, $res);
    $pdfInfo = $this->splitPdfInfo($res);
    unset($res);
    $pageCount = isset($pdfInfo['pages']) ? (int)$pdfInfo['pages'] : 0;
    if ($pageCount > 0) {
        $cParts = [];
        // Calculate mode
        if ($this->pdf_mode > 0) {
            $iter = (int)ceil($pageCount / $this->pdf_mode);
        } else {
            $iter = MathUtility::forceIntegerInRange(abs($this->pdf_mode), 1, $pageCount);
        }
        // Traverse and create intervals. The product is formed before dividing so that the
        // numerator is an exact integer and float rounding cannot move a boundary by one page.
        for ($a = 0; $a < $iter; $a++) {
            $low = (int)floor(($a * $pageCount) / $iter) + 1;
            $high = (int)floor((($a + 1) * $pageCount) / $iter);
            $cParts[] = $low . '-' . $high;
        }
    }
    $this->setLocaleForServerFileSystem(true);
    return $cParts;
}
776
/**
 * Turns the output lines of the pdfinfo tool into a key/value array.
 *
 * Each line is split at its first colon; the part before it (trimmed, lowercased) becomes the
 * key and the rest (trimmed) the value. Lines without a colon or with an empty key are skipped.
 *
 * @param array $pdfInfoArray Array of PDF content, coming from the pdfinfo tool
 * @return array Result array; empty if the input is not an array
 * @internal
 * @see fileContentParts()
 */
public function splitPdfInfo($pdfInfoArray)
{
    if (!is_array($pdfInfoArray)) {
        return [];
    }

    $info = [];
    foreach ($pdfInfoArray as $row) {
        $segments = explode(':', $row, 2);
        if (count($segments) < 2) {
            continue;
        }
        $key = trim($segments[0]);
        if (!$key) {
            continue;
        }
        $info[strtolower($key)] = trim($segments[1]);
    }
    return $info;
}
798
/**
 * Removes the char(12) (form feed) characters and line breaks that tend to occur at the end of
 * strings coming from external files, then trims the result.
 *
 * @param string $string String to clean up
 * @return string Cleaned string
 */
public function removeEndJunk($string)
{
    $trailingJunkPattern = '/[' . LF . chr(12) . ']*$/';
    $withoutJunk = preg_replace($trailingJunkPattern, '', $string);
    return trim($withoutJunk);
}
809
810 /************************
811 *
812 * Backend analyzer
813 *
814 ************************/
/**
 * Builds the icon file reference for a file extension.
 *
 * "htm" shares the icon of "html" and "jpeg" the icon of "jpg"; every other extension is used
 * as the icon name unchanged.
 *
 * @param string $extension File extension, lowercase.
 * @return string Relative file reference, resolvable by GeneralUtility::getFileAbsFileName()
 */
public function getIcon($extension)
{
    $sharedIcons = ['htm' => 'html', 'jpeg' => 'jpg'];
    if (is_string($extension) && isset($sharedIcons[$extension])) {
        $extension = $sharedIcons[$extension];
    }
    return 'EXT:indexed_search/Resources/Public/Icons/FileTypes/' . $extension . '.gif';
}
830 }