[TASK] Replace TYPO3_OS constant with Environment check
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / Classes / FileContentParser.php
1 <?php
2 namespace TYPO3\CMS\IndexedSearch;
3
4 /*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use TYPO3\CMS\Core\Configuration\ExtensionConfiguration;
18 use TYPO3\CMS\Core\Core\Environment;
19 use TYPO3\CMS\Core\Utility\CommandUtility;
20 use TYPO3\CMS\Core\Utility\GeneralUtility;
21 use TYPO3\CMS\Core\Utility\MathUtility;
22
/**
 * External standard parsers for indexed_search.
 *
 * Resolves the external command line tools configured for the "indexed_search"
 * extension (pdftotext/pdfinfo, catdoc, ppthtml, xlhtml, unzip, unrtf) and uses
 * them, or plain PHP functions, to extract indexable text from files.
 *
 * All content returned by this class MUST be UTF-8 encoded.
 */
class FileContentParser
{
    /**
     * How PDF files are split into separately indexed parts.
     * This value is also overridden from the extension configuration ("pdf_mode").
     *
     * - zero: the whole PDF file is indexed in one part.
     * - positive value: maximum number of pages per part, e.g. "5" splits a
     *   10-page document into 1-5,6-10.
     * - negative value: the absolute value is the number of parts, e.g. "-3"
     *   splits a 10-page document into 1-3,4-6,7-10.
     *
     * @var int
     */
    public $pdf_mode = -20;

    /**
     * Paths of the external tools found by initParser(), keyed by tool name
     * (e.g. "pdfinfo", "pdftotext", "catdoc", "unzip").
     *
     * @var array
     */
    public $app = [];

    /**
     * Maps each initialized file extension to its item type
     * (e.g. "htm" => "html", "jpg" => "jpeg", "pdf" => "pdf").
     *
     * @var array
     */
    public $ext2itemtype_map = [];

    /**
     * File extensions successfully initialized by initParser(), as extension => TRUE.
     *
     * @var array
     */
    public $supportedExtensions = [];

    /**
     * Indexer used for logging and for splitting/converting extracted content.
     *
     * @var \TYPO3\CMS\IndexedSearch\Indexer
     */
    public $pObj;

    /**
     * @var \TYPO3\CMS\Core\Localization\LanguageService|\TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController
     */
    protected $langObject;

    /**
     * Constructs this external parsers object
     */
    public function __construct()
    {
        // Set the language object to be used accordant to current TYPO3_MODE:
        $this->langObject = TYPO3_MODE === 'FE' ? $GLOBALS['TSFE'] : $GLOBALS['LANG'];
    }

    /**
     * Initialize external parser for parsing content.
     *
     * Checks the extension against the configured ignore list, verifies that the
     * external tool(s) needed for the format are configured and present, and on
     * success registers the extension in $supportedExtensions / $ext2itemtype_map.
     * Problems are reported through the indexer's TS log.
     *
     * @param string $extension File extension
     * @return bool Returns TRUE if extension is supported/enabled, otherwise FALSE.
     */
    public function initParser($extension)
    {
        $indexerConfig = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get('indexed_search');
        if ($this->isIgnoredExtension($extension, $indexerConfig)) {
            $this->pObj->log_setTSlogMessage(sprintf($this->label('ignoreExtensions'), $extension), 1);
            return false;
        }

        $extOK = false;
        // Common item type for extensions that are aliases of each other ("htm" -> "html").
        $mainExtension = '';
        switch ($extension) {
            case 'pdf':
                $extOK = $this->registerTools($indexerConfig, 'pdftools', ['pdfinfo', 'pdftotext'], 'pdfTools');
                if ($extOK) {
                    $configuredMode = isset($indexerConfig['pdf_mode']) ? $indexerConfig['pdf_mode'] : 0;
                    $this->pdf_mode = MathUtility::forceIntegerInRange($configuredMode, -100, 100);
                }
                break;
            case 'doc':
                $extOK = $this->registerTools($indexerConfig, 'catdoc', ['catdoc'], 'catdoc');
                break;
            case 'pps':
            case 'ppt':
                // MS PowerPoint
                $extOK = $this->registerTools($indexerConfig, 'ppthtml', ['ppthtml'], 'ppthtml');
                break;
            case 'xls':
                // MS Excel
                $extOK = $this->registerTools($indexerConfig, 'xlhtml', ['xlhtml'], 'xlhtml');
                break;
            case 'docx': // Microsoft Word >= 2007
            case 'dotx':
            case 'pptx': // Microsoft PowerPoint >= 2007
            case 'ppsx':
            case 'potx':
            case 'xlsx': // Microsoft Excel >= 2007
            case 'xltx':
            case 'sxc': // OpenOffice.org 1.x
            case 'sxi':
            case 'sxw':
            case 'ods': // Oasis OpenDocument
            case 'odp':
            case 'odt':
                // All of these are zip containers holding XML files.
                $extOK = $this->registerTools($indexerConfig, 'unzip', ['unzip'], 'unzip');
                break;
            case 'rtf':
                $extOK = $this->registerTools($indexerConfig, 'unrtf', ['unrtf'], 'unrtf');
                break;
            case 'txt':
            case 'csv':
            case 'xml':
            case 'tif':
                // Handled by PHP itself, no external tool needed.
                $extOK = true;
                break;
            case 'html':
            case 'htm':
                // PHP strip-tags(); "html" is the common item type
                $extOK = true;
                $mainExtension = 'html';
                break;
            case 'jpg':
            case 'jpeg':
                // PHP EXIF; "jpeg" is the common item type
                $extOK = true;
                $mainExtension = 'jpeg';
                break;
        }

        if (!$extOK) {
            return false;
        }
        $this->supportedExtensions[$extension] = true;
        $this->ext2itemtype_map[$extension] = $mainExtension ?: $extension;
        return true;
    }

    /**
     * Initialize external parser for backend modules
     * Doesn't evaluate if parser is configured right - more like returning POSSIBLE supported extensions (for showing icons etc) in backend and frontend plugin
     *
     * @param string $extension File extension to initialize for.
     * @return bool Returns TRUE if the extension is one this class knows how to handle, otherwise FALSE.
     */
    public function softInit($extension)
    {
        $knownExtensions = [
            'pdf',
            'doc', 'docx', 'dotx',
            'pps', 'ppsx', 'ppt', 'pptx', 'potx',
            'xls', 'xlsx', 'xltx',
            'sxc', 'sxi', 'sxw',
            'ods', 'odp', 'odt',
            'rtf', 'txt', 'html', 'htm', 'csv', 'xml',
            'jpg', 'jpeg', 'tif',
        ];
        return in_array((string)$extension, $knownExtensions, true);
    }

    /**
     * Return title of entry in media type selector box.
     *
     * @param string $extension File extension
     * @return string|bool Label of the entry in the media type search selector box (frontend plugin);
     *                     an empty string if the format is unknown or its external tool is not configured;
     *                     FALSE if the extension is on the configured ignore list.
     */
    public function searchTypeMediaTitle($extension)
    {
        $indexerConfig = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get('indexed_search');
        if ($this->isIgnoredExtension($extension, $indexerConfig)) {
            return false;
        }

        // extension => [configuration key of the required tool ('' = none), label suffix]
        $titleMap = [
            'pdf' => ['pdftools', 'PDF'],
            'doc' => ['catdoc', 'DOC'],
            'pps' => ['ppthtml', 'PP'],
            'ppt' => ['ppthtml', 'PP'],
            'xls' => ['xlhtml', 'XLS'],
            'docx' => ['unzip', 'DOC'],
            'dotx' => ['unzip', 'DOC'],
            'pptx' => ['unzip', 'PP'],
            'ppsx' => ['unzip', 'PP'],
            'potx' => ['unzip', 'PP'],
            'xlsx' => ['unzip', 'XLS'],
            'xltx' => ['unzip', 'XLS'],
            'sxc' => ['unzip', 'SXC'],
            'sxi' => ['unzip', 'SXI'],
            'sxw' => ['unzip', 'SXW'],
            'ods' => ['unzip', 'ODS'],
            'odp' => ['unzip', 'ODP'],
            'odt' => ['unzip', 'ODT'],
            'rtf' => ['unrtf', 'RTF'],
            'jpeg' => ['', 'Images'],
            'jpg' => ['', 'Images'],
            'tif' => ['', 'Images'],
            'html' => ['', 'HTML'],
            'htm' => ['', 'HTML'],
            'txt' => ['', 'TXT'],
            'csv' => ['', 'CSV'],
            'xml' => ['', 'XML'],
        ];

        $key = (string)$extension;
        if (!isset($titleMap[$key])) {
            return '';
        }
        list($configKey, $labelSuffix) = $titleMap[$key];
        if ($configKey !== '' && empty($indexerConfig[$configKey])) {
            // The tool needed for this format is not configured.
            return '';
        }
        return sprintf($this->label('extension.' . $labelSuffix), $extension);
    }

    /**
     * Returns TRUE if the input extension (item_type) is a potentially a multi-page extension
     *
     * @param string $extension Extension / item_type string
     * @return bool Return TRUE if multi-page (currently only "pdf")
     */
    public function isMultiplePageExtension($extension)
    {
        return (string)$extension === 'pdf';
    }

    /**
     * Wraps the "splitLabel function" of the language object.
     *
     * @param string $reference: Reference/key of the label
     * @return string The label of the reference/key to be fetched
     */
    protected function sL($reference)
    {
        return $this->langObject->sL($reference);
    }

    /**
     * Fetches a label from the locallang_main.xlf file of indexed_search.
     *
     * @param string $key Label key inside the language file
     * @return string The label
     */
    private function label($key)
    {
        return $this->sL('LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:' . $key);
    }

    /**
     * Tells whether the extension is listed in the "ignoreExtensions" setting.
     *
     * @param string $extension File extension
     * @param array $indexerConfig Extension configuration of indexed_search
     * @return bool TRUE if the extension must not be indexed
     */
    private function isIgnoredExtension($extension, $indexerConfig)
    {
        $configured = isset($indexerConfig['ignoreExtensions']) ? (string)$indexerConfig['ignoreExtensions'] : '';
        $ignoreExtensions = GeneralUtility::trimExplode(',', strtolower($configured), true);
        return in_array((string)$extension, $ignoreExtensions, true);
    }

    /**
     * Locates the external tools of one configuration entry and registers them in $this->app.
     *
     * The tools are registered only if ALL of them exist in the configured directory.
     * A missing configuration value or a missing binary is written to the TS log.
     *
     * @param array $indexerConfig Extension configuration of indexed_search
     * @param string $configKey Configuration key holding the tool directory (e.g. "pdftools")
     * @param array $toolNames Names of the binaries that must exist in that directory
     * @param string $labelPrefix Prefix of the "...NotFound" / "...Disabled" log labels
     * @return bool TRUE if all tools were found and registered
     */
    private function registerTools($indexerConfig, $configKey, array $toolNames, $labelPrefix)
    {
        if (empty($indexerConfig[$configKey])) {
            $this->pObj->log_setTSlogMessage($this->label($labelPrefix . 'Disabled'), 1);
            return false;
        }
        // If windows, apply extension to tool name:
        $exe = Environment::isWindows() ? '.exe' : '';
        $toolPath = rtrim($indexerConfig[$configKey], '/') . '/';

        $binaries = [];
        foreach ($toolNames as $toolName) {
            $binary = $toolPath . $toolName . $exe;
            if (!@is_file($binary)) {
                $this->pObj->log_setTSlogMessage(sprintf($this->label($labelPrefix . 'NotFound'), $toolPath), 3);
                return false;
            }
            $binaries[$toolName] = $binary;
        }
        foreach ($binaries as $toolName => $binary) {
            $this->app[$toolName] = $binary;
        }
        return true;
    }

    /************************
     *
     * Reading documents (for parsing)
     *
     ************************/
    /**
     * Reads the content of an external file being indexed.
     *
     * The extension must have been initialized with initParser() before. While the
     * file is processed the LC_CTYPE locale is switched to the configured system
     * locale (except for html/htm) and restored afterwards, even if processing fails.
     *
     * @param string $ext File extension, eg. "pdf", "doc" etc.
     * @param string $absFile Absolute filename of file (must exist and be validated OK before calling function)
     * @param string $cPKey Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be split.)
     * @return array|bool|null Standard content array (title, description, keywords, body keys);
     *                         FALSE if the extension is not initialized or not known;
     *                         NULL if the required external tool is not registered or no content could be extracted.
     */
    public function readFileContent($ext, $absFile, $cPKey)
    {
        // Return immediately if initialization didn't set support up:
        if (empty($this->supportedExtensions[$ext])) {
            return false;
        }

        // extension => name of the external tool that must be registered ('' = PHP only)
        $requiredTools = [
            'pdf' => 'pdfinfo',
            'doc' => 'catdoc',
            'pps' => 'ppthtml',
            'ppt' => 'ppthtml',
            'xls' => 'xlhtml',
            'docx' => 'unzip',
            'dotx' => 'unzip',
            'pptx' => 'unzip',
            'ppsx' => 'unzip',
            'potx' => 'unzip',
            'xlsx' => 'unzip',
            'xltx' => 'unzip',
            'sxi' => 'unzip',
            'sxc' => 'unzip',
            'sxw' => 'unzip',
            'ods' => 'unzip',
            'odp' => 'unzip',
            'odt' => 'unzip',
            'rtf' => 'unrtf',
            'txt' => '',
            'csv' => '',
            'html' => '',
            'htm' => '',
            'xml' => '',
            'jpg' => '',
            'jpeg' => '',
            'tif' => '',
        ];
        if (!isset($requiredTools[$ext])) {
            return false;
        }

        $contentArr = null;
        $tool = $requiredTools[$ext];
        if ($tool === '' || !empty($this->app[$tool])) {
            // HTML files are read without touching the locale.
            $switchLocale = $ext !== 'html' && $ext !== 'htm';
            if ($switchLocale) {
                $this->setLocaleForServerFileSystem();
            }
            try {
                $contentArr = $this->extractContent($ext, $absFile, $cPKey);
            } finally {
                // Always restore the locale; otherwise every later call would
                // fail with "locale has already been changed".
                if ($switchLocale) {
                    $this->setLocaleForServerFileSystem(true);
                }
            }
        }

        // If no title (and why should there be...) then the file-name is set as title. This will raise the hits considerably if the search matches the document name.
        if (is_array($contentArr) && empty($contentArr['title'])) {
            // Substituting "_" for " " because many filenames may have this instead of a space char.
            $contentArr['title'] = str_replace('_', ' ', basename($absFile));
        }
        return $contentArr;
    }

    /**
     * Dispatches to the extraction routine of a file format.
     *
     * Expects that the caller has verified the required tool is registered.
     *
     * @param string $ext File extension
     * @param string $absFile Absolute filename
     * @param string $cPKey Page range pointer ("low-high"), only used for PDF
     * @return array|null Content array, or NULL if nothing could be extracted
     */
    private function extractContent($ext, $absFile, $cPKey)
    {
        switch ($ext) {
            case 'pdf':
                return $this->readPdf($absFile, $cPKey);
            case 'doc':
                $cmd = $this->app['catdoc'] . ' -d utf-8 ' . escapeshellarg($absFile);
                return $this->pObj->splitRegularContent($this->removeEndJunk($this->runTool($cmd)));
            case 'pps':
            case 'ppt':
                return $this->readHtmlToolOutput($this->app['ppthtml'] . ' ' . escapeshellarg($absFile), $absFile);
            case 'xls':
                return $this->readHtmlToolOutput($this->app['xlhtml'] . ' -nc -te ' . escapeshellarg($absFile), $absFile);
            case 'docx':
            case 'dotx':
            case 'pptx':
            case 'ppsx':
            case 'potx':
            case 'xlsx':
            case 'xltx':
                return $this->readOfficeOpenXml($ext, $absFile);
            case 'sxi':
            case 'sxc':
            case 'sxw':
            case 'ods':
            case 'odp':
            case 'odt':
                return $this->readOpenDocument($absFile);
            case 'rtf':
                $html = $this->runTool($this->app['unrtf'] . ' ' . escapeshellarg($absFile));
                return $this->pObj->splitHTMLContent($this->pObj->convertHTMLToUtf8($html));
            case 'txt':
            case 'csv':
                // Raw text
                // @todo Implement auto detection of charset (currently assuming utf-8)
                $text = $this->pObj->convertHTMLToUtf8(GeneralUtility::getUrl($absFile), 'utf-8');
                $contentArr = $this->pObj->splitRegularContent($text);
                // Make sure the title doesn't expose the absolute path!
                $contentArr['title'] = basename($absFile);
                return $contentArr;
            case 'html':
            case 'htm':
                $html = $this->pObj->convertHTMLToUtf8(GeneralUtility::getUrl($absFile));
                return $this->pObj->splitHTMLContent($html);
            case 'xml':
                return $this->readXml($absFile);
            case 'jpg':
            case 'jpeg':
            case 'tif':
                return $this->readImage($absFile);
        }
        return null;
    }

    /**
     * Runs a shell command and returns its output as one string.
     *
     * @param string $cmd Complete, already escaped command line
     * @return string Output lines joined by line feeds; empty string if there was no output
     */
    private function runTool($cmd)
    {
        $output = [];
        CommandUtility::exec($cmd, $output);
        return is_array($output) ? implode(LF, $output) : '';
    }

    /**
     * Runs pdfinfo on a file and returns the parsed key/value pairs.
     *
     * @param string $absFile Absolute filename of the PDF
     * @return array Lowercased pdfinfo keys => values, see splitPdfInfo()
     */
    private function fetchPdfInfo($absFile)
    {
        $output = [];
        CommandUtility::exec($this->app['pdfinfo'] . ' ' . escapeshellarg($absFile), $output);
        return $this->splitPdfInfo($output);
    }

    /**
     * Extracts the text of a page range of a PDF file using pdftotext.
     *
     * @param string $absFile Absolute filename of the PDF
     * @param string $cPKey Page range as "low-high"
     * @return array|null Content array; NULL if neither text nor a title is available
     */
    private function readPdf($absFile, $cPKey)
    {
        $contentArr = null;
        $pdfInfo = $this->fetchPdfInfo($absFile);
        $pageCount = isset($pdfInfo['pages']) ? (int)$pdfInfo['pages'] : 0;
        if ($pageCount) {
            // The bounds end up in a shell command, so force them to integers.
            // A missing bound becomes 0 (presumably treated by pdftotext as "no limit" - TODO confirm).
            $range = explode('-', (string)$cPKey);
            $low = (int)$range[0];
            $high = isset($range[1]) ? (int)$range[1] : 0;

            // Create temporary name and delete the file if it exists, just to be safe.
            $tempFileName = GeneralUtility::tempnam('Typo3_indexer');
            @unlink($tempFileName);

            $cmd = $this->app['pdftotext'] . ' -f ' . $low . ' -l ' . $high . ' -enc UTF-8 -q '
                . escapeshellarg($absFile) . ' ' . escapeshellarg($tempFileName);
            CommandUtility::exec($cmd);

            if (@is_file($tempFileName)) {
                $content = file_get_contents($tempFileName);
                unlink($tempFileName);
            } else {
                $content = '';
                $this->pObj->log_setTSlogMessage(sprintf($this->label('pdfToolsFailed'), $absFile), 2);
            }
            if ((string)$content !== '') {
                $contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
            }
        }
        if (!empty($pdfInfo['title'])) {
            // The title is reported even if no text could be extracted.
            if (!is_array($contentArr)) {
                $contentArr = [];
            }
            $contentArr['title'] = $pdfInfo['title'];
        }
        return $contentArr;
    }

    /**
     * Reads a file through a tool that prints HTML (ppthtml, xlhtml).
     *
     * @param string $cmd Complete, already escaped command line
     * @param string $absFile Absolute filename, its basename becomes the title
     * @return array Content array
     */
    private function readHtmlToolOutput($cmd, $absFile)
    {
        $html = $this->pObj->convertHTMLToUtf8($this->runTool($cmd));
        $contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($html));
        // Make sure the title doesn't expose the absolute path!
        $contentArr['title'] = basename($absFile);
        return $contentArr;
    }

    /**
     * Prints one entry of a zip archive using unzip.
     *
     * @param string $absFile Absolute filename of the archive
     * @param string $entry Fixed path of the entry inside the archive
     * @return string Content of the entry
     */
    private function unzipEntry($absFile, $entry)
    {
        return $this->runTool($this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' ' . $entry);
    }

    /**
     * Strips all tags from XML, keeping the text of adjacent elements separated by spaces.
     *
     * @param string $xml XML source
     * @return string Text content
     */
    private function xmlToText($xml)
    {
        return trim(strip_tags(str_replace('<', ' <', $xml)));
    }

    /**
     * Reads the first value of a child tag from an xml2tree() node list.
     *
     * @param mixed $children The "ch" part of an xml2tree() node
     * @param string $tag Tag name including namespace prefix
     * @return string|null The value, NULL if the tag is not present
     */
    private function firstValue($children, $tag)
    {
        return isset($children[$tag][0]['values'][0]) ? $children[$tag][0]['values'][0] : null;
    }

    /**
     * Extracts text and meta data from Office Open XML files (docx, pptx, xlsx and their templates).
     *
     * Only the main document, the first slide or the first sheet is read.
     *
     * @param string $ext File extension
     * @param string $absFile Absolute filename
     * @return array Content array
     */
    private function readOfficeOpenXml($ext, $absFile)
    {
        switch ($ext) {
            case 'docx':
            case 'dotx':
                $mainPart = 'word/document.xml';
                break;
            case 'ppsx':
            case 'pptx':
            case 'potx':
                $mainPart = 'ppt/slides/slide1.xml';
                break;
            default:
                // xlsx, xltx
                $mainPart = 'xl/worksheets/sheet1.xml';
        }
        $contentArr = $this->pObj->splitRegularContent($this->xmlToText($this->unzipEntry($absFile, $mainPart)));
        // Make sure the title doesn't expose the absolute path!
        $contentArr['title'] = basename($absFile);

        // Meta information
        $metaTree = GeneralUtility::xml2tree($this->unzipEntry($absFile, 'docProps/core.xml'));
        if (is_array($metaTree)) {
            $properties = isset($metaTree['cp:coreProperties'][0]['ch']) ? $metaTree['cp:coreProperties'][0]['ch'] : [];
            $contentArr['title'] .= ' ' . $this->firstValue($properties, 'dc:title');
            $contentArr['description'] = $this->firstValue($properties, 'dc:subject');
            $contentArr['description'] .= ' ' . $this->firstValue($properties, 'dc:description');
            $contentArr['keywords'] = $this->firstValue($properties, 'cp:keywords');
        }
        return $contentArr;
    }

    /**
     * Extracts text and meta data from OpenOffice.org / OpenDocument files.
     *
     * @param string $absFile Absolute filename
     * @return array Content array
     */
    private function readOpenDocument($absFile)
    {
        $contentXml = $this->unzipEntry($absFile, 'content.xml');
        $metaXml = $this->unzipEntry($absFile, 'meta.xml');

        $contentArr = $this->pObj->splitRegularContent($this->xmlToText($contentXml));
        // Make sure the title doesn't expose the absolute path!
        $contentArr['title'] = basename($absFile);

        // Meta information. The parse result is only indexed into when it is an
        // array, mirroring the is_array() check used for Office Open XML meta data.
        $metaTree = GeneralUtility::xml2tree($metaXml);
        $meta = null;
        if (is_array($metaTree) && isset($metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'])) {
            $meta = $metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'];
        }
        if (is_array($meta)) {
            $documentTitle = $this->firstValue($meta, 'dc:title');
            if ($documentTitle) {
                $contentArr['title'] = $documentTitle;
            }
            $contentArr['description'] = $this->firstValue($meta, 'dc:subject') . ' ' . $this->firstValue($meta, 'dc:description');
            // Keywords collected:
            if (isset($meta['meta:keywords'][0]['ch']['meta:keyword']) && is_array($meta['meta:keywords'][0]['ch']['meta:keyword'])) {
                if (!isset($contentArr['keywords'])) {
                    $contentArr['keywords'] = '';
                }
                foreach ($meta['meta:keywords'][0]['ch']['meta:keyword'] as $kwDat) {
                    $contentArr['keywords'] .= (isset($kwDat['values'][0]) ? $kwDat['values'][0] : '') . ' ';
                }
            }
        }
        return $contentArr;
    }

    /**
     * Extracts the text of an XML file, honouring the encoding of its XML declaration.
     *
     * @param string $absFile Absolute filename
     * @return array Content array
     */
    private function readXml($absFile)
    {
        $fileContent = (string)GeneralUtility::getUrl($absFile);
        // Finding charset:
        $reg = [];
        preg_match('/^[[:space:]]*<\\?xml[^>]+encoding[[:space:]]*=[[:space:]]*["\'][[:space:]]*([[:alnum:]_-]+)[[:space:]]*["\']/i', substr($fileContent, 0, 200), $reg);
        $charset = !empty($reg[1]) ? trim(strtolower($reg[1])) : 'utf-8';
        // Converting content:
        $text = $this->pObj->convertHTMLToUtf8(strip_tags(str_replace('<', ' <', $fileContent)), $charset);
        $contentArr = $this->pObj->splitRegularContent($text);
        // Make sure the title doesn't expose the absolute path!
        $contentArr['title'] = basename($absFile);
        return $contentArr;
    }

    /**
     * Reads the EXIF comment and image description of an image file.
     *
     * @param string $absFile Absolute filename
     * @return array Content array
     */
    private function readImage($absFile)
    {
        // PHP EXIF (optional extension)
        $exif = function_exists('exif_read_data') ? @exif_read_data($absFile, 'IFD0') : false;
        $comment = '';
        if ($exif) {
            $userComment = isset($exif['COMMENT'][0]) ? $exif['COMMENT'][0] : '';
            $description = isset($exif['ImageDescription']) ? $exif['ImageDescription'] : '';
            $comment = trim($userComment . ' ' . $description);
        }
        $contentArr = $this->pObj->splitRegularContent($comment);
        // Make sure the title doesn't expose the absolute path!
        $contentArr['title'] = basename($absFile);
        return $contentArr;
    }

    /**
     * Sets the locale for LC_CTYPE to $TYPO3_CONF_VARS['SYS']['systemLocale']
     * if $TYPO3_CONF_VARS['SYS']['UTF8filesystem'] is set.
     *
     * Parameter <code>$resetLocale</code> has to be FALSE and TRUE alternating for all calls.
     *
     * @staticvar string $lastLocale Stores the locale used before it is overridden by this method.
     * @param bool $resetLocale TRUE resets the locale to $lastLocale.
     * @throws \RuntimeException if the calls do not alternate as described above
     */
    protected function setLocaleForServerFileSystem($resetLocale = false)
    {
        static $lastLocale = null;
        if (empty($GLOBALS['TYPO3_CONF_VARS']['SYS']['UTF8filesystem'])) {
            return;
        }

        if ($resetLocale) {
            if ($lastLocale == null) {
                throw new \RuntimeException('Cannot reset locale to NULL.', 1357064326);
            }
            setlocale(LC_CTYPE, $lastLocale);
            $lastLocale = null;
        } else {
            if ($lastLocale !== null) {
                throw new \RuntimeException('Cannot set new locale as locale has already been changed before.', 1357064437);
            }
            // The string "0" only queries the current locale without changing it.
            $lastLocale = setlocale(LC_CTYPE, '0');
            setlocale(LC_CTYPE, $GLOBALS['TYPO3_CONF_VARS']['SYS']['systemLocale']);
        }
    }

    /**
     * Creates an array with pointers to divisions of document.
     *
     * ONLY for PDF files at this point. All other types will have an array with a single element with the value "0" (zero)
     * coming back. The same is returned for PDF files if the pdfinfo tool is not
     * registered or reports no pages.
     *
     * @param string $ext File extension
     * @param string $absFile Absolute filename (must exist and be validated OK before calling function)
     * @return array Array of pointers to sections that the document should be divided into ("low-high" page ranges for PDF)
     */
    public function fileContentParts($ext, $absFile)
    {
        $cParts = [0];
        if ((string)$ext !== 'pdf' || empty($this->app['pdfinfo'])) {
            // Without pdfinfo there is no tool to run; never execute a command
            // that consists of the file name only.
            return $cParts;
        }

        $this->setLocaleForServerFileSystem();
        try {
            $pdfInfo = $this->fetchPdfInfo($absFile);
        } finally {
            $this->setLocaleForServerFileSystem(true);
        }

        $pageCount = isset($pdfInfo['pages']) ? (int)$pdfInfo['pages'] : 0;
        if ($pageCount) {
            $cParts = [];
            // Number of parts: derived from pages per part (positive mode) or given directly (zero/negative mode)
            if ($this->pdf_mode > 0) {
                $iter = ceil($pageCount / $this->pdf_mode);
            } else {
                $iter = MathUtility::forceIntegerInRange(abs($this->pdf_mode), 1, $pageCount);
            }
            // Traverse and create intervals of (nearly) equal size.
            for ($a = 0; $a < $iter; $a++) {
                $low = floor($a * ($pageCount / $iter)) + 1;
                $high = floor(($a + 1) * ($pageCount / $iter));
                $cParts[] = $low . '-' . $high;
            }
        }
        return $cParts;
    }

    /**
     * Analysing PDF info into a useable format.
     *
     * Each "Key: value" line becomes an entry with the lowercased, trimmed key.
     * Lines without a colon or without a key are skipped.
     *
     * @param array $pdfInfoArray Array of PDF content, coming from the pdfinfo tool
     * @return array Result array
     * @access private
     * @see fileContentParts()
     */
    public function splitPdfInfo($pdfInfoArray)
    {
        $res = [];
        if (!is_array($pdfInfoArray)) {
            return $res;
        }
        foreach ($pdfInfoArray as $line) {
            // Split at the first colon only, values may contain colons themselves.
            $parts = explode(':', $line, 2);
            if (count($parts) > 1 && trim($parts[0])) {
                $res[strtolower(trim($parts[0]))] = trim($parts[1]);
            }
        }
        return $res;
    }

    /**
     * Removes some strange char(12) characters and line breaks that then to occur in the end of the string from external files.
     *
     * @param string $string String to clean up
     * @return string String without trailing line feeds / form feeds and without surrounding whitespace
     */
    public function removeEndJunk($string)
    {
        return trim(preg_replace('/[' . LF . chr(12) . ']*$/', '', $string));
    }

    /************************
     *
     * Backend analyzer
     *
     ************************/
    /**
     * Return icon for file extension
     *
     * "htm" and "jpeg" share the icons of "html" and "jpg".
     *
     * @param string $extension File extension, lowercase.
     * @return string Relative file reference, resolvable by GeneralUtility::getFileAbsFileName()
     */
    public function getIcon($extension)
    {
        if ($extension === 'htm') {
            $extension = 'html';
        } elseif ($extension === 'jpeg') {
            $extension = 'jpg';
        }
        return 'EXT:indexed_search/Resources/Public/Icons/FileTypes/' . $extension . '.gif';
    }
}