e8c5ba7f32ac38757ab16acfc1aa630a5131759d
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / Classes / FileContentParser.php
1 <?php
2 namespace TYPO3\CMS\IndexedSearch;
3
4 /*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use TYPO3\CMS\Core\Utility\CommandUtility;
18 use TYPO3\CMS\Core\Utility\GeneralUtility;
19 use TYPO3\CMS\Core\Utility\MathUtility;
20
21 /**
22 * External standard parsers for indexed_search
23 * MUST RETURN utf-8 content!
24 */
25 class FileContentParser
26 {
27 /**
28 * PDF splitting mode; initParser() overrides it with the "pdf_mode" extension setting, clamped to -100..100.
29 * Zero: the whole PDF is indexed as one part. Positive value: approximate number of pages per part, e.g. "5" on a 10 page file gives 1-5,6-10.
30 * Negative value: its absolute value is the number of parts, e.g. "-3" on a 10 page file gives 1-3,4-6,7-10.
31 *
32 * @var int
33 */
34 public $pdf_mode = -20;
35
36 /**
37 * @var array Paths of the external executables keyed by tool name (e.g. "pdfinfo", "unzip"); filled by initParser().
38 */
39 public $app = [];
40
41 /**
42 * @var array Maps every initialized file extension to its item type (e.g. "htm" => "html"); filled by initParser().
43 */
44 public $ext2itemtype_map = [];
45
46 /**
47 * @var array Extensions successfully initialized by initParser(), stored as extension => TRUE.
48 */
49 public $supportedExtensions = [];
50
51 /**
52 * @var \TYPO3\CMS\IndexedSearch\Indexer Used for logging and for splitting/converting content; never assigned inside this class.
53 */
54 public $pObj;
55
56 /**
57 * @var \TYPO3\CMS\Lang\LanguageService|\TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController Label resolver chosen in the constructor, used by sL().
58 */
59 protected $langObject;
60
/**
 * Creates the parser and picks the object used to resolve language labels.
 *
 * In frontend mode (TYPO3_MODE is "FE") the global TSFE object is used,
 * in every other mode the global LANG object.
 */
public function __construct()
{
    if (TYPO3_MODE === 'FE') {
        $this->langObject = $GLOBALS['TSFE'];
    } else {
        $this->langObject = $GLOBALS['LANG'];
    }
}
69
/**
 * Initialize external parser for parsing content.
 *
 * Reads the serialized indexed_search extension configuration, rejects extensions
 * listed in its "ignoreExtensions" setting and, for formats that rely on an external
 * tool, verifies that the tool's executable exists below the configured directory.
 * On success the extension is recorded in $this->supportedExtensions and
 * $this->ext2itemtype_map; for PDF, $this->pdf_mode is taken from the configuration.
 *
 * @param string $extension File extension
 * @return bool Returns TRUE if extension is supported/enabled, otherwise FALSE.
 */
public function initParser($extension)
{
    $extension = (string)$extension;
    // A missing or broken configuration is treated as "nothing configured" instead of raising notices.
    $indexerConfig = unserialize(
        (string)($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search'] ?? ''),
        ['allowed_classes' => false]
    );
    if (!is_array($indexerConfig)) {
        $indexerConfig = [];
    }
    // Ignore extensions
    $ignoreExtensions = GeneralUtility::trimExplode(',', strtolower((string)($indexerConfig['ignoreExtensions'] ?? '')), true);
    if (in_array($extension, $ignoreExtensions, true)) {
        $this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:ignoreExtensions'), $extension), 1);
        return false;
    }
    // Item type shared by several extensions; empty means "use the extension itself".
    $mainExtension = '';
    switch ($extension) {
        case 'pdf':
            // PDF needs both pdfinfo and pdftotext from the same directory.
            $extOK = $this->registerExternalTools($indexerConfig, 'pdftools', ['pdfinfo', 'pdftotext'], 'pdfTools');
            if ($extOK) {
                $this->pdf_mode = MathUtility::forceIntegerInRange($indexerConfig['pdf_mode'] ?? 0, -100, 100);
            }
            break;
        case 'doc':
            // MS Word (catdoc)
            $extOK = $this->registerExternalTools($indexerConfig, 'catdoc', ['catdoc'], 'catdoc');
            break;
        case 'pps':
        case 'ppt':
            // MS PowerPoint (ppthtml)
            $extOK = $this->registerExternalTools($indexerConfig, 'ppthtml', ['ppthtml'], 'ppthtml');
            break;
        case 'xls':
            // MS Excel (xlhtml)
            $extOK = $this->registerExternalTools($indexerConfig, 'xlhtml', ['xlhtml'], 'xlhtml');
            break;
        case 'docx': // Microsoft Word >= 2007
        case 'dotx':
        case 'pptx': // Microsoft PowerPoint >= 2007
        case 'ppsx':
        case 'potx':
        case 'xlsx': // Microsoft Excel >= 2007
        case 'xltx':
        case 'sxc': // OpenOffice / Oasis OpenDocument formats
        case 'sxi':
        case 'sxw':
        case 'ods':
        case 'odp':
        case 'odt':
            // All of these are zip containers read through unzip.
            $extOK = $this->registerExternalTools($indexerConfig, 'unzip', ['unzip'], 'unzip');
            break;
        case 'rtf':
            // Rich text (unrtf)
            $extOK = $this->registerExternalTools($indexerConfig, 'unrtf', ['unrtf'], 'unrtf');
            break;
        case 'txt':
        case 'csv':
        case 'xml':
        case 'tif':
            // Handled with PHP alone, no external tool required.
            $extOK = true;
            break;
        case 'html':
        case 'htm':
            // PHP strip-tags(); "html" is the common item_type
            $extOK = true;
            $mainExtension = 'html';
            break;
        case 'jpg':
        case 'jpeg':
            // PHP EXIF; "jpeg" is the common item_type
            $extOK = true;
            $mainExtension = 'jpeg';
            break;
        default:
            $extOK = false;
    }
    if (!$extOK) {
        return false;
    }
    $this->supportedExtensions[$extension] = true;
    $this->ext2itemtype_map[$extension] = $mainExtension ?: $extension;
    return true;
}

/**
 * Checks that all executables of an external tool exist and registers them in $this->app.
 *
 * Logs "<labelPrefix>Disabled" when the directory setting is empty and
 * "<labelPrefix>NotFound" when one of the executables is missing; in both cases
 * $this->app is left untouched.
 *
 * @param array $indexerConfig Unserialized indexed_search extension configuration
 * @param string $configKey Configuration key holding the directory of the tool
 * @param string[] $binaries Executable names (without ".exe") that must all be present
 * @param string $labelPrefix Prefix of the locallang_main.xlf keys used for log messages
 * @return bool TRUE if every executable was found and registered
 */
protected function registerExternalTools(array $indexerConfig, $configKey, array $binaries, $labelPrefix)
{
    $labelFile = 'LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:';
    if (empty($indexerConfig[$configKey])) {
        $this->pObj->log_setTSlogMessage($this->sL($labelFile . $labelPrefix . 'Disabled'), 1);
        return false;
    }
    $toolPath = rtrim($indexerConfig[$configKey], '/') . '/';
    // If windows, apply extension to tool name:
    $exe = TYPO3_OS === 'WIN' ? '.exe' : '';
    $executables = [];
    foreach ($binaries as $binary) {
        $executable = $toolPath . $binary . $exe;
        if (!@is_file($executable)) {
            $this->pObj->log_setTSlogMessage(sprintf($this->sL($labelFile . $labelPrefix . 'NotFound'), $toolPath), 3);
            return false;
        }
        $executables[$binary] = $executable;
    }
    // Register only after all executables were found.
    foreach ($executables as $binary => $executable) {
        $this->app[$binary] = $executable;
    }
    return true;
}
237
/**
 * Initialize external parser for backend modules
 * Doesn't evaluate if parser is configured right - more like returning POSSIBLE supported extensions (for showing icons etc) in backend and frontend plugin
 *
 * The extension is compared strictly, so only the exact lowercase strings listed
 * below are accepted; non-string input never matches.
 *
 * @param string $extension File extension to initialize for.
 * @return bool Returns TRUE if the extension is one of the potentially supported ones, otherwise FALSE.
 */
public function softInit($extension)
{
    $possibleExtensions = [
        'pdf',
        'doc', 'docx', 'dotx',
        'pps', 'ppsx', 'ppt', 'pptx', 'potx',
        'xls', 'xlsx', 'xltx',
        'sxc', 'sxi', 'sxw',
        'ods', 'odp', 'odt',
        'rtf', 'txt', 'html', 'htm', 'csv', 'xml',
        'jpg', 'jpeg', 'tif',
    ];
    return in_array($extension, $possibleExtensions, true);
}
281
/**
 * Return title of entry in media type selector box.
 *
 * The label is only returned when the extension is known and, for formats that need
 * an external tool, the matching directory setting of the indexed_search extension
 * configuration is non-empty.
 *
 * @param string $extension File extension
 * @return string|bool String with label value of entry in media type search selector box (frontend plugin), empty string if the extension is unknown or its tool is not configured, FALSE if the extension is listed in "ignoreExtensions".
 */
public function searchTypeMediaTitle($extension)
{
    $extension = (string)$extension;
    // A missing or broken configuration is treated as "nothing configured" instead of raising notices.
    $indexerConfig = unserialize(
        (string)($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search'] ?? ''),
        ['allowed_classes' => false]
    );
    if (!is_array($indexerConfig)) {
        $indexerConfig = [];
    }
    // Ignore extensions
    $ignoreExtensions = GeneralUtility::trimExplode(',', strtolower((string)($indexerConfig['ignoreExtensions'] ?? '')), true);
    if (in_array($extension, $ignoreExtensions, true)) {
        return false;
    }
    // extension => [required configuration key or NULL when PHP alone handles it, label key suffix]
    $mediaTypes = [
        'pdf' => ['pdftools', 'PDF'],
        'doc' => ['catdoc', 'DOC'],
        // MS PowerPoint (ppthtml)
        'pps' => ['ppthtml', 'PP'],
        'ppt' => ['ppthtml', 'PP'],
        // MS Excel (xlhtml)
        'xls' => ['xlhtml', 'XLS'],
        // Microsoft Word >= 2007
        'docx' => ['unzip', 'DOC'],
        'dotx' => ['unzip', 'DOC'],
        // Microsoft PowerPoint >= 2007
        'pptx' => ['unzip', 'PP'],
        'ppsx' => ['unzip', 'PP'],
        'potx' => ['unzip', 'PP'],
        // Microsoft Excel >= 2007
        'xlsx' => ['unzip', 'XLS'],
        'xltx' => ['unzip', 'XLS'],
        // Open Office Calc / Impress / Writer
        'sxc' => ['unzip', 'SXC'],
        'sxi' => ['unzip', 'SXI'],
        'sxw' => ['unzip', 'SXW'],
        // Oasis OpenDocument Spreadsheet / Presentation / Text
        'ods' => ['unzip', 'ODS'],
        'odp' => ['unzip', 'ODP'],
        'odt' => ['unzip', 'ODT'],
        'rtf' => ['unrtf', 'RTF'],
        // PHP EXIF
        'jpeg' => [null, 'Images'],
        'jpg' => [null, 'Images'],
        'tif' => [null, 'Images'],
        // PHP strip-tags()
        'html' => [null, 'HTML'],
        'htm' => [null, 'HTML'],
        'xml' => [null, 'XML'],
        // Raw text
        'txt' => [null, 'TXT'],
        'csv' => [null, 'CSV'],
    ];
    if (!isset($mediaTypes[$extension])) {
        return '';
    }
    list($requiredSetting, $labelSuffix) = $mediaTypes[$extension];
    if ($requiredSetting !== null && empty($indexerConfig[$requiredSetting])) {
        return '';
    }
    return sprintf($this->sL('LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:extension.' . $labelSuffix), $extension);
}
416
/**
 * Tells whether the given extension (item_type) may consist of several pages.
 *
 * Only "pdf" is treated as a multi-page type.
 *
 * @param string $extension Extension / item_type string
 * @return bool TRUE if multi-page
 */
public function isMultiplePageExtension($extension)
{
    return (string)$extension === 'pdf';
}
433
/**
 * Resolves a label reference by delegating to the sL() method of the language
 * object selected in the constructor.
 *
 * @param string $reference Reference/key of the label
 * @return string The label of the reference/key to be fetched
 */
protected function sL($reference)
{
    $label = $this->langObject->sL($reference);
    return $label;
}
444
445 /************************
446 *
447 * Reading documents (for parsing)
448 *
449 ************************/
/**
 * Reads the content of an external file being indexed.
 *
 * Depending on the extension the content is extracted with the external tool
 * registered in $this->app by initParser() or with plain PHP, and then split into
 * the standard content array through the indexer object in $this->pObj.
 *
 * @param string $ext File extension, eg. "pdf", "doc" etc.
 * @param string $absFile Absolute filename of file (must exist and be validated OK before calling function)
 * @param string $cPKey Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be split, formatted "low-high" as produced by fileContentParts())
 * @return array|bool|null Standard content array (title, description, keywords, body keys); FALSE if the extension was not initialized by initParser() or is unknown; NULL if the required external tool is not registered or nothing could be extracted from a PDF
 */
public function readFileContent($ext, $absFile, $cPKey)
{
    $contentArr = null;
    // Return immediately if initialization didn't set support up:
    if (empty($this->supportedExtensions[$ext])) {
        return false;
    }
    // Switch by file extension
    switch ($ext) {
        case 'pdf':
            if (!empty($this->app['pdfinfo'])) {
                $this->setLocaleForServerFileSystem();
                // Getting pdf-info:
                $res = [];
                $cmd = $this->app['pdfinfo'] . ' ' . escapeshellarg($absFile);
                CommandUtility::exec($cmd, $res);
                $pdfInfo = $this->splitPdfInfo($res);
                if ((int)($pdfInfo['pages'] ?? 0)) {
                    // Page numbers are forced to integers and the temp file name is escaped,
                    // so nothing from $cPKey can leak into the shell command.
                    list($low, $high) = array_pad(explode('-', (string)$cPKey, 2), 2, 0);
                    // Create temporary name and delete the file if it exists, just to be safe.
                    $tempFileName = GeneralUtility::tempnam('Typo3_indexer');
                    @unlink($tempFileName);
                    // Get pdf content:
                    $cmd = $this->app['pdftotext'] . ' -f ' . (int)$low . ' -l ' . (int)$high . ' -enc UTF-8 -q ' . escapeshellarg($absFile) . ' ' . escapeshellarg($tempFileName);
                    CommandUtility::exec($cmd);
                    if (@is_file($tempFileName)) {
                        $content = file_get_contents($tempFileName);
                        unlink($tempFileName);
                    } else {
                        $content = '';
                        $this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:pdfToolsFailed'), $absFile), 2);
                    }
                    if ((string)$content !== '') {
                        $contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
                    }
                }
                if (!empty($pdfInfo['title'])) {
                    $contentArr['title'] = $pdfInfo['title'];
                }
                $this->setLocaleForServerFileSystem(true);
            }
            break;
        case 'doc':
            if (!empty($this->app['catdoc'])) {
                $this->setLocaleForServerFileSystem();
                $res = [];
                $cmd = $this->app['catdoc'] . ' -d utf-8 ' . escapeshellarg($absFile);
                CommandUtility::exec($cmd, $res);
                $content = implode(LF, $res);
                $contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
                $this->setLocaleForServerFileSystem(true);
            }
            break;
        case 'pps':
        case 'ppt':
            if (!empty($this->app['ppthtml'])) {
                $this->setLocaleForServerFileSystem();
                $res = [];
                $cmd = $this->app['ppthtml'] . ' ' . escapeshellarg($absFile);
                CommandUtility::exec($cmd, $res);
                $content = implode(LF, $res);
                $content = $this->pObj->convertHTMLToUtf8($content);
                $contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
                $contentArr['title'] = basename($absFile);
                $this->setLocaleForServerFileSystem(true);
            }
            break;
        case 'xls':
            if (!empty($this->app['xlhtml'])) {
                $this->setLocaleForServerFileSystem();
                $res = [];
                $cmd = $this->app['xlhtml'] . ' -nc -te ' . escapeshellarg($absFile);
                CommandUtility::exec($cmd, $res);
                $content = implode(LF, $res);
                $content = $this->pObj->convertHTMLToUtf8($content);
                $contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
                $contentArr['title'] = basename($absFile);
                $this->setLocaleForServerFileSystem(true);
            }
            break;
        case 'docx':
        case 'dotx':
        case 'pptx':
        case 'ppsx':
        case 'potx':
        case 'xlsx':
        case 'xltx':
            if (!empty($this->app['unzip'])) {
                $this->setLocaleForServerFileSystem();
                // Pick the archive member holding the text; for presentations and
                // spreadsheets only the first slide / first sheet is read.
                switch ($ext) {
                    case 'docx':
                    case 'dotx':
                        $archiveMember = 'word/document.xml';
                        break;
                    case 'ppsx':
                    case 'pptx':
                    case 'potx':
                        $archiveMember = 'ppt/slides/slide1.xml';
                        break;
                    default:
                        // xlsx, xltx
                        $archiveMember = 'xl/worksheets/sheet1.xml';
                }
                $res = [];
                $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' ' . $archiveMember;
                CommandUtility::exec($cmd, $res);
                $content_xml = implode(LF, $res);
                // A space is inserted before every tag so that words of adjacent elements do not get glued together.
                $utf8_content = trim(strip_tags(str_replace('<', ' <', $content_xml)));
                $contentArr = $this->pObj->splitRegularContent($utf8_content);
                // Make sure the title doesn't expose the absolute path!
                $contentArr['title'] = basename($absFile);
                // Meta information
                $res = [];
                $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' docProps/core.xml';
                CommandUtility::exec($cmd, $res);
                $meta_xml = implode(LF, $res);
                $metaContent = GeneralUtility::xml2tree($meta_xml);
                if (is_array($metaContent)) {
                    // Any of the properties may be absent from the document.
                    $coreProperties = $metaContent['cp:coreProperties'][0]['ch'] ?? [];
                    $contentArr['title'] .= ' ' . ($coreProperties['dc:title'][0]['values'][0] ?? '');
                    $contentArr['description'] = $coreProperties['dc:subject'][0]['values'][0] ?? '';
                    $contentArr['description'] .= ' ' . ($coreProperties['dc:description'][0]['values'][0] ?? '');
                    $contentArr['keywords'] = $coreProperties['cp:keywords'][0]['values'][0] ?? '';
                }
                $this->setLocaleForServerFileSystem(true);
            }
            break;
        case 'sxi':
        case 'sxc':
        case 'sxw':
        case 'ods':
        case 'odp':
        case 'odt':
            if (!empty($this->app['unzip'])) {
                $this->setLocaleForServerFileSystem();
                // Read content.xml:
                $res = [];
                $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' content.xml';
                CommandUtility::exec($cmd, $res);
                $content_xml = implode(LF, $res);
                // Read meta.xml:
                $res = [];
                $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' meta.xml';
                CommandUtility::exec($cmd, $res);
                $meta_xml = implode(LF, $res);
                $utf8_content = trim(strip_tags(str_replace('<', ' <', $content_xml)));
                $contentArr = $this->pObj->splitRegularContent($utf8_content);
                // Make sure the title doesn't expose the absolute path!
                $contentArr['title'] = basename($absFile);
                // Meta information; the parser result is not an array when meta.xml could not be parsed.
                $metaTree = GeneralUtility::xml2tree($meta_xml);
                $metaContent = is_array($metaTree)
                    ? ($metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'] ?? null)
                    : null;
                if (is_array($metaContent)) {
                    $metaTitle = $metaContent['dc:title'][0]['values'][0] ?? '';
                    if ($metaTitle) {
                        $contentArr['title'] = $metaTitle;
                    }
                    $contentArr['description'] = ($metaContent['dc:subject'][0]['values'][0] ?? '') . ' ' . ($metaContent['dc:description'][0]['values'][0] ?? '');
                    // Keywords collected:
                    $keywordNodes = $metaContent['meta:keywords'][0]['ch']['meta:keyword'] ?? null;
                    if (is_array($keywordNodes)) {
                        foreach ($keywordNodes as $kwDat) {
                            $contentArr['keywords'] = ($contentArr['keywords'] ?? '') . ($kwDat['values'][0] ?? '') . ' ';
                        }
                    }
                }
                $this->setLocaleForServerFileSystem(true);
            }
            break;
        case 'rtf':
            if (!empty($this->app['unrtf'])) {
                $this->setLocaleForServerFileSystem();
                $res = [];
                $cmd = $this->app['unrtf'] . ' ' . escapeshellarg($absFile);
                CommandUtility::exec($cmd, $res);
                $fileContent = implode(LF, $res);
                $fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
                $contentArr = $this->pObj->splitHTMLContent($fileContent);
                $this->setLocaleForServerFileSystem(true);
            }
            break;
        case 'txt':
        case 'csv':
            $this->setLocaleForServerFileSystem();
            // Raw text
            $content = GeneralUtility::getUrl($absFile);
            // @todo Implement auto detection of charset (currently assuming utf-8)
            $contentCharset = 'utf-8';
            $content = $this->pObj->convertHTMLToUtf8($content, $contentCharset);
            $contentArr = $this->pObj->splitRegularContent($content);
            // Make sure the title doesn't expose the absolute path!
            $contentArr['title'] = basename($absFile);
            $this->setLocaleForServerFileSystem(true);
            break;
        case 'html':
        case 'htm':
            $fileContent = GeneralUtility::getUrl($absFile);
            $fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
            $contentArr = $this->pObj->splitHTMLContent($fileContent);
            break;
        case 'xml':
            $this->setLocaleForServerFileSystem();
            // PHP strip-tags()
            $fileContent = GeneralUtility::getUrl($absFile);
            // Finding charset in the XML declaration (only the first 200 bytes are inspected):
            $reg = [];
            preg_match('/^[[:space:]]*<\\?xml[^>]+encoding[[:space:]]*=[[:space:]]*["\'][[:space:]]*([[:alnum:]_-]+)[[:space:]]*["\']/i', substr($fileContent, 0, 200), $reg);
            $charset = !empty($reg[1]) ? $this->pObj->csObj->parse_charset($reg[1]) : 'utf-8';
            // Converting content:
            $fileContent = $this->pObj->convertHTMLToUtf8(strip_tags(str_replace('<', ' <', $fileContent)), $charset);
            $contentArr = $this->pObj->splitRegularContent($fileContent);
            // Make sure the title doesn't expose the absolute path!
            $contentArr['title'] = basename($absFile);
            $this->setLocaleForServerFileSystem(true);
            break;
        case 'jpg':
        case 'jpeg':
        case 'tif':
            $this->setLocaleForServerFileSystem();
            // PHP EXIF; the extension may not be loaded and either field may be absent.
            $exif = function_exists('exif_read_data') ? @exif_read_data($absFile, 'IFD0') : false;
            if ($exif) {
                $comment = trim(($exif['COMMENT'][0] ?? '') . ' ' . ($exif['ImageDescription'] ?? ''));
            } else {
                $comment = '';
            }
            $contentArr = $this->pObj->splitRegularContent($comment);
            // Make sure the title doesn't expose the absolute path!
            $contentArr['title'] = basename($absFile);
            $this->setLocaleForServerFileSystem(true);
            break;
        default:
            return false;
    }
    // If no title (and why should there be...) then the file-name is set as title. This will raise the hits considerably if the search matches the document name.
    if (is_array($contentArr) && empty($contentArr['title'])) {
        // Substituting "_" for " " because many filenames may have this instead of a space char.
        $contentArr['title'] = str_replace('_', ' ', basename($absFile));
    }
    return $contentArr;
}
701
/**
 * Switches LC_CTYPE to $TYPO3_CONF_VARS['SYS']['systemLocale'] and back again.
 *
 * Does nothing unless $TYPO3_CONF_VARS['SYS']['UTF8filesystem'] is set.
 * The previously active locale is kept in a function-static variable between the
 * "set" call and the "reset" call, therefore calls must strictly alternate
 * between $resetLocale = FALSE and $resetLocale = TRUE.
 *
 * @staticvar string $lastLocale Stores the locale used before it is overridden by this method.
 * @param bool $resetLocale TRUE resets the locale to $lastLocale.
 * @return void
 * @throws \RuntimeException if the calls do not alternate as described above
 */
protected function setLocaleForServerFileSystem($resetLocale = false)
{
    static $lastLocale = null;
    $utf8Filesystem = $GLOBALS['TYPO3_CONF_VARS']['SYS']['UTF8filesystem'];
    if (!$utf8Filesystem) {
        return;
    }

    if (!$resetLocale) {
        // Switch to the system locale, remembering what was active before.
        if ($lastLocale !== null) {
            throw new \RuntimeException('Cannot set new locale as locale has already been changed before.', 1357064437);
        }
        $lastLocale = setlocale(LC_CTYPE, 0);
        setlocale(LC_CTYPE, $GLOBALS['TYPO3_CONF_VARS']['SYS']['systemLocale']);
        return;
    }

    // Restore the remembered locale.
    if ($lastLocale == null) {
        throw new \RuntimeException('Cannot reset locale to NULL.', 1357064326);
    }
    setlocale(LC_CTYPE, $lastLocale);
    $lastLocale = null;
}
734
/**
 * Creates an array with pointers to divisions of document.
 *
 * ONLY for PDF files at this point. All other types will have an array with a single element with the value "0" (zero)
 * coming back. The same single-zero array is returned for PDF files when no pdfinfo
 * executable has been registered by initParser() or when pdfinfo reports no pages.
 *
 * For PDF files the page count reported by pdfinfo is divided according to
 * $this->pdf_mode: a positive value is the approximate number of pages per part,
 * zero or a negative value gives abs(value) parts (at least one, at most one per page).
 *
 * @param string $ext File extension
 * @param string $absFile Absolute filename (must exist and be validated OK before calling function)
 * @return array Array of pointers to sections that the document should be divided into, for PDF formatted as "low-high" page ranges
 */
public function fileContentParts($ext, $absFile)
{
    $cParts = [0];
    // Without a registered pdfinfo executable there is nothing sensible to execute.
    if ((string)$ext !== 'pdf' || empty($this->app['pdfinfo'])) {
        return $cParts;
    }
    $this->setLocaleForServerFileSystem();
    // Getting pdf-info:
    $res = [];
    $cmd = $this->app['pdfinfo'] . ' ' . escapeshellarg($absFile);
    CommandUtility::exec($cmd, $res);
    $pdfInfo = $this->splitPdfInfo($res);
    $pages = (int)($pdfInfo['pages'] ?? 0);
    if ($pages > 0) {
        // Calculate the number of parts
        if ($this->pdf_mode > 0) {
            $iter = (int)ceil($pages / $this->pdf_mode);
        } else {
            $iter = MathUtility::forceIntegerInRange(abs($this->pdf_mode), 1, $pages);
        }
        // Traverse and create intervals. Integer division keeps the boundaries exact,
        // so the result does not depend on floating-point rounding.
        $cParts = [];
        for ($a = 0; $a < $iter; $a++) {
            $low = intdiv($a * $pages, $iter) + 1;
            $high = intdiv(($a + 1) * $pages, $iter);
            $cParts[] = $low . '-' . $high;
        }
    }
    $this->setLocaleForServerFileSystem(true);
    return $cParts;
}
777
/**
 * Turns the output lines of the pdfinfo tool into a key/value array.
 *
 * Every line is split at its first colon; the part before it becomes the
 * lowercased, trimmed key and the rest the trimmed value. Lines without a colon
 * or with an empty key are skipped. Anything but an array yields an empty result.
 *
 * @param array $pdfInfoArray Array of PDF content, coming from the pdfinfo tool
 * @return array Result array
 * @access private
 * @see fileContentParts()
 */
public function splitPdfInfo($pdfInfoArray)
{
    $info = [];
    if (!is_array($pdfInfoArray)) {
        return $info;
    }
    foreach ($pdfInfoArray as $infoLine) {
        $keyAndValue = explode(':', $infoLine, 2);
        if (count($keyAndValue) <= 1) {
            continue;
        }
        $key = trim($keyAndValue[0]);
        if (!$key) {
            continue;
        }
        $info[strtolower($key)] = trim($keyAndValue[1]);
    }
    return $info;
}
799
/**
 * Strips the line breaks and char(12) (form feed) characters that tend to occur at
 * the end of the output of external tools, then trims the remaining string.
 *
 * @param string $string String to clean up
 * @return string String
 */
public function removeEndJunk($string)
{
    $trailingJunkPattern = '/[' . LF . chr(12) . ']*$/';
    $withoutJunk = preg_replace($trailingJunkPattern, '', $string);
    return trim($withoutJunk);
}
810
811 /************************
812 *
813 * Backend analyzer
814 *
815 ************************/
/**
 * Builds the icon reference for a file extension.
 *
 * "htm" shares the icon of "html" and "jpeg" the icon of "jpg"; every other
 * extension is used unchanged as the icon file name.
 *
 * @param string $extension File extension, lowercase.
 * @return string Relative file reference, resolvable by GeneralUtility::getFileAbsFileName()
 */
public function getIcon($extension)
{
    $iconAliases = [
        'htm' => 'html',
        'jpeg' => 'jpg',
    ];
    if (is_string($extension) && isset($iconAliases[$extension])) {
        $extension = $iconAliases[$extension];
    }
    return 'EXT:indexed_search/Resources/Public/Icons/FileTypes/' . $extension . '.gif';
}
831 }