daa8317ae25055105364630e857e74e8fafee181
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / Classes / FileContentParser.php
1 <?php
2 namespace TYPO3\CMS\IndexedSearch;
3
4 /*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use TYPO3\CMS\Core\Utility\GeneralUtility;
18 use TYPO3\CMS\Core\Utility\CommandUtility;
19 use TYPO3\CMS\Core\Utility\MathUtility;
20
/**
 * External standard parsers for indexed_search.
 *
 * Decides which file extensions can be indexed (depending on the extension
 * configuration and on the presence of external command line tools) and
 * extracts title, description, keywords and body text from such files.
 *
 * MUST RETURN utf-8 content!
 */
class FileContentParser {

    /**
     * How PDF files are split into separately indexed sections.
     * This value is overridden from the extension configuration in initParser().
     *
     * - Zero: the whole PDF file is indexed in one section.
     * - Positive value: maximum number of pages per section. The number of
     *   sections is ceil(pages / value) and the pages are spread evenly over
     *   them, eg. "5" for a 10 page document gives 1-5,6-10.
     * - Negative value: the absolute value is the number of sections (never
     *   more than the number of pages), eg. "-3" for a 10 page document
     *   gives 1-3,4-6,7-10.
     *
     * @var int
     */
    public $pdf_mode = -20;

    /**
     * Paths of the external tools located by initParser(), keyed by tool
     * name (eg. "pdfinfo", "pdftotext", "catdoc", "unzip").
     *
     * @var array
     */
    public $app = array();

    /**
     * Maps a file extension to the item type it is stored under,
     * eg. "htm" => "html" and "jpg" => "jpeg". Filled by initParser().
     *
     * @var array
     */
    public $ext2itemtype_map = array();

    /**
     * File extensions accepted by initParser(); extension => TRUE.
     *
     * @var array
     */
    public $supportedExtensions = array();

    /**
     * Parent indexer, used here for logging and for splitting/converting content.
     * It is never assigned inside this class, so the caller has to set it
     * before initParser() or readFileContent() is used.
     *
     * @var \TYPO3\CMS\IndexedSearch\Indexer
     */
    public $pObj;

    /**
     * Object used to resolve "LLL:" label references (see sL()).
     *
     * @var \TYPO3\CMS\Lang\LanguageService|\TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController
     */
    protected $langObject;

    /**
     * Constructs this external parsers object
     */
    public function __construct() {
        // Set the language object to be used accordant to current TYPO3_MODE:
        $this->langObject = TYPO3_MODE === 'FE' ? $GLOBALS['TSFE'] : $GLOBALS['LANG'];
    }

    /**
     * Initialize external parser for parsing content.
     *
     * On success the extension is recorded in $supportedExtensions and
     * $ext2itemtype_map; for tool based formats the tool paths are stored in $app.
     * A reason is logged through the parent indexer when an extension is
     * ignored or its tool is disabled or missing.
     *
     * @param string $extension File extension
     * @return bool Returns TRUE if extension is supported/enabled, otherwise FALSE.
     */
    public function initParser($extension) {
        $indexerConfig = $this->getIndexerConfiguration();
        // Extensions the administrator excluded from indexing.
        // Strict comparison on strings, so numeric looking values are not matched loosely.
        if (in_array((string)$extension, $this->getIgnoredExtensions($indexerConfig), TRUE)) {
            $this->pObj->log_setTSlogMessage(sprintf($this->getLabel('ignoreExtensions'), $extension), 1);
            return FALSE;
        }
        $extOK = FALSE;
        $mainExtension = '';
        // Switch on file extension:
        switch ($extension) {
            case 'pdf':
                // PDF: needs both pdfinfo and pdftotext
                $extOK = $this->registerExternalTools($indexerConfig, 'pdftools', array('pdfinfo', 'pdftotext'), 'pdfTools');
                if ($extOK) {
                    $configuredMode = isset($indexerConfig['pdf_mode']) ? $indexerConfig['pdf_mode'] : 0;
                    $this->pdf_mode = MathUtility::forceIntegerInRange($configuredMode, -100, 100);
                }
                break;
            case 'doc':
                // MS Word: catdoc
                $extOK = $this->registerExternalTools($indexerConfig, 'catdoc', array('catdoc'), 'catdoc');
                break;
            case 'pps':
            case 'ppt':
                // MS PowerPoint: ppthtml
                $extOK = $this->registerExternalTools($indexerConfig, 'ppthtml', array('ppthtml'), 'ppthtml');
                break;
            case 'xls':
                // MS Excel: xlhtml
                $extOK = $this->registerExternalTools($indexerConfig, 'xlhtml', array('xlhtml'), 'xlhtml');
                break;
            case 'docx': // Microsoft Word >= 2007
            case 'dotx':
            case 'pptx': // Microsoft PowerPoint >= 2007
            case 'ppsx':
            case 'potx':
            case 'xlsx': // Microsoft Excel >= 2007
            case 'xltx':
            case 'sxc': // OpenOffice.org 1.x
            case 'sxi':
            case 'sxw':
            case 'ods': // Oasis OpenDocument
            case 'odp':
            case 'odt':
                // All of these are zip archives read with unzip
                $extOK = $this->registerExternalTools($indexerConfig, 'unzip', array('unzip'), 'unzip');
                break;
            case 'rtf':
                // Rich text: unrtf
                $extOK = $this->registerExternalTools($indexerConfig, 'unrtf', array('unrtf'), 'unrtf');
                break;
            case 'txt':
            case 'csv':
            case 'xml':
            case 'tif':
                // Handled by PHP itself (raw text, strip_tags(), EXIF)
                $extOK = TRUE;
                break;
            case 'html':
            case 'htm':
                // PHP strip-tags(); "html" is the common item_type
                $extOK = TRUE;
                $mainExtension = 'html';
                break;
            case 'jpg':
            case 'jpeg':
                // PHP EXIF; "jpeg" is the common item_type
                $extOK = TRUE;
                $mainExtension = 'jpeg';
                break;
        }
        if (!$extOK) {
            return FALSE;
        }
        $this->supportedExtensions[$extension] = TRUE;
        $this->ext2itemtype_map[$extension] = $mainExtension ?: $extension;
        return TRUE;
    }

    /**
     * Initialize external parser for backend modules
     * Doesn't evaluate if parser is configured right - more like returning POSSIBLE supported extensions (for showing icons etc) in backend and frontend plugin
     *
     * @param string $extension File extension to initialize for.
     * @return bool Returns TRUE if the extension is one this class knows a parser for, otherwise FALSE. The configuration is NOT consulted.
     */
    public function softInit($extension) {
        $knownExtensions = array(
            'pdf',
            'doc', 'docx', 'dotx',
            'pps', 'ppsx', 'ppt', 'pptx', 'potx',
            'xls', 'xlsx', 'xltx',
            'sxc', 'sxi', 'sxw',
            'ods', 'odp', 'odt',
            'rtf', 'txt', 'html', 'htm', 'csv', 'xml',
            'jpg', 'jpeg', 'tif'
        );
        // Strict string comparison: a loose comparison would accept the integer 0 on PHP < 8.
        return in_array((string)$extension, $knownExtensions, TRUE);
    }

    /**
     * Return title of entry in media type selector box.
     *
     * @param string $extension File extension
     * @return string|bool Label of the entry in the media type search selector box (frontend plugin). Empty string if the extension is unknown or its tool is not configured, FALSE if the extension is on the ignore list.
     */
    public function searchTypeMediaTitle($extension) {
        $indexerConfig = $this->getIndexerConfiguration();
        if (in_array((string)$extension, $this->getIgnoredExtensions($indexerConfig), TRUE)) {
            return FALSE;
        }
        // extension => array(configuration key of the required tool or NULL, suffix of the "extension.*" label)
        $mediaTypes = array(
            'pdf' => array('pdftools', 'PDF'),
            'doc' => array('catdoc', 'DOC'),
            'pps' => array('ppthtml', 'PP'),
            'ppt' => array('ppthtml', 'PP'),
            'xls' => array('xlhtml', 'XLS'),
            // Microsoft Office >= 2007
            'docx' => array('unzip', 'DOC'),
            'dotx' => array('unzip', 'DOC'),
            'pptx' => array('unzip', 'PP'),
            'ppsx' => array('unzip', 'PP'),
            'potx' => array('unzip', 'PP'),
            'xlsx' => array('unzip', 'XLS'),
            'xltx' => array('unzip', 'XLS'),
            // Open Office Calc / Impress / Writer
            'sxc' => array('unzip', 'SXC'),
            'sxi' => array('unzip', 'SXI'),
            'sxw' => array('unzip', 'SXW'),
            // Oasis OpenDocument Spreadsheet / Presentation / Text
            'ods' => array('unzip', 'ODS'),
            'odp' => array('unzip', 'ODP'),
            'odt' => array('unzip', 'ODT'),
            'rtf' => array('unrtf', 'RTF'),
            // Handled by PHP itself, no tool required
            'jpeg' => array(NULL, 'Images'),
            'jpg' => array(NULL, 'Images'),
            'tif' => array(NULL, 'Images'),
            'html' => array(NULL, 'HTML'),
            'htm' => array(NULL, 'HTML'),
            'txt' => array(NULL, 'TXT'),
            'csv' => array(NULL, 'CSV'),
            'xml' => array(NULL, 'XML')
        );
        $key = (string)$extension;
        if (!isset($mediaTypes[$key])) {
            return '';
        }
        list($requiredTool, $labelKey) = $mediaTypes[$key];
        if ($requiredTool !== NULL && empty($indexerConfig[$requiredTool])) {
            // Tool not configured, so this media type is not offered
            return '';
        }
        return sprintf($this->getLabel('extension.' . $labelKey), $extension);
    }

    /**
     * Returns TRUE if the input extension (item_type) is a potentially a multi-page extension
     *
     * @param string $extension Extension / item_type string
     * @return bool Return TRUE if multi-page (currently only "pdf")
     */
    public function isMultiplePageExtension($extension) {
        return (string)$extension === 'pdf';
    }

    /**
     * Wraps the "splitLabel function" of the language object.
     *
     * @param string $reference: Reference/key of the label
     * @param bool $useHtmlSpecialChar: Convert special chars to HTML entities (default: FALSE)
     * @return string The label of the reference/key to be fetched
     */
    protected function sL($reference, $useHtmlSpecialChar = FALSE) {
        return $this->langObject->sL($reference, $useHtmlSpecialChar);
    }

    /**
     * Fetches a label from the locallang_main.xlf file of indexed_search.
     *
     * @param string $key Label key inside locallang_main.xlf, eg. "unzipNotFound"
     * @return string The label
     */
    protected function getLabel($key) {
        return $this->sL('LLL:EXT:indexed_search/Resources/Private/Language/locallang_main.xlf:' . $key);
    }

    /**
     * Reads the extension configuration of indexed_search.
     *
     * The serialized string comes from TYPO3_CONF_VARS (system configuration),
     * not from request data.
     *
     * @return array Configuration array, empty if the stored value cannot be unserialized to an array
     */
    protected function getIndexerConfiguration() {
        $configuration = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
        return is_array($configuration) ? $configuration : array();
    }

    /**
     * Returns the lowercased list of file extensions configured to be ignored.
     *
     * @param array $indexerConfig Extension configuration as returned by getIndexerConfiguration()
     * @return array List of extensions (strings), empty values removed
     */
    protected function getIgnoredExtensions(array $indexerConfig) {
        $list = isset($indexerConfig['ignoreExtensions']) ? strtolower($indexerConfig['ignoreExtensions']) : '';
        return GeneralUtility::trimExplode(',', $list, TRUE);
    }

    /**
     * Checks that an external tool is configured and that all of its binaries exist,
     * and registers the binaries in $this->app.
     *
     * Logs "<labelPrefix>Disabled" (level 1) when no path is configured and
     * "<labelPrefix>NotFound" (level 3) when a binary is missing. Nothing is
     * registered unless every binary was found.
     *
     * @param array $indexerConfig Extension configuration
     * @param string $configKey Configuration key holding the directory of the tool, eg. "pdftools"
     * @param array $binaries Names of the binaries (without ".exe") that have to exist in that directory
     * @param string $labelPrefix Prefix of the "...NotFound" / "...Disabled" labels, eg. "pdfTools"
     * @return bool TRUE if all binaries were found and registered
     */
    protected function registerExternalTools(array $indexerConfig, $configKey, array $binaries, $labelPrefix) {
        if (empty($indexerConfig[$configKey])) {
            $this->pObj->log_setTSlogMessage($this->getLabel($labelPrefix . 'Disabled'), 1);
            return FALSE;
        }
        // If windows, apply extension to tool name:
        $exe = TYPO3_OS == 'WIN' ? '.exe' : '';
        $toolPath = rtrim($indexerConfig[$configKey], '/') . '/';
        $located = array();
        foreach ($binaries as $binary) {
            $binaryFile = $toolPath . $binary . $exe;
            if (!@is_file($binaryFile)) {
                $this->pObj->log_setTSlogMessage(sprintf($this->getLabel($labelPrefix . 'NotFound'), $toolPath), 3);
                return FALSE;
            }
            $located[$binary] = $binaryFile;
        }
        foreach ($located as $binary => $binaryFile) {
            $this->app[$binary] = $binaryFile;
        }
        return TRUE;
    }

    /************************
     *
     * Reading documents (for parsing)
     *
     ************************/
    /**
     * Reads the content of an external file being indexed.
     *
     * @param string $ext File extension, eg. "pdf", "doc" etc.
     * @param string $absFile Absolute filename of file (must exist and be validated OK before calling function)
     * @param string $cPKey Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be split.)
     * @return array|bool|NULL Standard content array (title, description, keywords, body keys). FALSE if initParser() did not enable the extension, NULL if the required tool is not registered or (PDF) no text could be extracted.
     */
    public function readFileContent($ext, $absFile, $cPKey) {
        // Return immediately if initialization didn't set support up:
        if (empty($this->supportedExtensions[$ext])) {
            return FALSE;
        }
        $contentArr = NULL;
        // Switch by file extension
        switch ($ext) {
            case 'pdf':
                if (!empty($this->app['pdfinfo'])) {
                    $this->setLocaleForServerFileSystem();
                    $contentArr = $this->readPdfContent($absFile, $cPKey);
                    $this->setLocaleForServerFileSystem(TRUE);
                }
                break;
            case 'doc':
                if (!empty($this->app['catdoc'])) {
                    $this->setLocaleForServerFileSystem();
                    $content = $this->executeAndJoinOutput($this->app['catdoc'] . ' -d utf-8 ' . escapeshellarg($absFile));
                    $contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
                    $this->setLocaleForServerFileSystem(TRUE);
                }
                break;
            case 'pps':
            case 'ppt':
                if (!empty($this->app['ppthtml'])) {
                    $this->setLocaleForServerFileSystem();
                    $content = $this->executeAndJoinOutput($this->app['ppthtml'] . ' ' . escapeshellarg($absFile));
                    $content = $this->pObj->convertHTMLToUtf8($content);
                    $contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
                    $contentArr['title'] = basename($absFile);
                    $this->setLocaleForServerFileSystem(TRUE);
                }
                break;
            case 'xls':
                if (!empty($this->app['xlhtml'])) {
                    $this->setLocaleForServerFileSystem();
                    $content = $this->executeAndJoinOutput($this->app['xlhtml'] . ' -nc -te ' . escapeshellarg($absFile));
                    $content = $this->pObj->convertHTMLToUtf8($content);
                    $contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
                    $contentArr['title'] = basename($absFile);
                    $this->setLocaleForServerFileSystem(TRUE);
                }
                break;
            case 'docx':
            case 'dotx':
            case 'pptx':
            case 'ppsx':
            case 'potx':
            case 'xlsx':
            case 'xltx':
                if (!empty($this->app['unzip'])) {
                    $this->setLocaleForServerFileSystem();
                    $contentArr = $this->readOfficeOpenXmlContent($ext, $absFile);
                    $this->setLocaleForServerFileSystem(TRUE);
                }
                break;
            case 'sxi':
            case 'sxc':
            case 'sxw':
            case 'ods':
            case 'odp':
            case 'odt':
                if (!empty($this->app['unzip'])) {
                    $this->setLocaleForServerFileSystem();
                    $contentArr = $this->readOpenDocumentContent($absFile);
                    $this->setLocaleForServerFileSystem(TRUE);
                }
                break;
            case 'rtf':
                if (!empty($this->app['unrtf'])) {
                    $this->setLocaleForServerFileSystem();
                    $fileContent = $this->executeAndJoinOutput($this->app['unrtf'] . ' ' . escapeshellarg($absFile));
                    $fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
                    $contentArr = $this->pObj->splitHTMLContent($fileContent);
                    $this->setLocaleForServerFileSystem(TRUE);
                }
                break;
            case 'txt':
            case 'csv':
                $this->setLocaleForServerFileSystem();
                // Raw text
                $content = GeneralUtility::getUrl($absFile);
                // @todo Implement auto detection of charset (currently assuming utf-8)
                $contentCharset = 'utf-8';
                $content = $this->pObj->convertHTMLToUtf8($content, $contentCharset);
                $contentArr = $this->pObj->splitRegularContent($content);
                // Make sure the title doesn't expose the absolute path!
                $contentArr['title'] = basename($absFile);
                $this->setLocaleForServerFileSystem(TRUE);
                break;
            case 'html':
            case 'htm':
                $fileContent = GeneralUtility::getUrl($absFile);
                $fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
                $contentArr = $this->pObj->splitHTMLContent($fileContent);
                break;
            case 'xml':
                $this->setLocaleForServerFileSystem();
                // PHP strip-tags()
                $fileContent = GeneralUtility::getUrl($absFile);
                // Finding charset in the XML declaration (only the first 200 bytes are inspected):
                $reg = array();
                preg_match('/^[[:space:]]*<\\?xml[^>]+encoding[[:space:]]*=[[:space:]]*["\'][[:space:]]*([[:alnum:]_-]+)[[:space:]]*["\']/i', substr($fileContent, 0, 200), $reg);
                $charset = !empty($reg[1]) ? $this->pObj->csObj->parse_charset($reg[1]) : 'utf-8';
                // Converting content; a space is put in front of every tag so words of adjacent elements don't merge:
                $fileContent = $this->pObj->convertHTMLToUtf8(strip_tags(str_replace('<', ' <', $fileContent)), $charset);
                $contentArr = $this->pObj->splitRegularContent($fileContent);
                // Make sure the title doesn't expose the absolute path!
                $contentArr['title'] = basename($absFile);
                $this->setLocaleForServerFileSystem(TRUE);
                break;
            case 'jpg':
            case 'jpeg':
            case 'tif':
                $this->setLocaleForServerFileSystem();
                // PHP EXIF
                $exif = function_exists('exif_read_data') ? @exif_read_data($absFile, 'IFD0') : FALSE;
                $comment = '';
                if ($exif) {
                    $exifComment = isset($exif['COMMENT'][0]) ? $exif['COMMENT'][0] : '';
                    $exifDescription = isset($exif['ImageDescription']) ? $exif['ImageDescription'] : '';
                    $comment = trim($exifComment . ' ' . $exifDescription);
                }
                $contentArr = $this->pObj->splitRegularContent($comment);
                // Make sure the title doesn't expose the absolute path!
                $contentArr['title'] = basename($absFile);
                $this->setLocaleForServerFileSystem(TRUE);
                break;
            default:
                return FALSE;
        }
        // If no title (and why should there be...) then the file-name is set as title. This will raise the hits considerably if the search matches the document name.
        if (is_array($contentArr) && !$contentArr['title']) {
            // Substituting "_" for " " because many filenames may have this instead of a space char.
            $contentArr['title'] = str_replace('_', ' ', basename($absFile));
        }
        return $contentArr;
    }

    /**
     * Extracts the text of one page range of a PDF file with pdftotext.
     *
     * @param string $absFile Absolute filename of the PDF file
     * @param string $cPKey Page range in the form "<first>-<last>" as produced by fileContentParts()
     * @return array|NULL Content array, or NULL if pdfinfo reports no pages or no text was extracted
     */
    protected function readPdfContent($absFile, $cPKey) {
        $pdfInfo = $this->readPdfInfo($absFile);
        $pageCount = isset($pdfInfo['pages']) ? (int)$pdfInfo['pages'] : 0;
        if (!$pageCount) {
            return NULL;
        }
        // The page numbers end up in a shell command, so they are forced to integers.
        // A key without "-" yields 0 as last page instead of an undefined variable.
        $range = explode('-', (string)$cPKey, 2);
        $low = (int)$range[0];
        $high = isset($range[1]) ? (int)$range[1] : 0;
        // Create temporary name; delete the file if it exists, just to be safe.
        $tempFileName = GeneralUtility::tempnam('Typo3_indexer');
        @unlink($tempFileName);
        $cmd = $this->app['pdftotext'] . ' -f ' . $low . ' -l ' . $high . ' -enc UTF-8 -q ' . escapeshellarg($absFile) . ' ' . escapeshellarg($tempFileName);
        CommandUtility::exec($cmd);
        if (@is_file($tempFileName)) {
            $content = GeneralUtility::getUrl($tempFileName);
            unlink($tempFileName);
        } else {
            $content = '';
            $this->pObj->log_setTSlogMessage(sprintf($this->getLabel('pdfToolsFailed'), $absFile), 2);
        }
        if ((string)$content === '') {
            return NULL;
        }
        return $this->pObj->splitRegularContent($this->removeEndJunk($content));
    }

    /**
     * Reads text and metadata from an Office Open XML file (docx, pptx, xlsx and their templates).
     *
     * Only one part of the archive is read for the body: word/document.xml,
     * ppt/slides/slide1.xml or xl/worksheets/sheet1.xml. Further slides and
     * sheets are not indexed.
     *
     * @param string $ext File extension, decides which part is read
     * @param string $absFile Absolute filename
     * @return array Content array
     */
    protected function readOfficeOpenXmlContent($ext, $absFile) {
        switch ($ext) {
            case 'docx':
            case 'dotx':
                $mainPart = 'word/document.xml';
                break;
            case 'ppsx':
            case 'pptx':
            case 'potx':
                $mainPart = 'ppt/slides/slide1.xml';
                break;
            default:
                // xlsx, xltx
                $mainPart = 'xl/worksheets/sheet1.xml';
        }
        $unzipFromFile = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' ';
        $content_xml = $this->executeAndJoinOutput($unzipFromFile . $mainPart);
        $utf8_content = trim(strip_tags(str_replace('<', ' <', $content_xml)));
        $contentArr = $this->pObj->splitRegularContent($utf8_content);
        // Make sure the title doesn't expose the absolute path!
        $contentArr['title'] = basename($absFile);
        // Meta information
        $meta_xml = $this->executeAndJoinOutput($unzipFromFile . 'docProps/core.xml');
        $metaContent = GeneralUtility::xml2tree($meta_xml);
        if (is_array($metaContent)) {
            $properties = array();
            if (isset($metaContent['cp:coreProperties'][0]['ch']) && is_array($metaContent['cp:coreProperties'][0]['ch'])) {
                $properties = $metaContent['cp:coreProperties'][0]['ch'];
            }
            $contentArr['title'] .= ' ' . $this->getFirstNodeValue($properties, 'dc:title');
            $contentArr['description'] = $this->getFirstNodeValue($properties, 'dc:subject');
            $contentArr['description'] .= ' ' . $this->getFirstNodeValue($properties, 'dc:description');
            $contentArr['keywords'] = $this->getFirstNodeValue($properties, 'cp:keywords');
        }
        return $contentArr;
    }

    /**
     * Reads text and metadata from an OpenOffice.org 1.x / OpenDocument file
     * (sxc, sxi, sxw, ods, odp, odt).
     *
     * @param string $absFile Absolute filename
     * @return array Content array
     */
    protected function readOpenDocumentContent($absFile) {
        $unzipFromFile = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' ';
        // Read content.xml:
        $content_xml = $this->executeAndJoinOutput($unzipFromFile . 'content.xml');
        // Read meta.xml:
        $meta_xml = $this->executeAndJoinOutput($unzipFromFile . 'meta.xml');
        $utf8_content = trim(strip_tags(str_replace('<', ' <', $content_xml)));
        $contentArr = $this->pObj->splitRegularContent($utf8_content);
        // Make sure the title doesn't expose the absolute path!
        $contentArr['title'] = basename($absFile);
        // Meta information; xml2tree() does not return an array if the XML cannot be parsed,
        // so the tree is only descended into after checking it.
        $metaTree = GeneralUtility::xml2tree($meta_xml);
        $metaContent = NULL;
        if (is_array($metaTree) && isset($metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'])) {
            $metaContent = $metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'];
        }
        if (is_array($metaContent)) {
            $documentTitle = $this->getFirstNodeValue($metaContent, 'dc:title');
            if ($documentTitle) {
                $contentArr['title'] = $documentTitle;
            }
            $contentArr['description'] = $this->getFirstNodeValue($metaContent, 'dc:subject') . ' ' . $this->getFirstNodeValue($metaContent, 'dc:description');
            // Keywords collected:
            if (isset($metaContent['meta:keywords'][0]['ch']['meta:keyword']) && is_array($metaContent['meta:keywords'][0]['ch']['meta:keyword'])) {
                foreach ($metaContent['meta:keywords'][0]['ch']['meta:keyword'] as $kwDat) {
                    $contentArr['keywords'] .= $kwDat['values'][0] . ' ';
                }
            }
        }
        return $contentArr;
    }

    /**
     * Returns the first value of the first $tagName child in an xml2tree() child list.
     *
     * @param array $children The "ch" array of an xml2tree() node
     * @param string $tagName Tag name to look up, eg. "dc:title"
     * @return string The value, empty string if the tag or value is missing
     */
    protected function getFirstNodeValue(array $children, $tagName) {
        return isset($children[$tagName][0]['values'][0]) ? $children[$tagName][0]['values'][0] : '';
    }

    /**
     * Executes a shell command and returns its output lines joined by line feeds.
     *
     * @param string $cmd Complete command line; arguments have to be escaped by the caller
     * @return string Output of the command
     */
    protected function executeAndJoinOutput($cmd) {
        $output = array();
        CommandUtility::exec($cmd, $output);
        return implode(LF, $output);
    }

    /**
     * Runs pdfinfo on a file and returns its parsed output.
     *
     * @param string $absFile Absolute filename of the PDF file
     * @return array Result of splitPdfInfo(), eg. with a "pages" key
     */
    protected function readPdfInfo($absFile) {
        $output = array();
        CommandUtility::exec($this->app['pdfinfo'] . ' ' . escapeshellarg($absFile), $output);
        return $this->splitPdfInfo($output);
    }

    /**
     * Sets the locale for LC_CTYPE to $TYPO3_CONF_VARS['SYS']['systemLocale']
     * if $TYPO3_CONF_VARS['SYS']['UTF8filesystem'] is set.
     *
     * Parameter <code>$resetLocale</code> has to be FALSE and TRUE alternating for all calls.
     * The previous locale is kept in a static variable, so the alternation is
     * tracked across all instances of this class.
     *
     * @staticvar string $lastLocale Stores the locale used before it is overridden by this method.
     * @param bool $resetLocale TRUE resets the locale to $lastLocale.
     * @return void
     * @throws \RuntimeException If the calls do not alternate as described above
     */
    protected function setLocaleForServerFileSystem($resetLocale = FALSE) {
        static $lastLocale = NULL;
        if (!$GLOBALS['TYPO3_CONF_VARS']['SYS']['UTF8filesystem']) {
            return;
        }

        if ($resetLocale) {
            if ($lastLocale == NULL) {
                throw new \RuntimeException('Cannot reset locale to NULL.', 1357064326);
            }
            setlocale(LC_CTYPE, $lastLocale);
            $lastLocale = NULL;
        } else {
            if ($lastLocale !== NULL) {
                throw new \RuntimeException('Cannot set new locale as locale has already been changed before.', 1357064437);
            }
            // Passing 0 only queries the current locale
            $lastLocale = setlocale(LC_CTYPE, 0);
            setlocale(LC_CTYPE, $GLOBALS['TYPO3_CONF_VARS']['SYS']['systemLocale']);
        }
    }

    /**
     * Creates an array with pointers to divisions of document.
     *
     * ONLY for PDF files at this point. All other types will have an array with a single element with the value "0" (zero)
     * coming back. The same is returned for a PDF file if the pdfinfo tool has
     * not been registered by initParser() or reports no pages.
     *
     * @param string $ext File extension
     * @param string $absFile Absolute filename (must exist and be validated OK before calling function)
     * @return array Array of pointers to sections that the document should be divided into, for PDF in the form "<first page>-<last page>"
     */
    public function fileContentParts($ext, $absFile) {
        $cParts = array(0);
        // Without a registered pdfinfo binary the command line would consist of the file name only.
        if ((string)$ext !== 'pdf' || empty($this->app['pdfinfo'])) {
            return $cParts;
        }
        $this->setLocaleForServerFileSystem();
        $pdfInfo = $this->readPdfInfo($absFile);
        $pageCount = isset($pdfInfo['pages']) ? (int)$pdfInfo['pages'] : 0;
        if ($pageCount) {
            $cParts = array();
            // Calculate the number of sections
            if ($this->pdf_mode > 0) {
                $iter = (int)ceil($pageCount / $this->pdf_mode);
            } else {
                $iter = MathUtility::forceIntegerInRange(abs($this->pdf_mode), 1, $pageCount);
            }
            // Traverse and create intervals. The product is calculated before the division,
            // so rounding of the quotient cannot shift a section boundary.
            for ($a = 0; $a < $iter; $a++) {
                $low = (int)floor(($a * $pageCount) / $iter) + 1;
                $high = (int)floor((($a + 1) * $pageCount) / $iter);
                $cParts[] = $low . '-' . $high;
            }
        }
        $this->setLocaleForServerFileSystem(TRUE);
        return $cParts;
    }

    /**
     * Analysing PDF info into a useable format.
     *
     * Every "Key: value" line becomes an entry with the lowercased, trimmed
     * key; lines without a colon or with an empty key are skipped.
     * Intended for internal use, although the method is public.
     *
     * @param array $pdfInfoArray Array of PDF content, coming from the pdfinfo tool
     * @return array Result array
     * @see fileContentParts()
     */
    public function splitPdfInfo($pdfInfoArray) {
        $res = array();
        if (!is_array($pdfInfoArray)) {
            return $res;
        }
        foreach ($pdfInfoArray as $line) {
            // Split at the first colon only; values (eg. dates) may contain colons.
            $parts = explode(':', $line, 2);
            if (count($parts) > 1 && trim($parts[0])) {
                $res[strtolower(trim($parts[0]))] = trim($parts[1]);
            }
        }
        return $res;
    }

    /**
     * Removes some strange char(12) characters and line breaks that tend to occur in the end of the string from external files.
     *
     * @param string $string String to clean up
     * @return string String without the trailing junk and without surrounding whitespace
     */
    public function removeEndJunk($string) {
        return trim(preg_replace('/[' . LF . chr(12) . ']*$/', '', $string));
    }

    /************************
     *
     * Backend analyzer
     *
     ************************/
    /**
     * Return icon for file extension
     *
     * "htm" and "jpeg" share the icons of "html" and "jpg".
     *
     * @param string $extension File extension, lowercase.
     * @return string Relative file reference, resolvable by GeneralUtility::getFileAbsFileName()
     */
    public function getIcon($extension) {
        if ($extension === 'htm') {
            $extension = 'html';
        } elseif ($extension === 'jpeg') {
            $extension = 'jpg';
        }
        return 'EXT:indexed_search/Resources/Public/Icons/FileTypes/' . $extension . '.gif';
    }

}