[!!!][TASK] Rewrite backend modules of indexed_search
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / Classes / FileContentParser.php
1 <?php
2 namespace TYPO3\CMS\IndexedSearch;
3
4 /**
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16 /**
17 * External standard parsers for indexed_search
18 *
19 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
20 * @coauthor Olivier Simah <noname_paris@yahoo.fr>
21 */
22 /**
23 * External standard parsers for indexed_search
24 * MUST RETURN utf-8 content!
25 *
26 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
27 */
28 class FileContentParser {
29
30 // $pdf_mode: how PDF files are split into sections. The default below is replaced by the "pdf_mode" indexer setting (clamped to -100..100) when initParser() finds the PDF tools.
31 public $pdf_mode = -20;
32
33 // (Describes $pdf_mode above.) Zero: whole PDF file is indexed in one. Positive value: number of pages at a time, e.g. "5" would mean 1-5,6-10,.... Negative integer: its absolute value is the number of groups, e.g. "-3" for a 10 page document gives 1-3,4-6,7-10 (see fileContentParts()).
34 // $app: tool name (e.g. "pdfinfo") => path of the executable. Filled by initParser():
35 public $app = array();
36
// $ext2itemtype_map: file extension => item type, written by initParser() (e.g. "htm" => "html", "jpg" => "jpeg").
37 public $ext2itemtype_map = array();
38
// $supportedExtensions: file extension => TRUE for every extension initParser() accepted; checked by readFileContent().
39 public $supportedExtensions = array();
40
// $pObj: reference to parent object (indexer class); used here for logging and for splitting/converting content.
41 public $pObj;
42
43 // (Describes $pObj above.) Reference to parent object (indexer class)
// $langObject: object whose sL() method resolves labels; assigned in the constructor.
44 protected $langObject;
45
46 // (Describes $langObject above.) Reference to LANG-Object
/**
 * Creates the parser object and selects the language object that is used
 * for resolving labels, depending on the current TYPO3_MODE.
 */
public function __construct() {
	if (TYPO3_MODE == 'FE') {
		// Frontend context: labels are resolved through TSFE
		$this->langObject = $GLOBALS['TSFE'];
	} else {
		// Any other context: labels are resolved through LANG
		$this->langObject = $GLOBALS['LANG'];
	}
}
54
/**
 * Initialize external parser for parsing content.
 *
 * Checks the indexer configuration for the given file extension. For file
 * types that need an external tool, the tool must be configured and its
 * executable(s) must exist; their paths are then stored in $this->app.
 * On success the extension is registered in $this->supportedExtensions and
 * $this->ext2itemtype_map.
 *
 * @param string $extension File extension (lowercase)
 * @return bool Returns TRUE if extension is supported/enabled, otherwise FALSE.
 */
public function initParser($extension) {
	// Cast once so that all lookups below are strict string comparisons
	// (a loose switch would let e.g. integer 0 match 'pdf' before PHP 8).
	$extension = (string)$extension;
	// Read indexer-config; a missing or broken value is treated as "nothing configured"
	$indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
	if (!is_array($indexerConfig)) {
		$indexerConfig = array();
	}
	// If windows, apply extension to tool name:
	$exe = TYPO3_OS == 'WIN' ? '.exe' : '';
	// Ignore extensions
	$ignoreList = isset($indexerConfig['ignoreExtensions']) ? strtolower($indexerConfig['ignoreExtensions']) : '';
	$ignoreExtensions = \TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode(',', $ignoreList, TRUE);
	if (in_array($extension, $ignoreExtensions, TRUE)) {
		$this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/locallang.xlf:ignoreExtensions'), $extension), 1);
		return FALSE;
	}
	// File types that need external tools:
	// extension => array(config key, label prefix, required executables).
	// The label prefix yields the locallang keys "<prefix>NotFound" and "<prefix>Disabled",
	// i.e. pdfToolsNotFound/pdfToolsDisabled, catdocNotFound/catdocDisabled, ppthtmlNotFound/...,
	// xlhtmlNotFound/..., unzipNotFound/..., unrtfNotFound/...
	$externalTools = array(
		'pdf' => array('pdftools', 'pdfTools', array('pdfinfo', 'pdftotext')),
		'doc' => array('catdoc', 'catdoc', array('catdoc')),
		'pps' => array('ppthtml', 'ppthtml', array('ppthtml')),
		'ppt' => array('ppthtml', 'ppthtml', array('ppthtml')),
		'xls' => array('xlhtml', 'xlhtml', array('xlhtml')),
		'sxc' => array('unzip', 'unzip', array('unzip')),
		'sxi' => array('unzip', 'unzip', array('unzip')),
		'sxw' => array('unzip', 'unzip', array('unzip')),
		'ods' => array('unzip', 'unzip', array('unzip')),
		'odp' => array('unzip', 'unzip', array('unzip')),
		'odt' => array('unzip', 'unzip', array('unzip')),
		'rtf' => array('unrtf', 'unrtf', array('unrtf'))
	);
	// File types handled with PHP only: extension => common item_type
	$builtInTypes = array(
		'txt' => 'txt',
		'csv' => 'csv',
		'xml' => 'xml',
		'tif' => 'tif',
		'html' => 'html',
		'htm' => 'html',
		'jpg' => 'jpeg',
		'jpeg' => 'jpeg'
	);
	if (isset($externalTools[$extension])) {
		list($configKey, $labelPrefix, $executables) = $externalTools[$extension];
		if (!$this->registerExternalTools($indexerConfig, $configKey, $labelPrefix, $executables, $exe)) {
			return FALSE;
		}
		if ($extension === 'pdf') {
			// PDF mode:
			$configuredPdfMode = isset($indexerConfig['pdf_mode']) ? $indexerConfig['pdf_mode'] : 0;
			$this->pdf_mode = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($configuredPdfMode, -100, 100);
		}
		$itemType = $extension;
	} elseif (isset($builtInTypes[$extension])) {
		$itemType = $builtInTypes[$extension];
	} else {
		// Unknown file type
		return FALSE;
	}
	$this->supportedExtensions[$extension] = TRUE;
	$this->ext2itemtype_map[$extension] = $itemType;
	return TRUE;
}

/**
 * Registers the executables of one external tool in $this->app.
 *
 * Logs a message through the parent object and registers nothing if the tool
 * is not configured or if one of its executables is missing.
 *
 * @param array $indexerConfig Unserialized indexer configuration
 * @param string $configKey Key in $indexerConfig holding the tool directory
 * @param string $labelPrefix Prefix of the "...NotFound" / "...Disabled" label keys
 * @param array $executables Names of the executables that all have to exist
 * @param string $exe Suffix appended to every executable name (".exe" on Windows)
 * @return bool TRUE if all executables were found and registered
 */
protected function registerExternalTools(array $indexerConfig, $configKey, $labelPrefix, array $executables, $exe) {
	$labelBase = 'LLL:EXT:indexed_search/locallang.xlf:' . $labelPrefix;
	if (empty($indexerConfig[$configKey])) {
		$this->pObj->log_setTSlogMessage($this->sL($labelBase . 'Disabled'), 1);
		return FALSE;
	}
	$toolPath = rtrim($indexerConfig[$configKey], '/') . '/';
	$foundExecutables = array();
	foreach ($executables as $executable) {
		$executablePath = $toolPath . $executable . $exe;
		if (!@is_file($executablePath)) {
			$this->pObj->log_setTSlogMessage(sprintf($this->sL($labelBase . 'NotFound'), $toolPath), 3);
			return FALSE;
		}
		$foundExecutables[$executable] = $executablePath;
	}
	// Only register once every required executable is known to exist
	foreach ($foundExecutables as $executable => $executablePath) {
		$this->app[$executable] = $executablePath;
	}
	return TRUE;
}
212
/**
 * Initialize external parser for backend modules
 * Doesn't evaluate if parser is configured right - more like returning POSSIBLE supported extensions (for showing icons etc) in backend and frontend plugin
 *
 * @param string $extension File extension to initialize for.
 * @return bool Returns TRUE if the extension is one of the potentially supported file types, otherwise FALSE.
 */
public function softInit($extension) {
	$knownExtensions = array(
		'pdf', 'doc', 'pps', 'ppt', 'xls',
		'sxc', 'sxi', 'sxw', 'ods', 'odp', 'odt',
		'rtf', 'txt', 'html', 'htm', 'csv', 'xml',
		'jpg', 'jpeg', 'tif'
	);
	// Strict string comparison; always yields a real boolean, also for unknown extensions
	return in_array((string)$extension, $knownExtensions, TRUE);
}
266
/**
 * Return title of entry in media type selector box.
 *
 * No title is returned for extensions listed in the "ignoreExtensions"
 * setting, for unknown extensions and for file types whose external tool
 * is not configured.
 *
 * @param string $extension File extension
 * @return string|bool String with label value of entry in media type search selector box (frontend plugin), FALSE if there is no entry for the extension.
 */
public function searchTypeMediaTitle($extension) {
	$extension = (string)$extension;
	// Read indexer-config; a missing or broken value is treated as "nothing configured"
	$indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
	if (!is_array($indexerConfig)) {
		$indexerConfig = array();
	}
	// Ignore extensions
	$ignoreList = isset($indexerConfig['ignoreExtensions']) ? strtolower($indexerConfig['ignoreExtensions']) : '';
	$ignoreExtensions = \TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode(',', $ignoreList, TRUE);
	if (in_array($extension, $ignoreExtensions, TRUE)) {
		return FALSE;
	}
	// extension => array(suffix of the "extension.*" label, config key that must be set or NULL if no tool is needed)
	$mediaTypes = array(
		'pdf' => array('PDF', 'pdftools'),
		'doc' => array('DOC', 'catdoc'),
		'pps' => array('PP', 'ppthtml'),
		'ppt' => array('PP', 'ppthtml'),
		'xls' => array('XLS', 'xlhtml'),
		'sxc' => array('SXC', 'unzip'),
		'sxi' => array('SXI', 'unzip'),
		'sxw' => array('SXW', 'unzip'),
		'ods' => array('ODS', 'unzip'),
		'odp' => array('ODP', 'unzip'),
		'odt' => array('ODT', 'unzip'),
		'rtf' => array('RTF', 'unrtf'),
		'jpeg' => array('Images', NULL),
		'jpg' => array('Images', NULL),
		'tif' => array('Images', NULL),
		'html' => array('HTML', NULL),
		'htm' => array('HTML', NULL),
		'txt' => array('TXT', NULL),
		'csv' => array('CSV', NULL),
		'xml' => array('XML', NULL)
	);
	if (!isset($mediaTypes[$extension])) {
		return FALSE;
	}
	list($labelSuffix, $requiredConfigKey) = $mediaTypes[$extension];
	if ($requiredConfigKey !== NULL && empty($indexerConfig[$requiredConfigKey])) {
		// The external tool for this file type is not configured
		return FALSE;
	}
	return sprintf($this->sL('LLL:EXT:indexed_search/locallang.xlf:extension.' . $labelSuffix), $extension);
}
383
/**
 * Returns TRUE if the input extension (item_type) is potentially a multi-page extension.
 * Only "pdf" qualifies.
 *
 * @param string $extension Extension / item_type string
 * @return bool TRUE if multi-page, otherwise FALSE
 */
public function isMultiplePageExtension($extension) {
	// Strict comparison on the string value; always yields a real boolean
	return (string)$extension === 'pdf';
}
398
/**
 * Resolves a label by delegating to the sL() method of the language object
 * that was selected in the constructor.
 *
 * @param string $reference Reference/key of the label
 * @param bool $useHtmlSpecialChar Passed through to the language object: convert special chars to HTML entities (default: FALSE)
 * @return string The label of the reference/key to be fetched
 */
protected function sL($reference, $useHtmlSpecialChar = FALSE) {
	$label = $this->langObject->sL($reference, $useHtmlSpecialChar);
	return $label;
}
409
410 /************************
411 *
412 * Reading documents (for parsing)
413 *
414 ************************/
/**
 * Reads the content of an external file being indexed.
 *
 * The extension must have been accepted by initParser() before. Depending on
 * the file type the content is extracted with an external tool or with plain
 * PHP and is then split into the standard content array by the parent object.
 *
 * @param string $ext File extension, eg. "pdf", "doc" etc.
 * @param string $absFile Absolute filename of file (must exist and be validated OK before calling function)
 * @param string $cPKey Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be split, e.g. "1-5")
 * @return array|bool|NULL Standard content array (title, description, keywords, body keys), FALSE if the extension is not supported, NULL if no content could be extracted
 */
public function readFileContent($ext, $absFile, $cPKey) {
	$contentArr = NULL;
	// Return immediately if initialization didn't set support up:
	if (empty($this->supportedExtensions[$ext])) {
		return FALSE;
	}
	// Switch by file extension
	switch ($ext) {
		case 'pdf':
			if (!empty($this->app['pdfinfo'])) {
				$this->setLocaleForServerFileSystem();
				// Getting pdf-info:
				$cmd = $this->app['pdfinfo'] . ' ' . escapeshellarg($absFile);
				\TYPO3\CMS\Core\Utility\CommandUtility::exec($cmd, $res);
				$pdfInfo = $this->splitPdfInfo($res);
				unset($res);
				if (isset($pdfInfo['pages']) && (int)$pdfInfo['pages']) {
					// The page bounds end up in a shell command, so they are forced to integers.
					// A pointer without "-" (e.g. "0") gives 0 for both bounds.
					list($low, $high) = array_pad(explode('-', (string)$cPKey), 2, 0);
					$low = (int)$low;
					$high = (int)$high;
					// Create temporary name
					$tempFileName = \TYPO3\CMS\Core\Utility\GeneralUtility::tempnam('Typo3_indexer');
					// Delete if exists, just to be safe.
					@unlink($tempFileName);
					// Get pdf content:
					$cmd = $this->app['pdftotext'] . ' -f ' . $low . ' -l ' . $high . ' -enc UTF-8 -q ' . escapeshellarg($absFile) . ' ' . escapeshellarg($tempFileName);
					\TYPO3\CMS\Core\Utility\CommandUtility::exec($cmd);
					$content = '';
					if (@is_file($tempFileName)) {
						$content = \TYPO3\CMS\Core\Utility\GeneralUtility::getUrl($tempFileName);
						unlink($tempFileName);
					} else {
						$this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/locallang.xlf:pdfToolsFailed'), $absFile), 2);
					}
					if (strlen($content)) {
						$contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
					}
				}
				$this->setLocaleForServerFileSystem(TRUE);
			}
			break;
		case 'doc':
			if (!empty($this->app['catdoc'])) {
				$this->setLocaleForServerFileSystem();
				$cmd = $this->app['catdoc'] . ' -d utf-8 ' . escapeshellarg($absFile);
				\TYPO3\CMS\Core\Utility\CommandUtility::exec($cmd, $res);
				$content = implode(LF, $res);
				unset($res);
				$contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
				$this->setLocaleForServerFileSystem(TRUE);
			}
			break;
		case 'pps':
		case 'ppt':
			if (!empty($this->app['ppthtml'])) {
				$this->setLocaleForServerFileSystem();
				$cmd = $this->app['ppthtml'] . ' ' . escapeshellarg($absFile);
				\TYPO3\CMS\Core\Utility\CommandUtility::exec($cmd, $res);
				$content = implode(LF, $res);
				unset($res);
				$content = $this->pObj->convertHTMLToUtf8($content);
				$contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
				$contentArr['title'] = basename($absFile);
				$this->setLocaleForServerFileSystem(TRUE);
			}
			break;
		case 'xls':
			if (!empty($this->app['xlhtml'])) {
				$this->setLocaleForServerFileSystem();
				$cmd = $this->app['xlhtml'] . ' -nc -te ' . escapeshellarg($absFile);
				\TYPO3\CMS\Core\Utility\CommandUtility::exec($cmd, $res);
				$content = implode(LF, $res);
				unset($res);
				$content = $this->pObj->convertHTMLToUtf8($content);
				$contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
				$contentArr['title'] = basename($absFile);
				$this->setLocaleForServerFileSystem(TRUE);
			}
			break;
		case 'sxi':
		case 'sxc':
		case 'sxw':
		case 'ods':
		case 'odp':
		case 'odt':
			if (!empty($this->app['unzip'])) {
				$this->setLocaleForServerFileSystem();
				// Read content.xml:
				$cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' content.xml';
				\TYPO3\CMS\Core\Utility\CommandUtility::exec($cmd, $res);
				$content_xml = implode(LF, $res);
				unset($res);
				// Read meta.xml:
				$cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' meta.xml';
				\TYPO3\CMS\Core\Utility\CommandUtility::exec($cmd, $res);
				$meta_xml = implode(LF, $res);
				unset($res);
				$utf8_content = trim(strip_tags(str_replace('<', ' <', $content_xml)));
				$contentArr = $this->pObj->splitRegularContent($utf8_content);
				// Make sure the title doesn't expose the absolute path!
				$contentArr['title'] = basename($absFile);
				// Meta information; only descend into the tree if the expected nodes exist
				$metaTree = \TYPO3\CMS\Core\Utility\GeneralUtility::xml2tree($meta_xml);
				$metaContent = NULL;
				if (is_array($metaTree) && isset($metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'])) {
					$metaContent = $metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'];
				}
				if (is_array($metaContent)) {
					if (!empty($metaContent['dc:title'][0]['values'][0])) {
						$contentArr['title'] = $metaContent['dc:title'][0]['values'][0];
					}
					$metaSubject = isset($metaContent['dc:subject'][0]['values'][0]) ? $metaContent['dc:subject'][0]['values'][0] : '';
					$metaDescription = isset($metaContent['dc:description'][0]['values'][0]) ? $metaContent['dc:description'][0]['values'][0] : '';
					$contentArr['description'] = $metaSubject . ' ' . $metaDescription;
					// Keywords collected:
					if (isset($metaContent['meta:keywords'][0]['ch']['meta:keyword']) && is_array($metaContent['meta:keywords'][0]['ch']['meta:keyword'])) {
						if (!isset($contentArr['keywords'])) {
							$contentArr['keywords'] = '';
						}
						foreach ($metaContent['meta:keywords'][0]['ch']['meta:keyword'] as $kwDat) {
							$keyword = isset($kwDat['values'][0]) ? $kwDat['values'][0] : '';
							$contentArr['keywords'] .= $keyword . ' ';
						}
					}
				}
				$this->setLocaleForServerFileSystem(TRUE);
			}
			break;
		case 'rtf':
			if (!empty($this->app['unrtf'])) {
				$this->setLocaleForServerFileSystem();
				$cmd = $this->app['unrtf'] . ' ' . escapeshellarg($absFile);
				\TYPO3\CMS\Core\Utility\CommandUtility::exec($cmd, $res);
				$fileContent = implode(LF, $res);
				unset($res);
				$fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
				$contentArr = $this->pObj->splitHTMLContent($fileContent);
				$this->setLocaleForServerFileSystem(TRUE);
			}
			break;
		case 'txt':
		case 'csv':
			$this->setLocaleForServerFileSystem();
			// Raw text
			$content = \TYPO3\CMS\Core\Utility\GeneralUtility::getUrl($absFile);
			// TODO: Implement auto detection of charset (currently assuming utf-8)
			$contentCharset = 'utf-8';
			$content = $this->pObj->convertHTMLToUtf8($content, $contentCharset);
			$contentArr = $this->pObj->splitRegularContent($content);
			// Make sure the title doesn't expose the absolute path!
			$contentArr['title'] = basename($absFile);
			$this->setLocaleForServerFileSystem(TRUE);
			break;
		case 'html':
		case 'htm':
			$fileContent = \TYPO3\CMS\Core\Utility\GeneralUtility::getUrl($absFile);
			$fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
			$contentArr = $this->pObj->splitHTMLContent($fileContent);
			break;
		case 'xml':
			$this->setLocaleForServerFileSystem();
			// PHP strip-tags()
			$fileContent = \TYPO3\CMS\Core\Utility\GeneralUtility::getUrl($absFile);
			// Finding charset:
			$reg = array();
			preg_match('/^[[:space:]]*<\\?xml[^>]+encoding[[:space:]]*=[[:space:]]*["\'][[:space:]]*([[:alnum:]_-]+)[[:space:]]*["\']/i', substr($fileContent, 0, 200), $reg);
			$charset = !empty($reg[1]) ? $this->pObj->csObj->parse_charset($reg[1]) : 'utf-8';
			// Converting content:
			$fileContent = $this->pObj->convertHTMLToUtf8(strip_tags(str_replace('<', ' <', $fileContent)), $charset);
			$contentArr = $this->pObj->splitRegularContent($fileContent);
			// Make sure the title doesn't expose the absolute path!
			$contentArr['title'] = basename($absFile);
			$this->setLocaleForServerFileSystem(TRUE);
			break;
		case 'jpg':
		case 'jpeg':
		case 'tif':
			$this->setLocaleForServerFileSystem();
			// PHP EXIF
			if (function_exists('exif_read_data')) {
				$exif = @exif_read_data($absFile, 'IFD0');
			} else {
				$exif = FALSE;
			}
			if ($exif) {
				$exifComment = isset($exif['COMMENT'][0]) ? $exif['COMMENT'][0] : '';
				$exifDescription = isset($exif['ImageDescription']) ? $exif['ImageDescription'] : '';
				$comment = trim($exifComment . ' ' . $exifDescription);
			} else {
				$comment = '';
			}
			$contentArr = $this->pObj->splitRegularContent($comment);
			// Make sure the title doesn't expose the absolute path!
			$contentArr['title'] = basename($absFile);
			$this->setLocaleForServerFileSystem(TRUE);
			break;
		default:
			return FALSE;
	}
	// If no title (and why should there be...) then the file-name is set as title. This will raise the hits considerably if the search matches the document name.
	if (is_array($contentArr) && empty($contentArr['title'])) {
		// Substituting "_" for " " because many filenames may have this instead of a space char.
		$contentArr['title'] = str_replace('_', ' ', basename($absFile));
	}
	return $contentArr;
}
622
/**
 * Sets the locale for LC_CTYPE to $TYPO3_CONF_VARS['SYS']['systemLocale']
 * if $TYPO3_CONF_VARS['SYS']['UTF8filesystem'] is set.
 *
 * Parameter <code>$resetLocale</code> has to be FALSE and TRUE alternating for all calls.
 *
 * @staticvar string|bool|NULL $lastLocale Locale that was active before it was overridden by this method; NULL while no override is active, FALSE if the previous locale could not be determined.
 * @param bool $resetLocale TRUE resets the locale to $lastLocale.
 * @return void
 * @throws \RuntimeException if the calls do not alternate as described above
 */
protected function setLocaleForServerFileSystem($resetLocale = FALSE) {
	static $lastLocale = NULL;
	if (empty($GLOBALS['TYPO3_CONF_VARS']['SYS']['UTF8filesystem'])) {
		return;
	}

	if ($resetLocale) {
		// Strict check: only NULL means "no override active". A loose comparison would also
		// match FALSE (failed locale query) and then never clear the marker again.
		if ($lastLocale === NULL) {
			throw new \RuntimeException('Cannot reset locale to NULL.', 1357064326);
		}
		// Nothing to restore if the previous locale could not be determined
		if ($lastLocale !== FALSE) {
			setlocale(LC_CTYPE, $lastLocale);
		}
		$lastLocale = NULL;
	} else {
		if ($lastLocale !== NULL) {
			throw new \RuntimeException('Cannot set new locale as locale has already been changed before.', 1357064437);
		}
		// "0" only queries the current setting without changing it
		$lastLocale = setlocale(LC_CTYPE, '0');
		setlocale(LC_CTYPE, $GLOBALS['TYPO3_CONF_VARS']['SYS']['systemLocale']);
	}
}
654
/**
 * Creates an array with pointers to divisions of document.
 *
 * ONLY for PDF files at this point. All other types will have an array with a single element with the value "0" (zero)
 * coming back. The same is returned for PDF files if the pdfinfo tool is not registered or reports no pages.
 *
 * For PDF files the page count is read with the pdfinfo tool and split according to $this->pdf_mode:
 * a positive value is the number of pages per part, zero or a negative value is (as absolute value,
 * at least 1 and at most the page count) the number of parts.
 *
 * @param string $ext File extension
 * @param string $absFile Absolute filename (must exist and be validated OK before calling function)
 * @return array Array of pointers to sections that the document should be divided into, e.g. array("1-5", "6-10")
 */
public function fileContentParts($ext, $absFile) {
	$cParts = array(0);
	// Without a registered pdfinfo tool there is no command to run
	if ((string)$ext !== 'pdf' || empty($this->app['pdfinfo'])) {
		return $cParts;
	}
	$this->setLocaleForServerFileSystem();
	// Getting pdf-info:
	$cmd = $this->app['pdfinfo'] . ' ' . escapeshellarg($absFile);
	\TYPO3\CMS\Core\Utility\CommandUtility::exec($cmd, $res);
	$pdfInfo = $this->splitPdfInfo($res);
	unset($res);
	$pages = isset($pdfInfo['pages']) ? (int)$pdfInfo['pages'] : 0;
	if ($pages > 0) {
		$cParts = array();
		// Calculate mode
		if ($this->pdf_mode > 0) {
			$iter = (int)ceil($pages / $this->pdf_mode);
		} else {
			$iter = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange(abs($this->pdf_mode), 1, $pages);
		}
		// Traverse and create intervals.
		// Multiply before dividing: "$a * ($pages / $iter)" can round to just below a whole
		// number (e.g. 49 * (64 / 49) gives 63.99...), which would drop the last page.
		for ($a = 0; $a < $iter; $a++) {
			$low = (int)floor($a * $pages / $iter) + 1;
			$high = (int)floor(($a + 1) * $pages / $iter);
			$cParts[] = $low . '-' . $high;
		}
	}
	$this->setLocaleForServerFileSystem(TRUE);
	return $cParts;
}
696
/**
 * Turns the output lines of the pdfinfo tool into a key/value array.
 *
 * Every line is split at its first colon. The lowercased, trimmed text before
 * the colon becomes the key and the trimmed text after it the value. Lines
 * without a colon or without a key are skipped.
 *
 * @param array $pdfInfoArray Array of PDF content, coming from the pdfinfo tool
 * @return array Result array; empty if the input is not an array
 * @access private
 * @see fileContentParts()
 */
public function splitPdfInfo($pdfInfoArray) {
	$parsedInfo = array();
	if (!is_array($pdfInfoArray)) {
		return $parsedInfo;
	}
	foreach ($pdfInfoArray as $infoLine) {
		$keyAndValue = explode(':', $infoLine, 2);
		if (count($keyAndValue) <= 1) {
			// No colon in this line
			continue;
		}
		$key = trim($keyAndValue[0]);
		if (!$key) {
			continue;
		}
		$parsedInfo[strtolower($key)] = trim($keyAndValue[1]);
	}
	return $parsedInfo;
}
717
/**
 * Removes line feeds and char(12) characters that tend to occur at the end of
 * the string from external files, then trims the result.
 *
 * @param string $string String to clean up
 * @return string Cleaned string
 */
public function removeEndJunk($string) {
	$junkCharacters = LF . chr(12);
	$withoutEndJunk = preg_replace('/[' . $junkCharacters . ']*$/', '', $string);
	return trim($withoutEndJunk);
}
727
728 /************************
729 *
730 * Backend analyzer
731 *
732 ************************/
/**
 * Return icon for file extension.
 * "htm" uses the "html" icon and "jpeg" uses the "jpg" icon.
 *
 * @param string $extension File extension, lowercase.
 * @return string Relative file reference, resolvable by \TYPO3\CMS\Core\Utility\GeneralUtility::getFileAbsFileName()
 */
public function getIcon($extension) {
	$iconName = $extension === 'htm' ? 'html' : ($extension === 'jpeg' ? 'jpg' : $extension);
	return 'EXT:indexed_search/pi/res/' . $iconName . '.gif';
}
747
748 }