0c197daa04cc98830f6f5e22343f04b931ba0ea2
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / Classes / FileContentParser.php
1 <?php
2 namespace TYPO3\CMS\IndexedSearch;
3
4 /*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 /**
18 * External standard parsers for indexed_search
19 *
20 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
21 * @coauthor Olivier Simah <noname_paris@yahoo.fr>
22 */
23 /**
24 * External standard parsers for indexed_search
25 * MUST RETURN utf-8 content!
26 *
27 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
28 */
29 class FileContentParser {
30
/**
 * PDF splitting mode; overridden from the extension configuration in initParser().
 *
 * Zero: the whole PDF file is indexed in one section.
 * Positive value: number of pages per section, eg. "5" means 1-5, 6-10, ...
 * Negative value: the absolute value is the number of sections the pages are
 * divided into, eg. "-3" on a 10 page document gives 1-3, 4-6, 7-10.
 *
 * @var int
 */
public $pdf_mode = -20;

/**
 * Absolute paths of the external tools, keyed by tool name (eg. "pdfinfo").
 * This array is configured in initialization (see initParser()).
 *
 * @var array
 */
public $app = array();

/**
 * Maps a file extension to the item type it is registered as
 * (eg. "htm" => "html"); filled by initParser().
 *
 * @var array
 */
public $ext2itemtype_map = array();

/**
 * File extensions that initParser() accepted, as extension => TRUE.
 *
 * @var array
 */
public $supportedExtensions = array();

/**
 * Reference to parent object (indexer class)
 *
 * @var object
 */
public $pObj;

/**
 * Reference to LANG-Object (set by the constructor)
 *
 * @var object
 */
protected $langObject;

/**
 * Creates the parser and selects the language object used for label lookups.
 */
public function __construct() {
    // Frontend context resolves labels through TSFE, any other context through LANG.
    if (TYPO3_MODE == 'FE') {
        $this->langObject = $GLOBALS['TSFE'];
    } else {
        $this->langObject = $GLOBALS['LANG'];
    }
}
55
/**
 * Initialize external parser for parsing content.
 *
 * Reads the indexed_search extension configuration, verifies that the external
 * tool(s) required for the given file extension are configured and present and,
 * on success, registers the extension in $this->supportedExtensions and
 * $this->ext2itemtype_map.
 *
 * @param string $extension File extension
 * @return bool Returns TRUE if extension is supported/enabled, otherwise FALSE.
 */
public function initParser($extension) {
    // Read indexer-config:
    $indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
    // Ignore extensions
    $ignoreExtensions = \TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode(',', strtolower($indexerConfig['ignoreExtensions']), TRUE);
    if (in_array($extension, $ignoreExtensions)) {
        $this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/locallang.xlf:ignoreExtensions'), $extension), 1);
        return FALSE;
    }
    $extOK = FALSE;
    // Common item_type for extensions that are aliases of each other; empty means "use the extension itself".
    $mainExtension = '';
    // Switch on file extension:
    switch ($extension) {
        case 'pdf':
            // PDF: needs both pdfinfo and pdftotext
            $extOK = $this->registerExternalTools($indexerConfig['pdftools'], array('pdfinfo', 'pdftotext'), 'pdfTools');
            if ($extOK) {
                // PDF mode:
                $this->pdf_mode = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($indexerConfig['pdf_mode'], -100, 100);
            }
            break;
        case 'doc':
            // MS Word: catdoc
            $extOK = $this->registerExternalTools($indexerConfig['catdoc'], array('catdoc'), 'catdoc');
            break;
        case 'pps':
        case 'ppt':
            // MS PowerPoint: ppthtml
            $extOK = $this->registerExternalTools($indexerConfig['ppthtml'], array('ppthtml'), 'ppthtml');
            break;
        case 'xls':
            // MS Excel: xlhtml
            $extOK = $this->registerExternalTools($indexerConfig['xlhtml'], array('xlhtml'), 'xlhtml');
            break;
        case 'sxc':
        case 'sxi':
        case 'sxw':
        case 'ods':
        case 'odp':
        case 'odt':
            // OpenOffice / Oasis OpenDocument: unzip
            $extOK = $this->registerExternalTools($indexerConfig['unzip'], array('unzip'), 'unzip');
            break;
        case 'rtf':
            // Rich text: unrtf
            $extOK = $this->registerExternalTools($indexerConfig['unrtf'], array('unrtf'), 'unrtf');
            break;
        case 'txt':
        case 'csv':
        case 'xml':
        case 'tif':
            // No external tool required
            $extOK = TRUE;
            break;
        case 'html':
        case 'htm':
            // No external tool required; making "html" the common "item_type"
            $extOK = TRUE;
            $mainExtension = 'html';
            break;
        case 'jpg':
        case 'jpeg':
            // No external tool required; making "jpeg" the common item_type
            $extOK = TRUE;
            $mainExtension = 'jpeg';
            break;
    }
    if (!$extOK) {
        // Explicit FALSE as documented (previously the method fell through and returned NULL).
        return FALSE;
    }
    $this->supportedExtensions[$extension] = TRUE;
    $this->ext2itemtype_map[$extension] = $mainExtension ?: $extension;
    return TRUE;
}

/**
 * Checks that a set of external tools is configured and present and, if so,
 * stores the executable paths in $this->app (keyed by tool name).
 *
 * A log message is written through the parent object when the tools are
 * disabled (no path configured) or when an executable is missing.
 *
 * @param string $configuredPath Directory configured for the tool(s); empty means disabled
 * @param array $toolNames Names of the executables that must all exist in that directory
 * @param string $labelPrefix Prefix of the locallang keys "<prefix>Disabled" and "<prefix>NotFound"
 * @return bool TRUE if all executables were found and registered, otherwise FALSE
 */
protected function registerExternalTools($configuredPath, array $toolNames, $labelPrefix) {
    if (!$configuredPath) {
        $this->pObj->log_setTSlogMessage($this->sL('LLL:EXT:indexed_search/locallang.xlf:' . $labelPrefix . 'Disabled'), 1);
        return FALSE;
    }
    // If windows, apply extension to tool name:
    $exe = TYPO3_OS == 'WIN' ? '.exe' : '';
    $toolPath = rtrim($configuredPath, '/') . '/';
    foreach ($toolNames as $toolName) {
        if (!@is_file($toolPath . $toolName . $exe)) {
            $this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/locallang.xlf:' . $labelPrefix . 'NotFound'), $toolPath), 3);
            return FALSE;
        }
    }
    // Register only after every required executable has been found.
    foreach ($toolNames as $toolName) {
        $this->app[$toolName] = $toolPath . $toolName . $exe;
    }
    return TRUE;
}
213
/**
 * Initialize external parser for backend modules
 * Doesn't evaluate if the parser is configured right - it only tells whether the
 * extension is one this class could POSSIBLY support (for showing icons etc. in
 * backend and frontend plugin).
 *
 * @param string $extension File extension to initialize for.
 * @return bool Returns TRUE if the extension is potentially supported, otherwise FALSE.
 */
public function softInit($extension) {
    $knownExtensions = array(
        'pdf', 'doc', 'pps', 'ppt', 'xls',
        'sxc', 'sxi', 'sxw', 'ods', 'odp', 'odt',
        'rtf', 'txt', 'html', 'htm', 'csv', 'xml',
        'jpg', 'jpeg', 'tif'
    );
    // Strict string comparison: non-string input such as integer 0 must not match an extension,
    // and unknown extensions yield an explicit FALSE instead of NULL.
    return in_array((string)$extension, $knownExtensions, TRUE);
}
267
/**
 * Return title of entry in media type selector box.
 *
 * An extension only gets a title if it is not listed in the "ignoreExtensions"
 * configuration and, for types that depend on an external tool, if the tool
 * path is configured.
 *
 * @param string $extension File extension
 * @return string|bool Label of the entry in the media type search selector box (frontend plugin), or FALSE if the extension has no entry.
 */
public function searchTypeMediaTitle($extension) {
    // Read indexer-config
    $indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
    // Ignore extensions
    $ignoreExtensions = \TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode(',', strtolower($indexerConfig['ignoreExtensions']), TRUE);
    if (in_array($extension, $ignoreExtensions)) {
        return FALSE;
    }
    // extension => array(suffix of the "extension.*" label, required config option or NULL if none)
    $mediaTypes = array(
        'pdf' => array('PDF', 'pdftools'),
        'doc' => array('DOC', 'catdoc'),
        'pps' => array('PP', 'ppthtml'),
        'ppt' => array('PP', 'ppthtml'),
        'xls' => array('XLS', 'xlhtml'),
        'sxc' => array('SXC', 'unzip'),
        'sxi' => array('SXI', 'unzip'),
        'sxw' => array('SXW', 'unzip'),
        'ods' => array('ODS', 'unzip'),
        'odp' => array('ODP', 'unzip'),
        'odt' => array('ODT', 'unzip'),
        'rtf' => array('RTF', 'unrtf'),
        'jpeg' => array('Images', NULL),
        'jpg' => array('Images', NULL),
        'tif' => array('Images', NULL),
        'html' => array('HTML', NULL),
        'htm' => array('HTML', NULL),
        'txt' => array('TXT', NULL),
        'csv' => array('CSV', NULL),
        'xml' => array('XML', NULL)
    );
    $lookupKey = (string)$extension;
    if (!isset($mediaTypes[$lookupKey])) {
        return FALSE;
    }
    list($labelSuffix, $requiredOption) = $mediaTypes[$lookupKey];
    // Tool-dependent types are only offered when the tool path is configured.
    if ($requiredOption !== NULL && empty($indexerConfig[$requiredOption])) {
        return FALSE;
    }
    return sprintf($this->sL('LLL:EXT:indexed_search/locallang.xlf:extension.' . $labelSuffix), $extension);
}
384
/**
 * Returns TRUE if the input extension (item_type) is potentially a multi-page extension
 *
 * Only "pdf" is treated as multi-page.
 *
 * @param string $extension Extension / item_type string
 * @return bool TRUE if multi-page, otherwise FALSE
 */
public function isMultiplePageExtension($extension) {
    // Explicit boolean result; previously non-PDF input fell through and returned NULL.
    return (string)$extension === 'pdf';
}
399
/**
 * Resolves a label by delegating to the sL() method of the language object
 * chosen in the constructor.
 *
 * @param string $reference Reference/key of the label
 * @param bool $useHtmlSpecialChar Passed through to the language object: convert special chars to HTML entities (default: FALSE)
 * @return string The label of the reference/key to be fetched
 */
protected function sL($reference, $useHtmlSpecialChar = FALSE) {
    $label = $this->langObject->sL($reference, $useHtmlSpecialChar);
    return $label;
}
410
411 /************************
412 *
413 * Reading documents (for parsing)
414 *
415 ************************/
/**
 * Reads the content of an external file being indexed.
 *
 * The file is converted with the external tool (or PHP function) registered for
 * its extension and the result is split into a content array by the parent object.
 *
 * @param string $ext File extension, eg. "pdf", "doc" etc.
 * @param string $absFile Absolute filename of file (must exist and be validated OK before calling function)
 * @param string $cPKey Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be split, "<first page>-<last page>")
 * @return array|bool|NULL Standard content array (title, description, keywords, body keys); FALSE if the extension is not supported; NULL if no content could be extracted
 */
public function readFileContent($ext, $absFile, $cPKey) {
    // Return immediately if initialization didn't set support up:
    if (empty($this->supportedExtensions[$ext])) {
        return FALSE;
    }
    // Stays NULL when the required tool is not registered or yields no content.
    $contentArr = NULL;
    // Switch by file extension
    switch ($ext) {
        case 'pdf':
            if (!empty($this->app['pdfinfo'])) {
                $this->setLocaleForServerFileSystem();
                // Getting pdf-info:
                $cmd = $this->app['pdfinfo'] . ' ' . escapeshellarg($absFile);
                \TYPO3\CMS\Core\Utility\CommandUtility::exec($cmd, $res);
                $pdfInfo = $this->splitPdfInfo($res);
                unset($res);
                if (isset($pdfInfo['pages']) && (int)$pdfInfo['pages']) {
                    // Page range; forced to integers because the values end up on a shell command line.
                    list($low, $high) = array_pad(explode('-', $cPKey), 2, 0);
                    $low = (int)$low;
                    $high = (int)$high;
                    // Create temporary name and delete the file if it exists, just to be safe.
                    $tempFileName = \TYPO3\CMS\Core\Utility\GeneralUtility::tempnam('Typo3_indexer');
                    @unlink($tempFileName);
                    // Get pdf content:
                    $cmd = $this->app['pdftotext'] . ' -f ' . $low . ' -l ' . $high . ' -enc UTF-8 -q ' . escapeshellarg($absFile) . ' ' . escapeshellarg($tempFileName);
                    \TYPO3\CMS\Core\Utility\CommandUtility::exec($cmd);
                    $content = '';
                    if (@is_file($tempFileName)) {
                        $content = \TYPO3\CMS\Core\Utility\GeneralUtility::getUrl($tempFileName);
                        unlink($tempFileName);
                    } else {
                        $this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/locallang.xlf:pdfToolsFailed'), $absFile), 2);
                    }
                    if ((string)$content !== '') {
                        $contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
                    }
                }
                $this->setLocaleForServerFileSystem(TRUE);
            }
            break;
        case 'doc':
            if (!empty($this->app['catdoc'])) {
                $this->setLocaleForServerFileSystem();
                $cmd = $this->app['catdoc'] . ' -d utf-8 ' . escapeshellarg($absFile);
                \TYPO3\CMS\Core\Utility\CommandUtility::exec($cmd, $res);
                $content = implode(LF, $res);
                unset($res);
                $contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
                $this->setLocaleForServerFileSystem(TRUE);
            }
            break;
        case 'pps':
        case 'ppt':
            if (!empty($this->app['ppthtml'])) {
                $this->setLocaleForServerFileSystem();
                $cmd = $this->app['ppthtml'] . ' ' . escapeshellarg($absFile);
                \TYPO3\CMS\Core\Utility\CommandUtility::exec($cmd, $res);
                $content = implode(LF, $res);
                unset($res);
                $content = $this->pObj->convertHTMLToUtf8($content);
                $contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
                $contentArr['title'] = basename($absFile);
                $this->setLocaleForServerFileSystem(TRUE);
            }
            break;
        case 'xls':
            if (!empty($this->app['xlhtml'])) {
                $this->setLocaleForServerFileSystem();
                $cmd = $this->app['xlhtml'] . ' -nc -te ' . escapeshellarg($absFile);
                \TYPO3\CMS\Core\Utility\CommandUtility::exec($cmd, $res);
                $content = implode(LF, $res);
                unset($res);
                $content = $this->pObj->convertHTMLToUtf8($content);
                $contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
                $contentArr['title'] = basename($absFile);
                $this->setLocaleForServerFileSystem(TRUE);
            }
            break;
        case 'sxi':
        case 'sxc':
        case 'sxw':
        case 'ods':
        case 'odp':
        case 'odt':
            if (!empty($this->app['unzip'])) {
                $this->setLocaleForServerFileSystem();
                // Read content.xml:
                $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' content.xml';
                \TYPO3\CMS\Core\Utility\CommandUtility::exec($cmd, $res);
                $content_xml = implode(LF, $res);
                unset($res);
                // Read meta.xml:
                $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' meta.xml';
                \TYPO3\CMS\Core\Utility\CommandUtility::exec($cmd, $res);
                $meta_xml = implode(LF, $res);
                unset($res);
                // A space is put in front of every tag so that stripping tags does not glue words together.
                $utf8_content = trim(strip_tags(str_replace('<', ' <', $content_xml)));
                $contentArr = $this->pObj->splitRegularContent($utf8_content);
                // Make sure the title doesn't expose the absolute path!
                $contentArr['title'] = basename($absFile);
                // Meta information
                $metaContent = \TYPO3\CMS\Core\Utility\GeneralUtility::xml2tree($meta_xml);
                $metaContent = isset($metaContent['office:document-meta'][0]['ch']['office:meta'][0]['ch'])
                    ? $metaContent['office:document-meta'][0]['ch']['office:meta'][0]['ch']
                    : NULL;
                if (is_array($metaContent)) {
                    $metaTitle = isset($metaContent['dc:title'][0]['values'][0]) ? $metaContent['dc:title'][0]['values'][0] : '';
                    $metaSubject = isset($metaContent['dc:subject'][0]['values'][0]) ? $metaContent['dc:subject'][0]['values'][0] : '';
                    $metaDescription = isset($metaContent['dc:description'][0]['values'][0]) ? $metaContent['dc:description'][0]['values'][0] : '';
                    // The document's own title wins over the file name when present.
                    if ($metaTitle) {
                        $contentArr['title'] = $metaTitle;
                    }
                    $contentArr['description'] = $metaSubject . ' ' . $metaDescription;
                    // Keywords collected:
                    if (isset($metaContent['meta:keywords'][0]['ch']['meta:keyword']) && is_array($metaContent['meta:keywords'][0]['ch']['meta:keyword'])) {
                        foreach ($metaContent['meta:keywords'][0]['ch']['meta:keyword'] as $kwDat) {
                            $contentArr['keywords'] .= $kwDat['values'][0] . ' ';
                        }
                    }
                }
                $this->setLocaleForServerFileSystem(TRUE);
            }
            break;
        case 'rtf':
            if (!empty($this->app['unrtf'])) {
                $this->setLocaleForServerFileSystem();
                $cmd = $this->app['unrtf'] . ' ' . escapeshellarg($absFile);
                \TYPO3\CMS\Core\Utility\CommandUtility::exec($cmd, $res);
                $fileContent = implode(LF, $res);
                unset($res);
                $fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
                $contentArr = $this->pObj->splitHTMLContent($fileContent);
                $this->setLocaleForServerFileSystem(TRUE);
            }
            break;
        case 'txt':
        case 'csv':
            $this->setLocaleForServerFileSystem();
            // Raw text
            $content = \TYPO3\CMS\Core\Utility\GeneralUtility::getUrl($absFile);
            // @todo Implement auto detection of charset (currently assuming utf-8)
            $contentCharset = 'utf-8';
            $content = $this->pObj->convertHTMLToUtf8($content, $contentCharset);
            $contentArr = $this->pObj->splitRegularContent($content);
            // Make sure the title doesn't expose the absolute path!
            $contentArr['title'] = basename($absFile);
            $this->setLocaleForServerFileSystem(TRUE);
            break;
        case 'html':
        case 'htm':
            $fileContent = \TYPO3\CMS\Core\Utility\GeneralUtility::getUrl($absFile);
            $fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
            $contentArr = $this->pObj->splitHTMLContent($fileContent);
            break;
        case 'xml':
            $this->setLocaleForServerFileSystem();
            // PHP strip-tags()
            $fileContent = \TYPO3\CMS\Core\Utility\GeneralUtility::getUrl($absFile);
            // Finding charset from the XML declaration within the first 200 bytes:
            $reg = array();
            preg_match('/^[[:space:]]*<\\?xml[^>]+encoding[[:space:]]*=[[:space:]]*["\'][[:space:]]*([[:alnum:]_-]+)[[:space:]]*["\']/i', substr($fileContent, 0, 200), $reg);
            $charset = !empty($reg[1]) ? $this->pObj->csObj->parse_charset($reg[1]) : 'utf-8';
            // Converting content:
            $fileContent = $this->pObj->convertHTMLToUtf8(strip_tags(str_replace('<', ' <', $fileContent)), $charset);
            $contentArr = $this->pObj->splitRegularContent($fileContent);
            // Make sure the title doesn't expose the absolute path!
            $contentArr['title'] = basename($absFile);
            $this->setLocaleForServerFileSystem(TRUE);
            break;
        case 'jpg':
        case 'jpeg':
        case 'tif':
            $this->setLocaleForServerFileSystem();
            // PHP EXIF: only the comment and image description are indexed
            $comment = '';
            $exif = function_exists('exif_read_data') ? @exif_read_data($absFile, 'IFD0') : FALSE;
            if ($exif) {
                $exifComment = isset($exif['COMMENT'][0]) ? $exif['COMMENT'][0] : '';
                $exifDescription = isset($exif['ImageDescription']) ? $exif['ImageDescription'] : '';
                $comment = trim($exifComment . ' ' . $exifDescription);
            }
            $contentArr = $this->pObj->splitRegularContent($comment);
            // Make sure the title doesn't expose the absolute path!
            $contentArr['title'] = basename($absFile);
            $this->setLocaleForServerFileSystem(TRUE);
            break;
        default:
            return FALSE;
    }
    // If no title (and why should there be...) then the file-name is set as title. This will raise the hits considerably if the search matches the document name.
    if (is_array($contentArr) && !$contentArr['title']) {
        // Substituting "_" for " " because many filenames may have this instead of a space char.
        $contentArr['title'] = str_replace('_', ' ', basename($absFile));
    }
    return $contentArr;
}
623
/**
 * Switches LC_CTYPE to $TYPO3_CONF_VARS['SYS']['systemLocale'] and back again.
 * Does nothing unless $TYPO3_CONF_VARS['SYS']['UTF8filesystem'] is set.
 *
 * Calls have to alternate: first without argument (or FALSE) to switch the
 * locale, then with TRUE to restore the locale that was active before.
 *
 * @staticvar string $lastLocale Holds the locale that was active before the switch.
 * @param bool $resetLocale TRUE restores the locale stored in $lastLocale.
 * @return void
 * @throws \RuntimeException if the calls do not alternate as described above
 */
protected function setLocaleForServerFileSystem($resetLocale = FALSE) {
    static $lastLocale = NULL;
    $utf8FileSystem = $GLOBALS['TYPO3_CONF_VARS']['SYS']['UTF8filesystem'];
    if (!$utf8FileSystem) {
        return;
    }

    if (!$resetLocale) {
        // Switching: refuse to overwrite a locale that has not been restored yet.
        if ($lastLocale !== NULL) {
            throw new \RuntimeException('Cannot set new locale as locale has already been changed before.', 1357064437);
        }
        $lastLocale = setlocale(LC_CTYPE, 0);
        setlocale(LC_CTYPE, $GLOBALS['TYPO3_CONF_VARS']['SYS']['systemLocale']);
        return;
    }

    // Restoring: there must be a remembered locale to go back to.
    if ($lastLocale == NULL) {
        throw new \RuntimeException('Cannot reset locale to NULL.', 1357064326);
    }
    setlocale(LC_CTYPE, $lastLocale);
    $lastLocale = NULL;
}
655
/**
 * Creates an array with pointers to divisions of document.
 *
 * ONLY for PDF files at this point. All other types will have an array with a single element with the value "0" (zero)
 * coming back. The same single-element array is returned for a PDF when no pdfinfo
 * tool is registered or when pdfinfo reports no pages.
 *
 * For PDF files every pointer has the form "<first page>-<last page>"; how the
 * pages are grouped is controlled by $this->pdf_mode.
 *
 * @param string $ext File extension
 * @param string $absFile Absolute filename (must exist and be validated OK before calling function)
 * @return array Array of pointers to sections that the document should be divided into
 */
public function fileContentParts($ext, $absFile) {
    $cParts = array(0);
    // Without a registered pdfinfo tool the command line would consist of the file
    // name only, i.e. the shell would be asked to execute the document itself.
    if ((string)$ext !== 'pdf' || empty($this->app['pdfinfo'])) {
        return $cParts;
    }
    $this->setLocaleForServerFileSystem();
    // Getting pdf-info:
    $cmd = $this->app['pdfinfo'] . ' ' . escapeshellarg($absFile);
    \TYPO3\CMS\Core\Utility\CommandUtility::exec($cmd, $res);
    $pdfInfo = $this->splitPdfInfo($res);
    unset($res);
    $pageCount = isset($pdfInfo['pages']) ? (int)$pdfInfo['pages'] : 0;
    if ($pageCount > 0) {
        $cParts = array();
        // Calculate the number of sections:
        if ($this->pdf_mode > 0) {
            // Positive mode: fixed number of pages per section
            $iter = ceil($pageCount / $this->pdf_mode);
        } else {
            // Zero/negative mode: absolute value is the number of sections (at least 1, at most one per page)
            $iter = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange(abs($this->pdf_mode), 1, $pageCount);
        }
        // Traverse and create intervals.
        for ($a = 0; $a < $iter; $a++) {
            $low = (int)floor($a * ($pageCount / $iter)) + 1;
            $high = (int)floor(($a + 1) * ($pageCount / $iter));
            $cParts[] = $low . '-' . $high;
        }
    }
    $this->setLocaleForServerFileSystem(TRUE);
    return $cParts;
}
697
/**
 * Turns the output lines of the pdfinfo tool into a key/value array.
 *
 * Each line is split at its first colon; the part before it (trimmed and
 * lowercased) becomes the key, the trimmed remainder the value. Lines without
 * a colon or with an empty key are skipped.
 *
 * @param array $pdfInfoArray Array of PDF content, coming from the pdfinfo tool
 * @return array Result array (empty if the input is not an array)
 * @access private
 * @see fileContentParts()
 */
public function splitPdfInfo($pdfInfoArray) {
    $info = array();
    if (!is_array($pdfInfoArray)) {
        return $info;
    }
    foreach ($pdfInfoArray as $infoLine) {
        $keyAndValue = explode(':', $infoLine, 2);
        if (count($keyAndValue) <= 1) {
            continue;
        }
        $key = trim($keyAndValue[0]);
        if (!$key) {
            continue;
        }
        $info[strtolower($key)] = trim($keyAndValue[1]);
    }
    return $info;
}
718
/**
 * Removes the line breaks and form feed characters (char 12) that tend to occur
 * at the end of strings coming from external files, then trims the result.
 *
 * @param string $string String to clean up
 * @return string Cleaned string
 */
public function removeEndJunk($string) {
    $trailingJunkPattern = '/[' . LF . chr(12) . ']*$/';
    $withoutJunk = preg_replace($trailingJunkPattern, '', $string);
    return trim($withoutJunk);
}
728
729 /************************
730 *
731 * Backend analyzer
732 *
733 ************************/
/**
 * Return icon for file extension
 *
 * "htm" shares the icon of "html" and "jpeg" the icon of "jpg"; every other
 * extension is used as the icon file name unchanged.
 *
 * @param string $extension File extension, lowercase.
 * @return string Relative file reference, resolvable by \TYPO3\CMS\Core\Utility\GeneralUtility::getFileAbsFileName()
 */
public function getIcon($extension) {
    // Extensions that reuse the icon of another extension.
    $iconAliases = array(
        'htm' => 'html',
        'jpeg' => 'jpg'
    );
    $iconName = $extension;
    if (is_string($extension) && isset($iconAliases[$extension])) {
        $iconName = $iconAliases[$extension];
    }
    return 'EXT:indexed_search/pi/res/' . $iconName . '.gif';
}
748
749 }