[CLEANUP] Replace strlen() with === for zero length check
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / Classes / FileContentParser.php
1 <?php
2 namespace TYPO3\CMS\IndexedSearch;
3
4 /*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
/**
 * External standard parsers for indexed_search
 * MUST RETURN utf-8 content!
 *
 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
 * @coauthor Olivier Simah <noname_paris@yahoo.fr>
 */
class FileContentParser {

	/**
	 * PDF splitting mode. This default is overridden from the extension configuration in initParser().
	 *
	 * Zero: the whole PDF file is indexed in one part.
	 * Positive value: approximate number of pages per part, e.g. "5" on a 10 page file gives 1-5,6-10.
	 * Negative value: the absolute value is the number of parts, e.g. "-3" on a 10 page file gives 1-3,4-6,7-10.
	 *
	 * @var int
	 */
	public $pdf_mode = -20;

	/**
	 * Absolute paths of the external tools, keyed by tool name. Filled by initParser().
	 *
	 * @var array
	 */
	public $app = array();

	/**
	 * Maps a file extension to the item_type it is stored under (e.g. "htm" => "html"). Filled by initParser().
	 *
	 * @var array
	 */
	public $ext2itemtype_map = array();

	/**
	 * Extensions successfully initialized by initParser(), as extension => TRUE.
	 *
	 * @var array
	 */
	public $supportedExtensions = array();

	/**
	 * Reference to parent object (indexer class)
	 *
	 * @var object
	 */
	public $pObj;

	/**
	 * Reference to LANG-Object
	 *
	 * @var object
	 */
	protected $langObject;

	/**
	 * Constructs this external parsers object
	 */
	public function __construct() {
		// Set the language object to be used accordant to current TYPO3_MODE:
		$this->langObject = TYPO3_MODE === 'FE' ? $GLOBALS['TSFE'] : $GLOBALS['LANG'];
	}

	/**
	 * Initialize external parser for parsing content.
	 *
	 * On success the extension is recorded in $supportedExtensions and $ext2itemtype_map.
	 *
	 * @param string $extension File extension
	 * @return bool Returns TRUE if extension is supported/enabled, otherwise FALSE.
	 */
	public function initParser($extension) {
		$indexerConfig = $this->getIndexerConfiguration();
		if ($this->isIgnoredExtension($extension, $indexerConfig)) {
			$this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/locallang.xlf:ignoreExtensions'), $extension), 1);
			return FALSE;
		}
		$extOK = FALSE;
		$mainExtension = '';
		// Switch on file extension:
		switch ((string)$extension) {
			case 'pdf':
				// PDF needs both pdftotext and pdfinfo
				$extOK = $this->registerExternalTools($indexerConfig, 'pdftools', array('pdfinfo', 'pdftotext'), 'pdfTools');
				if ($extOK) {
					// PDF mode:
					$this->pdf_mode = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange(isset($indexerConfig['pdf_mode']) ? $indexerConfig['pdf_mode'] : 0, -100, 100);
				}
				break;
			case 'doc':
				// Catdoc
				$extOK = $this->registerExternalTools($indexerConfig, 'catdoc', array('catdoc'), 'catdoc');
				break;
			case 'pps':
			case 'ppt':
				// MS PowerPoint: ppthtml
				$extOK = $this->registerExternalTools($indexerConfig, 'ppthtml', array('ppthtml'), 'ppthtml');
				break;
			case 'xls':
				// MS Excel: xlhtml
				$extOK = $this->registerExternalTools($indexerConfig, 'xlhtml', array('xlhtml'), 'xlhtml');
				break;
			case 'sxc':
			case 'sxi':
			case 'sxw':
			case 'ods':
			case 'odp':
			case 'odt':
				// OpenOffice / OpenDocument files are read with unzip
				$extOK = $this->registerExternalTools($indexerConfig, 'unzip', array('unzip'), 'unzip');
				break;
			case 'rtf':
				// unrtf
				$extOK = $this->registerExternalTools($indexerConfig, 'unrtf', array('unrtf'), 'unrtf');
				break;
			case 'txt':
			case 'csv':
			case 'xml':
			case 'tif':
				// Handled with plain PHP, no external tool required
				$extOK = TRUE;
				break;
			case 'html':
			case 'htm':
				// PHP strip-tags(); making "html" the common "item_type"
				$extOK = TRUE;
				$mainExtension = 'html';
				break;
			case 'jpg':
			case 'jpeg':
				// PHP EXIF; making "jpeg" the common item_type
				$extOK = TRUE;
				$mainExtension = 'jpeg';
				break;
		}
		if (!$extOK) {
			return FALSE;
		}
		$this->supportedExtensions[$extension] = TRUE;
		$this->ext2itemtype_map[$extension] = $mainExtension ?: $extension;
		return TRUE;
	}

	/**
	 * Reads the unserialized extension configuration of indexed_search.
	 *
	 * The value comes from $GLOBALS['TYPO3_CONF_VARS'] (installation configuration), not from request data.
	 *
	 * @return array The configuration, or an empty array if it is missing or cannot be unserialized
	 */
	protected function getIndexerConfiguration() {
		$configuration = FALSE;
		if (isset($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']) && is_string($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search'])) {
			$configuration = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
		}
		return is_array($configuration) ? $configuration : array();
	}

	/**
	 * Checks whether the extension is listed in the "ignoreExtensions" configuration option.
	 *
	 * @param string $extension File extension
	 * @param array $indexerConfig Configuration as returned by getIndexerConfiguration()
	 * @return bool TRUE if the extension is to be ignored
	 */
	protected function isIgnoredExtension($extension, array $indexerConfig) {
		$configuredList = isset($indexerConfig['ignoreExtensions']) ? strtolower($indexerConfig['ignoreExtensions']) : '';
		$ignoreExtensions = \TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode(',', $configuredList, TRUE);
		return in_array((string)$extension, $ignoreExtensions, TRUE);
	}

	/**
	 * Registers the executables of an external tool in $this->app if the tool is configured and all its
	 * binaries exist. Writes a log message to the parent object if the tool is disabled or a binary is missing.
	 *
	 * @param array $indexerConfig Configuration as returned by getIndexerConfiguration()
	 * @param string $configKey Configuration key holding the directory of the tool
	 * @param array $binaries Names of the executables (without ".exe") that all have to exist in that directory
	 * @param string $labelPrefix Prefix of the locallang keys "<prefix>NotFound" and "<prefix>Disabled"
	 * @return bool TRUE if all binaries were found and registered
	 */
	protected function registerExternalTools(array $indexerConfig, $configKey, array $binaries, $labelPrefix) {
		if (empty($indexerConfig[$configKey])) {
			$this->pObj->log_setTSlogMessage($this->sL('LLL:EXT:indexed_search/locallang.xlf:' . $labelPrefix . 'Disabled'), 1);
			return FALSE;
		}
		// If windows, apply extension to tool name:
		$exe = TYPO3_OS === 'WIN' ? '.exe' : '';
		$toolPath = rtrim($indexerConfig[$configKey], '/') . '/';
		$resolved = array();
		foreach ($binaries as $binary) {
			$executable = $toolPath . $binary . $exe;
			if (!@is_file($executable)) {
				$this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/locallang.xlf:' . $labelPrefix . 'NotFound'), $toolPath), 3);
				return FALSE;
			}
			$resolved[$binary] = $executable;
		}
		// Only register once every binary of the tool has been found
		foreach ($resolved as $binary => $executable) {
			$this->app[$binary] = $executable;
		}
		return TRUE;
	}

	/**
	 * Initialize external parser for backend modules
	 * Doesn't evaluate if parser is configured right - more like returning POSSIBLE supported extensions (for showing icons etc) in backend and frontend plugin
	 *
	 * @param string $extension File extension to initialize for.
	 * @return bool Returns TRUE if the extension is one of the known extensions, otherwise FALSE.
	 */
	public function softInit($extension) {
		$knownExtensions = array(
			'pdf', 'doc', 'pps', 'ppt', 'xls',
			'sxc', 'sxi', 'sxw', 'ods', 'odp', 'odt',
			'rtf', 'txt', 'html', 'htm', 'csv', 'xml',
			'jpg', 'jpeg', 'tif'
		);
		return in_array((string)$extension, $knownExtensions, TRUE);
	}

	/**
	 * Return title of entry in media type selector box.
	 *
	 * @param string $extension File extension
	 * @return string|bool Label value of entry in media type search selector box (frontend plugin), or FALSE if the extension is ignored, unknown or its tool is not configured.
	 */
	public function searchTypeMediaTitle($extension) {
		// Read indexer-config
		$indexerConfig = $this->getIndexerConfiguration();
		if ($this->isIgnoredExtension($extension, $indexerConfig)) {
			return FALSE;
		}
		// extension => array(configuration key that has to be set (NULL: none), label suffix)
		$mediaTypes = array(
			'pdf' => array('pdftools', 'PDF'),
			'doc' => array('catdoc', 'DOC'),
			'pps' => array('ppthtml', 'PP'),
			'ppt' => array('ppthtml', 'PP'),
			'xls' => array('xlhtml', 'XLS'),
			'sxc' => array('unzip', 'SXC'),
			'sxi' => array('unzip', 'SXI'),
			'sxw' => array('unzip', 'SXW'),
			'ods' => array('unzip', 'ODS'),
			'odp' => array('unzip', 'ODP'),
			'odt' => array('unzip', 'ODT'),
			'rtf' => array('unrtf', 'RTF'),
			'jpeg' => array(NULL, 'Images'),
			'jpg' => array(NULL, 'Images'),
			'tif' => array(NULL, 'Images'),
			'html' => array(NULL, 'HTML'),
			'htm' => array(NULL, 'HTML'),
			'txt' => array(NULL, 'TXT'),
			'csv' => array(NULL, 'CSV'),
			'xml' => array(NULL, 'XML')
		);
		$key = (string)$extension;
		if (!isset($mediaTypes[$key])) {
			return FALSE;
		}
		list($requiredConfigKey, $labelSuffix) = $mediaTypes[$key];
		if ($requiredConfigKey !== NULL && empty($indexerConfig[$requiredConfigKey])) {
			return FALSE;
		}
		return sprintf($this->sL('LLL:EXT:indexed_search/locallang.xlf:extension.' . $labelSuffix), $extension);
	}

	/**
	 * Returns TRUE if the input extension (item_type) is a potentially a multi-page extension
	 *
	 * @param string $extension Extension / item_type string
	 * @return bool Return TRUE if multi-page (currently only "pdf")
	 */
	public function isMultiplePageExtension($extension) {
		return (string)$extension === 'pdf';
	}

	/**
	 * Wraps the "splitLabel function" of the language object.
	 *
	 * @param string $reference: Reference/key of the label
	 * @param bool $useHtmlSpecialChar: Convert special chars to HTML entities (default: FALSE)
	 * @return string The label of the reference/key to be fetched
	 */
	protected function sL($reference, $useHtmlSpecialChar = FALSE) {
		return $this->langObject->sL($reference, $useHtmlSpecialChar);
	}

	/************************
	 *
	 * Reading documents (for parsing)
	 *
	 ************************/
	/**
	 * Reads the content of an external file being indexed.
	 *
	 * @param string $ext File extension, eg. "pdf", "doc" etc.
	 * @param string $absFile Absolute filename of file (must exist and be validated OK before calling function)
	 * @param string $cPKey Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be split.)
	 * @return array|bool|NULL Standard content array (title, description, keywords, body keys), FALSE if the extension is not supported, NULL if no content could be read
	 */
	public function readFileContent($ext, $absFile, $cPKey) {
		// Return immediately if initialization didn't set support up:
		if (empty($this->supportedExtensions[$ext])) {
			return FALSE;
		}
		$contentArr = NULL;
		// Switch by file extension
		switch ($ext) {
			case 'pdf':
				if (!empty($this->app['pdfinfo'])) {
					$this->setLocaleForServerFileSystem();
					$pdfInfo = $this->getPdfInfo($absFile);
					if (isset($pdfInfo['pages']) && (int)$pdfInfo['pages']) {
						// $cPKey has the form "<first page>-<last page>". The numbers end up in a shell
						// command, so they are forced to integers.
						// NOTE(review): a missing last page becomes 0; presumably pdftotext treats that as "not set" - confirm.
						$pageRange = explode('-', (string)$cPKey, 2);
						$low = (int)$pageRange[0];
						$high = isset($pageRange[1]) ? (int)$pageRange[1] : 0;
						// Create temporary name and delete the file if it exists, just to be safe.
						$tempFileName = \TYPO3\CMS\Core\Utility\GeneralUtility::tempnam('Typo3_indexer');
						@unlink($tempFileName);
						// Get pdf content:
						$cmd = $this->app['pdftotext'] . ' -f ' . $low . ' -l ' . $high . ' -enc UTF-8 -q ' . escapeshellarg($absFile) . ' ' . escapeshellarg($tempFileName);
						\TYPO3\CMS\Core\Utility\CommandUtility::exec($cmd);
						if (@is_file($tempFileName)) {
							$content = \TYPO3\CMS\Core\Utility\GeneralUtility::getUrl($tempFileName);
							unlink($tempFileName);
						} else {
							$content = '';
							$this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/locallang.xlf:pdfToolsFailed'), $absFile), 2);
						}
						if ((string)$content !== '') {
							$contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
						}
					}
					$this->setLocaleForServerFileSystem(TRUE);
				}
				break;
			case 'doc':
				if (!empty($this->app['catdoc'])) {
					$this->setLocaleForServerFileSystem();
					$content = $this->executeAndJoinOutput($this->app['catdoc'] . ' -d utf-8 ' . escapeshellarg($absFile));
					$contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
					$this->setLocaleForServerFileSystem(TRUE);
				}
				break;
			case 'pps':
			case 'ppt':
				if (!empty($this->app['ppthtml'])) {
					$this->setLocaleForServerFileSystem();
					$content = $this->executeAndJoinOutput($this->app['ppthtml'] . ' ' . escapeshellarg($absFile));
					$content = $this->pObj->convertHTMLToUtf8($content);
					$contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
					$contentArr['title'] = basename($absFile);
					$this->setLocaleForServerFileSystem(TRUE);
				}
				break;
			case 'xls':
				if (!empty($this->app['xlhtml'])) {
					$this->setLocaleForServerFileSystem();
					$content = $this->executeAndJoinOutput($this->app['xlhtml'] . ' -nc -te ' . escapeshellarg($absFile));
					$content = $this->pObj->convertHTMLToUtf8($content);
					$contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
					$contentArr['title'] = basename($absFile);
					$this->setLocaleForServerFileSystem(TRUE);
				}
				break;
			case 'sxi':
			case 'sxc':
			case 'sxw':
			case 'ods':
			case 'odp':
			case 'odt':
				if (!empty($this->app['unzip'])) {
					$this->setLocaleForServerFileSystem();
					// Read content.xml:
					$content_xml = $this->executeAndJoinOutput($this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' content.xml');
					// Read meta.xml:
					$meta_xml = $this->executeAndJoinOutput($this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' meta.xml');
					$utf8_content = trim(strip_tags(str_replace('<', ' <', $content_xml)));
					$contentArr = $this->pObj->splitRegularContent($utf8_content);
					// Make sure the title doesn't expose the absolute path!
					$contentArr['title'] = basename($absFile);
					// Meta information; the parse result is only traversed if it is an array with the expected structure
					$metaTree = \TYPO3\CMS\Core\Utility\GeneralUtility::xml2tree($meta_xml);
					$metaContent = NULL;
					if (is_array($metaTree) && isset($metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'])) {
						$metaContent = $metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'];
					}
					if (is_array($metaContent)) {
						if (!empty($metaContent['dc:title'][0]['values'][0])) {
							$contentArr['title'] = $metaContent['dc:title'][0]['values'][0];
						}
						$subject = isset($metaContent['dc:subject'][0]['values'][0]) ? $metaContent['dc:subject'][0]['values'][0] : '';
						$description = isset($metaContent['dc:description'][0]['values'][0]) ? $metaContent['dc:description'][0]['values'][0] : '';
						$contentArr['description'] = $subject . ' ' . $description;
						// Keywords collected:
						if (isset($metaContent['meta:keywords'][0]['ch']['meta:keyword']) && is_array($metaContent['meta:keywords'][0]['ch']['meta:keyword'])) {
							foreach ($metaContent['meta:keywords'][0]['ch']['meta:keyword'] as $kwDat) {
								$contentArr['keywords'] .= $kwDat['values'][0] . ' ';
							}
						}
					}
					$this->setLocaleForServerFileSystem(TRUE);
				}
				break;
			case 'rtf':
				if (!empty($this->app['unrtf'])) {
					$this->setLocaleForServerFileSystem();
					$fileContent = $this->executeAndJoinOutput($this->app['unrtf'] . ' ' . escapeshellarg($absFile));
					$fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
					$contentArr = $this->pObj->splitHTMLContent($fileContent);
					$this->setLocaleForServerFileSystem(TRUE);
				}
				break;
			case 'txt':
			case 'csv':
				$this->setLocaleForServerFileSystem();
				// Raw text
				$content = \TYPO3\CMS\Core\Utility\GeneralUtility::getUrl($absFile);
				// @todo Implement auto detection of charset (currently assuming utf-8)
				$contentCharset = 'utf-8';
				$content = $this->pObj->convertHTMLToUtf8($content, $contentCharset);
				$contentArr = $this->pObj->splitRegularContent($content);
				// Make sure the title doesn't expose the absolute path!
				$contentArr['title'] = basename($absFile);
				$this->setLocaleForServerFileSystem(TRUE);
				break;
			case 'html':
			case 'htm':
				$fileContent = \TYPO3\CMS\Core\Utility\GeneralUtility::getUrl($absFile);
				$fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
				$contentArr = $this->pObj->splitHTMLContent($fileContent);
				break;
			case 'xml':
				$this->setLocaleForServerFileSystem();
				// PHP strip-tags()
				$fileContent = \TYPO3\CMS\Core\Utility\GeneralUtility::getUrl($absFile);
				// Finding charset in the XML declaration (looked up in the first 200 bytes only):
				$reg = array();
				preg_match('/^[[:space:]]*<\\?xml[^>]+encoding[[:space:]]*=[[:space:]]*["\'][[:space:]]*([[:alnum:]_-]+)[[:space:]]*["\']/i', substr($fileContent, 0, 200), $reg);
				$charset = !empty($reg[1]) ? $this->pObj->csObj->parse_charset($reg[1]) : 'utf-8';
				// Converting content:
				$fileContent = $this->pObj->convertHTMLToUtf8(strip_tags(str_replace('<', ' <', $fileContent)), $charset);
				$contentArr = $this->pObj->splitRegularContent($fileContent);
				// Make sure the title doesn't expose the absolute path!
				$contentArr['title'] = basename($absFile);
				$this->setLocaleForServerFileSystem(TRUE);
				break;
			case 'jpg':
			case 'jpeg':
			case 'tif':
				$this->setLocaleForServerFileSystem();
				// PHP EXIF
				$exif = function_exists('exif_read_data') ? @exif_read_data($absFile, 'IFD0') : FALSE;
				$comment = '';
				if ($exif) {
					$exifComment = isset($exif['COMMENT'][0]) ? $exif['COMMENT'][0] : '';
					$imageDescription = isset($exif['ImageDescription']) ? $exif['ImageDescription'] : '';
					$comment = trim($exifComment . ' ' . $imageDescription);
				}
				$contentArr = $this->pObj->splitRegularContent($comment);
				// Make sure the title doesn't expose the absolute path!
				$contentArr['title'] = basename($absFile);
				$this->setLocaleForServerFileSystem(TRUE);
				break;
			default:
				return FALSE;
		}
		// If no title (and why should there be...) then the file-name is set as title. This will raise the hits considerably if the search matches the document name.
		if (is_array($contentArr) && empty($contentArr['title'])) {
			// Substituting "_" for " " because many filenames may have this instead of a space char.
			$contentArr['title'] = str_replace('_', ' ', basename($absFile));
		}
		return $contentArr;
	}

	/**
	 * Executes a shell command and returns its output lines joined with line feeds.
	 *
	 * @param string $cmd The complete command; arguments must already be escaped
	 * @return string Output of the command
	 */
	protected function executeAndJoinOutput($cmd) {
		$outputLines = array();
		\TYPO3\CMS\Core\Utility\CommandUtility::exec($cmd, $outputLines);
		return implode(LF, (array)$outputLines);
	}

	/**
	 * Runs the registered pdfinfo tool on a file and returns its parsed output.
	 *
	 * @param string $absFile Absolute filename of the PDF file
	 * @return array Result of splitPdfInfo() for the tool output
	 * @see splitPdfInfo()
	 */
	protected function getPdfInfo($absFile) {
		$outputLines = array();
		$cmd = $this->app['pdfinfo'] . ' ' . escapeshellarg($absFile);
		\TYPO3\CMS\Core\Utility\CommandUtility::exec($cmd, $outputLines);
		return $this->splitPdfInfo($outputLines);
	}

	/**
	 * Sets the locale for LC_CTYPE to $TYPO3_CONF_VARS['SYS']['systemLocale']
	 * if $TYPO3_CONF_VARS['SYS']['UTF8filesystem'] is set.
	 *
	 * Parameter <code>$resetLocale</code> has to be FALSE and TRUE alternating for all calls.
	 *
	 * @staticvar string $lastLocale Stores the locale used before it is overridden by this method.
	 * @param bool $resetLocale TRUE resets the locale to $lastLocale.
	 * @return void
	 * @throws \RuntimeException if the calls are not alternating as described above
	 */
	protected function setLocaleForServerFileSystem($resetLocale = FALSE) {
		static $lastLocale = NULL;
		if (empty($GLOBALS['TYPO3_CONF_VARS']['SYS']['UTF8filesystem'])) {
			return;
		}
		if (!$resetLocale) {
			if ($lastLocale !== NULL) {
				throw new \RuntimeException('Cannot set new locale as locale has already been changed before.', 1357064437);
			}
			// Remember the current locale, then switch to the configured one
			$lastLocale = setlocale(LC_CTYPE, 0);
			setlocale(LC_CTYPE, $GLOBALS['TYPO3_CONF_VARS']['SYS']['systemLocale']);
			return;
		}
		// Loose comparison on purpose: also rejects a FALSE stored from a failed setlocale() query
		if ($lastLocale == NULL) {
			throw new \RuntimeException('Cannot reset locale to NULL.', 1357064326);
		}
		setlocale(LC_CTYPE, $lastLocale);
		$lastLocale = NULL;
	}

	/**
	 * Creates an array with pointers to divisions of document.
	 *
	 * ONLY for PDF files at this point. All other types will have an array with a single element with the value "0" (zero)
	 * coming back.
	 *
	 * @param string $ext File extension
	 * @param string $absFile Absolute filename (must exist and be validated OK before calling function)
	 * @return array Array of pointers to sections that the document should be divided into ("<first page>-<last page>" for PDF)
	 */
	public function fileContentParts($ext, $absFile) {
		$cParts = array(0);
		if ($ext !== 'pdf') {
			return $cParts;
		}
		$this->setLocaleForServerFileSystem();
		$pdfInfo = $this->getPdfInfo($absFile);
		$pageCount = isset($pdfInfo['pages']) ? (int)$pdfInfo['pages'] : 0;
		if ($pageCount) {
			$cParts = array();
			// Calculate the number of parts from the mode
			if ($this->pdf_mode > 0) {
				$iter = ceil($pageCount / $this->pdf_mode);
			} else {
				$iter = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange(abs($this->pdf_mode), 1, $pageCount);
			}
			// Traverse and create intervals.
			$pagesPerPart = $pageCount / $iter;
			for ($a = 0; $a < $iter; $a++) {
				$low = floor($a * $pagesPerPart) + 1;
				$high = floor(($a + 1) * $pagesPerPart);
				$cParts[] = $low . '-' . $high;
			}
		}
		$this->setLocaleForServerFileSystem(TRUE);
		return $cParts;
	}

	/**
	 * Analysing PDF info into a useable format.
	 *
	 * Every "Key: value" line becomes an entry with the lowercased, trimmed key. Lines without a colon
	 * or with an empty key are skipped.
	 *
	 * @param array $pdfInfoArray Array of PDF content, coming from the pdfinfo tool
	 * @return array Result array
	 * @access private
	 * @see fileContentParts()
	 */
	public function splitPdfInfo($pdfInfoArray) {
		$res = array();
		if (!is_array($pdfInfoArray)) {
			return $res;
		}
		foreach ($pdfInfoArray as $line) {
			$parts = explode(':', $line, 2);
			if (count($parts) < 2) {
				continue;
			}
			$key = trim($parts[0]);
			if ($key !== '') {
				$res[strtolower($key)] = trim($parts[1]);
			}
		}
		return $res;
	}

	/**
	 * Removes some strange char(12) characters and line breaks that then to occur in the end of the string from external files.
	 *
	 * @param string $string String to clean up
	 * @return string String
	 */
	public function removeEndJunk($string) {
		return trim(preg_replace('/[' . LF . chr(12) . ']*$/', '', $string));
	}

	/************************
	 *
	 * Backend analyzer
	 *
	 ************************/
	/**
	 * Return icon for file extension
	 *
	 * @param string $extension File extension, lowercase.
	 * @return string Relative file reference, resolvable by \TYPO3\CMS\Core\Utility\GeneralUtility::getFileAbsFileName()
	 */
	public function getIcon($extension) {
		// "htm" and "jpeg" share the icon of "html" and "jpg"
		if ($extension === 'htm') {
			$extension = 'html';
		} elseif ($extension === 'jpeg') {
			$extension = 'jpg';
		}
		return 'EXT:indexed_search/pi/res/' . $extension . '.gif';
	}

}