[!!!][BUGFIX] *_user table password field is too short
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / class.external_parser.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2001-2011 Kasper Skårhøj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the TYPO3 project. The TYPO3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 * A copy is found in the textfile GPL.txt and important notices to the license
17 * from the author is found in LICENSE.txt distributed with these scripts.
18 *
19 *
20 * This script is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
24 *
25 * This copyright notice MUST APPEAR in all copies of the script!
26 ***************************************************************/
27 /**
28 * External standard parsers for indexed_search
29 *
30 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
31 * @coauthor Olivier Simah <noname_paris@yahoo.fr>
32 */
33
34
35
36
37
38
39
40
41
42
43 /**
44 * External standard parsers for indexed_search
45 * MUST RETURN utf-8 content!
46 *
47 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
48 * @package TYPO3
49 * @subpackage tx_indexedsearch
50 */
51 class tx_indexed_search_extparse {
52
/**
 * PDF splitting mode; initParser() overrides this value from the extension configuration.
 * Zero: the whole PDF file is indexed as one part.
 * Positive value: number of pages per part, e.g. "5" means 1-5, 6-10, ...
 * Negative value: the absolute value is the number of parts, e.g. "-3" splits a 10 page document into 1-3, 4-6, 7-10.
 */
public $pdf_mode = -20;

/**
 * Paths of the external tools registered by initParser(), keyed by tool name (e.g. "pdfinfo").
 */
public $app = array();

/**
 * Maps each initialized file extension to its item type (e.g. "htm" => "html").
 */
public $ext2itemtype_map = array();

/**
 * File extensions that initParser() accepted; the extension is the key, the value is TRUE.
 */
public $supportedExtensions = array();

/**
 * Reference to parent object (indexer class)
 */
public $pObj;

/**
 * Reference to the language object (assigned in the constructor)
 */
protected $langObject;
63
/**
 * Creates the external parsers object and selects the language object
 * that matches the current TYPO3_MODE.
 */
public function __construct() {
		// Frontend requests resolve labels through TSFE, everything else through LANG:
	if (TYPO3_MODE == 'FE') {
		$this->langObject = $GLOBALS['TSFE'];
	} else {
		$this->langObject = $GLOBALS['LANG'];
	}
}
71
/**
 * Initialize external parser for parsing content.
 *
 * Reads the indexed_search extension configuration, rejects extensions on the
 * ignore list, verifies that the external tools needed for the extension are
 * configured and present, and registers the extension as supported.
 *
 * @param string File extension
 * @return boolean Returns TRUE if extension is supported/enabled, otherwise FALSE.
 */
function initParser($extension) {

		// Read indexer-config; a missing or unreadable configuration counts as "nothing configured":
	$indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
	if (!is_array($indexerConfig)) {
		$indexerConfig = array();
	}

		// Ignore extensions
	$ignoreList = isset($indexerConfig['ignoreExtensions']) ? strtolower($indexerConfig['ignoreExtensions']) : '';
	$ignoreExtensions = t3lib_div::trimExplode(',', $ignoreList, 1);
	if (in_array((string) $extension, $ignoreExtensions, TRUE)) {
		$this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/locallang.xml:ignoreExtensions'), $extension), 1);
		return FALSE;
	}

	$extOK = FALSE;
	$mainExtension = '';

		// Switch on file extension:
	switch ($extension) {
		case 'pdf':
				// PDF needs both pdfinfo and pdftotext
			$extOK = $this->initExternalTools($indexerConfig, 'pdftools', array('pdfinfo', 'pdftotext'), 'pdfTools');
			if ($extOK) {
					// PDF mode:
				$pdfMode = isset($indexerConfig['pdf_mode']) ? $indexerConfig['pdf_mode'] : 0;
				$this->pdf_mode = t3lib_utility_Math::forceIntegerInRange($pdfMode, -100, 100);
			}
			break;
		case 'doc':
				// Catdoc
			$extOK = $this->initExternalTools($indexerConfig, 'catdoc', array('catdoc'), 'catdoc');
			break;
		case 'pps': // MS PowerPoint(?)
		case 'ppt': // MS PowerPoint
				// ppthtml
			$extOK = $this->initExternalTools($indexerConfig, 'ppthtml', array('ppthtml'), 'ppthtml');
			break;
		case 'xls': // MS Excel
				// Xlhtml
			$extOK = $this->initExternalTools($indexerConfig, 'xlhtml', array('xlhtml'), 'xlhtml');
			break;
		case 'sxc': // Open Office Calc.
		case 'sxi': // Open Office Impress
		case 'sxw': // Open Office Writer
		case 'ods': // Oasis OpenDocument Spreadsheet
		case 'odp': // Oasis OpenDocument Presentation
		case 'odt': // Oasis OpenDocument Text
			$extOK = $this->initExternalTools($indexerConfig, 'unzip', array('unzip'), 'unzip');
			break;
		case 'rtf':
				// unrtf
			$extOK = $this->initExternalTools($indexerConfig, 'unrtf', array('unrtf'), 'unrtf');
			break;
		case 'txt': // Raw text
		case 'csv': // Raw text
		case 'xml': // PHP strip-tags()
		case 'tif': // PHP EXIF
			$extOK = TRUE;
			break;
		case 'html': // PHP strip-tags()
		case 'htm': // PHP strip-tags()
			$extOK = TRUE;
			$mainExtension = 'html'; // making "html" the common "item_type"
			break;
		case 'jpg': // PHP EXIF
		case 'jpeg': // PHP EXIF
			$extOK = TRUE;
			$mainExtension = 'jpeg'; // making "jpeg" the common item_type
			break;
	}

		// Unknown extension or missing tools: report failure explicitly instead of falling off the end.
	if (!$extOK) {
		return FALSE;
	}

		// If extension was OK:
	$this->supportedExtensions[$extension] = TRUE;
	$this->ext2itemtype_map[$extension] = $mainExtension ? $mainExtension : $extension;
	return TRUE;
}

/**
 * Checks that a group of external tools is configured and present, and registers
 * their paths in $this->app. Nothing is registered unless every tool is found.
 * Writes a log message through the parent object when the tools are disabled or missing.
 *
 * @param array $indexerConfig: Unserialized indexed_search extension configuration
 * @param string $configKey: Configuration key holding the directory of the tools
 * @param array $toolNames: Names of the executables that must exist in that directory
 * @param string $labelPrefix: Prefix of the locallang labels "<prefix>Disabled" and "<prefix>NotFound"
 * @return boolean TRUE if all tools were found and registered, otherwise FALSE
 */
protected function initExternalTools(array $indexerConfig, $configKey, array $toolNames, $labelPrefix) {
	if (empty($indexerConfig[$configKey])) {
		$this->pObj->log_setTSlogMessage($this->sL('LLL:EXT:indexed_search/locallang.xml:' . $labelPrefix . 'Disabled'), 1);
		return FALSE;
	}

		// If windows, apply extension to tool name:
	$exe = (TYPO3_OS == 'WIN') ? '.exe' : '';
	$toolPath = rtrim($indexerConfig[$configKey], '/') . '/';

	$foundTools = array();
	foreach ($toolNames as $toolName) {
		$toolFile = $toolPath . $toolName . $exe;
		if (!@is_file($toolFile)) {
			$this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/locallang.xml:' . $labelPrefix . 'NotFound'), $toolPath), 3);
			return FALSE;
		}
		$foundTools[$toolName] = $toolFile;
	}

	foreach ($foundTools as $toolName => $toolFile) {
		$this->app[$toolName] = $toolFile;
	}
	return TRUE;
}
210
/**
 * Initialize external parser for backend modules
 * Doesn't evaluate if parser is configured right - more like returning POSSIBLE supported extensions (for showing icons etc) in backend and frontend plugin
 *
 * @param string File extension to initialize for.
 * @return boolean Returns TRUE if the extension is one of the potentially supported ones, otherwise FALSE.
 */
function softInit($extension) {
	$possibleExtensions = array(
		'pdf', // PDF
		'doc', // MS Word files
		'pps', // MS PowerPoint
		'ppt', // MS PowerPoint
		'xls', // MS Excel
		'sxc', // Open Office Calc.
		'sxi', // Open Office Impress
		'sxw', // Open Office Writer
		'ods', // Oasis OpenDocument Spreadsheet
		'odp', // Oasis OpenDocument Presentation
		'odt', // Oasis OpenDocument Text
		'rtf', // RTF documents
		'txt', // ASCII Text documents
		'html', // HTML
		'htm', // HTML
		'csv', // Comma Separated Values
		'xml', // Generic XML
		'jpg', // Jpeg images (EXIF comment)
		'jpeg', // Jpeg images (EXIF comment)
		'tif', // TIF images (EXIF comment)
	);

		// Strict comparison: always yields a real boolean and never matches non-string input by type juggling.
	return in_array((string) $extension, $possibleExtensions, TRUE);
}
244
/**
 * Return title of entry in media type selector box.
 *
 * An entry is only returned when the extension is not on the ignore list and,
 * for types that depend on an external tool, when that tool is configured.
 * "htm" and "jpg" never get an entry of their own because "html" and "jpeg"
 * already provide it.
 *
 * @param string File extension
 * @return string String with label value of entry in media type search selector box (frontend plugin), or FALSE if there is no entry for the extension.
 */
function searchTypeMediaTitle($extension) {
	$extension = (string) $extension;

		// Read indexer-config; a missing or unreadable configuration counts as "nothing configured":
	$indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
	if (!is_array($indexerConfig)) {
		$indexerConfig = array();
	}

		// Ignore extensions
	$ignoreList = isset($indexerConfig['ignoreExtensions']) ? strtolower($indexerConfig['ignoreExtensions']) : '';
	$ignoreExtensions = t3lib_div::trimExplode(',', $ignoreList, 1);
	if (in_array($extension, $ignoreExtensions, TRUE)) {
		return FALSE;
	}

		// Extension => array(configuration key of the required tool or NULL, suffix of the locallang label)
	$mediaTypes = array(
		'pdf' => array('pdftools', 'PDF'),
		'doc' => array('catdoc', 'DOC'),
		'pps' => array('ppthtml', 'PP'), // MS PowerPoint(?)
		'ppt' => array('ppthtml', 'PP'), // MS PowerPoint
		'xls' => array('xlhtml', 'XLS'), // MS Excel
		'sxc' => array('unzip', 'SXC'), // Open Office Calc.
		'sxi' => array('unzip', 'SXI'), // Open Office Impress
		'sxw' => array('unzip', 'SXW'), // Open Office Writer
		'ods' => array('unzip', 'ODS'), // Oasis OpenDocument Spreadsheet
		'odp' => array('unzip', 'ODP'), // Oasis OpenDocument Presentation
		'odt' => array('unzip', 'ODT'), // Oasis OpenDocument Text
		'rtf' => array('unrtf', 'RTF'),
		'jpeg' => array(NULL, 'Images'), // PHP EXIF
		'tif' => array(NULL, 'Images'), // PHP EXIF
		'html' => array(NULL, 'HTML'), // PHP strip-tags()
		'txt' => array(NULL, 'TXT'), // Raw text
		'csv' => array(NULL, 'CSV'), // Raw text
		'xml' => array(NULL, 'XML'), // PHP strip-tags()
	);

		// NO entry (duplicates or blank):
	if (!isset($mediaTypes[$extension])) {
		return FALSE;
	}

	list($requiredTool, $labelSuffix) = $mediaTypes[$extension];
	if ($requiredTool !== NULL && empty($indexerConfig[$requiredTool])) {
		return FALSE;
	}

	return sprintf($this->sL('LLL:EXT:indexed_search/locallang.xml:extension.' . $labelSuffix), $extension);
}
348
/**
 * Returns TRUE if the input extension (item_type) is potentially a multi-page extension.
 * Only "pdf" is treated as multi-page.
 *
 * @param string Extension / item_type string
 * @return boolean TRUE if multi-page, otherwise FALSE
 */
function isMultiplePageExtension($extension) {
		// Always yields a real boolean (never NULL) for callers that rely on the documented type.
	return (string) $extension === 'pdf';
}
363
/**
 * Fetches a label by delegating to the sL() method of the language object.
 *
 * @param string $reference: Reference/key of the label
 * @param boolean $useHtmlSpecialChar: Passed through to the language object; convert special chars to HTML entities (default: FALSE)
 * @return string The label of the reference/key to be fetched
 */
protected function sL($reference, $useHtmlSpecialChar = FALSE) {
	$label = $this->langObject->sL($reference, $useHtmlSpecialChar);
	return $label;
}
374
375
376
377
378
379
380
381
382
383 /************************
384 *
385 * Reading documents (for parsing)
386 *
387 ************************/
388
/**
 * Reads the content of an external file being indexed.
 *
 * Depending on the extension the content is extracted with the external tool
 * registered in $this->app or with plain PHP functions, then handed to the
 * parent object to be split into the standard content array.
 *
 * @param string File extension, eg. "pdf", "doc" etc.
 * @param string Absolute filename of file (must exist and be validated OK before calling function)
 * @param string Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be split, e.g. "1-4")
 * @return array Standard content array (title, description, keywords, body keys), or FALSE if the extension is not supported or no content could be extracted
 */
function readFileContent($ext, $absFile, $cPKey) {

		// Return immediately if initialization didn't set support up:
	if (empty($this->supportedExtensions[$ext])) {
		return FALSE;
	}

		// Stays FALSE when a branch cannot produce content (tool missing, empty output, ...)
	$contentArr = FALSE;

		// Switch by file extension
	switch ($ext) {
		case 'pdf':
			if (!empty($this->app['pdfinfo'])) {
					// Getting pdf-info:
				$pdfInfoLines = array();
				$cmd = $this->app['pdfinfo'] . ' ' . escapeshellarg($absFile);
				t3lib_utility_Command::exec($cmd, $pdfInfoLines);
				$pdfInfo = $this->splitPdfInfo($pdfInfoLines);
				if (isset($pdfInfo['pages']) && intval($pdfInfo['pages'])) {
						// The page pointer ends up in a shell command, so only integers are let through:
					$pageRange = explode('-', $cPKey, 2);
					$low = intval($pageRange[0]);
					$high = isset($pageRange[1]) ? intval($pageRange[1]) : 0;

						// Get pdf content:
					$tempFileName = t3lib_div::tempnam('Typo3_indexer'); // Create temporary name
					@unlink($tempFileName); // Delete if exists, just to be safe.
					$cmd = $this->app['pdftotext'] . ' -f ' . $low . ' -l ' . $high . ' -enc UTF-8 -q ' . escapeshellarg($absFile) . ' ' . escapeshellarg($tempFileName);
					t3lib_utility_Command::exec($cmd);
					$content = '';
					if (@is_file($tempFileName)) {
						$content = t3lib_div::getUrl($tempFileName);
						unlink($tempFileName);
					} else {
						$this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/locallang.xml:pdfToolsFailed'), $absFile), 2);
					}
					if (strlen($content)) {
						$contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
					}
				}
			}
			break;
		case 'doc':
			if (!empty($this->app['catdoc'])) {
				$content = $this->getCommandOutput($this->app['catdoc'] . ' -d utf-8 ' . escapeshellarg($absFile));
				$contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
			}
			break;
		case 'pps':
		case 'ppt':
			if (!empty($this->app['ppthtml'])) {
				$content = $this->getCommandOutput($this->app['ppthtml'] . ' ' . escapeshellarg($absFile));
				$content = $this->pObj->convertHTMLToUtf8($content);
				$contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
				$contentArr['title'] = basename($absFile); // Make sure the title doesn't expose the absolute path!
			}
			break;
		case 'xls':
			if (!empty($this->app['xlhtml'])) {
				$content = $this->getCommandOutput($this->app['xlhtml'] . ' -nc -te ' . escapeshellarg($absFile));
				$content = $this->pObj->convertHTMLToUtf8($content);
				$contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
				$contentArr['title'] = basename($absFile); // Make sure the title doesn't expose the absolute path!
			}
			break;
		case 'sxi':
		case 'sxc':
		case 'sxw':
		case 'ods':
		case 'odp':
		case 'odt':
			if (!empty($this->app['unzip'])) {
					// Read content.xml:
				$content_xml = $this->getCommandOutput($this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' content.xml');

					// Read meta.xml:
				$meta_xml = $this->getCommandOutput($this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' meta.xml');

					// A space is put in front of every tag so words from adjacent elements do not run together
				$utf8_content = trim(strip_tags(str_replace('<', ' <', $content_xml)));
				$contentArr = $this->pObj->splitRegularContent($utf8_content);
				$contentArr['title'] = basename($absFile); // Make sure the title doesn't expose the absolute path!

					// Meta information
				$metaContent = t3lib_div::xml2tree($meta_xml);
				$metaContent = $metaContent['office:document-meta'][0]['ch']['office:meta'][0]['ch'];
				if (is_array($metaContent)) {
					$contentArr['title'] = $metaContent['dc:title'][0]['values'][0] ? $metaContent['dc:title'][0]['values'][0] : $contentArr['title'];
					$contentArr['description'] = $metaContent['dc:subject'][0]['values'][0] . ' ' . $metaContent['dc:description'][0]['values'][0];

						// Keywords collected:
					if (is_array($metaContent['meta:keywords'][0]['ch']['meta:keyword'])) {
						if (!isset($contentArr['keywords'])) {
							$contentArr['keywords'] = '';
						}
						foreach ($metaContent['meta:keywords'][0]['ch']['meta:keyword'] as $kwDat) {
							$contentArr['keywords'] .= $kwDat['values'][0] . ' ';
						}
					}
				}
			}
			break;
		case 'rtf':
			if (!empty($this->app['unrtf'])) {
				$fileContent = $this->getCommandOutput($this->app['unrtf'] . ' ' . escapeshellarg($absFile));
				$fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
				$contentArr = $this->pObj->splitHTMLContent($fileContent);
			}
			break;
		case 'txt':
		case 'csv': // Raw text
			$content = t3lib_div::getUrl($absFile);
				// TODO: Implement auto detection of charset (currently assuming utf-8)
			$contentCharset = 'utf-8';
			$content = $this->pObj->convertHTMLToUtf8($content, $contentCharset);
			$contentArr = $this->pObj->splitRegularContent($content);
			$contentArr['title'] = basename($absFile); // Make sure the title doesn't expose the absolute path!
			break;
		case 'html':
		case 'htm':
			$fileContent = t3lib_div::getUrl($absFile);
			$fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
			$contentArr = $this->pObj->splitHTMLContent($fileContent);
			break;
		case 'xml': // PHP strip-tags()
			$fileContent = t3lib_div::getUrl($absFile);

				// Finding charset in the XML declaration (only the first 200 bytes are inspected):
			$reg = array();
			preg_match('/^[[:space:]]*<\?xml[^>]+encoding[[:space:]]*=[[:space:]]*["\'][[:space:]]*([[:alnum:]_-]+)[[:space:]]*["\']/i', substr($fileContent, 0, 200), $reg);
			$charset = !empty($reg[1]) ? $this->pObj->csObj->parse_charset($reg[1]) : 'utf-8';

				// Converting content:
			$fileContent = $this->pObj->convertHTMLToUtf8(strip_tags(str_replace('<', ' <', $fileContent)), $charset);
			$contentArr = $this->pObj->splitRegularContent($fileContent);
			$contentArr['title'] = basename($absFile); // Make sure the title doesn't expose the absolute path!
			break;
		case 'jpg': // PHP EXIF
		case 'jpeg': // PHP EXIF
		case 'tif': // PHP EXIF
			$exif = function_exists('exif_read_data') ? exif_read_data($absFile, 'IFD0') : FALSE;

			$comment = '';
			if ($exif) {
					// The comments in JPEG files are utf-8, while in Tif files they are 7-bit ascii.
				$fileComment = isset($exif['COMMENT'][0]) ? $exif['COMMENT'][0] : '';
				$imageDescription = isset($exif['ImageDescription']) ? $exif['ImageDescription'] : '';
				$comment = trim($fileComment . ' ' . $imageDescription);
			}
			$contentArr = $this->pObj->splitRegularContent($comment);
			$contentArr['title'] = basename($absFile); // Make sure the title doesn't expose the absolute path!
			break;
		default:
			return FALSE;
	}

		// If no title (and why should there be...) then the file-name is set as title. This will raise the hits considerably if the search matches the document name.
	if (is_array($contentArr) && !$contentArr['title']) {
			// Substituting "_" for " " because many filenames may have this instead of a space char.
		$contentArr['title'] = str_replace('_', ' ', basename($absFile));
	}

	return $contentArr;
}

/**
 * Runs a shell command and returns its output lines joined with line feeds.
 *
 * @param string $cmd: Complete command line; arguments must already be escaped
 * @return string Output of the command
 */
protected function getCommandOutput($cmd) {
	$outputLines = array();
	t3lib_utility_Command::exec($cmd, $outputLines);
	return implode(LF, $outputLines);
}
571
/**
 * Creates an array with pointers to divisions of document.
 * ONLY for PDF files at this point. All other types will have an array with a single element with the value "0" (zero) coming back.
 * The same single-element array is returned for PDF files when no pdfinfo tool is registered or the page count cannot be determined.
 *
 * @param string File extension
 * @param string Absolute filename (must exist and be validated OK before calling function)
 * @return array Array of pointers to sections that the document should be divided into, e.g. array("1-5", "6-10")
 */
function fileContentParts($ext, $absFile) {
	$cParts = array(0);

		// Without a registered pdfinfo binary the command line would consist of the file name only, so nothing is executed.
	if ((string) $ext !== 'pdf' || empty($this->app['pdfinfo'])) {
		return $cParts;
	}

		// Getting pdf-info:
	$pdfInfoLines = array();
	$cmd = $this->app['pdfinfo'] . ' ' . escapeshellarg($absFile);
	t3lib_utility_Command::exec($cmd, $pdfInfoLines);
	$pdfInfo = $this->splitPdfInfo($pdfInfoLines);

	$pageCount = isset($pdfInfo['pages']) ? intval($pdfInfo['pages']) : 0;
	if ($pageCount <= 0) {
		return $cParts;
	}

		// Calculate mode: positive = pages per part, zero or negative = number of parts (at least one, at most one per page)
	if ($this->pdf_mode > 0) {
		$partCount = (int) ceil($pageCount / $this->pdf_mode);
	} else {
		$partCount = t3lib_utility_Math::forceIntegerInRange(abs($this->pdf_mode), 1, $pageCount);
	}

		// Traverse and create intervals. Multiplying before dividing keeps the boundaries exact,
		// so the last interval always ends on the last page.
	$cParts = array();
	for ($part = 0; $part < $partCount; $part++) {
		$low = (int) floor(($part * $pageCount) / $partCount) + 1;
		$high = (int) floor((($part + 1) * $pageCount) / $partCount);
		$cParts[] = $low . '-' . $high;
	}

	return $cParts;
}
611
/**
 * Turns the "Key: value" lines of the pdfinfo output into an associative array.
 * Keys are trimmed and lowercased, values are trimmed. Lines without a colon
 * or with an empty key are skipped.
 *
 * @param array Array of PDF content, coming from the pdfinfo tool
 * @return array Result array
 * @access private
 * @see fileContentParts()
 */
function splitPdfInfo($pdfInfoArray) {
	$info = array();
	if (!is_array($pdfInfoArray)) {
		return $info;
	}

	foreach ($pdfInfoArray as $infoLine) {
			// Split at the first colon only, the value itself may contain colons
		$keyAndValue = explode(':', $infoLine, 2);
		if (count($keyAndValue) < 2) {
			continue;
		}
		$key = trim($keyAndValue[0]);
		if (!$key) {
			continue;
		}
		$info[strtolower($key)] = trim($keyAndValue[1]);
	}

	return $info;
}
632
/**
 * Removes some strange char(12) characters and line breaks that tend to occur at the end of the string from external files.
 * Surrounding whitespace is trimmed afterwards.
 *
 * @param string String to clean up
 * @return string String
 */
function removeEndJunk($string) {
		// Trailing run of line feeds and form feeds (chr(12))
	$endJunkPattern = '/[' . LF . chr(12) . ']*$/';
	$withoutEndJunk = preg_replace($endJunkPattern, '', $string);
	return trim($withoutEndJunk);
}
642
643
644
645
646
647
648
649
650
651
652
653
654 /************************
655 *
656 * Backend analyzer
657 *
658 ************************/
659
/**
 * Return icon for file extension.
 * "htm" uses the "html" icon and "jpeg" uses the "jpg" icon.
 *
 * @param string File extension, lowercase.
 * @return string Relative file reference, resolvable by t3lib_div::getFileAbsFileName()
 */
function getIcon($extension) {
	switch ($extension) {
		case 'htm':
			$iconName = 'html';
			break;
		case 'jpeg':
			$iconName = 'jpg';
			break;
		default:
			$iconName = $extension;
	}
	return 'EXT:indexed_search/pi/res/' . $iconName . '.gif';
}
675 }
676 ?>