Fixed bug #9994: ereg* is deprecated in PHP 5.3 alpha3
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / class.external_parser.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2001-2009 Kasper Skaarhoj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the TYPO3 project. The TYPO3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 * A copy is found in the textfile GPL.txt and important notices to the license
17 * from the author is found in LICENSE.txt distributed with these scripts.
18 *
19 *
20 * This script is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
24 *
25 * This copyright notice MUST APPEAR in all copies of the script!
26 ***************************************************************/
27 /**
28 * External standard parsers for indexed_search
29 *
30 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
31 * @coauthor Olivier Simah <noname_paris@yahoo.fr>
32 */
33 /**
34 * [CLASS/FUNCTION INDEX of SCRIPT]
35 *
36 *
37 *
38 * 75: class tx_indexed_search_extparse
39 * 94: function initParser($extension)
40 * 214: function softInit($extension)
41 * 247: function searchTypeMediaTitle($extension)
42 * 323: function isMultiplePageExtension($extension)
43 *
44 * SECTION: Reading documents (for parsing)
45 * 354: function readFileContent($ext,$absFile,$cPKey)
46 * 521: function fileContentParts($ext,$absFile)
47 * 560: function splitPdfInfo($pdfInfoArray)
48 * 579: function removeEndJunk($string)
49 *
50 * SECTION: Backend analyzer
51 * 606: function getIcon($extension)
52 *
53 * TOTAL FUNCTIONS: 9
54 * (This index is automatically created/updated by the extension "extdeveval")
55 *
56 */
57
58
59
60
61
62
63
64
65
66
67 /**
68 * External standard parsers for indexed_search
69 * MUST RETURN utf-8 content!
70 *
71 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
72 * @package TYPO3
73 * @subpackage tx_indexedsearch
74 */
75 class tx_indexed_search_extparse {
76
77 // This value is also overridden from config.
78 var $pdf_mode = -20; // zero: whole PDF file is indexed in one. positive value: Indicates number of pages at a time, eg. "5" would means 1-5,6-10,.... Negative integer would indicate (abs value) number of groups. Eg "3" groups of 10 pages would be 1-4,5-8,9-10
79
80 // This array is configured in initialization:
81 var $app = array();
82 var $ext2itemtype_map = array();
83 var $supportedExtensions = array();
84
85 var $pObj; // Reference to parent object (indexer class)
86 protected $langObject; // Reference to LANG-Object
87
/**
 * Creates the external parser object and picks the language object used for label lookups.
 *
 * In frontend mode ("FE") $GLOBALS['TSFE'] is used, in every other mode $GLOBALS['LANG'].
 */
public function __construct() {
	if (TYPO3_MODE == 'FE') {
		$this->langObject = $GLOBALS['TSFE'];
	} else {
		$this->langObject = $GLOBALS['LANG'];
	}
}
95
/**
 * Initialize external parser for parsing content.
 *
 * Reads the indexed_search extension configuration, rejects extensions listed in
 * "ignoreExtensions" and, for file types that need an external tool, registers the
 * tool binaries in $this->app. On success the extension is recorded in
 * $this->supportedExtensions and $this->ext2itemtype_map.
 *
 * @param	string		File extension
 * @return	boolean		Returns true if extension is supported/enabled, otherwise false.
 */
function initParser($extension) {

		// Read indexer-config:
	$indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);

		// Ignore extensions
	$ignoreExtensions = t3lib_div::trimExplode(',', strtolower($indexerConfig['ignoreExtensions']), 1);
	if (in_array($extension, $ignoreExtensions)) {
		$this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/locallang.xml:ignoreExtensions'), $extension), 1);
		return FALSE;
	}

	$extOK = FALSE;
	$mainExtension = '';

		// Switch on file extension:
	switch ($extension) {
		case 'pdf':	// PDF (pdfinfo + pdftotext)
			$extOK = $this->registerExternalTools($indexerConfig['pdftools'], array('pdfinfo', 'pdftotext'), 'pdfToolsNotFound', 'pdfToolsDisabled');
			if ($extOK) {
					// PDF mode is only taken from the configuration when the tools are usable:
				$this->pdf_mode = t3lib_div::intInRange($indexerConfig['pdf_mode'], -100, 100);
			}
		break;
		case 'doc':	// MS Word (catdoc)
			$extOK = $this->registerExternalTools($indexerConfig['catdoc'], array('catdoc'), 'catdocNotFound', 'catdocDisabled');
		break;
		case 'pps':	// MS PowerPoint(?)
		case 'ppt':	// MS PowerPoint (ppthtml)
			$extOK = $this->registerExternalTools($indexerConfig['ppthtml'], array('ppthtml'), 'ppthtmlNotFound', 'ppthtmlDisabled');
		break;
		case 'xls':	// MS Excel (xlhtml)
			$extOK = $this->registerExternalTools($indexerConfig['xlhtml'], array('xlhtml'), 'xlhtmlNotFound', 'xlhtmlDisabled');
		break;
		case 'sxc':	// Open Office Calc.
		case 'sxi':	// Open Office Impress
		case 'sxw':	// Open Office Writer
		case 'ods':	// Oasis OpenDocument Spreadsheet
		case 'odp':	// Oasis OpenDocument Presentation
		case 'odt':	// Oasis OpenDocument Text
			$extOK = $this->registerExternalTools($indexerConfig['unzip'], array('unzip'), 'unzipNotFound', 'unzipDisabled');
		break;
		case 'rtf':	// RTF (unrtf)
			$extOK = $this->registerExternalTools($indexerConfig['unrtf'], array('unrtf'), 'unrtfNotFound', 'unrtfDisabled');
		break;
		case 'txt':	// Raw text
		case 'csv':	// Raw text
		case 'xml':	// PHP strip-tags()
		case 'tif':	// PHP EXIF
			$extOK = TRUE;
		break;
		case 'html':	// PHP strip-tags()
		case 'htm':	// PHP strip-tags()
			$extOK = TRUE;
			$mainExtension = 'html';	// making "html" the common "item_type"
		break;
		case 'jpg':	// PHP EXIF
		case 'jpeg':	// PHP EXIF
			$extOK = TRUE;
			$mainExtension = 'jpeg';	// making "jpeg" the common item_type
		break;
	}

		// Unsupported, disabled or misconfigured: report failure explicitly.
	if (!$extOK) {
		return FALSE;
	}

	$this->supportedExtensions[$extension] = TRUE;
	$this->ext2itemtype_map[$extension] = $mainExtension ? $mainExtension : $extension;
	return TRUE;
}

/**
 * Registers the external tool binaries needed for one file type in $this->app.
 *
 * Logs a message through the parent object when the tool path is not configured, or when
 * (outside PHP safe_mode) one of the binaries is not a file in the configured directory.
 *
 * @param	string		Configured directory of the tools; an empty value means the tools are disabled
 * @param	array		Base names of the binaries (without ".exe"); they are also used as keys in $this->app
 * @param	string		Key in locallang.xml of the message logged when a binary is missing
 * @param	string		Key in locallang.xml of the message logged when the tools are disabled
 * @return	boolean		TRUE if all binaries were registered, otherwise FALSE
 * @access private
 * @see initParser()
 */
function registerExternalTools($configuredPath, array $toolNames, $notFoundLabel, $disabledLabel) {
	$labelPrefix = 'LLL:EXT:indexed_search/locallang.xml:';

	if (!$configuredPath) {
		$this->pObj->log_setTSlogMessage($this->sL($labelPrefix . $disabledLabel), 1);
		return FALSE;
	}

		// If windows, apply extension to tool name:
	$exe = (TYPO3_OS == 'WIN') ? '.exe' : '';
	$toolPath = rtrim($configuredPath, '/') . '/';

		// With safe_mode enabled the existence check is skipped and the configured path is used as is.
	if (!ini_get('safe_mode')) {
		foreach ($toolNames as $toolName) {
			if (!@is_file($toolPath . $toolName . $exe)) {
				$this->pObj->log_setTSlogMessage(sprintf($this->sL($labelPrefix . $notFoundLabel), $toolPath), 3);
				return FALSE;
			}
		}
	}

	foreach ($toolNames as $toolName) {
		$this->app[$toolName] = $toolPath . $toolName . $exe;
	}
	return TRUE;
}
214
/**
 * Initialize external parser for backend modules
 * Doesn't evaluate if parser is configured right - more like returning POSSIBLE supported extensions (for showing icons etc) in backend and frontend plugin
 *
 * @param	string		File extension to initialize for.
 * @return	boolean		Returns true if the extension is one of the known file types, otherwise false.
 */
function softInit($extension) {
	switch ($extension) {
		case 'pdf':	// PDF
		case 'doc':	// MS Word files
		case 'pps':	// MS PowerPoint
		case 'ppt':	// MS PowerPoint
		case 'xls':	// MS Excel
		case 'sxc':	// Open Office Calc.
		case 'sxi':	// Open Office Impress
		case 'sxw':	// Open Office Writer
		case 'ods':	// Oasis OpenDocument Spreadsheet
		case 'odp':	// Oasis OpenDocument Presentation
		case 'odt':	// Oasis OpenDocument Text
		case 'rtf':	// RTF documents
		case 'txt':	// ASCII Text documents
		case 'html':	// HTML
		case 'htm':	// HTML
		case 'csv':	// Comma Separated Values
		case 'xml':	// Generic XML
		case 'jpg':	// Jpeg images (EXIF comment)
		case 'jpeg':	// Jpeg images (EXIF comment)
		case 'tif':	// TIF images (EXIF comment)
			return TRUE;
	}

		// Unknown extension: return an explicit boolean instead of falling off the end (NULL).
	return FALSE;
}
248
/**
 * Return title of entry in media type selector box.
 *
 * The switch below only selects which configuration key (if any) must be set and which
 * label is used; the label is then fetched and formatted in one place.
 *
 * @param	string		File extension
 * @return	mixed		Label string for the media type selector box (frontend plugin). FALSE if the extension is listed in "ignoreExtensions"; NULL if there is no entry for the extension or its tool is not configured.
 */
function searchTypeMediaTitle($extension) {

		// Read indexer-config
	$indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);

		// Ignore extensions
	$ignoreExtensions = t3lib_div::trimExplode(',', strtolower($indexerConfig['ignoreExtensions']), 1);
	if (in_array($extension, $ignoreExtensions)) {
		return FALSE;
	}

	$requiredTool = NULL;		// Key in the indexer configuration that must be set, NULL if no tool is needed
	$labelReference = NULL;		// Label reference, NULL if the extension gets no entry

	switch ($extension) {
		case 'pdf':	// PDF
			$requiredTool = 'pdftools';
			$labelReference = 'LLL:EXT:indexed_search/locallang.xml:extension.PDF';
		break;
		case 'doc':	// Catdoc
			$requiredTool = 'catdoc';
			$labelReference = 'LLL:EXT:indexed_search/locallang.xml:extension.DOC';
		break;
		case 'pps':	// MS PowerPoint(?)
		case 'ppt':	// MS PowerPoint, ppthtml
			$requiredTool = 'ppthtml';
			$labelReference = 'LLL:EXT:indexed_search/locallang.xml:extension.PP';
		break;
		case 'xls':	// MS Excel, xlhtml
			$requiredTool = 'xlhtml';
			$labelReference = 'LLL:EXT:indexed_search/locallang.xml:extension.XLS';
		break;
		case 'sxc':	// Open Office Calc.
			$requiredTool = 'unzip';
			$labelReference = 'LLL:EXT:indexed_search/locallang.xml:extension.SXC';
		break;
		case 'sxi':	// Open Office Impress
			$requiredTool = 'unzip';
			$labelReference = 'LLL:EXT:indexed_search/locallang.xml:extension.SXI';
		break;
		case 'sxw':	// Open Office Writer
			$requiredTool = 'unzip';
			$labelReference = 'LLL:EXT:indexed_search/locallang.xml:extension.SXW';
		break;
		case 'ods':	// Oasis OpenDocument Spreadsheet
			$requiredTool = 'unzip';
			$labelReference = 'LLL:EXT:indexed_search/locallang.xml:extension.ODS';
		break;
		case 'odp':	// Oasis OpenDocument Presentation
			$requiredTool = 'unzip';
			$labelReference = 'LLL:EXT:indexed_search/locallang.xml:extension.ODP';
		break;
		case 'odt':	// Oasis OpenDocument Text
			$requiredTool = 'unzip';
			$labelReference = 'LLL:EXT:indexed_search/locallang.xml:extension.ODT';
		break;
		case 'rtf':	// unrtf
			$requiredTool = 'unrtf';
			$labelReference = 'LLL:EXT:indexed_search/locallang.xml:extension.RTF';
		break;
		case 'jpeg':	// PHP EXIF
		case 'tif':	// PHP EXIF
			$labelReference = 'LLL:EXT:indexed_search/locallang.xml:extension.Images';
		break;
		case 'html':	// PHP strip-tags()
			$labelReference = 'LLL:EXT:indexed_search/locallang.xml:extension.HTML';
		break;
		case 'txt':	// Raw text
			$labelReference = 'LLL:EXT:indexed_search/locallang.xml:extension.TXT';
		break;
		case 'csv':	// Raw text
			$labelReference = 'LLL:EXT:indexed_search/locallang.xml:extension.CSV';
		break;
		case 'xml':	// PHP strip-tags()
			$labelReference = 'LLL:EXT:indexed_search/locallang.xml:extension.XML';
		break;
			// NO entry (duplicates or blank):
		case 'htm':	// PHP strip-tags()
		case 'jpg':	// PHP EXIF
		default:
		break;
	}

		// No entry for this extension:
	if ($labelReference === NULL) {
		return NULL;
	}

		// Entry depends on a tool that is not configured:
	if ($requiredTool !== NULL && !$indexerConfig[$requiredTool]) {
		return NULL;
	}

	return sprintf($this->sL($labelReference), $extension);
}
352
/**
 * Returns true if the input extension (item_type) is potentially a multi-page extension.
 * Only "pdf" is treated as multi-page.
 *
 * @param	string		Extension / item_type string
 * @return	boolean		TRUE if multi-page, otherwise FALSE
 */
function isMultiplePageExtension($extension) {
		// Always answer with a real boolean; previously non-PDF input fell through and yielded NULL.
	return (string)$extension === 'pdf';
}
367
/**
 * Fetches a label by delegating to the sL() method of the language object
 * that was selected in the constructor.
 *
 * @param	string		$reference: Reference/key of the label
 * @param	boolean		$useHtmlSpecialChar: Passed through to the language object (default: false)
 * @return	string		Whatever the language object returns for the reference/key
 */
protected function sL($reference, $useHtmlSpecialChar = false) {
	$label = $this->langObject->sL($reference, $useHtmlSpecialChar);
	return $label;
}
378
379
380
381
382
383
384
385
386
387 /************************
388 *
389 * Reading documents (for parsing)
390 *
391 ************************/
392
/**
 * Reads the content of an external file being indexed.
 *
 * Depending on the file extension the content is obtained from an external tool registered
 * in $this->app (pdf, doc, pps/ppt, xls, OpenOffice/OpenDocument, rtf), read directly from
 * the file (txt, csv, html, htm, xml) or taken from EXIF data (jpg, jpeg, tif). The text is
 * then split into the content array by the parent object.
 *
 * @param	string		File extension, eg. "pdf", "doc" etc.
 * @param	string		Absolute filename of file (must exist and be validated OK before calling function)
 * @param	string		Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be splitted.)
 * @return	mixed		Standard content array (title, description, keywords, body keys). FALSE if the extension was not initialized or is unknown; NULL if no content array could be produced.
 */
function readFileContent($ext,$absFile,$cPKey) {
	$contentArr = NULL;

		// Return immediately if initialization didn't set support up:
	if (empty($this->supportedExtensions[$ext])) {
		return FALSE;
	}

		// Switch by file extension
	switch ($ext) {
		case 'pdf':
			if (!empty($this->app['pdfinfo'])) {
					// Getting pdf-info:
				$pdfInfoOutput = array();
				exec($this->app['pdfinfo'] . ' ' . escapeshellarg($absFile), $pdfInfoOutput);
				$pdfInfo = $this->splitPdfInfo($pdfInfoOutput);

				if (isset($pdfInfo['pages']) && intval($pdfInfo['pages'])) {
						// The section pointer has the form "low-high". Pad it so a pointer without "-" does not
						// leave $high undefined, and cast both to integers so nothing from the pointer reaches
						// the shell unescaped.
					list($low, $high) = array_pad(explode('-', $cPKey), 2, 0);

						// Get pdf content:
					$tempFileName = t3lib_div::tempnam('Typo3_indexer');		// Create temporary name
					@unlink($tempFileName);	// Delete if exists, just to be safe.
					$cmd = $this->app['pdftotext'] . ' -f ' . intval($low) . ' -l ' . intval($high) . ' -enc UTF-8 -q ' . escapeshellarg($absFile) . ' ' . escapeshellarg($tempFileName);
					exec($cmd);

					$content = '';
					if (@is_file($tempFileName)) {
						$content = t3lib_div::getUrl($tempFileName);
						unlink($tempFileName);
					} else {
						$this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/locallang.xml:pdfToolsFailed'), $absFile), 2);
					}
					if (strlen((string)$content)) {
						$contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
					}
				}
			}
		break;
		case 'doc':
			if (!empty($this->app['catdoc'])) {
				$output = array();
				exec($this->app['catdoc'] . ' -d utf-8 ' . escapeshellarg($absFile), $output);
				$content = implode(chr(10), $output);
				$contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
			}
		break;
		case 'pps':
		case 'ppt':
			if (!empty($this->app['ppthtml'])) {
				$output = array();
				exec($this->app['ppthtml'] . ' ' . escapeshellarg($absFile), $output);
				$content = $this->pObj->convertHTMLToUtf8(implode(chr(10), $output));
				$contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
				$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
			}
		break;
		case 'xls':
			if (!empty($this->app['xlhtml'])) {
				$output = array();
				exec($this->app['xlhtml'] . ' -nc -te ' . escapeshellarg($absFile), $output);
				$content = $this->pObj->convertHTMLToUtf8(implode(chr(10), $output));
				$contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
				$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
			}
		break;
		case 'sxi':
		case 'sxc':
		case 'sxw':
		case 'ods':
		case 'odp':
		case 'odt':
			if (!empty($this->app['unzip'])) {
					// Read content.xml:
				$output = array();
				exec($this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' content.xml', $output);
				$content_xml = implode(chr(10), $output);

					// Read meta.xml:
				$output = array();
				exec($this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' meta.xml', $output);
				$meta_xml = implode(chr(10), $output);

					// A space is put in front of every tag so words from adjacent elements don't get glued together.
				$utf8_content = trim(strip_tags(str_replace('<', ' <', $content_xml)));
				$contentArr = $this->pObj->splitRegularContent($utf8_content);
				$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!

					// Meta information
					// NOTE(review): xml2tree() presumably returns a non-array (error message) when meta.xml
					// cannot be parsed - confirm; the result is therefore checked before it is indexed.
				$metaTree = t3lib_div::xml2tree($meta_xml);
				$metaContent = NULL;
				if (is_array($metaTree) && isset($metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'])) {
					$metaContent = $metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'];
				}
				if (is_array($metaContent)) {
					$contentArr['title'] = $metaContent['dc:title'][0]['values'][0] ? $metaContent['dc:title'][0]['values'][0] : $contentArr['title'];
					$contentArr['description'] = $metaContent['dc:subject'][0]['values'][0].' '.$metaContent['dc:description'][0]['values'][0];

						// Keywords collected:
					if (is_array($metaContent['meta:keywords'][0]['ch']['meta:keyword'])) {
						foreach ($metaContent['meta:keywords'][0]['ch']['meta:keyword'] as $kwDat) {
							$contentArr['keywords'].= $kwDat['values'][0].' ';
						}
					}
				}
			}
		break;
		case 'rtf':
			if (!empty($this->app['unrtf'])) {
				$output = array();
				exec($this->app['unrtf'] . ' ' . escapeshellarg($absFile), $output);
				$fileContent = $this->pObj->convertHTMLToUtf8(implode(chr(10), $output));
				$contentArr = $this->pObj->splitHTMLContent($fileContent);
			}
		break;
		case 'txt':
		case 'csv':		// Raw text
			$content = t3lib_div::getUrl($absFile);
				// TODO: Auto-registration of charset???? -> utf-8 (Current assuming western europe...)
			$content = $this->pObj->convertHTMLToUtf8($content, 'iso-8859-1');
			$contentArr = $this->pObj->splitRegularContent($content);
			$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
		break;
		case 'html':
		case 'htm':
			$fileContent = t3lib_div::getUrl($absFile);
			$fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
			$contentArr = $this->pObj->splitHTMLContent($fileContent);
		break;
		case 'xml':		// PHP strip-tags()
			$fileContent = t3lib_div::getUrl($absFile);

				// Finding charset: look for an encoding="..." attribute of the XML declaration within the first 200 bytes.
			$reg = array();
			preg_match('/^[[:space:]]*<\?xml[^>]+encoding[[:space:]]*=[[:space:]]*["\'][[:space:]]*([[:alnum:]_-]+)[[:space:]]*["\']/i', substr($fileContent, 0, 200), $reg);
			$charset = !empty($reg[1]) ? $this->pObj->csObj->parse_charset($reg[1]) : 'utf-8';

				// Converting content:
			$fileContent = $this->pObj->convertHTMLToUtf8(strip_tags(str_replace('<', ' <', $fileContent)), $charset);
			$contentArr = $this->pObj->splitRegularContent($fileContent);
			$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
		break;
		case 'jpg':		// PHP EXIF
		case 'jpeg':	// PHP EXIF
		case 'tif':		// PHP EXIF
			$exif = function_exists('exif_read_data') ? exif_read_data($absFile, 'IFD0') : FALSE;

			$comment = '';
			if ($exif) {
					// The comments in JPEG files are utf-8, while in Tif files they are 7-bit ascii.
				$exifComment = isset($exif['COMMENT'][0]) ? $exif['COMMENT'][0] : '';
				$imageDescription = isset($exif['ImageDescription']) ? $exif['ImageDescription'] : '';
				$comment = trim($exifComment . ' ' . $imageDescription);
			}
			$contentArr = $this->pObj->splitRegularContent($comment);
			$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
		break;
		default:
			return FALSE;
	}

		// If no title (and why should there be...) then the file-name is set as title. This will raise the hits considerably if the search matches the document name.
	if (is_array($contentArr) && !$contentArr['title']) {
		$contentArr['title'] = str_replace('_', ' ', basename($absFile));	// Substituting "_" for " " because many filenames may have this instead of a space char.
	}

	return $contentArr;
}
571
/**
 * Creates an array with pointers to divisions of document.
 * ONLY for PDF files at this point. All other types will have an array with a single element with the value "0" (zero) coming back.
 *
 * For PDF files the page count reported by the pdfinfo tool is divided into "low-high" page
 * ranges: a positive $this->pdf_mode is the number of pages per range, otherwise its absolute
 * value (limited to between 1 and the page count) is the number of ranges.
 *
 * @param	string		File extension
 * @param	string		Absolute filename (must exist and be validated OK before calling function)
 * @return	array		Array of pointers to sections that the document should be divided into
 */
function fileContentParts($ext,$absFile) {
	$cParts = array(0);
	switch ($ext) {
		case 'pdf':
				// Without a registered pdfinfo binary the command line would consist of the file name only,
				// so the default single section is returned instead of executing anything.
			if (empty($this->app['pdfinfo'])) {
				break;
			}

				// Getting pdf-info:
			$pdfInfoOutput = array();
			exec($this->app['pdfinfo'] . ' ' . escapeshellarg($absFile), $pdfInfoOutput);
			$pdfInfo = $this->splitPdfInfo($pdfInfoOutput);
			$pageCount = isset($pdfInfo['pages']) ? intval($pdfInfo['pages']) : 0;

			if ($pageCount) {
				$cParts = array();

					// Calculate mode
				if ($this->pdf_mode > 0) {
					$iter = ceil($pageCount / $this->pdf_mode);
				} else {
					$iter = t3lib_div::intInRange(abs($this->pdf_mode), 1, $pageCount);
				}

					// Traverse and create intervals.
				for ($a = 0; $a < $iter; $a++) {
					$low = floor($a * ($pageCount / $iter)) + 1;
					$high = floor(($a + 1) * ($pageCount / $iter));
					$cParts[] = $low . '-' . $high;
				}
			}
		break;
	}
	return $cParts;
}
611
/**
 * Turns the output lines of the pdfinfo tool into a key/value array.
 *
 * Every line is split at its first colon. The trimmed, lowercased part before the colon
 * becomes the key and the trimmed remainder the value. Lines without a colon, or whose
 * key part is empty (or "0"), are skipped.
 *
 * @param	array		Array of PDF content, coming from the pdfinfo tool
 * @return	array		Result array (empty if the input is not an array)
 * @access private
 * @see fileContentParts()
 */
function splitPdfInfo($pdfInfoArray) {
	if (!is_array($pdfInfoArray)) {
		return array();
	}

	$info = array();
	foreach ($pdfInfoArray as $rawLine) {
		$pair = explode(':', $rawLine, 2);
		if (count($pair) < 2) {
			continue;
		}

		$label = trim($pair[0]);
		if (!$label) {
			continue;
		}

		$info[strtolower($label)] = trim($pair[1]);
	}
	return $info;
}
632
/**
 * Removes line feeds (char 10) and form feeds (char 12) that tend to occur at the end of
 * strings coming from external files, then trims surrounding whitespace.
 *
 * @param	string		String to clean up
 * @return	string		Cleaned string
 */
function removeEndJunk($string) {
	$trailingJunkPattern = '/['.chr(10).chr(12).']*$/';
	$withoutJunk = preg_replace($trailingJunkPattern, '', $string);
	return trim($withoutJunk);
}
642
643
644
645
646
647
648
649
650
651
652
653
654 /************************
655 *
656 * Backend analyzer
657 *
658 ************************/
659
/**
 * Builds the reference to the icon file of a file extension.
 * "htm" shares the icon of "html" and "jpeg" the icon of "jpg".
 *
 * @param	string		File extension, lowercase.
 * @return	string		Relative file reference, resolvable by t3lib_div::getFileAbsFileName()
 */
function getIcon($extension) {
	$iconName = $extension;
	if ($iconName == 'htm') {
		$iconName = 'html';
	} elseif ($iconName == 'jpeg') {
		$iconName = 'jpg';
	}
	return 'EXT:indexed_search/pi/res/'.$iconName.'.gif';
}
671 }
672
// XCLASS hook: if TYPO3_MODE is defined and the configuration for that mode registers a file
// under the key 'ext/indexed_search/class.external_parser.php', that file is included once.
673 if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php']) {
674 include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php']);
675 }
676
677 ?>