Cleanup: Updated copyright comments
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / class.external_parser.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2001-2011 Kasper Skårhøj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the TYPO3 project. The TYPO3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 * A copy is found in the textfile GPL.txt and important notices to the license
17 * from the author is found in LICENSE.txt distributed with these scripts.
18 *
19 *
20 * This script is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
24 *
25 * This copyright notice MUST APPEAR in all copies of the script!
26 ***************************************************************/
27 /**
28 * External standard parsers for indexed_search
29 *
30 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
31 * @coauthor Olivier Simah <noname_paris@yahoo.fr>
32 */
33 /**
34 * [CLASS/FUNCTION INDEX of SCRIPT]
35 *
36 *
37 *
38 * 75: class tx_indexed_search_extparse
39 * 94: function initParser($extension)
40 * 214: function softInit($extension)
41 * 247: function searchTypeMediaTitle($extension)
42 * 323: function isMultiplePageExtension($extension)
43 *
44 * SECTION: Reading documents (for parsing)
45 * 354: function readFileContent($ext,$absFile,$cPKey)
46 * 521: function fileContentParts($ext,$absFile)
47 * 560: function splitPdfInfo($pdfInfoArray)
48 * 579: function removeEndJunk($string)
49 *
50 * SECTION: Backend analyzer
51 * 606: function getIcon($extension)
52 *
53 * TOTAL FUNCTIONS: 9
54 * (This index is automatically created/updated by the extension "extdeveval")
55 *
56 */
57
58
59
60
61
62
63
64
65
66
67 /**
68 * External standard parsers for indexed_search
69 * MUST RETURN utf-8 content!
70 *
71 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
72 * @package TYPO3
73 * @subpackage tx_indexedsearch
74 */
75 class tx_indexed_search_extparse {
76
77 // This value is also overridden from config.
78 var $pdf_mode = -20; // zero: whole PDF file is indexed in one. positive value: Indicates number of pages at a time, eg. "5" would means 1-5,6-10,.... Negative integer would indicate (abs value) number of groups. Eg "3" groups of 10 pages would be 1-4,5-8,9-10
79
80 // This array is configured in initialization:
81 var $app = array();
82 var $ext2itemtype_map = array();
83 var $supportedExtensions = array();
84
85 var $pObj; // Reference to parent object (indexer class)
86 protected $langObject; // Reference to LANG-Object
87
/**
 * Creates the external parser object and selects the language object
 * that is later used for resolving labels (see sL()).
 *
 * When TYPO3_MODE equals 'FE' the object in $GLOBALS['TSFE'] is used,
 * in every other mode the object in $GLOBALS['LANG'].
 */
public function __construct() {
	if (TYPO3_MODE == 'FE') {
		$this->langObject = $GLOBALS['TSFE'];
	} else {
		$this->langObject = $GLOBALS['LANG'];
	}
}
95
/**
 * Initialize external parser for parsing content.
 *
 * Reads the serialized indexed_search extension configuration, rejects
 * extensions listed in "ignoreExtensions" and, for formats that need an
 * external binary, verifies that the binary is configured and present.
 * On success the extension is registered in $this->supportedExtensions and
 * $this->ext2itemtype_map. Problems are reported through
 * $this->pObj->log_setTSlogMessage().
 *
 * @param string $extension File extension (compared as given against the lowercased ignore list)
 * @return boolean Returns TRUE if extension is supported/enabled, otherwise FALSE.
 */
function initParser($extension) {

		// Read indexer-config; fall back to an empty configuration if it cannot be unserialized:
	$indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
	if (!is_array($indexerConfig)) {
		$indexerConfig = array();
	}

	$extOK = FALSE;
	$mainExtension = '';

		// Ignore extensions
	$ignoreList = isset($indexerConfig['ignoreExtensions']) ? $indexerConfig['ignoreExtensions'] : '';
	$ignoreExtensions = t3lib_div::trimExplode(',', strtolower($ignoreList), 1);
	if (in_array($extension, $ignoreExtensions)) {
		$this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/locallang.xml:ignoreExtensions'), $extension), 1);
		return FALSE;
	}

		// Switch on file extension:
	switch ($extension) {
		case 'pdf':
				// PDF (needs both pdfinfo and pdftotext)
			$extOK = $this->registerExternalTools(
				$indexerConfig,
				'pdftools',
				array('pdfinfo', 'pdftotext'),
				'LLL:EXT:indexed_search/locallang.xml:pdfToolsNotFound',
				'LLL:EXT:indexed_search/locallang.xml:pdfToolsDisabled'
			);
			if ($extOK) {
					// PDF mode:
				$pdfMode = isset($indexerConfig['pdf_mode']) ? $indexerConfig['pdf_mode'] : 0;
				$this->pdf_mode = t3lib_div::intInRange($pdfMode, -100, 100);
			}
		break;
		case 'doc':
				// Catdoc
			$extOK = $this->registerExternalTools(
				$indexerConfig,
				'catdoc',
				array('catdoc'),
				'LLL:EXT:indexed_search/locallang.xml:catdocNotFound',
				'LLL:EXT:indexed_search/locallang.xml:catdocDisabled'
			);
		break;
		case 'pps':		// MS PowerPoint(?)
		case 'ppt':		// MS PowerPoint
				// ppthtml
			$extOK = $this->registerExternalTools(
				$indexerConfig,
				'ppthtml',
				array('ppthtml'),
				'LLL:EXT:indexed_search/locallang.xml:ppthtmlNotFound',
				'LLL:EXT:indexed_search/locallang.xml:ppthtmlDisabled'
			);
		break;
		case 'xls':		// MS Excel
				// Xlhtml
			$extOK = $this->registerExternalTools(
				$indexerConfig,
				'xlhtml',
				array('xlhtml'),
				'LLL:EXT:indexed_search/locallang.xml:xlhtmlNotFound',
				'LLL:EXT:indexed_search/locallang.xml:xlhtmlDisabled'
			);
		break;
		case 'sxc':		// Open Office Calc.
		case 'sxi':		// Open Office Impress
		case 'sxw':		// Open Office Writer
		case 'ods':		// Oasis OpenDocument Spreadsheet
		case 'odp':		// Oasis OpenDocument Presentation
		case 'odt':		// Oasis OpenDocument Text
				// unzip
			$extOK = $this->registerExternalTools(
				$indexerConfig,
				'unzip',
				array('unzip'),
				'LLL:EXT:indexed_search/locallang.xml:unzipNotFound',
				'LLL:EXT:indexed_search/locallang.xml:unzipDisabled'
			);
		break;
		case 'rtf':
				// unrtf
			$extOK = $this->registerExternalTools(
				$indexerConfig,
				'unrtf',
				array('unrtf'),
				'LLL:EXT:indexed_search/locallang.xml:unrtfNotFound',
				'LLL:EXT:indexed_search/locallang.xml:unrtfDisabled'
			);
		break;
		case 'txt':		// Raw text
		case 'csv':		// Raw text
		case 'xml':		// PHP strip-tags()
		case 'tif':		// PHP EXIF
			$extOK = TRUE;
		break;
		case 'html':	// PHP strip-tags()
		case 'htm':		// PHP strip-tags()
			$extOK = TRUE;
			$mainExtension = 'html';	// making "html" the common "item_type"
		break;
		case 'jpg':		// PHP EXIF
		case 'jpeg':	// PHP EXIF
			$extOK = TRUE;
			$mainExtension = 'jpeg';	// making "jpeg" the common item_type
		break;
	}

		// Always return a real boolean, also for unknown or unusable extensions:
	if (!$extOK) {
		return FALSE;
	}

	$this->supportedExtensions[$extension] = TRUE;
	$this->ext2itemtype_map[$extension] = $mainExtension ? $mainExtension : $extension;
	return TRUE;
}

/**
 * Registers the external binaries of one configuration entry in $this->app.
 *
 * The configured value is used as directory of the binaries. Unless PHP safe
 * mode is enabled, every binary must exist as a file; otherwise nothing is
 * registered. A log message is written when the entry is empty or a binary
 * is missing.
 *
 * @param array $indexerConfig Unserialized indexed_search extension configuration
 * @param string $configKey Configuration key holding the directory of the binaries
 * @param array $tools Binary names (without OS specific suffix), also used as keys in $this->app
 * @param string $notFoundLabel Label reference logged (with the directory as argument) when a binary is missing
 * @param string $disabledLabel Label reference logged when the configuration entry is empty
 * @return boolean TRUE if all binaries have been registered, otherwise FALSE
 */
protected function registerExternalTools(array $indexerConfig, $configKey, array $tools, $notFoundLabel, $disabledLabel) {
	if (empty($indexerConfig[$configKey])) {
		$this->pObj->log_setTSlogMessage($this->sL($disabledLabel), 1);
		return FALSE;
	}

		// If windows, apply extension to tool name:
	$exe = (TYPO3_OS == 'WIN') ? '.exe' : '';
	$toolPath = rtrim($indexerConfig[$configKey], '/') . '/';

		// With safe mode enabled the existence check is skipped and the configured path is trusted:
	if (!t3lib_utility_PhpOptions::isSafeModeEnabled()) {
		foreach ($tools as $tool) {
			if (!@is_file($toolPath . $tool . $exe)) {
				$this->pObj->log_setTSlogMessage(sprintf($this->sL($notFoundLabel), $toolPath), 3);
				return FALSE;
			}
		}
	}

	foreach ($tools as $tool) {
		$this->app[$tool] = $toolPath . $tool . $exe;
	}
	return TRUE;
}
216
/**
 * Initialize external parser for backend modules
 * Doesn't evaluate if parser is configured right - more like returning POSSIBLE supported extensions (for showing icons etc) in backend and frontend plugin
 *
 * @param string $extension File extension to check.
 * @return boolean Returns TRUE if the extension is one of the potentially supported ones, otherwise FALSE.
 */
function softInit($extension) {
	$possibleExtensions = array(
		'pdf',		// PDF
		'doc',		// MS Word files
		'pps',		// MS PowerPoint
		'ppt',		// MS PowerPoint
		'xls',		// MS Excel
		'sxc',		// Open Office Calc.
		'sxi',		// Open Office Impress
		'sxw',		// Open Office Writer
		'ods',		// Oasis OpenDocument Spreadsheet
		'odp',		// Oasis OpenDocument Presentation
		'odt',		// Oasis OpenDocument Text
		'rtf',		// RTF documents
		'txt',		// ASCII Text documents
		'html',		// HTML
		'htm',		// HTML
		'csv',		// Comma Separated Values
		'xml',		// Generic XML
		'jpg',		// Jpeg images (EXIF comment)
		'jpeg',		// Jpeg images (EXIF comment)
		'tif'		// TIF images (EXIF comment)
	);

		// Strict lookup so that an explicit boolean is returned in both cases:
	return in_array((string)$extension, $possibleExtensions, TRUE);
}
250
/**
 * Return title of entry in media type selector box.
 *
 * Each known extension is mapped to a label reference and, where an external
 * tool is needed, to the configuration key of that tool. A title is only
 * produced when no tool is needed or the tool is configured.
 *
 * @param string $extension File extension
 * @return mixed Label string of the entry in the media type search selector box (frontend plugin); FALSE if the extension is ignored by configuration; NULL if there is no entry for it.
 */
function searchTypeMediaTitle($extension) {

		// Read indexer-config
	$indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);

		// Extensions excluded by configuration never get an entry
	$ignoreExtensions = t3lib_div::trimExplode(',', strtolower($indexerConfig['ignoreExtensions']), 1);
	if (in_array($extension, $ignoreExtensions)) {
		return FALSE;
	}

	$requiredTool = NULL;	// Configuration key that must be set, NULL if PHP handles the format itself
	$labelReference = NULL;	// Label to show, NULL if the extension gets no entry

	switch ($extension) {
		case 'pdf':		// PDF
			$requiredTool = 'pdftools';
			$labelReference = 'LLL:EXT:indexed_search/locallang.xml:extension.PDF';
		break;
		case 'doc':		// Catdoc
			$requiredTool = 'catdoc';
			$labelReference = 'LLL:EXT:indexed_search/locallang.xml:extension.DOC';
		break;
		case 'pps':		// MS PowerPoint(?)
		case 'ppt':		// MS PowerPoint
			$requiredTool = 'ppthtml';
			$labelReference = 'LLL:EXT:indexed_search/locallang.xml:extension.PP';
		break;
		case 'xls':		// MS Excel
			$requiredTool = 'xlhtml';
			$labelReference = 'LLL:EXT:indexed_search/locallang.xml:extension.XLS';
		break;
		case 'sxc':		// Open Office Calc.
			$requiredTool = 'unzip';
			$labelReference = 'LLL:EXT:indexed_search/locallang.xml:extension.SXC';
		break;
		case 'sxi':		// Open Office Impress
			$requiredTool = 'unzip';
			$labelReference = 'LLL:EXT:indexed_search/locallang.xml:extension.SXI';
		break;
		case 'sxw':		// Open Office Writer
			$requiredTool = 'unzip';
			$labelReference = 'LLL:EXT:indexed_search/locallang.xml:extension.SXW';
		break;
		case 'ods':		// Oasis OpenDocument Spreadsheet
			$requiredTool = 'unzip';
			$labelReference = 'LLL:EXT:indexed_search/locallang.xml:extension.ODS';
		break;
		case 'odp':		// Oasis OpenDocument Presentation
			$requiredTool = 'unzip';
			$labelReference = 'LLL:EXT:indexed_search/locallang.xml:extension.ODP';
		break;
		case 'odt':		// Oasis OpenDocument Text
			$requiredTool = 'unzip';
			$labelReference = 'LLL:EXT:indexed_search/locallang.xml:extension.ODT';
		break;
		case 'rtf':		// unrtf
			$requiredTool = 'unrtf';
			$labelReference = 'LLL:EXT:indexed_search/locallang.xml:extension.RTF';
		break;
		case 'jpeg':	// PHP EXIF
		case 'tif':		// PHP EXIF
			$labelReference = 'LLL:EXT:indexed_search/locallang.xml:extension.Images';
		break;
		case 'html':	// PHP strip-tags()
			$labelReference = 'LLL:EXT:indexed_search/locallang.xml:extension.HTML';
		break;
		case 'txt':		// Raw text
			$labelReference = 'LLL:EXT:indexed_search/locallang.xml:extension.TXT';
		break;
		case 'csv':		// Raw text
			$labelReference = 'LLL:EXT:indexed_search/locallang.xml:extension.CSV';
		break;
		case 'xml':		// PHP strip-tags()
			$labelReference = 'LLL:EXT:indexed_search/locallang.xml:extension.XML';
		break;
			// "htm" and "jpg" are duplicates of "html" / "jpeg" and, like all
			// unknown extensions, deliberately get no entry.
	}

	if ($labelReference === NULL) {
		return NULL;
	}
	if ($requiredTool !== NULL && !$indexerConfig[$requiredTool]) {
		return NULL;
	}
	return sprintf($this->sL($labelReference), $extension);
}
354
/**
 * Returns TRUE if the input extension (item_type) is potentially a multi-page extension.
 * Currently only "pdf" qualifies.
 *
 * @param string $extension Extension / item_type string
 * @return boolean TRUE if multi-page, otherwise FALSE
 */
function isMultiplePageExtension($extension) {
		// Explicit comparison so that a real boolean is returned for every input:
	return (string)$extension === 'pdf';
}
369
/**
 * Resolves a label by delegating to the sL() method of the language object
 * that was selected in the constructor.
 *
 * @param string $reference: Reference/key of the label
 * @param boolean $useHtmlSpecialChar: Passed through to the language object; convert special chars to HTML entities (default: false)
 * @return string The label of the reference/key to be fetched
 */
protected function sL($reference, $useHtmlSpecialChar = false) {
	$label = $this->langObject->sL($reference, $useHtmlSpecialChar);
	return $label;
}
380
381
382
383
384
385
386
387
388
389 /************************
390 *
391 * Reading documents (for parsing)
392 *
393 ************************/
394
/**
 * Reads the content of an external file being indexed.
 *
 * Depending on the extension the text is extracted with an external tool
 * registered in $this->app (see initParser()) or with plain PHP functions,
 * and then split into the standard content array by the parent object.
 *
 * @param string $ext File extension, eg. "pdf", "doc" etc.
 * @param string $absFile Absolute filename of file (must exist and be validated OK before calling function)
 * @param string $cPKey Pointer to section (zero for all other than PDF, where it is a page range "low-high" as produced by fileContentParts())
 * @return mixed Standard content array (title, description, keywords, body keys); FALSE if the extension is not supported/initialized; NULL if nothing could be extracted
 */
function readFileContent($ext,$absFile,$cPKey) {

		// Return immediately if initialization didn't set support up:
	if (empty($this->supportedExtensions[$ext])) {
		return FALSE;
	}

	$contentArr = NULL;

		// Switch by file extension
	switch ($ext) {
		case 'pdf':
			if (!empty($this->app['pdfinfo'])) {
					// Getting pdf-info:
				$infoLines = array();
				t3lib_utility_Command::exec($this->app['pdfinfo'] . ' ' . escapeshellarg($absFile), $infoLines);
				$pdfInfo = $this->splitPdfInfo($infoLines);

				if (isset($pdfInfo['pages']) && intval($pdfInfo['pages'])) {
						// The page range ends up in a shell command, so only integers are let through.
						// A missing or non-positive bound is omitted and left to pdftotext's default.
					$range = explode('-', $cPKey, 2);
					$low = intval($range[0]);
					$high = isset($range[1]) ? intval($range[1]) : 0;
					$pageOptions = '';
					if ($low > 0) {
						$pageOptions .= ' -f ' . $low;
					}
					if ($high > 0) {
						$pageOptions .= ' -l ' . $high;
					}

						// Get pdf content:
					$tempFileName = t3lib_div::tempnam('Typo3_indexer');		// Create temporary name
					@unlink($tempFileName);	// Delete if exists, just to be safe.
					$cmd = $this->app['pdftotext'] . $pageOptions . ' -enc UTF-8 -q ' . escapeshellarg($absFile) . ' ' . escapeshellarg($tempFileName);
					t3lib_utility_Command::exec($cmd);

					$content = '';
					if (@is_file($tempFileName)) {
						$content = t3lib_div::getUrl($tempFileName);
						unlink($tempFileName);
					} else {
						$this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/locallang.xml:pdfToolsFailed'), $absFile), 2);
					}
					if (strlen($content)) {
						$contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
					}
				}
			}
		break;
		case 'doc':
			if (!empty($this->app['catdoc'])) {
				$outputLines = array();
				t3lib_utility_Command::exec($this->app['catdoc'] . ' -d utf-8 ' . escapeshellarg($absFile), $outputLines);
				$content = implode(LF, $outputLines);
				$contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
			}
		break;
		case 'pps':
		case 'ppt':
			if (!empty($this->app['ppthtml'])) {
				$outputLines = array();
				t3lib_utility_Command::exec($this->app['ppthtml'] . ' ' . escapeshellarg($absFile), $outputLines);
				$content = $this->pObj->convertHTMLToUtf8(implode(LF, $outputLines));
				$contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
				$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
			}
		break;
		case 'xls':
			if (!empty($this->app['xlhtml'])) {
				$outputLines = array();
				t3lib_utility_Command::exec($this->app['xlhtml'] . ' -nc -te ' . escapeshellarg($absFile), $outputLines);
				$content = $this->pObj->convertHTMLToUtf8(implode(LF, $outputLines));
				$contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
				$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
			}
		break;
		case 'sxi':
		case 'sxc':
		case 'sxw':
		case 'ods':
		case 'odp':
		case 'odt':
			if (!empty($this->app['unzip'])) {
				$unzipCmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile);

					// Read content.xml:
				$contentLines = array();
				t3lib_utility_Command::exec($unzipCmd . ' content.xml', $contentLines);
				$content_xml = implode(LF, $contentLines);

					// Read meta.xml:
				$metaLines = array();
				t3lib_utility_Command::exec($unzipCmd . ' meta.xml', $metaLines);
				$meta_xml = implode(LF, $metaLines);

					// A space is put in front of every tag so that words of adjacent elements don't get glued together
				$utf8_content = trim(strip_tags(str_replace('<', ' <', $content_xml)));
				$contentArr = $this->pObj->splitRegularContent($utf8_content);
				$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!

					// Meta information (only evaluated if the XML could be parsed into the expected tree)
				$metaTree = t3lib_div::xml2tree($meta_xml);
				$metaContent = NULL;
				if (is_array($metaTree) && isset($metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'])) {
					$metaContent = $metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'];
				}
				if (is_array($metaContent)) {
					$metaTitle = isset($metaContent['dc:title'][0]['values'][0]) ? $metaContent['dc:title'][0]['values'][0] : '';
					$metaSubject = isset($metaContent['dc:subject'][0]['values'][0]) ? $metaContent['dc:subject'][0]['values'][0] : '';
					$metaDescription = isset($metaContent['dc:description'][0]['values'][0]) ? $metaContent['dc:description'][0]['values'][0] : '';

					if ($metaTitle) {
						$contentArr['title'] = $metaTitle;
					}
					$contentArr['description'] = $metaSubject . ' ' . $metaDescription;

						// Keywords collected:
					if (isset($metaContent['meta:keywords'][0]['ch']['meta:keyword']) && is_array($metaContent['meta:keywords'][0]['ch']['meta:keyword'])) {
						if (!isset($contentArr['keywords'])) {
							$contentArr['keywords'] = '';
						}
						foreach ($metaContent['meta:keywords'][0]['ch']['meta:keyword'] as $kwDat) {
							if (isset($kwDat['values'][0])) {
								$contentArr['keywords'] .= $kwDat['values'][0] . ' ';
							}
						}
					}
				}
			}
		break;
		case 'rtf':
			if (!empty($this->app['unrtf'])) {
				$outputLines = array();
				t3lib_utility_Command::exec($this->app['unrtf'] . ' ' . escapeshellarg($absFile), $outputLines);
				$fileContent = $this->pObj->convertHTMLToUtf8(implode(LF, $outputLines));
				$contentArr = $this->pObj->splitHTMLContent($fileContent);
			}
		break;
		case 'txt':
		case 'csv':		// Raw text
			$content = t3lib_div::getUrl($absFile);
				// TODO: Auto-registration of charset???? -> utf-8 (Current assuming western europe...)
			$content = $this->pObj->convertHTMLToUtf8($content, 'iso-8859-1');
			$contentArr = $this->pObj->splitRegularContent($content);
			$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
		break;
		case 'html':
		case 'htm':
			$fileContent = t3lib_div::getUrl($absFile);
			$fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
			$contentArr = $this->pObj->splitHTMLContent($fileContent);
		break;
		case 'xml':		// PHP strip-tags()
			$fileContent = t3lib_div::getUrl($absFile);

				// Finding charset in the XML declaration (only the first 200 bytes are inspected), default is utf-8:
			$reg = array();
			preg_match('/^[[:space:]]*<\?xml[^>]+encoding[[:space:]]*=[[:space:]]*["\'][[:space:]]*([[:alnum:]_-]+)[[:space:]]*["\']/i',substr($fileContent,0,200),$reg);
			$charset = !empty($reg[1]) ? $this->pObj->csObj->parse_charset($reg[1]) : 'utf-8';

				// Converting content:
			$fileContent = $this->pObj->convertHTMLToUtf8(strip_tags(str_replace('<', ' <', $fileContent)), $charset);
			$contentArr = $this->pObj->splitRegularContent($fileContent);
			$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
		break;
		case 'jpg':		// PHP EXIF
		case 'jpeg':	// PHP EXIF
		case 'tif':		// PHP EXIF
			$exif = function_exists('exif_read_data') ? exif_read_data($absFile, 'IFD0') : FALSE;

			$comment = '';
			if ($exif) {
					// The comments in JPEG files are utf-8, while in Tif files they are 7-bit ascii.
				$exifComment = isset($exif['COMMENT'][0]) ? $exif['COMMENT'][0] : '';
				$imageDescription = isset($exif['ImageDescription']) ? $exif['ImageDescription'] : '';
				$comment = trim($exifComment . ' ' . $imageDescription);
			}
			$contentArr = $this->pObj->splitRegularContent($comment);
			$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
		break;
		default:
			return FALSE;
	}

		// If no title (and why should there be...) then the file-name is set as title. This will raise the hits considerably if the search matches the document name.
	if (is_array($contentArr) && empty($contentArr['title'])) {
		$contentArr['title'] = str_replace('_', ' ', basename($absFile));	// Substituting "_" for " " because many filenames may have this instead of a space char.
	}

	return $contentArr;
}
573
/**
 * Creates an array with pointers to divisions of document.
 * ONLY for PDF files at this point. All other types will have an array with a single element with the value "0" (zero) coming back.
 * The same single-element array is returned for PDF files when the pdfinfo tool
 * is not registered or reports no positive page count.
 *
 * The number of sections follows $this->pdf_mode: a positive value is the number
 * of pages per section, zero or a negative value is (as absolute value, limited
 * to 1..page count) the number of sections.
 *
 * @param string $ext File extension
 * @param string $absFile Absolute filename (must exist and be validated OK before calling function)
 * @return array Array of pointers to sections that the document should be divided into ("low-high" page ranges for PDF)
 */
function fileContentParts($ext,$absFile) {
	$cParts = array(0);

	if ((string)$ext !== 'pdf') {
		return $cParts;
	}
		// Without a registered pdfinfo binary the command line would consist of the file path only:
	if (empty($this->app['pdfinfo'])) {
		return $cParts;
	}

		// Getting pdf-info:
	$infoLines = array();
	t3lib_utility_Command::exec($this->app['pdfinfo'] . ' ' . escapeshellarg($absFile), $infoLines);
	$pdfInfo = $this->splitPdfInfo($infoLines);

	$pages = isset($pdfInfo['pages']) ? intval($pdfInfo['pages']) : 0;
	if ($pages < 1) {
		return $cParts;
	}

		// Calculate mode
	if ($this->pdf_mode > 0) {
		$iter = (int)ceil($pages / $this->pdf_mode);
	} else {
		$iter = t3lib_div::intInRange(abs($this->pdf_mode), 1, $pages);
	}

		// Traverse and create intervals. The integer product is divided last so that
		// the interval borders don't depend on rounding of an intermediate quotient.
	$cParts = array();
	for ($a = 0; $a < $iter; $a++) {
		$low = (int)floor($a * $pages / $iter) + 1;
		$high = (int)floor(($a + 1) * $pages / $iter);
		$cParts[] = $low . '-' . $high;
	}

	return $cParts;
}
613
/**
 * Turns the output lines of the pdfinfo tool into a key/value array.
 *
 * Every line is split at its first colon. The part before the colon becomes
 * the lowercased, trimmed key, the remainder the trimmed value. Lines without
 * a colon or with an empty key are skipped.
 *
 * @param array $pdfInfoArray Array of PDF content, coming from the pdfinfo tool
 * @return array Result array (empty if the input is not an array)
 * @access private
 * @see fileContentParts()
 */
function splitPdfInfo($pdfInfoArray) {
	$info = array();
	if (!is_array($pdfInfoArray)) {
		return $info;
	}

	foreach ($pdfInfoArray as $infoLine) {
		$pair = explode(':', $infoLine, 2);
		if (count($pair) < 2) {
			continue;
		}
		$name = trim($pair[0]);
		if (!$name) {
			continue;
		}
		$info[strtolower($name)] = trim($pair[1]);
	}

	return $info;
}
634
/**
 * Strips line feeds and form feed characters (char 12) that tend to occur at
 * the end of the output of external tools, then trims surrounding whitespace.
 *
 * @param string $string String to clean up
 * @return string Cleaned string
 */
function removeEndJunk($string) {
	$trailingJunkPattern = '/[' . LF . chr(12) . ']*$/';
	$withoutJunk = preg_replace($trailingJunkPattern, '', $string);
	return trim($withoutJunk);
}
644
645
646
647
648
649
650
651
652
653
654
655
656 /************************
657 *
658 * Backend analyzer
659 *
660 ************************/
661
/**
 * Return icon for file extension.
 * "htm" shares the icon of "html" and "jpeg" the icon of "jpg"; every other
 * extension is used unchanged as icon file name.
 *
 * @param string $extension File extension, lowercase.
 * @return string Relative file reference, resolvable by t3lib_div::getFileAbsFileName()
 */
function getIcon($extension) {
	if ($extension == 'htm') {
		$iconName = 'html';
	} elseif ($extension == 'jpeg') {
		$iconName = 'jpg';
	} else {
		$iconName = $extension;
	}
	return 'EXT:indexed_search/pi/res/' . $iconName . '.gif';
}
673 }
674
// Include the file registered as XCLASS for this script in the current TYPO3_MODE, if any.
if (defined('TYPO3_MODE')) {
	if (isset($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php'])) {
		include_once $GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php'];
	}
}
678
679 ?>