[TASK] Unify backend: header/section
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / class.external_parser.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2001-2011 Kasper Skårhøj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the TYPO3 project. The TYPO3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 * A copy is found in the textfile GPL.txt and important notices to the license
17 * from the author is found in LICENSE.txt distributed with these scripts.
18 *
19 *
20 * This script is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
24 *
25 * This copyright notice MUST APPEAR in all copies of the script!
26 ***************************************************************/
27 /**
28 * External standard parsers for indexed_search
29 *
30 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
31 * @coauthor Olivier Simah <noname_paris@yahoo.fr>
32 */
33
34
35
36
37
38
39
40
41
42
43 /**
44 * External standard parsers for indexed_search
45 * MUST RETURN utf-8 content!
46 *
47 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
48 * @package TYPO3
49 * @subpackage tx_indexedsearch
50 */
51 class tx_indexed_search_extparse {
52
53 // PDF splitting mode. The default below is overridden from the extension configuration in initParser().
54 var $pdf_mode = -20; // Zero: the whole PDF file is indexed in one part. Positive value: the file is split evenly into ceil(pages / value) parts, so no part holds more than that many pages (eg. "5" on a 10-page file gives 1-5,6-10). Negative value: the absolute value is the number of parts (eg. "-3" on a 10-page file gives 1-3,4-6,7-10).
55
56 // These arrays are filled by initParser(): paths of the external tools, extension => item_type, and extension => TRUE for every supported extension.
57 var $app = array();
58 var $ext2itemtype_map = array();
59 var $supportedExtensions = array();
60
61 var $pObj; // Reference to parent object (indexer class); used for logging and for splitting/converting content
62 protected $langObject; // Language object used by sL(); chosen in the constructor depending on TYPO3_MODE
63
/**
 * Creates the external parser object and selects the language object
 * that sL() will use for label lookups.
 *
 * In frontend context (TYPO3_MODE equals 'FE') this is $GLOBALS['TSFE'],
 * in every other context it is $GLOBALS['LANG'].
 */
public function __construct() {
	if (TYPO3_MODE == 'FE') {
		$this->langObject = $GLOBALS['TSFE'];
	} else {
		$this->langObject = $GLOBALS['LANG'];
	}
}
71
/**
 * Initialize external parser for parsing content.
 *
 * Reads the serialized indexed_search extension configuration, verifies that the
 * external tool(s) needed for the given file extension are configured and exist
 * on disk, stores their paths in $this->app and registers the extension in
 * $this->supportedExtensions / $this->ext2itemtype_map.
 * Every reason for rejecting an extension is written to the parent object's TS log.
 *
 * @param string File extension
 * @return boolean Returns TRUE if extension is supported/enabled, otherwise FALSE.
 */
function initParser($extension) {

		// Read indexer-config:
	$indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);

		// If windows, apply extension to tool name:
	$exe = (TYPO3_OS == 'WIN') ? '.exe' : '';
	$extOK = FALSE;
	$mainExtension = '';
		// Name of the one external tool the extension depends on (empty: none, or handled separately as for PDF).
		// The name is at the same time the configuration key, the key in $this->app and the prefix of the log labels.
	$singleTool = '';

		// Ignore extensions
	$ignoreExtensions = t3lib_div::trimExplode(',', strtolower($indexerConfig['ignoreExtensions']), 1);
	if (in_array($extension, $ignoreExtensions)) {
		$this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/locallang.xml:ignoreExtensions'), $extension), 1);
		return FALSE;
	}

		// Switch on file extension:
	switch ($extension) {
		case 'pdf':
				// PDF needs two tools (pdftotext and pdfinfo) from the same directory
			if ($indexerConfig['pdftools']) {
				$pdfPath = rtrim($indexerConfig['pdftools'], '/') . '/';
				if (@is_file($pdfPath . 'pdftotext' . $exe) && @is_file($pdfPath . 'pdfinfo' . $exe)) {
					$this->app['pdfinfo'] = $pdfPath . 'pdfinfo' . $exe;
					$this->app['pdftotext'] = $pdfPath . 'pdftotext' . $exe;
						// PDF mode:
					$this->pdf_mode = t3lib_utility_Math::forceIntegerInRange($indexerConfig['pdf_mode'], -100, 100);
					$extOK = TRUE;
				} else {
					$this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/locallang.xml:pdfToolsNotFound'), $pdfPath), 3);
				}
			} else {
				$this->pObj->log_setTSlogMessage($this->sL('LLL:EXT:indexed_search/locallang.xml:pdfToolsDisabled'), 1);
			}
			break;
		case 'doc':		// MS Word
			$singleTool = 'catdoc';
			break;
		case 'pps':		// MS PowerPoint(?)
		case 'ppt':		// MS PowerPoint
			$singleTool = 'ppthtml';
			break;
		case 'xls':		// MS Excel
			$singleTool = 'xlhtml';
			break;
		case 'sxc':		// Open Office Calc.
		case 'sxi':		// Open Office Impress
		case 'sxw':		// Open Office Writer
		case 'ods':		// Oasis OpenDocument Spreadsheet
		case 'odp':		// Oasis OpenDocument Presentation
		case 'odt':		// Oasis OpenDocument Text
			$singleTool = 'unzip';
			break;
		case 'rtf':		// Rich text format
			$singleTool = 'unrtf';
			break;
		case 'txt':		// Raw text
		case 'csv':		// Raw text
		case 'xml':		// PHP strip-tags()
		case 'tif':		// PHP EXIF
			$extOK = TRUE;
			break;
		case 'html':	// PHP strip-tags()
		case 'htm':		// PHP strip-tags()
			$extOK = TRUE;
			$mainExtension = 'html';	// making "html" the common "item_type"
			break;
		case 'jpg':		// PHP EXIF
		case 'jpeg':	// PHP EXIF
			$extOK = TRUE;
			$mainExtension = 'jpeg';	// making "jpeg" the common item_type
			break;
	}

		// Shared check for all formats that depend on exactly one external tool:
	if ($singleTool !== '') {
		if ($indexerConfig[$singleTool]) {
			$toolPath = rtrim($indexerConfig[$singleTool], '/') . '/';
			if (@is_file($toolPath . $singleTool . $exe)) {
				$this->app[$singleTool] = $toolPath . $singleTool . $exe;
				$extOK = TRUE;
			} else {
				$this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/locallang.xml:' . $singleTool . 'NotFound'), $toolPath), 3);
			}
		} else {
			$this->pObj->log_setTSlogMessage($this->sL('LLL:EXT:indexed_search/locallang.xml:' . $singleTool . 'Disabled'), 1);
		}
	}

		// Explicit FALSE (instead of falling off the end with NULL) so the documented boolean contract holds:
	if (!$extOK) {
		return FALSE;
	}

		// Extension was OK: register it together with its item_type
	$this->supportedExtensions[$extension] = TRUE;
	$this->ext2itemtype_map[$extension] = $mainExtension ? $mainExtension : $extension;
	return TRUE;
}
190
/**
 * Initialize external parser for backend modules
 * Doesn't evaluate if parser is configured right - more like returning POSSIBLE supported extensions (for showing icons etc) in backend and frontend plugin
 *
 * @param string File extension to initialize for.
 * @return boolean Returns TRUE if the extension is one of the potentially supported ones, otherwise FALSE.
 */
function softInit($extension) {
	$possibleExtensions = array(
		'pdf',		// PDF
		'doc',		// MS Word files
		'pps',		// MS PowerPoint
		'ppt',		// MS PowerPoint
		'xls',		// MS Excel
		'sxc',		// Open Office Calc.
		'sxi',		// Open Office Impress
		'sxw',		// Open Office Writer
		'ods',		// Oasis OpenDocument Spreadsheet
		'odp',		// Oasis OpenDocument Presentation
		'odt',		// Oasis OpenDocument Text
		'rtf',		// RTF documents
		'txt',		// ASCII Text documents
		'html',		// HTML
		'htm',		// HTML
		'csv',		// Comma Separated Values
		'xml',		// Generic XML
		'jpg',		// Jpeg images (EXIF comment)
		'jpeg',		// Jpeg images (EXIF comment)
		'tif',		// TIF images (EXIF comment)
	);

		// Strict string comparison: always yields a real boolean and never matches
		// non-string input (such as 0 or TRUE) by loose comparison.
	return in_array((string)$extension, $possibleExtensions, TRUE);
}
224
/**
 * Return title of entry in media type selector box.
 *
 * The title is only produced when the extension is not on the configured ignore
 * list and - for formats that need an external tool - when that tool is configured.
 *
 * @param string File extension
 * @return string String with label value of entry in media type search selector box (frontend plugin). FALSE if the extension is ignored by configuration, NULL if there is no entry for it.
 */
function searchTypeMediaTitle($extension) {

		// Read indexer-config
	$indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);

		// Extensions on the ignore list never get an entry
	$ignoreExtensions = t3lib_div::trimExplode(',', strtolower($indexerConfig['ignoreExtensions']), 1);
	if (in_array($extension, $ignoreExtensions)) {
		return FALSE;
	}

		// Configuration key of the external tool that must be enabled (empty: no tool needed)
	$requiredTool = '';
		// Label reference of the entry (empty: no entry for this extension)
	$labelReference = '';

	switch ($extension) {
		case 'pdf':		// PDF
			$requiredTool = 'pdftools';
			$labelReference = 'LLL:EXT:indexed_search/locallang.xml:extension.PDF';
			break;
		case 'doc':		// Catdoc
			$requiredTool = 'catdoc';
			$labelReference = 'LLL:EXT:indexed_search/locallang.xml:extension.DOC';
			break;
		case 'pps':		// MS PowerPoint(?)
		case 'ppt':		// MS PowerPoint
			$requiredTool = 'ppthtml';
			$labelReference = 'LLL:EXT:indexed_search/locallang.xml:extension.PP';
			break;
		case 'xls':		// MS Excel
			$requiredTool = 'xlhtml';
			$labelReference = 'LLL:EXT:indexed_search/locallang.xml:extension.XLS';
			break;
		case 'sxc':		// Open Office Calc.
			$requiredTool = 'unzip';
			$labelReference = 'LLL:EXT:indexed_search/locallang.xml:extension.SXC';
			break;
		case 'sxi':		// Open Office Impress
			$requiredTool = 'unzip';
			$labelReference = 'LLL:EXT:indexed_search/locallang.xml:extension.SXI';
			break;
		case 'sxw':		// Open Office Writer
			$requiredTool = 'unzip';
			$labelReference = 'LLL:EXT:indexed_search/locallang.xml:extension.SXW';
			break;
		case 'ods':		// Oasis OpenDocument Spreadsheet
			$requiredTool = 'unzip';
			$labelReference = 'LLL:EXT:indexed_search/locallang.xml:extension.ODS';
			break;
		case 'odp':		// Oasis OpenDocument Presentation
			$requiredTool = 'unzip';
			$labelReference = 'LLL:EXT:indexed_search/locallang.xml:extension.ODP';
			break;
		case 'odt':		// Oasis OpenDocument Text
			$requiredTool = 'unzip';
			$labelReference = 'LLL:EXT:indexed_search/locallang.xml:extension.ODT';
			break;
		case 'rtf':		// unrtf
			$requiredTool = 'unrtf';
			$labelReference = 'LLL:EXT:indexed_search/locallang.xml:extension.RTF';
			break;
		case 'jpeg':	// PHP EXIF
		case 'tif':		// PHP EXIF
			$labelReference = 'LLL:EXT:indexed_search/locallang.xml:extension.Images';
			break;
		case 'html':	// PHP strip-tags()
			$labelReference = 'LLL:EXT:indexed_search/locallang.xml:extension.HTML';
			break;
		case 'txt':		// Raw text
			$labelReference = 'LLL:EXT:indexed_search/locallang.xml:extension.TXT';
			break;
		case 'csv':		// Raw text
			$labelReference = 'LLL:EXT:indexed_search/locallang.xml:extension.CSV';
			break;
		case 'xml':		// PHP strip-tags()
			$labelReference = 'LLL:EXT:indexed_search/locallang.xml:extension.XML';
			break;
			// "htm", "jpg" and everything unknown: NO entry (duplicates or blank)
	}

	if ($labelReference === '') {
		return NULL;
	}
	if ($requiredTool !== '' && !$indexerConfig[$requiredTool]) {
			// Tool not configured, so no entry is offered
		return NULL;
	}

	return sprintf($this->sL($labelReference), $extension);
}
328
/**
 * Returns TRUE if the input extension (item_type) is potentially a multi-page extension.
 * Only "pdf" qualifies at this point.
 *
 * @param string Extension / item_type string
 * @return boolean TRUE if multi-page, otherwise FALSE
 */
function isMultiplePageExtension($extension) {
		// Always a real boolean: FALSE (not NULL) for everything that is not PDF.
	return ((string)$extension === 'pdf');
}
343
/**
 * Fetches a label by delegating to the sL() method of the language object
 * that was selected in the constructor.
 *
 * @param string $reference: Reference/key of the label
 * @param boolean $useHtmlSpecialChar: Convert special chars to HTML entities (default: FALSE)
 * @return string The label of the reference/key to be fetched
 */
protected function sL($reference, $useHtmlSpecialChar = FALSE) {
	$label = $this->langObject->sL($reference, $useHtmlSpecialChar);
	return $label;
}
354
355
356
357
358
359
360
361
362
363 /************************
364 *
365 * Reading documents (for parsing)
366 *
367 ************************/
368
/**
 * Reads the content of an external file being indexed.
 *
 * Depending on the file extension the content is extracted with an external tool
 * registered by initParser() (pdftotext, catdoc, ppthtml, xlhtml, unzip, unrtf) or with
 * plain PHP functions, and is then split into the standard content array by the parent object.
 *
 * @param string File extension, eg. "pdf", "doc" etc.
 * @param string Absolute filename of file (must exist and be validated OK before calling function)
 * @param string Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be splitted.)
 * @return array Standard content array (title, description, keywords, body keys). FALSE if the extension is not supported, NULL if no content could be extracted.
 */
function readFileContent($ext,$absFile,$cPKey) {
	$contentArr = NULL;

		// Return immediately if initialization didn't set support up:
	if (empty($this->supportedExtensions[$ext])) {
		return FALSE;
	}

		// Switch by file extension
	switch ($ext) {
		case 'pdf':
			if ($this->app['pdfinfo']) {
					// Getting pdf-info:
				$cmd = $this->app['pdfinfo'] . ' ' . escapeshellarg($absFile);
				t3lib_utility_Command::exec($cmd, $res);
				$pdfInfo = $this->splitPdfInfo($res);
				unset($res);
				if (isset($pdfInfo['pages']) && intval($pdfInfo['pages'])) {
						// The section pointer has the form "low-high". Both parts are forced to
						// integers so that nothing but digits can end up in the shell command.
					$pageRange = explode('-', $cPKey);
					$low = intval($pageRange[0]);
					$high = isset($pageRange[1]) ? intval($pageRange[1]) : 0;

						// Get pdf content:
					$tempFileName = t3lib_div::tempnam('Typo3_indexer');	// Create temporary name
					@unlink($tempFileName);	// Delete if exists, just to be safe.
					$cmd = $this->app['pdftotext'] . ' -f ' . $low . ' -l ' . $high . ' -enc UTF-8 -q ' . escapeshellarg($absFile) . ' ' . escapeshellarg($tempFileName);
					t3lib_utility_Command::exec($cmd);
					$content = '';
					if (@is_file($tempFileName)) {
						$content = t3lib_div::getUrl($tempFileName);
						unlink($tempFileName);
					} else {
						$this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/locallang.xml:pdfToolsFailed'), $absFile), 2);
					}
					if (strlen($content)) {
						$contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
					}
				}
			}
			break;
		case 'doc':
			if ($this->app['catdoc']) {
				$cmd = $this->app['catdoc'] . ' -d utf-8 ' . escapeshellarg($absFile);
				t3lib_utility_Command::exec($cmd, $res);
				$content = implode(LF, $res);
				unset($res);
				$contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
			}
			break;
		case 'pps':
		case 'ppt':
			if ($this->app['ppthtml']) {
				$cmd = $this->app['ppthtml'] . ' ' . escapeshellarg($absFile);
				t3lib_utility_Command::exec($cmd, $res);
				$content = implode(LF, $res);
				unset($res);
				$content = $this->pObj->convertHTMLToUtf8($content);
				$contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
				$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
			}
			break;
		case 'xls':
			if ($this->app['xlhtml']) {
				$cmd = $this->app['xlhtml'] . ' -nc -te ' . escapeshellarg($absFile);
				t3lib_utility_Command::exec($cmd, $res);
				$content = implode(LF, $res);
				unset($res);
				$content = $this->pObj->convertHTMLToUtf8($content);
				$contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
				$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
			}
			break;
		case 'sxi':
		case 'sxc':
		case 'sxw':
		case 'ods':
		case 'odp':
		case 'odt':
			if ($this->app['unzip']) {
					// Read content.xml:
				$cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' content.xml';
				t3lib_utility_Command::exec($cmd, $res);
				$content_xml = implode(LF, $res);
				unset($res);

					// Read meta.xml:
				$cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' meta.xml';
				t3lib_utility_Command::exec($cmd, $res);
				$meta_xml = implode(LF, $res);
				unset($res);

					// A space is put in front of every tag so that words of adjacent elements don't run together once the tags are stripped
				$utf8_content = trim(strip_tags(str_replace('<', ' <', $content_xml)));
				$contentArr = $this->pObj->splitRegularContent($utf8_content);
				$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!

					// Meta information
				$metaContent = t3lib_div::xml2tree($meta_xml);
				$metaContent = $metaContent['office:document-meta'][0]['ch']['office:meta'][0]['ch'];
				if (is_array($metaContent)) {
					$contentArr['title'] = $metaContent['dc:title'][0]['values'][0] ? $metaContent['dc:title'][0]['values'][0] : $contentArr['title'];
					$contentArr['description'] = $metaContent['dc:subject'][0]['values'][0] . ' ' . $metaContent['dc:description'][0]['values'][0];

						// Keywords collected:
					if (is_array($metaContent['meta:keywords'][0]['ch']['meta:keyword'])) {
						if (!isset($contentArr['keywords'])) {
							$contentArr['keywords'] = '';
						}
						foreach ($metaContent['meta:keywords'][0]['ch']['meta:keyword'] as $kwDat) {
							$contentArr['keywords'] .= $kwDat['values'][0] . ' ';
						}
					}
				}
			}
			break;
		case 'rtf':
			if ($this->app['unrtf']) {
				$cmd = $this->app['unrtf'] . ' ' . escapeshellarg($absFile);
				t3lib_utility_Command::exec($cmd, $res);
				$fileContent = implode(LF, $res);
				unset($res);
				$fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
				$contentArr = $this->pObj->splitHTMLContent($fileContent);
			}
			break;
		case 'txt':
		case 'csv':		// Raw text
			$content = t3lib_div::getUrl($absFile);
				// TODO: Auto-registration of charset???? -> utf-8 (Current assuming western europe...)
			$content = $this->pObj->convertHTMLToUtf8($content, 'iso-8859-1');
			$contentArr = $this->pObj->splitRegularContent($content);
			$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
			break;
		case 'html':
		case 'htm':
			$fileContent = t3lib_div::getUrl($absFile);
			$fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
			$contentArr = $this->pObj->splitHTMLContent($fileContent);
			break;
		case 'xml':		// PHP strip-tags()
			$fileContent = t3lib_div::getUrl($absFile);

				// Finding charset in the XML declaration (only the first 200 bytes are inspected); utf-8 if none is declared:
			$reg = array();
			preg_match('/^[[:space:]]*<\?xml[^>]+encoding[[:space:]]*=[[:space:]]*["\'][[:space:]]*([[:alnum:]_-]+)[[:space:]]*["\']/i', substr($fileContent, 0, 200), $reg);
			$charset = !empty($reg[1]) ? $this->pObj->csObj->parse_charset($reg[1]) : 'utf-8';

				// Converting content:
			$fileContent = $this->pObj->convertHTMLToUtf8(strip_tags(str_replace('<', ' <', $fileContent)), $charset);
			$contentArr = $this->pObj->splitRegularContent($fileContent);
			$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
			break;
		case 'jpg':		// PHP EXIF
		case 'jpeg':	// PHP EXIF
		case 'tif':		// PHP EXIF
			if (function_exists('exif_read_data')) {
				$exif = exif_read_data($absFile, 'IFD0');
			} else {
				$exif = FALSE;
			}

			if ($exif) {
					// The comments in JPEG files are utf-8, while in Tif files they are 7-bit ascii.
					// Either field may be absent, in which case it contributes an empty string.
				$exifComment = isset($exif['COMMENT'][0]) ? $exif['COMMENT'][0] : '';
				$exifDescription = isset($exif['ImageDescription']) ? $exif['ImageDescription'] : '';
				$comment = trim($exifComment . ' ' . $exifDescription);
			} else {
				$comment = '';
			}
			$contentArr = $this->pObj->splitRegularContent($comment);
			$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
			break;
		default:
			return FALSE;
	}

		// If no title (and why should there be...) then the file-name is set as title. This will raise the hits considerably if the search matches the document name.
	if (is_array($contentArr) && !$contentArr['title']) {
		$contentArr['title'] = str_replace('_', ' ', basename($absFile));	// Substituting "_" for " " because many filenames may have this instead of a space char.
	}

	return $contentArr;
}
547
/**
 * Creates an array with pointers to divisions of document.
 * ONLY for PDF files at this point. All other types will have an array with a single element with the value "0" (zero) coming back.
 * For PDF files the pointers are page ranges of the form "low-high", their number being controlled by $this->pdf_mode.
 *
 * @param string File extension
 * @param string Absolute filename (must exist and be validated OK before calling function)
 * @return array Array of pointers to sections that the document should be divided into
 */
function fileContentParts($ext,$absFile) {
	$cParts = array(0);
	switch ($ext) {
		case 'pdf':
				// Without a registered pdfinfo tool there is nothing to execute; keep the single default part.
			if (empty($this->app['pdfinfo'])) {
				break;
			}

				// Getting pdf-info:
			$cmd = $this->app['pdfinfo'] . ' ' . escapeshellarg($absFile);
			t3lib_utility_Command::exec($cmd, $res);
			$pdfInfo = $this->splitPdfInfo($res);
			unset($res);

			$pages = isset($pdfInfo['pages']) ? intval($pdfInfo['pages']) : 0;
			if ($pages) {
				$cParts = array();

					// Calculate number of parts:
				if ($this->pdf_mode > 0) {
						// Positive mode: number of pages per part
					$iter = ceil($pages / $this->pdf_mode);
				} else {
						// Zero or negative mode: absolute value is the number of parts, limited to 1..pages
					$iter = t3lib_utility_Math::forceIntegerInRange(abs($this->pdf_mode), 1, $pages);
				}

					// Traverse and create intervals. The product is formed before the division so that
					// a boundary which is an exact multiple is not derived from a rounded quotient.
				for ($a = 0; $a < $iter; $a++) {
					$low = floor(($a * $pages) / $iter) + 1;
					$high = floor((($a + 1) * $pages) / $iter);
					$cParts[] = $low . '-' . $high;
				}
			}
			break;
	}
	return $cParts;
}
587
/**
 * Turns the output lines of the pdfinfo tool into an associative array.
 * Each line is split at its first colon; the lowercased, trimmed part before it
 * becomes the key and the trimmed remainder the value. Lines without a colon or
 * with an empty key are skipped.
 *
 * @param array Array of PDF content, coming from the pdfinfo tool
 * @return array Result array
 * @access private
 * @see fileContentParts()
 */
function splitPdfInfo($pdfInfoArray) {
	$info = array();
	if (!is_array($pdfInfoArray)) {
		return $info;
	}

	foreach ($pdfInfoArray as $infoLine) {
		$segments = explode(':', $infoLine, 2);
		if (count($segments) < 2) {
			continue;
		}
		$label = trim($segments[0]);
		if (!$label) {
			continue;
		}
		$info[strtolower($label)] = trim($segments[1]);
	}

	return $info;
}
608
/**
 * Removes some strange char(12) characters and line breaks that tend to occur at the end of the string from external files.
 * The result is additionally trimmed at both ends.
 *
 * @param string String to clean up
 * @return string String
 */
function removeEndJunk($string) {
	$trailingJunkPattern = '/[' . LF . chr(12) . ']*$/';
	$withoutJunk = preg_replace($trailingJunkPattern, '', $string);
	return trim($withoutJunk);
}
618
619
620
621
622
623
624
625
626
627
628
629
630 /************************
631 *
632 * Backend analyzer
633 *
634 ************************/
635
/**
 * Return icon for file extension.
 * "htm" shares the icon of "html" and "jpeg" shares the icon of "jpg".
 *
 * @param string File extension, lowercase.
 * @return string Relative file reference, resolvable by t3lib_div::getFileAbsFileName()
 */
function getIcon($extension) {
	if ($extension == 'htm') {
		$iconName = 'html';
	} elseif ($extension == 'jpeg') {
		$iconName = 'jpg';
	} else {
		$iconName = $extension;
	}
	return 'EXT:indexed_search/pi/res/' . $iconName . '.gif';
}
647 }
648
// Include an XCLASS extension of this file, if one is registered for the current TYPO3_MODE.
if (defined('TYPO3_MODE')) {
	if (isset($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php'])) {
		include_once($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php']);
	}
}
652
653 ?>