Fixed bug #4973: Check if array is empty before looping over it (thanks to Thomas...
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / class.crawler.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2001-2006 Kasper Skaarhoj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the TYPO3 project. The TYPO3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 * A copy is found in the textfile GPL.txt and important notices to the license
17 * from the author is found in LICENSE.txt distributed with these scripts.
18 *
19 *
20 * This script is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
24 *
25 * This copyright notice MUST APPEAR in all copies of the script!
26 ***************************************************************/
27 /**
28 * Crawler hook for indexed search. Works with the "crawler" extension
29 *
30 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
31 */
32 /**
33 * [CLASS/FUNCTION INDEX of SCRIPT]
34 *
35 *
36 *
37 * 87: class tx_indexedsearch_crawler
38 * 106: function crawler_init(&$pObj)
39 * 219: function crawler_execute($params,&$pObj)
40 * 285: function crawler_execute_type1($cfgRec,&$session_data,$params,&$pObj)
41 * 345: function crawler_execute_type2($cfgRec,&$session_data,$params,&$pObj)
42 * 414: function crawler_execute_type3($cfgRec,&$session_data,$params,&$pObj)
43 * 458: function crawler_execute_type4($cfgRec,&$session_data,$params,&$pObj)
44 * 513: function cleanUpOldRunningConfigurations()
45 *
46 * SECTION: Helper functions
47 * 579: function checkUrl($url,$urlLog,$baseUrl)
48 * 602: function indexExtUrl($url, $pageId, $rl, $cfgUid, $setId)
49 * 645: function indexSingleRecord($r,$cfgRec,$rl=NULL)
50 * 694: function loadIndexerClass()
51 * 706: function getUidRootLineForClosestTemplate($id)
52 * 739: function generateNextIndexingTime($cfgRec)
53 * 778: function checkDeniedSuburls($url, $url_deny)
54 * 798: function addQueueEntryForHook($cfgRec, $title)
55 *
56 * SECTION: Hook functions for TCEmain (indexing of records)
57 * 830: function processDatamap_afterDatabaseOperations($status, $table, $id, $fieldArray, &$pObj)
58 *
59 *
60 * 879: class tx_indexedsearch_files
61 * 888: function crawler_execute($params,&$pObj)
62 * 913: function loadIndexerClass()
63 *
64 * TOTAL FUNCTIONS: 18
65 * (This index is automatically created/updated by the extension "extdeveval")
66 *
67 */
68
69
70
71
// Make sure a language object exists in $GLOBALS['LANG']; its charSet is read when records are indexed:
require_once(PATH_typo3.'sysext/lang/lang.php');
if (is_object($GLOBALS['LANG']) === FALSE) {
	// No language object yet: create one and initialize it with the backend user's language setting.
	$GLOBALS['LANG'] = t3lib_div::makeInstance('language');
	$GLOBALS['LANG']->init($GLOBALS['BE_USER']->uc['lang']);
}
78
79
80 /**
81 * Crawler hook for indexed search. Works with the "crawler" extension
82 *
83 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
84 * @package TYPO3
85 * @subpackage tx_indexedsearch
86 */
87 class tx_indexedsearch_crawler {
88
89 // Static:
90 var $secondsPerExternalUrl = 3; // Number of seconds to use as interval between queued indexing operations of URLs / files (types 2 & 3)
91
92 // Internal, dynamic:
93 var $instanceCounter = 0; // Counts up for each added URL (type 3)
94
95 // Internal, static:
96 var $callBack = 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'; // The object reference to this class.
97
/**
 * Initialization of crawler hook.
 * This function is asked for each instance of the crawler and we must check if something is timed to happen and if so put entry(s) in the crawlers log to start processing.
 * In reality we select indexing configurations and evaluate if any of them needs to run.
 * Afterwards, finished indexing runs are closed via cleanUpOldRunningConfigurations().
 *
 * @param	object		Parent object (tx_crawler lib)
 * @return	void
 */
function crawler_init(&$pObj){

		// Use one timestamp for both time conditions so they cannot disagree across a second boundary:
	$now = time();

		// Select all indexing configuration which are waiting to be activated:
	$indexingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
		'*',
		'index_config',
		'hidden=0
			AND (starttime=0 OR starttime<='.$now.')
			AND timer_next_indexing<'.$now.'
			AND set_id=0
			'.t3lib_BEfunc::deleteClause('index_config')
	);

		// Only loop when the lookup actually produced an array of rows:
	if (is_array($indexingConfigurations)) {
		foreach($indexingConfigurations as $cfgRec) {

				// Generate a unique set-ID:
			$setId = t3lib_div::md5int(microtime());

				// Get next time:
			$nextTime = $this->generateNextIndexingTime($cfgRec);

				// Start process by updating index-config record:
			$field_array = array (
				'set_id' => $setId,
				'timer_next_indexing' => $nextTime,
				'session_data' => '',
			);
			$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_config','uid='.intval($cfgRec['uid']), $field_array);

				// Based on configuration type:
			switch($cfgRec['type']) {
				case 1:	// RECORDS:
					$params = array(
						'indexConfigUid' => $cfgRec['uid'],
						'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),
						'url' => 'Records (start)',	// Just for show.
					);
					$pObj->addQueueEntry_callBack($setId,$params,$this->callBack,$cfgRec['pid']);
				break;
				case 2:	// FILES:
					$params = array(
						'indexConfigUid' => $cfgRec['uid'],		// General
						'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),	// General
						'url' => $cfgRec['filepath'],	// Partly general... (for URL and file types)
						'depth' => 0	// Specific for URL and file types
					);
					$pObj->addQueueEntry_callBack($setId,$params,$this->callBack,$cfgRec['pid']);
				break;
				case 3:	// External URL:
					$params = array(
						'indexConfigUid' => $cfgRec['uid'],		// General
						'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),	// General
						'url' => $cfgRec['externalUrl'],	// Partly general... (for URL and file types)
						'depth' => 0	// Specific for URL and file types
					);
					$pObj->addQueueEntry_callBack($setId,$params,$this->callBack,$cfgRec['pid']);
				break;
				case 4:	// Page tree
					$params = array(
						'indexConfigUid' => $cfgRec['uid'],		// General
						'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),	// General
						'url' => $cfgRec['alternative_source_pid'],	// Partly general... (for URL and file types and page tree (root))
						'depth' => 0	// Specific for URL and file types and page tree
					);
					$pObj->addQueueEntry_callBack($setId,$params,$this->callBack,$cfgRec['pid']);
				break;
				case 5:	// Meta configuration, nothing to do:
					# NOOP
				break;
				default:
						// Custom type: delegate to a hook object registered for this type number, if any.
					if (!empty($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']])) {
						$hookObj = &t3lib_div::getUserObj($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']]);

						if (is_object($hookObj)) {

								// The argument handed to initMessage() was never defined here; pass an explicit NULL
								// (the same value as before, without relying on an undefined variable).
							$message = NULL;

							$params = array(
								'indexConfigUid' => $cfgRec['uid'],		// General
								'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].'/CUSTOM]'),	// General
								'url' => $hookObj->initMessage($message),
							);

							$pObj->addQueueEntry_callBack($setId,$params,$this->callBack,$cfgRec['pid']);
						}
					}
				break;
			}
		}
	}

		// Finally, look up all old index configurations which are finished and needs to be reset and done.
	$this->cleanUpOldRunningConfigurations();
}
211
/**
 * Crawler call back: executes one queue entry that belongs to an indexing configuration.
 * Loads the configuration record named by $params['indexConfigUid'], dispatches to the handler
 * for its type and stores the (possibly modified) session data back into the record.
 *
 * @param	array		Params from log element. Must contain $params['indexConfigUid']
 * @param	object		Parent object (tx_crawler lib)
 * @return	array		Result array; the params are returned under the key "log"
 */
function crawler_execute($params,&$pObj) {

		// Without a configuration uid there is nothing to execute:
	if (!$params['indexConfigUid']) {
		return array('log' => $params);
	}

		// Fetch the indexing configuration record:
	$configRows = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
		'*',
		'index_config',
		'uid='.intval($params['indexConfigUid'])
	);
	list($cfgRec) = $configRows;

	if (!is_array($cfgRec)) {
		return array('log' => $params);
	}

		// Session data travels between calls in serialized form:
	$session_data = unserialize($cfgRec['session_data']);

		// Dispatch on configuration type:
	switch($cfgRec['type']) {
		case 1:	// Records
			$this->crawler_execute_type1($cfgRec,$session_data,$params,$pObj);
		break;
		case 2:	// Files
			$this->crawler_execute_type2($cfgRec,$session_data,$params,$pObj);
		break;
		case 3:	// External URL
			$this->crawler_execute_type3($cfgRec,$session_data,$params,$pObj);
		break;
		case 4:	// Page tree
			$this->crawler_execute_type4($cfgRec,$session_data,$params,$pObj);
		break;
		case 5:	// Meta
			# NOOP (should never enter here!)
		break;
		default:
				// Custom types are handled by a hook object registered under the type number:
			if ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']]) {
				$hookObj = &t3lib_div::getUserObj($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']]);

				if (is_object($hookObj)) {
					$this->pObj = &$pObj;	// Read by addQueueEntryForHook()
					$hookObj->indexOperation($cfgRec,$session_data,$params,$this);
				}
			}
		break;
	}

		// Persist the session data, which the handler may have changed:
	$GLOBALS['TYPO3_DB']->exec_UPDATEquery(
		'index_config',
		'uid='.intval($cfgRec['uid']),
		array('session_data' => serialize($session_data))
	);

	return array('log' => $params);
}
275
/**
 * Indexes one batch of records from the configured table and queues the next batch.
 * The uid of the last processed record is kept in $session_data['uid'] so the following
 * call continues after it.
 *
 * @param	array		Indexing Configuration Record
 * @param	array		Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param	array		Parameters from the log queue.
 * @param	object		Parent object (from "crawler" extension!)
 * @return	void
 */
function crawler_execute_type1($cfgRec,&$session_data,$params,&$pObj) {
	$tableName = $cfgRec['table2index'];

		// A table must be configured and known to TCA:
	if (!$tableName || !isset($GLOBALS['TCA'][$tableName])) {
		return;
	}

		// First call of a session: start before the first uid.
	if (!is_array($session_data)) {
		$session_data = array(
			'uid' => 0
		);
	}

		// Source page (alternative source pid wins over the record's own pid) and batch size (default 100):
	$sourcePid = intval($cfgRec['alternative_source_pid']);
	if (!$sourcePid) {
		$sourcePid = $cfgRec['pid'];
	}
	$batchSize = $cfgRec['recordsbatch'] ? t3lib_div::intInRange($cfgRec['recordsbatch'],1) : 100;

		// Root line the indexed records are related to:
	$rootLineUids = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);

		// Next batch, ordered by uid, starting after the last processed uid:
	$rows = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
		'*',
		$tableName,
		'pid = '.intval($sourcePid).'
			AND uid > '.intval($session_data['uid']).
			t3lib_BEfunc::deleteClause($tableName),
		'',
		'uid',
		$batchSize
	);

		// An empty batch ends the run; no further queue entry is added.
	if (!count($rows)) {
		return;
	}

	foreach($rows as $row) {
		$this->indexSingleRecord($row,$cfgRec,$rootLineUids);

			// Remember how far we got:
		$session_data['uid'] = $row['uid'];
	}

		// Queue the next batch; the "url" is only a label showing where it starts ($row is the last record of this batch):
	$nextParams = array(
		'indexConfigUid' => $cfgRec['uid'],
		'url' => 'Records from UID#'.($row['uid']+1).'-?',
		'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']')
	);
	$pObj->addQueueEntry_callBack($cfgRec['set_id'],$nextParams,$this->callBack,$cfgRec['pid']);
}
335
/**
 * Indexes a file, or expands a directory into new queue entries.
 * $params['url'] holds the path: a file is indexed right away, a directory is read and each
 * matching file (plus sub directories while the depth limit allows) is queued as its own entry.
 *
 * @param	array		Indexing Configuration Record
 * @param	array		Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param	array		Parameters from the log queue.
 * @param	object		Parent object (from "crawler" extension!)
 * @return	void
 */
function crawler_execute_type2($cfgRec,&$session_data,$params,&$pObj) {

		// Resolve the path to an absolute one:
	$absPath = $params['url'];
	if (!t3lib_div::isAbsPath($absPath)) {
		$absPath = t3lib_div::getFileAbsFileName($absPath);
	}

		// Paths which are not allowed are silently ignored:
	if (!t3lib_div::isAllowedAbsPath($absPath)) {
		return;
	}

	if (@is_file($absPath)) {	// A file: index it.

			// Root line must be provided when indexing external files:
		$rootLineUids = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);

		$this->loadIndexerClass();

			// Set up the indexer for this configuration:
		$indexerObj = &t3lib_div::makeInstance('tx_indexedsearch_indexer');
		$indexerObj->backend_initIndexer($cfgRec['pid'], 0, 0, '', $rootLineUids);
		$indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
		$indexerObj->hash['phash'] = -1;	// EXPERIMENT - but to avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)

			// The document is passed with a path relative to PATH_site:
		$indexerObj->indexRegularDocument(substr($absPath,strlen(PATH_site)), TRUE);
		return;
	}

	if (!@is_dir($absPath)) {
		return;
	}

		// A directory: collect files with the configured extensions...
	$extensionList = implode(',',t3lib_div::trimExplode(',',$cfgRec['extensions'],1));
	$collected = array();
	$entries = t3lib_div::getAllFilesAndFoldersInPath($collected,$absPath,$extensionList,0,0);

		// ...and sub directories, as long as the configured depth is not reached:
	$subDirectories = t3lib_div::get_dirs($absPath);
	if (is_array($subDirectories) && $params['depth'] < $cfgRec['depth']) {
		foreach ($subDirectories as $dirName) {
			if ((string)$dirName!='') {
				$entries[]= $absPath.$dirName.'/';
			}
		}
	}
	$entries = t3lib_div::removePrefixPathFromList($entries,PATH_site);

		// One queue entry per item, scheduled with increasing delay:
	foreach($entries as $relPath) {
		$this->instanceCounter++;
		if ($relPath===$params['url']) {
			continue;	// Do not queue the entry we are processing right now
		}

		$nextParams = array(
			'indexConfigUid' => $cfgRec['uid'],
			'url' => $relPath,
			'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),
			'depth' => $params['depth']+1
		);
		$scheduledTime = time()+$this->instanceCounter*$this->secondsPerExternalUrl;
		$pObj->addQueueEntry_callBack($cfgRec['set_id'],$nextParams,$this->callBack,$cfgRec['pid'],$scheduledTime);
	}
}
404
/**
 * Indexes an external URL and queues the links found on it.
 * Links are only followed while the depth limit allows, when checkUrl() accepts them and
 * when they are not matched by the configuration's deny list.
 *
 * @param	array		Indexing Configuration Record
 * @param	array		Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param	array		Parameters from the log queue.
 * @param	object		Parent object (from "crawler" extension!)
 * @return	void
 */
function crawler_execute_type3($cfgRec,&$session_data,$params,&$pObj) {

		// First call of a session: the URL log starts with the current URL.
	if (!is_array($session_data)) {
		$session_data = array(
			'urlLog' => array($params['url'])
		);
	}

		// Index the URL itself and get the links it contains:
	$rootLineUids = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
	$foundUrls = $this->indexExtUrl($params['url'], $cfgRec['pid'], $rootLineUids, $cfgRec['uid'], $cfgRec['set_id']);

		// Depth limit reached: do not follow links.
	if (!($params['depth'] < $cfgRec['depth'])) {
		return;
	}

	foreach($foundUrls as $candidate) {

			// Must be inside the base URL and not logged before:
		$acceptedUrl = $this->checkUrl($candidate,$session_data['urlLog'],$cfgRec['externalUrl']);
		if (!$acceptedUrl) {
			continue;
		}

			// Must not be on the deny list:
		if ($this->checkDeniedSuburls($acceptedUrl, $cfgRec['url_deny'])) {
			continue;
		}

		$this->instanceCounter++;
		$session_data['urlLog'][] = $acceptedUrl;

			// Queue it, scheduled with increasing delay:
		$nextParams = array(
			'indexConfigUid' => $cfgRec['uid'],
			'url' => $acceptedUrl,
			'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),
			'depth' => $params['depth']+1
		);
		$scheduledTime = time()+$this->instanceCounter*$this->secondsPerExternalUrl;
		$pObj->addQueueEntry_callBack($cfgRec['set_id'],$nextParams,$this->callBack,$cfgRec['pid'],$scheduledTime);
	}
}
448
/**
 * Page tree indexing: handles one page and queues its subpages.
 * The page uid comes in as $params['url']. The page's URLs are handed to the crawler object,
 * then every subpage is queued as a new entry while the depth limit allows.
 *
 * @param	array		Indexing Configuration Record
 * @param	array		Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param	array		Parameters from the log queue.
 * @param	object		Parent object (from "crawler" extension!)
 * @return	void
 */
function crawler_execute_type4($cfgRec,&$session_data,$params,&$pObj) {

		// The "url" parameter carries the page uid:
	$pageUid = intval($params['url']);

		// Let the crawler compile the URL sets of this page:
	$pageRow = t3lib_BEfunc::getRecord('pages',$pageUid);
	$urlSets = $pObj->getUrlsForPageRow($pageRow);

	$duplicateTrack = array();	// Registry for duplicates
	$downloadUrls = array();	// Dummy.

		// Hand each set to the crawler (presumably this is what submits the URLs; the return value is not needed here):
	if (count($urlSets)) {
		foreach($urlSets as $urlSet) {
			$pObj->urlListFromUrlArray($urlSet,$pageRow,time(),30,1,0,$duplicateTrack,$downloadUrls,array('tx_indexedsearch_reindex'));
		}
	}

		// Depth limit reached: do not descend.
	if (!($params['depth'] < $cfgRec['depth'])) {
		return;
	}

		// Subpages of the current page:
	$subPages = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
		'uid,title',
		'pages',
		'pid = '.intval($pageUid).
			t3lib_BEfunc::deleteClause('pages')
	);
	if (!count($subPages)) {
		return;
	}

	foreach($subPages as $subPage) {
		$this->instanceCounter++;

			// Log a readable label for the subpage:
		$session_data['urlLog'][] = 'pages:'.$subPage['uid'].': '.$subPage['title'];

			// Queue the subpage, scheduled with increasing delay:
		$nextParams = array(
			'indexConfigUid' => $cfgRec['uid'],
			'url' => $subPage['uid'],
			'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),
			'depth' => $params['depth']+1
		);
		$scheduledTime = time()+$this->instanceCounter*$this->secondsPerExternalUrl;
		$pObj->addQueueEntry_callBack($cfgRec['set_id'],$nextParams,$this->callBack,$cfgRec['pid'],$scheduledTime);
	}
}
507
/**
 * Look up all old index configurations which are finished and needs to be reset and done.
 * A running configuration (set_id != 0) counts as finished when the crawler queue holds no
 * unexecuted entry (exec_time=0) for its set id. For a finished configuration the index entries
 * left over from earlier sets are deleted and set_id / session_data are reset.
 *
 * @return	void
 */
function cleanUpOldRunningConfigurations() {

		// Tables holding per-phash index data (list copied from class.tx_indexedsearch_modfunc1.php):
	$indexTables = array('index_phash','index_rel','index_section','index_grlist','index_fulltext','index_debug');

		// Lookup running index configurations:
	$runningIndexingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
		'uid,set_id',
		'index_config',
		'set_id!=0'.t3lib_BEfunc::deleteClause('index_config')
	);
	if (!is_array($runningIndexingConfigurations)) {
		return;
	}

	foreach($runningIndexingConfigurations as $cfgRec) {
		$configUid = intval($cfgRec['uid']);
		$setId = intval($cfgRec['set_id']);

			// Count queue entries of this set which are still waiting for execution:
		$queueRows = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
			'count(*) AS count',
			'tx_crawler_queue',
			'set_id='.$setId.' AND exec_time=0'
		);

			// If the lookup itself failed we cannot tell whether the run is finished; never delete index data on a guess.
		if (!is_array($queueRows) || !isset($queueRows[0])) {
			continue;
		}
		if ($queueRows[0]['count']) {
			continue;	// Still entries pending: the run is not finished.
		}

			// Lookup phash rows of this configuration which stem from other (older) sets.
			// The set id is cast to integer like every other value concatenated into these queries.
		$oldPhashRows = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
			'phash',
			'index_phash',
			'freeIndexUid='.$configUid.' AND freeIndexSetId!='.$setId
		);

		if (is_array($oldPhashRows)) {
			foreach($oldPhashRows as $pHashRow) {
					// Removing old registrations for all tables:
				foreach($indexTables as $table) {
					$GLOBALS['TYPO3_DB']->exec_DELETEquery($table, 'phash='.intval($pHashRow['phash']));
				}
			}
		}

			// End process by updating index-config record:
		$field_array = array (
			'set_id' => 0,
			'session_data' => '',
		);
		$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_config','uid='.$configUid, $field_array);
	}
}
558
559
560
561
562
563
564
565 /*****************************************
566 *
567 * Helper functions
568 *
569 *****************************************/
570
/**
 * Check if an input URL is allowed to be indexed.
 * The URL is normalized first (a trailing "//" becomes "/", any "#fragment" is cut off). It is
 * accepted when it contains no "../", starts with the base URL and is not in the URL log yet.
 *
 * @param	string		URL string to check
 * @param	array		Array of already indexed URLs (input url is looked up here and must not exist already)
 * @param	string		Base URL of the indexing process (input URL must be "inside" the base URL!)
 * @return	string		Returns the normalized URL if OK, otherwise false
 */
function checkUrl($url,$urlLog,$baseUrl) {

		// Collapse a double slash at the very end of the URL.
		// preg_replace() replaces the POSIX ereg_replace() call; the "D" modifier keeps "$" anchored to
		// the real end of the string so the match is the same as before.
	$url = preg_replace('#//$#D','/',$url);

		// Drop the fragment:
	list($url) = explode('#',$url);

		// No back paths:
	if (strstr($url,'../')) {
		return FALSE;
	}

		// Must be inside the base URL:
	if (!t3lib_div::isFirstPartOfStr($url,$baseUrl)) {
		return FALSE;
	}

		// Must not have been seen already:
	if (in_array($url,$urlLog)) {
		return FALSE;
	}

	return $url;
}
591
/**
 * Indexing External URL
 * Indexes the URL with the indexer class and returns the links extracted from the fetched content.
 * Links without a scheme are turned into absolute URLs using scheme and host of the indexed URL.
 *
 * @param	string		URL, http://....
 * @param	integer		Page id to relate indexing to.
 * @param	array		Rootline array to relate indexing to
 * @param	integer		Configuration UID
 * @param	integer		Set ID value
 * @return	array		URLs found on this page
 */
function indexExtUrl($url, $pageId, $rl, $cfgUid, $setId) {

		// Load indexer if not yet.
	$this->loadIndexerClass();

		// Index external URL:
	$indexerObj = &t3lib_div::makeInstance('tx_indexedsearch_indexer');
	$indexerObj->backend_initIndexer($pageId, 0, 0, '', $rl);
	$indexerObj->backend_setFreeIndexUid($cfgUid, $setId);
	$indexerObj->hash['phash'] = -1;	// To avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)

	$indexerObj->indexExternalUrl($url);
	$url_qParts = parse_url($url);

		// Get URLs on this page:
	$subUrls = array();
	$list = $indexerObj->extractHyperLinks($indexerObj->indexExternalUrl_content);
	if (!is_array($list)) {
		return $subUrls;
	}

		// Traverse links:
	foreach ($list as $linkInfo) {

			// Decode entities:
		$subUrl = t3lib_div::htmlspecialchars_decode($linkInfo['href']);

			// parse_url() returns FALSE for seriously malformed URLs and omits "scheme" for relative ones;
			// both cases are treated as "no scheme" (as before, but without reading an undefined index).
		$qParts = parse_url($subUrl);
		if (!is_array($qParts) || empty($qParts['scheme'])) {
				// NOTE(review): the link is resolved against the host root only; path and port of $url are not considered - confirm this is intended.
			$subUrl = $url_qParts['scheme'].'://'.$url_qParts['host'].'/'.t3lib_div::resolveBackPath($subUrl);
		}

		$subUrls[] = $subUrl;
	}

	return $subUrls;
}
636
/**
 * Indexing Single Record
 * The first field of the configuration's field list becomes the title, all further fields are
 * concatenated into the content. The record is indexed "as a TYPO3 page" related to the
 * configuration's page, with the GET parameters from the configuration (###UID### replaced).
 *
 * @param	array		Record to index
 * @param	array		Configuration Record
 * @param	array		Rootline array to relate indexing to; looked up from the configuration's pid when not given
 * @return	void
 */
function indexSingleRecord($r,$cfgRec,$rl=NULL) {

		// Load indexer if not yet.
	$this->loadIndexerClass();

		// Init:
	$rl = is_array($rl) ? $rl : $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
	$fieldList = t3lib_div::trimExplode(',',$cfgRec['fieldlist'],1);
	$languageField = $GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']['languageField'];
	$sys_language_uid = $languageField ? $r[$languageField] : 0;

		// (Re)-Indexing a row from a table:
	$indexerObj = &t3lib_div::makeInstance('tx_indexedsearch_indexer');
	parse_str(str_replace('###UID###',$r['uid'],$cfgRec['get_params']),$GETparams);
	$indexerObj->backend_initIndexer($cfgRec['pid'], 0, $sys_language_uid, '', $rl, $GETparams, $cfgRec['chashcalc'] ? TRUE : FALSE);
	$indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
	$indexerObj->forceIndexing = TRUE;

		// Both start out empty so an empty field list does not leave the title undefined:
	$theTitle = '';
	$theContent = '';
	foreach($fieldList as $k => $v) {
		if (!$k) {
				// First field (index 0) is the title:
			$theTitle = $r[$v];
		} else {
			$theContent.= $r[$v].' ';
		}
	}

		// Indexing the record as a page (but with parameters set, see ->backend_setFreeIndexUid())
	$indexerObj->backend_indexAsTYPO3Page(
		strip_tags($theTitle),
		'',
		'',
		strip_tags($theContent),
		$GLOBALS['LANG']->charSet,	// Requires that $GLOBALS['LANG'] has been initialized
		$r[$GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']['tstamp']],
		$r[$GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']['crdate']],
		$r['uid']
	);
}
688
/**
 * Makes sure the indexer class file of the "indexed_search" extension is included.
 *
 * @return	void
 */
function loadIndexerClass() {
		// Declared global so the included file sees $TYPO3_CONF_VARS in this function's scope:
	global $TYPO3_CONF_VARS;

	$indexerClassFile = t3lib_extMgm::extPath('indexed_search').'class.indexer.php';
	require_once($indexerClassFile);
}
698
/**
 * Returns the uids of the root line as determined by the TypoScript template parser.
 * Algorithm same as used in Web > Template, Object browser
 *
 * @param	integer		The page id to traverse rootline back from
 * @return	array		Root line uids, indexed by the keys of the template object's root line
 */
function getUidRootLineForClosestTemplate($id) {
		// Declared global so the included files see $TYPO3_CONF_VARS in this function's scope:
	global $TYPO3_CONF_VARS;

	require_once (PATH_t3lib.'class.t3lib_page.php');
	require_once (PATH_t3lib.'class.t3lib_tstemplate.php');
	require_once (PATH_t3lib.'class.t3lib_tsparser_ext.php');

		// Template parser, without time-performance logging:
	$templateParser = t3lib_div::makeInstance('t3lib_tsparser_ext');
	$templateParser->tt_track = 0;
	$templateParser->init();

		// Run the page's root line through the templates; this fills $templateParser->rootLine:
	$pageSelect = t3lib_div::makeInstance('t3lib_pageSelect');
	$pageRootLine = $pageSelect->getRootLine($id);
	$templateParser->runThroughTemplates($pageRootLine,0);

		// Reduce each root line entry to its uid, keeping the keys:
	$uidList = array();
	foreach($templateParser->rootLine as $level => $pageData) {
		$uidList[$level] = $pageData['uid'];
	}

	return $uidList;
}
732
/**
 * Generate the unix time stamp for next visit.
 * The result is "offset time + N * frequency", with N chosen as the smallest number of whole
 * frequency blocks that brings the time up to (or past) the current time.
 *
 * @param	array		Index configuration record (uses timer_frequency, timer_offset, timer_next_indexing)
 * @return	integer		The next time stamp
 */
function generateNextIndexingTime($cfgRec) {
	$currentTime = time();

		// Now, find a midnight time to use for offset calculation. This has to differ depending on whether we have frequencies within a day or more than a day; Less than a day, we don't care which day to use for offset, more than a day we want to respect the currently entered day as offset regardless of when the script is run - thus the day-of-week used in case "Weekly" is selected will be respected
	if ($cfgRec['timer_frequency']<=24*3600) {
			// Midnight of yesterday:
		$aMidNight = mktime(0,0,0)-1*24*3600;
	} else {
		$lastTime = $cfgRec['timer_next_indexing'] ? $cfgRec['timer_next_indexing'] : $currentTime;
			// Midnight of the day of $lastTime. The full four-digit year is passed so mktime() does not have to guess the century.
		$aMidNight = mktime(0,0,0, date('m',$lastTime), date('d',$lastTime), date('Y',$lastTime));
	}

		// Find last offset time plus frequency in seconds:
	$lastSureOffset = $aMidNight+t3lib_div::intInRange($cfgRec['timer_offset'],0,86400);
	$frequencySeconds = t3lib_div::intInRange($cfgRec['timer_frequency'],1);

		// Now, find out how many blocks of the length of frequency there is until the next time:
	$frequencyBlocksUntilNextTime = ceil(($currentTime-$lastSureOffset)/$frequencySeconds);

		// Set next time to the offset + the frequencyblocks multiplied with the frequency length in seconds.
		// ceil() yields a float, so cast back to the integer time stamp the caller expects.
	$nextTime = $lastSureOffset + $frequencyBlocksUntilNextTime*$frequencySeconds;

	return (int)$nextTime;
}
762
/**
 * Checks if $url starts with any of the URLs in the $url_deny "list" and if so, returns true.
 * The function produces no output; an earlier debug "echo" of the matched URL has been removed.
 *
 * @param	string		URL to test
 * @param	string		String where URLs are separated by line-breaks; If any of these strings is the first part of $url, the function returns TRUE (to indicate denial of descend)
 * @return	boolean		TRUE if there is a matching URL (hence, do not index!)
 */
function checkDeniedSuburls($url, $url_deny) {

		// No deny list configured: nothing is denied.
	if (!trim($url_deny)) {
		return FALSE;
	}

		// One denied URL prefix per line; empty lines are dropped:
	$url_denyArray = t3lib_div::trimExplode(chr(10),$url_deny,1);
	foreach($url_denyArray as $testurl) {
		if (t3lib_div::isFirstPartOfStr($url,$testurl)) {
			return TRUE;
		}
	}

	return FALSE;
}
782
/**
 * Adds a queue entry on behalf of a hook object.
 * Uses $this->pObj, which crawler_execute() sets before it calls the hook.
 *
 * @param	array		Configuration record
 * @param	string		Title/URL
 * @return	void
 */
function addQueueEntryForHook($cfgRec, $title) {
	$configUid = $cfgRec['uid'];

	$queueParams = array(
		'indexConfigUid' => $configUid,	// Must ALWAYS be the uid of the configuration record
		'url' => $title,
		'procInstructions' => array('[Index Cfg UID#'.$configUid.']')	// Informational: shows that an indexing configuration added the entry
	);

	$this->pObj->addQueueEntry_callBack($cfgRec['set_id'],$queueParams,$this->callBack,$cfgRec['pid']);
}
799
/**
 * Removes everything indexed search has stored for a page.
 * The phash values are taken from the page's rows in index_section and then deleted from all index tables.
 *
 * @param	integer		Uid of the page to delete all pHash
 * @return	void
 */
function deleteFromIndex($id) {

		// phash values registered for the page:
	$sectionRows = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('phash','index_section', 'page_id='.intval($id));

		// Nothing indexed for this page:
	if (!count($sectionRows)) {
		return;
	}

	$phashList = array();
	foreach ($sectionRows as $sectionRow) {
		$phashList[] = $sectionRow['phash'];
	}

		// One DELETE per index table, matching all collected phash values at once:
	$deleteWhere = 'phash IN ('.implode(',',$GLOBALS['TYPO3_DB']->cleanIntArray($phashList)).')';
	$indexTables = explode(',', 'index_debug,index_fulltext,index_grlist,index_phash,index_rel,index_section');
	foreach ($indexTables as $indexTable) {
		$GLOBALS['TYPO3_DB']->exec_DELETEquery($indexTable, $deleteWhere);
	}
}
824
825
826
827
828
829
830
831 /*************************
832 *
833 * Hook functions for TCEmain (indexing of records)
834 *
835 *************************/
836
/**
 * TCEmain hook function called before a command is processed.
 * When a page is about to be deleted, its index data is removed.
 *
 * @param	string		TCEmain command
 * @param	string		Table name
 * @param	string		Record ID
 * @param	mixed		Target value (ignored)
 * @param	object		Reference to tcemain calling object
 * @return	void
 */
function processCmdmap_preProcess($command, $table, $id, $value, &$pObj) {

		// Only page deletions are of interest here:
	if ($command!='delete' || $table!='pages') {
		return;
	}

	$this->deleteFromIndex($id);
}
854
/**
 * TCEmain hook function for on-the-fly indexing of database records.
 * After a record was created or updated it is indexed by every matching "records" indexing
 * configuration which has "index on change" enabled. A page which gets hidden or excluded from
 * search by the update has its index data removed first.
 *
 * @param	string		Status "new" or "update"
 * @param	string		Table name
 * @param	string		Record ID. If new record its a string pointing to index inside t3lib_tcemain::substNEWwithIDs
 * @param	array		Field array of updated fields in the operation
 * @param	object		Reference to tcemain calling object
 * @return	void
 */
function processDatamap_afterDatabaseOperations($status, $table, $id, $fieldArray, &$pObj) {

		// No fields updated, nothing to do:
	if (!count($fieldArray)) {
		return;
	}

	if ($status=='new') {
			// New records: translate the placeholder into the real uid.
		$id = $pObj->substNEWwithIDs[$id];
	} elseif ($table=='pages' && $status=='update') {
		$becameHidden = array_key_exists('hidden',$fieldArray) && $fieldArray['hidden']==1;
		$becameNoSearch = array_key_exists('no_search',$fieldArray) && $fieldArray['no_search']==1;

			// If the page should be hidden or not indexed after update, delete index for this page
		if ($becameHidden || $becameNoSearch) {
			$this->deleteFromIndex($id);
		}
	}

		// The full record is needed for indexing; stop if it cannot be loaded:
	$currentRecord = t3lib_BEfunc::getRecord($table,$id);
	if (!is_array($currentRecord)) {
		return;
	}

		// Select all (not running) indexing configurations of type "record" (1) and which points to this table and is located on the same page as the record or pointing to the right source PID
	$db = $GLOBALS['TYPO3_DB'];
	$indexingConfigurations = $db->exec_SELECTgetRows(
		'*',
		'index_config',
		'hidden=0
			AND (starttime=0 OR starttime<='.time().')
			AND set_id=0
			AND type=1
			AND table2index='.$db->fullQuoteStr($table,'index_config').'
			AND (
					(alternative_source_pid='.$db->fullQuoteStr('','index_config').' AND pid='.intval($currentRecord['pid']).')
					OR (alternative_source_pid='.$db->fullQuoteStr($currentRecord['pid'],'index_config').')
				)
			AND records_indexonchange=1
			'.t3lib_BEfunc::deleteClause('index_config')
	);

		// Index the record once per matching configuration:
	foreach($indexingConfigurations as $matchingConfig) {
		$this->indexSingleRecord($currentRecord,$matchingConfig);
	}
}
907 }
908
909
/**
 * Crawler hook for indexed search. Works with the "crawler" extension
 * This hook is specifically used to index external files found on pages through the crawler extension.
 *
 * @author	Kasper Skaarhoj <kasperYYYY@typo3.com>
 * @package TYPO3
 * @subpackage tx_indexedsearch
 * @see tx_indexedsearch_indexer::extractLinks()
 */
class tx_indexedsearch_files {

	/**
	 * Crawler call back: indexes the document named in the queue entry.
	 * Does nothing (and returns nothing) unless $params['conf'] is an array.
	 *
	 * @param	array		Params from log element; reads "conf", "document" and optionally "alturl"
	 * @param	object		Parent object (tx_crawler lib)
	 * @return	array		Result array with an empty "content" entry when the document was handed to the indexer
	 */
	function crawler_execute($params,&$pObj) {

			// Make sure the indexer class is available:
		$this->loadIndexerClass();

		if (!is_array($params['conf'])) {
			return;
		}

			// Set up the indexer with the configuration from the queue entry:
		$indexerObj = &t3lib_div::makeInstance('tx_indexedsearch_indexer');
		$indexerObj->conf = $params['conf'];
		$indexerObj->init();

		if ($params['alturl']) {
				// An alternative URL is indexed in place of the document; the lowercased file extension of the document is passed along.
			$fileExtension = strtolower(pathinfo($params['document'], PATHINFO_EXTENSION));
			$indexerObj->indexRegularDocument($params['alturl'], TRUE, $params['document'], $fileExtension);
		} else {
			$indexerObj->indexRegularDocument($params['document'], TRUE);
		}

			// Return OK:
		return array('content' => array());
	}

	/**
	 * Makes sure the indexer class file of the "indexed_search" extension is included.
	 *
	 * @return	void
	 */
	function loadIndexerClass() {
			// Declared global so the included file sees $TYPO3_CONF_VARS in this function's scope:
		global $TYPO3_CONF_VARS;

		$indexerClassFile = t3lib_extMgm::extPath('indexed_search').'class.indexer.php';
		require_once($indexerClassFile);
	}
}
964
965
// XCLASS: if an extending class file is registered for this script in the current TYPO3_MODE, include it.
if (defined('TYPO3_MODE')) {
	if ($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.crawler.php']) {
		include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.crawler.php']);
	}
}
969 ?>