Fixed #11430: Performance improvement: use $GLOBALS['EXEC_TIME'] instead of time...
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / class.crawler.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2001-2009 Kasper Skaarhoj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the TYPO3 project. The TYPO3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 * A copy is found in the textfile GPL.txt and important notices to the license
17 * from the author is found in LICENSE.txt distributed with these scripts.
18 *
19 *
20 * This script is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
24 *
25 * This copyright notice MUST APPEAR in all copies of the script!
26 ***************************************************************/
27 /**
28 * Crawler hook for indexed search. Works with the "crawler" extension
29 *
30 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
31 */
32 /**
33 * [CLASS/FUNCTION INDEX of SCRIPT]
34 *
35 *
36 *
37 * 87: class tx_indexedsearch_crawler
38 * 106: function crawler_init(&$pObj)
39 * 219: function crawler_execute($params,&$pObj)
40 * 285: function crawler_execute_type1($cfgRec,&$session_data,$params,&$pObj)
41 * 345: function crawler_execute_type2($cfgRec,&$session_data,$params,&$pObj)
42 * 414: function crawler_execute_type3($cfgRec,&$session_data,$params,&$pObj)
43 * 458: function crawler_execute_type4($cfgRec,&$session_data,$params,&$pObj)
44 * 513: function cleanUpOldRunningConfigurations()
45 *
46 * SECTION: Helper functions
47 * 579: function checkUrl($url,$urlLog,$baseUrl)
48 * 602: function indexExtUrl($url, $pageId, $rl, $cfgUid, $setId)
49 * 645: function indexSingleRecord($r,$cfgRec,$rl=NULL)
50 * 694: function loadIndexerClass()
51 * 706: function getUidRootLineForClosestTemplate($id)
52 * 739: function generateNextIndexingTime($cfgRec)
53 * 778: function checkDeniedSuburls($url, $url_deny)
54 * 798: function addQueueEntryForHook($cfgRec, $title)
55 *
56 * SECTION: Hook functions for TCEmain (indexing of records)
57 * 830: function processDatamap_afterDatabaseOperations($status, $table, $id, $fieldArray, &$pObj)
58 *
59 *
60 * 879: class tx_indexedsearch_files
61 * 888: function crawler_execute($params,&$pObj)
62 * 913: function loadIndexerClass()
63 *
64 * TOTAL FUNCTIONS: 18
65 * (This index is automatically created/updated by the extension "extdeveval")
66 *
67 */
68
69
70
71
// indexSingleRecord() reads $GLOBALS['LANG']->charSet, so make sure a language
// object exists before any of the classes below are used:
if (is_object($GLOBALS['LANG']) === FALSE) {
	$GLOBALS['LANG'] = t3lib_div::makeInstance('language');
	$GLOBALS['LANG']->init($GLOBALS['BE_USER']->uc['lang']);
}
77
78
79 /**
80 * Crawler hook for indexed search. Works with the "crawler" extension
81 *
82 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
83 * @package TYPO3
84 * @subpackage tx_indexedsearch
85 */
86 class tx_indexedsearch_crawler {
87
88 // Static:
89 var $secondsPerExternalUrl = 3; // Number of seconds to use as interval between queued indexing operations of URLs / files (types 2 & 3)
90
91 // Internal, dynamic:
92 var $instanceCounter = 0; // Counts up for each added URL (type 3)
93
94 // Internal, static:
95 var $callBack = 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'; // The object reference to this class.
96
/**
 * Initialization of crawler hook.
 * Called for each instance of the crawler. Selects the indexing configurations
 * whose timer has expired and which are not running (set_id=0), marks each of
 * them as running by assigning a fresh set-ID and the next indexing time, and
 * puts a first entry into the crawler queue. Finally, finished configurations
 * are reset via cleanUpOldRunningConfigurations().
 *
 * @param	object		$pObj Parent object (tx_crawler lib)
 * @return	void
 */
function crawler_init(&$pObj) {

	// Select all indexing configurations which are waiting to be activated:
	$indexingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
		'*',
		'index_config',
		'hidden=0
			AND (starttime=0 OR starttime<=' . $GLOBALS['EXEC_TIME'] . ')
			AND timer_next_indexing<' . $GLOBALS['EXEC_TIME'] . '
			AND set_id=0
			' . t3lib_BEfunc::deleteClause('index_config')
	);

	foreach ($indexingConfigurations as $cfgRec) {

		// Generate a unique set-ID and find the next time this configuration is due:
		$setId = t3lib_div::md5int(microtime());
		$nextTime = $this->generateNextIndexingTime($cfgRec);

		// Start process by updating the index-config record:
		$field_array = array(
			'set_id' => $setId,
			'timer_next_indexing' => $nextTime,
			'session_data' => '',
		);
		$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_config', 'uid=' . intval($cfgRec['uid']), $field_array);

		// Parameters shared by all configuration types. Type specific keys are
		// appended below in the order url, depth.
		$params = array(
			'indexConfigUid' => $cfgRec['uid'],
			'procInstructions' => array('[Index Cfg UID#' . $cfgRec['uid'] . ']'),
		);
		$addToQueue = TRUE;

		switch ($cfgRec['type']) {
			case 1:	// Records
				$params['url'] = 'Records (start)';	// Just for show.
			break;
			case 2:	// Files
				$params['url'] = $cfgRec['filepath'];
				$params['depth'] = 0;
			break;
			case 3:	// External URL
				$params['url'] = $cfgRec['externalUrl'];
				$params['depth'] = 0;
			break;
			case 4:	// Page tree; the "url" is the uid of the root page
				$params['url'] = intval($cfgRec['alternative_source_pid']);
				$params['depth'] = 0;
			break;
			case 5:	// Meta configuration, nothing to do
				$addToQueue = FALSE;
			break;
			default:	// Custom type handled by a registered hook object, if any
				$addToQueue = FALSE;
				if ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']]) {
					$hookObj = &t3lib_div::getUserObj($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']]);

					if (is_object($hookObj)) {
						// initMessage() used to receive a variable that was never
						// assigned. Define it explicitly (same NULL value) so no
						// undefined-variable notice is raised.
						$message = NULL;
						$params['procInstructions'] = array('[Index Cfg UID#' . $cfgRec['uid'] . '/CUSTOM]');
						$params['url'] = $hookObj->initMessage($message);
						$addToQueue = TRUE;
					}
				}
			break;
		}

		if ($addToQueue) {
			$pObj->addQueueEntry_callBack($setId, $params, $this->callBack, $cfgRec['pid']);
		}
	}

	// Finally, look up all old index configurations which are finished and need to be reset.
	$this->cleanUpOldRunningConfigurations();
}
210
/**
 * Call back function for execution of a queue/log element.
 * Loads the indexing configuration named in $params, dispatches to the
 * handler for its type and stores the (possibly modified) session data back
 * into the configuration record.
 *
 * @param	array		$params Params from log element. Must contain $params['indexConfigUid']
 * @param	object		$pObj Parent object (tx_crawler lib)
 * @return	array		Result array with the parameters in key "log"
 */
function crawler_execute($params, &$pObj) {

	// Without a configuration uid there is nothing to do:
	if (!$params['indexConfigUid']) {
		return array('log' => $params);
	}

	// Load the indexing configuration record:
	$configurationRows = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
		'*',
		'index_config',
		'uid=' . intval($params['indexConfigUid'])
	);
	list($cfgRec) = $configurationRows;

	if (!is_array($cfgRec)) {
		return array('log' => $params);
	}

	// Unpack session data; the type handlers receive it by reference:
	$session_data = unserialize($cfgRec['session_data']);

	switch ($cfgRec['type']) {
		case 1:	// Records
			$this->crawler_execute_type1($cfgRec, $session_data, $params, $pObj);
		break;
		case 2:	// Files
			$this->crawler_execute_type2($cfgRec, $session_data, $params, $pObj);
		break;
		case 3:	// External URL
			$this->crawler_execute_type3($cfgRec, $session_data, $params, $pObj);
		break;
		case 4:	// Page tree
			$this->crawler_execute_type4($cfgRec, $session_data, $params, $pObj);
		break;
		case 5:	// Meta - should never be queued
		break;
		default:	// Custom type: delegate to the registered hook object
			if ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']]) {
				$hookObj = &t3lib_div::getUserObj($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']]);

				if (is_object($hookObj)) {
					// addQueueEntryForHook() reads the crawler object from here
					$this->pObj = &$pObj;
					$hookObj->indexOperation($cfgRec, $session_data, $params, $this);
				}
			}
		break;
	}

	// Save process data which might have been modified:
	$GLOBALS['TYPO3_DB']->exec_UPDATEquery(
		'index_config',
		'uid=' . intval($cfgRec['uid']),
		array('session_data' => serialize($session_data))
	);

	return array('log' => $params);
}
274
/**
 * Indexing records from a table.
 * Processes one batch of records with a uid above the one stored in the
 * session data and, if any were found, queues another call for the next batch.
 *
 * @param	array		$cfgRec Indexing configuration record
 * @param	array		$session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes are saved for the next call!
 * @param	array		$params Parameters from the log queue.
 * @param	object		$pObj Parent object (from "crawler" extension!)
 * @return	void
 */
function crawler_execute_type1($cfgRec, &$session_data, $params, &$pObj) {
	$table = $cfgRec['table2index'];
	if (!$table || !isset($GLOBALS['TCA'][$table])) {
		return;
	}

	// First call of a session: start from the lowest uid.
	if (!is_array($session_data)) {
		$session_data = array(
			'uid' => 0
		);
	}

	// Records are read from the alternative source page if one is set:
	$sourcePid = intval($cfgRec['alternative_source_pid']);
	if (!$sourcePid) {
		$sourcePid = $cfgRec['pid'];
	}
	$batchSize = $cfgRec['recordsbatch'] ? t3lib_div::intInRange($cfgRec['recordsbatch'], 1) : 100;

	// The root line always relates to the page of the configuration itself:
	$rootLine = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);

	$records = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
		'*',
		$table,
		'pid = ' . intval($sourcePid) . '
			AND uid > ' . intval($session_data['uid']) .
			t3lib_BEfunc::deleteClause($table) .
			t3lib_BEfunc::BEenableFields($table),
		'',
		'uid',
		$batchSize
	);

	if (!count($records)) {
		return;
	}

	foreach ($records as $record) {
		$this->indexSingleRecord($record, $cfgRec, $rootLine);

		// Remember the uid we processed last:
		$session_data['uid'] = $record['uid'];
	}

	// Queue the next batch, starting after the last processed uid:
	$nextParams = array(
		'indexConfigUid' => $cfgRec['uid'],
		'url' => 'Records from UID#' . ($session_data['uid'] + 1) . '-?',
		'procInstructions' => array('[Index Cfg UID#' . $cfgRec['uid'] . ']')
	);
	$pObj->addQueueEntry_callBack($cfgRec['set_id'], $nextParams, $this->callBack, $cfgRec['pid']);
}
335
/**
 * Indexing files from fileadmin.
 * If $params['url'] points to a file, it is indexed. If it points to a
 * directory, its files (and, while the configured depth is not reached, its
 * sub-directories) are added to the crawler queue.
 *
 * @param	array		$cfgRec Indexing configuration record
 * @param	array		$session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes are saved for the next call!
 * @param	array		$params Parameters from the log queue.
 * @param	object		$pObj Parent object (from "crawler" extension!)
 * @return	void
 */
function crawler_execute_type2($cfgRec, &$session_data, $params, &$pObj) {

	// Prepare path, making it absolute and checking it:
	$readpath = $params['url'];
	if (!t3lib_div::isAbsPath($readpath)) {
		$readpath = t3lib_div::getFileAbsFileName($readpath);
	}

	if (!t3lib_div::isAllowedAbsPath($readpath)) {
		return;
	}

	if (@is_file($readpath)) {	// If file, index it!

		// Get root line (need to provide this when indexing external files)
		$rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);

		// Load indexer if not yet.
		$this->loadIndexerClass();

		// (Re)-Indexing file on page.
		$indexerObj = &t3lib_div::makeInstance('tx_indexedsearch_indexer');
		$indexerObj->backend_initIndexer($cfgRec['pid'], 0, 0, '', $rl);
		$indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
		$indexerObj->hash['phash'] = -1;	// EXPERIMENT - but to avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)

		// Index document, addressed relative to PATH_site:
		$indexerObj->indexRegularDocument(substr($readpath, strlen(PATH_site)), TRUE);
		return;
	}

	if (!@is_dir($readpath)) {
		return;
	}

	// Directory: select files in path ...
	$extList = implode(',', t3lib_div::trimExplode(',', $cfgRec['extensions'], 1));
	$fileArr = array();
	$files = t3lib_div::getAllFilesAndFoldersInPath($fileArr, $readpath, $extList, 0, 0);

	// ... and the sub-directories, as long as we may descend further:
	$directoryList = t3lib_div::get_dirs($readpath);
	if (is_array($directoryList) && $params['depth'] < $cfgRec['depth']) {
		// The configured path is not guaranteed to end with a slash; without
		// one, "$readpath.$subdir" would glue both names together.
		$directoryPrefix = rtrim($readpath, '/') . '/';
		foreach ($directoryList as $subdir) {
			if ((string)$subdir != '') {
				$files[] = $directoryPrefix . $subdir . '/';
			}
		}
	}
	$files = t3lib_div::removePrefixPathFromList($files, PATH_site);

	// NOTE(review): removePrefixPathFromList() is believed to return an error
	// string instead of an array when an entry is not below the prefix - confirm.
	// Either way there is nothing to iterate in that case.
	if (!is_array($files)) {
		return;
	}

	// Traverse the items and create queue entries:
	foreach ($files as $path) {
		$this->instanceCounter++;
		if ($path !== $params['url']) {
			$nparams = array(
				'indexConfigUid' => $cfgRec['uid'],
				'url' => $path,
				'procInstructions' => array('[Index Cfg UID#' . $cfgRec['uid'] . ']'),
				'depth' => $params['depth'] + 1
			);
			// Entries are spread over time, secondsPerExternalUrl apart:
			$pObj->addQueueEntry_callBack(
				$cfgRec['set_id'],
				$nparams,
				$this->callBack,
				$cfgRec['pid'],
				$GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl
			);
		}
	}
}
410
/**
 * Indexing external URLs.
 * Indexes $params['url'] and, while the configured depth is not reached,
 * queues every link found on it that passes checkUrl() and is not denied by
 * checkDeniedSuburls().
 *
 * @param	array		$cfgRec Indexing configuration record
 * @param	array		$session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes are saved for the next call!
 * @param	array		$params Parameters from the log queue.
 * @param	object		$pObj Parent object (from "crawler" extension!)
 * @return	void
 */
function crawler_execute_type3($cfgRec, &$session_data, $params, &$pObj) {

	// First call of a session: the start URL is the first entry of the URL log.
	if (!is_array($session_data)) {
		$session_data = array(
			'urlLog' => array($params['url'])
		);
	}

	// Index the URL and collect the links found on it:
	$rootLine = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
	$linkedUrls = $this->indexExtUrl($params['url'], $cfgRec['pid'], $rootLine, $cfgRec['uid'], $cfgRec['set_id']);

	// Maximum depth reached, do not follow links any further:
	if (!($params['depth'] < $cfgRec['depth'])) {
		return;
	}

	foreach ($linkedUrls as $linkedUrl) {
		$acceptedUrl = $this->checkUrl($linkedUrl, $session_data['urlLog'], $cfgRec['externalUrl']);
		if (!$acceptedUrl) {
			continue;
		}
		if ($this->checkDeniedSuburls($acceptedUrl, $cfgRec['url_deny'])) {
			continue;
		}

		$this->instanceCounter++;
		$session_data['urlLog'][] = $acceptedUrl;

		$queueParams = array(
			'indexConfigUid' => $cfgRec['uid'],
			'url' => $acceptedUrl,
			'procInstructions' => array('[Index Cfg UID#' . $cfgRec['uid'] . ']'),
			'depth' => $params['depth'] + 1
		);
		// Entries are spread over time, secondsPerExternalUrl apart:
		$scheduledTime = $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl;
		$pObj->addQueueEntry_callBack(
			$cfgRec['set_id'],
			$queueParams,
			$this->callBack,
			$cfgRec['pid'],
			$scheduledTime
		);
	}
}
460
/**
 * Page tree indexing type.
 * Submits the URLs of the page whose uid is given in $params['url'] to the
 * crawler and, while the configured depth is not reached, queues one entry
 * per subpage.
 *
 * @param	array		$cfgRec Indexing configuration record
 * @param	array		$session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes are saved for the next call!
 * @param	array		$params Parameters from the log queue.
 * @param	object		$pObj Parent object (from "crawler" extension!)
 * @return	void
 */
function crawler_execute_type4($cfgRec, &$session_data, $params, &$pObj) {

	// Base page uid and the URL sets the crawler knows for that page:
	$pageUid = intval($params['url']);
	$pageRow = t3lib_BEfunc::getRecord('pages', $pageUid);
	$urlSets = $pObj->getUrlsForPageRow($pageRow);

	$duplicateTrack = array();	// Registry for duplicates
	$downloadUrls = array();	// Dummy.

	// Submit URLs; the returned list is not needed here.
	if (count($urlSets)) {
		foreach ($urlSets as $urlSet) {
			$pObj->urlListFromUrlArray(
				$urlSet,
				$pageRow,
				$GLOBALS['EXEC_TIME'],
				30,
				1,
				0,
				$duplicateTrack,
				$downloadUrls,
				array('tx_indexedsearch_reindex')
			);
		}
	}

	// Maximum depth reached, do not descend into subpages:
	if (!($params['depth'] < $cfgRec['depth'])) {
		return;
	}

	$subpages = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
		'uid,title',
		'pages',
		'pid = ' . intval($pageUid) .
			t3lib_BEfunc::deleteClause('pages')
	);
	if (!count($subpages)) {
		return;
	}

	// Traverse subpages and add them to the queue:
	foreach ($subpages as $subpage) {
		$this->instanceCounter++;
		$session_data['urlLog'][] = 'pages:' . $subpage['uid'] . ': ' . $subpage['title'];

		$queueParams = array(
			'indexConfigUid' => $cfgRec['uid'],
			'url' => $subpage['uid'],
			'procInstructions' => array('[Index Cfg UID#' . $cfgRec['uid'] . ']'),
			'depth' => $params['depth'] + 1
		);
		$scheduledTime = $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl;
		$pObj->addQueueEntry_callBack(
			$cfgRec['set_id'],
			$queueParams,
			$this->callBack,
			$cfgRec['pid'],
			$scheduledTime
		);
	}
}
535
/**
 * Look up all running index configurations (set_id!=0) which have no pending
 * entries left in the crawler queue. For those, the index entries written by
 * earlier runs of the same configuration are removed and the configuration is
 * reset (set_id=0) so it can be started again.
 *
 * @return	void
 */
function cleanUpOldRunningConfigurations() {

	// Tables holding data per phash (list copied from class.tx_indexedsearch_modfunc1.php)
	$indexTables = array('index_phash', 'index_rel', 'index_section', 'index_grlist', 'index_fulltext', 'index_debug');

	// Lookup running index configurations:
	$runningIndexingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
		'uid,set_id',
		'index_config',
		'set_id!=0' . t3lib_BEfunc::deleteClause('index_config')
	);

	foreach ($runningIndexingConfigurations as $cfgRec) {

		// Count queue entries of this set which have not been executed yet:
		$queued_items = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows(
			'*',
			'tx_crawler_queue',
			'set_id=' . intval($cfgRec['set_id']) . ' AND exec_time=0'
		);
		if ($queued_items) {
			continue;	// Still processing
		}

		// Lookup phash rows of this configuration which were not written by the
		// set that just finished. set_id is cast like every other value that is
		// concatenated into SQL here.
		$oldPhashRows = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
			'phash',
			'index_phash',
			'freeIndexUid=' . intval($cfgRec['uid']) . ' AND freeIndexSetId!=' . intval($cfgRec['set_id'])
		);

		if (is_array($oldPhashRows) && count($oldPhashRows)) {
			$pHashesToDelete = array();
			foreach ($oldPhashRows as $pHashRow) {
				$pHashesToDelete[] = $pHashRow['phash'];
			}

			// One DELETE per table (same approach as deleteFromIndex())
			// instead of one per table and phash:
			$where_clause = 'phash IN (' . implode(',', $GLOBALS['TYPO3_DB']->cleanIntArray($pHashesToDelete)) . ')';
			foreach ($indexTables as $table) {
				$GLOBALS['TYPO3_DB']->exec_DELETEquery($table, $where_clause);
			}
		}

		// End process by updating index-config record:
		$field_array = array(
			'set_id' => 0,
			'session_data' => '',
		);
		$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_config', 'uid=' . intval($cfgRec['uid']), $field_array);
	}
}
586
587
588
589
590
591
592
593 /*****************************************
594 *
595 * Helper functions
596 *
597 *****************************************/
598
/**
 * Check if an input URL is allowed to be indexed.
 * A trailing double slash is reduced to a single one and everything from the
 * first "#" on is cut off before the URL is checked: it must not contain
 * "../", must start with the base URL and must not be in the URL log yet.
 *
 * @param	string		$url URL string to check
 * @param	array		$urlLog Array of already indexed URLs (input URL is looked up here and must not exist already)
 * @param	string		$baseUrl Base URL of the indexing process (input URL must be "inside" the base URL!)
 * @return	mixed		Returns the cleaned URL (string) if OK, otherwise FALSE
 */
function checkUrl($url, $urlLog, $baseUrl) {
	$url = preg_replace('/\/\/$/', '/', $url);
	list($url) = explode('#', $url);

	// Each rejection returns FALSE explicitly, as documented; previously the
	// function fell off its end and returned NULL.
	if (strpos($url, '../') !== FALSE) {
		return FALSE;
	}
	if (!t3lib_div::isFirstPartOfStr($url, $baseUrl)) {
		return FALSE;
	}
	if (in_array($url, $urlLog)) {
		return FALSE;
	}

	return $url;
}
619
/**
 * Indexing an external URL and collecting the links found in its content.
 * Links without a scheme are turned into absolute URLs on the host (and port)
 * of the indexed URL.
 *
 * @param	string		$url URL, http://....
 * @param	integer		$pageId Page id to relate indexing to.
 * @param	array		$rl Rootline array to relate indexing to
 * @param	integer		$cfgUid Configuration UID
 * @param	integer		$setId Set ID value
 * @return	array		URLs found on this page
 */
function indexExtUrl($url, $pageId, $rl, $cfgUid, $setId) {

	// Load indexer if not yet.
	$this->loadIndexerClass();

	// Index external URL:
	$indexerObj = &t3lib_div::makeInstance('tx_indexedsearch_indexer');
	$indexerObj->backend_initIndexer($pageId, 0, 0, '', $rl);
	$indexerObj->backend_setFreeIndexUid($cfgUid, $setId);
	$indexerObj->hash['phash'] = -1;	// To avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)

	$indexerObj->indexExternalUrl($url);
	$url_qParts = parse_url($url);

	// "scheme://host[:port]" of the indexed URL. The port must be kept,
	// otherwise links on a non-default port would leave the base URL.
	$siteUrl = $url_qParts['scheme'] . '://' . $url_qParts['host'];
	if (!empty($url_qParts['port'])) {
		$siteUrl .= ':' . $url_qParts['port'];
	}

	// Get URLs on this page:
	$subUrls = array();
	$list = $indexerObj->extractHyperLinks($indexerObj->indexExternalUrl_content);
	if (!is_array($list)) {
		return $subUrls;
	}

	// Traverse links:
	foreach ($list as $linkInfo) {

		// Decode entities:
		$subUrl = t3lib_div::htmlspecialchars_decode($linkInfo['href']);

		$qParts = parse_url($subUrl);
		if (!is_array($qParts) || empty($qParts['scheme'])) {
			if (substr($subUrl, 0, 2) === '//') {
				// Protocol-relative link: only the scheme is inherited.
				$subUrl = $url_qParts['scheme'] . ':' . $subUrl;
			} else {
				// Strip leading slashes so root-relative links ("/path") do not
				// end up as "host//path", which would never match the base URL.
				// NOTE(review): links relative to the directory of the indexed
				// document are still resolved against the host root, as before.
				$subUrl = $siteUrl . '/' . ltrim(t3lib_div::resolveBackPath($subUrl), '/');
			}
		}

		$subUrls[] = $subUrl;
	}

	return $subUrls;
}
664
/**
 * Indexing a single record.
 * The first field of the configured field list is used as title, all further
 * fields are concatenated (space separated) into the content. Tags are
 * stripped from both before indexing.
 *
 * @param	array		$r Record to index
 * @param	array		$cfgRec Configuration record
 * @param	array		$rl Rootline array to relate indexing to. If not an array, it is looked up for the page of the configuration.
 * @return	void
 */
function indexSingleRecord($r, $cfgRec, $rl = NULL) {

	// Load indexer if not yet.
	$this->loadIndexerClass();

	// Init:
	$rl = is_array($rl) ? $rl : $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
	$fieldList = t3lib_div::trimExplode(',', $cfgRec['fieldlist'], 1);
	$languageField = $GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']['languageField'];
	$sys_language_uid = $languageField ? $r[$languageField] : 0;

	// (Re)-Indexing a row from a table; ###UID### in the configured GET
	// parameters is replaced by the uid of the record:
	$indexerObj = &t3lib_div::makeInstance('tx_indexedsearch_indexer');
	parse_str(str_replace('###UID###', $r['uid'], $cfgRec['get_params']), $GETparams);
	$indexerObj->backend_initIndexer($cfgRec['pid'], 0, $sys_language_uid, '', $rl, $GETparams, $cfgRec['chashcalc'] ? TRUE : FALSE);
	$indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
	$indexerObj->forceIndexing = TRUE;

	// Both are initialised so that an empty field list does not leave the
	// title undefined:
	$theTitle = '';
	$theContent = '';
	foreach ($fieldList as $k => $v) {
		if (!$k) {
			$theTitle = $r[$v];
		} else {
			$theContent .= $r[$v] . ' ';
		}
	}

	// Indexing the record as a page (but with parameters set, see ->backend_setFreeIndexUid())
	$indexerObj->backend_indexAsTYPO3Page(
		strip_tags($theTitle),
		'',
		'',
		strip_tags($theContent),
		$GLOBALS['LANG']->charSet,	// Requires that $GLOBALS['LANG'] has been initialised
		$r[$GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']['tstamp']],
		$r[$GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']['crdate']],
		$r['uid']
	);
}
716
/**
 * Makes the indexer class available by including class.indexer.php from the
 * indexed_search extension (once).
 *
 * @return	void
 */
function loadIndexerClass() {
	// The included file is evaluated in the scope of this function, so the
	// global declaration makes $TYPO3_CONF_VARS visible to its file-level code
	// (presumably needed for its XCLASS check - confirm before removing).
	global $TYPO3_CONF_VARS;

	$extensionPath = t3lib_extMgm::extPath('indexed_search');
	require_once($extensionPath . 'class.indexer.php');
}
726
/**
 * Get the root line (as uids) for the closest TypoScript template root.
 * Algorithm same as used in Web > Template, Object browser.
 *
 * @param	integer		$id The page id to traverse the root line back from
 * @return	array		Page uids, keyed like the root line of the template object
 */
function getUidRootLineForClosestTemplate($id) {
	global $TYPO3_CONF_VARS;

	$templateObj = t3lib_div::makeInstance('t3lib_tsparser_ext');
	$templateObj->tt_track = 0;	// Do not log time-performance information
	$templateObj->init();

	// Fetch the full root line of the page and let the template object run
	// through it; afterwards its rootLine property is read out below.
	$pageSelectObj = t3lib_div::makeInstance('t3lib_pageSelect');
	$fullRootLine = $pageSelectObj->getRootLine($id);
	$templateObj->runThroughTemplates($fullRootLine, 0);

	// Reduce each root line entry to its uid, keeping the keys:
	$uidList = array();
	foreach ($templateObj->rootLine as $level => $pageRecord) {
		$uidList[$level] = $pageRecord['uid'];
	}

	return $uidList;
}
754
/**
 * Generate the unix time stamp for the next visit.
 * The result is the first point in time at or after the current execution time
 * which lies a whole number of "timer_frequency" intervals after a reference
 * midnight plus "timer_offset".
 *
 * @param	array		$cfgRec Index configuration record (timer_frequency, timer_offset, timer_next_indexing are read)
 * @return	integer		The next time stamp
 */
function generateNextIndexingTime($cfgRec) {
	$currentTime = $GLOBALS['EXEC_TIME'];

	// Find a midnight time to use for offset calculation. For frequencies up to
	// a day it does not matter which day is used, so yesterday's midnight is
	// taken. For longer frequencies the day of the previously scheduled time is
	// used, so that e.g. the weekday of a weekly schedule is respected
	// regardless of when the script actually runs.
	if ($cfgRec['timer_frequency'] <= 24 * 3600) {
		$aMidNight = mktime(0, 0, 0) - 1 * 24 * 3600;
	} else {
		$lastTime = $cfgRec['timer_next_indexing'] ? intval($cfgRec['timer_next_indexing']) : $currentTime;
		// Four-digit year ('Y'): a two-digit year only works through mktime()'s
		// legacy two-digit year mapping.
		$aMidNight = mktime(0, 0, 0, date('m', $lastTime), date('d', $lastTime), date('Y', $lastTime));
	}

	// Find last offset time plus frequency in seconds:
	$lastSureOffset = $aMidNight + t3lib_div::intInRange($cfgRec['timer_offset'], 0, 86400);
	$frequencySeconds = t3lib_div::intInRange($cfgRec['timer_frequency'], 1);

	// Find out how many blocks of the length of the frequency there are until the next time:
	$frequencyBlocksUntilNextTime = ceil(($currentTime - $lastSureOffset) / $frequencySeconds);

	// Next time is the offset plus the blocks multiplied with the frequency.
	// ceil() yields a float, so cast to return the documented integer.
	return intval($lastSureOffset + $frequencyBlocksUntilNextTime * $frequencySeconds);
}
784
/**
 * Checks if $url starts with any of the URLs in the $url_deny "list" and if
 * so, returns TRUE.
 *
 * @param	string		$url URL to test
 * @param	string		$url_deny String where URLs are separated by line-breaks; if any of these strings is the first part of $url, the function returns TRUE (to indicate denial of descent)
 * @return	boolean		TRUE if there is a matching URL (hence, do not index!)
 */
function checkDeniedSuburls($url, $url_deny) {
	if (!trim($url_deny)) {
		return FALSE;
	}

	$url_denyArray = t3lib_div::trimExplode(chr(10), $url_deny, 1);
	foreach ($url_denyArray as $testurl) {
		if (t3lib_div::isFirstPartOfStr($url, $testurl)) {
			// The debug "echo" of the URL and deny list that used to sit here
			// was removed; a check must not write to the output.
			return TRUE;
		}
	}

	return FALSE;
}
804
/**
 * Adding an entry to the crawler queue on behalf of a hook object.
 * Uses the crawler object stored in $this->pObj by crawler_execute().
 *
 * @param	array		$cfgRec Configuration record
 * @param	string		$title Title/URL
 * @return	void
 */
function addQueueEntryForHook($cfgRec, $title) {
	$configurationUid = $cfgRec['uid'];

	$queueParams = array(
		// This must ALWAYS be the cfgRec uid!
		'indexConfigUid' => $configurationUid,
		'url' => $title,
		// Just for information; shows that an indexing configuration added the entry.
		'procInstructions' => array('[Index Cfg UID#' . $configurationUid . ']')
	);

	$this->pObj->addQueueEntry_callBack(
		$cfgRec['set_id'],
		$queueParams,
		$this->callBack,
		$cfgRec['pid']
	);
}
821
/**
 * Deletes all data stored by indexed search for a given page.
 * The phash values are looked up in index_section by page id and then removed
 * from all index tables.
 *
 * @param	integer		$id Uid of the page to delete all phash rows for
 * @return	void
 */
function deleteFromIndex($id) {

	// Lookup phash rows registered for the page:
	$sectionRows = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('phash', 'index_section', 'page_id=' . intval($id));
	if (!count($sectionRows)) {
		return;
	}

	$pHashList = array();
	foreach ($sectionRows as $sectionRow) {
		$pHashList[] = $sectionRow['phash'];
	}

	// One DELETE per index table for the whole set of phash values:
	$deleteWhere = 'phash IN (' . implode(',', $GLOBALS['TYPO3_DB']->cleanIntArray($pHashList)) . ')';
	$indexTables = array('index_debug', 'index_fulltext', 'index_grlist', 'index_phash', 'index_rel', 'index_section');
	foreach ($indexTables as $indexTable) {
		$GLOBALS['TYPO3_DB']->exec_DELETEquery($indexTable, $deleteWhere);
	}
}
846
847
848
849
850
851
852
853 /*************************
854 *
855 * Hook functions for TCEmain (indexing of records)
856 *
857 *************************/
858
/**
 * TCEmain hook function called before a command is processed.
 * Removes the index data of a page when that page is about to be deleted.
 *
 * @param	string		$command TCEmain command
 * @param	string		$table Table name
 * @param	string		$id Record ID
 * @param	mixed		$value Target value (ignored)
 * @param	object		$pObj Reference to tcemain calling object
 * @return	void
 */
function processCmdmap_preProcess($command, $table, $id, $value, &$pObj) {
	$isPageDeletion = ($command == 'delete' && $table == 'pages');

	// Clean up the index
	if ($isPageDeletion) {
		$this->deleteFromIndex($id);
	}
}
876
/**
 * TCEmain hook function for on-the-fly indexing of database records.
 * Removes the index of a page that gets hidden or excluded from search, and
 * (re)indexes the saved record for every matching "records" indexing
 * configuration which has "records_indexonchange" set.
 *
 * @param	string		$status Status "new" or "update"
 * @param	string		$table Table name
 * @param	string		$id Record ID. If new record it is a string pointing to an index inside t3lib_tcemain::substNEWwithIDs
 * @param	array		$fieldArray Field array of updated fields in the operation
 * @param	object		$pObj Reference to tcemain calling object
 * @return	void
 */
function processDatamap_afterDatabaseOperations($status, $table, $id, $fieldArray, &$pObj) {

	// Nothing was actually updated:
	if (!count($fieldArray)) {
		return;
	}

	if ($status == 'new') {
		// Translate the placeholder of a new record into its real uid.
		$id = $pObj->substNEWwithIDs[$id];
	} elseif ($table == 'pages' && $status == 'update') {
		$becomesHidden = array_key_exists('hidden', $fieldArray) && $fieldArray['hidden'] == 1;
		$becomesUnsearchable = array_key_exists('no_search', $fieldArray) && $fieldArray['no_search'] == 1;

		// If the page should be hidden or not indexed after the update, delete the index for this page
		if ($becomesHidden || $becomesUnsearchable) {
			$this->deleteFromIndex($id);
		}
	}

	// Get the full record; without it there is nothing to index:
	$currentRecord = t3lib_BEfunc::getRecord($table, $id);
	if (!is_array($currentRecord)) {
		return;
	}

	// Select all (not running) indexing configurations of type "record" (1) which point to this table and are located on the same page as the record or point to the right source PID
	$recordPid = intval($currentRecord['pid']);
	$indexingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
		'*',
		'index_config',
		'hidden=0
			AND (starttime=0 OR starttime<=' . $GLOBALS['EXEC_TIME'] . ')
			AND set_id=0
			AND type=1
			AND table2index=' . $GLOBALS['TYPO3_DB']->fullQuoteStr($table, 'index_config') . '
			AND (
					(alternative_source_pid=0 AND pid=' . $recordPid . ')
					OR (alternative_source_pid=' . $recordPid . ')
				)
			AND records_indexonchange=1
			' . t3lib_BEfunc::deleteClause('index_config')
	);

	foreach ($indexingConfigurations as $indexingConfiguration) {
		$this->indexSingleRecord($currentRecord, $indexingConfiguration);
	}
}
929 }
930
931
/**
 * Crawler hook for indexed search. Works with the "crawler" extension.
 * This hook is specifically used to index external files found on pages through the crawler extension.
 *
 * @author	Kasper Skaarhoj <kasperYYYY@typo3.com>
 * @package TYPO3
 * @subpackage tx_indexedsearch
 * @see tx_indexedsearch_indexer::extractLinks()
 */
class tx_indexedsearch_files {

	/**
	 * Call back function for execution of a queue/log element.
	 * Indexes the document named in $params['document']; if $params['alturl']
	 * is set, that location is indexed instead and the document name and its
	 * lowercased file extension are handed to the indexer as well.
	 *
	 * @param	array		$params Params from log element. Nothing is indexed unless $params['conf'] is an array.
	 * @param	object		$pObj Parent object (tx_crawler lib)
	 * @return	array		Result array; nothing is returned if $params['conf'] is not an array
	 */
	function crawler_execute($params, &$pObj) {

		// Load indexer if not yet.
		$this->loadIndexerClass();

		if (!is_array($params['conf'])) {
			return;
		}

		// Initialize the indexer class with the configuration from the queue:
		$indexerObj = &t3lib_div::makeInstance('tx_indexedsearch_indexer');
		$indexerObj->conf = $params['conf'];
		$indexerObj->init();

		// Index document:
		if (!$params['alturl']) {
			$indexerObj->indexRegularDocument($params['document'], TRUE);
		} else {
			$documentInfo = pathinfo($params['document']);
			$extension = strtolower($documentInfo['extension']);
			$indexerObj->indexRegularDocument($params['alturl'], TRUE, $params['document'], $extension);
		}

		// Return OK:
		return array('content' => array());
	}

	/**
	 * Makes the indexer class available by including class.indexer.php from
	 * the indexed_search extension (once).
	 *
	 * @return	void
	 */
	function loadIndexerClass() {
		// The included file is evaluated in the scope of this function, so the
		// global declaration makes $TYPO3_CONF_VARS visible to its file-level
		// code (presumably needed for its XCLASS check - confirm before removing).
		global $TYPO3_CONF_VARS;

		$extensionPath = t3lib_extMgm::extPath('indexed_search');
		require_once($extensionPath . 'class.indexer.php');
	}
}
986
987
// Include an extending class (XCLASS) for this file if one is registered:
if (defined('TYPO3_MODE')) {
	if ($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.crawler.php']) {
		include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.crawler.php']);
	}
}
991
992 ?>