removed empty lines at the end of the files
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / class.crawler.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2001-2008 Kasper Skaarhoj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the TYPO3 project. The TYPO3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 * A copy is found in the textfile GPL.txt and important notices to the license
17 * from the author is found in LICENSE.txt distributed with these scripts.
18 *
19 *
20 * This script is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
24 *
25 * This copyright notice MUST APPEAR in all copies of the script!
26 ***************************************************************/
27 /**
28 * Crawler hook for indexed search. Works with the "crawler" extension
29 *
30 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
31 */
32 /**
33 * [CLASS/FUNCTION INDEX of SCRIPT]
34 *
35 *
36 *
37 * 87: class tx_indexedsearch_crawler
38 * 106: function crawler_init(&$pObj)
39 * 219: function crawler_execute($params,&$pObj)
40 * 285: function crawler_execute_type1($cfgRec,&$session_data,$params,&$pObj)
41 * 345: function crawler_execute_type2($cfgRec,&$session_data,$params,&$pObj)
42 * 414: function crawler_execute_type3($cfgRec,&$session_data,$params,&$pObj)
43 * 458: function crawler_execute_type4($cfgRec,&$session_data,$params,&$pObj)
44 * 513: function cleanUpOldRunningConfigurations()
45 *
46 * SECTION: Helper functions
47 * 579: function checkUrl($url,$urlLog,$baseUrl)
48 * 602: function indexExtUrl($url, $pageId, $rl, $cfgUid, $setId)
49 * 645: function indexSingleRecord($r,$cfgRec,$rl=NULL)
50 * 694: function loadIndexerClass()
51 * 706: function getUidRootLineForClosestTemplate($id)
52 * 739: function generateNextIndexingTime($cfgRec)
53 * 778: function checkDeniedSuburls($url, $url_deny)
54 * 798: function addQueueEntryForHook($cfgRec, $title)
55 *
56 * SECTION: Hook functions for TCEmain (indexing of records)
57 * 830: function processDatamap_afterDatabaseOperations($status, $table, $id, $fieldArray, &$pObj)
58 *
59 *
60 * 879: class tx_indexedsearch_files
61 * 888: function crawler_execute($params,&$pObj)
62 * 913: function loadIndexerClass()
63 *
64 * TOTAL FUNCTIONS: 18
65 * (This index is automatically created/updated by the extension "extdeveval")
66 *
67 */
68
69
70
71
// The record indexer reads the charset from $GLOBALS['LANG'], so make sure a language object exists:
require_once(PATH_typo3.'sysext/lang/lang.php');
if (is_object($GLOBALS['LANG']) === FALSE) {
	$GLOBALS['LANG'] = t3lib_div::makeInstance('language');
	$GLOBALS['LANG']->init($GLOBALS['BE_USER']->uc['lang']);
}
78
79
80 /**
81 * Crawler hook for indexed search. Works with the "crawler" extension
82 *
83 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
84 * @package TYPO3
85 * @subpackage tx_indexedsearch
86 */
87 class tx_indexedsearch_crawler {
88
/**
 * Static: Number of seconds used as interval between the scheduled execution times of queue entries added by the file, external URL and page tree handlers.
 */
var $secondsPerExternalUrl = 3;

/**
 * Internal, dynamic: Counter which is incremented by the file, external URL and page tree handlers while they add queue entries; multiplied with $secondsPerExternalUrl to spread the entries over time.
 */
var $instanceCounter = 0;

/**
 * Internal, static: The object reference to this class, handed to the crawler as call back for every queue entry.
 */
var $callBack = 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler';
97
/**
 * Initialization of crawler hook.
 * Called for each instance of the crawler: selects the indexing configurations which are due, marks each of them as running (new set-ID, next indexing time, emptied session data) and adds a first entry for it to the crawler queue.
 * Finally, running configurations without pending queue entries are reset.
 *
 * @param	object		Parent object (tx_crawler lib)
 * @return	void
 */
function crawler_init(&$pObj){

		// All indexing configurations which are visible, started, due and not running already:
	$dueConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
		'*',
		'index_config',
		'hidden=0
AND (starttime=0 OR starttime<='.time().')
AND timer_next_indexing<'.time().'
AND set_id=0
'.t3lib_BEfunc::deleteClause('index_config')
	);

	foreach ($dueConfigurations as $cfgRec) {

			// A unique set-ID and the next point in time for this configuration:
		$setId = t3lib_div::md5int(microtime());
		$nextTime = $this->generateNextIndexingTime($cfgRec);

			// Mark the configuration as running:
		$GLOBALS['TYPO3_DB']->exec_UPDATEquery(
			'index_config',
			'uid='.intval($cfgRec['uid']),
			array(
				'set_id' => $setId,
				'timer_next_indexing' => $nextTime,
				'session_data' => '',
			)
		);

			// Build the parameters of the first queue entry, depending on configuration type:
		$cfgType = $cfgRec['type'];
		$params = NULL;

		if ($cfgType == 1) {	// Records
			$params = array(
				'indexConfigUid' => $cfgRec['uid'],
				'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),
				'url' => 'Records (start)',	// Just for show.
			);
		} elseif ($cfgType == 2 || $cfgType == 3 || $cfgType == 4) {	// Files, external URL, page tree

				// The start point differs per type, the rest of the parameters is shared:
			if ($cfgType == 2) {
				$startPoint = $cfgRec['filepath'];
			} elseif ($cfgType == 3) {
				$startPoint = $cfgRec['externalUrl'];
			} else {
				$startPoint = intval($cfgRec['alternative_source_pid']);
			}

			$params = array(
				'indexConfigUid' => $cfgRec['uid'],
				'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),
				'url' => $startPoint,
				'depth' => 0
			);
		} elseif ($cfgType == 5) {	// Meta configuration
			# NOOP
		} else {	// Custom types, handled by a registered hook object
			if ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']]) {
				$hookObj = &t3lib_div::getUserObj($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']]);

				if (is_object($hookObj)) {
					$params = array(
						'indexConfigUid' => $cfgRec['uid'],
						'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].'/CUSTOM]'),
							// NOTE(review): $message is not defined in this method; presumably the hook takes it by reference - confirm.
						'url' => $hookObj->initMessage($message),
					);
				}
			}
		}

			// Queue the entry, if the type produced one:
		if (is_array($params)) {
			$pObj->addQueueEntry_callBack($setId,$params,$this->callBack,$cfgRec['pid']);
		}
	}

		// Finally, reset configurations which are marked as running but have nothing left in the queue:
	$this->cleanUpOldRunningConfigurations();
}
211
/**
 * Call back function for execution of a log element.
 * Loads the indexing configuration referred to by the parameters, hands over to the handler of the configuration type and writes the (possibly modified) session data back to the configuration record.
 *
 * @param	array		Params from log element. Must contain $params['indexConfigUid']
 * @param	object		Parent object (tx_crawler lib)
 * @return	array		Result array; always the input parameters under the key "log"
 */
function crawler_execute($params,&$pObj) {
	$result = array('log' => $params);

		// Without an indexing configuration ID there is nothing to do:
	if (!$params['indexConfigUid']) {
		return $result;
	}

		// Load the indexing configuration record:
	list($cfgRec) = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
		'*',
		'index_config',
		'uid='.intval($params['indexConfigUid'])
	);
	if (!is_array($cfgRec)) {
		return $result;
	}

		// Unpack session data; the handlers receive it by reference:
	$session_data = unserialize($cfgRec['session_data']);

		// Hand over to the handler for the configuration type:
	$cfgType = $cfgRec['type'];
	if ($cfgType == 1) {	// Records
		$this->crawler_execute_type1($cfgRec,$session_data,$params,$pObj);
	} elseif ($cfgType == 2) {	// Files
		$this->crawler_execute_type2($cfgRec,$session_data,$params,$pObj);
	} elseif ($cfgType == 3) {	// External URL
		$this->crawler_execute_type3($cfgRec,$session_data,$params,$pObj);
	} elseif ($cfgType == 4) {	// Page tree
		$this->crawler_execute_type4($cfgRec,$session_data,$params,$pObj);
	} elseif ($cfgType == 5) {	// Meta
		# NOOP (should never enter here!)
	} else {	// Custom types, handled by a registered hook object
		if ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']]) {
			$hookObj = &t3lib_div::getUserObj($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']]);

			if (is_object($hookObj)) {
				$this->pObj = &$pObj;	// For addQueueEntryForHook()
				$hookObj->indexOperation($cfgRec,$session_data,$params,$this);
			}
		}
	}

		// Save the session data, which the handler might have modified:
	$GLOBALS['TYPO3_DB']->exec_UPDATEquery(
		'index_config',
		'uid='.intval($cfgRec['uid']),
		array(
			'session_data' => serialize($session_data)
		)
	);

	return $result;
}
275
/**
 * Indexing records from a table.
 * Indexes one batch of records with a uid above the last processed one and, if the batch was not empty, queues an entry for the next batch.
 *
 * @param	array		Indexing Configuration Record
 * @param	array		Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param	array		Parameters from the log queue.
 * @param	object		Parent object (from "crawler" extension!)
 * @return	void
 */
function crawler_execute_type1($cfgRec,&$session_data,$params,&$pObj) {

		// The table must be set and configured in TCA:
	$table = $cfgRec['table2index'];
	if (!($table && isset($GLOBALS['TCA'][$table]))) {
		return;
	}

		// First call of the session: start from the lowest uid.
	if (!is_array($session_data)) {
		$session_data = array(
			'uid' => 0
		);
	}

		// Source page (alternative source pid wins over the page of the configuration) and batch size:
	$sourcePid = intval($cfgRec['alternative_source_pid']);
	if (!$sourcePid) {
		$sourcePid = $cfgRec['pid'];
	}
	$batchSize = 100;
	if ($cfgRec['recordsbatch']) {
		$batchSize = t3lib_div::intInRange($cfgRec['recordsbatch'],1);
	}

		// Root line of the page where the configuration is stored:
	$rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);

		// Select the next batch, ordered by uid:
	$batch = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
		'*',
		$table,
		'pid = '.intval($sourcePid).'
AND uid > '.intval($session_data['uid']).
			t3lib_BEfunc::deleteClause($table).
			t3lib_BEfunc::BEenableFields($table),
		'',
		'uid',
		$batchSize
	);

	if (!count($batch)) {
		return;
	}

		// Index each record and remember the uid processed last:
	foreach ($batch as $row) {
		$this->indexSingleRecord($row,$cfgRec,$rl);
		$session_data['uid'] = $row['uid'];
	}

		// Queue the next batch:
	$nparams = array(
		'indexConfigUid' => $cfgRec['uid'],
		'url' => 'Records from UID#'.($session_data['uid']+1).'-?',
		'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']')
	);
	$pObj->addQueueEntry_callBack($cfgRec['set_id'],$nparams,$this->callBack,$cfgRec['pid']);
}
336
/**
 * Indexing files from fileadmin.
 * If the queue entry points to a file, the file is indexed. If it points to a directory, a queue entry is added for each matching file in it and, as long as the configured depth is not reached, for each sub-directory.
 *
 * @param	array		Indexing Configuration Record
 * @param	array		Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param	array		Parameters from the log queue.
 * @param	object		Parent object (from "crawler" extension!)
 * @return	void
 */
function crawler_execute_type2($cfgRec,&$session_data,$params,&$pObj) {

		// Prepare path, making it absolute and checking:
	$readpath = $params['url'];
	if (!t3lib_div::isAbsPath($readpath)) {
		$readpath = t3lib_div::getFileAbsFileName($readpath);
	}

	if (!t3lib_div::isAllowedAbsPath($readpath)) {
		return;
	}

	if (@is_file($readpath)) {	// If file, index it!

			// Get root line (need to provide this when indexing external files)
		$rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);

			// Load indexer if not yet.
		$this->loadIndexerClass();

			// (Re)-Indexing file on page.
		$indexerObj = &t3lib_div::makeInstance('tx_indexedsearch_indexer');
		$indexerObj->backend_initIndexer($cfgRec['pid'], 0, 0, '', $rl);
		$indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
		$indexerObj->hash['phash'] = -1;	// EXPERIMENT - but to avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)

			// Index document (path relative to PATH_site):
		$indexerObj->indexRegularDocument(substr($readpath,strlen(PATH_site)), TRUE);
	} elseif (@is_dir($readpath)) {	// If dir, read content and create new pending items for log:

			// Select files in path, limited to the configured extensions:
		$extList = implode(',',t3lib_div::trimExplode(',',$cfgRec['extensions'],1));
		$fileArr = array();
		$files = t3lib_div::getAllFilesAndFoldersInPath($fileArr,$readpath,$extList,0,0);

			// Add sub-directories unless the configured depth is reached.
			// The directory path is normalized to end with exactly one slash; otherwise a configured path without trailing slash would be glued directly to the sub-directory name.
		$directoryList = t3lib_div::get_dirs($readpath);
		if (is_array($directoryList) && $params['depth'] < $cfgRec['depth']) {
			$directoryPath = rtrim($readpath,'/').'/';
			foreach ($directoryList as $subdir) {
				if ((string)$subdir!='') {
					$files[]= $directoryPath.$subdir.'/';
				}
			}
		}
		$files = t3lib_div::removePrefixPathFromList($files,PATH_site);

			// Traverse the items and create log entries, each scheduled a bit later than the previous one:
		foreach($files as $path) {
			$this->instanceCounter++;
			if ($path!==$params['url']) {
				$nparams = array(
					'indexConfigUid' => $cfgRec['uid'],
					'url' => $path,
					'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),
					'depth' => $params['depth']+1
				);
				$pObj->addQueueEntry_callBack($cfgRec['set_id'],$nparams,$this->callBack,$cfgRec['pid'],time()+$this->instanceCounter*$this->secondsPerExternalUrl);
			}
		}
	}
}
405
/**
 * Indexing External URLs.
 * Indexes the URL of the queue entry and, as long as the configured depth is not reached, queues the links found on it which pass checkUrl() and are not denied by the "url_deny" list.
 *
 * @param	array		Indexing Configuration Record
 * @param	array		Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param	array		Parameters from the log queue.
 * @param	object		Parent object (from "crawler" extension!)
 * @return	void
 */
function crawler_execute_type3($cfgRec,&$session_data,$params,&$pObj) {

		// First call of the session: the URL log starts with the URL of this entry.
	if (!is_array($session_data)) {
		$session_data = array(
			'urlLog' => array($params['url'])
		);
	}

		// Index the URL; the links found on it are returned:
	$rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
	$subUrls = $this->indexExtUrl($params['url'], $cfgRec['pid'], $rl, $cfgRec['uid'], $cfgRec['set_id']);

		// Stop descending when the configured depth is reached:
	if (!($params['depth'] < $cfgRec['depth'])) {
		return;
	}

	foreach ($subUrls as $candidate) {

			// Skip URLs which are outside the base URL or logged already ...
		$url = $this->checkUrl($candidate,$session_data['urlLog'],$cfgRec['externalUrl']);
		if (!$url) {
			continue;
		}

			// ... and URLs which are explicitly denied:
		if ($this->checkDeniedSuburls($url, $cfgRec['url_deny'])) {
			continue;
		}

		$this->instanceCounter++;
		$session_data['urlLog'][] = $url;

			// Queue the URL, scheduled a bit later than the previous one:
		$nparams = array(
			'indexConfigUid' => $cfgRec['uid'],
			'url' => $url,
			'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),
			'depth' => $params['depth']+1
		);
		$pObj->addQueueEntry_callBack($cfgRec['set_id'],$nparams,$this->callBack,$cfgRec['pid'],time()+$this->instanceCounter*$this->secondsPerExternalUrl);
	}
}
449
/**
 * Page tree indexing type.
 * Submits the URLs of the page given in the queue entry through the parent object and, as long as the configured depth is not reached, queues an entry for each subpage.
 *
 * @param	array		Indexing Configuration Record
 * @param	array		Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param	array		Parameters from the log queue.
 * @param	object		Parent object (from "crawler" extension!)
 * @return	void
 */
function crawler_execute_type4($cfgRec,&$session_data,$params,&$pObj) {

		// The page to process; its uid is carried in the "url" parameter:
	$pageUid = intval($params['url']);
	$pageRow = t3lib_BEfunc::getRecord('pages',$pageUid);

		// Let the parent object compile the URLs of the page:
	$urlSets = $pObj->getUrlsForPageRow($pageRow);

	$duplicateTrack = array();	// Registry for duplicates
	$downloadUrls = array();	// Dummy.

		// Submit URLs (the return value of the parent object's method is not needed here):
	if (count($urlSets)) {
		foreach ($urlSets as $urlSet) {
			$pObj->urlListFromUrlArray($urlSet,$pageRow,time(),30,1,0,$duplicateTrack,$downloadUrls,array('tx_indexedsearch_reindex'));
		}
	}

		// Stop descending when the configured depth is reached:
	if (!($params['depth'] < $cfgRec['depth'])) {
		return;
	}

		// Select the subpages:
	$subpages = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
		'uid,title',
		'pages',
		'pid = '.intval($pageUid).
			t3lib_BEfunc::deleteClause('pages')
	);
	if (!count($subpages)) {
		return;
	}

		// Log and queue each subpage, scheduled a bit later than the previous one:
	foreach ($subpages as $subpage) {
		$this->instanceCounter++;
		$session_data['urlLog'][] = 'pages:'.$subpage['uid'].': '.$subpage['title'];

		$nparams = array(
			'indexConfigUid' => $cfgRec['uid'],
			'url' => $subpage['uid'],
			'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),
			'depth' => $params['depth']+1
		);
		$pObj->addQueueEntry_callBack($cfgRec['set_id'],$nparams,$this->callBack,$cfgRec['pid'],time()+$this->instanceCounter*$this->secondsPerExternalUrl);
	}
}
508
/**
 * Look up all old index configurations which are finished and needs to be reset and done.
 * A configuration counts as finished when it has a set-ID but no queue entries with exec_time=0 are left for that set. For such a configuration the index entries which belong to it but stem from another set are deleted, and its set-ID and session data are cleared.
 *
 * @return	void
 */
function cleanUpOldRunningConfigurations() {

		// Tables from which old registrations are removed (list copied from class.tx_indexedsearch_modfunc1.php)
	$indexTables = array('index_phash','index_rel','index_section','index_grlist','index_fulltext','index_debug');

		// Lookup running index configurations:
	$runningIndexingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
		'uid,set_id',
		'index_config',
		'set_id!=0'.t3lib_BEfunc::deleteClause('index_config')
	);

	foreach($runningIndexingConfigurations as $cfgRec) {

			// Count the queue entries of the set which are still waiting for execution:
		list($queued_items) = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
			'count(*) AS count',
			'tx_crawler_queue',
			'set_id='.intval($cfgRec['set_id']).' AND exec_time=0'
		);

			// Entries left means the process has not ended yet:
		if ($queued_items['count']) {
			continue;
		}

			// Lookup old phash rows, i.e. rows of this configuration written by another set.
			// The set-ID is cast to integer like every other value put into a query here.
		$oldPhashRows = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
			'phash',
			'index_phash',
			'freeIndexUid='.intval($cfgRec['uid']).' AND freeIndexSetId!='.intval($cfgRec['set_id'])
		);

			// Remove the old registrations from all index tables:
		foreach($oldPhashRows as $pHashRow) {
			foreach($indexTables as $table) {
				$GLOBALS['TYPO3_DB']->exec_DELETEquery($table, 'phash='.intval($pHashRow['phash']));
			}
		}

			// End process by updating index-config record:
		$field_array = array (
			'set_id' => 0,
			'session_data' => '',
		);
		$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_config','uid='.intval($cfgRec['uid']), $field_array);
	}
}
559
560
561
562
563
564
565
566 /*****************************************
567 *
568 * Helper functions
569 *
570 *****************************************/
571
/**
 * Check if an input URL is allowed to be indexed.
 * The URL is normalized first (a double slash at the very end is reduced to one, a fragment after "#" is cut off). It is accepted if it contains no "../", begins with the base URL and is not present in the URL log yet.
 *
 * @param	string		URL string to check
 * @param	array		Array of already indexed URLs (input url is looked up here and must not exist already)
 * @param	string		Base URL of the indexing process (input URL must be "inside" the base URL!)
 * @return	string		Returns the normalized URL if OK, otherwise FALSE
 */
function checkUrl($url,$urlLog,$baseUrl) {

		// preg_replace() instead of ereg_replace(), which is deprecated since PHP 5.3 and gone in PHP 7.
		// The "D" modifier makes "$" match at the very end only, like the POSIX expression did.
	$url = preg_replace('#//$#D','/',$url);
	list($url) = explode('#',$url);

	if (strpos($url,'../') === FALSE
			&& t3lib_div::isFirstPartOfStr($url,$baseUrl)
			&& !in_array($url,$urlLog)) {
		return $url;
	}

	return FALSE;
}
592
/**
 * Indexing External URL.
 * Indexes the URL and returns the links found in its content. Links without a scheme are made absolute by prefixing scheme, host and (if given) port of the indexed URL.
 *
 * @param	string		URL, http://....
 * @param	integer		Page id to relate indexing to.
 * @param	array		Rootline array to relate indexing to
 * @param	integer		Configuration UID
 * @param	integer		Set ID value
 * @return	array		URLs found on this page
 */
function indexExtUrl($url, $pageId, $rl, $cfgUid, $setId) {

		// Load indexer if not yet.
	$this->loadIndexerClass();

		// Index external URL:
	$indexerObj = &t3lib_div::makeInstance('tx_indexedsearch_indexer');
	$indexerObj->backend_initIndexer($pageId, 0, 0, '', $rl);
	$indexerObj->backend_setFreeIndexUid($cfgUid, $setId);
	$indexerObj->hash['phash'] = -1;	// To avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)

	$indexerObj->indexExternalUrl($url);

		// Prefix for links without scheme. parse_url() returns FALSE for seriously malformed URLs and omits absent components, so each part is checked.
		// The port is included; without it links on a non-default port would point to another server.
	$url_qParts = parse_url($url);
	if (!is_array($url_qParts)) {
		$url_qParts = array();
	}
	$baseAbsoluteHref = (isset($url_qParts['scheme']) ? $url_qParts['scheme'] : '').
		'://'.
		(isset($url_qParts['host']) ? $url_qParts['host'] : '');
	if (!empty($url_qParts['port'])) {
		$baseAbsoluteHref.= ':'.$url_qParts['port'];
	}

		// Get URLs on this page:
	$subUrls = array();
	$list = $indexerObj->extractHyperLinks($indexerObj->indexExternalUrl_content);
	if (!is_array($list)) {
		return $subUrls;
	}

		// Traverse links:
	foreach ($list as $linkInfo) {

			// Decode entities:
		$subUrl = t3lib_div::htmlspecialchars_decode($linkInfo['href']);

		$qParts = parse_url($subUrl);
		if (!is_array($qParts) || empty($qParts['scheme'])) {
				// Leading slashes are removed before joining, so that a root-relative link like "/dir/page.html" does not end up with a double slash after the host.
				// NOTE(review): relative links are resolved against the host root, not against the directory of the indexed URL - confirm whether that is intended.
			$subUrl = $baseAbsoluteHref.'/'.ltrim(t3lib_div::resolveBackPath($subUrl), '/');
		}

		$subUrls[] = $subUrl;
	}

	return $subUrls;
}
637
/**
 * Indexing Single Record.
 * The first field of the configured field list is used as title, all following fields are concatenated as content. The record is then indexed as a page with the GET parameters of the configuration ("###UID###" replaced by the uid of the record).
 *
 * @param	array		Record to index
 * @param	array		Configuration Record
 * @param	array		Rootline array to relate indexing to
 * @return	void
 */
function indexSingleRecord($r,$cfgRec,$rl=NULL) {

		// Load indexer if not yet.
	$this->loadIndexerClass();

		// Init:
	$rl = is_array($rl) ? $rl : $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
	$fieldList = t3lib_div::trimExplode(',',$cfgRec['fieldlist'],1);
	$languageField = $GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']['languageField'];
	$sys_language_uid = $languageField ? $r[$languageField] : 0;

		// (Re)-Indexing a row from a table:
	$indexerObj = &t3lib_div::makeInstance('tx_indexedsearch_indexer');
	parse_str(str_replace('###UID###',$r['uid'],$cfgRec['get_params']),$GETparams);
	$indexerObj->backend_initIndexer($cfgRec['pid'], 0, $sys_language_uid, '', $rl, $GETparams, $cfgRec['chashcalc'] ? TRUE : FALSE);
	$indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
	$indexerObj->forceIndexing = TRUE;

		// Compile title and content. Both start as empty strings, so the title is defined even if the field list is empty.
	$theTitle = '';
	$theContent = '';
	foreach($fieldList as $k => $v) {
		if (!$k) {
			$theTitle = $r[$v];
		} else {
			$theContent.= $r[$v].' ';
		}
	}

		// Indexing the record as a page (but with parameters set, see ->backend_setFreeIndexUid())
	$indexerObj->backend_indexAsTYPO3Page(
		strip_tags($theTitle),
		'',
		'',
		strip_tags($theContent),
		$GLOBALS['LANG']->charSet,	// Requires that $GLOBALS['LANG'] is initialized
		$r[$GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']['tstamp']],
		$r[$GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']['crdate']],
		$r['uid']
	);
}
689
/**
 * Include indexer class (class.indexer.php of the "indexed_search" extension), once.
 *
 * @return	void
 */
function loadIndexerClass() {
		// The file is included in the scope of this function, so the global is declared here.
		// NOTE(review): presumably the included file reads $TYPO3_CONF_VARS at its end (XCLASS check) - confirm before removing.
	global $TYPO3_CONF_VARS;

	require_once(t3lib_extMgm::extPath('indexed_search').'class.indexer.php');
}
699
/**
 * Get rootline for closest TypoScript template root.
 * Algorithm same as used in Web > Template, Object browser
 *
 * @param	integer		The page id to traverse rootline back from
 * @return	array		Array where the root lines uid values are found.
 */
function getUidRootLineForClosestTemplate($id) {
		// The files below are included in the scope of this function, so the global is declared here.
	global $TYPO3_CONF_VARS;

	require_once(PATH_t3lib.'class.t3lib_page.php');
	require_once(PATH_t3lib.'class.t3lib_tstemplate.php');
	require_once(PATH_t3lib.'class.t3lib_tsparser_ext.php');

		// Template parser:
	$templateParser = t3lib_div::makeInstance('t3lib_tsparser_ext');
	$templateParser->tt_track = 0;	// Do not log time-performance information
	$templateParser->init();

		// Get the rootline of the page and run the templates found on it:
	$pageSelect = t3lib_div::makeInstance('t3lib_pageSelect');
	$pageRootLine = $pageSelect->getRootLine($id);
	$templateParser->runThroughTemplates($pageRootLine,0);	// This generates the constants/config + hierarchy info for the template.

		// Collect the uids of the rootline as set in the template parser, keeping the keys:
	$uidList = array();
	foreach ($templateParser->rootLine as $level => $pageRecord) {
		$uidList[$level] = $pageRecord['uid'];
	}

	return $uidList;
}
733
/**
 * Generate the unix time stamp for next visit.
 * The result is the offset time (a midnight plus "timer_offset" seconds) plus as many whole frequency blocks as are needed to reach the current time.
 *
 * @param	array		Index configuration record
 * @return	integer		The next time stamp
 */
function generateNextIndexingTime($cfgRec) {
	$currentTime = time();

		// Now, find a midnight time to use for offset calculation. This has to differ depending on whether we have frequencies within a day or more than a day; Less than a day, we don't care which day to use for offset, more than a day we want to respect the currently entered day as offset regardless of when the script is run - thus the day-of-week used in case "Weekly" is selected will be respected
	if ($cfgRec['timer_frequency']<=24*3600) {
		$aMidNight = mktime(0,0,0)-1*24*3600;
	} else {
		$lastTime = $cfgRec['timer_next_indexing'] ? $cfgRec['timer_next_indexing'] : $currentTime;
			// Full four-digit year ("Y"): a two-digit year is only mapped into a window of years by mktime().
		$aMidNight = mktime(0,0,0, intval(date('m',$lastTime)), intval(date('d',$lastTime)), intval(date('Y',$lastTime)));
	}

		// Find last offset time plus frequency in seconds:
	$lastSureOffset = $aMidNight+t3lib_div::intInRange($cfgRec['timer_offset'],0,86400);
	$frequencySeconds = t3lib_div::intInRange($cfgRec['timer_frequency'],1);

		// Now, find out how many blocks of the length of frequency there is until the next time:
	$frequencyBlocksUntilNextTime = ceil(($currentTime-$lastSureOffset)/$frequencySeconds);

		// Set next time to the offset + the frequencyblocks multiplied with the frequency length in seconds.
	$nextTime = $lastSureOffset + $frequencyBlocksUntilNextTime*$frequencySeconds;

	return $nextTime;
}
763
/**
 * Checks if $url begins with any of the URLs in the $url_deny "list" and if so, returns true.
 * Nothing is printed; the debug output formerly emitted for each denied URL has been removed.
 *
 * @param	string		URL to test
 * @param	string		String where URLs are separated by line-breaks; If any of these strings is the first part of $url, the function returns TRUE (to indicate denial of descend)
 * @return	boolean		TRUE if there is a matching URL (hence, do not index!)
 */
function checkDeniedSuburls($url, $url_deny) {
	if (trim($url_deny)) {
		$url_denyArray = t3lib_div::trimExplode(chr(10),$url_deny,1);
		foreach($url_denyArray as $testurl) {
			if (t3lib_div::isFirstPartOfStr($url,$testurl)) {
				return TRUE;
			}
		}
	}
	return FALSE;
}
783
/**
 * Adding entry in queue for Hook.
 * Uses the parent object stored in $this->pObj, which crawler_execute() sets before it calls the hook object.
 *
 * @param	array		Configuration record
 * @param	string		Title/URL
 * @return	void
 */
function addQueueEntryForHook($cfgRec, $title) {
	$cfgUid = $cfgRec['uid'];

	$this->pObj->addQueueEntry_callBack(
		$cfgRec['set_id'],
		array(
			'indexConfigUid' => $cfgUid,	// This must ALWAYS be the cfgRec uid!
			'url' => $title,
			'procInstructions' => array('[Index Cfg UID#'.$cfgUid.']')	// Just for information: shows that an indexing configuration added the entry.
		),
		$this->callBack,
		$cfgRec['pid']
	);
}
800
/**
 * Deletes all data stored by indexed search for a given page.
 * The phash values registered for the page in "index_section" are looked up and all rows with these values are removed from the index tables.
 *
 * @param	integer		Uid of the page to delete all pHash
 * @return	void
 */
function deleteFromIndex($id) {

		// Lookup the phash rows of the page:
	$sectionRows = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('phash','index_section', 'page_id='.intval($id));
	if (!count($sectionRows)) {
		return;
	}

		// Collect the phash values:
	$phashList = array();
	foreach ($sectionRows as $sectionRow) {
		$phashList[] = $sectionRow['phash'];
	}

		// One delete query per index table, all with the same condition:
	$condition = 'phash IN ('.implode(',',$GLOBALS['TYPO3_DB']->cleanIntArray($phashList)).')';
	foreach (array('index_debug','index_fulltext','index_grlist','index_phash','index_rel','index_section') as $indexTable) {
		$GLOBALS['TYPO3_DB']->exec_DELETEquery($indexTable, $condition);
	}
}
825
826
827
828
829
830
831
832 /*************************
833 *
834 * Hook functions for TCEmain (indexing of records)
835 *
836 *************************/
837
/**
 * TCEmain hook function which removes a page from the index when the page is deleted.
 *
 * @param	string		TCEmain command
 * @param	string		Table name
 * @param	string		Record ID. If new record its a string pointing to index inside t3lib_tcemain::substNEWwithIDs
 * @param	mixed		Target value (ignored)
 * @param	object		Reference to tcemain calling object
 * @return	void
 */
function processCmdmap_preProcess($command, $table, $id, $value, &$pObj) {

		// Only the deletion of pages is of interest here:
	if ($command!='delete' || $table != 'pages') {
		return;
	}

		// Clean up the index
	$this->deleteFromIndex($id);
}
855
/**
 * TCEmain hook function for on-the-fly indexing of database records.
 * A page which was just set to hidden or "no search" is removed from the index. After that the saved record is indexed through every matching "records" indexing configuration with "index on change" enabled.
 *
 * @param	string		Status "new" or "update"
 * @param	string		Table name
 * @param	string		Record ID. If new record its a string pointing to index inside t3lib_tcemain::substNEWwithIDs
 * @param	array		Field array of updated fields in the operation
 * @param	object		Reference to tcemain calling object
 * @return	void
 */
function processDatamap_afterDatabaseOperations($status, $table, $id, $fieldArray, &$pObj) {

		// Nothing to do if no fields were updated:
	if (!count($fieldArray)) {
		return;
	}

	if ($status=='new') {
			// Translate new ids.
		$id = $pObj->substNEWwithIDs[$id];
	} elseif ($table=='pages' && $status=='update') {
		if ((array_key_exists('hidden',$fieldArray) && $fieldArray['hidden']==1) || (array_key_exists('no_search',$fieldArray) && $fieldArray['no_search']==1)) {
				// If the page should be hidden or not indexed after update, delete index for this page
			$this->deleteFromIndex($id);
		}
	}

		// Get the full record; without it there is nothing to index:
	$currentRecord = t3lib_BEfunc::getRecord($table,$id);
	if (!is_array($currentRecord)) {
		return;
	}

		// Select all (not running) indexing configurations of type "record" (1) and which points to this table and is located on the same page as the record or pointing to the right source PID
	$matchingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
		'*',
		'index_config',
		'hidden=0
AND (starttime=0 OR starttime<='.time().')
AND set_id=0
AND type=1
AND table2index='.$GLOBALS['TYPO3_DB']->fullQuoteStr($table,'index_config').'
AND (
(alternative_source_pid=0 AND pid='.intval($currentRecord['pid']).')
OR (alternative_source_pid='.intval($currentRecord['pid']).')
)
AND records_indexonchange=1
'.t3lib_BEfunc::deleteClause('index_config')
	);

		// Index the record once per configuration:
	foreach ($matchingConfigurations as $matchingCfgRec) {
		$this->indexSingleRecord($currentRecord,$matchingCfgRec);
	}
}
908 }
909
910
/**
 * Crawler hook for indexed search. Works with the "crawler" extension
 * This hook is specifically used to index external files found on pages through the crawler extension.
 *
 * @author	Kasper Skaarhoj <kasperYYYY@typo3.com>
 * @package TYPO3
 * @subpackage tx_indexedsearch
 * @see tx_indexedsearch_indexer::extractLinks()
 */
class tx_indexedsearch_files {

	/**
	 * Call back function for execution of a log element.
	 * Indexes the document named in the parameters with an indexer which is set up from $params['conf']. If $params['alturl'] is set, that location is handed to the indexer as the first argument, together with the document name and its lowercased file extension.
	 *
	 * @param	array		Params from log element.
	 * @param	object		Parent object (tx_crawler lib)
	 * @return	array		Result array; nothing is returned if $params['conf'] is not an array
	 */
	function crawler_execute($params,&$pObj) {

			// Load indexer if not yet.
		$this->loadIndexerClass();

			// Without an indexer configuration nothing can be indexed:
		if (!is_array($params['conf'])) {
			return;
		}

			// Initialize the indexer class:
		$indexerObj = &t3lib_div::makeInstance('tx_indexedsearch_indexer');
		$indexerObj->conf = $params['conf'];
		$indexerObj->init();

			// Index document:
		if (!$params['alturl']) {
			$indexerObj->indexRegularDocument($params['document'], TRUE);
		} else {
			$documentInfo = pathinfo($params['document']);
			$extension = strtolower($documentInfo['extension']);
			$indexerObj->indexRegularDocument($params['alturl'], TRUE, $params['document'], $extension);
		}

			// Return OK:
		return array('content' => array());
	}

	/**
	 * Include indexer class (class.indexer.php of the "indexed_search" extension), once.
	 *
	 * @return	void
	 */
	function loadIndexerClass() {
			// The file is included in the scope of this function, so the global is declared here.
		global $TYPO3_CONF_VARS;

		require_once(t3lib_extMgm::extPath('indexed_search').'class.indexer.php');
	}
}
965
966
// Include an extension class file (XCLASS) if one is registered for this file in the current TYPO3 mode:
if (defined('TYPO3_MODE')) {
	if ($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.crawler.php']) {
		include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.crawler.php']);
	}
}
970
971 ?>