0b4d29c60d831c2d02de628f443fcede83d0aab0
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / class.crawler.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2001-2009 Kasper Skaarhoj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the TYPO3 project. The TYPO3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 * A copy is found in the textfile GPL.txt and important notices to the license
17 * from the author is found in LICENSE.txt distributed with these scripts.
18 *
19 *
20 * This script is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
24 *
25 * This copyright notice MUST APPEAR in all copies of the script!
26 ***************************************************************/
27 /**
28 * Crawler hook for indexed search. Works with the "crawler" extension
29 *
30 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
31 */
32 /**
33 * [CLASS/FUNCTION INDEX of SCRIPT]
34 *
35 *
36 *
37 * 87: class tx_indexedsearch_crawler
38 * 106: function crawler_init(&$pObj)
39 * 219: function crawler_execute($params,&$pObj)
40 * 285: function crawler_execute_type1($cfgRec,&$session_data,$params,&$pObj)
41 * 345: function crawler_execute_type2($cfgRec,&$session_data,$params,&$pObj)
42 * 414: function crawler_execute_type3($cfgRec,&$session_data,$params,&$pObj)
43 * 458: function crawler_execute_type4($cfgRec,&$session_data,$params,&$pObj)
44 * 513: function cleanUpOldRunningConfigurations()
45 *
46 * SECTION: Helper functions
47 * 579: function checkUrl($url,$urlLog,$baseUrl)
48 * 602: function indexExtUrl($url, $pageId, $rl, $cfgUid, $setId)
49 * 645: function indexSingleRecord($r,$cfgRec,$rl=NULL)
50 * 694: function loadIndexerClass()
51 * 706: function getUidRootLineForClosestTemplate($id)
52 * 739: function generateNextIndexingTime($cfgRec)
53 * 778: function checkDeniedSuburls($url, $url_deny)
54 * 798: function addQueueEntryForHook($cfgRec, $title)
55 *
56 * SECTION: Hook functions for TCEmain (indexing of records)
57 * 830: function processDatamap_afterDatabaseOperations($status, $table, $id, $fieldArray, &$pObj)
58 *
59 *
60 * 879: class tx_indexedsearch_files
61 * 888: function crawler_execute($params,&$pObj)
62 * 913: function loadIndexerClass()
63 *
64 * TOTAL FUNCTIONS: 18
65 * (This index is automatically created/updated by the extension "extdeveval")
66 *
67 */
68
69
70
71
// Make sure the backend language object (and with it the backend charset) is available;
// tx_indexedsearch_crawler::indexSingleRecord() reads $GLOBALS['LANG']->charSet.
if (!is_object($GLOBALS['LANG'])) {
	$GLOBALS['LANG'] = t3lib_div::makeInstance('language');
	// Only read the user configuration when a backend user object actually exists.
	// Without one, NULL is passed, which is what the unguarded property access
	// evaluated to anyway (just without raising a PHP notice).
	$GLOBALS['LANG']->init(
		(isset($GLOBALS['BE_USER']) && is_object($GLOBALS['BE_USER'])) ? $GLOBALS['BE_USER']->uc['lang'] : NULL
	);
}
77
78
79 /**
80 * Crawler hook for indexed search. Works with the "crawler" extension
81 *
82 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
83 * @package TYPO3
84 * @subpackage tx_indexedsearch
85 */
86 class tx_indexedsearch_crawler {
87
88 // Static:
89 var $secondsPerExternalUrl = 3; // Number of seconds to use as interval between queued indexing operations of URLs / files (types 2 & 3)
90
91 // Internal, dynamic:
92 var $instanceCounter = 0; // Counts up for each added URL (type 3)
93
94 // Internal, static:
95 var $callBack = 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'; // The object reference to this class.
96
/**
 * Initialization of crawler hook.
 * This function is asked for each instance of the crawler. It selects all indexing
 * configurations which are due (not hidden, start time reached, "timer_next_indexing"
 * in the past and not currently running, i.e. set_id=0), marks each of them as running
 * by writing a fresh set-ID plus the next indexing time, and adds the first entry for it
 * to the crawler queue. Finally, running configurations without pending queue entries
 * are reset through cleanUpOldRunningConfigurations().
 *
 * @param	object		Parent object (tx_crawler lib)
 * @return	void
 */
function crawler_init(&$pObj){
	$now = time();

		// Select all indexing configuration which are waiting to be activated:
	$indexingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
		'*',
		'index_config',
		'hidden=0
			AND (starttime=0 OR starttime<='.$now.')
			AND timer_next_indexing<'.$now.'
			AND set_id=0
			'.t3lib_BEfunc::deleteClause('index_config')
	);

		// Do not iterate over a non-array result (e.g. when the query failed):
	if (!is_array($indexingConfigurations)) {
		$indexingConfigurations = array();
	}

		// For each configuration, check if it should be executed and if so, start:
	foreach ($indexingConfigurations as $cfgRec) {

			// Generate a unique set-ID:
		$setId = t3lib_div::md5int(microtime());

			// Get next time:
		$nextTime = $this->generateNextIndexingTime($cfgRec);

			// Start process by updating index-config record:
		$field_array = array(
			'set_id' => $setId,
			'timer_next_indexing' => $nextTime,
			'session_data' => '',
		);
		$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_config', 'uid='.intval($cfgRec['uid']), $field_array);

			// Build the parameters of the first queue entry, based on configuration type.
			// $params stays NULL when nothing has to be queued.
		$params = NULL;
		$procInstruction = '[Index Cfg UID#'.$cfgRec['uid'].']';

		switch ($cfgRec['type']) {
			case 1:	// RECORDS:
				$params = array(
					'indexConfigUid' => $cfgRec['uid'],
					'procInstructions' => array($procInstruction),
					'url' => 'Records (start)',	// Just for show.
				);
			break;
			case 2:	// FILES:
				$params = array(
					'indexConfigUid' => $cfgRec['uid'],				// General
					'procInstructions' => array($procInstruction),	// General
					'url' => $cfgRec['filepath'],					// Partly general... (for URL and file types)
					'depth' => 0									// Specific for URL and file types
				);
			break;
			case 3:	// External URL:
				$params = array(
					'indexConfigUid' => $cfgRec['uid'],				// General
					'procInstructions' => array($procInstruction),	// General
					'url' => $cfgRec['externalUrl'],				// Partly general... (for URL and file types)
					'depth' => 0									// Specific for URL and file types
				);
			break;
			case 4:	// Page tree
				$params = array(
					'indexConfigUid' => $cfgRec['uid'],						// General
					'procInstructions' => array($procInstruction),			// General
					'url' => intval($cfgRec['alternative_source_pid']),		// Partly general... (for URL and file types and page tree (root))
					'depth' => 0											// Specific for URL and file types and page tree
				);
			break;
			case 5:	// Meta configuration, nothing to do:
				# NOOP
			break;
			default:	// Custom type, handled by a registered hook object (if any):
				if (!empty($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']])) {
					$hookObj = &t3lib_div::getUserObj($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']]);

					if (is_object($hookObj)) {
							// Explicitly defined (and reset for every configuration) instead of
							// handing an undefined variable to the hook:
						$message = NULL;

						$params = array(
							'indexConfigUid' => $cfgRec['uid'],											// General
							'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].'/CUSTOM]'),	// General
							'url' => $hookObj->initMessage($message),
						);
					}
				}
			break;
		}

		if (is_array($params)) {
			$pObj->addQueueEntry_callBack($setId, $params, $this->callBack, $cfgRec['pid']);
		}
	}

		// Finally, look up all old index configurations which are finished and needs to be reset and done.
	$this->cleanUpOldRunningConfigurations();
}
210
/**
 * Call back function for execution of a queue/log element.
 * Loads the indexing configuration record named by $params['indexConfigUid'], dispatches to
 * the handler matching the configuration type (or to a registered hook object for custom
 * types) and afterwards stores the possibly modified session data back into the record.
 *
 * @param	array		Params from log element. Must contain $params['indexConfigUid']
 * @param	object		Parent object (tx_crawler lib)
 * @return	array		Result array; key "log" holds the params
 */
function crawler_execute($params,&$pObj) {

		// Without an indexer configuration ID there is nothing to do:
	if (!$params['indexConfigUid']) {
		return array('log' => $params);
	}

		// Fetch the indexing configuration record:
	list($cfgRec) = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
		'*',
		'index_config',
		'uid='.intval($params['indexConfigUid'])
	);

	if (!is_array($cfgRec)) {
		return array('log' => $params);
	}

		// Session data travels serialized in the configuration record:
	$session_data = unserialize($cfgRec['session_data']);

		// Dispatch on configuration type:
	switch ($cfgRec['type']) {
		case 1:	// Records
			$this->crawler_execute_type1($cfgRec, $session_data, $params, $pObj);
		break;
		case 2:	// Files
			$this->crawler_execute_type2($cfgRec, $session_data, $params, $pObj);
		break;
		case 3:	// External URL
			$this->crawler_execute_type3($cfgRec, $session_data, $params, $pObj);
		break;
		case 4:	// Page tree
			$this->crawler_execute_type4($cfgRec, $session_data, $params, $pObj);
		break;
		case 5:	// Meta
			# NOOP (should never enter here!)
		break;
		default:
			$hookReference = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']];
			if ($hookReference) {
				$hookObj = &t3lib_div::getUserObj($hookReference);

				if (is_object($hookObj)) {
						// Keep the parent object around, addQueueEntryForHook() needs it:
					$this->pObj = &$pObj;
					$hookObj->indexOperation($cfgRec, $session_data, $params, $this);
				}
			}
		break;
	}

		// Persist the session data, the handlers may have changed it:
	$GLOBALS['TYPO3_DB']->exec_UPDATEquery(
		'index_config',
		'uid='.intval($cfgRec['uid']),
		array('session_data' => serialize($session_data))
	);

	return array('log' => $params);
}
274
/**
 * Indexing records from a table, one batch per call.
 * Records with a uid above the one remembered in the session data are selected (ordered by
 * uid, limited to the batch size), indexed one by one, and a follow-up queue entry is added
 * as long as the batch was not empty.
 *
 * @param	array		Indexing Configuration Record
 * @param	array		Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param	array		Parameters from the log queue.
 * @param	object		Parent object (from "crawler" extension!)
 * @return	void
 */
function crawler_execute_type1($cfgRec,&$session_data,$params,&$pObj) {
	$table = $cfgRec['table2index'];

		// The table must be configured and known to TCA:
	if (!($table && isset($GLOBALS['TCA'][$table]))) {
		return;
	}

		// First call of the session: start below the lowest possible uid.
	if (!is_array($session_data)) {
		$session_data = array('uid' => 0);
	}

		// Source page (alternative source pid wins over the pid of the configuration) and batch size:
	$alternativePid = intval($cfgRec['alternative_source_pid']);
	$pid = $alternativePid ? $alternativePid : $cfgRec['pid'];
	$batchSize = 100;
	if ($cfgRec['recordsbatch']) {
		$batchSize = t3lib_div::intInRange($cfgRec['recordsbatch'], 1);
	}

		// Root line to relate the indexed records to:
	$rootLine = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);

		// Next batch of records:
	$whereClause = 'pid = '.intval($pid).'
		AND uid > '.intval($session_data['uid']).
		t3lib_BEfunc::deleteClause($table).
		t3lib_BEfunc::BEenableFields($table);
	$recs = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('*', $table, $whereClause, '', 'uid', $batchSize);

	if (!count($recs)) {
		return;
	}

	$lastUid = NULL;
	foreach ($recs as $record) {
		$this->indexSingleRecord($record, $cfgRec, $rootLine);

			// Remember how far we got:
		$lastUid = $record['uid'];
		$session_data['uid'] = $lastUid;
	}

		// Queue the next batch:
	$nparams = array(
		'indexConfigUid' => $cfgRec['uid'],
		'url' => 'Records from UID#'.($lastUid+1).'-?',
		'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']')
	);
	$pObj->addQueueEntry_callBack($cfgRec['set_id'], $nparams, $this->callBack, $cfgRec['pid']);
}
335
/**
 * Indexing files from a directory.
 * $params['url'] is either a file, which is indexed right away, or a directory, whose files
 * (filtered by the configured extension list) and - while the configured depth is not yet
 * reached - sub directories are added to the crawler queue as new entries.
 *
 * @param	array		Indexing Configuration Record
 * @param	array		Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param	array		Parameters from the log queue.
 * @param	object		Parent object (from "crawler" extension!)
 * @return	void
 */
function crawler_execute_type2($cfgRec,&$session_data,$params,&$pObj) {

		// Prepare path, making it absolute and checking:
	$readpath = $params['url'];
	if (!t3lib_div::isAbsPath($readpath)) {
		$readpath = t3lib_div::getFileAbsFileName($readpath);
	}

	if (!t3lib_div::isAllowedAbsPath($readpath)) {
		return;
	}

	if (@is_file($readpath)) {	// If file, index it!

			// Get root line (need to provide this when indexing external files)
		$rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);

			// Load indexer if not yet.
		$this->loadIndexerClass();

			// (Re)-Indexing file on page.
		$indexerObj = &t3lib_div::makeInstance('tx_indexedsearch_indexer');
		$indexerObj->backend_initIndexer($cfgRec['pid'], 0, 0, '', $rl);
		$indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
		$indexerObj->hash['phash'] = -1;	// EXPERIMENT - but to avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)

			// Index document (path relative to PATH_site):
		$indexerObj->indexRegularDocument(substr($readpath, strlen(PATH_site)), TRUE);

	} elseif (@is_dir($readpath)) {	// If dir, read content and create new pending items for log:

			// Make sure the directory ends with exactly one slash. Otherwise the sub directory
			// entries built below would be glued to the directory name ("docs" + "sub" => "docssub/")
			// when the configured path was entered without trailing slash.
		$readpath = rtrim($readpath, '/').'/';

			// Select files in path:
		$extList = implode(',', t3lib_div::trimExplode(',', $cfgRec['extensions'], 1));
		$fileArr = array();
		$files = t3lib_div::getAllFilesAndFoldersInPath($fileArr, $readpath, $extList, 0, 0);

			// Add sub directories as long as the configured depth allows it:
		$directoryList = t3lib_div::get_dirs($readpath);
		if (is_array($directoryList) && $params['depth'] < $cfgRec['depth']) {
			foreach ($directoryList as $subdir) {
				if ((string)$subdir != '') {
					$files[] = $readpath.$subdir.'/';
				}
			}
		}
		$files = t3lib_div::removePrefixPathFromList($files, PATH_site);

			// Traverse the items and create log entries, each scheduled a bit later than the previous one:
		foreach ($files as $path) {
			$this->instanceCounter++;
			if ($path !== $params['url']) {
				$nparams = array(
					'indexConfigUid' => $cfgRec['uid'],
					'url' => $path,
					'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),
					'depth' => $params['depth']+1
				);
				$pObj->addQueueEntry_callBack(
					$cfgRec['set_id'],
					$nparams,
					$this->callBack,
					$cfgRec['pid'],
					time()+$this->instanceCounter*$this->secondsPerExternalUrl
				);
			}
		}
	}
}
404
/**
 * Indexing External URLs.
 * Indexes the URL given in $params['url'] and - while the configured depth is not reached -
 * queues every link found on it that passes checkUrl() and is not denied by the
 * configuration's "url_deny" list. Queued URLs are remembered in $session_data['urlLog'].
 *
 * @param	array		Indexing Configuration Record
 * @param	array		Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param	array		Parameters from the log queue.
 * @param	object		Parent object (from "crawler" extension!)
 * @return	void
 */
function crawler_execute_type3($cfgRec,&$session_data,$params,&$pObj) {

		// First call of the session: the start URL is the first entry of the URL log.
	if (!is_array($session_data)) {
		$session_data = array();
		$session_data['urlLog'] = array($params['url']);
	}

		// Index the URL and collect the links found on it:
	$rootLine = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
	$foundUrls = $this->indexExtUrl($params['url'], $cfgRec['pid'], $rootLine, $cfgRec['uid'], $cfgRec['set_id']);

		// Maximum depth reached? Then no further entries are queued.
	if (!($params['depth'] < $cfgRec['depth'])) {
		return;
	}

	foreach ($foundUrls as $foundUrl) {
		$acceptedUrl = $this->checkUrl($foundUrl, $session_data['urlLog'], $cfgRec['externalUrl']);
		if (!$acceptedUrl) {
			continue;
		}
		if ($this->checkDeniedSuburls($acceptedUrl, $cfgRec['url_deny'])) {
			continue;
		}

		$this->instanceCounter++;
		$session_data['urlLog'][] = $acceptedUrl;

			// Queue entry, scheduled a bit later than the previous one:
		$scheduledTime = time()+$this->instanceCounter*$this->secondsPerExternalUrl;
		$nparams = array(
			'indexConfigUid' => $cfgRec['uid'],
			'url' => $acceptedUrl,
			'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),
			'depth' => $params['depth']+1
		);
		$pObj->addQueueEntry_callBack($cfgRec['set_id'], $nparams, $this->callBack, $cfgRec['pid'], $scheduledTime);
	}
}
448
/**
 * Page tree indexing type.
 * Submits the URLs of the page whose uid is given in $params['url'] to the crawler (with the
 * processing instruction "tx_indexedsearch_reindex") and - while the configured depth is not
 * reached - adds a queue entry for each of its non-deleted sub pages.
 *
 * @param	array		Indexing Configuration Record
 * @param	array		Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param	array		Parameters from the log queue.
 * @param	object		Parent object (from "crawler" extension!)
 * @return	void
 */
function crawler_execute_type4($cfgRec,&$session_data,$params,&$pObj) {

		// The "url" of a page tree entry is the page uid:
	$pageUid = intval($params['url']);

		// URL configurations for that page, as provided by the crawler:
	$pageRow = t3lib_BEfunc::getRecord('pages', $pageUid);
	$urlSets = $pObj->getUrlsForPageRow($pageRow);

	$duplicateTrack = array();	// Registry for duplicates
	$downloadUrls = array();	// Dummy.

		// Submit URLs:
	if (count($urlSets)) {
		foreach ($urlSets as $urlSet) {
			$pObj->urlListFromUrlArray($urlSet, $pageRow, time(), 30, 1, 0, $duplicateTrack, $downloadUrls, array('tx_indexedsearch_reindex'));
		}
	}

		// Maximum depth reached? Then sub pages are not queued.
	if (!($params['depth'] < $cfgRec['depth'])) {
		return;
	}

		// Sub pages of the current page:
	$subPages = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
		'uid,title',
		'pages',
		'pid = '.intval($pageUid).t3lib_BEfunc::deleteClause('pages')
	);

	if (!count($subPages)) {
		return;
	}

	foreach ($subPages as $subPage) {
		$this->instanceCounter++;
		$session_data['urlLog'][] = 'pages:'.$subPage['uid'].': '.$subPage['title'];

			// Queue entry, scheduled a bit later than the previous one:
		$scheduledTime = time()+$this->instanceCounter*$this->secondsPerExternalUrl;
		$nparams = array(
			'indexConfigUid' => $cfgRec['uid'],
			'url' => $subPage['uid'],
			'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),
			'depth' => $params['depth']+1
		);
		$pObj->addQueueEntry_callBack($cfgRec['set_id'], $nparams, $this->callBack, $cfgRec['pid'], $scheduledTime);
	}
}
507
/**
 * Look up all running index configurations (set_id != 0) and finish those which have no
 * pending entries (exec_time=0) left in the crawler queue: index entries of the configuration
 * that stem from another (older) set-ID are removed and the configuration record is reset
 * (set_id=0, empty session data).
 *
 * @return	void
 */
function cleanUpOldRunningConfigurations() {

		// Lookup running index configurations:
	$runningIndexingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
		'uid,set_id',
		'index_config',
		'set_id!=0'.t3lib_BEfunc::deleteClause('index_config')
	);
	if (!is_array($runningIndexingConfigurations)) {
		return;
	}

	$indexTables = explode(',', 'index_phash,index_rel,index_section,index_grlist,index_fulltext,index_debug');

		// For each running configuration, look up how many log entries there are which are scheduled for execution and if none, clear the "set_id" (means; Processing was DONE)
	foreach ($runningIndexingConfigurations as $cfgRec) {

			// Look for ended processes:
		$queued_items = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows(
			'*',
			'tx_crawler_queue',
			'set_id='.intval($cfgRec['set_id']).' AND exec_time=0'
		);

			// A failed count query must not be mistaken for "nothing queued anymore",
			// and pending entries mean the process is still running:
		if ($queued_items === FALSE || $queued_items) {
			continue;
		}

			// Lookup old phash rows (set_id is cast like every other id put into SQL here):
		$oldPhashRows = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
			'phash',
			'index_phash',
			'freeIndexUid='.intval($cfgRec['uid']).' AND freeIndexSetId!='.intval($cfgRec['set_id'])
		);

		if (is_array($oldPhashRows) && count($oldPhashRows)) {
			$pHashesToDelete = array();
			foreach ($oldPhashRows as $pHashRow) {
				$pHashesToDelete[] = $pHashRow['phash'];
			}

				// Removing old registrations for all tables with one query per table and chunk
				// (same "phash IN (...)" approach as deleteFromIndex()). The chunks keep the
				// statement length bounded for large indexes.
			foreach (array_chunk($pHashesToDelete, 1000) as $pHashChunk) {
				$where_clause = 'phash IN ('.implode(',', $GLOBALS['TYPO3_DB']->cleanIntArray($pHashChunk)).')';
				foreach ($indexTables as $table) {
					$GLOBALS['TYPO3_DB']->exec_DELETEquery($table, $where_clause);
				}
			}
		}

			// End process by updating index-config record:
		$field_array = array(
			'set_id' => 0,
			'session_data' => '',
		);
		$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_config', 'uid='.intval($cfgRec['uid']), $field_array);
	}
}
558
559
560
561
562
563
564
565 /*****************************************
566 *
567 * Helper functions
568 *
569 *****************************************/
570
/**
 * Check if an input URL is allowed to be indexed.
 * A trailing double slash is reduced to a single slash and the fragment ("#...") is removed.
 * The resulting URL is accepted if it contains no "../", starts with the base URL and is
 * not yet present in the URL log.
 *
 * @param	string		URL string to check
 * @param	array		Array of already indexed URLs (input url is looked up here and must not exist already)
 * @param	string		Base URL of the indexing process (input URL must be "inside" the base URL!)
 * @return	string		Returns the (cleaned) URL if OK, otherwise FALSE
 */
function checkUrl($url,$urlLog,$baseUrl) {
	$url = preg_replace('/\/\/$/', '/', $url);
	list($url) = explode('#', $url);

		// No back paths:
	if (strpos($url, '../') !== FALSE) {
		return FALSE;
	}

		// Must be "inside" the base URL:
	if (!t3lib_div::isFirstPartOfStr($url, $baseUrl)) {
		return FALSE;
	}

		// Must not be known already:
	if (is_array($urlLog) && in_array($url, $urlLog)) {
		return FALSE;
	}

	return $url;
}
591
/**
 * Indexing External URL.
 * Indexes the given URL and returns the hyperlinks found in the fetched content. Links
 * without a scheme are turned into absolute URLs using scheme, host and port of $url.
 *
 * NOTE(review): relative links are resolved against the host root, not against the
 * directory of the indexed document - this is how the code has always behaved; confirm
 * whether that is intended.
 *
 * @param	string		URL, http://....
 * @param	integer		Page id to relate indexing to.
 * @param	array		Rootline array to relate indexing to
 * @param	integer		Configuration UID
 * @param	integer		Set ID value
 * @return	array		URLs found on this page
 */
function indexExtUrl($url, $pageId, $rl, $cfgUid, $setId) {

		// Load indexer if not yet.
	$this->loadIndexerClass();

		// Index external URL:
	$indexerObj = &t3lib_div::makeInstance('tx_indexedsearch_indexer');
	$indexerObj->backend_initIndexer($pageId, 0, 0, '', $rl);
	$indexerObj->backend_setFreeIndexUid($cfgUid, $setId);
	$indexerObj->hash['phash'] = -1;	// To avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)

	$indexerObj->indexExternalUrl($url);

		// Prefix for links without scheme: scheme://host[:port]
		// The port must be kept, otherwise sub URLs of a site running on a non-default port
		// point to a different server (and never match the base URL in checkUrl()).
	$url_qParts = parse_url($url);
	$hostPrefix = '';
	if (is_array($url_qParts) && !empty($url_qParts['scheme']) && !empty($url_qParts['host'])) {
		$hostPrefix = $url_qParts['scheme'].'://'.$url_qParts['host'];
		if (!empty($url_qParts['port'])) {
			$hostPrefix.= ':'.$url_qParts['port'];
		}
	}

		// Get URLs on this page:
	$subUrls = array();
	$list = $indexerObj->extractHyperLinks($indexerObj->indexExternalUrl_content);
	if (!is_array($list)) {
		return $subUrls;
	}

		// Traverse links:
	foreach ($list as $linkInfo) {

			// Decode entities:
		$subUrl = t3lib_div::htmlspecialchars_decode($linkInfo['href']);

		$qParts = parse_url($subUrl);
		if (!is_array($qParts) || empty($qParts['scheme'])) {

				// A link without scheme cannot be made absolute without a usable base URL:
			if ($hostPrefix === '') {
				continue;
			}

			if (substr($subUrl, 0, 2) === '//') {
					// Protocol-relative link ("//host/path"): only the scheme is missing.
				$subUrl = $url_qParts['scheme'].':'.$subUrl;
			} else {
					// Avoid a double slash for links which are already root-relative ("/path"):
				$subUrl = $hostPrefix.'/'.ltrim(t3lib_div::resolveBackPath($subUrl), '/');
			}
		}

		$subUrls[] = $subUrl;
	}

	return $subUrls;
}
636
/**
 * Indexing Single Record.
 * The record is indexed as if it was a TYPO3 page: the first field of the configuration's
 * field list provides the title, all further fields are concatenated to the content.
 * HTML tags are stripped from both.
 *
 * @param	array		Record to index
 * @param	array		Configuration Record
 * @param	array		Rootline array to relate indexing to; if not an array it is looked up from the pid of the configuration record
 * @return	void
 */
function indexSingleRecord($r,$cfgRec,$rl=NULL) {

		// Load indexer if not yet.
	$this->loadIndexerClass();

		// Init:
	$rl = is_array($rl) ? $rl : $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
	$fieldList = t3lib_div::trimExplode(',', $cfgRec['fieldlist'], 1);
	$tableCtrl = $GLOBALS['TCA'][$cfgRec['table2index']]['ctrl'];
	$languageField = $tableCtrl['languageField'];
	$sys_language_uid = $languageField ? $r[$languageField] : 0;

		// (Re)-Indexing a row from a table. The GET parameters come from the configuration,
		// with the marker ###UID### replaced by the uid of the record:
	$indexerObj = &t3lib_div::makeInstance('tx_indexedsearch_indexer');
	parse_str(str_replace('###UID###', $r['uid'], $cfgRec['get_params']), $GETparams);
	$indexerObj->backend_initIndexer($cfgRec['pid'], 0, $sys_language_uid, '', $rl, $GETparams, $cfgRec['chashcalc'] ? TRUE : FALSE);
	$indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
	$indexerObj->forceIndexing = TRUE;

		// Title = first field, content = all other fields.
		// Both are initialized so an empty field list does not leave the title undefined.
	$theTitle = '';
	$theContent = '';
	foreach ($fieldList as $k => $v) {
		if (!$k) {
			$theTitle = $r[$v];
		} else {
			$theContent.= $r[$v].' ';
		}
	}

		// Indexing the record as a page (but with parameters set, see ->backend_setFreeIndexUid())
	$indexerObj->backend_indexAsTYPO3Page(
		strip_tags($theTitle),
		'',
		'',
		strip_tags($theContent),
		$GLOBALS['LANG']->charSet,	// Requires that $GLOBALS['LANG'] is initialized (done at the top of this file)
		$r[$tableCtrl['tstamp']],
		$r[$tableCtrl['crdate']],
		$r['uid']
	);
}
688
/**
 * Includes the indexer class file of the "indexed_search" extension (once).
 *
 * @return	void
 */
function loadIndexerClass() {
		// The included file is evaluated in the scope of this function, so the global
		// declaration makes $TYPO3_CONF_VARS visible to it.
	global $TYPO3_CONF_VARS;

	require_once(
		t3lib_extMgm::extPath('indexed_search').'class.indexer.php'
	);
}
698
/**
 * Get rootline for closest TypoScript template root.
 * Algorithm same as used in Web > Template, Object browser
 *
 * @param	integer		The page id to traverse rootline back from
 * @return	array		Array holding the uid of each root line level, keyed like the root line of the template object
 */
function getUidRootLineForClosestTemplate($id) {
	global $TYPO3_CONF_VARS;

		// Template parser, without time-performance logging:
	$templateParser = t3lib_div::makeInstance('t3lib_tsparser_ext');
	$templateParser->tt_track = 0;
	$templateParser->init();

		// Root line of the page; running it through the templates generates the
		// constants/config + hierarchy info and sets the root line of the parser:
	$pageSelect = t3lib_div::makeInstance('t3lib_pageSelect');
	$templateParser->runThroughTemplates($pageSelect->getRootLine($id), 0);

		// Reduce each root line level to its uid:
	$uidList = array();
	foreach ($templateParser->rootLine as $level => $pageRecord) {
		$uidList[$level] = $pageRecord['uid'];
	}

	return $uidList;
}
726
/**
 * Generate the unix time stamp for next visit.
 * The result is "offset time + n * frequency" where the offset time is a midnight plus the
 * configured "timer_offset" (0-86400 seconds) and n is the number of frequency blocks needed
 * to reach the current time (rounded up).
 *
 * @param	array		Index configuration record
 * @return	integer		The next time stamp
 */
function generateNextIndexingTime($cfgRec) {
	$currentTime = time();

		// Now, find a midnight time to use for offset calculation. This has to differ depending on whether we have frequencies within a day or more than a day; Less than a day, we don't care which day to use for offset, more than a day we want to respect the currently entered day as offset regardless of when the script is run - thus the day-of-week used in case "Weekly" is selected will be respected
	if ($cfgRec['timer_frequency'] <= 24*3600) {
		$aMidNight = mktime(0, 0, 0) - 1*24*3600;
	} else {
		$lastTime = $cfgRec['timer_next_indexing'] ? $cfgRec['timer_next_indexing'] : time();
			// Four-digit year ('Y'): a two-digit year only works through mktime()'s
			// mapping of the values 0-69 / 70-100 to 2000-2069 / 1970-2000.
		$aMidNight = mktime(0, 0, 0, date('m', $lastTime), date('d', $lastTime), date('Y', $lastTime));
	}

		// Find last offset time plus frequency in seconds:
	$lastSureOffset = $aMidNight + t3lib_div::intInRange($cfgRec['timer_offset'], 0, 86400);
	$frequencySeconds = t3lib_div::intInRange($cfgRec['timer_frequency'], 1);

		// Now, find out how many blocks of the length of frequency there is until the next time:
	$frequencyBlocksUntilNextTime = ceil(($currentTime - $lastSureOffset) / $frequencySeconds);

		// Set next time to the offset + the frequencyblocks multiplied with the frequency length in seconds.
		// ceil() yields a float, so cast to return the documented integer.
	return (int)($lastSureOffset + $frequencyBlocksUntilNextTime * $frequencySeconds);
}
756
/**
 * Checks if $url starts with any of the URLs in the $url_deny "list" and if so, returns TRUE.
 * Nothing is output here: this runs inside crawler processing, where stray output of the
 * URL and deny list would end up in whatever context executes the queue.
 *
 * @param	string		URL to test
 * @param	string		String where URLs are separated by line-breaks; If any of these strings is the first part of $url, the function returns TRUE (to indicate denial of descend)
 * @return	boolean		TRUE if there is a matching URL (hence, do not index!)
 */
function checkDeniedSuburls($url, $url_deny) {
	if (trim($url_deny)) {
		$url_denyArray = t3lib_div::trimExplode(chr(10), $url_deny, 1);
		foreach ($url_denyArray as $testurl) {
			if (t3lib_div::isFirstPartOfStr($url, $testurl)) {
				return TRUE;
			}
		}
	}
	return FALSE;
}
776
/**
 * Adding entry in queue for Hook.
 * Uses the parent object stored in $this->pObj, which crawler_execute() sets right before
 * it calls the hook object.
 *
 * @param	array		Configuration record
 * @param	string		Title/URL
 * @return	void
 */
function addQueueEntryForHook($cfgRec, $title) {
	$configurationUid = $cfgRec['uid'];

	$nparams = array();
	$nparams['indexConfigUid'] = $configurationUid;	// This must ALWAYS be the cfgRec uid!
	$nparams['url'] = $title;
	$nparams['procInstructions'] = array('[Index Cfg UID#'.$configurationUid.']');	// Just for information; shows that an indexing configuration added the entry.

	$this->pObj->addQueueEntry_callBack($cfgRec['set_id'], $nparams, $this->callBack, $cfgRec['pid']);
}
793
/**
 * Deletes all data stored by indexed search for a given page.
 * The phash values registered for the page in "index_section" are looked up and all rows
 * with these phash values are removed from the index tables.
 *
 * @param	integer		Uid of the page to delete all pHash
 * @return	void
 */
function deleteFromIndex($id) {

		// phash rows registered for the page:
	$phashRows = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('phash', 'index_section', 'page_id='.intval($id));

	if (!count($phashRows)) {
		return;
	}

		// Collect the phash values:
	$phashList = array();
	foreach ($phashRows as $phashRow) {
		$phashList[] = $phashRow['phash'];
	}

		// Remove them from every index table with one query per table:
	$phashCondition = 'phash IN ('.implode(',', $GLOBALS['TYPO3_DB']->cleanIntArray($phashList)).')';
	foreach (explode(',', 'index_debug,index_fulltext,index_grlist,index_phash,index_rel,index_section') as $indexTable) {
		$GLOBALS['TYPO3_DB']->exec_DELETEquery($indexTable, $phashCondition);
	}
}
818
819
820
821
822
823
824
825 /*************************
826 *
827 * Hook functions for TCEmain (indexing of records)
828 *
829 *************************/
830
/**
 * TCEmain hook function called before a command is processed.
 * Removes the index data of a page when that page is about to be deleted.
 *
 * @param	string		TCEmain command
 * @param	string		Table name
 * @param	string		Record ID
 * @param	mixed		Target value (ignored)
 * @param	object		Reference to tcemain calling object
 * @return	void
 */
function processCmdmap_preProcess($command, $table, $id, $value, &$pObj) {

		// Only page deletions are of interest here:
	if ($table != 'pages' || $command != 'delete') {
		return;
	}

		// Clean up the index
	$this->deleteFromIndex($id);
}
848
/**
 * TCEmain hook function for on-the-fly indexing of database records.
 * If a page gets hidden or excluded from search by an update, its index data is removed.
 * After that the saved record is indexed with every matching, not running indexing
 * configuration of type "records" that has "records_indexonchange" set.
 *
 * @param	string		Status "new" or "update"
 * @param	string		Table name
 * @param	string		Record ID. If new record its a string pointing to index inside t3lib_tcemain::substNEWwithIDs
 * @param	array		Field array of updated fields in the operation
 * @param	object		Reference to tcemain calling object
 * @return	void
 */
function processDatamap_afterDatabaseOperations($status, $table, $id, $fieldArray, &$pObj) {

		// Nothing to do if no fields were updated:
	if (!count($fieldArray)) {
		return;
	}

	if ($status=='new') {
			// Translate new ids.
		$id = $pObj->substNEWwithIDs[$id];

	} elseif ($table=='pages' && $status=='update') {
		$becomesHidden = array_key_exists('hidden', $fieldArray) && $fieldArray['hidden']==1;
		$becomesUnsearchable = array_key_exists('no_search', $fieldArray) && $fieldArray['no_search']==1;

			// If the page should be hidden or not indexed after update, delete index for this page
		if ($becomesHidden || $becomesUnsearchable) {
			$this->deleteFromIndex($id);
		}
	}

		// Get full record; without it there is nothing to index:
	$currentRecord = t3lib_BEfunc::getRecord($table, $id);
	if (!is_array($currentRecord)) {
		return;
	}

		// Select all (not running) indexing configurations of type "record" (1) and which points to this table and is located on the same page as the record or pointing to the right source PID
	$recordPid = intval($currentRecord['pid']);
	$whereClause = 'hidden=0
		AND (starttime=0 OR starttime<='.time().')
		AND set_id=0
		AND type=1
		AND table2index='.$GLOBALS['TYPO3_DB']->fullQuoteStr($table, 'index_config').'
		AND (
			(alternative_source_pid=0 AND pid='.$recordPid.')
			OR (alternative_source_pid='.$recordPid.')
		)
		AND records_indexonchange=1
		'.t3lib_BEfunc::deleteClause('index_config');
	$indexingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('*', 'index_config', $whereClause);

	foreach ($indexingConfigurations as $cfgRec) {
		$this->indexSingleRecord($currentRecord, $cfgRec);
	}
}
901 }
902
903
/**
 * Crawler hook for indexed search. Works with the "crawler" extension
 * This hook is specifically used to index external files found on pages through the crawler extension.
 *
 * @author	Kasper Skaarhoj <kasperYYYY@typo3.com>
 * @package TYPO3
 * @subpackage tx_indexedsearch
 * @see tx_indexedsearch_indexer::extractLinks()
 */
class tx_indexedsearch_files {

	/**
	 * Call back function for execution of a log element.
	 * Indexes the document named in $params['document'] with an indexer configured from
	 * $params['conf']. If $params['alturl'] is set, that URL is indexed instead, with the
	 * document name and its lowercased file extension passed along.
	 *
	 * @param	array		Params from log element.
	 * @param	object		Parent object (tx_crawler lib)
	 * @return	array		Result array; nothing is returned if $params['conf'] is not an array
	 */
	function crawler_execute($params,&$pObj) {

			// Load indexer if not yet.
		$this->loadIndexerClass();

			// Without indexer configuration nothing can be indexed:
		if (!is_array($params['conf'])) {
			return;
		}

			// Set up the indexer with the configuration from the log element:
		$indexerObj = &t3lib_div::makeInstance('tx_indexedsearch_indexer');
		$indexerObj->conf = $params['conf'];
		$indexerObj->init();

			// Index document:
		if (!$params['alturl']) {
			$indexerObj->indexRegularDocument($params['document'], TRUE);
		} else {
			$documentInfo = pathinfo($params['document']);
			$extension = strtolower($documentInfo['extension']);
			$indexerObj->indexRegularDocument($params['alturl'], TRUE, $params['document'], $extension);
		}

			// Return OK:
		return array('content' => array());
	}

	/**
	 * Includes the indexer class file of the "indexed_search" extension (once).
	 *
	 * @return	void
	 */
	function loadIndexerClass() {
			// The included file is evaluated in the scope of this function, so the global
			// declaration makes $TYPO3_CONF_VARS visible to it.
		global $TYPO3_CONF_VARS;

		require_once(
			t3lib_extMgm::extPath('indexed_search').'class.indexer.php'
		);
	}
}
958
959
// Include an extending class file (XCLASS) if one is registered for this script:
if (defined('TYPO3_MODE')) {
	if ($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.crawler.php']) {
		include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.crawler.php']);
	}
}
962 }
963
964 ?>