Indexed Search modifications to support cronjob-based indexing. More to come...
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / class.crawler.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2001-2005 Kasper Skaarhoj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the TYPO3 project. The TYPO3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 * A copy is found in the textfile GPL.txt and important notices to the license
17 * from the author is found in LICENSE.txt distributed with these scripts.
18 *
19 *
20 * This script is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
24 *
25 * This copyright notice MUST APPEAR in all copies of the script!
26 ***************************************************************/
27 /**
28 * Crawler hook for indexed search. Works with the "crawler" extension
29 *
30 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
31 */
32 /**
33 * [CLASS/FUNCTION INDEX of SCRIPT]
34 *
35 *
36 *
37 * 59: class tx_indexedsearch_crawler
38 * 70: function crawler_init(&$pObj)
39 * 119: function crawler_execute($params,&$pObj)
40 * 180: function checkUrl($url,$urlLog,$baseUrl)
41 * 212: function indexExtUrl($url, $pageId, $rl, $cfgUid)
42 * 251: function loadIndexerClass()
43 * 263: function getUidRootLineForClosestTemplate($id)
44 *
45 * TOTAL FUNCTIONS: 6
46 * (This index is automatically created/updated by the extension "extdeveval")
47 *
48 */
49
50
51
52 /**
53 * Crawler hook for indexed search. Works with the "crawler" extension
54 *
55 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
56 * @package TYPO3
57 * @subpackage tx_indexedsearch
58 */
59 class tx_indexedsearch_crawler {
60
/**
 * Static: Number of seconds to use as interval between queued indexing operations of URLs
 */
var $secondsPerExternalUrl = 3;

/**
 * Internal, dynamic: Counts up for each added URL
 */
var $instanceCounter = 0;

/**
 * Internal, static: The object reference to this class (passed to the crawler as call back).
 */
var $callBack = 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler';
69
/**
 * Initialization of crawler hook.
 * This function is asked for each instance of the crawler and we must check if something is timed to happen and if so put one or more entries in the crawler's log to start processing.
 * In reality we select the indexing configurations which are not hidden, whose start time has passed and which are not currently running (set_id=0). Each of them is marked as running with a fresh set-ID and gets one initial queue entry.
 * Finally, running configurations which have no pending queue entries left are cleaned up.
 *
 * @param	object		Parent object (tx_crawler lib)
 * @return	void
 */
function crawler_init(&$pObj){

		// Select all indexing configuration which are waiting to be activated:
	$indexingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
		'uid,pid,first_run_time,frequency,last_run,type,externalUrl,filepath',
		'index_config',
		'hidden=0
			AND (starttime=0 OR starttime<='.time().')
			AND set_id=0
			'.t3lib_BEfunc::deleteClause('index_config')
	);

		// For each configuration, check if it should be executed and if so, start.
		// (The result is checked with is_array() so a failed query does not make foreach() complain.)
	if (is_array($indexingConfigurations)) {
		foreach($indexingConfigurations as $cfgRec) {

				// Generate a unique set-ID:
			$setId = t3lib_div::md5int(microtime());

				// Start process by updating index-config record:
			$field_array = array (
				'set_id' => $setId,
				'session_data' => '',
			);
			$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_config','uid='.intval($cfgRec['uid']), $field_array);

				// Build the parameters of the first queue entry, based on configuration type:
			$params = NULL;
			switch($cfgRec['type']) {
				case 1:		// Records:
					$params = array(
						'indexConfigUid' => $cfgRec['uid'],
						'url' => 'Records (start)',
						'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']')
					);
				break;
				case 3:		// External URL:
					$params = array(
						'indexConfigUid' => $cfgRec['uid'],		// General
						'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),	// General
						'url' => $cfgRec['externalUrl'],	// Partly general... (for URL and file types)
						'depth' => 0	// Specific for URL and file types
					);
				break;
				case 2:		// File path:
					$params = array(
						'indexConfigUid' => $cfgRec['uid'],		// General
						'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),	// General
						'url' => $cfgRec['filepath'],	// Partly general... (for URL and file types)
						'depth' => 0	// Specific for URL and file types
					);
				break;
			}

				// Queue the entry (types other than 1, 2 and 3 produce no queue entry, as before):
			if (is_array($params)) {
				$pObj->addQueueEntry_callBack($setId,$params,$this->callBack,$cfgRec['pid']);
			}
		}
	}

		// Finally, look up all old index configurations which are finished and need to be reset and done.
	$this->cleanUpOldRunningConfigurations();
}
146
/**
 * Call back function for execution of a log element.
 * Loads the indexing configuration referenced by $params['indexConfigUid'] and processes one step of it, depending on the configuration type:
 * - type 1: indexes up to two records of the configured table (ordered by uid, continuing after the last uid stored in the session data) and queues a follow-up entry if any were found.
 * - type 3: indexes the external URL in $params['url'] and, while the configured depth is not reached, queues the links found on it which pass checkUrl().
 * - type 2: indexes the file in $params['url'] or, if it is a directory, queues its files (and sub directories while the configured depth is not reached).
 * The (possibly modified) session data is written back to the configuration record afterwards.
 *
 * @param	array		Params from log element
 * @param	object		Parent object (tx_crawler lib)
 * @return	array		Result array; the input parameters are returned under the key "log"
 */
function crawler_execute($params,&$pObj) {

		// Indexer configuration ID must exist:
	if ($params['indexConfigUid']) {
		list($cfgRec) = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
			'*',
			'index_config',
			'uid='.intval($params['indexConfigUid'])
		);

		if (is_array($cfgRec)) {

				// Unpack session data:
			$session_data = unserialize($cfgRec['session_data']);

				// Select which type:
			switch($cfgRec['type']) {
				case 1:		// Records:
					if ($cfgRec['table2index'] && isset($GLOBALS['TCA'][$cfgRec['table2index']])) {

							// Init session data array if not already:
						if (!is_array($session_data)) {
							$session_data = array(
								'uid' => 0
							);
						}

							// Init. If no alternative source pid is configured, the records are read from the page of the configuration record.
							// (This previously read $this->pObj->id, but no "pObj" property is ever set on this object.)
						$pid = intval($cfgRec['alternative_source_pid']) ? intval($cfgRec['alternative_source_pid']) : $cfgRec['pid'];
						$fieldList = t3lib_div::trimExplode(',',$cfgRec['fieldlist'],1);

							// Get root line:
						$rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);

							// Load indexer if not yet.
						$this->loadIndexerClass();

							// Select the next (at most two) records after the last processed uid:
						$recs = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
							'*',
							$cfgRec['table2index'],
							'pid = '.intval($pid).'
								AND uid > '.intval($session_data['uid']).
								t3lib_BEfunc::deleteClause($cfgRec['table2index']),
							'',
							'uid',
							'2'
						);

							// Traverse:
						if (is_array($recs) && count($recs)) {
							foreach($recs as $r) {

									// (Re)-Indexing a row from a table:
								$indexerObj = &t3lib_div::makeInstance('tx_indexedsearch_indexer');
								parse_str(str_replace('###UID###',$r['uid'],$cfgRec['get_params']),$GETparams);
								$indexerObj->backend_initIndexer($cfgRec['pid'], 0, 0, '', $rl, $GETparams, $cfgRec['chashcalc'] ? TRUE : FALSE);
								$indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
								$indexerObj->forceIndexing = TRUE;

									// First field in the list is the title, the remaining fields make up the content.
									// Both are reset for every record so nothing is undefined (empty field list) or carried over.
								$theTitle = '';
								$theContent = '';
								foreach($fieldList as $k => $v) {
									if (!$k) {
										$theTitle = $r[$v];
									} else {
										$theContent.= $r[$v].' ';
									}
								}

								$indexerObj->backend_indexAsTYPO3Page(
									$theTitle,
									'',
									'',
									$theContent,
									$GLOBALS['LANG']->charSet,
									$r[$GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']['tstamp']],
									$r[$GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']['crdate']],
									$r['uid']
								);

									// Update the UID we last processed:
								$session_data['uid'] = $r['uid'];
							}

								// Queue a follow-up entry which continues after the last record processed above:
							$nparams = array(
								'indexConfigUid' => $cfgRec['uid'],
								'url' => 'Records from UID#'.($r['uid']+1).'-?',
								'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']')
							);
							$pObj->addQueueEntry_callBack($cfgRec['set_id'],$nparams,$this->callBack,$cfgRec['pid']);
						}
					}
				break;
				case 3:		// External URL:

						// Init session data array if not already:
					if (!is_array($session_data)) {
						$session_data = array(
							'urlLog' => array($params['url'])
						);
					}

						// Index the URL:
					$rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
					$subUrls = $this->indexExtUrl($params['url'], $cfgRec['pid'], $rl, $cfgRec['uid'], $cfgRec['set_id']);

						// Add more elements to log now:
					if ($params['depth'] < $cfgRec['depth']) {
						foreach($subUrls as $url) {
							if ($url = $this->checkUrl($url,$session_data['urlLog'],$cfgRec['externalUrl'])) {
								$this->instanceCounter++;
								$session_data['urlLog'][] = $url;

									// Parameters:
								$nparams = array(
									'indexConfigUid' => $cfgRec['uid'],
									'url' => $url,
									'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),
									'depth' => $params['depth']+1
								);
									// Each new URL is scheduled $secondsPerExternalUrl seconds after the previous one:
								$pObj->addQueueEntry_callBack($cfgRec['set_id'],$nparams,$this->callBack,$cfgRec['pid'],time()+$this->instanceCounter*$this->secondsPerExternalUrl);
							}
						}
					}
				break;
				case 2:		// File path:

						// Prepare path, making it absolute and checking.
						// (The check previously tested the undefined variable $readPath.)
					$readpath = $params['url'];
					if (!t3lib_div::isAbsPath($readpath)) {
						$readpath = t3lib_div::getFileAbsFileName($readpath);
					}

					if (t3lib_div::isAllowedAbsPath($readpath)) {
						if (@is_file($readpath)) {	// If file, index it!

								// Get root line:
							$rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);

								// Load indexer if not yet.
							$this->loadIndexerClass();

								// (Re)-Indexing file on page.
							$indexerObj = &t3lib_div::makeInstance('tx_indexedsearch_indexer');
							$indexerObj->backend_initIndexer($cfgRec['pid'], 0, 0, '', $rl);
							$indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
							$indexerObj->hash['phash'] = -1;	// EXPERIMENT - but to avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)

								// Index document (path is given relative to PATH_site):
							$indexerObj->indexRegularDocument(substr($readpath,strlen(PATH_site)), TRUE);
						} elseif (@is_dir($readpath)) {	// If dir, read content and create new pending items for log:

								// Select files and directories in path:
							$extList = implode(',',t3lib_div::trimExplode(',',$cfgRec['extensions'],1));
							$fileArr = array();
							$files = t3lib_div::getAllFilesAndFoldersInPath($fileArr,$readpath,$extList,0,0);

								// Sub directories are only added while the configured depth is not reached:
							$directoryList = t3lib_div::get_dirs($readpath);
							if (is_array($directoryList) && $params['depth'] < $cfgRec['depth']) {
								foreach ($directoryList as $subdir) {
									if ((string)$subdir!='') {
										$files[]= $readpath.$subdir.'/';
									}
								}
							}
							$files = t3lib_div::removePrefixPathFromList($files,PATH_site);

								// traverse the items and create log entries:
							foreach($files as $path) {
								$this->instanceCounter++;
								if ($path!==$params['url']) {
										// Parameters:
									$nparams = array(
										'indexConfigUid' => $cfgRec['uid'],
										'url' => $path,
										'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),
										'depth' => $params['depth']+1
									);
									$pObj->addQueueEntry_callBack($cfgRec['set_id'],$nparams,$this->callBack,$cfgRec['pid'],time()+$this->instanceCounter*$this->secondsPerExternalUrl);
								}
							}
						}
					}
				break;
			}

				// Save process data which might be modified:
			$field_array = array (
				'session_data' => serialize($session_data)
			);
			$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_config','uid='.intval($cfgRec['uid']), $field_array);
		}
	}

	return array('log' => $params);
}
355
/**
 * Look up all old index configurations which are finished and need to be reset and done.
 * A configuration counts as finished when it has a set_id but no entries in tx_crawler_queue with that set_id and exec_time=0.
 * For finished configurations the index rows belonging to earlier sets are deleted and set_id / session_data are reset.
 *
 * @return	void
 */
function cleanUpOldRunningConfigurations() {

		// Tables from which old registrations are removed (list copied from class.tx_indexedsearch_modfunc1.php)
	$tableArr = explode(',','index_phash,index_rel,index_section,index_grlist,index_fulltext,index_debug');

		// Lookup running index configurations:
	$runningIndexingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
		'uid,set_id',
		'index_config',
		'set_id!=0'.t3lib_BEfunc::deleteClause('index_config')
	);
	if (!is_array($runningIndexingConfigurations)) {
		return;
	}

		// For each running configuration, look up how many log entries there are which are scheduled for execution and if none, clear the "set_id" (means; Processing was DONE)
	foreach($runningIndexingConfigurations as $cfgRec) {

			// Look for ended processes:
		list($queued_items) = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
			'count(*) AS count',
			'tx_crawler_queue',
			'set_id='.intval($cfgRec['set_id']).' AND exec_time=0'
		);

		if (!$queued_items['count']) {

				// Lookup old phash rows, i.e. those indexed for this configuration by another set than the current one.
				// set_id is passed through intval() like all other values put into queries in this class.
			$oldPhashRows = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
				'phash,freeIndexUid,freeIndexSetId,externalUrl',
				'index_phash',
				'freeIndexUid='.intval($cfgRec['uid']).' AND freeIndexSetId!='.intval($cfgRec['set_id'])
			);

			if (is_array($oldPhashRows)) {
				foreach($oldPhashRows as $pHashRow) {
						// Removing old registrations for all tables
					foreach($tableArr as $table) {
						$GLOBALS['TYPO3_DB']->exec_DELETEquery($table, 'phash='.intval($pHashRow['phash']));
					}
				}
			}

				// End process by updating index-config record:
			$field_array = array (
				'set_id' => 0,
				'session_data' => '',
			);
			$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_config','uid='.intval($cfgRec['uid']), $field_array);
		}
	}
}
406
407
408
409
410
411
412
413 /*****************************************
414 *
415 * Helper functions
416 *
417 *****************************************/
418
/**
 * Check if an input URL is allowed to be indexed.
 * The URL is cleaned first (a trailing double slash is reduced to a single slash and any "#fragment" is cut off).
 * It is accepted if it contains no "../", lies inside the base URL and is not present in the URL log already.
 *
 * @param	string		URL
 * @param	array		Array of already indexed URLs (input url is looked up here and must not exist already)
 * @param	string		Base URL of the indexing process (input URL must be "inside" the base URL!)
 * @return	mixed		Returns the cleaned URL (string) if OK, otherwise FALSE
 */
function checkUrl($url,$urlLog,$baseUrl) {
		// preg_replace() is used since ereg_replace() is not available in PHP 7 and later.
	$url = preg_replace('/\/\/$/','/',$url);
	list($url) = explode('#',$url);

		// No back paths allowed:
	if (strstr($url,'../')) {
		return FALSE;
	}
		// Must be inside the base URL:
	if (!t3lib_div::isFirstPartOfStr($url,$baseUrl)) {
		return FALSE;
	}
		// Must not be registered already:
	if (in_array($url,$urlLog)) {
		return FALSE;
	}

	return $url;
}
439
/**
 * Indexing External URL.
 * Indexes the URL and returns the hyperlinks found in the fetched content.
 * Links without a scheme are made absolute using scheme, host (and port, if any) of the indexed URL.
 *
 * @param	string		URL, http://....
 * @param	integer		Page id to relate indexing to.
 * @param	array		Rootline array to relate indexing to
 * @param	integer		Configuration UID
 * @param	integer		Set ID
 * @return	array		URLs found on this page
 */
function indexExtUrl($url, $pageId, $rl, $cfgUid, $setId) {

		// Load indexer if not yet.
	$this->loadIndexerClass();

		// Index external URL:
	$indexerObj = &t3lib_div::makeInstance('tx_indexedsearch_indexer');
	$indexerObj->backend_initIndexer($pageId, 0, 0, '', $rl);
	$indexerObj->backend_setFreeIndexUid($cfgUid, $setId);

	$indexerObj->indexExternalUrl($url);
	$url_qParts = parse_url($url);

		// Base (scheme://host[:port]) used to make links without a scheme absolute.
		// The port is kept so links on e.g. http://host:8080/ stay on that server.
	$scheme = isset($url_qParts['scheme']) ? $url_qParts['scheme'] : '';
	$host = isset($url_qParts['host']) ? $url_qParts['host'] : '';
	$baseOfUrl = $scheme.'://'.$host.(isset($url_qParts['port']) ? ':'.$url_qParts['port'] : '');

		// Get URLs on this page:
	$subUrls = array();
	$list = $indexerObj->extractHyperLinks($indexerObj->indexExternalUrl_content);

		// Traverse links:
	if (is_array($list)) {
		foreach($list as $linkInfo) {

				// Decode entities:
			$subUrl = t3lib_div::htmlspecialchars_decode($linkInfo['href']);

			$qParts = parse_url($subUrl);
			if (empty($qParts['scheme'])) {
				if (substr($subUrl,0,2)==='//') {
						// Protocol-relative link ("//host/path"): only the scheme is missing.
					$subUrl = $scheme.':'.$subUrl;
				} else {
					$relativeUrl = t3lib_div::resolveBackPath($subUrl);
						// Avoid a double slash after the host for links which start with "/" already:
					if (substr($relativeUrl,0,1)==='/') {
						$subUrl = $baseOfUrl.$relativeUrl;
					} else {
						$subUrl = $baseOfUrl.'/'.$relativeUrl;
					}
				}
			}

			$subUrls[] = $subUrl;
		}
	}

	return $subUrls;
}
483
/**
 * Makes sure the indexer class file (class.indexer.php of the "indexed_search" extension) has been included.
 *
 * @return	void
 */
function loadIndexerClass() {
		// Declared global so the included file sees it in this function's scope.
		// NOTE(review): presumably needed by code in class.indexer.php - confirm.
	global $TYPO3_CONF_VARS;

	$indexerClassFile = t3lib_extMgm::extPath('indexed_search').'class.indexer.php';
	require_once($indexerClassFile);
}
493
/**
 * Get rootline for closest TypoScript template root.
 * Algorithm same as used in Web > Template, Object browser
 *
 * @param	integer		The page id to traverse rootline back from
 * @return	array		Array where the root lines uid values are found (keys are those of the template root line).
 */
function getUidRootLineForClosestTemplate($id) {
	global $TYPO3_CONF_VARS;

		// Required library classes:
	require_once (PATH_t3lib.'class.t3lib_page.php');
	require_once (PATH_t3lib.'class.t3lib_tstemplate.php');
	require_once (PATH_t3lib.'class.t3lib_tsparser_ext.php');

		// Template parser; time-performance information is not logged:
	$templateParser = t3lib_div::makeInstance('t3lib_tsparser_ext');
	$templateParser->tt_track = 0;
	$templateParser->init();

		// Fetch the root line of the page and run it through the templates.
		// This generates the constants/config + hierarchy info for the template.
	$pageSelect = t3lib_div::makeInstance('t3lib_pageSelect');
	$pageRootLine = $pageSelect->getRootLine($id);
	$templateParser->runThroughTemplates($pageRootLine,0);

		// Collect the uid of each level of the template root line:
	$uidRootLine = array();
	foreach($templateParser->rootLine as $level => $pageRecord) {
		$uidRootLine[$level] = $pageRecord['uid'];
	}

	return $uidRootLine;
}
527 }
528
529
530 if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.crawler.php']) {
531 include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.crawler.php']);
532 }
533 ?>