Cleanup: Updated copyright comments
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / class.crawler.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2001-2011 Kasper Skårhøj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the TYPO3 project. The TYPO3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 * A copy is found in the textfile GPL.txt and important notices to the license
17 * from the author is found in LICENSE.txt distributed with these scripts.
18 *
19 *
20 * This script is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
24 *
25 * This copyright notice MUST APPEAR in all copies of the script!
26 ***************************************************************/
27 /**
28 * Crawler hook for indexed search. Works with the "crawler" extension
29 *
30 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
31 */
32 /**
33 * [CLASS/FUNCTION INDEX of SCRIPT]
34 *
35 *
36 *
37 * 87: class tx_indexedsearch_crawler
38 * 106: function crawler_init(&$pObj)
39 * 219: function crawler_execute($params,&$pObj)
40 * 285: function crawler_execute_type1($cfgRec,&$session_data,$params,&$pObj)
41 * 345: function crawler_execute_type2($cfgRec,&$session_data,$params,&$pObj)
42 * 414: function crawler_execute_type3($cfgRec,&$session_data,$params,&$pObj)
43 * 458: function crawler_execute_type4($cfgRec,&$session_data,$params,&$pObj)
44 * 513: function cleanUpOldRunningConfigurations()
45 *
46 * SECTION: Helper functions
47 * 579: function checkUrl($url,$urlLog,$baseUrl)
48 * 602: function indexExtUrl($url, $pageId, $rl, $cfgUid, $setId)
49 * 645: function indexSingleRecord($r,$cfgRec,$rl=NULL)
50 * 694: function loadIndexerClass()
51 * 706: function getUidRootLineForClosestTemplate($id)
52 * 739: function generateNextIndexingTime($cfgRec)
53 * 778: function checkDeniedSuburls($url, $url_deny)
54 * 798: function addQueueEntryForHook($cfgRec, $title)
55 *
56 * SECTION: Hook functions for TCEmain (indexing of records)
57 * 830: function processDatamap_afterDatabaseOperations($status, $table, $id, $fieldArray, &$pObj)
58 *
59 *
60 * 879: class tx_indexedsearch_files
61 * 888: function crawler_execute($params,&$pObj)
62 * 913: function loadIndexerClass()
63 *
64 * TOTAL FUNCTIONS: 18
65 * (This index is automatically created/updated by the extension "extdeveval")
66 *
67 */
68
69
70
71
// Make sure a backend language object exists before the classes below are used:
// indexSingleRecord() reads $GLOBALS['LANG']->charSet when indexing records.
if (!is_object($GLOBALS['LANG'])) {
		// No language object yet: create one and initialise it with the "lang" value from the backend user's "uc" settings.
		// NOTE(review): this assumes $GLOBALS['BE_USER'] is an object at include time - confirm for CLI/frontend contexts.
	$GLOBALS['LANG'] = t3lib_div::makeInstance('language');
	$GLOBALS['LANG']->init($GLOBALS['BE_USER']->uc['lang']);
}
77
78
79 /**
80 * Crawler hook for indexed search. Works with the "crawler" extension
81 *
82 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
83 * @package TYPO3
84 * @subpackage tx_indexedsearch
85 */
86 class tx_indexedsearch_crawler {
87
88 // Static:
89 var $secondsPerExternalUrl = 3; // Number of seconds to use as interval between queued indexing operations of URLs / files (types 2 & 3)
90
91 // Internal, dynamic:
92 var $instanceCounter = 0; // Counts up for each added URL (type 3)
93
94 // Internal, static:
95 var $callBack = 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'; // The object reference to this class.
96
/**
 * Initialization of crawler hook.
 * This function is called for each instance of the crawler. It selects the indexing configurations
 * which are due (not hidden, start time reached, "timer_next_indexing" in the past and not currently
 * running), marks each of them as running by assigning a new set-ID and adds a start entry to the
 * crawler queue according to the configuration type. Finally, finished configurations are reset.
 *
 * @param	object		$pObj Parent object (tx_crawler lib)
 * @return	void
 */
function crawler_init(&$pObj) {

		// Select all indexing configurations which are waiting to be activated:
	$indexingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
		'*',
		'index_config',
		'hidden=0
			AND (starttime=0 OR starttime<=' . $GLOBALS['EXEC_TIME'] . ')
			AND timer_next_indexing<' . $GLOBALS['EXEC_TIME'] . '
			AND set_id=0
			'.t3lib_BEfunc::deleteClause('index_config')
	);

		// For each configuration, check if it should be executed and if so, start:
	foreach ($indexingConfigurations as $cfgRec) {

			// Generate a unique set-ID:
		$setId = t3lib_div::md5int(microtime());

			// Get next time:
		$nextTime = $this->generateNextIndexingTime($cfgRec);

			// Start process by updating index-config record:
		$field_array = array(
			'set_id' => $setId,
			'timer_next_indexing' => $nextTime,
			'session_data' => '',
		);
		$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_config', 'uid='.intval($cfgRec['uid']), $field_array);

			// Parameters shared by all types. The key order (indexConfigUid, procInstructions, url, depth)
			// is kept as before because the array is handed over to the crawler queue as-is.
		$params = array(
			'indexConfigUid' => $cfgRec['uid'],
			'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),
		);
		$addToQueue = TRUE;

			// Based on configuration type:
		switch ($cfgRec['type']) {
			case 1:	// RECORDS:
				$params['url'] = 'Records (start)';	// Just for show.
			break;
			case 2:	// FILES:
				$params['url'] = $cfgRec['filepath'];
				$params['depth'] = 0;
			break;
			case 3:	// External URL:
				$params['url'] = $cfgRec['externalUrl'];
				$params['depth'] = 0;
			break;
			case 4:	// Page tree (the "url" is the uid of the root page):
				$params['url'] = intval($cfgRec['alternative_source_pid']);
				$params['depth'] = 0;
			break;
			case 5:	// Meta configuration, nothing to do:
				$addToQueue = FALSE;
			break;
			default:	// Custom types are handled by a registered hook object, if any:
				$addToQueue = FALSE;
				$hookReference = isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']])
					? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']]
					: NULL;
				if ($hookReference) {
					$hookObj = t3lib_div::getUserObj($hookReference);

					if (is_object($hookObj)) {
							// $message was never defined before being passed on; initialise it explicitly so the
							// hook receives the same NULL value without an "undefined variable" notice.
						$message = NULL;
						$params['procInstructions'] = array('[Index Cfg UID#'.$cfgRec['uid'].'/CUSTOM]');
						$params['url'] = $hookObj->initMessage($message);
						$addToQueue = TRUE;
					}
				}
			break;
		}

		if ($addToQueue) {
			$pObj->addQueueEntry_callBack($setId, $params, $this->callBack, $cfgRec['pid']);
		}
	}

		// Finally, look up all old index configurations which are finished and needs to be reset and done.
	$this->cleanUpOldRunningConfigurations();
}
210
/**
 * Call back function for execution of a log element.
 * Loads the indexing configuration referenced by the queue entry, dispatches to the handler for
 * its type and writes the (possibly modified) session data back to the configuration record.
 *
 * @param	array		$params Params from log element. Must contain $params['indexConfigUid']
 * @param	object		$pObj Parent object (tx_crawler lib)
 * @return	array		Result array; the input parameters are returned under the key "log"
 */
function crawler_execute($params,&$pObj) {
	$result = array('log' => $params);

		// Indexer configuration ID must exist:
	if (!$params['indexConfigUid']) {
		return $result;
	}

		// Load the indexing configuration record:
	$cfgRec = $GLOBALS['TYPO3_DB']->exec_SELECTgetSingleRow(
		'*',
		'index_config',
		'uid='.intval($params['indexConfigUid'])
	);
	if (!is_array($cfgRec)) {
		return $result;
	}

		// Unpack session data:
	$session_data = unserialize($cfgRec['session_data']);

		// Dispatch on configuration type (loose comparison, the value comes from the database record):
	$type = $cfgRec['type'];
	if ($type == 1) {	// Records
		$this->crawler_execute_type1($cfgRec, $session_data, $params, $pObj);
	} elseif ($type == 2) {	// Files
		$this->crawler_execute_type2($cfgRec, $session_data, $params, $pObj);
	} elseif ($type == 3) {	// External URL
		$this->crawler_execute_type3($cfgRec, $session_data, $params, $pObj);
	} elseif ($type == 4) {	// Page tree
		$this->crawler_execute_type4($cfgRec, $session_data, $params, $pObj);
	} elseif ($type != 5) {	// Type 5 (meta) is a no-op; anything else goes to a registered hook
		if ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$type]) {
			$hookObj = t3lib_div::getUserObj($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$type]);

			if (is_object($hookObj)) {
					// Keep the crawler object around; addQueueEntryForHook() reads it from $this->pObj
				$this->pObj = $pObj;
				$hookObj->indexOperation($cfgRec, $session_data, $params, $this);
			}
		}
	}

		// Save process data which might have been modified by the handler:
	$GLOBALS['TYPO3_DB']->exec_UPDATEquery(
		'index_config',
		'uid='.intval($cfgRec['uid']),
		array('session_data' => serialize($session_data))
	);

	return $result;
}
274
/**
 * Indexing records from a table.
 * Indexes one batch of records (ordered by uid, starting after the uid stored in the session data)
 * and, if any record was found, queues a follow-up entry for the next batch.
 *
 * @param	array		$cfgRec Indexing Configuration Record
 * @param	array		$session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param	array		$params Parameters from the log queue.
 * @param	object		$pObj Parent object (from "crawler" extension!)
 * @return	void
 */
function crawler_execute_type1($cfgRec,&$session_data,$params,&$pObj) {
	$table = $cfgRec['table2index'];

		// Nothing to do unless a table known to TCA is configured:
	if (!$table || !isset($GLOBALS['TCA'][$table])) {
		return;
	}

		// Init session data array if not already:
	if (!is_array($session_data)) {
		$session_data = array(
			'uid' => 0
		);
	}

		// Source page (the alternative source pid wins if set) and batch size (defaults to 100):
	$alternativePid = intval($cfgRec['alternative_source_pid']);
	$pid = $alternativePid ? $alternativePid : $cfgRec['pid'];
	$batchSize = $cfgRec['recordsbatch'] ? t3lib_div::intInRange($cfgRec['recordsbatch'], 1) : 100;

		// Get root line:
	$rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);

		// Select the next batch of records, ordered by uid:
	$whereClause = 'pid = '.intval($pid).'
			AND uid > '.intval($session_data['uid']).
			t3lib_BEfunc::deleteClause($table).
			t3lib_BEfunc::BEenableFields($table);
	$recs = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('*', $table, $whereClause, '', 'uid', $batchSize);

	if (!count($recs)) {
		return;
	}

		// Index each record and remember the uid we processed last:
	foreach ($recs as $record) {
		$this->indexSingleRecord($record, $cfgRec, $rl);
		$session_data['uid'] = $record['uid'];
	}

		// Finally, set entry for next indexing of batch of records ($record still holds the last row):
	$nparams = array(
		'indexConfigUid' => $cfgRec['uid'],
		'url' => 'Records from UID#'.($record['uid']+1).'-?',
		'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']')
	);
	$pObj->addQueueEntry_callBack($cfgRec['set_id'], $nparams, $this->callBack, $cfgRec['pid']);
}
335
/**
 * Indexing files from fileadmin.
 * If the queue entry points to a file, the file is indexed. If it points to a directory, the files
 * in it (and its sub-directories, as long as the configured depth is not reached) are added to the
 * crawler queue as new entries.
 *
 * @param	array		$cfgRec Indexing Configuration Record
 * @param	array		$session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param	array		$params Parameters from the log queue.
 * @param	object		$pObj Parent object (from "crawler" extension!)
 * @return	void
 */
function crawler_execute_type2($cfgRec,&$session_data,$params,&$pObj) {

		// Prepare path, making it absolute and checking:
	$readpath = $params['url'];
	if (!t3lib_div::isAbsPath($readpath)) {
		$readpath = t3lib_div::getFileAbsFileName($readpath);
	}

	if (!t3lib_div::isAllowedAbsPath($readpath)) {
		return;
	}

	if (@is_file($readpath)) {	// If file, index it!

			// Get root line (need to provide this when indexing external files)
		$rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);

			// Load indexer if not yet.
		$this->loadIndexerClass();

			// (Re)-Indexing file on page.
		$indexerObj = t3lib_div::makeInstance('tx_indexedsearch_indexer');
		$indexerObj->backend_initIndexer($cfgRec['pid'], 0, 0, '', $rl);
		$indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
		$indexerObj->hash['phash'] = -1;	// EXPERIMENT - but to avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)

			// Index document (path relative to PATH_site):
		$indexerObj->indexRegularDocument(substr($readpath, strlen(PATH_site)), TRUE);

	} elseif (@is_dir($readpath)) {	// If dir, read content and create new pending items for log:

			// Select files and directories in path:
		$extList = implode(',', t3lib_div::trimExplode(',', $cfgRec['extensions'], 1));
		$fileArr = array();
		$files = t3lib_div::getAllFilesAndFoldersInPath($fileArr, $readpath, $extList, 0, 0);

		$directoryList = t3lib_div::get_dirs($readpath);
		if (is_array($directoryList) && $params['depth'] < $cfgRec['depth']) {
				// The configured start path may lack a trailing slash; without normalising it the
				// sub-directory name would be glued directly onto the parent directory name.
			$directoryPrefix = rtrim($readpath, '/') . '/';
			foreach ($directoryList as $subdir) {
				if ((string)$subdir != '') {
					$files[] = $directoryPrefix.$subdir.'/';
				}
			}
		}
		$files = t3lib_div::removePrefixPathFromList($files, PATH_site);

			// Traverse the items and create log entries:
		foreach ($files as $path) {
				// The counter is raised for every item (also a skipped one) and spreads the scheduled times
			$this->instanceCounter++;
			if ($path !== $params['url']) {
				$nparams = array(
					'indexConfigUid' => $cfgRec['uid'],
					'url' => $path,
					'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),
					'depth' => $params['depth']+1
				);
				$pObj->addQueueEntry_callBack(
					$cfgRec['set_id'],
					$nparams,
					$this->callBack,
					$cfgRec['pid'],
					$GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl
				);
			}
		}
	}
}
410
/**
 * Indexing External URLs.
 * Indexes the URL of the queue entry and, while the configured depth is not reached, queues the
 * links found on it which pass checkUrl() and are not denied by the "url_deny" list.
 *
 * @param	array		$cfgRec Indexing Configuration Record
 * @param	array		$session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param	array		$params Parameters from the log queue.
 * @param	object		$pObj Parent object (from "crawler" extension!)
 * @return	void
 */
function crawler_execute_type3($cfgRec,&$session_data,$params,&$pObj) {

		// First call of the session: start the URL log with the URL of this entry
	if (!is_array($session_data)) {
		$session_data = array(
			'urlLog' => array($params['url'])
		);
	}

		// Index the URL and collect the links found on it:
	$rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
	$subUrls = $this->indexExtUrl($params['url'], $cfgRec['pid'], $rl, $cfgRec['uid'], $cfgRec['set_id']);

		// Stop descending when the configured depth is reached:
	if (!($params['depth'] < $cfgRec['depth'])) {
		return;
	}

	foreach ($subUrls as $candidateUrl) {
			// Skip URLs which are outside the base URL, already logged or explicitly denied:
		$url = $this->checkUrl($candidateUrl, $session_data['urlLog'], $cfgRec['externalUrl']);
		if (!$url || $this->checkDeniedSuburls($url, $cfgRec['url_deny'])) {
			continue;
		}

		$this->instanceCounter++;
		$session_data['urlLog'][] = $url;

			// Queue the URL; the scheduled time is spread out using the instance counter
		$nparams = array(
			'indexConfigUid' => $cfgRec['uid'],
			'url' => $url,
			'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),
			'depth' => $params['depth']+1
		);
		$scheduledTime = $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl;
		$pObj->addQueueEntry_callBack($cfgRec['set_id'], $nparams, $this->callBack, $cfgRec['pid'], $scheduledTime);
	}
}
460
/**
 * Page tree indexing type.
 * Hands the URLs of the page given in the queue entry over to the crawler and, while the configured
 * depth is not reached, queues one entry per subpage.
 *
 * @param	array		$cfgRec Indexing Configuration Record
 * @param	array		$session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param	array		$params Parameters from the log queue.
 * @param	object		$pObj Parent object (from "crawler" extension!)
 * @return	void
 */
function crawler_execute_type4($cfgRec,&$session_data,$params,&$pObj) {

		// Base page uid (for this type the "url" parameter carries a page uid):
	$pageUid = intval($params['url']);

		// Get array of URLs from page:
	$pageRow = t3lib_BEfunc::getRecord('pages', $pageUid);
	$urlSets = $pObj->getUrlsForPageRow($pageRow);

	$duplicateTrack = array();	// Registry for duplicates
	$downloadUrls = array();	// Dummy.

		// Submit URLs (the return value of urlListFromUrlArray() is not used here):
	if (count($urlSets) > 0) {
		foreach ($urlSets as $urlSet) {
			$pObj->urlListFromUrlArray(
				$urlSet,
				$pageRow,
				$GLOBALS['EXEC_TIME'],
				30,
				1,
				0,
				$duplicateTrack,
				$downloadUrls,
				array('tx_indexedsearch_reindex')
			);
		}
	}

		// Stop descending when the configured depth is reached:
	if (!($params['depth'] < $cfgRec['depth'])) {
		return;
	}

		// Subpages selected
	$subPages = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
		'uid,title',
		'pages',
		'pid = '.intval($pageUid).
			t3lib_BEfunc::deleteClause('pages')
	);
	if (!count($subPages)) {
		return;
	}

		// Traverse subpages and add to queue:
	foreach ($subPages as $subPage) {
		$this->instanceCounter++;
		$session_data['urlLog'][] = 'pages:'.$subPage['uid'].': '.$subPage['title'];

		$nparams = array(
			'indexConfigUid' => $cfgRec['uid'],
			'url' => $subPage['uid'],
			'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),
			'depth' => $params['depth']+1
		);
		$scheduledTime = $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl;
		$pObj->addQueueEntry_callBack($cfgRec['set_id'], $nparams, $this->callBack, $cfgRec['pid'], $scheduledTime);
	}
}
535
/**
 * Look up all old index configurations which are finished and needs to be reset and done.
 * A running configuration (set_id != 0) counts as finished when the crawler queue holds no more
 * unexecuted entries (exec_time = 0) for its set-ID. For finished configurations the index entries
 * left over from earlier sets are removed and the configuration is reset (set_id = 0).
 *
 * @return	void
 */
function cleanUpOldRunningConfigurations() {

		// Tables from which old registrations are removed (code copied from class.tx_indexedsearch_modfunc1.php)
	$tableArr = explode(',', 'index_phash,index_rel,index_section,index_grlist,index_fulltext,index_debug');

		// Lookup running index configurations:
	$runningIndexingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
		'uid,set_id',
		'index_config',
		'set_id!=0'.t3lib_BEfunc::deleteClause('index_config')
	);

		// For each running configuration, look up how many log entries there are which are scheduled for execution and if none, clear the "set_id" (means; Processing was DONE)
	foreach ($runningIndexingConfigurations as $cfgRec) {

			// Look for ended processes:
		$queued_items = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows(
			'*',
			'tx_crawler_queue',
			'set_id=' . intval($cfgRec['set_id']) . ' AND exec_time=0'
		);
		if ($queued_items) {
			continue;
		}

			// Lookup old phash rows, i.e. rows of this configuration which belong to another set.
			// set_id is cast to integer like every other value put into a query in this class.
		$oldPhashRows = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
			'phash',
			'index_phash',
			'freeIndexUid='.intval($cfgRec['uid']).' AND freeIndexSetId!='.intval($cfgRec['set_id'])
		);

		foreach ($oldPhashRows as $pHashRow) {
				// Removing old registrations for all tables
			foreach ($tableArr as $table) {
				$GLOBALS['TYPO3_DB']->exec_DELETEquery($table, 'phash='.intval($pHashRow['phash']));
			}
		}

			// End process by updating index-config record:
		$field_array = array(
			'set_id' => 0,
			'session_data' => '',
		);
		$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_config', 'uid='.intval($cfgRec['uid']), $field_array);
	}
}
586
587
588
589
590
591
592
593 /*****************************************
594 *
595 * Helper functions
596 *
597 *****************************************/
598
/**
 * Check if an input URL is allowed to be indexed.
 * The URL is normalised first (a trailing "//" becomes "/", a "#fragment" is cut off). It is accepted
 * if it contains no "../", starts with the base URL and is not yet present in the URL log.
 *
 * @param	string		$url URL string to check
 * @param	array		$urlLog Array of already indexed URLs (input url is looked up here and must not exist already)
 * @param	string		$baseUrl Base URL of the indexing process (input URL must be "inside" the base URL!)
 * @return	mixed		Returns the normalised URL (string) if OK, otherwise FALSE
 */
function checkUrl($url,$urlLog,$baseUrl) {
	$url = preg_replace('/\/\/$/', '/', $url);
	list($url) = explode('#', $url);

		// Back paths are not accepted:
	if (strstr($url, '../')) {
		return FALSE;
	}
		// The URL must be located inside the base URL:
	if (!t3lib_div::isFirstPartOfStr($url, $baseUrl)) {
		return FALSE;
	}
		// The URL must not have been logged already:
	if (in_array($url, $urlLog)) {
		return FALSE;
	}

	return $url;
}
619
/**
 * Indexing External URL.
 * Indexes the given URL and returns the links found in the fetched content. Links without a scheme
 * are made absolute: links starting with "/" relative to scheme and host of the indexed URL, all
 * others relative to the base href of the document (or the directory of the indexed URL if the
 * document defines none).
 *
 * @param	string		$url URL, http://....
 * @param	integer		$pageId Page id to relate indexing to.
 * @param	array		$rl Rootline array to relate indexing to
 * @param	integer		$cfgUid Configuration UID
 * @param	integer		$setId Set ID value
 * @return	array		URLs found on this page
 */
function indexExtUrl($url, $pageId, $rl, $cfgUid, $setId) {

		// Load indexer if not yet.
	$this->loadIndexerClass();

		// Index external URL:
	$indexerObj = t3lib_div::makeInstance('tx_indexedsearch_indexer');
	$indexerObj->backend_initIndexer($pageId, 0, 0, '', $rl);
	$indexerObj->backend_setFreeIndexUid($cfgUid, $setId);
	$indexerObj->hash['phash'] = -1;	// To avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)

	$indexerObj->indexExternalUrl($url);

		// Split the URL. parse_url() returns FALSE for seriously malformed URLs and leaves out missing
		// components, so every part is read defensively (an absent part counts as empty string).
	$url_qParts = parse_url($url);
	if (!is_array($url_qParts)) {
		$url_qParts = array();
	}
	$urlScheme = isset($url_qParts['scheme']) ? $url_qParts['scheme'] : '';
	$urlHost = isset($url_qParts['host']) ? $url_qParts['host'] : '';
	$urlPath = isset($url_qParts['path']) ? $url_qParts['path'] : '';

	$baseAbsoluteHref = $urlScheme . '://' . $urlHost;
	$baseHref = $indexerObj->extractBaseHref($indexerObj->indexExternalUrl_content);
	if (!$baseHref) {
			// Extract base href from current URL (everything up to the last slash of the path)
		$baseHref = $baseAbsoluteHref;
		$lastSlashPosition = strrpos($urlPath, '/');
		if ($lastSlashPosition !== FALSE) {
			$baseHref .= substr($urlPath, 0, $lastSlashPosition);
		}
	}
	$baseHref = rtrim($baseHref, '/');

		// Get URLs on this page:
	$subUrls = array();
	$list = $indexerObj->extractHyperLinks($indexerObj->indexExternalUrl_content);

		// Traverse links:
	foreach ($list as $linkInfo) {

			// Decode entities:
		$subUrl = t3lib_div::htmlspecialchars_decode($linkInfo['href']);

			// Links without a scheme are relative and must be made absolute:
		$qParts = parse_url($subUrl);
		if (!is_array($qParts) || empty($qParts['scheme'])) {
			$relativeUrl = t3lib_div::resolveBackPath($subUrl);
				// substr() instead of the {0} string offset syntax, which is a parse error since PHP 8.0
			if (substr($relativeUrl, 0, 1) === '/') {
				$subUrl = $baseAbsoluteHref . $relativeUrl;
			} else {
				$subUrl = $baseHref . '/' . $relativeUrl;
			}
		}

		$subUrls[] = $subUrl;
	}

	return $subUrls;
}
678
/**
 * Indexing Single Record.
 * The first field of the configured field list is used as title, the remaining fields are
 * concatenated (separated by a space) as content. HTML tags are stripped from both.
 *
 * @param	array		$r Record to index
 * @param	array		$cfgRec Configuration Record
 * @param	array		$rl Rootline array to relate indexing to. If not an array, it is looked up for the page of the configuration record.
 * @return	void
 */
function indexSingleRecord($r,$cfgRec,$rl=NULL) {

		// Load indexer if not yet.
	$this->loadIndexerClass();

		// Init:
	$rl = is_array($rl) ? $rl : $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
	$fieldList = t3lib_div::trimExplode(',', $cfgRec['fieldlist'], 1);
	$tableCtrl = $GLOBALS['TCA'][$cfgRec['table2index']]['ctrl'];
	$languageField = $tableCtrl['languageField'];
	$sys_language_uid = $languageField ? $r[$languageField] : 0;

		// (Re)-Indexing a row from a table:
	$indexerObj = t3lib_div::makeInstance('tx_indexedsearch_indexer');
	$GETparams = array();
	parse_str(str_replace('###UID###', $r['uid'], $cfgRec['get_params']), $GETparams);
	$indexerObj->backend_initIndexer($cfgRec['pid'], 0, $sys_language_uid, '', $rl, $GETparams, $cfgRec['chashcalc'] ? TRUE : FALSE);
	$indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
	$indexerObj->forceIndexing = TRUE;

		// Title must be initialised: with an empty field list the loop below never assigns it.
	$theTitle = '';
	$theContent = '';
	foreach ($fieldList as $k => $v) {
		if (!$k) {
				// First field of the list is the title
			$theTitle = $r[$v];
		} else {
			$theContent .= $r[$v].' ';
		}
	}

		// Indexing the record as a page (but with parameters set, see ->backend_setFreeIndexUid())
		// A space is put in front of each tag so that words separated only by tags do not merge when tags are stripped.
	$indexerObj->backend_indexAsTYPO3Page(
		strip_tags(str_replace('<', ' <', $theTitle)),
		'',
		'',
		strip_tags(str_replace('<', ' <', $theContent)),
		$GLOBALS['LANG']->charSet,	// Requires that $GLOBALS['LANG'] is an initialised language object
		$r[$tableCtrl['tstamp']],
		$r[$tableCtrl['crdate']],
		$r['uid']
	);
}
727
/**
 * Include indexer class.
 * Includes "class.indexer.php" from the "indexed_search" extension; require_once makes sure this
 * happens only once, so the method can be called before every use of the indexer.
 *
 * @return	void
 */
function loadIndexerClass() {
		// NOTE(review): this global looks unused, but require_once executes the included file in the
		// scope of this function, so the declaration presumably makes $TYPO3_CONF_VARS available to
		// class.indexer.php - confirm against that file before removing it.
	global $TYPO3_CONF_VARS;
	require_once(t3lib_extMgm::extPath('indexed_search').'class.indexer.php');
}
737
/**
 * Get rootline for closest TypoScript template root.
 * Algorithm same as used in Web > Template, Object browser
 *
 * @param	integer		$id The page id to traverse rootline back from
 * @return	array		Array with the uid values of the template parser's root line, keyed like that root line
 */
function getUidRootLineForClosestTemplate($id) {
	global $TYPO3_CONF_VARS;

		// Set up a template parser without time-performance logging:
	$templateParser = t3lib_div::makeInstance('t3lib_tsparser_ext');
	$templateParser->tt_track = 0;
	$templateParser->init();

		// Run the templates along the root line of the page through the parser.
		// This generates the constants/config + hierarchy info for the template.
	$pageSelect = t3lib_div::makeInstance('t3lib_pageSelect');
	$templateParser->runThroughTemplates($pageSelect->getRootLine($id), 0);

		// Reduce the parser's root line to the uid values, keeping the keys:
	$uidList = array();
	foreach ($templateParser->rootLine as $level => $pageRecord) {
		$uidList[$level] = $pageRecord['uid'];
	}

	return $uidList;
}
765
/**
 * Generate the unix time stamp for next visit.
 * Starting from a midnight plus the configured offset, the next time is the first multiple of the
 * frequency which is not before the current execution time.
 *
 * @param	array		$cfgRec Index configuration record (reads "timer_frequency", "timer_offset" and "timer_next_indexing")
 * @return	integer		The next time stamp
 */
function generateNextIndexingTime($cfgRec) {
	$currentTime = $GLOBALS['EXEC_TIME'];

		// Now, find a midnight time to use for offset calculation. This has to differ depending on whether we have frequencies within a day or more than a day; Less than a day, we don't care which day to use for offset, more than a day we want to respect the currently entered day as offset regardless of when the script is run - thus the day-of-week used in case "Weekly" is selected will be respected
	if ($cfgRec['timer_frequency'] <= 24*3600) {
			// Midnight of yesterday
		$aMidNight = mktime(0, 0, 0) - 1*24*3600;
	} else {
		$lastTime = $cfgRec['timer_next_indexing'] ? $cfgRec['timer_next_indexing'] : $GLOBALS['EXEC_TIME'];
			// Four-digit year ("Y"): a two-digit year would leave the century to mktime()'s guessing rules
		$aMidNight = mktime(0, 0, 0, date('m', $lastTime), date('d', $lastTime), date('Y', $lastTime));
	}

		// Find last offset time plus frequency in seconds (offset limited to one day, frequency at least 1 second):
	$lastSureOffset = $aMidNight + t3lib_div::intInRange($cfgRec['timer_offset'], 0, 86400);
	$frequencySeconds = t3lib_div::intInRange($cfgRec['timer_frequency'], 1);

		// Now, find out how many blocks of the length of frequency there is until the next time:
	$frequencyBlocksUntilNextTime = ceil(($currentTime - $lastSureOffset) / $frequencySeconds);

		// Set next time to the offset + the frequencyblocks multiplied with the frequency length in seconds.
		// ceil() yields a float; cast so that the documented integer is returned.
	return (int) ($lastSureOffset + $frequencyBlocksUntilNextTime * $frequencySeconds);
}
795
/**
 * Checks if $url starts with any of the URLs in the $url_deny "list" and if so, returns TRUE.
 *
 * @param	string		$url URL to test
 * @param	string		$url_deny String where URLs are separated by line-breaks; If any of these strings is the first part of $url, the function returns TRUE (to indicate denial of descent)
 * @return	boolean		TRUE if there is a matching URL (hence, do not index!)
 */
function checkDeniedSuburls($url, $url_deny) {
	if (!trim($url_deny)) {
		return FALSE;
	}

		// One denied URL prefix per line, empty lines are ignored:
	$url_denyArray = t3lib_div::trimExplode(LF, $url_deny, 1);
	foreach ($url_denyArray as $testurl) {
		if (t3lib_div::isFirstPartOfStr($url, $testurl)) {
				// The debug "echo" which used to be here has been removed: a check function must not write to the output.
			return TRUE;
		}
	}

	return FALSE;
}
815
/**
 * Adding entry in queue for Hook.
 * Uses the crawler object stored in $this->pObj, which crawler_execute() sets right before it
 * calls the hook object.
 *
 * @param	array		$cfgRec Configuration record
 * @param	string		$title Title/URL
 * @return	void
 */
function addQueueEntryForHook($cfgRec, $title) {
	$configurationUid = $cfgRec['uid'];

	$this->pObj->addQueueEntry_callBack(
		$cfgRec['set_id'],
		array(
			'indexConfigUid' => $configurationUid,	// This must ALWAYS be the cfgRec uid!
			'url' => $title,
			'procInstructions' => array('[Index Cfg UID#'.$configurationUid.']')	// Just for information: shows that an indexing configuration added the entry.
		),
		$this->callBack,
		$cfgRec['pid']
	);
}
832
/**
 * Deletes all data stored by indexed search for a given page.
 * The phash values registered for the page in "index_section" are looked up and all rows with
 * these phash values are removed from the index tables.
 *
 * @param	integer		$id Uid of the page to delete all pHash
 * @return	void
 */
function deleteFromIndex($id) {

		// Lookup phash rows registered for the page:
	$phashRows = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('phash', 'index_section', 'page_id='.intval($id));
	if (!count($phashRows)) {
		return;
	}

		// Collect the phash values and build one condition for all of them:
	$phashList = array();
	foreach ($phashRows as $phashRow) {
		$phashList[] = $phashRow['phash'];
	}
	$condition = 'phash IN ('.implode(',', $GLOBALS['TYPO3_DB']->cleanIntArray($phashList)).')';

		// Remove the rows from every index table:
	$indexTables = array('index_debug', 'index_fulltext', 'index_grlist', 'index_phash', 'index_rel', 'index_section');
	foreach ($indexTables as $indexTable) {
		$GLOBALS['TYPO3_DB']->exec_DELETEquery($indexTable, $condition);
	}
}
857
858
859
860
861
862
863
864 /*************************
865 *
866 * Hook functions for TCEmain (indexing of records)
867 *
868 *************************/
869
/**
 * TCEmain hook function called before a command is processed.
 * Removes the index data of a page when the page is about to be deleted.
 *
 * @param	string		$command TCEmain command
 * @param	string		$table Table name
 * @param	string		$id Record ID (for pages: the uid of the page)
 * @param	mixed		$value Target value (ignored)
 * @param	object		$pObj Reference to tcemain calling object
 * @return	void
 */
function processCmdmap_preProcess($command, $table, $id, $value, $pObj) {
	$isPageDeletion = ($table == 'pages' && $command == 'delete');

		// Clean up the index
	if ($isPageDeletion) {
		$this->deleteFromIndex($id);
	}
}
887
/**
 * TCEmain hook function for on-the-fly indexing of database records.
 * If a page gets hidden or excluded from search by the update, its index data is removed. After
 * that the saved record is indexed with every matching "records" indexing configuration which has
 * "records_indexonchange" set and is not running.
 *
 * @param	string		$status Status "new" or "update"
 * @param	string		$table Table name
 * @param	string		$id Record ID. If new record its a string pointing to index inside t3lib_tcemain::substNEWwithIDs
 * @param	array		$fieldArray Field array of updated fields in the operation
 * @param	object		$pObj Reference to tcemain calling object
 * @return	void
 */
function processDatamap_afterDatabaseOperations($status, $table, $id, $fieldArray, $pObj) {

		// Check if any fields are actually updated:
	if (!count($fieldArray)) {
		return;
	}

	if ($status == 'new') {
			// Translate new ids.
		$id = $pObj->substNEWwithIDs[$id];
	} elseif ($table == 'pages' && $status == 'update') {
		$pageGetsHidden = array_key_exists('hidden', $fieldArray) && $fieldArray['hidden'] == 1;
		$pageGetsExcluded = array_key_exists('no_search', $fieldArray) && $fieldArray['no_search'] == 1;

			// If the page should be hidden or not indexed after update, delete index for this page
		if ($pageGetsHidden || $pageGetsExcluded) {
			$this->deleteFromIndex($id);
		}
	}

		// Get full record and if exists, search for indexing configurations:
	$currentRecord = t3lib_BEfunc::getRecord($table, $id);
	if (!is_array($currentRecord)) {
		return;
	}
	$recordPid = intval($currentRecord['pid']);

		// Select all (not running) indexing configurations of type "record" (1) and which points to this table and is located on the same page as the record or pointing to the right source PID
	$indexingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
		'*',
		'index_config',
		'hidden=0
			AND (starttime=0 OR starttime<=' . $GLOBALS['EXEC_TIME'] . ')
			AND set_id=0
			AND type=1
			AND table2index='.$GLOBALS['TYPO3_DB']->fullQuoteStr($table,'index_config').'
			AND (
					(alternative_source_pid=0 AND pid='.$recordPid.')
					OR (alternative_source_pid='.$recordPid.')
				)
			AND records_indexonchange=1
			'.t3lib_BEfunc::deleteClause('index_config')
	);

	foreach ($indexingConfigurations as $cfgRec) {
		$this->indexSingleRecord($currentRecord, $cfgRec);
	}
}
940 }
941
942
/**
 * Crawler hook for indexed search. Works with the "crawler" extension
 * This hook is specifically used to index external files found on pages through the crawler extension.
 *
 * @author	Kasper Skårhøj <kasperYYYY@typo3.com>
 * @package TYPO3
 * @subpackage tx_indexedsearch
 * @see tx_indexedsearch_indexer::extractLinks()
 */
class tx_indexedsearch_files {

	/**
	 * Call back function for execution of a log element.
	 * Indexes the document given in the parameters with an indexer configured from $params['conf'].
	 *
	 * @param	array		$params Params from log element (reads "conf", "document" and "alturl").
	 * @param	object		$pObj Parent object (tx_crawler lib)
	 * @return	array		Result array. Nothing is returned if $params['conf'] is not an array.
	 */
	function crawler_execute($params,&$pObj) {

			// Load indexer if not yet.
		$this->loadIndexerClass();

			// Without a configuration array there is nothing to index:
		if (!is_array($params['conf'])) {
			return;
		}

			// Initialize the indexer class:
		$indexer = t3lib_div::makeInstance('tx_indexedsearch_indexer');
		$indexer->conf = $params['conf'];
		$indexer->init();

			// Index document. If an alternative URL is given, it is passed as the file to index, together
			// with the document name and the lowercased extension taken from the document name.
		if (!$params['alturl']) {
			$indexer->indexRegularDocument($params['document'], TRUE);
		} else {
			$documentInfo = pathinfo($params['document']);
			$extension = strtolower($documentInfo['extension']);
			$indexer->indexRegularDocument($params['alturl'], TRUE, $params['document'], $extension);
		}

			// Return OK:
		return array('content' => array());
	}

	/**
	 * Include indexer class.
	 *
	 * @return	void
	 */
	function loadIndexerClass() {
			// Kept on purpose: require_once runs the included file in the scope of this method.
		global $TYPO3_CONF_VARS;

		$indexerClassFile = t3lib_extMgm::extPath('indexed_search').'class.indexer.php';
		require_once($indexerClassFile);
	}
}
997
998
// XCLASS support: include an extending class file if one is registered for this script in the current TYPO3 mode.
if (defined('TYPO3_MODE')) {
	if (isset($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['ext/indexed_search/class.crawler.php'])) {
		include_once($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['ext/indexed_search/class.crawler.php']);
	}
}
1002
1003 ?>