[!!!][BUGFIX] *_user table password field is too short
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / class.crawler.php
1 <?php
2 /***************************************************************
3 * Copyright notice
4 *
5 * (c) 2001-2011 Kasper Skårhøj (kasperYYYY@typo3.com)
6 * All rights reserved
7 *
8 * This script is part of the TYPO3 project. The TYPO3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
16 * A copy is found in the textfile GPL.txt and important notices to the license
17 * from the author is found in LICENSE.txt distributed with these scripts.
18 *
19 *
20 * This script is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
24 *
25 * This copyright notice MUST APPEAR in all copies of the script!
26 ***************************************************************/
27 /**
28 * Crawler hook for indexed search. Works with the "crawler" extension
29 *
30 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
31 */
32
33
34
// Make sure a language object exists, so that the backend charset is available
// (indexSingleRecord() reads $GLOBALS['LANG']->charSet).
// isset() avoids an "undefined index" notice when LANG has never been set at all.
if (!isset($GLOBALS['LANG']) || !is_object($GLOBALS['LANG'])) {
	$GLOBALS['LANG'] = t3lib_div::makeInstance('language');
	$GLOBALS['LANG']->init($GLOBALS['BE_USER']->uc['lang']);
}
40
41
42 /**
43 * Crawler hook for indexed search. Works with the "crawler" extension
44 *
45 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
46 * @package TYPO3
47 * @subpackage tx_indexedsearch
48 */
49 class tx_indexedsearch_crawler {
50
51 // Static:
52 var $secondsPerExternalUrl = 3; // Number of seconds to use as interval between queued indexing operations of URLs / files (types 2 & 3)
53
54 // Internal, dynamic:
55 var $instanceCounter = 0; // Counts up for each added URL (type 3)
56
57 // Internal, static:
58 var $callBack = 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'; // The object reference to this class.
59
/**
 * Initialization of the crawler hook.
 * Called for each instance of the crawler: selects the indexing configurations whose timer has expired,
 * flags each of them as running (fresh set_id, next indexing time, empty session data) and adds the
 * first queue entry for it. Finally, configurations whose queue has run empty are reset.
 *
 * @param object Parent object (tx_crawler lib)
 * @return void
 */
function crawler_init(&$pObj) {
	$now = $GLOBALS['EXEC_TIME'];

	// Configurations waiting to be activated: not hidden, start time reached, timer expired, no set running.
	$where = 'hidden=0
		AND (starttime=0 OR starttime<=' . $now . ')
		AND timer_next_indexing<' . $now . '
		AND set_id=0
		' . t3lib_BEfunc::deleteClause('index_config');
	$dueConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('*', 'index_config', $where);

	foreach ($dueConfigurations as $cfgRec) {

		// A unique set-ID identifies this indexing run:
		$setId = t3lib_div::md5int(microtime());

		// Mark the configuration as running and schedule its next run:
		$GLOBALS['TYPO3_DB']->exec_UPDATEquery(
			'index_config',
			'uid=' . intval($cfgRec['uid']),
			array(
				'set_id' => $setId,
				'timer_next_indexing' => $this->generateNextIndexingTime($cfgRec),
				'session_data' => '',
			)
		);

		// Build the parameters of the first queue entry; stays NULL if the type queues nothing.
		$params = NULL;
		$uid = $cfgRec['uid'];
		$instruction = '[Index Cfg UID#' . $uid . ']';

		switch ($cfgRec['type']) {
			case 1:
				// Records; the "url" is just for show.
				$params = array(
					'indexConfigUid' => $uid,
					'procInstructions' => array($instruction),
					'url' => 'Records (start)',
				);
				break;
			case 2:
				// Files: start at the configured file path.
				$params = array(
					'indexConfigUid' => $uid,
					'procInstructions' => array($instruction),
					'url' => $cfgRec['filepath'],
					'depth' => 0
				);
				break;
			case 3:
				// External URL: start at the configured URL.
				$params = array(
					'indexConfigUid' => $uid,
					'procInstructions' => array($instruction),
					'url' => $cfgRec['externalUrl'],
					'depth' => 0
				);
				break;
			case 4:
				// Page tree: the "url" carries the uid of the root page.
				$params = array(
					'indexConfigUid' => $uid,
					'procInstructions' => array($instruction),
					'url' => intval($cfgRec['alternative_source_pid']),
					'depth' => 0
				);
				break;
			case 5:
				// Meta configuration, nothing to queue.
				break;
			default:
				// Custom type: handled by a hook object registered for this type, if any.
				$hookReference = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']];
				if ($hookReference) {
					$hookObj = t3lib_div::getUserObj($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']]);
					if (is_object($hookObj)) {
						$params = array(
							'indexConfigUid' => $uid,
							'procInstructions' => array('[Index Cfg UID#' . $uid . '/CUSTOM]'),
							// NOTE(review): $message is never assigned in this method - confirm what the hook expects.
							'url' => $hookObj->initMessage($message),
						);
					}
				}
				break;
		}

		if (is_array($params)) {
			$pObj->addQueueEntry_callBack($setId, $params, $this->callBack, $cfgRec['pid']);
		}
	}

	// Finally, reset all running configurations which have no pending queue entries left.
	$this->cleanUpOldRunningConfigurations();
}
173
/**
 * Call back function executing one queue entry that was added by this class.
 * Loads the indexing configuration named in the entry, hands over to the handler for its type
 * and writes the (possibly modified) session data back to the configuration record.
 *
 * @param array Params from log element. Must contain $params['indexConfigUid']
 * @param object Parent object (tx_crawler lib)
 * @return array Result array, holding the params under the key "log"
 */
function crawler_execute($params, &$pObj) {

	// Without an indexing configuration uid there is nothing to do:
	if (!$params['indexConfigUid']) {
		return array('log' => $params);
	}

	$cfgRec = $GLOBALS['TYPO3_DB']->exec_SELECTgetSingleRow(
		'*',
		'index_config',
		'uid=' . intval($params['indexConfigUid'])
	);
	if (!is_array($cfgRec)) {
		return array('log' => $params);
	}

	// Session data is handed to the handlers by reference and saved again below:
	$session_data = unserialize($cfgRec['session_data']);

	switch ($cfgRec['type']) {
		case 1:
			// Records
			$this->crawler_execute_type1($cfgRec, $session_data, $params, $pObj);
			break;
		case 2:
			// Files
			$this->crawler_execute_type2($cfgRec, $session_data, $params, $pObj);
			break;
		case 3:
			// External URL
			$this->crawler_execute_type3($cfgRec, $session_data, $params, $pObj);
			break;
		case 4:
			// Page tree
			$this->crawler_execute_type4($cfgRec, $session_data, $params, $pObj);
			break;
		case 5:
			// Meta configuration: crawler_init() queues nothing for it, so this should never be reached.
			break;
		default:
			// Custom type: delegate to the hook object registered for this type, if any.
			$hookReference = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']];
			if ($hookReference) {
				$hookObj = t3lib_div::getUserObj($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']]);
				if (is_object($hookObj)) {
					// Stored for addQueueEntryForHook(), which the hook can call on $this:
					$this->pObj = $pObj;
					$hookObj->indexOperation($cfgRec, $session_data, $params, $this);
				}
			}
			break;
	}

	// Save the process data, which the handler might have modified:
	$GLOBALS['TYPO3_DB']->exec_UPDATEquery(
		'index_config',
		'uid=' . intval($cfgRec['uid']),
		array('session_data' => serialize($session_data))
	);

	return array('log' => $params);
}
237
/**
 * Indexing records from a table, one batch per call.
 * The uid of the last indexed record is kept in the session data; as long as a batch yields records,
 * a new queue entry is added for the following batch.
 *
 * @param array Indexing Configuration Record
 * @param array Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param array Parameters from the log queue.
 * @param object Parent object (from "crawler" extension!)
 * @return void
 */
function crawler_execute_type1($cfgRec, &$session_data, $params, &$pObj) {
	$table = $cfgRec['table2index'];

	// The table must be set and known to TCA:
	if (!$table || !isset($GLOBALS['TCA'][$table])) {
		return;
	}

	// First call of this run: start before the first record.
	if (!is_array($session_data)) {
		$session_data = array(
			'uid' => 0
		);
	}

	// Source page: the alternative source pid if set, otherwise the page of the configuration.
	$pid = intval($cfgRec['alternative_source_pid']);
	if (!$pid) {
		$pid = $cfgRec['pid'];
	}

	// Batch size, defaulting to 100 records:
	$batchSize = 100;
	if ($cfgRec['recordsbatch']) {
		$batchSize = t3lib_utility_Math::forceIntegerInRange($cfgRec['recordsbatch'], 1);
	}

	$rootLine = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);

	// Next batch: records on the source page with a uid above the last processed one, ordered by uid.
	$where = 'pid = ' . intval($pid) . '
		AND uid > ' . intval($session_data['uid']) .
		t3lib_BEfunc::deleteClause($table) .
		t3lib_BEfunc::BEenableFields($table);
	$rows = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('*', $table, $where, '', 'uid', $batchSize);

	if (!count($rows)) {
		return;
	}

	$lastUid = NULL;
	foreach ($rows as $row) {
		$this->indexSingleRecord($row, $cfgRec, $rootLine);

		// Remember how far we got:
		$lastUid = $row['uid'];
		$session_data['uid'] = $lastUid;
	}

	// Queue the entry for the following batch of records:
	$nparams = array(
		'indexConfigUid' => $cfgRec['uid'],
		'url' => 'Records from UID#' . ($lastUid + 1) . '-?',
		'procInstructions' => array('[Index Cfg UID#' . $cfgRec['uid'] . ']')
	);
	$pObj->addQueueEntry_callBack($cfgRec['set_id'], $nparams, $this->callBack, $cfgRec['pid']);
}
298
/**
 * Indexing files from the file system.
 * If the queue entry points to a file, the file is indexed. If it points to a directory, one new queue
 * entry is added per matching file in it and, while the configured depth is not reached, per sub-directory.
 *
 * @param array Indexing Configuration Record
 * @param array Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param array Parameters from the log queue.
 * @param object Parent object (from "crawler" extension!)
 * @return void
 */
function crawler_execute_type2($cfgRec, &$session_data, $params, &$pObj) {

	// Prepare path, making it absolute and checking it:
	$readpath = $params['url'];
	if (!t3lib_div::isAbsPath($readpath)) {
		$readpath = t3lib_div::getFileAbsFileName($readpath);
	}
	if (!t3lib_div::isAllowedAbsPath($readpath)) {
		return;
	}

	// A file: index it.
	if (@is_file($readpath)) {

		// Root line must be provided when indexing external files:
		$rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);

		$this->loadIndexerClass();

		// (Re)-indexing file on page:
		$indexerObj = t3lib_div::makeInstance('tx_indexedsearch_indexer');
		$indexerObj->backend_initIndexer($cfgRec['pid'], 0, 0, '', $rl);
		$indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
		// EXPERIMENT - but to avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)
		$indexerObj->hash['phash'] = -1;

		// The indexer gets the path relative to PATH_site:
		$indexerObj->indexRegularDocument(substr($readpath, strlen(PATH_site)), TRUE);
		return;
	}

	if (!@is_dir($readpath)) {
		return;
	}

	// A directory: read its content and create new pending queue entries.
	$extList = implode(',', t3lib_div::trimExplode(',', $cfgRec['extensions'], 1));
	$fileArr = array();
	$files = t3lib_div::getAllFilesAndFoldersInPath($fileArr, $readpath, $extList, 0, 0);

	$directoryList = t3lib_div::get_dirs($readpath);
	if (is_array($directoryList) && $params['depth'] < $cfgRec['depth']) {
		// Exactly one trailing slash, so sub-directory paths are also correct
		// when the path was configured without one.
		$dirPath = rtrim($readpath, '/') . '/';
		foreach ($directoryList as $subdir) {
			if ((string)$subdir != '') {
				$files[] = $dirPath . $subdir . '/';
			}
		}
	}
	$files = t3lib_div::removePrefixPathFromList($files, PATH_site);

	// Traverse the items and create queue entries, spread out in time:
	foreach ($files as $path) {
		$this->instanceCounter++;
		if ($path === $params['url']) {
			// Never re-queue the entry being processed right now.
			continue;
		}

		$nparams = array(
			'indexConfigUid' => $cfgRec['uid'],
			'url' => $path,
			'procInstructions' => array('[Index Cfg UID#' . $cfgRec['uid'] . ']'),
			'depth' => $params['depth'] + 1
		);
		$pObj->addQueueEntry_callBack(
			$cfgRec['set_id'],
			$nparams,
			$this->callBack,
			$cfgRec['pid'],
			$GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl
		);
	}
}
373
/**
 * Indexing External URLs.
 * Indexes the URL of the queue entry and, while the configured depth is not reached, queues every
 * link found on it that passes checkUrl() and is not denied by checkDeniedSuburls().
 *
 * @param array Indexing Configuration Record
 * @param array Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param array Parameters from the log queue.
 * @param object Parent object (from "crawler" extension!)
 * @return void
 */
function crawler_execute_type3($cfgRec, &$session_data, $params, &$pObj) {

	// First call of this run: the url log starts with the entry URL.
	if (!is_array($session_data)) {
		$session_data = array(
			'urlLog' => array($params['url'])
		);
	}

	// Index the URL and collect the links found on it:
	$rootLine = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
	$foundUrls = $this->indexExtUrl($params['url'], $cfgRec['pid'], $rootLine, $cfgRec['uid'], $cfgRec['set_id']);

	// Stop descending once the configured depth is reached:
	if (!($params['depth'] < $cfgRec['depth'])) {
		return;
	}

	foreach ($foundUrls as $candidate) {
		$acceptedUrl = $this->checkUrl($candidate, $session_data['urlLog'], $cfgRec['externalUrl']);
		if (!$acceptedUrl) {
			continue;
		}
		if ($this->checkDeniedSuburls($acceptedUrl, $cfgRec['url_deny'])) {
			continue;
		}

		$this->instanceCounter++;
		$session_data['urlLog'][] = $acceptedUrl;

		// Queue the URL, spread out in time:
		$nparams = array(
			'indexConfigUid' => $cfgRec['uid'],
			'url' => $acceptedUrl,
			'procInstructions' => array('[Index Cfg UID#' . $cfgRec['uid'] . ']'),
			'depth' => $params['depth'] + 1
		);
		$pObj->addQueueEntry_callBack(
			$cfgRec['set_id'],
			$nparams,
			$this->callBack,
			$cfgRec['pid'],
			$GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl
		);
	}
}
423
/**
 * Page tree indexing type.
 * Submits the URLs of the page named in the queue entry to the crawler and, while the configured
 * depth is not reached, adds one queue entry per subpage.
 *
 * @param array Indexing Configuration Record
 * @param array Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param array Parameters from the log queue.
 * @param object Parent object (from "crawler" extension!)
 * @return void
 */
function crawler_execute_type4($cfgRec, &$session_data, $params, &$pObj) {

	// For this type the "url" of the queue entry carries the page uid:
	$pageUid = intval($params['url']);

	// Get array of URLs from page:
	$pageRow = t3lib_BEfunc::getRecord('pages', $pageUid);
	$urlSets = $pObj->getUrlsForPageRow($pageRow);

	// Registry for duplicates and a dummy; both are handed over to the crawler.
	$duplicateTrack = array();
	$downloadUrls = array();

	// Submit URLs (the return value of the crawler is not used):
	if (count($urlSets)) {
		foreach ($urlSets as $paramSetKey => $urlSet) {
			$pObj->urlListFromUrlArray(
				$urlSet,
				$pageRow,
				$GLOBALS['EXEC_TIME'],
				30,
				1,
				0,
				$duplicateTrack,
				$downloadUrls,
				array('tx_indexedsearch_reindex')
			);
		}
	}

	// Stop descending once the configured depth is reached:
	if (!($params['depth'] < $cfgRec['depth'])) {
		return;
	}

	// Select the subpages:
	$subpages = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
		'uid,title',
		'pages',
		'pid = ' . intval($pageUid) . t3lib_BEfunc::deleteClause('pages')
	);
	if (!count($subpages)) {
		return;
	}

	// Traverse subpages and add them to the queue, spread out in time:
	foreach ($subpages as $subpage) {
		$this->instanceCounter++;
		$session_data['urlLog'][] = 'pages:' . $subpage['uid'] . ': ' . $subpage['title'];

		$nparams = array(
			'indexConfigUid' => $cfgRec['uid'],
			'url' => $subpage['uid'],
			'procInstructions' => array('[Index Cfg UID#' . $cfgRec['uid'] . ']'),
			'depth' => $params['depth'] + 1
		);
		$pObj->addQueueEntry_callBack(
			$cfgRec['set_id'],
			$nparams,
			$this->callBack,
			$cfgRec['pid'],
			$GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl
		);
	}
}
498
/**
 * Look up all running index configurations which are finished and reset them.
 * A configuration counts as finished when no queue entry of its set is waiting for execution any more.
 * For such a configuration the index entries left over from earlier sets are removed and the set_id is cleared.
 *
 * @return void
 */
function cleanUpOldRunningConfigurations() {

	// Tables holding index data per phash (list taken from class.tx_indexedsearch_modfunc1.php):
	$indexTables = array('index_phash', 'index_rel', 'index_section', 'index_grlist', 'index_fulltext', 'index_debug');

	// Lookup running index configurations:
	$runningIndexingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
		'uid,set_id',
		'index_config',
		'set_id<>0' . t3lib_BEfunc::deleteClause('index_config')
	);

	foreach ($runningIndexingConfigurations as $cfgRec) {
		$cfgUid = intval($cfgRec['uid']);
		// intval() for the set id as well, like every other value put into a query here:
		$setId = intval($cfgRec['set_id']);

		// Entries still scheduled for execution mean the process has not ended yet:
		$queued_items = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows(
			'*',
			'tx_crawler_queue',
			'set_id=' . $setId . ' AND exec_time=0'
		);
		if ($queued_items) {
			continue;
		}

		// Lookup phash rows of this configuration which were not written by the set that just ended:
		$oldPhashRows = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
			'phash',
			'index_phash',
			'freeIndexUid=' . $cfgUid . ' AND freeIndexSetId<>' . $setId
		);

		// Removing old registrations from all index tables:
		foreach ($oldPhashRows as $pHashRow) {
			foreach ($indexTables as $table) {
				$GLOBALS['TYPO3_DB']->exec_DELETEquery($table, 'phash=' . intval($pHashRow['phash']));
			}
		}

		// End process by updating index-config record:
		$field_array = array(
			'set_id' => 0,
			'session_data' => '',
		);
		$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_config', 'uid=' . $cfgUid, $field_array);
	}
}
549
550
551
552
553
554
555
556 /*****************************************
557 *
558 * Helper functions
559 *
560 *****************************************/
561
/**
 * Check if an input URL is allowed to be indexed.
 * The URL is normalized first: a trailing double slash is reduced to a single one, then any fragment
 * ("#...") is cut off. It is accepted if it does not contain "../", starts with the base URL
 * and is not present in the url log yet.
 *
 * @param string URL string to check
 * @param array Array of already indexed URLs (input url is looked up here and must not exist already)
 * @param string Base URL of the indexing process (input URL must be "inside" the base URL!)
 * @return mixed Returns the normalized URL (string) if OK, otherwise FALSE
 */
function checkUrl($url, $urlLog, $baseUrl) {
	$url = preg_replace('/\/\/$/', '/', $url);
	list($url) = explode('#', $url);

	// No back paths:
	if (strpos($url, '../') !== FALSE) {
		return FALSE;
	}

	// Must be inside the base URL:
	if (!t3lib_div::isFirstPartOfStr($url, $baseUrl)) {
		return FALSE;
	}

	// Must not be logged already; strict comparison, as URLs are plain strings.
	if (in_array($url, $urlLog, TRUE)) {
		return FALSE;
	}

	return $url;
}
582
/**
 * Indexing External URL.
 * Indexes the URL and returns the links found in the fetched content as absolute URLs.
 * Links without a scheme are resolved against the host of $url (if they start with "/") or against
 * the base href: the one extracted from the content or, failing that, the directory of $url.
 *
 * @param string URL, http://....
 * @param integer Page id to relate indexing to.
 * @param array Rootline array to relate indexing to
 * @param integer Configuration UID
 * @param integer Set ID value
 * @return array URLs found on this page
 */
function indexExtUrl($url, $pageId, $rl, $cfgUid, $setId) {

	// Load indexer if not yet.
	$this->loadIndexerClass();

	// Index external URL:
	$indexerObj = t3lib_div::makeInstance('tx_indexedsearch_indexer');
	$indexerObj->backend_initIndexer($pageId, 0, 0, '', $rl);
	$indexerObj->backend_setFreeIndexUid($cfgUid, $setId);
	// To avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)
	$indexerObj->hash['phash'] = -1;

	$indexerObj->indexExternalUrl($url);

	// parse_url() returns FALSE for seriously malformed URLs and omits absent components,
	// so every component is read with a fallback:
	$url_qParts = parse_url($url);
	if (!is_array($url_qParts)) {
		$url_qParts = array();
	}
	$scheme = isset($url_qParts['scheme']) ? $url_qParts['scheme'] : '';
	$host = isset($url_qParts['host']) ? $url_qParts['host'] : '';
	$path = isset($url_qParts['path']) ? $url_qParts['path'] : '';

	$baseAbsoluteHref = $scheme . '://' . $host;
	$baseHref = $indexerObj->extractBaseHref($indexerObj->indexExternalUrl_content);
	if (!$baseHref) {
		// No base href in the content: use the directory part of the current URL.
		$baseHref = $baseAbsoluteHref;
		$lastSlash = strrpos($path, '/');
		if ($lastSlash !== FALSE) {
			$baseHref .= substr($path, 0, $lastSlash);
		}
	}
	$baseHref = rtrim($baseHref, '/');

	// Get URLs on this page:
	$subUrls = array();
	$list = $indexerObj->extractHyperLinks($indexerObj->indexExternalUrl_content);

	foreach ($list as $linkInfo) {

		// Decode entities:
		$subUrl = t3lib_div::htmlspecialchars_decode($linkInfo['href']);

		$qParts = parse_url($subUrl);
		if (!is_array($qParts) || empty($qParts['scheme'])) {
			if (substr($subUrl, 0, 2) === '//') {
				// Protocol-relative link ("//host/path"): it only inherits the scheme.
				$subUrl = $scheme . ':' . $subUrl;
			} else {
				$relativeUrl = t3lib_div::resolveBackPath($subUrl);
				// substr() instead of the {0} offset syntax, which newer PHP versions reject;
				// it also copes with an empty string.
				if (substr($relativeUrl, 0, 1) === '/') {
					$subUrl = $baseAbsoluteHref . $relativeUrl;
				} else {
					$subUrl = $baseHref . '/' . $relativeUrl;
				}
			}
		}

		$subUrls[] = $subUrl;
	}

	return $subUrls;
}
641
/**
 * Indexing Single Record.
 * The first field of the configured field list is used as title, all further fields are joined
 * (separated by spaces) to form the content. Tags are stripped from both before indexing.
 *
 * @param array Record to index
 * @param array Configuration Record
 * @param array Rootline array to relate indexing to; if not an array, it is looked up from the page of the configuration
 * @return void
 */
function indexSingleRecord($r, $cfgRec, $rl = NULL) {

	// Load indexer if not yet.
	$this->loadIndexerClass();

	// Init:
	$rl = is_array($rl) ? $rl : $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
	$fieldList = t3lib_div::trimExplode(',', $cfgRec['fieldlist'], 1);
	$languageField = $GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']['languageField'];
	$sys_language_uid = $languageField ? $r[$languageField] : 0;

	// (Re)-Indexing a row from a table:
	$indexerObj = t3lib_div::makeInstance('tx_indexedsearch_indexer');
	// The configured GET parameters, with the ###UID### marker replaced by the uid of the record:
	parse_str(str_replace('###UID###', $r['uid'], $cfgRec['get_params']), $GETparams);
	$indexerObj->backend_initIndexer($cfgRec['pid'], 0, $sys_language_uid, '', $rl, $GETparams, $cfgRec['chashcalc'] ? TRUE : FALSE);
	$indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
	$indexerObj->forceIndexing = TRUE;

	// Both start out empty, so an empty field list does not leave the title undefined:
	$theTitle = '';
	$theContent = '';
	foreach ($fieldList as $k => $v) {
		if (!$k) {
			$theTitle = $r[$v];
		} else {
			$theContent .= $r[$v] . ' ';
		}
	}

	// Indexing the record as a page (but with parameters set, see ->backend_setFreeIndexUid()).
	// A space is put in front of each tag so that stripping the tags does not glue words together.
	$indexerObj->backend_indexAsTYPO3Page(
		strip_tags(str_replace('<', ' <', $theTitle)),
		'',
		'',
		strip_tags(str_replace('<', ' <', $theContent)),
		// Requires that $GLOBALS['LANG'] is an initialized language object:
		$GLOBALS['LANG']->charSet,
		$r[$GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']['tstamp']],
		$r[$GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']['crdate']],
		$r['uid']
	);
}
689
/**
 * Makes the indexer class available by including class.indexer.php of the indexed_search extension (once).
 *
 * @return void
 */
function loadIndexerClass() {
	// A file included inside a function shares that function's variable scope;
	// presumably the included file expects $TYPO3_CONF_VARS there - verify before removing.
	global $TYPO3_CONF_VARS;

	$indexerFile = t3lib_extMgm::extPath('indexed_search') . 'class.indexer.php';
	require_once($indexerFile);
}
699
/**
 * Get the uids of the rootline for the closest TypoScript template root.
 * Algorithm same as used in Web > Template, Object browser
 *
 * @param integer The page id to traverse rootline back from
 * @return array Array with the uid values of the root line, keyed like the root line of the template object.
 */
function getUidRootLineForClosestTemplate($id) {
	global $TYPO3_CONF_VARS;

	// Template parser, without logging of time-performance information:
	$templateParser = t3lib_div::makeInstance('t3lib_tsparser_ext');
	$templateParser->tt_track = 0;
	$templateParser->init();

	// Run the templates along the root line of the page;
	// this generates the constants/config + hierarchy info for the template.
	$pageSelect = t3lib_div::makeInstance('t3lib_pageSelect');
	$pageRootLine = $pageSelect->getRootLine($id);
	$templateParser->runThroughTemplates($pageRootLine, 0);

	// Reduce the root line of the template object to the uids:
	$uids = array();
	foreach ($templateParser->rootLine as $level => $pageData) {
		$uids[$level] = $pageData['uid'];
	}

	return $uids;
}
728
/**
 * Generate the unix time stamp for next visit.
 * The result is the first point in time at or after the current execution time which lies a whole
 * number of frequency intervals after "a midnight plus the configured offset".
 *
 * @param array Index configuration record
 * @return integer The next time stamp
 */
function generateNextIndexingTime($cfgRec) {
	$currentTime = $GLOBALS['EXEC_TIME'];

	// Find a midnight time to use for the offset calculation.
	// Frequencies of up to a day: it does not matter which day is used, so yesterday's midnight is taken.
	// Frequencies of more than a day: the day of the currently entered next-indexing time is respected,
	// regardless of when the script is run - thus e.g. the day-of-week of a weekly setting is kept.
	if ($cfgRec['timer_frequency'] <= 24 * 3600) {
		$aMidNight = mktime(0, 0, 0) - 1 * 24 * 3600;
	} else {
		$lastTime = $cfgRec['timer_next_indexing'] ? $cfgRec['timer_next_indexing'] : $currentTime;
		// Four-digit year ("Y"), so mktime() does not have to guess the century from two digits:
		$aMidNight = mktime(0, 0, 0, date('m', $lastTime), date('d', $lastTime), date('Y', $lastTime));
	}

	// Find last offset time plus frequency in seconds:
	$lastSureOffset = $aMidNight + t3lib_utility_Math::forceIntegerInRange($cfgRec['timer_offset'], 0, 86400);
	$frequencySeconds = t3lib_utility_Math::forceIntegerInRange($cfgRec['timer_frequency'], 1);

	// How many blocks of the length of the frequency there are until the next time:
	$frequencyBlocksUntilNextTime = ceil(($currentTime - $lastSureOffset) / $frequencySeconds);

	// Next time is the offset + the frequency blocks multiplied with the frequency length in seconds.
	// ceil() yields a float; cast so that the documented integer is returned.
	return (int) ($lastSureOffset + $frequencyBlocksUntilNextTime * $frequencySeconds);
}
758
/**
 * Checks if $url starts with any of the URLs in the $url_deny "list" and if so, returns TRUE.
 * Produces no output; a debug "echo" of the matched URL has been removed from here.
 *
 * @param string URL to test
 * @param string String where URLs are separated by line-breaks; If any of these strings is the first part of $url, the function returns TRUE (to indicate denial of descent)
 * @return boolean TRUE if there is a matching URL (hence, do not index!)
 */
function checkDeniedSuburls($url, $url_deny) {
	if (!trim($url_deny)) {
		return FALSE;
	}

	// One denied URL prefix per line, empty lines ignored:
	$url_denyArray = t3lib_div::trimExplode(LF, $url_deny, 1);
	foreach ($url_denyArray as $testurl) {
		if (t3lib_div::isFirstPartOfStr($url, $testurl)) {
			return TRUE;
		}
	}

	return FALSE;
}
778
/**
 * Adding an entry to the crawler queue on behalf of a hook object.
 * Uses the parent object stored in $this->pObj, which crawler_execute() sets before calling the hook.
 *
 * @param array Configuration record
 * @param string Title/URL
 * @return void
 */
function addQueueEntryForHook($cfgRec, $title) {
	$configurationUid = $cfgRec['uid'];

	$nparams = array(
		// This must ALWAYS be the uid of the configuration record!
		'indexConfigUid' => $configurationUid,
		'url' => $title,
		// Just for information: shows that an indexing configuration added the entry.
		'procInstructions' => array('[Index Cfg UID#' . $configurationUid . ']')
	);

	$this->pObj->addQueueEntry_callBack($cfgRec['set_id'], $nparams, $this->callBack, $cfgRec['pid']);
}
795
/**
 * Deletes all data stored by indexed search for a given page.
 * The phash values of the page are looked up in index_section and removed from all index tables.
 *
 * @param integer Uid of the page to delete all pHash
 * @return void
 */
function deleteFromIndex($id) {
	$db = $GLOBALS['TYPO3_DB'];

	// Lookup the phash rows of the page:
	$phashRows = $db->exec_SELECTgetRows('phash', 'index_section', 'page_id=' . intval($id));
	if (!count($phashRows)) {
		return;
	}

	$phashes = array();
	foreach ($phashRows as $phashRow) {
		$phashes[] = $phashRow['phash'];
	}

	// One delete per index table, covering all phash values at once:
	$where_clause = 'phash IN (' . implode(',', $db->cleanIntArray($phashes)) . ')';
	$indexTables = array('index_debug', 'index_fulltext', 'index_grlist', 'index_phash', 'index_rel', 'index_section');
	foreach ($indexTables as $indexTable) {
		$db->exec_DELETEquery($indexTable, $where_clause);
	}
}
820
821
822
823
824
825
826
827 /*************************
828 *
829 * Hook functions for TCEmain (indexing of records)
830 *
831 *************************/
832
/**
 * TCEmain hook function cleaning up the index: when a page is about to be deleted,
 * all data indexed for that page is removed.
 *
 * @param string TCEmain command
 * @param string Table name
 * @param string Record ID. If new record its a string pointing to index inside t3lib_tcemain::substNEWwithIDs
 * @param mixed Target value (ignored)
 * @param object Reference to tcemain calling object
 * @return void
 */
function processCmdmap_preProcess($command, $table, $id, $value, $pObj) {

	// Only the deletion of pages is of interest here:
	if ($command != 'delete' || $table != 'pages') {
		return;
	}

	$this->deleteFromIndex($id);
}
850
/**
 * TCEmain hook function for on-the-fly indexing of database records.
 * After a record was created or updated, it is indexed with every matching "records" indexing
 * configuration that has "index on change" set. A page updated to be hidden or excluded from search
 * has its index data deleted first.
 *
 * @param string Status "new" or "update"
 * @param string Table name
 * @param string Record ID. If new record its a string pointing to index inside t3lib_tcemain::substNEWwithIDs
 * @param array Field array of updated fields in the operation
 * @param object Reference to tcemain calling object
 * @return void
 */
function processDatamap_afterDatabaseOperations($status, $table, $id, $fieldArray, $pObj) {

	// Nothing to do if no fields were actually updated:
	if (!count($fieldArray)) {
		return;
	}

	if ($status == 'new') {
		// Translate the placeholder of a new record into its real uid:
		$id = $pObj->substNEWwithIDs[$id];
	} elseif ($table == 'pages' && $status == 'update') {
		$nowHidden = array_key_exists('hidden', $fieldArray) && $fieldArray['hidden'] == 1;
		$nowNotSearchable = array_key_exists('no_search', $fieldArray) && $fieldArray['no_search'] == 1;

		// If the page should be hidden or not indexed after update, delete index for this page
		if ($nowHidden || $nowNotSearchable) {
			$this->deleteFromIndex($id);
		}
	}

	// Get the full record; without it there is nothing to index:
	$currentRecord = t3lib_BEfunc::getRecord($table, $id);
	if (!is_array($currentRecord)) {
		return;
	}

	// Select all (not running) indexing configurations of type "record" (1) which point to this table and
	// are located on the same page as the record or point to the right source PID:
	$recordPid = intval($currentRecord['pid']);
	$where = 'hidden=0
		AND (starttime=0 OR starttime<=' . $GLOBALS['EXEC_TIME'] . ')
		AND set_id=0
		AND type=1
		AND table2index=' . $GLOBALS['TYPO3_DB']->fullQuoteStr($table, 'index_config') . '
		AND (
			(alternative_source_pid=0 AND pid=' . $recordPid . ')
			OR (alternative_source_pid=' . $recordPid . ')
		)
		AND records_indexonchange=1
		' . t3lib_BEfunc::deleteClause('index_config');
	$indexingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('*', 'index_config', $where);

	foreach ($indexingConfigurations as $cfgRec) {
		$this->indexSingleRecord($currentRecord, $cfgRec);
	}
}
903 }
904
905
/**
 * Crawler hook for indexed search. Works with the "crawler" extension
 * This hook is specifically used to index external files found on pages through the crawler extension.
 *
 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
 * @package TYPO3
 * @subpackage tx_indexedsearch
 * @see tx_indexedsearch_indexer::extractLinks()
 */
class tx_indexedsearch_files {

	/**
	 * Call back function for execution of a log element.
	 * Indexes the document named in $params['document'] with the indexer configuration from
	 * $params['conf']. If $params['alturl'] is set, that path is indexed instead, while the document
	 * name and its lower-cased file extension are handed to the indexer alongside.
	 *
	 * @param array Params from log element.
	 * @param object Parent object (tx_crawler lib)
	 * @return array Result array. Nothing (NULL) is returned if $params['conf'] is not an array.
	 */
	function crawler_execute($params, &$pObj) {

		// Load indexer if not yet.
		$this->loadIndexerClass();

		if (is_array($params['conf'])) {

			// Initialize the indexer class:
			$indexerObj = t3lib_div::makeInstance('tx_indexedsearch_indexer');
			$indexerObj->conf = $params['conf'];
			$indexerObj->init();

			// Index document:
			if ($params['alturl']) {
				$fI = pathinfo($params['document']);
				// pathinfo() leaves out "extension" for names without one:
				$ext = isset($fI['extension']) ? strtolower($fI['extension']) : '';
				$indexerObj->indexRegularDocument($params['alturl'], TRUE, $params['document'], $ext);
			} else {
				$indexerObj->indexRegularDocument($params['document'], TRUE);
			}

			// Return OK:
			return array('content' => array());
		}
	}

	/**
	 * Include indexer class.
	 *
	 * @return void
	 */
	function loadIndexerClass() {
		// A file included inside a function shares that function's variable scope;
		// presumably the included file expects $TYPO3_CONF_VARS there - verify before removing.
		global $TYPO3_CONF_VARS;
		require_once(t3lib_extMgm::extPath('indexed_search') . 'class.indexer.php');
	}
}
960 ?>