[TASK] Move classAliasMaps into compatibility6 extension part 6
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / Classes / Hook / CrawlerHook.php
1 <?php
2 namespace TYPO3\CMS\IndexedSearch\Hook;
3
4 /**
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use TYPO3\CMS\Backend\Utility\BackendUtility;
18 use TYPO3\CMS\Core\Utility\GeneralUtility;
19
20 /**
21 * Crawler hook for indexed search. Works with the "crawler" extension
22 *
23 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
24 */
25 class CrawlerHook {
26
/**
 * Static: number of seconds to use as interval between queued indexing
 * operations of URLs / files (types 2 & 3). The page tree handler (type 4)
 * in this class spaces its subpage entries with the same value.
 *
 * @var int
 */
public $secondsPerExternalUrl = 3;

/**
 * Internal, dynamic: counts up for each follow-up queue entry added by the
 * file, external URL and page tree handlers. Multiplied with
 * $secondsPerExternalUrl to spread the scheduled execution times.
 *
 * @var int
 */
public $instanceCounter = 0;

/**
 * Internal, static: the object reference to this class. Passed to the
 * crawler as callback with every queue entry this hook adds.
 *
 * @var string
 */
public $callBack = '&TYPO3\\CMS\\IndexedSearch\\Hook\\CrawlerHook';

/**
 * Initialization of crawler hook.
 *
 * This function is asked for each instance of the crawler. It selects all
 * indexing configurations that are due (not hidden, start time reached, next
 * indexing time passed and not already running), marks each of them as running
 * by storing a fresh set ID together with the next indexing time, and adds the
 * initial entry for the configuration to the crawler queue. Finally all
 * running configurations whose queue has run empty are reset.
 *
 * @param object $pObj Parent object (tx_crawler lib)
 * @return void
 */
public function crawler_init(&$pObj) {
    // Select all indexing configurations which are waiting to be activated:
    $indexingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('*', 'index_config', 'hidden=0
AND (starttime=0 OR starttime<=' . $GLOBALS['EXEC_TIME'] . ')
AND timer_next_indexing<' . $GLOBALS['EXEC_TIME'] . '
AND set_id=0
' . BackendUtility::deleteClause('index_config'));
    // The select yields no array when the query failed; treat that as "nothing due"
    if (!is_array($indexingConfigurations)) {
        $indexingConfigurations = [];
    }
    foreach ($indexingConfigurations as $cfgRec) {
        // Generate a unique set-ID:
        $setId = GeneralUtility::md5int(microtime());
        // Get next time:
        $nextTime = $this->generateNextIndexingTime($cfgRec);
        // Start process by updating index-config record:
        $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_config', 'uid=' . (int)$cfgRec['uid'], [
            'set_id' => $setId,
            'timer_next_indexing' => $nextTime,
            'session_data' => ''
        ]);
        // Queue the first entry, unless the configuration type has nothing to queue:
        $params = $this->getInitialQueueParameters($cfgRec);
        if ($params !== NULL) {
            $pObj->addQueueEntry_callBack($setId, $params, $this->callBack, $cfgRec['pid']);
        }
    }
    // Finally, look up all old index configurations which are finished and needs to be reset and done.
    $this->cleanUpOldRunningConfigurations();
}

/**
 * Builds the parameters of the first queue entry for an indexing configuration.
 *
 * @param array $cfgRec Indexing configuration record
 * @return array|NULL Queue parameters, or NULL when nothing has to be queued
 */
private function getInitialQueueParameters(array $cfgRec) {
    // General parameters, shared by all built-in types:
    $params = [
        'indexConfigUid' => $cfgRec['uid'],
        'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']']
    ];
    switch ($cfgRec['type']) {
        case 1:
            // Records:
            $params['url'] = 'Records (start)';
            return $params;
        case 2:
            // Files:
            $params['url'] = $cfgRec['filepath'];
            break;
        case 3:
            // External URL:
            $params['url'] = $cfgRec['externalUrl'];
            break;
        case 4:
            // Page tree (root page uid):
            $params['url'] = (int)$cfgRec['alternative_source_pid'];
            break;
        case 5:
            // Meta configuration, nothing to do
            return NULL;
        default:
            // Custom type: only handled when a hook object is registered for it
            $type = $cfgRec['type'];
            if (empty($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$type])) {
                return NULL;
            }
            $hookObj = GeneralUtility::getUserObj($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$type]);
            if (!is_object($hookObj)) {
                return NULL;
            }
            // Defined explicitly (and reset per configuration) instead of passing an undefined variable
            $message = NULL;
            return [
                'indexConfigUid' => $cfgRec['uid'],
                'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . '/CUSTOM]'],
                'url' => $hookObj->initMessage($message)
            ];
    }
    // Files, external URLs and page trees are traversed recursively, starting at depth 0
    $params['depth'] = 0;
    return $params;
}
155
/**
 * Callback executed by the crawler for a queue entry that was added by this hook.
 *
 * Loads the indexing configuration named in the parameters, runs the handler
 * that matches the configuration type and writes the (possibly modified)
 * session data back to the configuration record.
 *
 * @param array $params Params from log element. Must contain $params['indexConfigUid']
 * @param object $pObj Parent object (tx_crawler lib)
 * @return array Result array; carries the parameters under the key "log"
 */
public function crawler_execute($params, &$pObj) {
    // Without a configuration uid there is nothing to execute
    if (!$params['indexConfigUid']) {
        return ['log' => $params];
    }
    $configuration = $GLOBALS['TYPO3_DB']->exec_SELECTgetSingleRow('*', 'index_config', 'uid=' . (int)$params['indexConfigUid']);
    if (!is_array($configuration)) {
        return ['log' => $params];
    }
    // Session data is shared between all queue entries of one indexing run
    $sessionData = unserialize($configuration['session_data']);
    switch ($configuration['type']) {
        case 1:
            // Records
            $this->crawler_execute_type1($configuration, $sessionData, $params, $pObj);
            break;
        case 2:
            // Files
            $this->crawler_execute_type2($configuration, $sessionData, $params, $pObj);
            break;
        case 3:
            // External URL
            $this->crawler_execute_type3($configuration, $sessionData, $params, $pObj);
            break;
        case 4:
            // Page tree
            $this->crawler_execute_type4($configuration, $sessionData, $params, $pObj);
            break;
        case 5:
            // Meta configuration: no operation (should never be reached)
            break;
        default:
            // Custom types are delegated to the hook object registered for them
            if ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$configuration['type']]) {
                $customHandler = GeneralUtility::getUserObj($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$configuration['type']]);
                if (is_object($customHandler)) {
                    // Kept on the instance because addQueueEntryForHook() reads it
                    $this->pObj = $pObj;
                    $customHandler->indexOperation($configuration, $sessionData, $params, $this);
                }
            }
    }
    // Persist the session data, which the handlers receive by reference and may have changed
    $GLOBALS['TYPO3_DB']->exec_UPDATEquery(
        'index_config',
        'uid=' . (int)$configuration['uid'],
        ['session_data' => serialize($sessionData)]
    );
    return ['log' => $params];
}
212
/**
 * Indexing records from a table
 *
 * Indexes one batch of records with a uid above the last processed uid and
 * queues a follow-up entry for the next batch. When a batch comes back empty,
 * nothing more is queued.
 *
 * @param array $cfgRec Indexing Configuration Record
 * @param array $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param array $params Parameters from the log queue.
 * @param object $pObj Parent object (from "crawler" extension!)
 * @return void
 */
public function crawler_execute_type1($cfgRec, &$session_data, $params, &$pObj) {
    // Only tables known to TCA can be indexed
    if (!$cfgRec['table2index'] || !isset($GLOBALS['TCA'][$cfgRec['table2index']])) {
        return;
    }
    // Init session data array if not already:
    if (!is_array($session_data)) {
        $session_data = ['uid' => 0];
    }
    $lastProcessedUid = isset($session_data['uid']) ? (int)$session_data['uid'] : 0;
    // Both candidates are cast to int because the value is concatenated into the WHERE clause
    $pid = (int)$cfgRec['alternative_source_pid'] ?: (int)$cfgRec['pid'];
    $numberOfRecords = $cfgRec['recordsbatch'] ? \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($cfgRec['recordsbatch'], 1) : 100;
    // Get root line:
    $rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
    // Select the next batch, ordered by uid:
    $recs = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('*', $cfgRec['table2index'], 'pid = ' . $pid . '
AND uid > ' . $lastProcessedUid . BackendUtility::deleteClause($cfgRec['table2index']) . BackendUtility::BEenableFields($cfgRec['table2index']), '', 'uid', $numberOfRecords);
    // No array (failed query) or no rows: the table has been worked through
    if (!is_array($recs) || !count($recs)) {
        return;
    }
    foreach ($recs as $r) {
        // Index single record:
        $this->indexSingleRecord($r, $cfgRec, $rl);
        // Update the UID we last processed:
        $session_data['uid'] = $r['uid'];
    }
    // Finally, set entry for next indexing of batch of records:
    $nparams = [
        'indexConfigUid' => $cfgRec['uid'],
        'url' => 'Records from UID#' . ($session_data['uid'] + 1) . '-?',
        'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']']
    ];
    $pObj->addQueueEntry_callBack($cfgRec['set_id'], $nparams, $this->callBack, $cfgRec['pid']);
}
256
/**
 * Indexing files from fileadmin
 *
 * A file is indexed right away. For a directory, every matching file in it
 * (and, while the configured depth is not exceeded, every subdirectory) is
 * added to the crawler queue as a new entry.
 *
 * @param array $cfgRec Indexing Configuration Record
 * @param array $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param array $params Parameters from the log queue.
 * @param object $pObj Parent object (from "crawler" extension!)
 * @return void
 */
public function crawler_execute_type2($cfgRec, &$session_data, $params, &$pObj) {
    // Prepare path, making it absolute and checking:
    $readpath = $params['url'];
    if (!GeneralUtility::isAbsPath($readpath)) {
        $readpath = GeneralUtility::getFileAbsFileName($readpath);
    }
    if (!GeneralUtility::isAllowedAbsPath($readpath)) {
        return;
    }
    if (@is_file($readpath)) {
        // If file, index it!
        // Get root line (need to provide this when indexing external files)
        $rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
        // (Re)-Indexing file on page.
        $indexerObj = GeneralUtility::makeInstance(\TYPO3\CMS\IndexedSearch\Indexer::class);
        $indexerObj->backend_initIndexer($cfgRec['pid'], 0, 0, '', $rl);
        $indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
        // EXPERIMENT - but to avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)
        $indexerObj->hash['phash'] = -1;
        // Index document:
        $indexerObj->indexRegularDocument(\TYPO3\CMS\Core\Utility\PathUtility::stripPathSitePrefix($readpath), TRUE);
        return;
    }
    if (!@is_dir($readpath)) {
        return;
    }
    // If dir, read content and create new pending items for log:
    // Select files and directories in path:
    $extList = implode(',', GeneralUtility::trimExplode(',', $cfgRec['extensions'], TRUE));
    $fileArr = [];
    $files = GeneralUtility::getAllFilesAndFoldersInPath($fileArr, $readpath, $extList, 0, 0);
    $directoryList = GeneralUtility::get_dirs($readpath);
    if (is_array($directoryList) && $params['depth'] < $cfgRec['depth']) {
        // The configured start directory may lack a trailing slash; normalise it so
        // the subdirectory name is not glued onto the parent directory name
        $directoryPrefix = rtrim($readpath, '/') . '/';
        foreach ($directoryList as $subdir) {
            if ((string)$subdir != '') {
                $files[] = $directoryPrefix . $subdir . '/';
            }
        }
    }
    $files = GeneralUtility::removePrefixPathFromList($files, PATH_site);
    // traverse the items and create log entries:
    foreach ($files as $path) {
        // Counted for every item, so the scheduled execution times keep their spacing
        $this->instanceCounter++;
        if ($path !== $params['url']) {
            $nparams = [
                'indexConfigUid' => $cfgRec['uid'],
                'url' => $path,
                'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
                'depth' => $params['depth'] + 1
            ];
            $pObj->addQueueEntry_callBack($cfgRec['set_id'], $nparams, $this->callBack, $cfgRec['pid'], $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl);
        }
    }
}
317
/**
 * Indexing External URLs
 *
 * Indexes the URL of the queue entry and, while the configured depth is not
 * exceeded, queues every link found on it that passes checkUrl() and is not
 * denied by the "url_deny" list of the configuration.
 *
 * @param array $cfgRec Indexing Configuration Record
 * @param array $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param array $params Parameters from the log queue.
 * @param object $pObj Parent object (from "crawler" extension!)
 * @return void
 */
public function crawler_execute_type3($cfgRec, &$session_data, $params, &$pObj) {
    // First call of the run: start the URL log with the entry URL
    if (!is_array($session_data)) {
        $session_data = ['urlLog' => [$params['url']]];
    }
    // Index the URL and collect the links found on it
    $rootLine = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
    $linkedUrls = $this->indexExtUrl($params['url'], $cfgRec['pid'], $rootLine, $cfgRec['uid'], $cfgRec['set_id']);
    // Stop descending once the configured depth is reached
    if (!($params['depth'] < $cfgRec['depth'])) {
        return;
    }
    foreach ($linkedUrls as $linkedUrl) {
        $url = $this->checkUrl($linkedUrl, $session_data['urlLog'], $cfgRec['externalUrl']);
        if (!$url || $this->checkDeniedSuburls($url, $cfgRec['url_deny'])) {
            continue;
        }
        $this->instanceCounter++;
        // Remember the URL so it is not queued a second time during this run
        $session_data['urlLog'][] = $url;
        $queueParameters = [
            'indexConfigUid' => $cfgRec['uid'],
            'url' => $url,
            'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
            'depth' => $params['depth'] + 1
        ];
        // Each entry is scheduled a bit later than the previous one
        $scheduledTime = $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl;
        $pObj->addQueueEntry_callBack($cfgRec['set_id'], $queueParameters, $this->callBack, $cfgRec['pid'], $scheduledTime);
    }
}
357
/**
 * Page tree indexing type
 *
 * Submits the URLs of the page named in the queue entry to the crawler and,
 * while the configured depth is not exceeded, queues one entry per subpage.
 *
 * @param array $cfgRec Indexing Configuration Record
 * @param array $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param array $params Parameters from the log queue.
 * @param object $pObj Parent object (from "crawler" extension!)
 * @return void
 */
public function crawler_execute_type4($cfgRec, &$session_data, $params, &$pObj) {
    // Base page uid:
    $pageUid = (int)$params['url'];
    $pageRow = BackendUtility::getRecord('pages', $pageUid);
    // No row means the page is gone (e.g. deleted after it was queued): nothing to submit then
    if (is_array($pageRow)) {
        // Get array of URLs from page:
        $res = $pObj->getUrlsForPageRow($pageRow);
        // Registry for duplicates
        $duplicateTrack = [];
        // Dummy.
        $downloadUrls = [];
        // Submit URLs:
        if (is_array($res)) {
            foreach ($res as $vv) {
                $pObj->urlListFromUrlArray($vv, $pageRow, $GLOBALS['EXEC_TIME'], 30, 1, 0, $duplicateTrack, $downloadUrls, ['tx_indexedsearch_reindex']);
            }
        }
    }
    // Add subpages to log now, unless the configured depth is reached:
    if (!($params['depth'] < $cfgRec['depth'])) {
        return;
    }
    $recs = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('uid,title', 'pages', 'pid = ' . $pageUid . BackendUtility::deleteClause('pages'));
    if (!is_array($recs) || !count($recs)) {
        return;
    }
    // Init session data array if not already (same as the record and URL handlers do):
    if (!is_array($session_data)) {
        $session_data = ['urlLog' => []];
    }
    // Traverse subpages and add to queue:
    foreach ($recs as $r) {
        $this->instanceCounter++;
        $session_data['urlLog'][] = 'pages:' . $r['uid'] . ': ' . $r['title'];
        $nparams = [
            'indexConfigUid' => $cfgRec['uid'],
            'url' => $r['uid'],
            'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
            'depth' => $params['depth'] + 1
        ];
        $pObj->addQueueEntry_callBack($cfgRec['set_id'], $nparams, $this->callBack, $cfgRec['pid'], $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl);
    }
}
405
/**
 * Look up all old index configurations which are finished and needs to be reset and done
 *
 * A running configuration (set_id <> 0) counts as finished when the crawler
 * queue holds no unprocessed entry for its set ID. For a finished
 * configuration the index rows left over from earlier runs are removed and the
 * configuration is released for its next run. A configuration whose queue
 * count could not be determined is left untouched.
 *
 * @return void
 */
public function cleanUpOldRunningConfigurations() {
    // Tables that hold rows keyed by phash (code copied from \TYPO3\CMS\IndexedSearch\Controller\IndexedPagesController)
    $tableArr = explode(',', 'index_phash,index_rel,index_section,index_grlist,index_fulltext,index_debug');
    // Lookup running index configurations:
    $runningIndexingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('uid,set_id', 'index_config', 'set_id<>0' . BackendUtility::deleteClause('index_config'));
    if (!is_array($runningIndexingConfigurations)) {
        return;
    }
    foreach ($runningIndexingConfigurations as $cfgRec) {
        // Look for ended processes:
        $queued_items = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('*', 'tx_crawler_queue', 'set_id=' . (int)$cfgRec['set_id'] . ' AND exec_time=0');
        // FALSE signals a failed query, not an empty queue: do not delete index data on that basis
        if ($queued_items === FALSE || $queued_items) {
            continue;
        }
        // Lookup old phash rows:
        $oldPhashRows = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('phash', 'index_phash', 'freeIndexUid=' . (int)$cfgRec['uid'] . ' AND freeIndexSetId<>' . (int)$cfgRec['set_id']);
        if (is_array($oldPhashRows)) {
            foreach ($oldPhashRows as $pHashRow) {
                // Removing old registrations for all tables
                foreach ($tableArr as $table) {
                    $GLOBALS['TYPO3_DB']->exec_DELETEquery($table, 'phash=' . (int)$pHashRow['phash']);
                }
            }
        }
        // End process by updating index-config record:
        $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_config', 'uid=' . (int)$cfgRec['uid'], [
            'set_id' => 0,
            'session_data' => ''
        ]);
    }
}
437
438 /*****************************************
439 *
440 * Helper functions
441 *
442 *****************************************/
/**
 * Decides whether a URL may be added to the indexing queue.
 *
 * A trailing double slash is reduced to a single slash and a fragment
 * ("#...") is cut off. The cleaned URL is accepted when it contains no "../",
 * starts with the base URL and is not yet part of the URL log.
 *
 * @param string $url URL string to check
 * @param array $urlLog Array of already indexed URLs (input url is looked up here and must not exist already)
 * @param string $baseUrl Base URL of the indexing process (input URL must be "inside" the base URL!)
 * @return string|NULL The cleaned URL if it is OK, otherwise NULL
 */
public function checkUrl($url, $urlLog, $baseUrl) {
    $withoutDoubleSlash = preg_replace('/\\/\\/$/', '/', $url);
    $urlAndFragment = explode('#', $withoutDoubleSlash);
    $cleanUrl = $urlAndFragment[0];
    $isAllowed = !strstr($cleanUrl, '../')
        && GeneralUtility::isFirstPartOfStr($cleanUrl, $baseUrl)
        && !in_array($cleanUrl, $urlLog);
    return $isAllowed ? $cleanUrl : NULL;
}
462
/**
 * Indexing External URL
 *
 * Indexes the document at the given URL and returns every hyperlink found in
 * it. Links without a scheme are made absolute: links starting with "/" are
 * resolved against scheme, host and port of the indexed URL, all others
 * against the base href of the document (or, if it has none, the directory of
 * the indexed URL).
 *
 * @param string $url URL, http://....
 * @param int $pageId Page id to relate indexing to.
 * @param array $rl Rootline array to relate indexing to
 * @param int $cfgUid Configuration UID
 * @param int $setId Set ID value
 * @return array URLs found on this page
 */
public function indexExtUrl($url, $pageId, $rl, $cfgUid, $setId) {
    // Index external URL:
    $indexerObj = GeneralUtility::makeInstance(\TYPO3\CMS\IndexedSearch\Indexer::class);
    $indexerObj->backend_initIndexer($pageId, 0, 0, '', $rl);
    $indexerObj->backend_setFreeIndexUid($cfgUid, $setId);
    // To avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)
    $indexerObj->hash['phash'] = -1;
    $indexerObj->indexExternalUrl($url);
    // parse_url() returns FALSE for seriously malformed URLs and omits absent components
    $url_qParts = parse_url($url);
    if (!is_array($url_qParts)) {
        $url_qParts = [];
    }
    $scheme = isset($url_qParts['scheme']) ? $url_qParts['scheme'] : '';
    $host = isset($url_qParts['host']) ? $url_qParts['host'] : '';
    $path = isset($url_qParts['path']) ? $url_qParts['path'] : '';
    $baseAbsoluteHref = $scheme . '://' . $host;
    // Keep an explicit port, otherwise links of a site on a non-default port resolve to the wrong origin
    if (isset($url_qParts['port'])) {
        $baseAbsoluteHref .= ':' . $url_qParts['port'];
    }
    $baseHref = $indexerObj->extractBaseHref($indexerObj->indexExternalUrl_content);
    if (!$baseHref) {
        // Extract base href from current URL: everything up to the last slash of the path
        $baseHref = $baseAbsoluteHref;
        $lastSlashPosition = strrpos($path, '/');
        if ($lastSlashPosition !== FALSE) {
            $baseHref .= substr($path, 0, $lastSlashPosition);
        }
    }
    $baseHref = rtrim($baseHref, '/');
    // Get URLs on this page:
    $subUrls = [];
    $list = $indexerObj->extractHyperLinks($indexerObj->indexExternalUrl_content);
    if (!is_array($list)) {
        return $subUrls;
    }
    // Traverse links:
    foreach ($list as $linkInfo) {
        // Decode entities:
        $subUrl = htmlspecialchars_decode($linkInfo['href']);
        $qParts = parse_url($subUrl);
        // Links without a scheme are relative and need to be made absolute
        if (empty($qParts['scheme'])) {
            $relativeUrl = GeneralUtility::resolveBackPath($subUrl);
            if (isset($relativeUrl[0]) && $relativeUrl[0] === '/') {
                $subUrl = $baseAbsoluteHref . $relativeUrl;
            } else {
                $subUrl = $baseHref . '/' . $relativeUrl;
            }
        }
        $subUrls[] = $subUrl;
    }
    return $subUrls;
}
510
/**
 * Indexing Single Record
 *
 * The first field of the configured field list supplies the title, all
 * remaining fields are concatenated (space separated) into the content. HTML
 * tags are stripped from both before the record is indexed as a page.
 *
 * @param array $r Record to index
 * @param array $cfgRec Configuration Record
 * @param array $rl Rootline array to relate indexing to. Looked up from the configuration's pid when omitted.
 * @return void
 */
public function indexSingleRecord($r, $cfgRec, $rl = NULL) {
    // Init:
    $rl = is_array($rl) ? $rl : $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
    $fieldList = GeneralUtility::trimExplode(',', $cfgRec['fieldlist'], TRUE);
    $tableControl = $GLOBALS['TCA'][$cfgRec['table2index']]['ctrl'];
    // Tables without a language field are indexed for the default language
    $languageField = isset($tableControl['languageField']) ? $tableControl['languageField'] : '';
    $sys_language_uid = $languageField ? $r[$languageField] : 0;
    // (Re)-Indexing a row from a table:
    $indexerObj = GeneralUtility::makeInstance(\TYPO3\CMS\IndexedSearch\Indexer::class);
    parse_str(str_replace('###UID###', $r['uid'], $cfgRec['get_params']), $GETparams);
    $indexerObj->backend_initIndexer($cfgRec['pid'], 0, $sys_language_uid, '', $rl, $GETparams, (bool)$cfgRec['chashcalc']);
    $indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
    $indexerObj->forceIndexing = TRUE;
    // Both start empty so an empty field list yields an empty title instead of an undefined variable
    $theTitle = '';
    $theContent = '';
    foreach ($fieldList as $k => $v) {
        if (!$k) {
            $theTitle = $r[$v];
        } else {
            $theContent .= $r[$v] . ' ';
        }
    }
    // A space is put in front of every tag so words do not run together once the tags are stripped
    $plainTitle = strip_tags(str_replace('<', ' <', $theTitle));
    $plainContent = strip_tags(str_replace('<', ' <', $theContent));
    // Indexing the record as a page (but with parameters set, see ->backend_setFreeIndexUid())
    $indexerObj->backend_indexAsTYPO3Page($plainTitle, '', '', $plainContent, $GLOBALS['LANG']->charSet, $r[$tableControl['tstamp']], $r[$tableControl['crdate']], $r['uid']);
}
542
/**
 * Get rootline for closest TypoScript template root.
 * Algorithm same as used in Web > Template, Object browser
 *
 * @param int $id The page id to traverse rootline back from
 * @return array Array with the uid of each root line entry, keyed like the template service's root line
 */
public function getUidRootLineForClosestTemplate($id) {
    $templateService = GeneralUtility::makeInstance(\TYPO3\CMS\Core\TypoScript\ExtendedTemplateService::class);
    // Do not log time-performance information
    $templateService->tt_track = 0;
    $templateService->init();
    // Fetch the full root line of the page ...
    $pageRepository = GeneralUtility::makeInstance(\TYPO3\CMS\Frontend\Page\PageRepository::class);
    $fullRootLine = $pageRepository->getRootLine($id);
    // ... and let the template service generate the constants/config + hierarchy info from it
    $templateService->runThroughTemplates($fullRootLine, 0);
    // Reduce the template service's root line to the page uids
    $uidsByLevel = [];
    foreach ($templateService->rootLine as $level => $pageRecord) {
        $uidsByLevel[$level] = $pageRecord['uid'];
    }
    return $uidsByLevel;
}
567
/**
 * Generate the unix time stamp for next visit.
 *
 * The result is the first point in time at or after the current execution
 * time that lies a whole number of frequency intervals after "midnight plus
 * the configured offset".
 *
 * @param array $cfgRec Index configuration record
 * @return int The next time stamp
 */
public function generateNextIndexingTime($cfgRec) {
    $currentTime = $GLOBALS['EXEC_TIME'];
    // Now, find a midnight time to use for offset calculation. This has to differ depending on whether we have frequencies within a day or more than a day; Less than a day, we don't care which day to use for offset, more than a day we want to respect the currently entered day as offset regardless of when the script is run - thus the day-of-week used in case "Weekly" is selected will be respected
    if ($cfgRec['timer_frequency'] <= 24 * 3600) {
        $aMidNight = mktime(0, 0, 0) - 1 * 24 * 3600;
    } else {
        $lastTime = $cfgRec['timer_next_indexing'] ?: $GLOBALS['EXEC_TIME'];
        // Four-digit year: mktime() maps two-digit years 70-100 to 1970-2000
        $aMidNight = mktime(0, 0, 0, (int)date('n', $lastTime), (int)date('j', $lastTime), (int)date('Y', $lastTime));
    }
    // Find last offset time plus frequency in seconds:
    $lastSureOffset = $aMidNight + \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($cfgRec['timer_offset'], 0, 86400);
    $frequencySeconds = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($cfgRec['timer_frequency'], 1);
    // Now, find out how many blocks of the length of frequency there is until the next time:
    $frequencyBlocksUntilNextTime = ceil(($currentTime - $lastSureOffset) / $frequencySeconds);
    // Set next time to the offset + the frequencyblocks multiplied with the frequency length in seconds.
    // ceil() returns a float, so the sum is cast back to the documented integer time stamp
    return (int)($lastSureOffset + $frequencyBlocksUntilNextTime * $frequencySeconds);
}
592
/**
 * Checks if $url starts with any of the URLs in the $url_deny "list" and if so, returns TRUE.
 *
 * @param string $url URL to test
 * @param string $url_deny String where URLs are separated by line-breaks; If any of these strings is the first part of $url, the function returns TRUE (to indicate denial of descent)
 * @return bool TRUE if there is a matching URL (hence, do not index!)
 */
public function checkDeniedSuburls($url, $url_deny) {
    // An empty deny list denies nothing
    if (!trim($url_deny)) {
        return FALSE;
    }
    $url_denyArray = GeneralUtility::trimExplode(LF, $url_deny, TRUE);
    foreach ($url_denyArray as $testurl) {
        // Matching must stay silent: this runs inside crawler processing, where
        // writing to the output stream is an unwanted side effect
        if (GeneralUtility::isFirstPartOfStr($url, $testurl)) {
            return TRUE;
        }
    }
    return FALSE;
}
612
/**
 * Adding entry in queue for Hook
 *
 * Intended for custom indexing hooks: crawler_execute() stores the crawler
 * object in $this->pObj before it calls the hook, and this method adds the
 * entry through that object.
 *
 * @param array $cfgRec Configuration record
 * @param string $title Title/URL
 * @return void
 */
public function addQueueEntryForHook($cfgRec, $title) {
    // This must ALWAYS be the cfgRec uid!
    $configurationUid = $cfgRec['uid'];
    $queueParameters = [
        'indexConfigUid' => $configurationUid,
        'url' => $title,
        'procInstructions' => ['[Index Cfg UID#' . $configurationUid . ']']
    ];
    $this->pObj->addQueueEntry_callBack($cfgRec['set_id'], $queueParameters, $this->callBack, $cfgRec['pid']);
}
629
/**
 * Deletes all data stored by indexed search for a given page
 *
 * Collects the phash values registered for the page in index_section and
 * removes the rows with these values from every index table.
 *
 * @param int $id Uid of the page to delete all pHash
 * @return void
 */
public function deleteFromIndex($id) {
    $sectionRows = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('phash', 'index_section', 'page_id=' . (int)$id);
    // Nothing indexed for this page
    if (!count($sectionRows)) {
        return;
    }
    $hashes = [];
    foreach ($sectionRows as $sectionRow) {
        $hashes[] = $sectionRow['phash'];
    }
    // One IN() clause serves all tables
    $hashCondition = 'phash IN (' . implode(',', $GLOBALS['TYPO3_DB']->cleanIntArray($hashes)) . ')';
    foreach (explode(',', 'index_debug,index_fulltext,index_grlist,index_phash,index_rel,index_section') as $indexTable) {
        $GLOBALS['TYPO3_DB']->exec_DELETEquery($indexTable, $hashCondition);
    }
}
651
652 /*************************
653 *
654 * Hook functions for TCEmain (indexing of records)
655 *
656 *************************/
/**
 * TCEmain hook function: removes a page from the index before the page is deleted
 *
 * @param string $command TCEmain command
 * @param string $table Table name
 * @param string $id Record ID. If new record its a string pointing to index inside \TYPO3\CMS\Core\DataHandling\DataHandler::substNEWwithIDs
 * @param mixed $value Target value (ignored)
 * @param object $pObj Reference to tcemain calling object
 * @return void
 */
public function processCmdmap_preProcess($command, $table, $id, $value, $pObj) {
    // Only the deletion of a page requires cleaning up the index
    $isPageDeletion = $command == 'delete' && $table == 'pages';
    if (!$isPageDeletion) {
        return;
    }
    $this->deleteFromIndex($id);
}
673
/**
 * TCEmain hook function for on-the-fly indexing of database records
 *
 * A page that was updated to be hidden or excluded from search is removed
 * from the index. After that the saved record is indexed once for every
 * record-type indexing configuration that is not running, has "index on
 * change" enabled and covers the record's table and page.
 *
 * @param string $status Status "new" or "update"
 * @param string $table Table name
 * @param string $id Record ID. If new record its a string pointing to index inside \TYPO3\CMS\Core\DataHandling\DataHandler::substNEWwithIDs
 * @param array $fieldArray Field array of updated fields in the operation
 * @param object $pObj Reference to tcemain calling object
 * @return void
 */
public function processDatamap_afterDatabaseOperations($status, $table, $id, $fieldArray, $pObj) {
    // Nothing was updated, so there is nothing to index
    if (!count($fieldArray)) {
        return;
    }
    if ($status == 'new') {
        // Translate the placeholder of a new record into its real uid
        $id = $pObj->substNEWwithIDs[$id];
    } elseif ($table == 'pages' && $status == 'update') {
        $becameHidden = array_key_exists('hidden', $fieldArray) && $fieldArray['hidden'] == 1;
        $becameUnsearchable = array_key_exists('no_search', $fieldArray) && $fieldArray['no_search'] == 1;
        // If the page should be hidden or not indexed after update, delete index for this page
        if ($becameHidden || $becameUnsearchable) {
            $this->deleteFromIndex($id);
        }
    }
    // Get full record; without it there is nothing to index
    $savedRecord = BackendUtility::getRecord($table, $id);
    if (!is_array($savedRecord)) {
        return;
    }
    $recordPid = (int)$savedRecord['pid'];
    // Select all (not running) indexing configurations of type "record" (1) and which points to this table and is located on the same page as the record or pointing to the right source PID
    $matchingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('*', 'index_config', 'hidden=0
AND (starttime=0 OR starttime<=' . $GLOBALS['EXEC_TIME'] . ')
AND set_id=0
AND type=1
AND table2index=' . $GLOBALS['TYPO3_DB']->fullQuoteStr($table, 'index_config') . '
AND (
(alternative_source_pid=0 AND pid=' . $recordPid . ')
OR (alternative_source_pid=' . $recordPid . ')
)
AND records_indexonchange=1
' . BackendUtility::deleteClause('index_config'));
    foreach ($matchingConfigurations as $configuration) {
        $this->indexSingleRecord($savedRecord, $configuration);
    }
}
715
716 }