b28d30225c76d8ee4cf885a37a9337d223a42f8f
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / Classes / Hook / CrawlerHook.php
1 <?php
2 namespace TYPO3\CMS\IndexedSearch\Hook;
3
4 /*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use TYPO3\CMS\Backend\Form\FormEngine;
18 use TYPO3\CMS\Backend\Utility\BackendUtility;
19 use TYPO3\CMS\Core\Database\ConnectionPool;
20 use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
21 use TYPO3\CMS\Core\Utility\GeneralUtility;
22 use TYPO3\CMS\Core\Utility\MathUtility;
23
24 /**
25 * Crawler hook for indexed search. Works with the "crawler" extension
26 */
27 class CrawlerHook
28 {
/**
 * Number of seconds to use as interval between queued indexing operations of URLs / files (types 2 & 3)
 *
 * Multiplied with $instanceCounter and added to the current execution time
 * to compute the scheduled time of each newly queued entry.
 *
 * @var int
 */
public $secondsPerExternalUrl = 3;

/**
 * Counts up for each queued sub-item: files/folders (type 2), URLs (type 3)
 * and subpages (type 4). Used together with $secondsPerExternalUrl to spread
 * the queue entries over time.
 *
 * @var int
 */
public $instanceCounter = 0;

/**
 * Class name handed to the crawler's addQueueEntry_callBack() as callback
 * for every queue entry created by this hook.
 *
 * @var string
 */
public $callBack = CrawlerHook::class;
47
/**
 * Constructor.
 *
 * Ensures that a language object is available in $GLOBALS['LANG']. If none
 * is present, a LanguageService is created and initialised with the language
 * stored in the backend user's "uc" settings.
 */
public function __construct()
{
    // A language object already exists, nothing to set up:
    if (is_object($GLOBALS['LANG'])) {
        return;
    }
    $languageService = GeneralUtility::makeInstance(\TYPO3\CMS\Lang\LanguageService::class);
    $GLOBALS['LANG'] = $languageService;
    $languageService->init($GLOBALS['BE_USER']->uc['lang']);
}
59
/**
 * Initialization of crawler hook.
 *
 * Called for each instance of the crawler. Selects all indexing
 * configurations which are due (timer_next_indexing has passed) and not
 * running yet (set_id = 0), marks each of them as running by assigning a new
 * set-ID and the next indexing time, and puts a start entry into the crawler
 * queue depending on the configuration type. Finally, finished configurations
 * are cleaned up.
 *
 * @param object $pObj Parent object (tx_crawler lib)
 * @return void
 */
public function crawler_init(&$pObj)
{
    // Select all indexing configuration which are waiting to be activated:
    $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('index_config');
    $queryBuilder = $connection->createQueryBuilder();

    $result = $queryBuilder->select('*')
        ->from('index_config')
        ->where(
            $queryBuilder->expr()->lt('timer_next_indexing', (int)$GLOBALS['EXEC_TIME']),
            $queryBuilder->expr()->eq('set_id', 0)
        )
        ->execute();

    // Argument handed to initMessage() of custom hook objects. It was never
    // defined before; defining it once outside the loop avoids the
    // undefined-variable notice without changing what the hook receives.
    $message = null;

    // For each configuration, check if it should be executed and if so, start:
    while ($cfgRec = $result->fetch()) {
        // Generate a unique set-ID:
        $setId = GeneralUtility::md5int(microtime());
        // Get next time:
        $nextTime = $this->generateNextIndexingTime($cfgRec);
        // Start process by updating index-config record:
        $connection->update(
            'index_config',
            [
                'set_id' => $setId,
                'timer_next_indexing' => $nextTime,
                'session_data' => ''
            ],
            [
                'uid' => (int)$cfgRec['uid']
            ]
        );

        // Parameters shared by all built-in types (key order is kept as before):
        $baseParams = [
            'indexConfigUid' => $cfgRec['uid'],
            'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']']
        ];
        // Stays NULL when nothing has to be queued for this configuration:
        $params = null;

        // Based on configuration type:
        switch ($cfgRec['type']) {
            case 1:
                // RECORDS:
                $params = $baseParams + ['url' => 'Records (start)'];
                break;
            case 2:
                // FILES:
                $params = $baseParams + ['url' => $cfgRec['filepath'], 'depth' => 0];
                break;
            case 3:
                // External URL:
                $params = $baseParams + ['url' => $cfgRec['externalUrl'], 'depth' => 0];
                break;
            case 4:
                // Page tree (root page uid is transported in "url"):
                $params = $baseParams + ['url' => (int)$cfgRec['alternative_source_pid'], 'depth' => 0];
                break;
            case 5:
                // Meta configuration, nothing to do
                break;
            default:
                // Custom type: only handled when a hook is registered for it.
                if (!empty($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']])) {
                    $hookObj = GeneralUtility::getUserObj($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']]);
                    if (is_object($hookObj)) {
                        $params = [
                            'indexConfigUid' => $cfgRec['uid'],
                            'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . '/CUSTOM]'],
                            'url' => $hookObj->initMessage($message)
                        ];
                    }
                }
        }

        if ($params !== null) {
            $pObj->addQueueEntry_callBack($setId, $params, $this->callBack, $cfgRec['pid']);
        }
    }
    // Finally, look up all old index configurations which are finished and needs to be reset and done.
    $this->cleanUpOldRunningConfigurations();
}
179
/**
 * Call back function for execution of a log element
 *
 * Loads the indexing configuration referenced by $params['indexConfigUid'],
 * dispatches to the handler of the configuration type and afterwards stores
 * the (possibly modified) session data back into the configuration record.
 *
 * @param array $params Params from log element. Must contain $params['indexConfigUid']
 * @param object $pObj Parent object (tx_crawler lib)
 * @return array Result array; always ['log' => $params]
 */
public function crawler_execute($params, &$pObj)
{
    $result = ['log' => $params];

    // Indexer configuration ID must exist:
    if (empty($params['indexConfigUid'])) {
        return $result;
    }

    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
    $queryBuilder = $connectionPool->getQueryBuilderForTable('index_config');
    $queryBuilder->getRestrictions()->removeAll();
    // Load the indexing configuration record:
    $cfgRec = $queryBuilder
        ->select('*')
        ->from('index_config')
        ->where(
            $queryBuilder->expr()->eq('uid', (int)$params['indexConfigUid'])
        )
        ->execute()
        ->fetch();
    if (!is_array($cfgRec)) {
        return $result;
    }

    // Unpack session data:
    // NOTE(review): this unserializes a database field. The handlers in this
    // class only store arrays of scalars, so restricting object instantiation
    // (['allowed_classes' => false]) looks possible - confirm that no custom
    // hook stores objects before tightening this.
    $session_data = unserialize($cfgRec['session_data']);

    // Select which type:
    switch ($cfgRec['type']) {
        case 1:
            // Records:
            $this->crawler_execute_type1($cfgRec, $session_data, $params, $pObj);
            break;
        case 2:
            // Files
            $this->crawler_execute_type2($cfgRec, $session_data, $params, $pObj);
            break;
        case 3:
            // External URL:
            $this->crawler_execute_type3($cfgRec, $session_data, $params, $pObj);
            break;
        case 4:
            // Page tree:
            $this->crawler_execute_type4($cfgRec, $session_data, $params, $pObj);
            break;
        case 5:
            // Meta
            // NOOP (should never enter here!)
            break;
        default:
            // Custom type: only handled when a hook is registered for it.
            if (!empty($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']])) {
                $hookObj = GeneralUtility::getUserObj($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']]);
                if (is_object($hookObj)) {
                    // Keep the parent object for addQueueEntryForHook()
                    $this->pObj = $pObj;
                    $hookObj->indexOperation($cfgRec, $session_data, $params, $this);
                }
            }
    }

    // Save process data which might be modified:
    $connectionPool->getConnectionForTable('index_config')
        ->update(
            'index_config',
            ['session_data' => serialize($session_data)],
            ['uid' => (int)$cfgRec['uid']]
        );

    return $result;
}
250
/**
 * Indexing records from a table
 *
 * Indexes one batch of records (ordered by uid, starting after the uid
 * remembered in the session data) and, if at least one record was processed,
 * queues a follow-up entry for the next batch.
 *
 * @param array $cfgRec Indexing Configuration Record
 * @param array $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param array $params Parameters from the log queue.
 * @param object $pObj Parent object (from "crawler" extension!)
 * @return void
 */
public function crawler_execute_type1($cfgRec, &$session_data, $params, &$pObj)
{
    // Only tables known to TCA can be indexed:
    if (!$cfgRec['table2index'] || !isset($GLOBALS['TCA'][$cfgRec['table2index']])) {
        return;
    }
    // Init session data array if not already:
    if (!is_array($session_data)) {
        $session_data = [
            'uid' => 0
        ];
    }
    // Init:
    $pid = (int)$cfgRec['alternative_source_pid'] ?: (int)$cfgRec['pid'];
    $numberOfRecords = $cfgRec['recordsbatch']
        ? MathUtility::forceIntegerInRange($cfgRec['recordsbatch'], 1)
        : 100;
    $lastProcessedUid = isset($session_data['uid']) ? (int)$session_data['uid'] : 0;

    // Get root line:
    $rootLine = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
    // Select
    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getQueryBuilderForTable($cfgRec['table2index']);

    $result = $queryBuilder->select('*')
        ->from($cfgRec['table2index'])
        ->where(
            $queryBuilder->expr()->eq('pid', $pid),
            $queryBuilder->expr()->gt('uid', $lastProcessedUid)
        )
        ->setMaxResults($numberOfRecords)
        ->orderBy('uid')
        ->execute();

    // Traverse. The rows are counted here instead of asking the statement for
    // rowCount(), which is not reliable for SELECT statements on all drivers.
    $processedRecords = 0;
    while ($row = $result->fetch()) {
        // Index single record:
        $this->indexSingleRecord($row, $cfgRec, $rootLine);
        // Update the UID we last processed:
        $session_data['uid'] = $row['uid'];
        $processedRecords++;
    }

    // Finally, set entry for next indexing of batch of records:
    if ($processedRecords > 0) {
        $nparams = [
            'indexConfigUid' => $cfgRec['uid'],
            'url' => 'Records from UID#' . ($session_data['uid'] + 1) . '-?',
            'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']']
        ];
        $pObj->addQueueEntry_callBack($cfgRec['set_id'], $nparams, $this->callBack, $cfgRec['pid']);
    }
}
310
/**
 * Indexing files from fileadmin
 *
 * If $params['url'] points to a file, the file is indexed. If it points to a
 * directory, a queue entry is created for each matching file in it and - as
 * long as the configured depth is not reached - for each sub-directory.
 *
 * @param array $cfgRec Indexing Configuration Record
 * @param array $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param array $params Parameters from the log queue.
 * @param object $pObj Parent object (from "crawler" extension!)
 * @return void
 */
public function crawler_execute_type2($cfgRec, &$session_data, $params, &$pObj)
{
    // Prepare path, making it absolute and checking:
    $readpath = $params['url'];
    if (!GeneralUtility::isAbsPath($readpath)) {
        $readpath = GeneralUtility::getFileAbsFileName($readpath);
    }
    if (!GeneralUtility::isAllowedAbsPath($readpath)) {
        return;
    }

    if (@is_file($readpath)) {
        // If file, index it!
        // Get root line (need to provide this when indexing external files)
        $rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
        // (Re)-Indexing file on page.
        $indexerObj = GeneralUtility::makeInstance(\TYPO3\CMS\IndexedSearch\Indexer::class);
        $indexerObj->backend_initIndexer($cfgRec['pid'], 0, 0, '', $rl);
        $indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
        // EXPERIMENT - but to avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)
        $indexerObj->hash['phash'] = -1;
        // Index document:
        $indexerObj->indexRegularDocument(\TYPO3\CMS\Core\Utility\PathUtility::stripPathSitePrefix($readpath), true);
        return;
    }

    if (!@is_dir($readpath)) {
        return;
    }

    // If dir, read content and create new pending items for log:
    // Select files and directories in path:
    $extList = implode(',', GeneralUtility::trimExplode(',', $cfgRec['extensions'], true));
    $fileArr = [];
    $files = GeneralUtility::getAllFilesAndFoldersInPath($fileArr, $readpath, $extList, 0, 0);
    $directoryList = GeneralUtility::get_dirs($readpath);
    if (is_array($directoryList) && $params['depth'] < $cfgRec['depth']) {
        // Make sure exactly one slash separates the directory from its
        // sub-directory, also when the configured path has no trailing slash:
        $directoryPrefix = rtrim($readpath, '/') . '/';
        foreach ($directoryList as $subdir) {
            if ((string)$subdir != '') {
                $files[] = $directoryPrefix . $subdir . '/';
            }
        }
    }
    $files = GeneralUtility::removePrefixPathFromList($files, PATH_site);
    // NOTE(review): removePrefixPathFromList() appears to return an error
    // string instead of an array when a path is not below the prefix (confirm
    // against the GeneralUtility API). Nothing can be queued in that case.
    if (!is_array($files)) {
        return;
    }
    // traverse the items and create log entries:
    foreach ($files as $path) {
        $this->instanceCounter++;
        if ($path !== $params['url']) {
            // Parameters:
            $nparams = [
                'indexConfigUid' => $cfgRec['uid'],
                'url' => $path,
                'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
                'depth' => $params['depth'] + 1
            ];
            $pObj->addQueueEntry_callBack($cfgRec['set_id'], $nparams, $this->callBack, $cfgRec['pid'], $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl);
        }
    }
}
372
/**
 * Indexing External URLs
 *
 * Indexes the URL given in $params['url'] and, while the configured depth is
 * not reached, queues every link found on it that passes checkUrl() and is
 * not rejected by checkDeniedSuburls(). Queued URLs are remembered in
 * $session_data['urlLog'].
 *
 * @param array $cfgRec Indexing Configuration Record
 * @param array $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param array $params Parameters from the log queue.
 * @param object $pObj Parent object (from "crawler" extension!)
 * @return void
 */
public function crawler_execute_type3($cfgRec, &$session_data, $params, &$pObj)
{
    // First call of the session: start the URL log with the current URL
    if (!is_array($session_data)) {
        $session_data = [
            'urlLog' => [$params['url']]
        ];
    }

    // Index the URL and collect the links found on it:
    $rootLine = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
    $foundUrls = $this->indexExtUrl($params['url'], $cfgRec['pid'], $rootLine, $cfgRec['uid'], $cfgRec['set_id']);

    // Maximum depth reached, do not descend any further:
    if (!($params['depth'] < $cfgRec['depth'])) {
        return;
    }

    foreach ($foundUrls as $foundUrl) {
        $acceptedUrl = $this->checkUrl($foundUrl, $session_data['urlLog'], $cfgRec['externalUrl']);
        if (!$acceptedUrl) {
            continue;
        }
        if ($this->checkDeniedSuburls($acceptedUrl, $cfgRec['url_deny'])) {
            continue;
        }
        $this->instanceCounter++;
        $session_data['urlLog'][] = $acceptedUrl;
        // Parameters of the new queue entry:
        $queueParams = [
            'indexConfigUid' => $cfgRec['uid'],
            'url' => $acceptedUrl,
            'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
            'depth' => $params['depth'] + 1
        ];
        $scheduledTime = $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl;
        $pObj->addQueueEntry_callBack($cfgRec['set_id'], $queueParams, $this->callBack, $cfgRec['pid'], $scheduledTime);
    }
}
413
/**
 * Page tree indexing type
 *
 * Submits the URLs of the page whose uid is given in $params['url'] to the
 * crawler and, while the configured depth is not reached, queues one entry
 * per non-deleted subpage.
 *
 * @param array $cfgRec Indexing Configuration Record
 * @param array $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param array $params Parameters from the log queue.
 * @param object $pObj Parent object (from "crawler" extension!)
 * @return void
 */
public function crawler_execute_type4($cfgRec, &$session_data, $params, &$pObj)
{
    // Base page uid:
    $pageUid = (int)$params['url'];
    // Get array of URLs from page:
    $pageRow = BackendUtility::getRecord('pages', $pageUid);
    $urlSets = $pObj->getUrlsForPageRow($pageRow);
    // Registry for duplicates
    $duplicateTrack = [];
    // Dummy.
    $downloadUrls = [];
    // Submit URLs:
    if (!empty($urlSets)) {
        foreach ($urlSets as $urlSet) {
            $pObj->urlListFromUrlArray($urlSet, $pageRow, $GLOBALS['EXEC_TIME'], 30, 1, 0, $duplicateTrack, $downloadUrls, ['tx_indexedsearch_reindex']);
        }
    }

    // Maximum depth reached, subpages are not queued:
    if (!($params['depth'] < $cfgRec['depth'])) {
        return;
    }

    // Select subpages (only the deleted-restriction applies):
    $subpageQuery = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('pages');
    $subpageQuery->getRestrictions()
        ->removeAll()
        ->add(GeneralUtility::makeInstance(DeletedRestriction::class));
    $subpages = $subpageQuery->select('uid', 'title')
        ->from('pages')
        ->where($subpageQuery->expr()->eq('pid', $pageUid))
        ->execute();

    // Traverse subpages and add to queue:
    while ($subpage = $subpages->fetch()) {
        $this->instanceCounter++;
        $session_data['urlLog'][] = 'pages:' . $subpage['uid'] . ': ' . $subpage['title'];
        // Parameters:
        $queueParams = [
            'indexConfigUid' => $cfgRec['uid'],
            'url' => $subpage['uid'],
            'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
            'depth' => $params['depth'] + 1
        ];
        $pObj->addQueueEntry_callBack(
            $cfgRec['set_id'],
            $queueParams,
            $this->callBack,
            $cfgRec['pid'],
            $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl
        );
    }
}
473
/**
 * Look up all old index configurations which are finished and needs to be reset and done
 *
 * A running configuration (set_id != 0) counts as finished when the crawler
 * queue holds no more unprocessed entries (exec_time = 0) for its set-ID.
 * For finished configurations, index data of previous runs (same
 * freeIndexUid, different freeIndexSetId) is removed and the configuration
 * is reset to "not running".
 *
 * @return void
 */
public function cleanUpOldRunningConfigurations()
{
    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
    // List of tables that store information related to the phash value
    $tablesToClean = [
        'index_phash',
        'index_rel',
        'index_section',
        'index_grlist',
        'index_fulltext',
        'index_debug'
    ];

    $queryBuilder = $connectionPool->getQueryBuilderForTable('index_config');
    $queryBuilder->getRestrictions()
        ->removeAll()
        ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

    // Lookup running index configurations:
    $runningIndexingConfigurations = $queryBuilder->select('*')
        ->from('index_config')
        ->where($queryBuilder->expr()->neq('set_id', 0))
        ->execute()
        ->fetchAll();
    // For each running configuration, look up how many log entries there are which are scheduled
    // for execution and if none, clear the "set_id" (means; Processing was DONE)
    foreach ($runningIndexingConfigurations as $cfgRec) {
        // Look for ended processes:
        $queued_items = $connectionPool->getConnectionForTable('tx_crawler_queue')
            ->count(
                '*',
                'tx_crawler_queue',
                [
                    'set_id' => (int)$cfgRec['set_id'],
                    'exec_time' => 0
                ]
            );
        if ($queued_items) {
            // Still entries waiting for execution, the process is not done yet
            continue;
        }

        // Lookup old phash rows:
        $queryBuilder = $connectionPool->getQueryBuilderForTable('index_phash');
        $oldPhashRows = $queryBuilder
            ->select('phash')
            ->from('index_phash')
            ->where(
                $queryBuilder->expr()->eq('freeIndexUid', (int)$cfgRec['uid']),
                $queryBuilder->expr()->neq('freeIndexSetId', (int)$cfgRec['set_id'])
            )
            ->execute()
            ->fetchAll();

        $oldPhashRows = array_map('intval', array_column($oldPhashRows, 'phash'));
        // Removing old registrations for all tables. Skipped when there is
        // nothing to remove, since an empty list would result in an invalid
        // "IN ()" expression.
        if (!empty($oldPhashRows)) {
            foreach ($tablesToClean as $table) {
                $queryBuilder = $connectionPool->getQueryBuilderForTable($table);
                $queryBuilder->delete($table)
                    ->where($queryBuilder->expr()->in('phash', $oldPhashRows))
                    ->execute();
            }
        }

        // End process by updating index-config record:
        $connectionPool->getConnectionForTable('index_config')
            ->update(
                'index_config',
                [
                    'set_id' => 0,
                    'session_data' => ''
                ],
                ['uid' => (int)$cfgRec['uid']]
            );
    }
}
551
552 /*****************************************
553 *
554 * Helper functions
555 *
556 *****************************************/
/**
 * Checks whether an input URL is allowed to be indexed.
 *
 * A trailing double slash is reduced to a single slash and everything from
 * the first "#" on is cut off. The resulting URL is accepted if it contains
 * no "../", starts with the base URL and is not present in the URL log yet.
 *
 * @param string $url URL string to check
 * @param array $urlLog Array of already indexed URLs (input url is looked up here and must not exist already)
 * @param string $baseUrl Base URL of the indexing process (input URL must be "inside" the base URL!)
 * @return string|null Returns the cleaned URL if OK, otherwise NULL
 */
public function checkUrl($url, $urlLog, $baseUrl)
{
    $url = preg_replace('/\\/\\/$/', '/', $url);
    // Drop the fragment part:
    $url = explode('#', $url)[0];

    // No back paths allowed:
    if (strpos($url, '../') !== false) {
        return null;
    }
    // Must be located "inside" the base URL:
    if (!GeneralUtility::isFirstPartOfStr($url, $baseUrl)) {
        return null;
    }
    // Must not have been seen before:
    if (in_array($url, $urlLog)) {
        return null;
    }
    return $url;
}
577
/**
 * Indexing External URL
 *
 * Indexes the given URL and returns all hyperlinks found in its content.
 * Links without a scheme are made absolute: links starting with "/" are
 * prefixed with scheme and host (and port, if any) of $url, all others with
 * the base href of the document or - if there is none - the directory part
 * of $url.
 *
 * @param string $url URL, http://....
 * @param int $pageId Page id to relate indexing to.
 * @param array $rl Rootline array to relate indexing to
 * @param int $cfgUid Configuration UID
 * @param int $setId Set ID value
 * @return array URLs found on this page
 */
public function indexExtUrl($url, $pageId, $rl, $cfgUid, $setId)
{
    // Index external URL:
    $indexerObj = GeneralUtility::makeInstance(\TYPO3\CMS\IndexedSearch\Indexer::class);
    $indexerObj->backend_initIndexer($pageId, 0, 0, '', $rl);
    $indexerObj->backend_setFreeIndexUid($cfgUid, $setId);
    // To avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)
    $indexerObj->hash['phash'] = -1;
    $indexerObj->indexExternalUrl($url);

    // parse_url() returns FALSE for seriously malformed URLs and omits the
    // keys of missing components, so every part is read defensively:
    $url_qParts = parse_url($url);
    if (!is_array($url_qParts)) {
        $url_qParts = [];
    }
    $scheme = isset($url_qParts['scheme']) ? $url_qParts['scheme'] : '';
    $host = isset($url_qParts['host']) ? $url_qParts['host'] : '';
    $path = isset($url_qParts['path']) ? $url_qParts['path'] : '';
    $baseAbsoluteHref = $scheme . '://' . $host;
    if (isset($url_qParts['port'])) {
        // Keep a non-default port, otherwise the resolved links would point to a different origin
        $baseAbsoluteHref .= ':' . $url_qParts['port'];
    }

    $baseHref = $indexerObj->extractBaseHref($indexerObj->indexExternalUrl_content);
    if (!$baseHref) {
        // Extract base href from current URL (everything up to the last slash of the path)
        $baseHref = $baseAbsoluteHref;
        $lastSlashPosition = strrpos($path, '/');
        if ($lastSlashPosition !== false) {
            $baseHref .= substr($path, 0, $lastSlashPosition);
        }
    }
    $baseHref = rtrim($baseHref, '/');

    // Get URLs on this page:
    $subUrls = [];
    $list = $indexerObj->extractHyperLinks($indexerObj->indexExternalUrl_content);
    // Traverse links:
    foreach ($list as $linkInfo) {
        // Decode entities:
        $subUrl = htmlspecialchars_decode($linkInfo['href']);
        $qParts = parse_url($subUrl);
        if (empty($qParts['scheme'])) {
            $relativeUrl = GeneralUtility::resolveBackPath($subUrl);
            if (strpos($relativeUrl, '/') === 0) {
                // Relative to the document root
                $subUrl = $baseAbsoluteHref . $relativeUrl;
            } else {
                // Relative to the current directory / base href
                $subUrl = $baseHref . '/' . $relativeUrl;
            }
        }
        $subUrls[] = $subUrl;
    }
    return $subUrls;
}
626
/**
 * Indexing Single Record
 *
 * The first field of the configured field list is used as title, the values
 * of all other fields are concatenated (separated by a space) as content.
 * HTML tags are stripped from both before indexing.
 *
 * @param array $r Record to index
 * @param array $cfgRec Configuration Record
 * @param array $rl Rootline array to relate indexing to
 * @return void
 */
public function indexSingleRecord($r, $cfgRec, $rl = null)
{
    // Init:
    $rl = is_array($rl) ? $rl : $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
    $fieldList = GeneralUtility::trimExplode(',', $cfgRec['fieldlist'], true);
    // Not every table defines languageField / tstamp / crdate in its ctrl section:
    $ctrl = isset($GLOBALS['TCA'][$cfgRec['table2index']]['ctrl'])
        ? $GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']
        : [];
    $languageField = isset($ctrl['languageField']) ? $ctrl['languageField'] : '';
    $sys_language_uid = $languageField && isset($r[$languageField]) ? $r[$languageField] : 0;
    $modificationTime = isset($ctrl['tstamp'], $r[$ctrl['tstamp']]) ? $r[$ctrl['tstamp']] : null;
    $creationTime = isset($ctrl['crdate'], $r[$ctrl['crdate']]) ? $r[$ctrl['crdate']] : null;

    // (Re)-Indexing a row from a table:
    $indexerObj = GeneralUtility::makeInstance(\TYPO3\CMS\IndexedSearch\Indexer::class);
    parse_str(str_replace('###UID###', $r['uid'], $cfgRec['get_params']), $GETparams);
    $indexerObj->backend_initIndexer($cfgRec['pid'], 0, $sys_language_uid, '', $rl, $GETparams, (bool)$cfgRec['chashcalc']);
    $indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
    $indexerObj->forceIndexing = true;

    // The title stays empty when the field list is empty:
    $theTitle = '';
    $theContent = '';
    foreach ($fieldList as $k => $v) {
        $fieldValue = isset($r[$v]) ? $r[$v] : '';
        if (!$k) {
            $theTitle = $fieldValue;
        } else {
            $theContent .= $fieldValue . ' ';
        }
    }

    // Indexing the record as a page (but with parameters set, see ->backend_setFreeIndexUid())
    // A space is put in front of every "<" so that words separated only by tags do not merge.
    $indexerObj->backend_indexAsTYPO3Page(
        strip_tags(str_replace('<', ' <', $theTitle)),
        '',
        '',
        strip_tags(str_replace('<', ' <', $theContent)),
        'utf-8',
        $modificationTime,
        $creationTime,
        $r['uid']
    );
}
659
/**
 * Get rootline for closest TypoScript template root.
 * Algorithm same as used in Web > Template, Object browser
 *
 * Runs the template service through the rootline of the given page and
 * returns the uid of each entry of the template service's resulting rootline.
 *
 * @param int $id The page id to traverse rootline back from
 * @return array Array where the root lines uid values are found.
 */
public function getUidRootLineForClosestTemplate($id)
{
    $templateService = GeneralUtility::makeInstance(\TYPO3\CMS\Core\TypoScript\ExtendedTemplateService::class);
    $templateService->init();

    // Fetch the rootline of the page:
    $pageRepository = GeneralUtility::makeInstance(\TYPO3\CMS\Frontend\Page\PageRepository::class);
    $pageRootLine = $pageRepository->getRootLine($id);

    // This generates the constants/config + hierarchy info for the template.
    $templateService->runThroughTemplates($pageRootLine, 0);

    // Reduce each rootline entry to its uid, keeping the keys:
    $uidsByLevel = [];
    foreach ($templateService->rootLine as $level => $pageRecord) {
        $uidsByLevel[$level] = $pageRecord['uid'];
    }
    return $uidsByLevel;
}
683
/**
 * Generate the unix time stamp for next visit.
 *
 * Starting from a midnight plus the configured offset, the result is the
 * first multiple of the frequency which is not before the current execution
 * time.
 *
 * @param array $cfgRec Index configuration record
 * @return int The next time stamp
 */
public function generateNextIndexingTime($cfgRec)
{
    $currentTime = (int)$GLOBALS['EXEC_TIME'];
    // Now, find a midnight time to use for offset calculation. This has to differ depending on whether
    // we have frequencies within a day or more than a day; Less than a day, we don't care which day to
    // use for offset, more than a day we want to respect the currently entered day as offset regardless
    // of when the script is run - thus the day-of-week used in case "Weekly" is selected will be respected
    if ($cfgRec['timer_frequency'] <= 24 * 3600) {
        $aMidNight = mktime(0, 0, 0) - 1 * 24 * 3600;
    } else {
        $lastTime = (int)($cfgRec['timer_next_indexing'] ?: $GLOBALS['EXEC_TIME']);
        // Four-digit year, so the result does not depend on mktime()'s two-digit year mapping
        $aMidNight = mktime(0, 0, 0, (int)date('n', $lastTime), (int)date('j', $lastTime), (int)date('Y', $lastTime));
    }
    // Find last offset time plus frequency in seconds:
    $lastSureOffset = $aMidNight + MathUtility::forceIntegerInRange($cfgRec['timer_offset'], 0, 86400);
    $frequencySeconds = MathUtility::forceIntegerInRange($cfgRec['timer_frequency'], 1);
    // Now, find out how many blocks of the length of frequency there is until the next time.
    // ceil() returns a float; cast it so that an integer time stamp is returned as documented.
    $frequencyBlocksUntilNextTime = (int)ceil(($currentTime - $lastSureOffset) / $frequencySeconds);
    // Set next time to the offset + the frequencyblocks multiplied with the frequency length in seconds.
    return $lastSureOffset + $frequencyBlocksUntilNextTime * $frequencySeconds;
}
708
/**
 * Checks if $url starts with any of the URLs in the $url_deny "list" and if so, returns TRUE.
 *
 * @param string $url URL to test
 * @param string $url_deny String where URLs are separated by line-breaks; If any of these strings is the first part of $url, the function returns TRUE (to indicate denial of descent)
 * @return bool TRUE if there is a matching URL (hence, do not index!)
 */
public function checkDeniedSuburls($url, $url_deny)
{
    // Without a deny list nothing can be denied:
    if (!trim($url_deny)) {
        return false;
    }
    foreach (GeneralUtility::trimExplode(LF, $url_deny, true) as $deniedPrefix) {
        if (GeneralUtility::isFirstPartOfStr($url, $deniedPrefix)) {
            return true;
        }
    }
    return false;
}
728
/**
 * Adding entry in queue for Hook
 *
 * Uses the parent object stored in $this->pObj, which crawler_execute()
 * sets before calling a custom hook object.
 *
 * @param array $cfgRec Configuration record
 * @param string $title Title/URL
 * @return void
 */
public function addQueueEntryForHook($cfgRec, $title)
{
    // "indexConfigUid" must ALWAYS be the cfgRec uid!
    $configurationUid = $cfgRec['uid'];
    $queueParams = [
        'indexConfigUid' => $configurationUid,
        'url' => $title,
        'procInstructions' => ['[Index Cfg UID#' . $configurationUid . ']']
    ];
    $this->pObj->addQueueEntry_callBack(
        $cfgRec['set_id'],
        $queueParams,
        $this->callBack,
        $cfgRec['pid']
    );
}
746
/**
 * Deletes all data stored by indexed search for a given page
 *
 * Collects the phash values registered in index_section for the page and
 * removes all rows with these phash values from the index tables.
 *
 * @param int $id Uid of the page to delete all pHash
 * @return void
 */
public function deleteFromIndex($id)
{
    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

    // Lookup old phash rows:
    $sectionQuery = $connectionPool->getQueryBuilderForTable('index_section');
    $sectionRows = $sectionQuery->select('phash')
        ->from('index_section')
        ->where($sectionQuery->expr()->eq('page_id', (int)$id))
        ->execute()
        ->fetchAll();

    // Nothing indexed for this page:
    if (empty($sectionRows)) {
        return;
    }

    $pHashesToDelete = [];
    foreach ($sectionRows as $sectionRow) {
        $pHashesToDelete[] = (int)$sectionRow['phash'];
    }

    $indexTables = [
        'index_debug',
        'index_fulltext',
        'index_grlist',
        'index_phash',
        'index_rel',
        'index_section',
    ];
    foreach ($indexTables as $indexTable) {
        $deleteQuery = $connectionPool->getQueryBuilderForTable($indexTable);
        $deleteQuery->delete($indexTable)
            ->where($deleteQuery->expr()->in('phash', $pHashesToDelete))
            ->execute();
    }
}
786
787 /*************************
788 *
789 * Hook functions for TCEmain (indexing of records)
790 *
791 *************************/
/**
 * TCEmain hook function called before a command is processed.
 * Removes the index data of a page when the page is about to be deleted.
 *
 * @param string $command TCEmain command
 * @param string $table Table name
 * @param string $id Record ID
 * @param mixed $value Target value (ignored)
 * @param object $pObj tcemain calling object (not used here)
 * @return void
 */
public function processCmdmap_preProcess($command, $table, $id, $value, $pObj)
{
    // Only the deletion of pages is of interest:
    if ($command !== 'delete' || $table !== 'pages') {
        return;
    }
    // Clean up the index
    $this->deleteFromIndex($id);
}
809
/**
 * TCEmain hook function for on-the-fly indexing of database records
 *
 * Removes the index of a page which gets hidden or excluded from search by
 * an update. Afterwards the record is indexed for every matching, not
 * running "records" indexing configuration that has indexing on change
 * enabled.
 *
 * @param string $status Status "new" or "update"
 * @param string $table Table name
 * @param string $id Record ID. If new record its a string pointing to index inside \TYPO3\CMS\Core\DataHandling\DataHandler::substNEWwithIDs
 * @param array $fieldArray Field array of updated fields in the operation
 * @param object $pObj tcemain calling object; its substNEWwithIDs property is read for new records
 * @return void
 */
public function processDatamap_afterDatabaseOperations($status, $table, $id, $fieldArray, $pObj)
{
    // Check if any fields are actually updated:
    if (empty($fieldArray)) {
        return;
    }

    if ($status === 'new') {
        // Translate new ids.
        $id = $pObj->substNEWwithIDs[$id];
    } elseif ($table === 'pages' && $status === 'update') {
        $becomesHidden = array_key_exists('hidden', $fieldArray) && $fieldArray['hidden'] == 1;
        $excludedFromSearch = array_key_exists('no_search', $fieldArray) && $fieldArray['no_search'] == 1;
        if ($becomesHidden || $excludedFromSearch) {
            // If the page should be hidden or not indexed after update, delete index for this page
            $this->deleteFromIndex($id);
        }
    }

    // Get full record and if exists, search for indexing configurations:
    $currentRecord = BackendUtility::getRecord($table, $id);
    if (!is_array($currentRecord)) {
        return;
    }
    $recordPid = (int)$currentRecord['pid'];

    // Select all (not running) indexing configurations of type "record" (1) and
    // which points to this table and is located on the same page as the record
    // or pointing to the right source PID
    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getQueryBuilderForTable('index_config');
    $expression = $queryBuilder->expr();
    $configurations = $queryBuilder->select('*')
        ->from('index_config')
        ->where(
            $expression->eq('set_id', 0),
            $expression->eq('type', 1),
            $expression->eq('table2index', $queryBuilder->createNamedParameter($table)),
            $expression->orX(
                $expression->andX(
                    $expression->eq('alternative_source_pid', 0),
                    $expression->eq('pid', $recordPid)
                ),
                $expression->eq('alternative_source_pid', $recordPid)
            ),
            $expression->eq('records_indexonchange', 1)
        )
        ->execute();

    while ($configuration = $configurations->fetch()) {
        $this->indexSingleRecord($currentRecord, $configuration);
    }
}
863 }