[TASK] Use a reference variable to pass $this into hooks
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / Classes / Hook / CrawlerHook.php
1 <?php
2 namespace TYPO3\CMS\IndexedSearch\Hook;
3
4 /*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use TYPO3\CMS\Backend\Utility\BackendUtility;
18 use TYPO3\CMS\Core\Core\Environment;
19 use TYPO3\CMS\Core\Database\Connection;
20 use TYPO3\CMS\Core\Database\ConnectionPool;
21 use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
22 use TYPO3\CMS\Core\DataHandling\DataHandler;
23 use TYPO3\CMS\Core\Exception\Page\RootLineException;
24 use TYPO3\CMS\Core\Utility\GeneralUtility;
25 use TYPO3\CMS\Core\Utility\MathUtility;
26 use TYPO3\CMS\Core\Utility\RootlineUtility;
27
28 /**
29 * Crawler hook for indexed search. Works with the "crawler" extension
30 * @internal this is a TYPO3-internal hook implementation and not part of TYPO3's Core API.
31 */
32 class CrawlerHook
33 {
/**
 * Number of seconds to use as interval between queued indexing operations of URLs / files (types 2 & 3).
 * The page tree handler (type 4) uses the same interval when scheduling sub pages.
 *
 * @var int
 */
public $secondsPerExternalUrl = 3;

/**
 * Counts up for each queue entry added by the file, external URL and page tree handlers.
 * Multiplied with $secondsPerExternalUrl to compute the scheduled time of the entry.
 *
 * @var int
 */
public $instanceCounter = 0;

/**
 * Class name passed to the crawler as call back for every queue entry added by this hook.
 *
 * @var string
 */
public $callBack = self::class;

/**
 * Parent crawler object; set in crawler_init() and (for custom types) in crawler_execute(),
 * read by addQueueEntryForHook().
 *
 * @var object
 */
private $pObj;
57
/**
 * Initialization of crawler hook.
 *
 * This function is asked for each instance of the crawler. It selects all indexing
 * configurations which are due (timer_next_indexing earlier than the current execution time)
 * and not running (set_id = 0), marks each of them as started and puts the first entry for
 * it into the crawler queue. Finally, finished configurations are reset.
 *
 * @param object $pObj Parent object (tx_crawler lib)
 */
public function crawler_init(&$pObj)
{
    $this->pObj = $pObj;

    $message = null;
    // Select all indexing configurations which are waiting to be activated:
    $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('index_config');
    $queryBuilder = $connection->createQueryBuilder();

    $result = $queryBuilder->select('*')
        ->from('index_config')
        ->where(
            $queryBuilder->expr()->lt(
                'timer_next_indexing',
                $queryBuilder->createNamedParameter($GLOBALS['EXEC_TIME'], \PDO::PARAM_INT)
            ),
            $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter(0, \PDO::PARAM_INT))
        )
        ->execute();

    // For each configuration, start the process and queue the initial entry:
    while ($cfgRec = $result->fetch()) {
        // Generate a unique set-ID:
        $setId = GeneralUtility::md5int(microtime());
        // Get next time:
        $nextTime = $this->generateNextIndexingTime($cfgRec);
        // Start process by updating index-config record:
        $connection->update(
            'index_config',
            [
                'set_id' => $setId,
                'timer_next_indexing' => $nextTime,
                'session_data' => ''
            ],
            [
                'uid' => (int)$cfgRec['uid']
            ]
        );

        // Parameters of the initial queue entry; stays NULL when nothing has to be queued.
        $params = null;
        $procInstructions = ['[Index Cfg UID#' . $cfgRec['uid'] . ']'];
        switch ($cfgRec['type']) {
            case 1:
                // Records
                $params = [
                    'indexConfigUid' => $cfgRec['uid'],
                    'procInstructions' => $procInstructions,
                    'url' => 'Records (start)'
                ];
                break;
            case 2:
                // Files
                $params = [
                    'indexConfigUid' => $cfgRec['uid'],
                    'procInstructions' => $procInstructions,
                    'url' => $cfgRec['filepath'],
                    'depth' => 0
                ];
                break;
            case 3:
                // External URL
                $params = [
                    'indexConfigUid' => $cfgRec['uid'],
                    'procInstructions' => $procInstructions,
                    'url' => $cfgRec['externalUrl'],
                    'depth' => 0
                ];
                break;
            case 4:
                // Page tree (the "url" is the uid of the root page)
                $params = [
                    'indexConfigUid' => $cfgRec['uid'],
                    'procInstructions' => $procInstructions,
                    'url' => (int)$cfgRec['alternative_source_pid'],
                    'depth' => 0
                ];
                break;
            case 5:
                // Meta configuration, nothing to do
                break;
            default:
                // Custom type: only handled when a hook class is registered for it.
                // The lookup is guarded so an unregistered type does not trigger an undefined index error.
                $hookClass = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']] ?? null;
                if ($hookClass) {
                    $hookObj = GeneralUtility::makeInstance($hookClass);
                    $params = [
                        'indexConfigUid' => $cfgRec['uid'],
                        'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . '/CUSTOM]'],
                        'url' => $hookObj->initMessage($message)
                    ];
                }
        }

        if ($params !== null) {
            $pObj->addQueueEntry_callBack($setId, $params, $this->callBack, $cfgRec['pid']);
        }
    }
    // Finally, look up all old index configurations which are finished and need to be reset and done.
    $this->cleanUpOldRunningConfigurations();
}
180
/**
 * Call back function for execution of a log element.
 *
 * Loads the indexing configuration referenced by $params['indexConfigUid'], dispatches to the
 * handler matching the configuration type and stores the (possibly modified) session data
 * back into the configuration record.
 *
 * @param array $params Params from log element. Must contain $params['indexConfigUid']
 * @param object $pObj Parent object (tx_crawler lib)
 * @return array Result array with the log element params in key "log"
 */
public function crawler_execute($params, &$pObj)
{
    // Indexer configuration ID must exist (empty() also covers a missing key):
    if (empty($params['indexConfigUid'])) {
        return ['log' => $params];
    }

    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getQueryBuilderForTable('index_config');
    $queryBuilder->getRestrictions()->removeAll();
    // Load the indexing configuration record:
    $cfgRec = $queryBuilder
        ->select('*')
        ->from('index_config')
        ->where(
            $queryBuilder->expr()->eq(
                'uid',
                $queryBuilder->createNamedParameter($params['indexConfigUid'], \PDO::PARAM_INT)
            )
        )
        ->execute()
        ->fetch();
    if (!is_array($cfgRec)) {
        return ['log' => $params];
    }

    // Unpack session data.
    // NOTE(review): unserialize() may instantiate arbitrary classes; the data is written by this
    // class itself, but consider restricting it with ['allowed_classes' => false] if no hook
    // stores objects here.
    $session_data = unserialize($cfgRec['session_data']);
    // Select which type:
    switch ($cfgRec['type']) {
        case 1:
            // Records:
            $this->crawler_execute_type1($cfgRec, $session_data, $params, $pObj);
            break;
        case 2:
            // Files
            $this->crawler_execute_type2($cfgRec, $session_data, $params, $pObj);
            break;
        case 3:
            // External URL:
            $this->crawler_execute_type3($cfgRec, $session_data, $params, $pObj);
            break;
        case 4:
            // Page tree:
            $this->crawler_execute_type4($cfgRec, $session_data, $params, $pObj);
            break;
        case 5:
            // Meta
            // NOOP (should never enter here!)
            break;
        default:
            // Custom type: guarded lookup so an unregistered type does not raise an undefined index error.
            $hookClass = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']] ?? null;
            if ($hookClass) {
                $hookObj = GeneralUtility::makeInstance($hookClass);
                // For addQueueEntryForHook()
                $this->pObj = $pObj;
                $ref = $this; // introduced for phpstan to not lose type information when passing $this into callUserFunction
                $hookObj->indexOperation($cfgRec, $session_data, $params, $ref);
            }
    }
    // Save process data which might be modified:
    GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_config')
        ->update(
            'index_config',
            ['session_data' => serialize($session_data)],
            ['uid' => (int)$cfgRec['uid']]
        );

    return ['log' => $params];
}
253
/**
 * Indexing records from a table.
 *
 * Indexes one batch of records (ordered by uid, starting after the uid remembered in the
 * session data) and queues a follow-up entry when the count query reports rows.
 *
 * @param array $cfgRec Indexing Configuration Record
 * @param array $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param array $params Parameters from the log queue.
 * @param object $pObj Parent object (from "crawler" extension!)
 */
public function crawler_execute_type1($cfgRec, &$session_data, $params, &$pObj)
{
    $tableName = $cfgRec['table2index'];
    // Only tables known to TCA can be indexed.
    if (!$tableName || !isset($GLOBALS['TCA'][$tableName])) {
        return;
    }

    // Start at uid 0 when no session data exists yet.
    if (!is_array($session_data)) {
        $session_data = ['uid' => 0];
    }

    $sourcePid = (int)$cfgRec['alternative_source_pid'] ?: $cfgRec['pid'];
    $batchSize = 100;
    if ($cfgRec['recordsbatch']) {
        $batchSize = MathUtility::forceIntegerInRange($cfgRec['recordsbatch'], 1);
    }

    // Root line the indexed records are related to.
    $rootLine = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);

    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getQueryBuilderForTable($tableName);
    $onSourcePage = $queryBuilder->expr()->eq(
        'pid',
        $queryBuilder->createNamedParameter($sourcePid, \PDO::PARAM_INT)
    );
    $afterLastUid = $queryBuilder->expr()->gt(
        'uid',
        $queryBuilder->createNamedParameter($session_data['uid'], \PDO::PARAM_INT)
    );
    $recordQuery = $queryBuilder->select('*')
        ->from($tableName)
        ->where($onSourcePage, $afterLastUid);

    $records = $recordQuery
        ->setMaxResults($batchSize)
        ->orderBy('uid')
        ->execute();

    while ($record = $records->fetch()) {
        $this->indexSingleRecord($record, $cfgRec, $rootLine);
        // Remember the uid processed last.
        $session_data['uid'] = $record['uid'];
    }

    // Queue the next batch when the count query reports rows.
    $rowCount = $recordQuery->count('uid')->execute()->fetchColumn(0);
    if (!$rowCount) {
        return;
    }
    $nextParams = [
        'indexConfigUid' => $cfgRec['uid'],
        'url' => 'Records from UID#' . ($session_data['uid'] + 1) . '-?',
        'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']']
    ];
    $pObj->addQueueEntry_callBack($cfgRec['set_id'], $nextParams, $this->callBack, $cfgRec['pid']);
}
320
/**
 * Indexing files from fileadmin.
 *
 * A file is indexed directly; for a directory, one queue entry per contained file
 * (and per sub directory while the configured depth is not reached) is added.
 *
 * @param array $cfgRec Indexing Configuration Record
 * @param array $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param array $params Parameters from the log queue.
 * @param object $pObj Parent object (from "crawler" extension!)
 */
public function crawler_execute_type2($cfgRec, &$session_data, $params, &$pObj)
{
    // Make the path absolute and verify it is allowed.
    $absolutePath = $params['url'];
    if (!GeneralUtility::isAbsPath($absolutePath)) {
        $absolutePath = GeneralUtility::getFileAbsFileName($absolutePath);
    }
    if (!GeneralUtility::isAllowedAbsPath($absolutePath)) {
        return;
    }

    if (@is_file($absolutePath)) {
        // Single file: (re-)index it, related to the root line of the configuration's page.
        $rootLine = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
        $indexer = GeneralUtility::makeInstance(\TYPO3\CMS\IndexedSearch\Indexer::class);
        $indexer->backend_initIndexer($cfgRec['pid'], 0, 0, '', $rootLine);
        $indexer->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
        // EXPERIMENT - but to avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)
        $indexer->hash['phash'] = -1;
        $indexer->indexRegularDocument(\TYPO3\CMS\Core\Utility\PathUtility::stripPathSitePrefix($absolutePath), true);
        return;
    }

    if (!@is_dir($absolutePath)) {
        return;
    }

    // Directory: collect files with the configured extensions ...
    $extensionList = implode(',', GeneralUtility::trimExplode(',', $cfgRec['extensions'], true));
    $collected = [];
    $entries = GeneralUtility::getAllFilesAndFoldersInPath($collected, $absolutePath, $extensionList, 0, 0);
    // ... plus sub directories, as long as the configured depth allows descending.
    $subDirectories = GeneralUtility::get_dirs($absolutePath);
    if (is_array($subDirectories) && $params['depth'] < $cfgRec['depth']) {
        foreach ($subDirectories as $directoryName) {
            if ((string)$directoryName !== '') {
                $entries[] = $absolutePath . $directoryName . '/';
            }
        }
    }
    $entries = GeneralUtility::removePrefixPathFromList($entries, Environment::getPublicPath() . '/');

    // Add one queue entry per item; the counter advances for every item, also for skipped ones.
    foreach ($entries as $entry) {
        $this->instanceCounter++;
        if ($entry === $params['url']) {
            continue;
        }
        $entryParams = [
            'indexConfigUid' => $cfgRec['uid'],
            'url' => $entry,
            'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
            'depth' => $params['depth'] + 1
        ];
        $scheduledTime = $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl;
        $pObj->addQueueEntry_callBack($cfgRec['set_id'], $entryParams, $this->callBack, $cfgRec['pid'], $scheduledTime);
    }
}
381
/**
 * Indexing External URLs.
 *
 * Indexes the URL of the queue entry and, while the configured depth is not reached,
 * queues every accepted link found on it.
 *
 * @param array $cfgRec Indexing Configuration Record
 * @param array $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param array $params Parameters from the log queue.
 * @param object $pObj Parent object (from "crawler" extension!)
 */
public function crawler_execute_type3($cfgRec, &$session_data, $params, &$pObj)
{
    // First call of the session: start the URL log with the current URL.
    if (!is_array($session_data)) {
        $session_data = ['urlLog' => [$params['url']]];
    }

    $rootLine = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
    $foundUrls = $this->indexExtUrl($params['url'], $cfgRec['pid'], $rootLine, $cfgRec['uid'], $cfgRec['set_id']);

    // Do not descend any further once the configured depth is reached.
    if ($params['depth'] >= $cfgRec['depth']) {
        return;
    }

    foreach ($foundUrls as $candidate) {
        $acceptedUrl = $this->checkUrl($candidate, $session_data['urlLog'], $cfgRec['externalUrl']);
        if (!$acceptedUrl || $this->checkDeniedSuburls($acceptedUrl, $cfgRec['url_deny'])) {
            continue;
        }
        $this->instanceCounter++;
        $session_data['urlLog'][] = $acceptedUrl;
        $entryParams = [
            'indexConfigUid' => $cfgRec['uid'],
            'url' => $acceptedUrl,
            'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
            'depth' => $params['depth'] + 1
        ];
        $scheduledTime = $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl;
        $pObj->addQueueEntry_callBack($cfgRec['set_id'], $entryParams, $this->callBack, $cfgRec['pid'], $scheduledTime);
    }
}
421
/**
 * Page tree indexing type.
 *
 * Submits the URLs of the page given in $params['url'] to the crawler and, while the
 * configured depth is not reached, queues all non-deleted sub pages.
 *
 * @param array $cfgRec Indexing Configuration Record
 * @param array $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param array $params Parameters from the log queue.
 * @param object $pObj Parent object (from "crawler" extension!)
 */
public function crawler_execute_type4($cfgRec, &$session_data, $params, &$pObj)
{
    // The "url" of a page tree entry is the uid of the page.
    $pageUid = (int)$params['url'];
    $pageRow = BackendUtility::getRecord('pages', $pageUid);
    $urlSets = $pObj->getUrlsForPageRow($pageRow);
    // Registry for duplicates
    $duplicateTrack = [];
    // Dummy.
    $downloadUrls = [];
    // Submit URLs:
    if (!empty($urlSets)) {
        foreach ($urlSets as $urlSet) {
            $pObj->urlListFromUrlArray($urlSet, $pageRow, $GLOBALS['EXEC_TIME'], 30, 1, 0, $duplicateTrack, $downloadUrls, ['tx_indexedsearch_reindex']);
        }
    }

    // Do not descend any further once the configured depth is reached.
    if ($params['depth'] >= $cfgRec['depth']) {
        return;
    }

    // Select the sub pages, ignoring all restrictions except "deleted".
    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('pages');
    $queryBuilder->getRestrictions()
        ->removeAll()
        ->add(GeneralUtility::makeInstance(DeletedRestriction::class));
    $isChildOfPage = $queryBuilder->expr()->eq(
        'pid',
        $queryBuilder->createNamedParameter($pageUid, \PDO::PARAM_INT)
    );
    $subPages = $queryBuilder->select('uid', 'title')
        ->from('pages')
        ->where($isChildOfPage)
        ->execute();

    // Add every sub page to the queue:
    while ($subPage = $subPages->fetch()) {
        $this->instanceCounter++;
        $session_data['urlLog'][] = 'pages:' . $subPage['uid'] . ': ' . $subPage['title'];
        $entryParams = [
            'indexConfigUid' => $cfgRec['uid'],
            'url' => $subPage['uid'],
            'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
            'depth' => $params['depth'] + 1
        ];
        $scheduledTime = $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl;
        $pObj->addQueueEntry_callBack($cfgRec['set_id'], $entryParams, $this->callBack, $cfgRec['pid'], $scheduledTime);
    }
}
485
/**
 * Look up all old index configurations which are finished and need to be reset and done.
 *
 * A configuration counts as finished when it has a set_id but no crawler queue entry of
 * that set is left with exec_time = 0. For finished configurations the index data written
 * by earlier sets is removed and set_id / session_data are reset.
 */
public function cleanUpOldRunningConfigurations()
{
    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
    // List of tables that store information related to the phash value
    $tablesToClean = [
        'index_phash',
        'index_rel',
        'index_section',
        'index_grlist',
        'index_fulltext',
        'index_debug'
    ];

    $queryBuilder = $connectionPool->getQueryBuilderForTable('index_config');
    $queryBuilder->getRestrictions()
        ->removeAll()
        ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

    // Lookup running index configurations:
    $runningIndexingConfigurations = $queryBuilder->select('*')
        ->from('index_config')
        ->where($queryBuilder->expr()->neq('set_id', $queryBuilder->createNamedParameter(0, \PDO::PARAM_INT)))
        ->execute()
        ->fetchAll();

    // For each running configuration, look up how many log entries there are which are scheduled
    // for execution and if none, clear the "set_id" (means; Processing was DONE)
    foreach ($runningIndexingConfigurations as $cfgRec) {
        // Look for ended processes:
        $queued_items = $connectionPool->getConnectionForTable('tx_crawler_queue')
            ->count(
                '*',
                'tx_crawler_queue',
                [
                    'set_id' => (int)$cfgRec['set_id'],
                    'exec_time' => 0
                ]
            );
        if ($queued_items) {
            // Still entries waiting for execution, the process has not ended yet.
            continue;
        }

        // Lookup old phash rows (written for this configuration by a different set):
        $queryBuilder = $connectionPool->getQueryBuilderForTable('index_phash');
        $oldPhashRows = $queryBuilder
            ->select('phash')
            ->from('index_phash')
            ->where(
                $queryBuilder->expr()->eq(
                    'freeIndexUid',
                    $queryBuilder->createNamedParameter($cfgRec['uid'], \PDO::PARAM_INT)
                ),
                $queryBuilder->expr()->neq(
                    'freeIndexSetId',
                    $queryBuilder->createNamedParameter($cfgRec['set_id'], \PDO::PARAM_INT)
                )
            )
            ->execute()
            ->fetchAll();

        // Removing old registrations for all tables. Skipped when nothing is outdated, which
        // avoids one DELETE with an empty IN() list per table (same guard as deleteFromIndex()).
        $oldPhashes = array_column($oldPhashRows, 'phash');
        if (!empty($oldPhashes)) {
            foreach ($tablesToClean as $table) {
                $queryBuilder = $connectionPool->getQueryBuilderForTable($table);
                $queryBuilder->delete($table)
                    ->where(
                        $queryBuilder->expr()->in(
                            'phash',
                            $queryBuilder->createNamedParameter($oldPhashes, Connection::PARAM_INT_ARRAY)
                        )
                    )
                    ->execute();
            }
        }

        // End process by updating index-config record:
        $connectionPool->getConnectionForTable('index_config')
            ->update(
                'index_config',
                [
                    'set_id' => 0,
                    'session_data' => ''
                ],
                ['uid' => (int)$cfgRec['uid']]
            );
    }
}
574
575 /*****************************************
576 *
577 * Helper functions
578 *
579 *****************************************/
/**
 * Check if an input URL is allowed to be indexed.
 *
 * The URL is normalized first (a trailing "//" is reduced to "/", the fragment is removed).
 * It is accepted when it contains no "../", starts with the base URL and is not yet present
 * in the URL log.
 *
 * @param string $url URL string to check
 * @param array $urlLog Array of already indexed URLs (input url is looked up here and must not exist already)
 * @param string $baseUrl Base URL of the indexing process (input URL must be "inside" the base URL!)
 * @return string|null Returns the normalized URL if OK, otherwise NULL
 */
public function checkUrl($url, $urlLog, $baseUrl)
{
    $url = preg_replace('/\\/\\/$/', '/', $url);
    [$url] = explode('#', $url);

    // Never follow URLs containing back paths.
    if (strpos($url, '../') !== false) {
        return null;
    }
    // The URL must be located "inside" the base URL.
    if (!GeneralUtility::isFirstPartOfStr($url, $baseUrl)) {
        return null;
    }
    // Already logged URLs are rejected. The cast tolerates a missing log (NULL), the strict
    // flag avoids type juggling when comparing URL strings.
    if (in_array($url, (array)$urlLog, true)) {
        return null;
    }
    return $url;
}
600
/**
 * Indexing External URL.
 *
 * Indexes the given URL and returns all hyperlinks found in its content as absolute URLs.
 * Links without a scheme are resolved against the host (when they start with "/") or against
 * the base href of the document (taken from the content or derived from the URL's path).
 *
 * @param string $url URL, http://....
 * @param int $pageId Page id to relate indexing to.
 * @param array $rl Rootline array to relate indexing to
 * @param int $cfgUid Configuration UID
 * @param int $setId Set ID value
 * @return array URLs found on this page
 */
public function indexExtUrl($url, $pageId, $rl, $cfgUid, $setId)
{
    // Index external URL:
    $indexerObj = GeneralUtility::makeInstance(\TYPO3\CMS\IndexedSearch\Indexer::class);
    $indexerObj->backend_initIndexer($pageId, 0, 0, '', $rl);
    $indexerObj->backend_setFreeIndexUid($cfgUid, $setId);
    // To avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)
    $indexerObj->hash['phash'] = -1;
    $indexerObj->indexExternalUrl($url);

    // parse_url() omits components which are not present (and returns FALSE for seriously
    // malformed URLs), so every component is read with a fallback.
    $urlParts = parse_url($url);
    $urlPath = $urlParts['path'] ?? '';
    $baseAbsoluteHref = ($urlParts['scheme'] ?? '') . '://' . ($urlParts['host'] ?? '');
    $baseHref = $indexerObj->extractBaseHref($indexerObj->indexExternalUrl_content);
    if (!$baseHref) {
        // Extract base href from current URL: everything up to the last slash of the path
        $lastSlashPosition = strrpos($urlPath, '/');
        $baseHref = $baseAbsoluteHref;
        if ($lastSlashPosition !== false) {
            $baseHref .= substr($urlPath, 0, $lastSlashPosition);
        }
    }
    $baseHref = rtrim($baseHref, '/');

    // Get URLs on this page:
    $subUrls = [];
    $list = $indexerObj->extractHyperLinks($indexerObj->indexExternalUrl_content);
    foreach ($list as $linkInfo) {
        // Decode entities:
        $subUrl = htmlspecialchars_decode($linkInfo['href']);
        $linkParts = parse_url($subUrl);
        if (empty($linkParts['scheme'])) {
            // Link without scheme: make it absolute.
            $relativeUrl = GeneralUtility::resolveBackPath($subUrl);
            // "?? ''" keeps an empty link from raising an uninitialized string offset error
            if (($relativeUrl[0] ?? '') === '/') {
                $subUrl = $baseAbsoluteHref . $relativeUrl;
            } else {
                $subUrl = $baseHref . '/' . $relativeUrl;
            }
        }
        $subUrls[] = $subUrl;
    }
    return $subUrls;
}
649
/**
 * Indexing Single Record.
 *
 * The first field of the configured field list is used as title, all further fields are
 * concatenated (separated by spaces) as content. The record is indexed as a TYPO3 page.
 *
 * @param array $r Record to index
 * @param array $cfgRec Configuration Record
 * @param array $rl Rootline array to relate indexing to
 */
public function indexSingleRecord($r, $cfgRec, $rl = null)
{
    // Init:
    $rl = is_array($rl) ? $rl : $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
    $fieldList = GeneralUtility::trimExplode(',', $cfgRec['fieldlist'], true);
    // Not every table defines languageField / tstamp / crdate in its TCA ctrl section,
    // so these are read with fallbacks instead of triggering undefined index errors.
    $ctrl = $GLOBALS['TCA'][$cfgRec['table2index']]['ctrl'] ?? [];
    $languageField = $ctrl['languageField'] ?? '';
    $sys_language_uid = $languageField ? ($r[$languageField] ?? 0) : 0;
    // (Re)-Indexing a row from a table:
    $indexerObj = GeneralUtility::makeInstance(\TYPO3\CMS\IndexedSearch\Indexer::class);
    parse_str(str_replace('###UID###', $r['uid'], $cfgRec['get_params']), $GETparams);
    $indexerObj->backend_initIndexer($cfgRec['pid'], 0, $sys_language_uid, '', $rl, $GETparams);
    $indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
    $indexerObj->forceIndexing = true;
    // Both are initialized so an empty field list results in an empty title, not an undefined variable.
    $theTitle = '';
    $theContent = '';
    foreach ($fieldList as $k => $v) {
        if (!$k) {
            $theTitle = $r[$v] ?? '';
        } else {
            $theContent .= ($r[$v] ?? '') . ' ';
        }
    }
    $mtime = $r[$ctrl['tstamp'] ?? ''] ?? null;
    $crdate = $r[$ctrl['crdate'] ?? ''] ?? null;
    // Indexing the record as a page (but with parameters set, see ->backend_setFreeIndexUid()).
    // A space is put in front of every tag so words do not get glued together by strip_tags().
    $indexerObj->backend_indexAsTYPO3Page(
        strip_tags(str_replace('<', ' <', $theTitle)),
        '',
        '',
        strip_tags(str_replace('<', ' <', $theContent)),
        'utf-8',
        $mtime,
        $crdate,
        $r['uid']
    );
}
681
/**
 * Get rootline for closest TypoScript template root.
 * Algorithm same as used in Web > Template, Object browser
 *
 * @param int $id The page id to traverse rootline back from
 * @return array Array where the root lines uid values are found; empty if the root line could not be resolved.
 */
public function getUidRootLineForClosestTemplate($id)
{
    try {
        $pageRootLine = GeneralUtility::makeInstance(RootlineUtility::class, $id)->get();
        // Running through the templates generates the constants/config + hierarchy info,
        // including the root line of the template.
        $templateService = GeneralUtility::makeInstance(\TYPO3\CMS\Core\TypoScript\ExtendedTemplateService::class);
        $templateService->runThroughTemplates($pageRootLine);
    } catch (RootLineException $e) {
        // No root line available: nothing to return
        return [];
    }

    // Reduce every root line entry to its uid, keeping the keys.
    $uids = [];
    foreach ($templateService->rootLine as $level => $page) {
        $uids[$level] = $page['uid'];
    }
    return $uids;
}
707
/**
 * Generate the unix time stamp for next visit.
 *
 * The result is the configured offset (seconds after a midnight) plus as many whole
 * frequency blocks as are needed to reach the current execution time.
 *
 * @param array $cfgRec Index configuration record
 * @return int The next time stamp
 */
public function generateNextIndexingTime($cfgRec)
{
    $currentTime = $GLOBALS['EXEC_TIME'];
    // Now, find a midnight time to use for offset calculation. This has to differ depending on whether
    // we have frequencies within a day or more than a day; Less than a day, we don't care which day to
    // use for offset, more than a day we want to respect the currently entered day as offset regardless
    // of when the script is run - thus the day-of-week used in case "Weekly" is selected will be respected
    if ($cfgRec['timer_frequency'] <= 24 * 3600) {
        $aMidNight = mktime(0, 0, 0) - 1 * 24 * 3600;
    } else {
        $lastTime = (int)($cfgRec['timer_next_indexing'] ?: $GLOBALS['EXEC_TIME']);
        // Four digit year ("Y"): a two digit year is only mapped to a century by mktime() heuristics.
        $aMidNight = mktime(
            0,
            0,
            0,
            (int)date('n', $lastTime),
            (int)date('j', $lastTime),
            (int)date('Y', $lastTime)
        );
    }
    // Find last offset time plus frequency in seconds:
    $lastSureOffset = $aMidNight + MathUtility::forceIntegerInRange($cfgRec['timer_offset'], 0, 86400);
    $frequencySeconds = MathUtility::forceIntegerInRange($cfgRec['timer_frequency'], 1);
    // Now, find out how many blocks of the length of frequency there is until the next time:
    $frequencyBlocksUntilNextTime = ceil(($currentTime - $lastSureOffset) / $frequencySeconds);
    // Set next time to the offset + the frequencyblocks multiplied with the frequency length in seconds.
    // ceil() returns a float, so the result is cast to match the documented integer return type.
    return (int)($lastSureOffset + $frequencyBlocksUntilNextTime * $frequencySeconds);
}
732
/**
 * Checks if $url starts with any of the URLs in the $url_deny "list" and if so, returns TRUE.
 *
 * @param string $url URL to test
 * @param string $url_deny String where URLs are separated by line-breaks; If any of these strings is the first part of $url, the function returns TRUE (to indicate denial of descend)
 * @return bool TRUE if there is a matching URL (hence, do not index!)
 */
public function checkDeniedSuburls($url, $url_deny)
{
    // Without a deny list nothing is denied.
    if (!trim($url_deny)) {
        return false;
    }
    foreach (GeneralUtility::trimExplode(LF, $url_deny, true) as $deniedPrefix) {
        if (GeneralUtility::isFirstPartOfStr($url, $deniedPrefix)) {
            return true;
        }
    }
    return false;
}
752
/**
 * Adding entry in queue for Hook.
 * Uses the parent crawler object stored in this instance.
 *
 * @param array $cfgRec Configuration record
 * @param string $title Title/URL
 */
public function addQueueEntryForHook($cfgRec, $title)
{
    // "indexConfigUid" must ALWAYS be the cfgRec uid!
    $configurationUid = $cfgRec['uid'];
    $entryParams = [
        'indexConfigUid' => $configurationUid,
        'url' => $title,
        'procInstructions' => ['[Index Cfg UID#' . $configurationUid . ']']
    ];
    $this->pObj->addQueueEntry_callBack($cfgRec['set_id'], $entryParams, $this->callBack, $cfgRec['pid']);
}
769
/**
 * Deletes all data stored by indexed search for a given page.
 *
 * @param int $id Uid of the page to delete all pHash
 */
public function deleteFromIndex($id)
{
    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

    // Collect the phash values registered for the page:
    $sectionQuery = $connectionPool->getQueryBuilderForTable('index_section');
    $belongsToPage = $sectionQuery->expr()->eq(
        'page_id',
        $sectionQuery->createNamedParameter($id, \PDO::PARAM_INT)
    );
    $phashRows = $sectionQuery->select('phash')
        ->from('index_section')
        ->where($belongsToPage)
        ->execute()
        ->fetchAll();

    // Nothing indexed for this page, nothing to delete.
    if (empty($phashRows)) {
        return;
    }
    $phashList = array_column($phashRows, 'phash');

    // Remove the rows of these phash values from every index table:
    $indexTables = [
        'index_debug',
        'index_fulltext',
        'index_grlist',
        'index_phash',
        'index_rel',
        'index_section',
    ];
    foreach ($indexTables as $indexTable) {
        $deleteQuery = $connectionPool->getQueryBuilderForTable($indexTable);
        $matchesPhash = $deleteQuery->expr()->in(
            'phash',
            $deleteQuery->createNamedParameter($phashList, Connection::PARAM_INT_ARRAY)
        );
        $deleteQuery->delete($indexTable)
            ->where($matchesPhash)
            ->execute();
    }
}
820
821 /*************************
822 *
823 * Hook functions for DataHandler (indexing of records)
824 *
825 *************************/
/**
 * DataHandler hook function, called before a command is processed.
 * Removes the index data of a page which is about to be deleted.
 *
 * @param string $command DataHandler command
 * @param string $table Table name
 * @param string $id Record ID. If new record its a string pointing to index inside \TYPO3\CMS\Core\DataHandling\DataHandler::substNEWwithIDs
 * @param mixed $value Target value (ignored)
 * @param DataHandler $pObj DataHandler calling object
 */
public function processCmdmap_preProcess($command, $table, $id, $value, $pObj)
{
    // Only the deletion of pages requires a clean up of the index
    if ($command !== 'delete' || $table !== 'pages') {
        return;
    }
    $this->deleteFromIndex($id);
}
842
/**
 * DataHandler hook function for on-the-fly indexing of database records.
 *
 * Removes the index of pages which got hidden or excluded from search, and re-indexes the
 * changed record for every matching "records" indexing configuration.
 *
 * @param string $status Status "new" or "update"
 * @param string $table Table name
 * @param string $id Record ID. If new record its a string pointing to index inside \TYPO3\CMS\Core\DataHandling\DataHandler::substNEWwithIDs
 * @param array $fieldArray Field array of updated fields in the operation
 * @param DataHandler $pObj DataHandler calling object
 */
public function processDatamap_afterDatabaseOperations($status, $table, $id, $fieldArray, $pObj)
{
    // Nothing to do unless fields were actually updated:
    if (empty($fieldArray)) {
        return;
    }

    if ($status === 'new') {
        // Translate the placeholder of a new record into its real uid.
        $id = $pObj->substNEWwithIDs[$id];
    } elseif ($table === 'pages' && $status === 'update') {
        $gotHidden = array_key_exists('hidden', $fieldArray) && $fieldArray['hidden'] == 1;
        $gotExcluded = array_key_exists('no_search', $fieldArray) && $fieldArray['no_search'] == 1;
        if ($gotHidden || $gotExcluded) {
            // If the page should be hidden or not indexed after update, delete index for this page
            $this->deleteFromIndex($id);
        }
    }

    // Get full record; without it there is nothing to index.
    $currentRecord = BackendUtility::getRecord($table, $id);
    if (!is_array($currentRecord)) {
        return;
    }

    // Select all (not running) indexing configurations of type "record" (1) and
    // which points to this table and is located on the same page as the record
    // or pointing to the right source PID
    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getQueryBuilderForTable('index_config');
    $expr = $queryBuilder->expr();

    $notRunning = $expr->eq('set_id', $queryBuilder->createNamedParameter(0, \PDO::PARAM_INT));
    $isRecordType = $expr->eq('type', $queryBuilder->createNamedParameter(1, \PDO::PARAM_INT));
    $indexesTable = $expr->eq(
        'table2index',
        $queryBuilder->createNamedParameter($table, \PDO::PARAM_STR)
    );
    $onSamePage = $expr->andX(
        $expr->eq(
            'alternative_source_pid',
            $queryBuilder->createNamedParameter(0, \PDO::PARAM_INT)
        ),
        $expr->eq(
            'pid',
            $queryBuilder->createNamedParameter($currentRecord['pid'], \PDO::PARAM_INT)
        )
    );
    $pointsToSourcePid = $expr->eq(
        'alternative_source_pid',
        $queryBuilder->createNamedParameter($currentRecord['pid'], \PDO::PARAM_INT)
    );
    $indexOnChange = $expr->eq(
        'records_indexonchange',
        $queryBuilder->createNamedParameter(1, \PDO::PARAM_INT)
    );

    $configurations = $queryBuilder->select('*')
        ->from('index_config')
        ->where(
            $notRunning,
            $isRecordType,
            $indexesTable,
            $expr->orX($onSamePage, $pointsToSourcePid),
            $indexOnChange
        )
        ->execute();

    while ($configuration = $configurations->fetch()) {
        $this->indexSingleRecord($currentRecord, $configuration);
    }
}
910 }