484ddd88092e2f695cb06c9a099fd1ea8a852641
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / Classes / Hook / CrawlerHook.php
1 <?php
2 namespace TYPO3\CMS\IndexedSearch\Hook;
3
4 /*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use TYPO3\CMS\Backend\Utility\BackendUtility;
18 use TYPO3\CMS\Core\Database\Connection;
19 use TYPO3\CMS\Core\Database\ConnectionPool;
20 use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
21 use TYPO3\CMS\Core\DataHandling\DataHandler;
22 use TYPO3\CMS\Core\Utility\GeneralUtility;
23 use TYPO3\CMS\Core\Utility\MathUtility;
24
25 /**
26 * Crawler hook for indexed search. Works with the "crawler" extension
27 */
28 class CrawlerHook
29 {
/**
 * Number of seconds to use as interval between queued indexing operations of URLs / files (types 2 & 3)
 *
 * @var int
 */
public $secondsPerExternalUrl = 3;

/**
 * Counts up for every queue entry that is scheduled with a delay (files, external URLs, sub pages).
 * It is multiplied with $secondsPerExternalUrl to spread the scheduled execution times.
 *
 * @var int
 */
public $instanceCounter = 0;

/**
 * Class name handed to the crawler as call back for all queue entries created by this hook
 *
 * @var string
 */
public $callBack = self::class;

/**
 * Parent crawler object. Assigned in crawler_execute() before a custom hook type is run and
 * read by addQueueEntryForHook(). Declared explicitly so it is not created as a dynamic property.
 *
 * @var object
 */
public $pObj;
48
/**
 * The constructor
 *
 * Makes sure a language service object is present in $GLOBALS['LANG'].
 */
public function __construct()
{
    // To make sure the backend charset is available:
    if (!isset($GLOBALS['LANG']) || !is_object($GLOBALS['LANG'])) {
        $GLOBALS['LANG'] = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Localization\LanguageService::class);
        // $GLOBALS['BE_USER'] (or its "lang" setting) may be missing; do not dereference it
        // blindly but fall back to the 'default' language key in that case.
        $GLOBALS['LANG']->init($GLOBALS['BE_USER']->uc['lang'] ?? 'default');
    }
}
60
/**
 * Initialization of crawler hook.
 * This function is asked for each instance of the crawler and we must check if something is timed to happen
 * and if so put entry(s) in the crawlers log to start processing.
 * In reality we select indexing configurations and evaluate if any of them needs to run.
 *
 * For every due configuration a fresh set id is written to the record together with the next indexing
 * time, and (except for meta configurations, type 5) one initial queue entry is handed to the crawler.
 *
 * @param object $pObj Parent object (tx_crawler lib)
 */
public function crawler_init(&$pObj)
{
    // Select all indexing configuration which are waiting to be activated:
    $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('index_config');
    $queryBuilder = $connection->createQueryBuilder();

    $result = $queryBuilder->select('*')
        ->from('index_config')
        ->where(
            $queryBuilder->expr()->lt(
                'timer_next_indexing',
                $queryBuilder->createNamedParameter($GLOBALS['EXEC_TIME'], \PDO::PARAM_INT)
            ),
            $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter(0, \PDO::PARAM_INT))
        )
        ->execute();

    // For each configuration, check if it should be executed and if so, start:
    while ($cfgRec = $result->fetch()) {
        // Generate a unique set-ID:
        $setId = GeneralUtility::md5int(microtime());
        // Get next time:
        $nextTime = $this->generateNextIndexingTime($cfgRec);
        // Start process by updating index-config record:
        $connection->update(
            'index_config',
            [
                'set_id' => $setId,
                'timer_next_indexing' => $nextTime,
                'session_data' => ''
            ],
            [
                'uid' => (int)$cfgRec['uid']
            ]
        );

        // Queue parameters for the first entry; stays null when nothing has to be queued.
        $params = null;
        $procInstructions = ['[Index Cfg UID#' . $cfgRec['uid'] . ']'];
        // Based on configuration type:
        switch ($cfgRec['type']) {
            case 1:
                // RECORDS:
                $params = [
                    'indexConfigUid' => $cfgRec['uid'],
                    'procInstructions' => $procInstructions,
                    'url' => 'Records (start)'
                ];
                break;
            case 2:
                // FILES:
                $params = [
                    'indexConfigUid' => $cfgRec['uid'],
                    'procInstructions' => $procInstructions,
                    'url' => $cfgRec['filepath'],
                    'depth' => 0
                ];
                break;
            case 3:
                // External URL:
                $params = [
                    'indexConfigUid' => $cfgRec['uid'],
                    'procInstructions' => $procInstructions,
                    'url' => $cfgRec['externalUrl'],
                    'depth' => 0
                ];
                break;
            case 4:
                // Page tree (the "url" is the uid of the root page):
                $params = [
                    'indexConfigUid' => $cfgRec['uid'],
                    'procInstructions' => $procInstructions,
                    'url' => (int)$cfgRec['alternative_source_pid'],
                    'depth' => 0
                ];
                break;
            case 5:
                // Meta configuration, nothing to do:
                break;
            default:
                // Custom type: only handled when a hook class is registered for it.
                $hookClass = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']] ?? null;
                if ($hookClass) {
                    $hookObj = GeneralUtility::makeInstance($hookClass);
                    // $message was never defined in this scope; initialise it to null so the hook
                    // receives the same value as before without an "undefined variable" notice.
                    $message = null;
                    $params = [
                        'indexConfigUid' => $cfgRec['uid'],
                        'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . '/CUSTOM]'],
                        'url' => $hookObj->initMessage($message)
                    ];
                }
        }
        if ($params !== null) {
            $pObj->addQueueEntry_callBack($setId, $params, $this->callBack, $cfgRec['pid']);
        }
    }
    // Finally, look up all old index configurations which are finished and needs to be reset and done.
    $this->cleanUpOldRunningConfigurations();
}
180
/**
 * Call back function for execution of a log element
 *
 * Loads the indexing configuration referenced by $params['indexConfigUid'], dispatches to the handler
 * for its type and afterwards writes the (possibly modified) session data back to the record.
 *
 * @param array $params Params from log element. Must contain $params['indexConfigUid']
 * @param object $pObj Parent object (tx_crawler lib)
 * @return array Result array; the input parameters are returned under the key "log"
 */
public function crawler_execute($params, &$pObj)
{
    // Indexer configuration ID must exist:
    if (!empty($params['indexConfigUid'])) {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable('index_config');
        $queryBuilder->getRestrictions()->removeAll();
        // Load the indexing configuration record:
        $cfgRec = $queryBuilder
            ->select('*')
            ->from('index_config')
            ->where(
                $queryBuilder->expr()->eq(
                    'uid',
                    $queryBuilder->createNamedParameter($params['indexConfigUid'], \PDO::PARAM_INT)
                )
            )
            ->execute()
            ->fetch();
        if (is_array($cfgRec)) {
            // Unpack session data. Object instantiation is disabled: the handlers in this class only
            // store scalars and arrays, and the serialized string must never be able to create objects.
            $session_data = unserialize((string)$cfgRec['session_data'], ['allowed_classes' => false]);
            // Select which type:
            switch ($cfgRec['type']) {
                case 1:
                    // Records:
                    $this->crawler_execute_type1($cfgRec, $session_data, $params, $pObj);
                    break;
                case 2:
                    // Files
                    $this->crawler_execute_type2($cfgRec, $session_data, $params, $pObj);
                    break;
                case 3:
                    // External URL:
                    $this->crawler_execute_type3($cfgRec, $session_data, $params, $pObj);
                    break;
                case 4:
                    // Page tree:
                    $this->crawler_execute_type4($cfgRec, $session_data, $params, $pObj);
                    break;
                case 5:
                    // Meta
                    // NOOP (should never enter here!)
                    break;
                default:
                    $hookClass = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']] ?? null;
                    if ($hookClass) {
                        $hookObj = GeneralUtility::makeInstance($hookClass);
                        // Kept on the instance for addQueueEntryForHook()
                        $this->pObj = $pObj;
                        $hookObj->indexOperation($cfgRec, $session_data, $params, $this);
                    }
            }
            // Save process data which might be modified:
            GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('index_config')
                ->update(
                    'index_config',
                    ['session_data' => serialize($session_data)],
                    ['uid' => (int)$cfgRec['uid']]
                );
        }
    }
    return ['log' => $params];
}
252
/**
 * Indexing records from a table
 *
 * Processes one batch of records (ordered by uid, starting after the uid remembered in the session
 * data) and, if the batch contained any record, queues another entry for the following batch.
 *
 * @param array $cfgRec Indexing Configuration Record
 * @param array $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param array $params Parameters from the log queue.
 * @param object $pObj Parent object (from "crawler" extension!)
 */
public function crawler_execute_type1($cfgRec, &$session_data, $params, &$pObj)
{
    if ($cfgRec['table2index'] && isset($GLOBALS['TCA'][$cfgRec['table2index']])) {
        // Init session data array if not already:
        if (!is_array($session_data)) {
            $session_data = [
                'uid' => 0
            ];
        }
        // Init:
        $pid = (int)$cfgRec['alternative_source_pid'] ?: $cfgRec['pid'];
        $numberOfRecords = $cfgRec['recordsbatch']
            ? MathUtility::forceIntegerInRange($cfgRec['recordsbatch'], 1)
            : 100;

        // Get root line:
        $rootLine = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
        // Select
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable($cfgRec['table2index']);

        $result = $queryBuilder->select('*')
            ->from($cfgRec['table2index'])
            ->where(
                $queryBuilder->expr()->eq(
                    'pid',
                    $queryBuilder->createNamedParameter($pid, \PDO::PARAM_INT)
                ),
                $queryBuilder->expr()->gt(
                    'uid',
                    $queryBuilder->createNamedParameter($session_data['uid'], \PDO::PARAM_INT)
                )
            )
            ->setMaxResults($numberOfRecords)
            ->orderBy('uid')
            ->execute();

        // Traverse. The rows are counted here because rowCount() of a SELECT statement is not
        // reliable on every database driver.
        $indexedRecords = 0;
        while ($row = $result->fetch()) {
            // Index single record:
            $this->indexSingleRecord($row, $cfgRec, $rootLine);
            // Update the UID we last processed:
            $session_data['uid'] = $row['uid'];
            $indexedRecords++;
        }

        // Finally, set entry for next indexing of batch of records:
        if ($indexedRecords > 0) {
            $nparams = [
                'indexConfigUid' => $cfgRec['uid'],
                'url' => 'Records from UID#' . ($session_data['uid'] + 1) . '-?',
                'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']']
            ];
            $pObj->addQueueEntry_callBack($cfgRec['set_id'], $nparams, $this->callBack, $cfgRec['pid']);
        }
    }
}
317
/**
 * Indexing files from fileadmin
 *
 * If $params['url'] points to a file, the file is indexed. If it points to a directory, one queue
 * entry is created per contained file and - while the configured depth is not exceeded - per sub directory.
 *
 * @param array $cfgRec Indexing Configuration Record
 * @param array $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param array $params Parameters from the log queue.
 * @param object $pObj Parent object (from "crawler" extension!)
 */
public function crawler_execute_type2($cfgRec, &$session_data, $params, &$pObj)
{
    // Prepare path, making it absolute and checking:
    $readpath = $params['url'];
    if (!GeneralUtility::isAbsPath($readpath)) {
        $readpath = GeneralUtility::getFileAbsFileName($readpath);
    }
    if (!GeneralUtility::isAllowedAbsPath($readpath)) {
        return;
    }
    if (@is_file($readpath)) {
        // If file, index it!
        // Get root line (need to provide this when indexing external files)
        $rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
        // (Re)-Indexing file on page.
        $indexerObj = GeneralUtility::makeInstance(\TYPO3\CMS\IndexedSearch\Indexer::class);
        $indexerObj->backend_initIndexer($cfgRec['pid'], 0, 0, '', $rl);
        $indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
        // EXPERIMENT - but to avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)
        $indexerObj->hash['phash'] = -1;
        // Index document:
        $indexerObj->indexRegularDocument(\TYPO3\CMS\Core\Utility\PathUtility::stripPathSitePrefix($readpath), true);
    } elseif (@is_dir($readpath)) {
        // If dir, read content and create new pending items for log:
        // Select files and directories in path:
        $extList = implode(',', GeneralUtility::trimExplode(',', $cfgRec['extensions'], true));
        $fileArr = [];
        $files = GeneralUtility::getAllFilesAndFoldersInPath($fileArr, $readpath, $extList, 0, 0);
        $directoryList = GeneralUtility::get_dirs($readpath);
        if (is_array($directoryList) && $params['depth'] < $cfgRec['depth']) {
            // The configured path is not guaranteed to end with a slash; normalise it so that the
            // directory and the sub directory name are always separated by exactly one slash.
            $directoryPrefix = rtrim($readpath, '/') . '/';
            foreach ($directoryList as $subdir) {
                if ((string)$subdir !== '') {
                    $files[] = $directoryPrefix . $subdir . '/';
                }
            }
        }
        $files = GeneralUtility::removePrefixPathFromList($files, PATH_site);
        // traverse the items and create log entries:
        foreach ($files as $path) {
            $this->instanceCounter++;
            if ($path !== $params['url']) {
                // Parameters:
                $nparams = [
                    'indexConfigUid' => $cfgRec['uid'],
                    'url' => $path,
                    'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
                    'depth' => $params['depth'] + 1
                ];
                // Each entry is scheduled a bit later than the previous one.
                $pObj->addQueueEntry_callBack($cfgRec['set_id'], $nparams, $this->callBack, $cfgRec['pid'], $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl);
            }
        }
    }
}
378
/**
 * Indexing External URLs
 *
 * Indexes the URL given in $params['url'] and, while the configured depth is not reached, queues
 * every link found on it that passes checkUrl() and is not denied by checkDeniedSuburls().
 *
 * @param array $cfgRec Indexing Configuration Record
 * @param array $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param array $params Parameters from the log queue.
 * @param object $pObj Parent object (from "crawler" extension!)
 */
public function crawler_execute_type3($cfgRec, &$session_data, $params, &$pObj)
{
    // First call of the session: start the URL log with the URL being indexed now.
    if (!is_array($session_data)) {
        $session_data = ['urlLog' => [$params['url']]];
    }

    // Index the URL and collect the links found on it:
    $rootLine = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
    $linkedUrls = $this->indexExtUrl($params['url'], $cfgRec['pid'], $rootLine, $cfgRec['uid'], $cfgRec['set_id']);

    // Do not descend any further once the configured depth is reached.
    if (!($params['depth'] < $cfgRec['depth'])) {
        return;
    }

    foreach ($linkedUrls as $linkedUrl) {
        $acceptedUrl = $this->checkUrl($linkedUrl, $session_data['urlLog'], $cfgRec['externalUrl']);
        if (!$acceptedUrl) {
            continue;
        }
        if ($this->checkDeniedSuburls($acceptedUrl, $cfgRec['url_deny'])) {
            continue;
        }
        $this->instanceCounter++;
        $session_data['urlLog'][] = $acceptedUrl;
        $queueParameters = [
            'indexConfigUid' => $cfgRec['uid'],
            'url' => $acceptedUrl,
            'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
            'depth' => $params['depth'] + 1
        ];
        // Spread the scheduled execution times of the queued URLs.
        $scheduledTime = $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl;
        $pObj->addQueueEntry_callBack($cfgRec['set_id'], $queueParameters, $this->callBack, $cfgRec['pid'], $scheduledTime);
    }
}
418
/**
 * Page tree indexing type
 *
 * Submits the URLs of the page whose uid is given in $params['url'] to the crawler and, while the
 * configured depth is not reached, queues one entry per (not deleted) sub page.
 *
 * @param array $cfgRec Indexing Configuration Record
 * @param array $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param array $params Parameters from the log queue.
 * @param object $pObj Parent object (from "crawler" extension!)
 */
public function crawler_execute_type4($cfgRec, &$session_data, $params, &$pObj)
{
    // Base page uid:
    $pageUid = (int)$params['url'];
    // Get array of URLs from page:
    $pageRow = BackendUtility::getRecord('pages', $pageUid);
    if (!is_array($pageRow)) {
        // The page could not be loaded (anymore): there is nothing to hand to the crawler.
        return;
    }
    $res = $pObj->getUrlsForPageRow($pageRow);
    // Registry for duplicates
    $duplicateTrack = [];
    // Dummy.
    $downloadUrls = [];
    // Submit URLs:
    if (!empty($res)) {
        foreach ($res as $vv) {
            $pObj->urlListFromUrlArray($vv, $pageRow, $GLOBALS['EXEC_TIME'], 30, 1, 0, $duplicateTrack, $downloadUrls, ['tx_indexedsearch_reindex']);
        }
    }
    // Add subpages to log now:
    if ($params['depth'] < $cfgRec['depth']) {
        // Subpages selected
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('pages');
        $queryBuilder->getRestrictions()
            ->removeAll()
            ->add(GeneralUtility::makeInstance(DeletedRestriction::class));
        $result = $queryBuilder->select('uid', 'title')
            ->from('pages')
            ->where(
                $queryBuilder->expr()->eq(
                    'pid',
                    $queryBuilder->createNamedParameter($pageUid, \PDO::PARAM_INT)
                )
            )
            ->execute();
        // Traverse subpages and add to queue:
        while ($row = $result->fetch()) {
            // Session data is not an array on the first call; make it one before appending
            // instead of relying on the implicit conversion of a non-array value.
            if (!is_array($session_data)) {
                $session_data = [];
            }
            $this->instanceCounter++;
            $url = 'pages:' . $row['uid'] . ': ' . $row['title'];
            $session_data['urlLog'][] = $url;
            // Parameters:
            $nparams = [
                'indexConfigUid' => $cfgRec['uid'],
                'url' => $row['uid'],
                'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
                'depth' => $params['depth'] + 1
            ];
            $pObj->addQueueEntry_callBack(
                $cfgRec['set_id'],
                $nparams,
                $this->callBack,
                $cfgRec['pid'],
                $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl
            );
        }
    }
}
482
/**
 * Look up all old index configurations which are finished and needs to be reset and done
 *
 * A configuration counts as finished when it has a set id but no crawler queue entry of that set
 * with exec_time 0 is left. For such a configuration the index rows that belong to it but carry a
 * different set id are removed, and the set id and session data of the configuration are reset.
 */
public function cleanUpOldRunningConfigurations()
{
    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
    // List of tables that store information related to the phash value
    $tablesToClean = [
        'index_phash',
        'index_rel',
        'index_section',
        'index_grlist',
        'index_fulltext',
        'index_debug'
    ];

    $queryBuilder = $connectionPool->getQueryBuilderForTable('index_config');
    $queryBuilder->getRestrictions()
        ->removeAll()
        ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

    // Lookup running index configurations:
    $runningIndexingConfigurations = $queryBuilder->select('*')
        ->from('index_config')
        ->where($queryBuilder->expr()->neq('set_id', $queryBuilder->createNamedParameter(0, \PDO::PARAM_INT)))
        ->execute()
        ->fetchAll();
    // For each running configuration, look up how many log entries there are which are scheduled
    // for execution and if none, clear the "set_id" (means; Processing was DONE)
    foreach ($runningIndexingConfigurations as $cfgRec) {
        // Look for ended processes:
        $queued_items = $connectionPool->getConnectionForTable('tx_crawler_queue')
            ->count(
                '*',
                'tx_crawler_queue',
                [
                    'set_id' => (int)$cfgRec['set_id'],
                    'exec_time' => 0
                ]
            );
        if ($queued_items) {
            // Still work left in the queue for this set.
            continue;
        }

        // Lookup old phash rows:
        $queryBuilder = $connectionPool->getQueryBuilderForTable('index_phash');
        $oldPhashRows = $queryBuilder
            ->select('phash')
            ->from('index_phash')
            ->where(
                $queryBuilder->expr()->eq(
                    'freeIndexUid',
                    $queryBuilder->createNamedParameter($cfgRec['uid'], \PDO::PARAM_INT)
                ),
                $queryBuilder->expr()->neq(
                    'freeIndexSetId',
                    $queryBuilder->createNamedParameter($cfgRec['set_id'], \PDO::PARAM_INT)
                )
            )
            ->execute()
            ->fetchAll();

        // Removing old registrations for all tables. Skipped when there is nothing to remove,
        // in line with deleteFromIndex(), so no DELETE with an empty value list is sent.
        $oldPhashValues = array_column($oldPhashRows, 'phash');
        if (!empty($oldPhashValues)) {
            foreach ($tablesToClean as $table) {
                $queryBuilder = $connectionPool->getQueryBuilderForTable($table);
                $queryBuilder->delete($table)
                    ->where(
                        $queryBuilder->expr()->in(
                            'phash',
                            $queryBuilder->createNamedParameter(
                                $oldPhashValues,
                                Connection::PARAM_INT_ARRAY
                            )
                        )
                    )
                    ->execute();
            }
        }

        // End process by updating index-config record:
        $connectionPool->getConnectionForTable('index_config')
            ->update(
                'index_config',
                [
                    'set_id' => 0,
                    'session_data' => ''
                ],
                ['uid' => (int)$cfgRec['uid']]
            );
    }
}
571
572 /*****************************************
573 *
574 * Helper functions
575 *
576 *****************************************/
/**
 * Check if an input URL is allowed to be indexed. Depends on whether it is already present in the url log.
 *
 * Before the check a trailing double slash is reduced to a single slash and everything from the
 * first "#" on is cut off. The URL is rejected if it contains "../", does not start with the base
 * URL or is already listed in the URL log.
 *
 * @param string $url URL string to check
 * @param array $urlLog Array of already indexed URLs (input url is looked up here and must not exist already)
 * @param string $baseUrl Base URL of the indexing process (input URL must be "inside" the base URL!)
 * @return string|bool Returns the (normalised) URL if OK, otherwise FALSE
 */
public function checkUrl($url, $urlLog, $baseUrl)
{
    $url = preg_replace('/\\/\\/$/', '/', $url);
    list($url) = explode('#', $url);
    if (strpos($url, '../') === false
        && GeneralUtility::isFirstPartOfStr($url, $baseUrl)
        // Both sides are strings; compare strictly so numeric-looking strings are never juggled.
        && !in_array($url, (array)$urlLog, true)
    ) {
        return $url;
    }
    // Make the documented "otherwise FALSE" explicit instead of implicitly returning null.
    return false;
}
597
/**
 * Indexing External URL
 *
 * Indexes the given URL and returns the hyperlinks found in its content. Links without a scheme
 * are made absolute: links starting with "/" relative to scheme and host of $url, all others
 * relative to the base href of the document (or, if there is none, the directory of $url).
 *
 * @param string $url URL, http://....
 * @param int $pageId Page id to relate indexing to.
 * @param array $rl Rootline array to relate indexing to
 * @param int $cfgUid Configuration UID
 * @param int $setId Set ID value
 * @return array URLs found on this page
 */
public function indexExtUrl($url, $pageId, $rl, $cfgUid, $setId)
{
    // Index external URL:
    $indexerObj = GeneralUtility::makeInstance(\TYPO3\CMS\IndexedSearch\Indexer::class);
    $indexerObj->backend_initIndexer($pageId, 0, 0, '', $rl);
    $indexerObj->backend_setFreeIndexUid($cfgUid, $setId);
    // To avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)
    $indexerObj->hash['phash'] = -1;
    $indexerObj->indexExternalUrl($url);
    // parse_url() returns FALSE for seriously malformed URLs and omits components that are not present.
    $url_qParts = parse_url($url) ?: [];
    $baseAbsoluteHref = ($url_qParts['scheme'] ?? '') . '://' . ($url_qParts['host'] ?? '');
    $baseHref = $indexerObj->extractBaseHref($indexerObj->indexExternalUrl_content);
    if (!$baseHref) {
        // Extract base href from current URL: everything up to (excluding) the last slash of the path
        $urlPath = $url_qParts['path'] ?? '';
        $baseHref = $baseAbsoluteHref . substr($urlPath, 0, (int)strrpos($urlPath, '/'));
    }
    $baseHref = rtrim($baseHref, '/');
    // Get URLs on this page:
    $subUrls = [];
    $list = $indexerObj->extractHyperLinks($indexerObj->indexExternalUrl_content);
    // Traverse links:
    foreach ($list as $linkInfo) {
        // Decode entities:
        $subUrl = htmlspecialchars_decode($linkInfo['href']);
        $qParts = parse_url($subUrl);
        if (empty($qParts['scheme'])) {
            $relativeUrl = GeneralUtility::resolveBackPath($subUrl);
            // isset() guards against reading offset 0 of an empty string.
            if (isset($relativeUrl[0]) && $relativeUrl[0] === '/') {
                $subUrl = $baseAbsoluteHref . $relativeUrl;
            } else {
                $subUrl = $baseHref . '/' . $relativeUrl;
            }
        }
        $subUrls[] = $subUrl;
    }
    return $subUrls;
}
646
/**
 * Indexing Single Record
 *
 * The first field of the configured field list is used as title, the remaining fields are
 * concatenated (separated by blanks) as content. HTML tags are stripped from both.
 *
 * @param array $r Record to index
 * @param array $cfgRec Configuration Record
 * @param array $rl Rootline array to relate indexing to; looked up from the configuration's pid if not given
 */
public function indexSingleRecord($r, $cfgRec, $rl = null)
{
    // Init:
    $rl = is_array($rl) ? $rl : $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
    $fieldList = GeneralUtility::trimExplode(',', $cfgRec['fieldlist'], true);
    // Not every table defines all of these "ctrl" keys, so read them defensively.
    $ctrl = $GLOBALS['TCA'][$cfgRec['table2index']]['ctrl'] ?? [];
    $languageField = $ctrl['languageField'] ?? '';
    $tstampField = $ctrl['tstamp'] ?? '';
    $crdateField = $ctrl['crdate'] ?? '';
    $sys_language_uid = $languageField ? ($r[$languageField] ?? 0) : 0;
    // (Re)-Indexing a row from a table:
    $indexerObj = GeneralUtility::makeInstance(\TYPO3\CMS\IndexedSearch\Indexer::class);
    parse_str(str_replace('###UID###', $r['uid'], $cfgRec['get_params']), $GETparams);
    $indexerObj->backend_initIndexer($cfgRec['pid'], 0, $sys_language_uid, '', $rl, $GETparams, (bool)$cfgRec['chashcalc']);
    $indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
    $indexerObj->forceIndexing = true;
    // Both start empty so an empty field list does not leave the title undefined.
    $theTitle = '';
    $theContent = '';
    foreach ($fieldList as $k => $v) {
        if (!$k) {
            $theTitle = $r[$v] ?? '';
        } else {
            $theContent .= ($r[$v] ?? '') . ' ';
        }
    }
    // Indexing the record as a page (but with parameters set, see ->backend_setFreeIndexUid())
    // A blank is put in front of every "<" so words of adjacent tags do not get glued together by strip_tags().
    $indexerObj->backend_indexAsTYPO3Page(
        strip_tags(str_replace('<', ' <', $theTitle)),
        '',
        '',
        strip_tags(str_replace('<', ' <', $theContent)),
        'utf-8',
        $r[$tstampField] ?? null,
        $r[$crdateField] ?? null,
        $r['uid']
    );
}
678
/**
 * Get rootline for closest TypoScript template root.
 * Algorithm same as used in Web > Template, Object browser
 *
 * @param int $id The page id to traverse rootline back from
 * @return array The uid values of the template service's root line, keyed like that root line
 */
public function getUidRootLineForClosestTemplate($id)
{
    $templateService = GeneralUtility::makeInstance(\TYPO3\CMS\Core\TypoScript\ExtendedTemplateService::class);
    $templateService->init();

    // Fetch the root line of the page ...
    $pageRepository = GeneralUtility::makeInstance(\TYPO3\CMS\Frontend\Page\PageRepository::class);
    $pageRootLine = $pageRepository->getRootLine($id);
    // ... and let the template service generate the constants/config + hierarchy info from it.
    $templateService->runThroughTemplates($pageRootLine, 0);

    // Reduce the template service's root line to the page uids.
    $uids = [];
    foreach ($templateService->rootLine as $level => $page) {
        $uids[$level] = $page['uid'];
    }
    return $uids;
}
702
/**
 * Generate the unix time stamp for next visit.
 *
 * Starting from a midnight plus the configured offset, the result is the first point in time on
 * the grid "offset + n * frequency" that is not before the current execution time.
 *
 * @param array $cfgRec Index configuration record
 * @return int The next time stamp
 */
public function generateNextIndexingTime($cfgRec)
{
    $currentTime = $GLOBALS['EXEC_TIME'];
    // Now, find a midnight time to use for offset calculation. This has to differ depending on whether we have
    // frequencies within a day or more than a day; Less than a day, we don't care which day to use for offset,
    // more than a day we want to respect the currently entered day as offset regardless of when the script is
    // run - thus the day-of-week used in case "Weekly" is selected will be respected
    if ($cfgRec['timer_frequency'] <= 24 * 3600) {
        $aMidNight = mktime(0, 0, 0) - 1 * 24 * 3600;
    } else {
        $lastTime = (int)($cfgRec['timer_next_indexing'] ?: $GLOBALS['EXEC_TIME']);
        // Use the four digit year: a two digit year makes mktime() guess the century.
        $aMidNight = mktime(0, 0, 0, (int)date('n', $lastTime), (int)date('j', $lastTime), (int)date('Y', $lastTime));
    }
    // Find last offset time plus frequency in seconds:
    $lastSureOffset = $aMidNight + MathUtility::forceIntegerInRange($cfgRec['timer_offset'], 0, 86400);
    $frequencySeconds = MathUtility::forceIntegerInRange($cfgRec['timer_frequency'], 1);
    // Now, find out how many blocks of the length of frequency there is until the next time.
    // ceil() returns a float, so cast it to keep the documented integer return type.
    $frequencyBlocksUntilNextTime = (int)ceil(($currentTime - $lastSureOffset) / $frequencySeconds);
    // Set next time to the offset + the frequencyblocks multiplied with the frequency length in seconds.
    return $lastSureOffset + $frequencyBlocksUntilNextTime * $frequencySeconds;
}
727
/**
 * Tells whether $url starts with one of the URLs listed in $url_deny.
 *
 * @param string $url URL to test
 * @param string $url_deny Deny list, one URL per line. Empty lines are ignored.
 * @return bool TRUE if one of the listed URLs is the first part of $url (hence, do not index!)
 */
public function checkDeniedSuburls($url, $url_deny)
{
    // Nothing configured, nothing denied.
    if (!trim($url_deny)) {
        return false;
    }
    foreach (GeneralUtility::trimExplode(LF, $url_deny, true) as $deniedPrefix) {
        if (GeneralUtility::isFirstPartOfStr($url, $deniedPrefix)) {
            return true;
        }
    }
    return false;
}
747
/**
 * Adds a crawler queue entry on behalf of a custom hook.
 * Uses the parent crawler object kept in $this->pObj.
 *
 * @param array $cfgRec Configuration record
 * @param string $title Title/URL
 */
public function addQueueEntryForHook($cfgRec, $title)
{
    // The configuration uid is used both as reference (this must ALWAYS be the cfgRec uid!)
    // and inside the processing instruction label.
    $configurationUid = $cfgRec['uid'];
    $queueParameters = [
        'indexConfigUid' => $configurationUid,
        'url' => $title,
        'procInstructions' => ['[Index Cfg UID#' . $configurationUid . ']']
    ];
    $this->pObj->addQueueEntry_callBack(
        $cfgRec['set_id'],
        $queueParameters,
        $this->callBack,
        $cfgRec['pid']
    );
}
764
/**
 * Deletes all data stored by indexed search for a given page
 *
 * The phash values registered for the page in "index_section" are collected and every row
 * carrying one of them is removed from the index tables.
 *
 * @param int $id Uid of the page to delete all pHash
 */
public function deleteFromIndex($id)
{
    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

    // Collect the phash rows registered for the page:
    $sectionQuery = $connectionPool->getQueryBuilderForTable('index_section');
    $sectionQuery->select('phash')->from('index_section');
    $sectionQuery->where(
        $sectionQuery->expr()->eq(
            'page_id',
            $sectionQuery->createNamedParameter($id, \PDO::PARAM_INT)
        )
    );
    $phashRows = $sectionQuery->execute()->fetchAll();

    // Nothing indexed for this page.
    if (empty($phashRows)) {
        return;
    }

    $phashValues = array_column($phashRows, 'phash');
    $indexTables = [
        'index_debug',
        'index_fulltext',
        'index_grlist',
        'index_phash',
        'index_rel',
        'index_section',
    ];
    foreach ($indexTables as $indexTable) {
        $deleteQuery = $connectionPool->getQueryBuilderForTable($indexTable);
        $phashParameter = $deleteQuery->createNamedParameter($phashValues, Connection::PARAM_INT_ARRAY);
        $deleteQuery->delete($indexTable)
            ->where($deleteQuery->expr()->in('phash', $phashParameter))
            ->execute();
    }
}
815
816 /*************************
817 *
818 * Hook functions for DataHandler (indexing of records)
819 *
820 *************************/
/**
 * DataHandler hook function called before a command is processed.
 * Removes the index data of a page that is about to be deleted.
 *
 * @param string $command DataHandler command
 * @param string $table Table name
 * @param string $id Record ID. If new record its a string pointing to index inside \TYPO3\CMS\Core\DataHandling\DataHandler::substNEWwithIDs
 * @param mixed $value Target value (ignored)
 * @param DataHandler $pObj DataHandler calling object
 */
public function processCmdmap_preProcess($command, $table, $id, $value, $pObj)
{
    // Only the deletion of pages is of interest here.
    if ($command !== 'delete' || $table !== 'pages') {
        return;
    }
    // Clean up the index
    $this->deleteFromIndex($id);
}
837
/**
 * DataHandler hook function for on-the-fly indexing of database records
 *
 * Drops the index of a page that was updated to hidden or "no search", and (re)indexes the
 * saved record with every matching, currently not running "records" indexing configuration
 * that has indexing on change enabled.
 *
 * @param string $status Status "new" or "update"
 * @param string $table Table name
 * @param string $id Record ID. If new record its a string pointing to index inside \TYPO3\CMS\Core\DataHandling\DataHandler::substNEWwithIDs
 * @param array $fieldArray Field array of updated fields in the operation
 * @param DataHandler $pObj DataHandler calling object
 */
public function processDatamap_afterDatabaseOperations($status, $table, $id, $fieldArray, $pObj)
{
    // Nothing was actually written, nothing to do.
    if (empty($fieldArray)) {
        return;
    }

    if ($status === 'new') {
        // Translate the placeholder of a new record into its real uid.
        $id = $pObj->substNEWwithIDs[$id];
    } elseif ($table === 'pages' && $status === 'update') {
        $setToHidden = array_key_exists('hidden', $fieldArray) && $fieldArray['hidden'] == 1;
        $setToNoSearch = array_key_exists('no_search', $fieldArray) && $fieldArray['no_search'] == 1;
        if ($setToHidden || $setToNoSearch) {
            // The page shall be hidden or not indexed after the update: delete its index.
            $this->deleteFromIndex($id);
        }
    }

    // Get full record; without it there is nothing to index.
    $currentRecord = BackendUtility::getRecord($table, $id);
    if (!is_array($currentRecord)) {
        return;
    }

    // Select all (not running) indexing configurations of type "record" (1) which point to this
    // table and are located on the same page as the record or point to the right source PID.
    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getQueryBuilderForTable('index_config');
    $expr = $queryBuilder->expr();

    $notRunning = $expr->eq('set_id', $queryBuilder->createNamedParameter(0, \PDO::PARAM_INT));
    $isRecordType = $expr->eq('type', $queryBuilder->createNamedParameter(1, \PDO::PARAM_INT));
    $isForTable = $expr->eq('table2index', $queryBuilder->createNamedParameter($table, \PDO::PARAM_STR));
    $locatedOnRecordPage = $expr->andX(
        $expr->eq('alternative_source_pid', $queryBuilder->createNamedParameter(0, \PDO::PARAM_INT)),
        $expr->eq('pid', $queryBuilder->createNamedParameter($currentRecord['pid'], \PDO::PARAM_INT))
    );
    $pointsToRecordPage = $expr->eq(
        'alternative_source_pid',
        $queryBuilder->createNamedParameter($currentRecord['pid'], \PDO::PARAM_INT)
    );
    $indexesOnChange = $expr->eq(
        'records_indexonchange',
        $queryBuilder->createNamedParameter(1, \PDO::PARAM_INT)
    );

    $result = $queryBuilder->select('*')
        ->from('index_config')
        ->where(
            $notRunning,
            $isRecordType,
            $isForTable,
            $expr->orX($locatedOnRecordPage, $pointsToRecordPage),
            $indexesOnChange
        )
        ->execute();

    while ($cfgRec = $result->fetch()) {
        $this->indexSingleRecord($currentRecord, $cfgRec);
    }
}
905 }