[BUGFIX] Fix several typos in php comments
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / Classes / Hook / CrawlerHook.php
1 <?php
2 namespace TYPO3\CMS\IndexedSearch\Hook;
3
4 /*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use TYPO3\CMS\Backend\Utility\BackendUtility;
18 use TYPO3\CMS\Core\Core\Bootstrap;
19 use TYPO3\CMS\Core\Core\Environment;
20 use TYPO3\CMS\Core\Database\Connection;
21 use TYPO3\CMS\Core\Database\ConnectionPool;
22 use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
23 use TYPO3\CMS\Core\DataHandling\DataHandler;
24 use TYPO3\CMS\Core\Exception\Page\RootLineException;
25 use TYPO3\CMS\Core\Utility\GeneralUtility;
26 use TYPO3\CMS\Core\Utility\MathUtility;
27 use TYPO3\CMS\Core\Utility\RootlineUtility;
28
29 /**
30 * Crawler hook for indexed search. Works with the "crawler" extension
31 * @internal this is a TYPO3-internal hook implementation and not part of TYPO3's Core API.
32 */
33 class CrawlerHook
34 {
/**
 * Number of seconds used as spacing between queued follow-up entries.
 * It is multiplied with $instanceCounter when files/directories (type 2),
 * external URLs (type 3) and sub pages (type 4) are added to the queue.
 *
 * @var int
 */
public $secondsPerExternalUrl = 3;

/**
 * Counter incremented for every follow-up item handled while queueing
 * files/directories (type 2), external URLs (type 3) and sub pages (type 4).
 *
 * @var int
 */
public $instanceCounter = 0;

/**
 * Callback identifier handed to the crawler's addQueueEntry_callBack(); defaults to this class name.
 *
 * @var string
 */
public $callBack = self::class;

/**
 * Parent crawler object. It is assigned in crawler_execute() right before a custom
 * indexing hook is invoked and read by addQueueEntryForHook().
 * Declared explicitly so that no dynamic property has to be created at runtime.
 *
 * @var object|null
 */
public $pObj;
53
/**
 * The constructor.
 *
 * Initializes the global language object through the Bootstrap when
 * $GLOBALS['LANG'] does not hold an object yet.
 */
public function __construct()
{
    // To make sure the backend charset is available.
    // The null-coalescing guard avoids an "undefined index" notice when LANG is not set at all.
    if (!is_object($GLOBALS['LANG'] ?? null)) {
        Bootstrap::initializeLanguageObject();
    }
}
64
/**
 * Initialization of crawler hook.
 *
 * Selects every indexing configuration whose "timer_next_indexing" lies before the
 * current execution time and which is not running (set_id = 0). Each of them is marked
 * as running with a freshly generated set ID, gets its next indexing time stored and its
 * session data cleared, and - depending on the configuration type - an initial entry is
 * added to the crawler queue. Finally, finished configurations are cleaned up.
 *
 * @param object $pObj Parent object (tx_crawler lib)
 */
public function crawler_init(&$pObj)
{
    // Select all indexing configurations which are waiting to be activated:
    $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('index_config');
    $queryBuilder = $connection->createQueryBuilder();

    $result = $queryBuilder->select('*')
        ->from('index_config')
        ->where(
            $queryBuilder->expr()->lt(
                'timer_next_indexing',
                $queryBuilder->createNamedParameter($GLOBALS['EXEC_TIME'], \PDO::PARAM_INT)
            ),
            $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter(0, \PDO::PARAM_INT))
        )
        ->execute();

    while ($cfgRec = $result->fetch()) {
        // Generate a unique set ID and determine the next run:
        $setId = GeneralUtility::md5int(microtime());
        $nextTime = $this->generateNextIndexingTime($cfgRec);
        // Start the process by updating the index-config record:
        $connection->update(
            'index_config',
            [
                'set_id' => $setId,
                'timer_next_indexing' => $nextTime,
                'session_data' => ''
            ],
            [
                'uid' => (int)$cfgRec['uid']
            ]
        );
        // Queue the initial entry, if the configuration type has one:
        $params = $this->buildInitialQueueParameters($cfgRec);
        if ($params !== null) {
            $pObj->addQueueEntry_callBack($setId, $params, $this->callBack, $cfgRec['pid']);
        }
    }
    // Finally, look up all old index configurations which are finished and need to be reset.
    $this->cleanUpOldRunningConfigurations();
}

/**
 * Builds the parameter array of the first queue entry for an indexing configuration.
 *
 * @param array $cfgRec Indexing configuration record
 * @return array|null Queue parameters, or NULL when nothing has to be queued for this type
 */
private function buildInitialQueueParameters(array $cfgRec)
{
    $procInstruction = '[Index Cfg UID#' . $cfgRec['uid'] . ']';
    // The switch deliberately keeps PHP's loose comparison, as the type arrives from the database row.
    switch ($cfgRec['type']) {
        case 1:
            // Records:
            return [
                'indexConfigUid' => $cfgRec['uid'],
                'procInstructions' => [$procInstruction],
                'url' => 'Records (start)'
            ];
        case 2:
            // Files:
            $startUrl = $cfgRec['filepath'];
            break;
        case 3:
            // External URL:
            $startUrl = $cfgRec['externalUrl'];
            break;
        case 4:
            // Page tree (root page uid):
            $startUrl = (int)$cfgRec['alternative_source_pid'];
            break;
        case 5:
            // Meta configuration, nothing to do.
            return null;
        default:
            // Custom type: only handled when a hook class is registered for it.
            $hookClass = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']] ?? null;
            if (!$hookClass) {
                return null;
            }
            $hookObj = GeneralUtility::makeInstance($hookClass);
            // Explicitly initialized: the variable was previously passed without ever being defined.
            $message = null;
            return [
                'indexConfigUid' => $cfgRec['uid'],
                'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . '/CUSTOM]'],
                'url' => $hookObj->initMessage($message)
            ];
    }
    // Shared layout for the file, external URL and page tree types:
    return [
        'indexConfigUid' => $cfgRec['uid'],
        'procInstructions' => [$procInstruction],
        'url' => $startUrl,
        'depth' => 0
    ];
}
184
/**
 * Call back function for execution of a log element.
 *
 * Loads the indexing configuration referenced by $params['indexConfigUid'], dispatches
 * to the handler of its type and afterwards stores the (possibly modified) session data
 * back into the configuration record.
 *
 * @param array $params Params from log element. Must contain $params['indexConfigUid']
 * @param object $pObj Parent object (tx_crawler lib)
 * @return array Result array with the incoming parameters under the key "log"
 */
public function crawler_execute($params, &$pObj)
{
    $result = ['log' => $params];
    // Indexer configuration ID must exist:
    if (empty($params['indexConfigUid'])) {
        return $result;
    }
    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
    $queryBuilder = $connectionPool->getQueryBuilderForTable('index_config');
    $queryBuilder->getRestrictions()->removeAll();
    // Load the indexing configuration record:
    $cfgRec = $queryBuilder
        ->select('*')
        ->from('index_config')
        ->where(
            $queryBuilder->expr()->eq(
                'uid',
                $queryBuilder->createNamedParameter($params['indexConfigUid'], \PDO::PARAM_INT)
            )
        )
        ->execute()
        ->fetch();
    if (!is_array($cfgRec)) {
        return $result;
    }
    // Unpack session data. Object instantiation is disabled so that the database
    // content cannot trigger arbitrary class loading during unserialization.
    $session_data = unserialize((string)$cfgRec['session_data'], ['allowed_classes' => false]);
    // Select which type (loose comparison kept on purpose, the value comes from the database row):
    switch ($cfgRec['type']) {
        case 1:
            // Records:
            $this->crawler_execute_type1($cfgRec, $session_data, $params, $pObj);
            break;
        case 2:
            // Files:
            $this->crawler_execute_type2($cfgRec, $session_data, $params, $pObj);
            break;
        case 3:
            // External URL:
            $this->crawler_execute_type3($cfgRec, $session_data, $params, $pObj);
            break;
        case 4:
            // Page tree:
            $this->crawler_execute_type4($cfgRec, $session_data, $params, $pObj);
            break;
        case 5:
            // Meta: NOOP (should never enter here!)
            break;
        default:
            $hookClass = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']] ?? null;
            if ($hookClass) {
                $hookObj = GeneralUtility::makeInstance($hookClass);
                // Needed by addQueueEntryForHook(), which the hook may call on this object:
                $this->pObj = $pObj;
                $hookObj->indexOperation($cfgRec, $session_data, $params, $this);
            }
    }
    // Save process data which might have been modified:
    $connectionPool
        ->getConnectionForTable('index_config')
        ->update(
            'index_config',
            ['session_data' => serialize($session_data)],
            ['uid' => (int)$cfgRec['uid']]
        );
    return $result;
}
256
/**
 * Indexing records from a table.
 *
 * Processes one batch of records (ordered by uid, starting after the uid stored in the
 * session data) and queues a follow-up entry as long as records with a higher uid remain.
 *
 * @param array $cfgRec Indexing Configuration Record
 * @param array $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param array $params Parameters from the log queue.
 * @param object $pObj Parent object (from "crawler" extension!)
 */
public function crawler_execute_type1($cfgRec, &$session_data, $params, &$pObj)
{
    $table = $cfgRec['table2index'];
    if (!$table || !isset($GLOBALS['TCA'][$table])) {
        return;
    }
    // Init session data array if not already:
    if (!is_array($session_data)) {
        $session_data = [
            'uid' => 0
        ];
    }
    // Init:
    $pid = (int)$cfgRec['alternative_source_pid'] ?: $cfgRec['pid'];
    $numberOfRecords = $cfgRec['recordsbatch']
        ? MathUtility::forceIntegerInRange($cfgRec['recordsbatch'], 1)
        : 100;
    // Get root line:
    $rootLine = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);

    // Select the next batch:
    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
    $queryBuilder = $connectionPool->getQueryBuilderForTable($table);
    $result = $queryBuilder->select('*')
        ->from($table)
        ->where(
            $queryBuilder->expr()->eq(
                'pid',
                $queryBuilder->createNamedParameter($pid, \PDO::PARAM_INT)
            ),
            $queryBuilder->expr()->gt(
                'uid',
                $queryBuilder->createNamedParameter($session_data['uid'], \PDO::PARAM_INT)
            )
        )
        ->orderBy('uid')
        ->setMaxResults($numberOfRecords)
        ->execute();

    // Traverse:
    while ($row = $result->fetch()) {
        // Index single record:
        $this->indexSingleRecord($row, $cfgRec, $rootLine);
        // Remember the UID we last processed:
        $session_data['uid'] = $row['uid'];
    }

    // Count what is left AFTER this batch. A separate query builder is used on purpose:
    // re-using the batch query would keep the uid parameter bound before the loop as well
    // as its LIMIT and ORDER BY, and therefore count the records that were just indexed.
    $countQueryBuilder = $connectionPool->getQueryBuilderForTable($table);
    $remainingRecords = (int)$countQueryBuilder->count('uid')
        ->from($table)
        ->where(
            $countQueryBuilder->expr()->eq(
                'pid',
                $countQueryBuilder->createNamedParameter($pid, \PDO::PARAM_INT)
            ),
            $countQueryBuilder->expr()->gt(
                'uid',
                $countQueryBuilder->createNamedParameter($session_data['uid'], \PDO::PARAM_INT)
            )
        )
        ->execute()
        ->fetchColumn(0);

    // Finally, set entry for next indexing of batch of records:
    if ($remainingRecords > 0) {
        $nparams = [
            'indexConfigUid' => $cfgRec['uid'],
            'url' => 'Records from UID#' . ($session_data['uid'] + 1) . '-?',
            'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']']
        ];
        $pObj->addQueueEntry_callBack($cfgRec['set_id'], $nparams, $this->callBack, $cfgRec['pid']);
    }
}
323
/**
 * Indexing files from fileadmin.
 *
 * If $params['url'] points to a file, the file is indexed. If it points to a directory,
 * its files (filtered by the configured extensions) and - while the configured depth is
 * not reached - its sub directories are added to the crawler queue.
 *
 * @param array $cfgRec Indexing Configuration Record
 * @param array $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param array $params Parameters from the log queue.
 * @param object $pObj Parent object (from "crawler" extension!)
 */
public function crawler_execute_type2($cfgRec, &$session_data, $params, &$pObj)
{
    // Prepare path, making it absolute and checking:
    $readpath = $params['url'];
    if (!GeneralUtility::isAbsPath($readpath)) {
        $readpath = GeneralUtility::getFileAbsFileName($readpath);
    }
    if (!GeneralUtility::isAllowedAbsPath($readpath)) {
        return;
    }
    if (@is_file($readpath)) {
        // If file, index it!
        // Get root line (need to provide this when indexing external files)
        $rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
        // (Re)-Indexing file on page.
        $indexerObj = GeneralUtility::makeInstance(\TYPO3\CMS\IndexedSearch\Indexer::class);
        $indexerObj->backend_initIndexer($cfgRec['pid'], 0, 0, '', $rl);
        $indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
        // EXPERIMENT - but to avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)
        $indexerObj->hash['phash'] = -1;
        // Index document:
        $indexerObj->indexRegularDocument(\TYPO3\CMS\Core\Utility\PathUtility::stripPathSitePrefix($readpath), true);
        return;
    }
    if (!@is_dir($readpath)) {
        return;
    }
    // If dir, read content and create new pending items for log.
    // Select files and directories in path:
    $extList = implode(',', GeneralUtility::trimExplode(',', $cfgRec['extensions'], true));
    $fileArr = [];
    $files = GeneralUtility::getAllFilesAndFoldersInPath($fileArr, $readpath, $extList, 0, 0);
    $directoryList = GeneralUtility::get_dirs($readpath);
    if (is_array($directoryList) && $params['depth'] < $cfgRec['depth']) {
        // Normalize the separator so that a configured path without trailing slash
        // does not get glued directly onto the sub directory name.
        $directoryPrefix = rtrim($readpath, '/') . '/';
        foreach ($directoryList as $subdir) {
            if ((string)$subdir != '') {
                $files[] = $directoryPrefix . $subdir . '/';
            }
        }
    }
    $files = GeneralUtility::removePrefixPathFromList($files, Environment::getPublicPath() . '/');
    // Traverse the items and create log entries:
    foreach ($files as $path) {
        // The counter is incremented for every item, also for the one that is skipped below.
        $this->instanceCounter++;
        if ($path === $params['url']) {
            continue;
        }
        $nparams = [
            'indexConfigUid' => $cfgRec['uid'],
            'url' => $path,
            'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
            'depth' => $params['depth'] + 1
        ];
        // The last argument spreads the entries over time (presumably the scheduled execution time - confirm against the crawler API).
        $pObj->addQueueEntry_callBack(
            $cfgRec['set_id'],
            $nparams,
            $this->callBack,
            $cfgRec['pid'],
            $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl
        );
    }
}
384
/**
 * Indexing external URLs.
 *
 * Indexes the URL given in $params['url'] and, while the configured depth is not reached,
 * queues every link found on it that passes checkUrl() and is not denied by
 * checkDeniedSuburls().
 *
 * @param array $cfgRec Indexing Configuration Record
 * @param array $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param array $params Parameters from the log queue.
 * @param object $pObj Parent object (from "crawler" extension!)
 */
public function crawler_execute_type3($cfgRec, &$session_data, $params, &$pObj)
{
    // First call of the session: start the URL log with the current URL.
    if (!is_array($session_data)) {
        $session_data = ['urlLog' => [$params['url']]];
    }

    // The URL itself is always indexed, regardless of the depth.
    $rootLine = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
    $foundUrls = $this->indexExtUrl($params['url'], $cfgRec['pid'], $rootLine, $cfgRec['uid'], $cfgRec['set_id']);

    // Stop descending once the configured depth is reached.
    if (!($params['depth'] < $cfgRec['depth'])) {
        return;
    }

    foreach ($foundUrls as $candidate) {
        $acceptedUrl = $this->checkUrl($candidate, $session_data['urlLog'], $cfgRec['externalUrl']);
        if (!$acceptedUrl) {
            continue;
        }
        if ($this->checkDeniedSuburls($acceptedUrl, $cfgRec['url_deny'])) {
            continue;
        }
        $this->instanceCounter++;
        $session_data['urlLog'][] = $acceptedUrl;
        $queueParameters = [
            'indexConfigUid' => $cfgRec['uid'],
            'url' => $acceptedUrl,
            'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
            'depth' => $params['depth'] + 1
        ];
        $pObj->addQueueEntry_callBack(
            $cfgRec['set_id'],
            $queueParameters,
            $this->callBack,
            $cfgRec['pid'],
            $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl
        );
    }
}
424
/**
 * Page tree indexing type.
 *
 * Submits the URLs of the page whose uid is given in $params['url'] to the crawler and,
 * while the configured depth is not reached, queues all non-deleted sub pages.
 *
 * @param array $cfgRec Indexing Configuration Record
 * @param array $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param array $params Parameters from the log queue.
 * @param object $pObj Parent object (from "crawler" extension!)
 */
public function crawler_execute_type4($cfgRec, &$session_data, $params, &$pObj)
{
    // Base page uid:
    $pageUid = (int)$params['url'];
    // Get array of URLs from page. The record lookup may yield no array (e.g. page removed
    // meanwhile); in that case there is nothing to submit and the crawler is not called.
    $pageRow = BackendUtility::getRecord('pages', $pageUid);
    $res = is_array($pageRow) ? $pObj->getUrlsForPageRow($pageRow) : [];
    // Registry for duplicates:
    $duplicateTrack = [];
    // Dummy:
    $downloadUrls = [];
    // Submit URLs:
    if (!empty($res)) {
        foreach ($res as $vv) {
            $pObj->urlListFromUrlArray($vv, $pageRow, $GLOBALS['EXEC_TIME'], 30, 1, 0, $duplicateTrack, $downloadUrls, ['tx_indexedsearch_reindex']);
        }
    }
    // Add subpages to log now:
    if (!($params['depth'] < $cfgRec['depth'])) {
        return;
    }
    // Subpages selected (only the "deleted" restriction is applied):
    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('pages');
    $queryBuilder->getRestrictions()
        ->removeAll()
        ->add(GeneralUtility::makeInstance(DeletedRestriction::class));
    $result = $queryBuilder->select('uid', 'title')
        ->from('pages')
        ->where(
            $queryBuilder->expr()->eq(
                'pid',
                $queryBuilder->createNamedParameter($pageUid, \PDO::PARAM_INT)
            )
        )
        ->execute();
    // Traverse subpages and add to queue:
    while ($row = $result->fetch()) {
        $this->instanceCounter++;
        // Create the session array explicitly instead of relying on auto-vivification of a non-array value.
        if (!is_array($session_data)) {
            $session_data = [];
        }
        $session_data['urlLog'][] = 'pages:' . $row['uid'] . ': ' . $row['title'];
        $nparams = [
            'indexConfigUid' => $cfgRec['uid'],
            'url' => $row['uid'],
            'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
            'depth' => $params['depth'] + 1
        ];
        $pObj->addQueueEntry_callBack(
            $cfgRec['set_id'],
            $nparams,
            $this->callBack,
            $cfgRec['pid'],
            $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl
        );
    }
}
488
/**
 * Looks up all running index configurations which are finished and resets them.
 *
 * A configuration counts as finished when the crawler queue holds no entry for its
 * set ID with exec_time = 0. For those, index data belonging to the configuration but
 * to a different set ID is removed and the configuration's set ID and session data
 * are cleared.
 */
public function cleanUpOldRunningConfigurations()
{
    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
    // List of tables that store information related to the phash value
    $tablesToClean = [
        'index_phash',
        'index_rel',
        'index_section',
        'index_grlist',
        'index_fulltext',
        'index_debug'
    ];

    $queryBuilder = $connectionPool->getQueryBuilderForTable('index_config');
    $queryBuilder->getRestrictions()
        ->removeAll()
        ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

    // Lookup running index configurations:
    $runningIndexingConfigurations = $queryBuilder->select('*')
        ->from('index_config')
        ->where($queryBuilder->expr()->neq('set_id', $queryBuilder->createNamedParameter(0, \PDO::PARAM_INT)))
        ->execute()
        ->fetchAll();

    // For each running configuration, look up how many log entries there are which are scheduled
    // for execution and if none, clear the "set_id" (means: processing was DONE)
    foreach ($runningIndexingConfigurations as $cfgRec) {
        // Look for ended processes:
        $queuedItems = $connectionPool->getConnectionForTable('tx_crawler_queue')
            ->count(
                '*',
                'tx_crawler_queue',
                [
                    'set_id' => (int)$cfgRec['set_id'],
                    'exec_time' => 0
                ]
            );
        if ($queuedItems) {
            continue;
        }

        // Lookup old phash rows:
        $queryBuilder = $connectionPool->getQueryBuilderForTable('index_phash');
        $oldPhashRows = $queryBuilder
            ->select('phash')
            ->from('index_phash')
            ->where(
                $queryBuilder->expr()->eq(
                    'freeIndexUid',
                    $queryBuilder->createNamedParameter($cfgRec['uid'], \PDO::PARAM_INT)
                ),
                $queryBuilder->expr()->neq(
                    'freeIndexSetId',
                    $queryBuilder->createNamedParameter($cfgRec['set_id'], \PDO::PARAM_INT)
                )
            )
            ->execute()
            ->fetchAll();

        // Removing old registrations for all tables. The phash list is computed once and the
        // deletes are skipped entirely when it is empty, so no statement with an empty IN list is sent.
        $oldPhashes = array_column($oldPhashRows, 'phash');
        if (!empty($oldPhashes)) {
            foreach ($tablesToClean as $table) {
                $queryBuilder = $connectionPool->getQueryBuilderForTable($table);
                $queryBuilder->delete($table)
                    ->where(
                        $queryBuilder->expr()->in(
                            'phash',
                            $queryBuilder->createNamedParameter($oldPhashes, Connection::PARAM_INT_ARRAY)
                        )
                    )
                    ->execute();
            }
        }

        // End process by updating index-config record:
        $connectionPool->getConnectionForTable('index_config')
            ->update(
                'index_config',
                [
                    'set_id' => 0,
                    'session_data' => ''
                ],
                ['uid' => (int)$cfgRec['uid']]
            );
    }
}
577
578 /*****************************************
579 *
580 * Helper functions
581 *
582 *****************************************/
/**
 * Checks if an input URL is allowed to be indexed.
 *
 * A trailing double slash is reduced to a single one and any fragment ("#...") is cut off.
 * The resulting URL is accepted when it contains no "../", starts with the base URL and
 * is not present in the URL log yet.
 *
 * @param string $url URL string to check
 * @param array $urlLog Array of already indexed URLs (input url is looked up here and must not exist already)
 * @param string $baseUrl Base URL of the indexing process (input URL must be "inside" the base URL!)
 * @return string|null Returns the normalized URL if OK, otherwise NULL (callers should test for a falsy value)
 */
public function checkUrl($url, $urlLog, $baseUrl)
{
    $url = preg_replace('/\\/\\/$/', '/', $url);
    // Drop the fragment part:
    $url = explode('#', $url)[0];
    if (strpos($url, '../') !== false) {
        return null;
    }
    if (!GeneralUtility::isFirstPartOfStr($url, $baseUrl)) {
        return null;
    }
    // Strict comparison: URLs are strings and must not be matched through type juggling.
    if (in_array($url, $urlLog, true)) {
        return null;
    }
    return $url;
}
603
/**
 * Indexing External URL.
 *
 * Indexes the given URL and returns all hyperlinks found in its content. Links without a
 * scheme are made absolute: those starting with "/" against scheme and host of $url, all
 * others against the base href (taken from the content or, if absent, derived from $url).
 *
 * @param string $url URL, http://....
 * @param int $pageId Page id to relate indexing to.
 * @param array $rl Rootline array to relate indexing to
 * @param int $cfgUid Configuration UID
 * @param int $setId Set ID value
 * @return array URLs found on this page
 */
public function indexExtUrl($url, $pageId, $rl, $cfgUid, $setId)
{
    // Index external URL:
    $indexerObj = GeneralUtility::makeInstance(\TYPO3\CMS\IndexedSearch\Indexer::class);
    $indexerObj->backend_initIndexer($pageId, 0, 0, '', $rl);
    $indexerObj->backend_setFreeIndexUid($cfgUid, $setId);
    // To avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)
    $indexerObj->hash['phash'] = -1;
    $indexerObj->indexExternalUrl($url);

    // parse_url() returns false for seriously malformed URLs and omits absent components,
    // so every component is read with a fallback.
    $urlParts = parse_url($url) ?: [];
    $baseAbsoluteHref = ($urlParts['scheme'] ?? '') . '://' . ($urlParts['host'] ?? '');
    $baseHref = $indexerObj->extractBaseHref($indexerObj->indexExternalUrl_content);
    if (!$baseHref) {
        // Extract base href from current URL: everything up to (excluding) the last slash of the path
        $path = $urlParts['path'] ?? '';
        $lastSlashPosition = strrpos($path, '/');
        $baseHref = $baseAbsoluteHref;
        if ($lastSlashPosition !== false) {
            $baseHref .= substr($path, 0, $lastSlashPosition);
        }
    }
    $baseHref = rtrim($baseHref, '/');

    // Get URLs on this page:
    $subUrls = [];
    $list = $indexerObj->extractHyperLinks($indexerObj->indexExternalUrl_content);
    foreach ($list as $linkInfo) {
        // Decode entities:
        $subUrl = htmlspecialchars_decode($linkInfo['href']);
        $subUrlParts = parse_url($subUrl);
        $hasScheme = is_array($subUrlParts) && !empty($subUrlParts['scheme']);
        if (!$hasScheme) {
            $relativeUrl = GeneralUtility::resolveBackPath($subUrl);
            // strncmp() is safe for an empty string, unlike reading offset 0 directly.
            if (strncmp($relativeUrl, '/', 1) === 0) {
                $subUrl = $baseAbsoluteHref . $relativeUrl;
            } else {
                $subUrl = $baseHref . '/' . $relativeUrl;
            }
        }
        $subUrls[] = $subUrl;
    }
    return $subUrls;
}
652
/**
 * Indexing Single Record.
 *
 * The first field of the configured field list is used as title, the values of all
 * remaining fields are concatenated (space separated) as content. Tags are stripped
 * from both before the record is indexed as a page.
 *
 * @param array $r Record to index
 * @param array $cfgRec Configuration Record
 * @param array $rl Rootline array to relate indexing to; looked up from the configuration's pid when not an array
 */
public function indexSingleRecord($r, $cfgRec, $rl = null)
{
    // Init:
    $rl = is_array($rl) ? $rl : $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
    $fieldList = GeneralUtility::trimExplode(',', $cfgRec['fieldlist'], true);
    // Not every table configures languageField / tstamp / crdate, hence the fallbacks.
    $tcaControl = $GLOBALS['TCA'][$cfgRec['table2index']]['ctrl'] ?? [];
    $languageField = $tcaControl['languageField'] ?? '';
    $sys_language_uid = $languageField ? ($r[$languageField] ?? 0) : 0;
    // (Re)-Indexing a row from a table:
    $indexerObj = GeneralUtility::makeInstance(\TYPO3\CMS\IndexedSearch\Indexer::class);
    parse_str(str_replace('###UID###', $r['uid'], $cfgRec['get_params']), $GETparams);
    $indexerObj->backend_initIndexer($cfgRec['pid'], 0, $sys_language_uid, '', $rl, $GETparams);
    $indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
    $indexerObj->forceIndexing = true;
    // Both are initialized so that an empty field list does not leave the title undefined.
    $theTitle = '';
    $theContent = '';
    foreach ($fieldList as $k => $v) {
        if (!$k) {
            $theTitle = $r[$v] ?? '';
        } else {
            $theContent .= ($r[$v] ?? '') . ' ';
        }
    }
    $modificationTime = $r[$tcaControl['tstamp'] ?? ''] ?? null;
    $creationTime = $r[$tcaControl['crdate'] ?? ''] ?? null;
    // Indexing the record as a page (but with parameters set, see ->backend_setFreeIndexUid())
    $indexerObj->backend_indexAsTYPO3Page(
        strip_tags(str_replace('<', ' <', $theTitle)),
        '',
        '',
        strip_tags(str_replace('<', ' <', $theContent)),
        'utf-8',
        $modificationTime,
        $creationTime,
        $r['uid']
    );
}
684
/**
 * Collects the page uids of the rootline as determined by the template service
 * for the given page. An empty array is returned when the rootline cannot be built.
 *
 * @param int $id The page id to traverse rootline back from
 * @return array Rootline keys mapped to the uid of the respective page
 */
public function getUidRootLineForClosestTemplate($id)
{
    $uidsByLevel = [];
    try {
        $pageRootLine = GeneralUtility::makeInstance(RootlineUtility::class, $id)->get();
        // Let the template service process the rootline; it exposes its own rootline afterwards.
        $templateService = GeneralUtility::makeInstance(\TYPO3\CMS\Core\TypoScript\ExtendedTemplateService::class);
        $templateService->runThroughTemplates($pageRootLine);
        foreach ($templateService->rootLine as $level => $pageRecord) {
            $uidsByLevel[$level] = $pageRecord['uid'];
        }
    } catch (RootLineException $exception) {
        // Intentionally ignored: whatever has been collected so far is returned.
    }
    return $uidsByLevel;
}
710
/**
 * Generate the unix time stamp for next visit.
 *
 * @param array $cfgRec Index configuration record (reads timer_frequency, timer_offset and timer_next_indexing)
 * @return int The next time stamp
 */
public function generateNextIndexingTime($cfgRec)
{
    $currentTime = (int)$GLOBALS['EXEC_TIME'];
    // Now, find a midnight time to use for offset calculation. This has to differ depending on whether
    // we have frequencies within a day or more than a day; less than a day, we don't care which day to
    // use for offset, more than a day we want to respect the currently entered day as offset regardless
    // of when the script is run - thus the day-of-week used in case "Weekly" is selected will be respected.
    if ($cfgRec['timer_frequency'] <= 24 * 3600) {
        $aMidNight = mktime(0, 0, 0) - 1 * 24 * 3600;
    } else {
        $lastTime = (int)($cfgRec['timer_next_indexing'] ?: $GLOBALS['EXEC_TIME']);
        // Four-digit year ('Y'): a two-digit year would leave the century to mktime()'s guessing.
        $aMidNight = mktime(
            0,
            0,
            0,
            (int)date('m', $lastTime),
            (int)date('d', $lastTime),
            (int)date('Y', $lastTime)
        );
    }
    // Find last offset time plus frequency in seconds:
    $lastSureOffset = $aMidNight + MathUtility::forceIntegerInRange($cfgRec['timer_offset'], 0, 86400);
    $frequencySeconds = MathUtility::forceIntegerInRange($cfgRec['timer_frequency'], 1);
    // Now, find out how many blocks of the length of frequency there are until the next time:
    $frequencyBlocksUntilNextTime = ceil(($currentTime - $lastSureOffset) / $frequencySeconds);
    // Set next time to the offset + the frequency blocks multiplied with the frequency length in seconds.
    // ceil() yields a float, so the result is cast to honour the documented integer return type.
    return (int)($lastSureOffset + $frequencyBlocksUntilNextTime * $frequencySeconds);
}
735
/**
 * Tells whether $url starts with one of the entries of the $url_deny list.
 *
 * @param string $url URL to test
 * @param string $url_deny Line-break separated list of URL prefixes; a match on the beginning of $url denies descending into it
 * @return bool TRUE if there is a matching URL (hence, do not index!)
 */
public function checkDeniedSuburls($url, $url_deny)
{
    // Nothing configured, nothing denied.
    if (!trim($url_deny)) {
        return false;
    }
    foreach (GeneralUtility::trimExplode(LF, $url_deny, true) as $deniedPrefix) {
        if (GeneralUtility::isFirstPartOfStr($url, $deniedPrefix)) {
            return true;
        }
    }
    return false;
}
755
/**
 * Adds an entry to the crawler queue on behalf of a custom indexing hook.
 * Uses the parent crawler object stored in $this->pObj.
 *
 * @param array $cfgRec Configuration record
 * @param string $title Title/URL
 */
public function addQueueEntryForHook($cfgRec, $title)
{
    // This must ALWAYS be the cfgRec uid!
    $configurationUid = $cfgRec['uid'];
    $queueParameters = [
        'indexConfigUid' => $configurationUid,
        'url' => $title,
        'procInstructions' => ['[Index Cfg UID#' . $configurationUid . ']']
    ];
    $this->pObj->addQueueEntry_callBack(
        $cfgRec['set_id'],
        $queueParameters,
        $this->callBack,
        $cfgRec['pid']
    );
}
772
/**
 * Removes all index data of a page: the phash values registered for the page in
 * "index_section" are looked up and every row carrying one of them is deleted
 * from the index tables.
 *
 * @param int $id Uid of the page to delete all pHash
 */
public function deleteFromIndex($id)
{
    $pool = GeneralUtility::makeInstance(ConnectionPool::class);

    // Which phash values belong to the page?
    $lookup = $pool->getQueryBuilderForTable('index_section');
    $pageCondition = $lookup->expr()->eq(
        'page_id',
        $lookup->createNamedParameter($id, \PDO::PARAM_INT)
    );
    $phashRows = $lookup->select('phash')
        ->from('index_section')
        ->where($pageCondition)
        ->execute()
        ->fetchAll();

    // Nothing indexed for this page.
    if (empty($phashRows)) {
        return;
    }

    $phashList = array_column($phashRows, 'phash');
    $indexTables = [
        'index_debug',
        'index_fulltext',
        'index_grlist',
        'index_phash',
        'index_rel',
        'index_section',
    ];
    foreach ($indexTables as $indexTable) {
        $deletion = $pool->getQueryBuilderForTable($indexTable);
        $phashCondition = $deletion->expr()->in(
            'phash',
            $deletion->createNamedParameter($phashList, Connection::PARAM_INT_ARRAY)
        );
        $deletion->delete($indexTable)
            ->where($phashCondition)
            ->execute();
    }
}
823
824 /*************************
825 *
826 * Hook functions for DataHandler (indexing of records)
827 *
828 *************************/
/**
 * DataHandler hook function: removes the index data of a page before the page is deleted.
 *
 * @param string $command DataHandler command
 * @param string $table Table name
 * @param string $id Record ID. If new record its a string pointing to index inside \TYPO3\CMS\Core\DataHandling\DataHandler::substNEWwithIDs
 * @param mixed $value Target value (ignored)
 * @param DataHandler $pObj DataHandler calling object
 */
public function processCmdmap_preProcess($command, $table, $id, $value, $pObj)
{
    // Only page deletions require cleaning up the index.
    if ($command !== 'delete' || $table !== 'pages') {
        return;
    }
    $this->deleteFromIndex($id);
}
845
/**
 * DataHandler hook function for on-the-fly indexing of database records.
 *
 * Removes the index data of a page that got hidden or excluded from search, and
 * (re-)indexes the saved record for every matching, not running "records" indexing
 * configuration that has "records_indexonchange" enabled.
 *
 * @param string $status Status "new" or "update"
 * @param string $table Table name
 * @param string $id Record ID. If new record its a string pointing to index inside \TYPO3\CMS\Core\DataHandling\DataHandler::substNEWwithIDs
 * @param array $fieldArray Field array of updated fields in the operation
 * @param DataHandler $pObj DataHandler calling object
 */
public function processDatamap_afterDatabaseOperations($status, $table, $id, $fieldArray, $pObj)
{
    // Check if any fields are actually updated:
    if (empty($fieldArray)) {
        return;
    }
    if ($status === 'new') {
        // Translate new ids. Without a substitution entry there is no record that could be indexed.
        $id = $pObj->substNEWwithIDs[$id] ?? null;
        if ($id === null) {
            return;
        }
    } elseif ($table === 'pages' && $status === 'update') {
        $pageGotHidden = array_key_exists('hidden', $fieldArray) && $fieldArray['hidden'] == 1;
        $pageExcludedFromSearch = array_key_exists('no_search', $fieldArray) && $fieldArray['no_search'] == 1;
        if ($pageGotHidden || $pageExcludedFromSearch) {
            // If the page should be hidden or not indexed after update, delete index for this page
            $this->deleteFromIndex($id);
        }
    }
    // Get full record and if exists, search for indexing configurations:
    $currentRecord = BackendUtility::getRecord($table, $id);
    if (!is_array($currentRecord)) {
        return;
    }
    // Select all (not running) indexing configurations of type "record" (1) and
    // which points to this table and is located on the same page as the record
    // or pointing to the right source PID
    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getQueryBuilderForTable('index_config');
    $result = $queryBuilder->select('*')
        ->from('index_config')
        ->where(
            $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter(0, \PDO::PARAM_INT)),
            $queryBuilder->expr()->eq('type', $queryBuilder->createNamedParameter(1, \PDO::PARAM_INT)),
            $queryBuilder->expr()->eq(
                'table2index',
                $queryBuilder->createNamedParameter($table, \PDO::PARAM_STR)
            ),
            $queryBuilder->expr()->orX(
                $queryBuilder->expr()->andX(
                    $queryBuilder->expr()->eq(
                        'alternative_source_pid',
                        $queryBuilder->createNamedParameter(0, \PDO::PARAM_INT)
                    ),
                    $queryBuilder->expr()->eq(
                        'pid',
                        $queryBuilder->createNamedParameter($currentRecord['pid'], \PDO::PARAM_INT)
                    )
                ),
                $queryBuilder->expr()->eq(
                    'alternative_source_pid',
                    $queryBuilder->createNamedParameter($currentRecord['pid'], \PDO::PARAM_INT)
                )
            ),
            $queryBuilder->expr()->eq(
                'records_indexonchange',
                $queryBuilder->createNamedParameter(1, \PDO::PARAM_INT)
            )
        )
        ->execute();

    while ($cfgRec = $result->fetch()) {
        $this->indexSingleRecord($currentRecord, $cfgRec);
    }
}
913 }