[BUGFIX] Statement::rowCount not reliable for SELECT queries
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / Classes / Hook / CrawlerHook.php
1 <?php
2 namespace TYPO3\CMS\IndexedSearch\Hook;
3
4 /*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use TYPO3\CMS\Backend\Utility\BackendUtility;
18 use TYPO3\CMS\Core\Database\Connection;
19 use TYPO3\CMS\Core\Database\ConnectionPool;
20 use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
21 use TYPO3\CMS\Core\DataHandling\DataHandler;
22 use TYPO3\CMS\Core\Utility\GeneralUtility;
23 use TYPO3\CMS\Core\Utility\MathUtility;
24
25 /**
26 * Crawler hook for indexed search. Works with the "crawler" extension
27 */
28 class CrawlerHook
29 {
/**
 * Number of seconds to use as interval between queued indexing operations of URLs / files (types 2 & 3)
 *
 * @var int
 */
public $secondsPerExternalUrl = 3;

/**
 * Counter incremented for every queue entry scheduled by the file, external URL and page tree handlers.
 * It is multiplied with $secondsPerExternalUrl to spread the scheduled execution times.
 *
 * @var int
 */
public $instanceCounter = 0;

/**
 * Class name handed to the crawler as call back for queue entries created by this hook.
 *
 * @var string
 */
public $callBack = self::class;

/**
 * Parent crawler object. Assigned in crawler_execute() before a custom hook is run and
 * read by addQueueEntryForHook(). Declared explicitly so it is not created as a dynamic property.
 *
 * @var object|null
 */
public $pObj;
48
/**
 * Makes sure a language service object is available in $GLOBALS['LANG'].
 *
 * If none exists yet, one is created and initialised with the language
 * found in the backend user's "uc" settings.
 */
public function __construct()
{
    if (is_object($GLOBALS['LANG'])) {
        // A language service is already in place, nothing to set up.
        return;
    }
    $languageService = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Localization\LanguageService::class);
    $GLOBALS['LANG'] = $languageService;
    $languageService->init($GLOBALS['BE_USER']->uc['lang']);
}
60
/**
 * Initialization of crawler hook.
 *
 * Selects all indexing configurations that are due (timer_next_indexing lies before the
 * current execution time) and not running (set_id = 0). Each of them is marked as running
 * with a fresh set id and, depending on its type, an initial entry is added to the crawler
 * queue. Finally, finished configurations are cleaned up.
 *
 * @param object $pObj Parent object (tx_crawler lib)
 */
public function crawler_init(&$pObj)
{
    $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('index_config');
    $queryBuilder = $connection->createQueryBuilder();

    // Select all indexing configurations which are waiting to be activated:
    $result = $queryBuilder->select('*')
        ->from('index_config')
        ->where(
            $queryBuilder->expr()->lt(
                'timer_next_indexing',
                $queryBuilder->createNamedParameter($GLOBALS['EXEC_TIME'], \PDO::PARAM_INT)
            ),
            $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter(0, \PDO::PARAM_INT))
        )
        ->execute();

    // Argument for custom hooks' initMessage(). Initialised explicitly so no undefined variable
    // is passed. It lives outside the loop so that a hook which (presumably) takes it by
    // reference still sees whatever a previous hook left in it - TODO confirm hook signature.
    $message = null;

    while ($cfgRec = $result->fetch()) {
        // Generate a unique set-ID and the next execution time:
        $setId = GeneralUtility::md5int(microtime());
        $nextTime = $this->generateNextIndexingTime($cfgRec);

        // Start process by updating index-config record:
        $connection->update(
            'index_config',
            [
                'set_id' => $setId,
                'timer_next_indexing' => $nextTime,
                'session_data' => ''
            ],
            [
                'uid' => (int)$cfgRec['uid']
            ]
        );

        // Parameters shared by all built-in types; "url" (and "depth") are added per type below.
        $params = [
            'indexConfigUid' => $cfgRec['uid'],
            'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']']
        ];
        $addToQueue = true;

        switch ($cfgRec['type']) {
            case 1:
                // Records
                $params['url'] = 'Records (start)';
                break;
            case 2:
                // Files
                $params['url'] = $cfgRec['filepath'];
                $params['depth'] = 0;
                break;
            case 3:
                // External URL
                $params['url'] = $cfgRec['externalUrl'];
                $params['depth'] = 0;
                break;
            case 4:
                // Page tree (url holds the root page uid)
                $params['url'] = (int)$cfgRec['alternative_source_pid'];
                $params['depth'] = 0;
                break;
            case 5:
                // Meta configuration, nothing to queue
                $addToQueue = false;
                break;
            default:
                // Custom type: only handled if a hook class is registered for it.
                if (!empty($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']])) {
                    $hookObj = GeneralUtility::makeInstance(
                        $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']]
                    );
                    $params['procInstructions'] = ['[Index Cfg UID#' . $cfgRec['uid'] . '/CUSTOM]'];
                    $params['url'] = $hookObj->initMessage($message);
                } else {
                    $addToQueue = false;
                }
        }

        if ($addToQueue) {
            $pObj->addQueueEntry_callBack($setId, $params, $this->callBack, $cfgRec['pid']);
        }
    }

    // Finally, look up all old index configurations which are finished and need to be reset.
    $this->cleanUpOldRunningConfigurations();
}
180
/**
 * Call back function for execution of a log element.
 *
 * Loads the indexing configuration referenced by $params['indexConfigUid'], dispatches to the
 * handler matching its type (or to a registered custom hook) and afterwards writes the
 * possibly modified session data back to the configuration record.
 *
 * @param array $params Params from log element. Must contain $params['indexConfigUid']
 * @param object $pObj Parent object (tx_crawler lib)
 * @return array Result array: ['log' => $params]
 */
public function crawler_execute($params, &$pObj)
{
    // Indexer configuration ID must exist:
    if (empty($params['indexConfigUid'])) {
        return ['log' => $params];
    }

    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getQueryBuilderForTable('index_config');
    $queryBuilder->getRestrictions()->removeAll();
    // Load the indexing configuration record:
    $cfgRec = $queryBuilder
        ->select('*')
        ->from('index_config')
        ->where(
            $queryBuilder->expr()->eq(
                'uid',
                $queryBuilder->createNamedParameter($params['indexConfigUid'], \PDO::PARAM_INT)
            )
        )
        ->execute()
        ->fetch();
    if (!is_array($cfgRec)) {
        return ['log' => $params];
    }

    // Unpack session data.
    // NOTE(review): unserialize() can instantiate arbitrary classes; if session data never
    // contains objects, consider restricting it via the "allowed_classes" option.
    $session_data = unserialize($cfgRec['session_data']);

    switch ($cfgRec['type']) {
        case 1:
            // Records
            $this->crawler_execute_type1($cfgRec, $session_data, $params, $pObj);
            break;
        case 2:
            // Files
            $this->crawler_execute_type2($cfgRec, $session_data, $params, $pObj);
            break;
        case 3:
            // External URL
            $this->crawler_execute_type3($cfgRec, $session_data, $params, $pObj);
            break;
        case 4:
            // Page tree
            $this->crawler_execute_type4($cfgRec, $session_data, $params, $pObj);
            break;
        case 5:
            // Meta: NOOP (should never enter here!)
            break;
        default:
            // Custom type: only handled if a hook class is registered for it.
            if (!empty($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']])) {
                $hookObj = GeneralUtility::makeInstance(
                    $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']]
                );
                // Keep the crawler object around for addQueueEntryForHook()
                $this->pObj = $pObj;
                $hookObj->indexOperation($cfgRec, $session_data, $params, $this);
            }
    }

    // Save process data which might have been modified:
    GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_config')
        ->update(
            'index_config',
            ['session_data' => serialize($session_data)],
            ['uid' => (int)$cfgRec['uid']]
        );

    return ['log' => $params];
}
252
/**
 * Indexing records from a table.
 *
 * Processes one batch of records with a uid greater than the uid stored in the session data
 * and, if the batch contained at least one record, queues another run for the next batch.
 *
 * @param array $cfgRec Indexing Configuration Record
 * @param array $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param array $params Parameters from the log queue.
 * @param object $pObj Parent object (from "crawler" extension!)
 */
public function crawler_execute_type1($cfgRec, &$session_data, $params, &$pObj)
{
    // Only tables that are configured in TCA can be indexed.
    if (!$cfgRec['table2index'] || !isset($GLOBALS['TCA'][$cfgRec['table2index']])) {
        return;
    }
    // Init session data array if not already:
    if (!is_array($session_data)) {
        $session_data = [
            'uid' => 0
        ];
    }
    // The alternative source pid wins over the pid of the configuration record.
    $pid = (int)$cfgRec['alternative_source_pid'] ?: $cfgRec['pid'];
    // Batch size, falling back to 100 if none is configured.
    $numberOfRecords = $cfgRec['recordsbatch']
        ? MathUtility::forceIntegerInRange($cfgRec['recordsbatch'], 1)
        : 100;

    // Get root line:
    $rootLine = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);

    // Select the next batch, ordered by uid so that the session uid can serve as cursor.
    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getQueryBuilderForTable($cfgRec['table2index']);
    $result = $queryBuilder->select('*')
        ->from($cfgRec['table2index'])
        ->where(
            $queryBuilder->expr()->eq(
                'pid',
                $queryBuilder->createNamedParameter($pid, \PDO::PARAM_INT)
            ),
            $queryBuilder->expr()->gt(
                'uid',
                $queryBuilder->createNamedParameter($session_data['uid'], \PDO::PARAM_INT)
            )
        )
        ->orderBy('uid')
        ->setMaxResults($numberOfRecords)
        ->execute();

    // Count the rows while traversing instead of issuing a second COUNT query: rowCount() is
    // not reliable for SELECT statements, and re-using the builder for COUNT would keep its
    // ORDER BY / LIMIT clauses, which strict SQL platforms are expected to reject.
    $processedRecords = 0;
    while ($row = $result->fetch()) {
        // Index single record:
        $this->indexSingleRecord($row, $cfgRec, $rootLine);
        // Update the UID we last processed:
        $session_data['uid'] = $row['uid'];
        $processedRecords++;
    }

    // Finally, set entry for next indexing of batch of records:
    if ($processedRecords > 0) {
        $nparams = [
            'indexConfigUid' => $cfgRec['uid'],
            'url' => 'Records from UID#' . ($session_data['uid'] + 1) . '-?',
            'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']']
        ];
        $pObj->addQueueEntry_callBack($cfgRec['set_id'], $nparams, $this->callBack, $cfgRec['pid']);
    }
}
319
/**
 * Indexing files from fileadmin.
 *
 * A file path is indexed directly. For a directory path, the contained files (and, while the
 * configured depth is not reached, its sub directories) are added to the crawler queue.
 *
 * @param array $cfgRec Indexing Configuration Record
 * @param array $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param array $params Parameters from the log queue.
 * @param object $pObj Parent object (from "crawler" extension!)
 */
public function crawler_execute_type2($cfgRec, &$session_data, $params, &$pObj)
{
    // Make the path absolute and verify that it may be accessed.
    $absolutePath = $params['url'];
    if (!GeneralUtility::isAbsPath($absolutePath)) {
        $absolutePath = GeneralUtility::getFileAbsFileName($absolutePath);
    }
    if (!GeneralUtility::isAllowedAbsPath($absolutePath)) {
        return;
    }

    if (@is_file($absolutePath)) {
        // A single file: index it. The root line has to be provided when indexing external files.
        $rootLine = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
        $indexer = GeneralUtility::makeInstance(\TYPO3\CMS\IndexedSearch\Indexer::class);
        $indexer->backend_initIndexer($cfgRec['pid'], 0, 0, '', $rootLine);
        $indexer->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
        // EXPERIMENT - but to avoid phash_t3 being written to file sections
        // (otherwise they are removed when page is reindexed!!!)
        $indexer->hash['phash'] = -1;
        $indexer->indexRegularDocument(
            \TYPO3\CMS\Core\Utility\PathUtility::stripPathSitePrefix($absolutePath),
            true
        );
        return;
    }

    if (!@is_dir($absolutePath)) {
        return;
    }

    // A directory: collect its files, plus its sub directories if we may descend further.
    $allowedExtensions = implode(',', GeneralUtility::trimExplode(',', $cfgRec['extensions'], true));
    $collected = [];
    $entries = GeneralUtility::getAllFilesAndFoldersInPath($collected, $absolutePath, $allowedExtensions, 0, 0);
    $subDirectories = GeneralUtility::get_dirs($absolutePath);
    if (is_array($subDirectories) && $params['depth'] < $cfgRec['depth']) {
        foreach ($subDirectories as $directoryName) {
            if ((string)$directoryName != '') {
                $entries[] = $absolutePath . $directoryName . '/';
            }
        }
    }
    $entries = GeneralUtility::removePrefixPathFromList($entries, PATH_site);

    // Create one queue entry per item, spreading the execution times.
    foreach ($entries as $entryPath) {
        // The counter advances for every item, including the one that is skipped below.
        $this->instanceCounter++;
        if ($entryPath === $params['url']) {
            continue;
        }
        $nparams = [
            'indexConfigUid' => $cfgRec['uid'],
            'url' => $entryPath,
            'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
            'depth' => $params['depth'] + 1
        ];
        $scheduledTime = $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl;
        $pObj->addQueueEntry_callBack($cfgRec['set_id'], $nparams, $this->callBack, $cfgRec['pid'], $scheduledTime);
    }
}
380
/**
 * Indexing External URLs.
 *
 * Indexes the URL given in $params['url'] and, while the configured depth is not reached,
 * queues the links found on that page which pass checkUrl() and checkDeniedSuburls().
 *
 * @param array $cfgRec Indexing Configuration Record
 * @param array $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param array $params Parameters from the log queue.
 * @param object $pObj Parent object (from "crawler" extension!)
 */
public function crawler_execute_type3($cfgRec, &$session_data, $params, &$pObj)
{
    // First call of the session: start the URL log with the URL at hand.
    if (!is_array($session_data)) {
        $session_data = [
            'urlLog' => [$params['url']]
        ];
    }

    // Index the URL and collect the links found on it:
    $rootLine = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
    $foundUrls = $this->indexExtUrl($params['url'], $cfgRec['pid'], $rootLine, $cfgRec['uid'], $cfgRec['set_id']);

    // Stop here if no further descent is allowed.
    if (!($params['depth'] < $cfgRec['depth'])) {
        return;
    }

    foreach ($foundUrls as $candidate) {
        $url = $this->checkUrl($candidate, $session_data['urlLog'], $cfgRec['externalUrl']);
        if (!$url || $this->checkDeniedSuburls($url, $cfgRec['url_deny'])) {
            continue;
        }
        $this->instanceCounter++;
        // Remember the URL so it is not queued a second time.
        $session_data['urlLog'][] = $url;
        $nparams = [
            'indexConfigUid' => $cfgRec['uid'],
            'url' => $url,
            'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
            'depth' => $params['depth'] + 1
        ];
        $scheduledTime = $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl;
        $pObj->addQueueEntry_callBack($cfgRec['set_id'], $nparams, $this->callBack, $cfgRec['pid'], $scheduledTime);
    }
}
420
/**
 * Page tree indexing type.
 *
 * Submits the URLs of the page whose uid is given in $params['url'] to the crawler and,
 * while the configured depth is not reached, queues all non-deleted sub pages.
 *
 * @param array $cfgRec Indexing Configuration Record
 * @param array $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param array $params Parameters from the log queue.
 * @param object $pObj Parent object (from "crawler" extension!)
 */
public function crawler_execute_type4($cfgRec, &$session_data, $params, &$pObj)
{
    // Base page uid and its record:
    $pageUid = (int)$params['url'];
    $pageRow = BackendUtility::getRecord('pages', $pageUid);

    // Submit the URLs of this page:
    $urlSets = $pObj->getUrlsForPageRow($pageRow);
    // Registry for duplicates
    $duplicateTrack = [];
    // Dummy
    $downloadUrls = [];
    if (!empty($urlSets)) {
        foreach ($urlSets as $urlSet) {
            $pObj->urlListFromUrlArray($urlSet, $pageRow, $GLOBALS['EXEC_TIME'], 30, 1, 0, $duplicateTrack, $downloadUrls, ['tx_indexedsearch_reindex']);
        }
    }

    // Stop here if no further descent is allowed.
    if (!($params['depth'] < $cfgRec['depth'])) {
        return;
    }

    // Select the sub pages, ignoring all restrictions except "deleted":
    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('pages');
    $queryBuilder->getRestrictions()
        ->removeAll()
        ->add(GeneralUtility::makeInstance(DeletedRestriction::class));
    $subPages = $queryBuilder->select('uid', 'title')
        ->from('pages')
        ->where(
            $queryBuilder->expr()->eq(
                'pid',
                $queryBuilder->createNamedParameter($pageUid, \PDO::PARAM_INT)
            )
        )
        ->execute();

    // Add every sub page to the queue, spreading the execution times:
    while ($subPage = $subPages->fetch()) {
        $this->instanceCounter++;
        $session_data['urlLog'][] = 'pages:' . $subPage['uid'] . ': ' . $subPage['title'];
        $nparams = [
            'indexConfigUid' => $cfgRec['uid'],
            'url' => $subPage['uid'],
            'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
            'depth' => $params['depth'] + 1
        ];
        $scheduledTime = $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl;
        $pObj->addQueueEntry_callBack($cfgRec['set_id'], $nparams, $this->callBack, $cfgRec['pid'], $scheduledTime);
    }
}
484
/**
 * Look up all old index configurations which are finished and need to be reset and done.
 *
 * A running configuration (set_id != 0) counts as finished when the crawler queue holds no
 * entry of its set with exec_time = 0. For such a configuration, the index data written by
 * earlier sets is removed and the configuration is marked as not running again.
 */
public function cleanUpOldRunningConfigurations()
{
    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
    // List of tables that store information related to the phash value
    $tablesToClean = [
        'index_phash',
        'index_rel',
        'index_section',
        'index_grlist',
        'index_fulltext',
        'index_debug'
    ];

    $configQueryBuilder = $connectionPool->getQueryBuilderForTable('index_config');
    $configQueryBuilder->getRestrictions()
        ->removeAll()
        ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

    // Lookup running index configurations:
    $runningIndexingConfigurations = $configQueryBuilder->select('*')
        ->from('index_config')
        ->where(
            $configQueryBuilder->expr()->neq(
                'set_id',
                $configQueryBuilder->createNamedParameter(0, \PDO::PARAM_INT)
            )
        )
        ->execute()
        ->fetchAll();

    foreach ($runningIndexingConfigurations as $cfgRec) {
        // Entries of this set that still wait for execution mean the process has not ended yet.
        $queuedItems = $connectionPool->getConnectionForTable('tx_crawler_queue')
            ->count(
                '*',
                'tx_crawler_queue',
                [
                    'set_id' => (int)$cfgRec['set_id'],
                    'exec_time' => 0
                ]
            );
        if ($queuedItems) {
            continue;
        }

        // Lookup phash rows of this configuration that were not written by the current set:
        $phashQueryBuilder = $connectionPool->getQueryBuilderForTable('index_phash');
        $oldPhashRows = $phashQueryBuilder
            ->select('phash')
            ->from('index_phash')
            ->where(
                $phashQueryBuilder->expr()->eq(
                    'freeIndexUid',
                    $phashQueryBuilder->createNamedParameter($cfgRec['uid'], \PDO::PARAM_INT)
                ),
                $phashQueryBuilder->expr()->neq(
                    'freeIndexSetId',
                    $phashQueryBuilder->createNamedParameter($cfgRec['set_id'], \PDO::PARAM_INT)
                )
            )
            ->execute()
            ->fetchAll();

        // Remove old registrations from all tables. Skipped when nothing is outdated, so no
        // DELETE statements with an empty IN() list are sent (same guard as in deleteFromIndex()).
        $oldPhashList = array_column($oldPhashRows, 'phash');
        if (!empty($oldPhashList)) {
            foreach ($tablesToClean as $table) {
                $deleteQueryBuilder = $connectionPool->getQueryBuilderForTable($table);
                $deleteQueryBuilder->delete($table)
                    ->where(
                        $deleteQueryBuilder->expr()->in(
                            'phash',
                            $deleteQueryBuilder->createNamedParameter(
                                $oldPhashList,
                                Connection::PARAM_INT_ARRAY
                            )
                        )
                    )
                    ->execute();
            }
        }

        // End process by updating index-config record:
        $connectionPool->getConnectionForTable('index_config')
            ->update(
                'index_config',
                [
                    'set_id' => 0,
                    'session_data' => ''
                ],
                ['uid' => (int)$cfgRec['uid']]
            );
    }
}
573
574 /*****************************************
575 *
576 * Helper functions
577 *
578 *****************************************/
/**
 * Checks whether an input URL is allowed to be indexed.
 *
 * A trailing double slash is reduced to a single one and any "#fragment" is stripped first.
 * The URL is accepted if it contains no "../", starts with the base URL and is not yet
 * present in the URL log.
 *
 * @param string $url URL string to check
 * @param array $urlLog Array of already indexed URLs (input url is looked up here and must not exist already)
 * @param string $baseUrl Base URL of the indexing process (input URL must be "inside" the base URL!)
 * @return string|null The cleaned up URL if OK, otherwise NULL
 */
public function checkUrl($url, $urlLog, $baseUrl)
{
    $url = preg_replace('/\\/\\/$/', '/', $url);
    list($url) = explode('#', $url);

    $isAllowed = strpos($url, '../') === false
        && GeneralUtility::isFirstPartOfStr($url, $baseUrl)
        && !in_array($url, $urlLog);

    return $isAllowed ? $url : null;
}
599
/**
 * Indexing External URL.
 *
 * Indexes the given URL and returns all hyperlinks found in its content. Links without a
 * scheme are turned into absolute URLs, based on the document's base href or - if it has
 * none - on the directory of the indexed URL.
 *
 * @param string $url URL, http://....
 * @param int $pageId Page id to relate indexing to.
 * @param array $rl Rootline array to relate indexing to
 * @param int $cfgUid Configuration UID
 * @param int $setId Set ID value
 * @return array URLs found on this page
 */
public function indexExtUrl($url, $pageId, $rl, $cfgUid, $setId)
{
    // Index external URL:
    $indexerObj = GeneralUtility::makeInstance(\TYPO3\CMS\IndexedSearch\Indexer::class);
    $indexerObj->backend_initIndexer($pageId, 0, 0, '', $rl);
    $indexerObj->backend_setFreeIndexUid($cfgUid, $setId);
    // To avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)
    $indexerObj->hash['phash'] = -1;
    $indexerObj->indexExternalUrl($url);

    // parse_url() returns FALSE for seriously malformed URLs and omits absent components,
    // so every part is read with a fallback.
    $urlParts = parse_url($url);
    if (!is_array($urlParts)) {
        $urlParts = [];
    }
    $scheme = isset($urlParts['scheme']) ? $urlParts['scheme'] : '';
    $host = isset($urlParts['host']) ? $urlParts['host'] : '';
    $path = isset($urlParts['path']) ? $urlParts['path'] : '';
    // Keep an explicit port, otherwise resolved links would point to a different origin.
    if (isset($urlParts['port'])) {
        $host .= ':' . $urlParts['port'];
    }
    $baseAbsoluteHref = $scheme . '://' . $host;

    $baseHref = $indexerObj->extractBaseHref($indexerObj->indexExternalUrl_content);
    if (!$baseHref) {
        // No base href in the document: use the directory part of the current URL.
        $lastSlash = strrpos($path, '/');
        $baseHref = $baseAbsoluteHref . ($lastSlash === false ? '' : substr($path, 0, $lastSlash));
    }
    $baseHref = rtrim($baseHref, '/');

    // Get URLs on this page:
    $subUrls = [];
    $list = $indexerObj->extractHyperLinks($indexerObj->indexExternalUrl_content);
    foreach ($list as $linkInfo) {
        // Decode entities:
        $subUrl = htmlspecialchars_decode($linkInfo['href']);
        $linkParts = parse_url($subUrl);
        if (empty($linkParts['scheme'])) {
            // Link without scheme: resolve it against the host (leading slash) or the base href.
            $relativeUrl = GeneralUtility::resolveBackPath($subUrl);
            if (substr($relativeUrl, 0, 1) === '/') {
                $subUrl = $baseAbsoluteHref . $relativeUrl;
            } else {
                $subUrl = $baseHref . '/' . $relativeUrl;
            }
        }
        $subUrls[] = $subUrl;
    }
    return $subUrls;
}
648
/**
 * Indexing Single Record.
 *
 * The first field of the configured field list provides the title, all remaining fields are
 * concatenated (separated by spaces) into the content. The record is then indexed as a page.
 *
 * @param array $r Record to index
 * @param array $cfgRec Configuration Record
 * @param array $rl Rootline array to relate indexing to
 */
public function indexSingleRecord($r, $cfgRec, $rl = null)
{
    // Init:
    $rl = is_array($rl) ? $rl : $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
    $fieldList = GeneralUtility::trimExplode(',', $cfgRec['fieldlist'], true);

    // The TCA "ctrl" section of a table does not necessarily define a language, tstamp or
    // crdate field, so each of them is looked up with a fallback.
    $tableCtrl = isset($GLOBALS['TCA'][$cfgRec['table2index']]['ctrl'])
        ? $GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']
        : [];
    $languageField = isset($tableCtrl['languageField']) ? $tableCtrl['languageField'] : '';
    $sys_language_uid = $languageField ? $r[$languageField] : 0;
    $tstampField = isset($tableCtrl['tstamp']) ? $tableCtrl['tstamp'] : '';
    $crdateField = isset($tableCtrl['crdate']) ? $tableCtrl['crdate'] : '';
    // Falls back to 0 when the table has no such field or the record carries no value.
    $mtime = ($tstampField !== '' && isset($r[$tstampField])) ? $r[$tstampField] : 0;
    $crdate = ($crdateField !== '' && isset($r[$crdateField])) ? $r[$crdateField] : 0;

    // (Re)-Indexing a row from a table:
    $indexerObj = GeneralUtility::makeInstance(\TYPO3\CMS\IndexedSearch\Indexer::class);
    parse_str(str_replace('###UID###', $r['uid'], $cfgRec['get_params']), $GETparams);
    $indexerObj->backend_initIndexer($cfgRec['pid'], 0, $sys_language_uid, '', $rl, $GETparams, (bool)$cfgRec['chashcalc']);
    $indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
    $indexerObj->forceIndexing = true;

    // Both are initialised so that an empty field list yields empty strings.
    $theTitle = '';
    $theContent = '';
    foreach ($fieldList as $k => $v) {
        $fieldValue = isset($r[$v]) ? $r[$v] : '';
        if (!$k) {
            $theTitle = $fieldValue;
        } else {
            $theContent .= $fieldValue . ' ';
        }
    }

    // Indexing the record as a page (but with parameters set, see ->backend_setFreeIndexUid()).
    // A space is put in front of each tag so that words do not get glued together by strip_tags().
    $indexerObj->backend_indexAsTYPO3Page(
        strip_tags(str_replace('<', ' <', $theTitle)),
        '',
        '',
        strip_tags(str_replace('<', ' <', $theContent)),
        'utf-8',
        $mtime,
        $crdate,
        $r['uid']
    );
}
680
/**
 * Get rootline for closest TypoScript template root.
 * Algorithm same as used in Web > Template, Object browser
 *
 * @param int $id The page id to traverse rootline back from
 * @return array Uid values of the template service's root line, keyed like that root line.
 */
public function getUidRootLineForClosestTemplate($id)
{
    $templateService = GeneralUtility::makeInstance(\TYPO3\CMS\Core\TypoScript\ExtendedTemplateService::class);
    $templateService->init();

    // Fetch the root line of the page ...
    $pageRepository = GeneralUtility::makeInstance(\TYPO3\CMS\Frontend\Page\PageRepository::class);
    $pageRootLine = $pageRepository->getRootLine($id);
    // ... and let the template service generate constants/config + hierarchy info from it.
    $templateService->runThroughTemplates($pageRootLine, 0);

    // Reduce the template service's root line to the uid of each entry.
    $uids = [];
    foreach ($templateService->rootLine as $level => $pageData) {
        $uids[$level] = $pageData['uid'];
    }
    return $uids;
}
704
/**
 * Generate the unix time stamp for next visit.
 *
 * Starting from a midnight plus the configured offset, the result is the first time stamp
 * on the frequency grid that is not before the current execution time.
 *
 * @param array $cfgRec Index configuration record
 * @return int The next time stamp
 */
public function generateNextIndexingTime($cfgRec)
{
    $currentTime = $GLOBALS['EXEC_TIME'];
    // Find a midnight time to use for offset calculation. For frequencies of at most one day
    // it does not matter which day is used, so yesterday's midnight is taken. For longer
    // frequencies the day of the currently entered next-indexing time is respected regardless
    // of when the script runs - thus e.g. the day-of-week of a weekly schedule is kept.
    if ($cfgRec['timer_frequency'] <= 24 * 3600) {
        $aMidNight = mktime(0, 0, 0) - 24 * 3600;
    } else {
        $lastTime = (int)($cfgRec['timer_next_indexing'] ?: $currentTime);
        // Four-digit year: mktime() would have to guess the century of a two-digit one.
        $aMidNight = mktime(
            0,
            0,
            0,
            (int)date('n', $lastTime),
            (int)date('j', $lastTime),
            (int)date('Y', $lastTime)
        );
    }
    // Find last offset time plus frequency in seconds:
    $lastSureOffset = $aMidNight + MathUtility::forceIntegerInRange($cfgRec['timer_offset'], 0, 86400);
    $frequencySeconds = MathUtility::forceIntegerInRange($cfgRec['timer_frequency'], 1);
    // Number of frequency blocks until the next time. ceil() returns a float, which is cast
    // so that the method really returns an integer time stamp.
    $frequencyBlocksUntilNextTime = (int)ceil(($currentTime - $lastSureOffset) / $frequencySeconds);
    // Next time is the offset plus the frequency blocks multiplied with the frequency length in seconds.
    return $lastSureOffset + $frequencyBlocksUntilNextTime * $frequencySeconds;
}
729
/**
 * Tells whether $url starts with any of the entries of the $url_deny "list".
 *
 * @param string $url URL to test
 * @param string $url_deny String where URLs are separated by line-breaks; If any of these strings is the first part of $url, the function returns TRUE (to indicate denial of descent)
 * @return bool TRUE if there is a matching URL (hence, do not index!)
 */
public function checkDeniedSuburls($url, $url_deny)
{
    // Nothing is denied when the list is empty.
    if (!trim($url_deny)) {
        return false;
    }
    foreach (GeneralUtility::trimExplode(LF, $url_deny, true) as $deniedPrefix) {
        if (GeneralUtility::isFirstPartOfStr($url, $deniedPrefix)) {
            return true;
        }
    }
    return false;
}
749
/**
 * Adds an entry to the crawler queue on behalf of a custom hook.
 * Uses the crawler object stored in $this->pObj.
 *
 * @param array $cfgRec Configuration record
 * @param string $title Title/URL
 */
public function addQueueEntryForHook($cfgRec, $title)
{
    $configurationUid = $cfgRec['uid'];
    $queueParameters = [
        // This must ALWAYS be the cfgRec uid!
        'indexConfigUid' => $configurationUid,
        'url' => $title,
        'procInstructions' => ['[Index Cfg UID#' . $configurationUid . ']']
    ];
    $this->pObj->addQueueEntry_callBack($cfgRec['set_id'], $queueParameters, $this->callBack, $cfgRec['pid']);
}
766
/**
 * Deletes all data stored by indexed search for a given page.
 *
 * The phash values registered for the page in index_section are looked up and every row
 * carrying one of them is removed from all index tables.
 *
 * @param int $id Uid of the page to delete all pHash
 */
public function deleteFromIndex($id)
{
    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

    // Lookup old phash rows:
    $sectionQueryBuilder = $connectionPool->getQueryBuilderForTable('index_section');
    $pageCondition = $sectionQueryBuilder->expr()->eq(
        'page_id',
        $sectionQueryBuilder->createNamedParameter($id, \PDO::PARAM_INT)
    );
    $oldPhashRows = $sectionQueryBuilder->select('phash')
        ->from('index_section')
        ->where($pageCondition)
        ->execute()
        ->fetchAll();

    // Nothing indexed for this page, nothing to delete.
    if (empty($oldPhashRows)) {
        return;
    }
    $phashList = array_column($oldPhashRows, 'phash');

    $indexTables = [
        'index_debug',
        'index_fulltext',
        'index_grlist',
        'index_phash',
        'index_rel',
        'index_section',
    ];
    foreach ($indexTables as $indexTable) {
        $deleteQueryBuilder = $connectionPool->getQueryBuilderForTable($indexTable);
        $phashCondition = $deleteQueryBuilder->expr()->in(
            'phash',
            $deleteQueryBuilder->createNamedParameter($phashList, Connection::PARAM_INT_ARRAY)
        );
        $deleteQueryBuilder->delete($indexTable)
            ->where($phashCondition)
            ->execute();
    }
}
817
818 /*************************
819 *
820 * Hook functions for DataHandler (indexing of records)
821 *
822 *************************/
/**
 * DataHandler hook function: removes the index data of a page that is about to be deleted.
 *
 * @param string $command DataHandler command
 * @param string $table Table name
 * @param string $id Record ID. If new record its a string pointing to index inside \TYPO3\CMS\Core\DataHandling\DataHandler::substNEWwithIDs
 * @param mixed $value Target value (ignored)
 * @param DataHandler $pObj DataHandler calling object
 */
public function processCmdmap_preProcess($command, $table, $id, $value, $pObj)
{
    // Only the deletion of pages is of interest here.
    if ($command !== 'delete' || $table !== 'pages') {
        return;
    }
    // Clean up the index
    $this->deleteFromIndex($id);
}
839
/**
 * DataHandler hook function for on-the-fly indexing of database records.
 *
 * Removes the index data of a page that got hidden or excluded from search, then indexes
 * the saved record with every matching "record" indexing configuration that is not running
 * and has indexing on change enabled.
 *
 * @param string $status Status "new" or "update"
 * @param string $table Table name
 * @param string $id Record ID. If new record its a string pointing to index inside \TYPO3\CMS\Core\DataHandling\DataHandler::substNEWwithIDs
 * @param array $fieldArray Field array of updated fields in the operation
 * @param DataHandler $pObj DataHandler calling object
 */
public function processDatamap_afterDatabaseOperations($status, $table, $id, $fieldArray, $pObj)
{
    // Check if any fields are actually updated:
    if (empty($fieldArray)) {
        return;
    }

    if ($status === 'new') {
        // Translate new ids.
        $id = $pObj->substNEWwithIDs[$id];
    } elseif ($table === 'pages' && $status === 'update') {
        // If the page should be hidden or not indexed after update, delete index for this page
        $becameHidden = array_key_exists('hidden', $fieldArray) && $fieldArray['hidden'] == 1;
        $becameExcluded = array_key_exists('no_search', $fieldArray) && $fieldArray['no_search'] == 1;
        if ($becameHidden || $becameExcluded) {
            $this->deleteFromIndex($id);
        }
    }

    // Get full record and if exists, search for indexing configurations:
    $currentRecord = BackendUtility::getRecord($table, $id);
    if (!is_array($currentRecord)) {
        return;
    }

    // Select all (not running) indexing configurations of type "record" (1) which point to
    // this table and are located on the same page as the record or point to the right source PID
    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getQueryBuilderForTable('index_config');
    $expr = $queryBuilder->expr();

    $notRunning = $expr->eq('set_id', $queryBuilder->createNamedParameter(0, \PDO::PARAM_INT));
    $isRecordType = $expr->eq('type', $queryBuilder->createNamedParameter(1, \PDO::PARAM_INT));
    $matchesTable = $expr->eq(
        'table2index',
        $queryBuilder->createNamedParameter($table, \PDO::PARAM_STR)
    );
    $matchesPage = $expr->orX(
        $expr->andX(
            $expr->eq(
                'alternative_source_pid',
                $queryBuilder->createNamedParameter(0, \PDO::PARAM_INT)
            ),
            $expr->eq(
                'pid',
                $queryBuilder->createNamedParameter($currentRecord['pid'], \PDO::PARAM_INT)
            )
        ),
        $expr->eq(
            'alternative_source_pid',
            $queryBuilder->createNamedParameter($currentRecord['pid'], \PDO::PARAM_INT)
        )
    );
    $indexOnChange = $expr->eq(
        'records_indexonchange',
        $queryBuilder->createNamedParameter(1, \PDO::PARAM_INT)
    );

    $configurations = $queryBuilder->select('*')
        ->from('index_config')
        ->where($notRunning, $isRecordType, $matchesTable, $matchesPage, $indexOnChange)
        ->execute();

    while ($cfgRec = $configurations->fetch()) {
        $this->indexSingleRecord($currentRecord, $cfgRec);
    }
}
907 }