[TASK] Doctrine: Migrate indexed_search part 1
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / Classes / Hook / CrawlerHook.php
1 <?php
2 namespace TYPO3\CMS\IndexedSearch\Hook;
3
4 /*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use TYPO3\CMS\Backend\Form\FormEngine;
18 use TYPO3\CMS\Backend\Utility\BackendUtility;
19 use TYPO3\CMS\Core\Database\ConnectionPool;
20 use TYPO3\CMS\Core\Utility\GeneralUtility;
21
22 /**
23 * Crawler hook for indexed search. Works with the "crawler" extension
24 */
25 class CrawlerHook
26 {
/**
 * Number of seconds between queued operations; multiplied with $instanceCounter
 * to schedule follow-up queue entries (see crawler_execute_type2/3/4).
 *
 * @var int
 */
public $secondsPerExternalUrl = 3;

/**
 * Counts up for each follow-up entry added to the queue by crawler_execute_type2/3/4.
 *
 * @var int
 */
public $instanceCounter = 0;

/**
 * Class name handed to addQueueEntry_callBack() as the callback for queued entries.
 *
 * @var string
 */
public $callBack = CrawlerHook::class;

/**
 * Parent crawler object. Assigned in crawler_execute() right before a custom hook
 * is invoked and read by addQueueEntryForHook(). Declared explicitly so it is not
 * created as a dynamic property.
 *
 * @var object|null
 */
public $pObj;
45
/**
 * Makes sure a language service object is present in $GLOBALS['LANG'].
 */
public function __construct()
{
    // The backend charset must be available; nothing to do if LANG already holds an object.
    if (is_object($GLOBALS['LANG'])) {
        return;
    }
    $languageService = GeneralUtility::makeInstance(\TYPO3\CMS\Lang\LanguageService::class);
    $GLOBALS['LANG'] = $languageService;
    $languageService->init($GLOBALS['BE_USER']->uc['lang']);
}
57
/**
 * Initialization of crawler hook.
 * This function is asked for each instance of the crawler and we must check if something is timed
 * to happen and if so put entry(s) in the crawlers log to start processing.
 * In reality we select indexing configurations and evaluate if any of them needs to run.
 *
 * @param object $pObj Parent object (tx_crawler lib)
 * @return void
 */
public function crawler_init(&$pObj)
{
    $execTime = (int)$GLOBALS['EXEC_TIME'];
    // Select all indexing configurations which are waiting to be activated:
    $indexingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
        '*',
        'index_config',
        'hidden=0'
        . ' AND (starttime=0 OR starttime<=' . $execTime . ')'
        . ' AND timer_next_indexing<' . $execTime
        . ' AND set_id=0 '
        . BackendUtility::deleteClause('index_config')
    );
    // The legacy API yields a non-array on SQL errors; treat that as "nothing to start".
    if (!is_array($indexingConfigurations)) {
        $indexingConfigurations = [];
    }
    // For each configuration, check if it should be executed and if so, start:
    foreach ($indexingConfigurations as $cfgRec) {
        // Generate a unique set-ID:
        $setId = GeneralUtility::md5int(microtime());
        // Get next time:
        $nextTime = $this->generateNextIndexingTime($cfgRec);
        // Start process by updating index-config record:
        $field_array = [
            'set_id' => $setId,
            'timer_next_indexing' => $nextTime,
            'session_data' => '',
        ];
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('index_config');
        $connection->update('index_config', $field_array, ['uid' => (int)$cfgRec['uid']]);

        // Build the queue parameters based on configuration type. $params stays NULL
        // when nothing has to be queued. The key order of the arrays is kept as-is.
        $params = null;
        switch ($cfgRec['type']) {
            case 1:
                // RECORDS:
                $params = [
                    'indexConfigUid' => $cfgRec['uid'],
                    'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
                    'url' => 'Records (start)',
                ];
                break;
            case 2:
                // FILES:
                $params = [
                    'indexConfigUid' => $cfgRec['uid'],
                    'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
                    'url' => $cfgRec['filepath'],
                    'depth' => 0,
                ];
                break;
            case 3:
                // External URL:
                $params = [
                    'indexConfigUid' => $cfgRec['uid'],
                    'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
                    'url' => $cfgRec['externalUrl'],
                    'depth' => 0,
                ];
                break;
            case 4:
                // Page tree (url carries the root page uid):
                $params = [
                    'indexConfigUid' => $cfgRec['uid'],
                    'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
                    'url' => (int)$cfgRec['alternative_source_pid'],
                    'depth' => 0,
                ];
                break;
            case 5:
                // Meta configuration, nothing to do.
                break;
            default:
                // Custom types are delegated to a registered hook object, if any.
                if (!empty($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']])) {
                    $hookObj = GeneralUtility::getUserObj($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']]);
                    if (is_object($hookObj)) {
                        // Previously an undefined variable was passed here; define it explicitly
                        // so the hook receives NULL without a notice being raised.
                        $message = null;
                        $params = [
                            'indexConfigUid' => $cfgRec['uid'],
                            'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . '/CUSTOM]'],
                            'url' => $hookObj->initMessage($message),
                        ];
                    }
                }
        }
        if ($params !== null) {
            $pObj->addQueueEntry_callBack($setId, $params, $this->callBack, $cfgRec['pid']);
        }
    }
    // Finally, look up all old index configurations which are finished and needs to be reset and done.
    $this->cleanUpOldRunningConfigurations();
}
167
/**
 * Callback executed by the crawler for a single log element.
 * Loads the indexing configuration referenced by the element, dispatches to the
 * handler for its type and stores the (possibly changed) session data afterwards.
 *
 * @param array $params Params from log element. Must contain $params['indexConfigUid']
 * @param object $pObj Parent object (tx_crawler lib)
 * @return array Result array of the form ['log' => $params]
 */
public function crawler_execute($params, &$pObj)
{
    $result = ['log' => $params];

    // Without a configuration uid there is nothing to execute.
    if (!$params['indexConfigUid']) {
        return $result;
    }

    // Fetch the indexing configuration record.
    $configurationUid = (int)$params['indexConfigUid'];
    $cfgRec = $GLOBALS['TYPO3_DB']->exec_SELECTgetSingleRow('*', 'index_config', 'uid=' . $configurationUid);
    if (!is_array($cfgRec)) {
        return $result;
    }

    // Session data is kept serialized in the record; handlers receive it by reference.
    $session_data = unserialize($cfgRec['session_data']);

    switch ($cfgRec['type']) {
        case 1:
            // Records
            $this->crawler_execute_type1($cfgRec, $session_data, $params, $pObj);
            break;
        case 2:
            // Files
            $this->crawler_execute_type2($cfgRec, $session_data, $params, $pObj);
            break;
        case 3:
            // External URL
            $this->crawler_execute_type3($cfgRec, $session_data, $params, $pObj);
            break;
        case 4:
            // Page tree
            $this->crawler_execute_type4($cfgRec, $session_data, $params, $pObj);
            break;
        case 5:
            // Meta configuration: nothing to execute (should never be reached).
            break;
        default:
            $hookReference = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']];
            if ($hookReference) {
                $hookObj = GeneralUtility::getUserObj($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']]);
                if (is_object($hookObj)) {
                    // Stored so that addQueueEntryForHook() can reach the crawler object.
                    $this->pObj = $pObj;
                    $hookObj->indexOperation($cfgRec, $session_data, $params, $this);
                }
            }
    }

    // Write back the session data, which the handler may have modified.
    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('index_config');
    $queryBuilder->update('index_config');
    $queryBuilder->set('session_data', serialize($session_data));
    $queryBuilder->where($queryBuilder->expr()->eq('uid', (int)$cfgRec['uid']));
    $queryBuilder->execute();

    return $result;
}
229
/**
 * Indexing records from a table.
 * Processes one batch of records with a uid above the last processed one and, if any
 * were found, queues a follow-up entry for the next batch.
 *
 * @param array $cfgRec Indexing Configuration Record
 * @param array $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param array $params Parameters from the log queue.
 * @param object $pObj Parent object (from "crawler" extension!)
 * @return void
 */
public function crawler_execute_type1($cfgRec, &$session_data, $params, &$pObj)
{
    // Only tables known to TCA can be indexed.
    if (!$cfgRec['table2index'] || !isset($GLOBALS['TCA'][$cfgRec['table2index']])) {
        return;
    }
    // Init session data array if not already:
    if (!is_array($session_data)) {
        $session_data = [
            'uid' => 0,
        ];
    }
    // Both pid candidates end up in the SQL string below, so both are cast to int.
    $pid = (int)$cfgRec['alternative_source_pid'] ?: (int)$cfgRec['pid'];
    $numberOfRecords = $cfgRec['recordsbatch'] ? \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($cfgRec['recordsbatch'], 1) : 100;
    // Get root line:
    $rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
    // Select the next batch, ordered by uid:
    $recs = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
        '*',
        $cfgRec['table2index'],
        'pid = ' . $pid
        . ' AND uid > ' . (int)$session_data['uid']
        . BackendUtility::deleteClause($cfgRec['table2index'])
        . BackendUtility::BEenableFields($cfgRec['table2index']),
        '',
        'uid',
        $numberOfRecords
    );
    if (empty($recs)) {
        return;
    }
    foreach ($recs as $r) {
        // Index single record:
        $this->indexSingleRecord($r, $cfgRec, $rl);
        // Update the UID we last processed:
        $session_data['uid'] = $r['uid'];
    }
    // Finally, set entry for next indexing of batch of records:
    $nparams = [
        'indexConfigUid' => $cfgRec['uid'],
        'url' => 'Records from UID#' . ($session_data['uid'] + 1) . '-?',
        'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
    ];
    $pObj->addQueueEntry_callBack($cfgRec['set_id'], $nparams, $this->callBack, $cfgRec['pid']);
}
274
/**
 * Indexing files from fileadmin.
 * A file path is indexed directly; a directory path is expanded into new queue
 * entries for its files and (up to the configured depth) its sub directories.
 *
 * @param array $cfgRec Indexing Configuration Record
 * @param array $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param array $params Parameters from the log queue.
 * @param object $pObj Parent object (from "crawler" extension!)
 * @return void
 */
public function crawler_execute_type2($cfgRec, &$session_data, $params, &$pObj)
{
    // Prepare path, making it absolute and checking:
    $readpath = $params['url'];
    if (!GeneralUtility::isAbsPath($readpath)) {
        $readpath = GeneralUtility::getFileAbsFileName($readpath);
    }
    if (!GeneralUtility::isAllowedAbsPath($readpath)) {
        return;
    }
    if (@is_file($readpath)) {
        // If file, index it!
        // Get root line (need to provide this when indexing external files)
        $rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
        // (Re)-Indexing file on page.
        $indexerObj = GeneralUtility::makeInstance(\TYPO3\CMS\IndexedSearch\Indexer::class);
        $indexerObj->backend_initIndexer($cfgRec['pid'], 0, 0, '', $rl);
        $indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
        // EXPERIMENT - but to avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)
        $indexerObj->hash['phash'] = -1;
        // Index document:
        $indexerObj->indexRegularDocument(\TYPO3\CMS\Core\Utility\PathUtility::stripPathSitePrefix($readpath), true);
    } elseif (@is_dir($readpath)) {
        // If dir, read content and create new pending items for log:
        // Select files and directories in path:
        $extList = implode(',', GeneralUtility::trimExplode(',', $cfgRec['extensions'], true));
        $fileArr = [];
        $files = GeneralUtility::getAllFilesAndFoldersInPath($fileArr, $readpath, $extList, 0, 0);
        $directoryList = GeneralUtility::get_dirs($readpath);
        if (is_array($directoryList) && $params['depth'] < $cfgRec['depth']) {
            // Guarantee exactly one separator between the directory and the sub directory
            // name, also when the configured path was given without a trailing slash.
            $directoryPrefix = rtrim($readpath, '/') . '/';
            foreach ($directoryList as $subdir) {
                if ((string)$subdir != '') {
                    $files[] = $directoryPrefix . $subdir . '/';
                }
            }
        }
        $files = GeneralUtility::removePrefixPathFromList($files, PATH_site);
        // traverse the items and create log entries:
        foreach ($files as $path) {
            $this->instanceCounter++;
            if ($path !== $params['url']) {
                // Parameters:
                $nparams = [
                    'indexConfigUid' => $cfgRec['uid'],
                    'url' => $path,
                    'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
                    'depth' => $params['depth'] + 1,
                ];
                // Each entry is scheduled a bit later than the previous one.
                $pObj->addQueueEntry_callBack($cfgRec['set_id'], $nparams, $this->callBack, $cfgRec['pid'], $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl);
            }
        }
    }
}
336
/**
 * Indexing External URLs.
 * Indexes the URL of the queue entry and, while the configured depth is not reached,
 * queues the links found on it that pass checkUrl() and checkDeniedSuburls().
 *
 * @param array $cfgRec Indexing Configuration Record
 * @param array $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param array $params Parameters from the log queue.
 * @param object $pObj Parent object (from "crawler" extension!)
 * @return void
 */
public function crawler_execute_type3($cfgRec, &$session_data, $params, &$pObj)
{
    // First call of the session: start the URL log with the entry URL.
    if (!is_array($session_data)) {
        $session_data = [
            'urlLog' => [$params['url']]
        ];
    }

    // Index the URL itself and collect the links it contains.
    $rootLineUids = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
    $foundUrls = $this->indexExtUrl($params['url'], $cfgRec['pid'], $rootLineUids, $cfgRec['uid'], $cfgRec['set_id']);

    // Do not descend further once the configured depth is reached.
    if (!($params['depth'] < $cfgRec['depth'])) {
        return;
    }

    foreach ($foundUrls as $candidate) {
        $url = $this->checkUrl($candidate, $session_data['urlLog'], $cfgRec['externalUrl']);
        if (!$url) {
            continue;
        }
        if ($this->checkDeniedSuburls($url, $cfgRec['url_deny'])) {
            continue;
        }
        $this->instanceCounter++;
        $session_data['urlLog'][] = $url;
        $queueParameters = [
            'indexConfigUid' => $cfgRec['uid'],
            'url' => $url,
            'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
            'depth' => $params['depth'] + 1
        ];
        // Spread the entries over time, one interval per added URL.
        $scheduledTime = $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl;
        $pObj->addQueueEntry_callBack($cfgRec['set_id'], $queueParameters, $this->callBack, $cfgRec['pid'], $scheduledTime);
    }
}
377
/**
 * Page tree indexing type.
 * Submits the URLs of the page given in $params['url'] to the crawler and, while the
 * configured depth is not reached, queues one entry per subpage.
 *
 * @param array $cfgRec Indexing Configuration Record
 * @param array $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param array $params Parameters from the log queue.
 * @param object $pObj Parent object (from "crawler" extension!)
 * @return void
 */
public function crawler_execute_type4($cfgRec, &$session_data, $params, &$pObj)
{
    // The queue entry carries the page uid in its "url" field.
    $pageUid = (int)$params['url'];
    $pageRow = BackendUtility::getRecord('pages', $pageUid);

    // Ask the crawler for the URLs of this page and submit them.
    $urlSets = $pObj->getUrlsForPageRow($pageRow);
    // Registry for duplicates
    $duplicateTrack = [];
    // Dummy.
    $downloadUrls = [];
    if (!empty($urlSets)) {
        foreach ($urlSets as $urlSet) {
            $pObj->urlListFromUrlArray($urlSet, $pageRow, $GLOBALS['EXEC_TIME'], 30, 1, 0, $duplicateTrack, $downloadUrls, ['tx_indexedsearch_reindex']);
        }
    }

    // Stop here once the configured depth is reached.
    if (!($params['depth'] < $cfgRec['depth'])) {
        return;
    }

    // Queue every (non-deleted) subpage of the current page.
    $subPages = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('uid,title', 'pages', 'pid = ' . $pageUid . BackendUtility::deleteClause('pages'));
    if (empty($subPages)) {
        return;
    }
    foreach ($subPages as $subPage) {
        $this->instanceCounter++;
        $session_data['urlLog'][] = 'pages:' . $subPage['uid'] . ': ' . $subPage['title'];
        $queueParameters = [
            'indexConfigUid' => $cfgRec['uid'],
            'url' => $subPage['uid'],
            'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
            'depth' => $params['depth'] + 1
        ];
        $scheduledTime = $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl;
        $pObj->addQueueEntry_callBack($cfgRec['set_id'], $queueParameters, $this->callBack, $cfgRec['pid'], $scheduledTime);
    }
}
426
/**
 * Look up all old index configurations which are finished and needs to be reset and done.
 * A configuration counts as finished when no crawler queue entry of its set is left
 * with exec_time=0. Index rows of that configuration which belong to another set id are
 * removed and the configuration's set_id and session data are cleared.
 *
 * @return void
 */
public function cleanUpOldRunningConfigurations()
{
    // Tables holding per-phash index data (loop invariant).
    $indexTables = ['index_phash', 'index_rel', 'index_section', 'index_grlist', 'index_fulltext', 'index_debug'];

    // Lookup running index configurations:
    $runningIndexingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('uid,set_id', 'index_config', 'set_id<>0' . BackendUtility::deleteClause('index_config'));
    // A non-array result signals a failed query; there is nothing safe to clean up then.
    if (!is_array($runningIndexingConfigurations)) {
        return;
    }
    // For each running configuration, look up how many log entries there are which are scheduled
    // for execution and if none, clear the "set_id" (means; Processing was DONE)
    foreach ($runningIndexingConfigurations as $cfgRec) {
        // Look for ended processes:
        $queued_items = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('*', 'tx_crawler_queue', 'set_id=' . (int)$cfgRec['set_id'] . ' AND exec_time=0');
        // FALSE means the count failed. Do not mistake that for "queue is empty",
        // because the cleanup below deletes index data.
        if ($queued_items === false || $queued_items) {
            continue;
        }
        // Lookup old phash rows:
        $oldPhashRows = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('phash', 'index_phash', 'freeIndexUid=' . (int)$cfgRec['uid'] . ' AND freeIndexSetId<>' . (int)$cfgRec['set_id']);
        if (is_array($oldPhashRows)) {
            foreach ($oldPhashRows as $pHashRow) {
                // Removing old registrations for all tables
                foreach ($indexTables as $table) {
                    $GLOBALS['TYPO3_DB']->exec_DELETEquery($table, 'phash=' . (int)$pHashRow['phash']);
                }
            }
        }
        // End process by updating index-config record (same API as used in crawler_init()):
        $field_array = [
            'set_id' => 0,
            'session_data' => '',
        ];
        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_config')
            ->update('index_config', $field_array, ['uid' => (int)$cfgRec['uid']]);
    }
}
459
460 /*****************************************
461 *
462 * Helper functions
463 *
464 *****************************************/
/**
 * Check if an input URL is allowed to be indexed.
 * The URL is normalized first (a trailing double slash is collapsed, the fragment is
 * removed). It is rejected if it contains "../", does not start with the base URL or
 * is already present in the URL log.
 *
 * @param string $url URL string to check
 * @param array $urlLog Array of already indexed URLs (input url is looked up here and must not exist already)
 * @param string $baseUrl Base URL of the indexing process (input URL must be "inside" the base URL!)
 * @return string|bool Returns the normalized URL if OK, otherwise FALSE
 */
public function checkUrl($url, $urlLog, $baseUrl)
{
    $url = preg_replace('/\\/\\/$/', '/', $url);
    list($url) = explode('#', $url);
    // No back paths allowed.
    if (strpos($url, '../') !== false) {
        return false;
    }
    // The URL has to be located below the base URL.
    if (!GeneralUtility::isFirstPartOfStr($url, $baseUrl)) {
        return false;
    }
    // Skip URLs which were registered before.
    if (in_array($url, $urlLog, true)) {
        return false;
    }
    return $url;
}
485
/**
 * Indexing External URL.
 * Indexes the given URL and returns the links found in the fetched content; links
 * without a scheme are made absolute against the document's base href.
 *
 * @param string $url URL, http://....
 * @param int $pageId Page id to relate indexing to.
 * @param array $rl Rootline array to relate indexing to
 * @param int $cfgUid Configuration UID
 * @param int $setId Set ID value
 * @return array URLs found on this page
 */
public function indexExtUrl($url, $pageId, $rl, $cfgUid, $setId)
{
    // Index external URL:
    $indexerObj = GeneralUtility::makeInstance(\TYPO3\CMS\IndexedSearch\Indexer::class);
    $indexerObj->backend_initIndexer($pageId, 0, 0, '', $rl);
    $indexerObj->backend_setFreeIndexUid($cfgUid, $setId);
    // To avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)
    $indexerObj->hash['phash'] = -1;
    $indexerObj->indexExternalUrl($url);

    // parse_url() returns FALSE for seriously malformed URLs and omits components
    // which are not present, so every part is read defensively.
    $url_qParts = parse_url($url);
    if (!is_array($url_qParts)) {
        $url_qParts = [];
    }
    $scheme = isset($url_qParts['scheme']) ? $url_qParts['scheme'] : '';
    $host = isset($url_qParts['host']) ? $url_qParts['host'] : '';
    // Keep an explicit port so links resolve against the same origin as the indexed URL.
    $port = isset($url_qParts['port']) ? ':' . $url_qParts['port'] : '';
    $path = isset($url_qParts['path']) ? $url_qParts['path'] : '';
    $baseAbsoluteHref = $scheme . '://' . $host . $port;

    $baseHref = $indexerObj->extractBaseHref($indexerObj->indexExternalUrl_content);
    if (!$baseHref) {
        // Extract base href from current URL: everything up to the last slash of the path
        $lastSlash = strrpos($path, '/');
        $baseHref = $baseAbsoluteHref . ($lastSlash === false ? '' : substr($path, 0, $lastSlash));
    }
    $baseHref = rtrim($baseHref, '/');

    // Get URLs on this page:
    $subUrls = [];
    $list = $indexerObj->extractHyperLinks($indexerObj->indexExternalUrl_content);
    foreach ($list as $linkInfo) {
        // Decode entities:
        $subUrl = htmlspecialchars_decode($linkInfo['href']);
        $qParts = parse_url($subUrl);
        if (empty($qParts['scheme'])) {
            // Link without scheme: resolve against host (leading slash) or base href.
            $relativeUrl = GeneralUtility::resolveBackPath($subUrl);
            if (strpos($relativeUrl, '/') === 0) {
                $subUrl = $baseAbsoluteHref . $relativeUrl;
            } else {
                $subUrl = $baseHref . '/' . $relativeUrl;
            }
        }
        $subUrls[] = $subUrl;
    }
    return $subUrls;
}
534
/**
 * Indexing Single Record.
 * The first field of the configured field list is used as title, all remaining
 * fields are concatenated (space separated) as content.
 *
 * @param array $r Record to index
 * @param array $cfgRec Configuration Record
 * @param array $rl Rootline array to relate indexing to; looked up from $cfgRec['pid'] if not an array
 * @return void
 */
public function indexSingleRecord($r, $cfgRec, $rl = null)
{
    // Init:
    $rl = is_array($rl) ? $rl : $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
    $fieldList = GeneralUtility::trimExplode(',', $cfgRec['fieldlist'], true);
    // Tables without a language field are indexed with language 0.
    $languageField = isset($GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']['languageField'])
        ? $GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']['languageField']
        : '';
    $sys_language_uid = $languageField ? $r[$languageField] : 0;
    // (Re)-Indexing a row from a table:
    $indexerObj = GeneralUtility::makeInstance(\TYPO3\CMS\IndexedSearch\Indexer::class);
    parse_str(str_replace('###UID###', $r['uid'], $cfgRec['get_params']), $GETparams);
    $indexerObj->backend_initIndexer($cfgRec['pid'], 0, $sys_language_uid, '', $rl, $GETparams, (bool)$cfgRec['chashcalc']);
    $indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
    $indexerObj->forceIndexing = true;
    // Both start empty so an empty field list does not leave the title undefined.
    $theTitle = '';
    $theContent = '';
    foreach ($fieldList as $k => $v) {
        if (!$k) {
            $theTitle = $r[$v];
        } else {
            $theContent .= $r[$v] . ' ';
        }
    }
    // Indexing the record as a page (but with parameters set, see ->backend_setFreeIndexUid()).
    // A space is put in front of every "<" before the tags are stripped.
    $indexerObj->backend_indexAsTYPO3Page(
        strip_tags(str_replace('<', ' <', $theTitle)),
        '',
        '',
        strip_tags(str_replace('<', ' <', $theContent)),
        'utf-8',
        $r[$GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']['tstamp']],
        $r[$GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']['crdate']],
        $r['uid']
    );
}
567
/**
 * Returns the uid values of the rootline as determined by the template service
 * after running through the templates of the given page's rootline.
 * Algorithm same as used in Web > Template, Object browser
 *
 * @param int $id The page id to traverse rootline back from
 * @return array Rootline uids, indexed by the keys of the template service's rootline
 */
public function getUidRootLineForClosestTemplate($id)
{
    $templateService = GeneralUtility::makeInstance(\TYPO3\CMS\Core\TypoScript\ExtendedTemplateService::class);
    $templateService->init();

    // Fetch the page rootline and let the template service process it;
    // this generates the constants/config + hierarchy info for the template.
    $pageRepository = GeneralUtility::makeInstance(\TYPO3\CMS\Frontend\Page\PageRepository::class);
    $templateService->runThroughTemplates($pageRepository->getRootLine($id), 0);

    // Reduce the resulting rootline to its uid values, keeping the keys.
    $uids = [];
    foreach ($templateService->rootLine as $level => $pageData) {
        $uids[$level] = $pageData['uid'];
    }
    return $uids;
}
591
/**
 * Generate the unix time stamp for next visit.
 * The result is the configured offset (relative to a midnight) plus as many whole
 * frequency blocks as needed to reach the current execution time.
 *
 * @param array $cfgRec Index configuration record
 * @return int The next time stamp
 */
public function generateNextIndexingTime($cfgRec)
{
    $currentTime = $GLOBALS['EXEC_TIME'];
    // Now, find a midnight time to use for offset calculation. This has to differ depending on
    // whether we have frequencies within a day or more than a day; Less than a day, we don't care
    // which day to use for offset, more than a day we want to respect the currently entered day as
    // offset regardless of when the script is run - thus the day-of-week used in case "Weekly" is
    // selected will be respected
    if ($cfgRec['timer_frequency'] <= 24 * 3600) {
        $aMidNight = mktime(0, 0, 0) - 1 * 24 * 3600;
    } else {
        $lastTime = $cfgRec['timer_next_indexing'] ?: $GLOBALS['EXEC_TIME'];
        // Use the full four digit year; a two digit year makes mktime() guess the century.
        $aMidNight = mktime(0, 0, 0, (int)date('n', $lastTime), (int)date('j', $lastTime), (int)date('Y', $lastTime));
    }
    // Find last offset time plus frequency in seconds:
    $lastSureOffset = $aMidNight + \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($cfgRec['timer_offset'], 0, 86400);
    $frequencySeconds = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($cfgRec['timer_frequency'], 1);
    // Now, find out how many blocks of the length of frequency there is until the next time:
    $frequencyBlocksUntilNextTime = ceil(($currentTime - $lastSureOffset) / $frequencySeconds);
    // Set next time to the offset + the frequencyblocks multiplied with the frequency length in
    // seconds. ceil() yields a float, so cast to return the documented integer time stamp.
    return (int)($lastSureOffset + $frequencyBlocksUntilNextTime * $frequencySeconds);
}
616
/**
 * Tells whether $url starts with one of the entries of the $url_deny list.
 *
 * @param string $url URL to test
 * @param string $url_deny Line-break separated list of URLs; an entry matches if it is the first part of $url
 * @return bool TRUE if there is a matching URL (hence, do not index!)
 */
public function checkDeniedSuburls($url, $url_deny)
{
    // An empty deny list never matches.
    if (!trim($url_deny)) {
        return false;
    }
    $deniedPrefixes = GeneralUtility::trimExplode(LF, $url_deny, true);
    foreach ($deniedPrefixes as $deniedPrefix) {
        if (GeneralUtility::isFirstPartOfStr($url, $deniedPrefix)) {
            return true;
        }
    }
    return false;
}
636
/**
 * Adds a queue entry on behalf of a hook, using the crawler object stored in $this->pObj.
 *
 * @param array $cfgRec Configuration record
 * @param string $title Title/URL
 * @return void
 */
public function addQueueEntryForHook($cfgRec, $title)
{
    // indexConfigUid must ALWAYS be the cfgRec uid!
    $configurationUid = $cfgRec['uid'];
    $queueParameters = [
        'indexConfigUid' => $configurationUid,
        'url' => $title,
        'procInstructions' => ['[Index Cfg UID#' . $configurationUid . ']']
    ];
    $crawler = $this->pObj;
    $crawler->addQueueEntry_callBack($cfgRec['set_id'], $queueParameters, $this->callBack, $cfgRec['pid']);
}
654
/**
 * Removes the index data of a page: every phash registered for the page in
 * index_section is deleted from all index tables.
 *
 * @param int $id Uid of the page to delete all pHash
 * @return void
 */
public function deleteFromIndex($id)
{
    $database = $GLOBALS['TYPO3_DB'];

    // Find the phash values registered for this page.
    $sectionRows = $database->exec_SELECTgetRows('phash', 'index_section', 'page_id=' . (int)$id);
    if (empty($sectionRows)) {
        return;
    }

    $phashList = [];
    foreach ($sectionRows as $sectionRow) {
        $phashList[] = $sectionRow['phash'];
    }

    // One DELETE per index table, restricted to the collected phash values.
    $condition = 'phash IN (' . implode(',', $database->cleanIntArray($phashList)) . ')';
    $indexTables = [
        'index_debug',
        'index_fulltext',
        'index_grlist',
        'index_phash',
        'index_rel',
        'index_section',
    ];
    foreach ($indexTables as $indexTable) {
        $database->exec_DELETEquery($indexTable, $condition);
    }
}
684
685 /*************************
686 *
687 * Hook functions for TCEmain (indexing of records)
688 *
689 *************************/
/**
 * TCEmain hook function: removes a page from the index when it gets deleted.
 *
 * @param string $command TCEmain command
 * @param string $table Table name
 * @param string $id Record ID. If new record its a string pointing to index inside \TYPO3\CMS\Core\DataHandling\DataHandler::substNEWwithIDs
 * @param mixed $value Target value (ignored)
 * @param object $pObj tcemain calling object (not used here)
 * @return void
 */
public function processCmdmap_preProcess($command, $table, $id, $value, $pObj)
{
    // Only the deletion of pages is of interest.
    if ($command !== 'delete' || $table !== 'pages') {
        return;
    }
    // Clean up the index
    $this->deleteFromIndex($id);
}
707
/**
 * TCEmain hook function for on-the-fly indexing of database records.
 * Pages which got hidden or excluded from search are removed from the index. After
 * that the saved record is indexed for every matching "records" indexing configuration
 * that has indexing on change enabled.
 *
 * @param string $status Status "new" or "update"
 * @param string $table Table name
 * @param string $id Record ID. If new record its a string pointing to index inside \TYPO3\CMS\Core\DataHandling\DataHandler::substNEWwithIDs
 * @param array $fieldArray Field array of updated fields in the operation
 * @param object $pObj tcemain calling object; its substNEWwithIDs property is read for new records
 * @return void
 */
public function processDatamap_afterDatabaseOperations($status, $table, $id, $fieldArray, $pObj)
{
    // Check if any fields are actually updated:
    if (empty($fieldArray)) {
        return;
    }
    if ($status === 'new') {
        // Translate new ids. An unresolved placeholder yields NULL, as before.
        $id = isset($pObj->substNEWwithIDs[$id]) ? $pObj->substNEWwithIDs[$id] : null;
    } elseif ($table === 'pages' && $status === 'update' && (array_key_exists('hidden', $fieldArray) && $fieldArray['hidden'] == 1 || array_key_exists('no_search', $fieldArray) && $fieldArray['no_search'] == 1)) {
        // If the page should be hidden or not indexed after update, delete index for this page
        $this->deleteFromIndex($id);
    }
    // Get full record and if exists, search for indexing configurations:
    $currentRecord = BackendUtility::getRecord($table, $id);
    if (!is_array($currentRecord)) {
        return;
    }
    // Select all (not running) indexing configurations of type "record" (1) and which points to
    // this table and is located on the same page as the record or pointing to the right source PID
    $recordPid = (int)$currentRecord['pid'];
    $indexingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
        '*',
        'index_config',
        'hidden=0'
        . ' AND (starttime=0 OR starttime<=' . (int)$GLOBALS['EXEC_TIME'] . ')'
        . ' AND set_id=0'
        . ' AND type=1'
        . ' AND table2index=' . $GLOBALS['TYPO3_DB']->fullQuoteStr($table, 'index_config')
        . ' AND ('
        . '(alternative_source_pid=0 AND pid=' . $recordPid . ')'
        . ' OR (alternative_source_pid=' . $recordPid . ')'
        . ')'
        . ' AND records_indexonchange=1 '
        . BackendUtility::deleteClause('index_config')
    );
    // A non-array result signals a failed query; nothing to index then.
    if (!is_array($indexingConfigurations)) {
        return;
    }
    foreach ($indexingConfigurations as $cfgRec) {
        $this->indexSingleRecord($currentRecord, $cfgRec);
    }
}
750 }