[CLEANUP] Improve the @param/@return/@var PHPDoc
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / Classes / Hook / CrawlerHook.php
1 <?php
2 namespace TYPO3\CMS\IndexedSearch\Hook;
3
4 /*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use TYPO3\CMS\Backend\Form\FormEngine;
18 use TYPO3\CMS\Backend\Utility\BackendUtility;
19 use TYPO3\CMS\Core\Utility\GeneralUtility;
20
21 /**
22 * Crawler hook for indexed search. Works with the "crawler" extension
23 *
24 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
25 */
26 class CrawlerHook {
27
/**
 * Number of seconds to use as interval between queued indexing operations.
 * Used when scheduling follow-up queue entries for files (type 2),
 * external URLs (type 3) and page tree entries (type 4).
 *
 * @var int
 */
public $secondsPerExternalUrl = 3;

/**
 * Internal, dynamic: counts up for each queue entry handled by the
 * type 2, 3 and 4 executors. It is multiplied with $secondsPerExternalUrl
 * to spread the scheduled execution times of the queued entries.
 *
 * @var int
 */
public $instanceCounter = 0;

/**
 * Internal, static: the object reference to this class, handed to the
 * crawler as call back reference when queue entries are added.
 *
 * @var string
 */
public $callBack = '&TYPO3\\CMS\\IndexedSearch\\Hook\\CrawlerHook';

/**
 * Parent crawler object. Assigned in crawler_execute() right before a custom
 * indexing hook is run, and read by addQueueEntryForHook().
 * Declared explicitly because it used to be created as a dynamic property.
 *
 * @var object
 */
public $pObj;

/**
 * Initialization of crawler hook.
 * This function is asked for each instance of the crawler and we must check if something is timed to happen
 * and if so put entry(s) in the crawlers log to start processing.
 * In reality we select indexing configurations and evaluate if any of them needs to run.
 *
 * For every due configuration a new set-ID is generated and stored together with the next indexing time,
 * then a start entry is added to the crawler queue (configuration type 5 "meta" adds nothing).
 * Finally, finished configurations are cleaned up.
 *
 * @param object $pObj Parent object (tx_crawler lib)
 * @return void
 */
public function crawler_init(&$pObj) {
    // Select all indexing configurations which are waiting to be activated:
    $where = 'hidden=0'
        . ' AND (starttime=0 OR starttime<=' . (int)$GLOBALS['EXEC_TIME'] . ')'
        . ' AND timer_next_indexing<' . (int)$GLOBALS['EXEC_TIME']
        . ' AND set_id=0'
        . BackendUtility::deleteClause('index_config');
    $indexingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('*', 'index_config', $where);
    // The database layer may hand back a non-array (NULL) on SQL errors; treat that as "nothing to start".
    if (!is_array($indexingConfigurations)) {
        $indexingConfigurations = array();
    }
    // For each configuration, check if it should be executed and if so, start:
    foreach ($indexingConfigurations as $cfgRec) {
        // Generate a unique set-ID:
        $setId = GeneralUtility::md5int(microtime());
        // Get next time:
        $nextTime = $this->generateNextIndexingTime($cfgRec);
        // Start process by updating index-config record:
        $field_array = array(
            'set_id' => $setId,
            'timer_next_indexing' => $nextTime,
            'session_data' => ''
        );
        $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_config', 'uid=' . (int)$cfgRec['uid'], $field_array);
        // Build the queue parameters based on configuration type; NULL means "nothing to queue".
        $params = NULL;
        $procInstructions = array('[Index Cfg UID#' . $cfgRec['uid'] . ']');
        switch ($cfgRec['type']) {
            case 1:
                // Records:
                $params = array(
                    'indexConfigUid' => $cfgRec['uid'],
                    'procInstructions' => $procInstructions,
                    'url' => 'Records (start)'
                );
                break;
            case 2:
                // Files:
                $params = array(
                    'indexConfigUid' => $cfgRec['uid'],
                    'procInstructions' => $procInstructions,
                    'url' => $cfgRec['filepath'],
                    'depth' => 0
                );
                break;
            case 3:
                // External URL:
                $params = array(
                    'indexConfigUid' => $cfgRec['uid'],
                    'procInstructions' => $procInstructions,
                    'url' => $cfgRec['externalUrl'],
                    'depth' => 0
                );
                break;
            case 4:
                // Page tree (the "url" carries the uid of the root page):
                $params = array(
                    'indexConfigUid' => $cfgRec['uid'],
                    'procInstructions' => $procInstructions,
                    'url' => (int)$cfgRec['alternative_source_pid'],
                    'depth' => 0
                );
                break;
            case 5:
                // Meta configuration, nothing to do.
                break;
            default:
                // Custom type: delegate to a hook object registered for this type, if any.
                // !empty() avoids an "undefined index" notice for unregistered types.
                if (!empty($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']])) {
                    $hookObj = GeneralUtility::getUserObj($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']]);
                    if (is_object($hookObj)) {
                        // $message was never defined in this scope; initialise it so the hook
                        // still receives NULL, but without an "undefined variable" notice.
                        $message = NULL;
                        $params = array(
                            'indexConfigUid' => $cfgRec['uid'],
                            'procInstructions' => array('[Index Cfg UID#' . $cfgRec['uid'] . '/CUSTOM]'),
                            'url' => $hookObj->initMessage($message)
                        );
                    }
                }
        }
        if ($params !== NULL) {
            $pObj->addQueueEntry_callBack($setId, $params, $this->callBack, $cfgRec['pid']);
        }
    }
    // Finally, look up all old index configurations which are finished and needs to be reset and done.
    $this->cleanUpOldRunningConfigurations();
}
156
/**
 * Call back function for execution of a log element.
 *
 * Loads the indexing configuration referenced by the log element, unpacks its
 * session data, dispatches to the executor matching the configuration type and
 * finally writes the (possibly modified) session data back to the record.
 *
 * @param array $params Params from log element. Must contain $params['indexConfigUid']
 * @param object $pObj Parent object (tx_crawler lib)
 * @return array Result array, carrying the params under the key "log"
 */
public function crawler_execute($params, &$pObj) {
    // Indexer configuration ID must exist:
    if (!$params['indexConfigUid']) {
        return array('log' => $params);
    }
    // Load the indexing configuration record:
    $cfgRec = $GLOBALS['TYPO3_DB']->exec_SELECTgetSingleRow('*', 'index_config', 'uid=' . (int)$params['indexConfigUid']);
    if (!is_array($cfgRec)) {
        return array('log' => $params);
    }
    // Unpack session data:
    $session_data = unserialize($cfgRec['session_data']);
    // Dispatch on configuration type:
    switch ($cfgRec['type']) {
        case 1:
            // Records
            $this->crawler_execute_type1($cfgRec, $session_data, $params, $pObj);
            break;
        case 2:
            // Files
            $this->crawler_execute_type2($cfgRec, $session_data, $params, $pObj);
            break;
        case 3:
            // External URL
            $this->crawler_execute_type3($cfgRec, $session_data, $params, $pObj);
            break;
        case 4:
            // Page tree
            $this->crawler_execute_type4($cfgRec, $session_data, $params, $pObj);
            break;
        case 5:
            // Meta: nothing to execute (should never enter here!)
            break;
        default:
            // Custom type handled by a registered hook object
            $customIndexerReference = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']];
            if ($customIndexerReference) {
                $customIndexer = GeneralUtility::getUserObj($customIndexerReference);
                if (is_object($customIndexer)) {
                    // Keep the parent object around for addQueueEntryForHook()
                    $this->pObj = $pObj;
                    $customIndexer->indexOperation($cfgRec, $session_data, $params, $this);
                }
            }
    }
    // Save process data which might be modified:
    $GLOBALS['TYPO3_DB']->exec_UPDATEquery(
        'index_config',
        'uid=' . (int)$cfgRec['uid'],
        array('session_data' => serialize($session_data))
    );
    return array('log' => $params);
}
213
/**
 * Indexing records from a table.
 *
 * Indexes one batch of records (ordered by uid, starting after the uid stored in the
 * session data) and, if the batch was not empty, queues an entry for the next batch.
 *
 * @param array $cfgRec Indexing Configuration Record
 * @param array $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param array $params Parameters from the log queue.
 * @param object $pObj Parent object (from "crawler" extension!)
 * @return void
 */
public function crawler_execute_type1($cfgRec, &$session_data, $params, &$pObj) {
    // The table to index must be configured and known to TCA:
    if (!$cfgRec['table2index'] || !isset($GLOBALS['TCA'][$cfgRec['table2index']])) {
        return;
    }
    // Init session data array if not already:
    if (!is_array($session_data)) {
        $session_data = array(
            'uid' => 0
        );
    }
    // Source page: the alternative source pid wins if set. Both candidates are cast to int
    // because the value is concatenated into the SQL WHERE clause below.
    $pid = (int)$cfgRec['alternative_source_pid'] ?: (int)$cfgRec['pid'];
    $numberOfRecords = $cfgRec['recordsbatch'] ? \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($cfgRec['recordsbatch'], 1) : 100;
    // Get root line:
    $rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
    // Select the next batch of records:
    $where = 'pid = ' . $pid
        . ' AND uid > ' . (int)$session_data['uid']
        . BackendUtility::deleteClause($cfgRec['table2index'])
        . BackendUtility::BEenableFields($cfgRec['table2index']);
    $recs = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('*', $cfgRec['table2index'], $where, '', 'uid', $numberOfRecords);
    // Nothing (more) to index; also covers a non-array (NULL) result on SQL errors:
    if (empty($recs) || !is_array($recs)) {
        return;
    }
    foreach ($recs as $r) {
        // Index single record:
        $this->indexSingleRecord($r, $cfgRec, $rl);
        // Update the UID we last processed:
        $session_data['uid'] = $r['uid'];
    }
    // Finally, set entry for next indexing of batch of records:
    $nparams = array(
        'indexConfigUid' => $cfgRec['uid'],
        'url' => 'Records from UID#' . ($session_data['uid'] + 1) . '-?',
        'procInstructions' => array('[Index Cfg UID#' . $cfgRec['uid'] . ']')
    );
    $pObj->addQueueEntry_callBack($cfgRec['set_id'], $nparams, $this->callBack, $cfgRec['pid']);
}
257
/**
 * Indexing files from fileadmin.
 *
 * If the queued path is a file it is indexed; if it is a directory, its files (and, while the
 * configured depth is not exceeded, its sub directories) are added to the crawler queue.
 *
 * @param array $cfgRec Indexing Configuration Record
 * @param array $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param array $params Parameters from the log queue.
 * @param object $pObj Parent object (from "crawler" extension!)
 * @return void
 */
public function crawler_execute_type2($cfgRec, &$session_data, $params, &$pObj) {
    // Prepare path, making it absolute and checking:
    $readpath = $params['url'];
    if (!GeneralUtility::isAbsPath($readpath)) {
        $readpath = GeneralUtility::getFileAbsFileName($readpath);
    }
    if (!GeneralUtility::isAllowedAbsPath($readpath)) {
        return;
    }
    if (@is_file($readpath)) {
        // If file, index it!
        // Get root line (need to provide this when indexing external files)
        $rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
        // (Re)-Indexing file on page.
        $indexerObj = GeneralUtility::makeInstance(\TYPO3\CMS\IndexedSearch\Indexer::class);
        $indexerObj->backend_initIndexer($cfgRec['pid'], 0, 0, '', $rl);
        $indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
        // EXPERIMENT - but to avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)
        $indexerObj->hash['phash'] = -1;
        // Index document:
        $indexerObj->indexRegularDocument(\TYPO3\CMS\Core\Utility\PathUtility::stripPathSitePrefix($readpath), TRUE);
        return;
    }
    if (!@is_dir($readpath)) {
        return;
    }
    // If dir, read content and create new pending items for log:
    // Select files and directories in path:
    $extList = implode(',', GeneralUtility::trimExplode(',', $cfgRec['extensions'], TRUE));
    $fileArr = array();
    $files = GeneralUtility::getAllFilesAndFoldersInPath($fileArr, $readpath, $extList, 0, 0);
    $directoryList = GeneralUtility::get_dirs($readpath);
    if (is_array($directoryList) && $params['depth'] < $cfgRec['depth']) {
        foreach ($directoryList as $subdir) {
            if ((string)$subdir != '') {
                $files[] = $readpath . $subdir . '/';
            }
        }
    }
    $files = GeneralUtility::removePrefixPathFromList($files, PATH_site);
    // removePrefixPathFromList() hands back an error string instead of an array when an entry
    // does not carry the prefix; there is nothing sensible to queue in that case.
    if (!is_array($files)) {
        return;
    }
    // Traverse the items and create log entries:
    foreach ($files as $path) {
        $this->instanceCounter++;
        if ($path !== $params['url']) {
            $nparams = array(
                'indexConfigUid' => $cfgRec['uid'],
                'url' => $path,
                'procInstructions' => array('[Index Cfg UID#' . $cfgRec['uid'] . ']'),
                'depth' => $params['depth'] + 1
            );
            // Spread execution: each entry is scheduled $secondsPerExternalUrl later than the previous one.
            $pObj->addQueueEntry_callBack($cfgRec['set_id'], $nparams, $this->callBack, $cfgRec['pid'], $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl);
        }
    }
}
318
/**
 * Indexing External URLs.
 *
 * Indexes the queued URL and, while the configured depth is not reached, queues every
 * link found on it that passes checkUrl() and is not denied by checkDeniedSuburls().
 *
 * @param array $cfgRec Indexing Configuration Record
 * @param array $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param array $params Parameters from the log queue.
 * @param object $pObj Parent object (from "crawler" extension!)
 * @return void
 */
public function crawler_execute_type3($cfgRec, &$session_data, $params, &$pObj) {
    // First call of the session: start the URL log with the URL being processed.
    if (!is_array($session_data)) {
        $session_data = array(
            'urlLog' => array($params['url'])
        );
    }
    // Index the URL and collect the links found on it:
    $rootLine = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
    $linkedUrls = $this->indexExtUrl($params['url'], $cfgRec['pid'], $rootLine, $cfgRec['uid'], $cfgRec['set_id']);
    // Stop descending once the configured depth is reached:
    if (!($params['depth'] < $cfgRec['depth'])) {
        return;
    }
    foreach ($linkedUrls as $candidate) {
        $acceptedUrl = $this->checkUrl($candidate, $session_data['urlLog'], $cfgRec['externalUrl']);
        if (!$acceptedUrl) {
            continue;
        }
        if ($this->checkDeniedSuburls($acceptedUrl, $cfgRec['url_deny'])) {
            continue;
        }
        $this->instanceCounter++;
        $session_data['urlLog'][] = $acceptedUrl;
        $queueParameters = array(
            'indexConfigUid' => $cfgRec['uid'],
            'url' => $acceptedUrl,
            'procInstructions' => array('[Index Cfg UID#' . $cfgRec['uid'] . ']'),
            'depth' => $params['depth'] + 1
        );
        $scheduledTime = $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl;
        $pObj->addQueueEntry_callBack($cfgRec['set_id'], $queueParameters, $this->callBack, $cfgRec['pid'], $scheduledTime);
    }
}
358
/**
 * Page tree indexing type.
 *
 * Submits the URLs of the queued page to the crawler (with the "tx_indexedsearch_reindex"
 * processing instruction) and, while the configured depth is not reached, queues all
 * non-deleted subpages for the same treatment.
 *
 * @param array $cfgRec Indexing Configuration Record
 * @param array $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param array $params Parameters from the log queue.
 * @param object $pObj Parent object (from "crawler" extension!)
 * @return void
 */
public function crawler_execute_type4($cfgRec, &$session_data, $params, &$pObj) {
    // Base page uid:
    $pageUid = (int)$params['url'];
    // Get array of URLs from page:
    $pageRow = BackendUtility::getRecord('pages', $pageUid);
    $res = $pObj->getUrlsForPageRow($pageRow);
    // Registry for duplicates
    $duplicateTrack = array();
    // Dummy.
    $downloadUrls = array();
    // Submit URLs (the return value of urlListFromUrlArray() is not needed here):
    if (is_array($res)) {
        foreach ($res as $vv) {
            $pObj->urlListFromUrlArray($vv, $pageRow, $GLOBALS['EXEC_TIME'], 30, 1, 0, $duplicateTrack, $downloadUrls, array('tx_indexedsearch_reindex'));
        }
    }
    // Add subpages to log now:
    if (!($params['depth'] < $cfgRec['depth'])) {
        return;
    }
    $recs = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('uid,title', 'pages', 'pid = ' . $pageUid . BackendUtility::deleteClause('pages'));
    // A non-array (NULL) result on SQL errors is treated like "no subpages".
    if (!is_array($recs)) {
        return;
    }
    // Traverse subpages and add to queue:
    foreach ($recs as $r) {
        $this->instanceCounter++;
        // The session data is FALSE on the first call (nothing unserialized yet); make it an
        // array explicitly instead of relying on implicit conversion when appending.
        if (!is_array($session_data)) {
            $session_data = array();
        }
        $session_data['urlLog'][] = 'pages:' . $r['uid'] . ': ' . $r['title'];
        $nparams = array(
            'indexConfigUid' => $cfgRec['uid'],
            'url' => $r['uid'],
            'procInstructions' => array('[Index Cfg UID#' . $cfgRec['uid'] . ']'),
            'depth' => $params['depth'] + 1
        );
        $pObj->addQueueEntry_callBack($cfgRec['set_id'], $nparams, $this->callBack, $cfgRec['pid'], $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl);
    }
}
406
/**
 * Look up all old index configurations which are finished and needs to be reset and done.
 *
 * A running configuration (set_id<>0) counts as finished when the crawler queue holds no
 * unexecuted entry (exec_time=0) for its set. For finished configurations the index rows that
 * belong to the configuration but stem from another set are removed, then the configuration
 * is reset (set_id=0, empty session data).
 *
 * @return void
 */
public function cleanUpOldRunningConfigurations() {
    // Tables holding rows keyed by phash
    // (code copied from \TYPO3\CMS\IndexedSearch\Domain\Repository\IndexedPagesController\AdministrationRepository)
    $tableArr = array('index_phash', 'index_rel', 'index_section', 'index_grlist', 'index_fulltext', 'index_debug');
    // Lookup running index configurations:
    $runningIndexingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('uid,set_id', 'index_config', 'set_id<>0' . BackendUtility::deleteClause('index_config'));
    // A non-array (NULL) result on SQL errors means there is nothing we can clean up.
    if (!is_array($runningIndexingConfigurations)) {
        return;
    }
    // For each running configuration, look up how many log entries there are which are scheduled
    // for execution and if none, clear the "set_id" (means; Processing was DONE)
    foreach ($runningIndexingConfigurations as $cfgRec) {
        // Look for ended processes:
        $queued_items = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('*', 'tx_crawler_queue', 'set_id=' . (int)$cfgRec['set_id'] . ' AND exec_time=0');
        if ($queued_items) {
            continue;
        }
        // Lookup old phash rows:
        $oldPhashRows = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('phash', 'index_phash', 'freeIndexUid=' . (int)$cfgRec['uid'] . ' AND freeIndexSetId<>' . (int)$cfgRec['set_id']);
        if (is_array($oldPhashRows)) {
            foreach ($oldPhashRows as $pHashRow) {
                // Removing old registrations for all tables
                foreach ($tableArr as $table) {
                    $GLOBALS['TYPO3_DB']->exec_DELETEquery($table, 'phash=' . (int)$pHashRow['phash']);
                }
            }
        }
        // End process by updating index-config record:
        $field_array = array(
            'set_id' => 0,
            'session_data' => ''
        );
        $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_config', 'uid=' . (int)$cfgRec['uid'], $field_array);
    }
}
438
439 /*****************************************
440 *
441 * Helper functions
442 *
443 *****************************************/
/**
 * Check if an input URL is allowed to be indexed.
 *
 * The URL is normalized first (a trailing double slash is reduced to one, a "#fragment" is cut
 * off). It is accepted if it contains no "../", starts with the base URL and is not yet present
 * in the URL log.
 *
 * @param string $url URL string to check
 * @param array $urlLog Array of already indexed URLs (input url is looked up here and must not exist already)
 * @param string $baseUrl Base URL of the indexing process (input URL must be "inside" the base URL!)
 * @return string|bool Returns the normalized URL if OK, otherwise FALSE
 */
public function checkUrl($url, $urlLog, $baseUrl) {
    $url = preg_replace('/\\/\\/$/', '/', $url);
    list($url) = explode('#', $url);
    if (strpos($url, '../') === FALSE
        && GeneralUtility::isFirstPartOfStr($url, $baseUrl)
        && !in_array($url, $urlLog)
    ) {
        return $url;
    }
    // Explicit FALSE as documented (the method used to fall through and return NULL).
    return FALSE;
}
463
/**
 * Indexing External URL.
 *
 * Indexes the given URL and returns all hyperlinks found in its content as absolute URLs.
 * Links without a scheme are resolved against the site root (if they start with "/") or
 * against the base href of the document (taken from the content, or derived from $url).
 *
 * @param string $url URL, http://....
 * @param int $pageId Page id to relate indexing to.
 * @param array $rl Rootline array to relate indexing to
 * @param int $cfgUid Configuration UID
 * @param int $setId Set ID value
 * @return array URLs found on this page
 */
public function indexExtUrl($url, $pageId, $rl, $cfgUid, $setId) {
    // Index external URL:
    $indexerObj = GeneralUtility::makeInstance(\TYPO3\CMS\IndexedSearch\Indexer::class);
    $indexerObj->backend_initIndexer($pageId, 0, 0, '', $rl);
    $indexerObj->backend_setFreeIndexUid($cfgUid, $setId);
    // To avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)
    $indexerObj->hash['phash'] = -1;
    $indexerObj->indexExternalUrl($url);
    // parse_url() returns FALSE for seriously malformed URLs and omits components that are absent.
    $url_qParts = parse_url($url);
    if (!is_array($url_qParts)) {
        $url_qParts = array();
    }
    $scheme = isset($url_qParts['scheme']) ? $url_qParts['scheme'] : '';
    $host = isset($url_qParts['host']) ? $url_qParts['host'] : '';
    $path = isset($url_qParts['path']) ? $url_qParts['path'] : '';
    $baseAbsoluteHref = $scheme . '://' . $host;
    // Keep an explicit port, otherwise relative links would be resolved to a different origin.
    if (isset($url_qParts['port'])) {
        $baseAbsoluteHref .= ':' . $url_qParts['port'];
    }
    $baseHref = $indexerObj->extractBaseHref($indexerObj->indexExternalUrl_content);
    if (!$baseHref) {
        // Extract base href from current URL: everything up to (excluding) the last slash of the path
        $baseHref = $baseAbsoluteHref;
        $lastSlashPosition = strrpos($path, '/');
        if ($lastSlashPosition !== FALSE) {
            $baseHref .= substr($path, 0, $lastSlashPosition);
        }
    }
    $baseHref = rtrim($baseHref, '/');
    // Get URLs on this page:
    $subUrls = array();
    $list = $indexerObj->extractHyperLinks($indexerObj->indexExternalUrl_content);
    // Traverse links:
    foreach ($list as $linkInfo) {
        // Decode entities:
        $subUrl = htmlspecialchars_decode($linkInfo['href']);
        $qParts = parse_url($subUrl);
        // No scheme: the link is relative and has to be made absolute.
        if (empty($qParts['scheme'])) {
            $relativeUrl = GeneralUtility::resolveBackPath($subUrl);
            // strpos() instead of $relativeUrl[0]: safe for an empty href as well
            if (strpos($relativeUrl, '/') === 0) {
                $subUrl = $baseAbsoluteHref . $relativeUrl;
            } else {
                $subUrl = $baseHref . '/' . $relativeUrl;
            }
        }
        $subUrls[] = $subUrl;
    }
    return $subUrls;
}
511
/**
 * Indexing Single Record.
 *
 * The first field of the configured field list is used as title, all further fields are
 * concatenated (space separated) as content. The record is indexed "as a page" of the
 * configuration's pid, with the GET parameters from the configuration ("###UID###" is replaced
 * by the record uid).
 *
 * @param array $r Record to index
 * @param array $cfgRec Configuration Record
 * @param array $rl Rootline array to relate indexing to; looked up from the configuration's pid if not an array
 * @return void
 */
public function indexSingleRecord($r, $cfgRec, $rl = NULL) {
    // Init:
    $rl = is_array($rl) ? $rl : $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
    $fieldList = GeneralUtility::trimExplode(',', $cfgRec['fieldlist'], TRUE);
    // Not every table defines a language field:
    $languageField = isset($GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']['languageField'])
        ? $GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']['languageField']
        : '';
    $sys_language_uid = $languageField ? $r[$languageField] : 0;
    // (Re)-Indexing a row from a table:
    $indexerObj = GeneralUtility::makeInstance(\TYPO3\CMS\IndexedSearch\Indexer::class);
    parse_str(str_replace('###UID###', $r['uid'], $cfgRec['get_params']), $GETparams);
    $indexerObj->backend_initIndexer($cfgRec['pid'], 0, $sys_language_uid, '', $rl, $GETparams, (bool)$cfgRec['chashcalc']);
    $indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
    $indexerObj->forceIndexing = TRUE;
    // Both are initialised so that an empty field list yields an empty title instead of an
    // undefined variable.
    $theTitle = '';
    $theContent = '';
    foreach ($fieldList as $k => $v) {
        // A configured field that is missing in the record counts as empty.
        $fieldValue = isset($r[$v]) ? $r[$v] : '';
        if (!$k) {
            $theTitle = $fieldValue;
        } else {
            $theContent .= $fieldValue . ' ';
        }
    }
    // Indexing the record as a page (but with parameters set, see ->backend_setFreeIndexUid())
    // A space is put in front of every "<" so words do not get glued together when the tags are stripped.
    $indexerObj->backend_indexAsTYPO3Page(
        strip_tags(str_replace('<', ' <', $theTitle)),
        '',
        '',
        strip_tags(str_replace('<', ' <', $theContent)),
        $GLOBALS['LANG']->charSet,
        $r[$GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']['tstamp']],
        $r[$GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']['crdate']],
        $r['uid']
    );
}
543
/**
 * Get rootline for closest TypoScript template root.
 * Algorithm same as used in Web > Template, Object browser
 *
 * @param int $id The page id to traverse rootline back from
 * @return array Array where the root lines uid values are found.
 */
public function getUidRootLineForClosestTemplate($id) {
    $templateService = GeneralUtility::makeInstance(\TYPO3\CMS\Core\TypoScript\ExtendedTemplateService::class);
    // Do not log time-performance information
    $templateService->tt_track = 0;
    $templateService->init();
    // Gets the rootLine of the page
    $pageRepository = GeneralUtility::makeInstance(\TYPO3\CMS\Frontend\Page\PageRepository::class);
    $pageRootLine = $pageRepository->getRootLine($id);
    // This generates the constants/config + hierarchy info for the template.
    $templateService->runThroughTemplates($pageRootLine, 0);
    // Reduce the template service's root line to the page uids, keeping its keys
    $uidsByLevel = array();
    foreach ($templateService->rootLine as $level => $pageRecord) {
        $uidsByLevel[$level] = $pageRecord['uid'];
    }
    return $uidsByLevel;
}
568
/**
 * Generate the unix time stamp for next visit.
 *
 * The result is the configured offset on a reference midnight plus the smallest number of
 * whole frequency blocks needed to reach the current execution time.
 *
 * @param array $cfgRec Index configuration record
 * @return int The next time stamp
 */
public function generateNextIndexingTime($cfgRec) {
    $currentTime = $GLOBALS['EXEC_TIME'];
    // Now, find a midnight time to use for offset calculation. This has to differ depending on whether we have
    // frequencies within a day or more than a day; Less than a day, we don't care which day to use for offset,
    // more than a day we want to respect the currently entered day as offset regardless of when the script is
    // run - thus the day-of-week used in case "Weekly" is selected will be respected
    if ($cfgRec['timer_frequency'] <= 24 * 3600) {
        $aMidNight = mktime(0, 0, 0) - 1 * 24 * 3600;
    } else {
        $lastTime = (int)($cfgRec['timer_next_indexing'] ?: $GLOBALS['EXEC_TIME']);
        // Four-digit year ("Y"): a two-digit year would make mktime() guess the century.
        $aMidNight = mktime(0, 0, 0, (int)date('n', $lastTime), (int)date('j', $lastTime), (int)date('Y', $lastTime));
    }
    // Find last offset time plus frequency in seconds:
    $lastSureOffset = $aMidNight + \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($cfgRec['timer_offset'], 0, 86400);
    $frequencySeconds = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($cfgRec['timer_frequency'], 1);
    // Now, find out how many blocks of the length of frequency there is until the next time:
    $frequencyBlocksUntilNextTime = ceil(($currentTime - $lastSureOffset) / $frequencySeconds);
    // Set next time to the offset + the frequencyblocks multiplied with the frequency length in seconds.
    // ceil() yields a float; cast so the documented int is returned.
    return (int)($lastSureOffset + $frequencyBlocksUntilNextTime * $frequencySeconds);
}
593
/**
 * Checks if $url has any of the URLs in the $url_deny "list" in it and if so, returns TRUE.
 *
 * @param string $url URL to test
 * @param string $url_deny String where URLs are separated by line-breaks; If any of these strings is the first part of $url, the function returns TRUE (to indicate denial of descend)
 * @return bool TRUE if there is a matching URL (hence, do not index!)
 */
public function checkDeniedSuburls($url, $url_deny) {
    if (!trim($url_deny)) {
        return FALSE;
    }
    $url_denyArray = GeneralUtility::trimExplode(LF, $url_deny, TRUE);
    foreach ($url_denyArray as $testurl) {
        if (GeneralUtility::isFirstPartOfStr($url, $testurl)) {
            // The debug "echo" that used to sit here was removed: a check method must not write to the output.
            return TRUE;
        }
    }
    return FALSE;
}
613
/**
 * Adding entry in queue for Hook.
 * Uses the parent object stored in $this->pObj (assigned by crawler_execute() before a custom hook is called).
 *
 * @param array $cfgRec Configuration record
 * @param string $title Title/URL
 * @return void
 */
public function addQueueEntryForHook($cfgRec, $title) {
    $configurationUid = $cfgRec['uid'];
    $queueParameters = array();
    // This must ALWAYS be the cfgRec uid!
    $queueParameters['indexConfigUid'] = $configurationUid;
    $queueParameters['url'] = $title;
    $queueParameters['procInstructions'] = array('[Index Cfg UID#' . $configurationUid . ']');
    $this->pObj->addQueueEntry_callBack($cfgRec['set_id'], $queueParameters, $this->callBack, $cfgRec['pid']);
}
630
/**
 * Deletes all data stored by indexed search for a given page.
 *
 * The phash values registered for the page in "index_section" are collected and every row
 * carrying one of them is removed from all index tables.
 *
 * @param int $id Uid of the page to delete all pHash
 * @return void
 */
public function deleteFromIndex($id) {
    // Lookup old phash rows:
    $oldPhashRows = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('phash', 'index_section', 'page_id=' . (int)$id);
    // Nothing indexed for this page; also covers a non-array (NULL) result on SQL errors.
    if (empty($oldPhashRows) || !is_array($oldPhashRows)) {
        return;
    }
    $pHashesToDelete = array();
    foreach ($oldPhashRows as $pHashRow) {
        $pHashesToDelete[] = $pHashRow['phash'];
    }
    $where_clause = 'phash IN (' . implode(',', $GLOBALS['TYPO3_DB']->cleanIntArray($pHashesToDelete)) . ')';
    $tables = explode(',', 'index_debug,index_fulltext,index_grlist,index_phash,index_rel,index_section');
    foreach ($tables as $table) {
        $GLOBALS['TYPO3_DB']->exec_DELETEquery($table, $where_clause);
    }
}
652
653 /*************************
654 *
655 * Hook functions for TCEmain (indexing of records)
656 *
657 *************************/
/**
 * TCEmain hook function, called before a command is processed.
 * Removes the index data of a page that is about to be deleted.
 *
 * @param string $command TCEmain command
 * @param string $table Table name
 * @param string $id Record ID. If new record its a string pointing to index inside \TYPO3\CMS\Core\DataHandling\DataHandler::substNEWwithIDs
 * @param mixed $value Target value (ignored)
 * @param object $pObj tcemain calling object (not used by this method)
 * @return void
 */
public function processCmdmap_preProcess($command, $table, $id, $value, $pObj) {
    $isPageDeletion = $command == 'delete' && $table == 'pages';
    if (!$isPageDeletion) {
        return;
    }
    // Clean up the index
    $this->deleteFromIndex($id);
}
674
/**
 * TCEmain hook function for on-the-fly indexing of database records.
 *
 * For a page that was updated to be hidden or excluded from search, its index data is removed.
 * Afterwards the record is (re-)indexed with every matching, not running "records" indexing
 * configuration that has "index on change" enabled.
 *
 * @param string $status Status "new" or "update"
 * @param string $table Table name
 * @param string $id Record ID. If new record its a string pointing to index inside \TYPO3\CMS\Core\DataHandling\DataHandler::substNEWwithIDs
 * @param array $fieldArray Field array of updated fields in the operation
 * @param object $pObj tcemain calling object; its substNEWwithIDs map is read to translate the id of new records (presumably a \TYPO3\CMS\Core\DataHandling\DataHandler - TODO confirm)
 * @return void
 */
public function processDatamap_afterDatabaseOperations($status, $table, $id, $fieldArray, $pObj) {
    // Check if any fields are actually updated:
    if (!count($fieldArray)) {
        return;
    }
    if ($status == 'new') {
        // Translate new ids.
        $id = $pObj->substNEWwithIDs[$id];
    } elseif ($table == 'pages' && $status == 'update'
        && (
            (array_key_exists('hidden', $fieldArray) && $fieldArray['hidden'] == 1)
            || (array_key_exists('no_search', $fieldArray) && $fieldArray['no_search'] == 1)
        )
    ) {
        // If the page should be hidden or not indexed after update, delete index for this page
        $this->deleteFromIndex($id);
    }
    // Get full record and if exists, search for indexing configurations:
    $currentRecord = BackendUtility::getRecord($table, $id);
    if (!is_array($currentRecord)) {
        return;
    }
    // Select all (not running) indexing configurations of type "record" (1) and which points to this table and
    // is located on the same page as the record or pointing to the right source PID
    $recordPid = (int)$currentRecord['pid'];
    $where = implode(' AND ', array(
        'hidden=0',
        '(starttime=0 OR starttime<=' . (int)$GLOBALS['EXEC_TIME'] . ')',
        'set_id=0',
        'type=1',
        'table2index=' . $GLOBALS['TYPO3_DB']->fullQuoteStr($table, 'index_config'),
        '((alternative_source_pid=0 AND pid=' . $recordPid . ') OR (alternative_source_pid=' . $recordPid . '))',
        'records_indexonchange=1'
    )) . BackendUtility::deleteClause('index_config');
    $indexingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('*', 'index_config', $where);
    // A non-array (NULL) result on SQL errors is treated like "no configuration found".
    if (!is_array($indexingConfigurations)) {
        return;
    }
    foreach ($indexingConfigurations as $cfgRec) {
        $this->indexSingleRecord($currentRecord, $cfgRec);
    }
}
716
717 }