3dd1809a47d6063d95d7440b813a6afbe149c55a
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / Classes / Hook / CrawlerHook.php
1 <?php
2 namespace TYPO3\CMS\IndexedSearch\Hook;
3
4 /**
5 * Crawler hook for indexed search. Works with the "crawler" extension
6 *
7 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
8 */
9 class CrawlerHook {
10
/**
 * Static: Number of seconds to use as interval between queued indexing operations of URLs / files (types 2 & 3).
 * The value is multiplied with $instanceCounter to spread the scheduled execution time of queue entries.
 *
 * @todo Define visibility
 */
public $secondsPerExternalUrl = 3;

/**
 * Internal, dynamic: Counts up for each queue entry added for a file, URL or sub page,
 * so every further entry is scheduled $secondsPerExternalUrl seconds later than the previous one.
 *
 * @todo Define visibility
 */
public $instanceCounter = 0;

/**
 * Internal, static: The object reference to this class. It is handed to the crawler as call back
 * with every queue entry this hook adds.
 *
 * @todo Define visibility
 */
public $callBack = 'EXT:indexed_search/class.crawler.php:&TYPO3\\CMS\\IndexedSearch\\Controller\\SearchFormController_crawler';

/**
 * Initialization of crawler hook.
 *
 * This function is asked for each instance of the crawler. It selects the indexing
 * configurations which are due (not hidden, start time reached, timer expired and not
 * currently running), marks each of them as running by storing a fresh set-ID together
 * with the next indexing time, and puts the first entry for it into the crawler queue.
 * Finally, running configurations without pending queue entries are closed.
 *
 * @param object $pObj Parent object (tx_crawler lib)
 * @return void
 * @todo Define visibility
 */
public function crawler_init(&$pObj) {
	// Select all indexing configurations which are waiting to be activated:
	$indexingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('*', 'index_config', 'hidden=0
		AND (starttime=0 OR starttime<=' . $GLOBALS['EXEC_TIME'] . ')
		AND timer_next_indexing<' . $GLOBALS['EXEC_TIME'] . '
		AND set_id=0
		' . \TYPO3\CMS\Backend\Utility\BackendUtility::deleteClause('index_config'));
	// Guard against a non-array result (e.g. from a failed query) before iterating:
	if (is_array($indexingConfigurations)) {
		// For each configuration, check if it should be executed and if so, start:
		foreach ($indexingConfigurations as $cfgRec) {
			// Generate a unique set-ID:
			$setId = \TYPO3\CMS\Core\Utility\GeneralUtility::md5int(microtime());
			// Get next time:
			$nextTime = $this->generateNextIndexingTime($cfgRec);
			// Start process by updating index-config record:
			$field_array = array(
				'set_id' => $setId,
				'timer_next_indexing' => $nextTime,
				'session_data' => ''
			);
			$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_config', 'uid=' . intval($cfgRec['uid']), $field_array);
			// Build the parameters of the first queue entry based on configuration type.
			// $params stays NULL for types which must not create a queue entry.
			$params = NULL;
			$procInstructions = array('[Index Cfg UID#' . $cfgRec['uid'] . ']');
			switch ($cfgRec['type']) {
				case 1:
					// RECORDS:
					$params = array(
						'indexConfigUid' => $cfgRec['uid'],
						'procInstructions' => $procInstructions,
						'url' => 'Records (start)'
					);
					break;
				case 2:
					// FILES:
					$params = array(
						'indexConfigUid' => $cfgRec['uid'],
						'procInstructions' => $procInstructions,
						'url' => $cfgRec['filepath'],
						'depth' => 0
					);
					break;
				case 3:
					// External URL:
					$params = array(
						'indexConfigUid' => $cfgRec['uid'],
						'procInstructions' => $procInstructions,
						'url' => $cfgRec['externalUrl'],
						'depth' => 0
					);
					break;
				case 4:
					// Page tree (the "url" is the uid of the root page):
					$params = array(
						'indexConfigUid' => $cfgRec['uid'],
						'procInstructions' => $procInstructions,
						'url' => intval($cfgRec['alternative_source_pid']),
						'depth' => 0
					);
					break;
				case 5:
					// Meta configuration, nothing to do:
					break;
				default:
					// Custom type: delegate to a hook object registered for this type, if any.
					if (!empty($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']])) {
						$hookObj = \TYPO3\CMS\Core\Utility\GeneralUtility::getUserObj($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']]);
						if (is_object($hookObj)) {
							// $message used to be passed without ever being defined. It is initialized
							// here so the call no longer raises a notice; the hook still receives NULL.
							$message = NULL;
							$params = array(
								'indexConfigUid' => $cfgRec['uid'],
								'procInstructions' => array('[Index Cfg UID#' . $cfgRec['uid'] . '/CUSTOM]'),
								'url' => $hookObj->initMessage($message)
							);
						}
					}
					break;
			}
			if ($params !== NULL) {
				$pObj->addQueueEntry_callBack($setId, $params, $this->callBack, $cfgRec['pid']);
			}
		}
	}
	// Finally, look up all old index configurations which are finished and needs to be reset and done.
	$this->cleanUpOldRunningConfigurations();
}
141
/**
 * Call back function for execution of a log element.
 *
 * Loads the indexing configuration referenced by $params['indexConfigUid'], hands the
 * work over to the handler of the configuration type (or to a registered hook object for
 * custom types) and afterwards writes the possibly modified session data back to the
 * configuration record.
 *
 * @param array $params Params from log element. Must contain $params['indexConfigUid']
 * @param object $pObj Parent object (tx_crawler lib)
 * @return array Result array: the params under the key "log"
 * @todo Define visibility
 */
public function crawler_execute($params, &$pObj) {
	// Without an indexer configuration ID there is nothing to execute:
	if (!$params['indexConfigUid']) {
		return array('log' => $params);
	}
	// Fetch the indexing configuration record:
	$cfgRec = $GLOBALS['TYPO3_DB']->exec_SELECTgetSingleRow('*', 'index_config', 'uid=' . intval($params['indexConfigUid']));
	if (!is_array($cfgRec)) {
		return array('log' => $params);
	}
	// Session data is stored serialized in the configuration record:
	$session_data = unserialize($cfgRec['session_data']);
	// Dispatch on the configuration type:
	switch ($cfgRec['type']) {
		case 1:
			// Records
			$this->crawler_execute_type1($cfgRec, $session_data, $params, $pObj);
			break;
		case 2:
			// Files
			$this->crawler_execute_type2($cfgRec, $session_data, $params, $pObj);
			break;
		case 3:
			// External URL
			$this->crawler_execute_type3($cfgRec, $session_data, $params, $pObj);
			break;
		case 4:
			// Page tree
			$this->crawler_execute_type4($cfgRec, $session_data, $params, $pObj);
			break;
		case 5:
			// Meta configuration: no operation (should never enter here!)
			break;
		default:
			// Custom type: look for a hook object registered for it
			$hookReference = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']];
			if ($hookReference) {
				$hookObj = \TYPO3\CMS\Core\Utility\GeneralUtility::getUserObj($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']]);
				if (is_object($hookObj)) {
					// Keep the crawler object around; addQueueEntryForHook() needs it
					$this->pObj = $pObj;
					$hookObj->indexOperation($cfgRec, $session_data, $params, $this);
				}
			}
			break;
	}
	// Persist the session data, the handlers may have changed it:
	$GLOBALS['TYPO3_DB']->exec_UPDATEquery(
		'index_config',
		'uid=' . intval($cfgRec['uid']),
		array('session_data' => serialize($session_data))
	);
	return array('log' => $params);
}
200
/**
 * Indexing records from a table.
 *
 * Indexes one batch of records (ordered by uid, starting after the uid remembered in the
 * session data) and, if the batch contained records, queues a follow-up entry for the
 * next batch.
 *
 * @param array $cfgRec Indexing Configuration Record
 * @param array $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param array $params Parameters from the log queue.
 * @param object $pObj Parent object (from "crawler" extension!)
 * @return void
 * @todo Define visibility
 */
public function crawler_execute_type1($cfgRec, &$session_data, $params, &$pObj) {
	$tableName = $cfgRec['table2index'];
	// Only tables known to TCA can be indexed:
	if (!($tableName && isset($GLOBALS['TCA'][$tableName]))) {
		return;
	}
	// First run of this session: start before the first possible uid
	if (!is_array($session_data)) {
		$session_data = array('uid' => 0);
	}
	// Source page: the alternative source pid wins over the pid of the configuration
	$alternativePid = intval($cfgRec['alternative_source_pid']);
	$pid = $alternativePid ? $alternativePid : $cfgRec['pid'];
	// Batch size, 100 unless configured:
	$batchSize = 100;
	if ($cfgRec['recordsbatch']) {
		$batchSize = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($cfgRec['recordsbatch'], 1);
	}
	// Root line the indexed records are related to:
	$rootLine = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
	// Fetch the next batch of records:
	$where = 'pid = ' . intval($pid) . '
		AND uid > ' . intval($session_data['uid'])
		. \TYPO3\CMS\Backend\Utility\BackendUtility::deleteClause($tableName)
		. \TYPO3\CMS\Backend\Utility\BackendUtility::BEenableFields($tableName);
	$records = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('*', $tableName, $where, '', 'uid', $batchSize);
	if (!count($records)) {
		return;
	}
	$lastUid = NULL;
	foreach ($records as $record) {
		$this->indexSingleRecord($record, $cfgRec, $rootLine);
		// Remember how far we got, for the next batch:
		$lastUid = $record['uid'];
		$session_data['uid'] = $lastUid;
	}
	// Queue the entry which will index the following batch:
	$nparams = array(
		'indexConfigUid' => $cfgRec['uid'],
		'url' => 'Records from UID#' . ($lastUid + 1) . '-?',
		'procInstructions' => array('[Index Cfg UID#' . $cfgRec['uid'] . ']')
	);
	$pObj->addQueueEntry_callBack($cfgRec['set_id'], $nparams, $this->callBack, $cfgRec['pid']);
}
245
/**
 * Indexing files from fileadmin.
 *
 * If $params['url'] points to a file, the file is indexed. If it points to a directory,
 * a queue entry is added for every matching file in it and - as long as the configured
 * depth is not reached - for every sub directory.
 *
 * @param array $cfgRec Indexing Configuration Record
 * @param array $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param array $params Parameters from the log queue.
 * @param object $pObj Parent object (from "crawler" extension!)
 * @return void
 * @todo Define visibility
 */
public function crawler_execute_type2($cfgRec, &$session_data, $params, &$pObj) {
	// Prepare path, making it absolute and checking:
	$readpath = $params['url'];
	if (!\TYPO3\CMS\Core\Utility\GeneralUtility::isAbsPath($readpath)) {
		$readpath = \TYPO3\CMS\Core\Utility\GeneralUtility::getFileAbsFileName($readpath);
	}
	if (!\TYPO3\CMS\Core\Utility\GeneralUtility::isAllowedAbsPath($readpath)) {
		return;
	}
	if (@is_file($readpath)) {
		// If file, index it!
		// Get root line (need to provide this when indexing external files)
		$rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
		// Load indexer if not yet.
		$this->loadIndexerClass();
		// (Re)-Indexing file on page.
		$indexerObj = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('tx_indexedsearch_indexer');
		$indexerObj->backend_initIndexer($cfgRec['pid'], 0, 0, '', $rl);
		$indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
		// EXPERIMENT - but to avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)
		$indexerObj->hash['phash'] = -1;
		// Index document (path relative to PATH_site):
		$indexerObj->indexRegularDocument(substr($readpath, strlen(PATH_site)), TRUE);
	} elseif (@is_dir($readpath)) {
		// If dir, read content and create new pending items for log:
		// Select files and directories in path:
		$extList = implode(',', \TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode(',', $cfgRec['extensions'], 1));
		$fileArr = array();
		$files = \TYPO3\CMS\Core\Utility\GeneralUtility::getAllFilesAndFoldersInPath($fileArr, $readpath, $extList, 0, 0);
		$directoryList = \TYPO3\CMS\Core\Utility\GeneralUtility::get_dirs($readpath);
		if (is_array($directoryList) && $params['depth'] < $cfgRec['depth']) {
			// Make sure exactly one slash separates the directory from its sub directory.
			// Without this, a configured path lacking the trailing slash ("fileadmin/docs")
			// produced broken entries like "fileadmin/docssub/".
			$directoryPath = rtrim($readpath, '/') . '/';
			foreach ($directoryList as $subdir) {
				if ((string) $subdir != '') {
					$files[] = $directoryPath . $subdir . '/';
				}
			}
		}
		$files = \TYPO3\CMS\Core\Utility\GeneralUtility::removePrefixPathFromList($files, PATH_site);
		// Traverse the items and create log entries:
		foreach ($files as $path) {
			$this->instanceCounter++;
			// Never queue the directory we are currently reading again:
			if ($path !== $params['url']) {
				$nparams = array(
					'indexConfigUid' => $cfgRec['uid'],
					'url' => $path,
					'procInstructions' => array('[Index Cfg UID#' . $cfgRec['uid'] . ']'),
					'depth' => $params['depth'] + 1
				);
				// Spread the entries over time, $secondsPerExternalUrl seconds apart:
				$pObj->addQueueEntry_callBack($cfgRec['set_id'], $nparams, $this->callBack, $cfgRec['pid'], $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl);
			}
		}
	}
}
309
/**
 * Indexing External URLs.
 *
 * Indexes the URL given in $params['url'] and, while the configured depth is not reached,
 * queues every link found on it which passes checkUrl() and is not denied by
 * checkDeniedSuburls().
 *
 * @param array $cfgRec Indexing Configuration Record
 * @param array $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param array $params Parameters from the log queue.
 * @param object $pObj Parent object (from "crawler" extension!)
 * @return void
 * @todo Define visibility
 */
public function crawler_execute_type3($cfgRec, &$session_data, $params, &$pObj) {
	// First call of the session: the URL log starts with the current URL
	if (!is_array($session_data)) {
		$session_data = array(
			'urlLog' => array($params['url'])
		);
	}
	// Index the URL itself and collect the links found on it:
	$rootLine = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
	$foundUrls = $this->indexExtUrl($params['url'], $cfgRec['pid'], $rootLine, $cfgRec['uid'], $cfgRec['set_id']);
	// Stop descending once the configured depth is reached:
	if (!($params['depth'] < $cfgRec['depth'])) {
		return;
	}
	foreach ($foundUrls as $candidate) {
		$url = $this->checkUrl($candidate, $session_data['urlLog'], $cfgRec['externalUrl']);
		if (!$url) {
			continue;
		}
		if ($this->checkDeniedSuburls($url, $cfgRec['url_deny'])) {
			continue;
		}
		$this->instanceCounter++;
		// Log the URL so it is not queued a second time:
		$session_data['urlLog'][] = $url;
		$nparams = array(
			'indexConfigUid' => $cfgRec['uid'],
			'url' => $url,
			'procInstructions' => array('[Index Cfg UID#' . $cfgRec['uid'] . ']'),
			'depth' => $params['depth'] + 1
		);
		// Each entry is scheduled $secondsPerExternalUrl seconds after the previous one:
		$scheduledTime = $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl;
		$pObj->addQueueEntry_callBack($cfgRec['set_id'], $nparams, $this->callBack, $cfgRec['pid'], $scheduledTime);
	}
}
350
/**
 * Page tree indexing type.
 *
 * Submits the URLs of the page whose uid is given in $params['url'] to the crawler and,
 * while the configured depth is not reached, queues an entry for each of its sub pages.
 *
 * @param array $cfgRec Indexing Configuration Record
 * @param array $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param array $params Parameters from the log queue.
 * @param object $pObj Parent object (from "crawler" extension!)
 * @return void
 * @todo Define visibility
 */
public function crawler_execute_type4($cfgRec, &$session_data, $params, &$pObj) {
	// The "url" of a page tree entry is the uid of the page:
	$pageUid = intval($params['url']);
	$pageRow = \TYPO3\CMS\Backend\Utility\BackendUtility::getRecord('pages', $pageUid);
	// Let the crawler work out the URLs of the page ...
	$urlSets = $pObj->getUrlsForPageRow($pageRow);
	// Registry for duplicates
	$duplicateTrack = array();
	// Dummy.
	$downloadUrls = array();
	// ... and submit them:
	if (count($urlSets)) {
		foreach ($urlSets as $urlSet) {
			$pObj->urlListFromUrlArray($urlSet, $pageRow, $GLOBALS['EXEC_TIME'], 30, 1, 0, $duplicateTrack, $downloadUrls, array('tx_indexedsearch_reindex'));
		}
	}
	// Stop descending once the configured depth is reached:
	if (!($params['depth'] < $cfgRec['depth'])) {
		return;
	}
	// Look up the sub pages:
	$subPages = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('uid,title', 'pages', 'pid = ' . intval($pageUid) . \TYPO3\CMS\Backend\Utility\BackendUtility::deleteClause('pages'));
	if (!count($subPages)) {
		return;
	}
	// Queue one entry per sub page:
	foreach ($subPages as $subPage) {
		$this->instanceCounter++;
		$session_data['urlLog'][] = 'pages:' . $subPage['uid'] . ': ' . $subPage['title'];
		$nparams = array(
			'indexConfigUid' => $cfgRec['uid'],
			'url' => $subPage['uid'],
			'procInstructions' => array('[Index Cfg UID#' . $cfgRec['uid'] . ']'),
			'depth' => $params['depth'] + 1
		);
		// Each entry is scheduled $secondsPerExternalUrl seconds after the previous one:
		$scheduledTime = $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl;
		$pObj->addQueueEntry_callBack($cfgRec['set_id'], $nparams, $this->callBack, $cfgRec['pid'], $scheduledTime);
	}
}
399
/**
 * Look up all old index configurations which are finished and needs to be reset and done.
 *
 * A running configuration (set_id <> 0) counts as finished when no crawler queue entry of
 * its set is waiting for execution any more. For such a configuration the index entries
 * left over from earlier sets are removed and the configuration is marked as not running.
 *
 * @return void
 * @todo Define visibility
 */
public function cleanUpOldRunningConfigurations() {
	// Tables holding index data per phash (list copied from class.tx_indexedsearch_modfunc1.php)
	$tableArr = array('index_phash', 'index_rel', 'index_section', 'index_grlist', 'index_fulltext', 'index_debug');
	// Lookup running index configurations:
	$runningIndexingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('uid,set_id', 'index_config', 'set_id<>0' . \TYPO3\CMS\Backend\Utility\BackendUtility::deleteClause('index_config'));
	// Guard against a non-array result (e.g. from a failed query) before iterating:
	if (!is_array($runningIndexingConfigurations)) {
		return;
	}
	// For each running configuration, look up how many log entries there are which are scheduled for execution and if none, clear the "set_id" (means; Processing was DONE)
	foreach ($runningIndexingConfigurations as $cfgRec) {
		// Look for ended processes:
		$queued_items = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('*', 'tx_crawler_queue', 'set_id=' . intval($cfgRec['set_id']) . ' AND exec_time=0');
		if ($queued_items) {
			// Still work to do for this set
			continue;
		}
		// Lookup old phash rows; set_id is cast like every other value put into the queries here:
		$oldPhashRows = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('phash', 'index_phash', 'freeIndexUid=' . intval($cfgRec['uid']) . ' AND freeIndexSetId<>' . intval($cfgRec['set_id']));
		if (is_array($oldPhashRows)) {
			foreach ($oldPhashRows as $pHashRow) {
				// Removing old registrations for all tables
				foreach ($tableArr as $table) {
					$GLOBALS['TYPO3_DB']->exec_DELETEquery($table, 'phash=' . intval($pHashRow['phash']));
				}
			}
		}
		// End process by updating index-config record:
		$field_array = array(
			'set_id' => 0,
			'session_data' => ''
		);
		$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_config', 'uid=' . intval($cfgRec['uid']), $field_array);
	}
}
432
433 /*****************************************
434 *
435 * Helper functions
436 *
437 *****************************************/
/**
 * Check if an input URL is allowed to be indexed.
 *
 * The URL is normalized first (a trailing double slash is reduced to a single one and a
 * fragment is cut off). It is accepted if it contains no "../", starts with the base URL
 * and is not present in the URL log yet.
 *
 * @param string $url URL string to check
 * @param array $urlLog Array of already indexed URLs (input url is looked up here and must not exist already)
 * @param string $baseUrl Base URL of the indexing process (input URL must be "inside" the base URL!)
 * @return string Returns the (normalized) URL if OK, otherwise FALSE
 * @todo Define visibility
 */
public function checkUrl($url, $urlLog, $baseUrl) {
	$url = preg_replace('/\\/\\/$/', '/', $url);
	list($url) = explode('#', $url);
	if (
		strpos($url, '../') === FALSE
		&& \TYPO3\CMS\Core\Utility\GeneralUtility::isFirstPartOfStr($url, $baseUrl)
		&& !in_array($url, $urlLog, TRUE)
	) {
		return $url;
	}
	// Explicitly return FALSE as documented (the method used to fall off the end and return NULL).
	return FALSE;
}
458
/**
 * Indexing External URL.
 *
 * Indexes the document at $url and returns the links found in it. Links without a scheme
 * are made absolute: links starting with "/" relative to scheme and host of $url, all
 * others relative to the base href of the document (or, if it has none, to the directory
 * part of $url).
 *
 * @param string $url URL, http://....
 * @param integer $pageId Page id to relate indexing to.
 * @param array $rl Rootline array to relate indexing to
 * @param integer $cfgUid Configuration UID
 * @param integer $setId Set ID value
 * @return array URLs found on this page
 * @todo Define visibility
 */
public function indexExtUrl($url, $pageId, $rl, $cfgUid, $setId) {
	// Load indexer if not yet.
	$this->loadIndexerClass();
	// Index external URL:
	$indexerObj = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('tx_indexedsearch_indexer');
	$indexerObj->backend_initIndexer($pageId, 0, 0, '', $rl);
	$indexerObj->backend_setFreeIndexUid($cfgUid, $setId);
	// To avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)
	$indexerObj->hash['phash'] = -1;
	$indexerObj->indexExternalUrl($url);
	// parse_url() only returns the components present in the URL (or FALSE for seriously
	// malformed input), so every component is read with a fallback to an empty string.
	$url_qParts = parse_url($url);
	$scheme = isset($url_qParts['scheme']) ? $url_qParts['scheme'] : '';
	$host = isset($url_qParts['host']) ? $url_qParts['host'] : '';
	$path = isset($url_qParts['path']) ? $url_qParts['path'] : '';
	$baseAbsoluteHref = $scheme . '://' . $host;
	$baseHref = $indexerObj->extractBaseHref($indexerObj->indexExternalUrl_content);
	if (!$baseHref) {
		// Extract base href from current URL: scheme and host plus the directory part of the path
		$baseHref = $baseAbsoluteHref;
		$lastSlashPosition = strrpos($path, '/');
		if ($lastSlashPosition !== FALSE) {
			$baseHref .= substr($path, 0, $lastSlashPosition);
		}
	}
	$baseHref = rtrim($baseHref, '/');
	// Get URLs on this page:
	$subUrls = array();
	$list = $indexerObj->extractHyperLinks($indexerObj->indexExternalUrl_content);
	// Traverse links:
	foreach ($list as $linkInfo) {
		// Decode entities:
		$subUrl = \TYPO3\CMS\Core\Utility\GeneralUtility::htmlspecialchars_decode($linkInfo['href']);
		$qParts = parse_url($subUrl);
		// empty() covers both a missing "scheme" component and a failed parse without raising a notice
		if (empty($qParts['scheme'])) {
			$relativeUrl = \TYPO3\CMS\Core\Utility\GeneralUtility::resolveBackPath($subUrl);
			// substr() instead of $relativeUrl[0]: an empty href must not trigger a string offset notice
			if (substr($relativeUrl, 0, 1) === '/') {
				$subUrl = $baseAbsoluteHref . $relativeUrl;
			} else {
				$subUrl = $baseHref . '/' . $relativeUrl;
			}
		}
		$subUrls[] = $subUrl;
	}
	return $subUrls;
}
509
/**
 * Indexing Single Record.
 *
 * The record is indexed as if it was a TYPO3 page: the first field of the configured
 * field list provides the title, the remaining fields are concatenated to the content.
 *
 * @param array $r Record to index
 * @param array $cfgRec Configuration Record
 * @param array $rl Rootline array to relate indexing to. If not an array, the rootline is looked up from the pid of the configuration record.
 * @return void
 * @todo Define visibility
 */
public function indexSingleRecord($r, $cfgRec, $rl = NULL) {
	// Load indexer if not yet.
	$this->loadIndexerClass();
	// Init:
	$rl = is_array($rl) ? $rl : $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
	$fieldList = \TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode(',', $cfgRec['fieldlist'], 1);
	$tableControl = $GLOBALS['TCA'][$cfgRec['table2index']]['ctrl'];
	$languageField = $tableControl['languageField'];
	$sys_language_uid = $languageField ? $r[$languageField] : 0;
	// (Re)-Indexing a row from a table:
	$indexerObj = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('tx_indexedsearch_indexer');
	// The GET parameters of the configuration may refer to the record by the ###UID### marker:
	parse_str(str_replace('###UID###', $r['uid'], $cfgRec['get_params']), $GETparams);
	$indexerObj->backend_initIndexer($cfgRec['pid'], 0, $sys_language_uid, '', $rl, $GETparams, $cfgRec['chashcalc'] ? TRUE : FALSE);
	$indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
	$indexerObj->forceIndexing = TRUE;
	// Both are initialized so an empty field list does not leave the title undefined:
	$theTitle = '';
	$theContent = '';
	foreach ($fieldList as $k => $v) {
		if (!$k) {
			// First field is the title ...
			$theTitle = $r[$v];
		} else {
			// ... all further fields make up the content
			$theContent .= $r[$v] . ' ';
		}
	}
	// Indexing the record as a page (but with parameters set, see ->backend_setFreeIndexUid())
	// A space is put in front of every tag before stripping, so words separated by tags only are not glued together.
	$indexerObj->backend_indexAsTYPO3Page(
		strip_tags(str_replace('<', ' <', $theTitle)),
		'',
		'',
		strip_tags(str_replace('<', ' <', $theContent)),
		$GLOBALS['LANG']->charSet,
		$r[$tableControl['tstamp']],
		$r[$tableControl['crdate']],
		$r['uid']
	);
}
544
/**
 * Makes sure the indexer class file of the indexed_search extension is loaded.
 *
 * @return void
 * @todo Define visibility
 */
public function loadIndexerClass() {
	// NOTE(review): the required file is executed in the scope of this method, so this
	// declaration is what makes $TYPO3_CONF_VARS visible to it. Presumably the file relies
	// on that - confirm before removing.
	global $TYPO3_CONF_VARS;
	$extensionPath = \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::extPath('indexed_search');
	require_once $extensionPath . 'class.indexer.php';
}
555
/**
 * Get rootline for closest TypoScript template root.
 * Algorithm same as used in Web > Template, Object browser
 *
 * @param integer $id The page id to traverse rootline back from
 * @return array Array with the uid of every root line entry, keyed like the root line of the template service
 * @todo Define visibility
 */
public function getUidRootLineForClosestTemplate($id) {
	global $TYPO3_CONF_VARS;
	// Template service, without logging of time-performance information:
	$templateService = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('TYPO3\\CMS\\Core\\TypoScript\\ExtendedTemplateService');
	$templateService->tt_track = 0;
	$templateService->init();
	// Root line of the page:
	$pageRepository = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('TYPO3\\CMS\\Frontend\\Page\\PageRepository');
	$pageRootLine = $pageRepository->getRootLine($id);
	// This generates the constants/config + hierarchy info for the template.
	$templateService->runThroughTemplates($pageRootLine, 0);
	// Reduce the root line of the template service to the uid values:
	$uids = array();
	foreach ($templateService->rootLine as $level => $pageData) {
		$uids[$level] = $pageData['uid'];
	}
	return $uids;
}
582
/**
 * Generate the unix time stamp for next visit.
 *
 * The next time is the first point of time on the grid "midnight + timer_offset +
 * n * timer_frequency" which is not before the current execution time.
 *
 * @param array $cfgRec Index configuration record
 * @return integer The next time stamp
 * @todo Define visibility
 */
public function generateNextIndexingTime($cfgRec) {
	$currentTime = $GLOBALS['EXEC_TIME'];
	// Now, find a midnight time to use for offset calculation. This has to differ depending on whether we have frequencies within a day or more than a day; Less than a day, we don't care which day to use for offset, more than a day we want to respect the currently entered day as offset regardless of when the script is run - thus the day-of-week used in case "Weekly" is selected will be respected
	if ($cfgRec['timer_frequency'] <= 24 * 3600) {
		// Midnight of yesterday
		$aMidNight = mktime(0, 0, 0) - 1 * 24 * 3600;
	} else {
		// Midnight of the day of the last scheduled indexing (or of today, if there was none)
		$lastTime = $cfgRec['timer_next_indexing'] ? $cfgRec['timer_next_indexing'] : $GLOBALS['EXEC_TIME'];
		// Four-digit year ("Y"): a two-digit year ("y") makes mktime() guess the century.
		$aMidNight = mktime(0, 0, 0, (int) date('m', $lastTime), (int) date('d', $lastTime), (int) date('Y', $lastTime));
	}
	// Find last offset time plus frequency in seconds:
	$lastSureOffset = $aMidNight + \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($cfgRec['timer_offset'], 0, 86400);
	// At least 1, which also rules out a division by zero below:
	$frequencySeconds = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($cfgRec['timer_frequency'], 1);
	// Now, find out how many blocks of the length of frequency there is until the next time:
	$frequencyBlocksUntilNextTime = ceil(($currentTime - $lastSureOffset) / $frequencySeconds);
	// Set next time to the offset + the frequencyblocks multiplied with the frequency length in seconds.
	// ceil() yields a float; cast so the documented integer is returned.
	$nextTime = (int) ($lastSureOffset + $frequencyBlocksUntilNextTime * $frequencySeconds);
	return $nextTime;
}
608
/**
 * Checks if $url starts with any of the URLs in the $url_deny "list" and if so, returns TRUE.
 *
 * @param string $url URL to test
 * @param string $url_deny String where URLs are separated by line-breaks; If any of these strings is the first part of $url, the function returns TRUE (to indicate denial of descend)
 * @return boolean TRUE if there is a matching URL (hence, do not index!)
 * @todo Define visibility
 */
public function checkDeniedSuburls($url, $url_deny) {
	if (trim($url_deny)) {
		$url_denyArray = \TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode(LF, $url_deny, 1);
		foreach ($url_denyArray as $testurl) {
			if (\TYPO3\CMS\Core\Utility\GeneralUtility::isFirstPartOfStr($url, $testurl)) {
				// A left-over debug "echo" of the URL and the deny list was removed here;
				// a pure check must not write to the output.
				return TRUE;
			}
		}
	}
	return FALSE;
}
629
/**
 * Adding entry in queue for Hook.
 *
 * Uses the crawler object stored in $this->pObj, which crawler_execute() sets before it
 * calls the hook object.
 *
 * @param array $cfgRec Configuration record
 * @param string $title Title/URL
 * @return void
 * @todo Define visibility
 */
public function addQueueEntryForHook($cfgRec, $title) {
	$configurationUid = $cfgRec['uid'];
	$queueParameters = array(
		// This must ALWAYS be the cfgRec uid!
		'indexConfigUid' => $configurationUid,
		'url' => $title,
		'procInstructions' => array('[Index Cfg UID#' . $configurationUid . ']')
	);
	$this->pObj->addQueueEntry_callBack($cfgRec['set_id'], $queueParameters, $this->callBack, $cfgRec['pid']);
}
647
/**
 * Deletes all data stored by indexed search for a given page.
 *
 * The phash values of the page are looked up in index_section and all rows with these
 * values are removed from the index tables.
 *
 * @param integer $id Uid of the page to delete all pHash
 * @return void
 * @todo Define visibility
 */
public function deleteFromIndex($id) {
	// Which phash values belong to the page?
	$sectionRows = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('phash', 'index_section', 'page_id=' . intval($id));
	if (!count($sectionRows)) {
		return;
	}
	$pHashes = array();
	foreach ($sectionRows as $sectionRow) {
		$pHashes[] = $sectionRow['phash'];
	}
	// One DELETE per index table, all with the same phash list:
	$pHashList = implode(',', $GLOBALS['TYPO3_DB']->cleanIntArray($pHashes));
	$where_clause = 'phash IN (' . $pHashList . ')';
	$indexTables = array('index_debug', 'index_fulltext', 'index_grlist', 'index_phash', 'index_rel', 'index_section');
	foreach ($indexTables as $indexTable) {
		$GLOBALS['TYPO3_DB']->exec_DELETEquery($indexTable, $where_clause);
	}
}
670
671 /*************************
672 *
673 * Hook functions for TCEmain (indexing of records)
674 *
675 *************************/
/**
 * TCEmain hook function called before a command is processed.
 * Removes the index data of a page which is about to be deleted.
 *
 * @param string $command TCEmain command
 * @param string $table Table name
 * @param string $id Record ID
 * @param mixed $value Target value (ignored)
 * @param object $pObj Reference to tcemain calling object
 * @return void
 * @todo Define visibility
 */
public function processCmdmap_preProcess($command, $table, $id, $value, $pObj) {
	// Only the deletion of pages is of interest here:
	if ($command != 'delete' || $table != 'pages') {
		return;
	}
	// Clean up the index
	$this->deleteFromIndex($id);
}
693
/**
 * TCEmain hook function for on-the-fly indexing of database records.
 *
 * Removes the index data of a page which was set to hidden or "no search", and re-indexes
 * the saved record for every matching indexing configuration of type "record" which has
 * indexing on change enabled.
 *
 * @param string $status Status "new" or "update"
 * @param string $table Table name
 * @param string $id Record ID. If new record its a string pointing to index inside t3lib_tcemain::substNEWwithIDs
 * @param array $fieldArray Field array of updated fields in the operation
 * @param object $pObj Reference to tcemain calling object
 * @return void
 * @todo Define visibility
 */
public function processDatamap_afterDatabaseOperations($status, $table, $id, $fieldArray, $pObj) {
	// No field was updated, so there is nothing to do:
	if (!count($fieldArray)) {
		return;
	}
	if ($status == 'new') {
		// Translate new ids.
		$id = $pObj->substNEWwithIDs[$id];
	} else {
		$setToHidden = array_key_exists('hidden', $fieldArray) && $fieldArray['hidden'] == 1;
		$setToNoSearch = array_key_exists('no_search', $fieldArray) && $fieldArray['no_search'] == 1;
		if ($table == 'pages' && $status == 'update' && ($setToHidden || $setToNoSearch)) {
			// If the page should be hidden or not indexed after update, delete index for this page
			$this->deleteFromIndex($id);
		}
	}
	// The full record is needed for indexing; stop if it does not exist:
	$currentRecord = \TYPO3\CMS\Backend\Utility\BackendUtility::getRecord($table, $id);
	if (!is_array($currentRecord)) {
		return;
	}
	$recordPid = intval($currentRecord['pid']);
	// Select all (not running) indexing configurations of type "record" (1) and which points to this table and is located on the same page as the record or pointing to the right source PID
	$indexingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('*', 'index_config', 'hidden=0
		AND (starttime=0 OR starttime<=' . $GLOBALS['EXEC_TIME'] . ')
		AND set_id=0
		AND type=1
		AND table2index=' . $GLOBALS['TYPO3_DB']->fullQuoteStr($table, 'index_config') . '
		AND (
			(alternative_source_pid=0 AND pid=' . $recordPid . ')
			OR (alternative_source_pid=' . $recordPid . ')
		)
		AND records_indexonchange=1
		' . \TYPO3\CMS\Backend\Utility\BackendUtility::deleteClause('index_config'));
	foreach ($indexingConfigurations as $indexingConfiguration) {
		$this->indexSingleRecord($currentRecord, $indexingConfiguration);
	}
}
736
737 }
738
739
740 ?>