[TASK] Add tool-function to strip PATH_site-part of paths
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / Classes / Hook / CrawlerHook.php
1 <?php
2 namespace TYPO3\CMS\IndexedSearch\Hook;
3
4 /***************************************************************
5 * Copyright notice
6 *
7 * (c) 2001-2013 Kasper Skårhøj (kasperYYYY@typo3.com)
8 * All rights reserved
9 *
10 * This script is part of the TYPO3 project. The TYPO3 project is
11 * free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * The GNU General Public License can be found at
17 * http://www.gnu.org/copyleft/gpl.html.
18 * A copy is found in the textfile GPL.txt and important notices to the license
19 * from the author is found in LICENSE.txt distributed with these scripts.
20 *
21 *
22 * This script is distributed in the hope that it will be useful,
23 * but WITHOUT ANY WARRANTY; without even the implied warranty of
24 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25 * GNU General Public License for more details.
26 *
27 * This copyright notice MUST APPEAR in all copies of the script!
28 ***************************************************************/
29
30 use TYPO3\CMS\Backend\Utility\BackendUtility;
31 use TYPO3\CMS\Core\Utility\GeneralUtility;
32
/**
 * Crawler hook for indexed search. Works with the "crawler" extension
 *
 * The class has three responsibilities:
 * - crawler_init() creates crawler queue entries for "index_config" records that are due.
 * - crawler_execute() and the crawler_execute_type*() methods process such queue entries.
 * - processCmdmap_preProcess() / processDatamap_afterDatabaseOperations() are TCEmain hooks
 *   that clean up the index and (re-)index single records on the fly.
 *
 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
 */
class CrawlerHook {

    /**
     * Number of seconds to use as interval between queued indexing operations of URLs / files (types 2 & 3).
     * It is also used as the interval for sub pages queued by the page tree type (4).
     *
     * @var integer
     */
    public $secondsPerExternalUrl = 3;

    /**
     * Internal, dynamic: counts up for each queue entry scheduled by this instance (files, external URLs
     * and sub pages). Multiplied with $secondsPerExternalUrl to spread the scheduled execution times.
     *
     * @var integer
     */
    public $instanceCounter = 0;

    /**
     * Internal, static: the object reference to this class. Passed as call back reference with every
     * queue entry that is added through addQueueEntry_callBack().
     *
     * @var string
     */
    public $callBack = '&TYPO3\\CMS\\IndexedSearch\\Hook\\CrawlerHook';

    /**
     * Parent object (crawler lib). Only set by crawler_execute() for custom configuration types,
     * so that hook objects can call addQueueEntryForHook().
     *
     * @var object
     */
    public $pObj;

    /**
     * Initialization of crawler hook.
     * This function is asked for each instance of the crawler and we must check if something is timed to happen and if so put entry(s) in the crawlers log to start processing.
     * In reality we select indexing configurations and evaluate if any of them needs to run.
     *
     * @param object $pObj Parent object (tx_crawler lib)
     * @return void
     */
    public function crawler_init(&$pObj) {
        $now = intval($GLOBALS['EXEC_TIME']);
        // Select all indexing configurations which are waiting to be activated:
        $indexingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
            '*',
            'index_config',
            'hidden=0'
                . ' AND (starttime=0 OR starttime<=' . $now . ')'
                . ' AND timer_next_indexing<' . $now
                . ' AND set_id=0 '
                . BackendUtility::deleteClause('index_config')
        );
        // A failed query does not yield an array; treat that like "nothing to do".
        if (is_array($indexingConfigurations)) {
            // For each configuration, check if it should be executed and if so, start:
            foreach ($indexingConfigurations as $cfgRec) {
                // Generate a unique set-ID:
                $setId = GeneralUtility::md5int(microtime());
                // Get next time:
                $nextTime = $this->generateNextIndexingTime($cfgRec);
                // Start process by updating index-config record:
                $field_array = array(
                    'set_id' => $setId,
                    'timer_next_indexing' => $nextTime,
                    'session_data' => ''
                );
                $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_config', 'uid=' . intval($cfgRec['uid']), $field_array);
                // Build the parameters of the first queue entry, based on configuration type.
                // $params stays NULL if nothing has to be queued.
                $params = NULL;
                switch ($cfgRec['type']) {
                    case 1:
                        // RECORDS:
                        $params = array(
                            'indexConfigUid' => $cfgRec['uid'],
                            'procInstructions' => array('[Index Cfg UID#' . $cfgRec['uid'] . ']'),
                            'url' => 'Records (start)'
                        );
                        break;
                    case 2:
                        // FILES:
                        $params = array(
                            // General
                            'indexConfigUid' => $cfgRec['uid'],
                            // General
                            'procInstructions' => array('[Index Cfg UID#' . $cfgRec['uid'] . ']'),
                            // Partly general... (for URL and file types)
                            'url' => $cfgRec['filepath'],
                            'depth' => 0
                        );
                        break;
                    case 3:
                        // External URL:
                        $params = array(
                            // General
                            'indexConfigUid' => $cfgRec['uid'],
                            // General
                            'procInstructions' => array('[Index Cfg UID#' . $cfgRec['uid'] . ']'),
                            // Partly general... (for URL and file types)
                            'url' => $cfgRec['externalUrl'],
                            'depth' => 0
                        );
                        break;
                    case 4:
                        // Page tree:
                        $params = array(
                            // General
                            'indexConfigUid' => $cfgRec['uid'],
                            // General
                            'procInstructions' => array('[Index Cfg UID#' . $cfgRec['uid'] . ']'),
                            // Partly general... (for URL and file types and page tree (root))
                            'url' => intval($cfgRec['alternative_source_pid']),
                            'depth' => 0
                        );
                        break;
                    case 5:
                        // Meta configuration, nothing to do:
                        // NOOP
                        break;
                    default:
                        // Custom type, handled by a registered hook object:
                        $hookObj = $this->getHookObjectForType($cfgRec['type']);
                        if ($hookObj !== NULL) {
                            // The original code passed an undefined variable here; define it explicitly.
                            $message = NULL;
                            $params = array(
                                // General
                                'indexConfigUid' => $cfgRec['uid'],
                                // General
                                'procInstructions' => array('[Index Cfg UID#' . $cfgRec['uid'] . '/CUSTOM]'),
                                'url' => $hookObj->initMessage($message)
                            );
                        }
                }
                if ($params !== NULL) {
                    $pObj->addQueueEntry_callBack($setId, $params, $this->callBack, $cfgRec['pid']);
                }
            }
        }
        // Finally, look up all old index configurations which are finished and needs to be reset and done.
        $this->cleanUpOldRunningConfigurations();
    }

    /**
     * Call back function for execution of a log element
     *
     * Loads the indexing configuration, dispatches to the handler of its type and finally
     * stores the (possibly modified) session data back into the configuration record.
     *
     * @param array $params Params from log element. Must contain $params['indexConfigUid']
     * @param object $pObj Parent object (tx_crawler lib)
     * @return array Result array: array('log' => $params)
     */
    public function crawler_execute($params, &$pObj) {
        // Indexer configuration ID must exist:
        if (!empty($params['indexConfigUid'])) {
            // Load the indexing configuration record:
            $cfgRec = $GLOBALS['TYPO3_DB']->exec_SELECTgetSingleRow('*', 'index_config', 'uid=' . intval($params['indexConfigUid']));
            if (is_array($cfgRec)) {
                // Unpack session data:
                $session_data = unserialize($cfgRec['session_data']);
                // Select which type:
                switch ($cfgRec['type']) {
                    case 1:
                        // Records:
                        $this->crawler_execute_type1($cfgRec, $session_data, $params, $pObj);
                        break;
                    case 2:
                        // Files
                        $this->crawler_execute_type2($cfgRec, $session_data, $params, $pObj);
                        break;
                    case 3:
                        // External URL:
                        $this->crawler_execute_type3($cfgRec, $session_data, $params, $pObj);
                        break;
                    case 4:
                        // Page tree:
                        $this->crawler_execute_type4($cfgRec, $session_data, $params, $pObj);
                        break;
                    case 5:
                        // Meta
                        // NOOP (should never enter here!)
                        break;
                    default:
                        // Custom type, handled by a registered hook object:
                        $hookObj = $this->getHookObjectForType($cfgRec['type']);
                        if ($hookObj !== NULL) {
                            // For addQueueEntryForHook()
                            $this->pObj = $pObj;
                            $hookObj->indexOperation($cfgRec, $session_data, $params, $this);
                        }
                }
                // Save process data which might be modified:
                $field_array = array(
                    'session_data' => serialize($session_data)
                );
                $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_config', 'uid=' . intval($cfgRec['uid']), $field_array);
            }
        }
        return array('log' => $params);
    }

    /**
     * Indexing records from a table
     *
     * Indexes one batch of records (ordered by uid, starting after the uid stored in the session data)
     * and queues another entry for the next batch as long as records were found.
     *
     * @param array $cfgRec Indexing Configuration Record
     * @param array $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
     * @param array $params Parameters from the log queue.
     * @param object $pObj Parent object (from "crawler" extension!)
     * @return void
     */
    public function crawler_execute_type1($cfgRec, &$session_data, $params, &$pObj) {
        // The table must be configured and known to TCA:
        if (!$cfgRec['table2index'] || !isset($GLOBALS['TCA'][$cfgRec['table2index']])) {
            return;
        }
        // Init session data array if not already:
        if (!is_array($session_data)) {
            $session_data = array(
                'uid' => 0
            );
        }
        // Init:
        $pid = intval($cfgRec['alternative_source_pid']) ? intval($cfgRec['alternative_source_pid']) : $cfgRec['pid'];
        $numberOfRecords = $cfgRec['recordsbatch'] ? \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($cfgRec['recordsbatch'], 1) : 100;
        // Get root line:
        $rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
        // Select next batch:
        $recs = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
            '*',
            $cfgRec['table2index'],
            'pid = ' . intval($pid)
                . ' AND uid > ' . intval($session_data['uid'])
                . BackendUtility::deleteClause($cfgRec['table2index'])
                . BackendUtility::BEenableFields($cfgRec['table2index']),
            '',
            'uid',
            $numberOfRecords
        );
        // Traverse:
        if (is_array($recs) && count($recs)) {
            foreach ($recs as $r) {
                // Index single record:
                $this->indexSingleRecord($r, $cfgRec, $rl);
                // Update the UID we last processed:
                $session_data['uid'] = $r['uid'];
            }
            // Finally, set entry for next indexing of batch of records
            // ($session_data['uid'] holds the uid of the last record of this batch):
            $nparams = array(
                'indexConfigUid' => $cfgRec['uid'],
                'url' => 'Records from UID#' . ($session_data['uid'] + 1) . '-?',
                'procInstructions' => array('[Index Cfg UID#' . $cfgRec['uid'] . ']')
            );
            $pObj->addQueueEntry_callBack($cfgRec['set_id'], $nparams, $this->callBack, $cfgRec['pid']);
        }
    }

    /**
     * Indexing files from fileadmin
     *
     * If $params['url'] points to a file, the file is indexed. If it points to a directory, queue
     * entries are created for the files in it and - as long as the configured depth is not reached -
     * for its sub directories.
     *
     * @param array $cfgRec Indexing Configuration Record
     * @param array $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
     * @param array $params Parameters from the log queue.
     * @param object $pObj Parent object (from "crawler" extension!)
     * @return void
     */
    public function crawler_execute_type2($cfgRec, &$session_data, $params, &$pObj) {
        // Prepare path, making it absolute and checking:
        $readpath = $params['url'];
        if (!GeneralUtility::isAbsPath($readpath)) {
            $readpath = GeneralUtility::getFileAbsFileName($readpath);
        }
        if (!GeneralUtility::isAllowedAbsPath($readpath)) {
            return;
        }
        if (@is_file($readpath)) {
            // If file, index it!
            // Get root line (need to provide this when indexing external files)
            $rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
            // Load indexer if not yet.
            $this->loadIndexerClass();
            // (Re)-Indexing file on page.
            $indexerObj = GeneralUtility::makeInstance('TYPO3\\CMS\\IndexedSearch\\Indexer');
            $indexerObj->backend_initIndexer($cfgRec['pid'], 0, 0, '', $rl);
            $indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
            // EXPERIMENT - but to avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)
            $indexerObj->hash['phash'] = -1;
            // Index document:
            $indexerObj->indexRegularDocument(\TYPO3\CMS\Core\Utility\PathUtility::stripPathSitePrefix($readpath), TRUE);
        } elseif (@is_dir($readpath)) {
            // If dir, read content and create new pending items for log:
            // Select files and directories in path:
            $extList = implode(',', GeneralUtility::trimExplode(',', $cfgRec['extensions'], TRUE));
            $fileArr = array();
            $files = GeneralUtility::getAllFilesAndFoldersInPath($fileArr, $readpath, $extList, 0, 0);
            $directoryList = GeneralUtility::get_dirs($readpath);
            if (is_array($directoryList) && $params['depth'] < $cfgRec['depth']) {
                // Make sure exactly one slash separates the directory from the sub directory name,
                // also if the configured path was given without trailing slash.
                $directoryPrefix = rtrim($readpath, '/') . '/';
                foreach ($directoryList as $subdir) {
                    if ((string) $subdir !== '') {
                        $files[] = $directoryPrefix . $subdir . '/';
                    }
                }
            }
            $files = GeneralUtility::removePrefixPathFromList($files, PATH_site);
            // NOTE(review): removePrefixPathFromList() is understood to return an error string instead of
            // an array if a path is not prefixed with PATH_site - confirm against GeneralUtility.
            if (!is_array($files)) {
                return;
            }
            // Traverse the items and create log entries:
            foreach ($files as $path) {
                $this->instanceCounter++;
                if ($path !== $params['url']) {
                    // Parameters:
                    $nparams = array(
                        'indexConfigUid' => $cfgRec['uid'],
                        'url' => $path,
                        'procInstructions' => array('[Index Cfg UID#' . $cfgRec['uid'] . ']'),
                        'depth' => $params['depth'] + 1
                    );
                    $pObj->addQueueEntry_callBack($cfgRec['set_id'], $nparams, $this->callBack, $cfgRec['pid'], $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl);
                }
            }
        }
    }

    /**
     * Indexing External URLs
     *
     * Indexes $params['url'] and - as long as the configured depth is not reached - queues the
     * links found on it which pass checkUrl() and are not denied by checkDeniedSuburls().
     *
     * @param array $cfgRec Indexing Configuration Record
     * @param array $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
     * @param array $params Parameters from the log queue.
     * @param object $pObj Parent object (from "crawler" extension!)
     * @return void
     */
    public function crawler_execute_type3($cfgRec, &$session_data, $params, &$pObj) {
        // Init session data array if not already:
        if (!is_array($session_data)) {
            $session_data = array(
                'urlLog' => array($params['url'])
            );
        }
        // Index the URL:
        $rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
        $subUrls = $this->indexExtUrl($params['url'], $cfgRec['pid'], $rl, $cfgRec['uid'], $cfgRec['set_id']);
        // Add more elements to log now:
        if ($params['depth'] < $cfgRec['depth']) {
            foreach ($subUrls as $url) {
                $url = $this->checkUrl($url, $session_data['urlLog'], $cfgRec['externalUrl']);
                if (!$url || $this->checkDeniedSuburls($url, $cfgRec['url_deny'])) {
                    continue;
                }
                $this->instanceCounter++;
                $session_data['urlLog'][] = $url;
                // Parameters:
                $nparams = array(
                    'indexConfigUid' => $cfgRec['uid'],
                    'url' => $url,
                    'procInstructions' => array('[Index Cfg UID#' . $cfgRec['uid'] . ']'),
                    'depth' => $params['depth'] + 1
                );
                $pObj->addQueueEntry_callBack($cfgRec['set_id'], $nparams, $this->callBack, $cfgRec['pid'], $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl);
            }
        }
    }

    /**
     * Page tree indexing type
     *
     * Submits the URLs of the page given in $params['url'] to the crawler (with the processing
     * instruction "tx_indexedsearch_reindex") and - as long as the configured depth is not reached -
     * queues an entry for each of its sub pages.
     *
     * @param array $cfgRec Indexing Configuration Record
     * @param array $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
     * @param array $params Parameters from the log queue.
     * @param object $pObj Parent object (from "crawler" extension!)
     * @return void
     */
    public function crawler_execute_type4($cfgRec, &$session_data, $params, &$pObj) {
        // Base page uid:
        $pageUid = intval($params['url']);
        // Get array of URLs from page:
        $pageRow = BackendUtility::getRecord('pages', $pageUid);
        $res = $pObj->getUrlsForPageRow($pageRow);
        // Registry for duplicates
        $duplicateTrack = array();
        // Dummy.
        $downloadUrls = array();
        // Submit URLs (the returned URL list is not needed here):
        if (is_array($res)) {
            foreach ($res as $vv) {
                $pObj->urlListFromUrlArray($vv, $pageRow, $GLOBALS['EXEC_TIME'], 30, 1, 0, $duplicateTrack, $downloadUrls, array('tx_indexedsearch_reindex'));
            }
        }
        // Add subpages to log now:
        if ($params['depth'] < $cfgRec['depth']) {
            // Subpages selected
            $recs = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('uid,title', 'pages', 'pid = ' . intval($pageUid) . BackendUtility::deleteClause('pages'));
            // Traverse subpages and add to queue:
            if (is_array($recs) && count($recs)) {
                // The session data is not initialized for this type before, so make sure it is an
                // array before entries are appended.
                if (!is_array($session_data)) {
                    $session_data = array();
                }
                foreach ($recs as $r) {
                    $this->instanceCounter++;
                    $url = 'pages:' . $r['uid'] . ': ' . $r['title'];
                    $session_data['urlLog'][] = $url;
                    // Parameters:
                    $nparams = array(
                        'indexConfigUid' => $cfgRec['uid'],
                        'url' => $r['uid'],
                        'procInstructions' => array('[Index Cfg UID#' . $cfgRec['uid'] . ']'),
                        'depth' => $params['depth'] + 1
                    );
                    $pObj->addQueueEntry_callBack($cfgRec['set_id'], $nparams, $this->callBack, $cfgRec['pid'], $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl);
                }
            }
        }
    }

    /**
     * Look up all old index configurations which are finished and needs to be reset and done
     *
     * A running configuration (set_id<>0) is regarded as finished if no entry with exec_time=0 exists
     * for its set in tx_crawler_queue. For a finished configuration the index entries of older sets
     * are removed and set_id / session_data of the configuration record are reset.
     *
     * @return void
     */
    public function cleanUpOldRunningConfigurations() {
        // Tables holding registrations for a phash (code copied from class.tx_indexedsearch_modfunc1.php)
        $tableArr = explode(',', 'index_phash,index_rel,index_section,index_grlist,index_fulltext,index_debug');
        // Lookup running index configurations:
        $runningIndexingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('uid,set_id', 'index_config', 'set_id<>0' . BackendUtility::deleteClause('index_config'));
        if (!is_array($runningIndexingConfigurations)) {
            return;
        }
        // For each running configuration, look up how many log entries there are which are scheduled for execution and if none, clear the "set_id" (means; Processing was DONE)
        foreach ($runningIndexingConfigurations as $cfgRec) {
            // Look for ended processes:
            $queued_items = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('*', 'tx_crawler_queue', 'set_id=' . intval($cfgRec['set_id']) . ' AND exec_time=0');
            if ($queued_items) {
                continue;
            }
            // Lookup old phash rows:
            $oldPhashRows = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('phash', 'index_phash', 'freeIndexUid=' . intval($cfgRec['uid']) . ' AND freeIndexSetId<>' . intval($cfgRec['set_id']));
            if (is_array($oldPhashRows)) {
                foreach ($oldPhashRows as $pHashRow) {
                    // Removing old registrations for all tables
                    foreach ($tableArr as $table) {
                        $GLOBALS['TYPO3_DB']->exec_DELETEquery($table, 'phash=' . intval($pHashRow['phash']));
                    }
                }
            }
            // End process by updating index-config record:
            $field_array = array(
                'set_id' => 0,
                'session_data' => ''
            );
            $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_config', 'uid=' . intval($cfgRec['uid']), $field_array);
        }
    }

    /*****************************************
     *
     * Helper functions
     *
     *****************************************/
    /**
     * Returns the hook object registered for a custom indexing configuration type in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$type].
     *
     * @param string $type Value of the "type" field of the indexing configuration record
     * @return object|NULL The hook object, or NULL if no hook is registered or it could not be instantiated
     */
    protected function getHookObjectForType($type) {
        if (empty($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$type])) {
            return NULL;
        }
        $hookObj = GeneralUtility::getUserObj($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$type]);
        return is_object($hookObj) ? $hookObj : NULL;
    }

    /**
     * Check if an input URL are allowed to be indexed. Depends on whether it is already present in the url log.
     *
     * Before the check a double slash at the end of the URL is reduced to a single one and a
     * fragment ("#...") is removed. URLs containing "../" are never allowed.
     *
     * @param string $url URL string to check
     * @param array $urlLog Array of already indexed URLs (input url is looked up here and must not exist already)
     * @param string $baseUrl Base URL of the indexing process (input URL must be "inside" the base URL!)
     * @return string|boolean Returns the (cleaned) URL if OK, otherwise FALSE
     */
    public function checkUrl($url, $urlLog, $baseUrl) {
        $url = preg_replace('/\\/\\/$/', '/', $url);
        list($url) = explode('#', $url);
        if (strpos($url, '../') !== FALSE) {
            return FALSE;
        }
        if (!GeneralUtility::isFirstPartOfStr($url, $baseUrl)) {
            return FALSE;
        }
        // All entries of the log are strings, so compare strictly.
        if (is_array($urlLog) && in_array($url, $urlLog, TRUE)) {
            return FALSE;
        }
        return $url;
    }

    /**
     * Indexing External URL
     *
     * Indexes the URL and returns the hyperlinks found in its content. Links without a scheme are
     * made absolute: links starting with "/" relative to scheme and host (and port) of $url, all
     * others relative to the base href of the document (or the directory of $url if there is none).
     *
     * @param string $url URL, http://....
     * @param integer $pageId Page id to relate indexing to.
     * @param array $rl Rootline array to relate indexing to
     * @param integer $cfgUid Configuration UID
     * @param integer $setId Set ID value
     * @return array URLs found on this page
     */
    public function indexExtUrl($url, $pageId, $rl, $cfgUid, $setId) {
        // Load indexer if not yet.
        $this->loadIndexerClass();
        // Index external URL:
        $indexerObj = GeneralUtility::makeInstance('TYPO3\\CMS\\IndexedSearch\\Indexer');
        $indexerObj->backend_initIndexer($pageId, 0, 0, '', $rl);
        $indexerObj->backend_setFreeIndexUid($cfgUid, $setId);
        // To avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)
        $indexerObj->hash['phash'] = -1;
        $indexerObj->indexExternalUrl($url);
        // parse_url() returns FALSE for seriously malformed URLs and omits components which are not present.
        $url_qParts = parse_url($url);
        if (!is_array($url_qParts)) {
            $url_qParts = array();
        }
        $scheme = isset($url_qParts['scheme']) ? $url_qParts['scheme'] : '';
        $host = isset($url_qParts['host']) ? $url_qParts['host'] : '';
        // Keep an explicit port, otherwise resolved links would point to the default port of the scheme.
        $port = isset($url_qParts['port']) ? ':' . $url_qParts['port'] : '';
        $path = isset($url_qParts['path']) ? $url_qParts['path'] : '';
        $baseAbsoluteHref = $scheme . '://' . $host . $port;
        $baseHref = $indexerObj->extractBaseHref($indexerObj->indexExternalUrl_content);
        if (!$baseHref) {
            // Extract base href from current URL (everything up to the last slash of the path)
            $lastSlashPosition = strrpos($path, '/');
            $baseHref = $baseAbsoluteHref . ($lastSlashPosition !== FALSE ? substr($path, 0, $lastSlashPosition) : '');
        }
        $baseHref = rtrim($baseHref, '/');
        // Get URLs on this page:
        $subUrls = array();
        $list = $indexerObj->extractHyperLinks($indexerObj->indexExternalUrl_content);
        // Traverse links:
        foreach ($list as $linkInfo) {
            // Decode entities:
            $subUrl = GeneralUtility::htmlspecialchars_decode($linkInfo['href']);
            $qParts = parse_url($subUrl);
            if (!is_array($qParts) || empty($qParts['scheme'])) {
                $relativeUrl = GeneralUtility::resolveBackPath($subUrl);
                // substr() instead of $relativeUrl[0], because the link may be empty
                if (substr($relativeUrl, 0, 1) === '/') {
                    $subUrl = $baseAbsoluteHref . $relativeUrl;
                } else {
                    $subUrl = $baseHref . '/' . $relativeUrl;
                }
            }
            $subUrls[] = $subUrl;
        }
        return $subUrls;
    }

    /**
     * Indexing Single Record
     *
     * The first field of the configured field list is used as title, the values of all further
     * fields are concatenated (separated by space) and used as content.
     *
     * @param array $r Record to index
     * @param array $cfgRec Configuration Record
     * @param array $rl Rootline array to relate indexing to. If not an array, it is looked up for the page of the configuration record.
     * @return void
     */
    public function indexSingleRecord($r, $cfgRec, $rl = NULL) {
        // Load indexer if not yet.
        $this->loadIndexerClass();
        // Init:
        $rl = is_array($rl) ? $rl : $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
        $fieldList = GeneralUtility::trimExplode(',', $cfgRec['fieldlist'], TRUE);
        $ctrl = isset($GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']) ? $GLOBALS['TCA'][$cfgRec['table2index']]['ctrl'] : array();
        $languageField = isset($ctrl['languageField']) ? $ctrl['languageField'] : '';
        $sys_language_uid = $languageField ? $r[$languageField] : 0;
        // Modification / creation time of the record, NULL if the table does not define such fields:
        $tstamp = isset($ctrl['tstamp']) && isset($r[$ctrl['tstamp']]) ? $r[$ctrl['tstamp']] : NULL;
        $crdate = isset($ctrl['crdate']) && isset($r[$ctrl['crdate']]) ? $r[$ctrl['crdate']] : NULL;
        // (Re)-Indexing a row from a table:
        $indexerObj = GeneralUtility::makeInstance('TYPO3\\CMS\\IndexedSearch\\Indexer');
        parse_str(str_replace('###UID###', $r['uid'], $cfgRec['get_params']), $GETparams);
        $indexerObj->backend_initIndexer($cfgRec['pid'], 0, $sys_language_uid, '', $rl, $GETparams, $cfgRec['chashcalc'] ? TRUE : FALSE);
        $indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
        $indexerObj->forceIndexing = TRUE;
        // Title stays empty if the field list is empty:
        $theTitle = '';
        $theContent = '';
        foreach ($fieldList as $k => $v) {
            $fieldValue = isset($r[$v]) ? $r[$v] : '';
            if (!$k) {
                $theTitle = $fieldValue;
            } else {
                $theContent .= $fieldValue . ' ';
            }
        }
        // Indexing the record as a page (but with parameters set, see ->backend_setFreeIndexUid())
        // A space is inserted before each tag, so that words do not get merged when the tags are stripped.
        $indexerObj->backend_indexAsTYPO3Page(
            strip_tags(str_replace('<', ' <', $theTitle)),
            '',
            '',
            strip_tags(str_replace('<', ' <', $theContent)),
            $GLOBALS['LANG']->charSet,
            $tstamp,
            $crdate,
            $r['uid']
        );
    }

    /**
     * Include indexer class.
     *
     * @return void
     */
    public function loadIndexerClass() {
        // NOTE(review): not used in this method itself; presumably kept in scope for the included
        // file, which is executed in the scope of this method - confirm before removing.
        global $TYPO3_CONF_VARS;
        require_once \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::extPath('indexed_search') . 'class.indexer.php';
    }

    /**
     * Get rootline for closest TypoScript template root.
     * Algorithm same as used in Web > Template, Object browser
     *
     * @param integer $id The page id to traverse rootline back from
     * @return array Array where the root lines uid values are found.
     */
    public function getUidRootLineForClosestTemplate($id) {
        $tmpl = GeneralUtility::makeInstance('TYPO3\\CMS\\Core\\TypoScript\\ExtendedTemplateService');
        // Do not log time-performance information
        $tmpl->tt_track = 0;
        $tmpl->init();
        // Gets the rootLine
        $sys_page = GeneralUtility::makeInstance('TYPO3\\CMS\\Frontend\\Page\\PageRepository');
        $rootLine = $sys_page->getRootLine($id);
        // This generates the constants/config + hierarchy info for the template.
        $tmpl->runThroughTemplates($rootLine, 0);
        // Root line uids
        $rootline_uids = array();
        foreach ($tmpl->rootLine as $rlkey => $rldat) {
            $rootline_uids[$rlkey] = $rldat['uid'];
        }
        return $rootline_uids;
    }

    /**
     * Generate the unix time stamp for next visit.
     *
     * The result is the first point in time which is not before the current execution time and lies
     * a whole number of frequency intervals after "a midnight + timer_offset".
     *
     * @param array $cfgRec Index configuration record
     * @return integer The next time stamp
     */
    public function generateNextIndexingTime($cfgRec) {
        $currentTime = $GLOBALS['EXEC_TIME'];
        // Now, find a midnight time to use for offset calculation. This has to differ depending on whether we have frequencies within a day or more than a day; Less than a day, we don't care which day to use for offset, more than a day we want to respect the currently entered day as offset regardless of when the script is run - thus the day-of-week used in case "Weekly" is selected will be respected
        if ($cfgRec['timer_frequency'] <= 24 * 3600) {
            $aMidNight = mktime(0, 0, 0) - 1 * 24 * 3600;
        } else {
            $lastTime = intval($cfgRec['timer_next_indexing'] ? $cfgRec['timer_next_indexing'] : $GLOBALS['EXEC_TIME']);
            // Four digit year ("Y"), so that the result does not depend on how mktime() maps two digit years.
            $aMidNight = mktime(0, 0, 0, intval(date('m', $lastTime)), intval(date('d', $lastTime)), intval(date('Y', $lastTime)));
        }
        // Find last offset time plus frequency in seconds:
        $lastSureOffset = $aMidNight + \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($cfgRec['timer_offset'], 0, 86400);
        $frequencySeconds = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($cfgRec['timer_frequency'], 1);
        // Now, find out how many blocks of the length of frequency there is until the next time:
        $frequencyBlocksUntilNextTime = ceil(($currentTime - $lastSureOffset) / $frequencySeconds);
        // Set next time to the offset + the frequencyblocks multiplied with the frequency length in seconds.
        // ceil() returns a float, so convert to the documented integer.
        return intval($lastSureOffset + $frequencyBlocksUntilNextTime * $frequencySeconds);
    }

    /**
     * Checks if $url has any of the URls in the $url_deny "list" in it and if so, returns TRUE.
     *
     * @param string $url URL to test
     * @param string $url_deny String where URLs are separated by line-breaks; If any of these strings is the first part of $url, the function returns TRUE (to indicate denial of descend)
     * @return boolean TRUE if there is a matching URL (hence, do not index!)
     */
    public function checkDeniedSuburls($url, $url_deny) {
        if (trim($url_deny)) {
            $url_denyArray = GeneralUtility::trimExplode(LF, $url_deny, TRUE);
            foreach ($url_denyArray as $testurl) {
                if (GeneralUtility::isFirstPartOfStr($url, $testurl)) {
                    return TRUE;
                }
            }
        }
        return FALSE;
    }

    /**
     * Adding entry in queue for Hook
     *
     * Uses the parent object stored in $this->pObj, which is set by crawler_execute() right before
     * the hook object of a custom configuration type is called.
     *
     * @param array $cfgRec Configuration record
     * @param string $title Title/URL
     * @return void
     */
    public function addQueueEntryForHook($cfgRec, $title) {
        $nparams = array(
            // This must ALWAYS be the cfgRec uid!
            'indexConfigUid' => $cfgRec['uid'],
            'url' => $title,
            'procInstructions' => array('[Index Cfg UID#' . $cfgRec['uid'] . ']')
        );
        $this->pObj->addQueueEntry_callBack($cfgRec['set_id'], $nparams, $this->callBack, $cfgRec['pid']);
    }

    /**
     * Deletes all data stored by indexed search for a given page
     *
     * @param integer $id Uid of the page to delete all pHash
     * @return void
     */
    public function deleteFromIndex($id) {
        // Lookup old phash rows:
        $oldPhashRows = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('phash', 'index_section', 'page_id=' . intval($id));
        if (!is_array($oldPhashRows) || !count($oldPhashRows)) {
            return;
        }
        $pHashesToDelete = array();
        foreach ($oldPhashRows as $pHashRow) {
            $pHashesToDelete[] = $pHashRow['phash'];
        }
        $where_clause = 'phash IN (' . implode(',', $GLOBALS['TYPO3_DB']->cleanIntArray($pHashesToDelete)) . ')';
        $tables = explode(',', 'index_debug,index_fulltext,index_grlist,index_phash,index_rel,index_section');
        foreach ($tables as $table) {
            $GLOBALS['TYPO3_DB']->exec_DELETEquery($table, $where_clause);
        }
    }

    /*************************
     *
     * Hook functions for TCEmain (indexing of records)
     *
     *************************/
    /**
     * TCEmain hook function: removes a page from the index when the page is deleted
     *
     * @param string $command TCEmain command
     * @param string $table Table name
     * @param string $id Record ID. If new record its a string pointing to index inside \TYPO3\CMS\Core\DataHandling\DataHandler::substNEWwithIDs
     * @param mixed $value Target value (ignored)
     * @param object $pObj Reference to tcemain calling object
     * @return void
     */
    public function processCmdmap_preProcess($command, $table, $id, $value, $pObj) {
        // Clean up the index
        if ($command == 'delete' && $table == 'pages') {
            $this->deleteFromIndex($id);
        }
    }

    /**
     * TCEmain hook function for on-the-fly indexing of database records
     *
     * Removes a page from the index if it gets hidden or excluded from search and indexes the
     * changed record for every matching "record" indexing configuration with "records_indexonchange" set.
     *
     * @param string $status Status "new" or "update"
     * @param string $table Table name
     * @param string $id Record ID. If new record its a string pointing to index inside \TYPO3\CMS\Core\DataHandling\DataHandler::substNEWwithIDs
     * @param array $fieldArray Field array of updated fields in the operation
     * @param object $pObj Reference to tcemain calling object
     * @return void
     */
    public function processDatamap_afterDatabaseOperations($status, $table, $id, $fieldArray, $pObj) {
        // Check if any fields are actually updated:
        if (empty($fieldArray)) {
            return;
        }
        if ($status == 'new') {
            // Translate new ids.
            $id = $pObj->substNEWwithIDs[$id];
        } elseif ($table == 'pages' && $status == 'update') {
            $isHidden = array_key_exists('hidden', $fieldArray) && $fieldArray['hidden'] == 1;
            $isExcludedFromSearch = array_key_exists('no_search', $fieldArray) && $fieldArray['no_search'] == 1;
            if ($isHidden || $isExcludedFromSearch) {
                // If the page should be hidden or not indexed after update, delete index for this page
                $this->deleteFromIndex($id);
            }
        }
        // Get full record and if exists, search for indexing configurations:
        $currentRecord = BackendUtility::getRecord($table, $id);
        if (!is_array($currentRecord)) {
            return;
        }
        $recordPid = intval($currentRecord['pid']);
        // Select all (not running) indexing configurations of type "record" (1) and which points to this table and is located on the same page as the record or pointing to the right source PID
        $indexingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
            '*',
            'index_config',
            'hidden=0'
                . ' AND (starttime=0 OR starttime<=' . intval($GLOBALS['EXEC_TIME']) . ')'
                . ' AND set_id=0'
                . ' AND type=1'
                . ' AND table2index=' . $GLOBALS['TYPO3_DB']->fullQuoteStr($table, 'index_config')
                . ' AND ((alternative_source_pid=0 AND pid=' . $recordPid . ') OR (alternative_source_pid=' . $recordPid . '))'
                . ' AND records_indexonchange=1 '
                . BackendUtility::deleteClause('index_config')
        );
        if (is_array($indexingConfigurations)) {
            foreach ($indexingConfigurations as $cfgRec) {
                $this->indexSingleRecord($currentRecord, $cfgRec);
            }
        }
    }

}