[BUGFIX] Use AjaxDataHandler to delete records from context menu
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / Classes / Indexer.php
1 <?php
2 namespace TYPO3\CMS\IndexedSearch;
3
4 /*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use Psr\Http\Message\ServerRequestInterface;
18 use TYPO3\CMS\Core\Configuration\ExtensionConfiguration;
19 use TYPO3\CMS\Core\Context\Context;
20 use TYPO3\CMS\Core\Context\LanguageAspect;
21 use TYPO3\CMS\Core\Core\Environment;
22 use TYPO3\CMS\Core\Database\Connection;
23 use TYPO3\CMS\Core\Database\ConnectionPool;
24 use TYPO3\CMS\Core\Routing\PageArguments;
25 use TYPO3\CMS\Core\TimeTracker\TimeTracker;
26 use TYPO3\CMS\Core\Utility\GeneralUtility;
27 use TYPO3\CMS\Core\Utility\MathUtility;
28 use TYPO3\CMS\Core\Utility\PathUtility;
29 use TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController;
30 use TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility;
31
32 /**
33 * Indexing class for TYPO3 frontend
34 */
35 class Indexer
36 {
37
38 /**
39 * @var array Reason texts keyed by the result code of checkMtimeTstamp(); used in the "indexing needed / not needed" log messages
40 */
41 public $reasons = [
42 -1 => 'mtime matched the document, so no changes detected and no content updated',
43 -2 => 'The minimum age was not exceeded',
44 1 => 'The configured max-age was exceeded for the document and thus it\'s indexed.',
45 2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
46 3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
47 4 => 'Page has never been indexed (is not represented in the index_phash table).'
48 ];
49
50 /**
51 * HTML code blocks to exclude from indexing (comma separated list of tag names)
52 *
53 * @var string
54 */
55 public $excludeSections = 'script,style';
56
57 /**
58 * External parser objects for external files: keys are file extension names, values are objects with certain methods.
59 *
60 * @var array
61 */
62 public $external_parsers = [];
63
64 /**
65 * Default fe-group list.
66 * Pages might be indexed separately for each usergroup combination to support search
67 * in access limited pages!
68 *
69 * @var string
70 */
71 public $defaultGrList = '0,-1';
72
73 /**
74 * Max time: if set, this tells a number of seconds that is the maximum age of an indexed document. Regardless of mtime the document will be re-indexed if this limit is exceeded.
75 *
76 * @var int
77 */
78 public $tstamp_maxAge = 0;
79
80 /**
81 * Min time: if set, this tells a minimum limit before a document can be indexed again.
82 * This is regardless of mtime.
83 *
84 * @var int
85 */
86 public $tstamp_minAge = 0;
87
88 /**
89 * Max number of external files to index.
90 *
91 * @var int
92 */
93 public $maxExternalFiles = 0;
94
95 /**
96 * If TRUE, indexing is forced despite of hashes etc.
97 *
98 * @var bool
99 */
100 public $forceIndexing = false;
101
102 /**
103 * Set when crawler is detected (internal)
104 *
105 * @var bool
106 */
107 public $crawlerActive = false;
108
109 /**
110 * Empty template for the content parts of a document; splitHTMLContent() starts from a copy of it
111 *
112 * @var array
113 */
114 public $defaultContentArray = [
115 'title' => '',
116 'description' => '',
117 'keywords' => '',
118 'body' => ''
119 ];
120
121 /**
122 * @var int
123 */
124 public $wordcount = 0;
125
126 /**
127 * @var int
128 */
129 public $externalFileCounter = 0;
130
131 /**
132 * @var array Configuration set internally (see init functions for required keys and their meaning)
133 */
134 public $conf = [];
135
136 /**
137 * Indexer configuration, coming from TYPO3's system configuration for EXT:indexed_search
138 *
139 * @var array
140 */
141 public $indexerConfig = [];
142
143 /**
144 * Hash array, contains phash and phash_grouping
145 *
146 * @var array
147 */
148 public $hash = [];
149
150 /**
151 * Hash array for files
152 *
153 * @var array
154 */
155 public $file_phash_arr = [];
156
157 /**
158 * Content of TYPO3 page, split into parts by splitHTMLContent()
159 *
160 * @var array
161 */
162 public $contentParts = [];
163
164 /**
165 * Hash calculated over the concatenated content parts (see indexTypo3PageContent())
166 *
167 * @var string
168 */
169 public $content_md5h = '';
170
171 /**
172 * @var array Internal log
173 */
174 public $internal_log = [];
175
176 /**
177 * Content fetched by the most recent indexExternalUrl() call
178 *
179 * @var string
180 */
181 public $indexExternalUrl_content = '';
182
183 /**
184 * @var int
185 */
186 public $freqRange = 32000;
187
188 /**
189 * @var float
190 */
191 public $freqMax = 0.1;
192
193 /**
194 * @var bool
195 */
196 public $enableMetaphoneSearch = false;
197
198 /**
199 * @var bool
200 */
201 public $storeMetaphoneInfoAsWords;
202
203 /**
204 * @var string
205 */
206 public $metaphoneContent = '';
207
208 /**
209 * Metaphone object, if any
210 *
211 * @var \TYPO3\CMS\IndexedSearch\Utility\DoubleMetaPhoneUtility
212 */
213 public $metaphoneObj;
214
215 /**
216 * Lexer object for word splitting
217 *
218 * @var \TYPO3\CMS\IndexedSearch\Lexer
219 */
220 public $lexerObj;
221
222 /**
223 * @var int Set in init() from the "flagBitMask" extension setting, forced into the range 0-255
224 */
225 public $flagBitMask;
226
227 /**
228 * @var TimeTracker
229 */
230 protected $timeTracker;
231
/**
 * Indexer constructor.
 *
 * Obtains a TimeTracker through GeneralUtility::makeInstance() and keeps it in $this->timeTracker.
 */
public function __construct()
{
    $timeTracker = GeneralUtility::makeInstance(TimeTracker::class);
    $this->timeTracker = $timeTracker;
}
239
/**
 * Parent Object (TSFE) Initialization
 *
 * Decides whether the page held by the frontend controller should be indexed and, if so, fills
 * $this->conf from the controller, initializes the indexer and indexes the page content.
 * Every reason for NOT indexing is written to the log via log_setTSlogMessage().
 *
 * @param TypoScriptFrontendController $pObj Parent Object, passed by reference
 */
public function hook_indexContent(&$pObj)
{
    // Indexer configuration from Extension Manager interface:
    $disableFrontendIndexing = (bool)GeneralUtility::makeInstance(ExtensionConfiguration::class)->get('indexed_search', 'disableFrontendIndexing');
    // Crawler activation:
    // Requirements are that the crawler is loaded, a crawler session is running and re-indexing requested as processing instruction.
    // The crawler data is read defensively: none of these keys exist on an ordinary frontend request.
    $crawlerData = $pObj->applicationData['tx_crawler'] ?? [];
    $procInstructions = (array)($crawlerData['parameters']['procInstructions'] ?? []);
    if (
        \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler')
        && !empty($crawlerData['running'])
        && in_array('tx_indexedsearch_reindex', $procInstructions, true)
    ) {
        // Setting simple log message:
        $pObj->applicationData['tx_crawler']['log'][] = 'Forced Re-indexing enabled';
        // Crawler active flag
        $this->crawlerActive = true;
        $this->forceIndexing = true;
    }
    // Determine if page should be indexed, and if so, configure and initialize indexer
    if (empty($pObj->config['config']['index_enable'])) {
        return;
    }
    $this->log_push('Index page', '');
    if ($disableFrontendIndexing && !$this->crawlerActive) {
        $this->log_setTSlogMessage('Index page? No, Ordinary Frontend indexing during rendering is disabled.');
    } elseif (!empty($pObj->page['no_search'])) {
        $this->log_setTSlogMessage('Index page? No, The "No Search" flag has been set in the page properties!');
    } elseif ($pObj->no_cache) {
        $this->log_setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
    } else {
        /** @var LanguageAspect $languageAspect */
        $languageAspect = GeneralUtility::makeInstance(Context::class)->getAspect('language');
        if ($languageAspect->getId() === $languageAspect->getContentId()) {
            // Setting up internal configuration from config array:
            $this->conf = [];
            // Information about page for which the indexing takes place: page id and page type
            $this->conf['id'] = $pObj->id;
            $this->conf['type'] = $pObj->type;
            // sys_language UID of the language of the indexing.
            $this->conf['sys_language_uid'] = $languageAspect->getId();
            // MP variable, if any (Mount Points)
            $this->conf['MP'] = $pObj->MP;
            // Group list
            $this->conf['gr_list'] = implode(',', GeneralUtility::makeInstance(Context::class)->getPropertyFromAspect('frontend.user', 'groupIds', [0, -1]));
            // Static page arguments, taken from the routing result of the current request (if any)
            $this->conf['staticPageArguments'] = [];
            $request = $GLOBALS['TYPO3_REQUEST'] ?? null;
            if ($request instanceof ServerRequestInterface) {
                /** @var PageArguments $pageArguments */
                $pageArguments = $request->getAttribute('routing', null);
                if ($pageArguments instanceof PageArguments) {
                    $this->conf['staticPageArguments'] = $pageArguments->getStaticArguments();
                }
            }
            // The creation date of the TYPO3 page
            $this->conf['crdate'] = $pObj->page['crdate'];
            // Root line uids
            $this->conf['rootline_uids'] = [];
            foreach ($pObj->config['rootLine'] ?? [] as $rlkey => $rldat) {
                $this->conf['rootline_uids'][$rlkey] = $rldat['uid'];
            }
            // Content string (HTML of TYPO3 page)
            $this->conf['content'] = $pObj->content;
            // Alternative title for indexing
            $this->conf['indexedDocTitle'] = $pObj->convOutputCharset($pObj->indexedDocTitle);
            // Character set of content (will be converted to utf-8 during indexing)
            $this->conf['metaCharset'] = $pObj->metaCharset;
            // Most recent modification time (seconds) of the content on the page. Used to evaluate whether it should be re-indexed.
            $this->conf['mtime'] = $pObj->register['SYS_LASTCHANGED'] ?? $pObj->page['SYS_LASTCHANGED'];
            // Configuration of behavior:
            // Whether to index external documents like PDF, DOC etc. (if possible)
            $this->conf['index_externals'] = $pObj->config['config']['index_externals'] ?? null;
            // Length of description text (max 250, default 200)
            $this->conf['index_descrLgd'] = $pObj->config['config']['index_descrLgd'] ?? null;
            // Whether to index document keywords and description (if present)
            $this->conf['index_metatags'] = $pObj->config['config']['index_metatags'] ?? true;
            // Set to zero:
            $this->conf['recordUid'] = 0;
            $this->conf['freeIndexUid'] = 0;
            $this->conf['freeIndexSetId'] = 0;
            // Init and start indexing:
            $this->init();
            $this->indexTypo3PageContent();
        } else {
            $this->log_setTSlogMessage('Index page? No, languageId was different from contentId which indicates that the page contains fall-back content and that would be falsely indexed as localized content.');
        }
    }
    $this->log_pull();
}
336
337 /****************************
338 *
339 * Backend API
340 *
341 ****************************/
/**
 * Initializing the "combined ID" of the page (phash) being indexed (or for which external media is attached)
 *
 * Replaces $this->conf with a fresh configuration built from the arguments plus fixed backend
 * defaults, then calls init().
 *
 * @param int $id The page uid, &id=
 * @param int $type The page type, &type=
 * @param int $sys_language_uid sys_language uid, typically &L=
 * @param string $MP The MP variable (Mount Points), &MP=
 * @param array $uidRL Rootline array of only UIDs.
 * @param array $queryArguments Array of GET variables to register with this indexing
 */
public function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $queryArguments = [])
{
    $this->conf = [
        // Information about the page for which the indexing takes place
        'id' => $id,
        'type' => $type,
        'sys_language_uid' => $sys_language_uid,
        'MP' => $MP,
        // Group list (hardcoded for now...)
        'gr_list' => '0,-1',
        'staticPageArguments' => $queryArguments,
        // Free index identifiers start at their defaults; see backend_setFreeIndexUid()
        'freeIndexUid' => 0,
        'freeIndexSetId' => 0,
        // Root line uids
        'rootline_uids' => $uidRL,
        // Whether to index external documents like PDF, DOC etc. (if possible)
        'index_externals' => 1,
        // Length of description text (max 250, default 200)
        'index_descrLgd' => 200,
        // Whether to index document keywords and description (if present)
        'index_metatags' => true,
    ];
    // Init and start indexing:
    $this->init();
}
384
/**
 * Sets the free-index uid. Can be called right after backend_initIndexer()
 *
 * Only the two free-index keys of $this->conf are written; all other keys are left alone.
 *
 * @param int $freeIndexUid Free index UID
 * @param int $freeIndexSetId Set id - an integer identifying the "set" of indexing operations.
 */
public function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId = 0)
{
    $freeIndexSettings = [
        'freeIndexUid' => $freeIndexUid,
        'freeIndexSetId' => $freeIndexSetId,
    ];
    foreach ($freeIndexSettings as $key => $value) {
        $this->conf[$key] = $value;
    }
}
396
/**
 * Indexing records as the content of a TYPO3 page.
 *
 * Wraps the given texts (HTML-escaped) into a minimal HTML document, stores it together with
 * the time stamps and record uid in $this->conf and runs indexTypo3PageContent() on it.
 *
 * @param string $title Title equivalent
 * @param string $keywords Keywords equivalent
 * @param string $description Description equivalent
 * @param string $content The main content to index
 * @param string $charset The charset of the title, keyword, description and body-content. MUST BE VALID, otherwise nothing is indexed!
 * @param int $mtime Last modification time, in seconds
 * @param int $crdate The creation date of the content, in seconds
 * @param int $recordUid The record UID that the content comes from (for registration with the indexed rows)
 */
public function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate = 0, $recordUid = 0)
{
    // Escape everything up front; the fake document below must not contain raw markup from the record
    $escapedTitle = htmlspecialchars($title);
    $escapedKeywords = htmlspecialchars($keywords);
    $escapedDescription = htmlspecialchars($description);
    $escapedContent = htmlspecialchars($content);

    // Most recent modification time (seconds) and creation date of the content
    $this->conf['mtime'] = $mtime;
    $this->conf['crdate'] = $crdate;
    // UID of the record, if applicable
    $this->conf['recordUid'] = $recordUid;
    // Construct fake HTML for parsing (content string, as if it was the HTML of a TYPO3 page):
    $this->conf['content'] = '
<html>
<head>
<title>' . $escapedTitle . '</title>
<meta name="keywords" content="' . $escapedKeywords . '" />
<meta name="description" content="' . $escapedDescription . '" />
</head>
<body>
' . $escapedContent . '
</body>
</html>';
    // Character set of content (will be converted to utf-8 during indexing)
    $this->conf['metaCharset'] = $charset;
    // No alternative title for indexing
    $this->conf['indexedDocTitle'] = '';
    // Index content as if it was a TYPO3 page:
    $this->indexTypo3PageContent();
}
439
440 /********************************
441 *
442 * Initialization
443 *
444 *******************************/
/**
 * Initializes the object. $this->conf MUST be set with proper values prior to this call!!!
 *
 * Calculates the page hashes, reads the extension configuration of EXT:indexed_search into
 * $this->indexerConfig and derives the age limits, file limit, flag bit mask and metaphone
 * switches from it. Also sets up external parsers (if enabled in $this->conf), the lexer and
 * the optional metaphone object.
 *
 * Missing or empty configuration values are treated as 0 / "not set" instead of being used
 * unchecked in arithmetic and array lookups.
 */
public function init()
{
    // Setting phash / phash_grouping which identifies the indexed page based on some of these variables:
    $this->setT3Hashes();
    // Indexer configuration from Extension Manager interface:
    $this->indexerConfig = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get('indexed_search');
    // minAge / maxAge are multiplied by 3600; the float cast keeps fractional values working and
    // prevents arithmetic on an empty or non-numeric string.
    $this->tstamp_minAge = MathUtility::forceIntegerInRange((float)($this->indexerConfig['minAge'] ?? 0) * 3600, 0);
    $this->tstamp_maxAge = MathUtility::forceIntegerInRange((float)($this->indexerConfig['maxAge'] ?? 0) * 3600, 0);
    $this->maxExternalFiles = MathUtility::forceIntegerInRange($this->indexerConfig['maxExternalFiles'] ?? 0, 0, 1000, 5);
    $this->flagBitMask = MathUtility::forceIntegerInRange($this->indexerConfig['flagBitMask'] ?? 0, 0, 255);
    // Workaround: If the extension configuration was not updated yet, the value is not existing
    $this->enableMetaphoneSearch = !isset($this->indexerConfig['enableMetaphoneSearch']) || $this->indexerConfig['enableMetaphoneSearch'];
    $this->storeMetaphoneInfoAsWords = !IndexedSearchUtility::isTableUsed('index_words') && $this->enableMetaphoneSearch;
    // Initialize external document parsers:
    // Example configuration, see ext_localconf.php of this file!
    if (!empty($this->conf['index_externals'])) {
        $this->initializeExternalParsers();
    }
    // Initialize lexer (class that deconstructs the text into words):
    $extensionSetup = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search'] ?? [];
    $lexerObjectClassName = ($extensionSetup['lexer'] ?? '') ?: Lexer::class;
    $this->lexerObj = GeneralUtility::makeInstance($lexerObjectClassName);
    $this->lexerObj->debug = $this->indexerConfig['debugMode'] ?? null;
    // Initialize metaphone hook:
    // Make sure that the hook is loaded _after_ indexed_search as this may overwrite the hook depending on the configuration.
    $metaphoneClassName = $extensionSetup['metaphone'] ?? '';
    if ($this->enableMetaphoneSearch && $metaphoneClassName) {
        $this->metaphoneObj = GeneralUtility::makeInstance($metaphoneClassName);
        $this->metaphoneObj->pObj = $this;
    }
}
477
/**
 * Initialize external parsers
 *
 * Creates one parser object per file extension registered in
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'], hands it a
 * back-reference to this indexer and keeps it only if its initParser() call succeeds.
 *
 * @internal
 * @see init()
 */
public function initializeExternalParsers()
{
    $registeredParsers = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'] ?? [];
    foreach ($registeredParsers as $extension => $className) {
        $parser = GeneralUtility::makeInstance($className);
        $parser->pObj = $this;
        // Register before initializing, so the entry is in place while initParser() runs
        $this->external_parsers[$extension] = $parser;
        // Init parser and if it returns FALSE, unset its entry again:
        if (!$parser->initParser($extension)) {
            unset($this->external_parsers[$extension]);
        }
    }
}
495
496 /********************************
497 *
498 * Indexing; TYPO3 pages (HTML content)
499 *
500 *******************************/
/**
 * Start indexing of the TYPO3 page
 *
 * Indexing happens when the mtime/timestamp check asks for it, when the current gr_list is not
 * yet registered for the phash, or when indexing is forced. If a page with the same content hash
 * is already indexed, only timestamp, set id, gr_list and rootline are updated.
 */
public function indexTypo3PageContent()
{
    $check = $this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']);
    $is_grlist = $this->is_grlist_set($this->hash['phash']);
    $indexingRequired = $check > 0 || !$is_grlist || $this->forceIndexing;
    if (!$indexingRequired) {
        $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
        return;
    }
    // Setting message:
    if ($this->forceIndexing) {
        $reason = 'Forced';
    } elseif ($check > 0) {
        $reason = $this->reasons[$check];
    } else {
        $reason = 'Updates gr_list!';
    }
    $this->log_setTSlogMessage('Indexing needed, reason: ' . $reason, 1);
    // Divide into title,keywords,description and body:
    $this->log_push('Split content', '');
    $this->contentParts = $this->splitHTMLContent($this->conf['content']);
    if ($this->conf['indexedDocTitle']) {
        $this->contentParts['title'] = $this->conf['indexedDocTitle'];
    }
    $this->log_pull();
    // Calculating a hash over what is to be the actual page content (title, description and keywords included,
    // so that a changed page title is noticed as well).
    $this->content_md5h = IndexedSearchUtility::md5inthash(implode('', $this->contentParts));
    // checkContentHash() tells whether a page with the very same content hash is already indexed. If so, the
    // content itself does not need to be indexed again, regardless of mtime; this also keeps pages from being
    // re-indexed for every logged-in fe_user when the content is the same anyway.
    $checkCHash = $this->checkContentHash();
    if (is_array($checkCHash) && $check !== 1) {
        // Update the timestamp
        $this->updateTstamp($this->hash['phash'], $this->conf['mtime']);
        $this->updateSetId($this->hash['phash']);
        // $checkCHash['phash'] is the phash of the result row that is similar to the current phash regarding the content hash.
        $this->update_grlist($checkCHash['phash'], $this->hash['phash']);
        $this->updateRootline();
        $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $this->content_md5h . ', has not changed. Timestamp, grlist and rootline updated if necessary.');
        return;
    }
    $Pstart = GeneralUtility::milliseconds();
    $this->log_push('Converting charset of content (' . $this->conf['metaCharset'] . ') to utf-8', '');
    $this->charsetEntity2utf8($this->contentParts, $this->conf['metaCharset']);
    $this->log_pull();
    // Splitting words
    $this->log_push('Extract words from content', '');
    $splitInWords = $this->processWordsInArrays($this->contentParts);
    $this->log_pull();
    // Analyze the indexed words.
    $this->log_push('Analyze the extracted words', '');
    $indexArr = $this->indexAnalyze($splitInWords);
    $this->log_pull();
    // Submitting page (phash) record
    $this->log_push('Submitting page', '');
    $this->submitPage();
    $this->log_pull();
    // Check words and submit to word list if not there
    $this->log_push('Check word list and submit words', '');
    if (IndexedSearchUtility::isTableUsed('index_words')) {
        $this->checkWordList($indexArr);
        $this->submitWords($indexArr, $this->hash['phash']);
    }
    $this->log_pull();
    // Set parsetime
    $this->updateParsetime($this->hash['phash'], GeneralUtility::milliseconds() - $Pstart);
    // Checking external files if configured for.
    $this->log_push('Checking external files', '');
    if ($this->conf['index_externals']) {
        $this->extractLinks($this->conf['content']);
    }
    $this->log_pull();
}
575
/**
 * Splits HTML content and returns an associative array, with title, a list of metatags, and a list of words in the body.
 *
 * Everything before the first "<body" is treated as the head part. If the content has no
 * "<body" at all, the whole content is searched for title and meta tags and the body stays empty.
 *
 * @param string $content HTML content to index. To some degree expected to be made by TYPO3 (ei. splitting the header by ":")
 * @return array Array of content, having keys "title", "body", "keywords" and "description" set.
 * @see splitRegularContent()
 */
public function splitHTMLContent($content)
{
    // divide head from body
    $contentArr = $this->defaultContentArray;
    $content = (string)$content;
    $bodyPart = stristr($content, '<body');
    if ($bodyPart === false) {
        // substr($content, 0, -0) would return an empty head here and lose title and meta tags
        $contentArr['body'] = '';
        $headPart = $content;
    } else {
        $contentArr['body'] = $bodyPart;
        $headPart = substr($content, 0, -strlen($bodyPart));
    }
    // get title; if the title contains ":" only the part after the first ":" is used
    $this->embracingTags($headPart, 'TITLE', $contentArr['title'], $dummy2, $dummy);
    $titleParts = explode(':', $contentArr['title'], 2);
    $contentArr['title'] = trim($titleParts[1] ?? $titleParts[0]);
    // get keywords and description metatags
    if ($this->conf['index_metatags']) {
        // Each successful call hands back the attribute string of the next meta tag and cuts
        // $headPart down to what follows that tag.
        while ($this->embracingTags($headPart, 'meta', $dummy, $headPart, $metaTagParams)) {
            $metaAttributes = GeneralUtility::get_tag_attributes($metaTagParams);
            // Meta tags without name/content (e.g. charset or http-equiv tags) are simply not matched
            $metaName = (string)($metaAttributes['name'] ?? '');
            $metaContent = (string)($metaAttributes['content'] ?? '');
            if (stripos($metaName, 'keywords') !== false) {
                $contentArr['keywords'] .= ',' . $this->addSpacesToKeywordList($metaContent);
            }
            if (stripos($metaName, 'description') !== false) {
                $contentArr['description'] .= ',' . $metaContent;
            }
        }
    }
    // Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags:
    $this->typoSearchTags($contentArr['body']);
    // Get rid of unwanted sections (ie. scripting and style stuff) in body
    $tagList = explode(',', $this->excludeSections);
    foreach ($tagList as $tag) {
        while ($this->embracingTags($contentArr['body'], $tag, $dummy, $contentArr['body'], $dummy2)) {
        }
    }
    // remove tags, but first make sure we don't concatenate words by doing it
    $contentArr['body'] = str_replace('<', ' <', $contentArr['body']);
    $contentArr['body'] = trim(strip_tags($contentArr['body']));
    $contentArr['keywords'] = trim($contentArr['keywords']);
    $contentArr['description'] = trim($contentArr['description']);
    // Return array
    return $contentArr;
}
627
/**
 * Extract the charset value from HTML meta tag.
 *
 * Only a <meta http-equiv="Content-Type" ...> tag is evaluated.
 *
 * @param string $content HTML content
 * @return string The charset value if found, otherwise an empty string.
 */
public function getHTMLcharset($content)
{
    $contentTypeTagPattern = '/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i';
    if (
        preg_match($contentTypeTagPattern, (string)$content, $reg)
        && preg_match('/charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)/i', $reg[0], $reg2)
    ) {
        return $reg2[1];
    }
    // Always return a string, so callers can pass the result on to string functions
    return '';
}
642
/**
 * Converts a HTML document to utf-8
 *
 * If the charset is unknown to mb_convert_encoding(), the content is left unconverted instead
 * of being lost; HTML entities are decoded in any case.
 *
 * @param string $content HTML content, any charset
 * @param string $charset Optional charset (otherwise extracted from HTML)
 * @return string Converted HTML
 */
public function convertHTMLToUtf8($content, $charset = '')
{
    // Find charset (getHTMLcharset() may yield nothing, hence the string cast):
    $charset = trim(strtolower((string)($charset ?: $this->getHTMLcharset($content))));
    // Convert charset:
    if ($charset && $charset !== 'utf-8') {
        try {
            $converted = mb_convert_encoding($content, 'utf-8', $charset);
        } catch (\ValueError $e) {
            // PHP 8 throws for unknown encodings; older versions return FALSE
            $converted = false;
        }
        if (is_string($converted)) {
            $content = $converted;
        }
    }
    // Convert entities, assuming document is now UTF-8:
    return html_entity_decode($content);
}
662
/**
 * Finds first occurrence of embracing tags and returns the embraced content and the original string with
 * the tag removed in the two passed variables. Returns FALSE if no match found. ie. useful for finding
 * <title> of document or removing <script>-sections
 *
 * The search is case-insensitive. If the start tag is found but the end tag is not, $tagContent
 * is set to an empty string and $stringAfter to whatever follows the start tag.
 *
 * @param string $string String to search in
 * @param string $tagName Tag name, eg. "script"
 * @param string $tagContent Passed by reference: Content inside found tag
 * @param string $stringAfter Passed by reference: Content after found tag
 * @param string $paramList Passed by reference: Attributes of the found tag.
 * @return bool Returns FALSE if tag was not found, otherwise TRUE.
 */
public function embracingTags($string, $tagName, &$tagContent, &$stringAfter, &$paramList)
{
    $string = (string)$string;
    $endTag = '</' . $tagName . '>';
    $startTag = '<' . $tagName;
    // stristr used because we want a case-insensitive search for the tag.
    $fromStartTag = stristr($string, $startTag);
    // if the tag was not found, return FALSE
    if (!$fromStartTag) {
        return false;
    }
    // Split "attributes>rest"; an unterminated start tag has no rest
    $tagParts = explode('>', (string)substr($fromStartTag, strlen($startTag)), 2);
    $paramList = $tagParts[0];
    $afterStartTag = $tagParts[1] ?? '';
    $fromEndTag = stristr($afterStartTag, $endTag);
    if ($fromEndTag) {
        // Everything in front of the first start tag is kept as is
        $stringBefore = substr($string, 0, strlen($string) - strlen($fromStartTag));
        $tagContent = substr($afterStartTag, 0, strlen($afterStartTag) - strlen($fromEndTag));
        $stringAfter = $stringBefore . substr($fromEndTag, strlen($endTag));
    } else {
        $tagContent = '';
        $stringAfter = $afterStartTag;
    }
    return true;
}
697
/**
 * Removes content that shouldn't be indexed according to TYPO3SEARCH-tags.
 *
 * The body is split at every "<!--TYPO3SEARCH_" marker. Text following a "begin" marker is kept;
 * at an "end" marker, a pending piece that was not introduced by any marker (the text in front
 * of the first marker) is kept as well. If no marker is found, $body is left untouched.
 *
 * @param string $body HTML Content, passed by reference
 * @return bool Returns TRUE if a TYPOSEARCH_ tag was found, otherwise FALSE.
 */
public function typoSearchTags(&$body)
{
    $expBody = preg_split('/\\<\\!\\-\\-[\\s]?TYPO3SEARCH_/', (string)$body);
    if (!is_array($expBody) || count($expBody) <= 1) {
        return false;
    }
    $body = '';
    // Piece that is only included if an "end" marker follows it
    $prev = '';
    foreach ($expBody as $val) {
        $part = explode('-->', $val, 2);
        $marker = trim($part[0]);
        if ($marker === 'begin') {
            // A marker without closing "-->" contributes nothing
            $body .= $part[1] ?? '';
            $prev = '';
        } elseif ($marker === 'end') {
            $body .= $prev;
        } else {
            $prev = $val;
        }
    }
    return true;
}
724
/**
 * Extract links (hrefs) from HTML content and if indexable media is found, it is indexed.
 *
 * Links with a URL scheme are handed to indexExternalUrl() (if "indexExternalURLs" is enabled),
 * links to existing local files are either indexed directly or, when the crawler is used for
 * external files, added to the crawler queue. Links with a query string are ignored, except
 * that a "jumpurl" parameter is resolved to its target first.
 *
 * @param string $content HTML content
 */
public function extractLinks($content)
{
    // Get links:
    $list = $this->extractHyperLinks($content);
    // Stays NULL unless files are to be queued for the crawler
    $crawler = null;
    if (!empty($this->indexerConfig['useCrawlerForExternalFiles']) && \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler')) {
        $crawler = GeneralUtility::makeInstance(\tx_crawler_lib::class);
    }
    // Traverse links:
    foreach ($list as $linkInfo) {
        // localPath means: This file is sent by a download script. While the indexed URL has to point to
        // $linkInfo['href'], the absolute path to the file is specified here!
        $hasLocalPath = !empty($linkInfo['localPath']);
        // Decode entities:
        $linkSource = htmlspecialchars_decode($hasLocalPath ? $linkInfo['localPath'] : $linkInfo['href']);
        // Parse URL (parse_url() returns FALSE for seriously malformed URLs):
        $qParts = parse_url($linkSource) ?: [];
        // Check for jumpurl (TYPO3 specific thing...)
        if (!empty($qParts['query']) && strpos($qParts['query'], 'jumpurl=') !== false) {
            parse_str($qParts['query'], $getP);
            $linkSource = $getP['jumpurl'] ?? '';
            $qParts = parse_url($linkSource) ?: [];
        }
        if (!$hasLocalPath && !empty($qParts['scheme'])) {
            if (!empty($this->indexerConfig['indexExternalURLs'])) {
                // Index external URL (http or otherwise)
                $this->indexExternalUrl($linkSource);
            }
            continue;
        }
        if (!empty($qParts['query'])) {
            continue;
        }
        $linkSource = urldecode($linkSource);
        if (GeneralUtility::isAllowedAbsPath($linkSource)) {
            $localFile = $linkSource;
        } else {
            $localFile = GeneralUtility::getFileAbsFileName(Environment::getPublicPath() . '/' . $linkSource);
        }
        if (!$localFile || !@is_file($localFile)) {
            continue;
        }
        // Index local file:
        if (is_object($crawler)) {
            $params = ['document' => $linkSource];
            if ($hasLocalPath) {
                $params['alturl'] = $linkInfo['href'];
            }
            // The page content is not needed in the queue entry
            $params['conf'] = $this->conf;
            unset($params['conf']['content']);
            $crawler->addQueueEntry_callBack(0, $params, Hook\CrawlerFilesHook::class, $this->conf['id']);
            $this->log_setTSlogMessage('media "' . $params['document'] . '" added to "crawler" queue.', 1);
        } elseif ($hasLocalPath) {
            $ext = strtolower(pathinfo($linkSource, PATHINFO_EXTENSION));
            $this->indexRegularDocument($linkInfo['href'], false, $linkSource, $ext);
        } else {
            $this->indexRegularDocument($linkSource);
        }
    }
}
800
/**
 * Extracts all links to external documents from the HTML content string
 *
 * <a> tags without href and links whose href starts with "#" are skipped.
 *
 * @param string $html
 * @return array Array of hyperlinks (keys: tag, href, localPath (empty if not local))
 * @see extractLinks()
 */
public function extractHyperLinks($html)
{
    $htmlParser = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Html\HtmlParser::class);
    $htmlParts = $htmlParser->splitTags('a', $html);
    $hyperLinksData = [];
    foreach ($htmlParts as $index => $tagData) {
        // Only the odd elements of the split result are evaluated as tags
        if ($index % 2 === 0) {
            continue;
        }
        $tagAttributes = $htmlParser->get_tag_attributes($tagData, true);
        $firstTagName = $htmlParser->getFirstTagName($tagData);
        if (strtolower($firstTagName) !== 'a') {
            continue;
        }
        // Anchors like <a name="..."> carry no href at all
        $href = $tagAttributes[0]['href'] ?? '';
        if (!$href || $href[0] === '#') {
            continue;
        }
        $hyperLinksData[] = [
            'tag' => $tagData,
            'href' => $href,
            'localPath' => $this->createLocalPath($href)
        ];
    }
    return $hyperLinksData;
}
830
/**
 * Extracts the "base href" from content string.
 *
 * The href of the first <base> tag that has a non-empty href is returned.
 *
 * @param string $html Content to analyze
 * @return string The base href or an empty string if not found
 */
public function extractBaseHref($html)
{
    $htmlParser = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Html\HtmlParser::class);
    $htmlParts = $htmlParser->splitTags('base', $html);
    foreach ($htmlParts as $index => $tagData) {
        // Only the odd elements of the split result are evaluated as tags
        if ($index % 2 === 0) {
            continue;
        }
        $tagAttributes = $htmlParser->get_tag_attributes($tagData, true);
        $firstTagName = $htmlParser->getFirstTagName($tagData);
        if (strtolower($firstTagName) !== 'base') {
            continue;
        }
        // A <base> tag may lack the href attribute (e.g. only "target" is set)
        $href = $tagAttributes[0]['href'] ?? '';
        if ($href) {
            return $href;
        }
    }
    return '';
}
856
857 /******************************************
858 *
859 * Indexing; external URL
860 *
861 ******************************************/
/**
 * Index External URLs HTML content
 *
 * Fetches the headers first and only downloads and indexes the document when
 * its content type contains "text/html". The downloaded content is written to
 * a temporary file which is handed to indexRegularDocument() and removed
 * afterwards.
 *
 * @param string $externalUrl URL, eg. "http://typo3.org/"
 * @see indexRegularDocument()
 */
public function indexExternalUrl($externalUrl)
{
    // Get headers:
    $urlHeaders = $this->getUrlHeaders($externalUrl);
    if (!is_array($urlHeaders)) {
        // No answer from the remote side, nothing to index
        return;
    }
    // HTTP header names are case-insensitive, so normalize before the lookup
    $urlHeaders = array_change_key_case($urlHeaders, CASE_LOWER);
    $contentType = (string)($urlHeaders['content-type'] ?? '');
    if (stripos($contentType, 'text/html') === false) {
        return;
    }
    $content = ($this->indexExternalUrl_content = GeneralUtility::getUrl($externalUrl));
    if ((string)$content === '') {
        return;
    }
    // Create temporary file:
    $tmpFile = GeneralUtility::tempnam('EXTERNAL_URL');
    if (!$tmpFile) {
        return;
    }
    GeneralUtility::writeFile($tmpFile, $content);
    try {
        // Index that file:
        // Using "TRUE" for second parameter to force indexing of external URLs (mtime doesn't make sense, does it?)
        $this->indexRegularDocument($externalUrl, true, $tmpFile, 'html');
    } finally {
        // Always clean up, even if indexing throws
        unlink($tmpFile);
    }
}
887
/**
 * Getting HTTP request headers of URL
 *
 * Each non-empty line of the response is split at the first colon; the part
 * before it becomes the array key, the remainder (not trimmed) the value.
 *
 * @param string $url The URL
 * @return mixed If no answer, returns FALSE. Otherwise an array where HTTP headers are keys
 */
public function getUrlHeaders($url)
{
    // Try to get the headers only
    $content = GeneralUtility::getUrl($url, 2);
    if ((string)$content === '') {
        // Explicitly return the documented FALSE instead of falling off the end
        return false;
    }
    // Compile headers:
    $retVal = [];
    foreach (GeneralUtility::trimExplode(LF, $content, true) as $line) {
        if (trim($line) === '') {
            break;
        }
        // Lines without a colon (e.g. an HTTP status line) yield only one part;
        // pad so they are still stored as a key with a NULL value.
        list($headKey, $headValue) = array_pad(explode(':', $line, 2), 2, null);
        $retVal[$headKey] = $headValue;
    }
    return $retVal;
}
912
/**
 * Resolves a link target to a local file, if it points to one.
 *
 * The individual resolver methods are tried in a fixed order and the first
 * one that yields a non-empty path wins.
 *
 * @param string $sourcePath
 * @return string Absolute path to file if file is local, else empty string
 */
protected function createLocalPath($sourcePath)
{
    $resolverMethods = [
        'createLocalPathUsingAbsRefPrefix',
        'createLocalPathUsingDomainURL',
        'createLocalPathFromAbsoluteURL',
        'createLocalPathFromRelativeURL'
    ];
    $resolvedPath = '';
    foreach ($resolverMethods as $resolverMethod) {
        $resolvedPath = $this->{$resolverMethod}($sourcePath);
        if ($resolvedPath != '') {
            // First hit wins, remaining resolvers are not consulted
            return $resolvedPath;
        }
    }
    return $resolvedPath;
}
936
/**
 * Attempts to create a local file path by matching a current request URL.
 *
 * If $sourcePath starts with the site URL, the remainder is appended to the
 * public path and checked with isAllowedLocalFile().
 *
 * @param string $sourcePath
 * @return string Local path, or an empty string if the URL does not match or the file is not allowed
 */
protected function createLocalPathUsingDomainURL($sourcePath)
{
    $baseURL = (string)GeneralUtility::getIndpEnv('TYPO3_SITE_URL');
    // An empty base URL must never match: strpos() with an empty needle warns
    // on PHP 7 and matches every string on PHP 8.
    if ($baseURL === '' || strpos($sourcePath, $baseURL) !== 0) {
        return '';
    }
    $localPath = Environment::getPublicPath() . '/' . substr($sourcePath, strlen($baseURL));
    return self::isAllowedLocalFile($localPath) ? $localPath : '';
}
957
/**
 * Attempts to create a local file path by matching absRefPrefix. This
 * requires TSFE. If TSFE is missing, this function does nothing.
 *
 * @param string $sourcePath
 * @return string Local path, or an empty string if the prefix does not match or the file is not allowed
 */
protected function createLocalPathUsingAbsRefPrefix($sourcePath)
{
    if (!isset($GLOBALS['TSFE']) || !($GLOBALS['TSFE'] instanceof TypoScriptFrontendController)) {
        return '';
    }
    // absRefPrefix may not be configured at all; treat that like an empty prefix
    $absRefPrefix = (string)($GLOBALS['TSFE']->config['config']['absRefPrefix'] ?? '');
    if ($absRefPrefix === '' || strpos($sourcePath, $absRefPrefix) !== 0) {
        return '';
    }
    $localPath = Environment::getPublicPath() . '/' . substr($sourcePath, strlen($absRefPrefix));
    return self::isAllowedLocalFile($localPath) ? $localPath : '';
}
981
/**
 * Attempts to create a local file path from the absolute URL without
 * schema.
 *
 * Only paths starting with "/" are handled; the leading slash is dropped and
 * the rest is resolved below the public path.
 *
 * @param string $sourcePath
 * @return string Local path, or an empty string if not applicable or not allowed
 */
protected function createLocalPathFromAbsoluteURL($sourcePath)
{
    // Guard against an empty string before looking at its first character
    if ((string)$sourcePath === '' || $sourcePath[0] !== '/') {
        return '';
    }
    $localPath = Environment::getPublicPath() . '/' . substr($sourcePath, 1);
    return self::isAllowedLocalFile($localPath) ? $localPath : '';
}
1001
/**
 * Tries to map a relative URL onto a file below the public web path.
 *
 * @param string $sourcePath
 * @return string Local path, or an empty string if the URL is not relative or the file is not allowed
 */
protected function createLocalPathFromRelativeURL($sourcePath)
{
    if (!self::isRelativeURL($sourcePath)) {
        return '';
    }
    $candidate = Environment::getPublicPath() . '/' . $sourcePath;
    if (self::isAllowedLocalFile($candidate)) {
        return $candidate;
    }
    return '';
}
1019
/**
 * Checks if URL is relative.
 *
 * A URL counts as relative when it has no scheme and its path (if any) does
 * not start with "/". URLs that parse_url() cannot parse at all are reported
 * as not relative.
 *
 * @param string $url
 * @return bool
 */
protected static function isRelativeURL($url)
{
    $urlParts = parse_url((string)$url);
    if (!is_array($urlParts)) {
        // Seriously malformed URL: parse_url() returned FALSE
        return false;
    }
    $hasScheme = isset($urlParts['scheme']) && $urlParts['scheme'] !== '';
    // "path" is absent for URLs such as "?id=1" or "#anchor"
    $path = $urlParts['path'] ?? '';
    return !$hasScheme && strpos($path, '/') !== 0;
}
1031
/**
 * Checks if the path points to the file inside the web site
 *
 * The path is normalized with resolveBackPath() first, then it must lie below
 * the public path and be an existing regular file.
 *
 * @param string $filePath
 * @return bool
 */
protected static function isAllowedLocalFile($filePath)
{
    $filePath = GeneralUtility::resolveBackPath($filePath);
    // Compare against the public path *including* a trailing slash; a bare
    // prefix match would also accept sibling directories such as
    // "/var/www/html-private" for a public path of "/var/www/html".
    $publicPathPrefix = rtrim(Environment::getPublicPath(), '/') . '/';
    return strpos($filePath, $publicPathPrefix) === 0 && is_file($filePath);
}
1045
1046 /******************************************
1047 *
1048 * Indexing; external files (PDF, DOC, etc)
1049 *
1050 ******************************************/
/**
 * Indexing a regular document given as $file (relative to public web path, local file)
 *
 * For every content part reported by fileContentParts() the method decides via
 * checkMtimeTstamp() (or $force) whether indexing is needed, reads the content
 * through the external parser, and submits the page record and words. A
 * section record is ensured for each part regardless of whether it was indexed.
 *
 * @param string $file Relative Filename, relative to public web path. It can also be an absolute path as long as it is inside the lockRootPath (validated with \TYPO3\CMS\Core\Utility\GeneralUtility::isAbsPath()). Finally, if $contentTmpFile is set, this value can be anything, most likely a URL
 * @param bool $force If set, indexing is forced (despite content hashes, mtime etc).
 * @param string $contentTmpFile Temporary file with the content to read it from (instead of $file). Used when the $file is a URL.
 * @param string $altExtension File extension for temporary file.
 */
public function indexRegularDocument($file, $force = false, $contentTmpFile = '', $altExtension = '')
{
    // Init
    $fI = pathinfo($file);
    // pathinfo() omits the "extension" key for names without a dot
    $ext = $altExtension ?: strtolower($fI['extension'] ?? '');
    // Create abs-path:
    if (!$contentTmpFile) {
        if (!GeneralUtility::isAbsPath($file)) {
            // Relative, prepend public web path:
            $absFile = GeneralUtility::getFileAbsFileName(Environment::getPublicPath() . '/' . $file);
        } else {
            // Absolute, pass-through:
            $absFile = $file;
        }
        $absFile = GeneralUtility::isAllowedAbsPath($absFile) ? $absFile : '';
    } else {
        $absFile = $contentTmpFile;
    }
    if (!$absFile || !@is_file($absFile)) {
        $this->log_setTSlogMessage('Indexing not possible; File "' . $absFile . '" not found or valid.');
        return;
    }
    // No parser registered for this extension
    if (empty($this->external_parsers[$ext])) {
        $this->log_setTSlogMessage('Indexing not possible; The extension "' . $ext . '" was not supported.');
        return;
    }
    // Indexing the document:
    $fileInfo = stat($absFile);
    $cParts = $this->fileContentParts($ext, $absFile);
    foreach ($cParts as $cPKey) {
        $this->internal_log = [];
        $this->log_push('Index: ' . str_replace('.', '_', PathUtility::basename($file)) . ($cPKey ? '#' . $cPKey : ''), '');
        $Pstart = GeneralUtility::milliseconds();
        $subinfo = ['key' => $cPKey];
        // Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
        $phash_arr = ($this->file_phash_arr = $this->setExtHashes($file, $subinfo));
        $check = $this->checkMtimeTstamp($fileInfo['mtime'], $phash_arr['phash']);
        if ($check > 0 || $force) {
            if ($check > 0) {
                $this->log_setTSlogMessage('Indexing needed, reason: ' . $this->reasons[$check], 1);
            } else {
                $this->log_setTSlogMessage('Indexing forced by flag', 1);
            }
            // Check external file counter:
            if ($this->externalFileCounter < $this->maxExternalFiles || $force) {
                // Divide into title,keywords,description and body:
                $this->log_push('Split content', '');
                $contentParts = $this->readFileContent($ext, $absFile, $cPKey);
                $this->log_pull();
                if (is_array($contentParts)) {
                    // Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
                    // Note: separator first - the reversed argument order was removed in PHP 8.0
                    $content_md5h = IndexedSearchUtility::md5inthash(implode('', $contentParts));
                    if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force) {
                        // Increment counter:
                        $this->externalFileCounter++;
                        // Splitting words
                        $this->log_push('Extract words from content', '');
                        $splitInWords = $this->processWordsInArrays($contentParts);
                        $this->log_pull();
                        // Analyze the indexed words.
                        $this->log_push('Analyze the extracted words', '');
                        $indexArr = $this->indexAnalyze($splitInWords);
                        $this->log_pull();
                        // Submitting page (phash) record
                        $this->log_push('Submitting page', '');
                        // Unfortunately I cannot determine WHEN a file is originally made - so I must return the modification time...
                        $this->submitFilePage($phash_arr, $file, $subinfo, $ext, $fileInfo['mtime'], $fileInfo['ctime'], $fileInfo['size'], $content_md5h, $contentParts);
                        $this->log_pull();
                        // Check words and submit to word list if not there
                        $this->log_push('Check word list and submit words', '');
                        if (IndexedSearchUtility::isTableUsed('index_words')) {
                            $this->checkWordList($indexArr);
                            $this->submitWords($indexArr, $phash_arr['phash']);
                        }
                        $this->log_pull();
                        // Set parsetime
                        $this->updateParsetime($phash_arr['phash'], GeneralUtility::milliseconds() - $Pstart);
                    } else {
                        // Update the timestamp
                        $this->updateTstamp($phash_arr['phash'], $fileInfo['mtime']);
                        $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $content_md5h . ', has not changed. Timestamp updated.');
                    }
                } else {
                    $this->log_setTSlogMessage('Could not index file! Unsupported extension.');
                }
            } else {
                $this->log_setTSlogMessage('The limit of ' . $this->maxExternalFiles . ' has already been exceeded, so no indexing will take place this time.');
            }
        } else {
            $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
        }
        // Checking and setting sections:
        // Setting a section-record for the file. This is done also if the file is not indexed. Notice that section records are deleted when the page is indexed.
        $this->submitFile_section($phash_arr['phash']);
        $this->log_pull();
    }
}
1156
/**
 * Delegates reading of an external file to the parser registered for its extension.
 * The content from the external parser MUST be returned in utf-8!
 *
 * @param string $fileExtension File extension, eg. "pdf", "doc" etc.
 * @param string $absoluteFileName Absolute filename of file (must exist and be validated OK before calling function)
 * @param string $sectionPointer Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be split.)
 * @return array|null Standard content array (title, description, keywords, body keys), NULL if no parser object is registered
 */
public function readFileContent($fileExtension, $absoluteFileName, $sectionPointer)
{
    $parser = $this->external_parsers[$fileExtension];
    if (!is_object($parser)) {
        // No usable parser for this extension
        return null;
    }
    return $parser->readFileContent($fileExtension, $absoluteFileName, $sectionPointer);
}
1175
/**
 * Asks the parser registered for the extension into which sections the
 * document should be divided.
 *
 * @param string $ext File extension
 * @param string $absFile Absolute filename (must exist and be validated OK before calling function)
 * @return array Array of pointers to sections that the document should be divided into; [0] if no parser object is registered
 */
public function fileContentParts($ext, $absFile)
{
    $parser = $this->external_parsers[$ext];
    if (!is_object($parser)) {
        // Without a parser the document is treated as one single part
        return [0];
    }
    return $parser->fileContentParts($ext, $absFile);
}
1192
/**
 * Wraps non-HTML content (from external files for instance) into the
 * standard content array.
 *
 * Starts from a copy of $this->defaultContentArray and stores the given
 * content under the "body" key.
 *
 * @param string $content Input content (non-HTML) to index.
 * @return array Content array based on the default content array, with the key "body" set to $content
 * @see splitHTMLContent()
 */
public function splitRegularContent($content)
{
    // Arrays are copied by value, the default array itself stays untouched
    $parts = $this->defaultContentArray;
    $parts['body'] = $content;

    return $parts;
}
1206
1207 /**********************************
1208 *
1209 * Analysing content, Extracting words
1210 *
1211 **********************************/
/**
 * Convert character set and HTML entities in the value of input content array keys
 *
 * Empty values are left untouched. All other values are converted to utf-8
 * (unless the input charset already is utf-8) and have their HTML entities
 * decoded.
 *
 * @param array $contentArr Standard content array, modified in place
 * @param string $charset Charset of the input content (converted to utf-8)
 */
public function charsetEntity2utf8(&$contentArr, $charset)
{
    // Charset names are case-insensitive ("UTF-8" vs. "utf-8")
    $needsConversion = strtolower((string)$charset) !== 'utf-8';
    foreach ($contentArr as $key => $value) {
        if ((string)$value === '') {
            continue;
        }
        // Convert charset if necessary
        if ($needsConversion) {
            $value = mb_convert_encoding($value, 'utf-8', $charset);
        }
        // Decode all numeric / html-entities in the string to real characters.
        // Flags and charset are given explicitly so the result does not depend
        // on the PHP version's defaults (single quotes were not decoded before
        // PHP 8.1) or on the default_charset ini setting.
        $contentArr[$key] = html_entity_decode($value, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML401, 'UTF-8');
    }
}
1231
/**
 * Turns every part of the content array into a list of words.
 *
 * Each value is run through the lexer's split2Words(); afterwards duplicates
 * are removed from the title, keywords and description parts (the body keeps
 * its duplicates).
 *
 * @param array $contentArr Array of content to index, see splitHTMLContent() and splitRegularContent()
 * @return array Content input array modified so each key is now an array of words
 */
public function processWordsInArrays($contentArr)
{
    // Split all parts to words
    foreach (array_keys($contentArr) as $partName) {
        $contentArr[$partName] = $this->lexerObj->split2Words($contentArr[$partName]);
    }
    // For title, keywords, and description we don't want duplicates
    foreach (['title', 'keywords', 'description'] as $headerPart) {
        $contentArr[$headerPart] = array_unique($contentArr[$headerPart]);
    }
    return $contentArr;
}
1251
/**
 * Extracts the sample description text from the content array.
 *
 * Whitespace runs in the body are collapsed to single spaces and the result
 * is cut to the length configured in $this->conf['index_descrLgd'] (forced
 * into the range 0-255, falling back to 200).
 *
 * @param array $contentArr Content array
 * @return string Description string, empty if the resulting maximum length is 0
 */
public function bodyDescription($contentArr)
{
    // Always defined, also when no description is to be generated
    $bodyDescription = '';
    // Setting description
    $maxL = MathUtility::forceIntegerInRange($this->conf['index_descrLgd'] ?? null, 0, 255, 200);
    if ($maxL) {
        // preg_replace() yields NULL on failure (e.g. invalid UTF-8 with the /u modifier)
        $bodyDescription = (string)preg_replace('/\s+/u', ' ', (string)($contentArr['body'] ?? ''));
        // Shorten the string:
        $bodyDescription = mb_strcut($bodyDescription, 0, $maxL, 'utf-8');
    }
    return $bodyDescription;
}
1269
/**
 * Builds the index array from the word lists of a content array.
 *
 * Title, keywords and description are analyzed as header information with
 * the offsets 7, 6 and 5 respectively, followed by the body.
 *
 * @param array $content Standard content array: an array with the keys title,keywords,description and body, which all contain an array of words.
 * @return array Index array as filled by analyzeHeaderinfo() and analyzeBody()
 */
public function indexAnalyze($content)
{
    $indexArr = [];
    $headerOffsets = [
        'title' => 7,
        'keywords' => 6,
        'description' => 5
    ];
    foreach ($headerOffsets as $contentKey => $bitOffset) {
        $this->analyzeHeaderinfo($indexArr, $content, $contentKey, $bitOffset);
    }
    $this->analyzeBody($indexArr, $content);

    return $indexArr;
}
1285
/**
 * Calculates relevant information for headercontent
 *
 * For every word of $content[$key] an entry in $retArr is created (hash and
 * metaphone) if missing, the bit 2^$offset is set in its "cmp" mask and its
 * occurrence counter as well as $this->wordcount are increased.
 *
 * @param array $retArr Index array, passed by reference
 * @param array $content Standard content array
 * @param string $key Key from standard content array
 * @param int $offset Bit-wise priority to type
 */
public function analyzeHeaderinfo(&$retArr, $content, $key, $offset)
{
    foreach ($content[$key] ?? [] as $val) {
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        $val = substr($val, 0, 60);
        if (!isset($retArr[$val])) {
            // Word ID (wid)
            $retArr[$val]['hash'] = IndexedSearchUtility::md5inthash($val);
            // Metaphone value is also 60 only chars long
            $metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
            $retArr[$val]['metaphone'] = $metaphone;
        }
        // Build metaphone fulltext string (can be used for fulltext indexing)
        if ($this->storeMetaphoneInfoAsWords) {
            $this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
        }
        // Priority used for flagBitMask feature (see extension configuration).
        // "cmp" and "count" do not exist yet for a word seen the first time.
        $retArr[$val]['cmp'] = ($retArr[$val]['cmp'] ?? 0) | pow(2, $offset);
        // Increase number of occurrences
        $retArr[$val]['count'] = ($retArr[$val]['count'] ?? 0) + 1;
        $this->wordcount++;
    }
}
1317
/**
 * Calculates relevant information for bodycontent
 *
 * For every word of $content['body'] an entry in $retArr is created (position
 * of first occurrence, hash and metaphone) if missing, and its occurrence
 * counter as well as $this->wordcount are increased.
 *
 * @param array $retArr Index array, passed by reference
 * @param array $content Standard content array
 */
public function analyzeBody(&$retArr, $content)
{
    foreach ($content['body'] ?? [] as $key => $val) {
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        $val = substr($val, 0, 60);
        if (!isset($retArr[$val])) {
            // First occurrence (used for ranking results)
            $retArr[$val]['first'] = $key;
            // Word ID (wid)
            $retArr[$val]['hash'] = IndexedSearchUtility::md5inthash($val);
            // Metaphone value is also only 60 chars long
            $metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
            $retArr[$val]['metaphone'] = $metaphone;
        }
        // Build metaphone fulltext string (can be used for fulltext indexing)
        if ($this->storeMetaphoneInfoAsWords) {
            $this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
        }
        // Increase number of occurrences ("count" does not exist yet for a new word)
        $retArr[$val]['count'] = ($retArr[$val]['count'] ?? 0) + 1;
        $this->wordcount++;
    }
}
1347
/**
 * Computes the metaphone representation of a word.
 *
 * Uses $this->metaphoneObj if it is an object, otherwise PHP's native
 * metaphone() function. Unless the raw value is requested, the value is
 * turned into an integer hash; an empty raw value results in 0.
 *
 * @param string $word Word to convert
 * @param bool $returnRawMetaphoneValue If set, returns the raw metaphone value (not hashed)
 * @return mixed Metaphone hash integer (or raw value, string)
 */
public function metaphone($word, $returnRawMetaphoneValue = false)
{
    // Prefer the configured metaphone object over the native PHP function
    $rawValue = is_object($this->metaphoneObj)
        ? $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid'])
        : metaphone($word);

    if ($returnRawMetaphoneValue) {
        return $rawValue;
    }
    if ($rawValue !== '') {
        // Create hash and return integer
        return IndexedSearchUtility::md5inthash($rawValue);
    }
    return 0;
}
1373
1374 /********************************
1375 *
1376 * SQL; TYPO3 Pages
1377 *
1378 *******************************/
/**
 * Writes the index records for the current TYPO3 page (not external media).
 *
 * Existing records for the page's phash are removed first. Then the
 * index_phash row, the section and gr_list records, the index_fulltext row
 * and - in debug mode - the index_debug row are inserted. Each table is only
 * written if IndexedSearchUtility::isTableUsed() reports it as used.
 */
public function submitPage()
{
    // Remove any current data for this phash:
    $this->removeOldIndexedPages($this->hash['phash']);

    // index_phash: the page record itself
    $phashRecord = [
        'phash' => $this->hash['phash'],
        'phash_grouping' => $this->hash['phash_grouping'],
        'static_page_arguments' => json_encode($this->conf['staticPageArguments']),
        'contentHash' => $this->content_md5h,
        'data_page_id' => $this->conf['id'],
        'data_page_type' => $this->conf['type'],
        'data_page_mp' => $this->conf['MP'],
        'gr_list' => $this->conf['gr_list'],
        // 0 = TYPO3 page
        'item_type' => 0,
        'item_title' => $this->contentParts['title'],
        'item_description' => $this->bodyDescription($this->contentParts),
        'item_mtime' => (int)$this->conf['mtime'],
        'item_size' => strlen($this->conf['content']),
        'tstamp' => $GLOBALS['EXEC_TIME'],
        'crdate' => $GLOBALS['EXEC_TIME'],
        // Creation date of page
        'item_crdate' => $this->conf['crdate'],
        // Sys language uid of the page. Should reflect which language it DOES actually display!
        'sys_language_uid' => $this->conf['sys_language_uid'],
        'externalUrl' => 0,
        'recordUid' => (int)$this->conf['recordUid'],
        'freeIndexUid' => (int)$this->conf['freeIndexUid'],
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId']
    ];
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash')
            ->insert('index_phash', $phashRecord);
    }

    // index_section
    $this->submit_section($this->hash['phash'], $this->hash['phash']);
    // index_grlist
    $this->submit_grlist($this->hash['phash'], $this->hash['phash']);

    // index_fulltext
    $fulltextRecord = [
        'phash' => $this->hash['phash'],
        'fulltextdata' => implode(' ', $this->contentParts),
        'metaphonedata' => $this->metaphoneContent
    ];
    $fullTextLimit = $this->indexerConfig['fullTextDataLength'];
    if ($fullTextLimit > 0) {
        // Cut the stored full text to the configured length
        $fulltextRecord['fulltextdata'] = substr($fulltextRecord['fulltextdata'], 0, $fullTextLimit);
    }
    if (IndexedSearchUtility::isTableUsed('index_fulltext')) {
        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_fulltext')
            ->insert('index_fulltext', $fulltextRecord);
    }

    // index_debug, only written in debug mode
    if (!$this->indexerConfig['debugMode']) {
        return;
    }
    $debugRecord = [
        'phash' => $this->hash['phash'],
        'debuginfo' => json_encode([
            'external_parsers initialized' => array_keys($this->external_parsers),
            'conf' => array_merge($this->conf, ['content' => substr($this->conf['content'], 0, 1000)]),
            'contentParts' => array_merge($this->contentParts, ['body' => substr($this->contentParts['body'], 0, 1000)]),
            'logs' => $this->internal_log,
            'lexer' => $this->lexerObj->debugString
        ])
    ];
    if (IndexedSearchUtility::isTableUsed('index_debug')) {
        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_debug')
            ->insert('index_debug', $debugRecord);
    }
}
1458
/**
 * Inserts an index_grlist record built from the current gr_list.
 *
 * Nothing is written if the index_grlist table is not in use.
 *
 * @param int $hash Search result record phash
 * @param int $phash_x Actual phash of current content
 * @see update_grlist()
 */
public function submit_grlist($hash, $phash_x)
{
    $grList = $this->conf['gr_list'];
    $grListRecord = [
        'phash' => $hash,
        'phash_x' => $phash_x,
        'hash_gr_list' => IndexedSearchUtility::md5inthash($grList),
        'gr_list' => $grList
    ];
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }
    GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist')
        ->insert('index_grlist', $grListRecord);
}
1481
/**
 * Inserts an index_section record for the current page id.
 * $hash and $hash_t3 are the same for TYPO3 pages, but different when it is external files.
 *
 * The record is passed through getRootLineFields() before it is written.
 * Nothing is written if the index_section table is not in use.
 *
 * @param int $hash phash of TYPO3 parent search result record
 * @param int $hash_t3 phash of the file indexation search record
 */
public function submit_section($hash, $hash_t3)
{
    $sectionRecord = [
        'phash' => $hash,
        'phash_t3' => $hash_t3,
        'page_id' => (int)$this->conf['id']
    ];
    // Receives the record by reference
    $this->getRootLineFields($sectionRecord);
    if (!IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }
    GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_section')
        ->insert('index_section', $sectionRecord);
}
1503
/**
 * Deletes all index records stored for the page identified by $phash.
 *
 * @param int $phash phash value to flush
 */
public function removeOldIndexedPages($phash)
{
    $phashValue = (int)$phash;
    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

    // Removing old registrations for all tables. Because the pages are TYPO3 pages
    // there can be nothing else than 1-1 relations here.
    foreach (['index_phash', 'index_section', 'index_grlist', 'index_fulltext', 'index_debug'] as $tableName) {
        if (!IndexedSearchUtility::isTableUsed($tableName)) {
            continue;
        }
        $connectionPool->getConnectionForTable($tableName)->delete($tableName, ['phash' => $phashValue]);
    }

    // Removing all index_section records with hash_t3 set to this hash (this includes such
    // records set for external media on the page as well!). The re-insert of these records
    // are done in indexRegularDocument($file).
    if (!IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }
    $connectionPool->getConnectionForTable('index_section')
        ->delete('index_section', ['phash_t3' => $phashValue]);
}
1529
1530 /********************************
1531 *
1532 * SQL; External media
1533 *
1534 *******************************/
/**
 * Updates db with information about the file
 *
 * Existing records for the file's phash are removed first. Then the
 * index_phash row, the index_fulltext row and - in debug mode - the
 * index_debug row are inserted. Each table is only written if
 * IndexedSearchUtility::isTableUsed() reports it as used.
 *
 * @param array $hash Array with phash and phash_grouping keys for file
 * @param string $file File name
 * @param array $subinfo Array of "static_page_arguments" for files: This is for instance the page index for a PDF file (other document types it will be a zero)
 * @param string $ext File extension determining the type of media.
 * @param int $mtime Modification time of file.
 * @param int $ctime Creation time of file.
 * @param int $size Size of file in bytes
 * @param int $content_md5h Content HASH value.
 * @param array $contentParts Standard content array (using only title and body for a file)
 */
public function submitFilePage($hash, $file, $subinfo, $ext, $mtime, $ctime, $size, $content_md5h, $contentParts)
{
    // Find item Type: use the parser's mapping if it has one, else the extension itself
    $storeItemType = ($this->external_parsers[$ext]->ext2itemtype_map[$ext] ?? '') ?: $ext;
    // Remove any current data for this phash:
    $this->removeOldIndexedFiles($hash['phash']);
    // Split filename. Local files have no "scheme" part, and parse_url() may
    // return FALSE for seriously malformed input.
    $fileParts = parse_url($file);
    $isExternalUrl = is_array($fileParts) && !empty($fileParts['scheme']);
    // Setting new
    $fields = [
        'phash' => $hash['phash'],
        'phash_grouping' => $hash['phash_grouping'],
        'static_page_arguments' => json_encode($subinfo),
        'contentHash' => $content_md5h,
        'data_filename' => $file,
        'item_type' => $storeItemType,
        // Fall back to the file name if the parser delivered no title
        'item_title' => trim((string)($contentParts['title'] ?? '')) ?: PathUtility::basename($file),
        'item_description' => $this->bodyDescription($contentParts),
        'item_mtime' => $mtime,
        'item_size' => $size,
        'item_crdate' => $ctime,
        'tstamp' => $GLOBALS['EXEC_TIME'],
        'crdate' => $GLOBALS['EXEC_TIME'],
        'gr_list' => $this->conf['gr_list'],
        'externalUrl' => $isExternalUrl ? 1 : 0,
        'recordUid' => (int)$this->conf['recordUid'],
        'freeIndexUid' => (int)$this->conf['freeIndexUid'],
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId'],
        'sys_language_uid' => (int)$this->conf['sys_language_uid']
    ];
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash');
        $connection->insert(
            'index_phash',
            $fields
        );
    }
    // PROCESSING index_fulltext
    $fields = [
        'phash' => $hash['phash'],
        'fulltextdata' => implode(' ', $contentParts),
        'metaphonedata' => $this->metaphoneContent
    ];
    if ($this->indexerConfig['fullTextDataLength'] > 0) {
        $fields['fulltextdata'] = substr($fields['fulltextdata'], 0, $this->indexerConfig['fullTextDataLength']);
    }
    if (IndexedSearchUtility::isTableUsed('index_fulltext')) {
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_fulltext');
        $connection->insert('index_fulltext', $fields);
    }
    // PROCESSING index_debug
    if ($this->indexerConfig['debugMode']) {
        $fields = [
            'phash' => $hash['phash'],
            'debuginfo' => json_encode([
                'static_page_arguments' => $subinfo,
                'contentParts' => array_merge($contentParts, ['body' => substr((string)($contentParts['body'] ?? ''), 0, 1000)]),
                'logs' => $this->internal_log,
                'lexer' => $this->lexerObj->debugString
            ])
        ];
        if (IndexedSearchUtility::isTableUsed('index_debug')) {
            $connection = GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('index_debug');
            $connection->insert('index_debug', $fields);
        }
    }
}
1619
/**
 * Stores a gr_list record for a file IF no matching one exists already.
 *
 * A record counts as matching when it belongs to the given phash and its
 * hash_gr_list equals the hash of either $this->defaultGrList or the current
 * gr_list. Does nothing if the index_grlist table is not in use.
 *
 * @param int $hash phash value of file
 */
public function submitFile_grlist($hash)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }

    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getQueryBuilderForTable('index_grlist');

    $matchesPhash = $queryBuilder->expr()->eq(
        'phash',
        $queryBuilder->createNamedParameter($hash, \PDO::PARAM_INT)
    );
    $matchesDefaultGrList = $queryBuilder->expr()->eq(
        'hash_gr_list',
        $queryBuilder->createNamedParameter(
            IndexedSearchUtility::md5inthash($this->defaultGrList),
            \PDO::PARAM_INT
        )
    );
    $matchesCurrentGrList = $queryBuilder->expr()->eq(
        'hash_gr_list',
        $queryBuilder->createNamedParameter(
            IndexedSearchUtility::md5inthash($this->conf['gr_list']),
            \PDO::PARAM_INT
        )
    );

    $existingRecords = (int)$queryBuilder->count('*')
        ->from('index_grlist')
        ->where(
            $matchesPhash,
            $queryBuilder->expr()->orX($matchesDefaultGrList, $matchesCurrentGrList)
        )
        ->execute()
        ->fetchColumn();

    // No need to place another record if one is there already
    if ($existingRecords === 0) {
        $this->submit_grlist($hash, $hash);
    }
}
1665
/**
 * Stores a section record for a file IF none exists for the file's phash on
 * the current page id.
 *
 * Does nothing if the index_section table is not in use.
 *
 * @param int $hash phash value of file
 */
public function submitFile_section($hash)
{
    if (!IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }

    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getQueryBuilderForTable('index_section');

    $matchesPhash = $queryBuilder->expr()->eq(
        'phash',
        $queryBuilder->createNamedParameter($hash, \PDO::PARAM_INT)
    );
    $matchesPage = $queryBuilder->expr()->eq(
        'page_id',
        $queryBuilder->createNamedParameter($this->conf['id'], \PDO::PARAM_INT)
    );

    // Testing if there is already a section
    $existingSections = (int)$queryBuilder->count('phash')
        ->from('index_section')
        ->where($matchesPhash, $matchesPage)
        ->execute()
        ->fetchColumn();

    if ($existingSections === 0) {
        // Section points from the file's phash to the phash of the current page
        $this->submit_section($hash, $this->hash['phash']);
    }
}
1699
/**
 * Deletes the index records stored for the file identified by $phash.
 *
 * Covers index_phash, index_grlist, index_fulltext and index_debug;
 * index_section is not touched here.
 *
 * @param int $phash phash value to flush
 */
public function removeOldIndexedFiles($phash)
{
    $phashValue = (int)$phash;
    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
    // Removing old registrations for tables.
    foreach (['index_phash', 'index_grlist', 'index_fulltext', 'index_debug'] as $tableName) {
        if (IndexedSearchUtility::isTableUsed($tableName)) {
            $connectionPool->getConnectionForTable($tableName)->delete($tableName, ['phash' => $phashValue]);
        }
    }
}
1717
1718 /********************************
1719 *
1720 * SQL Helper functions
1721 *
1722 *******************************/
/**
 * Decides, based on the index_phash row for $phash, whether the current page/file has to be (re-)indexed.
 *
 * Result codes (see $this->reasons):
 *   4: index_phash is not in use or holds no row for $phash
 *   1: tstamp_maxAge is set and the row's tstamp is older than that
 *  -2: tstamp_minAge is set and has not been exceeded yet
 *   3: minimum age passed (or not set) but no $mtime was given
 *   2: minimum age passed (or not set) and $mtime differs from the stored item_mtime
 *  -1: $mtime equals the stored item_mtime; the tstamp is refreshed unless tstamp_maxAge is set
 *
 * @param int $mtime mtime value to test against limits and indexed page (usually this is the mtime of the cached document)
 * @param int $phash "phash" used to select any already indexed page to see what its mtime is.
 * @return int Negative = no indexing, positive = do indexing
 */
public function checkMtimeTstamp($mtime, $phash)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        // Not indexed (not in index_phash)
        return 4;
    }

    $indexedRow = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->select(['item_mtime', 'tstamp'], 'index_phash', ['phash' => (int)$phash], [], [], 1)
        ->fetch();

    if (empty($indexedRow)) {
        // Page has never been indexed (is not represented in the index_phash table).
        return 4;
    }

    if ($this->tstamp_maxAge && $indexedRow['tstamp'] + $this->tstamp_maxAge < $GLOBALS['EXEC_TIME']) {
        // The configured max-age was exceeded for the document, so it is indexed.
        return 1;
    }

    if ($this->tstamp_minAge && !($indexedRow['tstamp'] + $this->tstamp_minAge < $GLOBALS['EXEC_TIME'])) {
        // The minimum age was not exceeded
        return -2;
    }

    // From here on: minAge is either not set or exceeded, so mtime decides.
    if (!$mtime) {
        // The minimum age was exceeded, but mtime was not set, so the page is indexed.
        return 3;
    }

    if ($indexedRow['item_mtime'] != $mtime) {
        // The minimum age was exceeded, mtime was set and differs from the indexed one: re-index.
        return 2;
    }

    // mtime matched the document, so no changes detected and no content updated
    if ($this->tstamp_maxAge) {
        $this->log_setTSlogMessage('mtime matched, timestamp NOT updated because a maxAge is set (' . ($indexedRow['tstamp'] + $this->tstamp_maxAge - $GLOBALS['EXEC_TIME']) . ' seconds to expire time).', 1);
    } else {
        $this->updateTstamp($phash);
        $this->log_setTSlogMessage('mtime matched, timestamp updated.', 1);
    }
    return -1;
}
1788
/**
 * Looks for an index_phash row with the current phash_grouping and content hash.
 *
 * The page therefore only needs indexing when its content differs from what
 * is already stored for the same "phash_grouping".
 *
 * @return mixed TRUE if the page needs to be indexed (no matching row, or index_phash is not in use), otherwise the matching row (an array with key "phash") to which the grlist record should be related
 */
public function checkContentHash()
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return true;
    }

    $matchingRow = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->select(
            ['phash'],
            'index_phash',
            [
                'phash_grouping' => (int)$this->hash['phash_grouping'],
                'contentHash' => (int)$this->content_md5h
            ],
            [],
            [],
            1
        )
        ->fetch();

    return empty($matchingRow) ? true : $matchingRow;
}
1819
/**
 * Tells whether an external document still has to be indexed, judged by its content hash.
 *
 * Counts the index_phash rows with the given phash_grouping and content hash.
 *
 * @param int $hashGr phash value to check (phash_grouping)
 * @param int $content_md5h Content hash to check
 * @return bool TRUE if no matching row exists (or index_phash is not in use), i.e. the document needs to be indexed
 */
public function checkExternalDocContentHash($hashGr, $content_md5h)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return true;
    }

    $connection = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash');
    $matches = (int)$connection->count(
        '*',
        'index_phash',
        [
            'phash_grouping' => (int)$hashGr,
            'contentHash' => (int)$content_md5h
        ]
    );

    return $matches === 0;
}
1847
/**
 * Tells whether index_grlist holds at least one row whose phash_x equals the given value
 * (the "real" phash of the current content, not the linked-to phash of the common search result page).
 *
 * @param int $phash_x Phash integer to test.
 * @return bool TRUE if such a row exists; FALSE otherwise or if index_grlist is not in use
 */
public function is_grlist_set($phash_x)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return false;
    }

    $connection = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist');
    $entries = (int)$connection->count('phash_x', 'index_grlist', ['phash_x' => (int)$phash_x]);

    return $entries > 0;
}
1870
/**
 * Makes sure a grlist entry exists for the given phash and the current gr_list, writing one if it is missing.
 *
 * @param int $phash phash of the search result that should be found
 * @param int $phash_x The real phash of the current content. The two values differ when a page with user login turns out to contain the exact same content as another, already indexed version of the page
 * @see submit_grlist()
 */
public function update_grlist($phash, $phash_x)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }

    $connection = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist');
    $existingEntries = (int)$connection->count(
        'phash',
        'index_grlist',
        [
            'phash' => (int)$phash,
            'hash_gr_list' => IndexedSearchUtility::md5inthash($this->conf['gr_list'])
        ]
    );

    if ($existingEntries !== 0) {
        return;
    }

    $this->submit_grlist($phash, $phash_x);
    $this->log_setTSlogMessage("Inserted gr_list '" . $this->conf['gr_list'] . "' for phash '" . $phash . "'", 1);
}
1898
/**
 * Sets the tstamp of an index_phash row to the current execution time,
 * optionally storing a new item_mtime as well.
 *
 * @param int $phash phash value
 * @param int $mtime If set (non-empty), the item_mtime field is updated to this value.
 */
public function updateTstamp($phash, $mtime = 0)
{
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $changedFields = $mtime
            ? ['tstamp' => $GLOBALS['EXEC_TIME'], 'item_mtime' => (int)$mtime]
            : ['tstamp' => $GLOBALS['EXEC_TIME']];

        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash');
        $connection->update('index_phash', $changedFields, ['phash' => (int)$phash]);
    }
}
1929
/**
 * Writes the configured free index set id ($this->conf['freeIndexSetId'])
 * into the index_phash row of the given phash.
 *
 * @param int $phash phash value
 */
public function updateSetId($phash)
{
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash');
        $connection->update(
            'index_phash',
            ['freeIndexSetId' => (int)$this->conf['freeIndexSetId']],
            ['phash' => (int)$phash]
        );
    }
}
1953
/**
 * Stores the parsetime in the index_phash row of the given phash.
 *
 * @param int $phash phash value.
 * @param int $parsetime Parsetime value to set.
 */
public function updateParsetime($phash, $parsetime)
{
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash');
        $connection->update(
            'index_phash',
            ['parsetime' => (int)$parsetime],
            ['phash' => (int)$phash]
        );
    }
}
1978
/**
 * Rewrites the root-line fields of all index_section rows of the current page
 * ($this->conf['id']) with the values delivered by getRootLineFields().
 */
public function updateRootline()
{
    if (IndexedSearchUtility::isTableUsed('index_section')) {
        // Filled by reference
        $rootLineValues = [];
        $this->getRootLineFields($rootLineValues);

        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_section');
        $connection->update(
            'index_section',
            $rootLineValues,
            ['page_id' => (int)$this->conf['id']]
        );
    }
}
2001
/**
 * Adding values for root-line fields.
 * rl0, rl1 and rl2 are standard. Additional fields can be registered in
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields']
 * as a map of field name => root-line level.
 *
 * A level that is not present in $this->conf['rootline_uids'] (for example
 * when the page sits less than three levels deep) is written as 0.
 *
 * @param array $fieldArray Field array, passed by reference
 */
public function getRootLineFields(array &$fieldArray)
{
    // Missing levels must not raise "undefined offset / array key" errors; they default to 0,
    // which is also what the (int) cast of a missing entry evaluated to before.
    $rootlineUids = $this->conf['rootline_uids'] ?? [];

    $fieldArray['rl0'] = (int)($rootlineUids[0] ?? 0);
    $fieldArray['rl1'] = (int)($rootlineUids[1] ?? 0);
    $fieldArray['rl2'] = (int)($rootlineUids[2] ?? 0);

    foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'] ?? [] as $fieldName => $rootLineLevel) {
        $fieldArray[$fieldName] = (int)($rootlineUids[$rootLineLevel] ?? 0);
    }
}
2017
2018 /********************************
2019 *
2020 * SQL; Submitting words
2021 *
2022 *******************************/
/**
 * Adds the words of the list that are not yet stored in index_words.
 *
 * First counts the index_words rows whose wid is among the hashes of the
 * list. Only if that count differs from the size of the list are the known
 * basewords fetched and removed from the list, and the remaining words inserted.
 *
 * @param array $wordListArray Word List array, keyed by word; each entry carries at least "hash" and "metaphone".
 */
public function checkWordList($wordListArray)
{
    if (!IndexedSearchUtility::isTableUsed('index_words') || empty($wordListArray)) {
        return;
    }

    $expectedCount = count($wordListArray);
    $wordIds = array_map('intval', array_column($wordListArray, 'hash'));

    $countQuery = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('index_words');
    $knownCount = (int)$countQuery->count('baseword')
        ->from('index_words')
        ->where(
            $countQuery->expr()->in(
                'wid',
                $countQuery->createNamedParameter($wordIds, Connection::PARAM_INT_ARRAY)
            )
        )
        ->execute()
        ->fetchColumn();

    if ($knownCount === $expectedCount) {
        // Every word of the list is already stored
        return;
    }

    $wordsConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('index_words');
    $lookupQuery = $wordsConnection->createQueryBuilder();
    $knownWords = $lookupQuery->select('baseword')
        ->from('index_words')
        ->where(
            $lookupQuery->expr()->in(
                'wid',
                $lookupQuery->createNamedParameter($wordIds, Connection::PARAM_INT_ARRAY)
            )
        )
        ->execute();

    $this->log_setTSlogMessage('Inserting words: ' . ($expectedCount - $knownCount), 1);

    // Drop every word that is already stored, so only new ones remain
    while ($knownWord = $knownWords->fetch()) {
        unset($wordListArray[$knownWord['baseword']]);
    }

    foreach ($wordListArray as $baseword => $wordInfo) {
        // Original author's note: a duplicate-key error will occur here if a stored word was NOT
        // removed from the list above. As long as the words are not longer than 60 chars
        // (the baseword varchar is said to be 60 characters) this is not a problem.
        $wordsConnection->insert(
            'index_words',
            [
                'wid' => $wordInfo['hash'],
                'baseword' => $baseword,
                'metaphone' => $wordInfo['metaphone']
            ]
        );
    }
}
2083
/**
 * Submits RELATIONS between words and phash.
 *
 * All index_rel rows of the phash are deleted and re-created from the word
 * list, leaving out every word whose wid is flagged as stop word in index_words.
 *
 * @param array $wordList Word list array; each entry carries "hash", "count", "first" and "cmp"
 * @param int $phash phash value
 */
public function submitWords($wordList, $phash)
{
    if (!IndexedSearchUtility::isTableUsed('index_rel')) {
        return;
    }

    $pool = GeneralUtility::makeInstance(ConnectionPool::class);

    // Collect the wids of all stop words as lookup keys
    $stopWordQuery = $pool->getQueryBuilderForTable('index_words');
    $stopWordResult = $stopWordQuery->select('wid')
        ->from('index_words')
        ->where(
            $stopWordQuery->expr()->neq('is_stopword', $stopWordQuery->createNamedParameter(0, \PDO::PARAM_INT))
        )
        ->groupBy('wid')
        ->execute();

    $stopWordIds = [];
    while ($stopWordRow = $stopWordResult->fetch()) {
        $stopWordIds[$stopWordRow['wid']] = true;
    }

    // Remove the previous relations of this phash before writing the new ones
    $pool->getConnectionForTable('index_rel')->delete('index_rel', ['phash' => (int)$phash]);

    $relationRows = [];
    foreach ($wordList as $wordInfo) {
        if (!isset($stopWordIds[$wordInfo['hash']])) {
            $relationRows[] = [
                (int)$phash,
                (int)$wordInfo['hash'],
                (int)$wordInfo['count'],
                (int)$wordInfo['first'],
                $this->freqMap($wordInfo['count'] / $this->wordcount),
                $wordInfo['cmp'] & $this->flagBitMask
            ];
        }
    }

    if (!empty($relationRows)) {
        $pool->getConnectionForTable('index_rel')->bulkInsert(
            'index_rel',
            $relationRows,
            ['phash', 'wid', 'count', 'first', 'freq', 'flags']
        );
    }
}
2132
/**
 * Maps a frequency between the real range [0;1] and the range [0;$this->freqRange].
 *
 * With the map factor $this->freqMax * 100 * $this->freqRange:
 * - a value <= 1 is multiplied by the factor and capped at $this->freqRange
 * - a value > 1 is divided by the factor (the way back)
 *
 * @param float $freq Frequency
 * @return int|float Mapped frequency; the value is not cast to int here
 */
public function freqMap($freq)
{
    $mapFactor = $this->freqMax * 100 * $this->freqRange;

    if ($freq <= 1) {
        $scaled = $freq * $mapFactor;
        if ($scaled > $this->freqRange) {
            return $this->freqRange;
        }
        return $scaled;
    }

    return $freq / $mapFactor;
}
2151
2152 /********************************
2153 *
2154 * Hashing
2155 *
2156 *******************************/
/**
 * Calculates the search hashes for a TYPO3 page and stores them in $this->hash.
 *
 * "phash_grouping" identifies a "page" as the combination of id, type, language,
 * mount point and static page arguments. "phash" additionally takes the gr_list
 * into account (a subdivision for page compositions that depend on the login; such
 * pages are expected to be similar regardless of the login).
 */
public function setT3Hashes()
{
    $groupingIdentity = [
        'id' => (int)$this->conf['id'],
        'type' => (int)$this->conf['type'],
        'sys_lang' => (int)$this->conf['sys_language_uid'],
        'MP' => (string)$this->conf['MP'],
        'staticPageArguments' => $this->conf['staticPageArguments'],
    ];
    $this->hash['phash_grouping'] = IndexedSearchUtility::md5inthash(serialize($groupingIdentity));

    // Same identity, extended by the gr_list as last element
    $pageIdentity = $groupingIdentity;
    $pageIdentity['gr_list'] = (string)$this->conf['gr_list'];
    $this->hash['phash'] = IndexedSearchUtility::md5inthash(serialize($pageIdentity));
}
2176
/**
 * Calculates the search hashes for an external file.
 *
 * "phash_grouping" is derived from the file alone, "phash" from the file plus $subinfo.
 *
 * @param string $file File name / path which identifies it on the server
 * @param array $subinfo Additional content identifying the (subpart of) content. For instance; PDF files are divided into groups of pages for indexing.
 * @return array Array with "phash_grouping" and "phash" inside.
 */
public function setExtHashes($file, $subinfo = [])
{
    $identity = ['file' => $file];
    $groupingHash = IndexedSearchUtility::md5inthash(serialize($identity));

    // The plain phash also covers the sub information
    $identity['subinfo'] = $subinfo;

    return [
        'phash_grouping' => $groupingHash,
        'phash' => IndexedSearchUtility::md5inthash(serialize($identity)),
    ];
}
2198
2199 /*********************************
2200 *
2201 * Internal logging functions
2202 *
2203 *********************************/
/**
 * Forwards a push() call to the time tracker held in $this->timeTracker.
 *
 * @param string $msg Title to set
 * @param string $key Key, handed over unchanged as second argument of push()
 */
public function log_push($msg, $key)
{
    $tracker = $this->timeTracker;
    $tracker->push($msg, $key);
}
2214
/**
 * Forwards a pull() call to the time tracker held in $this->timeTracker.
 */
public function log_pull()
{
    $tracker = $this->timeTracker;
    $tracker->pull();
}
2222
/**
 * Hands a log message to the time tracker and additionally appends it to $this->internal_log.
 *
 * @param string $msg Message to set
 * @param int $errorNum Error number
 */
public function log_setTSlogMessage($msg, $errorNum = 0)
{
    $tracker = $this->timeTracker;
    $tracker->setTSlogMessage($msg, $errorNum);

    // Keep a copy of every message on the indexer itself
    $this->internal_log[] = $msg;
}
2234
/**
 * Re-joins a comma-separated keyword list with ", " and surrounds the result
 * with a single space on each side, so that the keywords are space-separated.
 * This is important for displaying them properly as part of the fulltext index.
 *
 * @param string $keywordList Comma-separated keywords
 * @return string
 * @see http://forge.typo3.org/issues/14959
 */
protected function addSpacesToKeywordList($keywordList)
{
    $spacedList = implode(', ', GeneralUtility::trimExplode(',', $keywordList));
    return ' ' . $spacedList . ' ';
}
2248 }