[!!!][TASK] Remove last usages of $GLOBALS[T3_VAR]
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / Classes / Indexer.php
1 <?php
2 namespace TYPO3\CMS\IndexedSearch;
3
4 /*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use Psr\Http\Message\ServerRequestInterface;
18 use TYPO3\CMS\Core\Configuration\ExtensionConfiguration;
19 use TYPO3\CMS\Core\Context\Context;
20 use TYPO3\CMS\Core\Context\LanguageAspect;
21 use TYPO3\CMS\Core\Core\Environment;
22 use TYPO3\CMS\Core\Database\Connection;
23 use TYPO3\CMS\Core\Database\ConnectionPool;
24 use TYPO3\CMS\Core\Routing\PageArguments;
25 use TYPO3\CMS\Core\TimeTracker\TimeTracker;
26 use TYPO3\CMS\Core\Utility\GeneralUtility;
27 use TYPO3\CMS\Core\Utility\HttpUtility;
28 use TYPO3\CMS\Core\Utility\MathUtility;
29 use TYPO3\CMS\Core\Utility\PathUtility;
30 use TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController;
31 use TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility;
32
33 /**
34 * Indexing class for TYPO3 frontend
35 */
36 class Indexer
37 {
38
/**
 * Messages explaining the numeric result of the mtime/age check.
 * Positive keys are reasons why a document is indexed, negative keys are reasons why it is not.
 *
 * @var array
 */
public $reasons = [
    -1 => 'mtime matched the document, so no changes detected and no content updated',
    -2 => 'The minimum age was not exceeded',
    1 => 'The configured max-age was exceeded for the document and thus it\'s indexed.',
    2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
    3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
    4 => 'Page has never been indexed (is not represented in the index_phash table).'
];

/**
 * HTML code blocks to exclude from indexing (comma separated list of tag names)
 *
 * @var string
 */
public $excludeSections = 'script,style';

/**
 * Supported Extensions for external files:
 * External parser objects, keys are file extension names. Values are objects with certain methods.
 *
 * @var array
 */
public $external_parsers = [];

/**
 * Fe-group list (pages might be indexed separately for each usergroup combination to support search
 * in access limited pages!)
 *
 * @var string
 */
public $defaultGrList = '0,-1';

/**
 * Min/Max times:
 * If set, this tells a number of seconds that is the maximum age of an indexed document.
 * Regardless of mtime the document will be re-indexed if this limit is exceeded.
 *
 * @var int
 */
public $tstamp_maxAge = 0;

/**
 * If set, this tells a minimum limit (in seconds) before a document can be indexed again.
 * This is regardless of mtime.
 *
 * @var int
 */
public $tstamp_minAge = 0;

/**
 * Max number of external files to index.
 *
 * @var int
 */
public $maxExternalFiles = 0;

/**
 * If TRUE, indexing is forced despite of hashes etc.
 *
 * @var bool
 */
public $forceIndexing = false;

/**
 * Set when crawler is detected (internal)
 *
 * @var bool
 */
public $crawlerActive = false;

/**
 * Empty template for the parts a document is split into (title, description, keywords, body).
 *
 * @var array
 */
public $defaultContentArray = [
    'title' => '',
    'description' => '',
    'keywords' => '',
    'body' => ''
];

/**
 * @var int
 */
public $wordcount = 0;

/**
 * @var int
 */
public $externalFileCounter = 0;

/**
 * Configuration set internally (see init functions for required keys and their meaning)
 *
 * @var array
 */
public $conf = [];

/**
 * Indexer configuration, coming from TYPO3's system configuration for EXT:indexed_search
 *
 * @var array
 */
public $indexerConfig = [];

/**
 * Hash array, contains phash and phash_grouping
 *
 * @var array
 */
public $hash = [];

/**
 * Hash array for files
 *
 * @var array
 */
public $file_phash_arr = [];

/**
 * Content of TYPO3 page, split into its parts (see $defaultContentArray for the keys)
 *
 * @var array
 */
public $contentParts = [];

/**
 * Hash calculated over the concatenated content parts of the page
 *
 * @var string
 */
public $content_md5h = '';

/**
 * Internal log
 *
 * @var array
 */
public $internal_log = [];

/**
 * Content fetched for the external URL that is being indexed
 *
 * @var string
 */
public $indexExternalUrl_content = '';

/**
 * cHash params array
 *
 * @var array
 */
public $cHashParams = [];

/**
 * @var int
 */
public $freqRange = 32000;

/**
 * @var float
 */
public $freqMax = 0.1;

/**
 * @var bool
 */
public $enableMetaphoneSearch = false;

/**
 * @var bool
 */
public $storeMetaphoneInfoAsWords;

/**
 * @var string
 */
public $metaphoneContent = '';

/**
 * Metaphone object, if any
 *
 * @var \TYPO3\CMS\IndexedSearch\Utility\DoubleMetaPhoneUtility
 */
public $metaphoneObj;

/**
 * Lexer object for word splitting
 *
 * @var \TYPO3\CMS\IndexedSearch\Lexer
 */
public $lexerObj;

/**
 * Flag bit mask from the extension configuration, forced into the range 0-255 by init()
 *
 * @var int
 */
public $flagBitMask;

/**
 * Time tracker instance, resolved in the constructor
 *
 * @var TimeTracker
 */
protected $timeTracker;
241
/**
 * Creates the indexer and stores the time tracker instance on the object.
 */
public function __construct()
{
    $timeTracker = GeneralUtility::makeInstance(TimeTracker::class);
    $this->timeTracker = $timeTracker;
}
249
/**
 * Parent Object (TSFE) Initialization
 *
 * Decides whether the page rendered by the given frontend controller is to be indexed.
 * If it is, the internal configuration ($this->conf) is filled from the controller and
 * the page content is indexed. Every reason for NOT indexing is written to the TS log.
 *
 * @param TypoScriptFrontendController $pObj Parent Object, passed by reference
 */
public function hook_indexContent(&$pObj)
{
    // Indexer configuration from Extension Manager interface:
    $disableFrontendIndexing = (bool)GeneralUtility::makeInstance(ExtensionConfiguration::class)->get('indexed_search', 'disableFrontendIndexing');

    // Crawler activation:
    // Requirements are that the crawler is loaded, a crawler session is running and
    // re-indexing requested as processing instruction. The crawler data is read defensively
    // because none of these keys exist during an ordinary frontend request.
    $crawlerData = $pObj->applicationData['tx_crawler'] ?? [];
    $procInstructions = (array)($crawlerData['parameters']['procInstructions'] ?? []);
    if (
        \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler')
        && !empty($crawlerData['running'])
        && in_array('tx_indexedsearch_reindex', $procInstructions, true)
    ) {
        // Setting simple log message:
        $pObj->applicationData['tx_crawler']['log'][] = 'Forced Re-indexing enabled';
        // Crawler active flag and forced indexing:
        $this->crawlerActive = true;
        $this->forceIndexing = true;
    }

    // Determine if page should be indexed, and if so, configure and initialize indexer
    if (empty($pObj->config['config']['index_enable'])) {
        return;
    }

    $this->log_push('Index page', '');
    if ($disableFrontendIndexing && !$this->crawlerActive) {
        $this->log_setTSlogMessage('Index page? No, Ordinary Frontend indexing during rendering is disabled.');
    } elseif (!empty($pObj->page['no_search'])) {
        $this->log_setTSlogMessage('Index page? No, The "No Search" flag has been set in the page properties!');
    } elseif ($pObj->no_cache) {
        $this->log_setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
    } else {
        /** @var LanguageAspect $languageAspect */
        $languageAspect = GeneralUtility::makeInstance(Context::class)->getAspect('language');
        if ($languageAspect->getId() === $languageAspect->getContentId()) {
            // Setting up internal configuration from config array:
            $this->conf = [];
            // Information about page for which the indexing takes place
            // Page id
            $this->conf['id'] = $pObj->id;
            // Page type
            $this->conf['type'] = $pObj->type;
            // sys_language UID of the language of the indexing.
            $this->conf['sys_language_uid'] = $languageAspect->getId();
            // MP variable, if any (Mount Points)
            $this->conf['MP'] = $pObj->MP;
            // Group list
            $this->conf['gr_list'] = implode(',', GeneralUtility::makeInstance(Context::class)->getPropertyFromAspect('frontend.user', 'groupIds', [0, -1]));
            // cHash string for additional parameters
            $this->conf['cHash'] = $pObj->cHash;
            // cHash array with additional parameters
            $this->conf['cHash_array'] = $pObj->cHash_array;
            // Static page arguments, taken from the routing attribute of the current request (if any)
            $this->conf['staticPageArguments'] = [];
            $request = $GLOBALS['TYPO3_REQUEST'] ?? null;
            if ($request instanceof ServerRequestInterface) {
                /** @var PageArguments $pageArguments */
                $pageArguments = $request->getAttribute('routing', null);
                if ($pageArguments instanceof PageArguments) {
                    $this->conf['staticPageArguments'] = $pageArguments->getStaticArguments();
                }
            }
            // The creation date of the TYPO3 page
            $this->conf['crdate'] = $pObj->page['crdate'];

            // Root line uids
            $this->conf['rootline_uids'] = [];
            foreach ($pObj->config['rootLine'] ?? [] as $rlkey => $rldat) {
                $this->conf['rootline_uids'][$rlkey] = $rldat['uid'];
            }
            // Content string (HTML of TYPO3 page)
            $this->conf['content'] = $pObj->content;
            // Alternative title for indexing
            $this->conf['indexedDocTitle'] = $pObj->convOutputCharset($pObj->indexedDocTitle);
            // Character set of content (will be converted to utf-8 during indexing)
            $this->conf['metaCharset'] = $pObj->metaCharset;
            // Most recent modification time (seconds) of the content on the page. Used to evaluate whether it should be re-indexed.
            $this->conf['mtime'] = $pObj->register['SYS_LASTCHANGED'] ?? $pObj->page['SYS_LASTCHANGED'];
            // Configuration of behavior:
            // Whether to index external documents like PDF, DOC etc. (if possible)
            $this->conf['index_externals'] = $pObj->config['config']['index_externals'] ?? 0;
            // Length of description text (max 250, default 200)
            $this->conf['index_descrLgd'] = $pObj->config['config']['index_descrLgd'] ?? 0;
            // Whether to index document keywords and description (enabled unless configured otherwise)
            $this->conf['index_metatags'] = $pObj->config['config']['index_metatags'] ?? true;
            // Set to zero:
            $this->conf['recordUid'] = 0;
            $this->conf['freeIndexUid'] = 0;
            $this->conf['freeIndexSetId'] = 0;
            // Init and start indexing:
            $this->init();
            $this->indexTypo3PageContent();
        } else {
            $this->log_setTSlogMessage('Index page? No, languageId was different from contentId which indicates that the page contains fall-back content and that would be falsely indexed as localized content.');
        }
    }
    $this->log_pull();
}
350
351 /****************************
352 *
353 * Backend API
354 *
355 ****************************/
/**
 * Sets up the "combined ID" of the page (phash) being indexed (or for which external media is attached)
 * by filling the internal configuration and initializing the indexer.
 *
 * @param int $id The page uid, &id=
 * @param int $type The page type, &type=
 * @param int $sys_language_uid sys_language uid, typically &L=
 * @param string $MP The MP variable (Mount Points), &MP=
 * @param array $uidRL Rootline array of only UIDs.
 * @param array $cHash_array Array of GET variables to register with this indexing
 * @param bool $createCHash If set, calculates a cHash value from the $cHash_array. Probably you will not do that since such cases are indexed through the frontend and the idea of this interface is to index non-cacheable pages from the backend!
 */
public function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array = [], $createCHash = false)
{
    // Information about the page for which the indexing takes place.
    // The group list is hardcoded for now.
    $this->conf = [
        'id' => $id,
        'type' => $type,
        'sys_language_uid' => $sys_language_uid,
        'MP' => $MP,
        'gr_list' => '0,-1',
    ];

    // cHash string for the additional parameters; stays empty unless explicitly requested
    $cHash = '';
    if ($createCHash) {
        /* @var \TYPO3\CMS\Frontend\Page\CacheHashCalculator $hashCalculator */
        $hashCalculator = GeneralUtility::makeInstance(\TYPO3\CMS\Frontend\Page\CacheHashCalculator::class);
        $cHash = $hashCalculator->generateForParameters(HttpUtility::buildQueryString($cHash_array));
    }
    $this->conf['cHash'] = $cHash;

    // None of the keys below exist yet, so the union appends them in the listed order.
    $this->conf += [
        // Array of the additional parameters
        'cHash_array' => $cHash_array,
        // Free index defaults
        'freeIndexUid' => 0,
        'freeIndexSetId' => 0,
        // Root line uids
        'rootline_uids' => $uidRL,
        // Whether to index external documents like PDF, DOC etc. (if possible)
        'index_externals' => 1,
        // Length of description text (max 250, default 200)
        'index_descrLgd' => 200,
        // Whether to index document keywords and description (if present)
        'index_metatags' => true,
    ];

    // Init and start indexing:
    $this->init();
}
409
/**
 * Registers the free-index uid and set id in the internal configuration.
 * Can be called right after backend_initIndexer()
 *
 * @param int $freeIndexUid Free index UID
 * @param int $freeIndexSetId Set id - an integer identifying the "set" of indexing operations.
 */
public function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId = 0)
{
    $freeIndexSettings = [
        'freeIndexUid' => $freeIndexUid,
        'freeIndexSetId' => $freeIndexSetId,
    ];
    foreach ($freeIndexSettings as $key => $value) {
        $this->conf[$key] = $value;
    }
}
421
/**
 * Indexes record data as if it were the content of a TYPO3 page: the values are wrapped
 * into a minimal HTML document which is then passed through the regular page indexing.
 *
 * @param string $title Title equivalent
 * @param string $keywords Keywords equivalent
 * @param string $description Description equivalent
 * @param string $content The main content to index
 * @param string $charset The charset of the title, keyword, description and body-content. MUST BE VALID, otherwise nothing is indexed!
 * @param int $mtime Last modification time, in seconds
 * @param int $crdate The creation date of the content, in seconds
 * @param int $recordUid The record UID that the content comes from (for registration with the indexed rows)
 */
public function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate = 0, $recordUid = 0)
{
    // Construct fake HTML for parsing (all values are escaped):
    $fakeHtml = '
<html>
<head>
<title>' . htmlspecialchars($title) . '</title>
<meta name="keywords" content="' . htmlspecialchars($keywords) . '" />
<meta name="description" content="' . htmlspecialchars($description) . '" />
</head>
<body>
' . htmlspecialchars($content) . '
</body>
</html>';

    $pageData = [
        // Most recent modification time (seconds) of the content
        'mtime' => $mtime,
        // The creation date of the TYPO3 content
        'crdate' => $crdate,
        // UID of the record, if applicable
        'recordUid' => $recordUid,
        // Content string (HTML of TYPO3 page)
        'content' => $fakeHtml,
        // Character set of content (will be converted to utf-8 during indexing)
        'metaCharset' => $charset,
        // Alternative title for indexing (none here)
        'indexedDocTitle' => '',
    ];
    foreach ($pageData as $key => $value) {
        $this->conf[$key] = $value;
    }

    // Index content as if it was a TYPO3 page:
    $this->indexTypo3PageContent();
}
464
465 /********************************
466 *
467 * Initialization
468 *
469 *******************************/
/**
 * Initializes the object. $this->conf MUST be set with proper values prior to this call!!!
 *
 * Sets up the cHash parameters and the page hashes, reads the extension configuration
 * (age limits, external file limit, flag bit mask, metaphone settings) and creates the
 * external parsers, the lexer and - if enabled and registered - the metaphone object.
 * Missing or empty configuration values are treated as "not set" instead of raising errors.
 */
public function init()
{
    // Initializing:
    $this->cHashParams = $this->conf['cHash_array'];
    if (is_array($this->cHashParams) && !empty($this->cHashParams)) {
        if ($this->conf['cHash']) {
            // Add this so that URL's come out right...
            $this->cHashParams['cHash'] = $this->conf['cHash'];
        }
        unset($this->cHashParams['encryptionKey']);
    }
    // Setting phash / phash_grouping which identifies the indexed page based on some of these variables:
    $this->setT3Hashes();
    // Indexer configuration from Extension Manager interface:
    $this->indexerConfig = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get('indexed_search');
    // Ages are configured in hours and stored in seconds. The explicit int cast keeps an
    // empty or missing setting from being used as an operand of the multiplication.
    $this->tstamp_minAge = MathUtility::forceIntegerInRange((int)($this->indexerConfig['minAge'] ?? 0) * 3600, 0);
    $this->tstamp_maxAge = MathUtility::forceIntegerInRange((int)($this->indexerConfig['maxAge'] ?? 0) * 3600, 0);
    $this->maxExternalFiles = MathUtility::forceIntegerInRange($this->indexerConfig['maxExternalFiles'] ?? 0, 0, 1000, 5);
    $this->flagBitMask = MathUtility::forceIntegerInRange($this->indexerConfig['flagBitMask'] ?? 0, 0, 255);
    // Workaround: If the extension configuration was not updated yet, the value is not existing
    // and metaphone search counts as enabled.
    $this->enableMetaphoneSearch = (bool)($this->indexerConfig['enableMetaphoneSearch'] ?? true);
    $this->storeMetaphoneInfoAsWords = !IndexedSearchUtility::isTableUsed('index_words') && $this->enableMetaphoneSearch;
    // Initialize external document parsers:
    // Example configuration, see ext_localconf.php of this file!
    if (!empty($this->conf['index_externals'])) {
        $this->initializeExternalParsers();
    }
    // Initialize lexer (class that deconstructs the text into words); falls back to the default Lexer:
    $lexerObjectClassName = ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['lexer'] ?? '') ?: Lexer::class;
    $this->lexerObj = GeneralUtility::makeInstance($lexerObjectClassName);
    $this->lexerObj->debug = $this->indexerConfig['debugMode'] ?? false;
    // Initialize metaphone hook:
    // Make sure that the hook is loaded _after_ indexed_search as this may overwrite the hook depending on the configuration.
    $metaphoneClassName = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['metaphone'] ?? '';
    if ($this->enableMetaphoneSearch && $metaphoneClassName) {
        $this->metaphoneObj = GeneralUtility::makeInstance($metaphoneClassName);
        $this->metaphoneObj->pObj = $this;
    }
}
511
/**
 * Creates one parser object per registered file extension and keeps those
 * whose initParser() call succeeds.
 *
 * @internal
 * @see init()
 */
public function initializeExternalParsers()
{
    $registeredParsers = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'] ?? [];
    foreach ($registeredParsers as $extension => $className) {
        $parser = GeneralUtility::makeInstance($className);
        $parser->pObj = $this;
        // Register before initializing, then drop the entry again if the parser reports FALSE:
        $this->external_parsers[$extension] = $parser;
        if ($parser->initParser($extension)) {
            continue;
        }
        unset($this->external_parsers[$extension]);
    }
}
529
530 /********************************
531 *
532 * Indexing; TYPO3 pages (HTML content)
533 *
534 *******************************/
/**
 * Start indexing of the TYPO3 page
 *
 * Indexes the page if the mtime/age check asks for it, if the group list is not registered
 * yet or if indexing is forced. If an entry with the same content hash already exists,
 * only timestamp, set id, group list and rootline are updated.
 */
public function indexTypo3PageContent()
{
    $mtimeCheck = $this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']);
    $groupListIsSet = $this->is_grlist_set($this->hash['phash']);
    $indexingNeeded = $mtimeCheck > 0 || !$groupListIsSet || $this->forceIndexing;
    if (!$indexingNeeded) {
        $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$mtimeCheck]);
        return;
    }

    // Setting message:
    if ($this->forceIndexing) {
        $this->log_setTSlogMessage('Indexing needed, reason: Forced', 1);
    } elseif ($mtimeCheck > 0) {
        $this->log_setTSlogMessage('Indexing needed, reason: ' . $this->reasons[$mtimeCheck], 1);
    } else {
        $this->log_setTSlogMessage('Indexing needed, reason: Updates gr_list!', 1);
    }

    // Divide into title,keywords,description and body:
    $this->log_push('Split content', '');
    $this->contentParts = $this->splitHTMLContent($this->conf['content']);
    if ($this->conf['indexedDocTitle']) {
        $this->contentParts['title'] = $this->conf['indexedDocTitle'];
    }
    $this->log_pull();

    // Hash over what is to be the actual page content (title, description and keywords included,
    // so that a changed page title makes a difference).
    $this->content_md5h = IndexedSearchUtility::md5inthash(implode('', $this->contentParts));

    // Check if there is already a page (with gr_list = 0,-1) indexed which has the very same contentHash.
    // If so, the page is already indexed and regardless of mtime and origContent nothing more has to be
    // done. This also prevents re-indexing for logged in fe_users when the content is not different for
    // them, provided the page has been indexed without login before.
    $matchingRow = $this->checkContentHash();
    if (is_array($matchingRow) && $mtimeCheck !== 1) {
        // Update the timestamp
        $this->updateTstamp($this->hash['phash'], $this->conf['mtime']);
        $this->updateSetId($this->hash['phash']);
        // $matchingRow['phash'] is the phash of the result row that is similar to the current phash regarding the content hash.
        $this->update_grlist($matchingRow['phash'], $this->hash['phash']);
        $this->updateRootline();
        $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $this->content_md5h . ', has not changed. Timestamp, grlist and rootline updated if necessary.');
        return;
    }

    $parseStart = GeneralUtility::milliseconds();
    $this->log_push('Converting charset of content (' . $this->conf['metaCharset'] . ') to utf-8', '');
    $this->charsetEntity2utf8($this->contentParts, $this->conf['metaCharset']);
    $this->log_pull();

    // Splitting words
    $this->log_push('Extract words from content', '');
    $wordLists = $this->processWordsInArrays($this->contentParts);
    $this->log_pull();

    // Analyze the indexed words.
    $this->log_push('Analyze the extracted words', '');
    $analyzedWords = $this->indexAnalyze($wordLists);
    $this->log_pull();

    // Submitting page (phash) record
    $this->log_push('Submitting page', '');
    $this->submitPage();
    $this->log_pull();

    // Check words and submit to word list if not there
    $this->log_push('Check word list and submit words', '');
    if (IndexedSearchUtility::isTableUsed('index_words')) {
        $this->checkWordList($analyzedWords);
        $this->submitWords($analyzedWords, $this->hash['phash']);
    }
    $this->log_pull();

    // Set parsetime
    $this->updateParsetime($this->hash['phash'], GeneralUtility::milliseconds() - $parseStart);

    // Checking external files if configured for.
    $this->log_push('Checking external files', '');
    if ($this->conf['index_externals']) {
        $this->extractLinks($this->conf['content']);
    }
    $this->log_pull();
}
609
/**
 * Splits HTML content and returns an associative array, with title, a list of metatags, and a list of words in the body.
 *
 * Everything before the first "<body" is treated as head. If the document has no body tag at all,
 * the whole content is treated as head and the body is empty.
 *
 * @param string $content HTML content to index. To some degree expected to be made by TYPO3 (ei. splitting the header by ":")
 * @return array Array of content, having keys "title", "body", "keywords" and "description" set.
 * @see splitRegularContent()
 */
public function splitHTMLContent($content)
{
    $contentArr = $this->defaultContentArray;

    // Divide head from body. stristr() returns FALSE without a body tag, so the head length is
    // calculated as a positive number: a negative length of zero would cut the head to nothing.
    $bodyPart = stristr($content, '<body');
    if ($bodyPart === false) {
        $bodyPart = '';
    }
    $contentArr['body'] = $bodyPart;
    $headPart = (string)substr($content, 0, strlen($content) - strlen($bodyPart));

    // Get title; if it contains a colon only the part after the first colon is used
    $this->embracingTags($headPart, 'TITLE', $contentArr['title'], $unusedRest, $unusedAttributes);
    $titleParts = explode(':', $contentArr['title'], 2);
    $contentArr['title'] = trim($titleParts[1] ?? $titleParts[0]);

    // Get keywords and description metatags
    if (!empty($this->conf['index_metatags'])) {
        // Collect the attribute strings of all meta tags; each round continues with the rest of the head
        $metaTags = [];
        $metaCount = 0;
        while ($this->embracingTags($headPart, 'meta', $unusedContent, $headPart, $metaTags[$metaCount])) {
            $metaCount++;
        }
        foreach ($metaTags as $attributeString) {
            // The entry of the last (unsuccessful) search is not set
            if ($attributeString === null) {
                break;
            }
            $attributes = GeneralUtility::get_tag_attributes($attributeString);
            // Meta tags without "name" or "content" (charset, http-equiv, ...) are common
            $metaName = (string)($attributes['name'] ?? '');
            $metaValue = (string)($attributes['content'] ?? '');
            if (stripos($metaName, 'keywords') !== false) {
                $contentArr['keywords'] .= ',' . $this->addSpacesToKeywordList($metaValue);
            }
            if (stripos($metaName, 'description') !== false) {
                $contentArr['description'] .= ',' . $metaValue;
            }
        }
    }

    // Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags:
    $this->typoSearchTags($contentArr['body']);

    // Get rid of unwanted sections (ie. scripting and style stuff) in body
    foreach (explode(',', $this->excludeSections) as $tag) {
        while ($this->embracingTags($contentArr['body'], $tag, $unusedContent, $contentArr['body'], $unusedAttributes)) {
        }
    }

    // Remove tags, but first make sure we don't concatenate words by doing it
    $contentArr['body'] = str_replace('<', ' <', (string)$contentArr['body']);
    $contentArr['body'] = trim(strip_tags($contentArr['body']));
    $contentArr['keywords'] = trim($contentArr['keywords']);
    $contentArr['description'] = trim($contentArr['description']);

    return $contentArr;
}
661
/**
 * Extract the charset value from HTML meta tag.
 *
 * Only a meta tag with http-equiv="Content-Type" is evaluated.
 *
 * @param string $content HTML content
 * @return string The charset value if found, otherwise an empty string.
 */
public function getHTMLcharset($content)
{
    $contentTypePattern = '/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i';
    $charsetPattern = '/charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)/i';
    if (
        preg_match($contentTypePattern, $content, $metaTagMatch)
        && preg_match($charsetPattern, $metaTagMatch[0], $charsetMatch)
    ) {
        return $charsetMatch[1];
    }
    // Always return a string, as documented
    return '';
}
676
/**
 * Converts a HTML document to utf-8
 *
 * @param string $content HTML content, any charset
 * @param string $charset Optional charset (otherwise extracted from HTML)
 * @return string Converted HTML with entities decoded
 */
public function convertHTMLToUtf8($content, $charset = '')
{
    // Find charset. The detection may yield no value at all, hence the string cast:
    $detectedCharset = $charset ?: $this->getHTMLcharset($content);
    $normalizedCharset = trim(strtolower((string)$detectedCharset));
    // Convert charset:
    if ($normalizedCharset && $normalizedCharset !== 'utf-8') {
        $content = mb_convert_encoding($content, 'utf-8', $normalizedCharset);
    }
    // Convert entities, assuming document is now UTF-8:
    return html_entity_decode($content);
}
696
/**
 * Finds first occurrence of embracing tags and returns the embraced content and the original string with
 * the tag removed in the two passed variables. Returns FALSE if no match found. ie. useful for finding
 * <title> of document or removing <script>-sections
 *
 * If the start tag has no matching end tag, $tagContent is empty and $stringAfter contains
 * only what follows the start tag. A start tag that is never closed by ">" yields empty strings.
 *
 * @param string $string String to search in
 * @param string $tagName Tag name, eg. "script"
 * @param string $tagContent Passed by reference: Content inside found tag
 * @param string $stringAfter Passed by reference: Content after found tag
 * @param string $paramList Passed by reference: Attributes of the found tag.
 * @return bool Returns FALSE if tag was not found, otherwise TRUE.
 */
public function embracingTags($string, $tagName, &$tagContent, &$stringAfter, &$paramList)
{
    $startTag = '<' . $tagName;
    $endTag = '</' . $tagName . '>';
    // stristr used because we want a case-insensitive search for the tag.
    // The result is the remainder of $string, beginning at the start tag.
    $fromStartTag = stristr((string)$string, $startTag);
    // if the tag was not found, return FALSE
    if ($fromStartTag === false) {
        return false;
    }
    // Split into the attributes of the start tag and everything after its closing ">".
    // The second part is missing if the start tag is never closed (e.g. truncated HTML).
    $tagParts = explode('>', (string)substr($fromStartTag, strlen($startTag)), 2);
    $paramList = $tagParts[0];
    $afterStartTag = $tagParts[1] ?? '';
    $fromEndTag = stristr($afterStartTag, $endTag);
    if ($fromEndTag !== false) {
        // The start tag begins where the remainder found above begins
        $stringBefore = substr($string, 0, strlen($string) - strlen($fromStartTag));
        $tagContent = substr($afterStartTag, 0, strlen($afterStartTag) - strlen($fromEndTag));
        $stringAfter = $stringBefore . substr($fromEndTag, strlen($endTag));
    } else {
        $tagContent = '';
        $stringAfter = $afterStartTag;
    }
    return true;
}
731
/**
 * Removes content that shouldn't be indexed according to TYPO3SEARCH-tags.
 *
 * Content following a "begin" marker is kept. At an "end" marker the content collected
 * before the first marker is kept as well, unless a "begin" marker was seen before.
 *
 * @param string $body HTML Content, passed by reference
 * @return bool Returns TRUE if a TYPOSEARCH_ tag was found, otherwise FALSE.
 */
public function typoSearchTags(&$body)
{
    $expBody = preg_split('/\\<\\!\\-\\-[\\s]?TYPO3SEARCH_/', $body);
    // No marker found (or the split failed): leave the body untouched
    if (!is_array($expBody) || count($expBody) <= 1) {
        return false;
    }
    $body = '';
    // Initialized so that an "end" marker never reads an undefined variable
    $prev = '';
    foreach ($expBody as $val) {
        $part = explode('-->', $val, 2);
        $marker = trim($part[0]);
        if ($marker === 'begin') {
            // The second part is missing if the marker comment is not terminated
            $body .= $part[1] ?? '';
            $prev = '';
        } elseif ($marker === 'end') {
            $body .= $prev;
        } else {
            $prev = $val;
        }
    }
    return true;
}
758
/**
 * Extract links (hrefs) from HTML content and if indexable media is found, it is indexed.
 *
 * External URLs are indexed if "indexExternalURLs" is enabled. Links to existing local files
 * (without query string) are either indexed directly or - if the crawler is enabled for
 * external files - added to the crawler queue.
 *
 * @param string $content HTML content
 */
public function extractLinks($content)
{
    // Get links:
    $list = $this->extractHyperLinks($content);
    // The crawler object only exists if the crawler is configured for external files and loaded
    $crawler = null;
    if (!empty($this->indexerConfig['useCrawlerForExternalFiles']) && \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler')) {
        $crawler = GeneralUtility::makeInstance(\tx_crawler_lib::class);
    }
    // Traverse links:
    foreach ($list as $linkInfo) {
        // localPath means: This file is sent by a download script. While the indexed URL has to
        // point to $linkInfo['href'], the absolute path to the file is specified here!
        $localPath = $linkInfo['localPath'] ?? '';
        // Decode entities:
        $linkSource = htmlspecialchars_decode($localPath ?: $linkInfo['href']);
        // Parse URL (parse_url() returns FALSE for seriously malformed URLs):
        $qParts = parse_url($linkSource) ?: [];
        // Check for jumpurl (TYPO3 specific thing...)
        $query = $qParts['query'] ?? '';
        if ($query && strpos($query, 'jumpurl=') !== false) {
            parse_str($query, $getP);
            $jumpUrl = $getP['jumpurl'] ?? '';
            $linkSource = is_string($jumpUrl) ? $jumpUrl : '';
            $qParts = parse_url($linkSource) ?: [];
        }

        if (!$localPath && !empty($qParts['scheme'])) {
            if (!empty($this->indexerConfig['indexExternalURLs'])) {
                // Index external URL (http or otherwise)
                $this->indexExternalUrl($linkSource);
            }
            continue;
        }
        // Local links with query string are not treated as files
        if (!empty($qParts['query'])) {
            continue;
        }

        $linkSource = urldecode($linkSource);
        if (GeneralUtility::isAllowedAbsPath($linkSource)) {
            $localFile = $linkSource;
        } else {
            $localFile = GeneralUtility::getFileAbsFileName(Environment::getPublicPath() . '/' . $linkSource);
        }
        if (!$localFile || !@is_file($localFile)) {
            continue;
        }

        // Index local file:
        if ($crawler !== null) {
            // Hand the file over to the crawler; the page content is not part of the queue entry
            $confWithoutContent = $this->conf;
            unset($confWithoutContent['content']);
            $params = ['document' => $linkSource];
            if ($localPath) {
                $params['alturl'] = $linkInfo['href'];
            }
            $params['conf'] = $confWithoutContent;
            $crawler->addQueueEntry_callBack(0, $params, Hook\CrawlerFilesHook::class, $this->conf['id']);
            $this->log_setTSlogMessage('media "' . $params['document'] . '" added to "crawler" queue.', 1);
        } elseif ($localPath) {
            // pathinfo() yields an empty string for files without extension
            $ext = strtolower(pathinfo($linkSource, PATHINFO_EXTENSION));
            $this->indexRegularDocument($linkInfo['href'], false, $linkSource, $ext);
        } else {
            $this->indexRegularDocument($linkSource);
        }
    }
}
834
/**
 * Extracts all links to external documents from the HTML content string
 *
 * Anchors without href attribute and links which only consist of a fragment ("#...") are skipped.
 *
 * @param string $html
 * @return array Array of hyperlinks (keys: tag, href, localPath (empty if not local))
 * @see extractLinks()
 */
public function extractHyperLinks($html)
{
    $htmlParser = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Html\HtmlParser::class);
    $htmlParts = $htmlParser->splitTags('a', $html);
    $hyperLinksData = [];
    foreach ($htmlParts as $index => $tagData) {
        // Only the odd entries are evaluated, like before
        if ($index % 2 === 0) {
            continue;
        }
        $tagAttributes = $htmlParser->get_tag_attributes($tagData, true);
        $firstTagName = $htmlParser->getFirstTagName($tagData);
        if (strtolower($firstTagName) !== 'a') {
            continue;
        }
        // Anchors like <a name="top"> have no href at all
        $href = $tagAttributes[0]['href'] ?? '';
        if (!$href || $href[0] === '#') {
            continue;
        }
        $hyperLinksData[] = [
            'tag' => $tagData,
            'href' => $href,
            'localPath' => $this->createLocalPath($href)
        ];
    }
    return $hyperLinksData;
}
864
/**
 * Extracts the "base href" from content string.
 *
 * The first base tag with a non-empty "href" wins. A base tag without an
 * "href" attribute never turns the result into null; the method always
 * returns a string.
 *
 * @param string $html Content to analyze
 * @return string The base href or an empty string if not found
 */
public function extractBaseHref($html)
{
    $href = '';
    $htmlParser = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Html\HtmlParser::class);
    $htmlParts = $htmlParser->splitTags('base', $html);
    foreach ($htmlParts as $index => $tagData) {
        // Only the odd-indexed parts of the split result are inspected
        if ($index % 2 === 0) {
            continue;
        }
        $tagAttributes = $htmlParser->get_tag_attributes($tagData, true);
        $firstTagName = $htmlParser->getFirstTagName($tagData);
        if (strtolower($firstTagName) !== 'base') {
            continue;
        }
        // A missing href attribute is normalised to an empty string
        $href = (string)($tagAttributes[0]['href'] ?? '');
        if ($href) {
            break;
        }
    }
    return $href;
}
890
891 /******************************************
892 *
893 * Indexing; external URL
894 *
895 ******************************************/
/**
 * Index External URLs HTML content
 *
 * The URL is only fetched and indexed when its response headers announce a
 * "text/html" Content-Type. The fetched content is kept in
 * $this->indexExternalUrl_content and written to a temporary file that is
 * handed to indexRegularDocument(); the temporary file is removed afterwards,
 * even if indexing throws.
 *
 * @param string $externalUrl URL, eg. "http://typo3.org/"
 * @see indexRegularDocument()
 */
public function indexExternalUrl($externalUrl)
{
    // Get headers:
    $urlHeaders = $this->getUrlHeaders($externalUrl);
    // No answer at all, or no Content-Type header: nothing to decide on
    if (!is_array($urlHeaders) || !isset($urlHeaders['Content-Type'])) {
        return;
    }
    if (!stristr($urlHeaders['Content-Type'], 'text/html')) {
        return;
    }
    $content = ($this->indexExternalUrl_content = GeneralUtility::getUrl($externalUrl));
    if ((string)$content === '') {
        return;
    }
    // Create temporary file:
    $tmpFile = GeneralUtility::tempnam('EXTERNAL_URL');
    if (!$tmpFile) {
        return;
    }
    try {
        GeneralUtility::writeFile($tmpFile, $content);
        // Index that file. Using "TRUE" for second parameter to force indexing
        // of external URLs (mtime doesn't make sense, does it?)
        $this->indexRegularDocument($externalUrl, true, $tmpFile, 'html');
    } finally {
        // Never leave the temporary copy behind
        unlink($tmpFile);
    }
}
921
/**
 * Getting HTTP request headers of URL
 *
 * Each response line is split at the first colon into key and value. Lines
 * that contain no colon (for instance a status line) are stored with the
 * whole line as key and null as value.
 *
 * @param string $url The URL
 * @return mixed If no answer, returns FALSE. Otherwise an array where HTTP headers are keys
 */
public function getUrlHeaders($url)
{
    // Try to get the headers only
    $content = GeneralUtility::getUrl($url, 2);
    if ((string)$content === '') {
        // Explicit FALSE as documented (was an implicit null before)
        return false;
    }
    // Compile headers:
    $retVal = [];
    foreach (GeneralUtility::trimExplode(LF, $content, true) as $line) {
        if (trim($line) === '') {
            break;
        }
        // Pad to two elements so a line without ":" does not hit an undefined offset
        list($headKey, $headValue) = array_pad(explode(':', $line, 2), 2, null);
        $retVal[$headKey] = $headValue;
    }
    return $retVal;
}
946
/**
 * Resolves a link target to a local file path, if possible.
 *
 * Several resolution strategies are tried one after another; the result of
 * the first strategy that yields a non-empty path is returned.
 *
 * @param string $sourcePath
 * @return string Absolute path to file if file is local, else empty string
 */
protected function createLocalPath($sourcePath)
{
    // Strategy methods of this class, in the order they are attempted
    $resolvers = [
        'createLocalPathUsingAbsRefPrefix',
        'createLocalPathUsingDomainURL',
        'createLocalPathFromAbsoluteURL',
        'createLocalPathFromRelativeURL'
    ];
    $resolved = '';
    foreach ($resolvers as $resolver) {
        $resolved = $this->{$resolver}($sourcePath);
        if ($resolved != '') {
            return $resolved;
        }
    }
    return $resolved;
}
970
/**
 * Tries to map the link to a local file by stripping the site URL of the
 * current request (TYPO3_SITE_URL) from its beginning.
 *
 * @param string $sourcePath
 * @return string Local path, or an empty string if the link does not start with the site URL or the file is not allowed
 */
protected function createLocalPathUsingDomainURL($sourcePath)
{
    $siteUrl = GeneralUtility::getIndpEnv('TYPO3_SITE_URL');
    $siteUrlLength = strlen($siteUrl);
    if (strpos($sourcePath, $siteUrl) !== 0) {
        return '';
    }
    // Remainder after the site URL is taken as path below the public web path
    $candidate = Environment::getPublicPath() . '/' . substr($sourcePath, $siteUrlLength);
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
991
/**
 * Tries to map the link to a local file by stripping the configured
 * absRefPrefix from its beginning. This requires TSFE; without a
 * TypoScriptFrontendController in $GLOBALS['TSFE'] nothing is resolved.
 *
 * @param string $sourcePath
 * @return string Local path, or an empty string if not resolvable this way
 */
protected function createLocalPathUsingAbsRefPrefix($sourcePath)
{
    $tsfe = $GLOBALS['TSFE'] ?? null;
    if (!$tsfe instanceof TypoScriptFrontendController) {
        return '';
    }
    $prefix = $tsfe->config['config']['absRefPrefix'];
    $prefixLength = strlen($prefix);
    // An empty prefix can not be matched, and the link has to start with it
    if ($prefixLength === 0 || strpos($sourcePath, $prefix) !== 0) {
        return '';
    }
    $candidate = Environment::getPublicPath() . '/' . substr($sourcePath, $prefixLength);
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1015
/**
 * Attempts to create a local file path from the absolute URL without
 * schema, i.e. a link starting with "/".
 *
 * An empty $sourcePath is handled gracefully and resolves to nothing.
 *
 * @param string $sourcePath
 * @return string Local path, or an empty string if the link is not absolute or the file is not allowed
 */
protected function createLocalPathFromAbsoluteURL($sourcePath)
{
    // strpos() instead of $sourcePath[0]: no uninitialized offset on ''
    if (strpos((string)$sourcePath, '/') !== 0) {
        return '';
    }
    // Drop the leading slash and look the rest up below the public web path
    $localPath = Environment::getPublicPath() . '/' . substr($sourcePath, 1);
    if (!self::isAllowedLocalFile($localPath)) {
        $localPath = '';
    }
    return $localPath;
}
1035
/**
 * Tries to map a relative link to a file below the public web path.
 *
 * @param string $sourcePath
 * @return string Local path, or an empty string if the link is not relative or the file is not allowed
 */
protected function createLocalPathFromRelativeURL($sourcePath)
{
    if (!self::isRelativeURL($sourcePath)) {
        return '';
    }
    $candidate = Environment::getPublicPath() . '/' . $sourcePath;
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1053
/**
 * Checks if URL is relative.
 *
 * A URL counts as relative when it has no scheme and its path does not
 * start with "/". URLs without a path component, and URLs parse_url() can
 * not parse, are reported as relative as well (unchanged behaviour), but no
 * longer by dereferencing missing array keys.
 *
 * @param string $url
 * @return bool
 */
protected static function isRelativeURL($url)
{
    $urlParts = @parse_url($url);
    if (!is_array($urlParts)) {
        // parse_url() returns FALSE on seriously malformed URLs
        return true;
    }
    $hasScheme = isset($urlParts['scheme']) && $urlParts['scheme'] !== '';
    $path = $urlParts['path'] ?? '';
    return !$hasScheme && ($path === '' || $path[0] !== '/');
}
1065
/**
 * Checks if the path points to the file inside the web site
 *
 * The path is normalised with resolveBackPath() first and then has to lie
 * below the public web path. The comparison includes the trailing directory
 * separator, so a sibling directory that merely shares the same name prefix
 * (e.g. "<public>_backup/") is not accepted.
 *
 * @param string $filePath
 * @return bool TRUE if the path is inside the public web path and is an existing file
 */
protected static function isAllowedLocalFile($filePath)
{
    $filePath = GeneralUtility::resolveBackPath($filePath);
    // Enforce a directory boundary on the prefix match
    $publicPathPrefix = rtrim(Environment::getPublicPath(), '/') . '/';
    $insideWebPath = strpos($filePath, $publicPathPrefix) === 0;
    return $insideWebPath && is_file($filePath);
}
1079
1080 /******************************************
1081 *
1082 * Indexing; external files (PDF, DOC, etc)
1083 *
1084 ******************************************/
/**
 * Indexing a regular document given as $file (relative to public web path, local file)
 *
 * The document is split into sections by fileContentParts(); each section is
 * indexed on its own when its mtime check asks for it or when $force is set,
 * and as long as the external file counter is below maxExternalFiles (the
 * limit is ignored when $force is set). A section record is submitted for
 * every section, indexed or not.
 *
 * @param string $file Relative Filename, relative to public web path. It can also be an absolute path as long as it passes \TYPO3\CMS\Core\Utility\GeneralUtility::isAllowedAbsPath(). Finally, if $contentTmpFile is set, this value can be anything, most likely a URL
 * @param bool $force If set, indexing is forced (despite content hashes, mtime etc).
 * @param string $contentTmpFile Temporary file with the content to read it from (instead of $file). Used when the $file is a URL.
 * @param string $altExtension File extension for temporary file.
 */
public function indexRegularDocument($file, $force = false, $contentTmpFile = '', $altExtension = '')
{
    // Init
    $fI = pathinfo($file);
    // pathinfo() has no "extension" key for files without an extension
    $ext = $altExtension ?: strtolower($fI['extension'] ?? '');
    // Create abs-path:
    if (!$contentTmpFile) {
        if (!GeneralUtility::isAbsPath($file)) {
            // Relative, prepend public web path:
            $absFile = GeneralUtility::getFileAbsFileName(Environment::getPublicPath() . '/' . $file);
        } else {
            // Absolute, pass-through:
            $absFile = $file;
        }
        $absFile = GeneralUtility::isAllowedAbsPath($absFile) ? $absFile : '';
    } else {
        $absFile = $contentTmpFile;
    }
    // Indexing the document:
    if (!$absFile || !@is_file($absFile)) {
        $this->log_setTSlogMessage('Indexing not possible; File "' . $absFile . '" not found or valid.');
        return;
    }
    // No parser registered for this extension (also covers an unknown key)
    if (empty($this->external_parsers[$ext])) {
        $this->log_setTSlogMessage('Indexing not possible; The extension "' . $ext . '" was not supported.');
        return;
    }
    $fileInfo = stat($absFile);
    $cParts = $this->fileContentParts($ext, $absFile);
    foreach ($cParts as $cPKey) {
        $this->internal_log = [];
        $this->log_push('Index: ' . str_replace('.', '_', PathUtility::basename($file)) . ($cPKey ? '#' . $cPKey : ''), '');
        $Pstart = GeneralUtility::milliseconds();
        $subinfo = ['key' => $cPKey];
        // Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
        $phash_arr = ($this->file_phash_arr = $this->setExtHashes($file, $subinfo));
        $check = $this->checkMtimeTstamp($fileInfo['mtime'], $phash_arr['phash']);
        if ($check > 0 || $force) {
            if ($check > 0) {
                $this->log_setTSlogMessage('Indexing needed, reason: ' . $this->reasons[$check], 1);
            } else {
                $this->log_setTSlogMessage('Indexing forced by flag', 1);
            }
            // Check external file counter:
            if ($this->externalFileCounter < $this->maxExternalFiles || $force) {
                // Divide into title,keywords,description and body:
                $this->log_push('Split content', '');
                $contentParts = $this->readFileContent($ext, $absFile, $cPKey);
                $this->log_pull();
                if (is_array($contentParts)) {
                    // Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
                    // Separator first: the reversed argument order is no longer accepted by PHP 8
                    $content_md5h = IndexedSearchUtility::md5inthash(implode('', $contentParts));
                    if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force) {
                        // Increment counter:
                        $this->externalFileCounter++;
                        // Splitting words
                        $this->log_push('Extract words from content', '');
                        $splitInWords = $this->processWordsInArrays($contentParts);
                        $this->log_pull();
                        // Analyze the indexed words.
                        $this->log_push('Analyze the extracted words', '');
                        $indexArr = $this->indexAnalyze($splitInWords);
                        $this->log_pull();
                        // Submitting page (phash) record
                        $this->log_push('Submitting page', '');
                        // Unfortunately I cannot determine WHEN a file is originally made - so I must return the modification time...
                        $this->submitFilePage($phash_arr, $file, $subinfo, $ext, $fileInfo['mtime'], $fileInfo['ctime'], $fileInfo['size'], $content_md5h, $contentParts);
                        $this->log_pull();
                        // Check words and submit to word list if not there
                        $this->log_push('Check word list and submit words', '');
                        if (IndexedSearchUtility::isTableUsed('index_words')) {
                            $this->checkWordList($indexArr);
                            $this->submitWords($indexArr, $phash_arr['phash']);
                        }
                        $this->log_pull();
                        // Set parsetime
                        $this->updateParsetime($phash_arr['phash'], GeneralUtility::milliseconds() - $Pstart);
                    } else {
                        // Update the timestamp
                        $this->updateTstamp($phash_arr['phash'], $fileInfo['mtime']);
                        $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $content_md5h . ', has not changed. Timestamp updated.');
                    }
                } else {
                    $this->log_setTSlogMessage('Could not index file! Unsupported extension.');
                }
            } else {
                $this->log_setTSlogMessage('The limit of ' . $this->maxExternalFiles . ' has already been exceeded, so no indexing will take place this time.');
            }
        } else {
            $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
        }
        // Checking and setting sections:
        // Setting a section-record for the file. This is done also if the file is not indexed. Notice that section records are deleted when the page is indexed.
        $this->submitFile_section($phash_arr['phash']);
        $this->log_pull();
    }
}
1190
/**
 * Delegates reading of an external file to the parser object registered for
 * its extension. The content from the external parser MUST be returned in utf-8!
 *
 * @param string $fileExtension File extension, eg. "pdf", "doc" etc.
 * @param string $absoluteFileName Absolute filename of file (must exist and be validated OK before calling function)
 * @param string $sectionPointer Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be split.)
 * @return array|null Standard content array (title, description, keywords, body keys), or null if no parser object is registered for the extension
 */
public function readFileContent($fileExtension, $absoluteFileName, $sectionPointer)
{
    $parser = $this->external_parsers[$fileExtension];
    if (!is_object($parser)) {
        return null;
    }
    // Consult relevant external document parser:
    return $parser->readFileContent($fileExtension, $absoluteFileName, $sectionPointer);
}
1209
/**
 * Asks the parser object registered for the extension into which sections
 * the document should be divided.
 *
 * @param string $ext File extension
 * @param string $absFile Absolute filename (must exist and be validated OK before calling function)
 * @return array Array of pointers to sections that the document should be divided into; [0] if no parser object is registered for the extension
 */
public function fileContentParts($ext, $absFile)
{
    $parser = $this->external_parsers[$ext];
    // Without a parser object the document is treated as one single section
    return is_object($parser)
        ? $parser->fileContentParts($ext, $absFile)
        : [0];
}
1226
/**
 * Wraps non-HTML content (from external files for instance) into a content
 * array: a copy of the default content array with "body" set to the input.
 *
 * @param string $content Input content (non-HTML) to index.
 * @return array Copy of $this->defaultContentArray with the key "body" set to $content
 * @see splitHTMLContent()
 */
public function splitRegularContent($content)
{
    // array_replace() keeps keys and their order, just like a plain assignment
    return array_replace($this->defaultContentArray, ['body' => $content]);
}
1240
1241 /**********************************
1242 *
1243 * Analysing content, Extracting words
1244 *
1245 **********************************/
/**
 * Converts every non-empty value of the content array to utf-8 (unless the
 * charset already is "utf-8") and decodes HTML entities in it. The array is
 * modified in place.
 *
 * @param array $contentArr Standard content array, passed by reference
 * @param string $charset Charset of the input content (converted to utf-8)
 */
public function charsetEntity2utf8(&$contentArr, $charset)
{
    $needsConversion = $charset !== 'utf-8';
    foreach ($contentArr as $key => $value) {
        // Empty values are left untouched
        if ((string)$value === '') {
            continue;
        }
        $text = $value;
        if ($needsConversion) {
            $text = mb_convert_encoding($text, 'utf-8', $charset);
        }
        // decode all numeric / html-entities in the string to real characters:
        $contentArr[$key] = html_entity_decode($text);
    }
}
1265
/**
 * Splits every part of the content array (from the split*Content functions)
 * into words using the lexer object.
 *
 * @param array $contentArr Array of content to index, see splitHTMLContent() and splitRegularContent()
 * @return array Content input array modified so each key is now an array of words; title, keywords and description hold no duplicates
 */
public function processWordsInArrays($contentArr)
{
    // split all parts to words
    foreach (array_keys($contentArr) as $part) {
        $contentArr[$part] = $this->lexerObj->split2Words($contentArr[$part]);
    }
    // For title, keywords, and description we don't want duplicates:
    foreach (['title', 'keywords', 'description'] as $uniquePart) {
        $contentArr[$uniquePart] = array_unique($contentArr[$uniquePart]);
    }
    return $contentArr;
}
1285
/**
 * Extracts the sample description text from the content array.
 *
 * The body has runs of whitespace collapsed to single spaces and is cut to
 * the configured length (conf "index_descrLgd", forced into 0-255, default
 * 200). With a length of 0 an empty string is returned.
 *
 * @param array $contentArr Content array
 * @return string Description string
 */
public function bodyDescription($contentArr)
{
    // Always defined, so a configured length of 0 yields '' instead of an undefined variable
    $bodyDescription = '';
    // Setting description
    $maxL = MathUtility::forceIntegerInRange($this->conf['index_descrLgd'], 0, 255, 200);
    if ($maxL) {
        // preg_replace() gives null on invalid input; cast keeps the string contract
        $bodyDescription = (string)preg_replace('/\s+/u', ' ', $contentArr['body'] ?? '');
        // Shorten the string:
        $bodyDescription = mb_strcut($bodyDescription, 0, $maxL, 'utf-8');
    }
    return $bodyDescription;
}
1303
/**
 * Builds the index array from a content array by analyzing the header parts
 * (title, keywords, description) first and the body afterwards.
 *
 * @param array $content Standard content array: an array with the keys title,keywords,description and body, which all contain an array of words.
 * @return array Index array, keyed by word
 */
public function indexAnalyze($content)
{
    $indexArr = [];
    // Header part => bit offset handed to analyzeHeaderinfo()
    $headerOffsets = [
        'title' => 7,
        'keywords' => 6,
        'description' => 5
    ];
    foreach ($headerOffsets as $part => $bitOffset) {
        $this->analyzeHeaderinfo($indexArr, $content, $part, $bitOffset);
    }
    $this->analyzeBody($indexArr, $content);
    return $indexArr;
}
1319
/**
 * Calculates relevant information for headercontent
 *
 * For every word of $content[$key] the entry in $retArr gets its "hash" and
 * "metaphone" set on first sight, the bit 2^$offset set in "cmp" and its
 * "count" increased. $this->wordcount is increased per word as well.
 *
 * @param array $retArr Index array, passed by reference
 * @param array $content Standard content array
 * @param string $key Key from standard content array
 * @param int $offset Bit-wise priority to type
 */
public function analyzeHeaderinfo(&$retArr, $content, $key, $offset)
{
    foreach ($content[$key] ?? [] as $val) {
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        $val = substr($val, 0, 60);
        if (!isset($retArr[$val])) {
            // Word ID (wid)
            $retArr[$val]['hash'] = IndexedSearchUtility::md5inthash($val);
            // Metaphone value is also 60 only chars long
            $metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
            $retArr[$val]['metaphone'] = $metaphone;
        }
        // Build metaphone fulltext string (can be used for fulltext indexing)
        if ($this->storeMetaphoneInfoAsWords) {
            $this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
        }
        // Priority used for flagBitMask feature (see extension configuration).
        // "cmp" may not exist yet for this word, start from 0 in that case.
        $retArr[$val]['cmp'] = ($retArr[$val]['cmp'] ?? 0) | pow(2, $offset);
        // Increase number of occurrences ("count" may not exist yet either)
        $retArr[$val]['count'] = ($retArr[$val]['count'] ?? 0) + 1;
        $this->wordcount++;
    }
}
1351
/**
 * Calculates relevant information for bodycontent
 *
 * For every word of $content['body'] the entry in $retArr gets "first" (the
 * array key of its first occurrence), "hash" and "metaphone" set on first
 * sight and its "count" increased. $this->wordcount is increased per word.
 *
 * @param array $retArr Index array, passed by reference
 * @param array $content Standard content array
 */
public function analyzeBody(&$retArr, $content)
{
    foreach ($content['body'] ?? [] as $key => $val) {
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        $val = substr($val, 0, 60);
        if (!isset($retArr[$val])) {
            // First occurrence (used for ranking results)
            $retArr[$val]['first'] = $key;
            // Word ID (wid)
            $retArr[$val]['hash'] = IndexedSearchUtility::md5inthash($val);
            // Metaphone value is also only 60 chars long
            $metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
            $retArr[$val]['metaphone'] = $metaphone;
        }
        // Build metaphone fulltext string (can be used for fulltext indexing)
        if ($this->storeMetaphoneInfoAsWords) {
            $this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
        }
        // Increase number of occurrences; "count" does not exist on first sight
        $retArr[$val]['count'] = ($retArr[$val]['count'] ?? 0) + 1;
        $this->wordcount++;
    }
}
1381
/**
 * Creates a metaphone value for the input word, either through the
 * configured metaphone object or PHP's native metaphone() function.
 *
 * @param string $word Word to convert
 * @param bool $returnRawMetaphoneValue If set, returns the raw metaphone value (not hashed)
 * @return mixed Metaphone hash integer (0 for an empty raw value), or the raw value (string) if requested
 */
public function metaphone($word, $returnRawMetaphoneValue = false)
{
    // Fall back to the native PHP function when no metaphone object is set
    $rawValue = is_object($this->metaphoneObj)
        ? $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid'])
        : metaphone($word);
    if ($returnRawMetaphoneValue) {
        return $rawValue;
    }
    // Create hash and return integer; nothing to hash for an empty value
    return $rawValue !== '' ? IndexedSearchUtility::md5inthash($rawValue) : 0;
}
1407
1408 /********************************
1409 *
1410 * SQL; TYPO3 Pages
1411 *
1412 *******************************/
/**
 * Updates db with information about the page (TYPO3 page, not external media)
 *
 * Existing records for the page's phash are removed first. Then the
 * index_phash record is written, followed by the section and gr_list
 * records, the index_fulltext record and, in debug mode, an index_debug
 * record. Each table is only written if IndexedSearchUtility::isTableUsed()
 * says so.
 */
public function submitPage()
{
    // Remove any current data for this phash:
    $this->removeOldIndexedPages($this->hash['phash']);
    // setting new phash_row
    $fields = [
        'phash' => $this->hash['phash'],
        'phash_grouping' => $this->hash['phash_grouping'],
        'cHashParams' => serialize($this->cHashParams),
        'static_page_arguments' => json_encode($this->conf['staticPageArguments']),
        'contentHash' => $this->content_md5h,
        'data_page_id' => $this->conf['id'],
        'data_page_type' => $this->conf['type'],
        'data_page_mp' => $this->conf['MP'],
        'gr_list' => $this->conf['gr_list'],
        // TYPO3 page
        'item_type' => 0,
        'item_title' => $this->contentParts['title'],
        'item_description' => $this->bodyDescription($this->contentParts),
        'item_mtime' => (int)$this->conf['mtime'],
        'item_size' => strlen($this->conf['content']),
        'tstamp' => $GLOBALS['EXEC_TIME'],
        'crdate' => $GLOBALS['EXEC_TIME'],
        // Creation date of page
        'item_crdate' => $this->conf['crdate'],
        // Sys language uid of the page. Should reflect which language it DOES actually display!
        'sys_language_uid' => $this->conf['sys_language_uid'],
        'externalUrl' => 0,
        'recordUid' => (int)$this->conf['recordUid'],
        'freeIndexUid' => (int)$this->conf['freeIndexUid'],
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId']
    ];
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash');
        $connection->insert(
            'index_phash',
            $fields,
            ['cHashParams' => Connection::PARAM_LOB]
        );
    }
    // PROCESSING index_section
    $this->submit_section($this->hash['phash'], $this->hash['phash']);
    // PROCESSING index_grlist
    $this->submit_grlist($this->hash['phash'], $this->hash['phash']);
    // PROCESSING index_fulltext
    $fields = [
        'phash' => $this->hash['phash'],
        'fulltextdata' => implode(' ', $this->contentParts),
        'metaphonedata' => $this->metaphoneContent
    ];
    if ($this->indexerConfig['fullTextDataLength'] > 0) {
        // mb_strcut() keeps the byte limit but never cuts inside a multi-byte
        // utf-8 character, which a plain substr() could do
        $fields['fulltextdata'] = mb_strcut($fields['fulltextdata'], 0, (int)$this->indexerConfig['fullTextDataLength'], 'utf-8');
    }
    if (IndexedSearchUtility::isTableUsed('index_fulltext')) {
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_fulltext');
        $connection->insert('index_fulltext', $fields);
    }
    // PROCESSING index_debug
    if ($this->indexerConfig['debugMode']) {
        $fields = [
            'phash' => $this->hash['phash'],
            'debuginfo' => serialize([
                'cHashParams' => $this->cHashParams,
                'external_parsers initialized' => array_keys($this->external_parsers),
                'conf' => array_merge($this->conf, ['content' => substr($this->conf['content'], 0, 1000)]),
                'contentParts' => array_merge($this->contentParts, ['body' => substr($this->contentParts['body'], 0, 1000)]),
                'logs' => $this->internal_log,
                'lexer' => $this->lexerObj->debugString
            ])
        ];
        if (IndexedSearchUtility::isTableUsed('index_debug')) {
            $connection = GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('index_debug');
            $connection->insert('index_debug', $fields);
        }
    }
}
1495
/**
 * Writes a gr_list record (built from the configured gr_list) into
 * index_grlist, provided that table is in use.
 *
 * @param int $hash Search result record phash
 * @param int $phash_x Actual phash of current content
 * @see update_grlist()
 */
public function submit_grlist($hash, $phash_x)
{
    $grList = $this->conf['gr_list'];
    // Setting the gr_list record
    $record = [
        'phash' => $hash,
        'phash_x' => $phash_x,
        'hash_gr_list' => IndexedSearchUtility::md5inthash($grList),
        'gr_list' => $grList
    ];
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }
    GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist')
        ->insert('index_grlist', $record);
}
1518
/**
 * Writes a section record into index_section, provided that table is in use.
 * $hash and $hash_t3 are the same for TYPO3 pages, but different when it is external files.
 *
 * @param int $hash phash of TYPO3 parent search result record
 * @param int $hash_t3 phash of the file indexation search record
 */
public function submit_section($hash, $hash_t3)
{
    $record = [
        'phash' => $hash,
        'phash_t3' => $hash_t3,
        'page_id' => (int)$this->conf['id']
    ];
    // Called before the table check, exactly as it always has been
    $this->getRootLineFields($record);
    if (!IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }
    GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_section')
        ->insert('index_section', $record);
}
1540
/**
 * Deletes all index records of a page, identified by its phash, from the
 * index tables that are in use.
 *
 * @param int $phash phash value to flush
 */
public function removeOldIndexedPages($phash)
{
    $pool = GeneralUtility::makeInstance(ConnectionPool::class);
    $phashValue = (int)$phash;
    // Removing old registrations for all tables. Because the pages are TYPO3 pages
    // there can be nothing else than 1-1 relations here.
    foreach (['index_phash', 'index_section', 'index_grlist', 'index_fulltext', 'index_debug'] as $tableName) {
        if (!IndexedSearchUtility::isTableUsed($tableName)) {
            continue;
        }
        $pool->getConnectionForTable($tableName)->delete($tableName, ['phash' => $phashValue]);
    }

    // Removing all index_section records with phash_t3 set to this hash (this includes such
    // records set for external media on the page as well!). The re-insert of these records
    // are done in indexRegularDocument($file).
    if (!IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }
    $pool->getConnectionForTable('index_section')
        ->delete('index_section', ['phash_t3' => $phashValue]);
}
1566
1567 /********************************
1568 *
1569 * SQL; External media
1570 *
1571 *******************************/
/**
 * Updates db with information about the file
 *
 * Existing records for the file's phash are removed first. Then the
 * index_phash record is written, followed by the index_fulltext record and,
 * in debug mode, an index_debug record. Each table is only written if
 * IndexedSearchUtility::isTableUsed() says so.
 *
 * @param array $hash Array with phash and phash_grouping keys for file
 * @param string $file File name
 * @param array $subinfo Array of "cHashParams" for files: This is for instance the page index for a PDF file (other document types it will be a zero)
 * @param string $ext File extension determining the type of media.
 * @param int $mtime Modification time of file.
 * @param int $ctime Creation time of file.
 * @param int $size Size of file in bytes
 * @param int $content_md5h Content HASH value.
 * @param array $contentParts Standard content array (using only title and body for a file)
 */
public function submitFilePage($hash, $file, $subinfo, $ext, $mtime, $ctime, $size, $content_md5h, $contentParts)
{
    // Find item Type: mapped type of the parser, falling back to the extension itself
    $storeItemType = ($this->external_parsers[$ext]->ext2itemtype_map[$ext] ?? '') ?: $ext;
    // Remove any current data for this phash:
    $this->removeOldIndexedFiles($hash['phash']);
    // Split filename:
    $fileParts = parse_url($file);
    // Setting new
    $fields = [
        'phash' => $hash['phash'],
        'phash_grouping' => $hash['phash_grouping'],
        'cHashParams' => serialize($subinfo),
        'contentHash' => $content_md5h,
        'data_filename' => $file,
        'item_type' => $storeItemType,
        'item_title' => trim($contentParts['title'] ?? '') ?: PathUtility::basename($file),
        'item_description' => $this->bodyDescription($contentParts),
        'item_mtime' => $mtime,
        'item_size' => $size,
        'item_crdate' => $ctime,
        'tstamp' => $GLOBALS['EXEC_TIME'],
        'crdate' => $GLOBALS['EXEC_TIME'],
        'gr_list' => $this->conf['gr_list'],
        // Plain file paths have no "scheme" part (and parse_url() may fail altogether)
        'externalUrl' => !empty($fileParts['scheme']) ? 1 : 0,
        'recordUid' => (int)$this->conf['recordUid'],
        'freeIndexUid' => (int)$this->conf['freeIndexUid'],
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId'],
        'sys_language_uid' => (int)$this->conf['sys_language_uid']
    ];
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash');
        $connection->insert(
            'index_phash',
            $fields,
            ['cHashParams' => Connection::PARAM_LOB]
        );
    }
    // PROCESSING index_fulltext
    $fields = [
        'phash' => $hash['phash'],
        'fulltextdata' => implode(' ', $contentParts),
        'metaphonedata' => $this->metaphoneContent
    ];
    if ($this->indexerConfig['fullTextDataLength'] > 0) {
        // mb_strcut() keeps the byte limit but never cuts inside a multi-byte
        // utf-8 character, which a plain substr() could do
        $fields['fulltextdata'] = mb_strcut($fields['fulltextdata'], 0, (int)$this->indexerConfig['fullTextDataLength'], 'utf-8');
    }
    if (IndexedSearchUtility::isTableUsed('index_fulltext')) {
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_fulltext');
        $connection->insert('index_fulltext', $fields);
    }
    // PROCESSING index_debug
    if ($this->indexerConfig['debugMode']) {
        $fields = [
            'phash' => $hash['phash'],
            'debuginfo' => serialize([
                'cHashParams' => $subinfo,
                'contentParts' => array_merge($contentParts, ['body' => substr($contentParts['body'], 0, 1000)]),
                'logs' => $this->internal_log,
                'lexer' => $this->lexerObj->debugString
            ])
        ];
        if (IndexedSearchUtility::isTableUsed('index_debug')) {
            $connection = GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('index_debug');
            $connection->insert('index_debug', $fields);
        }
    }
}
1657
/**
 * Stores a gr_list record for a file unless index_grlist already holds one
 * for this phash whose hash_gr_list matches either the default gr_list or
 * the currently configured gr_list.
 *
 * @param int $hash phash value of file
 */
public function submitFile_grlist($hash)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }

    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getQueryBuilderForTable('index_grlist');
    $expr = $queryBuilder->expr();

    // Constraints are built in the same order as they appear in the query
    $matchesFile = $expr->eq(
        'phash',
        $queryBuilder->createNamedParameter($hash, \PDO::PARAM_INT)
    );
    $matchesDefaultGrList = $expr->eq(
        'hash_gr_list',
        $queryBuilder->createNamedParameter(
            IndexedSearchUtility::md5inthash($this->defaultGrList),
            \PDO::PARAM_INT
        )
    );
    $matchesCurrentGrList = $expr->eq(
        'hash_gr_list',
        $queryBuilder->createNamedParameter(
            IndexedSearchUtility::md5inthash($this->conf['gr_list']),
            \PDO::PARAM_INT
        )
    );

    $existingRecords = (int)$queryBuilder->count('*')
        ->from('index_grlist')
        ->where($matchesFile, $expr->orX($matchesDefaultGrList, $matchesCurrentGrList))
        ->execute()
        ->fetchColumn();

    // Only add a record if none of the two gr_lists is registered yet
    if ($existingRecords === 0) {
        $this->submit_grlist($hash, $hash);
    }
}
1703
/**
 * Registers a section entry for an indexed file unless one exists already.
 *
 * An entry counts as existing when index_section holds a row with the file's
 * phash and the page id from $this->conf['id']. Nothing happens when the
 * index_section table is not in use.
 *
 * @param int $hash phash value of file
 */
public function submitFile_section($hash)
{
    if (!IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }

    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getQueryBuilderForTable('index_section');

    $belongsToFile = $queryBuilder->expr()->eq(
        'phash',
        $queryBuilder->createNamedParameter($hash, \PDO::PARAM_INT)
    );
    $belongsToPage = $queryBuilder->expr()->eq(
        'page_id',
        $queryBuilder->createNamedParameter($this->conf['id'], \PDO::PARAM_INT)
    );

    $existingSections = (int)$queryBuilder->count('phash')
        ->from('index_section')
        ->where($belongsToFile, $belongsToPage)
        ->execute()
        ->fetchColumn();

    if ($existingSections !== 0) {
        return;
    }

    // The section is stored with the file's phash and the phash of the page being indexed.
    $this->submit_section($hash, $this->hash['phash']);
}
1737
/**
 * Deletes the index records stored for the given phash.
 *
 * Rows are removed from index_phash, index_grlist, index_fulltext and
 * index_debug (in that order); tables that are not in use are skipped.
 *
 * @param int $phash phash value to flush
 */
public function removeOldIndexedFiles($phash)
{
    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
    foreach (['index_phash', 'index_grlist', 'index_fulltext', 'index_debug'] as $tableName) {
        if (IndexedSearchUtility::isTableUsed($tableName)) {
            $connectionPool
                ->getConnectionForTable($tableName)
                ->delete($tableName, ['phash' => (int)$phash]);
        }
    }
}
1755
1756 /********************************
1757 *
1758 * SQL Helper functions
1759 *
1760 *******************************/
/**
 * Decides, based on mtime / tstamp of the index_phash record, whether the
 * currently processed page/file has to be (re-)indexed.
 *
 * Result codes (see $this->reasons); negative means "do not index", positive means "index":
 *  -2  the minimum age was not exceeded
 *  -1  mtime matched the indexed record, nothing changed
 *   1  the configured max-age was exceeded
 *   2  the minimum age was exceeded and the given mtime differs from the indexed one
 *   3  the minimum age was exceeded, but no mtime was given
 *   4  no index_phash record exists (or the table is not in use)
 *
 * Side effect for result -1: when no max-age is configured, the tstamp of the
 * index_phash record is refreshed via updateTstamp(); a TS log message is written either way.
 *
 * @param int $mtime mtime value to test against limits and indexed page (usually this is the mtime of the cached document)
 * @param int $phash "phash" used to select any already indexed page to see what its mtime is.
 * @return int Result code as listed above
 */
public function checkMtimeTstamp($mtime, $phash)
{
    // Without the index_phash table there cannot be an indexed record.
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return 4;
    }

    $indexedRecord = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->select(['item_mtime', 'tstamp'], 'index_phash', ['phash' => (int)$phash], [], [], 1)
        ->fetch();

    // Never indexed before.
    if (empty($indexedRecord)) {
        return 4;
    }

    // Max-age exceeded: index again regardless of mtime.
    if ($this->tstamp_maxAge && $indexedRecord['tstamp'] + $this->tstamp_maxAge < $GLOBALS['EXEC_TIME']) {
        return 1;
    }

    // mtime is only looked at when no min-age is set or the min-age has passed.
    $minAgePassed = !$this->tstamp_minAge
        || $indexedRecord['tstamp'] + $this->tstamp_minAge < $GLOBALS['EXEC_TIME'];
    if (!$minAgePassed) {
        return -2;
    }

    // No mtime to compare against: index.
    if (!$mtime) {
        return 3;
    }

    // mtime differs from the indexed one: index.
    if ($indexedRecord['item_mtime'] != $mtime) {
        return 2;
    }

    // mtime matched: no re-indexing. The timestamp is only refreshed when no max-age
    // is configured, otherwise the record would never reach its max-age.
    if ($this->tstamp_maxAge) {
        $this->log_setTSlogMessage('mtime matched, timestamp NOT updated because a maxAge is set (' . ($indexedRecord['tstamp'] + $this->tstamp_maxAge - $GLOBALS['EXEC_TIME']) . ' seconds to expire time).', 1);
    } else {
        $this->updateTstamp($phash);
        $this->log_setTSlogMessage('mtime matched, timestamp updated.', 1);
    }
    return -1;
}
1826
/**
 * Looks for an index_phash record with the current phash_grouping and content hash.
 *
 * The page only needs indexing when its content differs from what is already
 * stored for the same "phash_grouping".
 *
 * @return mixed TRUE if the page needs to be indexed (no matching record, or index_phash is not in use), otherwise the matching record as array with key "phash" - the phash the grlist record should be related to.
 */
public function checkContentHash()
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return true;
    }

    $criteria = [
        'phash_grouping' => (int)$this->hash['phash_grouping'],
        'contentHash' => (int)$this->content_md5h
    ];
    $matchingRecord = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->select(['phash'], 'index_phash', $criteria, [], [], 1)
        ->fetch();

    return empty($matchingRecord) ? true : $matchingRecord;
}
1857
/**
 * Checks whether an external document with the given grouping hash and content
 * hash is already present in index_phash.
 *
 * @param int $hashGr phash value to check (phash_grouping)
 * @param int $content_md5h Content hash to check
 * @return bool TRUE if the document needs to be indexed (no matching record found, or index_phash is not in use)
 */
public function checkExternalDocContentHash($hashGr, $content_md5h)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return true;
    }

    $criteria = [
        'phash_grouping' => (int)$hashGr,
        'contentHash' => (int)$content_md5h
    ];
    $matches = (int)GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->count('*', 'index_phash', $criteria);

    return $matches === 0;
}
1885
/**
 * Tells whether index_grlist contains at least one record for the given phash_x
 * (the "real" phash of the current content, not the linked-to phash of the common
 * search result page).
 *
 * @param int $phash_x Phash integer to test.
 * @return bool TRUE if a record exists; FALSE if none exists or index_grlist is not in use
 */
public function is_grlist_set($phash_x)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return false;
    }

    $entries = (int)GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist')
        ->count('phash_x', 'index_grlist', ['phash_x' => (int)$phash_x]);

    return $entries > 0;
}
1908
/**
 * Writes a grlist entry for the given phash and the current gr_list
 * ($this->conf['gr_list']) if no such entry exists yet.
 *
 * @param int $phash phash of the search result that should be found
 * @param int $phash_x The real phash of the current content. The two values are different when a page with userlogin turns out to contain the exact same content as another already indexed version of the page; This is the whole reason for the grlist table in fact...
 * @see submit_grlist()
 */
public function update_grlist($phash, $phash_x)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }

    $criteria = [
        'phash' => (int)$phash,
        'hash_gr_list' => IndexedSearchUtility::md5inthash($this->conf['gr_list'])
    ];
    $existingEntries = (int)GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist')
        ->count('phash', 'index_grlist', $criteria);

    if ($existingEntries !== 0) {
        return;
    }

    $this->submit_grlist($phash, $phash_x);
    $this->log_setTSlogMessage('Inserted gr_list \'' . $this->conf['gr_list'] . '\' for phash \'' . $phash . '\'', 1);
}
1936
/**
 * Sets the tstamp of an index_phash record to the current execution time
 * ($GLOBALS['EXEC_TIME']) and optionally stores a new item_mtime.
 * Does nothing when index_phash is not in use.
 *
 * @param int $phash phash value
 * @param int $mtime If set (non-empty), the item_mtime field is updated to this value as well.
 */
public function updateTstamp($phash, $mtime = 0)
{
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $changes = ['tstamp' => $GLOBALS['EXEC_TIME']];
        if ($mtime) {
            $changes['item_mtime'] = (int)$mtime;
        }

        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash');
        $connection->update('index_phash', $changes, ['phash' => (int)$phash]);
    }
}
1967
/**
 * Writes $this->conf['freeIndexSetId'] into the freeIndexSetId field of the
 * index_phash record. Does nothing when index_phash is not in use.
 *
 * @param int $phash phash value
 */
public function updateSetId($phash)
{
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash');
        $connection->update(
            'index_phash',
            ['freeIndexSetId' => (int)$this->conf['freeIndexSetId']],
            ['phash' => (int)$phash]
        );
    }
}
1991
/**
 * Stores the parse time in the index_phash record of the given phash.
 * Does nothing when index_phash is not in use.
 *
 * @param int $phash phash value.
 * @param int $parsetime Parsetime value to set.
 */
public function updateParsetime($phash, $parsetime)
{
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash');
        $connection->update(
            'index_phash',
            ['parsetime' => (int)$parsetime],
            ['phash' => (int)$phash]
        );
    }
}
2016
/**
 * Rewrites the rootline fields of all index_section records belonging to the
 * current page ($this->conf['id']), using the values from getRootLineFields().
 * Does nothing when index_section is not in use.
 */
public function updateRootline()
{
    if (IndexedSearchUtility::isTableUsed('index_section')) {
        // getRootLineFields() fills the array by reference.
        $rootLineFields = [];
        $this->getRootLineFields($rootLineFields);

        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_section');
        $connection->update(
            'index_section',
            $rootLineFields,
            ['page_id' => (int)$this->conf['id']]
        );
    }
}
2039
/**
 * Adds the values for the root-line fields to the given field array.
 *
 * rl0, rl1 and rl2 are always set from $this->conf['rootline_uids'] levels 0, 1 and 2.
 * Additional fields can be configured in
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields']
 * as a map of field name => rootline level.
 *
 * A level that does not exist in the rootline (e.g. a page located directly
 * below the root) yields 0 instead of triggering an "undefined offset" notice.
 *
 * @param array $fieldArray Field array, passed by reference
 */
public function getRootLineFields(array &$fieldArray)
{
    $rootLineUids = $this->conf['rootline_uids'] ?? [];

    $fieldArray['rl0'] = (int)($rootLineUids[0] ?? 0);
    $fieldArray['rl1'] = (int)($rootLineUids[1] ?? 0);
    $fieldArray['rl2'] = (int)($rootLineUids[2] ?? 0);

    $additionalFields = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'] ?? [];
    foreach ($additionalFields as $fieldName => $rootLineLevel) {
        $fieldArray[$fieldName] = (int)($rootLineUids[$rootLineLevel] ?? 0);
    }
}
2055
2056 /********************************
2057 *
2058 * SQL; Submitting words
2059 *
2060 *******************************/
/**
 * Adds the words of the list that are not yet known to the index_words table.
 *
 * The list is keyed by the base word; every entry provides at least "hash"
 * (stored as wid) and "metaphone". Nothing happens when index_words is not in
 * use or the list is empty.
 *
 * @param array $wordListArray Word List array (where each word has information about position etc).
 */
public function checkWordList($wordListArray)
{
    if (empty($wordListArray) || !IndexedSearchUtility::isTableUsed('index_words')) {
        return;
    }

    $expectedWords = count($wordListArray);
    $wordIds = array_map('intval', array_column($wordListArray, 'hash'));

    // First a cheap count: if every word id is known already there is nothing to insert.
    $countQuery = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('index_words');
    $knownWords = (int)$countQuery->count('baseword')
        ->from('index_words')
        ->where(
            $countQuery->expr()->in(
                'wid',
                $countQuery->createNamedParameter($wordIds, Connection::PARAM_INT_ARRAY)
            )
        )
        ->execute()
        ->fetchColumn();

    if ($knownWords === $expectedWords) {
        return;
    }

    // Fetch the base words that exist already and drop them from the list.
    $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('index_words');
    $selectQuery = $connection->createQueryBuilder();
    $statement = $selectQuery->select('baseword')
        ->from('index_words')
        ->where(
            $selectQuery->expr()->in(
                'wid',
                $selectQuery->createNamedParameter($wordIds, Connection::PARAM_INT_ARRAY)
            )
        )
        ->execute();

    $this->log_setTSlogMessage('Inserting words: ' . ($expectedWords - $knownWords), 1);
    while ($knownWord = $statement->fetch()) {
        unset($wordListArray[$knownWord['baseword']]);
    }

    // A duplicate-key error will occur here if an already stored word was NOT removed from the
    // list above. However, as long as the words in the list are NOT longer than 60 chars (the
    // baseword varchar is 60 characters...) this is not a problem.
    foreach ($wordListArray as $baseWord => $wordInfo) {
        $newRecord = [
            'wid' => $wordInfo['hash'],
            'baseword' => $baseWord,
            'metaphone' => $wordInfo['metaphone']
        ];
        $connection->insert('index_words', $newRecord);
    }
}
2121
/**
 * Stores the RELATIONS between words and a phash in index_rel.
 *
 * All existing relations of the phash are deleted first. Words whose wid is
 * flagged as stop word in index_words (is_stopword != 0) get no relation.
 * Nothing happens when index_rel is not in use.
 *
 * @param array $wordList Word list array
 * @param int $phash phash value
 */
public function submitWords($wordList, $phash)
{
    if (!IndexedSearchUtility::isTableUsed('index_rel')) {
        return;
    }
    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

    // Collect the ids of all stop words, keyed by wid for a fast lookup.
    $queryBuilder = $connectionPool->getQueryBuilderForTable('index_words');
    $statement = $queryBuilder->select('wid')
        ->from('index_words')
        ->where(
            $queryBuilder->expr()->neq('is_stopword', $queryBuilder->createNamedParameter(0, \PDO::PARAM_INT))
        )
        ->groupBy('wid')
        ->execute();
    $stopWordIds = [];
    while ($stopWord = $statement->fetch()) {
        $stopWordIds[$stopWord['wid']] = true;
    }

    // Drop the old relations of this phash before the new ones are written.
    $relationConnection = $connectionPool->getConnectionForTable('index_rel');
    $relationConnection->delete('index_rel', ['phash' => (int)$phash]);

    $relationRows = [];
    foreach ($wordList as $word) {
        if (!isset($stopWordIds[$word['hash']])) {
            // Value order has to match the column list passed to bulkInsert() below.
            $relationRows[] = [
                (int)$phash,
                (int)$word['hash'],
                (int)$word['count'],
                (int)$word['first'],
                $this->freqMap($word['count'] / $this->wordcount),
                $word['cmp'] & $this->flagBitMask
            ];
        }
    }

    if ($relationRows !== []) {
        $relationConnection->bulkInsert(
            'index_rel',
            $relationRows,
            ['phash', 'wid', 'count', 'first', 'freq', 'flags']
        );
    }
}
2170
/**
 * Maps a frequency given as real number in [0;1] to a value in [0;$this->freqRange],
 * where everything above $this->freqMax is capped at $this->freqRange - and back:
 * an input greater than 1 is divided by the same map factor instead.
 *
 * NOTE(review): the result is not cast, so a float may be returned although the
 * value is used as an integer frequency - confirm whether callers rely on that.
 *
 * @param float $freq Frequency
 * @return int|float Frequency in range.
 */
public function freqMap($freq)
{
    $mapFactor = $this->freqMax * 100 * $this->freqRange;

    if ($freq <= 1) {
        $mapped = $freq * $mapFactor;
        return $mapped > $this->freqRange ? $this->freqRange : $mapped;
    }

    return $freq / $mapFactor;
}
2189
2190 /********************************
2191 *
2192 * Hashing
2193 *
2194 *******************************/
/**
 * Calculates the search hashes for a TYPO3 page and stores them in $this->hash.
 *
 * - phash_grouping identifies a "page" as the combination of id, type, language,
 *   mountpoint, cHash parameters and static page arguments.
 * - phash additionally takes the gr_list into account (subdivision for pages whose
 *   composition depends on the login; such pages are expected to be similar
 *   regardless of the login most of the time).
 */
public function setT3Hashes()
{
    // The key order matters: the array is serialized before it is hashed.
    $pageIdentity = [
        'id' => (int)$this->conf['id'],
        'type' => (int)$this->conf['type'],
        'sys_lang' => (int)$this->conf['sys_language_uid'],
        'MP' => (string)$this->conf['MP'],
        'cHash' => $this->cHashParams,
        'staticPageArguments' => $this->conf['staticPageArguments'],
    ];
    $this->hash['phash_grouping'] = IndexedSearchUtility::md5inthash(serialize($pageIdentity));

    // gr_list is appended as last key for the plain phash.
    $pageIdentityWithGroups = $pageIdentity + ['gr_list' => (string)$this->conf['gr_list']];
    $this->hash['phash'] = IndexedSearchUtility::md5inthash(serialize($pageIdentityWithGroups));
}
2215
/**
 * Calculates the search hashes for an external file.
 *
 * phash_grouping is derived from the file name alone, phash from the file
 * name plus the sub information.
 *
 * @param string $file File name / path which identifies it on the server
 * @param array $subinfo Additional content identifying the (subpart of) content. For instance; PDF files are divided into groups of pages for indexing.
 * @return array Array with "phash_grouping" and "phash" inside.
 */
public function setExtHashes($file, $subinfo = [])
{
    // The key order matters: the arrays are serialized before they are hashed.
    $fileIdentity = ['file' => $file];
    $filePartIdentity = $fileIdentity + ['subinfo' => $subinfo];

    return [
        'phash_grouping' => IndexedSearchUtility::md5inthash(serialize($fileIdentity)),
        'phash' => IndexedSearchUtility::md5inthash(serialize($filePartIdentity)),
    ];
}
2237
2238 /*********************************
2239 *
2240 * Internal logging functions
2241 *
2242 *********************************/
/**
 * Forwards a push() call to the time tracker ($this->timeTracker).
 *
 * @param string $msg Title to set
 * @param string $key Key passed through to the time tracker unchanged
 */
public function log_push($msg, $key)
{
    $tracker = $this->timeTracker;
    $tracker->push($msg, $key);
}
2253
/**
 * Forwards a pull() call to the time tracker ($this->timeTracker).
 */
public function log_pull()
{
    $tracker = $this->timeTracker;
    $tracker->pull();
}
2261
/**
 * Passes a log message to the time tracker and additionally records it in
 * $this->internal_log.
 *
 * @param string $msg Message to set
 * @param int $errorNum Error number
 */
public function log_setTSlogMessage($msg, $errorNum = 0)
{
    $tracker = $this->timeTracker;
    $tracker->setTSlogMessage($msg, $errorNum);
    // Only the message text is kept internally, not the error number.
    $this->internal_log[] = $msg;
}
2273
/**
 * Makes sure that keywords are space-separated. This is important for
 * displaying them properly as a part of the fulltext index.
 *
 * The comma-separated list is split, every keyword is trimmed, and the keywords
 * are joined again with ", "; the result is wrapped in one leading and one
 * trailing space.
 *
 * @param string $keywordList
 * @return string
 * @see http://forge.typo3.org/issues/14959
 */
protected function addSpacesToKeywordList($keywordList)
{
    return ' ' . implode(', ', GeneralUtility::trimExplode(',', $keywordList)) . ' ';
}
2287 }