[!!!][TASK] Remove deprecated code from EXT:indexed_search
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / Classes / Indexer.php
1 <?php
2 namespace TYPO3\CMS\IndexedSearch;
3
4 /*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use TYPO3\CMS\Core\Configuration\ExtensionConfiguration;
18 use TYPO3\CMS\Core\Context\Context;
19 use TYPO3\CMS\Core\Context\LanguageAspect;
20 use TYPO3\CMS\Core\Core\Environment;
21 use TYPO3\CMS\Core\Database\Connection;
22 use TYPO3\CMS\Core\Database\ConnectionPool;
23 use TYPO3\CMS\Core\TimeTracker\TimeTracker;
24 use TYPO3\CMS\Core\Utility\GeneralUtility;
25 use TYPO3\CMS\Core\Utility\HttpUtility;
26 use TYPO3\CMS\Core\Utility\MathUtility;
27 use TYPO3\CMS\Core\Utility\PathUtility;
28 use TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController;
29 use TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility;
30
31 /**
32 * Indexing class for TYPO3 frontend
33 */
34 class Indexer
35 {
36
/**
 * Explanations for the status codes that decide whether a document is (re-)indexed.
 * indexTypo3PageContent() looks them up by the value returned from checkMtimeTstamp().
 *
 * @var array
 */
public $reasons = [
    -1 => 'mtime matched the document, so no changes detected and no content updated',
    -2 => 'The minimum age was not exceeded',
    1 => 'The configured max-age was exceeded for the document and thus it\'s indexed.',
    2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
    3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
    4 => 'Page has never been indexed (is not represented in the index_phash table).'
];

/**
 * HTML code blocks to exclude from indexing (comma separated list of tag names)
 *
 * @var string
 */
public $excludeSections = 'script,style';

/**
 * External parser objects for external files. Keys are file extension names,
 * values are objects with certain methods (see initializeExternalParsers()).
 *
 * @var array
 */
public $external_parsers = [];

/**
 * Fe-group list (pages might be indexed separately for each usergroup combination to support search
 * in access limited pages!)
 *
 * @var string
 */
public $defaultGrList = '0,-1';

/**
 * If set, this tells a number of seconds that is the maximum age of an indexed document.
 * Regardless of mtime the document will be re-indexed if this limit is exceeded.
 * Set in init() from the "maxAge" extension setting multiplied by 3600.
 *
 * @var int
 */
public $tstamp_maxAge = 0;

/**
 * If set, this tells a minimum limit before a document can be indexed again. This is regardless of mtime.
 * Set in init() from the "minAge" extension setting multiplied by 3600.
 *
 * @var int
 */
public $tstamp_minAge = 0;

/**
 * Max number of external files to index.
 *
 * @var int
 */
public $maxExternalFiles = 0;

/**
 * If TRUE, indexing is forced despite of hashes etc.
 *
 * @var bool
 */
public $forceIndexing = false;

/**
 * Set when crawler is detected (internal)
 *
 * @var bool
 */
public $crawlerActive = false;

/**
 * Empty template of the content parts; splitHTMLContent() starts from a copy of this array.
 *
 * @var array
 */
public $defaultContentArray = [
    'title' => '',
    'description' => '',
    'keywords' => '',
    'body' => ''
];

/**
 * @var int
 */
public $wordcount = 0;

/**
 * @var int
 */
public $externalFileCounter = 0;

/**
 * Configuration set internally (see init functions for required keys and their meaning)
 *
 * @var array
 */
public $conf = [];

/**
 * Indexer configuration, coming from TYPO3's system configuration for EXT:indexed_search
 *
 * @var array
 */
public $indexerConfig = [];

/**
 * Hash array, contains phash and phash_grouping
 *
 * @var array
 */
public $hash = [];

/**
 * Hash array for files
 *
 * @var array
 */
public $file_phash_arr = [];

/**
 * Content of TYPO3 page, split into parts by splitHTMLContent()
 *
 * @var array
 */
public $contentParts = [];

/**
 * Hash over the concatenated content parts, calculated in indexTypo3PageContent()
 * NOTE(review): the value comes from IndexedSearchUtility::md5inthash() - confirm whether "string" is the right type.
 *
 * @var string
 */
public $content_md5h = '';

/**
 * Internal log
 *
 * @var array
 */
public $internal_log = [];

/**
 * Content fetched for the URL most recently passed to indexExternalUrl()
 *
 * @var string
 */
public $indexExternalUrl_content = '';

/**
 * cHashparams array
 *
 * @var array
 */
public $cHashParams = [];

/**
 * @var int
 */
public $freqRange = 32000;

/**
 * @var float
 */
public $freqMax = 0.1;

/**
 * Set in init() from the "enableMetaphoneSearch" extension setting (TRUE when the setting does not exist)
 *
 * @var bool
 */
public $enableMetaphoneSearch = false;

/**
 * Set in init(): TRUE when metaphone search is enabled and the "index_words" table is not used
 *
 * @var bool
 */
public $storeMetaphoneInfoAsWords;

/**
 * @var string
 */
public $metaphoneContent = '';

/**
 * Metaphone object, if any
 *
 * @var \TYPO3\CMS\IndexedSearch\Utility\DoubleMetaPhoneUtility
 */
public $metaphoneObj;

/**
 * Lexer object for word splitting
 *
 * @var \TYPO3\CMS\IndexedSearch\Lexer
 */
public $lexerObj;

/**
 * Set in init() from the "flagBitMask" extension setting, forced into the integer range 0-255
 *
 * @var int
 */
public $flagBitMask;

/**
 * @var TimeTracker
 */
protected $timeTracker;
237
/**
 * Indexer constructor.
 *
 * Obtains the TimeTracker instance through GeneralUtility::makeInstance() and keeps it
 * in $this->timeTracker.
 */
public function __construct()
{
    $timeTracker = GeneralUtility::makeInstance(TimeTracker::class);
    $this->timeTracker = $timeTracker;
}
245
/**
 * Parent Object (TSFE) Initialization
 *
 * Decides whether the page rendered by the frontend controller is to be indexed and, if so,
 * collects the indexing configuration from the controller, initializes the indexer and
 * starts indexing the page content.
 *
 * @param TypoScriptFrontendController $pObj Parent Object, passed by reference
 */
public function hook_indexContent(&$pObj)
{
    // Indexer configuration from Extension Manager interface:
    $disableFrontendIndexing = (bool)GeneralUtility::makeInstance(ExtensionConfiguration::class)->get('indexed_search', 'disableFrontendIndexing');
    // Crawler activation:
    // Requirements are that the crawler is loaded, a crawler session is running and re-indexing requested as processing instruction.
    // The crawler keys may be absent from applicationData, so they are read without triggering
    // "undefined index" notices. The strict in_array() avoids loose matches of the string against non-string entries.
    $reindexRequestedByCrawler = \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler')
        && !empty($pObj->applicationData['tx_crawler']['running'])
        && in_array(
            'tx_indexedsearch_reindex',
            $pObj->applicationData['tx_crawler']['parameters']['procInstructions'] ?? [],
            true
        );
    if ($reindexRequestedByCrawler) {
        // Setting simple log message:
        $pObj->applicationData['tx_crawler']['log'][] = 'Forced Re-indexing enabled';
        // Crawler active flag
        $this->crawlerActive = true;
        $this->forceIndexing = true;
    }
    // Nothing to do (and nothing to log) when indexing is not enabled for the page
    if (empty($pObj->config['config']['index_enable'])) {
        return;
    }
    $this->log_push('Index page', '');
    if ($disableFrontendIndexing && !$this->crawlerActive) {
        $this->log_setTSlogMessage('Index page? No, Ordinary Frontend indexing during rendering is disabled.');
    } elseif (!empty($pObj->page['no_search'])) {
        $this->log_setTSlogMessage('Index page? No, The "No Search" flag has been set in the page properties!');
    } elseif ($pObj->no_cache) {
        $this->log_setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
    } else {
        /** @var LanguageAspect $languageAspect */
        $languageAspect = GeneralUtility::makeInstance(Context::class)->getAspect('language');
        if ($languageAspect->getId() !== $languageAspect->getContentId()) {
            $this->log_setTSlogMessage('Index page? No, languageId was different from contentId which indicates that the page contains fall-back content and that would be falsely indexed as localized content.');
        } else {
            // Setting up internal configuration from config array:
            $this->conf = [];
            // Information about page for which the indexing takes place
            // Page id
            $this->conf['id'] = $pObj->id;
            // Page type
            $this->conf['type'] = $pObj->type;
            // sys_language UID of the language of the indexing.
            $this->conf['sys_language_uid'] = $languageAspect->getId();
            // MP variable, if any (Mount Points)
            $this->conf['MP'] = $pObj->MP;
            // Group list
            $this->conf['gr_list'] = implode(',', GeneralUtility::makeInstance(Context::class)->getPropertyFromAspect('frontend.user', 'groupIds', [0, -1]));
            // cHash string for additional parameters
            $this->conf['cHash'] = $pObj->cHash;
            // Array of the additional parameters
            $this->conf['cHash_array'] = $pObj->cHash_array;
            // The creation date of the TYPO3 page
            $this->conf['crdate'] = $pObj->page['crdate'];

            // Root line uids
            $this->conf['rootline_uids'] = [];
            foreach ($pObj->config['rootLine'] as $rlkey => $rldat) {
                $this->conf['rootline_uids'][$rlkey] = $rldat['uid'];
            }
            // Content of page:
            // Content string (HTML of TYPO3 page)
            $this->conf['content'] = $pObj->content;
            // Alternative title for indexing
            $this->conf['indexedDocTitle'] = $pObj->convOutputCharset($pObj->indexedDocTitle);
            // Character set of content (will be converted to utf-8 during indexing)
            $this->conf['metaCharset'] = $pObj->metaCharset;
            // Most recent modification time (seconds) of the content on the page. Used to evaluate whether it should be re-indexed.
            $this->conf['mtime'] = $pObj->register['SYS_LASTCHANGED'] ?? $pObj->page['SYS_LASTCHANGED'];
            // Configuration of behavior:
            // Whether to index external documents like PDF, DOC etc. (if possible); null when not configured
            $this->conf['index_externals'] = $pObj->config['config']['index_externals'] ?? null;
            // Length of description text (max 250, default 200); null when not configured
            $this->conf['index_descrLgd'] = $pObj->config['config']['index_descrLgd'] ?? null;
            // Whether to index document keywords and description (enabled unless configured otherwise)
            $this->conf['index_metatags'] = $pObj->config['config']['index_metatags'] ?? true;
            // Set to zero:
            $this->conf['recordUid'] = 0;
            $this->conf['freeIndexUid'] = 0;
            $this->conf['freeIndexSetId'] = 0;
            // Init and start indexing:
            $this->init();
            $this->indexTypo3PageContent();
        }
    }
    $this->log_pull();
}
336
337 /****************************
338 *
339 * Backend API
340 *
341 ****************************/
/**
 * Initializing the "combined ID" of the page (phash) being indexed (or for which external media is attached)
 *
 * Fills $this->conf with the page identification and fixed behavior defaults, then calls init().
 *
 * @param int $id The page uid, &id=
 * @param int $type The page type, &type=
 * @param int $sys_language_uid sys_language uid, typically &L=
 * @param string $MP The MP variable (Mount Points), &MP=
 * @param array $uidRL Rootline array of only UIDs.
 * @param array $cHash_array Array of GET variables to register with this indexing
 * @param bool $createCHash If set, calculates a cHash value from the $cHash_array. Probably you will not do that since such cases are indexed through the frontend and the idea of this interface is to index non-cacheable pages from the backend!
 */
public function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array = [], $createCHash = false)
{
    // The cHash stays empty unless the caller explicitly asks for it to be calculated
    $cHash = '';
    if ($createCHash) {
        /** @var \TYPO3\CMS\Frontend\Page\CacheHashCalculator $cacheHashCalculator */
        $cacheHashCalculator = GeneralUtility::makeInstance(\TYPO3\CMS\Frontend\Page\CacheHashCalculator::class);
        $cHash = $cacheHashCalculator->generateForParameters(HttpUtility::buildQueryString($cHash_array));
    }
    // Setting up internal configuration:
    $this->conf = [
        // Information about page for which the indexing takes place
        // Page id (int)
        'id' => $id,
        // Page type (int)
        'type' => $type,
        // sys_language UID of the language of the indexing (int)
        'sys_language_uid' => $sys_language_uid,
        // MP variable, if any (Mount Points) (string)
        'MP' => $MP,
        // Group list (hardcoded for now...)
        'gr_list' => '0,-1',
        // cHash string for additional parameters
        'cHash' => $cHash,
        // Array of the additional parameters
        'cHash_array' => $cHash_array,
        // Set to defaults
        'freeIndexUid' => 0,
        'freeIndexSetId' => 0,
        // Root line uids
        'rootline_uids' => $uidRL,
        // Configuration of behavior:
        // Whether to index external documents like PDF, DOC etc. (if possible)
        'index_externals' => 1,
        // Length of description text (max 250, default 200)
        'index_descrLgd' => 200,
        // Whether to index document keywords and description (if present)
        'index_metatags' => true,
    ];
    // Initialize the indexer with this configuration:
    $this->init();
}
395
/**
 * Sets the free-index uid. Can be called right after backend_initIndexer()
 *
 * Both values are written into $this->conf under the keys "freeIndexUid" and "freeIndexSetId".
 *
 * @param int $freeIndexUid Free index UID
 * @param int $freeIndexSetId Set id - an integer identifying the "set" of indexing operations.
 */
public function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId = 0)
{
    $freeIndexSettings = [
        'freeIndexUid' => $freeIndexUid,
        'freeIndexSetId' => $freeIndexSetId,
    ];
    foreach ($freeIndexSettings as $key => $value) {
        $this->conf[$key] = $value;
    }
}
407
/**
 * Indexing records as the content of a TYPO3 page.
 *
 * Wraps the given texts into a minimal HTML document (title, keywords and description
 * meta tags, body), stores it in $this->conf['content'] and runs indexTypo3PageContent().
 *
 * NOTE(review): splitHTMLContent() keeps only the part of the <title> after the first ":".
 * As "indexedDocTitle" is set to an empty string here, a $title containing a colon is
 * indexed without the part before the colon.
 *
 * @param string $title Title equivalent
 * @param string $keywords Keywords equivalent
 * @param string $description Description equivalent
 * @param string $content The main content to index
 * @param string $charset The charset of the title, keyword, description and body-content. MUST BE VALID, otherwise nothing is indexed!
 * @param int $mtime Last modification time, in seconds
 * @param int $crdate The creation date of the content, in seconds
 * @param int $recordUid The record UID that the content comes from (for registration with the indexed rows)
 */
public function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate = 0, $recordUid = 0)
{
    // Most recent modification time (seconds) of the content
    $this->conf['mtime'] = $mtime;
    // The creation date of the TYPO3 content
    $this->conf['crdate'] = $crdate;
    // UID of the record, if applicable
    $this->conf['recordUid'] = $recordUid;
    // Content string (HTML of TYPO3 page): construct fake HTML for parsing.
    // All values are passed through htmlspecialchars() before being embedded.
    $this->conf['content'] = '
<html>
<head>
<title>' . htmlspecialchars($title) . '</title>
<meta name="keywords" content="' . htmlspecialchars($keywords) . '" />
<meta name="description" content="' . htmlspecialchars($description) . '" />
</head>
<body>
' . htmlspecialchars($content) . '
</body>
</html>';
    // Initializing charset:
    // Character set of content (will be converted to utf-8 during indexing)
    $this->conf['metaCharset'] = $charset;
    // Alternative title for indexing (none, the title is taken from the fake HTML)
    $this->conf['indexedDocTitle'] = '';
    // Index content as if it was a TYPO3 page:
    $this->indexTypo3PageContent();
}
450
451 /********************************
452 *
453 * Initialization
454 *
455 *******************************/
/**
 * Initializes the object. $this->conf MUST be set with proper values prior to this call!!!
 *
 * Prepares the cHash parameters, calculates the page hashes, reads the extension
 * configuration and sets up external parsers, lexer and (optionally) the metaphone object.
 */
public function init()
{
    // Take over the additional parameters registered for this indexing
    $this->cHashParams = $this->conf['cHash_array'];
    if (is_array($this->cHashParams) && $this->cHashParams !== []) {
        if ($this->conf['cHash']) {
            // Add this so that URL's come out right...
            $this->cHashParams['cHash'] = $this->conf['cHash'];
        }
        unset($this->cHashParams['encryptionKey']);
    }
    // Setting phash / phash_grouping which identifies the indexed page based on some of these variables:
    $this->setT3Hashes();
    // Indexer configuration from Extension Manager interface:
    $extensionConfiguration = GeneralUtility::makeInstance(ExtensionConfiguration::class);
    $this->indexerConfig = $extensionConfiguration->get('indexed_search');
    $settings = $this->indexerConfig;
    // The age settings are multiplied by 3600 and must not be negative
    $this->tstamp_minAge = MathUtility::forceIntegerInRange($settings['minAge'] * 3600, 0);
    $this->tstamp_maxAge = MathUtility::forceIntegerInRange($settings['maxAge'] * 3600, 0);
    $this->maxExternalFiles = MathUtility::forceIntegerInRange($settings['maxExternalFiles'], 0, 1000, 5);
    $this->flagBitMask = MathUtility::forceIntegerInRange($settings['flagBitMask'], 0, 255);
    // Workaround: If the extension configuration was not updated yet, the value is not existing
    // and metaphone search counts as enabled
    $this->enableMetaphoneSearch = (bool)($settings['enableMetaphoneSearch'] ?? true);
    $this->storeMetaphoneInfoAsWords = !IndexedSearchUtility::isTableUsed('index_words') && $this->enableMetaphoneSearch;
    // Initialize external document parsers:
    // Example configuration, see ext_localconf.php of this file!
    if ($this->conf['index_externals']) {
        $this->initializeExternalParsers();
    }
    // Initialize lexer (class that deconstructs the text into words):
    $lexerClassName = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['lexer'] ?: Lexer::class;
    $this->lexerObj = GeneralUtility::makeInstance($lexerClassName);
    $this->lexerObj->debug = $settings['debugMode'];
    // Initialize metaphone hook:
    // Make sure that the hook is loaded _after_ indexed_search as this may overwrite the hook depending on the configuration.
    if ($this->enableMetaphoneSearch) {
        $metaphoneClassName = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['metaphone'];
        if ($metaphoneClassName) {
            $this->metaphoneObj = GeneralUtility::makeInstance($metaphoneClassName);
            $this->metaphoneObj->pObj = $this;
        }
    }
}
497
/**
 * Initialize external parsers
 *
 * Creates one parser object per file extension registered in
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'] and keeps
 * only those whose initParser() call succeeds.
 *
 * @internal
 * @see init()
 */
public function initializeExternalParsers()
{
    $registeredParsers = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'] ?? [];
    foreach ($registeredParsers as $extension => $className) {
        $parser = GeneralUtility::makeInstance($className);
        $parser->pObj = $this;
        // Register the parser before initializing it ...
        $this->external_parsers[$extension] = $parser;
        // ... and if initParser() returns FALSE, unset its entry again:
        if (!$parser->initParser($extension)) {
            unset($this->external_parsers[$extension]);
        }
    }
}
515
516 /********************************
517 *
518 * Indexing; TYPO3 pages (HTML content)
519 *
520 *******************************/
/**
 * Start indexing of the TYPO3 page
 *
 * Indexing happens when checkMtimeTstamp() reports a reason (> 0), when the gr_list is not
 * registered for the phash yet, or when indexing is forced. If the content hash is already
 * known (and the max-age was not exceeded) only timestamp, set id, gr_list and rootline are updated.
 */
public function indexTypo3PageContent()
{
    $check = $this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']);
    $is_grlist = $this->is_grlist_set($this->hash['phash']);
    $indexingNeeded = $check > 0 || !$is_grlist || $this->forceIndexing;
    if (!$indexingNeeded) {
        $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
        return;
    }
    // Setting message:
    if ($this->forceIndexing) {
        $reasonMessage = 'Indexing needed, reason: Forced';
    } elseif ($check > 0) {
        $reasonMessage = 'Indexing needed, reason: ' . $this->reasons[$check];
    } else {
        $reasonMessage = 'Indexing needed, reason: Updates gr_list!';
    }
    $this->log_setTSlogMessage($reasonMessage, 1);
    // Divide into title,keywords,description and body:
    $this->log_push('Split content', '');
    $this->contentParts = $this->splitHTMLContent($this->conf['content']);
    if ($this->conf['indexedDocTitle']) {
        $this->contentParts['title'] = $this->conf['indexedDocTitle'];
    }
    $this->log_pull();
    // Calculating a hash over what is to be the actual page content. Maybe this hash should not include title,description and keywords? The bodytext is the primary concern. (on the other hand a changed page-title would make no difference then, so don't!)
    $this->content_md5h = IndexedSearchUtility::md5inthash(implode('', $this->contentParts));
    // This function checks if there is already a page (with gr_list = 0,-1) indexed and if that page has the very same contentHash.
    // If the contentHash is the same, then we can rest assured that this page is already indexed and regardless of mtime and origContent we don't need to do anything more.
    // This will also prevent pages from being indexed if a fe_users has logged in and it turns out that the page content is not changed anyway. fe_users logged in should always search with hash_gr_list = "0,-1" OR "[their_group_list]". This situation will be prevented only if the page has been indexed with no user login on before hand. Else the page will be indexed by users until that event. However that does not present a serious problem.
    $checkCHash = $this->checkContentHash();
    if (is_array($checkCHash) && $check !== 1) {
        // Content is already indexed: update the timestamp
        $this->updateTstamp($this->hash['phash'], $this->conf['mtime']);
        $this->updateSetId($this->hash['phash']);
        // $checkCHash['phash'] is the phash of the result row that is similar to the current phash regarding the content hash.
        $this->update_grlist($checkCHash['phash'], $this->hash['phash']);
        $this->updateRootline();
        $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $this->content_md5h . ', has not changed. Timestamp, grlist and rootline updated if necessary.');
        return;
    }
    // Start of the parse time measurement
    $parseStart = GeneralUtility::milliseconds();
    $this->log_push('Converting charset of content (' . $this->conf['metaCharset'] . ') to utf-8', '');
    $this->charsetEntity2utf8($this->contentParts, $this->conf['metaCharset']);
    $this->log_pull();
    // Splitting words
    $this->log_push('Extract words from content', '');
    $splitInWords = $this->processWordsInArrays($this->contentParts);
    $this->log_pull();
    // Analyze the indexed words.
    $this->log_push('Analyze the extracted words', '');
    $indexArr = $this->indexAnalyze($splitInWords);
    $this->log_pull();
    // Submitting page (phash) record
    $this->log_push('Submitting page', '');
    $this->submitPage();
    $this->log_pull();
    // Check words and submit to word list if not there
    $this->log_push('Check word list and submit words', '');
    if (IndexedSearchUtility::isTableUsed('index_words')) {
        $this->checkWordList($indexArr);
        $this->submitWords($indexArr, $this->hash['phash']);
    }
    $this->log_pull();
    // Set parsetime
    $this->updateParsetime($this->hash['phash'], GeneralUtility::milliseconds() - $parseStart);
    // Checking external files if configured for.
    $this->log_push('Checking external files', '');
    if ($this->conf['index_externals']) {
        $this->extractLinks($this->conf['content']);
    }
    $this->log_pull();
}
595
/**
 * Splits HTML content and returns an associative array, with title, a list of metatags, and a list of words in the body.
 *
 * NOTE(review): if the content contains no "<body", stristr() returns FALSE, the calculated
 * head length is 0 and the head part ends up empty, so title and meta tags of such
 * documents are not evaluated - confirm that this is intended.
 *
 * @param string $content HTML content to index. To some degree expected to be made by TYPO3 (i.e. splitting the header by ":")
 * @return array Array of content, having keys "title", "body", "keywords" and "description" set.
 * @see splitRegularContent()
 */
public function splitHTMLContent($content)
{
    // divide head from body ( u-ouh :) )
    $contentArr = $this->defaultContentArray;
    $contentArr['body'] = stristr($content, '<body');
    $headPart = substr($content, 0, -strlen($contentArr['body']));
    // get title; only the part after the first ":" is kept (if there is one)
    $this->embracingTags($headPart, 'TITLE', $contentArr['title'], $dummy2, $dummy);
    $titleParts = explode(':', $contentArr['title'], 2);
    $contentArr['title'] = trim($titleParts[1] ?? $titleParts[0]);
    // get keywords and description metatags
    if ($this->conf['index_metatags']) {
        // Collect the attribute strings of all meta tags; every match is cut out of $headPart
        $meta = [];
        $i = 0;
        while ($this->embracingTags($headPart, 'meta', $dummy, $headPart, $meta[$i])) {
            $i++;
        }
        foreach ($meta as $rawAttributes) {
            // The final, unsuccessful embracingTags() call leaves a NULL entry behind: stop there
            if (!isset($rawAttributes)) {
                break;
            }
            $attributes = GeneralUtility::get_tag_attributes($rawAttributes);
            // Meta tags without "name" or "content" (e.g. http-equiv or charset tags) are valid,
            // so missing attributes are treated as empty strings instead of raising notices
            $metaName = $attributes['name'] ?? '';
            $metaContent = $attributes['content'] ?? '';
            if (stristr($metaName, 'keywords')) {
                $contentArr['keywords'] .= ',' . $this->addSpacesToKeywordList($metaContent);
            }
            if (stristr($metaName, 'description')) {
                $contentArr['description'] .= ',' . $metaContent;
            }
        }
    }
    // Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags:
    $this->typoSearchTags($contentArr['body']);
    // Get rid of unwanted sections (ie. scripting and style stuff) in body
    $tagList = explode(',', $this->excludeSections);
    foreach ($tagList as $tag) {
        while ($this->embracingTags($contentArr['body'], $tag, $dummy, $contentArr['body'], $dummy2)) {
        }
    }
    // remove tags, but first make sure we don't concatenate words by doing it
    $contentArr['body'] = str_replace('<', ' <', $contentArr['body']);
    $contentArr['body'] = trim(strip_tags($contentArr['body']));
    $contentArr['keywords'] = trim($contentArr['keywords']);
    $contentArr['description'] = trim($contentArr['description']);
    // Return array
    return $contentArr;
}
647
/**
 * Extract the charset value from HTML meta tag.
 *
 * Only a <meta http-equiv="Content-Type" ...> tag is evaluated; the charset is read from
 * the "charset=..." part of that tag.
 *
 * @param string $content HTML content
 * @return string The charset value if found, otherwise an empty string.
 */
public function getHTMLcharset($content)
{
    if (preg_match('/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i', $content, $reg)
        && preg_match('/charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)/i', $reg[0], $reg2)
    ) {
        return $reg2[1];
    }
    // Always return a string as documented (previously NULL was returned implicitly)
    return '';
}
662
/**
 * Converts a HTML document to utf-8
 *
 * @param string $content HTML content, any charset
 * @param string $charset Optional charset (otherwise extracted from HTML)
 * @return string Converted HTML
 */
public function convertHTMLToUtf8($content, $charset = '')
{
    // Find charset; the cast keeps strtolower() from receiving NULL when no charset is detected
    $charset = $charset ?: $this->getHTMLcharset($content);
    $charset = trim(strtolower((string)$charset));
    // Convert charset:
    if ($charset && $charset !== 'utf-8') {
        $content = mb_convert_encoding($content, 'utf-8', $charset);
    }
    // Convert entities. The document is UTF-8 at this point, so the encoding is passed
    // explicitly instead of depending on the "default_charset" ini setting.
    // ENT_COMPAT | ENT_HTML401 are the flags PHP applied by default before PHP 8.1.
    return html_entity_decode($content, ENT_COMPAT | ENT_HTML401, 'UTF-8');
}
682
/**
 * Finds first occurrence of embracing tags and returns the embraced content and the original string with
 * the tag removed in the two passed variables. Returns FALSE if no match found. ie. useful for finding
 * <title> of document or removing <script>-sections
 *
 * If the start tag is found but the end tag is not, $tagContent is empty and $stringAfter
 * holds only what follows the start tag (the text before the start tag is dropped).
 *
 * @param string $string String to search in
 * @param string $tagName Tag name, eg. "script"
 * @param string $tagContent Passed by reference: Content inside found tag
 * @param string $stringAfter Passed by reference: Content after found tag
 * @param string $paramList Passed by reference: Attributes of the found tag.
 * @return bool Returns FALSE if tag was not found, otherwise TRUE.
 */
public function embracingTags($string, $tagName, &$tagContent, &$stringAfter, &$paramList)
{
    $endTag = '</' . $tagName . '>';
    $startTag = '<' . $tagName;
    // Case-insensitive search for the start tag; if it is not found, return FALSE
    $startPosition = stripos($string, $startTag);
    if ($startPosition === false) {
        return false;
    }
    $stringBefore = substr($string, 0, $startPosition);
    // Split what follows "<tagName" into the attribute list (up to the first ">") and the rest.
    // A start tag that is never closed with ">" has no rest: use an empty string instead of
    // reading a non-existing array offset.
    $parts = explode('>', substr($string, $startPosition + strlen($startTag)), 2);
    $paramList = $parts[0];
    $remainder = $parts[1] ?? '';
    $endPosition = stripos($remainder, $endTag);
    if ($endPosition !== false) {
        $tagContent = substr($remainder, 0, $endPosition);
        $stringAfter = $stringBefore . substr($remainder, $endPosition + strlen($endTag));
    } else {
        $tagContent = '';
        $stringAfter = $remainder;
    }
    return true;
}
717
/**
 * Removes content that shouldn't be indexed according to TYPO3SEARCH-tags.
 *
 * The body is split at every "<!--TYPO3SEARCH_" marker. Content following a "begin" marker
 * is kept; at an "end" marker the most recent chunk that did not start with a marker
 * (typically the content before the first marker) is kept.
 *
 * @param string $body HTML Content, passed by reference
 * @return bool Returns TRUE if a TYPOSEARCH_ tag was found, otherwise FALSE.
 */
public function typoSearchTags(&$body)
{
    $expBody = preg_split('/\\<\\!\\-\\-[\\s]?TYPO3SEARCH_/', $body);
    if (count($expBody) > 1) {
        $body = '';
        // Initialized so that an "end" marker never reads an unset variable
        $prev = '';
        foreach ($expBody as $val) {
            $part = explode('-->', $val, 2);
            if (trim($part[0]) === 'begin') {
                // $part[1] is missing when the chunk contains no "-->"
                $body .= $part[1] ?? '';
                $prev = '';
            } elseif (trim($part[0]) === 'end') {
                $body .= $prev;
            } else {
                $prev = $val;
            }
        }
        return true;
    }
    return false;
}
744
/**
 * Extract links (hrefs) from HTML content and if indexable media is found, it is indexed.
 *
 * Links with a scheme (and no local path) are handed to indexExternalUrl() when
 * "indexExternalURLs" is enabled. Links without a query string that resolve to an existing
 * local file are indexed directly or - when the crawler is used for external files - added
 * to the crawler queue.
 *
 * @param string $content HTML content
 */
public function extractLinks($content)
{
    // Get links:
    $list = $this->extractHyperLinks($content);
    // The crawler object only exists when it is configured for external files and loaded;
    // it is initialized explicitly so that the checks below never read an undefined variable
    $crawler = null;
    if (!empty($this->indexerConfig['useCrawlerForExternalFiles']) && \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler')) {
        $crawler = GeneralUtility::makeInstance(\tx_crawler_lib::class);
    }
    // Traverse links:
    foreach ($list as $linkInfo) {
        // Decode entities:
        // localPath means: This file is sent by a download script. While the indexed URL has to point to $linkInfo['href'], the absolute path to the file is specified here!
        $linkSource = htmlspecialchars_decode($linkInfo['localPath'] ?: $linkInfo['href']);
        // Parse URL (parse_url() returns FALSE for seriously malformed URLs):
        $qParts = parse_url($linkSource) ?: [];
        // Check for jumpurl (TYPO3 specific thing...)
        if (!empty($qParts['query']) && strstr($qParts['query'], 'jumpurl=')) {
            parse_str($qParts['query'], $getP);
            $linkSource = $getP['jumpurl'] ?? '';
            $qParts = parse_url($linkSource) ?: [];
        }
        if (!$linkInfo['localPath'] && !empty($qParts['scheme'])) {
            if (!empty($this->indexerConfig['indexExternalURLs'])) {
                // Index external URL (http or otherwise)
                $this->indexExternalUrl($linkSource);
            }
            continue;
        }
        // Links with a query string are not treated as local files
        if (!empty($qParts['query'])) {
            continue;
        }
        $linkSource = urldecode($linkSource);
        if (GeneralUtility::isAllowedAbsPath($linkSource)) {
            $localFile = $linkSource;
        } else {
            $localFile = GeneralUtility::getFileAbsFileName(Environment::getPublicPath() . '/' . $linkSource);
        }
        if (!$localFile || !@is_file($localFile)) {
            continue;
        }
        // Index local file:
        if (is_object($crawler)) {
            // Queue the file for the crawler; "alturl" is only passed for files with a local path
            $params = ['document' => $linkSource];
            if ($linkInfo['localPath']) {
                $params['alturl'] = $linkInfo['href'];
            }
            $params['conf'] = $this->conf;
            unset($params['conf']['content']);
            $crawler->addQueueEntry_callBack(0, $params, Hook\CrawlerFilesHook::class, $this->conf['id']);
            $this->log_setTSlogMessage('media "' . $params['document'] . '" added to "crawler" queue.', 1);
        } elseif ($linkInfo['localPath']) {
            // The file extension is an empty string when the path has none
            $ext = strtolower(pathinfo($linkSource, PATHINFO_EXTENSION));
            $this->indexRegularDocument($linkInfo['href'], false, $linkSource, $ext);
        } else {
            $this->indexRegularDocument($linkSource);
        }
    }
}
820
/**
 * Extracts all links to external documents from the HTML content string
 *
 * Links without a "href" and links whose "href" starts with "#" are skipped.
 *
 * @param string $html
 * @return array Array of hyperlinks (keys: tag, href, localPath (empty if not local))
 * @see extractLinks()
 */
public function extractHyperLinks($html)
{
    $htmlParser = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Html\HtmlParser::class);
    $htmlParts = $htmlParser->splitTags('a', $html);
    $hyperLinksData = [];
    foreach ($htmlParts as $index => $tagData) {
        // Only the entries on odd indexes are evaluated as tags
        // (presumably splitTags() puts the content between the tags on the even indexes)
        if ($index % 2 === 0) {
            continue;
        }
        $tagAttributes = $htmlParser->get_tag_attributes($tagData, true);
        $firstTagName = $htmlParser->getFirstTagName($tagData);
        if (strtolower($firstTagName) !== 'a') {
            continue;
        }
        // <a> tags without "href" (plain anchors) are valid: treat the attribute as empty
        // instead of reading a non-existing array key
        $href = $tagAttributes[0]['href'] ?? '';
        if ($href && $href[0] !== '#') {
            $hyperLinksData[] = [
                'tag' => $tagData,
                'href' => $href,
                'localPath' => $this->createLocalPath($href)
            ];
        }
    }
    return $hyperLinksData;
}
850
/**
 * Extracts the "base href" from content string.
 *
 * The first <base> tag carrying a truthy "href" attribute wins. <base> tags
 * without "href" (e.g. <base target="_blank">) are skipped.
 *
 * @param string $html Content to analyze
 * @return string The base href or an empty string if not found
 */
public function extractBaseHref($html)
{
    $href = '';
    $htmlParser = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Html\HtmlParser::class);
    foreach ($htmlParser->splitTags('base', $html) as $index => $tagData) {
        // Only the odd positions of the split result are treated as tags
        if ($index % 2 === 0) {
            continue;
        }
        $tagAttributes = $htmlParser->get_tag_attributes($tagData, true);
        if (strtolower($htmlParser->getFirstTagName($tagData)) !== 'base') {
            continue;
        }
        // A missing attribute must yield '' (as documented), not null plus a notice
        $href = (string)($tagAttributes[0]['href'] ?? '');
        if ($href) {
            break;
        }
    }
    return $href;
}
876
877 /******************************************
878 *
879 * Indexing; external URL
880 *
881 ******************************************/
/**
 * Index External URLs HTML content
 *
 * Fetches the headers first and only downloads and indexes the document if
 * its content type contains "text/html". The downloaded content is kept in
 * $this->indexExternalUrl_content and handed to indexRegularDocument() via
 * a temporary file.
 *
 * @param string $externalUrl URL, eg. "http://typo3.org/"
 * @see indexRegularDocument()
 */
public function indexExternalUrl($externalUrl)
{
    // Get headers:
    $urlHeaders = $this->getUrlHeaders($externalUrl);
    if (!is_array($urlHeaders)) {
        // No answer from the remote side, nothing to index
        return;
    }
    // HTTP header names are case-insensitive, so do not rely on the exact spelling "Content-Type"
    $contentType = '';
    foreach ($urlHeaders as $headerName => $headerValue) {
        if (strcasecmp((string)$headerName, 'Content-Type') === 0) {
            $contentType = (string)$headerValue;
            break;
        }
    }
    if (stripos($contentType, 'text/html') === false) {
        return;
    }
    $content = ($this->indexExternalUrl_content = GeneralUtility::getUrl($externalUrl));
    if ((string)$content === '') {
        return;
    }
    // Create temporary file:
    $tmpFile = GeneralUtility::tempnam('EXTERNAL_URL');
    if (!$tmpFile) {
        return;
    }
    try {
        GeneralUtility::writeFile($tmpFile, $content);
        // Index that file. Using "TRUE" for second parameter to force indexing
        // of external URLs (mtime doesn't make sense, does it?)
        $this->indexRegularDocument($externalUrl, true, $tmpFile, 'html');
    } finally {
        // Always clean up, even if indexing throws
        if (is_file($tmpFile)) {
            unlink($tmpFile);
        }
    }
}
907
/**
 * Getting HTTP request headers of URL
 *
 * Each non-empty response line is split at the first colon. Header values
 * are returned without surrounding whitespace. Lines without a colon (such
 * as the status line) are kept as key with a null value.
 *
 * @param string $url The URL
 * @return mixed If no answer, returns FALSE. Otherwise an array where HTTP headers are keys
 */
public function getUrlHeaders($url)
{
    // Try to get the headers only
    $content = GeneralUtility::getUrl($url, 2);
    if ((string)$content === '') {
        // Explicit FALSE as documented instead of an implicit null
        return false;
    }
    // Compile headers:
    $retVal = [];
    foreach (GeneralUtility::trimExplode(LF, $content, true) as $line) {
        if (trim($line) === '') {
            break;
        }
        $parts = explode(':', $line, 2);
        // The status line ("HTTP/1.1 200 OK") has no colon; avoid an undefined offset there
        $retVal[$parts[0]] = isset($parts[1]) ? trim($parts[1]) : null;
    }
    return $retVal;
}
932
/**
 * Resolves a link target to a local file, if possible.
 *
 * The resolver methods are tried in a fixed order; the first one that
 * yields a non-empty result determines the return value.
 *
 * @param string $sourcePath
 * @return string Absolute path to file if file is local, else empty string
 */
protected function createLocalPath($sourcePath)
{
    $resolvers = [
        'createLocalPathFromT3vars',
        'createLocalPathUsingAbsRefPrefix',
        'createLocalPathUsingDomainURL',
        'createLocalPathFromAbsoluteURL',
        'createLocalPathFromRelativeURL'
    ];
    $resolvedPath = '';
    foreach ($resolvers as $resolver) {
        $resolvedPath = $this->{$resolver}($sourcePath);
        if ($resolvedPath != '') {
            return $resolvedPath;
        }
    }
    return $resolvedPath;
}
957
/**
 * Looks the source path up in the file map registered in
 * $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'] (keyed by
 * the short MD5 of the source path). This is useful for various download
 * extensions that hide the actual file name but still want the file to be
 * indexed.
 *
 * @param string $sourcePath
 * @return string Registered file path if it exists as a file, else empty string
 */
protected function createLocalPathFromT3vars($sourcePath)
{
    $registeredFiles = $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'] ?? null;
    if (!is_array($registeredFiles)) {
        return '';
    }
    $lookupKey = GeneralUtility::shortMD5($sourcePath);
    // Note: not using self::isAllowedLocalFile here because this method
    // is allowed to index files outside of the web site (for example,
    // protected downloads)
    $isRegisteredFile = isset($registeredFiles[$lookupKey]) && is_file($registeredFiles[$lookupKey]);
    return $isRegisteredFile ? $registeredFiles[$lookupKey] : '';
}
981
/**
 * Attempts to create a local file path by matching a current request URL.
 *
 * If the source path starts with the site URL (TYPO3_SITE_URL), the
 * remainder is interpreted relative to the public web path.
 *
 * @param string $sourcePath
 * @return string Absolute local path, or empty string if not local / not allowed
 */
protected function createLocalPathUsingDomainURL($sourcePath)
{
    $baseURL = (string)GeneralUtility::getIndpEnv('TYPO3_SITE_URL');
    // An empty needle would warn on PHP 7 and match every path on PHP 8
    // (same guard as in createLocalPathUsingAbsRefPrefix())
    if ($baseURL === '' || strpos($sourcePath, $baseURL) !== 0) {
        return '';
    }
    $localPath = Environment::getPublicPath() . '/' . substr($sourcePath, strlen($baseURL));
    return self::isAllowedLocalFile($localPath) ? $localPath : '';
}
1002
/**
 * Attempts to create a local file path by matching absRefPrefix. This
 * requires TSFE. If TSFE is missing, or no absRefPrefix is configured,
 * this function does nothing.
 *
 * @param string $sourcePath
 * @return string Absolute local path, or empty string if not local / not allowed
 */
protected function createLocalPathUsingAbsRefPrefix($sourcePath)
{
    if (!isset($GLOBALS['TSFE']) || !($GLOBALS['TSFE'] instanceof TypoScriptFrontendController)) {
        return '';
    }
    // absRefPrefix is optional; avoid an undefined index notice when it is not set
    $absRefPrefix = (string)($GLOBALS['TSFE']->config['config']['absRefPrefix'] ?? '');
    if ($absRefPrefix === '' || strpos($sourcePath, $absRefPrefix) !== 0) {
        return '';
    }
    $localPath = Environment::getPublicPath() . '/' . substr($sourcePath, strlen($absRefPrefix));
    return self::isAllowedLocalFile($localPath) ? $localPath : '';
}
1026
/**
 * Attempts to create a local file path from the absolute URL without
 * schema, i.e. a path starting with "/". The path is interpreted relative
 * to the public web path.
 *
 * @param string $sourcePath
 * @return string Absolute local path, or empty string if not local / not allowed
 */
protected function createLocalPathFromAbsoluteURL($sourcePath)
{
    // strpos() instead of $sourcePath[0]: an empty string must not raise
    // an "uninitialized string offset" error
    if (strpos((string)$sourcePath, '/') !== 0) {
        return '';
    }
    $localPath = Environment::getPublicPath() . '/' . substr($sourcePath, 1);
    return self::isAllowedLocalFile($localPath) ? $localPath : '';
}
1046
/**
 * Interprets a relative URL as a path below the public web path.
 *
 * @param string $sourcePath
 * @return string Absolute local path, or empty string if the URL is not
 *                relative or the file is not allowed
 */
protected function createLocalPathFromRelativeURL($sourcePath)
{
    if (!self::isRelativeURL($sourcePath)) {
        return '';
    }
    $candidate = Environment::getPublicPath() . '/' . $sourcePath;
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1064
/**
 * Checks if URL is relative.
 *
 * A URL counts as relative when it has no scheme and its path does not
 * start with "/". URLs that parse_url() cannot parse are never relative.
 *
 * @param string $url
 * @return bool
 */
protected static function isRelativeURL($url)
{
    $urlParts = parse_url($url);
    if (!is_array($urlParts)) {
        // Seriously malformed URL: parse_url() returned false
        return false;
    }
    if (isset($urlParts['scheme']) && $urlParts['scheme'] !== '') {
        return false;
    }
    // "path" is absent for e.g. "?id=1"; treat that like an empty (relative) path
    $path = $urlParts['path'] ?? '';
    return $path === '' || $path[0] !== '/';
}
1076
/**
 * Checks if the path points to the file inside the web site
 *
 * Back paths ("..") are resolved first; the result must be an existing file
 * located below the public web path.
 *
 * @param string $filePath
 * @return bool
 */
protected static function isAllowedLocalFile($filePath)
{
    $filePath = GeneralUtility::resolveBackPath($filePath);
    // Compare against the directory including its trailing slash, otherwise
    // a sibling like "<publicPath>-backup/file.pdf" would pass the prefix check
    $publicPath = rtrim(Environment::getPublicPath(), '/') . '/';
    return strpos($filePath, $publicPath) === 0 && is_file($filePath);
}
1090
1091 /******************************************
1092 *
1093 * Indexing; external files (PDF, DOC, etc)
1094 *
1095 ******************************************/
/**
 * Indexing a regular document given as $file (relative to public web path, local file)
 *
 * The document is split into parts by the external parser registered for
 * its extension; each part is (re-)indexed if its mtime / content hash
 * requires it or if $force is set. A section record is written for every
 * part, even if the part itself is not re-indexed.
 *
 * @param string $file Relative Filename, relative to public web path. It can also be an absolute path as long as it is inside the lockRootPath (validated with \TYPO3\CMS\Core\Utility\GeneralUtility::isAbsPath()). Finally, if $contentTmpFile is set, this value can be anything, most likely a URL
 * @param bool $force If set, indexing is forced (despite content hashes, mtime etc).
 * @param string $contentTmpFile Temporary file with the content to read it from (instead of $file). Used when the $file is a URL.
 * @param string $altExtension File extension for temporary file.
 */
public function indexRegularDocument($file, $force = false, $contentTmpFile = '', $altExtension = '')
{
    // Init
    $fI = pathinfo($file);
    // pathinfo() omits the "extension" key for names without a dot
    $ext = $altExtension ?: strtolower($fI['extension'] ?? '');
    // Create abs-path:
    if (!$contentTmpFile) {
        if (!GeneralUtility::isAbsPath($file)) {
            // Relative, prepend public web path:
            $absFile = GeneralUtility::getFileAbsFileName(Environment::getPublicPath() . '/' . $file);
        } else {
            // Absolute, pass-through:
            $absFile = $file;
        }
        $absFile = GeneralUtility::isAllowedAbsPath($absFile) ? $absFile : '';
    } else {
        $absFile = $contentTmpFile;
    }
    if (!$absFile || !@is_file($absFile)) {
        $this->log_setTSlogMessage('Indexing not possible; File "' . $absFile . '" not found or valid.');
        return;
    }
    // empty() instead of a plain array access: unknown extensions must not raise a notice
    if (empty($this->external_parsers[$ext])) {
        $this->log_setTSlogMessage('Indexing not possible; The extension "' . $ext . '" was not supported.');
        return;
    }
    // Indexing the document:
    $fileInfo = stat($absFile);
    $cParts = $this->fileContentParts($ext, $absFile);
    foreach ($cParts as $cPKey) {
        $this->internal_log = [];
        $this->log_push('Index: ' . str_replace('.', '_', PathUtility::basename($file)) . ($cPKey ? '#' . $cPKey : ''), '');
        $Pstart = GeneralUtility::milliseconds();
        $subinfo = ['key' => $cPKey];
        // Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
        $phash_arr = ($this->file_phash_arr = $this->setExtHashes($file, $subinfo));
        $check = $this->checkMtimeTstamp($fileInfo['mtime'], $phash_arr['phash']);
        if ($check > 0 || $force) {
            if ($check > 0) {
                $this->log_setTSlogMessage('Indexing needed, reason: ' . $this->reasons[$check], 1);
            } else {
                $this->log_setTSlogMessage('Indexing forced by flag', 1);
            }
            // Check external file counter:
            if ($this->externalFileCounter < $this->maxExternalFiles || $force) {
                // Divide into title,keywords,description and body:
                $this->log_push('Split content', '');
                $contentParts = $this->readFileContent($ext, $absFile, $cPKey);
                $this->log_pull();
                if (is_array($contentParts)) {
                    // Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
                    // Separator first: the legacy implode($array, $separator) order was removed in PHP 8
                    $content_md5h = IndexedSearchUtility::md5inthash(implode('', $contentParts));
                    if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force) {
                        // Increment counter:
                        $this->externalFileCounter++;
                        // Splitting words
                        $this->log_push('Extract words from content', '');
                        $splitInWords = $this->processWordsInArrays($contentParts);
                        $this->log_pull();
                        // Analyze the indexed words.
                        $this->log_push('Analyze the extracted words', '');
                        $indexArr = $this->indexAnalyze($splitInWords);
                        $this->log_pull();
                        // Submitting page (phash) record
                        $this->log_push('Submitting page', '');
                        // Unfortunately I cannot determine WHEN a file is originally made - so I must return the modification time...
                        $this->submitFilePage($phash_arr, $file, $subinfo, $ext, $fileInfo['mtime'], $fileInfo['ctime'], $fileInfo['size'], $content_md5h, $contentParts);
                        $this->log_pull();
                        // Check words and submit to word list if not there
                        $this->log_push('Check word list and submit words', '');
                        if (IndexedSearchUtility::isTableUsed('index_words')) {
                            $this->checkWordList($indexArr);
                            $this->submitWords($indexArr, $phash_arr['phash']);
                        }
                        $this->log_pull();
                        // Set parsetime
                        $this->updateParsetime($phash_arr['phash'], GeneralUtility::milliseconds() - $Pstart);
                    } else {
                        // Update the timestamp
                        $this->updateTstamp($phash_arr['phash'], $fileInfo['mtime']);
                        $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $content_md5h . ', has not changed. Timestamp updated.');
                    }
                } else {
                    $this->log_setTSlogMessage('Could not index file! Unsupported extension.');
                }
            } else {
                $this->log_setTSlogMessage('The limit of ' . $this->maxExternalFiles . ' has already been exceeded, so no indexing will take place this time.');
            }
        } else {
            $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
        }
        // Checking and setting sections:
        // Setting a section-record for the file. This is done also if the file is not indexed. Notice that section records are deleted when the page is indexed.
        $this->submitFile_section($phash_arr['phash']);
        $this->log_pull();
    }
}
1201
/**
 * Delegates reading of an external file to the parser object registered
 * for the given file extension.
 * The content from the external parser MUST be returned in utf-8!
 *
 * @param string $fileExtension File extension, eg. "pdf", "doc" etc.
 * @param string $absoluteFileName Absolute filename of file (must exist and be validated OK before calling function)
 * @param string $sectionPointer Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be splitted.)
 * @return array|null Standard content array (title, description, keywords, body keys), null if no parser object is registered
 */
public function readFileContent($fileExtension, $absoluteFileName, $sectionPointer)
{
    $parser = $this->external_parsers[$fileExtension];
    if (!is_object($parser)) {
        return null;
    }
    return $parser->readFileContent($fileExtension, $absoluteFileName, $sectionPointer);
}
1220
/**
 * Asks the parser object registered for the extension into which sections
 * the document should be divided. Without a parser object the document is
 * treated as one single section (pointer 0).
 *
 * @param string $ext File extension
 * @param string $absFile Absolute filename (must exist and be validated OK before calling function)
 * @return array Array of pointers to sections that the document should be divided into
 */
public function fileContentParts($ext, $absFile)
{
    $parser = $this->external_parsers[$ext];
    return is_object($parser) ? $parser->fileContentParts($ext, $absFile) : [0];
}
1237
/**
 * Wraps non-HTML content (from external files for instance) into the
 * default content array, storing it under the "body" key.
 *
 * @param string $content Input content (non-HTML) to index.
 * @return array Array of content, having the key "body" set (plus "title", "description" and "keywords", but empty)
 * @see splitHTMLContent()
 */
public function splitRegularContent($content)
{
    return array_replace($this->defaultContentArray, ['body' => $content]);
}
1251
1252 /**********************************
1253 *
1254 * Analysing content, Extracting words
1255 *
1256 **********************************/
/**
 * Convert character set and HTML entities in the value of input content array keys
 *
 * Empty values are left untouched. All other values are converted to utf-8
 * (unless $charset already is "utf-8") and have their HTML entities decoded.
 *
 * @param array $contentArr Standard content array, modified in place
 * @param string $charset Charset of the input content (converted to utf-8)
 */
public function charsetEntity2utf8(&$contentArr, $charset)
{
    $needsConversion = $charset !== 'utf-8';
    foreach ($contentArr as $key => $value) {
        if ((string)$value === '') {
            continue;
        }
        // Convert charset if necessary
        if ($needsConversion) {
            $value = mb_convert_encoding($value, 'utf-8', $charset);
        }
        // Decode all numeric / html-entities in the string to real characters.
        // Flags and charset are given explicitly: the defaults differ between
        // PHP versions (before 8.1 single-quote entities were left encoded).
        $contentArr[$key] = html_entity_decode($value, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML401, 'UTF-8');
    }
}
1276
/**
 * Runs every part of the content array through the lexer and removes
 * duplicate words from the title, keywords and description parts.
 *
 * @param array $contentArr Array of content to index, see splitHTMLContent() and splitRegularContent()
 * @return array Content input array modified so each key is now an array of words
 */
public function processWordsInArrays($contentArr)
{
    // Split all parts to words
    foreach (array_keys($contentArr) as $part) {
        $contentArr[$part] = $this->lexerObj->split2Words($contentArr[$part]);
    }
    // For title, keywords, and description we don't want duplicates
    foreach (['title', 'keywords', 'description'] as $headerPart) {
        $contentArr[$headerPart] = array_unique($contentArr[$headerPart]);
    }
    return $contentArr;
}
1296
/**
 * Extracts the sample description text from the content array.
 *
 * The body is whitespace-normalized and cut to the length configured in
 * "index_descrLgd" (forced into the range 0-255, default 200). A length of
 * 0 disables the description.
 *
 * @param array $contentArr Content array
 * @return string Description string, empty if the configured length is 0
 */
public function bodyDescription($contentArr)
{
    // Initialize, so a configured length of 0 returns '' instead of an undefined variable
    $bodyDescription = '';
    // Setting description
    $maxL = MathUtility::forceIntegerInRange($this->conf['index_descrLgd'], 0, 255, 200);
    if ($maxL) {
        // preg_replace() returns null on invalid UTF-8 input, hence the cast
        $bodyDescription = (string)preg_replace('/\s+/u', ' ', $contentArr['body'] ?? '');
        // Shorten the string without cutting a multi-byte character in half:
        $bodyDescription = mb_strcut($bodyDescription, 0, $maxL, 'utf-8');
    }
    return $bodyDescription;
}
1314
/**
 * Builds the index array from the word lists of a content array.
 *
 * Title, keywords and description are analyzed as header information with
 * the bit offsets 7, 6 and 5; the body is analyzed last.
 *
 * @param array $content Standard content array: an array with the keys title,keywords,description and body, which all contain an array of words.
 * @return array Index Array (whatever that is...)
 */
public function indexAnalyze($content)
{
    $indexArr = [];
    $headerOffsets = ['title' => 7, 'keywords' => 6, 'description' => 5];
    foreach ($headerOffsets as $part => $offset) {
        $this->analyzeHeaderinfo($indexArr, $content, $part, $offset);
    }
    $this->analyzeBody($indexArr, $content);
    return $indexArr;
}
1330
/**
 * Calculates relevant information for headercontent
 *
 * For every word of $content[$key] the entry in $retArr gets its hash and
 * metaphone value (on first occurrence), the priority bit 2^$offset set in
 * "cmp" and its "count" increased. $this->wordcount is increased per word.
 *
 * @param array $retArr Index array, passed by reference
 * @param array $content Standard content array
 * @param string $key Key from standard content array
 * @param int $offset Bit-wise priority to type
 */
public function analyzeHeaderinfo(&$retArr, $content, $key, $offset)
{
    foreach ($content[$key] as $val) {
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        $val = substr($val, 0, 60);
        if (!isset($retArr[$val])) {
            // Word ID (wid)
            $retArr[$val]['hash'] = IndexedSearchUtility::md5inthash($val);
            // Metaphone value is also 60 only chars long
            $metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
            $retArr[$val]['metaphone'] = $metaphone;
        }
        // Build metaphone fulltext string (can be used for fulltext indexing)
        if ($this->storeMetaphoneInfoAsWords) {
            $this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
        }
        // Priority used for flagBitMask feature (see extension configuration).
        // "cmp" does not exist yet on the first occurrence of a word.
        $retArr[$val]['cmp'] = ($retArr[$val]['cmp'] ?? 0) | (2 ** $offset);
        // Increase number of occurrences ("count" starts at 0 when not set yet)
        $retArr[$val]['count'] = ($retArr[$val]['count'] ?? 0) + 1;
        $this->wordcount++;
    }
}
1362
/**
 * Calculates relevant information for bodycontent
 *
 * For every word of $content['body'] the entry in $retArr gets its first
 * position, hash and metaphone value (on first occurrence) and its "count"
 * increased. $this->wordcount is increased per word.
 *
 * @param array $retArr Index array, passed by reference
 * @param array $content Standard content array
 */
public function analyzeBody(&$retArr, $content)
{
    foreach ($content['body'] as $key => $val) {
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        $val = substr($val, 0, 60);
        if (!isset($retArr[$val])) {
            // First occurrence (used for ranking results)
            $retArr[$val]['first'] = $key;
            // Word ID (wid)
            $retArr[$val]['hash'] = IndexedSearchUtility::md5inthash($val);
            // Metaphone value is also only 60 chars long
            $metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
            $retArr[$val]['metaphone'] = $metaphone;
        }
        // Build metaphone fulltext string (can be used for fulltext indexing)
        if ($this->storeMetaphoneInfoAsWords) {
            $this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
        }
        // Increase number of occurrences ("count" starts at 0 when not set yet)
        $retArr[$val]['count'] = ($retArr[$val]['count'] ?? 0) + 1;
        $this->wordcount++;
    }
}
1392
/**
 * Computes the metaphone value of a word, either through the configured
 * metaphone object or, if none is set, through PHP's native metaphone().
 *
 * @param string $word Word to convert
 * @param bool $returnRawMetaphoneValue If set, returns the raw metaphone value (not hashed)
 * @return mixed Metaphone hash integer (0 for an empty metaphone value), or the raw value (string)
 */
public function metaphone($word, $returnRawMetaphoneValue = false)
{
    // Use native PHP function instead of advanced doubleMetaphone class if no object is available
    $rawValue = is_object($this->metaphoneObj)
        ? $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid'])
        : metaphone($word);
    if ($returnRawMetaphoneValue) {
        return $rawValue;
    }
    if ($rawValue === '') {
        return 0;
    }
    // Create hash and return integer
    return IndexedSearchUtility::md5inthash($rawValue);
}
1418
1419 /********************************
1420 *
1421 * SQL; TYPO3 Pages
1422 *
1423 *******************************/
/**
 * Updates db with information about the page (TYPO3 page, not external media)
 *
 * Existing records for the current phash are removed first. Afterwards the
 * index_phash, index_section, index_grlist and index_fulltext records are
 * written, plus an index_debug record if debug mode is enabled. Each table
 * is only written if IndexedSearchUtility::isTableUsed() allows it.
 */
public function submitPage()
{
    // Remove any current data for this phash:
    $this->removeOldIndexedPages($this->hash['phash']);
    // setting new phash_row
    $fields = [
        'phash' => $this->hash['phash'],
        'phash_grouping' => $this->hash['phash_grouping'],
        'cHashParams' => serialize($this->cHashParams),
        'contentHash' => $this->content_md5h,
        'data_page_id' => $this->conf['id'],
        'data_page_type' => $this->conf['type'],
        'data_page_mp' => $this->conf['MP'],
        'gr_list' => $this->conf['gr_list'],
        // TYPO3 page
        'item_type' => 0,
        'item_title' => $this->contentParts['title'],
        'item_description' => $this->bodyDescription($this->contentParts),
        'item_mtime' => (int)$this->conf['mtime'],
        'item_size' => strlen($this->conf['content']),
        'tstamp' => $GLOBALS['EXEC_TIME'],
        'crdate' => $GLOBALS['EXEC_TIME'],
        // Creation date of page
        'item_crdate' => $this->conf['crdate'],
        // Sys language uid of the page. Should reflect which language it DOES actually display!
        'sys_language_uid' => $this->conf['sys_language_uid'],
        'externalUrl' => 0,
        'recordUid' => (int)$this->conf['recordUid'],
        'freeIndexUid' => (int)$this->conf['freeIndexUid'],
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId']
    ];
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash');
        $connection->insert(
            'index_phash',
            $fields,
            ['cHashParams' => Connection::PARAM_LOB]
        );
    }
    // PROCESSING index_section
    $this->submit_section($this->hash['phash'], $this->hash['phash']);
    // PROCESSING index_grlist
    $this->submit_grlist($this->hash['phash'], $this->hash['phash']);
    // PROCESSING index_fulltext
    $fields = [
        'phash' => $this->hash['phash'],
        'fulltextdata' => implode(' ', $this->contentParts),
        'metaphonedata' => $this->metaphoneContent
    ];
    if ($this->indexerConfig['fullTextDataLength'] > 0) {
        // mb_strcut() limits by bytes like substr(), but never cuts a multi-byte
        // character in half (same approach as in bodyDescription())
        $fields['fulltextdata'] = mb_strcut($fields['fulltextdata'], 0, (int)$this->indexerConfig['fullTextDataLength'], 'utf-8');
    }
    if (IndexedSearchUtility::isTableUsed('index_fulltext')) {
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_fulltext');
        $connection->insert('index_fulltext', $fields);
    }
    // PROCESSING index_debug
    if ($this->indexerConfig['debugMode']) {
        $fields = [
            'phash' => $this->hash['phash'],
            'debuginfo' => serialize([
                'cHashParams' => $this->cHashParams,
                'external_parsers initialized' => array_keys($this->external_parsers),
                'conf' => array_merge($this->conf, ['content' => substr($this->conf['content'], 0, 1000)]),
                'contentParts' => array_merge($this->contentParts, ['body' => substr($this->contentParts['body'], 0, 1000)]),
                'logs' => $this->internal_log,
                'lexer' => $this->lexerObj->debugString
            ])
        ];
        if (IndexedSearchUtility::isTableUsed('index_debug')) {
            $connection = GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('index_debug');
            $connection->insert('index_debug', $fields);
        }
    }
}
1505
/**
 * Writes an index_grlist record for the current gr_list
 * ($this->conf['gr_list']), unless the table is not in use.
 *
 * @param int $hash Search result record phash
 * @param int $phash_x Actual phash of current content
 * @see update_grlist()
 */
public function submit_grlist($hash, $phash_x)
{
    $grListRecord = [
        'phash' => $hash,
        'phash_x' => $phash_x,
        'hash_gr_list' => IndexedSearchUtility::md5inthash($this->conf['gr_list']),
        'gr_list' => $this->conf['gr_list']
    ];
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }
    GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist')
        ->insert('index_grlist', $grListRecord);
}
1528
/**
 * Writes an index_section record for the current page ($this->conf['id']),
 * unless the table is not in use.
 * $hash and $hash_t3 are the same for TYPO3 pages, but different when it is external files.
 *
 * @param int $hash phash of TYPO3 parent search result record
 * @param int $hash_t3 phash of the file indexation search record
 */
public function submit_section($hash, $hash_t3)
{
    $sectionRecord = [
        'phash' => $hash,
        'phash_t3' => $hash_t3,
        'page_id' => (int)$this->conf['id']
    ];
    // NOTE(review): presumably adds the rootline fields to the record by reference - confirm in getRootLineFields()
    $this->getRootLineFields($sectionRecord);
    if (!IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }
    GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_section')
        ->insert('index_section', $sectionRecord);
}
1550
/**
 * Deletes all index records of the page identified by $phash from every
 * index table that is in use.
 *
 * @param int $phash phash value to flush
 */
public function removeOldIndexedPages($phash)
{
    $pool = GeneralUtility::makeInstance(ConnectionPool::class);
    // Removing old registrations for all tables. Because the pages are TYPO3 pages
    // there can be nothing else than 1-1 relations here.
    foreach (['index_phash', 'index_section', 'index_grlist', 'index_fulltext', 'index_debug'] as $tableName) {
        if (!IndexedSearchUtility::isTableUsed($tableName)) {
            continue;
        }
        $pool->getConnectionForTable($tableName)->delete($tableName, ['phash' => (int)$phash]);
    }

    if (!IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }
    // Removing all index_section records with hash_t3 set to this hash (this includes such
    // records set for external media on the page as well!). The re-insert of these records
    // are done in indexRegularDocument($file).
    $pool->getConnectionForTable('index_section')
        ->delete('index_section', ['phash_t3' => (int)$phash]);
}
1576
1577 /********************************
1578 *
1579 * SQL; External media
1580 *
1581 *******************************/
/**
 * Updates db with information about the file
 *
 * Existing records for the file's phash are removed first. Afterwards the
 * index_phash and index_fulltext records are written, plus an index_debug
 * record if debug mode is enabled. Each table is only written if
 * IndexedSearchUtility::isTableUsed() allows it.
 *
 * @param array $hash Array with phash and phash_grouping keys for file
 * @param string $file File name
 * @param array $subinfo Array of "cHashParams" for files: This is for instance the page index for a PDF file (other document types it will be a zero)
 * @param string $ext File extension determining the type of media.
 * @param int $mtime Modification time of file.
 * @param int $ctime Creation time of file.
 * @param int $size Size of file in bytes
 * @param int $content_md5h Content HASH value.
 * @param array $contentParts Standard content array (using only title and body for a file)
 */
public function submitFilePage($hash, $file, $subinfo, $ext, $mtime, $ctime, $size, $content_md5h, $contentParts)
{
    // Find item Type; fall back to the extension if the parser maps nothing for it:
    $storeItemType = ($this->external_parsers[$ext]->ext2itemtype_map[$ext] ?? '') ?: $ext;
    // Remove any current data for this phash:
    $this->removeOldIndexedFiles($hash['phash']);
    // Split filename:
    $fileParts = parse_url($file);
    // Setting new
    $fields = [
        'phash' => $hash['phash'],
        'phash_grouping' => $hash['phash_grouping'],
        'cHashParams' => serialize($subinfo),
        'contentHash' => $content_md5h,
        'data_filename' => $file,
        'item_type' => $storeItemType,
        'item_title' => trim($contentParts['title'] ?? '') ?: PathUtility::basename($file),
        'item_description' => $this->bodyDescription($contentParts),
        'item_mtime' => $mtime,
        'item_size' => $size,
        'item_crdate' => $ctime,
        'tstamp' => $GLOBALS['EXEC_TIME'],
        'crdate' => $GLOBALS['EXEC_TIME'],
        'gr_list' => $this->conf['gr_list'],
        // Local files have no "scheme" part (and parse_url() may even return false)
        'externalUrl' => !empty($fileParts['scheme']) ? 1 : 0,
        'recordUid' => (int)$this->conf['recordUid'],
        'freeIndexUid' => (int)$this->conf['freeIndexUid'],
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId'],
        'sys_language_uid' => (int)$this->conf['sys_language_uid']
    ];
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash');
        $connection->insert(
            'index_phash',
            $fields,
            ['cHashParams' => Connection::PARAM_LOB]
        );
    }
    // PROCESSING index_fulltext
    $fields = [
        'phash' => $hash['phash'],
        'fulltextdata' => implode(' ', $contentParts),
        'metaphonedata' => $this->metaphoneContent
    ];
    if ($this->indexerConfig['fullTextDataLength'] > 0) {
        // mb_strcut() limits by bytes like substr(), but never cuts a multi-byte
        // character in half (same approach as in bodyDescription())
        $fields['fulltextdata'] = mb_strcut($fields['fulltextdata'], 0, (int)$this->indexerConfig['fullTextDataLength'], 'utf-8');
    }
    if (IndexedSearchUtility::isTableUsed('index_fulltext')) {
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_fulltext');
        $connection->insert('index_fulltext', $fields);
    }
    // PROCESSING index_debug
    if ($this->indexerConfig['debugMode']) {
        $fields = [
            'phash' => $hash['phash'],
            'debuginfo' => serialize([
                'cHashParams' => $subinfo,
                'contentParts' => array_merge($contentParts, ['body' => substr($contentParts['body'], 0, 1000)]),
                'logs' => $this->internal_log,
                'lexer' => $this->lexerObj->debugString
            ])
        ];
        if (IndexedSearchUtility::isTableUsed('index_debug')) {
            $connection = GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('index_debug');
            $connection->insert('index_debug', $fields);
        }
    }
}
1667
/**
 * Stores a gr_list record for a file, unless index_grlist already holds a
 * record for this phash with either the default gr_list
 * ($this->defaultGrList) or the current gr_list ($this->conf['gr_list']).
 *
 * @param int $hash phash value of file
 */
public function submitFile_grlist($hash)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }

    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getQueryBuilderForTable('index_grlist');
    $expr = $queryBuilder->expr();

    $matchesPhash = $expr->eq(
        'phash',
        $queryBuilder->createNamedParameter($hash, \PDO::PARAM_INT)
    );
    $matchesDefaultGrList = $expr->eq(
        'hash_gr_list',
        $queryBuilder->createNamedParameter(
            IndexedSearchUtility::md5inthash($this->defaultGrList),
            \PDO::PARAM_INT
        )
    );
    $matchesCurrentGrList = $expr->eq(
        'hash_gr_list',
        $queryBuilder->createNamedParameter(
            IndexedSearchUtility::md5inthash($this->conf['gr_list']),
            \PDO::PARAM_INT
        )
    );

    // Testing if there is a gr_list record for a non-logged in user and if so,
    // there is no need to place another one.
    $existingRecords = (int)$queryBuilder
        ->count('*')
        ->from('index_grlist')
        ->where($matchesPhash, $expr->orX($matchesDefaultGrList, $matchesCurrentGrList))
        ->execute()
        ->fetchColumn();

    if ($existingRecords !== 0) {
        return;
    }
    $this->submit_grlist($hash, $hash);
}
1713
/**
 * Makes sure a section record exists for an indexed file.
 *
 * Looks for an index_section row that combines the given phash with the page id of
 * the current configuration. If none is found, submit_section() is called with the
 * file phash and the phash stored in $this->hash.
 * Nothing happens when the index_section table is not in use.
 *
 * @param int $hash phash value of file
 */
public function submitFile_section($hash)
{
    if (!IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }

    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getQueryBuilderForTable('index_section');
    $expr = $queryBuilder->expr();

    $samePhash = $expr->eq(
        'phash',
        $queryBuilder->createNamedParameter($hash, \PDO::PARAM_INT)
    );
    $samePage = $expr->eq(
        'page_id',
        $queryBuilder->createNamedParameter($this->conf['id'], \PDO::PARAM_INT)
    );

    $existingSections = (int)$queryBuilder
        ->count('phash')
        ->from('index_section')
        ->where($samePhash, $samePage)
        ->execute()
        ->fetchColumn();

    // A section is already registered for this file on this page.
    if ($existingSections !== 0) {
        return;
    }

    $this->submit_section($hash, $this->hash['phash']);
}
1747
/**
 * Deletes the rows belonging to $phash from index_phash, index_grlist,
 * index_fulltext and index_debug. Tables that are not in use are left untouched.
 *
 * @param int $phash phash value to flush
 */
public function removeOldIndexedFiles($phash)
{
    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

    foreach (['index_phash', 'index_grlist', 'index_fulltext', 'index_debug'] as $table) {
        if (IndexedSearchUtility::isTableUsed($table)) {
            $connectionPool
                ->getConnectionForTable($table)
                ->delete($table, ['phash' => (int)$phash]);
        }
    }
}
1765
1766 /********************************
1767 *
1768 * SQL Helper functions
1769 *
1770 *******************************/
/**
 * Decides, based on the index_phash record of $phash, whether the current page/file
 * has to be (re-)indexed. The returned codes correspond to the keys of $this->reasons.
 *
 * Side effect: when the mtime matches and no max age is configured, the tstamp of
 * the index_phash record is refreshed through updateTstamp().
 *
 * @param int $mtime mtime value to test against limits and indexed page (usually this is the mtime of the cached document)
 * @param int $phash "phash" used to select any already indexed page to see what its mtime is.
 * @return int Negative values mean "do not index", positive values mean "index":
 *             -2) the minimum age was not exceeded,
 *             -1) mtime matched the indexed record,
 *              1) the configured max age was exceeded,
 *              2) mtime differs from the indexed record,
 *              3) no mtime was given,
 *              4) no indexed record was found (or index_phash is not in use).
 */
public function checkMtimeTstamp($mtime, $phash)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        // Not indexed (not in index_phash)
        return 4;
    }

    $row = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->select(
            ['item_mtime', 'tstamp'],
            'index_phash',
            ['phash' => (int)$phash],
            [],
            [],
            1
        )
        ->fetch();

    if (empty($row)) {
        // Page has never been indexed (is not represented in the index_phash table).
        return 4;
    }

    if ($this->tstamp_maxAge && $row['tstamp'] + $this->tstamp_maxAge < $GLOBALS['EXEC_TIME']) {
        // The configured max-age was exceeded for the document and thus it is indexed.
        return 1;
    }

    if ($this->tstamp_minAge && $row['tstamp'] + $this->tstamp_minAge >= $GLOBALS['EXEC_TIME']) {
        // A minimum age is set and it was not exceeded yet.
        return -2;
    }

    // From here on the minimum age is either not set or exceeded, so mtime decides.
    if (!$mtime) {
        // No mtime was given, so the page is indexed.
        return 3;
    }

    if ($row['item_mtime'] != $mtime) {
        // mtime differs from the one stored in index_phash, so it is time to re-index.
        return 2;
    }

    // mtime matched the document: no changes detected and no content updated.
    if ($this->tstamp_maxAge) {
        $this->log_setTSlogMessage('mtime matched, timestamp NOT updated because a maxAge is set (' . ($row['tstamp'] + $this->tstamp_maxAge - $GLOBALS['EXEC_TIME']) . ' seconds to expire time).', 1);
    } else {
        $this->updateTstamp($phash);
        $this->log_setTSlogMessage('mtime matched, timestamp updated.', 1);
    }

    return -1;
}
1836
/**
 * Looks up an index_phash record that has the same "phash_grouping" and the same
 * content hash as the current page. With this check the page is only indexed if its
 * content differs from what is stored for the same "phash_grouping".
 *
 * @return mixed TRUE if the page needs to be indexed (no matching record, or index_phash is not in use), otherwise the matching row as an array holding the "phash" value to which the grlist record should be related.
 */
public function checkContentHash()
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return true;
    }

    $identifiers = [
        'phash_grouping' => (int)$this->hash['phash_grouping'],
        'contentHash' => (int)$this->content_md5h
    ];
    $row = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->select(['phash'], 'index_phash', $identifiers, [], [], 1)
        ->fetch();

    return empty($row) ? true : $row;
}
1867
/**
 * Content hash check for external documents: counts the index_phash records having
 * the given "phash_grouping" and content hash.
 *
 * @param int $hashGr phash value to check (phash_grouping)
 * @param int $content_md5h Content hash to check
 * @return bool TRUE if the document needs to be indexed, i.e. no matching record exists or index_phash is not in use
 */
public function checkExternalDocContentHash($hashGr, $content_md5h)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return true;
    }

    $identifiers = [
        'phash_grouping' => (int)$hashGr,
        'contentHash' => (int)$content_md5h
    ];
    $matches = (int)GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->count('*', 'index_phash', $identifiers);

    return $matches === 0;
}
1895
/**
 * Tells whether index_grlist holds at least one record for the given phash_x value
 * (the "real" phash of the current content, not the linked-to phash of the common
 * search result page).
 *
 * @param int $phash_x Phash integer to test.
 * @return bool TRUE if a record exists; FALSE if not or if index_grlist is not in use
 */
public function is_grlist_set($phash_x)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return false;
    }

    $records = (int)GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist')
        ->count('phash_x', 'index_grlist', ['phash_x' => (int)$phash_x]);

    return $records > 0;
}
1918
/**
 * Writes a grlist entry for the given phash and the gr_list of the current
 * configuration, unless such an entry exists already. A log message is recorded
 * when an entry is inserted. Nothing happens when index_grlist is not in use.
 *
 * @param int $phash phash of the search result that should be found
 * @param int $phash_x The real phash of the current content. The two values are different when a page with userlogin turns out to contain the exact same content as another already indexed version of the page; this is the whole reason for the grlist table.
 * @see submit_grlist()
 */
public function update_grlist($phash, $phash_x)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }

    $identifiers = [
        'phash' => (int)$phash,
        'hash_gr_list' => IndexedSearchUtility::md5inthash($this->conf['gr_list'])
    ];
    $existingEntries = (int)GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist')
        ->count('phash', 'index_grlist', $identifiers);

    if ($existingEntries !== 0) {
        return;
    }

    $this->submit_grlist($phash, $phash_x);
    $this->log_setTSlogMessage('Inserted gr_list \'' . $this->conf['gr_list'] . '\' for phash \'' . $phash . '\'', 1);
}
1946
/**
 * Sets the tstamp of the index_phash record of $phash to the current execution time
 * and, if $mtime is given, stores it as item_mtime as well.
 * Nothing happens when index_phash is not in use.
 *
 * @param int $phash phash value
 * @param int $mtime If set, update the mtime field to this value.
 */
public function updateTstamp($phash, $mtime = 0)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return;
    }

    $values = ['tstamp' => $GLOBALS['EXEC_TIME']];
    if ($mtime) {
        $values['item_mtime'] = (int)$mtime;
    }

    $connection = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash');
    $connection->update('index_phash', $values, ['phash' => (int)$phash]);
}
1977
/**
 * Writes the freeIndexSetId of the current configuration into the index_phash
 * record of $phash. Nothing happens when index_phash is not in use.
 *
 * @param int $phash phash value
 */
public function updateSetId($phash)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return;
    }

    $values = ['freeIndexSetId' => (int)$this->conf['freeIndexSetId']];
    $connection = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash');
    $connection->update('index_phash', $values, ['phash' => (int)$phash]);
}
2001
/**
 * Stores the given parsetime in the index_phash record of $phash.
 * Nothing happens when index_phash is not in use.
 *
 * @param int $phash phash value.
 * @param int $parsetime Parsetime value to set.
 */
public function updateParsetime($phash, $parsetime)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return;
    }

    $values = ['parsetime' => (int)$parsetime];
    $connection = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash');
    $connection->update('index_phash', $values, ['phash' => (int)$phash]);
}
2026
/**
 * Refreshes the rootline fields (as collected by getRootLineFields()) of all
 * index_section records that belong to the page id of the current configuration.
 * Nothing happens when index_section is not in use.
 */
public function updateRootline()
{
    if (!IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }

    $rootLineValues = [];
    $this->getRootLineFields($rootLineValues);

    $connection = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_section');
    $connection->update('index_section', $rootLineValues, ['page_id' => (int)$this->conf['id']]);
}
2049
/**
 * Adding values for root-line fields.
 * rl0, rl1 and rl2 are standard. Additional fields can be registered in
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields']
 * as "field name => rootline level" pairs.
 *
 * A level that is not present in $this->conf['rootline_uids'] (e.g. because the
 * rootline of the page is shorter) results in 0 for the respective field.
 *
 * @param array $fieldArray Field array, passed by reference
 */
public function getRootLineFields(array &$fieldArray)
{
    $rootLineUids = $this->conf['rootline_uids'] ?? [];

    // "?? 0" avoids "undefined offset" notices for missing levels; the (int) cast
    // of a missing value yielded 0 before as well, so the stored values are unchanged.
    $fieldArray['rl0'] = (int)($rootLineUids[0] ?? 0);
    $fieldArray['rl1'] = (int)($rootLineUids[1] ?? 0);
    $fieldArray['rl2'] = (int)($rootLineUids[2] ?? 0);
    foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'] ?? [] as $fieldName => $rootLineLevel) {
        $fieldArray[$fieldName] = (int)($rootLineUids[$rootLineLevel] ?? 0);
    }
}
2065
2066 /********************************
2067 *
2068 * SQL; Submitting words
2069 *
2070 *******************************/
/**
 * Adds the words of the list that are not yet stored in index_words.
 *
 * First the number of index_words rows matching the word hashes is compared with the
 * size of the list. Only if the numbers differ, the basewords already stored are
 * fetched and removed from the list, and every remaining word is inserted.
 * Nothing happens when index_words is not in use or the list is empty.
 *
 * @param array $wordListArray Word List array (where each word has information about position etc).
 */
public function checkWordList($wordListArray)
{
    if (!IndexedSearchUtility::isTableUsed('index_words') || empty($wordListArray)) {
        return;
    }

    $expectedWords = count($wordListArray);
    $wordIds = array_map('intval', array_column($wordListArray, 'hash'));

    $countQuery = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('index_words');
    $storedWords = (int)$countQuery
        ->count('baseword')
        ->from('index_words')
        ->where(
            $countQuery->expr()->in(
                'wid',
                $countQuery->createNamedParameter($wordIds, Connection::PARAM_INT_ARRAY)
            )
        )
        ->execute()
        ->fetchColumn();

    if ($storedWords === $expectedWords) {
        // Every word of the list seems to be stored already.
        return;
    }

    $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('index_words');
    $selectQuery = $connection->createQueryBuilder();
    $storedRows = $selectQuery
        ->select('baseword')
        ->from('index_words')
        ->where(
            $selectQuery->expr()->in(
                'wid',
                $selectQuery->createNamedParameter($wordIds, Connection::PARAM_INT_ARRAY)
            )
        )
        ->execute();

    $this->log_setTSlogMessage('Inserting words: ' . ($expectedWords - $storedWords), 1);

    // Drop the words that are known already, so only new ones remain in the list.
    while ($storedRow = $storedRows->fetch()) {
        unset($wordListArray[$storedRow['baseword']]);
    }

    // A duplicate-key error will occur here if a stored word was NOT removed from the list above.
    // However, as long as the words are NOT longer than 60 chars (the baseword varchar is
    // 60 characters...) this is not a problem.
    foreach ($wordListArray as $baseword => $wordData) {
        $connection->insert(
            'index_words',
            [
                'wid' => $wordData['hash'],
                'baseword' => $baseword,
                'metaphone' => $wordData['metaphone']
            ]
        );
    }
}
2131
/**
 * Submits RELATIONS between words and phash.
 *
 * All index_rel rows of the phash are deleted and then re-created from the word list
 * with a single bulk insert. Words whose hash is flagged as stop word in index_words
 * (is_stopword != 0) are left out. Nothing happens when index_rel is not in use.
 *
 * @param array $wordList Word list array
 * @param int $phash phash value
 */
public function submitWords($wordList, $phash)
{
    if (!IndexedSearchUtility::isTableUsed('index_rel')) {
        return;
    }

    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

    // Collect the word ids of all stop words.
    $stopWordQuery = $connectionPool->getQueryBuilderForTable('index_words');
    $stopWordResult = $stopWordQuery
        ->select('wid')
        ->from('index_words')
        ->where(
            $stopWordQuery->expr()->neq('is_stopword', $stopWordQuery->createNamedParameter(0, \PDO::PARAM_INT))
        )
        ->groupBy('wid')
        ->execute();

    $isStopWord = [];
    while ($stopWordRow = $stopWordResult->fetch()) {
        $isStopWord[$stopWordRow['wid']] = true;
    }

    $relationConnection = $connectionPool->getConnectionForTable('index_rel');
    $relationConnection->delete('index_rel', ['phash' => (int)$phash]);

    $relations = [];
    foreach ($wordList as $word) {
        if (!isset($isStopWord[$word['hash']])) {
            // NOTE(review): the division presumes $this->wordcount is non-zero whenever
            // $wordList holds entries - confirm against the caller.
            $relations[] = [
                (int)$phash,
                (int)$word['hash'],
                (int)$word['count'],
                (int)$word['first'],
                $this->freqMap($word['count'] / $this->wordcount),
                $word['cmp'] & $this->flagBitMask
            ];
        }
    }

    if (!empty($relations)) {
        $relationConnection->bulkInsert(
            'index_rel',
            $relations,
            ['phash', 'wid', 'count', 'first', 'freq', 'flags']
        );
    }
}
2180
/**
 * Maps a frequency using the factor $this->freqMax * 100 * $this->freqRange.
 *
 * Values up to 1 are multiplied by the factor and capped at $this->freqRange;
 * values above 1 are divided by the factor (the reverse direction).
 *
 * @param float $freq Frequency
 * @return int|float Mapped frequency; the value is not cast to an integer here.
 */
public function freqMap($freq)
{
    $mapFactor = $this->freqMax * 100 * $this->freqRange;

    if ($freq <= 1) {
        $scaled = $freq * $mapFactor;
        if ($scaled > $this->freqRange) {
            return $this->freqRange;
        }
        return $scaled;
    }

    return $freq / $mapFactor;
}
2199
2200 /********************************
2201 *
2202 * Hashing
2203 *
2204 *******************************/
/**
 * Calculates the search hashes of a TYPO3 page and stores them in $this->hash.
 *
 * "phash_grouping" identifies a "page" as the combination of id, type, language,
 * mount point and cHash parameters. "phash" additionally takes the gr_list into
 * account (a subdivision for page compositions that depend on the login; such pages
 * are expected to be normally similar regardless of the login).
 */
public function setT3Hashes()
{
    // The order of the keys is part of the serialized string and therefore of the hash.
    $pageIdentity = [
        'id' => (int)$this->conf['id'],
        'type' => (int)$this->conf['type'],
        'sys_lang' => (int)$this->conf['sys_language_uid'],
        'MP' => (string)$this->conf['MP'],
        'cHash' => $this->cHashParams
    ];
    $this->hash['phash_grouping'] = IndexedSearchUtility::md5inthash(serialize($pageIdentity));

    // gr_list is appended as the last key for the plain phash.
    $pageIdentity['gr_list'] = (string)$this->conf['gr_list'];
    $this->hash['phash'] = IndexedSearchUtility::md5inthash(serialize($pageIdentity));
}
2224
/**
 * Calculates the search hashes of an external file.
 *
 * "phash_grouping" is derived from the file name alone, "phash" from the file name
 * together with $subinfo.
 *
 * @param string $file File name / path which identifies it on the server
 * @param array $subinfo Additional content identifying the (subpart of) content. For instance; PDF files are divided into groups of pages for indexing.
 * @return array Array with "phash_grouping" and "phash" inside.
 */
public function setExtHashes($file, $subinfo = [])
{
    $fileIdentity = ['file' => $file];
    $groupingHash = IndexedSearchUtility::md5inthash(serialize($fileIdentity));

    // The sub information is appended after the file name before hashing again.
    $fileIdentity['subinfo'] = $subinfo;

    return [
        'phash_grouping' => $groupingHash,
        'phash' => IndexedSearchUtility::md5inthash(serialize($fileIdentity))
    ];
}
2246
2247 /*********************************
2248 *
2249 * Internal logging functions
2250 *
2251 *********************************/
/**
 * Forwards a push() call to the time tracker held in $this->timeTracker.
 *
 * @param string $msg Title to set
 * @param string $key Key, handed to the time tracker as second argument
 */
public function log_push($msg, $key)
{
    $this->timeTracker->push($msg, $key);
}
2262
/**
 * Forwards a pull() call to the time tracker held in $this->timeTracker.
 */
public function log_pull()
{
    $this->timeTracker->pull();
}
2270
/**
 * Hands a log message to the time tracker and additionally appends it to
 * $this->internal_log.
 *
 * @param string $msg Message to set
 * @param int $errorNum Error number
 */
public function log_setTSlogMessage($msg, $errorNum = 0)
{
    // The time tracker is informed first, then the message is kept internally.
    $this->timeTracker->setTSlogMessage($msg, $errorNum);
    $this->internal_log[] = $msg;
}
2282
/**
 * Makes sure that keywords are space-separated. This is important for displaying
 * them properly as a part of the fulltext index.
 *
 * The comma separated list is split with GeneralUtility::trimExplode(), joined again
 * with ", " and wrapped in a single leading and trailing space.
 *
 * @param string $keywordList
 * @return string
 * @see http://forge.typo3.org/issues/14959
 */
protected function addSpacesToKeywordList($keywordList)
{
    $spacedList = implode(', ', GeneralUtility::trimExplode(',', $keywordList));

    return ' ' . $spacedList . ' ';
}
2296 }