[!!!][TASK] Remove deprecated code related to TSFE
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / Classes / Indexer.php
1 <?php
2 namespace TYPO3\CMS\IndexedSearch;
3
4 /*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use TYPO3\CMS\Core\Compatibility\PublicPropertyDeprecationTrait;
18 use TYPO3\CMS\Core\Configuration\ExtensionConfiguration;
19 use TYPO3\CMS\Core\Context\Context;
20 use TYPO3\CMS\Core\Context\LanguageAspect;
21 use TYPO3\CMS\Core\Core\Environment;
22 use TYPO3\CMS\Core\Database\Connection;
23 use TYPO3\CMS\Core\Database\ConnectionPool;
24 use TYPO3\CMS\Core\TimeTracker\TimeTracker;
25 use TYPO3\CMS\Core\Utility\GeneralUtility;
26 use TYPO3\CMS\Core\Utility\HttpUtility;
27 use TYPO3\CMS\Core\Utility\MathUtility;
28 use TYPO3\CMS\Core\Utility\PathUtility;
29 use TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController;
30 use TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility;
31
32 /**
33 * Indexing class for TYPO3 frontend
34 */
35 class Indexer
36 {
37 use PublicPropertyDeprecationTrait;
38
39 /**
40 * List of all deprecated public properties
41 * @var array
42 */
43 protected $deprecatedPublicProperties = [
44 'csObj' => 'Using $csObj within Indexing is discouraged, the property will be removed in TYPO3 v10.0 - if needed instantiate CharsetConverter yourself.',
45 ];
46
47 /**
48 * @var array
49 */
50 public $reasons = [
51 -1 => 'mtime matched the document, so no changes detected and no content updated',
52 -2 => 'The minimum age was not exceeded',
53 1 => 'The configured max-age was exceeded for the document and thus it\'s indexed.',
54 2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
55 3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
56 4 => 'Page has never been indexed (is not represented in the index_phash table).'
57 ];
58
59 /**
60 * HTML code blocks to exclude from indexing
61 *
62 * @var string
63 */
64 public $excludeSections = 'script,style';
65
/**
 * External parser objects, keys are file extension names. Values are objects with certain methods.
 *
 * @var array
 */
71 public $external_parsers = [];
72
/**
 * Fe-group list (pages might be indexed separately for each usergroup combination to support search
 * in access limited pages!)
 *
 * @var string
 */
80 public $defaultGrList = '0,-1';
81
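/**
 * If set, this tells a number of seconds that is the maximum age of an indexed document.
 * Regardless of mtime the document will be re-indexed if this limit is exceeded.
 * Calculated in init() from the "maxAge" extension setting multiplied by 3600.
 *
 * @var int
 */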
87 public $tstamp_maxAge = 0;
88
/**
 * If set, this tells a minimum limit (in seconds) before a document can be indexed again.
 * This is regardless of mtime.
 * Calculated in init() from the "minAge" extension setting multiplied by 3600.
 *
 * @var int
 */
95 public $tstamp_minAge = 0;
96
/**
 * Max number of external files to index.
 *
 * @var int
 */
102 public $maxExternalFiles = 0;
103
/**
 * If TRUE, indexing is forced despite of hashes etc.
 *
 * @var bool
 */
109 public $forceIndexing = false;
110
/**
 * Set when crawler is detected (internal)
 *
 * @var bool
 */
116 public $crawlerActive = false;
117
/**
 * Empty content structure that splitHTMLContent() uses as the starting point for its result.
 *
 * @var array
 */
123 public $defaultContentArray = [
124 'title' => '',
125 'description' => '',
126 'keywords' => '',
127 'body' => ''
128 ];
129
130 /**
131 * @var int
132 */
133 public $wordcount = 0;
134
135 /**
136 * @var int
137 */
138 public $externalFileCounter = 0;
139
/**
 * Configuration set internally (see init functions for required keys and their meaning)
 *
 * @var array
 */
143 public $conf = [];
144
/**
 * Indexer configuration, coming from TYPO3's system configuration for EXT:indexed_search
 *
 * @var array
 */
150 public $indexerConfig = [];
151
/**
 * Hash array, contains phash and phash_grouping
 *
 * @var array
 */
157 public $hash = [];
158
/**
 * Hash array for files
 *
 * @var array
 */
164 public $file_phash_arr = [];
165
/**
 * Content of TYPO3 page, split into its parts by splitHTMLContent()
 *
 * @var array
 */
171 public $contentParts = [];
172
/**
 * Hash over the concatenated content parts, calculated with IndexedSearchUtility::md5inthash()
 * in indexTypo3PageContent().
 *
 * @var string
 */
178 public $content_md5h = '';
179
/**
 * Internal log
 *
 * @var array
 */
183 public $internal_log = [];
184
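/**
 * NOTE(review): presumably the content fetched while indexing an external URL - the code
 * using this property is not part of the methods reviewed here, please confirm.
 * (The former description "Internal log" belongs to $internal_log.)
 *
 * @var string
 */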
190 public $indexExternalUrl_content = '';
191
/**
 * cHashparams array
 *
 * @var array
 */
195 public $cHashParams = [];
196
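/**
 * NOTE(review): purpose is not visible in the methods reviewed here, please confirm.
 * (The former description "cHashparams array" belongs to $cHashParams.)
 *
 * @var int
 */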
202 public $freqRange = 32000;
203
204 /**
205 * @var float
206 */
207 public $freqMax = 0.1;
208
209 /**
210 * @var bool
211 */
212 public $enableMetaphoneSearch = false;
213
214 /**
215 * @var bool
216 */
217 public $storeMetaphoneInfoAsWords;
218
219 /**
220 * @var string
221 */
222 public $metaphoneContent = '';
223
224 /**
225 * Charset class object
226 *
227 * @var \TYPO3\CMS\Core\Charset\CharsetConverter
228 * @deprecated since TYPO3 v9.3, will be removed in TYPO3 v10.0 (also the instantiation in the init() method).
229 */
230 protected $csObj;
231
232 /**
233 * Metaphone object, if any
234 *
235 * @var \TYPO3\CMS\IndexedSearch\Utility\DoubleMetaPhoneUtility
236 */
237 public $metaphoneObj;
238
239 /**
240 * Lexer object for word splitting
241 *
242 * @var \TYPO3\CMS\IndexedSearch\Lexer
243 */
244 public $lexerObj;
245
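/**
 * Flag bit mask; set in init() from the "flagBitMask" extension setting, forced into the
 * integer range 0-255.
 *
 * @var int
 */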
249 public $flagBitMask;
250
251 /**
252 * @var TimeTracker
253 */
254 protected $timeTracker;
255
/**
 * Indexer constructor.
 *
 * Fetches a TimeTracker through GeneralUtility::makeInstance() and keeps it in $this->timeTracker.
 */
public function __construct()
{
    $timeTracker = GeneralUtility::makeInstance(TimeTracker::class);
    $this->timeTracker = $timeTracker;
}
263
/**
 * Parent Object (TSFE) Initialization
 *
 * Decides whether the rendered page should be indexed and, if so, collects the page data from
 * the parent object into $this->conf, initializes the indexer and indexes the page content.
 * A log message is written for every reason that prevents indexing.
 *
 * @param TypoScriptFrontendController $pObj Parent Object, passed by reference
 */
public function hook_indexContent(&$pObj)
{
    // Indexer configuration from Extension Manager interface:
    $disableFrontendIndexing = (bool)GeneralUtility::makeInstance(ExtensionConfiguration::class)->get('indexed_search', 'disableFrontendIndexing');

    // Crawler activation:
    // Requirements are that the crawler is loaded, a crawler session is running and re-indexing
    // is requested as processing instruction. All crawler keys are optional, so they are read
    // with defaults instead of triggering "undefined index" errors.
    $crawlerData = $pObj->applicationData['tx_crawler'] ?? [];
    $procInstructions = $crawlerData['parameters']['procInstructions'] ?? [];
    if (\TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler')
        && !empty($crawlerData['running'])
        && is_array($procInstructions)
        && in_array('tx_indexedsearch_reindex', $procInstructions, true)
    ) {
        // Setting simple log message:
        $pObj->applicationData['tx_crawler']['log'][] = 'Forced Re-indexing enabled';
        // Crawler active flag, which also forces indexing:
        $this->crawlerActive = true;
        $this->forceIndexing = true;
    }

    // Nothing to do (and nothing to log) unless indexing is enabled for this page rendering
    if (empty($pObj->config['config']['index_enable'])) {
        return;
    }

    $this->log_push('Index page', '');
    if ($disableFrontendIndexing && !$this->crawlerActive) {
        $this->log_setTSlogMessage('Index page? No, Ordinary Frontend indexing during rendering is disabled.');
    } elseif (!empty($pObj->page['no_search'])) {
        $this->log_setTSlogMessage('Index page? No, The "No Search" flag has been set in the page properties!');
    } elseif ($pObj->no_cache) {
        $this->log_setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
    } else {
        /** @var LanguageAspect $languageAspect */
        $languageAspect = GeneralUtility::makeInstance(Context::class)->getAspect('language');
        if ($languageAspect->getId() === $languageAspect->getContentId()) {
            // Root line uids
            $rootLineUids = [];
            foreach ($pObj->config['rootLine'] as $rlkey => $rldat) {
                $rootLineUids[$rlkey] = $rldat['uid'];
            }
            // Setting up internal configuration from the parent object:
            $this->conf = [
                // Page id
                'id' => $pObj->id,
                // Page type
                'type' => $pObj->type,
                // sys_language UID of the language of the indexing
                'sys_language_uid' => $languageAspect->getId(),
                // MP variable, if any (Mount Points)
                'MP' => $pObj->MP,
                // Group list
                'gr_list' => implode(',', GeneralUtility::makeInstance(Context::class)->getPropertyFromAspect('frontend.user', 'groupIds', [0, -1])),
                // cHash string for additional parameters
                'cHash' => $pObj->cHash,
                // Array of the additional parameters
                'cHash_array' => $pObj->cHash_array,
                // The creation date of the TYPO3 page
                'crdate' => $pObj->page['crdate'],
                'rootline_uids' => $rootLineUids,
                // Content string (HTML of TYPO3 page)
                'content' => $pObj->content,
                // Alternative title for indexing
                'indexedDocTitle' => $pObj->convOutputCharset($pObj->indexedDocTitle),
                // Character set of content (will be converted to utf-8 during indexing)
                'metaCharset' => $pObj->metaCharset,
                // Most recent modification time (seconds) of the content on the page.
                // Used to evaluate whether it should be re-indexed.
                'mtime' => $pObj->register['SYS_LASTCHANGED'] ?? $pObj->page['SYS_LASTCHANGED'],
                // Whether to index external documents like PDF, DOC etc. (if possible)
                'index_externals' => $pObj->config['config']['index_externals'] ?? null,
                // Length of description text (max 250, default 200)
                'index_descrLgd' => $pObj->config['config']['index_descrLgd'] ?? null,
                // Whether to index document keywords and description (if present)
                'index_metatags' => $pObj->config['config']['index_metatags'] ?? true,
                // Set to zero:
                'recordUid' => 0,
                'freeIndexUid' => 0,
                'freeIndexSetId' => 0,
            ];
            // Init and start indexing:
            $this->init();
            $this->indexTypo3PageContent();
        } else {
            $this->log_setTSlogMessage('Index page? No, languageId was different from contentId which indicates that the page contains fall-back content and that would be falsely indexed as localized content.');
        }
    }
    $this->log_pull();
}
354
355 /****************************
356 *
357 * Backend API
358 *
359 ****************************/
/**
 * Initializing the "combined ID" of the page (phash) being indexed (or for which external media is attached)
 *
 * Fills $this->conf with the given page identification plus fixed defaults and then calls init().
 *
 * @param int $id The page uid, &id=
 * @param int $type The page type, &type=
 * @param int $sys_language_uid sys_language uid, typically &L=
 * @param string $MP The MP variable (Mount Points), &MP=
 * @param array $uidRL Rootline array of only UIDs.
 * @param array $cHash_array Array of GET variables to register with this indexing
 * @param bool $createCHash If set, calculates a cHash value from the $cHash_array. Probably you will not do that since such cases are indexed through the frontend and the idea of this interface is to index non-cacheable pages from the backend!
 */
public function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array = [], $createCHash = false)
{
    // cHash string for additional parameters; stays empty unless explicitly requested
    $cHash = '';
    if ($createCHash) {
        /* @var \TYPO3\CMS\Frontend\Page\CacheHashCalculator $calculator */
        $calculator = GeneralUtility::makeInstance(\TYPO3\CMS\Frontend\Page\CacheHashCalculator::class);
        $cHash = $calculator->generateForParameters(HttpUtility::buildQueryString($cHash_array));
    }

    // Internal configuration describing the page for which the indexing takes place
    $this->conf = [
        // Page id (int)
        'id' => $id,
        // Page type (int)
        'type' => $type,
        // sys_language UID of the language of the indexing (int)
        'sys_language_uid' => $sys_language_uid,
        // MP variable, if any (Mount Points) (string)
        'MP' => $MP,
        // Group list (hardcoded for now...)
        'gr_list' => '0,-1',
        'cHash' => $cHash,
        // Array of the additional parameters
        'cHash_array' => $cHash_array,
        // Set to defaults
        'freeIndexUid' => 0,
        'freeIndexSetId' => 0,
        // Root line uids
        'rootline_uids' => $uidRL,
        // Whether to index external documents like PDF, DOC etc. (if possible)
        'index_externals' => 1,
        // Length of description text (max 250, default 200)
        'index_descrLgd' => 200,
        // Whether to index document keywords and description (if present)
        'index_metatags' => true,
    ];

    // Init the indexer with this configuration:
    $this->init();
}
413
/**
 * Sets the free-index uid. Can be called right after backend_initIndexer()
 *
 * Both values are written into $this->conf ("freeIndexUid" and "freeIndexSetId").
 *
 * @param int $freeIndexUid Free index UID
 * @param int $freeIndexSetId Set id - an integer identifying the "set" of indexing operations.
 */
public function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId = 0)
{
    $freeIndexSettings = [
        'freeIndexUid' => $freeIndexUid,
        'freeIndexSetId' => $freeIndexSetId,
    ];
    foreach ($freeIndexSettings as $confKey => $confValue) {
        $this->conf[$confKey] = $confValue;
    }
}
425
/**
 * Indexing records as the content of a TYPO3 page.
 *
 * The given values are wrapped into a minimal HTML document (all values passed through
 * htmlspecialchars()) which is then indexed like a regular TYPO3 page.
 *
 * @param string $title Title equivalent
 * @param string $keywords Keywords equivalent
 * @param string $description Description equivalent
 * @param string $content The main content to index
 * @param string $charset The charset of the title, keyword, description and body-content. MUST BE VALID, otherwise nothing is indexed!
 * @param int $mtime Last modification time, in seconds
 * @param int $crdate The creation date of the content, in seconds
 * @param int $recordUid The record UID that the content comes from (for registration with the indexed rows)
 */
public function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate = 0, $recordUid = 0)
{
    // Most recent modification time (seconds) of the content
    $this->conf['mtime'] = $mtime;
    // The creation date of the TYPO3 content
    $this->conf['crdate'] = $crdate;
    // UID of the record, if applicable
    $this->conf['recordUid'] = $recordUid;

    // Construct fake HTML for parsing, one array entry per line of the document
    // (the leading empty entry produces the initial line break):
    $fakeHtmlLines = [
        '',
        '<html>',
        '<head>',
        '<title>' . htmlspecialchars($title) . '</title>',
        '<meta name="keywords" content="' . htmlspecialchars($keywords) . '" />',
        '<meta name="description" content="' . htmlspecialchars($description) . '" />',
        '</head>',
        '<body>',
        htmlspecialchars($content),
        '</body>',
        '</html>',
    ];
    // Content string (HTML of TYPO3 page)
    $this->conf['content'] = implode("\n", $fakeHtmlLines);

    // Character set of content (will be converted to utf-8 during indexing)
    $this->conf['metaCharset'] = $charset;
    // Alternative title for indexing (none here)
    $this->conf['indexedDocTitle'] = '';

    // Index content as if it was a TYPO3 page:
    $this->indexTypo3PageContent();
}
468
469 /********************************
470 *
471 * Initialization
472 *
473 *******************************/
/**
 * Initializes the object. $this->conf MUST be set with proper values prior to this call!!!
 *
 * Sets up the cHash parameters and page hashes, reads the extension configuration of
 * indexed_search, and instantiates external parsers (if enabled), the lexer and the optional
 * metaphone object. Optional configuration keys are read with defaults so that a missing key
 * does not raise "undefined index" errors.
 */
public function init()
{
    // Initializing:
    $this->cHashParams = $this->conf['cHash_array'];
    if (is_array($this->cHashParams) && !empty($this->cHashParams)) {
        if (!empty($this->conf['cHash'])) {
            // Add this so that URL's come out right...
            $this->cHashParams['cHash'] = $this->conf['cHash'];
        }
        unset($this->cHashParams['encryptionKey']);
    }
    // Setting phash / phash_grouping which identifies the indexed page based on some of these variables:
    $this->setT3Hashes();
    // Indexer configuration from Extension Manager interface:
    $this->indexerConfig = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get('indexed_search');
    // minAge / maxAge are multiplied by 3600 to get the limits in seconds
    $this->tstamp_minAge = MathUtility::forceIntegerInRange(($this->indexerConfig['minAge'] ?? 0) * 3600, 0);
    $this->tstamp_maxAge = MathUtility::forceIntegerInRange(($this->indexerConfig['maxAge'] ?? 0) * 3600, 0);
    $this->maxExternalFiles = MathUtility::forceIntegerInRange($this->indexerConfig['maxExternalFiles'] ?? 0, 0, 1000, 5);
    $this->flagBitMask = MathUtility::forceIntegerInRange($this->indexerConfig['flagBitMask'] ?? 0, 0, 255);
    // Workaround: If the extension configuration was not updated yet, the value is not existing
    $this->enableMetaphoneSearch = !isset($this->indexerConfig['enableMetaphoneSearch']) || $this->indexerConfig['enableMetaphoneSearch'];
    $this->storeMetaphoneInfoAsWords = !IndexedSearchUtility::isTableUsed('index_words') && $this->enableMetaphoneSearch;
    // Initialize external document parsers:
    // Example configuration, see ext_localconf.php of this file!
    if (!empty($this->conf['index_externals'])) {
        $this->initializeExternalParsers();
    }
    // Initialize lexer (class that deconstructs the text into words);
    // falls back to the default Lexer when no (or an empty) class name is registered:
    $lexerObjectClassName = ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['lexer'] ?? '') ?: Lexer::class;
    $this->lexerObj = GeneralUtility::makeInstance($lexerObjectClassName);
    $this->lexerObj->debug = $this->indexerConfig['debugMode'] ?? null;
    // Initialize metaphone hook:
    // Make sure that the hook is loaded _after_ indexed_search as this may overwrite the hook depending on the configuration.
    $metaphoneClassName = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['metaphone'] ?? '';
    if ($this->enableMetaphoneSearch && $metaphoneClassName) {
        $this->metaphoneObj = GeneralUtility::makeInstance($metaphoneClassName);
        $this->metaphoneObj->pObj = $this;
    }
    // Init charset class (deprecated property, kept for backwards compatibility):
    $this->csObj = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Charset\CharsetConverter::class);
}
517
/**
 * Initialize external parsers
 *
 * Instantiates every parser class registered in
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'] and keeps only
 * those whose initParser() call succeeds.
 *
 * @internal
 * @see init()
 */
public function initializeExternalParsers()
{
    $registeredParsers = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'] ?? [];
    foreach ($registeredParsers as $extension => $className) {
        $parser = GeneralUtility::makeInstance($className);
        // Register the parser before initializing it, so it is reachable through this object meanwhile
        $this->external_parsers[$extension] = $parser;
        $parser->pObj = $this;
        if ($parser->initParser($extension)) {
            continue;
        }
        // Initialization failed: drop the entry again
        unset($this->external_parsers[$extension]);
    }
}
535
536 /********************************
537 *
538 * Indexing; TYPO3 pages (HTML content)
539 *
540 *******************************/
/**
 * Start indexing of the TYPO3 page
 *
 * Checks via mtime/timestamps, the group list and the force flag whether indexing is needed.
 * If so, the content is split and hashed; an unchanged content hash only refreshes timestamp,
 * set id, group list and rootline, otherwise the page is fully (re-)indexed.
 */
public function indexTypo3PageContent()
{
    $check = $this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']);
    $is_grlist = $this->is_grlist_set($this->hash['phash']);
    $indexingNeeded = $check > 0 || !$is_grlist || $this->forceIndexing;
    if (!$indexingNeeded) {
        $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
        return;
    }

    // Setting message:
    if ($this->forceIndexing) {
        $reason = 'Forced';
    } elseif ($check > 0) {
        $reason = $this->reasons[$check];
    } else {
        $reason = 'Updates gr_list!';
    }
    $this->log_setTSlogMessage('Indexing needed, reason: ' . $reason, 1);

    // Divide into title,keywords,description and body:
    $this->log_push('Split content', '');
    $this->contentParts = $this->splitHTMLContent($this->conf['content']);
    if ($this->conf['indexedDocTitle']) {
        $this->contentParts['title'] = $this->conf['indexedDocTitle'];
    }
    $this->log_pull();

    // Calculating a hash over what is to be the actual page content (title, description and
    // keywords included, so that e.g. a changed page title leads to re-indexing).
    $this->content_md5h = IndexedSearchUtility::md5inthash(implode('', $this->contentParts));

    // Checks if there is already a page (with gr_list = 0,-1) indexed which has the very same contentHash.
    // If so, the page is already indexed and - regardless of mtime - only some meta data needs a refresh.
    // This also prevents re-indexing for logged in fe_users when the page content is the same anyway.
    $checkCHash = $this->checkContentHash();
    if (is_array($checkCHash) && $check !== 1) {
        // Update the timestamp
        $this->updateTstamp($this->hash['phash'], $this->conf['mtime']);
        $this->updateSetId($this->hash['phash']);
        // $checkCHash['phash'] is the phash of the result row that is similar to the current phash regarding the content hash.
        $this->update_grlist($checkCHash['phash'], $this->hash['phash']);
        $this->updateRootline();
        $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $this->content_md5h . ', has not changed. Timestamp, grlist and rootline updated if necessary.');
        return;
    }

    $startTime = GeneralUtility::milliseconds();

    $this->log_push('Converting charset of content (' . $this->conf['metaCharset'] . ') to utf-8', '');
    $this->charsetEntity2utf8($this->contentParts, $this->conf['metaCharset']);
    $this->log_pull();

    // Splitting words
    $this->log_push('Extract words from content', '');
    $splitInWords = $this->processWordsInArrays($this->contentParts);
    $this->log_pull();

    // Analyze the indexed words.
    $this->log_push('Analyze the extracted words', '');
    $indexArr = $this->indexAnalyze($splitInWords);
    $this->log_pull();

    // Submitting page (phash) record
    $this->log_push('Submitting page', '');
    $this->submitPage();
    $this->log_pull();

    // Check words and submit to word list if not there
    $this->log_push('Check word list and submit words', '');
    if (IndexedSearchUtility::isTableUsed('index_words')) {
        $this->checkWordList($indexArr);
        $this->submitWords($indexArr, $this->hash['phash']);
    }
    $this->log_pull();

    // Set parsetime
    $this->updateParsetime($this->hash['phash'], GeneralUtility::milliseconds() - $startTime);

    // Checking external files if configured for.
    $this->log_push('Checking external files', '');
    if ($this->conf['index_externals']) {
        $this->extractLinks($this->conf['content']);
    }
    $this->log_pull();
}
615
/**
 * Splits HTML content and returns an associative array, with title, a list of metatags, and a list of words in the body.
 *
 * Everything before the first "<body" is treated as head. If the document has no "<body" at
 * all, the whole content is treated as head (so title and meta tags are still found) and the
 * body is empty.
 *
 * @param string $content HTML content to index. To some degree expected to be made by TYPO3 (ei. splitting the header by ":")
 * @return array Array of content, having keys "title", "body", "keywords" and "description" set.
 * @see splitRegularContent()
 */
public function splitHTMLContent($content)
{
    // Divide head from body
    $contentArr = $this->defaultContentArray;
    $bodyPart = stristr($content, '<body');
    if ($bodyPart === false) {
        // substr($content, 0, -0) would return an empty string and thereby lose the head
        $contentArr['body'] = '';
        $headPart = (string)$content;
    } else {
        $contentArr['body'] = $bodyPart;
        $headPart = substr($content, 0, -strlen($bodyPart));
    }

    // Get title; if it contains a colon, only the part after the first colon is used
    $this->embracingTags($headPart, 'TITLE', $contentArr['title'], $dummy2, $dummy);
    $titleParts = explode(':', $contentArr['title'], 2);
    $contentArr['title'] = trim($titleParts[1] ?? $titleParts[0]);

    // Get keywords and description metatags
    if (!empty($this->conf['index_metatags'])) {
        // Collect the attribute strings of all meta tags; each call removes one tag from $headPart
        $metaTags = [];
        while ($this->embracingTags($headPart, 'meta', $dummy, $headPart, $metaParams)) {
            $metaTags[] = $metaParams;
        }
        foreach ($metaTags as $metaParams) {
            $attributes = GeneralUtility::get_tag_attributes($metaParams);
            // Meta tags like <meta charset="..."> have neither "name" nor "content"
            $metaName = $attributes['name'] ?? '';
            $metaContent = $attributes['content'] ?? '';
            if (stripos($metaName, 'keywords') !== false) {
                $contentArr['keywords'] .= ',' . $this->addSpacesToKeywordList($metaContent);
            }
            if (stripos($metaName, 'description') !== false) {
                $contentArr['description'] .= ',' . $metaContent;
            }
        }
    }

    // Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags:
    $this->typoSearchTags($contentArr['body']);

    // Get rid of unwanted sections (ie. scripting and style stuff) in body
    $tagList = explode(',', $this->excludeSections);
    foreach ($tagList as $tag) {
        // Each successful call removes one section; repeat until none is left
        while ($this->embracingTags($contentArr['body'], $tag, $dummy, $contentArr['body'], $dummy2)) {
        }
    }

    // Remove tags, but first make sure we don't concatenate words by doing it
    $contentArr['body'] = str_replace('<', ' <', $contentArr['body']);
    $contentArr['body'] = trim(strip_tags($contentArr['body']));
    $contentArr['keywords'] = trim($contentArr['keywords']);
    $contentArr['description'] = trim($contentArr['description']);

    return $contentArr;
}
667
/**
 * Extract the charset value from HTML meta tag.
 *
 * Only a <meta http-equiv="Content-Type" ...> tag is evaluated.
 *
 * @param string $content HTML content
 * @return string The charset value if found, otherwise an empty string.
 */
public function getHTMLcharset($content)
{
    $contentTypeTagPattern = '/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i';
    if (preg_match($contentTypeTagPattern, $content, $reg)
        && preg_match('/charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)/i', $reg[0], $reg2)
    ) {
        return $reg2[1];
    }
    // Explicit string result instead of an implicit null
    return '';
}
682
/**
 * Converts a HTML document to utf-8
 *
 * The charset is lower-cased and trimmed before use; content is only re-encoded when a charset
 * other than "utf-8" is known. HTML entities are decoded in any case.
 *
 * @param string $content HTML content, any charset
 * @param string $charset Optional charset (otherwise extracted from HTML)
 * @return string Converted HTML
 */
public function convertHTMLToUtf8($content, $charset = '')
{
    // Find charset: an explicitly given one wins over the one declared in the document
    $sourceCharset = $charset ?: $this->getHTMLcharset($content);
    $sourceCharset = trim(strtolower($sourceCharset));

    // Convert charset:
    $needsConversion = $sourceCharset && $sourceCharset !== 'utf-8';
    $utf8Content = $needsConversion
        ? mb_convert_encoding($content, 'utf-8', $sourceCharset)
        : $content;

    // Convert entities, assuming document is now UTF-8:
    return html_entity_decode($utf8Content);
}
702
/**
 * Finds first occurrence of embracing tags and returns the embraced content and the original string with
 * the tag removed in the two passed variables. Returns FALSE if no match found. ie. useful for finding
 * <title> of document or removing <script>-sections
 *
 * The search is case-insensitive. If the start tag has no matching end tag, $tagContent is
 * empty and $stringAfter holds only what follows the start tag. A start tag that is never
 * closed with ">" yields an empty $stringAfter.
 *
 * @param string $string String to search in
 * @param string $tagName Tag name, eg. "script"
 * @param string $tagContent Passed by reference: Content inside found tag
 * @param string $stringAfter Passed by reference: Content after found tag
 * @param string $paramList Passed by reference: Attributes of the found tag.
 * @return bool Returns FALSE if tag was not found, otherwise TRUE.
 */
public function embracingTags($string, $tagName, &$tagContent, &$stringAfter, &$paramList)
{
    $string = (string)$string;
    $endTag = '</' . $tagName . '>';
    $startTag = '<' . $tagName;

    // stristr used because we want a case-insensitive search for the tag.
    $fromStartTag = stristr($string, $startTag);
    if ($fromStartTag === false) {
        return false;
    }

    // Split "<attributes>" from the rest; the rest is missing when the start tag is never closed
    $parts = explode('>', (string)substr($fromStartTag, strlen($startTag)), 2);
    $paramList = $parts[0];
    $afterStartTag = $parts[1] ?? '';

    $fromEndTag = stristr($afterStartTag, $endTag);
    if ($fromEndTag !== false) {
        // Everything in front of the start tag: total length minus the length of the remainder
        $stringBefore = substr($string, 0, strlen($string) - strlen($fromStartTag));
        $tagContent = substr($afterStartTag, 0, strlen($afterStartTag) - strlen($fromEndTag));
        $stringAfter = $stringBefore . substr($fromEndTag, strlen($endTag));
    } else {
        $tagContent = '';
        $stringAfter = $afterStartTag;
    }
    return true;
}
737
/**
 * Removes content that shouldn't be indexed according to TYPO3SEARCH-tags.
 *
 * The body is split at every "<!--TYPO3SEARCH_" marker. Content following a "begin" marker is
 * kept; an "end" marker keeps the last chunk that was neither "begin" nor "end" (typically
 * the content in front of the first marker), provided no "begin" marker came in between.
 *
 * @param string $body HTML Content, passed by reference
 * @return bool Returns TRUE if a TYPOSEARCH_ tag was found, otherwise FALSE.
 */
public function typoSearchTags(&$body)
{
    $expBody = preg_split('/\\<\\!\\-\\-[\\s]?TYPO3SEARCH_/', $body);
    if (count($expBody) <= 1) {
        return false;
    }

    $body = '';
    // Initialized so that an "end" marker without preceding content does not read an undefined variable
    $prev = '';
    foreach ($expBody as $val) {
        $part = explode('-->', $val, 2);
        $marker = trim($part[0]);
        if ($marker === 'begin') {
            // The content part is missing when the marker comment is not closed with "-->"
            $body .= $part[1] ?? '';
            $prev = '';
        } elseif ($marker === 'end') {
            $body .= $prev;
        } else {
            $prev = $val;
        }
    }
    return true;
}
764
/**
 * Extract links (hrefs) from HTML content and if indexable media is found, it is indexed.
 *
 * Links with a scheme are indexed as external URLs (if enabled in the extension configuration).
 * Links without a query string that resolve to an existing local file are either indexed
 * directly or - if the crawler is used for external files - added to the crawler queue.
 *
 * @param string $content HTML content
 */
public function extractLinks($content)
{
    // Get links:
    $list = $this->extractHyperLinks($content);

    // Crawler object, only available when configured and the crawler extension is loaded
    $crawler = null;
    if (!empty($this->indexerConfig['useCrawlerForExternalFiles']) && \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler')) {
        $crawler = GeneralUtility::makeInstance(\tx_crawler_lib::class);
    }

    // Traverse links:
    foreach ($list as $linkInfo) {
        // localPath means: This file is sent by a download script. While the indexed URL has to
        // point to $linkInfo['href'], the absolute path to the file is specified here!
        $hasLocalPath = !empty($linkInfo['localPath']);
        // Decode entities:
        $linkSource = htmlspecialchars_decode($hasLocalPath ? $linkInfo['localPath'] : $linkInfo['href']);

        // Parse URL; parse_url() returns false for seriously malformed URLs
        $qParts = parse_url($linkSource) ?: [];
        // Check for jumpurl (TYPO3 specific thing...)
        $query = $qParts['query'] ?? '';
        if ($query && strpos($query, 'jumpurl=') !== false) {
            parse_str($query, $getP);
            $jumpUrl = $getP['jumpurl'] ?? '';
            $linkSource = is_string($jumpUrl) ? $jumpUrl : '';
            $qParts = parse_url($linkSource) ?: [];
        }

        if (!$hasLocalPath && !empty($qParts['scheme'])) {
            if (!empty($this->indexerConfig['indexExternalURLs'])) {
                // Index external URL (http or otherwise)
                $this->indexExternalUrl($linkSource);
            }
            continue;
        }
        if (!empty($qParts['query'])) {
            // Local links with a query string are not treated as files
            continue;
        }

        $linkSource = urldecode($linkSource);
        if (GeneralUtility::isAllowedAbsPath($linkSource)) {
            $localFile = $linkSource;
        } else {
            $localFile = GeneralUtility::getFileAbsFileName(Environment::getPublicPath() . '/' . $linkSource);
        }
        if (!$localFile || !@is_file($localFile)) {
            continue;
        }

        // Index local file:
        if (is_object($crawler)) {
            $params = ['document' => $linkSource];
            if ($hasLocalPath) {
                $params['alturl'] = $linkInfo['href'];
            }
            $params['conf'] = $this->conf;
            // The page content is not needed in the queue entry
            unset($params['conf']['content']);
            $crawler->addQueueEntry_callBack(0, $params, Hook\CrawlerFilesHook::class, $this->conf['id']);
            $this->log_setTSlogMessage('media "' . $params['document'] . '" added to "crawler" queue.', 1);
        } elseif ($hasLocalPath) {
            $ext = strtolower(pathinfo($linkSource, PATHINFO_EXTENSION));
            $this->indexRegularDocument($linkInfo['href'], false, $linkSource, $ext);
        } else {
            $this->indexRegularDocument($linkSource);
        }
    }
}
840
841 /**
842 * Extracts all links to external documents from the HTML content string
843 *
844 * @param string $html
845 * @return array Array of hyperlinks (keys: tag, href, localPath (empty if not local))
846 * @see extractLinks()
847 */
public function extractHyperLinks($html)
{
    $htmlParser = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Html\HtmlParser::class);
    $htmlParts = $htmlParser->splitTags('a', $html);
    $hyperLinksData = [];
    foreach ($htmlParts as $index => $tagData) {
        // Only the odd-indexed parts are inspected as tags; even-indexed parts are skipped.
        if ($index % 2 === 0) {
            continue;
        }
        $tagAttributes = $htmlParser->get_tag_attributes($tagData, true);
        $firstTagName = $htmlParser->getFirstTagName($tagData);
        if (strtolower($firstTagName) !== 'a') {
            continue;
        }
        // Anchors such as <a name="top"> carry no href at all; treat that as "no link"
        // instead of reading an undefined array index.
        $href = $tagAttributes[0]['href'] ?? '';
        // Skip empty targets and pure in-page fragment links ("#section").
        if (!$href || $href[0] === '#') {
            continue;
        }
        $hyperLinksData[] = [
            'tag' => $tagData,
            'href' => $href,
            'localPath' => $this->createLocalPath($href)
        ];
    }
    return $hyperLinksData;
}
870
871 /**
872 * Extracts the "base href" from content string.
873 *
874 * @param string $html Content to analyze
875 * @return string The base href or an empty string if not found
876 */
public function extractBaseHref($html)
{
    $href = '';
    $htmlParser = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Html\HtmlParser::class);
    $htmlParts = $htmlParser->splitTags('base', $html);
    foreach ($htmlParts as $index => $tagData) {
        // Only the odd-indexed parts are inspected as tags.
        if ($index % 2 === 0) {
            continue;
        }
        $tagAttributes = $htmlParser->get_tag_attributes($tagData, true);
        $firstTagName = $htmlParser->getFirstTagName($tagData);
        if (strtolower($firstTagName) !== 'base') {
            continue;
        }
        // A <base> tag may only define "target"; fall back to an empty string so the
        // method keeps returning a string and does not read an undefined index.
        $href = $tagAttributes[0]['href'] ?? '';
        if ($href) {
            // The first base tag with a usable href wins.
            break;
        }
    }
    return $href;
}
896
897 /******************************************
898 *
899 * Indexing; external URL
900 *
901 ******************************************/
902 /**
903 * Index External URLs HTML content
904 *
 * @param string $externalUrl URL, e.g. "http://typo3.org/"
906 * @see indexRegularDocument()
907 */
public function indexExternalUrl($externalUrl)
{
    // Look at the headers first: the body is only fetched for HTML responses.
    $urlHeaders = $this->getUrlHeaders($externalUrl);
    if (!is_array($urlHeaders)) {
        // No answer from the remote host, nothing to index.
        return;
    }
    // HTTP header names are case-insensitive, so "content-type" must match as well.
    $contentType = '';
    foreach ($urlHeaders as $headerName => $headerValue) {
        if (strcasecmp((string)$headerName, 'Content-Type') === 0) {
            $contentType = (string)$headerValue;
            break;
        }
    }
    if (stristr($contentType, 'text/html') === false) {
        return;
    }
    $content = ($this->indexExternalUrl_content = GeneralUtility::getUrl($externalUrl));
    if ((string)$content === '') {
        return;
    }
    // The content is handed to the document indexer through a temporary file.
    $tmpFile = GeneralUtility::tempnam('EXTERNAL_URL');
    if (!$tmpFile) {
        return;
    }
    try {
        GeneralUtility::writeFile($tmpFile, $content);
        // "true" forces indexing: comparing the mtime of a freshly written
        // temporary file would not say anything about the remote document.
        $this->indexRegularDocument($externalUrl, true, $tmpFile, 'html');
    } finally {
        // Remove the temporary file even if indexing failed with an exception.
        unlink($tmpFile);
    }
}
927
928 /**
929 * Getting HTTP request headers of URL
930 *
931 * @param string $url The URL
932 * @return mixed If no answer, returns FALSE. Otherwise an array where HTTP headers are keys
933 */
public function getUrlHeaders($url)
{
    // Try to get the headers only
    $content = GeneralUtility::getUrl($url, 2);
    if ((string)$content === '') {
        // Documented contract: FALSE when there was no answer.
        return false;
    }
    $retVal = [];
    foreach (GeneralUtility::trimExplode(LF, $content, true) as $line) {
        if (trim($line) === '') {
            break;
        }
        // Lines without a colon (for example the status line "HTTP/1.1 200 OK") keep
        // their historical representation, key => null, without reading a missing offset.
        // Values are stored as received, i.e. not trimmed after the colon.
        $parts = explode(':', $line, 2);
        $retVal[$parts[0]] = $parts[1] ?? null;
    }
    return $retVal;
}
952
953 /**
954 * Checks if the file is local
955 *
956 * @param string $sourcePath
957 * @return string Absolute path to file if file is local, else empty string
958 */
protected function createLocalPath($sourcePath)
{
    // Resolution strategies, tried in this order; the first non-empty result wins.
    $resolvers = [
        'createLocalPathFromT3vars',
        'createLocalPathUsingAbsRefPrefix',
        'createLocalPathUsingDomainURL',
        'createLocalPathFromAbsoluteURL',
        'createLocalPathFromRelativeURL'
    ];
    $resolvedPath = '';
    foreach ($resolvers as $resolver) {
        $resolvedPath = $this->{$resolver}($sourcePath);
        if ($resolvedPath != '') {
            return $resolvedPath;
        }
    }
    // No strategy matched: hand back whatever the last one produced (an empty value).
    return $resolvedPath;
}
977
978 /**
979 * Attempts to create a local file path from T3VARs. This is useful for
980 * various download extensions that hide actual file name but still want the
981 * file to be indexed.
982 *
983 * @param string $sourcePath
984 * @return string
985 */
protected function createLocalPathFromT3vars($sourcePath)
{
    $registeredFiles = $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'] ?? null;
    if (!is_array($registeredFiles)) {
        return '';
    }
    // Registered files are keyed by the short MD5 of the link as it appears in the content.
    $lookupKey = GeneralUtility::shortMD5($sourcePath);
    // self::isAllowedLocalFile() is deliberately not consulted: registered files
    // may live outside of the web site (for example protected downloads).
    if (isset($registeredFiles[$lookupKey]) && is_file($registeredFiles[$lookupKey])) {
        return $registeredFiles[$lookupKey];
    }
    return '';
}
1001
1002 /**
1003 * Attempts to create a local file path by matching a current request URL.
1004 *
1005 * @param string $sourcePath
1006 * @return string
1007 */
protected function createLocalPathUsingDomainURL($sourcePath)
{
    $baseURL = (string)GeneralUtility::getIndpEnv('TYPO3_SITE_URL');
    // An empty site URL can never be a meaningful prefix: strpos() with an empty needle
    // raises a warning on PHP 7 and matches every string on PHP 8.
    if ($baseURL === '' || strpos($sourcePath, $baseURL) !== 0) {
        return '';
    }
    // Strip the site URL and map the remainder onto the public web directory.
    $localPath = Environment::getPublicPath() . '/' . substr($sourcePath, strlen($baseURL));
    return self::isAllowedLocalFile($localPath) ? $localPath : '';
}
1022
1023 /**
1024 * Attempts to create a local file path by matching absRefPrefix. This
1025 * requires TSFE. If TSFE is missing, this function does nothing.
1026 *
1027 * @param string $sourcePath
1028 * @return string
1029 */
protected function createLocalPathUsingAbsRefPrefix($sourcePath)
{
    // Without a frontend controller there is no absRefPrefix to match against.
    if (!isset($GLOBALS['TSFE']) || !($GLOBALS['TSFE'] instanceof TypoScriptFrontendController)) {
        return '';
    }
    // absRefPrefix is optional configuration; a missing value means "no prefix".
    $absRefPrefix = (string)($GLOBALS['TSFE']->config['config']['absRefPrefix'] ?? '');
    if ($absRefPrefix === '' || strpos($sourcePath, $absRefPrefix) !== 0) {
        return '';
    }
    // Strip the prefix and map the remainder onto the public web directory.
    $localPath = Environment::getPublicPath() . '/' . substr($sourcePath, strlen($absRefPrefix));
    return self::isAllowedLocalFile($localPath) ? $localPath : '';
}
1046
1047 /**
1048 * Attempts to create a local file path from the absolute URL without
1049 * schema.
1050 *
1051 * @param string $sourcePath
1052 * @return string
1053 */
protected function createLocalPathFromAbsoluteURL($sourcePath)
{
    // "??" guards against an empty string, which has no offset 0.
    if (($sourcePath[0] ?? '') !== '/') {
        return '';
    }
    // Drop the leading slash and map the remainder onto the public web directory.
    $localPath = Environment::getPublicPath() . '/' . substr($sourcePath, 1);
    return self::isAllowedLocalFile($localPath) ? $localPath : '';
}
1066
1067 /**
1068 * Attempts to create a local file path from the relative URL.
1069 *
1070 * @param string $sourcePath
1071 * @return string
1072 */
protected function createLocalPathFromRelativeURL($sourcePath)
{
    if (!self::isRelativeURL($sourcePath)) {
        return '';
    }
    // Relative links are resolved against the public web directory.
    $candidate = Environment::getPublicPath() . '/' . $sourcePath;
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1084
1085 /**
1086 * Checks if URL is relative.
1087 *
1088 * @param string $url
1089 * @return bool
1090 */
protected static function isRelativeURL($url)
{
    $urlParts = parse_url($url);
    if (!is_array($urlParts)) {
        // Historical behaviour for URLs parse_url() cannot handle: report them as
        // relative. createLocalPathFromRelativeURL() still validates the resulting file.
        return true;
    }
    $hasScheme = ($urlParts['scheme'] ?? '') !== '';
    // "path" is absent for inputs like "?foo=bar" and empty for ""; neither starts with a slash.
    $path = $urlParts['path'] ?? '';
    return !$hasScheme && ($path[0] ?? '') !== '/';
}
1096
1097 /**
1098 * Checks if the path points to the file inside the web site
1099 *
1100 * @param string $filePath
1101 * @return bool
1102 */
protected static function isAllowedLocalFile($filePath)
{
    // Collapse "../" segments first so the prefix check sees the real target.
    $filePath = GeneralUtility::resolveBackPath($filePath);
    // Compare against the public path INCLUDING a trailing slash. A bare prefix
    // comparison would also accept sibling directories that merely share the prefix,
    // e.g. "/var/www/site-private/..." for a public path of "/var/www/site".
    $publicRoot = rtrim(Environment::getPublicPath(), '/') . '/';
    if (strpos($filePath, $publicRoot) !== 0) {
        return false;
    }
    return is_file($filePath);
}
1110
1111 /******************************************
1112 *
1113 * Indexing; external files (PDF, DOC, etc)
1114 *
1115 ******************************************/
1116 /**
1117 * Indexing a regular document given as $file (relative to public web path, local file)
1118 *
1119 * @param string $file Relative Filename, relative to public web path. It can also be an absolute path as long as it is inside the lockRootPath (validated with \TYPO3\CMS\Core\Utility\GeneralUtility::isAbsPath()). Finally, if $contentTmpFile is set, this value can be anything, most likely a URL
1120 * @param bool $force If set, indexing is forced (despite content hashes, mtime etc).
1121 * @param string $contentTmpFile Temporary file with the content to read it from (instead of $file). Used when the $file is a URL.
1122 * @param string $altExtension File extension for temporary file.
1123 */
public function indexRegularDocument($file, $force = false, $contentTmpFile = '', $altExtension = '')
{
    // The extension selects the external parser; an explicit $altExtension wins over
    // the one in the file name. Files without any extension yield an empty string.
    $fI = pathinfo($file);
    $ext = $altExtension ?: strtolower($fI['extension'] ?? '');
    // Resolve the file the content is actually read from.
    if ($contentTmpFile) {
        $absFile = $contentTmpFile;
    } else {
        if (GeneralUtility::isAbsPath($file)) {
            // Absolute, pass-through:
            $absFile = $file;
        } else {
            // Relative, prepend public web path:
            $absFile = GeneralUtility::getFileAbsFileName(Environment::getPublicPath() . '/' . $file);
        }
        $absFile = GeneralUtility::isAllowedAbsPath($absFile) ? $absFile : '';
    }
    if (!$absFile || !@is_file($absFile)) {
        $this->log_setTSlogMessage('Indexing not possible; File "' . $absFile . '" not found or valid.');
        return;
    }
    // empty() also covers extensions no parser was registered for at all.
    if (empty($this->external_parsers[$ext])) {
        $this->log_setTSlogMessage('Indexing not possible; The extension "' . $ext . '" was not supported.');
        return;
    }
    $fileInfo = stat($absFile);
    // A document may be divided into several parts, each indexed as its own record.
    $cParts = $this->fileContentParts($ext, $absFile);
    foreach ($cParts as $cPKey) {
        $this->internal_log = [];
        $this->log_push('Index: ' . str_replace('.', '_', PathUtility::basename($file)) . ($cPKey ? '#' . $cPKey : ''), '');
        $Pstart = GeneralUtility::milliseconds();
        // Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
        $subinfo = ['key' => $cPKey];
        $phash_arr = ($this->file_phash_arr = $this->setExtHashes($file, $subinfo));
        $check = $this->checkMtimeTstamp($fileInfo['mtime'], $phash_arr['phash']);
        if ($check > 0 || $force) {
            if ($check > 0) {
                $this->log_setTSlogMessage('Indexing needed, reason: ' . $this->reasons[$check], 1);
            } else {
                $this->log_setTSlogMessage('Indexing forced by flag', 1);
            }
            // Check external file counter:
            if ($this->externalFileCounter < $this->maxExternalFiles || $force) {
                // Divide into title,keywords,description and body:
                $this->log_push('Split content', '');
                $contentParts = $this->readFileContent($ext, $absFile, $cPKey);
                $this->log_pull();
                if (is_array($contentParts)) {
                    // Hash over what is to be the actual content. The separator is passed
                    // first: the reversed implode() argument order is not accepted by PHP 8.
                    $content_md5h = IndexedSearchUtility::md5inthash(implode('', $contentParts));
                    if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force) {
                        // Increment counter:
                        $this->externalFileCounter++;
                        // Splitting words
                        $this->log_push('Extract words from content', '');
                        $splitInWords = $this->processWordsInArrays($contentParts);
                        $this->log_pull();
                        // Analyze the indexed words.
                        $this->log_push('Analyze the extracted words', '');
                        $indexArr = $this->indexAnalyze($splitInWords);
                        $this->log_pull();
                        // Submitting page (phash) record. The original creation time of a
                        // file cannot be determined here, so the stat() times are passed on.
                        $this->log_push('Submitting page', '');
                        $this->submitFilePage($phash_arr, $file, $subinfo, $ext, $fileInfo['mtime'], $fileInfo['ctime'], $fileInfo['size'], $content_md5h, $contentParts);
                        $this->log_pull();
                        // Check words and submit to word list if not there
                        $this->log_push('Check word list and submit words', '');
                        if (IndexedSearchUtility::isTableUsed('index_words')) {
                            $this->checkWordList($indexArr);
                            $this->submitWords($indexArr, $phash_arr['phash']);
                        }
                        $this->log_pull();
                        // Set parsetime
                        $this->updateParsetime($phash_arr['phash'], GeneralUtility::milliseconds() - $Pstart);
                    } else {
                        // Update the timestamp
                        $this->updateTstamp($phash_arr['phash'], $fileInfo['mtime']);
                        $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $content_md5h . ', has not changed. Timestamp updated.');
                    }
                } else {
                    $this->log_setTSlogMessage('Could not index file! Unsupported extension.');
                }
            } else {
                $this->log_setTSlogMessage('The limit of ' . $this->maxExternalFiles . ' has already been exceeded, so no indexing will take place this time.');
            }
        } else {
            $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
        }
        // Setting a section-record for the file. This is done also if the file is not
        // indexed. Notice that section records are deleted when the page is indexed.
        $this->submitFile_section($phash_arr['phash']);
        $this->log_pull();
    }
}
1221
1222 /**
1223 * Reads the content of an external file being indexed.
1224 * The content from the external parser MUST be returned in utf-8!
1225 *
1226 * @param string $fileExtension File extension, eg. "pdf", "doc" etc.
1227 * @param string $absoluteFileName Absolute filename of file (must exist and be validated OK before calling function)
 * @param string $sectionPointer Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be split.)
1229 * @return array Standard content array (title, description, keywords, body keys)
1230 */
public function readFileContent($fileExtension, $absoluteFileName, $sectionPointer)
{
    // Consult the external document parser registered for this extension.
    // "??" covers extensions no parser was registered for.
    $parser = $this->external_parsers[$fileExtension] ?? null;
    if (!is_object($parser)) {
        // No usable parser: callers test the result with is_array().
        return null;
    }
    return $parser->readFileContent($fileExtension, $absoluteFileName, $sectionPointer);
}
1240
1241 /**
1242 * Creates an array with pointers to divisions of document.
1243 *
1244 * @param string $ext File extension
1245 * @param string $absFile Absolute filename (must exist and be validated OK before calling function)
1246 * @return array Array of pointers to sections that the document should be divided into
1247 */
public function fileContentParts($ext, $absFile)
{
    // Consult the external document parser registered for this extension.
    // "??" covers extensions no parser was registered for.
    $parser = $this->external_parsers[$ext] ?? null;
    if (!is_object($parser)) {
        // Default: a single pointer "0", i.e. the document is not divided.
        return [0];
    }
    return $parser->fileContentParts($ext, $absFile);
}
1257
1258 /**
1259 * Splits non-HTML content (from external files for instance)
1260 *
1261 * @param string $content Input content (non-HTML) to index.
1262 * @return array Array of content, having the key "body" set (plus "title", "description" and "keywords", but empty)
1263 * @see splitHTMLContent()
1264 */
public function splitRegularContent($content)
{
    // Start from the default content array and place the raw content into "body".
    $parts = $this->defaultContentArray;
    $parts['body'] = $content;

    return $parts;
}
1271
1272 /**********************************
1273 *
1274 * Analysing content, Extracting words
1275 *
1276 **********************************/
1277 /**
1278 * Convert character set and HTML entities in the value of input content array keys
1279 *
1280 * @param array $contentArr Standard content array
1281 * @param string $charset Charset of the input content (converted to utf-8)
1282 */
public function charsetEntity2utf8(&$contentArr, $charset)
{
    foreach ($contentArr as $key => $value) {
        // Empty parts need neither conversion nor decoding.
        if ((string)$value === '') {
            continue;
        }
        // Convert charset if necessary
        if ($charset !== 'utf-8') {
            $value = mb_convert_encoding($value, 'utf-8', $charset);
        }
        // Decode all numeric / HTML entities to real characters. ENT_QUOTES is passed
        // explicitly because the default flags before PHP 8.1 leave single-quote
        // entities such as "&#039;" untouched; the explicit target charset makes the
        // result independent of the "default_charset" ini setting.
        $contentArr[$key] = html_entity_decode($value, ENT_QUOTES, 'UTF-8');
    }
}
1296
1297 /**
1298 * Processing words in the array from split*Content -functions
1299 *
1300 * @param array $contentArr Array of content to index, see splitHTMLContent() and splitRegularContent()
 * @return array Content input array modified so each key is now a unique array of words
1302 */
public function processWordsInArrays($contentArr)
{
    // Let the lexer turn every part of the content array into a list of words.
    foreach (array_keys($contentArr) as $part) {
        $contentArr[$part] = $this->lexerObj->split2Words($contentArr[$part]);
    }
    // Duplicates are unwanted in title, keywords and description; body is left as is.
    foreach (['title', 'keywords', 'description'] as $headerPart) {
        $contentArr[$headerPart] = array_unique($contentArr[$headerPart]);
    }

    return $contentArr;
}
1316
1317 /**
1318 * Extracts the sample description text from the content array.
1319 *
1320 * @param array $contentArr Content array
1321 * @return string Description string
1322 */
public function bodyDescription($contentArr)
{
    // Configured description length, forced into 0..255 with a default of 200.
    $maxL = MathUtility::forceIntegerInRange($this->conf['index_descrLgd'], 0, 255, 200);
    if (!$maxL) {
        // A length of 0 disables the description; return the documented string type
        // instead of an undefined variable.
        return '';
    }
    // Collapse every run of whitespace into a single space.
    $bodyDescription = (string)preg_replace('/\s+/u', ' ', $contentArr['body'] ?? '');
    // Shorten the string; mb_strcut() cuts by bytes without splitting a character.
    return mb_strcut($bodyDescription, 0, $maxL, 'utf-8');
}
1334
1335 /**
1336 * Analyzes content to use for indexing,
1337 *
1338 * @param array $content Standard content array: an array with the keys title,keywords,description and body, which all contain an array of words.
 * @return array Index array keyed by word; each entry holds "hash", "metaphone" and "count", plus "cmp" (header words) and "first" (body words) where set
1340 */
public function indexAnalyze($content)
{
    $indexArr = [];
    // Header parts in processing order, each with the bit offset passed to analyzeHeaderinfo().
    $headerPriorities = ['title' => 7, 'keywords' => 6, 'description' => 5];
    foreach ($headerPriorities as $part => $bitOffset) {
        $this->analyzeHeaderinfo($indexArr, $content, $part, $bitOffset);
    }
    // The body is analyzed last, on top of the header words.
    $this->analyzeBody($indexArr, $content);

    return $indexArr;
}
1350
1351 /**
1352 * Calculates relevant information for headercontent
1353 *
1354 * @param array $retArr Index array, passed by reference
1355 * @param array $content Standard content array
1356 * @param string $key Key from standard content array
1357 * @param int $offset Bit-wise priority to type
1358 */
public function analyzeHeaderinfo(&$retArr, $content, $key, $offset)
{
    foreach ($content[$key] as $val) {
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        $val = substr($val, 0, 60);
        if (!isset($retArr[$val])) {
            // Word ID (wid)
            $retArr[$val]['hash'] = IndexedSearchUtility::md5inthash($val);
            // Metaphone value is also only 60 chars long
            $metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
            $retArr[$val]['metaphone'] = $metaphone;
        }
        // Build metaphone fulltext string (can be used for fulltext indexing)
        if ($this->storeMetaphoneInfoAsWords) {
            $this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
        }
        // Priority used for flagBitMask feature (see extension configuration).
        // "cmp" does not exist yet for a word seen for the first time, so start from 0.
        $retArr[$val]['cmp'] = ($retArr[$val]['cmp'] ?? 0) | pow(2, $offset);
        // Increase number of occurrences, starting from 0 for a new word.
        $retArr[$val]['count'] = ($retArr[$val]['count'] ?? 0) + 1;
        $this->wordcount++;
    }
}
1382
1383 /**
1384 * Calculates relevant information for bodycontent
1385 *
1386 * @param array $retArr Index array, passed by reference
1387 * @param array $content Standard content array
1388 */
public function analyzeBody(&$retArr, $content)
{
    foreach ($content['body'] as $key => $val) {
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        $val = substr($val, 0, 60);
        if (!isset($retArr[$val])) {
            // First occurrence (used for ranking results)
            $retArr[$val]['first'] = $key;
            // Word ID (wid)
            $retArr[$val]['hash'] = IndexedSearchUtility::md5inthash($val);
            // Metaphone value is also only 60 chars long
            $metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
            $retArr[$val]['metaphone'] = $metaphone;
        }
        // Build metaphone fulltext string (can be used for fulltext indexing)
        if ($this->storeMetaphoneInfoAsWords) {
            $this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
        }
        // Increase number of occurrences; "count" does not exist yet for a word
        // seen for the first time, so start from 0.
        $retArr[$val]['count'] = ($retArr[$val]['count'] ?? 0) + 1;
        $this->wordcount++;
    }
}
1412
1413 /**
1414 * Creating metaphone based hash from input word
1415 *
1416 * @param string $word Word to convert
1417 * @param bool $returnRawMetaphoneValue If set, returns the raw metaphone value (not hashed)
1418 * @return mixed Metaphone hash integer (or raw value, string)
1419 */
public function metaphone($word, $returnRawMetaphoneValue = false)
{
    // Prefer the configured metaphone object; otherwise fall back to PHP's native metaphone().
    $rawValue = is_object($this->metaphoneObj)
        ? $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid'])
        : metaphone($word);

    if ($returnRawMetaphoneValue) {
        return $rawValue;
    }
    if ($rawValue !== '') {
        // Create hash and return integer
        return IndexedSearchUtility::md5inthash($rawValue);
    }
    // An empty metaphone value is represented by 0.
    return 0;
}
1438
1439 /********************************
1440 *
1441 * SQL; TYPO3 Pages
1442 *
1443 *******************************/
1444 /**
1445 * Updates db with information about the page (TYPO3 page, not external media)
1446 */
public function submitPage()
{
    $phash = $this->hash['phash'];
    // Remove any current data for this phash:
    $this->removeOldIndexedPages($phash);
    // Setting new phash_row
    $fields = [
        'phash' => $phash,
        'phash_grouping' => $this->hash['phash_grouping'],
        'cHashParams' => serialize($this->cHashParams),
        'contentHash' => $this->content_md5h,
        'data_page_id' => $this->conf['id'],
        'data_page_type' => $this->conf['type'],
        'data_page_mp' => $this->conf['MP'],
        'gr_list' => $this->conf['gr_list'],
        // item_type 0: TYPO3 page
        'item_type' => 0,
        'item_title' => $this->contentParts['title'],
        'item_description' => $this->bodyDescription($this->contentParts),
        'item_mtime' => (int)$this->conf['mtime'],
        'item_size' => strlen($this->conf['content']),
        'tstamp' => $GLOBALS['EXEC_TIME'],
        'crdate' => $GLOBALS['EXEC_TIME'],
        // Creation date of page
        'item_crdate' => $this->conf['crdate'],
        // Sys language uid of the page. Should reflect which language it DOES actually display!
        'sys_language_uid' => $this->conf['sys_language_uid'],
        'externalUrl' => 0,
        'recordUid' => (int)$this->conf['recordUid'],
        'freeIndexUid' => (int)$this->conf['freeIndexUid'],
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId']
    ];
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash');
        $connection->insert(
            'index_phash',
            $fields,
            ['cHashParams' => Connection::PARAM_LOB]
        );
    }
    // PROCESSING index_section
    $this->submit_section($phash, $phash);
    // PROCESSING index_grlist
    $this->submit_grlist($phash, $phash);
    // PROCESSING index_fulltext
    $fields = [
        'phash' => $phash,
        'fulltextdata' => implode(' ', $this->contentParts),
        'metaphonedata' => $this->metaphoneContent
    ];
    if ($this->indexerConfig['fullTextDataLength'] > 0) {
        // Same byte limit as before, but mb_strcut() never cuts in the middle of a
        // multi-byte UTF-8 character, so the stored text stays valid UTF-8.
        $fields['fulltextdata'] = mb_strcut($fields['fulltextdata'], 0, (int)$this->indexerConfig['fullTextDataLength'], 'utf-8');
    }
    if (IndexedSearchUtility::isTableUsed('index_fulltext')) {
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_fulltext');
        $connection->insert('index_fulltext', $fields);
    }
    // PROCESSING index_debug
    if ($this->indexerConfig['debugMode']) {
        $fields = [
            'phash' => $phash,
            'debuginfo' => serialize([
                'cHashParams' => $this->cHashParams,
                'external_parsers initialized' => array_keys($this->external_parsers),
                // Content and body are limited to their first 1000 bytes in the debug record.
                'conf' => array_merge($this->conf, ['content' => substr($this->conf['content'], 0, 1000)]),
                'contentParts' => array_merge($this->contentParts, ['body' => substr($this->contentParts['body'], 0, 1000)]),
                'logs' => $this->internal_log,
                'lexer' => $this->lexerObj->debugString
            ])
        ];
        if (IndexedSearchUtility::isTableUsed('index_debug')) {
            $connection = GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('index_debug');
            $connection->insert('index_debug', $fields);
        }
    }
}
1525
1526 /**
1527 * Stores gr_list in the database.
1528 *
1529 * @param int $hash Search result record phash
1530 * @param int $phash_x Actual phash of current content
1531 * @see update_grlist()
1532 */
public function submit_grlist($hash, $phash_x)
{
    // The gr_list record stores the group list itself together with its integer hash.
    $groupList = $this->conf['gr_list'];
    $record = [
        'phash' => $hash,
        'phash_x' => $phash_x,
        'hash_gr_list' => IndexedSearchUtility::md5inthash($groupList),
        'gr_list' => $groupList
    ];
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }
    GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist')
        ->insert('index_grlist', $record);
}
1548
1549 /**
1550 * Stores section
1551 * $hash and $hash_t3 are the same for TYPO3 pages, but different when it is external files.
1552 *
1553 * @param int $hash phash of TYPO3 parent search result record
1554 * @param int $hash_t3 phash of the file indexation search record
1555 */
public function submit_section($hash, $hash_t3)
{
    $record = [
        'phash' => $hash,
        'phash_t3' => $hash_t3,
        'page_id' => (int)$this->conf['id']
    ];
    // getRootLineFields() receives the record before it is written.
    $this->getRootLineFields($record);
    if (!IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }
    GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_section')
        ->insert('index_section', $record);
}
1570
1571 /**
1572 * Removes records for the indexed page, $phash
1573 *
1574 * @param int $phash phash value to flush
1575 */
public function removeOldIndexedPages($phash)
{
    $pool = GeneralUtility::makeInstance(ConnectionPool::class);
    $phashValue = (int)$phash;

    // Removing old registrations for all tables. Because the pages are TYPO3 pages
    // there can be nothing else than 1-1 relations here.
    foreach (['index_phash', 'index_section', 'index_grlist', 'index_fulltext', 'index_debug'] as $tableName) {
        if (!IndexedSearchUtility::isTableUsed($tableName)) {
            continue;
        }
        $pool->getConnectionForTable($tableName)->delete($tableName, ['phash' => $phashValue]);
    }

    // Removing all index_section records with phash_t3 set to this hash (this includes such
    // records set for external media on the page as well!). The re-insert of these records
    // is done in indexRegularDocument($file).
    if (!IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }
    $pool->getConnectionForTable('index_section')
        ->delete('index_section', ['phash_t3' => $phashValue]);
}
1596
1597 /********************************
1598 *
1599 * SQL; External media
1600 *
1601 *******************************/
1602 /**
1603 * Updates db with information about the file
1604 *
1605 * @param array $hash Array with phash and phash_grouping keys for file
1606 * @param string $file File name
1607 * @param array $subinfo Array of "cHashParams" for files: This is for instance the page index for a PDF file (other document types it will be a zero)
1608 * @param string $ext File extension determining the type of media.
1609 * @param int $mtime Modification time of file.
1610 * @param int $ctime Creation time of file.
1611 * @param int $size Size of file in bytes
1612 * @param int $content_md5h Content HASH value.
1613 * @param array $contentParts Standard content array (using only title and body for a file)
1614 */
public function submitFilePage($hash, $file, $subinfo, $ext, $mtime, $ctime, $size, $content_md5h, $contentParts)
{
    // Find item type: the parser may map the extension to another item type;
    // without a mapping the extension itself is stored.
    $storeItemType = ($this->external_parsers[$ext]->ext2itemtype_map[$ext] ?? null) ?: $ext;
    // Remove any current data for this phash:
    $this->removeOldIndexedFiles($hash['phash']);
    // Split filename: only a URL carries a scheme, plain local paths do not.
    $fileParts = parse_url($file);
    // Setting new
    $fields = [
        'phash' => $hash['phash'],
        'phash_grouping' => $hash['phash_grouping'],
        'cHashParams' => serialize($subinfo),
        'contentHash' => $content_md5h,
        'data_filename' => $file,
        'item_type' => $storeItemType,
        // Fall back to the file name when the document provides no title.
        'item_title' => trim($contentParts['title'] ?? '') ?: PathUtility::basename($file),
        'item_description' => $this->bodyDescription($contentParts),
        'item_mtime' => $mtime,
        'item_size' => $size,
        'item_crdate' => $ctime,
        'tstamp' => $GLOBALS['EXEC_TIME'],
        'crdate' => $GLOBALS['EXEC_TIME'],
        'gr_list' => $this->conf['gr_list'],
        // empty() copes with local paths (no "scheme" key) and with a failed parse_url().
        'externalUrl' => !empty($fileParts['scheme']) ? 1 : 0,
        'recordUid' => (int)$this->conf['recordUid'],
        'freeIndexUid' => (int)$this->conf['freeIndexUid'],
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId'],
        'sys_language_uid' => (int)$this->conf['sys_language_uid']
    ];
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash');
        $connection->insert(
            'index_phash',
            $fields,
            ['cHashParams' => Connection::PARAM_LOB]
        );
    }
    // PROCESSING index_fulltext
    $fields = [
        'phash' => $hash['phash'],
        'fulltextdata' => implode(' ', $contentParts),
        'metaphonedata' => $this->metaphoneContent
    ];
    if ($this->indexerConfig['fullTextDataLength'] > 0) {
        // Same byte limit as before, but mb_strcut() never cuts in the middle of a
        // multi-byte UTF-8 character, so the stored text stays valid UTF-8.
        $fields['fulltextdata'] = mb_strcut($fields['fulltextdata'], 0, (int)$this->indexerConfig['fullTextDataLength'], 'utf-8');
    }
    if (IndexedSearchUtility::isTableUsed('index_fulltext')) {
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_fulltext');
        $connection->insert('index_fulltext', $fields);
    }
    // PROCESSING index_debug
    if ($this->indexerConfig['debugMode']) {
        $fields = [
            'phash' => $hash['phash'],
            'debuginfo' => serialize([
                'cHashParams' => $subinfo,
                // The body is limited to its first 1000 bytes in the debug record.
                'contentParts' => array_merge($contentParts, ['body' => substr($contentParts['body'], 0, 1000)]),
                'logs' => $this->internal_log,
                'lexer' => $this->lexerObj->debugString
            ])
        ];
        if (IndexedSearchUtility::isTableUsed('index_debug')) {
            $connection = GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('index_debug');
            $connection->insert('index_debug', $fields);
        }
    }
}
1687
/**
 * Ensures that a gr_list entry exists for an indexed file.
 *
 * A new entry is submitted only when index_grlist holds no row for the phash
 * under either the default gr_list ($this->defaultGrList) or the gr_list of
 * the current configuration.
 *
 * @param int $hash phash value of file
 */
public function submitFile_grlist($hash)
{
    // Nothing to look up or store when the index_grlist table is not in use.
    if (IndexedSearchUtility::isTableUsed('index_grlist')) {
        $query = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable('index_grlist');

        $phashMatches = $query->expr()->eq(
            'phash',
            $query->createNamedParameter($hash, \PDO::PARAM_INT)
        );
        $defaultGroupMatches = $query->expr()->eq(
            'hash_gr_list',
            $query->createNamedParameter(
                IndexedSearchUtility::md5inthash($this->defaultGrList),
                \PDO::PARAM_INT
            )
        );
        $currentGroupMatches = $query->expr()->eq(
            'hash_gr_list',
            $query->createNamedParameter(
                IndexedSearchUtility::md5inthash($this->conf['gr_list']),
                \PDO::PARAM_INT
            )
        );

        $existingEntries = (int)$query
            ->count('*')
            ->from('index_grlist')
            ->where(
                $phashMatches,
                $query->expr()->orX($defaultGroupMatches, $currentGroupMatches)
            )
            ->execute()
            ->fetchColumn();

        // For files the "search result" phash and the "real" phash are the same value.
        if ($existingEntries === 0) {
            $this->submit_grlist($hash, $hash);
        }
    }
}
1733
/**
 * Ensures that a section entry exists for an indexed file on the current page.
 *
 * A section record is submitted only when index_section has no row that
 * combines the given phash with the page id of the current configuration.
 *
 * @param int $hash phash value of file
 */
public function submitFile_section($hash)
{
    // Nothing to look up or store when the index_section table is not in use.
    if (IndexedSearchUtility::isTableUsed('index_section')) {
        $query = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable('index_section');

        $phashMatches = $query->expr()->eq(
            'phash',
            $query->createNamedParameter($hash, \PDO::PARAM_INT)
        );
        $pageMatches = $query->expr()->eq(
            'page_id',
            $query->createNamedParameter($this->conf['id'], \PDO::PARAM_INT)
        );

        $existingSections = (int)$query
            ->count('phash')
            ->from('index_section')
            ->where($phashMatches, $pageMatches)
            ->execute()
            ->fetchColumn();

        if ($existingSections === 0) {
            $this->submit_section($hash, $this->hash['phash']);
        }
    }
}
1767
/**
 * Deletes the rows belonging to a phash from the phash, grlist, fulltext
 * and debug index tables. Tables that are not in use are left alone.
 *
 * @param int $phash phash value to flush
 */
public function removeOldIndexedFiles($phash)
{
    $pool = GeneralUtility::makeInstance(ConnectionPool::class);
    $identifier = ['phash' => (int)$phash];

    foreach (['index_phash', 'index_grlist', 'index_fulltext', 'index_debug'] as $tableName) {
        if (IndexedSearchUtility::isTableUsed($tableName)) {
            $pool->getConnectionForTable($tableName)->delete($tableName, $identifier);
        }
    }
}
1785
1786 /********************************
1787 *
1788 * SQL Helper functions
1789 *
1790 *******************************/
/**
 * Decides, from the stored tstamp / item_mtime of an index_phash row, whether
 * the page or file identified by $phash has to be (re-)indexed.
 *
 * Result codes (see $this->reasons):
 *  -2  minimum age was NOT exceeded, so indexing cannot occur
 *  -1  mtime matched, no need to re-index (the row's tstamp is refreshed unless a max age is configured)
 *   1  maximum age exceeded, must be indexed again
 *   2  stored mtime differs from the given mtime, must be indexed
 *   3  no mtime was given, so the document is indexed
 *   4  no indexed row found (or index_phash is not in use), so the document is indexed
 *
 * @param int $mtime mtime value to test against limits and indexed page (usually this is the mtime of the cached document)
 * @param int $phash "phash" used to select any already indexed page to see what its mtime is.
 * @return int Generally: <0 = no indexing, >0 = do indexing
 */
public function checkMtimeTstamp($mtime, $phash)
{
    // Without the index_phash table nothing can have been indexed yet.
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return 4;
    }

    $indexedRow = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->select(
            ['item_mtime', 'tstamp'],
            'index_phash',
            ['phash' => (int)$phash],
            [],
            [],
            1
        )
        ->fetch();

    // The document is not represented in index_phash at all.
    if (empty($indexedRow)) {
        return 4;
    }

    // The configured max-age was exceeded for the document.
    if ($this->tstamp_maxAge && $indexedRow['tstamp'] + $this->tstamp_maxAge < $GLOBALS['EXEC_TIME']) {
        return 1;
    }

    // A minimum age is configured and has not passed yet.
    if ($this->tstamp_minAge && $indexedRow['tstamp'] + $this->tstamp_minAge >= $GLOBALS['EXEC_TIME']) {
        return -2;
    }

    // No mtime to compare against, so the document is indexed.
    if (!$mtime) {
        return 3;
    }

    // The stored mtime differs from the current one: time to re-index.
    if ($indexedRow['item_mtime'] != $mtime) {
        return 2;
    }

    // mtime matched: no content update. Only refresh the timestamp when no max age is in effect.
    if ($this->tstamp_maxAge) {
        $secondsUntilExpiry = $indexedRow['tstamp'] + $this->tstamp_maxAge - $GLOBALS['EXEC_TIME'];
        $this->log_setTSlogMessage('mtime matched, timestamp NOT updated because a maxAge is set (' . $secondsUntilExpiry . ' seconds to expire time).', 1);
    } else {
        $this->updateTstamp($phash);
        $this->log_setTSlogMessage('mtime matched, timestamp updated.', 1);
    }
    return -1;
}
1856
/**
 * Looks for an index_phash row with the current phash_grouping and content hash.
 *
 * With this lookup the page is only indexed when its content differs from
 * an already indexed page of the same "phash_grouping".
 *
 * @return bool|array TRUE if the page needs to be indexed (no matching row, or index_phash is not in use), otherwise the matching row (an array holding "phash") to which the grlist record should be related
 */
public function checkContentHash()
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return true;
    }

    $matchingRow = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->select(
            ['phash'],
            'index_phash',
            [
                'phash_grouping' => (int)$this->hash['phash_grouping'],
                'contentHash' => (int)$this->content_md5h
            ],
            [],
            [],
            1
        )
        ->fetch();

    return empty($matchingRow) ? true : $matchingRow;
}
1887
/**
 * Tells whether an external document still has to be indexed, judged by
 * whether index_phash already holds a row with the same grouping hash and
 * content hash.
 *
 * @param int $hashGr phash value to check (phash_grouping)
 * @param int $content_md5h Content hash to check
 * @return bool TRUE if the document needs to be indexed (no matching row, or index_phash is not in use)
 */
public function checkExternalDocContentHash($hashGr, $content_md5h)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return true;
    }

    $criteria = [
        'phash_grouping' => (int)$hashGr,
        'contentHash' => (int)$content_md5h
    ];
    $matches = (int)GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->count('*', 'index_phash', $criteria);

    return $matches === 0;
}
1915
/**
 * Tells whether index_grlist contains at least one row for the given phash_x
 * (the "real" phash of the current content, not the linked-to phash of the
 * common search result page).
 *
 * @param int $phash_x Phash integer to test.
 * @return bool FALSE when no row exists or index_grlist is not in use
 */
public function is_grlist_set($phash_x)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return false;
    }

    $entries = (int)GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist')
        ->count('phash_x', 'index_grlist', ['phash_x' => (int)$phash_x]);

    return $entries > 0;
}
1938
/**
 * Writes a grlist entry for the phash and the current gr_list unless one
 * is already stored.
 *
 * @param int $phash phash of the search result that should be found
 * @param int $phash_x The real phash of the current content. The two values are different when a page with userlogin turns out to contain the exact same content as another already indexed version of the page; This is the whole reason for the grlist table in fact...
 * @see submit_grlist()
 */
public function update_grlist($phash, $phash_x)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }

    $criteria = [
        'phash' => (int)$phash,
        'hash_gr_list' => IndexedSearchUtility::md5inthash($this->conf['gr_list'])
    ];
    $entries = (int)GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist')
        ->count('phash', 'index_grlist', $criteria);

    if ($entries !== 0) {
        // An entry for this phash / gr_list combination is already present.
        return;
    }

    $this->submit_grlist($phash, $phash_x);
    $this->log_setTSlogMessage('Inserted gr_list \'' . $this->conf['gr_list'] . '\' for phash \'' . $phash . '\'', 1);
}
1966
/**
 * Sets the tstamp of an index_phash row to the current execution time and,
 * when an mtime is given, stores that as item_mtime as well.
 *
 * @param int $phash phash value
 * @param int $mtime If set, update the mtime field to this value.
 */
public function updateTstamp($phash, $mtime = 0)
{
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $values = ['tstamp' => $GLOBALS['EXEC_TIME']];
        if ($mtime) {
            $values['item_mtime'] = (int)$mtime;
        }

        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash');
        $connection->update('index_phash', $values, ['phash' => (int)$phash]);
    }
}
1997
/**
 * Stores the freeIndexSetId of the current configuration in the
 * index_phash row of the given phash.
 *
 * @param int $phash phash value
 */
public function updateSetId($phash)
{
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $values = ['freeIndexSetId' => (int)$this->conf['freeIndexSetId']];
        $identifier = ['phash' => (int)$phash];

        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash');
        $connection->update('index_phash', $values, $identifier);
    }
}
2021
/**
 * Stores the parse time in the index_phash row of the given phash.
 *
 * @param int $phash phash value.
 * @param int $parsetime Parsetime value to set.
 */
public function updateParsetime($phash, $parsetime)
{
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $values = ['parsetime' => (int)$parsetime];
        $identifier = ['phash' => (int)$phash];

        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash');
        $connection->update('index_phash', $values, $identifier);
    }
}
2046
/**
 * Rewrites the root-line fields of all index_section rows that belong to
 * the page id of the current configuration.
 */
public function updateRootline()
{
    if (IndexedSearchUtility::isTableUsed('index_section')) {
        // getRootLineFields() fills the array by reference.
        $rootLineValues = [];
        $this->getRootLineFields($rootLineValues);

        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_section');
        $connection->update(
            'index_section',
            $rootLineValues,
            ['page_id' => (int)$this->conf['id']]
        );
    }
}
2069
/**
 * Adds the root-line values to a field array.
 *
 * rl0, rl1 and rl2 are always set. Further fields can be registered in
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields']
 * as "field name => root-line level" pairs.
 *
 * A level that is not present in $this->conf['rootline_uids'] results in 0.
 * Missing levels are read with the null coalescing operator so that a short
 * root line does not raise undefined-offset notices.
 *
 * @param array $fieldArray Field array, passed by reference
 */
public function getRootLineFields(array &$fieldArray)
{
    $rootLineUids = $this->conf['rootline_uids'] ?? [];

    $fieldArray['rl0'] = (int)($rootLineUids[0] ?? 0);
    $fieldArray['rl1'] = (int)($rootLineUids[1] ?? 0);
    $fieldArray['rl2'] = (int)($rootLineUids[2] ?? 0);

    $additionalFields = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'] ?? [];
    foreach ($additionalFields as $fieldName => $rootLineLevel) {
        $fieldArray[$fieldName] = (int)($rootLineUids[$rootLineLevel] ?? 0);
    }
}
2085
2086 /********************************
2087 *
2088 * SQL; Submitting words
2089 *
2090 *******************************/
/**
 * Inserts those words of the word list into index_words that are not stored yet.
 *
 * A count query first checks whether every word id is already present; only
 * if the numbers differ are the stored base words fetched, removed from the
 * list, and the remaining words inserted.
 *
 * @param array $wordListArray Word List array (where each word has information about position etc).
 */
public function checkWordList($wordListArray)
{
    if (!IndexedSearchUtility::isTableUsed('index_words') || empty($wordListArray)) {
        return;
    }

    $expectedCount = count($wordListArray);
    $wordIds = array_map('intval', array_column($wordListArray, 'hash'));
    $pool = GeneralUtility::makeInstance(ConnectionPool::class);

    $countQuery = $pool->getQueryBuilderForTable('index_words');
    $wordIdConstraint = $countQuery->expr()->in(
        'wid',
        $countQuery->createNamedParameter($wordIds, Connection::PARAM_INT_ARRAY)
    );
    $storedCount = (int)$countQuery
        ->count('baseword')
        ->from('index_words')
        ->where($wordIdConstraint)
        ->execute()
        ->fetchColumn();

    // Every word is already known: nothing to insert.
    if ($storedCount === $expectedCount) {
        return;
    }

    $connection = $pool->getConnectionForTable('index_words');
    $selectQuery = $connection->createQueryBuilder();
    $storedWords = $selectQuery
        ->select('baseword')
        ->from('index_words')
        ->where(
            $selectQuery->expr()->in(
                'wid',
                $selectQuery->createNamedParameter($wordIds, Connection::PARAM_INT_ARRAY)
            )
        )
        ->execute();

    $this->log_setTSlogMessage('Inserting words: ' . ($expectedCount - $storedCount), 1);

    // Drop every word that is already stored; what remains has to be inserted.
    while ($storedWord = $storedWords->fetch()) {
        unset($wordListArray[$storedWord['baseword']]);
    }

    foreach ($wordListArray as $baseWord => $wordInfo) {
        // A duplicate-key error will occur here if a stored word was NOT removed by the unset() above.
        // However, as long as the words in the word list are NOT longer than 60 chars (the baseword
        // varchar is 60 characters...) this is not a problem.
        $connection->insert(
            'index_words',
            [
                'wid' => $wordInfo['hash'],
                'baseword' => $baseWord,
                'metaphone' => $wordInfo['metaphone']
            ]
        );
    }
}
2151
/**
 * Submits RELATIONS between words and phash.
 *
 * All existing index_rel rows of the phash are deleted first. After that one
 * row per word is bulk-inserted, skipping every word whose id is flagged as
 * stop word (is_stopword != 0) in index_words.
 *
 * @param array $wordList Word list array
 * @param int $phash phash value
 */
public function submitWords($wordList, $phash)
{
    if (!IndexedSearchUtility::isTableUsed('index_rel')) {
        return;
    }
    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

    // Collect the ids of all words that are flagged as stop words.
    $queryBuilder = $connectionPool->getQueryBuilderForTable('index_words');
    $result = $queryBuilder->select('wid')
        ->from('index_words')
        ->where(
            $queryBuilder->expr()->neq('is_stopword', $queryBuilder->createNamedParameter(0, \PDO::PARAM_INT))
        )
        ->groupBy('wid')
        ->execute();

    $stopWords = [];
    while ($row = $result->fetch()) {
        $stopWords[$row['wid']] = $row;
    }

    // Old relations of this phash are always replaced, even if no new rows follow.
    $connectionPool->getConnectionForTable('index_rel')->delete('index_rel', ['phash' => (int)$phash]);

    $fields = ['phash', 'wid', 'count', 'first', 'freq', 'flags'];
    $rows = [];
    foreach ($wordList as $val) {
        if (isset($stopWords[$val['hash']])) {
            continue;
        }
        // Guard against a division by zero: a total word count of 0 would otherwise raise a
        // warning (PHP 7) or a DivisionByZeroError (PHP 8). A relative frequency of 0 is used then.
        // NOTE(review): presumably wordcount can stay 0 while the word list is non-empty - confirm against the analyzers.
        $relativeFrequency = $this->wordcount ? $val['count'] / $this->wordcount : 0;
        $rows[] = [
            (int)$phash,
            (int)$val['hash'],
            (int)$val['count'],
            (int)$val['first'],
            $this->freqMap($relativeFrequency),
            $val['cmp'] & $this->flagBitMask
        ];
    }

    if (!empty($rows)) {
        $connectionPool->getConnectionForTable('index_rel')->bulkInsert('index_rel', $rows, $fields);
    }
}
2200
/**
 * Maps a frequency given as real number in [0;1] onto the range
 * [0;$this->freqRange], capping the result at $this->freqRange. A value
 * above 1 is mapped in the opposite direction (divided by the map factor).
 *
 * Note that the result is not cast to an integer here.
 *
 * @param float $freq Frequency
 * @return float|int Frequency in range.
 */
public function freqMap($freq)
{
    $mapFactor = $this->freqMax * 100 * $this->freqRange;

    if ($freq <= 1) {
        $scaled = $freq * $mapFactor;
        if ($scaled > $this->freqRange) {
            return $this->freqRange;
        }
        return $scaled;
    }

    return $freq / $mapFactor;
}
2219
2220 /********************************
2221 *
2222 * Hashing
2223 *
2224 *******************************/
/**
 * Calculates the search hashes for a TYPO3 page and stores them in $this->hash.
 *
 * "phash_grouping" identifies a "page" combined of id, type, language,
 * mountpoint and cHash parameters. "phash" additionally takes the gr_list
 * into account (a subdivision for special page compositions based on login;
 * such pages are expected to be normally similar regardless of the login).
 */
public function setT3Hashes()
{
    $pageIdentity = [
        'id' => (int)$this->conf['id'],
        'type' => (int)$this->conf['type'],
        'sys_lang' => (int)$this->conf['sys_language_uid'],
        'MP' => (string)$this->conf['MP'],
        'cHash' => $this->cHashParams
    ];
    $groupingHash = IndexedSearchUtility::md5inthash(serialize($pageIdentity));
    $this->hash['phash_grouping'] = $groupingHash;

    // The plain phash is built from the same data plus the gr_list, appended as last key.
    $pageIdentity['gr_list'] = (string)$this->conf['gr_list'];
    $plainHash = IndexedSearchUtility::md5inthash(serialize($pageIdentity));
    $this->hash['phash'] = $plainHash;
}
2244
/**
 * Calculates the search hashes for an external file.
 *
 * The grouping hash is built from the file name alone, the phash from the
 * file name plus the sub information.
 *
 * @param string $file File name / path which identifies it on the server
 * @param array $subinfo Additional content identifying the (subpart of) content. For instance; PDF files are divided into groups of pages for indexing.
 * @return array Array with "phash_grouping" and "phash" inside.
 */
public function setExtHashes($file, $subinfo = [])
{
    $groupingData = ['file' => $file];
    $fullData = $groupingData;
    $fullData['subinfo'] = $subinfo;

    return [
        'phash_grouping' => IndexedSearchUtility::md5inthash(serialize($groupingData)),
        'phash' => IndexedSearchUtility::md5inthash(serialize($fullData))
    ];
}
2266
2267 /*********************************
2268 *
2269 * Internal logging functions
2270 *
2271 *********************************/
/**
 * Forwards a push() call to the time tracker.
 *
 * @param string $msg Title to set
 * @param string $key Key (?)
 */
public function log_push($msg, $key)
{
    $tracker = $this->timeTracker;
    $tracker->push($msg, $key);
}
2282
/**
 * Forwards a pull() call to the time tracker.
 */
public function log_pull()
{
    $tracker = $this->timeTracker;
    $tracker->pull();
}
2290
/**
 * Passes a log message to the time tracker and additionally records it in
 * the internal log of this indexer.
 *
 * @param string $msg Message to set
 * @param int $errorNum Error number
 */
public function log_setTSlogMessage($msg, $errorNum = 0)
{
    $tracker = $this->timeTracker;
    $tracker->setTSlogMessage($msg, $errorNum);

    // Keep a copy of the message in the indexer's own log.
    $this->internal_log[] = $msg;
}
2302
/**
 * Makes sure that keywords are space-separated. This is important for
 * displaying them properly as a part of the fulltext index.
 *
 * The comma-separated list is split, rejoined with ", " and wrapped in a
 * leading and a trailing space.
 *
 * @param string $keywordList
 * @return string
 * @see http://forge.typo3.org/issues/14959
 */
protected function addSpacesToKeywordList($keywordList)
{
    $spacedKeywords = implode(', ', GeneralUtility::trimExplode(',', $keywordList));
    return ' ' . $spacedKeywords . ' ';
}
2316 }