[TASK] Make indexed search tests notice free
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / Classes / Indexer.php
1 <?php
2 namespace TYPO3\CMS\IndexedSearch;
3
4 /*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use TYPO3\CMS\Core\Compatibility\PublicPropertyDeprecationTrait;
18 use TYPO3\CMS\Core\Configuration\ExtensionConfiguration;
19 use TYPO3\CMS\Core\Context\Context;
20 use TYPO3\CMS\Core\Context\LanguageAspect;
21 use TYPO3\CMS\Core\Core\Environment;
22 use TYPO3\CMS\Core\Database\Connection;
23 use TYPO3\CMS\Core\Database\ConnectionPool;
24 use TYPO3\CMS\Core\TimeTracker\TimeTracker;
25 use TYPO3\CMS\Core\Utility\GeneralUtility;
26 use TYPO3\CMS\Core\Utility\MathUtility;
27 use TYPO3\CMS\Core\Utility\PathUtility;
28 use TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController;
29 use TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility;
30
31 /**
32 * Indexing class for TYPO3 frontend
33 */
34 class Indexer
35 {
use PublicPropertyDeprecationTrait;

/**
 * List of all deprecated public properties
 * @var array
 */
protected $deprecatedPublicProperties = [
    'csObj' => 'Using $csObj within Indexing is discouraged, the property will be removed in TYPO3 v10.0 - if needed instantiate CharsetConverter yourself.',
];

/**
 * Log messages keyed by the numeric result of checkMtimeTstamp(); looked up in
 * indexTypo3PageContent() when reporting why a page is (or is not) indexed.
 *
 * @var array
 */
public $reasons = [
    -1 => 'mtime matched the document, so no changes detected and no content updated',
    -2 => 'The minimum age was not exceeded',
    1 => 'The configured max-age was exceeded for the document and thus it\'s indexed.',
    2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
    3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
    4 => 'Page has never been indexed (is not represented in the index_phash table).'
];

/**
 * HTML code blocks to exclude from indexing (comma separated list of tag names)
 *
 * @var string
 */
public $excludeSections = 'script,style';

/**
 * External parser objects, keys are file extension names. Values are objects with certain methods.
 *
 * @var array
 */
public $external_parsers = [];

/**
 * Fe-group list (pages might be indexed separately for each usergroup combination to support search
 * in access limited pages!)
 *
 * @var string
 */
public $defaultGrList = '0,-1';

/**
 * If set, this tells a number of seconds that is the maximum age of an indexed document.
 * Regardless of mtime the document will be re-indexed if this limit is exceeded.
 *
 * @var int
 */
public $tstamp_maxAge = 0;

/**
 * If set, this tells a minimum limit before a document can be indexed again. This is regardless of mtime.
 *
 * @var int
 */
public $tstamp_minAge = 0;

/**
 * Max number of external files to index.
 *
 * @var int
 */
public $maxExternalFiles = 0;

/**
 * If TRUE, indexing is forced despite of hashes etc.
 *
 * @var bool
 */
public $forceIndexing = false;

/**
 * Set when crawler is detected (internal)
 *
 * @var bool
 */
public $crawlerActive = false;

/**
 * Template for the array returned by splitHTMLContent()
 *
 * @var array
 */
public $defaultContentArray = [
    'title' => '',
    'description' => '',
    'keywords' => '',
    'body' => ''
];

/**
 * @var int
 */
public $wordcount = 0;

/**
 * @var int
 */
public $externalFileCounter = 0;

/**
 * Configuration set internally (see init functions for required keys and their meaning)
 *
 * @var array
 */
public $conf = [];

/**
 * Indexer configuration, coming from TYPO3's system configuration for EXT:indexed_search
 *
 * @var array
 */
public $indexerConfig = [];

/**
 * Hash array, contains phash and phash_grouping
 *
 * @var array
 */
public $hash = [];

/**
 * Hash array for files
 *
 * @var array
 */
public $file_phash_arr = [];

/**
 * Content of TYPO3 page, split into parts by splitHTMLContent()
 *
 * @var array
 */
public $contentParts = [];

/**
 * Hash calculated over the content parts in indexTypo3PageContent()
 *
 * @var string
 */
public $content_md5h = '';

/**
 * Internal log
 *
 * @var array
 */
public $internal_log = [];

/**
 * @var string
 */
public $indexExternalUrl_content = '';

/**
 * cHashparams array
 *
 * @var array
 */
public $cHashParams = [];

/**
 * @var int
 */
public $freqRange = 32000;

/**
 * @var float
 */
public $freqMax = 0.1;

/**
 * @var bool
 */
public $enableMetaphoneSearch = false;

/**
 * @var bool
 */
public $storeMetaphoneInfoAsWords;

/**
 * @var string
 */
public $metaphoneContent = '';

/**
 * Charset class object
 *
 * @var \TYPO3\CMS\Core\Charset\CharsetConverter
 * @deprecated since TYPO3 v9.3, will be removed in TYPO3 v10 (also the instantiation in the init() method).
 */
protected $csObj;

/**
 * Metaphone object, if any
 *
 * @var \TYPO3\CMS\IndexedSearch\Utility\DoubleMetaPhoneUtility
 */
public $metaphoneObj;

/**
 * Lexer object for word splitting
 *
 * @var \TYPO3\CMS\IndexedSearch\Lexer
 */
public $lexerObj;

/**
 * Integer in the range 0-255, read from the "flagBitMask" extension configuration in init()
 *
 * @var int
 */
public $flagBitMask;

/**
 * @var TimeTracker
 */
protected $timeTracker;
254
/**
 * Creates the indexer and fetches the TimeTracker instance it keeps in $this->timeTracker.
 */
public function __construct()
{
    /** @var TimeTracker $tracker */
    $tracker = GeneralUtility::makeInstance(TimeTracker::class);
    $this->timeTracker = $tracker;
}
262
/**
 * Frontend hook that indexes the page currently rendered by the frontend controller.
 *
 * Nothing happens unless "config.index_enable" is set. Otherwise the page is indexed when
 * - frontend indexing is not disabled in the extension configuration (or a crawler run requested re-indexing),
 * - the page does not carry the "no_search" flag,
 * - the page is not rendered with "no_cache", and
 * - the language id equals the content language id (no fall-back content).
 * In every other case only an explanatory message is logged via log_setTSlogMessage().
 *
 * @param TypoScriptFrontendController $pObj Parent Object, passed by reference
 */
public function hook_indexContent(&$pObj)
{
    // Indexer configuration from Extension Manager interface:
    $disableFrontendIndexing = (bool)GeneralUtility::makeInstance(ExtensionConfiguration::class)->get('indexed_search', 'disableFrontendIndexing');

    // Crawler activation:
    // Requirements are that the crawler is loaded, a crawler session is running and re-indexing
    // is requested as processing instruction. All keys are read defensively because the crawler
    // data is simply absent on regular frontend requests.
    $crawlerData = $pObj->applicationData['tx_crawler'] ?? [];
    $procInstructions = (array)($crawlerData['parameters']['procInstructions'] ?? []);
    if (\TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler')
        && !empty($crawlerData['running'])
        && in_array('tx_indexedsearch_reindex', $procInstructions, true)
    ) {
        // Setting simple log message:
        $pObj->applicationData['tx_crawler']['log'][] = 'Forced Re-indexing enabled';
        // Crawler active flag
        $this->crawlerActive = true;
        $this->forceIndexing = true;
    }

    // Indexing has to be enabled explicitly through TypoScript
    if (empty($pObj->config['config']['index_enable'])) {
        return;
    }

    $this->log_push('Index page', '');
    if ($disableFrontendIndexing && !$this->crawlerActive) {
        $this->log_setTSlogMessage('Index page? No, Ordinary Frontend indexing during rendering is disabled.');
    } elseif (!empty($pObj->page['no_search'])) {
        $this->log_setTSlogMessage('Index page? No, The "No Search" flag has been set in the page properties!');
    } elseif ($pObj->no_cache) {
        $this->log_setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
    } else {
        /** @var LanguageAspect $languageAspect */
        $languageAspect = GeneralUtility::makeInstance(Context::class)->getAspect('language');
        if ($languageAspect->getId() === $languageAspect->getContentId()) {
            // Root line uids
            $rootlineUids = [];
            foreach ($pObj->config['rootLine'] ?? [] as $rlkey => $rldat) {
                $rootlineUids[$rlkey] = $rldat['uid'];
            }
            // Setting up internal configuration from the frontend controller:
            $this->conf = [
                // Page id
                'id' => $pObj->id,
                // Page type
                'type' => $pObj->type,
                // sys_language UID of the language of the indexing.
                'sys_language_uid' => $languageAspect->getId(),
                // MP variable, if any (Mount Points)
                'MP' => $pObj->MP,
                // Group list
                'gr_list' => implode(',', GeneralUtility::makeInstance(Context::class)->getPropertyFromAspect('frontend.user', 'groupIds', [0, -1])),
                // cHash string for additional parameters
                'cHash' => $pObj->cHash,
                // Array of the additional parameters
                'cHash_array' => $pObj->cHash_array,
                // The creation date of the TYPO3 page
                'crdate' => $pObj->page['crdate'],
                // reg1 of the caching table. Not known what practical use this has.
                // @deprecated since TYPO3 v9, will be removed in TYPO3 v10. Remove along with database field data_page_reg1
                'page_cache_reg1' => $pObj->page_cache_reg1,
                'rootline_uids' => $rootlineUids,
                // Content string (HTML of TYPO3 page)
                'content' => $pObj->content,
                // Alternative title for indexing
                'indexedDocTitle' => $pObj->convOutputCharset($pObj->indexedDocTitle),
                // Character set of content (will be converted to utf-8 during indexing)
                'metaCharset' => $pObj->metaCharset,
                // Most recent modification time (seconds) of the content on the page. Used to evaluate whether it should be re-indexed.
                'mtime' => $pObj->register['SYS_LASTCHANGED'] ?? $pObj->page['SYS_LASTCHANGED'],
                // Whether to index external documents like PDF, DOC etc. (if possible)
                'index_externals' => $pObj->config['config']['index_externals'] ?? null,
                // Length of description text (max 250, default 200)
                'index_descrLgd' => $pObj->config['config']['index_descrLgd'] ?? null,
                // Whether to index document keywords and description (if present)
                'index_metatags' => $pObj->config['config']['index_metatags'] ?? true,
                // Set to zero:
                'recordUid' => 0,
                'freeIndexUid' => 0,
                'freeIndexSetId' => 0,
            ];
            // Init and start indexing:
            $this->init();
            $this->indexTypo3PageContent();
        } else {
            $this->log_setTSlogMessage('Index page? No, languageId was different from contentId which indicates that the page contains fall-back content and that would be falsely indexed as localized content.');
        }
    }
    $this->log_pull();
}
357
358 /****************************
359 *
360 * Backend API
361 *
362 ****************************/
/**
 * Prepares the indexer for a backend-driven indexing run: fills $this->conf for the given page
 * (the "combined ID" / phash is derived from these values) and calls init().
 *
 * @param int $id The page uid, &id=
 * @param int $type The page type, &type=
 * @param int $sys_language_uid sys_language uid, typically &L=
 * @param string $MP The MP variable (Mount Points), &MP=
 * @param array $uidRL Rootline array of only UIDs.
 * @param array $cHash_array Array of GET variables to register with this indexing
 * @param bool $createCHash If set, calculates a cHash value from the $cHash_array. Probably you will not do that since such cases are indexed through the frontend and the idea of this interface is to index non-cacheable pages from the backend!
 */
public function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array = [], $createCHash = false)
{
    $this->conf = [
        // Information about the page for which the indexing takes place
        'id' => $id,
        'type' => $type,
        'sys_language_uid' => $sys_language_uid,
        'MP' => $MP,
        // Group list (hardcoded for now...)
        'gr_list' => '0,-1',
        // cHash string for additional parameters, replaced below when requested
        'cHash' => '',
        // Array of the additional parameters
        'cHash_array' => $cHash_array,
        // Defaults
        'freeIndexUid' => 0,
        'freeIndexSetId' => 0,
        // @deprecated since TYPO3 v9, will be removed in TYPO3 v10. Remove along with database field data_page_reg1
        'page_cache_reg1' => 0,
        // Root line uids
        'rootline_uids' => $uidRL,
        // Whether to index external documents like PDF, DOC etc. (if possible)
        'index_externals' => 1,
        // Length of description text (max 250, default 200)
        'index_descrLgd' => 200,
        // Whether to index document keywords and description (if present)
        'index_metatags' => true,
    ];
    if ($createCHash) {
        /* @var $calculator \TYPO3\CMS\Frontend\Page\CacheHashCalculator */
        $calculator = GeneralUtility::makeInstance(\TYPO3\CMS\Frontend\Page\CacheHashCalculator::class);
        $this->conf['cHash'] = $calculator->generateForParameters(GeneralUtility::implodeArrayForUrl('', $cHash_array));
    }
    // Init and start indexing:
    $this->init();
}
419
/**
 * Stores the free-index uid and set id in $this->conf. Can be called right after backend_initIndexer()
 *
 * @param int $freeIndexUid Free index UID
 * @param int $freeIndexSetId Set id - an integer identifying the "set" of indexing operations.
 */
public function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId = 0)
{
    $freeIndexSettings = [
        'freeIndexUid' => $freeIndexUid,
        'freeIndexSetId' => $freeIndexSetId,
    ];
    foreach ($freeIndexSettings as $key => $value) {
        $this->conf[$key] = $value;
    }
}
431
/**
 * Indexes record data as if it were the content of a TYPO3 page: the values are wrapped into a
 * minimal HTML document which is then handed to indexTypo3PageContent().
 *
 * @param string $title Title equivalent
 * @param string $keywords Keywords equivalent
 * @param string $description Description equivalent
 * @param string $content The main content to index
 * @param string $charset The charset of the title, keyword, description and body-content. MUST BE VALID, otherwise nothing is indexed!
 * @param int $mtime Last modification time, in seconds
 * @param int $crdate The creation date of the content, in seconds
 * @param int $recordUid The record UID that the content comes from (for registration with the indexed rows)
 */
public function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate = 0, $recordUid = 0)
{
    // Most recent modification time (seconds) of the content
    $this->conf['mtime'] = $mtime;
    // The creation date of the TYPO3 content
    $this->conf['crdate'] = $crdate;
    // UID of the record, if applicable
    $this->conf['recordUid'] = $recordUid;

    // Escape every value before it is placed into the fake HTML document
    $escapedTitle = htmlspecialchars($title);
    $escapedKeywords = htmlspecialchars($keywords);
    $escapedDescription = htmlspecialchars($description);
    $escapedContent = htmlspecialchars($content);

    // Construct fake HTML for parsing (content string as if it was the HTML of a TYPO3 page):
    $this->conf['content'] = '
<html>
<head>
<title>' . $escapedTitle . '</title>
<meta name="keywords" content="' . $escapedKeywords . '" />
<meta name="description" content="' . $escapedDescription . '" />
</head>
<body>
' . $escapedContent . '
</body>
</html>';

    // Character set of content (will be converted to utf-8 during indexing)
    $this->conf['metaCharset'] = $charset;
    // Alternative title for indexing
    $this->conf['indexedDocTitle'] = '';
    // Index content as if it was a TYPO3 page:
    $this->indexTypo3PageContent();
}
474
475 /********************************
476 *
477 * Initialization
478 *
479 *******************************/
/**
 * Initializes the object. $this->conf MUST be set with proper values prior to this call!!!
 *
 * Reads the extension configuration of EXT:indexed_search, derives the age limits, file limits and
 * flags from it and instantiates the helper objects (external parsers, lexer, metaphone, charset
 * converter). Missing configuration keys are treated as "not set" instead of raising notices.
 */
public function init()
{
    // Initializing:
    $this->cHashParams = $this->conf['cHash_array'] ?? [];
    if (is_array($this->cHashParams) && !empty($this->cHashParams)) {
        if (!empty($this->conf['cHash'])) {
            // Add this so that URL's come out right...
            $this->cHashParams['cHash'] = $this->conf['cHash'];
        }
        unset($this->cHashParams['encryptionKey']);
    }
    // Setting phash / phash_grouping which identifies the indexed page based on some of these variables:
    $this->setT3Hashes();
    // Indexer configuration from Extension Manager interface:
    $this->indexerConfig = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get('indexed_search');
    // Ages are configured in hours; the float cast keeps fractional values working and avoids
    // arithmetic on empty/non-numeric strings.
    $this->tstamp_minAge = MathUtility::forceIntegerInRange((float)($this->indexerConfig['minAge'] ?? 0) * 3600, 0);
    $this->tstamp_maxAge = MathUtility::forceIntegerInRange((float)($this->indexerConfig['maxAge'] ?? 0) * 3600, 0);
    $this->maxExternalFiles = MathUtility::forceIntegerInRange($this->indexerConfig['maxExternalFiles'] ?? 0, 0, 1000, 5);
    $this->flagBitMask = MathUtility::forceIntegerInRange($this->indexerConfig['flagBitMask'] ?? 0, 0, 255);
    // Workaround: If the extension configuration was not updated yet, the value is not existing
    $this->enableMetaphoneSearch = !isset($this->indexerConfig['enableMetaphoneSearch']) || $this->indexerConfig['enableMetaphoneSearch'];
    $this->storeMetaphoneInfoAsWords = !IndexedSearchUtility::isTableUsed('index_words') && $this->enableMetaphoneSearch;
    // Initialize external document parsers:
    // Example configuration, see ext_localconf.php of this file!
    if (!empty($this->conf['index_externals'])) {
        $this->initializeExternalParsers();
    }
    // Initialize lexer (class that deconstructs the text into words):
    $lexerObjectClassName = ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['lexer'] ?? '') ?: Lexer::class;
    $this->lexerObj = GeneralUtility::makeInstance($lexerObjectClassName);
    $this->lexerObj->debug = $this->indexerConfig['debugMode'] ?? false;
    // Initialize metaphone hook:
    // Make sure that the hook is loaded _after_ indexed_search as this may overwrite the hook depending on the configuration.
    $metaphoneClassName = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['metaphone'] ?? '';
    if ($this->enableMetaphoneSearch && $metaphoneClassName) {
        $this->metaphoneObj = GeneralUtility::makeInstance($metaphoneClassName);
        $this->metaphoneObj->pObj = $this;
    }
    // Init charset class:
    $this->csObj = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Charset\CharsetConverter::class);
}
523
/**
 * Instantiates every external parser registered in
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'] and keeps those
 * whose initParser() call succeeds in $this->external_parsers, keyed by file extension.
 *
 * @access private
 * @see init()
 */
public function initializeExternalParsers()
{
    $registeredParsers = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'] ?? [];
    foreach ($registeredParsers as $extension => $className) {
        $parser = GeneralUtility::makeInstance($className);
        // Register first, so the parser is reachable through the indexer while it initializes
        $this->external_parsers[$extension] = $parser;
        $parser->pObj = $this;
        if ($parser->initParser($extension)) {
            continue;
        }
        // Initialization failed: drop the entry again
        unset($this->external_parsers[$extension]);
    }
}
541
542 /********************************
543 *
544 * Indexing; TYPO3 pages (HTML content)
545 *
546 *******************************/
/**
 * Start indexing of the TYPO3 page.
 *
 * Indexing runs when checkMtimeTstamp() returns a positive reason code, when no gr_list entry
 * exists for the phash yet, or when indexing is forced. If an already indexed page with the same
 * content hash exists (and the max-age reason, code 1, does not apply), only timestamp, set id,
 * gr_list and rootline are refreshed.
 */
public function indexTypo3PageContent()
{
    $check = $this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']);
    $is_grlist = $this->is_grlist_set($this->hash['phash']);

    $indexingNeeded = $check > 0 || !$is_grlist || $this->forceIndexing;
    if (!$indexingNeeded) {
        $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
        return;
    }

    // Setting message:
    if ($this->forceIndexing) {
        $reason = 'Forced';
    } elseif ($check > 0) {
        $reason = $this->reasons[$check];
    } else {
        $reason = 'Updates gr_list!';
    }
    $this->log_setTSlogMessage('Indexing needed, reason: ' . $reason, 1);

    // Divide into title,keywords,description and body:
    $this->log_push('Split content', '');
    $this->contentParts = $this->splitHTMLContent($this->conf['content']);
    if ($this->conf['indexedDocTitle']) {
        $this->contentParts['title'] = $this->conf['indexedDocTitle'];
    }
    $this->log_pull();

    // Calculating a hash over what is to be the actual page content. Maybe this hash should not include title,description and keywords? The bodytext is the primary concern. (on the other hand a changed page-title would make no difference then, so don't!)
    $this->content_md5h = IndexedSearchUtility::md5inthash(implode('', $this->contentParts));

    // This function checks if there is already a page (with gr_list = 0,-1) indexed and if that page has the very same contentHash.
    // If the contentHash is the same, then we can rest assured that this page is already indexed and regardless of mtime and origContent we don't need to do anything more.
    // This will also prevent pages from being indexed if a fe_users has logged in and it turns out that the page content is not changed anyway. fe_users logged in should always search with hash_gr_list = "0,-1" OR "[their_group_list]". This situation will be prevented only if the page has been indexed with no user login on before hand. Else the page will be indexed by users until that event. However that does not present a serious problem.
    $checkCHash = $this->checkContentHash();
    if (is_array($checkCHash) && $check !== 1) {
        // Update the timestamp
        $this->updateTstamp($this->hash['phash'], $this->conf['mtime']);
        $this->updateSetId($this->hash['phash']);
        // $checkCHash['phash'] is the phash of the result row that is similar to the current phash regarding the content hash.
        $this->update_grlist($checkCHash['phash'], $this->hash['phash']);
        $this->updateRootline();
        $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $this->content_md5h . ', has not changed. Timestamp, grlist and rootline updated if necessary.');
        return;
    }

    $parseStart = GeneralUtility::milliseconds();

    $this->log_push('Converting charset of content (' . $this->conf['metaCharset'] . ') to utf-8', '');
    $this->charsetEntity2utf8($this->contentParts, $this->conf['metaCharset']);
    $this->log_pull();

    // Splitting words
    $this->log_push('Extract words from content', '');
    $splitInWords = $this->processWordsInArrays($this->contentParts);
    $this->log_pull();

    // Analyse the indexed words.
    $this->log_push('Analyse the extracted words', '');
    $indexArr = $this->indexAnalyze($splitInWords);
    $this->log_pull();

    // Submitting page (phash) record
    $this->log_push('Submitting page', '');
    $this->submitPage();
    $this->log_pull();

    // Check words and submit to word list if not there
    $this->log_push('Check word list and submit words', '');
    if (IndexedSearchUtility::isTableUsed('index_words')) {
        $this->checkWordList($indexArr);
        $this->submitWords($indexArr, $this->hash['phash']);
    }
    $this->log_pull();

    // Set parsetime
    $this->updateParsetime($this->hash['phash'], GeneralUtility::milliseconds() - $parseStart);

    // Checking external files if configured for.
    $this->log_push('Checking external files', '');
    if ($this->conf['index_externals']) {
        $this->extractLinks($this->conf['content']);
    }
    $this->log_pull();
}
621
/**
 * Splits HTML content and returns an associative array, with title, a list of metatags, and a list of words in the body.
 *
 * Everything before the first "<body" is treated as the head. If the content has no "<body" tag at
 * all, the whole content is treated as head and the body stays empty.
 *
 * @param string $content HTML content to index. To some degree expected to be made by TYPO3 (ei. splitting the header by ":")
 * @return array Array of content, having keys "title", "body", "keywords" and "description" set.
 * @see splitRegularContent()
 */
public function splitHTMLContent($content)
{
    $contentArr = $this->defaultContentArray;
    $content = (string)$content;

    // Divide head from body. A plain substr($content, 0, -strlen($body)) would return an empty
    // head when no body exists (negative zero length), hence the explicit length calculation.
    $bodyPart = stristr($content, '<body');
    if ($bodyPart === false) {
        $bodyPart = '';
    }
    $contentArr['body'] = $bodyPart;
    $headPart = substr($content, 0, strlen($content) - strlen($bodyPart));

    // Get title; only the part after the first ":" is used, if there is one
    $this->embracingTags($headPart, 'TITLE', $contentArr['title'], $unusedAfter, $unusedParams);
    $titleParts = explode(':', $contentArr['title'], 2);
    $contentArr['title'] = trim($titleParts[1] ?? $titleParts[0]);

    // Get keywords and description metatags
    if (!empty($this->conf['index_metatags'])) {
        $metaTags = [];
        $remainingHead = $headPart;
        $attributeString = '';
        // Each successful call hands back the attributes of the next <meta> tag and the text after it
        while ($this->embracingTags($remainingHead, 'meta', $unusedContent, $remainingHead, $attributeString)) {
            $metaTags[] = (string)$attributeString;
        }
        foreach ($metaTags as $attributeString) {
            $attributes = GeneralUtility::get_tag_attributes($attributeString);
            // Tags like <meta charset="..."> have neither "name" nor "content"
            $metaName = (string)($attributes['name'] ?? '');
            $metaContent = (string)($attributes['content'] ?? '');
            if (stripos($metaName, 'keywords') !== false) {
                $contentArr['keywords'] .= ',' . $this->addSpacesToKeywordList($metaContent);
            }
            if (stripos($metaName, 'description') !== false) {
                $contentArr['description'] .= ',' . $metaContent;
            }
        }
    }

    // Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags:
    $this->typoSearchTags($contentArr['body']);

    // Get rid of unwanted sections (ie. scripting and style stuff) in body
    foreach (explode(',', $this->excludeSections) as $tag) {
        while ($this->embracingTags($contentArr['body'], $tag, $unusedContent, $contentArr['body'], $unusedParams)) {
        }
    }

    // Remove tags, but first make sure we don't concatenate words by doing it
    $contentArr['body'] = str_replace('<', ' <', $contentArr['body']);
    $contentArr['body'] = trim(strip_tags($contentArr['body']));
    $contentArr['keywords'] = trim($contentArr['keywords']);
    $contentArr['description'] = trim($contentArr['description']);

    return $contentArr;
}
673
/**
 * Extract the charset value from HTML meta tag.
 *
 * Recognizes the classic <meta http-equiv="Content-Type" content="...; charset=X"> declaration
 * and, as fallback, the HTML5 short form <meta charset="X"> (charset as first attribute).
 *
 * @param string $content HTML content
 * @return string The charset value if found, otherwise an empty string.
 */
public function getHTMLcharset($content)
{
    $content = (string)$content;
    // The value pattern includes "_" so that names like "Shift_JIS" are not cut off
    if (preg_match('/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i', $content, $metaTag)
        && preg_match('/charset[[:space:]]*=[[:space:]]*([[:alnum:]_-]+)/i', $metaTag[0], $charsetMatch)
    ) {
        return $charsetMatch[1];
    }
    if (preg_match('/<meta[[:space:]]+charset[[:space:]]*=[[:space:]]*["\']?([[:alnum:]_-]+)/i', $content, $charsetMatch)) {
        return $charsetMatch[1];
    }
    return '';
}
688
/**
 * Converts a HTML document to utf-8
 *
 * If the charset is unknown to mbstring the content is left in its original encoding instead of
 * being discarded; HTML entities are decoded in either case.
 *
 * @param string $content HTML content, any charset
 * @param string $charset Optional charset (otherwise extracted from HTML)
 * @return string Converted HTML
 */
public function convertHTMLToUtf8($content, $charset = '')
{
    // Find charset (getHTMLcharset() may not find one, hence the string cast):
    $charset = $charset ?: $this->getHTMLcharset($content);
    $charset = strtolower(trim((string)$charset));
    // Convert charset:
    if ($charset && $charset !== 'utf-8') {
        try {
            // An unknown encoding yields a warning plus FALSE (PHP 7) or a ValueError (PHP 8);
            // both are mapped to "no conversion" below.
            $converted = @mb_convert_encoding($content, 'utf-8', $charset);
        } catch (\ValueError $e) {
            $converted = false;
        }
        if (is_string($converted)) {
            $content = $converted;
        }
    }
    // Convert entities, assuming document is now UTF-8:
    return html_entity_decode($content);
}
708
/**
 * Finds first occurrence of embracing tags and returns the embraced content and the original string with
 * the tag removed in the two passed variables. Returns FALSE if no match found. ie. useful for finding
 * <title> of document or removing <script>-sections
 *
 * If the start tag is found but no matching end tag follows, $tagContent is set to an empty string
 * and $stringAfter to everything after the start tag (the text in front of the tag is dropped).
 *
 * @param string $string String to search in
 * @param string $tagName Tag name, eg. "script"
 * @param string $tagContent Passed by reference: Content inside found tag
 * @param string $stringAfter Passed by reference: Content after found tag
 * @param string $paramList Passed by reference: Attributes of the found tag.
 * @return bool Returns FALSE if tag was not found, otherwise TRUE.
 */
public function embracingTags($string, $tagName, &$tagContent, &$stringAfter, &$paramList)
{
    $string = (string)$string;
    $startTag = '<' . $tagName;
    $endTag = '</' . $tagName . '>';
    // stristr used because we want a case-insensitive search for the tag.
    $fromStartTag = stristr($string, $startTag);
    if (!$fromStartTag) {
        return false;
    }
    // Split "attributes>rest"; the start tag may be cut off before its closing ">"
    $tagParts = explode('>', (string)substr($fromStartTag, strlen($startTag)), 2);
    $paramList = $tagParts[0];
    $afterStartTag = $tagParts[1] ?? '';

    $fromEndTag = stristr($afterStartTag, $endTag);
    if ($fromEndTag) {
        // Position of the start tag, derived from the very match found above
        $stringBefore = substr($string, 0, strlen($string) - strlen($fromStartTag));
        $tagContent = substr($afterStartTag, 0, strlen($afterStartTag) - strlen($fromEndTag));
        $stringAfter = $stringBefore . substr($fromEndTag, strlen($endTag));
    } else {
        $tagContent = '';
        $stringAfter = $afterStartTag;
    }
    return true;
}
743
/**
 * Removes content that shouldn't be indexed according to TYPO3SEARCH-tags.
 *
 * The body is split at every "<!--TYPO3SEARCH_" marker. Text following a "begin" marker is kept;
 * an "end" marker keeps the preceding text only if that text was not already opened by a "begin"
 * marker. Text following an "end" marker is dropped.
 *
 * @param string $body HTML Content, passed by reference
 * @return bool Returns TRUE if a TYPOSEARCH_ tag was found, otherwise FALSE.
 */
public function typoSearchTags(&$body)
{
    $expBody = preg_split('/\\<\\!\\-\\-[\\s]?TYPO3SEARCH_/', (string)$body);
    if (!is_array($expBody) || count($expBody) <= 1) {
        return false;
    }
    $body = '';
    // Last chunk that was neither a "begin" nor an "end" section
    $prev = '';
    foreach ($expBody as $val) {
        $part = explode('-->', $val, 2);
        $marker = trim($part[0]);
        if ($marker === 'begin') {
            // A marker without closing "-->" carries no content
            $body .= $part[1] ?? '';
            $prev = '';
        } elseif ($marker === 'end') {
            $body .= $prev;
        } else {
            $prev = $val;
        }
    }
    return true;
}
770
/**
 * Extract links (hrefs) from HTML content and if indexable media is found, it is indexed.
 *
 * Links with a scheme (and no local path) are passed to indexExternalUrl() when the
 * "indexExternalURLs" option is set. Links without a query string that resolve to an existing
 * local file are either queued for the crawler (if "useCrawlerForExternalFiles" is set and the
 * crawler extension is loaded) or indexed directly with indexRegularDocument().
 *
 * @param string $content HTML content
 */
public function extractLinks($content)
{
    // Get links:
    $list = $this->extractHyperLinks($content);
    $crawler = null;
    if (!empty($this->indexerConfig['useCrawlerForExternalFiles']) && \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler')) {
        $crawler = GeneralUtility::makeInstance(\tx_crawler_lib::class);
    }
    // Traverse links:
    foreach ($list as $linkInfo) {
        // localPath means: This file is sent by a download script. While the indexed URL has to
        // point to $linkInfo['href'], the absolute path to the file is specified here!
        $localPath = $linkInfo['localPath'] ?? '';
        // Decode entities:
        $linkSource = htmlspecialchars_decode($localPath ?: $linkInfo['href']);
        // Parse URL (parse_url() returns FALSE for seriously malformed URLs):
        $qParts = parse_url($linkSource) ?: [];
        $query = $qParts['query'] ?? '';
        // Check for jumpurl (TYPO3 specific thing...)
        if ($query && strpos($query, 'jumpurl=') !== false) {
            parse_str($query, $getP);
            $jumpUrl = $getP['jumpurl'] ?? '';
            $linkSource = is_string($jumpUrl) ? $jumpUrl : '';
            $qParts = parse_url($linkSource) ?: [];
            $query = $qParts['query'] ?? '';
        }
        if (!$localPath && !empty($qParts['scheme'])) {
            if (!empty($this->indexerConfig['indexExternalURLs'])) {
                // Index external URL (http or otherwise)
                $this->indexExternalUrl($linkSource);
            }
            continue;
        }
        if ($query) {
            // Local links with a query string are not treated as files
            continue;
        }
        $linkSource = urldecode($linkSource);
        if (GeneralUtility::isAllowedAbsPath($linkSource)) {
            $localFile = $linkSource;
        } else {
            $localFile = GeneralUtility::getFileAbsFileName(Environment::getPublicPath() . '/' . $linkSource);
        }
        if (!$localFile || !@is_file($localFile)) {
            continue;
        }
        // Index local file:
        if (is_object($crawler)) {
            $params = ['document' => $linkSource];
            if ($localPath) {
                $params['alturl'] = $linkInfo['href'];
            }
            // The page content is not needed in the queue entry
            $params['conf'] = $this->conf;
            unset($params['conf']['content']);
            $crawler->addQueueEntry_callBack(0, $params, Hook\CrawlerFilesHook::class, $this->conf['id']);
            $this->log_setTSlogMessage('media "' . $params['document'] . '" added to "crawler" queue.', 1);
        } elseif ($localPath) {
            $fI = pathinfo($linkSource);
            $ext = strtolower($fI['extension'] ?? '');
            $this->indexRegularDocument($linkInfo['href'], false, $linkSource, $ext);
        } else {
            $this->indexRegularDocument($linkSource);
        }
    }
}
846
/**
 * Extracts all links to external documents from the HTML content string
 *
 * Only <a> tags carrying a non-empty href that does not start with "#"
 * end up in the result.
 *
 * @param string $html HTML content to scan for anchor tags
 * @return array Array of hyperlinks (keys: tag, href, localPath (empty if not local))
 * @see extractLinks()
 */
public function extractHyperLinks($html)
{
    $htmlParser = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Html\HtmlParser::class);
    $htmlParts = $htmlParser->splitTags('a', $html);
    $hyperLinksData = [];
    foreach ($htmlParts as $index => $tagData) {
        // Only the odd entries of the split result are inspected as tags
        if ($index % 2 === 0) {
            continue;
        }
        $tagAttributes = $htmlParser->get_tag_attributes($tagData, true);
        $firstTagName = $htmlParser->getFirstTagName($tagData);
        if (strtolower($firstTagName) !== 'a') {
            continue;
        }
        // Anchors without href (for example <a name="top">) have no such key,
        // so read it with a default instead of raising a notice.
        $href = $tagAttributes[0]['href'] ?? '';
        if ($href && $href[0] !== '#') {
            $hyperLinksData[] = [
                'tag' => $tagData,
                'href' => $href,
                'localPath' => $this->createLocalPath($href)
            ];
        }
    }
    return $hyperLinksData;
}
876
/**
 * Extracts the "base href" from content string.
 *
 * The href of the first <base> tag that has a non-empty href is returned.
 *
 * @param string $html Content to analyze
 * @return string The base href or an empty string if not found
 */
public function extractBaseHref($html)
{
    $href = '';
    $htmlParser = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Html\HtmlParser::class);
    $htmlParts = $htmlParser->splitTags('base', $html);
    foreach ($htmlParts as $index => $tagData) {
        // Only the odd entries of the split result are inspected as tags
        if ($index % 2 === 0) {
            continue;
        }
        $tagAttributes = $htmlParser->get_tag_attributes($tagData, true);
        $firstTagName = $htmlParser->getFirstTagName($tagData);
        if (strtolower($firstTagName) === 'base') {
            // A <base> tag may lack the href attribute (e.g. <base target="_blank">);
            // fall back to an empty string so the documented return type holds.
            $href = $tagAttributes[0]['href'] ?? '';
            if ($href) {
                break;
            }
        }
    }
    return $href;
}
902
903 /******************************************
904 *
905 * Indexing; external URL
906 *
907 ******************************************/
/**
 * Index External URLs HTML content
 *
 * The URL is only fetched and indexed if its response headers announce a
 * "text/html" content type. The content is written to a temporary file which
 * is handed to indexRegularDocument() and removed afterwards.
 *
 * @param string $externalUrl URL, eg. "http://typo3.org/"
 * @see indexRegularDocument()
 */
public function indexExternalUrl($externalUrl)
{
    // Get headers:
    $urlHeaders = $this->getUrlHeaders($externalUrl);
    if (!is_array($urlHeaders)) {
        // No answer from the remote side, nothing to index
        return;
    }
    // HTTP header names are case-insensitive, so look the content type up in a normalized copy
    $normalizedHeaders = array_change_key_case($urlHeaders, CASE_LOWER);
    $contentType = (string)($normalizedHeaders['content-type'] ?? '');
    if (stristr($contentType, 'text/html') === false) {
        return;
    }
    $content = ($this->indexExternalUrl_content = GeneralUtility::getUrl($externalUrl));
    if ((string)$content === '') {
        return;
    }
    // Create temporary file:
    $tmpFile = GeneralUtility::tempnam('EXTERNAL_URL');
    if (!$tmpFile) {
        return;
    }
    try {
        GeneralUtility::writeFile($tmpFile, $content);
        // Index that file:
        // Using "TRUE" for second parameter to force indexing of external URLs (mtime doesn't make sense, does it?)
        $this->indexRegularDocument($externalUrl, true, $tmpFile, 'html');
    } finally {
        // Remove the temporary file even if indexing aborted with an exception
        unlink($tmpFile);
    }
}
933
/**
 * Getting HTTP request headers of URL
 *
 * Every non-empty line of the header response is split at the first colon
 * into key and value. The value is stored as-is (it is not trimmed). A line
 * without a colon is stored with its full text as key and NULL as value.
 *
 * @param string $url The URL
 * @return mixed If no answer, returns FALSE. Otherwise an array where HTTP headers are keys
 */
public function getUrlHeaders($url)
{
    // Try to get the headers only
    $content = GeneralUtility::getUrl($url, 2);
    if ((string)$content === '') {
        return false;
    }
    // Compile headers:
    $retVal = [];
    foreach (GeneralUtility::trimExplode(LF, $content, true) as $line) {
        if (trim($line) === '') {
            break;
        }
        $parts = explode(':', $line, 2);
        // Lines without a colon have no second part; do not read an undefined offset for them
        $retVal[$parts[0]] = $parts[1] ?? null;
    }
    return $retVal;
}
958
/**
 * Tries to map a link target to a local file
 *
 * A fixed list of resolver methods is tried in order; the first one that
 * yields a non-empty result wins.
 *
 * @param string $sourcePath Link target to resolve
 * @return string Absolute path to file if file is local, else empty string
 */
protected function createLocalPath($sourcePath)
{
    $resolvers = [
        'createLocalPathFromT3vars',
        'createLocalPathUsingAbsRefPrefix',
        'createLocalPathUsingDomainURL',
        'createLocalPathFromAbsoluteURL',
        'createLocalPathFromRelativeURL'
    ];
    $resolvedPath = '';
    foreach ($resolvers as $resolver) {
        $resolvedPath = $this->{$resolver}($sourcePath);
        if ($resolvedPath != '') {
            return $resolvedPath;
        }
    }
    return $resolvedPath;
}
983
/**
 * Looks the source path up in the file map registered in
 * $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'].
 * This is useful for various download extensions that hide the actual
 * file name but still want the file to be indexed.
 *
 * @param string $sourcePath
 * @return string Registered local file, or an empty string if there is none
 */
protected function createLocalPathFromT3vars($sourcePath)
{
    $registeredFiles = $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'] ?? null;
    if (!is_array($registeredFiles)) {
        return '';
    }
    // The map is keyed by the short MD5 of the source path
    $lookupKey = GeneralUtility::shortMD5($sourcePath);
    $candidate = $registeredFiles[$lookupKey] ?? null;
    // Note: not using self::isAllowedLocalFile here because this method
    // is allowed to index files outside of the web site (for example,
    // protected downloads)
    if ($candidate !== null && is_file($candidate)) {
        return $candidate;
    }
    return '';
}
1007
/**
 * Maps a URL that starts with the site URL of the current request
 * (TYPO3_SITE_URL) to a file below the public web path.
 *
 * @param string $sourcePath
 * @return string Local file path, or an empty string if the URL does not match or the file is not allowed
 */
protected function createLocalPathUsingDomainURL($sourcePath)
{
    $siteUrl = GeneralUtility::getIndpEnv('TYPO3_SITE_URL');
    $siteUrlLength = strlen($siteUrl);
    if (substr($sourcePath, 0, $siteUrlLength) != $siteUrl) {
        return '';
    }
    // Strip the site URL and resolve the remainder below the public path
    $candidate = Environment::getPublicPath() . '/' . substr($sourcePath, $siteUrlLength);
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1028
/**
 * Attempts to create a local file path by matching absRefPrefix. This
 * requires TSFE. If TSFE is missing, or no absRefPrefix is configured,
 * this function does nothing.
 *
 * @param string $sourcePath
 * @return string Local file path, or an empty string if the prefix does not match or the file is not allowed
 */
protected function createLocalPathUsingAbsRefPrefix($sourcePath)
{
    if (!isset($GLOBALS['TSFE']) || !$GLOBALS['TSFE'] instanceof TypoScriptFrontendController) {
        return '';
    }
    // The setting is optional; read it with a default so a missing key raises no notice
    $absRefPrefix = (string)($GLOBALS['TSFE']->config['config']['absRefPrefix'] ?? '');
    // Strict prefix test; the empty-prefix case is excluded first because strpos() rejects an empty needle
    if ($absRefPrefix === '' || strpos((string)$sourcePath, $absRefPrefix) !== 0) {
        return '';
    }
    $localPath = Environment::getPublicPath() . '/' . substr($sourcePath, strlen($absRefPrefix));
    return self::isAllowedLocalFile($localPath) ? $localPath : '';
}
1052
/**
 * Attempts to create a local file path from the absolute URL without
 * schema, i.e. a path starting with "/".
 *
 * @param string $sourcePath
 * @return string Local file path, or an empty string if the path is not absolute or the file is not allowed
 */
protected function createLocalPathFromAbsoluteURL($sourcePath)
{
    // strpos() instead of $sourcePath[0]: an empty string has no offset 0 to read
    if (strpos((string)$sourcePath, '/') !== 0) {
        return '';
    }
    // Drop the leading slash, it is re-added when prepending the public path
    $localPath = Environment::getPublicPath() . '/' . substr($sourcePath, 1);
    return self::isAllowedLocalFile($localPath) ? $localPath : '';
}
1072
/**
 * Resolves a relative URL against the public web path.
 *
 * @param string $sourcePath
 * @return string Local file path, or an empty string if the URL is not relative or the file is not allowed
 */
protected function createLocalPathFromRelativeURL($sourcePath)
{
    if (!self::isRelativeURL($sourcePath)) {
        return '';
    }
    $candidate = Environment::getPublicPath() . '/' . $sourcePath;
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1090
/**
 * Checks if URL is relative.
 *
 * A URL counts as relative if it has no scheme and its path does not start
 * with "/". URLs without any path component (for example "?id=1") and URLs
 * parse_url() cannot parse are reported as relative as well.
 *
 * @param string $url
 * @return bool
 */
protected static function isRelativeURL($url)
{
    $urlParts = @parse_url($url);
    if (!is_array($urlParts)) {
        // parse_url() gave up on the URL; there is neither a scheme nor an absolute path
        return true;
    }
    // Both components are optional in the parse_url() result
    $scheme = $urlParts['scheme'] ?? '';
    $path = (string)($urlParts['path'] ?? '');
    return $scheme === '' && strpos($path, '/') !== 0;
}
1102
/**
 * Checks if the path points to a file inside the public web path
 *
 * Back path segments ("../") are resolved before the check.
 *
 * @param string $filePath
 * @return bool TRUE if the resolved path lies below the public path and is an existing file
 */
protected static function isAllowedLocalFile($filePath)
{
    $filePath = GeneralUtility::resolveBackPath($filePath);
    // Compare against the public path including its trailing slash. A plain prefix
    // match would also accept sibling directories that merely share the prefix
    // (e.g. "/var/www-private/..." for the public path "/var/www").
    $publicPathPrefix = rtrim(Environment::getPublicPath(), '/') . '/';
    if (strpos((string)$filePath, $publicPathPrefix) !== 0) {
        return false;
    }
    return is_file($filePath);
}
1116
1117 /******************************************
1118 *
1119 * Indexing; external files (PDF, DOC, etc)
1120 *
1121 ******************************************/
/**
 * Indexing a regular document given as $file (relative to public web path, local file)
 *
 * The document is only processed if the file exists and a parser is registered
 * in $this->external_parsers for its extension. It is indexed once per content
 * part reported by fileContentParts(); a section record is written for every
 * part, also when the part itself does not need re-indexing.
 *
 * @param string $file Relative Filename, relative to public web path. It can also be an absolute path as long as it is inside the lockRootPath (validated with \TYPO3\CMS\Core\Utility\GeneralUtility::isAbsPath()). Finally, if $contentTmpFile is set, this value can be anything, most likely a URL
 * @param bool $force If set, indexing is forced (despite content hashes, mtime etc).
 * @param string $contentTmpFile Temporary file with the content to read it from (instead of $file). Used when the $file is a URL.
 * @param string $altExtension File extension for temporary file.
 */
public function indexRegularDocument($file, $force = false, $contentTmpFile = '', $altExtension = '')
{
    // Init
    $fI = pathinfo($file);
    // pathinfo() has no "extension" key for names without a dot
    $ext = $altExtension ?: strtolower($fI['extension'] ?? '');
    // Create abs-path:
    if (!$contentTmpFile) {
        if (!GeneralUtility::isAbsPath($file)) {
            // Relative, prepend public web path:
            $absFile = GeneralUtility::getFileAbsFileName(Environment::getPublicPath() . '/' . $file);
        } else {
            // Absolute, pass-through:
            $absFile = $file;
        }
        $absFile = GeneralUtility::isAllowedAbsPath($absFile) ? $absFile : '';
    } else {
        $absFile = $contentTmpFile;
    }
    if (!$absFile || !@is_file($absFile)) {
        $this->log_setTSlogMessage('Indexing not possible; File "' . $absFile . '" not found or valid.');
        return;
    }
    // An extension without a registered parser has no entry in the parser list at all
    if (empty($this->external_parsers[$ext])) {
        $this->log_setTSlogMessage('Indexing not possible; The extension "' . $ext . '" was not supported.');
        return;
    }
    // Indexing the document:
    $fileInfo = stat($absFile);
    $cParts = $this->fileContentParts($ext, $absFile);
    foreach ($cParts as $cPKey) {
        $this->internal_log = [];
        $this->log_push('Index: ' . str_replace('.', '_', PathUtility::basename($file)) . ($cPKey ? '#' . $cPKey : ''), '');
        $Pstart = GeneralUtility::milliseconds();
        $subinfo = ['key' => $cPKey];
        // Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
        $phash_arr = ($this->file_phash_arr = $this->setExtHashes($file, $subinfo));
        $check = $this->checkMtimeTstamp($fileInfo['mtime'], $phash_arr['phash']);
        if ($check > 0 || $force) {
            if ($check > 0) {
                $this->log_setTSlogMessage('Indexing needed, reason: ' . $this->reasons[$check], 1);
            } else {
                $this->log_setTSlogMessage('Indexing forced by flag', 1);
            }
            // Check external file counter:
            if ($this->externalFileCounter < $this->maxExternalFiles || $force) {
                // Divide into title,keywords,description and body:
                $this->log_push('Split content', '');
                $contentParts = $this->readFileContent($ext, $absFile, $cPKey);
                $this->log_pull();
                if (is_array($contentParts)) {
                    // Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
                    // implode() takes the separator first; the reversed legacy order is no longer accepted by PHP 8
                    $content_md5h = IndexedSearchUtility::md5inthash(implode('', $contentParts));
                    if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force) {
                        // Increment counter:
                        $this->externalFileCounter++;
                        // Splitting words
                        $this->log_push('Extract words from content', '');
                        $splitInWords = $this->processWordsInArrays($contentParts);
                        $this->log_pull();
                        // Analyse the indexed words.
                        $this->log_push('Analyse the extracted words', '');
                        $indexArr = $this->indexAnalyze($splitInWords);
                        $this->log_pull();
                        // Submitting page (phash) record
                        $this->log_push('Submitting page', '');
                        // Unfortunately I cannot determine WHEN a file is originally made - so I must return the modification time...
                        $this->submitFilePage($phash_arr, $file, $subinfo, $ext, $fileInfo['mtime'], $fileInfo['ctime'], $fileInfo['size'], $content_md5h, $contentParts);
                        $this->log_pull();
                        // Check words and submit to word list if not there
                        $this->log_push('Check word list and submit words', '');
                        if (IndexedSearchUtility::isTableUsed('index_words')) {
                            $this->checkWordList($indexArr);
                            $this->submitWords($indexArr, $phash_arr['phash']);
                        }
                        $this->log_pull();
                        // Set parsetime
                        $this->updateParsetime($phash_arr['phash'], GeneralUtility::milliseconds() - $Pstart);
                    } else {
                        // Update the timestamp
                        $this->updateTstamp($phash_arr['phash'], $fileInfo['mtime']);
                        $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $content_md5h . ', has not changed. Timestamp updated.');
                    }
                } else {
                    $this->log_setTSlogMessage('Could not index file! Unsupported extension.');
                }
            } else {
                $this->log_setTSlogMessage('The limit of ' . $this->maxExternalFiles . ' has already been exceeded, so no indexing will take place this time.');
            }
        } else {
            $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
        }
        // Checking and setting sections:
        // Setting a section-record for the file. This is done also if the file is not indexed. Notice that section records are deleted when the page is indexed.
        $this->submitFile_section($phash_arr['phash']);
        $this->log_pull();
    }
}
1227
/**
 * Reads the content of an external file being indexed.
 * The content from the external parser MUST be returned in utf-8!
 *
 * @param string $fileExtension File extension, eg. "pdf", "doc" etc.
 * @param string $absoluteFileName Absolute filename of file (must exist and be validated OK before calling function)
 * @param string $sectionPointer Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be splitted.)
 * @return array|null Standard content array (title, description, keywords, body keys), NULL if no parser object is registered for the extension
 */
public function readFileContent($fileExtension, $absoluteFileName, $sectionPointer)
{
    // Consult relevant external document parser; an unknown extension has no entry in the list
    $parser = $this->external_parsers[$fileExtension] ?? null;
    if (!is_object($parser)) {
        return null;
    }
    return $parser->readFileContent($fileExtension, $absoluteFileName, $sectionPointer);
}
1246
/**
 * Creates an array with pointers to divisions of document.
 *
 * Without a registered parser object for the extension the document is
 * treated as one single part with the pointer 0.
 *
 * @param string $ext File extension
 * @param string $absFile Absolute filename (must exist and be validated OK before calling function)
 * @return array Array of pointers to sections that the document should be divided into
 */
public function fileContentParts($ext, $absFile)
{
    // Consult relevant external document parser; an unknown extension has no entry in the list
    $parser = $this->external_parsers[$ext] ?? null;
    if (!is_object($parser)) {
        return [0];
    }
    return $parser->fileContentParts($ext, $absFile);
}
1263
/**
 * Wraps non-HTML content (from external files for instance) into a content array
 *
 * The array starts as a copy of $this->defaultContentArray; the given content
 * is stored under the "body" key.
 *
 * @param string $content Input content (non-HTML) to index.
 * @return array Array of content, having the key "body" set
 * @see splitHTMLContent()
 */
public function splitRegularContent($content)
{
    $parts = $this->defaultContentArray;
    $parts['body'] = $content;

    return $parts;
}
1277
1278 /**********************************
1279 *
1280 * Analysing content, Extracting words
1281 *
1282 **********************************/
/**
 * Converts every non-empty value of the content array to utf-8 and decodes
 * the HTML entities it contains. The array is modified in place.
 *
 * @param array $contentArr Standard content array
 * @param string $charset Charset of the input content (converted to utf-8)
 */
public function charsetEntity2utf8(&$contentArr, $charset)
{
    $needsConversion = $charset !== 'utf-8';
    foreach ($contentArr as $field => $text) {
        // Empty values are left untouched
        if ((string)$text === '') {
            continue;
        }
        if ($needsConversion) {
            $text = mb_convert_encoding($text, 'utf-8', $charset);
        }
        // decode all numeric / html-entities in the string to real characters:
        $contentArr[$field] = html_entity_decode($text);
    }
}
1302
/**
 * Splits every part of the content array into words using the lexer
 *
 * @param array $contentArr Array of content to index, see splitHTMLContent() and splitRegularContent()
 * @return array Content input array modified so each key now holds an array of words (without duplicates for title, keywords and description)
 */
public function processWordsInArrays($contentArr)
{
    // split all parts to words
    foreach ($contentArr as $field => $text) {
        $contentArr[$field] = $this->lexerObj->split2Words($text);
    }
    // For title, keywords, and description we don't want duplicates:
    foreach (['title', 'keywords', 'description'] as $uniqueField) {
        $contentArr[$uniqueField] = array_unique($contentArr[$uniqueField]);
    }
    return $contentArr;
}
1322
/**
 * Extracts the sample description text from the content array.
 *
 * Whitespace runs in the body are collapsed to single spaces and the result
 * is cut to the length configured in $this->conf['index_descrLgd'] (forced
 * into the range 0-255, default 200).
 *
 * @param array $contentArr Content array
 * @return string Description string, empty if the resulting maximum length is 0
 */
public function bodyDescription($contentArr)
{
    // Always defined, also when no description is to be built
    $bodyDescription = '';
    // Setting description; a missing setting is passed on as 0
    $maxL = MathUtility::forceIntegerInRange($this->conf['index_descrLgd'] ?? 0, 0, 255, 200);
    if ($maxL) {
        // preg_replace() yields NULL on failure, hence the string cast
        $bodyDescription = (string)preg_replace('/\s+/u', ' ', $contentArr['body'] ?? '');
        // Shorten the string:
        $bodyDescription = mb_strcut($bodyDescription, 0, $maxL, 'utf-8');
    }
    return $bodyDescription;
}
1340
/**
 * Builds the index array from the words of a content array
 *
 * Title, keywords and description are analyzed as header information with
 * the bit offsets 7, 6 and 5, followed by the body.
 *
 * @param array $content Standard content array: an array with the keys title,keywords,description and body, which all contain an array of words.
 * @return array Index array, filled by analyzeHeaderinfo() and analyzeBody()
 */
public function indexAnalyze($content)
{
    $indexArr = [];
    $headerOffsets = [
        'title' => 7,
        'keywords' => 6,
        'description' => 5
    ];
    foreach ($headerOffsets as $field => $bitOffset) {
        $this->analyzeHeaderinfo($indexArr, $content, $field, $bitOffset);
    }
    $this->analyzeBody($indexArr, $content);

    return $indexArr;
}
1356
/**
 * Calculates relevant information for headercontent
 *
 * For every word of $content[$key] the index array entry gets its hash and
 * metaphone value (on first occurrence), the bit 2^$offset set in "cmp" and
 * its "count" increased. $this->wordcount is increased per word as well.
 *
 * @param array $retArr Index array, passed by reference
 * @param array $content Standard content array
 * @param string $key Key from standard content array
 * @param int $offset Bit-wise priority to type
 */
public function analyzeHeaderinfo(&$retArr, $content, $key, $offset)
{
    // A content array without the requested key simply contributes no words
    foreach ($content[$key] ?? [] as $val) {
        $val = substr($val, 0, 60);
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        if (!isset($retArr[$val])) {
            // Word ID (wid)
            $retArr[$val]['hash'] = IndexedSearchUtility::md5inthash($val);
            // Metaphone value is also 60 only chars long
            $metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
            $retArr[$val]['metaphone'] = $metaphone;
        }
        // Build metaphone fulltext string (can be used for fulltext indexing)
        if ($this->storeMetaphoneInfoAsWords) {
            $this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
        }
        // Priority used for flagBitMask feature (see extension configuration).
        // "cmp" does not exist yet on the first occurrence of a word, start from 0 then.
        $retArr[$val]['cmp'] = ($retArr[$val]['cmp'] ?? 0) | pow(2, $offset);
        // Increase number of occurrences, starting from 0 for a new word
        $retArr[$val]['count'] = ($retArr[$val]['count'] ?? 0) + 1;
        $this->wordcount++;
    }
}
1388
/**
 * Calculates relevant information for bodycontent
 *
 * For every word of $content['body'] the index array entry gets the position
 * of its first occurrence, its hash and metaphone value (when the word is new)
 * and its "count" increased. $this->wordcount is increased per word as well.
 *
 * @param array $retArr Index array, passed by reference
 * @param array $content Standard content array
 */
public function analyzeBody(&$retArr, $content)
{
    // A content array without body simply contributes no words
    foreach ($content['body'] ?? [] as $key => $val) {
        $val = substr($val, 0, 60);
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        if (!isset($retArr[$val])) {
            // First occurrence (used for ranking results)
            $retArr[$val]['first'] = $key;
            // Word ID (wid)
            $retArr[$val]['hash'] = IndexedSearchUtility::md5inthash($val);
            // Metaphone value is also only 60 chars long
            $metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
            $retArr[$val]['metaphone'] = $metaphone;
        }
        // Build metaphone fulltext string (can be used for fulltext indexing)
        if ($this->storeMetaphoneInfoAsWords) {
            $this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
        }
        // Increase number of occurrences; "count" does not exist yet for a new word, start from 0 then
        $retArr[$val]['count'] = ($retArr[$val]['count'] ?? 0) + 1;
        $this->wordcount++;
    }
}
1418
/**
 * Computes the metaphone representation of a word
 *
 * If $this->metaphoneObj is an object it is asked for the value (together with
 * the configured sys_language_uid), otherwise PHP's native metaphone() is used.
 *
 * @param string $word Word to convert
 * @param bool $returnRawMetaphoneValue If set, returns the raw metaphone value (not hashed)
 * @return mixed Metaphone hash integer (0 for an empty raw value) or the raw value, string
 */
public function metaphone($word, $returnRawMetaphoneValue = false)
{
    if (!is_object($this->metaphoneObj)) {
        // Use native PHP function instead of advanced doubleMetaphone class
        $rawValue = metaphone($word);
    } else {
        $rawValue = $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid']);
    }
    if ($returnRawMetaphoneValue) {
        return $rawValue;
    }
    // Create hash and return integer; an empty raw value maps to 0
    return $rawValue !== '' ? IndexedSearchUtility::md5inthash($rawValue) : 0;
}
1444
1445 /********************************
1446 *
1447 * SQL; TYPO3 Pages
1448 *
1449 *******************************/
/**
 * Updates db with information about the page (TYPO3 page, not external media)
 *
 * Existing index data for the current phash is removed first. Afterwards rows
 * are written to index_phash and index_fulltext and - in debug mode - to
 * index_debug, each only if IndexedSearchUtility::isTableUsed() reports the
 * table as used. The section and gr_list records are written through
 * submit_section() and submit_grlist().
 */
public function submitPage()
{
    $phash = $this->hash['phash'];
    // Remove any current data for this phash:
    $this->removeOldIndexedPages($phash);
    // setting new phash_row
    // Keys that are cast to int below (and the deprecated page_cache_reg1) are read
    // with a default, so a configuration without them does not raise a notice.
    $fields = [
        'phash' => $phash,
        'phash_grouping' => $this->hash['phash_grouping'],
        'cHashParams' => serialize($this->cHashParams),
        'contentHash' => $this->content_md5h,
        'data_page_id' => $this->conf['id'],
        // @deprecated since TYPO3 v9, will be removed in TYPO3 v10. Remove along with database field data_page_reg1
        'data_page_reg1' => $this->conf['page_cache_reg1'] ?? null,
        'data_page_type' => $this->conf['type'],
        'data_page_mp' => $this->conf['MP'],
        'gr_list' => $this->conf['gr_list'],
        'item_type' => 0,
        // TYPO3 page
        'item_title' => $this->contentParts['title'],
        'item_description' => $this->bodyDescription($this->contentParts),
        'item_mtime' => (int)($this->conf['mtime'] ?? 0),
        'item_size' => strlen($this->conf['content']),
        'tstamp' => $GLOBALS['EXEC_TIME'],
        'crdate' => $GLOBALS['EXEC_TIME'],
        'item_crdate' => $this->conf['crdate'],
        // Creation date of page
        'sys_language_uid' => $this->conf['sys_language_uid'],
        // Sys language uid of the page. Should reflect which language it DOES actually display!
        'externalUrl' => 0,
        'recordUid' => (int)($this->conf['recordUid'] ?? 0),
        'freeIndexUid' => (int)($this->conf['freeIndexUid'] ?? 0),
        'freeIndexSetId' => (int)($this->conf['freeIndexSetId'] ?? 0)
    ];
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash');
        $connection->insert(
            'index_phash',
            $fields,
            ['cHashParams' => Connection::PARAM_LOB]
        );
    }
    // PROCESSING index_section
    $this->submit_section($phash, $phash);
    // PROCESSING index_grlist
    $this->submit_grlist($phash, $phash);
    // PROCESSING index_fulltext
    $fields = [
        'phash' => $phash,
        'fulltextdata' => implode(' ', $this->contentParts),
        'metaphonedata' => $this->metaphoneContent
    ];
    $fullTextDataLength = (int)($this->indexerConfig['fullTextDataLength'] ?? 0);
    if ($fullTextDataLength > 0) {
        // Cut at a character boundary (as bodyDescription() does); a plain byte cut
        // could leave half of a multi-byte utf-8 character at the end.
        $fields['fulltextdata'] = mb_strcut($fields['fulltextdata'], 0, $fullTextDataLength, 'utf-8');
    }
    if (IndexedSearchUtility::isTableUsed('index_fulltext')) {
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_fulltext');
        $connection->insert('index_fulltext', $fields);
    }
    // PROCESSING index_debug
    if (!empty($this->indexerConfig['debugMode'])) {
        $fields = [
            'phash' => $phash,
            'debuginfo' => serialize([
                'cHashParams' => $this->cHashParams,
                'external_parsers initialized' => array_keys($this->external_parsers),
                'conf' => array_merge($this->conf, ['content' => substr($this->conf['content'], 0, 1000)]),
                'contentParts' => array_merge($this->contentParts, ['body' => substr($this->contentParts['body'], 0, 1000)]),
                'logs' => $this->internal_log,
                'lexer' => $this->lexerObj->debugString
            ])
        ];
        if (IndexedSearchUtility::isTableUsed('index_debug')) {
            $connection = GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('index_debug');
            $connection->insert('index_debug', $fields);
        }
    }
}
1533
/**
 * Writes a gr_list record for the given hashes, using the gr_list of the
 * current configuration. Nothing is written if the table index_grlist is
 * not in use.
 *
 * @param int $hash Search result record phash
 * @param int $phash_x Actual phash of current content
 * @see update_grlist()
 */
public function submit_grlist($hash, $phash_x)
{
    // Setting the gr_list record
    $record = [
        'phash' => $hash,
        'phash_x' => $phash_x,
        'hash_gr_list' => IndexedSearchUtility::md5inthash($this->conf['gr_list']),
        'gr_list' => $this->conf['gr_list']
    ];
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }
    GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist')
        ->insert('index_grlist', $record);
}
1556
/**
 * Writes a section record
 * $hash and $hash_t3 are the same for TYPO3 pages, but different when it is external files.
 * Nothing is written if the table index_section is not in use.
 *
 * @param int $hash phash of TYPO3 parent search result record
 * @param int $hash_t3 phash of the file indexation search record
 */
public function submit_section($hash, $hash_t3)
{
    $sectionRecord = [
        'phash' => $hash,
        'phash_t3' => $hash_t3,
        'page_id' => (int)$this->conf['id']
    ];
    // NOTE(review): getRootLineFields() presumably adds the rootline fields to the
    // record by reference - confirm against its definition
    $this->getRootLineFields($sectionRecord);
    if (!IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }
    GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_section')
        ->insert('index_section', $sectionRecord);
}
1578
/**
 * Deletes all index records that belong to the page with the given phash
 *
 * Tables that are not in use (see IndexedSearchUtility::isTableUsed()) are skipped.
 *
 * @param int $phash phash value to flush
 */
public function removeOldIndexedPages($phash)
{
    $phashValue = (int)$phash;
    $pool = GeneralUtility::makeInstance(ConnectionPool::class);

    // Removing old registrations for all tables. Because the pages are TYPO3 pages
    // there can be nothing else than 1-1 relations here.
    foreach (['index_phash', 'index_section', 'index_grlist', 'index_fulltext', 'index_debug'] as $tableName) {
        if (!IndexedSearchUtility::isTableUsed($tableName)) {
            continue;
        }
        $pool->getConnectionForTable($tableName)->delete($tableName, ['phash' => $phashValue]);
    }

    // Removing all index_section records with hash_t3 set to this hash (this includes such
    // records set for external media on the page as well!). The re-insert of these records
    // are done in indexRegularDocument($file).
    if (IndexedSearchUtility::isTableUsed('index_section')) {
        $pool->getConnectionForTable('index_section')
            ->delete('index_section', ['phash_t3' => $phashValue]);
    }
}
1604
1605 /********************************
1606 *
1607 * SQL; External media
1608 *
1609 *******************************/
/**
 * Updates db with information about the file
 *
 * Existing file index data for the phash is removed first. Afterwards rows
 * are written to index_phash and index_fulltext and - in debug mode - to
 * index_debug, each only if IndexedSearchUtility::isTableUsed() reports the
 * table as used.
 *
 * @param array $hash Array with phash and phash_grouping keys for file
 * @param string $file File name
 * @param array $subinfo Array of "cHashParams" for files: This is for instance the page index for a PDF file (other document types it will be a zero)
 * @param string $ext File extension determining the type of media.
 * @param int $mtime Modification time of file.
 * @param int $ctime Creation time of file.
 * @param int $size Size of file in bytes
 * @param int $content_md5h Content HASH value.
 * @param array $contentParts Standard content array (using only title and body for a file)
 */
public function submitFilePage($hash, $file, $subinfo, $ext, $mtime, $ctime, $size, $content_md5h, $contentParts)
{
    // Find item Type: the parser's map wins, the extension itself is the fallback
    $storeItemType = ($this->external_parsers[$ext]->ext2itemtype_map[$ext] ?? '') ?: $ext;
    // Remove any current data for this phash:
    $this->removeOldIndexedFiles($hash['phash']);
    // Split filename:
    $fileParts = parse_url($file);
    // Setting new
    $fields = [
        'phash' => $hash['phash'],
        'phash_grouping' => $hash['phash_grouping'],
        'cHashParams' => serialize($subinfo),
        'contentHash' => $content_md5h,
        'data_filename' => $file,
        'item_type' => $storeItemType,
        'item_title' => trim($contentParts['title'] ?? '') ?: PathUtility::basename($file),
        'item_description' => $this->bodyDescription($contentParts),
        'item_mtime' => $mtime,
        'item_size' => $size,
        'item_crdate' => $ctime,
        'tstamp' => $GLOBALS['EXEC_TIME'],
        'crdate' => $GLOBALS['EXEC_TIME'],
        'gr_list' => $this->conf['gr_list'],
        // Local file names have no scheme component, only URLs do
        'externalUrl' => !empty($fileParts['scheme']) ? 1 : 0,
        'recordUid' => (int)($this->conf['recordUid'] ?? 0),
        'freeIndexUid' => (int)($this->conf['freeIndexUid'] ?? 0),
        'freeIndexSetId' => (int)($this->conf['freeIndexSetId'] ?? 0),
        'sys_language_uid' => (int)($this->conf['sys_language_uid'] ?? 0)
    ];
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash');
        $connection->insert(
            'index_phash',
            $fields,
            ['cHashParams' => Connection::PARAM_LOB]
        );
    }
    // PROCESSING index_fulltext
    $fields = [
        'phash' => $hash['phash'],
        'fulltextdata' => implode(' ', $contentParts),
        'metaphonedata' => $this->metaphoneContent
    ];
    $fullTextDataLength = (int)($this->indexerConfig['fullTextDataLength'] ?? 0);
    if ($fullTextDataLength > 0) {
        // Cut at a character boundary (as bodyDescription() does); a plain byte cut
        // could leave half of a multi-byte utf-8 character at the end.
        $fields['fulltextdata'] = mb_strcut($fields['fulltextdata'], 0, $fullTextDataLength, 'utf-8');
    }
    if (IndexedSearchUtility::isTableUsed('index_fulltext')) {
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_fulltext');
        $connection->insert('index_fulltext', $fields);
    }
    // PROCESSING index_debug
    if (!empty($this->indexerConfig['debugMode'])) {
        $fields = [
            'phash' => $hash['phash'],
            'debuginfo' => serialize([
                'cHashParams' => $subinfo,
                'contentParts' => array_merge($contentParts, ['body' => substr($contentParts['body'] ?? '', 0, 1000)]),
                'logs' => $this->internal_log,
                'lexer' => $this->lexerObj->debugString
            ])
        ];
        if (IndexedSearchUtility::isTableUsed('index_debug')) {
            $connection = GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('index_debug');
            $connection->insert('index_debug', $fields);
        }
    }
}
1695
/**
 * Makes sure a gr_list record exists for an indexed file.
 *
 * Counts the index_grlist rows of the given phash whose hash_gr_list equals the
 * hash of either the default group list or the currently configured group list.
 * Only when no such row exists a new one is written through submit_grlist().
 * Nothing happens if the index_grlist table is not in use.
 *
 * @param int $hash phash value of file
 */
public function submitFile_grlist($hash)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }

    $query = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getQueryBuilderForTable('index_grlist');
    $query->count('*')->from('index_grlist');

    // Restrict to the phash of the file ...
    $matchesPhash = $query->expr()->eq(
        'phash',
        $query->createNamedParameter($hash, \PDO::PARAM_INT)
    );
    // ... and to a group list hash of either the default or the current group list.
    $matchesDefaultGroups = $query->expr()->eq(
        'hash_gr_list',
        $query->createNamedParameter(
            IndexedSearchUtility::md5inthash($this->defaultGrList),
            \PDO::PARAM_INT
        )
    );
    $matchesCurrentGroups = $query->expr()->eq(
        'hash_gr_list',
        $query->createNamedParameter(
            IndexedSearchUtility::md5inthash($this->conf['gr_list']),
            \PDO::PARAM_INT
        )
    );

    $query->where(
        $matchesPhash,
        $query->expr()->orX($matchesDefaultGroups, $matchesCurrentGroups)
    );
    $existingRecords = (int)$query->execute()->fetchColumn();

    if ($existingRecords !== 0) {
        // A matching record is already there, no need to place another one.
        return;
    }
    $this->submit_grlist($hash, $hash);
}
1741
/**
 * Makes sure a section record exists for an indexed file on the current page.
 *
 * Counts the index_section rows matching the given phash and the page id from
 * $this->conf['id']. Only when there is none, submit_section() is called with
 * the file phash and the phash of the current page ($this->hash['phash']).
 * Nothing happens if the index_section table is not in use.
 *
 * @param int $hash phash value of file
 */
public function submitFile_section($hash)
{
    if (!IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }

    $query = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getQueryBuilderForTable('index_section');
    $query->count('phash')->from('index_section');

    $matchesPhash = $query->expr()->eq(
        'phash',
        $query->createNamedParameter($hash, \PDO::PARAM_INT)
    );
    $matchesPage = $query->expr()->eq(
        'page_id',
        $query->createNamedParameter($this->conf['id'], \PDO::PARAM_INT)
    );

    $query->where($matchesPhash, $matchesPage);
    $existingSections = (int)$query->execute()->fetchColumn();

    if ($existingSections !== 0) {
        // There is already a section for this file on this page.
        return;
    }
    $this->submit_section($hash, $this->hash['phash']);
}
1775
/**
 * Deletes all rows carrying the given phash from the index tables
 * index_phash, index_grlist, index_fulltext and index_debug.
 * Tables that are not in use are left untouched.
 *
 * @param int $phash phash value to flush
 */
public function removeOldIndexedFiles($phash)
{
    $pool = GeneralUtility::makeInstance(ConnectionPool::class);
    foreach (['index_phash', 'index_grlist', 'index_fulltext', 'index_debug'] as $tableName) {
        if (IndexedSearchUtility::isTableUsed($tableName)) {
            $pool->getConnectionForTable($tableName)->delete($tableName, ['phash' => (int)$phash]);
        }
    }
}
1793
1794 /********************************
1795 *
1796 * SQL Helper functions
1797 *
1798 *******************************/
/**
 * Decides, based on mtime / tstamp of the index_phash record of $phash, whether
 * the currently processed page/file has to be (re-)indexed.
 *
 * Result codes (see $this->reasons):
 *  -2  The minimum age was not exceeded, no indexing.
 *  -1  mtime matched the indexed record, no indexing. If no max age is configured
 *      the tstamp of the record is refreshed through updateTstamp().
 *   1  The configured max age was exceeded, index again.
 *   2  mtime differs from the indexed mtime, index again.
 *   3  No mtime was given, index.
 *   4  No indexed record found (or index_phash is not in use), index.
 *
 * @param int $mtime mtime value to test against limits and indexed page (usually this is the mtime of the cached document)
 * @param int $phash "phash" used to select any already indexed page to see what its mtime is.
 * @return int Result integer: <0 = No indexing, >0 = Do indexing
 */
public function checkMtimeTstamp($mtime, $phash)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        // Not indexed (not in index_phash)
        return 4;
    }

    $indexedRow = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->select(['item_mtime', 'tstamp'], 'index_phash', ['phash' => (int)$phash], [], [], 1)
        ->fetch();

    if (empty($indexedRow)) {
        // Page has never been indexed (is not represented in the index_phash table).
        return 4;
    }

    if ($this->tstamp_maxAge && $indexedRow['tstamp'] + $this->tstamp_maxAge < $GLOBALS['EXEC_TIME']) {
        // The configured max-age was exceeded for the document and thus it's indexed.
        return 1;
    }

    if ($this->tstamp_minAge && !($indexedRow['tstamp'] + $this->tstamp_minAge < $GLOBALS['EXEC_TIME'])) {
        // A minimum age is set and was not exceeded
        return -2;
    }

    if (!$mtime) {
        // The minimum age was exceeded, but mtime was not set, so the page is indexed.
        return 3;
    }

    if ($indexedRow['item_mtime'] != $mtime) {
        // The minimum age was exceeded, mtime was set and differs from the indexed mtime: re-index.
        return 2;
    }

    // mtime matched the document, so no changes detected and no content updated
    if ($this->tstamp_maxAge) {
        $this->log_setTSlogMessage('mtime matched, timestamp NOT updated because a maxAge is set (' . ($indexedRow['tstamp'] + $this->tstamp_maxAge - $GLOBALS['EXEC_TIME']) . ' seconds to expire time).', 1);
    } else {
        $this->updateTstamp($phash);
        $this->log_setTSlogMessage('mtime matched, timestamp updated.', 1);
    }
    return -1;
}
1864
/**
 * Looks for an index_phash record that shares the current "phash_grouping"
 * ($this->hash['phash_grouping']) and content hash ($this->content_md5h).
 *
 * With this lookup the page is only indexed if its content differs from an
 * already indexed page of the same "phash_grouping".
 *
 * @return mixed TRUE if the page needs to be indexed (no record found or index_phash is not in use), otherwise the found row (an array holding the phash value) to which the grlist record should be related
 */
public function checkContentHash()
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return true;
    }

    $criteria = [
        'phash_grouping' => (int)$this->hash['phash_grouping'],
        'contentHash' => (int)$this->content_md5h
    ];
    $matchingRow = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->select(['phash'], 'index_phash', $criteria, [], [], 1)
        ->fetch();

    return empty($matchingRow) ? true : $matchingRow;
}
1895
/**
 * Checks the content hash of an external document against index_phash.
 *
 * Counts the index_phash rows with the given grouping hash and content hash.
 * The document needs indexing when there is no such row, or when the
 * index_phash table is not in use at all.
 *
 * @param int $hashGr phash value to check (phash_grouping)
 * @param int $content_md5h Content hash to check
 * @return bool TRUE if the document needs to be indexed (that is, there was no result)
 */
public function checkExternalDocContentHash($hashGr, $content_md5h)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return true;
    }

    $criteria = [
        'phash_grouping' => (int)$hashGr,
        'contentHash' => (int)$content_md5h
    ];
    $matches = (int)GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->count('*', 'index_phash', $criteria);

    return $matches === 0;
}
1923
/**
 * Tells whether at least one index_grlist record exists whose phash_x equals
 * the given value (the "real" phash of the current content, not the linked-to
 * phash of the common search result page).
 *
 * @param int $phash_x Phash integer to test.
 * @return bool TRUE if a record was found, FALSE if not or if index_grlist is not in use
 */
public function is_grlist_set($phash_x)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return false;
    }

    $records = (int)GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist')
        ->count('phash_x', 'index_grlist', ['phash_x' => (int)$phash_x]);

    return $records > 0;
}
1946
/**
 * Writes a grlist entry for the given phash and the current group list
 * ($this->conf['gr_list']) unless such an entry already exists.
 * Nothing happens if the index_grlist table is not in use.
 *
 * @param int $phash phash of the search result that should be found
 * @param int $phash_x The real phash of the current content. The two values are different when a page with userlogin turns out to contain the exact same content as another already indexed version of the page; This is the whole reason for the grlist table in fact...
 * @see submit_grlist()
 */
public function update_grlist($phash, $phash_x)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }

    $criteria = [
        'phash' => (int)$phash,
        'hash_gr_list' => IndexedSearchUtility::md5inthash($this->conf['gr_list'])
    ];
    $existingEntries = (int)GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist')
        ->count('phash', 'index_grlist', $criteria);

    if ($existingEntries !== 0) {
        return;
    }

    $this->submit_grlist($phash, $phash_x);
    $this->log_setTSlogMessage('Inserted gr_list \'' . $this->conf['gr_list'] . '\' for phash \'' . $phash . '\'', 1);
}
1974
/**
 * Sets the tstamp of the index_phash row of $phash to the current execution
 * time ($GLOBALS['EXEC_TIME']) and optionally stores a new item_mtime.
 * Nothing happens if the index_phash table is not in use.
 *
 * @param int $phash phash value
 * @param int $mtime If set (non-empty), the item_mtime field is updated to this value as well.
 */
public function updateTstamp($phash, $mtime = 0)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return;
    }

    $values = ['tstamp' => $GLOBALS['EXEC_TIME']];
    if ($mtime) {
        $values['item_mtime'] = (int)$mtime;
    }
    $identifier = ['phash' => (int)$phash];

    GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->update('index_phash', $values, $identifier);
}
2005
/**
 * Writes the configured free index set id ($this->conf['freeIndexSetId'])
 * into the freeIndexSetId field of the index_phash row of $phash.
 * Nothing happens if the index_phash table is not in use.
 *
 * @param int $phash phash value
 */
public function updateSetId($phash)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return;
    }

    $values = ['freeIndexSetId' => (int)$this->conf['freeIndexSetId']];
    $identifier = ['phash' => (int)$phash];

    GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->update('index_phash', $values, $identifier);
}
2029
/**
 * Stores the given parse time in the parsetime field of the index_phash row
 * of $phash. Nothing happens if the index_phash table is not in use.
 *
 * @param int $phash phash value.
 * @param int $parsetime Parsetime value to set.
 */
public function updateParsetime($phash, $parsetime)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return;
    }

    $values = ['parsetime' => (int)$parsetime];
    $identifier = ['phash' => (int)$phash];

    GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->update('index_phash', $values, $identifier);
}
2054
/**
 * Updates the root-line fields of all index_section rows of the current page
 * ($this->conf['id']) with the values delivered by getRootLineFields().
 * Nothing happens if the index_section table is not in use.
 */
public function updateRootline()
{
    if (!IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }

    // getRootLineFields() fills the array it receives by reference.
    $rootLineValues = [];
    $this->getRootLineFields($rootLineValues);
    $identifier = ['page_id' => (int)$this->conf['id']];

    GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_section')
        ->update('index_section', $rootLineValues, $identifier);
}
2077
/**
 * Adding values for root-line fields.
 * rl0, rl1 and rl2 are standard. Additional fields can be registered in
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields']
 * as "field name => root-line level" pairs.
 *
 * The values are read from $this->conf['rootline_uids']. A level that is not
 * present there (e.g. a page close to the root has fewer than three levels)
 * results in 0 without raising an "undefined index/offset" notice.
 *
 * @param array $fieldArray Field array, passed by reference
 */
public function getRootLineFields(array &$fieldArray)
{
    $rootLineUids = $this->conf['rootline_uids'] ?? [];

    $fieldArray['rl0'] = (int)($rootLineUids[0] ?? 0);
    $fieldArray['rl1'] = (int)($rootLineUids[1] ?? 0);
    $fieldArray['rl2'] = (int)($rootLineUids[2] ?? 0);

    foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'] ?? [] as $fieldName => $rootLineLevel) {
        $fieldArray[$fieldName] = (int)($rootLineUids[$rootLineLevel] ?? 0);
    }
}
2093
2094 /********************************
2095 *
2096 * SQL; Submitting words
2097 *
2098 *******************************/
/**
 * Adds the words of the given list that are not yet stored to index_words.
 *
 * First the number of index_words rows with a wid among the word hashes is
 * compared to the size of the list. Only if the numbers differ, the basewords
 * already stored are fetched and removed from the list; every remaining word
 * is inserted. Nothing happens if index_words is not in use or the list is empty.
 *
 * @param array $wordListArray Word List array, keyed by word (where each word has information about position etc).
 */
public function checkWordList($wordListArray)
{
    if (!IndexedSearchUtility::isTableUsed('index_words') || empty($wordListArray)) {
        return;
    }

    $expectedCount = count($wordListArray);
    $wordIds = array_map('intval', array_column($wordListArray, 'hash'));

    $countQuery = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('index_words');
    $countQuery->count('baseword')->from('index_words');
    $countQuery->where(
        $countQuery->expr()->in(
            'wid',
            $countQuery->createNamedParameter($wordIds, Connection::PARAM_INT_ARRAY)
        )
    );
    $storedCount = (int)$countQuery->execute()->fetchColumn();

    if ($storedCount === $expectedCount) {
        // Every word of the list seems to be stored already.
        return;
    }

    $wordsConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('index_words');
    $selectQuery = $wordsConnection->createQueryBuilder();
    $selectQuery->select('baseword')->from('index_words');
    $selectQuery->where(
        $selectQuery->expr()->in(
            'wid',
            $selectQuery->createNamedParameter($wordIds, Connection::PARAM_INT_ARRAY)
        )
    );
    $storedWords = $selectQuery->execute();

    $this->log_setTSlogMessage('Inserting words: ' . ($expectedCount - $storedCount), 1);

    // Drop the words which are in the table already ...
    while ($storedWord = $storedWords->fetch()) {
        unset($wordListArray[$storedWord['baseword']]);
    }

    // ... and insert what is left.
    // A duplicate-key error will occur here if a word was NOT removed by the unset() above. However as
    // long as the words of the list are NOT longer than 60 chars (the baseword varchar is 60 characters...)
    // this is not a problem.
    foreach ($wordListArray as $baseword => $wordData) {
        $wordsConnection->insert(
            'index_words',
            [
                'wid' => $wordData['hash'],
                'baseword' => $baseword,
                'metaphone' => $wordData['metaphone']
            ]
        );
    }
}
2159
/**
 * Submits RELATIONS between words and phash.
 *
 * All index_rel rows of the phash are deleted and one new row is written for
 * every word of the list whose hash is not the wid of an index_words row
 * flagged as stop word (is_stopword != 0). Nothing happens if the index_rel
 * table is not in use.
 *
 * @param array $wordList Word list array
 * @param int $phash phash value
 */
public function submitWords($wordList, $phash)
{
    if (!IndexedSearchUtility::isTableUsed('index_rel')) {
        return;
    }

    $pool = GeneralUtility::makeInstance(ConnectionPool::class);

    // Collect the word ids of all stop words.
    $stopWordQuery = $pool->getQueryBuilderForTable('index_words');
    $stopWordQuery->select('wid')->from('index_words');
    $stopWordQuery->where(
        $stopWordQuery->expr()->neq('is_stopword', $stopWordQuery->createNamedParameter(0, \PDO::PARAM_INT))
    );
    $stopWordResult = $stopWordQuery->groupBy('wid')->execute();

    $isStopWord = [];
    while ($stopWordRow = $stopWordResult->fetch()) {
        $isStopWord[$stopWordRow['wid']] = true;
    }

    // Remove the relations stored for this phash so far.
    $pool->getConnectionForTable('index_rel')->delete('index_rel', ['phash' => (int)$phash]);

    $relations = [];
    foreach ($wordList as $word) {
        if (!isset($isStopWord[$word['hash']])) {
            $relations[] = [
                (int)$phash,
                (int)$word['hash'],
                (int)$word['count'],
                (int)$word['first'],
                $this->freqMap($word['count'] / $this->wordcount),
                $word['cmp'] & $this->flagBitMask
            ];
        }
    }

    if (empty($relations)) {
        return;
    }
    $pool->getConnectionForTable('index_rel')->bulkInsert(
        'index_rel',
        $relations,
        ['phash', 'wid', 'count', 'first', 'freq', 'flags']
    );
}
2208
/**
 * Maps a frequency to the range used for storage and back.
 *
 * The map factor is $this->freqMax * 100 * $this->freqRange.
 * A value <= 1 is multiplied by the map factor and capped at $this->freqRange,
 * a value > 1 is divided by the map factor.
 *
 * @param float $freq Frequency
 * @return int|float Mapped frequency; the value is not cast, so it can be a float
 */
public function freqMap($freq)
{
    $mapFactor = $this->freqMax * 100 * $this->freqRange;

    if ($freq <= 1) {
        $scaled = $freq * $mapFactor;
        if ($scaled > $this->freqRange) {
            return $this->freqRange;
        }
        return $scaled;
    }

    return $freq / $mapFactor;
}
2227
2228 /********************************
2229 *
2230 * Hashing
2231 *
2232 *******************************/
/**
 * Calculates the search hashes of a TYPO3 page and stores them in $this->hash.
 *
 * "phash_grouping" identifies a "page" combined of id, type, language,
 * mountpoint and cHash parameters. "phash" additionally takes the gr_list into
 * account (subdivision where special page composition based on login is
 * considered as well; it is expected that such pages are normally similar
 * regardless of the login).
 */
public function setT3Hashes()
{
    $pageIdentity = [
        'id' => (int)$this->conf['id'],
        'type' => (int)$this->conf['type'],
        'sys_lang' => (int)$this->conf['sys_language_uid'],
        'MP' => (string)$this->conf['MP'],
        'cHash' => $this->cHashParams
    ];
    $this->hash['phash_grouping'] = IndexedSearchUtility::md5inthash(serialize($pageIdentity));

    // gr_list has to be the last key so the serialized string stays the same as before.
    $pageIdentity['gr_list'] = (string)$this->conf['gr_list'];
    $this->hash['phash'] = IndexedSearchUtility::md5inthash(serialize($pageIdentity));
}
2252
/**
 * Calculates the search hashes of an external file.
 *
 * "phash_grouping" is derived from the file name only, "phash" from the file
 * name together with the sub information.
 *
 * @param string $file File name / path which identifies it on the server
 * @param array $subinfo Additional content identifying the (subpart of) content. For instance; PDF files are divided into groups of pages for indexing.
 * @return array Array with "phash_grouping" and "phash" inside.
 */
public function setExtHashes($file, $subinfo = [])
{
    $fileIdentity = ['file' => $file];
    $groupingHash = IndexedSearchUtility::md5inthash(serialize($fileIdentity));

    $fileIdentity['subinfo'] = $subinfo;
    $fullHash = IndexedSearchUtility::md5inthash(serialize($fileIdentity));

    return [
        'phash_grouping' => $groupingHash,
        'phash' => $fullHash
    ];
}
2274
2275 /*********************************
2276 *
2277 * Internal logging functions
2278 *
2279 *********************************/
/**
 * Wrapper for TT logging: forwards both arguments unchanged to the push()
 * method of $this->timeTracker.
 *
 * @param string $msg Title to set
 * @param string $key Key, handed over as second argument of push()
 */
public function log_push($msg, $key)
{
    $tracker = $this->timeTracker;
    $tracker->push($msg, $key);
}
2290
/**
 * Wrapper for TT logging: calls the pull() method of $this->timeTracker.
 */
public function log_pull()
{
    $tracker = $this->timeTracker;
    $tracker->pull();
}
2298
/**
 * Wrapper for TT logging: hands the message over to setTSlogMessage() of
 * $this->timeTracker and, after that, appends it to $this->internal_log.
 *
 * @param string $msg Message to set
 * @param int $errorNum Error number
 */
public function log_setTSlogMessage($msg, $errorNum = 0)
{
    $tracker = $this->timeTracker;
    $tracker->setTSlogMessage($msg, $errorNum);

    // Keep a local copy of every message as well.
    $this->internal_log[] = $msg;
}
2310
/**
 * Makes sure that keywords are space-separated. This is important for their
 * proper display as a part of the fulltext index.
 *
 * The comma separated list is split with GeneralUtility::trimExplode(), joined
 * again with ", " and wrapped in a single leading and trailing space.
 *
 * @param string $keywordList
 * @return string
 * @see http://forge.typo3.org/issues/14959
 */
protected function addSpacesToKeywordList($keywordList)
{
    return ' ' . implode(', ', GeneralUtility::trimExplode(',', $keywordList)) . ' ';
}
2324 }