098aea3722bafa5b20f1a19dd458f68d4a2e2dc0
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / Classes / Indexer.php
1 <?php
2 namespace TYPO3\CMS\IndexedSearch;
3
4 /*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use TYPO3\CMS\Core\Compatibility\PublicPropertyDeprecationTrait;
18 use TYPO3\CMS\Core\Configuration\ExtensionConfiguration;
19 use TYPO3\CMS\Core\Database\Connection;
20 use TYPO3\CMS\Core\Database\ConnectionPool;
21 use TYPO3\CMS\Core\TimeTracker\TimeTracker;
22 use TYPO3\CMS\Core\Utility\GeneralUtility;
23 use TYPO3\CMS\Core\Utility\MathUtility;
24 use TYPO3\CMS\Core\Utility\PathUtility;
25 use TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController;
26 use TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility;
27
28 /**
29 * Indexing class for TYPO3 frontend
30 */
31 class Indexer
32 {
33 use PublicPropertyDeprecationTrait;
34
35 /**
36 * List of all deprecated public properties
37 * @var array
38 */
39 protected $deprecatedPublicProperties = [
40 'csObj' => 'Using $csObj within Indexing is discouraged, the property will be removed in TYPO3 v10.0 - if needed instantiate CharsetConverter yourself.',
41 ];
42
43 /**
44 * @var array
45 */
46 public $reasons = [
47 -1 => 'mtime matched the document, so no changes detected and no content updated',
48 -2 => 'The minimum age was not exceeded',
49 1 => 'The configured max-age was exceeded for the document and thus it\'s indexed.',
50 2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
51 3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
52 4 => 'Page has never been indexed (is not represented in the index_phash table).'
53 ];
54
55 /**
56 * HTML code blocks to exclude from indexing
57 *
58 * @var string
59 */
60 public $excludeSections = 'script,style';
61
62 /**
63 * Supported Extensions for external files
64 *
65 * @var array
66 */
67 public $external_parsers = [];
68
69 /**
70 * External parser objects, keys are file extension names. Values are objects with certain methods.
71 * Fe-group list (pages might be indexed separately for each usergroup combination to support search
72 * in access limited pages!)
73 *
74 * @var string
75 */
76 public $defaultGrList = '0,-1';
77
78 /**
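79 * Max time: if set, this tells a number of seconds that is the maximum age of an indexed document. Regardless of mtime the document will be re-indexed if this limit is exceeded.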
80 *
81 * @var int
82 */
83 public $tstamp_maxAge = 0;
84
85 /**
86 * Min time: if set, this tells a minimum limit (in seconds) before a document can be indexed again.
87 * This is regardless of mtime.
88 *
89 * @var int
90 */
91 public $tstamp_minAge = 0;
92
93 /**
94 * Max number of external files to index.
95 *
96 * @var int
97 */
98 public $maxExternalFiles = 0;
99
100 /**
101 * If TRUE, indexing is forced despite of hashes etc.
102 *
103 * @var bool
104 */
105 public $forceIndexing = false;
106
107 /**
108 * Set when crawler is detected (internal)
109 *
110 * @var bool
111 */
112 public $crawlerActive = false;
113
114 /**
115 * Default (empty) content parts: title, description, keywords and body
116 *
117 * @var array
118 */
119 public $defaultContentArray = [
120 'title' => '',
121 'description' => '',
122 'keywords' => '',
123 'body' => ''
124 ];
125
126 /**
127 * @var int
128 */
129 public $wordcount = 0;
130
131 /**
132 * @var int
133 */
134 public $externalFileCounter = 0;
135
136 /**
137 * @var array Configuration set internally (see init functions for required keys and their meaning)
138 */
139 public $conf = [];
140
141 /**
142 * Indexer configuration, coming from TYPO3's system configuration for EXT:indexed_search
143 *
144 * @var array
145 */
146 public $indexerConfig = [];
147
148 /**
149 * Hash array, contains phash and phash_grouping
150 *
151 * @var array
152 */
153 public $hash = [];
154
155 /**
156 * Hash array for files
157 *
158 * @var array
159 */
160 public $file_phash_arr = [];
161
162 /**
163 * Content of TYPO3 page, split into parts (see splitHTMLContent())
164 *
165 * @var array
166 */
167 public $contentParts = [];
168
169 /**
170 * Hash of the content parts, as calculated by IndexedSearchUtility::md5inthash()
171 *
172 * @var string
173 */
174 public $content_md5h = '';
175
176 /**
177 * @var array
178 */
179 public $internal_log = [];
180
181 /**
182 * Internal log
183 *
184 * @var string
185 */
186 public $indexExternalUrl_content = '';
187
188 /**
189 * @var array
190 */
191 public $cHashParams = [];
192
193 /**
194 * cHashparams array
195 *
196 * @var int
197 */
198 public $freqRange = 32000;
199
200 /**
201 * @var float
202 */
203 public $freqMax = 0.1;
204
205 /**
206 * @var bool
207 */
208 public $enableMetaphoneSearch = false;
209
210 /**
211 * @var bool
212 */
213 public $storeMetaphoneInfoAsWords;
214
215 /**
216 * @var string
217 */
218 public $metaphoneContent = '';
219
220 /**
221 * Charset class object
222 *
223 * @var \TYPO3\CMS\Core\Charset\CharsetConverter
224 * @deprecated since TYPO3 v9.3, will be removed in TYPO3 v10 (also the instantiation in the init() method).
225 */
226 protected $csObj;
227
228 /**
229 * Metaphone object, if any
230 *
231 * @var \TYPO3\CMS\IndexedSearch\Utility\DoubleMetaPhoneUtility
232 */
233 public $metaphoneObj;
234
235 /**
236 * Lexer object for word splitting
237 *
238 * @var \TYPO3\CMS\IndexedSearch\Lexer
239 */
240 public $lexerObj;
241
242 /**
243 * @var int
244 */
245 public $flagBitMask;
246
247 /**
248 * @var TimeTracker
249 */
250 protected $timeTracker;
251
/**
 * Creates the indexer and stores the TimeTracker instance obtained via
 * GeneralUtility::makeInstance() in $this->timeTracker.
 */
public function __construct()
{
    $timeTracker = GeneralUtility::makeInstance(TimeTracker::class);
    $this->timeTracker = $timeTracker;
}
259
/**
 * Frontend hook (TSFE): decides whether the rendered page is to be indexed and, if so,
 * configures the indexer from the frontend controller and starts indexing.
 *
 * Indexing is only considered when config.index_enable is set. The page is then skipped
 * (with a log message) when frontend indexing is disabled and no crawler re-indexing is
 * active, when the page has the "No Search" flag, when it is rendered with no_cache, or
 * when sys_language_uid differs from sys_language_content.
 *
 * @param TypoScriptFrontendController $pObj Parent Object, passed by reference
 */
public function hook_indexContent(&$pObj)
{
    // Indexer configuration from Extension Manager interface:
    $disableFrontendIndexing = (bool)GeneralUtility::makeInstance(ExtensionConfiguration::class)->get('indexed_search', 'disableFrontendIndexing');

    // Crawler activation:
    // Requirements are that the crawler is loaded, a crawler session is running and
    // re-indexing is requested as processing instruction. The crawler data is read
    // defensively because it is simply absent on ordinary frontend requests.
    $crawlerData = $pObj->applicationData['tx_crawler'] ?? [];
    $procInstructions = $crawlerData['parameters']['procInstructions'] ?? [];
    if (
        \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler')
        && !empty($crawlerData['running'])
        && is_array($procInstructions)
        && in_array('tx_indexedsearch_reindex', $procInstructions, true)
    ) {
        // Setting simple log message:
        $pObj->applicationData['tx_crawler']['log'][] = 'Forced Re-indexing enabled';
        // Crawler active flag, and force indexing regardless of hashes:
        $this->crawlerActive = true;
        $this->forceIndexing = true;
    }

    // Nothing to do unless indexing is enabled for this page rendering.
    if (empty($pObj->config['config']['index_enable'])) {
        return;
    }

    $this->log_push('Index page', '');
    if ($disableFrontendIndexing && !$this->crawlerActive) {
        $this->log_setTSlogMessage('Index page? No, Ordinary Frontend indexing during rendering is disabled.');
    } elseif (!empty($pObj->page['no_search'])) {
        $this->log_setTSlogMessage('Index page? No, The "No Search" flag has been set in the page properties!');
    } elseif ($pObj->no_cache) {
        $this->log_setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
    } elseif ((int)$pObj->sys_language_uid !== (int)$pObj->sys_language_content) {
        $this->log_setTSlogMessage('Index page? No, ->sys_language_uid was different from sys_language_content which indicates that the page contains fall-back content and that would be falsely indexed as localized content.');
    } else {
        // Root line uids, keyed like the root line itself
        $rootlineUids = [];
        foreach ($pObj->config['rootLine'] ?? [] as $rlkey => $rldat) {
            $rootlineUids[$rlkey] = $rldat['uid'];
        }
        // Setting up internal configuration from the frontend controller:
        $this->conf = [
            // Page id
            'id' => $pObj->id,
            // Page type
            'type' => $pObj->type,
            // sys_language UID of the language of the indexing.
            'sys_language_uid' => $pObj->sys_language_uid,
            // MP variable, if any (Mount Points)
            'MP' => $pObj->MP,
            // Group list
            'gr_list' => $pObj->gr_list,
            // cHash string for additional parameters
            'cHash' => $pObj->cHash,
            // Array of the additional parameters
            'cHash_array' => $pObj->cHash_array,
            // The creation date of the TYPO3 page
            'crdate' => $pObj->page['crdate'],
            // reg1 of the caching table. Not known what practical use this has.
            // @deprecated since TYPO3 v9, will be removed in TYPO3 v10. Remove along with database field data_page_reg1
            'page_cache_reg1' => $pObj->page_cache_reg1,
            'rootline_uids' => $rootlineUids,
            // Content string (HTML of TYPO3 page)
            'content' => $pObj->content,
            // Alternative title for indexing
            'indexedDocTitle' => $pObj->convOutputCharset($pObj->indexedDocTitle),
            // Character set of content (will be converted to utf-8 during indexing)
            'metaCharset' => $pObj->metaCharset,
            // Most recent modification time (seconds) of the content on the page. Used to evaluate whether it should be re-indexed.
            'mtime' => $pObj->register['SYS_LASTCHANGED'] ?? $pObj->page['SYS_LASTCHANGED'],
            // Whether to index external documents like PDF, DOC etc. (if possible)
            'index_externals' => $pObj->config['config']['index_externals'] ?? null,
            // Length of description text (max 250, default 200)
            'index_descrLgd' => $pObj->config['config']['index_descrLgd'] ?? null,
            // Whether to index document keywords and description (if present)
            'index_metatags' => $pObj->config['config']['index_metatags'] ?? true,
            // Set to zero:
            'recordUid' => 0,
            'freeIndexUid' => 0,
            'freeIndexSetId' => 0,
        ];
        // Init and start indexing:
        $this->init();
        $this->indexTypo3PageContent();
    }
    $this->log_pull();
}
352
353 /****************************
354 *
355 * Backend API
356 *
357 ****************************/
/**
 * Initializing the "combined ID" of the page (phash) being indexed (or for which external media is attached)
 *
 * Resets $this->conf to the values given here plus fixed defaults (group list "0,-1",
 * external documents and metatags enabled, description length 200) and then calls init().
 *
 * @param int $id The page uid, &id=
 * @param int $type The page type, &type=
 * @param int $sys_language_uid sys_language uid, typically &L=
 * @param string $MP The MP variable (Mount Points), &MP=
 * @param array $uidRL Rootline array of only UIDs.
 * @param array $cHash_array Array of GET variables to register with this indexing
 * @param bool $createCHash If set, calculates a cHash value from the $cHash_array. Probably you will not do that since such cases are indexed through the frontend and the idea of this interface is to index non-cacheable pages from the backend!
 */
public function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array = [], $createCHash = false)
{
    // cHash string for the additional parameters; only calculated on request
    $cHash = '';
    if ($createCHash) {
        /* @var $cacheHash \TYPO3\CMS\Frontend\Page\CacheHashCalculator */
        $cacheHash = GeneralUtility::makeInstance(\TYPO3\CMS\Frontend\Page\CacheHashCalculator::class);
        $cHash = $cacheHash->generateForParameters(GeneralUtility::implodeArrayForUrl('', $cHash_array));
    }

    // Information about the page for which the indexing takes place
    $this->conf = [
        // Page id (int)
        'id' => $id,
        // Page type (int)
        'type' => $type,
        // sys_language UID of the language of the indexing (int)
        'sys_language_uid' => $sys_language_uid,
        // MP variable, if any (Mount Points) (string)
        'MP' => $MP,
        // Group list (hardcoded for now...)
        'gr_list' => '0,-1',
        'cHash' => $cHash,
        // Array of the additional parameters
        'cHash_array' => $cHash_array,
        // Set to defaults
        'freeIndexUid' => 0,
        'freeIndexSetId' => 0,
        // @deprecated since TYPO3 v9, will be removed in TYPO3 v10. Remove along with database field data_page_reg1
        'page_cache_reg1' => 0,
        // Root line uids
        'rootline_uids' => $uidRL,
        // Whether to index external documents like PDF, DOC etc. (if possible)
        'index_externals' => 1,
        // Length of description text (max 250, default 200)
        'index_descrLgd' => 200,
        // Whether to index document keywords and description (if present)
        'index_metatags' => true,
    ];

    // Init and start indexing:
    $this->init();
}
414
/**
 * Stores the free-index uid and set id in $this->conf.
 * Can be called right after backend_initIndexer()
 *
 * @param int $freeIndexUid Free index UID
 * @param int $freeIndexSetId Set id - an integer identifying the "set" of indexing operations.
 */
public function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId = 0)
{
    $freeIndexSettings = [
        'freeIndexUid' => $freeIndexUid,
        'freeIndexSetId' => $freeIndexSetId,
    ];
    foreach ($freeIndexSettings as $key => $value) {
        $this->conf[$key] = $value;
    }
}
426
/**
 * Indexing records as the content of a TYPO3 page.
 *
 * The given values are HTML-escaped and wrapped into a minimal fake HTML document, which is
 * stored in $this->conf['content'] and then indexed through indexTypo3PageContent().
 *
 * @param string $title Title equivalent
 * @param string $keywords Keywords equivalent
 * @param string $description Description equivalent
 * @param string $content The main content to index
 * @param string $charset The charset of the title, keyword, description and body-content. MUST BE VALID, otherwise nothing is indexed!
 * @param int $mtime Last modification time, in seconds
 * @param int $crdate The creation date of the content, in seconds
 * @param int $recordUid The record UID that the content comes from (for registration with the indexed rows)
 */
public function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate = 0, $recordUid = 0)
{
    // Most recent modification time (seconds) of the content
    $this->conf['mtime'] = $mtime;
    // The creation date of the TYPO3 content
    $this->conf['crdate'] = $crdate;
    // UID of the record, if applicable
    $this->conf['recordUid'] = $recordUid;

    // Construct fake HTML for parsing, one array entry per line of the document
    // (the leading empty entry yields the initial line break):
    $fakeHtmlLines = [
        '',
        '<html>',
        '<head>',
        '<title>' . htmlspecialchars($title) . '</title>',
        '<meta name="keywords" content="' . htmlspecialchars($keywords) . '" />',
        '<meta name="description" content="' . htmlspecialchars($description) . '" />',
        '</head>',
        '<body>',
        htmlspecialchars($content),
        '</body>',
        '</html>',
    ];
    // Content string (HTML of TYPO3 page)
    $this->conf['content'] = implode("\n", $fakeHtmlLines);

    // Character set of content (will be converted to utf-8 during indexing)
    $this->conf['metaCharset'] = $charset;
    // Alternative title for indexing
    $this->conf['indexedDocTitle'] = '';

    // Index content as if it was a TYPO3 page:
    $this->indexTypo3PageContent();
}
469
470 /********************************
471 *
472 * Initialization
473 *
474 *******************************/
/**
 * Initializes the object. $this->conf MUST be set with proper values prior to this call!!!
 *
 * Sets up the cHash parameters and the page hashes, reads the extension configuration of
 * EXT:indexed_search into the age/limit properties, and instantiates the external parsers
 * (if external indexing is enabled), the lexer, the optional metaphone hook object and the
 * deprecated charset converter.
 *
 * Configuration keys that are not present are treated as "not set" instead of being read
 * unguarded.
 */
public function init()
{
    // Initializing:
    $this->cHashParams = $this->conf['cHash_array'] ?? [];
    if (is_array($this->cHashParams) && !empty($this->cHashParams)) {
        if (!empty($this->conf['cHash'])) {
            // Add this so that URL's come out right...
            $this->cHashParams['cHash'] = $this->conf['cHash'];
        }
        unset($this->cHashParams['encryptionKey']);
    }
    // Setting phash / phash_grouping which identifies the indexed page based on some of these variables:
    $this->setT3Hashes();
    // Indexer configuration from Extension Manager interface:
    $this->indexerConfig = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get('indexed_search');
    // minAge/maxAge are configured in hours; the float cast keeps fractional hours working
    // and avoids arithmetic on empty or non-numeric strings.
    $this->tstamp_minAge = MathUtility::forceIntegerInRange((float)($this->indexerConfig['minAge'] ?? 0) * 3600, 0);
    $this->tstamp_maxAge = MathUtility::forceIntegerInRange((float)($this->indexerConfig['maxAge'] ?? 0) * 3600, 0);
    $this->maxExternalFiles = MathUtility::forceIntegerInRange($this->indexerConfig['maxExternalFiles'] ?? 0, 0, 1000, 5);
    $this->flagBitMask = MathUtility::forceIntegerInRange($this->indexerConfig['flagBitMask'] ?? 0, 0, 255);
    // Workaround: If the extension configuration was not updated yet, the value is not existing
    $this->enableMetaphoneSearch = !isset($this->indexerConfig['enableMetaphoneSearch']) || $this->indexerConfig['enableMetaphoneSearch'];
    $this->storeMetaphoneInfoAsWords = !IndexedSearchUtility::isTableUsed('index_words') && $this->enableMetaphoneSearch;
    // Initialize external document parsers:
    // Example configuration, see ext_localconf.php of this file!
    if (!empty($this->conf['index_externals'])) {
        $this->initializeExternalParsers();
    }
    // Initialize lexer (class that deconstructs the text into words);
    // an unset or empty registration falls back to the default Lexer:
    $lexerObjectClassName = ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['lexer'] ?? '') ?: Lexer::class;
    $this->lexerObj = GeneralUtility::makeInstance($lexerObjectClassName);
    $this->lexerObj->debug = $this->indexerConfig['debugMode'] ?? null;
    // Initialize metaphone hook:
    // Make sure that the hook is loaded _after_ indexed_search as this may overwrite the hook depending on the configuration.
    $metaphoneClassName = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['metaphone'] ?? '';
    if ($this->enableMetaphoneSearch && $metaphoneClassName) {
        $this->metaphoneObj = GeneralUtility::makeInstance($metaphoneClassName);
        $this->metaphoneObj->pObj = $this;
    }
    // Init charset class:
    $this->csObj = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Charset\CharsetConverter::class);
}
518
/**
 * Instantiates the external parsers registered in
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'],
 * keyed by file extension, and keeps only those whose initParser() call succeeds.
 *
 * @access private
 * @see init()
 */
public function initializeExternalParsers()
{
    $registeredParsers = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'] ?? [];
    foreach ($registeredParsers as $extension => $className) {
        $parser = GeneralUtility::makeInstance($className);
        // Register the parser before initializing it, and give it access to this indexer:
        $this->external_parsers[$extension] = $parser;
        $parser->pObj = $this;
        if ($parser->initParser($extension)) {
            continue;
        }
        // Parser reported that it cannot be used: drop its entry again.
        unset($this->external_parsers[$extension]);
    }
}
536
537 /********************************
538 *
539 * Indexing; TYPO3 pages (HTML content)
540 *
541 *******************************/
/**
 * Start indexing of the TYPO3 page
 *
 * Indexes the page when the mtime/timestamp check demands it, when the group list is not
 * registered for the phash yet, or when indexing is forced. If the content hash is already
 * known (and the max-age was not exceeded), only timestamp, set id, group list and rootline
 * are updated.
 */
public function indexTypo3PageContent()
{
    $check = $this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']);
    $is_grlist = $this->is_grlist_set($this->hash['phash']);

    $indexingNeeded = $check > 0 || !$is_grlist || $this->forceIndexing;
    if (!$indexingNeeded) {
        $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
        return;
    }

    // Setting message:
    if ($this->forceIndexing) {
        $reason = 'Forced';
    } elseif ($check > 0) {
        $reason = $this->reasons[$check];
    } else {
        $reason = 'Updates gr_list!';
    }
    $this->log_setTSlogMessage('Indexing needed, reason: ' . $reason, 1);

    // Divide into title,keywords,description and body:
    $this->log_push('Split content', '');
    $this->contentParts = $this->splitHTMLContent($this->conf['content']);
    if ($this->conf['indexedDocTitle']) {
        $this->contentParts['title'] = $this->conf['indexedDocTitle'];
    }
    $this->log_pull();

    // Calculating a hash over what is to be the actual page content (title, description and
    // keywords included, so that a changed page title is noticed as well).
    $this->content_md5h = IndexedSearchUtility::md5inthash(implode('', $this->contentParts));

    // Check whether a page with the very same contentHash is already indexed. If so, nothing
    // needs to be re-indexed regardless of mtime; this also keeps pages from being re-indexed
    // for logged-in fe_users when the content does not differ from the already indexed one.
    $checkCHash = $this->checkContentHash();
    $contentAlreadyIndexed = is_array($checkCHash) && $check !== 1;
    if ($contentAlreadyIndexed) {
        // Update the timestamp
        $this->updateTstamp($this->hash['phash'], $this->conf['mtime']);
        $this->updateSetId($this->hash['phash']);
        // $checkCHash['phash'] is the phash of the result row that is similar to the current phash regarding the content hash.
        $this->update_grlist($checkCHash['phash'], $this->hash['phash']);
        $this->updateRootline();
        $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $this->content_md5h . ', has not changed. Timestamp, grlist and rootline updated if necessary.');
        return;
    }

    $Pstart = GeneralUtility::milliseconds();
    $this->log_push('Converting charset of content (' . $this->conf['metaCharset'] . ') to utf-8', '');
    $this->charsetEntity2utf8($this->contentParts, $this->conf['metaCharset']);
    $this->log_pull();

    // Splitting words
    $this->log_push('Extract words from content', '');
    $splitInWords = $this->processWordsInArrays($this->contentParts);
    $this->log_pull();

    // Analyse the indexed words.
    $this->log_push('Analyse the extracted words', '');
    $indexArr = $this->indexAnalyze($splitInWords);
    $this->log_pull();

    // Submitting page (phash) record
    $this->log_push('Submitting page', '');
    $this->submitPage();
    $this->log_pull();

    // Check words and submit to word list if not there
    $this->log_push('Check word list and submit words', '');
    if (IndexedSearchUtility::isTableUsed('index_words')) {
        $this->checkWordList($indexArr);
        $this->submitWords($indexArr, $this->hash['phash']);
    }
    $this->log_pull();

    // Set parsetime
    $this->updateParsetime($this->hash['phash'], GeneralUtility::milliseconds() - $Pstart);

    // Checking external files if configured for.
    $this->log_push('Checking external files', '');
    if ($this->conf['index_externals']) {
        $this->extractLinks($this->conf['content']);
    }
    $this->log_pull();
}
616
/**
 * Splits HTML content and returns an associative array, with title, a list of metatags, and a list of words in the body.
 *
 * Everything before the first "<body" is treated as the head part. A document without any
 * "<body" is treated as head-only, so that its title and meta tags are still picked up
 * (the body part is empty in that case).
 *
 * @param string $content HTML content to index. To some degree expected to be made by TYPO3 (ei. splitting the header by ":")
 * @return array Array of content, having keys "title", "body", "keywords" and "description" set.
 * @see splitRegularContent()
 */
public function splitHTMLContent($content)
{
    $content = (string)$content;
    $contentArr = $this->defaultContentArray;

    // Divide head from body. Without this guard a missing body would yield
    // substr($content, 0, -0), i.e. an empty head part.
    $bodyPart = stristr($content, '<body');
    if ($bodyPart === false) {
        $contentArr['body'] = '';
        $headPart = $content;
    } else {
        $contentArr['body'] = $bodyPart;
        $headPart = substr($content, 0, -strlen($bodyPart));
    }

    // Get title; if it contains a colon, only the part after the first colon is used
    $this->embracingTags($headPart, 'TITLE', $contentArr['title'], $dummy2, $dummy);
    $titleParts = explode(':', $contentArr['title'], 2);
    $contentArr['title'] = trim($titleParts[1] ?? $titleParts[0]);

    // Get keywords and description metatags
    if (!empty($this->conf['index_metatags'])) {
        $meta = [];
        $i = 0;
        // Each successful call strips one meta tag from $headPart and stores its attribute string.
        while ($this->embracingTags($headPart, 'meta', $dummy, $headPart, $meta[$i])) {
            $i++;
        }
        foreach ($meta as $attributeString) {
            // The last entry is left unset by the failing embracingTags() call above.
            // @todo The loop stops at the first unset tag. Is that correct?
            if ($attributeString === null) {
                break;
            }
            $attributes = GeneralUtility::get_tag_attributes($attributeString);
            // Not every meta tag carries name/content (e.g. http-equiv or charset tags).
            $metaName = (string)($attributes['name'] ?? '');
            $metaContent = (string)($attributes['content'] ?? '');
            if (stripos($metaName, 'keywords') !== false) {
                $contentArr['keywords'] .= ',' . $this->addSpacesToKeywordList($metaContent);
            }
            if (stripos($metaName, 'description') !== false) {
                $contentArr['description'] .= ',' . $metaContent;
            }
        }
    }

    // Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags:
    $this->typoSearchTags($contentArr['body']);

    // Get rid of unwanted sections (ie. scripting and style stuff) in body
    foreach (explode(',', $this->excludeSections) as $tag) {
        while ($this->embracingTags($contentArr['body'], $tag, $dummy, $contentArr['body'], $dummy2)) {
            // Each iteration removes one occurrence of the section from the body.
        }
    }

    // Remove tags, but first make sure we don't concatenate words by doing it
    $contentArr['body'] = str_replace('<', ' <', $contentArr['body']);
    $contentArr['body'] = trim(strip_tags($contentArr['body']));
    $contentArr['keywords'] = trim($contentArr['keywords']);
    $contentArr['description'] = trim($contentArr['description']);

    return $contentArr;
}
668
/**
 * Extract the charset value from HTML meta tag.
 *
 * The http-equiv "Content-Type" meta tag is looked at first; if it is absent or carries no
 * charset, the HTML5 form <meta charset="..."> is used as a fallback.
 *
 * @param string $content HTML content
 * @return string The charset value if found, otherwise an empty string.
 */
public function getHTMLcharset($content)
{
    $content = (string)$content;

    // Classic form: <meta http-equiv="Content-Type" content="text/html; charset=...">
    if (preg_match('/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i', $content, $reg)) {
        if (preg_match('/charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)/i', $reg[0], $reg2)) {
            return $reg2[1];
        }
    }

    // HTML5 form: <meta charset="...">
    if (preg_match('/<meta\\b[^>]*?[[:space:]]charset[[:space:]]*=[[:space:]]*["\']?([[:alnum:]-]+)/i', $content, $reg3)) {
        return $reg3[1];
    }

    // Always return a string, as documented, so callers can pass the result to string functions.
    return '';
}
683
/**
 * Converts a HTML document to utf-8
 *
 * The source charset is taken from $charset or, if that is empty, from getHTMLcharset().
 * Content is re-encoded unless the charset is empty or already "utf-8"; afterwards HTML
 * entities are decoded.
 *
 * @param string $content HTML content, any charset
 * @param string $charset Optional charset (otherwise extracted from HTML)
 * @return string Converted HTML
 */
public function convertHTMLToUtf8($content, $charset = '')
{
    // Find charset:
    $sourceCharset = $charset;
    if (!$sourceCharset) {
        $sourceCharset = $this->getHTMLcharset($content);
    }
    $sourceCharset = trim(strtolower($sourceCharset));

    // Convert charset:
    $needsConversion = $sourceCharset && $sourceCharset !== 'utf-8';
    $utf8Content = $needsConversion
        ? mb_convert_encoding($content, 'utf-8', $sourceCharset)
        : $content;

    // Convert entities, assuming document is now UTF-8:
    return html_entity_decode($utf8Content);
}
703
/**
 * Finds first occurrence of embracing tags and returns the embraced content and the original string with
 * the tag removed in the two passed variables. Returns FALSE if no match found. ie. useful for finding
 * <title> of document or removing <script>-sections
 *
 * If the start tag is found but no matching end tag follows, $tagContent is set to an empty
 * string and $stringAfter to everything after the start tag. The reference parameters are
 * left untouched when the tag is not found at all.
 *
 * @param string $string String to search in
 * @param string $tagName Tag name, eg. "script"
 * @param string $tagContent Passed by reference: Content inside found tag
 * @param string $stringAfter Passed by reference: Content after found tag
 * @param string $paramList Passed by reference: Attributes of the found tag.
 * @return bool Returns FALSE if tag was not found, otherwise TRUE.
 */
public function embracingTags($string, $tagName, &$tagContent, &$stringAfter, &$paramList)
{
    $string = (string)$string;
    $endTag = '</' . $tagName . '>';
    $startTag = '<' . $tagName;
    // stristr used because we want a case-insensitive search for the tag.
    $fromStartTag = stristr($string, $startTag);
    // if the tag was not found, return FALSE
    if ($fromStartTag === false) {
        return false;
    }
    // Split the remainder at the first ">" into the attribute list and what follows the
    // start tag. An unterminated start tag (no ">") simply has nothing following it.
    $tagParts = explode('>', (string)substr($fromStartTag, strlen($startTag)), 2);
    $paramList = $tagParts[0];
    $afterStartTag = $tagParts[1] ?? '';
    $fromEndTag = stristr($afterStartTag, $endTag);
    if ($fromEndTag !== false) {
        $stringBefore = substr($string, 0, stripos($string, $startTag));
        $tagContent = substr($afterStartTag, 0, strlen($afterStartTag) - strlen($fromEndTag));
        $stringAfter = $stringBefore . substr($fromEndTag, strlen($endTag));
    } else {
        $tagContent = '';
        $stringAfter = $afterStartTag;
    }
    return true;
}
738
/**
 * Removes content that shouldn't be indexed according to TYPO3SEARCH-tags.
 *
 * The body is split at every "<!--TYPO3SEARCH_" marker. Text following a "begin" marker is
 * kept. An "end" marker keeps the text that preceded the first marker when no "begin" marker
 * came before it. If no marker is present the body is left unchanged.
 *
 * @param string $body HTML Content, passed by reference
 * @return bool Returns TRUE if a TYPOSEARCH_ tag was found, otherwise FALSE.
 */
public function typoSearchTags(&$body)
{
    $expBody = preg_split('/\\<\\!\\-\\-[\\s]?TYPO3SEARCH_/', (string)$body);
    if ($expBody === false || count($expBody) <= 1) {
        return false;
    }
    $body = '';
    // Text of the last segment that did not start with a begin/end marker
    $prev = '';
    foreach ($expBody as $val) {
        $part = explode('-->', $val, 2);
        $marker = trim($part[0]);
        if ($marker === 'begin') {
            // A marker without the closing "-->" has no content to add.
            $body .= $part[1] ?? '';
            $prev = '';
        } elseif ($marker === 'end') {
            $body .= $prev;
            // Consume the pending text so a repeated "end" marker does not append it twice.
            $prev = '';
        } else {
            $prev = $val;
        }
    }
    return true;
}
765
/**
 * Extract links (hrefs) from HTML content and if indexable media is found, it is indexed.
 *
 * External URLs (links with a scheme and no local path) are indexed only if
 * "indexExternalURLs" is enabled. Links without query string that resolve to an existing
 * local file are either queued for the crawler (if "useCrawlerForExternalFiles" is set and
 * the crawler extension is loaded) or indexed directly.
 *
 * @param string $content HTML content
 */
public function extractLinks($content)
{
    // Get links:
    $list = $this->extractHyperLinks($content);
    // The crawler object stays NULL unless file indexing is delegated to the crawler.
    $crawler = null;
    if (!empty($this->indexerConfig['useCrawlerForExternalFiles']) && \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler')) {
        $crawler = GeneralUtility::makeInstance(\tx_crawler_lib::class);
    }
    // Traverse links:
    foreach ($list as $linkInfo) {
        // localPath means: This file is sent by a download script. While the indexed URL has to
        // point to $linkInfo['href'], the absolute path to the file is specified here!
        $hasLocalPath = !empty($linkInfo['localPath']);
        // Decode entities:
        $linkSource = htmlspecialchars_decode($hasLocalPath ? $linkInfo['localPath'] : $linkInfo['href']);
        // Parse URL (parse_url() returns FALSE for seriously malformed URLs):
        $qParts = parse_url($linkSource) ?: [];
        // Check for jumpurl (TYPO3 specific thing...)
        $query = $qParts['query'] ?? '';
        if ($query && strpos($query, 'jumpurl=') !== false) {
            parse_str($query, $getP);
            $linkSource = $getP['jumpurl'] ?? '';
            $qParts = parse_url($linkSource) ?: [];
        }
        if (!$hasLocalPath && !empty($qParts['scheme'])) {
            if (!empty($this->indexerConfig['indexExternalURLs'])) {
                // Index external URL (http or otherwise)
                $this->indexExternalUrl($linkSource);
            }
            continue;
        }
        if (!empty($qParts['query'])) {
            // Local links with a query string are not treated as files.
            continue;
        }
        $linkSource = urldecode($linkSource);
        if (GeneralUtility::isAllowedAbsPath($linkSource)) {
            $localFile = $linkSource;
        } else {
            $localFile = GeneralUtility::getFileAbsFileName(PATH_site . $linkSource);
        }
        if (!$localFile || !@is_file($localFile)) {
            continue;
        }
        // Index local file:
        if (is_object($crawler)) {
            $params = ['document' => $linkSource];
            if ($hasLocalPath) {
                $params['alturl'] = $linkInfo['href'];
            }
            // The page content is not needed by the crawler callback, so it is not queued.
            $params['conf'] = $this->conf;
            unset($params['conf']['content']);
            $crawler->addQueueEntry_callBack(0, $params, Hook\CrawlerFilesHook::class, $this->conf['id']);
            $this->log_setTSlogMessage('media "' . $params['document'] . '" added to "crawler" queue.', 1);
        } elseif ($hasLocalPath) {
            // Files without an extension yield an empty extension string.
            $ext = strtolower(pathinfo($linkSource, PATHINFO_EXTENSION));
            $this->indexRegularDocument($linkInfo['href'], false, $linkSource, $ext);
        } else {
            $this->indexRegularDocument($linkSource);
        }
    }
}
841
/**
 * Extracts all links to external documents from the HTML content string
 *
 * Anchors without a href attribute and links that start with "#" are skipped.
 *
 * @param string $html
 * @return array Array of hyperlinks (keys: tag, href, localPath (empty if not local))
 * @see extractLinks()
 */
public function extractHyperLinks($html)
{
    $htmlParser = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Html\HtmlParser::class);
    $htmlParts = $htmlParser->splitTags('a', $html);
    $hyperLinksData = [];
    foreach ($htmlParts as $index => $tagData) {
        // Only the odd entries of the split result are examined as tags.
        if ($index % 2 === 0) {
            continue;
        }
        $tagAttributes = $htmlParser->get_tag_attributes($tagData, true);
        $firstTagName = $htmlParser->getFirstTagName($tagData);
        if (strtolower($firstTagName) !== 'a') {
            continue;
        }
        // Anchors like <a name="top"> carry no href at all.
        $href = (string)($tagAttributes[0]['href'] ?? '');
        if ($href && $href[0] !== '#') {
            $hyperLinksData[] = [
                'tag' => $tagData,
                'href' => $href,
                'localPath' => $this->createLocalPath($href)
            ];
        }
    }
    return $hyperLinksData;
}
871
/**
 * Extracts the "base href" from content string.
 *
 * The first <base> tag that carries a non-empty "href" attribute wins.
 * <base> tags without "href" (for example <base target="_blank">) are
 * ignored, so the method always returns a string.
 *
 * @param string $html Content to analyze
 * @return string The base href or an empty string if not found
 */
public function extractBaseHref($html)
{
    $htmlParser = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Html\HtmlParser::class);
    $htmlParts = $htmlParser->splitTags('base', $html);
    foreach ($htmlParts as $index => $tagData) {
        // Only the odd entries of the split result are treated as tags
        if ($index % 2 === 0) {
            continue;
        }
        $tagAttributes = $htmlParser->get_tag_attributes($tagData, true);
        $firstTagName = $htmlParser->getFirstTagName($tagData);
        if (strtolower($firstTagName) !== 'base') {
            continue;
        }
        // A missing attribute must not turn the return value into null
        $href = (string)($tagAttributes[0]['href'] ?? '');
        if ($href !== '') {
            return $href;
        }
    }
    return '';
}
897
898 /******************************************
899 *
900 * Indexing; external URL
901 *
902 ******************************************/
/**
 * Index External URLs HTML content
 *
 * The headers of the URL are requested first; the content is only fetched
 * and indexed if a "Content-Type" header containing "text/html" is present.
 * The content is written to a temporary file which is handed over to
 * indexRegularDocument() and removed afterwards.
 *
 * @param string $externalUrl URL, eg. "http://typo3.org/
 * @see indexRegularDocument()
 */
public function indexExternalUrl($externalUrl)
{
    // Get headers:
    $urlHeaders = $this->getUrlHeaders($externalUrl);
    if (!is_array($urlHeaders)) {
        // No answer from the remote side
        return;
    }
    // HTTP header names are case-insensitive, so "content-type" has to be
    // accepted as well as "Content-Type".
    $contentType = '';
    foreach ($urlHeaders as $headerName => $headerValue) {
        if (strcasecmp(trim((string)$headerName), 'Content-Type') === 0) {
            $contentType = (string)$headerValue;
        }
    }
    if (stristr($contentType, 'text/html') === false) {
        return;
    }
    $content = ($this->indexExternalUrl_content = GeneralUtility::getUrl($externalUrl));
    if ((string)$content === '') {
        return;
    }
    // Create temporary file:
    $tmpFile = GeneralUtility::tempnam('EXTERNAL_URL');
    if (!$tmpFile) {
        return;
    }
    try {
        if (GeneralUtility::writeFile($tmpFile, $content)) {
            // Index that file. Using "TRUE" for second parameter to force indexing
            // of external URLs (mtime doesn't make sense, does it?)
            $this->indexRegularDocument($externalUrl, true, $tmpFile, 'html');
        }
    } finally {
        // Remove the temporary file even if indexing failed with an exception
        unlink($tmpFile);
    }
}
928
/**
 * Getting HTTP request headers of URL
 *
 * Every non-empty line of the header response is split at the first colon;
 * the part before the colon becomes the array key, the trimmed remainder
 * the value. Lines without a colon (such as the HTTP status line) are
 * stored with an empty string as value.
 *
 * @param string $url The URL
 * @return array|bool If no answer, returns FALSE. Otherwise an array where HTTP headers are keys
 */
public function getUrlHeaders($url)
{
    // Try to get the headers only
    $content = GeneralUtility::getUrl($url, 2);
    if ((string)$content === '') {
        return false;
    }
    // Compile headers:
    $retVal = [];
    foreach (GeneralUtility::trimExplode(LF, $content, true) as $line) {
        // NOTE(review): trimExplode() is asked to drop empty values here, so
        // presumably this guard never fires - confirm before relying on it.
        if (trim($line) === '') {
            break;
        }
        $lineParts = explode(':', $line, 2);
        $retVal[trim($lineParts[0])] = trim($lineParts[1] ?? '');
    }
    return $retVal;
}
953
/**
 * Checks if the file is local
 *
 * The resolver methods are tried one after another; the result of the
 * first one that yields a non-empty path is returned.
 *
 * @param string $sourcePath
 * @return string Absolute path to file if file is local, else empty string
 */
protected function createLocalPath($sourcePath)
{
    $resolvers = [
        'createLocalPathFromT3vars',
        'createLocalPathUsingAbsRefPrefix',
        'createLocalPathUsingDomainURL',
        'createLocalPathFromAbsoluteURL',
        'createLocalPathFromRelativeURL'
    ];
    $resolvedPath = '';
    foreach ($resolvers as $resolver) {
        $resolvedPath = $this->{$resolver}($sourcePath);
        if ($resolvedPath != '') {
            return $resolvedPath;
        }
    }
    return $resolvedPath;
}
978
/**
 * Attempts to create a local file path from T3VARs. This is useful for
 * various download extensions that hide actual file name but still want the
 * file to be indexed.
 *
 * The lookup key is the short MD5 of the source path; the registered value
 * is returned if it points to an existing file.
 *
 * @param string $sourcePath
 * @return string
 */
protected function createLocalPathFromT3vars($sourcePath)
{
    $registeredFiles = $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'];
    if (!is_array($registeredFiles)) {
        return '';
    }
    $lookupKey = GeneralUtility::shortMD5($sourcePath);
    // Note: not using self::isAllowedLocalFile here because this method
    // is allowed to index files outside of the web site (for example,
    // protected downloads)
    $isRegisteredFile = isset($registeredFiles[$lookupKey]) && is_file($registeredFiles[$lookupKey]);
    return $isRegisteredFile ? $registeredFiles[$lookupKey] : '';
}
1002
/**
 * Attempts to create a local file path by matching a current request URL.
 *
 * If the source path starts with the site URL, that prefix is replaced by
 * PATH_site and the result is validated with isAllowedLocalFile().
 *
 * @param string $sourcePath
 * @return string
 */
protected function createLocalPathUsingDomainURL($sourcePath)
{
    $siteUrl = GeneralUtility::getIndpEnv('TYPO3_SITE_URL');
    $siteUrlLength = strlen($siteUrl);
    if (substr($sourcePath, 0, $siteUrlLength) != $siteUrl) {
        return '';
    }
    $candidate = PATH_site . substr($sourcePath, $siteUrlLength);
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1023
/**
 * Attempts to create a local file path by matching absRefPrefix. This
 * requires TSFE. If TSFE is missing, this function does nothing.
 *
 * If the source path starts with the configured absRefPrefix, that prefix
 * is replaced by PATH_site and the result is validated with
 * isAllowedLocalFile().
 *
 * @param string $sourcePath
 * @return string
 */
protected function createLocalPathUsingAbsRefPrefix($sourcePath)
{
    $frontendController = $GLOBALS['TSFE'];
    if (!($frontendController instanceof TypoScriptFrontendController)) {
        return '';
    }
    $prefix = $frontendController->config['config']['absRefPrefix'];
    $prefixLength = strlen($prefix);
    if ($prefixLength === 0 || substr($sourcePath, 0, $prefixLength) != $prefix) {
        return '';
    }
    $candidate = PATH_site . substr($sourcePath, $prefixLength);
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1047
/**
 * Attempts to create a local file path from the absolute URL without
 * schema.
 *
 * Applies to source paths starting with "/": the leading slash is dropped,
 * PATH_site is prepended and the result is validated with
 * isAllowedLocalFile().
 *
 * @param string $sourcePath
 * @return string
 */
protected function createLocalPathFromAbsoluteURL($sourcePath)
{
    if ($sourcePath[0] !== '/') {
        return '';
    }
    $candidate = PATH_site . substr($sourcePath, 1);
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1067
/**
 * Attempts to create a local file path from the relative URL.
 *
 * Applies to source paths accepted by isRelativeURL(): PATH_site is
 * prepended and the result is validated with isAllowedLocalFile().
 *
 * @param string $sourcePath
 * @return string
 */
protected function createLocalPathFromRelativeURL($sourcePath)
{
    if (!self::isRelativeURL($sourcePath)) {
        return '';
    }
    $candidate = PATH_site . $sourcePath;
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1085
/**
 * Checks if URL is relative.
 *
 * A URL counts as relative when it has no scheme and its path does not
 * start with "/". URLs that parse_url() cannot parse, or that have no path
 * component at all, are reported as relative as well.
 *
 * @param string $url
 * @return bool
 */
protected static function isRelativeURL($url)
{
    $urlParts = @parse_url($url);
    if (!is_array($urlParts)) {
        // parse_url() returns false for seriously malformed URLs
        $urlParts = [];
    }
    // "scheme" and "path" are only present if the URL contains them
    $scheme = (string)($urlParts['scheme'] ?? '');
    $path = (string)($urlParts['path'] ?? '');
    return $scheme === '' && ($path === '' || $path[0] !== '/');
}
1097
/**
 * Checks if the path points to the file inside the web site
 *
 * Back paths are resolved first; the resolved path has to start with
 * PATH_site and has to be an existing file.
 *
 * @param string $filePath
 * @return bool
 */
protected static function isAllowedLocalFile($filePath)
{
    $resolvedPath = GeneralUtility::resolveBackPath($filePath);
    $sitePathLength = strlen(PATH_site);
    $startsWithSitePath = substr($resolvedPath, 0, $sitePathLength) == PATH_site;
    $existsAsFile = is_file($resolvedPath);
    return $startsWithSitePath && $existsAsFile;
}
1111
1112 /******************************************
1113 *
1114 * Indexing; external files (PDF, DOC, etc)
1115 *
1116 ******************************************/
/**
 * Indexing a regular document given as $file (relative to PATH_site, local file)
 *
 * The document is only processed if the file exists and an external parser
 * is registered for its extension. Each content part reported by
 * fileContentParts() is indexed separately; a part is (re-)indexed if
 * checkMtimeTstamp() reports a positive reason or $force is set, and only
 * while the external file counter is below the configured maximum (unless
 * forced). A section record is submitted for every part, indexed or not.
 *
 * @param string $file Relative Filename, relative to PATH_site. It can also be an absolute path as long as it is inside the lockRootPath (validated with \TYPO3\CMS\Core\Utility\GeneralUtility::isAbsPath()). Finally, if $contentTmpFile is set, this value can be anything, most likely a URL
 * @param bool $force If set, indexing is forced (despite content hashes, mtime etc).
 * @param string $contentTmpFile Temporary file with the content to read it from (instead of $file). Used when the $file is a URL.
 * @param string $altExtension File extension for temporary file.
 */
public function indexRegularDocument($file, $force = false, $contentTmpFile = '', $altExtension = '')
{
    // Init
    $fI = pathinfo($file);
    // pathinfo() has no "extension" key for names without an extension
    $ext = $altExtension ?: strtolower($fI['extension'] ?? '');
    // Create abs-path:
    if (!$contentTmpFile) {
        if (!GeneralUtility::isAbsPath($file)) {
            // Relative, prepend PATH_site:
            $absFile = GeneralUtility::getFileAbsFileName(PATH_site . $file);
        } else {
            // Absolute, pass-through:
            $absFile = $file;
        }
        $absFile = GeneralUtility::isAllowedAbsPath($absFile) ? $absFile : '';
    } else {
        $absFile = $contentTmpFile;
    }
    if (!$absFile || !@is_file($absFile)) {
        $this->log_setTSlogMessage('Indexing not possible; File "' . $absFile . '" not found or valid.');
        return;
    }
    if (empty($this->external_parsers[$ext])) {
        $this->log_setTSlogMessage('Indexing not possible; The extension "' . $ext . '" was not supported.');
        return;
    }
    // Indexing the document:
    $fileInfo = stat($absFile);
    $cParts = $this->fileContentParts($ext, $absFile);
    foreach ($cParts as $cPKey) {
        $this->internal_log = [];
        $this->log_push('Index: ' . str_replace('.', '_', PathUtility::basename($file)) . ($cPKey ? '#' . $cPKey : ''), '');
        $Pstart = GeneralUtility::milliseconds();
        $subinfo = ['key' => $cPKey];
        // Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
        $phash_arr = ($this->file_phash_arr = $this->setExtHashes($file, $subinfo));
        $check = $this->checkMtimeTstamp($fileInfo['mtime'], $phash_arr['phash']);
        if ($check > 0 || $force) {
            if ($check > 0) {
                $this->log_setTSlogMessage('Indexing needed, reason: ' . $this->reasons[$check], 1);
            } else {
                $this->log_setTSlogMessage('Indexing forced by flag', 1);
            }
            // Check external file counter:
            if ($this->externalFileCounter < $this->maxExternalFiles || $force) {
                // Divide into title,keywords,description and body:
                $this->log_push('Split content', '');
                $contentParts = $this->readFileContent($ext, $absFile, $cPKey);
                $this->log_pull();
                if (is_array($contentParts)) {
                    // Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
                    // The separator is passed first: the reversed argument
                    // order of implode() is not accepted by PHP 8 anymore.
                    $content_md5h = IndexedSearchUtility::md5inthash(implode('', $contentParts));
                    if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force) {
                        // Increment counter:
                        $this->externalFileCounter++;
                        // Splitting words
                        $this->log_push('Extract words from content', '');
                        $splitInWords = $this->processWordsInArrays($contentParts);
                        $this->log_pull();
                        // Analyse the indexed words.
                        $this->log_push('Analyse the extracted words', '');
                        $indexArr = $this->indexAnalyze($splitInWords);
                        $this->log_pull();
                        // Submitting page (phash) record
                        $this->log_push('Submitting page', '');
                        // Unfortunately I cannot determine WHEN a file is originally made - so I must return the modification time...
                        $this->submitFilePage($phash_arr, $file, $subinfo, $ext, $fileInfo['mtime'], $fileInfo['ctime'], $fileInfo['size'], $content_md5h, $contentParts);
                        $this->log_pull();
                        // Check words and submit to word list if not there
                        $this->log_push('Check word list and submit words', '');
                        if (IndexedSearchUtility::isTableUsed('index_words')) {
                            $this->checkWordList($indexArr);
                            $this->submitWords($indexArr, $phash_arr['phash']);
                        }
                        $this->log_pull();
                        // Set parsetime
                        $this->updateParsetime($phash_arr['phash'], GeneralUtility::milliseconds() - $Pstart);
                    } else {
                        // Update the timestamp
                        $this->updateTstamp($phash_arr['phash'], $fileInfo['mtime']);
                        $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $content_md5h . ', has not changed. Timestamp updated.');
                    }
                } else {
                    $this->log_setTSlogMessage('Could not index file! Unsupported extension.');
                }
            } else {
                $this->log_setTSlogMessage('The limit of ' . $this->maxExternalFiles . ' has already been exceeded, so no indexing will take place this time.');
            }
        } else {
            $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
        }
        // Checking and setting sections:
        // Setting a section-record for the file. This is done also if the file is not indexed. Notice that section records are deleted when the page is indexed.
        $this->submitFile_section($phash_arr['phash']);
        $this->log_pull();
    }
}
1222
/**
 * Reads the content of an external file being indexed.
 * The content from the external parser MUST be returned in utf-8!
 *
 * The call is delegated to the external parser object registered for the
 * file extension; without such an object null is returned.
 *
 * @param string $fileExtension File extension, eg. "pdf", "doc" etc.
 * @param string $absoluteFileName Absolute filename of file (must exist and be validated OK before calling function)
 * @param string $sectionPointer Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be splitted.)
 * @return array Standard content array (title, description, keywords, body keys)
 */
public function readFileContent($fileExtension, $absoluteFileName, $sectionPointer)
{
    // Consult relevant external document parser:
    $parser = $this->external_parsers[$fileExtension];
    if (!is_object($parser)) {
        return null;
    }
    return $parser->readFileContent($fileExtension, $absoluteFileName, $sectionPointer);
}
1241
/**
 * Creates an array with pointers to divisions of document.
 *
 * The call is delegated to the external parser object registered for the
 * extension; without such an object a single pointer "0" is returned.
 *
 * @param string $ext File extension
 * @param string $absFile Absolute filename (must exist and be validated OK before calling function)
 * @return array Array of pointers to sections that the document should be divided into
 */
public function fileContentParts($ext, $absFile)
{
    // Consult relevant external document parser:
    $parser = $this->external_parsers[$ext];
    if (!is_object($parser)) {
        return [0];
    }
    return $parser->fileContentParts($ext, $absFile);
}
1258
/**
 * Splits non-HTML content (from external files for instance)
 *
 * The default content array is copied and its "body" entry is set to the
 * given content.
 *
 * @param string $content Input content (non-HTML) to index.
 * @return array Array of content, having the key "body" set (plus "title", "description" and "keywords", but empty)
 * @see splitHTMLContent()
 */
public function splitRegularContent($content)
{
    return array_replace($this->defaultContentArray, ['body' => $content]);
}
1272
1273 /**********************************
1274 *
1275 * Analysing content, Extracting words
1276 *
1277 **********************************/
/**
 * Convert character set and HTML entities in the value of input content array keys
 *
 * Empty values are left untouched. The charset conversion is skipped when
 * the given charset is empty or already names UTF-8 (compared
 * case-insensitively). Entities are decoded with ENT_QUOTES so that numeric
 * single-quote entities such as "&#039;" are converted as well.
 *
 * @param array $contentArr Standard content array
 * @param string $charset Charset of the input content (converted to utf-8)
 */
public function charsetEntity2utf8(&$contentArr, $charset)
{
    $sourceCharset = strtolower(trim((string)$charset));
    // An empty source charset cannot be converted and would wipe the content
    $needsConversion = $sourceCharset !== '' && $sourceCharset !== 'utf-8' && $sourceCharset !== 'utf8';
    foreach ($contentArr as $key => $value) {
        if ((string)$value === '') {
            continue;
        }
        // Convert charset if necessary
        if ($needsConversion) {
            $value = mb_convert_encoding($value, 'utf-8', $sourceCharset);
        }
        // decode all numeric / html-entities in the string to real characters:
        $contentArr[$key] = html_entity_decode($value, ENT_QUOTES | ENT_HTML401, 'UTF-8');
    }
}
1297
/**
 * Processing words in the array from split*Content -functions
 *
 * Every entry of the content array is split into words by the lexer.
 * Duplicate words are then removed from the "title", "keywords" and
 * "description" entries.
 *
 * @param array $contentArr Array of content to index, see splitHTMLContent() and splitRegularContent()
 * @return array Content input array modified so each key is now an array of words
 */
public function processWordsInArrays($contentArr)
{
    // split all parts to words
    foreach (array_keys($contentArr) as $section) {
        $contentArr[$section] = $this->lexerObj->split2Words($contentArr[$section]);
    }
    // For title, keywords, and description we don't want duplicates:
    foreach (['title', 'keywords', 'description'] as $section) {
        $contentArr[$section] = array_unique($contentArr[$section]);
    }
    // Return modified array:
    return $contentArr;
}
1317
/**
 * Extracts the sample description text from the content array.
 *
 * Whitespace runs in the body are collapsed to single spaces and the result
 * is cut to the configured length ("index_descrLgd", forced into the range
 * 0-255, default 200). A length of 0 yields an empty description.
 *
 * @param array $contentArr Content array
 * @return string Description string
 */
public function bodyDescription($contentArr)
{
    // Setting description
    $maxL = MathUtility::forceIntegerInRange($this->conf['index_descrLgd'], 0, 255, 200);
    if (!$maxL) {
        // Always return a string, even if no description is wanted
        return '';
    }
    $bodyDescription = preg_replace('/\s+/u', ' ', $contentArr['body'] ?? '');
    // Shorten the string:
    return mb_strcut((string)$bodyDescription, 0, $maxL, 'utf-8');
}
1335
/**
 * Analyzes content to use for indexing,
 *
 * The header parts are analyzed first (title, keywords, description, with
 * the offsets 7, 6 and 5), followed by the body.
 *
 * @param array $content Standard content array: an array with the keys title,keywords,description and body, which all contain an array of words.
 * @return array Index Array (whatever that is...)
 */
public function indexAnalyze($content)
{
    $indexArr = [];
    $headerOffsets = [
        'title' => 7,
        'keywords' => 6,
        'description' => 5
    ];
    foreach ($headerOffsets as $headerKey => $offset) {
        $this->analyzeHeaderinfo($indexArr, $content, $headerKey, $offset);
    }
    $this->analyzeBody($indexArr, $content);
    return $indexArr;
}
1351
/**
 * Calculates relevant information for headercontent
 *
 * For every word of the given content key the index array entry receives
 * the word hash and metaphone value (on first occurrence), the priority bit
 * 2^$offset is OR-ed into "cmp" and "count" is increased by one.
 *
 * @param array $retArr Index array, passed by reference
 * @param array $content Standard content array
 * @param string $key Key from standard content array
 * @param int $offset Bit-wise priority to type
 */
public function analyzeHeaderinfo(&$retArr, $content, $key, $offset)
{
    foreach ($content[$key] as $val) {
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        $val = substr($val, 0, 60);
        if (!isset($retArr[$val])) {
            // Word ID (wid)
            $retArr[$val]['hash'] = IndexedSearchUtility::md5inthash($val);
            // Metaphone value is also 60 only chars long
            $metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
            $retArr[$val]['metaphone'] = $metaphone;
        }
        // Build metaphone fulltext string (can be used for fulltext indexing)
        if ($this->storeMetaphoneInfoAsWords) {
            $this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
        }
        // Priority used for flagBitMask feature (see extension configuration).
        // "cmp" does not exist yet on the first occurrence of a word.
        $retArr[$val]['cmp'] = ($retArr[$val]['cmp'] ?? 0) | pow(2, $offset);
        // Increase number of occurrences ("count" starts at zero)
        $retArr[$val]['count'] = ($retArr[$val]['count'] ?? 0) + 1;
        $this->wordcount++;
    }
}
1383
/**
 * Calculates relevant information for bodycontent
 *
 * For every body word the index array entry receives the position of the
 * first occurrence, the word hash and the metaphone value (when the word is
 * not part of the index array yet) and "count" is increased by one.
 *
 * @param array $retArr Index array, passed by reference
 * @param array $content Standard content array
 */
public function analyzeBody(&$retArr, $content)
{
    foreach ($content['body'] as $key => $val) {
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        $val = substr($val, 0, 60);
        if (!isset($retArr[$val])) {
            // First occurrence (used for ranking results)
            $retArr[$val]['first'] = $key;
            // Word ID (wid)
            $retArr[$val]['hash'] = IndexedSearchUtility::md5inthash($val);
            // Metaphone value is also only 60 chars long
            $metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
            $retArr[$val]['metaphone'] = $metaphone;
        }
        // Build metaphone fulltext string (can be used for fulltext indexing)
        if ($this->storeMetaphoneInfoAsWords) {
            $this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
        }
        // Increase number of occurrences ("count" does not exist yet for new words)
        $retArr[$val]['count'] = ($retArr[$val]['count'] ?? 0) + 1;
        $this->wordcount++;
    }
}
1413
/**
 * Creating metaphone based hash from input word
 *
 * The raw value comes from the metaphone object if one is set, otherwise
 * from PHP's native metaphone() function. Unless the raw value is
 * requested, it is turned into an integer hash; an empty raw value gives 0.
 *
 * @param string $word Word to convert
 * @param bool $returnRawMetaphoneValue If set, returns the raw metaphone value (not hashed)
 * @return mixed Metaphone hash integer (or raw value, string)
 */
public function metaphone($word, $returnRawMetaphoneValue = false)
{
    // Use native PHP function instead of advanced doubleMetaphone class
    // when no metaphone object is available
    $rawValue = is_object($this->metaphoneObj)
        ? $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid'])
        : metaphone($word);
    if ($returnRawMetaphoneValue) {
        return $rawValue;
    }
    if ($rawValue !== '') {
        // Create hash and return integer
        return IndexedSearchUtility::md5inthash($rawValue);
    }
    return 0;
}
1439
1440 /********************************
1441 *
1442 * SQL; TYPO3 Pages
1443 *
1444 *******************************/
/**
 * Updates db with information about the page (TYPO3 page, not external media)
 *
 * Existing records for the current phash are removed first. Afterwards the
 * index_phash record, the section and gr_list records, the index_fulltext
 * record and - in debug mode - the index_debug record are written. Each
 * direct insert is skipped if the respective table is not in use.
 */
public function submitPage()
{
    // Remove any current data for this phash:
    $this->removeOldIndexedPages($this->hash['phash']);
    // setting new phash_row
    $fields = [
        'phash' => $this->hash['phash'],
        'phash_grouping' => $this->hash['phash_grouping'],
        'cHashParams' => serialize($this->cHashParams),
        'contentHash' => $this->content_md5h,
        'data_page_id' => $this->conf['id'],
        // @deprecated since TYPO3 v9, will be removed in TYPO3 v10. Remove along with database field data_page_reg1
        'data_page_reg1' => $this->conf['page_cache_reg1'],
        'data_page_type' => $this->conf['type'],
        'data_page_mp' => $this->conf['MP'],
        'gr_list' => $this->conf['gr_list'],
        // TYPO3 page
        'item_type' => 0,
        'item_title' => $this->contentParts['title'],
        'item_description' => $this->bodyDescription($this->contentParts),
        'item_mtime' => (int)$this->conf['mtime'],
        'item_size' => strlen($this->conf['content']),
        'tstamp' => $GLOBALS['EXEC_TIME'],
        'crdate' => $GLOBALS['EXEC_TIME'],
        // Creation date of page
        'item_crdate' => $this->conf['crdate'],
        // Sys language uid of the page. Should reflect which language it DOES actually display!
        'sys_language_uid' => $this->conf['sys_language_uid'],
        'externalUrl' => 0,
        'recordUid' => (int)$this->conf['recordUid'],
        'freeIndexUid' => (int)$this->conf['freeIndexUid'],
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId']
    ];
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash');
        $connection->insert(
            'index_phash',
            $fields,
            ['cHashParams' => Connection::PARAM_LOB]
        );
    }
    // PROCESSING index_section
    $this->submit_section($this->hash['phash'], $this->hash['phash']);
    // PROCESSING index_grlist
    $this->submit_grlist($this->hash['phash'], $this->hash['phash']);
    // PROCESSING index_fulltext
    $fields = [
        'phash' => $this->hash['phash'],
        'fulltextdata' => implode(' ', $this->contentParts),
        'metaphonedata' => $this->metaphoneContent
    ];
    if ($this->indexerConfig['fullTextDataLength'] > 0) {
        // Cut on a character boundary: a plain substr() may split a
        // multi-byte UTF-8 character and leave an invalid byte sequence.
        $fields['fulltextdata'] = mb_strcut($fields['fulltextdata'], 0, (int)$this->indexerConfig['fullTextDataLength'], 'utf-8');
    }
    if (IndexedSearchUtility::isTableUsed('index_fulltext')) {
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_fulltext');
        $connection->insert('index_fulltext', $fields);
    }
    // PROCESSING index_debug
    if ($this->indexerConfig['debugMode']) {
        $fields = [
            'phash' => $this->hash['phash'],
            'debuginfo' => serialize([
                'cHashParams' => $this->cHashParams,
                'external_parsers initialized' => array_keys($this->external_parsers),
                'conf' => array_merge($this->conf, ['content' => substr($this->conf['content'], 0, 1000)]),
                'contentParts' => array_merge($this->contentParts, ['body' => substr($this->contentParts['body'], 0, 1000)]),
                'logs' => $this->internal_log,
                'lexer' => $this->lexerObj->debugString
            ])
        ];
        if (IndexedSearchUtility::isTableUsed('index_debug')) {
            $connection = GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('index_debug');
            $connection->insert('index_debug', $fields);
        }
    }
}
1528
/**
 * Stores gr_list in the database.
 *
 * The record holds both phash values, the configured gr_list and its
 * integer hash. Nothing is written if the index_grlist table is not in use.
 *
 * @param int $hash Search result record phash
 * @param int $phash_x Actual phash of current content
 * @see update_grlist()
 */
public function submit_grlist($hash, $phash_x)
{
    // Setting the gr_list record
    $groupList = $this->conf['gr_list'];
    $record = [
        'phash' => $hash,
        'phash_x' => $phash_x,
        'hash_gr_list' => IndexedSearchUtility::md5inthash($groupList),
        'gr_list' => $groupList
    ];
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }
    GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist')
        ->insert('index_grlist', $record);
}
1551
/**
 * Stores section
 * $hash and $hash_t3 are the same for TYPO3 pages, but different when it is external files.
 *
 * The record is passed to getRootLineFields() before it is inserted.
 * Nothing is written if the index_section table is not in use.
 *
 * @param int $hash phash of TYPO3 parent search result record
 * @param int $hash_t3 phash of the file indexation search record
 */
public function submit_section($hash, $hash_t3)
{
    $record = [
        'phash' => $hash,
        'phash_t3' => $hash_t3,
        'page_id' => (int)$this->conf['id']
    ];
    $this->getRootLineFields($record);
    if (!IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }
    GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_section')
        ->insert('index_section', $record);
}
1573
/**
 * Removes records for the indexed page, $phash
 *
 * Records are deleted by "phash" from all index tables in use and, in
 * addition, by "phash_t3" from index_section.
 *
 * @param int $phash phash value to flush
 */
public function removeOldIndexedPages($phash)
{
    $phashValue = (int)$phash;
    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
    // Removing old registrations for all tables. Because the pages are TYPO3 pages
    // there can be nothing else than 1-1 relations here.
    foreach (['index_phash', 'index_section', 'index_grlist', 'index_fulltext', 'index_debug'] as $table) {
        if (!IndexedSearchUtility::isTableUsed($table)) {
            continue;
        }
        $connectionPool->getConnectionForTable($table)->delete($table, ['phash' => $phashValue]);
    }

    // Removing all index_section records with hash_t3 set to this hash (this includes such
    // records set for external media on the page as well!). The re-insert of these records
    // are done in indexRegularDocument($file).
    if (IndexedSearchUtility::isTableUsed('index_section')) {
        $sectionConnection = $connectionPool->getConnectionForTable('index_section');
        $sectionConnection->delete('index_section', ['phash_t3' => $phashValue]);
    }
}
1599
1600 /********************************
1601 *
1602 * SQL; External media
1603 *
1604 *******************************/
/**
 * Updates db with information about the file
 *
 * Existing file records for the phash are removed first. Afterwards the
 * index_phash record, the index_fulltext record and - in debug mode - the
 * index_debug record are written. Each insert is skipped if the respective
 * table is not in use.
 *
 * @param array $hash Array with phash and phash_grouping keys for file
 * @param string $file File name
 * @param array $subinfo Array of "cHashParams" for files: This is for instance the page index for a PDF file (other document types it will be a zero)
 * @param string $ext File extension determining the type of media.
 * @param int $mtime Modification time of file.
 * @param int $ctime Creation time of file.
 * @param int $size Size of file in bytes
 * @param int $content_md5h Content HASH value.
 * @param array $contentParts Standard content array (using only title and body for a file)
 */
public function submitFilePage($hash, $file, $subinfo, $ext, $mtime, $ctime, $size, $content_md5h, $contentParts)
{
    // Find item Type:
    $storeItemType = $this->external_parsers[$ext]->ext2itemtype_map[$ext];
    $storeItemType = $storeItemType ?: $ext;
    // Remove any current data for this phash:
    $this->removeOldIndexedFiles($hash['phash']);
    // Split filename:
    $fileParts = parse_url($file);
    // Setting new
    $fields = [
        'phash' => $hash['phash'],
        'phash_grouping' => $hash['phash_grouping'],
        'cHashParams' => serialize($subinfo),
        'contentHash' => $content_md5h,
        'data_filename' => $file,
        'item_type' => $storeItemType,
        'item_title' => trim($contentParts['title']) ?: PathUtility::basename($file),
        'item_description' => $this->bodyDescription($contentParts),
        'item_mtime' => $mtime,
        'item_size' => $size,
        'item_crdate' => $ctime,
        'tstamp' => $GLOBALS['EXEC_TIME'],
        'crdate' => $GLOBALS['EXEC_TIME'],
        'gr_list' => $this->conf['gr_list'],
        // parse_url() may return false or an array without "scheme"
        'externalUrl' => !empty($fileParts['scheme']) ? 1 : 0,
        'recordUid' => (int)$this->conf['recordUid'],
        'freeIndexUid' => (int)$this->conf['freeIndexUid'],
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId'],
        'sys_language_uid' => (int)$this->conf['sys_language_uid']
    ];
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash');
        $connection->insert(
            'index_phash',
            $fields,
            ['cHashParams' => Connection::PARAM_LOB]
        );
    }
    // PROCESSING index_fulltext
    $fields = [
        'phash' => $hash['phash'],
        'fulltextdata' => implode(' ', $contentParts),
        'metaphonedata' => $this->metaphoneContent
    ];
    if ($this->indexerConfig['fullTextDataLength'] > 0) {
        // Cut on a character boundary: a plain substr() may split a
        // multi-byte UTF-8 character and leave an invalid byte sequence.
        $fields['fulltextdata'] = mb_strcut($fields['fulltextdata'], 0, (int)$this->indexerConfig['fullTextDataLength'], 'utf-8');
    }
    if (IndexedSearchUtility::isTableUsed('index_fulltext')) {
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_fulltext');
        $connection->insert('index_fulltext', $fields);
    }
    // PROCESSING index_debug
    if ($this->indexerConfig['debugMode']) {
        $fields = [
            'phash' => $hash['phash'],
            'debuginfo' => serialize([
                'cHashParams' => $subinfo,
                'contentParts' => array_merge($contentParts, ['body' => substr($contentParts['body'], 0, 1000)]),
                'logs' => $this->internal_log,
                'lexer' => $this->lexerObj->debugString
            ])
        ];
        if (IndexedSearchUtility::isTableUsed('index_debug')) {
            $connection = GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('index_debug');
            $connection->insert('index_debug', $fields);
        }
    }
}
1690
1691 /**
1692 * Stores file gr_list for a file IF it does not exist already
1693 *
1694 * @param int $hash phash value of file
1695 */
public function submitFile_grlist($hash)
{
    // Without the index_grlist table there is nothing to look up or store.
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }

    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getQueryBuilderForTable('index_grlist');
    $expr = $queryBuilder->expr();

    // The row must belong to the given phash ...
    $matchesPhash = $expr->eq(
        'phash',
        $queryBuilder->createNamedParameter($hash, \PDO::PARAM_INT)
    );
    // ... and carry either the hash of the default gr_list or the hash of the
    // gr_list of the current configuration. If such a row exists there is no
    // need to place another one.
    $matchesDefaultGrList = $expr->eq(
        'hash_gr_list',
        $queryBuilder->createNamedParameter(
            IndexedSearchUtility::md5inthash($this->defaultGrList),
            \PDO::PARAM_INT
        )
    );
    $matchesCurrentGrList = $expr->eq(
        'hash_gr_list',
        $queryBuilder->createNamedParameter(
            IndexedSearchUtility::md5inthash($this->conf['gr_list']),
            \PDO::PARAM_INT
        )
    );

    $existingEntries = (int)$queryBuilder
        ->count('*')
        ->from('index_grlist')
        ->where(
            $matchesPhash,
            $expr->orX($matchesDefaultGrList, $matchesCurrentGrList)
        )
        ->execute()
        ->fetchColumn();

    // For files the "search result" phash and the "real" phash are the same value.
    if ($existingEntries === 0) {
        $this->submit_grlist($hash, $hash);
    }
}
1736
1737 /**
1738 * Stores file section for a file IF it does not exist
1739 *
1740 * @param int $hash phash value of file
1741 */
public function submitFile_section($hash)
{
    // Without the index_section table there is nothing to look up or store.
    if (!IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }

    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getQueryBuilderForTable('index_section');

    // A section counts as present when a row exists for this phash on the current page id.
    $matchesPhash = $queryBuilder->expr()->eq(
        'phash',
        $queryBuilder->createNamedParameter($hash, \PDO::PARAM_INT)
    );
    $matchesPage = $queryBuilder->expr()->eq(
        'page_id',
        $queryBuilder->createNamedParameter($this->conf['id'], \PDO::PARAM_INT)
    );

    $sectionCount = (int)$queryBuilder
        ->count('phash')
        ->from('index_section')
        ->where($matchesPhash, $matchesPage)
        ->execute()
        ->fetchColumn();

    // Only write a section record when none was found.
    if ($sectionCount === 0) {
        $this->submit_section($hash, $this->hash['phash']);
    }
}
1770
1771 /**
1772 * Removes records for the indexed page, $phash
1773 *
1774 * @param int $phash phash value to flush
1775 */
public function removeOldIndexedFiles($phash)
{
    // Every table below is cleaned by the same phash identifier.
    $identifier = ['phash' => (int)$phash];
    $pool = GeneralUtility::makeInstance(ConnectionPool::class);

    foreach (['index_phash', 'index_grlist', 'index_fulltext', 'index_debug'] as $tableName) {
        // Tables that are not in use are left untouched.
        if (IndexedSearchUtility::isTableUsed($tableName)) {
            $pool->getConnectionForTable($tableName)->delete($tableName, $identifier);
        }
    }
}
1788
1789 /********************************
1790 *
1791 * SQL Helper functions
1792 *
1793 *******************************/
1794 /**
1795 * Check the mtime / tstamp of the currently indexed page/file (based on phash)
1796 * Return positive integer if the page needs to be indexed
1797 *
1798 * @param int $mtime mtime value to test against limits and indexed page (usually this is the mtime of the cached document)
1799 * @param int $phash "phash" used to select any already indexed page to see what its mtime is.
1800 * @return int Result integer: Generally: <0 = No indexing, >0 = Do indexing (see $this->reasons): -2) Min age was NOT exceeded and so indexing cannot occur. -1) mtime matched so no need to reindex page. 0) N/A 1) Max age exceeded, page must be indexed again. 2) mtime of indexed page doesn't match mtime given for current content and we must index page. 3) No mtime was set, so we will index... 4) No indexed page found, so of course we will index.
1801 */
public function checkMtimeTstamp($mtime, $phash)
{
    // 4: Not indexed (index_phash is not in use, so nothing can be looked up).
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return 4;
    }

    $indexedRow = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->select(
            ['item_mtime', 'tstamp'],
            'index_phash',
            ['phash' => (int)$phash],
            [],
            [],
            1
        )
        ->fetch();

    // 4: Page has never been indexed (is not represented in the index_phash table).
    if (empty($indexedRow)) {
        return 4;
    }

    $now = $GLOBALS['EXEC_TIME'];

    // 1: The configured max-age was exceeded for the document and thus it's indexed.
    if ($this->tstamp_maxAge && $indexedRow['tstamp'] + $this->tstamp_maxAge < $now) {
        return 1;
    }

    // -2: A minimum age is configured and has not been exceeded yet.
    $minAgeExceeded = !$this->tstamp_minAge || $indexedRow['tstamp'] + $this->tstamp_minAge < $now;
    if (!$minAgeExceeded) {
        return -2;
    }

    // 3: The minimum age was exceeded, but mtime was not set, so the page is indexed.
    if (!$mtime) {
        return 3;
    }

    // 2: mtime differs from the mtime stored in index_phash, so it is time to re-index.
    // The comparison is intentionally loose, as in the stored/compared values may differ in type.
    if ($indexedRow['item_mtime'] != $mtime) {
        return 2;
    }

    // -1: mtime matched the document, so no changes detected and no content updated.
    // The timestamp is only refreshed when no max-age is configured.
    if ($this->tstamp_maxAge) {
        $this->log_setTSlogMessage('mtime matched, timestamp NOT updated because a maxAge is set (' . ($indexedRow['tstamp'] + $this->tstamp_maxAge - $now) . ' seconds to expire time).', 1);
    } else {
        $this->updateTstamp($phash);
        $this->log_setTSlogMessage('mtime matched, timestamp updated.', 1);
    }
    return -1;
}
1859
1860 /**
1861 * Check content hash in phash table
1862 *
1863 * @return mixed Returns TRUE if the page needs to be indexed (that is, there was no result), otherwise the phash value (in an array) of the phash record to which the grlist_record should be related!
1864 */
public function checkContentHash()
{
    // When index_phash is not in use no comparison is possible: report "needs indexing".
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return true;
    }

    // The page only needs indexing if no record with the same "phash_grouping"
    // already holds identical content (same content hash).
    $criteria = [
        'phash_grouping' => (int)$this->hash['phash_grouping'],
        'contentHash' => (int)$this->content_md5h
    ];
    $matchingRow = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->select(['phash'], 'index_phash', $criteria, [], [], 1)
        ->fetch();

    // TRUE when nothing matched, otherwise the fetched row (an array holding "phash").
    return empty($matchingRow) ? true : $matchingRow;
}
1890
1891 /**
1892 * Check content hash for external documents
1893 * Returns TRUE if the document needs to be indexed (that is, there was no result)
1894 *
1895 * @param int $hashGr phash value to check (phash_grouping)
1896 * @param int $content_md5h Content hash to check
1897 * @return bool Returns TRUE if the document needs to be indexed (that is, there was no result)
1898 */
public function checkExternalDocContentHash($hashGr, $content_md5h)
{
    // When index_phash is not in use no comparison is possible: report "needs indexing".
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return true;
    }

    $criteria = [
        'phash_grouping' => (int)$hashGr,
        'contentHash' => (int)$content_md5h
    ];
    $matchingDocuments = (int)GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->count('*', 'index_phash', $criteria);

    // The document needs indexing only if no record with this grouping and content hash exists.
    return $matchingDocuments === 0;
}
1918
1919 /**
1920 * Checks if a grlist record has been set for the phash value input (looking at the "real" phash of the current content, not the linked-to phash of the common search result page)
1921 *
1922 * @param int $phash_x Phash integer to test.
1923 * @return bool
1924 */
public function is_grlist_set($phash_x)
{
    // When index_grlist is not in use, no grlist record can exist.
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return false;
    }

    $entryCount = (int)GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist')
        ->count('phash_x', 'index_grlist', ['phash_x' => (int)$phash_x]);

    return $entryCount > 0;
}
1941
1942 /**
1943 * Check if a grlist entry for this hash exists and, if not, write one.
1944 *
1945 * @param int $phash phash of the search result that should be found
1946 * @param int $phash_x The real phash of the current content. The two values are different when a page with userlogin turns out to contain the exact same content as another already indexed version of the page; This is the whole reason for the grlist table in fact...
1947 * @see submit_grlist()
1948 */
public function update_grlist($phash, $phash_x)
{
    // Nothing to check or write when index_grlist is not in use.
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }

    // Look for an entry combining this phash with the hash of the current gr_list.
    $criteria = [
        'phash' => (int)$phash,
        'hash_gr_list' => IndexedSearchUtility::md5inthash($this->conf['gr_list'])
    ];
    $entryCount = (int)GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist')
        ->count('phash', 'index_grlist', $criteria);

    if ($entryCount !== 0) {
        return;
    }

    // No entry yet: write one and leave a trace in the log.
    $this->submit_grlist($phash, $phash_x);
    $this->log_setTSlogMessage('Inserted gr_list \'' . $this->conf['gr_list'] . '\' for phash \'' . $phash . '\'', 1);
}
1969
1970 /**
1971 * Update tstamp for a phash row.
1972 *
1973 * @param int $phash phash value
1974 * @param int $mtime If set, update the mtime field to this value.
1975 */
public function updateTstamp($phash, $mtime = 0)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return;
    }

    // tstamp is always refreshed; item_mtime only when a (truthy) $mtime was passed.
    $changes = ['tstamp' => $GLOBALS['EXEC_TIME']];
    $changes += $mtime ? ['item_mtime' => (int)$mtime] : [];

    $connection = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash');
    $connection->update('index_phash', $changes, ['phash' => (int)$phash]);
}
2000
2001 /**
2002 * Update SetID of the index_phash record.
2003 *
2004 * @param int $phash phash value
2005 */
public function updateSetId($phash)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return;
    }

    // Write the freeIndexSetId of the current configuration to the row of this phash.
    $changes = ['freeIndexSetId' => (int)$this->conf['freeIndexSetId']];
    $identifier = ['phash' => (int)$phash];

    $connection = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash');
    $connection->update('index_phash', $changes, $identifier);
}
2024
2025 /**
2026 * Update parsetime for phash row.
2027 *
2028 * @param int $phash phash value.
2029 * @param int $parsetime Parsetime value to set.
2030 */
public function updateParsetime($phash, $parsetime)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return;
    }

    // Store the (integer) parse time on the row of this phash.
    $changes = ['parsetime' => (int)$parsetime];
    $identifier = ['phash' => (int)$phash];

    $connection = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash');
    $connection->update('index_phash', $changes, $identifier);
}
2049
2050 /**
2051 * Update section rootline for the page
2052 */
public function updateRootline()
{
    if (!IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }

    // Collect the rootline columns (filled in by reference) ...
    $rootlineValues = [];
    $this->getRootLineFields($rootlineValues);

    // ... and write them to every index_section row of the current page id.
    $connection = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_section');
    $connection->update(
        'index_section',
        $rootlineValues,
        ['page_id' => (int)$this->conf['id']]
    );
}
2072
2073 /**
2074 * Adding values for root-line fields.
2075 * rl0, rl1 and rl2 are standard. A hook might add more.
2076 *
2077 * @param array $fieldArray Field array, passed by reference
2078 */
public function getRootLineFields(array &$fieldArray)
{
    // The rootline may be shorter than three levels (or not set at all); missing
    // levels are stored as 0 without triggering "undefined offset/index" notices.
    $rootlineUids = $this->conf['rootline_uids'] ?? [];

    // Standard fields: rl0, rl1 and rl2 map to rootline levels 0, 1 and 2.
    foreach (['rl0' => 0, 'rl1' => 1, 'rl2' => 2] as $fieldName => $rootLineLevel) {
        $fieldArray[$fieldName] = (int)($rootlineUids[$rootLineLevel] ?? 0);
    }

    // Additional fields registered through the extension configuration (field name => rootline level).
    // These are applied last, so they can overwrite a standard field of the same name.
    foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'] ?? [] as $fieldName => $rootLineLevel) {
        $fieldArray[$fieldName] = (int)($rootlineUids[$rootLineLevel] ?? 0);
    }
}
2088
2089 /********************************
2090 *
2091 * SQL; Submitting words
2092 *
2093 *******************************/
2094 /**
2095 * Adds new words to db
2096 *
2097 * @param array $wordListArray Word List array (where each word has information about position etc).
2098 */
public function checkWordList($wordListArray)
{
    if (!IndexedSearchUtility::isTableUsed('index_words') || empty($wordListArray)) {
        return;
    }

    $expectedWordCount = count($wordListArray);
    $wordIds = array_map('intval', array_column($wordListArray, 'hash'));

    // First a count only: if every word id is already stored, nothing has to be inserted.
    $countQuery = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('index_words');
    $knownWordCount = (int)$countQuery
        ->count('baseword')
        ->from('index_words')
        ->where(
            $countQuery->expr()->in(
                'wid',
                $countQuery->createNamedParameter($wordIds, Connection::PARAM_INT_ARRAY)
            )
        )
        ->execute()
        ->fetchColumn();

    if ($knownWordCount === $expectedWordCount) {
        return;
    }

    // Counts differ: fetch the basewords that do exist so they can be skipped.
    $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('index_words');
    $selectQuery = $connection->createQueryBuilder();
    $knownWords = $selectQuery
        ->select('baseword')
        ->from('index_words')
        ->where(
            $selectQuery->expr()->in(
                'wid',
                $selectQuery->createNamedParameter($wordIds, Connection::PARAM_INT_ARRAY)
            )
        )
        ->execute();

    $this->log_setTSlogMessage('Inserting words: ' . ($expectedWordCount - $knownWordCount), 1);

    // $wordListArray is a local copy keyed by baseword; drop everything already stored.
    while ($knownWord = $knownWords->fetch()) {
        unset($wordListArray[$knownWord['baseword']]);
    }

    // What remains is inserted. Note carried over from the original code: a duplicate-key
    // error occurs here if an existing word was NOT removed above; this is said not to be a
    // problem as long as words are not longer than the 60 characters of the baseword column.
    foreach ($wordListArray as $baseword => $wordInfo) {
        $connection->insert(
            'index_words',
            [
                'wid' => $wordInfo['hash'],
                'baseword' => $baseword,
                'metaphone' => $wordInfo['metaphone']
            ]
        );
    }
}
2154
2155 /**
2156 * Submits RELATIONS between words and phash
2157 *
2158 * @param array $wordList Word list array
2159 * @param int $phash phash value
2160 */
public function submitWords($wordList, $phash)
{
    if (!IndexedSearchUtility::isTableUsed('index_rel')) {
        return;
    }

    $pool = GeneralUtility::makeInstance(ConnectionPool::class);

    // Collect the ids of all words flagged as stop words (is_stopword != 0).
    $stopWordQuery = $pool->getQueryBuilderForTable('index_words');
    $stopWordResult = $stopWordQuery
        ->select('wid')
        ->from('index_words')
        ->where(
            $stopWordQuery->expr()->neq('is_stopword', $stopWordQuery->createNamedParameter(0, \PDO::PARAM_INT))
        )
        ->groupBy('wid')
        ->execute();

    $stopWordIds = [];
    while ($stopWordRow = $stopWordResult->fetch()) {
        // Only the key is used for the lookup below.
        $stopWordIds[$stopWordRow['wid']] = true;
    }

    // Replace the relations of this phash: delete the old ones first.
    $relationConnection = $pool->getConnectionForTable('index_rel');
    $relationConnection->delete('index_rel', ['phash' => (int)$phash]);

    // Build one relation row per word that is not a stop word.
    $relationRows = [];
    foreach ($wordList as $word) {
        if (!isset($stopWordIds[$word['hash']])) {
            $relationRows[] = [
                (int)$phash,
                (int)$word['hash'],
                (int)$word['count'],
                (int)$word['first'],
                $this->freqMap($word['count'] / $this->wordcount),
                $word['cmp'] & $this->flagBitMask
            ];
        }
    }

    if ($relationRows !== []) {
        // Column order matches the order of the values in each row above.
        $relationConnection->bulkInsert(
            'index_rel',
            $relationRows,
            ['phash', 'wid', 'count', 'first', 'freq', 'flags']
        );
    }
}
2203
2204 /**
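2205 * Maps a frequency given as a real number in [0;1] to a value in [0;$this->freqRange] (multiplying it by
2206 * $this->freqMax * 100 * $this->freqRange and capping the result at $this->freqRange); values above 1 are mapped back by dividing by the same factor.
2207 *
2208 * @param float $freq Frequency
2209 * @return int|float Mapped frequency. The value is not cast, so it may be a float.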
2210 */
public function freqMap($freq)
{
    $mapFactor = $this->freqMax * 100 * $this->freqRange;

    if ($freq <= 1) {
        // Forward mapping: scale up and cap at the upper end of the range.
        $scaledFreq = $freq * $mapFactor;
        if ($scaledFreq > $this->freqRange) {
            return $this->freqRange;
        }
        // Note: the scaled value is returned as-is (no integer cast).
        return $scaledFreq;
    }

    // Reverse mapping for values above 1: scale down by the same factor.
    return $freq / $mapFactor;
}
2222
2223 /********************************
2224 *
2225 * Hashing
2226 *
2227 *******************************/
2228 /**
2229 * Get search hash, T3 pages
2230 */
public function setT3Hashes()
{
    // Grouping hash: identifies a "page" as the combination of id, type, language,
    // mount point and cHash parameters.
    $groupingParts = [
        'id' => (int)$this->conf['id'],
        'type' => (int)$this->conf['type'],
        'sys_lang' => (int)$this->conf['sys_language_uid'],
        'MP' => (string)$this->conf['MP'],
        'cHash' => $this->cHashParams
    ];
    $this->hash['phash_grouping'] = IndexedSearchUtility::md5inthash(serialize($groupingParts));

    // Plain phash: the same parts with gr_list appended as the last element, so that a page
    // composition depending on the login is taken into account as well.
    $pageParts = $groupingParts + ['gr_list' => (string)$this->conf['gr_list']];
    $this->hash['phash'] = IndexedSearchUtility::md5inthash(serialize($pageParts));
}
2247
2248 /**
2249 * Get search hash, external files
2250 *
2251 * @param string $file File name / path which identifies it on the server
2252 * @param array $subinfo Additional content identifying the (subpart of) content. For instance; PDF files are divided into groups of pages for indexing.
2253 * @return array Array with "phash_grouping" and "phash" inside.
2254 */
public function setExtHashes($file, $subinfo = [])
{
    // The grouping hash is built from the file alone ...
    $groupingParts = ['file' => $file];
    // ... while the plain phash additionally covers the sub information.
    $documentParts = ['file' => $file, 'subinfo' => $subinfo];

    return [
        'phash_grouping' => IndexedSearchUtility::md5inthash(serialize($groupingParts)),
        'phash' => IndexedSearchUtility::md5inthash(serialize($documentParts))
    ];
}
2269
2270 /*********************************
2271 *
2272 * Internal logging functions
2273 *
2274 *********************************/
2275 /**
2276 * Push function wrapper for TT logging
2277 *
2278 * @param string $msg Title to set
2279 * @param string $key Key (?)
2280 */
public function log_push($msg, $key)
{
    // Pass both arguments through unchanged to the time tracker held by this object.
    $tracker = $this->timeTracker;
    $tracker->push($msg, $key);
}
2285
2286 /**
2287 * Pull function wrapper for TT logging
2288 */
public function log_pull()
{
    // Counterpart of log_push(): forwards to pull() on the time tracker held by this object.
    $tracker = $this->timeTracker;
    $tracker->pull();
}
2293
2294 /**
2295 * Set log message function wrapper for TT logging
2296 *
2297 * @param string $msg Message to set
2298 * @param int $errorNum Error number
2299 */
public function log_setTSlogMessage($msg, $errorNum = 0)
{
    // Report the message to the time tracker first ...
    $tracker = $this->timeTracker;
    $tracker->setTSlogMessage($msg, $errorNum);
    // ... then keep a copy of the message text in the internal log of this object.
    $this->internal_log[] = $msg;
}
2305
2306 /**
2307 * Makes sure that keywords are space-separated. This is important for their
2308 * proper display as a part of the fulltext index.
2309 *
2310 * @param string $keywordList
2311 * @return string
2312 * @see http://forge.typo3.org/issues/14959
2313 */
protected function addSpacesToKeywordList($keywordList)
{
    // Split on commas (trimming each keyword), re-join with ", " and pad with one space on each side.
    $spacedKeywords = implode(', ', GeneralUtility::trimExplode(',', $keywordList));
    return ' ' . $spacedKeywords . ' ';
}
2319 }