[TASK] Use null coalescing operator where possible
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / Classes / Indexer.php
1 <?php
2 namespace TYPO3\CMS\IndexedSearch;
3
4 /*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use TYPO3\CMS\Core\Configuration\ExtensionConfiguration;
18 use TYPO3\CMS\Core\Database\Connection;
19 use TYPO3\CMS\Core\Database\ConnectionPool;
20 use TYPO3\CMS\Core\TimeTracker\TimeTracker;
21 use TYPO3\CMS\Core\Utility\GeneralUtility;
22 use TYPO3\CMS\Core\Utility\MathUtility;
23 use TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController;
24 use TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility;
25
26 /**
27 * Indexing class for TYPO3 frontend
28 */
29 class Indexer
30 {
/**
 * Human readable explanations for the status codes returned by checkMtimeTstamp(),
 * keyed by that status code. Used for log messages in indexTypo3PageContent().
 *
 * @var array
 */
public $reasons = [
    -1 => 'mtime matched the document, so no changes detected and no content updated',
    -2 => 'The minimum age was not exceeded',
    1 => 'The configured max-age was exceeded for the document and thus it\'s indexed.',
    2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
    3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
    4 => 'Page has never been indexed (is not represented in the index_phash table).'
];

/**
 * HTML code blocks to exclude from indexing
 *
 * @var string
 */
public $excludeSections = 'script,style';

/**
 * Supported Extensions for external files.
 * External parser objects, keys are file extension names. Values are objects with certain methods.
 *
 * @var array
 */
public $external_parsers = [];

/**
 * Fe-group list (pages might be indexed separately for each usergroup combination to support search
 * in access limited pages!)
 *
 * @var string
 */
public $defaultGrList = '0,-1';

/**
 * Min/Max times:
 * If set, this tells a number of seconds that is the maximum age of an indexed document.
 * Regardless of mtime the document will be re-indexed if this limit is exceeded.
 *
 * @var int
 */
public $tstamp_maxAge = 0;

/**
 * If set, this tells a minimum limit before a document can be indexed again. This is regardless of mtime.
 *
 * @var int
 */
public $tstamp_minAge = 0;

/**
 * Max number of external files to index.
 *
 * @var int
 */
public $maxExternalFiles = 0;

/**
 * If TRUE, indexing is forced despite of hashes etc.
 *
 * @var bool
 */
public $forceIndexing = false;

/**
 * Set when crawler is detected (internal)
 *
 * @var bool
 */
public $crawlerActive = false;

/**
 * Template of the content array filled by splitHTMLContent()
 *
 * @var array
 */
public $defaultContentArray = [
    'title' => '',
    'description' => '',
    'keywords' => '',
    'body' => ''
];

/**
 * @var int
 */
public $wordcount = 0;

/**
 * @var int
 */
public $externalFileCounter = 0;

/**
 * Configuration set internally (see init functions for required keys and their meaning)
 *
 * @var array
 */
public $conf = [];

/**
 * Indexer configuration, coming from TYPO3's system configuration for EXT:indexed_search
 *
 * @var array
 */
public $indexerConfig = [];

/**
 * Hash array, contains phash and phash_grouping
 *
 * @var array
 */
public $hash = [];

/**
 * Hash array for files
 *
 * @var array
 */
public $file_phash_arr = [];

/**
 * Content of TYPO3 page, split into parts by splitHTMLContent()
 *
 * @var array
 */
public $contentParts = [];

/**
 * Hash calculated over the concatenated content parts (see indexTypo3PageContent())
 *
 * @var string
 */
public $content_md5h = '';

/**
 * Internal log
 *
 * @var array
 */
public $internal_log = [];

/**
 * Content most recently fetched by indexExternalUrl()
 *
 * @var string
 */
public $indexExternalUrl_content = '';

/**
 * cHashparams array
 *
 * @var array
 */
public $cHashParams = [];

/**
 * @var int
 */
public $freqRange = 32000;

/**
 * @var float
 */
public $freqMax = 0.1;

/**
 * @var bool
 */
public $enableMetaphoneSearch = false;

/**
 * @var bool
 */
public $storeMetaphoneInfoAsWords;

/**
 * @var string
 */
public $metaphoneContent = '';

/**
 * Charset class object
 *
 * @var \TYPO3\CMS\Core\Charset\CharsetConverter
 */
public $csObj;

/**
 * Metaphone object, if any
 *
 * @var \TYPO3\CMS\IndexedSearch\Utility\DoubleMetaPhoneUtility
 */
public $metaphoneObj;

/**
 * Lexer object for word splitting
 *
 * @var \TYPO3\CMS\IndexedSearch\Lexer
 */
public $lexerObj;

/**
 * Bit mask taken from the extension configuration; init() forces it into the integer range 0-255
 *
 * @var int
 */
public $flagBitMask;

/**
 * @var TimeTracker
 */
protected $timeTracker;
238
/**
 * Indexer constructor.
 *
 * Obtains the TimeTracker instance and keeps it in $this->timeTracker.
 */
public function __construct()
{
    $timeTracker = GeneralUtility::makeInstance(TimeTracker::class);
    $this->timeTracker = $timeTracker;
}
246
/**
 * Parent Object (TSFE) Initialization
 *
 * Decides whether the rendered frontend page should be indexed and, if so, copies the
 * relevant page information into $this->conf, initializes the indexer and starts indexing.
 * The reason for skipping a page is written to the TS log.
 *
 * Optional configuration keys are read with fallbacks, so a missing crawler section or a
 * missing "config.index_*" setting does not raise "undefined index" notices.
 *
 * @param TypoScriptFrontendController $pObj Parent Object, passed by reference
 */
public function hook_indexContent(&$pObj)
{
    // Indexer configuration from Extension Manager interface:
    $disableFrontendIndexing = (bool)GeneralUtility::makeInstance(ExtensionConfiguration::class)->get('indexed_search', 'disableFrontendIndexing');
    // Crawler activation:
    // Requirements are that the crawler is loaded, a crawler session is running and re-indexing requested as processing instruction:
    $crawlerData = $pObj->applicationData['tx_crawler'] ?? [];
    if (
        \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler')
        && !empty($crawlerData['running'])
        && in_array('tx_indexedsearch_reindex', $crawlerData['parameters']['procInstructions'] ?? [], true)
    ) {
        // Setting simple log message:
        $pObj->applicationData['tx_crawler']['log'][] = 'Forced Re-indexing enabled';
        // Crawler active flag
        $this->crawlerActive = true;
        $this->forceIndexing = true;
    }
    // Determine if page should be indexed, and if so, configure and initialize indexer
    if (empty($pObj->config['config']['index_enable'])) {
        return;
    }
    $this->log_push('Index page', '');
    if ($disableFrontendIndexing && !$this->crawlerActive) {
        $this->log_setTSlogMessage('Index page? No, Ordinary Frontend indexing during rendering is disabled.');
    } elseif (!empty($pObj->page['no_search'])) {
        $this->log_setTSlogMessage('Index page? No, The "No Search" flag has been set in the page properties!');
    } elseif ($pObj->no_cache) {
        $this->log_setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
    } elseif ((int)$pObj->sys_language_uid !== (int)$pObj->sys_language_content) {
        $this->log_setTSlogMessage('Index page? No, ->sys_language_uid was different from sys_language_content which indicates that the page contains fall-back content and that would be falsely indexed as localized content.');
    } else {
        // Setting up internal configuration from config array:
        $this->conf = [];
        // Information about page for which the indexing takes place
        // Page id
        $this->conf['id'] = $pObj->id;
        // Page type
        $this->conf['type'] = $pObj->type;
        // sys_language UID of the language of the indexing.
        $this->conf['sys_language_uid'] = $pObj->sys_language_uid;
        // MP variable, if any (Mount Points)
        $this->conf['MP'] = $pObj->MP;
        // Group list
        $this->conf['gr_list'] = $pObj->gr_list;
        // cHash string for additional parameters
        $this->conf['cHash'] = $pObj->cHash;
        // Array of the additional parameters
        $this->conf['cHash_array'] = $pObj->cHash_array;
        // The creation date of the TYPO3 page
        $this->conf['crdate'] = $pObj->page['crdate'];
        // reg1 of the caching table. Not known what practical use this has.
        $this->conf['page_cache_reg1'] = $pObj->page_cache_reg1;
        // Root line uids
        $this->conf['rootline_uids'] = [];
        foreach ($pObj->config['rootLine'] ?? [] as $rlkey => $rldat) {
            $this->conf['rootline_uids'][$rlkey] = $rldat['uid'];
        }
        // Content string (HTML of TYPO3 page)
        $this->conf['content'] = $pObj->content;
        // Alternative title for indexing
        $this->conf['indexedDocTitle'] = $pObj->convOutputCharset($pObj->indexedDocTitle);
        // Character set of content (will be converted to utf-8 during indexing)
        $this->conf['metaCharset'] = $pObj->metaCharset;
        // Most recent modification time (seconds) of the content on the page. Used to evaluate whether it should be re-indexed.
        $this->conf['mtime'] = $pObj->register['SYS_LASTCHANGED'] ?? $pObj->page['SYS_LASTCHANGED'];
        // Configuration of behavior:
        // Whether to index external documents like PDF, DOC etc. (if possible)
        $this->conf['index_externals'] = $pObj->config['config']['index_externals'] ?? null;
        // Length of description text (max 250, default 200)
        $this->conf['index_descrLgd'] = $pObj->config['config']['index_descrLgd'] ?? null;
        // Whether to index document keywords and description (if present)
        $this->conf['index_metatags'] = $pObj->config['config']['index_metatags'] ?? true;
        // Set to zero:
        $this->conf['recordUid'] = 0;
        $this->conf['freeIndexUid'] = 0;
        $this->conf['freeIndexSetId'] = 0;
        // Init and start indexing:
        $this->init();
        $this->indexTypo3PageContent();
    }
    $this->log_pull();
}
336
337 /****************************
338 *
339 * Backend API
340 *
341 ****************************/
/**
 * Initializing the "combined ID" of the page (phash) being indexed (or for which external media is attached)
 *
 * Fills $this->conf with the page identification and default behaviour settings and then calls init().
 *
 * @param int $id The page uid, &id=
 * @param int $type The page type, &type=
 * @param int $sys_language_uid sys_language uid, typically &L=
 * @param string $MP The MP variable (Mount Points), &MP=
 * @param array $uidRL Rootline array of only UIDs.
 * @param array $cHash_array Array of GET variables to register with this indexing
 * @param bool $createCHash If set, calculates a cHash value from the $cHash_array. Probably you will not do that since such cases are indexed through the frontend and the idea of this interface is to index non-cacheable pages from the backend!
 */
public function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array = [], $createCHash = false)
{
    // Information about the page for which the indexing takes place
    $this->conf = [
        // Page id (int)
        'id' => $id,
        // Page type (int)
        'type' => $type,
        // sys_language UID of the language of the indexing (int)
        'sys_language_uid' => $sys_language_uid,
        // MP variable, if any (Mount Points) (string)
        'MP' => $MP,
        // Group list (hardcoded for now...)
        'gr_list' => '0,-1',
    ];
    // cHash string for additional parameters
    if (!$createCHash) {
        $this->conf['cHash'] = '';
    } else {
        /* @var $calculator \TYPO3\CMS\Frontend\Page\CacheHashCalculator */
        $calculator = GeneralUtility::makeInstance(\TYPO3\CMS\Frontend\Page\CacheHashCalculator::class);
        $this->conf['cHash'] = $calculator->generateForParameters(GeneralUtility::implodeArrayForUrl('', $cHash_array));
    }
    // None of the keys below exist yet, so the union appends them in the listed order
    $this->conf += [
        // Array of the additional parameters
        'cHash_array' => $cHash_array,
        // Set to defaults
        'freeIndexUid' => 0,
        'freeIndexSetId' => 0,
        'page_cache_reg1' => '',
        // Root line uids
        'rootline_uids' => $uidRL,
        // Whether to index external documents like PDF, DOC etc. (if possible)
        'index_externals' => 1,
        // Length of description text (max 250, default 200)
        'index_descrLgd' => 200,
        // Whether to index document keywords and description (if present)
        'index_metatags' => true,
    ];
    // Init and start indexing:
    $this->init();
}
395
/**
 * Sets the free-index uid. Can be called right after backend_initIndexer()
 *
 * Both values are stored in $this->conf under the keys "freeIndexUid" and "freeIndexSetId".
 *
 * @param int $freeIndexUid Free index UID
 * @param int $freeIndexSetId Set id - an integer identifying the "set" of indexing operations.
 */
public function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId = 0)
{
    $settings = [
        'freeIndexUid' => $freeIndexUid,
        'freeIndexSetId' => $freeIndexSetId,
    ];
    foreach ($settings as $key => $value) {
        $this->conf[$key] = $value;
    }
}
407
/**
 * Indexing records as the content of a TYPO3 page.
 *
 * The given values are HTML-escaped and wrapped into a minimal HTML document which is
 * stored in $this->conf['content'] and then indexed through indexTypo3PageContent().
 *
 * @param string $title Title equivalent
 * @param string $keywords Keywords equivalent
 * @param string $description Description equivalent
 * @param string $content The main content to index
 * @param string $charset The charset of the title, keyword, description and body-content. MUST BE VALID, otherwise nothing is indexed!
 * @param int $mtime Last modification time, in seconds
 * @param int $crdate The creation date of the content, in seconds
 * @param int $recordUid The record UID that the content comes from (for registration with the indexed rows)
 */
public function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate = 0, $recordUid = 0)
{
    // Most recent modification time (seconds) of the content
    $this->conf['mtime'] = $mtime;
    // The creation date of the TYPO3 content
    $this->conf['crdate'] = $crdate;
    // UID of the record, if applicable
    $this->conf['recordUid'] = $recordUid;
    // Escape all parts before they are embedded into the fake HTML document
    $escapedTitle = htmlspecialchars($title);
    $escapedKeywords = htmlspecialchars($keywords);
    $escapedDescription = htmlspecialchars($description);
    $escapedBody = htmlspecialchars($content);
    // Content string (fake HTML of a TYPO3 page) used for parsing:
    $this->conf['content'] = '
<html>
<head>
<title>' . $escapedTitle . '</title>
<meta name="keywords" content="' . $escapedKeywords . '" />
<meta name="description" content="' . $escapedDescription . '" />
</head>
<body>
' . $escapedBody . '
</body>
</html>';
    // Character set of content (will be converted to utf-8 during indexing)
    $this->conf['metaCharset'] = $charset;
    // Alternative title for indexing
    $this->conf['indexedDocTitle'] = '';
    // Index content as if it was a TYPO3 page:
    $this->indexTypo3PageContent();
}
450
451 /********************************
452 *
453 * Initialization
454 *
455 *******************************/
/**
 * Initializes the object. $this->conf MUST be set with proper values prior to this call!!!
 *
 * Sets up cHash parameters, the page hashes, limits from the extension configuration,
 * external parsers (if enabled), the lexer, the optional metaphone hook and the charset
 * converter. Configuration keys that are not set are read with a fallback instead of
 * raising "undefined index" notices.
 */
public function init()
{
    // Initializing:
    $this->cHashParams = $this->conf['cHash_array'] ?? [];
    if (is_array($this->cHashParams) && !empty($this->cHashParams)) {
        if (!empty($this->conf['cHash'])) {
            // Add this so that URL's come out right...
            $this->cHashParams['cHash'] = $this->conf['cHash'];
        }
        unset($this->cHashParams['encryptionKey']);
    }
    // Setting phash / phash_grouping which identifies the indexed page based on some of these variables:
    $this->setT3Hashes();
    // Indexer configuration from Extension Manager interface:
    $this->indexerConfig = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get('indexed_search');
    // minAge / maxAge are multiplied by 3600 before being stored in the tstamp_* properties
    $this->tstamp_minAge = MathUtility::forceIntegerInRange(($this->indexerConfig['minAge'] ?? 0) * 3600, 0);
    $this->tstamp_maxAge = MathUtility::forceIntegerInRange(($this->indexerConfig['maxAge'] ?? 0) * 3600, 0);
    $this->maxExternalFiles = MathUtility::forceIntegerInRange($this->indexerConfig['maxExternalFiles'] ?? null, 0, 1000, 5);
    $this->flagBitMask = MathUtility::forceIntegerInRange($this->indexerConfig['flagBitMask'] ?? 0, 0, 255);
    // Workaround: If the extension configuration was not updated yet, the value is not existing
    $this->enableMetaphoneSearch = !isset($this->indexerConfig['enableMetaphoneSearch']) || $this->indexerConfig['enableMetaphoneSearch'];
    $this->storeMetaphoneInfoAsWords = !IndexedSearchUtility::isTableUsed('index_words') && $this->enableMetaphoneSearch;
    // Initialize external document parsers:
    // Example configuration, see ext_localconf.php of this file!
    if (!empty($this->conf['index_externals'])) {
        $this->initializeExternalParsers();
    }
    $extensionHooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search'] ?? [];
    // Initialize lexer (class that deconstructs the text into words):
    // An unset or empty "lexer" entry falls back to the default Lexer class.
    $lexerObjectClassName = ($extensionHooks['lexer'] ?? '') ?: Lexer::class;
    $this->lexerObj = GeneralUtility::makeInstance($lexerObjectClassName);
    $this->lexerObj->debug = $this->indexerConfig['debugMode'] ?? false;
    // Initialize metaphone hook:
    // Make sure that the hook is loaded _after_ indexed_search as this may overwrite the hook depending on the configuration.
    $metaphoneClassName = $extensionHooks['metaphone'] ?? '';
    if ($this->enableMetaphoneSearch && $metaphoneClassName) {
        $this->metaphoneObj = GeneralUtility::makeInstance($metaphoneClassName);
        $this->metaphoneObj->pObj = $this;
    }
    // Init charset class:
    $this->csObj = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Charset\CharsetConverter::class);
}
499
/**
 * Initialize external parsers
 *
 * Instantiates every parser class registered in
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'] (keyed by file
 * extension) and keeps only those whose initParser() call succeeds.
 *
 * @access private
 * @see init()
 */
public function initializeExternalParsers()
{
    $registeredParsers = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'] ?? [];
    foreach ($registeredParsers as $extension => $className) {
        $parser = GeneralUtility::makeInstance($className);
        // Register first, so the parser is reachable through this object while it initializes
        $this->external_parsers[$extension] = $parser;
        $parser->pObj = $this;
        if ($parser->initParser($extension)) {
            continue;
        }
        // Init parser returned FALSE, so unset its entry again:
        unset($this->external_parsers[$extension]);
    }
}
517
518 /********************************
519 *
520 * Indexing; TYPO3 pages (HTML content)
521 *
522 *******************************/
/**
 * Start indexing of the TYPO3 page
 *
 * Indexes the page when the mtime check asks for it, when the gr_list is not yet registered
 * or when indexing is forced. If an identical content hash already exists (and the max-age
 * was not exceeded) only timestamp, set id, gr_list and rootline are updated.
 */
public function indexTypo3PageContent()
{
    $check = $this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']);
    $is_grlist = $this->is_grlist_set($this->hash['phash']);
    if (!($check > 0) && $is_grlist && !$this->forceIndexing) {
        $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
        return;
    }
    // Setting message:
    if ($this->forceIndexing) {
        $reason = 'Forced';
    } elseif ($check > 0) {
        $reason = $this->reasons[$check];
    } else {
        $reason = 'Updates gr_list!';
    }
    $this->log_setTSlogMessage('Indexing needed, reason: ' . $reason, 1);
    // Divide into title,keywords,description and body:
    $this->log_push('Split content', '');
    $this->contentParts = $this->splitHTMLContent($this->conf['content']);
    if ($this->conf['indexedDocTitle']) {
        $this->contentParts['title'] = $this->conf['indexedDocTitle'];
    }
    $this->log_pull();
    // Calculating a hash over what is to be the actual page content. Maybe this hash should not include title,description and keywords? The bodytext is the primary concern. (on the other hand a changed page-title would make no difference then, so don't!)
    $this->content_md5h = IndexedSearchUtility::md5inthash(implode('', $this->contentParts));
    // This function checks if there is already a page (with gr_list = 0,-1) indexed and if that page has the very same contentHash.
    // If the contentHash is the same, then we can rest assured that this page is already indexed and regardless of mtime and origContent we don't need to do anything more.
    // This will also prevent pages from being indexed if a fe_users has logged in and it turns out that the page content is not changed anyway. fe_users logged in should always search with hash_gr_list = "0,-1" OR "[their_group_list]". This situation will be prevented only if the page has been indexed with no user login on before hand. Else the page will be indexed by users until that event. However that does not present a serious problem.
    $checkCHash = $this->checkContentHash();
    if (is_array($checkCHash) && $check !== 1) {
        // Update the timestamp
        $this->updateTstamp($this->hash['phash'], $this->conf['mtime']);
        $this->updateSetId($this->hash['phash']);
        // $checkCHash['phash'] is the phash of the result row that is similar to the current phash regarding the content hash.
        $this->update_grlist($checkCHash['phash'], $this->hash['phash']);
        $this->updateRootline();
        $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $this->content_md5h . ', has not changed. Timestamp, grlist and rootline updated if necessary.');
        return;
    }
    $startTime = GeneralUtility::milliseconds();
    $this->log_push('Converting charset of content (' . $this->conf['metaCharset'] . ') to utf-8', '');
    $this->charsetEntity2utf8($this->contentParts, $this->conf['metaCharset']);
    $this->log_pull();
    // Splitting words
    $this->log_push('Extract words from content', '');
    $splitInWords = $this->processWordsInArrays($this->contentParts);
    $this->log_pull();
    // Analyse the indexed words.
    $this->log_push('Analyse the extracted words', '');
    $indexArr = $this->indexAnalyze($splitInWords);
    $this->log_pull();
    // Submitting page (phash) record
    $this->log_push('Submitting page', '');
    $this->submitPage();
    $this->log_pull();
    // Check words and submit to word list if not there
    $this->log_push('Check word list and submit words', '');
    if (IndexedSearchUtility::isTableUsed('index_words')) {
        $this->checkWordList($indexArr);
        $this->submitWords($indexArr, $this->hash['phash']);
    }
    $this->log_pull();
    // Set parsetime
    $this->updateParsetime($this->hash['phash'], GeneralUtility::milliseconds() - $startTime);
    // Checking external files if configured for.
    $this->log_push('Checking external files', '');
    if ($this->conf['index_externals']) {
        $this->extractLinks($this->conf['content']);
    }
    $this->log_pull();
}
597
/**
 * Splits HTML content and returns an associative array, with title, a list of metatags, and a list of words in the body.
 *
 * Everything before the first "<body" is treated as head. If the document has no "<body" at
 * all, the whole document is treated as head and the body is empty.
 *
 * @param string $content HTML content to index. To some degree expected to be made by TYPO3 (ei. splitting the header by ":")
 * @return array Array of content, having keys "title", "body", "keywords" and "description" set.
 * @see splitRegularContent()
 */
public function splitHTMLContent($content)
{
    // divide head from body ( u-ouh :) )
    $contentArr = $this->defaultContentArray;
    $bodyPart = stristr($content, '<body');
    if ($bodyPart === false) {
        // Without this guard "substr($content, 0, -0)" would yield an empty head
        // and title / meta tags would be lost.
        $contentArr['body'] = '';
        $headPart = $content;
    } else {
        $contentArr['body'] = $bodyPart;
        $headPart = (string)substr($content, 0, -strlen($bodyPart));
    }
    // get title
    $dummy = null;
    $dummy2 = null;
    $this->embracingTags($headPart, 'TITLE', $contentArr['title'], $dummy2, $dummy);
    // Only the part after the first ":" is used, if there is one
    $titleParts = explode(':', $contentArr['title'], 2);
    $contentArr['title'] = trim($titleParts[1] ?? $titleParts[0]);
    // get keywords and description metatags
    if (!empty($this->conf['index_metatags'])) {
        $meta = [];
        $i = 0;
        // Each successful call stores the tag attributes in $meta[$i] and shortens $headPart
        while ($this->embracingTags($headPart, 'meta', $dummy, $headPart, $meta[$i])) {
            $i++;
        }
        // @todo The code below stops at first unset tag. Is that correct?
        for ($i = 0; isset($meta[$i]); $i++) {
            $attributes = GeneralUtility::get_tag_attributes($meta[$i]);
            // Meta tags like <meta charset="..."> have neither "name" nor "content"
            $metaName = $attributes['name'] ?? '';
            $metaContent = $attributes['content'] ?? '';
            if (stristr($metaName, 'keywords')) {
                $contentArr['keywords'] .= ',' . $this->addSpacesToKeywordList($metaContent);
            }
            if (stristr($metaName, 'description')) {
                $contentArr['description'] .= ',' . $metaContent;
            }
        }
    }
    // Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags:
    $this->typoSearchTags($contentArr['body']);
    // Get rid of unwanted sections (ie. scripting and style stuff) in body
    $tagList = explode(',', $this->excludeSections);
    foreach ($tagList as $tag) {
        while ($this->embracingTags($contentArr['body'], $tag, $dummy, $contentArr['body'], $dummy2)) {
        }
    }
    // remove tags, but first make sure we don't concatenate words by doing it
    $contentArr['body'] = str_replace('<', ' <', $contentArr['body']);
    $contentArr['body'] = trim(strip_tags($contentArr['body']));
    $contentArr['keywords'] = trim($contentArr['keywords']);
    $contentArr['description'] = trim($contentArr['description']);
    // Return array
    return $contentArr;
}
649
/**
 * Extract the charset value from HTML meta tag.
 *
 * Looks for a <meta http-equiv="CONTENT-TYPE" ...> tag and reads the "charset=" value from it.
 *
 * @param string $content HTML content
 * @return string|null The charset value if found, otherwise NULL.
 */
public function getHTMLcharset($content)
{
    $contentTypeTagPattern = '/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i';
    if (!preg_match($contentTypeTagPattern, $content, $tagMatch)) {
        return null;
    }
    $charsetPattern = '/charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)/i';
    if (!preg_match($charsetPattern, $tagMatch[0], $charsetMatch)) {
        return null;
    }
    return $charsetMatch[1];
}
664
/**
 * Converts a HTML document to utf-8
 *
 * @param string $content HTML content, any charset
 * @param string $charset Optional charset (otherwise extracted from HTML)
 * @return string Converted HTML
 */
public function convertHTMLToUtf8($content, $charset = '')
{
    // Find charset:
    $charset = $charset ?: $this->getHTMLcharset($content);
    // getHTMLcharset() yields NULL when no charset is declared; cast so that
    // strtolower() is never called with NULL (deprecated as of PHP 8.1).
    $charset = trim(strtolower((string)$charset));
    // Convert charset:
    if ($charset && $charset !== 'utf-8') {
        $content = mb_convert_encoding($content, 'utf-8', $charset);
    }
    // Convert entities, assuming document is now UTF-8:
    return html_entity_decode($content);
}
684
/**
 * Finds first occurrence of embracing tags and returns the embraced content and the original string with
 * the tag removed in the two passed variables. Returns FALSE if no match found. ie. useful for finding
 * <title> of document or removing <script>-sections
 *
 * If the start tag is found but the end tag is missing, $tagContent is set to an empty string and
 * $stringAfter to everything following the start tag. If the start tag is never closed by ">",
 * $stringAfter is an empty string.
 *
 * @param string $string String to search in
 * @param string $tagName Tag name, eg. "script"
 * @param string $tagContent Passed by reference: Content inside found tag
 * @param string $stringAfter Passed by reference: Content after found tag
 * @param string $paramList Passed by reference: Attributes of the found tag.
 * @return bool Returns FALSE if tag was not found, otherwise TRUE.
 */
public function embracingTags($string, $tagName, &$tagContent, &$stringAfter, &$paramList)
{
    $endTag = '</' . $tagName . '>';
    $startTag = '<' . $tagName;
    // stristr used because we want a case-insensitive search for the tag.
    $fromStartTag = stristr((string)$string, $startTag);
    // if the tag was not found, return FALSE
    if ($fromStartTag === false) {
        return false;
    }
    // Split "<attributes>" from the rest; the second part is missing when the tag is never closed by ">"
    $tagParts = explode('>', (string)substr($fromStartTag, strlen($startTag)), 2);
    $paramList = $tagParts[0];
    $afterStartTag = $tagParts[1] ?? '';
    $fromEndTag = stristr($afterStartTag, $endTag);
    if ($fromEndTag !== false) {
        // $fromStartTag is a suffix of $string, so the length difference is the start tag offset
        $stringBefore = substr($string, 0, strlen($string) - strlen($fromStartTag));
        $tagContent = substr($afterStartTag, 0, strlen($afterStartTag) - strlen($fromEndTag));
        $stringAfter = $stringBefore . substr($fromEndTag, strlen($endTag));
    } else {
        $tagContent = '';
        $stringAfter = $afterStartTag;
    }
    return true;
}
719
/**
 * Removes content that shouldn't be indexed according to TYPO3SEARCH-tags.
 *
 * The body is split at every "<!--TYPO3SEARCH_" marker. Content following a "begin" marker
 * is kept; at an "end" marker the text that preceded the markers (if no "begin" marker has
 * been seen since) is kept. If no marker is present, $body is left untouched.
 *
 * @param string $body HTML Content, passed by reference
 * @return bool Returns TRUE if a TYPOSEARCH_ tag was found, otherwise FALSE.
 */
public function typoSearchTags(&$body)
{
    $expBody = preg_split('/\\<\\!\\-\\-[\\s]?TYPO3SEARCH_/', (string)$body);
    // preg_split() returns FALSE on failure; treat that like "no marker found"
    if (!is_array($expBody) || count($expBody) <= 1) {
        return false;
    }
    $body = '';
    // Initialized so that an "end" marker in the very first segment cannot read an undefined variable
    $prev = '';
    foreach ($expBody as $val) {
        $part = explode('-->', $val, 2);
        $marker = trim($part[0]);
        if ($marker === 'begin') {
            // The marker may lack its closing "-->", in which case there is no content part
            $body .= $part[1] ?? '';
            $prev = '';
        } elseif ($marker === 'end') {
            $body .= $prev;
        } else {
            $prev = $val;
        }
    }
    return true;
}
746
/**
 * Extract links (hrefs) from HTML content and if indexable media is found, it is indexed.
 *
 * External URLs (links with a scheme and no local path) are indexed only if "indexExternalURLs"
 * is enabled. Links without a query string that resolve to an existing local file are either
 * added to the crawler queue (if the crawler is used for external files) or indexed directly.
 *
 * @param string $content HTML content
 */
public function extractLinks($content)
{
    // Get links:
    $list = $this->extractHyperLinks($content);
    // Stays NULL unless the crawler is configured and loaded
    $crawler = null;
    if (!empty($this->indexerConfig['useCrawlerForExternalFiles']) && \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler')) {
        $crawler = GeneralUtility::makeInstance(\tx_crawler_lib::class);
    }
    // Traverse links:
    foreach ($list as $linkInfo) {
        // Decode entities:
        // localPath means: This file is sent by a download script. While the indexed URL has to point to $linkInfo['href'], the absolute path to the file is specified here!
        $linkSource = htmlspecialchars_decode($linkInfo['localPath'] ?: $linkInfo['href']);
        // Parse URL (parse_url() returns FALSE for seriously malformed URLs):
        $qParts = parse_url($linkSource) ?: [];
        // Check for jumpurl (TYPO3 specific thing...)
        $query = $qParts['query'] ?? '';
        if ($query && strpos($query, 'jumpurl=') !== false) {
            parse_str($query, $getP);
            $linkSource = $getP['jumpurl'] ?? '';
            $qParts = parse_url($linkSource) ?: [];
        }
        if (!$linkInfo['localPath'] && !empty($qParts['scheme'])) {
            if (!empty($this->indexerConfig['indexExternalURLs'])) {
                // Index external URL (http or otherwise)
                $this->indexExternalUrl($linkSource);
            }
            continue;
        }
        if (!empty($qParts['query'])) {
            continue;
        }
        $linkSource = urldecode($linkSource);
        if (GeneralUtility::isAllowedAbsPath($linkSource)) {
            $localFile = $linkSource;
        } else {
            $localFile = GeneralUtility::getFileAbsFileName(PATH_site . $linkSource);
        }
        if (!$localFile || !@is_file($localFile)) {
            continue;
        }
        // Index local file:
        if (is_object($crawler)) {
            $params = ['document' => $linkSource];
            if ($linkInfo['localPath']) {
                $params['alturl'] = $linkInfo['href'];
            }
            $params['conf'] = $this->conf;
            // The page content is not handed over to the queue entry
            unset($params['conf']['content']);
            $crawler->addQueueEntry_callBack(0, $params, Hook\CrawlerFilesHook::class, $this->conf['id']);
            $this->log_setTSlogMessage('media "' . $params['document'] . '" added to "crawler" queue.', 1);
        } elseif ($linkInfo['localPath']) {
            // pathinfo() returns an empty string if the file has no extension
            $ext = strtolower(pathinfo($linkSource, PATHINFO_EXTENSION));
            $this->indexRegularDocument($linkInfo['href'], false, $linkSource, $ext);
        } else {
            $this->indexRegularDocument($linkSource);
        }
    }
}
822
/**
 * Extracts all links to external documents from the HTML content string
 *
 * Anchors without a "href" attribute and pure fragment links ("#...") are skipped.
 *
 * @param string $html
 * @return array Array of hyperlinks (keys: tag, href, localPath (empty if not local))
 * @see extractLinks()
 */
public function extractHyperLinks($html)
{
    $htmlParser = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Html\HtmlParser::class);
    $htmlParts = $htmlParser->splitTags('a', $html);
    $hyperLinksData = [];
    foreach ($htmlParts as $index => $tagData) {
        // Only the odd-indexed parts are inspected as tags
        if ($index % 2 === 0) {
            continue;
        }
        $tagAttributes = $htmlParser->get_tag_attributes($tagData, true);
        $firstTagName = $htmlParser->getFirstTagName($tagData);
        if (strtolower($firstTagName) !== 'a') {
            continue;
        }
        // Anchors like <a name="top"> carry no href at all
        $href = (string)($tagAttributes[0]['href'] ?? '');
        if ($href && $href[0] !== '#') {
            $hyperLinksData[] = [
                'tag' => $tagData,
                'href' => $href,
                'localPath' => $this->createLocalPath($href)
            ];
        }
    }
    return $hyperLinksData;
}
852
/**
 * Extracts the "base href" from content string.
 *
 * The first base tag with a non-empty "href" attribute wins.
 *
 * @param string $html Content to analyze
 * @return string The base href or an empty string if not found
 */
public function extractBaseHref($html)
{
    $href = '';
    $htmlParser = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Html\HtmlParser::class);
    $htmlParts = $htmlParser->splitTags('base', $html);
    foreach ($htmlParts as $index => $tagData) {
        // Only the odd-indexed parts of the split result are inspected as tags
        if ($index % 2 === 0) {
            continue;
        }
        $tagAttributes = $htmlParser->get_tag_attributes($tagData, true);
        $firstTagName = $htmlParser->getFirstTagName($tagData);
        if (strtolower($firstTagName) !== 'base') {
            continue;
        }
        // A base tag without "href" must neither raise a notice nor turn the
        // documented string return value into NULL
        $href = $tagAttributes[0]['href'] ?? '';
        if ($href) {
            break;
        }
    }
    return $href;
}
878
879 /******************************************
880 *
881 * Indexing; external URL
882 *
883 ******************************************/
/**
 * Index External URLs HTML content
 *
 * The URL is only fetched and indexed when its "Content-Type" header
 * mentions "text/html". The fetched content is written to a temporary file
 * which is handed to indexRegularDocument() and removed afterwards.
 *
 * @param string $externalUrl URL, eg. "http://typo3.org/"
 * @see indexRegularDocument()
 */
public function indexExternalUrl($externalUrl)
{
    // Get headers; the lookup may fail (non-array result) or lack a Content-Type entry
    $urlHeaders = $this->getUrlHeaders($externalUrl);
    $contentType = is_array($urlHeaders) ? ($urlHeaders['Content-Type'] ?? '') : '';
    if (!stristr($contentType, 'text/html')) {
        return;
    }
    // The fetched content is also kept on the object
    $content = ($this->indexExternalUrl_content = GeneralUtility::getUrl($externalUrl));
    if ((string)$content === '') {
        return;
    }
    // Create temporary file:
    $tmpFile = GeneralUtility::tempnam('EXTERNAL_URL');
    if (!$tmpFile) {
        return;
    }
    GeneralUtility::writeFile($tmpFile, $content);
    // Index that file. Using "TRUE" for second parameter to force indexing of
    // external URLs (mtime doesn't make sense, does it?)
    $this->indexRegularDocument($externalUrl, true, $tmpFile, 'html');
    unlink($tmpFile);
}
909
/**
 * Getting HTTP request headers of URL
 *
 * Each response line is split at its first colon into key and value. Lines
 * without a colon are stored with the whole line as key and NULL as value.
 * Values are stored exactly as found after the colon (not trimmed).
 *
 * @param string $url The URL
 * @return mixed If no answer, returns FALSE. Otherwise an array where HTTP headers are keys
 */
public function getUrlHeaders($url)
{
    // Try to get the headers only
    $content = GeneralUtility::getUrl($url, 2);
    if ((string)$content === '') {
        // Explicit FALSE as documented (previously an implicit NULL)
        return false;
    }
    // Compile headers:
    $headers = GeneralUtility::trimExplode(LF, $content, true);
    $retVal = [];
    foreach ($headers as $line) {
        if (trim($line) === '') {
            break;
        }
        // A line without a colon yields a single element only; avoid the
        // "undefined offset" notice list() would raise for it
        $headerParts = explode(':', $line, 2);
        $retVal[$headerParts[0]] = $headerParts[1] ?? null;
    }
    return $retVal;
}
934
/**
 * Resolves a link source to a local file, if possible.
 *
 * A fixed list of resolver methods is tried in order; the first one that
 * returns a non-empty path wins.
 *
 * @param string $sourcePath
 * @return string Absolute path to file if file is local, else empty string
 */
protected function createLocalPath($sourcePath)
{
    static $pathFunctions = [
        'createLocalPathFromT3vars',
        'createLocalPathUsingAbsRefPrefix',
        'createLocalPathUsingDomainURL',
        'createLocalPathFromAbsoluteURL',
        'createLocalPathFromRelativeURL'
    ];
    $resolvedPath = '';
    foreach ($pathFunctions as $resolverMethod) {
        $resolvedPath = $this->{$resolverMethod}($sourcePath);
        if ($resolvedPath != '') {
            return $resolvedPath;
        }
    }
    // No resolver matched: hand back whatever the last resolver returned
    return $resolvedPath;
}
959
/**
 * Attempts to create a local file path from T3VARs. This is useful for
 * various download extensions that hide actual file name but still want the
 * file to be indexed.
 *
 * The lookup key is the short MD5 of $sourcePath inside
 * $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'].
 *
 * @param string $sourcePath
 * @return string Registered file path if it exists as a file, else empty string
 */
protected function createLocalPathFromT3vars($sourcePath)
{
    // The registry is optional; a missing entry must not raise a notice
    $indexLocalFiles = $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'] ?? null;
    if (!is_array($indexLocalFiles)) {
        return '';
    }
    $md5 = GeneralUtility::shortMD5($sourcePath);
    // Note: not using self::isAllowedLocalFile here because this method
    // is allowed to index files outside of the web site (for example,
    // protected downloads)
    if (isset($indexLocalFiles[$md5]) && is_file($indexLocalFiles[$md5])) {
        return $indexLocalFiles[$md5];
    }
    return '';
}
983
/**
 * Attempts to create a local file path by matching a current request URL.
 *
 * If $sourcePath begins with the TYPO3_SITE_URL, that prefix is replaced by
 * PATH_site and the result is validated with isAllowedLocalFile().
 *
 * @param string $sourcePath
 * @return string Local path, or empty string if the URL does not match or the file is not allowed
 */
protected function createLocalPathUsingDomainURL($sourcePath)
{
    $siteUrl = GeneralUtility::getIndpEnv('TYPO3_SITE_URL');
    $siteUrlLength = strlen($siteUrl);
    if (substr($sourcePath, 0, $siteUrlLength) != $siteUrl) {
        return '';
    }
    $candidate = PATH_site . substr($sourcePath, $siteUrlLength);
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1004
/**
 * Attempts to create a local file path by matching absRefPrefix. This
 * requires TSFE. If TSFE is missing, this function does nothing.
 *
 * @param string $sourcePath
 * @return string Local path, or empty string if the prefix does not match or the file is not allowed
 */
protected function createLocalPathUsingAbsRefPrefix($sourcePath)
{
    // TSFE may not be registered at all; read it without raising a notice
    $frontendController = $GLOBALS['TSFE'] ?? null;
    if (!$frontendController instanceof \TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController) {
        return '';
    }
    // absRefPrefix is an optional setting
    $absRefPrefix = $frontendController->config['config']['absRefPrefix'] ?? '';
    $absRefPrefixLength = strlen($absRefPrefix);
    if ($absRefPrefixLength === 0 || substr($sourcePath, 0, $absRefPrefixLength) != $absRefPrefix) {
        return '';
    }
    $localPath = PATH_site . substr($sourcePath, $absRefPrefixLength);
    return self::isAllowedLocalFile($localPath) ? $localPath : '';
}
1028
/**
 * Attempts to create a local file path from the absolute URL without
 * schema.
 *
 * Applies only to paths starting with "/"; the leading slash is dropped
 * and PATH_site is prepended.
 *
 * @param string $sourcePath
 * @return string Local path, or empty string if not applicable or the file is not allowed
 */
protected function createLocalPathFromAbsoluteURL($sourcePath)
{
    // "?? ''" keeps an empty $sourcePath from raising an "uninitialized string offset" notice
    if (($sourcePath[0] ?? '') !== '/') {
        return '';
    }
    $localPath = PATH_site . substr($sourcePath, 1);
    return self::isAllowedLocalFile($localPath) ? $localPath : '';
}
1048
/**
 * Attempts to create a local file path from the relative URL.
 *
 * A relative URL (see isRelativeURL()) is appended to PATH_site and the
 * result is validated with isAllowedLocalFile().
 *
 * @param string $sourcePath
 * @return string Local path, or empty string if the URL is not relative or the file is not allowed
 */
protected function createLocalPathFromRelativeURL($sourcePath)
{
    if (!self::isRelativeURL($sourcePath)) {
        return '';
    }
    $candidate = PATH_site . $sourcePath;
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1066
/**
 * Checks if URL is relative.
 *
 * A URL counts as relative when it has no scheme and its path does not
 * start with "/".
 *
 * @param string $url
 * @return bool
 */
protected static function isRelativeURL($url)
{
    $urlParts = @parse_url($url);
    // parse_url() only returns the components that are present (and FALSE on
    // seriously malformed input), so both keys have to be read defensively
    $scheme = $urlParts['scheme'] ?? '';
    $firstPathCharacter = $urlParts['path'][0] ?? '';
    return $scheme == '' && $firstPathCharacter !== '/';
}
1078
/**
 * Checks if the path points to the file inside the web site
 *
 * The path is first passed through GeneralUtility::resolveBackPath(); it
 * must then start with PATH_site and be an existing file.
 *
 * @param string $filePath
 * @return bool
 */
protected static function isAllowedLocalFile($filePath)
{
    $resolvedPath = GeneralUtility::resolveBackPath($filePath);
    $startsWithSitePath = substr($resolvedPath, 0, strlen(PATH_site)) == PATH_site;
    $existsAsFile = is_file($resolvedPath);
    return $startsWithSitePath && $existsAsFile;
}
1092
1093 /******************************************
1094 *
1095 * Indexing; external files (PDF, DOC, etc)
1096 *
1097 ******************************************/
/**
 * Indexing a regular document given as $file (relative to PATH_site, local file)
 *
 * For every section pointer reported by fileContentParts() the document is
 * (re-)indexed when checkMtimeTstamp() returns a positive reason code or
 * $force is set. A section record is submitted for every section pointer,
 * whether or not the content was indexed.
 *
 * @param string $file Relative Filename, relative to PATH_site. It can also be an absolute path as long as it is inside the lockRootPath (validated with \TYPO3\CMS\Core\Utility\GeneralUtility::isAbsPath()). Finally, if $contentTmpFile is set, this value can be anything, most likely a URL
 * @param bool $force If set, indexing is forced (despite content hashes, mtime etc).
 * @param string $contentTmpFile Temporary file with the content to read it from (instead of $file). Used when the $file is a URL.
 * @param string $altExtension File extension for temporary file.
 */
public function indexRegularDocument($file, $force = false, $contentTmpFile = '', $altExtension = '')
{
    // Init
    $fI = pathinfo($file);
    // pathinfo() omits "extension" for names without a dot, hence the fallback
    $ext = $altExtension ?: strtolower($fI['extension'] ?? '');
    // Create abs-path:
    if (!$contentTmpFile) {
        if (!GeneralUtility::isAbsPath($file)) {
            // Relative, prepend PATH_site:
            $absFile = GeneralUtility::getFileAbsFileName(PATH_site . $file);
        } else {
            // Absolute, pass-through:
            $absFile = $file;
        }
        $absFile = GeneralUtility::isAllowedAbsPath($absFile) ? $absFile : '';
    } else {
        $absFile = $contentTmpFile;
    }
    // Indexing the document:
    if (!$absFile || !@is_file($absFile)) {
        $this->log_setTSlogMessage('Indexing not possible; File "' . $absFile . '" not found or valid.');
        return;
    }
    // No parser registered for this extension (checked without raising an "undefined index" notice)
    if (empty($this->external_parsers[$ext])) {
        $this->log_setTSlogMessage('Indexing not possible; The extension "' . $ext . '" was not supported.');
        return;
    }
    $fileInfo = stat($absFile);
    $cParts = $this->fileContentParts($ext, $absFile);
    foreach ($cParts as $cPKey) {
        $this->internal_log = [];
        $this->log_push('Index: ' . str_replace('.', '_', basename($file)) . ($cPKey ? '#' . $cPKey : ''), '');
        $Pstart = GeneralUtility::milliseconds();
        $subinfo = ['key' => $cPKey];
        // Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
        $phash_arr = ($this->file_phash_arr = $this->setExtHashes($file, $subinfo));
        $check = $this->checkMtimeTstamp($fileInfo['mtime'], $phash_arr['phash']);
        if ($check > 0 || $force) {
            if ($check > 0) {
                $this->log_setTSlogMessage('Indexing needed, reason: ' . $this->reasons[$check], 1);
            } else {
                $this->log_setTSlogMessage('Indexing forced by flag', 1);
            }
            // Check external file counter:
            if ($this->externalFileCounter < $this->maxExternalFiles || $force) {
                // Divide into title,keywords,description and body:
                $this->log_push('Split content', '');
                $contentParts = $this->readFileContent($ext, $absFile, $cPKey);
                $this->log_pull();
                if (is_array($contentParts)) {
                    // Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
                    // Separator first: the reversed argument order of implode() is deprecated
                    $content_md5h = IndexedSearchUtility::md5inthash(implode('', $contentParts));
                    if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force) {
                        // Increment counter:
                        $this->externalFileCounter++;
                        // Splitting words
                        $this->log_push('Extract words from content', '');
                        $splitInWords = $this->processWordsInArrays($contentParts);
                        $this->log_pull();
                        // Analyse the indexed words.
                        $this->log_push('Analyse the extracted words', '');
                        $indexArr = $this->indexAnalyze($splitInWords);
                        $this->log_pull();
                        // Submitting page (phash) record
                        $this->log_push('Submitting page', '');
                        // Unfortunately I cannot determine WHEN a file is originally made - so I must return the modification time...
                        $this->submitFilePage($phash_arr, $file, $subinfo, $ext, $fileInfo['mtime'], $fileInfo['ctime'], $fileInfo['size'], $content_md5h, $contentParts);
                        $this->log_pull();
                        // Check words and submit to word list if not there
                        $this->log_push('Check word list and submit words', '');
                        if (IndexedSearchUtility::isTableUsed('index_words')) {
                            $this->checkWordList($indexArr);
                            $this->submitWords($indexArr, $phash_arr['phash']);
                        }
                        $this->log_pull();
                        // Set parsetime
                        $this->updateParsetime($phash_arr['phash'], GeneralUtility::milliseconds() - $Pstart);
                    } else {
                        // Update the timestamp
                        $this->updateTstamp($phash_arr['phash'], $fileInfo['mtime']);
                        $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $content_md5h . ', has not changed. Timestamp updated.');
                    }
                } else {
                    $this->log_setTSlogMessage('Could not index file! Unsupported extension.');
                }
            } else {
                $this->log_setTSlogMessage('The limit of ' . $this->maxExternalFiles . ' has already been exceeded, so no indexing will take place this time.');
            }
        } else {
            $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
        }
        // Checking and setting sections:
        // Setting a section-record for the file. This is done also if the file is not indexed. Notice that section records are deleted when the page is indexed.
        $this->submitFile_section($phash_arr['phash']);
        $this->log_pull();
    }
}
1203
/**
 * Reads the content of an external file being indexed.
 * The content from the external parser MUST be returned in utf-8!
 *
 * The work is delegated to the parser object registered for the extension
 * in $this->external_parsers.
 *
 * @param string $fileExtension File extension, eg. "pdf", "doc" etc.
 * @param string $absoluteFileName Absolute filename of file (must exist and be validated OK before calling function)
 * @param string $sectionPointer Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be splitted.)
 * @return array|null Standard content array (title, description, keywords, body keys), NULL if no parser object is registered for the extension
 */
public function readFileContent($fileExtension, $absoluteFileName, $sectionPointer)
{
    // Consult relevant external document parser (an unregistered extension must not raise a notice):
    $parser = $this->external_parsers[$fileExtension] ?? null;
    if (!is_object($parser)) {
        return null;
    }
    return $parser->readFileContent($fileExtension, $absoluteFileName, $sectionPointer);
}
1222
/**
 * Creates an array with pointers to divisions of document.
 *
 * The work is delegated to the parser object registered for the extension
 * in $this->external_parsers; without one, the single pointer 0 is returned.
 *
 * @param string $ext File extension
 * @param string $absFile Absolute filename (must exist and be validated OK before calling function)
 * @return array Array of pointers to sections that the document should be divided into
 */
public function fileContentParts($ext, $absFile)
{
    // Consult relevant external document parser (an unregistered extension must not raise a notice):
    $parser = $this->external_parsers[$ext] ?? null;
    if (!is_object($parser)) {
        return [0];
    }
    return $parser->fileContentParts($ext, $absFile);
}
1239
/**
 * Wraps non-HTML content (from external files for instance) into a content array
 *
 * Starts from a copy of $this->defaultContentArray and stores the input
 * under the "body" key.
 *
 * @param string $content Input content (non-HTML) to index.
 * @return array Array of content, having the key "body" set (plus "title", "description" and "keywords", but empty)
 * @see splitHTMLContent()
 */
public function splitRegularContent($content)
{
    $parts = $this->defaultContentArray;
    $parts['body'] = $content;

    return $parts;
}
1253
1254 /**********************************
1255 *
1256 * Analysing content, Extracting words
1257 *
1258 **********************************/
/**
 * Converts every non-empty value of the content array to utf-8 (unless the
 * charset already is "utf-8") and decodes HTML entities in it.
 *
 * @param array $contentArr Standard content array, modified in place
 * @param string $charset Charset of the input content (converted to utf-8)
 */
public function charsetEntity2utf8(&$contentArr, $charset)
{
    $needsConversion = $charset !== 'utf-8';
    foreach ($contentArr as $key => $value) {
        // Empty values are left untouched
        if ((string)$value === '') {
            continue;
        }
        if ($needsConversion) {
            $value = mb_convert_encoding($value, 'utf-8', $charset);
        }
        // decode all numeric / html-entities in the string to real characters:
        $contentArr[$key] = html_entity_decode($value);
    }
}
1278
/**
 * Processing words in the array from split*Content -functions
 *
 * Each value is replaced by the result of the lexer's split2Words();
 * duplicates are then removed from "title", "keywords" and "description".
 *
 * @param array $contentArr Array of content to index, see splitHTMLContent() and splitRegularContent()
 * @return array Content input array modified so each key is now an array of words
 */
public function processWordsInArrays($contentArr)
{
    // split all parts to words
    foreach ($contentArr as $key => $value) {
        $contentArr[$key] = $this->lexerObj->split2Words($value);
    }
    // For title, keywords, and description we don't want duplicates:
    foreach (['title', 'keywords', 'description'] as $uniqueField) {
        $contentArr[$uniqueField] = array_unique($contentArr[$uniqueField]);
    }

    return $contentArr;
}
1298
/**
 * Extracts the sample description text from the content array.
 *
 * Whitespace runs in the body are collapsed to single spaces and the result
 * is cut to the length configured in $this->conf['index_descrLgd']
 * (passed through MathUtility::forceIntegerInRange() with the arguments 0, 255, 200).
 *
 * @param array $contentArr Content array
 * @return string Description string, empty if the resulting maximum length is 0
 */
public function bodyDescription($contentArr)
{
    // Always defined, so a maximum length of 0 returns '' instead of an undefined variable
    $bodyDescription = '';
    // Setting description
    $maxL = MathUtility::forceIntegerInRange($this->conf['index_descrLgd'] ?? null, 0, 255, 200);
    if ($maxL) {
        $bodyDescription = preg_replace('/\s+/u', ' ', $contentArr['body'] ?? '');
        // Shorten the string:
        $bodyDescription = mb_strcut($bodyDescription, 0, $maxL, 'utf-8');
    }
    return $bodyDescription;
}
1316
/**
 * Analyzes content to use for indexing.
 *
 * The header fields are processed first, each with its own bit offset
 * (title 7, keywords 6, description 5), followed by the body.
 *
 * @param array $content Standard content array: an array with the keys title,keywords,description and body, which all contain an array of words.
 * @return array Index array as filled by analyzeHeaderinfo() and analyzeBody()
 */
public function indexAnalyze($content)
{
    $indexArr = [];
    $headerOffsets = [
        'title' => 7,
        'keywords' => 6,
        'description' => 5
    ];
    foreach ($headerOffsets as $field => $offset) {
        $this->analyzeHeaderinfo($indexArr, $content, $field, $offset);
    }
    $this->analyzeBody($indexArr, $content);

    return $indexArr;
}
1332
/**
 * Calculates relevant information for headercontent
 *
 * For every word of $content[$key] the entry in $retArr gets its "hash" and
 * "metaphone" set on first sight, the bit 2^$offset OR-ed into "cmp" and
 * "count" incremented. $this->wordcount is incremented per word as well.
 *
 * @param array $retArr Index array, passed by reference
 * @param array $content Standard content array
 * @param string $key Key from standard content array
 * @param int $offset Bit-wise priority to type
 */
public function analyzeHeaderinfo(&$retArr, $content, $key, $offset)
{
    foreach ($content[$key] as $val) {
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        $val = substr($val, 0, 60);
        if (!isset($retArr[$val])) {
            // Word ID (wid)
            $retArr[$val]['hash'] = IndexedSearchUtility::md5inthash($val);
            // Metaphone value is also 60 only chars long
            $metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
            $retArr[$val]['metaphone'] = $metaphone;
        }
        // Build metaphone fulltext string (can be used for fulltext indexing)
        if ($this->storeMetaphoneInfoAsWords) {
            $this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
        }
        // Priority used for flagBitMask feature (see extension configuration).
        // "cmp" does not exist yet for a word seen for the first time.
        $retArr[$val]['cmp'] = ($retArr[$val]['cmp'] ?? 0) | pow(2, $offset);
        // Increase number of occurrences ("count" starts at 0 for a new word)
        $retArr[$val]['count'] = ($retArr[$val]['count'] ?? 0) + 1;
        $this->wordcount++;
    }
}
1364
/**
 * Calculates relevant information for bodycontent
 *
 * For every word of $content['body'] the entry in $retArr gets "first"
 * (array key of its first occurrence), "hash" and "metaphone" set on first
 * sight and "count" incremented. $this->wordcount is incremented per word as well.
 *
 * @param array $retArr Index array, passed by reference
 * @param array $content Standard content array
 */
public function analyzeBody(&$retArr, $content)
{
    foreach ($content['body'] as $key => $val) {
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        $val = substr($val, 0, 60);
        if (!isset($retArr[$val])) {
            // First occurrence (used for ranking results)
            $retArr[$val]['first'] = $key;
            // Word ID (wid)
            $retArr[$val]['hash'] = IndexedSearchUtility::md5inthash($val);
            // Metaphone value is also only 60 chars long
            $metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
            $retArr[$val]['metaphone'] = $metaphone;
        }
        // Build metaphone fulltext string (can be used for fulltext indexing)
        if ($this->storeMetaphoneInfoAsWords) {
            $this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
        }
        // Increase number of occurrences ("count" does not exist yet for a new word)
        $retArr[$val]['count'] = ($retArr[$val]['count'] ?? 0) + 1;
        $this->wordcount++;
    }
}
1394
/**
 * Creating metaphone based hash from input word
 *
 * Uses $this->metaphoneObj when it is an object, otherwise PHP's native
 * metaphone() function.
 *
 * @param string $word Word to convert
 * @param bool $returnRawMetaphoneValue If set, returns the raw metaphone value (not hashed)
 * @return mixed Metaphone hash integer (or raw value, string); 0 if the raw value is an empty string
 */
public function metaphone($word, $returnRawMetaphoneValue = false)
{
    // Native PHP function is the fallback for the advanced doubleMetaphone class
    $rawValue = is_object($this->metaphoneObj)
        ? $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid'])
        : metaphone($word);

    if ($returnRawMetaphoneValue) {
        return $rawValue;
    }
    if ($rawValue !== '') {
        // Create hash and return integer
        return IndexedSearchUtility::md5inthash($rawValue);
    }
    return 0;
}
1420
1421 /********************************
1422 *
1423 * SQL; TYPO3 Pages
1424 *
1425 *******************************/
/**
 * Updates db with information about the page (TYPO3 page, not external media)
 *
 * Old records for the current phash are removed first. Then, in this order:
 * index_phash row, section record, gr_list record, index_fulltext row and,
 * when debugMode is configured, an index_debug row. Inserts are skipped
 * for tables reported as unused by IndexedSearchUtility::isTableUsed().
 */
public function submitPage()
{
    $phash = $this->hash['phash'];
    // Remove any current data for this phash:
    $this->removeOldIndexedPages($phash);

    // setting new phash_row
    $conf = $this->conf;
    $phashRecord = [
        'phash' => $phash,
        'phash_grouping' => $this->hash['phash_grouping'],
        'cHashParams' => serialize($this->cHashParams),
        'contentHash' => $this->content_md5h,
        'data_page_id' => $conf['id'],
        'data_page_reg1' => $conf['page_cache_reg1'],
        'data_page_type' => $conf['type'],
        'data_page_mp' => $conf['MP'],
        'gr_list' => $conf['gr_list'],
        // TYPO3 page
        'item_type' => 0,
        'item_title' => $this->contentParts['title'],
        'item_description' => $this->bodyDescription($this->contentParts),
        'item_mtime' => (int)$conf['mtime'],
        'item_size' => strlen($conf['content']),
        'tstamp' => $GLOBALS['EXEC_TIME'],
        'crdate' => $GLOBALS['EXEC_TIME'],
        // Creation date of page
        'item_crdate' => $conf['crdate'],
        // Sys language uid of the page. Should reflect which language it DOES actually display!
        'sys_language_uid' => $conf['sys_language_uid'],
        'externalUrl' => 0,
        'recordUid' => (int)$conf['recordUid'],
        'freeIndexUid' => (int)$conf['freeIndexUid'],
        'freeIndexSetId' => (int)$conf['freeIndexSetId']
    ];
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash')
            ->insert('index_phash', $phashRecord, ['cHashParams' => Connection::PARAM_LOB]);
    }

    // PROCESSING index_section
    $this->submit_section($phash, $phash);
    // PROCESSING index_grlist
    $this->submit_grlist($phash, $phash);

    // PROCESSING index_fulltext
    $fullText = implode(' ', $this->contentParts);
    $fullTextLimit = $this->indexerConfig['fullTextDataLength'];
    if ($fullTextLimit > 0) {
        $fullText = substr($fullText, 0, $fullTextLimit);
    }
    $fulltextRecord = [
        'phash' => $phash,
        'fulltextdata' => $fullText,
        'metaphonedata' => $this->metaphoneContent
    ];
    if (IndexedSearchUtility::isTableUsed('index_fulltext')) {
        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_fulltext')
            ->insert('index_fulltext', $fulltextRecord);
    }

    // PROCESSING index_debug
    if (!$this->indexerConfig['debugMode']) {
        return;
    }
    // "content" and "body" are shortened to 1000 bytes for the debug record
    $debugInfo = [
        'cHashParams' => $this->cHashParams,
        'external_parsers initialized' => array_keys($this->external_parsers),
        'conf' => array_merge($this->conf, ['content' => substr($this->conf['content'], 0, 1000)]),
        'contentParts' => array_merge($this->contentParts, ['body' => substr($this->contentParts['body'], 0, 1000)]),
        'logs' => $this->internal_log,
        'lexer' => $this->lexerObj->debugString
    ];
    $debugRecord = [
        'phash' => $phash,
        'debuginfo' => serialize($debugInfo)
    ];
    if (IndexedSearchUtility::isTableUsed('index_debug')) {
        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_debug')
            ->insert('index_debug', $debugRecord);
    }
}
1508
/**
 * Stores gr_list in the database.
 *
 * Inserts one row into index_grlist, built from the given hashes and
 * $this->conf['gr_list'], unless the table is reported as unused.
 *
 * @param int $hash Search result record phash
 * @param int $phash_x Actual phash of current content
 * @see update_grlist()
 */
public function submit_grlist($hash, $phash_x)
{
    // Setting the gr_list record
    $groupList = $this->conf['gr_list'];
    $grListRecord = [
        'phash' => $hash,
        'phash_x' => $phash_x,
        'hash_gr_list' => IndexedSearchUtility::md5inthash($groupList),
        'gr_list' => $groupList
    ];
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }
    GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist')
        ->insert('index_grlist', $grListRecord);
}
1531
/**
 * Stores section
 * $hash and $hash_t3 are the same for TYPO3 pages, but different when it is external files.
 *
 * The record is handed to getRootLineFields() before being inserted into
 * index_section (presumably to add rootline columns; the method is not
 * part of this excerpt).
 *
 * @param int $hash phash of TYPO3 parent search result record
 * @param int $hash_t3 phash of the file indexation search record
 */
public function submit_section($hash, $hash_t3)
{
    $sectionRecord = [
        'phash' => $hash,
        'phash_t3' => $hash_t3,
        'page_id' => (int)$this->conf['id']
    ];
    $this->getRootLineFields($sectionRecord);
    if (!IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }
    GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_section')
        ->insert('index_section', $sectionRecord);
}
1553
/**
 * Removes records for the indexed page, $phash
 *
 * Deletes the rows matching the phash from all index tables in use and
 * additionally every index_section row whose phash_t3 equals the phash.
 *
 * @param int $phash phash value to flush
 */
public function removeOldIndexedPages($phash)
{
    $pool = GeneralUtility::makeInstance(ConnectionPool::class);

    // Removing old registrations for all tables. Because the pages are TYPO3 pages
    // there can be nothing else than 1-1 relations here.
    foreach (['index_phash', 'index_section', 'index_grlist', 'index_fulltext', 'index_debug'] as $tableName) {
        if (!IndexedSearchUtility::isTableUsed($tableName)) {
            continue;
        }
        $pool->getConnectionForTable($tableName)->delete($tableName, ['phash' => (int)$phash]);
    }

    // Removing all index_section records with hash_t3 set to this hash (this includes such
    // records set for external media on the page as well!). The re-insert of these records
    // are done in indexRegularDocument($file).
    if (!IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }
    $pool->getConnectionForTable('index_section')
        ->delete('index_section', ['phash_t3' => (int)$phash]);
}
1579
1580 /********************************
1581 *
1582 * SQL; External media
1583 *
1584 *******************************/
/**
 * Updates db with information about the file
 *
 * Old records for the file's phash are removed first; then the
 * index_phash and index_fulltext rows are written and, when debugMode is
 * configured, an index_debug row. Inserts are skipped for tables reported
 * as unused by IndexedSearchUtility::isTableUsed().
 *
 * @param array $hash Array with phash and phash_grouping keys for file
 * @param string $file File name
 * @param array $subinfo Array of "cHashParams" for files: This is for instance the page index for a PDF file (other document types it will be a zero)
 * @param string $ext File extension determining the type of media.
 * @param int $mtime Modification time of file.
 * @param int $ctime Creation time of file.
 * @param int $size Size of file in bytes
 * @param int $content_md5h Content HASH value.
 * @param array $contentParts Standard content array (using only title and body for a file)
 */
public function submitFilePage($hash, $file, $subinfo, $ext, $mtime, $ctime, $size, $content_md5h, $contentParts)
{
    // Find item Type: the parser's mapping for the extension, falling back to the
    // extension itself when there is no (or an empty) mapping
    $storeItemType = ($this->external_parsers[$ext]->ext2itemtype_map[$ext] ?? null) ?: $ext;
    // Remove any current data for this phash:
    $this->removeOldIndexedFiles($hash['phash']);
    // Split filename:
    $fileParts = parse_url($file);
    // Setting new
    $fields = [
        'phash' => $hash['phash'],
        'phash_grouping' => $hash['phash_grouping'],
        'cHashParams' => serialize($subinfo),
        'contentHash' => $content_md5h,
        'data_filename' => $file,
        'item_type' => $storeItemType,
        'item_title' => trim($contentParts['title'] ?? '') ?: basename($file),
        'item_description' => $this->bodyDescription($contentParts),
        'item_mtime' => $mtime,
        'item_size' => $size,
        'item_crdate' => $ctime,
        'tstamp' => $GLOBALS['EXEC_TIME'],
        'crdate' => $GLOBALS['EXEC_TIME'],
        'gr_list' => $this->conf['gr_list'],
        // parse_url() has no "scheme" key for plain local file names and returns FALSE
        // on malformed input; both cases count as "not an external URL"
        'externalUrl' => !empty($fileParts['scheme']) ? 1 : 0,
        'recordUid' => (int)$this->conf['recordUid'],
        'freeIndexUid' => (int)$this->conf['freeIndexUid'],
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId'],
        'sys_language_uid' => (int)$this->conf['sys_language_uid']
    ];
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash');
        $connection->insert(
            'index_phash',
            $fields,
            ['cHashParams' => Connection::PARAM_LOB]
        );
    }
    // PROCESSING index_fulltext
    $fields = [
        'phash' => $hash['phash'],
        'fulltextdata' => implode(' ', $contentParts),
        'metaphonedata' => $this->metaphoneContent
    ];
    if ($this->indexerConfig['fullTextDataLength'] > 0) {
        $fields['fulltextdata'] = substr($fields['fulltextdata'], 0, $this->indexerConfig['fullTextDataLength']);
    }
    if (IndexedSearchUtility::isTableUsed('index_fulltext')) {
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_fulltext');
        $connection->insert('index_fulltext', $fields);
    }
    // PROCESSING index_debug
    if ($this->indexerConfig['debugMode']) {
        $fields = [
            'phash' => $hash['phash'],
            'debuginfo' => serialize([
                'cHashParams' => $subinfo,
                // "body" is shortened to 1000 bytes for the debug record
                'contentParts' => array_merge($contentParts, ['body' => substr($contentParts['body'] ?? '', 0, 1000)]),
                'logs' => $this->internal_log,
                'lexer' => $this->lexerObj->debugString
            ])
        ];
        if (IndexedSearchUtility::isTableUsed('index_debug')) {
            $connection = GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('index_debug');
            $connection->insert('index_debug', $fields);
        }
    }
}
1670
/**
 * Stores file gr_list for a file IF it does not exist already
 *
 * A record counts as existing when index_grlist holds a row for the phash
 * whose hash_gr_list matches either the default group list or the current
 * group list from $this->conf.
 *
 * @param int $hash phash value of file
 */
public function submitFile_grlist($hash)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }

    // Testing if there is a gr_list record for a non-logged in user and if so, there is no need to place another one.
    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getQueryBuilderForTable('index_grlist');
    $expr = $queryBuilder->expr();

    $matchesPhash = $expr->eq(
        'phash',
        $queryBuilder->createNamedParameter($hash, \PDO::PARAM_INT)
    );
    $matchesDefaultGroups = $expr->eq(
        'hash_gr_list',
        $queryBuilder->createNamedParameter(
            IndexedSearchUtility::md5inthash($this->defaultGrList),
            \PDO::PARAM_INT
        )
    );
    $matchesCurrentGroups = $expr->eq(
        'hash_gr_list',
        $queryBuilder->createNamedParameter(
            IndexedSearchUtility::md5inthash($this->conf['gr_list']),
            \PDO::PARAM_INT
        )
    );

    $existingRecords = (int)$queryBuilder->count('*')
        ->from('index_grlist')
        ->where($matchesPhash, $expr->orX($matchesDefaultGroups, $matchesCurrentGroups))
        ->execute()
        ->fetchColumn();

    if ($existingRecords === 0) {
        $this->submit_grlist($hash, $hash);
    }
}
1716
/**
 * Stores a section record for a file unless one already exists.
 *
 * Looks in index_section for a row matching the given phash and the page id in
 * $this->conf['id']; when none is found, submit_section() is called with the file phash
 * and $this->hash['phash']. Nothing is done when the index_section table is not in use.
 *
 * @param int $hash phash value of file
 */
public function submitFile_section($hash)
{
    if (IndexedSearchUtility::isTableUsed('index_section')) {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable('index_section');

        $samePhash = $queryBuilder->expr()->eq(
            'phash',
            $queryBuilder->createNamedParameter($hash, \PDO::PARAM_INT)
        );
        $samePage = $queryBuilder->expr()->eq(
            'page_id',
            $queryBuilder->createNamedParameter($this->conf['id'], \PDO::PARAM_INT)
        );

        $existingSections = (int)$queryBuilder->count('phash')
            ->from('index_section')
            ->where($samePhash, $samePage)
            ->execute()
            ->fetchColumn();

        if ($existingSections === 0) {
            $this->submit_section($hash, $this->hash['phash']);
        }
    }
}
1750
/**
 * Deletes all rows for the given phash from index_phash, index_grlist, index_fulltext
 * and index_debug. Tables that are not in use are left untouched.
 *
 * @param int $phash phash value to flush
 */
public function removeOldIndexedFiles($phash)
{
    $pool = GeneralUtility::makeInstance(ConnectionPool::class);
    $criteria = ['phash' => (int)$phash];
    foreach (['index_phash', 'index_grlist', 'index_fulltext', 'index_debug'] as $tableName) {
        if (IndexedSearchUtility::isTableUsed($tableName)) {
            $pool->getConnectionForTable($tableName)->delete($tableName, $criteria);
        }
    }
}
1768
1769 /********************************
1770 *
1771 * SQL Helper functions
1772 *
1773 *******************************/
/**
 * Compares mtime / tstamp of the currently indexed page/file (looked up by phash) with the
 * configured age limits and decides whether indexing is required.
 *
 * Result codes (see $this->reasons):
 *  -2  minimum age was not exceeded, no indexing
 *  -1  mtime matched the indexed one, no indexing (tstamp is refreshed unless a max age is set)
 *   1  maximum age exceeded, index again
 *   2  mtime differs from the indexed one, index again
 *   3  no mtime given, index
 *   4  no indexed record found (or index_phash is not in use), index
 *
 * @param int $mtime mtime value to test against limits and indexed page (usually this is the mtime of the cached document)
 * @param int $phash "phash" used to select any already indexed page to see what its mtime is.
 * @return int <0 = no indexing, >0 = do indexing
 */
public function checkMtimeTstamp($mtime, $phash)
{
    // Without the index_phash table nothing can have been indexed yet
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return 4;
    }

    $indexed = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->select(
            ['item_mtime', 'tstamp'],
            'index_phash',
            ['phash' => (int)$phash],
            [],
            [],
            1
        )
        ->fetch();

    // Page has never been indexed (is not represented in the index_phash table).
    if (empty($indexed)) {
        return 4;
    }

    // The configured max-age was exceeded for the document and thus it is indexed.
    if ($this->tstamp_maxAge && $indexed['tstamp'] + $this->tstamp_maxAge < $GLOBALS['EXEC_TIME']) {
        return 1;
    }

    // A minimum age is configured and has not been exceeded yet.
    if ($this->tstamp_minAge && $indexed['tstamp'] + $this->tstamp_minAge >= $GLOBALS['EXEC_TIME']) {
        return -2;
    }

    // The minimum age was exceeded (or is not set), but no mtime was given, so the page must be indexed.
    if (!$mtime) {
        return 3;
    }

    // If mtime is set and differs from the index_phash mtime, it is time to re-index.
    if ($indexed['item_mtime'] != $mtime) {
        return 2;
    }

    // mtime matched the document, so no changes detected and no content updated
    if ($this->tstamp_maxAge) {
        $secondsLeft = $indexed['tstamp'] + $this->tstamp_maxAge - $GLOBALS['EXEC_TIME'];
        $this->log_setTSlogMessage('mtime matched, timestamp NOT updated because a maxAge is set (' . $secondsLeft . ' seconds to expire time).', 1);
    } else {
        $this->updateTstamp($phash);
        $this->log_setTSlogMessage('mtime matched, timestamp updated.', 1);
    }
    return -1;
}
1839
/**
 * Looks for an index_phash record with the current phash_grouping and content hash.
 *
 * The page only needs indexing when its content differs from every record of the same
 * "phash_grouping".
 *
 * @return mixed TRUE if the page needs to be indexed (no matching record, or index_phash is not in use), otherwise the matching row (an array holding "phash") to which the grlist record should be related
 */
public function checkContentHash()
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return true;
    }

    $criteria = [
        'phash_grouping' => (int)$this->hash['phash_grouping'],
        'contentHash' => (int)$this->content_md5h
    ];
    $match = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->select(['phash'], 'index_phash', $criteria, [], [], 1)
        ->fetch();

    return empty($match) ? true : $match;
}
1870
/**
 * Checks the content hash of an external document against index_phash.
 *
 * @param int $hashGr phash value to check (phash_grouping)
 * @param int $content_md5h Content hash to check
 * @return bool TRUE if the document needs to be indexed, i.e. no record with this grouping and content hash exists or index_phash is not in use
 */
public function checkExternalDocContentHash($hashGr, $content_md5h)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return true;
    }

    $criteria = [
        'phash_grouping' => (int)$hashGr,
        'contentHash' => (int)$content_md5h
    ];
    $matches = (int)GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->count('*', 'index_phash', $criteria);

    return $matches === 0;
}
1898
/**
 * Tells whether index_grlist contains a record for the given phash_x (the "real" phash of the
 * current content, not the linked-to phash of the common search result page).
 *
 * @param int $phash_x Phash integer to test.
 * @return bool TRUE if at least one record exists; FALSE otherwise or when index_grlist is not in use
 */
public function is_grlist_set($phash_x)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return false;
    }

    $records = (int)GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist')
        ->count('phash_x', 'index_grlist', ['phash_x' => (int)$phash_x]);

    return $records > 0;
}
1921
/**
 * Writes a grlist entry for the given phash and the current gr_list if none exists yet.
 *
 * @param int $phash phash of the search result that should be found
 * @param int $phash_x The real phash of the current content. The two values are different when a page with userlogin turns out to contain the exact same content as another already indexed version of the page; this is the whole reason for the grlist table in fact...
 * @see submit_grlist()
 */
public function update_grlist($phash, $phash_x)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }

    $criteria = [
        'phash' => (int)$phash,
        'hash_gr_list' => IndexedSearchUtility::md5inthash($this->conf['gr_list'])
    ];
    $existing = (int)GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist')
        ->count('phash', 'index_grlist', $criteria);

    if ($existing !== 0) {
        return;
    }

    $this->submit_grlist($phash, $phash_x);
    $message = 'Inserted gr_list \'' . $this->conf['gr_list'] . '\' for phash \'' . $phash . '\'';
    $this->log_setTSlogMessage($message, 1);
}
1949
/**
 * Sets the tstamp of an index_phash row to the current execution time and, optionally,
 * its item_mtime. Does nothing when index_phash is not in use.
 *
 * @param int $phash phash value
 * @param int $mtime If set, update the mtime field to this value.
 */
public function updateTstamp($phash, $mtime = 0)
{
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $changes = ['tstamp' => $GLOBALS['EXEC_TIME']];
        if ($mtime) {
            $changes['item_mtime'] = (int)$mtime;
        }

        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash');
        $connection->update('index_phash', $changes, ['phash' => (int)$phash]);
    }
}
1980
/**
 * Writes $this->conf['freeIndexSetId'] into the freeIndexSetId field of the index_phash
 * record. Does nothing when index_phash is not in use.
 *
 * @param int $phash phash value
 */
public function updateSetId($phash)
{
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash');
        $connection->update(
            'index_phash',
            ['freeIndexSetId' => (int)$this->conf['freeIndexSetId']],
            ['phash' => (int)$phash]
        );
    }
}
2004
/**
 * Stores the parse time on the index_phash row. Does nothing when index_phash is not in use.
 *
 * @param int $phash phash value.
 * @param int $parsetime Parsetime value to set.
 */
public function updateParsetime($phash, $parsetime)
{
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash');
        $connection->update(
            'index_phash',
            ['parsetime' => (int)$parsetime],
            ['phash' => (int)$phash]
        );
    }
}
2029
/**
 * Rewrites the root-line fields of all index_section rows belonging to the current page
 * ($this->conf['id']). Does nothing when index_section is not in use.
 */
public function updateRootline()
{
    if (IndexedSearchUtility::isTableUsed('index_section')) {
        // Filled by reference with rl0, rl1, rl2 and any additionally configured fields
        $rootLineValues = [];
        $this->getRootLineFields($rootLineValues);

        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_section');
        $connection->update(
            'index_section',
            $rootLineValues,
            ['page_id' => (int)$this->conf['id']]
        );
    }
}
2052
/**
 * Adds the root-line field values to the given field array.
 *
 * rl0, rl1 and rl2 are always set from levels 0-2 of $this->conf['rootline_uids'].
 * Further fields are added for each "field name => root line level" pair found in
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'].
 *
 * A level that is missing from the root line (e.g. a page located less than three levels
 * deep) results in 0, which equals the previous integer cast of the missing value but no
 * longer triggers an "undefined offset" notice/warning.
 *
 * @param array $fieldArray Field array, passed by reference
 */
public function getRootLineFields(array &$fieldArray)
{
    $rootLineUids = $this->conf['rootline_uids'] ?? [];

    $fieldArray['rl0'] = (int)($rootLineUids[0] ?? 0);
    $fieldArray['rl1'] = (int)($rootLineUids[1] ?? 0);
    $fieldArray['rl2'] = (int)($rootLineUids[2] ?? 0);

    $additionalFields = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'] ?? [];
    foreach ($additionalFields as $fieldName => $rootLineLevel) {
        $fieldArray[$fieldName] = (int)($rootLineUids[$rootLineLevel] ?? 0);
    }
}
2068
2069 /********************************
2070 *
2071 * SQL; Submitting words
2072 *
2073 *******************************/
/**
 * Adds the words of the list that are not yet present in index_words.
 *
 * First counts how many of the word hashes already exist as "wid". Only if that number
 * differs from the size of the list are the existing base words fetched, removed from the
 * list, and the remaining words inserted. Does nothing when index_words is not in use or
 * the list is empty.
 *
 * @param array $wordListArray Word List array (where each word has information about position etc).
 */
public function checkWordList($wordListArray)
{
    if (!IndexedSearchUtility::isTableUsed('index_words') || empty($wordListArray)) {
        return;
    }

    $expectedWords = count($wordListArray);
    $wordIds = array_map('intval', array_column($wordListArray, 'hash'));

    $countQuery = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('index_words');
    $knownWords = (int)$countQuery->count('baseword')
        ->from('index_words')
        ->where(
            $countQuery->expr()->in(
                'wid',
                $countQuery->createNamedParameter($wordIds, Connection::PARAM_INT_ARRAY)
            )
        )
        ->execute()
        ->fetchColumn();

    // Every word is already stored, nothing to insert
    if ($knownWords === $expectedWords) {
        return;
    }

    $wordConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('index_words');
    $selectQuery = $wordConnection->createQueryBuilder();
    $knownRows = $selectQuery->select('baseword')
        ->from('index_words')
        ->where(
            $selectQuery->expr()->in(
                'wid',
                $selectQuery->createNamedParameter($wordIds, Connection::PARAM_INT_ARRAY)
            )
        )
        ->execute();

    $this->log_setTSlogMessage('Inserting words: ' . ($expectedWords - $knownWords), 1);

    // Drop every word that is already stored so only new ones remain
    while ($knownRow = $knownRows->fetch()) {
        unset($wordListArray[$knownRow['baseword']]);
    }

    // A duplicate-key error will occur here if a word was NOT removed by the unset() above. However,
    // as long as the words are NOT longer than 60 chars (the baseword varchar is 60 characters...)
    // this is not a problem.
    foreach ($wordListArray as $baseWord => $wordData) {
        $wordConnection->insert(
            'index_words',
            [
                'wid' => $wordData['hash'],
                'baseword' => $baseWord,
                'metaphone' => $wordData['metaphone']
            ]
        );
    }
}
2134
/**
 * Submits the RELATIONS between words and a phash.
 *
 * All existing index_rel rows of the phash are deleted and replaced by one row per word of
 * the list, skipping words whose "wid" is flagged as stop word (is_stopword != 0) in
 * index_words. Does nothing when index_rel is not in use.
 *
 * @param array $wordList Word list array
 * @param int $phash phash value
 */
public function submitWords($wordList, $phash)
{
    if (!IndexedSearchUtility::isTableUsed('index_rel')) {
        return;
    }

    $pool = GeneralUtility::makeInstance(ConnectionPool::class);

    // Collect the word ids flagged as stop words, keyed by wid for fast lookup
    $stopWordQuery = $pool->getQueryBuilderForTable('index_words');
    $stopWordRows = $stopWordQuery->select('wid')
        ->from('index_words')
        ->where(
            $stopWordQuery->expr()->neq('is_stopword', $stopWordQuery->createNamedParameter(0, \PDO::PARAM_INT))
        )
        ->groupBy('wid')
        ->execute();

    $stopWordsById = [];
    while ($stopWordRow = $stopWordRows->fetch()) {
        $stopWordsById[$stopWordRow['wid']] = $stopWordRow;
    }

    // Remove the previous relations of this phash before writing the new ones
    $pool->getConnectionForTable('index_rel')->delete('index_rel', ['phash' => (int)$phash]);

    $relations = [];
    foreach ($wordList as $word) {
        if (!isset($stopWordsById[$word['hash']])) {
            $relations[] = [
                (int)$phash,
                (int)$word['hash'],
                (int)$word['count'],
                (int)$word['first'],
                $this->freqMap($word['count'] / $this->wordcount),
                $word['cmp'] & $this->flagBitMask
            ];
        }
    }

    if ($relations !== []) {
        $columns = ['phash', 'wid', 'count', 'first', 'freq', 'flags'];
        $pool->getConnectionForTable('index_rel')->bulkInsert('index_rel', $relations, $columns);
    }
}
2183
/**
 * Maps a frequency using the factor $this->freqMax * 100 * $this->freqRange.
 *
 * Values up to and including 1 are multiplied by the factor and capped at $this->freqRange;
 * values above 1 are divided by the factor (the mapping "back").
 *
 * @param float $freq Frequency
 * @return int|float Mapped frequency; the result is not cast to an integer here
 */
public function freqMap($freq)
{
    $scale = $this->freqMax * 100 * $this->freqRange;

    if ($freq <= 1) {
        $scaled = $freq * $scale;
        if ($scaled > $this->freqRange) {
            return $this->freqRange;
        }
        return $scaled;
    }

    return $freq / $scale;
}
2202
2203 /********************************
2204 *
2205 * Hashing
2206 *
2207 *******************************/
/**
 * Calculates the search hashes for a TYPO3 page and stores them in $this->hash.
 *
 * "phash_grouping" identifies a "page" as the combination of id, type, language, mount point
 * and cHash parameters. "phash" additionally takes the gr_list into account (subdivision
 * where special page composition based on login is considered as well; such pages are
 * expected to be normally similar regardless of the login).
 */
public function setT3Hashes()
{
    $pageIdentity = [
        'id' => (int)$this->conf['id'],
        'type' => (int)$this->conf['type'],
        'sys_lang' => (int)$this->conf['sys_language_uid'],
        'MP' => (string)$this->conf['MP'],
        'cHash' => $this->cHashParams
    ];
    $groupingHash = IndexedSearchUtility::md5inthash(serialize($pageIdentity));
    $this->hash['phash_grouping'] = $groupingHash;

    // The plain phash is built from the same data plus the group list
    $pageIdentity['gr_list'] = (string)$this->conf['gr_list'];
    $pageHash = IndexedSearchUtility::md5inthash(serialize($pageIdentity));
    $this->hash['phash'] = $pageHash;
}
2227
/**
 * Calculates the search hashes for an external file.
 *
 * "phash_grouping" is derived from the file alone, "phash" from the file plus $subinfo.
 *
 * @param string $file File name / path which identifies it on the server
 * @param array $subinfo Additional content identifying the (subpart of) content. For instance; PDF files are divided into groups of pages for indexing.
 * @return array Array with "phash_grouping" and "phash" inside.
 */
public function setExtHashes($file, $subinfo = [])
{
    $identity = ['file' => $file];
    $groupingHash = IndexedSearchUtility::md5inthash(serialize($identity));

    // Extend the identity with the sub information for the specific phash
    $identity['subinfo'] = $subinfo;
    $fileHash = IndexedSearchUtility::md5inthash(serialize($identity));

    return [
        'phash_grouping' => $groupingHash,
        'phash' => $fileHash
    ];
}
2249
2250 /*********************************
2251 *
2252 * Internal logging functions
2253 *
2254 *********************************/
/**
 * Forwards a push call to the time tracker ($this->timeTracker).
 *
 * @param string $msg Title to set
 * @param string $key Key passed through to the time tracker
 */
public function log_push($msg, $key)
{
    $tracker = $this->timeTracker;
    $tracker->push($msg, $key);
}
2265
/**
 * Forwards a pull call to the time tracker ($this->timeTracker).
 */
public function log_pull()
{
    $tracker = $this->timeTracker;
    $tracker->pull();
}
2273
/**
 * Passes a log message to the time tracker and additionally records it in
 * $this->internal_log.
 *
 * @param string $msg Message to set
 * @param int $errorNum Error number
 */
public function log_setTSlogMessage($msg, $errorNum = 0)
{
    $tracker = $this->timeTracker;
    $tracker->setTSlogMessage($msg, $errorNum);
    // Keep a local copy of every message
    $this->internal_log[] = $msg;
}
2285
/**
 * Makes sure that keywords are space-separated. This is important for displaying them
 * properly as a part of the fulltext index.
 *
 * The comma-separated list is split, then joined again with ", " and wrapped in a
 * leading and a trailing space.
 *
 * @param string $keywordList
 * @return string
 * @see http://forge.typo3.org/issues/14959
 */
protected function addSpacesToKeywordList($keywordList)
{
    $spacedKeywords = implode(', ', GeneralUtility::trimExplode(',', $keywordList));
    return sprintf(' %s ', $spacedKeywords);
}
2299 }