[!!!][TASK] Drop "documentation" extension
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / Classes / Indexer.php
1 <?php
2 namespace TYPO3\CMS\IndexedSearch;
3
4 /*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use TYPO3\CMS\Core\Compatibility\PublicPropertyDeprecationTrait;
18 use TYPO3\CMS\Core\Configuration\ExtensionConfiguration;
19 use TYPO3\CMS\Core\Core\Environment;
20 use TYPO3\CMS\Core\Database\Connection;
21 use TYPO3\CMS\Core\Database\ConnectionPool;
22 use TYPO3\CMS\Core\TimeTracker\TimeTracker;
23 use TYPO3\CMS\Core\Utility\GeneralUtility;
24 use TYPO3\CMS\Core\Utility\MathUtility;
25 use TYPO3\CMS\Core\Utility\PathUtility;
26 use TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController;
27 use TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility;
28
29 /**
30 * Indexing class for TYPO3 frontend
31 */
32 class Indexer
33 {
use PublicPropertyDeprecationTrait;

/**
 * List of all deprecated public properties
 * @var array
 */
protected $deprecatedPublicProperties = [
    'csObj' => 'Using $csObj within Indexing is discouraged, the property will be removed in TYPO3 v10.0 - if needed instantiate CharsetConverter yourself.',
];

/**
 * Human-readable explanations for the check codes logged by indexTypo3PageContent().
 * Positive codes mean "indexing needed", negative codes mean "indexing not needed".
 *
 * @var array
 */
public $reasons = [
    -1 => 'mtime matched the document, so no changes detected and no content updated',
    -2 => 'The minimum age was not exceeded',
    1 => 'The configured max-age was exceeded for the document and thus it\'s indexed.',
    2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
    3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
    4 => 'Page has never been indexed (is not represented in the index_phash table).'
];

/**
 * HTML code blocks to exclude from indexing (comma-separated list of tag names)
 *
 * @var string
 */
public $excludeSections = 'script,style';

/**
 * Supported Extensions for external files.
 * External parser objects, keys are file extension names. Values are objects with certain methods.
 *
 * @var array
 */
public $external_parsers = [];

/**
 * Fe-group list (pages might be indexed separately for each usergroup combination to support search
 * in access limited pages!)
 *
 * @var string
 */
public $defaultGrList = '0,-1';

/**
 * Min/Max times:
 * If set, this tells a number of seconds that is the maximum age of an indexed document.
 * Regardless of mtime the document will be re-indexed if this limit is exceeded.
 *
 * @var int
 */
public $tstamp_maxAge = 0;

/**
 * If set, this tells a minimum limit before a document can be indexed again. This is regardless of mtime.
 *
 * @var int
 */
public $tstamp_minAge = 0;

/**
 * Max number of external files to index.
 *
 * @var int
 */
public $maxExternalFiles = 0;

/**
 * If TRUE, indexing is forced despite of hashes etc.
 *
 * @var bool
 */
public $forceIndexing = false;

/**
 * Set when crawler is detected (internal)
 *
 * @var bool
 */
public $crawlerActive = false;

/**
 * Template of the content parts array; splitHTMLContent() starts from a copy of it.
 *
 * @var array
 */
public $defaultContentArray = [
    'title' => '',
    'description' => '',
    'keywords' => '',
    'body' => ''
];

/**
 * @var int
 */
public $wordcount = 0;

/**
 * @var int
 */
public $externalFileCounter = 0;

/**
 * Configuration set internally (see init functions for required keys and their meaning)
 *
 * @var array
 */
public $conf = [];

/**
 * Indexer configuration, coming from TYPO3's system configuration for EXT:indexed_search
 *
 * @var array
 */
public $indexerConfig = [];

/**
 * Hash array, contains phash and phash_grouping
 *
 * @var array
 */
public $hash = [];

/**
 * Hash array for files
 *
 * @var array
 */
public $file_phash_arr = [];

/**
 * Content of TYPO3 page (the parts returned by splitHTMLContent())
 *
 * @var array
 */
public $contentParts = [];

/**
 * Hash over the concatenated content parts, calculated in indexTypo3PageContent()
 *
 * @var string
 */
public $content_md5h = '';

/**
 * Internal log
 *
 * @var array
 */
public $internal_log = [];

/**
 * @var string
 */
public $indexExternalUrl_content = '';

/**
 * cHashparams array
 *
 * @var array
 */
public $cHashParams = [];

/**
 * @var int
 */
public $freqRange = 32000;

/**
 * @var float
 */
public $freqMax = 0.1;

/**
 * @var bool
 */
public $enableMetaphoneSearch = false;

/**
 * @var bool
 */
public $storeMetaphoneInfoAsWords;

/**
 * @var string
 */
public $metaphoneContent = '';

/**
 * Charset class object
 *
 * @var \TYPO3\CMS\Core\Charset\CharsetConverter
 * @deprecated since TYPO3 v9.3, will be removed in TYPO3 v10 (also the instantiation in the init() method).
 */
protected $csObj;

/**
 * Metaphone object, if any
 *
 * @var \TYPO3\CMS\IndexedSearch\Utility\DoubleMetaPhoneUtility
 */
public $metaphoneObj;

/**
 * Lexer object for word splitting
 *
 * @var \TYPO3\CMS\IndexedSearch\Lexer
 */
public $lexerObj;

/**
 * Bit mask taken from the extension configuration; init() forces it into the integer range 0-255.
 *
 * @var int
 */
public $flagBitMask;

/**
 * @var TimeTracker
 */
protected $timeTracker;
252
253 /**
254 * Indexer constructor.
255 */
public function __construct()
{
    // Fetch the TimeTracker through GeneralUtility::makeInstance() and keep it on the instance.
    $this->timeTracker = GeneralUtility::makeInstance(TimeTracker::class);
}
260
261 /**
262 * Parent Object (TSFE) Initialization
263 *
264 * @param TypoScriptFrontendController $pObj Parent Object, passed by reference
265 */
public function hook_indexContent(&$pObj)
{
    // Indexer configuration from Extension Manager interface:
    $disableFrontendIndexing = (bool)GeneralUtility::makeInstance(ExtensionConfiguration::class)->get('indexed_search', 'disableFrontendIndexing');
    // Crawler activation:
    // Requirements are that the crawler is loaded, a crawler session is running and re-indexing requested as processing instruction.
    // The crawler data is optional, so every key is read defensively to avoid "undefined index" notices.
    $crawlerData = $pObj->applicationData['tx_crawler'] ?? [];
    $procInstructions = (array)($crawlerData['parameters']['procInstructions'] ?? []);
    if (\TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler')
        && !empty($crawlerData['running'])
        && in_array('tx_indexedsearch_reindex', $procInstructions, true)
    ) {
        // Setting simple log message:
        $pObj->applicationData['tx_crawler']['log'][] = 'Forced Re-indexing enabled';
        // Crawler active flag
        $this->crawlerActive = true;
        // Force indexing regardless of hashes
        $this->forceIndexing = true;
    }
    // Determine if page should be indexed, and if so, configure and initialize indexer
    if (empty($pObj->config['config']['index_enable'])) {
        return;
    }
    $this->log_push('Index page', '');
    if ($disableFrontendIndexing && !$this->crawlerActive) {
        $this->log_setTSlogMessage('Index page? No, Ordinary Frontend indexing during rendering is disabled.');
    } elseif (!empty($pObj->page['no_search'])) {
        $this->log_setTSlogMessage('Index page? No, The "No Search" flag has been set in the page properties!');
    } elseif ($pObj->no_cache) {
        $this->log_setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
    } elseif ((int)$pObj->sys_language_uid !== (int)$pObj->sys_language_content) {
        $this->log_setTSlogMessage('Index page? No, ->sys_language_uid was different from sys_language_content which indicates that the page contains fall-back content and that would be falsely indexed as localized content.');
    } else {
        // Setting up internal configuration from config array:
        $this->conf = [];
        // Information about page for which the indexing takes place
        // Page id
        $this->conf['id'] = $pObj->id;
        // Page type
        $this->conf['type'] = $pObj->type;
        // sys_language UID of the language of the indexing.
        $this->conf['sys_language_uid'] = $pObj->sys_language_uid;
        // MP variable, if any (Mount Points)
        $this->conf['MP'] = $pObj->MP;
        // Group list
        $this->conf['gr_list'] = $pObj->gr_list;
        // cHash string for additional parameters
        $this->conf['cHash'] = $pObj->cHash;
        // Array of the additional parameters
        $this->conf['cHash_array'] = $pObj->cHash_array;
        // The creation date of the TYPO3 page
        $this->conf['crdate'] = $pObj->page['crdate'];

        // reg1 of the caching table. Not known what practical use this has.
        // @deprecated since TYPO3 v9, will be removed in TYPO3 v10. Remove along with database field data_page_reg1
        $this->conf['page_cache_reg1'] = $pObj->page_cache_reg1;

        // Root line uids
        $this->conf['rootline_uids'] = [];
        foreach ($pObj->config['rootLine'] as $rlkey => $rldat) {
            $this->conf['rootline_uids'][$rlkey] = $rldat['uid'];
        }
        // Content string (HTML of TYPO3 page)
        $this->conf['content'] = $pObj->content;
        // Alternative title for indexing
        $this->conf['indexedDocTitle'] = $pObj->convOutputCharset($pObj->indexedDocTitle);
        // Character set of content (will be converted to utf-8 during indexing)
        $this->conf['metaCharset'] = $pObj->metaCharset;
        // Most recent modification time (seconds) of the content on the page. Used to evaluate whether it should be re-indexed.
        $this->conf['mtime'] = $pObj->register['SYS_LASTCHANGED'] ?? $pObj->page['SYS_LASTCHANGED'];
        // Configuration of behavior (all three TypoScript settings are optional):
        // Whether to index external documents like PDF, DOC etc. (if possible)
        $this->conf['index_externals'] = $pObj->config['config']['index_externals'] ?? null;
        // Length of description text (max 250, default 200)
        $this->conf['index_descrLgd'] = $pObj->config['config']['index_descrLgd'] ?? null;
        // Whether to index document keywords and description (if present)
        $this->conf['index_metatags'] = $pObj->config['config']['index_metatags'] ?? true;
        // Set to zero:
        $this->conf['recordUid'] = 0;
        $this->conf['freeIndexUid'] = 0;
        $this->conf['freeIndexSetId'] = 0;
        // Init and start indexing:
        $this->init();
        $this->indexTypo3PageContent();
    }
    $this->log_pull();
}
353
354 /****************************
355 *
356 * Backend API
357 *
358 ****************************/
359 /**
360 * Initializing the "combined ID" of the page (phash) being indexed (or for which external media is attached)
361 *
362 * @param int $id The page uid, &id=
363 * @param int $type The page type, &type=
364 * @param int $sys_language_uid sys_language uid, typically &L=
365 * @param string $MP The MP variable (Mount Points), &MP=
366 * @param array $uidRL Rootline array of only UIDs.
367 * @param array $cHash_array Array of GET variables to register with this indexing
368 * @param bool $createCHash If set, calculates a cHash value from the $cHash_array. Probably you will not do that since such cases are indexed through the frontend and the idea of this interface is to index non-cacheable pages from the backend!
369 */
public function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array = [], $createCHash = false)
{
    // A cHash is only calculated on request; otherwise it stays an empty string.
    $cHash = '';
    if ($createCHash) {
        /* @var $cacheHash \TYPO3\CMS\Frontend\Page\CacheHashCalculator */
        $cacheHash = GeneralUtility::makeInstance(\TYPO3\CMS\Frontend\Page\CacheHashCalculator::class);
        $cHash = $cacheHash->generateForParameters(GeneralUtility::implodeArrayForUrl('', $cHash_array));
    }

    // Internal configuration describing the page for which the indexing takes place.
    $this->conf = [
        // Page id (int)
        'id' => $id,
        // Page type (int)
        'type' => $type,
        // sys_language UID of the language of the indexing (int)
        'sys_language_uid' => $sys_language_uid,
        // MP variable, if any (Mount Points) (string)
        'MP' => $MP,
        // Group list (hardcoded for now...)
        'gr_list' => '0,-1',
        // cHash string for additional parameters
        'cHash' => $cHash,
        // Array of the additional parameters
        'cHash_array' => $cHash_array,
        // Free index settings start out with their defaults
        'freeIndexUid' => 0,
        'freeIndexSetId' => 0,
        // @deprecated since TYPO3 v9, will be removed in TYPO3 v10. Remove along with database field data_page_reg1
        'page_cache_reg1' => 0,
        // Root line uids
        'rootline_uids' => $uidRL,
        // Whether to index external documents like PDF, DOC etc. (if possible)
        'index_externals' => 1,
        // Length of description text (max 250, default 200)
        'index_descrLgd' => 200,
        // Whether to index document keywords and description (if present)
        'index_metatags' => true,
    ];

    // Init and start indexing:
    $this->init();
}
415
416 /**
417 * Sets the free-index uid. Can be called right after backend_initIndexer()
418 *
419 * @param int $freeIndexUid Free index UID
420 * @param int $freeIndexSetId Set id - an integer identifying the "set" of indexing operations.
421 */
public function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId = 0)
{
    // Store both free-index identifiers in the internal configuration in one go.
    [$this->conf['freeIndexUid'], $this->conf['freeIndexSetId']] = [$freeIndexUid, $freeIndexSetId];
}
427
428 /**
429 * Indexing records as the content of a TYPO3 page.
430 *
431 * @param string $title Title equivalent
432 * @param string $keywords Keywords equivalent
433 * @param string $description Description equivalent
434 * @param string $content The main content to index
435 * @param string $charset The charset of the title, keyword, description and body-content. MUST BE VALID, otherwise nothing is indexed!
436 * @param int $mtime Last modification time, in seconds
437 * @param int $crdate The creation date of the content, in seconds
438 * @param int $recordUid The record UID that the content comes from (for registration with the indexed rows)
439 */
public function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate = 0, $recordUid = 0)
{
    // This method only adds to $this->conf; it never resets it.
    // Most recent modification time (seconds) of the content
    $this->conf['mtime'] = $mtime;
    // The creation date of the TYPO3 content
    $this->conf['crdate'] = $crdate;
    // UID of the record, if applicable
    $this->conf['recordUid'] = $recordUid;
    // Construct fake HTML for parsing. All values are passed through htmlspecialchars(),
    // so the record data cannot introduce markup of its own into the fake document.
    $this->conf['content'] = '
<html>
<head>
<title>' . htmlspecialchars($title) . '</title>
<meta name="keywords" content="' . htmlspecialchars($keywords) . '" />
<meta name="description" content="' . htmlspecialchars($description) . '" />
</head>
<body>
' . htmlspecialchars($content) . '
</body>
</html>';
    // Content string (HTML of TYPO3 page)
    // Initializing charset:
    // Character set of content (will be converted to utf-8 during indexing)
    $this->conf['metaCharset'] = $charset;
    // Alternative title for indexing (left empty, so the <title> of the fake HTML is used)
    $this->conf['indexedDocTitle'] = '';
    // Index content as if it was a TYPO3 page:
    $this->indexTypo3PageContent();
}
470
471 /********************************
472 *
473 * Initialization
474 *
475 *******************************/
476 /**
477 * Initializes the object. $this->conf MUST be set with proper values prior to this call!!!
478 */
public function init()
{
    // Initializing the cHash parameters from the internal configuration:
    $this->cHashParams = $this->conf['cHash_array'] ?? [];
    if (is_array($this->cHashParams) && !empty($this->cHashParams)) {
        if (!empty($this->conf['cHash'])) {
            // Add this so that URL's come out right...
            $this->cHashParams['cHash'] = $this->conf['cHash'];
        }
        unset($this->cHashParams['encryptionKey']);
    }
    // Setting phash / phash_grouping which identifies the indexed page based on some of these variables:
    $this->setT3Hashes();
    // Indexer configuration from Extension Manager interface.
    // Keys may be missing if the extension configuration was not updated yet, so each one gets a fallback.
    // The float cast keeps a missing or non-numeric value from breaking the hours-to-seconds multiplication.
    $this->indexerConfig = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get('indexed_search');
    $this->tstamp_minAge = MathUtility::forceIntegerInRange((float)($this->indexerConfig['minAge'] ?? 0) * 3600, 0);
    $this->tstamp_maxAge = MathUtility::forceIntegerInRange((float)($this->indexerConfig['maxAge'] ?? 0) * 3600, 0);
    $this->maxExternalFiles = MathUtility::forceIntegerInRange($this->indexerConfig['maxExternalFiles'] ?? 0, 0, 1000, 5);
    $this->flagBitMask = MathUtility::forceIntegerInRange($this->indexerConfig['flagBitMask'] ?? 0, 0, 255);
    // Workaround: If the extension configuration was not updated yet, the value is not existing
    $this->enableMetaphoneSearch = !isset($this->indexerConfig['enableMetaphoneSearch']) || $this->indexerConfig['enableMetaphoneSearch'];
    $this->storeMetaphoneInfoAsWords = !IndexedSearchUtility::isTableUsed('index_words') && $this->enableMetaphoneSearch;
    // Initialize external document parsers:
    // Example configuration, see ext_localconf.php of this file!
    if (!empty($this->conf['index_externals'])) {
        $this->initializeExternalParsers();
    }
    // Registered lexer / metaphone classes (both registrations are optional):
    $extensionRegistry = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search'] ?? [];
    // Initialize lexer (class that deconstructs the text into words):
    $lexerObjectClassName = ($extensionRegistry['lexer'] ?? '') ?: Lexer::class;
    $this->lexerObj = GeneralUtility::makeInstance($lexerObjectClassName);
    $this->lexerObj->debug = $this->indexerConfig['debugMode'] ?? false;
    // Initialize metaphone hook:
    // Make sure that the hook is loaded _after_ indexed_search as this may overwrite the hook depending on the configuration.
    if ($this->enableMetaphoneSearch && !empty($extensionRegistry['metaphone'])) {
        $this->metaphoneObj = GeneralUtility::makeInstance($extensionRegistry['metaphone']);
        $this->metaphoneObj->pObj = $this;
    }
    // Init charset class:
    $this->csObj = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Charset\CharsetConverter::class);
}
519
520 /**
521 * Initialize external parsers
522 *
523 * @access private
524 * @see init()
525 */
public function initializeExternalParsers()
{
    $registeredParsers = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'] ?? [];
    foreach ($registeredParsers as $extension => $className) {
        $parser = GeneralUtility::makeInstance($className);
        // The parser is registered before it is initialized.
        $this->external_parsers[$extension] = $parser;
        $parser->pObj = $this;
        // A parser whose initialization returns FALSE is dropped again.
        if ($parser->initParser($extension)) {
            continue;
        }
        unset($this->external_parsers[$extension]);
    }
}
537
538 /********************************
539 *
540 * Indexing; TYPO3 pages (HTML content)
541 *
542 *******************************/
543 /**
544 * Start indexing of the TYPO3 page
545 */
public function indexTypo3PageContent()
{
    $check = $this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']);
    $is_grlist = $this->is_grlist_set($this->hash['phash']);

    // Nothing to do unless the mtime check, a missing gr_list entry or the force flag asks for indexing.
    $indexingNeeded = $check > 0 || !$is_grlist || $this->forceIndexing;
    if (!$indexingNeeded) {
        $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
        return;
    }

    // Tell why indexing takes place:
    if ($this->forceIndexing) {
        $reasonMessage = 'Indexing needed, reason: Forced';
    } elseif ($check > 0) {
        $reasonMessage = 'Indexing needed, reason: ' . $this->reasons[$check];
    } else {
        $reasonMessage = 'Indexing needed, reason: Updates gr_list!';
    }
    $this->log_setTSlogMessage($reasonMessage, 1);

    // Divide into title,keywords,description and body:
    $this->log_push('Split content', '');
    $this->contentParts = $this->splitHTMLContent($this->conf['content']);
    if ($this->conf['indexedDocTitle']) {
        $this->contentParts['title'] = $this->conf['indexedDocTitle'];
    }
    $this->log_pull();

    // The hash covers title, keywords, description and body, so a changed page title also counts as changed content.
    $this->content_md5h = IndexedSearchUtility::md5inthash(implode('', $this->contentParts));

    // checkContentHash() looks for an already indexed page (with gr_list = 0,-1) that has the very same contentHash.
    // If one exists, the content is already indexed and - regardless of mtime - only the bookkeeping needs refreshing.
    // This also avoids re-indexing for logged in fe_users when the page content turns out to be unchanged.
    $checkCHash = $this->checkContentHash();
    $contentAlreadyIndexed = is_array($checkCHash) && $check !== 1;
    if ($contentAlreadyIndexed) {
        // Update the timestamp
        $this->updateTstamp($this->hash['phash'], $this->conf['mtime']);
        $this->updateSetId($this->hash['phash']);
        // $checkCHash['phash'] is the phash of the result row that is similar to the current phash regarding the content hash.
        $this->update_grlist($checkCHash['phash'], $this->hash['phash']);
        $this->updateRootline();
        $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $this->content_md5h . ', has not changed. Timestamp, grlist and rootline updated if necessary.');
        return;
    }

    // Full indexing run; the start time is kept for the parsetime statistics.
    $Pstart = GeneralUtility::milliseconds();

    $this->log_push('Converting charset of content (' . $this->conf['metaCharset'] . ') to utf-8', '');
    $this->charsetEntity2utf8($this->contentParts, $this->conf['metaCharset']);
    $this->log_pull();

    // Splitting words
    $this->log_push('Extract words from content', '');
    $splitInWords = $this->processWordsInArrays($this->contentParts);
    $this->log_pull();

    // Analyse the indexed words.
    $this->log_push('Analyse the extracted words', '');
    $indexArr = $this->indexAnalyze($splitInWords);
    $this->log_pull();

    // Submitting page (phash) record
    $this->log_push('Submitting page', '');
    $this->submitPage();
    $this->log_pull();

    // Check words and submit to word list if not there
    $this->log_push('Check word list and submit words', '');
    if (IndexedSearchUtility::isTableUsed('index_words')) {
        $this->checkWordList($indexArr);
        $this->submitWords($indexArr, $this->hash['phash']);
    }
    $this->log_pull();

    // Set parsetime
    $this->updateParsetime($this->hash['phash'], GeneralUtility::milliseconds() - $Pstart);

    // Checking external files if configured for.
    $this->log_push('Checking external files', '');
    if ($this->conf['index_externals']) {
        $this->extractLinks($this->conf['content']);
    }
    $this->log_pull();
}
617
618 /**
619 * Splits HTML content and returns an associative array, with title, a list of metatags, and a list of words in the body.
620 *
621 * @param string $content HTML content to index. To some degree expected to be made by TYPO3 (ei. splitting the header by ":")
622 * @return array Array of content, having keys "title", "body", "keywords" and "description" set.
623 * @see splitRegularContent()
624 */
public function splitHTMLContent($content)
{
    $content = (string)$content;
    $contentArr = $this->defaultContentArray;
    // Divide head from body. stristr() returns FALSE when there is no <body tag; in that case the
    // body is empty and the whole document counts as head. (A negative substr() length of "-0"
    // would produce an empty head and lose the title and meta tags.)
    $body = stristr($content, '<body');
    if ($body === false) {
        $body = '';
    }
    $contentArr['body'] = $body;
    $headPart = (string)substr($content, 0, strlen($content) - strlen($body));
    // Get title; only the part after the first ":" is used, if there is one
    $this->embracingTags($headPart, 'TITLE', $contentArr['title'], $dummy2, $dummy);
    $titleParts = explode(':', $contentArr['title'], 2);
    $contentArr['title'] = trim($titleParts[1] ?? $titleParts[0]);
    // Get keywords and description metatags
    if ($this->conf['index_metatags']) {
        // Collect the attribute strings of all <meta> tags; each call continues behind the tag found before.
        $metaTags = [];
        while ($this->embracingTags($headPart, 'meta', $dummy, $headPart, $metaAttributeString)) {
            $metaTags[] = $metaAttributeString;
        }
        foreach ($metaTags as $metaAttributeString) {
            $attributes = GeneralUtility::get_tag_attributes($metaAttributeString);
            // Not every <meta> tag has "name" / "content" (e.g. charset or http-equiv tags)
            $metaName = (string)($attributes['name'] ?? '');
            $metaContent = (string)($attributes['content'] ?? '');
            if (stripos($metaName, 'keywords') !== false) {
                $contentArr['keywords'] .= ',' . $this->addSpacesToKeywordList($metaContent);
            }
            if (stripos($metaName, 'description') !== false) {
                $contentArr['description'] .= ',' . $metaContent;
            }
        }
    }
    // Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags:
    $this->typoSearchTags($contentArr['body']);
    // Get rid of unwanted sections (ie. scripting and style stuff) in body
    foreach (explode(',', $this->excludeSections) as $tag) {
        while ($this->embracingTags($contentArr['body'], $tag, $dummy, $contentArr['body'], $dummy2)) {
        }
    }
    // Remove tags, but first make sure we don't concatenate words by doing it
    $contentArr['body'] = str_replace('<', ' <', $contentArr['body']);
    $contentArr['body'] = trim(strip_tags($contentArr['body']));
    $contentArr['keywords'] = trim($contentArr['keywords']);
    $contentArr['description'] = trim($contentArr['description']);
    // Return array
    return $contentArr;
}
669
670 /**
671 * Extract the charset value from HTML meta tag.
672 *
673 * @param string $content HTML content
674 * @return string The charset value if found.
675 */
public function getHTMLcharset($content)
{
    // Find the <meta http-equiv="Content-Type" ...> tag first, then pick the charset out of that tag only.
    if (preg_match('/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i', $content, $reg)
        && preg_match('/charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)/i', $reg[0], $reg2)
    ) {
        return $reg2[1];
    }
    // Always return a string, as documented; callers pass the result on to string functions.
    return '';
}
684
685 /**
686 * Converts a HTML document to utf-8
687 *
688 * @param string $content HTML content, any charset
689 * @param string $charset Optional charset (otherwise extracted from HTML)
690 * @return string Converted HTML
691 */
public function convertHTMLToUtf8($content, $charset = '')
{
    // Without an explicit charset, fall back to the one declared in the HTML itself.
    if (!$charset) {
        $charset = $this->getHTMLcharset($content);
    }
    $normalizedCharset = trim(strtolower($charset));

    // Only re-encode when a charset is known and it is not already utf-8.
    $needsConversion = $normalizedCharset && $normalizedCharset !== 'utf-8';
    $utf8Content = $needsConversion
        ? mb_convert_encoding($content, 'utf-8', $normalizedCharset)
        : $content;

    // Convert entities, assuming document is now UTF-8:
    return html_entity_decode($utf8Content);
}
704
705 /**
706 * Finds first occurrence of embracing tags and returns the embraced content and the original string with
707 * the tag removed in the two passed variables. Returns FALSE if no match found. ie. useful for finding
708 * <title> of document or removing <script>-sections
709 *
710 * @param string $string String to search in
711 * @param string $tagName Tag name, eg. "script
712 * @param string $tagContent Passed by reference: Content inside found tag
713 * @param string $stringAfter Passed by reference: Content after found tag
714 * @param string $paramList Passed by reference: Attributes of the found tag.
715 * @return bool Returns FALSE if tag was not found, otherwise TRUE.
716 */
public function embracingTags($string, $tagName, &$tagContent, &$stringAfter, &$paramList)
{
    $string = (string)$string;
    $endTag = '</' . $tagName . '>';
    $startTag = '<' . $tagName;
    // Case-insensitive search for the start tag; the by-reference arguments stay untouched if it is missing.
    $startPos = stripos($string, $startTag);
    if ($startPos === false) {
        return false;
    }
    // Split "attributes>rest" behind the tag name. A tag that is cut off before its closing ">"
    // yields only one part, so the remainder falls back to an empty string instead of an undefined offset.
    $parts = explode('>', (string)substr($string, $startPos + strlen($startTag)), 2);
    $paramList = $parts[0];
    $remainder = $parts[1] ?? '';
    $endPos = stripos($remainder, $endTag);
    if ($endPos !== false) {
        // Matching end tag found: return the embraced content and the string with the whole element cut out.
        $tagContent = substr($remainder, 0, $endPos);
        $stringAfter = substr($string, 0, $startPos) . substr($remainder, $endPos + strlen($endTag));
    } else {
        // No end tag (e.g. <meta ...>): no content, and only the text behind the start tag remains.
        $tagContent = '';
        $stringAfter = $remainder;
    }
    return true;
}
739
740 /**
741 * Removes content that shouldn't be indexed according to TYPO3SEARCH-tags.
742 *
743 * @param string $body HTML Content, passed by reference
744 * @return bool Returns TRUE if a TYPOSEARCH_ tag was found, otherwise FALSE.
745 */
public function typoSearchTags(&$body)
{
    $expBody = preg_split('/\\<\\!\\-\\-[\\s]?TYPO3SEARCH_/', (string)$body);
    // No marker found (or the split failed): leave the body untouched.
    if (!is_array($expBody) || count($expBody) <= 1) {
        return false;
    }
    $body = '';
    // Last chunk that did not start with a "begin" or "end" marker. It is appended when an "end"
    // marker follows, so content in front of an "end" marker without a preceding "begin" is kept.
    $prev = '';
    foreach ($expBody as $val) {
        $part = explode('-->', $val, 2);
        $marker = trim($part[0]);
        if ($marker === 'begin') {
            // Everything behind a "begin" marker is indexed (empty if the marker comment is not closed)
            $body .= $part[1] ?? '';
            $prev = '';
        } elseif ($marker === 'end') {
            $body .= $prev;
        } else {
            $prev = $val;
        }
    }
    return true;
}
766
767 /**
768 * Extract links (hrefs) from HTML content and if indexable media is found, it is indexed.
769 *
770 * @param string $content HTML content
771 */
public function extractLinks($content)
{
    // Get links:
    $list = $this->extractHyperLinks($content);
    // If the crawler is configured and loaded, files are queued for it; otherwise they are indexed right away.
    $crawler = null;
    if (!empty($this->indexerConfig['useCrawlerForExternalFiles']) && \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler')) {
        $crawler = GeneralUtility::makeInstance(\tx_crawler_lib::class);
    }
    // Traverse links:
    foreach ($list as $linkInfo) {
        // localPath means: This file is sent by a download script. While the indexed URL has to point
        // to $linkInfo['href'], the absolute path to the file is specified here!
        $isDownloadScriptFile = !empty($linkInfo['localPath']);
        // Decode entities:
        $linkSource = htmlspecialchars_decode($isDownloadScriptFile ? $linkInfo['localPath'] : $linkInfo['href']);
        // Parse URL (parse_url() returns FALSE on seriously malformed URLs):
        $qParts = parse_url($linkSource) ?: [];
        // Check for jumpurl (TYPO3 specific thing...)
        if (!empty($qParts['query']) && strpos($qParts['query'], 'jumpurl=') !== false) {
            parse_str($qParts['query'], $getP);
            $linkSource = (string)($getP['jumpurl'] ?? '');
            $qParts = parse_url($linkSource) ?: [];
        }
        if (!$isDownloadScriptFile && !empty($qParts['scheme'])) {
            if (!empty($this->indexerConfig['indexExternalURLs'])) {
                // Index external URL (http or otherwise)
                $this->indexExternalUrl($linkSource);
            }
            continue;
        }
        // Local links carrying a query string are not treated as files
        if (!empty($qParts['query'])) {
            continue;
        }
        $linkSource = urldecode($linkSource);
        if (GeneralUtility::isAllowedAbsPath($linkSource)) {
            $localFile = $linkSource;
        } else {
            $localFile = GeneralUtility::getFileAbsFileName(Environment::getPublicPath() . '/' . $linkSource);
        }
        if (!$localFile || !@is_file($localFile)) {
            continue;
        }
        // Index local file:
        if (is_object($crawler)) {
            $params = ['document' => $linkSource];
            if ($isDownloadScriptFile) {
                // The crawler entry also carries the URL the document is reachable under
                $params['alturl'] = $linkInfo['href'];
            }
            $params['conf'] = $this->conf;
            // The page content is not passed on to the crawler queue
            unset($params['conf']['content']);
            $crawler->addQueueEntry_callBack(0, $params, Hook\CrawlerFilesHook::class, $this->conf['id']);
            $this->log_setTSlogMessage('media "' . $params['document'] . '" added to "crawler" queue.', 1);
        } elseif ($isDownloadScriptFile) {
            // The extension is taken from the local path (empty string if the file has none)
            $ext = strtolower(pathinfo($linkSource, PATHINFO_EXTENSION));
            $this->indexRegularDocument($linkInfo['href'], false, $linkSource, $ext);
        } else {
            $this->indexRegularDocument($linkSource);
        }
    }
}
842
/**
 * Collects the <a> tags of an HTML string that carry a usable href.
 *
 * Links without href and pure in-page anchors (href starting with "#") are left out.
 *
 * @param string $html HTML content to scan
 * @return array List of hyperlinks, each with the keys "tag" (the tag source), "href" and "localPath" (result of createLocalPath(), empty if not local)
 * @see extractLinks()
 */
public function extractHyperLinks($html)
{
    $parser = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Html\HtmlParser::class);
    $links = [];
    foreach ($parser->splitTags('a', $html) as $position => $fragment) {
        // Only odd offsets of the split result are examined
        // (presumably the matched tags, even offsets being the text in between - confirm against HtmlParser::splitTags()).
        if ($position % 2 === 0) {
            continue;
        }
        $attributes = $parser->get_tag_attributes($fragment, true);
        $tagName = $parser->getFirstTagName($fragment);
        if (strtolower($tagName) !== 'a') {
            continue;
        }
        $target = $attributes[0]['href'];
        // Skip tags without a target and links that only jump to an anchor
        if (!$target || $target[0] === '#') {
            continue;
        }
        $links[] = [
            'tag' => $fragment,
            'href' => $target,
            'localPath' => $this->createLocalPath($target)
        ];
    }
    return $links;
}
872
/**
 * Finds the href of the first <base> tag that has a non-empty href.
 *
 * @param string $html Content to analyze
 * @return string The base href or an empty string if not found
 */
public function extractBaseHref($html)
{
    $parser = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Html\HtmlParser::class);
    $baseHref = '';
    foreach ($parser->splitTags('base', $html) as $position => $fragment) {
        // Only odd offsets of the split result are examined
        if ($position % 2 === 0) {
            continue;
        }
        $attributes = $parser->get_tag_attributes($fragment, true);
        $tagName = $parser->getFirstTagName($fragment);
        if (strtolower($tagName) !== 'base') {
            continue;
        }
        $baseHref = $attributes[0]['href'];
        // The first base tag that actually carries a href wins
        if ($baseHref) {
            break;
        }
    }
    return $baseHref;
}
898
899 /******************************************
900 *
901 * Indexing; external URL
902 *
903 ******************************************/
/**
 * Indexes the HTML content of an external URL.
 *
 * The headers are fetched first; only responses whose Content-Type contains "text/html"
 * are downloaded. The content is written to a temporary file which is then handed to
 * indexRegularDocument() with forced indexing.
 *
 * @param string $externalUrl URL, eg. "http://typo3.org/"
 * @see indexRegularDocument()
 */
public function indexExternalUrl($externalUrl)
{
    // Get headers:
    $urlHeaders = $this->getUrlHeaders($externalUrl);
    if (!is_array($urlHeaders)) {
        // No answer from the remote side
        return;
    }
    // HTTP header field names are case-insensitive (and lower case with HTTP/2),
    // so the lookup must not depend on the exact spelling "Content-Type".
    $urlHeaders = array_change_key_case($urlHeaders, CASE_LOWER);
    $contentType = isset($urlHeaders['content-type']) ? (string)$urlHeaders['content-type'] : '';
    if (stripos($contentType, 'text/html') === false) {
        return;
    }
    $content = ($this->indexExternalUrl_content = GeneralUtility::getUrl($externalUrl));
    if ((string)$content === '') {
        return;
    }
    // Create temporary file:
    $tmpFile = GeneralUtility::tempnam('EXTERNAL_URL');
    if (!$tmpFile) {
        return;
    }
    try {
        GeneralUtility::writeFile($tmpFile, $content);
        // Index that file. "true" forces indexing: the mtime of the temporary file
        // says nothing about the external document.
        $this->indexRegularDocument($externalUrl, true, $tmpFile, 'html');
    } finally {
        // Remove the temporary file even if indexing failed with an exception
        if (is_file($tmpFile)) {
            unlink($tmpFile);
        }
    }
}
929
/**
 * Fetches the HTTP response headers of a URL.
 *
 * @param string $url The URL
 * @return array|bool FALSE if there was no answer. Otherwise an array with the header names as keys and the trimmed header values as values
 */
public function getUrlHeaders($url)
{
    // Try to get the headers only
    $content = GeneralUtility::getUrl($url, 2);
    if ((string)$content === '') {
        // Explicit FALSE as documented (the method used to fall off the end and return null)
        return false;
    }
    // Compile headers:
    $retVal = [];
    foreach (GeneralUtility::trimExplode(LF, $content, true) as $line) {
        if (trim($line) === '') {
            break;
        }
        $parts = explode(':', $line, 2);
        if (count($parts) !== 2) {
            // Not a "Name: value" pair, e.g. the status line "HTTP/1.1 200 OK"
            continue;
        }
        // Trim both sides so that "Content-Type: text/html" yields "text/html", not " text/html"
        $retVal[trim($parts[0])] = trim($parts[1]);
    }
    return $retVal;
}
954
/**
 * Tries to map a link target to a local file.
 *
 * A fixed list of resolver methods is asked in order; the first non-empty answer is returned.
 *
 * @param string $sourcePath
 * @return string Absolute path to file if file is local, else empty string
 */
protected function createLocalPath($sourcePath)
{
    $resolvers = [
        'createLocalPathFromT3vars',
        'createLocalPathUsingAbsRefPrefix',
        'createLocalPathUsingDomainURL',
        'createLocalPathFromAbsoluteURL',
        'createLocalPathFromRelativeURL'
    ];
    $localPath = '';
    foreach ($resolvers as $resolver) {
        $localPath = $this->{$resolver}($sourcePath);
        if ($localPath != '') {
            return $localPath;
        }
    }
    // No resolver matched: hand back the (empty) answer of the last one
    return $localPath;
}
979
/**
 * Looks the link up in $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'].
 * This is useful for various download extensions that hide the actual file name
 * but still want the file to be indexed.
 *
 * @param string $sourcePath
 * @return string The registered file path, or an empty string if nothing (existing) is registered
 */
protected function createLocalPathFromT3vars($sourcePath)
{
    $registeredFiles = $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'];
    if (!is_array($registeredFiles)) {
        return '';
    }
    // Entries are keyed by the short MD5 of the link
    $lookupKey = GeneralUtility::shortMD5($sourcePath);
    // Note: self::isAllowedLocalFile is deliberately not used here because this
    // method is allowed to index files outside of the web site (for example,
    // protected downloads)
    $isKnownFile = isset($registeredFiles[$lookupKey]) && is_file($registeredFiles[$lookupKey]);
    return $isKnownFile ? $registeredFiles[$lookupKey] : '';
}
1003
/**
 * Maps a link that starts with the site URL of the current request to a file below the public path.
 *
 * @param string $sourcePath
 * @return string Local file path, or an empty string if the link does not match or the file is not allowed
 */
protected function createLocalPathUsingDomainURL($sourcePath)
{
    $siteUrl = GeneralUtility::getIndpEnv('TYPO3_SITE_URL');
    $prefixLength = strlen($siteUrl);
    if (substr($sourcePath, 0, $prefixLength) != $siteUrl) {
        return '';
    }
    // Strip the site URL and resolve the remainder against the public web path
    $candidate = Environment::getPublicPath() . '/' . substr($sourcePath, $prefixLength);
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1024
/**
 * Maps a link that starts with the configured absRefPrefix to a file below the
 * public path. This requires TSFE; without TSFE the method returns an empty string.
 *
 * @param string $sourcePath
 * @return string Local file path, or an empty string if the link does not match or the file is not allowed
 */
protected function createLocalPathUsingAbsRefPrefix($sourcePath)
{
    if (!($GLOBALS['TSFE'] instanceof TypoScriptFrontendController)) {
        return '';
    }
    $prefix = $GLOBALS['TSFE']->config['config']['absRefPrefix'];
    $prefixLength = strlen($prefix);
    // An empty prefix never matches
    if ($prefixLength <= 0 || substr($sourcePath, 0, $prefixLength) != $prefix) {
        return '';
    }
    $candidate = Environment::getPublicPath() . '/' . substr($sourcePath, $prefixLength);
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1048
/**
 * Attempts to create a local file path from an absolute URL without schema,
 * i.e. a link that starts with "/".
 *
 * @param string $sourcePath
 * @return string Local file path, or an empty string if the link is empty, does not start with "/" or the file is not allowed
 */
protected function createLocalPathFromAbsoluteURL($sourcePath)
{
    $sourcePath = (string)$sourcePath;
    // strpos() instead of $sourcePath[0]: reading offset 0 of an empty string
    // raises an "Uninitialized string offset" error.
    if (strpos($sourcePath, '/') !== 0) {
        return '';
    }
    // Drop the leading slash and resolve the rest against the public web path
    $localPath = Environment::getPublicPath() . '/' . substr($sourcePath, 1);
    return self::isAllowedLocalFile($localPath) ? $localPath : '';
}
1068
/**
 * Maps a relative URL to a file below the public path.
 *
 * @param string $sourcePath
 * @return string Local file path, or an empty string if the URL is not relative or the file is not allowed
 */
protected function createLocalPathFromRelativeURL($sourcePath)
{
    if (!self::isRelativeURL($sourcePath)) {
        return '';
    }
    $candidate = Environment::getPublicPath() . '/' . $sourcePath;
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1086
/**
 * Checks if URL is relative, i.e. it has no scheme and its path does not start with "/".
 *
 * URLs without a path component (e.g. "?id=5") and the empty string count as relative.
 *
 * @param string $url
 * @return bool
 */
protected static function isRelativeURL($url)
{
    $urlParts = parse_url((string)$url);
    if (!is_array($urlParts)) {
        // parse_url() returns false for seriously malformed URLs. Such input has always
        // been reported as relative; that result is kept for backward compatibility.
        return true;
    }
    // parse_url() only sets the components that are present in the URL
    $scheme = isset($urlParts['scheme']) ? $urlParts['scheme'] : '';
    $path = isset($urlParts['path']) ? $urlParts['path'] : '';
    // strpos() is safe for an empty path, unlike $path[0]
    return $scheme === '' && strpos($path, '/') !== 0;
}
1098
/**
 * Checks if the path points to an existing file inside the web site (public path).
 *
 * @param string $filePath Path to check; back paths ("../") are resolved first
 * @return bool TRUE if the resolved path lies below the public path and is a file
 */
protected static function isAllowedLocalFile($filePath)
{
    $filePath = GeneralUtility::resolveBackPath($filePath);
    // Compare against the public path INCLUDING a trailing slash. A plain prefix
    // comparison would also accept sibling directories that merely share the prefix,
    // e.g. "/var/www/html-backup/secret.pdf" for the public path "/var/www/html".
    $publicRoot = rtrim(Environment::getPublicPath(), '/') . '/';
    $insideWebPath = strpos((string)$filePath, $publicRoot) === 0;
    return $insideWebPath && is_file($filePath);
}
1112
1113 /******************************************
1114 *
1115 * Indexing; external files (PDF, DOC, etc)
1116 *
1117 ******************************************/
/**
 * Indexing a regular document given as $file (relative to public web path, local file)
 *
 * The file is split into the sections reported by fileContentParts(); each section is
 * indexed on its own if the mtime/timestamp check (or $force) demands it and the content
 * hash has changed. A section record is written for every section, indexed or not.
 *
 * @param string $file Relative Filename, relative to public web path. It can also be an absolute path as long as it is inside the lockRootPath (validated with \TYPO3\CMS\Core\Utility\GeneralUtility::isAbsPath()). Finally, if $contentTmpFile is set, this value can be anything, most likely a URL
 * @param bool $force If set, indexing is forced (despite content hashes, mtime etc).
 * @param string $contentTmpFile Temporary file with the content to read it from (instead of $file). Used when the $file is a URL.
 * @param string $altExtension File extension for temporary file.
 */
public function indexRegularDocument($file, $force = false, $contentTmpFile = '', $altExtension = '')
{
    // Init
    $fI = pathinfo($file);
    // pathinfo() does not set "extension" for names without a dot
    $ext = $altExtension ?: strtolower(isset($fI['extension']) ? $fI['extension'] : '');
    // Create abs-path:
    if (!$contentTmpFile) {
        if (!GeneralUtility::isAbsPath($file)) {
            // Relative, prepend public web path:
            $absFile = GeneralUtility::getFileAbsFileName(Environment::getPublicPath() . '/' . $file);
        } else {
            // Absolute, pass-through:
            $absFile = $file;
        }
        $absFile = GeneralUtility::isAllowedAbsPath($absFile) ? $absFile : '';
    } else {
        $absFile = $contentTmpFile;
    }
    if (!$absFile || !@is_file($absFile)) {
        $this->log_setTSlogMessage('Indexing not possible; File "' . $absFile . '" not found or valid.');
        return;
    }
    if (empty($this->external_parsers[$ext])) {
        $this->log_setTSlogMessage('Indexing not possible; The extension "' . $ext . '" was not supported.');
        return;
    }
    // Indexing the document:
    $fileInfo = stat($absFile);
    $cParts = $this->fileContentParts($ext, $absFile);
    foreach ($cParts as $cPKey) {
        $this->internal_log = [];
        $this->log_push('Index: ' . str_replace('.', '_', PathUtility::basename($file)) . ($cPKey ? '#' . $cPKey : ''), '');
        $Pstart = GeneralUtility::milliseconds();
        $subinfo = ['key' => $cPKey];
        // Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
        $phash_arr = ($this->file_phash_arr = $this->setExtHashes($file, $subinfo));
        $check = $this->checkMtimeTstamp($fileInfo['mtime'], $phash_arr['phash']);
        if ($check > 0 || $force) {
            if ($check > 0) {
                $this->log_setTSlogMessage('Indexing needed, reason: ' . $this->reasons[$check], 1);
            } else {
                $this->log_setTSlogMessage('Indexing forced by flag', 1);
            }
            // Check external file counter:
            if ($this->externalFileCounter < $this->maxExternalFiles || $force) {
                // Divide into title,keywords,description and body:
                $this->log_push('Split content', '');
                $contentParts = $this->readFileContent($ext, $absFile, $cPKey);
                $this->log_pull();
                if (is_array($contentParts)) {
                    // Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
                    // Separator first: the legacy implode($array, $separator) order is
                    // deprecated since PHP 7.4 and no longer supported in PHP 8.0.
                    $content_md5h = IndexedSearchUtility::md5inthash(implode('', $contentParts));
                    if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force) {
                        // Increment counter:
                        $this->externalFileCounter++;
                        // Splitting words
                        $this->log_push('Extract words from content', '');
                        $splitInWords = $this->processWordsInArrays($contentParts);
                        $this->log_pull();
                        // Analyse the indexed words.
                        $this->log_push('Analyse the extracted words', '');
                        $indexArr = $this->indexAnalyze($splitInWords);
                        $this->log_pull();
                        // Submitting page (phash) record
                        $this->log_push('Submitting page', '');
                        // The original creation time of a file cannot be determined, so stat()'s ctime is passed along with the modification time
                        $this->submitFilePage($phash_arr, $file, $subinfo, $ext, $fileInfo['mtime'], $fileInfo['ctime'], $fileInfo['size'], $content_md5h, $contentParts);
                        $this->log_pull();
                        // Check words and submit to word list if not there
                        $this->log_push('Check word list and submit words', '');
                        if (IndexedSearchUtility::isTableUsed('index_words')) {
                            $this->checkWordList($indexArr);
                            $this->submitWords($indexArr, $phash_arr['phash']);
                        }
                        $this->log_pull();
                        // Set parsetime
                        $this->updateParsetime($phash_arr['phash'], GeneralUtility::milliseconds() - $Pstart);
                    } else {
                        // Update the timestamp
                        $this->updateTstamp($phash_arr['phash'], $fileInfo['mtime']);
                        $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $content_md5h . ', has not changed. Timestamp updated.');
                    }
                } else {
                    $this->log_setTSlogMessage('Could not index file! Unsupported extension.');
                }
            } else {
                $this->log_setTSlogMessage('The limit of ' . $this->maxExternalFiles . ' has already been exceeded, so no indexing will take place this time.');
            }
        } else {
            $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
        }
        // Checking and setting sections:
        // Setting a section-record for the file. This is done also if the file is not indexed. Notice that section records are deleted when the page is indexed.
        $this->submitFile_section($phash_arr['phash']);
        $this->log_pull();
    }
}
1223
/**
 * Reads the content of an external file being indexed by delegating to the
 * external parser registered for the file extension.
 * The content from the external parser MUST be returned in utf-8!
 *
 * @param string $fileExtension File extension, eg. "pdf", "doc" etc.
 * @param string $absoluteFileName Absolute filename of file (must exist and be validated OK before calling function)
 * @param string $sectionPointer Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be split.)
 * @return array|null Standard content array (title, description, keywords, body keys); null if no parser object is registered for the extension
 */
public function readFileContent($fileExtension, $absoluteFileName, $sectionPointer)
{
    $parser = $this->external_parsers[$fileExtension];
    if (!is_object($parser)) {
        return null;
    }
    return $parser->readFileContent($fileExtension, $absoluteFileName, $sectionPointer);
}
1242
/**
 * Asks the external parser of the extension into which sections the document is divided.
 *
 * @param string $ext File extension
 * @param string $absFile Absolute filename (must exist and be validated OK before calling function)
 * @return array Array of pointers to sections that the document should be divided into; [0] if no parser object is registered for the extension
 */
public function fileContentParts($ext, $absFile)
{
    $parser = $this->external_parsers[$ext];
    if (!is_object($parser)) {
        // Without a parser the document is treated as one single section
        return [0];
    }
    return $parser->fileContentParts($ext, $absFile);
}
1259
/**
 * Wraps non-HTML content (from external files for instance) into a content array.
 *
 * @param string $content Input content (non-HTML) to index.
 * @return array Copy of $this->defaultContentArray with the key "body" set to $content
 * @see splitHTMLContent()
 */
public function splitRegularContent($content)
{
    // Start from the default array and only fill in the body
    $parts = $this->defaultContentArray;
    $parts['body'] = $content;
    return $parts;
}
1273
1274 /**********************************
1275 *
1276 * Analysing content, Extracting words
1277 *
1278 **********************************/
/**
 * Converts every non-empty value of the content array to utf-8 and decodes
 * HTML entities in it. The array is modified in place.
 *
 * @param array $contentArr Standard content array, passed by reference
 * @param string $charset Charset of the input content (converted to utf-8)
 */
public function charsetEntity2utf8(&$contentArr, $charset)
{
    $needsConversion = $charset !== 'utf-8';
    foreach (array_keys($contentArr) as $field) {
        // Empty values are left untouched
        if ((string)$contentArr[$field] === '') {
            continue;
        }
        $text = $contentArr[$field];
        if ($needsConversion) {
            $text = mb_convert_encoding($text, 'utf-8', $charset);
        }
        // Decode all numeric / html-entities in the string to real characters
        $contentArr[$field] = html_entity_decode($text);
    }
}
1298
/**
 * Splits every part of the content array (from the split*Content functions) into words.
 *
 * @param array $contentArr Array of content to index, see splitHTMLContent() and splitRegularContent()
 * @return array Content input array modified so each key is now an array of words (without duplicates for title, keywords and description)
 */
public function processWordsInArrays($contentArr)
{
    // Let the lexer split all parts into words
    foreach (array_keys($contentArr) as $section) {
        $contentArr[$section] = $this->lexerObj->split2Words($contentArr[$section]);
    }
    // For title, keywords, and description duplicates are not wanted
    foreach (['title', 'keywords', 'description'] as $section) {
        $contentArr[$section] = array_unique($contentArr[$section]);
    }
    return $contentArr;
}
1318
/**
 * Extracts the sample description text from the body of the content array.
 *
 * The length is taken from $this->conf['index_descrLgd'], passed through
 * MathUtility::forceIntegerInRange(..., 0, 255, 200). A length of 0 yields an empty description.
 *
 * @param array $contentArr Content array
 * @return string Description string (empty string if the configured length is 0)
 */
public function bodyDescription($contentArr)
{
    // Always return a string: without this default the variable stayed undefined
    // (and null was returned) when the configured length is 0.
    $bodyDescription = '';
    // Setting description
    $maxL = MathUtility::forceIntegerInRange($this->conf['index_descrLgd'], 0, 255, 200);
    if ($maxL) {
        // Collapse whitespace; preg_replace() returns null for invalid utf-8, hence the cast
        $bodyDescription = (string)preg_replace('/\s+/u', ' ', $contentArr['body']);
        // Shorten the string without cutting a multi-byte character in half:
        $bodyDescription = mb_strcut($bodyDescription, 0, $maxL, 'utf-8');
    }
    return $bodyDescription;
}
1336
/**
 * Analyzes content to use for indexing.
 *
 * Title, keywords and description are processed by analyzeHeaderinfo() with the
 * bit offsets 7, 6 and 5, the body by analyzeBody().
 *
 * @param array $content Standard content array: an array with the keys title,keywords,description and body, which all contain an array of words.
 * @return array Index array as filled by analyzeHeaderinfo() and analyzeBody()
 */
public function indexAnalyze($content)
{
    $indexArr = [];
    $headerOffsets = ['title' => 7, 'keywords' => 6, 'description' => 5];
    foreach ($headerOffsets as $section => $bitOffset) {
        $this->analyzeHeaderinfo($indexArr, $content, $section, $bitOffset);
    }
    $this->analyzeBody($indexArr, $content);
    return $indexArr;
}
1352
/**
 * Calculates relevant information for header content (title, keywords, description).
 *
 * For every word an entry is created in (or updated in) the index array with the keys
 * "hash", "metaphone", "cmp" (bit mask of the sections the word was found in) and "count".
 *
 * @param array $retArr Index array, passed by reference
 * @param array $content Standard content array
 * @param string $key Key from standard content array
 * @param int $offset Bit-wise priority to type
 */
public function analyzeHeaderinfo(&$retArr, $content, $key, $offset)
{
    foreach ($content[$key] as $val) {
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        $val = substr($val, 0, 60);
        if (!isset($retArr[$val])) {
            // Word ID (wid)
            $retArr[$val]['hash'] = IndexedSearchUtility::md5inthash($val);
            // Metaphone value is also only 60 chars long
            $metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
            $retArr[$val]['metaphone'] = $metaphone;
        }
        // Build metaphone fulltext string (can be used for fulltext indexing)
        if ($this->storeMetaphoneInfoAsWords) {
            $this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
        }
        // Priority used for flagBitMask feature (see extension configuration).
        // "cmp" does not exist on the first occurrence of a word, so start from 0
        // instead of reading an undefined index.
        $currentFlags = isset($retArr[$val]['cmp']) ? $retArr[$val]['cmp'] : 0;
        $retArr[$val]['cmp'] = $currentFlags | pow(2, $offset);
        // Increase number of occurrences (same reasoning for "count")
        $retArr[$val]['count'] = (isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0) + 1;
        $this->wordcount++;
    }
}
1384
/**
 * Calculates relevant information for body content.
 *
 * For every word not yet in the index array an entry with the keys "first" (position of the
 * first occurrence), "hash" and "metaphone" is created; "count" is increased for every occurrence.
 *
 * @param array $retArr Index array, passed by reference
 * @param array $content Standard content array
 */
public function analyzeBody(&$retArr, $content)
{
    foreach ($content['body'] as $key => $val) {
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        $val = substr($val, 0, 60);
        if (!isset($retArr[$val])) {
            // First occurrence (used for ranking results)
            $retArr[$val]['first'] = $key;
            // Word ID (wid)
            $retArr[$val]['hash'] = IndexedSearchUtility::md5inthash($val);
            // Metaphone value is also only 60 chars long
            $metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
            $retArr[$val]['metaphone'] = $metaphone;
        }
        // Build metaphone fulltext string (can be used for fulltext indexing)
        if ($this->storeMetaphoneInfoAsWords) {
            $this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
        }
        // Increase number of occurrences. "count" does not exist on the first
        // occurrence of a word, so start from 0 instead of incrementing an undefined index.
        $retArr[$val]['count'] = (isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0) + 1;
        $this->wordcount++;
    }
}
1414
/**
 * Creates the metaphone value of a word, either raw or as integer hash.
 *
 * @param string $word Word to convert
 * @param bool $returnRawMetaphoneValue If set, returns the raw metaphone value (not hashed)
 * @return mixed Metaphone hash integer (0 if the raw value is an empty string), or the raw value (string)
 */
public function metaphone($word, $returnRawMetaphoneValue = false)
{
    // A configured metaphone object takes precedence over PHP's native metaphone()
    $rawValue = is_object($this->metaphoneObj)
        ? $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid'])
        : metaphone($word);
    if ($returnRawMetaphoneValue) {
        return $rawValue;
    }
    // Create hash and return integer; an empty raw value maps to 0
    return $rawValue !== '' ? IndexedSearchUtility::md5inthash($rawValue) : 0;
}
1440
1441 /********************************
1442 *
1443 * SQL; TYPO3 Pages
1444 *
1445 *******************************/
/**
 * Writes the index records of the current TYPO3 page (not external media) to the database:
 * index_phash, section and gr_list records, index_fulltext and - in debug mode - index_debug.
 * Existing records for the page hash are removed first.
 */
public function submitPage()
{
    // Remove any current data for this phash:
    $this->removeOldIndexedPages($this->hash['phash']);

    // PROCESSING index_phash
    $phashRow = [
        'phash' => $this->hash['phash'],
        'phash_grouping' => $this->hash['phash_grouping'],
        'cHashParams' => serialize($this->cHashParams),
        'contentHash' => $this->content_md5h,
        'data_page_id' => $this->conf['id'],
        // @deprecated since TYPO3 v9, will be removed in TYPO3 v10. Remove along with database field data_page_reg1
        'data_page_reg1' => $this->conf['page_cache_reg1'],
        'data_page_type' => $this->conf['type'],
        'data_page_mp' => $this->conf['MP'],
        'gr_list' => $this->conf['gr_list'],
        // Item type 0 = TYPO3 page
        'item_type' => 0,
        'item_title' => $this->contentParts['title'],
        'item_description' => $this->bodyDescription($this->contentParts),
        'item_mtime' => (int)$this->conf['mtime'],
        'item_size' => strlen($this->conf['content']),
        'tstamp' => $GLOBALS['EXEC_TIME'],
        'crdate' => $GLOBALS['EXEC_TIME'],
        // Creation date of page
        'item_crdate' => $this->conf['crdate'],
        // Sys language uid of the page. Should reflect which language it DOES actually display!
        'sys_language_uid' => $this->conf['sys_language_uid'],
        'externalUrl' => 0,
        'recordUid' => (int)$this->conf['recordUid'],
        'freeIndexUid' => (int)$this->conf['freeIndexUid'],
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId']
    ];
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash')
            ->insert('index_phash', $phashRow, ['cHashParams' => Connection::PARAM_LOB]);
    }

    // PROCESSING index_section
    $this->submit_section($this->hash['phash'], $this->hash['phash']);

    // PROCESSING index_grlist
    $this->submit_grlist($this->hash['phash'], $this->hash['phash']);

    // PROCESSING index_fulltext
    $fulltextRow = [
        'phash' => $this->hash['phash'],
        'fulltextdata' => implode(' ', $this->contentParts),
        'metaphonedata' => $this->metaphoneContent
    ];
    // Optionally limit the stored fulltext to the configured length
    if ($this->indexerConfig['fullTextDataLength'] > 0) {
        $fulltextRow['fulltextdata'] = substr($fulltextRow['fulltextdata'], 0, $this->indexerConfig['fullTextDataLength']);
    }
    if (IndexedSearchUtility::isTableUsed('index_fulltext')) {
        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_fulltext')
            ->insert('index_fulltext', $fulltextRow);
    }

    // PROCESSING index_debug
    if (!$this->indexerConfig['debugMode']) {
        return;
    }
    $debugRow = [
        'phash' => $this->hash['phash'],
        'debuginfo' => serialize([
            'cHashParams' => $this->cHashParams,
            'external_parsers initialized' => array_keys($this->external_parsers),
            // Content and body are shortened to 1000 bytes for the debug record
            'conf' => array_merge($this->conf, ['content' => substr($this->conf['content'], 0, 1000)]),
            'contentParts' => array_merge($this->contentParts, ['body' => substr($this->contentParts['body'], 0, 1000)]),
            'logs' => $this->internal_log,
            'lexer' => $this->lexerObj->debugString
        ])
    ];
    if (IndexedSearchUtility::isTableUsed('index_debug')) {
        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_debug')
            ->insert('index_debug', $debugRow);
    }
}
1529
/**
 * Stores a gr_list record (index_grlist) for the gr_list of the current configuration.
 *
 * @param int $hash Search result record phash
 * @param int $phash_x Actual phash of current content
 * @see update_grlist()
 */
public function submit_grlist($hash, $phash_x)
{
    $groupList = $this->conf['gr_list'];
    $row = [
        'phash' => $hash,
        'phash_x' => $phash_x,
        'hash_gr_list' => IndexedSearchUtility::md5inthash($groupList),
        'gr_list' => $groupList
    ];
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }
    GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist')
        ->insert('index_grlist', $row);
}
1552
/**
 * Stores a section record (index_section).
 * $hash and $hash_t3 are the same for TYPO3 pages, but different when it is external files.
 *
 * @param int $hash phash of TYPO3 parent search result record
 * @param int $hash_t3 phash of the file indexation search record
 */
public function submit_section($hash, $hash_t3)
{
    $row = [
        'phash' => $hash,
        'phash_t3' => $hash_t3,
        'page_id' => (int)$this->conf['id']
    ];
    // Let getRootLineFields() work on the record before it is stored
    $this->getRootLineFields($row);
    if (!IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }
    GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_section')
        ->insert('index_section', $row);
}
1574
/**
 * Removes all index records of the indexed page identified by $phash.
 *
 * @param int $phash phash value to flush
 */
public function removeOldIndexedPages($phash)
{
    $pool = GeneralUtility::makeInstance(ConnectionPool::class);
    $phash = (int)$phash;

    // Removing old registrations for all tables. Because the pages are TYPO3 pages
    // there can be nothing else than 1-1 relations here.
    foreach (['index_phash', 'index_section', 'index_grlist', 'index_fulltext', 'index_debug'] as $table) {
        if (!IndexedSearchUtility::isTableUsed($table)) {
            continue;
        }
        $pool->getConnectionForTable($table)->delete($table, ['phash' => $phash]);
    }

    // Removing all index_section records with phash_t3 set to this hash (this includes such
    // records set for external media on the page as well!). The re-insert of these records
    // is done in indexRegularDocument($file).
    if (IndexedSearchUtility::isTableUsed('index_section')) {
        $pool->getConnectionForTable('index_section')->delete('index_section', ['phash_t3' => $phash]);
    }
}
1600
1601 /********************************
1602 *
1603 * SQL; External media
1604 *
1605 *******************************/
/**
 * Writes the index records of an external file to the database:
 * index_phash, index_fulltext and - in debug mode - index_debug.
 * Existing records for the file hash are removed first.
 *
 * @param array $hash Array with phash and phash_grouping keys for file
 * @param string $file File name
 * @param array $subinfo Array of "cHashParams" for files: This is for instance the page index for a PDF file (other document types it will be a zero)
 * @param string $ext File extension determining the type of media.
 * @param int $mtime Modification time of file.
 * @param int $ctime Creation time of file.
 * @param int $size Size of file in bytes
 * @param int $content_md5h Content HASH value.
 * @param array $contentParts Standard content array (using only title and body for a file)
 */
public function submitFilePage($hash, $file, $subinfo, $ext, $mtime, $ctime, $size, $content_md5h, $contentParts)
{
    // Find item type: the parser's mapping for the extension, falling back to the extension itself
    $itemType = $this->external_parsers[$ext]->ext2itemtype_map[$ext] ?: $ext;
    // Remove any current data for this phash:
    $this->removeOldIndexedFiles($hash['phash']);
    // Split filename; a scheme marks the document as external URL
    $urlParts = parse_url($file);

    // PROCESSING index_phash
    $phashRow = [
        'phash' => $hash['phash'],
        'phash_grouping' => $hash['phash_grouping'],
        'cHashParams' => serialize($subinfo),
        'contentHash' => $content_md5h,
        'data_filename' => $file,
        'item_type' => $itemType,
        // Fall back to the file name if the document has no title
        'item_title' => trim($contentParts['title']) ?: PathUtility::basename($file),
        'item_description' => $this->bodyDescription($contentParts),
        'item_mtime' => $mtime,
        'item_size' => $size,
        'item_crdate' => $ctime,
        'tstamp' => $GLOBALS['EXEC_TIME'],
        'crdate' => $GLOBALS['EXEC_TIME'],
        'gr_list' => $this->conf['gr_list'],
        'externalUrl' => $urlParts['scheme'] ? 1 : 0,
        'recordUid' => (int)$this->conf['recordUid'],
        'freeIndexUid' => (int)$this->conf['freeIndexUid'],
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId'],
        'sys_language_uid' => (int)$this->conf['sys_language_uid']
    ];
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash')
            ->insert('index_phash', $phashRow, ['cHashParams' => Connection::PARAM_LOB]);
    }

    // PROCESSING index_fulltext
    $fulltextRow = [
        'phash' => $hash['phash'],
        'fulltextdata' => implode(' ', $contentParts),
        'metaphonedata' => $this->metaphoneContent
    ];
    // Optionally limit the stored fulltext to the configured length
    if ($this->indexerConfig['fullTextDataLength'] > 0) {
        $fulltextRow['fulltextdata'] = substr($fulltextRow['fulltextdata'], 0, $this->indexerConfig['fullTextDataLength']);
    }
    if (IndexedSearchUtility::isTableUsed('index_fulltext')) {
        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_fulltext')
            ->insert('index_fulltext', $fulltextRow);
    }

    // PROCESSING index_debug
    if (!$this->indexerConfig['debugMode']) {
        return;
    }
    $debugRow = [
        'phash' => $hash['phash'],
        'debuginfo' => serialize([
            'cHashParams' => $subinfo,
            // The body is shortened to 1000 bytes for the debug record
            'contentParts' => array_merge($contentParts, ['body' => substr($contentParts['body'], 0, 1000)]),
            'logs' => $this->internal_log,
            'lexer' => $this->lexerObj->debugString
        ])
    ];
    if (IndexedSearchUtility::isTableUsed('index_debug')) {
        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_debug')
            ->insert('index_debug', $debugRow);
    }
}
1691
/**
 * Registers a gr_list entry for an indexed file unless a matching one is already stored.
 *
 * An existing index_grlist row matches when it has the given phash and its hash_gr_list
 * equals the hash of either $this->defaultGrList or $this->conf['gr_list']. Only if no
 * such row is found, submit_grlist() is called with the phash for both arguments.
 *
 * @param int $hash phash value of file
 */
public function submitFile_grlist($hash)
{
    // Nothing to check or store when the table is not in use.
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }

    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getQueryBuilderForTable('index_grlist');
    $expr = $queryBuilder->expr();

    $samePhash = $expr->eq(
        'phash',
        $queryBuilder->createNamedParameter($hash, \PDO::PARAM_INT)
    );
    // A record for the default group list makes another record superfluous.
    $defaultGroups = $expr->eq(
        'hash_gr_list',
        $queryBuilder->createNamedParameter(
            IndexedSearchUtility::md5inthash($this->defaultGrList),
            \PDO::PARAM_INT
        )
    );
    $currentGroups = $expr->eq(
        'hash_gr_list',
        $queryBuilder->createNamedParameter(
            IndexedSearchUtility::md5inthash($this->conf['gr_list']),
            \PDO::PARAM_INT
        )
    );

    $existing = (int)$queryBuilder->count('*')
        ->from('index_grlist')
        ->where($samePhash, $expr->orX($defaultGroups, $currentGroups))
        ->execute()
        ->fetchColumn();

    if ($existing === 0) {
        $this->submit_grlist($hash, $hash);
    }
}
1737
/**
 * Registers a section entry for an indexed file unless one is already stored.
 *
 * Looks for index_section rows carrying the given phash on the current page
 * ($this->conf['id']). If none exists, submit_section() is called with the file phash
 * and $this->hash['phash'].
 *
 * @param int $hash phash value of file
 */
public function submitFile_section($hash)
{
    // Nothing to check or store when the table is not in use.
    if (!IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }

    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getQueryBuilderForTable('index_section');
    $expr = $queryBuilder->expr();

    $samePhash = $expr->eq(
        'phash',
        $queryBuilder->createNamedParameter($hash, \PDO::PARAM_INT)
    );
    $samePage = $expr->eq(
        'page_id',
        $queryBuilder->createNamedParameter($this->conf['id'], \PDO::PARAM_INT)
    );

    $existing = (int)$queryBuilder->count('phash')
        ->from('index_section')
        ->where($samePhash, $samePage)
        ->execute()
        ->fetchColumn();

    if ($existing === 0) {
        $this->submit_section($hash, $this->hash['phash']);
    }
}
1771
/**
 * Deletes all rows with the given phash from index_phash, index_grlist,
 * index_fulltext and index_debug. Tables that are not in use are left alone.
 *
 * @param int $phash phash value to flush
 */
public function removeOldIndexedFiles($phash)
{
    $pool = GeneralUtility::makeInstance(ConnectionPool::class);
    $identifier = ['phash' => (int)$phash];

    foreach (['index_phash', 'index_grlist', 'index_fulltext', 'index_debug'] as $tableName) {
        if (IndexedSearchUtility::isTableUsed($tableName)) {
            $pool->getConnectionForTable($tableName)->delete($tableName, $identifier);
        }
    }
}
1789
1790 /********************************
1791 *
1792 * SQL Helper functions
1793 *
1794 *******************************/
/**
 * Decides, based on the index_phash record for $phash, whether the page/file has to be (re-)indexed.
 *
 * Result codes (see $this->reasons):
 *   4  No index_phash record found (or the table is not in use): index.
 *   1  Max age exceeded: index again.
 *  -2  Min age not exceeded yet: do not index.
 *   3  No mtime given: index.
 *   2  Stored item_mtime differs from the given mtime: index.
 *  -1  mtime matched: do not index. If no max age is configured, the record's
 *      tstamp is refreshed via updateTstamp().
 *
 * @param int $mtime mtime value to test against limits and indexed page (usually this is the mtime of the cached document)
 * @param int $phash "phash" used to select any already indexed page to see what its mtime is.
 * @return int Result integer: <0 = no indexing, >0 = do indexing
 */
public function checkMtimeTstamp($mtime, $phash)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        // Not indexed (not in index_phash)
        return 4;
    }

    $row = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->select(
            ['item_mtime', 'tstamp'],
            'index_phash',
            ['phash' => (int)$phash],
            [],
            [],
            1
        )
        ->fetch();

    if (empty($row)) {
        // Page has never been indexed (is not represented in the index_phash table).
        return 4;
    }

    if ($this->tstamp_maxAge && $row['tstamp'] + $this->tstamp_maxAge < $GLOBALS['EXEC_TIME']) {
        // The configured max-age was exceeded for the document and thus it's indexed.
        return 1;
    }

    if ($this->tstamp_minAge && $row['tstamp'] + $this->tstamp_minAge >= $GLOBALS['EXEC_TIME']) {
        // The minimum age was not exceeded
        return -2;
    }

    // From here on min age is either not set or exceeded, so mtime decides.
    if (!$mtime) {
        // The minimum age was exceeded, but mtime was not set, so the page is indexed.
        return 3;
    }

    if ($row['item_mtime'] != $mtime) {
        // The minimum age was exceeded, mtime was set and differs from the stored one, so the page is indexed.
        return 2;
    }

    // mtime matched the document, so no changes detected and no content updated
    if ($this->tstamp_maxAge) {
        $this->log_setTSlogMessage('mtime matched, timestamp NOT updated because a maxAge is set (' . ($row['tstamp'] + $this->tstamp_maxAge - $GLOBALS['EXEC_TIME']) . ' seconds to expire time).', 1);
    } else {
        $this->updateTstamp($phash);
        $this->log_setTSlogMessage('mtime matched, timestamp updated.', 1);
    }
    return -1;
}
1860
/**
 * Looks up an index_phash record with the same "phash_grouping" and content hash as the current page.
 *
 * The page only needs indexing if its content differs from every record of the same
 * "phash_grouping" page, i.e. if no such record is found.
 *
 * @return mixed TRUE if the page needs to be indexed (no record found or table not in use), otherwise the found row (array with key "phash") to which the grlist record should be related
 */
public function checkContentHash()
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return true;
    }

    $match = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->select(
            ['phash'],
            'index_phash',
            [
                'phash_grouping' => (int)$this->hash['phash_grouping'],
                'contentHash' => (int)$this->content_md5h
            ],
            [],
            [],
            1
        )
        ->fetch();

    return !empty($match) ? $match : true;
}
1891
/**
 * Checks whether an external document with the given grouping hash and content hash
 * is already present in index_phash.
 *
 * @param int $hashGr phash value to check (phash_grouping)
 * @param int $content_md5h Content hash to check
 * @return bool TRUE if the document needs to be indexed (no matching record, or table not in use)
 */
public function checkExternalDocContentHash($hashGr, $content_md5h)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return true;
    }

    $matches = (int)GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->count(
            '*',
            'index_phash',
            [
                'phash_grouping' => (int)$hashGr,
                'contentHash' => (int)$content_md5h
            ]
        );

    return $matches === 0;
}
1919
/**
 * Tells whether index_grlist holds at least one record whose phash_x equals the given value
 * (the "real" phash of the current content, not the linked-to phash of the common search result page).
 *
 * @param int $phash_x Phash integer to test.
 * @return bool TRUE if a record exists; FALSE if none exists or the table is not in use
 */
public function is_grlist_set($phash_x)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return false;
    }

    $matches = (int)GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist')
        ->count(
            'phash_x',
            'index_grlist',
            ['phash_x' => (int)$phash_x]
        );

    return $matches > 0;
}
1942
/**
 * Writes a grlist entry for the given phash and the current gr_list ($this->conf['gr_list'])
 * if no such entry exists yet, and logs the insertion.
 *
 * @param int $phash phash of the search result that should be found
 * @param int $phash_x The real phash of the current content. The two values are different when a page with userlogin turns out to contain the exact same content as another already indexed version of the page; This is the whole reason for the grlist table in fact...
 * @see submit_grlist()
 */
public function update_grlist($phash, $phash_x)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }

    $existing = (int)GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist')
        ->count(
            'phash',
            'index_grlist',
            [
                'phash' => (int)$phash,
                'hash_gr_list' => IndexedSearchUtility::md5inthash($this->conf['gr_list'])
            ]
        );

    if ($existing !== 0) {
        return;
    }

    $this->submit_grlist($phash, $phash_x);
    $this->log_setTSlogMessage('Inserted gr_list \'' . $this->conf['gr_list'] . '\' for phash \'' . $phash . '\'', 1);
}
1970
/**
 * Sets the tstamp of the index_phash row(s) with the given phash to the current
 * execution time ($GLOBALS['EXEC_TIME']).
 *
 * @param int $phash phash value
 * @param int $mtime If set (non-empty), item_mtime is updated to this value as well.
 */
public function updateTstamp($phash, $mtime = 0)
{
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $values = ['tstamp' => $GLOBALS['EXEC_TIME']];
        if ($mtime) {
            $values['item_mtime'] = (int)$mtime;
        }

        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash');
        $connection->update('index_phash', $values, ['phash' => (int)$phash]);
    }
}
2001
/**
 * Writes the configured freeIndexSetId ($this->conf['freeIndexSetId']) into the
 * index_phash row(s) with the given phash.
 *
 * @param int $phash phash value
 */
public function updateSetId($phash)
{
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash');
        $connection->update(
            'index_phash',
            ['freeIndexSetId' => (int)$this->conf['freeIndexSetId']],
            ['phash' => (int)$phash]
        );
    }
}
2025
/**
 * Stores the parse time in the index_phash row(s) with the given phash.
 *
 * @param int $phash phash value.
 * @param int $parsetime Parsetime value to set.
 */
public function updateParsetime($phash, $parsetime)
{
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash');
        $connection->update(
            'index_phash',
            ['parsetime' => (int)$parsetime],
            ['phash' => (int)$phash]
        );
    }
}
2050
/**
 * Refreshes the rootline fields (as provided by getRootLineFields()) of all
 * index_section rows belonging to the current page ($this->conf['id']).
 */
public function updateRootline()
{
    if (IndexedSearchUtility::isTableUsed('index_section')) {
        $rootLineValues = [];
        $this->getRootLineFields($rootLineValues);

        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_section');
        $connection->update(
            'index_section',
            $rootLineValues,
            ['page_id' => (int)$this->conf['id']]
        );
    }
}
2073
/**
 * Adding values for root-line fields.
 * rl0, rl1 and rl2 are standard. More fields can be registered as a map of
 * "field name => rootline level" in
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'].
 *
 * Levels that are not present in $this->conf['rootline_uids'] are stored as 0
 * without raising an "undefined offset" notice/warning.
 *
 * @param array $fieldArray Field array, passed by reference
 */
public function getRootLineFields(array &$fieldArray)
{
    // Standard fields: rl0..rl2 map to rootline levels 0..2.
    foreach (['rl0' => 0, 'rl1' => 1, 'rl2' => 2] as $fieldName => $rootLineLevel) {
        $fieldArray[$fieldName] = (int)($this->conf['rootline_uids'][$rootLineLevel] ?? 0);
    }
    // Additionally configured fields; assigned last so they win over a standard field of the same name.
    foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'] ?? [] as $fieldName => $rootLineLevel) {
        $fieldArray[$fieldName] = (int)($this->conf['rootline_uids'][$rootLineLevel] ?? 0);
    }
}
2089
2090 /********************************
2091 *
2092 * SQL; Submitting words
2093 *
2094 *******************************/
/**
 * Adds new words to db.
 *
 * First counts the index_words rows whose wid is among the hashes of the given words.
 * Only if that count differs from the number of given words, the already stored
 * basewords are loaded and removed from the list, and the remaining words are inserted.
 *
 * @param array $wordListArray Word List array (where each word has information about position etc), keyed by baseword.
 */
public function checkWordList($wordListArray)
{
    if (!IndexedSearchUtility::isTableUsed('index_words') || empty($wordListArray)) {
        return;
    }

    $expectedCount = count($wordListArray);
    $wordIds = array_map('intval', array_column($wordListArray, 'hash'));

    $countQuery = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('index_words');
    $countQuery->count('baseword')
        ->from('index_words')
        ->where(
            $countQuery->expr()->in(
                'wid',
                $countQuery->createNamedParameter($wordIds, Connection::PARAM_INT_ARRAY)
            )
        );
    $storedCount = (int)$countQuery->execute()->fetchColumn();

    if ($storedCount === $expectedCount) {
        // Every word is already known.
        return;
    }

    $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('index_words');
    $selectQuery = $connection->createQueryBuilder();
    $selectQuery->select('baseword')
        ->from('index_words')
        ->where(
            $selectQuery->expr()->in(
                'wid',
                $selectQuery->createNamedParameter($wordIds, Connection::PARAM_INT_ARRAY)
            )
        );
    $statement = $selectQuery->execute();

    $this->log_setTSlogMessage('Inserting words: ' . ($expectedCount - $storedCount), 1);

    // Drop every word that is stored already, so only new ones remain.
    while ($storedWord = $statement->fetch()) {
        unset($wordListArray[$storedWord['baseword']]);
    }

    foreach ($wordListArray as $baseword => $wordData) {
        // A duplicate-key error will occur here if a word was NOT removed from the list above.
        // However as long as the words are NOT longer than 60 chars (the baseword varchar is
        // 60 characters...) this is not a problem.
        $connection->insert(
            'index_words',
            [
                'wid' => $wordData['hash'],
                'baseword' => $baseword,
                'metaphone' => $wordData['metaphone']
            ]
        );
    }
}
2155
/**
 * Submits RELATIONS between words and phash.
 *
 * All existing index_rel rows of the phash are deleted first. Then one row per word is
 * bulk-inserted, skipping words whose wid is flagged as stop word in index_words.
 *
 * @param array $wordList Word list array
 * @param int $phash phash value
 */
public function submitWords($wordList, $phash)
{
    if (!IndexedSearchUtility::isTableUsed('index_rel')) {
        return;
    }

    $pool = GeneralUtility::makeInstance(ConnectionPool::class);

    // Collect the ids of all words marked as stop words.
    $stopWordQuery = $pool->getQueryBuilderForTable('index_words');
    $stopWordQuery->select('wid')
        ->from('index_words')
        ->where(
            $stopWordQuery->expr()->neq('is_stopword', $stopWordQuery->createNamedParameter(0, \PDO::PARAM_INT))
        )
        ->groupBy('wid');
    $statement = $stopWordQuery->execute();

    $stopWordIds = [];
    while ($stopWord = $statement->fetch()) {
        $stopWordIds[$stopWord['wid']] = true;
    }

    // Remove the old relations of this phash before writing the new ones.
    $pool->getConnectionForTable('index_rel')->delete('index_rel', ['phash' => (int)$phash]);

    $relations = [];
    foreach ($wordList as $word) {
        if (!isset($stopWordIds[$word['hash']])) {
            $relations[] = [
                (int)$phash,
                (int)$word['hash'],
                (int)$word['count'],
                (int)$word['first'],
                $this->freqMap($word['count'] / $this->wordcount),
                $word['cmp'] & $this->flagBitMask
            ];
        }
    }

    if (!empty($relations)) {
        $pool->getConnectionForTable('index_rel')->bulkInsert(
            'index_rel',
            $relations,
            ['phash', 'wid', 'count', 'first', 'freq', 'flags']
        );
    }
}
2204
/**
 * Maps a frequency onto the configured frequency range, and back.
 *
 * With factor = $this->freqMax * 100 * $this->freqRange:
 * - a value <= 1 is multiplied by the factor and capped at $this->freqRange,
 * - a value > 1 is divided by the factor.
 *
 * The result is not cast, so it may be a float.
 *
 * @param float $freq Frequency
 * @return int|float Frequency in range.
 */
public function freqMap($freq)
{
    $factor = $this->freqMax * 100 * $this->freqRange;

    if ($freq <= 1) {
        $scaled = $freq * $factor;
        return $scaled > $this->freqRange ? $this->freqRange : $scaled;
    }

    return $freq / $factor;
}
2223
2224 /********************************
2225 *
2226 * Hashing
2227 *
2228 *******************************/
/**
 * Calculates the search hashes for a TYPO3 page and stores them in $this->hash.
 *
 * - "phash_grouping" identifies a "page" as the combination of id, type, language,
 *   mount point and cHash parameters.
 * - "phash" additionally includes the gr_list. It is a subdivision taking special page
 *   composition based on login into account; such pages are expected to be normally
 *   similar regardless of the login.
 */
public function setT3Hashes()
{
    // The key order is part of the serialized string and therefore of the hash.
    $pageIdentity = [
        'id' => (int)$this->conf['id'],
        'type' => (int)$this->conf['type'],
        'sys_lang' => (int)$this->conf['sys_language_uid'],
        'MP' => (string)$this->conf['MP'],
        'cHash' => $this->cHashParams
    ];
    $this->hash['phash_grouping'] = IndexedSearchUtility::md5inthash(serialize($pageIdentity));

    // "gr_list" is appended as last key.
    $pageIdentityWithGroups = $pageIdentity + ['gr_list' => (string)$this->conf['gr_list']];
    $this->hash['phash'] = IndexedSearchUtility::md5inthash(serialize($pageIdentityWithGroups));
}
2248
/**
 * Calculates the search hashes for an external file.
 *
 * "phash_grouping" is derived from the file alone, "phash" from the file plus $subinfo.
 *
 * @param string $file File name / path which identifies it on the server
 * @param array $subinfo Additional content identifying the (subpart of) content. For instance; PDF files are divided into groups of pages for indexing.
 * @return array Array with "phash_grouping" and "phash" inside.
 */
public function setExtHashes($file, $subinfo = [])
{
    // The key order is part of the serialized string and therefore of the hash.
    $fileIdentity = ['file' => $file];
    $partIdentity = $fileIdentity + ['subinfo' => $subinfo];

    return [
        'phash_grouping' => IndexedSearchUtility::md5inthash(serialize($fileIdentity)),
        'phash' => IndexedSearchUtility::md5inthash(serialize($partIdentity))
    ];
}
2270
2271 /*********************************
2272 *
2273 * Internal logging functions
2274 *
2275 *********************************/
/**
 * Forwards a push() call to the time tracker ($this->timeTracker).
 *
 * @param string $msg Title to set
 * @param string $key Second argument, handed over unchanged to the time tracker's push()
 */
public function log_push($msg, $key)
{
    $this->timeTracker->push($msg, $key);
}
2286
/**
 * Forwards a pull() call to the time tracker ($this->timeTracker).
 */
public function log_pull()
{
    $this->timeTracker->pull();
}
2294
/**
 * Hands a log message to the time tracker and additionally appends it to
 * $this->internal_log.
 *
 * @param string $msg Message to set
 * @param int $errorNum Error number
 */
public function log_setTSlogMessage($msg, $errorNum = 0)
{
    $this->timeTracker->setTSlogMessage($msg, $errorNum);
    // Keep a local copy of every message next to the time tracker entry.
    $this->internal_log[] = $msg;
}
2306
/**
 * Makes sure that keywords are space-separated. This is important for their
 * proper display as a part of the fulltext index.
 *
 * The comma-separated list is split with GeneralUtility::trimExplode(), re-joined
 * with ", " and wrapped in one leading and one trailing space.
 *
 * @param string $keywordList
 * @return string
 * @see http://forge.typo3.org/issues/14959
 */
protected function addSpacesToKeywordList($keywordList)
{
    return ' ' . implode(', ', GeneralUtility::trimExplode(',', $keywordList)) . ' ';
}
2320 }