[TASK] Use strict comparison for strings
[Packages/TYPO3.CMS.git] / typo3 / sysext / indexed_search / Classes / Indexer.php
1 <?php
2 namespace TYPO3\CMS\IndexedSearch;
3
4 /*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use TYPO3\CMS\Core\Database\Connection;
18 use TYPO3\CMS\Core\Database\ConnectionPool;
19 use TYPO3\CMS\Core\TimeTracker\TimeTracker;
20 use TYPO3\CMS\Core\Utility\GeneralUtility;
21 use TYPO3\CMS\Core\Utility\MathUtility;
22 use TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController;
23 use TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility;
24
25 /**
26 * Indexing class for TYPO3 frontend
27 */
28 class Indexer
29 {
/**
 * Log messages keyed by the status code that checkMtimeTstamp() yields in
 * indexTypo3PageContent(); used there to explain why a page was (not) indexed.
 *
 * @var array
 */
public $reasons = [
    -1 => 'mtime matched the document, so no changes detected and no content updated',
    -2 => 'The minimum age was not exceeded',
    1 => 'The configured max-age was exceeded for the document and thus it\'s indexed.',
    2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
    3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
    4 => 'Page has never been indexed (is not represented in the index_phash table).'
];

/**
 * HTML code blocks to exclude from indexing (comma separated tag names,
 * exploded in splitHTMLContent()).
 *
 * @var string
 */
public $excludeSections = 'script,style';

/**
 * External parser objects, keys are file extension names. Values are objects with certain methods.
 * Filled by initializeExternalParsers().
 *
 * @var array
 */
public $external_parsers = [];

/**
 * Fe-group list (pages might be indexed separately for each usergroup combination to support search
 * in access limited pages!)
 *
 * @var string
 */
public $defaultGrList = '0,-1';

/**
 * Min/Max times:
 * If set, this tells a number of seconds that is the maximum age of an indexed document.
 * Regardless of mtime the document will be re-indexed if this limit is exceeded.
 * Set in init() from the "maxAge" extension setting multiplied by 3600.
 *
 * @var int
 */
public $tstamp_maxAge = 0;

/**
 * If set, this tells a minimum limit before a document can be indexed again. This is regardless of mtime.
 * Set in init() from the "minAge" extension setting multiplied by 3600.
 *
 * @var int
 */
public $tstamp_minAge = 0;

/**
 * Max number of external files to index.
 * Set in init() from the "maxExternalFiles" extension setting (range 0-1000, default 5).
 *
 * @var int
 */
public $maxExternalFiles = 0;

/**
 * If TRUE, indexing is forced despite of hashes etc.
 *
 * @var bool
 */
public $forceIndexing = false;

/**
 * Set when crawler is detected (internal)
 *
 * @var bool
 */
public $crawlerActive = false;

/**
 * Empty content parts; splitHTMLContent() starts from a copy of this array.
 *
 * @var array
 */
public $defaultContentArray = [
    'title' => '',
    'description' => '',
    'keywords' => '',
    'body' => ''
];

/**
 * @var int
 */
public $wordcount = 0;

/**
 * @var int
 */
public $externalFileCounter = 0;

/**
 * Configuration set internally (see init functions for required keys and their meaning)
 *
 * @var array
 */
public $conf = [];

/**
 * Indexer configuration, coming from $GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']
 *
 * @var array
 */
public $indexerConfig = [];

/**
 * Hash array, contains phash and phash_grouping
 *
 * @var array
 */
public $hash = [];

/**
 * Hash array for files
 *
 * @var array
 */
public $file_phash_arr = [];

/**
 * Content of TYPO3 page, split into parts by splitHTMLContent()
 *
 * @var array
 */
public $contentParts = [];

/**
 * Hash over the concatenated content parts, calculated in indexTypo3PageContent().
 * NOTE(review): declared as string, but the value comes from IndexedSearchUtility::md5inthash() - confirm the type.
 *
 * @var string
 */
public $content_md5h = '';

/**
 * Internal log
 *
 * @var array
 */
public $internal_log = [];

/**
 * Content most recently fetched by indexExternalUrl()
 *
 * @var string
 */
public $indexExternalUrl_content = '';

/**
 * cHashparams array; prepared in init() from $this->conf['cHash_array']
 *
 * @var array
 */
public $cHashParams = [];

/**
 * @var int
 */
public $freqRange = 32000;

/**
 * @var float
 */
public $freqMax = 0.1;

/**
 * Whether metaphone search is enabled; set in init() from the extension configuration
 * (treated as enabled when the setting does not exist yet).
 *
 * @var bool
 */
public $enableMetaphoneSearch = false;

/**
 * Set in init(): TRUE when the "index_words" table is not used and metaphone search is enabled.
 *
 * @var bool
 */
public $storeMetaphoneInfoAsWords;

/**
 * @var string
 */
public $metaphoneContent = '';

/**
 * Charset class object
 *
 * @var \TYPO3\CMS\Core\Charset\CharsetConverter
 */
public $csObj;

/**
 * Metaphone object, if any
 *
 * @var \TYPO3\CMS\IndexedSearch\Utility\DoubleMetaPhoneUtility
 */
public $metaphoneObj;

/**
 * Lexer object for word splitting
 *
 * @var \TYPO3\CMS\IndexedSearch\Lexer
 */
public $lexerObj;

/**
 * Bit mask taken from the "flagBitMask" extension setting; init() forces it into the integer range 0-255.
 *
 * @var int
 */
public $flagBitMask;

/**
 * @var TimeTracker
 */
protected $timeTracker;
237
/**
 * Indexer constructor.
 *
 * Obtains the TimeTracker instance through GeneralUtility::makeInstance()
 * and keeps it in $this->timeTracker.
 */
public function __construct()
{
    $timeTracker = GeneralUtility::makeInstance(TimeTracker::class);
    $this->timeTracker = $timeTracker;
}
245
/**
 * Parent Object (TSFE) Initialization
 *
 * Decides whether the page that is currently rendered has to be indexed. If so, the
 * relevant state of the parent object is copied into $this->conf, the indexer is
 * initialized and the page content is indexed. Every reason for not indexing is
 * written to the TS log.
 *
 * @param TypoScriptFrontendController $pObj Parent Object, passed by reference
 * @return void
 */
public function hook_indexContent(&$pObj)
{
    // Indexer configuration from Extension Manager interface:
    $indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search'], ['allowed_classes' => false]);
    // Crawler activation:
    // Requirements are that the crawler is loaded, a crawler session is running and re-indexing requested as processing instruction.
    // The crawler data is optional, so every key is read defensively.
    $crawlerData = $pObj->applicationData['tx_crawler'] ?? [];
    $procInstructions = $crawlerData['parameters']['procInstructions'] ?? [];
    if (
        \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler')
        && !empty($crawlerData['running'])
        && is_array($procInstructions)
        // Strict comparison: the instruction name must match as a string.
        && in_array('tx_indexedsearch_reindex', $procInstructions, true)
    ) {
        // Setting simple log message:
        $pObj->applicationData['tx_crawler']['log'][] = 'Forced Re-indexing enabled';
        // Crawler active flag
        $this->crawlerActive = true;
        $this->forceIndexing = true;
    }
    // Determine if page should be indexed, and if so, configure and initialize indexer
    if (empty($pObj->config['config']['index_enable'])) {
        return;
    }
    $this->log_push('Index page', '');
    if (!empty($indexerConfig['disableFrontendIndexing']) && !$this->crawlerActive) {
        $this->log_setTSlogMessage('Index page? No, Ordinary Frontend indexing during rendering is disabled.');
    } elseif ($pObj->page['no_search']) {
        $this->log_setTSlogMessage('Index page? No, The "No Search" flag has been set in the page properties!');
    } elseif ($pObj->no_cache) {
        $this->log_setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
    } elseif ((int)$pObj->sys_language_uid !== (int)$pObj->sys_language_content) {
        $this->log_setTSlogMessage('Index page? No, ->sys_language_uid was different from sys_language_content which indicates that the page contains fall-back content and that would be falsely indexed as localized content.');
    } else {
        // Root line uids
        $rootlineUids = [];
        foreach ($pObj->config['rootLine'] as $rlkey => $rldat) {
            $rootlineUids[$rlkey] = $rldat['uid'];
        }
        // Setting up internal configuration from the parent object:
        $this->conf = [
            // Information about page for which the indexing takes place
            // Page id
            'id' => $pObj->id,
            // Page type
            'type' => $pObj->type,
            // sys_language UID of the language of the indexing.
            'sys_language_uid' => $pObj->sys_language_uid,
            // MP variable, if any (Mount Points)
            'MP' => $pObj->MP,
            // Group list
            'gr_list' => $pObj->gr_list,
            // cHash string for additional parameters
            'cHash' => $pObj->cHash,
            // Array of the additional parameters
            'cHash_array' => $pObj->cHash_array,
            // The creation date of the TYPO3 page
            'crdate' => $pObj->page['crdate'],
            // reg1 of the caching table. Not known what practical use this has.
            'page_cache_reg1' => $pObj->page_cache_reg1,
            'rootline_uids' => $rootlineUids,
            // Content string (HTML of TYPO3 page)
            'content' => $pObj->content,
            // Alternative title for indexing
            'indexedDocTitle' => $pObj->convOutputCharset($pObj->indexedDocTitle),
            // Character set of content (will be converted to utf-8 during indexing)
            'metaCharset' => $pObj->metaCharset,
            // Most recent modification time (seconds) of the content on the page. Used to evaluate whether it should be re-indexed.
            'mtime' => $pObj->register['SYS_LASTCHANGED'] ?? $pObj->page['SYS_LASTCHANGED'],
            // Configuration of behavior:
            // Whether to index external documents like PDF, DOC etc. (if possible)
            'index_externals' => $pObj->config['config']['index_externals'] ?? null,
            // Length of description text (max 250, default 200)
            'index_descrLgd' => $pObj->config['config']['index_descrLgd'] ?? null,
            // Whether to index document keywords and description (enabled unless configured otherwise)
            'index_metatags' => $pObj->config['config']['index_metatags'] ?? true,
            // Set to zero:
            'recordUid' => 0,
            'freeIndexUid' => 0,
            'freeIndexSetId' => 0,
        ];
        // Init and start indexing:
        $this->init();
        $this->indexTypo3PageContent();
    }
    $this->log_pull();
}
336
337 /****************************
338 *
339 * Backend API
340 *
341 ****************************/
/**
 * Initializing the "combined ID" of the page (phash) being indexed (or for which external media is attached)
 *
 * Fills $this->conf with the given page identification plus fixed defaults and calls init().
 *
 * @param int $id The page uid, &id=
 * @param int $type The page type, &type=
 * @param int $sys_language_uid sys_language uid, typically &L=
 * @param string $MP The MP variable (Mount Points), &MP=
 * @param array $uidRL Rootline array of only UIDs.
 * @param array $cHash_array Array of GET variables to register with this indexing
 * @param bool $createCHash If set, calculates a cHash value from the $cHash_array. Probably you will not do that since such cases are indexed through the frontend and the idea of this interface is to index non-cacheable pages from the backend!
 * @return void
 */
public function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array = [], $createCHash = false)
{
    // Internal configuration; the key order matches the order in which the values are documented below.
    $this->conf = [
        // Page id (int)
        'id' => $id,
        // Page type (int)
        'type' => $type,
        // sys_language UID of the language of the indexing (int)
        'sys_language_uid' => $sys_language_uid,
        // MP variable, if any (Mount Points) (string)
        'MP' => $MP,
        // Group list (hardcoded for now...)
        'gr_list' => '0,-1',
        // cHash string for additional parameters, replaced below if requested
        'cHash' => '',
        // Array of the additional parameters
        'cHash_array' => $cHash_array,
        // Set to defaults
        'freeIndexUid' => 0,
        'freeIndexSetId' => 0,
        'page_cache_reg1' => '',
        // Root line uids
        'rootline_uids' => $uidRL,
        // Whether to index external documents like PDF, DOC etc. (if possible)
        'index_externals' => 1,
        // Length of description text (max 250, default 200)
        'index_descrLgd' => 200,
        // Whether to index document keywords and description (if present)
        'index_metatags' => true,
    ];
    if ($createCHash) {
        /* @var $cacheHash \TYPO3\CMS\Frontend\Page\CacheHashCalculator */
        $cacheHash = GeneralUtility::makeInstance(\TYPO3\CMS\Frontend\Page\CacheHashCalculator::class);
        $queryString = GeneralUtility::implodeArrayForUrl('', $cHash_array);
        $this->conf['cHash'] = $cacheHash->generateForParameters($queryString);
    }
    // Init and start indexing:
    $this->init();
}
396
/**
 * Sets the free-index uid. Can be called right after backend_initIndexer()
 *
 * Both values are stored in $this->conf under the keys "freeIndexUid" and "freeIndexSetId".
 *
 * @param int $freeIndexUid Free index UID
 * @param int $freeIndexSetId Set id - an integer identifying the "set" of indexing operations.
 * @return void
 */
public function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId = 0)
{
    $freeIndexSettings = [
        'freeIndexUid' => $freeIndexUid,
        'freeIndexSetId' => $freeIndexSetId,
    ];
    foreach ($freeIndexSettings as $key => $value) {
        $this->conf[$key] = $value;
    }
}
409
/**
 * Indexing records as the content of a TYPO3 page.
 *
 * The given values are escaped with htmlspecialchars() and wrapped into a minimal HTML
 * document which is then indexed through indexTypo3PageContent().
 *
 * @param string $title Title equivalent
 * @param string $keywords Keywords equivalent
 * @param string $description Description equivalent
 * @param string $content The main content to index
 * @param string $charset The charset of the title, keyword, description and body-content. MUST BE VALID, otherwise nothing is indexed!
 * @param int $mtime Last modification time, in seconds
 * @param int $crdate The creation date of the content, in seconds
 * @param int $recordUid The record UID that the content comes from (for registration with the indexed rows)
 * @return void
 */
public function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate = 0, $recordUid = 0)
{
    $escapedTitle = htmlspecialchars($title);
    $escapedKeywords = htmlspecialchars($keywords);
    $escapedDescription = htmlspecialchars($description);
    $escapedContent = htmlspecialchars($content);
    // Most recent modification time (seconds) of the content
    $this->conf['mtime'] = $mtime;
    // The creation date of the TYPO3 content
    $this->conf['crdate'] = $crdate;
    // UID of the record, if applicable
    $this->conf['recordUid'] = $recordUid;
    // Content string: fake HTML of a TYPO3 page, constructed for parsing
    $this->conf['content'] = '
        <html>
            <head>
                <title>' . $escapedTitle . '</title>
                <meta name="keywords" content="' . $escapedKeywords . '" />
                <meta name="description" content="' . $escapedDescription . '" />
            </head>
            <body>
                ' . $escapedContent . '
            </body>
        </html>';
    // Character set of content (will be converted to utf-8 during indexing)
    $this->conf['metaCharset'] = $charset;
    // Alternative title for indexing
    $this->conf['indexedDocTitle'] = '';
    // Index content as if it was a TYPO3 page:
    $this->indexTypo3PageContent();
}
453
454 /********************************
455 *
456 * Initialization
457 *
458 *******************************/
/**
 * Initializes the object. $this->conf MUST be set with proper values prior to this call!!!
 *
 * Prepares the cHash parameters and page hashes, reads the extension configuration into the
 * age/limit properties and creates the helper objects (external parsers, lexer, metaphone
 * hook object, charset converter).
 *
 * @return void
 */
public function init()
{
    // cHash parameters: add the cHash itself so that URLs come out right and drop the encryption key.
    $additionalParams = $this->conf['cHash_array'];
    if (is_array($additionalParams) && count($additionalParams) > 0) {
        if ($this->conf['cHash']) {
            $additionalParams['cHash'] = $this->conf['cHash'];
        }
        unset($additionalParams['encryptionKey']);
    }
    $this->cHashParams = $additionalParams;
    // Setting phash / phash_grouping which identifies the indexed page based on some of these variables:
    $this->setT3Hashes();
    // Indexer configuration from Extension Manager interface:
    $settings = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search'], ['allowed_classes' => false]);
    $this->indexerConfig = $settings;
    // The age settings are multiplied by 3600 before they are stored.
    $this->tstamp_minAge = MathUtility::forceIntegerInRange($settings['minAge'] * 3600, 0);
    $this->tstamp_maxAge = MathUtility::forceIntegerInRange($settings['maxAge'] * 3600, 0);
    $this->maxExternalFiles = MathUtility::forceIntegerInRange($settings['maxExternalFiles'], 0, 1000, 5);
    $this->flagBitMask = MathUtility::forceIntegerInRange($settings['flagBitMask'], 0, 255);
    // Workaround: If the extension configuration was not updated yet, the value is not existing
    if (isset($settings['enableMetaphoneSearch'])) {
        $this->enableMetaphoneSearch = $settings['enableMetaphoneSearch'];
    } else {
        $this->enableMetaphoneSearch = true;
    }
    $this->storeMetaphoneInfoAsWords = !IndexedSearchUtility::isTableUsed('index_words') && $this->enableMetaphoneSearch;
    // Initialize external document parsers:
    // Example configuration, see ext_localconf.php of this file!
    if ($this->conf['index_externals']) {
        $this->initializeExternalParsers();
    }
    // Initialize lexer (class that deconstructs the text into words):
    $lexerObjRef = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['lexer'] ?: 'TYPO3\\CMS\\IndexedSearch\\Lexer';
    $this->lexerObj = GeneralUtility::getUserObj($lexerObjRef);
    $this->lexerObj->debug = $this->indexerConfig['debugMode'];
    // Initialize metaphone hook:
    // Make sure that the hook is loaded _after_ indexed_search as this may overwrite the hook depending on the configuration.
    if ($this->enableMetaphoneSearch) {
        $metaphoneObjRef = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['metaphone'];
        if ($metaphoneObjRef) {
            $this->metaphoneObj = GeneralUtility::getUserObj($metaphoneObjRef);
            $this->metaphoneObj->pObj = $this;
        }
    }
    // Init charset class:
    $this->csObj = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Charset\CharsetConverter::class);
}
504
/**
 * Initialize external parsers
 *
 * Creates one parser object per file extension registered in
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'] and keeps
 * only those whose initParser() call succeeds.
 *
 * @return void
 * @access private
 * @see init()
 */
public function initializeExternalParsers()
{
    $registeredParsers = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'];
    if (!is_array($registeredParsers)) {
        return;
    }
    foreach ($registeredParsers as $extension => $objectReference) {
        $parser = GeneralUtility::getUserObj($objectReference);
        // Register the parser before initializing it, so it is reachable through $this while it initializes.
        $this->external_parsers[$extension] = $parser;
        $parser->pObj = $this;
        // Init parser and if it returns FALSE, unset its entry again:
        $isUsable = $parser->initParser($extension);
        if (!$isUsable) {
            unset($this->external_parsers[$extension]);
        }
    }
}
525
526 /********************************
527 *
528 * Indexing; TYPO3 pages (HTML content)
529 *
530 *******************************/
/**
 * Start indexing of the TYPO3 page
 *
 * Uses the mtime check, the gr_list check and the force flag to decide whether the page
 * has to be looked at. If the content hash is already known, only timestamp, set id,
 * gr_list and rootline are refreshed; otherwise the page is split, analysed and submitted.
 *
 * @return void
 */
public function indexTypo3PageContent()
{
    $check = $this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']);
    $is_grlist = $this->is_grlist_set($this->hash['phash']);
    // Nothing to do when the mtime check is not positive, the gr_list is registered and indexing is not forced.
    if (!($check > 0) && $is_grlist && !$this->forceIndexing) {
        $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
        return;
    }
    // Setting message:
    if ($this->forceIndexing) {
        $message = 'Indexing needed, reason: Forced';
    } elseif ($check > 0) {
        $message = 'Indexing needed, reason: ' . $this->reasons[$check];
    } else {
        $message = 'Indexing needed, reason: Updates gr_list!';
    }
    $this->log_setTSlogMessage($message, 1);
    // Divide into title,keywords,description and body:
    $this->log_push('Split content', '');
    $this->contentParts = $this->splitHTMLContent($this->conf['content']);
    if ($this->conf['indexedDocTitle']) {
        $this->contentParts['title'] = $this->conf['indexedDocTitle'];
    }
    $this->log_pull();
    // Calculating a hash over what is to be the actual page content. Maybe this hash should not include title,description and keywords? The bodytext is the primary concern. (on the other hand a changed page-title would make no difference then, so dont!)
    $this->content_md5h = IndexedSearchUtility::md5inthash(implode('', $this->contentParts));
    // This function checks if there is already a page (with gr_list = 0,-1) indexed and if that page has the very same contentHash.
    // If the contentHash is the same, then we can rest assured that this page is already indexed and regardless of mtime and origContent we don't need to do anything more.
    // This will also prevent pages from being indexed if a fe_users has logged in and it turns out that the page content is not changed anyway. fe_users logged in should always search with hash_gr_list = "0,-1" OR "[their_group_list]". This situation will be prevented only if the page has been indexed with no user login on before hand. Else the page will be indexed by users until that event. However that does not present a serious problem.
    $matchingContentRow = $this->checkContentHash();
    if (is_array($matchingContentRow) && $check !== 1) {
        // Update the timestamp
        $this->updateTstamp($this->hash['phash'], $this->conf['mtime']);
        $this->updateSetId($this->hash['phash']);
        // $matchingContentRow['phash'] is the phash of the result row that is similar to the current phash regarding the content hash.
        $this->update_grlist($matchingContentRow['phash'], $this->hash['phash']);
        $this->updateRootline();
        $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $this->content_md5h . ', has not changed. Timestamp, grlist and rootline updated if necessary.');
        return;
    }
    $parseStart = GeneralUtility::milliseconds();
    $this->log_push('Converting charset of content (' . $this->conf['metaCharset'] . ') to utf-8', '');
    $this->charsetEntity2utf8($this->contentParts, $this->conf['metaCharset']);
    $this->log_pull();
    // Splitting words
    $this->log_push('Extract words from content', '');
    $wordsByPart = $this->processWordsInArrays($this->contentParts);
    $this->log_pull();
    // Analyse the indexed words.
    $this->log_push('Analyse the extracted words', '');
    $wordIndex = $this->indexAnalyze($wordsByPart);
    $this->log_pull();
    // Submitting page (phash) record
    $this->log_push('Submitting page', '');
    $this->submitPage();
    $this->log_pull();
    // Check words and submit to word list if not there
    $this->log_push('Check word list and submit words', '');
    if (IndexedSearchUtility::isTableUsed('index_words')) {
        $this->checkWordList($wordIndex);
        $this->submitWords($wordIndex, $this->hash['phash']);
    }
    $this->log_pull();
    // Set parsetime
    $this->updateParsetime($this->hash['phash'], GeneralUtility::milliseconds() - $parseStart);
    // Checking external files if configured for.
    $this->log_push('Checking external files', '');
    if ($this->conf['index_externals']) {
        $this->extractLinks($this->conf['content']);
    }
    $this->log_pull();
}
607
/**
 * Splits HTML content and returns an associative array, with title, a list of metatags, and a list of words in the body.
 *
 * Everything in front of the first "<body" is treated as the head. If the content has no
 * "<body" at all, the whole content is treated as head and the body is empty.
 *
 * @param string $content HTML content to index. To some degree expected to be made by TYPO3 (ei. splitting the header by ":")
 * @return array Array of content, having keys "title", "body", "keywords" and "description" set.
 * @see splitRegularContent()
 */
public function splitHTMLContent($content)
{
    $content = (string)$content;
    $contentArr = $this->defaultContentArray;
    // Divide head from body. An explicit check is needed for a missing body:
    // substr($content, 0, -0) would return an empty head instead of the whole content.
    $bodyPart = stristr($content, '<body');
    if ($bodyPart === false) {
        $contentArr['body'] = '';
        $headPart = $content;
    } else {
        $contentArr['body'] = $bodyPart;
        $headPart = (string)substr($content, 0, strlen($content) - strlen($bodyPart));
    }
    // Placeholders for by-reference results that are not needed
    $dummy = null;
    $dummy2 = null;
    // Get title; only the part after the first ":" is kept if there is one
    $this->embracingTags($headPart, 'TITLE', $contentArr['title'], $dummy2, $dummy);
    $titleParts = explode(':', $contentArr['title'], 2);
    $contentArr['title'] = trim($titleParts[1] ?? $titleParts[0]);
    // Get keywords and description metatags
    if (!empty($this->conf['index_metatags'])) {
        // Collect the attribute strings of all meta tags; each successful call removes the found tag from $headPart
        $metaTags = [];
        $attributeString = null;
        while ($this->embracingTags($headPart, 'meta', $dummy, $headPart, $attributeString)) {
            $metaTags[] = $attributeString;
        }
        foreach ($metaTags as $attributeString) {
            $attributes = GeneralUtility::get_tag_attributes($attributeString);
            // Meta tags without name/content attribute (charset, http-equiv, ...) must not raise notices
            $metaName = (string)($attributes['name'] ?? '');
            $metaContent = $attributes['content'] ?? '';
            if (stripos($metaName, 'keywords') !== false) {
                $contentArr['keywords'] .= ',' . $this->addSpacesToKeywordList($metaContent);
            }
            if (stripos($metaName, 'description') !== false) {
                $contentArr['description'] .= ',' . $metaContent;
            }
        }
    }
    // Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags:
    $this->typoSearchTags($contentArr['body']);
    // Get rid of unwanted sections (ie. scripting and style stuff) in body
    foreach (explode(',', $this->excludeSections) as $tag) {
        // Each successful call cuts one section out of the body
        while ($this->embracingTags($contentArr['body'], $tag, $dummy, $contentArr['body'], $dummy2)) {
        }
    }
    // Remove tags, but first make sure we don't concatenate words by doing it
    $contentArr['body'] = str_replace('<', ' <', $contentArr['body']);
    $contentArr['body'] = trim(strip_tags($contentArr['body']));
    $contentArr['keywords'] = trim($contentArr['keywords']);
    $contentArr['description'] = trim($contentArr['description']);
    return $contentArr;
}
659
/**
 * Extract the charset value from HTML meta tag.
 *
 * Looks for a meta tag with http-equiv="Content-Type" and reads the "charset=" value from it.
 *
 * @param string $content HTML content
 * @return string|null The charset value if found, otherwise NULL.
 */
public function getHTMLcharset($content)
{
    $metaTagPattern = '/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i';
    $charsetPattern = '/charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)/i';
    if (!preg_match($metaTagPattern, $content, $metaTagMatch)) {
        return null;
    }
    if (!preg_match($charsetPattern, $metaTagMatch[0], $charsetMatch)) {
        return null;
    }
    return $charsetMatch[1];
}
674
/**
 * Converts a HTML document to utf-8
 *
 * The charset is taken from the argument or, if that is empty, from getHTMLcharset().
 * Afterwards entities are converted through the charset object as well.
 *
 * @param string $content HTML content, any charset
 * @param string $charset Optional charset (otherwise extracted from HTML)
 * @return string Converted HTML
 */
public function convertHTMLToUtf8($content, $charset = '')
{
    // Find charset:
    if (!$charset) {
        $charset = $this->getHTMLcharset($content);
    }
    $charset = $this->csObj->parse_charset($charset);
    // Convert charset, unless there is none or the document is utf-8 already:
    $needsConversion = $charset && $charset !== 'utf-8';
    if ($needsConversion) {
        $content = $this->csObj->conv($content, $charset, 'utf-8');
    }
    // Convert entities, assuming document is now UTF-8:
    $converted = $this->csObj->entities_to_utf8($content);
    return $converted;
}
694
/**
 * Finds first occurrence of embracing tags and returns the embraced content and the original string with
 * the tag removed in the two passed variables. Returns FALSE if no match found. ie. useful for finding
 * <title> of document or removing <script>-sections
 *
 * The tag name is matched case-insensitively. If the start tag is found but no end tag follows,
 * $tagContent is empty and $stringAfter only holds what follows the start tag (the text in front
 * of the tag is not included in that case).
 *
 * @param string $string String to search in
 * @param string $tagName Tag name, eg. "script"
 * @param string $tagContent Passed by reference: Content inside found tag
 * @param string $stringAfter Passed by reference: Content after found tag
 * @param string $paramList Passed by reference: Attributes of the found tag.
 * @return bool Returns FALSE if tag was not found, otherwise TRUE.
 */
public function embracingTags($string, $tagName, &$tagContent, &$stringAfter, &$paramList)
{
    $startTag = '<' . $tagName;
    $endTag = '</' . $tagName . '>';
    // stristr used because we want a case-insensitive search for the tag.
    $fromStartTag = stristr($string, $startTag);
    // if the tag was not found, return FALSE
    if (!$fromStartTag) {
        return false;
    }
    // Split the attributes of the start tag from everything behind its closing ">".
    // array_pad() covers an unterminated start tag, where explode() yields only one element.
    $startTagParts = explode('>', (string)substr($fromStartTag, strlen($startTag)), 2);
    list($paramList, $afterStartTag) = array_pad($startTagParts, 2, '');
    $fromEndTag = stristr($afterStartTag, $endTag);
    if ($fromEndTag) {
        $stringBefore = substr($string, 0, stripos($string, $startTag));
        $tagContent = substr($afterStartTag, 0, strlen($afterStartTag) - strlen($fromEndTag));
        $stringAfter = $stringBefore . substr($fromEndTag, strlen($endTag));
    } else {
        // No end tag: nothing is embraced, continue behind the start tag
        $tagContent = '';
        $stringAfter = $afterStartTag;
    }
    return true;
}
729
/**
 * Removes content that shouldn't be indexed according to TYPO3SEARCH-tags.
 *
 * The body is split at every "<!--TYPO3SEARCH_" marker. Content following a "begin" marker is
 * kept; an "end" marker keeps the pending content in front of it, which is only non-empty as
 * long as no "begin" marker has been seen. Without any marker the body is left untouched.
 *
 * @param string $body HTML Content, passed by reference
 * @return bool Returns TRUE if a TYPOSEARCH_ tag was found, otherwise FALSE.
 */
public function typoSearchTags(&$body)
{
    $segments = preg_split('/\\<\\!\\-\\-[\\s]?TYPO3SEARCH_/', $body);
    if (!is_array($segments) || count($segments) <= 1) {
        return false;
    }
    $body = '';
    // Content that is appended when an "end" marker is reached; initialized so it is never read while undefined
    $prev = '';
    foreach ($segments as $segment) {
        $part = explode('-->', $segment, 2);
        $marker = trim($part[0]);
        if ($marker === 'begin') {
            // A marker that lacks its closing "-->" contributes nothing
            $body .= $part[1] ?? '';
            $prev = '';
        } elseif ($marker === 'end') {
            $body .= $prev;
        } else {
            $prev = $segment;
        }
    }
    return true;
}
757
/**
 * Extract links (hrefs) from HTML content and if indexable media is found, it is indexed.
 *
 * Links with a scheme are passed to indexExternalUrl() if "indexExternalURLs" is configured.
 * Links without query string that resolve to an existing local file are either added to the
 * crawler queue (if "useCrawlerForExternalFiles" is configured and the crawler is loaded) or
 * indexed directly through indexRegularDocument().
 *
 * @param string $content HTML content
 * @return void
 */
public function extractLinks($content)
{
    // Get links:
    $list = $this->extractHyperLinks($content);
    // Stays NULL unless external files are handed over to the crawler extension
    $crawler = null;
    if (!empty($this->indexerConfig['useCrawlerForExternalFiles']) && \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler')) {
        $crawler = GeneralUtility::makeInstance(\tx_crawler_lib::class);
    }
    // Traverse links:
    foreach ($list as $linkInfo) {
        // localPath means: This file is sent by a download script. While the indexed URL has to point to $linkInfo['href'], the absolute path to the file is specified here!
        $hasLocalPath = !empty($linkInfo['localPath']);
        // Decode entities:
        $linkSource = htmlspecialchars_decode($hasLocalPath ? $linkInfo['localPath'] : $linkInfo['href']);
        // Parse URL; parse_url() returns FALSE for seriously malformed URLs
        $qParts = parse_url($linkSource);
        if (!is_array($qParts)) {
            $qParts = [];
        }
        // Check for jumpurl (TYPO3 specific thing...)
        if (!empty($qParts['query']) && strpos($qParts['query'], 'jumpurl=') !== false) {
            parse_str($qParts['query'], $getP);
            if (!isset($getP['jumpurl']) || !is_string($getP['jumpurl'])) {
                // "jumpurl=" was only part of another parameter name or value, so there is no target to index
                continue;
            }
            $linkSource = $getP['jumpurl'];
            $qParts = parse_url($linkSource);
            if (!is_array($qParts)) {
                $qParts = [];
            }
        }
        if (!$hasLocalPath && !empty($qParts['scheme'])) {
            if (!empty($this->indexerConfig['indexExternalURLs'])) {
                // Index external URL (http or otherwise)
                $this->indexExternalUrl($linkSource);
            }
            continue;
        }
        if (!empty($qParts['query'])) {
            // Local links with a query string are not treated as files
            continue;
        }
        $linkSource = urldecode($linkSource);
        if (GeneralUtility::isAllowedAbsPath($linkSource)) {
            $localFile = $linkSource;
        } else {
            $localFile = GeneralUtility::getFileAbsFileName(PATH_site . $linkSource);
        }
        // The error suppression is kept from the original implementation (best effort file check)
        if (!$localFile || !@is_file($localFile)) {
            continue;
        }
        if (is_object($crawler)) {
            // Hand the file over to the crawler queue; the page content itself is not passed along
            $params = ['document' => $linkSource];
            if ($hasLocalPath) {
                $params['alturl'] = $linkInfo['href'];
            }
            $params['conf'] = $this->conf;
            unset($params['conf']['content']);
            $crawler->addQueueEntry_callBack(0, $params, Hook\CrawlerFilesHook::class, $this->conf['id']);
            $this->log_setTSlogMessage('media "' . $params['document'] . '" added to "crawler" queue.', 1);
        } elseif ($hasLocalPath) {
            // Index local file that is delivered by a download script; an empty string is passed if there is no extension
            $ext = strtolower(pathinfo($linkSource, PATHINFO_EXTENSION));
            $this->indexRegularDocument($linkInfo['href'], false, $linkSource, $ext);
        } else {
            // Index local file:
            $this->indexRegularDocument($linkSource);
        }
    }
}
834
/**
 * Extracts all links to external documents from the HTML content string
 *
 * Anchors without href attribute and links whose href starts with "#" are skipped.
 *
 * @param string $html
 * @return array Array of hyperlinks (keys: tag, href, localPath (empty if not local))
 * @see extractLinks()
 */
public function extractHyperLinks($html)
{
    $htmlParser = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Html\HtmlParser::class);
    $htmlParts = $htmlParser->splitTags('a', $html);
    $hyperLinksData = [];
    foreach ($htmlParts as $index => $tagData) {
        // Only entries with an odd index are inspected.
        // NOTE(review): presumably those are the matched tags - confirm against HtmlParser::splitTags().
        if ($index % 2 === 0) {
            continue;
        }
        $tagAttributes = $htmlParser->get_tag_attributes($tagData, true);
        $firstTagName = $htmlParser->getFirstTagName($tagData);
        if (strtolower($firstTagName) !== 'a') {
            continue;
        }
        // Anchors like <a name="..."> have no href at all
        $href = $tagAttributes[0]['href'] ?? '';
        if (!$href || $href[0] === '#') {
            continue;
        }
        $hyperLinksData[] = [
            'tag' => $tagData,
            'href' => $href,
            'localPath' => $this->createLocalPath($href)
        ];
    }
    return $hyperLinksData;
}
864
/**
 * Looks up the "href" of the first <base> tag that has one.
 *
 * @param string $html Content to analyze
 * @return string The base href or an empty string if not found
 */
public function extractBaseHref($html)
{
    $baseHref = '';
    $parser = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Html\HtmlParser::class);
    foreach ($parser->splitTags('base', $html) as $position => $fragment) {
        // Only the odd positions of the split result are inspected
        if ($position % 2 === 0) {
            continue;
        }
        $attributes = $parser->get_tag_attributes($fragment, true);
        $tagName = $parser->getFirstTagName($fragment);
        if (strtolower($tagName) !== 'base') {
            continue;
        }
        $baseHref = $attributes[0]['href'];
        // Stop at the first <base> tag that actually provides a value
        if ($baseHref) {
            break;
        }
    }
    return $baseHref;
}
890
891 /******************************************
892 *
893 * Indexing; external URL
894 *
895 ******************************************/
/**
 * Index External URLs HTML content
 *
 * The URL is only fetched and indexed when its response headers announce
 * "text/html". The content is written to a temporary file, handed to
 * indexRegularDocument() and the temporary file is removed afterwards.
 *
 * @param string $externalUrl URL, eg. "http://typo3.org/"
 * @return void
 * @see indexRegularDocument()
 */
public function indexExternalUrl($externalUrl)
{
    // Get headers:
    $urlHeaders = $this->getUrlHeaders($externalUrl);
    if (!is_array($urlHeaders)) {
        // No answer from the URL
        return;
    }
    // HTTP header names are case-insensitive, so match "content-type" in any spelling
    $urlHeaders = array_change_key_case($urlHeaders, CASE_LOWER);
    $contentType = isset($urlHeaders['content-type']) ? (string)$urlHeaders['content-type'] : '';
    if (stripos($contentType, 'text/html') === false) {
        return;
    }
    $content = ($this->indexExternalUrl_content = GeneralUtility::getUrl($externalUrl));
    if ((string)$content === '') {
        return;
    }
    // Create temporary file:
    $tmpFile = GeneralUtility::tempnam('EXTERNAL_URL');
    if (!$tmpFile) {
        return;
    }
    try {
        GeneralUtility::writeFile($tmpFile, $content);
        // Index that file. Using "TRUE" for second parameter to force indexing of
        // external URLs (mtime doesn't make sense, does it?)
        $this->indexRegularDocument($externalUrl, true, $tmpFile, 'html');
    } finally {
        // Remove the temporary file even if indexing failed with an exception
        unlink($tmpFile);
    }
}
922
/**
 * Getting HTTP request headers of URL
 *
 * Each non-empty line of the answer is split at its first colon: the part
 * before it becomes the array key, the raw (untrimmed) remainder the value.
 * Lines without a colon are stored with a NULL value.
 *
 * @param string $url The URL
 * @return array|null If no answer, returns NULL. Otherwise an array where HTTP headers are keys
 */
public function getUrlHeaders($url)
{
    // Try to get the headers only
    $content = GeneralUtility::getUrl($url, 2);
    if ((string)$content === '') {
        return null;
    }
    // Compile headers:
    $retVal = [];
    foreach (GeneralUtility::trimExplode(LF, $content, true) as $line) {
        if (trim($line) === '') {
            break;
        }
        $parts = explode(':', $line, 2);
        // A line without a colon (presumably the status line) has no value part
        $retVal[$parts[0]] = isset($parts[1]) ? $parts[1] : null;
    }
    return $retVal;
}
947
/**
 * Checks if the file is local
 *
 * Runs the createLocalPath*() resolvers in a fixed order and returns the
 * result of the first one that yields a non-empty path.
 *
 * @param string $sourcePath
 * @return string Absolute path to file if file is local, else empty string
 */
protected function createLocalPath($sourcePath)
{
    static $pathFunctions = [
        'createLocalPathFromT3vars',
        'createLocalPathUsingAbsRefPrefix',
        'createLocalPathUsingDomainURL',
        'createLocalPathFromAbsoluteURL',
        'createLocalPathFromRelativeURL'
    ];
    $localPath = '';
    foreach ($pathFunctions as $functionName) {
        $localPath = $this->{$functionName}($sourcePath);
        // Strict string comparison: only a really empty result means "not resolved"
        if ((string)$localPath !== '') {
            break;
        }
    }
    return $localPath;
}
972
/**
 * Attempts to create a local file path from T3VARs. This is useful for
 * various download extensions that hide actual file name but still want the
 * file to be indexed.
 *
 * The lookup key is the short MD5 of $sourcePath in
 * $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'].
 *
 * @param string $sourcePath
 * @return string Registered file path, or an empty string if none is registered or it is not a file
 */
protected function createLocalPathFromT3vars($sourcePath)
{
    $registeredFiles = $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'];
    if (!is_array($registeredFiles)) {
        return '';
    }
    $lookupKey = GeneralUtility::shortMD5($sourcePath);
    // Note: not using self::isAllowedLocalFile here because this method
    // is allowed to index files outside of the web site (for example,
    // protected downloads)
    $isRegisteredFile = isset($registeredFiles[$lookupKey]) && is_file($registeredFiles[$lookupKey]);
    return $isRegisteredFile ? $registeredFiles[$lookupKey] : '';
}
996
/**
 * Attempts to create a local file path by matching a current request URL.
 *
 * If $sourcePath starts with the TYPO3_SITE_URL, that prefix is replaced by
 * PATH_site and the result is validated with isAllowedLocalFile().
 *
 * @param string $sourcePath
 * @return string Local path, or an empty string if the URL does not match or the file is not allowed
 */
protected function createLocalPathUsingDomainURL($sourcePath)
{
    $localPath = '';
    $baseURL = (string)GeneralUtility::getIndpEnv('TYPO3_SITE_URL');
    $baseURLLength = strlen($baseURL);
    // Binary-safe, strict prefix check instead of a loose "==" on substrings
    if (strncmp((string)$sourcePath, $baseURL, $baseURLLength) === 0) {
        $localPath = PATH_site . substr($sourcePath, $baseURLLength);
        if (!self::isAllowedLocalFile($localPath)) {
            $localPath = '';
        }
    }
    return $localPath;
}
1017
/**
 * Attempts to create a local file path by matching absRefPrefix. This
 * requires TSFE. If TSFE is missing, this function does nothing.
 *
 * @param string $sourcePath
 * @return string Local path, or an empty string if the prefix does not match or the file is not allowed
 */
protected function createLocalPathUsingAbsRefPrefix($sourcePath)
{
    if (!isset($GLOBALS['TSFE']) || !$GLOBALS['TSFE'] instanceof TypoScriptFrontendController) {
        return '';
    }
    $absRefPrefix = isset($GLOBALS['TSFE']->config['config']['absRefPrefix'])
        ? (string)$GLOBALS['TSFE']->config['config']['absRefPrefix']
        : '';
    $absRefPrefixLength = strlen($absRefPrefix);
    // An empty prefix never matches; otherwise do a strict, binary-safe prefix check
    if ($absRefPrefixLength === 0 || strncmp((string)$sourcePath, $absRefPrefix, $absRefPrefixLength) !== 0) {
        return '';
    }
    $localPath = PATH_site . substr($sourcePath, $absRefPrefixLength);
    return self::isAllowedLocalFile($localPath) ? $localPath : '';
}
1041
/**
 * Attempts to create a local file path from the absolute URL without
 * schema.
 *
 * A path starting with "/" is appended to PATH_site (without that leading
 * slash) and validated with isAllowedLocalFile().
 *
 * @param string $sourcePath
 * @return string Local path, or an empty string if the path is not absolute or the file is not allowed
 */
protected function createLocalPathFromAbsoluteURL($sourcePath)
{
    if ($sourcePath[0] !== '/') {
        return '';
    }
    $candidate = PATH_site . substr($sourcePath, 1);
    if (self::isAllowedLocalFile($candidate)) {
        return $candidate;
    }
    return '';
}
1061
/**
 * Attempts to create a local file path from the relative URL.
 *
 * A URL accepted by isRelativeURL() is appended to PATH_site and validated
 * with isAllowedLocalFile().
 *
 * @param string $sourcePath
 * @return string Local path, or an empty string if the URL is not relative or the file is not allowed
 */
protected function createLocalPathFromRelativeURL($sourcePath)
{
    if (!self::isRelativeURL($sourcePath)) {
        return '';
    }
    $candidate = PATH_site . $sourcePath;
    return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
1079
/**
 * Checks if URL is relative.
 *
 * A URL counts as relative when it has no scheme and its path does not
 * start with "/".
 *
 * @param string $url
 * @return bool
 */
protected static function isRelativeURL($url)
{
    $urlParts = @parse_url($url);
    if (!is_array($urlParts)) {
        // parse_url() could not parse the URL: neither a scheme nor a leading
        // slash was identified (result kept for backward compatibility)
        return true;
    }
    $hasScheme = isset($urlParts['scheme']) && $urlParts['scheme'] !== '';
    $path = isset($urlParts['path']) ? (string)$urlParts['path'] : '';
    // strncmp() copes with an empty path, unlike a direct $path[0] access
    return !$hasScheme && strncmp($path, '/', 1) !== 0;
}
1091
/**
 * Checks if the path points to the file inside the web site
 *
 * Back paths are resolved first, then the path must start with PATH_site
 * and point to an existing regular file.
 *
 * @param string $filePath
 * @return bool
 */
protected static function isAllowedLocalFile($filePath)
{
    $filePath = (string)GeneralUtility::resolveBackPath($filePath);
    // Strict, binary-safe prefix check instead of a loose "==" on substrings
    $insideWebPath = strncmp($filePath, PATH_site, strlen(PATH_site)) === 0;
    return $insideWebPath && is_file($filePath);
}
1105
1106 /******************************************
1107 *
1108 * Indexing; external files (PDF, DOC, etc)
1109 *
1110 ******************************************/
/**
 * Indexing a regular document given as $file (relative to PATH_site, local file)
 *
 * For every content part reported by the parser of the file extension, the
 * method checks whether (re-)indexing is needed, reads and analyses the
 * content, stores the result and finally makes sure a section record exists.
 *
 * @param string $file Relative Filename, relative to PATH_site. It can also be an absolute path as long as it is an allowed path (validated with \TYPO3\CMS\Core\Utility\GeneralUtility::isAllowedAbsPath()). Finally, if $contentTmpFile is set, this value can be anything, most likely a URL
 * @param bool $force If set, indexing is forced (despite content hashes, mtime etc).
 * @param string $contentTmpFile Temporary file with the content to read it from (instead of $file). Used when the $file is a URL.
 * @param string $altExtension File extension for temporary file.
 * @return void
 */
public function indexRegularDocument($file, $force = false, $contentTmpFile = '', $altExtension = '')
{
    // Init
    $fI = pathinfo($file);
    // pathinfo() provides no "extension" key for names without a dot
    $ext = $altExtension ?: strtolower(isset($fI['extension']) ? $fI['extension'] : '');
    // Create abs-path:
    if (!$contentTmpFile) {
        if (!GeneralUtility::isAbsPath($file)) {
            // Relative, prepend PATH_site:
            $absFile = GeneralUtility::getFileAbsFileName(PATH_site . $file);
        } else {
            // Absolute, pass-through:
            $absFile = $file;
        }
        $absFile = GeneralUtility::isAllowedAbsPath($absFile) ? $absFile : '';
    } else {
        $absFile = $contentTmpFile;
    }
    if (!$absFile || !@is_file($absFile)) {
        $this->log_setTSlogMessage('Indexing not possible; File "' . $absFile . '" not found or valid.');
        return;
    }
    if (empty($this->external_parsers[$ext])) {
        $this->log_setTSlogMessage('Indexing not possible; The extension "' . $ext . '" was not supported.');
        return;
    }
    // Indexing the document:
    $fileInfo = stat($absFile);
    $cParts = $this->fileContentParts($ext, $absFile);
    foreach ($cParts as $cPKey) {
        $this->internal_log = [];
        $this->log_push('Index: ' . str_replace('.', '_', basename($file)) . ($cPKey ? '#' . $cPKey : ''), '');
        $Pstart = GeneralUtility::milliseconds();
        $subinfo = ['key' => $cPKey];
        // Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
        $phash_arr = ($this->file_phash_arr = $this->setExtHashes($file, $subinfo));
        $check = $this->checkMtimeTstamp($fileInfo['mtime'], $phash_arr['phash']);
        if ($check > 0 || $force) {
            if ($check > 0) {
                $this->log_setTSlogMessage('Indexing needed, reason: ' . $this->reasons[$check], 1);
            } else {
                $this->log_setTSlogMessage('Indexing forced by flag', 1);
            }
            // Check external file counter:
            if ($this->externalFileCounter < $this->maxExternalFiles || $force) {
                // Divide into title,keywords,description and body:
                $this->log_push('Split content', '');
                $contentParts = $this->readFileContent($ext, $absFile, $cPKey);
                $this->log_pull();
                if (is_array($contentParts)) {
                    // Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
                    // Separator first: the legacy implode(array, string) order is gone in PHP 8
                    $content_md5h = IndexedSearchUtility::md5inthash(implode('', $contentParts));
                    if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force) {
                        // Increment counter:
                        $this->externalFileCounter++;
                        // Splitting words
                        $this->log_push('Extract words from content', '');
                        $splitInWords = $this->processWordsInArrays($contentParts);
                        $this->log_pull();
                        // Analyse the indexed words.
                        $this->log_push('Analyse the extracted words', '');
                        $indexArr = $this->indexAnalyze($splitInWords);
                        $this->log_pull();
                        // Submitting page (phash) record
                        $this->log_push('Submitting page', '');
                        // Unfortunately I cannot determine WHEN a file is originally made - so I must return the modification time...
                        $this->submitFilePage($phash_arr, $file, $subinfo, $ext, $fileInfo['mtime'], $fileInfo['ctime'], $fileInfo['size'], $content_md5h, $contentParts);
                        $this->log_pull();
                        // Check words and submit to word list if not there
                        $this->log_push('Check word list and submit words', '');
                        if (IndexedSearchUtility::isTableUsed('index_words')) {
                            $this->checkWordList($indexArr);
                            $this->submitWords($indexArr, $phash_arr['phash']);
                        }
                        $this->log_pull();
                        // Set parsetime
                        $this->updateParsetime($phash_arr['phash'], GeneralUtility::milliseconds() - $Pstart);
                    } else {
                        // Update the timestamp
                        $this->updateTstamp($phash_arr['phash'], $fileInfo['mtime']);
                        $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $content_md5h . ', has not changed. Timestamp updated.');
                    }
                } else {
                    $this->log_setTSlogMessage('Could not index file! Unsupported extension.');
                }
            } else {
                $this->log_setTSlogMessage('The limit of ' . $this->maxExternalFiles . ' has already been exceeded, so no indexing will take place this time.');
            }
        } else {
            $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
        }
        // Checking and setting sections:
        // Setting a section-record for the file. This is done also if the file is not indexed. Notice that section records are deleted when the page is indexed.
        $this->submitFile_section($phash_arr['phash']);
        $this->log_pull();
    }
}
1217
/**
 * Reads the content of an external file being indexed.
 * The content from the external parser MUST be returned in utf-8!
 *
 * The work is delegated to the parser object registered for the file
 * extension in $this->external_parsers.
 *
 * @param string $fileExtension File extension, eg. "pdf", "doc" etc.
 * @param string $absoluteFileName Absolute filename of file (must exist and be validated OK before calling function)
 * @param string $sectionPointer Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be splitted.)
 * @return array|null Standard content array (title, description, keywords, body keys); NULL if no parser object is registered for the extension
 */
public function readFileContent($fileExtension, $absoluteFileName, $sectionPointer)
{
    // Consult relevant external document parser:
    if (!is_object($this->external_parsers[$fileExtension])) {
        return null;
    }
    $parser = $this->external_parsers[$fileExtension];
    return $parser->readFileContent($fileExtension, $absoluteFileName, $sectionPointer);
}
1236
/**
 * Creates an array with pointers to divisions of document.
 *
 * Without a parser object for the extension the document is treated as one
 * single part (pointer 0).
 *
 * @param string $ext File extension
 * @param string $absFile Absolute filename (must exist and be validated OK before calling function)
 * @return array Array of pointers to sections that the document should be divided into
 */
public function fileContentParts($ext, $absFile)
{
    // Consult relevant external document parser:
    if (!is_object($this->external_parsers[$ext])) {
        return [0];
    }
    $parser = $this->external_parsers[$ext];
    return $parser->fileContentParts($ext, $absFile);
}
1253
/**
 * Wraps non-HTML content (from external files for instance) into a content array.
 *
 * Starts from a copy of $this->defaultContentArray and puts the given
 * content into its "body" key.
 *
 * @param string $content Input content (non-HTML) to index.
 * @return array Array of content, having the key "body" set (plus whatever keys the default content array provides)
 * @see splitHTMLContent()
 */
public function splitRegularContent($content)
{
    $parts = $this->defaultContentArray;
    $parts['body'] = $content;

    return $parts;
}
1267
1268 /**********************************
1269 *
1270 * Analysing content, Extracting words
1271 *
1272 **********************************/
/**
 * Converts every non-empty value of the content array to utf-8 and decodes
 * its numeric / HTML entities. The array is modified in place.
 *
 * @param array $contentArr Standard content array
 * @param string $charset Charset of the input content (converted to utf-8)
 * @return void
 */
public function charsetEntity2utf8(&$contentArr, $charset)
{
    $needsConversion = $charset !== 'utf-8';
    foreach ($contentArr as $field => $original) {
        // Empty entries are left untouched
        if ((string)$original === '') {
            continue;
        }
        $text = $original;
        // Convert charset if necessary
        if ($needsConversion) {
            $text = $this->csObj->conv($text, $charset, 'utf-8');
        }
        // decode all numeric / html-entities in the string to real characters:
        $contentArr[$field] = $this->csObj->entities_to_utf8($text);
    }
}
1293
/**
 * Processing words in the array from split*Content -functions
 *
 * Every entry is split into words by the lexer object; duplicates are then
 * removed from the "title", "keywords" and "description" word lists.
 *
 * @param array $contentArr Array of content to index, see splitHTMLContent() and splitRegularContent()
 * @return array Content input array modified so each key now holds an array of words
 */
public function processWordsInArrays($contentArr)
{
    // split all parts to words
    foreach ($contentArr as $field => $text) {
        $contentArr[$field] = $this->lexerObj->split2Words($text);
    }
    // For title, keywords, and description we don't want duplicates:
    foreach (['title', 'keywords', 'description'] as $field) {
        $contentArr[$field] = array_unique($contentArr[$field]);
    }
    // Return modified array:
    return $contentArr;
}
1313
/**
 * Extracts the sample description text from the content array.
 *
 * Whitespace runs in the body are collapsed to single spaces and the result
 * is cut to the length configured in $this->conf['index_descrLgd']
 * (0-255, default 200). A configured length of 0 disables the description.
 *
 * @param array $contentArr Content array
 * @return string Description string (empty if the description is disabled)
 */
public function bodyDescription($contentArr)
{
    // Always return a string, also when no description is wanted
    $bodyDescription = '';
    // Setting description
    $maxL = MathUtility::forceIntegerInRange($this->conf['index_descrLgd'], 0, 255, 200);
    if ($maxL) {
        $bodyDescription = (string)preg_replace('/\s+/u', ' ', $contentArr['body']);
        // Shorten the string:
        $bodyDescription = mb_strcut($bodyDescription, 0, $maxL, 'utf-8');
    }
    return $bodyDescription;
}
1331
/**
 * Analyzes content to use for indexing.
 *
 * Title, keywords and description are run through analyzeHeaderinfo() with
 * the offsets 7, 6 and 5, the body through analyzeBody().
 *
 * @param array $content Standard content array: an array with the keys title,keywords,description and body, which all contain an array of words.
 * @return array Index array as filled by analyzeHeaderinfo() and analyzeBody()
 */
public function indexAnalyze($content)
{
    $indexArr = [];
    $headerOffsets = ['title' => 7, 'keywords' => 6, 'description' => 5];
    foreach ($headerOffsets as $field => $offset) {
        $this->analyzeHeaderinfo($indexArr, $content, $field, $offset);
    }
    $this->analyzeBody($indexArr, $content);

    return $indexArr;
}
1347
/**
 * Calculates relevant information for headercontent
 *
 * For every word of $content[$key] the index entry is created if needed
 * (hash and metaphone), the type bit is set in "cmp" and the occurrence
 * counters are increased.
 *
 * @param array $retArr Index array, passed by reference
 * @param array $content Standard content array
 * @param string $key Key from standard content array
 * @param int $offset Bit-wise priority to type
 * @return void
 */
public function analyzeHeaderinfo(&$retArr, $content, $key, $offset)
{
    foreach ($content[$key] as $rawWord) {
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        $word = substr($rawWord, 0, 60);
        if (!isset($retArr[$word])) {
            // Word ID (wid)
            $retArr[$word]['hash'] = IndexedSearchUtility::md5inthash($word);
            // Metaphone value is also 60 only chars long
            $retArr[$word]['metaphone'] = $this->enableMetaphoneSearch
                ? substr($this->metaphone($word, $this->storeMetaphoneInfoAsWords), 0, 60)
                : '';
        }
        // Build metaphone fulltext string (can be used for fulltext indexing)
        if ($this->storeMetaphoneInfoAsWords) {
            $this->metaphoneContent .= ' ' . $retArr[$word]['metaphone'];
        }
        // Priority used for flagBitMask feature (see extension configuration)
        $retArr[$word]['cmp'] |= pow(2, $offset);
        // Increase number of occurrences
        $retArr[$word]['count']++;
        $this->wordcount++;
    }
}
1380
/**
 * Calculates relevant information for bodycontent
 *
 * For every word of $content['body'] the index entry is created if needed
 * (first position, hash and metaphone) and the occurrence counters are
 * increased.
 *
 * @param array $retArr Index array, passed by reference
 * @param array $content Standard content array
 * @return void
 */
public function analyzeBody(&$retArr, $content)
{
    foreach ($content['body'] as $position => $rawWord) {
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        $word = substr($rawWord, 0, 60);
        if (!isset($retArr[$word])) {
            // First occurrence (used for ranking results)
            $retArr[$word]['first'] = $position;
            // Word ID (wid)
            $retArr[$word]['hash'] = IndexedSearchUtility::md5inthash($word);
            // Metaphone value is also only 60 chars long
            $retArr[$word]['metaphone'] = $this->enableMetaphoneSearch
                ? substr($this->metaphone($word, $this->storeMetaphoneInfoAsWords), 0, 60)
                : '';
        }
        // Build metaphone fulltext string (can be used for fulltext indexing)
        if ($this->storeMetaphoneInfoAsWords) {
            $this->metaphoneContent .= ' ' . $retArr[$word]['metaphone'];
        }
        // Increase number of occurrences
        $retArr[$word]['count']++;
        $this->wordcount++;
    }
}
1411
/**
 * Creating metaphone based hash from input word
 *
 * Uses $this->metaphoneObj when it is an object, otherwise PHP's native
 * metaphone() function.
 *
 * @param string $word Word to convert
 * @param bool $returnRawMetaphoneValue If set, returns the raw metaphone value (not hashed)
 * @return mixed Metaphone hash integer (0 for an empty metaphone value), or the raw value (string)
 */
public function metaphone($word, $returnRawMetaphoneValue = false)
{
    // Use native PHP function unless an advanced metaphone object is available
    $rawValue = is_object($this->metaphoneObj)
        ? $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid'])
        : metaphone($word);
    if ($returnRawMetaphoneValue) {
        return $rawValue;
    }
    if ($rawValue === '') {
        return 0;
    }
    // Create hash and return integer
    return IndexedSearchUtility::md5inthash($rawValue);
}
1437
1438 /********************************
1439 *
1440 * SQL; TYPO3 Pages
1441 *
1442 *******************************/
/**
 * Updates db with information about the page (TYPO3 page, not external media)
 *
 * Old records of the page hash are removed first; afterwards the
 * index_phash, index_section, index_grlist, index_fulltext and (in debug
 * mode) index_debug records are written.
 *
 * @return void
 */
public function submitPage()
{
    // Remove any current data for this phash:
    $this->removeOldIndexedPages($this->hash['phash']);
    $phash = $this->hash['phash'];
    $now = $GLOBALS['EXEC_TIME'];
    // setting new phash_row
    $phashRow = [
        'phash' => $phash,
        'phash_grouping' => $this->hash['phash_grouping'],
        'cHashParams' => serialize($this->cHashParams),
        'contentHash' => $this->content_md5h,
        'data_page_id' => $this->conf['id'],
        'data_page_reg1' => $this->conf['page_cache_reg1'],
        'data_page_type' => $this->conf['type'],
        'data_page_mp' => $this->conf['MP'],
        'gr_list' => $this->conf['gr_list'],
        // TYPO3 page
        'item_type' => 0,
        'item_title' => $this->contentParts['title'],
        'item_description' => $this->bodyDescription($this->contentParts),
        'item_mtime' => (int)$this->conf['mtime'],
        'item_size' => strlen($this->conf['content']),
        'tstamp' => $now,
        'crdate' => $now,
        // Creation date of page
        'item_crdate' => $this->conf['crdate'],
        // Sys language uid of the page. Should reflect which language it DOES actually display!
        'sys_language_uid' => $this->conf['sys_language_uid'],
        'externalUrl' => 0,
        'recordUid' => (int)$this->conf['recordUid'],
        'freeIndexUid' => (int)$this->conf['freeIndexUid'],
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId']
    ];
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash')
            ->insert('index_phash', $phashRow, ['cHashParams' => Connection::PARAM_LOB]);
    }
    // PROCESSING index_section
    $this->submit_section($phash, $phash);
    // PROCESSING index_grlist
    $this->submit_grlist($phash, $phash);
    // PROCESSING index_fulltext
    $fullText = implode(' ', $this->contentParts);
    if ($this->indexerConfig['fullTextDataLength'] > 0) {
        $fullText = substr($fullText, 0, $this->indexerConfig['fullTextDataLength']);
    }
    $fulltextRow = [
        'phash' => $phash,
        'fulltextdata' => $fullText,
        'metaphonedata' => $this->metaphoneContent
    ];
    if (IndexedSearchUtility::isTableUsed('index_fulltext')) {
        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_fulltext')
            ->insert('index_fulltext', $fulltextRow);
    }
    // PROCESSING index_debug
    if (!$this->indexerConfig['debugMode']) {
        return;
    }
    $debugInfo = [
        'cHashParams' => $this->cHashParams,
        'external_parsers initialized' => array_keys($this->external_parsers),
        'conf' => array_merge($this->conf, ['content' => substr($this->conf['content'], 0, 1000)]),
        'contentParts' => array_merge($this->contentParts, ['body' => substr($this->contentParts['body'], 0, 1000)]),
        'logs' => $this->internal_log,
        'lexer' => $this->lexerObj->debugString
    ];
    $debugRow = [
        'phash' => $phash,
        'debuginfo' => serialize($debugInfo)
    ];
    if (IndexedSearchUtility::isTableUsed('index_debug')) {
        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_debug')
            ->insert('index_debug', $debugRow);
    }
}
1527
/**
 * Stores gr_list in the database.
 *
 * Writes one index_grlist record holding the current $this->conf['gr_list']
 * and its hash; nothing is written when the table is not in use.
 *
 * @param int $hash Search result record phash
 * @param int $phash_x Actual phash of current content
 * @return void
 * @see update_grlist()
 */
public function submit_grlist($hash, $phash_x)
{
    $groupList = $this->conf['gr_list'];
    // Setting the gr_list record
    $record = [
        'phash' => $hash,
        'phash_x' => $phash_x,
        'hash_gr_list' => IndexedSearchUtility::md5inthash($groupList),
        'gr_list' => $groupList
    ];
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }
    GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist')
        ->insert('index_grlist', $record);
}
1551
/**
 * Stores section
 * $hash and $hash_t3 are the same for TYPO3 pages, but different when it is external files.
 *
 * The record is passed through getRootLineFields() before it is inserted
 * into index_section; nothing is written when the table is not in use.
 *
 * @param int $hash phash of TYPO3 parent search result record
 * @param int $hash_t3 phash of the file indexation search record
 * @return void
 */
public function submit_section($hash, $hash_t3)
{
    $record = [
        'phash' => $hash,
        'phash_t3' => $hash_t3,
        'page_id' => (int)$this->conf['id']
    ];
    // presumably extends $record by reference with rootline data - verify against getRootLineFields()
    $this->getRootLineFields($record);
    if (!IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }
    GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_section')
        ->insert('index_section', $record);
}
1574
/**
 * Removes records for the indexed page, $phash
 *
 * @param int $phash phash value to flush
 * @return void
 */
public function removeOldIndexedPages($phash)
{
    $phashValue = (int)$phash;
    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
    // Removing old registrations for all tables. Because the pages are TYPO3 pages
    // there can be nothing else than 1-1 relations here.
    $pageTables = ['index_phash', 'index_section', 'index_grlist', 'index_fulltext', 'index_debug'];
    foreach ($pageTables as $tableName) {
        if (!IndexedSearchUtility::isTableUsed($tableName)) {
            continue;
        }
        $connectionPool->getConnectionForTable($tableName)->delete($tableName, ['phash' => $phashValue]);
    }

    // Removing all index_section records with hash_t3 set to this hash (this includes such
    // records set for external media on the page as well!). The re-insert of these records
    // are done in indexRegularDocument($file).
    if (!IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }
    $connectionPool->getConnectionForTable('index_section')
        ->delete('index_section', ['phash_t3' => $phashValue]);
}
1601
1602 /********************************
1603 *
1604 * SQL; External media
1605 *
1606 *******************************/
/**
 * Updates db with information about the file
 *
 * Old records of the file hash are removed first; afterwards the
 * index_phash, index_fulltext and (in debug mode) index_debug records are
 * written.
 *
 * @param array $hash Array with phash and phash_grouping keys for file
 * @param string $file File name
 * @param array $subinfo Array of "cHashParams" for files: This is for instance the page index for a PDF file (other document types it will be a zero)
 * @param string $ext File extension determining the type of media.
 * @param int $mtime Modification time of file.
 * @param int $ctime Creation time of file.
 * @param int $size Size of file in bytes
 * @param int $content_md5h Content HASH value.
 * @param array $contentParts Standard content array (using only title and body for a file)
 * @return void
 */
public function submitFilePage($hash, $file, $subinfo, $ext, $mtime, $ctime, $size, $content_md5h, $contentParts)
{
    // Find item Type: the parser may map the extension to another type, else the extension itself is used
    $mappedItemType = $this->external_parsers[$ext]->ext2itemtype_map[$ext];
    $itemType = $mappedItemType ?: $ext;
    // Remove any current data for this phash:
    $this->removeOldIndexedFiles($hash['phash']);
    // Split filename:
    $urlParts = parse_url($file);
    $now = $GLOBALS['EXEC_TIME'];
    // Setting new
    $phashRow = [
        'phash' => $hash['phash'],
        'phash_grouping' => $hash['phash_grouping'],
        'cHashParams' => serialize($subinfo),
        'contentHash' => $content_md5h,
        'data_filename' => $file,
        'item_type' => $itemType,
        'item_title' => trim($contentParts['title']) ?: basename($file),
        'item_description' => $this->bodyDescription($contentParts),
        'item_mtime' => $mtime,
        'item_size' => $size,
        'item_crdate' => $ctime,
        'tstamp' => $now,
        'crdate' => $now,
        'gr_list' => $this->conf['gr_list'],
        // A file name with a scheme is flagged as external URL
        'externalUrl' => $urlParts['scheme'] ? 1 : 0,
        'recordUid' => (int)$this->conf['recordUid'],
        'freeIndexUid' => (int)$this->conf['freeIndexUid'],
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId'],
        'sys_language_uid' => (int)$this->conf['sys_language_uid']
    ];
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash')
            ->insert('index_phash', $phashRow, ['cHashParams' => Connection::PARAM_LOB]);
    }
    // PROCESSING index_fulltext
    $fullText = implode(' ', $contentParts);
    if ($this->indexerConfig['fullTextDataLength'] > 0) {
        $fullText = substr($fullText, 0, $this->indexerConfig['fullTextDataLength']);
    }
    $fulltextRow = [
        'phash' => $hash['phash'],
        'fulltextdata' => $fullText,
        'metaphonedata' => $this->metaphoneContent
    ];
    if (IndexedSearchUtility::isTableUsed('index_fulltext')) {
        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_fulltext')
            ->insert('index_fulltext', $fulltextRow);
    }
    // PROCESSING index_debug
    if (!$this->indexerConfig['debugMode']) {
        return;
    }
    $debugInfo = [
        'cHashParams' => $subinfo,
        'contentParts' => array_merge($contentParts, ['body' => substr($contentParts['body'], 0, 1000)]),
        'logs' => $this->internal_log,
        'lexer' => $this->lexerObj->debugString
    ];
    $debugRow = [
        'phash' => $hash['phash'],
        'debuginfo' => serialize($debugInfo)
    ];
    if (IndexedSearchUtility::isTableUsed('index_debug')) {
        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_debug')
            ->insert('index_debug', $debugRow);
    }
}
1693
/**
 * Stores file gr_list for a file IF it does not exist already
 *
 * Counts the index_grlist records of the file whose group list hash matches
 * either $this->defaultGrList or the current $this->conf['gr_list'] and
 * calls submit_grlist() only when there is none.
 *
 * @param int $hash phash value of file
 * @return void
 */
public function submitFile_grlist($hash)
{
    // Testing if there is a gr_list record for a non-logged in user and if so, there is no need to place another one.
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }

    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getQueryBuilderForTable('index_grlist');
    $expr = $queryBuilder->expr();
    // The named parameters are created in the same order as the conditions are listed
    $isThisFile = $expr->eq(
        'phash',
        $queryBuilder->createNamedParameter($hash, \PDO::PARAM_INT)
    );
    $hasDefaultGroups = $expr->eq(
        'hash_gr_list',
        $queryBuilder->createNamedParameter(
            IndexedSearchUtility::md5inthash($this->defaultGrList),
            \PDO::PARAM_INT
        )
    );
    $hasCurrentGroups = $expr->eq(
        'hash_gr_list',
        $queryBuilder->createNamedParameter(
            IndexedSearchUtility::md5inthash($this->conf['gr_list']),
            \PDO::PARAM_INT
        )
    );
    $existingRecords = (int)$queryBuilder->count('*')
        ->from('index_grlist')
        ->where($isThisFile, $expr->orX($hasDefaultGroups, $hasCurrentGroups))
        ->execute()
        ->fetchColumn();

    if ($existingRecords === 0) {
        $this->submit_grlist($hash, $hash);
    }
}
1740
/**
 * Registers a section entry for an indexed file unless index_section already
 * holds a row for this phash on the current page ($this->conf['id']).
 *
 * Nothing happens if index_section is not in use.
 *
 * @param int $hash phash value of file
 * @return void
 */
public function submitFile_section($hash)
{
    if (IndexedSearchUtility::isTableUsed('index_section')) {
        $query = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable('index_section');

        $matchesFile = $query->expr()->eq(
            'phash',
            $query->createNamedParameter($hash, \PDO::PARAM_INT)
        );
        $matchesPage = $query->expr()->eq(
            'page_id',
            $query->createNamedParameter($this->conf['id'], \PDO::PARAM_INT)
        );

        $existingSections = (int)$query->count('phash')
            ->from('index_section')
            ->where($matchesFile, $matchesPage)
            ->execute()
            ->fetchColumn();

        // No section stored yet for this file on this page: create it,
        // passing the phash of the currently indexed page as second argument.
        if ($existingSections === 0) {
            $this->submit_section($hash, $this->hash['phash']);
        }
    }
}
1775
/**
 * Deletes all rows belonging to the given phash from index_phash,
 * index_grlist, index_fulltext and index_debug.
 *
 * Tables that are reported as unused are left untouched.
 *
 * @param int $phash phash value to flush
 * @return void
 */
public function removeOldIndexedFiles($phash)
{
    $pool = GeneralUtility::makeInstance(ConnectionPool::class);
    $criteria = ['phash' => (int)$phash];

    foreach (['index_phash', 'index_grlist', 'index_fulltext', 'index_debug'] as $tableName) {
        if (IndexedSearchUtility::isTableUsed($tableName)) {
            $pool->getConnectionForTable($tableName)->delete($tableName, $criteria);
        }
    }
}
1794
1795 /********************************
1796 *
1797 * SQL Helper functions
1798 *
1799 *******************************/
/**
 * Check the mtime / tstamp of the currently indexed page/file (based on phash)
 * and decide whether it has to be (re-)indexed.
 *
 * Result codes (see $this->reasons):
 *  -2  The minimum age was not exceeded, indexing must not occur.
 *  -1  mtime matched the indexed document, no re-indexing needed.
 *   1  The configured max-age was exceeded, index again.
 *   2  mtime differs from the stored item_mtime, index again.
 *   3  No mtime was given, index again.
 *   4  No index_phash row exists (or the table is not used), index.
 *
 * Side effect for result -1: if no max-age is configured, the tstamp of the
 * index_phash row is refreshed via updateTstamp(); a log message is written
 * in either case.
 *
 * @param int $mtime mtime value to test against limits and indexed page (usually this is the mtime of the cached document)
 * @param int $phash "phash" used to select any already indexed page to see what its mtime is.
 * @return int Result integer: <0 = no indexing, >0 = do indexing
 */
public function checkMtimeTstamp($mtime, $phash)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        // Not indexed (not in index_phash)
        return 4;
    }

    $row = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('index_phash')
        ->select(
            ['item_mtime', 'tstamp'],
            'index_phash',
            ['phash' => (int)$phash],
            [],
            [],
            1
        )
        ->fetch();

    if (empty($row)) {
        // Page has never been indexed (is not represented in the index_phash table).
        return 4;
    }

    if ($this->tstamp_maxAge && $row['tstamp'] + $this->tstamp_maxAge < $GLOBALS['EXEC_TIME']) {
        // The configured max-age was exceeded for the document and thus it's indexed.
        return 1;
    }

    if ($this->tstamp_minAge && $row['tstamp'] + $this->tstamp_minAge >= $GLOBALS['EXEC_TIME']) {
        // A minimum age is configured and has not been exceeded yet.
        return -2;
    }

    if (!$mtime) {
        // The minimum age was exceeded, but mtime was not set, so the page is indexed.
        return 3;
    }

    // Compare as integers with a strict operator: the database driver may hand
    // back item_mtime as a string, and a loose comparison would depend on
    // PHP's type juggling rules.
    if ((int)$row['item_mtime'] !== (int)$mtime) {
        // The minimum age was exceeded, mtime was set and differs from the stored one.
        return 2;
    }

    // mtime matched the document, so no changes detected and no content updated
    if ($this->tstamp_maxAge) {
        $this->log_setTSlogMessage('mtime matched, timestamp NOT updated because a maxAge is set (' . ($row['tstamp'] + $this->tstamp_maxAge - $GLOBALS['EXEC_TIME']) . ' seconds to expire time).', 1);
    } else {
        $this->updateTstamp($phash);
        $this->log_setTSlogMessage('mtime matched, timestamp updated.', 1);
    }

    return -1;
}
1865
/**
 * Looks for an index_phash row that has the same phash_grouping and the same
 * content hash as the page currently being indexed.
 *
 * With this lookup the page is only indexed if its content differs from
 * an already stored page of the same "phash_grouping".
 *
 * @return mixed TRUE if the page needs to be indexed (no matching row, or index_phash is not used), otherwise the matching row (an array with key "phash") to which the grlist record should be related
 */
public function checkContentHash()
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return true;
    }

    $criteria = [
        'phash_grouping' => (int)$this->hash['phash_grouping'],
        'contentHash' => (int)$this->content_md5h
    ];
    $matchingRow = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->select(['phash'], 'index_phash', $criteria, [], [], 1)
        ->fetch();

    return empty($matchingRow) ? true : $matchingRow;
}
1896
/**
 * Check content hash for external documents.
 *
 * Counts the index_phash rows matching both the given phash_grouping and the
 * given content hash; the document needs indexing when there is none.
 *
 * @param int $hashGr phash value to check (phash_grouping)
 * @param int $content_md5h Content hash to check
 * @return bool TRUE if the document needs to be indexed (no matching row, or index_phash is not used)
 */
public function checkExternalDocContentHash($hashGr, $content_md5h)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return true;
    }

    $criteria = [
        'phash_grouping' => (int)$hashGr,
        'contentHash' => (int)$content_md5h
    ];
    $matchingRows = (int)GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->count('*', 'index_phash', $criteria);

    return $matchingRows === 0;
}
1924
/**
 * Tells whether index_grlist contains at least one row whose phash_x equals
 * the given value (the "real" phash of the current content, not the
 * linked-to phash of the common search result page).
 *
 * @param int $phash_x Phash integer to test.
 * @return bool TRUE if such a row exists; FALSE otherwise or if index_grlist is not used
 */
public function is_grlist_set($phash_x)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return false;
    }

    $entries = (int)GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist')
        ->count('phash_x', 'index_grlist', ['phash_x' => (int)$phash_x]);

    return $entries > 0;
}
1947
/**
 * Makes sure a grlist entry exists for the given phash and the current
 * gr_list ($this->conf['gr_list']); writes one and logs it if it is missing.
 *
 * @param int $phash phash of the search result that should be found
 * @param int $phash_x The real phash of the current content. The two values are different when a page with userlogin turns out to contain the exact same content as another already indexed version of the page; This is the whole reason for the grlist table in fact...
 * @return void
 * @see submit_grlist()
 */
public function update_grlist($phash, $phash_x)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }

    $criteria = [
        'phash' => (int)$phash,
        'hash_gr_list' => IndexedSearchUtility::md5inthash($this->conf['gr_list'])
    ];
    $existingEntries = (int)GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist')
        ->count('phash', 'index_grlist', $criteria);

    if ($existingEntries !== 0) {
        // Entry already present, nothing to write.
        return;
    }

    $this->submit_grlist($phash, $phash_x);
    $this->log_setTSlogMessage('Inserted gr_list \'' . $this->conf['gr_list'] . '\' for phash \'' . $phash . '\'', 1);
}
1976
/**
 * Sets the tstamp of an index_phash row to the current execution time
 * ($GLOBALS['EXEC_TIME']) and, optionally, stores a new item_mtime.
 *
 * Nothing happens if index_phash is not in use.
 *
 * @param int $phash phash value
 * @param int $mtime If set (non-empty), the item_mtime field is updated to this value as well.
 * @return void
 */
public function updateTstamp($phash, $mtime = 0)
{
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $newValues = ['tstamp' => $GLOBALS['EXEC_TIME']];
        if ($mtime) {
            $newValues['item_mtime'] = (int)$mtime;
        }

        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash');
        $connection->update('index_phash', $newValues, ['phash' => (int)$phash]);
    }
}
2008
/**
 * Writes the configured free index set id ($this->conf['freeIndexSetId'])
 * into the index_phash row identified by the given phash.
 *
 * Nothing happens if index_phash is not in use.
 *
 * @param int $phash phash value
 * @return void
 */
public function updateSetId($phash)
{
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $newValues = ['freeIndexSetId' => (int)$this->conf['freeIndexSetId']];
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash');
        $connection->update('index_phash', $newValues, ['phash' => (int)$phash]);
    }
}
2033
/**
 * Stores the parse time in the index_phash row identified by the given phash.
 *
 * Nothing happens if index_phash is not in use.
 *
 * @param int $phash phash value.
 * @param int $parsetime Parsetime value to set.
 * @return void
 */
public function updateParsetime($phash, $parsetime)
{
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $newValues = ['parsetime' => (int)$parsetime];
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash');
        $connection->update('index_phash', $newValues, ['phash' => (int)$phash]);
    }
}
2059
/**
 * Rewrites the rootline fields of all index_section rows that belong to the
 * current page ($this->conf['id']), using the values from getRootLineFields().
 *
 * Nothing happens if index_section is not in use.
 *
 * @return void
 */
public function updateRootline()
{
    if (IndexedSearchUtility::isTableUsed('index_section')) {
        // getRootLineFields() fills the array by reference.
        $rootLineValues = [];
        $this->getRootLineFields($rootLineValues);

        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_section');
        $connection->update('index_section', $rootLineValues, ['page_id' => (int)$this->conf['id']]);
    }
}
2084
/**
 * Adding values for root-line fields.
 * rl0, rl1 and rl2 are standard. Additional fields can be registered in
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields']
 * as "field name => rootline level" pairs.
 *
 * Each field receives the page uid found at its level in
 * $this->conf['rootline_uids'], or 0 when the rootline has no entry
 * at that level.
 *
 * @param array $fieldArray Field array, passed by reference
 * @return void
 */
public function getRootLineFields(array &$fieldArray)
{
    // Short rootlines lack the deeper levels; check for existence instead of
    // reading an undefined index.
    foreach ([0, 1, 2] as $level) {
        $fieldArray['rl' . $level] = isset($this->conf['rootline_uids'][$level])
            ? (int)$this->conf['rootline_uids'][$level]
            : 0;
    }

    // The additional-fields configuration is optional and may be absent entirely.
    if (isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'])
        && is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'])
    ) {
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'] as $fieldName => $rootLineLevel) {
            $fieldArray[$fieldName] = isset($this->conf['rootline_uids'][$rootLineLevel])
                ? (int)$this->conf['rootline_uids'][$rootLineLevel]
                : 0;
        }
    }
}
2103
/**
 * Loads the crawler library file "class.tx_crawler_lib.php" from the
 * "crawler" extension and logs the call as deprecated.
 *
 * @return void
 * @deprecated since TYPO3 v8, will be removed in TYPO3 v9, autoloader is taking care of that functionality
 */
public function includeCrawlerClass()
{
    GeneralUtility::logDeprecatedFunction();
    $crawlerLibraryFile = \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::extPath('crawler') . 'class.tx_crawler_lib.php';
    require_once $crawlerLibraryFile;
}
2115
2116 /********************************
2117 *
2118 * SQL; Submitting words
2119 *
2120 *******************************/
/**
 * Adds the words of the list that are not yet stored in index_words.
 *
 * First counts how many of the word ids already exist; only when that number
 * differs from the size of the list are the existing basewords fetched,
 * removed from the list, and the remaining words inserted.
 *
 * @param array $wordListArray Word List array (where each word has information about position etc), keyed by baseword.
 * @return void
 */
public function checkWordList($wordListArray)
{
    if (!IndexedSearchUtility::isTableUsed('index_words') || empty($wordListArray)) {
        return;
    }

    $expectedTotal = count($wordListArray);
    $wordIds = array_map('intval', array_column($wordListArray, 'hash'));

    $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('index_words');

    $countQuery = $connection->createQueryBuilder();
    $storedTotal = (int)$countQuery->count('baseword')
        ->from('index_words')
        ->where(
            $countQuery->expr()->in(
                'wid',
                $countQuery->createNamedParameter($wordIds, Connection::PARAM_INT_ARRAY)
            )
        )
        ->execute()
        ->fetchColumn();

    if ($storedTotal === $expectedTotal) {
        // Every word is already known.
        return;
    }

    $wordQuery = $connection->createQueryBuilder();
    $storedWords = $wordQuery->select('baseword')
        ->from('index_words')
        ->where(
            $wordQuery->expr()->in(
                'wid',
                $wordQuery->createNamedParameter($wordIds, Connection::PARAM_INT_ARRAY)
            )
        )
        ->execute();

    $this->log_setTSlogMessage('Inserting words: ' . ($expectedTotal - $storedTotal), 1);

    // Drop everything that is stored already, so only new words remain.
    while ($storedWord = $storedWords->fetch()) {
        unset($wordListArray[$storedWord['baseword']]);
    }

    foreach ($wordListArray as $baseword => $wordInfo) {
        // A duplicate-key error will occur here if a word was NOT removed by the unset() above.
        // However as long as the words are NOT longer than 60 chars (the baseword varchar is
        // 60 characters...) this is not a problem.
        $newWord = [
            'wid' => $wordInfo['hash'],
            'baseword' => $baseword,
            'metaphone' => $wordInfo['metaphone']
        ];
        $connection->insert('index_words', $newWord);
    }
}
2182
/**
 * Submits RELATIONS between words and phash.
 *
 * All existing index_rel rows of the phash are deleted first. Then one row
 * per word is inserted, except for words whose id is flagged as stop word in
 * index_words (is_stopword != 0). Nothing happens if index_rel is not in use.
 *
 * @param array $wordList Word list array; each entry provides "hash", "count", "first" and "cmp"
 * @param int $phash phash value
 * @return void
 */
public function submitWords($wordList, $phash)
{
    if (!IndexedSearchUtility::isTableUsed('index_rel')) {
        return;
    }
    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
    $queryBuilder = $connectionPool->getQueryBuilderForTable('index_words');
    $result = $queryBuilder->select('wid')
        ->from('index_words')
        ->where(
            $queryBuilder->expr()->neq('is_stopword', $queryBuilder->createNamedParameter(0, \PDO::PARAM_INT))
        )
        ->groupBy('wid')
        ->execute();

    // Index the stop words by word id for a cheap isset() lookup below.
    $stopWords = [];
    while ($row = $result->fetch()) {
        $stopWords[$row['wid']] = $row;
    }

    // Old relations are always removed, even if no new ones get written.
    $connectionPool->getConnectionForTable('index_rel')->delete('index_rel', ['phash' => (int)$phash]);

    $fields = ['phash', 'wid', 'count', 'first', 'freq', 'flags'];
    $rows = [];
    foreach ($wordList as $val) {
        if (isset($stopWords[$val['hash']])) {
            continue;
        }
        $rows[] = [
            (int)$phash,
            (int)$val['hash'],
            (int)$val['count'],
            (int)$val['first'],
            $this->freqMap($val['count'] / $this->wordcount),
            $val['cmp'] & $this->flagBitMask
        ];
    }

    // A bulk insert without any value set cannot be turned into SQL and fails,
    // so skip it when the word list was empty or consisted of stop words only.
    if (!empty($rows)) {
        $connectionPool->getConnectionForTable('index_rel')->bulkInsert('index_rel', $rows, $fields);
    }
}
2230
/**
 * Maps a frequency with the factor $this->freqMax * 100 * $this->freqRange.
 *
 * Values up to and including 1 are multiplied by the factor and capped at
 * $this->freqRange; values above 1 are divided by the factor (the reverse
 * direction). The result is not cast, so it may be a float.
 *
 * @param float $freq Frequency
 * @return int|float Mapped frequency
 */
public function freqMap($freq)
{
    $scale = $this->freqMax * 100 * $this->freqRange;

    if ($freq <= 1) {
        $scaled = $freq * $scale;
        if ($scaled > $this->freqRange) {
            return $this->freqRange;
        }
        return $scaled;
    }

    return $freq / $scale;
}
2249
2250 /********************************
2251 *
2252 * Hashing
2253 *
2254 *******************************/
/**
 * Computes the search hashes for a TYPO3 page and stores them in $this->hash.
 *
 * - "phash_grouping" identifies a "page" as the combination of id, type,
 *   language, mount point and cHash parameters.
 * - "phash" additionally takes gr_list into account (a subdivision for page
 *   compositions that depend on the login; such pages are expected to be
 *   normally similar regardless of the login).
 *
 * @return void
 */
public function setT3Hashes()
{
    // The key order matters because the arrays are serialized before hashing.
    $groupingParts = [
        'id' => (int)$this->conf['id'],
        'type' => (int)$this->conf['type'],
        'sys_lang' => (int)$this->conf['sys_language_uid'],
        'MP' => (string)$this->conf['MP'],
        'cHash' => $this->cHashParams
    ];
    $this->hash['phash_grouping'] = IndexedSearchUtility::md5inthash(serialize($groupingParts));

    // Same parts with gr_list appended as the last key.
    $phashParts = $groupingParts + ['gr_list' => (string)$this->conf['gr_list']];
    $this->hash['phash'] = IndexedSearchUtility::md5inthash(serialize($phashParts));
}
2276
/**
 * Computes the search hashes for an external file.
 *
 * "phash_grouping" is derived from the file name alone, "phash" from the
 * file name together with the sub-information.
 *
 * @param string $file File name / path which identifies it on the server
 * @param array $subinfo Additional content identifying the (subpart of) content. For instance; PDF files are divided into groups of pages for indexing.
 * @return array Array with "phash_grouping" and "phash" inside.
 */
public function setExtHashes($file, $subinfo = [])
{
    // The key order matters because the arrays are serialized before hashing.
    $groupingParts = ['file' => $file];
    $phashParts = ['file' => $file, 'subinfo' => $subinfo];

    return [
        'phash_grouping' => IndexedSearchUtility::md5inthash(serialize($groupingParts)),
        'phash' => IndexedSearchUtility::md5inthash(serialize($phashParts))
    ];
}
2298
2299 /*********************************
2300 *
2301 * Internal logging functions
2302 *
2303 *********************************/
/**
 * Wrapper around the push() method of $this->timeTracker.
 *
 * @param string $msg Title to set
 * @param string $key Second argument, forwarded unchanged to push()
 * @return void
 */
public function log_push($msg, $key)
{
    $tracker = $this->timeTracker;
    $tracker->push($msg, $key);
}
2315
/**
 * Wrapper around the pull() method of $this->timeTracker.
 *
 * @return void
 */
public function log_pull()
{
    $tracker = $this->timeTracker;
    $tracker->pull();
}
2325
/**
 * Passes a log message to setTSlogMessage() of $this->timeTracker and also
 * appends it to $this->internal_log.
 *
 * @param string $msg Message to set
 * @param int $errorNum Error number
 * @return void
 */
public function log_setTSlogMessage($msg, $errorNum = 0)
{
    $tracker = $this->timeTracker;
    $tracker->setTSlogMessage($msg, $errorNum);
    // The internal log only keeps the message text, not the error number.
    $this->internal_log[] = $msg;
}
2338
/**
 * Makes sure that keywords are space-separated. This is important for
 * displaying them properly as a part of the fulltext index.
 *
 * The comma-separated list is split with GeneralUtility::trimExplode(),
 * re-joined with ", " and wrapped in one leading and one trailing space.
 *
 * @param string $keywordList
 * @return string
 * @see http://forge.typo3.org/issues/14959
 */
protected function addSpacesToKeywordList($keywordList)
{
    $trimmedKeywords = GeneralUtility::trimExplode(',', $keywordList);
    $spacedList = implode(', ', $trimmedKeywords);
    return sprintf(' %s ', $spacedList);
}
2352 }